{ "best_global_step": 40000, "best_metric": 1.2896699905395508, "best_model_checkpoint": "/mnt/fast/nobackup/scratch4weeks/jp01166/runs/cache/sft-gemma-all/checkpoint-40000", "epoch": 3.0, "eval_steps": 10000, "global_step": 123363, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 2.0668744587898256, "epoch": 0.0012159237372631989, "grad_norm": 24.75, "learning_rate": 1.5885881017993193e-07, "loss": 2.198721160888672, "mean_token_accuracy": 0.5864152985811234, "num_tokens": 22306.0, "step": 50 }, { "entropy": 2.1583500289916993, "epoch": 0.0024318474745263978, "grad_norm": 17.0, "learning_rate": 3.209596368941482e-07, "loss": 2.2618275451660157, "mean_token_accuracy": 0.5739503240585327, "num_tokens": 44901.0, "step": 100 }, { "entropy": 2.335817322731018, "epoch": 0.0036477712117895965, "grad_norm": 39.25, "learning_rate": 4.830604636083645e-07, "loss": 2.464931640625, "mean_token_accuracy": 0.5488416990637779, "num_tokens": 70629.0, "step": 150 }, { "entropy": 2.0593938672542573, "epoch": 0.0048636949490527956, "grad_norm": 17.625, "learning_rate": 6.451612903225807e-07, "loss": 2.136796112060547, "mean_token_accuracy": 0.5940703397989273, "num_tokens": 96434.0, "step": 200 }, { "entropy": 2.085271170139313, "epoch": 0.006079618686315995, "grad_norm": 14.875, "learning_rate": 8.07262117036797e-07, "loss": 2.1690493774414064, "mean_token_accuracy": 0.5723984533548355, "num_tokens": 122029.0, "step": 250 }, { "entropy": 2.1235363256931303, "epoch": 0.007295542423579193, "grad_norm": 15.4375, "learning_rate": 9.693629437510132e-07, "loss": 2.1991546630859373, "mean_token_accuracy": 0.5778445535898209, "num_tokens": 145121.0, "step": 300 }, { "entropy": 1.970875732898712, "epoch": 0.008511466160842392, "grad_norm": 29.875, "learning_rate": 1.1314637704652294e-06, "loss": 2.006485900878906, "mean_token_accuracy": 0.5936725288629532, "num_tokens": 173958.0, "step": 350 }, { "entropy": 1.9844615650177002, "epoch": 0.009727389898105591, "grad_norm": 24.375, "learning_rate": 1.2935645971794458e-06, "loss": 2.0047389221191407, "mean_token_accuracy": 0.5903229850530625, "num_tokens": 199406.0, "step": 400 }, { "entropy": 2.1039308857917787, "epoch": 0.01094331363536879, "grad_norm": 12.6875, "learning_rate": 1.455665423893662e-06, "loss": 2.0571018981933595, "mean_token_accuracy": 0.5918576717376709, "num_tokens": 224778.0, "step": 450 }, { "entropy": 2.1126053285598756, "epoch": 0.01215923737263199, "grad_norm": 50.25, "learning_rate": 1.617766250607878e-06, "loss": 2.0924346923828123, "mean_token_accuracy": 0.5758953464031219, "num_tokens": 247102.0, "step": 500 }, { "entropy": 2.074563525915146, "epoch": 0.013375161109895187, "grad_norm": 25.75, "learning_rate": 1.7798670773220944e-06, "loss": 2.072195281982422, "mean_token_accuracy": 0.5791842472553254, "num_tokens": 269535.0, "step": 550 }, { "entropy": 1.8033800542354583, "epoch": 0.014591084847158386, "grad_norm": 24.25, "learning_rate": 1.941967904036311e-06, "loss": 1.8501751708984375, "mean_token_accuracy": 0.6051643115282058, "num_tokens": 290239.0, "step": 600 }, { "entropy": 1.6157683980464936, "epoch": 0.015807008584421585, "grad_norm": 10.25, "learning_rate": 2.104068730750527e-06, "loss": 1.6177255249023437, "mean_token_accuracy": 0.6378308194875717, "num_tokens": 319193.0, "step": 650 }, { "entropy": 1.80018323302269, "epoch": 0.017022932321684784, "grad_norm": 28.375, "learning_rate": 2.2661695574647433e-06, "loss": 1.8902894592285155, "mean_token_accuracy": 0.5988817799091339, "num_tokens": 341526.0, "step": 700 }, { "entropy": 1.6552906584739686, "epoch": 0.018238856058947983, "grad_norm": 14.1875, "learning_rate": 2.4282703841789597e-06, "loss": 1.7186477661132813, "mean_token_accuracy": 0.6206609660387039, "num_tokens": 367613.0, "step": 750 }, { "entropy": 1.7910411632061005, "epoch": 0.019454779796211182, "grad_norm": 13.8125, "learning_rate": 2.5903712108931757e-06, "loss": 1.8455790710449218, "mean_token_accuracy": 0.5967503237724304, "num_tokens": 392195.0, "step": 800 }, { "entropy": 1.6605371713638306, "epoch": 0.02067070353347438, "grad_norm": 24.875, "learning_rate": 2.752472037607392e-06, "loss": 1.7082533264160156, "mean_token_accuracy": 0.6284109729528428, "num_tokens": 415736.0, "step": 850 }, { "entropy": 1.6441483396291732, "epoch": 0.02188662727073758, "grad_norm": 21.875, "learning_rate": 2.914572864321608e-06, "loss": 1.6916964721679688, "mean_token_accuracy": 0.6224294906854629, "num_tokens": 440270.0, "step": 900 }, { "entropy": 1.5696145236492156, "epoch": 0.02310255100800078, "grad_norm": 24.875, "learning_rate": 3.0766736910358245e-06, "loss": 1.5629232788085938, "mean_token_accuracy": 0.6408716893196106, "num_tokens": 463870.0, "step": 950 }, { "entropy": 1.6726100981235503, "epoch": 0.02431847474526398, "grad_norm": 54.0, "learning_rate": 3.238774517750041e-06, "loss": 1.7071998596191407, "mean_token_accuracy": 0.6224956113100052, "num_tokens": 488473.0, "step": 1000 }, { "entropy": 1.6888540291786194, "epoch": 0.025534398482527174, "grad_norm": 26.875, "learning_rate": 3.4008753444642573e-06, "loss": 1.7024165344238282, "mean_token_accuracy": 0.6172494041919708, "num_tokens": 514088.0, "step": 1050 }, { "entropy": 1.6874533534049987, "epoch": 0.026750322219790373, "grad_norm": 17.25, "learning_rate": 3.5629761711784733e-06, "loss": 1.6871415710449218, "mean_token_accuracy": 0.6155230277776718, "num_tokens": 536598.0, "step": 1100 }, { "entropy": 1.643469157218933, "epoch": 0.027966245957053572, "grad_norm": 17.75, "learning_rate": 3.7250769978926893e-06, "loss": 1.6556111145019532, "mean_token_accuracy": 0.6142888140678405, "num_tokens": 560891.0, "step": 1150 }, { "entropy": 1.7472346234321594, "epoch": 0.02918216969431677, "grad_norm": 11.5625, "learning_rate": 3.887177824606906e-06, "loss": 1.7839370727539063, "mean_token_accuracy": 0.607236179113388, "num_tokens": 582776.0, "step": 1200 }, { "entropy": 1.5802231597900391, "epoch": 0.03039809343157997, "grad_norm": 12.1875, "learning_rate": 4.049278651321122e-06, "loss": 1.5907369995117187, "mean_token_accuracy": 0.6316962081193924, "num_tokens": 611600.0, "step": 1250 }, { "entropy": 1.5554675936698914, "epoch": 0.03161401716884317, "grad_norm": 15.4375, "learning_rate": 4.2113794780353386e-06, "loss": 1.5523271179199218, "mean_token_accuracy": 0.6353584504127503, "num_tokens": 636188.0, "step": 1300 }, { "entropy": 1.7447377490997313, "epoch": 0.03282994090610637, "grad_norm": 34.5, "learning_rate": 4.373480304749554e-06, "loss": 1.782577362060547, "mean_token_accuracy": 0.6015062314271927, "num_tokens": 660104.0, "step": 1350 }, { "entropy": 1.6731853330135344, "epoch": 0.03404586464336957, "grad_norm": 30.5, "learning_rate": 4.535581131463771e-06, "loss": 1.6792092895507813, "mean_token_accuracy": 0.62296560049057, "num_tokens": 680848.0, "step": 1400 }, { "entropy": 1.4956641221046447, "epoch": 0.035261788380632764, "grad_norm": 13.625, "learning_rate": 4.697681958177987e-06, "loss": 1.5146919250488282, "mean_token_accuracy": 0.6385853987932205, "num_tokens": 704322.0, "step": 1450 }, { "entropy": 1.678462438583374, "epoch": 0.036477712117895966, "grad_norm": 14.6875, "learning_rate": 4.859782784892203e-06, "loss": 1.6970613098144531, "mean_token_accuracy": 0.6119580936431884, "num_tokens": 730981.0, "step": 1500 }, { "entropy": 1.4885662651062013, "epoch": 0.03769363585515916, "grad_norm": 12.1875, "learning_rate": 5.02188361160642e-06, "loss": 1.4837728881835937, "mean_token_accuracy": 0.646560240983963, "num_tokens": 759499.0, "step": 1550 }, { "entropy": 1.6921260821819306, "epoch": 0.038909559592422364, "grad_norm": 18.75, "learning_rate": 5.183984438320636e-06, "loss": 1.7565621948242187, "mean_token_accuracy": 0.6040036684274673, "num_tokens": 781886.0, "step": 1600 }, { "entropy": 1.554752072095871, "epoch": 0.04012548332968556, "grad_norm": 17.125, "learning_rate": 5.346085265034852e-06, "loss": 1.5662123107910155, "mean_token_accuracy": 0.6267619240283966, "num_tokens": 810270.0, "step": 1650 }, { "entropy": 1.638468360900879, "epoch": 0.04134140706694876, "grad_norm": 28.0, "learning_rate": 5.508186091749068e-06, "loss": 1.6533805847167968, "mean_token_accuracy": 0.6208187502622604, "num_tokens": 835956.0, "step": 1700 }, { "entropy": 1.5301895558834075, "epoch": 0.04255733080421196, "grad_norm": 10.8125, "learning_rate": 5.670286918463285e-06, "loss": 1.5361077880859375, "mean_token_accuracy": 0.6400426685810089, "num_tokens": 862265.0, "step": 1750 }, { "entropy": 1.6788738763332367, "epoch": 0.04377325454147516, "grad_norm": 11.75, "learning_rate": 5.832387745177501e-06, "loss": 1.6939462280273438, "mean_token_accuracy": 0.6123456406593323, "num_tokens": 890140.0, "step": 1800 }, { "entropy": 1.5587623345851898, "epoch": 0.044989178278738357, "grad_norm": 28.0, "learning_rate": 5.994488571891717e-06, "loss": 1.5941902160644532, "mean_token_accuracy": 0.6195802873373032, "num_tokens": 914020.0, "step": 1850 }, { "entropy": 1.5674131083488465, "epoch": 0.04620510201600156, "grad_norm": 26.875, "learning_rate": 6.156589398605933e-06, "loss": 1.5720237731933593, "mean_token_accuracy": 0.6327799332141876, "num_tokens": 940767.0, "step": 1900 }, { "entropy": 1.5794262766838074, "epoch": 0.047421025753264755, "grad_norm": 17.0, "learning_rate": 6.31869022532015e-06, "loss": 1.5950434875488282, "mean_token_accuracy": 0.6300889700651169, "num_tokens": 964532.0, "step": 1950 }, { "entropy": 1.6117138254642487, "epoch": 0.04863694949052796, "grad_norm": 35.5, "learning_rate": 6.480791052034367e-06, "loss": 1.6434945678710937, "mean_token_accuracy": 0.6230405074357986, "num_tokens": 991922.0, "step": 2000 }, { "entropy": 1.6922417318820953, "epoch": 0.04985287322779115, "grad_norm": 13.0625, "learning_rate": 6.642891878748582e-06, "loss": 1.7199708557128905, "mean_token_accuracy": 0.6093730437755585, "num_tokens": 1012589.0, "step": 2050 }, { "entropy": 1.5416508603096009, "epoch": 0.05106879696505435, "grad_norm": 16.75, "learning_rate": 6.804992705462799e-06, "loss": 1.5371168518066407, "mean_token_accuracy": 0.644890204668045, "num_tokens": 1038995.0, "step": 2100 }, { "entropy": 1.5054280638694764, "epoch": 0.05228472070231755, "grad_norm": 15.8125, "learning_rate": 6.967093532177015e-06, "loss": 1.513526611328125, "mean_token_accuracy": 0.6333647811412811, "num_tokens": 1062028.0, "step": 2150 }, { "entropy": 1.5385205233097077, "epoch": 0.05350064443958075, "grad_norm": 23.75, "learning_rate": 7.129194358891231e-06, "loss": 1.5517079162597656, "mean_token_accuracy": 0.6310275882482529, "num_tokens": 1085275.0, "step": 2200 }, { "entropy": 1.5780585902929305, "epoch": 0.05471656817684395, "grad_norm": 12.75, "learning_rate": 7.291295185605447e-06, "loss": 1.583107452392578, "mean_token_accuracy": 0.6326770955324172, "num_tokens": 1110496.0, "step": 2250 }, { "entropy": 1.4798088109493255, "epoch": 0.055932491914107145, "grad_norm": 13.5, "learning_rate": 7.4533960123196635e-06, "loss": 1.4789918518066407, "mean_token_accuracy": 0.6431665086746216, "num_tokens": 1137326.0, "step": 2300 }, { "entropy": 1.5962856280803681, "epoch": 0.05714841565137035, "grad_norm": 18.75, "learning_rate": 7.61549683903388e-06, "loss": 1.6417024230957031, "mean_token_accuracy": 0.6221880859136582, "num_tokens": 1160499.0, "step": 2350 }, { "entropy": 1.5746853697299956, "epoch": 0.05836433938863354, "grad_norm": 26.375, "learning_rate": 7.777597665748096e-06, "loss": 1.5826925659179687, "mean_token_accuracy": 0.6249787449836731, "num_tokens": 1185038.0, "step": 2400 }, { "entropy": 1.385513722896576, "epoch": 0.059580263125896746, "grad_norm": 28.125, "learning_rate": 7.939698492462312e-06, "loss": 1.389461212158203, "mean_token_accuracy": 0.6642078137397767, "num_tokens": 1207785.0, "step": 2450 }, { "entropy": 1.5219133865833283, "epoch": 0.06079618686315994, "grad_norm": 15.8125, "learning_rate": 8.101799319176529e-06, "loss": 1.5483840942382812, "mean_token_accuracy": 0.6332084041833878, "num_tokens": 1232546.0, "step": 2500 }, { "entropy": 1.5127995693683625, "epoch": 0.062012110600423144, "grad_norm": 24.875, "learning_rate": 8.263900145890745e-06, "loss": 1.53638671875, "mean_token_accuracy": 0.6372886282205582, "num_tokens": 1254933.0, "step": 2550 }, { "entropy": 1.5824606907367706, "epoch": 0.06322803433768634, "grad_norm": 13.875, "learning_rate": 8.42600097260496e-06, "loss": 1.6021611022949218, "mean_token_accuracy": 0.6279019457101822, "num_tokens": 1278316.0, "step": 2600 }, { "entropy": 1.4939566349983215, "epoch": 0.06444395807494954, "grad_norm": 17.875, "learning_rate": 8.588101799319178e-06, "loss": 1.5074623107910157, "mean_token_accuracy": 0.6422136145830154, "num_tokens": 1301411.0, "step": 2650 }, { "entropy": 1.5538059055805207, "epoch": 0.06565988181221274, "grad_norm": 13.0, "learning_rate": 8.750202626033393e-06, "loss": 1.5859347534179689, "mean_token_accuracy": 0.6212028527259826, "num_tokens": 1324443.0, "step": 2700 }, { "entropy": 1.567455997467041, "epoch": 0.06687580554947593, "grad_norm": 19.75, "learning_rate": 8.912303452747609e-06, "loss": 1.591248779296875, "mean_token_accuracy": 0.6292648339271545, "num_tokens": 1347747.0, "step": 2750 }, { "entropy": 1.4364886450767518, "epoch": 0.06809172928673914, "grad_norm": 16.375, "learning_rate": 9.074404279461826e-06, "loss": 1.4474269104003907, "mean_token_accuracy": 0.6486590111255646, "num_tokens": 1374196.0, "step": 2800 }, { "entropy": 1.5205616021156312, "epoch": 0.06930765302400234, "grad_norm": 21.375, "learning_rate": 9.236505106176043e-06, "loss": 1.543278045654297, "mean_token_accuracy": 0.6356135910749435, "num_tokens": 1395303.0, "step": 2850 }, { "entropy": 1.3929960095882417, "epoch": 0.07052357676126553, "grad_norm": 20.5, "learning_rate": 9.398605932890259e-06, "loss": 1.4070027160644532, "mean_token_accuracy": 0.6570306396484376, "num_tokens": 1422027.0, "step": 2900 }, { "entropy": 1.5233114314079286, "epoch": 0.07173950049852873, "grad_norm": 14.375, "learning_rate": 9.560706759604474e-06, "loss": 1.549197998046875, "mean_token_accuracy": 0.6362422639131546, "num_tokens": 1443469.0, "step": 2950 }, { "entropy": 1.5757437813282014, "epoch": 0.07295542423579193, "grad_norm": 21.25, "learning_rate": 9.722807586318692e-06, "loss": 1.5966835021972656, "mean_token_accuracy": 0.6204159265756607, "num_tokens": 1468592.0, "step": 3000 }, { "entropy": 1.456451005935669, "epoch": 0.07417134797305514, "grad_norm": 14.25, "learning_rate": 9.884908413032907e-06, "loss": 1.4693218994140624, "mean_token_accuracy": 0.6421864742040634, "num_tokens": 1489398.0, "step": 3050 }, { "entropy": 1.3885511684417724, "epoch": 0.07538727171031832, "grad_norm": 18.125, "learning_rate": 1.0047009239747123e-05, "loss": 1.4218399047851562, "mean_token_accuracy": 0.6468707519769669, "num_tokens": 1512646.0, "step": 3100 }, { "entropy": 1.4030572485923767, "epoch": 0.07660319544758153, "grad_norm": 16.875, "learning_rate": 1.020911006646134e-05, "loss": 1.4316555786132812, "mean_token_accuracy": 0.6563237577676773, "num_tokens": 1541030.0, "step": 3150 }, { "entropy": 1.492719452381134, "epoch": 0.07781911918484473, "grad_norm": 13.125, "learning_rate": 1.0371210893175556e-05, "loss": 1.536722412109375, "mean_token_accuracy": 0.6381469589471817, "num_tokens": 1564897.0, "step": 3200 }, { "entropy": 1.37459615111351, "epoch": 0.07903504292210793, "grad_norm": 23.25, "learning_rate": 1.0533311719889771e-05, "loss": 1.3721340942382811, "mean_token_accuracy": 0.6644773435592651, "num_tokens": 1588771.0, "step": 3250 }, { "entropy": 1.5302176141738892, "epoch": 0.08025096665937112, "grad_norm": 12.5, "learning_rate": 1.069541254660399e-05, "loss": 1.5313838195800782, "mean_token_accuracy": 0.6386620742082596, "num_tokens": 1614534.0, "step": 3300 }, { "entropy": 1.5269215285778046, "epoch": 0.08146689039663432, "grad_norm": 13.0625, "learning_rate": 1.0857513373318204e-05, "loss": 1.5401788330078126, "mean_token_accuracy": 0.635187155008316, "num_tokens": 1637211.0, "step": 3350 }, { "entropy": 1.4004489189386369, "epoch": 0.08268281413389753, "grad_norm": 38.75, "learning_rate": 1.101961420003242e-05, "loss": 1.4387033081054688, "mean_token_accuracy": 0.6467935192584991, "num_tokens": 1658603.0, "step": 3400 }, { "entropy": 1.4483498442173004, "epoch": 0.08389873787116071, "grad_norm": 14.0, "learning_rate": 1.1181715026746638e-05, "loss": 1.470782012939453, "mean_token_accuracy": 0.6470567601919174, "num_tokens": 1685905.0, "step": 3450 }, { "entropy": 1.5432403266429902, "epoch": 0.08511466160842392, "grad_norm": 9.0, "learning_rate": 1.1343815853460854e-05, "loss": 1.5546658325195313, "mean_token_accuracy": 0.6397872543334961, "num_tokens": 1710798.0, "step": 3500 }, { "entropy": 1.583395118713379, "epoch": 0.08633058534568712, "grad_norm": 26.25, "learning_rate": 1.150591668017507e-05, "loss": 1.593956298828125, "mean_token_accuracy": 0.6316499876976013, "num_tokens": 1736637.0, "step": 3550 }, { "entropy": 1.3510558140277862, "epoch": 0.08754650908295032, "grad_norm": 10.8125, "learning_rate": 1.1668017506889287e-05, "loss": 1.3948712158203125, "mean_token_accuracy": 0.6609514570236206, "num_tokens": 1761700.0, "step": 3600 }, { "entropy": 1.6313312339782715, "epoch": 0.08876243282021351, "grad_norm": 13.9375, "learning_rate": 1.1830118333603502e-05, "loss": 1.665828857421875, "mean_token_accuracy": 0.6124255412817001, "num_tokens": 1785421.0, "step": 3650 }, { "entropy": 1.55949871301651, "epoch": 0.08997835655747671, "grad_norm": 15.6875, "learning_rate": 1.199221916031772e-05, "loss": 1.5649404907226563, "mean_token_accuracy": 0.6313118708133697, "num_tokens": 1805720.0, "step": 3700 }, { "entropy": 1.5290585064888, "epoch": 0.09119428029473992, "grad_norm": 20.25, "learning_rate": 1.2154319987031935e-05, "loss": 1.5403419494628907, "mean_token_accuracy": 0.6313535642623901, "num_tokens": 1829044.0, "step": 3750 }, { "entropy": 1.4771985065937043, "epoch": 0.09241020403200312, "grad_norm": 12.25, "learning_rate": 1.231642081374615e-05, "loss": 1.504022216796875, "mean_token_accuracy": 0.6402160704135895, "num_tokens": 1851548.0, "step": 3800 }, { "entropy": 1.4588620173931122, "epoch": 0.0936261277692663, "grad_norm": 21.25, "learning_rate": 1.2478521640460368e-05, "loss": 1.478091278076172, "mean_token_accuracy": 0.6467790853977203, "num_tokens": 1874542.0, "step": 3850 }, { "entropy": 1.5040128922462463, "epoch": 0.09484205150652951, "grad_norm": 33.0, "learning_rate": 1.2640622467174584e-05, "loss": 1.521408233642578, "mean_token_accuracy": 0.6404318171739578, "num_tokens": 1898221.0, "step": 3900 }, { "entropy": 1.4508005261421204, "epoch": 0.09605797524379271, "grad_norm": 18.875, "learning_rate": 1.28027232938888e-05, "loss": 1.474298095703125, "mean_token_accuracy": 0.6376293307542801, "num_tokens": 1922022.0, "step": 3950 }, { "entropy": 1.453290513753891, "epoch": 0.09727389898105591, "grad_norm": 16.125, "learning_rate": 1.2964824120603017e-05, "loss": 1.490994873046875, "mean_token_accuracy": 0.6439892292022705, "num_tokens": 1944826.0, "step": 4000 }, { "entropy": 1.550618554353714, "epoch": 0.0984898227183191, "grad_norm": 13.25, "learning_rate": 1.3126924947317232e-05, "loss": 1.56427490234375, "mean_token_accuracy": 0.6199309819936752, "num_tokens": 1971391.0, "step": 4050 }, { "entropy": 1.4941031754016876, "epoch": 0.0997057464555823, "grad_norm": 16.875, "learning_rate": 1.3289025774031448e-05, "loss": 1.5170695495605468, "mean_token_accuracy": 0.6431309616565705, "num_tokens": 1994305.0, "step": 4100 }, { "entropy": 1.518970759510994, "epoch": 0.10092167019284551, "grad_norm": 31.125, "learning_rate": 1.3451126600745665e-05, "loss": 1.5380148315429687, "mean_token_accuracy": 0.631448637843132, "num_tokens": 2017941.0, "step": 4150 }, { "entropy": 1.5606809628009797, "epoch": 0.1021375939301087, "grad_norm": 8.6875, "learning_rate": 1.361322742745988e-05, "loss": 1.5783900451660156, "mean_token_accuracy": 0.6328846895694733, "num_tokens": 2040508.0, "step": 4200 }, { "entropy": 1.4725416994094849, "epoch": 0.1033535176673719, "grad_norm": 23.125, "learning_rate": 1.3775328254174098e-05, "loss": 1.4994126892089843, "mean_token_accuracy": 0.6423375242948532, "num_tokens": 2067852.0, "step": 4250 }, { "entropy": 1.5680527472496033, "epoch": 0.1045694414046351, "grad_norm": 42.25, "learning_rate": 1.3937429080888313e-05, "loss": 1.5853048706054687, "mean_token_accuracy": 0.6254552721977233, "num_tokens": 2091489.0, "step": 4300 }, { "entropy": 1.5139513254165649, "epoch": 0.1057853651418983, "grad_norm": 12.0, "learning_rate": 1.4099529907602529e-05, "loss": 1.5160346984863282, "mean_token_accuracy": 0.633191146850586, "num_tokens": 2117429.0, "step": 4350 }, { "entropy": 1.4020550954341888, "epoch": 0.1070012888791615, "grad_norm": 12.8125, "learning_rate": 1.4261630734316748e-05, "loss": 1.4279100036621093, "mean_token_accuracy": 0.6437346935272217, "num_tokens": 2142044.0, "step": 4400 }, { "entropy": 1.4561577689647676, "epoch": 0.1082172126164247, "grad_norm": 12.9375, "learning_rate": 1.4423731561030962e-05, "loss": 1.4733489990234374, "mean_token_accuracy": 0.6401446509361267, "num_tokens": 2164259.0, "step": 4450 }, { "entropy": 1.6199225294589996, "epoch": 0.1094331363536879, "grad_norm": 16.75, "learning_rate": 1.4585832387745177e-05, "loss": 1.628731689453125, "mean_token_accuracy": 0.6181291300058365, "num_tokens": 2190229.0, "step": 4500 }, { "entropy": 1.4404736125469209, "epoch": 0.1106490600909511, "grad_norm": 19.25, "learning_rate": 1.4747933214459396e-05, "loss": 1.4796826171875, "mean_token_accuracy": 0.6445726370811462, "num_tokens": 2214325.0, "step": 4550 }, { "entropy": 1.5006324470043182, "epoch": 0.11186498382821429, "grad_norm": 32.25, "learning_rate": 1.4910034041173612e-05, "loss": 1.5061546325683595, "mean_token_accuracy": 0.6348070406913757, "num_tokens": 2234413.0, "step": 4600 }, { "entropy": 1.445968635082245, "epoch": 0.11308090756547749, "grad_norm": 9.5625, "learning_rate": 1.5072134867887827e-05, "loss": 1.4621702575683593, "mean_token_accuracy": 0.6432409322261811, "num_tokens": 2256467.0, "step": 4650 }, { "entropy": 1.4061364006996155, "epoch": 0.1142968313027407, "grad_norm": 15.3125, "learning_rate": 1.5234235694602045e-05, "loss": 1.411553192138672, "mean_token_accuracy": 0.6506155019998551, "num_tokens": 2276295.0, "step": 4700 }, { "entropy": 1.4272890830039977, "epoch": 0.1155127550400039, "grad_norm": 13.1875, "learning_rate": 1.539633652131626e-05, "loss": 1.4479811096191406, "mean_token_accuracy": 0.6441948717832565, "num_tokens": 2299091.0, "step": 4750 }, { "entropy": 1.383376832008362, "epoch": 0.11672867877726709, "grad_norm": 14.1875, "learning_rate": 1.5558437348030474e-05, "loss": 1.4205516052246094, "mean_token_accuracy": 0.656414099931717, "num_tokens": 2320081.0, "step": 4800 }, { "entropy": 1.4523517096042633, "epoch": 0.11794460251453029, "grad_norm": 13.3125, "learning_rate": 1.5720538174744693e-05, "loss": 1.4595103454589844, "mean_token_accuracy": 0.652843508720398, "num_tokens": 2346831.0, "step": 4850 }, { "entropy": 1.4729406583309173, "epoch": 0.11916052625179349, "grad_norm": 16.0, "learning_rate": 1.588263900145891e-05, "loss": 1.495215606689453, "mean_token_accuracy": 0.6485115504264831, "num_tokens": 2370393.0, "step": 4900 }, { "entropy": 1.4624007964134216, "epoch": 0.12037644998905668, "grad_norm": 12.4375, "learning_rate": 1.6044739828173128e-05, "loss": 1.4809745788574218, "mean_token_accuracy": 0.6382767295837403, "num_tokens": 2393925.0, "step": 4950 }, { "entropy": 1.5056415271759034, "epoch": 0.12159237372631988, "grad_norm": 15.0625, "learning_rate": 1.620684065488734e-05, "loss": 1.5284567260742188, "mean_token_accuracy": 0.6304517805576324, "num_tokens": 2418985.0, "step": 5000 }, { "entropy": 1.357025750875473, "epoch": 0.12280829746358309, "grad_norm": 16.625, "learning_rate": 1.6368941481601555e-05, "loss": 1.3810400390625, "mean_token_accuracy": 0.6591059362888336, "num_tokens": 2442622.0, "step": 5050 }, { "entropy": 1.4365251207351684, "epoch": 0.12402422120084629, "grad_norm": 12.875, "learning_rate": 1.6531042308315774e-05, "loss": 1.4627096557617187, "mean_token_accuracy": 0.6473317009210586, "num_tokens": 2469320.0, "step": 5100 }, { "entropy": 1.4312341761589051, "epoch": 0.12524014493810948, "grad_norm": 25.125, "learning_rate": 1.669314313502999e-05, "loss": 1.4464021301269532, "mean_token_accuracy": 0.639363893866539, "num_tokens": 2492381.0, "step": 5150 }, { "entropy": 1.4435406112670899, "epoch": 0.12645606867537268, "grad_norm": 30.375, "learning_rate": 1.6855243961744205e-05, "loss": 1.4525704956054688, "mean_token_accuracy": 0.639933915734291, "num_tokens": 2518642.0, "step": 5200 }, { "entropy": 1.368011245727539, "epoch": 0.12767199241263588, "grad_norm": 16.375, "learning_rate": 1.7017344788458424e-05, "loss": 1.3863734436035156, "mean_token_accuracy": 0.6589767926931381, "num_tokens": 2542168.0, "step": 5250 }, { "entropy": 1.413929327726364, "epoch": 0.12888791614989908, "grad_norm": 13.75, "learning_rate": 1.717944561517264e-05, "loss": 1.4329158020019532, "mean_token_accuracy": 0.6527084410190582, "num_tokens": 2566205.0, "step": 5300 }, { "entropy": 1.4252416801452636, "epoch": 0.1301038398871623, "grad_norm": 19.75, "learning_rate": 1.7341546441886855e-05, "loss": 1.4323243713378906, "mean_token_accuracy": 0.648655309677124, "num_tokens": 2593806.0, "step": 5350 }, { "entropy": 1.4800836110115052, "epoch": 0.1313197636244255, "grad_norm": 24.0, "learning_rate": 1.750364726860107e-05, "loss": 1.4900382995605468, "mean_token_accuracy": 0.6402974653244019, "num_tokens": 2614610.0, "step": 5400 }, { "entropy": 1.478970912694931, "epoch": 0.13253568736168866, "grad_norm": 21.375, "learning_rate": 1.7665748095315287e-05, "loss": 1.5252902221679687, "mean_token_accuracy": 0.6400543594360352, "num_tokens": 2638508.0, "step": 5450 }, { "entropy": 1.3980468690395356, "epoch": 0.13375161109895187, "grad_norm": 12.0, "learning_rate": 1.7827848922029506e-05, "loss": 1.4086732482910156, "mean_token_accuracy": 0.6514744007587433, "num_tokens": 2661321.0, "step": 5500 }, { "entropy": 1.4723073470592498, "epoch": 0.13496753483621507, "grad_norm": 21.25, "learning_rate": 1.798994974874372e-05, "loss": 1.4912113952636719, "mean_token_accuracy": 0.6386963146924972, "num_tokens": 2683870.0, "step": 5550 }, { "entropy": 1.4292308163642884, "epoch": 0.13618345857347827, "grad_norm": 12.5625, "learning_rate": 1.8152050575457937e-05, "loss": 1.4589237976074219, "mean_token_accuracy": 0.649688446521759, "num_tokens": 2710671.0, "step": 5600 }, { "entropy": 1.5057873964309691, "epoch": 0.13739938231074147, "grad_norm": 12.4375, "learning_rate": 1.8314151402172152e-05, "loss": 1.5193449401855468, "mean_token_accuracy": 0.6351486265659332, "num_tokens": 2735204.0, "step": 5650 }, { "entropy": 1.4096543097496033, "epoch": 0.13861530604800468, "grad_norm": 18.375, "learning_rate": 1.8476252228886368e-05, "loss": 1.4426884460449219, "mean_token_accuracy": 0.6421583139896393, "num_tokens": 2757726.0, "step": 5700 }, { "entropy": 1.45072083234787, "epoch": 0.13983122978526788, "grad_norm": 17.625, "learning_rate": 1.8638353055600583e-05, "loss": 1.4809243774414063, "mean_token_accuracy": 0.6484523755311966, "num_tokens": 2781587.0, "step": 5750 }, { "entropy": 1.4184762823581696, "epoch": 0.14104715352253105, "grad_norm": 17.0, "learning_rate": 1.8800453882314802e-05, "loss": 1.4281791687011718, "mean_token_accuracy": 0.6526647561788559, "num_tokens": 2804787.0, "step": 5800 }, { "entropy": 1.5409626805782317, "epoch": 0.14226307725979426, "grad_norm": 14.625, "learning_rate": 1.8962554709029018e-05, "loss": 1.584885711669922, "mean_token_accuracy": 0.6272022634744644, "num_tokens": 2823883.0, "step": 5850 }, { "entropy": 1.4843133807182312, "epoch": 0.14347900099705746, "grad_norm": 9.0625, "learning_rate": 1.9124655535743234e-05, "loss": 1.513236541748047, "mean_token_accuracy": 0.6381356942653656, "num_tokens": 2846240.0, "step": 5900 }, { "entropy": 1.3732129693031312, "epoch": 0.14469492473432066, "grad_norm": 20.625, "learning_rate": 1.928675636245745e-05, "loss": 1.3867538452148438, "mean_token_accuracy": 0.6593318802118301, "num_tokens": 2866858.0, "step": 5950 }, { "entropy": 1.3941351449489594, "epoch": 0.14591084847158386, "grad_norm": 10.75, "learning_rate": 1.9448857189171665e-05, "loss": 1.4219868469238282, "mean_token_accuracy": 0.6577207517623901, "num_tokens": 2888063.0, "step": 6000 }, { "entropy": 1.3301638638973237, "epoch": 0.14712677220884707, "grad_norm": 8.375, "learning_rate": 1.9610958015885884e-05, "loss": 1.3678416442871093, "mean_token_accuracy": 0.663114618062973, "num_tokens": 2913767.0, "step": 6050 }, { "entropy": 1.3696937215328218, "epoch": 0.14834269594611027, "grad_norm": 20.625, "learning_rate": 1.97730588426001e-05, "loss": 1.3951480102539062, "mean_token_accuracy": 0.6484826403856278, "num_tokens": 2935586.0, "step": 6100 }, { "entropy": 1.4353530180454255, "epoch": 0.14955861968337347, "grad_norm": 17.25, "learning_rate": 1.9935159669314315e-05, "loss": 1.4695782470703125, "mean_token_accuracy": 0.6468281590938568, "num_tokens": 2956261.0, "step": 6150 }, { "entropy": 1.582057716846466, "epoch": 0.15077454342063665, "grad_norm": 12.9375, "learning_rate": 1.9999996766286944e-05, "loss": 1.5857652282714845, "mean_token_accuracy": 0.623684932589531, "num_tokens": 2980757.0, "step": 6200 }, { "entropy": 1.4491410672664642, "epoch": 0.15199046715789985, "grad_norm": 27.5, "learning_rate": 1.999997700471471e-05, "loss": 1.4714703369140625, "mean_token_accuracy": 0.6362096995115281, "num_tokens": 3008792.0, "step": 6250 }, { "entropy": 1.477791155576706, "epoch": 0.15320639089516305, "grad_norm": 13.5625, "learning_rate": 1.9999939278112956e-05, "loss": 1.5068814086914062, "mean_token_accuracy": 0.6404138171672821, "num_tokens": 3031690.0, "step": 6300 }, { "entropy": 1.438215082883835, "epoch": 0.15442231463242626, "grad_norm": 29.625, "learning_rate": 1.999988358654946e-05, "loss": 1.4753460693359375, "mean_token_accuracy": 0.6516618031263351, "num_tokens": 3053496.0, "step": 6350 }, { "entropy": 1.4555094075202941, "epoch": 0.15563823836968946, "grad_norm": 13.4375, "learning_rate": 1.9999809930124273e-05, "loss": 1.4889549255371093, "mean_token_accuracy": 0.6522561889886856, "num_tokens": 3074132.0, "step": 6400 }, { "entropy": 1.4781759572029114, "epoch": 0.15685416210695266, "grad_norm": 14.625, "learning_rate": 1.9999718308969715e-05, "loss": 1.506678466796875, "mean_token_accuracy": 0.6417236030101776, "num_tokens": 3096837.0, "step": 6450 }, { "entropy": 1.389329754114151, "epoch": 0.15807008584421586, "grad_norm": 13.875, "learning_rate": 1.9999608723250387e-05, "loss": 1.4044952392578125, "mean_token_accuracy": 0.6513937681913375, "num_tokens": 3123038.0, "step": 6500 }, { "entropy": 1.4210147505998612, "epoch": 0.15928600958147904, "grad_norm": 16.25, "learning_rate": 1.999948117316316e-05, "loss": 1.4397543334960938, "mean_token_accuracy": 0.6514596009254455, "num_tokens": 3150366.0, "step": 6550 }, { "entropy": 1.416086037158966, "epoch": 0.16050193331874224, "grad_norm": 7.9375, "learning_rate": 1.9999335658937178e-05, "loss": 1.4254562377929687, "mean_token_accuracy": 0.6512540256977082, "num_tokens": 3176964.0, "step": 6600 }, { "entropy": 1.3843409407138825, "epoch": 0.16171785705600544, "grad_norm": 34.75, "learning_rate": 1.999917218083386e-05, "loss": 1.386285858154297, "mean_token_accuracy": 0.6555273520946503, "num_tokens": 3199676.0, "step": 6650 }, { "entropy": 1.4313719260692597, "epoch": 0.16293378079326865, "grad_norm": 16.25, "learning_rate": 1.999899073914689e-05, "loss": 1.4699916076660156, "mean_token_accuracy": 0.6433839970827102, "num_tokens": 3226215.0, "step": 6700 }, { "entropy": 1.4223970532417298, "epoch": 0.16414970453053185, "grad_norm": 13.8125, "learning_rate": 1.9998791334202238e-05, "loss": 1.4260699462890625, "mean_token_accuracy": 0.6485925984382629, "num_tokens": 3247605.0, "step": 6750 }, { "entropy": 1.5494444942474366, "epoch": 0.16536562826779505, "grad_norm": 11.3125, "learning_rate": 1.9998573966358132e-05, "loss": 1.5653074645996095, "mean_token_accuracy": 0.6212706965208054, "num_tokens": 3271836.0, "step": 6800 }, { "entropy": 1.316110999584198, "epoch": 0.16658155200505825, "grad_norm": 40.0, "learning_rate": 1.9998338636005075e-05, "loss": 1.3513818359375, "mean_token_accuracy": 0.6638361817598343, "num_tokens": 3291735.0, "step": 6850 }, { "entropy": 1.3604478681087493, "epoch": 0.16779747574232143, "grad_norm": 13.25, "learning_rate": 1.999808534356584e-05, "loss": 1.38185791015625, "mean_token_accuracy": 0.6588828003406525, "num_tokens": 3314195.0, "step": 6900 }, { "entropy": 1.3751016163825989, "epoch": 0.16901339947958463, "grad_norm": 13.3125, "learning_rate": 1.999781408949547e-05, "loss": 1.3841876220703124, "mean_token_accuracy": 0.6489233309030533, "num_tokens": 3342635.0, "step": 6950 }, { "entropy": 1.3994618821144105, "epoch": 0.17022932321684783, "grad_norm": 11.8125, "learning_rate": 1.9997524874281267e-05, "loss": 1.4228060913085938, "mean_token_accuracy": 0.6448993694782257, "num_tokens": 3367086.0, "step": 7000 }, { "entropy": 1.4444366884231568, "epoch": 0.17144524695411104, "grad_norm": 17.75, "learning_rate": 1.9997217698442818e-05, "loss": 1.4717689514160157, "mean_token_accuracy": 0.6449766361713409, "num_tokens": 3390756.0, "step": 7050 }, { "entropy": 1.3520021843910217, "epoch": 0.17266117069137424, "grad_norm": 9.125, "learning_rate": 1.9996892562531964e-05, "loss": 1.3962490844726563, "mean_token_accuracy": 0.6558065283298492, "num_tokens": 3414721.0, "step": 7100 }, { "entropy": 1.5260631638765334, "epoch": 0.17387709442863744, "grad_norm": 22.75, "learning_rate": 1.9996549467132814e-05, "loss": 1.5475566101074218, "mean_token_accuracy": 0.6248638820648194, "num_tokens": 3438943.0, "step": 7150 }, { "entropy": 1.4186983227729797, "epoch": 0.17509301816590064, "grad_norm": 9.375, "learning_rate": 1.9996188412861738e-05, "loss": 1.453130340576172, "mean_token_accuracy": 0.6424750190973282, "num_tokens": 3467347.0, "step": 7200 }, { "entropy": 1.371187574863434, "epoch": 0.17630894190316385, "grad_norm": 21.25, "learning_rate": 1.9995809400367375e-05, "loss": 1.3855265808105468, "mean_token_accuracy": 0.660103268623352, "num_tokens": 3487512.0, "step": 7250 }, { "entropy": 1.318514620065689, "epoch": 0.17752486564042702, "grad_norm": 11.8125, "learning_rate": 1.999541243033062e-05, "loss": 1.3372036743164062, "mean_token_accuracy": 0.6684290885925293, "num_tokens": 3510743.0, "step": 7300 }, { "entropy": 1.4096943950653076, "epoch": 0.17874078937769022, "grad_norm": 10.625, "learning_rate": 1.999499750346464e-05, "loss": 1.427345733642578, "mean_token_accuracy": 0.6545080327987671, "num_tokens": 3534229.0, "step": 7350 }, { "entropy": 1.3656837725639344, "epoch": 0.17995671311495343, "grad_norm": 14.375, "learning_rate": 1.9994564620514848e-05, "loss": 1.3841372680664064, "mean_token_accuracy": 0.662466898560524, "num_tokens": 3554561.0, "step": 7400 }, { "entropy": 1.4537952709197999, "epoch": 0.18117263685221663, "grad_norm": 17.625, "learning_rate": 1.9994113782258926e-05, "loss": 1.4680213928222656, "mean_token_accuracy": 0.6411845338344574, "num_tokens": 3579544.0, "step": 7450 }, { "entropy": 1.428108971118927, "epoch": 0.18238856058947983, "grad_norm": 18.0, "learning_rate": 1.9993644989506804e-05, "loss": 1.4648129272460937, "mean_token_accuracy": 0.6439302313327789, "num_tokens": 3604196.0, "step": 7500 }, { "entropy": 1.3567175853252411, "epoch": 0.18360448432674303, "grad_norm": 12.75, "learning_rate": 1.9993158243100674e-05, "loss": 1.3832569885253907, "mean_token_accuracy": 0.6598049914836883, "num_tokens": 3625400.0, "step": 7550 }, { "entropy": 1.5048708295822144, "epoch": 0.18482040806400624, "grad_norm": 32.75, "learning_rate": 1.9992653543914974e-05, "loss": 1.5334588623046874, "mean_token_accuracy": 0.631193453669548, "num_tokens": 3645056.0, "step": 7600 }, { "entropy": 1.324746962785721, "epoch": 0.1860363318012694, "grad_norm": 8.5, "learning_rate": 1.9992130892856406e-05, "loss": 1.3391998291015625, "mean_token_accuracy": 0.6633648931980133, "num_tokens": 3672112.0, "step": 7650 }, { "entropy": 1.2737340748310089, "epoch": 0.1872522555385326, "grad_norm": 11.0625, "learning_rate": 1.9991590290863917e-05, "loss": 1.3149012756347656, "mean_token_accuracy": 0.6726942735910416, "num_tokens": 3698112.0, "step": 7700 }, { "entropy": 1.401622622013092, "epoch": 0.18846817927579582, "grad_norm": 25.75, "learning_rate": 1.9991031738908697e-05, "loss": 1.4225611877441406, "mean_token_accuracy": 0.6519571566581726, "num_tokens": 3720018.0, "step": 7750 }, { "entropy": 1.4882529711723327, "epoch": 0.18968410301305902, "grad_norm": 14.625, "learning_rate": 1.999045523799419e-05, "loss": 1.5180319213867188, "mean_token_accuracy": 0.6387058049440384, "num_tokens": 3745300.0, "step": 7800 }, { "entropy": 1.3210166847705842, "epoch": 0.19090002675032222, "grad_norm": 13.0, "learning_rate": 1.9989860789156084e-05, "loss": 1.338840789794922, "mean_token_accuracy": 0.6743619084358216, "num_tokens": 3766642.0, "step": 7850 }, { "entropy": 1.4352219879627228, "epoch": 0.19211595048758542, "grad_norm": 22.875, "learning_rate": 1.9989248393462314e-05, "loss": 1.4517008972167968, "mean_token_accuracy": 0.6362943840026856, "num_tokens": 3786836.0, "step": 7900 }, { "entropy": 1.4541539388895035, "epoch": 0.19333187422484863, "grad_norm": 14.75, "learning_rate": 1.998861805201305e-05, "loss": 1.4839596557617187, "mean_token_accuracy": 0.6308450359106064, "num_tokens": 3809060.0, "step": 7950 }, { "entropy": 1.4515194153785707, "epoch": 0.19454779796211183, "grad_norm": 19.625, "learning_rate": 1.9987969765940707e-05, "loss": 1.4979852294921876, "mean_token_accuracy": 0.6437677538394928, "num_tokens": 3831455.0, "step": 8000 }, { "entropy": 1.3769458937644958, "epoch": 0.195763721699375, "grad_norm": 16.875, "learning_rate": 1.9987303536409935e-05, "loss": 1.368787841796875, "mean_token_accuracy": 0.6550470149517059, "num_tokens": 3856347.0, "step": 8050 }, { "entropy": 1.3586655569076538, "epoch": 0.1969796454366382, "grad_norm": 13.0, "learning_rate": 1.9986619364617615e-05, "loss": 1.3965447998046876, "mean_token_accuracy": 0.6604114055633545, "num_tokens": 3880902.0, "step": 8100 }, { "entropy": 1.418174958229065, "epoch": 0.1981955691739014, "grad_norm": 9.3125, "learning_rate": 1.9985917251792877e-05, "loss": 1.4301986694335938, "mean_token_accuracy": 0.6462347322702408, "num_tokens": 3904048.0, "step": 8150 }, { "entropy": 1.2731504744291307, "epoch": 0.1994114929111646, "grad_norm": 27.0, "learning_rate": 1.9985197199197058e-05, "loss": 1.289828338623047, "mean_token_accuracy": 0.6714211130142211, "num_tokens": 3927075.0, "step": 8200 }, { "entropy": 1.353734695315361, "epoch": 0.20062741664842781, "grad_norm": 14.5625, "learning_rate": 1.998445920812375e-05, "loss": 1.395948486328125, "mean_token_accuracy": 0.6597176039218903, "num_tokens": 3950634.0, "step": 8250 }, { "entropy": 1.549665174484253, "epoch": 0.20184334038569102, "grad_norm": 22.125, "learning_rate": 1.9983703279898756e-05, "loss": 1.5582119750976562, "mean_token_accuracy": 0.6361096531152726, "num_tokens": 3973068.0, "step": 8300 }, { "entropy": 1.3809632694721221, "epoch": 0.20305926412295422, "grad_norm": 21.75, "learning_rate": 1.9982929415880096e-05, "loss": 1.4242587280273438, "mean_token_accuracy": 0.646506444811821, "num_tokens": 3996909.0, "step": 8350 }, { "entropy": 1.41182710647583, "epoch": 0.2042751878602174, "grad_norm": 17.25, "learning_rate": 1.9982137617458037e-05, "loss": 1.4330474853515625, "mean_token_accuracy": 0.653466277718544, "num_tokens": 4021873.0, "step": 8400 }, { "entropy": 1.428081431388855, "epoch": 0.2054911115974806, "grad_norm": 14.375, "learning_rate": 1.9981327886055045e-05, "loss": 1.447379150390625, "mean_token_accuracy": 0.6474003106355667, "num_tokens": 4048007.0, "step": 8450 }, { "entropy": 1.3433163285255432, "epoch": 0.2067070353347438, "grad_norm": 7.46875, "learning_rate": 1.9980500223125805e-05, "loss": 1.365953369140625, "mean_token_accuracy": 0.6640760397911072, "num_tokens": 4070811.0, "step": 8500 }, { "entropy": 1.4667051482200621, "epoch": 0.207922959072007, "grad_norm": 10.1875, "learning_rate": 1.997965463015722e-05, "loss": 1.4962435913085939, "mean_token_accuracy": 0.6461986035108567, "num_tokens": 4095209.0, "step": 8550 }, { "entropy": 1.4608798694610596, "epoch": 0.2091388828092702, "grad_norm": 14.0, "learning_rate": 1.997879110866841e-05, "loss": 1.472832489013672, "mean_token_accuracy": 0.6428077042102813, "num_tokens": 4115962.0, "step": 8600 }, { "entropy": 1.4936528980731965, "epoch": 0.2103548065465334, "grad_norm": 14.25, "learning_rate": 1.997790966021069e-05, "loss": 1.51927490234375, "mean_token_accuracy": 0.6351505017280579, "num_tokens": 4144078.0, "step": 8650 }, { "entropy": 1.422002568244934, "epoch": 0.2115707302837966, "grad_norm": 19.375, "learning_rate": 1.9977010286367592e-05, "loss": 1.4494761657714843, "mean_token_accuracy": 0.6474332308769226, "num_tokens": 4169877.0, "step": 8700 }, { "entropy": 1.4593291258811951, "epoch": 0.2127866540210598, "grad_norm": 10.8125, "learning_rate": 1.9976092988754846e-05, "loss": 1.4586489868164063, "mean_token_accuracy": 0.6468196451663971, "num_tokens": 4192733.0, "step": 8750 }, { "entropy": 1.4480651116371155, "epoch": 0.214002577758323, "grad_norm": 15.125, "learning_rate": 1.9975157769020387e-05, "loss": 1.4826956176757813, "mean_token_accuracy": 0.6428126984834671, "num_tokens": 4217221.0, "step": 8800 }, { "entropy": 1.3266998064517974, "epoch": 0.2152185014955862, "grad_norm": 14.0625, "learning_rate": 1.997420462884434e-05, "loss": 1.3254013061523438, "mean_token_accuracy": 0.66106210231781, "num_tokens": 4242428.0, "step": 8850 }, { "entropy": 1.4742414259910583, "epoch": 0.2164344252328494, "grad_norm": 14.25, "learning_rate": 1.997323356993903e-05, "loss": 1.5112515258789063, "mean_token_accuracy": 0.6464951080083847, "num_tokens": 4266913.0, "step": 8900 }, { "entropy": 1.4189593636989593, "epoch": 0.2176503489701126, "grad_norm": 16.75, "learning_rate": 1.9972244594048972e-05, "loss": 1.4506854248046874, "mean_token_accuracy": 0.6427810025215149, "num_tokens": 4293112.0, "step": 8950 }, { "entropy": 1.2184300231933594, "epoch": 0.2188662727073758, "grad_norm": 8.5625, "learning_rate": 1.997123770295086e-05, "loss": 1.2323180389404298, "mean_token_accuracy": 0.6789401412010193, "num_tokens": 4318027.0, "step": 9000 }, { "entropy": 1.3231325697898866, "epoch": 0.220082196444639, "grad_norm": 11.75, "learning_rate": 1.9970212898453597e-05, "loss": 1.3379078674316407, "mean_token_accuracy": 0.6609453135728836, "num_tokens": 4344930.0, "step": 9050 }, { "entropy": 1.3748965734243392, "epoch": 0.2212981201819022, "grad_norm": 10.6875, "learning_rate": 1.9969170182398235e-05, "loss": 1.3943161010742187, "mean_token_accuracy": 0.6570073461532593, "num_tokens": 4367074.0, "step": 9100 }, { "entropy": 1.4903193187713624, "epoch": 0.22251404391916538, "grad_norm": 17.25, "learning_rate": 1.9968109556658033e-05, "loss": 1.5075881958007813, "mean_token_accuracy": 0.6363273197412491, "num_tokens": 4390324.0, "step": 9150 }, { "entropy": 1.343743063211441, "epoch": 0.22372996765642858, "grad_norm": 13.125, "learning_rate": 1.9967031023138408e-05, "loss": 1.400292205810547, "mean_token_accuracy": 0.6528400164842606, "num_tokens": 4416060.0, "step": 9200 }, { "entropy": 1.4334269964694977, "epoch": 0.22494589139369178, "grad_norm": 10.25, "learning_rate": 1.9965934583776948e-05, "loss": 1.4536843872070313, "mean_token_accuracy": 0.6462130582332611, "num_tokens": 4437959.0, "step": 9250 }, { "entropy": 1.3542230260372161, "epoch": 0.22616181513095499, "grad_norm": 12.8125, "learning_rate": 1.9964820240543422e-05, "loss": 1.3801271057128905, "mean_token_accuracy": 0.6595211905241013, "num_tokens": 4462098.0, "step": 9300 }, { "entropy": 1.4540643310546875, "epoch": 0.2273777388682182, "grad_norm": 17.625, "learning_rate": 1.9963687995439755e-05, "loss": 1.457567901611328, "mean_token_accuracy": 0.6543348044157028, "num_tokens": 4486575.0, "step": 9350 }, { "entropy": 1.441443372964859, "epoch": 0.2285936626054814, "grad_norm": 11.875, "learning_rate": 1.9962537850500028e-05, "loss": 1.463466796875, "mean_token_accuracy": 0.6509877395629883, "num_tokens": 4514332.0, "step": 9400 }, { "entropy": 1.4157578670978546, "epoch": 0.2298095863427446, "grad_norm": 12.875, "learning_rate": 1.9961369807790487e-05, "loss": 1.4331541442871094, "mean_token_accuracy": 0.6477777856588364, "num_tokens": 4536170.0, "step": 9450 }, { "entropy": 1.32439444065094, "epoch": 0.2310255100800078, "grad_norm": 13.875, "learning_rate": 1.996018386940953e-05, "loss": 1.3555912780761719, "mean_token_accuracy": 0.6675613570213318, "num_tokens": 4561050.0, "step": 9500 }, { "entropy": 1.4144940280914307, "epoch": 0.23224143381727097, "grad_norm": 22.625, "learning_rate": 1.99589800374877e-05, "loss": 1.4338632202148438, "mean_token_accuracy": 0.6514286041259766, "num_tokens": 4590005.0, "step": 9550 }, { "entropy": 1.3332500863075256, "epoch": 0.23345735755453417, "grad_norm": 15.875, "learning_rate": 1.9957758314187697e-05, "loss": 1.3446029663085937, "mean_token_accuracy": 0.6604770720005035, "num_tokens": 4612791.0, "step": 9600 }, { "entropy": 1.28560242831707, "epoch": 0.23467328129179738, "grad_norm": 18.5, "learning_rate": 1.995651870170435e-05, "loss": 1.322718963623047, "mean_token_accuracy": 0.6741638034582138, "num_tokens": 4638368.0, "step": 9650 }, { "entropy": 1.3478366100788117, "epoch": 0.23588920502906058, "grad_norm": 13.75, "learning_rate": 1.9955261202264636e-05, "loss": 1.3538250732421875, "mean_token_accuracy": 0.658582569360733, "num_tokens": 4662039.0, "step": 9700 }, { "entropy": 1.3146557772159577, "epoch": 0.23710512876632378, "grad_norm": 12.5, "learning_rate": 1.9953985818127655e-05, "loss": 1.3241146850585936, "mean_token_accuracy": 0.6698850983381271, "num_tokens": 4687145.0, "step": 9750 }, { "entropy": 1.4604837024211883, "epoch": 0.23832105250358698, "grad_norm": 9.6875, "learning_rate": 1.9952692551584648e-05, "loss": 1.4833985900878905, "mean_token_accuracy": 0.6412402665615082, "num_tokens": 4710156.0, "step": 9800 }, { "entropy": 1.4133691000938415, "epoch": 0.23953697624085019, "grad_norm": 11.6875, "learning_rate": 1.9951381404958976e-05, "loss": 1.4279647827148438, "mean_token_accuracy": 0.6547023403644562, "num_tokens": 4733562.0, "step": 9850 }, { "entropy": 1.327355801463127, "epoch": 0.24075289997811336, "grad_norm": 31.75, "learning_rate": 1.9950052380606123e-05, "loss": 1.338971405029297, "mean_token_accuracy": 0.673723783493042, "num_tokens": 4754860.0, "step": 9900 }, { "entropy": 1.3004949402809143, "epoch": 0.24196882371537656, "grad_norm": 8.9375, "learning_rate": 1.9948705480913694e-05, "loss": 1.3192561340332032, "mean_token_accuracy": 0.680578915476799, "num_tokens": 4779854.0, "step": 9950 }, { "entropy": 1.2827515828609466, "epoch": 0.24318474745263977, "grad_norm": 15.4375, "learning_rate": 1.99473407083014e-05, "loss": 1.3295083618164063, "mean_token_accuracy": 0.673518785238266, "num_tokens": 4803208.0, "step": 10000 }, { "epoch": 0.24318474745263977, "eval_entropy": 1.3634422828375472, "eval_loss": 1.3953608274459839, "eval_mean_token_accuracy": 0.6577719137623733, "eval_num_tokens": 4803208.0, "eval_runtime": 391.7818, "eval_samples_per_second": 11.662, "eval_steps_per_second": 11.662, "step": 10000 }, { "entropy": 1.4405075049400329, "epoch": 0.24440067118990297, "grad_norm": 13.125, "learning_rate": 1.9945958065221066e-05, "loss": 1.4597462463378905, "mean_token_accuracy": 0.6450659942626953, "num_tokens": 4822782.0, "step": 10050 }, { "entropy": 1.3897208321094512, "epoch": 0.24561659492716617, "grad_norm": 14.8125, "learning_rate": 1.994455755415662e-05, "loss": 1.4074008178710937, "mean_token_accuracy": 0.6519137012958527, "num_tokens": 4849911.0, "step": 10100 }, { "entropy": 1.384155832529068, "epoch": 0.24683251866442937, "grad_norm": 37.0, "learning_rate": 1.994313917762409e-05, "loss": 1.4199198913574218, "mean_token_accuracy": 0.6633716720342636, "num_tokens": 4870358.0, "step": 10150 }, { "entropy": 1.4569268202781678, "epoch": 0.24804844240169258, "grad_norm": 10.6875, "learning_rate": 1.9941702938171596e-05, "loss": 1.4695025634765626, "mean_token_accuracy": 0.6374693113565445, "num_tokens": 4894777.0, "step": 10200 }, { "entropy": 1.325004380941391, "epoch": 0.24926436613895578, "grad_norm": 10.875, "learning_rate": 1.994024883837936e-05, "loss": 1.3411891174316406, "mean_token_accuracy": 0.6654567927122116, "num_tokens": 4919065.0, "step": 10250 }, { "entropy": 1.3635669374465942, "epoch": 0.25048028987621895, "grad_norm": 13.625, "learning_rate": 1.993877688085968e-05, "loss": 1.3931698608398437, "mean_token_accuracy": 0.6610210704803466, "num_tokens": 4944095.0, "step": 10300 }, { "entropy": 1.3690563642978668, "epoch": 0.25169621361348216, "grad_norm": 22.75, "learning_rate": 1.9937287068256935e-05, "loss": 1.3963906860351563, "mean_token_accuracy": 0.6576419854164124, "num_tokens": 4963501.0, "step": 10350 }, { "entropy": 1.4601987373828889, "epoch": 0.25291213735074536, "grad_norm": 10.625, "learning_rate": 1.9935779403247584e-05, "loss": 1.4812765502929688, "mean_token_accuracy": 0.6394760447740555, "num_tokens": 4989071.0, "step": 10400 }, { "entropy": 1.3836608982086183, "epoch": 0.25412806108800856, "grad_norm": 18.25, "learning_rate": 1.9934253888540162e-05, "loss": 1.402833251953125, "mean_token_accuracy": 0.6581112742424011, "num_tokens": 5012870.0, "step": 10450 }, { "entropy": 1.3601440787315369, "epoch": 0.25534398482527176, "grad_norm": 14.25, "learning_rate": 1.993271052687526e-05, "loss": 1.3740946960449218, "mean_token_accuracy": 0.6592150765657425, "num_tokens": 5037353.0, "step": 10500 }, { "entropy": 1.2508243936300278, "epoch": 0.25655990856253497, "grad_norm": 8.1875, "learning_rate": 1.993114932102555e-05, "loss": 1.26514404296875, "mean_token_accuracy": 0.6827860379219055, "num_tokens": 5061084.0, "step": 10550 }, { "entropy": 1.3775366199016572, "epoch": 0.25777583229979817, "grad_norm": 17.5, "learning_rate": 1.9929570273795734e-05, "loss": 1.4158352661132811, "mean_token_accuracy": 0.6632061892747879, "num_tokens": 5082797.0, "step": 10600 }, { "entropy": 1.324419274330139, "epoch": 0.25899175603706137, "grad_norm": 15.875, "learning_rate": 1.9927973388022594e-05, "loss": 1.3593496704101562, "mean_token_accuracy": 0.6661099493503571, "num_tokens": 5105494.0, "step": 10650 }, { "entropy": 1.299939376115799, "epoch": 0.2602076797743246, "grad_norm": 12.4375, "learning_rate": 1.992635866657494e-05, "loss": 1.3060321044921874, "mean_token_accuracy": 0.6719350510835648, "num_tokens": 5130096.0, "step": 10700 }, { "entropy": 1.3315664291381837, "epoch": 0.2614236035115878, "grad_norm": 13.5625, "learning_rate": 1.9924726112353635e-05, "loss": 1.3584848022460938, "mean_token_accuracy": 0.6648550212383271, "num_tokens": 5153970.0, "step": 10750 }, { "entropy": 1.4555249679088593, "epoch": 0.262639527248851, "grad_norm": 18.75, "learning_rate": 1.992307572829157e-05, "loss": 1.4722076416015626, "mean_token_accuracy": 0.6415256208181381, "num_tokens": 5175107.0, "step": 10800 }, { "entropy": 1.4519980537891388, "epoch": 0.2638554509861141, "grad_norm": 17.875, "learning_rate": 1.9921407517353675e-05, "loss": 1.4467922973632812, "mean_token_accuracy": 0.653086895942688, "num_tokens": 5195985.0, "step": 10850 }, { "entropy": 1.3780898344516754, "epoch": 0.26507137472337733, "grad_norm": 11.375, "learning_rate": 1.99197214825369e-05, "loss": 1.4186203002929687, "mean_token_accuracy": 0.6571265757083893, "num_tokens": 5221752.0, "step": 10900 }, { "entropy": 1.3679954254627227, "epoch": 0.26628729846064053, "grad_norm": 17.375, "learning_rate": 1.9918017626870217e-05, "loss": 1.3826828002929688, "mean_token_accuracy": 0.6560606497526169, "num_tokens": 5245683.0, "step": 10950 }, { "entropy": 1.390333970785141, "epoch": 0.26750322219790373, "grad_norm": 15.625, "learning_rate": 1.991629595341462e-05, "loss": 1.4038157653808594, "mean_token_accuracy": 0.6516782116889953, "num_tokens": 5270410.0, "step": 11000 }, { "entropy": 1.426293585896492, "epoch": 0.26871914593516694, "grad_norm": 12.9375, "learning_rate": 1.9914556465263106e-05, "loss": 1.455247039794922, "mean_token_accuracy": 0.6538374018669129, "num_tokens": 5293150.0, "step": 11050 }, { "entropy": 1.3825832056999205, "epoch": 0.26993506967243014, "grad_norm": 11.9375, "learning_rate": 1.9912799165540678e-05, "loss": 1.3969204711914063, "mean_token_accuracy": 0.6515971791744232, "num_tokens": 5313569.0, "step": 11100 }, { "entropy": 1.446112298965454, "epoch": 0.27115099340969334, "grad_norm": 14.125, "learning_rate": 1.9911024057404333e-05, "loss": 1.4679776000976563, "mean_token_accuracy": 0.6422459930181503, "num_tokens": 5336206.0, "step": 11150 }, { "entropy": 1.383216608762741, "epoch": 0.27236691714695654, "grad_norm": 11.875, "learning_rate": 1.990923114404307e-05, "loss": 1.4096681213378905, "mean_token_accuracy": 0.6597874766588211, "num_tokens": 5357796.0, "step": 11200 }, { "entropy": 1.4007442843914033, "epoch": 0.27358284088421975, "grad_norm": 8.0, "learning_rate": 1.990742042867787e-05, "loss": 1.4349781799316406, "mean_token_accuracy": 0.6485688811540604, "num_tokens": 5386633.0, "step": 11250 }, { "entropy": 1.43349023938179, "epoch": 0.27479876462148295, "grad_norm": 11.8125, "learning_rate": 1.9905591914561694e-05, "loss": 1.4487321472167969, "mean_token_accuracy": 0.6549130123853684, "num_tokens": 5408089.0, "step": 11300 }, { "entropy": 1.3387443202733993, "epoch": 0.27601468835874615, "grad_norm": 19.0, "learning_rate": 1.990374560497948e-05, "loss": 1.354937744140625, "mean_token_accuracy": 0.6602998560667038, "num_tokens": 5434746.0, "step": 11350 }, { "entropy": 1.4188622093200685, "epoch": 0.27723061209600935, "grad_norm": 11.1875, "learning_rate": 1.990188150324814e-05, "loss": 1.4426919555664062, "mean_token_accuracy": 0.6618695271015167, "num_tokens": 5458359.0, "step": 11400 }, { "entropy": 1.452605711221695, "epoch": 0.27844653583327256, "grad_norm": 16.125, "learning_rate": 1.9899999612716546e-05, "loss": 1.4684849548339844, "mean_token_accuracy": 0.6434920489788055, "num_tokens": 5480235.0, "step": 11450 }, { "entropy": 1.3477305352687836, "epoch": 0.27966245957053576, "grad_norm": 35.5, "learning_rate": 1.989809993676552e-05, "loss": 1.3721090698242187, "mean_token_accuracy": 0.6501813805103303, "num_tokens": 5504185.0, "step": 11500 }, { "entropy": 1.4461342310905456, "epoch": 0.28087838330779896, "grad_norm": 19.125, "learning_rate": 1.9896182478807852e-05, "loss": 1.4391476440429687, "mean_token_accuracy": 0.6506197285652161, "num_tokens": 5525754.0, "step": 11550 }, { "entropy": 1.4783993935585023, "epoch": 0.2820943070450621, "grad_norm": 15.25, "learning_rate": 1.9894247242288264e-05, "loss": 1.5452786254882813, "mean_token_accuracy": 0.6398125952482223, "num_tokens": 5548011.0, "step": 11600 }, { "entropy": 1.4089122760295867, "epoch": 0.2833102307823253, "grad_norm": 38.25, "learning_rate": 1.9892294230683426e-05, "loss": 1.3949668884277344, "mean_token_accuracy": 0.6563910365104675, "num_tokens": 5571163.0, "step": 11650 }, { "entropy": 1.3170185434818267, "epoch": 0.2845261545195885, "grad_norm": 9.9375, "learning_rate": 1.9890323447501937e-05, "loss": 1.336775360107422, "mean_token_accuracy": 0.6741905057430267, "num_tokens": 5594757.0, "step": 11700 }, { "entropy": 1.2651079320907592, "epoch": 0.2857420782568517, "grad_norm": 13.0, "learning_rate": 1.9888334896284315e-05, "loss": 1.3105648803710936, "mean_token_accuracy": 0.6781243693828582, "num_tokens": 5618755.0, "step": 11750 }, { "entropy": 1.3939058923721312, "epoch": 0.2869580019941149, "grad_norm": 12.375, "learning_rate": 1.9886328580603016e-05, "loss": 1.4172285461425782, "mean_token_accuracy": 0.6551655411720276, "num_tokens": 5642949.0, "step": 11800 }, { "entropy": 1.4069949781894684, "epoch": 0.2881739257313781, "grad_norm": 10.5625, "learning_rate": 1.9884304504062398e-05, "loss": 1.4290065002441406, "mean_token_accuracy": 0.6588560628890991, "num_tokens": 5665464.0, "step": 11850 }, { "entropy": 1.3344212567806244, "epoch": 0.2893898494686413, "grad_norm": 22.625, "learning_rate": 1.9882262670298724e-05, "loss": 1.352523651123047, "mean_token_accuracy": 0.6565149760246277, "num_tokens": 5688520.0, "step": 11900 }, { "entropy": 1.4241867065429688, "epoch": 0.2906057732059045, "grad_norm": 18.5, "learning_rate": 1.9880203082980167e-05, "loss": 1.43351806640625, "mean_token_accuracy": 0.6516069859266281, "num_tokens": 5710062.0, "step": 11950 }, { "entropy": 1.407847911119461, "epoch": 0.29182169694316773, "grad_norm": 27.125, "learning_rate": 1.987812574580679e-05, "loss": 1.4364730834960937, "mean_token_accuracy": 0.6490225619077683, "num_tokens": 5734665.0, "step": 12000 }, { "entropy": 1.3852595067024231, "epoch": 0.29303762068043093, "grad_norm": 11.625, "learning_rate": 1.9876030662510543e-05, "loss": 1.4184376525878906, "mean_token_accuracy": 0.6633582043647767, "num_tokens": 5756409.0, "step": 12050 }, { "entropy": 1.3124177634716034, "epoch": 0.29425354441769414, "grad_norm": 11.1875, "learning_rate": 1.987391783685526e-05, "loss": 1.3344577026367188, "mean_token_accuracy": 0.6687740057706832, "num_tokens": 5773872.0, "step": 12100 }, { "entropy": 1.2994925904273986, "epoch": 0.29546946815495734, "grad_norm": 11.4375, "learning_rate": 1.9871787272636642e-05, "loss": 1.31047607421875, "mean_token_accuracy": 0.6716444098949432, "num_tokens": 5798098.0, "step": 12150 }, { "entropy": 1.3483073258399962, "epoch": 0.29668539189222054, "grad_norm": 12.6875, "learning_rate": 1.9869638973682267e-05, "loss": 1.367478790283203, "mean_token_accuracy": 0.6685003572702408, "num_tokens": 5820699.0, "step": 12200 }, { "entropy": 1.399265763759613, "epoch": 0.29790131562948374, "grad_norm": 17.875, "learning_rate": 1.986747294385157e-05, "loss": 1.4361648559570312, "mean_token_accuracy": 0.6523605555295944, "num_tokens": 5845168.0, "step": 12250 }, { "entropy": 1.5281825304031371, "epoch": 0.29911723936674695, "grad_norm": 18.25, "learning_rate": 1.986528918703584e-05, "loss": 1.553737335205078, "mean_token_accuracy": 0.6381681704521179, "num_tokens": 5870974.0, "step": 12300 }, { "entropy": 1.1699262797832488, "epoch": 0.3003331631040101, "grad_norm": 10.1875, "learning_rate": 1.9863087707158206e-05, "loss": 1.1904005432128906, "mean_token_accuracy": 0.6924188947677612, "num_tokens": 5895836.0, "step": 12350 }, { "entropy": 1.2982468473911286, "epoch": 0.3015490868412733, "grad_norm": 14.25, "learning_rate": 1.9860868508173646e-05, "loss": 1.335687255859375, "mean_token_accuracy": 0.6742163169384002, "num_tokens": 5918938.0, "step": 12400 }, { "entropy": 1.4104686403274536, "epoch": 0.3027650105785365, "grad_norm": 10.125, "learning_rate": 1.9858631594068965e-05, "loss": 1.4160772705078124, "mean_token_accuracy": 0.6559062296152115, "num_tokens": 5942627.0, "step": 12450 }, { "entropy": 1.4564190423488617, "epoch": 0.3039809343157997, "grad_norm": 10.25, "learning_rate": 1.9856376968862797e-05, "loss": 1.4695722961425781, "mean_token_accuracy": 0.6456558871269226, "num_tokens": 5968498.0, "step": 12500 }, { "entropy": 1.3163057637214661, "epoch": 0.3051968580530629, "grad_norm": 12.0625, "learning_rate": 1.9854104636605593e-05, "loss": 1.3310919189453125, "mean_token_accuracy": 0.6668384206295014, "num_tokens": 5996414.0, "step": 12550 }, { "entropy": 1.3332480788230896, "epoch": 0.3064127817903261, "grad_norm": 13.6875, "learning_rate": 1.985181460137961e-05, "loss": 1.3371315002441406, "mean_token_accuracy": 0.6618221086263657, "num_tokens": 6017116.0, "step": 12600 }, { "entropy": 1.3115463471412658, "epoch": 0.3076287055275893, "grad_norm": 13.0, "learning_rate": 1.9849506867298912e-05, "loss": 1.361874542236328, "mean_token_accuracy": 0.670465635061264, "num_tokens": 6038371.0, "step": 12650 }, { "entropy": 1.496714904308319, "epoch": 0.3088446292648525, "grad_norm": 12.375, "learning_rate": 1.9847181438509367e-05, "loss": 1.496382293701172, "mean_token_accuracy": 0.633553683757782, "num_tokens": 6061099.0, "step": 12700 }, { "entropy": 1.340651397705078, "epoch": 0.3100605530021157, "grad_norm": 14.375, "learning_rate": 1.984483831918862e-05, "loss": 1.368036651611328, "mean_token_accuracy": 0.6609839397668839, "num_tokens": 6084860.0, "step": 12750 }, { "entropy": 1.371337149143219, "epoch": 0.3112764767393789, "grad_norm": 7.65625, "learning_rate": 1.9842477513546097e-05, "loss": 1.3823930358886718, "mean_token_accuracy": 0.6674776977300644, "num_tokens": 6108634.0, "step": 12800 }, { "entropy": 1.3688041031360627, "epoch": 0.3124924004766421, "grad_norm": 13.625, "learning_rate": 1.984009902582301e-05, "loss": 1.4022319030761718, "mean_token_accuracy": 0.6531870496273041, "num_tokens": 6132266.0, "step": 12850 }, { "entropy": 1.2864142364263536, "epoch": 0.3137083242139053, "grad_norm": 27.375, "learning_rate": 1.9837702860292323e-05, "loss": 1.3285462951660156, "mean_token_accuracy": 0.6679386675357819, "num_tokens": 6154759.0, "step": 12900 }, { "entropy": 1.3575939416885376, "epoch": 0.3149242479511685, "grad_norm": 11.5625, "learning_rate": 1.9835289021258765e-05, "loss": 1.3745039367675782, "mean_token_accuracy": 0.6583252894878387, "num_tokens": 6180576.0, "step": 12950 }, { "entropy": 1.3699369782209396, "epoch": 0.3161401716884317, "grad_norm": 11.5, "learning_rate": 1.983285751305882e-05, "loss": 1.3821368408203125, "mean_token_accuracy": 0.6557186806201935, "num_tokens": 6206852.0, "step": 13000 }, { "entropy": 1.364334613084793, "epoch": 0.3173560954256949, "grad_norm": 17.625, "learning_rate": 1.9830408340060704e-05, "loss": 1.412694549560547, "mean_token_accuracy": 0.6561678564548492, "num_tokens": 6232462.0, "step": 13050 }, { "entropy": 1.3076295816898347, "epoch": 0.3185720191629581, "grad_norm": 16.5, "learning_rate": 1.9827941506664378e-05, "loss": 1.3237649536132812, "mean_token_accuracy": 0.6666223227977752, "num_tokens": 6256591.0, "step": 13100 }, { "entropy": 1.2743502974510192, "epoch": 0.3197879429002213, "grad_norm": 22.0, "learning_rate": 1.982545701730152e-05, "loss": 1.2788471221923827, "mean_token_accuracy": 0.6796506971120835, "num_tokens": 6277933.0, "step": 13150 }, { "entropy": 1.3931656634807588, "epoch": 0.3210038666374845, "grad_norm": 14.0625, "learning_rate": 1.982295487643554e-05, "loss": 1.4019422912597657, "mean_token_accuracy": 0.6654171240329743, "num_tokens": 6305114.0, "step": 13200 }, { "entropy": 1.2552088165283204, "epoch": 0.3222197903747477, "grad_norm": 9.9375, "learning_rate": 1.9820435088561543e-05, "loss": 1.2881381225585937, "mean_token_accuracy": 0.6655318236351013, "num_tokens": 6330471.0, "step": 13250 }, { "entropy": 1.4251118302345276, "epoch": 0.3234357141120109, "grad_norm": 25.0, "learning_rate": 1.9817897658206353e-05, "loss": 1.447893829345703, "mean_token_accuracy": 0.6532726192474365, "num_tokens": 6355386.0, "step": 13300 }, { "entropy": 1.3610649406909943, "epoch": 0.3246516378492741, "grad_norm": 11.875, "learning_rate": 1.981534258992848e-05, "loss": 1.3657151794433593, "mean_token_accuracy": 0.6628685653209686, "num_tokens": 6377769.0, "step": 13350 }, { "entropy": 1.48193168759346, "epoch": 0.3258675615865373, "grad_norm": 13.9375, "learning_rate": 1.9812769888318122e-05, "loss": 1.5057868957519531, "mean_token_accuracy": 0.6453309386968613, "num_tokens": 6402666.0, "step": 13400 }, { "entropy": 1.3141684699058533, "epoch": 0.3270834853238005, "grad_norm": 15.0625, "learning_rate": 1.9810179557997155e-05, "loss": 1.3363287353515625, "mean_token_accuracy": 0.6756933188438415, "num_tokens": 6423468.0, "step": 13450 }, { "entropy": 1.3354079985618592, "epoch": 0.3282994090610637, "grad_norm": 17.5, "learning_rate": 1.9807571603619125e-05, "loss": 1.3691647338867188, "mean_token_accuracy": 0.6640063011646271, "num_tokens": 6445792.0, "step": 13500 }, { "entropy": 1.3250058054924012, "epoch": 0.3295153327983269, "grad_norm": 19.125, "learning_rate": 1.9804946029869245e-05, "loss": 1.32216796875, "mean_token_accuracy": 0.666671826839447, "num_tokens": 6470030.0, "step": 13550 }, { "entropy": 1.342184989452362, "epoch": 0.3307312565355901, "grad_norm": 11.8125, "learning_rate": 1.980230284146437e-05, "loss": 1.3636636352539062, "mean_token_accuracy": 0.6643161237239837, "num_tokens": 6494272.0, "step": 13600 }, { "entropy": 1.2998303186893463, "epoch": 0.3319471802728533, "grad_norm": 21.875, "learning_rate": 1.979964204315301e-05, "loss": 1.3272611999511719, "mean_token_accuracy": 0.6690734219551087, "num_tokens": 6515561.0, "step": 13650 }, { "entropy": 1.4335996007919312, "epoch": 0.3331631040101165, "grad_norm": 18.0, "learning_rate": 1.9796963639715307e-05, "loss": 1.4603152465820313, "mean_token_accuracy": 0.6411524963378906, "num_tokens": 6539498.0, "step": 13700 }, { "entropy": 1.4037420177459716, "epoch": 0.3343790277473797, "grad_norm": 12.1875, "learning_rate": 1.9794267635963037e-05, "loss": 1.4043864440917968, "mean_token_accuracy": 0.6540981578826904, "num_tokens": 6565873.0, "step": 13750 }, { "entropy": 1.4457140374183655, "epoch": 0.33559495148464286, "grad_norm": 8.375, "learning_rate": 1.9791554036739583e-05, "loss": 1.4486589050292968, "mean_token_accuracy": 0.6504659283161164, "num_tokens": 6589205.0, "step": 13800 }, { "entropy": 1.3613514685630799, "epoch": 0.33681087522190606, "grad_norm": 11.4375, "learning_rate": 1.9788822846919946e-05, "loss": 1.4005308532714844, "mean_token_accuracy": 0.6504681122303009, "num_tokens": 6611232.0, "step": 13850 }, { "entropy": 1.3708666706085204, "epoch": 0.33802679895916926, "grad_norm": 14.25, "learning_rate": 1.978607407141073e-05, "loss": 1.408736572265625, "mean_token_accuracy": 0.6535277003049851, "num_tokens": 6636023.0, "step": 13900 }, { "entropy": 1.2733291399478912, "epoch": 0.33924272269643246, "grad_norm": 20.5, "learning_rate": 1.9783307715150134e-05, "loss": 1.2694283294677735, "mean_token_accuracy": 0.6805810058116912, "num_tokens": 6657699.0, "step": 13950 }, { "entropy": 1.402235826253891, "epoch": 0.34045864643369567, "grad_norm": 10.0625, "learning_rate": 1.978052378310793e-05, "loss": 1.4166754150390626, "mean_token_accuracy": 0.6551039123535156, "num_tokens": 6680358.0, "step": 14000 }, { "entropy": 1.320087846517563, "epoch": 0.34167457017095887, "grad_norm": 10.8125, "learning_rate": 1.977772228028547e-05, "loss": 1.3560577392578126, "mean_token_accuracy": 0.6636132740974426, "num_tokens": 6701156.0, "step": 14050 }, { "entropy": 1.3533766102790832, "epoch": 0.34289049390822207, "grad_norm": 15.8125, "learning_rate": 1.9774903211715683e-05, "loss": 1.3664378356933593, "mean_token_accuracy": 0.6650082421302795, "num_tokens": 6724152.0, "step": 14100 }, { "entropy": 1.3842066061496734, "epoch": 0.3441064176454853, "grad_norm": 11.25, "learning_rate": 1.9772066582463037e-05, "loss": 1.4075332641601563, "mean_token_accuracy": 0.6568050587177277, "num_tokens": 6750834.0, "step": 14150 }, { "entropy": 1.372491990327835, "epoch": 0.3453223413827485, "grad_norm": 9.5, "learning_rate": 1.976921239762356e-05, "loss": 1.405125732421875, "mean_token_accuracy": 0.6532856214046479, "num_tokens": 6776949.0, "step": 14200 }, { "entropy": 1.472944141626358, "epoch": 0.3465382651200117, "grad_norm": 8.5625, "learning_rate": 1.9766340662324816e-05, "loss": 1.4993121337890625, "mean_token_accuracy": 0.6439858031272888, "num_tokens": 6799728.0, "step": 14250 }, { "entropy": 1.3481272387504577, "epoch": 0.3477541888572749, "grad_norm": 14.0, "learning_rate": 1.9763451381725895e-05, "loss": 1.3657192993164062, "mean_token_accuracy": 0.6685252463817597, "num_tokens": 6821816.0, "step": 14300 }, { "entropy": 1.3680882561206817, "epoch": 0.3489701125945381, "grad_norm": 24.625, "learning_rate": 1.976054456101741e-05, "loss": 1.3800767517089845, "mean_token_accuracy": 0.6704615688323975, "num_tokens": 6846777.0, "step": 14350 }, { "entropy": 1.4021734178066254, "epoch": 0.3501860363318013, "grad_norm": 18.0, "learning_rate": 1.9757620205421492e-05, "loss": 1.449576873779297, "mean_token_accuracy": 0.6399721366167068, "num_tokens": 6868393.0, "step": 14400 }, { "entropy": 1.3507512521743774, "epoch": 0.3514019600690645, "grad_norm": 11.625, "learning_rate": 1.9754678320191755e-05, "loss": 1.3832998657226563, "mean_token_accuracy": 0.659759926199913, "num_tokens": 6888496.0, "step": 14450 }, { "entropy": 1.3310839438438415, "epoch": 0.3526178838063277, "grad_norm": 14.9375, "learning_rate": 1.9751718910613326e-05, "loss": 1.3656375122070312, "mean_token_accuracy": 0.6618148672580719, "num_tokens": 6916673.0, "step": 14500 }, { "entropy": 1.3405847084522247, "epoch": 0.35383380754359084, "grad_norm": 10.1875, "learning_rate": 1.97487419820028e-05, "loss": 1.3432722473144532, "mean_token_accuracy": 0.658719470500946, "num_tokens": 6945608.0, "step": 14550 }, { "entropy": 1.3814513540267945, "epoch": 0.35504973128085404, "grad_norm": 9.3125, "learning_rate": 1.9745747539708257e-05, "loss": 1.4135124206542968, "mean_token_accuracy": 0.6510818326473236, "num_tokens": 6967886.0, "step": 14600 }, { "entropy": 1.306699669957161, "epoch": 0.35626565501811724, "grad_norm": 24.25, "learning_rate": 1.974273558910923e-05, "loss": 1.3334230041503907, "mean_token_accuracy": 0.6715396279096604, "num_tokens": 6988597.0, "step": 14650 }, { "entropy": 1.4052578938007354, "epoch": 0.35748157875538045, "grad_norm": 27.125, "learning_rate": 1.9739706135616704e-05, "loss": 1.428536376953125, "mean_token_accuracy": 0.6554183566570282, "num_tokens": 7012897.0, "step": 14700 }, { "entropy": 1.3582494747638703, "epoch": 0.35869750249264365, "grad_norm": 19.875, "learning_rate": 1.973665918467312e-05, "loss": 1.3749063110351563, "mean_token_accuracy": 0.6525084978342056, "num_tokens": 7035038.0, "step": 14750 }, { "entropy": 1.2659495279192925, "epoch": 0.35991342622990685, "grad_norm": 18.375, "learning_rate": 1.9733594741752343e-05, "loss": 1.2867591857910157, "mean_token_accuracy": 0.6816987043619156, "num_tokens": 7056457.0, "step": 14800 }, { "entropy": 1.3294420689344406, "epoch": 0.36112934996717005, "grad_norm": 6.46875, "learning_rate": 1.9730512812359674e-05, "loss": 1.33912109375, "mean_token_accuracy": 0.6753888714313507, "num_tokens": 7082198.0, "step": 14850 }, { "entropy": 1.2871837103366852, "epoch": 0.36234527370443326, "grad_norm": 19.5, "learning_rate": 1.972741340203181e-05, "loss": 1.317548828125, "mean_token_accuracy": 0.6692567491531372, "num_tokens": 7103849.0, "step": 14900 }, { "entropy": 1.2596738147735596, "epoch": 0.36356119744169646, "grad_norm": 9.6875, "learning_rate": 1.9724296516336878e-05, "loss": 1.2753697204589844, "mean_token_accuracy": 0.6839113909006119, "num_tokens": 7127551.0, "step": 14950 }, { "entropy": 1.2845628851652144, "epoch": 0.36477712117895966, "grad_norm": 11.75, "learning_rate": 1.972116216087437e-05, "loss": 1.30343017578125, "mean_token_accuracy": 0.6655472505092621, "num_tokens": 7154013.0, "step": 15000 }, { "entropy": 1.3783683967590332, "epoch": 0.36599304491622286, "grad_norm": 16.75, "learning_rate": 1.9718010341275186e-05, "loss": 1.3806613159179688, "mean_token_accuracy": 0.669706079363823, "num_tokens": 7177866.0, "step": 15050 }, { "entropy": 1.3218148744106293, "epoch": 0.36720896865348607, "grad_norm": 10.125, "learning_rate": 1.971484106320159e-05, "loss": 1.3351339721679687, "mean_token_accuracy": 0.6743785917758942, "num_tokens": 7202557.0, "step": 15100 }, { "entropy": 1.351182392835617, "epoch": 0.36842489239074927, "grad_norm": 9.875, "learning_rate": 1.971165433234721e-05, "loss": 1.3941619873046875, "mean_token_accuracy": 0.6614046686887741, "num_tokens": 7231216.0, "step": 15150 }, { "entropy": 1.4141523826122284, "epoch": 0.3696408161280125, "grad_norm": 10.0, "learning_rate": 1.970845015443704e-05, "loss": 1.4193815612792968, "mean_token_accuracy": 0.6492344576120377, "num_tokens": 7254899.0, "step": 15200 }, { "entropy": 1.3329472720623017, "epoch": 0.3708567398652757, "grad_norm": 13.5625, "learning_rate": 1.97052285352274e-05, "loss": 1.348129425048828, "mean_token_accuracy": 0.6649358481168747, "num_tokens": 7281232.0, "step": 15250 }, { "entropy": 1.3759358096122742, "epoch": 0.3720726636025388, "grad_norm": 10.125, "learning_rate": 1.9701989480505953e-05, "loss": 1.393030548095703, "mean_token_accuracy": 0.6614656698703766, "num_tokens": 7302736.0, "step": 15300 }, { "entropy": 1.331213105916977, "epoch": 0.373288587339802, "grad_norm": 12.375, "learning_rate": 1.9698732996091683e-05, "loss": 1.3707992553710937, "mean_token_accuracy": 0.6558082485198975, "num_tokens": 7322069.0, "step": 15350 }, { "entropy": 1.3943939673900605, "epoch": 0.3745045110770652, "grad_norm": 10.9375, "learning_rate": 1.9695459087834893e-05, "loss": 1.4305287170410157, "mean_token_accuracy": 0.6547650533914566, "num_tokens": 7342779.0, "step": 15400 }, { "entropy": 1.4097651499509811, "epoch": 0.37572043481432843, "grad_norm": 11.25, "learning_rate": 1.9692167761617177e-05, "loss": 1.401673583984375, "mean_token_accuracy": 0.6525028043985367, "num_tokens": 7367510.0, "step": 15450 }, { "entropy": 1.348128719329834, "epoch": 0.37693635855159163, "grad_norm": 10.25, "learning_rate": 1.9688859023351426e-05, "loss": 1.3725396728515624, "mean_token_accuracy": 0.6528500199317933, "num_tokens": 7390784.0, "step": 15500 }, { "entropy": 1.348331207036972, "epoch": 0.37815228228885484, "grad_norm": 23.625, "learning_rate": 1.968553287898181e-05, "loss": 1.3845314025878905, "mean_token_accuracy": 0.6584691995382309, "num_tokens": 7416814.0, "step": 15550 }, { "entropy": 1.3747143590450286, "epoch": 0.37936820602611804, "grad_norm": 11.5, "learning_rate": 1.968218933448378e-05, "loss": 1.3823483276367188, "mean_token_accuracy": 0.6612180989980697, "num_tokens": 7439479.0, "step": 15600 }, { "entropy": 1.3415183436870575, "epoch": 0.38058412976338124, "grad_norm": 11.625, "learning_rate": 1.9678828395864028e-05, "loss": 1.3728912353515625, "mean_token_accuracy": 0.6576075261831283, "num_tokens": 7460238.0, "step": 15650 }, { "entropy": 1.348120265007019, "epoch": 0.38180005350064444, "grad_norm": 15.875, "learning_rate": 1.9675450069160506e-05, "loss": 1.3593112182617189, "mean_token_accuracy": 0.6662253117561341, "num_tokens": 7484792.0, "step": 15700 }, { "entropy": 1.2930688166618347, "epoch": 0.38301597723790765, "grad_norm": 31.125, "learning_rate": 1.9672054360442404e-05, "loss": 1.29710205078125, "mean_token_accuracy": 0.6797441148757934, "num_tokens": 7512176.0, "step": 15750 }, { "entropy": 1.3702389430999755, "epoch": 0.38423190097517085, "grad_norm": 26.375, "learning_rate": 1.9668641275810135e-05, "loss": 1.4037969970703126, "mean_token_accuracy": 0.6591525685787201, "num_tokens": 7540770.0, "step": 15800 }, { "entropy": 1.2843931591510773, "epoch": 0.38544782471243405, "grad_norm": 11.8125, "learning_rate": 1.9665210821395334e-05, "loss": 1.299756317138672, "mean_token_accuracy": 0.6734819734096527, "num_tokens": 7564482.0, "step": 15850 }, { "entropy": 1.40345567882061, "epoch": 0.38666374844969725, "grad_norm": 17.125, "learning_rate": 1.9661763003360828e-05, "loss": 1.4211683654785157, "mean_token_accuracy": 0.6546843355894089, "num_tokens": 7591083.0, "step": 15900 }, { "entropy": 1.301384568810463, "epoch": 0.38787967218696046, "grad_norm": 7.9375, "learning_rate": 1.9658297827900658e-05, "loss": 1.3391444396972656, "mean_token_accuracy": 0.6691087424755097, "num_tokens": 7617464.0, "step": 15950 }, { "entropy": 1.3249428725242616, "epoch": 0.38909559592422366, "grad_norm": 14.5625, "learning_rate": 1.9654815301240022e-05, "loss": 1.339383087158203, "mean_token_accuracy": 0.6676345002651215, "num_tokens": 7642107.0, "step": 16000 }, { "entropy": 1.3298946559429168, "epoch": 0.3903115196614868, "grad_norm": 17.375, "learning_rate": 1.9651315429635315e-05, "loss": 1.361123046875, "mean_token_accuracy": 0.6587427151203156, "num_tokens": 7664235.0, "step": 16050 }, { "entropy": 1.2816927671432494, "epoch": 0.39152744339875, "grad_norm": 13.5, "learning_rate": 1.9647798219374083e-05, "loss": 1.324132843017578, "mean_token_accuracy": 0.6740401709079742, "num_tokens": 7684971.0, "step": 16100 }, { "entropy": 1.3426925599575044, "epoch": 0.3927433671360132, "grad_norm": 11.5, "learning_rate": 1.9644263676775005e-05, "loss": 1.3588578796386719, "mean_token_accuracy": 0.6649782240390778, "num_tokens": 7711466.0, "step": 16150 }, { "entropy": 1.2486580526828766, "epoch": 0.3939592908732764, "grad_norm": 18.125, "learning_rate": 1.9640711808187928e-05, "loss": 1.2623871612548827, "mean_token_accuracy": 0.6803760981559753, "num_tokens": 7737297.0, "step": 16200 }, { "entropy": 1.3353942263126373, "epoch": 0.3951752146105396, "grad_norm": 19.0, "learning_rate": 1.96371426199938e-05, "loss": 1.3399244689941405, "mean_token_accuracy": 0.66956817984581, "num_tokens": 7761860.0, "step": 16250 }, { "entropy": 1.3143821513652802, "epoch": 0.3963911383478028, "grad_norm": 9.6875, "learning_rate": 1.9633556118604695e-05, "loss": 1.3398666381835938, "mean_token_accuracy": 0.663307643532753, "num_tokens": 7789696.0, "step": 16300 }, { "entropy": 1.3404415810108186, "epoch": 0.397607062085066, "grad_norm": 11.875, "learning_rate": 1.9629952310463788e-05, "loss": 1.360478973388672, "mean_token_accuracy": 0.6636138713359833, "num_tokens": 7811852.0, "step": 16350 }, { "entropy": 1.2569666278362275, "epoch": 0.3988229858223292, "grad_norm": 13.9375, "learning_rate": 1.9626331202045344e-05, "loss": 1.2638678741455078, "mean_token_accuracy": 0.6833014905452728, "num_tokens": 7837421.0, "step": 16400 }, { "entropy": 1.3410275077819824, "epoch": 0.4000389095595924, "grad_norm": 8.75, "learning_rate": 1.9622692799854712e-05, "loss": 1.358903045654297, "mean_token_accuracy": 0.6635140180587769, "num_tokens": 7868928.0, "step": 16450 }, { "entropy": 1.4212123942375183, "epoch": 0.40125483329685563, "grad_norm": 14.4375, "learning_rate": 1.961903711042831e-05, "loss": 1.46584716796875, "mean_token_accuracy": 0.6444900065660477, "num_tokens": 7889226.0, "step": 16500 }, { "entropy": 1.3355804657936097, "epoch": 0.40247075703411883, "grad_norm": 9.9375, "learning_rate": 1.9615364140333603e-05, "loss": 1.342626495361328, "mean_token_accuracy": 0.6685896790027619, "num_tokens": 7913905.0, "step": 16550 }, { "entropy": 1.2199784821271897, "epoch": 0.40368668077138203, "grad_norm": 23.125, "learning_rate": 1.9611673896169115e-05, "loss": 1.2244717407226562, "mean_token_accuracy": 0.6879379785060883, "num_tokens": 7938073.0, "step": 16600 }, { "entropy": 1.3338352990150453, "epoch": 0.40490260450864524, "grad_norm": 13.0, "learning_rate": 1.960796638456439e-05, "loss": 1.371741943359375, "mean_token_accuracy": 0.6606375831365585, "num_tokens": 7962262.0, "step": 16650 }, { "entropy": 1.4018150174617767, "epoch": 0.40611852824590844, "grad_norm": 15.625, "learning_rate": 1.960424161218e-05, "loss": 1.4248286437988282, "mean_token_accuracy": 0.6452900570631027, "num_tokens": 7984932.0, "step": 16700 }, { "entropy": 1.4673922860622406, "epoch": 0.40733445198317164, "grad_norm": 9.3125, "learning_rate": 1.9600499585707532e-05, "loss": 1.480220489501953, "mean_token_accuracy": 0.6449990385770797, "num_tokens": 8008561.0, "step": 16750 }, { "entropy": 1.250671375989914, "epoch": 0.4085503757204348, "grad_norm": 24.25, "learning_rate": 1.9596740311869553e-05, "loss": 1.2613917541503907, "mean_token_accuracy": 0.681624003648758, "num_tokens": 8031915.0, "step": 16800 }, { "entropy": 1.3035448157787324, "epoch": 0.409766299457698, "grad_norm": 19.875, "learning_rate": 1.959296379741963e-05, "loss": 1.3373255920410156, "mean_token_accuracy": 0.6691129720211029, "num_tokens": 8051128.0, "step": 16850 }, { "entropy": 1.3189557051658631, "epoch": 0.4109822231949612, "grad_norm": 22.25, "learning_rate": 1.95891700491423e-05, "loss": 1.343825225830078, "mean_token_accuracy": 0.6666096711158752, "num_tokens": 8072528.0, "step": 16900 }, { "entropy": 1.2530205535888672, "epoch": 0.4121981469322244, "grad_norm": 12.0625, "learning_rate": 1.9585359073853052e-05, "loss": 1.2794951629638671, "mean_token_accuracy": 0.6786214506626129, "num_tokens": 8097577.0, "step": 16950 }, { "entropy": 1.305523355603218, "epoch": 0.4134140706694876, "grad_norm": 12.8125, "learning_rate": 1.9581530878398338e-05, "loss": 1.3227667236328124, "mean_token_accuracy": 0.6719636541604995, "num_tokens": 8121250.0, "step": 17000 }, { "entropy": 1.3591957092285156, "epoch": 0.4146299944067508, "grad_norm": 9.9375, "learning_rate": 1.9577685469655534e-05, "loss": 1.3817080688476562, "mean_token_accuracy": 0.6601633387804031, "num_tokens": 8142231.0, "step": 17050 }, { "entropy": 1.287785371541977, "epoch": 0.415845918144014, "grad_norm": 15.5, "learning_rate": 1.9573822854532942e-05, "loss": 1.3446173095703124, "mean_token_accuracy": 0.6723465591669082, "num_tokens": 8165842.0, "step": 17100 }, { "entropy": 1.3249898982048034, "epoch": 0.4170618418812772, "grad_norm": 36.0, "learning_rate": 1.9569943039969782e-05, "loss": 1.3502384948730468, "mean_token_accuracy": 0.663458126783371, "num_tokens": 8191585.0, "step": 17150 }, { "entropy": 1.4236452329158782, "epoch": 0.4182777656185404, "grad_norm": 17.5, "learning_rate": 1.9566046032936166e-05, "loss": 1.4374853515625, "mean_token_accuracy": 0.6431270289421082, "num_tokens": 8217473.0, "step": 17200 }, { "entropy": 1.384716055393219, "epoch": 0.4194936893558036, "grad_norm": 11.75, "learning_rate": 1.9562131840433095e-05, "loss": 1.4250132751464843, "mean_token_accuracy": 0.6443495559692383, "num_tokens": 8238738.0, "step": 17250 }, { "entropy": 1.3473649954795837, "epoch": 0.4207096130930668, "grad_norm": 16.0, "learning_rate": 1.9558200469492445e-05, "loss": 1.3718504333496093, "mean_token_accuracy": 0.6553225868940353, "num_tokens": 8263210.0, "step": 17300 }, { "entropy": 1.3332541739940644, "epoch": 0.42192553683033, "grad_norm": 10.8125, "learning_rate": 1.9554251927176948e-05, "loss": 1.333675079345703, "mean_token_accuracy": 0.6753918206691742, "num_tokens": 8285052.0, "step": 17350 }, { "entropy": 1.32386834025383, "epoch": 0.4231414605675932, "grad_norm": 10.1875, "learning_rate": 1.955028622058019e-05, "loss": 1.3582894897460938, "mean_token_accuracy": 0.6622885763645172, "num_tokens": 8307830.0, "step": 17400 }, { "entropy": 1.3138227558135986, "epoch": 0.4243573843048564, "grad_norm": 28.75, "learning_rate": 1.954630335682659e-05, "loss": 1.337092742919922, "mean_token_accuracy": 0.6632252943515777, "num_tokens": 8330194.0, "step": 17450 }, { "entropy": 1.3081154453754424, "epoch": 0.4255733080421196, "grad_norm": 6.5, "learning_rate": 1.95423033430714e-05, "loss": 1.3343458557128907, "mean_token_accuracy": 0.6702668309211731, "num_tokens": 8354462.0, "step": 17500 }, { "entropy": 1.334598846435547, "epoch": 0.42678923177938277, "grad_norm": 15.375, "learning_rate": 1.9538286186500657e-05, "loss": 1.3216534423828126, "mean_token_accuracy": 0.6642862921953201, "num_tokens": 8375378.0, "step": 17550 }, { "entropy": 1.2683018243312836, "epoch": 0.428005155516646, "grad_norm": 27.0, "learning_rate": 1.9534251894331223e-05, "loss": 1.3191384887695312, "mean_token_accuracy": 0.6758394408226013, "num_tokens": 8399004.0, "step": 17600 }, { "entropy": 1.5451405084133147, "epoch": 0.4292210792539092, "grad_norm": 23.625, "learning_rate": 1.953020047381073e-05, "loss": 1.5525184631347657, "mean_token_accuracy": 0.636204658150673, "num_tokens": 8424463.0, "step": 17650 }, { "entropy": 1.4353686475753784, "epoch": 0.4304370029911724, "grad_norm": 11.625, "learning_rate": 1.952613193221758e-05, "loss": 1.4653851318359374, "mean_token_accuracy": 0.6461656403541565, "num_tokens": 8447810.0, "step": 17700 }, { "entropy": 1.3230755412578583, "epoch": 0.4316529267284356, "grad_norm": 10.4375, "learning_rate": 1.9522046276860945e-05, "loss": 1.3346868896484374, "mean_token_accuracy": 0.6611644065380097, "num_tokens": 8469213.0, "step": 17750 }, { "entropy": 1.323573524951935, "epoch": 0.4328688504656988, "grad_norm": 9.0625, "learning_rate": 1.9517943515080728e-05, "loss": 1.3581544494628905, "mean_token_accuracy": 0.6689886540174484, "num_tokens": 8494653.0, "step": 17800 }, { "entropy": 1.3139730679988861, "epoch": 0.434084774202962, "grad_norm": 27.0, "learning_rate": 1.9513823654247566e-05, "loss": 1.343040008544922, "mean_token_accuracy": 0.6703617048263549, "num_tokens": 8519414.0, "step": 17850 }, { "entropy": 1.4456289827823638, "epoch": 0.4353006979402252, "grad_norm": 13.625, "learning_rate": 1.950968670176282e-05, "loss": 1.4324411010742188, "mean_token_accuracy": 0.647697640657425, "num_tokens": 8542351.0, "step": 17900 }, { "entropy": 1.3289079892635345, "epoch": 0.4365166216774884, "grad_norm": 30.125, "learning_rate": 1.9505532665058563e-05, "loss": 1.3435369873046874, "mean_token_accuracy": 0.6646923875808716, "num_tokens": 8565894.0, "step": 17950 }, { "entropy": 1.2968444484472275, "epoch": 0.4377325454147516, "grad_norm": 15.375, "learning_rate": 1.9501361551597545e-05, "loss": 1.3322413635253907, "mean_token_accuracy": 0.6760247415304184, "num_tokens": 8589252.0, "step": 18000 }, { "entropy": 1.2782684803009032, "epoch": 0.4389484691520148, "grad_norm": 14.0, "learning_rate": 1.9497173368873195e-05, "loss": 1.2773072052001953, "mean_token_accuracy": 0.6696325647830963, "num_tokens": 8616553.0, "step": 18050 }, { "entropy": 1.3243836176395416, "epoch": 0.440164392889278, "grad_norm": 9.0, "learning_rate": 1.9492968124409625e-05, "loss": 1.3609857177734375, "mean_token_accuracy": 0.6620546442270279, "num_tokens": 8640642.0, "step": 18100 }, { "entropy": 1.3269654273986817, "epoch": 0.4413803166265412, "grad_norm": 15.25, "learning_rate": 1.9488745825761577e-05, "loss": 1.3496092224121095, "mean_token_accuracy": 0.663681583404541, "num_tokens": 8664447.0, "step": 18150 }, { "entropy": 1.295697456598282, "epoch": 0.4425962403638044, "grad_norm": 40.5, "learning_rate": 1.9484506480514445e-05, "loss": 1.3039933776855468, "mean_token_accuracy": 0.6680256175994873, "num_tokens": 8687389.0, "step": 18200 }, { "entropy": 1.3109469652175902, "epoch": 0.4438121641010676, "grad_norm": 14.5625, "learning_rate": 1.948025009628424e-05, "loss": 1.3374449157714843, "mean_token_accuracy": 0.6642142081260681, "num_tokens": 8712232.0, "step": 18250 }, { "entropy": 1.230424872636795, "epoch": 0.44502808783833075, "grad_norm": 7.78125, "learning_rate": 1.947597668071759e-05, "loss": 1.2407559967041015, "mean_token_accuracy": 0.6855530726909638, "num_tokens": 8736079.0, "step": 18300 }, { "entropy": 1.330066789984703, "epoch": 0.44624401157559396, "grad_norm": 14.5, "learning_rate": 1.9471686241491713e-05, "loss": 1.3380238342285156, "mean_token_accuracy": 0.6701244902610779, "num_tokens": 8758984.0, "step": 18350 }, { "entropy": 1.2937812173366547, "epoch": 0.44745993531285716, "grad_norm": 6.6875, "learning_rate": 1.9467378786314413e-05, "loss": 1.3180087280273438, "mean_token_accuracy": 0.6767090505361557, "num_tokens": 8783969.0, "step": 18400 }, { "entropy": 1.3533368176221847, "epoch": 0.44867585905012036, "grad_norm": 13.0, "learning_rate": 1.9463054322924068e-05, "loss": 1.382022705078125, "mean_token_accuracy": 0.6678499418497086, "num_tokens": 8806200.0, "step": 18450 }, { "entropy": 1.250989625453949, "epoch": 0.44989178278738357, "grad_norm": 14.0625, "learning_rate": 1.9458712859089604e-05, "loss": 1.2897526550292968, "mean_token_accuracy": 0.6740556871891021, "num_tokens": 8828583.0, "step": 18500 }, { "entropy": 1.402356127500534, "epoch": 0.45110770652464677, "grad_norm": 15.5625, "learning_rate": 1.945435440261049e-05, "loss": 1.4310250854492188, "mean_token_accuracy": 0.6537684804201126, "num_tokens": 8854651.0, "step": 18550 }, { "entropy": 1.3133639764785767, "epoch": 0.45232363026190997, "grad_norm": 23.375, "learning_rate": 1.9449978961316728e-05, "loss": 1.3189021301269532, "mean_token_accuracy": 0.6796930754184722, "num_tokens": 8876421.0, "step": 18600 }, { "entropy": 1.2162423133850098, "epoch": 0.4535395539991732, "grad_norm": 14.6875, "learning_rate": 1.944558654306883e-05, "loss": 1.2522057342529296, "mean_token_accuracy": 0.6835614454746246, "num_tokens": 8895341.0, "step": 18650 }, { "entropy": 1.337358944416046, "epoch": 0.4547554777364364, "grad_norm": 12.5, "learning_rate": 1.94411771557578e-05, "loss": 1.3765570068359374, "mean_token_accuracy": 0.6625603502988815, "num_tokens": 8919174.0, "step": 18700 }, { "entropy": 1.4019660663604736, "epoch": 0.4559714014736996, "grad_norm": 9.125, "learning_rate": 1.9436750807305137e-05, "loss": 1.4416159057617188, "mean_token_accuracy": 0.6487028443813324, "num_tokens": 8942315.0, "step": 18750 }, { "entropy": 1.3347241336107254, "epoch": 0.4571873252109628, "grad_norm": 15.4375, "learning_rate": 1.943230750566281e-05, "loss": 1.3478022766113282, "mean_token_accuracy": 0.6716166710853577, "num_tokens": 8963860.0, "step": 18800 }, { "entropy": 1.3247536742687225, "epoch": 0.458403248948226, "grad_norm": 14.0625, "learning_rate": 1.9427847258813237e-05, "loss": 1.3526986694335938, "mean_token_accuracy": 0.6709321486949921, "num_tokens": 8985728.0, "step": 18850 }, { "entropy": 1.28573803126812, "epoch": 0.4596191726854892, "grad_norm": 26.25, "learning_rate": 1.9423370074769288e-05, "loss": 1.3026007080078126, "mean_token_accuracy": 0.6779029333591461, "num_tokens": 9011454.0, "step": 18900 }, { "entropy": 1.2626501286029816, "epoch": 0.4608350964227524, "grad_norm": 10.5625, "learning_rate": 1.941887596157425e-05, "loss": 1.2911955261230468, "mean_token_accuracy": 0.6754503285884857, "num_tokens": 9038263.0, "step": 18950 }, { "entropy": 1.3394160491228104, "epoch": 0.4620510201600156, "grad_norm": 13.6875, "learning_rate": 1.9414364927301838e-05, "loss": 1.3525753784179688, "mean_token_accuracy": 0.6680638134479523, "num_tokens": 9061701.0, "step": 19000 }, { "entropy": 1.2796891415119171, "epoch": 0.46326694389727874, "grad_norm": 23.625, "learning_rate": 1.9409836980056148e-05, "loss": 1.2964247131347657, "mean_token_accuracy": 0.6734309309720993, "num_tokens": 9085852.0, "step": 19050 }, { "entropy": 1.4118064558506012, "epoch": 0.46448286763454194, "grad_norm": 11.25, "learning_rate": 1.9405292127971672e-05, "loss": 1.4139346313476562, "mean_token_accuracy": 0.6537890702486038, "num_tokens": 9108328.0, "step": 19100 }, { "entropy": 1.2888099229335785, "epoch": 0.46569879137180514, "grad_norm": 13.5625, "learning_rate": 1.9400730379213273e-05, "loss": 1.307164306640625, "mean_token_accuracy": 0.6613247931003571, "num_tokens": 9131559.0, "step": 19150 }, { "entropy": 1.2891862463951111, "epoch": 0.46691471510906835, "grad_norm": 9.3125, "learning_rate": 1.939615174197616e-05, "loss": 1.3006558227539062, "mean_token_accuracy": 0.6716349118947983, "num_tokens": 9151744.0, "step": 19200 }, { "entropy": 1.3837578332424163, "epoch": 0.46813063884633155, "grad_norm": 31.125, "learning_rate": 1.939155622448589e-05, "loss": 1.4306004333496094, "mean_token_accuracy": 0.6549830067157746, "num_tokens": 9175540.0, "step": 19250 }, { "entropy": 1.3457844483852386, "epoch": 0.46934656258359475, "grad_norm": 7.125, "learning_rate": 1.938694383499834e-05, "loss": 1.383126220703125, "mean_token_accuracy": 0.6570611202716827, "num_tokens": 9199826.0, "step": 19300 }, { "entropy": 1.3807936489582062, "epoch": 0.47056248632085795, "grad_norm": 10.3125, "learning_rate": 1.9382314581799705e-05, "loss": 1.400204620361328, "mean_token_accuracy": 0.650184589624405, "num_tokens": 9223647.0, "step": 19350 }, { "entropy": 1.4052372694015502, "epoch": 0.47177841005812116, "grad_norm": 13.8125, "learning_rate": 1.9377668473206467e-05, "loss": 1.4343511962890625, "mean_token_accuracy": 0.6473797696828842, "num_tokens": 9244090.0, "step": 19400 }, { "entropy": 1.4092572152614593, "epoch": 0.47299433379538436, "grad_norm": 12.1875, "learning_rate": 1.9373005517565394e-05, "loss": 1.4293630981445313, "mean_token_accuracy": 0.6471135985851287, "num_tokens": 9268347.0, "step": 19450 }, { "entropy": 1.3438863360881805, "epoch": 0.47421025753264756, "grad_norm": 10.125, "learning_rate": 1.936832572325352e-05, "loss": 1.376971435546875, "mean_token_accuracy": 0.6616802406311035, "num_tokens": 9291083.0, "step": 19500 }, { "entropy": 1.4349727189540864, "epoch": 0.47542618126991076, "grad_norm": 10.375, "learning_rate": 1.9363629098678127e-05, "loss": 1.4563771057128907, "mean_token_accuracy": 0.6460433274507522, "num_tokens": 9316736.0, "step": 19550 }, { "entropy": 1.2122897326946258, "epoch": 0.47664210500717397, "grad_norm": 14.1875, "learning_rate": 1.9358915652276735e-05, "loss": 1.2322742462158203, "mean_token_accuracy": 0.6833026933670044, "num_tokens": 9343108.0, "step": 19600 }, { "entropy": 1.3426519584655763, "epoch": 0.47785802874443717, "grad_norm": 13.125, "learning_rate": 1.9354185392517084e-05, "loss": 1.379697265625, "mean_token_accuracy": 0.6685261106491089, "num_tokens": 9368765.0, "step": 19650 }, { "entropy": 1.3641179263591767, "epoch": 0.47907395248170037, "grad_norm": 9.5625, "learning_rate": 1.9349438327897122e-05, "loss": 1.3763404846191407, "mean_token_accuracy": 0.6559316128492355, "num_tokens": 9396264.0, "step": 19700 }, { "entropy": 1.2473935449123383, "epoch": 0.4802898762189636, "grad_norm": 16.625, "learning_rate": 1.9344674466944978e-05, "loss": 1.2569503021240234, "mean_token_accuracy": 0.683245108127594, "num_tokens": 9420948.0, "step": 19750 }, { "entropy": 1.3384798204898833, "epoch": 0.4815057999562267, "grad_norm": 22.625, "learning_rate": 1.9339893818218965e-05, "loss": 1.356145477294922, "mean_token_accuracy": 0.6677833658456802, "num_tokens": 9443521.0, "step": 19800 }, { "entropy": 1.3697667050361633, "epoch": 0.4827217236934899, "grad_norm": 13.0625, "learning_rate": 1.9335096390307554e-05, "loss": 1.39396484375, "mean_token_accuracy": 0.6626768779754638, "num_tokens": 9466214.0, "step": 19850 }, { "entropy": 1.2398199605941773, "epoch": 0.4839376474307531, "grad_norm": 9.75, "learning_rate": 1.933028219182936e-05, "loss": 1.2444359588623046, "mean_token_accuracy": 0.6809522479772567, "num_tokens": 9489068.0, "step": 19900 }, { "entropy": 1.365413458943367, "epoch": 0.48515357116801633, "grad_norm": 10.6875, "learning_rate": 1.9325451231433114e-05, "loss": 1.3810707092285157, "mean_token_accuracy": 0.6614898753166198, "num_tokens": 9513107.0, "step": 19950 }, { "entropy": 1.3137288194894792, "epoch": 0.48636949490527953, "grad_norm": 10.875, "learning_rate": 1.9320603517797682e-05, "loss": 1.33023681640625, "mean_token_accuracy": 0.6726621055603027, "num_tokens": 9537582.0, "step": 20000 }, { "epoch": 0.48636949490527953, "eval_entropy": 1.2844082633908662, "eval_loss": 1.3354496955871582, "eval_mean_token_accuracy": 0.6693376800468125, "eval_num_tokens": 9537582.0, "eval_runtime": 392.1454, "eval_samples_per_second": 11.651, "eval_steps_per_second": 11.651, "step": 20000 }, { "entropy": 1.4147557055950164, "epoch": 0.48758541864254273, "grad_norm": 15.75, "learning_rate": 1.9315739059632013e-05, "loss": 1.4435247802734374, "mean_token_accuracy": 0.6537052977085114, "num_tokens": 9557843.0, "step": 20050 }, { "entropy": 1.3076054763793945, "epoch": 0.48880134237980594, "grad_norm": 13.6875, "learning_rate": 1.931085786567514e-05, "loss": 1.330025177001953, "mean_token_accuracy": 0.6645999103784561, "num_tokens": 9580414.0, "step": 20100 }, { "entropy": 1.327368289232254, "epoch": 0.49001726611706914, "grad_norm": 10.25, "learning_rate": 1.930595994469616e-05, "loss": 1.3639236450195313, "mean_token_accuracy": 0.669587237238884, "num_tokens": 9600672.0, "step": 20150 }, { "entropy": 1.1433790719509125, "epoch": 0.49123318985433234, "grad_norm": 11.125, "learning_rate": 1.9301045305494223e-05, "loss": 1.1501567840576172, "mean_token_accuracy": 0.7024638056755066, "num_tokens": 9627224.0, "step": 20200 }, { "entropy": 1.260531051158905, "epoch": 0.49244911359159554, "grad_norm": 15.8125, "learning_rate": 1.9296113956898516e-05, "loss": 1.2808346557617187, "mean_token_accuracy": 0.6754031991958618, "num_tokens": 9651274.0, "step": 20250 }, { "entropy": 1.2998846465349196, "epoch": 0.49366503732885875, "grad_norm": 12.6875, "learning_rate": 1.9291165907768242e-05, "loss": 1.30984619140625, "mean_token_accuracy": 0.6785664224624633, "num_tokens": 9674996.0, "step": 20300 }, { "entropy": 1.3473336124420165, "epoch": 0.49488096106612195, "grad_norm": 20.0, "learning_rate": 1.9286201166992608e-05, "loss": 1.35676025390625, "mean_token_accuracy": 0.6694793277978897, "num_tokens": 9697048.0, "step": 20350 }, { "entropy": 1.4348745238780976, "epoch": 0.49609688480338515, "grad_norm": 13.5, "learning_rate": 1.92812197434908e-05, "loss": 1.4683428955078126, "mean_token_accuracy": 0.6435595166683197, "num_tokens": 9722653.0, "step": 20400 }, { "entropy": 1.336161008477211, "epoch": 0.49731280854064835, "grad_norm": 13.625, "learning_rate": 1.927622164621198e-05, "loss": 1.3396339416503906, "mean_token_accuracy": 0.6638877683877945, "num_tokens": 9747198.0, "step": 20450 }, { "entropy": 1.2911191487312317, "epoch": 0.49852873227791156, "grad_norm": 17.5, "learning_rate": 1.9271206884135273e-05, "loss": 1.317979736328125, "mean_token_accuracy": 0.6690568661689759, "num_tokens": 9770770.0, "step": 20500 }, { "entropy": 1.2375590080022811, "epoch": 0.4997446560151747, "grad_norm": 10.8125, "learning_rate": 1.9266175466269727e-05, "loss": 1.2593755340576172, "mean_token_accuracy": 0.6843004721403122, "num_tokens": 9793346.0, "step": 20550 }, { "entropy": 1.3002306306362152, "epoch": 0.5009605797524379, "grad_norm": 12.8125, "learning_rate": 1.9261127401654324e-05, "loss": 1.3279205322265626, "mean_token_accuracy": 0.6736592692136765, "num_tokens": 9815503.0, "step": 20600 }, { "entropy": 1.2715729933977127, "epoch": 0.5021765034897011, "grad_norm": 11.375, "learning_rate": 1.925606269935795e-05, "loss": 1.2791580963134765, "mean_token_accuracy": 0.6747674745321274, "num_tokens": 9839193.0, "step": 20650 }, { "entropy": 1.3318681210279464, "epoch": 0.5033924272269643, "grad_norm": 9.625, "learning_rate": 1.9250981368479373e-05, "loss": 1.3651458740234375, "mean_token_accuracy": 0.6663754838705063, "num_tokens": 9857469.0, "step": 20700 }, { "entropy": 1.3163228130340576, "epoch": 0.5046083509642275, "grad_norm": 9.8125, "learning_rate": 1.9245883418147242e-05, "loss": 1.3300949096679688, "mean_token_accuracy": 0.6681941467523574, "num_tokens": 9884236.0, "step": 20750 }, { "entropy": 1.3904693925380707, "epoch": 0.5058242747014907, "grad_norm": 13.75, "learning_rate": 1.9240768857520062e-05, "loss": 1.413912811279297, "mean_token_accuracy": 0.6520731329917908, "num_tokens": 9906757.0, "step": 20800 }, { "entropy": 1.31770862698555, "epoch": 0.5070401984387539, "grad_norm": 28.625, "learning_rate": 1.9235637695786175e-05, "loss": 1.33938232421875, "mean_token_accuracy": 0.665419231057167, "num_tokens": 9930634.0, "step": 20850 }, { "entropy": 1.2324526011943817, "epoch": 0.5082561221760171, "grad_norm": 11.25, "learning_rate": 1.923048994216375e-05, "loss": 1.2652508544921874, "mean_token_accuracy": 0.685901573896408, "num_tokens": 9950096.0, "step": 20900 }, { "entropy": 1.3414457583427428, "epoch": 0.5094720459132803, "grad_norm": 10.8125, "learning_rate": 1.922532560590077e-05, "loss": 1.3557400512695312, "mean_token_accuracy": 0.6580870169401168, "num_tokens": 9976541.0, "step": 20950 }, { "entropy": 1.4262979102134705, "epoch": 0.5106879696505435, "grad_norm": 19.5, "learning_rate": 1.922014469627499e-05, "loss": 1.4482168579101562, "mean_token_accuracy": 0.6491158890724182, "num_tokens": 9998444.0, "step": 21000 }, { "entropy": 1.2938120687007904, "epoch": 0.5119038933878067, "grad_norm": 8.8125, "learning_rate": 1.921494722259396e-05, "loss": 1.3073367309570312, "mean_token_accuracy": 0.6663001143932342, "num_tokens": 10023778.0, "step": 21050 }, { "entropy": 1.4132488310337066, "epoch": 0.5131198171250699, "grad_norm": 12.3125, "learning_rate": 1.9209733194194972e-05, "loss": 1.415035400390625, "mean_token_accuracy": 0.6589645719528199, "num_tokens": 10046579.0, "step": 21100 }, { "entropy": 1.240094347000122, "epoch": 0.5143357408623331, "grad_norm": 42.75, "learning_rate": 1.9204502620445065e-05, "loss": 1.257367401123047, "mean_token_accuracy": 0.6827336347103119, "num_tokens": 10067032.0, "step": 21150 }, { "entropy": 1.3129330611228942, "epoch": 0.5155516645995963, "grad_norm": 19.0, "learning_rate": 1.9199255510741007e-05, "loss": 1.348618927001953, "mean_token_accuracy": 0.6754098284244537, "num_tokens": 10090110.0, "step": 21200 }, { "entropy": 1.5054458904266357, "epoch": 0.5167675883368595, "grad_norm": 41.75, "learning_rate": 1.9193991874509268e-05, "loss": 1.5220925903320313, "mean_token_accuracy": 0.6406021642684937, "num_tokens": 10114013.0, "step": 21250 }, { "entropy": 1.3261887037754059, "epoch": 0.5179835120741227, "grad_norm": 17.5, "learning_rate": 1.9188711721206005e-05, "loss": 1.346175537109375, "mean_token_accuracy": 0.6638609105348587, "num_tokens": 10141137.0, "step": 21300 }, { "entropy": 1.30419478058815, "epoch": 0.519199435811386, "grad_norm": 20.25, "learning_rate": 1.9183415060317044e-05, "loss": 1.316258544921875, "mean_token_accuracy": 0.6770498871803283, "num_tokens": 10167577.0, "step": 21350 }, { "entropy": 1.2507498288154602, "epoch": 0.5204153595486491, "grad_norm": 9.1875, "learning_rate": 1.9178101901357888e-05, "loss": 1.2603435516357422, "mean_token_accuracy": 0.6819305509328842, "num_tokens": 10193988.0, "step": 21400 }, { "entropy": 1.2886829471588135, "epoch": 0.5216312832859124, "grad_norm": 12.75, "learning_rate": 1.9172772253873654e-05, "loss": 1.3048802185058594, "mean_token_accuracy": 0.6759050953388214, "num_tokens": 10216304.0, "step": 21450 }, { "entropy": 1.207609043121338, "epoch": 0.5228472070231756, "grad_norm": 14.375, "learning_rate": 1.9167426127439092e-05, "loss": 1.2210121154785156, "mean_token_accuracy": 0.6882837575674057, "num_tokens": 10239570.0, "step": 21500 }, { "entropy": 1.3036307179927826, "epoch": 0.5240631307604388, "grad_norm": 19.0, "learning_rate": 1.9162063531658562e-05, "loss": 1.3343681335449218, "mean_token_accuracy": 0.6699310314655303, "num_tokens": 10263707.0, "step": 21550 }, { "entropy": 1.277033587694168, "epoch": 0.525279054497702, "grad_norm": 14.25, "learning_rate": 1.9156684476166e-05, "loss": 1.2944772338867188, "mean_token_accuracy": 0.6730248314142228, "num_tokens": 10288791.0, "step": 21600 }, { "entropy": 1.2923642158508302, "epoch": 0.526494978234965, "grad_norm": 9.5625, "learning_rate": 1.9151288970624922e-05, "loss": 1.3182792663574219, "mean_token_accuracy": 0.6745221793651581, "num_tokens": 10312695.0, "step": 21650 }, { "entropy": 1.365564637184143, "epoch": 0.5277109019722283, "grad_norm": 14.5625, "learning_rate": 1.9145877024728388e-05, "loss": 1.3867529296875, "mean_token_accuracy": 0.652696562409401, "num_tokens": 10336258.0, "step": 21700 }, { "entropy": 1.282685525417328, "epoch": 0.5289268257094915, "grad_norm": 17.25, "learning_rate": 1.9140448648199e-05, "loss": 1.3021788024902343, "mean_token_accuracy": 0.672601238489151, "num_tokens": 10361042.0, "step": 21750 }, { "entropy": 1.2616631948947907, "epoch": 0.5301427494467547, "grad_norm": 12.125, "learning_rate": 1.913500385078887e-05, "loss": 1.29764892578125, "mean_token_accuracy": 0.6766226315498352, "num_tokens": 10382819.0, "step": 21800 }, { "entropy": 1.3737204837799073, "epoch": 0.5313586731840179, "grad_norm": 8.6875, "learning_rate": 1.912954264227962e-05, "loss": 1.4012850952148437, "mean_token_accuracy": 0.6639855688810349, "num_tokens": 10411532.0, "step": 21850 }, { "entropy": 1.4137662947177887, "epoch": 0.5325745969212811, "grad_norm": 11.0, "learning_rate": 1.9124065032482348e-05, "loss": 1.442166290283203, "mean_token_accuracy": 0.6501247125864029, "num_tokens": 10433706.0, "step": 21900 }, { "entropy": 1.264396549463272, "epoch": 0.5337905206585443, "grad_norm": 30.125, "learning_rate": 1.911857103123762e-05, "loss": 1.2885516357421876, "mean_token_accuracy": 0.6867783391475677, "num_tokens": 10457996.0, "step": 21950 }, { "entropy": 1.3174823093414307, "epoch": 0.5350064443958075, "grad_norm": 17.375, "learning_rate": 1.9113060648415443e-05, "loss": 1.3227313232421876, "mean_token_accuracy": 0.6696613609790802, "num_tokens": 10482216.0, "step": 22000 }, { "entropy": 1.3434513247013091, "epoch": 0.5362223681330707, "grad_norm": 15.0625, "learning_rate": 1.910753389391527e-05, "loss": 1.3706483459472656, "mean_token_accuracy": 0.6626712989807129, "num_tokens": 10505953.0, "step": 22050 }, { "entropy": 1.3033747577667236, "epoch": 0.5374382918703339, "grad_norm": 17.875, "learning_rate": 1.9101990777665943e-05, "loss": 1.3227203369140625, "mean_token_accuracy": 0.6693976205587387, "num_tokens": 10531804.0, "step": 22100 }, { "entropy": 1.3347691345214843, "epoch": 0.5386542156075971, "grad_norm": 13.3125, "learning_rate": 1.909643130962571e-05, "loss": 1.351047821044922, "mean_token_accuracy": 0.6692690932750702, "num_tokens": 10558528.0, "step": 22150 }, { "entropy": 1.3121741533279419, "epoch": 0.5398701393448603, "grad_norm": 17.625, "learning_rate": 1.9090855499782206e-05, "loss": 1.3403887939453125, "mean_token_accuracy": 0.6638339418172836, "num_tokens": 10581262.0, "step": 22200 }, { "entropy": 1.3335012710094452, "epoch": 0.5410860630821235, "grad_norm": 13.5, "learning_rate": 1.90852633581524e-05, "loss": 1.3490902709960937, "mean_token_accuracy": 0.6645049041509629, "num_tokens": 10605314.0, "step": 22250 }, { "entropy": 1.1842589354515076, "epoch": 0.5423019868193867, "grad_norm": 10.6875, "learning_rate": 1.9079654894782624e-05, "loss": 1.195136184692383, "mean_token_accuracy": 0.6900103259086608, "num_tokens": 10630681.0, "step": 22300 }, { "entropy": 1.4612477552890777, "epoch": 0.5435179105566499, "grad_norm": 19.875, "learning_rate": 1.9074030119748513e-05, "loss": 1.496312713623047, "mean_token_accuracy": 0.6374152517318725, "num_tokens": 10659186.0, "step": 22350 }, { "entropy": 1.2800756293535231, "epoch": 0.5447338342939131, "grad_norm": 26.875, "learning_rate": 1.9068389043155026e-05, "loss": 1.2985145568847656, "mean_token_accuracy": 0.6811231917142868, "num_tokens": 10684329.0, "step": 22400 }, { "entropy": 1.326293774843216, "epoch": 0.5459497580311763, "grad_norm": 17.0, "learning_rate": 1.9062731675136387e-05, "loss": 1.3531968688964844, "mean_token_accuracy": 0.6713584762811661, "num_tokens": 10706334.0, "step": 22450 }, { "entropy": 1.313564749956131, "epoch": 0.5471656817684395, "grad_norm": 9.6875, "learning_rate": 1.9057058025856104e-05, "loss": 1.3162336730957032, "mean_token_accuracy": 0.6778070849180221, "num_tokens": 10733397.0, "step": 22500 }, { "entropy": 1.2251176643371582, "epoch": 0.5483816055057027, "grad_norm": 51.0, "learning_rate": 1.9051368105506923e-05, "loss": 1.2297530364990235, "mean_token_accuracy": 0.6806858241558075, "num_tokens": 10758017.0, "step": 22550 }, { "entropy": 1.4259882855415345, "epoch": 0.5495975292429659, "grad_norm": 10.0, "learning_rate": 1.9045661924310832e-05, "loss": 1.4605207824707032, "mean_token_accuracy": 0.648930413722992, "num_tokens": 10782566.0, "step": 22600 }, { "entropy": 1.343395665884018, "epoch": 0.5508134529802291, "grad_norm": 18.875, "learning_rate": 1.903993949251902e-05, "loss": 1.3688192749023438, "mean_token_accuracy": 0.6602388268709183, "num_tokens": 10802537.0, "step": 22650 }, { "entropy": 1.3202217280864716, "epoch": 0.5520293767174923, "grad_norm": 13.3125, "learning_rate": 1.9034200820411882e-05, "loss": 1.3148880004882812, "mean_token_accuracy": 0.6768527722358704, "num_tokens": 10825245.0, "step": 22700 }, { "entropy": 1.4026498925685882, "epoch": 0.5532453004547555, "grad_norm": 10.4375, "learning_rate": 1.902844591829898e-05, "loss": 1.4664749145507812, "mean_token_accuracy": 0.6468406808376312, "num_tokens": 10851499.0, "step": 22750 }, { "entropy": 1.2114727950096131, "epoch": 0.5544612241920187, "grad_norm": 9.375, "learning_rate": 1.902267479651904e-05, "loss": 1.204442825317383, "mean_token_accuracy": 0.6929528576135635, "num_tokens": 10877391.0, "step": 22800 }, { "entropy": 1.3889543092250825, "epoch": 0.5556771479292819, "grad_norm": 14.3125, "learning_rate": 1.9016887465439914e-05, "loss": 1.427325439453125, "mean_token_accuracy": 0.6545898866653442, "num_tokens": 10905072.0, "step": 22850 }, { "entropy": 1.26247878074646, "epoch": 0.5568930716665451, "grad_norm": 16.875, "learning_rate": 1.901108393545859e-05, "loss": 1.2738922119140625, "mean_token_accuracy": 0.6794955503940582, "num_tokens": 10930679.0, "step": 22900 }, { "entropy": 1.3456266152858734, "epoch": 0.5581089954038083, "grad_norm": 16.875, "learning_rate": 1.9005264217001155e-05, "loss": 1.35648681640625, "mean_token_accuracy": 0.6638967800140381, "num_tokens": 10956112.0, "step": 22950 }, { "entropy": 1.3809300792217254, "epoch": 0.5593249191410715, "grad_norm": 21.25, "learning_rate": 1.8999428320522762e-05, "loss": 1.4016903686523436, "mean_token_accuracy": 0.6577708804607392, "num_tokens": 10985488.0, "step": 23000 }, { "entropy": 1.3069904124736786, "epoch": 0.5605408428783347, "grad_norm": 14.0, "learning_rate": 1.8993576256507653e-05, "loss": 1.3066386413574218, "mean_token_accuracy": 0.6700779050588608, "num_tokens": 11012334.0, "step": 23050 }, { "entropy": 1.2815304517745971, "epoch": 0.5617567666155979, "grad_norm": 14.9375, "learning_rate": 1.8987708035469098e-05, "loss": 1.31633544921875, "mean_token_accuracy": 0.672404111623764, "num_tokens": 11032984.0, "step": 23100 }, { "entropy": 1.2769182682037354, "epoch": 0.562972690352861, "grad_norm": 10.0, "learning_rate": 1.8981823667949394e-05, "loss": 1.3089669799804688, "mean_token_accuracy": 0.6799270629882812, "num_tokens": 11057786.0, "step": 23150 }, { "entropy": 1.2607621788978576, "epoch": 0.5641886140901242, "grad_norm": 19.25, "learning_rate": 1.8975923164519855e-05, "loss": 1.2790190124511718, "mean_token_accuracy": 0.6684930455684662, "num_tokens": 11081392.0, "step": 23200 }, { "entropy": 1.3620776689052583, "epoch": 0.5654045378273874, "grad_norm": 12.0, "learning_rate": 1.897000653578077e-05, "loss": 1.3830155944824218, "mean_token_accuracy": 0.6575759673118591, "num_tokens": 11106151.0, "step": 23250 }, { "entropy": 1.3702842199802399, "epoch": 0.5666204615646506, "grad_norm": 10.3125, "learning_rate": 1.8964073792361412e-05, "loss": 1.3763529968261718, "mean_token_accuracy": 0.6590202581882477, "num_tokens": 11131141.0, "step": 23300 }, { "entropy": 1.240385546684265, "epoch": 0.5678363853019138, "grad_norm": 5.84375, "learning_rate": 1.8958124944919988e-05, "loss": 1.2732952880859374, "mean_token_accuracy": 0.6764565628767013, "num_tokens": 11157946.0, "step": 23350 }, { "entropy": 1.2977541959285737, "epoch": 0.569052309039177, "grad_norm": 23.625, "learning_rate": 1.8952160004143653e-05, "loss": 1.3196389770507813, "mean_token_accuracy": 0.6733503097295761, "num_tokens": 11185007.0, "step": 23400 }, { "entropy": 1.388230448961258, "epoch": 0.5702682327764402, "grad_norm": 16.375, "learning_rate": 1.8946178980748463e-05, "loss": 1.414645233154297, "mean_token_accuracy": 0.6542486208677292, "num_tokens": 11207190.0, "step": 23450 }, { "entropy": 1.3280630403757094, "epoch": 0.5714841565137034, "grad_norm": 15.0625, "learning_rate": 1.8940181885479366e-05, "loss": 1.3417585754394532, "mean_token_accuracy": 0.669080416560173, "num_tokens": 11231805.0, "step": 23500 }, { "entropy": 1.3226020568609238, "epoch": 0.5727000802509666, "grad_norm": 11.5, "learning_rate": 1.893416872911019e-05, "loss": 1.335025634765625, "mean_token_accuracy": 0.6700393456220627, "num_tokens": 11256223.0, "step": 23550 }, { "entropy": 1.3489735555648803, "epoch": 0.5739160039882298, "grad_norm": 12.0625, "learning_rate": 1.8928139522443606e-05, "loss": 1.382759552001953, "mean_token_accuracy": 0.6605659013986588, "num_tokens": 11278664.0, "step": 23600 }, { "entropy": 1.3069506287574768, "epoch": 0.575131927725493, "grad_norm": 14.625, "learning_rate": 1.8922094276311136e-05, "loss": 1.3104568481445313, "mean_token_accuracy": 0.6634412384033204, "num_tokens": 11302636.0, "step": 23650 }, { "entropy": 1.349776073694229, "epoch": 0.5763478514627562, "grad_norm": 13.4375, "learning_rate": 1.8916033001573103e-05, "loss": 1.3747947692871094, "mean_token_accuracy": 0.6641810059547424, "num_tokens": 11324747.0, "step": 23700 }, { "entropy": 1.301251264810562, "epoch": 0.5775637752000194, "grad_norm": 14.9375, "learning_rate": 1.8909955709118626e-05, "loss": 1.317711181640625, "mean_token_accuracy": 0.6701475375890732, "num_tokens": 11345752.0, "step": 23750 }, { "entropy": 1.2677265697717666, "epoch": 0.5787796989372826, "grad_norm": 9.375, "learning_rate": 1.8903862409865616e-05, "loss": 1.2672372436523438, "mean_token_accuracy": 0.6758001744747162, "num_tokens": 11368075.0, "step": 23800 }, { "entropy": 1.327401452064514, "epoch": 0.5799956226745459, "grad_norm": 10.625, "learning_rate": 1.889775311476071e-05, "loss": 1.3437554931640625, "mean_token_accuracy": 0.6774942791461944, "num_tokens": 11392517.0, "step": 23850 }, { "entropy": 1.34876171708107, "epoch": 0.581211546411809, "grad_norm": 13.6875, "learning_rate": 1.889162783477932e-05, "loss": 1.4015939331054688, "mean_token_accuracy": 0.660224586725235, "num_tokens": 11416929.0, "step": 23900 }, { "entropy": 1.3848629474639893, "epoch": 0.5824274701490723, "grad_norm": 9.3125, "learning_rate": 1.8885486580925535e-05, "loss": 1.389674835205078, "mean_token_accuracy": 0.6591099292039871, "num_tokens": 11438159.0, "step": 23950 }, { "entropy": 1.2995729398727418, "epoch": 0.5836433938863355, "grad_norm": 27.875, "learning_rate": 1.8879329364232177e-05, "loss": 1.3289573669433594, "mean_token_accuracy": 0.6684831017255783, "num_tokens": 11460965.0, "step": 24000 }, { "entropy": 1.366872764825821, "epoch": 0.5848593176235987, "grad_norm": 12.625, "learning_rate": 1.8873156195760722e-05, "loss": 1.3890394592285156, "mean_token_accuracy": 0.6585630071163178, "num_tokens": 11483897.0, "step": 24050 }, { "entropy": 1.3344593226909638, "epoch": 0.5860752413608619, "grad_norm": 12.1875, "learning_rate": 1.8866967086601313e-05, "loss": 1.3356553649902343, "mean_token_accuracy": 0.6715430945158005, "num_tokens": 11505941.0, "step": 24100 }, { "entropy": 1.317805471420288, "epoch": 0.5872911650981251, "grad_norm": 21.0, "learning_rate": 1.8860762047872727e-05, "loss": 1.3219692993164063, "mean_token_accuracy": 0.6722541534900666, "num_tokens": 11530376.0, "step": 24150 }, { "entropy": 1.2993505001068115, "epoch": 0.5885070888353883, "grad_norm": 16.75, "learning_rate": 1.885454109072236e-05, "loss": 1.336446533203125, "mean_token_accuracy": 0.6622891354560853, "num_tokens": 11553364.0, "step": 24200 }, { "entropy": 1.2426889419555665, "epoch": 0.5897230125726515, "grad_norm": 19.75, "learning_rate": 1.8848304226326205e-05, "loss": 1.2532803344726562, "mean_token_accuracy": 0.6855344617366791, "num_tokens": 11576115.0, "step": 24250 }, { "entropy": 1.3228561568260193, "epoch": 0.5909389363099147, "grad_norm": 15.9375, "learning_rate": 1.884205146588884e-05, "loss": 1.3516427612304687, "mean_token_accuracy": 0.6676874357461929, "num_tokens": 11600628.0, "step": 24300 }, { "entropy": 1.415769292116165, "epoch": 0.5921548600471779, "grad_norm": 12.0, "learning_rate": 1.8835782820643383e-05, "loss": 1.4527090454101563, "mean_token_accuracy": 0.6495622384548188, "num_tokens": 11622559.0, "step": 24350 }, { "entropy": 1.3307215082645416, "epoch": 0.5933707837844411, "grad_norm": 12.5625, "learning_rate": 1.8829498301851505e-05, "loss": 1.3372592163085937, "mean_token_accuracy": 0.665298969745636, "num_tokens": 11648705.0, "step": 24400 }, { "entropy": 1.3165913534164428, "epoch": 0.5945867075217043, "grad_norm": 18.125, "learning_rate": 1.8823197920803393e-05, "loss": 1.315489501953125, "mean_token_accuracy": 0.6685507941246033, "num_tokens": 11675442.0, "step": 24450 }, { "entropy": 1.2535370910167694, "epoch": 0.5958026312589675, "grad_norm": 12.5625, "learning_rate": 1.8816881688817714e-05, "loss": 1.2741710662841796, "mean_token_accuracy": 0.6803452056646347, "num_tokens": 11697757.0, "step": 24500 }, { "entropy": 1.1903128737211228, "epoch": 0.5970185549962307, "grad_norm": 14.5625, "learning_rate": 1.8810549617241633e-05, "loss": 1.212571258544922, "mean_token_accuracy": 0.69110759973526, "num_tokens": 11724289.0, "step": 24550 }, { "entropy": 1.3124532616138458, "epoch": 0.5982344787334939, "grad_norm": 10.9375, "learning_rate": 1.880420171745076e-05, "loss": 1.3287937927246094, "mean_token_accuracy": 0.6690134733915329, "num_tokens": 11748977.0, "step": 24600 }, { "entropy": 1.2266589975357056, "epoch": 0.599450402470757, "grad_norm": 11.0, "learning_rate": 1.879783800084914e-05, "loss": 1.2577111053466796, "mean_token_accuracy": 0.6877829110622407, "num_tokens": 11770153.0, "step": 24650 }, { "entropy": 1.31385737657547, "epoch": 0.6006663262080202, "grad_norm": 19.125, "learning_rate": 1.8791458478869236e-05, "loss": 1.3229670715332031, "mean_token_accuracy": 0.6718249839544296, "num_tokens": 11793229.0, "step": 24700 }, { "entropy": 1.2605192112922667, "epoch": 0.6018822499452834, "grad_norm": 13.0, "learning_rate": 1.8785063162971907e-05, "loss": 1.2839352416992187, "mean_token_accuracy": 0.680154048204422, "num_tokens": 11816500.0, "step": 24750 }, { "entropy": 1.3683338892459869, "epoch": 0.6030981736825466, "grad_norm": 5.59375, "learning_rate": 1.8778652064646373e-05, "loss": 1.3966172790527345, "mean_token_accuracy": 0.6518291085958481, "num_tokens": 11840818.0, "step": 24800 }, { "entropy": 1.3590532231330872, "epoch": 0.6043140974198098, "grad_norm": 16.875, "learning_rate": 1.8772225195410235e-05, "loss": 1.3805215454101563, "mean_token_accuracy": 0.6559701609611511, "num_tokens": 11863523.0, "step": 24850 }, { "entropy": 1.374942775964737, "epoch": 0.605530021157073, "grad_norm": 10.5, "learning_rate": 1.8765782566809393e-05, "loss": 1.3963037109375, "mean_token_accuracy": 0.6617388141155243, "num_tokens": 11886396.0, "step": 24900 }, { "entropy": 1.2944754886627197, "epoch": 0.6067459448943362, "grad_norm": 21.75, "learning_rate": 1.8759324190418083e-05, "loss": 1.3090264892578125, "mean_token_accuracy": 0.6745695775747299, "num_tokens": 11910753.0, "step": 24950 }, { "entropy": 1.2975403773784637, "epoch": 0.6079618686315994, "grad_norm": 12.3125, "learning_rate": 1.8752850077838822e-05, "loss": 1.3076971435546876, "mean_token_accuracy": 0.676093397140503, "num_tokens": 11935566.0, "step": 25000 }, { "entropy": 1.3426438307762145, "epoch": 0.6091777923688626, "grad_norm": 12.375, "learning_rate": 1.87463602407024e-05, "loss": 1.3632693481445313, "mean_token_accuracy": 0.6626334375143051, "num_tokens": 11963391.0, "step": 25050 }, { "entropy": 1.321216385960579, "epoch": 0.6103937161061258, "grad_norm": 17.0, "learning_rate": 1.8739854690667854e-05, "loss": 1.3483554077148439, "mean_token_accuracy": 0.6733912640810013, "num_tokens": 11985158.0, "step": 25100 }, { "entropy": 1.3350305616855622, "epoch": 0.611609639843389, "grad_norm": 22.25, "learning_rate": 1.873333343942245e-05, "loss": 1.3500537109375, "mean_token_accuracy": 0.6623563915491104, "num_tokens": 12008937.0, "step": 25150 }, { "entropy": 1.2291875404119492, "epoch": 0.6128255635806522, "grad_norm": 11.25, "learning_rate": 1.8726796498681666e-05, "loss": 1.2421768188476563, "mean_token_accuracy": 0.6879375231266022, "num_tokens": 12032082.0, "step": 25200 }, { "entropy": 1.266599963903427, "epoch": 0.6140414873179154, "grad_norm": 15.125, "learning_rate": 1.872024388018916e-05, "loss": 1.2750885772705078, "mean_token_accuracy": 0.6697411209344863, "num_tokens": 12055562.0, "step": 25250 }, { "entropy": 1.2942755365371703, "epoch": 0.6152574110551786, "grad_norm": 15.3125, "learning_rate": 1.8713675595716752e-05, "loss": 1.315091094970703, "mean_token_accuracy": 0.6653907370567321, "num_tokens": 12076990.0, "step": 25300 }, { "entropy": 1.319559565782547, "epoch": 0.6164733347924418, "grad_norm": 8.8125, "learning_rate": 1.870709165706442e-05, "loss": 1.3674752807617188, "mean_token_accuracy": 0.6664522969722748, "num_tokens": 12100743.0, "step": 25350 }, { "entropy": 1.3017646861076355, "epoch": 0.617689258529705, "grad_norm": 15.4375, "learning_rate": 1.870049207606026e-05, "loss": 1.3214773559570312, "mean_token_accuracy": 0.661553965806961, "num_tokens": 12124044.0, "step": 25400 }, { "entropy": 1.319299921989441, "epoch": 0.6189051822669682, "grad_norm": 10.25, "learning_rate": 1.8693876864560453e-05, "loss": 1.338673095703125, "mean_token_accuracy": 0.6617295980453491, "num_tokens": 12148904.0, "step": 25450 }, { "entropy": 1.3103798568248748, "epoch": 0.6201211060042314, "grad_norm": 22.625, "learning_rate": 1.868724603444928e-05, "loss": 1.3202755737304688, "mean_token_accuracy": 0.676274505853653, "num_tokens": 12172280.0, "step": 25500 }, { "entropy": 1.198788977563381, "epoch": 0.6213370297414946, "grad_norm": 8.9375, "learning_rate": 1.868059959763907e-05, "loss": 1.2292264556884767, "mean_token_accuracy": 0.6957440012693406, "num_tokens": 12194144.0, "step": 25550 }, { "entropy": 1.4170632398128509, "epoch": 0.6225529534787578, "grad_norm": 12.5, "learning_rate": 1.8673937566070203e-05, "loss": 1.446917724609375, "mean_token_accuracy": 0.6483105617761612, "num_tokens": 12220122.0, "step": 25600 }, { "entropy": 1.2135070103406906, "epoch": 0.623768877216021, "grad_norm": 10.3125, "learning_rate": 1.866725995171106e-05, "loss": 1.2253093719482422, "mean_token_accuracy": 0.6891676145792007, "num_tokens": 12248047.0, "step": 25650 }, { "entropy": 1.4056518131494522, "epoch": 0.6249848009532842, "grad_norm": 15.0, "learning_rate": 1.866056676655802e-05, "loss": 1.4434060668945312, "mean_token_accuracy": 0.6575584179162979, "num_tokens": 12271876.0, "step": 25700 }, { "entropy": 1.2513913810253143, "epoch": 0.6262007246905474, "grad_norm": 18.0, "learning_rate": 1.8653858022635444e-05, "loss": 1.284222412109375, "mean_token_accuracy": 0.687210990190506, "num_tokens": 12295192.0, "step": 25750 }, { "entropy": 1.3519789803028106, "epoch": 0.6274166484278106, "grad_norm": 9.8125, "learning_rate": 1.8647133731995634e-05, "loss": 1.38398193359375, "mean_token_accuracy": 0.6725940752029419, "num_tokens": 12322940.0, "step": 25800 }, { "entropy": 1.2972270327806472, "epoch": 0.6286325721650738, "grad_norm": 24.0, "learning_rate": 1.8640393906718825e-05, "loss": 1.324559783935547, "mean_token_accuracy": 0.670093966126442, "num_tokens": 12346371.0, "step": 25850 }, { "entropy": 1.3292463970184327, "epoch": 0.629848495902337, "grad_norm": 9.4375, "learning_rate": 1.8633638558913163e-05, "loss": 1.3464459228515624, "mean_token_accuracy": 0.6632275140285492, "num_tokens": 12373931.0, "step": 25900 }, { "entropy": 1.3722746098041534, "epoch": 0.6310644196396002, "grad_norm": 11.9375, "learning_rate": 1.8626867700714676e-05, "loss": 1.4065289306640625, "mean_token_accuracy": 0.6572035503387451, "num_tokens": 12398650.0, "step": 25950 }, { "entropy": 1.3643992221355439, "epoch": 0.6322803433768635, "grad_norm": 12.0625, "learning_rate": 1.862008134428726e-05, "loss": 1.3682986450195314, "mean_token_accuracy": 0.6714404892921447, "num_tokens": 12423537.0, "step": 26000 }, { "entropy": 1.3989584469795227, "epoch": 0.6334962671141267, "grad_norm": 11.5625, "learning_rate": 1.8613279501822656e-05, "loss": 1.4357014465332032, "mean_token_accuracy": 0.6565348976850509, "num_tokens": 12450815.0, "step": 26050 }, { "entropy": 1.3903049504756928, "epoch": 0.6347121908513897, "grad_norm": 14.0625, "learning_rate": 1.8606462185540417e-05, "loss": 1.4131419372558593, "mean_token_accuracy": 0.656800040602684, "num_tokens": 12475277.0, "step": 26100 }, { "entropy": 1.3283718860149383, "epoch": 0.635928114588653, "grad_norm": 21.625, "learning_rate": 1.85996294076879e-05, "loss": 1.349588623046875, "mean_token_accuracy": 0.6687670373916625, "num_tokens": 12501670.0, "step": 26150 }, { "entropy": 1.4030838513374329, "epoch": 0.6371440383259162, "grad_norm": 16.0, "learning_rate": 1.8592781180540242e-05, "loss": 1.4024839782714844, "mean_token_accuracy": 0.6600656712055206, "num_tokens": 12521496.0, "step": 26200 }, { "entropy": 1.2166911792755126, "epoch": 0.6383599620631794, "grad_norm": 22.25, "learning_rate": 1.8585917516400332e-05, "loss": 1.238456039428711, "mean_token_accuracy": 0.6919553208351136, "num_tokens": 12543154.0, "step": 26250 }, { "entropy": 1.272672320008278, "epoch": 0.6395758858004426, "grad_norm": 14.875, "learning_rate": 1.8579038427598783e-05, "loss": 1.3089054870605468, "mean_token_accuracy": 0.6749801963567734, "num_tokens": 12566398.0, "step": 26300 }, { "entropy": 1.2954730808734893, "epoch": 0.6407918095377058, "grad_norm": 22.125, "learning_rate": 1.857214392649394e-05, "loss": 1.3113539123535156, "mean_token_accuracy": 0.6733168935775757, "num_tokens": 12586713.0, "step": 26350 }, { "entropy": 1.3454414868354798, "epoch": 0.642007733274969, "grad_norm": 18.875, "learning_rate": 1.856523402547181e-05, "loss": 1.3806295776367188, "mean_token_accuracy": 0.6666394621133804, "num_tokens": 12607066.0, "step": 26400 }, { "entropy": 1.2859765374660492, "epoch": 0.6432236570122322, "grad_norm": 13.6875, "learning_rate": 1.8558308736946088e-05, "loss": 1.295975799560547, "mean_token_accuracy": 0.6747210788726806, "num_tokens": 12630805.0, "step": 26450 }, { "entropy": 1.2644166445732117, "epoch": 0.6444395807494954, "grad_norm": 11.0, "learning_rate": 1.8551368073358104e-05, "loss": 1.2837307739257813, "mean_token_accuracy": 0.6801409602165223, "num_tokens": 12655712.0, "step": 26500 }, { "entropy": 1.30873504281044, "epoch": 0.6456555044867586, "grad_norm": 8.6875, "learning_rate": 1.8544412047176802e-05, "loss": 1.325537109375, "mean_token_accuracy": 0.6594963145256042, "num_tokens": 12680094.0, "step": 26550 }, { "entropy": 1.3239508652687073, "epoch": 0.6468714282240218, "grad_norm": 6.09375, "learning_rate": 1.853744067089874e-05, "loss": 1.3499873352050782, "mean_token_accuracy": 0.668631374835968, "num_tokens": 12706778.0, "step": 26600 }, { "entropy": 1.3840764439105988, "epoch": 0.648087351961285, "grad_norm": 15.0625, "learning_rate": 1.853045395704804e-05, "loss": 1.39324951171875, "mean_token_accuracy": 0.6557315582036972, "num_tokens": 12733471.0, "step": 26650 }, { "entropy": 1.228836236000061, "epoch": 0.6493032756985482, "grad_norm": 17.625, "learning_rate": 1.8523451918176385e-05, "loss": 1.2441872406005858, "mean_token_accuracy": 0.6843521642684937, "num_tokens": 12755800.0, "step": 26700 }, { "entropy": 1.3713362896442414, "epoch": 0.6505191994358114, "grad_norm": 6.1875, "learning_rate": 1.8516434566862987e-05, "loss": 1.39049560546875, "mean_token_accuracy": 0.6596233689785004, "num_tokens": 12778406.0, "step": 26750 }, { "entropy": 1.2861829257011415, "epoch": 0.6517351231730746, "grad_norm": 13.25, "learning_rate": 1.8509401915714565e-05, "loss": 1.2867579650878906, "mean_token_accuracy": 0.6677285397052765, "num_tokens": 12799735.0, "step": 26800 }, { "entropy": 1.2528273367881775, "epoch": 0.6529510469103378, "grad_norm": 16.25, "learning_rate": 1.8502353977365333e-05, "loss": 1.2729203033447265, "mean_token_accuracy": 0.6796846479177475, "num_tokens": 12822337.0, "step": 26850 }, { "entropy": 1.3833631992340087, "epoch": 0.654166970647601, "grad_norm": 12.75, "learning_rate": 1.8495290764476954e-05, "loss": 1.4130447387695313, "mean_token_accuracy": 0.651303693652153, "num_tokens": 12846855.0, "step": 26900 }, { "entropy": 1.3571466261148453, "epoch": 0.6553828943848642, "grad_norm": 25.5, "learning_rate": 1.8488212289738546e-05, "loss": 1.3650424194335937, "mean_token_accuracy": 0.668608660697937, "num_tokens": 12870367.0, "step": 26950 }, { "entropy": 1.2507404291629791, "epoch": 0.6565988181221274, "grad_norm": 26.25, "learning_rate": 1.848111856586664e-05, "loss": 1.2809530639648437, "mean_token_accuracy": 0.6852850127220154, "num_tokens": 12892546.0, "step": 27000 }, { "entropy": 1.2615238636732102, "epoch": 0.6578147418593906, "grad_norm": 8.3125, "learning_rate": 1.8474009605605153e-05, "loss": 1.2767186737060547, "mean_token_accuracy": 0.6740546345710754, "num_tokens": 12914576.0, "step": 27050 }, { "entropy": 1.3660405957698822, "epoch": 0.6590306655966538, "grad_norm": 8.625, "learning_rate": 1.846688542172539e-05, "loss": 1.3845634460449219, "mean_token_accuracy": 0.6592479854822159, "num_tokens": 12937875.0, "step": 27100 }, { "entropy": 1.3234642136096955, "epoch": 0.660246589333917, "grad_norm": 8.3125, "learning_rate": 1.8459746027025997e-05, "loss": 1.3244120788574218, "mean_token_accuracy": 0.6706123250722885, "num_tokens": 12961222.0, "step": 27150 }, { "entropy": 1.2799186217784881, "epoch": 0.6614625130711802, "grad_norm": 27.625, "learning_rate": 1.8452591434332944e-05, "loss": 1.3013082885742187, "mean_token_accuracy": 0.6734585338830947, "num_tokens": 12984860.0, "step": 27200 }, { "entropy": 1.3346337920427322, "epoch": 0.6626784368084434, "grad_norm": 13.3125, "learning_rate": 1.8445421656499505e-05, "loss": 1.3454840087890625, "mean_token_accuracy": 0.6688212269544601, "num_tokens": 13009753.0, "step": 27250 }, { "entropy": 1.3138964462280274, "epoch": 0.6638943605457066, "grad_norm": 22.0, "learning_rate": 1.8438236706406244e-05, "loss": 1.3283103942871093, "mean_token_accuracy": 0.6746186733245849, "num_tokens": 13033189.0, "step": 27300 }, { "entropy": 1.2998664346337319, "epoch": 0.6651102842829698, "grad_norm": 18.5, "learning_rate": 1.8431036596960973e-05, "loss": 1.3239945983886718, "mean_token_accuracy": 0.6672069132328033, "num_tokens": 13054291.0, "step": 27350 }, { "entropy": 1.302411378622055, "epoch": 0.666326208020233, "grad_norm": 9.875, "learning_rate": 1.8423821341098737e-05, "loss": 1.3211967468261718, "mean_token_accuracy": 0.6749076604843139, "num_tokens": 13078914.0, "step": 27400 }, { "entropy": 1.293629150390625, "epoch": 0.6675421317574962, "grad_norm": 6.5625, "learning_rate": 1.8416590951781797e-05, "loss": 1.307070770263672, "mean_token_accuracy": 0.6722654628753663, "num_tokens": 13103266.0, "step": 27450 }, { "entropy": 1.3258531606197357, "epoch": 0.6687580554947594, "grad_norm": 15.0, "learning_rate": 1.8409345441999593e-05, "loss": 1.3493324279785157, "mean_token_accuracy": 0.6655721652507782, "num_tokens": 13128626.0, "step": 27500 }, { "entropy": 1.2583172857761382, "epoch": 0.6699739792320226, "grad_norm": 15.75, "learning_rate": 1.840208482476874e-05, "loss": 1.2805874633789063, "mean_token_accuracy": 0.6861791983246803, "num_tokens": 13152669.0, "step": 27550 }, { "entropy": 1.3013251721858978, "epoch": 0.6711899029692857, "grad_norm": 22.875, "learning_rate": 1.8394809113132993e-05, "loss": 1.33163330078125, "mean_token_accuracy": 0.6712827455997467, "num_tokens": 13174745.0, "step": 27600 }, { "entropy": 1.221940097808838, "epoch": 0.6724058267065489, "grad_norm": 27.0, "learning_rate": 1.8387518320163206e-05, "loss": 1.23398193359375, "mean_token_accuracy": 0.6918888336420059, "num_tokens": 13199434.0, "step": 27650 }, { "entropy": 1.2448347479104995, "epoch": 0.6736217504438121, "grad_norm": 9.1875, "learning_rate": 1.838021245895735e-05, "loss": 1.253019790649414, "mean_token_accuracy": 0.6880134356021881, "num_tokens": 13225385.0, "step": 27700 }, { "entropy": 1.329750555753708, "epoch": 0.6748376741810753, "grad_norm": 14.375, "learning_rate": 1.8372891542640453e-05, "loss": 1.3551748657226563, "mean_token_accuracy": 0.6718474733829498, "num_tokens": 13249452.0, "step": 27750 }, { "entropy": 1.3671722611784936, "epoch": 0.6760535979183385, "grad_norm": 14.375, "learning_rate": 1.8365555584364597e-05, "loss": 1.413792724609375, "mean_token_accuracy": 0.6582028090953826, "num_tokens": 13270998.0, "step": 27800 }, { "entropy": 1.2581644195318222, "epoch": 0.6772695216556017, "grad_norm": 12.625, "learning_rate": 1.835820459730888e-05, "loss": 1.2650010681152344, "mean_token_accuracy": 0.6858339273929596, "num_tokens": 13297102.0, "step": 27850 }, { "entropy": 1.4055026853084565, "epoch": 0.6784854453928649, "grad_norm": 15.4375, "learning_rate": 1.8350838594679397e-05, "loss": 1.4329147338867188, "mean_token_accuracy": 0.6497586488723754, "num_tokens": 13318854.0, "step": 27900 }, { "entropy": 1.22731325507164, "epoch": 0.6797013691301281, "grad_norm": 11.6875, "learning_rate": 1.834345758970923e-05, "loss": 1.2378468322753906, "mean_token_accuracy": 0.6927320080995559, "num_tokens": 13344635.0, "step": 27950 }, { "entropy": 1.2405635231733323, "epoch": 0.6809172928673913, "grad_norm": 10.5, "learning_rate": 1.833606159565841e-05, "loss": 1.2667850494384765, "mean_token_accuracy": 0.6821321439743042, "num_tokens": 13367795.0, "step": 28000 }, { "entropy": 1.3358918333053589, "epoch": 0.6821332166046545, "grad_norm": 21.25, "learning_rate": 1.8328650625813887e-05, "loss": 1.3553237915039062, "mean_token_accuracy": 0.6652609139680863, "num_tokens": 13390013.0, "step": 28050 }, { "entropy": 1.2222345566749573, "epoch": 0.6833491403419177, "grad_norm": 18.75, "learning_rate": 1.8321224693489522e-05, "loss": 1.2192764282226562, "mean_token_accuracy": 0.694032015800476, "num_tokens": 13409810.0, "step": 28100 }, { "entropy": 1.3167165696620942, "epoch": 0.6845650640791809, "grad_norm": 17.25, "learning_rate": 1.8313783812026056e-05, "loss": 1.3371757507324218, "mean_token_accuracy": 0.661169970035553, "num_tokens": 13437116.0, "step": 28150 }, { "entropy": 1.3578705751895905, "epoch": 0.6857809878164441, "grad_norm": 14.375, "learning_rate": 1.8306327994791084e-05, "loss": 1.392783660888672, "mean_token_accuracy": 0.6549809366464615, "num_tokens": 13463290.0, "step": 28200 }, { "entropy": 1.3775210750102997, "epoch": 0.6869969115537073, "grad_norm": 17.875, "learning_rate": 1.8298857255179037e-05, "loss": 1.3979052734375, "mean_token_accuracy": 0.6627584421634674, "num_tokens": 13482532.0, "step": 28250 }, { "entropy": 1.3162289083003997, "epoch": 0.6882128352909705, "grad_norm": 31.875, "learning_rate": 1.829137160661115e-05, "loss": 1.3264112854003907, "mean_token_accuracy": 0.6691288709640503, "num_tokens": 13506823.0, "step": 28300 }, { "entropy": 1.3630560839176178, "epoch": 0.6894287590282338, "grad_norm": 17.0, "learning_rate": 1.828387106253544e-05, "loss": 1.383082733154297, "mean_token_accuracy": 0.6613066041469574, "num_tokens": 13530602.0, "step": 28350 }, { "entropy": 1.229476106762886, "epoch": 0.690644682765497, "grad_norm": 12.4375, "learning_rate": 1.8276355636426692e-05, "loss": 1.2510147857666016, "mean_token_accuracy": 0.68285848736763, "num_tokens": 13552061.0, "step": 28400 }, { "entropy": 1.2685564732551575, "epoch": 0.6918606065027602, "grad_norm": 15.75, "learning_rate": 1.8268825341786422e-05, "loss": 1.277625503540039, "mean_token_accuracy": 0.6798837089538574, "num_tokens": 13575169.0, "step": 28450 }, { "entropy": 1.362042201757431, "epoch": 0.6930765302400234, "grad_norm": 8.625, "learning_rate": 1.8261280192142857e-05, "loss": 1.392928924560547, "mean_token_accuracy": 0.6564533972740173, "num_tokens": 13599924.0, "step": 28500 }, { "entropy": 1.3145499658584594, "epoch": 0.6942924539772866, "grad_norm": 16.25, "learning_rate": 1.8253720201050913e-05, "loss": 1.3401885986328126, "mean_token_accuracy": 0.675891135931015, "num_tokens": 13626576.0, "step": 28550 }, { "entropy": 1.2959332406520843, "epoch": 0.6955083777145498, "grad_norm": 12.0, "learning_rate": 1.8246145382092166e-05, "loss": 1.3343682861328126, "mean_token_accuracy": 0.6730635017156601, "num_tokens": 13647399.0, "step": 28600 }, { "entropy": 1.2070212000608445, "epoch": 0.696724301451813, "grad_norm": 9.5625, "learning_rate": 1.823855574887483e-05, "loss": 1.2043225860595703, "mean_token_accuracy": 0.7007857769727707, "num_tokens": 13673694.0, "step": 28650 }, { "entropy": 1.2257911908626555, "epoch": 0.6979402251890762, "grad_norm": 20.25, "learning_rate": 1.823095131503374e-05, "loss": 1.255253448486328, "mean_token_accuracy": 0.6820999073982239, "num_tokens": 13699052.0, "step": 28700 }, { "entropy": 1.2766425538063049, "epoch": 0.6991561489263394, "grad_norm": 15.875, "learning_rate": 1.822333209423031e-05, "loss": 1.2983558654785157, "mean_token_accuracy": 0.6702512127161026, "num_tokens": 13722742.0, "step": 28750 }, { "entropy": 1.2710252845287322, "epoch": 0.7003720726636026, "grad_norm": 8.375, "learning_rate": 1.821569810015253e-05, "loss": 1.2784712219238281, "mean_token_accuracy": 0.6751464152336121, "num_tokens": 13749144.0, "step": 28800 }, { "entropy": 1.1709763503074646, "epoch": 0.7015879964008658, "grad_norm": 15.6875, "learning_rate": 1.8208049346514917e-05, "loss": 1.184038848876953, "mean_token_accuracy": 0.6944513726234436, "num_tokens": 13772252.0, "step": 28850 }, { "entropy": 1.2944838845729827, "epoch": 0.702803920138129, "grad_norm": 53.0, "learning_rate": 1.8200385847058516e-05, "loss": 1.2811213684082032, "mean_token_accuracy": 0.6814097690582276, "num_tokens": 13797821.0, "step": 28900 }, { "entropy": 1.2964469802379608, "epoch": 0.7040198438753922, "grad_norm": 11.9375, "learning_rate": 1.819270761555086e-05, "loss": 1.3228421020507812, "mean_token_accuracy": 0.6681522041559219, "num_tokens": 13819216.0, "step": 28950 }, { "entropy": 1.29955069065094, "epoch": 0.7052357676126554, "grad_norm": 10.9375, "learning_rate": 1.8185014665785936e-05, "loss": 1.3359342956542968, "mean_token_accuracy": 0.6642140626907349, "num_tokens": 13840370.0, "step": 29000 }, { "entropy": 1.2508266884088517, "epoch": 0.7064516913499186, "grad_norm": 22.5, "learning_rate": 1.8177307011584194e-05, "loss": 1.2767788696289062, "mean_token_accuracy": 0.6805707895755768, "num_tokens": 13864653.0, "step": 29050 }, { "entropy": 1.2859594118595123, "epoch": 0.7076676150871817, "grad_norm": 15.25, "learning_rate": 1.8169584666792487e-05, "loss": 1.2976702880859374, "mean_token_accuracy": 0.6729725921154022, "num_tokens": 13888065.0, "step": 29100 }, { "entropy": 1.270310698747635, "epoch": 0.7088835388244449, "grad_norm": 22.0, "learning_rate": 1.816184764528406e-05, "loss": 1.2935488891601563, "mean_token_accuracy": 0.6735805612802506, "num_tokens": 13914034.0, "step": 29150 }, { "entropy": 1.2501510977745056, "epoch": 0.7100994625617081, "grad_norm": 15.6875, "learning_rate": 1.8154095960958525e-05, "loss": 1.2789421081542969, "mean_token_accuracy": 0.6786725962162018, "num_tokens": 13937551.0, "step": 29200 }, { "entropy": 1.2147483670711516, "epoch": 0.7113153862989713, "grad_norm": 20.75, "learning_rate": 1.8146329627741842e-05, "loss": 1.2166007232666016, "mean_token_accuracy": 0.6919803965091705, "num_tokens": 13958575.0, "step": 29250 }, { "entropy": 1.3411088454723359, "epoch": 0.7125313100362345, "grad_norm": 9.8125, "learning_rate": 1.813854865958628e-05, "loss": 1.3578678894042968, "mean_token_accuracy": 0.659036859869957, "num_tokens": 13984213.0, "step": 29300 }, { "entropy": 1.1746557873487473, "epoch": 0.7137472337734977, "grad_norm": 12.5625, "learning_rate": 1.8130753070470408e-05, "loss": 1.1808556365966796, "mean_token_accuracy": 0.6971986711025238, "num_tokens": 14009563.0, "step": 29350 }, { "entropy": 1.395166459083557, "epoch": 0.7149631575107609, "grad_norm": 12.5625, "learning_rate": 1.8122942874399054e-05, "loss": 1.4041143798828124, "mean_token_accuracy": 0.6548512560129166, "num_tokens": 14032804.0, "step": 29400 }, { "entropy": 1.2904534560441971, "epoch": 0.7161790812480241, "grad_norm": 13.5625, "learning_rate": 1.811511808540329e-05, "loss": 1.3176031494140625, "mean_token_accuracy": 0.671400500535965, "num_tokens": 14060313.0, "step": 29450 }, { "entropy": 1.2375965690612794, "epoch": 0.7173950049852873, "grad_norm": 8.1875, "learning_rate": 1.8107278717540405e-05, "loss": 1.243343276977539, "mean_token_accuracy": 0.6886941480636597, "num_tokens": 14084507.0, "step": 29500 }, { "entropy": 1.351090202331543, "epoch": 0.7186109287225505, "grad_norm": 11.1875, "learning_rate": 1.8099424784893882e-05, "loss": 1.3620030212402343, "mean_token_accuracy": 0.6695238447189331, "num_tokens": 14111946.0, "step": 29550 }, { "entropy": 1.361289101243019, "epoch": 0.7198268524598137, "grad_norm": 14.5625, "learning_rate": 1.8091556301573363e-05, "loss": 1.3848806762695312, "mean_token_accuracy": 0.6607132971286773, "num_tokens": 14134744.0, "step": 29600 }, { "entropy": 1.4362358117103577, "epoch": 0.7210427761970769, "grad_norm": 12.9375, "learning_rate": 1.8083673281714634e-05, "loss": 1.4768971252441405, "mean_token_accuracy": 0.6394012719392776, "num_tokens": 14160889.0, "step": 29650 }, { "entropy": 1.2883304530382156, "epoch": 0.7222586999343401, "grad_norm": 15.5625, "learning_rate": 1.80757757394796e-05, "loss": 1.2935758972167968, "mean_token_accuracy": 0.6869478803873063, "num_tokens": 14187655.0, "step": 29700 }, { "entropy": 1.232308396100998, "epoch": 0.7234746236716033, "grad_norm": 16.875, "learning_rate": 1.8067863689056247e-05, "loss": 1.2578567504882812, "mean_token_accuracy": 0.67534075319767, "num_tokens": 14209800.0, "step": 29750 }, { "entropy": 1.3428675317764283, "epoch": 0.7246905474088665, "grad_norm": 12.8125, "learning_rate": 1.8059937144658632e-05, "loss": 1.366387939453125, "mean_token_accuracy": 0.6705012047290801, "num_tokens": 14234003.0, "step": 29800 }, { "entropy": 1.2437283539772033, "epoch": 0.7259064711461297, "grad_norm": 13.5, "learning_rate": 1.8051996120526847e-05, "loss": 1.257786636352539, "mean_token_accuracy": 0.6829189836978913, "num_tokens": 14254053.0, "step": 29850 }, { "entropy": 1.3363167715072632, "epoch": 0.7271223948833929, "grad_norm": 12.875, "learning_rate": 1.8044040630927e-05, "loss": 1.3371769714355468, "mean_token_accuracy": 0.6711208117008209, "num_tokens": 14275947.0, "step": 29900 }, { "entropy": 1.1701286536455155, "epoch": 0.7283383186206561, "grad_norm": 14.9375, "learning_rate": 1.8036070690151183e-05, "loss": 1.1949696350097656, "mean_token_accuracy": 0.6962116420269012, "num_tokens": 14298902.0, "step": 29950 }, { "entropy": 1.283930288553238, "epoch": 0.7295542423579193, "grad_norm": 7.09375, "learning_rate": 1.802808631251745e-05, "loss": 1.3241773986816405, "mean_token_accuracy": 0.6730696439743042, "num_tokens": 14324238.0, "step": 30000 }, { "epoch": 0.7295542423579193, "eval_entropy": 1.310916506877533, "eval_loss": 1.3078851699829102, "eval_mean_token_accuracy": 0.6749541904556331, "eval_num_tokens": 14324238.0, "eval_runtime": 391.7817, "eval_samples_per_second": 11.662, "eval_steps_per_second": 11.662, "step": 30000 }, { "entropy": 1.2495182812213899, "epoch": 0.7307701660951825, "grad_norm": 7.46875, "learning_rate": 1.8020087512369794e-05, "loss": 1.248457489013672, "mean_token_accuracy": 0.6848618942499161, "num_tokens": 14348745.0, "step": 30050 }, { "entropy": 1.2735689461231232, "epoch": 0.7319860898324457, "grad_norm": 21.0, "learning_rate": 1.8012074304078115e-05, "loss": 1.3068658447265624, "mean_token_accuracy": 0.6840433216094971, "num_tokens": 14374493.0, "step": 30100 }, { "entropy": 1.3124530816078186, "epoch": 0.7332020135697089, "grad_norm": 9.625, "learning_rate": 1.80040467020382e-05, "loss": 1.3351017761230468, "mean_token_accuracy": 0.6705321550369263, "num_tokens": 14401268.0, "step": 30150 }, { "entropy": 1.24998312830925, "epoch": 0.7344179373069721, "grad_norm": 8.5, "learning_rate": 1.799600472067169e-05, "loss": 1.2521861267089844, "mean_token_accuracy": 0.6819658672809601, "num_tokens": 14423622.0, "step": 30200 }, { "entropy": 1.4004323124885558, "epoch": 0.7356338610442353, "grad_norm": 16.875, "learning_rate": 1.798794837442606e-05, "loss": 1.4423455810546875, "mean_token_accuracy": 0.6544573539495468, "num_tokens": 14445361.0, "step": 30250 }, { "entropy": 1.2775141680240631, "epoch": 0.7368497847814985, "grad_norm": 12.75, "learning_rate": 1.7979877677774603e-05, "loss": 1.2902922058105468, "mean_token_accuracy": 0.6814062190055847, "num_tokens": 14465677.0, "step": 30300 }, { "entropy": 1.2451773285865784, "epoch": 0.7380657085187617, "grad_norm": 17.0, "learning_rate": 1.7971792645216374e-05, "loss": 1.2529192352294922, "mean_token_accuracy": 0.6826837146282196, "num_tokens": 14490135.0, "step": 30350 }, { "entropy": 1.355046319961548, "epoch": 0.739281632256025, "grad_norm": 12.5625, "learning_rate": 1.7963693291276192e-05, "loss": 1.391610107421875, "mean_token_accuracy": 0.6622405433654786, "num_tokens": 14511540.0, "step": 30400 }, { "entropy": 1.3895890021324158, "epoch": 0.7404975559932881, "grad_norm": 12.375, "learning_rate": 1.7955579630504604e-05, "loss": 1.4158137512207032, "mean_token_accuracy": 0.6533851462602616, "num_tokens": 14531993.0, "step": 30450 }, { "entropy": 1.3373268687725066, "epoch": 0.7417134797305514, "grad_norm": 9.5, "learning_rate": 1.7947451677477863e-05, "loss": 1.3753326416015625, "mean_token_accuracy": 0.6624751788377762, "num_tokens": 14557295.0, "step": 30500 }, { "entropy": 1.3651358252763748, "epoch": 0.7429294034678146, "grad_norm": 8.5, "learning_rate": 1.793930944679789e-05, "loss": 1.3644561767578125, "mean_token_accuracy": 0.6710851734876633, "num_tokens": 14580110.0, "step": 30550 }, { "entropy": 1.2810404855012893, "epoch": 0.7441453272050776, "grad_norm": 10.875, "learning_rate": 1.7931152953092257e-05, "loss": 1.3071978759765626, "mean_token_accuracy": 0.6774614870548248, "num_tokens": 14604944.0, "step": 30600 }, { "entropy": 1.3682792222499847, "epoch": 0.7453612509423408, "grad_norm": 8.75, "learning_rate": 1.7922982211014174e-05, "loss": 1.410800018310547, "mean_token_accuracy": 0.6592587125301361, "num_tokens": 14628085.0, "step": 30650 }, { "entropy": 1.3626940059661865, "epoch": 0.746577174679604, "grad_norm": 17.375, "learning_rate": 1.7914797235242427e-05, "loss": 1.3866787719726563, "mean_token_accuracy": 0.6600571709871292, "num_tokens": 14650076.0, "step": 30700 }, { "entropy": 1.3390605020523072, "epoch": 0.7477930984168673, "grad_norm": 10.1875, "learning_rate": 1.7906598040481385e-05, "loss": 1.351983642578125, "mean_token_accuracy": 0.665844299197197, "num_tokens": 14675017.0, "step": 30750 }, { "entropy": 1.3153307902812958, "epoch": 0.7490090221541305, "grad_norm": 10.125, "learning_rate": 1.7898384641460962e-05, "loss": 1.3226689147949218, "mean_token_accuracy": 0.6717447280883789, "num_tokens": 14697920.0, "step": 30800 }, { "entropy": 1.1787913537025452, "epoch": 0.7502249458913937, "grad_norm": 18.75, "learning_rate": 1.7890157052936584e-05, "loss": 1.195465087890625, "mean_token_accuracy": 0.7025556600093842, "num_tokens": 14719339.0, "step": 30850 }, { "entropy": 1.243127819299698, "epoch": 0.7514408696286569, "grad_norm": 12.6875, "learning_rate": 1.7881915289689175e-05, "loss": 1.2531111907958985, "mean_token_accuracy": 0.6769532060623169, "num_tokens": 14743517.0, "step": 30900 }, { "entropy": 1.3311399698257447, "epoch": 0.7526567933659201, "grad_norm": 16.75, "learning_rate": 1.787365936652512e-05, "loss": 1.3597422790527345, "mean_token_accuracy": 0.6691668742895126, "num_tokens": 14768243.0, "step": 30950 }, { "entropy": 1.457090915441513, "epoch": 0.7538727171031833, "grad_norm": 16.75, "learning_rate": 1.7865389298276245e-05, "loss": 1.4807286071777344, "mean_token_accuracy": 0.6378058969974518, "num_tokens": 14791147.0, "step": 31000 }, { "entropy": 1.3188925790786743, "epoch": 0.7550886408404465, "grad_norm": 17.25, "learning_rate": 1.7857105099799783e-05, "loss": 1.3365071105957032, "mean_token_accuracy": 0.6654577994346619, "num_tokens": 14814569.0, "step": 31050 }, { "entropy": 1.3424291408061981, "epoch": 0.7563045645777097, "grad_norm": 17.125, "learning_rate": 1.7848806785978356e-05, "loss": 1.3696784973144531, "mean_token_accuracy": 0.6693021357059479, "num_tokens": 14838205.0, "step": 31100 }, { "entropy": 1.2400598180294038, "epoch": 0.7575204883149729, "grad_norm": 10.3125, "learning_rate": 1.7840494371719948e-05, "loss": 1.2807940673828124, "mean_token_accuracy": 0.6790558713674545, "num_tokens": 14861831.0, "step": 31150 }, { "entropy": 1.3330831801891327, "epoch": 0.7587364120522361, "grad_norm": 10.6875, "learning_rate": 1.7832167871957862e-05, "loss": 1.3338279724121094, "mean_token_accuracy": 0.6690352594852448, "num_tokens": 14883510.0, "step": 31200 }, { "entropy": 1.2469097924232484, "epoch": 0.7599523357894993, "grad_norm": 11.0, "learning_rate": 1.7823827301650724e-05, "loss": 1.258400421142578, "mean_token_accuracy": 0.6770782947540284, "num_tokens": 14907660.0, "step": 31250 }, { "entropy": 1.2547933149337769, "epoch": 0.7611682595267625, "grad_norm": 10.8125, "learning_rate": 1.7815472675782417e-05, "loss": 1.253731460571289, "mean_token_accuracy": 0.6843715536594391, "num_tokens": 14932967.0, "step": 31300 }, { "entropy": 1.381563572883606, "epoch": 0.7623841832640257, "grad_norm": 21.875, "learning_rate": 1.780710400936209e-05, "loss": 1.4012057495117187, "mean_token_accuracy": 0.6535730797052384, "num_tokens": 14954889.0, "step": 31350 }, { "entropy": 1.3124976098537444, "epoch": 0.7636001070012889, "grad_norm": 19.0, "learning_rate": 1.7798721317424113e-05, "loss": 1.3379446411132812, "mean_token_accuracy": 0.6669400709867478, "num_tokens": 14977213.0, "step": 31400 }, { "entropy": 1.3330624771118165, "epoch": 0.7648160307385521, "grad_norm": 8.625, "learning_rate": 1.7790324615028055e-05, "loss": 1.349387664794922, "mean_token_accuracy": 0.6659294873476028, "num_tokens": 15002647.0, "step": 31450 }, { "entropy": 1.3502304446697235, "epoch": 0.7660319544758153, "grad_norm": 14.4375, "learning_rate": 1.7781913917258645e-05, "loss": 1.3741587829589843, "mean_token_accuracy": 0.6693767982721329, "num_tokens": 15031067.0, "step": 31500 }, { "entropy": 1.2757529616355896, "epoch": 0.7672478782130785, "grad_norm": 14.8125, "learning_rate": 1.777348923922576e-05, "loss": 1.308182373046875, "mean_token_accuracy": 0.6796799236536026, "num_tokens": 15049274.0, "step": 31550 }, { "entropy": 1.242182047367096, "epoch": 0.7684638019503417, "grad_norm": 41.5, "learning_rate": 1.7765050596064397e-05, "loss": 1.256709442138672, "mean_token_accuracy": 0.684654848575592, "num_tokens": 15070584.0, "step": 31600 }, { "entropy": 1.260639282464981, "epoch": 0.7696797256876049, "grad_norm": 11.1875, "learning_rate": 1.775659800293464e-05, "loss": 1.2867503356933594, "mean_token_accuracy": 0.6854908221960068, "num_tokens": 15095854.0, "step": 31650 }, { "entropy": 1.2249360179901123, "epoch": 0.7708956494248681, "grad_norm": 17.125, "learning_rate": 1.774813147502163e-05, "loss": 1.258040313720703, "mean_token_accuracy": 0.686975474357605, "num_tokens": 15114062.0, "step": 31700 }, { "entropy": 1.2977924448251725, "epoch": 0.7721115731621313, "grad_norm": 14.9375, "learning_rate": 1.7739651027535542e-05, "loss": 1.3258702087402343, "mean_token_accuracy": 0.6686629390716553, "num_tokens": 15135571.0, "step": 31750 }, { "entropy": 1.3065683811903, "epoch": 0.7733274968993945, "grad_norm": 11.5625, "learning_rate": 1.7731156675711566e-05, "loss": 1.3173158264160156, "mean_token_accuracy": 0.6717148971557617, "num_tokens": 15156913.0, "step": 31800 }, { "entropy": 1.3867783665657043, "epoch": 0.7745434206366577, "grad_norm": 13.0625, "learning_rate": 1.772264843480986e-05, "loss": 1.3991978454589844, "mean_token_accuracy": 0.6573692619800567, "num_tokens": 15183472.0, "step": 31850 }, { "entropy": 1.243765811920166, "epoch": 0.7757593443739209, "grad_norm": 13.4375, "learning_rate": 1.771412632011554e-05, "loss": 1.2541871643066407, "mean_token_accuracy": 0.6887331688404084, "num_tokens": 15206062.0, "step": 31900 }, { "entropy": 1.2265130192041398, "epoch": 0.7769752681111841, "grad_norm": 7.59375, "learning_rate": 1.770559034693865e-05, "loss": 1.243491439819336, "mean_token_accuracy": 0.6858005738258361, "num_tokens": 15227812.0, "step": 31950 }, { "entropy": 1.2555934953689576, "epoch": 0.7781911918484473, "grad_norm": 16.625, "learning_rate": 1.769704053061412e-05, "loss": 1.2737808990478516, "mean_token_accuracy": 0.6761786299943924, "num_tokens": 15250751.0, "step": 32000 }, { "entropy": 1.211946923136711, "epoch": 0.7794071155857105, "grad_norm": 17.75, "learning_rate": 1.7688476886501758e-05, "loss": 1.2317932891845702, "mean_token_accuracy": 0.6918349468708038, "num_tokens": 15277276.0, "step": 32050 }, { "entropy": 1.2673168325424193, "epoch": 0.7806230393229736, "grad_norm": 13.9375, "learning_rate": 1.767989942998621e-05, "loss": 1.2965071105957031, "mean_token_accuracy": 0.6747112292051315, "num_tokens": 15304753.0, "step": 32100 }, { "entropy": 1.282315604686737, "epoch": 0.7818389630602368, "grad_norm": 10.0625, "learning_rate": 1.7671308176476946e-05, "loss": 1.2866700744628907, "mean_token_accuracy": 0.6761971247196198, "num_tokens": 15324802.0, "step": 32150 }, { "entropy": 1.2833102142810822, "epoch": 0.7830548867975, "grad_norm": 10.5625, "learning_rate": 1.76627031414082e-05, "loss": 1.2855580139160157, "mean_token_accuracy": 0.6733474695682525, "num_tokens": 15350109.0, "step": 32200 }, { "entropy": 1.3226429986953736, "epoch": 0.7842708105347632, "grad_norm": 12.0, "learning_rate": 1.7654084340238987e-05, "loss": 1.3386595153808594, "mean_token_accuracy": 0.6631902587413788, "num_tokens": 15373670.0, "step": 32250 }, { "entropy": 1.2814471352100372, "epoch": 0.7854867342720264, "grad_norm": 34.25, "learning_rate": 1.764545178845304e-05, "loss": 1.2878831481933595, "mean_token_accuracy": 0.6784471917152405, "num_tokens": 15400943.0, "step": 32300 }, { "entropy": 1.2299909722805022, "epoch": 0.7867026580092896, "grad_norm": 12.0, "learning_rate": 1.7636805501558804e-05, "loss": 1.2589624786376954, "mean_token_accuracy": 0.6827116614580154, "num_tokens": 15426305.0, "step": 32350 }, { "entropy": 1.5097826981544495, "epoch": 0.7879185817465528, "grad_norm": 10.75, "learning_rate": 1.7628145495089394e-05, "loss": 1.5199713134765624, "mean_token_accuracy": 0.635250037908554, "num_tokens": 15452742.0, "step": 32400 }, { "entropy": 1.294278804063797, "epoch": 0.789134505483816, "grad_norm": 12.1875, "learning_rate": 1.761947178460257e-05, "loss": 1.321103057861328, "mean_token_accuracy": 0.677585883140564, "num_tokens": 15475899.0, "step": 32450 }, { "entropy": 1.2839925384521484, "epoch": 0.7903504292210792, "grad_norm": 18.75, "learning_rate": 1.761078438568072e-05, "loss": 1.3126020812988282, "mean_token_accuracy": 0.6661591792106628, "num_tokens": 15500857.0, "step": 32500 }, { "entropy": 1.248317950963974, "epoch": 0.7915663529583424, "grad_norm": 19.875, "learning_rate": 1.7602083313930806e-05, "loss": 1.2577658843994142, "mean_token_accuracy": 0.6775270545482636, "num_tokens": 15524431.0, "step": 32550 }, { "entropy": 1.2741739201545714, "epoch": 0.7927822766956056, "grad_norm": 8.0, "learning_rate": 1.7593368584984375e-05, "loss": 1.284134979248047, "mean_token_accuracy": 0.677637649178505, "num_tokens": 15548236.0, "step": 32600 }, { "entropy": 1.3496142375469207, "epoch": 0.7939982004328688, "grad_norm": 14.375, "learning_rate": 1.75846402144975e-05, "loss": 1.359871826171875, "mean_token_accuracy": 0.666935904622078, "num_tokens": 15571975.0, "step": 32650 }, { "entropy": 1.2635667765140532, "epoch": 0.795214124170132, "grad_norm": 10.6875, "learning_rate": 1.7575898218150754e-05, "loss": 1.2672352600097656, "mean_token_accuracy": 0.6794650614261627, "num_tokens": 15596124.0, "step": 32700 }, { "entropy": 1.3562224280834199, "epoch": 0.7964300479073952, "grad_norm": 36.25, "learning_rate": 1.75671426116492e-05, "loss": 1.3669236755371095, "mean_token_accuracy": 0.6643521797657013, "num_tokens": 15620803.0, "step": 32750 }, { "entropy": 1.3684999024868012, "epoch": 0.7976459716446584, "grad_norm": 19.0, "learning_rate": 1.7558373410722347e-05, "loss": 1.3956919860839845, "mean_token_accuracy": 0.6563726311922073, "num_tokens": 15645271.0, "step": 32800 }, { "entropy": 1.3034540951251983, "epoch": 0.7988618953819216, "grad_norm": 18.875, "learning_rate": 1.7549590631124122e-05, "loss": 1.3228506469726562, "mean_token_accuracy": 0.670553103685379, "num_tokens": 15666732.0, "step": 32850 }, { "entropy": 1.2424634909629821, "epoch": 0.8000778191191849, "grad_norm": 11.4375, "learning_rate": 1.754079428863286e-05, "loss": 1.2735608673095704, "mean_token_accuracy": 0.6843376004695892, "num_tokens": 15691125.0, "step": 32900 }, { "entropy": 1.153019380569458, "epoch": 0.801293742856448, "grad_norm": 14.625, "learning_rate": 1.753198439905124e-05, "loss": 1.1630577087402343, "mean_token_accuracy": 0.7032513380050659, "num_tokens": 15714953.0, "step": 32950 }, { "entropy": 1.290501699447632, "epoch": 0.8025096665937113, "grad_norm": 24.5, "learning_rate": 1.7523160978206298e-05, "loss": 1.3031010437011719, "mean_token_accuracy": 0.672417933344841, "num_tokens": 15738367.0, "step": 33000 }, { "entropy": 1.2721804177761078, "epoch": 0.8037255903309745, "grad_norm": 16.5, "learning_rate": 1.7514324041949376e-05, "loss": 1.277992935180664, "mean_token_accuracy": 0.6773512184619903, "num_tokens": 15760698.0, "step": 33050 }, { "entropy": 1.284292607307434, "epoch": 0.8049415140682377, "grad_norm": 12.0625, "learning_rate": 1.7505473606156086e-05, "loss": 1.3037937927246093, "mean_token_accuracy": 0.6789377272129059, "num_tokens": 15788702.0, "step": 33100 }, { "entropy": 1.2152335494756699, "epoch": 0.8061574378055009, "grad_norm": 13.4375, "learning_rate": 1.74966096867263e-05, "loss": 1.2338765716552735, "mean_token_accuracy": 0.6869011771678925, "num_tokens": 15812087.0, "step": 33150 }, { "entropy": 1.3610283041000366, "epoch": 0.8073733615427641, "grad_norm": 34.25, "learning_rate": 1.7487732299584115e-05, "loss": 1.40349853515625, "mean_token_accuracy": 0.650696964263916, "num_tokens": 15832988.0, "step": 33200 }, { "entropy": 1.3530211353302002, "epoch": 0.8085892852800273, "grad_norm": 17.25, "learning_rate": 1.7478841460677816e-05, "loss": 1.3507275390625, "mean_token_accuracy": 0.6674026501178741, "num_tokens": 15855455.0, "step": 33250 }, { "entropy": 1.284665321111679, "epoch": 0.8098052090172905, "grad_norm": 13.75, "learning_rate": 1.7469937185979858e-05, "loss": 1.2980374145507811, "mean_token_accuracy": 0.6682228177785874, "num_tokens": 15880092.0, "step": 33300 }, { "entropy": 1.2614272880554198, "epoch": 0.8110211327545537, "grad_norm": 18.125, "learning_rate": 1.7461019491486842e-05, "loss": 1.2846394348144532, "mean_token_accuracy": 0.6762393468618393, "num_tokens": 15906270.0, "step": 33350 }, { "entropy": 1.2376176613569259, "epoch": 0.8122370564918169, "grad_norm": 14.0625, "learning_rate": 1.7452088393219456e-05, "loss": 1.253946533203125, "mean_token_accuracy": 0.6850462085008622, "num_tokens": 15930471.0, "step": 33400 }, { "entropy": 1.3559562683105468, "epoch": 0.8134529802290801, "grad_norm": 13.375, "learning_rate": 1.7443143907222492e-05, "loss": 1.3730575561523437, "mean_token_accuracy": 0.6650490820407867, "num_tokens": 15957540.0, "step": 33450 }, { "entropy": 1.373849047422409, "epoch": 0.8146689039663433, "grad_norm": 9.75, "learning_rate": 1.7434186049564775e-05, "loss": 1.3857582092285157, "mean_token_accuracy": 0.6631242781877518, "num_tokens": 15981488.0, "step": 33500 }, { "entropy": 1.3197306948900223, "epoch": 0.8158848277036064, "grad_norm": 19.0, "learning_rate": 1.742521483633917e-05, "loss": 1.342644500732422, "mean_token_accuracy": 0.6728410363197327, "num_tokens": 16001160.0, "step": 33550 }, { "entropy": 1.3785529029369354, "epoch": 0.8171007514408696, "grad_norm": 16.5, "learning_rate": 1.741623028366251e-05, "loss": 1.4092950439453125, "mean_token_accuracy": 0.6561831468343735, "num_tokens": 16025319.0, "step": 33600 }, { "entropy": 1.194454652070999, "epoch": 0.8183166751781328, "grad_norm": 10.25, "learning_rate": 1.7407232407675622e-05, "loss": 1.2044487762451173, "mean_token_accuracy": 0.6953711688518525, "num_tokens": 16048544.0, "step": 33650 }, { "entropy": 1.2495973479747773, "epoch": 0.819532598915396, "grad_norm": 17.125, "learning_rate": 1.7398221224543243e-05, "loss": 1.254457015991211, "mean_token_accuracy": 0.6839439296722412, "num_tokens": 16072633.0, "step": 33700 }, { "entropy": 1.3171969044208527, "epoch": 0.8207485226526592, "grad_norm": 20.625, "learning_rate": 1.7389196750454036e-05, "loss": 1.3473527526855469, "mean_token_accuracy": 0.6674605339765549, "num_tokens": 16093543.0, "step": 33750 }, { "entropy": 1.2738258934020996, "epoch": 0.8219644463899224, "grad_norm": 15.8125, "learning_rate": 1.7380159001620528e-05, "loss": 1.3096241760253906, "mean_token_accuracy": 0.6642470586299897, "num_tokens": 16119365.0, "step": 33800 }, { "entropy": 1.2518900805711746, "epoch": 0.8231803701271856, "grad_norm": 13.8125, "learning_rate": 1.7371107994279098e-05, "loss": 1.2600080871582031, "mean_token_accuracy": 0.6829512226581573, "num_tokens": 16143498.0, "step": 33850 }, { "entropy": 1.267427898645401, "epoch": 0.8243962938644488, "grad_norm": 13.1875, "learning_rate": 1.736204374468995e-05, "loss": 1.297550048828125, "mean_token_accuracy": 0.6720146667957306, "num_tokens": 16164520.0, "step": 33900 }, { "entropy": 1.304386180639267, "epoch": 0.825612217601712, "grad_norm": 13.125, "learning_rate": 1.7352966269137064e-05, "loss": 1.3281967163085937, "mean_token_accuracy": 0.6723226773738861, "num_tokens": 16184412.0, "step": 33950 }, { "entropy": 1.3184413778781892, "epoch": 0.8268281413389752, "grad_norm": 10.875, "learning_rate": 1.7343875583928197e-05, "loss": 1.3438044738769532, "mean_token_accuracy": 0.662082394361496, "num_tokens": 16208188.0, "step": 34000 }, { "entropy": 1.269478040933609, "epoch": 0.8280440650762384, "grad_norm": 22.75, "learning_rate": 1.7334771705394825e-05, "loss": 1.2788394927978515, "mean_token_accuracy": 0.680755558013916, "num_tokens": 16231222.0, "step": 34050 }, { "entropy": 1.304408929347992, "epoch": 0.8292599888135016, "grad_norm": 15.625, "learning_rate": 1.732565464989213e-05, "loss": 1.3101168823242189, "mean_token_accuracy": 0.6699317651987076, "num_tokens": 16250640.0, "step": 34100 }, { "entropy": 1.2953523236513138, "epoch": 0.8304759125507648, "grad_norm": 12.75, "learning_rate": 1.7316524433798974e-05, "loss": 1.3259376525878905, "mean_token_accuracy": 0.6787937474250794, "num_tokens": 16271606.0, "step": 34150 }, { "entropy": 1.222363872528076, "epoch": 0.831691836288028, "grad_norm": 28.0, "learning_rate": 1.7307381073517843e-05, "loss": 1.2318798828125, "mean_token_accuracy": 0.6911684781312942, "num_tokens": 16297119.0, "step": 34200 }, { "entropy": 1.2891963398456574, "epoch": 0.8329077600252912, "grad_norm": 10.1875, "learning_rate": 1.7298224585474857e-05, "loss": 1.2901005554199219, "mean_token_accuracy": 0.6738870775699616, "num_tokens": 16323126.0, "step": 34250 }, { "entropy": 1.285206691622734, "epoch": 0.8341236837625544, "grad_norm": 15.0, "learning_rate": 1.7289054986119712e-05, "loss": 1.3184848022460938, "mean_token_accuracy": 0.6728054642677307, "num_tokens": 16345490.0, "step": 34300 }, { "entropy": 1.3291216444969178, "epoch": 0.8353396074998176, "grad_norm": 17.5, "learning_rate": 1.7279872291925654e-05, "loss": 1.361817626953125, "mean_token_accuracy": 0.6805755782127381, "num_tokens": 16369344.0, "step": 34350 }, { "entropy": 1.2319292575120926, "epoch": 0.8365555312370808, "grad_norm": 14.9375, "learning_rate": 1.727067651938946e-05, "loss": 1.2503050994873046, "mean_token_accuracy": 0.6743580114841461, "num_tokens": 16393587.0, "step": 34400 }, { "entropy": 1.3432572519779205, "epoch": 0.837771454974344, "grad_norm": 10.875, "learning_rate": 1.7261467685031398e-05, "loss": 1.3681056213378906, "mean_token_accuracy": 0.6636675626039505, "num_tokens": 16416681.0, "step": 34450 }, { "entropy": 1.2829254722595216, "epoch": 0.8389873787116072, "grad_norm": 9.625, "learning_rate": 1.725224580539521e-05, "loss": 1.2972833251953124, "mean_token_accuracy": 0.674921510219574, "num_tokens": 16439426.0, "step": 34500 }, { "entropy": 1.3170963537693023, "epoch": 0.8402033024488704, "grad_norm": 11.625, "learning_rate": 1.7243010897048066e-05, "loss": 1.3343716430664063, "mean_token_accuracy": 0.6722611224651337, "num_tokens": 16462214.0, "step": 34550 }, { "entropy": 1.2718374288082124, "epoch": 0.8414192261861336, "grad_norm": 9.8125, "learning_rate": 1.7233762976580538e-05, "loss": 1.2900433349609375, "mean_token_accuracy": 0.6693054741621017, "num_tokens": 16486602.0, "step": 34600 }, { "entropy": 1.2556602382659912, "epoch": 0.8426351499233968, "grad_norm": 19.5, "learning_rate": 1.7224502060606586e-05, "loss": 1.2930685424804687, "mean_token_accuracy": 0.6736362385749817, "num_tokens": 16508822.0, "step": 34650 }, { "entropy": 1.2631418931484222, "epoch": 0.84385107366066, "grad_norm": 10.125, "learning_rate": 1.7215228165763515e-05, "loss": 1.257822494506836, "mean_token_accuracy": 0.6758167684078217, "num_tokens": 16536698.0, "step": 34700 }, { "entropy": 1.3088359129428864, "epoch": 0.8450669973979232, "grad_norm": 17.0, "learning_rate": 1.7205941308711935e-05, "loss": 1.3415408325195313, "mean_token_accuracy": 0.6581904220581055, "num_tokens": 16564291.0, "step": 34750 }, { "entropy": 1.308411535024643, "epoch": 0.8462829211351864, "grad_norm": 18.125, "learning_rate": 1.7196641506135755e-05, "loss": 1.320352783203125, "mean_token_accuracy": 0.6690320885181427, "num_tokens": 16585565.0, "step": 34800 }, { "entropy": 1.2323154938220977, "epoch": 0.8474988448724496, "grad_norm": 11.8125, "learning_rate": 1.718732877474213e-05, "loss": 1.2303726959228516, "mean_token_accuracy": 0.6877257800102234, "num_tokens": 16608581.0, "step": 34850 }, { "entropy": 1.419217362701893, "epoch": 0.8487147686097128, "grad_norm": 20.375, "learning_rate": 1.717800313126146e-05, "loss": 1.4445025634765625, "mean_token_accuracy": 0.6477040314674377, "num_tokens": 16634661.0, "step": 34900 }, { "entropy": 1.258851877450943, "epoch": 0.849930692346976, "grad_norm": 11.3125, "learning_rate": 1.716866459244732e-05, "loss": 1.2835916137695313, "mean_token_accuracy": 0.6827876710891724, "num_tokens": 16662371.0, "step": 34950 }, { "entropy": 1.349702821969986, "epoch": 0.8511466160842392, "grad_norm": 14.625, "learning_rate": 1.7159313175076464e-05, "loss": 1.3797615051269532, "mean_token_accuracy": 0.6594671577215194, "num_tokens": 16685712.0, "step": 35000 }, { "entropy": 1.2575137317180634, "epoch": 0.8523625398215023, "grad_norm": 14.4375, "learning_rate": 1.714994889594878e-05, "loss": 1.2665340423583984, "mean_token_accuracy": 0.683359968662262, "num_tokens": 16707079.0, "step": 35050 }, { "entropy": 1.19801906645298, "epoch": 0.8535784635587655, "grad_norm": 10.3125, "learning_rate": 1.7140571771887254e-05, "loss": 1.218580780029297, "mean_token_accuracy": 0.6915638720989228, "num_tokens": 16730704.0, "step": 35100 }, { "entropy": 1.3534709548950195, "epoch": 0.8547943872960287, "grad_norm": 18.25, "learning_rate": 1.7131181819737972e-05, "loss": 1.3613264465332031, "mean_token_accuracy": 0.6641484344005585, "num_tokens": 16754464.0, "step": 35150 }, { "entropy": 1.254922744035721, "epoch": 0.856010311033292, "grad_norm": 18.375, "learning_rate": 1.7121779056370034e-05, "loss": 1.2717888641357422, "mean_token_accuracy": 0.678503006696701, "num_tokens": 16777420.0, "step": 35200 }, { "entropy": 1.2415324914455415, "epoch": 0.8572262347705552, "grad_norm": 21.375, "learning_rate": 1.7112363498675582e-05, "loss": 1.2547886657714844, "mean_token_accuracy": 0.685390156507492, "num_tokens": 16800552.0, "step": 35250 }, { "entropy": 1.272292674779892, "epoch": 0.8584421585078184, "grad_norm": 16.5, "learning_rate": 1.7102935163569724e-05, "loss": 1.2930398559570313, "mean_token_accuracy": 0.6807149744033814, "num_tokens": 16824844.0, "step": 35300 }, { "entropy": 1.3452618491649628, "epoch": 0.8596580822450816, "grad_norm": 22.375, "learning_rate": 1.7093494067990537e-05, "loss": 1.3895854187011718, "mean_token_accuracy": 0.6573434501886368, "num_tokens": 16847596.0, "step": 35350 }, { "entropy": 1.374908002614975, "epoch": 0.8608740059823448, "grad_norm": 16.875, "learning_rate": 1.708404022889901e-05, "loss": 1.3920635986328125, "mean_token_accuracy": 0.6558866262435913, "num_tokens": 16869132.0, "step": 35400 }, { "entropy": 1.1898237538337708, "epoch": 0.862089929719608, "grad_norm": 10.875, "learning_rate": 1.7074573663279036e-05, "loss": 1.2067510986328125, "mean_token_accuracy": 0.690337918996811, "num_tokens": 16899050.0, "step": 35450 }, { "entropy": 1.2296862840652465, "epoch": 0.8633058534568712, "grad_norm": 15.5625, "learning_rate": 1.7065094388137365e-05, "loss": 1.2407195281982422, "mean_token_accuracy": 0.683502956032753, "num_tokens": 16917906.0, "step": 35500 }, { "entropy": 1.276252520084381, "epoch": 0.8645217771941344, "grad_norm": 10.75, "learning_rate": 1.7055602420503586e-05, "loss": 1.300546875, "mean_token_accuracy": 0.6767788815498352, "num_tokens": 16940729.0, "step": 35550 }, { "entropy": 1.1484793853759765, "epoch": 0.8657377009313976, "grad_norm": 11.875, "learning_rate": 1.7046097777430083e-05, "loss": 1.187029571533203, "mean_token_accuracy": 0.7012014269828797, "num_tokens": 16964005.0, "step": 35600 }, { "entropy": 1.2503758072853088, "epoch": 0.8669536246686608, "grad_norm": 12.3125, "learning_rate": 1.703658047599202e-05, "loss": 1.2415087890625, "mean_token_accuracy": 0.682029949426651, "num_tokens": 16987114.0, "step": 35650 }, { "entropy": 1.2901885902881622, "epoch": 0.868169548405924, "grad_norm": 11.9375, "learning_rate": 1.7027050533287283e-05, "loss": 1.330028076171875, "mean_token_accuracy": 0.6618649590015412, "num_tokens": 17010195.0, "step": 35700 }, { "entropy": 1.3162678730487825, "epoch": 0.8693854721431872, "grad_norm": 10.875, "learning_rate": 1.7017507966436492e-05, "loss": 1.3268826293945313, "mean_token_accuracy": 0.670789390206337, "num_tokens": 17032032.0, "step": 35750 }, { "entropy": 1.2425071787834168, "epoch": 0.8706013958804504, "grad_norm": 13.8125, "learning_rate": 1.7007952792582934e-05, "loss": 1.2755671691894532, "mean_token_accuracy": 0.6813260114192963, "num_tokens": 17055412.0, "step": 35800 }, { "entropy": 1.2610075497627258, "epoch": 0.8718173196177136, "grad_norm": 30.375, "learning_rate": 1.699838502889255e-05, "loss": 1.2878074645996094, "mean_token_accuracy": 0.6810142815113067, "num_tokens": 17075545.0, "step": 35850 }, { "entropy": 1.286871303319931, "epoch": 0.8730332433549768, "grad_norm": 18.25, "learning_rate": 1.6988804692553887e-05, "loss": 1.316726837158203, "mean_token_accuracy": 0.6711949896812439, "num_tokens": 17095580.0, "step": 35900 }, { "entropy": 1.255395352244377, "epoch": 0.87424916709224, "grad_norm": 15.1875, "learning_rate": 1.6979211800778096e-05, "loss": 1.2773341369628906, "mean_token_accuracy": 0.6739886516332626, "num_tokens": 17119554.0, "step": 35950 }, { "entropy": 1.2459684097766877, "epoch": 0.8754650908295032, "grad_norm": 12.625, "learning_rate": 1.6969606370798868e-05, "loss": 1.2484110260009766, "mean_token_accuracy": 0.692210533618927, "num_tokens": 17140839.0, "step": 36000 }, { "entropy": 1.360015528202057, "epoch": 0.8766810145667664, "grad_norm": 8.9375, "learning_rate": 1.6959988419872427e-05, "loss": 1.3801754760742186, "mean_token_accuracy": 0.6672072565555572, "num_tokens": 17163109.0, "step": 36050 }, { "entropy": 1.1799780642986297, "epoch": 0.8778969383040296, "grad_norm": 9.6875, "learning_rate": 1.6950357965277495e-05, "loss": 1.186567840576172, "mean_token_accuracy": 0.7001185035705566, "num_tokens": 17190456.0, "step": 36100 }, { "entropy": 1.2043678921461105, "epoch": 0.8791128620412928, "grad_norm": 10.9375, "learning_rate": 1.694071502431525e-05, "loss": 1.2082476043701171, "mean_token_accuracy": 0.6926586985588074, "num_tokens": 17215993.0, "step": 36150 }, { "entropy": 1.3217507600784302, "epoch": 0.880328785778556, "grad_norm": 11.0, "learning_rate": 1.69310596143093e-05, "loss": 1.3795762634277344, "mean_token_accuracy": 0.6663976693153382, "num_tokens": 17238834.0, "step": 36200 }, { "entropy": 1.2496319198608399, "epoch": 0.8815447095158192, "grad_norm": 19.5, "learning_rate": 1.692139175260566e-05, "loss": 1.2696981048583984, "mean_token_accuracy": 0.6807166248559952, "num_tokens": 17261135.0, "step": 36250 }, { "entropy": 1.3278730642795562, "epoch": 0.8827606332530824, "grad_norm": 10.9375, "learning_rate": 1.691171145657271e-05, "loss": 1.34739013671875, "mean_token_accuracy": 0.6685321515798569, "num_tokens": 17286799.0, "step": 36300 }, { "entropy": 1.2731690609455109, "epoch": 0.8839765569903456, "grad_norm": 17.625, "learning_rate": 1.6902018743601172e-05, "loss": 1.28620849609375, "mean_token_accuracy": 0.6775994396209717, "num_tokens": 17312092.0, "step": 36350 }, { "entropy": 1.3370947933197022, "epoch": 0.8851924807276088, "grad_norm": 16.375, "learning_rate": 1.689231363110408e-05, "loss": 1.3556080627441407, "mean_token_accuracy": 0.6618657821416855, "num_tokens": 17333867.0, "step": 36400 }, { "entropy": 1.3221871626377106, "epoch": 0.886408404464872, "grad_norm": 11.3125, "learning_rate": 1.6882596136516726e-05, "loss": 1.36583251953125, "mean_token_accuracy": 0.6654245531558991, "num_tokens": 17356993.0, "step": 36450 }, { "entropy": 1.3576951730251312, "epoch": 0.8876243282021352, "grad_norm": 13.25, "learning_rate": 1.687286627729666e-05, "loss": 1.3581985473632812, "mean_token_accuracy": 0.6607654535770416, "num_tokens": 17380352.0, "step": 36500 }, { "entropy": 1.3103331196308137, "epoch": 0.8888402519393983, "grad_norm": 13.9375, "learning_rate": 1.6863124070923655e-05, "loss": 1.3235359191894531, "mean_token_accuracy": 0.6649000614881515, "num_tokens": 17403665.0, "step": 36550 }, { "entropy": 1.2323751950263977, "epoch": 0.8900561756766615, "grad_norm": 11.0, "learning_rate": 1.6853369534899635e-05, "loss": 1.2416665649414063, "mean_token_accuracy": 0.6826064240932465, "num_tokens": 17429508.0, "step": 36600 }, { "entropy": 1.279969607591629, "epoch": 0.8912720994139247, "grad_norm": 13.25, "learning_rate": 1.6843602686748707e-05, "loss": 1.2924313354492187, "mean_token_accuracy": 0.676679322719574, "num_tokens": 17453209.0, "step": 36650 }, { "entropy": 1.2748051333427428, "epoch": 0.8924880231511879, "grad_norm": 17.0, "learning_rate": 1.683382354401708e-05, "loss": 1.3034757995605468, "mean_token_accuracy": 0.6752795100212097, "num_tokens": 17476340.0, "step": 36700 }, { "entropy": 1.3369374430179597, "epoch": 0.8937039468884511, "grad_norm": 23.125, "learning_rate": 1.682403212427305e-05, "loss": 1.3636166381835937, "mean_token_accuracy": 0.6611566281318665, "num_tokens": 17499936.0, "step": 36750 }, { "entropy": 1.341022423505783, "epoch": 0.8949198706257143, "grad_norm": 22.0, "learning_rate": 1.6814228445106976e-05, "loss": 1.3623599243164062, "mean_token_accuracy": 0.6601166373491287, "num_tokens": 17523945.0, "step": 36800 }, { "entropy": 1.3122738993167877, "epoch": 0.8961357943629775, "grad_norm": 9.625, "learning_rate": 1.6804412524131232e-05, "loss": 1.3244032287597656, "mean_token_accuracy": 0.6674284994602203, "num_tokens": 17550578.0, "step": 36850 }, { "entropy": 1.2971325016021729, "epoch": 0.8973517181002407, "grad_norm": 11.125, "learning_rate": 1.6794584378980196e-05, "loss": 1.311529083251953, "mean_token_accuracy": 0.6713308560848236, "num_tokens": 17576727.0, "step": 36900 }, { "entropy": 1.2327189445495605, "epoch": 0.8985676418375039, "grad_norm": 33.75, "learning_rate": 1.6784744027310195e-05, "loss": 1.2621558380126954, "mean_token_accuracy": 0.6733198487758636, "num_tokens": 17600021.0, "step": 36950 }, { "entropy": 1.2863137567043303, "epoch": 0.8997835655747671, "grad_norm": 11.625, "learning_rate": 1.677489148679949e-05, "loss": 1.3142202758789063, "mean_token_accuracy": 0.6714473164081574, "num_tokens": 17622919.0, "step": 37000 }, { "entropy": 1.2464148890972138, "epoch": 0.9009994893120303, "grad_norm": 11.5, "learning_rate": 1.676502677514824e-05, "loss": 1.2683675384521484, "mean_token_accuracy": 0.6842757403850556, "num_tokens": 17649119.0, "step": 37050 }, { "entropy": 1.2564572274684906, "epoch": 0.9022154130492935, "grad_norm": 11.0, "learning_rate": 1.6755149910078476e-05, "loss": 1.2694277954101563, "mean_token_accuracy": 0.68529117166996, "num_tokens": 17676819.0, "step": 37100 }, { "entropy": 1.314138227701187, "epoch": 0.9034313367865567, "grad_norm": 16.75, "learning_rate": 1.6745260909334055e-05, "loss": 1.3345933532714844, "mean_token_accuracy": 0.6627428305149078, "num_tokens": 17700297.0, "step": 37150 }, { "entropy": 1.1812559515237808, "epoch": 0.9046472605238199, "grad_norm": 12.6875, "learning_rate": 1.6735359790680625e-05, "loss": 1.2051509857177733, "mean_token_accuracy": 0.6976480287313461, "num_tokens": 17723991.0, "step": 37200 }, { "entropy": 1.2362561333179474, "epoch": 0.9058631842610831, "grad_norm": 14.9375, "learning_rate": 1.672544657190563e-05, "loss": 1.2616867065429687, "mean_token_accuracy": 0.6874185848236084, "num_tokens": 17748531.0, "step": 37250 }, { "entropy": 1.3679325413703918, "epoch": 0.9070791079983463, "grad_norm": 26.0, "learning_rate": 1.671552127081823e-05, "loss": 1.3900859069824218, "mean_token_accuracy": 0.6515968602895736, "num_tokens": 17769970.0, "step": 37300 }, { "entropy": 1.322537863254547, "epoch": 0.9082950317356095, "grad_norm": 21.75, "learning_rate": 1.67055839052493e-05, "loss": 1.3315280151367188, "mean_token_accuracy": 0.6741810494661331, "num_tokens": 17792966.0, "step": 37350 }, { "entropy": 1.225341365337372, "epoch": 0.9095109554728728, "grad_norm": 20.625, "learning_rate": 1.669563449305139e-05, "loss": 1.2605479431152344, "mean_token_accuracy": 0.6828233462572098, "num_tokens": 17818723.0, "step": 37400 }, { "entropy": 1.2141255927085877, "epoch": 0.910726879210136, "grad_norm": 16.25, "learning_rate": 1.6685673052098684e-05, "loss": 1.2203114318847657, "mean_token_accuracy": 0.6857129967212677, "num_tokens": 17841926.0, "step": 37450 }, { "entropy": 1.4218631052970887, "epoch": 0.9119428029473992, "grad_norm": 38.0, "learning_rate": 1.667569960028699e-05, "loss": 1.4245736694335938, "mean_token_accuracy": 0.6509634119272232, "num_tokens": 17865360.0, "step": 37500 }, { "entropy": 1.3917211699485779, "epoch": 0.9131587266846624, "grad_norm": 37.25, "learning_rate": 1.6665714155533678e-05, "loss": 1.4096817016601562, "mean_token_accuracy": 0.6539940387010574, "num_tokens": 17888937.0, "step": 37550 }, { "entropy": 1.3376558113098145, "epoch": 0.9143746504219256, "grad_norm": 12.5625, "learning_rate": 1.665571673577767e-05, "loss": 1.3781118774414063, "mean_token_accuracy": 0.6496420496702194, "num_tokens": 17915022.0, "step": 37600 }, { "entropy": 1.3153994929790498, "epoch": 0.9155905741591888, "grad_norm": 11.625, "learning_rate": 1.6645707358979405e-05, "loss": 1.324610137939453, "mean_token_accuracy": 0.6684761941432953, "num_tokens": 17938560.0, "step": 37650 }, { "entropy": 1.2306906712055206, "epoch": 0.916806497896452, "grad_norm": 12.5625, "learning_rate": 1.6635686043120796e-05, "loss": 1.2488372802734375, "mean_token_accuracy": 0.6815380817651748, "num_tokens": 17963164.0, "step": 37700 }, { "entropy": 1.2518015468120576, "epoch": 0.9180224216337152, "grad_norm": 18.375, "learning_rate": 1.662565280620521e-05, "loss": 1.2612264251708984, "mean_token_accuracy": 0.6808405268192291, "num_tokens": 17987842.0, "step": 37750 }, { "entropy": 1.1906898403167725, "epoch": 0.9192383453709784, "grad_norm": 19.125, "learning_rate": 1.6615607666257427e-05, "loss": 1.2289363098144532, "mean_token_accuracy": 0.6857355403900146, "num_tokens": 18014182.0, "step": 37800 }, { "entropy": 1.3611641561985015, "epoch": 0.9204542691082416, "grad_norm": 14.5625, "learning_rate": 1.6605550641323616e-05, "loss": 1.4014422607421875, "mean_token_accuracy": 0.661869940161705, "num_tokens": 18038683.0, "step": 37850 }, { "entropy": 1.2072744190692901, "epoch": 0.9216701928455048, "grad_norm": 14.0625, "learning_rate": 1.6595481749471288e-05, "loss": 1.231659164428711, "mean_token_accuracy": 0.692221587896347, "num_tokens": 18065424.0, "step": 37900 }, { "entropy": 1.3240436327457428, "epoch": 0.922886116582768, "grad_norm": 20.0, "learning_rate": 1.658540100878928e-05, "loss": 1.3206703186035156, "mean_token_accuracy": 0.6752091217041015, "num_tokens": 18091413.0, "step": 37950 }, { "entropy": 1.317883276939392, "epoch": 0.9241020403200312, "grad_norm": 21.75, "learning_rate": 1.6575308437387718e-05, "loss": 1.3408163452148438, "mean_token_accuracy": 0.6655285203456879, "num_tokens": 18116456.0, "step": 38000 }, { "entropy": 1.3144543492794036, "epoch": 0.9253179640572943, "grad_norm": 16.25, "learning_rate": 1.6565204053397973e-05, "loss": 1.345282745361328, "mean_token_accuracy": 0.6674581515789032, "num_tokens": 18140343.0, "step": 38050 }, { "entropy": 1.2689211601018906, "epoch": 0.9265338877945575, "grad_norm": 14.8125, "learning_rate": 1.6555087874972645e-05, "loss": 1.2840608215332032, "mean_token_accuracy": 0.6865440213680267, "num_tokens": 18164541.0, "step": 38100 }, { "entropy": 1.319093039035797, "epoch": 0.9277498115318207, "grad_norm": 15.3125, "learning_rate": 1.654495992028552e-05, "loss": 1.3359408569335938, "mean_token_accuracy": 0.6659414631128311, "num_tokens": 18187485.0, "step": 38150 }, { "entropy": 1.339752323627472, "epoch": 0.9289657352690839, "grad_norm": 8.875, "learning_rate": 1.653482020753154e-05, "loss": 1.3436843872070312, "mean_token_accuracy": 0.6645578151941299, "num_tokens": 18211819.0, "step": 38200 }, { "entropy": 1.3156696581840515, "epoch": 0.9301816590063471, "grad_norm": 23.375, "learning_rate": 1.6524668754926768e-05, "loss": 1.3424374389648437, "mean_token_accuracy": 0.6738470143079758, "num_tokens": 18236217.0, "step": 38250 }, { "entropy": 1.253431454896927, "epoch": 0.9313975827436103, "grad_norm": 27.25, "learning_rate": 1.651450558070837e-05, "loss": 1.2507430267333985, "mean_token_accuracy": 0.6749428570270538, "num_tokens": 18258714.0, "step": 38300 }, { "entropy": 1.357618942260742, "epoch": 0.9326135064808735, "grad_norm": 9.4375, "learning_rate": 1.6504330703134548e-05, "loss": 1.4158270263671875, "mean_token_accuracy": 0.6563140833377838, "num_tokens": 18281171.0, "step": 38350 }, { "entropy": 1.3190254867076874, "epoch": 0.9338294302181367, "grad_norm": 18.25, "learning_rate": 1.6494144140484543e-05, "loss": 1.341153564453125, "mean_token_accuracy": 0.6615365046262741, "num_tokens": 18305175.0, "step": 38400 }, { "entropy": 1.2771383452415466, "epoch": 0.9350453539553999, "grad_norm": 16.0, "learning_rate": 1.6483945911058602e-05, "loss": 1.3055924987792968, "mean_token_accuracy": 0.6730807518959045, "num_tokens": 18330739.0, "step": 38450 }, { "entropy": 1.2982803308963775, "epoch": 0.9362612776926631, "grad_norm": 22.25, "learning_rate": 1.6473736033177897e-05, "loss": 1.318198699951172, "mean_token_accuracy": 0.6752356290817261, "num_tokens": 18354592.0, "step": 38500 }, { "entropy": 1.133040714263916, "epoch": 0.9374772014299263, "grad_norm": 24.5, "learning_rate": 1.646351452518456e-05, "loss": 1.1367547607421875, "mean_token_accuracy": 0.706446100473404, "num_tokens": 18379543.0, "step": 38550 }, { "entropy": 1.1862151217460633, "epoch": 0.9386931251671895, "grad_norm": 18.875, "learning_rate": 1.6453281405441592e-05, "loss": 1.2043597412109375, "mean_token_accuracy": 0.6941111636161804, "num_tokens": 18401507.0, "step": 38600 }, { "entropy": 1.2463269603252412, "epoch": 0.9399090489044527, "grad_norm": 13.5625, "learning_rate": 1.6443036692332876e-05, "loss": 1.2620709228515625, "mean_token_accuracy": 0.676375179886818, "num_tokens": 18427270.0, "step": 38650 }, { "entropy": 1.2079343068599702, "epoch": 0.9411249726417159, "grad_norm": 10.8125, "learning_rate": 1.64327804042631e-05, "loss": 1.236028594970703, "mean_token_accuracy": 0.6864644378423691, "num_tokens": 18452108.0, "step": 38700 }, { "entropy": 1.217312661409378, "epoch": 0.9423408963789791, "grad_norm": 20.75, "learning_rate": 1.642251255965777e-05, "loss": 1.2232259368896485, "mean_token_accuracy": 0.6895075744390488, "num_tokens": 18473665.0, "step": 38750 }, { "entropy": 1.2350401592254638, "epoch": 0.9435568201162423, "grad_norm": 13.875, "learning_rate": 1.6412233176963132e-05, "loss": 1.2547639465332032, "mean_token_accuracy": 0.6901251167058945, "num_tokens": 18495842.0, "step": 38800 }, { "entropy": 1.2326401162147522, "epoch": 0.9447727438535055, "grad_norm": 15.9375, "learning_rate": 1.6401942274646172e-05, "loss": 1.2633992767333984, "mean_token_accuracy": 0.6861291080713272, "num_tokens": 18521236.0, "step": 38850 }, { "entropy": 1.2005161726474762, "epoch": 0.9459886675907687, "grad_norm": 11.0, "learning_rate": 1.6391639871194573e-05, "loss": 1.203843536376953, "mean_token_accuracy": 0.688817384839058, "num_tokens": 18547125.0, "step": 38900 }, { "entropy": 1.3235130405426025, "epoch": 0.9472045913280319, "grad_norm": 13.8125, "learning_rate": 1.6381325985116672e-05, "loss": 1.3529127502441407, "mean_token_accuracy": 0.663879325389862, "num_tokens": 18572640.0, "step": 38950 }, { "entropy": 1.2179168701171874, "epoch": 0.9484205150652951, "grad_norm": 13.3125, "learning_rate": 1.637100063494144e-05, "loss": 1.2318245697021484, "mean_token_accuracy": 0.6880240756273269, "num_tokens": 18598871.0, "step": 39000 }, { "entropy": 1.2494645619392395, "epoch": 0.9496364388025583, "grad_norm": 19.125, "learning_rate": 1.636066383921844e-05, "loss": 1.2559146118164062, "mean_token_accuracy": 0.6802505320310592, "num_tokens": 18622055.0, "step": 39050 }, { "entropy": 1.2329797661304474, "epoch": 0.9508523625398215, "grad_norm": 17.125, "learning_rate": 1.63503156165178e-05, "loss": 1.2679521942138672, "mean_token_accuracy": 0.6851815617084503, "num_tokens": 18646718.0, "step": 39100 }, { "entropy": 1.2790827798843383, "epoch": 0.9520682862770847, "grad_norm": 20.75, "learning_rate": 1.6339955985430175e-05, "loss": 1.315712432861328, "mean_token_accuracy": 0.6726947736740112, "num_tokens": 18670987.0, "step": 39150 }, { "entropy": 1.2478013682365416, "epoch": 0.9532842100143479, "grad_norm": 12.4375, "learning_rate": 1.6329584964566717e-05, "loss": 1.256207275390625, "mean_token_accuracy": 0.6810627812147141, "num_tokens": 18694791.0, "step": 39200 }, { "entropy": 1.3294434988498687, "epoch": 0.9545001337516111, "grad_norm": 12.6875, "learning_rate": 1.631920257255904e-05, "loss": 1.3368319702148437, "mean_token_accuracy": 0.6634562999010086, "num_tokens": 18719098.0, "step": 39250 }, { "entropy": 1.2743344491720199, "epoch": 0.9557160574888743, "grad_norm": 12.6875, "learning_rate": 1.630880882805918e-05, "loss": 1.3052153015136718, "mean_token_accuracy": 0.6701937907934189, "num_tokens": 18742082.0, "step": 39300 }, { "entropy": 1.2906163358688354, "epoch": 0.9569319812261375, "grad_norm": 15.25, "learning_rate": 1.6298403749739573e-05, "loss": 1.3065521240234375, "mean_token_accuracy": 0.6765269792079925, "num_tokens": 18763834.0, "step": 39350 }, { "entropy": 1.2569253516197205, "epoch": 0.9581479049634007, "grad_norm": 16.5, "learning_rate": 1.628798735629302e-05, "loss": 1.272566375732422, "mean_token_accuracy": 0.684459981918335, "num_tokens": 18789296.0, "step": 39400 }, { "entropy": 1.2971358382701874, "epoch": 0.959363828700664, "grad_norm": 10.75, "learning_rate": 1.6277559666432644e-05, "loss": 1.3079290771484375, "mean_token_accuracy": 0.6648435151576996, "num_tokens": 18812495.0, "step": 39450 }, { "entropy": 1.2431987559795379, "epoch": 0.9605797524379271, "grad_norm": 13.125, "learning_rate": 1.6267120698891862e-05, "loss": 1.2582263946533203, "mean_token_accuracy": 0.6806806480884552, "num_tokens": 18834284.0, "step": 39500 }, { "entropy": 1.2703800797462463, "epoch": 0.9617956761751902, "grad_norm": 10.125, "learning_rate": 1.625667047242435e-05, "loss": 1.3077757263183594, "mean_token_accuracy": 0.6787149906158447, "num_tokens": 18856429.0, "step": 39550 }, { "entropy": 1.2358007001876832, "epoch": 0.9630115999124534, "grad_norm": 8.8125, "learning_rate": 1.624620900580402e-05, "loss": 1.2575350952148439, "mean_token_accuracy": 0.6830630826950074, "num_tokens": 18878998.0, "step": 39600 }, { "entropy": 1.2961362135410308, "epoch": 0.9642275236497166, "grad_norm": 21.75, "learning_rate": 1.623573631782497e-05, "loss": 1.3227479553222656, "mean_token_accuracy": 0.6734661042690278, "num_tokens": 18901042.0, "step": 39650 }, { "entropy": 1.249845697581768, "epoch": 0.9654434473869798, "grad_norm": 12.1875, "learning_rate": 1.6225252427301458e-05, "loss": 1.2513157653808593, "mean_token_accuracy": 0.686974606513977, "num_tokens": 18924658.0, "step": 39700 }, { "entropy": 1.2363671976327897, "epoch": 0.966659371124243, "grad_norm": 13.625, "learning_rate": 1.6214757353067864e-05, "loss": 1.2587667846679687, "mean_token_accuracy": 0.6793397414684296, "num_tokens": 18949765.0, "step": 39750 }, { "entropy": 1.28436891913414, "epoch": 0.9678752948615063, "grad_norm": 11.625, "learning_rate": 1.620425111397867e-05, "loss": 1.3060990905761718, "mean_token_accuracy": 0.6690796583890914, "num_tokens": 18973351.0, "step": 39800 }, { "entropy": 1.3710987496376037, "epoch": 0.9690912185987695, "grad_norm": 10.3125, "learning_rate": 1.61937337289084e-05, "loss": 1.372046661376953, "mean_token_accuracy": 0.6559437960386276, "num_tokens": 18996217.0, "step": 39850 }, { "entropy": 1.3446544072031974, "epoch": 0.9703071423360327, "grad_norm": 14.0, "learning_rate": 1.618320521675162e-05, "loss": 1.37428466796875, "mean_token_accuracy": 0.6707555377483367, "num_tokens": 19022432.0, "step": 39900 }, { "entropy": 1.2522403195500373, "epoch": 0.9715230660732959, "grad_norm": 18.25, "learning_rate": 1.617266559642287e-05, "loss": 1.257581024169922, "mean_token_accuracy": 0.6800504672527313, "num_tokens": 19047177.0, "step": 39950 }, { "entropy": 1.195557255744934, "epoch": 0.9727389898105591, "grad_norm": 21.25, "learning_rate": 1.6162114886856653e-05, "loss": 1.1868400573730469, "mean_token_accuracy": 0.6939961671829223, "num_tokens": 19071069.0, "step": 40000 }, { "epoch": 0.9727389898105591, "eval_entropy": 1.2480564676722543, "eval_loss": 1.2896699905395508, "eval_mean_token_accuracy": 0.6786122245570976, "eval_num_tokens": 19071069.0, "eval_runtime": 391.42, "eval_samples_per_second": 11.673, "eval_steps_per_second": 11.673, "step": 40000 }, { "entropy": 1.17752769947052, "epoch": 0.9739549135478223, "grad_norm": 30.125, "learning_rate": 1.6151553107007395e-05, "loss": 1.2155185699462892, "mean_token_accuracy": 0.6909275734424591, "num_tokens": 19093367.0, "step": 40050 }, { "entropy": 1.2135307204723358, "epoch": 0.9751708372850855, "grad_norm": 21.125, "learning_rate": 1.614098027584941e-05, "loss": 1.2284493255615234, "mean_token_accuracy": 0.6831330001354218, "num_tokens": 19119359.0, "step": 40100 }, { "entropy": 1.2377767658233643, "epoch": 0.9763867610223487, "grad_norm": 16.25, "learning_rate": 1.6130396412376865e-05, "loss": 1.2508012390136718, "mean_token_accuracy": 0.6896417611837387, "num_tokens": 19142703.0, "step": 40150 }, { "entropy": 1.2434790712594985, "epoch": 0.9776026847596119, "grad_norm": 22.375, "learning_rate": 1.6119801535603738e-05, "loss": 1.2776036071777344, "mean_token_accuracy": 0.6875056785345077, "num_tokens": 19165881.0, "step": 40200 }, { "entropy": 1.2916355055570603, "epoch": 0.9788186084968751, "grad_norm": 12.5, "learning_rate": 1.610919566456381e-05, "loss": 1.2804730224609375, "mean_token_accuracy": 0.6771031749248505, "num_tokens": 19190449.0, "step": 40250 }, { "entropy": 1.171314183473587, "epoch": 0.9800345322341383, "grad_norm": 14.25, "learning_rate": 1.6098578818310597e-05, "loss": 1.2004362487792968, "mean_token_accuracy": 0.69127974152565, "num_tokens": 19215760.0, "step": 40300 }, { "entropy": 1.4617342591285705, "epoch": 0.9812504559714015, "grad_norm": 12.0625, "learning_rate": 1.608795101591734e-05, "loss": 1.4912564086914062, "mean_token_accuracy": 0.6418628990650177, "num_tokens": 19238651.0, "step": 40350 }, { "entropy": 1.3614255559444428, "epoch": 0.9824663797086647, "grad_norm": 15.125, "learning_rate": 1.6077312276476964e-05, "loss": 1.37155517578125, "mean_token_accuracy": 0.666025276184082, "num_tokens": 19261256.0, "step": 40400 }, { "entropy": 1.3310783487558364, "epoch": 0.9836823034459279, "grad_norm": 20.5, "learning_rate": 1.6066662619102035e-05, "loss": 1.34533447265625, "mean_token_accuracy": 0.6667497718334198, "num_tokens": 19285230.0, "step": 40450 }, { "entropy": 1.322493314743042, "epoch": 0.9848982271831911, "grad_norm": 10.6875, "learning_rate": 1.6056002062924738e-05, "loss": 1.3362847900390624, "mean_token_accuracy": 0.6680021250247955, "num_tokens": 19305289.0, "step": 40500 }, { "entropy": 1.2567736756801606, "epoch": 0.9861141509204543, "grad_norm": 12.3125, "learning_rate": 1.6045330627096844e-05, "loss": 1.2866607666015626, "mean_token_accuracy": 0.6791130590438843, "num_tokens": 19330995.0, "step": 40550 }, { "entropy": 1.227874948978424, "epoch": 0.9873300746577175, "grad_norm": 14.4375, "learning_rate": 1.6034648330789658e-05, "loss": 1.2342717742919922, "mean_token_accuracy": 0.684545772075653, "num_tokens": 19352952.0, "step": 40600 }, { "entropy": 1.1573973786830902, "epoch": 0.9885459983949807, "grad_norm": 26.75, "learning_rate": 1.6023955193194e-05, "loss": 1.1512420654296875, "mean_token_accuracy": 0.7017405247688293, "num_tokens": 19372964.0, "step": 40650 }, { "entropy": 1.3050174283981324, "epoch": 0.9897619221322439, "grad_norm": 13.6875, "learning_rate": 1.6013251233520174e-05, "loss": 1.3422059631347656, "mean_token_accuracy": 0.6679460883140564, "num_tokens": 19395491.0, "step": 40700 }, { "entropy": 1.1733437848091126, "epoch": 0.9909778458695071, "grad_norm": 16.375, "learning_rate": 1.600253647099791e-05, "loss": 1.178622055053711, "mean_token_accuracy": 0.6975393629074097, "num_tokens": 19419684.0, "step": 40750 }, { "entropy": 1.2502824777364732, "epoch": 0.9921937696067703, "grad_norm": 10.875, "learning_rate": 1.599181092487636e-05, "loss": 1.2737794494628907, "mean_token_accuracy": 0.6838245928287506, "num_tokens": 19444836.0, "step": 40800 }, { "entropy": 1.2082511651515961, "epoch": 0.9934096933440335, "grad_norm": 9.875, "learning_rate": 1.5981074614424047e-05, "loss": 1.2456179809570314, "mean_token_accuracy": 0.6839610928297043, "num_tokens": 19468084.0, "step": 40850 }, { "entropy": 1.3019201791286468, "epoch": 0.9946256170812967, "grad_norm": 5.90625, "learning_rate": 1.597032755892882e-05, "loss": 1.324671173095703, "mean_token_accuracy": 0.6723885989189148, "num_tokens": 19490703.0, "step": 40900 }, { "entropy": 1.259577248096466, "epoch": 0.9958415408185599, "grad_norm": 22.0, "learning_rate": 1.595956977769785e-05, "loss": 1.2800320434570311, "mean_token_accuracy": 0.6777480751276016, "num_tokens": 19515358.0, "step": 40950 }, { "entropy": 1.3341368222236634, "epoch": 0.9970574645558231, "grad_norm": 12.6875, "learning_rate": 1.594880129005756e-05, "loss": 1.3325468444824218, "mean_token_accuracy": 0.6641083562374115, "num_tokens": 19536954.0, "step": 41000 }, { "entropy": 1.304046437740326, "epoch": 0.9982733882930862, "grad_norm": 11.3125, "learning_rate": 1.5938022115353618e-05, "loss": 1.3348501586914063, "mean_token_accuracy": 0.6716163933277131, "num_tokens": 19563712.0, "step": 41050 }, { "entropy": 1.319958540201187, "epoch": 0.9994893120303494, "grad_norm": 9.9375, "learning_rate": 1.5927232272950885e-05, "loss": 1.32943603515625, "mean_token_accuracy": 0.6699466997385025, "num_tokens": 19588061.0, "step": 41100 }, { "entropy": 1.2257022935152053, "epoch": 1.0007052357676127, "grad_norm": 14.3125, "learning_rate": 1.591643178223339e-05, "loss": 1.163029098510742, "mean_token_accuracy": 0.7029915690422058, "num_tokens": 19609800.0, "step": 41150 }, { "entropy": 1.0365826654434205, "epoch": 1.0019211595048758, "grad_norm": 16.0, "learning_rate": 1.5905620662604297e-05, "loss": 1.0248902893066407, "mean_token_accuracy": 0.7349356544017792, "num_tokens": 19634670.0, "step": 41200 }, { "entropy": 1.0859514027833939, "epoch": 1.0031370832421391, "grad_norm": 14.5, "learning_rate": 1.5894798933485846e-05, "loss": 1.0818561553955077, "mean_token_accuracy": 0.7166036885976791, "num_tokens": 19658367.0, "step": 41250 }, { "entropy": 1.0165627825260162, "epoch": 1.0043530069794022, "grad_norm": 28.875, "learning_rate": 1.5883966614319365e-05, "loss": 1.016882095336914, "mean_token_accuracy": 0.7327527081966401, "num_tokens": 19680964.0, "step": 41300 }, { "entropy": 1.0256896379590035, "epoch": 1.0055689307166655, "grad_norm": 10.1875, "learning_rate": 1.5873123724565178e-05, "loss": 1.0310304260253906, "mean_token_accuracy": 0.7336001014709472, "num_tokens": 19706910.0, "step": 41350 }, { "entropy": 1.085079321861267, "epoch": 1.0067848544539286, "grad_norm": 4.78125, "learning_rate": 1.5862270283702624e-05, "loss": 1.0701345825195312, "mean_token_accuracy": 0.7299644637107849, "num_tokens": 19731760.0, "step": 41400 }, { "entropy": 1.101848999261856, "epoch": 1.008000778191192, "grad_norm": 10.1875, "learning_rate": 1.5851406311229983e-05, "loss": 1.1069840240478515, "mean_token_accuracy": 0.7131460398435593, "num_tokens": 19753702.0, "step": 41450 }, { "entropy": 0.9988143044710159, "epoch": 1.009216701928455, "grad_norm": 19.75, "learning_rate": 1.5840531826664454e-05, "loss": 0.9801694488525391, "mean_token_accuracy": 0.7396875667572022, "num_tokens": 19775074.0, "step": 41500 }, { "entropy": 1.0338347619771957, "epoch": 1.0104326256657183, "grad_norm": 15.25, "learning_rate": 1.5829646849542134e-05, "loss": 1.03847412109375, "mean_token_accuracy": 0.7288343155384064, "num_tokens": 19800918.0, "step": 41550 }, { "entropy": 1.0094644278287888, "epoch": 1.0116485494029814, "grad_norm": 16.5, "learning_rate": 1.5818751399417957e-05, "loss": 1.0138330078125, "mean_token_accuracy": 0.7327321529388428, "num_tokens": 19828605.0, "step": 41600 }, { "entropy": 1.1174944519996644, "epoch": 1.0128644731402447, "grad_norm": 19.875, "learning_rate": 1.5807845495865675e-05, "loss": 1.129132080078125, "mean_token_accuracy": 0.710137597322464, "num_tokens": 19851597.0, "step": 41650 }, { "entropy": 1.0831511157751084, "epoch": 1.0140803968775078, "grad_norm": 7.65625, "learning_rate": 1.5796929158477822e-05, "loss": 1.1055765533447266, "mean_token_accuracy": 0.7097451829910278, "num_tokens": 19875820.0, "step": 41700 }, { "entropy": 1.1419637978076935, "epoch": 1.015296320614771, "grad_norm": 13.5, "learning_rate": 1.578600240686568e-05, "loss": 1.128932876586914, "mean_token_accuracy": 0.7180210399627686, "num_tokens": 19898428.0, "step": 41750 }, { "entropy": 1.1053716051578522, "epoch": 1.0165122443520342, "grad_norm": 16.625, "learning_rate": 1.5775065260659226e-05, "loss": 1.105461654663086, "mean_token_accuracy": 0.7085025852918625, "num_tokens": 19919630.0, "step": 41800 }, { "entropy": 1.050548769235611, "epoch": 1.0177281680892973, "grad_norm": 13.125, "learning_rate": 1.576411773950713e-05, "loss": 1.0412920379638673, "mean_token_accuracy": 0.7279394042491912, "num_tokens": 19943226.0, "step": 41850 }, { "entropy": 1.1296495002508165, "epoch": 1.0189440918265606, "grad_norm": 30.25, "learning_rate": 1.5753159863076684e-05, "loss": 1.133263168334961, "mean_token_accuracy": 0.7139160859584809, "num_tokens": 19961532.0, "step": 41900 }, { "entropy": 1.021832218170166, "epoch": 1.0201600155638237, "grad_norm": 15.3125, "learning_rate": 1.5742191651053794e-05, "loss": 1.0297850036621095, "mean_token_accuracy": 0.7277181255817413, "num_tokens": 19985228.0, "step": 41950 }, { "entropy": 1.0499229270219803, "epoch": 1.021375939301087, "grad_norm": 11.6875, "learning_rate": 1.5731213123142932e-05, "loss": 1.0248780059814453, "mean_token_accuracy": 0.7339975821971894, "num_tokens": 20005327.0, "step": 42000 }, { "entropy": 1.0889889669418336, "epoch": 1.0225918630383501, "grad_norm": 11.1875, "learning_rate": 1.57202242990671e-05, "loss": 1.1012786102294922, "mean_token_accuracy": 0.7129674303531647, "num_tokens": 20032163.0, "step": 42050 }, { "entropy": 0.9689486682415008, "epoch": 1.0238077867756135, "grad_norm": 28.75, "learning_rate": 1.57092251985678e-05, "loss": 0.96230712890625, "mean_token_accuracy": 0.7396260297298431, "num_tokens": 20053080.0, "step": 42100 }, { "entropy": 1.1016816008090973, "epoch": 1.0250237105128766, "grad_norm": 12.0625, "learning_rate": 1.569821584140499e-05, "loss": 1.110254440307617, "mean_token_accuracy": 0.7142639684677125, "num_tokens": 20075891.0, "step": 42150 }, { "entropy": 0.9778645408153533, "epoch": 1.0262396342501399, "grad_norm": 15.75, "learning_rate": 1.568719624735706e-05, "loss": 0.961946029663086, "mean_token_accuracy": 0.7459349429607391, "num_tokens": 20095123.0, "step": 42200 }, { "entropy": 1.0926250839233398, "epoch": 1.027455557987403, "grad_norm": 17.875, "learning_rate": 1.567616643622079e-05, "loss": 1.1072129821777343, "mean_token_accuracy": 0.711172045469284, "num_tokens": 20121138.0, "step": 42250 }, { "entropy": 1.109446108341217, "epoch": 1.0286714817246663, "grad_norm": 14.0625, "learning_rate": 1.566512642781131e-05, "loss": 1.1217416381835938, "mean_token_accuracy": 0.7168016695976257, "num_tokens": 20145749.0, "step": 42300 }, { "entropy": 1.0487771451473236, "epoch": 1.0298874054619294, "grad_norm": 19.25, "learning_rate": 1.565407624196208e-05, "loss": 1.054056396484375, "mean_token_accuracy": 0.7283505964279174, "num_tokens": 20171832.0, "step": 42350 }, { "entropy": 1.1488595259189607, "epoch": 1.0311033291991927, "grad_norm": 12.5625, "learning_rate": 1.5643015898524832e-05, "loss": 1.1618900299072266, "mean_token_accuracy": 0.6965837520360947, "num_tokens": 20200351.0, "step": 42400 }, { "entropy": 1.0390840125083924, "epoch": 1.0323192529364558, "grad_norm": 11.0, "learning_rate": 1.563194541736955e-05, "loss": 1.0258870697021485, "mean_token_accuracy": 0.731323139667511, "num_tokens": 20224169.0, "step": 42450 }, { "entropy": 0.9570994484424591, "epoch": 1.033535176673719, "grad_norm": 21.125, "learning_rate": 1.562086481838444e-05, "loss": 0.9709564971923829, "mean_token_accuracy": 0.7499929535388946, "num_tokens": 20249498.0, "step": 42500 }, { "entropy": 1.101138318181038, "epoch": 1.0347511004109822, "grad_norm": 10.5625, "learning_rate": 1.560977412147586e-05, "loss": 1.1188225555419922, "mean_token_accuracy": 0.7106661784648896, "num_tokens": 20271172.0, "step": 42550 }, { "entropy": 1.1122498899698257, "epoch": 1.0359670241482455, "grad_norm": 14.9375, "learning_rate": 1.5598673346568348e-05, "loss": 1.118797836303711, "mean_token_accuracy": 0.7127311277389526, "num_tokens": 20297308.0, "step": 42600 }, { "entropy": 0.8547952255606651, "epoch": 1.0371829478855086, "grad_norm": 14.75, "learning_rate": 1.5587562513604505e-05, "loss": 0.8550521087646484, "mean_token_accuracy": 0.776091023683548, "num_tokens": 20319981.0, "step": 42650 }, { "entropy": 1.1174312508106232, "epoch": 1.038398871622772, "grad_norm": 14.0, "learning_rate": 1.5576441642545036e-05, "loss": 1.1335577392578124, "mean_token_accuracy": 0.7058888685703277, "num_tokens": 20345984.0, "step": 42700 }, { "entropy": 1.0754284608364104, "epoch": 1.039614795360035, "grad_norm": 17.25, "learning_rate": 1.5565310753368653e-05, "loss": 1.0971035003662108, "mean_token_accuracy": 0.72421315908432, "num_tokens": 20369444.0, "step": 42750 }, { "entropy": 1.1048167717456818, "epoch": 1.0408307190972983, "grad_norm": 15.8125, "learning_rate": 1.5554169866072086e-05, "loss": 1.1132131958007812, "mean_token_accuracy": 0.7200241190195084, "num_tokens": 20392551.0, "step": 42800 }, { "entropy": 0.9277893018722534, "epoch": 1.0420466428345614, "grad_norm": 12.375, "learning_rate": 1.5543019000670012e-05, "loss": 0.9344986724853516, "mean_token_accuracy": 0.7508158981800079, "num_tokens": 20418182.0, "step": 42850 }, { "entropy": 1.0158471816778183, "epoch": 1.0432625665718247, "grad_norm": 29.125, "learning_rate": 1.5531858177195042e-05, "loss": 1.0332571411132812, "mean_token_accuracy": 0.7332357013225556, "num_tokens": 20444461.0, "step": 42900 }, { "entropy": 1.0184337151050569, "epoch": 1.0444784903090878, "grad_norm": 13.5, "learning_rate": 1.552068741569767e-05, "loss": 1.0302410888671876, "mean_token_accuracy": 0.7279232013225555, "num_tokens": 20470123.0, "step": 42950 }, { "entropy": 1.0620062071084977, "epoch": 1.045694414046351, "grad_norm": 13.25, "learning_rate": 1.550950673624626e-05, "loss": 1.081025161743164, "mean_token_accuracy": 0.7087981379032136, "num_tokens": 20496818.0, "step": 43000 }, { "entropy": 1.0229092103242874, "epoch": 1.0469103377836142, "grad_norm": 15.6875, "learning_rate": 1.549831615892697e-05, "loss": 1.0208181762695312, "mean_token_accuracy": 0.7398644173145295, "num_tokens": 20522868.0, "step": 43050 }, { "entropy": 1.0758934396505355, "epoch": 1.0481262615208775, "grad_norm": 12.9375, "learning_rate": 1.548711570384376e-05, "loss": 1.0881788635253906, "mean_token_accuracy": 0.7175721836090088, "num_tokens": 20545769.0, "step": 43100 }, { "entropy": 1.167164198756218, "epoch": 1.0493421852581406, "grad_norm": 11.3125, "learning_rate": 1.5475905391118313e-05, "loss": 1.1898109436035156, "mean_token_accuracy": 0.7042736285924911, "num_tokens": 20568280.0, "step": 43150 }, { "entropy": 1.037467440366745, "epoch": 1.050558108995404, "grad_norm": 10.8125, "learning_rate": 1.546468524089005e-05, "loss": 1.028110885620117, "mean_token_accuracy": 0.7326459085941315, "num_tokens": 20594123.0, "step": 43200 }, { "entropy": 1.1276749658584595, "epoch": 1.051774032732667, "grad_norm": 17.25, "learning_rate": 1.545345527331605e-05, "loss": 1.1313798522949219, "mean_token_accuracy": 0.7064532554149627, "num_tokens": 20616298.0, "step": 43250 }, { "entropy": 1.074656886458397, "epoch": 1.05298995646993, "grad_norm": 23.75, "learning_rate": 1.544221550857102e-05, "loss": 1.054702911376953, "mean_token_accuracy": 0.726213401556015, "num_tokens": 20636972.0, "step": 43300 }, { "entropy": 0.9934366476535798, "epoch": 1.0542058802071934, "grad_norm": 17.5, "learning_rate": 1.543096596684728e-05, "loss": 1.0133909606933593, "mean_token_accuracy": 0.7311198186874389, "num_tokens": 20657971.0, "step": 43350 }, { "entropy": 1.0937840461730957, "epoch": 1.0554218039444565, "grad_norm": 9.5625, "learning_rate": 1.5419706668354712e-05, "loss": 1.1026046752929688, "mean_token_accuracy": 0.7144168072938919, "num_tokens": 20683424.0, "step": 43400 }, { "entropy": 1.0389201933145522, "epoch": 1.0566377276817198, "grad_norm": 8.75, "learning_rate": 1.540843763332073e-05, "loss": 1.0583563232421875, "mean_token_accuracy": 0.7182553565502167, "num_tokens": 20706465.0, "step": 43450 }, { "entropy": 1.032156649827957, "epoch": 1.057853651418983, "grad_norm": 12.875, "learning_rate": 1.539715888199023e-05, "loss": 1.0498662567138672, "mean_token_accuracy": 0.7340823352336884, "num_tokens": 20730488.0, "step": 43500 }, { "entropy": 1.0893183374404907, "epoch": 1.0590695751562462, "grad_norm": 12.6875, "learning_rate": 1.538587043462557e-05, "loss": 1.0965233612060548, "mean_token_accuracy": 0.7185414135456085, "num_tokens": 20751217.0, "step": 43550 }, { "entropy": 1.0484262263774873, "epoch": 1.0602854988935093, "grad_norm": 17.75, "learning_rate": 1.5374572311506524e-05, "loss": 1.0649872589111329, "mean_token_accuracy": 0.7288954925537109, "num_tokens": 20771747.0, "step": 43600 }, { "entropy": 1.001895074248314, "epoch": 1.0615014226307726, "grad_norm": 14.375, "learning_rate": 1.5363264532930254e-05, "loss": 1.0032268524169923, "mean_token_accuracy": 0.7392091429233552, "num_tokens": 20792768.0, "step": 43650 }, { "entropy": 1.0551827436685561, "epoch": 1.0627173463680357, "grad_norm": 19.625, "learning_rate": 1.5351947119211264e-05, "loss": 1.0597721862792968, "mean_token_accuracy": 0.7261555182933808, "num_tokens": 20816279.0, "step": 43700 }, { "entropy": 1.1142620050907135, "epoch": 1.063933270105299, "grad_norm": 12.5, "learning_rate": 1.5340620090681363e-05, "loss": 1.1212269592285156, "mean_token_accuracy": 0.7133354735374451, "num_tokens": 20842448.0, "step": 43750 }, { "entropy": 1.120469936132431, "epoch": 1.0651491938425621, "grad_norm": 16.625, "learning_rate": 1.532928346768964e-05, "loss": 1.129591827392578, "mean_token_accuracy": 0.7097442066669464, "num_tokens": 20866312.0, "step": 43800 }, { "entropy": 1.1104649472236634, "epoch": 1.0663651175798254, "grad_norm": 13.9375, "learning_rate": 1.5317937270602424e-05, "loss": 1.1308428955078125, "mean_token_accuracy": 0.7033222454786301, "num_tokens": 20891264.0, "step": 43850 }, { "entropy": 1.036952890753746, "epoch": 1.0675810413170885, "grad_norm": 14.1875, "learning_rate": 1.5306581519803233e-05, "loss": 1.0325540161132813, "mean_token_accuracy": 0.7295211768150329, "num_tokens": 20915812.0, "step": 43900 }, { "entropy": 0.9968514847755432, "epoch": 1.0687969650543518, "grad_norm": 7.15625, "learning_rate": 1.5295216235692755e-05, "loss": 0.9871125030517578, "mean_token_accuracy": 0.7412255108356476, "num_tokens": 20937270.0, "step": 43950 }, { "entropy": 1.0331070458889007, "epoch": 1.070012888791615, "grad_norm": 12.6875, "learning_rate": 1.52838414386888e-05, "loss": 1.0503214263916016, "mean_token_accuracy": 0.7249454605579376, "num_tokens": 20962904.0, "step": 44000 }, { "entropy": 1.09283954590559, "epoch": 1.0712288125288782, "grad_norm": 21.5, "learning_rate": 1.527245714922628e-05, "loss": 1.0932463836669921, "mean_token_accuracy": 0.7201733076572419, "num_tokens": 20986365.0, "step": 44050 }, { "entropy": 1.0569049042463303, "epoch": 1.0724447362661413, "grad_norm": 14.75, "learning_rate": 1.5261063387757142e-05, "loss": 1.0749256896972657, "mean_token_accuracy": 0.7248133772611618, "num_tokens": 21011235.0, "step": 44100 }, { "entropy": 1.02895827293396, "epoch": 1.0736606600034047, "grad_norm": 14.6875, "learning_rate": 1.5249660174750367e-05, "loss": 1.0417445373535157, "mean_token_accuracy": 0.7292307609319687, "num_tokens": 21036348.0, "step": 44150 }, { "entropy": 1.1084667855501176, "epoch": 1.0748765837406677, "grad_norm": 13.4375, "learning_rate": 1.5238247530691901e-05, "loss": 1.1073839569091797, "mean_token_accuracy": 0.7174331456422806, "num_tokens": 21062709.0, "step": 44200 }, { "entropy": 1.019925827383995, "epoch": 1.076092507477931, "grad_norm": 13.0, "learning_rate": 1.522682547608464e-05, "loss": 1.0364785766601563, "mean_token_accuracy": 0.7284039258956909, "num_tokens": 21087663.0, "step": 44250 }, { "entropy": 0.9827379286289215, "epoch": 1.0773084312151942, "grad_norm": 17.75, "learning_rate": 1.5215394031448399e-05, "loss": 0.9654759979248047, "mean_token_accuracy": 0.7463980352878571, "num_tokens": 21109942.0, "step": 44300 }, { "entropy": 0.9909191870689392, "epoch": 1.0785243549524575, "grad_norm": 12.0, "learning_rate": 1.5203953217319833e-05, "loss": 1.009072265625, "mean_token_accuracy": 0.7387565767765045, "num_tokens": 21132308.0, "step": 44350 }, { "entropy": 1.1002849447727203, "epoch": 1.0797402786897206, "grad_norm": 14.0, "learning_rate": 1.5192503054252454e-05, "loss": 1.1087372589111328, "mean_token_accuracy": 0.7207002127170563, "num_tokens": 21153556.0, "step": 44400 }, { "entropy": 1.020677752494812, "epoch": 1.0809562024269839, "grad_norm": 23.0, "learning_rate": 1.518104356281656e-05, "loss": 1.0330602264404296, "mean_token_accuracy": 0.7343488013744355, "num_tokens": 21181179.0, "step": 44450 }, { "entropy": 1.1214724349975587, "epoch": 1.082172126164247, "grad_norm": 11.125, "learning_rate": 1.5169574763599206e-05, "loss": 1.1294394683837892, "mean_token_accuracy": 0.7049579066038132, "num_tokens": 21210487.0, "step": 44500 }, { "entropy": 1.091588210463524, "epoch": 1.0833880499015103, "grad_norm": 28.0, "learning_rate": 1.5158096677204173e-05, "loss": 1.0988902282714843, "mean_token_accuracy": 0.7270997846126557, "num_tokens": 21233133.0, "step": 44550 }, { "entropy": 1.0699863588809968, "epoch": 1.0846039736387734, "grad_norm": 24.125, "learning_rate": 1.5146609324251925e-05, "loss": 1.0538405609130859, "mean_token_accuracy": 0.7234089183807373, "num_tokens": 21256094.0, "step": 44600 }, { "entropy": 1.0919910615682602, "epoch": 1.0858198973760365, "grad_norm": 19.5, "learning_rate": 1.5135112725379575e-05, "loss": 1.1166837310791016, "mean_token_accuracy": 0.723498592376709, "num_tokens": 21276618.0, "step": 44650 }, { "entropy": 1.0432269859313965, "epoch": 1.0870358211132998, "grad_norm": 25.375, "learning_rate": 1.5123606901240835e-05, "loss": 1.0422335052490235, "mean_token_accuracy": 0.731784086227417, "num_tokens": 21296907.0, "step": 44700 }, { "entropy": 1.084926141500473, "epoch": 1.088251744850563, "grad_norm": 12.375, "learning_rate": 1.5112091872506012e-05, "loss": 1.0974710083007813, "mean_token_accuracy": 0.7117995524406433, "num_tokens": 21323412.0, "step": 44750 }, { "entropy": 0.9851578068733216, "epoch": 1.0894676685878262, "grad_norm": 20.375, "learning_rate": 1.510056765986193e-05, "loss": 0.997236099243164, "mean_token_accuracy": 0.7312912213802337, "num_tokens": 21351658.0, "step": 44800 }, { "entropy": 1.1072747057676315, "epoch": 1.0906835923250893, "grad_norm": 13.8125, "learning_rate": 1.5089034284011916e-05, "loss": 1.1042525482177734, "mean_token_accuracy": 0.7188129210472107, "num_tokens": 21375380.0, "step": 44850 }, { "entropy": 1.0737012922763824, "epoch": 1.0918995160623526, "grad_norm": 9.625, "learning_rate": 1.5077491765675768e-05, "loss": 1.0777804565429687, "mean_token_accuracy": 0.7193688833713532, "num_tokens": 21399776.0, "step": 44900 }, { "entropy": 1.0803762513399124, "epoch": 1.0931154397996157, "grad_norm": 26.75, "learning_rate": 1.5065940125589697e-05, "loss": 1.1015398406982422, "mean_token_accuracy": 0.710215123295784, "num_tokens": 21424714.0, "step": 44950 }, { "entropy": 1.1818352591991426, "epoch": 1.094331363536879, "grad_norm": 31.125, "learning_rate": 1.5054379384506307e-05, "loss": 1.174248046875, "mean_token_accuracy": 0.6966704785823822, "num_tokens": 21448839.0, "step": 45000 }, { "entropy": 1.0356277644634246, "epoch": 1.095547287274142, "grad_norm": 10.125, "learning_rate": 1.5042809563194555e-05, "loss": 1.0376033782958984, "mean_token_accuracy": 0.7304682648181915, "num_tokens": 21472127.0, "step": 45050 }, { "entropy": 0.9915796488523483, "epoch": 1.0967632110114054, "grad_norm": 17.5, "learning_rate": 1.5031230682439705e-05, "loss": 1.0212298583984376, "mean_token_accuracy": 0.7337250494956971, "num_tokens": 21496285.0, "step": 45100 }, { "entropy": 1.0620082235336303, "epoch": 1.0979791347486685, "grad_norm": 24.5, "learning_rate": 1.5019642763043294e-05, "loss": 1.066450653076172, "mean_token_accuracy": 0.7233717060089111, "num_tokens": 21519324.0, "step": 45150 }, { "entropy": 1.1055776196718217, "epoch": 1.0991950584859318, "grad_norm": 22.125, "learning_rate": 1.5008045825823107e-05, "loss": 1.1141819763183594, "mean_token_accuracy": 0.7105332446098328, "num_tokens": 21542392.0, "step": 45200 }, { "entropy": 1.0454763919115067, "epoch": 1.1004109822231949, "grad_norm": 25.875, "learning_rate": 1.4996439891613124e-05, "loss": 1.0500022125244142, "mean_token_accuracy": 0.7251096045970917, "num_tokens": 21569083.0, "step": 45250 }, { "entropy": 1.02343006670475, "epoch": 1.1016269059604582, "grad_norm": 15.3125, "learning_rate": 1.4984824981263481e-05, "loss": 1.0439102935791016, "mean_token_accuracy": 0.7319287014007568, "num_tokens": 21590265.0, "step": 45300 }, { "entropy": 1.0999270105361938, "epoch": 1.1028428296977213, "grad_norm": 18.75, "learning_rate": 1.4973201115640455e-05, "loss": 1.1369355773925782, "mean_token_accuracy": 0.7054049062728882, "num_tokens": 21614130.0, "step": 45350 }, { "entropy": 1.1421314287185669, "epoch": 1.1040587534349846, "grad_norm": 12.0, "learning_rate": 1.4961568315626397e-05, "loss": 1.1386830139160156, "mean_token_accuracy": 0.7101169037818909, "num_tokens": 21639160.0, "step": 45400 }, { "entropy": 1.179116799235344, "epoch": 1.1052746771722477, "grad_norm": 17.375, "learning_rate": 1.4949926602119718e-05, "loss": 1.18278564453125, "mean_token_accuracy": 0.7140402525663376, "num_tokens": 21669898.0, "step": 45450 }, { "entropy": 1.1492326009273528, "epoch": 1.106490600909511, "grad_norm": 24.0, "learning_rate": 1.4938275996034837e-05, "loss": 1.2034682464599609, "mean_token_accuracy": 0.6993312376737595, "num_tokens": 21691202.0, "step": 45500 }, { "entropy": 1.125652097761631, "epoch": 1.107706524646774, "grad_norm": 23.125, "learning_rate": 1.4926616518302153e-05, "loss": 1.1034360504150391, "mean_token_accuracy": 0.7225061786174775, "num_tokens": 21714777.0, "step": 45550 }, { "entropy": 1.0047913748025894, "epoch": 1.1089224483840374, "grad_norm": 11.8125, "learning_rate": 1.4914948189867995e-05, "loss": 1.0168608856201171, "mean_token_accuracy": 0.7283059060573578, "num_tokens": 21742288.0, "step": 45600 }, { "entropy": 1.0976434653997422, "epoch": 1.1101383721213005, "grad_norm": 11.6875, "learning_rate": 1.4903271031694601e-05, "loss": 1.076703338623047, "mean_token_accuracy": 0.7283291774988174, "num_tokens": 21766675.0, "step": 45650 }, { "entropy": 1.0675057196617126, "epoch": 1.1113542958585638, "grad_norm": 9.3125, "learning_rate": 1.4891585064760072e-05, "loss": 1.0740365600585937, "mean_token_accuracy": 0.7195557844638825, "num_tokens": 21787921.0, "step": 45700 }, { "entropy": 1.085394492149353, "epoch": 1.112570219595827, "grad_norm": 13.625, "learning_rate": 1.4879890310058327e-05, "loss": 1.0901954650878907, "mean_token_accuracy": 0.7169584977626801, "num_tokens": 21810888.0, "step": 45750 }, { "entropy": 1.066525307893753, "epoch": 1.1137861433330902, "grad_norm": 25.125, "learning_rate": 1.4868186788599073e-05, "loss": 1.0574069213867188, "mean_token_accuracy": 0.7263559448719025, "num_tokens": 21830657.0, "step": 45800 }, { "entropy": 1.091929211616516, "epoch": 1.1150020670703533, "grad_norm": 18.0, "learning_rate": 1.4856474521407771e-05, "loss": 1.0959893035888673, "mean_token_accuracy": 0.7209522563219071, "num_tokens": 21851995.0, "step": 45850 }, { "entropy": 1.0583029752969741, "epoch": 1.1162179908076166, "grad_norm": 13.9375, "learning_rate": 1.4844753529525595e-05, "loss": 1.074813461303711, "mean_token_accuracy": 0.7222145092487335, "num_tokens": 21877020.0, "step": 45900 }, { "entropy": 1.032777248620987, "epoch": 1.1174339145448797, "grad_norm": 20.625, "learning_rate": 1.4833023834009385e-05, "loss": 1.0503728485107422, "mean_token_accuracy": 0.7306879031658172, "num_tokens": 21903477.0, "step": 45950 }, { "entropy": 1.022947252392769, "epoch": 1.118649838282143, "grad_norm": 15.6875, "learning_rate": 1.4821285455931629e-05, "loss": 1.0312307739257813, "mean_token_accuracy": 0.733131799697876, "num_tokens": 21927233.0, "step": 46000 }, { "entropy": 1.0234703874588014, "epoch": 1.1198657620194061, "grad_norm": 13.25, "learning_rate": 1.4809538416380396e-05, "loss": 1.0256100463867188, "mean_token_accuracy": 0.7287358468770981, "num_tokens": 21953147.0, "step": 46050 }, { "entropy": 1.068297371864319, "epoch": 1.1210816857566694, "grad_norm": 16.25, "learning_rate": 1.4797782736459333e-05, "loss": 1.0865676879882813, "mean_token_accuracy": 0.7168714714050293, "num_tokens": 21978532.0, "step": 46100 }, { "entropy": 0.9939989158511162, "epoch": 1.1222976094939325, "grad_norm": 15.625, "learning_rate": 1.4786018437287606e-05, "loss": 0.9969815063476563, "mean_token_accuracy": 0.7412820756435394, "num_tokens": 21997510.0, "step": 46150 }, { "entropy": 1.1040262544155122, "epoch": 1.1235135332311956, "grad_norm": 19.125, "learning_rate": 1.4774245539999855e-05, "loss": 1.1032032775878906, "mean_token_accuracy": 0.7180810594558715, "num_tokens": 22023404.0, "step": 46200 }, { "entropy": 1.02311325609684, "epoch": 1.124729456968459, "grad_norm": 21.5, "learning_rate": 1.4762464065746172e-05, "loss": 1.0360639190673828, "mean_token_accuracy": 0.7272508180141449, "num_tokens": 22044969.0, "step": 46250 }, { "entropy": 1.0026508510112762, "epoch": 1.1259453807057223, "grad_norm": 10.75, "learning_rate": 1.4750674035692062e-05, "loss": 1.0075936126708984, "mean_token_accuracy": 0.7326958870887756, "num_tokens": 22070366.0, "step": 46300 }, { "entropy": 1.0126875334978103, "epoch": 1.1271613044429853, "grad_norm": 13.5625, "learning_rate": 1.4738875471018402e-05, "loss": 1.02803466796875, "mean_token_accuracy": 0.7252357935905457, "num_tokens": 22090601.0, "step": 46350 }, { "entropy": 0.9228693246841431, "epoch": 1.1283772281802484, "grad_norm": 18.875, "learning_rate": 1.4727068392921389e-05, "loss": 0.9110924530029297, "mean_token_accuracy": 0.7552372133731842, "num_tokens": 22116603.0, "step": 46400 }, { "entropy": 1.036385423541069, "epoch": 1.1295931519175118, "grad_norm": 16.125, "learning_rate": 1.4715252822612526e-05, "loss": 1.079150390625, "mean_token_accuracy": 0.7226733052730561, "num_tokens": 22139590.0, "step": 46450 }, { "entropy": 1.0408498764038085, "epoch": 1.1308090756547748, "grad_norm": 15.375, "learning_rate": 1.4703428781318571e-05, "loss": 1.0291957092285156, "mean_token_accuracy": 0.7296907448768616, "num_tokens": 22161632.0, "step": 46500 }, { "entropy": 1.1069449126720428, "epoch": 1.1320249993920382, "grad_norm": 8.625, "learning_rate": 1.4691596290281494e-05, "loss": 1.1414222717285156, "mean_token_accuracy": 0.7131977248191833, "num_tokens": 22182514.0, "step": 46550 }, { "entropy": 1.1195755004882812, "epoch": 1.1332409231293012, "grad_norm": 14.875, "learning_rate": 1.4679755370758452e-05, "loss": 1.108554916381836, "mean_token_accuracy": 0.7182742857933044, "num_tokens": 22203129.0, "step": 46600 }, { "entropy": 1.1143310534954072, "epoch": 1.1344568468665646, "grad_norm": 16.375, "learning_rate": 1.4667906044021741e-05, "loss": 1.126001434326172, "mean_token_accuracy": 0.7142716068029403, "num_tokens": 22225809.0, "step": 46650 }, { "entropy": 1.0009319400787353, "epoch": 1.1356727706038277, "grad_norm": 18.875, "learning_rate": 1.4656048331358755e-05, "loss": 1.0122996520996095, "mean_token_accuracy": 0.7303437077999115, "num_tokens": 22247597.0, "step": 46700 }, { "entropy": 0.9614537942409516, "epoch": 1.136888694341091, "grad_norm": 16.5, "learning_rate": 1.4644182254071963e-05, "loss": 0.9500968933105469, "mean_token_accuracy": 0.7503053450584412, "num_tokens": 22271548.0, "step": 46750 }, { "entropy": 1.027302812933922, "epoch": 1.138104618078354, "grad_norm": 20.125, "learning_rate": 1.4632307833478858e-05, "loss": 1.0355605316162109, "mean_token_accuracy": 0.7277568531036377, "num_tokens": 22295804.0, "step": 46800 }, { "entropy": 1.077018871307373, "epoch": 1.1393205418156174, "grad_norm": 12.1875, "learning_rate": 1.4620425090911919e-05, "loss": 1.1117363739013673, "mean_token_accuracy": 0.7162330985069275, "num_tokens": 22316615.0, "step": 46850 }, { "entropy": 1.0203612411022187, "epoch": 1.1405364655528805, "grad_norm": 25.625, "learning_rate": 1.4608534047718579e-05, "loss": 1.0279058074951173, "mean_token_accuracy": 0.7302329814434052, "num_tokens": 22342935.0, "step": 46900 }, { "entropy": 1.0315338903665543, "epoch": 1.1417523892901438, "grad_norm": 23.0, "learning_rate": 1.4596634725261178e-05, "loss": 1.031958999633789, "mean_token_accuracy": 0.7321675145626068, "num_tokens": 22368837.0, "step": 46950 }, { "entropy": 1.0471099507808685, "epoch": 1.1429683130274069, "grad_norm": 15.875, "learning_rate": 1.4584727144916934e-05, "loss": 1.0408748626708983, "mean_token_accuracy": 0.732913749217987, "num_tokens": 22389655.0, "step": 47000 }, { "entropy": 1.0999655556678771, "epoch": 1.1441842367646702, "grad_norm": 22.375, "learning_rate": 1.4572811328077903e-05, "loss": 1.1157471466064453, "mean_token_accuracy": 0.7217414116859436, "num_tokens": 22412520.0, "step": 47050 }, { "entropy": 1.0766043400764465, "epoch": 1.1454001605019333, "grad_norm": 12.5, "learning_rate": 1.4560887296150932e-05, "loss": 1.0919187927246095, "mean_token_accuracy": 0.7181952512264251, "num_tokens": 22438002.0, "step": 47100 }, { "entropy": 1.02794871032238, "epoch": 1.1466160842391966, "grad_norm": 18.0, "learning_rate": 1.4548955070557624e-05, "loss": 1.0449018096923828, "mean_token_accuracy": 0.7267157065868378, "num_tokens": 22464428.0, "step": 47150 }, { "entropy": 1.0022287702560424, "epoch": 1.1478320079764597, "grad_norm": 8.6875, "learning_rate": 1.4537014672734315e-05, "loss": 1.0291314697265626, "mean_token_accuracy": 0.7382057547569275, "num_tokens": 22486788.0, "step": 47200 }, { "entropy": 1.036295336484909, "epoch": 1.149047931713723, "grad_norm": 12.75, "learning_rate": 1.4525066124132007e-05, "loss": 1.0531877899169921, "mean_token_accuracy": 0.7335865545272827, "num_tokens": 22512046.0, "step": 47250 }, { "entropy": 1.1276061522960663, "epoch": 1.150263855450986, "grad_norm": 18.875, "learning_rate": 1.451310944621636e-05, "loss": 1.1358766174316406, "mean_token_accuracy": 0.7180974888801575, "num_tokens": 22536099.0, "step": 47300 }, { "entropy": 0.9721841537952423, "epoch": 1.1514797791882494, "grad_norm": 21.375, "learning_rate": 1.4501144660467623e-05, "loss": 0.9745923614501953, "mean_token_accuracy": 0.7431524026393891, "num_tokens": 22560136.0, "step": 47350 }, { "entropy": 0.9872484433650971, "epoch": 1.1526957029255125, "grad_norm": 12.0625, "learning_rate": 1.4489171788380624e-05, "loss": 0.9875614929199219, "mean_token_accuracy": 0.7311826610565185, "num_tokens": 22584917.0, "step": 47400 }, { "entropy": 1.0540044343471526, "epoch": 1.1539116266627758, "grad_norm": 47.25, "learning_rate": 1.4477190851464709e-05, "loss": 1.0627963256835937, "mean_token_accuracy": 0.7244084072113037, "num_tokens": 22606176.0, "step": 47450 }, { "entropy": 1.0786057645082474, "epoch": 1.155127550400039, "grad_norm": 8.625, "learning_rate": 1.4465201871243716e-05, "loss": 1.0965796661376954, "mean_token_accuracy": 0.7212444925308228, "num_tokens": 22632477.0, "step": 47500 }, { "entropy": 1.136163512468338, "epoch": 1.156343474137302, "grad_norm": 20.625, "learning_rate": 1.445320486925594e-05, "loss": 1.1583113861083985, "mean_token_accuracy": 0.7041622620820999, "num_tokens": 22658484.0, "step": 47550 }, { "entropy": 1.118614684343338, "epoch": 1.1575593978745653, "grad_norm": 18.75, "learning_rate": 1.4441199867054067e-05, "loss": 1.1144579315185548, "mean_token_accuracy": 0.7187115228176117, "num_tokens": 22683083.0, "step": 47600 }, { "entropy": 1.0458070403337478, "epoch": 1.1587753216118286, "grad_norm": 12.8125, "learning_rate": 1.4429186886205176e-05, "loss": 1.0714490509033203, "mean_token_accuracy": 0.7222697257995605, "num_tokens": 22709578.0, "step": 47650 }, { "entropy": 1.101076665520668, "epoch": 1.1599912453490917, "grad_norm": 9.5, "learning_rate": 1.4417165948290675e-05, "loss": 1.1198554992675782, "mean_token_accuracy": 0.7167080855369568, "num_tokens": 22732153.0, "step": 47700 }, { "entropy": 0.9890038192272186, "epoch": 1.1612071690863548, "grad_norm": 10.375, "learning_rate": 1.4405137074906259e-05, "loss": 0.9813487243652343, "mean_token_accuracy": 0.7457172864675522, "num_tokens": 22756756.0, "step": 47750 }, { "entropy": 0.9994833660125733, "epoch": 1.162423092823618, "grad_norm": 15.125, "learning_rate": 1.4393100287661887e-05, "loss": 1.0138793182373047, "mean_token_accuracy": 0.7330734646320343, "num_tokens": 22775899.0, "step": 47800 }, { "entropy": 1.0937603068351747, "epoch": 1.1636390165608814, "grad_norm": 16.25, "learning_rate": 1.438105560818173e-05, "loss": 1.1108261108398438, "mean_token_accuracy": 0.7156578040122986, "num_tokens": 22799062.0, "step": 47850 }, { "entropy": 0.9734578114748001, "epoch": 1.1648549402981445, "grad_norm": 18.5, "learning_rate": 1.4369003058104145e-05, "loss": 0.9951388549804687, "mean_token_accuracy": 0.7355878019332885, "num_tokens": 22821998.0, "step": 47900 }, { "entropy": 1.1117220076918601, "epoch": 1.1660708640354076, "grad_norm": 25.25, "learning_rate": 1.4356942659081616e-05, "loss": 1.1226145935058593, "mean_token_accuracy": 0.7144471025466919, "num_tokens": 22842933.0, "step": 47950 }, { "entropy": 1.1810298311710357, "epoch": 1.167286787772671, "grad_norm": 24.125, "learning_rate": 1.4344874432780745e-05, "loss": 1.188315658569336, "mean_token_accuracy": 0.6937733733654022, "num_tokens": 22869193.0, "step": 48000 }, { "entropy": 1.122211027741432, "epoch": 1.168502711509934, "grad_norm": 15.1875, "learning_rate": 1.4332798400882177e-05, "loss": 1.1286930084228515, "mean_token_accuracy": 0.7117782700061798, "num_tokens": 22893452.0, "step": 48050 }, { "entropy": 0.9249673014879227, "epoch": 1.1697186352471973, "grad_norm": 18.875, "learning_rate": 1.432071458508059e-05, "loss": 0.9370259094238281, "mean_token_accuracy": 0.7536309635639191, "num_tokens": 22914941.0, "step": 48100 }, { "entropy": 1.0793990784883498, "epoch": 1.1709345589844604, "grad_norm": 16.5, "learning_rate": 1.4308623007084651e-05, "loss": 1.0816592407226562, "mean_token_accuracy": 0.7212891936302185, "num_tokens": 22941304.0, "step": 48150 }, { "entropy": 1.0323576053977013, "epoch": 1.1721504827217237, "grad_norm": 8.3125, "learning_rate": 1.4296523688616962e-05, "loss": 1.0386216735839844, "mean_token_accuracy": 0.7300590354204178, "num_tokens": 22965673.0, "step": 48200 }, { "entropy": 1.0028495639562607, "epoch": 1.1733664064589868, "grad_norm": 10.625, "learning_rate": 1.4284416651414033e-05, "loss": 1.0258335876464844, "mean_token_accuracy": 0.7356867605447769, "num_tokens": 22989496.0, "step": 48250 }, { "entropy": 0.9745978707075119, "epoch": 1.1745823301962501, "grad_norm": 13.3125, "learning_rate": 1.427230191722624e-05, "loss": 0.9728524017333985, "mean_token_accuracy": 0.7416182804107666, "num_tokens": 23013970.0, "step": 48300 }, { "entropy": 1.0506127375364303, "epoch": 1.1757982539335132, "grad_norm": 14.375, "learning_rate": 1.4260179507817796e-05, "loss": 1.054390640258789, "mean_token_accuracy": 0.7339584845304489, "num_tokens": 23037710.0, "step": 48350 }, { "entropy": 1.0713644874095918, "epoch": 1.1770141776707765, "grad_norm": 15.8125, "learning_rate": 1.4248049444966687e-05, "loss": 1.0873005676269532, "mean_token_accuracy": 0.715683456659317, "num_tokens": 23060795.0, "step": 48400 }, { "entropy": 0.9898362827301025, "epoch": 1.1782301014080396, "grad_norm": 17.75, "learning_rate": 1.4235911750464664e-05, "loss": 1.0048770141601562, "mean_token_accuracy": 0.7338375592231751, "num_tokens": 23086358.0, "step": 48450 }, { "entropy": 1.0274456864595414, "epoch": 1.179446025145303, "grad_norm": 21.875, "learning_rate": 1.4223766446117177e-05, "loss": 1.022465057373047, "mean_token_accuracy": 0.7350074815750122, "num_tokens": 23111548.0, "step": 48500 }, { "entropy": 1.013544760942459, "epoch": 1.180661948882566, "grad_norm": 12.625, "learning_rate": 1.4211613553743351e-05, "loss": 1.0219825744628905, "mean_token_accuracy": 0.7345675623416901, "num_tokens": 23135698.0, "step": 48550 }, { "entropy": 0.9603467112779618, "epoch": 1.1818778726198294, "grad_norm": 11.9375, "learning_rate": 1.4199453095175947e-05, "loss": 0.9715656280517578, "mean_token_accuracy": 0.7438748186826706, "num_tokens": 23158635.0, "step": 48600 }, { "entropy": 1.0295919448137283, "epoch": 1.1830937963570924, "grad_norm": 13.6875, "learning_rate": 1.4187285092261313e-05, "loss": 1.0384776306152343, "mean_token_accuracy": 0.7300784230232239, "num_tokens": 23185812.0, "step": 48650 }, { "entropy": 1.102224002480507, "epoch": 1.1843097200943558, "grad_norm": 10.1875, "learning_rate": 1.4175109566859352e-05, "loss": 1.1294873809814454, "mean_token_accuracy": 0.7106547230482101, "num_tokens": 23208741.0, "step": 48700 }, { "entropy": 1.0655569237470628, "epoch": 1.1855256438316188, "grad_norm": 24.5, "learning_rate": 1.4162926540843478e-05, "loss": 1.069531707763672, "mean_token_accuracy": 0.7250601243972778, "num_tokens": 23232294.0, "step": 48750 }, { "entropy": 1.060010477900505, "epoch": 1.1867415675688822, "grad_norm": 21.0, "learning_rate": 1.415073603610059e-05, "loss": 1.0907740020751953, "mean_token_accuracy": 0.7248198980093002, "num_tokens": 23259662.0, "step": 48800 }, { "entropy": 1.0138564419746399, "epoch": 1.1879574913061453, "grad_norm": 11.625, "learning_rate": 1.4138538074531013e-05, "loss": 1.022681350708008, "mean_token_accuracy": 0.7381428921222687, "num_tokens": 23281982.0, "step": 48850 }, { "entropy": 1.0252105593681335, "epoch": 1.1891734150434086, "grad_norm": 17.625, "learning_rate": 1.4126332678048471e-05, "loss": 1.0450669860839843, "mean_token_accuracy": 0.7309494423866272, "num_tokens": 23306078.0, "step": 48900 }, { "entropy": 1.0953800815343857, "epoch": 1.1903893387806717, "grad_norm": 12.1875, "learning_rate": 1.4114119868580047e-05, "loss": 1.1099066925048828, "mean_token_accuracy": 0.7160072565078736, "num_tokens": 23330613.0, "step": 48950 }, { "entropy": 1.1080654847621918, "epoch": 1.191605262517935, "grad_norm": 18.75, "learning_rate": 1.4101899668066134e-05, "loss": 1.1009870147705079, "mean_token_accuracy": 0.7216741919517518, "num_tokens": 23355062.0, "step": 49000 }, { "entropy": 1.1130453002452851, "epoch": 1.192821186255198, "grad_norm": 13.75, "learning_rate": 1.4089672098460421e-05, "loss": 1.12833251953125, "mean_token_accuracy": 0.711790667772293, "num_tokens": 23380663.0, "step": 49050 }, { "entropy": 1.12525494992733, "epoch": 1.1940371099924612, "grad_norm": 9.75, "learning_rate": 1.4077437181729812e-05, "loss": 1.139881820678711, "mean_token_accuracy": 0.7112097996473312, "num_tokens": 23405689.0, "step": 49100 }, { "entropy": 1.0418315920233727, "epoch": 1.1952530337297245, "grad_norm": 14.4375, "learning_rate": 1.4065194939854423e-05, "loss": 1.0585295104980468, "mean_token_accuracy": 0.7262923491001129, "num_tokens": 23428365.0, "step": 49150 }, { "entropy": 1.0426855850219727, "epoch": 1.1964689574669878, "grad_norm": 18.875, "learning_rate": 1.4052945394827532e-05, "loss": 1.0380699157714843, "mean_token_accuracy": 0.7345839822292328, "num_tokens": 23450189.0, "step": 49200 }, { "entropy": 1.0993912386894227, "epoch": 1.1976848812042509, "grad_norm": 18.125, "learning_rate": 1.4040688568655533e-05, "loss": 1.1093394470214843, "mean_token_accuracy": 0.7199843508005143, "num_tokens": 23472662.0, "step": 49250 }, { "entropy": 0.9634354478120803, "epoch": 1.198900804941514, "grad_norm": 16.5, "learning_rate": 1.4028424483357901e-05, "loss": 0.9425975799560546, "mean_token_accuracy": 0.7437006688117981, "num_tokens": 23493184.0, "step": 49300 }, { "entropy": 1.0277219009399414, "epoch": 1.2001167286787773, "grad_norm": 28.875, "learning_rate": 1.4016153160967147e-05, "loss": 1.0481037139892577, "mean_token_accuracy": 0.7270358204841614, "num_tokens": 23514205.0, "step": 49350 }, { "entropy": 1.0481401407718658, "epoch": 1.2013326524160404, "grad_norm": 8.0625, "learning_rate": 1.40038746235288e-05, "loss": 1.0720317840576172, "mean_token_accuracy": 0.7214667737483978, "num_tokens": 23543213.0, "step": 49400 }, { "entropy": 1.1547881078720093, "epoch": 1.2025485761533037, "grad_norm": 14.4375, "learning_rate": 1.3991588893101325e-05, "loss": 1.1749071502685546, "mean_token_accuracy": 0.705828378200531, "num_tokens": 23568559.0, "step": 49450 }, { "entropy": 1.0952143251895905, "epoch": 1.2037644998905668, "grad_norm": 19.0, "learning_rate": 1.3979295991756134e-05, "loss": 1.0861087036132813, "mean_token_accuracy": 0.7284623652696609, "num_tokens": 23589317.0, "step": 49500 }, { "entropy": 0.9893753290176391, "epoch": 1.20498042362783, "grad_norm": 12.8125, "learning_rate": 1.3966995941577508e-05, "loss": 1.009127655029297, "mean_token_accuracy": 0.7292374575138092, "num_tokens": 23612817.0, "step": 49550 }, { "entropy": 1.0607143932580947, "epoch": 1.2061963473650932, "grad_norm": 16.75, "learning_rate": 1.3954688764662574e-05, "loss": 1.087067642211914, "mean_token_accuracy": 0.7125902140140533, "num_tokens": 23640841.0, "step": 49600 }, { "entropy": 1.111394681930542, "epoch": 1.2074122711023565, "grad_norm": 39.0, "learning_rate": 1.3942374483121263e-05, "loss": 1.120852508544922, "mean_token_accuracy": 0.7058968275785447, "num_tokens": 23661628.0, "step": 49650 }, { "entropy": 1.1059180760383607, "epoch": 1.2086281948396196, "grad_norm": 27.0, "learning_rate": 1.393005311907627e-05, "loss": 1.0959346771240235, "mean_token_accuracy": 0.7158129274845123, "num_tokens": 23686563.0, "step": 49700 }, { "entropy": 0.9637121003866196, "epoch": 1.209844118576883, "grad_norm": 17.0, "learning_rate": 1.391772469466301e-05, "loss": 0.9787490844726563, "mean_token_accuracy": 0.7456681168079377, "num_tokens": 23713103.0, "step": 49750 }, { "entropy": 1.005218668282032, "epoch": 1.211060042314146, "grad_norm": 19.75, "learning_rate": 1.390538923202959e-05, "loss": 1.022917709350586, "mean_token_accuracy": 0.7379381960630417, "num_tokens": 23736785.0, "step": 49800 }, { "entropy": 1.07193221449852, "epoch": 1.2122759660514093, "grad_norm": 20.125, "learning_rate": 1.389304675333675e-05, "loss": 1.0600127410888671, "mean_token_accuracy": 0.720809485912323, "num_tokens": 23758916.0, "step": 49850 }, { "entropy": 1.0169485211372375, "epoch": 1.2134918897886724, "grad_norm": 16.0, "learning_rate": 1.388069728075784e-05, "loss": 1.0387734985351562, "mean_token_accuracy": 0.7272176802158355, "num_tokens": 23785163.0, "step": 49900 }, { "entropy": 1.0250469690561295, "epoch": 1.2147078135259357, "grad_norm": 12.875, "learning_rate": 1.3868340836478781e-05, "loss": 1.0430210876464843, "mean_token_accuracy": 0.7246137082576751, "num_tokens": 23810040.0, "step": 49950 }, { "entropy": 0.9752432537078858, "epoch": 1.2159237372631988, "grad_norm": 12.5, "learning_rate": 1.3855977442698012e-05, "loss": 0.9668922424316406, "mean_token_accuracy": 0.7514810419082641, "num_tokens": 23835052.0, "step": 50000 }, { "epoch": 1.2159237372631988, "eval_entropy": 1.117307169855776, "eval_loss": 1.3038936853408813, "eval_mean_token_accuracy": 0.6780400415183314, "eval_num_tokens": 23835052.0, "eval_runtime": 392.1558, "eval_samples_per_second": 11.651, "eval_steps_per_second": 11.651, "step": 50000 }, { "entropy": 1.013805137872696, "epoch": 1.2171396610004621, "grad_norm": 25.0, "learning_rate": 1.3843607121626455e-05, "loss": 1.0060276794433594, "mean_token_accuracy": 0.7342334115505218, "num_tokens": 23858534.0, "step": 50050 }, { "entropy": 0.947613987326622, "epoch": 1.2183555847377252, "grad_norm": 11.8125, "learning_rate": 1.3831229895487475e-05, "loss": 0.9551549530029297, "mean_token_accuracy": 0.7465425980091095, "num_tokens": 23880471.0, "step": 50100 }, { "entropy": 1.0199540215730667, "epoch": 1.2195715084749885, "grad_norm": 36.75, "learning_rate": 1.3818845786516853e-05, "loss": 1.0334054565429687, "mean_token_accuracy": 0.7332679927349091, "num_tokens": 23905540.0, "step": 50150 }, { "entropy": 1.0664208179712296, "epoch": 1.2207874322122516, "grad_norm": 12.6875, "learning_rate": 1.3806454816962727e-05, "loss": 1.1030130004882812, "mean_token_accuracy": 0.727715140581131, "num_tokens": 23930050.0, "step": 50200 }, { "entropy": 1.0535260635614394, "epoch": 1.222003355949515, "grad_norm": 18.0, "learning_rate": 1.379405700908556e-05, "loss": 1.0496598815917968, "mean_token_accuracy": 0.7305492866039276, "num_tokens": 23955649.0, "step": 50250 }, { "entropy": 1.116010817885399, "epoch": 1.223219279686778, "grad_norm": 13.5, "learning_rate": 1.3781652385158106e-05, "loss": 1.142310791015625, "mean_token_accuracy": 0.6990325003862381, "num_tokens": 23978237.0, "step": 50300 }, { "entropy": 1.1545332229137422, "epoch": 1.2244352034240413, "grad_norm": 9.375, "learning_rate": 1.3769240967465352e-05, "loss": 1.1698941802978515, "mean_token_accuracy": 0.7140199542045593, "num_tokens": 23999198.0, "step": 50350 }, { "entropy": 0.9655260783433914, "epoch": 1.2256511271613044, "grad_norm": 15.9375, "learning_rate": 1.3756822778304505e-05, "loss": 0.963580322265625, "mean_token_accuracy": 0.7493829524517059, "num_tokens": 24021945.0, "step": 50400 }, { "entropy": 0.9926175355911255, "epoch": 1.2268670508985677, "grad_norm": 11.75, "learning_rate": 1.3744397839984927e-05, "loss": 1.0154157257080079, "mean_token_accuracy": 0.7374128425121307, "num_tokens": 24045891.0, "step": 50450 }, { "entropy": 1.1089438515901566, "epoch": 1.2280829746358308, "grad_norm": 14.4375, "learning_rate": 1.3731966174828107e-05, "loss": 1.1202703857421874, "mean_token_accuracy": 0.7146027791500091, "num_tokens": 24068184.0, "step": 50500 }, { "entropy": 1.0501934325695037, "epoch": 1.2292988983730941, "grad_norm": 29.5, "learning_rate": 1.3719527805167616e-05, "loss": 1.075977783203125, "mean_token_accuracy": 0.7235097163915634, "num_tokens": 24091553.0, "step": 50550 }, { "entropy": 1.1370071893930436, "epoch": 1.2305148221103572, "grad_norm": 18.875, "learning_rate": 1.3707082753349084e-05, "loss": 1.1456235504150392, "mean_token_accuracy": 0.7077477204799653, "num_tokens": 24114244.0, "step": 50600 }, { "entropy": 1.0267274430394173, "epoch": 1.2317307458476203, "grad_norm": 19.25, "learning_rate": 1.3694631041730126e-05, "loss": 1.042945022583008, "mean_token_accuracy": 0.7275592398643493, "num_tokens": 24136116.0, "step": 50650 }, { "entropy": 1.0831756234169005, "epoch": 1.2329466695848836, "grad_norm": 14.1875, "learning_rate": 1.3682172692680331e-05, "loss": 1.1095892333984374, "mean_token_accuracy": 0.7172052985429764, "num_tokens": 24160479.0, "step": 50700 }, { "entropy": 1.0117048758268357, "epoch": 1.234162593322147, "grad_norm": 16.5, "learning_rate": 1.3669707728581214e-05, "loss": 1.0106489562988281, "mean_token_accuracy": 0.7342265462875366, "num_tokens": 24183822.0, "step": 50750 }, { "entropy": 0.9748645779490471, "epoch": 1.23537851705941, "grad_norm": 12.8125, "learning_rate": 1.3657236171826167e-05, "loss": 1.0218390655517577, "mean_token_accuracy": 0.7359099340438843, "num_tokens": 24208179.0, "step": 50800 }, { "entropy": 1.0851183557510375, "epoch": 1.2365944407966731, "grad_norm": 8.4375, "learning_rate": 1.3644758044820437e-05, "loss": 1.0705934143066407, "mean_token_accuracy": 0.7232696557044983, "num_tokens": 24234631.0, "step": 50850 }, { "entropy": 1.0354025691747666, "epoch": 1.2378103645339364, "grad_norm": 13.5625, "learning_rate": 1.3632273369981062e-05, "loss": 1.0261061096191406, "mean_token_accuracy": 0.7246600037813187, "num_tokens": 24259816.0, "step": 50900 }, { "entropy": 1.172890990972519, "epoch": 1.2390262882711995, "grad_norm": 14.5625, "learning_rate": 1.361978216973685e-05, "loss": 1.1910769653320312, "mean_token_accuracy": 0.7047970074415207, "num_tokens": 24284046.0, "step": 50950 }, { "entropy": 1.0706455934047698, "epoch": 1.2402422120084629, "grad_norm": 11.9375, "learning_rate": 1.360728446652833e-05, "loss": 1.0660034942626953, "mean_token_accuracy": 0.7228343796730041, "num_tokens": 24307441.0, "step": 51000 }, { "entropy": 1.0096706825494766, "epoch": 1.241458135745726, "grad_norm": 16.375, "learning_rate": 1.3594780282807715e-05, "loss": 1.023451690673828, "mean_token_accuracy": 0.7351455307006836, "num_tokens": 24329357.0, "step": 51050 }, { "entropy": 1.00720676779747, "epoch": 1.2426740594829893, "grad_norm": 15.4375, "learning_rate": 1.3582269641038863e-05, "loss": 1.0127318572998047, "mean_token_accuracy": 0.7410959708690643, "num_tokens": 24357372.0, "step": 51100 }, { "entropy": 0.9618192303180695, "epoch": 1.2438899832202523, "grad_norm": 22.125, "learning_rate": 1.3569752563697225e-05, "loss": 0.9582817840576172, "mean_token_accuracy": 0.7461307287216187, "num_tokens": 24382087.0, "step": 51150 }, { "entropy": 0.9940299481153488, "epoch": 1.2451059069575157, "grad_norm": 18.375, "learning_rate": 1.3557229073269824e-05, "loss": 1.0001033782958983, "mean_token_accuracy": 0.7400388622283935, "num_tokens": 24409009.0, "step": 51200 }, { "entropy": 1.1163755029439926, "epoch": 1.2463218306947788, "grad_norm": 10.3125, "learning_rate": 1.3544699192255193e-05, "loss": 1.1493161773681642, "mean_token_accuracy": 0.7172418296337127, "num_tokens": 24431036.0, "step": 51250 }, { "entropy": 1.2064093434810639, "epoch": 1.247537754432042, "grad_norm": 24.5, "learning_rate": 1.3532162943163357e-05, "loss": 1.2202953338623046, "mean_token_accuracy": 0.6910722059011459, "num_tokens": 24454750.0, "step": 51300 }, { "entropy": 0.979099383354187, "epoch": 1.2487536781693052, "grad_norm": 15.3125, "learning_rate": 1.3519620348515777e-05, "loss": 0.9876481628417969, "mean_token_accuracy": 0.737482123374939, "num_tokens": 24477451.0, "step": 51350 }, { "entropy": 1.1845598661899566, "epoch": 1.2499696019065685, "grad_norm": 11.375, "learning_rate": 1.3507071430845308e-05, "loss": 1.1617166900634766, "mean_token_accuracy": 0.7125364714860916, "num_tokens": 24503115.0, "step": 51400 }, { "entropy": 1.0060270476341246, "epoch": 1.2511855256438316, "grad_norm": 12.1875, "learning_rate": 1.3494516212696173e-05, "loss": 1.020411376953125, "mean_token_accuracy": 0.7316588127613067, "num_tokens": 24527069.0, "step": 51450 }, { "entropy": 1.0636667239665984, "epoch": 1.2524014493810949, "grad_norm": 8.5625, "learning_rate": 1.3481954716623915e-05, "loss": 1.0632229614257813, "mean_token_accuracy": 0.7259135985374451, "num_tokens": 24551568.0, "step": 51500 }, { "entropy": 1.019275215268135, "epoch": 1.253617373118358, "grad_norm": 19.375, "learning_rate": 1.3469386965195343e-05, "loss": 1.0398987579345702, "mean_token_accuracy": 0.7318524527549743, "num_tokens": 24574171.0, "step": 51550 }, { "entropy": 0.9478723275661468, "epoch": 1.2548332968556213, "grad_norm": 15.875, "learning_rate": 1.3456812980988513e-05, "loss": 0.9660787963867188, "mean_token_accuracy": 0.742346670627594, "num_tokens": 24600667.0, "step": 51600 }, { "entropy": 1.000501965880394, "epoch": 1.2560492205928844, "grad_norm": 14.625, "learning_rate": 1.3444232786592678e-05, "loss": 1.0060649871826173, "mean_token_accuracy": 0.748118691444397, "num_tokens": 24626480.0, "step": 51650 }, { "entropy": 0.9490923172235489, "epoch": 1.2572651443301477, "grad_norm": 27.25, "learning_rate": 1.3431646404608248e-05, "loss": 0.9452471160888671, "mean_token_accuracy": 0.7471734714508057, "num_tokens": 24644703.0, "step": 51700 }, { "entropy": 1.0453764927387237, "epoch": 1.2584810680674108, "grad_norm": 14.375, "learning_rate": 1.3419053857646742e-05, "loss": 1.0591593170166016, "mean_token_accuracy": 0.7199491173028946, "num_tokens": 24671470.0, "step": 51750 }, { "entropy": 1.1090049588680266, "epoch": 1.259696991804674, "grad_norm": 20.5, "learning_rate": 1.3406455168330768e-05, "loss": 1.1366436004638671, "mean_token_accuracy": 0.7087627607584, "num_tokens": 24696821.0, "step": 51800 }, { "entropy": 1.083514186143875, "epoch": 1.2609129155419372, "grad_norm": 11.625, "learning_rate": 1.3393850359293953e-05, "loss": 1.0775301361083984, "mean_token_accuracy": 0.716284077167511, "num_tokens": 24723373.0, "step": 51850 }, { "entropy": 1.0544514900445938, "epoch": 1.2621288392792005, "grad_norm": 17.375, "learning_rate": 1.3381239453180928e-05, "loss": 1.0663682556152343, "mean_token_accuracy": 0.7335773956775665, "num_tokens": 24749625.0, "step": 51900 }, { "entropy": 1.0051100528240204, "epoch": 1.2633447630164636, "grad_norm": 20.25, "learning_rate": 1.336862247264728e-05, "loss": 1.033408203125, "mean_token_accuracy": 0.7300581800937652, "num_tokens": 24770735.0, "step": 51950 }, { "entropy": 1.0484256649017334, "epoch": 1.2645606867537267, "grad_norm": 10.5625, "learning_rate": 1.3355999440359498e-05, "loss": 1.044684600830078, "mean_token_accuracy": 0.7304644322395325, "num_tokens": 24793642.0, "step": 52000 }, { "entropy": 0.9908795893192291, "epoch": 1.26577661049099, "grad_norm": 15.25, "learning_rate": 1.334337037899495e-05, "loss": 1.0073400115966797, "mean_token_accuracy": 0.7400866687297821, "num_tokens": 24818007.0, "step": 52050 }, { "entropy": 1.0934137445688248, "epoch": 1.2669925342282533, "grad_norm": 12.25, "learning_rate": 1.3330735311241831e-05, "loss": 1.1104911041259766, "mean_token_accuracy": 0.7141048175096512, "num_tokens": 24843172.0, "step": 52100 }, { "entropy": 1.2080158936977385, "epoch": 1.2682084579655164, "grad_norm": 13.125, "learning_rate": 1.331809425979914e-05, "loss": 1.2083290863037108, "mean_token_accuracy": 0.6921576988697052, "num_tokens": 24873050.0, "step": 52150 }, { "entropy": 1.142852427959442, "epoch": 1.2694243817027795, "grad_norm": 25.5, "learning_rate": 1.3305447247376604e-05, "loss": 1.1523322296142577, "mean_token_accuracy": 0.7158802831172943, "num_tokens": 24896615.0, "step": 52200 }, { "entropy": 1.0759734416007996, "epoch": 1.2706403054400428, "grad_norm": 11.625, "learning_rate": 1.3292794296694678e-05, "loss": 1.0785868072509766, "mean_token_accuracy": 0.7184501791000366, "num_tokens": 24919563.0, "step": 52250 }, { "entropy": 1.1461998277902603, "epoch": 1.2718562291773061, "grad_norm": 44.75, "learning_rate": 1.3280135430484476e-05, "loss": 1.1789727783203126, "mean_token_accuracy": 0.6986223578453064, "num_tokens": 24945067.0, "step": 52300 }, { "entropy": 1.1014630210399627, "epoch": 1.2730721529145692, "grad_norm": 35.5, "learning_rate": 1.3267470671487735e-05, "loss": 1.1003955078125, "mean_token_accuracy": 0.7246175348758698, "num_tokens": 24968907.0, "step": 52350 }, { "entropy": 0.969337517619133, "epoch": 1.2742880766518323, "grad_norm": 14.5625, "learning_rate": 1.3254800042456792e-05, "loss": 0.9725222015380859, "mean_token_accuracy": 0.7441230165958405, "num_tokens": 24993059.0, "step": 52400 }, { "entropy": 0.9802043044567108, "epoch": 1.2755040003890956, "grad_norm": 15.8125, "learning_rate": 1.324212356615452e-05, "loss": 0.9806236267089844, "mean_token_accuracy": 0.7441103386878968, "num_tokens": 25020813.0, "step": 52450 }, { "entropy": 1.014942781329155, "epoch": 1.276719924126359, "grad_norm": 17.0, "learning_rate": 1.3229441265354299e-05, "loss": 0.9987006378173828, "mean_token_accuracy": 0.7347248029708863, "num_tokens": 25043653.0, "step": 52500 }, { "entropy": 1.08211323261261, "epoch": 1.277935847863622, "grad_norm": 11.5625, "learning_rate": 1.3216753162839967e-05, "loss": 1.1038885498046875, "mean_token_accuracy": 0.715260442495346, "num_tokens": 25068990.0, "step": 52550 }, { "entropy": 0.9993743848800659, "epoch": 1.2791517716008851, "grad_norm": 17.75, "learning_rate": 1.3204059281405794e-05, "loss": 0.9920268249511719, "mean_token_accuracy": 0.7454243087768555, "num_tokens": 25092262.0, "step": 52600 }, { "entropy": 1.0871873962879182, "epoch": 1.2803676953381484, "grad_norm": 14.75, "learning_rate": 1.3191359643856428e-05, "loss": 1.1172415161132812, "mean_token_accuracy": 0.7191149908304214, "num_tokens": 25115030.0, "step": 52650 }, { "entropy": 1.1056962072849275, "epoch": 1.2815836190754115, "grad_norm": 27.0, "learning_rate": 1.3178654273006856e-05, "loss": 1.1344316864013673, "mean_token_accuracy": 0.7114555740356445, "num_tokens": 25138524.0, "step": 52700 }, { "entropy": 1.1444579201936722, "epoch": 1.2827995428126748, "grad_norm": 29.0, "learning_rate": 1.3165943191682372e-05, "loss": 1.1744424438476562, "mean_token_accuracy": 0.7107711064815522, "num_tokens": 25158940.0, "step": 52750 }, { "entropy": 1.0524203968048096, "epoch": 1.284015466549938, "grad_norm": 12.5, "learning_rate": 1.3153226422718515e-05, "loss": 1.0652312469482421, "mean_token_accuracy": 0.7231173574924469, "num_tokens": 25185068.0, "step": 52800 }, { "entropy": 1.0556749212741852, "epoch": 1.2852313902872012, "grad_norm": 27.625, "learning_rate": 1.3140503988961055e-05, "loss": 1.0646086883544923, "mean_token_accuracy": 0.7267605262994766, "num_tokens": 25211177.0, "step": 52850 }, { "entropy": 1.0082054644823075, "epoch": 1.2864473140244643, "grad_norm": 14.6875, "learning_rate": 1.312777591326594e-05, "loss": 1.01151123046875, "mean_token_accuracy": 0.735344854593277, "num_tokens": 25238126.0, "step": 52900 }, { "entropy": 1.0714115500450134, "epoch": 1.2876632377617276, "grad_norm": 15.8125, "learning_rate": 1.3115042218499237e-05, "loss": 1.089359588623047, "mean_token_accuracy": 0.718910356760025, "num_tokens": 25266994.0, "step": 52950 }, { "entropy": 1.1535674333572388, "epoch": 1.2888791614989907, "grad_norm": 29.75, "learning_rate": 1.3102302927537127e-05, "loss": 1.1542550659179687, "mean_token_accuracy": 0.7026466977596283, "num_tokens": 25289489.0, "step": 53000 }, { "entropy": 1.014628741145134, "epoch": 1.290095085236254, "grad_norm": 15.125, "learning_rate": 1.3089558063265832e-05, "loss": 1.023413848876953, "mean_token_accuracy": 0.726420567035675, "num_tokens": 25313559.0, "step": 53050 }, { "entropy": 1.1425517189502716, "epoch": 1.2913110089735171, "grad_norm": 10.5625, "learning_rate": 1.3076807648581594e-05, "loss": 1.1810126495361328, "mean_token_accuracy": 0.7118821823596955, "num_tokens": 25338702.0, "step": 53100 }, { "entropy": 1.0592652356624603, "epoch": 1.2925269327107805, "grad_norm": 16.25, "learning_rate": 1.3064051706390621e-05, "loss": 1.0550986480712892, "mean_token_accuracy": 0.7272113800048828, "num_tokens": 25362724.0, "step": 53150 }, { "entropy": 1.03740063726902, "epoch": 1.2937428564480435, "grad_norm": 15.0, "learning_rate": 1.305129025960906e-05, "loss": 1.0389327239990234, "mean_token_accuracy": 0.7340187835693359, "num_tokens": 25383747.0, "step": 53200 }, { "entropy": 1.0689948463439942, "epoch": 1.2949587801853069, "grad_norm": 26.875, "learning_rate": 1.3038523331162927e-05, "loss": 1.0624019622802734, "mean_token_accuracy": 0.7259775388240814, "num_tokens": 25410906.0, "step": 53250 }, { "entropy": 0.9976338458061218, "epoch": 1.29617470392257, "grad_norm": 11.125, "learning_rate": 1.302575094398811e-05, "loss": 0.9996446228027344, "mean_token_accuracy": 0.736806845664978, "num_tokens": 25438844.0, "step": 53300 }, { "entropy": 1.0047679960727691, "epoch": 1.297390627659833, "grad_norm": 27.25, "learning_rate": 1.3012973121030295e-05, "loss": 1.0001228332519532, "mean_token_accuracy": 0.7405001485347747, "num_tokens": 25462525.0, "step": 53350 }, { "entropy": 1.1110136413574219, "epoch": 1.2986065513970964, "grad_norm": 17.375, "learning_rate": 1.3000189885244926e-05, "loss": 1.1311353302001954, "mean_token_accuracy": 0.7164069378376007, "num_tokens": 25486150.0, "step": 53400 }, { "entropy": 1.1392388874292374, "epoch": 1.2998224751343597, "grad_norm": 11.125, "learning_rate": 1.2987401259597175e-05, "loss": 1.152042465209961, "mean_token_accuracy": 0.705044777393341, "num_tokens": 25507020.0, "step": 53450 }, { "entropy": 0.9787498822808266, "epoch": 1.3010383988716228, "grad_norm": 16.875, "learning_rate": 1.29746072670619e-05, "loss": 0.9984375762939454, "mean_token_accuracy": 0.7360725843906403, "num_tokens": 25527909.0, "step": 53500 }, { "entropy": 1.0387410199642182, "epoch": 1.3022543226088859, "grad_norm": 12.9375, "learning_rate": 1.2961807930623602e-05, "loss": 1.0765815734863282, "mean_token_accuracy": 0.7218941307067871, "num_tokens": 25550473.0, "step": 53550 }, { "entropy": 1.0359888184070587, "epoch": 1.3034702463461492, "grad_norm": 13.0, "learning_rate": 1.2949003273276379e-05, "loss": 1.0329330444335938, "mean_token_accuracy": 0.7291775119304656, "num_tokens": 25575801.0, "step": 53600 }, { "entropy": 1.1576277258992196, "epoch": 1.3046861700834125, "grad_norm": 26.375, "learning_rate": 1.293619331802389e-05, "loss": 1.1661383056640624, "mean_token_accuracy": 0.7070462882518769, "num_tokens": 25602846.0, "step": 53650 }, { "entropy": 1.08353156208992, "epoch": 1.3059020938206756, "grad_norm": 16.5, "learning_rate": 1.2923378087879301e-05, "loss": 1.1010247039794923, "mean_token_accuracy": 0.72098592877388, "num_tokens": 25626671.0, "step": 53700 }, { "entropy": 1.0068843325972556, "epoch": 1.3071180175579387, "grad_norm": 21.875, "learning_rate": 1.2910557605865275e-05, "loss": 1.0075125122070312, "mean_token_accuracy": 0.725981377363205, "num_tokens": 25653168.0, "step": 53750 }, { "entropy": 0.9768001300096512, "epoch": 1.308333941295202, "grad_norm": 13.25, "learning_rate": 1.2897731895013896e-05, "loss": 0.9845682525634766, "mean_token_accuracy": 0.7424379670619965, "num_tokens": 25678068.0, "step": 53800 }, { "entropy": 1.015016458630562, "epoch": 1.3095498650324653, "grad_norm": 12.375, "learning_rate": 1.2884900978366642e-05, "loss": 1.0163926696777343, "mean_token_accuracy": 0.733174616098404, "num_tokens": 25704915.0, "step": 53850 }, { "entropy": 1.0280596882104873, "epoch": 1.3107657887697284, "grad_norm": 14.625, "learning_rate": 1.2872064878974344e-05, "loss": 1.0419410705566405, "mean_token_accuracy": 0.7275616800785065, "num_tokens": 25727418.0, "step": 53900 }, { "entropy": 0.9981159019470215, "epoch": 1.3119817125069915, "grad_norm": 13.625, "learning_rate": 1.2859223619897148e-05, "loss": 1.0169502258300782, "mean_token_accuracy": 0.7358665025234222, "num_tokens": 25751494.0, "step": 53950 }, { "entropy": 0.9596805530786514, "epoch": 1.3131976362442548, "grad_norm": 10.6875, "learning_rate": 1.2846377224204468e-05, "loss": 0.9647893524169922, "mean_token_accuracy": 0.7459017145633697, "num_tokens": 25776133.0, "step": 54000 }, { "entropy": 1.0190869480371476, "epoch": 1.3144135599815179, "grad_norm": 23.5, "learning_rate": 1.2833525714974944e-05, "loss": 1.0277189636230468, "mean_token_accuracy": 0.7214406764507294, "num_tokens": 25799857.0, "step": 54050 }, { "entropy": 1.085708292722702, "epoch": 1.3156294837187812, "grad_norm": 21.5, "learning_rate": 1.2820669115296403e-05, "loss": 1.0865394592285156, "mean_token_accuracy": 0.7239895737171174, "num_tokens": 25821772.0, "step": 54100 }, { "entropy": 1.09927672624588, "epoch": 1.3168454074560443, "grad_norm": 24.875, "learning_rate": 1.2807807448265816e-05, "loss": 1.1248332977294921, "mean_token_accuracy": 0.7143106269836426, "num_tokens": 25843600.0, "step": 54150 }, { "entropy": 1.0590357661247254, "epoch": 1.3180613311933076, "grad_norm": 26.875, "learning_rate": 1.279494073698926e-05, "loss": 1.0605966186523437, "mean_token_accuracy": 0.7315478187799453, "num_tokens": 25867719.0, "step": 54200 }, { "entropy": 1.0173889309167863, "epoch": 1.3192772549305707, "grad_norm": 17.5, "learning_rate": 1.2782069004581878e-05, "loss": 1.04541748046875, "mean_token_accuracy": 0.7314181733131409, "num_tokens": 25890119.0, "step": 54250 }, { "entropy": 1.097025171518326, "epoch": 1.320493178667834, "grad_norm": 14.875, "learning_rate": 1.276919227416782e-05, "loss": 1.108005599975586, "mean_token_accuracy": 0.7186164283752441, "num_tokens": 25911904.0, "step": 54300 }, { "entropy": 1.0122834235429763, "epoch": 1.321709102405097, "grad_norm": 20.0, "learning_rate": 1.2756310568880229e-05, "loss": 1.0319984436035157, "mean_token_accuracy": 0.728993182182312, "num_tokens": 25934790.0, "step": 54350 }, { "entropy": 1.0736159336566926, "epoch": 1.3229250261423604, "grad_norm": 25.25, "learning_rate": 1.2743423911861175e-05, "loss": 1.085203628540039, "mean_token_accuracy": 0.7221173846721649, "num_tokens": 25955673.0, "step": 54400 }, { "entropy": 1.0328627490997315, "epoch": 1.3241409498796235, "grad_norm": 17.875, "learning_rate": 1.2730532326261633e-05, "loss": 1.0360723114013672, "mean_token_accuracy": 0.7260555577278137, "num_tokens": 25977213.0, "step": 54450 }, { "entropy": 1.0777006351947784, "epoch": 1.3253568736168868, "grad_norm": 23.0, "learning_rate": 1.2717635835241426e-05, "loss": 1.0828778076171874, "mean_token_accuracy": 0.7203318977355957, "num_tokens": 26000561.0, "step": 54500 }, { "entropy": 1.010474957227707, "epoch": 1.32657279735415, "grad_norm": 18.625, "learning_rate": 1.2704734461969193e-05, "loss": 1.0110215759277343, "mean_token_accuracy": 0.7343871200084686, "num_tokens": 26022064.0, "step": 54550 }, { "entropy": 1.0306059336662292, "epoch": 1.3277887210914132, "grad_norm": 10.5625, "learning_rate": 1.2691828229622339e-05, "loss": 1.0398931884765625, "mean_token_accuracy": 0.721512268781662, "num_tokens": 26046954.0, "step": 54600 }, { "entropy": 1.0107235455513, "epoch": 1.3290046448286763, "grad_norm": 14.5625, "learning_rate": 1.2678917161387005e-05, "loss": 1.020569076538086, "mean_token_accuracy": 0.7352240788936615, "num_tokens": 26069533.0, "step": 54650 }, { "entropy": 1.0506460911035538, "epoch": 1.3302205685659396, "grad_norm": 12.625, "learning_rate": 1.2666001280458016e-05, "loss": 1.0585768127441406, "mean_token_accuracy": 0.7242963182926178, "num_tokens": 26093534.0, "step": 54700 }, { "entropy": 1.105814887881279, "epoch": 1.3314364923032027, "grad_norm": 14.0625, "learning_rate": 1.2653080610038846e-05, "loss": 1.1372799682617187, "mean_token_accuracy": 0.7062553143501282, "num_tokens": 26118820.0, "step": 54750 }, { "entropy": 1.09937590777874, "epoch": 1.332652416040466, "grad_norm": 11.75, "learning_rate": 1.2640155173341566e-05, "loss": 1.098336181640625, "mean_token_accuracy": 0.7182293313741684, "num_tokens": 26142511.0, "step": 54800 }, { "entropy": 1.0573474532365799, "epoch": 1.3338683397777291, "grad_norm": 19.0, "learning_rate": 1.2627224993586817e-05, "loss": 1.059156494140625, "mean_token_accuracy": 0.7260095244646072, "num_tokens": 26165194.0, "step": 54850 }, { "entropy": 1.0278708279132842, "epoch": 1.3350842635149922, "grad_norm": 17.0, "learning_rate": 1.2614290094003755e-05, "loss": 1.0469074249267578, "mean_token_accuracy": 0.7232207036018372, "num_tokens": 26194237.0, "step": 54900 }, { "entropy": 1.1176218956708908, "epoch": 1.3363001872522555, "grad_norm": 23.375, "learning_rate": 1.2601350497830028e-05, "loss": 1.1317610931396485, "mean_token_accuracy": 0.7243021178245544, "num_tokens": 26218148.0, "step": 54950 }, { "entropy": 1.0374754774570465, "epoch": 1.3375161109895188, "grad_norm": 36.25, "learning_rate": 1.2588406228311701e-05, "loss": 1.0314154052734374, "mean_token_accuracy": 0.7306578516960144, "num_tokens": 26241746.0, "step": 55000 }, { "entropy": 1.0188864159584046, "epoch": 1.338732034726782, "grad_norm": 16.75, "learning_rate": 1.2575457308703252e-05, "loss": 1.0426578521728516, "mean_token_accuracy": 0.7267228448390961, "num_tokens": 26264614.0, "step": 55050 }, { "entropy": 1.0686158615350723, "epoch": 1.339947958464045, "grad_norm": 30.25, "learning_rate": 1.2562503762267506e-05, "loss": 1.0955960845947266, "mean_token_accuracy": 0.7239203912019729, "num_tokens": 26291346.0, "step": 55100 }, { "entropy": 0.9922471341490745, "epoch": 1.3411638822013083, "grad_norm": 14.9375, "learning_rate": 1.2549545612275601e-05, "loss": 0.9855073547363281, "mean_token_accuracy": 0.7448565816879272, "num_tokens": 26316724.0, "step": 55150 }, { "entropy": 1.1049098825454713, "epoch": 1.3423798059385716, "grad_norm": 23.125, "learning_rate": 1.253658288200695e-05, "loss": 1.160699996948242, "mean_token_accuracy": 0.7080820691585541, "num_tokens": 26342022.0, "step": 55200 }, { "entropy": 1.1754768127202988, "epoch": 1.3435957296758347, "grad_norm": 32.75, "learning_rate": 1.2523615594749179e-05, "loss": 1.176814956665039, "mean_token_accuracy": 0.6967479157447815, "num_tokens": 26370720.0, "step": 55250 }, { "entropy": 0.9302823901176452, "epoch": 1.3448116534130978, "grad_norm": 5.53125, "learning_rate": 1.2510643773798114e-05, "loss": 0.9344210815429688, "mean_token_accuracy": 0.751964213848114, "num_tokens": 26392881.0, "step": 55300 }, { "entropy": 0.9456373453140259, "epoch": 1.3460275771503611, "grad_norm": 8.125, "learning_rate": 1.2497667442457733e-05, "loss": 0.9516117858886719, "mean_token_accuracy": 0.7530845963954925, "num_tokens": 26416245.0, "step": 55350 }, { "entropy": 1.0508791720867157, "epoch": 1.3472435008876245, "grad_norm": 13.5625, "learning_rate": 1.2484686624040098e-05, "loss": 1.0497135925292969, "mean_token_accuracy": 0.7260791218280792, "num_tokens": 26444527.0, "step": 55400 }, { "entropy": 1.0234286022186279, "epoch": 1.3484594246248875, "grad_norm": 16.875, "learning_rate": 1.2471701341865342e-05, "loss": 1.0487828063964844, "mean_token_accuracy": 0.7284441131353379, "num_tokens": 26470346.0, "step": 55450 }, { "entropy": 1.1190640193223953, "epoch": 1.3496753483621506, "grad_norm": 19.0, "learning_rate": 1.245871161926162e-05, "loss": 1.1188704681396484, "mean_token_accuracy": 0.7094190227985382, "num_tokens": 26492566.0, "step": 55500 }, { "entropy": 1.1546835404634477, "epoch": 1.350891272099414, "grad_norm": 14.0, "learning_rate": 1.244571747956506e-05, "loss": 1.1645946502685547, "mean_token_accuracy": 0.7007417976856232, "num_tokens": 26513366.0, "step": 55550 }, { "entropy": 0.9515783423185349, "epoch": 1.352107195836677, "grad_norm": 15.5625, "learning_rate": 1.2432718946119723e-05, "loss": 0.9613963317871094, "mean_token_accuracy": 0.7465871822834015, "num_tokens": 26537454.0, "step": 55600 }, { "entropy": 1.1297689652442933, "epoch": 1.3533231195739404, "grad_norm": 12.1875, "learning_rate": 1.2419716042277572e-05, "loss": 1.1457245635986328, "mean_token_accuracy": 0.7086853808164597, "num_tokens": 26558335.0, "step": 55650 }, { "entropy": 1.0429291903972626, "epoch": 1.3545390433112035, "grad_norm": 11.9375, "learning_rate": 1.2406708791398408e-05, "loss": 1.053858642578125, "mean_token_accuracy": 0.7194636595249176, "num_tokens": 26586240.0, "step": 55700 }, { "entropy": 0.9975267523527145, "epoch": 1.3557549670484668, "grad_norm": 20.75, "learning_rate": 1.2393697216849852e-05, "loss": 1.0009669494628906, "mean_token_accuracy": 0.7482252180576324, "num_tokens": 26609910.0, "step": 55750 }, { "entropy": 1.02648468375206, "epoch": 1.3569708907857299, "grad_norm": 8.3125, "learning_rate": 1.2380681342007294e-05, "loss": 1.0398664093017578, "mean_token_accuracy": 0.7331681191921234, "num_tokens": 26638145.0, "step": 55800 }, { "entropy": 1.0850692266225814, "epoch": 1.3581868145229932, "grad_norm": 14.5, "learning_rate": 1.2367661190253844e-05, "loss": 1.0793971252441406, "mean_token_accuracy": 0.7177355748414993, "num_tokens": 26663952.0, "step": 55850 }, { "entropy": 1.0030385875701904, "epoch": 1.3594027382602563, "grad_norm": 50.5, "learning_rate": 1.2354636784980291e-05, "loss": 1.0073432922363281, "mean_token_accuracy": 0.7368400770425797, "num_tokens": 26688703.0, "step": 55900 }, { "entropy": 1.024024955034256, "epoch": 1.3606186619975196, "grad_norm": 19.375, "learning_rate": 1.234160814958508e-05, "loss": 1.0394618225097656, "mean_token_accuracy": 0.72988560795784, "num_tokens": 26713162.0, "step": 55950 }, { "entropy": 1.089662970304489, "epoch": 1.3618345857347827, "grad_norm": 17.125, "learning_rate": 1.232857530747424e-05, "loss": 1.1201802062988282, "mean_token_accuracy": 0.7142687255144119, "num_tokens": 26739955.0, "step": 56000 }, { "entropy": 1.035409631729126, "epoch": 1.363050509472046, "grad_norm": 6.0625, "learning_rate": 1.2315538282061368e-05, "loss": 1.0531453704833984, "mean_token_accuracy": 0.7375445878505706, "num_tokens": 26765315.0, "step": 56050 }, { "entropy": 1.0198426729440688, "epoch": 1.364266433209309, "grad_norm": 22.25, "learning_rate": 1.2302497096767571e-05, "loss": 1.0434074401855469, "mean_token_accuracy": 0.7355719876289367, "num_tokens": 26784786.0, "step": 56100 }, { "entropy": 1.0562584072351455, "epoch": 1.3654823569465724, "grad_norm": 12.875, "learning_rate": 1.2289451775021432e-05, "loss": 1.0517003631591797, "mean_token_accuracy": 0.7217370438575744, "num_tokens": 26805355.0, "step": 56150 }, { "entropy": 1.0774294185638427, "epoch": 1.3666982806838355, "grad_norm": 22.75, "learning_rate": 1.227640234025896e-05, "loss": 1.073239288330078, "mean_token_accuracy": 0.7237315607070923, "num_tokens": 26828807.0, "step": 56200 }, { "entropy": 1.1370603901147842, "epoch": 1.3679142044210988, "grad_norm": 14.625, "learning_rate": 1.2263348815923561e-05, "loss": 1.1426678466796876, "mean_token_accuracy": 0.7057658410072327, "num_tokens": 26852195.0, "step": 56250 }, { "entropy": 0.9495389819145202, "epoch": 1.3691301281583619, "grad_norm": 11.6875, "learning_rate": 1.2250291225465985e-05, "loss": 0.945886001586914, "mean_token_accuracy": 0.7490170323848724, "num_tokens": 26876646.0, "step": 56300 }, { "entropy": 1.1244718527793884, "epoch": 1.3703460518956252, "grad_norm": 29.625, "learning_rate": 1.223722959234428e-05, "loss": 1.1465278625488282, "mean_token_accuracy": 0.7013667869567871, "num_tokens": 26902319.0, "step": 56350 }, { "entropy": 1.0789467215538024, "epoch": 1.3715619756328883, "grad_norm": 15.0, "learning_rate": 1.2224163940023768e-05, "loss": 1.074238052368164, "mean_token_accuracy": 0.7216557043790818, "num_tokens": 26925149.0, "step": 56400 }, { "entropy": 1.0677374660968781, "epoch": 1.3727778993701514, "grad_norm": 12.0625, "learning_rate": 1.2211094291976984e-05, "loss": 1.0769838714599609, "mean_token_accuracy": 0.7256829696893692, "num_tokens": 26949846.0, "step": 56450 }, { "entropy": 1.1187461632490159, "epoch": 1.3739938231074147, "grad_norm": 13.3125, "learning_rate": 1.2198020671683641e-05, "loss": 1.142988815307617, "mean_token_accuracy": 0.715254145860672, "num_tokens": 26973410.0, "step": 56500 }, { "entropy": 1.0222478610277177, "epoch": 1.375209746844678, "grad_norm": 25.125, "learning_rate": 1.2184943102630598e-05, "loss": 1.0524510955810547, "mean_token_accuracy": 0.7275463229417801, "num_tokens": 26996994.0, "step": 56550 }, { "entropy": 1.2099426007270813, "epoch": 1.376425670581941, "grad_norm": 18.625, "learning_rate": 1.217186160831179e-05, "loss": 1.2371376037597657, "mean_token_accuracy": 0.6887109452486038, "num_tokens": 27016931.0, "step": 56600 }, { "entropy": 1.0892434960603714, "epoch": 1.3776415943192042, "grad_norm": 24.625, "learning_rate": 1.2158776212228219e-05, "loss": 1.090030975341797, "mean_token_accuracy": 0.7211398124694824, "num_tokens": 27039160.0, "step": 56650 }, { "entropy": 1.1144648706912994, "epoch": 1.3788575180564675, "grad_norm": 13.0625, "learning_rate": 1.2145686937887895e-05, "loss": 1.1296578979492187, "mean_token_accuracy": 0.7157839381694794, "num_tokens": 27063278.0, "step": 56700 }, { "entropy": 1.0128988653421402, "epoch": 1.3800734417937308, "grad_norm": 15.4375, "learning_rate": 1.2132593808805787e-05, "loss": 1.017112274169922, "mean_token_accuracy": 0.7377462422847748, "num_tokens": 27091236.0, "step": 56750 }, { "entropy": 1.0207082587480545, "epoch": 1.381289365530994, "grad_norm": 15.6875, "learning_rate": 1.2119496848503798e-05, "loss": 1.0071522521972656, "mean_token_accuracy": 0.7372329854965209, "num_tokens": 27117527.0, "step": 56800 }, { "entropy": 1.0074589347839356, "epoch": 1.382505289268257, "grad_norm": 10.5, "learning_rate": 1.2106396080510706e-05, "loss": 1.0375909423828125, "mean_token_accuracy": 0.729313246011734, "num_tokens": 27138107.0, "step": 56850 }, { "entropy": 1.003211772441864, "epoch": 1.3837212130055203, "grad_norm": 13.4375, "learning_rate": 1.2093291528362136e-05, "loss": 0.9998277282714844, "mean_token_accuracy": 0.7313429188728332, "num_tokens": 27161834.0, "step": 56900 }, { "entropy": 1.0882153445482254, "epoch": 1.3849371367427836, "grad_norm": 15.6875, "learning_rate": 1.2080183215600509e-05, "loss": 1.1065789031982423, "mean_token_accuracy": 0.7126589393615723, "num_tokens": 27186334.0, "step": 56950 }, { "entropy": 0.9903446817398072, "epoch": 1.3861530604800467, "grad_norm": 18.125, "learning_rate": 1.2067071165774999e-05, "loss": 0.989286880493164, "mean_token_accuracy": 0.7377019381523132, "num_tokens": 27214668.0, "step": 57000 }, { "entropy": 1.0771780222654344, "epoch": 1.3873689842173098, "grad_norm": 16.5, "learning_rate": 1.2053955402441503e-05, "loss": 1.1128218841552735, "mean_token_accuracy": 0.7140244245529175, "num_tokens": 27237502.0, "step": 57050 }, { "entropy": 0.9567780250310898, "epoch": 1.3885849079545731, "grad_norm": 14.5, "learning_rate": 1.2040835949162575e-05, "loss": 0.9582101440429688, "mean_token_accuracy": 0.7504405844211578, "num_tokens": 27259553.0, "step": 57100 }, { "entropy": 1.017450840473175, "epoch": 1.3898008316918362, "grad_norm": 14.0625, "learning_rate": 1.202771282950741e-05, "loss": 1.0239982604980469, "mean_token_accuracy": 0.7306475752592086, "num_tokens": 27282281.0, "step": 57150 }, { "entropy": 1.093652548789978, "epoch": 1.3910167554290995, "grad_norm": 12.6875, "learning_rate": 1.2014586067051786e-05, "loss": 1.1076268005371093, "mean_token_accuracy": 0.7138592946529388, "num_tokens": 27311971.0, "step": 57200 }, { "entropy": 1.011155235171318, "epoch": 1.3922326791663626, "grad_norm": 20.0, "learning_rate": 1.2001455685378021e-05, "loss": 0.9882084655761719, "mean_token_accuracy": 0.7430735266208649, "num_tokens": 27334082.0, "step": 57250 }, { "entropy": 1.1594024455547334, "epoch": 1.393448602903626, "grad_norm": 24.125, "learning_rate": 1.1988321708074942e-05, "loss": 1.1832711029052734, "mean_token_accuracy": 0.6997262716293335, "num_tokens": 27359447.0, "step": 57300 }, { "entropy": 1.1318856060504914, "epoch": 1.394664526640889, "grad_norm": 18.75, "learning_rate": 1.197518415873783e-05, "loss": 1.135982437133789, "mean_token_accuracy": 0.7109047448635102, "num_tokens": 27379692.0, "step": 57350 }, { "entropy": 1.022715076804161, "epoch": 1.3958804503781523, "grad_norm": 15.875, "learning_rate": 1.1962043060968384e-05, "loss": 1.030087890625, "mean_token_accuracy": 0.7404312282800675, "num_tokens": 27404933.0, "step": 57400 }, { "entropy": 1.0507593524456025, "epoch": 1.3970963741154154, "grad_norm": 18.0, "learning_rate": 1.1948898438374685e-05, "loss": 1.0998904418945312, "mean_token_accuracy": 0.710768471956253, "num_tokens": 27431084.0, "step": 57450 }, { "entropy": 1.031509618163109, "epoch": 1.3983122978526787, "grad_norm": 11.625, "learning_rate": 1.193575031457114e-05, "loss": 1.0314838409423828, "mean_token_accuracy": 0.728602055311203, "num_tokens": 27456118.0, "step": 57500 }, { "entropy": 1.065010513663292, "epoch": 1.3995282215899418, "grad_norm": 19.875, "learning_rate": 1.1922598713178438e-05, "loss": 1.0882814025878906, "mean_token_accuracy": 0.7204366385936737, "num_tokens": 27481393.0, "step": 57550 }, { "entropy": 1.0739942574501038, "epoch": 1.4007441453272051, "grad_norm": 13.4375, "learning_rate": 1.1909443657823531e-05, "loss": 1.0719107818603515, "mean_token_accuracy": 0.725602900981903, "num_tokens": 27506560.0, "step": 57600 }, { "entropy": 0.9849730724096298, "epoch": 1.4019600690644682, "grad_norm": 7.875, "learning_rate": 1.1896285172139572e-05, "loss": 0.9890288543701172, "mean_token_accuracy": 0.7405325210094452, "num_tokens": 27530457.0, "step": 57650 }, { "entropy": 0.9695141619443893, "epoch": 1.4031759928017316, "grad_norm": 15.5, "learning_rate": 1.1883123279765866e-05, "loss": 0.9738130950927735, "mean_token_accuracy": 0.7491734635829925, "num_tokens": 27552187.0, "step": 57700 }, { "entropy": 1.0466733837127686, "epoch": 1.4043919165389946, "grad_norm": 25.375, "learning_rate": 1.186995800434785e-05, "loss": 1.0514488983154298, "mean_token_accuracy": 0.7298216152191163, "num_tokens": 27573216.0, "step": 57750 }, { "entropy": 1.077525531053543, "epoch": 1.4056078402762577, "grad_norm": 31.5, "learning_rate": 1.1856789369537039e-05, "loss": 1.082899169921875, "mean_token_accuracy": 0.717076712846756, "num_tokens": 27596349.0, "step": 57800 }, { "entropy": 1.0224676036834717, "epoch": 1.406823764013521, "grad_norm": 14.4375, "learning_rate": 1.1843617398990968e-05, "loss": 1.0357215118408203, "mean_token_accuracy": 0.7305222928524018, "num_tokens": 27619175.0, "step": 57850 }, { "entropy": 0.9055188196897507, "epoch": 1.4080396877507844, "grad_norm": 21.875, "learning_rate": 1.1830442116373183e-05, "loss": 0.9236668395996094, "mean_token_accuracy": 0.7546677827835083, "num_tokens": 27642974.0, "step": 57900 }, { "entropy": 0.9280591529607772, "epoch": 1.4092556114880475, "grad_norm": 18.625, "learning_rate": 1.1817263545353174e-05, "loss": 0.9315620422363281, "mean_token_accuracy": 0.7496540927886963, "num_tokens": 27670319.0, "step": 57950 }, { "entropy": 0.9790265637636185, "epoch": 1.4104715352253105, "grad_norm": 19.875, "learning_rate": 1.1804081709606328e-05, "loss": 0.9802582550048828, "mean_token_accuracy": 0.7415940868854523, "num_tokens": 27689504.0, "step": 58000 }, { "entropy": 0.9631727516651154, "epoch": 1.4116874589625739, "grad_norm": 16.5, "learning_rate": 1.1790896632813915e-05, "loss": 0.9865300750732422, "mean_token_accuracy": 0.7382265675067902, "num_tokens": 27712541.0, "step": 58050 }, { "entropy": 1.0066139543056487, "epoch": 1.4129033826998372, "grad_norm": 25.125, "learning_rate": 1.1777708338663018e-05, "loss": 1.0130008697509765, "mean_token_accuracy": 0.7364522647857666, "num_tokens": 27737293.0, "step": 58100 }, { "entropy": 1.0370841860771178, "epoch": 1.4141193064371003, "grad_norm": 20.5, "learning_rate": 1.1764516850846499e-05, "loss": 1.045273895263672, "mean_token_accuracy": 0.7238877260684967, "num_tokens": 27762545.0, "step": 58150 }, { "entropy": 1.0163979578018187, "epoch": 1.4153352301743634, "grad_norm": 21.375, "learning_rate": 1.1751322193062955e-05, "loss": 1.0351234436035157, "mean_token_accuracy": 0.7372513210773468, "num_tokens": 27784674.0, "step": 58200 }, { "entropy": 1.0539644145965577, "epoch": 1.4165511539116267, "grad_norm": 9.75, "learning_rate": 1.1738124389016692e-05, "loss": 1.0846697998046875, "mean_token_accuracy": 0.7206973993778228, "num_tokens": 27809841.0, "step": 58250 }, { "entropy": 1.1210889613628388, "epoch": 1.41776707764889, "grad_norm": 16.875, "learning_rate": 1.1724923462417647e-05, "loss": 1.1378515625, "mean_token_accuracy": 0.6993285357952118, "num_tokens": 27837341.0, "step": 58300 }, { "entropy": 1.0417276173830032, "epoch": 1.418983001386153, "grad_norm": 12.9375, "learning_rate": 1.1711719436981387e-05, "loss": 1.046751708984375, "mean_token_accuracy": 0.730153511762619, "num_tokens": 27864290.0, "step": 58350 }, { "entropy": 1.0523381930589677, "epoch": 1.4201989251234162, "grad_norm": 19.375, "learning_rate": 1.1698512336429038e-05, "loss": 1.056630859375, "mean_token_accuracy": 0.7230587840080261, "num_tokens": 27888042.0, "step": 58400 }, { "entropy": 1.0411742383241653, "epoch": 1.4214148488606795, "grad_norm": 7.5, "learning_rate": 1.1685302184487239e-05, "loss": 1.0642853546142579, "mean_token_accuracy": 0.7220291209220886, "num_tokens": 27913331.0, "step": 58450 }, { "entropy": 1.0625030934810638, "epoch": 1.4226307725979428, "grad_norm": 13.125, "learning_rate": 1.1672089004888133e-05, "loss": 1.0604217529296875, "mean_token_accuracy": 0.7310965663194656, "num_tokens": 27936887.0, "step": 58500 }, { "entropy": 1.0885680532455444, "epoch": 1.4238466963352059, "grad_norm": 15.6875, "learning_rate": 1.1658872821369287e-05, "loss": 1.0818824768066406, "mean_token_accuracy": 0.7269440293312073, "num_tokens": 27957819.0, "step": 58550 }, { "entropy": 1.004431341290474, "epoch": 1.425062620072469, "grad_norm": 11.8125, "learning_rate": 1.1645653657673666e-05, "loss": 1.0298860168457031, "mean_token_accuracy": 0.7350693273544312, "num_tokens": 27979144.0, "step": 58600 }, { "entropy": 1.054539818763733, "epoch": 1.4262785438097323, "grad_norm": 10.75, "learning_rate": 1.1632431537549589e-05, "loss": 1.0588455963134766, "mean_token_accuracy": 0.7303541815280914, "num_tokens": 28001650.0, "step": 58650 }, { "entropy": 1.026737025976181, "epoch": 1.4274944675469954, "grad_norm": 26.875, "learning_rate": 1.1619206484750698e-05, "loss": 1.052571029663086, "mean_token_accuracy": 0.7173071098327637, "num_tokens": 28026211.0, "step": 58700 }, { "entropy": 0.9736247861385345, "epoch": 1.4287103912842587, "grad_norm": 14.25, "learning_rate": 1.1605978523035886e-05, "loss": 0.9566144561767578, "mean_token_accuracy": 0.7427229833602905, "num_tokens": 28050271.0, "step": 58750 }, { "entropy": 0.9600906759500504, "epoch": 1.4299263150215218, "grad_norm": 14.1875, "learning_rate": 1.1592747676169282e-05, "loss": 0.9789591979980469, "mean_token_accuracy": 0.7473980545997619, "num_tokens": 28073607.0, "step": 58800 }, { "entropy": 1.2424307531118393, "epoch": 1.431142238758785, "grad_norm": 7.34375, "learning_rate": 1.1579513967920196e-05, "loss": 1.248048324584961, "mean_token_accuracy": 0.6934221178293228, "num_tokens": 28102398.0, "step": 58850 }, { "entropy": 1.0733378773927689, "epoch": 1.4323581624960482, "grad_norm": 11.5625, "learning_rate": 1.1566277422063079e-05, "loss": 1.0995455932617189, "mean_token_accuracy": 0.716650505065918, "num_tokens": 28127316.0, "step": 58900 }, { "entropy": 1.0398844105005265, "epoch": 1.4335740862333115, "grad_norm": 14.5, "learning_rate": 1.1553038062377478e-05, "loss": 1.0375257873535155, "mean_token_accuracy": 0.7260085272789002, "num_tokens": 28152052.0, "step": 58950 }, { "entropy": 1.017767464518547, "epoch": 1.4347900099705746, "grad_norm": 32.25, "learning_rate": 1.1539795912648003e-05, "loss": 1.0236495208740235, "mean_token_accuracy": 0.7407971167564392, "num_tokens": 28171051.0, "step": 59000 }, { "entropy": 1.0479836970567704, "epoch": 1.436005933707838, "grad_norm": 17.375, "learning_rate": 1.1526550996664259e-05, "loss": 1.0619031524658202, "mean_token_accuracy": 0.7299083018302918, "num_tokens": 28195767.0, "step": 59050 }, { "entropy": 1.1430797725915909, "epoch": 1.437221857445101, "grad_norm": 11.0625, "learning_rate": 1.1513303338220835e-05, "loss": 1.1527129364013673, "mean_token_accuracy": 0.7038804721832276, "num_tokens": 28220532.0, "step": 59100 }, { "entropy": 1.1229410344362258, "epoch": 1.4384377811823643, "grad_norm": 22.25, "learning_rate": 1.1500052961117252e-05, "loss": 1.1419001007080078, "mean_token_accuracy": 0.7112986171245574, "num_tokens": 28244412.0, "step": 59150 }, { "entropy": 1.0001625341176987, "epoch": 1.4396537049196274, "grad_norm": 20.375, "learning_rate": 1.1486799889157893e-05, "loss": 0.9996794128417968, "mean_token_accuracy": 0.74361652135849, "num_tokens": 28266483.0, "step": 59200 }, { "entropy": 0.9414834839105606, "epoch": 1.4408696286568907, "grad_norm": 15.3125, "learning_rate": 1.1473544146152004e-05, "loss": 0.9619404602050782, "mean_token_accuracy": 0.7500218498706818, "num_tokens": 28288921.0, "step": 59250 }, { "entropy": 1.0214059495925902, "epoch": 1.4420855523941538, "grad_norm": 16.875, "learning_rate": 1.146028575591362e-05, "loss": 1.045082778930664, "mean_token_accuracy": 0.7288913893699646, "num_tokens": 28316729.0, "step": 59300 }, { "entropy": 1.0598227399587632, "epoch": 1.443301476131417, "grad_norm": 15.8125, "learning_rate": 1.1447024742261534e-05, "loss": 1.089275131225586, "mean_token_accuracy": 0.7191540718078613, "num_tokens": 28342375.0, "step": 59350 }, { "entropy": 1.080426460504532, "epoch": 1.4445173998686802, "grad_norm": 25.0, "learning_rate": 1.1433761129019248e-05, "loss": 1.0671541595458984, "mean_token_accuracy": 0.7226643443107605, "num_tokens": 28369680.0, "step": 59400 }, { "entropy": 1.0712395799160004, "epoch": 1.4457333236059435, "grad_norm": 11.3125, "learning_rate": 1.1420494940014937e-05, "loss": 1.0950267028808593, "mean_token_accuracy": 0.7257779383659363, "num_tokens": 28395459.0, "step": 59450 }, { "entropy": 1.023721616268158, "epoch": 1.4469492473432066, "grad_norm": 15.125, "learning_rate": 1.1407226199081407e-05, "loss": 1.0382070922851563, "mean_token_accuracy": 0.7255243730545043, "num_tokens": 28422126.0, "step": 59500 }, { "entropy": 1.079383634328842, "epoch": 1.4481651710804697, "grad_norm": 18.875, "learning_rate": 1.1393954930056039e-05, "loss": 1.0766160583496094, "mean_token_accuracy": 0.7177036046981812, "num_tokens": 28449142.0, "step": 59550 }, { "entropy": 1.116480843424797, "epoch": 1.449381094817733, "grad_norm": 15.3125, "learning_rate": 1.1380681156780771e-05, "loss": 1.114368896484375, "mean_token_accuracy": 0.7195276510715485, "num_tokens": 28470160.0, "step": 59600 }, { "entropy": 1.095802137851715, "epoch": 1.4505970185549963, "grad_norm": 16.125, "learning_rate": 1.1367404903102022e-05, "loss": 1.108647918701172, "mean_token_accuracy": 0.7148735976219177, "num_tokens": 28492165.0, "step": 59650 }, { "entropy": 0.9638242852687836, "epoch": 1.4518129422922594, "grad_norm": 15.3125, "learning_rate": 1.1354126192870677e-05, "loss": 0.9489512634277344, "mean_token_accuracy": 0.7440077817440033, "num_tokens": 28514999.0, "step": 59700 }, { "entropy": 1.0869210594892502, "epoch": 1.4530288660295225, "grad_norm": 23.0, "learning_rate": 1.134084504994204e-05, "loss": 1.092483673095703, "mean_token_accuracy": 0.7149652707576751, "num_tokens": 28538235.0, "step": 59750 }, { "entropy": 1.0256401354074478, "epoch": 1.4542447897667858, "grad_norm": 17.125, "learning_rate": 1.1327561498175772e-05, "loss": 1.0363218688964844, "mean_token_accuracy": 0.7271182465553284, "num_tokens": 28559344.0, "step": 59800 }, { "entropy": 1.0524402046203614, "epoch": 1.4554607135040492, "grad_norm": 12.3125, "learning_rate": 1.1314275561435868e-05, "loss": 1.0579508209228516, "mean_token_accuracy": 0.7271922439336777, "num_tokens": 28584279.0, "step": 59850 }, { "entropy": 0.9914318433403969, "epoch": 1.4566766372413122, "grad_norm": 11.0, "learning_rate": 1.1300987263590609e-05, "loss": 1.0184000396728516, "mean_token_accuracy": 0.7321703958511353, "num_tokens": 28608722.0, "step": 59900 }, { "entropy": 0.9727129226922989, "epoch": 1.4578925609785753, "grad_norm": 14.9375, "learning_rate": 1.1287696628512519e-05, "loss": 0.9592554473876953, "mean_token_accuracy": 0.7502239418029785, "num_tokens": 28633045.0, "step": 59950 }, { "entropy": 1.0181272399425507, "epoch": 1.4591084847158386, "grad_norm": 42.5, "learning_rate": 1.127440368007831e-05, "loss": 1.0365283966064454, "mean_token_accuracy": 0.7341143000125885, "num_tokens": 28656372.0, "step": 60000 }, { "epoch": 1.4591084847158386, "eval_entropy": 1.1171362991307179, "eval_loss": 1.2984925508499146, "eval_mean_token_accuracy": 0.6794324309240417, "eval_num_tokens": 28656372.0, "eval_runtime": 391.0248, "eval_samples_per_second": 11.685, "eval_steps_per_second": 11.685, "step": 60000 }, { "entropy": 1.04897368311882, "epoch": 1.4603244084531017, "grad_norm": 12.75, "learning_rate": 1.1261108442168864e-05, "loss": 1.0646489715576173, "mean_token_accuracy": 0.7244741022586823, "num_tokens": 28680619.0, "step": 60050 }, { "entropy": 1.1691409134864807, "epoch": 1.461540332190365, "grad_norm": 12.9375, "learning_rate": 1.1247810938669173e-05, "loss": 1.192813949584961, "mean_token_accuracy": 0.6940682870149613, "num_tokens": 28706520.0, "step": 60100 }, { "entropy": 1.1082885974645615, "epoch": 1.4627562559276281, "grad_norm": 17.375, "learning_rate": 1.1234511193468292e-05, "loss": 1.131165542602539, "mean_token_accuracy": 0.7171860718727112, "num_tokens": 28731727.0, "step": 60150 }, { "entropy": 1.0609053158760071, "epoch": 1.4639721796648915, "grad_norm": 11.4375, "learning_rate": 1.1221209230459305e-05, "loss": 1.0734633636474609, "mean_token_accuracy": 0.7151597177982331, "num_tokens": 28753337.0, "step": 60200 }, { "entropy": 0.9922232973575592, "epoch": 1.4651881034021546, "grad_norm": 17.25, "learning_rate": 1.1207905073539286e-05, "loss": 1.0014823913574218, "mean_token_accuracy": 0.7369194686412811, "num_tokens": 28776521.0, "step": 60250 }, { "entropy": 1.0453960806131364, "epoch": 1.4664040271394179, "grad_norm": 33.75, "learning_rate": 1.1194598746609247e-05, "loss": 1.0583928680419923, "mean_token_accuracy": 0.7187484002113342, "num_tokens": 28800610.0, "step": 60300 }, { "entropy": 1.0324378272891044, "epoch": 1.467619950876681, "grad_norm": 29.0, "learning_rate": 1.1181290273574098e-05, "loss": 1.0439817810058594, "mean_token_accuracy": 0.7336442661285401, "num_tokens": 28825213.0, "step": 60350 }, { "entropy": 1.0273615735769273, "epoch": 1.4688358746139443, "grad_norm": 10.625, "learning_rate": 1.1167979678342609e-05, "loss": 1.0405608367919923, "mean_token_accuracy": 0.7354606068134308, "num_tokens": 28846645.0, "step": 60400 }, { "entropy": 1.1538165420293809, "epoch": 1.4700517983512074, "grad_norm": 33.0, "learning_rate": 1.115466698482735e-05, "loss": 1.1758214569091796, "mean_token_accuracy": 0.7061018007993698, "num_tokens": 28870904.0, "step": 60450 }, { "entropy": 1.0976719433069229, "epoch": 1.4712677220884707, "grad_norm": 39.5, "learning_rate": 1.1141352216944674e-05, "loss": 1.0986181640625, "mean_token_accuracy": 0.7178906011581421, "num_tokens": 28893626.0, "step": 60500 }, { "entropy": 1.0251168417930603, "epoch": 1.4724836458257338, "grad_norm": 16.125, "learning_rate": 1.112803539861466e-05, "loss": 1.0272953033447265, "mean_token_accuracy": 0.7212985777854919, "num_tokens": 28919237.0, "step": 60550 }, { "entropy": 1.0254731947183608, "epoch": 1.473699569562997, "grad_norm": 10.8125, "learning_rate": 1.111471655376106e-05, "loss": 1.0226531982421876, "mean_token_accuracy": 0.7335396659374237, "num_tokens": 28943310.0, "step": 60600 }, { "entropy": 0.9939493113756179, "epoch": 1.4749154933002602, "grad_norm": 16.125, "learning_rate": 1.1101395706311277e-05, "loss": 1.0060607147216798, "mean_token_accuracy": 0.7396328401565552, "num_tokens": 28967907.0, "step": 60650 }, { "entropy": 1.1087494373321534, "epoch": 1.4761314170375235, "grad_norm": 13.375, "learning_rate": 1.1088072880196306e-05, "loss": 1.1473308563232423, "mean_token_accuracy": 0.7053141319751739, "num_tokens": 28989858.0, "step": 60700 }, { "entropy": 1.0809915578365326, "epoch": 1.4773473407747866, "grad_norm": 29.125, "learning_rate": 1.1074748099350699e-05, "loss": 1.0918978881835937, "mean_token_accuracy": 0.7213046491146088, "num_tokens": 29015650.0, "step": 60750 }, { "entropy": 1.1523029279708863, "epoch": 1.47856326451205, "grad_norm": 36.5, "learning_rate": 1.1061421387712516e-05, "loss": 1.1546858215332032, "mean_token_accuracy": 0.7059702515602112, "num_tokens": 29042139.0, "step": 60800 }, { "entropy": 1.0531483393907548, "epoch": 1.479779188249313, "grad_norm": 14.4375, "learning_rate": 1.1048092769223298e-05, "loss": 1.0593535614013672, "mean_token_accuracy": 0.7279757559299469, "num_tokens": 29064713.0, "step": 60850 }, { "entropy": 1.0963643288612366, "epoch": 1.480995111986576, "grad_norm": 12.875, "learning_rate": 1.103476226782799e-05, "loss": 1.110780258178711, "mean_token_accuracy": 0.7230631875991821, "num_tokens": 29090522.0, "step": 60900 }, { "entropy": 1.10781454205513, "epoch": 1.4822110357238394, "grad_norm": 11.0, "learning_rate": 1.102142990747494e-05, "loss": 1.1137989807128905, "mean_token_accuracy": 0.7191085582971573, "num_tokens": 29112833.0, "step": 60950 }, { "entropy": 0.9949025648832321, "epoch": 1.4834269594611027, "grad_norm": 8.75, "learning_rate": 1.1008095712115829e-05, "loss": 0.990078353881836, "mean_token_accuracy": 0.7356062823534012, "num_tokens": 29137175.0, "step": 61000 }, { "entropy": 1.0252605664730072, "epoch": 1.4846428831983658, "grad_norm": 13.125, "learning_rate": 1.0994759705705632e-05, "loss": 1.0492635345458985, "mean_token_accuracy": 0.7265597438812256, "num_tokens": 29160664.0, "step": 61050 }, { "entropy": 1.0823705673217774, "epoch": 1.4858588069356289, "grad_norm": 23.75, "learning_rate": 1.0981421912202572e-05, "loss": 1.0896253204345703, "mean_token_accuracy": 0.7229091477394104, "num_tokens": 29185708.0, "step": 61100 }, { "entropy": 1.0481963461637498, "epoch": 1.4870747306728922, "grad_norm": 27.125, "learning_rate": 1.0968082355568099e-05, "loss": 1.0446250915527344, "mean_token_accuracy": 0.7304726433753967, "num_tokens": 29208076.0, "step": 61150 }, { "entropy": 0.9494848895072937, "epoch": 1.4882906544101555, "grad_norm": 21.875, "learning_rate": 1.0954741059766818e-05, "loss": 0.9646263122558594, "mean_token_accuracy": 0.7411012256145477, "num_tokens": 29231303.0, "step": 61200 }, { "entropy": 1.0294305604696274, "epoch": 1.4895065781474186, "grad_norm": 13.0625, "learning_rate": 1.0941398048766458e-05, "loss": 1.0387551116943359, "mean_token_accuracy": 0.7291430866718293, "num_tokens": 29254478.0, "step": 61250 }, { "entropy": 1.0610681456327438, "epoch": 1.4907225018846817, "grad_norm": 22.625, "learning_rate": 1.092805334653784e-05, "loss": 1.087912826538086, "mean_token_accuracy": 0.7160311567783356, "num_tokens": 29281248.0, "step": 61300 }, { "entropy": 1.0855142325162888, "epoch": 1.491938425621945, "grad_norm": 16.875, "learning_rate": 1.0914706977054806e-05, "loss": 1.0579692840576171, "mean_token_accuracy": 0.7298198986053467, "num_tokens": 29301943.0, "step": 61350 }, { "entropy": 1.0311577260494231, "epoch": 1.4931543493592083, "grad_norm": 21.25, "learning_rate": 1.090135896429421e-05, "loss": 1.0428280639648437, "mean_token_accuracy": 0.7293343281745911, "num_tokens": 29323314.0, "step": 61400 }, { "entropy": 1.00466738820076, "epoch": 1.4943702730964714, "grad_norm": 10.0, "learning_rate": 1.0888009332235854e-05, "loss": 1.0197222900390626, "mean_token_accuracy": 0.734057959318161, "num_tokens": 29345906.0, "step": 61450 }, { "entropy": 1.1249745643138886, "epoch": 1.4955861968337345, "grad_norm": 26.0, "learning_rate": 1.087465810486244e-05, "loss": 1.1404303741455077, "mean_token_accuracy": 0.7170071136951447, "num_tokens": 29366930.0, "step": 61500 }, { "entropy": 1.024164187312126, "epoch": 1.4968021205709978, "grad_norm": 13.8125, "learning_rate": 1.0861305306159545e-05, "loss": 1.0072457122802734, "mean_token_accuracy": 0.7283740621805191, "num_tokens": 29391117.0, "step": 61550 }, { "entropy": 1.0783828872442245, "epoch": 1.498018044308261, "grad_norm": 15.0, "learning_rate": 1.0847950960115568e-05, "loss": 1.1237003326416015, "mean_token_accuracy": 0.7118296551704407, "num_tokens": 29417944.0, "step": 61600 }, { "entropy": 1.0619436210393907, "epoch": 1.4992339680455242, "grad_norm": 16.125, "learning_rate": 1.0834595090721682e-05, "loss": 1.0571080780029296, "mean_token_accuracy": 0.7222172164916992, "num_tokens": 29442448.0, "step": 61650 }, { "entropy": 1.0853009098768234, "epoch": 1.5004498917827873, "grad_norm": 32.25, "learning_rate": 1.0821237721971807e-05, "loss": 1.1087355041503906, "mean_token_accuracy": 0.7192370820045472, "num_tokens": 29463775.0, "step": 61700 }, { "entropy": 1.0817710703611374, "epoch": 1.5016658155200506, "grad_norm": 14.5625, "learning_rate": 1.0807878877862548e-05, "loss": 1.0865257263183594, "mean_token_accuracy": 0.7209163552522659, "num_tokens": 29488286.0, "step": 61750 }, { "entropy": 1.0112564492225646, "epoch": 1.502881739257314, "grad_norm": 8.3125, "learning_rate": 1.079451858239316e-05, "loss": 1.0279396820068358, "mean_token_accuracy": 0.7386095058918, "num_tokens": 29513329.0, "step": 61800 }, { "entropy": 0.9982373106479645, "epoch": 1.504097662994577, "grad_norm": 16.25, "learning_rate": 1.0781156859565514e-05, "loss": 1.0041990661621094, "mean_token_accuracy": 0.735728600025177, "num_tokens": 29537052.0, "step": 61850 }, { "entropy": 1.0419027656316757, "epoch": 1.5053135867318401, "grad_norm": 21.25, "learning_rate": 1.0767793733384036e-05, "loss": 1.0277965545654297, "mean_token_accuracy": 0.7333485901355743, "num_tokens": 29560484.0, "step": 61900 }, { "entropy": 1.005578765273094, "epoch": 1.5065295104691034, "grad_norm": 16.0, "learning_rate": 1.0754429227855679e-05, "loss": 1.0046352386474608, "mean_token_accuracy": 0.7289977812767029, "num_tokens": 29583047.0, "step": 61950 }, { "entropy": 1.0901249623298646, "epoch": 1.5077454342063665, "grad_norm": 12.1875, "learning_rate": 1.0741063366989867e-05, "loss": 1.100882797241211, "mean_token_accuracy": 0.7145828807353973, "num_tokens": 29610662.0, "step": 62000 }, { "entropy": 1.0317211472988128, "epoch": 1.5089613579436296, "grad_norm": 12.375, "learning_rate": 1.072769617479847e-05, "loss": 1.0449739074707032, "mean_token_accuracy": 0.7199174046516419, "num_tokens": 29636325.0, "step": 62050 }, { "entropy": 1.1316528463363646, "epoch": 1.510177281680893, "grad_norm": 9.6875, "learning_rate": 1.0714327675295739e-05, "loss": 1.158927459716797, "mean_token_accuracy": 0.7048649358749389, "num_tokens": 29660893.0, "step": 62100 }, { "entropy": 1.0739480590820312, "epoch": 1.5113932054181562, "grad_norm": 8.125, "learning_rate": 1.070095789249828e-05, "loss": 1.0857657623291015, "mean_token_accuracy": 0.7147703087329864, "num_tokens": 29683160.0, "step": 62150 }, { "entropy": 1.1102619636058808, "epoch": 1.5126091291554193, "grad_norm": 14.1875, "learning_rate": 1.0687586850425002e-05, "loss": 1.1397525787353515, "mean_token_accuracy": 0.7122361874580383, "num_tokens": 29706661.0, "step": 62200 }, { "entropy": 1.1580451381206514, "epoch": 1.5138250528926824, "grad_norm": 13.4375, "learning_rate": 1.0674214573097076e-05, "loss": 1.159133529663086, "mean_token_accuracy": 0.7098443996906281, "num_tokens": 29729229.0, "step": 62250 }, { "entropy": 1.0770572632551194, "epoch": 1.5150409766299457, "grad_norm": 17.125, "learning_rate": 1.0660841084537896e-05, "loss": 1.1133444213867187, "mean_token_accuracy": 0.7180780982971191, "num_tokens": 29754996.0, "step": 62300 }, { "entropy": 1.0238256204128264, "epoch": 1.516256900367209, "grad_norm": 30.5, "learning_rate": 1.0647466408773026e-05, "loss": 1.0281260681152344, "mean_token_accuracy": 0.7295376694202423, "num_tokens": 29778341.0, "step": 62350 }, { "entropy": 1.1064566779136658, "epoch": 1.5174728241044722, "grad_norm": 13.875, "learning_rate": 1.0634090569830168e-05, "loss": 1.1415961456298829, "mean_token_accuracy": 0.7032273888587952, "num_tokens": 29802450.0, "step": 62400 }, { "entropy": 1.1959719014167787, "epoch": 1.5186887478417352, "grad_norm": 14.5625, "learning_rate": 1.0620713591739111e-05, "loss": 1.2008097076416016, "mean_token_accuracy": 0.7087286365032196, "num_tokens": 29826782.0, "step": 62450 }, { "entropy": 1.071339048743248, "epoch": 1.5199046715789986, "grad_norm": 18.25, "learning_rate": 1.0607335498531685e-05, "loss": 1.0711934661865234, "mean_token_accuracy": 0.721354923248291, "num_tokens": 29850132.0, "step": 62500 }, { "entropy": 1.0808813667297363, "epoch": 1.5211205953162619, "grad_norm": 14.5, "learning_rate": 1.0593956314241738e-05, "loss": 1.109319076538086, "mean_token_accuracy": 0.7182936847209931, "num_tokens": 29870573.0, "step": 62550 }, { "entropy": 1.0184749019145967, "epoch": 1.522336519053525, "grad_norm": 15.1875, "learning_rate": 1.0580576062905065e-05, "loss": 1.0302418518066405, "mean_token_accuracy": 0.7319350361824035, "num_tokens": 29891798.0, "step": 62600 }, { "entropy": 0.9733039498329162, "epoch": 1.523552442790788, "grad_norm": 15.6875, "learning_rate": 1.0567194768559381e-05, "loss": 1.0047384643554687, "mean_token_accuracy": 0.7432723486423493, "num_tokens": 29915644.0, "step": 62650 }, { "entropy": 1.102737311720848, "epoch": 1.5247683665280514, "grad_norm": 26.25, "learning_rate": 1.055381245524428e-05, "loss": 1.1088141632080077, "mean_token_accuracy": 0.719790153503418, "num_tokens": 29936838.0, "step": 62700 }, { "entropy": 1.1687860488891602, "epoch": 1.5259842902653147, "grad_norm": 8.75, "learning_rate": 1.0540429147001178e-05, "loss": 1.1779611206054688, "mean_token_accuracy": 0.7017170733213425, "num_tokens": 29964231.0, "step": 62750 }, { "entropy": 0.9657506543397903, "epoch": 1.5272002140025778, "grad_norm": 17.5, "learning_rate": 1.0527044867873288e-05, "loss": 0.9591667175292968, "mean_token_accuracy": 0.7456932604312897, "num_tokens": 29982855.0, "step": 62800 }, { "entropy": 1.0941114491224289, "epoch": 1.5284161377398409, "grad_norm": 16.875, "learning_rate": 1.051365964190556e-05, "loss": 1.0946868133544922, "mean_token_accuracy": 0.7133577287197113, "num_tokens": 30003787.0, "step": 62850 }, { "entropy": 1.12908458173275, "epoch": 1.5296320614771042, "grad_norm": 19.25, "learning_rate": 1.0500273493144647e-05, "loss": 1.1274006652832032, "mean_token_accuracy": 0.7100715178251267, "num_tokens": 30025040.0, "step": 62900 }, { "entropy": 1.0146717268228531, "epoch": 1.5308479852143675, "grad_norm": 18.25, "learning_rate": 1.0486886445638854e-05, "loss": 1.019323196411133, "mean_token_accuracy": 0.7323240971565247, "num_tokens": 30052783.0, "step": 62950 }, { "entropy": 1.0161329066753388, "epoch": 1.5320639089516306, "grad_norm": 15.8125, "learning_rate": 1.0473498523438121e-05, "loss": 1.026114273071289, "mean_token_accuracy": 0.7369299066066742, "num_tokens": 30074228.0, "step": 63000 }, { "entropy": 0.9925145661830902, "epoch": 1.5332798326888937, "grad_norm": 9.5, "learning_rate": 1.0460109750593934e-05, "loss": 1.0008731079101563, "mean_token_accuracy": 0.7351546216011048, "num_tokens": 30098041.0, "step": 63050 }, { "entropy": 1.0631601047515868, "epoch": 1.534495756426157, "grad_norm": 16.0, "learning_rate": 1.044672015115932e-05, "loss": 1.0906798553466797, "mean_token_accuracy": 0.7187147265672684, "num_tokens": 30121402.0, "step": 63100 }, { "entropy": 1.026601951122284, "epoch": 1.5357116801634203, "grad_norm": 13.5, "learning_rate": 1.043332974918879e-05, "loss": 1.0299557495117186, "mean_token_accuracy": 0.7331348180770874, "num_tokens": 30145758.0, "step": 63150 }, { "entropy": 0.9705434739589691, "epoch": 1.5369276039006834, "grad_norm": 12.9375, "learning_rate": 1.0419938568738298e-05, "loss": 0.9743463134765625, "mean_token_accuracy": 0.7405224692821503, "num_tokens": 30166900.0, "step": 63200 }, { "entropy": 1.1103163594007492, "epoch": 1.5381435276379465, "grad_norm": 29.875, "learning_rate": 1.040654663386519e-05, "loss": 1.113380126953125, "mean_token_accuracy": 0.7179055845737458, "num_tokens": 30187648.0, "step": 63250 }, { "entropy": 1.1000190836191177, "epoch": 1.5393594513752098, "grad_norm": 13.0, "learning_rate": 1.039315396862818e-05, "loss": 1.128774642944336, "mean_token_accuracy": 0.7076512265205384, "num_tokens": 30215310.0, "step": 63300 }, { "entropy": 1.0658751440048218, "epoch": 1.540575375112473, "grad_norm": 19.25, "learning_rate": 1.0379760597087276e-05, "loss": 1.0816213226318359, "mean_token_accuracy": 0.7190074682235718, "num_tokens": 30237055.0, "step": 63350 }, { "entropy": 1.0508594411611556, "epoch": 1.541791298849736, "grad_norm": 17.75, "learning_rate": 1.036636654330377e-05, "loss": 1.0584161376953125, "mean_token_accuracy": 0.7295702850818634, "num_tokens": 30266376.0, "step": 63400 }, { "entropy": 1.0785071277618408, "epoch": 1.5430072225869993, "grad_norm": 24.375, "learning_rate": 1.0352971831340176e-05, "loss": 1.0797045135498047, "mean_token_accuracy": 0.712654447555542, "num_tokens": 30289498.0, "step": 63450 }, { "entropy": 0.9066701519489289, "epoch": 1.5442231463242626, "grad_norm": 21.625, "learning_rate": 1.0339576485260186e-05, "loss": 0.9041043090820312, "mean_token_accuracy": 0.7654973530769348, "num_tokens": 30310915.0, "step": 63500 }, { "entropy": 1.0341929256916047, "epoch": 1.5454390700615257, "grad_norm": 25.375, "learning_rate": 1.0326180529128632e-05, "loss": 1.0690788269042968, "mean_token_accuracy": 0.726358276605606, "num_tokens": 30330810.0, "step": 63550 }, { "entropy": 0.9805673730373382, "epoch": 1.5466549937987888, "grad_norm": 17.875, "learning_rate": 1.031278398701145e-05, "loss": 0.9934425354003906, "mean_token_accuracy": 0.7425825941562653, "num_tokens": 30350766.0, "step": 63600 }, { "entropy": 1.096551775932312, "epoch": 1.547870917536052, "grad_norm": 13.75, "learning_rate": 1.029938688297562e-05, "loss": 1.1067649841308593, "mean_token_accuracy": 0.7073927569389343, "num_tokens": 30375996.0, "step": 63650 }, { "entropy": 1.0077642285823822, "epoch": 1.5490868412733154, "grad_norm": 9.25, "learning_rate": 1.0285989241089134e-05, "loss": 1.0220103454589844, "mean_token_accuracy": 0.730487904548645, "num_tokens": 30397029.0, "step": 63700 }, { "entropy": 1.0044749337434768, "epoch": 1.5503027650105785, "grad_norm": 29.375, "learning_rate": 1.0272591085420951e-05, "loss": 1.0139543914794922, "mean_token_accuracy": 0.7308397090435028, "num_tokens": 30417607.0, "step": 63750 }, { "entropy": 1.07502226293087, "epoch": 1.5515186887478416, "grad_norm": 10.625, "learning_rate": 1.0259192440040953e-05, "loss": 1.0863434600830078, "mean_token_accuracy": 0.7275008940696717, "num_tokens": 30443988.0, "step": 63800 }, { "entropy": 1.0351419991254807, "epoch": 1.552734612485105, "grad_norm": 12.6875, "learning_rate": 1.0245793329019896e-05, "loss": 1.0372911071777344, "mean_token_accuracy": 0.7360386854410171, "num_tokens": 30470118.0, "step": 63850 }, { "entropy": 0.9698303532600403, "epoch": 1.5539505362223682, "grad_norm": 19.125, "learning_rate": 1.0232393776429388e-05, "loss": 0.9508734893798828, "mean_token_accuracy": 0.7485504460334778, "num_tokens": 30491157.0, "step": 63900 }, { "entropy": 1.066955839395523, "epoch": 1.5551664599596313, "grad_norm": 21.375, "learning_rate": 1.021899380634181e-05, "loss": 1.0683878326416016, "mean_token_accuracy": 0.7268337631225585, "num_tokens": 30513160.0, "step": 63950 }, { "entropy": 0.9829577857255936, "epoch": 1.5563823836968944, "grad_norm": 12.5, "learning_rate": 1.020559344283031e-05, "loss": 0.9925920104980469, "mean_token_accuracy": 0.7413454306125641, "num_tokens": 30537785.0, "step": 64000 }, { "entropy": 1.0395965039730073, "epoch": 1.5575983074341577, "grad_norm": 12.5, "learning_rate": 1.0192192709968736e-05, "loss": 1.0522652435302735, "mean_token_accuracy": 0.7319646513462067, "num_tokens": 30563778.0, "step": 64050 }, { "entropy": 1.203722357749939, "epoch": 1.558814231171421, "grad_norm": 20.625, "learning_rate": 1.0178791631831593e-05, "loss": 1.2203373718261719, "mean_token_accuracy": 0.6895324963331223, "num_tokens": 30590516.0, "step": 64100 }, { "entropy": 1.0712510085105895, "epoch": 1.5600301549086841, "grad_norm": 12.1875, "learning_rate": 1.0165390232494019e-05, "loss": 1.0742120361328125, "mean_token_accuracy": 0.7217804950475692, "num_tokens": 30612602.0, "step": 64150 }, { "entropy": 1.061323186159134, "epoch": 1.5612460786459472, "grad_norm": 14.3125, "learning_rate": 1.015198853603172e-05, "loss": 1.0669564819335937, "mean_token_accuracy": 0.7287099730968475, "num_tokens": 30634953.0, "step": 64200 }, { "entropy": 1.0147472178936006, "epoch": 1.5624620023832105, "grad_norm": 14.5625, "learning_rate": 1.0138586566520943e-05, "loss": 1.0420791625976562, "mean_token_accuracy": 0.7283444905281067, "num_tokens": 30658024.0, "step": 64250 }, { "entropy": 1.0095073235034944, "epoch": 1.5636779261204738, "grad_norm": 12.5, "learning_rate": 1.0125184348038413e-05, "loss": 0.9872137451171875, "mean_token_accuracy": 0.7415936613082885, "num_tokens": 30683231.0, "step": 64300 }, { "entropy": 1.0338542938232422, "epoch": 1.564893849857737, "grad_norm": 19.125, "learning_rate": 1.0111781904661317e-05, "loss": 1.0659596252441406, "mean_token_accuracy": 0.7278216481208801, "num_tokens": 30705545.0, "step": 64350 }, { "entropy": 1.0038412749767303, "epoch": 1.566109773595, "grad_norm": 23.5, "learning_rate": 1.009837926046724e-05, "loss": 1.023806610107422, "mean_token_accuracy": 0.7318216884136199, "num_tokens": 30731265.0, "step": 64400 }, { "entropy": 1.0687776011228562, "epoch": 1.5673256973322633, "grad_norm": 25.375, "learning_rate": 1.008497643953412e-05, "loss": 1.0676013946533203, "mean_token_accuracy": 0.7259909701347351, "num_tokens": 30753913.0, "step": 64450 }, { "entropy": 1.1552897971868514, "epoch": 1.5685416210695267, "grad_norm": 30.0, "learning_rate": 1.0071573465940227e-05, "loss": 1.1597048950195312, "mean_token_accuracy": 0.7056771069765091, "num_tokens": 30776045.0, "step": 64500 }, { "entropy": 1.080007675886154, "epoch": 1.5697575448067898, "grad_norm": 8.375, "learning_rate": 1.0058170363764096e-05, "loss": 1.1046817779541016, "mean_token_accuracy": 0.7222706019878388, "num_tokens": 30799644.0, "step": 64550 }, { "entropy": 1.0441304302215577, "epoch": 1.5709734685440528, "grad_norm": 25.125, "learning_rate": 1.0044767157084492e-05, "loss": 1.0555299377441407, "mean_token_accuracy": 0.7270373952388763, "num_tokens": 30823685.0, "step": 64600 }, { "entropy": 1.07829853951931, "epoch": 1.5721893922813162, "grad_norm": 25.125, "learning_rate": 1.0031363869980375e-05, "loss": 1.0839434051513672, "mean_token_accuracy": 0.7249143970012665, "num_tokens": 30848756.0, "step": 64650 }, { "entropy": 1.1184885495901107, "epoch": 1.5734053160185795, "grad_norm": 10.875, "learning_rate": 1.0017960526530846e-05, "loss": 1.1314781188964844, "mean_token_accuracy": 0.7167399120330811, "num_tokens": 30873913.0, "step": 64700 }, { "entropy": 1.0665105032920836, "epoch": 1.5746212397558426, "grad_norm": 12.6875, "learning_rate": 1.00045571508151e-05, "loss": 1.075699005126953, "mean_token_accuracy": 0.7257876551151276, "num_tokens": 30899091.0, "step": 64750 }, { "entropy": 1.0685885071754455, "epoch": 1.5758371634931057, "grad_norm": 19.5, "learning_rate": 9.9911537669124e-06, "loss": 1.0867007446289063, "mean_token_accuracy": 0.7217640626430512, "num_tokens": 30922852.0, "step": 64800 }, { "entropy": 0.9790044057369233, "epoch": 1.577053087230369, "grad_norm": 15.25, "learning_rate": 9.977750398902023e-06, "loss": 0.989991683959961, "mean_token_accuracy": 0.7407605564594268, "num_tokens": 30949281.0, "step": 64850 }, { "entropy": 1.034365560412407, "epoch": 1.578269010967632, "grad_norm": 25.75, "learning_rate": 9.964347070863212e-06, "loss": 1.050226364135742, "mean_token_accuracy": 0.7274121320247651, "num_tokens": 30970271.0, "step": 64900 }, { "entropy": 1.0290661883354186, "epoch": 1.5794849347048951, "grad_norm": 15.0, "learning_rate": 9.950943806875138e-06, "loss": 1.0333161926269532, "mean_token_accuracy": 0.7300195848941803, "num_tokens": 30994546.0, "step": 64950 }, { "entropy": 1.0422152316570281, "epoch": 1.5807008584421585, "grad_norm": 47.0, "learning_rate": 9.937540631016866e-06, "loss": 1.066397476196289, "mean_token_accuracy": 0.7241289722919464, "num_tokens": 31016614.0, "step": 65000 }, { "entropy": 1.109142904281616, "epoch": 1.5819167821794218, "grad_norm": 30.0, "learning_rate": 9.92413756736729e-06, "loss": 1.1374166870117188, "mean_token_accuracy": 0.7097100245952607, "num_tokens": 31039057.0, "step": 65050 }, { "entropy": 1.0504359287023544, "epoch": 1.5831327059166849, "grad_norm": 16.5, "learning_rate": 9.910734640005111e-06, "loss": 1.0381865692138672, "mean_token_accuracy": 0.7244411432743072, "num_tokens": 31066444.0, "step": 65100 }, { "entropy": 1.0061732602119446, "epoch": 1.584348629653948, "grad_norm": 26.5, "learning_rate": 9.897331873008787e-06, "loss": 1.0208829498291017, "mean_token_accuracy": 0.7315636563301087, "num_tokens": 31088315.0, "step": 65150 }, { "entropy": 0.9897393721342087, "epoch": 1.5855645533912113, "grad_norm": 18.875, "learning_rate": 9.883929290456476e-06, "loss": 0.9838695526123047, "mean_token_accuracy": 0.7475393259525299, "num_tokens": 31111352.0, "step": 65200 }, { "entropy": 1.0675284099578857, "epoch": 1.5867804771284746, "grad_norm": 20.5, "learning_rate": 9.87052691642602e-06, "loss": 1.0832481384277344, "mean_token_accuracy": 0.7190818333625794, "num_tokens": 31132738.0, "step": 65250 }, { "entropy": 1.0509715640544892, "epoch": 1.5879964008657377, "grad_norm": 13.125, "learning_rate": 9.857124774994875e-06, "loss": 1.0804593658447266, "mean_token_accuracy": 0.7133583343029022, "num_tokens": 31156385.0, "step": 65300 }, { "entropy": 0.9851748406887054, "epoch": 1.5892123246030008, "grad_norm": 20.5, "learning_rate": 9.843722890240081e-06, "loss": 0.9737649536132813, "mean_token_accuracy": 0.7432654237747193, "num_tokens": 31179456.0, "step": 65350 }, { "entropy": 1.0041329437494277, "epoch": 1.590428248340264, "grad_norm": 12.375, "learning_rate": 9.830321286238227e-06, "loss": 1.022803726196289, "mean_token_accuracy": 0.7291398966312408, "num_tokens": 31203948.0, "step": 65400 }, { "entropy": 1.0897718060016632, "epoch": 1.5916441720775274, "grad_norm": 15.625, "learning_rate": 9.81691998706538e-06, "loss": 1.096092529296875, "mean_token_accuracy": 0.7205407351255417, "num_tokens": 31226715.0, "step": 65450 }, { "entropy": 1.0610156756639482, "epoch": 1.5928600958147905, "grad_norm": 13.3125, "learning_rate": 9.803519016797076e-06, "loss": 1.0977529907226562, "mean_token_accuracy": 0.7198502349853516, "num_tokens": 31247760.0, "step": 65500 }, { "entropy": 0.9648764836788177, "epoch": 1.5940760195520536, "grad_norm": 14.0625, "learning_rate": 9.790118399508247e-06, "loss": 0.9894435119628906, "mean_token_accuracy": 0.741221798658371, "num_tokens": 31269450.0, "step": 65550 }, { "entropy": 1.0451366877555848, "epoch": 1.595291943289317, "grad_norm": 23.375, "learning_rate": 9.776718159273199e-06, "loss": 1.0564450073242186, "mean_token_accuracy": 0.7294206291437149, "num_tokens": 31294952.0, "step": 65600 }, { "entropy": 1.1194828712940217, "epoch": 1.5965078670265802, "grad_norm": 12.875, "learning_rate": 9.763318320165565e-06, "loss": 1.1323332977294922, "mean_token_accuracy": 0.7099667763710023, "num_tokens": 31319890.0, "step": 65650 }, { "entropy": 1.1047960925102234, "epoch": 1.5977237907638433, "grad_norm": 14.375, "learning_rate": 9.749918906258239e-06, "loss": 1.1149553680419921, "mean_token_accuracy": 0.7148602437973023, "num_tokens": 31342876.0, "step": 65700 }, { "entropy": 1.0324284493923188, "epoch": 1.5989397145011064, "grad_norm": 41.75, "learning_rate": 9.73651994162337e-06, "loss": 1.0175080871582032, "mean_token_accuracy": 0.7290987527370453, "num_tokens": 31365970.0, "step": 65750 }, { "entropy": 0.9935515934228897, "epoch": 1.6001556382383697, "grad_norm": 16.25, "learning_rate": 9.723121450332298e-06, "loss": 0.9948262023925781, "mean_token_accuracy": 0.7429851996898651, "num_tokens": 31389739.0, "step": 65800 }, { "entropy": 0.9569682544469833, "epoch": 1.601371561975633, "grad_norm": 19.625, "learning_rate": 9.709723456455495e-06, "loss": 0.9584075164794922, "mean_token_accuracy": 0.7420991742610932, "num_tokens": 31413571.0, "step": 65850 }, { "entropy": 1.0871985828876496, "epoch": 1.602587485712896, "grad_norm": 17.75, "learning_rate": 9.696325984062564e-06, "loss": 1.1045375061035156, "mean_token_accuracy": 0.7151115572452545, "num_tokens": 31438452.0, "step": 65900 }, { "entropy": 1.0396313536167145, "epoch": 1.6038034094501592, "grad_norm": 15.0, "learning_rate": 9.68292905722215e-06, "loss": 1.0532053375244141, "mean_token_accuracy": 0.7245143496990204, "num_tokens": 31459721.0, "step": 65950 }, { "entropy": 1.0899536997079848, "epoch": 1.6050193331874225, "grad_norm": 28.25, "learning_rate": 9.66953270000193e-06, "loss": 1.122783203125, "mean_token_accuracy": 0.7138630640506745, "num_tokens": 31483780.0, "step": 66000 }, { "entropy": 0.9391698095202446, "epoch": 1.6062352569246858, "grad_norm": 14.8125, "learning_rate": 9.656136936468558e-06, "loss": 0.9397526550292968, "mean_token_accuracy": 0.7467468667030335, "num_tokens": 31508978.0, "step": 66050 }, { "entropy": 0.9953205060958862, "epoch": 1.607451180661949, "grad_norm": 15.375, "learning_rate": 9.642741790687613e-06, "loss": 1.0091362762451173, "mean_token_accuracy": 0.7375274693965912, "num_tokens": 31532230.0, "step": 66100 }, { "entropy": 1.046612418293953, "epoch": 1.608667104399212, "grad_norm": 13.8125, "learning_rate": 9.629347286723575e-06, "loss": 1.069105758666992, "mean_token_accuracy": 0.7292892879247665, "num_tokens": 31559664.0, "step": 66150 }, { "entropy": 1.047754996418953, "epoch": 1.6098830281364753, "grad_norm": 31.625, "learning_rate": 9.615953448639758e-06, "loss": 1.0508331298828124, "mean_token_accuracy": 0.7229628098011017, "num_tokens": 31582528.0, "step": 66200 }, { "entropy": 1.0567557448148728, "epoch": 1.6110989518737386, "grad_norm": 9.25, "learning_rate": 9.60256030049829e-06, "loss": 1.0881367492675782, "mean_token_accuracy": 0.7161280417442322, "num_tokens": 31611042.0, "step": 66250 }, { "entropy": 1.005825120806694, "epoch": 1.6123148756110017, "grad_norm": 17.625, "learning_rate": 9.58916786636006e-06, "loss": 0.9961501312255859, "mean_token_accuracy": 0.7333925437927246, "num_tokens": 31637420.0, "step": 66300 }, { "entropy": 0.9563955914974213, "epoch": 1.6135307993482648, "grad_norm": 14.25, "learning_rate": 9.575776170284666e-06, "loss": 0.9583890533447266, "mean_token_accuracy": 0.7432342898845673, "num_tokens": 31660532.0, "step": 66350 }, { "entropy": 0.9870337736606598, "epoch": 1.6147467230855281, "grad_norm": 12.8125, "learning_rate": 9.56238523633039e-06, "loss": 0.9987638092041016, "mean_token_accuracy": 0.733349814414978, "num_tokens": 31682471.0, "step": 66400 }, { "entropy": 1.055628153383732, "epoch": 1.6159626468227912, "grad_norm": 11.375, "learning_rate": 9.548995088554133e-06, "loss": 1.0411167907714844, "mean_token_accuracy": 0.7287552332878113, "num_tokens": 31705509.0, "step": 66450 }, { "entropy": 1.0674532294273376, "epoch": 1.6171785705600543, "grad_norm": 28.875, "learning_rate": 9.535605751011394e-06, "loss": 1.12416015625, "mean_token_accuracy": 0.7180546402931214, "num_tokens": 31729968.0, "step": 66500 }, { "entropy": 1.0599749970436096, "epoch": 1.6183944942973176, "grad_norm": 11.125, "learning_rate": 9.522217247756215e-06, "loss": 1.0682505035400391, "mean_token_accuracy": 0.7244072079658508, "num_tokens": 31754093.0, "step": 66550 }, { "entropy": 1.026769316792488, "epoch": 1.619610418034581, "grad_norm": 13.375, "learning_rate": 9.508829602841132e-06, "loss": 1.0331168365478516, "mean_token_accuracy": 0.7337642049789429, "num_tokens": 31779700.0, "step": 66600 }, { "entropy": 1.0984537053108214, "epoch": 1.620826341771844, "grad_norm": 76.0, "learning_rate": 9.495442840317149e-06, "loss": 1.1246310424804689, "mean_token_accuracy": 0.7112787652015686, "num_tokens": 31804915.0, "step": 66650 }, { "entropy": 1.0695460307598115, "epoch": 1.6220422655091071, "grad_norm": 14.0625, "learning_rate": 9.482056984233677e-06, "loss": 1.0893602752685547, "mean_token_accuracy": 0.7124170756340027, "num_tokens": 31828285.0, "step": 66700 }, { "entropy": 1.1839474070072173, "epoch": 1.6232581892463704, "grad_norm": 19.0, "learning_rate": 9.468672058638498e-06, "loss": 1.2036026763916015, "mean_token_accuracy": 0.6984344094991684, "num_tokens": 31852791.0, "step": 66750 }, { "entropy": 1.0796971106529236, "epoch": 1.6244741129836338, "grad_norm": 16.625, "learning_rate": 9.455288087577734e-06, "loss": 1.1100633239746094, "mean_token_accuracy": 0.7149677872657776, "num_tokens": 31877768.0, "step": 66800 }, { "entropy": 0.936751050055027, "epoch": 1.6256900367208968, "grad_norm": 10.0, "learning_rate": 9.441905095095776e-06, "loss": 0.9175774383544922, "mean_token_accuracy": 0.7609848654270173, "num_tokens": 31900833.0, "step": 66850 }, { "entropy": 1.0973862099647522, "epoch": 1.62690596045816, "grad_norm": 22.5, "learning_rate": 9.428523105235266e-06, "loss": 1.1298392486572266, "mean_token_accuracy": 0.7127432405948639, "num_tokens": 31923829.0, "step": 66900 }, { "entropy": 1.0765296977758407, "epoch": 1.6281218841954233, "grad_norm": 24.25, "learning_rate": 9.41514214203705e-06, "loss": 1.100014419555664, "mean_token_accuracy": 0.723384667634964, "num_tokens": 31945265.0, "step": 66950 }, { "entropy": 1.0498353844881059, "epoch": 1.6293378079326866, "grad_norm": 11.25, "learning_rate": 9.401762229540116e-06, "loss": 1.0595762634277344, "mean_token_accuracy": 0.7281376594305038, "num_tokens": 31974600.0, "step": 67000 }, { "entropy": 1.0188115096092225, "epoch": 1.6305537316699497, "grad_norm": 13.0, "learning_rate": 9.388383391781576e-06, "loss": 1.0344901275634766, "mean_token_accuracy": 0.7266708171367645, "num_tokens": 31999031.0, "step": 67050 }, { "entropy": 1.0107264894247054, "epoch": 1.6317696554072127, "grad_norm": 12.1875, "learning_rate": 9.375005652796601e-06, "loss": 1.0142886352539062, "mean_token_accuracy": 0.7415572726726531, "num_tokens": 32020426.0, "step": 67100 }, { "entropy": 1.0724192368984222, "epoch": 1.632985579144476, "grad_norm": 12.5625, "learning_rate": 9.361629036618398e-06, "loss": 1.1015989685058594, "mean_token_accuracy": 0.723365763425827, "num_tokens": 32045825.0, "step": 67150 }, { "entropy": 1.1183999180793762, "epoch": 1.6342015028817394, "grad_norm": 22.5, "learning_rate": 9.348253567278156e-06, "loss": 1.1332205200195313, "mean_token_accuracy": 0.7059864443540573, "num_tokens": 32075420.0, "step": 67200 }, { "entropy": 1.0640532743930817, "epoch": 1.6354174266190025, "grad_norm": 15.1875, "learning_rate": 9.334879268804995e-06, "loss": 1.0645923614501953, "mean_token_accuracy": 0.7349377083778381, "num_tokens": 32099805.0, "step": 67250 }, { "entropy": 1.0014745962619782, "epoch": 1.6366333503562656, "grad_norm": 14.75, "learning_rate": 9.32150616522594e-06, "loss": 0.9828022003173829, "mean_token_accuracy": 0.7377782022953033, "num_tokens": 32121892.0, "step": 67300 }, { "entropy": 1.1144921731948854, "epoch": 1.6378492740935289, "grad_norm": 11.8125, "learning_rate": 9.30813428056586e-06, "loss": 1.1302577209472657, "mean_token_accuracy": 0.7183329957723618, "num_tokens": 32143846.0, "step": 67350 }, { "entropy": 1.0845633387565612, "epoch": 1.6390651978307922, "grad_norm": 16.25, "learning_rate": 9.294763638847448e-06, "loss": 1.0930826568603516, "mean_token_accuracy": 0.7247971904277801, "num_tokens": 32163223.0, "step": 67400 }, { "entropy": 1.0691143172979354, "epoch": 1.6402811215680553, "grad_norm": 16.875, "learning_rate": 9.281394264091157e-06, "loss": 1.0936296844482423, "mean_token_accuracy": 0.7192421448230744, "num_tokens": 32185751.0, "step": 67450 }, { "entropy": 1.0743076038360595, "epoch": 1.6414970453053184, "grad_norm": 18.875, "learning_rate": 9.268026180315155e-06, "loss": 1.0979795837402344, "mean_token_accuracy": 0.7250823581218719, "num_tokens": 32207337.0, "step": 67500 }, { "entropy": 1.019801321029663, "epoch": 1.6427129690425817, "grad_norm": 24.0, "learning_rate": 9.254659411535309e-06, "loss": 1.03505615234375, "mean_token_accuracy": 0.7287458467483521, "num_tokens": 32228996.0, "step": 67550 }, { "entropy": 1.1409683209657668, "epoch": 1.643928892779845, "grad_norm": 8.3125, "learning_rate": 9.24129398176511e-06, "loss": 1.1362506866455078, "mean_token_accuracy": 0.7141190952062607, "num_tokens": 32252830.0, "step": 67600 }, { "entropy": 1.0619378983974457, "epoch": 1.645144816517108, "grad_norm": 20.625, "learning_rate": 9.227929915015642e-06, "loss": 1.0478400421142577, "mean_token_accuracy": 0.7284006369113922, "num_tokens": 32274465.0, "step": 67650 }, { "entropy": 1.0488220888376236, "epoch": 1.6463607402543712, "grad_norm": 21.875, "learning_rate": 9.214567235295556e-06, "loss": 1.0724166870117187, "mean_token_accuracy": 0.7193017864227295, "num_tokens": 32297678.0, "step": 67700 }, { "entropy": 1.011282005906105, "epoch": 1.6475766639916345, "grad_norm": 12.875, "learning_rate": 9.20120596661099e-06, "loss": 1.0336949157714843, "mean_token_accuracy": 0.7427577710151673, "num_tokens": 32317910.0, "step": 67750 }, { "entropy": 1.026838657259941, "epoch": 1.6487925877288978, "grad_norm": 9.75, "learning_rate": 9.187846132965563e-06, "loss": 1.0375721740722657, "mean_token_accuracy": 0.7380401903390884, "num_tokens": 32344770.0, "step": 67800 }, { "entropy": 1.0064872580766677, "epoch": 1.6500085114661607, "grad_norm": 12.75, "learning_rate": 9.174487758360315e-06, "loss": 1.0084961700439452, "mean_token_accuracy": 0.7338501906394959, "num_tokens": 32369021.0, "step": 67850 }, { "entropy": 1.1782633018493653, "epoch": 1.651224435203424, "grad_norm": 19.0, "learning_rate": 9.161130866793653e-06, "loss": 1.1907101440429688, "mean_token_accuracy": 0.696990225315094, "num_tokens": 32392091.0, "step": 67900 }, { "entropy": 1.0047927230596543, "epoch": 1.6524403589406873, "grad_norm": 14.0625, "learning_rate": 9.147775482261335e-06, "loss": 1.0052634429931642, "mean_token_accuracy": 0.7349001574516296, "num_tokens": 32416403.0, "step": 67950 }, { "entropy": 1.0878383713960647, "epoch": 1.6536562826779504, "grad_norm": 12.5625, "learning_rate": 9.134421628756393e-06, "loss": 1.1137867736816407, "mean_token_accuracy": 0.7199014520645142, "num_tokens": 32438488.0, "step": 68000 }, { "entropy": 0.9945025563240051, "epoch": 1.6548722064152135, "grad_norm": 19.375, "learning_rate": 9.121069330269129e-06, "loss": 0.9973374938964844, "mean_token_accuracy": 0.7355703639984131, "num_tokens": 32463419.0, "step": 68050 }, { "entropy": 1.0032887864112854, "epoch": 1.6560881301524768, "grad_norm": 14.75, "learning_rate": 9.10771861078704e-06, "loss": 1.0060478973388671, "mean_token_accuracy": 0.7383735978603363, "num_tokens": 32485706.0, "step": 68100 }, { "entropy": 1.0053199070692063, "epoch": 1.6573040538897401, "grad_norm": 16.0, "learning_rate": 9.094369494294783e-06, "loss": 1.0246002197265625, "mean_token_accuracy": 0.7311720204353332, "num_tokens": 32508459.0, "step": 68150 }, { "entropy": 0.9577264618873597, "epoch": 1.6585199776270032, "grad_norm": 10.4375, "learning_rate": 9.081022004774149e-06, "loss": 0.9845348358154297, "mean_token_accuracy": 0.7409092164039612, "num_tokens": 32527994.0, "step": 68200 }, { "entropy": 1.1056599581241608, "epoch": 1.6597359013642663, "grad_norm": 8.625, "learning_rate": 9.06767616620399e-06, "loss": 1.1073206329345704, "mean_token_accuracy": 0.7192136979103089, "num_tokens": 32555799.0, "step": 68250 }, { "entropy": 1.0263287568092345, "epoch": 1.6609518251015296, "grad_norm": 23.625, "learning_rate": 9.0543320025602e-06, "loss": 1.0155023193359376, "mean_token_accuracy": 0.7330289578437805, "num_tokens": 32576460.0, "step": 68300 }, { "entropy": 1.0581214541196824, "epoch": 1.662167748838793, "grad_norm": 11.6875, "learning_rate": 9.040989537815672e-06, "loss": 1.0877339935302734, "mean_token_accuracy": 0.7210550099611283, "num_tokens": 32598974.0, "step": 68350 }, { "entropy": 1.0228035360574723, "epoch": 1.663383672576056, "grad_norm": 9.5625, "learning_rate": 9.027648795940226e-06, "loss": 1.0222586822509765, "mean_token_accuracy": 0.7300154054164887, "num_tokens": 32622471.0, "step": 68400 }, { "entropy": 1.0811778020858764, "epoch": 1.664599596313319, "grad_norm": 24.125, "learning_rate": 9.014309800900605e-06, "loss": 1.0882061767578124, "mean_token_accuracy": 0.7182281732559204, "num_tokens": 32646291.0, "step": 68450 }, { "entropy": 1.0240449601411818, "epoch": 1.6658155200505824, "grad_norm": 14.125, "learning_rate": 9.000972576660414e-06, "loss": 1.0512718963623047, "mean_token_accuracy": 0.7256808745861053, "num_tokens": 32671266.0, "step": 68500 }, { "entropy": 0.9841360259056091, "epoch": 1.6670314437878457, "grad_norm": 17.125, "learning_rate": 8.98763714718006e-06, "loss": 0.9649549102783204, "mean_token_accuracy": 0.7313773572444916, "num_tokens": 32695674.0, "step": 68550 }, { "entropy": 0.9975669091939926, "epoch": 1.6682473675251088, "grad_norm": 13.5625, "learning_rate": 8.974303536416746e-06, "loss": 1.0068740844726562, "mean_token_accuracy": 0.7299323564767838, "num_tokens": 32722345.0, "step": 68600 }, { "entropy": 1.074147093296051, "epoch": 1.669463291262372, "grad_norm": 19.0, "learning_rate": 8.96097176832439e-06, "loss": 1.09708984375, "mean_token_accuracy": 0.7128173834085465, "num_tokens": 32748721.0, "step": 68650 }, { "entropy": 1.052025585770607, "epoch": 1.6706792149996352, "grad_norm": 17.5, "learning_rate": 8.94764186685361e-06, "loss": 1.0399343872070312, "mean_token_accuracy": 0.7336821836233139, "num_tokens": 32768631.0, "step": 68700 }, { "entropy": 1.1523979997634888, "epoch": 1.6718951387368985, "grad_norm": 19.75, "learning_rate": 8.934313855951676e-06, "loss": 1.1594995880126953, "mean_token_accuracy": 0.7196511021256446, "num_tokens": 32791828.0, "step": 68750 }, { "entropy": 0.9662564104795456, "epoch": 1.6731110624741616, "grad_norm": 14.4375, "learning_rate": 8.920987759562444e-06, "loss": 0.9915035247802735, "mean_token_accuracy": 0.7438735580444336, "num_tokens": 32810948.0, "step": 68800 }, { "entropy": 1.043775936961174, "epoch": 1.6743269862114247, "grad_norm": 15.5625, "learning_rate": 8.90766360162635e-06, "loss": 1.0610317230224608, "mean_token_accuracy": 0.7228279387950898, "num_tokens": 32836468.0, "step": 68850 }, { "entropy": 1.113478901386261, "epoch": 1.675542909948688, "grad_norm": 12.875, "learning_rate": 8.894341406080325e-06, "loss": 1.1371955108642577, "mean_token_accuracy": 0.713716961145401, "num_tokens": 32858114.0, "step": 68900 }, { "entropy": 1.1057050436735154, "epoch": 1.6767588336859514, "grad_norm": 17.625, "learning_rate": 8.8810211968578e-06, "loss": 1.1062653350830078, "mean_token_accuracy": 0.7187708413600922, "num_tokens": 32881742.0, "step": 68950 }, { "entropy": 0.9573877030611038, "epoch": 1.6779747574232144, "grad_norm": 18.125, "learning_rate": 8.867702997888623e-06, "loss": 0.9841678619384766, "mean_token_accuracy": 0.7463148033618927, "num_tokens": 32905514.0, "step": 69000 }, { "entropy": 1.0257745754718781, "epoch": 1.6791906811604775, "grad_norm": 15.9375, "learning_rate": 8.854386833099023e-06, "loss": 1.0411515045166015, "mean_token_accuracy": 0.7224098950624466, "num_tokens": 32928228.0, "step": 69050 }, { "entropy": 1.1127609848976134, "epoch": 1.6804066048977409, "grad_norm": 10.6875, "learning_rate": 8.841072726411597e-06, "loss": 1.157367935180664, "mean_token_accuracy": 0.710539065003395, "num_tokens": 32953421.0, "step": 69100 }, { "entropy": 1.0008608794212341, "epoch": 1.6816225286350042, "grad_norm": 30.375, "learning_rate": 8.827760701745227e-06, "loss": 0.9949913024902344, "mean_token_accuracy": 0.7456972825527192, "num_tokens": 32975368.0, "step": 69150 }, { "entropy": 0.9989182889461518, "epoch": 1.6828384523722673, "grad_norm": 6.28125, "learning_rate": 8.814450783015054e-06, "loss": 1.0270294189453124, "mean_token_accuracy": 0.7345108848810196, "num_tokens": 33001854.0, "step": 69200 }, { "entropy": 1.1569655400514602, "epoch": 1.6840543761095303, "grad_norm": 11.875, "learning_rate": 8.801142994132454e-06, "loss": 1.1840201568603517, "mean_token_accuracy": 0.7060720956325531, "num_tokens": 33024902.0, "step": 69250 }, { "entropy": 1.037959550023079, "epoch": 1.6852702998467937, "grad_norm": 9.4375, "learning_rate": 8.787837359004954e-06, "loss": 1.0231490325927735, "mean_token_accuracy": 0.7313673973083497, "num_tokens": 33051135.0, "step": 69300 }, { "entropy": 1.00947798371315, "epoch": 1.6864862235840568, "grad_norm": 10.625, "learning_rate": 8.774533901536221e-06, "loss": 1.015967559814453, "mean_token_accuracy": 0.7301330143213272, "num_tokens": 33076871.0, "step": 69350 }, { "entropy": 1.0708277988433839, "epoch": 1.6877021473213198, "grad_norm": 13.875, "learning_rate": 8.761232645626018e-06, "loss": 1.0858914947509766, "mean_token_accuracy": 0.7229865598678589, "num_tokens": 33103913.0, "step": 69400 }, { "entropy": 1.0402701640129088, "epoch": 1.6889180710585832, "grad_norm": 16.0, "learning_rate": 8.74793361517014e-06, "loss": 1.0477133178710938, "mean_token_accuracy": 0.7327624940872193, "num_tokens": 33129154.0, "step": 69450 }, { "entropy": 1.0816839975118637, "epoch": 1.6901339947958465, "grad_norm": 11.0, "learning_rate": 8.734636834060393e-06, "loss": 1.105973587036133, "mean_token_accuracy": 0.7140889430046081, "num_tokens": 33154867.0, "step": 69500 }, { "entropy": 1.1055969923734665, "epoch": 1.6913499185331096, "grad_norm": 14.5625, "learning_rate": 8.721342326184531e-06, "loss": 1.119363021850586, "mean_token_accuracy": 0.7194713914394378, "num_tokens": 33179262.0, "step": 69550 }, { "entropy": 1.0643524730205536, "epoch": 1.6925658422703727, "grad_norm": 19.0, "learning_rate": 8.70805011542624e-06, "loss": 1.0867516326904296, "mean_token_accuracy": 0.7215639835596085, "num_tokens": 33203496.0, "step": 69600 }, { "entropy": 1.0608297330141068, "epoch": 1.693781766007636, "grad_norm": 11.0625, "learning_rate": 8.694760225665068e-06, "loss": 1.060484619140625, "mean_token_accuracy": 0.7258275103569031, "num_tokens": 33225793.0, "step": 69650 }, { "entropy": 1.0103814542293548, "epoch": 1.6949976897448993, "grad_norm": 14.5625, "learning_rate": 8.681472680776395e-06, "loss": 1.021062774658203, "mean_token_accuracy": 0.7361329030990601, "num_tokens": 33250131.0, "step": 69700 }, { "entropy": 1.1228554129600525, "epoch": 1.6962136134821624, "grad_norm": 22.0, "learning_rate": 8.668187504631393e-06, "loss": 1.1436370849609374, "mean_token_accuracy": 0.709384183883667, "num_tokens": 33272454.0, "step": 69750 }, { "entropy": 1.0939618754386902, "epoch": 1.6974295372194255, "grad_norm": 15.5625, "learning_rate": 8.654904721096968e-06, "loss": 1.1170790100097656, "mean_token_accuracy": 0.7181785351037979, "num_tokens": 33295442.0, "step": 69800 }, { "entropy": 1.079058591723442, "epoch": 1.6986454609566888, "grad_norm": 13.0625, "learning_rate": 8.64162435403574e-06, "loss": 1.0945380401611329, "mean_token_accuracy": 0.7229240727424622, "num_tokens": 33318995.0, "step": 69850 }, { "entropy": 1.0373855489492416, "epoch": 1.699861384693952, "grad_norm": 14.5625, "learning_rate": 8.628346427305983e-06, "loss": 1.064837417602539, "mean_token_accuracy": 0.7254097157716751, "num_tokens": 33342751.0, "step": 69900 }, { "entropy": 1.0746212255954743, "epoch": 1.7010773084312152, "grad_norm": 18.875, "learning_rate": 8.615070964761582e-06, "loss": 1.0860567474365235, "mean_token_accuracy": 0.7251433753967285, "num_tokens": 33365601.0, "step": 69950 }, { "entropy": 1.092942132949829, "epoch": 1.7022932321684783, "grad_norm": 29.125, "learning_rate": 8.601797990252005e-06, "loss": 1.1074642944335937, "mean_token_accuracy": 0.7142662799358368, "num_tokens": 33390871.0, "step": 70000 }, { "epoch": 1.7022932321684783, "eval_entropy": 1.1229370533784049, "eval_loss": 1.29446280002594, "eval_mean_token_accuracy": 0.6798908558043695, "eval_num_tokens": 33390871.0, "eval_runtime": 391.1911, "eval_samples_per_second": 11.68, "eval_steps_per_second": 11.68, "step": 70000 }, { "entropy": 1.0442512375116348, "epoch": 1.7035091559057416, "grad_norm": 15.8125, "learning_rate": 8.588527527622242e-06, "loss": 1.0685316467285155, "mean_token_accuracy": 0.7269330286979675, "num_tokens": 33415214.0, "step": 70050 }, { "entropy": 1.0386088275909424, "epoch": 1.704725079643005, "grad_norm": 27.375, "learning_rate": 8.575259600712768e-06, "loss": 1.0333318328857422, "mean_token_accuracy": 0.7328995239734649, "num_tokens": 33437379.0, "step": 70100 }, { "entropy": 1.1140181308984756, "epoch": 1.705941003380268, "grad_norm": 12.4375, "learning_rate": 8.56199423335952e-06, "loss": 1.153847427368164, "mean_token_accuracy": 0.7202204179763794, "num_tokens": 33463665.0, "step": 70150 }, { "entropy": 1.0018051832914352, "epoch": 1.707156927117531, "grad_norm": 31.625, "learning_rate": 8.548731449393814e-06, "loss": 0.9668994140625, "mean_token_accuracy": 0.7481958544254304, "num_tokens": 33485293.0, "step": 70200 }, { "entropy": 1.0815638428926468, "epoch": 1.7083728508547944, "grad_norm": 12.125, "learning_rate": 8.535471272642337e-06, "loss": 1.1067759704589843, "mean_token_accuracy": 0.7181530106067657, "num_tokens": 33507156.0, "step": 70250 }, { "entropy": 1.149090050458908, "epoch": 1.7095887745920577, "grad_norm": 15.25, "learning_rate": 8.522213726927096e-06, "loss": 1.1576456451416015, "mean_token_accuracy": 0.7027329939603806, "num_tokens": 33533228.0, "step": 70300 }, { "entropy": 1.0220014083385467, "epoch": 1.7108046983293208, "grad_norm": 42.75, "learning_rate": 8.508958836065359e-06, "loss": 1.0339404296875, "mean_token_accuracy": 0.7327229750156402, "num_tokens": 33559552.0, "step": 70350 }, { "entropy": 1.1357226657867432, "epoch": 1.712020622066584, "grad_norm": 25.5, "learning_rate": 8.495706623869638e-06, "loss": 1.135846405029297, "mean_token_accuracy": 0.7130814743041992, "num_tokens": 33582297.0, "step": 70400 }, { "entropy": 1.1107436972856521, "epoch": 1.7132365458038472, "grad_norm": 11.9375, "learning_rate": 8.482457114147615e-06, "loss": 1.11326171875, "mean_token_accuracy": 0.719298437833786, "num_tokens": 33603991.0, "step": 70450 }, { "entropy": 0.992585446536541, "epoch": 1.7144524695411105, "grad_norm": 20.25, "learning_rate": 8.46921033070214e-06, "loss": 0.9950186157226563, "mean_token_accuracy": 0.7387228322029114, "num_tokens": 33628874.0, "step": 70500 }, { "entropy": 0.9889318466186523, "epoch": 1.7156683932783736, "grad_norm": 8.1875, "learning_rate": 8.45596629733115e-06, "loss": 0.9724835968017578, "mean_token_accuracy": 0.7459668344259263, "num_tokens": 33651818.0, "step": 70550 }, { "entropy": 0.9125360438227653, "epoch": 1.7168843170156367, "grad_norm": 11.125, "learning_rate": 8.442725037827639e-06, "loss": 0.9093623352050781, "mean_token_accuracy": 0.7505097353458404, "num_tokens": 33675927.0, "step": 70600 }, { "entropy": 1.1178356647491454, "epoch": 1.7181002407529, "grad_norm": 12.625, "learning_rate": 8.429486575979632e-06, "loss": 1.1237918090820314, "mean_token_accuracy": 0.714579911828041, "num_tokens": 33700849.0, "step": 70650 }, { "entropy": 1.1304315412044526, "epoch": 1.7193161644901633, "grad_norm": 9.4375, "learning_rate": 8.416250935570112e-06, "loss": 1.1629947662353515, "mean_token_accuracy": 0.7173077464103699, "num_tokens": 33722340.0, "step": 70700 }, { "entropy": 1.0112928766012192, "epoch": 1.7205320882274264, "grad_norm": 15.1875, "learning_rate": 8.403018140377003e-06, "loss": 1.0123699188232422, "mean_token_accuracy": 0.7389455687999725, "num_tokens": 33742965.0, "step": 70750 }, { "entropy": 0.9886850190162658, "epoch": 1.7217480119646895, "grad_norm": 36.25, "learning_rate": 8.389788214173124e-06, "loss": 1.008373031616211, "mean_token_accuracy": 0.7456795972585678, "num_tokens": 33766183.0, "step": 70800 }, { "entropy": 0.9971941965818405, "epoch": 1.7229639357019528, "grad_norm": 13.3125, "learning_rate": 8.376561180726115e-06, "loss": 0.9932032775878906, "mean_token_accuracy": 0.7401838338375092, "num_tokens": 33793098.0, "step": 70850 }, { "entropy": 0.94774609208107, "epoch": 1.724179859439216, "grad_norm": 11.5, "learning_rate": 8.363337063798448e-06, "loss": 0.9744734191894531, "mean_token_accuracy": 0.7401870906352996, "num_tokens": 33818194.0, "step": 70900 }, { "entropy": 1.1268050354719161, "epoch": 1.725395783176479, "grad_norm": 11.5625, "learning_rate": 8.35011588714734e-06, "loss": 1.1616381072998048, "mean_token_accuracy": 0.7112197893857956, "num_tokens": 33844310.0, "step": 70950 }, { "entropy": 1.0492735183238984, "epoch": 1.7266117069137423, "grad_norm": 18.625, "learning_rate": 8.33689767452472e-06, "loss": 1.0506687927246094, "mean_token_accuracy": 0.7335241830348969, "num_tokens": 33866351.0, "step": 71000 }, { "entropy": 1.1009494584798813, "epoch": 1.7278276306510056, "grad_norm": 43.25, "learning_rate": 8.323682449677214e-06, "loss": 1.1446448516845704, "mean_token_accuracy": 0.7103006196022034, "num_tokens": 33891002.0, "step": 71050 }, { "entropy": 1.095614368915558, "epoch": 1.7290435543882687, "grad_norm": 29.25, "learning_rate": 8.310470236346057e-06, "loss": 1.1022552490234374, "mean_token_accuracy": 0.7195475375652314, "num_tokens": 33911713.0, "step": 71100 }, { "entropy": 1.0995247852802277, "epoch": 1.7302594781255318, "grad_norm": 15.0, "learning_rate": 8.297261058267087e-06, "loss": 1.099834213256836, "mean_token_accuracy": 0.7160825461149216, "num_tokens": 33934641.0, "step": 71150 }, { "entropy": 1.1165579396486283, "epoch": 1.7314754018627951, "grad_norm": 30.0, "learning_rate": 8.28405493917069e-06, "loss": 1.143903045654297, "mean_token_accuracy": 0.7024952089786529, "num_tokens": 33958607.0, "step": 71200 }, { "entropy": 1.0383460253477097, "epoch": 1.7326913256000585, "grad_norm": 35.75, "learning_rate": 8.270851902781749e-06, "loss": 1.0621559143066406, "mean_token_accuracy": 0.7276467025279999, "num_tokens": 33980319.0, "step": 71250 }, { "entropy": 1.088269486427307, "epoch": 1.7339072493373215, "grad_norm": 9.125, "learning_rate": 8.257651972819612e-06, "loss": 1.1059480285644532, "mean_token_accuracy": 0.7221580111980438, "num_tokens": 34004652.0, "step": 71300 }, { "entropy": 1.1358056890964507, "epoch": 1.7351231730745846, "grad_norm": 23.875, "learning_rate": 8.244455172998042e-06, "loss": 1.1221353912353516, "mean_token_accuracy": 0.705435322523117, "num_tokens": 34028044.0, "step": 71350 }, { "entropy": 0.9756538590788841, "epoch": 1.736339096811848, "grad_norm": 13.5625, "learning_rate": 8.231261527025191e-06, "loss": 0.9748752593994141, "mean_token_accuracy": 0.7384315800666809, "num_tokens": 34053359.0, "step": 71400 }, { "entropy": 1.1455681651830674, "epoch": 1.7375550205491113, "grad_norm": 17.125, "learning_rate": 8.218071058603539e-06, "loss": 1.1752926635742187, "mean_token_accuracy": 0.6997312372922897, "num_tokens": 34077104.0, "step": 71450 }, { "entropy": 1.0452940624952316, "epoch": 1.7387709442863744, "grad_norm": 13.75, "learning_rate": 8.204883791429846e-06, "loss": 1.0778826141357423, "mean_token_accuracy": 0.7205056589841843, "num_tokens": 34105277.0, "step": 71500 }, { "entropy": 1.0813349407911301, "epoch": 1.7399868680236374, "grad_norm": 14.375, "learning_rate": 8.19169974919514e-06, "loss": 1.0824181365966796, "mean_token_accuracy": 0.7185827177762986, "num_tokens": 34130807.0, "step": 71550 }, { "entropy": 1.0226768180727959, "epoch": 1.7412027917609008, "grad_norm": 21.25, "learning_rate": 8.178518955584642e-06, "loss": 1.0222006225585938, "mean_token_accuracy": 0.7352511239051819, "num_tokens": 34152577.0, "step": 71600 }, { "entropy": 1.1120004951953888, "epoch": 1.742418715498164, "grad_norm": 14.375, "learning_rate": 8.165341434277738e-06, "loss": 1.1094679260253906, "mean_token_accuracy": 0.7253813207149505, "num_tokens": 34176658.0, "step": 71650 }, { "entropy": 0.9561682558059692, "epoch": 1.7436346392354272, "grad_norm": 23.625, "learning_rate": 8.152167208947949e-06, "loss": 0.9411027526855469, "mean_token_accuracy": 0.7569305300712585, "num_tokens": 34195604.0, "step": 71700 }, { "entropy": 0.9948094290494919, "epoch": 1.7448505629726903, "grad_norm": 13.5625, "learning_rate": 8.138996303262854e-06, "loss": 1.0040502166748047, "mean_token_accuracy": 0.7394539046287537, "num_tokens": 34218858.0, "step": 71750 }, { "entropy": 1.030834939479828, "epoch": 1.7460664867099536, "grad_norm": 46.75, "learning_rate": 8.125828740884084e-06, "loss": 1.0358871459960937, "mean_token_accuracy": 0.7305410599708557, "num_tokens": 34243862.0, "step": 71800 }, { "entropy": 0.9884623342752457, "epoch": 1.7472824104472169, "grad_norm": 13.0, "learning_rate": 8.112664545467252e-06, "loss": 1.0014740753173828, "mean_token_accuracy": 0.7422335493564606, "num_tokens": 34267088.0, "step": 71850 }, { "entropy": 1.0598668295145035, "epoch": 1.74849833418448, "grad_norm": 11.3125, "learning_rate": 8.099503740661934e-06, "loss": 1.1153407287597656, "mean_token_accuracy": 0.7164733904600143, "num_tokens": 34291327.0, "step": 71900 }, { "entropy": 1.0675525486469268, "epoch": 1.749714257921743, "grad_norm": 27.875, "learning_rate": 8.086346350111609e-06, "loss": 1.0591020202636718, "mean_token_accuracy": 0.7221309840679169, "num_tokens": 34317051.0, "step": 71950 }, { "entropy": 1.1070539611577987, "epoch": 1.7509301816590064, "grad_norm": 18.25, "learning_rate": 8.073192397453618e-06, "loss": 1.1438373565673827, "mean_token_accuracy": 0.7121417802572251, "num_tokens": 34345592.0, "step": 72000 }, { "entropy": 1.0691108202934265, "epoch": 1.7521461053962697, "grad_norm": 14.875, "learning_rate": 8.06004190631913e-06, "loss": 1.08783203125, "mean_token_accuracy": 0.719677472114563, "num_tokens": 34368168.0, "step": 72050 }, { "entropy": 1.0903123283386231, "epoch": 1.7533620291335328, "grad_norm": 10.5, "learning_rate": 8.046894900333105e-06, "loss": 1.1145364379882812, "mean_token_accuracy": 0.7160152173042298, "num_tokens": 34395670.0, "step": 72100 }, { "entropy": 0.9747272652387619, "epoch": 1.7545779528707959, "grad_norm": 13.875, "learning_rate": 8.033751403114223e-06, "loss": 0.9733143615722656, "mean_token_accuracy": 0.7432304179668426, "num_tokens": 34417793.0, "step": 72150 }, { "entropy": 1.1634168457984924, "epoch": 1.7557938766080592, "grad_norm": 22.875, "learning_rate": 8.020611438274873e-06, "loss": 1.1870079803466798, "mean_token_accuracy": 0.700301178097725, "num_tokens": 34439174.0, "step": 72200 }, { "entropy": 1.0579523706436158, "epoch": 1.7570098003453225, "grad_norm": 12.75, "learning_rate": 8.00747502942109e-06, "loss": 1.0732541656494141, "mean_token_accuracy": 0.7267381715774536, "num_tokens": 34461833.0, "step": 72250 }, { "entropy": 1.0046902287006378, "epoch": 1.7582257240825856, "grad_norm": 15.1875, "learning_rate": 7.994342200152532e-06, "loss": 0.9949966430664062, "mean_token_accuracy": 0.7425755989551545, "num_tokens": 34483564.0, "step": 72300 }, { "entropy": 1.024497910141945, "epoch": 1.7594416478198487, "grad_norm": 26.875, "learning_rate": 7.981212974062421e-06, "loss": 1.031656723022461, "mean_token_accuracy": 0.7231290292739868, "num_tokens": 34504916.0, "step": 72350 }, { "entropy": 1.0046057718992234, "epoch": 1.760657571557112, "grad_norm": 21.75, "learning_rate": 7.968087374737496e-06, "loss": 1.0435999298095704, "mean_token_accuracy": 0.7381795716285705, "num_tokens": 34525630.0, "step": 72400 }, { "entropy": 1.1075061583518981, "epoch": 1.761873495294375, "grad_norm": 17.625, "learning_rate": 7.954965425757999e-06, "loss": 1.1157183074951171, "mean_token_accuracy": 0.7182832825183868, "num_tokens": 34545191.0, "step": 72450 }, { "entropy": 1.05896630525589, "epoch": 1.7630894190316382, "grad_norm": 14.625, "learning_rate": 7.941847150697598e-06, "loss": 1.0796356201171875, "mean_token_accuracy": 0.7262122595310211, "num_tokens": 34569061.0, "step": 72500 }, { "entropy": 1.024133824110031, "epoch": 1.7643053427689015, "grad_norm": 15.1875, "learning_rate": 7.928732573123365e-06, "loss": 1.0317518615722656, "mean_token_accuracy": 0.7292092037200928, "num_tokens": 34593937.0, "step": 72550 }, { "entropy": 1.1158144462108612, "epoch": 1.7655212665061648, "grad_norm": 11.8125, "learning_rate": 7.915621716595742e-06, "loss": 1.1229063415527343, "mean_token_accuracy": 0.7156474220752717, "num_tokens": 34616670.0, "step": 72600 }, { "entropy": 1.0233523094654082, "epoch": 1.766737190243428, "grad_norm": 13.5, "learning_rate": 7.902514604668465e-06, "loss": 1.0340209197998047, "mean_token_accuracy": 0.7318562513589859, "num_tokens": 34640297.0, "step": 72650 }, { "entropy": 1.0465597760677339, "epoch": 1.767953113980691, "grad_norm": 9.375, "learning_rate": 7.88941126088856e-06, "loss": 1.0432003021240235, "mean_token_accuracy": 0.7247599411010742, "num_tokens": 34662557.0, "step": 72700 }, { "entropy": 1.0743946665525437, "epoch": 1.7691690377179543, "grad_norm": 23.75, "learning_rate": 7.876311708796269e-06, "loss": 1.071804962158203, "mean_token_accuracy": 0.723683375120163, "num_tokens": 34686842.0, "step": 72750 }, { "entropy": 1.0081606167554855, "epoch": 1.7703849614552176, "grad_norm": 17.875, "learning_rate": 7.863215971925039e-06, "loss": 1.03400146484375, "mean_token_accuracy": 0.7327864468097687, "num_tokens": 34708648.0, "step": 72800 }, { "entropy": 1.087872143983841, "epoch": 1.7716008851924807, "grad_norm": 22.125, "learning_rate": 7.850124073801452e-06, "loss": 1.0985821533203124, "mean_token_accuracy": 0.7187406206130982, "num_tokens": 34732500.0, "step": 72850 }, { "entropy": 1.084412204027176, "epoch": 1.7728168089297438, "grad_norm": 20.375, "learning_rate": 7.837036037945189e-06, "loss": 1.100673370361328, "mean_token_accuracy": 0.7160728812217713, "num_tokens": 34755232.0, "step": 72900 }, { "entropy": 1.0253525710105895, "epoch": 1.7740327326670071, "grad_norm": 16.75, "learning_rate": 7.823951887869008e-06, "loss": 1.0271630859375, "mean_token_accuracy": 0.7336431193351746, "num_tokens": 34780546.0, "step": 72950 }, { "entropy": 1.086661987900734, "epoch": 1.7752486564042704, "grad_norm": 11.3125, "learning_rate": 7.810871647078674e-06, "loss": 1.1216253662109374, "mean_token_accuracy": 0.7142149996757508, "num_tokens": 34802626.0, "step": 73000 }, { "entropy": 1.0291650062799453, "epoch": 1.7764645801415335, "grad_norm": 19.125, "learning_rate": 7.797795339072928e-06, "loss": 1.0332555389404297, "mean_token_accuracy": 0.733384656906128, "num_tokens": 34827140.0, "step": 73050 }, { "entropy": 1.0352059358358383, "epoch": 1.7776805038787966, "grad_norm": 11.125, "learning_rate": 7.78472298734346e-06, "loss": 1.0587666320800782, "mean_token_accuracy": 0.7287163043022156, "num_tokens": 34851632.0, "step": 73100 }, { "entropy": 1.055933382511139, "epoch": 1.77889642761606, "grad_norm": 14.5, "learning_rate": 7.771654615374831e-06, "loss": 1.069560775756836, "mean_token_accuracy": 0.7195778524875641, "num_tokens": 34876384.0, "step": 73150 }, { "entropy": 1.1149441659450532, "epoch": 1.7801123513533232, "grad_norm": 14.3125, "learning_rate": 7.75859024664447e-06, "loss": 1.122978744506836, "mean_token_accuracy": 0.7105575811862945, "num_tokens": 34901438.0, "step": 73200 }, { "entropy": 0.9422327160835267, "epoch": 1.7813282750905863, "grad_norm": 5.78125, "learning_rate": 7.745529904622609e-06, "loss": 0.9555217742919921, "mean_token_accuracy": 0.7482283401489258, "num_tokens": 34926957.0, "step": 73250 }, { "entropy": 1.0511515462398529, "epoch": 1.7825441988278494, "grad_norm": 15.5625, "learning_rate": 7.732473612772239e-06, "loss": 1.0458245086669922, "mean_token_accuracy": 0.7231554687023163, "num_tokens": 34952537.0, "step": 73300 }, { "entropy": 1.0102924287319184, "epoch": 1.7837601225651127, "grad_norm": 10.8125, "learning_rate": 7.719421394549089e-06, "loss": 1.013709259033203, "mean_token_accuracy": 0.7332949113845825, "num_tokens": 34975895.0, "step": 73350 }, { "entropy": 1.0181903803348542, "epoch": 1.784976046302376, "grad_norm": 16.625, "learning_rate": 7.706373273401553e-06, "loss": 1.0394078063964844, "mean_token_accuracy": 0.7343083220720291, "num_tokens": 35000375.0, "step": 73400 }, { "entropy": 1.040903257727623, "epoch": 1.7861919700396391, "grad_norm": 16.75, "learning_rate": 7.693329272770678e-06, "loss": 1.0278367614746093, "mean_token_accuracy": 0.7279182970523834, "num_tokens": 35024680.0, "step": 73450 }, { "entropy": 1.1374514979124069, "epoch": 1.7874078937769022, "grad_norm": 21.0, "learning_rate": 7.680289416090105e-06, "loss": 1.1526802062988282, "mean_token_accuracy": 0.7146807771921158, "num_tokens": 35048724.0, "step": 73500 }, { "entropy": 1.0229621183872224, "epoch": 1.7886238175141655, "grad_norm": 21.375, "learning_rate": 7.667253726786026e-06, "loss": 1.0105319213867188, "mean_token_accuracy": 0.7389314037561416, "num_tokens": 35069671.0, "step": 73550 }, { "entropy": 0.9808883088827133, "epoch": 1.7898397412514289, "grad_norm": 12.1875, "learning_rate": 7.65422222827715e-06, "loss": 1.021462631225586, "mean_token_accuracy": 0.7325612461566925, "num_tokens": 35090900.0, "step": 73600 }, { "entropy": 0.983916203379631, "epoch": 1.791055664988692, "grad_norm": 15.875, "learning_rate": 7.641194943974654e-06, "loss": 0.9925391387939453, "mean_token_accuracy": 0.7339942276477813, "num_tokens": 35115098.0, "step": 73650 }, { "entropy": 0.9810805606842041, "epoch": 1.792271588725955, "grad_norm": 13.625, "learning_rate": 7.62817189728215e-06, "loss": 0.9826386260986328, "mean_token_accuracy": 0.7374123322963715, "num_tokens": 35138140.0, "step": 73700 }, { "entropy": 1.0356188893318177, "epoch": 1.7934875124632184, "grad_norm": 13.625, "learning_rate": 7.615153111595635e-06, "loss": 1.0390327453613282, "mean_token_accuracy": 0.7248045754432678, "num_tokens": 35162493.0, "step": 73750 }, { "entropy": 1.1191011798381805, "epoch": 1.7947034362004817, "grad_norm": 12.5, "learning_rate": 7.6021386103034425e-06, "loss": 1.1039617919921876, "mean_token_accuracy": 0.725299054980278, "num_tokens": 35183195.0, "step": 73800 }, { "entropy": 1.0024803733825685, "epoch": 1.7959193599377445, "grad_norm": 19.625, "learning_rate": 7.589128416786222e-06, "loss": 1.021378860473633, "mean_token_accuracy": 0.7368553912639618, "num_tokens": 35206285.0, "step": 73850 }, { "entropy": 0.9965019398927688, "epoch": 1.7971352836750079, "grad_norm": 24.375, "learning_rate": 7.57612255441688e-06, "loss": 1.0125430297851563, "mean_token_accuracy": 0.7386737358570099, "num_tokens": 35228558.0, "step": 73900 }, { "entropy": 1.0740536284446716, "epoch": 1.7983512074122712, "grad_norm": 19.75, "learning_rate": 7.563121046560533e-06, "loss": 1.1042209625244142, "mean_token_accuracy": 0.7175178867578507, "num_tokens": 35252009.0, "step": 73950 }, { "entropy": 0.9655701124668121, "epoch": 1.7995671311495343, "grad_norm": 34.5, "learning_rate": 7.55012391657449e-06, "loss": 0.9598641967773438, "mean_token_accuracy": 0.7535808849334716, "num_tokens": 35272979.0, "step": 74000 }, { "entropy": 1.0058109545707703, "epoch": 1.8007830548867974, "grad_norm": 21.25, "learning_rate": 7.537131187808182e-06, "loss": 1.0288771057128907, "mean_token_accuracy": 0.732348370552063, "num_tokens": 35301762.0, "step": 74050 }, { "entropy": 0.9578034776449204, "epoch": 1.8019989786240607, "grad_norm": 17.75, "learning_rate": 7.524142883603139e-06, "loss": 0.9355921936035156, "mean_token_accuracy": 0.7500776195526123, "num_tokens": 35327569.0, "step": 74100 }, { "entropy": 1.0229935055971147, "epoch": 1.803214902361324, "grad_norm": 27.75, "learning_rate": 7.511159027292944e-06, "loss": 1.0583384704589844, "mean_token_accuracy": 0.7293102836608887, "num_tokens": 35349174.0, "step": 74150 }, { "entropy": 1.1182122695446015, "epoch": 1.804430826098587, "grad_norm": 14.6875, "learning_rate": 7.498179642203184e-06, "loss": 1.1032311248779296, "mean_token_accuracy": 0.7169514489173889, "num_tokens": 35369903.0, "step": 74200 }, { "entropy": 1.0567125964164734, "epoch": 1.8056467498358502, "grad_norm": 11.5, "learning_rate": 7.485204751651419e-06, "loss": 1.0755728912353515, "mean_token_accuracy": 0.7200210154056549, "num_tokens": 35393575.0, "step": 74250 }, { "entropy": 1.0474311012029647, "epoch": 1.8068626735731135, "grad_norm": 17.0, "learning_rate": 7.472234378947129e-06, "loss": 1.0598624420166016, "mean_token_accuracy": 0.7259575021266937, "num_tokens": 35416135.0, "step": 74300 }, { "entropy": 1.0675765758752822, "epoch": 1.8080785973103768, "grad_norm": 13.4375, "learning_rate": 7.4592685473916784e-06, "loss": 1.0875867462158204, "mean_token_accuracy": 0.7254581654071808, "num_tokens": 35439276.0, "step": 74350 }, { "entropy": 0.9786824309825897, "epoch": 1.8092945210476399, "grad_norm": 15.25, "learning_rate": 7.446307280278283e-06, "loss": 0.99627197265625, "mean_token_accuracy": 0.7309071195125579, "num_tokens": 35463370.0, "step": 74400 }, { "entropy": 1.1200006073713302, "epoch": 1.810510444784903, "grad_norm": 37.25, "learning_rate": 7.4333506008919445e-06, "loss": 1.1324295043945312, "mean_token_accuracy": 0.7170352709293365, "num_tokens": 35486628.0, "step": 74450 }, { "entropy": 1.0085653460025787, "epoch": 1.8117263685221663, "grad_norm": 16.375, "learning_rate": 7.420398532509434e-06, "loss": 1.004126663208008, "mean_token_accuracy": 0.7322060012817383, "num_tokens": 35508938.0, "step": 74500 }, { "entropy": 1.1187127393484115, "epoch": 1.8129422922594296, "grad_norm": 9.625, "learning_rate": 7.407451098399225e-06, "loss": 1.1402481842041015, "mean_token_accuracy": 0.7131182980537415, "num_tokens": 35530740.0, "step": 74550 }, { "entropy": 1.069097700715065, "epoch": 1.8141582159966927, "grad_norm": 18.0, "learning_rate": 7.394508321821485e-06, "loss": 1.0598419952392577, "mean_token_accuracy": 0.737654978632927, "num_tokens": 35553645.0, "step": 74600 }, { "entropy": 1.0395387399196625, "epoch": 1.8153741397339558, "grad_norm": 9.8125, "learning_rate": 7.381570226028002e-06, "loss": 1.0700325775146484, "mean_token_accuracy": 0.7237686324119568, "num_tokens": 35580020.0, "step": 74650 }, { "entropy": 1.1005397623777389, "epoch": 1.816590063471219, "grad_norm": 23.375, "learning_rate": 7.3686368342621496e-06, "loss": 1.1081439971923828, "mean_token_accuracy": 0.7222279858589172, "num_tokens": 35601087.0, "step": 74700 }, { "entropy": 1.0459712952375413, "epoch": 1.8178059872084824, "grad_norm": 16.25, "learning_rate": 7.355708169758864e-06, "loss": 1.0672029113769532, "mean_token_accuracy": 0.7223205411434174, "num_tokens": 35621988.0, "step": 74750 }, { "entropy": 1.0836662030220032, "epoch": 1.8190219109457455, "grad_norm": 14.6875, "learning_rate": 7.342784255744584e-06, "loss": 1.098870086669922, "mean_token_accuracy": 0.7200578236579895, "num_tokens": 35645226.0, "step": 74800 }, { "entropy": 1.0720903623104094, "epoch": 1.8202378346830086, "grad_norm": 24.5, "learning_rate": 7.329865115437205e-06, "loss": 1.0896231842041015, "mean_token_accuracy": 0.7228025984764099, "num_tokens": 35671656.0, "step": 74850 }, { "entropy": 1.1221754902601242, "epoch": 1.821453758420272, "grad_norm": 11.3125, "learning_rate": 7.3169507720460635e-06, "loss": 1.1435344696044922, "mean_token_accuracy": 0.7020730662345886, "num_tokens": 35695130.0, "step": 74900 }, { "entropy": 1.028617109656334, "epoch": 1.8226696821575352, "grad_norm": 10.1875, "learning_rate": 7.3040412487718626e-06, "loss": 1.0491188049316407, "mean_token_accuracy": 0.729990895986557, "num_tokens": 35721203.0, "step": 74950 }, { "entropy": 1.1468114030361176, "epoch": 1.8238856058947983, "grad_norm": 14.5625, "learning_rate": 7.291136568806651e-06, "loss": 1.15789306640625, "mean_token_accuracy": 0.7031688541173935, "num_tokens": 35744332.0, "step": 75000 }, { "entropy": 1.1056769454479218, "epoch": 1.8251015296320614, "grad_norm": 15.75, "learning_rate": 7.278236755333784e-06, "loss": 1.1230496978759765, "mean_token_accuracy": 0.7072365140914917, "num_tokens": 35767548.0, "step": 75050 }, { "entropy": 1.0202328419685365, "epoch": 1.8263174533693247, "grad_norm": 14.375, "learning_rate": 7.265341831527864e-06, "loss": 1.0380425262451172, "mean_token_accuracy": 0.7268264973163605, "num_tokens": 35789862.0, "step": 75100 }, { "entropy": 1.0828881645202637, "epoch": 1.827533377106588, "grad_norm": 15.9375, "learning_rate": 7.252451820554713e-06, "loss": 1.0777157592773436, "mean_token_accuracy": 0.7180150645971298, "num_tokens": 35813699.0, "step": 75150 }, { "entropy": 1.0564890456199647, "epoch": 1.8287493008438511, "grad_norm": 8.75, "learning_rate": 7.2395667455713255e-06, "loss": 1.0702208709716796, "mean_token_accuracy": 0.729250454902649, "num_tokens": 35841468.0, "step": 75200 }, { "entropy": 1.1254418164491653, "epoch": 1.8299652245811142, "grad_norm": 17.875, "learning_rate": 7.226686629725834e-06, "loss": 1.1073995971679687, "mean_token_accuracy": 0.7241263973712921, "num_tokens": 35864031.0, "step": 75250 }, { "entropy": 1.0236229085922242, "epoch": 1.8311811483183775, "grad_norm": 19.625, "learning_rate": 7.213811496157457e-06, "loss": 1.033991241455078, "mean_token_accuracy": 0.7266656649112702, "num_tokens": 35889566.0, "step": 75300 }, { "entropy": 1.0426781684160233, "epoch": 1.8323970720556406, "grad_norm": 10.5, "learning_rate": 7.200941367996458e-06, "loss": 1.035079345703125, "mean_token_accuracy": 0.7277414786815644, "num_tokens": 35912350.0, "step": 75350 }, { "entropy": 1.054875375032425, "epoch": 1.8336129957929037, "grad_norm": 15.25, "learning_rate": 7.188076268364121e-06, "loss": 1.0497119903564454, "mean_token_accuracy": 0.7297617024183274, "num_tokens": 35932554.0, "step": 75400 }, { "entropy": 1.1232732433080672, "epoch": 1.834828919530167, "grad_norm": 12.6875, "learning_rate": 7.175216220372683e-06, "loss": 1.1495374298095704, "mean_token_accuracy": 0.711915236711502, "num_tokens": 35961484.0, "step": 75450 }, { "entropy": 1.1237189295887946, "epoch": 1.8360448432674303, "grad_norm": 15.0625, "learning_rate": 7.162361247125311e-06, "loss": 1.155655288696289, "mean_token_accuracy": 0.7123685938119888, "num_tokens": 35989802.0, "step": 75500 }, { "entropy": 1.024847149848938, "epoch": 1.8372607670046934, "grad_norm": 12.75, "learning_rate": 7.1495113717160645e-06, "loss": 1.0286493682861328, "mean_token_accuracy": 0.7272166609764099, "num_tokens": 36015971.0, "step": 75550 }, { "entropy": 1.0356195175647736, "epoch": 1.8384766907419565, "grad_norm": 13.25, "learning_rate": 7.136666617229823e-06, "loss": 1.048484115600586, "mean_token_accuracy": 0.7310180282592773, "num_tokens": 36041734.0, "step": 75600 }, { "entropy": 1.0687865424156189, "epoch": 1.8396926144792198, "grad_norm": 14.75, "learning_rate": 7.123827006742286e-06, "loss": 1.0963182830810547, "mean_token_accuracy": 0.717542685866356, "num_tokens": 36064460.0, "step": 75650 }, { "entropy": 1.1001538330316543, "epoch": 1.8409085382164831, "grad_norm": 15.6875, "learning_rate": 7.110992563319905e-06, "loss": 1.1135391998291015, "mean_token_accuracy": 0.7165139424800873, "num_tokens": 36090246.0, "step": 75700 }, { "entropy": 1.1273748207092285, "epoch": 1.8421244619537462, "grad_norm": 11.1875, "learning_rate": 7.0981633100198445e-06, "loss": 1.153203887939453, "mean_token_accuracy": 0.706843296289444, "num_tokens": 36113728.0, "step": 75750 }, { "entropy": 1.042561411857605, "epoch": 1.8433403856910093, "grad_norm": 15.0625, "learning_rate": 7.0853392698899544e-06, "loss": 1.0399301910400391, "mean_token_accuracy": 0.7287979590892791, "num_tokens": 36138099.0, "step": 75800 }, { "entropy": 1.0642791223526, "epoch": 1.8445563094282726, "grad_norm": 21.25, "learning_rate": 7.072520465968709e-06, "loss": 1.0801153564453125, "mean_token_accuracy": 0.7203715813159942, "num_tokens": 36160815.0, "step": 75850 }, { "entropy": 1.1360101848840714, "epoch": 1.845772233165536, "grad_norm": 25.875, "learning_rate": 7.0597069212851786e-06, "loss": 1.1678963470458985, "mean_token_accuracy": 0.707863352894783, "num_tokens": 36185785.0, "step": 75900 }, { "entropy": 1.100436151623726, "epoch": 1.846988156902799, "grad_norm": 21.5, "learning_rate": 7.046898658858996e-06, "loss": 1.086731185913086, "mean_token_accuracy": 0.7190963745117187, "num_tokens": 36211082.0, "step": 75950 }, { "entropy": 0.903278471827507, "epoch": 1.8482040806400621, "grad_norm": 15.625, "learning_rate": 7.0340957017002876e-06, "loss": 0.9255272674560547, "mean_token_accuracy": 0.7504626142978669, "num_tokens": 36234248.0, "step": 76000 }, { "entropy": 1.0301302570104598, "epoch": 1.8494200043773255, "grad_norm": 16.625, "learning_rate": 7.021298072809662e-06, "loss": 1.0210110473632812, "mean_token_accuracy": 0.7368491041660309, "num_tokens": 36259371.0, "step": 76050 }, { "entropy": 1.0846767234802246, "epoch": 1.8506359281145888, "grad_norm": 30.625, "learning_rate": 7.00850579517814e-06, "loss": 1.0924180603027345, "mean_token_accuracy": 0.7175328814983368, "num_tokens": 36279968.0, "step": 76100 }, { "entropy": 1.0723867511749268, "epoch": 1.8518518518518519, "grad_norm": 14.875, "learning_rate": 6.995718891787149e-06, "loss": 1.0869900512695312, "mean_token_accuracy": 0.7247734051942826, "num_tokens": 36304195.0, "step": 76150 }, { "entropy": 1.0373640513420106, "epoch": 1.853067775589115, "grad_norm": 19.375, "learning_rate": 6.98293738560845e-06, "loss": 1.0497236633300782, "mean_token_accuracy": 0.7296155202388763, "num_tokens": 36324882.0, "step": 76200 }, { "entropy": 1.0376073318719863, "epoch": 1.8542836993263783, "grad_norm": 18.75, "learning_rate": 6.970161299604106e-06, "loss": 1.045980224609375, "mean_token_accuracy": 0.7246279168128967, "num_tokens": 36349755.0, "step": 76250 }, { "entropy": 1.1249932217597962, "epoch": 1.8554996230636416, "grad_norm": 14.125, "learning_rate": 6.957390656726449e-06, "loss": 1.1482044219970704, "mean_token_accuracy": 0.7127174234390259, "num_tokens": 36371978.0, "step": 76300 }, { "entropy": 1.010819525718689, "epoch": 1.8567155468009047, "grad_norm": 16.625, "learning_rate": 6.944625479918028e-06, "loss": 1.0222634887695312, "mean_token_accuracy": 0.7349280369281769, "num_tokens": 36394595.0, "step": 76350 }, { "entropy": 0.9769522136449814, "epoch": 1.8579314705381678, "grad_norm": 13.75, "learning_rate": 6.93186579211157e-06, "loss": 0.9771368408203125, "mean_token_accuracy": 0.7366494190692902, "num_tokens": 36417930.0, "step": 76400 }, { "entropy": 1.0083582836389542, "epoch": 1.859147394275431, "grad_norm": 12.4375, "learning_rate": 6.919111616229954e-06, "loss": 1.0127828216552734, "mean_token_accuracy": 0.7327344787120819, "num_tokens": 36441346.0, "step": 76450 }, { "entropy": 1.081952806711197, "epoch": 1.8603633180126944, "grad_norm": 20.125, "learning_rate": 6.9063629751861386e-06, "loss": 1.0879878997802734, "mean_token_accuracy": 0.7190508991479874, "num_tokens": 36464488.0, "step": 76500 }, { "entropy": 1.084717354774475, "epoch": 1.8615792417499575, "grad_norm": 14.25, "learning_rate": 6.8936198918831485e-06, "loss": 1.098580551147461, "mean_token_accuracy": 0.7116803324222565, "num_tokens": 36488397.0, "step": 76550 }, { "entropy": 1.004954421520233, "epoch": 1.8627951654872206, "grad_norm": 16.75, "learning_rate": 6.88088238921403e-06, "loss": 1.0360802459716796, "mean_token_accuracy": 0.7305432176589965, "num_tokens": 36511036.0, "step": 76600 }, { "entropy": 1.0030851030349732, "epoch": 1.8640110892244839, "grad_norm": 19.25, "learning_rate": 6.8681504900617905e-06, "loss": 1.009821090698242, "mean_token_accuracy": 0.7323006981611252, "num_tokens": 36533165.0, "step": 76650 }, { "entropy": 1.0210929691791535, "epoch": 1.8652270129617472, "grad_norm": 8.5625, "learning_rate": 6.85542421729938e-06, "loss": 1.0226612854003907, "mean_token_accuracy": 0.7287266540527344, "num_tokens": 36562479.0, "step": 76700 }, { "entropy": 1.0072043931484223, "epoch": 1.8664429366990103, "grad_norm": 14.125, "learning_rate": 6.842703593789635e-06, "loss": 1.0134764862060548, "mean_token_accuracy": 0.7294907057285309, "num_tokens": 36583599.0, "step": 76750 }, { "entropy": 1.0361030650138856, "epoch": 1.8676588604362734, "grad_norm": 15.875, "learning_rate": 6.829988642385243e-06, "loss": 1.04614501953125, "mean_token_accuracy": 0.7310619378089904, "num_tokens": 36608587.0, "step": 76800 }, { "entropy": 0.9501076954603195, "epoch": 1.8688747841735367, "grad_norm": 11.4375, "learning_rate": 6.817279385928715e-06, "loss": 0.9632610321044922, "mean_token_accuracy": 0.7467865765094757, "num_tokens": 36631693.0, "step": 76850 }, { "entropy": 0.9895213884115219, "epoch": 1.8700907079107998, "grad_norm": 14.3125, "learning_rate": 6.804575847252309e-06, "loss": 1.0285332489013672, "mean_token_accuracy": 0.7314321506023407, "num_tokens": 36653376.0, "step": 76900 }, { "entropy": 1.1540192329883576, "epoch": 1.8713066316480629, "grad_norm": 33.5, "learning_rate": 6.79187804917803e-06, "loss": 1.1736653900146485, "mean_token_accuracy": 0.7077850699424744, "num_tokens": 36674527.0, "step": 76950 }, { "entropy": 1.1417565250396728, "epoch": 1.8725225553853262, "grad_norm": 23.5, "learning_rate": 6.7791860145175555e-06, "loss": 1.1635665130615234, "mean_token_accuracy": 0.7110464584827423, "num_tokens": 36697931.0, "step": 77000 }, { "entropy": 0.9746208691596985, "epoch": 1.8737384791225895, "grad_norm": 16.0, "learning_rate": 6.766499766072221e-06, "loss": 0.9582096099853515, "mean_token_accuracy": 0.745807558298111, "num_tokens": 36723829.0, "step": 77050 }, { "entropy": 1.0399888920783997, "epoch": 1.8749544028598526, "grad_norm": 10.375, "learning_rate": 6.75381932663296e-06, "loss": 1.063019027709961, "mean_token_accuracy": 0.7258497202396392, "num_tokens": 36749696.0, "step": 77100 }, { "entropy": 1.0050259786844253, "epoch": 1.8761703265971157, "grad_norm": 19.125, "learning_rate": 6.741144718980269e-06, "loss": 1.0340279388427733, "mean_token_accuracy": 0.733732614517212, "num_tokens": 36772075.0, "step": 77150 }, { "entropy": 1.0465113812685012, "epoch": 1.877386250334379, "grad_norm": 11.5625, "learning_rate": 6.728475965884178e-06, "loss": 1.0733109283447266, "mean_token_accuracy": 0.7250096523761749, "num_tokens": 36795992.0, "step": 77200 }, { "entropy": 1.0312710386514663, "epoch": 1.8786021740716423, "grad_norm": 12.625, "learning_rate": 6.715813090104183e-06, "loss": 1.028415985107422, "mean_token_accuracy": 0.7310451257228852, "num_tokens": 36819229.0, "step": 77250 }, { "entropy": 1.0681267589330674, "epoch": 1.8798180978089054, "grad_norm": 35.5, "learning_rate": 6.703156114389232e-06, "loss": 1.0558699798583984, "mean_token_accuracy": 0.7250878816843033, "num_tokens": 36841722.0, "step": 77300 }, { "entropy": 0.9702767300605774, "epoch": 1.8810340215461685, "grad_norm": 15.1875, "learning_rate": 6.690505061477678e-06, "loss": 0.9765476226806641, "mean_token_accuracy": 0.7441020894050598, "num_tokens": 36866708.0, "step": 77350 }, { "entropy": 1.0169500035047532, "epoch": 1.8822499452834318, "grad_norm": 14.375, "learning_rate": 6.67785995409722e-06, "loss": 1.0318396759033204, "mean_token_accuracy": 0.7300978779792786, "num_tokens": 36893384.0, "step": 77400 }, { "entropy": 0.9983062744140625, "epoch": 1.8834658690206951, "grad_norm": 13.6875, "learning_rate": 6.665220814964884e-06, "loss": 1.0032768249511719, "mean_token_accuracy": 0.7348007476329803, "num_tokens": 36917910.0, "step": 77450 }, { "entropy": 1.0260655617713927, "epoch": 1.8846817927579582, "grad_norm": 12.375, "learning_rate": 6.652587666786976e-06, "loss": 1.0529013824462892, "mean_token_accuracy": 0.7256330627202988, "num_tokens": 36940601.0, "step": 77500 }, { "entropy": 0.9657204461097717, "epoch": 1.8858977164952213, "grad_norm": 15.125, "learning_rate": 6.639960532259036e-06, "loss": 0.9795107269287109, "mean_token_accuracy": 0.7450915968418121, "num_tokens": 36961003.0, "step": 77550 }, { "entropy": 1.0119869750738144, "epoch": 1.8871136402324846, "grad_norm": 16.125, "learning_rate": 6.627339434065798e-06, "loss": 1.0121041870117187, "mean_token_accuracy": 0.7347657334804535, "num_tokens": 36985627.0, "step": 77600 }, { "entropy": 0.9877213299274444, "epoch": 1.888329563969748, "grad_norm": 12.9375, "learning_rate": 6.614724394881154e-06, "loss": 0.9974105072021484, "mean_token_accuracy": 0.737681874036789, "num_tokens": 37012607.0, "step": 77650 }, { "entropy": 0.9898084676265717, "epoch": 1.889545487707011, "grad_norm": 12.125, "learning_rate": 6.602115437368113e-06, "loss": 1.0029043579101562, "mean_token_accuracy": 0.7376559853553772, "num_tokens": 37039962.0, "step": 77700 }, { "entropy": 1.0789085882902145, "epoch": 1.8907614114442741, "grad_norm": 27.0, "learning_rate": 6.589512584178758e-06, "loss": 1.1069232940673828, "mean_token_accuracy": 0.7219537597894669, "num_tokens": 37065878.0, "step": 77750 }, { "entropy": 0.9989105340838432, "epoch": 1.8919773351815374, "grad_norm": 7.0, "learning_rate": 6.576915857954198e-06, "loss": 1.0310519409179688, "mean_token_accuracy": 0.731660305261612, "num_tokens": 37092707.0, "step": 77800 }, { "entropy": 1.0591658401489257, "epoch": 1.8931932589188007, "grad_norm": 10.6875, "learning_rate": 6.564325281324553e-06, "loss": 1.077318115234375, "mean_token_accuracy": 0.7253519999980926, "num_tokens": 37112652.0, "step": 77850 }, { "entropy": 1.0356089660525323, "epoch": 1.8944091826560638, "grad_norm": 24.0, "learning_rate": 6.551740876908867e-06, "loss": 1.041001434326172, "mean_token_accuracy": 0.7294846975803375, "num_tokens": 37137840.0, "step": 77900 }, { "entropy": 1.136568422317505, "epoch": 1.895625106393327, "grad_norm": 27.5, "learning_rate": 6.5391626673151236e-06, "loss": 1.1512086486816406, "mean_token_accuracy": 0.7091615146398544, "num_tokens": 37161230.0, "step": 77950 }, { "entropy": 1.1152929258346558, "epoch": 1.8968410301305902, "grad_norm": 13.375, "learning_rate": 6.526590675140163e-06, "loss": 1.1305944061279296, "mean_token_accuracy": 0.712394835948944, "num_tokens": 37183722.0, "step": 78000 }, { "entropy": 0.9350994426012039, "epoch": 1.8980569538678536, "grad_norm": 9.3125, "learning_rate": 6.514024922969653e-06, "loss": 0.9416493988037109, "mean_token_accuracy": 0.7507370722293853, "num_tokens": 37205894.0, "step": 78050 }, { "entropy": 1.087921867966652, "epoch": 1.8992728776051166, "grad_norm": 21.125, "learning_rate": 6.501465433378064e-06, "loss": 1.0833631896972655, "mean_token_accuracy": 0.7231875330209732, "num_tokens": 37231446.0, "step": 78100 }, { "entropy": 1.0258314359188079, "epoch": 1.9004888013423797, "grad_norm": 8.75, "learning_rate": 6.488912228928601e-06, "loss": 1.0304458618164063, "mean_token_accuracy": 0.7282595562934876, "num_tokens": 37255949.0, "step": 78150 }, { "entropy": 1.038578120470047, "epoch": 1.901704725079643, "grad_norm": 22.75, "learning_rate": 6.476365332173188e-06, "loss": 1.0420667266845702, "mean_token_accuracy": 0.7273482656478882, "num_tokens": 37282159.0, "step": 78200 }, { "entropy": 1.0714293092489242, "epoch": 1.9029206488169064, "grad_norm": 11.8125, "learning_rate": 6.463824765652417e-06, "loss": 1.0706198883056641, "mean_token_accuracy": 0.7207549273967743, "num_tokens": 37309075.0, "step": 78250 }, { "entropy": 1.0456053245067596, "epoch": 1.9041365725541692, "grad_norm": 24.375, "learning_rate": 6.451290551895499e-06, "loss": 1.0652519989013671, "mean_token_accuracy": 0.719414736032486, "num_tokens": 37334557.0, "step": 78300 }, { "entropy": 1.050436061024666, "epoch": 1.9053524962914326, "grad_norm": 28.5, "learning_rate": 6.438762713420239e-06, "loss": 1.062214584350586, "mean_token_accuracy": 0.7274596679210663, "num_tokens": 37353909.0, "step": 78350 }, { "entropy": 1.0107188200950623, "epoch": 1.9065684200286959, "grad_norm": 19.0, "learning_rate": 6.426241272732991e-06, "loss": 1.0378185272216798, "mean_token_accuracy": 0.730886207818985, "num_tokens": 37378619.0, "step": 78400 }, { "entropy": 1.021126202940941, "epoch": 1.907784343765959, "grad_norm": 13.5, "learning_rate": 6.413726252328611e-06, "loss": 1.0303973388671874, "mean_token_accuracy": 0.7287275385856629, "num_tokens": 37402014.0, "step": 78450 }, { "entropy": 1.1486397254467011, "epoch": 1.909000267503222, "grad_norm": 22.25, "learning_rate": 6.40121767469042e-06, "loss": 1.1450789642333985, "mean_token_accuracy": 0.7109511452913284, "num_tokens": 37429378.0, "step": 78500 }, { "entropy": 1.1853489458560944, "epoch": 1.9102161912404854, "grad_norm": 9.5625, "learning_rate": 6.388715562290163e-06, "loss": 1.2212769317626953, "mean_token_accuracy": 0.6939260858297348, "num_tokens": 37455688.0, "step": 78550 }, { "entropy": 1.0747613775730134, "epoch": 1.9114321149777487, "grad_norm": 23.0, "learning_rate": 6.37621993758798e-06, "loss": 1.079906005859375, "mean_token_accuracy": 0.7208926701545715, "num_tokens": 37476910.0, "step": 78600 }, { "entropy": 1.0423617666959764, "epoch": 1.9126480387150118, "grad_norm": 22.875, "learning_rate": 6.363730823032348e-06, "loss": 1.051079559326172, "mean_token_accuracy": 0.7272143352031708, "num_tokens": 37501706.0, "step": 78650 }, { "entropy": 1.0293262195587158, "epoch": 1.9138639624522749, "grad_norm": 8.0625, "learning_rate": 6.351248241060044e-06, "loss": 1.0467371368408203, "mean_token_accuracy": 0.7310244381427765, "num_tokens": 37526436.0, "step": 78700 }, { "entropy": 1.1008820968866349, "epoch": 1.9150798861895382, "grad_norm": 13.25, "learning_rate": 6.338772214096125e-06, "loss": 1.0972066497802735, "mean_token_accuracy": 0.7202953362464904, "num_tokens": 37552874.0, "step": 78750 }, { "entropy": 1.081252126097679, "epoch": 1.9162958099268015, "grad_norm": 14.9375, "learning_rate": 6.326302764553852e-06, "loss": 1.0932911682128905, "mean_token_accuracy": 0.7178847646713257, "num_tokens": 37575357.0, "step": 78800 }, { "entropy": 1.0317867714166642, "epoch": 1.9175117336640646, "grad_norm": 13.25, "learning_rate": 6.3138399148346825e-06, "loss": 1.0492823028564453, "mean_token_accuracy": 0.727627363204956, "num_tokens": 37599091.0, "step": 78850 }, { "entropy": 1.09637150734663, "epoch": 1.9187276574013277, "grad_norm": 28.625, "learning_rate": 6.301383687328219e-06, "loss": 1.1136483001708983, "mean_token_accuracy": 0.7118858516216278, "num_tokens": 37624186.0, "step": 78900 }, { "entropy": 1.0943343609571456, "epoch": 1.919943581138591, "grad_norm": 20.125, "learning_rate": 6.288934104412156e-06, "loss": 1.1278373718261718, "mean_token_accuracy": 0.7141846132278442, "num_tokens": 37649180.0, "step": 78950 }, { "entropy": 1.0443596452474595, "epoch": 1.9211595048758543, "grad_norm": 14.4375, "learning_rate": 6.276491188452263e-06, "loss": 1.0442012023925782, "mean_token_accuracy": 0.7316697925329209, "num_tokens": 37671474.0, "step": 79000 }, { "entropy": 1.1959511184692382, "epoch": 1.9223754286131174, "grad_norm": 12.3125, "learning_rate": 6.264054961802322e-06, "loss": 1.2121590423583983, "mean_token_accuracy": 0.6998715049028397, "num_tokens": 37694557.0, "step": 79050 }, { "entropy": 1.1338950151205063, "epoch": 1.9235913523503805, "grad_norm": 11.9375, "learning_rate": 6.251625446804102e-06, "loss": 1.151397933959961, "mean_token_accuracy": 0.7056976503133774, "num_tokens": 37722218.0, "step": 79100 }, { "entropy": 1.006742194890976, "epoch": 1.9248072760876438, "grad_norm": 25.0, "learning_rate": 6.239202665787321e-06, "loss": 0.9974612426757813, "mean_token_accuracy": 0.7369723701477051, "num_tokens": 37744818.0, "step": 79150 }, { "entropy": 1.107929664850235, "epoch": 1.926023199824907, "grad_norm": 27.0, "learning_rate": 6.226786641069584e-06, "loss": 1.136128921508789, "mean_token_accuracy": 0.7095722788572312, "num_tokens": 37771410.0, "step": 79200 }, { "entropy": 1.048833932876587, "epoch": 1.9272391235621702, "grad_norm": 37.5, "learning_rate": 6.214377394956373e-06, "loss": 1.0570339202880858, "mean_token_accuracy": 0.7229857194423676, "num_tokens": 37798394.0, "step": 79250 }, { "entropy": 1.0322424119710922, "epoch": 1.9284550472994333, "grad_norm": 10.5625, "learning_rate": 6.20197494974099e-06, "loss": 1.030535430908203, "mean_token_accuracy": 0.7342934048175812, "num_tokens": 37821335.0, "step": 79300 }, { "entropy": 1.0247244137525557, "epoch": 1.9296709710366966, "grad_norm": 31.625, "learning_rate": 6.189579327704508e-06, "loss": 1.0743618774414063, "mean_token_accuracy": 0.7159861743450164, "num_tokens": 37844809.0, "step": 79350 }, { "entropy": 0.9496233212947846, "epoch": 1.93088689477396, "grad_norm": 13.0, "learning_rate": 6.1771905511157556e-06, "loss": 0.9457400512695312, "mean_token_accuracy": 0.7548992812633515, "num_tokens": 37868943.0, "step": 79400 }, { "entropy": 1.026628378033638, "epoch": 1.932102818511223, "grad_norm": 21.875, "learning_rate": 6.1648086422312515e-06, "loss": 1.027425765991211, "mean_token_accuracy": 0.7346813106536865, "num_tokens": 37889985.0, "step": 79450 }, { "entropy": 1.102431083917618, "epoch": 1.933318742248486, "grad_norm": 14.6875, "learning_rate": 6.152433623295189e-06, "loss": 1.1048403930664064, "mean_token_accuracy": 0.7181644368171692, "num_tokens": 37911403.0, "step": 79500 }, { "entropy": 0.9846072489023209, "epoch": 1.9345346659857494, "grad_norm": 14.0625, "learning_rate": 6.140065516539375e-06, "loss": 0.9961219024658203, "mean_token_accuracy": 0.7404272222518921, "num_tokens": 37934926.0, "step": 79550 }, { "entropy": 1.032532427608967, "epoch": 1.9357505897230127, "grad_norm": 29.25, "learning_rate": 6.127704344183198e-06, "loss": 1.0593767547607422, "mean_token_accuracy": 0.7292724847793579, "num_tokens": 37954547.0, "step": 79600 }, { "entropy": 1.1168547654151917, "epoch": 1.9369665134602758, "grad_norm": 13.5625, "learning_rate": 6.115350128433599e-06, "loss": 1.149422149658203, "mean_token_accuracy": 0.712553591132164, "num_tokens": 37981574.0, "step": 79650 }, { "entropy": 1.1070887139439582, "epoch": 1.938182437197539, "grad_norm": 53.75, "learning_rate": 6.103002891485006e-06, "loss": 1.1245429992675782, "mean_token_accuracy": 0.7152118623256684, "num_tokens": 38005462.0, "step": 79700 }, { "entropy": 1.0839282613992691, "epoch": 1.9393983609348022, "grad_norm": 10.0, "learning_rate": 6.090662655519319e-06, "loss": 1.1035118865966798, "mean_token_accuracy": 0.7139450711011887, "num_tokens": 38032906.0, "step": 79750 }, { "entropy": 1.0468620663881303, "epoch": 1.9406142846720653, "grad_norm": 16.125, "learning_rate": 6.078329442705867e-06, "loss": 1.0646863555908204, "mean_token_accuracy": 0.7248991477489471, "num_tokens": 38057017.0, "step": 79800 }, { "entropy": 0.957463544011116, "epoch": 1.9418302084093284, "grad_norm": 23.125, "learning_rate": 6.066003275201347e-06, "loss": 0.9412318420410156, "mean_token_accuracy": 0.7513750684261322, "num_tokens": 38081276.0, "step": 79850 }, { "entropy": 1.0284032100439071, "epoch": 1.9430461321465917, "grad_norm": 18.75, "learning_rate": 6.053684175149808e-06, "loss": 1.0653746032714844, "mean_token_accuracy": 0.7280886846780777, "num_tokens": 38106000.0, "step": 79900 }, { "entropy": 1.058583369255066, "epoch": 1.944262055883855, "grad_norm": 12.0625, "learning_rate": 6.0413721646825964e-06, "loss": 1.0828941345214844, "mean_token_accuracy": 0.7236041420698166, "num_tokens": 38128593.0, "step": 79950 }, { "entropy": 1.002117738723755, "epoch": 1.9454779796211181, "grad_norm": 15.1875, "learning_rate": 6.0290672659183325e-06, "loss": 0.9916483306884766, "mean_token_accuracy": 0.742242739200592, "num_tokens": 38153065.0, "step": 80000 }, { "epoch": 1.9454779796211181, "eval_entropy": 1.1121079134853697, "eval_loss": 1.2937538623809814, "eval_mean_token_accuracy": 0.6803684088836277, "eval_num_tokens": 38153065.0, "eval_runtime": 390.6798, "eval_samples_per_second": 11.695, "eval_steps_per_second": 11.695, "step": 80000 }, { "entropy": 1.0611077040433883, "epoch": 1.9466939033583812, "grad_norm": 8.5, "learning_rate": 6.016769500962853e-06, "loss": 1.0792049407958983, "mean_token_accuracy": 0.7208302891254426, "num_tokens": 38177109.0, "step": 80050 }, { "entropy": 1.0423497074842454, "epoch": 1.9479098270956445, "grad_norm": 24.125, "learning_rate": 6.004478891909176e-06, "loss": 1.0619432067871093, "mean_token_accuracy": 0.7283716952800751, "num_tokens": 38204073.0, "step": 80100 }, { "entropy": 1.046692163348198, "epoch": 1.9491257508329078, "grad_norm": 18.875, "learning_rate": 5.992195460837471e-06, "loss": 1.0690690612792968, "mean_token_accuracy": 0.7276489007472992, "num_tokens": 38230038.0, "step": 80150 }, { "entropy": 1.1186953908205033, "epoch": 1.950341674570171, "grad_norm": 47.0, "learning_rate": 5.979919229815011e-06, "loss": 1.1281797790527344, "mean_token_accuracy": 0.7149170714616776, "num_tokens": 38248516.0, "step": 80200 }, { "entropy": 0.9392022651433944, "epoch": 1.951557598307434, "grad_norm": 41.0, "learning_rate": 5.967650220896124e-06, "loss": 0.9368399810791016, "mean_token_accuracy": 0.7518910372257233, "num_tokens": 38271114.0, "step": 80250 }, { "entropy": 0.9588869524002075, "epoch": 1.9527735220446973, "grad_norm": 23.75, "learning_rate": 5.955388456122184e-06, "loss": 0.9716487884521484, "mean_token_accuracy": 0.7446791696548462, "num_tokens": 38291565.0, "step": 80300 }, { "entropy": 0.9864284253120422, "epoch": 1.9539894457819607, "grad_norm": 13.75, "learning_rate": 5.943133957521523e-06, "loss": 0.9888604736328125, "mean_token_accuracy": 0.7490763485431671, "num_tokens": 38309367.0, "step": 80350 }, { "entropy": 1.0912788373231888, "epoch": 1.9552053695192237, "grad_norm": 20.75, "learning_rate": 5.930886747109446e-06, "loss": 1.104812469482422, "mean_token_accuracy": 0.713081384897232, "num_tokens": 38334390.0, "step": 80400 }, { "entropy": 1.0594469559192659, "epoch": 1.9564212932564868, "grad_norm": 15.375, "learning_rate": 5.91864684688815e-06, "loss": 1.088111114501953, "mean_token_accuracy": 0.7239407056570053, "num_tokens": 38356256.0, "step": 80450 }, { "entropy": 1.0899956154823303, "epoch": 1.9576372169937502, "grad_norm": 13.75, "learning_rate": 5.9064142788466994e-06, "loss": 1.1231449127197266, "mean_token_accuracy": 0.7104306882619857, "num_tokens": 38380557.0, "step": 80500 }, { "entropy": 0.9902652108669281, "epoch": 1.9588531407310135, "grad_norm": 15.75, "learning_rate": 5.894189064960996e-06, "loss": 1.0004283905029296, "mean_token_accuracy": 0.7381913912296295, "num_tokens": 38403521.0, "step": 80550 }, { "entropy": 1.0481272864341735, "epoch": 1.9600690644682766, "grad_norm": 21.0, "learning_rate": 5.881971227193714e-06, "loss": 1.0481510162353516, "mean_token_accuracy": 0.7300284659862518, "num_tokens": 38427516.0, "step": 80600 }, { "entropy": 1.0634418630599975, "epoch": 1.9612849882055396, "grad_norm": 15.875, "learning_rate": 5.869760787494292e-06, "loss": 1.06069091796875, "mean_token_accuracy": 0.7185734713077545, "num_tokens": 38454743.0, "step": 80650 }, { "entropy": 1.0427658784389495, "epoch": 1.962500911942803, "grad_norm": 13.875, "learning_rate": 5.8575577677988735e-06, "loss": 1.0491986846923829, "mean_token_accuracy": 0.7265565395355225, "num_tokens": 38478487.0, "step": 80700 }, { "entropy": 1.0393750089406968, "epoch": 1.9637168356800663, "grad_norm": 28.125, "learning_rate": 5.845362190030264e-06, "loss": 1.0685882568359375, "mean_token_accuracy": 0.7193291437625885, "num_tokens": 38497725.0, "step": 80750 }, { "entropy": 1.057287870645523, "epoch": 1.9649327594173294, "grad_norm": 21.375, "learning_rate": 5.833174076097908e-06, "loss": 1.0773413848876954, "mean_token_accuracy": 0.7183080869913101, "num_tokens": 38518029.0, "step": 80800 }, { "entropy": 1.0176406836509704, "epoch": 1.9661486831545925, "grad_norm": 37.0, "learning_rate": 5.820993447897836e-06, "loss": 1.0264263153076172, "mean_token_accuracy": 0.7294310486316681, "num_tokens": 38544171.0, "step": 80850 }, { "entropy": 0.9253982397913932, "epoch": 1.9673646068918558, "grad_norm": 15.75, "learning_rate": 5.808820327312634e-06, "loss": 0.9388565826416015, "mean_token_accuracy": 0.7518985843658448, "num_tokens": 38565261.0, "step": 80900 }, { "entropy": 1.0419474571943284, "epoch": 1.968580530629119, "grad_norm": 14.8125, "learning_rate": 5.796654736211404e-06, "loss": 1.0590795135498048, "mean_token_accuracy": 0.7297858554124832, "num_tokens": 38589531.0, "step": 80950 }, { "entropy": 0.9528706428408623, "epoch": 1.9697964543663822, "grad_norm": 12.5, "learning_rate": 5.784496696449715e-06, "loss": 0.9385469055175781, "mean_token_accuracy": 0.7536850440502166, "num_tokens": 38613085.0, "step": 81000 }, { "entropy": 0.9963955688476562, "epoch": 1.9710123781036453, "grad_norm": 14.625, "learning_rate": 5.7723462298695645e-06, "loss": 1.0193953704833985, "mean_token_accuracy": 0.7338818454742432, "num_tokens": 38634166.0, "step": 81050 }, { "entropy": 1.059339182972908, "epoch": 1.9722283018409086, "grad_norm": 15.1875, "learning_rate": 5.76020335829936e-06, "loss": 1.0792028045654296, "mean_token_accuracy": 0.7274816310405732, "num_tokens": 38658804.0, "step": 81100 }, { "entropy": 0.9724841311573982, "epoch": 1.973444225578172, "grad_norm": 25.875, "learning_rate": 5.7480681035538495e-06, "loss": 0.9828000640869141, "mean_token_accuracy": 0.7445386618375778, "num_tokens": 38681070.0, "step": 81150 }, { "entropy": 1.0338044267892839, "epoch": 1.974660149315435, "grad_norm": 18.25, "learning_rate": 5.735940487434111e-06, "loss": 1.0274979400634765, "mean_token_accuracy": 0.7295913553237915, "num_tokens": 38698944.0, "step": 81200 }, { "entropy": 1.0113266146183013, "epoch": 1.975876073052698, "grad_norm": 9.5625, "learning_rate": 5.723820531727486e-06, "loss": 1.015476608276367, "mean_token_accuracy": 0.7401099979877472, "num_tokens": 38727528.0, "step": 81250 }, { "entropy": 0.996207589507103, "epoch": 1.9770919967899614, "grad_norm": 12.25, "learning_rate": 5.711708258207569e-06, "loss": 1.000556411743164, "mean_token_accuracy": 0.7381815993785859, "num_tokens": 38752842.0, "step": 81300 }, { "entropy": 1.0465625309944153, "epoch": 1.9783079205272245, "grad_norm": 19.75, "learning_rate": 5.699603688634139e-06, "loss": 1.068922882080078, "mean_token_accuracy": 0.7308431339263916, "num_tokens": 38772055.0, "step": 81350 }, { "entropy": 1.1528121602535248, "epoch": 1.9795238442644876, "grad_norm": 25.125, "learning_rate": 5.687506844753139e-06, "loss": 1.1678045654296876, "mean_token_accuracy": 0.7036880552768707, "num_tokens": 38797504.0, "step": 81400 }, { "entropy": 1.105405918955803, "epoch": 1.9807397680017509, "grad_norm": 50.0, "learning_rate": 5.6754177482966414e-06, "loss": 1.129139404296875, "mean_token_accuracy": 0.7181373554468155, "num_tokens": 38819492.0, "step": 81450 }, { "entropy": 1.1220918077230453, "epoch": 1.9819556917390142, "grad_norm": 32.75, "learning_rate": 5.663336420982786e-06, "loss": 1.1237078094482422, "mean_token_accuracy": 0.7175128573179245, "num_tokens": 38847263.0, "step": 81500 }, { "entropy": 1.112402564883232, "epoch": 1.9831716154762773, "grad_norm": 19.25, "learning_rate": 5.651262884515773e-06, "loss": 1.1098297119140625, "mean_token_accuracy": 0.721532289981842, "num_tokens": 38869150.0, "step": 81550 }, { "entropy": 0.9925910374522209, "epoch": 1.9843875392135404, "grad_norm": 31.375, "learning_rate": 5.639197160585788e-06, "loss": 1.0090249633789063, "mean_token_accuracy": 0.7386035990715026, "num_tokens": 38894383.0, "step": 81600 }, { "entropy": 0.9484946519136429, "epoch": 1.9856034629508037, "grad_norm": 12.625, "learning_rate": 5.627139270868988e-06, "loss": 0.9417104339599609, "mean_token_accuracy": 0.7533659863471985, "num_tokens": 38916352.0, "step": 81650 }, { "entropy": 0.9762229073047638, "epoch": 1.986819386688067, "grad_norm": 20.5, "learning_rate": 5.615089237027466e-06, "loss": 0.9849710083007812, "mean_token_accuracy": 0.7433941811323166, "num_tokens": 38940815.0, "step": 81700 }, { "entropy": 1.0608352273702621, "epoch": 1.98803531042533, "grad_norm": 10.75, "learning_rate": 5.603047080709185e-06, "loss": 1.090807876586914, "mean_token_accuracy": 0.718224590420723, "num_tokens": 38967310.0, "step": 81750 }, { "entropy": 0.9754500687122345, "epoch": 1.9892512341625932, "grad_norm": 16.5, "learning_rate": 5.591012823547967e-06, "loss": 1.0024430847167969, "mean_token_accuracy": 0.7376100450754166, "num_tokens": 38987552.0, "step": 81800 }, { "entropy": 1.0415171828866006, "epoch": 1.9904671578998565, "grad_norm": 15.5, "learning_rate": 5.5789864871634436e-06, "loss": 1.0319233703613282, "mean_token_accuracy": 0.7361729699373245, "num_tokens": 39010815.0, "step": 81850 }, { "entropy": 1.0609261083602906, "epoch": 1.9916830816371198, "grad_norm": 11.5625, "learning_rate": 5.566968093161011e-06, "loss": 1.0752648162841796, "mean_token_accuracy": 0.7185404396057129, "num_tokens": 39038577.0, "step": 81900 }, { "entropy": 1.0644001418352127, "epoch": 1.992899005374383, "grad_norm": 16.625, "learning_rate": 5.5549576631317945e-06, "loss": 1.0622994232177734, "mean_token_accuracy": 0.7272921115159988, "num_tokens": 39058163.0, "step": 81950 }, { "entropy": 1.019466494321823, "epoch": 1.994114929111646, "grad_norm": 18.125, "learning_rate": 5.542955218652624e-06, "loss": 1.04466796875, "mean_token_accuracy": 0.7210451793670655, "num_tokens": 39079190.0, "step": 82000 }, { "entropy": 1.0432225787639617, "epoch": 1.9953308528489093, "grad_norm": 12.6875, "learning_rate": 5.530960781285969e-06, "loss": 1.0485654449462891, "mean_token_accuracy": 0.7249899005889893, "num_tokens": 39104053.0, "step": 82050 }, { "entropy": 1.0385715234279633, "epoch": 1.9965467765861726, "grad_norm": 52.0, "learning_rate": 5.5189743725799285e-06, "loss": 1.0439997100830078, "mean_token_accuracy": 0.7288470208644867, "num_tokens": 39126490.0, "step": 82100 }, { "entropy": 0.9868138992786407, "epoch": 1.9977627003234357, "grad_norm": 19.0, "learning_rate": 5.506996014068164e-06, "loss": 0.9900110626220703, "mean_token_accuracy": 0.7389319396018982, "num_tokens": 39147949.0, "step": 82150 }, { "entropy": 1.117902365922928, "epoch": 1.9989786240606988, "grad_norm": 19.375, "learning_rate": 5.495025727269887e-06, "loss": 1.1472412109375, "mean_token_accuracy": 0.7063706123828888, "num_tokens": 39174950.0, "step": 82200 }, { "entropy": 1.0289127081632614, "epoch": 2.000194547797962, "grad_norm": 27.875, "learning_rate": 5.483063533689801e-06, "loss": 1.0368164825439452, "mean_token_accuracy": 0.7387083387374878, "num_tokens": 39200388.0, "step": 82250 }, { "entropy": 0.9489276540279389, "epoch": 2.0014104715352254, "grad_norm": 15.25, "learning_rate": 5.471109454818067e-06, "loss": 0.9096758270263672, "mean_token_accuracy": 0.7497152781486511, "num_tokens": 39222549.0, "step": 82300 }, { "entropy": 1.152012374997139, "epoch": 2.0026263952724883, "grad_norm": 15.0, "learning_rate": 5.459163512130279e-06, "loss": 1.1334358978271484, "mean_token_accuracy": 0.7155215102434158, "num_tokens": 39245605.0, "step": 82350 }, { "entropy": 1.0514961647987366, "epoch": 2.0038423190097516, "grad_norm": 23.875, "learning_rate": 5.447225727087399e-06, "loss": 1.0615825653076172, "mean_token_accuracy": 0.7238313961029053, "num_tokens": 39271189.0, "step": 82400 }, { "entropy": 1.0861409276723861, "epoch": 2.005058242747015, "grad_norm": 15.25, "learning_rate": 5.43529612113575e-06, "loss": 1.0702657318115234, "mean_token_accuracy": 0.729324248433113, "num_tokens": 39295708.0, "step": 82450 }, { "entropy": 1.011818115711212, "epoch": 2.0062741664842783, "grad_norm": 12.9375, "learning_rate": 5.423374715706955e-06, "loss": 0.9968651580810547, "mean_token_accuracy": 0.7449310600757599, "num_tokens": 39319903.0, "step": 82500 }, { "entropy": 0.8433882063627243, "epoch": 2.007490090221541, "grad_norm": 20.625, "learning_rate": 5.411461532217899e-06, "loss": 0.8211443328857422, "mean_token_accuracy": 0.7781620192527771, "num_tokens": 39339717.0, "step": 82550 }, { "entropy": 0.9054801541566849, "epoch": 2.0087060139588044, "grad_norm": 20.875, "learning_rate": 5.399556592070703e-06, "loss": 0.8661109161376953, "mean_token_accuracy": 0.7665883100032806, "num_tokens": 39361225.0, "step": 82600 }, { "entropy": 1.0159638500213624, "epoch": 2.0099219376960678, "grad_norm": 19.125, "learning_rate": 5.387659916652668e-06, "loss": 1.0221646118164063, "mean_token_accuracy": 0.7324800646305084, "num_tokens": 39383299.0, "step": 82650 }, { "entropy": 1.002085320353508, "epoch": 2.011137861433331, "grad_norm": 24.25, "learning_rate": 5.3757715273362645e-06, "loss": 1.0027784729003906, "mean_token_accuracy": 0.7395622301101684, "num_tokens": 39406969.0, "step": 82700 }, { "entropy": 1.0224939519166947, "epoch": 2.012353785170594, "grad_norm": 26.0, "learning_rate": 5.363891445479068e-06, "loss": 1.0303044891357422, "mean_token_accuracy": 0.7448958468437195, "num_tokens": 39431193.0, "step": 82750 }, { "entropy": 0.9765566331148148, "epoch": 2.0135697089078572, "grad_norm": 14.9375, "learning_rate": 5.352019692423728e-06, "loss": 0.962557144165039, "mean_token_accuracy": 0.7449906945228577, "num_tokens": 39455725.0, "step": 82800 }, { "entropy": 1.006060311794281, "epoch": 2.0147856326451206, "grad_norm": 19.125, "learning_rate": 5.340156289497929e-06, "loss": 0.9982253265380859, "mean_token_accuracy": 0.7435039508342743, "num_tokens": 39478810.0, "step": 82850 }, { "entropy": 1.0440497654676437, "epoch": 2.016001556382384, "grad_norm": 20.25, "learning_rate": 5.328301258014365e-06, "loss": 1.0497218322753907, "mean_token_accuracy": 0.72787069439888, "num_tokens": 39500081.0, "step": 82900 }, { "entropy": 0.9894450759887695, "epoch": 2.0172174801196467, "grad_norm": 8.9375, "learning_rate": 5.316454619270677e-06, "loss": 0.944776382446289, "mean_token_accuracy": 0.750332088470459, "num_tokens": 39521803.0, "step": 82950 }, { "entropy": 1.0366404205560684, "epoch": 2.01843340385691, "grad_norm": 21.125, "learning_rate": 5.304616394549445e-06, "loss": 1.06203369140625, "mean_token_accuracy": 0.7275150483846664, "num_tokens": 39545162.0, "step": 83000 }, { "entropy": 1.041969463825226, "epoch": 2.0196493275941734, "grad_norm": 15.125, "learning_rate": 5.292786605118112e-06, "loss": 1.0445215606689453, "mean_token_accuracy": 0.7319275224208832, "num_tokens": 39569057.0, "step": 83050 }, { "entropy": 1.0377963173389435, "epoch": 2.0208652513314367, "grad_norm": 10.5625, "learning_rate": 5.280965272228989e-06, "loss": 1.0262889862060547, "mean_token_accuracy": 0.7375992906093597, "num_tokens": 39593551.0, "step": 83100 }, { "entropy": 1.1082997435331345, "epoch": 2.0220811750686996, "grad_norm": 13.1875, "learning_rate": 5.269152417119179e-06, "loss": 1.1098148345947265, "mean_token_accuracy": 0.7139271652698517, "num_tokens": 39622787.0, "step": 83150 }, { "entropy": 1.040958948135376, "epoch": 2.023297098805963, "grad_norm": 18.5, "learning_rate": 5.257348061010557e-06, "loss": 1.0601010894775391, "mean_token_accuracy": 0.7302805376052857, "num_tokens": 39648441.0, "step": 83200 }, { "entropy": 1.005948098897934, "epoch": 2.024513022543226, "grad_norm": 25.375, "learning_rate": 5.245552225109739e-06, "loss": 1.0064572143554686, "mean_token_accuracy": 0.7420567083358764, "num_tokens": 39671442.0, "step": 83250 }, { "entropy": 0.976248431801796, "epoch": 2.0257289462804895, "grad_norm": 21.75, "learning_rate": 5.233764930608022e-06, "loss": 0.98120361328125, "mean_token_accuracy": 0.7453186202049256, "num_tokens": 39695612.0, "step": 83300 }, { "entropy": 1.0113830423355104, "epoch": 2.0269448700177524, "grad_norm": 24.0, "learning_rate": 5.2219861986813654e-06, "loss": 1.0170974731445312, "mean_token_accuracy": 0.7411076700687409, "num_tokens": 39720019.0, "step": 83350 }, { "entropy": 1.0613459295034409, "epoch": 2.0281607937550157, "grad_norm": 7.78125, "learning_rate": 5.210216050490351e-06, "loss": 1.0661268615722657, "mean_token_accuracy": 0.7259841549396515, "num_tokens": 39748299.0, "step": 83400 }, { "entropy": 1.1020708340406418, "epoch": 2.029376717492279, "grad_norm": 10.4375, "learning_rate": 5.198454507180127e-06, "loss": 1.1233419799804687, "mean_token_accuracy": 0.7196387326717377, "num_tokens": 39771434.0, "step": 83450 }, { "entropy": 1.0254553407430649, "epoch": 2.030592641229542, "grad_norm": 19.375, "learning_rate": 5.186701589880392e-06, "loss": 1.0286945343017577, "mean_token_accuracy": 0.7319644087553024, "num_tokens": 39795510.0, "step": 83500 }, { "entropy": 0.9912846016883851, "epoch": 2.031808564966805, "grad_norm": 26.5, "learning_rate": 5.174957319705338e-06, "loss": 0.9996720123291015, "mean_token_accuracy": 0.7445267581939697, "num_tokens": 39818510.0, "step": 83550 }, { "entropy": 1.0784502565860747, "epoch": 2.0330244887040685, "grad_norm": 15.75, "learning_rate": 5.1632217177536356e-06, "loss": 1.0843694305419922, "mean_token_accuracy": 0.7288901817798614, "num_tokens": 39844282.0, "step": 83600 }, { "entropy": 0.8673380145430565, "epoch": 2.034240412441332, "grad_norm": 13.5625, "learning_rate": 5.1514948051083815e-06, "loss": 0.8618402862548828, "mean_token_accuracy": 0.7695235884189606, "num_tokens": 39868825.0, "step": 83650 }, { "entropy": 1.0142088633775712, "epoch": 2.0354563361785947, "grad_norm": 10.75, "learning_rate": 5.139776602837048e-06, "loss": 1.0192671203613282, "mean_token_accuracy": 0.7403693234920502, "num_tokens": 39891439.0, "step": 83700 }, { "entropy": 0.9686830002069473, "epoch": 2.036672259915858, "grad_norm": 27.5, "learning_rate": 5.1280671319914745e-06, "loss": 0.9713897705078125, "mean_token_accuracy": 0.7528708863258362, "num_tokens": 39912197.0, "step": 83750 }, { "entropy": 0.9887857902050018, "epoch": 2.0378881836531213, "grad_norm": 8.5, "learning_rate": 5.1163664136078095e-06, "loss": 0.9969513702392578, "mean_token_accuracy": 0.747105787396431, "num_tokens": 39939981.0, "step": 83800 }, { "entropy": 1.0554524487257004, "epoch": 2.0391041073903846, "grad_norm": 13.375, "learning_rate": 5.1046744687064676e-06, "loss": 1.0523094940185547, "mean_token_accuracy": 0.7251684075593948, "num_tokens": 39963449.0, "step": 83850 }, { "entropy": 0.9139573490619659, "epoch": 2.0403200311276475, "grad_norm": 11.9375, "learning_rate": 5.092991318292121e-06, "loss": 0.9060731506347657, "mean_token_accuracy": 0.7586776340007781, "num_tokens": 39989236.0, "step": 83900 }, { "entropy": 1.0480838775634767, "epoch": 2.041535954864911, "grad_norm": 16.0, "learning_rate": 5.081316983353624e-06, "loss": 1.0826076507568358, "mean_token_accuracy": 0.7183652758598328, "num_tokens": 40013168.0, "step": 83950 }, { "entropy": 0.9718182104825973, "epoch": 2.042751878602174, "grad_norm": 13.375, "learning_rate": 5.069651484864009e-06, "loss": 0.9610228729248047, "mean_token_accuracy": 0.7458216321468353, "num_tokens": 40037246.0, "step": 84000 }, { "entropy": 1.0338670879602432, "epoch": 2.0439678023394374, "grad_norm": 15.5, "learning_rate": 5.057994843780424e-06, "loss": 1.0446466827392578, "mean_token_accuracy": 0.7304138386249542, "num_tokens": 40060851.0, "step": 84050 }, { "entropy": 1.028624368906021, "epoch": 2.0451837260767003, "grad_norm": 22.25, "learning_rate": 5.046347081044103e-06, "loss": 1.020725326538086, "mean_token_accuracy": 0.7338481748104095, "num_tokens": 40082838.0, "step": 84100 }, { "entropy": 0.973401500582695, "epoch": 2.0463996498139636, "grad_norm": 11.8125, "learning_rate": 5.034708217580341e-06, "loss": 0.9872682189941406, "mean_token_accuracy": 0.7495310777425765, "num_tokens": 40110373.0, "step": 84150 }, { "entropy": 1.0101853227615356, "epoch": 2.047615573551227, "grad_norm": 21.625, "learning_rate": 5.023078274298432e-06, "loss": 1.0139545440673827, "mean_token_accuracy": 0.738765742778778, "num_tokens": 40132230.0, "step": 84200 }, { "entropy": 0.9146408551931381, "epoch": 2.0488314972884902, "grad_norm": 16.0, "learning_rate": 5.011457272091655e-06, "loss": 0.9016697692871094, "mean_token_accuracy": 0.7650940442085266, "num_tokens": 40155390.0, "step": 84250 }, { "entropy": 1.0337323409318924, "epoch": 2.050047421025753, "grad_norm": 27.875, "learning_rate": 4.999845231837227e-06, "loss": 1.0600850677490234, "mean_token_accuracy": 0.7324154615402222, "num_tokens": 40179449.0, "step": 84300 }, { "entropy": 1.0495912969112395, "epoch": 2.0512633447630164, "grad_norm": 31.5, "learning_rate": 4.988242174396256e-06, "loss": 1.0577619934082032, "mean_token_accuracy": 0.7279470956325531, "num_tokens": 40202035.0, "step": 84350 }, { "entropy": 0.9615462498366832, "epoch": 2.0524792685002797, "grad_norm": 18.25, "learning_rate": 4.976648120613718e-06, "loss": 0.9692646789550782, "mean_token_accuracy": 0.7450887793302536, "num_tokens": 40227561.0, "step": 84400 }, { "entropy": 1.0402577170729637, "epoch": 2.053695192237543, "grad_norm": 13.375, "learning_rate": 4.9650630913184075e-06, "loss": 1.083727798461914, "mean_token_accuracy": 0.7254922533035278, "num_tokens": 40252968.0, "step": 84450 }, { "entropy": 1.0224912905693053, "epoch": 2.054911115974806, "grad_norm": 12.6875, "learning_rate": 4.953487107322915e-06, "loss": 1.0101384735107422, "mean_token_accuracy": 0.7363063848018646, "num_tokens": 40276731.0, "step": 84500 }, { "entropy": 1.0344573718309402, "epoch": 2.0561270397120692, "grad_norm": 22.875, "learning_rate": 4.941920189423585e-06, "loss": 1.024039764404297, "mean_token_accuracy": 0.7318169337511062, "num_tokens": 40302776.0, "step": 84550 }, { "entropy": 1.0109822350740432, "epoch": 2.0573429634493325, "grad_norm": 11.3125, "learning_rate": 4.9303623584004555e-06, "loss": 1.0037932586669922, "mean_token_accuracy": 0.7459576642513275, "num_tokens": 40326186.0, "step": 84600 }, { "entropy": 0.9683415031433106, "epoch": 2.058558887186596, "grad_norm": 13.5625, "learning_rate": 4.918813635017262e-06, "loss": 0.9838975524902344, "mean_token_accuracy": 0.7402579498291015, "num_tokens": 40350367.0, "step": 84650 }, { "entropy": 0.9817614591121674, "epoch": 2.0597748109238587, "grad_norm": 17.5, "learning_rate": 4.907274040021365e-06, "loss": 0.9835805511474609, "mean_token_accuracy": 0.7493320584297181, "num_tokens": 40372557.0, "step": 84700 }, { "entropy": 1.0753659018874169, "epoch": 2.060990734661122, "grad_norm": 21.875, "learning_rate": 4.895743594143721e-06, "loss": 1.0676514434814453, "mean_token_accuracy": 0.7284519708156586, "num_tokens": 40402428.0, "step": 84750 }, { "entropy": 0.9248937958478928, "epoch": 2.0622066583983854, "grad_norm": 6.6875, "learning_rate": 4.884222318098869e-06, "loss": 0.9092051696777343, "mean_token_accuracy": 0.7660789692401886, "num_tokens": 40425346.0, "step": 84800 }, { "entropy": 1.0144396007061005, "epoch": 2.0634225821356487, "grad_norm": 16.375, "learning_rate": 4.872710232584854e-06, "loss": 1.025062026977539, "mean_token_accuracy": 0.7279070734977722, "num_tokens": 40445884.0, "step": 84850 }, { "entropy": 1.0353853780031204, "epoch": 2.0646385058729115, "grad_norm": 14.5, "learning_rate": 4.861207358283223e-06, "loss": 1.0416678619384765, "mean_token_accuracy": 0.7345634239912033, "num_tokens": 40469633.0, "step": 84900 }, { "entropy": 1.0624809947609901, "epoch": 2.065854429610175, "grad_norm": 15.375, "learning_rate": 4.849713715858979e-06, "loss": 1.0959622192382812, "mean_token_accuracy": 0.7219945073127747, "num_tokens": 40492307.0, "step": 84950 }, { "entropy": 1.0068687188625336, "epoch": 2.067070353347438, "grad_norm": 19.625, "learning_rate": 4.838229325960516e-06, "loss": 1.008110809326172, "mean_token_accuracy": 0.7435042518377304, "num_tokens": 40517836.0, "step": 85000 }, { "entropy": 0.9351586270332336, "epoch": 2.0682862770847015, "grad_norm": 13.1875, "learning_rate": 4.826754209219635e-06, "loss": 0.9492865753173828, "mean_token_accuracy": 0.7498727357387542, "num_tokens": 40541229.0, "step": 85050 }, { "entropy": 0.9728618478775024, "epoch": 2.0695022008219643, "grad_norm": 25.375, "learning_rate": 4.815288386251455e-06, "loss": 0.9523411560058593, "mean_token_accuracy": 0.7546172726154328, "num_tokens": 40563580.0, "step": 85100 }, { "entropy": 0.9841512072086335, "epoch": 2.0707181245592277, "grad_norm": 14.9375, "learning_rate": 4.8038318776544125e-06, "loss": 0.9966397857666016, "mean_token_accuracy": 0.7503954708576203, "num_tokens": 40584872.0, "step": 85150 }, { "entropy": 0.8623242962360382, "epoch": 2.071934048296491, "grad_norm": 11.5625, "learning_rate": 4.79238470401021e-06, "loss": 0.8516252899169922, "mean_token_accuracy": 0.7701353538036346, "num_tokens": 40609444.0, "step": 85200 }, { "entropy": 0.9282606381177902, "epoch": 2.073149972033754, "grad_norm": 16.25, "learning_rate": 4.780946885883774e-06, "loss": 0.9241779327392579, "mean_token_accuracy": 0.7554946577548981, "num_tokens": 40633658.0, "step": 85250 }, { "entropy": 0.9382172572612762, "epoch": 2.074365895771017, "grad_norm": 12.875, "learning_rate": 4.769518443823222e-06, "loss": 0.9188758087158203, "mean_token_accuracy": 0.752756633758545, "num_tokens": 40658767.0, "step": 85300 }, { "entropy": 1.000344721674919, "epoch": 2.0755818195082805, "grad_norm": 11.375, "learning_rate": 4.758099398359831e-06, "loss": 1.0314955139160156, "mean_token_accuracy": 0.7352711236476899, "num_tokens": 40684053.0, "step": 85350 }, { "entropy": 1.0180498522520065, "epoch": 2.076797743245544, "grad_norm": 18.375, "learning_rate": 4.746689770008001e-06, "loss": 1.0225936126708985, "mean_token_accuracy": 0.7344070756435395, "num_tokens": 40707916.0, "step": 85400 }, { "entropy": 1.042858436703682, "epoch": 2.0780136669828067, "grad_norm": 24.75, "learning_rate": 4.73528957926521e-06, "loss": 1.0396802520751953, "mean_token_accuracy": 0.7337767255306243, "num_tokens": 40732728.0, "step": 85450 }, { "entropy": 0.9442036521434783, "epoch": 2.07922959072007, "grad_norm": 17.875, "learning_rate": 4.723898846611977e-06, "loss": 0.9624746704101562, "mean_token_accuracy": 0.7415838372707367, "num_tokens": 40756796.0, "step": 85500 }, { "entropy": 0.9676345548033715, "epoch": 2.0804455144573333, "grad_norm": 20.75, "learning_rate": 4.712517592511841e-06, "loss": 0.9702048492431641, "mean_token_accuracy": 0.7441361176967621, "num_tokens": 40778686.0, "step": 85550 }, { "entropy": 0.8367918384075165, "epoch": 2.0816614381945966, "grad_norm": 18.875, "learning_rate": 4.701145837411301e-06, "loss": 0.830169677734375, "mean_token_accuracy": 0.7795566618442535, "num_tokens": 40799546.0, "step": 85600 }, { "entropy": 0.9794091922044754, "epoch": 2.0828773619318595, "grad_norm": 28.25, "learning_rate": 4.689783601739793e-06, "loss": 0.9889635467529296, "mean_token_accuracy": 0.7458252370357513, "num_tokens": 40825571.0, "step": 85650 }, { "entropy": 1.0063688004016875, "epoch": 2.0840932856691228, "grad_norm": 14.9375, "learning_rate": 4.6784309059096615e-06, "loss": 1.0109487915039062, "mean_token_accuracy": 0.7353627598285675, "num_tokens": 40852750.0, "step": 85700 }, { "entropy": 1.0200265401601791, "epoch": 2.085309209406386, "grad_norm": 29.25, "learning_rate": 4.667087770316096e-06, "loss": 1.0151416015625, "mean_token_accuracy": 0.7382558059692382, "num_tokens": 40873736.0, "step": 85750 }, { "entropy": 1.0778705549240113, "epoch": 2.0865251331436494, "grad_norm": 25.5, "learning_rate": 4.655754215337124e-06, "loss": 1.0651473999023438, "mean_token_accuracy": 0.735242925286293, "num_tokens": 40898358.0, "step": 85800 }, { "entropy": 1.0009285092353821, "epoch": 2.0877410568809123, "grad_norm": 13.9375, "learning_rate": 4.644430261333562e-06, "loss": 0.9954179382324219, "mean_token_accuracy": 0.741981440782547, "num_tokens": 40918917.0, "step": 85850 }, { "entropy": 1.080433520078659, "epoch": 2.0889569806181756, "grad_norm": 13.1875, "learning_rate": 4.6331159286489695e-06, "loss": 1.071349105834961, "mean_token_accuracy": 0.7300617837905884, "num_tokens": 40940273.0, "step": 85900 }, { "entropy": 1.0286549496650697, "epoch": 2.090172904355439, "grad_norm": 21.875, "learning_rate": 4.621811237609624e-06, "loss": 1.0229792022705078, "mean_token_accuracy": 0.7430519366264343, "num_tokens": 40963811.0, "step": 85950 }, { "entropy": 1.0036456930637359, "epoch": 2.091388828092702, "grad_norm": 15.8125, "learning_rate": 4.610516208524482e-06, "loss": 1.0159654235839843, "mean_token_accuracy": 0.739109696149826, "num_tokens": 40985464.0, "step": 86000 }, { "entropy": 1.0089019286632537, "epoch": 2.092604751829965, "grad_norm": 54.5, "learning_rate": 4.599230861685145e-06, "loss": 1.0220670318603515, "mean_token_accuracy": 0.7389716029167175, "num_tokens": 41010001.0, "step": 86050 }, { "entropy": 1.0203933775424958, "epoch": 2.0938206755672284, "grad_norm": 15.8125, "learning_rate": 4.587955217365821e-06, "loss": 1.0406178283691405, "mean_token_accuracy": 0.7402129626274109, "num_tokens": 41031325.0, "step": 86100 }, { "entropy": 1.0086980086565018, "epoch": 2.0950365993044917, "grad_norm": 11.625, "learning_rate": 4.576689295823281e-06, "loss": 1.0005313110351564, "mean_token_accuracy": 0.7395895981788635, "num_tokens": 41058235.0, "step": 86150 }, { "entropy": 0.9429724800586701, "epoch": 2.096252523041755, "grad_norm": 18.0, "learning_rate": 4.565433117296841e-06, "loss": 0.9466065216064453, "mean_token_accuracy": 0.7433424699306488, "num_tokens": 41082611.0, "step": 86200 }, { "entropy": 0.9341649651527405, "epoch": 2.097468446779018, "grad_norm": 10.375, "learning_rate": 4.554186702008292e-06, "loss": 0.9203137969970703, "mean_token_accuracy": 0.7606729739904403, "num_tokens": 41106260.0, "step": 86250 }, { "entropy": 0.9928660893440246, "epoch": 2.098684370516281, "grad_norm": 19.875, "learning_rate": 4.542950070161907e-06, "loss": 1.006229782104492, "mean_token_accuracy": 0.7326812350749969, "num_tokens": 41132666.0, "step": 86300 }, { "entropy": 0.8766815394163132, "epoch": 2.0999002942535445, "grad_norm": 17.75, "learning_rate": 4.531723241944376e-06, "loss": 0.8540803527832032, "mean_token_accuracy": 0.7743929159641266, "num_tokens": 41154852.0, "step": 86350 }, { "entropy": 0.979033288359642, "epoch": 2.101116217990808, "grad_norm": 27.125, "learning_rate": 4.520506237524769e-06, "loss": 0.9507260131835937, "mean_token_accuracy": 0.7531220883131027, "num_tokens": 41180119.0, "step": 86400 }, { "entropy": 0.9699729919433594, "epoch": 2.1023321417280707, "grad_norm": 28.125, "learning_rate": 4.509299077054522e-06, "loss": 0.9943119049072265, "mean_token_accuracy": 0.7391938412189484, "num_tokens": 41200194.0, "step": 86450 }, { "entropy": 0.9724733752012252, "epoch": 2.103548065465334, "grad_norm": 18.25, "learning_rate": 4.498101780667373e-06, "loss": 0.9883483123779296, "mean_token_accuracy": 0.7433962273597717, "num_tokens": 41221268.0, "step": 86500 }, { "entropy": 1.0094112271070481, "epoch": 2.1047639892025973, "grad_norm": 9.5625, "learning_rate": 4.4869143684793425e-06, "loss": 1.037797088623047, "mean_token_accuracy": 0.7295894718170166, "num_tokens": 41245919.0, "step": 86550 }, { "entropy": 0.9308664238452912, "epoch": 2.10597991293986, "grad_norm": 18.25, "learning_rate": 4.475736860588703e-06, "loss": 0.9334203338623047, "mean_token_accuracy": 0.7485705745220185, "num_tokens": 41269560.0, "step": 86600 }, { "entropy": 0.9925245302915573, "epoch": 2.1071958366771235, "grad_norm": 21.5, "learning_rate": 4.464569277075918e-06, "loss": 0.9804193878173828, "mean_token_accuracy": 0.7435371220111847, "num_tokens": 41294859.0, "step": 86650 }, { "entropy": 0.8935581290721893, "epoch": 2.108411760414387, "grad_norm": 8.125, "learning_rate": 4.453411638003633e-06, "loss": 0.9020847320556641, "mean_token_accuracy": 0.7677188956737518, "num_tokens": 41317936.0, "step": 86700 }, { "entropy": 1.0250574266910553, "epoch": 2.10962768415165, "grad_norm": 15.875, "learning_rate": 4.442263963416632e-06, "loss": 1.019996337890625, "mean_token_accuracy": 0.736659722328186, "num_tokens": 41337680.0, "step": 86750 }, { "entropy": 1.0252177178859712, "epoch": 2.110843607888913, "grad_norm": 11.5625, "learning_rate": 4.431126273341787e-06, "loss": 1.0451042938232422, "mean_token_accuracy": 0.7266496038436889, "num_tokens": 41364625.0, "step": 86800 }, { "entropy": 0.9971945458650588, "epoch": 2.1120595316261763, "grad_norm": 16.625, "learning_rate": 4.4199985877880355e-06, "loss": 1.0294869232177735, "mean_token_accuracy": 0.7283528292179108, "num_tokens": 41391585.0, "step": 86850 }, { "entropy": 0.8929921633005142, "epoch": 2.1132754553634396, "grad_norm": 21.625, "learning_rate": 4.408880926746342e-06, "loss": 0.8942640686035156, "mean_token_accuracy": 0.7591548478603363, "num_tokens": 41415217.0, "step": 86900 }, { "entropy": 0.9683902657032013, "epoch": 2.114491379100703, "grad_norm": 15.25, "learning_rate": 4.397773310189665e-06, "loss": 0.9807273101806641, "mean_token_accuracy": 0.74647829413414, "num_tokens": 41439449.0, "step": 86950 }, { "entropy": 0.9331954163312912, "epoch": 2.115707302837966, "grad_norm": 13.875, "learning_rate": 4.3866757580729204e-06, "loss": 0.9390144348144531, "mean_token_accuracy": 0.7459540498256684, "num_tokens": 41464843.0, "step": 87000 }, { "entropy": 0.9677608400583267, "epoch": 2.116923226575229, "grad_norm": 12.75, "learning_rate": 4.375588290332933e-06, "loss": 1.0034735870361329, "mean_token_accuracy": 0.7388115578889847, "num_tokens": 41488977.0, "step": 87050 }, { "entropy": 1.0248280361294746, "epoch": 2.1181391503124924, "grad_norm": 11.4375, "learning_rate": 4.364510926888422e-06, "loss": 1.033421401977539, "mean_token_accuracy": 0.7332920414209366, "num_tokens": 41514902.0, "step": 87100 }, { "entropy": 0.9311991554498672, "epoch": 2.1193550740497558, "grad_norm": 12.8125, "learning_rate": 4.353443687639949e-06, "loss": 0.9265489959716797, "mean_token_accuracy": 0.7500851267576217, "num_tokens": 41540651.0, "step": 87150 }, { "entropy": 0.8945588898658753, "epoch": 2.1205709977870186, "grad_norm": 18.0, "learning_rate": 4.342386592469883e-06, "loss": 0.890170669555664, "mean_token_accuracy": 0.7657928943634034, "num_tokens": 41564328.0, "step": 87200 }, { "entropy": 1.0748685961961746, "epoch": 2.121786921524282, "grad_norm": 19.125, "learning_rate": 4.331339661242379e-06, "loss": 1.0769546508789063, "mean_token_accuracy": 0.7283113300800323, "num_tokens": 41588614.0, "step": 87250 }, { "entropy": 1.0431868135929108, "epoch": 2.1230028452615453, "grad_norm": 18.0, "learning_rate": 4.320302913803322e-06, "loss": 1.0686030578613281, "mean_token_accuracy": 0.7271284306049347, "num_tokens": 41610722.0, "step": 87300 }, { "entropy": 1.022841340303421, "epoch": 2.1242187689988086, "grad_norm": 18.875, "learning_rate": 4.30927636998031e-06, "loss": 1.0150672149658204, "mean_token_accuracy": 0.7359809988737106, "num_tokens": 41637337.0, "step": 87350 }, { "entropy": 0.9412894344329834, "epoch": 2.1254346927360714, "grad_norm": 14.375, "learning_rate": 4.298260049582613e-06, "loss": 0.9280206298828125, "mean_token_accuracy": 0.7683633255958557, "num_tokens": 41658123.0, "step": 87400 }, { "entropy": 0.9607746195793152, "epoch": 2.1266506164733348, "grad_norm": 11.0625, "learning_rate": 4.2872539724011174e-06, "loss": 0.9541420745849609, "mean_token_accuracy": 0.7390760231018066, "num_tokens": 41684178.0, "step": 87450 }, { "entropy": 1.0599571090936661, "epoch": 2.127866540210598, "grad_norm": 28.0, "learning_rate": 4.276258158208326e-06, "loss": 1.0482490539550782, "mean_token_accuracy": 0.7267882895469665, "num_tokens": 41708158.0, "step": 87500 }, { "entropy": 0.9054130190610885, "epoch": 2.1290824639478614, "grad_norm": 15.5, "learning_rate": 4.265272626758292e-06, "loss": 0.9175888824462891, "mean_token_accuracy": 0.7518428611755371, "num_tokens": 41731648.0, "step": 87550 }, { "entropy": 0.9865778189897537, "epoch": 2.1302983876851243, "grad_norm": 9.1875, "learning_rate": 4.254297397786605e-06, "loss": 1.0062123870849609, "mean_token_accuracy": 0.7362125289440155, "num_tokens": 41757245.0, "step": 87600 }, { "entropy": 0.9800968259572983, "epoch": 2.1315143114223876, "grad_norm": 16.125, "learning_rate": 4.243332491010341e-06, "loss": 0.9823035430908204, "mean_token_accuracy": 0.7446006453037262, "num_tokens": 41782570.0, "step": 87650 }, { "entropy": 1.0548397880792617, "epoch": 2.132730235159651, "grad_norm": 9.6875, "learning_rate": 4.2323779261280325e-06, "loss": 1.0521666717529297, "mean_token_accuracy": 0.7276716953516007, "num_tokens": 41805730.0, "step": 87700 }, { "entropy": 0.9817381501197815, "epoch": 2.133946158896914, "grad_norm": 20.25, "learning_rate": 4.221433722819632e-06, "loss": 0.9799076843261719, "mean_token_accuracy": 0.7482276445627213, "num_tokens": 41828835.0, "step": 87750 }, { "entropy": 1.0604998952150344, "epoch": 2.135162082634177, "grad_norm": 12.25, "learning_rate": 4.2104999007464755e-06, "loss": 1.0692472076416015, "mean_token_accuracy": 0.7214491477608681, "num_tokens": 41856405.0, "step": 87800 }, { "entropy": 0.947783961892128, "epoch": 2.1363780063714404, "grad_norm": 13.8125, "learning_rate": 4.199576479551255e-06, "loss": 0.9638334655761719, "mean_token_accuracy": 0.7476010239124298, "num_tokens": 41879147.0, "step": 87850 }, { "entropy": 1.0053998929262162, "epoch": 2.1375939301087037, "grad_norm": 16.125, "learning_rate": 4.188663478857976e-06, "loss": 1.0220954132080078, "mean_token_accuracy": 0.7326918017864227, "num_tokens": 41904601.0, "step": 87900 }, { "entropy": 0.9939982217550277, "epoch": 2.1388098538459666, "grad_norm": 7.625, "learning_rate": 4.177760918271916e-06, "loss": 0.9908580780029297, "mean_token_accuracy": 0.7408955919742585, "num_tokens": 41933547.0, "step": 87950 }, { "entropy": 0.9497399067878723, "epoch": 2.14002577758323, "grad_norm": 6.8125, "learning_rate": 4.16686881737961e-06, "loss": 0.9446603393554688, "mean_token_accuracy": 0.7496318221092224, "num_tokens": 41958108.0, "step": 88000 }, { "entropy": 1.0137356626987457, "epoch": 2.141241701320493, "grad_norm": 13.125, "learning_rate": 4.15598719574879e-06, "loss": 1.0079136657714844, "mean_token_accuracy": 0.7410041105747223, "num_tokens": 41987327.0, "step": 88050 }, { "entropy": 1.0404154396057128, "epoch": 2.1424576250577565, "grad_norm": 15.625, "learning_rate": 4.145116072928361e-06, "loss": 1.0534268951416015, "mean_token_accuracy": 0.7256925064325332, "num_tokens": 42013113.0, "step": 88100 }, { "entropy": 0.955589793920517, "epoch": 2.14367354879502, "grad_norm": 18.75, "learning_rate": 4.134255468448383e-06, "loss": 0.9523347473144531, "mean_token_accuracy": 0.7502446329593658, "num_tokens": 42035197.0, "step": 88150 }, { "entropy": 0.9552141183614731, "epoch": 2.1448894725322827, "grad_norm": 17.875, "learning_rate": 4.123405401819998e-06, "loss": 0.9717326354980469, "mean_token_accuracy": 0.747040411233902, "num_tokens": 42057368.0, "step": 88200 }, { "entropy": 0.9507090339064598, "epoch": 2.146105396269546, "grad_norm": 25.125, "learning_rate": 4.112565892535434e-06, "loss": 0.9519109344482422, "mean_token_accuracy": 0.754434597492218, "num_tokens": 42085481.0, "step": 88250 }, { "entropy": 0.9474497300386429, "epoch": 2.1473213200068093, "grad_norm": 18.0, "learning_rate": 4.101736960067948e-06, "loss": 0.9311833953857422, "mean_token_accuracy": 0.7558534228801728, "num_tokens": 42105095.0, "step": 88300 }, { "entropy": 0.963852441906929, "epoch": 2.148537243744072, "grad_norm": 16.25, "learning_rate": 4.090918623871791e-06, "loss": 0.9799910736083984, "mean_token_accuracy": 0.7481193840503693, "num_tokens": 42133571.0, "step": 88350 }, { "entropy": 1.0217581099271775, "epoch": 2.1497531674813355, "grad_norm": 14.1875, "learning_rate": 4.080110903382181e-06, "loss": 1.0239775085449219, "mean_token_accuracy": 0.7325740730762482, "num_tokens": 42157435.0, "step": 88400 }, { "entropy": 1.0263285392522812, "epoch": 2.150969091218599, "grad_norm": 28.5, "learning_rate": 4.06931381801526e-06, "loss": 1.0115155029296874, "mean_token_accuracy": 0.731691654920578, "num_tokens": 42182519.0, "step": 88450 }, { "entropy": 1.083029617667198, "epoch": 2.152185014955862, "grad_norm": 14.4375, "learning_rate": 4.058527387168073e-06, "loss": 1.0848270416259767, "mean_token_accuracy": 0.7343082851171494, "num_tokens": 42206808.0, "step": 88500 }, { "entropy": 0.9576358038187027, "epoch": 2.153400938693125, "grad_norm": 13.125, "learning_rate": 4.0477516302185215e-06, "loss": 0.9548990631103516, "mean_token_accuracy": 0.7493225634098053, "num_tokens": 42231892.0, "step": 88550 }, { "entropy": 0.982500855922699, "epoch": 2.1546168624303883, "grad_norm": 14.375, "learning_rate": 4.0369865665253275e-06, "loss": 0.9733237457275391, "mean_token_accuracy": 0.7461659026145935, "num_tokens": 42253643.0, "step": 88600 }, { "entropy": 1.0352164727449418, "epoch": 2.1558327861676516, "grad_norm": 10.75, "learning_rate": 4.026232215428001e-06, "loss": 1.0680189514160157, "mean_token_accuracy": 0.7243820011615754, "num_tokens": 42279516.0, "step": 88650 }, { "entropy": 0.9970568740367889, "epoch": 2.157048709904915, "grad_norm": 12.875, "learning_rate": 4.015488596246811e-06, "loss": 0.9938472747802735, "mean_token_accuracy": 0.745024425983429, "num_tokens": 42305653.0, "step": 88700 }, { "entropy": 0.9829078698158265, "epoch": 2.158264633642178, "grad_norm": 17.25, "learning_rate": 4.004755728282744e-06, "loss": 0.9773599243164063, "mean_token_accuracy": 0.7379100048542022, "num_tokens": 42328752.0, "step": 88750 }, { "entropy": 0.9530080169439316, "epoch": 2.159480557379441, "grad_norm": 24.875, "learning_rate": 3.994033630817481e-06, "loss": 0.9481230163574219, "mean_token_accuracy": 0.7524331969022751, "num_tokens": 42350377.0, "step": 88800 }, { "entropy": 0.9945683334767819, "epoch": 2.1606964811167044, "grad_norm": 12.0625, "learning_rate": 3.983322323113337e-06, "loss": 0.998324966430664, "mean_token_accuracy": 0.7419644606113434, "num_tokens": 42375125.0, "step": 88850 }, { "entropy": 0.9664711159467697, "epoch": 2.1619124048539677, "grad_norm": 19.875, "learning_rate": 3.972621824413258e-06, "loss": 0.9805253601074219, "mean_token_accuracy": 0.7480589962005615, "num_tokens": 42394458.0, "step": 88900 }, { "entropy": 1.1431365442276, "epoch": 2.1631283285912306, "grad_norm": 16.5, "learning_rate": 3.961932153940767e-06, "loss": 1.1439228057861328, "mean_token_accuracy": 0.7119633573293686, "num_tokens": 42418271.0, "step": 88950 }, { "entropy": 1.1176392155885697, "epoch": 2.164344252328494, "grad_norm": 14.8125, "learning_rate": 3.951253330899922e-06, "loss": 1.1159663391113281, "mean_token_accuracy": 0.7165620660781861, "num_tokens": 42443199.0, "step": 89000 }, { "entropy": 0.92720234811306, "epoch": 2.1655601760657572, "grad_norm": 12.625, "learning_rate": 3.940585374475317e-06, "loss": 0.9124219512939453, "mean_token_accuracy": 0.7566104280948639, "num_tokens": 42467552.0, "step": 89050 }, { "entropy": 0.9833748596906662, "epoch": 2.1667760998030205, "grad_norm": 18.25, "learning_rate": 3.929928303832003e-06, "loss": 0.9688626861572266, "mean_token_accuracy": 0.7453418838977813, "num_tokens": 42492292.0, "step": 89100 }, { "entropy": 0.8365012675523757, "epoch": 2.1679920235402834, "grad_norm": 23.125, "learning_rate": 3.9192821381154854e-06, "loss": 0.8461428833007812, "mean_token_accuracy": 0.773023452758789, "num_tokens": 42518955.0, "step": 89150 }, { "entropy": 0.9808978796005249, "epoch": 2.1692079472775467, "grad_norm": 18.75, "learning_rate": 3.90864689645168e-06, "loss": 0.9889413452148438, "mean_token_accuracy": 0.7481205940246582, "num_tokens": 42544578.0, "step": 89200 }, { "entropy": 1.040524154305458, "epoch": 2.17042387101481, "grad_norm": 22.625, "learning_rate": 3.898022597946871e-06, "loss": 1.0698012542724609, "mean_token_accuracy": 0.7270412564277648, "num_tokens": 42569882.0, "step": 89250 }, { "entropy": 1.0142230081558228, "epoch": 2.171639794752073, "grad_norm": 19.25, "learning_rate": 3.887409261687685e-06, "loss": 1.0032269287109374, "mean_token_accuracy": 0.7390109813213348, "num_tokens": 42593654.0, "step": 89300 }, { "entropy": 0.8549220824241638, "epoch": 2.1728557184893362, "grad_norm": 11.8125, "learning_rate": 3.8768069067410515e-06, "loss": 0.8558211517333985, "mean_token_accuracy": 0.7687974441051483, "num_tokens": 42618133.0, "step": 89350 }, { "entropy": 1.054013923406601, "epoch": 2.1740716422265995, "grad_norm": 27.75, "learning_rate": 3.866215552154184e-06, "loss": 1.0442610168457032, "mean_token_accuracy": 0.7260336458683014, "num_tokens": 42644833.0, "step": 89400 }, { "entropy": 0.9581487637758255, "epoch": 2.175287565963863, "grad_norm": 18.75, "learning_rate": 3.855635216954523e-06, "loss": 0.9832208251953125, "mean_token_accuracy": 0.7452184963226318, "num_tokens": 42668278.0, "step": 89450 }, { "entropy": 0.9981160598993302, "epoch": 2.176503489701126, "grad_norm": 10.9375, "learning_rate": 3.845065920149715e-06, "loss": 1.009400405883789, "mean_token_accuracy": 0.7335880744457245, "num_tokens": 42692875.0, "step": 89500 }, { "entropy": 1.0643509531021118, "epoch": 2.177719413438389, "grad_norm": 19.125, "learning_rate": 3.834507680727579e-06, "loss": 1.0826402282714844, "mean_token_accuracy": 0.7294096231460572, "num_tokens": 42719861.0, "step": 89550 }, { "entropy": 0.9540279287099839, "epoch": 2.1789353371756524, "grad_norm": 15.0625, "learning_rate": 3.823960517656064e-06, "loss": 0.9681752014160157, "mean_token_accuracy": 0.7461137163639069, "num_tokens": 42748306.0, "step": 89600 }, { "entropy": 0.9647149765491485, "epoch": 2.1801512609129157, "grad_norm": 11.0625, "learning_rate": 3.813424449883223e-06, "loss": 1.0008318328857422, "mean_token_accuracy": 0.7345140194892883, "num_tokens": 42769908.0, "step": 89650 }, { "entropy": 1.039735741019249, "epoch": 2.1813671846501785, "grad_norm": 15.1875, "learning_rate": 3.80289949633718e-06, "loss": 1.0521786499023438, "mean_token_accuracy": 0.7255383861064911, "num_tokens": 42791050.0, "step": 89700 }, { "entropy": 1.0406758043169975, "epoch": 2.182583108387442, "grad_norm": 22.875, "learning_rate": 3.7923856759260837e-06, "loss": 1.0413614654541015, "mean_token_accuracy": 0.7358419132232666, "num_tokens": 42814545.0, "step": 89750 }, { "entropy": 1.0808759200572968, "epoch": 2.183799032124705, "grad_norm": 34.75, "learning_rate": 3.7818830075380917e-06, "loss": 1.0689360046386718, "mean_token_accuracy": 0.7318137037754059, "num_tokens": 42837490.0, "step": 89800 }, { "entropy": 1.0485502290725708, "epoch": 2.1850149558619685, "grad_norm": 22.375, "learning_rate": 3.771391510041321e-06, "loss": 1.0499148559570313, "mean_token_accuracy": 0.7333120954036713, "num_tokens": 42865192.0, "step": 89850 }, { "entropy": 0.9164808923006058, "epoch": 2.1862308795992313, "grad_norm": 14.75, "learning_rate": 3.760911202283816e-06, "loss": 0.9133340454101563, "mean_token_accuracy": 0.7575887489318848, "num_tokens": 42888215.0, "step": 89900 }, { "entropy": 0.9778921547532081, "epoch": 2.1874468033364947, "grad_norm": 11.9375, "learning_rate": 3.750442103093531e-06, "loss": 0.9862149047851563, "mean_token_accuracy": 0.7504960626363755, "num_tokens": 42911191.0, "step": 89950 }, { "entropy": 1.0189005655050278, "epoch": 2.188662727073758, "grad_norm": 20.125, "learning_rate": 3.7399842312782718e-06, "loss": 1.0344547271728515, "mean_token_accuracy": 0.7262771856784821, "num_tokens": 42933876.0, "step": 90000 }, { "epoch": 2.188662727073758, "eval_entropy": 1.0834449306864216, "eval_loss": 1.306657314300537, "eval_mean_token_accuracy": 0.6793523185178524, "eval_num_tokens": 42933876.0, "eval_runtime": 391.0357, "eval_samples_per_second": 11.684, "eval_steps_per_second": 11.684, "step": 90000 }, { "entropy": 0.9098175525665283, "epoch": 2.1898786508110213, "grad_norm": 16.125, "learning_rate": 3.7295376056256795e-06, "loss": 0.9057701110839844, "mean_token_accuracy": 0.7568503749370575, "num_tokens": 42959732.0, "step": 90050 }, { "entropy": 0.9883267283439636, "epoch": 2.191094574548284, "grad_norm": 23.5, "learning_rate": 3.7191022449031965e-06, "loss": 0.9639598846435546, "mean_token_accuracy": 0.7493646252155304, "num_tokens": 42984251.0, "step": 90100 }, { "entropy": 0.9310219204425811, "epoch": 2.1923104982855475, "grad_norm": 11.375, "learning_rate": 3.7086781678580197e-06, "loss": 0.9363758850097657, "mean_token_accuracy": 0.7581924903392792, "num_tokens": 43007344.0, "step": 90150 }, { "entropy": 1.017392299771309, "epoch": 2.193526422022811, "grad_norm": 9.6875, "learning_rate": 3.698265393217074e-06, "loss": 1.0426522827148437, "mean_token_accuracy": 0.7355796229839325, "num_tokens": 43029304.0, "step": 90200 }, { "entropy": 0.9308489549160004, "epoch": 2.194742345760074, "grad_norm": 12.8125, "learning_rate": 3.687863939686983e-06, "loss": 0.9304693603515625, "mean_token_accuracy": 0.7536886215209961, "num_tokens": 43053073.0, "step": 90250 }, { "entropy": 0.9242009156942368, "epoch": 2.195958269497337, "grad_norm": 13.875, "learning_rate": 3.6774738259540333e-06, "loss": 0.9235498046875, "mean_token_accuracy": 0.7532008647918701, "num_tokens": 43081708.0, "step": 90300 }, { "entropy": 1.0282861608266831, "epoch": 2.1971741932346003, "grad_norm": 27.5, "learning_rate": 3.6670950706841414e-06, "loss": 1.0302552795410156, "mean_token_accuracy": 0.7314958500862122, "num_tokens": 43105421.0, "step": 90350 }, { "entropy": 0.891103350520134, "epoch": 2.1983901169718636, "grad_norm": 15.0, "learning_rate": 3.656727692522808e-06, "loss": 0.8834201049804687, "mean_token_accuracy": 0.7647206270694733, "num_tokens": 43128885.0, "step": 90400 }, { "entropy": 0.9105528491735458, "epoch": 2.199606040709127, "grad_norm": 17.125, "learning_rate": 3.646371710095108e-06, "loss": 0.9164854431152344, "mean_token_accuracy": 0.7585061299800873, "num_tokens": 43149473.0, "step": 90450 }, { "entropy": 1.0268357479572296, "epoch": 2.2008219644463898, "grad_norm": 15.0, "learning_rate": 3.6360271420056336e-06, "loss": 1.0289561462402343, "mean_token_accuracy": 0.7371851944923401, "num_tokens": 43171231.0, "step": 90500 }, { "entropy": 1.0063996776938438, "epoch": 2.202037888183653, "grad_norm": 13.5, "learning_rate": 3.6256940068384728e-06, "loss": 1.0104620361328125, "mean_token_accuracy": 0.7289917600154877, "num_tokens": 43196122.0, "step": 90550 }, { "entropy": 1.0088332641124724, "epoch": 2.2032538119209164, "grad_norm": 15.4375, "learning_rate": 3.6153723231571803e-06, "loss": 0.999853515625, "mean_token_accuracy": 0.7394966459274293, "num_tokens": 43214830.0, "step": 90600 }, { "entropy": 1.0625594455003737, "epoch": 2.2044697356581797, "grad_norm": 10.375, "learning_rate": 3.605062109504729e-06, "loss": 1.0688348388671876, "mean_token_accuracy": 0.7323668789863587, "num_tokens": 43238753.0, "step": 90650 }, { "entropy": 0.989998510479927, "epoch": 2.2056856593954426, "grad_norm": 14.625, "learning_rate": 3.594763384403497e-06, "loss": 0.9946788024902343, "mean_token_accuracy": 0.7431291055679321, "num_tokens": 43263123.0, "step": 90700 }, { "entropy": 1.0172670233249663, "epoch": 2.206901583132706, "grad_norm": 20.75, "learning_rate": 3.5844761663552095e-06, "loss": 1.0401494598388672, "mean_token_accuracy": 0.733867073059082, "num_tokens": 43285611.0, "step": 90750 }, { "entropy": 0.968183079957962, "epoch": 2.208117506869969, "grad_norm": 16.375, "learning_rate": 3.574200473840935e-06, "loss": 0.9838841247558594, "mean_token_accuracy": 0.7393718010187149, "num_tokens": 43310353.0, "step": 90800 }, { "entropy": 0.9430635815858841, "epoch": 2.2093334306072325, "grad_norm": 20.875, "learning_rate": 3.5639363253210236e-06, "loss": 0.9484710693359375, "mean_token_accuracy": 0.74380539894104, "num_tokens": 43332554.0, "step": 90850 }, { "entropy": 1.0472551214694976, "epoch": 2.2105493543444954, "grad_norm": 18.0, "learning_rate": 3.5536837392350863e-06, "loss": 1.0562085723876953, "mean_token_accuracy": 0.7258752596378326, "num_tokens": 43358039.0, "step": 90900 }, { "entropy": 0.9782012236118317, "epoch": 2.2117652780817587, "grad_norm": 20.875, "learning_rate": 3.5434427340019728e-06, "loss": 1.002340316772461, "mean_token_accuracy": 0.7404397279024124, "num_tokens": 43379178.0, "step": 90950 }, { "entropy": 0.993825306892395, "epoch": 2.212981201819022, "grad_norm": 25.625, "learning_rate": 3.533213328019721e-06, "loss": 0.9954617309570313, "mean_token_accuracy": 0.7459117114543915, "num_tokens": 43405035.0, "step": 91000 }, { "entropy": 1.0840084981918334, "epoch": 2.214197125556285, "grad_norm": 15.0625, "learning_rate": 3.522995539665531e-06, "loss": 1.0696353912353516, "mean_token_accuracy": 0.729013602733612, "num_tokens": 43429837.0, "step": 91050 }, { "entropy": 1.0168333107233047, "epoch": 2.215413049293548, "grad_norm": 9.0625, "learning_rate": 3.512789387295732e-06, "loss": 1.0241415405273437, "mean_token_accuracy": 0.7408850562572479, "num_tokens": 43451559.0, "step": 91100 }, { "entropy": 0.92669252961874, "epoch": 2.2166289730308115, "grad_norm": 3.890625, "learning_rate": 3.502594889245744e-06, "loss": 0.9046266174316406, "mean_token_accuracy": 0.7552792119979859, "num_tokens": 43474604.0, "step": 91150 }, { "entropy": 1.020438425540924, "epoch": 2.217844896768075, "grad_norm": 21.125, "learning_rate": 3.4924120638300584e-06, "loss": 1.0201556396484375, "mean_token_accuracy": 0.733896609544754, "num_tokens": 43493276.0, "step": 91200 }, { "entropy": 0.961784882247448, "epoch": 2.2190608205053377, "grad_norm": 9.375, "learning_rate": 3.4822409293421975e-06, "loss": 0.9530787658691406, "mean_token_accuracy": 0.7490857183933258, "num_tokens": 43516572.0, "step": 91250 }, { "entropy": 0.997021204829216, "epoch": 2.220276744242601, "grad_norm": 25.75, "learning_rate": 3.4720815040546684e-06, "loss": 1.0014445495605468, "mean_token_accuracy": 0.7432357573509216, "num_tokens": 43541191.0, "step": 91300 }, { "entropy": 0.9187290859222412, "epoch": 2.2214926679798643, "grad_norm": 12.0, "learning_rate": 3.4619338062189587e-06, "loss": 0.9118334197998047, "mean_token_accuracy": 0.7611398720741271, "num_tokens": 43562708.0, "step": 91350 }, { "entropy": 0.9750349378585815, "epoch": 2.2227085917171276, "grad_norm": 12.125, "learning_rate": 3.4517978540654763e-06, "loss": 0.970452880859375, "mean_token_accuracy": 0.7432847344875335, "num_tokens": 43585612.0, "step": 91400 }, { "entropy": 0.9067876639962197, "epoch": 2.2239245154543905, "grad_norm": 11.0, "learning_rate": 3.4416736658035265e-06, "loss": 0.9384122467041016, "mean_token_accuracy": 0.7491550099849701, "num_tokens": 43611484.0, "step": 91450 }, { "entropy": 0.9748253202438355, "epoch": 2.225140439191654, "grad_norm": 10.75, "learning_rate": 3.4315612596212932e-06, "loss": 0.9591217041015625, "mean_token_accuracy": 0.7418393063545227, "num_tokens": 43633559.0, "step": 91500 }, { "entropy": 1.0522030371427535, "epoch": 2.226356362928917, "grad_norm": 15.875, "learning_rate": 3.4214606536857774e-06, "loss": 1.0454196166992187, "mean_token_accuracy": 0.7319492375850678, "num_tokens": 43655747.0, "step": 91550 }, { "entropy": 0.9518406623601914, "epoch": 2.2275722866661805, "grad_norm": 13.25, "learning_rate": 3.4113718661427965e-06, "loss": 0.9360578918457031, "mean_token_accuracy": 0.7527556169033051, "num_tokens": 43679804.0, "step": 91600 }, { "entropy": 1.0437300145626067, "epoch": 2.2287882104034433, "grad_norm": 16.75, "learning_rate": 3.401294915116923e-06, "loss": 1.0554710388183595, "mean_token_accuracy": 0.7284577536582947, "num_tokens": 43704196.0, "step": 91650 }, { "entropy": 0.9905666759610177, "epoch": 2.2300041341407066, "grad_norm": 25.75, "learning_rate": 3.3912298187114765e-06, "loss": 0.9878391265869141, "mean_token_accuracy": 0.7400980353355407, "num_tokens": 43725813.0, "step": 91700 }, { "entropy": 1.0058441787958146, "epoch": 2.23122005787797, "grad_norm": 27.875, "learning_rate": 3.3811765950084706e-06, "loss": 1.0038873291015624, "mean_token_accuracy": 0.746178640127182, "num_tokens": 43747814.0, "step": 91750 }, { "entropy": 0.9612927466630936, "epoch": 2.2324359816152333, "grad_norm": 11.625, "learning_rate": 3.3711352620685898e-06, "loss": 0.9879490661621094, "mean_token_accuracy": 0.738293958902359, "num_tokens": 43772442.0, "step": 91800 }, { "entropy": 1.0419057488441468, "epoch": 2.233651905352496, "grad_norm": 10.1875, "learning_rate": 3.3611058379311647e-06, "loss": 1.0382782745361328, "mean_token_accuracy": 0.7300703382492065, "num_tokens": 43795002.0, "step": 91850 }, { "entropy": 0.9881766164302825, "epoch": 2.2348678290897595, "grad_norm": 35.75, "learning_rate": 3.351088340614127e-06, "loss": 0.9903181457519531, "mean_token_accuracy": 0.7373605281114578, "num_tokens": 43818238.0, "step": 91900 }, { "entropy": 1.1005197989940643, "epoch": 2.2360837528270228, "grad_norm": 24.625, "learning_rate": 3.3410827881139783e-06, "loss": 1.1319586181640624, "mean_token_accuracy": 0.7125854277610779, "num_tokens": 43839228.0, "step": 91950 }, { "entropy": 1.011505514383316, "epoch": 2.237299676564286, "grad_norm": 13.875, "learning_rate": 3.3310891984057713e-06, "loss": 1.0146851348876953, "mean_token_accuracy": 0.7350240713357925, "num_tokens": 43861517.0, "step": 92000 }, { "entropy": 0.9630370151996612, "epoch": 2.238515600301549, "grad_norm": 25.125, "learning_rate": 3.32110758944305e-06, "loss": 0.9814521026611328, "mean_token_accuracy": 0.7434720075130463, "num_tokens": 43884308.0, "step": 92050 }, { "entropy": 1.0209509658813476, "epoch": 2.2397315240388123, "grad_norm": 10.5, "learning_rate": 3.311137979157851e-06, "loss": 1.0251439666748048, "mean_token_accuracy": 0.7341196465492249, "num_tokens": 43908028.0, "step": 92100 }, { "entropy": 0.9067856204509736, "epoch": 2.2409474477760756, "grad_norm": 12.5, "learning_rate": 3.301180385460654e-06, "loss": 0.9254347991943359, "mean_token_accuracy": 0.7640578019618988, "num_tokens": 43928855.0, "step": 92150 }, { "entropy": 0.9623745155334472, "epoch": 2.242163371513339, "grad_norm": 19.875, "learning_rate": 3.2912348262403382e-06, "loss": 0.9666206359863281, "mean_token_accuracy": 0.7484768891334533, "num_tokens": 43953268.0, "step": 92200 }, { "entropy": 0.9172753512859344, "epoch": 2.2433792952506018, "grad_norm": 17.125, "learning_rate": 3.2813013193641785e-06, "loss": 0.92166015625, "mean_token_accuracy": 0.7602487647533417, "num_tokens": 43975838.0, "step": 92250 }, { "entropy": 1.040367476940155, "epoch": 2.244595218987865, "grad_norm": 24.0, "learning_rate": 3.2713798826777888e-06, "loss": 1.0610098266601562, "mean_token_accuracy": 0.7247315227985383, "num_tokens": 44003152.0, "step": 92300 }, { "entropy": 0.9974731492996216, "epoch": 2.2458111427251284, "grad_norm": 10.6875, "learning_rate": 3.2614705340050954e-06, "loss": 1.0404570007324219, "mean_token_accuracy": 0.7223964858055115, "num_tokens": 44029043.0, "step": 92350 }, { "entropy": 0.9377450034022331, "epoch": 2.2470270664623913, "grad_norm": 14.6875, "learning_rate": 3.25157329114832e-06, "loss": 0.9610479736328125, "mean_token_accuracy": 0.7494747149944305, "num_tokens": 44048692.0, "step": 92400 }, { "entropy": 0.9542433726787567, "epoch": 2.2482429901996546, "grad_norm": 27.875, "learning_rate": 3.2416881718879232e-06, "loss": 0.95422607421875, "mean_token_accuracy": 0.7444876372814179, "num_tokens": 44071498.0, "step": 92450 }, { "entropy": 0.9486313432455062, "epoch": 2.249458913936918, "grad_norm": 21.25, "learning_rate": 3.231815193982598e-06, "loss": 0.9321025085449218, "mean_token_accuracy": 0.7476832163333893, "num_tokens": 44093893.0, "step": 92500 }, { "entropy": 0.955502741932869, "epoch": 2.250674837674181, "grad_norm": 16.25, "learning_rate": 3.221954375169214e-06, "loss": 0.9345763397216796, "mean_token_accuracy": 0.7517745959758758, "num_tokens": 44118404.0, "step": 92550 }, { "entropy": 0.9286492365598679, "epoch": 2.2518907614114445, "grad_norm": 21.0, "learning_rate": 3.212105733162807e-06, "loss": 0.9335489654541016, "mean_token_accuracy": 0.7589629435539246, "num_tokens": 44139879.0, "step": 92600 }, { "entropy": 1.004187947511673, "epoch": 2.2531066851487074, "grad_norm": 17.375, "learning_rate": 3.2022692856565294e-06, "loss": 1.0005992889404296, "mean_token_accuracy": 0.7378577733039856, "num_tokens": 44165695.0, "step": 92650 }, { "entropy": 1.095622348189354, "epoch": 2.2543226088859707, "grad_norm": 21.5, "learning_rate": 3.1924450503216255e-06, "loss": 1.1054854583740235, "mean_token_accuracy": 0.7165529441833496, "num_tokens": 44189116.0, "step": 92700 }, { "entropy": 0.9120453530550003, "epoch": 2.255538532623234, "grad_norm": 17.5, "learning_rate": 3.1826330448074073e-06, "loss": 0.917801513671875, "mean_token_accuracy": 0.7584377300739288, "num_tokens": 44213947.0, "step": 92750 }, { "entropy": 1.0210186177492142, "epoch": 2.256754456360497, "grad_norm": 27.625, "learning_rate": 3.172833286741215e-06, "loss": 1.0457009887695312, "mean_token_accuracy": 0.7224771332740784, "num_tokens": 44236452.0, "step": 92800 }, { "entropy": 1.0347911870479585, "epoch": 2.25797038009776, "grad_norm": 13.75, "learning_rate": 3.1630457937283764e-06, "loss": 1.0450563049316406, "mean_token_accuracy": 0.732227498292923, "num_tokens": 44259185.0, "step": 92850 }, { "entropy": 1.0375212633609772, "epoch": 2.2591863038350235, "grad_norm": 17.375, "learning_rate": 3.1532705833521993e-06, "loss": 1.0479297637939453, "mean_token_accuracy": 0.7307690805196763, "num_tokens": 44283106.0, "step": 92900 }, { "entropy": 0.9709310352802276, "epoch": 2.260402227572287, "grad_norm": 17.5, "learning_rate": 3.1435076731739156e-06, "loss": 0.9699678802490235, "mean_token_accuracy": 0.7394730997085571, "num_tokens": 44306233.0, "step": 92950 }, { "entropy": 0.8951483643054963, "epoch": 2.2616181513095497, "grad_norm": 18.5, "learning_rate": 3.133757080732658e-06, "loss": 0.8903813934326172, "mean_token_accuracy": 0.7629954421520233, "num_tokens": 44327649.0, "step": 93000 }, { "entropy": 0.9133878070116043, "epoch": 2.262834075046813, "grad_norm": 14.5625, "learning_rate": 3.124018823545443e-06, "loss": 0.8979875946044922, "mean_token_accuracy": 0.7552621245384217, "num_tokens": 44353243.0, "step": 93050 }, { "entropy": 0.8857519036531448, "epoch": 2.2640499987840763, "grad_norm": 23.625, "learning_rate": 3.11429291910711e-06, "loss": 0.9017655944824219, "mean_token_accuracy": 0.7586613166332244, "num_tokens": 44378465.0, "step": 93100 }, { "entropy": 0.9540095192193985, "epoch": 2.2652659225213396, "grad_norm": 24.625, "learning_rate": 3.104579384890324e-06, "loss": 0.9397758483886719, "mean_token_accuracy": 0.7466320621967316, "num_tokens": 44401168.0, "step": 93150 }, { "entropy": 1.0548709380626677, "epoch": 2.2664818462586025, "grad_norm": 16.5, "learning_rate": 3.09487823834551e-06, "loss": 1.0705772399902345, "mean_token_accuracy": 0.7272307014465332, "num_tokens": 44426659.0, "step": 93200 }, { "entropy": 1.018199521303177, "epoch": 2.267697769995866, "grad_norm": 14.625, "learning_rate": 3.085189496900853e-06, "loss": 1.0038623046875, "mean_token_accuracy": 0.7414319324493408, "num_tokens": 44452672.0, "step": 93250 }, { "entropy": 0.9421713554859161, "epoch": 2.268913693733129, "grad_norm": 17.0, "learning_rate": 3.075513177962243e-06, "loss": 0.9422064208984375, "mean_token_accuracy": 0.7530347740650177, "num_tokens": 44478230.0, "step": 93300 }, { "entropy": 0.8877424877882004, "epoch": 2.2701296174703924, "grad_norm": 19.25, "learning_rate": 3.0658492989132525e-06, "loss": 0.8859227752685547, "mean_token_accuracy": 0.7594033324718475, "num_tokens": 44503354.0, "step": 93350 }, { "entropy": 1.0248270070552825, "epoch": 2.2713455412076553, "grad_norm": 35.5, "learning_rate": 3.0561978771151146e-06, "loss": 1.0649641418457032, "mean_token_accuracy": 0.728805719614029, "num_tokens": 44527148.0, "step": 93400 }, { "entropy": 1.0053327673673629, "epoch": 2.2725614649449186, "grad_norm": 15.625, "learning_rate": 3.04655892990667e-06, "loss": 1.018981704711914, "mean_token_accuracy": 0.7355321919918061, "num_tokens": 44550984.0, "step": 93450 }, { "entropy": 1.0856688183546066, "epoch": 2.273777388682182, "grad_norm": 13.9375, "learning_rate": 3.0369324746043626e-06, "loss": 1.0922054290771483, "mean_token_accuracy": 0.7267630970478058, "num_tokens": 44577088.0, "step": 93500 }, { "entropy": 0.9455137658119201, "epoch": 2.2749933124194452, "grad_norm": 19.125, "learning_rate": 3.0273185285021856e-06, "loss": 0.9440573120117187, "mean_token_accuracy": 0.7568325906991958, "num_tokens": 44600457.0, "step": 93550 }, { "entropy": 0.9864205151796341, "epoch": 2.276209236156708, "grad_norm": 12.875, "learning_rate": 3.0177171088716563e-06, "loss": 1.0068096923828125, "mean_token_accuracy": 0.7327397662401199, "num_tokens": 44623697.0, "step": 93600 }, { "entropy": 0.9493138343095779, "epoch": 2.2774251598939714, "grad_norm": 12.5, "learning_rate": 3.0081282329617957e-06, "loss": 0.9434712219238282, "mean_token_accuracy": 0.7574577832221985, "num_tokens": 44643896.0, "step": 93650 }, { "entropy": 0.94921111702919, "epoch": 2.2786410836312347, "grad_norm": 19.75, "learning_rate": 2.998551917999093e-06, "loss": 0.9572761535644532, "mean_token_accuracy": 0.750439395904541, "num_tokens": 44668164.0, "step": 93700 }, { "entropy": 0.9739478868246079, "epoch": 2.2798570073684976, "grad_norm": 27.625, "learning_rate": 2.9889881811874568e-06, "loss": 0.9891908264160156, "mean_token_accuracy": 0.7412521743774414, "num_tokens": 44692524.0, "step": 93750 }, { "entropy": 1.0012891227006913, "epoch": 2.281072931105761, "grad_norm": 19.25, "learning_rate": 2.979437039708216e-06, "loss": 1.0097435760498046, "mean_token_accuracy": 0.7427318507432937, "num_tokens": 44718680.0, "step": 93800 }, { "entropy": 0.9760964855551719, "epoch": 2.2822888548430242, "grad_norm": 35.75, "learning_rate": 2.9698985107200606e-06, "loss": 1.0012615203857422, "mean_token_accuracy": 0.7390626585483551, "num_tokens": 44741880.0, "step": 93850 }, { "entropy": 1.0679415607452392, "epoch": 2.2835047785802876, "grad_norm": 9.5, "learning_rate": 2.9603726113590225e-06, "loss": 1.0804270172119141, "mean_token_accuracy": 0.7314370971918106, "num_tokens": 44762297.0, "step": 93900 }, { "entropy": 1.013844422698021, "epoch": 2.284720702317551, "grad_norm": 11.3125, "learning_rate": 2.950859358738453e-06, "loss": 0.9914096069335937, "mean_token_accuracy": 0.7341550707817077, "num_tokens": 44786051.0, "step": 93950 }, { "entropy": 0.9300890475511551, "epoch": 2.2859366260548137, "grad_norm": 21.0, "learning_rate": 2.9413587699489723e-06, "loss": 0.9513533782958984, "mean_token_accuracy": 0.7489095044136047, "num_tokens": 44807885.0, "step": 94000 }, { "entropy": 1.0079742628335953, "epoch": 2.287152549792077, "grad_norm": 33.0, "learning_rate": 2.9318708620584603e-06, "loss": 0.9962818145751953, "mean_token_accuracy": 0.7450474095344544, "num_tokens": 44833360.0, "step": 94050 }, { "entropy": 0.994007157087326, "epoch": 2.2883684735293404, "grad_norm": 16.5, "learning_rate": 2.9223956521120058e-06, "loss": 1.0081062316894531, "mean_token_accuracy": 0.7443462234735488, "num_tokens": 44853364.0, "step": 94100 }, { "entropy": 0.9837645828723908, "epoch": 2.2895843972666032, "grad_norm": 12.0, "learning_rate": 2.9129331571318966e-06, "loss": 1.0106182098388672, "mean_token_accuracy": 0.7307669651508332, "num_tokens": 44876801.0, "step": 94150 }, { "entropy": 1.0692649418115616, "epoch": 2.2908003210038665, "grad_norm": 13.375, "learning_rate": 2.903483394117569e-06, "loss": 1.0739817810058594, "mean_token_accuracy": 0.7244762814044953, "num_tokens": 44905149.0, "step": 94200 }, { "entropy": 1.15278817653656, "epoch": 2.29201624474113, "grad_norm": 15.5625, "learning_rate": 2.894046380045584e-06, "loss": 1.1640730285644532, "mean_token_accuracy": 0.7084847795963287, "num_tokens": 44927487.0, "step": 94250 }, { "entropy": 0.870697969198227, "epoch": 2.293232168478393, "grad_norm": 20.25, "learning_rate": 2.8846221318696142e-06, "loss": 0.8722151947021485, "mean_token_accuracy": 0.7651208651065826, "num_tokens": 44949134.0, "step": 94300 }, { "entropy": 1.0045555931329728, "epoch": 2.294448092215656, "grad_norm": 12.75, "learning_rate": 2.8752106665203793e-06, "loss": 1.0182839965820312, "mean_token_accuracy": 0.7357378578186036, "num_tokens": 44972805.0, "step": 94350 }, { "entropy": 1.07135901927948, "epoch": 2.2956640159529194, "grad_norm": 13.625, "learning_rate": 2.8658120009056477e-06, "loss": 1.0729496002197265, "mean_token_accuracy": 0.7231455540657044, "num_tokens": 44997772.0, "step": 94400 }, { "entropy": 1.013095486164093, "epoch": 2.2968799396901827, "grad_norm": 18.5, "learning_rate": 2.8564261519101945e-06, "loss": 1.0310089111328125, "mean_token_accuracy": 0.7310674929618836, "num_tokens": 45023434.0, "step": 94450 }, { "entropy": 0.9643177282810211, "epoch": 2.298095863427446, "grad_norm": 30.875, "learning_rate": 2.847053136395751e-06, "loss": 0.9676065826416016, "mean_token_accuracy": 0.7454683756828309, "num_tokens": 45046938.0, "step": 94500 }, { "entropy": 1.004192669391632, "epoch": 2.299311787164709, "grad_norm": 7.4375, "learning_rate": 2.8376929712010127e-06, "loss": 1.0100228881835938, "mean_token_accuracy": 0.747706503868103, "num_tokens": 45073252.0, "step": 94550 }, { "entropy": 1.0663441127538682, "epoch": 2.300527710901972, "grad_norm": 15.8125, "learning_rate": 2.828345673141586e-06, "loss": 1.0900337982177735, "mean_token_accuracy": 0.7269902157783509, "num_tokens": 45095935.0, "step": 94600 }, { "entropy": 0.9879442119598388, "epoch": 2.3017436346392355, "grad_norm": 32.75, "learning_rate": 2.8190112590099504e-06, "loss": 1.019479751586914, "mean_token_accuracy": 0.7344420349597931, "num_tokens": 45119798.0, "step": 94650 }, { "entropy": 0.9234118378162384, "epoch": 2.302959558376499, "grad_norm": 24.25, "learning_rate": 2.809689745575456e-06, "loss": 0.9168170928955078, "mean_token_accuracy": 0.7516093122959137, "num_tokens": 45139178.0, "step": 94700 }, { "entropy": 1.0407817208766936, "epoch": 2.3041754821137617, "grad_norm": 22.125, "learning_rate": 2.8003811495842626e-06, "loss": 1.0729417419433593, "mean_token_accuracy": 0.7259517467021942, "num_tokens": 45166147.0, "step": 94750 }, { "entropy": 1.0555431479215622, "epoch": 2.305391405851025, "grad_norm": 12.125, "learning_rate": 2.7910854877593263e-06, "loss": 1.069442596435547, "mean_token_accuracy": 0.7297629421949386, "num_tokens": 45190015.0, "step": 94800 }, { "entropy": 1.02040536403656, "epoch": 2.3066073295882883, "grad_norm": 13.75, "learning_rate": 2.781802776800375e-06, "loss": 1.0214608764648438, "mean_token_accuracy": 0.7306193375587463, "num_tokens": 45212730.0, "step": 94850 }, { "entropy": 1.035077532529831, "epoch": 2.3078232533255516, "grad_norm": 13.6875, "learning_rate": 2.7725330333838597e-06, "loss": 1.0673436737060547, "mean_token_accuracy": 0.718283588886261, "num_tokens": 45238308.0, "step": 94900 }, { "entropy": 1.0816886138916015, "epoch": 2.3090391770628145, "grad_norm": 13.6875, "learning_rate": 2.763276274162945e-06, "loss": 1.0575791931152343, "mean_token_accuracy": 0.7278029799461365, "num_tokens": 45262016.0, "step": 94950 }, { "entropy": 1.0139601796865463, "epoch": 2.310255100800078, "grad_norm": 33.75, "learning_rate": 2.7540325157674586e-06, "loss": 1.025236587524414, "mean_token_accuracy": 0.7378444838523864, "num_tokens": 45282145.0, "step": 95000 }, { "entropy": 1.0227803814411163, "epoch": 2.311471024537341, "grad_norm": 13.125, "learning_rate": 2.7448017748038845e-06, "loss": 1.0095792388916016, "mean_token_accuracy": 0.7407915604114532, "num_tokens": 45305500.0, "step": 95050 }, { "entropy": 0.939029932320118, "epoch": 2.312686948274604, "grad_norm": 12.75, "learning_rate": 2.73558406785531e-06, "loss": 0.94644775390625, "mean_token_accuracy": 0.7460575550794601, "num_tokens": 45329419.0, "step": 95100 }, { "entropy": 1.004320264160633, "epoch": 2.3139028720118673, "grad_norm": 14.6875, "learning_rate": 2.72637941148141e-06, "loss": 1.0227207946777344, "mean_token_accuracy": 0.7425266194343567, "num_tokens": 45349061.0, "step": 95150 }, { "entropy": 0.9614485442638397, "epoch": 2.3151187957491306, "grad_norm": 17.375, "learning_rate": 2.7171878222184202e-06, "loss": 0.9714505767822266, "mean_token_accuracy": 0.7416908359527588, "num_tokens": 45374241.0, "step": 95200 }, { "entropy": 1.0586227637529373, "epoch": 2.316334719486394, "grad_norm": 15.25, "learning_rate": 2.708009316579089e-06, "loss": 1.0945005798339844, "mean_token_accuracy": 0.7210349929332733, "num_tokens": 45398008.0, "step": 95250 }, { "entropy": 0.8872892498970032, "epoch": 2.3175506432236572, "grad_norm": 13.375, "learning_rate": 2.6988439110526686e-06, "loss": 0.8803879547119141, "mean_token_accuracy": 0.766468665599823, "num_tokens": 45422294.0, "step": 95300 }, { "entropy": 0.9517962753772735, "epoch": 2.31876656696092, "grad_norm": 12.375, "learning_rate": 2.6896916221048806e-06, "loss": 0.9491099548339844, "mean_token_accuracy": 0.7487523579597473, "num_tokens": 45444861.0, "step": 95350 }, { "entropy": 1.031522238254547, "epoch": 2.3199824906981834, "grad_norm": 26.0, "learning_rate": 2.6805524661778735e-06, "loss": 1.02490234375, "mean_token_accuracy": 0.7361214315891266, "num_tokens": 45470769.0, "step": 95400 }, { "entropy": 0.9931298846006393, "epoch": 2.3211984144354467, "grad_norm": 14.75, "learning_rate": 2.6714264596902005e-06, "loss": 0.9819428253173829, "mean_token_accuracy": 0.7442084538936615, "num_tokens": 45494304.0, "step": 95450 }, { "entropy": 1.0444553971290589, "epoch": 2.3224143381727096, "grad_norm": 12.9375, "learning_rate": 2.662313619036806e-06, "loss": 1.0683853912353516, "mean_token_accuracy": 0.7256763672828674, "num_tokens": 45518591.0, "step": 95500 }, { "entropy": 1.076417651772499, "epoch": 2.323630261909973, "grad_norm": 15.125, "learning_rate": 2.653213960588964e-06, "loss": 1.0970492553710938, "mean_token_accuracy": 0.7292157924175262, "num_tokens": 45543619.0, "step": 95550 }, { "entropy": 1.0889058494567871, "epoch": 2.324846185647236, "grad_norm": 13.3125, "learning_rate": 2.6441275006942825e-06, "loss": 1.110810775756836, "mean_token_accuracy": 0.7147517728805542, "num_tokens": 45565878.0, "step": 95600 }, { "entropy": 0.9615481629967689, "epoch": 2.3260621093844995, "grad_norm": 10.9375, "learning_rate": 2.6350542556766478e-06, "loss": 0.9866937255859375, "mean_token_accuracy": 0.7407089984416961, "num_tokens": 45589139.0, "step": 95650 }, { "entropy": 0.8992525696754455, "epoch": 2.327278033121763, "grad_norm": 7.71875, "learning_rate": 2.625994241836207e-06, "loss": 0.8896331024169922, "mean_token_accuracy": 0.7604225528240204, "num_tokens": 45613523.0, "step": 95700 }, { "entropy": 0.8949892872571945, "epoch": 2.3284939568590257, "grad_norm": 6.90625, "learning_rate": 2.616947475449344e-06, "loss": 0.8974932861328125, "mean_token_accuracy": 0.7694405388832092, "num_tokens": 45634629.0, "step": 95750 }, { "entropy": 0.9033841317892075, "epoch": 2.329709880596289, "grad_norm": 14.5625, "learning_rate": 2.607913972768632e-06, "loss": 0.8931670379638672, "mean_token_accuracy": 0.764010626077652, "num_tokens": 45659168.0, "step": 95800 }, { "entropy": 0.8905762293934822, "epoch": 2.3309258043335523, "grad_norm": 12.1875, "learning_rate": 2.5988937500228307e-06, "loss": 0.8805188751220703, "mean_token_accuracy": 0.7606967318058014, "num_tokens": 45680216.0, "step": 95850 }, { "entropy": 1.0350791704654694, "epoch": 2.332141728070815, "grad_norm": 12.3125, "learning_rate": 2.5898868234168263e-06, "loss": 1.069333038330078, "mean_token_accuracy": 0.7193169164657592, "num_tokens": 45703330.0, "step": 95900 }, { "entropy": 0.9947279679775238, "epoch": 2.3333576518080785, "grad_norm": 16.0, "learning_rate": 2.580893209131633e-06, "loss": 0.987651596069336, "mean_token_accuracy": 0.7370724487304687, "num_tokens": 45727159.0, "step": 95950 }, { "entropy": 0.9833281421661377, "epoch": 2.334573575545342, "grad_norm": 45.75, "learning_rate": 2.571912923324339e-06, "loss": 0.9769818115234375, "mean_token_accuracy": 0.7469660496711731, "num_tokens": 45748269.0, "step": 96000 }, { "entropy": 1.0446341776847838, "epoch": 2.335789499282605, "grad_norm": 17.75, "learning_rate": 2.562945982128089e-06, "loss": 1.0458043670654298, "mean_token_accuracy": 0.7316469609737396, "num_tokens": 45771668.0, "step": 96050 }, { "entropy": 0.9928829175233841, "epoch": 2.337005423019868, "grad_norm": 14.625, "learning_rate": 2.5539924016520624e-06, "loss": 0.9916670989990234, "mean_token_accuracy": 0.7439694893360138, "num_tokens": 45793956.0, "step": 96100 }, { "entropy": 0.8972603964805603, "epoch": 2.3382213467571313, "grad_norm": 28.0, "learning_rate": 2.5450521979814237e-06, "loss": 0.8978917694091797, "mean_token_accuracy": 0.7577879929542541, "num_tokens": 45816264.0, "step": 96150 }, { "entropy": 1.020527741909027, "epoch": 2.3394372704943946, "grad_norm": 9.4375, "learning_rate": 2.536125387177315e-06, "loss": 1.031499710083008, "mean_token_accuracy": 0.7317077982425689, "num_tokens": 45843327.0, "step": 96200 }, { "entropy": 1.0370132464170456, "epoch": 2.340653194231658, "grad_norm": 10.75, "learning_rate": 2.5272119852768183e-06, "loss": 1.0355408477783203, "mean_token_accuracy": 0.7324371683597565, "num_tokens": 45867854.0, "step": 96250 }, { "entropy": 0.9251981416344642, "epoch": 2.341869117968921, "grad_norm": 30.625, "learning_rate": 2.51831200829292e-06, "loss": 0.938086929321289, "mean_token_accuracy": 0.755743362903595, "num_tokens": 45891290.0, "step": 96300 }, { "entropy": 0.9801165926456451, "epoch": 2.343085041706184, "grad_norm": 9.4375, "learning_rate": 2.5094254722144896e-06, "loss": 0.9431886291503906, "mean_token_accuracy": 0.7537684845924377, "num_tokens": 45912890.0, "step": 96350 }, { "entropy": 1.054764795899391, "epoch": 2.3443009654434475, "grad_norm": 8.375, "learning_rate": 2.500552393006257e-06, "loss": 1.0679257202148438, "mean_token_accuracy": 0.7230303740501404, "num_tokens": 45940558.0, "step": 96400 }, { "entropy": 0.9865311366319657, "epoch": 2.3455168891807108, "grad_norm": 17.5, "learning_rate": 2.491692786608766e-06, "loss": 0.9914148712158203, "mean_token_accuracy": 0.7406204652786255, "num_tokens": 45966263.0, "step": 96450 }, { "entropy": 0.9733604335784912, "epoch": 2.3467328129179736, "grad_norm": 30.25, "learning_rate": 2.482846668938369e-06, "loss": 0.9904890441894532, "mean_token_accuracy": 0.7406991612911225, "num_tokens": 45989787.0, "step": 96500 }, { "entropy": 0.9576501840353012, "epoch": 2.347948736655237, "grad_norm": 14.375, "learning_rate": 2.4740140558871728e-06, "loss": 0.9839324188232422, "mean_token_accuracy": 0.7432239401340485, "num_tokens": 46014585.0, "step": 96550 }, { "entropy": 1.0006695771217347, "epoch": 2.3491646603925003, "grad_norm": 16.875, "learning_rate": 2.4651949633230352e-06, "loss": 0.9953112030029296, "mean_token_accuracy": 0.7427278339862824, "num_tokens": 46041274.0, "step": 96600 }, { "entropy": 0.9916406232118606, "epoch": 2.3503805841297636, "grad_norm": 12.3125, "learning_rate": 2.4563894070895154e-06, "loss": 1.0121498107910156, "mean_token_accuracy": 0.7347976195812226, "num_tokens": 46067844.0, "step": 96650 }, { "entropy": 1.0133782243728637, "epoch": 2.3515965078670265, "grad_norm": 17.875, "learning_rate": 2.4475974030058535e-06, "loss": 1.0008023834228517, "mean_token_accuracy": 0.7434499800205231, "num_tokens": 46093777.0, "step": 96700 }, { "entropy": 0.981670982837677, "epoch": 2.3528124316042898, "grad_norm": 23.75, "learning_rate": 2.438818966866956e-06, "loss": 1.010425796508789, "mean_token_accuracy": 0.7354640173912048, "num_tokens": 46117282.0, "step": 96750 }, { "entropy": 1.0195647996664048, "epoch": 2.354028355341553, "grad_norm": 19.75, "learning_rate": 2.4300541144433366e-06, "loss": 1.0325605773925781, "mean_token_accuracy": 0.730508942604065, "num_tokens": 46140939.0, "step": 96800 }, { "entropy": 1.0250851234793663, "epoch": 2.355244279078816, "grad_norm": 12.3125, "learning_rate": 2.4213028614811216e-06, "loss": 1.0289112091064454, "mean_token_accuracy": 0.7325102293491363, "num_tokens": 46164853.0, "step": 96850 }, { "entropy": 0.9998527413606644, "epoch": 2.3564602028160793, "grad_norm": 19.0, "learning_rate": 2.412565223701997e-06, "loss": 0.9981300354003906, "mean_token_accuracy": 0.7427718102931976, "num_tokens": 46191887.0, "step": 96900 }, { "entropy": 0.9632665473222732, "epoch": 2.3576761265533426, "grad_norm": 17.0, "learning_rate": 2.4038412168031877e-06, "loss": 0.9816886138916016, "mean_token_accuracy": 0.7525983810424804, "num_tokens": 46215595.0, "step": 96950 }, { "entropy": 1.0534915137290954, "epoch": 2.358892050290606, "grad_norm": 15.125, "learning_rate": 2.3951308564574383e-06, "loss": 1.054468307495117, "mean_token_accuracy": 0.7265292465686798, "num_tokens": 46241344.0, "step": 97000 }, { "entropy": 1.011704724431038, "epoch": 2.360107974027869, "grad_norm": 12.0, "learning_rate": 2.386434158312969e-06, "loss": 1.0313463592529297, "mean_token_accuracy": 0.7350385248661041, "num_tokens": 46266145.0, "step": 97050 }, { "entropy": 0.9472399127483367, "epoch": 2.361323897765132, "grad_norm": 16.125, "learning_rate": 2.3777511379934604e-06, "loss": 0.953708267211914, "mean_token_accuracy": 0.7532491850852966, "num_tokens": 46289271.0, "step": 97100 }, { "entropy": 0.991425866484642, "epoch": 2.3625398215023954, "grad_norm": 16.5, "learning_rate": 2.369081811098025e-06, "loss": 1.0010517120361329, "mean_token_accuracy": 0.7427856993675231, "num_tokens": 46312408.0, "step": 97150 }, { "entropy": 0.9979304158687592, "epoch": 2.3637557452396587, "grad_norm": 21.0, "learning_rate": 2.3604261932011664e-06, "loss": 0.9966995239257812, "mean_token_accuracy": 0.7452984464168548, "num_tokens": 46335015.0, "step": 97200 }, { "entropy": 0.9826168721914291, "epoch": 2.3649716689769216, "grad_norm": 22.625, "learning_rate": 2.3517842998527597e-06, "loss": 0.9710489654541016, "mean_token_accuracy": 0.7453414452075958, "num_tokens": 46359209.0, "step": 97250 }, { "entropy": 0.9797687846422195, "epoch": 2.366187592714185, "grad_norm": 28.375, "learning_rate": 2.3431561465780337e-06, "loss": 0.9990670776367188, "mean_token_accuracy": 0.7435233318805694, "num_tokens": 46381914.0, "step": 97300 }, { "entropy": 0.9886573797464371, "epoch": 2.367403516451448, "grad_norm": 15.9375, "learning_rate": 2.3345417488775224e-06, "loss": 1.0055271911621093, "mean_token_accuracy": 0.7347913306951522, "num_tokens": 46406687.0, "step": 97350 }, { "entropy": 1.0292078775167466, "epoch": 2.3686194401887115, "grad_norm": 21.0, "learning_rate": 2.3259411222270556e-06, "loss": 1.0542235565185547, "mean_token_accuracy": 0.7357412385940552, "num_tokens": 46433704.0, "step": 97400 }, { "entropy": 1.0112492966651916, "epoch": 2.3698353639259744, "grad_norm": 14.8125, "learning_rate": 2.3173542820777174e-06, "loss": 1.013678970336914, "mean_token_accuracy": 0.7360450756549836, "num_tokens": 46456346.0, "step": 97450 }, { "entropy": 1.0010080754756927, "epoch": 2.3710512876632377, "grad_norm": 28.0, "learning_rate": 2.3087812438558286e-06, "loss": 0.9823719787597657, "mean_token_accuracy": 0.7402499401569367, "num_tokens": 46478672.0, "step": 97500 }, { "entropy": 1.0524135592579842, "epoch": 2.372267211400501, "grad_norm": 13.3125, "learning_rate": 2.3002220229629147e-06, "loss": 1.043142318725586, "mean_token_accuracy": 0.7252648305892945, "num_tokens": 46507621.0, "step": 97550 }, { "entropy": 0.8588764774799347, "epoch": 2.3734831351377643, "grad_norm": 48.75, "learning_rate": 2.29167663477567e-06, "loss": 0.8684282684326172, "mean_token_accuracy": 0.760467540025711, "num_tokens": 46533337.0, "step": 97600 }, { "entropy": 0.9341969162225723, "epoch": 2.374699058875027, "grad_norm": 11.4375, "learning_rate": 2.2831450946459534e-06, "loss": 0.9523145294189453, "mean_token_accuracy": 0.7494847202301025, "num_tokens": 46556775.0, "step": 97650 }, { "entropy": 0.9231040665507316, "epoch": 2.3759149826122905, "grad_norm": 32.75, "learning_rate": 2.2746274179007298e-06, "loss": 0.9243079376220703, "mean_token_accuracy": 0.7530919468402862, "num_tokens": 46581080.0, "step": 97700 }, { "entropy": 0.9587431740760803, "epoch": 2.377130906349554, "grad_norm": 15.5, "learning_rate": 2.266123619842069e-06, "loss": 0.9656648254394531, "mean_token_accuracy": 0.7452446484565735, "num_tokens": 46606793.0, "step": 97750 }, { "entropy": 0.9179245400428772, "epoch": 2.378346830086817, "grad_norm": 12.1875, "learning_rate": 2.2576337157471083e-06, "loss": 0.9258041381835938, "mean_token_accuracy": 0.7607336950302124, "num_tokens": 46628933.0, "step": 97800 }, { "entropy": 0.978563247025013, "epoch": 2.37956275382408, "grad_norm": 22.625, "learning_rate": 2.249157720868016e-06, "loss": 0.9636675262451172, "mean_token_accuracy": 0.755839718580246, "num_tokens": 46649247.0, "step": 97850 }, { "entropy": 1.02070008456707, "epoch": 2.3807786775613433, "grad_norm": 15.125, "learning_rate": 2.2406956504319776e-06, "loss": 1.0429288482666015, "mean_token_accuracy": 0.7351869118213653, "num_tokens": 46673408.0, "step": 97900 }, { "entropy": 0.958078818321228, "epoch": 2.3819946012986066, "grad_norm": 18.5, "learning_rate": 2.232247519641161e-06, "loss": 0.9663208770751953, "mean_token_accuracy": 0.7414830005168915, "num_tokens": 46695908.0, "step": 97950 }, { "entropy": 1.0084451013803482, "epoch": 2.38321052503587, "grad_norm": 16.875, "learning_rate": 2.2238133436726937e-06, "loss": 1.0054339599609374, "mean_token_accuracy": 0.7477175867557526, "num_tokens": 46719604.0, "step": 98000 }, { "entropy": 0.9946250337362289, "epoch": 2.384426448773133, "grad_norm": 22.25, "learning_rate": 2.2153931376786377e-06, "loss": 1.0071290588378907, "mean_token_accuracy": 0.7356315159797668, "num_tokens": 46742612.0, "step": 98050 }, { "entropy": 1.0702527886629105, "epoch": 2.385642372510396, "grad_norm": 15.0, "learning_rate": 2.206986916785947e-06, "loss": 1.0818299102783202, "mean_token_accuracy": 0.7222705447673797, "num_tokens": 46769112.0, "step": 98100 }, { "entropy": 1.0497675186395645, "epoch": 2.3868582962476594, "grad_norm": 17.0, "learning_rate": 2.1985946960964565e-06, "loss": 1.0443623352050782, "mean_token_accuracy": 0.7386875069141388, "num_tokens": 46789153.0, "step": 98150 }, { "entropy": 0.9224061185121536, "epoch": 2.3880742199849223, "grad_norm": 13.125, "learning_rate": 2.190216490686855e-06, "loss": 0.9474951171875, "mean_token_accuracy": 0.7499234068393708, "num_tokens": 46812423.0, "step": 98200 }, { "entropy": 0.9453705936670304, "epoch": 2.3892901437221856, "grad_norm": 19.25, "learning_rate": 2.1818523156086425e-06, "loss": 0.9401871490478516, "mean_token_accuracy": 0.753737051486969, "num_tokens": 46837406.0, "step": 98250 }, { "entropy": 0.9362173706293107, "epoch": 2.390506067459449, "grad_norm": 17.25, "learning_rate": 2.1735021858881244e-06, "loss": 0.9453360748291015, "mean_token_accuracy": 0.7529061543941498, "num_tokens": 46864512.0, "step": 98300 }, { "entropy": 0.9680910271406173, "epoch": 2.3917219911967122, "grad_norm": 12.5625, "learning_rate": 2.1651661165263615e-06, "loss": 0.9462548828125, "mean_token_accuracy": 0.7476029825210572, "num_tokens": 46890694.0, "step": 98350 }, { "entropy": 0.9690888637304306, "epoch": 2.3929379149339756, "grad_norm": 26.0, "learning_rate": 2.156844122499169e-06, "loss": 0.9478904724121093, "mean_token_accuracy": 0.7448553216457366, "num_tokens": 46915388.0, "step": 98400 }, { "entropy": 0.9707385742664337, "epoch": 2.3941538386712384, "grad_norm": 21.375, "learning_rate": 2.1485362187570625e-06, "loss": 0.9753245544433594, "mean_token_accuracy": 0.7467138588428497, "num_tokens": 46939871.0, "step": 98450 }, { "entropy": 0.9367838877439499, "epoch": 2.3953697624085017, "grad_norm": 17.125, "learning_rate": 2.1402424202252494e-06, "loss": 0.9545435333251953, "mean_token_accuracy": 0.747455644607544, "num_tokens": 46963188.0, "step": 98500 }, { "entropy": 0.9079476976394654, "epoch": 2.396585686145765, "grad_norm": 15.0625, "learning_rate": 2.1319627418036017e-06, "loss": 0.9083273315429687, "mean_token_accuracy": 0.7621216583251953, "num_tokens": 46984952.0, "step": 98550 }, { "entropy": 1.012872309088707, "epoch": 2.397801609883028, "grad_norm": 17.25, "learning_rate": 2.1236971983666144e-06, "loss": 1.0237086486816407, "mean_token_accuracy": 0.7429370081424713, "num_tokens": 47003972.0, "step": 98600 }, { "entropy": 0.9102479445934296, "epoch": 2.3990175336202912, "grad_norm": 13.6875, "learning_rate": 2.1154458047633997e-06, "loss": 0.8939956665039063, "mean_token_accuracy": 0.7670606297254562, "num_tokens": 47025791.0, "step": 98650 }, { "entropy": 0.9922562772035599, "epoch": 2.4002334573575546, "grad_norm": 17.125, "learning_rate": 2.107208575817646e-06, "loss": 1.01915283203125, "mean_token_accuracy": 0.740577472448349, "num_tokens": 47052242.0, "step": 98700 }, { "entropy": 1.0358601319789886, "epoch": 2.401449381094818, "grad_norm": 11.625, "learning_rate": 2.0989855263275904e-06, "loss": 1.0541246795654298, "mean_token_accuracy": 0.7228220617771148, "num_tokens": 47075750.0, "step": 98750 }, { "entropy": 0.9085548120737076, "epoch": 2.4026653048320807, "grad_norm": 12.375, "learning_rate": 2.0907766710660005e-06, "loss": 0.9361541748046875, "mean_token_accuracy": 0.7542395180463791, "num_tokens": 47098246.0, "step": 98800 }, { "entropy": 0.9869731068611145, "epoch": 2.403881228569344, "grad_norm": 23.0, "learning_rate": 2.08258202478014e-06, "loss": 0.9955321502685547, "mean_token_accuracy": 0.7421335649490356, "num_tokens": 47123159.0, "step": 98850 }, { "entropy": 1.009657376408577, "epoch": 2.4050971523066074, "grad_norm": 21.25, "learning_rate": 2.0744016021917514e-06, "loss": 1.0381554412841796, "mean_token_accuracy": 0.7382822763919831, "num_tokens": 47147411.0, "step": 98900 }, { "entropy": 1.051675442457199, "epoch": 2.4063130760438707, "grad_norm": 22.5, "learning_rate": 2.0662354179970245e-06, "loss": 1.1015760040283202, "mean_token_accuracy": 0.7146197617053985, "num_tokens": 47170244.0, "step": 98950 }, { "entropy": 0.874287223815918, "epoch": 2.4075289997811335, "grad_norm": 14.875, "learning_rate": 2.0580834868665624e-06, "loss": 0.8667985534667969, "mean_token_accuracy": 0.7706022346019745, "num_tokens": 47192785.0, "step": 99000 }, { "entropy": 1.057842715382576, "epoch": 2.408744923518397, "grad_norm": 17.25, "learning_rate": 2.0499458234453707e-06, "loss": 1.0563845062255859, "mean_token_accuracy": 0.7265172934532166, "num_tokens": 47220213.0, "step": 99050 }, { "entropy": 0.9943228501081467, "epoch": 2.40996084725566, "grad_norm": 29.625, "learning_rate": 2.041822442352819e-06, "loss": 1.0059576416015625, "mean_token_accuracy": 0.7358046424388885, "num_tokens": 47241166.0, "step": 99100 }, { "entropy": 1.0050245028734208, "epoch": 2.4111767709929235, "grad_norm": 17.25, "learning_rate": 2.033713358182614e-06, "loss": 1.0065509033203126, "mean_token_accuracy": 0.7429979074001313, "num_tokens": 47263889.0, "step": 99150 }, { "entropy": 0.9145067524909973, "epoch": 2.4123926947301864, "grad_norm": 10.5, "learning_rate": 2.02561858550279e-06, "loss": 0.9102850341796875, "mean_token_accuracy": 0.7583632624149322, "num_tokens": 47286221.0, "step": 99200 }, { "entropy": 1.0151828598976136, "epoch": 2.4136086184674497, "grad_norm": 30.875, "learning_rate": 2.017538138855657e-06, "loss": 1.0294156646728516, "mean_token_accuracy": 0.7380634140968323, "num_tokens": 47308837.0, "step": 99250 }, { "entropy": 1.0109664091467858, "epoch": 2.414824542204713, "grad_norm": 29.75, "learning_rate": 2.0094720327578e-06, "loss": 1.0283316040039063, "mean_token_accuracy": 0.7351270866394043, "num_tokens": 47330594.0, "step": 99300 }, { "entropy": 1.023058723807335, "epoch": 2.4160404659419763, "grad_norm": 24.125, "learning_rate": 2.001420281700035e-06, "loss": 1.0031009674072267, "mean_token_accuracy": 0.746183260679245, "num_tokens": 47352313.0, "step": 99350 }, { "entropy": 1.0482118445634843, "epoch": 2.417256389679239, "grad_norm": 15.5625, "learning_rate": 1.9933829001473846e-06, "loss": 1.0515255737304687, "mean_token_accuracy": 0.7287826204299926, "num_tokens": 47374822.0, "step": 99400 }, { "entropy": 1.06862952709198, "epoch": 2.4184723134165025, "grad_norm": 20.625, "learning_rate": 1.985359902539068e-06, "loss": 1.0772602081298828, "mean_token_accuracy": 0.7207397556304932, "num_tokens": 47398123.0, "step": 99450 }, { "entropy": 0.9914139384031295, "epoch": 2.419688237153766, "grad_norm": 15.25, "learning_rate": 1.9773513032884515e-06, "loss": 0.9963228607177734, "mean_token_accuracy": 0.7434949064254761, "num_tokens": 47421816.0, "step": 99500 }, { "entropy": 0.9597957438230514, "epoch": 2.4209041608910287, "grad_norm": 20.75, "learning_rate": 1.969357116783045e-06, "loss": 0.9518717956542969, "mean_token_accuracy": 0.7438527595996857, "num_tokens": 47442371.0, "step": 99550 }, { "entropy": 0.9928667014837265, "epoch": 2.422120084628292, "grad_norm": 12.875, "learning_rate": 1.961377357384461e-06, "loss": 0.9920714569091796, "mean_token_accuracy": 0.7481425046920777, "num_tokens": 47462555.0, "step": 99600 }, { "entropy": 1.072260895371437, "epoch": 2.4233360083655553, "grad_norm": 21.375, "learning_rate": 1.953412039428393e-06, "loss": 1.0819627380371093, "mean_token_accuracy": 0.724660758972168, "num_tokens": 47484880.0, "step": 99650 }, { "entropy": 0.9281311076879502, "epoch": 2.4245519321028186, "grad_norm": 13.125, "learning_rate": 1.945461177224591e-06, "loss": 0.9269882965087891, "mean_token_accuracy": 0.7571560144424438, "num_tokens": 47505054.0, "step": 99700 }, { "entropy": 1.0420177346467971, "epoch": 2.425767855840082, "grad_norm": 28.75, "learning_rate": 1.9375247850568314e-06, "loss": 1.0658219146728516, "mean_token_accuracy": 0.7280727779865265, "num_tokens": 47529652.0, "step": 99750 }, { "entropy": 0.97538643181324, "epoch": 2.426983779577345, "grad_norm": 21.0, "learning_rate": 1.9296028771829022e-06, "loss": 0.9431584167480469, "mean_token_accuracy": 0.7526849675178527, "num_tokens": 47554519.0, "step": 99800 }, { "entropy": 1.0727769672870635, "epoch": 2.428199703314608, "grad_norm": 12.0, "learning_rate": 1.9216954678345713e-06, "loss": 1.0751062774658202, "mean_token_accuracy": 0.7270131355524063, "num_tokens": 47578067.0, "step": 99850 }, { "entropy": 0.9920514255762101, "epoch": 2.4294156270518714, "grad_norm": 19.375, "learning_rate": 1.913802571217548e-06, "loss": 0.9936841583251953, "mean_token_accuracy": 0.7455769538879394, "num_tokens": 47602594.0, "step": 99900 }, { "entropy": 0.9799969124794007, "epoch": 2.4306315507891343, "grad_norm": 14.8125, "learning_rate": 1.905924201511482e-06, "loss": 1.0127312469482421, "mean_token_accuracy": 0.733732179403305, "num_tokens": 47622735.0, "step": 99950 }, { "entropy": 0.9482760637998581, "epoch": 2.4318474745263976, "grad_norm": 28.75, "learning_rate": 1.89806037286992e-06, "loss": 0.9347732543945313, "mean_token_accuracy": 0.749175968170166, "num_tokens": 47643898.0, "step": 100000 }, { "epoch": 2.4318474745263976, "eval_entropy": 1.081404837136601, "eval_loss": 1.3074215650558472, "eval_mean_token_accuracy": 0.6792477380462328, "eval_num_tokens": 47643898.0, "eval_runtime": 388.8158, "eval_samples_per_second": 11.751, "eval_steps_per_second": 11.751, "step": 100000 }, { "entropy": 0.9836595916748047, "epoch": 2.433063398263661, "grad_norm": 20.625, "learning_rate": 1.8902110994202815e-06, "loss": 0.9659170532226562, "mean_token_accuracy": 0.74706183552742, "num_tokens": 47669860.0, "step": 100050 }, { "entropy": 1.0615554295480252, "epoch": 2.4342793220009242, "grad_norm": 20.875, "learning_rate": 1.8823763952638484e-06, "loss": 1.0811090850830078, "mean_token_accuracy": 0.7335676056146622, "num_tokens": 47691130.0, "step": 100100 }, { "entropy": 1.04272772192955, "epoch": 2.4354952457381875, "grad_norm": 15.0, "learning_rate": 1.8745562744757162e-06, "loss": 1.0472515106201172, "mean_token_accuracy": 0.719669337272644, "num_tokens": 47716030.0, "step": 100150 }, { "entropy": 1.0478219857811928, "epoch": 2.4367111694754504, "grad_norm": 14.5, "learning_rate": 1.8667507511047888e-06, "loss": 1.0427615356445312, "mean_token_accuracy": 0.7310945403575897, "num_tokens": 47739617.0, "step": 100200 }, { "entropy": 1.1005677896738053, "epoch": 2.4379270932127137, "grad_norm": 23.875, "learning_rate": 1.8589598391737485e-06, "loss": 1.1021302795410157, "mean_token_accuracy": 0.7221346145868301, "num_tokens": 47763290.0, "step": 100250 }, { "entropy": 1.016863424181938, "epoch": 2.439143016949977, "grad_norm": 25.0, "learning_rate": 1.8511835526790202e-06, "loss": 1.0232311248779298, "mean_token_accuracy": 0.7369371616840362, "num_tokens": 47784558.0, "step": 100300 }, { "entropy": 0.9660828307271003, "epoch": 2.44035894068724, "grad_norm": 11.0, "learning_rate": 1.8434219055907586e-06, "loss": 0.9699872589111328, "mean_token_accuracy": 0.7536335396766662, "num_tokens": 47808665.0, "step": 100350 }, { "entropy": 0.9488025814294815, "epoch": 2.441574864424503, "grad_norm": 15.25, "learning_rate": 1.835674911852814e-06, "loss": 0.9662906646728515, "mean_token_accuracy": 0.7468181753158569, "num_tokens": 47829916.0, "step": 100400 }, { "entropy": 1.0127140146493911, "epoch": 2.4427907881617665, "grad_norm": 14.3125, "learning_rate": 1.8279425853827182e-06, "loss": 0.9998883819580078, "mean_token_accuracy": 0.7488120567798614, "num_tokens": 47853962.0, "step": 100450 }, { "entropy": 0.9498662346601486, "epoch": 2.44400671189903, "grad_norm": 17.375, "learning_rate": 1.8202249400716543e-06, "loss": 0.94242431640625, "mean_token_accuracy": 0.7567758893966675, "num_tokens": 47878233.0, "step": 100500 }, { "entropy": 0.9298828792572021, "epoch": 2.4452226356362927, "grad_norm": 12.875, "learning_rate": 1.8125219897844226e-06, "loss": 0.9273408508300781, "mean_token_accuracy": 0.7475625133514404, "num_tokens": 47904391.0, "step": 100550 }, { "entropy": 1.0207532292604446, "epoch": 2.446438559373556, "grad_norm": 19.625, "learning_rate": 1.8048337483594292e-06, "loss": 1.035699996948242, "mean_token_accuracy": 0.726094017624855, "num_tokens": 47925202.0, "step": 100600 }, { "entropy": 0.899384799003601, "epoch": 2.4476544831108193, "grad_norm": 12.125, "learning_rate": 1.797160229608651e-06, "loss": 0.8985781097412109, "mean_token_accuracy": 0.7629414463043213, "num_tokens": 47949019.0, "step": 100650 }, { "entropy": 1.0062444788217544, "epoch": 2.4488704068480827, "grad_norm": 23.375, "learning_rate": 1.789501447317622e-06, "loss": 1.0162257385253906, "mean_token_accuracy": 0.7389129507541656, "num_tokens": 47972620.0, "step": 100700 }, { "entropy": 0.9075559729337692, "epoch": 2.4500863305853455, "grad_norm": 12.125, "learning_rate": 1.7818574152453993e-06, "loss": 0.8896517944335938, "mean_token_accuracy": 0.765027482509613, "num_tokens": 47999139.0, "step": 100750 }, { "entropy": 1.0182770735025406, "epoch": 2.451302254322609, "grad_norm": 8.9375, "learning_rate": 1.7742281471245382e-06, "loss": 1.031886215209961, "mean_token_accuracy": 0.7334607017040252, "num_tokens": 48029125.0, "step": 100800 }, { "entropy": 0.9341620069742202, "epoch": 2.452518178059872, "grad_norm": 10.5625, "learning_rate": 1.766613656661077e-06, "loss": 0.9507267761230469, "mean_token_accuracy": 0.7535652363300324, "num_tokens": 48051666.0, "step": 100850 }, { "entropy": 0.9982780653238297, "epoch": 2.4537341017971355, "grad_norm": 15.75, "learning_rate": 1.7590139575344989e-06, "loss": 0.9823082733154297, "mean_token_accuracy": 0.7400788688659667, "num_tokens": 48078125.0, "step": 100900 }, { "entropy": 1.02696810901165, "epoch": 2.4549500255343983, "grad_norm": 20.5, "learning_rate": 1.751429063397715e-06, "loss": 1.0337081909179688, "mean_token_accuracy": 0.7336541783809661, "num_tokens": 48102814.0, "step": 100950 }, { "entropy": 1.0278156608343125, "epoch": 2.4561659492716617, "grad_norm": 30.75, "learning_rate": 1.7438589878770462e-06, "loss": 1.0291008758544922, "mean_token_accuracy": 0.7276808333396911, "num_tokens": 48124827.0, "step": 101000 }, { "entropy": 0.9860463476181031, "epoch": 2.457381873008925, "grad_norm": 16.25, "learning_rate": 1.736303744572182e-06, "loss": 1.0044378662109374, "mean_token_accuracy": 0.7360053670406341, "num_tokens": 48151652.0, "step": 101050 }, { "entropy": 1.1348385268449783, "epoch": 2.4585977967461883, "grad_norm": 33.0, "learning_rate": 1.7287633470561727e-06, "loss": 1.1485122680664062, "mean_token_accuracy": 0.7117344009876251, "num_tokens": 48173273.0, "step": 101100 }, { "entropy": 1.1269717121124267, "epoch": 2.459813720483451, "grad_norm": 19.5, "learning_rate": 1.7212378088753978e-06, "loss": 1.1132564544677734, "mean_token_accuracy": 0.7206014686822891, "num_tokens": 48197352.0, "step": 101150 }, { "entropy": 1.0051128256320954, "epoch": 2.4610296442207145, "grad_norm": 17.125, "learning_rate": 1.7137271435495394e-06, "loss": 1.0192572021484374, "mean_token_accuracy": 0.7383128881454468, "num_tokens": 48220141.0, "step": 101200 }, { "entropy": 0.978776968717575, "epoch": 2.4622455679579778, "grad_norm": 18.0, "learning_rate": 1.706231364571559e-06, "loss": 0.9886521148681641, "mean_token_accuracy": 0.7456870424747467, "num_tokens": 48243990.0, "step": 101250 }, { "entropy": 1.0885599714517593, "epoch": 2.4634614916952406, "grad_norm": 30.375, "learning_rate": 1.6987504854076753e-06, "loss": 1.1228165435791015, "mean_token_accuracy": 0.7173869144916535, "num_tokens": 48269687.0, "step": 101300 }, { "entropy": 1.0619021582603454, "epoch": 2.464677415432504, "grad_norm": 11.0, "learning_rate": 1.6912845194973427e-06, "loss": 1.0727503204345703, "mean_token_accuracy": 0.7265542298555374, "num_tokens": 48296493.0, "step": 101350 }, { "entropy": 0.9850762486457825, "epoch": 2.4658933391697673, "grad_norm": 23.125, "learning_rate": 1.683833480253224e-06, "loss": 0.9650557708740234, "mean_token_accuracy": 0.7439814895391464, "num_tokens": 48320384.0, "step": 101400 }, { "entropy": 1.043160718679428, "epoch": 2.4671092629070306, "grad_norm": 11.625, "learning_rate": 1.67639738106116e-06, "loss": 1.0741024780273438, "mean_token_accuracy": 0.7205118000507355, "num_tokens": 48348031.0, "step": 101450 }, { "entropy": 1.0157823485136033, "epoch": 2.468325186644294, "grad_norm": 17.875, "learning_rate": 1.668976235280162e-06, "loss": 1.0116475677490235, "mean_token_accuracy": 0.7422782409191132, "num_tokens": 48369790.0, "step": 101500 }, { "entropy": 1.0212584269046783, "epoch": 2.4695411103815568, "grad_norm": 17.25, "learning_rate": 1.661570056242362e-06, "loss": 1.0394863128662108, "mean_token_accuracy": 0.732077329158783, "num_tokens": 48392372.0, "step": 101550 }, { "entropy": 1.020299414396286, "epoch": 2.47075703411882, "grad_norm": 22.375, "learning_rate": 1.654178857253017e-06, "loss": 1.0264083862304687, "mean_token_accuracy": 0.7355496394634247, "num_tokens": 48417483.0, "step": 101600 }, { "entropy": 0.9311938554048538, "epoch": 2.4719729578560834, "grad_norm": 20.0, "learning_rate": 1.6468026515904712e-06, "loss": 0.942291259765625, "mean_token_accuracy": 0.7515642404556274, "num_tokens": 48442613.0, "step": 101650 }, { "entropy": 0.9447511404752731, "epoch": 2.4731888815933463, "grad_norm": 10.875, "learning_rate": 1.6394414525061252e-06, "loss": 0.949665298461914, "mean_token_accuracy": 0.7489394092559815, "num_tokens": 48471219.0, "step": 101700 }, { "entropy": 1.0003357911109925, "epoch": 2.4744048053306096, "grad_norm": 15.4375, "learning_rate": 1.632095273224431e-06, "loss": 0.9883349609375, "mean_token_accuracy": 0.749803626537323, "num_tokens": 48495092.0, "step": 101750 }, { "entropy": 0.9897074753046036, "epoch": 2.475620729067873, "grad_norm": 8.5625, "learning_rate": 1.6247641269428493e-06, "loss": 0.9729000854492188, "mean_token_accuracy": 0.746709805727005, "num_tokens": 48520032.0, "step": 101800 }, { "entropy": 0.8648980009555817, "epoch": 2.476836652805136, "grad_norm": 13.1875, "learning_rate": 1.6174480268318326e-06, "loss": 0.8683270263671875, "mean_token_accuracy": 0.762782769203186, "num_tokens": 48543740.0, "step": 101850 }, { "entropy": 0.9481565737724305, "epoch": 2.478052576542399, "grad_norm": 27.375, "learning_rate": 1.6101469860348118e-06, "loss": 0.9542909240722657, "mean_token_accuracy": 0.7509562259912491, "num_tokens": 48567045.0, "step": 101900 }, { "entropy": 1.0031441628932953, "epoch": 2.4792685002796624, "grad_norm": 24.625, "learning_rate": 1.6028610176681547e-06, "loss": 1.0092359161376954, "mean_token_accuracy": 0.7472099781036377, "num_tokens": 48589849.0, "step": 101950 }, { "entropy": 0.9466308909654617, "epoch": 2.4804844240169257, "grad_norm": 8.125, "learning_rate": 1.5955901348211567e-06, "loss": 0.9647543334960937, "mean_token_accuracy": 0.7436049580574036, "num_tokens": 48616142.0, "step": 102000 }, { "entropy": 0.9403740233182907, "epoch": 2.481700347754189, "grad_norm": 14.8125, "learning_rate": 1.5883343505560123e-06, "loss": 0.9308699798583985, "mean_token_accuracy": 0.7527655351161957, "num_tokens": 48639574.0, "step": 102050 }, { "entropy": 1.0844241148233413, "epoch": 2.482916271491452, "grad_norm": 17.375, "learning_rate": 1.581093677907788e-06, "loss": 1.060426025390625, "mean_token_accuracy": 0.7344825500249863, "num_tokens": 48666549.0, "step": 102100 }, { "entropy": 0.9614435064792634, "epoch": 2.484132195228715, "grad_norm": 14.0, "learning_rate": 1.5738681298844028e-06, "loss": 0.9768770599365234, "mean_token_accuracy": 0.7433327603340149, "num_tokens": 48692267.0, "step": 102150 }, { "entropy": 0.9990470486879349, "epoch": 2.4853481189659785, "grad_norm": 11.5625, "learning_rate": 1.566657719466602e-06, "loss": 0.9860111999511719, "mean_token_accuracy": 0.7463922172784805, "num_tokens": 48714685.0, "step": 102200 }, { "entropy": 1.0484341335296632, "epoch": 2.486564042703242, "grad_norm": 14.0625, "learning_rate": 1.5594624596079432e-06, "loss": 1.0680496215820312, "mean_token_accuracy": 0.72446049451828, "num_tokens": 48741596.0, "step": 102250 }, { "entropy": 0.9953687340021133, "epoch": 2.4877799664405047, "grad_norm": 21.875, "learning_rate": 1.5522823632347618e-06, "loss": 0.9750669860839843, "mean_token_accuracy": 0.7525313210487365, "num_tokens": 48762133.0, "step": 102300 }, { "entropy": 1.0185768216848374, "epoch": 2.488995890177768, "grad_norm": 14.5625, "learning_rate": 1.545117443246148e-06, "loss": 1.0197138214111328, "mean_token_accuracy": 0.733232877254486, "num_tokens": 48787584.0, "step": 102350 }, { "entropy": 1.0605068236589432, "epoch": 2.4902118139150313, "grad_norm": 23.375, "learning_rate": 1.537967712513937e-06, "loss": 1.0764173126220704, "mean_token_accuracy": 0.7216453289985657, "num_tokens": 48815243.0, "step": 102400 }, { "entropy": 1.0246488362550736, "epoch": 2.4914277376522946, "grad_norm": 19.75, "learning_rate": 1.5308331838826674e-06, "loss": 1.001631393432617, "mean_token_accuracy": 0.739677232503891, "num_tokens": 48839622.0, "step": 102450 }, { "entropy": 0.9997948712110519, "epoch": 2.4926436613895575, "grad_norm": 14.125, "learning_rate": 1.5237138701695676e-06, "loss": 1.017121810913086, "mean_token_accuracy": 0.7290791666507721, "num_tokens": 48861744.0, "step": 102500 }, { "entropy": 1.05985153734684, "epoch": 2.493859585126821, "grad_norm": 19.625, "learning_rate": 1.5166097841645412e-06, "loss": 1.0737869262695312, "mean_token_accuracy": 0.7244985330104828, "num_tokens": 48884990.0, "step": 102550 }, { "entropy": 1.0990385866165162, "epoch": 2.495075508864084, "grad_norm": 18.625, "learning_rate": 1.5095209386301225e-06, "loss": 1.109145050048828, "mean_token_accuracy": 0.728320991396904, "num_tokens": 48910628.0, "step": 102600 }, { "entropy": 0.9728285628557205, "epoch": 2.496291432601347, "grad_norm": 18.25, "learning_rate": 1.5024473463014766e-06, "loss": 0.9703627777099609, "mean_token_accuracy": 0.7480955016613007, "num_tokens": 48933502.0, "step": 102650 }, { "entropy": 0.9255929958820343, "epoch": 2.4975073563386103, "grad_norm": 21.75, "learning_rate": 1.4953890198863664e-06, "loss": 0.9370236968994141, "mean_token_accuracy": 0.7555711007118225, "num_tokens": 48953722.0, "step": 102700 }, { "entropy": 0.9670300137996674, "epoch": 2.4987232800758736, "grad_norm": 22.625, "learning_rate": 1.488345972065115e-06, "loss": 0.9718161010742188, "mean_token_accuracy": 0.7442238056659698, "num_tokens": 48980951.0, "step": 102750 }, { "entropy": 1.1139459538459777, "epoch": 2.499939203813137, "grad_norm": 13.8125, "learning_rate": 1.481318215490617e-06, "loss": 1.1415672302246094, "mean_token_accuracy": 0.7118721026182174, "num_tokens": 49006078.0, "step": 102800 }, { "entropy": 1.0074073427915573, "epoch": 2.5011551275504003, "grad_norm": 20.625, "learning_rate": 1.4743057627882806e-06, "loss": 1.0333832550048827, "mean_token_accuracy": 0.7316744506359101, "num_tokens": 49026344.0, "step": 102850 }, { "entropy": 0.9911123609542847, "epoch": 2.502371051287663, "grad_norm": 21.5, "learning_rate": 1.4673086265560287e-06, "loss": 0.9896331787109375, "mean_token_accuracy": 0.7405658602714539, "num_tokens": 49046627.0, "step": 102900 }, { "entropy": 1.1530007243156433, "epoch": 2.5035869750249264, "grad_norm": 23.375, "learning_rate": 1.4603268193642695e-06, "loss": 1.1697243499755858, "mean_token_accuracy": 0.7177327251434327, "num_tokens": 49070867.0, "step": 102950 }, { "entropy": 0.9402179777622223, "epoch": 2.5048028987621898, "grad_norm": 19.0, "learning_rate": 1.453360353755866e-06, "loss": 0.9324073028564454, "mean_token_accuracy": 0.7658421522378922, "num_tokens": 49089868.0, "step": 103000 }, { "entropy": 1.0565684348344804, "epoch": 2.5060188224994526, "grad_norm": 14.0625, "learning_rate": 1.4464092422461229e-06, "loss": 1.0403884887695312, "mean_token_accuracy": 0.7368279576301575, "num_tokens": 49113566.0, "step": 103050 }, { "entropy": 1.046613371372223, "epoch": 2.507234746236716, "grad_norm": 33.5, "learning_rate": 1.4394734973227598e-06, "loss": 1.0398822021484375, "mean_token_accuracy": 0.7301582998037338, "num_tokens": 49135282.0, "step": 103100 }, { "entropy": 0.9196431905031204, "epoch": 2.5084506699739793, "grad_norm": 11.9375, "learning_rate": 1.432553131445894e-06, "loss": 0.9243701171875, "mean_token_accuracy": 0.7649926155805588, "num_tokens": 49161180.0, "step": 103150 }, { "entropy": 1.1172394794225693, "epoch": 2.5096665937112426, "grad_norm": 15.0, "learning_rate": 1.425648157048013e-06, "loss": 1.1417144012451172, "mean_token_accuracy": 0.7146862304210663, "num_tokens": 49186912.0, "step": 103200 }, { "entropy": 0.9089815497398377, "epoch": 2.510882517448506, "grad_norm": 13.8125, "learning_rate": 1.4187585865339471e-06, "loss": 0.9113701629638672, "mean_token_accuracy": 0.7550681054592132, "num_tokens": 49210163.0, "step": 103250 }, { "entropy": 0.9980392336845398, "epoch": 2.5120984411857687, "grad_norm": 13.25, "learning_rate": 1.4118844322808656e-06, "loss": 1.0296529388427735, "mean_token_accuracy": 0.737673306465149, "num_tokens": 49236257.0, "step": 103300 }, { "entropy": 0.9632880592346191, "epoch": 2.513314364923032, "grad_norm": 20.125, "learning_rate": 1.405025706638231e-06, "loss": 0.9615594482421875, "mean_token_accuracy": 0.7450952661037445, "num_tokens": 49260779.0, "step": 103350 }, { "entropy": 0.9335470592975617, "epoch": 2.5145302886602954, "grad_norm": 22.125, "learning_rate": 1.3981824219277906e-06, "loss": 0.9450257873535156, "mean_token_accuracy": 0.7566630339622498, "num_tokens": 49285928.0, "step": 103400 }, { "entropy": 1.030834854245186, "epoch": 2.5157462123975582, "grad_norm": 35.75, "learning_rate": 1.39135459044356e-06, "loss": 1.0533512878417968, "mean_token_accuracy": 0.7235351598262787, "num_tokens": 49311217.0, "step": 103450 }, { "entropy": 0.8798539435863495, "epoch": 2.5169621361348216, "grad_norm": 12.6875, "learning_rate": 1.3845422244517793e-06, "loss": 0.8739839935302735, "mean_token_accuracy": 0.7608583629131317, "num_tokens": 49331121.0, "step": 103500 }, { "entropy": 1.0016985833644867, "epoch": 2.518178059872085, "grad_norm": 21.625, "learning_rate": 1.3777453361909167e-06, "loss": 1.0287648773193359, "mean_token_accuracy": 0.7264931106567383, "num_tokens": 49352790.0, "step": 103550 }, { "entropy": 0.8766825622320176, "epoch": 2.519393983609348, "grad_norm": 11.8125, "learning_rate": 1.370963937871632e-06, "loss": 0.8686483764648437, "mean_token_accuracy": 0.7642320251464844, "num_tokens": 49376275.0, "step": 103600 }, { "entropy": 1.1577589881420136, "epoch": 2.520609907346611, "grad_norm": 16.875, "learning_rate": 1.364198041676753e-06, "loss": 1.1587754058837891, "mean_token_accuracy": 0.7062227433919906, "num_tokens": 49400919.0, "step": 103650 }, { "entropy": 1.0890564996004104, "epoch": 2.5218258310838744, "grad_norm": 17.375, "learning_rate": 1.3574476597612595e-06, "loss": 1.1217782592773438, "mean_token_accuracy": 0.7144102734327317, "num_tokens": 49428582.0, "step": 103700 }, { "entropy": 1.0376430749893188, "epoch": 2.5230417548211377, "grad_norm": 27.5, "learning_rate": 1.3507128042522588e-06, "loss": 1.0280857849121094, "mean_token_accuracy": 0.7392285239696502, "num_tokens": 49450819.0, "step": 103750 }, { "entropy": 1.0191030538082122, "epoch": 2.524257678558401, "grad_norm": 13.375, "learning_rate": 1.343993487248968e-06, "loss": 1.0215982055664063, "mean_token_accuracy": 0.7341066169738769, "num_tokens": 49476502.0, "step": 103800 }, { "entropy": 0.942950963973999, "epoch": 2.525473602295664, "grad_norm": 24.875, "learning_rate": 1.337289720822691e-06, "loss": 0.9488111877441406, "mean_token_accuracy": 0.745291496515274, "num_tokens": 49499624.0, "step": 103850 }, { "entropy": 0.990794375538826, "epoch": 2.526689526032927, "grad_norm": 27.625, "learning_rate": 1.3306015170167875e-06, "loss": 0.9701706695556641, "mean_token_accuracy": 0.7415746295452118, "num_tokens": 49524285.0, "step": 103900 }, { "entropy": 1.0346154534816743, "epoch": 2.5279054497701905, "grad_norm": 20.0, "learning_rate": 1.3239288878466683e-06, "loss": 1.0532839965820313, "mean_token_accuracy": 0.7368849849700928, "num_tokens": 49546614.0, "step": 103950 }, { "entropy": 1.0325119197368622, "epoch": 2.5291213735074534, "grad_norm": 11.25, "learning_rate": 1.3172718452997503e-06, "loss": 1.037323226928711, "mean_token_accuracy": 0.7348220336437226, "num_tokens": 49570831.0, "step": 104000 }, { "entropy": 1.0195522010326385, "epoch": 2.5303372972447167, "grad_norm": 18.25, "learning_rate": 1.3106304013354632e-06, "loss": 1.025034408569336, "mean_token_accuracy": 0.7328062814474106, "num_tokens": 49596417.0, "step": 104050 }, { "entropy": 1.0655855059623718, "epoch": 2.53155322098198, "grad_norm": 23.375, "learning_rate": 1.3040045678852099e-06, "loss": 1.0512665557861327, "mean_token_accuracy": 0.7394750452041626, "num_tokens": 49616388.0, "step": 104100 }, { "entropy": 1.0378383231163024, "epoch": 2.5327691447192433, "grad_norm": 15.9375, "learning_rate": 1.2973943568523416e-06, "loss": 1.0224919128417969, "mean_token_accuracy": 0.7330958580970764, "num_tokens": 49639659.0, "step": 104150 }, { "entropy": 0.9586259236931801, "epoch": 2.5339850684565066, "grad_norm": 8.75, "learning_rate": 1.2907997801121542e-06, "loss": 0.9546808624267578, "mean_token_accuracy": 0.7492866569757461, "num_tokens": 49662368.0, "step": 104200 }, { "entropy": 0.9649835240840912, "epoch": 2.5352009921937695, "grad_norm": 17.375, "learning_rate": 1.2842208495118502e-06, "loss": 0.9805207061767578, "mean_token_accuracy": 0.7435810101032257, "num_tokens": 49688298.0, "step": 104250 }, { "entropy": 0.9989014172554016, "epoch": 2.536416915931033, "grad_norm": 21.875, "learning_rate": 1.2776575768705213e-06, "loss": 1.0170543670654297, "mean_token_accuracy": 0.7332951653003693, "num_tokens": 49714357.0, "step": 104300 }, { "entropy": 0.9954424214363098, "epoch": 2.537632839668296, "grad_norm": 12.5625, "learning_rate": 1.2711099739791399e-06, "loss": 1.0189527130126954, "mean_token_accuracy": 0.7315696287155151, "num_tokens": 49741832.0, "step": 104350 }, { "entropy": 0.9587236869335175, "epoch": 2.538848763405559, "grad_norm": 10.75, "learning_rate": 1.2645780526005147e-06, "loss": 0.9696900177001954, "mean_token_accuracy": 0.7403694558143615, "num_tokens": 49770248.0, "step": 104400 }, { "entropy": 0.927549187541008, "epoch": 2.5400646871428223, "grad_norm": 16.25, "learning_rate": 1.258061824469291e-06, "loss": 0.9346392822265625, "mean_token_accuracy": 0.7522609579563141, "num_tokens": 49794245.0, "step": 104450 }, { "entropy": 0.920587175488472, "epoch": 2.5412806108800856, "grad_norm": 12.4375, "learning_rate": 1.2515613012919214e-06, "loss": 0.9038491821289063, "mean_token_accuracy": 0.7648253214359283, "num_tokens": 49817349.0, "step": 104500 }, { "entropy": 1.0718003398180007, "epoch": 2.542496534617349, "grad_norm": 25.375, "learning_rate": 1.245076494746641e-06, "loss": 1.094884796142578, "mean_token_accuracy": 0.7215039187669754, "num_tokens": 49840396.0, "step": 104550 }, { "entropy": 0.9350012952089309, "epoch": 2.5437124583546122, "grad_norm": 17.875, "learning_rate": 1.2386074164834484e-06, "loss": 0.9280446624755859, "mean_token_accuracy": 0.7604186648130417, "num_tokens": 49865409.0, "step": 104600 }, { "entropy": 0.8917613792419433, "epoch": 2.544928382091875, "grad_norm": 15.75, "learning_rate": 1.232154078124087e-06, "loss": 0.9035269927978515, "mean_token_accuracy": 0.7588699334859847, "num_tokens": 49887961.0, "step": 104650 }, { "entropy": 1.010292176604271, "epoch": 2.5461443058291384, "grad_norm": 18.125, "learning_rate": 1.225716491262028e-06, "loss": 0.9866258239746094, "mean_token_accuracy": 0.7466860330104828, "num_tokens": 49910793.0, "step": 104700 }, { "entropy": 0.971391322016716, "epoch": 2.5473602295664017, "grad_norm": 12.25, "learning_rate": 1.2192946674624429e-06, "loss": 0.9778835296630859, "mean_token_accuracy": 0.7473379933834076, "num_tokens": 49937425.0, "step": 104750 }, { "entropy": 1.0349435910582543, "epoch": 2.5485761533036646, "grad_norm": 11.3125, "learning_rate": 1.2128886182621803e-06, "loss": 1.0352454376220703, "mean_token_accuracy": 0.7401261162757874, "num_tokens": 49962281.0, "step": 104800 }, { "entropy": 0.9529681444168091, "epoch": 2.549792077040928, "grad_norm": 38.5, "learning_rate": 1.2064983551697563e-06, "loss": 0.9308871459960938, "mean_token_accuracy": 0.761286780834198, "num_tokens": 49979918.0, "step": 104850 }, { "entropy": 1.045644696354866, "epoch": 2.5510080007781912, "grad_norm": 18.625, "learning_rate": 1.2001238896653212e-06, "loss": 1.0606533813476562, "mean_token_accuracy": 0.7238392519950867, "num_tokens": 50009600.0, "step": 104900 }, { "entropy": 0.9580894327163696, "epoch": 2.5522239245154545, "grad_norm": 40.25, "learning_rate": 1.1937652332006478e-06, "loss": 0.9611460113525391, "mean_token_accuracy": 0.7497076916694642, "num_tokens": 50033647.0, "step": 104950 }, { "entropy": 0.957829013466835, "epoch": 2.553439848252718, "grad_norm": 10.4375, "learning_rate": 1.1874223971991094e-06, "loss": 0.9514501953125, "mean_token_accuracy": 0.7532640171051025, "num_tokens": 50055253.0, "step": 105000 }, { "entropy": 0.9337042623758316, "epoch": 2.5546557719899807, "grad_norm": 27.5, "learning_rate": 1.1810953930556523e-06, "loss": 0.9286872863769531, "mean_token_accuracy": 0.759197895526886, "num_tokens": 50074208.0, "step": 105050 }, { "entropy": 1.015368835926056, "epoch": 2.555871695727244, "grad_norm": 56.0, "learning_rate": 1.1747842321367886e-06, "loss": 0.9994526672363281, "mean_token_accuracy": 0.739399573802948, "num_tokens": 50096720.0, "step": 105100 }, { "entropy": 0.9950816333293915, "epoch": 2.5570876194645074, "grad_norm": 28.375, "learning_rate": 1.1684889257805608e-06, "loss": 1.0056852722167968, "mean_token_accuracy": 0.741629763841629, "num_tokens": 50119138.0, "step": 105150 }, { "entropy": 0.9856169879436493, "epoch": 2.5583035432017702, "grad_norm": 12.5, "learning_rate": 1.1622094852965304e-06, "loss": 0.9855728912353515, "mean_token_accuracy": 0.7426676952838898, "num_tokens": 50139611.0, "step": 105200 }, { "entropy": 0.9671674263477326, "epoch": 2.5595194669390335, "grad_norm": 21.0, "learning_rate": 1.15594592196576e-06, "loss": 0.9858509063720703, "mean_token_accuracy": 0.7461441957950592, "num_tokens": 50166000.0, "step": 105250 }, { "entropy": 0.990457237958908, "epoch": 2.560735390676297, "grad_norm": 11.625, "learning_rate": 1.1496982470407813e-06, "loss": 0.9814657592773437, "mean_token_accuracy": 0.7424070465564728, "num_tokens": 50191662.0, "step": 105300 }, { "entropy": 0.9982630556821823, "epoch": 2.5619513144135597, "grad_norm": 15.25, "learning_rate": 1.1434664717455901e-06, "loss": 0.9837971496582031, "mean_token_accuracy": 0.7399848401546478, "num_tokens": 50212489.0, "step": 105350 }, { "entropy": 0.9551169204711915, "epoch": 2.563167238150823, "grad_norm": 12.9375, "learning_rate": 1.1372506072756152e-06, "loss": 0.9495316314697265, "mean_token_accuracy": 0.755679429769516, "num_tokens": 50236258.0, "step": 105400 }, { "entropy": 0.960747412443161, "epoch": 2.5643831618880863, "grad_norm": 19.75, "learning_rate": 1.1310506647976992e-06, "loss": 0.9621828460693359, "mean_token_accuracy": 0.7500583493709564, "num_tokens": 50260819.0, "step": 105450 }, { "entropy": 1.020504447221756, "epoch": 2.5655990856253497, "grad_norm": 34.75, "learning_rate": 1.1248666554500831e-06, "loss": 1.0132673645019532, "mean_token_accuracy": 0.7388945770263672, "num_tokens": 50285532.0, "step": 105500 }, { "entropy": 1.1757718765735625, "epoch": 2.566815009362613, "grad_norm": 14.625, "learning_rate": 1.1186985903423819e-06, "loss": 1.2050786590576172, "mean_token_accuracy": 0.6962389212846756, "num_tokens": 50314707.0, "step": 105550 }, { "entropy": 0.9802342519164086, "epoch": 2.568030933099876, "grad_norm": 34.25, "learning_rate": 1.1125464805555697e-06, "loss": 0.9754843139648437, "mean_token_accuracy": 0.7453578794002533, "num_tokens": 50334367.0, "step": 105600 }, { "entropy": 1.0081701058149337, "epoch": 2.569246856837139, "grad_norm": 16.625, "learning_rate": 1.1064103371419577e-06, "loss": 1.039241409301758, "mean_token_accuracy": 0.7361119741201401, "num_tokens": 50358892.0, "step": 105650 }, { "entropy": 0.9935075628757477, "epoch": 2.5704627805744025, "grad_norm": 25.0, "learning_rate": 1.1002901711251668e-06, "loss": 1.0251219940185547, "mean_token_accuracy": 0.7330999350547791, "num_tokens": 50379171.0, "step": 105700 }, { "entropy": 1.055458783507347, "epoch": 2.5716787043116653, "grad_norm": 15.125, "learning_rate": 1.0941859935001231e-06, "loss": 1.0801286315917968, "mean_token_accuracy": 0.7235253256559372, "num_tokens": 50400718.0, "step": 105750 }, { "entropy": 1.054695147871971, "epoch": 2.5728946280489287, "grad_norm": 14.625, "learning_rate": 1.0880978152330213e-06, "loss": 1.061307373046875, "mean_token_accuracy": 0.7222152727842331, "num_tokens": 50427852.0, "step": 105800 }, { "entropy": 1.0474263715744019, "epoch": 2.574110551786192, "grad_norm": 21.5, "learning_rate": 1.0820256472613166e-06, "loss": 1.0519487762451172, "mean_token_accuracy": 0.72873006939888, "num_tokens": 50451397.0, "step": 105850 }, { "entropy": 0.9653053539991379, "epoch": 2.5753264755234553, "grad_norm": 15.5, "learning_rate": 1.0759695004937055e-06, "loss": 0.960606689453125, "mean_token_accuracy": 0.755843060016632, "num_tokens": 50473281.0, "step": 105900 }, { "entropy": 1.0956609785556792, "epoch": 2.5765423992607186, "grad_norm": 17.75, "learning_rate": 1.0699293858100935e-06, "loss": 1.1128132629394532, "mean_token_accuracy": 0.712190557718277, "num_tokens": 50500960.0, "step": 105950 }, { "entropy": 0.93848153591156, "epoch": 2.5777583229979815, "grad_norm": 20.625, "learning_rate": 1.063905314061594e-06, "loss": 0.9437621307373046, "mean_token_accuracy": 0.7525544369220734, "num_tokens": 50529156.0, "step": 106000 }, { "entropy": 0.9521066558361053, "epoch": 2.578974246735245, "grad_norm": 17.75, "learning_rate": 1.0578972960704903e-06, "loss": 0.9440763092041016, "mean_token_accuracy": 0.7498063039779663, "num_tokens": 50552025.0, "step": 106050 }, { "entropy": 1.1378323197364808, "epoch": 2.580190170472508, "grad_norm": 14.875, "learning_rate": 1.051905342630234e-06, "loss": 1.1256524658203124, "mean_token_accuracy": 0.7236077344417572, "num_tokens": 50582984.0, "step": 106100 }, { "entropy": 0.9209997415542602, "epoch": 2.581406094209771, "grad_norm": 23.25, "learning_rate": 1.0459294645054097e-06, "loss": 0.9160138702392578, "mean_token_accuracy": 0.7584985792636871, "num_tokens": 50606955.0, "step": 106150 }, { "entropy": 1.0301914620399475, "epoch": 2.5826220179470343, "grad_norm": 14.0, "learning_rate": 1.0399696724317233e-06, "loss": 1.0384619140625, "mean_token_accuracy": 0.7278827118873596, "num_tokens": 50633880.0, "step": 106200 }, { "entropy": 0.978227613568306, "epoch": 2.5838379416842976, "grad_norm": 27.375, "learning_rate": 1.0340259771159832e-06, "loss": 0.9624491882324219, "mean_token_accuracy": 0.7367807048559188, "num_tokens": 50659958.0, "step": 106250 }, { "entropy": 0.897814707159996, "epoch": 2.585053865421561, "grad_norm": 6.84375, "learning_rate": 1.028098389236084e-06, "loss": 0.9050896453857422, "mean_token_accuracy": 0.7628456318378448, "num_tokens": 50682399.0, "step": 106300 }, { "entropy": 0.8614608305692673, "epoch": 2.586269789158824, "grad_norm": 16.25, "learning_rate": 1.0221869194409783e-06, "loss": 0.8588703155517579, "mean_token_accuracy": 0.7658287072181702, "num_tokens": 50705875.0, "step": 106350 }, { "entropy": 0.9519368588924408, "epoch": 2.587485712896087, "grad_norm": 12.625, "learning_rate": 1.0162915783506611e-06, "loss": 0.9517557525634766, "mean_token_accuracy": 0.7532089334726334, "num_tokens": 50730166.0, "step": 106400 }, { "entropy": 1.0135728597640992, "epoch": 2.5887016366333504, "grad_norm": 22.25, "learning_rate": 1.0104123765561535e-06, "loss": 1.0063963317871094, "mean_token_accuracy": 0.7417378401756287, "num_tokens": 50753779.0, "step": 106450 }, { "entropy": 0.9383879309892654, "epoch": 2.5899175603706137, "grad_norm": 11.625, "learning_rate": 1.0045493246194848e-06, "loss": 0.9201748657226563, "mean_token_accuracy": 0.749480459690094, "num_tokens": 50780240.0, "step": 106500 }, { "entropy": 1.0100717967748643, "epoch": 2.5911334841078766, "grad_norm": 15.4375, "learning_rate": 9.987024330736727e-07, "loss": 0.998207778930664, "mean_token_accuracy": 0.7440124940872193, "num_tokens": 50802590.0, "step": 106550 }, { "entropy": 1.0304051339626312, "epoch": 2.59234940784514, "grad_norm": 14.4375, "learning_rate": 9.92871712422694e-07, "loss": 1.0208925628662109, "mean_token_accuracy": 0.7414871287345887, "num_tokens": 50825236.0, "step": 106600 }, { "entropy": 1.0611192613840104, "epoch": 2.593565331582403, "grad_norm": 18.625, "learning_rate": 9.870571731414835e-07, "loss": 1.0406791687011718, "mean_token_accuracy": 0.7333661329746246, "num_tokens": 50847884.0, "step": 106650 }, { "entropy": 0.9788377296924591, "epoch": 2.594781255319666, "grad_norm": 18.25, "learning_rate": 9.812588256759004e-07, "loss": 1.0005054473876953, "mean_token_accuracy": 0.7451376354694367, "num_tokens": 50870173.0, "step": 106700 }, { "entropy": 1.0187804320454596, "epoch": 2.5959971790569294, "grad_norm": 16.375, "learning_rate": 9.754766804427152e-07, "loss": 1.0186637878417968, "mean_token_accuracy": 0.7338246703147888, "num_tokens": 50893556.0, "step": 106750 }, { "entropy": 0.9625055778026581, "epoch": 2.5972131027941927, "grad_norm": 14.3125, "learning_rate": 9.697107478295975e-07, "loss": 0.9796237945556641, "mean_token_accuracy": 0.7500422656536102, "num_tokens": 50915965.0, "step": 106800 }, { "entropy": 1.0167157530784607, "epoch": 2.598429026531456, "grad_norm": 15.875, "learning_rate": 9.639610381950816e-07, "loss": 1.027807159423828, "mean_token_accuracy": 0.7370240616798401, "num_tokens": 50940132.0, "step": 106850 }, { "entropy": 1.0073155176639557, "epoch": 2.5996449502687193, "grad_norm": 8.9375, "learning_rate": 9.582275618685644e-07, "loss": 1.02667236328125, "mean_token_accuracy": 0.7391720920801162, "num_tokens": 50967349.0, "step": 106900 }, { "entropy": 1.0670498284697532, "epoch": 2.600860874005982, "grad_norm": 16.25, "learning_rate": 9.525103291502736e-07, "loss": 1.0924049377441407, "mean_token_accuracy": 0.7224184620380402, "num_tokens": 50992567.0, "step": 106950 }, { "entropy": 1.1165875881910323, "epoch": 2.6020767977432455, "grad_norm": 20.25, "learning_rate": 9.46809350311263e-07, "loss": 1.1188265228271483, "mean_token_accuracy": 0.71525350689888, "num_tokens": 51015101.0, "step": 107000 }, { "entropy": 0.9907313132286072, "epoch": 2.603292721480509, "grad_norm": 41.75, "learning_rate": 9.411246355933779e-07, "loss": 0.9977623748779297, "mean_token_accuracy": 0.7370880508422851, "num_tokens": 51035823.0, "step": 107050 }, { "entropy": 0.9874867469072341, "epoch": 2.6045086452177717, "grad_norm": 22.25, "learning_rate": 9.354561952092478e-07, "loss": 0.9987039184570312, "mean_token_accuracy": 0.7445381438732147, "num_tokens": 51058696.0, "step": 107100 }, { "entropy": 1.0735037565231322, "epoch": 2.605724568955035, "grad_norm": 25.625, "learning_rate": 9.29804039342268e-07, "loss": 1.093974380493164, "mean_token_accuracy": 0.7218513321876526, "num_tokens": 51087148.0, "step": 107150 }, { "entropy": 0.9151772034168243, "epoch": 2.6069404926922983, "grad_norm": 24.125, "learning_rate": 9.241681781465784e-07, "loss": 0.9209111785888672, "mean_token_accuracy": 0.7569822013378144, "num_tokens": 51108700.0, "step": 107200 }, { "entropy": 1.0314585304260253, "epoch": 2.6081564164295616, "grad_norm": 9.375, "learning_rate": 9.185486217470396e-07, "loss": 1.0246949768066407, "mean_token_accuracy": 0.7376565325260163, "num_tokens": 51132938.0, "step": 107250 }, { "entropy": 0.8818664294481278, "epoch": 2.609372340166825, "grad_norm": 10.75, "learning_rate": 9.129453802392274e-07, "loss": 0.8881739044189453, "mean_token_accuracy": 0.7615623927116394, "num_tokens": 51156489.0, "step": 107300 }, { "entropy": 1.0000248461961747, "epoch": 2.610588263904088, "grad_norm": 25.375, "learning_rate": 9.073584636894039e-07, "loss": 1.0010150146484376, "mean_token_accuracy": 0.7375551617145538, "num_tokens": 51177137.0, "step": 107350 }, { "entropy": 1.0048554682731627, "epoch": 2.611804187641351, "grad_norm": 17.0, "learning_rate": 9.017878821345016e-07, "loss": 1.0393885040283204, "mean_token_accuracy": 0.7336253154277802, "num_tokens": 51200951.0, "step": 107400 }, { "entropy": 0.8729966682195663, "epoch": 2.6130201113786145, "grad_norm": 28.125, "learning_rate": 8.962336455821152e-07, "loss": 0.8851512145996093, "mean_token_accuracy": 0.7634774971008301, "num_tokens": 51227316.0, "step": 107450 }, { "entropy": 1.0107795852422714, "epoch": 2.6142360351158773, "grad_norm": 17.625, "learning_rate": 8.906957640104641e-07, "loss": 1.0106943511962891, "mean_token_accuracy": 0.7348872649669648, "num_tokens": 51252804.0, "step": 107500 }, { "entropy": 1.0472652053833007, "epoch": 2.6154519588531406, "grad_norm": 33.5, "learning_rate": 8.851742473683978e-07, "loss": 1.0249176788330079, "mean_token_accuracy": 0.7337560129165649, "num_tokens": 51274321.0, "step": 107550 }, { "entropy": 0.932758923470974, "epoch": 2.616667882590404, "grad_norm": 19.375, "learning_rate": 8.796691055753559e-07, "loss": 0.940406494140625, "mean_token_accuracy": 0.753874443769455, "num_tokens": 51295504.0, "step": 107600 }, { "entropy": 0.9790420436859131, "epoch": 2.6178838063276673, "grad_norm": 20.875, "learning_rate": 8.74180348521364e-07, "loss": 0.981773910522461, "mean_token_accuracy": 0.7440208911895752, "num_tokens": 51320635.0, "step": 107650 }, { "entropy": 0.9516189992427826, "epoch": 2.6190997300649306, "grad_norm": 19.375, "learning_rate": 8.687079860670167e-07, "loss": 0.9569261169433594, "mean_token_accuracy": 0.7523603713512421, "num_tokens": 51343312.0, "step": 107700 }, { "entropy": 1.0259203612804413, "epoch": 2.6203156538021934, "grad_norm": 26.125, "learning_rate": 8.632520280434475e-07, "loss": 1.0276480865478517, "mean_token_accuracy": 0.7344263398647308, "num_tokens": 51368685.0, "step": 107750 }, { "entropy": 1.0258416521549225, "epoch": 2.6215315775394568, "grad_norm": 10.25, "learning_rate": 8.578124842523283e-07, "loss": 1.0297837829589844, "mean_token_accuracy": 0.7316190075874328, "num_tokens": 51395732.0, "step": 107800 }, { "entropy": 1.0047945258021356, "epoch": 2.62274750127672, "grad_norm": 21.875, "learning_rate": 8.523893644658332e-07, "loss": 0.9905553436279297, "mean_token_accuracy": 0.7454310142993927, "num_tokens": 51418168.0, "step": 107850 }, { "entropy": 0.9352298212051392, "epoch": 2.623963425013983, "grad_norm": 22.875, "learning_rate": 8.4698267842664e-07, "loss": 0.9276483917236328, "mean_token_accuracy": 0.7557738256454468, "num_tokens": 51442448.0, "step": 107900 }, { "entropy": 1.1419610154628754, "epoch": 2.6251793487512463, "grad_norm": 9.8125, "learning_rate": 8.415924358478966e-07, "loss": 1.1374635314941406, "mean_token_accuracy": 0.7137657189369202, "num_tokens": 51463121.0, "step": 107950 }, { "entropy": 0.9550573861598969, "epoch": 2.6263952724885096, "grad_norm": 18.125, "learning_rate": 8.362186464132094e-07, "loss": 0.9863291931152344, "mean_token_accuracy": 0.7405512738227844, "num_tokens": 51486864.0, "step": 108000 }, { "entropy": 0.9745255905389786, "epoch": 2.627611196225773, "grad_norm": 18.75, "learning_rate": 8.308613197766325e-07, "loss": 0.9733135223388671, "mean_token_accuracy": 0.7449212801456452, "num_tokens": 51509544.0, "step": 108050 }, { "entropy": 0.9881429421901703, "epoch": 2.6288271199630358, "grad_norm": 14.5, "learning_rate": 8.255204655626414e-07, "loss": 1.0146865081787109, "mean_token_accuracy": 0.7404028749465943, "num_tokens": 51537320.0, "step": 108100 }, { "entropy": 0.9777805525064468, "epoch": 2.630043043700299, "grad_norm": 17.5, "learning_rate": 8.201960933661147e-07, "loss": 1.0042379760742188, "mean_token_accuracy": 0.7393556904792785, "num_tokens": 51559624.0, "step": 108150 }, { "entropy": 1.015886892080307, "epoch": 2.6312589674375624, "grad_norm": 19.0, "learning_rate": 8.148882127523306e-07, "loss": 1.030931167602539, "mean_token_accuracy": 0.7327668941020966, "num_tokens": 51586499.0, "step": 108200 }, { "entropy": 1.0013961344957352, "epoch": 2.6324748911748257, "grad_norm": 13.4375, "learning_rate": 8.095968332569292e-07, "loss": 1.0114504241943358, "mean_token_accuracy": 0.7402716255187989, "num_tokens": 51608793.0, "step": 108250 }, { "entropy": 0.8325568000972271, "epoch": 2.6336908149120886, "grad_norm": 13.6875, "learning_rate": 8.043219643859124e-07, "loss": 0.8319743347167968, "mean_token_accuracy": 0.7731650233268738, "num_tokens": 51633800.0, "step": 108300 }, { "entropy": 0.9807480078935623, "epoch": 2.634906738649352, "grad_norm": 14.625, "learning_rate": 7.990636156156217e-07, "loss": 0.9987392425537109, "mean_token_accuracy": 0.7426286625862122, "num_tokens": 51657355.0, "step": 108350 }, { "entropy": 0.9375357377529144, "epoch": 2.636122662386615, "grad_norm": 16.75, "learning_rate": 7.938217963927141e-07, "loss": 0.93902587890625, "mean_token_accuracy": 0.75060760140419, "num_tokens": 51681824.0, "step": 108400 }, { "entropy": 0.9495703846216201, "epoch": 2.637338586123878, "grad_norm": 13.0625, "learning_rate": 7.885965161341591e-07, "loss": 0.9502619171142578, "mean_token_accuracy": 0.7510526728630066, "num_tokens": 51706344.0, "step": 108450 }, { "entropy": 0.9719495612382889, "epoch": 2.6385545098611414, "grad_norm": 16.375, "learning_rate": 7.833877842272064e-07, "loss": 0.9795008850097656, "mean_token_accuracy": 0.7364980113506318, "num_tokens": 51726867.0, "step": 108500 }, { "entropy": 0.9817003554105759, "epoch": 2.6397704335984047, "grad_norm": 31.625, "learning_rate": 7.78195610029382e-07, "loss": 1.016275405883789, "mean_token_accuracy": 0.738146271109581, "num_tokens": 51752989.0, "step": 108550 }, { "entropy": 1.0005552279949188, "epoch": 2.640986357335668, "grad_norm": 17.75, "learning_rate": 7.730200028684632e-07, "loss": 0.9963328552246093, "mean_token_accuracy": 0.7358175146579743, "num_tokens": 51778992.0, "step": 108600 }, { "entropy": 0.9405342769622803, "epoch": 2.6422022810729313, "grad_norm": 22.5, "learning_rate": 7.678609720424624e-07, "loss": 0.9533555603027344, "mean_token_accuracy": 0.7517112970352173, "num_tokens": 51800271.0, "step": 108650 }, { "entropy": 0.8869284254312515, "epoch": 2.643418204810194, "grad_norm": 14.375, "learning_rate": 7.627185268196191e-07, "loss": 0.8910034942626953, "mean_token_accuracy": 0.7607296442985535, "num_tokens": 51821348.0, "step": 108700 }, { "entropy": 0.9292179423570633, "epoch": 2.6446341285474575, "grad_norm": 14.0625, "learning_rate": 7.57592676438369e-07, "loss": 0.9447904205322266, "mean_token_accuracy": 0.7495230841636658, "num_tokens": 51846122.0, "step": 108750 }, { "entropy": 1.0587746250629424, "epoch": 2.645850052284721, "grad_norm": 19.0, "learning_rate": 7.524834301073413e-07, "loss": 1.0915943145751954, "mean_token_accuracy": 0.7184668374061585, "num_tokens": 51868929.0, "step": 108800 }, { "entropy": 0.8958941513299942, "epoch": 2.6470659760219837, "grad_norm": 20.75, "learning_rate": 7.473907970053329e-07, "loss": 0.9057614898681641, "mean_token_accuracy": 0.7553265571594239, "num_tokens": 51890906.0, "step": 108850 }, { "entropy": 0.8836267638206482, "epoch": 2.648281899759247, "grad_norm": 26.875, "learning_rate": 7.423147862812918e-07, "loss": 0.8770804595947266, "mean_token_accuracy": 0.7586109900474548, "num_tokens": 51914415.0, "step": 108900 }, { "entropy": 0.969136636853218, "epoch": 2.6494978234965103, "grad_norm": 18.5, "learning_rate": 7.372554070543103e-07, "loss": 0.9676994323730469, "mean_token_accuracy": 0.7478697621822357, "num_tokens": 51936445.0, "step": 108950 }, { "entropy": 0.9888305449485779, "epoch": 2.6507137472337736, "grad_norm": 53.75, "learning_rate": 7.322126684135999e-07, "loss": 1.0024871063232421, "mean_token_accuracy": 0.7476302927732468, "num_tokens": 51960084.0, "step": 109000 }, { "entropy": 0.9995301097631455, "epoch": 2.651929670971037, "grad_norm": 16.5, "learning_rate": 7.27186579418474e-07, "loss": 1.0153566741943358, "mean_token_accuracy": 0.7376843512058258, "num_tokens": 51981137.0, "step": 109050 }, { "entropy": 1.0058932900428772, "epoch": 2.6531455947083, "grad_norm": 9.875, "learning_rate": 7.221771490983387e-07, "loss": 1.0111757659912108, "mean_token_accuracy": 0.7379914736747741, "num_tokens": 52005051.0, "step": 109100 }, { "entropy": 1.140269963145256, "epoch": 2.654361518445563, "grad_norm": 18.75, "learning_rate": 7.171843864526706e-07, "loss": 1.1518521881103516, "mean_token_accuracy": 0.711386787891388, "num_tokens": 52027387.0, "step": 109150 }, { "entropy": 0.9263234037160873, "epoch": 2.6555774421828264, "grad_norm": 15.8125, "learning_rate": 7.122083004510016e-07, "loss": 0.9171220397949219, "mean_token_accuracy": 0.7574475121498108, "num_tokens": 52050316.0, "step": 109200 }, { "entropy": 1.0187901270389557, "epoch": 2.6567933659200893, "grad_norm": 15.875, "learning_rate": 7.072489000329086e-07, "loss": 1.020080795288086, "mean_token_accuracy": 0.743782354593277, "num_tokens": 52071593.0, "step": 109250 }, { "entropy": 1.1445829141139985, "epoch": 2.6580092896573526, "grad_norm": 22.25, "learning_rate": 7.023061941079856e-07, "loss": 1.1712594604492188, "mean_token_accuracy": 0.7142707651853561, "num_tokens": 52095526.0, "step": 109300 }, { "entropy": 0.9876207113265991, "epoch": 2.659225213394616, "grad_norm": 12.3125, "learning_rate": 6.973801915558432e-07, "loss": 0.97959228515625, "mean_token_accuracy": 0.7444368797540665, "num_tokens": 52120191.0, "step": 109350 }, { "entropy": 1.1116272377967835, "epoch": 2.6604411371318792, "grad_norm": 24.125, "learning_rate": 6.924709012260755e-07, "loss": 1.1218132781982422, "mean_token_accuracy": 0.7211153489351273, "num_tokens": 52144367.0, "step": 109400 }, { "entropy": 0.9943124735355378, "epoch": 2.6616570608691426, "grad_norm": 15.875, "learning_rate": 6.875783319382612e-07, "loss": 1.0208475494384766, "mean_token_accuracy": 0.7298110449314117, "num_tokens": 52168073.0, "step": 109450 }, { "entropy": 1.090029501914978, "epoch": 2.6628729846064054, "grad_norm": 12.9375, "learning_rate": 6.827024924819337e-07, "loss": 1.0875837707519531, "mean_token_accuracy": 0.7222688376903534, "num_tokens": 52193369.0, "step": 109500 }, { "entropy": 0.888359522819519, "epoch": 2.6640889083436687, "grad_norm": 10.375, "learning_rate": 6.778433916165716e-07, "loss": 0.8942478179931641, "mean_token_accuracy": 0.7520707392692566, "num_tokens": 52218159.0, "step": 109550 }, { "entropy": 0.9250783973932266, "epoch": 2.665304832080932, "grad_norm": 9.4375, "learning_rate": 6.730010380715878e-07, "loss": 0.9290471649169922, "mean_token_accuracy": 0.75400505900383, "num_tokens": 52247559.0, "step": 109600 }, { "entropy": 0.9408354532718658, "epoch": 2.666520755818195, "grad_norm": 20.625, "learning_rate": 6.681754405463015e-07, "loss": 0.9639012908935547, "mean_token_accuracy": 0.7428287172317505, "num_tokens": 52272637.0, "step": 109650 }, { "entropy": 1.040404914021492, "epoch": 2.6677366795554582, "grad_norm": 13.4375, "learning_rate": 6.633666077099332e-07, "loss": 1.0356200408935547, "mean_token_accuracy": 0.73112524330616, "num_tokens": 52296867.0, "step": 109700 }, { "entropy": 1.028226996064186, "epoch": 2.6689526032927215, "grad_norm": 15.0625, "learning_rate": 6.585745482015904e-07, "loss": 1.0305223846435547, "mean_token_accuracy": 0.7350073659420013, "num_tokens": 52322519.0, "step": 109750 }, { "entropy": 1.0028828209638596, "epoch": 2.6701685270299844, "grad_norm": 35.75, "learning_rate": 6.537992706302332e-07, "loss": 1.0319060516357421, "mean_token_accuracy": 0.7344464075565338, "num_tokens": 52347347.0, "step": 109800 }, { "entropy": 1.021370929479599, "epoch": 2.6713844507672477, "grad_norm": 9.375, "learning_rate": 6.490407835746859e-07, "loss": 1.0465718841552734, "mean_token_accuracy": 0.7312397027015686, "num_tokens": 52374176.0, "step": 109850 }, { "entropy": 0.9162893271446229, "epoch": 2.672600374504511, "grad_norm": 17.5, "learning_rate": 6.44299095583607e-07, "loss": 0.9150701904296875, "mean_token_accuracy": 0.7565386950969696, "num_tokens": 52395103.0, "step": 109900 }, { "entropy": 0.8948313707113266, "epoch": 2.6738162982417744, "grad_norm": 20.875, "learning_rate": 6.39574215175468e-07, "loss": 0.9111589050292969, "mean_token_accuracy": 0.760413510799408, "num_tokens": 52417792.0, "step": 109950 }, { "entropy": 0.9795268297195434, "epoch": 2.6750322219790377, "grad_norm": 30.25, "learning_rate": 6.348661508385523e-07, "loss": 0.9622150421142578, "mean_token_accuracy": 0.7479294657707214, "num_tokens": 52439853.0, "step": 110000 }, { "epoch": 2.6750322219790377, "eval_entropy": 1.0785905203035373, "eval_loss": 1.3080554008483887, "eval_mean_token_accuracy": 0.6793590920984837, "eval_num_tokens": 52439853.0, "eval_runtime": 391.2365, "eval_samples_per_second": 11.678, "eval_steps_per_second": 11.678, "step": 110000 }, { "entropy": 0.978514341711998, "epoch": 2.6762481457163005, "grad_norm": 16.375, "learning_rate": 6.301749110309319e-07, "loss": 0.9512926483154297, "mean_token_accuracy": 0.7553909814357758, "num_tokens": 52463223.0, "step": 110050 }, { "entropy": 1.0176084995269776, "epoch": 2.677464069453564, "grad_norm": 16.125, "learning_rate": 6.255005041804474e-07, "loss": 1.0234467315673828, "mean_token_accuracy": 0.736748053431511, "num_tokens": 52489847.0, "step": 110100 }, { "entropy": 0.9373827886581421, "epoch": 2.678679993190827, "grad_norm": 23.0, "learning_rate": 6.208429386847104e-07, "loss": 0.9441160583496093, "mean_token_accuracy": 0.7545195519924164, "num_tokens": 52513058.0, "step": 110150 }, { "entropy": 0.955939639210701, "epoch": 2.67989591692809, "grad_norm": 13.3125, "learning_rate": 6.162022229110654e-07, "loss": 0.9519827270507812, "mean_token_accuracy": 0.7546627259254456, "num_tokens": 52534885.0, "step": 110200 }, { "entropy": 0.902596794962883, "epoch": 2.6811118406653534, "grad_norm": 39.0, "learning_rate": 6.115783651965935e-07, "loss": 0.8847471618652344, "mean_token_accuracy": 0.7647233009338379, "num_tokens": 52554837.0, "step": 110250 }, { "entropy": 1.0203976285457612, "epoch": 2.6823277644026167, "grad_norm": 13.0625, "learning_rate": 6.069713738480864e-07, "loss": 1.0223790740966796, "mean_token_accuracy": 0.7350008666515351, "num_tokens": 52584236.0, "step": 110300 }, { "entropy": 0.9624048119783402, "epoch": 2.68354368813988, "grad_norm": 6.96875, "learning_rate": 6.023812571420385e-07, "loss": 0.9564561462402343, "mean_token_accuracy": 0.7504685628414154, "num_tokens": 52608929.0, "step": 110350 }, { "entropy": 0.9981696462631225, "epoch": 2.6847596118771433, "grad_norm": 16.125, "learning_rate": 5.978080233246264e-07, "loss": 1.0247626495361328, "mean_token_accuracy": 0.730824030637741, "num_tokens": 52633022.0, "step": 110400 }, { "entropy": 1.0921355414390563, "epoch": 2.685975535614406, "grad_norm": 13.375, "learning_rate": 5.932516806116951e-07, "loss": 1.1084934234619142, "mean_token_accuracy": 0.721460273861885, "num_tokens": 52658476.0, "step": 110450 }, { "entropy": 0.906126703619957, "epoch": 2.6871914593516695, "grad_norm": 24.875, "learning_rate": 5.8871223718875e-07, "loss": 0.9305561828613281, "mean_token_accuracy": 0.7550677192211152, "num_tokens": 52681876.0, "step": 110500 }, { "entropy": 1.0651261794567108, "epoch": 2.688407383088933, "grad_norm": 15.5, "learning_rate": 5.841897012109287e-07, "loss": 1.085829391479492, "mean_token_accuracy": 0.7274690914154053, "num_tokens": 52705285.0, "step": 110550 }, { "entropy": 1.0820489275455474, "epoch": 2.6896233068261957, "grad_norm": 15.375, "learning_rate": 5.796840808030025e-07, "loss": 1.0892329406738281, "mean_token_accuracy": 0.7278337723016739, "num_tokens": 52733638.0, "step": 110600 }, { "entropy": 1.0086560279130936, "epoch": 2.690839230563459, "grad_norm": 35.25, "learning_rate": 5.751953840593516e-07, "loss": 1.0254384613037109, "mean_token_accuracy": 0.7335282695293427, "num_tokens": 52756230.0, "step": 110650 }, { "entropy": 1.0820791721343994, "epoch": 2.6920551543007223, "grad_norm": 12.25, "learning_rate": 5.707236190439492e-07, "loss": 1.0810054779052733, "mean_token_accuracy": 0.7148030984401703, "num_tokens": 52778400.0, "step": 110700 }, { "entropy": 0.9889980632066727, "epoch": 2.6932710780379856, "grad_norm": 22.125, "learning_rate": 5.662687937903521e-07, "loss": 0.9881917572021485, "mean_token_accuracy": 0.7425192987918854, "num_tokens": 52804868.0, "step": 110750 }, { "entropy": 1.0141047620773316, "epoch": 2.694487001775249, "grad_norm": 29.375, "learning_rate": 5.618309163016877e-07, "loss": 1.035560073852539, "mean_token_accuracy": 0.7322927641868592, "num_tokens": 52831396.0, "step": 110800 }, { "entropy": 0.9885842150449753, "epoch": 2.695702925512512, "grad_norm": 9.0625, "learning_rate": 5.574099945506328e-07, "loss": 0.9772468566894531, "mean_token_accuracy": 0.750915162563324, "num_tokens": 52855166.0, "step": 110850 }, { "entropy": 1.0062905108928681, "epoch": 2.696918849249775, "grad_norm": 14.375, "learning_rate": 5.530060364794054e-07, "loss": 1.0184424591064454, "mean_token_accuracy": 0.7356734907627106, "num_tokens": 52876092.0, "step": 110900 }, { "entropy": 1.0760069018602372, "epoch": 2.6981347729870384, "grad_norm": 23.125, "learning_rate": 5.486190499997457e-07, "loss": 1.0796470642089844, "mean_token_accuracy": 0.7258928453922272, "num_tokens": 52901684.0, "step": 110950 }, { "entropy": 1.0817727106809616, "epoch": 2.6993506967243013, "grad_norm": 12.0, "learning_rate": 5.442490429929092e-07, "loss": 1.0634054565429687, "mean_token_accuracy": 0.7314545238018035, "num_tokens": 52924533.0, "step": 111000 }, { "entropy": 0.9455717313289642, "epoch": 2.7005666204615646, "grad_norm": 15.375, "learning_rate": 5.398960233096418e-07, "loss": 0.9585591888427735, "mean_token_accuracy": 0.7482923996448517, "num_tokens": 52947921.0, "step": 111050 }, { "entropy": 0.9983842268586158, "epoch": 2.701782544198828, "grad_norm": 9.625, "learning_rate": 5.355599987701743e-07, "loss": 0.9937109375, "mean_token_accuracy": 0.7392192316055298, "num_tokens": 52971865.0, "step": 111100 }, { "entropy": 0.9854601389169693, "epoch": 2.7029984679360908, "grad_norm": 15.5625, "learning_rate": 5.312409771642068e-07, "loss": 1.0086141967773437, "mean_token_accuracy": 0.7358418297767639, "num_tokens": 52996471.0, "step": 111150 }, { "entropy": 1.0432666745781898, "epoch": 2.704214391673354, "grad_norm": 12.5625, "learning_rate": 5.269389662508906e-07, "loss": 1.066672134399414, "mean_token_accuracy": 0.7249362748861313, "num_tokens": 53020218.0, "step": 111200 }, { "entropy": 0.9767155832052231, "epoch": 2.7054303154106174, "grad_norm": 13.5, "learning_rate": 5.226539737588221e-07, "loss": 0.9942125701904296, "mean_token_accuracy": 0.7392118632793426, "num_tokens": 53045818.0, "step": 111250 }, { "entropy": 0.9806083908677101, "epoch": 2.7066462391478807, "grad_norm": 16.5, "learning_rate": 5.18386007386017e-07, "loss": 0.9900437927246094, "mean_token_accuracy": 0.7320473551750183, "num_tokens": 53071633.0, "step": 111300 }, { "entropy": 0.9807909029722214, "epoch": 2.707862162885144, "grad_norm": 16.625, "learning_rate": 5.141350747999075e-07, "loss": 0.9789112091064454, "mean_token_accuracy": 0.749490841627121, "num_tokens": 53094679.0, "step": 111350 }, { "entropy": 0.9773163163661956, "epoch": 2.709078086622407, "grad_norm": 15.75, "learning_rate": 5.099011836373257e-07, "loss": 0.9811200714111328, "mean_token_accuracy": 0.7421659135818481, "num_tokens": 53117283.0, "step": 111400 }, { "entropy": 0.9990241238474846, "epoch": 2.71029401035967, "grad_norm": 14.1875, "learning_rate": 5.056843415044854e-07, "loss": 0.9808837890625, "mean_token_accuracy": 0.7477682399749755, "num_tokens": 53137420.0, "step": 111450 }, { "entropy": 0.990340739786625, "epoch": 2.7115099340969335, "grad_norm": 20.25, "learning_rate": 5.014845559769743e-07, "loss": 0.9780358123779297, "mean_token_accuracy": 0.7454759222269058, "num_tokens": 53158021.0, "step": 111500 }, { "entropy": 0.9458284008502961, "epoch": 2.7127258578341964, "grad_norm": 20.125, "learning_rate": 4.973018345997393e-07, "loss": 0.9315061950683594, "mean_token_accuracy": 0.7538479954004288, "num_tokens": 53178863.0, "step": 111550 }, { "entropy": 0.9428892213106156, "epoch": 2.7139417815714597, "grad_norm": 9.875, "learning_rate": 4.931361848870653e-07, "loss": 0.9553699493408203, "mean_token_accuracy": 0.7438997101783752, "num_tokens": 53202198.0, "step": 111600 }, { "entropy": 1.1163210082054138, "epoch": 2.715157705308723, "grad_norm": 15.8125, "learning_rate": 4.889876143225714e-07, "loss": 1.134495391845703, "mean_token_accuracy": 0.7105651330947876, "num_tokens": 53226375.0, "step": 111650 }, { "entropy": 0.8937248748540878, "epoch": 2.7163736290459863, "grad_norm": 11.625, "learning_rate": 4.848561303591981e-07, "loss": 0.9029081726074218, "mean_token_accuracy": 0.7648430478572845, "num_tokens": 53247933.0, "step": 111700 }, { "entropy": 1.046981431245804, "epoch": 2.7175895527832497, "grad_norm": 21.375, "learning_rate": 4.807417404191805e-07, "loss": 1.054875946044922, "mean_token_accuracy": 0.7271146833896637, "num_tokens": 53272487.0, "step": 111750 }, { "entropy": 0.9359622603654861, "epoch": 2.7188054765205125, "grad_norm": 19.5, "learning_rate": 4.766444518940527e-07, "loss": 0.9317530059814453, "mean_token_accuracy": 0.7590201687812805, "num_tokens": 53295108.0, "step": 111800 }, { "entropy": 0.9497821629047394, "epoch": 2.720021400257776, "grad_norm": 12.75, "learning_rate": 4.7256427214462106e-07, "loss": 0.9581794738769531, "mean_token_accuracy": 0.7480999541282654, "num_tokens": 53319392.0, "step": 111850 }, { "entropy": 1.0011928710341453, "epoch": 2.721237323995039, "grad_norm": 17.75, "learning_rate": 4.685012085009577e-07, "loss": 0.9822267913818359, "mean_token_accuracy": 0.7474860525131226, "num_tokens": 53344882.0, "step": 111900 }, { "entropy": 1.0034582489728927, "epoch": 2.722453247732302, "grad_norm": 21.125, "learning_rate": 4.644552682623871e-07, "loss": 1.0003804779052734, "mean_token_accuracy": 0.7377656459808349, "num_tokens": 53366495.0, "step": 111950 }, { "entropy": 1.0132174062728883, "epoch": 2.7236691714695653, "grad_norm": 24.0, "learning_rate": 4.6042645869746494e-07, "loss": 1.031844482421875, "mean_token_accuracy": 0.7284968078136445, "num_tokens": 53393000.0, "step": 112000 }, { "entropy": 1.0086028277873993, "epoch": 2.7248850952068286, "grad_norm": 13.4375, "learning_rate": 4.5641478704398165e-07, "loss": 1.0058221435546875, "mean_token_accuracy": 0.7356225657463074, "num_tokens": 53420632.0, "step": 112050 }, { "entropy": 0.9943940246105194, "epoch": 2.726101018944092, "grad_norm": 18.625, "learning_rate": 4.5242026050892896e-07, "loss": 1.02170166015625, "mean_token_accuracy": 0.7406422048807144, "num_tokens": 53444649.0, "step": 112100 }, { "entropy": 1.000537900328636, "epoch": 2.7273169426813553, "grad_norm": 18.25, "learning_rate": 4.484428862685053e-07, "loss": 1.0259168243408203, "mean_token_accuracy": 0.7345914912223815, "num_tokens": 53469436.0, "step": 112150 }, { "entropy": 1.0034010928869248, "epoch": 2.728532866418618, "grad_norm": 15.4375, "learning_rate": 4.4448267146809277e-07, "loss": 1.0031063079833984, "mean_token_accuracy": 0.7453998756408692, "num_tokens": 53495445.0, "step": 112200 }, { "entropy": 1.0169308161735535, "epoch": 2.7297487901558815, "grad_norm": 19.25, "learning_rate": 4.4053962322224033e-07, "loss": 1.0443162536621093, "mean_token_accuracy": 0.7230087840557098, "num_tokens": 53523184.0, "step": 112250 }, { "entropy": 0.9607875472307206, "epoch": 2.7309647138931448, "grad_norm": 14.9375, "learning_rate": 4.36613748614666e-07, "loss": 0.9652200317382813, "mean_token_accuracy": 0.7501916837692261, "num_tokens": 53544517.0, "step": 112300 }, { "entropy": 0.954289962053299, "epoch": 2.7321806376304076, "grad_norm": 12.1875, "learning_rate": 4.3270505469822924e-07, "loss": 0.9712342834472656, "mean_token_accuracy": 0.7482094514369965, "num_tokens": 53569506.0, "step": 112350 }, { "entropy": 1.0581603199243546, "epoch": 2.733396561367671, "grad_norm": 10.9375, "learning_rate": 4.288135484949285e-07, "loss": 1.0801685333251954, "mean_token_accuracy": 0.7261176526546478, "num_tokens": 53594931.0, "step": 112400 }, { "entropy": 0.9222967135906219, "epoch": 2.7346124851049343, "grad_norm": 32.75, "learning_rate": 4.249392369958805e-07, "loss": 0.9458370971679687, "mean_token_accuracy": 0.7474024856090545, "num_tokens": 53619337.0, "step": 112450 }, { "entropy": 0.9795794749259948, "epoch": 2.7358284088421976, "grad_norm": 24.5, "learning_rate": 4.2108212716131525e-07, "loss": 0.9957691192626953, "mean_token_accuracy": 0.7457467317581177, "num_tokens": 53644210.0, "step": 112500 }, { "entropy": 1.017253800034523, "epoch": 2.737044332579461, "grad_norm": 17.375, "learning_rate": 4.172422259205544e-07, "loss": 1.0299463653564453, "mean_token_accuracy": 0.7449036484956741, "num_tokens": 53672764.0, "step": 112550 }, { "entropy": 0.9621437293291092, "epoch": 2.7382602563167238, "grad_norm": 13.625, "learning_rate": 4.1341954017201204e-07, "loss": 0.9920436859130859, "mean_token_accuracy": 0.741725685596466, "num_tokens": 53697029.0, "step": 112600 }, { "entropy": 0.9505011695623398, "epoch": 2.739476180053987, "grad_norm": 27.75, "learning_rate": 4.096140767831658e-07, "loss": 0.9600469970703125, "mean_token_accuracy": 0.7474373006820678, "num_tokens": 53721706.0, "step": 112650 }, { "entropy": 1.0270223355293273, "epoch": 2.7406921037912504, "grad_norm": 15.25, "learning_rate": 4.0582584259056146e-07, "loss": 1.0164574432373046, "mean_token_accuracy": 0.7405843985080719, "num_tokens": 53743727.0, "step": 112700 }, { "entropy": 0.9625311356782913, "epoch": 2.7419080275285133, "grad_norm": 8.9375, "learning_rate": 4.0205484439978624e-07, "loss": 0.9965531921386719, "mean_token_accuracy": 0.740030552148819, "num_tokens": 53768326.0, "step": 112750 }, { "entropy": 0.9035199588537216, "epoch": 2.7431239512657766, "grad_norm": 11.9375, "learning_rate": 3.983010889854666e-07, "loss": 0.8987786865234375, "mean_token_accuracy": 0.7557780027389527, "num_tokens": 53791336.0, "step": 112800 }, { "entropy": 0.9608728247880935, "epoch": 2.74433987500304, "grad_norm": 26.125, "learning_rate": 3.9456458309125034e-07, "loss": 0.9556855773925781, "mean_token_accuracy": 0.756381641626358, "num_tokens": 53814232.0, "step": 112850 }, { "entropy": 0.9066956454515457, "epoch": 2.7455557987403028, "grad_norm": 14.625, "learning_rate": 3.9084533342979457e-07, "loss": 0.9037670135498047, "mean_token_accuracy": 0.7665419244766235, "num_tokens": 53837664.0, "step": 112900 }, { "entropy": 1.0514943373203278, "epoch": 2.746771722477566, "grad_norm": 11.625, "learning_rate": 3.871433466827612e-07, "loss": 1.0651081085205079, "mean_token_accuracy": 0.7305334222316742, "num_tokens": 53863118.0, "step": 112950 }, { "entropy": 0.9845683360099793, "epoch": 2.7479876462148294, "grad_norm": 15.5, "learning_rate": 3.834586295007925e-07, "loss": 1.0171753692626953, "mean_token_accuracy": 0.7373329365253448, "num_tokens": 53887735.0, "step": 113000 }, { "entropy": 0.9632229709625244, "epoch": 2.7492035699520927, "grad_norm": 17.0, "learning_rate": 3.7979118850350996e-07, "loss": 0.9951593017578125, "mean_token_accuracy": 0.7498142182826996, "num_tokens": 53911048.0, "step": 113050 }, { "entropy": 1.0226265209913254, "epoch": 2.750419493689356, "grad_norm": 12.0, "learning_rate": 3.7614103027949875e-07, "loss": 1.0260712432861328, "mean_token_accuracy": 0.7369971477985382, "num_tokens": 53935510.0, "step": 113100 }, { "entropy": 0.9337666207551956, "epoch": 2.751635417426619, "grad_norm": 13.75, "learning_rate": 3.7250816138629355e-07, "loss": 0.9404550933837891, "mean_token_accuracy": 0.7509028738737107, "num_tokens": 53958705.0, "step": 113150 }, { "entropy": 0.948011953830719, "epoch": 2.752851341163882, "grad_norm": 17.0, "learning_rate": 3.68892588350368e-07, "loss": 0.9335304260253906, "mean_token_accuracy": 0.746604882478714, "num_tokens": 53982920.0, "step": 113200 }, { "entropy": 1.186265920996666, "epoch": 2.7540672649011455, "grad_norm": 12.4375, "learning_rate": 3.652943176671253e-07, "loss": 1.1933872985839844, "mean_token_accuracy": 0.7036471903324127, "num_tokens": 54012580.0, "step": 113250 }, { "entropy": 1.019960412979126, "epoch": 2.7552831886384084, "grad_norm": 13.4375, "learning_rate": 3.6171335580088227e-07, "loss": 1.0278069305419921, "mean_token_accuracy": 0.7315247750282288, "num_tokens": 54036289.0, "step": 113300 }, { "entropy": 1.012952653169632, "epoch": 2.7564991123756717, "grad_norm": 13.0625, "learning_rate": 3.5814970918486623e-07, "loss": 0.9884164428710938, "mean_token_accuracy": 0.743416291475296, "num_tokens": 54060333.0, "step": 113350 }, { "entropy": 1.0255360954999924, "epoch": 2.757715036112935, "grad_norm": 30.25, "learning_rate": 3.5460338422119154e-07, "loss": 1.0446531677246094, "mean_token_accuracy": 0.7350248819589615, "num_tokens": 54080086.0, "step": 113400 }, { "entropy": 1.0823175358772277, "epoch": 2.7589309598501983, "grad_norm": 13.0, "learning_rate": 3.5107438728085417e-07, "loss": 1.0932450103759765, "mean_token_accuracy": 0.7148306012153626, "num_tokens": 54106485.0, "step": 113450 }, { "entropy": 0.9568195241689682, "epoch": 2.7601468835874616, "grad_norm": 21.0, "learning_rate": 3.47562724703725e-07, "loss": 0.9562154388427735, "mean_token_accuracy": 0.7448030650615692, "num_tokens": 54129159.0, "step": 113500 }, { "entropy": 1.059928260743618, "epoch": 2.7613628073247245, "grad_norm": 19.625, "learning_rate": 3.440684027985286e-07, "loss": 1.1003276824951171, "mean_token_accuracy": 0.7176908779144288, "num_tokens": 54152939.0, "step": 113550 }, { "entropy": 1.0022494161128999, "epoch": 2.762578731061988, "grad_norm": 18.125, "learning_rate": 3.405914278428413e-07, "loss": 1.011132125854492, "mean_token_accuracy": 0.7390492248535157, "num_tokens": 54177210.0, "step": 113600 }, { "entropy": 0.9644745582342148, "epoch": 2.763794654799251, "grad_norm": 11.4375, "learning_rate": 3.3713180608307084e-07, "loss": 0.9674130249023437, "mean_token_accuracy": 0.7484719932079316, "num_tokens": 54200632.0, "step": 113650 }, { "entropy": 1.0845598179101943, "epoch": 2.765010578536514, "grad_norm": 22.5, "learning_rate": 3.336895437344545e-07, "loss": 1.0845651245117187, "mean_token_accuracy": 0.7360820555686951, "num_tokens": 54223330.0, "step": 113700 }, { "entropy": 1.0320745152235031, "epoch": 2.7662265022737773, "grad_norm": 28.0, "learning_rate": 3.3026464698104e-07, "loss": 1.0594290161132813, "mean_token_accuracy": 0.7294441854953766, "num_tokens": 54246182.0, "step": 113750 }, { "entropy": 0.97442617893219, "epoch": 2.7674424260110406, "grad_norm": 30.125, "learning_rate": 3.2685712197567776e-07, "loss": 0.97386962890625, "mean_token_accuracy": 0.7507176005840301, "num_tokens": 54268414.0, "step": 113800 }, { "entropy": 1.01003555059433, "epoch": 2.768658349748304, "grad_norm": 10.0625, "learning_rate": 3.234669748400121e-07, "loss": 1.056983413696289, "mean_token_accuracy": 0.7247345995903015, "num_tokens": 54291948.0, "step": 113850 }, { "entropy": 1.036020274758339, "epoch": 2.7698742734855673, "grad_norm": 17.25, "learning_rate": 3.200942116644634e-07, "loss": 1.0078451538085937, "mean_token_accuracy": 0.7382279968261719, "num_tokens": 54313532.0, "step": 113900 }, { "entropy": 1.0041409015655518, "epoch": 2.77109019722283, "grad_norm": 15.3125, "learning_rate": 3.1673883850822817e-07, "loss": 1.014461135864258, "mean_token_accuracy": 0.7370877635478973, "num_tokens": 54340762.0, "step": 113950 }, { "entropy": 1.0416928595304489, "epoch": 2.7723061209600934, "grad_norm": 14.5625, "learning_rate": 3.1340086139925675e-07, "loss": 1.0393769836425781, "mean_token_accuracy": 0.7390854489803315, "num_tokens": 54362965.0, "step": 114000 }, { "entropy": 1.0035091280937194, "epoch": 2.7735220446973567, "grad_norm": 15.0, "learning_rate": 3.100802863342489e-07, "loss": 0.9903877258300782, "mean_token_accuracy": 0.7505400705337525, "num_tokens": 54384858.0, "step": 114050 }, { "entropy": 0.9846669995784759, "epoch": 2.7747379684346196, "grad_norm": 16.75, "learning_rate": 3.067771192786406e-07, "loss": 0.9863367462158203, "mean_token_accuracy": 0.7464083087444305, "num_tokens": 54406817.0, "step": 114100 }, { "entropy": 0.8845524591207504, "epoch": 2.775953892171883, "grad_norm": 19.5, "learning_rate": 3.0349136616659503e-07, "loss": 0.876064453125, "mean_token_accuracy": 0.7667195284366608, "num_tokens": 54429363.0, "step": 114150 }, { "entropy": 0.9581177693605423, "epoch": 2.7771698159091462, "grad_norm": 13.875, "learning_rate": 3.002230329009892e-07, "loss": 0.948549575805664, "mean_token_accuracy": 0.7525829946994782, "num_tokens": 54453407.0, "step": 114200 }, { "entropy": 0.9838417088985443, "epoch": 2.778385739646409, "grad_norm": 11.6875, "learning_rate": 2.969721253534119e-07, "loss": 1.0273055267333984, "mean_token_accuracy": 0.7309948742389679, "num_tokens": 54477170.0, "step": 114250 }, { "entropy": 0.9299306105077266, "epoch": 2.7796016633836724, "grad_norm": 32.75, "learning_rate": 2.9373864936413587e-07, "loss": 0.9365229797363281, "mean_token_accuracy": 0.7586916202306747, "num_tokens": 54501123.0, "step": 114300 }, { "entropy": 0.9841593086719513, "epoch": 2.7808175871209357, "grad_norm": 12.125, "learning_rate": 2.9052261074212773e-07, "loss": 1.0127647399902344, "mean_token_accuracy": 0.7294403648376465, "num_tokens": 54524067.0, "step": 114350 }, { "entropy": 1.0063468611240387, "epoch": 2.782033510858199, "grad_norm": 11.9375, "learning_rate": 2.8732401526502254e-07, "loss": 1.0180883026123047, "mean_token_accuracy": 0.7358232003450393, "num_tokens": 54549276.0, "step": 114400 }, { "entropy": 0.9451399993896484, "epoch": 2.7832494345954624, "grad_norm": 20.875, "learning_rate": 2.841428686791181e-07, "loss": 0.9515379333496093, "mean_token_accuracy": 0.7553419530391693, "num_tokens": 54572030.0, "step": 114450 }, { "entropy": 0.9439499664306641, "epoch": 2.7844653583327252, "grad_norm": 18.875, "learning_rate": 2.809791766993686e-07, "loss": 0.9427397155761719, "mean_token_accuracy": 0.7534293687343597, "num_tokens": 54594543.0, "step": 114500 }, { "entropy": 0.9688922381401062, "epoch": 2.7856812820699886, "grad_norm": 33.25, "learning_rate": 2.7783294500936864e-07, "loss": 0.9794910430908204, "mean_token_accuracy": 0.7433765888214111, "num_tokens": 54616972.0, "step": 114550 }, { "entropy": 1.0313680672645569, "epoch": 2.786897205807252, "grad_norm": 10.5625, "learning_rate": 2.7470417926134583e-07, "loss": 1.0414948272705078, "mean_token_accuracy": 0.7348430621623993, "num_tokens": 54641513.0, "step": 114600 }, { "entropy": 0.9312786948680878, "epoch": 2.7881131295445147, "grad_norm": 16.0, "learning_rate": 2.715928850761496e-07, "loss": 0.9364778137207032, "mean_token_accuracy": 0.7526705038547515, "num_tokens": 54664924.0, "step": 114650 }, { "entropy": 0.9691234213113785, "epoch": 2.789329053281778, "grad_norm": 13.0, "learning_rate": 2.684990680432409e-07, "loss": 1.007237777709961, "mean_token_accuracy": 0.7431471991539002, "num_tokens": 54691087.0, "step": 114700 }, { "entropy": 0.9650176560878754, "epoch": 2.7905449770190414, "grad_norm": 21.75, "learning_rate": 2.6542273372068715e-07, "loss": 0.9707457733154297, "mean_token_accuracy": 0.7456982564926148, "num_tokens": 54712192.0, "step": 114750 }, { "entropy": 0.9654578238725662, "epoch": 2.7917609007563047, "grad_norm": 14.6875, "learning_rate": 2.6236388763514087e-07, "loss": 0.9675308227539062, "mean_token_accuracy": 0.7467458415031433, "num_tokens": 54735247.0, "step": 114800 }, { "entropy": 1.066213818192482, "epoch": 2.792976824493568, "grad_norm": 8.3125, "learning_rate": 2.593225352818429e-07, "loss": 1.080449676513672, "mean_token_accuracy": 0.721096693277359, "num_tokens": 54762528.0, "step": 114850 }, { "entropy": 0.9526462876796722, "epoch": 2.794192748230831, "grad_norm": 11.6875, "learning_rate": 2.56298682124605e-07, "loss": 0.9531565093994141, "mean_token_accuracy": 0.750742005109787, "num_tokens": 54785216.0, "step": 114900 }, { "entropy": 0.9799685406684876, "epoch": 2.795408671968094, "grad_norm": 23.875, "learning_rate": 2.532923335958004e-07, "loss": 0.9679408264160156, "mean_token_accuracy": 0.7434845471382141, "num_tokens": 54806367.0, "step": 114950 }, { "entropy": 1.0976629608869553, "epoch": 2.7966245957053575, "grad_norm": 30.625, "learning_rate": 2.5030349509635453e-07, "loss": 1.111673812866211, "mean_token_accuracy": 0.7226938700675964, "num_tokens": 54830324.0, "step": 115000 }, { "entropy": 1.0169099515676499, "epoch": 2.7978405194426204, "grad_norm": 16.5, "learning_rate": 2.473321719957367e-07, "loss": 1.035288314819336, "mean_token_accuracy": 0.7320815598964692, "num_tokens": 54851854.0, "step": 115050 }, { "entropy": 0.9165084493160248, "epoch": 2.7990564431798837, "grad_norm": 18.5, "learning_rate": 2.4437836963195014e-07, "loss": 0.9171047973632812, "mean_token_accuracy": 0.7609453785419464, "num_tokens": 54878265.0, "step": 115100 }, { "entropy": 1.0348282504081725, "epoch": 2.800272366917147, "grad_norm": 17.375, "learning_rate": 2.4144209331152357e-07, "loss": 1.023433380126953, "mean_token_accuracy": 0.7402663123607636, "num_tokens": 54901308.0, "step": 115150 }, { "entropy": 1.0251089215278626, "epoch": 2.8014882906544103, "grad_norm": 9.875, "learning_rate": 2.385233483094951e-07, "loss": 1.015367431640625, "mean_token_accuracy": 0.7368389451503754, "num_tokens": 54923242.0, "step": 115200 }, { "entropy": 1.0810132479667665, "epoch": 2.8027042143916736, "grad_norm": 22.25, "learning_rate": 2.3562213986941361e-07, "loss": 1.1007504272460937, "mean_token_accuracy": 0.718923379778862, "num_tokens": 54945828.0, "step": 115250 }, { "entropy": 0.9920709919929505, "epoch": 2.8039201381289365, "grad_norm": 15.9375, "learning_rate": 2.3273847320331999e-07, "loss": 1.0128399658203124, "mean_token_accuracy": 0.7369376492500305, "num_tokens": 54967300.0, "step": 115300 }, { "entropy": 1.0640474355220795, "epoch": 2.8051360618662, "grad_norm": 13.0, "learning_rate": 2.298723534917391e-07, "loss": 1.0780924987792968, "mean_token_accuracy": 0.7184087836742401, "num_tokens": 54992492.0, "step": 115350 }, { "entropy": 1.0688082653284072, "epoch": 2.806351985603463, "grad_norm": 28.375, "learning_rate": 2.270237858836799e-07, "loss": 1.099534683227539, "mean_token_accuracy": 0.7114997768402099, "num_tokens": 55021189.0, "step": 115400 }, { "entropy": 1.033969420194626, "epoch": 2.807567909340726, "grad_norm": 6.90625, "learning_rate": 2.2419277549661e-07, "loss": 1.0492178344726562, "mean_token_accuracy": 0.7252196395397186, "num_tokens": 55042443.0, "step": 115450 }, { "entropy": 1.002737054824829, "epoch": 2.8087838330779893, "grad_norm": 21.75, "learning_rate": 2.2137932741646218e-07, "loss": 0.9790817260742187, "mean_token_accuracy": 0.743315514922142, "num_tokens": 55064291.0, "step": 115500 }, { "entropy": 0.8767700254917145, "epoch": 2.8099997568152526, "grad_norm": 19.75, "learning_rate": 2.1858344669761556e-07, "loss": 0.8752703857421875, "mean_token_accuracy": 0.7697005712985993, "num_tokens": 55086065.0, "step": 115550 }, { "entropy": 0.967153971195221, "epoch": 2.8112156805525155, "grad_norm": 13.1875, "learning_rate": 2.1580513836288897e-07, "loss": 0.9693123626708985, "mean_token_accuracy": 0.7422484219074249, "num_tokens": 55111491.0, "step": 115600 }, { "entropy": 0.9281044372916222, "epoch": 2.812431604289779, "grad_norm": 48.0, "learning_rate": 2.1304440740353426e-07, "loss": 0.9336537933349609, "mean_token_accuracy": 0.7563761067390442, "num_tokens": 55135444.0, "step": 115650 }, { "entropy": 0.9863737249374389, "epoch": 2.813647528027042, "grad_norm": 20.875, "learning_rate": 2.103012587792219e-07, "loss": 0.9921760559082031, "mean_token_accuracy": 0.7361530655622482, "num_tokens": 55160091.0, "step": 115700 }, { "entropy": 0.9885347199440002, "epoch": 2.8148634517643054, "grad_norm": 24.625, "learning_rate": 2.0757569741803873e-07, "loss": 0.9655556488037109, "mean_token_accuracy": 0.741897314786911, "num_tokens": 55181983.0, "step": 115750 }, { "entropy": 1.0128659695386886, "epoch": 2.8160793755015687, "grad_norm": 25.25, "learning_rate": 2.0486772821647682e-07, "loss": 1.0497339630126954, "mean_token_accuracy": 0.7245005893707276, "num_tokens": 55205133.0, "step": 115800 }, { "entropy": 0.9579020154476166, "epoch": 2.8172952992388316, "grad_norm": 9.25, "learning_rate": 2.0217735603942024e-07, "loss": 0.9661583709716797, "mean_token_accuracy": 0.7472428262233735, "num_tokens": 55228205.0, "step": 115850 }, { "entropy": 1.0271486973762511, "epoch": 2.818511222976095, "grad_norm": 27.25, "learning_rate": 1.9950458572014163e-07, "loss": 1.0390121459960937, "mean_token_accuracy": 0.735849940776825, "num_tokens": 55255180.0, "step": 115900 }, { "entropy": 1.0333256947994232, "epoch": 2.8197271467133582, "grad_norm": 17.5, "learning_rate": 1.9684942206029235e-07, "loss": 1.0413694763183594, "mean_token_accuracy": 0.7340843164920807, "num_tokens": 55278516.0, "step": 115950 }, { "entropy": 1.0044001317024231, "epoch": 2.820943070450621, "grad_norm": 13.6875, "learning_rate": 1.9421186982989116e-07, "loss": 1.0025067138671875, "mean_token_accuracy": 0.7384379708766937, "num_tokens": 55302635.0, "step": 116000 }, { "entropy": 1.0306926387548447, "epoch": 2.8221589941878844, "grad_norm": 13.5625, "learning_rate": 1.9159193376732332e-07, "loss": 1.0273973846435547, "mean_token_accuracy": 0.7343603652715683, "num_tokens": 55328155.0, "step": 116050 }, { "entropy": 0.9696931871771812, "epoch": 2.8233749179251477, "grad_norm": 32.5, "learning_rate": 1.889896185793172e-07, "loss": 0.9419130706787109, "mean_token_accuracy": 0.7467413079738617, "num_tokens": 55351437.0, "step": 116100 }, { "entropy": 0.8419248622655868, "epoch": 2.824590841662411, "grad_norm": 10.125, "learning_rate": 1.864049289409553e-07, "loss": 0.8624279022216796, "mean_token_accuracy": 0.768602499961853, "num_tokens": 55375665.0, "step": 116150 }, { "entropy": 0.9857154476642609, "epoch": 2.8258067653996743, "grad_norm": 14.5, "learning_rate": 1.8383786949564886e-07, "loss": 1.012446746826172, "mean_token_accuracy": 0.7387987017631531, "num_tokens": 55394502.0, "step": 116200 }, { "entropy": 1.0042635202407837, "epoch": 2.827022689136937, "grad_norm": 18.25, "learning_rate": 1.8128844485513775e-07, "loss": 1.0205460357666016, "mean_token_accuracy": 0.7307760298252106, "num_tokens": 55414324.0, "step": 116250 }, { "entropy": 0.9464024159312249, "epoch": 2.8282386128742005, "grad_norm": 18.25, "learning_rate": 1.7875665959948164e-07, "loss": 0.954687728881836, "mean_token_accuracy": 0.7470253098011017, "num_tokens": 55438227.0, "step": 116300 }, { "entropy": 1.1218437486886979, "epoch": 2.829454536611464, "grad_norm": 11.1875, "learning_rate": 1.7624251827705108e-07, "loss": 1.1296800231933595, "mean_token_accuracy": 0.7081575989723206, "num_tokens": 55460793.0, "step": 116350 }, { "entropy": 0.9877503144741059, "epoch": 2.8306704603487267, "grad_norm": 12.8125, "learning_rate": 1.7374602540451756e-07, "loss": 0.9885687255859374, "mean_token_accuracy": 0.7480585205554963, "num_tokens": 55485447.0, "step": 116400 }, { "entropy": 0.9918819868564606, "epoch": 2.83188638408599, "grad_norm": 15.625, "learning_rate": 1.712671854668513e-07, "loss": 0.9589106750488281, "mean_token_accuracy": 0.7453446519374848, "num_tokens": 55505478.0, "step": 116450 }, { "entropy": 0.9782289439439773, "epoch": 2.8331023078232533, "grad_norm": 6.9375, "learning_rate": 1.6880600291730332e-07, "loss": 0.9999700164794922, "mean_token_accuracy": 0.741182746887207, "num_tokens": 55531311.0, "step": 116500 }, { "entropy": 0.9844257897138595, "epoch": 2.8343182315605167, "grad_norm": 13.4375, "learning_rate": 1.663624821774057e-07, "loss": 1.0076411437988282, "mean_token_accuracy": 0.7337619733810424, "num_tokens": 55553991.0, "step": 116550 }, { "entropy": 1.0085419464111327, "epoch": 2.83553415529778, "grad_norm": 19.875, "learning_rate": 1.6393662763696027e-07, "loss": 1.0262266540527343, "mean_token_accuracy": 0.7317882061004639, "num_tokens": 55578500.0, "step": 116600 }, { "entropy": 1.0188874971866608, "epoch": 2.836750079035043, "grad_norm": 11.75, "learning_rate": 1.615284436540332e-07, "loss": 1.0216382598876954, "mean_token_accuracy": 0.731557832956314, "num_tokens": 55602304.0, "step": 116650 }, { "entropy": 0.908112878203392, "epoch": 2.837966002772306, "grad_norm": 21.375, "learning_rate": 1.5913793455494375e-07, "loss": 0.9404705810546875, "mean_token_accuracy": 0.7535702395439148, "num_tokens": 55624119.0, "step": 116700 }, { "entropy": 0.9889141458272934, "epoch": 2.8391819265095695, "grad_norm": 29.25, "learning_rate": 1.5676510463425776e-07, "loss": 0.9908080291748047, "mean_token_accuracy": 0.7388783782720566, "num_tokens": 55646214.0, "step": 116750 }, { "entropy": 0.9019498765468598, "epoch": 2.8403978502468323, "grad_norm": 13.875, "learning_rate": 1.5440995815478198e-07, "loss": 0.9063055419921875, "mean_token_accuracy": 0.7559668684005737, "num_tokens": 55670791.0, "step": 116800 }, { "entropy": 1.0421849715709686, "epoch": 2.8416137739840956, "grad_norm": 22.0, "learning_rate": 1.5207249934755307e-07, "loss": 1.0607028198242188, "mean_token_accuracy": 0.7275997066497802, "num_tokens": 55696856.0, "step": 116850 }, { "entropy": 1.0405791515111924, "epoch": 2.842829697721359, "grad_norm": 22.25, "learning_rate": 1.4975273241183196e-07, "loss": 1.0574522399902344, "mean_token_accuracy": 0.722698233127594, "num_tokens": 55723071.0, "step": 116900 }, { "entropy": 0.9574574261903763, "epoch": 2.8440456214586223, "grad_norm": 20.5, "learning_rate": 1.4745066151509723e-07, "loss": 0.9717871856689453, "mean_token_accuracy": 0.7544066286087037, "num_tokens": 55744495.0, "step": 116950 }, { "entropy": 0.9698511347174644, "epoch": 2.8452615451958856, "grad_norm": 25.0, "learning_rate": 1.4516629079303514e-07, "loss": 0.9786369323730468, "mean_token_accuracy": 0.7501525992155075, "num_tokens": 55765840.0, "step": 117000 }, { "entropy": 0.9864330279827118, "epoch": 2.8464774689331485, "grad_norm": 13.75, "learning_rate": 1.4289962434953508e-07, "loss": 0.9828387451171875, "mean_token_accuracy": 0.7459978330135345, "num_tokens": 55789255.0, "step": 117050 }, { "entropy": 1.057951031923294, "epoch": 2.8476933926704118, "grad_norm": 24.375, "learning_rate": 1.4065066625667757e-07, "loss": 1.0702967071533203, "mean_token_accuracy": 0.7207434225082398, "num_tokens": 55811233.0, "step": 117100 }, { "entropy": 1.0142734676599503, "epoch": 2.848909316407675, "grad_norm": 16.125, "learning_rate": 1.3841942055473178e-07, "loss": 1.0226026153564454, "mean_token_accuracy": 0.7345160961151123, "num_tokens": 55833863.0, "step": 117150 }, { "entropy": 1.0613247418403626, "epoch": 2.850125240144938, "grad_norm": 16.875, "learning_rate": 1.3620589125214688e-07, "loss": 1.0900078582763673, "mean_token_accuracy": 0.7197101283073425, "num_tokens": 55857294.0, "step": 117200 }, { "entropy": 0.9247582519054413, "epoch": 2.8513411638822013, "grad_norm": 5.8125, "learning_rate": 1.3401008232554525e-07, "loss": 0.9328715515136718, "mean_token_accuracy": 0.7521842777729034, "num_tokens": 55882171.0, "step": 117250 }, { "entropy": 1.0792150163650513, "epoch": 2.8525570876194646, "grad_norm": 29.25, "learning_rate": 1.3183199771971024e-07, "loss": 1.0918241882324218, "mean_token_accuracy": 0.7170130240917206, "num_tokens": 55906318.0, "step": 117300 }, { "entropy": 0.9803829169273377, "epoch": 2.8537730113567275, "grad_norm": 25.375, "learning_rate": 1.2967164134758957e-07, "loss": 0.986800537109375, "mean_token_accuracy": 0.7458522140979766, "num_tokens": 55928268.0, "step": 117350 }, { "entropy": 0.8757830369472503, "epoch": 2.8549889350939908, "grad_norm": 19.75, "learning_rate": 1.2752901709027766e-07, "loss": 0.8838016510009765, "mean_token_accuracy": 0.7645984828472138, "num_tokens": 55948321.0, "step": 117400 }, { "entropy": 1.0254687356948853, "epoch": 2.856204858831254, "grad_norm": 30.0, "learning_rate": 1.2540412879701424e-07, "loss": 1.0280177307128906, "mean_token_accuracy": 0.7292625153064728, "num_tokens": 55970046.0, "step": 117450 }, { "entropy": 1.0146738082170486, "epoch": 2.8574207825685174, "grad_norm": 9.375, "learning_rate": 1.2329698028517355e-07, "loss": 1.020592041015625, "mean_token_accuracy": 0.7354930186271668, "num_tokens": 55997270.0, "step": 117500 }, { "entropy": 0.9374657425284386, "epoch": 2.8586367063057807, "grad_norm": 11.4375, "learning_rate": 1.212075753402664e-07, "loss": 0.9450077056884766, "mean_token_accuracy": 0.7544078385829925, "num_tokens": 56026636.0, "step": 117550 }, { "entropy": 1.0088494783639907, "epoch": 2.8598526300430436, "grad_norm": 16.75, "learning_rate": 1.1913591771592125e-07, "loss": 1.0260050201416016, "mean_token_accuracy": 0.7368209528923034, "num_tokens": 56051895.0, "step": 117600 }, { "entropy": 0.9704135698080063, "epoch": 2.861068553780307, "grad_norm": 10.375, "learning_rate": 1.1708201113388773e-07, "loss": 0.999386215209961, "mean_token_accuracy": 0.7402707254886627, "num_tokens": 56077557.0, "step": 117650 }, { "entropy": 1.0503000676631928, "epoch": 2.86228447751757, "grad_norm": 14.25, "learning_rate": 1.1504585928402201e-07, "loss": 1.0518902587890624, "mean_token_accuracy": 0.7317397058010101, "num_tokens": 56101312.0, "step": 117700 }, { "entropy": 0.9386983144283295, "epoch": 2.863500401254833, "grad_norm": 15.8125, "learning_rate": 1.1302746582428581e-07, "loss": 0.9566117858886719, "mean_token_accuracy": 0.7469509011507034, "num_tokens": 56124640.0, "step": 117750 }, { "entropy": 0.8787132906913757, "epoch": 2.8647163249920964, "grad_norm": 23.125, "learning_rate": 1.1102683438073636e-07, "loss": 0.8939923858642578, "mean_token_accuracy": 0.7564888143539429, "num_tokens": 56146021.0, "step": 117800 }, { "entropy": 0.9915650147199631, "epoch": 2.8659322487293597, "grad_norm": 17.25, "learning_rate": 1.090439685475242e-07, "loss": 1.0216986846923828, "mean_token_accuracy": 0.73433613717556, "num_tokens": 56170429.0, "step": 117850 }, { "entropy": 1.0417633539438247, "epoch": 2.867148172466623, "grad_norm": 16.0, "learning_rate": 1.0707887188688093e-07, "loss": 1.0493611907958984, "mean_token_accuracy": 0.7325889730453491, "num_tokens": 56190997.0, "step": 117900 }, { "entropy": 1.0078706991672517, "epoch": 2.8683640962038863, "grad_norm": 12.625, "learning_rate": 1.0513154792911484e-07, "loss": 1.024631576538086, "mean_token_accuracy": 0.737855875492096, "num_tokens": 56216548.0, "step": 117950 }, { "entropy": 0.9679790663719178, "epoch": 2.869580019941149, "grad_norm": 19.75, "learning_rate": 1.0320200017260972e-07, "loss": 0.9682194519042969, "mean_token_accuracy": 0.7477601301670075, "num_tokens": 56238616.0, "step": 118000 }, { "entropy": 1.0602210146188735, "epoch": 2.8707959436784125, "grad_norm": 19.875, "learning_rate": 1.0129023208381162e-07, "loss": 1.0828019714355468, "mean_token_accuracy": 0.7315092658996583, "num_tokens": 56261888.0, "step": 118050 }, { "entropy": 0.9940321069955825, "epoch": 2.872011867415676, "grad_norm": 20.875, "learning_rate": 9.939624709722429e-08, "loss": 1.0028153228759766, "mean_token_accuracy": 0.7412607312202454, "num_tokens": 56284541.0, "step": 118100 }, { "entropy": 1.0048188644647598, "epoch": 2.8732277911529387, "grad_norm": 14.9375, "learning_rate": 9.752004861540487e-08, "loss": 0.9801268005371093, "mean_token_accuracy": 0.7425142002105712, "num_tokens": 56312269.0, "step": 118150 }, { "entropy": 0.9120019495487213, "epoch": 2.874443714890202, "grad_norm": 19.625, "learning_rate": 9.566164000895828e-08, "loss": 0.9045909118652343, "mean_token_accuracy": 0.7612448567152024, "num_tokens": 56333007.0, "step": 118200 }, { "entropy": 0.9699261260032653, "epoch": 2.8756596386274653, "grad_norm": 19.25, "learning_rate": 9.382102461652831e-08, "loss": 0.9744369506835937, "mean_token_accuracy": 0.7382773327827453, "num_tokens": 56358558.0, "step": 118250 }, { "entropy": 0.953444305062294, "epoch": 2.8768755623647286, "grad_norm": 19.125, "learning_rate": 9.199820574479323e-08, "loss": 0.9516196441650391, "mean_token_accuracy": 0.7458076751232148, "num_tokens": 56379524.0, "step": 118300 }, { "entropy": 1.044488663673401, "epoch": 2.878091486101992, "grad_norm": 23.25, "learning_rate": 9.019318666846133e-08, "loss": 1.0700759887695312, "mean_token_accuracy": 0.7370458227396012, "num_tokens": 56400061.0, "step": 118350 }, { "entropy": 1.0137146085500717, "epoch": 2.879307409839255, "grad_norm": 12.75, "learning_rate": 8.840597063026091e-08, "loss": 0.9809807586669922, "mean_token_accuracy": 0.7380691337585449, "num_tokens": 56423849.0, "step": 118400 }, { "entropy": 1.0051956778764726, "epoch": 2.880523333576518, "grad_norm": 24.0, "learning_rate": 8.663656084093808e-08, "loss": 1.0503324127197267, "mean_token_accuracy": 0.7298349285125733, "num_tokens": 56449349.0, "step": 118450 }, { "entropy": 1.0879907977581025, "epoch": 2.8817392573137814, "grad_norm": 19.5, "learning_rate": 8.488496047925121e-08, "loss": 1.102609634399414, "mean_token_accuracy": 0.7307731413841247, "num_tokens": 56475082.0, "step": 118500 }, { "entropy": 1.025365772843361, "epoch": 2.8829551810510443, "grad_norm": 19.25, "learning_rate": 8.315117269196094e-08, "loss": 1.0416781616210937, "mean_token_accuracy": 0.7339113998413086, "num_tokens": 56501768.0, "step": 118550 }, { "entropy": 1.0019063425064088, "epoch": 2.8841711047883076, "grad_norm": 23.75, "learning_rate": 8.143520059383236e-08, "loss": 1.0232559204101563, "mean_token_accuracy": 0.733264011144638, "num_tokens": 56525892.0, "step": 118600 }, { "entropy": 0.9745906084775925, "epoch": 2.885387028525571, "grad_norm": 18.25, "learning_rate": 7.973704726761844e-08, "loss": 0.9571871948242188, "mean_token_accuracy": 0.7459091228246689, "num_tokens": 56551659.0, "step": 118650 }, { "entropy": 1.0950467067956924, "epoch": 2.886602952262834, "grad_norm": 12.5625, "learning_rate": 7.805671576406548e-08, "loss": 1.1420883178710937, "mean_token_accuracy": 0.7172491478919983, "num_tokens": 56574839.0, "step": 118700 }, { "entropy": 0.9174246197938919, "epoch": 2.887818876000097, "grad_norm": 15.875, "learning_rate": 7.639420910189987e-08, "loss": 0.9351757049560547, "mean_token_accuracy": 0.7622818768024444, "num_tokens": 56598321.0, "step": 118750 }, { "entropy": 1.0437779581546784, "epoch": 2.8890347997373604, "grad_norm": 8.75, "learning_rate": 7.474953026782694e-08, "loss": 1.04421875, "mean_token_accuracy": 0.7313807559013367, "num_tokens": 56621880.0, "step": 118800 }, { "entropy": 0.9603959196805953, "epoch": 2.8902507234746238, "grad_norm": 11.375, "learning_rate": 7.31226822165243e-08, "loss": 0.9408973693847656, "mean_token_accuracy": 0.7494485449790954, "num_tokens": 56644190.0, "step": 118850 }, { "entropy": 0.9691303718090057, "epoch": 2.891466647211887, "grad_norm": 17.0, "learning_rate": 7.151366787063519e-08, "loss": 0.9805790710449219, "mean_token_accuracy": 0.7454406476020813, "num_tokens": 56667018.0, "step": 118900 }, { "entropy": 0.900578167438507, "epoch": 2.89268257094915, "grad_norm": 15.375, "learning_rate": 6.992249012076624e-08, "loss": 0.8915953826904297, "mean_token_accuracy": 0.757689608335495, "num_tokens": 56693930.0, "step": 118950 }, { "entropy": 0.8658087307214737, "epoch": 2.8938984946864132, "grad_norm": 12.375, "learning_rate": 6.834915182547975e-08, "loss": 0.855547866821289, "mean_token_accuracy": 0.7730180537700653, "num_tokens": 56720235.0, "step": 119000 }, { "entropy": 0.8997723990678788, "epoch": 2.8951144184236766, "grad_norm": 14.625, "learning_rate": 6.679365581128694e-08, "loss": 0.8874333190917969, "mean_token_accuracy": 0.7624965167045593, "num_tokens": 56741825.0, "step": 119050 }, { "entropy": 0.9230927604436875, "epoch": 2.8963303421609394, "grad_norm": 21.0, "learning_rate": 6.525600487264916e-08, "loss": 0.9193753814697265, "mean_token_accuracy": 0.7530867600440979, "num_tokens": 56764606.0, "step": 119100 }, { "entropy": 1.036542655825615, "epoch": 2.8975462658982027, "grad_norm": 11.375, "learning_rate": 6.373620177196783e-08, "loss": 1.0438595581054688, "mean_token_accuracy": 0.7329340314865113, "num_tokens": 56789490.0, "step": 119150 }, { "entropy": 0.9488338923454285, "epoch": 2.898762189635466, "grad_norm": 19.25, "learning_rate": 6.223424923957777e-08, "loss": 0.9578321838378906, "mean_token_accuracy": 0.7561791670322419, "num_tokens": 56812159.0, "step": 119200 }, { "entropy": 1.0088286125659942, "epoch": 2.8999781133727294, "grad_norm": 18.375, "learning_rate": 6.075014997375062e-08, "loss": 1.001135025024414, "mean_token_accuracy": 0.7388080370426178, "num_tokens": 56833482.0, "step": 119250 }, { "entropy": 0.9954344010353089, "epoch": 2.9011940371099927, "grad_norm": 15.1875, "learning_rate": 5.9283906640678066e-08, "loss": 1.0148294830322266, "mean_token_accuracy": 0.7422082185745239, "num_tokens": 56853493.0, "step": 119300 }, { "entropy": 0.9018650621175766, "epoch": 2.9024099608472556, "grad_norm": 13.1875, "learning_rate": 5.7835521874477495e-08, "loss": 0.8932535552978516, "mean_token_accuracy": 0.7672075271606446, "num_tokens": 56874351.0, "step": 119350 }, { "entropy": 0.9207712966203689, "epoch": 2.903625884584519, "grad_norm": 25.25, "learning_rate": 5.6404998277184154e-08, "loss": 0.9132821655273438, "mean_token_accuracy": 0.7562958681583405, "num_tokens": 56897401.0, "step": 119400 }, { "entropy": 0.9404785922169685, "epoch": 2.904841808321782, "grad_norm": 13.9375, "learning_rate": 5.499233841874119e-08, "loss": 0.9517763519287109, "mean_token_accuracy": 0.7553953981399536, "num_tokens": 56921371.0, "step": 119450 }, { "entropy": 0.9330316251516342, "epoch": 2.906057732059045, "grad_norm": 20.625, "learning_rate": 5.359754483700297e-08, "loss": 0.9323036193847656, "mean_token_accuracy": 0.7490214812755585, "num_tokens": 56942674.0, "step": 119500 }, { "entropy": 1.0032779079675676, "epoch": 2.9072736557963084, "grad_norm": 17.0, "learning_rate": 5.2220620037727323e-08, "loss": 0.9993828582763672, "mean_token_accuracy": 0.7371323931217194, "num_tokens": 56968410.0, "step": 119550 }, { "entropy": 0.9721470856666565, "epoch": 2.9084895795335717, "grad_norm": 39.25, "learning_rate": 5.086156649456775e-08, "loss": 0.9666569519042969, "mean_token_accuracy": 0.745840517282486, "num_tokens": 56994480.0, "step": 119600 }, { "entropy": 0.9566621673107147, "epoch": 2.909705503270835, "grad_norm": 13.125, "learning_rate": 4.9520386649075656e-08, "loss": 0.9677532958984375, "mean_token_accuracy": 0.7415012180805206, "num_tokens": 57019793.0, "step": 119650 }, { "entropy": 0.9261517763137818, "epoch": 2.9109214270080983, "grad_norm": 20.25, "learning_rate": 4.8197082910687034e-08, "loss": 0.9187328338623046, "mean_token_accuracy": 0.7536276745796203, "num_tokens": 57039425.0, "step": 119700 }, { "entropy": 0.9165789169073105, "epoch": 2.912137350745361, "grad_norm": 8.4375, "learning_rate": 4.6891657656727984e-08, "loss": 0.9002429962158203, "mean_token_accuracy": 0.7603135347366333, "num_tokens": 57061136.0, "step": 119750 }, { "entropy": 1.0506758570671082, "epoch": 2.9133532744826245, "grad_norm": 26.875, "learning_rate": 4.560411323240588e-08, "loss": 1.030487289428711, "mean_token_accuracy": 0.7314448964595794, "num_tokens": 57082778.0, "step": 119800 }, { "entropy": 0.9033359128236771, "epoch": 2.914569198219888, "grad_norm": 5.75, "learning_rate": 4.433445195080155e-08, "loss": 0.8944524383544922, "mean_token_accuracy": 0.7747660791873932, "num_tokens": 57108717.0, "step": 119850 }, { "entropy": 0.9314423102140427, "epoch": 2.9157851219571507, "grad_norm": 24.0, "learning_rate": 4.3082676092870425e-08, "loss": 0.9647809600830078, "mean_token_accuracy": 0.7487574303150177, "num_tokens": 57137131.0, "step": 119900 }, { "entropy": 1.0647255992889404, "epoch": 2.917001045694414, "grad_norm": 22.0, "learning_rate": 4.184878790743696e-08, "loss": 1.089533233642578, "mean_token_accuracy": 0.7279811477661133, "num_tokens": 57161781.0, "step": 119950 }, { "entropy": 1.0101378697156906, "epoch": 2.9182169694316773, "grad_norm": 20.125, "learning_rate": 4.06327896111891e-08, "loss": 1.0226168823242188, "mean_token_accuracy": 0.7368338298797608, "num_tokens": 57185694.0, "step": 120000 }, { "epoch": 2.9182169694316773, "eval_entropy": 1.0785979679701077, "eval_loss": 1.3080025911331177, "eval_mean_token_accuracy": 0.6791624420579921, "eval_num_tokens": 57185694.0, "eval_runtime": 392.1524, "eval_samples_per_second": 11.651, "eval_steps_per_second": 11.651, "step": 120000 }, { "entropy": 1.0462483441829682, "epoch": 2.91943289316894, "grad_norm": 14.25, "learning_rate": 3.9434683388679395e-08, "loss": 1.069031753540039, "mean_token_accuracy": 0.7224847114086151, "num_tokens": 57214154.0, "step": 120050 }, { "entropy": 0.9977847248315811, "epoch": 2.9206488169062035, "grad_norm": 35.5, "learning_rate": 3.825447139231053e-08, "loss": 0.9917311096191406, "mean_token_accuracy": 0.7384879338741303, "num_tokens": 57239874.0, "step": 120100 }, { "entropy": 0.9081371426582336, "epoch": 2.921864740643467, "grad_norm": 12.6875, "learning_rate": 3.709215574234426e-08, "loss": 0.9336949920654297, "mean_token_accuracy": 0.7547085189819336, "num_tokens": 57264363.0, "step": 120150 }, { "entropy": 1.0089478927850724, "epoch": 2.92308066438073, "grad_norm": 26.375, "learning_rate": 3.5947738526888045e-08, "loss": 1.0215868377685546, "mean_token_accuracy": 0.7396757465600967, "num_tokens": 57288477.0, "step": 120200 }, { "entropy": 0.9714799982309341, "epoch": 2.9242965881179934, "grad_norm": 16.0, "learning_rate": 3.4821221801896175e-08, "loss": 0.9764431762695313, "mean_token_accuracy": 0.7447112834453583, "num_tokens": 57310869.0, "step": 120250 }, { "entropy": 1.0125698661804199, "epoch": 2.9255125118552563, "grad_norm": 12.5, "learning_rate": 3.371260759116202e-08, "loss": 1.0215953063964844, "mean_token_accuracy": 0.7371567142009735, "num_tokens": 57332234.0, "step": 120300 }, { "entropy": 1.0673026978969573, "epoch": 2.9267284355925196, "grad_norm": 18.5, "learning_rate": 3.26218978863202e-08, "loss": 1.0730329132080079, "mean_token_accuracy": 0.7223427057266235, "num_tokens": 57357279.0, "step": 120350 }, { "entropy": 1.0649119484424592, "epoch": 2.927944359329783, "grad_norm": 18.375, "learning_rate": 3.1549094646838864e-08, "loss": 1.098577346801758, "mean_token_accuracy": 0.7316529011726379, "num_tokens": 57383071.0, "step": 120400 }, { "entropy": 1.0660306811332703, "epoch": 2.929160283067046, "grad_norm": 13.25, "learning_rate": 3.0494199800016334e-08, "loss": 1.0675081634521484, "mean_token_accuracy": 0.7259544384479523, "num_tokens": 57410093.0, "step": 120450 }, { "entropy": 1.0009278243780135, "epoch": 2.930376206804309, "grad_norm": 11.4375, "learning_rate": 2.9457215240977776e-08, "loss": 0.9963227844238282, "mean_token_accuracy": 0.7418833601474762, "num_tokens": 57433113.0, "step": 120500 }, { "entropy": 0.8796268039941788, "epoch": 2.9315921305415724, "grad_norm": 21.625, "learning_rate": 2.84381428326741e-08, "loss": 0.8620655059814453, "mean_token_accuracy": 0.775029993057251, "num_tokens": 57454795.0, "step": 120550 }, { "entropy": 0.9106228286027909, "epoch": 2.9328080542788357, "grad_norm": 22.625, "learning_rate": 2.7436984405875276e-08, "loss": 0.934001693725586, "mean_token_accuracy": 0.7464396822452545, "num_tokens": 57479620.0, "step": 120600 }, { "entropy": 1.0197695100307465, "epoch": 2.934023978016099, "grad_norm": 12.5, "learning_rate": 2.6453741759170372e-08, "loss": 1.029709014892578, "mean_token_accuracy": 0.7334027898311615, "num_tokens": 57504170.0, "step": 120650 }, { "entropy": 0.9673912239074707, "epoch": 2.935239901753362, "grad_norm": 11.9375, "learning_rate": 2.5488416658961957e-08, "loss": 0.9599706268310547, "mean_token_accuracy": 0.7460824227333069, "num_tokens": 57529200.0, "step": 120700 }, { "entropy": 1.000197114944458, "epoch": 2.9364558254906252, "grad_norm": 13.5625, "learning_rate": 2.4541010839463918e-08, "loss": 1.0166887664794921, "mean_token_accuracy": 0.7452354073524475, "num_tokens": 57553214.0, "step": 120750 }, { "entropy": 1.069293838739395, "epoch": 2.9376717492278885, "grad_norm": 17.5, "learning_rate": 2.3611526002694783e-08, "loss": 1.0823525238037108, "mean_token_accuracy": 0.7198152863979339, "num_tokens": 57575905.0, "step": 120800 }, { "entropy": 0.9206156384944916, "epoch": 2.9388876729651514, "grad_norm": 11.9375, "learning_rate": 2.269996381848327e-08, "loss": 0.9176224517822266, "mean_token_accuracy": 0.7669831717014313, "num_tokens": 57600254.0, "step": 120850 }, { "entropy": 0.9716946125030518, "epoch": 2.9401035967024147, "grad_norm": 19.625, "learning_rate": 2.1806325924457196e-08, "loss": 0.9890315246582031, "mean_token_accuracy": 0.7411002135276794, "num_tokens": 57626936.0, "step": 120900 }, { "entropy": 1.0758713269233704, "epoch": 2.941319520439678, "grad_norm": 13.875, "learning_rate": 2.0930613926042344e-08, "loss": 1.071374740600586, "mean_token_accuracy": 0.7361378604173661, "num_tokens": 57651556.0, "step": 120950 }, { "entropy": 0.9710651886463165, "epoch": 2.9425354441769414, "grad_norm": 13.9375, "learning_rate": 2.007282939646138e-08, "loss": 0.9692184448242187, "mean_token_accuracy": 0.7477237379550934, "num_tokens": 57678519.0, "step": 121000 }, { "entropy": 1.0256191033124924, "epoch": 2.9437513679142047, "grad_norm": 23.0, "learning_rate": 1.923297387673051e-08, "loss": 1.0396028900146483, "mean_token_accuracy": 0.7323069727420807, "num_tokens": 57706885.0, "step": 121050 }, { "entropy": 0.9520085960626602, "epoch": 2.9449672916514675, "grad_norm": 15.625, "learning_rate": 1.8411048875657255e-08, "loss": 0.9400405883789062, "mean_token_accuracy": 0.7517239010334015, "num_tokens": 57726140.0, "step": 121100 }, { "entropy": 0.9813761329650879, "epoch": 2.946183215388731, "grad_norm": 18.125, "learning_rate": 1.76070558698338e-08, "loss": 1.0079036712646485, "mean_token_accuracy": 0.7346193957328796, "num_tokens": 57749161.0, "step": 121150 }, { "entropy": 0.9220207893848419, "epoch": 2.947399139125994, "grad_norm": 21.0, "learning_rate": 1.6820996303641424e-08, "loss": 0.9476890563964844, "mean_token_accuracy": 0.752641669511795, "num_tokens": 57771708.0, "step": 121200 }, { "entropy": 0.9181600594520569, "epoch": 2.948615062863257, "grad_norm": 26.375, "learning_rate": 1.6052871589240515e-08, "loss": 0.9065481567382813, "mean_token_accuracy": 0.7623660123348236, "num_tokens": 57798288.0, "step": 121250 }, { "entropy": 0.9900998514890671, "epoch": 2.9498309866005203, "grad_norm": 23.625, "learning_rate": 1.530268310657168e-08, "loss": 0.9893180084228516, "mean_token_accuracy": 0.7381099534034729, "num_tokens": 57822766.0, "step": 121300 }, { "entropy": 0.9183531486988068, "epoch": 2.9510469103377837, "grad_norm": 23.0, "learning_rate": 1.4570432203355744e-08, "loss": 0.9250215148925781, "mean_token_accuracy": 0.7492922830581665, "num_tokens": 57845692.0, "step": 121350 }, { "entropy": 0.9579020667076111, "epoch": 2.952262834075047, "grad_norm": 20.5, "learning_rate": 1.3856120195084865e-08, "loss": 0.9654190826416016, "mean_token_accuracy": 0.7478619140386581, "num_tokens": 57869556.0, "step": 121400 }, { "entropy": 1.0172871351242065, "epoch": 2.9534787578123103, "grad_norm": 18.5, "learning_rate": 1.3159748365026982e-08, "loss": 1.0201974487304688, "mean_token_accuracy": 0.734766097664833, "num_tokens": 57896478.0, "step": 121450 }, { "entropy": 0.9837756675481796, "epoch": 2.954694681549573, "grad_norm": 16.875, "learning_rate": 1.2481317964218031e-08, "loss": 0.9823404693603516, "mean_token_accuracy": 0.7426323354244232, "num_tokens": 57923048.0, "step": 121500 }, { "entropy": 1.0301683431863784, "epoch": 2.9559106052868365, "grad_norm": 10.5625, "learning_rate": 1.1820830211464185e-08, "loss": 1.0356914520263671, "mean_token_accuracy": 0.735959957242012, "num_tokens": 57943967.0, "step": 121550 }, { "entropy": 0.9578010606765747, "epoch": 2.9571265290241, "grad_norm": 44.75, "learning_rate": 1.1178286293335172e-08, "loss": 0.9550495910644531, "mean_token_accuracy": 0.7514186269044876, "num_tokens": 57970279.0, "step": 121600 }, { "entropy": 0.9938694161176681, "epoch": 2.9583424527613627, "grad_norm": 15.6875, "learning_rate": 1.0553687364166509e-08, "loss": 1.008555679321289, "mean_token_accuracy": 0.7401728558540345, "num_tokens": 57990428.0, "step": 121650 }, { "entropy": 0.9114606297016143, "epoch": 2.959558376498626, "grad_norm": 13.4375, "learning_rate": 9.947034546053946e-09, "loss": 0.9364859771728515, "mean_token_accuracy": 0.7510693204402924, "num_tokens": 58013237.0, "step": 121700 }, { "entropy": 0.9660573518276214, "epoch": 2.9607743002358893, "grad_norm": 14.125, "learning_rate": 9.358328928853467e-09, "loss": 0.9644913482666015, "mean_token_accuracy": 0.7416697758436203, "num_tokens": 58038824.0, "step": 121750 }, { "entropy": 0.9335377132892608, "epoch": 2.961990223973152, "grad_norm": 10.875, "learning_rate": 8.787571570180175e-09, "loss": 0.935056381225586, "mean_token_accuracy": 0.749672977924347, "num_tokens": 58062630.0, "step": 121800 }, { "entropy": 1.043381700515747, "epoch": 2.9632061477104155, "grad_norm": 11.75, "learning_rate": 8.234763495402753e-09, "loss": 1.0624332427978516, "mean_token_accuracy": 0.7285669481754303, "num_tokens": 58087582.0, "step": 121850 }, { "entropy": 0.9223842561244965, "epoch": 2.9644220714476788, "grad_norm": 11.0625, "learning_rate": 7.699905697644561e-09, "loss": 0.9119044494628906, "mean_token_accuracy": 0.7633848106861114, "num_tokens": 58109935.0, "step": 121900 }, { "entropy": 1.0000457805395127, "epoch": 2.965637995184942, "grad_norm": 11.9375, "learning_rate": 7.182999137782531e-09, "loss": 1.0180549621582031, "mean_token_accuracy": 0.7280778646469116, "num_tokens": 58134794.0, "step": 121950 }, { "entropy": 0.9023905761539937, "epoch": 2.9668539189222054, "grad_norm": 14.0625, "learning_rate": 6.68404474444162e-09, "loss": 0.9063859558105469, "mean_token_accuracy": 0.7598000812530518, "num_tokens": 58158990.0, "step": 122000 }, { "entropy": 1.025526497364044, "epoch": 2.9680698426594683, "grad_norm": 20.75, "learning_rate": 6.203043413997023e-09, "loss": 1.0250437927246094, "mean_token_accuracy": 0.7301163399219512, "num_tokens": 58187056.0, "step": 122050 }, { "entropy": 0.9752578845620156, "epoch": 2.9692857663967316, "grad_norm": 28.0, "learning_rate": 5.739996010571958e-09, "loss": 0.9931999206542969, "mean_token_accuracy": 0.7465454399585724, "num_tokens": 58208418.0, "step": 122100 }, { "entropy": 1.013219534754753, "epoch": 2.970501690133995, "grad_norm": 15.1875, "learning_rate": 5.294903366034332e-09, "loss": 1.0178377532958984, "mean_token_accuracy": 0.7411682868003845, "num_tokens": 58230777.0, "step": 122150 }, { "entropy": 0.8883857470750809, "epoch": 2.9717176138712578, "grad_norm": 14.75, "learning_rate": 4.867766279995634e-09, "loss": 0.8831577301025391, "mean_token_accuracy": 0.7642147183418274, "num_tokens": 58252368.0, "step": 122200 }, { "entropy": 0.9307674908638001, "epoch": 2.972933537608521, "grad_norm": 10.4375, "learning_rate": 4.4585855198098216e-09, "loss": 0.9384067535400391, "mean_token_accuracy": 0.7498104751110077, "num_tokens": 58277316.0, "step": 122250 }, { "entropy": 0.9640088737010956, "epoch": 2.9741494613457844, "grad_norm": 12.5625, "learning_rate": 4.067361820575544e-09, "loss": 0.9893340301513672, "mean_token_accuracy": 0.7402093815803528, "num_tokens": 58302089.0, "step": 122300 }, { "entropy": 1.0207258665561676, "epoch": 2.9753653850830477, "grad_norm": 10.25, "learning_rate": 3.694095885127258e-09, "loss": 1.0120935821533203, "mean_token_accuracy": 0.737233636379242, "num_tokens": 58326928.0, "step": 122350 }, { "entropy": 0.9572024509310723, "epoch": 2.976581308820311, "grad_norm": 25.25, "learning_rate": 3.3387883840396706e-09, "loss": 0.9957042694091797, "mean_token_accuracy": 0.7409994661808014, "num_tokens": 58351379.0, "step": 122400 }, { "entropy": 1.004076727926731, "epoch": 2.977797232557574, "grad_norm": 19.125, "learning_rate": 3.001439955625518e-09, "loss": 1.0205298614501954, "mean_token_accuracy": 0.7374642795324325, "num_tokens": 58374875.0, "step": 122450 }, { "entropy": 0.9784541076421738, "epoch": 2.979013156294837, "grad_norm": 15.875, "learning_rate": 2.682051205934455e-09, "loss": 0.9975102996826172, "mean_token_accuracy": 0.7516021132469177, "num_tokens": 58397131.0, "step": 122500 }, { "entropy": 0.8933392900228501, "epoch": 2.9802290800321005, "grad_norm": 15.0, "learning_rate": 2.3806227087486146e-09, "loss": 0.9072338104248047, "mean_token_accuracy": 0.7687712681293487, "num_tokens": 58417561.0, "step": 122550 }, { "entropy": 1.0817178094387054, "epoch": 2.9814450037693634, "grad_norm": 27.125, "learning_rate": 2.0971550055892688e-09, "loss": 1.0925949096679688, "mean_token_accuracy": 0.7179835259914398, "num_tokens": 58446735.0, "step": 122600 }, { "entropy": 0.965233981013298, "epoch": 2.9826609275066267, "grad_norm": 13.875, "learning_rate": 1.831648605705727e-09, "loss": 0.9886736297607421, "mean_token_accuracy": 0.7336948621273041, "num_tokens": 58471097.0, "step": 122650 }, { "entropy": 0.9694788730144501, "epoch": 2.98387685124389, "grad_norm": 26.125, "learning_rate": 1.5841039860831076e-09, "loss": 0.98613525390625, "mean_token_accuracy": 0.7441735327243805, "num_tokens": 58493863.0, "step": 122700 }, { "entropy": 0.9738598155975342, "epoch": 2.9850927749811533, "grad_norm": 11.8125, "learning_rate": 1.3545215914378962e-09, "loss": 0.9637165832519531, "mean_token_accuracy": 0.7575994765758515, "num_tokens": 58513638.0, "step": 122750 }, { "entropy": 0.9532602059841156, "epoch": 2.9863086987184166, "grad_norm": 24.875, "learning_rate": 1.1429018342146159e-09, "loss": 0.9650433349609375, "mean_token_accuracy": 0.7501829016208649, "num_tokens": 58536048.0, "step": 122800 }, { "entropy": 0.9967953222990036, "epoch": 2.9875246224556795, "grad_norm": 21.125, "learning_rate": 9.492450945902676e-10, "loss": 1.021906509399414, "mean_token_accuracy": 0.7356204581260681, "num_tokens": 58560651.0, "step": 122850 }, { "entropy": 0.9762166893482208, "epoch": 2.988740546192943, "grad_norm": 28.125, "learning_rate": 7.735517204721099e-10, "loss": 0.9851982116699218, "mean_token_accuracy": 0.7449333423376083, "num_tokens": 58583785.0, "step": 122900 }, { "entropy": 0.9830562841892242, "epoch": 2.989956469930206, "grad_norm": 16.125, "learning_rate": 6.158220274921078e-10, "loss": 0.9901358032226563, "mean_token_accuracy": 0.741366310119629, "num_tokens": 58607993.0, "step": 122950 }, { "entropy": 0.9461834496259689, "epoch": 2.991172393667469, "grad_norm": 19.5, "learning_rate": 4.760562990147044e-10, "loss": 0.9547337341308594, "mean_token_accuracy": 0.7523912489414215, "num_tokens": 58633380.0, "step": 123000 }, { "entropy": 0.897589481472969, "epoch": 2.9923883174047323, "grad_norm": 16.875, "learning_rate": 3.542547861301593e-10, "loss": 0.9104531860351562, "mean_token_accuracy": 0.7646911001205444, "num_tokens": 58657714.0, "step": 123050 }, { "entropy": 1.0041503542661667, "epoch": 2.9936042411419956, "grad_norm": 12.0625, "learning_rate": 2.5041770765343867e-10, "loss": 1.0069861602783203, "mean_token_accuracy": 0.7369261991977691, "num_tokens": 58681497.0, "step": 123100 }, { "entropy": 0.9655870127677918, "epoch": 2.9948201648792585, "grad_norm": 10.5625, "learning_rate": 1.6454525013087641e-10, "loss": 0.9921199798583984, "mean_token_accuracy": 0.7393676280975342, "num_tokens": 58704157.0, "step": 123150 }, { "entropy": 0.8975879955291748, "epoch": 2.996036088616522, "grad_norm": 21.0, "learning_rate": 9.663756783129253e-11, "loss": 0.8810039520263672, "mean_token_accuracy": 0.7649057471752166, "num_tokens": 58725019.0, "step": 123200 }, { "entropy": 0.8463268876075745, "epoch": 2.997252012353785, "grad_norm": 15.75, "learning_rate": 4.669478275376449e-11, "loss": 0.8507908630371094, "mean_token_accuracy": 0.7725774693489075, "num_tokens": 58746657.0, "step": 123250 }, { "entropy": 1.0385210883617402, "epoch": 2.9984679360910484, "grad_norm": 12.1875, "learning_rate": 1.4716984617635377e-11, "loss": 1.046081314086914, "mean_token_accuracy": 0.7301409995555878, "num_tokens": 58767979.0, "step": 123300 }, { "entropy": 1.000432608127594, "epoch": 2.9996838598283118, "grad_norm": 18.0, "learning_rate": 7.042308736160408e-13, "loss": 1.0197488403320312, "mean_token_accuracy": 0.7332193768024444, "num_tokens": 58788504.0, "step": 123350 } ], "logging_steps": 50, "max_steps": 123363, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 10000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.2784862226802263e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }