{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.96, "eval_steps": 500, "global_step": 1000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 0.43850863675276436, "epoch": 0.0096, "grad_norm": 4.34375, "learning_rate": 1e-06, "loss": 0.5373, "mean_token_accuracy": 0.8229872425397237, "num_tokens": 3432459.0, "step": 10 }, { "entropy": 0.43693508704503375, "epoch": 0.0192, "grad_norm": 2.8125, "learning_rate": 1e-06, "loss": 0.506, "mean_token_accuracy": 0.8290777782599131, "num_tokens": 6859685.0, "step": 20 }, { "entropy": 0.44068048397699994, "epoch": 0.0288, "grad_norm": 2.421875, "learning_rate": 1e-06, "loss": 0.4912, "mean_token_accuracy": 0.8306132018566131, "num_tokens": 10280810.0, "step": 30 }, { "entropy": 0.4448391616344452, "epoch": 0.0384, "grad_norm": 1.8203125, "learning_rate": 1e-06, "loss": 0.4852, "mean_token_accuracy": 0.8322072466214497, "num_tokens": 13708851.0, "step": 40 }, { "entropy": 0.4470527251561483, "epoch": 0.048, "grad_norm": 1.2734375, "learning_rate": 1e-06, "loss": 0.4798, "mean_token_accuracy": 0.8338870048522949, "num_tokens": 17136948.0, "step": 50 }, { "entropy": 0.44311814606189726, "epoch": 0.0576, "grad_norm": 1.203125, "learning_rate": 1e-06, "loss": 0.4701, "mean_token_accuracy": 0.8359324594338735, "num_tokens": 20560658.0, "step": 60 }, { "entropy": 0.4470181296269099, "epoch": 0.0672, "grad_norm": 1.1640625, "learning_rate": 1e-06, "loss": 0.4702, "mean_token_accuracy": 0.8356486141681672, "num_tokens": 23987248.0, "step": 70 }, { "entropy": 0.449398942788442, "epoch": 0.0768, "grad_norm": 0.9296875, "learning_rate": 1e-06, "loss": 0.47, "mean_token_accuracy": 0.8356277287006378, "num_tokens": 27418078.0, "step": 80 }, { "entropy": 0.4466124544541041, "epoch": 0.0864, "grad_norm": 1.078125, "learning_rate": 1e-06, "loss": 0.4647, "mean_token_accuracy": 0.8362693071365357, "num_tokens": 30842206.0, "step": 90 }, { "entropy": 0.44751456181208293, "epoch": 0.096, "grad_norm": 0.953125, "learning_rate": 1e-06, "loss": 0.465, "mean_token_accuracy": 0.8364668329556783, "num_tokens": 34270915.0, "step": 100 }, { "entropy": 0.44619213143984476, "epoch": 0.1056, "grad_norm": 0.96875, "learning_rate": 1e-06, "loss": 0.462, "mean_token_accuracy": 0.8371777753035228, "num_tokens": 37699757.0, "step": 110 }, { "entropy": 0.4475706567366918, "epoch": 0.1152, "grad_norm": 1.0, "learning_rate": 1e-06, "loss": 0.4618, "mean_token_accuracy": 0.8366606632868449, "num_tokens": 41127680.0, "step": 120 }, { "entropy": 0.44157906572024025, "epoch": 0.1248, "grad_norm": 0.953125, "learning_rate": 1e-06, "loss": 0.4539, "mean_token_accuracy": 0.8387815574804942, "num_tokens": 44550205.0, "step": 130 }, { "entropy": 0.44636616806189217, "epoch": 0.1344, "grad_norm": 0.9375, "learning_rate": 1e-06, "loss": 0.4597, "mean_token_accuracy": 0.8369020839532216, "num_tokens": 47978530.0, "step": 140 }, { "entropy": 0.45083456734816235, "epoch": 0.144, "grad_norm": 0.91015625, "learning_rate": 1e-06, "loss": 0.4615, "mean_token_accuracy": 0.8364426136016846, "num_tokens": 51415497.0, "step": 150 }, { "entropy": 0.4423963377873103, "epoch": 0.1536, "grad_norm": 0.81640625, "learning_rate": 1e-06, "loss": 0.4531, "mean_token_accuracy": 0.8387805402278901, "num_tokens": 54843112.0, "step": 160 }, { "entropy": 0.44047041336695353, "epoch": 0.1632, "grad_norm": 0.86328125, "learning_rate": 1e-06, "loss": 0.4509, "mean_token_accuracy": 0.8391756375630697, "num_tokens": 58269937.0, "step": 170 }, { "entropy": 0.44535795946915946, "epoch": 0.1728, "grad_norm": 0.84765625, "learning_rate": 1e-06, "loss": 0.4549, "mean_token_accuracy": 0.8376253386338551, "num_tokens": 61698122.0, "step": 180 }, { "entropy": 0.4441406190395355, "epoch": 0.1824, "grad_norm": 0.921875, "learning_rate": 1e-06, "loss": 0.4523, "mean_token_accuracy": 0.8387903730074565, "num_tokens": 65129853.0, "step": 190 }, { "entropy": 0.4387974033753077, "epoch": 0.192, "grad_norm": 0.8046875, "learning_rate": 1e-06, "loss": 0.4475, "mean_token_accuracy": 0.8403141776720683, "num_tokens": 68554676.0, "step": 200 }, { "entropy": 0.43539145588874817, "epoch": 0.2016, "grad_norm": 0.9921875, "learning_rate": 1e-06, "loss": 0.4451, "mean_token_accuracy": 0.8406339287757874, "num_tokens": 71978060.0, "step": 210 }, { "entropy": 0.4371789425611496, "epoch": 0.2112, "grad_norm": 0.85546875, "learning_rate": 1e-06, "loss": 0.4457, "mean_token_accuracy": 0.8400953491528829, "num_tokens": 75401498.0, "step": 220 }, { "entropy": 0.4436704327662786, "epoch": 0.2208, "grad_norm": 1.0234375, "learning_rate": 1e-06, "loss": 0.4512, "mean_token_accuracy": 0.8383757730325063, "num_tokens": 78829143.0, "step": 230 }, { "entropy": 0.43721583088239035, "epoch": 0.2304, "grad_norm": 0.7578125, "learning_rate": 1e-06, "loss": 0.4437, "mean_token_accuracy": 0.8410045862197876, "num_tokens": 82255415.0, "step": 240 }, { "entropy": 0.43779849211374916, "epoch": 0.24, "grad_norm": 0.7890625, "learning_rate": 1e-06, "loss": 0.4449, "mean_token_accuracy": 0.840403014421463, "num_tokens": 85678092.0, "step": 250 }, { "entropy": 0.4423576871554057, "epoch": 0.2496, "grad_norm": 0.7578125, "learning_rate": 1e-06, "loss": 0.4496, "mean_token_accuracy": 0.8392611801624298, "num_tokens": 89108181.0, "step": 260 }, { "entropy": 0.44207868178685505, "epoch": 0.2592, "grad_norm": 0.8359375, "learning_rate": 1e-06, "loss": 0.449, "mean_token_accuracy": 0.8386734426021576, "num_tokens": 92534098.0, "step": 270 }, { "entropy": 0.43932537039120995, "epoch": 0.2688, "grad_norm": 0.76171875, "learning_rate": 1e-06, "loss": 0.4453, "mean_token_accuracy": 0.8403779149055481, "num_tokens": 95965587.0, "step": 280 }, { "entropy": 0.44096320470174155, "epoch": 0.2784, "grad_norm": 0.8203125, "learning_rate": 1e-06, "loss": 0.447, "mean_token_accuracy": 0.8391348044077556, "num_tokens": 99394676.0, "step": 290 }, { "entropy": 0.4383016347885132, "epoch": 0.288, "grad_norm": 0.70703125, "learning_rate": 1e-06, "loss": 0.4447, "mean_token_accuracy": 0.840146021048228, "num_tokens": 102821016.0, "step": 300 }, { "entropy": 0.4389273832241694, "epoch": 0.2976, "grad_norm": 0.74609375, "learning_rate": 1e-06, "loss": 0.4452, "mean_token_accuracy": 0.8401629229386648, "num_tokens": 106250675.0, "step": 310 }, { "entropy": 0.43793109953403475, "epoch": 0.3072, "grad_norm": 1.21875, "learning_rate": 1e-06, "loss": 0.4423, "mean_token_accuracy": 0.8408861041069031, "num_tokens": 109676233.0, "step": 320 }, { "entropy": 0.4382920225461324, "epoch": 0.3168, "grad_norm": 0.72265625, "learning_rate": 1e-06, "loss": 0.4438, "mean_token_accuracy": 0.840533846616745, "num_tokens": 113104836.0, "step": 330 }, { "entropy": 0.4332722157239914, "epoch": 0.3264, "grad_norm": 0.67578125, "learning_rate": 1e-06, "loss": 0.4384, "mean_token_accuracy": 0.8416322549184163, "num_tokens": 116528992.0, "step": 340 }, { "entropy": 0.43754682640234627, "epoch": 0.336, "grad_norm": 0.82421875, "learning_rate": 1e-06, "loss": 0.443, "mean_token_accuracy": 0.8403245389461518, "num_tokens": 119955123.0, "step": 350 }, { "entropy": 0.42634722888469695, "epoch": 0.3456, "grad_norm": 0.76171875, "learning_rate": 1e-06, "loss": 0.4304, "mean_token_accuracy": 0.844380776087443, "num_tokens": 123374023.0, "step": 360 }, { "entropy": 0.43683901329835256, "epoch": 0.3552, "grad_norm": 0.6875, "learning_rate": 1e-06, "loss": 0.4426, "mean_token_accuracy": 0.8407382468382517, "num_tokens": 126801597.0, "step": 370 }, { "entropy": 0.4341103653113047, "epoch": 0.3648, "grad_norm": 0.95703125, "learning_rate": 1e-06, "loss": 0.4394, "mean_token_accuracy": 0.8416337351004283, "num_tokens": 130226608.0, "step": 380 }, { "entropy": 0.43337511718273164, "epoch": 0.3744, "grad_norm": 0.76953125, "learning_rate": 1e-06, "loss": 0.4371, "mean_token_accuracy": 0.8418285946051279, "num_tokens": 133650147.0, "step": 390 }, { "entropy": 0.43345692853132883, "epoch": 0.384, "grad_norm": 0.75, "learning_rate": 1e-06, "loss": 0.4371, "mean_token_accuracy": 0.8420754154523213, "num_tokens": 137072559.0, "step": 400 }, { "entropy": 0.43710677921772, "epoch": 0.3936, "grad_norm": 0.734375, "learning_rate": 1e-06, "loss": 0.4418, "mean_token_accuracy": 0.8403918604056041, "num_tokens": 140502777.0, "step": 410 }, { "entropy": 0.4345085640748342, "epoch": 0.4032, "grad_norm": 0.88671875, "learning_rate": 1e-06, "loss": 0.4382, "mean_token_accuracy": 0.8418921589851379, "num_tokens": 143932092.0, "step": 420 }, { "entropy": 0.43460349341233573, "epoch": 0.4128, "grad_norm": 0.66796875, "learning_rate": 1e-06, "loss": 0.4391, "mean_token_accuracy": 0.8413500209649404, "num_tokens": 147358021.0, "step": 430 }, { "entropy": 0.4344378610452016, "epoch": 0.4224, "grad_norm": 0.765625, "learning_rate": 1e-06, "loss": 0.4384, "mean_token_accuracy": 0.8416643917560578, "num_tokens": 150785208.0, "step": 440 }, { "entropy": 0.43424378136793773, "epoch": 0.432, "grad_norm": 0.75390625, "learning_rate": 1e-06, "loss": 0.4378, "mean_token_accuracy": 0.8417747735977172, "num_tokens": 154214667.0, "step": 450 }, { "entropy": 0.43412678241729735, "epoch": 0.4416, "grad_norm": 0.6484375, "learning_rate": 1e-06, "loss": 0.439, "mean_token_accuracy": 0.8415433506170908, "num_tokens": 157644805.0, "step": 460 }, { "entropy": 0.4340047796567281, "epoch": 0.4512, "grad_norm": 0.6640625, "learning_rate": 1e-06, "loss": 0.4372, "mean_token_accuracy": 0.8419170657793681, "num_tokens": 161074326.0, "step": 470 }, { "entropy": 0.4271635631720225, "epoch": 0.4608, "grad_norm": 0.65625, "learning_rate": 1e-06, "loss": 0.4295, "mean_token_accuracy": 0.8433916787306468, "num_tokens": 164493009.0, "step": 480 }, { "entropy": 0.4347446064154307, "epoch": 0.4704, "grad_norm": 0.7109375, "learning_rate": 1e-06, "loss": 0.4385, "mean_token_accuracy": 0.8410797854264577, "num_tokens": 167922637.0, "step": 490 }, { "entropy": 0.43026507596174873, "epoch": 0.48, "grad_norm": 0.6640625, "learning_rate": 1e-06, "loss": 0.433, "mean_token_accuracy": 0.8427020668983459, "num_tokens": 171349238.0, "step": 500 }, { "entropy": 0.43025335570176443, "epoch": 0.4896, "grad_norm": 0.67578125, "learning_rate": 1e-06, "loss": 0.4332, "mean_token_accuracy": 0.8427554865678152, "num_tokens": 174774665.0, "step": 510 }, { "entropy": 0.43068666458129884, "epoch": 0.4992, "grad_norm": 0.67578125, "learning_rate": 1e-06, "loss": 0.4343, "mean_token_accuracy": 0.8422266582647959, "num_tokens": 178199588.0, "step": 520 }, { "entropy": 0.4314758092164993, "epoch": 0.5088, "grad_norm": 0.69140625, "learning_rate": 1e-06, "loss": 0.4341, "mean_token_accuracy": 0.8423931996027628, "num_tokens": 181622373.0, "step": 530 }, { "entropy": 0.43214026689529417, "epoch": 0.5184, "grad_norm": 0.62890625, "learning_rate": 1e-06, "loss": 0.4352, "mean_token_accuracy": 0.8417826076348622, "num_tokens": 185046141.0, "step": 540 }, { "entropy": 0.42929191191991173, "epoch": 0.528, "grad_norm": 0.6484375, "learning_rate": 1e-06, "loss": 0.4315, "mean_token_accuracy": 0.8429112037022909, "num_tokens": 188471598.0, "step": 550 }, { "entropy": 0.43347863058249153, "epoch": 0.5376, "grad_norm": 0.6796875, "learning_rate": 1e-06, "loss": 0.4362, "mean_token_accuracy": 0.8421526908874511, "num_tokens": 191901663.0, "step": 560 }, { "entropy": 0.4340634206930796, "epoch": 0.5472, "grad_norm": 0.703125, "learning_rate": 1e-06, "loss": 0.4385, "mean_token_accuracy": 0.8407556653022766, "num_tokens": 195330705.0, "step": 570 }, { "entropy": 0.4317493091026942, "epoch": 0.5568, "grad_norm": 0.640625, "learning_rate": 1e-06, "loss": 0.4341, "mean_token_accuracy": 0.8426395455996195, "num_tokens": 198759569.0, "step": 580 }, { "entropy": 0.4264296998580297, "epoch": 0.5664, "grad_norm": 0.79296875, "learning_rate": 1e-06, "loss": 0.4288, "mean_token_accuracy": 0.843881368637085, "num_tokens": 202180759.0, "step": 590 }, { "entropy": 0.42934685150782265, "epoch": 0.576, "grad_norm": 0.66796875, "learning_rate": 1e-06, "loss": 0.4319, "mean_token_accuracy": 0.8431523183981577, "num_tokens": 205609433.0, "step": 600 }, { "entropy": 0.43238858282566073, "epoch": 0.5856, "grad_norm": 0.69921875, "learning_rate": 1e-06, "loss": 0.4351, "mean_token_accuracy": 0.8418097396691641, "num_tokens": 209040734.0, "step": 610 }, { "entropy": 0.4318026860555013, "epoch": 0.5952, "grad_norm": 0.6328125, "learning_rate": 1e-06, "loss": 0.4355, "mean_token_accuracy": 0.8424915373325348, "num_tokens": 212473401.0, "step": 620 }, { "entropy": 0.4235608865817388, "epoch": 0.6048, "grad_norm": 0.6015625, "learning_rate": 1e-06, "loss": 0.4256, "mean_token_accuracy": 0.8451377809047699, "num_tokens": 215896965.0, "step": 630 }, { "entropy": 0.42819432020187376, "epoch": 0.6144, "grad_norm": 1.1484375, "learning_rate": 1e-06, "loss": 0.4307, "mean_token_accuracy": 0.8432986001173656, "num_tokens": 219323070.0, "step": 640 }, { "entropy": 0.43294126689434054, "epoch": 0.624, "grad_norm": 0.70703125, "learning_rate": 1e-06, "loss": 0.4357, "mean_token_accuracy": 0.8416835029919942, "num_tokens": 222753922.0, "step": 650 }, { "entropy": 0.42949473758538564, "epoch": 0.6336, "grad_norm": 0.734375, "learning_rate": 1e-06, "loss": 0.4329, "mean_token_accuracy": 0.8425247291723887, "num_tokens": 226181564.0, "step": 660 }, { "entropy": 0.431626628835996, "epoch": 0.6432, "grad_norm": 0.9140625, "learning_rate": 1e-06, "loss": 0.4342, "mean_token_accuracy": 0.8420288483301799, "num_tokens": 229609454.0, "step": 670 }, { "entropy": 0.42529774804910025, "epoch": 0.6528, "grad_norm": 0.7109375, "learning_rate": 1e-06, "loss": 0.4283, "mean_token_accuracy": 0.8438542902469635, "num_tokens": 233030434.0, "step": 680 }, { "entropy": 0.4295430819193522, "epoch": 0.6624, "grad_norm": 0.7109375, "learning_rate": 1e-06, "loss": 0.4319, "mean_token_accuracy": 0.8431335310141246, "num_tokens": 236458053.0, "step": 690 }, { "entropy": 0.4359436571598053, "epoch": 0.672, "grad_norm": 0.875, "learning_rate": 1e-06, "loss": 0.4387, "mean_token_accuracy": 0.8408014853795369, "num_tokens": 239892329.0, "step": 700 }, { "entropy": 0.43046695590019224, "epoch": 0.6816, "grad_norm": 0.83203125, "learning_rate": 1e-06, "loss": 0.4329, "mean_token_accuracy": 0.8423032621542613, "num_tokens": 243321186.0, "step": 710 }, { "entropy": 0.427196944753329, "epoch": 0.6912, "grad_norm": 0.578125, "learning_rate": 1e-06, "loss": 0.4296, "mean_token_accuracy": 0.8436582644780477, "num_tokens": 246748277.0, "step": 720 }, { "entropy": 0.4301902174949646, "epoch": 0.7008, "grad_norm": 0.6484375, "learning_rate": 1e-06, "loss": 0.4319, "mean_token_accuracy": 0.8430530607700348, "num_tokens": 250176097.0, "step": 730 }, { "entropy": 0.42852813402811685, "epoch": 0.7104, "grad_norm": 0.81640625, "learning_rate": 1e-06, "loss": 0.4308, "mean_token_accuracy": 0.8430961946646373, "num_tokens": 253603786.0, "step": 740 }, { "entropy": 0.4267232229312261, "epoch": 0.72, "grad_norm": 0.66015625, "learning_rate": 1e-06, "loss": 0.4294, "mean_token_accuracy": 0.8436786532402039, "num_tokens": 257030085.0, "step": 750 }, { "entropy": 0.42847318251927696, "epoch": 0.7296, "grad_norm": 0.9453125, "learning_rate": 1e-06, "loss": 0.4313, "mean_token_accuracy": 0.8432875255743663, "num_tokens": 260459662.0, "step": 760 }, { "entropy": 0.42785159647464754, "epoch": 0.7392, "grad_norm": 0.65234375, "learning_rate": 1e-06, "loss": 0.4296, "mean_token_accuracy": 0.8437132080396016, "num_tokens": 263886799.0, "step": 770 }, { "entropy": 0.42436840136845905, "epoch": 0.7488, "grad_norm": 0.7734375, "learning_rate": 1e-06, "loss": 0.4271, "mean_token_accuracy": 0.8438882827758789, "num_tokens": 267311788.0, "step": 780 }, { "entropy": 0.42350135842959086, "epoch": 0.7584, "grad_norm": 0.71484375, "learning_rate": 1e-06, "loss": 0.4244, "mean_token_accuracy": 0.8450363477071127, "num_tokens": 270733591.0, "step": 790 }, { "entropy": 0.42643423279126486, "epoch": 0.768, "grad_norm": 0.73828125, "learning_rate": 1e-06, "loss": 0.4289, "mean_token_accuracy": 0.8439076920350392, "num_tokens": 274158074.0, "step": 800 }, { "entropy": 0.4293264577786128, "epoch": 0.7776, "grad_norm": 0.76171875, "learning_rate": 1e-06, "loss": 0.4324, "mean_token_accuracy": 0.8420809169610342, "num_tokens": 277581909.0, "step": 810 }, { "entropy": 0.42917039692401887, "epoch": 0.7872, "grad_norm": 0.74609375, "learning_rate": 1e-06, "loss": 0.4308, "mean_token_accuracy": 0.842545215288798, "num_tokens": 281007702.0, "step": 820 }, { "entropy": 0.4337611397107442, "epoch": 0.7968, "grad_norm": 0.71484375, "learning_rate": 1e-06, "loss": 0.4356, "mean_token_accuracy": 0.8417014559110005, "num_tokens": 284442105.0, "step": 830 }, { "entropy": 0.42585750023523966, "epoch": 0.8064, "grad_norm": 0.60546875, "learning_rate": 1e-06, "loss": 0.4281, "mean_token_accuracy": 0.8437936822573344, "num_tokens": 287868793.0, "step": 840 }, { "entropy": 0.4264136056105296, "epoch": 0.816, "grad_norm": 0.9296875, "learning_rate": 1e-06, "loss": 0.4291, "mean_token_accuracy": 0.8439127763112386, "num_tokens": 291295752.0, "step": 850 }, { "entropy": 0.4276336113611857, "epoch": 0.8256, "grad_norm": 0.69140625, "learning_rate": 1e-06, "loss": 0.4294, "mean_token_accuracy": 0.8433744112650553, "num_tokens": 294724526.0, "step": 860 }, { "entropy": 0.4295493682225545, "epoch": 0.8352, "grad_norm": 0.67578125, "learning_rate": 1e-06, "loss": 0.4314, "mean_token_accuracy": 0.8426620582739512, "num_tokens": 298153923.0, "step": 870 }, { "entropy": 0.42792819142341615, "epoch": 0.8448, "grad_norm": 0.76953125, "learning_rate": 1e-06, "loss": 0.4304, "mean_token_accuracy": 0.8432036856810252, "num_tokens": 301584253.0, "step": 880 }, { "entropy": 0.42749512096246084, "epoch": 0.8544, "grad_norm": 0.77734375, "learning_rate": 1e-06, "loss": 0.4299, "mean_token_accuracy": 0.8432926038901011, "num_tokens": 305013666.0, "step": 890 }, { "entropy": 0.4297857642173767, "epoch": 0.864, "grad_norm": 0.83984375, "learning_rate": 1e-06, "loss": 0.4318, "mean_token_accuracy": 0.8426412324110667, "num_tokens": 308440989.0, "step": 900 }, { "entropy": 0.42787257631619774, "epoch": 0.8736, "grad_norm": 0.66015625, "learning_rate": 1e-06, "loss": 0.4303, "mean_token_accuracy": 0.8433160742123922, "num_tokens": 311870076.0, "step": 910 }, { "entropy": 0.42855414350827536, "epoch": 0.8832, "grad_norm": 0.71875, "learning_rate": 1e-06, "loss": 0.4303, "mean_token_accuracy": 0.8432195166746775, "num_tokens": 315296851.0, "step": 920 }, { "entropy": 0.42333943645159405, "epoch": 0.8928, "grad_norm": 0.625, "learning_rate": 1e-06, "loss": 0.4242, "mean_token_accuracy": 0.8448341071605683, "num_tokens": 318720357.0, "step": 930 }, { "entropy": 0.4254674275716146, "epoch": 0.9024, "grad_norm": 0.69140625, "learning_rate": 1e-06, "loss": 0.4272, "mean_token_accuracy": 0.844523819287618, "num_tokens": 322150685.0, "step": 940 }, { "entropy": 0.4280929406483968, "epoch": 0.912, "grad_norm": 0.609375, "learning_rate": 1e-06, "loss": 0.4305, "mean_token_accuracy": 0.8432438095410665, "num_tokens": 325578800.0, "step": 950 }, { "entropy": 0.4216255853573481, "epoch": 0.9216, "grad_norm": 0.76171875, "learning_rate": 1e-06, "loss": 0.4233, "mean_token_accuracy": 0.8448065340518951, "num_tokens": 328999744.0, "step": 960 }, { "entropy": 0.42576794425646464, "epoch": 0.9312, "grad_norm": 0.78515625, "learning_rate": 1e-06, "loss": 0.4274, "mean_token_accuracy": 0.8436802566051483, "num_tokens": 332424272.0, "step": 970 }, { "entropy": 0.41938706636428835, "epoch": 0.9408, "grad_norm": 0.60546875, "learning_rate": 1e-06, "loss": 0.4206, "mean_token_accuracy": 0.8458850880463918, "num_tokens": 335842047.0, "step": 980 }, { "entropy": 0.4206093430519104, "epoch": 0.9504, "grad_norm": 0.78515625, "learning_rate": 1e-06, "loss": 0.4237, "mean_token_accuracy": 0.8452507853507996, "num_tokens": 339266824.0, "step": 990 }, { "entropy": 0.42829200327396394, "epoch": 0.96, "grad_norm": 0.6875, "learning_rate": 1e-06, "loss": 0.4304, "mean_token_accuracy": 0.842687439918518, "num_tokens": 342692717.0, "step": 1000 } ], "logging_steps": 10, "max_steps": 1042, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 8.762403376047587e+18, "train_batch_size": 16, "trial_name": null, "trial_params": null }