{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.48, "eval_steps": 500, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 0.43850863675276436, "epoch": 0.0096, "grad_norm": 4.34375, "learning_rate": 1e-06, "loss": 0.5373, "mean_token_accuracy": 0.8229872425397237, "num_tokens": 3432459.0, "step": 10 }, { "entropy": 0.43693508704503375, "epoch": 0.0192, "grad_norm": 2.8125, "learning_rate": 1e-06, "loss": 0.506, "mean_token_accuracy": 0.8290777782599131, "num_tokens": 6859685.0, "step": 20 }, { "entropy": 0.44068048397699994, "epoch": 0.0288, "grad_norm": 2.421875, "learning_rate": 1e-06, "loss": 0.4912, "mean_token_accuracy": 0.8306132018566131, "num_tokens": 10280810.0, "step": 30 }, { "entropy": 0.4448391616344452, "epoch": 0.0384, "grad_norm": 1.8203125, "learning_rate": 1e-06, "loss": 0.4852, "mean_token_accuracy": 0.8322072466214497, "num_tokens": 13708851.0, "step": 40 }, { "entropy": 0.4470527251561483, "epoch": 0.048, "grad_norm": 1.2734375, "learning_rate": 1e-06, "loss": 0.4798, "mean_token_accuracy": 0.8338870048522949, "num_tokens": 17136948.0, "step": 50 }, { "entropy": 0.44311814606189726, "epoch": 0.0576, "grad_norm": 1.203125, "learning_rate": 1e-06, "loss": 0.4701, "mean_token_accuracy": 0.8359324594338735, "num_tokens": 20560658.0, "step": 60 }, { "entropy": 0.4470181296269099, "epoch": 0.0672, "grad_norm": 1.1640625, "learning_rate": 1e-06, "loss": 0.4702, "mean_token_accuracy": 0.8356486141681672, "num_tokens": 23987248.0, "step": 70 }, { "entropy": 0.449398942788442, "epoch": 0.0768, "grad_norm": 0.9296875, "learning_rate": 1e-06, "loss": 0.47, "mean_token_accuracy": 0.8356277287006378, "num_tokens": 27418078.0, "step": 80 }, { "entropy": 0.4466124544541041, "epoch": 0.0864, "grad_norm": 1.078125, "learning_rate": 1e-06, "loss": 0.4647, "mean_token_accuracy": 0.8362693071365357, "num_tokens": 30842206.0, "step": 90 }, { "entropy": 0.44751456181208293, "epoch": 0.096, "grad_norm": 0.953125, "learning_rate": 1e-06, "loss": 0.465, "mean_token_accuracy": 0.8364668329556783, "num_tokens": 34270915.0, "step": 100 }, { "entropy": 0.44619213143984476, "epoch": 0.1056, "grad_norm": 0.96875, "learning_rate": 1e-06, "loss": 0.462, "mean_token_accuracy": 0.8371777753035228, "num_tokens": 37699757.0, "step": 110 }, { "entropy": 0.4475706567366918, "epoch": 0.1152, "grad_norm": 1.0, "learning_rate": 1e-06, "loss": 0.4618, "mean_token_accuracy": 0.8366606632868449, "num_tokens": 41127680.0, "step": 120 }, { "entropy": 0.44157906572024025, "epoch": 0.1248, "grad_norm": 0.953125, "learning_rate": 1e-06, "loss": 0.4539, "mean_token_accuracy": 0.8387815574804942, "num_tokens": 44550205.0, "step": 130 }, { "entropy": 0.44636616806189217, "epoch": 0.1344, "grad_norm": 0.9375, "learning_rate": 1e-06, "loss": 0.4597, "mean_token_accuracy": 0.8369020839532216, "num_tokens": 47978530.0, "step": 140 }, { "entropy": 0.45083456734816235, "epoch": 0.144, "grad_norm": 0.91015625, "learning_rate": 1e-06, "loss": 0.4615, "mean_token_accuracy": 0.8364426136016846, "num_tokens": 51415497.0, "step": 150 }, { "entropy": 0.4423963377873103, "epoch": 0.1536, "grad_norm": 0.81640625, "learning_rate": 1e-06, "loss": 0.4531, "mean_token_accuracy": 0.8387805402278901, "num_tokens": 54843112.0, "step": 160 }, { "entropy": 0.44047041336695353, "epoch": 0.1632, "grad_norm": 0.86328125, "learning_rate": 1e-06, "loss": 0.4509, "mean_token_accuracy": 0.8391756375630697, "num_tokens": 58269937.0, "step": 170 }, { "entropy": 0.44535795946915946, "epoch": 0.1728, "grad_norm": 0.84765625, "learning_rate": 1e-06, "loss": 0.4549, "mean_token_accuracy": 0.8376253386338551, "num_tokens": 61698122.0, "step": 180 }, { "entropy": 0.4441406190395355, "epoch": 0.1824, "grad_norm": 0.921875, "learning_rate": 1e-06, "loss": 0.4523, "mean_token_accuracy": 0.8387903730074565, "num_tokens": 65129853.0, "step": 190 }, { "entropy": 0.4387974033753077, "epoch": 0.192, "grad_norm": 0.8046875, "learning_rate": 1e-06, "loss": 0.4475, "mean_token_accuracy": 0.8403141776720683, "num_tokens": 68554676.0, "step": 200 }, { "entropy": 0.43539145588874817, "epoch": 0.2016, "grad_norm": 0.9921875, "learning_rate": 1e-06, "loss": 0.4451, "mean_token_accuracy": 0.8406339287757874, "num_tokens": 71978060.0, "step": 210 }, { "entropy": 0.4371789425611496, "epoch": 0.2112, "grad_norm": 0.85546875, "learning_rate": 1e-06, "loss": 0.4457, "mean_token_accuracy": 0.8400953491528829, "num_tokens": 75401498.0, "step": 220 }, { "entropy": 0.4436704327662786, "epoch": 0.2208, "grad_norm": 1.0234375, "learning_rate": 1e-06, "loss": 0.4512, "mean_token_accuracy": 0.8383757730325063, "num_tokens": 78829143.0, "step": 230 }, { "entropy": 0.43721583088239035, "epoch": 0.2304, "grad_norm": 0.7578125, "learning_rate": 1e-06, "loss": 0.4437, "mean_token_accuracy": 0.8410045862197876, "num_tokens": 82255415.0, "step": 240 }, { "entropy": 0.43779849211374916, "epoch": 0.24, "grad_norm": 0.7890625, "learning_rate": 1e-06, "loss": 0.4449, "mean_token_accuracy": 0.840403014421463, "num_tokens": 85678092.0, "step": 250 }, { "entropy": 0.4423576871554057, "epoch": 0.2496, "grad_norm": 0.7578125, "learning_rate": 1e-06, "loss": 0.4496, "mean_token_accuracy": 0.8392611801624298, "num_tokens": 89108181.0, "step": 260 }, { "entropy": 0.44207868178685505, "epoch": 0.2592, "grad_norm": 0.8359375, "learning_rate": 1e-06, "loss": 0.449, "mean_token_accuracy": 0.8386734426021576, "num_tokens": 92534098.0, "step": 270 }, { "entropy": 0.43932537039120995, "epoch": 0.2688, "grad_norm": 0.76171875, "learning_rate": 1e-06, "loss": 0.4453, "mean_token_accuracy": 0.8403779149055481, "num_tokens": 95965587.0, "step": 280 }, { "entropy": 0.44096320470174155, "epoch": 0.2784, "grad_norm": 0.8203125, "learning_rate": 1e-06, "loss": 0.447, "mean_token_accuracy": 0.8391348044077556, "num_tokens": 99394676.0, "step": 290 }, { "entropy": 0.4383016347885132, "epoch": 0.288, "grad_norm": 0.70703125, "learning_rate": 1e-06, "loss": 0.4447, "mean_token_accuracy": 0.840146021048228, "num_tokens": 102821016.0, "step": 300 }, { "entropy": 0.4389273832241694, "epoch": 0.2976, "grad_norm": 0.74609375, "learning_rate": 1e-06, "loss": 0.4452, "mean_token_accuracy": 0.8401629229386648, "num_tokens": 106250675.0, "step": 310 }, { "entropy": 0.43793109953403475, "epoch": 0.3072, "grad_norm": 1.21875, "learning_rate": 1e-06, "loss": 0.4423, "mean_token_accuracy": 0.8408861041069031, "num_tokens": 109676233.0, "step": 320 }, { "entropy": 0.4382920225461324, "epoch": 0.3168, "grad_norm": 0.72265625, "learning_rate": 1e-06, "loss": 0.4438, "mean_token_accuracy": 0.840533846616745, "num_tokens": 113104836.0, "step": 330 }, { "entropy": 0.4332722157239914, "epoch": 0.3264, "grad_norm": 0.67578125, "learning_rate": 1e-06, "loss": 0.4384, "mean_token_accuracy": 0.8416322549184163, "num_tokens": 116528992.0, "step": 340 }, { "entropy": 0.43754682640234627, "epoch": 0.336, "grad_norm": 0.82421875, "learning_rate": 1e-06, "loss": 0.443, "mean_token_accuracy": 0.8403245389461518, "num_tokens": 119955123.0, "step": 350 }, { "entropy": 0.42634722888469695, "epoch": 0.3456, "grad_norm": 0.76171875, "learning_rate": 1e-06, "loss": 0.4304, "mean_token_accuracy": 0.844380776087443, "num_tokens": 123374023.0, "step": 360 }, { "entropy": 0.43683901329835256, "epoch": 0.3552, "grad_norm": 0.6875, "learning_rate": 1e-06, "loss": 0.4426, "mean_token_accuracy": 0.8407382468382517, "num_tokens": 126801597.0, "step": 370 }, { "entropy": 0.4341103653113047, "epoch": 0.3648, "grad_norm": 0.95703125, "learning_rate": 1e-06, "loss": 0.4394, "mean_token_accuracy": 0.8416337351004283, "num_tokens": 130226608.0, "step": 380 }, { "entropy": 0.43337511718273164, "epoch": 0.3744, "grad_norm": 0.76953125, "learning_rate": 1e-06, "loss": 0.4371, "mean_token_accuracy": 0.8418285946051279, "num_tokens": 133650147.0, "step": 390 }, { "entropy": 0.43345692853132883, "epoch": 0.384, "grad_norm": 0.75, "learning_rate": 1e-06, "loss": 0.4371, "mean_token_accuracy": 0.8420754154523213, "num_tokens": 137072559.0, "step": 400 }, { "entropy": 0.43710677921772, "epoch": 0.3936, "grad_norm": 0.734375, "learning_rate": 1e-06, "loss": 0.4418, "mean_token_accuracy": 0.8403918604056041, "num_tokens": 140502777.0, "step": 410 }, { "entropy": 0.4345085640748342, "epoch": 0.4032, "grad_norm": 0.88671875, "learning_rate": 1e-06, "loss": 0.4382, "mean_token_accuracy": 0.8418921589851379, "num_tokens": 143932092.0, "step": 420 }, { "entropy": 0.43460349341233573, "epoch": 0.4128, "grad_norm": 0.66796875, "learning_rate": 1e-06, "loss": 0.4391, "mean_token_accuracy": 0.8413500209649404, "num_tokens": 147358021.0, "step": 430 }, { "entropy": 0.4344378610452016, "epoch": 0.4224, "grad_norm": 0.765625, "learning_rate": 1e-06, "loss": 0.4384, "mean_token_accuracy": 0.8416643917560578, "num_tokens": 150785208.0, "step": 440 }, { "entropy": 0.43424378136793773, "epoch": 0.432, "grad_norm": 0.75390625, "learning_rate": 1e-06, "loss": 0.4378, "mean_token_accuracy": 0.8417747735977172, "num_tokens": 154214667.0, "step": 450 }, { "entropy": 0.43412678241729735, "epoch": 0.4416, "grad_norm": 0.6484375, "learning_rate": 1e-06, "loss": 0.439, "mean_token_accuracy": 0.8415433506170908, "num_tokens": 157644805.0, "step": 460 }, { "entropy": 0.4340047796567281, "epoch": 0.4512, "grad_norm": 0.6640625, "learning_rate": 1e-06, "loss": 0.4372, "mean_token_accuracy": 0.8419170657793681, "num_tokens": 161074326.0, "step": 470 }, { "entropy": 0.4271635631720225, "epoch": 0.4608, "grad_norm": 0.65625, "learning_rate": 1e-06, "loss": 0.4295, "mean_token_accuracy": 0.8433916787306468, "num_tokens": 164493009.0, "step": 480 }, { "entropy": 0.4347446064154307, "epoch": 0.4704, "grad_norm": 0.7109375, "learning_rate": 1e-06, "loss": 0.4385, "mean_token_accuracy": 0.8410797854264577, "num_tokens": 167922637.0, "step": 490 }, { "entropy": 0.43026507596174873, "epoch": 0.48, "grad_norm": 0.6640625, "learning_rate": 1e-06, "loss": 0.433, "mean_token_accuracy": 0.8427020668983459, "num_tokens": 171349238.0, "step": 500 } ], "logging_steps": 10, "max_steps": 1042, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 4.381015024450142e+18, "train_batch_size": 16, "trial_name": null, "trial_params": null }