{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.502749410840534, "eval_steps": 100, "global_step": 200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 1.4948265369981528, "epoch": 0.12568735271013354, "grad_norm": 0.0615234375, "learning_rate": 0.00015000000000000001, "loss": 1.7029500961303712, "mean_token_accuracy": 0.6537273893132806, "num_tokens": 156024.0, "step": 10 }, { "entropy": 1.1234326036646962, "epoch": 0.2513747054202671, "grad_norm": 0.037353515625, "learning_rate": 0.00019953520716943371, "loss": 1.1771140098571777, "mean_token_accuracy": 0.7510503690689803, "num_tokens": 298620.0, "step": 20 }, { "entropy": 1.0369394151493907, "epoch": 0.3770620581304006, "grad_norm": 0.0267333984375, "learning_rate": 0.0001972690659618564, "loss": 1.1110390663146972, "mean_token_accuracy": 0.7662029687315226, "num_tokens": 449966.0, "step": 30 }, { "entropy": 1.0444004433229566, "epoch": 0.5027494108405341, "grad_norm": 0.02880859375, "learning_rate": 0.0001931591088051279, "loss": 1.1269343376159668, "mean_token_accuracy": 0.7651370905339718, "num_tokens": 608754.0, "step": 40 }, { "entropy": 1.051739121414721, "epoch": 0.6284367635506677, "grad_norm": 0.0255126953125, "learning_rate": 0.00018728324335139814, "loss": 1.088887882232666, "mean_token_accuracy": 0.7666051037609577, "num_tokens": 764583.0, "step": 50 }, { "entropy": 0.9839291835203767, "epoch": 0.7541241162608012, "grad_norm": 0.029052734375, "learning_rate": 0.0001797528515115709, "loss": 1.020584487915039, "mean_token_accuracy": 0.7803368698805571, "num_tokens": 912716.0, "step": 60 }, { "entropy": 1.0362637933343648, "epoch": 0.8798114689709348, "grad_norm": 0.0244140625, "learning_rate": 0.00017071067811865476, "loss": 1.0753373146057128, "mean_token_accuracy": 0.7713750531896949, "num_tokens": 1064494.0, "step": 70 }, { "entropy": 0.9550455894345552, "epoch": 1.0, "grad_norm": 0.02880859375, "learning_rate": 0.0001603281250808719, "loss": 0.9675676345825195, "mean_token_accuracy": 0.7857394160008898, "num_tokens": 1201271.0, "step": 80 }, { "entropy": 0.9374621393159032, "epoch": 1.1256873527101336, "grad_norm": 0.0244140625, "learning_rate": 0.00014880200231609983, "loss": 0.9870312690734864, "mean_token_accuracy": 0.7901170210912823, "num_tokens": 1349133.0, "step": 90 }, { "entropy": 0.949882148578763, "epoch": 1.251374705420267, "grad_norm": 0.03173828125, "learning_rate": 0.00013635079705638298, "loss": 0.9436046600341796, "mean_token_accuracy": 0.788494935259223, "num_tokens": 1500927.0, "step": 100 }, { "epoch": 1.251374705420267, "eval_entropy": 0.9404270783276625, "eval_loss": 0.9446325302124023, "eval_mean_token_accuracy": 0.7829881757497787, "eval_num_tokens": 1500927.0, "eval_runtime": 86.1079, "eval_samples_per_second": 1.649, "eval_steps_per_second": 1.649, "step": 100 }, { "entropy": 0.9173299714922905, "epoch": 1.3770620581304005, "grad_norm": 0.0233154296875, "learning_rate": 0.0001232105322409468, "loss": 0.9481925964355469, "mean_token_accuracy": 0.7930883213877677, "num_tokens": 1646678.0, "step": 110 }, { "entropy": 0.9073959412053227, "epoch": 1.5027494108405341, "grad_norm": 0.03076171875, "learning_rate": 0.00010963029250531418, "loss": 0.9351880073547363, "mean_token_accuracy": 0.7968914289027452, "num_tokens": 1799797.0, "step": 120 }, { "entropy": 0.9462396390736103, "epoch": 1.6284367635506678, "grad_norm": 0.029296875, "learning_rate": 9.586750257511867e-05, "loss": 0.9954720497131347, "mean_token_accuracy": 0.789978607185185, "num_tokens": 1958692.0, "step": 130 }, { "entropy": 0.9035223769024014, "epoch": 1.7541241162608012, "grad_norm": 0.02880859375, "learning_rate": 8.218304756658072e-05, "loss": 0.9473580360412598, "mean_token_accuracy": 0.7965094247832895, "num_tokens": 2098856.0, "step": 140 }, { "entropy": 0.9002945913001895, "epoch": 1.8798114689709347, "grad_norm": 0.035888671875, "learning_rate": 6.883632769240589e-05, "loss": 0.9326913833618165, "mean_token_accuracy": 0.800568002089858, "num_tokens": 2253582.0, "step": 150 }, { "entropy": 0.8597502765897053, "epoch": 2.0, "grad_norm": 0.031494140625, "learning_rate": 5.608034111526298e-05, "loss": 0.8665840148925781, "mean_token_accuracy": 0.8103327634287816, "num_tokens": 2402542.0, "step": 160 }, { "entropy": 0.8727964337915182, "epoch": 2.1256873527101336, "grad_norm": 0.03173828125, "learning_rate": 4.415688815743858e-05, "loss": 0.9112902641296386, "mean_token_accuracy": 0.8074018105864524, "num_tokens": 2554831.0, "step": 170 }, { "entropy": 0.8327508143149316, "epoch": 2.2513747054202673, "grad_norm": 0.03955078125, "learning_rate": 3.329198777485869e-05, "loss": 0.8397786140441894, "mean_token_accuracy": 0.816638495773077, "num_tokens": 2700898.0, "step": 180 }, { "entropy": 0.8549322345294058, "epoch": 2.3770620581304005, "grad_norm": 0.041015625, "learning_rate": 2.3691593180019366e-05, "loss": 0.8526265144348144, "mean_token_accuracy": 0.8113520180806517, "num_tokens": 2854824.0, "step": 190 }, { "entropy": 0.8102049398235976, "epoch": 2.502749410840534, "grad_norm": 0.06787109375, "learning_rate": 1.553768782775351e-05, "loss": 0.8202457427978516, "mean_token_accuracy": 0.8225593730807305, "num_tokens": 3003572.0, "step": 200 }, { "epoch": 2.502749410840534, "eval_entropy": 0.8503286918284187, "eval_loss": 0.8580905795097351, "eval_mean_token_accuracy": 0.8025841964802272, "eval_num_tokens": 3003572.0, "eval_runtime": 85.9551, "eval_samples_per_second": 1.652, "eval_steps_per_second": 1.652, "step": 200 } ], "logging_steps": 10, "max_steps": 240, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.2965602468164403e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }