{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.502749410840534,
  "eval_steps": 100,
  "global_step": 200,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "entropy": 1.4948265369981528,
      "epoch": 0.12568735271013354,
      "grad_norm": 0.0615234375,
      "learning_rate": 0.00015000000000000001,
      "loss": 1.7029500961303712,
      "mean_token_accuracy": 0.6537273893132806,
      "num_tokens": 156024.0,
      "step": 10
    },
    {
      "entropy": 1.1234326036646962,
      "epoch": 0.2513747054202671,
      "grad_norm": 0.037353515625,
      "learning_rate": 0.00019953520716943371,
      "loss": 1.1771140098571777,
      "mean_token_accuracy": 0.7510503690689803,
      "num_tokens": 298620.0,
      "step": 20
    },
    {
      "entropy": 1.0369394151493907,
      "epoch": 0.3770620581304006,
      "grad_norm": 0.0267333984375,
      "learning_rate": 0.0001972690659618564,
      "loss": 1.1110390663146972,
      "mean_token_accuracy": 0.7662029687315226,
      "num_tokens": 449966.0,
      "step": 30
    },
    {
      "entropy": 1.0444004433229566,
      "epoch": 0.5027494108405341,
      "grad_norm": 0.02880859375,
      "learning_rate": 0.0001931591088051279,
      "loss": 1.1269343376159668,
      "mean_token_accuracy": 0.7651370905339718,
      "num_tokens": 608754.0,
      "step": 40
    },
    {
      "entropy": 1.051739121414721,
      "epoch": 0.6284367635506677,
      "grad_norm": 0.0255126953125,
      "learning_rate": 0.00018728324335139814,
      "loss": 1.088887882232666,
      "mean_token_accuracy": 0.7666051037609577,
      "num_tokens": 764583.0,
      "step": 50
    },
    {
      "entropy": 0.9839291835203767,
      "epoch": 0.7541241162608012,
      "grad_norm": 0.029052734375,
      "learning_rate": 0.0001797528515115709,
      "loss": 1.020584487915039,
      "mean_token_accuracy": 0.7803368698805571,
      "num_tokens": 912716.0,
      "step": 60
    },
    {
      "entropy": 1.0362637933343648,
      "epoch": 0.8798114689709348,
      "grad_norm": 0.0244140625,
      "learning_rate": 0.00017071067811865476,
      "loss": 1.0753373146057128,
      "mean_token_accuracy": 0.7713750531896949,
      "num_tokens": 1064494.0,
      "step": 70
    },
    {
      "entropy": 0.9550455894345552,
      "epoch": 1.0,
      "grad_norm": 0.02880859375,
      "learning_rate": 0.0001603281250808719,
      "loss": 0.9675676345825195,
      "mean_token_accuracy": 0.7857394160008898,
      "num_tokens": 1201271.0,
      "step": 80
    },
    {
      "entropy": 0.9374621393159032,
      "epoch": 1.1256873527101336,
      "grad_norm": 0.0244140625,
      "learning_rate": 0.00014880200231609983,
      "loss": 0.9870312690734864,
      "mean_token_accuracy": 0.7901170210912823,
      "num_tokens": 1349133.0,
      "step": 90
    },
    {
      "entropy": 0.949882148578763,
      "epoch": 1.251374705420267,
      "grad_norm": 0.03173828125,
      "learning_rate": 0.00013635079705638298,
      "loss": 0.9436046600341796,
      "mean_token_accuracy": 0.788494935259223,
      "num_tokens": 1500927.0,
      "step": 100
    },
    {
      "epoch": 1.251374705420267,
      "eval_entropy": 0.9404270783276625,
      "eval_loss": 0.9446325302124023,
      "eval_mean_token_accuracy": 0.7829881757497787,
      "eval_num_tokens": 1500927.0,
      "eval_runtime": 86.1079,
      "eval_samples_per_second": 1.649,
      "eval_steps_per_second": 1.649,
      "step": 100
    },
    {
      "entropy": 0.9173299714922905,
      "epoch": 1.3770620581304005,
      "grad_norm": 0.0233154296875,
      "learning_rate": 0.0001232105322409468,
      "loss": 0.9481925964355469,
      "mean_token_accuracy": 0.7930883213877677,
      "num_tokens": 1646678.0,
      "step": 110
    },
    {
      "entropy": 0.9073959412053227,
      "epoch": 1.5027494108405341,
      "grad_norm": 0.03076171875,
      "learning_rate": 0.00010963029250531418,
      "loss": 0.9351880073547363,
      "mean_token_accuracy": 0.7968914289027452,
      "num_tokens": 1799797.0,
      "step": 120
    },
    {
      "entropy": 0.9462396390736103,
      "epoch": 1.6284367635506678,
      "grad_norm": 0.029296875,
      "learning_rate": 9.586750257511867e-05,
      "loss": 0.9954720497131347,
      "mean_token_accuracy": 0.789978607185185,
      "num_tokens": 1958692.0,
      "step": 130
    },
    {
      "entropy": 0.9035223769024014,
      "epoch": 1.7541241162608012,
      "grad_norm": 0.02880859375,
      "learning_rate": 8.218304756658072e-05,
      "loss": 0.9473580360412598,
      "mean_token_accuracy": 0.7965094247832895,
      "num_tokens": 2098856.0,
      "step": 140
    },
    {
      "entropy": 0.9002945913001895,
      "epoch": 1.8798114689709347,
      "grad_norm": 0.035888671875,
      "learning_rate": 6.883632769240589e-05,
      "loss": 0.9326913833618165,
      "mean_token_accuracy": 0.800568002089858,
      "num_tokens": 2253582.0,
      "step": 150
    },
    {
      "entropy": 0.8597502765897053,
      "epoch": 2.0,
      "grad_norm": 0.031494140625,
      "learning_rate": 5.608034111526298e-05,
      "loss": 0.8665840148925781,
      "mean_token_accuracy": 0.8103327634287816,
      "num_tokens": 2402542.0,
      "step": 160
    },
    {
      "entropy": 0.8727964337915182,
      "epoch": 2.1256873527101336,
      "grad_norm": 0.03173828125,
      "learning_rate": 4.415688815743858e-05,
      "loss": 0.9112902641296386,
      "mean_token_accuracy": 0.8074018105864524,
      "num_tokens": 2554831.0,
      "step": 170
    },
    {
      "entropy": 0.8327508143149316,
      "epoch": 2.2513747054202673,
      "grad_norm": 0.03955078125,
      "learning_rate": 3.329198777485869e-05,
      "loss": 0.8397786140441894,
      "mean_token_accuracy": 0.816638495773077,
      "num_tokens": 2700898.0,
      "step": 180
    },
    {
      "entropy": 0.8549322345294058,
      "epoch": 2.3770620581304005,
      "grad_norm": 0.041015625,
      "learning_rate": 2.3691593180019366e-05,
      "loss": 0.8526265144348144,
      "mean_token_accuracy": 0.8113520180806517,
      "num_tokens": 2854824.0,
      "step": 190
    },
    {
      "entropy": 0.8102049398235976,
      "epoch": 2.502749410840534,
      "grad_norm": 0.06787109375,
      "learning_rate": 1.553768782775351e-05,
      "loss": 0.8202457427978516,
      "mean_token_accuracy": 0.8225593730807305,
      "num_tokens": 3003572.0,
      "step": 200
    },
    {
      "epoch": 2.502749410840534,
      "eval_entropy": 0.8503286918284187,
      "eval_loss": 0.8580905795097351,
      "eval_mean_token_accuracy": 0.8025841964802272,
      "eval_num_tokens": 3003572.0,
      "eval_runtime": 85.9551,
      "eval_samples_per_second": 1.652,
      "eval_steps_per_second": 1.652,
      "step": 200
    }
  ],
  "logging_steps": 10,
  "max_steps": 240,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 200,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 1.2965602468164403e+17,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}