{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 1000, "global_step": 713, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 0.7893231290578843, "epoch": 0.07012622720897616, "grad_norm": 1.1326932907104492, "learning_rate": 0.00018625525946704068, "loss": 0.883, "mean_token_accuracy": 0.8367647039890289, "num_tokens": 51200.0, "step": 50 }, { "entropy": 0.604653742313385, "epoch": 0.1402524544179523, "grad_norm": 1.4008318185806274, "learning_rate": 0.00017223001402524545, "loss": 0.6207, "mean_token_accuracy": 0.8560980391502381, "num_tokens": 102400.0, "step": 100 }, { "entropy": 0.552680384516716, "epoch": 0.21037868162692847, "grad_norm": 1.2125327587127686, "learning_rate": 0.00015820476858345023, "loss": 0.5688, "mean_token_accuracy": 0.8681372570991516, "num_tokens": 153600.0, "step": 150 }, { "entropy": 0.5870230662822723, "epoch": 0.2805049088359046, "grad_norm": 1.0003958940505981, "learning_rate": 0.000144179523141655, "loss": 0.6044, "mean_token_accuracy": 0.8610000038146972, "num_tokens": 204800.0, "step": 200 }, { "entropy": 0.5841648465394974, "epoch": 0.3506311360448808, "grad_norm": 1.2221473455429077, "learning_rate": 0.00013015427769985974, "loss": 0.5942, "mean_token_accuracy": 0.8643529415130615, "num_tokens": 256000.0, "step": 250 }, { "entropy": 0.5446552008390426, "epoch": 0.42075736325385693, "grad_norm": 0.8246455788612366, "learning_rate": 0.00011612903225806453, "loss": 0.5693, "mean_token_accuracy": 0.8707843124866486, "num_tokens": 307200.0, "step": 300 }, { "entropy": 0.5239044934511184, "epoch": 0.4908835904628331, "grad_norm": 1.022056221961975, "learning_rate": 0.00010210378681626929, "loss": 0.5433, "mean_token_accuracy": 0.8759215676784515, "num_tokens": 358400.0, "step": 350 }, { "entropy": 0.516560555100441, "epoch": 0.5610098176718092, "grad_norm": 1.1696139574050903, "learning_rate": 8.807854137447405e-05, "loss": 0.5267, "mean_token_accuracy": 0.877607843875885, "num_tokens": 409600.0, "step": 400 }, { "entropy": 0.5230421105027199, "epoch": 0.6311360448807855, "grad_norm": 0.8299864530563354, "learning_rate": 7.405329593267882e-05, "loss": 0.5232, "mean_token_accuracy": 0.879117648601532, "num_tokens": 460800.0, "step": 450 }, { "entropy": 0.5555241489410401, "epoch": 0.7012622720897616, "grad_norm": 1.1203019618988037, "learning_rate": 6.002805049088359e-05, "loss": 0.568, "mean_token_accuracy": 0.8681960797309876, "num_tokens": 512000.0, "step": 500 }, { "entropy": 0.5119614934921265, "epoch": 0.7713884992987378, "grad_norm": 1.3243024349212646, "learning_rate": 4.600280504908836e-05, "loss": 0.517, "mean_token_accuracy": 0.8799803924560546, "num_tokens": 563200.0, "step": 550 }, { "entropy": 0.4760155099630356, "epoch": 0.8415147265077139, "grad_norm": 0.8521648049354553, "learning_rate": 3.197755960729313e-05, "loss": 0.4854, "mean_token_accuracy": 0.8899803864955902, "num_tokens": 614400.0, "step": 600 }, { "entropy": 0.45049646943807603, "epoch": 0.9116409537166901, "grad_norm": 0.7379061579704285, "learning_rate": 1.7952314165497897e-05, "loss": 0.4515, "mean_token_accuracy": 0.8959019601345062, "num_tokens": 665600.0, "step": 650 }, { "entropy": 0.49310168087482453, "epoch": 0.9817671809256662, "grad_norm": 1.3646042346954346, "learning_rate": 3.927068723702665e-06, "loss": 0.508, "mean_token_accuracy": 0.8815882337093354, "num_tokens": 716800.0, "step": 700 } ], "logging_steps": 50, "max_steps": 713, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 464916248985600.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }