{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.23998080153587714, "eval_steps": 500, "global_step": 1000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 1.0767303490638733, "epoch": 0.023998080153587713, "grad_norm": 1.40625, "learning_rate": 7.857142857142858e-06, "loss": 0.7652, "mean_token_accuracy": 0.807009916305542, "num_tokens": 46104284.0, "step": 100 }, { "entropy": 1.011719313263893, "epoch": 0.04799616030717543, "grad_norm": 2.5, "learning_rate": 9.991950086669187e-06, "loss": 0.4576, "mean_token_accuracy": 0.853641785979271, "num_tokens": 92210808.0, "step": 200 }, { "entropy": 0.9967595022916794, "epoch": 0.07199424046076314, "grad_norm": 1.6875, "learning_rate": 9.954845660034937e-06, "loss": 0.4416, "mean_token_accuracy": 0.8567710411548615, "num_tokens": 138315654.0, "step": 300 }, { "entropy": 0.982376498579979, "epoch": 0.09599232061435085, "grad_norm": 1.640625, "learning_rate": 9.887809392638194e-06, "loss": 0.4328, "mean_token_accuracy": 0.8593817538022995, "num_tokens": 184416108.0, "step": 400 }, { "entropy": 0.9777479559183121, "epoch": 0.11999040076793857, "grad_norm": 1.046875, "learning_rate": 9.791246245403818e-06, "loss": 0.4293, "mean_token_accuracy": 0.8590555649995804, "num_tokens": 230522098.0, "step": 500 }, { "entropy": 0.9740712708234787, "epoch": 0.14398848092152627, "grad_norm": 5.875, "learning_rate": 9.665739548862132e-06, "loss": 0.4405, "mean_token_accuracy": 0.8574600327014923, "num_tokens": 276628106.0, "step": 600 }, { "entropy": 0.9753435063362121, "epoch": 0.16798656107511398, "grad_norm": 5.34375, "learning_rate": 9.512047479294147e-06, "loss": 0.4346, "mean_token_accuracy": 0.8599888670444489, "num_tokens": 322733236.0, "step": 700 }, { "entropy": 0.9697796589136124, "epoch": 0.1919846412287017, "grad_norm": 1.421875, "learning_rate": 9.331098478647084e-06, "loss": 0.4324, "mean_token_accuracy": 0.8604245519638062, "num_tokens": 368840126.0, "step": 800 }, { "entropy": 0.9738489526510239, "epoch": 0.2159827213822894, "grad_norm": 4.5625, "learning_rate": 9.123985645888116e-06, "loss": 0.423, "mean_token_accuracy": 0.8620242869853973, "num_tokens": 414942536.0, "step": 900 }, { "entropy": 0.9679786705970764, "epoch": 0.23998080153587714, "grad_norm": 4.34375, "learning_rate": 8.891960133677763e-06, "loss": 0.4212, "mean_token_accuracy": 0.8623434334993363, "num_tokens": 461046116.0, "step": 1000 } ], "logging_steps": 100, "max_steps": 4167, "num_input_tokens_seen": 0, "num_train_epochs": 9223372036854775807, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 5.620733471602246e+18, "train_batch_size": 30, "trial_name": null, "trial_params": null }