{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 100, "global_step": 111, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 1.418435550481081, "epoch": 0.27303754266211605, "grad_norm": 0.056396484375, "learning_rate": 0.00019959742939952392, "loss": 1.6161483764648437, "mean_token_accuracy": 0.6638647213578224, "num_tokens": 151209.0, "step": 10 }, { "entropy": 1.097722884826362, "epoch": 0.5460750853242321, "grad_norm": 0.033203125, "learning_rate": 0.00019253043004739968, "loss": 1.1661317825317383, "mean_token_accuracy": 0.7458787495270371, "num_tokens": 320046.0, "step": 20 }, { "entropy": 1.063631435856223, "epoch": 0.8191126279863481, "grad_norm": 0.0247802734375, "learning_rate": 0.00017724169592245995, "loss": 1.118991756439209, "mean_token_accuracy": 0.7535936305299401, "num_tokens": 492438.0, "step": 30 }, { "entropy": 0.9953024551852957, "epoch": 1.0819112627986347, "grad_norm": 0.0255126953125, "learning_rate": 0.00015508969814521025, "loss": 1.0578977584838867, "mean_token_accuracy": 0.7685656903626082, "num_tokens": 642939.0, "step": 40 }, { "entropy": 0.9422929083928466, "epoch": 1.3549488054607508, "grad_norm": 0.025390625, "learning_rate": 0.00012804273893060028, "loss": 1.0265376091003418, "mean_token_accuracy": 0.7775206623598934, "num_tokens": 817432.0, "step": 50 }, { "entropy": 0.9282504981383681, "epoch": 1.627986348122867, "grad_norm": 0.025390625, "learning_rate": 9.850405929847366e-05, "loss": 0.9812464714050293, "mean_token_accuracy": 0.7824548855423927, "num_tokens": 971956.0, "step": 60 }, { "entropy": 0.9165813697502017, "epoch": 1.901023890784983, "grad_norm": 0.028076171875, "learning_rate": 6.909830056250527e-05, "loss": 0.9443504333496093, "mean_token_accuracy": 0.7839378425851464, "num_tokens": 1134082.0, "step": 70 }, { "entropy": 0.946953180354911, "epoch": 2.1638225255972694, "grad_norm": 0.02880859375, "learning_rate": 4.2438293431432665e-05, "loss": 0.968726921081543, "mean_token_accuracy": 0.7822197033213331, "num_tokens": 1290177.0, "step": 80 }, { "entropy": 0.8450189002789557, "epoch": 2.4368600682593855, "grad_norm": 0.031982421875, "learning_rate": 2.0892896534365904e-05, "loss": 0.9016420364379882, "mean_token_accuracy": 0.7995872467756271, "num_tokens": 1461672.0, "step": 90 }, { "entropy": 0.8309150729328394, "epoch": 2.7098976109215016, "grad_norm": 0.037353515625, "learning_rate": 6.37651293602628e-06, "loss": 0.8841998100280761, "mean_token_accuracy": 0.8054205430671573, "num_tokens": 1624257.0, "step": 100 }, { "epoch": 2.7098976109215016, "eval_entropy": 0.9007232843926458, "eval_loss": 0.9238418340682983, "eval_mean_token_accuracy": 0.7843361054406022, "eval_num_tokens": 1624257.0, "eval_runtime": 42.5945, "eval_samples_per_second": 1.549, "eval_steps_per_second": 1.549, "step": 100 }, { "entropy": 0.8297194179147482, "epoch": 2.9829351535836177, "grad_norm": 0.0322265625, "learning_rate": 1.7898702322648453e-07, "loss": 0.8666117668151856, "mean_token_accuracy": 0.8052121920511126, "num_tokens": 1778979.0, "step": 110 } ], "logging_steps": 10, "max_steps": 111, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 7.754556700156723e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }