{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 160, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0625, "grad_norm": 0.4859062731266022, "learning_rate": 9.4375e-05, "loss": 2.4443, "mean_token_accuracy": 0.5860039107501507, "num_tokens": 40960.0, "step": 10 }, { "epoch": 0.125, "grad_norm": 0.3083125948905945, "learning_rate": 8.8125e-05, "loss": 0.0556, "mean_token_accuracy": 0.9846604824066162, "num_tokens": 81920.0, "step": 20 }, { "epoch": 0.1875, "grad_norm": 0.24934740364551544, "learning_rate": 8.1875e-05, "loss": 0.0563, "mean_token_accuracy": 0.9847581863403321, "num_tokens": 122880.0, "step": 30 }, { "epoch": 0.25, "grad_norm": 0.3064737915992737, "learning_rate": 7.5625e-05, "loss": 0.0497, "mean_token_accuracy": 0.9860527634620666, "num_tokens": 163840.0, "step": 40 }, { "epoch": 0.3125, "grad_norm": 0.26368093490600586, "learning_rate": 6.9375e-05, "loss": 0.0541, "mean_token_accuracy": 0.9845627784729004, "num_tokens": 204800.0, "step": 50 }, { "epoch": 0.375, "grad_norm": 0.26650479435920715, "learning_rate": 6.3125e-05, "loss": 0.0448, "mean_token_accuracy": 0.9867611169815064, "num_tokens": 245760.0, "step": 60 }, { "epoch": 0.4375, "grad_norm": 0.2567497491836548, "learning_rate": 5.6875e-05, "loss": 0.0488, "mean_token_accuracy": 0.9860039114952087, "num_tokens": 286720.0, "step": 70 }, { "epoch": 0.5, "grad_norm": 0.2256651371717453, "learning_rate": 5.0625e-05, "loss": 0.0488, "mean_token_accuracy": 0.9857840776443482, "num_tokens": 327680.0, "step": 80 }, { "epoch": 0.5625, "grad_norm": 0.26728323101997375, "learning_rate": 4.4375e-05, "loss": 0.0468, "mean_token_accuracy": 0.9865657091140747, "num_tokens": 368640.0, "step": 90 }, { "epoch": 0.5625, "eval_runtime": 16.058, "eval_samples_per_second": 4.982, "eval_steps_per_second": 0.311, "step": 90 }, { "epoch": 0.625, "grad_norm": 0.2592334449291229, "learning_rate": 3.8125e-05, "loss": 0.0475, "mean_token_accuracy": 0.9867366909980774, "num_tokens": 409600.0, "step": 100 }, { "epoch": 0.6875, "grad_norm": 0.2610887289047241, "learning_rate": 3.1875e-05, "loss": 0.0475, "mean_token_accuracy": 0.9867855429649353, "num_tokens": 450560.0, "step": 110 }, { "epoch": 0.75, "grad_norm": 0.2812255322933197, "learning_rate": 2.5625e-05, "loss": 0.044, "mean_token_accuracy": 0.9877381563186646, "num_tokens": 491520.0, "step": 120 }, { "epoch": 0.8125, "grad_norm": 0.2330714762210846, "learning_rate": 1.9375e-05, "loss": 0.0457, "mean_token_accuracy": 0.9867122650146485, "num_tokens": 532480.0, "step": 130 }, { "epoch": 0.875, "grad_norm": 0.2395644336938858, "learning_rate": 1.3125e-05, "loss": 0.0517, "mean_token_accuracy": 0.9855642437934875, "num_tokens": 573440.0, "step": 140 }, { "epoch": 0.9375, "grad_norm": 0.2232440710067749, "learning_rate": 6.875000000000001e-06, "loss": 0.0442, "mean_token_accuracy": 0.9875916004180908, "num_tokens": 614400.0, "step": 150 }, { "epoch": 1.0, "grad_norm": 0.2189246416091919, "learning_rate": 6.25e-07, "loss": 0.0419, "mean_token_accuracy": 0.9886174917221069, "num_tokens": 655360.0, "step": 160 }, { "epoch": 1.0, "eval_runtime": 15.1106, "eval_samples_per_second": 5.294, "eval_steps_per_second": 0.331, "step": 160 }, { "epoch": 1.0, "step": 160, "total_flos": 0.0, "train_loss": 0.19822477344423534, "train_runtime": 202.9526, "train_samples_per_second": 1.577, "train_steps_per_second": 0.788 } ], "logging_steps": 10, "max_steps": 160, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }