{ "best_metric": null, "best_model_checkpoint": null, "epoch": 11.764705882352942, "eval_steps": 20, "global_step": 200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.058823529411764705, "eval_loss": 2.5443625450134277, "eval_runtime": 14.3878, "eval_samples_per_second": 104.324, "eval_steps_per_second": 2.433, "step": 1 }, { "epoch": 0.5882352941176471, "grad_norm": 2.671875, "learning_rate": 0.0001, "loss": 2.2541, "step": 10 }, { "epoch": 1.1764705882352942, "grad_norm": 3.65625, "learning_rate": 0.0002, "loss": 2.0275, "step": 20 }, { "epoch": 1.1764705882352942, "eval_loss": 2.187530517578125, "eval_runtime": 13.8986, "eval_samples_per_second": 107.996, "eval_steps_per_second": 2.518, "step": 20 }, { "epoch": 1.7647058823529411, "grad_norm": 2.15625, "learning_rate": 0.000199658449300667, "loss": 1.5265, "step": 30 }, { "epoch": 2.3529411764705883, "grad_norm": 2.40625, "learning_rate": 0.00019863613034027224, "loss": 1.1034, "step": 40 }, { "epoch": 2.3529411764705883, "eval_loss": 2.58608341217041, "eval_runtime": 14.5728, "eval_samples_per_second": 103.0, "eval_steps_per_second": 2.402, "step": 40 }, { "epoch": 2.9411764705882355, "grad_norm": 3.078125, "learning_rate": 0.00019694002659393305, "loss": 0.8112, "step": 50 }, { "epoch": 3.5294117647058822, "grad_norm": 3.6875, "learning_rate": 0.00019458172417006347, "loss": 0.3839, "step": 60 }, { "epoch": 3.5294117647058822, "eval_loss": 2.9881904125213623, "eval_runtime": 15.7638, "eval_samples_per_second": 95.218, "eval_steps_per_second": 2.22, "step": 60 }, { "epoch": 4.117647058823529, "grad_norm": 1.3125, "learning_rate": 0.00019157733266550575, "loss": 0.3102, "step": 70 }, { "epoch": 4.705882352941177, "grad_norm": 1.6171875, "learning_rate": 0.0001879473751206489, "loss": 0.1897, "step": 80 }, { "epoch": 4.705882352941177, "eval_loss": 3.129960536956787, "eval_runtime": 14.6965, "eval_samples_per_second": 102.133, "eval_steps_per_second": 2.382, "step": 80 }, { "epoch": 5.294117647058823, "grad_norm": 1.25, "learning_rate": 0.00018371664782625287, "loss": 0.1821, "step": 90 }, { "epoch": 5.882352941176471, "grad_norm": 1.3828125, "learning_rate": 0.00017891405093963938, "loss": 0.1432, "step": 100 }, { "epoch": 5.882352941176471, "eval_loss": 3.274164915084839, "eval_runtime": 15.1902, "eval_samples_per_second": 98.814, "eval_steps_per_second": 2.304, "step": 100 }, { "epoch": 6.470588235294118, "grad_norm": 1.1796875, "learning_rate": 0.00017357239106731317, "loss": 0.1188, "step": 110 }, { "epoch": 7.0588235294117645, "grad_norm": 0.78125, "learning_rate": 0.00016772815716257412, "loss": 0.1193, "step": 120 }, { "epoch": 7.0588235294117645, "eval_loss": 3.397793769836426, "eval_runtime": 14.573, "eval_samples_per_second": 102.998, "eval_steps_per_second": 2.402, "step": 120 }, { "epoch": 7.647058823529412, "grad_norm": 1.0546875, "learning_rate": 0.0001614212712689668, "loss": 0.0733, "step": 130 }, { "epoch": 8.235294117647058, "grad_norm": 0.69921875, "learning_rate": 0.00015469481581224272, "loss": 0.0776, "step": 140 }, { "epoch": 8.235294117647058, "eval_loss": 3.6323835849761963, "eval_runtime": 14.197, "eval_samples_per_second": 105.726, "eval_steps_per_second": 2.465, "step": 140 }, { "epoch": 8.823529411764707, "grad_norm": 0.76171875, "learning_rate": 0.00014759473930370736, "loss": 0.0647, "step": 150 }, { "epoch": 9.411764705882353, "grad_norm": 0.67578125, "learning_rate": 0.00014016954246529696, "loss": 0.0456, "step": 160 }, { "epoch": 9.411764705882353, "eval_loss": 3.7919914722442627, "eval_runtime": 15.5596, "eval_samples_per_second": 96.468, "eval_steps_per_second": 2.249, "step": 160 }, { "epoch": 10.0, "grad_norm": 0.62109375, "learning_rate": 0.00013246994692046836, "loss": 0.0373, "step": 170 }, { "epoch": 10.588235294117647, "grad_norm": 0.349609375, "learning_rate": 0.00012454854871407994, "loss": 0.0152, "step": 180 }, { "epoch": 10.588235294117647, "eval_loss": 3.9090092182159424, "eval_runtime": 15.4377, "eval_samples_per_second": 97.23, "eval_steps_per_second": 2.267, "step": 180 }, { "epoch": 11.176470588235293, "grad_norm": 0.134765625, "learning_rate": 0.00011645945902807341, "loss": 0.0121, "step": 190 }, { "epoch": 11.764705882352942, "grad_norm": 0.1884765625, "learning_rate": 0.00010825793454723325, "loss": 0.0051, "step": 200 }, { "epoch": 11.764705882352942, "eval_loss": 4.041390419006348, "eval_runtime": 14.6874, "eval_samples_per_second": 102.197, "eval_steps_per_second": 2.383, "step": 200 } ], "logging_steps": 10, "max_steps": 400, "num_input_tokens_seen": 0, "num_train_epochs": 24, "save_steps": 20, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.0433372297468314e+17, "train_batch_size": 11, "trial_name": null, "trial_params": null }