{ "best_global_step": 340, "best_metric": 0.7385169267654419, "best_model_checkpoint": "./mangoes/checkpoint-340", "epoch": 2.0, "eval_steps": 500, "global_step": 340, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.058823529411764705, "grad_norm": 1.9217854738235474, "learning_rate": 1.9470588235294118e-05, "loss": 2.044, "step": 10 }, { "epoch": 0.11764705882352941, "grad_norm": 3.201918363571167, "learning_rate": 1.888235294117647e-05, "loss": 2.0161, "step": 20 }, { "epoch": 0.17647058823529413, "grad_norm": 3.1314334869384766, "learning_rate": 1.8294117647058824e-05, "loss": 1.9021, "step": 30 }, { "epoch": 0.23529411764705882, "grad_norm": 2.9427032470703125, "learning_rate": 1.7705882352941177e-05, "loss": 1.8208, "step": 40 }, { "epoch": 0.29411764705882354, "grad_norm": 3.096381425857544, "learning_rate": 1.711764705882353e-05, "loss": 1.7553, "step": 50 }, { "epoch": 0.35294117647058826, "grad_norm": 3.1433303356170654, "learning_rate": 1.6529411764705883e-05, "loss": 1.7674, "step": 60 }, { "epoch": 0.4117647058823529, "grad_norm": 3.0195441246032715, "learning_rate": 1.594117647058824e-05, "loss": 1.6242, "step": 70 }, { "epoch": 0.47058823529411764, "grad_norm": 3.748368740081787, "learning_rate": 1.535294117647059e-05, "loss": 1.5804, "step": 80 }, { "epoch": 0.5294117647058824, "grad_norm": 2.8905935287475586, "learning_rate": 1.4764705882352944e-05, "loss": 1.5126, "step": 90 }, { "epoch": 0.5882352941176471, "grad_norm": 3.1870696544647217, "learning_rate": 1.4176470588235297e-05, "loss": 1.3781, "step": 100 }, { "epoch": 0.6470588235294118, "grad_norm": 3.208005428314209, "learning_rate": 1.3588235294117648e-05, "loss": 1.4037, "step": 110 }, { "epoch": 0.7058823529411765, "grad_norm": 4.6087236404418945, "learning_rate": 1.3000000000000001e-05, "loss": 1.2771, "step": 120 }, { "epoch": 0.7647058823529411, "grad_norm": 3.6908063888549805, "learning_rate": 1.2411764705882354e-05, "loss": 1.2711, "step": 130 }, { "epoch": 0.8235294117647058, "grad_norm": 3.6166765689849854, "learning_rate": 1.1823529411764707e-05, "loss": 1.192, "step": 140 }, { "epoch": 0.8823529411764706, "grad_norm": 3.6934988498687744, "learning_rate": 1.123529411764706e-05, "loss": 1.1566, "step": 150 }, { "epoch": 0.9411764705882353, "grad_norm": 3.789727210998535, "learning_rate": 1.0647058823529413e-05, "loss": 1.1063, "step": 160 }, { "epoch": 1.0, "grad_norm": 3.842630386352539, "learning_rate": 1.0058823529411766e-05, "loss": 1.0281, "step": 170 }, { "epoch": 1.0, "eval_accuracy": 0.9583333333333334, "eval_loss": 1.0490069389343262, "eval_runtime": 6.306, "eval_samples_per_second": 38.059, "eval_steps_per_second": 4.757, "step": 170 }, { "epoch": 1.0588235294117647, "grad_norm": 3.8409788608551025, "learning_rate": 9.470588235294119e-06, "loss": 1.0169, "step": 180 }, { "epoch": 1.1176470588235294, "grad_norm": 2.461111068725586, "learning_rate": 8.88235294117647e-06, "loss": 0.9592, "step": 190 }, { "epoch": 1.1764705882352942, "grad_norm": 3.7472541332244873, "learning_rate": 8.294117647058825e-06, "loss": 1.014, "step": 200 }, { "epoch": 1.2352941176470589, "grad_norm": 4.744520664215088, "learning_rate": 7.705882352941178e-06, "loss": 0.9485, "step": 210 }, { "epoch": 1.2941176470588236, "grad_norm": 2.4809184074401855, "learning_rate": 7.11764705882353e-06, "loss": 0.956, "step": 220 }, { "epoch": 1.3529411764705883, "grad_norm": 6.709966659545898, "learning_rate": 6.529411764705883e-06, "loss": 0.9707, "step": 230 }, { "epoch": 1.4117647058823528, "grad_norm": 4.961846828460693, "learning_rate": 5.941176470588236e-06, "loss": 0.8539, "step": 240 }, { "epoch": 1.4705882352941178, "grad_norm": 5.209068298339844, "learning_rate": 5.352941176470589e-06, "loss": 0.8284, "step": 250 }, { "epoch": 1.5294117647058822, "grad_norm": 3.826070547103882, "learning_rate": 4.764705882352941e-06, "loss": 0.8226, "step": 260 }, { "epoch": 1.5882352941176472, "grad_norm": 2.8872721195220947, "learning_rate": 4.176470588235295e-06, "loss": 0.7727, "step": 270 }, { "epoch": 1.6470588235294117, "grad_norm": 3.0581214427948, "learning_rate": 3.5882352941176475e-06, "loss": 0.7841, "step": 280 }, { "epoch": 1.7058823529411766, "grad_norm": 4.626227855682373, "learning_rate": 3e-06, "loss": 0.7934, "step": 290 }, { "epoch": 1.7647058823529411, "grad_norm": 2.622793436050415, "learning_rate": 2.411764705882353e-06, "loss": 0.7713, "step": 300 }, { "epoch": 1.8235294117647058, "grad_norm": 2.549530267715454, "learning_rate": 1.8235294117647058e-06, "loss": 0.7459, "step": 310 }, { "epoch": 1.8823529411764706, "grad_norm": 3.626901149749756, "learning_rate": 1.235294117647059e-06, "loss": 0.8056, "step": 320 }, { "epoch": 1.9411764705882353, "grad_norm": 2.356318950653076, "learning_rate": 6.470588235294118e-07, "loss": 0.7665, "step": 330 }, { "epoch": 2.0, "grad_norm": 4.176856517791748, "learning_rate": 5.882352941176471e-08, "loss": 0.7454, "step": 340 }, { "epoch": 2.0, "eval_accuracy": 0.9791666666666666, "eval_loss": 0.7385169267654419, "eval_runtime": 3.873, "eval_samples_per_second": 61.967, "eval_steps_per_second": 7.746, "step": 340 }, { "epoch": 2.0, "step": 340, "total_flos": 2.1078954658234368e+17, "train_loss": 1.1879772003959208, "train_runtime": 144.3943, "train_samples_per_second": 18.837, "train_steps_per_second": 2.355 } ], "logging_steps": 10, "max_steps": 340, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.1078954658234368e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null }