tapt_base_LR-2e-05 / trainer_state.json
Mardiyyah's picture
End of training
d1f1ce6 verified
{
"best_metric": 1.8658331632614136,
"best_model_checkpoint": "/nfs/production/literature/amina-mardiyyah/new_data/OT-Entity-Extraction-Pipeline/model_outputs/Continued_pretraining/TAPT/bioformers/bioformer-16L/Mardiyyah/TAPT_data_V2_split/tapt_base_LR-2e-05/checkpoint-255",
"epoch": 49.94117647058823,
"eval_steps": 1,
"global_step": 450,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.10457516339869281,
"eval_loss": 2.221620798110962,
"eval_runtime": 2.4071,
"eval_samples_per_second": 808.035,
"eval_steps_per_second": 12.879,
"step": 1
},
{
"epoch": 0.20915032679738563,
"eval_loss": 2.1887502670288086,
"eval_runtime": 2.4126,
"eval_samples_per_second": 806.196,
"eval_steps_per_second": 12.849,
"step": 2
},
{
"epoch": 0.3137254901960784,
"eval_loss": 2.173266887664795,
"eval_runtime": 2.4488,
"eval_samples_per_second": 794.257,
"eval_steps_per_second": 12.659,
"step": 3
},
{
"epoch": 0.41830065359477125,
"eval_loss": 2.1297478675842285,
"eval_runtime": 2.3778,
"eval_samples_per_second": 817.994,
"eval_steps_per_second": 13.037,
"step": 4
},
{
"epoch": 0.5228758169934641,
"eval_loss": 2.192237377166748,
"eval_runtime": 2.3912,
"eval_samples_per_second": 813.415,
"eval_steps_per_second": 12.964,
"step": 5
},
{
"epoch": 0.6274509803921569,
"eval_loss": 2.1468276977539062,
"eval_runtime": 2.3902,
"eval_samples_per_second": 813.739,
"eval_steps_per_second": 12.97,
"step": 6
},
{
"epoch": 0.7320261437908496,
"eval_loss": 2.1432690620422363,
"eval_runtime": 2.3678,
"eval_samples_per_second": 821.446,
"eval_steps_per_second": 13.092,
"step": 7
},
{
"epoch": 0.8366013071895425,
"eval_loss": 2.0927038192749023,
"eval_runtime": 2.3704,
"eval_samples_per_second": 820.552,
"eval_steps_per_second": 13.078,
"step": 8
},
{
"epoch": 0.9411764705882353,
"eval_loss": 2.124303102493286,
"eval_runtime": 2.4224,
"eval_samples_per_second": 802.925,
"eval_steps_per_second": 12.797,
"step": 9
},
{
"epoch": 0.9411764705882353,
"grad_norm": 2.930349588394165,
"learning_rate": 6.666666666666667e-06,
"loss": 2.4847,
"step": 9
},
{
"epoch": 1.1045751633986929,
"eval_loss": 2.1266961097717285,
"eval_runtime": 2.3993,
"eval_samples_per_second": 810.647,
"eval_steps_per_second": 12.92,
"step": 10
},
{
"epoch": 1.2091503267973855,
"eval_loss": 2.081995964050293,
"eval_runtime": 2.4023,
"eval_samples_per_second": 809.627,
"eval_steps_per_second": 12.904,
"step": 11
},
{
"epoch": 1.3137254901960784,
"eval_loss": 2.0737693309783936,
"eval_runtime": 2.4011,
"eval_samples_per_second": 810.042,
"eval_steps_per_second": 12.911,
"step": 12
},
{
"epoch": 1.4183006535947713,
"eval_loss": 2.0427086353302,
"eval_runtime": 2.4076,
"eval_samples_per_second": 807.874,
"eval_steps_per_second": 12.876,
"step": 13
},
{
"epoch": 1.522875816993464,
"eval_loss": 2.056819200515747,
"eval_runtime": 2.3991,
"eval_samples_per_second": 810.727,
"eval_steps_per_second": 12.922,
"step": 14
},
{
"epoch": 1.6274509803921569,
"eval_loss": 2.083451747894287,
"eval_runtime": 2.4323,
"eval_samples_per_second": 799.665,
"eval_steps_per_second": 12.745,
"step": 15
},
{
"epoch": 1.7320261437908497,
"eval_loss": 2.078913450241089,
"eval_runtime": 2.4756,
"eval_samples_per_second": 785.669,
"eval_steps_per_second": 12.522,
"step": 16
},
{
"epoch": 1.8366013071895426,
"eval_loss": 2.067417621612549,
"eval_runtime": 2.438,
"eval_samples_per_second": 797.798,
"eval_steps_per_second": 12.716,
"step": 17
},
{
"epoch": 1.9411764705882353,
"eval_loss": 2.0401482582092285,
"eval_runtime": 2.4764,
"eval_samples_per_second": 785.419,
"eval_steps_per_second": 12.518,
"step": 18
},
{
"epoch": 1.9411764705882353,
"grad_norm": 1.9967031478881836,
"learning_rate": 1.3333333333333333e-05,
"loss": 2.4101,
"step": 18
},
{
"epoch": 2.104575163398693,
"eval_loss": 2.0421407222747803,
"eval_runtime": 2.4395,
"eval_samples_per_second": 797.31,
"eval_steps_per_second": 12.708,
"step": 19
},
{
"epoch": 2.2091503267973858,
"eval_loss": 2.0762155055999756,
"eval_runtime": 2.4888,
"eval_samples_per_second": 781.507,
"eval_steps_per_second": 12.456,
"step": 20
},
{
"epoch": 2.313725490196078,
"eval_loss": 2.006462574005127,
"eval_runtime": 2.5161,
"eval_samples_per_second": 773.026,
"eval_steps_per_second": 12.321,
"step": 21
},
{
"epoch": 2.418300653594771,
"eval_loss": 2.0763015747070312,
"eval_runtime": 2.4587,
"eval_samples_per_second": 791.08,
"eval_steps_per_second": 12.608,
"step": 22
},
{
"epoch": 2.522875816993464,
"eval_loss": 2.0424351692199707,
"eval_runtime": 2.4605,
"eval_samples_per_second": 790.5,
"eval_steps_per_second": 12.599,
"step": 23
},
{
"epoch": 2.627450980392157,
"eval_loss": 2.031003952026367,
"eval_runtime": 2.4119,
"eval_samples_per_second": 806.408,
"eval_steps_per_second": 12.853,
"step": 24
},
{
"epoch": 2.7320261437908497,
"eval_loss": 2.0873942375183105,
"eval_runtime": 2.4077,
"eval_samples_per_second": 807.816,
"eval_steps_per_second": 12.875,
"step": 25
},
{
"epoch": 2.8366013071895426,
"eval_loss": 2.0235297679901123,
"eval_runtime": 2.3945,
"eval_samples_per_second": 812.27,
"eval_steps_per_second": 12.946,
"step": 26
},
{
"epoch": 2.9411764705882355,
"eval_loss": 2.059739589691162,
"eval_runtime": 2.4114,
"eval_samples_per_second": 806.602,
"eval_steps_per_second": 12.856,
"step": 27
},
{
"epoch": 2.9411764705882355,
"grad_norm": 2.4962875843048096,
"learning_rate": 2e-05,
"loss": 2.3677,
"step": 27
},
{
"epoch": 3.104575163398693,
"eval_loss": 1.986527442932129,
"eval_runtime": 2.3999,
"eval_samples_per_second": 810.462,
"eval_steps_per_second": 12.917,
"step": 28
},
{
"epoch": 3.2091503267973858,
"eval_loss": 2.0295257568359375,
"eval_runtime": 2.4114,
"eval_samples_per_second": 806.573,
"eval_steps_per_second": 12.855,
"step": 29
},
{
"epoch": 3.313725490196078,
"eval_loss": 2.029600143432617,
"eval_runtime": 2.3867,
"eval_samples_per_second": 814.924,
"eval_steps_per_second": 12.989,
"step": 30
},
{
"epoch": 3.418300653594771,
"eval_loss": 2.0018720626831055,
"eval_runtime": 2.3894,
"eval_samples_per_second": 814.014,
"eval_steps_per_second": 12.974,
"step": 31
},
{
"epoch": 3.522875816993464,
"eval_loss": 1.9695795774459839,
"eval_runtime": 2.3919,
"eval_samples_per_second": 813.157,
"eval_steps_per_second": 12.96,
"step": 32
},
{
"epoch": 3.627450980392157,
"eval_loss": 2.0265488624572754,
"eval_runtime": 2.426,
"eval_samples_per_second": 801.734,
"eval_steps_per_second": 12.778,
"step": 33
},
{
"epoch": 3.7320261437908497,
"eval_loss": 2.010695457458496,
"eval_runtime": 2.429,
"eval_samples_per_second": 800.731,
"eval_steps_per_second": 12.762,
"step": 34
},
{
"epoch": 3.8366013071895426,
"eval_loss": 2.034428119659424,
"eval_runtime": 2.4086,
"eval_samples_per_second": 807.51,
"eval_steps_per_second": 12.87,
"step": 35
},
{
"epoch": 3.9411764705882355,
"eval_loss": 2.0281381607055664,
"eval_runtime": 2.4118,
"eval_samples_per_second": 806.449,
"eval_steps_per_second": 12.853,
"step": 36
},
{
"epoch": 3.9411764705882355,
"grad_norm": 1.8768209218978882,
"learning_rate": 1.9574468085106384e-05,
"loss": 2.2639,
"step": 36
},
{
"epoch": 4.104575163398692,
"eval_loss": 2.0171053409576416,
"eval_runtime": 2.4083,
"eval_samples_per_second": 807.623,
"eval_steps_per_second": 12.872,
"step": 37
},
{
"epoch": 4.209150326797386,
"eval_loss": 2.0344126224517822,
"eval_runtime": 2.4108,
"eval_samples_per_second": 806.785,
"eval_steps_per_second": 12.859,
"step": 38
},
{
"epoch": 4.313725490196078,
"eval_loss": 1.9913954734802246,
"eval_runtime": 2.3858,
"eval_samples_per_second": 815.248,
"eval_steps_per_second": 12.994,
"step": 39
},
{
"epoch": 4.4183006535947715,
"eval_loss": 1.9855905771255493,
"eval_runtime": 2.3822,
"eval_samples_per_second": 816.455,
"eval_steps_per_second": 13.013,
"step": 40
},
{
"epoch": 4.522875816993464,
"eval_loss": 2.0357260704040527,
"eval_runtime": 2.3873,
"eval_samples_per_second": 814.74,
"eval_steps_per_second": 12.986,
"step": 41
},
{
"epoch": 4.627450980392156,
"eval_loss": 2.028900384902954,
"eval_runtime": 2.4177,
"eval_samples_per_second": 804.49,
"eval_steps_per_second": 12.822,
"step": 42
},
{
"epoch": 4.73202614379085,
"eval_loss": 1.9714045524597168,
"eval_runtime": 2.3849,
"eval_samples_per_second": 815.537,
"eval_steps_per_second": 12.998,
"step": 43
},
{
"epoch": 4.836601307189542,
"eval_loss": 1.9895257949829102,
"eval_runtime": 2.4769,
"eval_samples_per_second": 785.249,
"eval_steps_per_second": 12.516,
"step": 44
},
{
"epoch": 4.9411764705882355,
"eval_loss": 1.990486741065979,
"eval_runtime": 2.4263,
"eval_samples_per_second": 801.627,
"eval_steps_per_second": 12.777,
"step": 45
},
{
"epoch": 4.9411764705882355,
"grad_norm": 1.8783236742019653,
"learning_rate": 1.914893617021277e-05,
"loss": 2.2037,
"step": 45
},
{
"epoch": 5.104575163398692,
"eval_loss": 1.9589457511901855,
"eval_runtime": 2.4045,
"eval_samples_per_second": 808.887,
"eval_steps_per_second": 12.892,
"step": 46
},
{
"epoch": 5.209150326797386,
"eval_loss": 1.9864917993545532,
"eval_runtime": 2.4112,
"eval_samples_per_second": 806.648,
"eval_steps_per_second": 12.857,
"step": 47
},
{
"epoch": 5.313725490196078,
"eval_loss": 2.0113699436187744,
"eval_runtime": 2.4027,
"eval_samples_per_second": 809.512,
"eval_steps_per_second": 12.902,
"step": 48
},
{
"epoch": 5.4183006535947715,
"eval_loss": 2.0007834434509277,
"eval_runtime": 2.4133,
"eval_samples_per_second": 805.942,
"eval_steps_per_second": 12.845,
"step": 49
},
{
"epoch": 5.522875816993464,
"eval_loss": 1.9577592611312866,
"eval_runtime": 2.3791,
"eval_samples_per_second": 817.533,
"eval_steps_per_second": 13.03,
"step": 50
},
{
"epoch": 5.627450980392156,
"eval_loss": 2.029423236846924,
"eval_runtime": 2.3902,
"eval_samples_per_second": 813.74,
"eval_steps_per_second": 12.97,
"step": 51
},
{
"epoch": 5.73202614379085,
"eval_loss": 1.9585332870483398,
"eval_runtime": 2.4118,
"eval_samples_per_second": 806.436,
"eval_steps_per_second": 12.853,
"step": 52
},
{
"epoch": 5.836601307189542,
"eval_loss": 1.9783401489257812,
"eval_runtime": 2.4264,
"eval_samples_per_second": 801.594,
"eval_steps_per_second": 12.776,
"step": 53
},
{
"epoch": 5.9411764705882355,
"eval_loss": 1.9880473613739014,
"eval_runtime": 2.488,
"eval_samples_per_second": 781.753,
"eval_steps_per_second": 12.46,
"step": 54
},
{
"epoch": 5.9411764705882355,
"grad_norm": 1.8594753742218018,
"learning_rate": 1.872340425531915e-05,
"loss": 2.16,
"step": 54
},
{
"epoch": 6.104575163398692,
"eval_loss": 2.0060460567474365,
"eval_runtime": 2.4336,
"eval_samples_per_second": 799.234,
"eval_steps_per_second": 12.738,
"step": 55
},
{
"epoch": 6.209150326797386,
"eval_loss": 1.9557570219039917,
"eval_runtime": 2.4097,
"eval_samples_per_second": 807.143,
"eval_steps_per_second": 12.864,
"step": 56
},
{
"epoch": 6.313725490196078,
"eval_loss": 1.9664386510849,
"eval_runtime": 2.4059,
"eval_samples_per_second": 808.441,
"eval_steps_per_second": 12.885,
"step": 57
},
{
"epoch": 6.4183006535947715,
"eval_loss": 1.920135498046875,
"eval_runtime": 2.4056,
"eval_samples_per_second": 808.538,
"eval_steps_per_second": 12.887,
"step": 58
},
{
"epoch": 6.522875816993464,
"eval_loss": 1.9815952777862549,
"eval_runtime": 2.4122,
"eval_samples_per_second": 806.325,
"eval_steps_per_second": 12.851,
"step": 59
},
{
"epoch": 6.627450980392156,
"eval_loss": 1.9681768417358398,
"eval_runtime": 2.3867,
"eval_samples_per_second": 814.941,
"eval_steps_per_second": 12.989,
"step": 60
},
{
"epoch": 6.73202614379085,
"eval_loss": 1.9605098962783813,
"eval_runtime": 2.3875,
"eval_samples_per_second": 814.648,
"eval_steps_per_second": 12.984,
"step": 61
},
{
"epoch": 6.836601307189542,
"eval_loss": 1.9233237504959106,
"eval_runtime": 2.385,
"eval_samples_per_second": 815.51,
"eval_steps_per_second": 12.998,
"step": 62
},
{
"epoch": 6.9411764705882355,
"eval_loss": 1.9687212705612183,
"eval_runtime": 2.3854,
"eval_samples_per_second": 815.361,
"eval_steps_per_second": 12.995,
"step": 63
},
{
"epoch": 6.9411764705882355,
"grad_norm": 1.886400818824768,
"learning_rate": 1.8297872340425533e-05,
"loss": 2.1108,
"step": 63
},
{
"epoch": 7.104575163398692,
"eval_loss": 1.9986543655395508,
"eval_runtime": 2.4144,
"eval_samples_per_second": 805.581,
"eval_steps_per_second": 12.84,
"step": 64
},
{
"epoch": 7.209150326797386,
"eval_loss": 2.002251386642456,
"eval_runtime": 2.4255,
"eval_samples_per_second": 801.892,
"eval_steps_per_second": 12.781,
"step": 65
},
{
"epoch": 7.313725490196078,
"eval_loss": 1.9626870155334473,
"eval_runtime": 2.4235,
"eval_samples_per_second": 802.565,
"eval_steps_per_second": 12.792,
"step": 66
},
{
"epoch": 7.4183006535947715,
"eval_loss": 2.0214684009552,
"eval_runtime": 2.4109,
"eval_samples_per_second": 806.764,
"eval_steps_per_second": 12.858,
"step": 67
},
{
"epoch": 7.522875816993464,
"eval_loss": 1.961344599723816,
"eval_runtime": 2.4317,
"eval_samples_per_second": 799.86,
"eval_steps_per_second": 12.748,
"step": 68
},
{
"epoch": 7.627450980392156,
"eval_loss": 2.026102066040039,
"eval_runtime": 2.4027,
"eval_samples_per_second": 809.502,
"eval_steps_per_second": 12.902,
"step": 69
},
{
"epoch": 7.73202614379085,
"eval_loss": 1.9625698328018188,
"eval_runtime": 2.3794,
"eval_samples_per_second": 817.425,
"eval_steps_per_second": 13.028,
"step": 70
},
{
"epoch": 7.836601307189542,
"eval_loss": 2.000683546066284,
"eval_runtime": 2.3828,
"eval_samples_per_second": 816.278,
"eval_steps_per_second": 13.01,
"step": 71
},
{
"epoch": 7.9411764705882355,
"eval_loss": 1.9403586387634277,
"eval_runtime": 2.383,
"eval_samples_per_second": 816.21,
"eval_steps_per_second": 13.009,
"step": 72
},
{
"epoch": 7.9411764705882355,
"grad_norm": 3.798304319381714,
"learning_rate": 1.7872340425531915e-05,
"loss": 2.0949,
"step": 72
},
{
"epoch": 8.104575163398692,
"eval_loss": 1.994275450706482,
"eval_runtime": 2.3862,
"eval_samples_per_second": 815.09,
"eval_steps_per_second": 12.991,
"step": 73
},
{
"epoch": 8.209150326797385,
"eval_loss": 2.0442616939544678,
"eval_runtime": 2.3871,
"eval_samples_per_second": 814.81,
"eval_steps_per_second": 12.987,
"step": 74
},
{
"epoch": 8.313725490196079,
"eval_loss": 1.99091637134552,
"eval_runtime": 2.3849,
"eval_samples_per_second": 815.556,
"eval_steps_per_second": 12.999,
"step": 75
},
{
"epoch": 8.418300653594772,
"eval_loss": 1.9789609909057617,
"eval_runtime": 2.4032,
"eval_samples_per_second": 809.347,
"eval_steps_per_second": 12.9,
"step": 76
},
{
"epoch": 8.522875816993464,
"eval_loss": 1.9505332708358765,
"eval_runtime": 2.457,
"eval_samples_per_second": 791.608,
"eval_steps_per_second": 12.617,
"step": 77
},
{
"epoch": 8.627450980392156,
"eval_loss": 1.9477442502975464,
"eval_runtime": 2.4164,
"eval_samples_per_second": 804.91,
"eval_steps_per_second": 12.829,
"step": 78
},
{
"epoch": 8.732026143790849,
"eval_loss": 2.027162790298462,
"eval_runtime": 2.4059,
"eval_samples_per_second": 808.413,
"eval_steps_per_second": 12.885,
"step": 79
},
{
"epoch": 8.836601307189543,
"eval_loss": 1.954852819442749,
"eval_runtime": 2.4078,
"eval_samples_per_second": 807.8,
"eval_steps_per_second": 12.875,
"step": 80
},
{
"epoch": 8.941176470588236,
"eval_loss": 1.9641313552856445,
"eval_runtime": 2.4106,
"eval_samples_per_second": 806.865,
"eval_steps_per_second": 12.86,
"step": 81
},
{
"epoch": 8.941176470588236,
"grad_norm": 1.9192023277282715,
"learning_rate": 1.74468085106383e-05,
"loss": 2.0617,
"step": 81
},
{
"epoch": 9.104575163398692,
"eval_loss": 1.9859141111373901,
"eval_runtime": 2.3909,
"eval_samples_per_second": 813.488,
"eval_steps_per_second": 12.966,
"step": 82
},
{
"epoch": 9.209150326797385,
"eval_loss": 1.937601089477539,
"eval_runtime": 2.3889,
"eval_samples_per_second": 814.196,
"eval_steps_per_second": 12.977,
"step": 83
},
{
"epoch": 9.313725490196079,
"eval_loss": 1.9699262380599976,
"eval_runtime": 2.3832,
"eval_samples_per_second": 816.116,
"eval_steps_per_second": 13.008,
"step": 84
},
{
"epoch": 9.418300653594772,
"eval_loss": 1.9334497451782227,
"eval_runtime": 2.4366,
"eval_samples_per_second": 798.247,
"eval_steps_per_second": 12.723,
"step": 85
},
{
"epoch": 9.522875816993464,
"eval_loss": 1.9708276987075806,
"eval_runtime": 2.3835,
"eval_samples_per_second": 816.03,
"eval_steps_per_second": 13.006,
"step": 86
},
{
"epoch": 9.627450980392156,
"eval_loss": 1.970037817955017,
"eval_runtime": 2.4674,
"eval_samples_per_second": 788.282,
"eval_steps_per_second": 12.564,
"step": 87
},
{
"epoch": 9.732026143790849,
"eval_loss": 1.9634466171264648,
"eval_runtime": 2.4497,
"eval_samples_per_second": 793.975,
"eval_steps_per_second": 12.655,
"step": 88
},
{
"epoch": 9.836601307189543,
"eval_loss": 1.92203688621521,
"eval_runtime": 2.4109,
"eval_samples_per_second": 806.761,
"eval_steps_per_second": 12.858,
"step": 89
},
{
"epoch": 9.941176470588236,
"eval_loss": 1.966900110244751,
"eval_runtime": 2.4107,
"eval_samples_per_second": 806.822,
"eval_steps_per_second": 12.859,
"step": 90
},
{
"epoch": 9.941176470588236,
"grad_norm": 2.050672769546509,
"learning_rate": 1.7021276595744682e-05,
"loss": 2.0509,
"step": 90
},
{
"epoch": 10.104575163398692,
"eval_loss": 1.956833004951477,
"eval_runtime": 2.3947,
"eval_samples_per_second": 812.203,
"eval_steps_per_second": 12.945,
"step": 91
},
{
"epoch": 10.209150326797385,
"eval_loss": 1.9699444770812988,
"eval_runtime": 2.3837,
"eval_samples_per_second": 815.943,
"eval_steps_per_second": 13.005,
"step": 92
},
{
"epoch": 10.313725490196079,
"eval_loss": 2.0316123962402344,
"eval_runtime": 2.3823,
"eval_samples_per_second": 816.431,
"eval_steps_per_second": 13.013,
"step": 93
},
{
"epoch": 10.418300653594772,
"eval_loss": 1.912984848022461,
"eval_runtime": 2.3869,
"eval_samples_per_second": 814.86,
"eval_steps_per_second": 12.987,
"step": 94
},
{
"epoch": 10.522875816993464,
"eval_loss": 1.9707229137420654,
"eval_runtime": 2.3837,
"eval_samples_per_second": 815.942,
"eval_steps_per_second": 13.005,
"step": 95
},
{
"epoch": 10.627450980392156,
"eval_loss": 1.9623687267303467,
"eval_runtime": 2.455,
"eval_samples_per_second": 792.269,
"eval_steps_per_second": 12.627,
"step": 96
},
{
"epoch": 10.732026143790849,
"eval_loss": 1.9515836238861084,
"eval_runtime": 2.4477,
"eval_samples_per_second": 794.62,
"eval_steps_per_second": 12.665,
"step": 97
},
{
"epoch": 10.836601307189543,
"eval_loss": 1.9508367776870728,
"eval_runtime": 2.407,
"eval_samples_per_second": 808.071,
"eval_steps_per_second": 12.879,
"step": 98
},
{
"epoch": 10.941176470588236,
"eval_loss": 1.9166395664215088,
"eval_runtime": 2.4193,
"eval_samples_per_second": 803.965,
"eval_steps_per_second": 12.814,
"step": 99
},
{
"epoch": 10.941176470588236,
"grad_norm": 1.911039113998413,
"learning_rate": 1.6595744680851064e-05,
"loss": 1.9835,
"step": 99
},
{
"epoch": 11.104575163398692,
"eval_loss": 1.9469496011734009,
"eval_runtime": 2.4171,
"eval_samples_per_second": 804.693,
"eval_steps_per_second": 12.825,
"step": 100
},
{
"epoch": 11.209150326797385,
"eval_loss": 1.962018609046936,
"eval_runtime": 2.3932,
"eval_samples_per_second": 812.727,
"eval_steps_per_second": 12.953,
"step": 101
},
{
"epoch": 11.313725490196079,
"eval_loss": 1.94699227809906,
"eval_runtime": 2.3789,
"eval_samples_per_second": 817.61,
"eval_steps_per_second": 13.031,
"step": 102
},
{
"epoch": 11.418300653594772,
"eval_loss": 1.945833444595337,
"eval_runtime": 2.3834,
"eval_samples_per_second": 816.059,
"eval_steps_per_second": 13.007,
"step": 103
},
{
"epoch": 11.522875816993464,
"eval_loss": 1.9585113525390625,
"eval_runtime": 2.3811,
"eval_samples_per_second": 816.833,
"eval_steps_per_second": 13.019,
"step": 104
},
{
"epoch": 11.627450980392156,
"eval_loss": 1.9450502395629883,
"eval_runtime": 2.381,
"eval_samples_per_second": 816.892,
"eval_steps_per_second": 13.02,
"step": 105
},
{
"epoch": 11.732026143790849,
"eval_loss": 1.9202919006347656,
"eval_runtime": 2.4348,
"eval_samples_per_second": 798.83,
"eval_steps_per_second": 12.732,
"step": 106
},
{
"epoch": 11.836601307189543,
"eval_loss": 1.9322612285614014,
"eval_runtime": 2.4503,
"eval_samples_per_second": 793.773,
"eval_steps_per_second": 12.651,
"step": 107
},
{
"epoch": 11.941176470588236,
"eval_loss": 1.9641361236572266,
"eval_runtime": 2.4059,
"eval_samples_per_second": 808.427,
"eval_steps_per_second": 12.885,
"step": 108
},
{
"epoch": 11.941176470588236,
"grad_norm": 1.9469199180603027,
"learning_rate": 1.6170212765957446e-05,
"loss": 1.9719,
"step": 108
},
{
"epoch": 12.104575163398692,
"eval_loss": 1.9262347221374512,
"eval_runtime": 2.4058,
"eval_samples_per_second": 808.471,
"eval_steps_per_second": 12.886,
"step": 109
},
{
"epoch": 12.209150326797385,
"eval_loss": 1.9799877405166626,
"eval_runtime": 2.4101,
"eval_samples_per_second": 807.019,
"eval_steps_per_second": 12.863,
"step": 110
},
{
"epoch": 12.313725490196079,
"eval_loss": 1.9421709775924683,
"eval_runtime": 2.3763,
"eval_samples_per_second": 818.512,
"eval_steps_per_second": 13.046,
"step": 111
},
{
"epoch": 12.418300653594772,
"eval_loss": 1.9286293983459473,
"eval_runtime": 2.4023,
"eval_samples_per_second": 809.639,
"eval_steps_per_second": 12.904,
"step": 112
},
{
"epoch": 12.522875816993464,
"eval_loss": 1.9933801889419556,
"eval_runtime": 2.3795,
"eval_samples_per_second": 817.406,
"eval_steps_per_second": 13.028,
"step": 113
},
{
"epoch": 12.627450980392156,
"eval_loss": 1.9704465866088867,
"eval_runtime": 2.3792,
"eval_samples_per_second": 817.502,
"eval_steps_per_second": 13.03,
"step": 114
},
{
"epoch": 12.732026143790849,
"eval_loss": 1.939013957977295,
"eval_runtime": 2.4295,
"eval_samples_per_second": 800.592,
"eval_steps_per_second": 12.76,
"step": 115
},
{
"epoch": 12.836601307189543,
"eval_loss": 1.916093349456787,
"eval_runtime": 2.3862,
"eval_samples_per_second": 815.104,
"eval_steps_per_second": 12.991,
"step": 116
},
{
"epoch": 12.941176470588236,
"eval_loss": 1.94829523563385,
"eval_runtime": 2.4403,
"eval_samples_per_second": 797.034,
"eval_steps_per_second": 12.703,
"step": 117
},
{
"epoch": 12.941176470588236,
"grad_norm": 1.835829734802246,
"learning_rate": 1.5744680851063832e-05,
"loss": 1.9663,
"step": 117
},
{
"epoch": 13.104575163398692,
"eval_loss": 1.9584107398986816,
"eval_runtime": 2.4351,
"eval_samples_per_second": 798.732,
"eval_steps_per_second": 12.73,
"step": 118
},
{
"epoch": 13.209150326797385,
"eval_loss": 1.9641852378845215,
"eval_runtime": 2.4617,
"eval_samples_per_second": 790.102,
"eval_steps_per_second": 12.593,
"step": 119
},
{
"epoch": 13.313725490196079,
"eval_loss": 1.9446567296981812,
"eval_runtime": 2.4215,
"eval_samples_per_second": 803.226,
"eval_steps_per_second": 12.802,
"step": 120
},
{
"epoch": 13.418300653594772,
"eval_loss": 2.001385450363159,
"eval_runtime": 2.416,
"eval_samples_per_second": 805.044,
"eval_steps_per_second": 12.831,
"step": 121
},
{
"epoch": 13.522875816993464,
"eval_loss": 1.8805845975875854,
"eval_runtime": 2.4069,
"eval_samples_per_second": 808.102,
"eval_steps_per_second": 12.88,
"step": 122
},
{
"epoch": 13.627450980392156,
"eval_loss": 1.9486974477767944,
"eval_runtime": 2.4072,
"eval_samples_per_second": 807.987,
"eval_steps_per_second": 12.878,
"step": 123
},
{
"epoch": 13.732026143790849,
"eval_loss": 1.9180878400802612,
"eval_runtime": 2.4102,
"eval_samples_per_second": 806.973,
"eval_steps_per_second": 12.862,
"step": 124
},
{
"epoch": 13.836601307189543,
"eval_loss": 1.9238054752349854,
"eval_runtime": 2.3928,
"eval_samples_per_second": 812.868,
"eval_steps_per_second": 12.956,
"step": 125
},
{
"epoch": 13.941176470588236,
"eval_loss": 1.9513754844665527,
"eval_runtime": 2.3862,
"eval_samples_per_second": 815.11,
"eval_steps_per_second": 12.991,
"step": 126
},
{
"epoch": 13.941176470588236,
"grad_norm": 1.8433274030685425,
"learning_rate": 1.5319148936170214e-05,
"loss": 1.9785,
"step": 126
},
{
"epoch": 14.104575163398692,
"eval_loss": 1.9426443576812744,
"eval_runtime": 2.3889,
"eval_samples_per_second": 814.174,
"eval_steps_per_second": 12.977,
"step": 127
},
{
"epoch": 14.209150326797385,
"eval_loss": 1.9765559434890747,
"eval_runtime": 2.3875,
"eval_samples_per_second": 814.644,
"eval_steps_per_second": 12.984,
"step": 128
},
{
"epoch": 14.313725490196079,
"eval_loss": 1.9118081331253052,
"eval_runtime": 2.4268,
"eval_samples_per_second": 801.463,
"eval_steps_per_second": 12.774,
"step": 129
},
{
"epoch": 14.418300653594772,
"eval_loss": 1.9367104768753052,
"eval_runtime": 2.4183,
"eval_samples_per_second": 804.282,
"eval_steps_per_second": 12.819,
"step": 130
},
{
"epoch": 14.522875816993464,
"eval_loss": 1.9372411966323853,
"eval_runtime": 2.4201,
"eval_samples_per_second": 803.68,
"eval_steps_per_second": 12.809,
"step": 131
},
{
"epoch": 14.627450980392156,
"eval_loss": 1.923244595527649,
"eval_runtime": 2.4324,
"eval_samples_per_second": 799.606,
"eval_steps_per_second": 12.744,
"step": 132
},
{
"epoch": 14.732026143790849,
"eval_loss": 1.999928593635559,
"eval_runtime": 2.4161,
"eval_samples_per_second": 805.026,
"eval_steps_per_second": 12.831,
"step": 133
},
{
"epoch": 14.836601307189543,
"eval_loss": 1.9354963302612305,
"eval_runtime": 2.3965,
"eval_samples_per_second": 811.59,
"eval_steps_per_second": 12.935,
"step": 134
},
{
"epoch": 14.941176470588236,
"eval_loss": 1.965717077255249,
"eval_runtime": 2.4083,
"eval_samples_per_second": 807.639,
"eval_steps_per_second": 12.872,
"step": 135
},
{
"epoch": 14.941176470588236,
"grad_norm": 1.9256954193115234,
"learning_rate": 1.4893617021276596e-05,
"loss": 1.9329,
"step": 135
},
{
"epoch": 15.104575163398692,
"eval_loss": 1.9451290369033813,
"eval_runtime": 2.3926,
"eval_samples_per_second": 812.94,
"eval_steps_per_second": 12.957,
"step": 136
},
{
"epoch": 15.209150326797385,
"eval_loss": 1.9596805572509766,
"eval_runtime": 2.4003,
"eval_samples_per_second": 810.326,
"eval_steps_per_second": 12.915,
"step": 137
},
{
"epoch": 15.313725490196079,
"eval_loss": 1.9179918766021729,
"eval_runtime": 2.388,
"eval_samples_per_second": 814.49,
"eval_steps_per_second": 12.982,
"step": 138
},
{
"epoch": 15.418300653594772,
"eval_loss": 1.9344438314437866,
"eval_runtime": 2.4394,
"eval_samples_per_second": 797.34,
"eval_steps_per_second": 12.708,
"step": 139
},
{
"epoch": 15.522875816993464,
"eval_loss": 1.9772499799728394,
"eval_runtime": 2.4332,
"eval_samples_per_second": 799.368,
"eval_steps_per_second": 12.741,
"step": 140
},
{
"epoch": 15.627450980392156,
"eval_loss": 1.9796696901321411,
"eval_runtime": 2.4159,
"eval_samples_per_second": 805.096,
"eval_steps_per_second": 12.832,
"step": 141
},
{
"epoch": 15.732026143790849,
"eval_loss": 1.9060624837875366,
"eval_runtime": 2.4134,
"eval_samples_per_second": 805.928,
"eval_steps_per_second": 12.845,
"step": 142
},
{
"epoch": 15.836601307189543,
"eval_loss": 1.8885753154754639,
"eval_runtime": 2.4138,
"eval_samples_per_second": 805.794,
"eval_steps_per_second": 12.843,
"step": 143
},
{
"epoch": 15.941176470588236,
"eval_loss": 1.9685148000717163,
"eval_runtime": 2.4145,
"eval_samples_per_second": 805.548,
"eval_steps_per_second": 12.839,
"step": 144
},
{
"epoch": 15.941176470588236,
"grad_norm": 1.8362805843353271,
"learning_rate": 1.4468085106382981e-05,
"loss": 1.9144,
"step": 144
},
{
"epoch": 16.104575163398692,
"eval_loss": 1.9797979593276978,
"eval_runtime": 2.3896,
"eval_samples_per_second": 813.952,
"eval_steps_per_second": 12.973,
"step": 145
},
{
"epoch": 16.209150326797385,
"eval_loss": 1.9587923288345337,
"eval_runtime": 2.4182,
"eval_samples_per_second": 804.331,
"eval_steps_per_second": 12.82,
"step": 146
},
{
"epoch": 16.313725490196077,
"eval_loss": 1.9274431467056274,
"eval_runtime": 2.3878,
"eval_samples_per_second": 814.56,
"eval_steps_per_second": 12.983,
"step": 147
},
{
"epoch": 16.41830065359477,
"eval_loss": 1.958984375,
"eval_runtime": 2.4353,
"eval_samples_per_second": 798.68,
"eval_steps_per_second": 12.73,
"step": 148
},
{
"epoch": 16.522875816993466,
"eval_loss": 1.9552897214889526,
"eval_runtime": 2.4367,
"eval_samples_per_second": 798.2,
"eval_steps_per_second": 12.722,
"step": 149
},
{
"epoch": 16.627450980392158,
"eval_loss": 1.9142913818359375,
"eval_runtime": 2.4118,
"eval_samples_per_second": 806.44,
"eval_steps_per_second": 12.853,
"step": 150
},
{
"epoch": 16.73202614379085,
"eval_loss": 1.9268592596054077,
"eval_runtime": 2.4143,
"eval_samples_per_second": 805.633,
"eval_steps_per_second": 12.84,
"step": 151
},
{
"epoch": 16.836601307189543,
"eval_loss": 1.965384840965271,
"eval_runtime": 2.4177,
"eval_samples_per_second": 804.492,
"eval_steps_per_second": 12.822,
"step": 152
},
{
"epoch": 16.941176470588236,
"eval_loss": 1.9789389371871948,
"eval_runtime": 2.4157,
"eval_samples_per_second": 805.134,
"eval_steps_per_second": 12.832,
"step": 153
},
{
"epoch": 16.941176470588236,
"grad_norm": 1.918270230293274,
"learning_rate": 1.4042553191489363e-05,
"loss": 1.9103,
"step": 153
},
{
"epoch": 17.104575163398692,
"eval_loss": 1.9568538665771484,
"eval_runtime": 2.4112,
"eval_samples_per_second": 806.642,
"eval_steps_per_second": 12.857,
"step": 154
},
{
"epoch": 17.209150326797385,
"eval_loss": 1.9652351140975952,
"eval_runtime": 2.3867,
"eval_samples_per_second": 814.935,
"eval_steps_per_second": 12.989,
"step": 155
},
{
"epoch": 17.313725490196077,
"eval_loss": 1.9810242652893066,
"eval_runtime": 2.3904,
"eval_samples_per_second": 813.655,
"eval_steps_per_second": 12.968,
"step": 156
},
{
"epoch": 17.41830065359477,
"eval_loss": 1.928475260734558,
"eval_runtime": 2.3938,
"eval_samples_per_second": 812.526,
"eval_steps_per_second": 12.95,
"step": 157
},
{
"epoch": 17.522875816993466,
"eval_loss": 1.937834620475769,
"eval_runtime": 2.4231,
"eval_samples_per_second": 802.677,
"eval_steps_per_second": 12.793,
"step": 158
},
{
"epoch": 17.627450980392158,
"eval_loss": 1.9520132541656494,
"eval_runtime": 2.4525,
"eval_samples_per_second": 793.081,
"eval_steps_per_second": 12.64,
"step": 159
},
{
"epoch": 17.73202614379085,
"eval_loss": 1.9782063961029053,
"eval_runtime": 2.4273,
"eval_samples_per_second": 801.302,
"eval_steps_per_second": 12.771,
"step": 160
},
{
"epoch": 17.836601307189543,
"eval_loss": 1.9681016206741333,
"eval_runtime": 2.4116,
"eval_samples_per_second": 806.515,
"eval_steps_per_second": 12.854,
"step": 161
},
{
"epoch": 17.941176470588236,
"eval_loss": 1.8925799131393433,
"eval_runtime": 2.4098,
"eval_samples_per_second": 807.121,
"eval_steps_per_second": 12.864,
"step": 162
},
{
"epoch": 17.941176470588236,
"grad_norm": 1.8396626710891724,
"learning_rate": 1.3617021276595745e-05,
"loss": 1.887,
"step": 162
},
{
"epoch": 18.104575163398692,
"eval_loss": 1.9333585500717163,
"eval_runtime": 2.4106,
"eval_samples_per_second": 806.84,
"eval_steps_per_second": 12.86,
"step": 163
},
{
"epoch": 18.209150326797385,
"eval_loss": 1.925223469734192,
"eval_runtime": 2.3936,
"eval_samples_per_second": 812.57,
"eval_steps_per_second": 12.951,
"step": 164
},
{
"epoch": 18.313725490196077,
"eval_loss": 1.9398906230926514,
"eval_runtime": 2.4263,
"eval_samples_per_second": 801.632,
"eval_steps_per_second": 12.777,
"step": 165
},
{
"epoch": 18.41830065359477,
"eval_loss": 1.9518330097198486,
"eval_runtime": 2.3924,
"eval_samples_per_second": 812.992,
"eval_steps_per_second": 12.958,
"step": 166
},
{
"epoch": 18.522875816993466,
"eval_loss": 1.992385983467102,
"eval_runtime": 2.3882,
"eval_samples_per_second": 814.425,
"eval_steps_per_second": 12.981,
"step": 167
},
{
"epoch": 18.627450980392158,
"eval_loss": 1.905411720275879,
"eval_runtime": 2.3878,
"eval_samples_per_second": 814.555,
"eval_steps_per_second": 12.983,
"step": 168
},
{
"epoch": 18.73202614379085,
"eval_loss": 1.9480212926864624,
"eval_runtime": 2.4462,
"eval_samples_per_second": 795.096,
"eval_steps_per_second": 12.672,
"step": 169
},
{
"epoch": 18.836601307189543,
"eval_loss": 1.9308433532714844,
"eval_runtime": 2.445,
"eval_samples_per_second": 795.492,
"eval_steps_per_second": 12.679,
"step": 170
},
{
"epoch": 18.941176470588236,
"eval_loss": 1.9342797994613647,
"eval_runtime": 2.414,
"eval_samples_per_second": 805.712,
"eval_steps_per_second": 12.842,
"step": 171
},
{
"epoch": 18.941176470588236,
"grad_norm": 1.8954132795333862,
"learning_rate": 1.3191489361702127e-05,
"loss": 1.8644,
"step": 171
},
{
"epoch": 19.104575163398692,
"eval_loss": 1.9860589504241943,
"eval_runtime": 2.4977,
"eval_samples_per_second": 778.731,
"eval_steps_per_second": 12.412,
"step": 172
},
{
"epoch": 19.209150326797385,
"eval_loss": 1.9452682733535767,
"eval_runtime": 2.413,
"eval_samples_per_second": 806.043,
"eval_steps_per_second": 12.847,
"step": 173
},
{
"epoch": 19.313725490196077,
"eval_loss": 1.8998777866363525,
"eval_runtime": 2.3895,
"eval_samples_per_second": 813.974,
"eval_steps_per_second": 12.973,
"step": 174
},
{
"epoch": 19.41830065359477,
"eval_loss": 1.93086838722229,
"eval_runtime": 2.383,
"eval_samples_per_second": 816.195,
"eval_steps_per_second": 13.009,
"step": 175
},
{
"epoch": 19.522875816993466,
"eval_loss": 1.954423189163208,
"eval_runtime": 2.393,
"eval_samples_per_second": 812.774,
"eval_steps_per_second": 12.954,
"step": 176
},
{
"epoch": 19.627450980392158,
"eval_loss": 1.9435521364212036,
"eval_runtime": 2.4095,
"eval_samples_per_second": 807.227,
"eval_steps_per_second": 12.866,
"step": 177
},
{
"epoch": 19.73202614379085,
"eval_loss": 1.9165093898773193,
"eval_runtime": 2.3936,
"eval_samples_per_second": 812.598,
"eval_steps_per_second": 12.951,
"step": 178
},
{
"epoch": 19.836601307189543,
"eval_loss": 1.9695576429367065,
"eval_runtime": 2.4194,
"eval_samples_per_second": 803.908,
"eval_steps_per_second": 12.813,
"step": 179
},
{
"epoch": 19.941176470588236,
"eval_loss": 1.9247905015945435,
"eval_runtime": 2.4477,
"eval_samples_per_second": 794.61,
"eval_steps_per_second": 12.665,
"step": 180
},
{
"epoch": 19.941176470588236,
"grad_norm": 1.8007246255874634,
"learning_rate": 1.2765957446808513e-05,
"loss": 1.8687,
"step": 180
},
{
"epoch": 20.104575163398692,
"eval_loss": 1.9517226219177246,
"eval_runtime": 2.4132,
"eval_samples_per_second": 805.981,
"eval_steps_per_second": 12.846,
"step": 181
},
{
"epoch": 20.209150326797385,
"eval_loss": 1.9041943550109863,
"eval_runtime": 2.4088,
"eval_samples_per_second": 807.45,
"eval_steps_per_second": 12.869,
"step": 182
},
{
"epoch": 20.313725490196077,
"eval_loss": 1.992538571357727,
"eval_runtime": 2.4151,
"eval_samples_per_second": 805.334,
"eval_steps_per_second": 12.836,
"step": 183
},
{
"epoch": 20.41830065359477,
"eval_loss": 1.8842642307281494,
"eval_runtime": 2.4235,
"eval_samples_per_second": 802.553,
"eval_steps_per_second": 12.791,
"step": 184
},
{
"epoch": 20.522875816993466,
"eval_loss": 1.979435682296753,
"eval_runtime": 2.3874,
"eval_samples_per_second": 814.695,
"eval_steps_per_second": 12.985,
"step": 185
},
{
"epoch": 20.627450980392158,
"eval_loss": 1.9789183139801025,
"eval_runtime": 2.3863,
"eval_samples_per_second": 815.07,
"eval_steps_per_second": 12.991,
"step": 186
},
{
"epoch": 20.73202614379085,
"eval_loss": 1.9192243814468384,
"eval_runtime": 2.3878,
"eval_samples_per_second": 814.573,
"eval_steps_per_second": 12.983,
"step": 187
},
{
"epoch": 20.836601307189543,
"eval_loss": 1.9174364805221558,
"eval_runtime": 2.3935,
"eval_samples_per_second": 812.633,
"eval_steps_per_second": 12.952,
"step": 188
},
{
"epoch": 20.941176470588236,
"eval_loss": 1.9568063020706177,
"eval_runtime": 2.4211,
"eval_samples_per_second": 803.354,
"eval_steps_per_second": 12.804,
"step": 189
},
{
"epoch": 20.941176470588236,
"grad_norm": 1.8035422563552856,
"learning_rate": 1.2340425531914895e-05,
"loss": 1.8361,
"step": 189
},
{
"epoch": 21.104575163398692,
"eval_loss": 1.9128376245498657,
"eval_runtime": 2.4627,
"eval_samples_per_second": 789.793,
"eval_steps_per_second": 12.588,
"step": 190
},
{
"epoch": 21.209150326797385,
"eval_loss": 1.9428894519805908,
"eval_runtime": 2.4145,
"eval_samples_per_second": 805.537,
"eval_steps_per_second": 12.839,
"step": 191
},
{
"epoch": 21.313725490196077,
"eval_loss": 1.95577073097229,
"eval_runtime": 2.4665,
"eval_samples_per_second": 788.556,
"eval_steps_per_second": 12.568,
"step": 192
},
{
"epoch": 21.41830065359477,
"eval_loss": 1.9128402471542358,
"eval_runtime": 2.4073,
"eval_samples_per_second": 807.972,
"eval_steps_per_second": 12.878,
"step": 193
},
{
"epoch": 21.522875816993466,
"eval_loss": 1.9588518142700195,
"eval_runtime": 2.3872,
"eval_samples_per_second": 814.75,
"eval_steps_per_second": 12.986,
"step": 194
},
{
"epoch": 21.627450980392158,
"eval_loss": 1.9744739532470703,
"eval_runtime": 2.4193,
"eval_samples_per_second": 803.936,
"eval_steps_per_second": 12.813,
"step": 195
},
{
"epoch": 21.73202614379085,
"eval_loss": 1.9993598461151123,
"eval_runtime": 2.3876,
"eval_samples_per_second": 814.634,
"eval_steps_per_second": 12.984,
"step": 196
},
{
"epoch": 21.836601307189543,
"eval_loss": 1.959428071975708,
"eval_runtime": 2.3868,
"eval_samples_per_second": 814.91,
"eval_steps_per_second": 12.988,
"step": 197
},
{
"epoch": 21.941176470588236,
"eval_loss": 1.9063607454299927,
"eval_runtime": 2.3998,
"eval_samples_per_second": 810.501,
"eval_steps_per_second": 12.918,
"step": 198
},
{
"epoch": 21.941176470588236,
"grad_norm": 1.7296489477157593,
"learning_rate": 1.1914893617021277e-05,
"loss": 1.8461,
"step": 198
},
{
"epoch": 22.104575163398692,
"eval_loss": 1.9475386142730713,
"eval_runtime": 2.4105,
"eval_samples_per_second": 806.877,
"eval_steps_per_second": 12.86,
"step": 199
},
{
"epoch": 22.209150326797385,
"eval_loss": 1.9637689590454102,
"eval_runtime": 2.4134,
"eval_samples_per_second": 805.906,
"eval_steps_per_second": 12.845,
"step": 200
},
{
"epoch": 22.313725490196077,
"eval_loss": 1.9350510835647583,
"eval_runtime": 2.4228,
"eval_samples_per_second": 802.787,
"eval_steps_per_second": 12.795,
"step": 201
},
{
"epoch": 22.41830065359477,
"eval_loss": 1.9184238910675049,
"eval_runtime": 2.4127,
"eval_samples_per_second": 806.144,
"eval_steps_per_second": 12.849,
"step": 202
},
{
"epoch": 22.522875816993466,
"eval_loss": 1.9656862020492554,
"eval_runtime": 2.4145,
"eval_samples_per_second": 805.559,
"eval_steps_per_second": 12.839,
"step": 203
},
{
"epoch": 22.627450980392158,
"eval_loss": 1.9108870029449463,
"eval_runtime": 2.3915,
"eval_samples_per_second": 813.292,
"eval_steps_per_second": 12.962,
"step": 204
},
{
"epoch": 22.73202614379085,
"eval_loss": 1.9319818019866943,
"eval_runtime": 2.3918,
"eval_samples_per_second": 813.179,
"eval_steps_per_second": 12.961,
"step": 205
},
{
"epoch": 22.836601307189543,
"eval_loss": 1.9680215120315552,
"eval_runtime": 2.4001,
"eval_samples_per_second": 810.398,
"eval_steps_per_second": 12.916,
"step": 206
},
{
"epoch": 22.941176470588236,
"eval_loss": 1.9628697633743286,
"eval_runtime": 2.3906,
"eval_samples_per_second": 813.608,
"eval_steps_per_second": 12.968,
"step": 207
},
{
"epoch": 22.941176470588236,
"grad_norm": 1.8930681943893433,
"learning_rate": 1.1489361702127662e-05,
"loss": 1.8246,
"step": 207
},
{
"epoch": 23.104575163398692,
"eval_loss": 1.9429619312286377,
"eval_runtime": 2.3877,
"eval_samples_per_second": 814.582,
"eval_steps_per_second": 12.983,
"step": 208
},
{
"epoch": 23.209150326797385,
"eval_loss": 1.9262027740478516,
"eval_runtime": 2.5188,
"eval_samples_per_second": 772.187,
"eval_steps_per_second": 12.307,
"step": 209
},
{
"epoch": 23.313725490196077,
"eval_loss": 1.9614677429199219,
"eval_runtime": 2.4709,
"eval_samples_per_second": 787.175,
"eval_steps_per_second": 12.546,
"step": 210
},
{
"epoch": 23.41830065359477,
"eval_loss": 1.9559693336486816,
"eval_runtime": 2.4152,
"eval_samples_per_second": 805.308,
"eval_steps_per_second": 12.835,
"step": 211
},
{
"epoch": 23.522875816993466,
"eval_loss": 1.966059923171997,
"eval_runtime": 2.4122,
"eval_samples_per_second": 806.317,
"eval_steps_per_second": 12.851,
"step": 212
},
{
"epoch": 23.627450980392158,
"eval_loss": 1.9780749082565308,
"eval_runtime": 2.437,
"eval_samples_per_second": 798.101,
"eval_steps_per_second": 12.72,
"step": 213
},
{
"epoch": 23.73202614379085,
"eval_loss": 1.980626106262207,
"eval_runtime": 2.404,
"eval_samples_per_second": 809.055,
"eval_steps_per_second": 12.895,
"step": 214
},
{
"epoch": 23.836601307189543,
"eval_loss": 1.9735476970672607,
"eval_runtime": 2.3933,
"eval_samples_per_second": 812.67,
"eval_steps_per_second": 12.953,
"step": 215
},
{
"epoch": 23.941176470588236,
"eval_loss": 1.9582773447036743,
"eval_runtime": 2.4128,
"eval_samples_per_second": 806.128,
"eval_steps_per_second": 12.848,
"step": 216
},
{
"epoch": 23.941176470588236,
"grad_norm": 1.8361761569976807,
"learning_rate": 1.1063829787234044e-05,
"loss": 1.8181,
"step": 216
},
{
"epoch": 24.104575163398692,
"eval_loss": 1.9554569721221924,
"eval_runtime": 2.387,
"eval_samples_per_second": 814.839,
"eval_steps_per_second": 12.987,
"step": 217
},
{
"epoch": 24.209150326797385,
"eval_loss": 1.9165290594100952,
"eval_runtime": 2.3919,
"eval_samples_per_second": 813.148,
"eval_steps_per_second": 12.96,
"step": 218
},
{
"epoch": 24.313725490196077,
"eval_loss": 1.9637575149536133,
"eval_runtime": 2.4007,
"eval_samples_per_second": 810.191,
"eval_steps_per_second": 12.913,
"step": 219
},
{
"epoch": 24.41830065359477,
"eval_loss": 2.000793218612671,
"eval_runtime": 2.4153,
"eval_samples_per_second": 805.272,
"eval_steps_per_second": 12.835,
"step": 220
},
{
"epoch": 24.522875816993466,
"eval_loss": 1.9246618747711182,
"eval_runtime": 2.4113,
"eval_samples_per_second": 806.61,
"eval_steps_per_second": 12.856,
"step": 221
},
{
"epoch": 24.627450980392158,
"eval_loss": 1.9719598293304443,
"eval_runtime": 2.4103,
"eval_samples_per_second": 806.944,
"eval_steps_per_second": 12.861,
"step": 222
},
{
"epoch": 24.73202614379085,
"eval_loss": 2.008406400680542,
"eval_runtime": 2.4142,
"eval_samples_per_second": 805.666,
"eval_steps_per_second": 12.841,
"step": 223
},
{
"epoch": 24.836601307189543,
"eval_loss": 1.942387580871582,
"eval_runtime": 2.4335,
"eval_samples_per_second": 799.261,
"eval_steps_per_second": 12.739,
"step": 224
},
{
"epoch": 24.941176470588236,
"eval_loss": 1.9110654592514038,
"eval_runtime": 2.3897,
"eval_samples_per_second": 813.923,
"eval_steps_per_second": 12.973,
"step": 225
},
{
"epoch": 24.941176470588236,
"grad_norm": 1.8342725038528442,
"learning_rate": 1.0638297872340426e-05,
"loss": 1.797,
"step": 225
},
{
"epoch": 25.104575163398692,
"eval_loss": 1.9787415266036987,
"eval_runtime": 2.4748,
"eval_samples_per_second": 785.908,
"eval_steps_per_second": 12.526,
"step": 226
},
{
"epoch": 25.209150326797385,
"eval_loss": 1.9613263607025146,
"eval_runtime": 2.3841,
"eval_samples_per_second": 815.829,
"eval_steps_per_second": 13.003,
"step": 227
},
{
"epoch": 25.313725490196077,
"eval_loss": 1.8806324005126953,
"eval_runtime": 2.4198,
"eval_samples_per_second": 803.779,
"eval_steps_per_second": 12.811,
"step": 228
},
{
"epoch": 25.41830065359477,
"eval_loss": 1.9231013059616089,
"eval_runtime": 2.3851,
"eval_samples_per_second": 815.485,
"eval_steps_per_second": 12.997,
"step": 229
},
{
"epoch": 25.522875816993466,
"eval_loss": 1.9021631479263306,
"eval_runtime": 2.3853,
"eval_samples_per_second": 815.428,
"eval_steps_per_second": 12.997,
"step": 230
},
{
"epoch": 25.627450980392158,
"eval_loss": 1.9682537317276,
"eval_runtime": 2.4444,
"eval_samples_per_second": 795.691,
"eval_steps_per_second": 12.682,
"step": 231
},
{
"epoch": 25.73202614379085,
"eval_loss": 1.9824862480163574,
"eval_runtime": 2.4349,
"eval_samples_per_second": 798.799,
"eval_steps_per_second": 12.732,
"step": 232
},
{
"epoch": 25.836601307189543,
"eval_loss": 1.962891936302185,
"eval_runtime": 2.411,
"eval_samples_per_second": 806.729,
"eval_steps_per_second": 12.858,
"step": 233
},
{
"epoch": 25.941176470588236,
"eval_loss": 1.9116088151931763,
"eval_runtime": 2.4247,
"eval_samples_per_second": 802.172,
"eval_steps_per_second": 12.785,
"step": 234
},
{
"epoch": 25.941176470588236,
"grad_norm": 1.9028220176696777,
"learning_rate": 1.0212765957446808e-05,
"loss": 1.7749,
"step": 234
},
{
"epoch": 26.104575163398692,
"eval_loss": 1.9699651002883911,
"eval_runtime": 2.4102,
"eval_samples_per_second": 806.995,
"eval_steps_per_second": 12.862,
"step": 235
},
{
"epoch": 26.209150326797385,
"eval_loss": 1.9811697006225586,
"eval_runtime": 2.386,
"eval_samples_per_second": 815.174,
"eval_steps_per_second": 12.992,
"step": 236
},
{
"epoch": 26.313725490196077,
"eval_loss": 1.9248907566070557,
"eval_runtime": 2.3932,
"eval_samples_per_second": 812.727,
"eval_steps_per_second": 12.953,
"step": 237
},
{
"epoch": 26.41830065359477,
"eval_loss": 1.9683917760849,
"eval_runtime": 2.3996,
"eval_samples_per_second": 810.537,
"eval_steps_per_second": 12.919,
"step": 238
},
{
"epoch": 26.522875816993466,
"eval_loss": 1.9604750871658325,
"eval_runtime": 2.3993,
"eval_samples_per_second": 810.666,
"eval_steps_per_second": 12.921,
"step": 239
},
{
"epoch": 26.627450980392158,
"eval_loss": 1.8918408155441284,
"eval_runtime": 2.4162,
"eval_samples_per_second": 804.968,
"eval_steps_per_second": 12.83,
"step": 240
},
{
"epoch": 26.73202614379085,
"eval_loss": 1.9443118572235107,
"eval_runtime": 2.3904,
"eval_samples_per_second": 813.679,
"eval_steps_per_second": 12.969,
"step": 241
},
{
"epoch": 26.836601307189543,
"eval_loss": 1.9147528409957886,
"eval_runtime": 2.4305,
"eval_samples_per_second": 800.261,
"eval_steps_per_second": 12.755,
"step": 242
},
{
"epoch": 26.941176470588236,
"eval_loss": 1.8974157571792603,
"eval_runtime": 2.4514,
"eval_samples_per_second": 793.417,
"eval_steps_per_second": 12.646,
"step": 243
},
{
"epoch": 26.941176470588236,
"grad_norm": 1.7782148122787476,
"learning_rate": 9.787234042553192e-06,
"loss": 1.8022,
"step": 243
},
{
"epoch": 27.104575163398692,
"eval_loss": 1.9711873531341553,
"eval_runtime": 2.4377,
"eval_samples_per_second": 797.883,
"eval_steps_per_second": 12.717,
"step": 244
},
{
"epoch": 27.209150326797385,
"eval_loss": 1.9718581438064575,
"eval_runtime": 2.404,
"eval_samples_per_second": 809.082,
"eval_steps_per_second": 12.895,
"step": 245
},
{
"epoch": 27.313725490196077,
"eval_loss": 1.9540036916732788,
"eval_runtime": 2.411,
"eval_samples_per_second": 806.731,
"eval_steps_per_second": 12.858,
"step": 246
},
{
"epoch": 27.41830065359477,
"eval_loss": 1.8907063007354736,
"eval_runtime": 2.3875,
"eval_samples_per_second": 814.658,
"eval_steps_per_second": 12.984,
"step": 247
},
{
"epoch": 27.522875816993466,
"eval_loss": 1.9907869100570679,
"eval_runtime": 2.3956,
"eval_samples_per_second": 811.916,
"eval_steps_per_second": 12.941,
"step": 248
},
{
"epoch": 27.627450980392158,
"eval_loss": 1.9273970127105713,
"eval_runtime": 2.3922,
"eval_samples_per_second": 813.044,
"eval_steps_per_second": 12.959,
"step": 249
},
{
"epoch": 27.73202614379085,
"eval_loss": 1.9233652353286743,
"eval_runtime": 2.3912,
"eval_samples_per_second": 813.399,
"eval_steps_per_second": 12.964,
"step": 250
},
{
"epoch": 27.836601307189543,
"eval_loss": 1.9580994844436646,
"eval_runtime": 2.3875,
"eval_samples_per_second": 814.648,
"eval_steps_per_second": 12.984,
"step": 251
},
{
"epoch": 27.941176470588236,
"eval_loss": 1.9409220218658447,
"eval_runtime": 2.4743,
"eval_samples_per_second": 786.095,
"eval_steps_per_second": 12.529,
"step": 252
},
{
"epoch": 27.941176470588236,
"grad_norm": 1.8487893342971802,
"learning_rate": 9.361702127659576e-06,
"loss": 1.7879,
"step": 252
},
{
"epoch": 28.104575163398692,
"eval_loss": 1.8715720176696777,
"eval_runtime": 2.4867,
"eval_samples_per_second": 782.152,
"eval_steps_per_second": 12.466,
"step": 253
},
{
"epoch": 28.209150326797385,
"eval_loss": 1.9945265054702759,
"eval_runtime": 2.4913,
"eval_samples_per_second": 780.713,
"eval_steps_per_second": 12.443,
"step": 254
},
{
"epoch": 28.313725490196077,
"eval_loss": 1.8658331632614136,
"eval_runtime": 2.4682,
"eval_samples_per_second": 788.013,
"eval_steps_per_second": 12.56,
"step": 255
},
{
"epoch": 28.41830065359477,
"eval_loss": 1.946846604347229,
"eval_runtime": 2.4685,
"eval_samples_per_second": 787.912,
"eval_steps_per_second": 12.558,
"step": 256
},
{
"epoch": 28.522875816993466,
"eval_loss": 1.9456650018692017,
"eval_runtime": 2.4963,
"eval_samples_per_second": 779.142,
"eval_steps_per_second": 12.418,
"step": 257
},
{
"epoch": 28.627450980392158,
"eval_loss": 1.9555299282073975,
"eval_runtime": 2.467,
"eval_samples_per_second": 788.417,
"eval_steps_per_second": 12.566,
"step": 258
},
{
"epoch": 28.73202614379085,
"eval_loss": 1.9544572830200195,
"eval_runtime": 2.4342,
"eval_samples_per_second": 799.038,
"eval_steps_per_second": 12.735,
"step": 259
},
{
"epoch": 28.836601307189543,
"eval_loss": 1.9225515127182007,
"eval_runtime": 2.3903,
"eval_samples_per_second": 813.707,
"eval_steps_per_second": 12.969,
"step": 260
},
{
"epoch": 28.941176470588236,
"eval_loss": 1.9331358671188354,
"eval_runtime": 2.3878,
"eval_samples_per_second": 814.568,
"eval_steps_per_second": 12.983,
"step": 261
},
{
"epoch": 28.941176470588236,
"grad_norm": 1.77451491355896,
"learning_rate": 8.936170212765958e-06,
"loss": 1.8019,
"step": 261
},
{
"epoch": 29.104575163398692,
"eval_loss": 1.9785720109939575,
"eval_runtime": 2.4363,
"eval_samples_per_second": 798.34,
"eval_steps_per_second": 12.724,
"step": 262
},
{
"epoch": 29.209150326797385,
"eval_loss": 1.9767541885375977,
"eval_runtime": 2.4358,
"eval_samples_per_second": 798.515,
"eval_steps_per_second": 12.727,
"step": 263
},
{
"epoch": 29.313725490196077,
"eval_loss": 1.9601216316223145,
"eval_runtime": 2.4048,
"eval_samples_per_second": 808.808,
"eval_steps_per_second": 12.891,
"step": 264
},
{
"epoch": 29.41830065359477,
"eval_loss": 1.917155385017395,
"eval_runtime": 2.4141,
"eval_samples_per_second": 805.697,
"eval_steps_per_second": 12.841,
"step": 265
},
{
"epoch": 29.522875816993466,
"eval_loss": 1.922187328338623,
"eval_runtime": 2.4093,
"eval_samples_per_second": 807.305,
"eval_steps_per_second": 12.867,
"step": 266
},
{
"epoch": 29.627450980392158,
"eval_loss": 1.918397307395935,
"eval_runtime": 2.3948,
"eval_samples_per_second": 812.187,
"eval_steps_per_second": 12.945,
"step": 267
},
{
"epoch": 29.73202614379085,
"eval_loss": 1.882236123085022,
"eval_runtime": 2.3912,
"eval_samples_per_second": 813.4,
"eval_steps_per_second": 12.964,
"step": 268
},
{
"epoch": 29.836601307189543,
"eval_loss": 1.916178822517395,
"eval_runtime": 2.3869,
"eval_samples_per_second": 814.874,
"eval_steps_per_second": 12.988,
"step": 269
},
{
"epoch": 29.941176470588236,
"eval_loss": 1.9769715070724487,
"eval_runtime": 2.3858,
"eval_samples_per_second": 815.249,
"eval_steps_per_second": 12.994,
"step": 270
},
{
"epoch": 29.941176470588236,
"grad_norm": 2.04988431930542,
"learning_rate": 8.510638297872341e-06,
"loss": 1.7614,
"step": 270
},
{
"epoch": 30.104575163398692,
"eval_loss": 1.9033125638961792,
"eval_runtime": 2.4701,
"eval_samples_per_second": 787.431,
"eval_steps_per_second": 12.55,
"step": 271
},
{
"epoch": 30.209150326797385,
"eval_loss": 1.9454644918441772,
"eval_runtime": 2.4111,
"eval_samples_per_second": 806.698,
"eval_steps_per_second": 12.857,
"step": 272
},
{
"epoch": 30.313725490196077,
"eval_loss": 1.9106584787368774,
"eval_runtime": 2.4243,
"eval_samples_per_second": 802.304,
"eval_steps_per_second": 12.787,
"step": 273
},
{
"epoch": 30.41830065359477,
"eval_loss": 1.9812813997268677,
"eval_runtime": 2.4148,
"eval_samples_per_second": 805.466,
"eval_steps_per_second": 12.838,
"step": 274
},
{
"epoch": 30.522875816993466,
"eval_loss": 1.9426772594451904,
"eval_runtime": 2.3997,
"eval_samples_per_second": 810.526,
"eval_steps_per_second": 12.918,
"step": 275
},
{
"epoch": 30.627450980392158,
"eval_loss": 1.9499095678329468,
"eval_runtime": 2.3957,
"eval_samples_per_second": 811.863,
"eval_steps_per_second": 12.94,
"step": 276
},
{
"epoch": 30.73202614379085,
"eval_loss": 1.961235523223877,
"eval_runtime": 2.3821,
"eval_samples_per_second": 816.509,
"eval_steps_per_second": 13.014,
"step": 277
},
{
"epoch": 30.836601307189543,
"eval_loss": 1.9450849294662476,
"eval_runtime": 2.3942,
"eval_samples_per_second": 812.387,
"eval_steps_per_second": 12.948,
"step": 278
},
{
"epoch": 30.941176470588236,
"eval_loss": 1.9132739305496216,
"eval_runtime": 2.4136,
"eval_samples_per_second": 805.836,
"eval_steps_per_second": 12.844,
"step": 279
},
{
"epoch": 30.941176470588236,
"grad_norm": 1.8763809204101562,
"learning_rate": 8.085106382978723e-06,
"loss": 1.7619,
"step": 279
},
{
"epoch": 31.104575163398692,
"eval_loss": 1.9205107688903809,
"eval_runtime": 2.4332,
"eval_samples_per_second": 799.374,
"eval_steps_per_second": 12.741,
"step": 280
},
{
"epoch": 31.209150326797385,
"eval_loss": 1.946841835975647,
"eval_runtime": 2.464,
"eval_samples_per_second": 789.356,
"eval_steps_per_second": 12.581,
"step": 281
},
{
"epoch": 31.313725490196077,
"eval_loss": 1.9464671611785889,
"eval_runtime": 2.4074,
"eval_samples_per_second": 807.917,
"eval_steps_per_second": 12.877,
"step": 282
},
{
"epoch": 31.41830065359477,
"eval_loss": 1.8833441734313965,
"eval_runtime": 2.4082,
"eval_samples_per_second": 807.64,
"eval_steps_per_second": 12.872,
"step": 283
},
{
"epoch": 31.522875816993466,
"eval_loss": 1.9414160251617432,
"eval_runtime": 2.407,
"eval_samples_per_second": 808.072,
"eval_steps_per_second": 12.879,
"step": 284
},
{
"epoch": 31.627450980392158,
"eval_loss": 1.9485697746276855,
"eval_runtime": 2.4025,
"eval_samples_per_second": 809.577,
"eval_steps_per_second": 12.903,
"step": 285
},
{
"epoch": 31.73202614379085,
"eval_loss": 1.9184815883636475,
"eval_runtime": 2.3828,
"eval_samples_per_second": 816.275,
"eval_steps_per_second": 13.01,
"step": 286
},
{
"epoch": 31.836601307189543,
"eval_loss": 1.9518897533416748,
"eval_runtime": 2.4096,
"eval_samples_per_second": 807.189,
"eval_steps_per_second": 12.865,
"step": 287
},
{
"epoch": 31.941176470588236,
"eval_loss": 1.9385578632354736,
"eval_runtime": 2.4088,
"eval_samples_per_second": 807.447,
"eval_steps_per_second": 12.869,
"step": 288
},
{
"epoch": 31.941176470588236,
"grad_norm": 1.8053061962127686,
"learning_rate": 7.659574468085107e-06,
"loss": 1.7713,
"step": 288
},
{
"epoch": 32.10457516339869,
"eval_loss": 1.8966560363769531,
"eval_runtime": 2.4403,
"eval_samples_per_second": 797.048,
"eval_steps_per_second": 12.704,
"step": 289
},
{
"epoch": 32.209150326797385,
"eval_loss": 1.9649851322174072,
"eval_runtime": 2.4092,
"eval_samples_per_second": 807.307,
"eval_steps_per_second": 12.867,
"step": 290
},
{
"epoch": 32.31372549019608,
"eval_loss": 1.919927954673767,
"eval_runtime": 2.41,
"eval_samples_per_second": 807.057,
"eval_steps_per_second": 12.863,
"step": 291
},
{
"epoch": 32.41830065359477,
"eval_loss": 1.9147096872329712,
"eval_runtime": 2.4127,
"eval_samples_per_second": 806.134,
"eval_steps_per_second": 12.848,
"step": 292
},
{
"epoch": 32.52287581699346,
"eval_loss": 1.9159774780273438,
"eval_runtime": 2.4052,
"eval_samples_per_second": 808.673,
"eval_steps_per_second": 12.889,
"step": 293
},
{
"epoch": 32.627450980392155,
"eval_loss": 1.9953843355178833,
"eval_runtime": 2.3906,
"eval_samples_per_second": 813.616,
"eval_steps_per_second": 12.968,
"step": 294
},
{
"epoch": 32.73202614379085,
"eval_loss": 1.92180597782135,
"eval_runtime": 2.4249,
"eval_samples_per_second": 802.081,
"eval_steps_per_second": 12.784,
"step": 295
},
{
"epoch": 32.83660130718954,
"eval_loss": 1.9786967039108276,
"eval_runtime": 2.3846,
"eval_samples_per_second": 815.658,
"eval_steps_per_second": 13.0,
"step": 296
},
{
"epoch": 32.94117647058823,
"eval_loss": 1.9362424612045288,
"eval_runtime": 2.3879,
"eval_samples_per_second": 814.534,
"eval_steps_per_second": 12.982,
"step": 297
},
{
"epoch": 32.94117647058823,
"grad_norm": 2.0353338718414307,
"learning_rate": 7.234042553191491e-06,
"loss": 1.7635,
"step": 297
},
{
"epoch": 33.10457516339869,
"eval_loss": 1.9281338453292847,
"eval_runtime": 2.4396,
"eval_samples_per_second": 797.251,
"eval_steps_per_second": 12.707,
"step": 298
},
{
"epoch": 33.209150326797385,
"eval_loss": 1.921140193939209,
"eval_runtime": 2.4118,
"eval_samples_per_second": 806.459,
"eval_steps_per_second": 12.854,
"step": 299
},
{
"epoch": 33.31372549019608,
"eval_loss": 1.9680968523025513,
"eval_runtime": 2.4046,
"eval_samples_per_second": 808.854,
"eval_steps_per_second": 12.892,
"step": 300
},
{
"epoch": 33.41830065359477,
"eval_loss": 1.9094316959381104,
"eval_runtime": 2.4065,
"eval_samples_per_second": 808.218,
"eval_steps_per_second": 12.882,
"step": 301
},
{
"epoch": 33.52287581699346,
"eval_loss": 1.9845983982086182,
"eval_runtime": 2.4045,
"eval_samples_per_second": 808.905,
"eval_steps_per_second": 12.893,
"step": 302
},
{
"epoch": 33.627450980392155,
"eval_loss": 1.9461405277252197,
"eval_runtime": 2.4098,
"eval_samples_per_second": 807.107,
"eval_steps_per_second": 12.864,
"step": 303
},
{
"epoch": 33.73202614379085,
"eval_loss": 1.8947722911834717,
"eval_runtime": 2.392,
"eval_samples_per_second": 813.124,
"eval_steps_per_second": 12.96,
"step": 304
},
{
"epoch": 33.83660130718954,
"eval_loss": 1.9371235370635986,
"eval_runtime": 2.3855,
"eval_samples_per_second": 815.347,
"eval_steps_per_second": 12.995,
"step": 305
},
{
"epoch": 33.94117647058823,
"eval_loss": 2.006459951400757,
"eval_runtime": 2.4252,
"eval_samples_per_second": 802.01,
"eval_steps_per_second": 12.783,
"step": 306
},
{
"epoch": 33.94117647058823,
"grad_norm": 1.884189486503601,
"learning_rate": 6.808510638297873e-06,
"loss": 1.7394,
"step": 306
},
{
"epoch": 34.10457516339869,
"eval_loss": 1.9282273054122925,
"eval_runtime": 2.4222,
"eval_samples_per_second": 802.994,
"eval_steps_per_second": 12.798,
"step": 307
},
{
"epoch": 34.209150326797385,
"eval_loss": 1.9412920475006104,
"eval_runtime": 2.5082,
"eval_samples_per_second": 775.463,
"eval_steps_per_second": 12.36,
"step": 308
},
{
"epoch": 34.31372549019608,
"eval_loss": 1.9883979558944702,
"eval_runtime": 2.5353,
"eval_samples_per_second": 767.18,
"eval_steps_per_second": 12.228,
"step": 309
},
{
"epoch": 34.41830065359477,
"eval_loss": 1.917364239692688,
"eval_runtime": 2.4621,
"eval_samples_per_second": 789.978,
"eval_steps_per_second": 12.591,
"step": 310
},
{
"epoch": 34.52287581699346,
"eval_loss": 1.9594651460647583,
"eval_runtime": 2.5176,
"eval_samples_per_second": 772.567,
"eval_steps_per_second": 12.313,
"step": 311
},
{
"epoch": 34.627450980392155,
"eval_loss": 1.9423621892929077,
"eval_runtime": 2.5063,
"eval_samples_per_second": 776.037,
"eval_steps_per_second": 12.369,
"step": 312
},
{
"epoch": 34.73202614379085,
"eval_loss": 1.9494574069976807,
"eval_runtime": 2.4032,
"eval_samples_per_second": 809.321,
"eval_steps_per_second": 12.899,
"step": 313
},
{
"epoch": 34.83660130718954,
"eval_loss": 1.9160590171813965,
"eval_runtime": 2.3866,
"eval_samples_per_second": 814.959,
"eval_steps_per_second": 12.989,
"step": 314
},
{
"epoch": 34.94117647058823,
"eval_loss": 1.963183879852295,
"eval_runtime": 2.3866,
"eval_samples_per_second": 814.96,
"eval_steps_per_second": 12.989,
"step": 315
},
{
"epoch": 34.94117647058823,
"grad_norm": 1.7952407598495483,
"learning_rate": 6.382978723404256e-06,
"loss": 1.7434,
"step": 315
},
{
"epoch": 35.10457516339869,
"eval_loss": 1.9129880666732788,
"eval_runtime": 2.3856,
"eval_samples_per_second": 815.294,
"eval_steps_per_second": 12.994,
"step": 316
},
{
"epoch": 35.209150326797385,
"eval_loss": 1.9850044250488281,
"eval_runtime": 2.4394,
"eval_samples_per_second": 797.311,
"eval_steps_per_second": 12.708,
"step": 317
},
{
"epoch": 35.31372549019608,
"eval_loss": 1.9291285276412964,
"eval_runtime": 2.3856,
"eval_samples_per_second": 815.319,
"eval_steps_per_second": 12.995,
"step": 318
},
{
"epoch": 35.41830065359477,
"eval_loss": 1.9300141334533691,
"eval_runtime": 2.5153,
"eval_samples_per_second": 773.266,
"eval_steps_per_second": 12.325,
"step": 319
},
{
"epoch": 35.52287581699346,
"eval_loss": 1.9399768114089966,
"eval_runtime": 2.4758,
"eval_samples_per_second": 785.618,
"eval_steps_per_second": 12.521,
"step": 320
},
{
"epoch": 35.627450980392155,
"eval_loss": 1.9735783338546753,
"eval_runtime": 2.4325,
"eval_samples_per_second": 799.598,
"eval_steps_per_second": 12.744,
"step": 321
},
{
"epoch": 35.73202614379085,
"eval_loss": 1.9033024311065674,
"eval_runtime": 2.4152,
"eval_samples_per_second": 805.309,
"eval_steps_per_second": 12.835,
"step": 322
},
{
"epoch": 35.83660130718954,
"eval_loss": 1.9249202013015747,
"eval_runtime": 2.4104,
"eval_samples_per_second": 806.927,
"eval_steps_per_second": 12.861,
"step": 323
},
{
"epoch": 35.94117647058823,
"eval_loss": 1.9796316623687744,
"eval_runtime": 2.4374,
"eval_samples_per_second": 797.996,
"eval_steps_per_second": 12.719,
"step": 324
},
{
"epoch": 35.94117647058823,
"grad_norm": 1.8199615478515625,
"learning_rate": 5.957446808510638e-06,
"loss": 1.7578,
"step": 324
},
{
"epoch": 36.10457516339869,
"eval_loss": 1.9595942497253418,
"eval_runtime": 2.3871,
"eval_samples_per_second": 814.78,
"eval_steps_per_second": 12.986,
"step": 325
},
{
"epoch": 36.209150326797385,
"eval_loss": 1.9293735027313232,
"eval_runtime": 2.3875,
"eval_samples_per_second": 814.674,
"eval_steps_per_second": 12.985,
"step": 326
},
{
"epoch": 36.31372549019608,
"eval_loss": 1.957201600074768,
"eval_runtime": 2.3892,
"eval_samples_per_second": 814.083,
"eval_steps_per_second": 12.975,
"step": 327
},
{
"epoch": 36.41830065359477,
"eval_loss": 1.9536631107330322,
"eval_runtime": 2.4048,
"eval_samples_per_second": 808.796,
"eval_steps_per_second": 12.891,
"step": 328
},
{
"epoch": 36.52287581699346,
"eval_loss": 1.974502682685852,
"eval_runtime": 2.3928,
"eval_samples_per_second": 812.843,
"eval_steps_per_second": 12.955,
"step": 329
},
{
"epoch": 36.627450980392155,
"eval_loss": 1.9568116664886475,
"eval_runtime": 2.4183,
"eval_samples_per_second": 804.288,
"eval_steps_per_second": 12.819,
"step": 330
},
{
"epoch": 36.73202614379085,
"eval_loss": 1.9689034223556519,
"eval_runtime": 2.4676,
"eval_samples_per_second": 788.229,
"eval_steps_per_second": 12.563,
"step": 331
},
{
"epoch": 36.83660130718954,
"eval_loss": 1.9140371084213257,
"eval_runtime": 2.4088,
"eval_samples_per_second": 807.459,
"eval_steps_per_second": 12.87,
"step": 332
},
{
"epoch": 36.94117647058823,
"eval_loss": 1.929794192314148,
"eval_runtime": 2.4119,
"eval_samples_per_second": 806.412,
"eval_steps_per_second": 12.853,
"step": 333
},
{
"epoch": 36.94117647058823,
"grad_norm": 1.8074049949645996,
"learning_rate": 5.531914893617022e-06,
"loss": 1.7497,
"step": 333
},
{
"epoch": 37.10457516339869,
"eval_loss": 1.9698741436004639,
"eval_runtime": 2.4082,
"eval_samples_per_second": 807.67,
"eval_steps_per_second": 12.873,
"step": 334
},
{
"epoch": 37.209150326797385,
"eval_loss": 1.90766441822052,
"eval_runtime": 2.3974,
"eval_samples_per_second": 811.298,
"eval_steps_per_second": 12.931,
"step": 335
},
{
"epoch": 37.31372549019608,
"eval_loss": 1.9559139013290405,
"eval_runtime": 2.3918,
"eval_samples_per_second": 813.182,
"eval_steps_per_second": 12.961,
"step": 336
},
{
"epoch": 37.41830065359477,
"eval_loss": 1.9621520042419434,
"eval_runtime": 2.4126,
"eval_samples_per_second": 806.189,
"eval_steps_per_second": 12.849,
"step": 337
},
{
"epoch": 37.52287581699346,
"eval_loss": 1.9238826036453247,
"eval_runtime": 2.3925,
"eval_samples_per_second": 812.943,
"eval_steps_per_second": 12.957,
"step": 338
},
{
"epoch": 37.627450980392155,
"eval_loss": 1.9738985300064087,
"eval_runtime": 2.4262,
"eval_samples_per_second": 801.666,
"eval_steps_per_second": 12.777,
"step": 339
},
{
"epoch": 37.73202614379085,
"eval_loss": 1.936599612236023,
"eval_runtime": 2.446,
"eval_samples_per_second": 795.165,
"eval_steps_per_second": 12.674,
"step": 340
},
{
"epoch": 37.83660130718954,
"eval_loss": 1.9857661724090576,
"eval_runtime": 2.4387,
"eval_samples_per_second": 797.57,
"eval_steps_per_second": 12.712,
"step": 341
},
{
"epoch": 37.94117647058823,
"eval_loss": 1.9602775573730469,
"eval_runtime": 2.4623,
"eval_samples_per_second": 789.898,
"eval_steps_per_second": 12.59,
"step": 342
},
{
"epoch": 37.94117647058823,
"grad_norm": 1.8968150615692139,
"learning_rate": 5.106382978723404e-06,
"loss": 1.7378,
"step": 342
},
{
"epoch": 38.10457516339869,
"eval_loss": 1.9392098188400269,
"eval_runtime": 2.5445,
"eval_samples_per_second": 764.389,
"eval_steps_per_second": 12.183,
"step": 343
},
{
"epoch": 38.209150326797385,
"eval_loss": 1.9554734230041504,
"eval_runtime": 2.4153,
"eval_samples_per_second": 805.294,
"eval_steps_per_second": 12.835,
"step": 344
},
{
"epoch": 38.31372549019608,
"eval_loss": 1.9802982807159424,
"eval_runtime": 2.4043,
"eval_samples_per_second": 808.971,
"eval_steps_per_second": 12.894,
"step": 345
},
{
"epoch": 38.41830065359477,
"eval_loss": 1.950205683708191,
"eval_runtime": 2.408,
"eval_samples_per_second": 807.739,
"eval_steps_per_second": 12.874,
"step": 346
},
{
"epoch": 38.52287581699346,
"eval_loss": 1.959083914756775,
"eval_runtime": 2.3857,
"eval_samples_per_second": 815.264,
"eval_steps_per_second": 12.994,
"step": 347
},
{
"epoch": 38.627450980392155,
"eval_loss": 1.9582518339157104,
"eval_runtime": 2.4306,
"eval_samples_per_second": 800.213,
"eval_steps_per_second": 12.754,
"step": 348
},
{
"epoch": 38.73202614379085,
"eval_loss": 1.9507373571395874,
"eval_runtime": 2.3863,
"eval_samples_per_second": 815.084,
"eval_steps_per_second": 12.991,
"step": 349
},
{
"epoch": 38.83660130718954,
"eval_loss": 1.9410823583602905,
"eval_runtime": 2.4092,
"eval_samples_per_second": 807.311,
"eval_steps_per_second": 12.867,
"step": 350
},
{
"epoch": 38.94117647058823,
"eval_loss": 1.922089695930481,
"eval_runtime": 2.4113,
"eval_samples_per_second": 806.625,
"eval_steps_per_second": 12.856,
"step": 351
},
{
"epoch": 38.94117647058823,
"grad_norm": 1.8066309690475464,
"learning_rate": 4.680851063829788e-06,
"loss": 1.7324,
"step": 351
},
{
"epoch": 39.10457516339869,
"eval_loss": 1.9468454122543335,
"eval_runtime": 2.3833,
"eval_samples_per_second": 816.093,
"eval_steps_per_second": 13.007,
"step": 352
},
{
"epoch": 39.209150326797385,
"eval_loss": 1.9370498657226562,
"eval_runtime": 2.3853,
"eval_samples_per_second": 815.41,
"eval_steps_per_second": 12.996,
"step": 353
},
{
"epoch": 39.31372549019608,
"eval_loss": 1.9278494119644165,
"eval_runtime": 2.4316,
"eval_samples_per_second": 799.901,
"eval_steps_per_second": 12.749,
"step": 354
},
{
"epoch": 39.41830065359477,
"eval_loss": 1.9604259729385376,
"eval_runtime": 2.4996,
"eval_samples_per_second": 778.116,
"eval_steps_per_second": 12.402,
"step": 355
},
{
"epoch": 39.52287581699346,
"eval_loss": 1.9375855922698975,
"eval_runtime": 2.4112,
"eval_samples_per_second": 806.646,
"eval_steps_per_second": 12.857,
"step": 356
},
{
"epoch": 39.627450980392155,
"eval_loss": 1.9473201036453247,
"eval_runtime": 2.4652,
"eval_samples_per_second": 788.988,
"eval_steps_per_second": 12.575,
"step": 357
},
{
"epoch": 39.73202614379085,
"eval_loss": 1.9490294456481934,
"eval_runtime": 2.4617,
"eval_samples_per_second": 790.102,
"eval_steps_per_second": 12.593,
"step": 358
},
{
"epoch": 39.83660130718954,
"eval_loss": 1.913360834121704,
"eval_runtime": 2.445,
"eval_samples_per_second": 795.514,
"eval_steps_per_second": 12.679,
"step": 359
},
{
"epoch": 39.94117647058823,
"eval_loss": 1.9323056936264038,
"eval_runtime": 2.4383,
"eval_samples_per_second": 797.685,
"eval_steps_per_second": 12.714,
"step": 360
},
{
"epoch": 39.94117647058823,
"grad_norm": 1.7927449941635132,
"learning_rate": 4.255319148936171e-06,
"loss": 1.7195,
"step": 360
},
{
"epoch": 40.10457516339869,
"eval_loss": 1.9119060039520264,
"eval_runtime": 2.4209,
"eval_samples_per_second": 803.426,
"eval_steps_per_second": 12.805,
"step": 361
},
{
"epoch": 40.209150326797385,
"eval_loss": 1.939374327659607,
"eval_runtime": 2.4652,
"eval_samples_per_second": 788.974,
"eval_steps_per_second": 12.575,
"step": 362
},
{
"epoch": 40.31372549019608,
"eval_loss": 1.9959666728973389,
"eval_runtime": 2.3847,
"eval_samples_per_second": 815.601,
"eval_steps_per_second": 12.999,
"step": 363
},
{
"epoch": 40.41830065359477,
"eval_loss": 1.9789413213729858,
"eval_runtime": 2.3893,
"eval_samples_per_second": 814.046,
"eval_steps_per_second": 12.975,
"step": 364
},
{
"epoch": 40.52287581699346,
"eval_loss": 1.9750434160232544,
"eval_runtime": 2.3871,
"eval_samples_per_second": 814.781,
"eval_steps_per_second": 12.986,
"step": 365
},
{
"epoch": 40.627450980392155,
"eval_loss": 1.9399486780166626,
"eval_runtime": 2.4366,
"eval_samples_per_second": 798.229,
"eval_steps_per_second": 12.722,
"step": 366
},
{
"epoch": 40.73202614379085,
"eval_loss": 1.9516425132751465,
"eval_runtime": 2.4136,
"eval_samples_per_second": 805.85,
"eval_steps_per_second": 12.844,
"step": 367
},
{
"epoch": 40.83660130718954,
"eval_loss": 1.941023826599121,
"eval_runtime": 2.4094,
"eval_samples_per_second": 807.251,
"eval_steps_per_second": 12.866,
"step": 368
},
{
"epoch": 40.94117647058823,
"eval_loss": 1.9317693710327148,
"eval_runtime": 2.4091,
"eval_samples_per_second": 807.345,
"eval_steps_per_second": 12.868,
"step": 369
},
{
"epoch": 40.94117647058823,
"grad_norm": 1.7872660160064697,
"learning_rate": 3.8297872340425535e-06,
"loss": 1.7043,
"step": 369
},
{
"epoch": 41.10457516339869,
"eval_loss": 1.9890044927597046,
"eval_runtime": 2.414,
"eval_samples_per_second": 805.708,
"eval_steps_per_second": 12.842,
"step": 370
},
{
"epoch": 41.209150326797385,
"eval_loss": 1.9840960502624512,
"eval_runtime": 2.3959,
"eval_samples_per_second": 811.809,
"eval_steps_per_second": 12.939,
"step": 371
},
{
"epoch": 41.31372549019608,
"eval_loss": 1.9188443422317505,
"eval_runtime": 2.4069,
"eval_samples_per_second": 808.088,
"eval_steps_per_second": 12.88,
"step": 372
},
{
"epoch": 41.41830065359477,
"eval_loss": 1.9614779949188232,
"eval_runtime": 2.3886,
"eval_samples_per_second": 814.274,
"eval_steps_per_second": 12.978,
"step": 373
},
{
"epoch": 41.52287581699346,
"eval_loss": 1.9061365127563477,
"eval_runtime": 2.3924,
"eval_samples_per_second": 812.981,
"eval_steps_per_second": 12.958,
"step": 374
},
{
"epoch": 41.627450980392155,
"eval_loss": 1.909993052482605,
"eval_runtime": 2.3858,
"eval_samples_per_second": 815.236,
"eval_steps_per_second": 12.993,
"step": 375
},
{
"epoch": 41.73202614379085,
"eval_loss": 1.9422426223754883,
"eval_runtime": 2.3888,
"eval_samples_per_second": 814.201,
"eval_steps_per_second": 12.977,
"step": 376
},
{
"epoch": 41.83660130718954,
"eval_loss": 1.9640315771102905,
"eval_runtime": 2.4376,
"eval_samples_per_second": 797.911,
"eval_steps_per_second": 12.717,
"step": 377
},
{
"epoch": 41.94117647058823,
"eval_loss": 1.917662262916565,
"eval_runtime": 2.4538,
"eval_samples_per_second": 792.647,
"eval_steps_per_second": 12.633,
"step": 378
},
{
"epoch": 41.94117647058823,
"grad_norm": 1.7721134424209595,
"learning_rate": 3.4042553191489363e-06,
"loss": 1.7169,
"step": 378
},
{
"epoch": 42.10457516339869,
"eval_loss": 1.9163570404052734,
"eval_runtime": 2.4281,
"eval_samples_per_second": 801.034,
"eval_steps_per_second": 12.767,
"step": 379
},
{
"epoch": 42.209150326797385,
"eval_loss": 1.9374709129333496,
"eval_runtime": 2.4192,
"eval_samples_per_second": 803.999,
"eval_steps_per_second": 12.814,
"step": 380
},
{
"epoch": 42.31372549019608,
"eval_loss": 1.9525771141052246,
"eval_runtime": 2.4174,
"eval_samples_per_second": 804.581,
"eval_steps_per_second": 12.824,
"step": 381
},
{
"epoch": 42.41830065359477,
"eval_loss": 1.938783884048462,
"eval_runtime": 2.3891,
"eval_samples_per_second": 814.119,
"eval_steps_per_second": 12.976,
"step": 382
},
{
"epoch": 42.52287581699346,
"eval_loss": 1.9378857612609863,
"eval_runtime": 2.3879,
"eval_samples_per_second": 814.515,
"eval_steps_per_second": 12.982,
"step": 383
},
{
"epoch": 42.627450980392155,
"eval_loss": 1.931535243988037,
"eval_runtime": 2.3924,
"eval_samples_per_second": 812.994,
"eval_steps_per_second": 12.958,
"step": 384
},
{
"epoch": 42.73202614379085,
"eval_loss": 1.9418144226074219,
"eval_runtime": 2.3917,
"eval_samples_per_second": 813.219,
"eval_steps_per_second": 12.961,
"step": 385
},
{
"epoch": 42.83660130718954,
"eval_loss": 1.9460214376449585,
"eval_runtime": 2.417,
"eval_samples_per_second": 804.722,
"eval_steps_per_second": 12.826,
"step": 386
},
{
"epoch": 42.94117647058823,
"eval_loss": 1.9129729270935059,
"eval_runtime": 2.3889,
"eval_samples_per_second": 814.19,
"eval_steps_per_second": 12.977,
"step": 387
},
{
"epoch": 42.94117647058823,
"grad_norm": 1.8166015148162842,
"learning_rate": 2.978723404255319e-06,
"loss": 1.7315,
"step": 387
},
{
"epoch": 43.10457516339869,
"eval_loss": 1.9539881944656372,
"eval_runtime": 2.4601,
"eval_samples_per_second": 790.624,
"eval_steps_per_second": 12.601,
"step": 388
},
{
"epoch": 43.209150326797385,
"eval_loss": 1.951253890991211,
"eval_runtime": 2.4131,
"eval_samples_per_second": 806.019,
"eval_steps_per_second": 12.847,
"step": 389
},
{
"epoch": 43.31372549019608,
"eval_loss": 2.0078840255737305,
"eval_runtime": 2.4151,
"eval_samples_per_second": 805.365,
"eval_steps_per_second": 12.836,
"step": 390
},
{
"epoch": 43.41830065359477,
"eval_loss": 1.9754467010498047,
"eval_runtime": 2.409,
"eval_samples_per_second": 807.392,
"eval_steps_per_second": 12.868,
"step": 391
},
{
"epoch": 43.52287581699346,
"eval_loss": 1.972512125968933,
"eval_runtime": 2.4172,
"eval_samples_per_second": 804.635,
"eval_steps_per_second": 12.825,
"step": 392
},
{
"epoch": 43.627450980392155,
"eval_loss": 1.9601085186004639,
"eval_runtime": 2.4555,
"eval_samples_per_second": 792.113,
"eval_steps_per_second": 12.625,
"step": 393
},
{
"epoch": 43.73202614379085,
"eval_loss": 1.9266124963760376,
"eval_runtime": 2.4604,
"eval_samples_per_second": 790.516,
"eval_steps_per_second": 12.599,
"step": 394
},
{
"epoch": 43.83660130718954,
"eval_loss": 1.9546335935592651,
"eval_runtime": 2.467,
"eval_samples_per_second": 788.405,
"eval_steps_per_second": 12.566,
"step": 395
},
{
"epoch": 43.94117647058823,
"eval_loss": 1.918619990348816,
"eval_runtime": 2.4607,
"eval_samples_per_second": 790.428,
"eval_steps_per_second": 12.598,
"step": 396
},
{
"epoch": 43.94117647058823,
"grad_norm": 1.8490442037582397,
"learning_rate": 2.553191489361702e-06,
"loss": 1.7095,
"step": 396
},
{
"epoch": 44.10457516339869,
"eval_loss": 1.9544674158096313,
"eval_runtime": 2.5483,
"eval_samples_per_second": 763.254,
"eval_steps_per_second": 12.165,
"step": 397
},
{
"epoch": 44.209150326797385,
"eval_loss": 2.0218536853790283,
"eval_runtime": 2.578,
"eval_samples_per_second": 754.467,
"eval_steps_per_second": 12.025,
"step": 398
},
{
"epoch": 44.31372549019608,
"eval_loss": 1.9499460458755493,
"eval_runtime": 2.4731,
"eval_samples_per_second": 786.477,
"eval_steps_per_second": 12.535,
"step": 399
},
{
"epoch": 44.41830065359477,
"eval_loss": 1.9414080381393433,
"eval_runtime": 2.4805,
"eval_samples_per_second": 784.104,
"eval_steps_per_second": 12.497,
"step": 400
},
{
"epoch": 44.52287581699346,
"eval_loss": 1.961714506149292,
"eval_runtime": 2.4673,
"eval_samples_per_second": 788.301,
"eval_steps_per_second": 12.564,
"step": 401
},
{
"epoch": 44.627450980392155,
"eval_loss": 1.9939833879470825,
"eval_runtime": 2.4722,
"eval_samples_per_second": 786.751,
"eval_steps_per_second": 12.539,
"step": 402
},
{
"epoch": 44.73202614379085,
"eval_loss": 1.9617350101470947,
"eval_runtime": 2.4683,
"eval_samples_per_second": 787.977,
"eval_steps_per_second": 12.559,
"step": 403
},
{
"epoch": 44.83660130718954,
"eval_loss": 1.9692201614379883,
"eval_runtime": 2.4374,
"eval_samples_per_second": 797.97,
"eval_steps_per_second": 12.718,
"step": 404
},
{
"epoch": 44.94117647058823,
"eval_loss": 1.9219003915786743,
"eval_runtime": 2.435,
"eval_samples_per_second": 798.78,
"eval_steps_per_second": 12.731,
"step": 405
},
{
"epoch": 44.94117647058823,
"grad_norm": 1.796848177909851,
"learning_rate": 2.1276595744680853e-06,
"loss": 1.7071,
"step": 405
},
{
"epoch": 45.10457516339869,
"eval_loss": 1.9611177444458008,
"eval_runtime": 2.4195,
"eval_samples_per_second": 803.881,
"eval_steps_per_second": 12.812,
"step": 406
},
{
"epoch": 45.209150326797385,
"eval_loss": 1.9778918027877808,
"eval_runtime": 2.4197,
"eval_samples_per_second": 803.821,
"eval_steps_per_second": 12.812,
"step": 407
},
{
"epoch": 45.31372549019608,
"eval_loss": 1.9238309860229492,
"eval_runtime": 2.4635,
"eval_samples_per_second": 789.519,
"eval_steps_per_second": 12.584,
"step": 408
},
{
"epoch": 45.41830065359477,
"eval_loss": 1.9089758396148682,
"eval_runtime": 2.4247,
"eval_samples_per_second": 802.146,
"eval_steps_per_second": 12.785,
"step": 409
},
{
"epoch": 45.52287581699346,
"eval_loss": 1.9342485666275024,
"eval_runtime": 2.4113,
"eval_samples_per_second": 806.609,
"eval_steps_per_second": 12.856,
"step": 410
},
{
"epoch": 45.627450980392155,
"eval_loss": 1.9936097860336304,
"eval_runtime": 2.4133,
"eval_samples_per_second": 805.937,
"eval_steps_per_second": 12.845,
"step": 411
},
{
"epoch": 45.73202614379085,
"eval_loss": 1.897844672203064,
"eval_runtime": 2.4034,
"eval_samples_per_second": 809.278,
"eval_steps_per_second": 12.899,
"step": 412
},
{
"epoch": 45.83660130718954,
"eval_loss": 1.9208406209945679,
"eval_runtime": 2.3918,
"eval_samples_per_second": 813.195,
"eval_steps_per_second": 12.961,
"step": 413
},
{
"epoch": 45.94117647058823,
"eval_loss": 1.9177494049072266,
"eval_runtime": 2.4571,
"eval_samples_per_second": 791.578,
"eval_steps_per_second": 12.616,
"step": 414
},
{
"epoch": 45.94117647058823,
"grad_norm": 1.7152032852172852,
"learning_rate": 1.7021276595744682e-06,
"loss": 1.7116,
"step": 414
},
{
"epoch": 46.10457516339869,
"eval_loss": 1.957858681678772,
"eval_runtime": 2.4339,
"eval_samples_per_second": 799.125,
"eval_steps_per_second": 12.737,
"step": 415
},
{
"epoch": 46.209150326797385,
"eval_loss": 1.9422305822372437,
"eval_runtime": 2.4497,
"eval_samples_per_second": 793.973,
"eval_steps_per_second": 12.655,
"step": 416
},
{
"epoch": 46.31372549019608,
"eval_loss": 1.9287089109420776,
"eval_runtime": 2.423,
"eval_samples_per_second": 802.723,
"eval_steps_per_second": 12.794,
"step": 417
},
{
"epoch": 46.41830065359477,
"eval_loss": 1.9444739818572998,
"eval_runtime": 2.4956,
"eval_samples_per_second": 779.376,
"eval_steps_per_second": 12.422,
"step": 418
},
{
"epoch": 46.52287581699346,
"eval_loss": 1.923707127571106,
"eval_runtime": 2.4651,
"eval_samples_per_second": 789.011,
"eval_steps_per_second": 12.575,
"step": 419
},
{
"epoch": 46.627450980392155,
"eval_loss": 1.9269739389419556,
"eval_runtime": 2.4648,
"eval_samples_per_second": 789.121,
"eval_steps_per_second": 12.577,
"step": 420
},
{
"epoch": 46.73202614379085,
"eval_loss": 1.9492802619934082,
"eval_runtime": 2.4582,
"eval_samples_per_second": 791.229,
"eval_steps_per_second": 12.611,
"step": 421
},
{
"epoch": 46.83660130718954,
"eval_loss": 1.9743090867996216,
"eval_runtime": 2.4551,
"eval_samples_per_second": 792.232,
"eval_steps_per_second": 12.627,
"step": 422
},
{
"epoch": 46.94117647058823,
"eval_loss": 1.957751750946045,
"eval_runtime": 2.434,
"eval_samples_per_second": 799.105,
"eval_steps_per_second": 12.736,
"step": 423
},
{
"epoch": 46.94117647058823,
"grad_norm": 1.9101431369781494,
"learning_rate": 1.276595744680851e-06,
"loss": 1.733,
"step": 423
},
{
"epoch": 47.10457516339869,
"eval_loss": 1.9019426107406616,
"eval_runtime": 2.4414,
"eval_samples_per_second": 796.664,
"eval_steps_per_second": 12.697,
"step": 424
},
{
"epoch": 47.209150326797385,
"eval_loss": 1.944284200668335,
"eval_runtime": 2.4514,
"eval_samples_per_second": 793.435,
"eval_steps_per_second": 12.646,
"step": 425
},
{
"epoch": 47.31372549019608,
"eval_loss": 1.9661508798599243,
"eval_runtime": 2.4442,
"eval_samples_per_second": 795.747,
"eval_steps_per_second": 12.683,
"step": 426
},
{
"epoch": 47.41830065359477,
"eval_loss": 1.9728316068649292,
"eval_runtime": 2.4822,
"eval_samples_per_second": 783.586,
"eval_steps_per_second": 12.489,
"step": 427
},
{
"epoch": 47.52287581699346,
"eval_loss": 1.923363208770752,
"eval_runtime": 2.4644,
"eval_samples_per_second": 789.251,
"eval_steps_per_second": 12.579,
"step": 428
},
{
"epoch": 47.627450980392155,
"eval_loss": 1.9165805578231812,
"eval_runtime": 2.3965,
"eval_samples_per_second": 811.601,
"eval_steps_per_second": 12.936,
"step": 429
},
{
"epoch": 47.73202614379085,
"eval_loss": 1.9413442611694336,
"eval_runtime": 2.4408,
"eval_samples_per_second": 796.869,
"eval_steps_per_second": 12.701,
"step": 430
},
{
"epoch": 47.83660130718954,
"eval_loss": 1.8855735063552856,
"eval_runtime": 2.4507,
"eval_samples_per_second": 793.659,
"eval_steps_per_second": 12.65,
"step": 431
},
{
"epoch": 47.94117647058823,
"eval_loss": 1.952731728553772,
"eval_runtime": 2.5241,
"eval_samples_per_second": 770.567,
"eval_steps_per_second": 12.282,
"step": 432
},
{
"epoch": 47.94117647058823,
"grad_norm": 1.7582765817642212,
"learning_rate": 8.510638297872341e-07,
"loss": 1.7065,
"step": 432
},
{
"epoch": 48.10457516339869,
"eval_loss": 1.9524160623550415,
"eval_runtime": 2.4138,
"eval_samples_per_second": 805.783,
"eval_steps_per_second": 12.843,
"step": 433
},
{
"epoch": 48.209150326797385,
"eval_loss": 1.9682825803756714,
"eval_runtime": 2.5144,
"eval_samples_per_second": 773.543,
"eval_steps_per_second": 12.329,
"step": 434
},
{
"epoch": 48.31372549019608,
"eval_loss": 1.9489309787750244,
"eval_runtime": 2.5744,
"eval_samples_per_second": 755.51,
"eval_steps_per_second": 12.042,
"step": 435
},
{
"epoch": 48.41830065359477,
"eval_loss": 1.9564448595046997,
"eval_runtime": 2.4521,
"eval_samples_per_second": 793.202,
"eval_steps_per_second": 12.642,
"step": 436
},
{
"epoch": 48.52287581699346,
"eval_loss": 1.9767297506332397,
"eval_runtime": 2.4404,
"eval_samples_per_second": 796.999,
"eval_steps_per_second": 12.703,
"step": 437
},
{
"epoch": 48.627450980392155,
"eval_loss": 1.9058864116668701,
"eval_runtime": 2.4851,
"eval_samples_per_second": 782.665,
"eval_steps_per_second": 12.474,
"step": 438
},
{
"epoch": 48.73202614379085,
"eval_loss": 1.9629017114639282,
"eval_runtime": 2.4371,
"eval_samples_per_second": 798.081,
"eval_steps_per_second": 12.72,
"step": 439
},
{
"epoch": 48.83660130718954,
"eval_loss": 1.937988519668579,
"eval_runtime": 2.4379,
"eval_samples_per_second": 797.822,
"eval_steps_per_second": 12.716,
"step": 440
},
{
"epoch": 48.94117647058823,
"eval_loss": 1.9694868326187134,
"eval_runtime": 2.4378,
"eval_samples_per_second": 797.855,
"eval_steps_per_second": 12.716,
"step": 441
},
{
"epoch": 48.94117647058823,
"grad_norm": 1.9079190492630005,
"learning_rate": 4.2553191489361704e-07,
"loss": 1.6997,
"step": 441
},
{
"epoch": 49.10457516339869,
"eval_loss": 1.9250315427780151,
"eval_runtime": 2.4353,
"eval_samples_per_second": 798.679,
"eval_steps_per_second": 12.73,
"step": 442
},
{
"epoch": 49.209150326797385,
"eval_loss": 1.9391655921936035,
"eval_runtime": 2.4099,
"eval_samples_per_second": 807.094,
"eval_steps_per_second": 12.864,
"step": 443
},
{
"epoch": 49.31372549019608,
"eval_loss": 1.9522807598114014,
"eval_runtime": 2.4344,
"eval_samples_per_second": 798.963,
"eval_steps_per_second": 12.734,
"step": 444
},
{
"epoch": 49.41830065359477,
"eval_loss": 1.9459158182144165,
"eval_runtime": 2.4156,
"eval_samples_per_second": 805.199,
"eval_steps_per_second": 12.834,
"step": 445
},
{
"epoch": 49.52287581699346,
"eval_loss": 1.9365217685699463,
"eval_runtime": 2.4071,
"eval_samples_per_second": 808.028,
"eval_steps_per_second": 12.879,
"step": 446
},
{
"epoch": 49.627450980392155,
"eval_loss": 1.9133816957473755,
"eval_runtime": 2.3906,
"eval_samples_per_second": 813.594,
"eval_steps_per_second": 12.967,
"step": 447
},
{
"epoch": 49.73202614379085,
"eval_loss": 1.9433826208114624,
"eval_runtime": 2.4005,
"eval_samples_per_second": 810.237,
"eval_steps_per_second": 12.914,
"step": 448
},
{
"epoch": 49.83660130718954,
"eval_loss": 1.9771692752838135,
"eval_runtime": 2.3888,
"eval_samples_per_second": 814.201,
"eval_steps_per_second": 12.977,
"step": 449
},
{
"epoch": 49.94117647058823,
"eval_loss": 1.9776495695114136,
"eval_runtime": 2.3848,
"eval_samples_per_second": 815.59,
"eval_steps_per_second": 12.999,
"step": 450
},
{
"epoch": 49.94117647058823,
"grad_norm": 1.8996864557266235,
"learning_rate": 0.0,
"loss": 1.6668,
"step": 450
},
{
"epoch": 49.94117647058823,
"step": 450,
"total_flos": 1.2548402868338688e+16,
"train_loss": 1.8820014402601455,
"train_runtime": 2628.924,
"train_samples_per_second": 185.114,
"train_steps_per_second": 0.171
}
],
"logging_steps": 500,
"max_steps": 450,
"num_input_tokens_seen": 0,
"num_train_epochs": 50,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.2548402868338688e+16,
"train_batch_size": 64,
"trial_name": null,
"trial_params": null
}