POLAR-G3-4b-Commerce_Law-sft / trainer_state.json
CocoRoF's picture
Upload content from checkpoint-320
7c96822 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 5.0,
"eval_steps": 100.0,
"global_step": 320,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.015625,
"grad_norm": 179.0,
"learning_rate": 0.0,
"loss": 5.2367,
"mean_token_accuracy": 0.43311607837677,
"step": 1
},
{
"epoch": 0.03125,
"grad_norm": 198.0,
"learning_rate": 3.75e-07,
"loss": 5.5486,
"mean_token_accuracy": 0.4108841121196747,
"step": 2
},
{
"epoch": 0.046875,
"grad_norm": 178.0,
"learning_rate": 7.5e-07,
"loss": 5.2141,
"mean_token_accuracy": 0.4345736503601074,
"step": 3
},
{
"epoch": 0.0625,
"grad_norm": 172.0,
"learning_rate": 1.125e-06,
"loss": 5.5836,
"mean_token_accuracy": 0.41250425577163696,
"step": 4
},
{
"epoch": 0.078125,
"grad_norm": 152.0,
"learning_rate": 1.5e-06,
"loss": 5.2296,
"mean_token_accuracy": 0.4149760901927948,
"step": 5
},
{
"epoch": 0.09375,
"grad_norm": 126.5,
"learning_rate": 1.875e-06,
"loss": 4.6572,
"mean_token_accuracy": 0.4508528411388397,
"step": 6
},
{
"epoch": 0.109375,
"grad_norm": 122.0,
"learning_rate": 2.25e-06,
"loss": 4.8441,
"mean_token_accuracy": 0.4253324270248413,
"step": 7
},
{
"epoch": 0.125,
"grad_norm": 93.5,
"learning_rate": 2.6250000000000003e-06,
"loss": 4.3849,
"mean_token_accuracy": 0.4506120979785919,
"step": 8
},
{
"epoch": 0.140625,
"grad_norm": 73.0,
"learning_rate": 3e-06,
"loss": 4.0401,
"mean_token_accuracy": 0.45889315009117126,
"step": 9
},
{
"epoch": 0.15625,
"grad_norm": 60.75,
"learning_rate": 2.9903846153846156e-06,
"loss": 3.8834,
"mean_token_accuracy": 0.47038498520851135,
"step": 10
},
{
"epoch": 0.171875,
"grad_norm": 50.5,
"learning_rate": 2.9807692307692307e-06,
"loss": 3.6623,
"mean_token_accuracy": 0.4854481816291809,
"step": 11
},
{
"epoch": 0.1875,
"grad_norm": 43.5,
"learning_rate": 2.9711538461538463e-06,
"loss": 3.4374,
"mean_token_accuracy": 0.4968532621860504,
"step": 12
},
{
"epoch": 0.203125,
"grad_norm": 45.5,
"learning_rate": 2.961538461538462e-06,
"loss": 3.2283,
"mean_token_accuracy": 0.513970673084259,
"step": 13
},
{
"epoch": 0.21875,
"grad_norm": 35.25,
"learning_rate": 2.951923076923077e-06,
"loss": 3.1423,
"mean_token_accuracy": 0.5192499160766602,
"step": 14
},
{
"epoch": 0.234375,
"grad_norm": 32.0,
"learning_rate": 2.942307692307692e-06,
"loss": 2.9815,
"mean_token_accuracy": 0.5281234383583069,
"step": 15
},
{
"epoch": 0.25,
"grad_norm": 32.5,
"learning_rate": 2.9326923076923076e-06,
"loss": 2.863,
"mean_token_accuracy": 0.5459433197975159,
"step": 16
},
{
"epoch": 0.265625,
"grad_norm": 27.5,
"learning_rate": 2.923076923076923e-06,
"loss": 2.8314,
"mean_token_accuracy": 0.5349738597869873,
"step": 17
},
{
"epoch": 0.28125,
"grad_norm": 26.125,
"learning_rate": 2.9134615384615387e-06,
"loss": 2.7488,
"mean_token_accuracy": 0.5427058339118958,
"step": 18
},
{
"epoch": 0.296875,
"grad_norm": 23.625,
"learning_rate": 2.903846153846154e-06,
"loss": 2.6134,
"mean_token_accuracy": 0.5453544855117798,
"step": 19
},
{
"epoch": 0.3125,
"grad_norm": 23.75,
"learning_rate": 2.8942307692307693e-06,
"loss": 2.562,
"mean_token_accuracy": 0.5491897463798523,
"step": 20
},
{
"epoch": 0.328125,
"grad_norm": 22.625,
"learning_rate": 2.884615384615385e-06,
"loss": 2.4742,
"mean_token_accuracy": 0.5496063232421875,
"step": 21
},
{
"epoch": 0.34375,
"grad_norm": 21.875,
"learning_rate": 2.875e-06,
"loss": 2.3924,
"mean_token_accuracy": 0.5709417462348938,
"step": 22
},
{
"epoch": 0.359375,
"grad_norm": 22.625,
"learning_rate": 2.8653846153846155e-06,
"loss": 2.2842,
"mean_token_accuracy": 0.5809605717658997,
"step": 23
},
{
"epoch": 0.375,
"grad_norm": 21.125,
"learning_rate": 2.8557692307692307e-06,
"loss": 2.1831,
"mean_token_accuracy": 0.5968478918075562,
"step": 24
},
{
"epoch": 0.390625,
"grad_norm": 21.875,
"learning_rate": 2.846153846153846e-06,
"loss": 2.1536,
"mean_token_accuracy": 0.602466344833374,
"step": 25
},
{
"epoch": 0.40625,
"grad_norm": 21.5,
"learning_rate": 2.8365384615384613e-06,
"loss": 2.0267,
"mean_token_accuracy": 0.6342177987098694,
"step": 26
},
{
"epoch": 0.421875,
"grad_norm": 23.25,
"learning_rate": 2.826923076923077e-06,
"loss": 1.9421,
"mean_token_accuracy": 0.6432027220726013,
"step": 27
},
{
"epoch": 0.4375,
"grad_norm": 23.5,
"learning_rate": 2.8173076923076924e-06,
"loss": 1.9156,
"mean_token_accuracy": 0.6579385995864868,
"step": 28
},
{
"epoch": 0.453125,
"grad_norm": 24.125,
"learning_rate": 2.807692307692308e-06,
"loss": 1.8596,
"mean_token_accuracy": 0.6711886525154114,
"step": 29
},
{
"epoch": 0.46875,
"grad_norm": 21.5,
"learning_rate": 2.798076923076923e-06,
"loss": 1.7841,
"mean_token_accuracy": 0.6769159436225891,
"step": 30
},
{
"epoch": 0.484375,
"grad_norm": 19.25,
"learning_rate": 2.7884615384615386e-06,
"loss": 1.7306,
"mean_token_accuracy": 0.6835744976997375,
"step": 31
},
{
"epoch": 0.5,
"grad_norm": 19.375,
"learning_rate": 2.778846153846154e-06,
"loss": 1.6979,
"mean_token_accuracy": 0.6885009407997131,
"step": 32
},
{
"epoch": 0.515625,
"grad_norm": 19.625,
"learning_rate": 2.7692307692307693e-06,
"loss": 1.6705,
"mean_token_accuracy": 0.6899428963661194,
"step": 33
},
{
"epoch": 0.53125,
"grad_norm": 19.375,
"learning_rate": 2.7596153846153844e-06,
"loss": 1.5569,
"mean_token_accuracy": 0.7047604918479919,
"step": 34
},
{
"epoch": 0.546875,
"grad_norm": 25.25,
"learning_rate": 2.75e-06,
"loss": 1.5437,
"mean_token_accuracy": 0.7077205777168274,
"step": 35
},
{
"epoch": 0.5625,
"grad_norm": 18.0,
"learning_rate": 2.7403846153846155e-06,
"loss": 1.5079,
"mean_token_accuracy": 0.721455454826355,
"step": 36
},
{
"epoch": 0.578125,
"grad_norm": 16.125,
"learning_rate": 2.7307692307692306e-06,
"loss": 1.4751,
"mean_token_accuracy": 0.731259822845459,
"step": 37
},
{
"epoch": 0.59375,
"grad_norm": 13.875,
"learning_rate": 2.721153846153846e-06,
"loss": 1.4444,
"mean_token_accuracy": 0.721596896648407,
"step": 38
},
{
"epoch": 0.609375,
"grad_norm": 17.25,
"learning_rate": 2.7115384615384617e-06,
"loss": 1.4267,
"mean_token_accuracy": 0.7311121821403503,
"step": 39
},
{
"epoch": 0.625,
"grad_norm": 14.6875,
"learning_rate": 2.7019230769230772e-06,
"loss": 1.4606,
"mean_token_accuracy": 0.7229064106941223,
"step": 40
},
{
"epoch": 0.640625,
"grad_norm": 12.0625,
"learning_rate": 2.6923076923076923e-06,
"loss": 1.4415,
"mean_token_accuracy": 0.7228537201881409,
"step": 41
},
{
"epoch": 0.65625,
"grad_norm": 12.1875,
"learning_rate": 2.682692307692308e-06,
"loss": 1.3615,
"mean_token_accuracy": 0.7459965944290161,
"step": 42
},
{
"epoch": 0.671875,
"grad_norm": 12.0625,
"learning_rate": 2.6730769230769234e-06,
"loss": 1.3769,
"mean_token_accuracy": 0.7359253764152527,
"step": 43
},
{
"epoch": 0.6875,
"grad_norm": 12.0,
"learning_rate": 2.6634615384615385e-06,
"loss": 1.3322,
"mean_token_accuracy": 0.7482958436012268,
"step": 44
},
{
"epoch": 0.703125,
"grad_norm": 11.3125,
"learning_rate": 2.6538461538461537e-06,
"loss": 1.3471,
"mean_token_accuracy": 0.738798975944519,
"step": 45
},
{
"epoch": 0.71875,
"grad_norm": 12.8125,
"learning_rate": 2.644230769230769e-06,
"loss": 1.2892,
"mean_token_accuracy": 0.7512567043304443,
"step": 46
},
{
"epoch": 0.734375,
"grad_norm": 12.25,
"learning_rate": 2.6346153846153847e-06,
"loss": 1.2592,
"mean_token_accuracy": 0.7561485171318054,
"step": 47
},
{
"epoch": 0.75,
"grad_norm": 13.875,
"learning_rate": 2.6250000000000003e-06,
"loss": 1.289,
"mean_token_accuracy": 0.7504905462265015,
"step": 48
},
{
"epoch": 0.765625,
"grad_norm": 17.25,
"learning_rate": 2.6153846153846154e-06,
"loss": 1.2412,
"mean_token_accuracy": 0.7566288113594055,
"step": 49
},
{
"epoch": 0.78125,
"grad_norm": 24.375,
"learning_rate": 2.605769230769231e-06,
"loss": 1.2232,
"mean_token_accuracy": 0.7667444348335266,
"step": 50
},
{
"epoch": 0.796875,
"grad_norm": 34.25,
"learning_rate": 2.5961538461538465e-06,
"loss": 1.2176,
"mean_token_accuracy": 0.7597681879997253,
"step": 51
},
{
"epoch": 0.8125,
"grad_norm": 31.0,
"learning_rate": 2.5865384615384616e-06,
"loss": 1.2091,
"mean_token_accuracy": 0.7564249634742737,
"step": 52
},
{
"epoch": 0.828125,
"grad_norm": 9.4375,
"learning_rate": 2.5769230769230767e-06,
"loss": 1.1706,
"mean_token_accuracy": 0.7586262226104736,
"step": 53
},
{
"epoch": 0.84375,
"grad_norm": 9.3125,
"learning_rate": 2.5673076923076923e-06,
"loss": 1.2491,
"mean_token_accuracy": 0.7373932600021362,
"step": 54
},
{
"epoch": 0.859375,
"grad_norm": 8.8125,
"learning_rate": 2.557692307692308e-06,
"loss": 1.2008,
"mean_token_accuracy": 0.7576074004173279,
"step": 55
},
{
"epoch": 0.875,
"grad_norm": 7.8125,
"learning_rate": 2.548076923076923e-06,
"loss": 1.1553,
"mean_token_accuracy": 0.7607351541519165,
"step": 56
},
{
"epoch": 0.890625,
"grad_norm": 7.875,
"learning_rate": 2.5384615384615385e-06,
"loss": 1.1896,
"mean_token_accuracy": 0.7548706531524658,
"step": 57
},
{
"epoch": 0.90625,
"grad_norm": 13.6875,
"learning_rate": 2.528846153846154e-06,
"loss": 1.156,
"mean_token_accuracy": 0.7589271664619446,
"step": 58
},
{
"epoch": 0.921875,
"grad_norm": 12.375,
"learning_rate": 2.5192307692307695e-06,
"loss": 1.1924,
"mean_token_accuracy": 0.7581174373626709,
"step": 59
},
{
"epoch": 0.9375,
"grad_norm": 8.875,
"learning_rate": 2.5096153846153847e-06,
"loss": 1.193,
"mean_token_accuracy": 0.7505763173103333,
"step": 60
},
{
"epoch": 0.953125,
"grad_norm": 7.53125,
"learning_rate": 2.5e-06,
"loss": 1.118,
"mean_token_accuracy": 0.7625401020050049,
"step": 61
},
{
"epoch": 0.96875,
"grad_norm": 7.03125,
"learning_rate": 2.4903846153846157e-06,
"loss": 1.1186,
"mean_token_accuracy": 0.7640949487686157,
"step": 62
},
{
"epoch": 0.984375,
"grad_norm": 6.9375,
"learning_rate": 2.480769230769231e-06,
"loss": 1.1178,
"mean_token_accuracy": 0.7689425945281982,
"step": 63
},
{
"epoch": 1.0,
"grad_norm": 9.75,
"learning_rate": 2.471153846153846e-06,
"loss": 1.1485,
"mean_token_accuracy": 0.7646326422691345,
"step": 64
},
{
"epoch": 1.015625,
"grad_norm": 15.3125,
"learning_rate": 2.4615384615384615e-06,
"loss": 1.1362,
"mean_token_accuracy": 0.7596017122268677,
"step": 65
},
{
"epoch": 1.03125,
"grad_norm": 11.5625,
"learning_rate": 2.451923076923077e-06,
"loss": 1.1227,
"mean_token_accuracy": 0.7617481350898743,
"step": 66
},
{
"epoch": 1.046875,
"grad_norm": 29.125,
"learning_rate": 2.442307692307692e-06,
"loss": 1.0983,
"mean_token_accuracy": 0.7637330293655396,
"step": 67
},
{
"epoch": 1.0625,
"grad_norm": 11.0625,
"learning_rate": 2.4326923076923077e-06,
"loss": 1.1244,
"mean_token_accuracy": 0.75943922996521,
"step": 68
},
{
"epoch": 1.078125,
"grad_norm": 6.78125,
"learning_rate": 2.4230769230769233e-06,
"loss": 1.1029,
"mean_token_accuracy": 0.7678335309028625,
"step": 69
},
{
"epoch": 1.09375,
"grad_norm": 8.25,
"learning_rate": 2.413461538461539e-06,
"loss": 1.1107,
"mean_token_accuracy": 0.766456127166748,
"step": 70
},
{
"epoch": 1.109375,
"grad_norm": 6.5,
"learning_rate": 2.403846153846154e-06,
"loss": 1.0539,
"mean_token_accuracy": 0.7779717445373535,
"step": 71
},
{
"epoch": 1.125,
"grad_norm": 11.125,
"learning_rate": 2.3942307692307695e-06,
"loss": 1.1745,
"mean_token_accuracy": 0.7527372241020203,
"step": 72
},
{
"epoch": 1.140625,
"grad_norm": 6.65625,
"learning_rate": 2.3846153846153846e-06,
"loss": 1.1154,
"mean_token_accuracy": 0.7618581652641296,
"step": 73
},
{
"epoch": 1.15625,
"grad_norm": 6.65625,
"learning_rate": 2.375e-06,
"loss": 1.0716,
"mean_token_accuracy": 0.7723715305328369,
"step": 74
},
{
"epoch": 1.171875,
"grad_norm": 7.09375,
"learning_rate": 2.3653846153846152e-06,
"loss": 1.0951,
"mean_token_accuracy": 0.7622343897819519,
"step": 75
},
{
"epoch": 1.1875,
"grad_norm": 6.375,
"learning_rate": 2.355769230769231e-06,
"loss": 1.1046,
"mean_token_accuracy": 0.768226146697998,
"step": 76
},
{
"epoch": 1.203125,
"grad_norm": 8.0,
"learning_rate": 2.3461538461538463e-06,
"loss": 1.1724,
"mean_token_accuracy": 0.7464057803153992,
"step": 77
},
{
"epoch": 1.21875,
"grad_norm": 6.96875,
"learning_rate": 2.3365384615384615e-06,
"loss": 1.0518,
"mean_token_accuracy": 0.7725957036018372,
"step": 78
},
{
"epoch": 1.234375,
"grad_norm": 7.03125,
"learning_rate": 2.326923076923077e-06,
"loss": 1.0578,
"mean_token_accuracy": 0.7661430835723877,
"step": 79
},
{
"epoch": 1.25,
"grad_norm": 7.15625,
"learning_rate": 2.3173076923076925e-06,
"loss": 1.0921,
"mean_token_accuracy": 0.7706676125526428,
"step": 80
},
{
"epoch": 1.265625,
"grad_norm": 7.9375,
"learning_rate": 2.307692307692308e-06,
"loss": 1.0947,
"mean_token_accuracy": 0.7600434422492981,
"step": 81
},
{
"epoch": 1.28125,
"grad_norm": 6.28125,
"learning_rate": 2.298076923076923e-06,
"loss": 1.0152,
"mean_token_accuracy": 0.7791481614112854,
"step": 82
},
{
"epoch": 1.296875,
"grad_norm": 6.40625,
"learning_rate": 2.2884615384615383e-06,
"loss": 1.1093,
"mean_token_accuracy": 0.7621586918830872,
"step": 83
},
{
"epoch": 1.3125,
"grad_norm": 6.0625,
"learning_rate": 2.278846153846154e-06,
"loss": 1.0595,
"mean_token_accuracy": 0.7689430713653564,
"step": 84
},
{
"epoch": 1.328125,
"grad_norm": 25.625,
"learning_rate": 2.2692307692307694e-06,
"loss": 1.1264,
"mean_token_accuracy": 0.7479003667831421,
"step": 85
},
{
"epoch": 1.34375,
"grad_norm": 6.9375,
"learning_rate": 2.2596153846153845e-06,
"loss": 1.0278,
"mean_token_accuracy": 0.7810243964195251,
"step": 86
},
{
"epoch": 1.359375,
"grad_norm": 12.0,
"learning_rate": 2.25e-06,
"loss": 1.0898,
"mean_token_accuracy": 0.7608263492584229,
"step": 87
},
{
"epoch": 1.375,
"grad_norm": 6.96875,
"learning_rate": 2.2403846153846156e-06,
"loss": 1.0646,
"mean_token_accuracy": 0.7628912925720215,
"step": 88
},
{
"epoch": 1.390625,
"grad_norm": 6.3125,
"learning_rate": 2.2307692307692307e-06,
"loss": 1.1259,
"mean_token_accuracy": 0.747133195400238,
"step": 89
},
{
"epoch": 1.40625,
"grad_norm": 6.96875,
"learning_rate": 2.2211538461538463e-06,
"loss": 1.0527,
"mean_token_accuracy": 0.769545316696167,
"step": 90
},
{
"epoch": 1.421875,
"grad_norm": 6.78125,
"learning_rate": 2.211538461538462e-06,
"loss": 1.0857,
"mean_token_accuracy": 0.7563959956169128,
"step": 91
},
{
"epoch": 1.4375,
"grad_norm": 6.6875,
"learning_rate": 2.201923076923077e-06,
"loss": 1.0779,
"mean_token_accuracy": 0.7625620365142822,
"step": 92
},
{
"epoch": 1.453125,
"grad_norm": 11.0,
"learning_rate": 2.192307692307692e-06,
"loss": 1.0615,
"mean_token_accuracy": 0.7684396505355835,
"step": 93
},
{
"epoch": 1.46875,
"grad_norm": 7.53125,
"learning_rate": 2.1826923076923076e-06,
"loss": 1.0067,
"mean_token_accuracy": 0.773740291595459,
"step": 94
},
{
"epoch": 1.484375,
"grad_norm": 6.40625,
"learning_rate": 2.173076923076923e-06,
"loss": 1.0334,
"mean_token_accuracy": 0.7674839496612549,
"step": 95
},
{
"epoch": 1.5,
"grad_norm": 9.3125,
"learning_rate": 2.1634615384615387e-06,
"loss": 1.0772,
"mean_token_accuracy": 0.7548962235450745,
"step": 96
},
{
"epoch": 1.515625,
"grad_norm": 20.25,
"learning_rate": 2.1538461538461538e-06,
"loss": 1.0953,
"mean_token_accuracy": 0.7568336129188538,
"step": 97
},
{
"epoch": 1.53125,
"grad_norm": 6.84375,
"learning_rate": 2.1442307692307693e-06,
"loss": 1.0656,
"mean_token_accuracy": 0.7618821859359741,
"step": 98
},
{
"epoch": 1.546875,
"grad_norm": 6.96875,
"learning_rate": 2.134615384615385e-06,
"loss": 0.9638,
"mean_token_accuracy": 0.7852028608322144,
"step": 99
},
{
"epoch": 1.5625,
"grad_norm": 16.25,
"learning_rate": 2.125e-06,
"loss": 1.057,
"mean_token_accuracy": 0.7648255228996277,
"step": 100
},
{
"epoch": 1.578125,
"grad_norm": 6.59375,
"learning_rate": 2.1153846153846155e-06,
"loss": 1.0351,
"mean_token_accuracy": 0.7706173062324524,
"step": 101
},
{
"epoch": 1.59375,
"grad_norm": 19.375,
"learning_rate": 2.1057692307692306e-06,
"loss": 1.0262,
"mean_token_accuracy": 0.7676799297332764,
"step": 102
},
{
"epoch": 1.609375,
"grad_norm": 6.40625,
"learning_rate": 2.096153846153846e-06,
"loss": 0.9901,
"mean_token_accuracy": 0.7759581208229065,
"step": 103
},
{
"epoch": 1.625,
"grad_norm": 8.5,
"learning_rate": 2.0865384615384613e-06,
"loss": 0.9888,
"mean_token_accuracy": 0.7782798409461975,
"step": 104
},
{
"epoch": 1.640625,
"grad_norm": 6.90625,
"learning_rate": 2.076923076923077e-06,
"loss": 0.9912,
"mean_token_accuracy": 0.7795735001564026,
"step": 105
},
{
"epoch": 1.65625,
"grad_norm": 8.6875,
"learning_rate": 2.0673076923076924e-06,
"loss": 1.0046,
"mean_token_accuracy": 0.7750487923622131,
"step": 106
},
{
"epoch": 1.671875,
"grad_norm": 6.59375,
"learning_rate": 2.057692307692308e-06,
"loss": 1.0527,
"mean_token_accuracy": 0.7730545997619629,
"step": 107
},
{
"epoch": 1.6875,
"grad_norm": 7.0,
"learning_rate": 2.048076923076923e-06,
"loss": 0.9943,
"mean_token_accuracy": 0.7790677547454834,
"step": 108
},
{
"epoch": 1.703125,
"grad_norm": 52.25,
"learning_rate": 2.0384615384615386e-06,
"loss": 1.0243,
"mean_token_accuracy": 0.7736808657646179,
"step": 109
},
{
"epoch": 1.71875,
"grad_norm": 6.65625,
"learning_rate": 2.028846153846154e-06,
"loss": 0.9599,
"mean_token_accuracy": 0.7829864025115967,
"step": 110
},
{
"epoch": 1.734375,
"grad_norm": 6.875,
"learning_rate": 2.0192307692307692e-06,
"loss": 0.993,
"mean_token_accuracy": 0.784246563911438,
"step": 111
},
{
"epoch": 1.75,
"grad_norm": 15.9375,
"learning_rate": 2.0096153846153844e-06,
"loss": 1.0284,
"mean_token_accuracy": 0.7701562643051147,
"step": 112
},
{
"epoch": 1.765625,
"grad_norm": 11.25,
"learning_rate": 2e-06,
"loss": 1.0453,
"mean_token_accuracy": 0.765605092048645,
"step": 113
},
{
"epoch": 1.78125,
"grad_norm": 11.25,
"learning_rate": 1.9903846153846155e-06,
"loss": 0.9963,
"mean_token_accuracy": 0.7780460119247437,
"step": 114
},
{
"epoch": 1.796875,
"grad_norm": 7.8125,
"learning_rate": 1.9807692307692306e-06,
"loss": 1.0104,
"mean_token_accuracy": 0.7722231149673462,
"step": 115
},
{
"epoch": 1.8125,
"grad_norm": 7.46875,
"learning_rate": 1.971153846153846e-06,
"loss": 0.9977,
"mean_token_accuracy": 0.7763562798500061,
"step": 116
},
{
"epoch": 1.828125,
"grad_norm": 7.09375,
"learning_rate": 1.9615384615384617e-06,
"loss": 0.9994,
"mean_token_accuracy": 0.7727492451667786,
"step": 117
},
{
"epoch": 1.84375,
"grad_norm": 6.71875,
"learning_rate": 1.951923076923077e-06,
"loss": 0.9729,
"mean_token_accuracy": 0.7792785167694092,
"step": 118
},
{
"epoch": 1.859375,
"grad_norm": 10.25,
"learning_rate": 1.9423076923076923e-06,
"loss": 1.0092,
"mean_token_accuracy": 0.7684956789016724,
"step": 119
},
{
"epoch": 1.875,
"grad_norm": 7.53125,
"learning_rate": 1.932692307692308e-06,
"loss": 0.962,
"mean_token_accuracy": 0.7807108163833618,
"step": 120
},
{
"epoch": 1.890625,
"grad_norm": 7.0,
"learning_rate": 1.9230769230769234e-06,
"loss": 1.0035,
"mean_token_accuracy": 0.7749958634376526,
"step": 121
},
{
"epoch": 1.90625,
"grad_norm": 9.3125,
"learning_rate": 1.9134615384615385e-06,
"loss": 1.0121,
"mean_token_accuracy": 0.7693877816200256,
"step": 122
},
{
"epoch": 1.921875,
"grad_norm": 6.46875,
"learning_rate": 1.9038461538461538e-06,
"loss": 0.9797,
"mean_token_accuracy": 0.7859560251235962,
"step": 123
},
{
"epoch": 1.9375,
"grad_norm": 6.65625,
"learning_rate": 1.8942307692307692e-06,
"loss": 0.9071,
"mean_token_accuracy": 0.7927353978157043,
"step": 124
},
{
"epoch": 1.953125,
"grad_norm": 6.5,
"learning_rate": 1.8846153846153847e-06,
"loss": 0.8898,
"mean_token_accuracy": 0.7983871102333069,
"step": 125
},
{
"epoch": 1.96875,
"grad_norm": 11.625,
"learning_rate": 1.875e-06,
"loss": 0.9509,
"mean_token_accuracy": 0.7818240523338318,
"step": 126
},
{
"epoch": 1.984375,
"grad_norm": 23.0,
"learning_rate": 1.8653846153846154e-06,
"loss": 0.9974,
"mean_token_accuracy": 0.7754906415939331,
"step": 127
},
{
"epoch": 2.0,
"grad_norm": 16.25,
"learning_rate": 1.855769230769231e-06,
"loss": 0.9983,
"mean_token_accuracy": 0.7626262903213501,
"step": 128
},
{
"epoch": 2.015625,
"grad_norm": 107.5,
"learning_rate": 1.8461538461538462e-06,
"loss": 0.981,
"mean_token_accuracy": 0.7775020599365234,
"step": 129
},
{
"epoch": 2.03125,
"grad_norm": 78.0,
"learning_rate": 1.8365384615384618e-06,
"loss": 1.0013,
"mean_token_accuracy": 0.777273416519165,
"step": 130
},
{
"epoch": 2.046875,
"grad_norm": 15.75,
"learning_rate": 1.826923076923077e-06,
"loss": 1.0124,
"mean_token_accuracy": 0.7745746970176697,
"step": 131
},
{
"epoch": 2.0625,
"grad_norm": 6.75,
"learning_rate": 1.8173076923076922e-06,
"loss": 1.0109,
"mean_token_accuracy": 0.7626786828041077,
"step": 132
},
{
"epoch": 2.078125,
"grad_norm": 10.75,
"learning_rate": 1.8076923076923076e-06,
"loss": 0.9854,
"mean_token_accuracy": 0.7782512903213501,
"step": 133
},
{
"epoch": 2.09375,
"grad_norm": 13.1875,
"learning_rate": 1.7980769230769231e-06,
"loss": 0.9558,
"mean_token_accuracy": 0.7810325026512146,
"step": 134
},
{
"epoch": 2.109375,
"grad_norm": 9.875,
"learning_rate": 1.7884615384615384e-06,
"loss": 0.9925,
"mean_token_accuracy": 0.7802160978317261,
"step": 135
},
{
"epoch": 2.125,
"grad_norm": 7.40625,
"learning_rate": 1.778846153846154e-06,
"loss": 0.9899,
"mean_token_accuracy": 0.7759094834327698,
"step": 136
},
{
"epoch": 2.140625,
"grad_norm": 6.28125,
"learning_rate": 1.7692307692307693e-06,
"loss": 0.9509,
"mean_token_accuracy": 0.7890124917030334,
"step": 137
},
{
"epoch": 2.15625,
"grad_norm": 8.1875,
"learning_rate": 1.7596153846153846e-06,
"loss": 0.9832,
"mean_token_accuracy": 0.7752114534378052,
"step": 138
},
{
"epoch": 2.171875,
"grad_norm": 6.25,
"learning_rate": 1.7500000000000002e-06,
"loss": 0.9225,
"mean_token_accuracy": 0.7903473973274231,
"step": 139
},
{
"epoch": 2.1875,
"grad_norm": 5.84375,
"learning_rate": 1.7403846153846155e-06,
"loss": 0.9476,
"mean_token_accuracy": 0.7873295545578003,
"step": 140
},
{
"epoch": 2.203125,
"grad_norm": 20.625,
"learning_rate": 1.7307692307692306e-06,
"loss": 1.0055,
"mean_token_accuracy": 0.7686769962310791,
"step": 141
},
{
"epoch": 2.21875,
"grad_norm": 26.625,
"learning_rate": 1.7211538461538462e-06,
"loss": 1.0145,
"mean_token_accuracy": 0.7676523923873901,
"step": 142
},
{
"epoch": 2.234375,
"grad_norm": 7.4375,
"learning_rate": 1.7115384615384615e-06,
"loss": 0.9358,
"mean_token_accuracy": 0.7874542474746704,
"step": 143
},
{
"epoch": 2.25,
"grad_norm": 23.875,
"learning_rate": 1.7019230769230768e-06,
"loss": 1.0153,
"mean_token_accuracy": 0.768985390663147,
"step": 144
},
{
"epoch": 2.265625,
"grad_norm": 6.625,
"learning_rate": 1.6923076923076924e-06,
"loss": 0.9844,
"mean_token_accuracy": 0.7775037288665771,
"step": 145
},
{
"epoch": 2.28125,
"grad_norm": 7.03125,
"learning_rate": 1.6826923076923077e-06,
"loss": 0.9851,
"mean_token_accuracy": 0.7740368843078613,
"step": 146
},
{
"epoch": 2.296875,
"grad_norm": 44.25,
"learning_rate": 1.6730769230769232e-06,
"loss": 0.9817,
"mean_token_accuracy": 0.7741778492927551,
"step": 147
},
{
"epoch": 2.3125,
"grad_norm": 13.3125,
"learning_rate": 1.6634615384615386e-06,
"loss": 0.9974,
"mean_token_accuracy": 0.7730644941329956,
"step": 148
},
{
"epoch": 2.328125,
"grad_norm": 5.625,
"learning_rate": 1.653846153846154e-06,
"loss": 0.927,
"mean_token_accuracy": 0.7886461615562439,
"step": 149
},
{
"epoch": 2.34375,
"grad_norm": 6.9375,
"learning_rate": 1.6442307692307695e-06,
"loss": 0.8944,
"mean_token_accuracy": 0.795914888381958,
"step": 150
},
{
"epoch": 2.359375,
"grad_norm": 8.625,
"learning_rate": 1.6346153846153846e-06,
"loss": 0.9641,
"mean_token_accuracy": 0.7850483059883118,
"step": 151
},
{
"epoch": 2.375,
"grad_norm": 61.25,
"learning_rate": 1.625e-06,
"loss": 0.9894,
"mean_token_accuracy": 0.7669225335121155,
"step": 152
},
{
"epoch": 2.390625,
"grad_norm": 13.9375,
"learning_rate": 1.6153846153846154e-06,
"loss": 0.9687,
"mean_token_accuracy": 0.7841285467147827,
"step": 153
},
{
"epoch": 2.40625,
"grad_norm": 25.375,
"learning_rate": 1.6057692307692308e-06,
"loss": 0.9431,
"mean_token_accuracy": 0.7892006039619446,
"step": 154
},
{
"epoch": 2.421875,
"grad_norm": 6.28125,
"learning_rate": 1.596153846153846e-06,
"loss": 0.9615,
"mean_token_accuracy": 0.7802021503448486,
"step": 155
},
{
"epoch": 2.4375,
"grad_norm": 9.4375,
"learning_rate": 1.5865384615384616e-06,
"loss": 0.912,
"mean_token_accuracy": 0.7976588606834412,
"step": 156
},
{
"epoch": 2.453125,
"grad_norm": 6.1875,
"learning_rate": 1.576923076923077e-06,
"loss": 0.944,
"mean_token_accuracy": 0.7864833474159241,
"step": 157
},
{
"epoch": 2.46875,
"grad_norm": 10.4375,
"learning_rate": 1.5673076923076925e-06,
"loss": 0.9251,
"mean_token_accuracy": 0.7923402786254883,
"step": 158
},
{
"epoch": 2.484375,
"grad_norm": 14.0625,
"learning_rate": 1.5576923076923078e-06,
"loss": 0.9495,
"mean_token_accuracy": 0.7810263633728027,
"step": 159
},
{
"epoch": 2.5,
"grad_norm": 27.75,
"learning_rate": 1.5480769230769232e-06,
"loss": 1.0118,
"mean_token_accuracy": 0.7756314873695374,
"step": 160
},
{
"epoch": 2.515625,
"grad_norm": 6.1875,
"learning_rate": 1.5384615384615383e-06,
"loss": 0.8795,
"mean_token_accuracy": 0.8030520677566528,
"step": 161
},
{
"epoch": 2.53125,
"grad_norm": 10.75,
"learning_rate": 1.5288461538461538e-06,
"loss": 0.993,
"mean_token_accuracy": 0.7751861214637756,
"step": 162
},
{
"epoch": 2.546875,
"grad_norm": 6.34375,
"learning_rate": 1.5192307692307692e-06,
"loss": 0.9337,
"mean_token_accuracy": 0.7878151535987854,
"step": 163
},
{
"epoch": 2.5625,
"grad_norm": 6.4375,
"learning_rate": 1.5096153846153847e-06,
"loss": 0.9229,
"mean_token_accuracy": 0.7985841631889343,
"step": 164
},
{
"epoch": 2.578125,
"grad_norm": 6.03125,
"learning_rate": 1.5e-06,
"loss": 0.9183,
"mean_token_accuracy": 0.7949057221412659,
"step": 165
},
{
"epoch": 2.59375,
"grad_norm": 8.1875,
"learning_rate": 1.4903846153846154e-06,
"loss": 0.9715,
"mean_token_accuracy": 0.7781420946121216,
"step": 166
},
{
"epoch": 2.609375,
"grad_norm": 5.90625,
"learning_rate": 1.480769230769231e-06,
"loss": 0.9473,
"mean_token_accuracy": 0.7875062227249146,
"step": 167
},
{
"epoch": 2.625,
"grad_norm": 5.75,
"learning_rate": 1.471153846153846e-06,
"loss": 0.9271,
"mean_token_accuracy": 0.7928742170333862,
"step": 168
},
{
"epoch": 2.640625,
"grad_norm": 13.25,
"learning_rate": 1.4615384615384616e-06,
"loss": 0.9739,
"mean_token_accuracy": 0.7802754640579224,
"step": 169
},
{
"epoch": 2.65625,
"grad_norm": 6.34375,
"learning_rate": 1.451923076923077e-06,
"loss": 0.9205,
"mean_token_accuracy": 0.7956867218017578,
"step": 170
},
{
"epoch": 2.671875,
"grad_norm": 20.125,
"learning_rate": 1.4423076923076924e-06,
"loss": 0.9228,
"mean_token_accuracy": 0.7963763475418091,
"step": 171
},
{
"epoch": 2.6875,
"grad_norm": 7.8125,
"learning_rate": 1.4326923076923078e-06,
"loss": 0.9705,
"mean_token_accuracy": 0.7838233113288879,
"step": 172
},
{
"epoch": 2.703125,
"grad_norm": 7.625,
"learning_rate": 1.423076923076923e-06,
"loss": 0.9856,
"mean_token_accuracy": 0.777830958366394,
"step": 173
},
{
"epoch": 2.71875,
"grad_norm": 5.90625,
"learning_rate": 1.4134615384615384e-06,
"loss": 0.9123,
"mean_token_accuracy": 0.7965753674507141,
"step": 174
},
{
"epoch": 2.734375,
"grad_norm": 6.59375,
"learning_rate": 1.403846153846154e-06,
"loss": 0.8977,
"mean_token_accuracy": 0.7950465679168701,
"step": 175
},
{
"epoch": 2.75,
"grad_norm": 5.96875,
"learning_rate": 1.3942307692307693e-06,
"loss": 0.9534,
"mean_token_accuracy": 0.7896023392677307,
"step": 176
},
{
"epoch": 2.765625,
"grad_norm": 51.5,
"learning_rate": 1.3846153846153846e-06,
"loss": 0.9886,
"mean_token_accuracy": 0.7733170390129089,
"step": 177
},
{
"epoch": 2.78125,
"grad_norm": 6.25,
"learning_rate": 1.375e-06,
"loss": 0.9304,
"mean_token_accuracy": 0.7874513864517212,
"step": 178
},
{
"epoch": 2.796875,
"grad_norm": 9.1875,
"learning_rate": 1.3653846153846153e-06,
"loss": 0.9909,
"mean_token_accuracy": 0.7773162722587585,
"step": 179
},
{
"epoch": 2.8125,
"grad_norm": 17.375,
"learning_rate": 1.3557692307692308e-06,
"loss": 0.9561,
"mean_token_accuracy": 0.7817555665969849,
"step": 180
},
{
"epoch": 2.828125,
"grad_norm": 6.15625,
"learning_rate": 1.3461538461538462e-06,
"loss": 0.9449,
"mean_token_accuracy": 0.7880972623825073,
"step": 181
},
{
"epoch": 2.84375,
"grad_norm": 6.375,
"learning_rate": 1.3365384615384617e-06,
"loss": 0.9588,
"mean_token_accuracy": 0.7854828238487244,
"step": 182
},
{
"epoch": 2.859375,
"grad_norm": 33.75,
"learning_rate": 1.3269230769230768e-06,
"loss": 0.9635,
"mean_token_accuracy": 0.7801363468170166,
"step": 183
},
{
"epoch": 2.875,
"grad_norm": 6.53125,
"learning_rate": 1.3173076923076924e-06,
"loss": 0.9914,
"mean_token_accuracy": 0.7746433615684509,
"step": 184
},
{
"epoch": 2.890625,
"grad_norm": 6.4375,
"learning_rate": 1.3076923076923077e-06,
"loss": 0.9492,
"mean_token_accuracy": 0.7791839838027954,
"step": 185
},
{
"epoch": 2.90625,
"grad_norm": 6.65625,
"learning_rate": 1.2980769230769232e-06,
"loss": 0.9667,
"mean_token_accuracy": 0.7795350551605225,
"step": 186
},
{
"epoch": 2.921875,
"grad_norm": 10.0625,
"learning_rate": 1.2884615384615384e-06,
"loss": 0.9588,
"mean_token_accuracy": 0.7847999930381775,
"step": 187
},
{
"epoch": 2.9375,
"grad_norm": 25.375,
"learning_rate": 1.278846153846154e-06,
"loss": 0.8759,
"mean_token_accuracy": 0.8001649975776672,
"step": 188
},
{
"epoch": 2.953125,
"grad_norm": 8.5625,
"learning_rate": 1.2692307692307692e-06,
"loss": 0.9675,
"mean_token_accuracy": 0.7719402313232422,
"step": 189
},
{
"epoch": 2.96875,
"grad_norm": 6.9375,
"learning_rate": 1.2596153846153848e-06,
"loss": 0.9304,
"mean_token_accuracy": 0.7856206893920898,
"step": 190
},
{
"epoch": 2.984375,
"grad_norm": 6.1875,
"learning_rate": 1.25e-06,
"loss": 0.9542,
"mean_token_accuracy": 0.78487229347229,
"step": 191
},
{
"epoch": 3.0,
"grad_norm": 10.8125,
"learning_rate": 1.2403846153846154e-06,
"loss": 0.9371,
"mean_token_accuracy": 0.7693274617195129,
"step": 192
},
{
"epoch": 3.015625,
"grad_norm": 6.25,
"learning_rate": 1.2307692307692308e-06,
"loss": 0.892,
"mean_token_accuracy": 0.7988702654838562,
"step": 193
},
{
"epoch": 3.03125,
"grad_norm": 18.375,
"learning_rate": 1.221153846153846e-06,
"loss": 0.9524,
"mean_token_accuracy": 0.782721221446991,
"step": 194
},
{
"epoch": 3.046875,
"grad_norm": 15.25,
"learning_rate": 1.2115384615384616e-06,
"loss": 0.9835,
"mean_token_accuracy": 0.7716888189315796,
"step": 195
},
{
"epoch": 3.0625,
"grad_norm": 6.03125,
"learning_rate": 1.201923076923077e-06,
"loss": 0.8871,
"mean_token_accuracy": 0.8021034002304077,
"step": 196
},
{
"epoch": 3.078125,
"grad_norm": 35.0,
"learning_rate": 1.1923076923076923e-06,
"loss": 0.9402,
"mean_token_accuracy": 0.7832167744636536,
"step": 197
},
{
"epoch": 3.09375,
"grad_norm": 8.75,
"learning_rate": 1.1826923076923076e-06,
"loss": 0.991,
"mean_token_accuracy": 0.7750375866889954,
"step": 198
},
{
"epoch": 3.109375,
"grad_norm": 20.25,
"learning_rate": 1.1730769230769232e-06,
"loss": 0.906,
"mean_token_accuracy": 0.7941325902938843,
"step": 199
},
{
"epoch": 3.125,
"grad_norm": 6.15625,
"learning_rate": 1.1634615384615385e-06,
"loss": 0.9131,
"mean_token_accuracy": 0.7984575629234314,
"step": 200
},
{
"epoch": 3.140625,
"grad_norm": 6.3125,
"learning_rate": 1.153846153846154e-06,
"loss": 0.823,
"mean_token_accuracy": 0.8103417158126831,
"step": 201
},
{
"epoch": 3.15625,
"grad_norm": 7.0,
"learning_rate": 1.1442307692307692e-06,
"loss": 0.9495,
"mean_token_accuracy": 0.7842703461647034,
"step": 202
},
{
"epoch": 3.171875,
"grad_norm": 7.4375,
"learning_rate": 1.1346153846153847e-06,
"loss": 0.9537,
"mean_token_accuracy": 0.7833541035652161,
"step": 203
},
{
"epoch": 3.1875,
"grad_norm": 8.8125,
"learning_rate": 1.125e-06,
"loss": 0.9689,
"mean_token_accuracy": 0.7763713002204895,
"step": 204
},
{
"epoch": 3.203125,
"grad_norm": 23.375,
"learning_rate": 1.1153846153846154e-06,
"loss": 0.9487,
"mean_token_accuracy": 0.7811124920845032,
"step": 205
},
{
"epoch": 3.21875,
"grad_norm": 6.0625,
"learning_rate": 1.105769230769231e-06,
"loss": 0.8658,
"mean_token_accuracy": 0.8113903999328613,
"step": 206
},
{
"epoch": 3.234375,
"grad_norm": 11.625,
"learning_rate": 1.096153846153846e-06,
"loss": 0.9577,
"mean_token_accuracy": 0.7787481546401978,
"step": 207
},
{
"epoch": 3.25,
"grad_norm": 6.4375,
"learning_rate": 1.0865384615384616e-06,
"loss": 0.9042,
"mean_token_accuracy": 0.7923972010612488,
"step": 208
},
{
"epoch": 3.265625,
"grad_norm": 6.6875,
"learning_rate": 1.0769230769230769e-06,
"loss": 0.9379,
"mean_token_accuracy": 0.7835002541542053,
"step": 209
},
{
"epoch": 3.28125,
"grad_norm": 5.9375,
"learning_rate": 1.0673076923076924e-06,
"loss": 0.9172,
"mean_token_accuracy": 0.7931802868843079,
"step": 210
},
{
"epoch": 3.296875,
"grad_norm": 7.875,
"learning_rate": 1.0576923076923078e-06,
"loss": 0.9593,
"mean_token_accuracy": 0.7814356684684753,
"step": 211
},
{
"epoch": 3.3125,
"grad_norm": 30.125,
"learning_rate": 1.048076923076923e-06,
"loss": 0.9164,
"mean_token_accuracy": 0.7888871431350708,
"step": 212
},
{
"epoch": 3.328125,
"grad_norm": 6.28125,
"learning_rate": 1.0384615384615384e-06,
"loss": 0.9056,
"mean_token_accuracy": 0.7965211868286133,
"step": 213
},
{
"epoch": 3.34375,
"grad_norm": 6.53125,
"learning_rate": 1.028846153846154e-06,
"loss": 0.948,
"mean_token_accuracy": 0.7837017178535461,
"step": 214
},
{
"epoch": 3.359375,
"grad_norm": 6.3125,
"learning_rate": 1.0192307692307693e-06,
"loss": 0.902,
"mean_token_accuracy": 0.8006668090820312,
"step": 215
},
{
"epoch": 3.375,
"grad_norm": 5.75,
"learning_rate": 1.0096153846153846e-06,
"loss": 0.8923,
"mean_token_accuracy": 0.7972199320793152,
"step": 216
},
{
"epoch": 3.390625,
"grad_norm": 6.1875,
"learning_rate": 1e-06,
"loss": 0.8828,
"mean_token_accuracy": 0.8011194467544556,
"step": 217
},
{
"epoch": 3.40625,
"grad_norm": 6.28125,
"learning_rate": 9.903846153846153e-07,
"loss": 0.8411,
"mean_token_accuracy": 0.8033494353294373,
"step": 218
},
{
"epoch": 3.421875,
"grad_norm": 33.75,
"learning_rate": 9.807692307692308e-07,
"loss": 0.916,
"mean_token_accuracy": 0.7819077372550964,
"step": 219
},
{
"epoch": 3.4375,
"grad_norm": 6.40625,
"learning_rate": 9.711538461538462e-07,
"loss": 0.9401,
"mean_token_accuracy": 0.7851645946502686,
"step": 220
},
{
"epoch": 3.453125,
"grad_norm": 6.15625,
"learning_rate": 9.615384615384617e-07,
"loss": 0.8603,
"mean_token_accuracy": 0.8017836213111877,
"step": 221
},
{
"epoch": 3.46875,
"grad_norm": 7.5,
"learning_rate": 9.519230769230769e-07,
"loss": 0.9541,
"mean_token_accuracy": 0.7865185141563416,
"step": 222
},
{
"epoch": 3.484375,
"grad_norm": 8.625,
"learning_rate": 9.423076923076924e-07,
"loss": 0.9576,
"mean_token_accuracy": 0.774846613407135,
"step": 223
},
{
"epoch": 3.5,
"grad_norm": 15.25,
"learning_rate": 9.326923076923077e-07,
"loss": 0.9012,
"mean_token_accuracy": 0.7876802086830139,
"step": 224
},
{
"epoch": 3.515625,
"grad_norm": 25.5,
"learning_rate": 9.230769230769231e-07,
"loss": 0.9489,
"mean_token_accuracy": 0.7805652618408203,
"step": 225
},
{
"epoch": 3.53125,
"grad_norm": 6.28125,
"learning_rate": 9.134615384615385e-07,
"loss": 0.874,
"mean_token_accuracy": 0.7986671328544617,
"step": 226
},
{
"epoch": 3.546875,
"grad_norm": 23.5,
"learning_rate": 9.038461538461538e-07,
"loss": 0.966,
"mean_token_accuracy": 0.7745603322982788,
"step": 227
},
{
"epoch": 3.5625,
"grad_norm": 6.46875,
"learning_rate": 8.942307692307692e-07,
"loss": 0.8936,
"mean_token_accuracy": 0.7917812466621399,
"step": 228
},
{
"epoch": 3.578125,
"grad_norm": 10.875,
"learning_rate": 8.846153846153847e-07,
"loss": 0.9755,
"mean_token_accuracy": 0.7724282145500183,
"step": 229
},
{
"epoch": 3.59375,
"grad_norm": 21.875,
"learning_rate": 8.750000000000001e-07,
"loss": 0.9574,
"mean_token_accuracy": 0.7761261463165283,
"step": 230
},
{
"epoch": 3.609375,
"grad_norm": 20.625,
"learning_rate": 8.653846153846153e-07,
"loss": 0.9784,
"mean_token_accuracy": 0.7752029299736023,
"step": 231
},
{
"epoch": 3.625,
"grad_norm": 6.25,
"learning_rate": 8.557692307692308e-07,
"loss": 0.8936,
"mean_token_accuracy": 0.796856701374054,
"step": 232
},
{
"epoch": 3.640625,
"grad_norm": 6.21875,
"learning_rate": 8.461538461538462e-07,
"loss": 0.9131,
"mean_token_accuracy": 0.7863614559173584,
"step": 233
},
{
"epoch": 3.65625,
"grad_norm": 5.9375,
"learning_rate": 8.365384615384616e-07,
"loss": 0.8746,
"mean_token_accuracy": 0.7959774732589722,
"step": 234
},
{
"epoch": 3.671875,
"grad_norm": 24.625,
"learning_rate": 8.26923076923077e-07,
"loss": 0.9224,
"mean_token_accuracy": 0.7824280858039856,
"step": 235
},
{
"epoch": 3.6875,
"grad_norm": 27.125,
"learning_rate": 8.173076923076923e-07,
"loss": 0.9327,
"mean_token_accuracy": 0.7815178632736206,
"step": 236
},
{
"epoch": 3.703125,
"grad_norm": 6.09375,
"learning_rate": 8.076923076923077e-07,
"loss": 0.9501,
"mean_token_accuracy": 0.7856559157371521,
"step": 237
},
{
"epoch": 3.71875,
"grad_norm": 20.25,
"learning_rate": 7.98076923076923e-07,
"loss": 0.8748,
"mean_token_accuracy": 0.7932746410369873,
"step": 238
},
{
"epoch": 3.734375,
"grad_norm": 6.625,
"learning_rate": 7.884615384615385e-07,
"loss": 0.9192,
"mean_token_accuracy": 0.7865205407142639,
"step": 239
},
{
"epoch": 3.75,
"grad_norm": 6.4375,
"learning_rate": 7.788461538461539e-07,
"loss": 0.9443,
"mean_token_accuracy": 0.7778134346008301,
"step": 240
},
{
"epoch": 3.765625,
"grad_norm": 6.0,
"learning_rate": 7.692307692307691e-07,
"loss": 0.9398,
"mean_token_accuracy": 0.7858214974403381,
"step": 241
},
{
"epoch": 3.78125,
"grad_norm": 5.9375,
"learning_rate": 7.596153846153846e-07,
"loss": 0.873,
"mean_token_accuracy": 0.7989254593849182,
"step": 242
},
{
"epoch": 3.796875,
"grad_norm": 6.21875,
"learning_rate": 7.5e-07,
"loss": 0.9991,
"mean_token_accuracy": 0.7662928104400635,
"step": 243
},
{
"epoch": 3.8125,
"grad_norm": 11.75,
"learning_rate": 7.403846153846155e-07,
"loss": 0.8945,
"mean_token_accuracy": 0.7896128296852112,
"step": 244
},
{
"epoch": 3.828125,
"grad_norm": 6.375,
"learning_rate": 7.307692307692308e-07,
"loss": 0.917,
"mean_token_accuracy": 0.7882217764854431,
"step": 245
},
{
"epoch": 3.84375,
"grad_norm": 6.28125,
"learning_rate": 7.211538461538462e-07,
"loss": 0.9661,
"mean_token_accuracy": 0.7709052562713623,
"step": 246
},
{
"epoch": 3.859375,
"grad_norm": 6.125,
"learning_rate": 7.115384615384616e-07,
"loss": 0.9326,
"mean_token_accuracy": 0.7830464243888855,
"step": 247
},
{
"epoch": 3.875,
"grad_norm": 8.5,
"learning_rate": 7.01923076923077e-07,
"loss": 0.9376,
"mean_token_accuracy": 0.7810230255126953,
"step": 248
},
{
"epoch": 3.890625,
"grad_norm": 14.0,
"learning_rate": 6.923076923076923e-07,
"loss": 0.9734,
"mean_token_accuracy": 0.7753646373748779,
"step": 249
},
{
"epoch": 3.90625,
"grad_norm": 12.8125,
"learning_rate": 6.826923076923076e-07,
"loss": 0.8922,
"mean_token_accuracy": 0.7958292961120605,
"step": 250
},
{
"epoch": 3.921875,
"grad_norm": 5.71875,
"learning_rate": 6.730769230769231e-07,
"loss": 0.9053,
"mean_token_accuracy": 0.7927485704421997,
"step": 251
},
{
"epoch": 3.9375,
"grad_norm": 6.21875,
"learning_rate": 6.634615384615384e-07,
"loss": 0.8707,
"mean_token_accuracy": 0.8024818897247314,
"step": 252
},
{
"epoch": 3.953125,
"grad_norm": 6.09375,
"learning_rate": 6.538461538461538e-07,
"loss": 0.8756,
"mean_token_accuracy": 0.7953294515609741,
"step": 253
},
{
"epoch": 3.96875,
"grad_norm": 16.75,
"learning_rate": 6.442307692307692e-07,
"loss": 0.9806,
"mean_token_accuracy": 0.7700640559196472,
"step": 254
},
{
"epoch": 3.984375,
"grad_norm": 6.15625,
"learning_rate": 6.346153846153846e-07,
"loss": 0.9052,
"mean_token_accuracy": 0.7910767197608948,
"step": 255
},
{
"epoch": 4.0,
"grad_norm": 6.1875,
"learning_rate": 6.25e-07,
"loss": 0.8756,
"mean_token_accuracy": 0.7918968796730042,
"step": 256
},
{
"epoch": 4.015625,
"grad_norm": 6.15625,
"learning_rate": 6.153846153846154e-07,
"loss": 0.8858,
"mean_token_accuracy": 0.7959042191505432,
"step": 257
},
{
"epoch": 4.03125,
"grad_norm": 6.8125,
"learning_rate": 6.057692307692308e-07,
"loss": 0.9302,
"mean_token_accuracy": 0.7876441478729248,
"step": 258
},
{
"epoch": 4.046875,
"grad_norm": 7.96875,
"learning_rate": 5.961538461538461e-07,
"loss": 0.9649,
"mean_token_accuracy": 0.7740775346755981,
"step": 259
},
{
"epoch": 4.0625,
"grad_norm": 19.125,
"learning_rate": 5.865384615384616e-07,
"loss": 0.9184,
"mean_token_accuracy": 0.7818529605865479,
"step": 260
},
{
"epoch": 4.078125,
"grad_norm": 11.375,
"learning_rate": 5.76923076923077e-07,
"loss": 0.912,
"mean_token_accuracy": 0.7843265533447266,
"step": 261
},
{
"epoch": 4.09375,
"grad_norm": 6.84375,
"learning_rate": 5.673076923076923e-07,
"loss": 0.9025,
"mean_token_accuracy": 0.7933535575866699,
"step": 262
},
{
"epoch": 4.109375,
"grad_norm": 6.1875,
"learning_rate": 5.576923076923077e-07,
"loss": 0.9127,
"mean_token_accuracy": 0.7852448225021362,
"step": 263
},
{
"epoch": 4.125,
"grad_norm": 6.03125,
"learning_rate": 5.48076923076923e-07,
"loss": 0.9111,
"mean_token_accuracy": 0.7866109013557434,
"step": 264
},
{
"epoch": 4.140625,
"grad_norm": 6.21875,
"learning_rate": 5.384615384615384e-07,
"loss": 0.892,
"mean_token_accuracy": 0.7890470027923584,
"step": 265
},
{
"epoch": 4.15625,
"grad_norm": 12.0625,
"learning_rate": 5.288461538461539e-07,
"loss": 0.8986,
"mean_token_accuracy": 0.7962346076965332,
"step": 266
},
{
"epoch": 4.171875,
"grad_norm": 29.0,
"learning_rate": 5.192307692307692e-07,
"loss": 0.9209,
"mean_token_accuracy": 0.786607563495636,
"step": 267
},
{
"epoch": 4.1875,
"grad_norm": 6.25,
"learning_rate": 5.096153846153846e-07,
"loss": 0.904,
"mean_token_accuracy": 0.7876223921775818,
"step": 268
},
{
"epoch": 4.203125,
"grad_norm": 7.96875,
"learning_rate": 5e-07,
"loss": 0.9383,
"mean_token_accuracy": 0.7814289927482605,
"step": 269
},
{
"epoch": 4.21875,
"grad_norm": 6.15625,
"learning_rate": 4.903846153846154e-07,
"loss": 0.9128,
"mean_token_accuracy": 0.7915287613868713,
"step": 270
},
{
"epoch": 4.234375,
"grad_norm": 6.78125,
"learning_rate": 4.807692307692308e-07,
"loss": 0.9481,
"mean_token_accuracy": 0.7819157242774963,
"step": 271
},
{
"epoch": 4.25,
"grad_norm": 6.25,
"learning_rate": 4.711538461538462e-07,
"loss": 0.8902,
"mean_token_accuracy": 0.7958080768585205,
"step": 272
},
{
"epoch": 4.265625,
"grad_norm": 7.65625,
"learning_rate": 4.6153846153846156e-07,
"loss": 0.977,
"mean_token_accuracy": 0.7705891132354736,
"step": 273
},
{
"epoch": 4.28125,
"grad_norm": 18.125,
"learning_rate": 4.519230769230769e-07,
"loss": 0.9237,
"mean_token_accuracy": 0.7899967432022095,
"step": 274
},
{
"epoch": 4.296875,
"grad_norm": 8.0,
"learning_rate": 4.4230769230769233e-07,
"loss": 0.8679,
"mean_token_accuracy": 0.7969164252281189,
"step": 275
},
{
"epoch": 4.3125,
"grad_norm": 6.03125,
"learning_rate": 4.3269230769230766e-07,
"loss": 0.8823,
"mean_token_accuracy": 0.7964279651641846,
"step": 276
},
{
"epoch": 4.328125,
"grad_norm": 9.6875,
"learning_rate": 4.230769230769231e-07,
"loss": 0.8829,
"mean_token_accuracy": 0.7917771935462952,
"step": 277
},
{
"epoch": 4.34375,
"grad_norm": 5.9375,
"learning_rate": 4.134615384615385e-07,
"loss": 0.8448,
"mean_token_accuracy": 0.80218505859375,
"step": 278
},
{
"epoch": 4.359375,
"grad_norm": 23.875,
"learning_rate": 4.0384615384615386e-07,
"loss": 0.9189,
"mean_token_accuracy": 0.7808871865272522,
"step": 279
},
{
"epoch": 4.375,
"grad_norm": 11.9375,
"learning_rate": 3.9423076923076924e-07,
"loss": 0.9288,
"mean_token_accuracy": 0.7887097001075745,
"step": 280
},
{
"epoch": 4.390625,
"grad_norm": 6.0625,
"learning_rate": 3.846153846153846e-07,
"loss": 0.9306,
"mean_token_accuracy": 0.7884582877159119,
"step": 281
},
{
"epoch": 4.40625,
"grad_norm": 6.40625,
"learning_rate": 3.75e-07,
"loss": 0.9234,
"mean_token_accuracy": 0.7834262847900391,
"step": 282
},
{
"epoch": 4.421875,
"grad_norm": 6.1875,
"learning_rate": 3.653846153846154e-07,
"loss": 0.8417,
"mean_token_accuracy": 0.8057113885879517,
"step": 283
},
{
"epoch": 4.4375,
"grad_norm": 6.59375,
"learning_rate": 3.557692307692308e-07,
"loss": 0.9138,
"mean_token_accuracy": 0.7837575078010559,
"step": 284
},
{
"epoch": 4.453125,
"grad_norm": 33.25,
"learning_rate": 3.4615384615384616e-07,
"loss": 0.8612,
"mean_token_accuracy": 0.7957943081855774,
"step": 285
},
{
"epoch": 4.46875,
"grad_norm": 5.71875,
"learning_rate": 3.3653846153846154e-07,
"loss": 0.8749,
"mean_token_accuracy": 0.8011859059333801,
"step": 286
},
{
"epoch": 4.484375,
"grad_norm": 6.5,
"learning_rate": 3.269230769230769e-07,
"loss": 0.8933,
"mean_token_accuracy": 0.7931327819824219,
"step": 287
},
{
"epoch": 4.5,
"grad_norm": 5.96875,
"learning_rate": 3.173076923076923e-07,
"loss": 0.873,
"mean_token_accuracy": 0.799176812171936,
"step": 288
},
{
"epoch": 4.515625,
"grad_norm": 29.25,
"learning_rate": 3.076923076923077e-07,
"loss": 0.9831,
"mean_token_accuracy": 0.7707536816596985,
"step": 289
},
{
"epoch": 4.53125,
"grad_norm": 13.0,
"learning_rate": 2.980769230769231e-07,
"loss": 0.9484,
"mean_token_accuracy": 0.7785703539848328,
"step": 290
},
{
"epoch": 4.546875,
"grad_norm": 6.875,
"learning_rate": 2.884615384615385e-07,
"loss": 0.9039,
"mean_token_accuracy": 0.7895255088806152,
"step": 291
},
{
"epoch": 4.5625,
"grad_norm": 6.625,
"learning_rate": 2.7884615384615384e-07,
"loss": 0.9063,
"mean_token_accuracy": 0.7905294299125671,
"step": 292
},
{
"epoch": 4.578125,
"grad_norm": 7.5,
"learning_rate": 2.692307692307692e-07,
"loss": 0.8963,
"mean_token_accuracy": 0.7899447083473206,
"step": 293
},
{
"epoch": 4.59375,
"grad_norm": 6.84375,
"learning_rate": 2.596153846153846e-07,
"loss": 0.94,
"mean_token_accuracy": 0.7839446663856506,
"step": 294
},
{
"epoch": 4.609375,
"grad_norm": 5.875,
"learning_rate": 2.5e-07,
"loss": 0.8796,
"mean_token_accuracy": 0.7935015559196472,
"step": 295
},
{
"epoch": 4.625,
"grad_norm": 10.125,
"learning_rate": 2.403846153846154e-07,
"loss": 0.8859,
"mean_token_accuracy": 0.789797842502594,
"step": 296
},
{
"epoch": 4.640625,
"grad_norm": 6.5,
"learning_rate": 2.3076923076923078e-07,
"loss": 0.9302,
"mean_token_accuracy": 0.7839468121528625,
"step": 297
},
{
"epoch": 4.65625,
"grad_norm": 6.28125,
"learning_rate": 2.2115384615384616e-07,
"loss": 0.9524,
"mean_token_accuracy": 0.7737887501716614,
"step": 298
},
{
"epoch": 4.671875,
"grad_norm": 8.125,
"learning_rate": 2.1153846153846155e-07,
"loss": 0.9181,
"mean_token_accuracy": 0.7865311503410339,
"step": 299
},
{
"epoch": 4.6875,
"grad_norm": 6.8125,
"learning_rate": 2.0192307692307693e-07,
"loss": 0.8934,
"mean_token_accuracy": 0.789657473564148,
"step": 300
},
{
"epoch": 4.703125,
"grad_norm": 6.4375,
"learning_rate": 1.923076923076923e-07,
"loss": 0.9065,
"mean_token_accuracy": 0.789890468120575,
"step": 301
},
{
"epoch": 4.71875,
"grad_norm": 6.0625,
"learning_rate": 1.826923076923077e-07,
"loss": 0.8889,
"mean_token_accuracy": 0.7914140820503235,
"step": 302
},
{
"epoch": 4.734375,
"grad_norm": 14.0,
"learning_rate": 1.7307692307692308e-07,
"loss": 0.9113,
"mean_token_accuracy": 0.7846829891204834,
"step": 303
},
{
"epoch": 4.75,
"grad_norm": 7.34375,
"learning_rate": 1.6346153846153846e-07,
"loss": 0.9374,
"mean_token_accuracy": 0.7835116386413574,
"step": 304
},
{
"epoch": 4.765625,
"grad_norm": 11.375,
"learning_rate": 1.5384615384615385e-07,
"loss": 0.8824,
"mean_token_accuracy": 0.7850437760353088,
"step": 305
},
{
"epoch": 4.78125,
"grad_norm": 7.1875,
"learning_rate": 1.4423076923076925e-07,
"loss": 0.9408,
"mean_token_accuracy": 0.7855393886566162,
"step": 306
},
{
"epoch": 4.796875,
"grad_norm": 8.3125,
"learning_rate": 1.346153846153846e-07,
"loss": 0.9047,
"mean_token_accuracy": 0.7834271788597107,
"step": 307
},
{
"epoch": 4.8125,
"grad_norm": 8.0625,
"learning_rate": 1.25e-07,
"loss": 0.9188,
"mean_token_accuracy": 0.7803459167480469,
"step": 308
},
{
"epoch": 4.828125,
"grad_norm": 20.25,
"learning_rate": 1.1538461538461539e-07,
"loss": 0.9627,
"mean_token_accuracy": 0.7762289047241211,
"step": 309
},
{
"epoch": 4.84375,
"grad_norm": 10.6875,
"learning_rate": 1.0576923076923077e-07,
"loss": 0.884,
"mean_token_accuracy": 0.7895846962928772,
"step": 310
},
{
"epoch": 4.859375,
"grad_norm": 6.53125,
"learning_rate": 9.615384615384614e-08,
"loss": 0.894,
"mean_token_accuracy": 0.792397677898407,
"step": 311
},
{
"epoch": 4.875,
"grad_norm": 10.6875,
"learning_rate": 8.653846153846154e-08,
"loss": 0.9112,
"mean_token_accuracy": 0.786827564239502,
"step": 312
},
{
"epoch": 4.890625,
"grad_norm": 6.6875,
"learning_rate": 7.692307692307692e-08,
"loss": 0.9368,
"mean_token_accuracy": 0.7861944437026978,
"step": 313
},
{
"epoch": 4.90625,
"grad_norm": 10.625,
"learning_rate": 6.73076923076923e-08,
"loss": 0.9033,
"mean_token_accuracy": 0.7840073704719543,
"step": 314
},
{
"epoch": 4.921875,
"grad_norm": 10.25,
"learning_rate": 5.7692307692307695e-08,
"loss": 0.8777,
"mean_token_accuracy": 0.7938881516456604,
"step": 315
},
{
"epoch": 4.9375,
"grad_norm": 6.5625,
"learning_rate": 4.807692307692307e-08,
"loss": 0.9133,
"mean_token_accuracy": 0.7796001434326172,
"step": 316
},
{
"epoch": 4.953125,
"grad_norm": 5.6875,
"learning_rate": 3.846153846153846e-08,
"loss": 0.8674,
"mean_token_accuracy": 0.7963815927505493,
"step": 317
},
{
"epoch": 4.96875,
"grad_norm": 5.96875,
"learning_rate": 2.8846153846153848e-08,
"loss": 0.8872,
"mean_token_accuracy": 0.7956120371818542,
"step": 318
},
{
"epoch": 4.984375,
"grad_norm": 35.0,
"learning_rate": 1.923076923076923e-08,
"loss": 0.9419,
"mean_token_accuracy": 0.7827126979827881,
"step": 319
},
{
"epoch": 5.0,
"grad_norm": 6.625,
"learning_rate": 9.615384615384615e-09,
"loss": 0.8891,
"mean_token_accuracy": 0.7931276559829712,
"step": 320
}
],
"logging_steps": 1.0,
"max_steps": 320,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 25,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 5.299754052563763e+16,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}