smollamdk-1 / trainer_state.json
ukung's picture
Upload 11 files
17b8f39 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 83.38095238095238,
"eval_steps": 500,
"global_step": 500,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.19047619047619047,
"grad_norm": 33.78459548950195,
"learning_rate": 1e-05,
"loss": 14.2748,
"mean_token_accuracy": 0.4245416074991226,
"step": 1
},
{
"epoch": 0.38095238095238093,
"grad_norm": 34.141048431396484,
"learning_rate": 2e-05,
"loss": 14.9063,
"mean_token_accuracy": 0.42434193193912506,
"step": 2
},
{
"epoch": 0.5714285714285714,
"grad_norm": 33.89708709716797,
"learning_rate": 3e-05,
"loss": 14.4293,
"mean_token_accuracy": 0.42967987805604935,
"step": 3
},
{
"epoch": 0.7619047619047619,
"grad_norm": 21.82135009765625,
"learning_rate": 4e-05,
"loss": 13.1187,
"mean_token_accuracy": 0.4886682406067848,
"step": 4
},
{
"epoch": 0.9523809523809523,
"grad_norm": 19.052448272705078,
"learning_rate": 5e-05,
"loss": 11.6617,
"mean_token_accuracy": 0.5300922393798828,
"step": 5
},
{
"epoch": 1.0,
"grad_norm": 10.372604370117188,
"learning_rate": 4.98989898989899e-05,
"loss": 1.9845,
"mean_token_accuracy": 0.6190476417541504,
"step": 6
},
{
"epoch": 1.1904761904761905,
"grad_norm": 18.249330520629883,
"learning_rate": 4.97979797979798e-05,
"loss": 9.8237,
"mean_token_accuracy": 0.58917336165905,
"step": 7
},
{
"epoch": 1.380952380952381,
"grad_norm": 18.177717208862305,
"learning_rate": 4.9696969696969694e-05,
"loss": 9.7575,
"mean_token_accuracy": 0.5883309841156006,
"step": 8
},
{
"epoch": 1.5714285714285714,
"grad_norm": 16.09309196472168,
"learning_rate": 4.9595959595959594e-05,
"loss": 9.3943,
"mean_token_accuracy": 0.6104246228933334,
"step": 9
},
{
"epoch": 1.7619047619047619,
"grad_norm": 14.678476333618164,
"learning_rate": 4.94949494949495e-05,
"loss": 8.6018,
"mean_token_accuracy": 0.6411420404911041,
"step": 10
},
{
"epoch": 1.9523809523809523,
"grad_norm": 12.80629825592041,
"learning_rate": 4.93939393939394e-05,
"loss": 7.9568,
"mean_token_accuracy": 0.6764184236526489,
"step": 11
},
{
"epoch": 2.0,
"grad_norm": 9.918559074401855,
"learning_rate": 4.92929292929293e-05,
"loss": 1.4247,
"mean_token_accuracy": 0.75,
"step": 12
},
{
"epoch": 2.1904761904761907,
"grad_norm": 11.65300464630127,
"learning_rate": 4.919191919191919e-05,
"loss": 7.3849,
"mean_token_accuracy": 0.6941855251789093,
"step": 13
},
{
"epoch": 2.380952380952381,
"grad_norm": 11.127327919006348,
"learning_rate": 4.909090909090909e-05,
"loss": 6.7104,
"mean_token_accuracy": 0.7069735676050186,
"step": 14
},
{
"epoch": 2.571428571428571,
"grad_norm": 11.559555053710938,
"learning_rate": 4.898989898989899e-05,
"loss": 7.0902,
"mean_token_accuracy": 0.709569051861763,
"step": 15
},
{
"epoch": 2.761904761904762,
"grad_norm": 10.838669776916504,
"learning_rate": 4.888888888888889e-05,
"loss": 6.7901,
"mean_token_accuracy": 0.713655412197113,
"step": 16
},
{
"epoch": 2.9523809523809526,
"grad_norm": 10.266611099243164,
"learning_rate": 4.878787878787879e-05,
"loss": 6.4548,
"mean_token_accuracy": 0.7244278490543365,
"step": 17
},
{
"epoch": 3.0,
"grad_norm": 5.915023326873779,
"learning_rate": 4.868686868686869e-05,
"loss": 0.636,
"mean_token_accuracy": 0.8730158805847168,
"step": 18
},
{
"epoch": 3.1904761904761907,
"grad_norm": 9.826017379760742,
"learning_rate": 4.858585858585859e-05,
"loss": 5.6655,
"mean_token_accuracy": 0.7555368840694427,
"step": 19
},
{
"epoch": 3.380952380952381,
"grad_norm": 9.213407516479492,
"learning_rate": 4.848484848484849e-05,
"loss": 6.4954,
"mean_token_accuracy": 0.7222279012203217,
"step": 20
},
{
"epoch": 3.571428571428571,
"grad_norm": 9.642789840698242,
"learning_rate": 4.838383838383839e-05,
"loss": 5.1397,
"mean_token_accuracy": 0.7691315412521362,
"step": 21
},
{
"epoch": 3.761904761904762,
"grad_norm": 8.594555854797363,
"learning_rate": 4.828282828282829e-05,
"loss": 5.4342,
"mean_token_accuracy": 0.7607319056987762,
"step": 22
},
{
"epoch": 3.9523809523809526,
"grad_norm": 8.79131031036377,
"learning_rate": 4.8181818181818186e-05,
"loss": 5.7146,
"mean_token_accuracy": 0.7484780848026276,
"step": 23
},
{
"epoch": 4.0,
"grad_norm": 6.953114032745361,
"learning_rate": 4.808080808080808e-05,
"loss": 1.4211,
"mean_token_accuracy": 0.7580645084381104,
"step": 24
},
{
"epoch": 4.190476190476191,
"grad_norm": 8.912933349609375,
"learning_rate": 4.797979797979798e-05,
"loss": 4.9729,
"mean_token_accuracy": 0.7652112394571304,
"step": 25
},
{
"epoch": 4.380952380952381,
"grad_norm": 9.128190994262695,
"learning_rate": 4.787878787878788e-05,
"loss": 4.9376,
"mean_token_accuracy": 0.7736384719610214,
"step": 26
},
{
"epoch": 4.571428571428571,
"grad_norm": 9.021340370178223,
"learning_rate": 4.7777777777777784e-05,
"loss": 5.1022,
"mean_token_accuracy": 0.7747573852539062,
"step": 27
},
{
"epoch": 4.761904761904762,
"grad_norm": 8.445326805114746,
"learning_rate": 4.7676767676767684e-05,
"loss": 4.4903,
"mean_token_accuracy": 0.8014376759529114,
"step": 28
},
{
"epoch": 4.9523809523809526,
"grad_norm": 8.269598960876465,
"learning_rate": 4.7575757575757576e-05,
"loss": 4.7027,
"mean_token_accuracy": 0.7928940802812576,
"step": 29
},
{
"epoch": 5.0,
"grad_norm": 4.256129264831543,
"learning_rate": 4.7474747474747476e-05,
"loss": 1.1768,
"mean_token_accuracy": 0.8405796885490417,
"step": 30
},
{
"epoch": 5.190476190476191,
"grad_norm": 7.8270978927612305,
"learning_rate": 4.7373737373737375e-05,
"loss": 4.2699,
"mean_token_accuracy": 0.8052034825086594,
"step": 31
},
{
"epoch": 5.380952380952381,
"grad_norm": 7.741850852966309,
"learning_rate": 4.7272727272727275e-05,
"loss": 3.9571,
"mean_token_accuracy": 0.8226524442434311,
"step": 32
},
{
"epoch": 5.571428571428571,
"grad_norm": 7.062904357910156,
"learning_rate": 4.7171717171717174e-05,
"loss": 4.1547,
"mean_token_accuracy": 0.8154689371585846,
"step": 33
},
{
"epoch": 5.761904761904762,
"grad_norm": 7.048011779785156,
"learning_rate": 4.7070707070707074e-05,
"loss": 4.4063,
"mean_token_accuracy": 0.8031313121318817,
"step": 34
},
{
"epoch": 5.9523809523809526,
"grad_norm": 7.0800580978393555,
"learning_rate": 4.696969696969697e-05,
"loss": 3.6279,
"mean_token_accuracy": 0.8297399282455444,
"step": 35
},
{
"epoch": 6.0,
"grad_norm": 7.842761993408203,
"learning_rate": 4.686868686868687e-05,
"loss": 1.2107,
"mean_token_accuracy": 0.8068181872367859,
"step": 36
},
{
"epoch": 6.190476190476191,
"grad_norm": 7.796157360076904,
"learning_rate": 4.676767676767677e-05,
"loss": 3.3978,
"mean_token_accuracy": 0.8329954296350479,
"step": 37
},
{
"epoch": 6.380952380952381,
"grad_norm": 6.457103252410889,
"learning_rate": 4.666666666666667e-05,
"loss": 3.451,
"mean_token_accuracy": 0.8273660093545914,
"step": 38
},
{
"epoch": 6.571428571428571,
"grad_norm": 6.003915786743164,
"learning_rate": 4.656565656565657e-05,
"loss": 3.5587,
"mean_token_accuracy": 0.83831487596035,
"step": 39
},
{
"epoch": 6.761904761904762,
"grad_norm": 6.043710231781006,
"learning_rate": 4.6464646464646464e-05,
"loss": 3.5422,
"mean_token_accuracy": 0.8222462385892868,
"step": 40
},
{
"epoch": 6.9523809523809526,
"grad_norm": 6.391598701477051,
"learning_rate": 4.636363636363636e-05,
"loss": 3.2658,
"mean_token_accuracy": 0.856766939163208,
"step": 41
},
{
"epoch": 7.0,
"grad_norm": 5.940098285675049,
"learning_rate": 4.626262626262626e-05,
"loss": 0.7579,
"mean_token_accuracy": 0.8301886916160583,
"step": 42
},
{
"epoch": 7.190476190476191,
"grad_norm": 6.040279388427734,
"learning_rate": 4.616161616161616e-05,
"loss": 2.7243,
"mean_token_accuracy": 0.8692310005426407,
"step": 43
},
{
"epoch": 7.380952380952381,
"grad_norm": 5.645506858825684,
"learning_rate": 4.606060606060607e-05,
"loss": 2.701,
"mean_token_accuracy": 0.8647979497909546,
"step": 44
},
{
"epoch": 7.571428571428571,
"grad_norm": 5.126684188842773,
"learning_rate": 4.595959595959596e-05,
"loss": 2.8655,
"mean_token_accuracy": 0.8684723079204559,
"step": 45
},
{
"epoch": 7.761904761904762,
"grad_norm": 8.235642433166504,
"learning_rate": 4.585858585858586e-05,
"loss": 2.9052,
"mean_token_accuracy": 0.8446438163518906,
"step": 46
},
{
"epoch": 7.9523809523809526,
"grad_norm": 6.074913501739502,
"learning_rate": 4.575757575757576e-05,
"loss": 2.8831,
"mean_token_accuracy": 0.857246458530426,
"step": 47
},
{
"epoch": 8.0,
"grad_norm": 4.886857986450195,
"learning_rate": 4.565656565656566e-05,
"loss": 0.8029,
"mean_token_accuracy": 0.8294573426246643,
"step": 48
},
{
"epoch": 8.19047619047619,
"grad_norm": 6.794694900512695,
"learning_rate": 4.555555555555556e-05,
"loss": 2.4927,
"mean_token_accuracy": 0.8782062977552414,
"step": 49
},
{
"epoch": 8.380952380952381,
"grad_norm": 5.690680503845215,
"learning_rate": 4.545454545454546e-05,
"loss": 1.9744,
"mean_token_accuracy": 0.8949980139732361,
"step": 50
},
{
"epoch": 8.571428571428571,
"grad_norm": 9.415908813476562,
"learning_rate": 4.535353535353535e-05,
"loss": 2.2432,
"mean_token_accuracy": 0.8826991468667984,
"step": 51
},
{
"epoch": 8.761904761904763,
"grad_norm": 7.901670932769775,
"learning_rate": 4.525252525252526e-05,
"loss": 2.2805,
"mean_token_accuracy": 0.8890593945980072,
"step": 52
},
{
"epoch": 8.952380952380953,
"grad_norm": 6.918704986572266,
"learning_rate": 4.515151515151516e-05,
"loss": 2.5343,
"mean_token_accuracy": 0.8712608069181442,
"step": 53
},
{
"epoch": 9.0,
"grad_norm": 12.76561450958252,
"learning_rate": 4.5050505050505056e-05,
"loss": 0.576,
"mean_token_accuracy": 0.8529411554336548,
"step": 54
},
{
"epoch": 9.19047619047619,
"grad_norm": 6.143138408660889,
"learning_rate": 4.494949494949495e-05,
"loss": 1.878,
"mean_token_accuracy": 0.9020879119634628,
"step": 55
},
{
"epoch": 9.380952380952381,
"grad_norm": 7.497737884521484,
"learning_rate": 4.484848484848485e-05,
"loss": 1.9871,
"mean_token_accuracy": 0.8944180905818939,
"step": 56
},
{
"epoch": 9.571428571428571,
"grad_norm": 5.427354335784912,
"learning_rate": 4.474747474747475e-05,
"loss": 1.9095,
"mean_token_accuracy": 0.9023730456829071,
"step": 57
},
{
"epoch": 9.761904761904763,
"grad_norm": 5.814023017883301,
"learning_rate": 4.464646464646465e-05,
"loss": 1.8084,
"mean_token_accuracy": 0.9020061939954758,
"step": 58
},
{
"epoch": 9.952380952380953,
"grad_norm": 6.965571403503418,
"learning_rate": 4.454545454545455e-05,
"loss": 1.7746,
"mean_token_accuracy": 0.9095794558525085,
"step": 59
},
{
"epoch": 10.0,
"grad_norm": 6.048158168792725,
"learning_rate": 4.4444444444444447e-05,
"loss": 0.4674,
"mean_token_accuracy": 0.9152542352676392,
"step": 60
},
{
"epoch": 10.19047619047619,
"grad_norm": 6.400238513946533,
"learning_rate": 4.4343434343434346e-05,
"loss": 1.4747,
"mean_token_accuracy": 0.9173053950071335,
"step": 61
},
{
"epoch": 10.380952380952381,
"grad_norm": 5.616025924682617,
"learning_rate": 4.4242424242424246e-05,
"loss": 1.4234,
"mean_token_accuracy": 0.9245103895664215,
"step": 62
},
{
"epoch": 10.571428571428571,
"grad_norm": 6.788946628570557,
"learning_rate": 4.4141414141414145e-05,
"loss": 1.6027,
"mean_token_accuracy": 0.9176820814609528,
"step": 63
},
{
"epoch": 10.761904761904763,
"grad_norm": 6.084983825683594,
"learning_rate": 4.4040404040404044e-05,
"loss": 1.4259,
"mean_token_accuracy": 0.9250814765691757,
"step": 64
},
{
"epoch": 10.952380952380953,
"grad_norm": 10.394392967224121,
"learning_rate": 4.3939393939393944e-05,
"loss": 1.2998,
"mean_token_accuracy": 0.9314595013856888,
"step": 65
},
{
"epoch": 11.0,
"grad_norm": 4.715174198150635,
"learning_rate": 4.383838383838384e-05,
"loss": 0.2015,
"mean_token_accuracy": 0.9506173133850098,
"step": 66
},
{
"epoch": 11.19047619047619,
"grad_norm": 4.792293071746826,
"learning_rate": 4.3737373737373736e-05,
"loss": 1.2582,
"mean_token_accuracy": 0.9351158142089844,
"step": 67
},
{
"epoch": 11.380952380952381,
"grad_norm": 7.185492515563965,
"learning_rate": 4.3636363636363636e-05,
"loss": 1.025,
"mean_token_accuracy": 0.9418339878320694,
"step": 68
},
{
"epoch": 11.571428571428571,
"grad_norm": 6.083255290985107,
"learning_rate": 4.3535353535353535e-05,
"loss": 1.0012,
"mean_token_accuracy": 0.9446901679039001,
"step": 69
},
{
"epoch": 11.761904761904763,
"grad_norm": 8.141711235046387,
"learning_rate": 4.343434343434344e-05,
"loss": 1.2278,
"mean_token_accuracy": 0.9310520589351654,
"step": 70
},
{
"epoch": 11.952380952380953,
"grad_norm": 9.146880149841309,
"learning_rate": 4.3333333333333334e-05,
"loss": 1.0842,
"mean_token_accuracy": 0.9404759407043457,
"step": 71
},
{
"epoch": 12.0,
"grad_norm": 3.645364761352539,
"learning_rate": 4.3232323232323234e-05,
"loss": 0.1553,
"mean_token_accuracy": 0.9714285731315613,
"step": 72
},
{
"epoch": 12.19047619047619,
"grad_norm": 7.048225402832031,
"learning_rate": 4.313131313131313e-05,
"loss": 1.0319,
"mean_token_accuracy": 0.9446324110031128,
"step": 73
},
{
"epoch": 12.380952380952381,
"grad_norm": 6.668647289276123,
"learning_rate": 4.303030303030303e-05,
"loss": 0.8348,
"mean_token_accuracy": 0.9561943113803864,
"step": 74
},
{
"epoch": 12.571428571428571,
"grad_norm": 7.347132205963135,
"learning_rate": 4.292929292929293e-05,
"loss": 0.8571,
"mean_token_accuracy": 0.9449830502271652,
"step": 75
},
{
"epoch": 12.761904761904763,
"grad_norm": 5.543299674987793,
"learning_rate": 4.282828282828283e-05,
"loss": 0.9421,
"mean_token_accuracy": 0.9508587419986725,
"step": 76
},
{
"epoch": 12.952380952380953,
"grad_norm": 6.999424934387207,
"learning_rate": 4.2727272727272724e-05,
"loss": 0.6839,
"mean_token_accuracy": 0.9609730541706085,
"step": 77
},
{
"epoch": 13.0,
"grad_norm": 2.92433762550354,
"learning_rate": 4.262626262626263e-05,
"loss": 0.1323,
"mean_token_accuracy": 0.9838709831237793,
"step": 78
},
{
"epoch": 13.19047619047619,
"grad_norm": 5.790960311889648,
"learning_rate": 4.252525252525253e-05,
"loss": 0.7111,
"mean_token_accuracy": 0.9593389332294464,
"step": 79
},
{
"epoch": 13.380952380952381,
"grad_norm": 5.800691604614258,
"learning_rate": 4.242424242424243e-05,
"loss": 0.6327,
"mean_token_accuracy": 0.9631912261247635,
"step": 80
},
{
"epoch": 13.571428571428571,
"grad_norm": 5.627686977386475,
"learning_rate": 4.232323232323233e-05,
"loss": 0.6079,
"mean_token_accuracy": 0.961370512843132,
"step": 81
},
{
"epoch": 13.761904761904763,
"grad_norm": 7.996088027954102,
"learning_rate": 4.222222222222222e-05,
"loss": 0.578,
"mean_token_accuracy": 0.9649683386087418,
"step": 82
},
{
"epoch": 13.952380952380953,
"grad_norm": 6.650062084197998,
"learning_rate": 4.212121212121212e-05,
"loss": 0.738,
"mean_token_accuracy": 0.9565856605768204,
"step": 83
},
{
"epoch": 14.0,
"grad_norm": 3.682978630065918,
"learning_rate": 4.202020202020202e-05,
"loss": 0.1826,
"mean_token_accuracy": 0.9818181991577148,
"step": 84
},
{
"epoch": 14.19047619047619,
"grad_norm": 4.094846725463867,
"learning_rate": 4.191919191919192e-05,
"loss": 0.4917,
"mean_token_accuracy": 0.9723720699548721,
"step": 85
},
{
"epoch": 14.380952380952381,
"grad_norm": 5.953057289123535,
"learning_rate": 4.181818181818182e-05,
"loss": 0.4787,
"mean_token_accuracy": 0.9700902253389359,
"step": 86
},
{
"epoch": 14.571428571428571,
"grad_norm": 4.5836591720581055,
"learning_rate": 4.171717171717172e-05,
"loss": 0.5792,
"mean_token_accuracy": 0.9712613523006439,
"step": 87
},
{
"epoch": 14.761904761904763,
"grad_norm": 4.867373943328857,
"learning_rate": 4.161616161616162e-05,
"loss": 0.4702,
"mean_token_accuracy": 0.9780033379793167,
"step": 88
},
{
"epoch": 14.952380952380953,
"grad_norm": 7.761333465576172,
"learning_rate": 4.151515151515152e-05,
"loss": 0.6332,
"mean_token_accuracy": 0.9641157388687134,
"step": 89
},
{
"epoch": 15.0,
"grad_norm": 4.875545501708984,
"learning_rate": 4.141414141414142e-05,
"loss": 0.1378,
"mean_token_accuracy": 0.98591548204422,
"step": 90
},
{
"epoch": 15.19047619047619,
"grad_norm": 4.117421627044678,
"learning_rate": 4.131313131313132e-05,
"loss": 0.4463,
"mean_token_accuracy": 0.9724489748477936,
"step": 91
},
{
"epoch": 15.380952380952381,
"grad_norm": 3.252460241317749,
"learning_rate": 4.1212121212121216e-05,
"loss": 0.3858,
"mean_token_accuracy": 0.9809663742780685,
"step": 92
},
{
"epoch": 15.571428571428571,
"grad_norm": 4.330794334411621,
"learning_rate": 4.111111111111111e-05,
"loss": 0.4585,
"mean_token_accuracy": 0.9748548269271851,
"step": 93
},
{
"epoch": 15.761904761904763,
"grad_norm": 5.096158027648926,
"learning_rate": 4.101010101010101e-05,
"loss": 0.4829,
"mean_token_accuracy": 0.9708511531352997,
"step": 94
},
{
"epoch": 15.952380952380953,
"grad_norm": 6.11644172668457,
"learning_rate": 4.0909090909090915e-05,
"loss": 0.4374,
"mean_token_accuracy": 0.974689856171608,
"step": 95
},
{
"epoch": 16.0,
"grad_norm": 2.1705079078674316,
"learning_rate": 4.0808080808080814e-05,
"loss": 0.0851,
"mean_token_accuracy": 0.9838709831237793,
"step": 96
},
{
"epoch": 16.19047619047619,
"grad_norm": 3.2492971420288086,
"learning_rate": 4.070707070707071e-05,
"loss": 0.3638,
"mean_token_accuracy": 0.9768412113189697,
"step": 97
},
{
"epoch": 16.38095238095238,
"grad_norm": 2.8683860301971436,
"learning_rate": 4.0606060606060606e-05,
"loss": 0.3437,
"mean_token_accuracy": 0.9768141210079193,
"step": 98
},
{
"epoch": 16.571428571428573,
"grad_norm": 3.508230686187744,
"learning_rate": 4.0505050505050506e-05,
"loss": 0.354,
"mean_token_accuracy": 0.9778662770986557,
"step": 99
},
{
"epoch": 16.761904761904763,
"grad_norm": 3.8338069915771484,
"learning_rate": 4.0404040404040405e-05,
"loss": 0.3948,
"mean_token_accuracy": 0.973381832242012,
"step": 100
},
{
"epoch": 16.952380952380953,
"grad_norm": 4.676501750946045,
"learning_rate": 4.0303030303030305e-05,
"loss": 0.3893,
"mean_token_accuracy": 0.9753514975309372,
"step": 101
},
{
"epoch": 17.0,
"grad_norm": 4.8052287101745605,
"learning_rate": 4.0202020202020204e-05,
"loss": 0.1183,
"mean_token_accuracy": 0.9649122953414917,
"step": 102
},
{
"epoch": 17.19047619047619,
"grad_norm": 3.2596077919006348,
"learning_rate": 4.01010101010101e-05,
"loss": 0.3596,
"mean_token_accuracy": 0.9725935012102127,
"step": 103
},
{
"epoch": 17.38095238095238,
"grad_norm": 2.6120784282684326,
"learning_rate": 4e-05,
"loss": 0.3414,
"mean_token_accuracy": 0.9788288474082947,
"step": 104
},
{
"epoch": 17.571428571428573,
"grad_norm": 3.26759934425354,
"learning_rate": 3.98989898989899e-05,
"loss": 0.3576,
"mean_token_accuracy": 0.9772270619869232,
"step": 105
},
{
"epoch": 17.761904761904763,
"grad_norm": 3.644747734069824,
"learning_rate": 3.97979797979798e-05,
"loss": 0.3324,
"mean_token_accuracy": 0.9781567454338074,
"step": 106
},
{
"epoch": 17.952380952380953,
"grad_norm": 4.441091537475586,
"learning_rate": 3.96969696969697e-05,
"loss": 0.3747,
"mean_token_accuracy": 0.9714739322662354,
"step": 107
},
{
"epoch": 18.0,
"grad_norm": 2.743286371231079,
"learning_rate": 3.9595959595959594e-05,
"loss": 0.0975,
"mean_token_accuracy": 0.9696969985961914,
"step": 108
},
{
"epoch": 18.19047619047619,
"grad_norm": 3.2830970287323,
"learning_rate": 3.9494949494949494e-05,
"loss": 0.3028,
"mean_token_accuracy": 0.9811016768217087,
"step": 109
},
{
"epoch": 18.38095238095238,
"grad_norm": 2.505868673324585,
"learning_rate": 3.939393939393939e-05,
"loss": 0.3186,
"mean_token_accuracy": 0.9771904498338699,
"step": 110
},
{
"epoch": 18.571428571428573,
"grad_norm": 2.6549816131591797,
"learning_rate": 3.929292929292929e-05,
"loss": 0.3141,
"mean_token_accuracy": 0.9759136885404587,
"step": 111
},
{
"epoch": 18.761904761904763,
"grad_norm": 3.7054269313812256,
"learning_rate": 3.91919191919192e-05,
"loss": 0.3736,
"mean_token_accuracy": 0.9732943773269653,
"step": 112
},
{
"epoch": 18.952380952380953,
"grad_norm": 3.014618158340454,
"learning_rate": 3.909090909090909e-05,
"loss": 0.3676,
"mean_token_accuracy": 0.9800769239664078,
"step": 113
},
{
"epoch": 19.0,
"grad_norm": 4.232401371002197,
"learning_rate": 3.898989898989899e-05,
"loss": 0.1268,
"mean_token_accuracy": 0.9577465057373047,
"step": 114
},
{
"epoch": 19.19047619047619,
"grad_norm": 1.8361284732818604,
"learning_rate": 3.888888888888889e-05,
"loss": 0.2937,
"mean_token_accuracy": 0.9818844795227051,
"step": 115
},
{
"epoch": 19.38095238095238,
"grad_norm": 3.4175708293914795,
"learning_rate": 3.878787878787879e-05,
"loss": 0.2919,
"mean_token_accuracy": 0.9831363707780838,
"step": 116
},
{
"epoch": 19.571428571428573,
"grad_norm": 3.504340887069702,
"learning_rate": 3.868686868686869e-05,
"loss": 0.3739,
"mean_token_accuracy": 0.9758433997631073,
"step": 117
},
{
"epoch": 19.761904761904763,
"grad_norm": 3.542600154876709,
"learning_rate": 3.858585858585859e-05,
"loss": 0.3247,
"mean_token_accuracy": 0.9753479957580566,
"step": 118
},
{
"epoch": 19.952380952380953,
"grad_norm": 2.5886898040771484,
"learning_rate": 3.848484848484848e-05,
"loss": 0.3257,
"mean_token_accuracy": 0.9774775803089142,
"step": 119
},
{
"epoch": 20.0,
"grad_norm": 2.6909375190734863,
"learning_rate": 3.838383838383838e-05,
"loss": 0.0882,
"mean_token_accuracy": 0.9682539701461792,
"step": 120
},
{
"epoch": 20.19047619047619,
"grad_norm": 2.958399772644043,
"learning_rate": 3.828282828282829e-05,
"loss": 0.3205,
"mean_token_accuracy": 0.9724349826574326,
"step": 121
},
{
"epoch": 20.38095238095238,
"grad_norm": 2.2972922325134277,
"learning_rate": 3.818181818181819e-05,
"loss": 0.2829,
"mean_token_accuracy": 0.9813934862613678,
"step": 122
},
{
"epoch": 20.571428571428573,
"grad_norm": 2.2647204399108887,
"learning_rate": 3.8080808080808087e-05,
"loss": 0.3087,
"mean_token_accuracy": 0.9758166968822479,
"step": 123
},
{
"epoch": 20.761904761904763,
"grad_norm": 2.4949004650115967,
"learning_rate": 3.797979797979798e-05,
"loss": 0.3143,
"mean_token_accuracy": 0.9777243584394455,
"step": 124
},
{
"epoch": 20.952380952380953,
"grad_norm": 2.5387442111968994,
"learning_rate": 3.787878787878788e-05,
"loss": 0.326,
"mean_token_accuracy": 0.9755249470472336,
"step": 125
},
{
"epoch": 21.0,
"grad_norm": 2.745015859603882,
"learning_rate": 3.777777777777778e-05,
"loss": 0.0842,
"mean_token_accuracy": 0.9714285731315613,
"step": 126
},
{
"epoch": 21.19047619047619,
"grad_norm": 1.7736639976501465,
"learning_rate": 3.767676767676768e-05,
"loss": 0.2777,
"mean_token_accuracy": 0.9804743677377701,
"step": 127
},
{
"epoch": 21.38095238095238,
"grad_norm": 2.391968011856079,
"learning_rate": 3.757575757575758e-05,
"loss": 0.2969,
"mean_token_accuracy": 0.9765493422746658,
"step": 128
},
{
"epoch": 21.571428571428573,
"grad_norm": 1.9384799003601074,
"learning_rate": 3.747474747474748e-05,
"loss": 0.2764,
"mean_token_accuracy": 0.978370875120163,
"step": 129
},
{
"epoch": 21.761904761904763,
"grad_norm": 2.363274097442627,
"learning_rate": 3.7373737373737376e-05,
"loss": 0.3086,
"mean_token_accuracy": 0.9715951085090637,
"step": 130
},
{
"epoch": 21.952380952380953,
"grad_norm": 2.90826416015625,
"learning_rate": 3.7272727272727276e-05,
"loss": 0.3241,
"mean_token_accuracy": 0.9738913327455521,
"step": 131
},
{
"epoch": 22.0,
"grad_norm": 1.8676457405090332,
"learning_rate": 3.7171717171717175e-05,
"loss": 0.0867,
"mean_token_accuracy": 0.9830508232116699,
"step": 132
},
{
"epoch": 22.19047619047619,
"grad_norm": 2.1423661708831787,
"learning_rate": 3.7070707070707075e-05,
"loss": 0.2691,
"mean_token_accuracy": 0.9791481345891953,
"step": 133
},
{
"epoch": 22.38095238095238,
"grad_norm": 2.0479485988616943,
"learning_rate": 3.6969696969696974e-05,
"loss": 0.2898,
"mean_token_accuracy": 0.9813213050365448,
"step": 134
},
{
"epoch": 22.571428571428573,
"grad_norm": 2.566549777984619,
"learning_rate": 3.686868686868687e-05,
"loss": 0.3174,
"mean_token_accuracy": 0.975700318813324,
"step": 135
},
{
"epoch": 22.761904761904763,
"grad_norm": 2.541551351547241,
"learning_rate": 3.6767676767676766e-05,
"loss": 0.3205,
"mean_token_accuracy": 0.978480726480484,
"step": 136
},
{
"epoch": 22.952380952380953,
"grad_norm": 2.037262201309204,
"learning_rate": 3.6666666666666666e-05,
"loss": 0.2741,
"mean_token_accuracy": 0.9802869260311127,
"step": 137
},
{
"epoch": 23.0,
"grad_norm": 2.753689765930176,
"learning_rate": 3.656565656565657e-05,
"loss": 0.0844,
"mean_token_accuracy": 0.9841269850730896,
"step": 138
},
{
"epoch": 23.19047619047619,
"grad_norm": 1.9929062128067017,
"learning_rate": 3.6464646464646465e-05,
"loss": 0.2798,
"mean_token_accuracy": 0.9800110459327698,
"step": 139
},
{
"epoch": 23.38095238095238,
"grad_norm": 2.7327589988708496,
"learning_rate": 3.6363636363636364e-05,
"loss": 0.2671,
"mean_token_accuracy": 0.9807360470294952,
"step": 140
},
{
"epoch": 23.571428571428573,
"grad_norm": 1.7482175827026367,
"learning_rate": 3.6262626262626264e-05,
"loss": 0.2965,
"mean_token_accuracy": 0.9796760976314545,
"step": 141
},
{
"epoch": 23.761904761904763,
"grad_norm": 2.599804639816284,
"learning_rate": 3.616161616161616e-05,
"loss": 0.3154,
"mean_token_accuracy": 0.977075606584549,
"step": 142
},
{
"epoch": 23.952380952380953,
"grad_norm": 2.482060194015503,
"learning_rate": 3.606060606060606e-05,
"loss": 0.3009,
"mean_token_accuracy": 0.9737012088298798,
"step": 143
},
{
"epoch": 24.0,
"grad_norm": 3.389758825302124,
"learning_rate": 3.595959595959596e-05,
"loss": 0.1225,
"mean_token_accuracy": 0.9636363387107849,
"step": 144
},
{
"epoch": 24.19047619047619,
"grad_norm": 1.8538786172866821,
"learning_rate": 3.5858585858585855e-05,
"loss": 0.2625,
"mean_token_accuracy": 0.9796436280012131,
"step": 145
},
{
"epoch": 24.38095238095238,
"grad_norm": 1.6289573907852173,
"learning_rate": 3.575757575757576e-05,
"loss": 0.2616,
"mean_token_accuracy": 0.9804391115903854,
"step": 146
},
{
"epoch": 24.571428571428573,
"grad_norm": 2.4140396118164062,
"learning_rate": 3.565656565656566e-05,
"loss": 0.3128,
"mean_token_accuracy": 0.979373887181282,
"step": 147
},
{
"epoch": 24.761904761904763,
"grad_norm": 2.182692766189575,
"learning_rate": 3.555555555555556e-05,
"loss": 0.2983,
"mean_token_accuracy": 0.9793859571218491,
"step": 148
},
{
"epoch": 24.952380952380953,
"grad_norm": 2.800553560256958,
"learning_rate": 3.545454545454546e-05,
"loss": 0.3566,
"mean_token_accuracy": 0.9733032137155533,
"step": 149
},
{
"epoch": 25.0,
"grad_norm": 1.8961296081542969,
"learning_rate": 3.535353535353535e-05,
"loss": 0.0623,
"mean_token_accuracy": 0.9797979593276978,
"step": 150
},
{
"epoch": 25.19047619047619,
"grad_norm": 2.6031830310821533,
"learning_rate": 3.525252525252525e-05,
"loss": 0.307,
"mean_token_accuracy": 0.9759431630373001,
"step": 151
},
{
"epoch": 25.38095238095238,
"grad_norm": 1.7213940620422363,
"learning_rate": 3.515151515151515e-05,
"loss": 0.2605,
"mean_token_accuracy": 0.9829924404621124,
"step": 152
},
{
"epoch": 25.571428571428573,
"grad_norm": 2.169405221939087,
"learning_rate": 3.505050505050505e-05,
"loss": 0.2833,
"mean_token_accuracy": 0.976715162396431,
"step": 153
},
{
"epoch": 25.761904761904763,
"grad_norm": 2.126295566558838,
"learning_rate": 3.494949494949495e-05,
"loss": 0.2836,
"mean_token_accuracy": 0.9775257259607315,
"step": 154
},
{
"epoch": 25.952380952380953,
"grad_norm": 2.112752914428711,
"learning_rate": 3.484848484848485e-05,
"loss": 0.3001,
"mean_token_accuracy": 0.9795974045991898,
"step": 155
},
{
"epoch": 26.0,
"grad_norm": 2.9405832290649414,
"learning_rate": 3.474747474747475e-05,
"loss": 0.1069,
"mean_token_accuracy": 0.9824561476707458,
"step": 156
},
{
"epoch": 26.19047619047619,
"grad_norm": 1.8124560117721558,
"learning_rate": 3.464646464646465e-05,
"loss": 0.2694,
"mean_token_accuracy": 0.982256755232811,
"step": 157
},
{
"epoch": 26.38095238095238,
"grad_norm": 1.8597822189331055,
"learning_rate": 3.454545454545455e-05,
"loss": 0.2558,
"mean_token_accuracy": 0.98062863945961,
"step": 158
},
{
"epoch": 26.571428571428573,
"grad_norm": 1.6446207761764526,
"learning_rate": 3.444444444444445e-05,
"loss": 0.2587,
"mean_token_accuracy": 0.9779441952705383,
"step": 159
},
{
"epoch": 26.761904761904763,
"grad_norm": 2.2227869033813477,
"learning_rate": 3.434343434343435e-05,
"loss": 0.3241,
"mean_token_accuracy": 0.9747696965932846,
"step": 160
},
{
"epoch": 26.952380952380953,
"grad_norm": 1.6738312244415283,
"learning_rate": 3.424242424242424e-05,
"loss": 0.2779,
"mean_token_accuracy": 0.9778714776039124,
"step": 161
},
{
"epoch": 27.0,
"grad_norm": 1.4880234003067017,
"learning_rate": 3.414141414141414e-05,
"loss": 0.0801,
"mean_token_accuracy": 0.9838709831237793,
"step": 162
},
{
"epoch": 27.19047619047619,
"grad_norm": 1.5148252248764038,
"learning_rate": 3.4040404040404045e-05,
"loss": 0.2581,
"mean_token_accuracy": 0.980286031961441,
"step": 163
},
{
"epoch": 27.38095238095238,
"grad_norm": 1.833160400390625,
"learning_rate": 3.3939393939393945e-05,
"loss": 0.2724,
"mean_token_accuracy": 0.9760157763957977,
"step": 164
},
{
"epoch": 27.571428571428573,
"grad_norm": 2.1366348266601562,
"learning_rate": 3.3838383838383844e-05,
"loss": 0.2916,
"mean_token_accuracy": 0.9787898063659668,
"step": 165
},
{
"epoch": 27.761904761904763,
"grad_norm": 2.5082993507385254,
"learning_rate": 3.373737373737374e-05,
"loss": 0.2929,
"mean_token_accuracy": 0.9774486720561981,
"step": 166
},
{
"epoch": 27.952380952380953,
"grad_norm": 2.1355273723602295,
"learning_rate": 3.3636363636363636e-05,
"loss": 0.2856,
"mean_token_accuracy": 0.9789445698261261,
"step": 167
},
{
"epoch": 28.0,
"grad_norm": 1.970436930656433,
"learning_rate": 3.3535353535353536e-05,
"loss": 0.0806,
"mean_token_accuracy": 0.9692307710647583,
"step": 168
},
{
"epoch": 28.19047619047619,
"grad_norm": 2.1435768604278564,
"learning_rate": 3.3434343434343435e-05,
"loss": 0.2658,
"mean_token_accuracy": 0.9759610444307327,
"step": 169
},
{
"epoch": 28.38095238095238,
"grad_norm": 1.6564626693725586,
"learning_rate": 3.3333333333333335e-05,
"loss": 0.2548,
"mean_token_accuracy": 0.9793960750102997,
"step": 170
},
{
"epoch": 28.571428571428573,
"grad_norm": 1.7106664180755615,
"learning_rate": 3.3232323232323234e-05,
"loss": 0.255,
"mean_token_accuracy": 0.9787760227918625,
"step": 171
},
{
"epoch": 28.761904761904763,
"grad_norm": 2.1820991039276123,
"learning_rate": 3.3131313131313134e-05,
"loss": 0.3227,
"mean_token_accuracy": 0.973702073097229,
"step": 172
},
{
"epoch": 28.952380952380953,
"grad_norm": 1.7227038145065308,
"learning_rate": 3.303030303030303e-05,
"loss": 0.2653,
"mean_token_accuracy": 0.9788563847541809,
"step": 173
},
{
"epoch": 29.0,
"grad_norm": 1.6985877752304077,
"learning_rate": 3.292929292929293e-05,
"loss": 0.0653,
"mean_token_accuracy": 0.9756097793579102,
"step": 174
},
{
"epoch": 29.19047619047619,
"grad_norm": 1.70681631565094,
"learning_rate": 3.282828282828283e-05,
"loss": 0.2621,
"mean_token_accuracy": 0.9808604121208191,
"step": 175
},
{
"epoch": 29.38095238095238,
"grad_norm": 1.5982296466827393,
"learning_rate": 3.272727272727273e-05,
"loss": 0.2444,
"mean_token_accuracy": 0.9789219200611115,
"step": 176
},
{
"epoch": 29.571428571428573,
"grad_norm": 1.4115501642227173,
"learning_rate": 3.2626262626262624e-05,
"loss": 0.2386,
"mean_token_accuracy": 0.9839699417352676,
"step": 177
},
{
"epoch": 29.761904761904763,
"grad_norm": 2.2143611907958984,
"learning_rate": 3.2525252525252524e-05,
"loss": 0.3214,
"mean_token_accuracy": 0.9736231416463852,
"step": 178
},
{
"epoch": 29.952380952380953,
"grad_norm": 2.329328775405884,
"learning_rate": 3.2424242424242423e-05,
"loss": 0.2899,
"mean_token_accuracy": 0.974274680018425,
"step": 179
},
{
"epoch": 30.0,
"grad_norm": 1.8894615173339844,
"learning_rate": 3.232323232323233e-05,
"loss": 0.0873,
"mean_token_accuracy": 0.970588207244873,
"step": 180
},
{
"epoch": 30.19047619047619,
"grad_norm": 1.8685792684555054,
"learning_rate": 3.222222222222223e-05,
"loss": 0.2713,
"mean_token_accuracy": 0.9793071448802948,
"step": 181
},
{
"epoch": 30.38095238095238,
"grad_norm": 1.6303725242614746,
"learning_rate": 3.212121212121212e-05,
"loss": 0.2602,
"mean_token_accuracy": 0.978649765253067,
"step": 182
},
{
"epoch": 30.571428571428573,
"grad_norm": 1.5414835214614868,
"learning_rate": 3.202020202020202e-05,
"loss": 0.2507,
"mean_token_accuracy": 0.9816054552793503,
"step": 183
},
{
"epoch": 30.761904761904763,
"grad_norm": 1.9461543560028076,
"learning_rate": 3.191919191919192e-05,
"loss": 0.2622,
"mean_token_accuracy": 0.9799721091985703,
"step": 184
},
{
"epoch": 30.952380952380953,
"grad_norm": 2.4515039920806885,
"learning_rate": 3.181818181818182e-05,
"loss": 0.316,
"mean_token_accuracy": 0.9738900065422058,
"step": 185
},
{
"epoch": 31.0,
"grad_norm": 2.3152859210968018,
"learning_rate": 3.171717171717172e-05,
"loss": 0.0924,
"mean_token_accuracy": 0.9666666388511658,
"step": 186
},
{
"epoch": 31.19047619047619,
"grad_norm": 1.5827226638793945,
"learning_rate": 3.161616161616161e-05,
"loss": 0.2548,
"mean_token_accuracy": 0.9807614088058472,
"step": 187
},
{
"epoch": 31.38095238095238,
"grad_norm": 1.5467098951339722,
"learning_rate": 3.151515151515151e-05,
"loss": 0.2567,
"mean_token_accuracy": 0.9772002995014191,
"step": 188
},
{
"epoch": 31.571428571428573,
"grad_norm": 1.5654078722000122,
"learning_rate": 3.141414141414142e-05,
"loss": 0.2523,
"mean_token_accuracy": 0.9784552752971649,
"step": 189
},
{
"epoch": 31.761904761904763,
"grad_norm": 1.6791102886199951,
"learning_rate": 3.131313131313132e-05,
"loss": 0.2749,
"mean_token_accuracy": 0.9773024320602417,
"step": 190
},
{
"epoch": 31.952380952380953,
"grad_norm": 1.864105224609375,
"learning_rate": 3.121212121212122e-05,
"loss": 0.2938,
"mean_token_accuracy": 0.9765942692756653,
"step": 191
},
{
"epoch": 32.0,
"grad_norm": 1.214571475982666,
"learning_rate": 3.111111111111111e-05,
"loss": 0.0665,
"mean_token_accuracy": 0.987500011920929,
"step": 192
},
{
"epoch": 32.19047619047619,
"grad_norm": 1.4030119180679321,
"learning_rate": 3.101010101010101e-05,
"loss": 0.2415,
"mean_token_accuracy": 0.9817796945571899,
"step": 193
},
{
"epoch": 32.38095238095238,
"grad_norm": 1.6708261966705322,
"learning_rate": 3.090909090909091e-05,
"loss": 0.2582,
"mean_token_accuracy": 0.9801040887832642,
"step": 194
},
{
"epoch": 32.57142857142857,
"grad_norm": 1.4296513795852661,
"learning_rate": 3.080808080808081e-05,
"loss": 0.2493,
"mean_token_accuracy": 0.9811757057905197,
"step": 195
},
{
"epoch": 32.76190476190476,
"grad_norm": 1.7713197469711304,
"learning_rate": 3.070707070707071e-05,
"loss": 0.2823,
"mean_token_accuracy": 0.9782667905092239,
"step": 196
},
{
"epoch": 32.95238095238095,
"grad_norm": 2.032137632369995,
"learning_rate": 3.060606060606061e-05,
"loss": 0.294,
"mean_token_accuracy": 0.9734672009944916,
"step": 197
},
{
"epoch": 33.0,
"grad_norm": 2.334019660949707,
"learning_rate": 3.050505050505051e-05,
"loss": 0.0861,
"mean_token_accuracy": 0.9726027250289917,
"step": 198
},
{
"epoch": 33.19047619047619,
"grad_norm": 1.4779608249664307,
"learning_rate": 3.0404040404040406e-05,
"loss": 0.2537,
"mean_token_accuracy": 0.981317549943924,
"step": 199
},
{
"epoch": 33.38095238095238,
"grad_norm": 1.435577392578125,
"learning_rate": 3.0303030303030306e-05,
"loss": 0.2544,
"mean_token_accuracy": 0.9813797920942307,
"step": 200
},
{
"epoch": 33.57142857142857,
"grad_norm": 1.8126311302185059,
"learning_rate": 3.0202020202020205e-05,
"loss": 0.2705,
"mean_token_accuracy": 0.9765264093875885,
"step": 201
},
{
"epoch": 33.76190476190476,
"grad_norm": 1.5598095655441284,
"learning_rate": 3.01010101010101e-05,
"loss": 0.2723,
"mean_token_accuracy": 0.978124126791954,
"step": 202
},
{
"epoch": 33.95238095238095,
"grad_norm": 1.8001117706298828,
"learning_rate": 3e-05,
"loss": 0.271,
"mean_token_accuracy": 0.9785387814044952,
"step": 203
},
{
"epoch": 34.0,
"grad_norm": 1.7313034534454346,
"learning_rate": 2.98989898989899e-05,
"loss": 0.0652,
"mean_token_accuracy": 0.9746835231781006,
"step": 204
},
{
"epoch": 34.19047619047619,
"grad_norm": 1.389072060585022,
"learning_rate": 2.9797979797979796e-05,
"loss": 0.242,
"mean_token_accuracy": 0.9788109809160233,
"step": 205
},
{
"epoch": 34.38095238095238,
"grad_norm": 1.434044599533081,
"learning_rate": 2.96969696969697e-05,
"loss": 0.2426,
"mean_token_accuracy": 0.979528471827507,
"step": 206
},
{
"epoch": 34.57142857142857,
"grad_norm": 1.9448174238204956,
"learning_rate": 2.95959595959596e-05,
"loss": 0.2695,
"mean_token_accuracy": 0.9793160408735275,
"step": 207
},
{
"epoch": 34.76190476190476,
"grad_norm": 1.85161554813385,
"learning_rate": 2.9494949494949498e-05,
"loss": 0.293,
"mean_token_accuracy": 0.9727693498134613,
"step": 208
},
{
"epoch": 34.95238095238095,
"grad_norm": 1.7662495374679565,
"learning_rate": 2.9393939393939394e-05,
"loss": 0.2817,
"mean_token_accuracy": 0.9758803397417068,
"step": 209
},
{
"epoch": 35.0,
"grad_norm": 1.3624759912490845,
"learning_rate": 2.9292929292929294e-05,
"loss": 0.0738,
"mean_token_accuracy": 0.9848484992980957,
"step": 210
},
{
"epoch": 35.19047619047619,
"grad_norm": 1.622554063796997,
"learning_rate": 2.9191919191919193e-05,
"loss": 0.2493,
"mean_token_accuracy": 0.9789364635944366,
"step": 211
},
{
"epoch": 35.38095238095238,
"grad_norm": 1.7415611743927002,
"learning_rate": 2.909090909090909e-05,
"loss": 0.2849,
"mean_token_accuracy": 0.9779055863618851,
"step": 212
},
{
"epoch": 35.57142857142857,
"grad_norm": 1.585845947265625,
"learning_rate": 2.898989898989899e-05,
"loss": 0.2497,
"mean_token_accuracy": 0.9807179868221283,
"step": 213
},
{
"epoch": 35.76190476190476,
"grad_norm": 1.5177557468414307,
"learning_rate": 2.8888888888888888e-05,
"loss": 0.264,
"mean_token_accuracy": 0.9775202721357346,
"step": 214
},
{
"epoch": 35.95238095238095,
"grad_norm": 1.8757683038711548,
"learning_rate": 2.878787878787879e-05,
"loss": 0.2589,
"mean_token_accuracy": 0.9773915261030197,
"step": 215
},
{
"epoch": 36.0,
"grad_norm": 2.2826578617095947,
"learning_rate": 2.868686868686869e-05,
"loss": 0.0933,
"mean_token_accuracy": 0.9491525292396545,
"step": 216
},
{
"epoch": 36.19047619047619,
"grad_norm": 1.3637081384658813,
"learning_rate": 2.8585858585858587e-05,
"loss": 0.245,
"mean_token_accuracy": 0.9781962931156158,
"step": 217
},
{
"epoch": 36.38095238095238,
"grad_norm": 1.4664133787155151,
"learning_rate": 2.8484848484848486e-05,
"loss": 0.2521,
"mean_token_accuracy": 0.9817428290843964,
"step": 218
},
{
"epoch": 36.57142857142857,
"grad_norm": 1.5265666246414185,
"learning_rate": 2.8383838383838386e-05,
"loss": 0.2615,
"mean_token_accuracy": 0.9806021302938461,
"step": 219
},
{
"epoch": 36.76190476190476,
"grad_norm": 1.4322954416275024,
"learning_rate": 2.8282828282828282e-05,
"loss": 0.2599,
"mean_token_accuracy": 0.9800188541412354,
"step": 220
},
{
"epoch": 36.95238095238095,
"grad_norm": 1.76764976978302,
"learning_rate": 2.818181818181818e-05,
"loss": 0.292,
"mean_token_accuracy": 0.9746560305356979,
"step": 221
},
{
"epoch": 37.0,
"grad_norm": 2.1554458141326904,
"learning_rate": 2.808080808080808e-05,
"loss": 0.0865,
"mean_token_accuracy": 0.9682539701461792,
"step": 222
},
{
"epoch": 37.19047619047619,
"grad_norm": 1.4079774618148804,
"learning_rate": 2.7979797979797984e-05,
"loss": 0.2359,
"mean_token_accuracy": 0.9809356033802032,
"step": 223
},
{
"epoch": 37.38095238095238,
"grad_norm": 1.8873682022094727,
"learning_rate": 2.7878787878787883e-05,
"loss": 0.2731,
"mean_token_accuracy": 0.9777008444070816,
"step": 224
},
{
"epoch": 37.57142857142857,
"grad_norm": 1.7195765972137451,
"learning_rate": 2.777777777777778e-05,
"loss": 0.2557,
"mean_token_accuracy": 0.980317622423172,
"step": 225
},
{
"epoch": 37.76190476190476,
"grad_norm": 1.5935289859771729,
"learning_rate": 2.767676767676768e-05,
"loss": 0.2663,
"mean_token_accuracy": 0.9756544232368469,
"step": 226
},
{
"epoch": 37.95238095238095,
"grad_norm": 1.626733660697937,
"learning_rate": 2.7575757575757578e-05,
"loss": 0.2668,
"mean_token_accuracy": 0.9801195561885834,
"step": 227
},
{
"epoch": 38.0,
"grad_norm": 2.378291368484497,
"learning_rate": 2.7474747474747474e-05,
"loss": 0.0872,
"mean_token_accuracy": 0.9718309640884399,
"step": 228
},
{
"epoch": 38.19047619047619,
"grad_norm": 1.4580754041671753,
"learning_rate": 2.7373737373737374e-05,
"loss": 0.243,
"mean_token_accuracy": 0.9807321429252625,
"step": 229
},
{
"epoch": 38.38095238095238,
"grad_norm": 1.3259878158569336,
"learning_rate": 2.7272727272727273e-05,
"loss": 0.2479,
"mean_token_accuracy": 0.9801591485738754,
"step": 230
},
{
"epoch": 38.57142857142857,
"grad_norm": 1.43174147605896,
"learning_rate": 2.717171717171717e-05,
"loss": 0.2477,
"mean_token_accuracy": 0.9830300509929657,
"step": 231
},
{
"epoch": 38.76190476190476,
"grad_norm": 1.6294718980789185,
"learning_rate": 2.7070707070707075e-05,
"loss": 0.2666,
"mean_token_accuracy": 0.9755284339189529,
"step": 232
},
{
"epoch": 38.95238095238095,
"grad_norm": 2.30196213722229,
"learning_rate": 2.696969696969697e-05,
"loss": 0.2929,
"mean_token_accuracy": 0.9752500951290131,
"step": 233
},
{
"epoch": 39.0,
"grad_norm": 1.96921968460083,
"learning_rate": 2.686868686868687e-05,
"loss": 0.0762,
"mean_token_accuracy": 0.9722222089767456,
"step": 234
},
{
"epoch": 39.19047619047619,
"grad_norm": 1.3506882190704346,
"learning_rate": 2.676767676767677e-05,
"loss": 0.2359,
"mean_token_accuracy": 0.9817389249801636,
"step": 235
},
{
"epoch": 39.38095238095238,
"grad_norm": 1.4548856019973755,
"learning_rate": 2.6666666666666667e-05,
"loss": 0.2456,
"mean_token_accuracy": 0.9811435043811798,
"step": 236
},
{
"epoch": 39.57142857142857,
"grad_norm": 1.5215767621994019,
"learning_rate": 2.6565656565656566e-05,
"loss": 0.2575,
"mean_token_accuracy": 0.9797980934381485,
"step": 237
},
{
"epoch": 39.76190476190476,
"grad_norm": 1.8254742622375488,
"learning_rate": 2.6464646464646466e-05,
"loss": 0.2889,
"mean_token_accuracy": 0.9770003706216812,
"step": 238
},
{
"epoch": 39.95238095238095,
"grad_norm": 1.818259596824646,
"learning_rate": 2.636363636363636e-05,
"loss": 0.2897,
"mean_token_accuracy": 0.976064071059227,
"step": 239
},
{
"epoch": 40.0,
"grad_norm": 1.3236188888549805,
"learning_rate": 2.6262626262626268e-05,
"loss": 0.0774,
"mean_token_accuracy": 0.9838709831237793,
"step": 240
},
{
"epoch": 40.19047619047619,
"grad_norm": 1.5586050748825073,
"learning_rate": 2.6161616161616164e-05,
"loss": 0.2731,
"mean_token_accuracy": 0.9815535992383957,
"step": 241
},
{
"epoch": 40.38095238095238,
"grad_norm": 1.5174766778945923,
"learning_rate": 2.6060606060606063e-05,
"loss": 0.2473,
"mean_token_accuracy": 0.9786833673715591,
"step": 242
},
{
"epoch": 40.57142857142857,
"grad_norm": 1.3981167078018188,
"learning_rate": 2.5959595959595963e-05,
"loss": 0.2531,
"mean_token_accuracy": 0.9792415052652359,
"step": 243
},
{
"epoch": 40.76190476190476,
"grad_norm": 1.5628103017807007,
"learning_rate": 2.585858585858586e-05,
"loss": 0.257,
"mean_token_accuracy": 0.9798375219106674,
"step": 244
},
{
"epoch": 40.95238095238095,
"grad_norm": 1.5515220165252686,
"learning_rate": 2.575757575757576e-05,
"loss": 0.2669,
"mean_token_accuracy": 0.9787022620439529,
"step": 245
},
{
"epoch": 41.0,
"grad_norm": 1.8415720462799072,
"learning_rate": 2.5656565656565658e-05,
"loss": 0.0799,
"mean_token_accuracy": 0.9682539701461792,
"step": 246
},
{
"epoch": 41.19047619047619,
"grad_norm": 1.423293113708496,
"learning_rate": 2.5555555555555554e-05,
"loss": 0.2393,
"mean_token_accuracy": 0.9812082797288895,
"step": 247
},
{
"epoch": 41.38095238095238,
"grad_norm": 1.394112467765808,
"learning_rate": 2.5454545454545454e-05,
"loss": 0.2521,
"mean_token_accuracy": 0.9827133864164352,
"step": 248
},
{
"epoch": 41.57142857142857,
"grad_norm": 1.6987677812576294,
"learning_rate": 2.5353535353535356e-05,
"loss": 0.2671,
"mean_token_accuracy": 0.9742349684238434,
"step": 249
},
{
"epoch": 41.76190476190476,
"grad_norm": 1.6028631925582886,
"learning_rate": 2.5252525252525256e-05,
"loss": 0.2602,
"mean_token_accuracy": 0.9791279435157776,
"step": 250
},
{
"epoch": 41.95238095238095,
"grad_norm": 1.8165968656539917,
"learning_rate": 2.5151515151515155e-05,
"loss": 0.2826,
"mean_token_accuracy": 0.9778096079826355,
"step": 251
},
{
"epoch": 42.0,
"grad_norm": 0.9838045835494995,
"learning_rate": 2.505050505050505e-05,
"loss": 0.0517,
"mean_token_accuracy": 0.9902912378311157,
"step": 252
},
{
"epoch": 42.19047619047619,
"grad_norm": 1.3776968717575073,
"learning_rate": 2.494949494949495e-05,
"loss": 0.2612,
"mean_token_accuracy": 0.9751808941364288,
"step": 253
},
{
"epoch": 42.38095238095238,
"grad_norm": 1.5808742046356201,
"learning_rate": 2.4848484848484847e-05,
"loss": 0.2466,
"mean_token_accuracy": 0.9846099317073822,
"step": 254
},
{
"epoch": 42.57142857142857,
"grad_norm": 1.2304980754852295,
"learning_rate": 2.474747474747475e-05,
"loss": 0.2344,
"mean_token_accuracy": 0.9795664101839066,
"step": 255
},
{
"epoch": 42.76190476190476,
"grad_norm": 1.6060268878936768,
"learning_rate": 2.464646464646465e-05,
"loss": 0.2817,
"mean_token_accuracy": 0.9776766449213028,
"step": 256
},
{
"epoch": 42.95238095238095,
"grad_norm": 1.6796001195907593,
"learning_rate": 2.4545454545454545e-05,
"loss": 0.2489,
"mean_token_accuracy": 0.9769842028617859,
"step": 257
},
{
"epoch": 43.0,
"grad_norm": 1.4542969465255737,
"learning_rate": 2.4444444444444445e-05,
"loss": 0.0595,
"mean_token_accuracy": 0.9753086566925049,
"step": 258
},
{
"epoch": 43.19047619047619,
"grad_norm": 1.4857451915740967,
"learning_rate": 2.4343434343434344e-05,
"loss": 0.2527,
"mean_token_accuracy": 0.97712042927742,
"step": 259
},
{
"epoch": 43.38095238095238,
"grad_norm": 1.306619644165039,
"learning_rate": 2.4242424242424244e-05,
"loss": 0.2363,
"mean_token_accuracy": 0.980791300535202,
"step": 260
},
{
"epoch": 43.57142857142857,
"grad_norm": 1.6846957206726074,
"learning_rate": 2.4141414141414143e-05,
"loss": 0.259,
"mean_token_accuracy": 0.9791981130838394,
"step": 261
},
{
"epoch": 43.76190476190476,
"grad_norm": 1.4038276672363281,
"learning_rate": 2.404040404040404e-05,
"loss": 0.251,
"mean_token_accuracy": 0.9791757315397263,
"step": 262
},
{
"epoch": 43.95238095238095,
"grad_norm": 1.5158367156982422,
"learning_rate": 2.393939393939394e-05,
"loss": 0.2702,
"mean_token_accuracy": 0.9788329601287842,
"step": 263
},
{
"epoch": 44.0,
"grad_norm": 1.7850970029830933,
"learning_rate": 2.3838383838383842e-05,
"loss": 0.0728,
"mean_token_accuracy": 0.9759036302566528,
"step": 264
},
{
"epoch": 44.19047619047619,
"grad_norm": 1.1887112855911255,
"learning_rate": 2.3737373737373738e-05,
"loss": 0.2319,
"mean_token_accuracy": 0.9812621474266052,
"step": 265
},
{
"epoch": 44.38095238095238,
"grad_norm": 1.4217466115951538,
"learning_rate": 2.3636363636363637e-05,
"loss": 0.238,
"mean_token_accuracy": 0.9808095693588257,
"step": 266
},
{
"epoch": 44.57142857142857,
"grad_norm": 1.7025716304779053,
"learning_rate": 2.3535353535353537e-05,
"loss": 0.2537,
"mean_token_accuracy": 0.9779138118028641,
"step": 267
},
{
"epoch": 44.76190476190476,
"grad_norm": 1.7018096446990967,
"learning_rate": 2.3434343434343436e-05,
"loss": 0.274,
"mean_token_accuracy": 0.9743378162384033,
"step": 268
},
{
"epoch": 44.95238095238095,
"grad_norm": 1.7380796670913696,
"learning_rate": 2.3333333333333336e-05,
"loss": 0.2768,
"mean_token_accuracy": 0.9779854416847229,
"step": 269
},
{
"epoch": 45.0,
"grad_norm": 1.0162783861160278,
"learning_rate": 2.3232323232323232e-05,
"loss": 0.051,
"mean_token_accuracy": 0.9898989796638489,
"step": 270
},
{
"epoch": 45.19047619047619,
"grad_norm": 1.322588562965393,
"learning_rate": 2.313131313131313e-05,
"loss": 0.2384,
"mean_token_accuracy": 0.9804540276527405,
"step": 271
},
{
"epoch": 45.38095238095238,
"grad_norm": 1.294411301612854,
"learning_rate": 2.3030303030303034e-05,
"loss": 0.2342,
"mean_token_accuracy": 0.9810962080955505,
"step": 272
},
{
"epoch": 45.57142857142857,
"grad_norm": 1.4505170583724976,
"learning_rate": 2.292929292929293e-05,
"loss": 0.2572,
"mean_token_accuracy": 0.9756149500608444,
"step": 273
},
{
"epoch": 45.76190476190476,
"grad_norm": 1.6599575281143188,
"learning_rate": 2.282828282828283e-05,
"loss": 0.2678,
"mean_token_accuracy": 0.9741277694702148,
"step": 274
},
{
"epoch": 45.95238095238095,
"grad_norm": 1.4780550003051758,
"learning_rate": 2.272727272727273e-05,
"loss": 0.2647,
"mean_token_accuracy": 0.9768411070108414,
"step": 275
},
{
"epoch": 46.0,
"grad_norm": 1.1366266012191772,
"learning_rate": 2.262626262626263e-05,
"loss": 0.0557,
"mean_token_accuracy": 0.9878048896789551,
"step": 276
},
{
"epoch": 46.19047619047619,
"grad_norm": 1.3346896171569824,
"learning_rate": 2.2525252525252528e-05,
"loss": 0.2325,
"mean_token_accuracy": 0.979757234454155,
"step": 277
},
{
"epoch": 46.38095238095238,
"grad_norm": 1.4182461500167847,
"learning_rate": 2.2424242424242424e-05,
"loss": 0.2331,
"mean_token_accuracy": 0.9792613536119461,
"step": 278
},
{
"epoch": 46.57142857142857,
"grad_norm": 1.5474402904510498,
"learning_rate": 2.2323232323232324e-05,
"loss": 0.2641,
"mean_token_accuracy": 0.9776208251714706,
"step": 279
},
{
"epoch": 46.76190476190476,
"grad_norm": 1.8437175750732422,
"learning_rate": 2.2222222222222223e-05,
"loss": 0.2841,
"mean_token_accuracy": 0.9759227335453033,
"step": 280
},
{
"epoch": 46.95238095238095,
"grad_norm": 1.8677905797958374,
"learning_rate": 2.2121212121212123e-05,
"loss": 0.2611,
"mean_token_accuracy": 0.9794552326202393,
"step": 281
},
{
"epoch": 47.0,
"grad_norm": 1.7438082695007324,
"learning_rate": 2.2020202020202022e-05,
"loss": 0.0768,
"mean_token_accuracy": 0.9701492786407471,
"step": 282
},
{
"epoch": 47.19047619047619,
"grad_norm": 1.38357675075531,
"learning_rate": 2.191919191919192e-05,
"loss": 0.2514,
"mean_token_accuracy": 0.9804678857326508,
"step": 283
},
{
"epoch": 47.38095238095238,
"grad_norm": 1.3532003164291382,
"learning_rate": 2.1818181818181818e-05,
"loss": 0.233,
"mean_token_accuracy": 0.9824511855840683,
"step": 284
},
{
"epoch": 47.57142857142857,
"grad_norm": 1.6904886960983276,
"learning_rate": 2.171717171717172e-05,
"loss": 0.249,
"mean_token_accuracy": 0.9747414886951447,
"step": 285
},
{
"epoch": 47.76190476190476,
"grad_norm": 1.4693493843078613,
"learning_rate": 2.1616161616161617e-05,
"loss": 0.2637,
"mean_token_accuracy": 0.9777188897132874,
"step": 286
},
{
"epoch": 47.95238095238095,
"grad_norm": 1.4712016582489014,
"learning_rate": 2.1515151515151516e-05,
"loss": 0.2641,
"mean_token_accuracy": 0.9823849946260452,
"step": 287
},
{
"epoch": 48.0,
"grad_norm": 2.5622308254241943,
"learning_rate": 2.1414141414141416e-05,
"loss": 0.0963,
"mean_token_accuracy": 0.9473684430122375,
"step": 288
},
{
"epoch": 48.19047619047619,
"grad_norm": 1.4440287351608276,
"learning_rate": 2.1313131313131315e-05,
"loss": 0.2439,
"mean_token_accuracy": 0.9802645593881607,
"step": 289
},
{
"epoch": 48.38095238095238,
"grad_norm": 1.373253583908081,
"learning_rate": 2.1212121212121215e-05,
"loss": 0.2437,
"mean_token_accuracy": 0.9763128757476807,
"step": 290
},
{
"epoch": 48.57142857142857,
"grad_norm": 1.6184741258621216,
"learning_rate": 2.111111111111111e-05,
"loss": 0.2654,
"mean_token_accuracy": 0.9782317876815796,
"step": 291
},
{
"epoch": 48.76190476190476,
"grad_norm": 1.3039287328720093,
"learning_rate": 2.101010101010101e-05,
"loss": 0.2395,
"mean_token_accuracy": 0.9821481555700302,
"step": 292
},
{
"epoch": 48.95238095238095,
"grad_norm": 1.394302487373352,
"learning_rate": 2.090909090909091e-05,
"loss": 0.2645,
"mean_token_accuracy": 0.9776430726051331,
"step": 293
},
{
"epoch": 49.0,
"grad_norm": 1.0925865173339844,
"learning_rate": 2.080808080808081e-05,
"loss": 0.0562,
"mean_token_accuracy": 0.9878048896789551,
"step": 294
},
{
"epoch": 49.19047619047619,
"grad_norm": 1.3069161176681519,
"learning_rate": 2.070707070707071e-05,
"loss": 0.2455,
"mean_token_accuracy": 0.97951839864254,
"step": 295
},
{
"epoch": 49.38095238095238,
"grad_norm": 1.3214561939239502,
"learning_rate": 2.0606060606060608e-05,
"loss": 0.2381,
"mean_token_accuracy": 0.9809810966253281,
"step": 296
},
{
"epoch": 49.57142857142857,
"grad_norm": 1.3639582395553589,
"learning_rate": 2.0505050505050504e-05,
"loss": 0.2535,
"mean_token_accuracy": 0.9802620708942413,
"step": 297
},
{
"epoch": 49.76190476190476,
"grad_norm": 1.4789013862609863,
"learning_rate": 2.0404040404040407e-05,
"loss": 0.2622,
"mean_token_accuracy": 0.9760318547487259,
"step": 298
},
{
"epoch": 49.95238095238095,
"grad_norm": 1.5978738069534302,
"learning_rate": 2.0303030303030303e-05,
"loss": 0.2756,
"mean_token_accuracy": 0.9767571240663528,
"step": 299
},
{
"epoch": 50.0,
"grad_norm": 0.994212806224823,
"learning_rate": 2.0202020202020203e-05,
"loss": 0.0477,
"mean_token_accuracy": 0.9837398529052734,
"step": 300
},
{
"epoch": 50.19047619047619,
"grad_norm": 1.257419228553772,
"learning_rate": 2.0101010101010102e-05,
"loss": 0.2437,
"mean_token_accuracy": 0.9815521091222763,
"step": 301
},
{
"epoch": 50.38095238095238,
"grad_norm": 1.2623318433761597,
"learning_rate": 2e-05,
"loss": 0.2467,
"mean_token_accuracy": 0.9801167845726013,
"step": 302
},
{
"epoch": 50.57142857142857,
"grad_norm": 1.3023744821548462,
"learning_rate": 1.98989898989899e-05,
"loss": 0.2498,
"mean_token_accuracy": 0.9767654687166214,
"step": 303
},
{
"epoch": 50.76190476190476,
"grad_norm": 1.4939366579055786,
"learning_rate": 1.9797979797979797e-05,
"loss": 0.276,
"mean_token_accuracy": 0.9766338616609573,
"step": 304
},
{
"epoch": 50.95238095238095,
"grad_norm": 1.2986633777618408,
"learning_rate": 1.9696969696969697e-05,
"loss": 0.2431,
"mean_token_accuracy": 0.9812084436416626,
"step": 305
},
{
"epoch": 51.0,
"grad_norm": 2.027116298675537,
"learning_rate": 1.95959595959596e-05,
"loss": 0.0666,
"mean_token_accuracy": 0.9629629850387573,
"step": 306
},
{
"epoch": 51.19047619047619,
"grad_norm": 1.4073251485824585,
"learning_rate": 1.9494949494949496e-05,
"loss": 0.2457,
"mean_token_accuracy": 0.9779722541570663,
"step": 307
},
{
"epoch": 51.38095238095238,
"grad_norm": 1.383111834526062,
"learning_rate": 1.9393939393939395e-05,
"loss": 0.2377,
"mean_token_accuracy": 0.9842050075531006,
"step": 308
},
{
"epoch": 51.57142857142857,
"grad_norm": 1.4835509061813354,
"learning_rate": 1.9292929292929295e-05,
"loss": 0.2503,
"mean_token_accuracy": 0.9771096408367157,
"step": 309
},
{
"epoch": 51.76190476190476,
"grad_norm": 1.756462812423706,
"learning_rate": 1.919191919191919e-05,
"loss": 0.2544,
"mean_token_accuracy": 0.9787980318069458,
"step": 310
},
{
"epoch": 51.95238095238095,
"grad_norm": 1.5173331499099731,
"learning_rate": 1.9090909090909094e-05,
"loss": 0.2593,
"mean_token_accuracy": 0.9801317751407623,
"step": 311
},
{
"epoch": 52.0,
"grad_norm": 2.2640252113342285,
"learning_rate": 1.898989898989899e-05,
"loss": 0.087,
"mean_token_accuracy": 0.9558823704719543,
"step": 312
},
{
"epoch": 52.19047619047619,
"grad_norm": 1.4061003923416138,
"learning_rate": 1.888888888888889e-05,
"loss": 0.2364,
"mean_token_accuracy": 0.9783814698457718,
"step": 313
},
{
"epoch": 52.38095238095238,
"grad_norm": 1.2146430015563965,
"learning_rate": 1.878787878787879e-05,
"loss": 0.2265,
"mean_token_accuracy": 0.9835509955883026,
"step": 314
},
{
"epoch": 52.57142857142857,
"grad_norm": 1.5701649188995361,
"learning_rate": 1.8686868686868688e-05,
"loss": 0.2637,
"mean_token_accuracy": 0.9780102521181107,
"step": 315
},
{
"epoch": 52.76190476190476,
"grad_norm": 1.5340619087219238,
"learning_rate": 1.8585858585858588e-05,
"loss": 0.2627,
"mean_token_accuracy": 0.9796072393655777,
"step": 316
},
{
"epoch": 52.95238095238095,
"grad_norm": 1.6451423168182373,
"learning_rate": 1.8484848484848487e-05,
"loss": 0.2599,
"mean_token_accuracy": 0.9780296385288239,
"step": 317
},
{
"epoch": 53.0,
"grad_norm": 1.1250572204589844,
"learning_rate": 1.8383838383838383e-05,
"loss": 0.0599,
"mean_token_accuracy": 0.987500011920929,
"step": 318
},
{
"epoch": 53.19047619047619,
"grad_norm": 1.382422924041748,
"learning_rate": 1.8282828282828286e-05,
"loss": 0.2615,
"mean_token_accuracy": 0.9795158058404922,
"step": 319
},
{
"epoch": 53.38095238095238,
"grad_norm": 1.434237003326416,
"learning_rate": 1.8181818181818182e-05,
"loss": 0.2226,
"mean_token_accuracy": 0.9817993342876434,
"step": 320
},
{
"epoch": 53.57142857142857,
"grad_norm": 1.3543226718902588,
"learning_rate": 1.808080808080808e-05,
"loss": 0.2455,
"mean_token_accuracy": 0.9820217341184616,
"step": 321
},
{
"epoch": 53.76190476190476,
"grad_norm": 1.5558395385742188,
"learning_rate": 1.797979797979798e-05,
"loss": 0.2473,
"mean_token_accuracy": 0.9786651730537415,
"step": 322
},
{
"epoch": 53.95238095238095,
"grad_norm": 1.998782992362976,
"learning_rate": 1.787878787878788e-05,
"loss": 0.2808,
"mean_token_accuracy": 0.9743632227182388,
"step": 323
},
{
"epoch": 54.0,
"grad_norm": 1.8470655679702759,
"learning_rate": 1.777777777777778e-05,
"loss": 0.0674,
"mean_token_accuracy": 0.978723406791687,
"step": 324
},
{
"epoch": 54.19047619047619,
"grad_norm": 1.557365894317627,
"learning_rate": 1.7676767676767676e-05,
"loss": 0.2485,
"mean_token_accuracy": 0.9763985723257065,
"step": 325
},
{
"epoch": 54.38095238095238,
"grad_norm": 1.2708889245986938,
"learning_rate": 1.7575757575757576e-05,
"loss": 0.2396,
"mean_token_accuracy": 0.9807141125202179,
"step": 326
},
{
"epoch": 54.57142857142857,
"grad_norm": 1.574637770652771,
"learning_rate": 1.7474747474747475e-05,
"loss": 0.2552,
"mean_token_accuracy": 0.9784888029098511,
"step": 327
},
{
"epoch": 54.76190476190476,
"grad_norm": 1.5815781354904175,
"learning_rate": 1.7373737373737375e-05,
"loss": 0.2516,
"mean_token_accuracy": 0.9797972589731216,
"step": 328
},
{
"epoch": 54.95238095238095,
"grad_norm": 1.4875643253326416,
"learning_rate": 1.7272727272727274e-05,
"loss": 0.253,
"mean_token_accuracy": 0.9805921763181686,
"step": 329
},
{
"epoch": 55.0,
"grad_norm": 1.404120922088623,
"learning_rate": 1.7171717171717173e-05,
"loss": 0.0607,
"mean_token_accuracy": 0.9756097793579102,
"step": 330
},
{
"epoch": 55.19047619047619,
"grad_norm": 1.1672003269195557,
"learning_rate": 1.707070707070707e-05,
"loss": 0.226,
"mean_token_accuracy": 0.9818458557128906,
"step": 331
},
{
"epoch": 55.38095238095238,
"grad_norm": 1.3702583312988281,
"learning_rate": 1.6969696969696972e-05,
"loss": 0.2285,
"mean_token_accuracy": 0.9818858057260513,
"step": 332
},
{
"epoch": 55.57142857142857,
"grad_norm": 1.567103624343872,
"learning_rate": 1.686868686868687e-05,
"loss": 0.2592,
"mean_token_accuracy": 0.9774815589189529,
"step": 333
},
{
"epoch": 55.76190476190476,
"grad_norm": 1.5476545095443726,
"learning_rate": 1.6767676767676768e-05,
"loss": 0.2693,
"mean_token_accuracy": 0.9761824756860733,
"step": 334
},
{
"epoch": 55.95238095238095,
"grad_norm": 1.7951135635375977,
"learning_rate": 1.6666666666666667e-05,
"loss": 0.2627,
"mean_token_accuracy": 0.9772898554801941,
"step": 335
},
{
"epoch": 56.0,
"grad_norm": 1.5311144590377808,
"learning_rate": 1.6565656565656567e-05,
"loss": 0.0607,
"mean_token_accuracy": 0.9750000238418579,
"step": 336
},
{
"epoch": 56.19047619047619,
"grad_norm": 1.4896326065063477,
"learning_rate": 1.6464646464646466e-05,
"loss": 0.2483,
"mean_token_accuracy": 0.9790806472301483,
"step": 337
},
{
"epoch": 56.38095238095238,
"grad_norm": 1.385233998298645,
"learning_rate": 1.6363636363636366e-05,
"loss": 0.2471,
"mean_token_accuracy": 0.9801070243120193,
"step": 338
},
{
"epoch": 56.57142857142857,
"grad_norm": 1.5755606889724731,
"learning_rate": 1.6262626262626262e-05,
"loss": 0.2462,
"mean_token_accuracy": 0.9776095598936081,
"step": 339
},
{
"epoch": 56.76190476190476,
"grad_norm": 1.4080952405929565,
"learning_rate": 1.6161616161616165e-05,
"loss": 0.2559,
"mean_token_accuracy": 0.9763025045394897,
"step": 340
},
{
"epoch": 56.95238095238095,
"grad_norm": 1.2759824991226196,
"learning_rate": 1.606060606060606e-05,
"loss": 0.2429,
"mean_token_accuracy": 0.9811924993991852,
"step": 341
},
{
"epoch": 57.0,
"grad_norm": 1.4365907907485962,
"learning_rate": 1.595959595959596e-05,
"loss": 0.0744,
"mean_token_accuracy": 0.9836065769195557,
"step": 342
},
{
"epoch": 57.19047619047619,
"grad_norm": 1.4234627485275269,
"learning_rate": 1.585858585858586e-05,
"loss": 0.2353,
"mean_token_accuracy": 0.9792965203523636,
"step": 343
},
{
"epoch": 57.38095238095238,
"grad_norm": 1.3555465936660767,
"learning_rate": 1.5757575757575756e-05,
"loss": 0.2494,
"mean_token_accuracy": 0.9825381934642792,
"step": 344
},
{
"epoch": 57.57142857142857,
"grad_norm": 1.4413907527923584,
"learning_rate": 1.565656565656566e-05,
"loss": 0.2534,
"mean_token_accuracy": 0.979871854186058,
"step": 345
},
{
"epoch": 57.76190476190476,
"grad_norm": 1.4927953481674194,
"learning_rate": 1.5555555555555555e-05,
"loss": 0.2305,
"mean_token_accuracy": 0.9812074899673462,
"step": 346
},
{
"epoch": 57.95238095238095,
"grad_norm": 1.7719610929489136,
"learning_rate": 1.5454545454545454e-05,
"loss": 0.2633,
"mean_token_accuracy": 0.9754152894020081,
"step": 347
},
{
"epoch": 58.0,
"grad_norm": 0.9548564553260803,
"learning_rate": 1.5353535353535354e-05,
"loss": 0.0521,
"mean_token_accuracy": 0.9885057210922241,
"step": 348
},
{
"epoch": 58.19047619047619,
"grad_norm": 1.4914696216583252,
"learning_rate": 1.5252525252525255e-05,
"loss": 0.2591,
"mean_token_accuracy": 0.9796448796987534,
"step": 349
},
{
"epoch": 58.38095238095238,
"grad_norm": 1.4677958488464355,
"learning_rate": 1.5151515151515153e-05,
"loss": 0.2468,
"mean_token_accuracy": 0.9798107296228409,
"step": 350
},
{
"epoch": 58.57142857142857,
"grad_norm": 1.3141554594039917,
"learning_rate": 1.505050505050505e-05,
"loss": 0.2325,
"mean_token_accuracy": 0.9803733974695206,
"step": 351
},
{
"epoch": 58.76190476190476,
"grad_norm": 1.3697947263717651,
"learning_rate": 1.494949494949495e-05,
"loss": 0.2598,
"mean_token_accuracy": 0.9749108999967575,
"step": 352
},
{
"epoch": 58.95238095238095,
"grad_norm": 1.252795696258545,
"learning_rate": 1.484848484848485e-05,
"loss": 0.2361,
"mean_token_accuracy": 0.9824285060167313,
"step": 353
},
{
"epoch": 59.0,
"grad_norm": 1.830544114112854,
"learning_rate": 1.4747474747474749e-05,
"loss": 0.0772,
"mean_token_accuracy": 0.9682539701461792,
"step": 354
},
{
"epoch": 59.19047619047619,
"grad_norm": 1.266861081123352,
"learning_rate": 1.4646464646464647e-05,
"loss": 0.236,
"mean_token_accuracy": 0.9807495921850204,
"step": 355
},
{
"epoch": 59.38095238095238,
"grad_norm": 1.5132209062576294,
"learning_rate": 1.4545454545454545e-05,
"loss": 0.2498,
"mean_token_accuracy": 0.9786520302295685,
"step": 356
},
{
"epoch": 59.57142857142857,
"grad_norm": 1.259032964706421,
"learning_rate": 1.4444444444444444e-05,
"loss": 0.2223,
"mean_token_accuracy": 0.9812145084142685,
"step": 357
},
{
"epoch": 59.76190476190476,
"grad_norm": 1.5718448162078857,
"learning_rate": 1.4343434343434345e-05,
"loss": 0.2627,
"mean_token_accuracy": 0.9778482913970947,
"step": 358
},
{
"epoch": 59.95238095238095,
"grad_norm": 1.4775868654251099,
"learning_rate": 1.4242424242424243e-05,
"loss": 0.2587,
"mean_token_accuracy": 0.9746824651956558,
"step": 359
},
{
"epoch": 60.0,
"grad_norm": 1.638393521308899,
"learning_rate": 1.4141414141414141e-05,
"loss": 0.0824,
"mean_token_accuracy": 0.9824561476707458,
"step": 360
},
{
"epoch": 60.19047619047619,
"grad_norm": 1.3080830574035645,
"learning_rate": 1.404040404040404e-05,
"loss": 0.2382,
"mean_token_accuracy": 0.9818608462810516,
"step": 361
},
{
"epoch": 60.38095238095238,
"grad_norm": 1.1936572790145874,
"learning_rate": 1.3939393939393942e-05,
"loss": 0.2333,
"mean_token_accuracy": 0.9817762225866318,
"step": 362
},
{
"epoch": 60.57142857142857,
"grad_norm": 1.5468491315841675,
"learning_rate": 1.383838383838384e-05,
"loss": 0.2653,
"mean_token_accuracy": 0.9788466989994049,
"step": 363
},
{
"epoch": 60.76190476190476,
"grad_norm": 1.3440382480621338,
"learning_rate": 1.3737373737373737e-05,
"loss": 0.2495,
"mean_token_accuracy": 0.9803344905376434,
"step": 364
},
{
"epoch": 60.95238095238095,
"grad_norm": 1.5807853937149048,
"learning_rate": 1.3636363636363637e-05,
"loss": 0.2399,
"mean_token_accuracy": 0.977335661649704,
"step": 365
},
{
"epoch": 61.0,
"grad_norm": 1.8642648458480835,
"learning_rate": 1.3535353535353538e-05,
"loss": 0.0675,
"mean_token_accuracy": 0.9610389471054077,
"step": 366
},
{
"epoch": 61.19047619047619,
"grad_norm": 1.4595698118209839,
"learning_rate": 1.3434343434343436e-05,
"loss": 0.2433,
"mean_token_accuracy": 0.9782412499189377,
"step": 367
},
{
"epoch": 61.38095238095238,
"grad_norm": 1.7195943593978882,
"learning_rate": 1.3333333333333333e-05,
"loss": 0.2283,
"mean_token_accuracy": 0.98487289249897,
"step": 368
},
{
"epoch": 61.57142857142857,
"grad_norm": 1.6731146574020386,
"learning_rate": 1.3232323232323233e-05,
"loss": 0.2481,
"mean_token_accuracy": 0.9755380898714066,
"step": 369
},
{
"epoch": 61.76190476190476,
"grad_norm": 1.3162552118301392,
"learning_rate": 1.3131313131313134e-05,
"loss": 0.2682,
"mean_token_accuracy": 0.9773096293210983,
"step": 370
},
{
"epoch": 61.95238095238095,
"grad_norm": 1.5763328075408936,
"learning_rate": 1.3030303030303032e-05,
"loss": 0.247,
"mean_token_accuracy": 0.9791599065065384,
"step": 371
},
{
"epoch": 62.0,
"grad_norm": 1.8567732572555542,
"learning_rate": 1.292929292929293e-05,
"loss": 0.0676,
"mean_token_accuracy": 0.970588207244873,
"step": 372
},
{
"epoch": 62.19047619047619,
"grad_norm": 1.322481393814087,
"learning_rate": 1.2828282828282829e-05,
"loss": 0.2385,
"mean_token_accuracy": 0.979724794626236,
"step": 373
},
{
"epoch": 62.38095238095238,
"grad_norm": 1.4246753454208374,
"learning_rate": 1.2727272727272727e-05,
"loss": 0.2467,
"mean_token_accuracy": 0.9777331054210663,
"step": 374
},
{
"epoch": 62.57142857142857,
"grad_norm": 1.4530190229415894,
"learning_rate": 1.2626262626262628e-05,
"loss": 0.2377,
"mean_token_accuracy": 0.9767781794071198,
"step": 375
},
{
"epoch": 62.76190476190476,
"grad_norm": 1.4946351051330566,
"learning_rate": 1.2525252525252526e-05,
"loss": 0.2547,
"mean_token_accuracy": 0.9767863899469376,
"step": 376
},
{
"epoch": 62.95238095238095,
"grad_norm": 1.442986011505127,
"learning_rate": 1.2424242424242424e-05,
"loss": 0.2575,
"mean_token_accuracy": 0.9808852076530457,
"step": 377
},
{
"epoch": 63.0,
"grad_norm": 2.1069142818450928,
"learning_rate": 1.2323232323232325e-05,
"loss": 0.0682,
"mean_token_accuracy": 0.9726027250289917,
"step": 378
},
{
"epoch": 63.19047619047619,
"grad_norm": 1.4386465549468994,
"learning_rate": 1.2222222222222222e-05,
"loss": 0.2472,
"mean_token_accuracy": 0.9808338433504105,
"step": 379
},
{
"epoch": 63.38095238095238,
"grad_norm": 1.5726056098937988,
"learning_rate": 1.2121212121212122e-05,
"loss": 0.2488,
"mean_token_accuracy": 0.9816757142543793,
"step": 380
},
{
"epoch": 63.57142857142857,
"grad_norm": 1.6537950038909912,
"learning_rate": 1.202020202020202e-05,
"loss": 0.2471,
"mean_token_accuracy": 0.9798701107501984,
"step": 381
},
{
"epoch": 63.76190476190476,
"grad_norm": 1.4154284000396729,
"learning_rate": 1.1919191919191921e-05,
"loss": 0.2483,
"mean_token_accuracy": 0.9786428213119507,
"step": 382
},
{
"epoch": 63.95238095238095,
"grad_norm": 1.493235468864441,
"learning_rate": 1.1818181818181819e-05,
"loss": 0.2499,
"mean_token_accuracy": 0.9752872586250305,
"step": 383
},
{
"epoch": 64.0,
"grad_norm": 0.9331473112106323,
"learning_rate": 1.1717171717171718e-05,
"loss": 0.0481,
"mean_token_accuracy": 0.9902912378311157,
"step": 384
},
{
"epoch": 64.19047619047619,
"grad_norm": 1.5490996837615967,
"learning_rate": 1.1616161616161616e-05,
"loss": 0.2544,
"mean_token_accuracy": 0.9750427901744843,
"step": 385
},
{
"epoch": 64.38095238095238,
"grad_norm": 1.2337415218353271,
"learning_rate": 1.1515151515151517e-05,
"loss": 0.2372,
"mean_token_accuracy": 0.9794412702322006,
"step": 386
},
{
"epoch": 64.57142857142857,
"grad_norm": 1.3450168371200562,
"learning_rate": 1.1414141414141415e-05,
"loss": 0.251,
"mean_token_accuracy": 0.9808587580919266,
"step": 387
},
{
"epoch": 64.76190476190476,
"grad_norm": 1.4372197389602661,
"learning_rate": 1.1313131313131314e-05,
"loss": 0.2541,
"mean_token_accuracy": 0.9765901118516922,
"step": 388
},
{
"epoch": 64.95238095238095,
"grad_norm": 1.3596030473709106,
"learning_rate": 1.1212121212121212e-05,
"loss": 0.2327,
"mean_token_accuracy": 0.9819456040859222,
"step": 389
},
{
"epoch": 65.0,
"grad_norm": 1.2771663665771484,
"learning_rate": 1.1111111111111112e-05,
"loss": 0.0615,
"mean_token_accuracy": 0.9871794581413269,
"step": 390
},
{
"epoch": 65.19047619047619,
"grad_norm": 1.3283063173294067,
"learning_rate": 1.1010101010101011e-05,
"loss": 0.2431,
"mean_token_accuracy": 0.9796550124883652,
"step": 391
},
{
"epoch": 65.38095238095238,
"grad_norm": 1.4404308795928955,
"learning_rate": 1.0909090909090909e-05,
"loss": 0.242,
"mean_token_accuracy": 0.9827671945095062,
"step": 392
},
{
"epoch": 65.57142857142857,
"grad_norm": 1.322653889656067,
"learning_rate": 1.0808080808080808e-05,
"loss": 0.235,
"mean_token_accuracy": 0.9791911989450455,
"step": 393
},
{
"epoch": 65.76190476190476,
"grad_norm": 1.346421718597412,
"learning_rate": 1.0707070707070708e-05,
"loss": 0.2602,
"mean_token_accuracy": 0.9792519062757492,
"step": 394
},
{
"epoch": 65.95238095238095,
"grad_norm": 1.361152172088623,
"learning_rate": 1.0606060606060607e-05,
"loss": 0.2404,
"mean_token_accuracy": 0.9787698835134506,
"step": 395
},
{
"epoch": 66.0,
"grad_norm": 1.4586611986160278,
"learning_rate": 1.0505050505050505e-05,
"loss": 0.0681,
"mean_token_accuracy": 0.970588207244873,
"step": 396
},
{
"epoch": 66.19047619047619,
"grad_norm": 1.4977368116378784,
"learning_rate": 1.0404040404040405e-05,
"loss": 0.2359,
"mean_token_accuracy": 0.9806597381830215,
"step": 397
},
{
"epoch": 66.38095238095238,
"grad_norm": 1.2351692914962769,
"learning_rate": 1.0303030303030304e-05,
"loss": 0.2508,
"mean_token_accuracy": 0.977878749370575,
"step": 398
},
{
"epoch": 66.57142857142857,
"grad_norm": 1.3478460311889648,
"learning_rate": 1.0202020202020204e-05,
"loss": 0.2321,
"mean_token_accuracy": 0.9855255037546158,
"step": 399
},
{
"epoch": 66.76190476190476,
"grad_norm": 1.618532419204712,
"learning_rate": 1.0101010101010101e-05,
"loss": 0.2658,
"mean_token_accuracy": 0.9772535562515259,
"step": 400
},
{
"epoch": 66.95238095238095,
"grad_norm": 1.5389485359191895,
"learning_rate": 1e-05,
"loss": 0.2465,
"mean_token_accuracy": 0.9769544303417206,
"step": 401
},
{
"epoch": 67.0,
"grad_norm": 0.9716305732727051,
"learning_rate": 9.898989898989899e-06,
"loss": 0.0529,
"mean_token_accuracy": 0.9885057210922241,
"step": 402
},
{
"epoch": 67.19047619047619,
"grad_norm": 1.4950332641601562,
"learning_rate": 9.7979797979798e-06,
"loss": 0.249,
"mean_token_accuracy": 0.9769591093063354,
"step": 403
},
{
"epoch": 67.38095238095238,
"grad_norm": 1.524194359779358,
"learning_rate": 9.696969696969698e-06,
"loss": 0.2477,
"mean_token_accuracy": 0.98219034075737,
"step": 404
},
{
"epoch": 67.57142857142857,
"grad_norm": 1.231911540031433,
"learning_rate": 9.595959595959595e-06,
"loss": 0.2232,
"mean_token_accuracy": 0.9810429662466049,
"step": 405
},
{
"epoch": 67.76190476190476,
"grad_norm": 1.404455304145813,
"learning_rate": 9.494949494949495e-06,
"loss": 0.2701,
"mean_token_accuracy": 0.9793097227811813,
"step": 406
},
{
"epoch": 67.95238095238095,
"grad_norm": 1.3537510633468628,
"learning_rate": 9.393939393939394e-06,
"loss": 0.2338,
"mean_token_accuracy": 0.9800481051206589,
"step": 407
},
{
"epoch": 68.0,
"grad_norm": 0.9093771576881409,
"learning_rate": 9.292929292929294e-06,
"loss": 0.0423,
"mean_token_accuracy": 0.9902912378311157,
"step": 408
},
{
"epoch": 68.19047619047619,
"grad_norm": 1.3876770734786987,
"learning_rate": 9.191919191919192e-06,
"loss": 0.2453,
"mean_token_accuracy": 0.9814929813146591,
"step": 409
},
{
"epoch": 68.38095238095238,
"grad_norm": 1.5604972839355469,
"learning_rate": 9.090909090909091e-06,
"loss": 0.2474,
"mean_token_accuracy": 0.9796653985977173,
"step": 410
},
{
"epoch": 68.57142857142857,
"grad_norm": 1.4196627140045166,
"learning_rate": 8.98989898989899e-06,
"loss": 0.2421,
"mean_token_accuracy": 0.9826227128505707,
"step": 411
},
{
"epoch": 68.76190476190476,
"grad_norm": 1.4446525573730469,
"learning_rate": 8.88888888888889e-06,
"loss": 0.237,
"mean_token_accuracy": 0.9770011454820633,
"step": 412
},
{
"epoch": 68.95238095238095,
"grad_norm": 1.3088741302490234,
"learning_rate": 8.787878787878788e-06,
"loss": 0.242,
"mean_token_accuracy": 0.9788557142019272,
"step": 413
},
{
"epoch": 69.0,
"grad_norm": 1.1058439016342163,
"learning_rate": 8.686868686868687e-06,
"loss": 0.0552,
"mean_token_accuracy": 0.9878048896789551,
"step": 414
},
{
"epoch": 69.19047619047619,
"grad_norm": 1.5012304782867432,
"learning_rate": 8.585858585858587e-06,
"loss": 0.2472,
"mean_token_accuracy": 0.9804881513118744,
"step": 415
},
{
"epoch": 69.38095238095238,
"grad_norm": 1.2776250839233398,
"learning_rate": 8.484848484848486e-06,
"loss": 0.245,
"mean_token_accuracy": 0.9793550372123718,
"step": 416
},
{
"epoch": 69.57142857142857,
"grad_norm": 1.4031535387039185,
"learning_rate": 8.383838383838384e-06,
"loss": 0.2391,
"mean_token_accuracy": 0.9811627715826035,
"step": 417
},
{
"epoch": 69.76190476190476,
"grad_norm": 1.5323896408081055,
"learning_rate": 8.282828282828283e-06,
"loss": 0.2402,
"mean_token_accuracy": 0.9756592959165573,
"step": 418
},
{
"epoch": 69.95238095238095,
"grad_norm": 1.415002465248108,
"learning_rate": 8.181818181818183e-06,
"loss": 0.2447,
"mean_token_accuracy": 0.9816397428512573,
"step": 419
},
{
"epoch": 70.0,
"grad_norm": 1.84005606174469,
"learning_rate": 8.080808080808082e-06,
"loss": 0.0622,
"mean_token_accuracy": 0.9726027250289917,
"step": 420
},
{
"epoch": 70.19047619047619,
"grad_norm": 1.3505762815475464,
"learning_rate": 7.97979797979798e-06,
"loss": 0.2363,
"mean_token_accuracy": 0.9800622910261154,
"step": 421
},
{
"epoch": 70.38095238095238,
"grad_norm": 1.3231146335601807,
"learning_rate": 7.878787878787878e-06,
"loss": 0.2327,
"mean_token_accuracy": 0.9815961122512817,
"step": 422
},
{
"epoch": 70.57142857142857,
"grad_norm": 1.6289716958999634,
"learning_rate": 7.777777777777777e-06,
"loss": 0.2469,
"mean_token_accuracy": 0.976947546005249,
"step": 423
},
{
"epoch": 70.76190476190476,
"grad_norm": 1.5643327236175537,
"learning_rate": 7.676767676767677e-06,
"loss": 0.2541,
"mean_token_accuracy": 0.9771561771631241,
"step": 424
},
{
"epoch": 70.95238095238095,
"grad_norm": 1.4305167198181152,
"learning_rate": 7.5757575757575764e-06,
"loss": 0.2452,
"mean_token_accuracy": 0.9759194254875183,
"step": 425
},
{
"epoch": 71.0,
"grad_norm": 1.5850602388381958,
"learning_rate": 7.474747474747475e-06,
"loss": 0.0683,
"mean_token_accuracy": 0.9850746393203735,
"step": 426
},
{
"epoch": 71.19047619047619,
"grad_norm": 1.3248540163040161,
"learning_rate": 7.3737373737373745e-06,
"loss": 0.24,
"mean_token_accuracy": 0.9821758568286896,
"step": 427
},
{
"epoch": 71.38095238095238,
"grad_norm": 1.3908957242965698,
"learning_rate": 7.272727272727272e-06,
"loss": 0.242,
"mean_token_accuracy": 0.9802806377410889,
"step": 428
},
{
"epoch": 71.57142857142857,
"grad_norm": 1.3902804851531982,
"learning_rate": 7.171717171717173e-06,
"loss": 0.2423,
"mean_token_accuracy": 0.9788789004087448,
"step": 429
},
{
"epoch": 71.76190476190476,
"grad_norm": 1.4126980304718018,
"learning_rate": 7.0707070707070704e-06,
"loss": 0.2437,
"mean_token_accuracy": 0.9766863882541656,
"step": 430
},
{
"epoch": 71.95238095238095,
"grad_norm": 1.423156499862671,
"learning_rate": 6.969696969696971e-06,
"loss": 0.2427,
"mean_token_accuracy": 0.9781524240970612,
"step": 431
},
{
"epoch": 72.0,
"grad_norm": 1.736093521118164,
"learning_rate": 6.8686868686868685e-06,
"loss": 0.0814,
"mean_token_accuracy": 0.9818181991577148,
"step": 432
},
{
"epoch": 72.19047619047619,
"grad_norm": 1.281557321548462,
"learning_rate": 6.767676767676769e-06,
"loss": 0.2482,
"mean_token_accuracy": 0.9825676530599594,
"step": 433
},
{
"epoch": 72.38095238095238,
"grad_norm": 1.3980622291564941,
"learning_rate": 6.666666666666667e-06,
"loss": 0.2428,
"mean_token_accuracy": 0.9788574278354645,
"step": 434
},
{
"epoch": 72.57142857142857,
"grad_norm": 1.419425368309021,
"learning_rate": 6.565656565656567e-06,
"loss": 0.2431,
"mean_token_accuracy": 0.9791808128356934,
"step": 435
},
{
"epoch": 72.76190476190476,
"grad_norm": 1.5525389909744263,
"learning_rate": 6.464646464646465e-06,
"loss": 0.2538,
"mean_token_accuracy": 0.9783525764942169,
"step": 436
},
{
"epoch": 72.95238095238095,
"grad_norm": 1.295773983001709,
"learning_rate": 6.363636363636363e-06,
"loss": 0.2299,
"mean_token_accuracy": 0.9779433310031891,
"step": 437
},
{
"epoch": 73.0,
"grad_norm": 0.6111257076263428,
"learning_rate": 6.262626262626263e-06,
"loss": 0.0384,
"mean_token_accuracy": 0.9922480583190918,
"step": 438
},
{
"epoch": 73.19047619047619,
"grad_norm": 1.387117862701416,
"learning_rate": 6.161616161616162e-06,
"loss": 0.2405,
"mean_token_accuracy": 0.979522630572319,
"step": 439
},
{
"epoch": 73.38095238095238,
"grad_norm": 1.3952202796936035,
"learning_rate": 6.060606060606061e-06,
"loss": 0.2486,
"mean_token_accuracy": 0.9780898541212082,
"step": 440
},
{
"epoch": 73.57142857142857,
"grad_norm": 1.6391713619232178,
"learning_rate": 5.9595959595959605e-06,
"loss": 0.2504,
"mean_token_accuracy": 0.9782277494668961,
"step": 441
},
{
"epoch": 73.76190476190476,
"grad_norm": 1.4811103343963623,
"learning_rate": 5.858585858585859e-06,
"loss": 0.2392,
"mean_token_accuracy": 0.9793239235877991,
"step": 442
},
{
"epoch": 73.95238095238095,
"grad_norm": 1.4281538724899292,
"learning_rate": 5.7575757575757586e-06,
"loss": 0.2326,
"mean_token_accuracy": 0.979654997587204,
"step": 443
},
{
"epoch": 74.0,
"grad_norm": 1.2993221282958984,
"learning_rate": 5.656565656565657e-06,
"loss": 0.0573,
"mean_token_accuracy": 0.9876543283462524,
"step": 444
},
{
"epoch": 74.19047619047619,
"grad_norm": 1.2887934446334839,
"learning_rate": 5.555555555555556e-06,
"loss": 0.2422,
"mean_token_accuracy": 0.9798881709575653,
"step": 445
},
{
"epoch": 74.38095238095238,
"grad_norm": 1.581034779548645,
"learning_rate": 5.4545454545454545e-06,
"loss": 0.2462,
"mean_token_accuracy": 0.9796192944049835,
"step": 446
},
{
"epoch": 74.57142857142857,
"grad_norm": 1.219085693359375,
"learning_rate": 5.353535353535354e-06,
"loss": 0.2434,
"mean_token_accuracy": 0.9797424674034119,
"step": 447
},
{
"epoch": 74.76190476190476,
"grad_norm": 1.2309306859970093,
"learning_rate": 5.2525252525252526e-06,
"loss": 0.2379,
"mean_token_accuracy": 0.978371798992157,
"step": 448
},
{
"epoch": 74.95238095238095,
"grad_norm": 1.4002373218536377,
"learning_rate": 5.151515151515152e-06,
"loss": 0.2325,
"mean_token_accuracy": 0.9793529957532883,
"step": 449
},
{
"epoch": 75.0,
"grad_norm": 2.0193445682525635,
"learning_rate": 5.050505050505051e-06,
"loss": 0.0842,
"mean_token_accuracy": 0.9807692170143127,
"step": 450
},
{
"epoch": 75.19047619047619,
"grad_norm": 1.3020991086959839,
"learning_rate": 4.949494949494949e-06,
"loss": 0.2249,
"mean_token_accuracy": 0.983807697892189,
"step": 451
},
{
"epoch": 75.38095238095238,
"grad_norm": 1.2189743518829346,
"learning_rate": 4.848484848484849e-06,
"loss": 0.2444,
"mean_token_accuracy": 0.9823562502861023,
"step": 452
},
{
"epoch": 75.57142857142857,
"grad_norm": 1.43671715259552,
"learning_rate": 4.747474747474747e-06,
"loss": 0.2473,
"mean_token_accuracy": 0.9775967448949814,
"step": 453
},
{
"epoch": 75.76190476190476,
"grad_norm": 1.6678014993667603,
"learning_rate": 4.646464646464647e-06,
"loss": 0.2352,
"mean_token_accuracy": 0.9812745600938797,
"step": 454
},
{
"epoch": 75.95238095238095,
"grad_norm": 1.9260616302490234,
"learning_rate": 4.5454545454545455e-06,
"loss": 0.2581,
"mean_token_accuracy": 0.9734574407339096,
"step": 455
},
{
"epoch": 76.0,
"grad_norm": 1.5224919319152832,
"learning_rate": 4.444444444444445e-06,
"loss": 0.0667,
"mean_token_accuracy": 0.9846153855323792,
"step": 456
},
{
"epoch": 76.19047619047619,
"grad_norm": 1.1384742259979248,
"learning_rate": 4.343434343434344e-06,
"loss": 0.2166,
"mean_token_accuracy": 0.9816610366106033,
"step": 457
},
{
"epoch": 76.38095238095238,
"grad_norm": 1.5136680603027344,
"learning_rate": 4.242424242424243e-06,
"loss": 0.2443,
"mean_token_accuracy": 0.9804109483957291,
"step": 458
},
{
"epoch": 76.57142857142857,
"grad_norm": 1.5559028387069702,
"learning_rate": 4.141414141414142e-06,
"loss": 0.2472,
"mean_token_accuracy": 0.9795145392417908,
"step": 459
},
{
"epoch": 76.76190476190476,
"grad_norm": 1.4042458534240723,
"learning_rate": 4.040404040404041e-06,
"loss": 0.2422,
"mean_token_accuracy": 0.9746371954679489,
"step": 460
},
{
"epoch": 76.95238095238095,
"grad_norm": 1.3069055080413818,
"learning_rate": 3.939393939393939e-06,
"loss": 0.2574,
"mean_token_accuracy": 0.981501892209053,
"step": 461
},
{
"epoch": 77.0,
"grad_norm": 1.4545823335647583,
"learning_rate": 3.8383838383838385e-06,
"loss": 0.0675,
"mean_token_accuracy": 0.970588207244873,
"step": 462
},
{
"epoch": 77.19047619047619,
"grad_norm": 1.4684022665023804,
"learning_rate": 3.7373737373737375e-06,
"loss": 0.2269,
"mean_token_accuracy": 0.981085941195488,
"step": 463
},
{
"epoch": 77.38095238095238,
"grad_norm": 1.5217136144638062,
"learning_rate": 3.636363636363636e-06,
"loss": 0.2415,
"mean_token_accuracy": 0.9836974442005157,
"step": 464
},
{
"epoch": 77.57142857142857,
"grad_norm": 1.2941691875457764,
"learning_rate": 3.5353535353535352e-06,
"loss": 0.2387,
"mean_token_accuracy": 0.978131577372551,
"step": 465
},
{
"epoch": 77.76190476190476,
"grad_norm": 1.4465221166610718,
"learning_rate": 3.4343434343434343e-06,
"loss": 0.2404,
"mean_token_accuracy": 0.9785452336072922,
"step": 466
},
{
"epoch": 77.95238095238095,
"grad_norm": 1.4259777069091797,
"learning_rate": 3.3333333333333333e-06,
"loss": 0.2515,
"mean_token_accuracy": 0.9781184196472168,
"step": 467
},
{
"epoch": 78.0,
"grad_norm": 1.9436161518096924,
"learning_rate": 3.2323232323232324e-06,
"loss": 0.0751,
"mean_token_accuracy": 0.9661017060279846,
"step": 468
},
{
"epoch": 78.19047619047619,
"grad_norm": 1.2418111562728882,
"learning_rate": 3.1313131313131314e-06,
"loss": 0.2206,
"mean_token_accuracy": 0.980968713760376,
"step": 469
},
{
"epoch": 78.38095238095238,
"grad_norm": 1.3781098127365112,
"learning_rate": 3.0303030303030305e-06,
"loss": 0.2423,
"mean_token_accuracy": 0.9790701419115067,
"step": 470
},
{
"epoch": 78.57142857142857,
"grad_norm": 1.3852852582931519,
"learning_rate": 2.9292929292929295e-06,
"loss": 0.2423,
"mean_token_accuracy": 0.9788630157709122,
"step": 471
},
{
"epoch": 78.76190476190476,
"grad_norm": 1.5246734619140625,
"learning_rate": 2.8282828282828286e-06,
"loss": 0.2497,
"mean_token_accuracy": 0.9794032126665115,
"step": 472
},
{
"epoch": 78.95238095238095,
"grad_norm": 1.4307729005813599,
"learning_rate": 2.7272727272727272e-06,
"loss": 0.2479,
"mean_token_accuracy": 0.9815962314605713,
"step": 473
},
{
"epoch": 79.0,
"grad_norm": 1.941765308380127,
"learning_rate": 2.6262626262626263e-06,
"loss": 0.0653,
"mean_token_accuracy": 0.9577465057373047,
"step": 474
},
{
"epoch": 79.19047619047619,
"grad_norm": 1.2771799564361572,
"learning_rate": 2.5252525252525253e-06,
"loss": 0.2255,
"mean_token_accuracy": 0.9810370206832886,
"step": 475
},
{
"epoch": 79.38095238095238,
"grad_norm": 1.325358271598816,
"learning_rate": 2.4242424242424244e-06,
"loss": 0.242,
"mean_token_accuracy": 0.9791529029607773,
"step": 476
},
{
"epoch": 79.57142857142857,
"grad_norm": 1.295100212097168,
"learning_rate": 2.3232323232323234e-06,
"loss": 0.2488,
"mean_token_accuracy": 0.9798661768436432,
"step": 477
},
{
"epoch": 79.76190476190476,
"grad_norm": 1.4676238298416138,
"learning_rate": 2.2222222222222225e-06,
"loss": 0.2367,
"mean_token_accuracy": 0.9780342727899551,
"step": 478
},
{
"epoch": 79.95238095238095,
"grad_norm": 1.7996033430099487,
"learning_rate": 2.1212121212121216e-06,
"loss": 0.2452,
"mean_token_accuracy": 0.9771022349596024,
"step": 479
},
{
"epoch": 80.0,
"grad_norm": 1.3761502504348755,
"learning_rate": 2.0202020202020206e-06,
"loss": 0.06,
"mean_token_accuracy": 0.970588207244873,
"step": 480
},
{
"epoch": 80.19047619047619,
"grad_norm": 1.3741532564163208,
"learning_rate": 1.9191919191919192e-06,
"loss": 0.2414,
"mean_token_accuracy": 0.9827142953872681,
"step": 481
},
{
"epoch": 80.38095238095238,
"grad_norm": 1.680336594581604,
"learning_rate": 1.818181818181818e-06,
"loss": 0.2308,
"mean_token_accuracy": 0.980181872844696,
"step": 482
},
{
"epoch": 80.57142857142857,
"grad_norm": 1.1747589111328125,
"learning_rate": 1.7171717171717171e-06,
"loss": 0.2201,
"mean_token_accuracy": 0.9804712980985641,
"step": 483
},
{
"epoch": 80.76190476190476,
"grad_norm": 1.4682387113571167,
"learning_rate": 1.6161616161616162e-06,
"loss": 0.2481,
"mean_token_accuracy": 0.9811168909072876,
"step": 484
},
{
"epoch": 80.95238095238095,
"grad_norm": 1.5288760662078857,
"learning_rate": 1.5151515151515152e-06,
"loss": 0.2542,
"mean_token_accuracy": 0.9763506799936295,
"step": 485
},
{
"epoch": 81.0,
"grad_norm": 2.051353931427002,
"learning_rate": 1.4141414141414143e-06,
"loss": 0.0759,
"mean_token_accuracy": 0.9666666388511658,
"step": 486
},
{
"epoch": 81.19047619047619,
"grad_norm": 1.4453171491622925,
"learning_rate": 1.3131313131313131e-06,
"loss": 0.2488,
"mean_token_accuracy": 0.9764743894338608,
"step": 487
},
{
"epoch": 81.38095238095238,
"grad_norm": 1.2203129529953003,
"learning_rate": 1.2121212121212122e-06,
"loss": 0.2208,
"mean_token_accuracy": 0.9802269041538239,
"step": 488
},
{
"epoch": 81.57142857142857,
"grad_norm": 1.338069200515747,
"learning_rate": 1.1111111111111112e-06,
"loss": 0.2454,
"mean_token_accuracy": 0.9848097264766693,
"step": 489
},
{
"epoch": 81.76190476190476,
"grad_norm": 1.3311666250228882,
"learning_rate": 1.0101010101010103e-06,
"loss": 0.2276,
"mean_token_accuracy": 0.9802386462688446,
"step": 490
},
{
"epoch": 81.95238095238095,
"grad_norm": 1.4156842231750488,
"learning_rate": 9.09090909090909e-07,
"loss": 0.2622,
"mean_token_accuracy": 0.9762069880962372,
"step": 491
},
{
"epoch": 82.0,
"grad_norm": 1.7438231706619263,
"learning_rate": 8.080808080808081e-07,
"loss": 0.0642,
"mean_token_accuracy": 0.9710144996643066,
"step": 492
},
{
"epoch": 82.19047619047619,
"grad_norm": 1.338675618171692,
"learning_rate": 7.070707070707071e-07,
"loss": 0.2547,
"mean_token_accuracy": 0.9793485999107361,
"step": 493
},
{
"epoch": 82.38095238095238,
"grad_norm": 1.248263955116272,
"learning_rate": 6.060606060606061e-07,
"loss": 0.2139,
"mean_token_accuracy": 0.9814836531877518,
"step": 494
},
{
"epoch": 82.57142857142857,
"grad_norm": 1.4303299188613892,
"learning_rate": 5.050505050505052e-07,
"loss": 0.2466,
"mean_token_accuracy": 0.9783899486064911,
"step": 495
},
{
"epoch": 82.76190476190476,
"grad_norm": 1.4656988382339478,
"learning_rate": 4.0404040404040405e-07,
"loss": 0.2469,
"mean_token_accuracy": 0.9803285598754883,
"step": 496
},
{
"epoch": 82.95238095238095,
"grad_norm": 1.3924672603607178,
"learning_rate": 3.0303030303030305e-07,
"loss": 0.2375,
"mean_token_accuracy": 0.9797345548868179,
"step": 497
},
{
"epoch": 83.0,
"grad_norm": 0.9879482388496399,
"learning_rate": 2.0202020202020202e-07,
"loss": 0.0395,
"mean_token_accuracy": 0.9838709831237793,
"step": 498
},
{
"epoch": 83.19047619047619,
"grad_norm": 1.2162104845046997,
"learning_rate": 1.0101010101010101e-07,
"loss": 0.2433,
"mean_token_accuracy": 0.981399655342102,
"step": 499
},
{
"epoch": 83.38095238095238,
"grad_norm": 1.2492247819900513,
"learning_rate": 0.0,
"loss": 0.2299,
"mean_token_accuracy": 0.9802171587944031,
"step": 500
}
],
"logging_steps": 1,
"max_steps": 500,
"num_input_tokens_seen": 0,
"num_train_epochs": 100,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2203866148700160.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}