{ "best_metric": null, "best_model_checkpoint": null, "epoch": 83.38095238095238, "eval_steps": 500, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.19047619047619047, "grad_norm": 33.78459548950195, "learning_rate": 1e-05, "loss": 14.2748, "mean_token_accuracy": 0.4245416074991226, "step": 1 }, { "epoch": 0.38095238095238093, "grad_norm": 34.141048431396484, "learning_rate": 2e-05, "loss": 14.9063, "mean_token_accuracy": 0.42434193193912506, "step": 2 }, { "epoch": 0.5714285714285714, "grad_norm": 33.89708709716797, "learning_rate": 3e-05, "loss": 14.4293, "mean_token_accuracy": 0.42967987805604935, "step": 3 }, { "epoch": 0.7619047619047619, "grad_norm": 21.82135009765625, "learning_rate": 4e-05, "loss": 13.1187, "mean_token_accuracy": 0.4886682406067848, "step": 4 }, { "epoch": 0.9523809523809523, "grad_norm": 19.052448272705078, "learning_rate": 5e-05, "loss": 11.6617, "mean_token_accuracy": 0.5300922393798828, "step": 5 }, { "epoch": 1.0, "grad_norm": 10.372604370117188, "learning_rate": 4.98989898989899e-05, "loss": 1.9845, "mean_token_accuracy": 0.6190476417541504, "step": 6 }, { "epoch": 1.1904761904761905, "grad_norm": 18.249330520629883, "learning_rate": 4.97979797979798e-05, "loss": 9.8237, "mean_token_accuracy": 0.58917336165905, "step": 7 }, { "epoch": 1.380952380952381, "grad_norm": 18.177717208862305, "learning_rate": 4.9696969696969694e-05, "loss": 9.7575, "mean_token_accuracy": 0.5883309841156006, "step": 8 }, { "epoch": 1.5714285714285714, "grad_norm": 16.09309196472168, "learning_rate": 4.9595959595959594e-05, "loss": 9.3943, "mean_token_accuracy": 0.6104246228933334, "step": 9 }, { "epoch": 1.7619047619047619, "grad_norm": 14.678476333618164, "learning_rate": 4.94949494949495e-05, "loss": 8.6018, "mean_token_accuracy": 0.6411420404911041, "step": 10 }, { "epoch": 1.9523809523809523, "grad_norm": 12.80629825592041, "learning_rate": 4.93939393939394e-05, "loss": 7.9568, "mean_token_accuracy": 0.6764184236526489, "step": 11 }, { "epoch": 2.0, "grad_norm": 9.918559074401855, "learning_rate": 4.92929292929293e-05, "loss": 1.4247, "mean_token_accuracy": 0.75, "step": 12 }, { "epoch": 2.1904761904761907, "grad_norm": 11.65300464630127, "learning_rate": 4.919191919191919e-05, "loss": 7.3849, "mean_token_accuracy": 0.6941855251789093, "step": 13 }, { "epoch": 2.380952380952381, "grad_norm": 11.127327919006348, "learning_rate": 4.909090909090909e-05, "loss": 6.7104, "mean_token_accuracy": 0.7069735676050186, "step": 14 }, { "epoch": 2.571428571428571, "grad_norm": 11.559555053710938, "learning_rate": 4.898989898989899e-05, "loss": 7.0902, "mean_token_accuracy": 0.709569051861763, "step": 15 }, { "epoch": 2.761904761904762, "grad_norm": 10.838669776916504, "learning_rate": 4.888888888888889e-05, "loss": 6.7901, "mean_token_accuracy": 0.713655412197113, "step": 16 }, { "epoch": 2.9523809523809526, "grad_norm": 10.266611099243164, "learning_rate": 4.878787878787879e-05, "loss": 6.4548, "mean_token_accuracy": 0.7244278490543365, "step": 17 }, { "epoch": 3.0, "grad_norm": 5.915023326873779, "learning_rate": 4.868686868686869e-05, "loss": 0.636, "mean_token_accuracy": 0.8730158805847168, "step": 18 }, { "epoch": 3.1904761904761907, "grad_norm": 9.826017379760742, "learning_rate": 4.858585858585859e-05, "loss": 5.6655, "mean_token_accuracy": 0.7555368840694427, "step": 19 }, { "epoch": 3.380952380952381, "grad_norm": 9.213407516479492, "learning_rate": 4.848484848484849e-05, "loss": 6.4954, "mean_token_accuracy": 0.7222279012203217, "step": 20 }, { "epoch": 3.571428571428571, "grad_norm": 9.642789840698242, "learning_rate": 4.838383838383839e-05, "loss": 5.1397, "mean_token_accuracy": 0.7691315412521362, "step": 21 }, { "epoch": 3.761904761904762, "grad_norm": 8.594555854797363, "learning_rate": 4.828282828282829e-05, "loss": 5.4342, "mean_token_accuracy": 0.7607319056987762, "step": 22 }, { "epoch": 3.9523809523809526, "grad_norm": 8.79131031036377, "learning_rate": 4.8181818181818186e-05, "loss": 5.7146, "mean_token_accuracy": 0.7484780848026276, "step": 23 }, { "epoch": 4.0, "grad_norm": 6.953114032745361, "learning_rate": 4.808080808080808e-05, "loss": 1.4211, "mean_token_accuracy": 0.7580645084381104, "step": 24 }, { "epoch": 4.190476190476191, "grad_norm": 8.912933349609375, "learning_rate": 4.797979797979798e-05, "loss": 4.9729, "mean_token_accuracy": 0.7652112394571304, "step": 25 }, { "epoch": 4.380952380952381, "grad_norm": 9.128190994262695, "learning_rate": 4.787878787878788e-05, "loss": 4.9376, "mean_token_accuracy": 0.7736384719610214, "step": 26 }, { "epoch": 4.571428571428571, "grad_norm": 9.021340370178223, "learning_rate": 4.7777777777777784e-05, "loss": 5.1022, "mean_token_accuracy": 0.7747573852539062, "step": 27 }, { "epoch": 4.761904761904762, "grad_norm": 8.445326805114746, "learning_rate": 4.7676767676767684e-05, "loss": 4.4903, "mean_token_accuracy": 0.8014376759529114, "step": 28 }, { "epoch": 4.9523809523809526, "grad_norm": 8.269598960876465, "learning_rate": 4.7575757575757576e-05, "loss": 4.7027, "mean_token_accuracy": 0.7928940802812576, "step": 29 }, { "epoch": 5.0, "grad_norm": 4.256129264831543, "learning_rate": 4.7474747474747476e-05, "loss": 1.1768, "mean_token_accuracy": 0.8405796885490417, "step": 30 }, { "epoch": 5.190476190476191, "grad_norm": 7.8270978927612305, "learning_rate": 4.7373737373737375e-05, "loss": 4.2699, "mean_token_accuracy": 0.8052034825086594, "step": 31 }, { "epoch": 5.380952380952381, "grad_norm": 7.741850852966309, "learning_rate": 4.7272727272727275e-05, "loss": 3.9571, "mean_token_accuracy": 0.8226524442434311, "step": 32 }, { "epoch": 5.571428571428571, "grad_norm": 7.062904357910156, "learning_rate": 4.7171717171717174e-05, "loss": 4.1547, "mean_token_accuracy": 0.8154689371585846, "step": 33 }, { "epoch": 5.761904761904762, "grad_norm": 7.048011779785156, "learning_rate": 4.7070707070707074e-05, "loss": 4.4063, "mean_token_accuracy": 0.8031313121318817, "step": 34 }, { "epoch": 5.9523809523809526, "grad_norm": 7.0800580978393555, "learning_rate": 4.696969696969697e-05, "loss": 3.6279, "mean_token_accuracy": 0.8297399282455444, "step": 35 }, { "epoch": 6.0, "grad_norm": 7.842761993408203, "learning_rate": 4.686868686868687e-05, "loss": 1.2107, "mean_token_accuracy": 0.8068181872367859, "step": 36 }, { "epoch": 6.190476190476191, "grad_norm": 7.796157360076904, "learning_rate": 4.676767676767677e-05, "loss": 3.3978, "mean_token_accuracy": 0.8329954296350479, "step": 37 }, { "epoch": 6.380952380952381, "grad_norm": 6.457103252410889, "learning_rate": 4.666666666666667e-05, "loss": 3.451, "mean_token_accuracy": 0.8273660093545914, "step": 38 }, { "epoch": 6.571428571428571, "grad_norm": 6.003915786743164, "learning_rate": 4.656565656565657e-05, "loss": 3.5587, "mean_token_accuracy": 0.83831487596035, "step": 39 }, { "epoch": 6.761904761904762, "grad_norm": 6.043710231781006, "learning_rate": 4.6464646464646464e-05, "loss": 3.5422, "mean_token_accuracy": 0.8222462385892868, "step": 40 }, { "epoch": 6.9523809523809526, "grad_norm": 6.391598701477051, "learning_rate": 4.636363636363636e-05, "loss": 3.2658, "mean_token_accuracy": 0.856766939163208, "step": 41 }, { "epoch": 7.0, "grad_norm": 5.940098285675049, "learning_rate": 4.626262626262626e-05, "loss": 0.7579, "mean_token_accuracy": 0.8301886916160583, "step": 42 }, { "epoch": 7.190476190476191, "grad_norm": 6.040279388427734, "learning_rate": 4.616161616161616e-05, "loss": 2.7243, "mean_token_accuracy": 0.8692310005426407, "step": 43 }, { "epoch": 7.380952380952381, "grad_norm": 5.645506858825684, "learning_rate": 4.606060606060607e-05, "loss": 2.701, "mean_token_accuracy": 0.8647979497909546, "step": 44 }, { "epoch": 7.571428571428571, "grad_norm": 5.126684188842773, "learning_rate": 4.595959595959596e-05, "loss": 2.8655, "mean_token_accuracy": 0.8684723079204559, "step": 45 }, { "epoch": 7.761904761904762, "grad_norm": 8.235642433166504, "learning_rate": 4.585858585858586e-05, "loss": 2.9052, "mean_token_accuracy": 0.8446438163518906, "step": 46 }, { "epoch": 7.9523809523809526, "grad_norm": 6.074913501739502, "learning_rate": 4.575757575757576e-05, "loss": 2.8831, "mean_token_accuracy": 0.857246458530426, "step": 47 }, { "epoch": 8.0, "grad_norm": 4.886857986450195, "learning_rate": 4.565656565656566e-05, "loss": 0.8029, "mean_token_accuracy": 0.8294573426246643, "step": 48 }, { "epoch": 8.19047619047619, "grad_norm": 6.794694900512695, "learning_rate": 4.555555555555556e-05, "loss": 2.4927, "mean_token_accuracy": 0.8782062977552414, "step": 49 }, { "epoch": 8.380952380952381, "grad_norm": 5.690680503845215, "learning_rate": 4.545454545454546e-05, "loss": 1.9744, "mean_token_accuracy": 0.8949980139732361, "step": 50 }, { "epoch": 8.571428571428571, "grad_norm": 9.415908813476562, "learning_rate": 4.535353535353535e-05, "loss": 2.2432, "mean_token_accuracy": 0.8826991468667984, "step": 51 }, { "epoch": 8.761904761904763, "grad_norm": 7.901670932769775, "learning_rate": 4.525252525252526e-05, "loss": 2.2805, "mean_token_accuracy": 0.8890593945980072, "step": 52 }, { "epoch": 8.952380952380953, "grad_norm": 6.918704986572266, "learning_rate": 4.515151515151516e-05, "loss": 2.5343, "mean_token_accuracy": 0.8712608069181442, "step": 53 }, { "epoch": 9.0, "grad_norm": 12.76561450958252, "learning_rate": 4.5050505050505056e-05, "loss": 0.576, "mean_token_accuracy": 0.8529411554336548, "step": 54 }, { "epoch": 9.19047619047619, "grad_norm": 6.143138408660889, "learning_rate": 4.494949494949495e-05, "loss": 1.878, "mean_token_accuracy": 0.9020879119634628, "step": 55 }, { "epoch": 9.380952380952381, "grad_norm": 7.497737884521484, "learning_rate": 4.484848484848485e-05, "loss": 1.9871, "mean_token_accuracy": 0.8944180905818939, "step": 56 }, { "epoch": 9.571428571428571, "grad_norm": 5.427354335784912, "learning_rate": 4.474747474747475e-05, "loss": 1.9095, "mean_token_accuracy": 0.9023730456829071, "step": 57 }, { "epoch": 9.761904761904763, "grad_norm": 5.814023017883301, "learning_rate": 4.464646464646465e-05, "loss": 1.8084, "mean_token_accuracy": 0.9020061939954758, "step": 58 }, { "epoch": 9.952380952380953, "grad_norm": 6.965571403503418, "learning_rate": 4.454545454545455e-05, "loss": 1.7746, "mean_token_accuracy": 0.9095794558525085, "step": 59 }, { "epoch": 10.0, "grad_norm": 6.048158168792725, "learning_rate": 4.4444444444444447e-05, "loss": 0.4674, "mean_token_accuracy": 0.9152542352676392, "step": 60 }, { "epoch": 10.19047619047619, "grad_norm": 6.400238513946533, "learning_rate": 4.4343434343434346e-05, "loss": 1.4747, "mean_token_accuracy": 0.9173053950071335, "step": 61 }, { "epoch": 10.380952380952381, "grad_norm": 5.616025924682617, "learning_rate": 4.4242424242424246e-05, "loss": 1.4234, "mean_token_accuracy": 0.9245103895664215, "step": 62 }, { "epoch": 10.571428571428571, "grad_norm": 6.788946628570557, "learning_rate": 4.4141414141414145e-05, "loss": 1.6027, "mean_token_accuracy": 0.9176820814609528, "step": 63 }, { "epoch": 10.761904761904763, "grad_norm": 6.084983825683594, "learning_rate": 4.4040404040404044e-05, "loss": 1.4259, "mean_token_accuracy": 0.9250814765691757, "step": 64 }, { "epoch": 10.952380952380953, "grad_norm": 10.394392967224121, "learning_rate": 4.3939393939393944e-05, "loss": 1.2998, "mean_token_accuracy": 0.9314595013856888, "step": 65 }, { "epoch": 11.0, "grad_norm": 4.715174198150635, "learning_rate": 4.383838383838384e-05, "loss": 0.2015, "mean_token_accuracy": 0.9506173133850098, "step": 66 }, { "epoch": 11.19047619047619, "grad_norm": 4.792293071746826, "learning_rate": 4.3737373737373736e-05, "loss": 1.2582, "mean_token_accuracy": 0.9351158142089844, "step": 67 }, { "epoch": 11.380952380952381, "grad_norm": 7.185492515563965, "learning_rate": 4.3636363636363636e-05, "loss": 1.025, "mean_token_accuracy": 0.9418339878320694, "step": 68 }, { "epoch": 11.571428571428571, "grad_norm": 6.083255290985107, "learning_rate": 4.3535353535353535e-05, "loss": 1.0012, "mean_token_accuracy": 0.9446901679039001, "step": 69 }, { "epoch": 11.761904761904763, "grad_norm": 8.141711235046387, "learning_rate": 4.343434343434344e-05, "loss": 1.2278, "mean_token_accuracy": 0.9310520589351654, "step": 70 }, { "epoch": 11.952380952380953, "grad_norm": 9.146880149841309, "learning_rate": 4.3333333333333334e-05, "loss": 1.0842, "mean_token_accuracy": 0.9404759407043457, "step": 71 }, { "epoch": 12.0, "grad_norm": 3.645364761352539, "learning_rate": 4.3232323232323234e-05, "loss": 0.1553, "mean_token_accuracy": 0.9714285731315613, "step": 72 }, { "epoch": 12.19047619047619, "grad_norm": 7.048225402832031, "learning_rate": 4.313131313131313e-05, "loss": 1.0319, "mean_token_accuracy": 0.9446324110031128, "step": 73 }, { "epoch": 12.380952380952381, "grad_norm": 6.668647289276123, "learning_rate": 4.303030303030303e-05, "loss": 0.8348, "mean_token_accuracy": 0.9561943113803864, "step": 74 }, { "epoch": 12.571428571428571, "grad_norm": 7.347132205963135, "learning_rate": 4.292929292929293e-05, "loss": 0.8571, "mean_token_accuracy": 0.9449830502271652, "step": 75 }, { "epoch": 12.761904761904763, "grad_norm": 5.543299674987793, "learning_rate": 4.282828282828283e-05, "loss": 0.9421, "mean_token_accuracy": 0.9508587419986725, "step": 76 }, { "epoch": 12.952380952380953, "grad_norm": 6.999424934387207, "learning_rate": 4.2727272727272724e-05, "loss": 0.6839, "mean_token_accuracy": 0.9609730541706085, "step": 77 }, { "epoch": 13.0, "grad_norm": 2.92433762550354, "learning_rate": 4.262626262626263e-05, "loss": 0.1323, "mean_token_accuracy": 0.9838709831237793, "step": 78 }, { "epoch": 13.19047619047619, "grad_norm": 5.790960311889648, "learning_rate": 4.252525252525253e-05, "loss": 0.7111, "mean_token_accuracy": 0.9593389332294464, "step": 79 }, { "epoch": 13.380952380952381, "grad_norm": 5.800691604614258, "learning_rate": 4.242424242424243e-05, "loss": 0.6327, "mean_token_accuracy": 0.9631912261247635, "step": 80 }, { "epoch": 13.571428571428571, "grad_norm": 5.627686977386475, "learning_rate": 4.232323232323233e-05, "loss": 0.6079, "mean_token_accuracy": 0.961370512843132, "step": 81 }, { "epoch": 13.761904761904763, "grad_norm": 7.996088027954102, "learning_rate": 4.222222222222222e-05, "loss": 0.578, "mean_token_accuracy": 0.9649683386087418, "step": 82 }, { "epoch": 13.952380952380953, "grad_norm": 6.650062084197998, "learning_rate": 4.212121212121212e-05, "loss": 0.738, "mean_token_accuracy": 0.9565856605768204, "step": 83 }, { "epoch": 14.0, "grad_norm": 3.682978630065918, "learning_rate": 4.202020202020202e-05, "loss": 0.1826, "mean_token_accuracy": 0.9818181991577148, "step": 84 }, { "epoch": 14.19047619047619, "grad_norm": 4.094846725463867, "learning_rate": 4.191919191919192e-05, "loss": 0.4917, "mean_token_accuracy": 0.9723720699548721, "step": 85 }, { "epoch": 14.380952380952381, "grad_norm": 5.953057289123535, "learning_rate": 4.181818181818182e-05, "loss": 0.4787, "mean_token_accuracy": 0.9700902253389359, "step": 86 }, { "epoch": 14.571428571428571, "grad_norm": 4.5836591720581055, "learning_rate": 4.171717171717172e-05, "loss": 0.5792, "mean_token_accuracy": 0.9712613523006439, "step": 87 }, { "epoch": 14.761904761904763, "grad_norm": 4.867373943328857, "learning_rate": 4.161616161616162e-05, "loss": 0.4702, "mean_token_accuracy": 0.9780033379793167, "step": 88 }, { "epoch": 14.952380952380953, "grad_norm": 7.761333465576172, "learning_rate": 4.151515151515152e-05, "loss": 0.6332, "mean_token_accuracy": 0.9641157388687134, "step": 89 }, { "epoch": 15.0, "grad_norm": 4.875545501708984, "learning_rate": 4.141414141414142e-05, "loss": 0.1378, "mean_token_accuracy": 0.98591548204422, "step": 90 }, { "epoch": 15.19047619047619, "grad_norm": 4.117421627044678, "learning_rate": 4.131313131313132e-05, "loss": 0.4463, "mean_token_accuracy": 0.9724489748477936, "step": 91 }, { "epoch": 15.380952380952381, "grad_norm": 3.252460241317749, "learning_rate": 4.1212121212121216e-05, "loss": 0.3858, "mean_token_accuracy": 0.9809663742780685, "step": 92 }, { "epoch": 15.571428571428571, "grad_norm": 4.330794334411621, "learning_rate": 4.111111111111111e-05, "loss": 0.4585, "mean_token_accuracy": 0.9748548269271851, "step": 93 }, { "epoch": 15.761904761904763, "grad_norm": 5.096158027648926, "learning_rate": 4.101010101010101e-05, "loss": 0.4829, "mean_token_accuracy": 0.9708511531352997, "step": 94 }, { "epoch": 15.952380952380953, "grad_norm": 6.11644172668457, "learning_rate": 4.0909090909090915e-05, "loss": 0.4374, "mean_token_accuracy": 0.974689856171608, "step": 95 }, { "epoch": 16.0, "grad_norm": 2.1705079078674316, "learning_rate": 4.0808080808080814e-05, "loss": 0.0851, "mean_token_accuracy": 0.9838709831237793, "step": 96 }, { "epoch": 16.19047619047619, "grad_norm": 3.2492971420288086, "learning_rate": 4.070707070707071e-05, "loss": 0.3638, "mean_token_accuracy": 0.9768412113189697, "step": 97 }, { "epoch": 16.38095238095238, "grad_norm": 2.8683860301971436, "learning_rate": 4.0606060606060606e-05, "loss": 0.3437, "mean_token_accuracy": 0.9768141210079193, "step": 98 }, { "epoch": 16.571428571428573, "grad_norm": 3.508230686187744, "learning_rate": 4.0505050505050506e-05, "loss": 0.354, "mean_token_accuracy": 0.9778662770986557, "step": 99 }, { "epoch": 16.761904761904763, "grad_norm": 3.8338069915771484, "learning_rate": 4.0404040404040405e-05, "loss": 0.3948, "mean_token_accuracy": 0.973381832242012, "step": 100 }, { "epoch": 16.952380952380953, "grad_norm": 4.676501750946045, "learning_rate": 4.0303030303030305e-05, "loss": 0.3893, "mean_token_accuracy": 0.9753514975309372, "step": 101 }, { "epoch": 17.0, "grad_norm": 4.8052287101745605, "learning_rate": 4.0202020202020204e-05, "loss": 0.1183, "mean_token_accuracy": 0.9649122953414917, "step": 102 }, { "epoch": 17.19047619047619, "grad_norm": 3.2596077919006348, "learning_rate": 4.01010101010101e-05, "loss": 0.3596, "mean_token_accuracy": 0.9725935012102127, "step": 103 }, { "epoch": 17.38095238095238, "grad_norm": 2.6120784282684326, "learning_rate": 4e-05, "loss": 0.3414, "mean_token_accuracy": 0.9788288474082947, "step": 104 }, { "epoch": 17.571428571428573, "grad_norm": 3.26759934425354, "learning_rate": 3.98989898989899e-05, "loss": 0.3576, "mean_token_accuracy": 0.9772270619869232, "step": 105 }, { "epoch": 17.761904761904763, "grad_norm": 3.644747734069824, "learning_rate": 3.97979797979798e-05, "loss": 0.3324, "mean_token_accuracy": 0.9781567454338074, "step": 106 }, { "epoch": 17.952380952380953, "grad_norm": 4.441091537475586, "learning_rate": 3.96969696969697e-05, "loss": 0.3747, "mean_token_accuracy": 0.9714739322662354, "step": 107 }, { "epoch": 18.0, "grad_norm": 2.743286371231079, "learning_rate": 3.9595959595959594e-05, "loss": 0.0975, "mean_token_accuracy": 0.9696969985961914, "step": 108 }, { "epoch": 18.19047619047619, "grad_norm": 3.2830970287323, "learning_rate": 3.9494949494949494e-05, "loss": 0.3028, "mean_token_accuracy": 0.9811016768217087, "step": 109 }, { "epoch": 18.38095238095238, "grad_norm": 2.505868673324585, "learning_rate": 3.939393939393939e-05, "loss": 0.3186, "mean_token_accuracy": 0.9771904498338699, "step": 110 }, { "epoch": 18.571428571428573, "grad_norm": 2.6549816131591797, "learning_rate": 3.929292929292929e-05, "loss": 0.3141, "mean_token_accuracy": 0.9759136885404587, "step": 111 }, { "epoch": 18.761904761904763, "grad_norm": 3.7054269313812256, "learning_rate": 3.91919191919192e-05, "loss": 0.3736, "mean_token_accuracy": 0.9732943773269653, "step": 112 }, { "epoch": 18.952380952380953, "grad_norm": 3.014618158340454, "learning_rate": 3.909090909090909e-05, "loss": 0.3676, "mean_token_accuracy": 0.9800769239664078, "step": 113 }, { "epoch": 19.0, "grad_norm": 4.232401371002197, "learning_rate": 3.898989898989899e-05, "loss": 0.1268, "mean_token_accuracy": 0.9577465057373047, "step": 114 }, { "epoch": 19.19047619047619, "grad_norm": 1.8361284732818604, "learning_rate": 3.888888888888889e-05, "loss": 0.2937, "mean_token_accuracy": 0.9818844795227051, "step": 115 }, { "epoch": 19.38095238095238, "grad_norm": 3.4175708293914795, "learning_rate": 3.878787878787879e-05, "loss": 0.2919, "mean_token_accuracy": 0.9831363707780838, "step": 116 }, { "epoch": 19.571428571428573, "grad_norm": 3.504340887069702, "learning_rate": 3.868686868686869e-05, "loss": 0.3739, "mean_token_accuracy": 0.9758433997631073, "step": 117 }, { "epoch": 19.761904761904763, "grad_norm": 3.542600154876709, "learning_rate": 3.858585858585859e-05, "loss": 0.3247, "mean_token_accuracy": 0.9753479957580566, "step": 118 }, { "epoch": 19.952380952380953, "grad_norm": 2.5886898040771484, "learning_rate": 3.848484848484848e-05, "loss": 0.3257, "mean_token_accuracy": 0.9774775803089142, "step": 119 }, { "epoch": 20.0, "grad_norm": 2.6909375190734863, "learning_rate": 3.838383838383838e-05, "loss": 0.0882, "mean_token_accuracy": 0.9682539701461792, "step": 120 }, { "epoch": 20.19047619047619, "grad_norm": 2.958399772644043, "learning_rate": 3.828282828282829e-05, "loss": 0.3205, "mean_token_accuracy": 0.9724349826574326, "step": 121 }, { "epoch": 20.38095238095238, "grad_norm": 2.2972922325134277, "learning_rate": 3.818181818181819e-05, "loss": 0.2829, "mean_token_accuracy": 0.9813934862613678, "step": 122 }, { "epoch": 20.571428571428573, "grad_norm": 2.2647204399108887, "learning_rate": 3.8080808080808087e-05, "loss": 0.3087, "mean_token_accuracy": 0.9758166968822479, "step": 123 }, { "epoch": 20.761904761904763, "grad_norm": 2.4949004650115967, "learning_rate": 3.797979797979798e-05, "loss": 0.3143, "mean_token_accuracy": 0.9777243584394455, "step": 124 }, { "epoch": 20.952380952380953, "grad_norm": 2.5387442111968994, "learning_rate": 3.787878787878788e-05, "loss": 0.326, "mean_token_accuracy": 0.9755249470472336, "step": 125 }, { "epoch": 21.0, "grad_norm": 2.745015859603882, "learning_rate": 3.777777777777778e-05, "loss": 0.0842, "mean_token_accuracy": 0.9714285731315613, "step": 126 }, { "epoch": 21.19047619047619, "grad_norm": 1.7736639976501465, "learning_rate": 3.767676767676768e-05, "loss": 0.2777, "mean_token_accuracy": 0.9804743677377701, "step": 127 }, { "epoch": 21.38095238095238, "grad_norm": 2.391968011856079, "learning_rate": 3.757575757575758e-05, "loss": 0.2969, "mean_token_accuracy": 0.9765493422746658, "step": 128 }, { "epoch": 21.571428571428573, "grad_norm": 1.9384799003601074, "learning_rate": 3.747474747474748e-05, "loss": 0.2764, "mean_token_accuracy": 0.978370875120163, "step": 129 }, { "epoch": 21.761904761904763, "grad_norm": 2.363274097442627, "learning_rate": 3.7373737373737376e-05, "loss": 0.3086, "mean_token_accuracy": 0.9715951085090637, "step": 130 }, { "epoch": 21.952380952380953, "grad_norm": 2.90826416015625, "learning_rate": 3.7272727272727276e-05, "loss": 0.3241, "mean_token_accuracy": 0.9738913327455521, "step": 131 }, { "epoch": 22.0, "grad_norm": 1.8676457405090332, "learning_rate": 3.7171717171717175e-05, "loss": 0.0867, "mean_token_accuracy": 0.9830508232116699, "step": 132 }, { "epoch": 22.19047619047619, "grad_norm": 2.1423661708831787, "learning_rate": 3.7070707070707075e-05, "loss": 0.2691, "mean_token_accuracy": 0.9791481345891953, "step": 133 }, { "epoch": 22.38095238095238, "grad_norm": 2.0479485988616943, "learning_rate": 3.6969696969696974e-05, "loss": 0.2898, "mean_token_accuracy": 0.9813213050365448, "step": 134 }, { "epoch": 22.571428571428573, "grad_norm": 2.566549777984619, "learning_rate": 3.686868686868687e-05, "loss": 0.3174, "mean_token_accuracy": 0.975700318813324, "step": 135 }, { "epoch": 22.761904761904763, "grad_norm": 2.541551351547241, "learning_rate": 3.6767676767676766e-05, "loss": 0.3205, "mean_token_accuracy": 0.978480726480484, "step": 136 }, { "epoch": 22.952380952380953, "grad_norm": 2.037262201309204, "learning_rate": 3.6666666666666666e-05, "loss": 0.2741, "mean_token_accuracy": 0.9802869260311127, "step": 137 }, { "epoch": 23.0, "grad_norm": 2.753689765930176, "learning_rate": 3.656565656565657e-05, "loss": 0.0844, "mean_token_accuracy": 0.9841269850730896, "step": 138 }, { "epoch": 23.19047619047619, "grad_norm": 1.9929062128067017, "learning_rate": 3.6464646464646465e-05, "loss": 0.2798, "mean_token_accuracy": 0.9800110459327698, "step": 139 }, { "epoch": 23.38095238095238, "grad_norm": 2.7327589988708496, "learning_rate": 3.6363636363636364e-05, "loss": 0.2671, "mean_token_accuracy": 0.9807360470294952, "step": 140 }, { "epoch": 23.571428571428573, "grad_norm": 1.7482175827026367, "learning_rate": 3.6262626262626264e-05, "loss": 0.2965, "mean_token_accuracy": 0.9796760976314545, "step": 141 }, { "epoch": 23.761904761904763, "grad_norm": 2.599804639816284, "learning_rate": 3.616161616161616e-05, "loss": 0.3154, "mean_token_accuracy": 0.977075606584549, "step": 142 }, { "epoch": 23.952380952380953, "grad_norm": 2.482060194015503, "learning_rate": 3.606060606060606e-05, "loss": 0.3009, "mean_token_accuracy": 0.9737012088298798, "step": 143 }, { "epoch": 24.0, "grad_norm": 3.389758825302124, "learning_rate": 3.595959595959596e-05, "loss": 0.1225, "mean_token_accuracy": 0.9636363387107849, "step": 144 }, { "epoch": 24.19047619047619, "grad_norm": 1.8538786172866821, "learning_rate": 3.5858585858585855e-05, "loss": 0.2625, "mean_token_accuracy": 0.9796436280012131, "step": 145 }, { "epoch": 24.38095238095238, "grad_norm": 1.6289573907852173, "learning_rate": 3.575757575757576e-05, "loss": 0.2616, "mean_token_accuracy": 0.9804391115903854, "step": 146 }, { "epoch": 24.571428571428573, "grad_norm": 2.4140396118164062, "learning_rate": 3.565656565656566e-05, "loss": 0.3128, "mean_token_accuracy": 0.979373887181282, "step": 147 }, { "epoch": 24.761904761904763, "grad_norm": 2.182692766189575, "learning_rate": 3.555555555555556e-05, "loss": 0.2983, "mean_token_accuracy": 0.9793859571218491, "step": 148 }, { "epoch": 24.952380952380953, "grad_norm": 2.800553560256958, "learning_rate": 3.545454545454546e-05, "loss": 0.3566, "mean_token_accuracy": 0.9733032137155533, "step": 149 }, { "epoch": 25.0, "grad_norm": 1.8961296081542969, "learning_rate": 3.535353535353535e-05, "loss": 0.0623, "mean_token_accuracy": 0.9797979593276978, "step": 150 }, { "epoch": 25.19047619047619, "grad_norm": 2.6031830310821533, "learning_rate": 3.525252525252525e-05, "loss": 0.307, "mean_token_accuracy": 0.9759431630373001, "step": 151 }, { "epoch": 25.38095238095238, "grad_norm": 1.7213940620422363, "learning_rate": 3.515151515151515e-05, "loss": 0.2605, "mean_token_accuracy": 0.9829924404621124, "step": 152 }, { "epoch": 25.571428571428573, "grad_norm": 2.169405221939087, "learning_rate": 3.505050505050505e-05, "loss": 0.2833, "mean_token_accuracy": 0.976715162396431, "step": 153 }, { "epoch": 25.761904761904763, "grad_norm": 2.126295566558838, "learning_rate": 3.494949494949495e-05, "loss": 0.2836, "mean_token_accuracy": 0.9775257259607315, "step": 154 }, { "epoch": 25.952380952380953, "grad_norm": 2.112752914428711, "learning_rate": 3.484848484848485e-05, "loss": 0.3001, "mean_token_accuracy": 0.9795974045991898, "step": 155 }, { "epoch": 26.0, "grad_norm": 2.9405832290649414, "learning_rate": 3.474747474747475e-05, "loss": 0.1069, "mean_token_accuracy": 0.9824561476707458, "step": 156 }, { "epoch": 26.19047619047619, "grad_norm": 1.8124560117721558, "learning_rate": 3.464646464646465e-05, "loss": 0.2694, "mean_token_accuracy": 0.982256755232811, "step": 157 }, { "epoch": 26.38095238095238, "grad_norm": 1.8597822189331055, "learning_rate": 3.454545454545455e-05, "loss": 0.2558, "mean_token_accuracy": 0.98062863945961, "step": 158 }, { "epoch": 26.571428571428573, "grad_norm": 1.6446207761764526, "learning_rate": 3.444444444444445e-05, "loss": 0.2587, "mean_token_accuracy": 0.9779441952705383, "step": 159 }, { "epoch": 26.761904761904763, "grad_norm": 2.2227869033813477, "learning_rate": 3.434343434343435e-05, "loss": 0.3241, "mean_token_accuracy": 0.9747696965932846, "step": 160 }, { "epoch": 26.952380952380953, "grad_norm": 1.6738312244415283, "learning_rate": 3.424242424242424e-05, "loss": 0.2779, "mean_token_accuracy": 0.9778714776039124, "step": 161 }, { "epoch": 27.0, "grad_norm": 1.4880234003067017, "learning_rate": 3.414141414141414e-05, "loss": 0.0801, "mean_token_accuracy": 0.9838709831237793, "step": 162 }, { "epoch": 27.19047619047619, "grad_norm": 1.5148252248764038, "learning_rate": 3.4040404040404045e-05, "loss": 0.2581, "mean_token_accuracy": 0.980286031961441, "step": 163 }, { "epoch": 27.38095238095238, "grad_norm": 1.833160400390625, "learning_rate": 3.3939393939393945e-05, "loss": 0.2724, "mean_token_accuracy": 0.9760157763957977, "step": 164 }, { "epoch": 27.571428571428573, "grad_norm": 2.1366348266601562, "learning_rate": 3.3838383838383844e-05, "loss": 0.2916, "mean_token_accuracy": 0.9787898063659668, "step": 165 }, { "epoch": 27.761904761904763, "grad_norm": 2.5082993507385254, "learning_rate": 3.373737373737374e-05, "loss": 0.2929, "mean_token_accuracy": 0.9774486720561981, "step": 166 }, { "epoch": 27.952380952380953, "grad_norm": 2.1355273723602295, "learning_rate": 3.3636363636363636e-05, "loss": 0.2856, "mean_token_accuracy": 0.9789445698261261, "step": 167 }, { "epoch": 28.0, "grad_norm": 1.970436930656433, "learning_rate": 3.3535353535353536e-05, "loss": 0.0806, "mean_token_accuracy": 0.9692307710647583, "step": 168 }, { "epoch": 28.19047619047619, "grad_norm": 2.1435768604278564, "learning_rate": 3.3434343434343435e-05, "loss": 0.2658, "mean_token_accuracy": 0.9759610444307327, "step": 169 }, { "epoch": 28.38095238095238, "grad_norm": 1.6564626693725586, "learning_rate": 3.3333333333333335e-05, "loss": 0.2548, "mean_token_accuracy": 0.9793960750102997, "step": 170 }, { "epoch": 28.571428571428573, "grad_norm": 1.7106664180755615, "learning_rate": 3.3232323232323234e-05, "loss": 0.255, "mean_token_accuracy": 0.9787760227918625, "step": 171 }, { "epoch": 28.761904761904763, "grad_norm": 2.1820991039276123, "learning_rate": 3.3131313131313134e-05, "loss": 0.3227, "mean_token_accuracy": 0.973702073097229, "step": 172 }, { "epoch": 28.952380952380953, "grad_norm": 1.7227038145065308, "learning_rate": 3.303030303030303e-05, "loss": 0.2653, "mean_token_accuracy": 0.9788563847541809, "step": 173 }, { "epoch": 29.0, "grad_norm": 1.6985877752304077, "learning_rate": 3.292929292929293e-05, "loss": 0.0653, "mean_token_accuracy": 0.9756097793579102, "step": 174 }, { "epoch": 29.19047619047619, "grad_norm": 1.70681631565094, "learning_rate": 3.282828282828283e-05, "loss": 0.2621, "mean_token_accuracy": 0.9808604121208191, "step": 175 }, { "epoch": 29.38095238095238, "grad_norm": 1.5982296466827393, "learning_rate": 3.272727272727273e-05, "loss": 0.2444, "mean_token_accuracy": 0.9789219200611115, "step": 176 }, { "epoch": 29.571428571428573, "grad_norm": 1.4115501642227173, "learning_rate": 3.2626262626262624e-05, "loss": 0.2386, "mean_token_accuracy": 0.9839699417352676, "step": 177 }, { "epoch": 29.761904761904763, "grad_norm": 2.2143611907958984, "learning_rate": 3.2525252525252524e-05, "loss": 0.3214, "mean_token_accuracy": 0.9736231416463852, "step": 178 }, { "epoch": 29.952380952380953, "grad_norm": 2.329328775405884, "learning_rate": 3.2424242424242423e-05, "loss": 0.2899, "mean_token_accuracy": 0.974274680018425, "step": 179 }, { "epoch": 30.0, "grad_norm": 1.8894615173339844, "learning_rate": 3.232323232323233e-05, "loss": 0.0873, "mean_token_accuracy": 0.970588207244873, "step": 180 }, { "epoch": 30.19047619047619, "grad_norm": 1.8685792684555054, "learning_rate": 3.222222222222223e-05, "loss": 0.2713, "mean_token_accuracy": 0.9793071448802948, "step": 181 }, { "epoch": 30.38095238095238, "grad_norm": 1.6303725242614746, "learning_rate": 3.212121212121212e-05, "loss": 0.2602, "mean_token_accuracy": 0.978649765253067, "step": 182 }, { "epoch": 30.571428571428573, "grad_norm": 1.5414835214614868, "learning_rate": 3.202020202020202e-05, "loss": 0.2507, "mean_token_accuracy": 0.9816054552793503, "step": 183 }, { "epoch": 30.761904761904763, "grad_norm": 1.9461543560028076, "learning_rate": 3.191919191919192e-05, "loss": 0.2622, "mean_token_accuracy": 0.9799721091985703, "step": 184 }, { "epoch": 30.952380952380953, "grad_norm": 2.4515039920806885, "learning_rate": 3.181818181818182e-05, "loss": 0.316, "mean_token_accuracy": 0.9738900065422058, "step": 185 }, { "epoch": 31.0, "grad_norm": 2.3152859210968018, "learning_rate": 3.171717171717172e-05, "loss": 0.0924, "mean_token_accuracy": 0.9666666388511658, "step": 186 }, { "epoch": 31.19047619047619, "grad_norm": 1.5827226638793945, "learning_rate": 3.161616161616161e-05, "loss": 0.2548, "mean_token_accuracy": 0.9807614088058472, "step": 187 }, { "epoch": 31.38095238095238, "grad_norm": 1.5467098951339722, "learning_rate": 3.151515151515151e-05, "loss": 0.2567, "mean_token_accuracy": 0.9772002995014191, "step": 188 }, { "epoch": 31.571428571428573, "grad_norm": 1.5654078722000122, "learning_rate": 3.141414141414142e-05, "loss": 0.2523, "mean_token_accuracy": 0.9784552752971649, "step": 189 }, { "epoch": 31.761904761904763, "grad_norm": 1.6791102886199951, "learning_rate": 3.131313131313132e-05, "loss": 0.2749, "mean_token_accuracy": 0.9773024320602417, "step": 190 }, { "epoch": 31.952380952380953, "grad_norm": 1.864105224609375, "learning_rate": 3.121212121212122e-05, "loss": 0.2938, "mean_token_accuracy": 0.9765942692756653, "step": 191 }, { "epoch": 32.0, "grad_norm": 1.214571475982666, "learning_rate": 3.111111111111111e-05, "loss": 0.0665, "mean_token_accuracy": 0.987500011920929, "step": 192 }, { "epoch": 32.19047619047619, "grad_norm": 1.4030119180679321, "learning_rate": 3.101010101010101e-05, "loss": 0.2415, "mean_token_accuracy": 0.9817796945571899, "step": 193 }, { "epoch": 32.38095238095238, "grad_norm": 1.6708261966705322, "learning_rate": 3.090909090909091e-05, "loss": 0.2582, "mean_token_accuracy": 0.9801040887832642, "step": 194 }, { "epoch": 32.57142857142857, "grad_norm": 1.4296513795852661, "learning_rate": 3.080808080808081e-05, "loss": 0.2493, "mean_token_accuracy": 0.9811757057905197, "step": 195 }, { "epoch": 32.76190476190476, "grad_norm": 1.7713197469711304, "learning_rate": 3.070707070707071e-05, "loss": 0.2823, "mean_token_accuracy": 0.9782667905092239, "step": 196 }, { "epoch": 32.95238095238095, "grad_norm": 2.032137632369995, "learning_rate": 3.060606060606061e-05, "loss": 0.294, "mean_token_accuracy": 0.9734672009944916, "step": 197 }, { "epoch": 33.0, "grad_norm": 2.334019660949707, "learning_rate": 3.050505050505051e-05, "loss": 0.0861, "mean_token_accuracy": 0.9726027250289917, "step": 198 }, { "epoch": 33.19047619047619, "grad_norm": 1.4779608249664307, "learning_rate": 3.0404040404040406e-05, "loss": 0.2537, "mean_token_accuracy": 0.981317549943924, "step": 199 }, { "epoch": 33.38095238095238, "grad_norm": 1.435577392578125, "learning_rate": 3.0303030303030306e-05, "loss": 0.2544, "mean_token_accuracy": 0.9813797920942307, "step": 200 }, { "epoch": 33.57142857142857, "grad_norm": 1.8126311302185059, "learning_rate": 3.0202020202020205e-05, "loss": 0.2705, "mean_token_accuracy": 0.9765264093875885, "step": 201 }, { "epoch": 33.76190476190476, "grad_norm": 1.5598095655441284, "learning_rate": 3.01010101010101e-05, "loss": 0.2723, "mean_token_accuracy": 0.978124126791954, "step": 202 }, { "epoch": 33.95238095238095, "grad_norm": 1.8001117706298828, "learning_rate": 3e-05, "loss": 0.271, "mean_token_accuracy": 0.9785387814044952, "step": 203 }, { "epoch": 34.0, "grad_norm": 1.7313034534454346, "learning_rate": 2.98989898989899e-05, "loss": 0.0652, "mean_token_accuracy": 0.9746835231781006, "step": 204 }, { "epoch": 34.19047619047619, "grad_norm": 1.389072060585022, "learning_rate": 2.9797979797979796e-05, "loss": 0.242, "mean_token_accuracy": 0.9788109809160233, "step": 205 }, { "epoch": 34.38095238095238, "grad_norm": 1.434044599533081, "learning_rate": 2.96969696969697e-05, "loss": 0.2426, "mean_token_accuracy": 0.979528471827507, "step": 206 }, { "epoch": 34.57142857142857, "grad_norm": 1.9448174238204956, "learning_rate": 2.95959595959596e-05, "loss": 0.2695, "mean_token_accuracy": 0.9793160408735275, "step": 207 }, { "epoch": 34.76190476190476, "grad_norm": 1.85161554813385, "learning_rate": 2.9494949494949498e-05, "loss": 0.293, "mean_token_accuracy": 0.9727693498134613, "step": 208 }, { "epoch": 34.95238095238095, "grad_norm": 1.7662495374679565, "learning_rate": 2.9393939393939394e-05, "loss": 0.2817, "mean_token_accuracy": 0.9758803397417068, "step": 209 }, { "epoch": 35.0, "grad_norm": 1.3624759912490845, "learning_rate": 2.9292929292929294e-05, "loss": 0.0738, "mean_token_accuracy": 0.9848484992980957, "step": 210 }, { "epoch": 35.19047619047619, "grad_norm": 1.622554063796997, "learning_rate": 2.9191919191919193e-05, "loss": 0.2493, "mean_token_accuracy": 0.9789364635944366, "step": 211 }, { "epoch": 35.38095238095238, "grad_norm": 1.7415611743927002, "learning_rate": 2.909090909090909e-05, "loss": 0.2849, "mean_token_accuracy": 0.9779055863618851, "step": 212 }, { "epoch": 35.57142857142857, "grad_norm": 1.585845947265625, "learning_rate": 2.898989898989899e-05, "loss": 0.2497, "mean_token_accuracy": 0.9807179868221283, "step": 213 }, { "epoch": 35.76190476190476, "grad_norm": 1.5177557468414307, "learning_rate": 2.8888888888888888e-05, "loss": 0.264, "mean_token_accuracy": 0.9775202721357346, "step": 214 }, { "epoch": 35.95238095238095, "grad_norm": 1.8757683038711548, "learning_rate": 2.878787878787879e-05, "loss": 0.2589, "mean_token_accuracy": 0.9773915261030197, "step": 215 }, { "epoch": 36.0, "grad_norm": 2.2826578617095947, "learning_rate": 2.868686868686869e-05, "loss": 0.0933, "mean_token_accuracy": 0.9491525292396545, "step": 216 }, { "epoch": 36.19047619047619, "grad_norm": 1.3637081384658813, "learning_rate": 2.8585858585858587e-05, "loss": 0.245, "mean_token_accuracy": 0.9781962931156158, "step": 217 }, { "epoch": 36.38095238095238, "grad_norm": 1.4664133787155151, "learning_rate": 2.8484848484848486e-05, "loss": 0.2521, "mean_token_accuracy": 0.9817428290843964, "step": 218 }, { "epoch": 36.57142857142857, "grad_norm": 1.5265666246414185, "learning_rate": 2.8383838383838386e-05, "loss": 0.2615, "mean_token_accuracy": 0.9806021302938461, "step": 219 }, { "epoch": 36.76190476190476, "grad_norm": 1.4322954416275024, "learning_rate": 2.8282828282828282e-05, "loss": 0.2599, "mean_token_accuracy": 0.9800188541412354, "step": 220 }, { "epoch": 36.95238095238095, "grad_norm": 1.76764976978302, "learning_rate": 2.818181818181818e-05, "loss": 0.292, "mean_token_accuracy": 0.9746560305356979, "step": 221 }, { "epoch": 37.0, "grad_norm": 2.1554458141326904, "learning_rate": 2.808080808080808e-05, "loss": 0.0865, "mean_token_accuracy": 0.9682539701461792, "step": 222 }, { "epoch": 37.19047619047619, "grad_norm": 1.4079774618148804, "learning_rate": 2.7979797979797984e-05, "loss": 0.2359, "mean_token_accuracy": 0.9809356033802032, "step": 223 }, { "epoch": 37.38095238095238, "grad_norm": 1.8873682022094727, "learning_rate": 2.7878787878787883e-05, "loss": 0.2731, "mean_token_accuracy": 0.9777008444070816, "step": 224 }, { "epoch": 37.57142857142857, "grad_norm": 1.7195765972137451, "learning_rate": 2.777777777777778e-05, "loss": 0.2557, "mean_token_accuracy": 0.980317622423172, "step": 225 }, { "epoch": 37.76190476190476, "grad_norm": 1.5935289859771729, "learning_rate": 2.767676767676768e-05, "loss": 0.2663, "mean_token_accuracy": 0.9756544232368469, "step": 226 }, { "epoch": 37.95238095238095, "grad_norm": 1.626733660697937, "learning_rate": 2.7575757575757578e-05, "loss": 0.2668, "mean_token_accuracy": 0.9801195561885834, "step": 227 }, { "epoch": 38.0, "grad_norm": 2.378291368484497, "learning_rate": 2.7474747474747474e-05, "loss": 0.0872, "mean_token_accuracy": 0.9718309640884399, "step": 228 }, { "epoch": 38.19047619047619, "grad_norm": 1.4580754041671753, "learning_rate": 2.7373737373737374e-05, "loss": 0.243, "mean_token_accuracy": 0.9807321429252625, "step": 229 }, { "epoch": 38.38095238095238, "grad_norm": 1.3259878158569336, "learning_rate": 2.7272727272727273e-05, "loss": 0.2479, "mean_token_accuracy": 0.9801591485738754, "step": 230 }, { "epoch": 38.57142857142857, "grad_norm": 1.43174147605896, "learning_rate": 2.717171717171717e-05, "loss": 0.2477, "mean_token_accuracy": 0.9830300509929657, "step": 231 }, { "epoch": 38.76190476190476, "grad_norm": 1.6294718980789185, "learning_rate": 2.7070707070707075e-05, "loss": 0.2666, "mean_token_accuracy": 0.9755284339189529, "step": 232 }, { "epoch": 38.95238095238095, "grad_norm": 2.30196213722229, "learning_rate": 2.696969696969697e-05, "loss": 0.2929, "mean_token_accuracy": 0.9752500951290131, "step": 233 }, { "epoch": 39.0, "grad_norm": 1.96921968460083, "learning_rate": 2.686868686868687e-05, "loss": 0.0762, "mean_token_accuracy": 0.9722222089767456, "step": 234 }, { "epoch": 39.19047619047619, "grad_norm": 1.3506882190704346, "learning_rate": 2.676767676767677e-05, "loss": 0.2359, "mean_token_accuracy": 0.9817389249801636, "step": 235 }, { "epoch": 39.38095238095238, "grad_norm": 1.4548856019973755, "learning_rate": 2.6666666666666667e-05, "loss": 0.2456, "mean_token_accuracy": 0.9811435043811798, "step": 236 }, { "epoch": 39.57142857142857, "grad_norm": 1.5215767621994019, "learning_rate": 2.6565656565656566e-05, "loss": 0.2575, "mean_token_accuracy": 0.9797980934381485, "step": 237 }, { "epoch": 39.76190476190476, "grad_norm": 1.8254742622375488, "learning_rate": 2.6464646464646466e-05, "loss": 0.2889, "mean_token_accuracy": 0.9770003706216812, "step": 238 }, { "epoch": 39.95238095238095, "grad_norm": 1.818259596824646, "learning_rate": 2.636363636363636e-05, "loss": 0.2897, "mean_token_accuracy": 0.976064071059227, "step": 239 }, { "epoch": 40.0, "grad_norm": 1.3236188888549805, "learning_rate": 2.6262626262626268e-05, "loss": 0.0774, "mean_token_accuracy": 0.9838709831237793, "step": 240 }, { "epoch": 40.19047619047619, "grad_norm": 1.5586050748825073, "learning_rate": 2.6161616161616164e-05, "loss": 0.2731, "mean_token_accuracy": 0.9815535992383957, "step": 241 }, { "epoch": 40.38095238095238, "grad_norm": 1.5174766778945923, "learning_rate": 2.6060606060606063e-05, "loss": 0.2473, "mean_token_accuracy": 0.9786833673715591, "step": 242 }, { "epoch": 40.57142857142857, "grad_norm": 1.3981167078018188, "learning_rate": 2.5959595959595963e-05, "loss": 0.2531, "mean_token_accuracy": 0.9792415052652359, "step": 243 }, { "epoch": 40.76190476190476, "grad_norm": 1.5628103017807007, "learning_rate": 2.585858585858586e-05, "loss": 0.257, "mean_token_accuracy": 0.9798375219106674, "step": 244 }, { "epoch": 40.95238095238095, "grad_norm": 1.5515220165252686, "learning_rate": 2.575757575757576e-05, "loss": 0.2669, "mean_token_accuracy": 0.9787022620439529, "step": 245 }, { "epoch": 41.0, "grad_norm": 1.8415720462799072, "learning_rate": 2.5656565656565658e-05, "loss": 0.0799, "mean_token_accuracy": 0.9682539701461792, "step": 246 }, { "epoch": 41.19047619047619, "grad_norm": 1.423293113708496, "learning_rate": 2.5555555555555554e-05, "loss": 0.2393, "mean_token_accuracy": 0.9812082797288895, "step": 247 }, { "epoch": 41.38095238095238, "grad_norm": 1.394112467765808, "learning_rate": 2.5454545454545454e-05, "loss": 0.2521, "mean_token_accuracy": 0.9827133864164352, "step": 248 }, { "epoch": 41.57142857142857, "grad_norm": 1.6987677812576294, "learning_rate": 2.5353535353535356e-05, "loss": 0.2671, "mean_token_accuracy": 0.9742349684238434, "step": 249 }, { "epoch": 41.76190476190476, "grad_norm": 1.6028631925582886, "learning_rate": 2.5252525252525256e-05, "loss": 0.2602, "mean_token_accuracy": 0.9791279435157776, "step": 250 }, { "epoch": 41.95238095238095, "grad_norm": 1.8165968656539917, "learning_rate": 2.5151515151515155e-05, "loss": 0.2826, "mean_token_accuracy": 0.9778096079826355, "step": 251 }, { "epoch": 42.0, "grad_norm": 0.9838045835494995, "learning_rate": 2.505050505050505e-05, "loss": 0.0517, "mean_token_accuracy": 0.9902912378311157, "step": 252 }, { "epoch": 42.19047619047619, "grad_norm": 1.3776968717575073, "learning_rate": 2.494949494949495e-05, "loss": 0.2612, "mean_token_accuracy": 0.9751808941364288, "step": 253 }, { "epoch": 42.38095238095238, "grad_norm": 1.5808742046356201, "learning_rate": 2.4848484848484847e-05, "loss": 0.2466, "mean_token_accuracy": 0.9846099317073822, "step": 254 }, { "epoch": 42.57142857142857, "grad_norm": 1.2304980754852295, "learning_rate": 2.474747474747475e-05, "loss": 0.2344, "mean_token_accuracy": 0.9795664101839066, "step": 255 }, { "epoch": 42.76190476190476, "grad_norm": 1.6060268878936768, "learning_rate": 2.464646464646465e-05, "loss": 0.2817, "mean_token_accuracy": 0.9776766449213028, "step": 256 }, { "epoch": 42.95238095238095, "grad_norm": 1.6796001195907593, "learning_rate": 2.4545454545454545e-05, "loss": 0.2489, "mean_token_accuracy": 0.9769842028617859, "step": 257 }, { "epoch": 43.0, "grad_norm": 1.4542969465255737, "learning_rate": 2.4444444444444445e-05, "loss": 0.0595, "mean_token_accuracy": 0.9753086566925049, "step": 258 }, { "epoch": 43.19047619047619, "grad_norm": 1.4857451915740967, "learning_rate": 2.4343434343434344e-05, "loss": 0.2527, "mean_token_accuracy": 0.97712042927742, "step": 259 }, { "epoch": 43.38095238095238, "grad_norm": 1.306619644165039, "learning_rate": 2.4242424242424244e-05, "loss": 0.2363, "mean_token_accuracy": 0.980791300535202, "step": 260 }, { "epoch": 43.57142857142857, "grad_norm": 1.6846957206726074, "learning_rate": 2.4141414141414143e-05, "loss": 0.259, "mean_token_accuracy": 0.9791981130838394, "step": 261 }, { "epoch": 43.76190476190476, "grad_norm": 1.4038276672363281, "learning_rate": 2.404040404040404e-05, "loss": 0.251, "mean_token_accuracy": 0.9791757315397263, "step": 262 }, { "epoch": 43.95238095238095, "grad_norm": 1.5158367156982422, "learning_rate": 2.393939393939394e-05, "loss": 0.2702, "mean_token_accuracy": 0.9788329601287842, "step": 263 }, { "epoch": 44.0, "grad_norm": 1.7850970029830933, "learning_rate": 2.3838383838383842e-05, "loss": 0.0728, "mean_token_accuracy": 0.9759036302566528, "step": 264 }, { "epoch": 44.19047619047619, "grad_norm": 1.1887112855911255, "learning_rate": 2.3737373737373738e-05, "loss": 0.2319, "mean_token_accuracy": 0.9812621474266052, "step": 265 }, { "epoch": 44.38095238095238, "grad_norm": 1.4217466115951538, "learning_rate": 2.3636363636363637e-05, "loss": 0.238, "mean_token_accuracy": 0.9808095693588257, "step": 266 }, { "epoch": 44.57142857142857, "grad_norm": 1.7025716304779053, "learning_rate": 2.3535353535353537e-05, "loss": 0.2537, "mean_token_accuracy": 0.9779138118028641, "step": 267 }, { "epoch": 44.76190476190476, "grad_norm": 1.7018096446990967, "learning_rate": 2.3434343434343436e-05, "loss": 0.274, "mean_token_accuracy": 0.9743378162384033, "step": 268 }, { "epoch": 44.95238095238095, "grad_norm": 1.7380796670913696, "learning_rate": 2.3333333333333336e-05, "loss": 0.2768, "mean_token_accuracy": 0.9779854416847229, "step": 269 }, { "epoch": 45.0, "grad_norm": 1.0162783861160278, "learning_rate": 2.3232323232323232e-05, "loss": 0.051, "mean_token_accuracy": 0.9898989796638489, "step": 270 }, { "epoch": 45.19047619047619, "grad_norm": 1.322588562965393, "learning_rate": 2.313131313131313e-05, "loss": 0.2384, "mean_token_accuracy": 0.9804540276527405, "step": 271 }, { "epoch": 45.38095238095238, "grad_norm": 1.294411301612854, "learning_rate": 2.3030303030303034e-05, "loss": 0.2342, "mean_token_accuracy": 0.9810962080955505, "step": 272 }, { "epoch": 45.57142857142857, "grad_norm": 1.4505170583724976, "learning_rate": 2.292929292929293e-05, "loss": 0.2572, "mean_token_accuracy": 0.9756149500608444, "step": 273 }, { "epoch": 45.76190476190476, "grad_norm": 1.6599575281143188, "learning_rate": 2.282828282828283e-05, "loss": 0.2678, "mean_token_accuracy": 0.9741277694702148, "step": 274 }, { "epoch": 45.95238095238095, "grad_norm": 1.4780550003051758, "learning_rate": 2.272727272727273e-05, "loss": 0.2647, "mean_token_accuracy": 0.9768411070108414, "step": 275 }, { "epoch": 46.0, "grad_norm": 1.1366266012191772, "learning_rate": 2.262626262626263e-05, "loss": 0.0557, "mean_token_accuracy": 0.9878048896789551, "step": 276 }, { "epoch": 46.19047619047619, "grad_norm": 1.3346896171569824, "learning_rate": 2.2525252525252528e-05, "loss": 0.2325, "mean_token_accuracy": 0.979757234454155, "step": 277 }, { "epoch": 46.38095238095238, "grad_norm": 1.4182461500167847, "learning_rate": 2.2424242424242424e-05, "loss": 0.2331, "mean_token_accuracy": 0.9792613536119461, "step": 278 }, { "epoch": 46.57142857142857, "grad_norm": 1.5474402904510498, "learning_rate": 2.2323232323232324e-05, "loss": 0.2641, "mean_token_accuracy": 0.9776208251714706, "step": 279 }, { "epoch": 46.76190476190476, "grad_norm": 1.8437175750732422, "learning_rate": 2.2222222222222223e-05, "loss": 0.2841, "mean_token_accuracy": 0.9759227335453033, "step": 280 }, { "epoch": 46.95238095238095, "grad_norm": 1.8677905797958374, "learning_rate": 2.2121212121212123e-05, "loss": 0.2611, "mean_token_accuracy": 0.9794552326202393, "step": 281 }, { "epoch": 47.0, "grad_norm": 1.7438082695007324, "learning_rate": 2.2020202020202022e-05, "loss": 0.0768, "mean_token_accuracy": 0.9701492786407471, "step": 282 }, { "epoch": 47.19047619047619, "grad_norm": 1.38357675075531, "learning_rate": 2.191919191919192e-05, "loss": 0.2514, "mean_token_accuracy": 0.9804678857326508, "step": 283 }, { "epoch": 47.38095238095238, "grad_norm": 1.3532003164291382, "learning_rate": 2.1818181818181818e-05, "loss": 0.233, "mean_token_accuracy": 0.9824511855840683, "step": 284 }, { "epoch": 47.57142857142857, "grad_norm": 1.6904886960983276, "learning_rate": 2.171717171717172e-05, "loss": 0.249, "mean_token_accuracy": 0.9747414886951447, "step": 285 }, { "epoch": 47.76190476190476, "grad_norm": 1.4693493843078613, "learning_rate": 2.1616161616161617e-05, "loss": 0.2637, "mean_token_accuracy": 0.9777188897132874, "step": 286 }, { "epoch": 47.95238095238095, "grad_norm": 1.4712016582489014, "learning_rate": 2.1515151515151516e-05, "loss": 0.2641, "mean_token_accuracy": 0.9823849946260452, "step": 287 }, { "epoch": 48.0, "grad_norm": 2.5622308254241943, "learning_rate": 2.1414141414141416e-05, "loss": 0.0963, "mean_token_accuracy": 0.9473684430122375, "step": 288 }, { "epoch": 48.19047619047619, "grad_norm": 1.4440287351608276, "learning_rate": 2.1313131313131315e-05, "loss": 0.2439, "mean_token_accuracy": 0.9802645593881607, "step": 289 }, { "epoch": 48.38095238095238, "grad_norm": 1.373253583908081, "learning_rate": 2.1212121212121215e-05, "loss": 0.2437, "mean_token_accuracy": 0.9763128757476807, "step": 290 }, { "epoch": 48.57142857142857, "grad_norm": 1.6184741258621216, "learning_rate": 2.111111111111111e-05, "loss": 0.2654, "mean_token_accuracy": 0.9782317876815796, "step": 291 }, { "epoch": 48.76190476190476, "grad_norm": 1.3039287328720093, "learning_rate": 2.101010101010101e-05, "loss": 0.2395, "mean_token_accuracy": 0.9821481555700302, "step": 292 }, { "epoch": 48.95238095238095, "grad_norm": 1.394302487373352, "learning_rate": 2.090909090909091e-05, "loss": 0.2645, "mean_token_accuracy": 0.9776430726051331, "step": 293 }, { "epoch": 49.0, "grad_norm": 1.0925865173339844, "learning_rate": 2.080808080808081e-05, "loss": 0.0562, "mean_token_accuracy": 0.9878048896789551, "step": 294 }, { "epoch": 49.19047619047619, "grad_norm": 1.3069161176681519, "learning_rate": 2.070707070707071e-05, "loss": 0.2455, "mean_token_accuracy": 0.97951839864254, "step": 295 }, { "epoch": 49.38095238095238, "grad_norm": 1.3214561939239502, "learning_rate": 2.0606060606060608e-05, "loss": 0.2381, "mean_token_accuracy": 0.9809810966253281, "step": 296 }, { "epoch": 49.57142857142857, "grad_norm": 1.3639582395553589, "learning_rate": 2.0505050505050504e-05, "loss": 0.2535, "mean_token_accuracy": 0.9802620708942413, "step": 297 }, { "epoch": 49.76190476190476, "grad_norm": 1.4789013862609863, "learning_rate": 2.0404040404040407e-05, "loss": 0.2622, "mean_token_accuracy": 0.9760318547487259, "step": 298 }, { "epoch": 49.95238095238095, "grad_norm": 1.5978738069534302, "learning_rate": 2.0303030303030303e-05, "loss": 0.2756, "mean_token_accuracy": 0.9767571240663528, "step": 299 }, { "epoch": 50.0, "grad_norm": 0.994212806224823, "learning_rate": 2.0202020202020203e-05, "loss": 0.0477, "mean_token_accuracy": 0.9837398529052734, "step": 300 }, { "epoch": 50.19047619047619, "grad_norm": 1.257419228553772, "learning_rate": 2.0101010101010102e-05, "loss": 0.2437, "mean_token_accuracy": 0.9815521091222763, "step": 301 }, { "epoch": 50.38095238095238, "grad_norm": 1.2623318433761597, "learning_rate": 2e-05, "loss": 0.2467, "mean_token_accuracy": 0.9801167845726013, "step": 302 }, { "epoch": 50.57142857142857, "grad_norm": 1.3023744821548462, "learning_rate": 1.98989898989899e-05, "loss": 0.2498, "mean_token_accuracy": 0.9767654687166214, "step": 303 }, { "epoch": 50.76190476190476, "grad_norm": 1.4939366579055786, "learning_rate": 1.9797979797979797e-05, "loss": 0.276, "mean_token_accuracy": 0.9766338616609573, "step": 304 }, { "epoch": 50.95238095238095, "grad_norm": 1.2986633777618408, "learning_rate": 1.9696969696969697e-05, "loss": 0.2431, "mean_token_accuracy": 0.9812084436416626, "step": 305 }, { "epoch": 51.0, "grad_norm": 2.027116298675537, "learning_rate": 1.95959595959596e-05, "loss": 0.0666, "mean_token_accuracy": 0.9629629850387573, "step": 306 }, { "epoch": 51.19047619047619, "grad_norm": 1.4073251485824585, "learning_rate": 1.9494949494949496e-05, "loss": 0.2457, "mean_token_accuracy": 0.9779722541570663, "step": 307 }, { "epoch": 51.38095238095238, "grad_norm": 1.383111834526062, "learning_rate": 1.9393939393939395e-05, "loss": 0.2377, "mean_token_accuracy": 0.9842050075531006, "step": 308 }, { "epoch": 51.57142857142857, "grad_norm": 1.4835509061813354, "learning_rate": 1.9292929292929295e-05, "loss": 0.2503, "mean_token_accuracy": 0.9771096408367157, "step": 309 }, { "epoch": 51.76190476190476, "grad_norm": 1.756462812423706, "learning_rate": 1.919191919191919e-05, "loss": 0.2544, "mean_token_accuracy": 0.9787980318069458, "step": 310 }, { "epoch": 51.95238095238095, "grad_norm": 1.5173331499099731, "learning_rate": 1.9090909090909094e-05, "loss": 0.2593, "mean_token_accuracy": 0.9801317751407623, "step": 311 }, { "epoch": 52.0, "grad_norm": 2.2640252113342285, "learning_rate": 1.898989898989899e-05, "loss": 0.087, "mean_token_accuracy": 0.9558823704719543, "step": 312 }, { "epoch": 52.19047619047619, "grad_norm": 1.4061003923416138, "learning_rate": 1.888888888888889e-05, "loss": 0.2364, "mean_token_accuracy": 0.9783814698457718, "step": 313 }, { "epoch": 52.38095238095238, "grad_norm": 1.2146430015563965, "learning_rate": 1.878787878787879e-05, "loss": 0.2265, "mean_token_accuracy": 0.9835509955883026, "step": 314 }, { "epoch": 52.57142857142857, "grad_norm": 1.5701649188995361, "learning_rate": 1.8686868686868688e-05, "loss": 0.2637, "mean_token_accuracy": 0.9780102521181107, "step": 315 }, { "epoch": 52.76190476190476, "grad_norm": 1.5340619087219238, "learning_rate": 1.8585858585858588e-05, "loss": 0.2627, "mean_token_accuracy": 0.9796072393655777, "step": 316 }, { "epoch": 52.95238095238095, "grad_norm": 1.6451423168182373, "learning_rate": 1.8484848484848487e-05, "loss": 0.2599, "mean_token_accuracy": 0.9780296385288239, "step": 317 }, { "epoch": 53.0, "grad_norm": 1.1250572204589844, "learning_rate": 1.8383838383838383e-05, "loss": 0.0599, "mean_token_accuracy": 0.987500011920929, "step": 318 }, { "epoch": 53.19047619047619, "grad_norm": 1.382422924041748, "learning_rate": 1.8282828282828286e-05, "loss": 0.2615, "mean_token_accuracy": 0.9795158058404922, "step": 319 }, { "epoch": 53.38095238095238, "grad_norm": 1.434237003326416, "learning_rate": 1.8181818181818182e-05, "loss": 0.2226, "mean_token_accuracy": 0.9817993342876434, "step": 320 }, { "epoch": 53.57142857142857, "grad_norm": 1.3543226718902588, "learning_rate": 1.808080808080808e-05, "loss": 0.2455, "mean_token_accuracy": 0.9820217341184616, "step": 321 }, { "epoch": 53.76190476190476, "grad_norm": 1.5558395385742188, "learning_rate": 1.797979797979798e-05, "loss": 0.2473, "mean_token_accuracy": 0.9786651730537415, "step": 322 }, { "epoch": 53.95238095238095, "grad_norm": 1.998782992362976, "learning_rate": 1.787878787878788e-05, "loss": 0.2808, "mean_token_accuracy": 0.9743632227182388, "step": 323 }, { "epoch": 54.0, "grad_norm": 1.8470655679702759, "learning_rate": 1.777777777777778e-05, "loss": 0.0674, "mean_token_accuracy": 0.978723406791687, "step": 324 }, { "epoch": 54.19047619047619, "grad_norm": 1.557365894317627, "learning_rate": 1.7676767676767676e-05, "loss": 0.2485, "mean_token_accuracy": 0.9763985723257065, "step": 325 }, { "epoch": 54.38095238095238, "grad_norm": 1.2708889245986938, "learning_rate": 1.7575757575757576e-05, "loss": 0.2396, "mean_token_accuracy": 0.9807141125202179, "step": 326 }, { "epoch": 54.57142857142857, "grad_norm": 1.574637770652771, "learning_rate": 1.7474747474747475e-05, "loss": 0.2552, "mean_token_accuracy": 0.9784888029098511, "step": 327 }, { "epoch": 54.76190476190476, "grad_norm": 1.5815781354904175, "learning_rate": 1.7373737373737375e-05, "loss": 0.2516, "mean_token_accuracy": 0.9797972589731216, "step": 328 }, { "epoch": 54.95238095238095, "grad_norm": 1.4875643253326416, "learning_rate": 1.7272727272727274e-05, "loss": 0.253, "mean_token_accuracy": 0.9805921763181686, "step": 329 }, { "epoch": 55.0, "grad_norm": 1.404120922088623, "learning_rate": 1.7171717171717173e-05, "loss": 0.0607, "mean_token_accuracy": 0.9756097793579102, "step": 330 }, { "epoch": 55.19047619047619, "grad_norm": 1.1672003269195557, "learning_rate": 1.707070707070707e-05, "loss": 0.226, "mean_token_accuracy": 0.9818458557128906, "step": 331 }, { "epoch": 55.38095238095238, "grad_norm": 1.3702583312988281, "learning_rate": 1.6969696969696972e-05, "loss": 0.2285, "mean_token_accuracy": 0.9818858057260513, "step": 332 }, { "epoch": 55.57142857142857, "grad_norm": 1.567103624343872, "learning_rate": 1.686868686868687e-05, "loss": 0.2592, "mean_token_accuracy": 0.9774815589189529, "step": 333 }, { "epoch": 55.76190476190476, "grad_norm": 1.5476545095443726, "learning_rate": 1.6767676767676768e-05, "loss": 0.2693, "mean_token_accuracy": 0.9761824756860733, "step": 334 }, { "epoch": 55.95238095238095, "grad_norm": 1.7951135635375977, "learning_rate": 1.6666666666666667e-05, "loss": 0.2627, "mean_token_accuracy": 0.9772898554801941, "step": 335 }, { "epoch": 56.0, "grad_norm": 1.5311144590377808, "learning_rate": 1.6565656565656567e-05, "loss": 0.0607, "mean_token_accuracy": 0.9750000238418579, "step": 336 }, { "epoch": 56.19047619047619, "grad_norm": 1.4896326065063477, "learning_rate": 1.6464646464646466e-05, "loss": 0.2483, "mean_token_accuracy": 0.9790806472301483, "step": 337 }, { "epoch": 56.38095238095238, "grad_norm": 1.385233998298645, "learning_rate": 1.6363636363636366e-05, "loss": 0.2471, "mean_token_accuracy": 0.9801070243120193, "step": 338 }, { "epoch": 56.57142857142857, "grad_norm": 1.5755606889724731, "learning_rate": 1.6262626262626262e-05, "loss": 0.2462, "mean_token_accuracy": 0.9776095598936081, "step": 339 }, { "epoch": 56.76190476190476, "grad_norm": 1.4080952405929565, "learning_rate": 1.6161616161616165e-05, "loss": 0.2559, "mean_token_accuracy": 0.9763025045394897, "step": 340 }, { "epoch": 56.95238095238095, "grad_norm": 1.2759824991226196, "learning_rate": 1.606060606060606e-05, "loss": 0.2429, "mean_token_accuracy": 0.9811924993991852, "step": 341 }, { "epoch": 57.0, "grad_norm": 1.4365907907485962, "learning_rate": 1.595959595959596e-05, "loss": 0.0744, "mean_token_accuracy": 0.9836065769195557, "step": 342 }, { "epoch": 57.19047619047619, "grad_norm": 1.4234627485275269, "learning_rate": 1.585858585858586e-05, "loss": 0.2353, "mean_token_accuracy": 0.9792965203523636, "step": 343 }, { "epoch": 57.38095238095238, "grad_norm": 1.3555465936660767, "learning_rate": 1.5757575757575756e-05, "loss": 0.2494, "mean_token_accuracy": 0.9825381934642792, "step": 344 }, { "epoch": 57.57142857142857, "grad_norm": 1.4413907527923584, "learning_rate": 1.565656565656566e-05, "loss": 0.2534, "mean_token_accuracy": 0.979871854186058, "step": 345 }, { "epoch": 57.76190476190476, "grad_norm": 1.4927953481674194, "learning_rate": 1.5555555555555555e-05, "loss": 0.2305, "mean_token_accuracy": 0.9812074899673462, "step": 346 }, { "epoch": 57.95238095238095, "grad_norm": 1.7719610929489136, "learning_rate": 1.5454545454545454e-05, "loss": 0.2633, "mean_token_accuracy": 0.9754152894020081, "step": 347 }, { "epoch": 58.0, "grad_norm": 0.9548564553260803, "learning_rate": 1.5353535353535354e-05, "loss": 0.0521, "mean_token_accuracy": 0.9885057210922241, "step": 348 }, { "epoch": 58.19047619047619, "grad_norm": 1.4914696216583252, "learning_rate": 1.5252525252525255e-05, "loss": 0.2591, "mean_token_accuracy": 0.9796448796987534, "step": 349 }, { "epoch": 58.38095238095238, "grad_norm": 1.4677958488464355, "learning_rate": 1.5151515151515153e-05, "loss": 0.2468, "mean_token_accuracy": 0.9798107296228409, "step": 350 }, { "epoch": 58.57142857142857, "grad_norm": 1.3141554594039917, "learning_rate": 1.505050505050505e-05, "loss": 0.2325, "mean_token_accuracy": 0.9803733974695206, "step": 351 }, { "epoch": 58.76190476190476, "grad_norm": 1.3697947263717651, "learning_rate": 1.494949494949495e-05, "loss": 0.2598, "mean_token_accuracy": 0.9749108999967575, "step": 352 }, { "epoch": 58.95238095238095, "grad_norm": 1.252795696258545, "learning_rate": 1.484848484848485e-05, "loss": 0.2361, "mean_token_accuracy": 0.9824285060167313, "step": 353 }, { "epoch": 59.0, "grad_norm": 1.830544114112854, "learning_rate": 1.4747474747474749e-05, "loss": 0.0772, "mean_token_accuracy": 0.9682539701461792, "step": 354 }, { "epoch": 59.19047619047619, "grad_norm": 1.266861081123352, "learning_rate": 1.4646464646464647e-05, "loss": 0.236, "mean_token_accuracy": 0.9807495921850204, "step": 355 }, { "epoch": 59.38095238095238, "grad_norm": 1.5132209062576294, "learning_rate": 1.4545454545454545e-05, "loss": 0.2498, "mean_token_accuracy": 0.9786520302295685, "step": 356 }, { "epoch": 59.57142857142857, "grad_norm": 1.259032964706421, "learning_rate": 1.4444444444444444e-05, "loss": 0.2223, "mean_token_accuracy": 0.9812145084142685, "step": 357 }, { "epoch": 59.76190476190476, "grad_norm": 1.5718448162078857, "learning_rate": 1.4343434343434345e-05, "loss": 0.2627, "mean_token_accuracy": 0.9778482913970947, "step": 358 }, { "epoch": 59.95238095238095, "grad_norm": 1.4775868654251099, "learning_rate": 1.4242424242424243e-05, "loss": 0.2587, "mean_token_accuracy": 0.9746824651956558, "step": 359 }, { "epoch": 60.0, "grad_norm": 1.638393521308899, "learning_rate": 1.4141414141414141e-05, "loss": 0.0824, "mean_token_accuracy": 0.9824561476707458, "step": 360 }, { "epoch": 60.19047619047619, "grad_norm": 1.3080830574035645, "learning_rate": 1.404040404040404e-05, "loss": 0.2382, "mean_token_accuracy": 0.9818608462810516, "step": 361 }, { "epoch": 60.38095238095238, "grad_norm": 1.1936572790145874, "learning_rate": 1.3939393939393942e-05, "loss": 0.2333, "mean_token_accuracy": 0.9817762225866318, "step": 362 }, { "epoch": 60.57142857142857, "grad_norm": 1.5468491315841675, "learning_rate": 1.383838383838384e-05, "loss": 0.2653, "mean_token_accuracy": 0.9788466989994049, "step": 363 }, { "epoch": 60.76190476190476, "grad_norm": 1.3440382480621338, "learning_rate": 1.3737373737373737e-05, "loss": 0.2495, "mean_token_accuracy": 0.9803344905376434, "step": 364 }, { "epoch": 60.95238095238095, "grad_norm": 1.5807853937149048, "learning_rate": 1.3636363636363637e-05, "loss": 0.2399, "mean_token_accuracy": 0.977335661649704, "step": 365 }, { "epoch": 61.0, "grad_norm": 1.8642648458480835, "learning_rate": 1.3535353535353538e-05, "loss": 0.0675, "mean_token_accuracy": 0.9610389471054077, "step": 366 }, { "epoch": 61.19047619047619, "grad_norm": 1.4595698118209839, "learning_rate": 1.3434343434343436e-05, "loss": 0.2433, "mean_token_accuracy": 0.9782412499189377, "step": 367 }, { "epoch": 61.38095238095238, "grad_norm": 1.7195943593978882, "learning_rate": 1.3333333333333333e-05, "loss": 0.2283, "mean_token_accuracy": 0.98487289249897, "step": 368 }, { "epoch": 61.57142857142857, "grad_norm": 1.6731146574020386, "learning_rate": 1.3232323232323233e-05, "loss": 0.2481, "mean_token_accuracy": 0.9755380898714066, "step": 369 }, { "epoch": 61.76190476190476, "grad_norm": 1.3162552118301392, "learning_rate": 1.3131313131313134e-05, "loss": 0.2682, "mean_token_accuracy": 0.9773096293210983, "step": 370 }, { "epoch": 61.95238095238095, "grad_norm": 1.5763328075408936, "learning_rate": 1.3030303030303032e-05, "loss": 0.247, "mean_token_accuracy": 0.9791599065065384, "step": 371 }, { "epoch": 62.0, "grad_norm": 1.8567732572555542, "learning_rate": 1.292929292929293e-05, "loss": 0.0676, "mean_token_accuracy": 0.970588207244873, "step": 372 }, { "epoch": 62.19047619047619, "grad_norm": 1.322481393814087, "learning_rate": 1.2828282828282829e-05, "loss": 0.2385, "mean_token_accuracy": 0.979724794626236, "step": 373 }, { "epoch": 62.38095238095238, "grad_norm": 1.4246753454208374, "learning_rate": 1.2727272727272727e-05, "loss": 0.2467, "mean_token_accuracy": 0.9777331054210663, "step": 374 }, { "epoch": 62.57142857142857, "grad_norm": 1.4530190229415894, "learning_rate": 1.2626262626262628e-05, "loss": 0.2377, "mean_token_accuracy": 0.9767781794071198, "step": 375 }, { "epoch": 62.76190476190476, "grad_norm": 1.4946351051330566, "learning_rate": 1.2525252525252526e-05, "loss": 0.2547, "mean_token_accuracy": 0.9767863899469376, "step": 376 }, { "epoch": 62.95238095238095, "grad_norm": 1.442986011505127, "learning_rate": 1.2424242424242424e-05, "loss": 0.2575, "mean_token_accuracy": 0.9808852076530457, "step": 377 }, { "epoch": 63.0, "grad_norm": 2.1069142818450928, "learning_rate": 1.2323232323232325e-05, "loss": 0.0682, "mean_token_accuracy": 0.9726027250289917, "step": 378 }, { "epoch": 63.19047619047619, "grad_norm": 1.4386465549468994, "learning_rate": 1.2222222222222222e-05, "loss": 0.2472, "mean_token_accuracy": 0.9808338433504105, "step": 379 }, { "epoch": 63.38095238095238, "grad_norm": 1.5726056098937988, "learning_rate": 1.2121212121212122e-05, "loss": 0.2488, "mean_token_accuracy": 0.9816757142543793, "step": 380 }, { "epoch": 63.57142857142857, "grad_norm": 1.6537950038909912, "learning_rate": 1.202020202020202e-05, "loss": 0.2471, "mean_token_accuracy": 0.9798701107501984, "step": 381 }, { "epoch": 63.76190476190476, "grad_norm": 1.4154284000396729, "learning_rate": 1.1919191919191921e-05, "loss": 0.2483, "mean_token_accuracy": 0.9786428213119507, "step": 382 }, { "epoch": 63.95238095238095, "grad_norm": 1.493235468864441, "learning_rate": 1.1818181818181819e-05, "loss": 0.2499, "mean_token_accuracy": 0.9752872586250305, "step": 383 }, { "epoch": 64.0, "grad_norm": 0.9331473112106323, "learning_rate": 1.1717171717171718e-05, "loss": 0.0481, "mean_token_accuracy": 0.9902912378311157, "step": 384 }, { "epoch": 64.19047619047619, "grad_norm": 1.5490996837615967, "learning_rate": 1.1616161616161616e-05, "loss": 0.2544, "mean_token_accuracy": 0.9750427901744843, "step": 385 }, { "epoch": 64.38095238095238, "grad_norm": 1.2337415218353271, "learning_rate": 1.1515151515151517e-05, "loss": 0.2372, "mean_token_accuracy": 0.9794412702322006, "step": 386 }, { "epoch": 64.57142857142857, "grad_norm": 1.3450168371200562, "learning_rate": 1.1414141414141415e-05, "loss": 0.251, "mean_token_accuracy": 0.9808587580919266, "step": 387 }, { "epoch": 64.76190476190476, "grad_norm": 1.4372197389602661, "learning_rate": 1.1313131313131314e-05, "loss": 0.2541, "mean_token_accuracy": 0.9765901118516922, "step": 388 }, { "epoch": 64.95238095238095, "grad_norm": 1.3596030473709106, "learning_rate": 1.1212121212121212e-05, "loss": 0.2327, "mean_token_accuracy": 0.9819456040859222, "step": 389 }, { "epoch": 65.0, "grad_norm": 1.2771663665771484, "learning_rate": 1.1111111111111112e-05, "loss": 0.0615, "mean_token_accuracy": 0.9871794581413269, "step": 390 }, { "epoch": 65.19047619047619, "grad_norm": 1.3283063173294067, "learning_rate": 1.1010101010101011e-05, "loss": 0.2431, "mean_token_accuracy": 0.9796550124883652, "step": 391 }, { "epoch": 65.38095238095238, "grad_norm": 1.4404308795928955, "learning_rate": 1.0909090909090909e-05, "loss": 0.242, "mean_token_accuracy": 0.9827671945095062, "step": 392 }, { "epoch": 65.57142857142857, "grad_norm": 1.322653889656067, "learning_rate": 1.0808080808080808e-05, "loss": 0.235, "mean_token_accuracy": 0.9791911989450455, "step": 393 }, { "epoch": 65.76190476190476, "grad_norm": 1.346421718597412, "learning_rate": 1.0707070707070708e-05, "loss": 0.2602, "mean_token_accuracy": 0.9792519062757492, "step": 394 }, { "epoch": 65.95238095238095, "grad_norm": 1.361152172088623, "learning_rate": 1.0606060606060607e-05, "loss": 0.2404, "mean_token_accuracy": 0.9787698835134506, "step": 395 }, { "epoch": 66.0, "grad_norm": 1.4586611986160278, "learning_rate": 1.0505050505050505e-05, "loss": 0.0681, "mean_token_accuracy": 0.970588207244873, "step": 396 }, { "epoch": 66.19047619047619, "grad_norm": 1.4977368116378784, "learning_rate": 1.0404040404040405e-05, "loss": 0.2359, "mean_token_accuracy": 0.9806597381830215, "step": 397 }, { "epoch": 66.38095238095238, "grad_norm": 1.2351692914962769, "learning_rate": 1.0303030303030304e-05, "loss": 0.2508, "mean_token_accuracy": 0.977878749370575, "step": 398 }, { "epoch": 66.57142857142857, "grad_norm": 1.3478460311889648, "learning_rate": 1.0202020202020204e-05, "loss": 0.2321, "mean_token_accuracy": 0.9855255037546158, "step": 399 }, { "epoch": 66.76190476190476, "grad_norm": 1.618532419204712, "learning_rate": 1.0101010101010101e-05, "loss": 0.2658, "mean_token_accuracy": 0.9772535562515259, "step": 400 }, { "epoch": 66.95238095238095, "grad_norm": 1.5389485359191895, "learning_rate": 1e-05, "loss": 0.2465, "mean_token_accuracy": 0.9769544303417206, "step": 401 }, { "epoch": 67.0, "grad_norm": 0.9716305732727051, "learning_rate": 9.898989898989899e-06, "loss": 0.0529, "mean_token_accuracy": 0.9885057210922241, "step": 402 }, { "epoch": 67.19047619047619, "grad_norm": 1.4950332641601562, "learning_rate": 9.7979797979798e-06, "loss": 0.249, "mean_token_accuracy": 0.9769591093063354, "step": 403 }, { "epoch": 67.38095238095238, "grad_norm": 1.524194359779358, "learning_rate": 9.696969696969698e-06, "loss": 0.2477, "mean_token_accuracy": 0.98219034075737, "step": 404 }, { "epoch": 67.57142857142857, "grad_norm": 1.231911540031433, "learning_rate": 9.595959595959595e-06, "loss": 0.2232, "mean_token_accuracy": 0.9810429662466049, "step": 405 }, { "epoch": 67.76190476190476, "grad_norm": 1.404455304145813, "learning_rate": 9.494949494949495e-06, "loss": 0.2701, "mean_token_accuracy": 0.9793097227811813, "step": 406 }, { "epoch": 67.95238095238095, "grad_norm": 1.3537510633468628, "learning_rate": 9.393939393939394e-06, "loss": 0.2338, "mean_token_accuracy": 0.9800481051206589, "step": 407 }, { "epoch": 68.0, "grad_norm": 0.9093771576881409, "learning_rate": 9.292929292929294e-06, "loss": 0.0423, "mean_token_accuracy": 0.9902912378311157, "step": 408 }, { "epoch": 68.19047619047619, "grad_norm": 1.3876770734786987, "learning_rate": 9.191919191919192e-06, "loss": 0.2453, "mean_token_accuracy": 0.9814929813146591, "step": 409 }, { "epoch": 68.38095238095238, "grad_norm": 1.5604972839355469, "learning_rate": 9.090909090909091e-06, "loss": 0.2474, "mean_token_accuracy": 0.9796653985977173, "step": 410 }, { "epoch": 68.57142857142857, "grad_norm": 1.4196627140045166, "learning_rate": 8.98989898989899e-06, "loss": 0.2421, "mean_token_accuracy": 0.9826227128505707, "step": 411 }, { "epoch": 68.76190476190476, "grad_norm": 1.4446525573730469, "learning_rate": 8.88888888888889e-06, "loss": 0.237, "mean_token_accuracy": 0.9770011454820633, "step": 412 }, { "epoch": 68.95238095238095, "grad_norm": 1.3088741302490234, "learning_rate": 8.787878787878788e-06, "loss": 0.242, "mean_token_accuracy": 0.9788557142019272, "step": 413 }, { "epoch": 69.0, "grad_norm": 1.1058439016342163, "learning_rate": 8.686868686868687e-06, "loss": 0.0552, "mean_token_accuracy": 0.9878048896789551, "step": 414 }, { "epoch": 69.19047619047619, "grad_norm": 1.5012304782867432, "learning_rate": 8.585858585858587e-06, "loss": 0.2472, "mean_token_accuracy": 0.9804881513118744, "step": 415 }, { "epoch": 69.38095238095238, "grad_norm": 1.2776250839233398, "learning_rate": 8.484848484848486e-06, "loss": 0.245, "mean_token_accuracy": 0.9793550372123718, "step": 416 }, { "epoch": 69.57142857142857, "grad_norm": 1.4031535387039185, "learning_rate": 8.383838383838384e-06, "loss": 0.2391, "mean_token_accuracy": 0.9811627715826035, "step": 417 }, { "epoch": 69.76190476190476, "grad_norm": 1.5323896408081055, "learning_rate": 8.282828282828283e-06, "loss": 0.2402, "mean_token_accuracy": 0.9756592959165573, "step": 418 }, { "epoch": 69.95238095238095, "grad_norm": 1.415002465248108, "learning_rate": 8.181818181818183e-06, "loss": 0.2447, "mean_token_accuracy": 0.9816397428512573, "step": 419 }, { "epoch": 70.0, "grad_norm": 1.84005606174469, "learning_rate": 8.080808080808082e-06, "loss": 0.0622, "mean_token_accuracy": 0.9726027250289917, "step": 420 }, { "epoch": 70.19047619047619, "grad_norm": 1.3505762815475464, "learning_rate": 7.97979797979798e-06, "loss": 0.2363, "mean_token_accuracy": 0.9800622910261154, "step": 421 }, { "epoch": 70.38095238095238, "grad_norm": 1.3231146335601807, "learning_rate": 7.878787878787878e-06, "loss": 0.2327, "mean_token_accuracy": 0.9815961122512817, "step": 422 }, { "epoch": 70.57142857142857, "grad_norm": 1.6289716958999634, "learning_rate": 7.777777777777777e-06, "loss": 0.2469, "mean_token_accuracy": 0.976947546005249, "step": 423 }, { "epoch": 70.76190476190476, "grad_norm": 1.5643327236175537, "learning_rate": 7.676767676767677e-06, "loss": 0.2541, "mean_token_accuracy": 0.9771561771631241, "step": 424 }, { "epoch": 70.95238095238095, "grad_norm": 1.4305167198181152, "learning_rate": 7.5757575757575764e-06, "loss": 0.2452, "mean_token_accuracy": 0.9759194254875183, "step": 425 }, { "epoch": 71.0, "grad_norm": 1.5850602388381958, "learning_rate": 7.474747474747475e-06, "loss": 0.0683, "mean_token_accuracy": 0.9850746393203735, "step": 426 }, { "epoch": 71.19047619047619, "grad_norm": 1.3248540163040161, "learning_rate": 7.3737373737373745e-06, "loss": 0.24, "mean_token_accuracy": 0.9821758568286896, "step": 427 }, { "epoch": 71.38095238095238, "grad_norm": 1.3908957242965698, "learning_rate": 7.272727272727272e-06, "loss": 0.242, "mean_token_accuracy": 0.9802806377410889, "step": 428 }, { "epoch": 71.57142857142857, "grad_norm": 1.3902804851531982, "learning_rate": 7.171717171717173e-06, "loss": 0.2423, "mean_token_accuracy": 0.9788789004087448, "step": 429 }, { "epoch": 71.76190476190476, "grad_norm": 1.4126980304718018, "learning_rate": 7.0707070707070704e-06, "loss": 0.2437, "mean_token_accuracy": 0.9766863882541656, "step": 430 }, { "epoch": 71.95238095238095, "grad_norm": 1.423156499862671, "learning_rate": 6.969696969696971e-06, "loss": 0.2427, "mean_token_accuracy": 0.9781524240970612, "step": 431 }, { "epoch": 72.0, "grad_norm": 1.736093521118164, "learning_rate": 6.8686868686868685e-06, "loss": 0.0814, "mean_token_accuracy": 0.9818181991577148, "step": 432 }, { "epoch": 72.19047619047619, "grad_norm": 1.281557321548462, "learning_rate": 6.767676767676769e-06, "loss": 0.2482, "mean_token_accuracy": 0.9825676530599594, "step": 433 }, { "epoch": 72.38095238095238, "grad_norm": 1.3980622291564941, "learning_rate": 6.666666666666667e-06, "loss": 0.2428, "mean_token_accuracy": 0.9788574278354645, "step": 434 }, { "epoch": 72.57142857142857, "grad_norm": 1.419425368309021, "learning_rate": 6.565656565656567e-06, "loss": 0.2431, "mean_token_accuracy": 0.9791808128356934, "step": 435 }, { "epoch": 72.76190476190476, "grad_norm": 1.5525389909744263, "learning_rate": 6.464646464646465e-06, "loss": 0.2538, "mean_token_accuracy": 0.9783525764942169, "step": 436 }, { "epoch": 72.95238095238095, "grad_norm": 1.295773983001709, "learning_rate": 6.363636363636363e-06, "loss": 0.2299, "mean_token_accuracy": 0.9779433310031891, "step": 437 }, { "epoch": 73.0, "grad_norm": 0.6111257076263428, "learning_rate": 6.262626262626263e-06, "loss": 0.0384, "mean_token_accuracy": 0.9922480583190918, "step": 438 }, { "epoch": 73.19047619047619, "grad_norm": 1.387117862701416, "learning_rate": 6.161616161616162e-06, "loss": 0.2405, "mean_token_accuracy": 0.979522630572319, "step": 439 }, { "epoch": 73.38095238095238, "grad_norm": 1.3952202796936035, "learning_rate": 6.060606060606061e-06, "loss": 0.2486, "mean_token_accuracy": 0.9780898541212082, "step": 440 }, { "epoch": 73.57142857142857, "grad_norm": 1.6391713619232178, "learning_rate": 5.9595959595959605e-06, "loss": 0.2504, "mean_token_accuracy": 0.9782277494668961, "step": 441 }, { "epoch": 73.76190476190476, "grad_norm": 1.4811103343963623, "learning_rate": 5.858585858585859e-06, "loss": 0.2392, "mean_token_accuracy": 0.9793239235877991, "step": 442 }, { "epoch": 73.95238095238095, "grad_norm": 1.4281538724899292, "learning_rate": 5.7575757575757586e-06, "loss": 0.2326, "mean_token_accuracy": 0.979654997587204, "step": 443 }, { "epoch": 74.0, "grad_norm": 1.2993221282958984, "learning_rate": 5.656565656565657e-06, "loss": 0.0573, "mean_token_accuracy": 0.9876543283462524, "step": 444 }, { "epoch": 74.19047619047619, "grad_norm": 1.2887934446334839, "learning_rate": 5.555555555555556e-06, "loss": 0.2422, "mean_token_accuracy": 0.9798881709575653, "step": 445 }, { "epoch": 74.38095238095238, "grad_norm": 1.581034779548645, "learning_rate": 5.4545454545454545e-06, "loss": 0.2462, "mean_token_accuracy": 0.9796192944049835, "step": 446 }, { "epoch": 74.57142857142857, "grad_norm": 1.219085693359375, "learning_rate": 5.353535353535354e-06, "loss": 0.2434, "mean_token_accuracy": 0.9797424674034119, "step": 447 }, { "epoch": 74.76190476190476, "grad_norm": 1.2309306859970093, "learning_rate": 5.2525252525252526e-06, "loss": 0.2379, "mean_token_accuracy": 0.978371798992157, "step": 448 }, { "epoch": 74.95238095238095, "grad_norm": 1.4002373218536377, "learning_rate": 5.151515151515152e-06, "loss": 0.2325, "mean_token_accuracy": 0.9793529957532883, "step": 449 }, { "epoch": 75.0, "grad_norm": 2.0193445682525635, "learning_rate": 5.050505050505051e-06, "loss": 0.0842, "mean_token_accuracy": 0.9807692170143127, "step": 450 }, { "epoch": 75.19047619047619, "grad_norm": 1.3020991086959839, "learning_rate": 4.949494949494949e-06, "loss": 0.2249, "mean_token_accuracy": 0.983807697892189, "step": 451 }, { "epoch": 75.38095238095238, "grad_norm": 1.2189743518829346, "learning_rate": 4.848484848484849e-06, "loss": 0.2444, "mean_token_accuracy": 0.9823562502861023, "step": 452 }, { "epoch": 75.57142857142857, "grad_norm": 1.43671715259552, "learning_rate": 4.747474747474747e-06, "loss": 0.2473, "mean_token_accuracy": 0.9775967448949814, "step": 453 }, { "epoch": 75.76190476190476, "grad_norm": 1.6678014993667603, "learning_rate": 4.646464646464647e-06, "loss": 0.2352, "mean_token_accuracy": 0.9812745600938797, "step": 454 }, { "epoch": 75.95238095238095, "grad_norm": 1.9260616302490234, "learning_rate": 4.5454545454545455e-06, "loss": 0.2581, "mean_token_accuracy": 0.9734574407339096, "step": 455 }, { "epoch": 76.0, "grad_norm": 1.5224919319152832, "learning_rate": 4.444444444444445e-06, "loss": 0.0667, "mean_token_accuracy": 0.9846153855323792, "step": 456 }, { "epoch": 76.19047619047619, "grad_norm": 1.1384742259979248, "learning_rate": 4.343434343434344e-06, "loss": 0.2166, "mean_token_accuracy": 0.9816610366106033, "step": 457 }, { "epoch": 76.38095238095238, "grad_norm": 1.5136680603027344, "learning_rate": 4.242424242424243e-06, "loss": 0.2443, "mean_token_accuracy": 0.9804109483957291, "step": 458 }, { "epoch": 76.57142857142857, "grad_norm": 1.5559028387069702, "learning_rate": 4.141414141414142e-06, "loss": 0.2472, "mean_token_accuracy": 0.9795145392417908, "step": 459 }, { "epoch": 76.76190476190476, "grad_norm": 1.4042458534240723, "learning_rate": 4.040404040404041e-06, "loss": 0.2422, "mean_token_accuracy": 0.9746371954679489, "step": 460 }, { "epoch": 76.95238095238095, "grad_norm": 1.3069055080413818, "learning_rate": 3.939393939393939e-06, "loss": 0.2574, "mean_token_accuracy": 0.981501892209053, "step": 461 }, { "epoch": 77.0, "grad_norm": 1.4545823335647583, "learning_rate": 3.8383838383838385e-06, "loss": 0.0675, "mean_token_accuracy": 0.970588207244873, "step": 462 }, { "epoch": 77.19047619047619, "grad_norm": 1.4684022665023804, "learning_rate": 3.7373737373737375e-06, "loss": 0.2269, "mean_token_accuracy": 0.981085941195488, "step": 463 }, { "epoch": 77.38095238095238, "grad_norm": 1.5217136144638062, "learning_rate": 3.636363636363636e-06, "loss": 0.2415, "mean_token_accuracy": 0.9836974442005157, "step": 464 }, { "epoch": 77.57142857142857, "grad_norm": 1.2941691875457764, "learning_rate": 3.5353535353535352e-06, "loss": 0.2387, "mean_token_accuracy": 0.978131577372551, "step": 465 }, { "epoch": 77.76190476190476, "grad_norm": 1.4465221166610718, "learning_rate": 3.4343434343434343e-06, "loss": 0.2404, "mean_token_accuracy": 0.9785452336072922, "step": 466 }, { "epoch": 77.95238095238095, "grad_norm": 1.4259777069091797, "learning_rate": 3.3333333333333333e-06, "loss": 0.2515, "mean_token_accuracy": 0.9781184196472168, "step": 467 }, { "epoch": 78.0, "grad_norm": 1.9436161518096924, "learning_rate": 3.2323232323232324e-06, "loss": 0.0751, "mean_token_accuracy": 0.9661017060279846, "step": 468 }, { "epoch": 78.19047619047619, "grad_norm": 1.2418111562728882, "learning_rate": 3.1313131313131314e-06, "loss": 0.2206, "mean_token_accuracy": 0.980968713760376, "step": 469 }, { "epoch": 78.38095238095238, "grad_norm": 1.3781098127365112, "learning_rate": 3.0303030303030305e-06, "loss": 0.2423, "mean_token_accuracy": 0.9790701419115067, "step": 470 }, { "epoch": 78.57142857142857, "grad_norm": 1.3852852582931519, "learning_rate": 2.9292929292929295e-06, "loss": 0.2423, "mean_token_accuracy": 0.9788630157709122, "step": 471 }, { "epoch": 78.76190476190476, "grad_norm": 1.5246734619140625, "learning_rate": 2.8282828282828286e-06, "loss": 0.2497, "mean_token_accuracy": 0.9794032126665115, "step": 472 }, { "epoch": 78.95238095238095, "grad_norm": 1.4307729005813599, "learning_rate": 2.7272727272727272e-06, "loss": 0.2479, "mean_token_accuracy": 0.9815962314605713, "step": 473 }, { "epoch": 79.0, "grad_norm": 1.941765308380127, "learning_rate": 2.6262626262626263e-06, "loss": 0.0653, "mean_token_accuracy": 0.9577465057373047, "step": 474 }, { "epoch": 79.19047619047619, "grad_norm": 1.2771799564361572, "learning_rate": 2.5252525252525253e-06, "loss": 0.2255, "mean_token_accuracy": 0.9810370206832886, "step": 475 }, { "epoch": 79.38095238095238, "grad_norm": 1.325358271598816, "learning_rate": 2.4242424242424244e-06, "loss": 0.242, "mean_token_accuracy": 0.9791529029607773, "step": 476 }, { "epoch": 79.57142857142857, "grad_norm": 1.295100212097168, "learning_rate": 2.3232323232323234e-06, "loss": 0.2488, "mean_token_accuracy": 0.9798661768436432, "step": 477 }, { "epoch": 79.76190476190476, "grad_norm": 1.4676238298416138, "learning_rate": 2.2222222222222225e-06, "loss": 0.2367, "mean_token_accuracy": 0.9780342727899551, "step": 478 }, { "epoch": 79.95238095238095, "grad_norm": 1.7996033430099487, "learning_rate": 2.1212121212121216e-06, "loss": 0.2452, "mean_token_accuracy": 0.9771022349596024, "step": 479 }, { "epoch": 80.0, "grad_norm": 1.3761502504348755, "learning_rate": 2.0202020202020206e-06, "loss": 0.06, "mean_token_accuracy": 0.970588207244873, "step": 480 }, { "epoch": 80.19047619047619, "grad_norm": 1.3741532564163208, "learning_rate": 1.9191919191919192e-06, "loss": 0.2414, "mean_token_accuracy": 0.9827142953872681, "step": 481 }, { "epoch": 80.38095238095238, "grad_norm": 1.680336594581604, "learning_rate": 1.818181818181818e-06, "loss": 0.2308, "mean_token_accuracy": 0.980181872844696, "step": 482 }, { "epoch": 80.57142857142857, "grad_norm": 1.1747589111328125, "learning_rate": 1.7171717171717171e-06, "loss": 0.2201, "mean_token_accuracy": 0.9804712980985641, "step": 483 }, { "epoch": 80.76190476190476, "grad_norm": 1.4682387113571167, "learning_rate": 1.6161616161616162e-06, "loss": 0.2481, "mean_token_accuracy": 0.9811168909072876, "step": 484 }, { "epoch": 80.95238095238095, "grad_norm": 1.5288760662078857, "learning_rate": 1.5151515151515152e-06, "loss": 0.2542, "mean_token_accuracy": 0.9763506799936295, "step": 485 }, { "epoch": 81.0, "grad_norm": 2.051353931427002, "learning_rate": 1.4141414141414143e-06, "loss": 0.0759, "mean_token_accuracy": 0.9666666388511658, "step": 486 }, { "epoch": 81.19047619047619, "grad_norm": 1.4453171491622925, "learning_rate": 1.3131313131313131e-06, "loss": 0.2488, "mean_token_accuracy": 0.9764743894338608, "step": 487 }, { "epoch": 81.38095238095238, "grad_norm": 1.2203129529953003, "learning_rate": 1.2121212121212122e-06, "loss": 0.2208, "mean_token_accuracy": 0.9802269041538239, "step": 488 }, { "epoch": 81.57142857142857, "grad_norm": 1.338069200515747, "learning_rate": 1.1111111111111112e-06, "loss": 0.2454, "mean_token_accuracy": 0.9848097264766693, "step": 489 }, { "epoch": 81.76190476190476, "grad_norm": 1.3311666250228882, "learning_rate": 1.0101010101010103e-06, "loss": 0.2276, "mean_token_accuracy": 0.9802386462688446, "step": 490 }, { "epoch": 81.95238095238095, "grad_norm": 1.4156842231750488, "learning_rate": 9.09090909090909e-07, "loss": 0.2622, "mean_token_accuracy": 0.9762069880962372, "step": 491 }, { "epoch": 82.0, "grad_norm": 1.7438231706619263, "learning_rate": 8.080808080808081e-07, "loss": 0.0642, "mean_token_accuracy": 0.9710144996643066, "step": 492 }, { "epoch": 82.19047619047619, "grad_norm": 1.338675618171692, "learning_rate": 7.070707070707071e-07, "loss": 0.2547, "mean_token_accuracy": 0.9793485999107361, "step": 493 }, { "epoch": 82.38095238095238, "grad_norm": 1.248263955116272, "learning_rate": 6.060606060606061e-07, "loss": 0.2139, "mean_token_accuracy": 0.9814836531877518, "step": 494 }, { "epoch": 82.57142857142857, "grad_norm": 1.4303299188613892, "learning_rate": 5.050505050505052e-07, "loss": 0.2466, "mean_token_accuracy": 0.9783899486064911, "step": 495 }, { "epoch": 82.76190476190476, "grad_norm": 1.4656988382339478, "learning_rate": 4.0404040404040405e-07, "loss": 0.2469, "mean_token_accuracy": 0.9803285598754883, "step": 496 }, { "epoch": 82.95238095238095, "grad_norm": 1.3924672603607178, "learning_rate": 3.0303030303030305e-07, "loss": 0.2375, "mean_token_accuracy": 0.9797345548868179, "step": 497 }, { "epoch": 83.0, "grad_norm": 0.9879482388496399, "learning_rate": 2.0202020202020202e-07, "loss": 0.0395, "mean_token_accuracy": 0.9838709831237793, "step": 498 }, { "epoch": 83.19047619047619, "grad_norm": 1.2162104845046997, "learning_rate": 1.0101010101010101e-07, "loss": 0.2433, "mean_token_accuracy": 0.981399655342102, "step": 499 }, { "epoch": 83.38095238095238, "grad_norm": 1.2492247819900513, "learning_rate": 0.0, "loss": 0.2299, "mean_token_accuracy": 0.9802171587944031, "step": 500 } ], "logging_steps": 1, "max_steps": 500, "num_input_tokens_seen": 0, "num_train_epochs": 100, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2203866148700160.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }