| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 83.38095238095238, | |
| "eval_steps": 500, | |
| "global_step": 500, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.19047619047619047, | |
| "grad_norm": 33.78459548950195, | |
| "learning_rate": 1e-05, | |
| "loss": 14.2748, | |
| "mean_token_accuracy": 0.4245416074991226, | |
| "step": 1 | |
| }, | |
| { | |
| "epoch": 0.38095238095238093, | |
| "grad_norm": 34.141048431396484, | |
| "learning_rate": 2e-05, | |
| "loss": 14.9063, | |
| "mean_token_accuracy": 0.42434193193912506, | |
| "step": 2 | |
| }, | |
| { | |
| "epoch": 0.5714285714285714, | |
| "grad_norm": 33.89708709716797, | |
| "learning_rate": 3e-05, | |
| "loss": 14.4293, | |
| "mean_token_accuracy": 0.42967987805604935, | |
| "step": 3 | |
| }, | |
| { | |
| "epoch": 0.7619047619047619, | |
| "grad_norm": 21.82135009765625, | |
| "learning_rate": 4e-05, | |
| "loss": 13.1187, | |
| "mean_token_accuracy": 0.4886682406067848, | |
| "step": 4 | |
| }, | |
| { | |
| "epoch": 0.9523809523809523, | |
| "grad_norm": 19.052448272705078, | |
| "learning_rate": 5e-05, | |
| "loss": 11.6617, | |
| "mean_token_accuracy": 0.5300922393798828, | |
| "step": 5 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "grad_norm": 10.372604370117188, | |
| "learning_rate": 4.98989898989899e-05, | |
| "loss": 1.9845, | |
| "mean_token_accuracy": 0.6190476417541504, | |
| "step": 6 | |
| }, | |
| { | |
| "epoch": 1.1904761904761905, | |
| "grad_norm": 18.249330520629883, | |
| "learning_rate": 4.97979797979798e-05, | |
| "loss": 9.8237, | |
| "mean_token_accuracy": 0.58917336165905, | |
| "step": 7 | |
| }, | |
| { | |
| "epoch": 1.380952380952381, | |
| "grad_norm": 18.177717208862305, | |
| "learning_rate": 4.9696969696969694e-05, | |
| "loss": 9.7575, | |
| "mean_token_accuracy": 0.5883309841156006, | |
| "step": 8 | |
| }, | |
| { | |
| "epoch": 1.5714285714285714, | |
| "grad_norm": 16.09309196472168, | |
| "learning_rate": 4.9595959595959594e-05, | |
| "loss": 9.3943, | |
| "mean_token_accuracy": 0.6104246228933334, | |
| "step": 9 | |
| }, | |
| { | |
| "epoch": 1.7619047619047619, | |
| "grad_norm": 14.678476333618164, | |
| "learning_rate": 4.94949494949495e-05, | |
| "loss": 8.6018, | |
| "mean_token_accuracy": 0.6411420404911041, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 1.9523809523809523, | |
| "grad_norm": 12.80629825592041, | |
| "learning_rate": 4.93939393939394e-05, | |
| "loss": 7.9568, | |
| "mean_token_accuracy": 0.6764184236526489, | |
| "step": 11 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "grad_norm": 9.918559074401855, | |
| "learning_rate": 4.92929292929293e-05, | |
| "loss": 1.4247, | |
| "mean_token_accuracy": 0.75, | |
| "step": 12 | |
| }, | |
| { | |
| "epoch": 2.1904761904761907, | |
| "grad_norm": 11.65300464630127, | |
| "learning_rate": 4.919191919191919e-05, | |
| "loss": 7.3849, | |
| "mean_token_accuracy": 0.6941855251789093, | |
| "step": 13 | |
| }, | |
| { | |
| "epoch": 2.380952380952381, | |
| "grad_norm": 11.127327919006348, | |
| "learning_rate": 4.909090909090909e-05, | |
| "loss": 6.7104, | |
| "mean_token_accuracy": 0.7069735676050186, | |
| "step": 14 | |
| }, | |
| { | |
| "epoch": 2.571428571428571, | |
| "grad_norm": 11.559555053710938, | |
| "learning_rate": 4.898989898989899e-05, | |
| "loss": 7.0902, | |
| "mean_token_accuracy": 0.709569051861763, | |
| "step": 15 | |
| }, | |
| { | |
| "epoch": 2.761904761904762, | |
| "grad_norm": 10.838669776916504, | |
| "learning_rate": 4.888888888888889e-05, | |
| "loss": 6.7901, | |
| "mean_token_accuracy": 0.713655412197113, | |
| "step": 16 | |
| }, | |
| { | |
| "epoch": 2.9523809523809526, | |
| "grad_norm": 10.266611099243164, | |
| "learning_rate": 4.878787878787879e-05, | |
| "loss": 6.4548, | |
| "mean_token_accuracy": 0.7244278490543365, | |
| "step": 17 | |
| }, | |
| { | |
| "epoch": 3.0, | |
| "grad_norm": 5.915023326873779, | |
| "learning_rate": 4.868686868686869e-05, | |
| "loss": 0.636, | |
| "mean_token_accuracy": 0.8730158805847168, | |
| "step": 18 | |
| }, | |
| { | |
| "epoch": 3.1904761904761907, | |
| "grad_norm": 9.826017379760742, | |
| "learning_rate": 4.858585858585859e-05, | |
| "loss": 5.6655, | |
| "mean_token_accuracy": 0.7555368840694427, | |
| "step": 19 | |
| }, | |
| { | |
| "epoch": 3.380952380952381, | |
| "grad_norm": 9.213407516479492, | |
| "learning_rate": 4.848484848484849e-05, | |
| "loss": 6.4954, | |
| "mean_token_accuracy": 0.7222279012203217, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 3.571428571428571, | |
| "grad_norm": 9.642789840698242, | |
| "learning_rate": 4.838383838383839e-05, | |
| "loss": 5.1397, | |
| "mean_token_accuracy": 0.7691315412521362, | |
| "step": 21 | |
| }, | |
| { | |
| "epoch": 3.761904761904762, | |
| "grad_norm": 8.594555854797363, | |
| "learning_rate": 4.828282828282829e-05, | |
| "loss": 5.4342, | |
| "mean_token_accuracy": 0.7607319056987762, | |
| "step": 22 | |
| }, | |
| { | |
| "epoch": 3.9523809523809526, | |
| "grad_norm": 8.79131031036377, | |
| "learning_rate": 4.8181818181818186e-05, | |
| "loss": 5.7146, | |
| "mean_token_accuracy": 0.7484780848026276, | |
| "step": 23 | |
| }, | |
| { | |
| "epoch": 4.0, | |
| "grad_norm": 6.953114032745361, | |
| "learning_rate": 4.808080808080808e-05, | |
| "loss": 1.4211, | |
| "mean_token_accuracy": 0.7580645084381104, | |
| "step": 24 | |
| }, | |
| { | |
| "epoch": 4.190476190476191, | |
| "grad_norm": 8.912933349609375, | |
| "learning_rate": 4.797979797979798e-05, | |
| "loss": 4.9729, | |
| "mean_token_accuracy": 0.7652112394571304, | |
| "step": 25 | |
| }, | |
| { | |
| "epoch": 4.380952380952381, | |
| "grad_norm": 9.128190994262695, | |
| "learning_rate": 4.787878787878788e-05, | |
| "loss": 4.9376, | |
| "mean_token_accuracy": 0.7736384719610214, | |
| "step": 26 | |
| }, | |
| { | |
| "epoch": 4.571428571428571, | |
| "grad_norm": 9.021340370178223, | |
| "learning_rate": 4.7777777777777784e-05, | |
| "loss": 5.1022, | |
| "mean_token_accuracy": 0.7747573852539062, | |
| "step": 27 | |
| }, | |
| { | |
| "epoch": 4.761904761904762, | |
| "grad_norm": 8.445326805114746, | |
| "learning_rate": 4.7676767676767684e-05, | |
| "loss": 4.4903, | |
| "mean_token_accuracy": 0.8014376759529114, | |
| "step": 28 | |
| }, | |
| { | |
| "epoch": 4.9523809523809526, | |
| "grad_norm": 8.269598960876465, | |
| "learning_rate": 4.7575757575757576e-05, | |
| "loss": 4.7027, | |
| "mean_token_accuracy": 0.7928940802812576, | |
| "step": 29 | |
| }, | |
| { | |
| "epoch": 5.0, | |
| "grad_norm": 4.256129264831543, | |
| "learning_rate": 4.7474747474747476e-05, | |
| "loss": 1.1768, | |
| "mean_token_accuracy": 0.8405796885490417, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 5.190476190476191, | |
| "grad_norm": 7.8270978927612305, | |
| "learning_rate": 4.7373737373737375e-05, | |
| "loss": 4.2699, | |
| "mean_token_accuracy": 0.8052034825086594, | |
| "step": 31 | |
| }, | |
| { | |
| "epoch": 5.380952380952381, | |
| "grad_norm": 7.741850852966309, | |
| "learning_rate": 4.7272727272727275e-05, | |
| "loss": 3.9571, | |
| "mean_token_accuracy": 0.8226524442434311, | |
| "step": 32 | |
| }, | |
| { | |
| "epoch": 5.571428571428571, | |
| "grad_norm": 7.062904357910156, | |
| "learning_rate": 4.7171717171717174e-05, | |
| "loss": 4.1547, | |
| "mean_token_accuracy": 0.8154689371585846, | |
| "step": 33 | |
| }, | |
| { | |
| "epoch": 5.761904761904762, | |
| "grad_norm": 7.048011779785156, | |
| "learning_rate": 4.7070707070707074e-05, | |
| "loss": 4.4063, | |
| "mean_token_accuracy": 0.8031313121318817, | |
| "step": 34 | |
| }, | |
| { | |
| "epoch": 5.9523809523809526, | |
| "grad_norm": 7.0800580978393555, | |
| "learning_rate": 4.696969696969697e-05, | |
| "loss": 3.6279, | |
| "mean_token_accuracy": 0.8297399282455444, | |
| "step": 35 | |
| }, | |
| { | |
| "epoch": 6.0, | |
| "grad_norm": 7.842761993408203, | |
| "learning_rate": 4.686868686868687e-05, | |
| "loss": 1.2107, | |
| "mean_token_accuracy": 0.8068181872367859, | |
| "step": 36 | |
| }, | |
| { | |
| "epoch": 6.190476190476191, | |
| "grad_norm": 7.796157360076904, | |
| "learning_rate": 4.676767676767677e-05, | |
| "loss": 3.3978, | |
| "mean_token_accuracy": 0.8329954296350479, | |
| "step": 37 | |
| }, | |
| { | |
| "epoch": 6.380952380952381, | |
| "grad_norm": 6.457103252410889, | |
| "learning_rate": 4.666666666666667e-05, | |
| "loss": 3.451, | |
| "mean_token_accuracy": 0.8273660093545914, | |
| "step": 38 | |
| }, | |
| { | |
| "epoch": 6.571428571428571, | |
| "grad_norm": 6.003915786743164, | |
| "learning_rate": 4.656565656565657e-05, | |
| "loss": 3.5587, | |
| "mean_token_accuracy": 0.83831487596035, | |
| "step": 39 | |
| }, | |
| { | |
| "epoch": 6.761904761904762, | |
| "grad_norm": 6.043710231781006, | |
| "learning_rate": 4.6464646464646464e-05, | |
| "loss": 3.5422, | |
| "mean_token_accuracy": 0.8222462385892868, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 6.9523809523809526, | |
| "grad_norm": 6.391598701477051, | |
| "learning_rate": 4.636363636363636e-05, | |
| "loss": 3.2658, | |
| "mean_token_accuracy": 0.856766939163208, | |
| "step": 41 | |
| }, | |
| { | |
| "epoch": 7.0, | |
| "grad_norm": 5.940098285675049, | |
| "learning_rate": 4.626262626262626e-05, | |
| "loss": 0.7579, | |
| "mean_token_accuracy": 0.8301886916160583, | |
| "step": 42 | |
| }, | |
| { | |
| "epoch": 7.190476190476191, | |
| "grad_norm": 6.040279388427734, | |
| "learning_rate": 4.616161616161616e-05, | |
| "loss": 2.7243, | |
| "mean_token_accuracy": 0.8692310005426407, | |
| "step": 43 | |
| }, | |
| { | |
| "epoch": 7.380952380952381, | |
| "grad_norm": 5.645506858825684, | |
| "learning_rate": 4.606060606060607e-05, | |
| "loss": 2.701, | |
| "mean_token_accuracy": 0.8647979497909546, | |
| "step": 44 | |
| }, | |
| { | |
| "epoch": 7.571428571428571, | |
| "grad_norm": 5.126684188842773, | |
| "learning_rate": 4.595959595959596e-05, | |
| "loss": 2.8655, | |
| "mean_token_accuracy": 0.8684723079204559, | |
| "step": 45 | |
| }, | |
| { | |
| "epoch": 7.761904761904762, | |
| "grad_norm": 8.235642433166504, | |
| "learning_rate": 4.585858585858586e-05, | |
| "loss": 2.9052, | |
| "mean_token_accuracy": 0.8446438163518906, | |
| "step": 46 | |
| }, | |
| { | |
| "epoch": 7.9523809523809526, | |
| "grad_norm": 6.074913501739502, | |
| "learning_rate": 4.575757575757576e-05, | |
| "loss": 2.8831, | |
| "mean_token_accuracy": 0.857246458530426, | |
| "step": 47 | |
| }, | |
| { | |
| "epoch": 8.0, | |
| "grad_norm": 4.886857986450195, | |
| "learning_rate": 4.565656565656566e-05, | |
| "loss": 0.8029, | |
| "mean_token_accuracy": 0.8294573426246643, | |
| "step": 48 | |
| }, | |
| { | |
| "epoch": 8.19047619047619, | |
| "grad_norm": 6.794694900512695, | |
| "learning_rate": 4.555555555555556e-05, | |
| "loss": 2.4927, | |
| "mean_token_accuracy": 0.8782062977552414, | |
| "step": 49 | |
| }, | |
| { | |
| "epoch": 8.380952380952381, | |
| "grad_norm": 5.690680503845215, | |
| "learning_rate": 4.545454545454546e-05, | |
| "loss": 1.9744, | |
| "mean_token_accuracy": 0.8949980139732361, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 8.571428571428571, | |
| "grad_norm": 9.415908813476562, | |
| "learning_rate": 4.535353535353535e-05, | |
| "loss": 2.2432, | |
| "mean_token_accuracy": 0.8826991468667984, | |
| "step": 51 | |
| }, | |
| { | |
| "epoch": 8.761904761904763, | |
| "grad_norm": 7.901670932769775, | |
| "learning_rate": 4.525252525252526e-05, | |
| "loss": 2.2805, | |
| "mean_token_accuracy": 0.8890593945980072, | |
| "step": 52 | |
| }, | |
| { | |
| "epoch": 8.952380952380953, | |
| "grad_norm": 6.918704986572266, | |
| "learning_rate": 4.515151515151516e-05, | |
| "loss": 2.5343, | |
| "mean_token_accuracy": 0.8712608069181442, | |
| "step": 53 | |
| }, | |
| { | |
| "epoch": 9.0, | |
| "grad_norm": 12.76561450958252, | |
| "learning_rate": 4.5050505050505056e-05, | |
| "loss": 0.576, | |
| "mean_token_accuracy": 0.8529411554336548, | |
| "step": 54 | |
| }, | |
| { | |
| "epoch": 9.19047619047619, | |
| "grad_norm": 6.143138408660889, | |
| "learning_rate": 4.494949494949495e-05, | |
| "loss": 1.878, | |
| "mean_token_accuracy": 0.9020879119634628, | |
| "step": 55 | |
| }, | |
| { | |
| "epoch": 9.380952380952381, | |
| "grad_norm": 7.497737884521484, | |
| "learning_rate": 4.484848484848485e-05, | |
| "loss": 1.9871, | |
| "mean_token_accuracy": 0.8944180905818939, | |
| "step": 56 | |
| }, | |
| { | |
| "epoch": 9.571428571428571, | |
| "grad_norm": 5.427354335784912, | |
| "learning_rate": 4.474747474747475e-05, | |
| "loss": 1.9095, | |
| "mean_token_accuracy": 0.9023730456829071, | |
| "step": 57 | |
| }, | |
| { | |
| "epoch": 9.761904761904763, | |
| "grad_norm": 5.814023017883301, | |
| "learning_rate": 4.464646464646465e-05, | |
| "loss": 1.8084, | |
| "mean_token_accuracy": 0.9020061939954758, | |
| "step": 58 | |
| }, | |
| { | |
| "epoch": 9.952380952380953, | |
| "grad_norm": 6.965571403503418, | |
| "learning_rate": 4.454545454545455e-05, | |
| "loss": 1.7746, | |
| "mean_token_accuracy": 0.9095794558525085, | |
| "step": 59 | |
| }, | |
| { | |
| "epoch": 10.0, | |
| "grad_norm": 6.048158168792725, | |
| "learning_rate": 4.4444444444444447e-05, | |
| "loss": 0.4674, | |
| "mean_token_accuracy": 0.9152542352676392, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 10.19047619047619, | |
| "grad_norm": 6.400238513946533, | |
| "learning_rate": 4.4343434343434346e-05, | |
| "loss": 1.4747, | |
| "mean_token_accuracy": 0.9173053950071335, | |
| "step": 61 | |
| }, | |
| { | |
| "epoch": 10.380952380952381, | |
| "grad_norm": 5.616025924682617, | |
| "learning_rate": 4.4242424242424246e-05, | |
| "loss": 1.4234, | |
| "mean_token_accuracy": 0.9245103895664215, | |
| "step": 62 | |
| }, | |
| { | |
| "epoch": 10.571428571428571, | |
| "grad_norm": 6.788946628570557, | |
| "learning_rate": 4.4141414141414145e-05, | |
| "loss": 1.6027, | |
| "mean_token_accuracy": 0.9176820814609528, | |
| "step": 63 | |
| }, | |
| { | |
| "epoch": 10.761904761904763, | |
| "grad_norm": 6.084983825683594, | |
| "learning_rate": 4.4040404040404044e-05, | |
| "loss": 1.4259, | |
| "mean_token_accuracy": 0.9250814765691757, | |
| "step": 64 | |
| }, | |
| { | |
| "epoch": 10.952380952380953, | |
| "grad_norm": 10.394392967224121, | |
| "learning_rate": 4.3939393939393944e-05, | |
| "loss": 1.2998, | |
| "mean_token_accuracy": 0.9314595013856888, | |
| "step": 65 | |
| }, | |
| { | |
| "epoch": 11.0, | |
| "grad_norm": 4.715174198150635, | |
| "learning_rate": 4.383838383838384e-05, | |
| "loss": 0.2015, | |
| "mean_token_accuracy": 0.9506173133850098, | |
| "step": 66 | |
| }, | |
| { | |
| "epoch": 11.19047619047619, | |
| "grad_norm": 4.792293071746826, | |
| "learning_rate": 4.3737373737373736e-05, | |
| "loss": 1.2582, | |
| "mean_token_accuracy": 0.9351158142089844, | |
| "step": 67 | |
| }, | |
| { | |
| "epoch": 11.380952380952381, | |
| "grad_norm": 7.185492515563965, | |
| "learning_rate": 4.3636363636363636e-05, | |
| "loss": 1.025, | |
| "mean_token_accuracy": 0.9418339878320694, | |
| "step": 68 | |
| }, | |
| { | |
| "epoch": 11.571428571428571, | |
| "grad_norm": 6.083255290985107, | |
| "learning_rate": 4.3535353535353535e-05, | |
| "loss": 1.0012, | |
| "mean_token_accuracy": 0.9446901679039001, | |
| "step": 69 | |
| }, | |
| { | |
| "epoch": 11.761904761904763, | |
| "grad_norm": 8.141711235046387, | |
| "learning_rate": 4.343434343434344e-05, | |
| "loss": 1.2278, | |
| "mean_token_accuracy": 0.9310520589351654, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 11.952380952380953, | |
| "grad_norm": 9.146880149841309, | |
| "learning_rate": 4.3333333333333334e-05, | |
| "loss": 1.0842, | |
| "mean_token_accuracy": 0.9404759407043457, | |
| "step": 71 | |
| }, | |
| { | |
| "epoch": 12.0, | |
| "grad_norm": 3.645364761352539, | |
| "learning_rate": 4.3232323232323234e-05, | |
| "loss": 0.1553, | |
| "mean_token_accuracy": 0.9714285731315613, | |
| "step": 72 | |
| }, | |
| { | |
| "epoch": 12.19047619047619, | |
| "grad_norm": 7.048225402832031, | |
| "learning_rate": 4.313131313131313e-05, | |
| "loss": 1.0319, | |
| "mean_token_accuracy": 0.9446324110031128, | |
| "step": 73 | |
| }, | |
| { | |
| "epoch": 12.380952380952381, | |
| "grad_norm": 6.668647289276123, | |
| "learning_rate": 4.303030303030303e-05, | |
| "loss": 0.8348, | |
| "mean_token_accuracy": 0.9561943113803864, | |
| "step": 74 | |
| }, | |
| { | |
| "epoch": 12.571428571428571, | |
| "grad_norm": 7.347132205963135, | |
| "learning_rate": 4.292929292929293e-05, | |
| "loss": 0.8571, | |
| "mean_token_accuracy": 0.9449830502271652, | |
| "step": 75 | |
| }, | |
| { | |
| "epoch": 12.761904761904763, | |
| "grad_norm": 5.543299674987793, | |
| "learning_rate": 4.282828282828283e-05, | |
| "loss": 0.9421, | |
| "mean_token_accuracy": 0.9508587419986725, | |
| "step": 76 | |
| }, | |
| { | |
| "epoch": 12.952380952380953, | |
| "grad_norm": 6.999424934387207, | |
| "learning_rate": 4.2727272727272724e-05, | |
| "loss": 0.6839, | |
| "mean_token_accuracy": 0.9609730541706085, | |
| "step": 77 | |
| }, | |
| { | |
| "epoch": 13.0, | |
| "grad_norm": 2.92433762550354, | |
| "learning_rate": 4.262626262626263e-05, | |
| "loss": 0.1323, | |
| "mean_token_accuracy": 0.9838709831237793, | |
| "step": 78 | |
| }, | |
| { | |
| "epoch": 13.19047619047619, | |
| "grad_norm": 5.790960311889648, | |
| "learning_rate": 4.252525252525253e-05, | |
| "loss": 0.7111, | |
| "mean_token_accuracy": 0.9593389332294464, | |
| "step": 79 | |
| }, | |
| { | |
| "epoch": 13.380952380952381, | |
| "grad_norm": 5.800691604614258, | |
| "learning_rate": 4.242424242424243e-05, | |
| "loss": 0.6327, | |
| "mean_token_accuracy": 0.9631912261247635, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 13.571428571428571, | |
| "grad_norm": 5.627686977386475, | |
| "learning_rate": 4.232323232323233e-05, | |
| "loss": 0.6079, | |
| "mean_token_accuracy": 0.961370512843132, | |
| "step": 81 | |
| }, | |
| { | |
| "epoch": 13.761904761904763, | |
| "grad_norm": 7.996088027954102, | |
| "learning_rate": 4.222222222222222e-05, | |
| "loss": 0.578, | |
| "mean_token_accuracy": 0.9649683386087418, | |
| "step": 82 | |
| }, | |
| { | |
| "epoch": 13.952380952380953, | |
| "grad_norm": 6.650062084197998, | |
| "learning_rate": 4.212121212121212e-05, | |
| "loss": 0.738, | |
| "mean_token_accuracy": 0.9565856605768204, | |
| "step": 83 | |
| }, | |
| { | |
| "epoch": 14.0, | |
| "grad_norm": 3.682978630065918, | |
| "learning_rate": 4.202020202020202e-05, | |
| "loss": 0.1826, | |
| "mean_token_accuracy": 0.9818181991577148, | |
| "step": 84 | |
| }, | |
| { | |
| "epoch": 14.19047619047619, | |
| "grad_norm": 4.094846725463867, | |
| "learning_rate": 4.191919191919192e-05, | |
| "loss": 0.4917, | |
| "mean_token_accuracy": 0.9723720699548721, | |
| "step": 85 | |
| }, | |
| { | |
| "epoch": 14.380952380952381, | |
| "grad_norm": 5.953057289123535, | |
| "learning_rate": 4.181818181818182e-05, | |
| "loss": 0.4787, | |
| "mean_token_accuracy": 0.9700902253389359, | |
| "step": 86 | |
| }, | |
| { | |
| "epoch": 14.571428571428571, | |
| "grad_norm": 4.5836591720581055, | |
| "learning_rate": 4.171717171717172e-05, | |
| "loss": 0.5792, | |
| "mean_token_accuracy": 0.9712613523006439, | |
| "step": 87 | |
| }, | |
| { | |
| "epoch": 14.761904761904763, | |
| "grad_norm": 4.867373943328857, | |
| "learning_rate": 4.161616161616162e-05, | |
| "loss": 0.4702, | |
| "mean_token_accuracy": 0.9780033379793167, | |
| "step": 88 | |
| }, | |
| { | |
| "epoch": 14.952380952380953, | |
| "grad_norm": 7.761333465576172, | |
| "learning_rate": 4.151515151515152e-05, | |
| "loss": 0.6332, | |
| "mean_token_accuracy": 0.9641157388687134, | |
| "step": 89 | |
| }, | |
| { | |
| "epoch": 15.0, | |
| "grad_norm": 4.875545501708984, | |
| "learning_rate": 4.141414141414142e-05, | |
| "loss": 0.1378, | |
| "mean_token_accuracy": 0.98591548204422, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 15.19047619047619, | |
| "grad_norm": 4.117421627044678, | |
| "learning_rate": 4.131313131313132e-05, | |
| "loss": 0.4463, | |
| "mean_token_accuracy": 0.9724489748477936, | |
| "step": 91 | |
| }, | |
| { | |
| "epoch": 15.380952380952381, | |
| "grad_norm": 3.252460241317749, | |
| "learning_rate": 4.1212121212121216e-05, | |
| "loss": 0.3858, | |
| "mean_token_accuracy": 0.9809663742780685, | |
| "step": 92 | |
| }, | |
| { | |
| "epoch": 15.571428571428571, | |
| "grad_norm": 4.330794334411621, | |
| "learning_rate": 4.111111111111111e-05, | |
| "loss": 0.4585, | |
| "mean_token_accuracy": 0.9748548269271851, | |
| "step": 93 | |
| }, | |
| { | |
| "epoch": 15.761904761904763, | |
| "grad_norm": 5.096158027648926, | |
| "learning_rate": 4.101010101010101e-05, | |
| "loss": 0.4829, | |
| "mean_token_accuracy": 0.9708511531352997, | |
| "step": 94 | |
| }, | |
| { | |
| "epoch": 15.952380952380953, | |
| "grad_norm": 6.11644172668457, | |
| "learning_rate": 4.0909090909090915e-05, | |
| "loss": 0.4374, | |
| "mean_token_accuracy": 0.974689856171608, | |
| "step": 95 | |
| }, | |
| { | |
| "epoch": 16.0, | |
| "grad_norm": 2.1705079078674316, | |
| "learning_rate": 4.0808080808080814e-05, | |
| "loss": 0.0851, | |
| "mean_token_accuracy": 0.9838709831237793, | |
| "step": 96 | |
| }, | |
| { | |
| "epoch": 16.19047619047619, | |
| "grad_norm": 3.2492971420288086, | |
| "learning_rate": 4.070707070707071e-05, | |
| "loss": 0.3638, | |
| "mean_token_accuracy": 0.9768412113189697, | |
| "step": 97 | |
| }, | |
| { | |
| "epoch": 16.38095238095238, | |
| "grad_norm": 2.8683860301971436, | |
| "learning_rate": 4.0606060606060606e-05, | |
| "loss": 0.3437, | |
| "mean_token_accuracy": 0.9768141210079193, | |
| "step": 98 | |
| }, | |
| { | |
| "epoch": 16.571428571428573, | |
| "grad_norm": 3.508230686187744, | |
| "learning_rate": 4.0505050505050506e-05, | |
| "loss": 0.354, | |
| "mean_token_accuracy": 0.9778662770986557, | |
| "step": 99 | |
| }, | |
| { | |
| "epoch": 16.761904761904763, | |
| "grad_norm": 3.8338069915771484, | |
| "learning_rate": 4.0404040404040405e-05, | |
| "loss": 0.3948, | |
| "mean_token_accuracy": 0.973381832242012, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 16.952380952380953, | |
| "grad_norm": 4.676501750946045, | |
| "learning_rate": 4.0303030303030305e-05, | |
| "loss": 0.3893, | |
| "mean_token_accuracy": 0.9753514975309372, | |
| "step": 101 | |
| }, | |
| { | |
| "epoch": 17.0, | |
| "grad_norm": 4.8052287101745605, | |
| "learning_rate": 4.0202020202020204e-05, | |
| "loss": 0.1183, | |
| "mean_token_accuracy": 0.9649122953414917, | |
| "step": 102 | |
| }, | |
| { | |
| "epoch": 17.19047619047619, | |
| "grad_norm": 3.2596077919006348, | |
| "learning_rate": 4.01010101010101e-05, | |
| "loss": 0.3596, | |
| "mean_token_accuracy": 0.9725935012102127, | |
| "step": 103 | |
| }, | |
| { | |
| "epoch": 17.38095238095238, | |
| "grad_norm": 2.6120784282684326, | |
| "learning_rate": 4e-05, | |
| "loss": 0.3414, | |
| "mean_token_accuracy": 0.9788288474082947, | |
| "step": 104 | |
| }, | |
| { | |
| "epoch": 17.571428571428573, | |
| "grad_norm": 3.26759934425354, | |
| "learning_rate": 3.98989898989899e-05, | |
| "loss": 0.3576, | |
| "mean_token_accuracy": 0.9772270619869232, | |
| "step": 105 | |
| }, | |
| { | |
| "epoch": 17.761904761904763, | |
| "grad_norm": 3.644747734069824, | |
| "learning_rate": 3.97979797979798e-05, | |
| "loss": 0.3324, | |
| "mean_token_accuracy": 0.9781567454338074, | |
| "step": 106 | |
| }, | |
| { | |
| "epoch": 17.952380952380953, | |
| "grad_norm": 4.441091537475586, | |
| "learning_rate": 3.96969696969697e-05, | |
| "loss": 0.3747, | |
| "mean_token_accuracy": 0.9714739322662354, | |
| "step": 107 | |
| }, | |
| { | |
| "epoch": 18.0, | |
| "grad_norm": 2.743286371231079, | |
| "learning_rate": 3.9595959595959594e-05, | |
| "loss": 0.0975, | |
| "mean_token_accuracy": 0.9696969985961914, | |
| "step": 108 | |
| }, | |
| { | |
| "epoch": 18.19047619047619, | |
| "grad_norm": 3.2830970287323, | |
| "learning_rate": 3.9494949494949494e-05, | |
| "loss": 0.3028, | |
| "mean_token_accuracy": 0.9811016768217087, | |
| "step": 109 | |
| }, | |
| { | |
| "epoch": 18.38095238095238, | |
| "grad_norm": 2.505868673324585, | |
| "learning_rate": 3.939393939393939e-05, | |
| "loss": 0.3186, | |
| "mean_token_accuracy": 0.9771904498338699, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 18.571428571428573, | |
| "grad_norm": 2.6549816131591797, | |
| "learning_rate": 3.929292929292929e-05, | |
| "loss": 0.3141, | |
| "mean_token_accuracy": 0.9759136885404587, | |
| "step": 111 | |
| }, | |
| { | |
| "epoch": 18.761904761904763, | |
| "grad_norm": 3.7054269313812256, | |
| "learning_rate": 3.91919191919192e-05, | |
| "loss": 0.3736, | |
| "mean_token_accuracy": 0.9732943773269653, | |
| "step": 112 | |
| }, | |
| { | |
| "epoch": 18.952380952380953, | |
| "grad_norm": 3.014618158340454, | |
| "learning_rate": 3.909090909090909e-05, | |
| "loss": 0.3676, | |
| "mean_token_accuracy": 0.9800769239664078, | |
| "step": 113 | |
| }, | |
| { | |
| "epoch": 19.0, | |
| "grad_norm": 4.232401371002197, | |
| "learning_rate": 3.898989898989899e-05, | |
| "loss": 0.1268, | |
| "mean_token_accuracy": 0.9577465057373047, | |
| "step": 114 | |
| }, | |
| { | |
| "epoch": 19.19047619047619, | |
| "grad_norm": 1.8361284732818604, | |
| "learning_rate": 3.888888888888889e-05, | |
| "loss": 0.2937, | |
| "mean_token_accuracy": 0.9818844795227051, | |
| "step": 115 | |
| }, | |
| { | |
| "epoch": 19.38095238095238, | |
| "grad_norm": 3.4175708293914795, | |
| "learning_rate": 3.878787878787879e-05, | |
| "loss": 0.2919, | |
| "mean_token_accuracy": 0.9831363707780838, | |
| "step": 116 | |
| }, | |
| { | |
| "epoch": 19.571428571428573, | |
| "grad_norm": 3.504340887069702, | |
| "learning_rate": 3.868686868686869e-05, | |
| "loss": 0.3739, | |
| "mean_token_accuracy": 0.9758433997631073, | |
| "step": 117 | |
| }, | |
| { | |
| "epoch": 19.761904761904763, | |
| "grad_norm": 3.542600154876709, | |
| "learning_rate": 3.858585858585859e-05, | |
| "loss": 0.3247, | |
| "mean_token_accuracy": 0.9753479957580566, | |
| "step": 118 | |
| }, | |
| { | |
| "epoch": 19.952380952380953, | |
| "grad_norm": 2.5886898040771484, | |
| "learning_rate": 3.848484848484848e-05, | |
| "loss": 0.3257, | |
| "mean_token_accuracy": 0.9774775803089142, | |
| "step": 119 | |
| }, | |
| { | |
| "epoch": 20.0, | |
| "grad_norm": 2.6909375190734863, | |
| "learning_rate": 3.838383838383838e-05, | |
| "loss": 0.0882, | |
| "mean_token_accuracy": 0.9682539701461792, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 20.19047619047619, | |
| "grad_norm": 2.958399772644043, | |
| "learning_rate": 3.828282828282829e-05, | |
| "loss": 0.3205, | |
| "mean_token_accuracy": 0.9724349826574326, | |
| "step": 121 | |
| }, | |
| { | |
| "epoch": 20.38095238095238, | |
| "grad_norm": 2.2972922325134277, | |
| "learning_rate": 3.818181818181819e-05, | |
| "loss": 0.2829, | |
| "mean_token_accuracy": 0.9813934862613678, | |
| "step": 122 | |
| }, | |
| { | |
| "epoch": 20.571428571428573, | |
| "grad_norm": 2.2647204399108887, | |
| "learning_rate": 3.8080808080808087e-05, | |
| "loss": 0.3087, | |
| "mean_token_accuracy": 0.9758166968822479, | |
| "step": 123 | |
| }, | |
| { | |
| "epoch": 20.761904761904763, | |
| "grad_norm": 2.4949004650115967, | |
| "learning_rate": 3.797979797979798e-05, | |
| "loss": 0.3143, | |
| "mean_token_accuracy": 0.9777243584394455, | |
| "step": 124 | |
| }, | |
| { | |
| "epoch": 20.952380952380953, | |
| "grad_norm": 2.5387442111968994, | |
| "learning_rate": 3.787878787878788e-05, | |
| "loss": 0.326, | |
| "mean_token_accuracy": 0.9755249470472336, | |
| "step": 125 | |
| }, | |
| { | |
| "epoch": 21.0, | |
| "grad_norm": 2.745015859603882, | |
| "learning_rate": 3.777777777777778e-05, | |
| "loss": 0.0842, | |
| "mean_token_accuracy": 0.9714285731315613, | |
| "step": 126 | |
| }, | |
| { | |
| "epoch": 21.19047619047619, | |
| "grad_norm": 1.7736639976501465, | |
| "learning_rate": 3.767676767676768e-05, | |
| "loss": 0.2777, | |
| "mean_token_accuracy": 0.9804743677377701, | |
| "step": 127 | |
| }, | |
| { | |
| "epoch": 21.38095238095238, | |
| "grad_norm": 2.391968011856079, | |
| "learning_rate": 3.757575757575758e-05, | |
| "loss": 0.2969, | |
| "mean_token_accuracy": 0.9765493422746658, | |
| "step": 128 | |
| }, | |
| { | |
| "epoch": 21.571428571428573, | |
| "grad_norm": 1.9384799003601074, | |
| "learning_rate": 3.747474747474748e-05, | |
| "loss": 0.2764, | |
| "mean_token_accuracy": 0.978370875120163, | |
| "step": 129 | |
| }, | |
| { | |
| "epoch": 21.761904761904763, | |
| "grad_norm": 2.363274097442627, | |
| "learning_rate": 3.7373737373737376e-05, | |
| "loss": 0.3086, | |
| "mean_token_accuracy": 0.9715951085090637, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 21.952380952380953, | |
| "grad_norm": 2.90826416015625, | |
| "learning_rate": 3.7272727272727276e-05, | |
| "loss": 0.3241, | |
| "mean_token_accuracy": 0.9738913327455521, | |
| "step": 131 | |
| }, | |
| { | |
| "epoch": 22.0, | |
| "grad_norm": 1.8676457405090332, | |
| "learning_rate": 3.7171717171717175e-05, | |
| "loss": 0.0867, | |
| "mean_token_accuracy": 0.9830508232116699, | |
| "step": 132 | |
| }, | |
| { | |
| "epoch": 22.19047619047619, | |
| "grad_norm": 2.1423661708831787, | |
| "learning_rate": 3.7070707070707075e-05, | |
| "loss": 0.2691, | |
| "mean_token_accuracy": 0.9791481345891953, | |
| "step": 133 | |
| }, | |
| { | |
| "epoch": 22.38095238095238, | |
| "grad_norm": 2.0479485988616943, | |
| "learning_rate": 3.6969696969696974e-05, | |
| "loss": 0.2898, | |
| "mean_token_accuracy": 0.9813213050365448, | |
| "step": 134 | |
| }, | |
| { | |
| "epoch": 22.571428571428573, | |
| "grad_norm": 2.566549777984619, | |
| "learning_rate": 3.686868686868687e-05, | |
| "loss": 0.3174, | |
| "mean_token_accuracy": 0.975700318813324, | |
| "step": 135 | |
| }, | |
| { | |
| "epoch": 22.761904761904763, | |
| "grad_norm": 2.541551351547241, | |
| "learning_rate": 3.6767676767676766e-05, | |
| "loss": 0.3205, | |
| "mean_token_accuracy": 0.978480726480484, | |
| "step": 136 | |
| }, | |
| { | |
| "epoch": 22.952380952380953, | |
| "grad_norm": 2.037262201309204, | |
| "learning_rate": 3.6666666666666666e-05, | |
| "loss": 0.2741, | |
| "mean_token_accuracy": 0.9802869260311127, | |
| "step": 137 | |
| }, | |
| { | |
| "epoch": 23.0, | |
| "grad_norm": 2.753689765930176, | |
| "learning_rate": 3.656565656565657e-05, | |
| "loss": 0.0844, | |
| "mean_token_accuracy": 0.9841269850730896, | |
| "step": 138 | |
| }, | |
| { | |
| "epoch": 23.19047619047619, | |
| "grad_norm": 1.9929062128067017, | |
| "learning_rate": 3.6464646464646465e-05, | |
| "loss": 0.2798, | |
| "mean_token_accuracy": 0.9800110459327698, | |
| "step": 139 | |
| }, | |
| { | |
| "epoch": 23.38095238095238, | |
| "grad_norm": 2.7327589988708496, | |
| "learning_rate": 3.6363636363636364e-05, | |
| "loss": 0.2671, | |
| "mean_token_accuracy": 0.9807360470294952, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 23.571428571428573, | |
| "grad_norm": 1.7482175827026367, | |
| "learning_rate": 3.6262626262626264e-05, | |
| "loss": 0.2965, | |
| "mean_token_accuracy": 0.9796760976314545, | |
| "step": 141 | |
| }, | |
| { | |
| "epoch": 23.761904761904763, | |
| "grad_norm": 2.599804639816284, | |
| "learning_rate": 3.616161616161616e-05, | |
| "loss": 0.3154, | |
| "mean_token_accuracy": 0.977075606584549, | |
| "step": 142 | |
| }, | |
| { | |
| "epoch": 23.952380952380953, | |
| "grad_norm": 2.482060194015503, | |
| "learning_rate": 3.606060606060606e-05, | |
| "loss": 0.3009, | |
| "mean_token_accuracy": 0.9737012088298798, | |
| "step": 143 | |
| }, | |
| { | |
| "epoch": 24.0, | |
| "grad_norm": 3.389758825302124, | |
| "learning_rate": 3.595959595959596e-05, | |
| "loss": 0.1225, | |
| "mean_token_accuracy": 0.9636363387107849, | |
| "step": 144 | |
| }, | |
| { | |
| "epoch": 24.19047619047619, | |
| "grad_norm": 1.8538786172866821, | |
| "learning_rate": 3.5858585858585855e-05, | |
| "loss": 0.2625, | |
| "mean_token_accuracy": 0.9796436280012131, | |
| "step": 145 | |
| }, | |
| { | |
| "epoch": 24.38095238095238, | |
| "grad_norm": 1.6289573907852173, | |
| "learning_rate": 3.575757575757576e-05, | |
| "loss": 0.2616, | |
| "mean_token_accuracy": 0.9804391115903854, | |
| "step": 146 | |
| }, | |
| { | |
| "epoch": 24.571428571428573, | |
| "grad_norm": 2.4140396118164062, | |
| "learning_rate": 3.565656565656566e-05, | |
| "loss": 0.3128, | |
| "mean_token_accuracy": 0.979373887181282, | |
| "step": 147 | |
| }, | |
| { | |
| "epoch": 24.761904761904763, | |
| "grad_norm": 2.182692766189575, | |
| "learning_rate": 3.555555555555556e-05, | |
| "loss": 0.2983, | |
| "mean_token_accuracy": 0.9793859571218491, | |
| "step": 148 | |
| }, | |
| { | |
| "epoch": 24.952380952380953, | |
| "grad_norm": 2.800553560256958, | |
| "learning_rate": 3.545454545454546e-05, | |
| "loss": 0.3566, | |
| "mean_token_accuracy": 0.9733032137155533, | |
| "step": 149 | |
| }, | |
| { | |
| "epoch": 25.0, | |
| "grad_norm": 1.8961296081542969, | |
| "learning_rate": 3.535353535353535e-05, | |
| "loss": 0.0623, | |
| "mean_token_accuracy": 0.9797979593276978, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 25.19047619047619, | |
| "grad_norm": 2.6031830310821533, | |
| "learning_rate": 3.525252525252525e-05, | |
| "loss": 0.307, | |
| "mean_token_accuracy": 0.9759431630373001, | |
| "step": 151 | |
| }, | |
| { | |
| "epoch": 25.38095238095238, | |
| "grad_norm": 1.7213940620422363, | |
| "learning_rate": 3.515151515151515e-05, | |
| "loss": 0.2605, | |
| "mean_token_accuracy": 0.9829924404621124, | |
| "step": 152 | |
| }, | |
| { | |
| "epoch": 25.571428571428573, | |
| "grad_norm": 2.169405221939087, | |
| "learning_rate": 3.505050505050505e-05, | |
| "loss": 0.2833, | |
| "mean_token_accuracy": 0.976715162396431, | |
| "step": 153 | |
| }, | |
| { | |
| "epoch": 25.761904761904763, | |
| "grad_norm": 2.126295566558838, | |
| "learning_rate": 3.494949494949495e-05, | |
| "loss": 0.2836, | |
| "mean_token_accuracy": 0.9775257259607315, | |
| "step": 154 | |
| }, | |
| { | |
| "epoch": 25.952380952380953, | |
| "grad_norm": 2.112752914428711, | |
| "learning_rate": 3.484848484848485e-05, | |
| "loss": 0.3001, | |
| "mean_token_accuracy": 0.9795974045991898, | |
| "step": 155 | |
| }, | |
| { | |
| "epoch": 26.0, | |
| "grad_norm": 2.9405832290649414, | |
| "learning_rate": 3.474747474747475e-05, | |
| "loss": 0.1069, | |
| "mean_token_accuracy": 0.9824561476707458, | |
| "step": 156 | |
| }, | |
| { | |
| "epoch": 26.19047619047619, | |
| "grad_norm": 1.8124560117721558, | |
| "learning_rate": 3.464646464646465e-05, | |
| "loss": 0.2694, | |
| "mean_token_accuracy": 0.982256755232811, | |
| "step": 157 | |
| }, | |
| { | |
| "epoch": 26.38095238095238, | |
| "grad_norm": 1.8597822189331055, | |
| "learning_rate": 3.454545454545455e-05, | |
| "loss": 0.2558, | |
| "mean_token_accuracy": 0.98062863945961, | |
| "step": 158 | |
| }, | |
| { | |
| "epoch": 26.571428571428573, | |
| "grad_norm": 1.6446207761764526, | |
| "learning_rate": 3.444444444444445e-05, | |
| "loss": 0.2587, | |
| "mean_token_accuracy": 0.9779441952705383, | |
| "step": 159 | |
| }, | |
| { | |
| "epoch": 26.761904761904763, | |
| "grad_norm": 2.2227869033813477, | |
| "learning_rate": 3.434343434343435e-05, | |
| "loss": 0.3241, | |
| "mean_token_accuracy": 0.9747696965932846, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 26.952380952380953, | |
| "grad_norm": 1.6738312244415283, | |
| "learning_rate": 3.424242424242424e-05, | |
| "loss": 0.2779, | |
| "mean_token_accuracy": 0.9778714776039124, | |
| "step": 161 | |
| }, | |
| { | |
| "epoch": 27.0, | |
| "grad_norm": 1.4880234003067017, | |
| "learning_rate": 3.414141414141414e-05, | |
| "loss": 0.0801, | |
| "mean_token_accuracy": 0.9838709831237793, | |
| "step": 162 | |
| }, | |
| { | |
| "epoch": 27.19047619047619, | |
| "grad_norm": 1.5148252248764038, | |
| "learning_rate": 3.4040404040404045e-05, | |
| "loss": 0.2581, | |
| "mean_token_accuracy": 0.980286031961441, | |
| "step": 163 | |
| }, | |
| { | |
| "epoch": 27.38095238095238, | |
| "grad_norm": 1.833160400390625, | |
| "learning_rate": 3.3939393939393945e-05, | |
| "loss": 0.2724, | |
| "mean_token_accuracy": 0.9760157763957977, | |
| "step": 164 | |
| }, | |
| { | |
| "epoch": 27.571428571428573, | |
| "grad_norm": 2.1366348266601562, | |
| "learning_rate": 3.3838383838383844e-05, | |
| "loss": 0.2916, | |
| "mean_token_accuracy": 0.9787898063659668, | |
| "step": 165 | |
| }, | |
| { | |
| "epoch": 27.761904761904763, | |
| "grad_norm": 2.5082993507385254, | |
| "learning_rate": 3.373737373737374e-05, | |
| "loss": 0.2929, | |
| "mean_token_accuracy": 0.9774486720561981, | |
| "step": 166 | |
| }, | |
| { | |
| "epoch": 27.952380952380953, | |
| "grad_norm": 2.1355273723602295, | |
| "learning_rate": 3.3636363636363636e-05, | |
| "loss": 0.2856, | |
| "mean_token_accuracy": 0.9789445698261261, | |
| "step": 167 | |
| }, | |
| { | |
| "epoch": 28.0, | |
| "grad_norm": 1.970436930656433, | |
| "learning_rate": 3.3535353535353536e-05, | |
| "loss": 0.0806, | |
| "mean_token_accuracy": 0.9692307710647583, | |
| "step": 168 | |
| }, | |
| { | |
| "epoch": 28.19047619047619, | |
| "grad_norm": 2.1435768604278564, | |
| "learning_rate": 3.3434343434343435e-05, | |
| "loss": 0.2658, | |
| "mean_token_accuracy": 0.9759610444307327, | |
| "step": 169 | |
| }, | |
| { | |
| "epoch": 28.38095238095238, | |
| "grad_norm": 1.6564626693725586, | |
| "learning_rate": 3.3333333333333335e-05, | |
| "loss": 0.2548, | |
| "mean_token_accuracy": 0.9793960750102997, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 28.571428571428573, | |
| "grad_norm": 1.7106664180755615, | |
| "learning_rate": 3.3232323232323234e-05, | |
| "loss": 0.255, | |
| "mean_token_accuracy": 0.9787760227918625, | |
| "step": 171 | |
| }, | |
| { | |
| "epoch": 28.761904761904763, | |
| "grad_norm": 2.1820991039276123, | |
| "learning_rate": 3.3131313131313134e-05, | |
| "loss": 0.3227, | |
| "mean_token_accuracy": 0.973702073097229, | |
| "step": 172 | |
| }, | |
| { | |
| "epoch": 28.952380952380953, | |
| "grad_norm": 1.7227038145065308, | |
| "learning_rate": 3.303030303030303e-05, | |
| "loss": 0.2653, | |
| "mean_token_accuracy": 0.9788563847541809, | |
| "step": 173 | |
| }, | |
| { | |
| "epoch": 29.0, | |
| "grad_norm": 1.6985877752304077, | |
| "learning_rate": 3.292929292929293e-05, | |
| "loss": 0.0653, | |
| "mean_token_accuracy": 0.9756097793579102, | |
| "step": 174 | |
| }, | |
| { | |
| "epoch": 29.19047619047619, | |
| "grad_norm": 1.70681631565094, | |
| "learning_rate": 3.282828282828283e-05, | |
| "loss": 0.2621, | |
| "mean_token_accuracy": 0.9808604121208191, | |
| "step": 175 | |
| }, | |
| { | |
| "epoch": 29.38095238095238, | |
| "grad_norm": 1.5982296466827393, | |
| "learning_rate": 3.272727272727273e-05, | |
| "loss": 0.2444, | |
| "mean_token_accuracy": 0.9789219200611115, | |
| "step": 176 | |
| }, | |
| { | |
| "epoch": 29.571428571428573, | |
| "grad_norm": 1.4115501642227173, | |
| "learning_rate": 3.2626262626262624e-05, | |
| "loss": 0.2386, | |
| "mean_token_accuracy": 0.9839699417352676, | |
| "step": 177 | |
| }, | |
| { | |
| "epoch": 29.761904761904763, | |
| "grad_norm": 2.2143611907958984, | |
| "learning_rate": 3.2525252525252524e-05, | |
| "loss": 0.3214, | |
| "mean_token_accuracy": 0.9736231416463852, | |
| "step": 178 | |
| }, | |
| { | |
| "epoch": 29.952380952380953, | |
| "grad_norm": 2.329328775405884, | |
| "learning_rate": 3.2424242424242423e-05, | |
| "loss": 0.2899, | |
| "mean_token_accuracy": 0.974274680018425, | |
| "step": 179 | |
| }, | |
| { | |
| "epoch": 30.0, | |
| "grad_norm": 1.8894615173339844, | |
| "learning_rate": 3.232323232323233e-05, | |
| "loss": 0.0873, | |
| "mean_token_accuracy": 0.970588207244873, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 30.19047619047619, | |
| "grad_norm": 1.8685792684555054, | |
| "learning_rate": 3.222222222222223e-05, | |
| "loss": 0.2713, | |
| "mean_token_accuracy": 0.9793071448802948, | |
| "step": 181 | |
| }, | |
| { | |
| "epoch": 30.38095238095238, | |
| "grad_norm": 1.6303725242614746, | |
| "learning_rate": 3.212121212121212e-05, | |
| "loss": 0.2602, | |
| "mean_token_accuracy": 0.978649765253067, | |
| "step": 182 | |
| }, | |
| { | |
| "epoch": 30.571428571428573, | |
| "grad_norm": 1.5414835214614868, | |
| "learning_rate": 3.202020202020202e-05, | |
| "loss": 0.2507, | |
| "mean_token_accuracy": 0.9816054552793503, | |
| "step": 183 | |
| }, | |
| { | |
| "epoch": 30.761904761904763, | |
| "grad_norm": 1.9461543560028076, | |
| "learning_rate": 3.191919191919192e-05, | |
| "loss": 0.2622, | |
| "mean_token_accuracy": 0.9799721091985703, | |
| "step": 184 | |
| }, | |
| { | |
| "epoch": 30.952380952380953, | |
| "grad_norm": 2.4515039920806885, | |
| "learning_rate": 3.181818181818182e-05, | |
| "loss": 0.316, | |
| "mean_token_accuracy": 0.9738900065422058, | |
| "step": 185 | |
| }, | |
| { | |
| "epoch": 31.0, | |
| "grad_norm": 2.3152859210968018, | |
| "learning_rate": 3.171717171717172e-05, | |
| "loss": 0.0924, | |
| "mean_token_accuracy": 0.9666666388511658, | |
| "step": 186 | |
| }, | |
| { | |
| "epoch": 31.19047619047619, | |
| "grad_norm": 1.5827226638793945, | |
| "learning_rate": 3.161616161616161e-05, | |
| "loss": 0.2548, | |
| "mean_token_accuracy": 0.9807614088058472, | |
| "step": 187 | |
| }, | |
| { | |
| "epoch": 31.38095238095238, | |
| "grad_norm": 1.5467098951339722, | |
| "learning_rate": 3.151515151515151e-05, | |
| "loss": 0.2567, | |
| "mean_token_accuracy": 0.9772002995014191, | |
| "step": 188 | |
| }, | |
| { | |
| "epoch": 31.571428571428573, | |
| "grad_norm": 1.5654078722000122, | |
| "learning_rate": 3.141414141414142e-05, | |
| "loss": 0.2523, | |
| "mean_token_accuracy": 0.9784552752971649, | |
| "step": 189 | |
| }, | |
| { | |
| "epoch": 31.761904761904763, | |
| "grad_norm": 1.6791102886199951, | |
| "learning_rate": 3.131313131313132e-05, | |
| "loss": 0.2749, | |
| "mean_token_accuracy": 0.9773024320602417, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 31.952380952380953, | |
| "grad_norm": 1.864105224609375, | |
| "learning_rate": 3.121212121212122e-05, | |
| "loss": 0.2938, | |
| "mean_token_accuracy": 0.9765942692756653, | |
| "step": 191 | |
| }, | |
| { | |
| "epoch": 32.0, | |
| "grad_norm": 1.214571475982666, | |
| "learning_rate": 3.111111111111111e-05, | |
| "loss": 0.0665, | |
| "mean_token_accuracy": 0.987500011920929, | |
| "step": 192 | |
| }, | |
| { | |
| "epoch": 32.19047619047619, | |
| "grad_norm": 1.4030119180679321, | |
| "learning_rate": 3.101010101010101e-05, | |
| "loss": 0.2415, | |
| "mean_token_accuracy": 0.9817796945571899, | |
| "step": 193 | |
| }, | |
| { | |
| "epoch": 32.38095238095238, | |
| "grad_norm": 1.6708261966705322, | |
| "learning_rate": 3.090909090909091e-05, | |
| "loss": 0.2582, | |
| "mean_token_accuracy": 0.9801040887832642, | |
| "step": 194 | |
| }, | |
| { | |
| "epoch": 32.57142857142857, | |
| "grad_norm": 1.4296513795852661, | |
| "learning_rate": 3.080808080808081e-05, | |
| "loss": 0.2493, | |
| "mean_token_accuracy": 0.9811757057905197, | |
| "step": 195 | |
| }, | |
| { | |
| "epoch": 32.76190476190476, | |
| "grad_norm": 1.7713197469711304, | |
| "learning_rate": 3.070707070707071e-05, | |
| "loss": 0.2823, | |
| "mean_token_accuracy": 0.9782667905092239, | |
| "step": 196 | |
| }, | |
| { | |
| "epoch": 32.95238095238095, | |
| "grad_norm": 2.032137632369995, | |
| "learning_rate": 3.060606060606061e-05, | |
| "loss": 0.294, | |
| "mean_token_accuracy": 0.9734672009944916, | |
| "step": 197 | |
| }, | |
| { | |
| "epoch": 33.0, | |
| "grad_norm": 2.334019660949707, | |
| "learning_rate": 3.050505050505051e-05, | |
| "loss": 0.0861, | |
| "mean_token_accuracy": 0.9726027250289917, | |
| "step": 198 | |
| }, | |
| { | |
| "epoch": 33.19047619047619, | |
| "grad_norm": 1.4779608249664307, | |
| "learning_rate": 3.0404040404040406e-05, | |
| "loss": 0.2537, | |
| "mean_token_accuracy": 0.981317549943924, | |
| "step": 199 | |
| }, | |
| { | |
| "epoch": 33.38095238095238, | |
| "grad_norm": 1.435577392578125, | |
| "learning_rate": 3.0303030303030306e-05, | |
| "loss": 0.2544, | |
| "mean_token_accuracy": 0.9813797920942307, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 33.57142857142857, | |
| "grad_norm": 1.8126311302185059, | |
| "learning_rate": 3.0202020202020205e-05, | |
| "loss": 0.2705, | |
| "mean_token_accuracy": 0.9765264093875885, | |
| "step": 201 | |
| }, | |
| { | |
| "epoch": 33.76190476190476, | |
| "grad_norm": 1.5598095655441284, | |
| "learning_rate": 3.01010101010101e-05, | |
| "loss": 0.2723, | |
| "mean_token_accuracy": 0.978124126791954, | |
| "step": 202 | |
| }, | |
| { | |
| "epoch": 33.95238095238095, | |
| "grad_norm": 1.8001117706298828, | |
| "learning_rate": 3e-05, | |
| "loss": 0.271, | |
| "mean_token_accuracy": 0.9785387814044952, | |
| "step": 203 | |
| }, | |
| { | |
| "epoch": 34.0, | |
| "grad_norm": 1.7313034534454346, | |
| "learning_rate": 2.98989898989899e-05, | |
| "loss": 0.0652, | |
| "mean_token_accuracy": 0.9746835231781006, | |
| "step": 204 | |
| }, | |
| { | |
| "epoch": 34.19047619047619, | |
| "grad_norm": 1.389072060585022, | |
| "learning_rate": 2.9797979797979796e-05, | |
| "loss": 0.242, | |
| "mean_token_accuracy": 0.9788109809160233, | |
| "step": 205 | |
| }, | |
| { | |
| "epoch": 34.38095238095238, | |
| "grad_norm": 1.434044599533081, | |
| "learning_rate": 2.96969696969697e-05, | |
| "loss": 0.2426, | |
| "mean_token_accuracy": 0.979528471827507, | |
| "step": 206 | |
| }, | |
| { | |
| "epoch": 34.57142857142857, | |
| "grad_norm": 1.9448174238204956, | |
| "learning_rate": 2.95959595959596e-05, | |
| "loss": 0.2695, | |
| "mean_token_accuracy": 0.9793160408735275, | |
| "step": 207 | |
| }, | |
| { | |
| "epoch": 34.76190476190476, | |
| "grad_norm": 1.85161554813385, | |
| "learning_rate": 2.9494949494949498e-05, | |
| "loss": 0.293, | |
| "mean_token_accuracy": 0.9727693498134613, | |
| "step": 208 | |
| }, | |
| { | |
| "epoch": 34.95238095238095, | |
| "grad_norm": 1.7662495374679565, | |
| "learning_rate": 2.9393939393939394e-05, | |
| "loss": 0.2817, | |
| "mean_token_accuracy": 0.9758803397417068, | |
| "step": 209 | |
| }, | |
| { | |
| "epoch": 35.0, | |
| "grad_norm": 1.3624759912490845, | |
| "learning_rate": 2.9292929292929294e-05, | |
| "loss": 0.0738, | |
| "mean_token_accuracy": 0.9848484992980957, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 35.19047619047619, | |
| "grad_norm": 1.622554063796997, | |
| "learning_rate": 2.9191919191919193e-05, | |
| "loss": 0.2493, | |
| "mean_token_accuracy": 0.9789364635944366, | |
| "step": 211 | |
| }, | |
| { | |
| "epoch": 35.38095238095238, | |
| "grad_norm": 1.7415611743927002, | |
| "learning_rate": 2.909090909090909e-05, | |
| "loss": 0.2849, | |
| "mean_token_accuracy": 0.9779055863618851, | |
| "step": 212 | |
| }, | |
| { | |
| "epoch": 35.57142857142857, | |
| "grad_norm": 1.585845947265625, | |
| "learning_rate": 2.898989898989899e-05, | |
| "loss": 0.2497, | |
| "mean_token_accuracy": 0.9807179868221283, | |
| "step": 213 | |
| }, | |
| { | |
| "epoch": 35.76190476190476, | |
| "grad_norm": 1.5177557468414307, | |
| "learning_rate": 2.8888888888888888e-05, | |
| "loss": 0.264, | |
| "mean_token_accuracy": 0.9775202721357346, | |
| "step": 214 | |
| }, | |
| { | |
| "epoch": 35.95238095238095, | |
| "grad_norm": 1.8757683038711548, | |
| "learning_rate": 2.878787878787879e-05, | |
| "loss": 0.2589, | |
| "mean_token_accuracy": 0.9773915261030197, | |
| "step": 215 | |
| }, | |
| { | |
| "epoch": 36.0, | |
| "grad_norm": 2.2826578617095947, | |
| "learning_rate": 2.868686868686869e-05, | |
| "loss": 0.0933, | |
| "mean_token_accuracy": 0.9491525292396545, | |
| "step": 216 | |
| }, | |
| { | |
| "epoch": 36.19047619047619, | |
| "grad_norm": 1.3637081384658813, | |
| "learning_rate": 2.8585858585858587e-05, | |
| "loss": 0.245, | |
| "mean_token_accuracy": 0.9781962931156158, | |
| "step": 217 | |
| }, | |
| { | |
| "epoch": 36.38095238095238, | |
| "grad_norm": 1.4664133787155151, | |
| "learning_rate": 2.8484848484848486e-05, | |
| "loss": 0.2521, | |
| "mean_token_accuracy": 0.9817428290843964, | |
| "step": 218 | |
| }, | |
| { | |
| "epoch": 36.57142857142857, | |
| "grad_norm": 1.5265666246414185, | |
| "learning_rate": 2.8383838383838386e-05, | |
| "loss": 0.2615, | |
| "mean_token_accuracy": 0.9806021302938461, | |
| "step": 219 | |
| }, | |
| { | |
| "epoch": 36.76190476190476, | |
| "grad_norm": 1.4322954416275024, | |
| "learning_rate": 2.8282828282828282e-05, | |
| "loss": 0.2599, | |
| "mean_token_accuracy": 0.9800188541412354, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 36.95238095238095, | |
| "grad_norm": 1.76764976978302, | |
| "learning_rate": 2.818181818181818e-05, | |
| "loss": 0.292, | |
| "mean_token_accuracy": 0.9746560305356979, | |
| "step": 221 | |
| }, | |
| { | |
| "epoch": 37.0, | |
| "grad_norm": 2.1554458141326904, | |
| "learning_rate": 2.808080808080808e-05, | |
| "loss": 0.0865, | |
| "mean_token_accuracy": 0.9682539701461792, | |
| "step": 222 | |
| }, | |
| { | |
| "epoch": 37.19047619047619, | |
| "grad_norm": 1.4079774618148804, | |
| "learning_rate": 2.7979797979797984e-05, | |
| "loss": 0.2359, | |
| "mean_token_accuracy": 0.9809356033802032, | |
| "step": 223 | |
| }, | |
| { | |
| "epoch": 37.38095238095238, | |
| "grad_norm": 1.8873682022094727, | |
| "learning_rate": 2.7878787878787883e-05, | |
| "loss": 0.2731, | |
| "mean_token_accuracy": 0.9777008444070816, | |
| "step": 224 | |
| }, | |
| { | |
| "epoch": 37.57142857142857, | |
| "grad_norm": 1.7195765972137451, | |
| "learning_rate": 2.777777777777778e-05, | |
| "loss": 0.2557, | |
| "mean_token_accuracy": 0.980317622423172, | |
| "step": 225 | |
| }, | |
| { | |
| "epoch": 37.76190476190476, | |
| "grad_norm": 1.5935289859771729, | |
| "learning_rate": 2.767676767676768e-05, | |
| "loss": 0.2663, | |
| "mean_token_accuracy": 0.9756544232368469, | |
| "step": 226 | |
| }, | |
| { | |
| "epoch": 37.95238095238095, | |
| "grad_norm": 1.626733660697937, | |
| "learning_rate": 2.7575757575757578e-05, | |
| "loss": 0.2668, | |
| "mean_token_accuracy": 0.9801195561885834, | |
| "step": 227 | |
| }, | |
| { | |
| "epoch": 38.0, | |
| "grad_norm": 2.378291368484497, | |
| "learning_rate": 2.7474747474747474e-05, | |
| "loss": 0.0872, | |
| "mean_token_accuracy": 0.9718309640884399, | |
| "step": 228 | |
| }, | |
| { | |
| "epoch": 38.19047619047619, | |
| "grad_norm": 1.4580754041671753, | |
| "learning_rate": 2.7373737373737374e-05, | |
| "loss": 0.243, | |
| "mean_token_accuracy": 0.9807321429252625, | |
| "step": 229 | |
| }, | |
| { | |
| "epoch": 38.38095238095238, | |
| "grad_norm": 1.3259878158569336, | |
| "learning_rate": 2.7272727272727273e-05, | |
| "loss": 0.2479, | |
| "mean_token_accuracy": 0.9801591485738754, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 38.57142857142857, | |
| "grad_norm": 1.43174147605896, | |
| "learning_rate": 2.717171717171717e-05, | |
| "loss": 0.2477, | |
| "mean_token_accuracy": 0.9830300509929657, | |
| "step": 231 | |
| }, | |
| { | |
| "epoch": 38.76190476190476, | |
| "grad_norm": 1.6294718980789185, | |
| "learning_rate": 2.7070707070707075e-05, | |
| "loss": 0.2666, | |
| "mean_token_accuracy": 0.9755284339189529, | |
| "step": 232 | |
| }, | |
| { | |
| "epoch": 38.95238095238095, | |
| "grad_norm": 2.30196213722229, | |
| "learning_rate": 2.696969696969697e-05, | |
| "loss": 0.2929, | |
| "mean_token_accuracy": 0.9752500951290131, | |
| "step": 233 | |
| }, | |
| { | |
| "epoch": 39.0, | |
| "grad_norm": 1.96921968460083, | |
| "learning_rate": 2.686868686868687e-05, | |
| "loss": 0.0762, | |
| "mean_token_accuracy": 0.9722222089767456, | |
| "step": 234 | |
| }, | |
| { | |
| "epoch": 39.19047619047619, | |
| "grad_norm": 1.3506882190704346, | |
| "learning_rate": 2.676767676767677e-05, | |
| "loss": 0.2359, | |
| "mean_token_accuracy": 0.9817389249801636, | |
| "step": 235 | |
| }, | |
| { | |
| "epoch": 39.38095238095238, | |
| "grad_norm": 1.4548856019973755, | |
| "learning_rate": 2.6666666666666667e-05, | |
| "loss": 0.2456, | |
| "mean_token_accuracy": 0.9811435043811798, | |
| "step": 236 | |
| }, | |
| { | |
| "epoch": 39.57142857142857, | |
| "grad_norm": 1.5215767621994019, | |
| "learning_rate": 2.6565656565656566e-05, | |
| "loss": 0.2575, | |
| "mean_token_accuracy": 0.9797980934381485, | |
| "step": 237 | |
| }, | |
| { | |
| "epoch": 39.76190476190476, | |
| "grad_norm": 1.8254742622375488, | |
| "learning_rate": 2.6464646464646466e-05, | |
| "loss": 0.2889, | |
| "mean_token_accuracy": 0.9770003706216812, | |
| "step": 238 | |
| }, | |
| { | |
| "epoch": 39.95238095238095, | |
| "grad_norm": 1.818259596824646, | |
| "learning_rate": 2.636363636363636e-05, | |
| "loss": 0.2897, | |
| "mean_token_accuracy": 0.976064071059227, | |
| "step": 239 | |
| }, | |
| { | |
| "epoch": 40.0, | |
| "grad_norm": 1.3236188888549805, | |
| "learning_rate": 2.6262626262626268e-05, | |
| "loss": 0.0774, | |
| "mean_token_accuracy": 0.9838709831237793, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 40.19047619047619, | |
| "grad_norm": 1.5586050748825073, | |
| "learning_rate": 2.6161616161616164e-05, | |
| "loss": 0.2731, | |
| "mean_token_accuracy": 0.9815535992383957, | |
| "step": 241 | |
| }, | |
| { | |
| "epoch": 40.38095238095238, | |
| "grad_norm": 1.5174766778945923, | |
| "learning_rate": 2.6060606060606063e-05, | |
| "loss": 0.2473, | |
| "mean_token_accuracy": 0.9786833673715591, | |
| "step": 242 | |
| }, | |
| { | |
| "epoch": 40.57142857142857, | |
| "grad_norm": 1.3981167078018188, | |
| "learning_rate": 2.5959595959595963e-05, | |
| "loss": 0.2531, | |
| "mean_token_accuracy": 0.9792415052652359, | |
| "step": 243 | |
| }, | |
| { | |
| "epoch": 40.76190476190476, | |
| "grad_norm": 1.5628103017807007, | |
| "learning_rate": 2.585858585858586e-05, | |
| "loss": 0.257, | |
| "mean_token_accuracy": 0.9798375219106674, | |
| "step": 244 | |
| }, | |
| { | |
| "epoch": 40.95238095238095, | |
| "grad_norm": 1.5515220165252686, | |
| "learning_rate": 2.575757575757576e-05, | |
| "loss": 0.2669, | |
| "mean_token_accuracy": 0.9787022620439529, | |
| "step": 245 | |
| }, | |
| { | |
| "epoch": 41.0, | |
| "grad_norm": 1.8415720462799072, | |
| "learning_rate": 2.5656565656565658e-05, | |
| "loss": 0.0799, | |
| "mean_token_accuracy": 0.9682539701461792, | |
| "step": 246 | |
| }, | |
| { | |
| "epoch": 41.19047619047619, | |
| "grad_norm": 1.423293113708496, | |
| "learning_rate": 2.5555555555555554e-05, | |
| "loss": 0.2393, | |
| "mean_token_accuracy": 0.9812082797288895, | |
| "step": 247 | |
| }, | |
| { | |
| "epoch": 41.38095238095238, | |
| "grad_norm": 1.394112467765808, | |
| "learning_rate": 2.5454545454545454e-05, | |
| "loss": 0.2521, | |
| "mean_token_accuracy": 0.9827133864164352, | |
| "step": 248 | |
| }, | |
| { | |
| "epoch": 41.57142857142857, | |
| "grad_norm": 1.6987677812576294, | |
| "learning_rate": 2.5353535353535356e-05, | |
| "loss": 0.2671, | |
| "mean_token_accuracy": 0.9742349684238434, | |
| "step": 249 | |
| }, | |
| { | |
| "epoch": 41.76190476190476, | |
| "grad_norm": 1.6028631925582886, | |
| "learning_rate": 2.5252525252525256e-05, | |
| "loss": 0.2602, | |
| "mean_token_accuracy": 0.9791279435157776, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 41.95238095238095, | |
| "grad_norm": 1.8165968656539917, | |
| "learning_rate": 2.5151515151515155e-05, | |
| "loss": 0.2826, | |
| "mean_token_accuracy": 0.9778096079826355, | |
| "step": 251 | |
| }, | |
| { | |
| "epoch": 42.0, | |
| "grad_norm": 0.9838045835494995, | |
| "learning_rate": 2.505050505050505e-05, | |
| "loss": 0.0517, | |
| "mean_token_accuracy": 0.9902912378311157, | |
| "step": 252 | |
| }, | |
| { | |
| "epoch": 42.19047619047619, | |
| "grad_norm": 1.3776968717575073, | |
| "learning_rate": 2.494949494949495e-05, | |
| "loss": 0.2612, | |
| "mean_token_accuracy": 0.9751808941364288, | |
| "step": 253 | |
| }, | |
| { | |
| "epoch": 42.38095238095238, | |
| "grad_norm": 1.5808742046356201, | |
| "learning_rate": 2.4848484848484847e-05, | |
| "loss": 0.2466, | |
| "mean_token_accuracy": 0.9846099317073822, | |
| "step": 254 | |
| }, | |
| { | |
| "epoch": 42.57142857142857, | |
| "grad_norm": 1.2304980754852295, | |
| "learning_rate": 2.474747474747475e-05, | |
| "loss": 0.2344, | |
| "mean_token_accuracy": 0.9795664101839066, | |
| "step": 255 | |
| }, | |
| { | |
| "epoch": 42.76190476190476, | |
| "grad_norm": 1.6060268878936768, | |
| "learning_rate": 2.464646464646465e-05, | |
| "loss": 0.2817, | |
| "mean_token_accuracy": 0.9776766449213028, | |
| "step": 256 | |
| }, | |
| { | |
| "epoch": 42.95238095238095, | |
| "grad_norm": 1.6796001195907593, | |
| "learning_rate": 2.4545454545454545e-05, | |
| "loss": 0.2489, | |
| "mean_token_accuracy": 0.9769842028617859, | |
| "step": 257 | |
| }, | |
| { | |
| "epoch": 43.0, | |
| "grad_norm": 1.4542969465255737, | |
| "learning_rate": 2.4444444444444445e-05, | |
| "loss": 0.0595, | |
| "mean_token_accuracy": 0.9753086566925049, | |
| "step": 258 | |
| }, | |
| { | |
| "epoch": 43.19047619047619, | |
| "grad_norm": 1.4857451915740967, | |
| "learning_rate": 2.4343434343434344e-05, | |
| "loss": 0.2527, | |
| "mean_token_accuracy": 0.97712042927742, | |
| "step": 259 | |
| }, | |
| { | |
| "epoch": 43.38095238095238, | |
| "grad_norm": 1.306619644165039, | |
| "learning_rate": 2.4242424242424244e-05, | |
| "loss": 0.2363, | |
| "mean_token_accuracy": 0.980791300535202, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 43.57142857142857, | |
| "grad_norm": 1.6846957206726074, | |
| "learning_rate": 2.4141414141414143e-05, | |
| "loss": 0.259, | |
| "mean_token_accuracy": 0.9791981130838394, | |
| "step": 261 | |
| }, | |
| { | |
| "epoch": 43.76190476190476, | |
| "grad_norm": 1.4038276672363281, | |
| "learning_rate": 2.404040404040404e-05, | |
| "loss": 0.251, | |
| "mean_token_accuracy": 0.9791757315397263, | |
| "step": 262 | |
| }, | |
| { | |
| "epoch": 43.95238095238095, | |
| "grad_norm": 1.5158367156982422, | |
| "learning_rate": 2.393939393939394e-05, | |
| "loss": 0.2702, | |
| "mean_token_accuracy": 0.9788329601287842, | |
| "step": 263 | |
| }, | |
| { | |
| "epoch": 44.0, | |
| "grad_norm": 1.7850970029830933, | |
| "learning_rate": 2.3838383838383842e-05, | |
| "loss": 0.0728, | |
| "mean_token_accuracy": 0.9759036302566528, | |
| "step": 264 | |
| }, | |
| { | |
| "epoch": 44.19047619047619, | |
| "grad_norm": 1.1887112855911255, | |
| "learning_rate": 2.3737373737373738e-05, | |
| "loss": 0.2319, | |
| "mean_token_accuracy": 0.9812621474266052, | |
| "step": 265 | |
| }, | |
| { | |
| "epoch": 44.38095238095238, | |
| "grad_norm": 1.4217466115951538, | |
| "learning_rate": 2.3636363636363637e-05, | |
| "loss": 0.238, | |
| "mean_token_accuracy": 0.9808095693588257, | |
| "step": 266 | |
| }, | |
| { | |
| "epoch": 44.57142857142857, | |
| "grad_norm": 1.7025716304779053, | |
| "learning_rate": 2.3535353535353537e-05, | |
| "loss": 0.2537, | |
| "mean_token_accuracy": 0.9779138118028641, | |
| "step": 267 | |
| }, | |
| { | |
| "epoch": 44.76190476190476, | |
| "grad_norm": 1.7018096446990967, | |
| "learning_rate": 2.3434343434343436e-05, | |
| "loss": 0.274, | |
| "mean_token_accuracy": 0.9743378162384033, | |
| "step": 268 | |
| }, | |
| { | |
| "epoch": 44.95238095238095, | |
| "grad_norm": 1.7380796670913696, | |
| "learning_rate": 2.3333333333333336e-05, | |
| "loss": 0.2768, | |
| "mean_token_accuracy": 0.9779854416847229, | |
| "step": 269 | |
| }, | |
| { | |
| "epoch": 45.0, | |
| "grad_norm": 1.0162783861160278, | |
| "learning_rate": 2.3232323232323232e-05, | |
| "loss": 0.051, | |
| "mean_token_accuracy": 0.9898989796638489, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 45.19047619047619, | |
| "grad_norm": 1.322588562965393, | |
| "learning_rate": 2.313131313131313e-05, | |
| "loss": 0.2384, | |
| "mean_token_accuracy": 0.9804540276527405, | |
| "step": 271 | |
| }, | |
| { | |
| "epoch": 45.38095238095238, | |
| "grad_norm": 1.294411301612854, | |
| "learning_rate": 2.3030303030303034e-05, | |
| "loss": 0.2342, | |
| "mean_token_accuracy": 0.9810962080955505, | |
| "step": 272 | |
| }, | |
| { | |
| "epoch": 45.57142857142857, | |
| "grad_norm": 1.4505170583724976, | |
| "learning_rate": 2.292929292929293e-05, | |
| "loss": 0.2572, | |
| "mean_token_accuracy": 0.9756149500608444, | |
| "step": 273 | |
| }, | |
| { | |
| "epoch": 45.76190476190476, | |
| "grad_norm": 1.6599575281143188, | |
| "learning_rate": 2.282828282828283e-05, | |
| "loss": 0.2678, | |
| "mean_token_accuracy": 0.9741277694702148, | |
| "step": 274 | |
| }, | |
| { | |
| "epoch": 45.95238095238095, | |
| "grad_norm": 1.4780550003051758, | |
| "learning_rate": 2.272727272727273e-05, | |
| "loss": 0.2647, | |
| "mean_token_accuracy": 0.9768411070108414, | |
| "step": 275 | |
| }, | |
| { | |
| "epoch": 46.0, | |
| "grad_norm": 1.1366266012191772, | |
| "learning_rate": 2.262626262626263e-05, | |
| "loss": 0.0557, | |
| "mean_token_accuracy": 0.9878048896789551, | |
| "step": 276 | |
| }, | |
| { | |
| "epoch": 46.19047619047619, | |
| "grad_norm": 1.3346896171569824, | |
| "learning_rate": 2.2525252525252528e-05, | |
| "loss": 0.2325, | |
| "mean_token_accuracy": 0.979757234454155, | |
| "step": 277 | |
| }, | |
| { | |
| "epoch": 46.38095238095238, | |
| "grad_norm": 1.4182461500167847, | |
| "learning_rate": 2.2424242424242424e-05, | |
| "loss": 0.2331, | |
| "mean_token_accuracy": 0.9792613536119461, | |
| "step": 278 | |
| }, | |
| { | |
| "epoch": 46.57142857142857, | |
| "grad_norm": 1.5474402904510498, | |
| "learning_rate": 2.2323232323232324e-05, | |
| "loss": 0.2641, | |
| "mean_token_accuracy": 0.9776208251714706, | |
| "step": 279 | |
| }, | |
| { | |
| "epoch": 46.76190476190476, | |
| "grad_norm": 1.8437175750732422, | |
| "learning_rate": 2.2222222222222223e-05, | |
| "loss": 0.2841, | |
| "mean_token_accuracy": 0.9759227335453033, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 46.95238095238095, | |
| "grad_norm": 1.8677905797958374, | |
| "learning_rate": 2.2121212121212123e-05, | |
| "loss": 0.2611, | |
| "mean_token_accuracy": 0.9794552326202393, | |
| "step": 281 | |
| }, | |
| { | |
| "epoch": 47.0, | |
| "grad_norm": 1.7438082695007324, | |
| "learning_rate": 2.2020202020202022e-05, | |
| "loss": 0.0768, | |
| "mean_token_accuracy": 0.9701492786407471, | |
| "step": 282 | |
| }, | |
| { | |
| "epoch": 47.19047619047619, | |
| "grad_norm": 1.38357675075531, | |
| "learning_rate": 2.191919191919192e-05, | |
| "loss": 0.2514, | |
| "mean_token_accuracy": 0.9804678857326508, | |
| "step": 283 | |
| }, | |
| { | |
| "epoch": 47.38095238095238, | |
| "grad_norm": 1.3532003164291382, | |
| "learning_rate": 2.1818181818181818e-05, | |
| "loss": 0.233, | |
| "mean_token_accuracy": 0.9824511855840683, | |
| "step": 284 | |
| }, | |
| { | |
| "epoch": 47.57142857142857, | |
| "grad_norm": 1.6904886960983276, | |
| "learning_rate": 2.171717171717172e-05, | |
| "loss": 0.249, | |
| "mean_token_accuracy": 0.9747414886951447, | |
| "step": 285 | |
| }, | |
| { | |
| "epoch": 47.76190476190476, | |
| "grad_norm": 1.4693493843078613, | |
| "learning_rate": 2.1616161616161617e-05, | |
| "loss": 0.2637, | |
| "mean_token_accuracy": 0.9777188897132874, | |
| "step": 286 | |
| }, | |
| { | |
| "epoch": 47.95238095238095, | |
| "grad_norm": 1.4712016582489014, | |
| "learning_rate": 2.1515151515151516e-05, | |
| "loss": 0.2641, | |
| "mean_token_accuracy": 0.9823849946260452, | |
| "step": 287 | |
| }, | |
| { | |
| "epoch": 48.0, | |
| "grad_norm": 2.5622308254241943, | |
| "learning_rate": 2.1414141414141416e-05, | |
| "loss": 0.0963, | |
| "mean_token_accuracy": 0.9473684430122375, | |
| "step": 288 | |
| }, | |
| { | |
| "epoch": 48.19047619047619, | |
| "grad_norm": 1.4440287351608276, | |
| "learning_rate": 2.1313131313131315e-05, | |
| "loss": 0.2439, | |
| "mean_token_accuracy": 0.9802645593881607, | |
| "step": 289 | |
| }, | |
| { | |
| "epoch": 48.38095238095238, | |
| "grad_norm": 1.373253583908081, | |
| "learning_rate": 2.1212121212121215e-05, | |
| "loss": 0.2437, | |
| "mean_token_accuracy": 0.9763128757476807, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 48.57142857142857, | |
| "grad_norm": 1.6184741258621216, | |
| "learning_rate": 2.111111111111111e-05, | |
| "loss": 0.2654, | |
| "mean_token_accuracy": 0.9782317876815796, | |
| "step": 291 | |
| }, | |
| { | |
| "epoch": 48.76190476190476, | |
| "grad_norm": 1.3039287328720093, | |
| "learning_rate": 2.101010101010101e-05, | |
| "loss": 0.2395, | |
| "mean_token_accuracy": 0.9821481555700302, | |
| "step": 292 | |
| }, | |
| { | |
| "epoch": 48.95238095238095, | |
| "grad_norm": 1.394302487373352, | |
| "learning_rate": 2.090909090909091e-05, | |
| "loss": 0.2645, | |
| "mean_token_accuracy": 0.9776430726051331, | |
| "step": 293 | |
| }, | |
| { | |
| "epoch": 49.0, | |
| "grad_norm": 1.0925865173339844, | |
| "learning_rate": 2.080808080808081e-05, | |
| "loss": 0.0562, | |
| "mean_token_accuracy": 0.9878048896789551, | |
| "step": 294 | |
| }, | |
| { | |
| "epoch": 49.19047619047619, | |
| "grad_norm": 1.3069161176681519, | |
| "learning_rate": 2.070707070707071e-05, | |
| "loss": 0.2455, | |
| "mean_token_accuracy": 0.97951839864254, | |
| "step": 295 | |
| }, | |
| { | |
| "epoch": 49.38095238095238, | |
| "grad_norm": 1.3214561939239502, | |
| "learning_rate": 2.0606060606060608e-05, | |
| "loss": 0.2381, | |
| "mean_token_accuracy": 0.9809810966253281, | |
| "step": 296 | |
| }, | |
| { | |
| "epoch": 49.57142857142857, | |
| "grad_norm": 1.3639582395553589, | |
| "learning_rate": 2.0505050505050504e-05, | |
| "loss": 0.2535, | |
| "mean_token_accuracy": 0.9802620708942413, | |
| "step": 297 | |
| }, | |
| { | |
| "epoch": 49.76190476190476, | |
| "grad_norm": 1.4789013862609863, | |
| "learning_rate": 2.0404040404040407e-05, | |
| "loss": 0.2622, | |
| "mean_token_accuracy": 0.9760318547487259, | |
| "step": 298 | |
| }, | |
| { | |
| "epoch": 49.95238095238095, | |
| "grad_norm": 1.5978738069534302, | |
| "learning_rate": 2.0303030303030303e-05, | |
| "loss": 0.2756, | |
| "mean_token_accuracy": 0.9767571240663528, | |
| "step": 299 | |
| }, | |
| { | |
| "epoch": 50.0, | |
| "grad_norm": 0.994212806224823, | |
| "learning_rate": 2.0202020202020203e-05, | |
| "loss": 0.0477, | |
| "mean_token_accuracy": 0.9837398529052734, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 50.19047619047619, | |
| "grad_norm": 1.257419228553772, | |
| "learning_rate": 2.0101010101010102e-05, | |
| "loss": 0.2437, | |
| "mean_token_accuracy": 0.9815521091222763, | |
| "step": 301 | |
| }, | |
| { | |
| "epoch": 50.38095238095238, | |
| "grad_norm": 1.2623318433761597, | |
| "learning_rate": 2e-05, | |
| "loss": 0.2467, | |
| "mean_token_accuracy": 0.9801167845726013, | |
| "step": 302 | |
| }, | |
| { | |
| "epoch": 50.57142857142857, | |
| "grad_norm": 1.3023744821548462, | |
| "learning_rate": 1.98989898989899e-05, | |
| "loss": 0.2498, | |
| "mean_token_accuracy": 0.9767654687166214, | |
| "step": 303 | |
| }, | |
| { | |
| "epoch": 50.76190476190476, | |
| "grad_norm": 1.4939366579055786, | |
| "learning_rate": 1.9797979797979797e-05, | |
| "loss": 0.276, | |
| "mean_token_accuracy": 0.9766338616609573, | |
| "step": 304 | |
| }, | |
| { | |
| "epoch": 50.95238095238095, | |
| "grad_norm": 1.2986633777618408, | |
| "learning_rate": 1.9696969696969697e-05, | |
| "loss": 0.2431, | |
| "mean_token_accuracy": 0.9812084436416626, | |
| "step": 305 | |
| }, | |
| { | |
| "epoch": 51.0, | |
| "grad_norm": 2.027116298675537, | |
| "learning_rate": 1.95959595959596e-05, | |
| "loss": 0.0666, | |
| "mean_token_accuracy": 0.9629629850387573, | |
| "step": 306 | |
| }, | |
| { | |
| "epoch": 51.19047619047619, | |
| "grad_norm": 1.4073251485824585, | |
| "learning_rate": 1.9494949494949496e-05, | |
| "loss": 0.2457, | |
| "mean_token_accuracy": 0.9779722541570663, | |
| "step": 307 | |
| }, | |
| { | |
| "epoch": 51.38095238095238, | |
| "grad_norm": 1.383111834526062, | |
| "learning_rate": 1.9393939393939395e-05, | |
| "loss": 0.2377, | |
| "mean_token_accuracy": 0.9842050075531006, | |
| "step": 308 | |
| }, | |
| { | |
| "epoch": 51.57142857142857, | |
| "grad_norm": 1.4835509061813354, | |
| "learning_rate": 1.9292929292929295e-05, | |
| "loss": 0.2503, | |
| "mean_token_accuracy": 0.9771096408367157, | |
| "step": 309 | |
| }, | |
| { | |
| "epoch": 51.76190476190476, | |
| "grad_norm": 1.756462812423706, | |
| "learning_rate": 1.919191919191919e-05, | |
| "loss": 0.2544, | |
| "mean_token_accuracy": 0.9787980318069458, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 51.95238095238095, | |
| "grad_norm": 1.5173331499099731, | |
| "learning_rate": 1.9090909090909094e-05, | |
| "loss": 0.2593, | |
| "mean_token_accuracy": 0.9801317751407623, | |
| "step": 311 | |
| }, | |
| { | |
| "epoch": 52.0, | |
| "grad_norm": 2.2640252113342285, | |
| "learning_rate": 1.898989898989899e-05, | |
| "loss": 0.087, | |
| "mean_token_accuracy": 0.9558823704719543, | |
| "step": 312 | |
| }, | |
| { | |
| "epoch": 52.19047619047619, | |
| "grad_norm": 1.4061003923416138, | |
| "learning_rate": 1.888888888888889e-05, | |
| "loss": 0.2364, | |
| "mean_token_accuracy": 0.9783814698457718, | |
| "step": 313 | |
| }, | |
| { | |
| "epoch": 52.38095238095238, | |
| "grad_norm": 1.2146430015563965, | |
| "learning_rate": 1.878787878787879e-05, | |
| "loss": 0.2265, | |
| "mean_token_accuracy": 0.9835509955883026, | |
| "step": 314 | |
| }, | |
| { | |
| "epoch": 52.57142857142857, | |
| "grad_norm": 1.5701649188995361, | |
| "learning_rate": 1.8686868686868688e-05, | |
| "loss": 0.2637, | |
| "mean_token_accuracy": 0.9780102521181107, | |
| "step": 315 | |
| }, | |
| { | |
| "epoch": 52.76190476190476, | |
| "grad_norm": 1.5340619087219238, | |
| "learning_rate": 1.8585858585858588e-05, | |
| "loss": 0.2627, | |
| "mean_token_accuracy": 0.9796072393655777, | |
| "step": 316 | |
| }, | |
| { | |
| "epoch": 52.95238095238095, | |
| "grad_norm": 1.6451423168182373, | |
| "learning_rate": 1.8484848484848487e-05, | |
| "loss": 0.2599, | |
| "mean_token_accuracy": 0.9780296385288239, | |
| "step": 317 | |
| }, | |
| { | |
| "epoch": 53.0, | |
| "grad_norm": 1.1250572204589844, | |
| "learning_rate": 1.8383838383838383e-05, | |
| "loss": 0.0599, | |
| "mean_token_accuracy": 0.987500011920929, | |
| "step": 318 | |
| }, | |
| { | |
| "epoch": 53.19047619047619, | |
| "grad_norm": 1.382422924041748, | |
| "learning_rate": 1.8282828282828286e-05, | |
| "loss": 0.2615, | |
| "mean_token_accuracy": 0.9795158058404922, | |
| "step": 319 | |
| }, | |
| { | |
| "epoch": 53.38095238095238, | |
| "grad_norm": 1.434237003326416, | |
| "learning_rate": 1.8181818181818182e-05, | |
| "loss": 0.2226, | |
| "mean_token_accuracy": 0.9817993342876434, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 53.57142857142857, | |
| "grad_norm": 1.3543226718902588, | |
| "learning_rate": 1.808080808080808e-05, | |
| "loss": 0.2455, | |
| "mean_token_accuracy": 0.9820217341184616, | |
| "step": 321 | |
| }, | |
| { | |
| "epoch": 53.76190476190476, | |
| "grad_norm": 1.5558395385742188, | |
| "learning_rate": 1.797979797979798e-05, | |
| "loss": 0.2473, | |
| "mean_token_accuracy": 0.9786651730537415, | |
| "step": 322 | |
| }, | |
| { | |
| "epoch": 53.95238095238095, | |
| "grad_norm": 1.998782992362976, | |
| "learning_rate": 1.787878787878788e-05, | |
| "loss": 0.2808, | |
| "mean_token_accuracy": 0.9743632227182388, | |
| "step": 323 | |
| }, | |
| { | |
| "epoch": 54.0, | |
| "grad_norm": 1.8470655679702759, | |
| "learning_rate": 1.777777777777778e-05, | |
| "loss": 0.0674, | |
| "mean_token_accuracy": 0.978723406791687, | |
| "step": 324 | |
| }, | |
| { | |
| "epoch": 54.19047619047619, | |
| "grad_norm": 1.557365894317627, | |
| "learning_rate": 1.7676767676767676e-05, | |
| "loss": 0.2485, | |
| "mean_token_accuracy": 0.9763985723257065, | |
| "step": 325 | |
| }, | |
| { | |
| "epoch": 54.38095238095238, | |
| "grad_norm": 1.2708889245986938, | |
| "learning_rate": 1.7575757575757576e-05, | |
| "loss": 0.2396, | |
| "mean_token_accuracy": 0.9807141125202179, | |
| "step": 326 | |
| }, | |
| { | |
| "epoch": 54.57142857142857, | |
| "grad_norm": 1.574637770652771, | |
| "learning_rate": 1.7474747474747475e-05, | |
| "loss": 0.2552, | |
| "mean_token_accuracy": 0.9784888029098511, | |
| "step": 327 | |
| }, | |
| { | |
| "epoch": 54.76190476190476, | |
| "grad_norm": 1.5815781354904175, | |
| "learning_rate": 1.7373737373737375e-05, | |
| "loss": 0.2516, | |
| "mean_token_accuracy": 0.9797972589731216, | |
| "step": 328 | |
| }, | |
| { | |
| "epoch": 54.95238095238095, | |
| "grad_norm": 1.4875643253326416, | |
| "learning_rate": 1.7272727272727274e-05, | |
| "loss": 0.253, | |
| "mean_token_accuracy": 0.9805921763181686, | |
| "step": 329 | |
| }, | |
| { | |
| "epoch": 55.0, | |
| "grad_norm": 1.404120922088623, | |
| "learning_rate": 1.7171717171717173e-05, | |
| "loss": 0.0607, | |
| "mean_token_accuracy": 0.9756097793579102, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 55.19047619047619, | |
| "grad_norm": 1.1672003269195557, | |
| "learning_rate": 1.707070707070707e-05, | |
| "loss": 0.226, | |
| "mean_token_accuracy": 0.9818458557128906, | |
| "step": 331 | |
| }, | |
| { | |
| "epoch": 55.38095238095238, | |
| "grad_norm": 1.3702583312988281, | |
| "learning_rate": 1.6969696969696972e-05, | |
| "loss": 0.2285, | |
| "mean_token_accuracy": 0.9818858057260513, | |
| "step": 332 | |
| }, | |
| { | |
| "epoch": 55.57142857142857, | |
| "grad_norm": 1.567103624343872, | |
| "learning_rate": 1.686868686868687e-05, | |
| "loss": 0.2592, | |
| "mean_token_accuracy": 0.9774815589189529, | |
| "step": 333 | |
| }, | |
| { | |
| "epoch": 55.76190476190476, | |
| "grad_norm": 1.5476545095443726, | |
| "learning_rate": 1.6767676767676768e-05, | |
| "loss": 0.2693, | |
| "mean_token_accuracy": 0.9761824756860733, | |
| "step": 334 | |
| }, | |
| { | |
| "epoch": 55.95238095238095, | |
| "grad_norm": 1.7951135635375977, | |
| "learning_rate": 1.6666666666666667e-05, | |
| "loss": 0.2627, | |
| "mean_token_accuracy": 0.9772898554801941, | |
| "step": 335 | |
| }, | |
| { | |
| "epoch": 56.0, | |
| "grad_norm": 1.5311144590377808, | |
| "learning_rate": 1.6565656565656567e-05, | |
| "loss": 0.0607, | |
| "mean_token_accuracy": 0.9750000238418579, | |
| "step": 336 | |
| }, | |
| { | |
| "epoch": 56.19047619047619, | |
| "grad_norm": 1.4896326065063477, | |
| "learning_rate": 1.6464646464646466e-05, | |
| "loss": 0.2483, | |
| "mean_token_accuracy": 0.9790806472301483, | |
| "step": 337 | |
| }, | |
| { | |
| "epoch": 56.38095238095238, | |
| "grad_norm": 1.385233998298645, | |
| "learning_rate": 1.6363636363636366e-05, | |
| "loss": 0.2471, | |
| "mean_token_accuracy": 0.9801070243120193, | |
| "step": 338 | |
| }, | |
| { | |
| "epoch": 56.57142857142857, | |
| "grad_norm": 1.5755606889724731, | |
| "learning_rate": 1.6262626262626262e-05, | |
| "loss": 0.2462, | |
| "mean_token_accuracy": 0.9776095598936081, | |
| "step": 339 | |
| }, | |
| { | |
| "epoch": 56.76190476190476, | |
| "grad_norm": 1.4080952405929565, | |
| "learning_rate": 1.6161616161616165e-05, | |
| "loss": 0.2559, | |
| "mean_token_accuracy": 0.9763025045394897, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 56.95238095238095, | |
| "grad_norm": 1.2759824991226196, | |
| "learning_rate": 1.606060606060606e-05, | |
| "loss": 0.2429, | |
| "mean_token_accuracy": 0.9811924993991852, | |
| "step": 341 | |
| }, | |
| { | |
| "epoch": 57.0, | |
| "grad_norm": 1.4365907907485962, | |
| "learning_rate": 1.595959595959596e-05, | |
| "loss": 0.0744, | |
| "mean_token_accuracy": 0.9836065769195557, | |
| "step": 342 | |
| }, | |
| { | |
| "epoch": 57.19047619047619, | |
| "grad_norm": 1.4234627485275269, | |
| "learning_rate": 1.585858585858586e-05, | |
| "loss": 0.2353, | |
| "mean_token_accuracy": 0.9792965203523636, | |
| "step": 343 | |
| }, | |
| { | |
| "epoch": 57.38095238095238, | |
| "grad_norm": 1.3555465936660767, | |
| "learning_rate": 1.5757575757575756e-05, | |
| "loss": 0.2494, | |
| "mean_token_accuracy": 0.9825381934642792, | |
| "step": 344 | |
| }, | |
| { | |
| "epoch": 57.57142857142857, | |
| "grad_norm": 1.4413907527923584, | |
| "learning_rate": 1.565656565656566e-05, | |
| "loss": 0.2534, | |
| "mean_token_accuracy": 0.979871854186058, | |
| "step": 345 | |
| }, | |
| { | |
| "epoch": 57.76190476190476, | |
| "grad_norm": 1.4927953481674194, | |
| "learning_rate": 1.5555555555555555e-05, | |
| "loss": 0.2305, | |
| "mean_token_accuracy": 0.9812074899673462, | |
| "step": 346 | |
| }, | |
| { | |
| "epoch": 57.95238095238095, | |
| "grad_norm": 1.7719610929489136, | |
| "learning_rate": 1.5454545454545454e-05, | |
| "loss": 0.2633, | |
| "mean_token_accuracy": 0.9754152894020081, | |
| "step": 347 | |
| }, | |
| { | |
| "epoch": 58.0, | |
| "grad_norm": 0.9548564553260803, | |
| "learning_rate": 1.5353535353535354e-05, | |
| "loss": 0.0521, | |
| "mean_token_accuracy": 0.9885057210922241, | |
| "step": 348 | |
| }, | |
| { | |
| "epoch": 58.19047619047619, | |
| "grad_norm": 1.4914696216583252, | |
| "learning_rate": 1.5252525252525255e-05, | |
| "loss": 0.2591, | |
| "mean_token_accuracy": 0.9796448796987534, | |
| "step": 349 | |
| }, | |
| { | |
| "epoch": 58.38095238095238, | |
| "grad_norm": 1.4677958488464355, | |
| "learning_rate": 1.5151515151515153e-05, | |
| "loss": 0.2468, | |
| "mean_token_accuracy": 0.9798107296228409, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 58.57142857142857, | |
| "grad_norm": 1.3141554594039917, | |
| "learning_rate": 1.505050505050505e-05, | |
| "loss": 0.2325, | |
| "mean_token_accuracy": 0.9803733974695206, | |
| "step": 351 | |
| }, | |
| { | |
| "epoch": 58.76190476190476, | |
| "grad_norm": 1.3697947263717651, | |
| "learning_rate": 1.494949494949495e-05, | |
| "loss": 0.2598, | |
| "mean_token_accuracy": 0.9749108999967575, | |
| "step": 352 | |
| }, | |
| { | |
| "epoch": 58.95238095238095, | |
| "grad_norm": 1.252795696258545, | |
| "learning_rate": 1.484848484848485e-05, | |
| "loss": 0.2361, | |
| "mean_token_accuracy": 0.9824285060167313, | |
| "step": 353 | |
| }, | |
| { | |
| "epoch": 59.0, | |
| "grad_norm": 1.830544114112854, | |
| "learning_rate": 1.4747474747474749e-05, | |
| "loss": 0.0772, | |
| "mean_token_accuracy": 0.9682539701461792, | |
| "step": 354 | |
| }, | |
| { | |
| "epoch": 59.19047619047619, | |
| "grad_norm": 1.266861081123352, | |
| "learning_rate": 1.4646464646464647e-05, | |
| "loss": 0.236, | |
| "mean_token_accuracy": 0.9807495921850204, | |
| "step": 355 | |
| }, | |
| { | |
| "epoch": 59.38095238095238, | |
| "grad_norm": 1.5132209062576294, | |
| "learning_rate": 1.4545454545454545e-05, | |
| "loss": 0.2498, | |
| "mean_token_accuracy": 0.9786520302295685, | |
| "step": 356 | |
| }, | |
| { | |
| "epoch": 59.57142857142857, | |
| "grad_norm": 1.259032964706421, | |
| "learning_rate": 1.4444444444444444e-05, | |
| "loss": 0.2223, | |
| "mean_token_accuracy": 0.9812145084142685, | |
| "step": 357 | |
| }, | |
| { | |
| "epoch": 59.76190476190476, | |
| "grad_norm": 1.5718448162078857, | |
| "learning_rate": 1.4343434343434345e-05, | |
| "loss": 0.2627, | |
| "mean_token_accuracy": 0.9778482913970947, | |
| "step": 358 | |
| }, | |
| { | |
| "epoch": 59.95238095238095, | |
| "grad_norm": 1.4775868654251099, | |
| "learning_rate": 1.4242424242424243e-05, | |
| "loss": 0.2587, | |
| "mean_token_accuracy": 0.9746824651956558, | |
| "step": 359 | |
| }, | |
| { | |
| "epoch": 60.0, | |
| "grad_norm": 1.638393521308899, | |
| "learning_rate": 1.4141414141414141e-05, | |
| "loss": 0.0824, | |
| "mean_token_accuracy": 0.9824561476707458, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 60.19047619047619, | |
| "grad_norm": 1.3080830574035645, | |
| "learning_rate": 1.404040404040404e-05, | |
| "loss": 0.2382, | |
| "mean_token_accuracy": 0.9818608462810516, | |
| "step": 361 | |
| }, | |
| { | |
| "epoch": 60.38095238095238, | |
| "grad_norm": 1.1936572790145874, | |
| "learning_rate": 1.3939393939393942e-05, | |
| "loss": 0.2333, | |
| "mean_token_accuracy": 0.9817762225866318, | |
| "step": 362 | |
| }, | |
| { | |
| "epoch": 60.57142857142857, | |
| "grad_norm": 1.5468491315841675, | |
| "learning_rate": 1.383838383838384e-05, | |
| "loss": 0.2653, | |
| "mean_token_accuracy": 0.9788466989994049, | |
| "step": 363 | |
| }, | |
| { | |
| "epoch": 60.76190476190476, | |
| "grad_norm": 1.3440382480621338, | |
| "learning_rate": 1.3737373737373737e-05, | |
| "loss": 0.2495, | |
| "mean_token_accuracy": 0.9803344905376434, | |
| "step": 364 | |
| }, | |
| { | |
| "epoch": 60.95238095238095, | |
| "grad_norm": 1.5807853937149048, | |
| "learning_rate": 1.3636363636363637e-05, | |
| "loss": 0.2399, | |
| "mean_token_accuracy": 0.977335661649704, | |
| "step": 365 | |
| }, | |
| { | |
| "epoch": 61.0, | |
| "grad_norm": 1.8642648458480835, | |
| "learning_rate": 1.3535353535353538e-05, | |
| "loss": 0.0675, | |
| "mean_token_accuracy": 0.9610389471054077, | |
| "step": 366 | |
| }, | |
| { | |
| "epoch": 61.19047619047619, | |
| "grad_norm": 1.4595698118209839, | |
| "learning_rate": 1.3434343434343436e-05, | |
| "loss": 0.2433, | |
| "mean_token_accuracy": 0.9782412499189377, | |
| "step": 367 | |
| }, | |
| { | |
| "epoch": 61.38095238095238, | |
| "grad_norm": 1.7195943593978882, | |
| "learning_rate": 1.3333333333333333e-05, | |
| "loss": 0.2283, | |
| "mean_token_accuracy": 0.98487289249897, | |
| "step": 368 | |
| }, | |
| { | |
| "epoch": 61.57142857142857, | |
| "grad_norm": 1.6731146574020386, | |
| "learning_rate": 1.3232323232323233e-05, | |
| "loss": 0.2481, | |
| "mean_token_accuracy": 0.9755380898714066, | |
| "step": 369 | |
| }, | |
| { | |
| "epoch": 61.76190476190476, | |
| "grad_norm": 1.3162552118301392, | |
| "learning_rate": 1.3131313131313134e-05, | |
| "loss": 0.2682, | |
| "mean_token_accuracy": 0.9773096293210983, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 61.95238095238095, | |
| "grad_norm": 1.5763328075408936, | |
| "learning_rate": 1.3030303030303032e-05, | |
| "loss": 0.247, | |
| "mean_token_accuracy": 0.9791599065065384, | |
| "step": 371 | |
| }, | |
| { | |
| "epoch": 62.0, | |
| "grad_norm": 1.8567732572555542, | |
| "learning_rate": 1.292929292929293e-05, | |
| "loss": 0.0676, | |
| "mean_token_accuracy": 0.970588207244873, | |
| "step": 372 | |
| }, | |
| { | |
| "epoch": 62.19047619047619, | |
| "grad_norm": 1.322481393814087, | |
| "learning_rate": 1.2828282828282829e-05, | |
| "loss": 0.2385, | |
| "mean_token_accuracy": 0.979724794626236, | |
| "step": 373 | |
| }, | |
| { | |
| "epoch": 62.38095238095238, | |
| "grad_norm": 1.4246753454208374, | |
| "learning_rate": 1.2727272727272727e-05, | |
| "loss": 0.2467, | |
| "mean_token_accuracy": 0.9777331054210663, | |
| "step": 374 | |
| }, | |
| { | |
| "epoch": 62.57142857142857, | |
| "grad_norm": 1.4530190229415894, | |
| "learning_rate": 1.2626262626262628e-05, | |
| "loss": 0.2377, | |
| "mean_token_accuracy": 0.9767781794071198, | |
| "step": 375 | |
| }, | |
| { | |
| "epoch": 62.76190476190476, | |
| "grad_norm": 1.4946351051330566, | |
| "learning_rate": 1.2525252525252526e-05, | |
| "loss": 0.2547, | |
| "mean_token_accuracy": 0.9767863899469376, | |
| "step": 376 | |
| }, | |
| { | |
| "epoch": 62.95238095238095, | |
| "grad_norm": 1.442986011505127, | |
| "learning_rate": 1.2424242424242424e-05, | |
| "loss": 0.2575, | |
| "mean_token_accuracy": 0.9808852076530457, | |
| "step": 377 | |
| }, | |
| { | |
| "epoch": 63.0, | |
| "grad_norm": 2.1069142818450928, | |
| "learning_rate": 1.2323232323232325e-05, | |
| "loss": 0.0682, | |
| "mean_token_accuracy": 0.9726027250289917, | |
| "step": 378 | |
| }, | |
| { | |
| "epoch": 63.19047619047619, | |
| "grad_norm": 1.4386465549468994, | |
| "learning_rate": 1.2222222222222222e-05, | |
| "loss": 0.2472, | |
| "mean_token_accuracy": 0.9808338433504105, | |
| "step": 379 | |
| }, | |
| { | |
| "epoch": 63.38095238095238, | |
| "grad_norm": 1.5726056098937988, | |
| "learning_rate": 1.2121212121212122e-05, | |
| "loss": 0.2488, | |
| "mean_token_accuracy": 0.9816757142543793, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 63.57142857142857, | |
| "grad_norm": 1.6537950038909912, | |
| "learning_rate": 1.202020202020202e-05, | |
| "loss": 0.2471, | |
| "mean_token_accuracy": 0.9798701107501984, | |
| "step": 381 | |
| }, | |
| { | |
| "epoch": 63.76190476190476, | |
| "grad_norm": 1.4154284000396729, | |
| "learning_rate": 1.1919191919191921e-05, | |
| "loss": 0.2483, | |
| "mean_token_accuracy": 0.9786428213119507, | |
| "step": 382 | |
| }, | |
| { | |
| "epoch": 63.95238095238095, | |
| "grad_norm": 1.493235468864441, | |
| "learning_rate": 1.1818181818181819e-05, | |
| "loss": 0.2499, | |
| "mean_token_accuracy": 0.9752872586250305, | |
| "step": 383 | |
| }, | |
| { | |
| "epoch": 64.0, | |
| "grad_norm": 0.9331473112106323, | |
| "learning_rate": 1.1717171717171718e-05, | |
| "loss": 0.0481, | |
| "mean_token_accuracy": 0.9902912378311157, | |
| "step": 384 | |
| }, | |
| { | |
| "epoch": 64.19047619047619, | |
| "grad_norm": 1.5490996837615967, | |
| "learning_rate": 1.1616161616161616e-05, | |
| "loss": 0.2544, | |
| "mean_token_accuracy": 0.9750427901744843, | |
| "step": 385 | |
| }, | |
| { | |
| "epoch": 64.38095238095238, | |
| "grad_norm": 1.2337415218353271, | |
| "learning_rate": 1.1515151515151517e-05, | |
| "loss": 0.2372, | |
| "mean_token_accuracy": 0.9794412702322006, | |
| "step": 386 | |
| }, | |
| { | |
| "epoch": 64.57142857142857, | |
| "grad_norm": 1.3450168371200562, | |
| "learning_rate": 1.1414141414141415e-05, | |
| "loss": 0.251, | |
| "mean_token_accuracy": 0.9808587580919266, | |
| "step": 387 | |
| }, | |
| { | |
| "epoch": 64.76190476190476, | |
| "grad_norm": 1.4372197389602661, | |
| "learning_rate": 1.1313131313131314e-05, | |
| "loss": 0.2541, | |
| "mean_token_accuracy": 0.9765901118516922, | |
| "step": 388 | |
| }, | |
| { | |
| "epoch": 64.95238095238095, | |
| "grad_norm": 1.3596030473709106, | |
| "learning_rate": 1.1212121212121212e-05, | |
| "loss": 0.2327, | |
| "mean_token_accuracy": 0.9819456040859222, | |
| "step": 389 | |
| }, | |
| { | |
| "epoch": 65.0, | |
| "grad_norm": 1.2771663665771484, | |
| "learning_rate": 1.1111111111111112e-05, | |
| "loss": 0.0615, | |
| "mean_token_accuracy": 0.9871794581413269, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 65.19047619047619, | |
| "grad_norm": 1.3283063173294067, | |
| "learning_rate": 1.1010101010101011e-05, | |
| "loss": 0.2431, | |
| "mean_token_accuracy": 0.9796550124883652, | |
| "step": 391 | |
| }, | |
| { | |
| "epoch": 65.38095238095238, | |
| "grad_norm": 1.4404308795928955, | |
| "learning_rate": 1.0909090909090909e-05, | |
| "loss": 0.242, | |
| "mean_token_accuracy": 0.9827671945095062, | |
| "step": 392 | |
| }, | |
| { | |
| "epoch": 65.57142857142857, | |
| "grad_norm": 1.322653889656067, | |
| "learning_rate": 1.0808080808080808e-05, | |
| "loss": 0.235, | |
| "mean_token_accuracy": 0.9791911989450455, | |
| "step": 393 | |
| }, | |
| { | |
| "epoch": 65.76190476190476, | |
| "grad_norm": 1.346421718597412, | |
| "learning_rate": 1.0707070707070708e-05, | |
| "loss": 0.2602, | |
| "mean_token_accuracy": 0.9792519062757492, | |
| "step": 394 | |
| }, | |
| { | |
| "epoch": 65.95238095238095, | |
| "grad_norm": 1.361152172088623, | |
| "learning_rate": 1.0606060606060607e-05, | |
| "loss": 0.2404, | |
| "mean_token_accuracy": 0.9787698835134506, | |
| "step": 395 | |
| }, | |
| { | |
| "epoch": 66.0, | |
| "grad_norm": 1.4586611986160278, | |
| "learning_rate": 1.0505050505050505e-05, | |
| "loss": 0.0681, | |
| "mean_token_accuracy": 0.970588207244873, | |
| "step": 396 | |
| }, | |
| { | |
| "epoch": 66.19047619047619, | |
| "grad_norm": 1.4977368116378784, | |
| "learning_rate": 1.0404040404040405e-05, | |
| "loss": 0.2359, | |
| "mean_token_accuracy": 0.9806597381830215, | |
| "step": 397 | |
| }, | |
| { | |
| "epoch": 66.38095238095238, | |
| "grad_norm": 1.2351692914962769, | |
| "learning_rate": 1.0303030303030304e-05, | |
| "loss": 0.2508, | |
| "mean_token_accuracy": 0.977878749370575, | |
| "step": 398 | |
| }, | |
| { | |
| "epoch": 66.57142857142857, | |
| "grad_norm": 1.3478460311889648, | |
| "learning_rate": 1.0202020202020204e-05, | |
| "loss": 0.2321, | |
| "mean_token_accuracy": 0.9855255037546158, | |
| "step": 399 | |
| }, | |
| { | |
| "epoch": 66.76190476190476, | |
| "grad_norm": 1.618532419204712, | |
| "learning_rate": 1.0101010101010101e-05, | |
| "loss": 0.2658, | |
| "mean_token_accuracy": 0.9772535562515259, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 66.95238095238095, | |
| "grad_norm": 1.5389485359191895, | |
| "learning_rate": 1e-05, | |
| "loss": 0.2465, | |
| "mean_token_accuracy": 0.9769544303417206, | |
| "step": 401 | |
| }, | |
| { | |
| "epoch": 67.0, | |
| "grad_norm": 0.9716305732727051, | |
| "learning_rate": 9.898989898989899e-06, | |
| "loss": 0.0529, | |
| "mean_token_accuracy": 0.9885057210922241, | |
| "step": 402 | |
| }, | |
| { | |
| "epoch": 67.19047619047619, | |
| "grad_norm": 1.4950332641601562, | |
| "learning_rate": 9.7979797979798e-06, | |
| "loss": 0.249, | |
| "mean_token_accuracy": 0.9769591093063354, | |
| "step": 403 | |
| }, | |
| { | |
| "epoch": 67.38095238095238, | |
| "grad_norm": 1.524194359779358, | |
| "learning_rate": 9.696969696969698e-06, | |
| "loss": 0.2477, | |
| "mean_token_accuracy": 0.98219034075737, | |
| "step": 404 | |
| }, | |
| { | |
| "epoch": 67.57142857142857, | |
| "grad_norm": 1.231911540031433, | |
| "learning_rate": 9.595959595959595e-06, | |
| "loss": 0.2232, | |
| "mean_token_accuracy": 0.9810429662466049, | |
| "step": 405 | |
| }, | |
| { | |
| "epoch": 67.76190476190476, | |
| "grad_norm": 1.404455304145813, | |
| "learning_rate": 9.494949494949495e-06, | |
| "loss": 0.2701, | |
| "mean_token_accuracy": 0.9793097227811813, | |
| "step": 406 | |
| }, | |
| { | |
| "epoch": 67.95238095238095, | |
| "grad_norm": 1.3537510633468628, | |
| "learning_rate": 9.393939393939394e-06, | |
| "loss": 0.2338, | |
| "mean_token_accuracy": 0.9800481051206589, | |
| "step": 407 | |
| }, | |
| { | |
| "epoch": 68.0, | |
| "grad_norm": 0.9093771576881409, | |
| "learning_rate": 9.292929292929294e-06, | |
| "loss": 0.0423, | |
| "mean_token_accuracy": 0.9902912378311157, | |
| "step": 408 | |
| }, | |
| { | |
| "epoch": 68.19047619047619, | |
| "grad_norm": 1.3876770734786987, | |
| "learning_rate": 9.191919191919192e-06, | |
| "loss": 0.2453, | |
| "mean_token_accuracy": 0.9814929813146591, | |
| "step": 409 | |
| }, | |
| { | |
| "epoch": 68.38095238095238, | |
| "grad_norm": 1.5604972839355469, | |
| "learning_rate": 9.090909090909091e-06, | |
| "loss": 0.2474, | |
| "mean_token_accuracy": 0.9796653985977173, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 68.57142857142857, | |
| "grad_norm": 1.4196627140045166, | |
| "learning_rate": 8.98989898989899e-06, | |
| "loss": 0.2421, | |
| "mean_token_accuracy": 0.9826227128505707, | |
| "step": 411 | |
| }, | |
| { | |
| "epoch": 68.76190476190476, | |
| "grad_norm": 1.4446525573730469, | |
| "learning_rate": 8.88888888888889e-06, | |
| "loss": 0.237, | |
| "mean_token_accuracy": 0.9770011454820633, | |
| "step": 412 | |
| }, | |
| { | |
| "epoch": 68.95238095238095, | |
| "grad_norm": 1.3088741302490234, | |
| "learning_rate": 8.787878787878788e-06, | |
| "loss": 0.242, | |
| "mean_token_accuracy": 0.9788557142019272, | |
| "step": 413 | |
| }, | |
| { | |
| "epoch": 69.0, | |
| "grad_norm": 1.1058439016342163, | |
| "learning_rate": 8.686868686868687e-06, | |
| "loss": 0.0552, | |
| "mean_token_accuracy": 0.9878048896789551, | |
| "step": 414 | |
| }, | |
| { | |
| "epoch": 69.19047619047619, | |
| "grad_norm": 1.5012304782867432, | |
| "learning_rate": 8.585858585858587e-06, | |
| "loss": 0.2472, | |
| "mean_token_accuracy": 0.9804881513118744, | |
| "step": 415 | |
| }, | |
| { | |
| "epoch": 69.38095238095238, | |
| "grad_norm": 1.2776250839233398, | |
| "learning_rate": 8.484848484848486e-06, | |
| "loss": 0.245, | |
| "mean_token_accuracy": 0.9793550372123718, | |
| "step": 416 | |
| }, | |
| { | |
| "epoch": 69.57142857142857, | |
| "grad_norm": 1.4031535387039185, | |
| "learning_rate": 8.383838383838384e-06, | |
| "loss": 0.2391, | |
| "mean_token_accuracy": 0.9811627715826035, | |
| "step": 417 | |
| }, | |
| { | |
| "epoch": 69.76190476190476, | |
| "grad_norm": 1.5323896408081055, | |
| "learning_rate": 8.282828282828283e-06, | |
| "loss": 0.2402, | |
| "mean_token_accuracy": 0.9756592959165573, | |
| "step": 418 | |
| }, | |
| { | |
| "epoch": 69.95238095238095, | |
| "grad_norm": 1.415002465248108, | |
| "learning_rate": 8.181818181818183e-06, | |
| "loss": 0.2447, | |
| "mean_token_accuracy": 0.9816397428512573, | |
| "step": 419 | |
| }, | |
| { | |
| "epoch": 70.0, | |
| "grad_norm": 1.84005606174469, | |
| "learning_rate": 8.080808080808082e-06, | |
| "loss": 0.0622, | |
| "mean_token_accuracy": 0.9726027250289917, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 70.19047619047619, | |
| "grad_norm": 1.3505762815475464, | |
| "learning_rate": 7.97979797979798e-06, | |
| "loss": 0.2363, | |
| "mean_token_accuracy": 0.9800622910261154, | |
| "step": 421 | |
| }, | |
| { | |
| "epoch": 70.38095238095238, | |
| "grad_norm": 1.3231146335601807, | |
| "learning_rate": 7.878787878787878e-06, | |
| "loss": 0.2327, | |
| "mean_token_accuracy": 0.9815961122512817, | |
| "step": 422 | |
| }, | |
| { | |
| "epoch": 70.57142857142857, | |
| "grad_norm": 1.6289716958999634, | |
| "learning_rate": 7.777777777777777e-06, | |
| "loss": 0.2469, | |
| "mean_token_accuracy": 0.976947546005249, | |
| "step": 423 | |
| }, | |
| { | |
| "epoch": 70.76190476190476, | |
| "grad_norm": 1.5643327236175537, | |
| "learning_rate": 7.676767676767677e-06, | |
| "loss": 0.2541, | |
| "mean_token_accuracy": 0.9771561771631241, | |
| "step": 424 | |
| }, | |
| { | |
| "epoch": 70.95238095238095, | |
| "grad_norm": 1.4305167198181152, | |
| "learning_rate": 7.5757575757575764e-06, | |
| "loss": 0.2452, | |
| "mean_token_accuracy": 0.9759194254875183, | |
| "step": 425 | |
| }, | |
| { | |
| "epoch": 71.0, | |
| "grad_norm": 1.5850602388381958, | |
| "learning_rate": 7.474747474747475e-06, | |
| "loss": 0.0683, | |
| "mean_token_accuracy": 0.9850746393203735, | |
| "step": 426 | |
| }, | |
| { | |
| "epoch": 71.19047619047619, | |
| "grad_norm": 1.3248540163040161, | |
| "learning_rate": 7.3737373737373745e-06, | |
| "loss": 0.24, | |
| "mean_token_accuracy": 0.9821758568286896, | |
| "step": 427 | |
| }, | |
| { | |
| "epoch": 71.38095238095238, | |
| "grad_norm": 1.3908957242965698, | |
| "learning_rate": 7.272727272727272e-06, | |
| "loss": 0.242, | |
| "mean_token_accuracy": 0.9802806377410889, | |
| "step": 428 | |
| }, | |
| { | |
| "epoch": 71.57142857142857, | |
| "grad_norm": 1.3902804851531982, | |
| "learning_rate": 7.171717171717173e-06, | |
| "loss": 0.2423, | |
| "mean_token_accuracy": 0.9788789004087448, | |
| "step": 429 | |
| }, | |
| { | |
| "epoch": 71.76190476190476, | |
| "grad_norm": 1.4126980304718018, | |
| "learning_rate": 7.0707070707070704e-06, | |
| "loss": 0.2437, | |
| "mean_token_accuracy": 0.9766863882541656, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 71.95238095238095, | |
| "grad_norm": 1.423156499862671, | |
| "learning_rate": 6.969696969696971e-06, | |
| "loss": 0.2427, | |
| "mean_token_accuracy": 0.9781524240970612, | |
| "step": 431 | |
| }, | |
| { | |
| "epoch": 72.0, | |
| "grad_norm": 1.736093521118164, | |
| "learning_rate": 6.8686868686868685e-06, | |
| "loss": 0.0814, | |
| "mean_token_accuracy": 0.9818181991577148, | |
| "step": 432 | |
| }, | |
| { | |
| "epoch": 72.19047619047619, | |
| "grad_norm": 1.281557321548462, | |
| "learning_rate": 6.767676767676769e-06, | |
| "loss": 0.2482, | |
| "mean_token_accuracy": 0.9825676530599594, | |
| "step": 433 | |
| }, | |
| { | |
| "epoch": 72.38095238095238, | |
| "grad_norm": 1.3980622291564941, | |
| "learning_rate": 6.666666666666667e-06, | |
| "loss": 0.2428, | |
| "mean_token_accuracy": 0.9788574278354645, | |
| "step": 434 | |
| }, | |
| { | |
| "epoch": 72.57142857142857, | |
| "grad_norm": 1.419425368309021, | |
| "learning_rate": 6.565656565656567e-06, | |
| "loss": 0.2431, | |
| "mean_token_accuracy": 0.9791808128356934, | |
| "step": 435 | |
| }, | |
| { | |
| "epoch": 72.76190476190476, | |
| "grad_norm": 1.5525389909744263, | |
| "learning_rate": 6.464646464646465e-06, | |
| "loss": 0.2538, | |
| "mean_token_accuracy": 0.9783525764942169, | |
| "step": 436 | |
| }, | |
| { | |
| "epoch": 72.95238095238095, | |
| "grad_norm": 1.295773983001709, | |
| "learning_rate": 6.363636363636363e-06, | |
| "loss": 0.2299, | |
| "mean_token_accuracy": 0.9779433310031891, | |
| "step": 437 | |
| }, | |
| { | |
| "epoch": 73.0, | |
| "grad_norm": 0.6111257076263428, | |
| "learning_rate": 6.262626262626263e-06, | |
| "loss": 0.0384, | |
| "mean_token_accuracy": 0.9922480583190918, | |
| "step": 438 | |
| }, | |
| { | |
| "epoch": 73.19047619047619, | |
| "grad_norm": 1.387117862701416, | |
| "learning_rate": 6.161616161616162e-06, | |
| "loss": 0.2405, | |
| "mean_token_accuracy": 0.979522630572319, | |
| "step": 439 | |
| }, | |
| { | |
| "epoch": 73.38095238095238, | |
| "grad_norm": 1.3952202796936035, | |
| "learning_rate": 6.060606060606061e-06, | |
| "loss": 0.2486, | |
| "mean_token_accuracy": 0.9780898541212082, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 73.57142857142857, | |
| "grad_norm": 1.6391713619232178, | |
| "learning_rate": 5.9595959595959605e-06, | |
| "loss": 0.2504, | |
| "mean_token_accuracy": 0.9782277494668961, | |
| "step": 441 | |
| }, | |
| { | |
| "epoch": 73.76190476190476, | |
| "grad_norm": 1.4811103343963623, | |
| "learning_rate": 5.858585858585859e-06, | |
| "loss": 0.2392, | |
| "mean_token_accuracy": 0.9793239235877991, | |
| "step": 442 | |
| }, | |
| { | |
| "epoch": 73.95238095238095, | |
| "grad_norm": 1.4281538724899292, | |
| "learning_rate": 5.7575757575757586e-06, | |
| "loss": 0.2326, | |
| "mean_token_accuracy": 0.979654997587204, | |
| "step": 443 | |
| }, | |
| { | |
| "epoch": 74.0, | |
| "grad_norm": 1.2993221282958984, | |
| "learning_rate": 5.656565656565657e-06, | |
| "loss": 0.0573, | |
| "mean_token_accuracy": 0.9876543283462524, | |
| "step": 444 | |
| }, | |
| { | |
| "epoch": 74.19047619047619, | |
| "grad_norm": 1.2887934446334839, | |
| "learning_rate": 5.555555555555556e-06, | |
| "loss": 0.2422, | |
| "mean_token_accuracy": 0.9798881709575653, | |
| "step": 445 | |
| }, | |
| { | |
| "epoch": 74.38095238095238, | |
| "grad_norm": 1.581034779548645, | |
| "learning_rate": 5.4545454545454545e-06, | |
| "loss": 0.2462, | |
| "mean_token_accuracy": 0.9796192944049835, | |
| "step": 446 | |
| }, | |
| { | |
| "epoch": 74.57142857142857, | |
| "grad_norm": 1.219085693359375, | |
| "learning_rate": 5.353535353535354e-06, | |
| "loss": 0.2434, | |
| "mean_token_accuracy": 0.9797424674034119, | |
| "step": 447 | |
| }, | |
| { | |
| "epoch": 74.76190476190476, | |
| "grad_norm": 1.2309306859970093, | |
| "learning_rate": 5.2525252525252526e-06, | |
| "loss": 0.2379, | |
| "mean_token_accuracy": 0.978371798992157, | |
| "step": 448 | |
| }, | |
| { | |
| "epoch": 74.95238095238095, | |
| "grad_norm": 1.4002373218536377, | |
| "learning_rate": 5.151515151515152e-06, | |
| "loss": 0.2325, | |
| "mean_token_accuracy": 0.9793529957532883, | |
| "step": 449 | |
| }, | |
| { | |
| "epoch": 75.0, | |
| "grad_norm": 2.0193445682525635, | |
| "learning_rate": 5.050505050505051e-06, | |
| "loss": 0.0842, | |
| "mean_token_accuracy": 0.9807692170143127, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 75.19047619047619, | |
| "grad_norm": 1.3020991086959839, | |
| "learning_rate": 4.949494949494949e-06, | |
| "loss": 0.2249, | |
| "mean_token_accuracy": 0.983807697892189, | |
| "step": 451 | |
| }, | |
| { | |
| "epoch": 75.38095238095238, | |
| "grad_norm": 1.2189743518829346, | |
| "learning_rate": 4.848484848484849e-06, | |
| "loss": 0.2444, | |
| "mean_token_accuracy": 0.9823562502861023, | |
| "step": 452 | |
| }, | |
| { | |
| "epoch": 75.57142857142857, | |
| "grad_norm": 1.43671715259552, | |
| "learning_rate": 4.747474747474747e-06, | |
| "loss": 0.2473, | |
| "mean_token_accuracy": 0.9775967448949814, | |
| "step": 453 | |
| }, | |
| { | |
| "epoch": 75.76190476190476, | |
| "grad_norm": 1.6678014993667603, | |
| "learning_rate": 4.646464646464647e-06, | |
| "loss": 0.2352, | |
| "mean_token_accuracy": 0.9812745600938797, | |
| "step": 454 | |
| }, | |
| { | |
| "epoch": 75.95238095238095, | |
| "grad_norm": 1.9260616302490234, | |
| "learning_rate": 4.5454545454545455e-06, | |
| "loss": 0.2581, | |
| "mean_token_accuracy": 0.9734574407339096, | |
| "step": 455 | |
| }, | |
| { | |
| "epoch": 76.0, | |
| "grad_norm": 1.5224919319152832, | |
| "learning_rate": 4.444444444444445e-06, | |
| "loss": 0.0667, | |
| "mean_token_accuracy": 0.9846153855323792, | |
| "step": 456 | |
| }, | |
| { | |
| "epoch": 76.19047619047619, | |
| "grad_norm": 1.1384742259979248, | |
| "learning_rate": 4.343434343434344e-06, | |
| "loss": 0.2166, | |
| "mean_token_accuracy": 0.9816610366106033, | |
| "step": 457 | |
| }, | |
| { | |
| "epoch": 76.38095238095238, | |
| "grad_norm": 1.5136680603027344, | |
| "learning_rate": 4.242424242424243e-06, | |
| "loss": 0.2443, | |
| "mean_token_accuracy": 0.9804109483957291, | |
| "step": 458 | |
| }, | |
| { | |
| "epoch": 76.57142857142857, | |
| "grad_norm": 1.5559028387069702, | |
| "learning_rate": 4.141414141414142e-06, | |
| "loss": 0.2472, | |
| "mean_token_accuracy": 0.9795145392417908, | |
| "step": 459 | |
| }, | |
| { | |
| "epoch": 76.76190476190476, | |
| "grad_norm": 1.4042458534240723, | |
| "learning_rate": 4.040404040404041e-06, | |
| "loss": 0.2422, | |
| "mean_token_accuracy": 0.9746371954679489, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 76.95238095238095, | |
| "grad_norm": 1.3069055080413818, | |
| "learning_rate": 3.939393939393939e-06, | |
| "loss": 0.2574, | |
| "mean_token_accuracy": 0.981501892209053, | |
| "step": 461 | |
| }, | |
| { | |
| "epoch": 77.0, | |
| "grad_norm": 1.4545823335647583, | |
| "learning_rate": 3.8383838383838385e-06, | |
| "loss": 0.0675, | |
| "mean_token_accuracy": 0.970588207244873, | |
| "step": 462 | |
| }, | |
| { | |
| "epoch": 77.19047619047619, | |
| "grad_norm": 1.4684022665023804, | |
| "learning_rate": 3.7373737373737375e-06, | |
| "loss": 0.2269, | |
| "mean_token_accuracy": 0.981085941195488, | |
| "step": 463 | |
| }, | |
| { | |
| "epoch": 77.38095238095238, | |
| "grad_norm": 1.5217136144638062, | |
| "learning_rate": 3.636363636363636e-06, | |
| "loss": 0.2415, | |
| "mean_token_accuracy": 0.9836974442005157, | |
| "step": 464 | |
| }, | |
| { | |
| "epoch": 77.57142857142857, | |
| "grad_norm": 1.2941691875457764, | |
| "learning_rate": 3.5353535353535352e-06, | |
| "loss": 0.2387, | |
| "mean_token_accuracy": 0.978131577372551, | |
| "step": 465 | |
| }, | |
| { | |
| "epoch": 77.76190476190476, | |
| "grad_norm": 1.4465221166610718, | |
| "learning_rate": 3.4343434343434343e-06, | |
| "loss": 0.2404, | |
| "mean_token_accuracy": 0.9785452336072922, | |
| "step": 466 | |
| }, | |
| { | |
| "epoch": 77.95238095238095, | |
| "grad_norm": 1.4259777069091797, | |
| "learning_rate": 3.3333333333333333e-06, | |
| "loss": 0.2515, | |
| "mean_token_accuracy": 0.9781184196472168, | |
| "step": 467 | |
| }, | |
| { | |
| "epoch": 78.0, | |
| "grad_norm": 1.9436161518096924, | |
| "learning_rate": 3.2323232323232324e-06, | |
| "loss": 0.0751, | |
| "mean_token_accuracy": 0.9661017060279846, | |
| "step": 468 | |
| }, | |
| { | |
| "epoch": 78.19047619047619, | |
| "grad_norm": 1.2418111562728882, | |
| "learning_rate": 3.1313131313131314e-06, | |
| "loss": 0.2206, | |
| "mean_token_accuracy": 0.980968713760376, | |
| "step": 469 | |
| }, | |
| { | |
| "epoch": 78.38095238095238, | |
| "grad_norm": 1.3781098127365112, | |
| "learning_rate": 3.0303030303030305e-06, | |
| "loss": 0.2423, | |
| "mean_token_accuracy": 0.9790701419115067, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 78.57142857142857, | |
| "grad_norm": 1.3852852582931519, | |
| "learning_rate": 2.9292929292929295e-06, | |
| "loss": 0.2423, | |
| "mean_token_accuracy": 0.9788630157709122, | |
| "step": 471 | |
| }, | |
| { | |
| "epoch": 78.76190476190476, | |
| "grad_norm": 1.5246734619140625, | |
| "learning_rate": 2.8282828282828286e-06, | |
| "loss": 0.2497, | |
| "mean_token_accuracy": 0.9794032126665115, | |
| "step": 472 | |
| }, | |
| { | |
| "epoch": 78.95238095238095, | |
| "grad_norm": 1.4307729005813599, | |
| "learning_rate": 2.7272727272727272e-06, | |
| "loss": 0.2479, | |
| "mean_token_accuracy": 0.9815962314605713, | |
| "step": 473 | |
| }, | |
| { | |
| "epoch": 79.0, | |
| "grad_norm": 1.941765308380127, | |
| "learning_rate": 2.6262626262626263e-06, | |
| "loss": 0.0653, | |
| "mean_token_accuracy": 0.9577465057373047, | |
| "step": 474 | |
| }, | |
| { | |
| "epoch": 79.19047619047619, | |
| "grad_norm": 1.2771799564361572, | |
| "learning_rate": 2.5252525252525253e-06, | |
| "loss": 0.2255, | |
| "mean_token_accuracy": 0.9810370206832886, | |
| "step": 475 | |
| }, | |
| { | |
| "epoch": 79.38095238095238, | |
| "grad_norm": 1.325358271598816, | |
| "learning_rate": 2.4242424242424244e-06, | |
| "loss": 0.242, | |
| "mean_token_accuracy": 0.9791529029607773, | |
| "step": 476 | |
| }, | |
| { | |
| "epoch": 79.57142857142857, | |
| "grad_norm": 1.295100212097168, | |
| "learning_rate": 2.3232323232323234e-06, | |
| "loss": 0.2488, | |
| "mean_token_accuracy": 0.9798661768436432, | |
| "step": 477 | |
| }, | |
| { | |
| "epoch": 79.76190476190476, | |
| "grad_norm": 1.4676238298416138, | |
| "learning_rate": 2.2222222222222225e-06, | |
| "loss": 0.2367, | |
| "mean_token_accuracy": 0.9780342727899551, | |
| "step": 478 | |
| }, | |
| { | |
| "epoch": 79.95238095238095, | |
| "grad_norm": 1.7996033430099487, | |
| "learning_rate": 2.1212121212121216e-06, | |
| "loss": 0.2452, | |
| "mean_token_accuracy": 0.9771022349596024, | |
| "step": 479 | |
| }, | |
| { | |
| "epoch": 80.0, | |
| "grad_norm": 1.3761502504348755, | |
| "learning_rate": 2.0202020202020206e-06, | |
| "loss": 0.06, | |
| "mean_token_accuracy": 0.970588207244873, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 80.19047619047619, | |
| "grad_norm": 1.3741532564163208, | |
| "learning_rate": 1.9191919191919192e-06, | |
| "loss": 0.2414, | |
| "mean_token_accuracy": 0.9827142953872681, | |
| "step": 481 | |
| }, | |
| { | |
| "epoch": 80.38095238095238, | |
| "grad_norm": 1.680336594581604, | |
| "learning_rate": 1.818181818181818e-06, | |
| "loss": 0.2308, | |
| "mean_token_accuracy": 0.980181872844696, | |
| "step": 482 | |
| }, | |
| { | |
| "epoch": 80.57142857142857, | |
| "grad_norm": 1.1747589111328125, | |
| "learning_rate": 1.7171717171717171e-06, | |
| "loss": 0.2201, | |
| "mean_token_accuracy": 0.9804712980985641, | |
| "step": 483 | |
| }, | |
| { | |
| "epoch": 80.76190476190476, | |
| "grad_norm": 1.4682387113571167, | |
| "learning_rate": 1.6161616161616162e-06, | |
| "loss": 0.2481, | |
| "mean_token_accuracy": 0.9811168909072876, | |
| "step": 484 | |
| }, | |
| { | |
| "epoch": 80.95238095238095, | |
| "grad_norm": 1.5288760662078857, | |
| "learning_rate": 1.5151515151515152e-06, | |
| "loss": 0.2542, | |
| "mean_token_accuracy": 0.9763506799936295, | |
| "step": 485 | |
| }, | |
| { | |
| "epoch": 81.0, | |
| "grad_norm": 2.051353931427002, | |
| "learning_rate": 1.4141414141414143e-06, | |
| "loss": 0.0759, | |
| "mean_token_accuracy": 0.9666666388511658, | |
| "step": 486 | |
| }, | |
| { | |
| "epoch": 81.19047619047619, | |
| "grad_norm": 1.4453171491622925, | |
| "learning_rate": 1.3131313131313131e-06, | |
| "loss": 0.2488, | |
| "mean_token_accuracy": 0.9764743894338608, | |
| "step": 487 | |
| }, | |
| { | |
| "epoch": 81.38095238095238, | |
| "grad_norm": 1.2203129529953003, | |
| "learning_rate": 1.2121212121212122e-06, | |
| "loss": 0.2208, | |
| "mean_token_accuracy": 0.9802269041538239, | |
| "step": 488 | |
| }, | |
| { | |
| "epoch": 81.57142857142857, | |
| "grad_norm": 1.338069200515747, | |
| "learning_rate": 1.1111111111111112e-06, | |
| "loss": 0.2454, | |
| "mean_token_accuracy": 0.9848097264766693, | |
| "step": 489 | |
| }, | |
| { | |
| "epoch": 81.76190476190476, | |
| "grad_norm": 1.3311666250228882, | |
| "learning_rate": 1.0101010101010103e-06, | |
| "loss": 0.2276, | |
| "mean_token_accuracy": 0.9802386462688446, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 81.95238095238095, | |
| "grad_norm": 1.4156842231750488, | |
| "learning_rate": 9.09090909090909e-07, | |
| "loss": 0.2622, | |
| "mean_token_accuracy": 0.9762069880962372, | |
| "step": 491 | |
| }, | |
| { | |
| "epoch": 82.0, | |
| "grad_norm": 1.7438231706619263, | |
| "learning_rate": 8.080808080808081e-07, | |
| "loss": 0.0642, | |
| "mean_token_accuracy": 0.9710144996643066, | |
| "step": 492 | |
| }, | |
| { | |
| "epoch": 82.19047619047619, | |
| "grad_norm": 1.338675618171692, | |
| "learning_rate": 7.070707070707071e-07, | |
| "loss": 0.2547, | |
| "mean_token_accuracy": 0.9793485999107361, | |
| "step": 493 | |
| }, | |
| { | |
| "epoch": 82.38095238095238, | |
| "grad_norm": 1.248263955116272, | |
| "learning_rate": 6.060606060606061e-07, | |
| "loss": 0.2139, | |
| "mean_token_accuracy": 0.9814836531877518, | |
| "step": 494 | |
| }, | |
| { | |
| "epoch": 82.57142857142857, | |
| "grad_norm": 1.4303299188613892, | |
| "learning_rate": 5.050505050505052e-07, | |
| "loss": 0.2466, | |
| "mean_token_accuracy": 0.9783899486064911, | |
| "step": 495 | |
| }, | |
| { | |
| "epoch": 82.76190476190476, | |
| "grad_norm": 1.4656988382339478, | |
| "learning_rate": 4.0404040404040405e-07, | |
| "loss": 0.2469, | |
| "mean_token_accuracy": 0.9803285598754883, | |
| "step": 496 | |
| }, | |
| { | |
| "epoch": 82.95238095238095, | |
| "grad_norm": 1.3924672603607178, | |
| "learning_rate": 3.0303030303030305e-07, | |
| "loss": 0.2375, | |
| "mean_token_accuracy": 0.9797345548868179, | |
| "step": 497 | |
| }, | |
| { | |
| "epoch": 83.0, | |
| "grad_norm": 0.9879482388496399, | |
| "learning_rate": 2.0202020202020202e-07, | |
| "loss": 0.0395, | |
| "mean_token_accuracy": 0.9838709831237793, | |
| "step": 498 | |
| }, | |
| { | |
| "epoch": 83.19047619047619, | |
| "grad_norm": 1.2162104845046997, | |
| "learning_rate": 1.0101010101010101e-07, | |
| "loss": 0.2433, | |
| "mean_token_accuracy": 0.981399655342102, | |
| "step": 499 | |
| }, | |
| { | |
| "epoch": 83.38095238095238, | |
| "grad_norm": 1.2492247819900513, | |
| "learning_rate": 0.0, | |
| "loss": 0.2299, | |
| "mean_token_accuracy": 0.9802171587944031, | |
| "step": 500 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 500, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 100, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 2203866148700160.0, | |
| "train_batch_size": 4, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |