SwapnilPatil28's picture
Final Update - Add training artifacts, README updates, and scripts
c3648b5 verified
[
{
"loss": 2.836225128173828,
"grad_norm": 64.5,
"learning_rate": 1.9921568627450984e-05,
"entropy": 2.411133313179016,
"num_tokens": 3137.0,
"mean_token_accuracy": 0.49307813346385954,
"epoch": 0.014705882352941176,
"step": 5
},
{
"loss": 1.3722827911376954,
"grad_norm": 10.0,
"learning_rate": 1.9823529411764708e-05,
"entropy": 1.489565873146057,
"num_tokens": 6240.0,
"mean_token_accuracy": 0.7310294091701508,
"epoch": 0.029411764705882353,
"step": 10
},
{
"loss": 0.9681278228759765,
"grad_norm": 9.0,
"learning_rate": 1.9725490196078433e-05,
"entropy": 1.0941020846366882,
"num_tokens": 9372.0,
"mean_token_accuracy": 0.7977278172969818,
"epoch": 0.04411764705882353,
"step": 15
},
{
"loss": 0.7952256202697754,
"grad_norm": 7.5625,
"learning_rate": 1.9627450980392157e-05,
"entropy": 0.7959236443042755,
"num_tokens": 12496.0,
"mean_token_accuracy": 0.8263253927230835,
"epoch": 0.058823529411764705,
"step": 20
},
{
"loss": 0.7038975715637207,
"grad_norm": 10.0,
"learning_rate": 1.9529411764705885e-05,
"entropy": 0.7730603992938996,
"num_tokens": 15726.0,
"mean_token_accuracy": 0.8362560391426086,
"epoch": 0.07352941176470588,
"step": 25
},
{
"loss": 0.5153284072875977,
"grad_norm": 9.5,
"learning_rate": 1.943137254901961e-05,
"entropy": 0.5871870815753937,
"num_tokens": 18807.0,
"mean_token_accuracy": 0.8711118042469025,
"epoch": 0.08823529411764706,
"step": 30
},
{
"loss": 0.4624673843383789,
"grad_norm": 9.375,
"learning_rate": 1.9333333333333333e-05,
"entropy": 0.5334561973810196,
"num_tokens": 21955.0,
"mean_token_accuracy": 0.8878682732582093,
"epoch": 0.10294117647058823,
"step": 35
},
{
"loss": 0.3805722236633301,
"grad_norm": 7.0625,
"learning_rate": 1.923529411764706e-05,
"entropy": 0.490571403503418,
"num_tokens": 25129.0,
"mean_token_accuracy": 0.9082872688770294,
"epoch": 0.11764705882352941,
"step": 40
},
{
"loss": 0.2753485679626465,
"grad_norm": 8.75,
"learning_rate": 1.9137254901960786e-05,
"entropy": 0.3105604648590088,
"num_tokens": 28291.0,
"mean_token_accuracy": 0.9394680917263031,
"epoch": 0.1323529411764706,
"step": 45
},
{
"loss": 0.22170100212097169,
"grad_norm": 5.65625,
"learning_rate": 1.903921568627451e-05,
"entropy": 0.28098965287208555,
"num_tokens": 31415.0,
"mean_token_accuracy": 0.949154794216156,
"epoch": 0.14705882352941177,
"step": 50
},
{
"loss": 0.18951488733291627,
"grad_norm": 9.9375,
"learning_rate": 1.8941176470588238e-05,
"entropy": 0.20550020337104796,
"num_tokens": 34603.0,
"mean_token_accuracy": 0.9539743661880493,
"epoch": 0.16176470588235295,
"step": 55
},
{
"loss": 0.17650480270385743,
"grad_norm": 4.25,
"learning_rate": 1.8843137254901962e-05,
"entropy": 0.21026135981082916,
"num_tokens": 37754.0,
"mean_token_accuracy": 0.9567391991615295,
"epoch": 0.17647058823529413,
"step": 60
},
{
"loss": 0.18774482011795043,
"grad_norm": 5.5,
"learning_rate": 1.8745098039215686e-05,
"entropy": 0.23240296691656112,
"num_tokens": 40848.0,
"mean_token_accuracy": 0.9520188570022583,
"epoch": 0.19117647058823528,
"step": 65
},
{
"loss": 0.12736810445785524,
"grad_norm": 10.625,
"learning_rate": 1.8647058823529414e-05,
"entropy": 0.16197917684912683,
"num_tokens": 44001.0,
"mean_token_accuracy": 0.9676418542861939,
"epoch": 0.20588235294117646,
"step": 70
},
{
"loss": 0.14076029062271117,
"grad_norm": 4.53125,
"learning_rate": 1.854901960784314e-05,
"entropy": 0.15784153044223787,
"num_tokens": 47159.0,
"mean_token_accuracy": 0.9648099303245544,
"epoch": 0.22058823529411764,
"step": 75
},
{
"loss": 0.10759507417678833,
"grad_norm": 3.328125,
"learning_rate": 1.8450980392156866e-05,
"entropy": 0.14289679378271103,
"num_tokens": 50298.0,
"mean_token_accuracy": 0.9671541452407837,
"epoch": 0.23529411764705882,
"step": 80
},
{
"loss": 0.12589149475097655,
"grad_norm": 5.46875,
"learning_rate": 1.8352941176470587e-05,
"entropy": 0.13958239406347275,
"num_tokens": 53455.0,
"mean_token_accuracy": 0.9665216684341431,
"epoch": 0.25,
"step": 85
},
{
"loss": 0.12024720907211303,
"grad_norm": 4.53125,
"learning_rate": 1.8254901960784315e-05,
"entropy": 0.13711344972252845,
"num_tokens": 56595.0,
"mean_token_accuracy": 0.9648710668087006,
"epoch": 0.2647058823529412,
"step": 90
},
{
"loss": 0.10167303085327148,
"grad_norm": 4.8125,
"learning_rate": 1.815686274509804e-05,
"entropy": 0.13078619986772538,
"num_tokens": 59674.0,
"mean_token_accuracy": 0.9712324619293213,
"epoch": 0.27941176470588236,
"step": 95
},
{
"loss": 0.08662314414978027,
"grad_norm": 3.671875,
"learning_rate": 1.8058823529411767e-05,
"entropy": 0.10740345045924186,
"num_tokens": 62774.0,
"mean_token_accuracy": 0.9719909071922302,
"epoch": 0.29411764705882354,
"step": 100
},
{
"loss": 0.09073780775070191,
"grad_norm": 4.15625,
"learning_rate": 1.796078431372549e-05,
"entropy": 0.09185975939035415,
"num_tokens": 65866.0,
"mean_token_accuracy": 0.9742748856544494,
"epoch": 0.3088235294117647,
"step": 105
},
{
"loss": 0.07408615350723266,
"grad_norm": 2.734375,
"learning_rate": 1.786274509803922e-05,
"entropy": 0.10024651288986205,
"num_tokens": 68995.0,
"mean_token_accuracy": 0.9773713290691376,
"epoch": 0.3235294117647059,
"step": 110
},
{
"loss": 0.08644189834594726,
"grad_norm": 6.71875,
"learning_rate": 1.776470588235294e-05,
"entropy": 0.09930562153458596,
"num_tokens": 72160.0,
"mean_token_accuracy": 0.9748322486877441,
"epoch": 0.3382352941176471,
"step": 115
},
{
"loss": 0.11685197353363037,
"grad_norm": 10.3125,
"learning_rate": 1.7666666666666668e-05,
"entropy": 0.11419346779584885,
"num_tokens": 75262.0,
"mean_token_accuracy": 0.9695464611053467,
"epoch": 0.35294117647058826,
"step": 120
},
{
"loss": 0.10757300853729249,
"grad_norm": 8.9375,
"learning_rate": 1.7568627450980392e-05,
"entropy": 0.12836654633283615,
"num_tokens": 78384.0,
"mean_token_accuracy": 0.9728550255298615,
"epoch": 0.36764705882352944,
"step": 125
},
{
"loss": 0.07711289525032043,
"grad_norm": 3.015625,
"learning_rate": 1.747058823529412e-05,
"entropy": 0.10070741027593613,
"num_tokens": 81583.0,
"mean_token_accuracy": 0.9778402209281921,
"epoch": 0.38235294117647056,
"step": 130
},
{
"loss": 0.08512116074562073,
"grad_norm": 5.375,
"learning_rate": 1.7372549019607845e-05,
"entropy": 0.09163436144590378,
"num_tokens": 84729.0,
"mean_token_accuracy": 0.9748329102993012,
"epoch": 0.39705882352941174,
"step": 135
},
{
"loss": 0.09534031748771668,
"grad_norm": 3.40625,
"learning_rate": 1.7274509803921572e-05,
"entropy": 0.09555450975894927,
"num_tokens": 87916.0,
"mean_token_accuracy": 0.9727975726127625,
"epoch": 0.4117647058823529,
"step": 140
},
{
"loss": 0.0699828803539276,
"grad_norm": 2.828125,
"learning_rate": 1.7176470588235293e-05,
"entropy": 0.089533219486475,
"num_tokens": 90982.0,
"mean_token_accuracy": 0.9772566497325897,
"epoch": 0.4264705882352941,
"step": 145
},
{
"loss": 0.06004565954208374,
"grad_norm": 4.28125,
"learning_rate": 1.707843137254902e-05,
"entropy": 0.07979470491409302,
"num_tokens": 94197.0,
"mean_token_accuracy": 0.980064970254898,
"epoch": 0.4411764705882353,
"step": 150
},
{
"loss": 0.07095102667808532,
"grad_norm": 3.8125,
"learning_rate": 1.6980392156862745e-05,
"entropy": 0.07709958106279373,
"num_tokens": 97332.0,
"mean_token_accuracy": 0.9785419166088104,
"epoch": 0.45588235294117646,
"step": 155
},
{
"loss": 0.05590643882751465,
"grad_norm": 1.671875,
"learning_rate": 1.6882352941176473e-05,
"entropy": 0.07423891946673393,
"num_tokens": 100515.0,
"mean_token_accuracy": 0.9827289760112763,
"epoch": 0.47058823529411764,
"step": 160
},
{
"loss": 0.06335585117340088,
"grad_norm": 2.390625,
"learning_rate": 1.6784313725490198e-05,
"entropy": 0.08311136476695538,
"num_tokens": 103630.0,
"mean_token_accuracy": 0.9795481741428376,
"epoch": 0.4852941176470588,
"step": 165
},
{
"loss": 0.06994503140449523,
"grad_norm": 3.625,
"learning_rate": 1.6686274509803922e-05,
"entropy": 0.07972728088498116,
"num_tokens": 106741.0,
"mean_token_accuracy": 0.9786823868751526,
"epoch": 0.5,
"step": 170
},
{
"loss": 0.047742915153503415,
"grad_norm": 5.71875,
"learning_rate": 1.658823529411765e-05,
"entropy": 0.059984054416418076,
"num_tokens": 109921.0,
"mean_token_accuracy": 0.9847357928752899,
"epoch": 0.5147058823529411,
"step": 175
},
{
"loss": 0.05979984998703003,
"grad_norm": 7.0625,
"learning_rate": 1.6490196078431374e-05,
"entropy": 0.06703888289630414,
"num_tokens": 112994.0,
"mean_token_accuracy": 0.9824592292308807,
"epoch": 0.5294117647058824,
"step": 180
},
{
"loss": 0.04938005805015564,
"grad_norm": 2.90625,
"learning_rate": 1.63921568627451e-05,
"entropy": 0.054279588535428046,
"num_tokens": 116201.0,
"mean_token_accuracy": 0.9846667230129242,
"epoch": 0.5441176470588235,
"step": 185
},
{
"loss": 0.06785057783126831,
"grad_norm": 7.4375,
"learning_rate": 1.6294117647058826e-05,
"entropy": 0.06177988387644291,
"num_tokens": 119381.0,
"mean_token_accuracy": 0.9796367406845092,
"epoch": 0.5588235294117647,
"step": 190
},
{
"loss": 0.05383546352386474,
"grad_norm": 5.40625,
"learning_rate": 1.619607843137255e-05,
"entropy": 0.0636073287576437,
"num_tokens": 122517.0,
"mean_token_accuracy": 0.9798873722553253,
"epoch": 0.5735294117647058,
"step": 195
},
{
"loss": 0.0490637868642807,
"grad_norm": 1.96875,
"learning_rate": 1.6098039215686275e-05,
"entropy": 0.0639917254447937,
"num_tokens": 125663.0,
"mean_token_accuracy": 0.9849890351295472,
"epoch": 0.5882352941176471,
"step": 200
},
{
"loss": 0.06412197351455688,
"grad_norm": 6.84375,
"learning_rate": 1.6000000000000003e-05,
"entropy": 0.06784685887396336,
"num_tokens": 128856.0,
"mean_token_accuracy": 0.9818105876445771,
"epoch": 0.6029411764705882,
"step": 205
},
{
"loss": 0.04346465170383453,
"grad_norm": 4.375,
"learning_rate": 1.5901960784313727e-05,
"entropy": 0.06049864292144776,
"num_tokens": 131995.0,
"mean_token_accuracy": 0.9882112145423889,
"epoch": 0.6176470588235294,
"step": 210
},
{
"loss": 0.04320838153362274,
"grad_norm": 2.015625,
"learning_rate": 1.580392156862745e-05,
"entropy": 0.047596517577767374,
"num_tokens": 135181.0,
"mean_token_accuracy": 0.985132920742035,
"epoch": 0.6323529411764706,
"step": 215
},
{
"loss": 0.06799347996711731,
"grad_norm": 8.5625,
"learning_rate": 1.570588235294118e-05,
"entropy": 0.06635901145637035,
"num_tokens": 138254.0,
"mean_token_accuracy": 0.9791639804840088,
"epoch": 0.6470588235294118,
"step": 220
},
{
"loss": 0.041108173131942746,
"grad_norm": 2.859375,
"learning_rate": 1.5607843137254904e-05,
"entropy": 0.051696383953094484,
"num_tokens": 141381.0,
"mean_token_accuracy": 0.9862416744232178,
"epoch": 0.6617647058823529,
"step": 225
},
{
"loss": 0.045146191120147706,
"grad_norm": 3.078125,
"learning_rate": 1.5509803921568628e-05,
"entropy": 0.055339107289910316,
"num_tokens": 144583.0,
"mean_token_accuracy": 0.9822882294654847,
"epoch": 0.6764705882352942,
"step": 230
},
{
"loss": 0.04143168330192566,
"grad_norm": 1.578125,
"learning_rate": 1.5411764705882356e-05,
"entropy": 0.05063906572759151,
"num_tokens": 147764.0,
"mean_token_accuracy": 0.9831606447696686,
"epoch": 0.6911764705882353,
"step": 235
},
{
"loss": 0.03947827816009521,
"grad_norm": 1.9921875,
"learning_rate": 1.531372549019608e-05,
"entropy": 0.05209046043455601,
"num_tokens": 150961.0,
"mean_token_accuracy": 0.9848346650600434,
"epoch": 0.7058823529411765,
"step": 240
},
{
"loss": 0.034212198853492734,
"grad_norm": 1.8984375,
"learning_rate": 1.5215686274509804e-05,
"entropy": 0.04912327118217945,
"num_tokens": 154174.0,
"mean_token_accuracy": 0.9855735838413239,
"epoch": 0.7205882352941176,
"step": 245
},
{
"loss": 0.03223183453083038,
"grad_norm": 1.7265625,
"learning_rate": 1.511764705882353e-05,
"entropy": 0.045325061306357384,
"num_tokens": 157374.0,
"mean_token_accuracy": 0.9866909861564637,
"epoch": 0.7352941176470589,
"step": 250
},
{
"loss": 0.04085415601730347,
"grad_norm": 2.625,
"learning_rate": 1.5019607843137257e-05,
"entropy": 0.045074894279241565,
"num_tokens": 160519.0,
"mean_token_accuracy": 0.9865182876586914,
"epoch": 0.75,
"step": 255
},
{
"loss": 0.03927797079086304,
"grad_norm": 2.671875,
"learning_rate": 1.4921568627450983e-05,
"entropy": 0.039533843845129014,
"num_tokens": 163756.0,
"mean_token_accuracy": 0.9872985363006592,
"epoch": 0.7647058823529411,
"step": 260
},
{
"loss": 0.042234039306640624,
"grad_norm": 1.7109375,
"learning_rate": 1.4823529411764707e-05,
"entropy": 0.043326519429683685,
"num_tokens": 166884.0,
"mean_token_accuracy": 0.9839499652385711,
"epoch": 0.7794117647058824,
"step": 265
},
{
"loss": 0.04218446910381317,
"grad_norm": 3.671875,
"learning_rate": 1.4725490196078433e-05,
"entropy": 0.05446031875908375,
"num_tokens": 170021.0,
"mean_token_accuracy": 0.983331423997879,
"epoch": 0.7941176470588235,
"step": 270
},
{
"loss": 0.031345850229263304,
"grad_norm": 1.375,
"learning_rate": 1.4627450980392157e-05,
"entropy": 0.044994413107633593,
"num_tokens": 173138.0,
"mean_token_accuracy": 0.9864144027233124,
"epoch": 0.8088235294117647,
"step": 275
},
{
"loss": 0.03718245923519135,
"grad_norm": 2.03125,
"learning_rate": 1.4529411764705883e-05,
"entropy": 0.04372772537171841,
"num_tokens": 176269.0,
"mean_token_accuracy": 0.9855779051780701,
"epoch": 0.8235294117647058,
"step": 280
},
{
"loss": 0.038416677713394166,
"grad_norm": 3.234375,
"learning_rate": 1.443137254901961e-05,
"entropy": 0.04306882936507463,
"num_tokens": 179436.0,
"mean_token_accuracy": 0.9847787022590637,
"epoch": 0.8382352941176471,
"step": 285
},
{
"loss": 0.03612026274204254,
"grad_norm": 4.28125,
"learning_rate": 1.4333333333333334e-05,
"entropy": 0.04190887995064259,
"num_tokens": 182619.0,
"mean_token_accuracy": 0.9853791892528534,
"epoch": 0.8529411764705882,
"step": 290
},
{
"loss": 0.03549243807792664,
"grad_norm": 1.5546875,
"learning_rate": 1.423529411764706e-05,
"entropy": 0.041007821820676325,
"num_tokens": 185835.0,
"mean_token_accuracy": 0.987481951713562,
"epoch": 0.8676470588235294,
"step": 295
},
{
"loss": 0.03658969700336456,
"grad_norm": 1.9921875,
"learning_rate": 1.4137254901960786e-05,
"entropy": 0.03911938704550266,
"num_tokens": 189059.0,
"mean_token_accuracy": 0.9859034955501557,
"epoch": 0.8823529411764706,
"step": 300
},
{
"loss": 0.03189299702644348,
"grad_norm": 1.3984375,
"learning_rate": 1.403921568627451e-05,
"entropy": 0.04015427939593792,
"num_tokens": 192245.0,
"mean_token_accuracy": 0.9858013272285462,
"epoch": 0.8970588235294118,
"step": 305
},
{
"loss": 0.04162760376930237,
"grad_norm": 4.6875,
"learning_rate": 1.3941176470588236e-05,
"entropy": 0.04337671361863613,
"num_tokens": 195334.0,
"mean_token_accuracy": 0.9834910809993744,
"epoch": 0.9117647058823529,
"step": 310
},
{
"loss": 0.03357888162136078,
"grad_norm": 1.515625,
"learning_rate": 1.384313725490196e-05,
"entropy": 0.043437547981739044,
"num_tokens": 198482.0,
"mean_token_accuracy": 0.9839794993400574,
"epoch": 0.9264705882352942,
"step": 315
},
{
"loss": 0.03252431154251099,
"grad_norm": 2.390625,
"learning_rate": 1.3745098039215687e-05,
"entropy": 0.041450836881995204,
"num_tokens": 201737.0,
"mean_token_accuracy": 0.9883051753044129,
"epoch": 0.9411764705882353,
"step": 320
},
{
"loss": 0.03779064118862152,
"grad_norm": 2.953125,
"learning_rate": 1.3647058823529413e-05,
"entropy": 0.03566624131053686,
"num_tokens": 204889.0,
"mean_token_accuracy": 0.9875539124011994,
"epoch": 0.9558823529411765,
"step": 325
},
{
"loss": 0.0329700767993927,
"grad_norm": 2.15625,
"learning_rate": 1.3549019607843139e-05,
"entropy": 0.03808465227484703,
"num_tokens": 208114.0,
"mean_token_accuracy": 0.986751276254654,
"epoch": 0.9705882352941176,
"step": 330
},
{
"loss": 0.031173259019851685,
"grad_norm": 1.546875,
"learning_rate": 1.3450980392156865e-05,
"entropy": 0.04065078347921371,
"num_tokens": 211217.0,
"mean_token_accuracy": 0.9860772728919983,
"epoch": 0.9852941176470589,
"step": 335
},
{
"loss": 0.03390420079231262,
"grad_norm": 1.515625,
"learning_rate": 1.3352941176470588e-05,
"entropy": 0.04108036197721958,
"num_tokens": 214368.0,
"mean_token_accuracy": 0.9871271908283233,
"epoch": 1.0,
"step": 340
},
{
"loss": 0.03671025633811951,
"grad_norm": 1.5625,
"learning_rate": 1.3254901960784314e-05,
"entropy": 0.04091338850557804,
"num_tokens": 217480.0,
"mean_token_accuracy": 0.9861762046813964,
"epoch": 1.0147058823529411,
"step": 345
},
{
"loss": 0.030594143271446227,
"grad_norm": 1.5546875,
"learning_rate": 1.315686274509804e-05,
"entropy": 0.040245630964636805,
"num_tokens": 220615.0,
"mean_token_accuracy": 0.9881528139114379,
"epoch": 1.0294117647058822,
"step": 350
},
{
"loss": 0.027347692847251893,
"grad_norm": 1.7734375,
"learning_rate": 1.3058823529411766e-05,
"entropy": 0.03420254942029714,
"num_tokens": 223751.0,
"mean_token_accuracy": 0.989202469587326,
"epoch": 1.0441176470588236,
"step": 355
},
{
"loss": 0.03148679435253143,
"grad_norm": 1.9609375,
"learning_rate": 1.2960784313725492e-05,
"entropy": 0.03210772704333067,
"num_tokens": 226948.0,
"mean_token_accuracy": 0.9868246436119079,
"epoch": 1.0588235294117647,
"step": 360
},
{
"loss": 0.031260594725608826,
"grad_norm": 1.8046875,
"learning_rate": 1.2862745098039218e-05,
"entropy": 0.033671201393008235,
"num_tokens": 230088.0,
"mean_token_accuracy": 0.9856015264987945,
"epoch": 1.0735294117647058,
"step": 365
},
{
"loss": 0.028061491250991822,
"grad_norm": 1.2890625,
"learning_rate": 1.276470588235294e-05,
"entropy": 0.03639122284948826,
"num_tokens": 233247.0,
"mean_token_accuracy": 0.9885319888591766,
"epoch": 1.088235294117647,
"step": 370
},
{
"loss": 0.0304165780544281,
"grad_norm": 2.203125,
"learning_rate": 1.2666666666666667e-05,
"entropy": 0.03107942212373018,
"num_tokens": 236423.0,
"mean_token_accuracy": 0.9864429414272309,
"epoch": 1.1029411764705883,
"step": 375
},
{
"loss": 0.028667458891868593,
"grad_norm": 1.4453125,
"learning_rate": 1.2568627450980393e-05,
"entropy": 0.03269361965358257,
"num_tokens": 239698.0,
"mean_token_accuracy": 0.9882214546203614,
"epoch": 1.1176470588235294,
"step": 380
},
{
"loss": 0.03024893403053284,
"grad_norm": 1.4375,
"learning_rate": 1.2470588235294119e-05,
"entropy": 0.036648140475153926,
"num_tokens": 242904.0,
"mean_token_accuracy": 0.9854198694229126,
"epoch": 1.1323529411764706,
"step": 385
},
{
"loss": 0.03237654864788055,
"grad_norm": 1.140625,
"learning_rate": 1.2372549019607845e-05,
"entropy": 0.036488327011466024,
"num_tokens": 246044.0,
"mean_token_accuracy": 0.9868141651153565,
"epoch": 1.1470588235294117,
"step": 390
},
{
"loss": 0.026534423232078552,
"grad_norm": 1.2890625,
"learning_rate": 1.2274509803921571e-05,
"entropy": 0.03317699953913689,
"num_tokens": 249199.0,
"mean_token_accuracy": 0.9891056835651397,
"epoch": 1.161764705882353,
"step": 395
},
{
"loss": 0.02918187975883484,
"grad_norm": 1.546875,
"learning_rate": 1.2176470588235294e-05,
"entropy": 0.033053198270499705,
"num_tokens": 252416.0,
"mean_token_accuracy": 0.9872093260288238,
"epoch": 1.1764705882352942,
"step": 400
},
{
"loss": 0.027815410494804384,
"grad_norm": 1.5,
"learning_rate": 1.207843137254902e-05,
"entropy": 0.03630108144134283,
"num_tokens": 255505.0,
"mean_token_accuracy": 0.9886294066905975,
"epoch": 1.1911764705882353,
"step": 405
},
{
"loss": 0.029119834303855896,
"grad_norm": 1.640625,
"learning_rate": 1.1980392156862746e-05,
"entropy": 0.0321140518411994,
"num_tokens": 258679.0,
"mean_token_accuracy": 0.9888967990875244,
"epoch": 1.2058823529411764,
"step": 410
},
{
"loss": 0.025961104035377502,
"grad_norm": 1.8203125,
"learning_rate": 1.1882352941176472e-05,
"entropy": 0.02944366242736578,
"num_tokens": 261856.0,
"mean_token_accuracy": 0.9895209610462189,
"epoch": 1.2205882352941178,
"step": 415
},
{
"loss": 0.03058839440345764,
"grad_norm": 2.390625,
"learning_rate": 1.1784313725490198e-05,
"entropy": 0.03461700212210417,
"num_tokens": 264960.0,
"mean_token_accuracy": 0.9882765769958496,
"epoch": 1.2352941176470589,
"step": 420
},
{
"loss": 0.028424999117851256,
"grad_norm": 1.28125,
"learning_rate": 1.1686274509803922e-05,
"entropy": 0.02985447719693184,
"num_tokens": 268114.0,
"mean_token_accuracy": 0.9882177650928498,
"epoch": 1.25,
"step": 425
},
{
"loss": 0.03086719512939453,
"grad_norm": 2.265625,
"learning_rate": 1.1588235294117648e-05,
"entropy": 0.03250212036073208,
"num_tokens": 271274.0,
"mean_token_accuracy": 0.9888392806053161,
"epoch": 1.2647058823529411,
"step": 430
},
{
"loss": 0.027977922558784486,
"grad_norm": 1.3046875,
"learning_rate": 1.1490196078431373e-05,
"entropy": 0.034127247892320155,
"num_tokens": 274452.0,
"mean_token_accuracy": 0.9908244907855988,
"epoch": 1.2794117647058822,
"step": 435
},
{
"loss": 0.02676369547843933,
"grad_norm": 1.09375,
"learning_rate": 1.1392156862745099e-05,
"entropy": 0.03699512742459774,
"num_tokens": 277562.0,
"mean_token_accuracy": 0.9871235430240631,
"epoch": 1.2941176470588236,
"step": 440
},
{
"loss": 0.02789466977119446,
"grad_norm": 2.203125,
"learning_rate": 1.1294117647058825e-05,
"entropy": 0.03514884728938341,
"num_tokens": 280635.0,
"mean_token_accuracy": 0.990158212184906,
"epoch": 1.3088235294117647,
"step": 445
},
{
"loss": 0.03088509142398834,
"grad_norm": 1.8359375,
"learning_rate": 1.119607843137255e-05,
"entropy": 0.034746605530381204,
"num_tokens": 283725.0,
"mean_token_accuracy": 0.9876766622066497,
"epoch": 1.3235294117647058,
"step": 450
},
{
"loss": 0.03232976496219635,
"grad_norm": 1.734375,
"learning_rate": 1.1098039215686275e-05,
"entropy": 0.031742793321609494,
"num_tokens": 286888.0,
"mean_token_accuracy": 0.9871384859085083,
"epoch": 1.3382352941176472,
"step": 455
},
{
"loss": 0.02845146059989929,
"grad_norm": 2.0,
"learning_rate": 1.1000000000000001e-05,
"entropy": 0.03175645042210817,
"num_tokens": 290064.0,
"mean_token_accuracy": 0.9873914003372193,
"epoch": 1.3529411764705883,
"step": 460
},
{
"loss": 0.029486137628555297,
"grad_norm": 1.265625,
"learning_rate": 1.0901960784313726e-05,
"entropy": 0.03463620245456696,
"num_tokens": 293189.0,
"mean_token_accuracy": 0.9874814569950103,
"epoch": 1.3676470588235294,
"step": 465
},
{
"loss": 0.02618069648742676,
"grad_norm": 1.109375,
"learning_rate": 1.0803921568627452e-05,
"entropy": 0.033889508619904515,
"num_tokens": 296268.0,
"mean_token_accuracy": 0.9882802128791809,
"epoch": 1.3823529411764706,
"step": 470
},
{
"loss": 0.025544488430023195,
"grad_norm": 0.8984375,
"learning_rate": 1.0705882352941178e-05,
"entropy": 0.03317532502114773,
"num_tokens": 299418.0,
"mean_token_accuracy": 0.9891822457313537,
"epoch": 1.3970588235294117,
"step": 475
},
{
"loss": 0.02922942042350769,
"grad_norm": 1.5859375,
"learning_rate": 1.0607843137254902e-05,
"entropy": 0.03228537701070309,
"num_tokens": 302608.0,
"mean_token_accuracy": 0.9864252746105194,
"epoch": 1.4117647058823528,
"step": 480
},
{
"loss": 0.025081342458724974,
"grad_norm": 1.4140625,
"learning_rate": 1.0509803921568628e-05,
"entropy": 0.033559339493513106,
"num_tokens": 305748.0,
"mean_token_accuracy": 0.9891697466373444,
"epoch": 1.4264705882352942,
"step": 485
},
{
"loss": 0.028987354040145873,
"grad_norm": 1.2109375,
"learning_rate": 1.0411764705882354e-05,
"entropy": 0.029655468463897706,
"num_tokens": 308946.0,
"mean_token_accuracy": 0.9884015321731567,
"epoch": 1.4411764705882353,
"step": 490
},
{
"loss": 0.022376981377601624,
"grad_norm": 1.5859375,
"learning_rate": 1.031372549019608e-05,
"entropy": 0.030257853865623473,
"num_tokens": 312060.0,
"mean_token_accuracy": 0.990349942445755,
"epoch": 1.4558823529411764,
"step": 495
},
{
"loss": 0.027941384911537172,
"grad_norm": 1.2734375,
"learning_rate": 1.0215686274509805e-05,
"entropy": 0.029427625238895416,
"num_tokens": 315202.0,
"mean_token_accuracy": 0.9894903540611267,
"epoch": 1.4705882352941178,
"step": 500
},
{
"loss": 0.02513147294521332,
"grad_norm": 1.8828125,
"learning_rate": 1.011764705882353e-05,
"entropy": 0.029220272414386274,
"num_tokens": 318423.0,
"mean_token_accuracy": 0.9887598037719727,
"epoch": 1.4852941176470589,
"step": 505
},
{
"loss": 0.024520005285739898,
"grad_norm": 1.3515625,
"learning_rate": 1.0019607843137255e-05,
"entropy": 0.027622674778103828,
"num_tokens": 321643.0,
"mean_token_accuracy": 0.9881017684936524,
"epoch": 1.5,
"step": 510
},
{
"loss": 0.022774545848369597,
"grad_norm": 0.96875,
"learning_rate": 9.921568627450981e-06,
"entropy": 0.027344943769276143,
"num_tokens": 324896.0,
"mean_token_accuracy": 0.9891824662685395,
"epoch": 1.5147058823529411,
"step": 515
},
{
"loss": 0.026902440190315246,
"grad_norm": 1.34375,
"learning_rate": 9.823529411764706e-06,
"entropy": 0.03210813459008932,
"num_tokens": 327953.0,
"mean_token_accuracy": 0.9872022986412048,
"epoch": 1.5294117647058822,
"step": 520
},
{
"loss": 0.02404342144727707,
"grad_norm": 1.34375,
"learning_rate": 9.725490196078432e-06,
"entropy": 0.03047515023499727,
"num_tokens": 331110.0,
"mean_token_accuracy": 0.9887873768806458,
"epoch": 1.5441176470588234,
"step": 525
},
{
"loss": 0.022797247767448424,
"grad_norm": 1.2265625,
"learning_rate": 9.627450980392158e-06,
"entropy": 0.03160413987934589,
"num_tokens": 334226.0,
"mean_token_accuracy": 0.9889481067657471,
"epoch": 1.5588235294117647,
"step": 530
},
{
"loss": 0.023706996440887453,
"grad_norm": 1.078125,
"learning_rate": 9.529411764705882e-06,
"entropy": 0.0283035334199667,
"num_tokens": 337371.0,
"mean_token_accuracy": 0.9890589594841004,
"epoch": 1.5735294117647058,
"step": 535
},
{
"loss": 0.023340512812137604,
"grad_norm": 2.5625,
"learning_rate": 9.431372549019608e-06,
"entropy": 0.029125319607555867,
"num_tokens": 340563.0,
"mean_token_accuracy": 0.9882973015308381,
"epoch": 1.5882352941176472,
"step": 540
},
{
"loss": 0.025814762711524962,
"grad_norm": 1.8046875,
"learning_rate": 9.333333333333334e-06,
"entropy": 0.029474343173205853,
"num_tokens": 343715.0,
"mean_token_accuracy": 0.9888520836830139,
"epoch": 1.6029411764705883,
"step": 545
},
{
"loss": 0.024609880149364473,
"grad_norm": 1.359375,
"learning_rate": 9.23529411764706e-06,
"entropy": 0.02793533504009247,
"num_tokens": 346928.0,
"mean_token_accuracy": 0.9896528542041778,
"epoch": 1.6176470588235294,
"step": 550
},
{
"loss": 0.024091285467147828,
"grad_norm": 1.171875,
"learning_rate": 9.137254901960785e-06,
"entropy": 0.03169798478484154,
"num_tokens": 349942.0,
"mean_token_accuracy": 0.9896469593048096,
"epoch": 1.6323529411764706,
"step": 555
},
{
"loss": 0.022402273118495943,
"grad_norm": 1.3203125,
"learning_rate": 9.03921568627451e-06,
"entropy": 0.02854564245790243,
"num_tokens": 353063.0,
"mean_token_accuracy": 0.9894876420497895,
"epoch": 1.6470588235294117,
"step": 560
},
{
"loss": 0.023489847779273987,
"grad_norm": 1.8359375,
"learning_rate": 8.941176470588237e-06,
"entropy": 0.028600608371198176,
"num_tokens": 356180.0,
"mean_token_accuracy": 0.9890201330184937,
"epoch": 1.6617647058823528,
"step": 565
},
{
"loss": 0.02147035002708435,
"grad_norm": 1.0859375,
"learning_rate": 8.843137254901961e-06,
"entropy": 0.026650307327508928,
"num_tokens": 359351.0,
"mean_token_accuracy": 0.9898578941822052,
"epoch": 1.6764705882352942,
"step": 570
},
{
"loss": 0.022052311897277833,
"grad_norm": 1.3515625,
"learning_rate": 8.745098039215687e-06,
"entropy": 0.027873093821108343,
"num_tokens": 362470.0,
"mean_token_accuracy": 0.989058256149292,
"epoch": 1.6911764705882353,
"step": 575
},
{
"loss": 0.023864805698394775,
"grad_norm": 1.5859375,
"learning_rate": 8.647058823529413e-06,
"entropy": 0.027629780396819115,
"num_tokens": 365614.0,
"mean_token_accuracy": 0.9894056558609009,
"epoch": 1.7058823529411766,
"step": 580
},
{
"loss": 0.027744096517562867,
"grad_norm": 1.6875,
"learning_rate": 8.549019607843138e-06,
"entropy": 0.028794774785637856,
"num_tokens": 368805.0,
"mean_token_accuracy": 0.9880473792552948,
"epoch": 1.7205882352941178,
"step": 585
},
{
"loss": 0.021863000094890596,
"grad_norm": 1.1796875,
"learning_rate": 8.450980392156864e-06,
"entropy": 0.028252063691616057,
"num_tokens": 371947.0,
"mean_token_accuracy": 0.9904429137706756,
"epoch": 1.7352941176470589,
"step": 590
},
{
"loss": 0.021520544588565827,
"grad_norm": 1.3203125,
"learning_rate": 8.35294117647059e-06,
"entropy": 0.028264945745468138,
"num_tokens": 375103.0,
"mean_token_accuracy": 0.9904776751995087,
"epoch": 1.75,
"step": 595
},
{
"loss": 0.026353719830513,
"grad_norm": 1.1953125,
"learning_rate": 8.254901960784314e-06,
"entropy": 0.027113928645849227,
"num_tokens": 378317.0,
"mean_token_accuracy": 0.9884898960590363,
"epoch": 1.7647058823529411,
"step": 600
},
{
"loss": 0.026097461581230164,
"grad_norm": 1.421875,
"learning_rate": 8.15686274509804e-06,
"entropy": 0.028313294425606726,
"num_tokens": 381417.0,
"mean_token_accuracy": 0.9879869103431702,
"epoch": 1.7794117647058822,
"step": 605
},
{
"loss": 0.02049378156661987,
"grad_norm": 1.0546875,
"learning_rate": 8.058823529411766e-06,
"entropy": 0.026570411399006844,
"num_tokens": 384632.0,
"mean_token_accuracy": 0.9887495577335358,
"epoch": 1.7941176470588234,
"step": 610
},
{
"loss": 0.022221173346042632,
"grad_norm": 1.1171875,
"learning_rate": 7.96078431372549e-06,
"entropy": 0.02754255346953869,
"num_tokens": 387836.0,
"mean_token_accuracy": 0.9899809181690216,
"epoch": 1.8088235294117647,
"step": 615
},
{
"loss": 0.023856499791145326,
"grad_norm": 1.3203125,
"learning_rate": 7.862745098039217e-06,
"entropy": 0.031241112016141416,
"num_tokens": 390887.0,
"mean_token_accuracy": 0.9897979915142059,
"epoch": 1.8235294117647058,
"step": 620
},
{
"loss": 0.0225734680891037,
"grad_norm": 1.40625,
"learning_rate": 7.764705882352941e-06,
"entropy": 0.02798519879579544,
"num_tokens": 394027.0,
"mean_token_accuracy": 0.9890839040279389,
"epoch": 1.8382352941176472,
"step": 625
},
{
"loss": 0.022729092836380006,
"grad_norm": 1.25,
"learning_rate": 7.666666666666667e-06,
"entropy": 0.02719390895217657,
"num_tokens": 397202.0,
"mean_token_accuracy": 0.9886514127254487,
"epoch": 1.8529411764705883,
"step": 630
},
{
"loss": 0.021688875555992127,
"grad_norm": 1.0859375,
"learning_rate": 7.5686274509803925e-06,
"entropy": 0.027222988195717335,
"num_tokens": 400378.0,
"mean_token_accuracy": 0.9908071339130402,
"epoch": 1.8676470588235294,
"step": 635
},
{
"loss": 0.023884420096874238,
"grad_norm": 1.4296875,
"learning_rate": 7.4705882352941185e-06,
"entropy": 0.028057356551289558,
"num_tokens": 403503.0,
"mean_token_accuracy": 0.9900456726551056,
"epoch": 1.8823529411764706,
"step": 640
},
{
"loss": 0.020375268161296846,
"grad_norm": 1.6953125,
"learning_rate": 7.372549019607845e-06,
"entropy": 0.02543655373156071,
"num_tokens": 406768.0,
"mean_token_accuracy": 0.9911065042018891,
"epoch": 1.8970588235294117,
"step": 645
},
{
"loss": 0.020015493035316467,
"grad_norm": 1.7421875,
"learning_rate": 7.274509803921569e-06,
"entropy": 0.027230485714972018,
"num_tokens": 409875.0,
"mean_token_accuracy": 0.9906234502792358,
"epoch": 1.9117647058823528,
"step": 650
},
{
"loss": 0.022530680894851683,
"grad_norm": 1.421875,
"learning_rate": 7.176470588235295e-06,
"entropy": 0.028223772905766963,
"num_tokens": 412987.0,
"mean_token_accuracy": 0.9903216242790223,
"epoch": 1.9264705882352942,
"step": 655
},
{
"loss": 0.021129874885082243,
"grad_norm": 1.109375,
"learning_rate": 7.07843137254902e-06,
"entropy": 0.02674291282892227,
"num_tokens": 416181.0,
"mean_token_accuracy": 0.9886639952659607,
"epoch": 1.9411764705882353,
"step": 660
},
{
"loss": 0.021244224905967713,
"grad_norm": 0.9453125,
"learning_rate": 6.9803921568627454e-06,
"entropy": 0.028005971759557723,
"num_tokens": 419323.0,
"mean_token_accuracy": 0.9905200719833374,
"epoch": 1.9558823529411766,
"step": 665
},
{
"loss": 0.022309188544750214,
"grad_norm": 1.375,
"learning_rate": 6.8823529411764715e-06,
"entropy": 0.027272411435842515,
"num_tokens": 422484.0,
"mean_token_accuracy": 0.9878733932971955,
"epoch": 1.9705882352941178,
"step": 670
},
{
"loss": 0.022459632158279418,
"grad_norm": 1.203125,
"learning_rate": 6.784313725490197e-06,
"entropy": 0.026817415095865726,
"num_tokens": 425583.0,
"mean_token_accuracy": 0.9908780753612518,
"epoch": 1.9852941176470589,
"step": 675
},
{
"loss": 0.021811096370220183,
"grad_norm": 1.265625,
"learning_rate": 6.686274509803922e-06,
"entropy": 0.026038615591824056,
"num_tokens": 428736.0,
"mean_token_accuracy": 0.9897907853126526,
"epoch": 2.0,
"step": 680
},
{
"loss": 0.019171090424060823,
"grad_norm": 1.078125,
"learning_rate": 6.588235294117647e-06,
"entropy": 0.02475190218538046,
"num_tokens": 431976.0,
"mean_token_accuracy": 0.989355844259262,
"epoch": 2.014705882352941,
"step": 685
},
{
"loss": 0.023474155366420744,
"grad_norm": 1.1640625,
"learning_rate": 6.490196078431373e-06,
"entropy": 0.026115396432578562,
"num_tokens": 435142.0,
"mean_token_accuracy": 0.9885824680328369,
"epoch": 2.0294117647058822,
"step": 690
},
{
"loss": 0.020176805555820465,
"grad_norm": 1.0,
"learning_rate": 6.3921568627450984e-06,
"entropy": 0.026907235756516455,
"num_tokens": 438259.0,
"mean_token_accuracy": 0.9919745445251464,
"epoch": 2.0441176470588234,
"step": 695
},
{
"loss": 0.022543656826019286,
"grad_norm": 1.34375,
"learning_rate": 6.294117647058824e-06,
"entropy": 0.02749718502163887,
"num_tokens": 441366.0,
"mean_token_accuracy": 0.9880188047885895,
"epoch": 2.0588235294117645,
"step": 700
},
{
"loss": 0.019685085117816924,
"grad_norm": 0.9453125,
"learning_rate": 6.19607843137255e-06,
"entropy": 0.024849089048802852,
"num_tokens": 444474.0,
"mean_token_accuracy": 0.9906105160713196,
"epoch": 2.073529411764706,
"step": 705
},
{
"loss": 0.020225000381469727,
"grad_norm": 1.234375,
"learning_rate": 6.098039215686276e-06,
"entropy": 0.023934758827090265,
"num_tokens": 447652.0,
"mean_token_accuracy": 0.9896179974079132,
"epoch": 2.088235294117647,
"step": 710
},
{
"loss": 0.02128472626209259,
"grad_norm": 1.078125,
"learning_rate": 6e-06,
"entropy": 0.02389440070837736,
"num_tokens": 450833.0,
"mean_token_accuracy": 0.9899099349975586,
"epoch": 2.1029411764705883,
"step": 715
},
{
"loss": 0.021367147564888,
"grad_norm": 1.6015625,
"learning_rate": 5.901960784313726e-06,
"entropy": 0.02620517127215862,
"num_tokens": 453949.0,
"mean_token_accuracy": 0.988726532459259,
"epoch": 2.1176470588235294,
"step": 720
},
{
"loss": 0.01960753947496414,
"grad_norm": 1.03125,
"learning_rate": 5.803921568627452e-06,
"entropy": 0.02435927651822567,
"num_tokens": 457147.0,
"mean_token_accuracy": 0.9908569097518921,
"epoch": 2.1323529411764706,
"step": 725
},
{
"loss": 0.022167882323265074,
"grad_norm": 1.234375,
"learning_rate": 5.705882352941177e-06,
"entropy": 0.02521121110767126,
"num_tokens": 460308.0,
"mean_token_accuracy": 0.9891940593719483,
"epoch": 2.1470588235294117,
"step": 730
},
{
"loss": 0.0210279181599617,
"grad_norm": 1.359375,
"learning_rate": 5.607843137254903e-06,
"entropy": 0.02500821612775326,
"num_tokens": 463449.0,
"mean_token_accuracy": 0.9884547054767608,
"epoch": 2.161764705882353,
"step": 735
},
{
"loss": 0.01987575888633728,
"grad_norm": 1.03125,
"learning_rate": 5.509803921568628e-06,
"entropy": 0.025977463461458683,
"num_tokens": 466590.0,
"mean_token_accuracy": 0.9888093769550323,
"epoch": 2.176470588235294,
"step": 740
},
{
"loss": 0.019111356139183043,
"grad_norm": 1.25,
"learning_rate": 5.411764705882353e-06,
"entropy": 0.02638601940125227,
"num_tokens": 469726.0,
"mean_token_accuracy": 0.9917258858680725,
"epoch": 2.1911764705882355,
"step": 745
},
{
"loss": 0.020354922115802764,
"grad_norm": 1.171875,
"learning_rate": 5.313725490196079e-06,
"entropy": 0.026662386767566205,
"num_tokens": 472853.0,
"mean_token_accuracy": 0.99064000248909,
"epoch": 2.2058823529411766,
"step": 750
},
{
"loss": 0.01959734410047531,
"grad_norm": 0.80859375,
"learning_rate": 5.2156862745098044e-06,
"entropy": 0.02579411044716835,
"num_tokens": 476008.0,
"mean_token_accuracy": 0.9904728531837463,
"epoch": 2.2205882352941178,
"step": 755
},
{
"loss": 0.020466303825378417,
"grad_norm": 1.3828125,
"learning_rate": 5.11764705882353e-06,
"entropy": 0.0256651122123003,
"num_tokens": 479150.0,
"mean_token_accuracy": 0.9903539717197418,
"epoch": 2.235294117647059,
"step": 760
},
{
"loss": 0.01983775794506073,
"grad_norm": 0.99609375,
"learning_rate": 5.019607843137255e-06,
"entropy": 0.02584236618131399,
"num_tokens": 482321.0,
"mean_token_accuracy": 0.9914842903614044,
"epoch": 2.25,
"step": 765
},
{
"loss": 0.020100761950016022,
"grad_norm": 1.046875,
"learning_rate": 4.921568627450981e-06,
"entropy": 0.02499296572059393,
"num_tokens": 485510.0,
"mean_token_accuracy": 0.991219836473465,
"epoch": 2.264705882352941,
"step": 770
},
{
"loss": 0.02088477313518524,
"grad_norm": 1.328125,
"learning_rate": 4.823529411764706e-06,
"entropy": 0.024959737621247768,
"num_tokens": 488698.0,
"mean_token_accuracy": 0.9898148238658905,
"epoch": 2.2794117647058822,
"step": 775
},
{
"loss": 0.0195361465215683,
"grad_norm": 1.2421875,
"learning_rate": 4.725490196078431e-06,
"entropy": 0.023672481067478657,
"num_tokens": 491906.0,
"mean_token_accuracy": 0.9900302290916443,
"epoch": 2.2941176470588234,
"step": 780
},
{
"loss": 0.019702821969985962,
"grad_norm": 1.265625,
"learning_rate": 4.627450980392157e-06,
"entropy": 0.025737580843269825,
"num_tokens": 494997.0,
"mean_token_accuracy": 0.9905776441097259,
"epoch": 2.3088235294117645,
"step": 785
},
{
"loss": 0.018527360260486604,
"grad_norm": 1.078125,
"learning_rate": 4.529411764705883e-06,
"entropy": 0.02454463895410299,
"num_tokens": 498138.0,
"mean_token_accuracy": 0.9910318195819855,
"epoch": 2.323529411764706,
"step": 790
},
{
"loss": 0.018923106789588928,
"grad_norm": 1.359375,
"learning_rate": 4.431372549019608e-06,
"entropy": 0.0245100449770689,
"num_tokens": 501316.0,
"mean_token_accuracy": 0.9911953806877136,
"epoch": 2.338235294117647,
"step": 795
},
{
"loss": 0.01874026209115982,
"grad_norm": 1.140625,
"learning_rate": 4.333333333333334e-06,
"entropy": 0.023334310948848726,
"num_tokens": 504533.0,
"mean_token_accuracy": 0.9910171329975128,
"epoch": 2.3529411764705883,
"step": 800
},
{
"loss": 0.022160655260086058,
"grad_norm": 1.2578125,
"learning_rate": 4.235294117647059e-06,
"entropy": 0.026187057420611382,
"num_tokens": 507616.0,
"mean_token_accuracy": 0.9876076638698578,
"epoch": 2.3676470588235294,
"step": 805
},
{
"loss": 0.018640576303005217,
"grad_norm": 1.03125,
"learning_rate": 4.137254901960784e-06,
"entropy": 0.02308085039258003,
"num_tokens": 510793.0,
"mean_token_accuracy": 0.9908162891864777,
"epoch": 2.3823529411764706,
"step": 810
},
{
"loss": 0.019237047433853148,
"grad_norm": 0.8984375,
"learning_rate": 4.03921568627451e-06,
"entropy": 0.024417817965149878,
"num_tokens": 513995.0,
"mean_token_accuracy": 0.9902299284934998,
"epoch": 2.3970588235294117,
"step": 815
},
{
"loss": 0.020626239478588104,
"grad_norm": 1.1640625,
"learning_rate": 3.941176470588236e-06,
"entropy": 0.025944224931299685,
"num_tokens": 517128.0,
"mean_token_accuracy": 0.9896773338317871,
"epoch": 2.411764705882353,
"step": 820
},
{
"loss": 0.018906430900096895,
"grad_norm": 1.0546875,
"learning_rate": 3.843137254901962e-06,
"entropy": 0.02529167104512453,
"num_tokens": 520219.0,
"mean_token_accuracy": 0.9905548214912414,
"epoch": 2.426470588235294,
"step": 825
},
{
"loss": 0.01989607810974121,
"grad_norm": 1.171875,
"learning_rate": 3.7450980392156865e-06,
"entropy": 0.025429282896220685,
"num_tokens": 523368.0,
"mean_token_accuracy": 0.9910161614418029,
"epoch": 2.4411764705882355,
"step": 830
},
{
"loss": 0.019511505961418152,
"grad_norm": 1.046875,
"learning_rate": 3.6470588235294117e-06,
"entropy": 0.026134114153683184,
"num_tokens": 526516.0,
"mean_token_accuracy": 0.9898114144802094,
"epoch": 2.4558823529411766,
"step": 835
},
{
"loss": 0.018582092225551607,
"grad_norm": 1.1328125,
"learning_rate": 3.5490196078431378e-06,
"entropy": 0.02343358173966408,
"num_tokens": 529660.0,
"mean_token_accuracy": 0.9904271245002747,
"epoch": 2.4705882352941178,
"step": 840
},
{
"loss": 0.020261451601982117,
"grad_norm": 1.453125,
"learning_rate": 3.450980392156863e-06,
"entropy": 0.024460323713719846,
"num_tokens": 532778.0,
"mean_token_accuracy": 0.9899402976036071,
"epoch": 2.485294117647059,
"step": 845
},
{
"loss": 0.020383948087692262,
"grad_norm": 1.1796875,
"learning_rate": 3.352941176470588e-06,
"entropy": 0.024987665377557276,
"num_tokens": 535932.0,
"mean_token_accuracy": 0.9898059248924256,
"epoch": 2.5,
"step": 850
},
{
"loss": 0.019448164105415344,
"grad_norm": 1.3515625,
"learning_rate": 3.2549019607843143e-06,
"entropy": 0.02465162370353937,
"num_tokens": 539037.0,
"mean_token_accuracy": 0.9913235783576966,
"epoch": 2.514705882352941,
"step": 855
},
{
"loss": 0.018925553560256957,
"grad_norm": 1.046875,
"learning_rate": 3.1568627450980395e-06,
"entropy": 0.025184641405940057,
"num_tokens": 542197.0,
"mean_token_accuracy": 0.991470605134964,
"epoch": 2.5294117647058822,
"step": 860
},
{
"loss": 0.01913969814777374,
"grad_norm": 1.0546875,
"learning_rate": 3.058823529411765e-06,
"entropy": 0.024113286659121512,
"num_tokens": 545387.0,
"mean_token_accuracy": 0.9914486467838287,
"epoch": 2.5441176470588234,
"step": 865
},
{
"loss": 0.018765930831432343,
"grad_norm": 1.0703125,
"learning_rate": 2.9607843137254903e-06,
"entropy": 0.02413007989525795,
"num_tokens": 548534.0,
"mean_token_accuracy": 0.9907777428627014,
"epoch": 2.5588235294117645,
"step": 870
},
{
"loss": 0.019279350340366364,
"grad_norm": 2.1875,
"learning_rate": 2.8627450980392155e-06,
"entropy": 0.024522659182548524,
"num_tokens": 551721.0,
"mean_token_accuracy": 0.9905555963516235,
"epoch": 2.5735294117647056,
"step": 875
},
{
"loss": 0.019660860300064087,
"grad_norm": 1.1015625,
"learning_rate": 2.7647058823529416e-06,
"entropy": 0.024852845631539822,
"num_tokens": 554912.0,
"mean_token_accuracy": 0.9898727238178253,
"epoch": 2.588235294117647,
"step": 880
},
{
"loss": 0.018780362606048585,
"grad_norm": 1.0703125,
"learning_rate": 2.666666666666667e-06,
"entropy": 0.02551023568958044,
"num_tokens": 558028.0,
"mean_token_accuracy": 0.99192915558815,
"epoch": 2.6029411764705883,
"step": 885
},
{
"loss": 0.01949601024389267,
"grad_norm": 1.1953125,
"learning_rate": 2.568627450980392e-06,
"entropy": 0.025155650451779366,
"num_tokens": 561189.0,
"mean_token_accuracy": 0.990712708234787,
"epoch": 2.6176470588235294,
"step": 890
},
{
"loss": 0.019716159999370576,
"grad_norm": 1.296875,
"learning_rate": 2.470588235294118e-06,
"entropy": 0.024883992783725262,
"num_tokens": 564374.0,
"mean_token_accuracy": 0.989579439163208,
"epoch": 2.6323529411764706,
"step": 895
},
{
"loss": 0.017295162379741668,
"grad_norm": 0.97265625,
"learning_rate": 2.3725490196078433e-06,
"entropy": 0.0241273645311594,
"num_tokens": 567550.0,
"mean_token_accuracy": 0.9934020042419434,
"epoch": 2.6470588235294117,
"step": 900
},
{
"loss": 0.020695842802524567,
"grad_norm": 1.109375,
"learning_rate": 2.274509803921569e-06,
"entropy": 0.02697849553078413,
"num_tokens": 570611.0,
"mean_token_accuracy": 0.9914706110954284,
"epoch": 2.661764705882353,
"step": 905
},
{
"loss": 0.017908445000648497,
"grad_norm": 1.2734375,
"learning_rate": 2.176470588235294e-06,
"entropy": 0.022997986152768136,
"num_tokens": 573767.0,
"mean_token_accuracy": 0.9898150980472564,
"epoch": 2.6764705882352944,
"step": 910
},
{
"loss": 0.020641934871673585,
"grad_norm": 1.4921875,
"learning_rate": 2.07843137254902e-06,
"entropy": 0.027346356958150863,
"num_tokens": 576830.0,
"mean_token_accuracy": 0.9897843182086945,
"epoch": 2.6911764705882355,
"step": 915
},
{
"loss": 0.019691270589828492,
"grad_norm": 1.2890625,
"learning_rate": 1.980392156862745e-06,
"entropy": 0.023718219250440598,
"num_tokens": 580065.0,
"mean_token_accuracy": 0.9901076138019562,
"epoch": 2.7058823529411766,
"step": 920
},
{
"loss": 0.02009253352880478,
"grad_norm": 1.2109375,
"learning_rate": 1.8823529411764707e-06,
"entropy": 0.024860053882002832,
"num_tokens": 583200.0,
"mean_token_accuracy": 0.9894306361675262,
"epoch": 2.7205882352941178,
"step": 925
},
{
"loss": 0.019820311665534975,
"grad_norm": 1.1796875,
"learning_rate": 1.7843137254901963e-06,
"entropy": 0.02641481179744005,
"num_tokens": 586247.0,
"mean_token_accuracy": 0.9888152658939362,
"epoch": 2.735294117647059,
"step": 930
},
{
"loss": 0.020238989591598512,
"grad_norm": 1.34375,
"learning_rate": 1.6862745098039217e-06,
"entropy": 0.025426279939711093,
"num_tokens": 589348.0,
"mean_token_accuracy": 0.9893324971199036,
"epoch": 2.75,
"step": 935
},
{
"loss": 0.020529073476791383,
"grad_norm": 1.1953125,
"learning_rate": 1.5882352941176472e-06,
"entropy": 0.025489212945103645,
"num_tokens": 592483.0,
"mean_token_accuracy": 0.9883848607540131,
"epoch": 2.764705882352941,
"step": 940
},
{
"loss": 0.019503119587898254,
"grad_norm": 1.875,
"learning_rate": 1.4901960784313726e-06,
"entropy": 0.025844238512218,
"num_tokens": 595654.0,
"mean_token_accuracy": 0.9898752987384796,
"epoch": 2.7794117647058822,
"step": 945
},
{
"loss": 0.020725423097610475,
"grad_norm": 1.3359375,
"learning_rate": 1.3921568627450982e-06,
"entropy": 0.025542815588414668,
"num_tokens": 598757.0,
"mean_token_accuracy": 0.9899684190750122,
"epoch": 2.7941176470588234,
"step": 950
},
{
"loss": 0.020795242488384248,
"grad_norm": 1.1640625,
"learning_rate": 1.2941176470588237e-06,
"entropy": 0.023506213910877705,
"num_tokens": 602069.0,
"mean_token_accuracy": 0.9894281327724457,
"epoch": 2.8088235294117645,
"step": 955
},
{
"loss": 0.01915638893842697,
"grad_norm": 1.21875,
"learning_rate": 1.196078431372549e-06,
"entropy": 0.024655142053961753,
"num_tokens": 605286.0,
"mean_token_accuracy": 0.9900248169898986,
"epoch": 2.8235294117647056,
"step": 960
},
{
"loss": 0.01975841522216797,
"grad_norm": 1.1484375,
"learning_rate": 1.0980392156862745e-06,
"entropy": 0.025551106408238412,
"num_tokens": 608374.0,
"mean_token_accuracy": 0.9892638444900512,
"epoch": 2.838235294117647,
"step": 965
},
{
"loss": 0.020852866768836974,
"grad_norm": 1.2421875,
"learning_rate": 1.0000000000000002e-06,
"entropy": 0.02480896282941103,
"num_tokens": 611577.0,
"mean_token_accuracy": 0.9892595648765564,
"epoch": 2.8529411764705883,
"step": 970
},
{
"loss": 0.019326749444007873,
"grad_norm": 0.875,
"learning_rate": 9.019607843137256e-07,
"entropy": 0.02385783474892378,
"num_tokens": 614761.0,
"mean_token_accuracy": 0.9904800593852997,
"epoch": 2.8676470588235294,
"step": 975
},
{
"loss": 0.019405061006546022,
"grad_norm": 1.1875,
"learning_rate": 8.039215686274511e-07,
"entropy": 0.026029090210795403,
"num_tokens": 617870.0,
"mean_token_accuracy": 0.9896216452121734,
"epoch": 2.8823529411764706,
"step": 980
},
{
"loss": 0.019337351620197295,
"grad_norm": 0.9921875,
"learning_rate": 7.058823529411766e-07,
"entropy": 0.026062553003430366,
"num_tokens": 620943.0,
"mean_token_accuracy": 0.9899002552032471,
"epoch": 2.8970588235294117,
"step": 985
},
{
"loss": 0.01972263157367706,
"grad_norm": 1.5625,
"learning_rate": 6.07843137254902e-07,
"entropy": 0.025324805453419686,
"num_tokens": 624094.0,
"mean_token_accuracy": 0.9898600101470947,
"epoch": 2.911764705882353,
"step": 990
},
{
"loss": 0.017833781242370606,
"grad_norm": 1.2265625,
"learning_rate": 5.098039215686275e-07,
"entropy": 0.023284821771085262,
"num_tokens": 627253.0,
"mean_token_accuracy": 0.9910983681678772,
"epoch": 2.9264705882352944,
"step": 995
},
{
"loss": 0.020137375593185423,
"grad_norm": 1.3984375,
"learning_rate": 4.1176470588235295e-07,
"entropy": 0.024203809909522533,
"num_tokens": 630427.0,
"mean_token_accuracy": 0.9907480180263519,
"epoch": 2.9411764705882355,
"step": 1000
},
{
"loss": 0.019109995663166048,
"grad_norm": 1.21875,
"learning_rate": 3.1372549019607843e-07,
"entropy": 0.02416255362331867,
"num_tokens": 633632.0,
"mean_token_accuracy": 0.9915190756320953,
"epoch": 2.9558823529411766,
"step": 1005
},
{
"loss": 0.02000269144773483,
"grad_norm": 1.859375,
"learning_rate": 2.1568627450980394e-07,
"entropy": 0.024217843264341354,
"num_tokens": 636805.0,
"mean_token_accuracy": 0.9894875824451447,
"epoch": 2.9705882352941178,
"step": 1010
},
{
"loss": 0.020338763296604157,
"grad_norm": 1.546875,
"learning_rate": 1.1764705882352942e-07,
"entropy": 0.024258859269320966,
"num_tokens": 639984.0,
"mean_token_accuracy": 0.9892021059989929,
"epoch": 2.985294117647059,
"step": 1015
},
{
"loss": 0.020995336771011352,
"grad_norm": 1.046875,
"learning_rate": 1.9607843137254902e-08,
"entropy": 0.025342148169875144,
"num_tokens": 643104.0,
"mean_token_accuracy": 0.9887544453144074,
"epoch": 3.0,
"step": 1020
},
{
"train_runtime": 3944.5682,
"train_samples_per_second": 0.517,
"train_steps_per_second": 0.259,
"total_flos": 5056111718203392.0,
"train_loss": 0.07629515403041652,
"epoch": 3.0,
"step": 1020
}
]