| [ |
| { |
| "loss": 2.836225128173828, |
| "grad_norm": 64.5, |
| "learning_rate": 1.9921568627450984e-05, |
| "entropy": 2.411133313179016, |
| "num_tokens": 3137.0, |
| "mean_token_accuracy": 0.49307813346385954, |
| "epoch": 0.014705882352941176, |
| "step": 5 |
| }, |
| { |
| "loss": 1.3722827911376954, |
| "grad_norm": 10.0, |
| "learning_rate": 1.9823529411764708e-05, |
| "entropy": 1.489565873146057, |
| "num_tokens": 6240.0, |
| "mean_token_accuracy": 0.7310294091701508, |
| "epoch": 0.029411764705882353, |
| "step": 10 |
| }, |
| { |
| "loss": 0.9681278228759765, |
| "grad_norm": 9.0, |
| "learning_rate": 1.9725490196078433e-05, |
| "entropy": 1.0941020846366882, |
| "num_tokens": 9372.0, |
| "mean_token_accuracy": 0.7977278172969818, |
| "epoch": 0.04411764705882353, |
| "step": 15 |
| }, |
| { |
| "loss": 0.7952256202697754, |
| "grad_norm": 7.5625, |
| "learning_rate": 1.9627450980392157e-05, |
| "entropy": 0.7959236443042755, |
| "num_tokens": 12496.0, |
| "mean_token_accuracy": 0.8263253927230835, |
| "epoch": 0.058823529411764705, |
| "step": 20 |
| }, |
| { |
| "loss": 0.7038975715637207, |
| "grad_norm": 10.0, |
| "learning_rate": 1.9529411764705885e-05, |
| "entropy": 0.7730603992938996, |
| "num_tokens": 15726.0, |
| "mean_token_accuracy": 0.8362560391426086, |
| "epoch": 0.07352941176470588, |
| "step": 25 |
| }, |
| { |
| "loss": 0.5153284072875977, |
| "grad_norm": 9.5, |
| "learning_rate": 1.943137254901961e-05, |
| "entropy": 0.5871870815753937, |
| "num_tokens": 18807.0, |
| "mean_token_accuracy": 0.8711118042469025, |
| "epoch": 0.08823529411764706, |
| "step": 30 |
| }, |
| { |
| "loss": 0.4624673843383789, |
| "grad_norm": 9.375, |
| "learning_rate": 1.9333333333333333e-05, |
| "entropy": 0.5334561973810196, |
| "num_tokens": 21955.0, |
| "mean_token_accuracy": 0.8878682732582093, |
| "epoch": 0.10294117647058823, |
| "step": 35 |
| }, |
| { |
| "loss": 0.3805722236633301, |
| "grad_norm": 7.0625, |
| "learning_rate": 1.923529411764706e-05, |
| "entropy": 0.490571403503418, |
| "num_tokens": 25129.0, |
| "mean_token_accuracy": 0.9082872688770294, |
| "epoch": 0.11764705882352941, |
| "step": 40 |
| }, |
| { |
| "loss": 0.2753485679626465, |
| "grad_norm": 8.75, |
| "learning_rate": 1.9137254901960786e-05, |
| "entropy": 0.3105604648590088, |
| "num_tokens": 28291.0, |
| "mean_token_accuracy": 0.9394680917263031, |
| "epoch": 0.1323529411764706, |
| "step": 45 |
| }, |
| { |
| "loss": 0.22170100212097169, |
| "grad_norm": 5.65625, |
| "learning_rate": 1.903921568627451e-05, |
| "entropy": 0.28098965287208555, |
| "num_tokens": 31415.0, |
| "mean_token_accuracy": 0.949154794216156, |
| "epoch": 0.14705882352941177, |
| "step": 50 |
| }, |
| { |
| "loss": 0.18951488733291627, |
| "grad_norm": 9.9375, |
| "learning_rate": 1.8941176470588238e-05, |
| "entropy": 0.20550020337104796, |
| "num_tokens": 34603.0, |
| "mean_token_accuracy": 0.9539743661880493, |
| "epoch": 0.16176470588235295, |
| "step": 55 |
| }, |
| { |
| "loss": 0.17650480270385743, |
| "grad_norm": 4.25, |
| "learning_rate": 1.8843137254901962e-05, |
| "entropy": 0.21026135981082916, |
| "num_tokens": 37754.0, |
| "mean_token_accuracy": 0.9567391991615295, |
| "epoch": 0.17647058823529413, |
| "step": 60 |
| }, |
| { |
| "loss": 0.18774482011795043, |
| "grad_norm": 5.5, |
| "learning_rate": 1.8745098039215686e-05, |
| "entropy": 0.23240296691656112, |
| "num_tokens": 40848.0, |
| "mean_token_accuracy": 0.9520188570022583, |
| "epoch": 0.19117647058823528, |
| "step": 65 |
| }, |
| { |
| "loss": 0.12736810445785524, |
| "grad_norm": 10.625, |
| "learning_rate": 1.8647058823529414e-05, |
| "entropy": 0.16197917684912683, |
| "num_tokens": 44001.0, |
| "mean_token_accuracy": 0.9676418542861939, |
| "epoch": 0.20588235294117646, |
| "step": 70 |
| }, |
| { |
| "loss": 0.14076029062271117, |
| "grad_norm": 4.53125, |
| "learning_rate": 1.854901960784314e-05, |
| "entropy": 0.15784153044223787, |
| "num_tokens": 47159.0, |
| "mean_token_accuracy": 0.9648099303245544, |
| "epoch": 0.22058823529411764, |
| "step": 75 |
| }, |
| { |
| "loss": 0.10759507417678833, |
| "grad_norm": 3.328125, |
| "learning_rate": 1.8450980392156866e-05, |
| "entropy": 0.14289679378271103, |
| "num_tokens": 50298.0, |
| "mean_token_accuracy": 0.9671541452407837, |
| "epoch": 0.23529411764705882, |
| "step": 80 |
| }, |
| { |
| "loss": 0.12589149475097655, |
| "grad_norm": 5.46875, |
| "learning_rate": 1.8352941176470587e-05, |
| "entropy": 0.13958239406347275, |
| "num_tokens": 53455.0, |
| "mean_token_accuracy": 0.9665216684341431, |
| "epoch": 0.25, |
| "step": 85 |
| }, |
| { |
| "loss": 0.12024720907211303, |
| "grad_norm": 4.53125, |
| "learning_rate": 1.8254901960784315e-05, |
| "entropy": 0.13711344972252845, |
| "num_tokens": 56595.0, |
| "mean_token_accuracy": 0.9648710668087006, |
| "epoch": 0.2647058823529412, |
| "step": 90 |
| }, |
| { |
| "loss": 0.10167303085327148, |
| "grad_norm": 4.8125, |
| "learning_rate": 1.815686274509804e-05, |
| "entropy": 0.13078619986772538, |
| "num_tokens": 59674.0, |
| "mean_token_accuracy": 0.9712324619293213, |
| "epoch": 0.27941176470588236, |
| "step": 95 |
| }, |
| { |
| "loss": 0.08662314414978027, |
| "grad_norm": 3.671875, |
| "learning_rate": 1.8058823529411767e-05, |
| "entropy": 0.10740345045924186, |
| "num_tokens": 62774.0, |
| "mean_token_accuracy": 0.9719909071922302, |
| "epoch": 0.29411764705882354, |
| "step": 100 |
| }, |
| { |
| "loss": 0.09073780775070191, |
| "grad_norm": 4.15625, |
| "learning_rate": 1.796078431372549e-05, |
| "entropy": 0.09185975939035415, |
| "num_tokens": 65866.0, |
| "mean_token_accuracy": 0.9742748856544494, |
| "epoch": 0.3088235294117647, |
| "step": 105 |
| }, |
| { |
| "loss": 0.07408615350723266, |
| "grad_norm": 2.734375, |
| "learning_rate": 1.786274509803922e-05, |
| "entropy": 0.10024651288986205, |
| "num_tokens": 68995.0, |
| "mean_token_accuracy": 0.9773713290691376, |
| "epoch": 0.3235294117647059, |
| "step": 110 |
| }, |
| { |
| "loss": 0.08644189834594726, |
| "grad_norm": 6.71875, |
| "learning_rate": 1.776470588235294e-05, |
| "entropy": 0.09930562153458596, |
| "num_tokens": 72160.0, |
| "mean_token_accuracy": 0.9748322486877441, |
| "epoch": 0.3382352941176471, |
| "step": 115 |
| }, |
| { |
| "loss": 0.11685197353363037, |
| "grad_norm": 10.3125, |
| "learning_rate": 1.7666666666666668e-05, |
| "entropy": 0.11419346779584885, |
| "num_tokens": 75262.0, |
| "mean_token_accuracy": 0.9695464611053467, |
| "epoch": 0.35294117647058826, |
| "step": 120 |
| }, |
| { |
| "loss": 0.10757300853729249, |
| "grad_norm": 8.9375, |
| "learning_rate": 1.7568627450980392e-05, |
| "entropy": 0.12836654633283615, |
| "num_tokens": 78384.0, |
| "mean_token_accuracy": 0.9728550255298615, |
| "epoch": 0.36764705882352944, |
| "step": 125 |
| }, |
| { |
| "loss": 0.07711289525032043, |
| "grad_norm": 3.015625, |
| "learning_rate": 1.747058823529412e-05, |
| "entropy": 0.10070741027593613, |
| "num_tokens": 81583.0, |
| "mean_token_accuracy": 0.9778402209281921, |
| "epoch": 0.38235294117647056, |
| "step": 130 |
| }, |
| { |
| "loss": 0.08512116074562073, |
| "grad_norm": 5.375, |
| "learning_rate": 1.7372549019607845e-05, |
| "entropy": 0.09163436144590378, |
| "num_tokens": 84729.0, |
| "mean_token_accuracy": 0.9748329102993012, |
| "epoch": 0.39705882352941174, |
| "step": 135 |
| }, |
| { |
| "loss": 0.09534031748771668, |
| "grad_norm": 3.40625, |
| "learning_rate": 1.7274509803921572e-05, |
| "entropy": 0.09555450975894927, |
| "num_tokens": 87916.0, |
| "mean_token_accuracy": 0.9727975726127625, |
| "epoch": 0.4117647058823529, |
| "step": 140 |
| }, |
| { |
| "loss": 0.0699828803539276, |
| "grad_norm": 2.828125, |
| "learning_rate": 1.7176470588235293e-05, |
| "entropy": 0.089533219486475, |
| "num_tokens": 90982.0, |
| "mean_token_accuracy": 0.9772566497325897, |
| "epoch": 0.4264705882352941, |
| "step": 145 |
| }, |
| { |
| "loss": 0.06004565954208374, |
| "grad_norm": 4.28125, |
| "learning_rate": 1.707843137254902e-05, |
| "entropy": 0.07979470491409302, |
| "num_tokens": 94197.0, |
| "mean_token_accuracy": 0.980064970254898, |
| "epoch": 0.4411764705882353, |
| "step": 150 |
| }, |
| { |
| "loss": 0.07095102667808532, |
| "grad_norm": 3.8125, |
| "learning_rate": 1.6980392156862745e-05, |
| "entropy": 0.07709958106279373, |
| "num_tokens": 97332.0, |
| "mean_token_accuracy": 0.9785419166088104, |
| "epoch": 0.45588235294117646, |
| "step": 155 |
| }, |
| { |
| "loss": 0.05590643882751465, |
| "grad_norm": 1.671875, |
| "learning_rate": 1.6882352941176473e-05, |
| "entropy": 0.07423891946673393, |
| "num_tokens": 100515.0, |
| "mean_token_accuracy": 0.9827289760112763, |
| "epoch": 0.47058823529411764, |
| "step": 160 |
| }, |
| { |
| "loss": 0.06335585117340088, |
| "grad_norm": 2.390625, |
| "learning_rate": 1.6784313725490198e-05, |
| "entropy": 0.08311136476695538, |
| "num_tokens": 103630.0, |
| "mean_token_accuracy": 0.9795481741428376, |
| "epoch": 0.4852941176470588, |
| "step": 165 |
| }, |
| { |
| "loss": 0.06994503140449523, |
| "grad_norm": 3.625, |
| "learning_rate": 1.6686274509803922e-05, |
| "entropy": 0.07972728088498116, |
| "num_tokens": 106741.0, |
| "mean_token_accuracy": 0.9786823868751526, |
| "epoch": 0.5, |
| "step": 170 |
| }, |
| { |
| "loss": 0.047742915153503415, |
| "grad_norm": 5.71875, |
| "learning_rate": 1.658823529411765e-05, |
| "entropy": 0.059984054416418076, |
| "num_tokens": 109921.0, |
| "mean_token_accuracy": 0.9847357928752899, |
| "epoch": 0.5147058823529411, |
| "step": 175 |
| }, |
| { |
| "loss": 0.05979984998703003, |
| "grad_norm": 7.0625, |
| "learning_rate": 1.6490196078431374e-05, |
| "entropy": 0.06703888289630414, |
| "num_tokens": 112994.0, |
| "mean_token_accuracy": 0.9824592292308807, |
| "epoch": 0.5294117647058824, |
| "step": 180 |
| }, |
| { |
| "loss": 0.04938005805015564, |
| "grad_norm": 2.90625, |
| "learning_rate": 1.63921568627451e-05, |
| "entropy": 0.054279588535428046, |
| "num_tokens": 116201.0, |
| "mean_token_accuracy": 0.9846667230129242, |
| "epoch": 0.5441176470588235, |
| "step": 185 |
| }, |
| { |
| "loss": 0.06785057783126831, |
| "grad_norm": 7.4375, |
| "learning_rate": 1.6294117647058826e-05, |
| "entropy": 0.06177988387644291, |
| "num_tokens": 119381.0, |
| "mean_token_accuracy": 0.9796367406845092, |
| "epoch": 0.5588235294117647, |
| "step": 190 |
| }, |
| { |
| "loss": 0.05383546352386474, |
| "grad_norm": 5.40625, |
| "learning_rate": 1.619607843137255e-05, |
| "entropy": 0.0636073287576437, |
| "num_tokens": 122517.0, |
| "mean_token_accuracy": 0.9798873722553253, |
| "epoch": 0.5735294117647058, |
| "step": 195 |
| }, |
| { |
| "loss": 0.0490637868642807, |
| "grad_norm": 1.96875, |
| "learning_rate": 1.6098039215686275e-05, |
| "entropy": 0.0639917254447937, |
| "num_tokens": 125663.0, |
| "mean_token_accuracy": 0.9849890351295472, |
| "epoch": 0.5882352941176471, |
| "step": 200 |
| }, |
| { |
| "loss": 0.06412197351455688, |
| "grad_norm": 6.84375, |
| "learning_rate": 1.6000000000000003e-05, |
| "entropy": 0.06784685887396336, |
| "num_tokens": 128856.0, |
| "mean_token_accuracy": 0.9818105876445771, |
| "epoch": 0.6029411764705882, |
| "step": 205 |
| }, |
| { |
| "loss": 0.04346465170383453, |
| "grad_norm": 4.375, |
| "learning_rate": 1.5901960784313727e-05, |
| "entropy": 0.06049864292144776, |
| "num_tokens": 131995.0, |
| "mean_token_accuracy": 0.9882112145423889, |
| "epoch": 0.6176470588235294, |
| "step": 210 |
| }, |
| { |
| "loss": 0.04320838153362274, |
| "grad_norm": 2.015625, |
| "learning_rate": 1.580392156862745e-05, |
| "entropy": 0.047596517577767374, |
| "num_tokens": 135181.0, |
| "mean_token_accuracy": 0.985132920742035, |
| "epoch": 0.6323529411764706, |
| "step": 215 |
| }, |
| { |
| "loss": 0.06799347996711731, |
| "grad_norm": 8.5625, |
| "learning_rate": 1.570588235294118e-05, |
| "entropy": 0.06635901145637035, |
| "num_tokens": 138254.0, |
| "mean_token_accuracy": 0.9791639804840088, |
| "epoch": 0.6470588235294118, |
| "step": 220 |
| }, |
| { |
| "loss": 0.041108173131942746, |
| "grad_norm": 2.859375, |
| "learning_rate": 1.5607843137254904e-05, |
| "entropy": 0.051696383953094484, |
| "num_tokens": 141381.0, |
| "mean_token_accuracy": 0.9862416744232178, |
| "epoch": 0.6617647058823529, |
| "step": 225 |
| }, |
| { |
| "loss": 0.045146191120147706, |
| "grad_norm": 3.078125, |
| "learning_rate": 1.5509803921568628e-05, |
| "entropy": 0.055339107289910316, |
| "num_tokens": 144583.0, |
| "mean_token_accuracy": 0.9822882294654847, |
| "epoch": 0.6764705882352942, |
| "step": 230 |
| }, |
| { |
| "loss": 0.04143168330192566, |
| "grad_norm": 1.578125, |
| "learning_rate": 1.5411764705882356e-05, |
| "entropy": 0.05063906572759151, |
| "num_tokens": 147764.0, |
| "mean_token_accuracy": 0.9831606447696686, |
| "epoch": 0.6911764705882353, |
| "step": 235 |
| }, |
| { |
| "loss": 0.03947827816009521, |
| "grad_norm": 1.9921875, |
| "learning_rate": 1.531372549019608e-05, |
| "entropy": 0.05209046043455601, |
| "num_tokens": 150961.0, |
| "mean_token_accuracy": 0.9848346650600434, |
| "epoch": 0.7058823529411765, |
| "step": 240 |
| }, |
| { |
| "loss": 0.034212198853492734, |
| "grad_norm": 1.8984375, |
| "learning_rate": 1.5215686274509804e-05, |
| "entropy": 0.04912327118217945, |
| "num_tokens": 154174.0, |
| "mean_token_accuracy": 0.9855735838413239, |
| "epoch": 0.7205882352941176, |
| "step": 245 |
| }, |
| { |
| "loss": 0.03223183453083038, |
| "grad_norm": 1.7265625, |
| "learning_rate": 1.511764705882353e-05, |
| "entropy": 0.045325061306357384, |
| "num_tokens": 157374.0, |
| "mean_token_accuracy": 0.9866909861564637, |
| "epoch": 0.7352941176470589, |
| "step": 250 |
| }, |
| { |
| "loss": 0.04085415601730347, |
| "grad_norm": 2.625, |
| "learning_rate": 1.5019607843137257e-05, |
| "entropy": 0.045074894279241565, |
| "num_tokens": 160519.0, |
| "mean_token_accuracy": 0.9865182876586914, |
| "epoch": 0.75, |
| "step": 255 |
| }, |
| { |
| "loss": 0.03927797079086304, |
| "grad_norm": 2.671875, |
| "learning_rate": 1.4921568627450983e-05, |
| "entropy": 0.039533843845129014, |
| "num_tokens": 163756.0, |
| "mean_token_accuracy": 0.9872985363006592, |
| "epoch": 0.7647058823529411, |
| "step": 260 |
| }, |
| { |
| "loss": 0.042234039306640624, |
| "grad_norm": 1.7109375, |
| "learning_rate": 1.4823529411764707e-05, |
| "entropy": 0.043326519429683685, |
| "num_tokens": 166884.0, |
| "mean_token_accuracy": 0.9839499652385711, |
| "epoch": 0.7794117647058824, |
| "step": 265 |
| }, |
| { |
| "loss": 0.04218446910381317, |
| "grad_norm": 3.671875, |
| "learning_rate": 1.4725490196078433e-05, |
| "entropy": 0.05446031875908375, |
| "num_tokens": 170021.0, |
| "mean_token_accuracy": 0.983331423997879, |
| "epoch": 0.7941176470588235, |
| "step": 270 |
| }, |
| { |
| "loss": 0.031345850229263304, |
| "grad_norm": 1.375, |
| "learning_rate": 1.4627450980392157e-05, |
| "entropy": 0.044994413107633593, |
| "num_tokens": 173138.0, |
| "mean_token_accuracy": 0.9864144027233124, |
| "epoch": 0.8088235294117647, |
| "step": 275 |
| }, |
| { |
| "loss": 0.03718245923519135, |
| "grad_norm": 2.03125, |
| "learning_rate": 1.4529411764705883e-05, |
| "entropy": 0.04372772537171841, |
| "num_tokens": 176269.0, |
| "mean_token_accuracy": 0.9855779051780701, |
| "epoch": 0.8235294117647058, |
| "step": 280 |
| }, |
| { |
| "loss": 0.038416677713394166, |
| "grad_norm": 3.234375, |
| "learning_rate": 1.443137254901961e-05, |
| "entropy": 0.04306882936507463, |
| "num_tokens": 179436.0, |
| "mean_token_accuracy": 0.9847787022590637, |
| "epoch": 0.8382352941176471, |
| "step": 285 |
| }, |
| { |
| "loss": 0.03612026274204254, |
| "grad_norm": 4.28125, |
| "learning_rate": 1.4333333333333334e-05, |
| "entropy": 0.04190887995064259, |
| "num_tokens": 182619.0, |
| "mean_token_accuracy": 0.9853791892528534, |
| "epoch": 0.8529411764705882, |
| "step": 290 |
| }, |
| { |
| "loss": 0.03549243807792664, |
| "grad_norm": 1.5546875, |
| "learning_rate": 1.423529411764706e-05, |
| "entropy": 0.041007821820676325, |
| "num_tokens": 185835.0, |
| "mean_token_accuracy": 0.987481951713562, |
| "epoch": 0.8676470588235294, |
| "step": 295 |
| }, |
| { |
| "loss": 0.03658969700336456, |
| "grad_norm": 1.9921875, |
| "learning_rate": 1.4137254901960786e-05, |
| "entropy": 0.03911938704550266, |
| "num_tokens": 189059.0, |
| "mean_token_accuracy": 0.9859034955501557, |
| "epoch": 0.8823529411764706, |
| "step": 300 |
| }, |
| { |
| "loss": 0.03189299702644348, |
| "grad_norm": 1.3984375, |
| "learning_rate": 1.403921568627451e-05, |
| "entropy": 0.04015427939593792, |
| "num_tokens": 192245.0, |
| "mean_token_accuracy": 0.9858013272285462, |
| "epoch": 0.8970588235294118, |
| "step": 305 |
| }, |
| { |
| "loss": 0.04162760376930237, |
| "grad_norm": 4.6875, |
| "learning_rate": 1.3941176470588236e-05, |
| "entropy": 0.04337671361863613, |
| "num_tokens": 195334.0, |
| "mean_token_accuracy": 0.9834910809993744, |
| "epoch": 0.9117647058823529, |
| "step": 310 |
| }, |
| { |
| "loss": 0.03357888162136078, |
| "grad_norm": 1.515625, |
| "learning_rate": 1.384313725490196e-05, |
| "entropy": 0.043437547981739044, |
| "num_tokens": 198482.0, |
| "mean_token_accuracy": 0.9839794993400574, |
| "epoch": 0.9264705882352942, |
| "step": 315 |
| }, |
| { |
| "loss": 0.03252431154251099, |
| "grad_norm": 2.390625, |
| "learning_rate": 1.3745098039215687e-05, |
| "entropy": 0.041450836881995204, |
| "num_tokens": 201737.0, |
| "mean_token_accuracy": 0.9883051753044129, |
| "epoch": 0.9411764705882353, |
| "step": 320 |
| }, |
| { |
| "loss": 0.03779064118862152, |
| "grad_norm": 2.953125, |
| "learning_rate": 1.3647058823529413e-05, |
| "entropy": 0.03566624131053686, |
| "num_tokens": 204889.0, |
| "mean_token_accuracy": 0.9875539124011994, |
| "epoch": 0.9558823529411765, |
| "step": 325 |
| }, |
| { |
| "loss": 0.0329700767993927, |
| "grad_norm": 2.15625, |
| "learning_rate": 1.3549019607843139e-05, |
| "entropy": 0.03808465227484703, |
| "num_tokens": 208114.0, |
| "mean_token_accuracy": 0.986751276254654, |
| "epoch": 0.9705882352941176, |
| "step": 330 |
| }, |
| { |
| "loss": 0.031173259019851685, |
| "grad_norm": 1.546875, |
| "learning_rate": 1.3450980392156865e-05, |
| "entropy": 0.04065078347921371, |
| "num_tokens": 211217.0, |
| "mean_token_accuracy": 0.9860772728919983, |
| "epoch": 0.9852941176470589, |
| "step": 335 |
| }, |
| { |
| "loss": 0.03390420079231262, |
| "grad_norm": 1.515625, |
| "learning_rate": 1.3352941176470588e-05, |
| "entropy": 0.04108036197721958, |
| "num_tokens": 214368.0, |
| "mean_token_accuracy": 0.9871271908283233, |
| "epoch": 1.0, |
| "step": 340 |
| }, |
| { |
| "loss": 0.03671025633811951, |
| "grad_norm": 1.5625, |
| "learning_rate": 1.3254901960784314e-05, |
| "entropy": 0.04091338850557804, |
| "num_tokens": 217480.0, |
| "mean_token_accuracy": 0.9861762046813964, |
| "epoch": 1.0147058823529411, |
| "step": 345 |
| }, |
| { |
| "loss": 0.030594143271446227, |
| "grad_norm": 1.5546875, |
| "learning_rate": 1.315686274509804e-05, |
| "entropy": 0.040245630964636805, |
| "num_tokens": 220615.0, |
| "mean_token_accuracy": 0.9881528139114379, |
| "epoch": 1.0294117647058822, |
| "step": 350 |
| }, |
| { |
| "loss": 0.027347692847251893, |
| "grad_norm": 1.7734375, |
| "learning_rate": 1.3058823529411766e-05, |
| "entropy": 0.03420254942029714, |
| "num_tokens": 223751.0, |
| "mean_token_accuracy": 0.989202469587326, |
| "epoch": 1.0441176470588236, |
| "step": 355 |
| }, |
| { |
| "loss": 0.03148679435253143, |
| "grad_norm": 1.9609375, |
| "learning_rate": 1.2960784313725492e-05, |
| "entropy": 0.03210772704333067, |
| "num_tokens": 226948.0, |
| "mean_token_accuracy": 0.9868246436119079, |
| "epoch": 1.0588235294117647, |
| "step": 360 |
| }, |
| { |
| "loss": 0.031260594725608826, |
| "grad_norm": 1.8046875, |
| "learning_rate": 1.2862745098039218e-05, |
| "entropy": 0.033671201393008235, |
| "num_tokens": 230088.0, |
| "mean_token_accuracy": 0.9856015264987945, |
| "epoch": 1.0735294117647058, |
| "step": 365 |
| }, |
| { |
| "loss": 0.028061491250991822, |
| "grad_norm": 1.2890625, |
| "learning_rate": 1.276470588235294e-05, |
| "entropy": 0.03639122284948826, |
| "num_tokens": 233247.0, |
| "mean_token_accuracy": 0.9885319888591766, |
| "epoch": 1.088235294117647, |
| "step": 370 |
| }, |
| { |
| "loss": 0.0304165780544281, |
| "grad_norm": 2.203125, |
| "learning_rate": 1.2666666666666667e-05, |
| "entropy": 0.03107942212373018, |
| "num_tokens": 236423.0, |
| "mean_token_accuracy": 0.9864429414272309, |
| "epoch": 1.1029411764705883, |
| "step": 375 |
| }, |
| { |
| "loss": 0.028667458891868593, |
| "grad_norm": 1.4453125, |
| "learning_rate": 1.2568627450980393e-05, |
| "entropy": 0.03269361965358257, |
| "num_tokens": 239698.0, |
| "mean_token_accuracy": 0.9882214546203614, |
| "epoch": 1.1176470588235294, |
| "step": 380 |
| }, |
| { |
| "loss": 0.03024893403053284, |
| "grad_norm": 1.4375, |
| "learning_rate": 1.2470588235294119e-05, |
| "entropy": 0.036648140475153926, |
| "num_tokens": 242904.0, |
| "mean_token_accuracy": 0.9854198694229126, |
| "epoch": 1.1323529411764706, |
| "step": 385 |
| }, |
| { |
| "loss": 0.03237654864788055, |
| "grad_norm": 1.140625, |
| "learning_rate": 1.2372549019607845e-05, |
| "entropy": 0.036488327011466024, |
| "num_tokens": 246044.0, |
| "mean_token_accuracy": 0.9868141651153565, |
| "epoch": 1.1470588235294117, |
| "step": 390 |
| }, |
| { |
| "loss": 0.026534423232078552, |
| "grad_norm": 1.2890625, |
| "learning_rate": 1.2274509803921571e-05, |
| "entropy": 0.03317699953913689, |
| "num_tokens": 249199.0, |
| "mean_token_accuracy": 0.9891056835651397, |
| "epoch": 1.161764705882353, |
| "step": 395 |
| }, |
| { |
| "loss": 0.02918187975883484, |
| "grad_norm": 1.546875, |
| "learning_rate": 1.2176470588235294e-05, |
| "entropy": 0.033053198270499705, |
| "num_tokens": 252416.0, |
| "mean_token_accuracy": 0.9872093260288238, |
| "epoch": 1.1764705882352942, |
| "step": 400 |
| }, |
| { |
| "loss": 0.027815410494804384, |
| "grad_norm": 1.5, |
| "learning_rate": 1.207843137254902e-05, |
| "entropy": 0.03630108144134283, |
| "num_tokens": 255505.0, |
| "mean_token_accuracy": 0.9886294066905975, |
| "epoch": 1.1911764705882353, |
| "step": 405 |
| }, |
| { |
| "loss": 0.029119834303855896, |
| "grad_norm": 1.640625, |
| "learning_rate": 1.1980392156862746e-05, |
| "entropy": 0.0321140518411994, |
| "num_tokens": 258679.0, |
| "mean_token_accuracy": 0.9888967990875244, |
| "epoch": 1.2058823529411764, |
| "step": 410 |
| }, |
| { |
| "loss": 0.025961104035377502, |
| "grad_norm": 1.8203125, |
| "learning_rate": 1.1882352941176472e-05, |
| "entropy": 0.02944366242736578, |
| "num_tokens": 261856.0, |
| "mean_token_accuracy": 0.9895209610462189, |
| "epoch": 1.2205882352941178, |
| "step": 415 |
| }, |
| { |
| "loss": 0.03058839440345764, |
| "grad_norm": 2.390625, |
| "learning_rate": 1.1784313725490198e-05, |
| "entropy": 0.03461700212210417, |
| "num_tokens": 264960.0, |
| "mean_token_accuracy": 0.9882765769958496, |
| "epoch": 1.2352941176470589, |
| "step": 420 |
| }, |
| { |
| "loss": 0.028424999117851256, |
| "grad_norm": 1.28125, |
| "learning_rate": 1.1686274509803922e-05, |
| "entropy": 0.02985447719693184, |
| "num_tokens": 268114.0, |
| "mean_token_accuracy": 0.9882177650928498, |
| "epoch": 1.25, |
| "step": 425 |
| }, |
| { |
| "loss": 0.03086719512939453, |
| "grad_norm": 2.265625, |
| "learning_rate": 1.1588235294117648e-05, |
| "entropy": 0.03250212036073208, |
| "num_tokens": 271274.0, |
| "mean_token_accuracy": 0.9888392806053161, |
| "epoch": 1.2647058823529411, |
| "step": 430 |
| }, |
| { |
| "loss": 0.027977922558784486, |
| "grad_norm": 1.3046875, |
| "learning_rate": 1.1490196078431373e-05, |
| "entropy": 0.034127247892320155, |
| "num_tokens": 274452.0, |
| "mean_token_accuracy": 0.9908244907855988, |
| "epoch": 1.2794117647058822, |
| "step": 435 |
| }, |
| { |
| "loss": 0.02676369547843933, |
| "grad_norm": 1.09375, |
| "learning_rate": 1.1392156862745099e-05, |
| "entropy": 0.03699512742459774, |
| "num_tokens": 277562.0, |
| "mean_token_accuracy": 0.9871235430240631, |
| "epoch": 1.2941176470588236, |
| "step": 440 |
| }, |
| { |
| "loss": 0.02789466977119446, |
| "grad_norm": 2.203125, |
| "learning_rate": 1.1294117647058825e-05, |
| "entropy": 0.03514884728938341, |
| "num_tokens": 280635.0, |
| "mean_token_accuracy": 0.990158212184906, |
| "epoch": 1.3088235294117647, |
| "step": 445 |
| }, |
| { |
| "loss": 0.03088509142398834, |
| "grad_norm": 1.8359375, |
| "learning_rate": 1.119607843137255e-05, |
| "entropy": 0.034746605530381204, |
| "num_tokens": 283725.0, |
| "mean_token_accuracy": 0.9876766622066497, |
| "epoch": 1.3235294117647058, |
| "step": 450 |
| }, |
| { |
| "loss": 0.03232976496219635, |
| "grad_norm": 1.734375, |
| "learning_rate": 1.1098039215686275e-05, |
| "entropy": 0.031742793321609494, |
| "num_tokens": 286888.0, |
| "mean_token_accuracy": 0.9871384859085083, |
| "epoch": 1.3382352941176472, |
| "step": 455 |
| }, |
| { |
| "loss": 0.02845146059989929, |
| "grad_norm": 2.0, |
| "learning_rate": 1.1000000000000001e-05, |
| "entropy": 0.03175645042210817, |
| "num_tokens": 290064.0, |
| "mean_token_accuracy": 0.9873914003372193, |
| "epoch": 1.3529411764705883, |
| "step": 460 |
| }, |
| { |
| "loss": 0.029486137628555297, |
| "grad_norm": 1.265625, |
| "learning_rate": 1.0901960784313726e-05, |
| "entropy": 0.03463620245456696, |
| "num_tokens": 293189.0, |
| "mean_token_accuracy": 0.9874814569950103, |
| "epoch": 1.3676470588235294, |
| "step": 465 |
| }, |
| { |
| "loss": 0.02618069648742676, |
| "grad_norm": 1.109375, |
| "learning_rate": 1.0803921568627452e-05, |
| "entropy": 0.033889508619904515, |
| "num_tokens": 296268.0, |
| "mean_token_accuracy": 0.9882802128791809, |
| "epoch": 1.3823529411764706, |
| "step": 470 |
| }, |
| { |
| "loss": 0.025544488430023195, |
| "grad_norm": 0.8984375, |
| "learning_rate": 1.0705882352941178e-05, |
| "entropy": 0.03317532502114773, |
| "num_tokens": 299418.0, |
| "mean_token_accuracy": 0.9891822457313537, |
| "epoch": 1.3970588235294117, |
| "step": 475 |
| }, |
| { |
| "loss": 0.02922942042350769, |
| "grad_norm": 1.5859375, |
| "learning_rate": 1.0607843137254902e-05, |
| "entropy": 0.03228537701070309, |
| "num_tokens": 302608.0, |
| "mean_token_accuracy": 0.9864252746105194, |
| "epoch": 1.4117647058823528, |
| "step": 480 |
| }, |
| { |
| "loss": 0.025081342458724974, |
| "grad_norm": 1.4140625, |
| "learning_rate": 1.0509803921568628e-05, |
| "entropy": 0.033559339493513106, |
| "num_tokens": 305748.0, |
| "mean_token_accuracy": 0.9891697466373444, |
| "epoch": 1.4264705882352942, |
| "step": 485 |
| }, |
| { |
| "loss": 0.028987354040145873, |
| "grad_norm": 1.2109375, |
| "learning_rate": 1.0411764705882354e-05, |
| "entropy": 0.029655468463897706, |
| "num_tokens": 308946.0, |
| "mean_token_accuracy": 0.9884015321731567, |
| "epoch": 1.4411764705882353, |
| "step": 490 |
| }, |
| { |
| "loss": 0.022376981377601624, |
| "grad_norm": 1.5859375, |
| "learning_rate": 1.031372549019608e-05, |
| "entropy": 0.030257853865623473, |
| "num_tokens": 312060.0, |
| "mean_token_accuracy": 0.990349942445755, |
| "epoch": 1.4558823529411764, |
| "step": 495 |
| }, |
| { |
| "loss": 0.027941384911537172, |
| "grad_norm": 1.2734375, |
| "learning_rate": 1.0215686274509805e-05, |
| "entropy": 0.029427625238895416, |
| "num_tokens": 315202.0, |
| "mean_token_accuracy": 0.9894903540611267, |
| "epoch": 1.4705882352941178, |
| "step": 500 |
| }, |
| { |
| "loss": 0.02513147294521332, |
| "grad_norm": 1.8828125, |
| "learning_rate": 1.011764705882353e-05, |
| "entropy": 0.029220272414386274, |
| "num_tokens": 318423.0, |
| "mean_token_accuracy": 0.9887598037719727, |
| "epoch": 1.4852941176470589, |
| "step": 505 |
| }, |
| { |
| "loss": 0.024520005285739898, |
| "grad_norm": 1.3515625, |
| "learning_rate": 1.0019607843137255e-05, |
| "entropy": 0.027622674778103828, |
| "num_tokens": 321643.0, |
| "mean_token_accuracy": 0.9881017684936524, |
| "epoch": 1.5, |
| "step": 510 |
| }, |
| { |
| "loss": 0.022774545848369597, |
| "grad_norm": 0.96875, |
| "learning_rate": 9.921568627450981e-06, |
| "entropy": 0.027344943769276143, |
| "num_tokens": 324896.0, |
| "mean_token_accuracy": 0.9891824662685395, |
| "epoch": 1.5147058823529411, |
| "step": 515 |
| }, |
| { |
| "loss": 0.026902440190315246, |
| "grad_norm": 1.34375, |
| "learning_rate": 9.823529411764706e-06, |
| "entropy": 0.03210813459008932, |
| "num_tokens": 327953.0, |
| "mean_token_accuracy": 0.9872022986412048, |
| "epoch": 1.5294117647058822, |
| "step": 520 |
| }, |
| { |
| "loss": 0.02404342144727707, |
| "grad_norm": 1.34375, |
| "learning_rate": 9.725490196078432e-06, |
| "entropy": 0.03047515023499727, |
| "num_tokens": 331110.0, |
| "mean_token_accuracy": 0.9887873768806458, |
| "epoch": 1.5441176470588234, |
| "step": 525 |
| }, |
| { |
| "loss": 0.022797247767448424, |
| "grad_norm": 1.2265625, |
| "learning_rate": 9.627450980392158e-06, |
| "entropy": 0.03160413987934589, |
| "num_tokens": 334226.0, |
| "mean_token_accuracy": 0.9889481067657471, |
| "epoch": 1.5588235294117647, |
| "step": 530 |
| }, |
| { |
| "loss": 0.023706996440887453, |
| "grad_norm": 1.078125, |
| "learning_rate": 9.529411764705882e-06, |
| "entropy": 0.0283035334199667, |
| "num_tokens": 337371.0, |
| "mean_token_accuracy": 0.9890589594841004, |
| "epoch": 1.5735294117647058, |
| "step": 535 |
| }, |
| { |
| "loss": 0.023340512812137604, |
| "grad_norm": 2.5625, |
| "learning_rate": 9.431372549019608e-06, |
| "entropy": 0.029125319607555867, |
| "num_tokens": 340563.0, |
| "mean_token_accuracy": 0.9882973015308381, |
| "epoch": 1.5882352941176472, |
| "step": 540 |
| }, |
| { |
| "loss": 0.025814762711524962, |
| "grad_norm": 1.8046875, |
| "learning_rate": 9.333333333333334e-06, |
| "entropy": 0.029474343173205853, |
| "num_tokens": 343715.0, |
| "mean_token_accuracy": 0.9888520836830139, |
| "epoch": 1.6029411764705883, |
| "step": 545 |
| }, |
| { |
| "loss": 0.024609880149364473, |
| "grad_norm": 1.359375, |
| "learning_rate": 9.23529411764706e-06, |
| "entropy": 0.02793533504009247, |
| "num_tokens": 346928.0, |
| "mean_token_accuracy": 0.9896528542041778, |
| "epoch": 1.6176470588235294, |
| "step": 550 |
| }, |
| { |
| "loss": 0.024091285467147828, |
| "grad_norm": 1.171875, |
| "learning_rate": 9.137254901960785e-06, |
| "entropy": 0.03169798478484154, |
| "num_tokens": 349942.0, |
| "mean_token_accuracy": 0.9896469593048096, |
| "epoch": 1.6323529411764706, |
| "step": 555 |
| }, |
| { |
| "loss": 0.022402273118495943, |
| "grad_norm": 1.3203125, |
| "learning_rate": 9.03921568627451e-06, |
| "entropy": 0.02854564245790243, |
| "num_tokens": 353063.0, |
| "mean_token_accuracy": 0.9894876420497895, |
| "epoch": 1.6470588235294117, |
| "step": 560 |
| }, |
| { |
| "loss": 0.023489847779273987, |
| "grad_norm": 1.8359375, |
| "learning_rate": 8.941176470588237e-06, |
| "entropy": 0.028600608371198176, |
| "num_tokens": 356180.0, |
| "mean_token_accuracy": 0.9890201330184937, |
| "epoch": 1.6617647058823528, |
| "step": 565 |
| }, |
| { |
| "loss": 0.02147035002708435, |
| "grad_norm": 1.0859375, |
| "learning_rate": 8.843137254901961e-06, |
| "entropy": 0.026650307327508928, |
| "num_tokens": 359351.0, |
| "mean_token_accuracy": 0.9898578941822052, |
| "epoch": 1.6764705882352942, |
| "step": 570 |
| }, |
| { |
| "loss": 0.022052311897277833, |
| "grad_norm": 1.3515625, |
| "learning_rate": 8.745098039215687e-06, |
| "entropy": 0.027873093821108343, |
| "num_tokens": 362470.0, |
| "mean_token_accuracy": 0.989058256149292, |
| "epoch": 1.6911764705882353, |
| "step": 575 |
| }, |
| { |
| "loss": 0.023864805698394775, |
| "grad_norm": 1.5859375, |
| "learning_rate": 8.647058823529413e-06, |
| "entropy": 0.027629780396819115, |
| "num_tokens": 365614.0, |
| "mean_token_accuracy": 0.9894056558609009, |
| "epoch": 1.7058823529411766, |
| "step": 580 |
| }, |
| { |
| "loss": 0.027744096517562867, |
| "grad_norm": 1.6875, |
| "learning_rate": 8.549019607843138e-06, |
| "entropy": 0.028794774785637856, |
| "num_tokens": 368805.0, |
| "mean_token_accuracy": 0.9880473792552948, |
| "epoch": 1.7205882352941178, |
| "step": 585 |
| }, |
| { |
| "loss": 0.021863000094890596, |
| "grad_norm": 1.1796875, |
| "learning_rate": 8.450980392156864e-06, |
| "entropy": 0.028252063691616057, |
| "num_tokens": 371947.0, |
| "mean_token_accuracy": 0.9904429137706756, |
| "epoch": 1.7352941176470589, |
| "step": 590 |
| }, |
| { |
| "loss": 0.021520544588565827, |
| "grad_norm": 1.3203125, |
| "learning_rate": 8.35294117647059e-06, |
| "entropy": 0.028264945745468138, |
| "num_tokens": 375103.0, |
| "mean_token_accuracy": 0.9904776751995087, |
| "epoch": 1.75, |
| "step": 595 |
| }, |
| { |
| "loss": 0.026353719830513, |
| "grad_norm": 1.1953125, |
| "learning_rate": 8.254901960784314e-06, |
| "entropy": 0.027113928645849227, |
| "num_tokens": 378317.0, |
| "mean_token_accuracy": 0.9884898960590363, |
| "epoch": 1.7647058823529411, |
| "step": 600 |
| }, |
| { |
| "loss": 0.026097461581230164, |
| "grad_norm": 1.421875, |
| "learning_rate": 8.15686274509804e-06, |
| "entropy": 0.028313294425606726, |
| "num_tokens": 381417.0, |
| "mean_token_accuracy": 0.9879869103431702, |
| "epoch": 1.7794117647058822, |
| "step": 605 |
| }, |
| { |
| "loss": 0.02049378156661987, |
| "grad_norm": 1.0546875, |
| "learning_rate": 8.058823529411766e-06, |
| "entropy": 0.026570411399006844, |
| "num_tokens": 384632.0, |
| "mean_token_accuracy": 0.9887495577335358, |
| "epoch": 1.7941176470588234, |
| "step": 610 |
| }, |
| { |
| "loss": 0.022221173346042632, |
| "grad_norm": 1.1171875, |
| "learning_rate": 7.96078431372549e-06, |
| "entropy": 0.02754255346953869, |
| "num_tokens": 387836.0, |
| "mean_token_accuracy": 0.9899809181690216, |
| "epoch": 1.8088235294117647, |
| "step": 615 |
| }, |
| { |
| "loss": 0.023856499791145326, |
| "grad_norm": 1.3203125, |
| "learning_rate": 7.862745098039217e-06, |
| "entropy": 0.031241112016141416, |
| "num_tokens": 390887.0, |
| "mean_token_accuracy": 0.9897979915142059, |
| "epoch": 1.8235294117647058, |
| "step": 620 |
| }, |
| { |
| "loss": 0.0225734680891037, |
| "grad_norm": 1.40625, |
| "learning_rate": 7.764705882352941e-06, |
| "entropy": 0.02798519879579544, |
| "num_tokens": 394027.0, |
| "mean_token_accuracy": 0.9890839040279389, |
| "epoch": 1.8382352941176472, |
| "step": 625 |
| }, |
| { |
| "loss": 0.022729092836380006, |
| "grad_norm": 1.25, |
| "learning_rate": 7.666666666666667e-06, |
| "entropy": 0.02719390895217657, |
| "num_tokens": 397202.0, |
| "mean_token_accuracy": 0.9886514127254487, |
| "epoch": 1.8529411764705883, |
| "step": 630 |
| }, |
| { |
| "loss": 0.021688875555992127, |
| "grad_norm": 1.0859375, |
| "learning_rate": 7.5686274509803925e-06, |
| "entropy": 0.027222988195717335, |
| "num_tokens": 400378.0, |
| "mean_token_accuracy": 0.9908071339130402, |
| "epoch": 1.8676470588235294, |
| "step": 635 |
| }, |
| { |
| "loss": 0.023884420096874238, |
| "grad_norm": 1.4296875, |
| "learning_rate": 7.4705882352941185e-06, |
| "entropy": 0.028057356551289558, |
| "num_tokens": 403503.0, |
| "mean_token_accuracy": 0.9900456726551056, |
| "epoch": 1.8823529411764706, |
| "step": 640 |
| }, |
| { |
| "loss": 0.020375268161296846, |
| "grad_norm": 1.6953125, |
| "learning_rate": 7.372549019607845e-06, |
| "entropy": 0.02543655373156071, |
| "num_tokens": 406768.0, |
| "mean_token_accuracy": 0.9911065042018891, |
| "epoch": 1.8970588235294117, |
| "step": 645 |
| }, |
| { |
| "loss": 0.020015493035316467, |
| "grad_norm": 1.7421875, |
| "learning_rate": 7.274509803921569e-06, |
| "entropy": 0.027230485714972018, |
| "num_tokens": 409875.0, |
| "mean_token_accuracy": 0.9906234502792358, |
| "epoch": 1.9117647058823528, |
| "step": 650 |
| }, |
| { |
| "loss": 0.022530680894851683, |
| "grad_norm": 1.421875, |
| "learning_rate": 7.176470588235295e-06, |
| "entropy": 0.028223772905766963, |
| "num_tokens": 412987.0, |
| "mean_token_accuracy": 0.9903216242790223, |
| "epoch": 1.9264705882352942, |
| "step": 655 |
| }, |
| { |
| "loss": 0.021129874885082243, |
| "grad_norm": 1.109375, |
| "learning_rate": 7.07843137254902e-06, |
| "entropy": 0.02674291282892227, |
| "num_tokens": 416181.0, |
| "mean_token_accuracy": 0.9886639952659607, |
| "epoch": 1.9411764705882353, |
| "step": 660 |
| }, |
| { |
| "loss": 0.021244224905967713, |
| "grad_norm": 0.9453125, |
| "learning_rate": 6.9803921568627454e-06, |
| "entropy": 0.028005971759557723, |
| "num_tokens": 419323.0, |
| "mean_token_accuracy": 0.9905200719833374, |
| "epoch": 1.9558823529411766, |
| "step": 665 |
| }, |
| { |
| "loss": 0.022309188544750214, |
| "grad_norm": 1.375, |
| "learning_rate": 6.8823529411764715e-06, |
| "entropy": 0.027272411435842515, |
| "num_tokens": 422484.0, |
| "mean_token_accuracy": 0.9878733932971955, |
| "epoch": 1.9705882352941178, |
| "step": 670 |
| }, |
| { |
| "loss": 0.022459632158279418, |
| "grad_norm": 1.203125, |
| "learning_rate": 6.784313725490197e-06, |
| "entropy": 0.026817415095865726, |
| "num_tokens": 425583.0, |
| "mean_token_accuracy": 0.9908780753612518, |
| "epoch": 1.9852941176470589, |
| "step": 675 |
| }, |
| { |
| "loss": 0.021811096370220183, |
| "grad_norm": 1.265625, |
| "learning_rate": 6.686274509803922e-06, |
| "entropy": 0.026038615591824056, |
| "num_tokens": 428736.0, |
| "mean_token_accuracy": 0.9897907853126526, |
| "epoch": 2.0, |
| "step": 680 |
| }, |
| { |
| "loss": 0.019171090424060823, |
| "grad_norm": 1.078125, |
| "learning_rate": 6.588235294117647e-06, |
| "entropy": 0.02475190218538046, |
| "num_tokens": 431976.0, |
| "mean_token_accuracy": 0.989355844259262, |
| "epoch": 2.014705882352941, |
| "step": 685 |
| }, |
| { |
| "loss": 0.023474155366420744, |
| "grad_norm": 1.1640625, |
| "learning_rate": 6.490196078431373e-06, |
| "entropy": 0.026115396432578562, |
| "num_tokens": 435142.0, |
| "mean_token_accuracy": 0.9885824680328369, |
| "epoch": 2.0294117647058822, |
| "step": 690 |
| }, |
| { |
| "loss": 0.020176805555820465, |
| "grad_norm": 1.0, |
| "learning_rate": 6.3921568627450984e-06, |
| "entropy": 0.026907235756516455, |
| "num_tokens": 438259.0, |
| "mean_token_accuracy": 0.9919745445251464, |
| "epoch": 2.0441176470588234, |
| "step": 695 |
| }, |
| { |
| "loss": 0.022543656826019286, |
| "grad_norm": 1.34375, |
| "learning_rate": 6.294117647058824e-06, |
| "entropy": 0.02749718502163887, |
| "num_tokens": 441366.0, |
| "mean_token_accuracy": 0.9880188047885895, |
| "epoch": 2.0588235294117645, |
| "step": 700 |
| }, |
| { |
| "loss": 0.019685085117816924, |
| "grad_norm": 0.9453125, |
| "learning_rate": 6.19607843137255e-06, |
| "entropy": 0.024849089048802852, |
| "num_tokens": 444474.0, |
| "mean_token_accuracy": 0.9906105160713196, |
| "epoch": 2.073529411764706, |
| "step": 705 |
| }, |
| { |
| "loss": 0.020225000381469727, |
| "grad_norm": 1.234375, |
| "learning_rate": 6.098039215686276e-06, |
| "entropy": 0.023934758827090265, |
| "num_tokens": 447652.0, |
| "mean_token_accuracy": 0.9896179974079132, |
| "epoch": 2.088235294117647, |
| "step": 710 |
| }, |
| { |
| "loss": 0.02128472626209259, |
| "grad_norm": 1.078125, |
| "learning_rate": 6e-06, |
| "entropy": 0.02389440070837736, |
| "num_tokens": 450833.0, |
| "mean_token_accuracy": 0.9899099349975586, |
| "epoch": 2.1029411764705883, |
| "step": 715 |
| }, |
| { |
| "loss": 0.021367147564888, |
| "grad_norm": 1.6015625, |
| "learning_rate": 5.901960784313726e-06, |
| "entropy": 0.02620517127215862, |
| "num_tokens": 453949.0, |
| "mean_token_accuracy": 0.988726532459259, |
| "epoch": 2.1176470588235294, |
| "step": 720 |
| }, |
| { |
| "loss": 0.01960753947496414, |
| "grad_norm": 1.03125, |
| "learning_rate": 5.803921568627452e-06, |
| "entropy": 0.02435927651822567, |
| "num_tokens": 457147.0, |
| "mean_token_accuracy": 0.9908569097518921, |
| "epoch": 2.1323529411764706, |
| "step": 725 |
| }, |
| { |
| "loss": 0.022167882323265074, |
| "grad_norm": 1.234375, |
| "learning_rate": 5.705882352941177e-06, |
| "entropy": 0.02521121110767126, |
| "num_tokens": 460308.0, |
| "mean_token_accuracy": 0.9891940593719483, |
| "epoch": 2.1470588235294117, |
| "step": 730 |
| }, |
| { |
| "loss": 0.0210279181599617, |
| "grad_norm": 1.359375, |
| "learning_rate": 5.607843137254903e-06, |
| "entropy": 0.02500821612775326, |
| "num_tokens": 463449.0, |
| "mean_token_accuracy": 0.9884547054767608, |
| "epoch": 2.161764705882353, |
| "step": 735 |
| }, |
| { |
| "loss": 0.01987575888633728, |
| "grad_norm": 1.03125, |
| "learning_rate": 5.509803921568628e-06, |
| "entropy": 0.025977463461458683, |
| "num_tokens": 466590.0, |
| "mean_token_accuracy": 0.9888093769550323, |
| "epoch": 2.176470588235294, |
| "step": 740 |
| }, |
| { |
| "loss": 0.019111356139183043, |
| "grad_norm": 1.25, |
| "learning_rate": 5.411764705882353e-06, |
| "entropy": 0.02638601940125227, |
| "num_tokens": 469726.0, |
| "mean_token_accuracy": 0.9917258858680725, |
| "epoch": 2.1911764705882355, |
| "step": 745 |
| }, |
| { |
| "loss": 0.020354922115802764, |
| "grad_norm": 1.171875, |
| "learning_rate": 5.313725490196079e-06, |
| "entropy": 0.026662386767566205, |
| "num_tokens": 472853.0, |
| "mean_token_accuracy": 0.99064000248909, |
| "epoch": 2.2058823529411766, |
| "step": 750 |
| }, |
| { |
| "loss": 0.01959734410047531, |
| "grad_norm": 0.80859375, |
| "learning_rate": 5.2156862745098044e-06, |
| "entropy": 0.02579411044716835, |
| "num_tokens": 476008.0, |
| "mean_token_accuracy": 0.9904728531837463, |
| "epoch": 2.2205882352941178, |
| "step": 755 |
| }, |
| { |
| "loss": 0.020466303825378417, |
| "grad_norm": 1.3828125, |
| "learning_rate": 5.11764705882353e-06, |
| "entropy": 0.0256651122123003, |
| "num_tokens": 479150.0, |
| "mean_token_accuracy": 0.9903539717197418, |
| "epoch": 2.235294117647059, |
| "step": 760 |
| }, |
| { |
| "loss": 0.01983775794506073, |
| "grad_norm": 0.99609375, |
| "learning_rate": 5.019607843137255e-06, |
| "entropy": 0.02584236618131399, |
| "num_tokens": 482321.0, |
| "mean_token_accuracy": 0.9914842903614044, |
| "epoch": 2.25, |
| "step": 765 |
| }, |
| { |
| "loss": 0.020100761950016022, |
| "grad_norm": 1.046875, |
| "learning_rate": 4.921568627450981e-06, |
| "entropy": 0.02499296572059393, |
| "num_tokens": 485510.0, |
| "mean_token_accuracy": 0.991219836473465, |
| "epoch": 2.264705882352941, |
| "step": 770 |
| }, |
| { |
| "loss": 0.02088477313518524, |
| "grad_norm": 1.328125, |
| "learning_rate": 4.823529411764706e-06, |
| "entropy": 0.024959737621247768, |
| "num_tokens": 488698.0, |
| "mean_token_accuracy": 0.9898148238658905, |
| "epoch": 2.2794117647058822, |
| "step": 775 |
| }, |
| { |
| "loss": 0.0195361465215683, |
| "grad_norm": 1.2421875, |
| "learning_rate": 4.725490196078431e-06, |
| "entropy": 0.023672481067478657, |
| "num_tokens": 491906.0, |
| "mean_token_accuracy": 0.9900302290916443, |
| "epoch": 2.2941176470588234, |
| "step": 780 |
| }, |
| { |
| "loss": 0.019702821969985962, |
| "grad_norm": 1.265625, |
| "learning_rate": 4.627450980392157e-06, |
| "entropy": 0.025737580843269825, |
| "num_tokens": 494997.0, |
| "mean_token_accuracy": 0.9905776441097259, |
| "epoch": 2.3088235294117645, |
| "step": 785 |
| }, |
| { |
| "loss": 0.018527360260486604, |
| "grad_norm": 1.078125, |
| "learning_rate": 4.529411764705883e-06, |
| "entropy": 0.02454463895410299, |
| "num_tokens": 498138.0, |
| "mean_token_accuracy": 0.9910318195819855, |
| "epoch": 2.323529411764706, |
| "step": 790 |
| }, |
| { |
| "loss": 0.018923106789588928, |
| "grad_norm": 1.359375, |
| "learning_rate": 4.431372549019608e-06, |
| "entropy": 0.0245100449770689, |
| "num_tokens": 501316.0, |
| "mean_token_accuracy": 0.9911953806877136, |
| "epoch": 2.338235294117647, |
| "step": 795 |
| }, |
| { |
| "loss": 0.01874026209115982, |
| "grad_norm": 1.140625, |
| "learning_rate": 4.333333333333334e-06, |
| "entropy": 0.023334310948848726, |
| "num_tokens": 504533.0, |
| "mean_token_accuracy": 0.9910171329975128, |
| "epoch": 2.3529411764705883, |
| "step": 800 |
| }, |
| { |
| "loss": 0.022160655260086058, |
| "grad_norm": 1.2578125, |
| "learning_rate": 4.235294117647059e-06, |
| "entropy": 0.026187057420611382, |
| "num_tokens": 507616.0, |
| "mean_token_accuracy": 0.9876076638698578, |
| "epoch": 2.3676470588235294, |
| "step": 805 |
| }, |
| { |
| "loss": 0.018640576303005217, |
| "grad_norm": 1.03125, |
| "learning_rate": 4.137254901960784e-06, |
| "entropy": 0.02308085039258003, |
| "num_tokens": 510793.0, |
| "mean_token_accuracy": 0.9908162891864777, |
| "epoch": 2.3823529411764706, |
| "step": 810 |
| }, |
| { |
| "loss": 0.019237047433853148, |
| "grad_norm": 0.8984375, |
| "learning_rate": 4.03921568627451e-06, |
| "entropy": 0.024417817965149878, |
| "num_tokens": 513995.0, |
| "mean_token_accuracy": 0.9902299284934998, |
| "epoch": 2.3970588235294117, |
| "step": 815 |
| }, |
| { |
| "loss": 0.020626239478588104, |
| "grad_norm": 1.1640625, |
| "learning_rate": 3.941176470588236e-06, |
| "entropy": 0.025944224931299685, |
| "num_tokens": 517128.0, |
| "mean_token_accuracy": 0.9896773338317871, |
| "epoch": 2.411764705882353, |
| "step": 820 |
| }, |
| { |
| "loss": 0.018906430900096895, |
| "grad_norm": 1.0546875, |
| "learning_rate": 3.843137254901962e-06, |
| "entropy": 0.02529167104512453, |
| "num_tokens": 520219.0, |
| "mean_token_accuracy": 0.9905548214912414, |
| "epoch": 2.426470588235294, |
| "step": 825 |
| }, |
| { |
| "loss": 0.01989607810974121, |
| "grad_norm": 1.171875, |
| "learning_rate": 3.7450980392156865e-06, |
| "entropy": 0.025429282896220685, |
| "num_tokens": 523368.0, |
| "mean_token_accuracy": 0.9910161614418029, |
| "epoch": 2.4411764705882355, |
| "step": 830 |
| }, |
| { |
| "loss": 0.019511505961418152, |
| "grad_norm": 1.046875, |
| "learning_rate": 3.6470588235294117e-06, |
| "entropy": 0.026134114153683184, |
| "num_tokens": 526516.0, |
| "mean_token_accuracy": 0.9898114144802094, |
| "epoch": 2.4558823529411766, |
| "step": 835 |
| }, |
| { |
| "loss": 0.018582092225551607, |
| "grad_norm": 1.1328125, |
| "learning_rate": 3.5490196078431378e-06, |
| "entropy": 0.02343358173966408, |
| "num_tokens": 529660.0, |
| "mean_token_accuracy": 0.9904271245002747, |
| "epoch": 2.4705882352941178, |
| "step": 840 |
| }, |
| { |
| "loss": 0.020261451601982117, |
| "grad_norm": 1.453125, |
| "learning_rate": 3.450980392156863e-06, |
| "entropy": 0.024460323713719846, |
| "num_tokens": 532778.0, |
| "mean_token_accuracy": 0.9899402976036071, |
| "epoch": 2.485294117647059, |
| "step": 845 |
| }, |
| { |
| "loss": 0.020383948087692262, |
| "grad_norm": 1.1796875, |
| "learning_rate": 3.352941176470588e-06, |
| "entropy": 0.024987665377557276, |
| "num_tokens": 535932.0, |
| "mean_token_accuracy": 0.9898059248924256, |
| "epoch": 2.5, |
| "step": 850 |
| }, |
| { |
| "loss": 0.019448164105415344, |
| "grad_norm": 1.3515625, |
| "learning_rate": 3.2549019607843143e-06, |
| "entropy": 0.02465162370353937, |
| "num_tokens": 539037.0, |
| "mean_token_accuracy": 0.9913235783576966, |
| "epoch": 2.514705882352941, |
| "step": 855 |
| }, |
| { |
| "loss": 0.018925553560256957, |
| "grad_norm": 1.046875, |
| "learning_rate": 3.1568627450980395e-06, |
| "entropy": 0.025184641405940057, |
| "num_tokens": 542197.0, |
| "mean_token_accuracy": 0.991470605134964, |
| "epoch": 2.5294117647058822, |
| "step": 860 |
| }, |
| { |
| "loss": 0.01913969814777374, |
| "grad_norm": 1.0546875, |
| "learning_rate": 3.058823529411765e-06, |
| "entropy": 0.024113286659121512, |
| "num_tokens": 545387.0, |
| "mean_token_accuracy": 0.9914486467838287, |
| "epoch": 2.5441176470588234, |
| "step": 865 |
| }, |
| { |
| "loss": 0.018765930831432343, |
| "grad_norm": 1.0703125, |
| "learning_rate": 2.9607843137254903e-06, |
| "entropy": 0.02413007989525795, |
| "num_tokens": 548534.0, |
| "mean_token_accuracy": 0.9907777428627014, |
| "epoch": 2.5588235294117645, |
| "step": 870 |
| }, |
| { |
| "loss": 0.019279350340366364, |
| "grad_norm": 2.1875, |
| "learning_rate": 2.8627450980392155e-06, |
| "entropy": 0.024522659182548524, |
| "num_tokens": 551721.0, |
| "mean_token_accuracy": 0.9905555963516235, |
| "epoch": 2.5735294117647056, |
| "step": 875 |
| }, |
| { |
| "loss": 0.019660860300064087, |
| "grad_norm": 1.1015625, |
| "learning_rate": 2.7647058823529416e-06, |
| "entropy": 0.024852845631539822, |
| "num_tokens": 554912.0, |
| "mean_token_accuracy": 0.9898727238178253, |
| "epoch": 2.588235294117647, |
| "step": 880 |
| }, |
| { |
| "loss": 0.018780362606048585, |
| "grad_norm": 1.0703125, |
| "learning_rate": 2.666666666666667e-06, |
| "entropy": 0.02551023568958044, |
| "num_tokens": 558028.0, |
| "mean_token_accuracy": 0.99192915558815, |
| "epoch": 2.6029411764705883, |
| "step": 885 |
| }, |
| { |
| "loss": 0.01949601024389267, |
| "grad_norm": 1.1953125, |
| "learning_rate": 2.568627450980392e-06, |
| "entropy": 0.025155650451779366, |
| "num_tokens": 561189.0, |
| "mean_token_accuracy": 0.990712708234787, |
| "epoch": 2.6176470588235294, |
| "step": 890 |
| }, |
| { |
| "loss": 0.019716159999370576, |
| "grad_norm": 1.296875, |
| "learning_rate": 2.470588235294118e-06, |
| "entropy": 0.024883992783725262, |
| "num_tokens": 564374.0, |
| "mean_token_accuracy": 0.989579439163208, |
| "epoch": 2.6323529411764706, |
| "step": 895 |
| }, |
| { |
| "loss": 0.017295162379741668, |
| "grad_norm": 0.97265625, |
| "learning_rate": 2.3725490196078433e-06, |
| "entropy": 0.0241273645311594, |
| "num_tokens": 567550.0, |
| "mean_token_accuracy": 0.9934020042419434, |
| "epoch": 2.6470588235294117, |
| "step": 900 |
| }, |
| { |
| "loss": 0.020695842802524567, |
| "grad_norm": 1.109375, |
| "learning_rate": 2.274509803921569e-06, |
| "entropy": 0.02697849553078413, |
| "num_tokens": 570611.0, |
| "mean_token_accuracy": 0.9914706110954284, |
| "epoch": 2.661764705882353, |
| "step": 905 |
| }, |
| { |
| "loss": 0.017908445000648497, |
| "grad_norm": 1.2734375, |
| "learning_rate": 2.176470588235294e-06, |
| "entropy": 0.022997986152768136, |
| "num_tokens": 573767.0, |
| "mean_token_accuracy": 0.9898150980472564, |
| "epoch": 2.6764705882352944, |
| "step": 910 |
| }, |
| { |
| "loss": 0.020641934871673585, |
| "grad_norm": 1.4921875, |
| "learning_rate": 2.07843137254902e-06, |
| "entropy": 0.027346356958150863, |
| "num_tokens": 576830.0, |
| "mean_token_accuracy": 0.9897843182086945, |
| "epoch": 2.6911764705882355, |
| "step": 915 |
| }, |
| { |
| "loss": 0.019691270589828492, |
| "grad_norm": 1.2890625, |
| "learning_rate": 1.980392156862745e-06, |
| "entropy": 0.023718219250440598, |
| "num_tokens": 580065.0, |
| "mean_token_accuracy": 0.9901076138019562, |
| "epoch": 2.7058823529411766, |
| "step": 920 |
| }, |
| { |
| "loss": 0.02009253352880478, |
| "grad_norm": 1.2109375, |
| "learning_rate": 1.8823529411764707e-06, |
| "entropy": 0.024860053882002832, |
| "num_tokens": 583200.0, |
| "mean_token_accuracy": 0.9894306361675262, |
| "epoch": 2.7205882352941178, |
| "step": 925 |
| }, |
| { |
| "loss": 0.019820311665534975, |
| "grad_norm": 1.1796875, |
| "learning_rate": 1.7843137254901963e-06, |
| "entropy": 0.02641481179744005, |
| "num_tokens": 586247.0, |
| "mean_token_accuracy": 0.9888152658939362, |
| "epoch": 2.735294117647059, |
| "step": 930 |
| }, |
| { |
| "loss": 0.020238989591598512, |
| "grad_norm": 1.34375, |
| "learning_rate": 1.6862745098039217e-06, |
| "entropy": 0.025426279939711093, |
| "num_tokens": 589348.0, |
| "mean_token_accuracy": 0.9893324971199036, |
| "epoch": 2.75, |
| "step": 935 |
| }, |
| { |
| "loss": 0.020529073476791383, |
| "grad_norm": 1.1953125, |
| "learning_rate": 1.5882352941176472e-06, |
| "entropy": 0.025489212945103645, |
| "num_tokens": 592483.0, |
| "mean_token_accuracy": 0.9883848607540131, |
| "epoch": 2.764705882352941, |
| "step": 940 |
| }, |
| { |
| "loss": 0.019503119587898254, |
| "grad_norm": 1.875, |
| "learning_rate": 1.4901960784313726e-06, |
| "entropy": 0.025844238512218, |
| "num_tokens": 595654.0, |
| "mean_token_accuracy": 0.9898752987384796, |
| "epoch": 2.7794117647058822, |
| "step": 945 |
| }, |
| { |
| "loss": 0.020725423097610475, |
| "grad_norm": 1.3359375, |
| "learning_rate": 1.3921568627450982e-06, |
| "entropy": 0.025542815588414668, |
| "num_tokens": 598757.0, |
| "mean_token_accuracy": 0.9899684190750122, |
| "epoch": 2.7941176470588234, |
| "step": 950 |
| }, |
| { |
| "loss": 0.020795242488384248, |
| "grad_norm": 1.1640625, |
| "learning_rate": 1.2941176470588237e-06, |
| "entropy": 0.023506213910877705, |
| "num_tokens": 602069.0, |
| "mean_token_accuracy": 0.9894281327724457, |
| "epoch": 2.8088235294117645, |
| "step": 955 |
| }, |
| { |
| "loss": 0.01915638893842697, |
| "grad_norm": 1.21875, |
| "learning_rate": 1.196078431372549e-06, |
| "entropy": 0.024655142053961753, |
| "num_tokens": 605286.0, |
| "mean_token_accuracy": 0.9900248169898986, |
| "epoch": 2.8235294117647056, |
| "step": 960 |
| }, |
| { |
| "loss": 0.01975841522216797, |
| "grad_norm": 1.1484375, |
| "learning_rate": 1.0980392156862745e-06, |
| "entropy": 0.025551106408238412, |
| "num_tokens": 608374.0, |
| "mean_token_accuracy": 0.9892638444900512, |
| "epoch": 2.838235294117647, |
| "step": 965 |
| }, |
| { |
| "loss": 0.020852866768836974, |
| "grad_norm": 1.2421875, |
| "learning_rate": 1.0000000000000002e-06, |
| "entropy": 0.02480896282941103, |
| "num_tokens": 611577.0, |
| "mean_token_accuracy": 0.9892595648765564, |
| "epoch": 2.8529411764705883, |
| "step": 970 |
| }, |
| { |
| "loss": 0.019326749444007873, |
| "grad_norm": 0.875, |
| "learning_rate": 9.019607843137256e-07, |
| "entropy": 0.02385783474892378, |
| "num_tokens": 614761.0, |
| "mean_token_accuracy": 0.9904800593852997, |
| "epoch": 2.8676470588235294, |
| "step": 975 |
| }, |
| { |
| "loss": 0.019405061006546022, |
| "grad_norm": 1.1875, |
| "learning_rate": 8.039215686274511e-07, |
| "entropy": 0.026029090210795403, |
| "num_tokens": 617870.0, |
| "mean_token_accuracy": 0.9896216452121734, |
| "epoch": 2.8823529411764706, |
| "step": 980 |
| }, |
| { |
| "loss": 0.019337351620197295, |
| "grad_norm": 0.9921875, |
| "learning_rate": 7.058823529411766e-07, |
| "entropy": 0.026062553003430366, |
| "num_tokens": 620943.0, |
| "mean_token_accuracy": 0.9899002552032471, |
| "epoch": 2.8970588235294117, |
| "step": 985 |
| }, |
| { |
| "loss": 0.01972263157367706, |
| "grad_norm": 1.5625, |
| "learning_rate": 6.07843137254902e-07, |
| "entropy": 0.025324805453419686, |
| "num_tokens": 624094.0, |
| "mean_token_accuracy": 0.9898600101470947, |
| "epoch": 2.911764705882353, |
| "step": 990 |
| }, |
| { |
| "loss": 0.017833781242370606, |
| "grad_norm": 1.2265625, |
| "learning_rate": 5.098039215686275e-07, |
| "entropy": 0.023284821771085262, |
| "num_tokens": 627253.0, |
| "mean_token_accuracy": 0.9910983681678772, |
| "epoch": 2.9264705882352944, |
| "step": 995 |
| }, |
| { |
| "loss": 0.020137375593185423, |
| "grad_norm": 1.3984375, |
| "learning_rate": 4.1176470588235295e-07, |
| "entropy": 0.024203809909522533, |
| "num_tokens": 630427.0, |
| "mean_token_accuracy": 0.9907480180263519, |
| "epoch": 2.9411764705882355, |
| "step": 1000 |
| }, |
| { |
| "loss": 0.019109995663166048, |
| "grad_norm": 1.21875, |
| "learning_rate": 3.1372549019607843e-07, |
| "entropy": 0.02416255362331867, |
| "num_tokens": 633632.0, |
| "mean_token_accuracy": 0.9915190756320953, |
| "epoch": 2.9558823529411766, |
| "step": 1005 |
| }, |
| { |
| "loss": 0.02000269144773483, |
| "grad_norm": 1.859375, |
| "learning_rate": 2.1568627450980394e-07, |
| "entropy": 0.024217843264341354, |
| "num_tokens": 636805.0, |
| "mean_token_accuracy": 0.9894875824451447, |
| "epoch": 2.9705882352941178, |
| "step": 1010 |
| }, |
| { |
| "loss": 0.020338763296604157, |
| "grad_norm": 1.546875, |
| "learning_rate": 1.1764705882352942e-07, |
| "entropy": 0.024258859269320966, |
| "num_tokens": 639984.0, |
| "mean_token_accuracy": 0.9892021059989929, |
| "epoch": 2.985294117647059, |
| "step": 1015 |
| }, |
| { |
| "loss": 0.020995336771011352, |
| "grad_norm": 1.046875, |
| "learning_rate": 1.9607843137254902e-08, |
| "entropy": 0.025342148169875144, |
| "num_tokens": 643104.0, |
| "mean_token_accuracy": 0.9887544453144074, |
| "epoch": 3.0, |
| "step": 1020 |
| }, |
| { |
| "train_runtime": 3944.5682, |
| "train_samples_per_second": 0.517, |
| "train_steps_per_second": 0.259, |
| "total_flos": 5056111718203392.0, |
| "train_loss": 0.07629515403041652, |
| "epoch": 3.0, |
| "step": 1020 |
| } |
| ] |