[ { "loss": 2.836225128173828, "grad_norm": 64.5, "learning_rate": 1.9921568627450984e-05, "entropy": 2.411133313179016, "num_tokens": 3137.0, "mean_token_accuracy": 0.49307813346385954, "epoch": 0.014705882352941176, "step": 5 }, { "loss": 1.3722827911376954, "grad_norm": 10.0, "learning_rate": 1.9823529411764708e-05, "entropy": 1.489565873146057, "num_tokens": 6240.0, "mean_token_accuracy": 0.7310294091701508, "epoch": 0.029411764705882353, "step": 10 }, { "loss": 0.9681278228759765, "grad_norm": 9.0, "learning_rate": 1.9725490196078433e-05, "entropy": 1.0941020846366882, "num_tokens": 9372.0, "mean_token_accuracy": 0.7977278172969818, "epoch": 0.04411764705882353, "step": 15 }, { "loss": 0.7952256202697754, "grad_norm": 7.5625, "learning_rate": 1.9627450980392157e-05, "entropy": 0.7959236443042755, "num_tokens": 12496.0, "mean_token_accuracy": 0.8263253927230835, "epoch": 0.058823529411764705, "step": 20 }, { "loss": 0.7038975715637207, "grad_norm": 10.0, "learning_rate": 1.9529411764705885e-05, "entropy": 0.7730603992938996, "num_tokens": 15726.0, "mean_token_accuracy": 0.8362560391426086, "epoch": 0.07352941176470588, "step": 25 }, { "loss": 0.5153284072875977, "grad_norm": 9.5, "learning_rate": 1.943137254901961e-05, "entropy": 0.5871870815753937, "num_tokens": 18807.0, "mean_token_accuracy": 0.8711118042469025, "epoch": 0.08823529411764706, "step": 30 }, { "loss": 0.4624673843383789, "grad_norm": 9.375, "learning_rate": 1.9333333333333333e-05, "entropy": 0.5334561973810196, "num_tokens": 21955.0, "mean_token_accuracy": 0.8878682732582093, "epoch": 0.10294117647058823, "step": 35 }, { "loss": 0.3805722236633301, "grad_norm": 7.0625, "learning_rate": 1.923529411764706e-05, "entropy": 0.490571403503418, "num_tokens": 25129.0, "mean_token_accuracy": 0.9082872688770294, "epoch": 0.11764705882352941, "step": 40 }, { "loss": 0.2753485679626465, "grad_norm": 8.75, "learning_rate": 1.9137254901960786e-05, "entropy": 0.3105604648590088, "num_tokens": 28291.0, "mean_token_accuracy": 0.9394680917263031, "epoch": 0.1323529411764706, "step": 45 }, { "loss": 0.22170100212097169, "grad_norm": 5.65625, "learning_rate": 1.903921568627451e-05, "entropy": 0.28098965287208555, "num_tokens": 31415.0, "mean_token_accuracy": 0.949154794216156, "epoch": 0.14705882352941177, "step": 50 }, { "loss": 0.18951488733291627, "grad_norm": 9.9375, "learning_rate": 1.8941176470588238e-05, "entropy": 0.20550020337104796, "num_tokens": 34603.0, "mean_token_accuracy": 0.9539743661880493, "epoch": 0.16176470588235295, "step": 55 }, { "loss": 0.17650480270385743, "grad_norm": 4.25, "learning_rate": 1.8843137254901962e-05, "entropy": 0.21026135981082916, "num_tokens": 37754.0, "mean_token_accuracy": 0.9567391991615295, "epoch": 0.17647058823529413, "step": 60 }, { "loss": 0.18774482011795043, "grad_norm": 5.5, "learning_rate": 1.8745098039215686e-05, "entropy": 0.23240296691656112, "num_tokens": 40848.0, "mean_token_accuracy": 0.9520188570022583, "epoch": 0.19117647058823528, "step": 65 }, { "loss": 0.12736810445785524, "grad_norm": 10.625, "learning_rate": 1.8647058823529414e-05, "entropy": 0.16197917684912683, "num_tokens": 44001.0, "mean_token_accuracy": 0.9676418542861939, "epoch": 0.20588235294117646, "step": 70 }, { "loss": 0.14076029062271117, "grad_norm": 4.53125, "learning_rate": 1.854901960784314e-05, "entropy": 0.15784153044223787, "num_tokens": 47159.0, "mean_token_accuracy": 0.9648099303245544, "epoch": 0.22058823529411764, "step": 75 }, { "loss": 0.10759507417678833, "grad_norm": 3.328125, "learning_rate": 1.8450980392156866e-05, "entropy": 0.14289679378271103, "num_tokens": 50298.0, "mean_token_accuracy": 0.9671541452407837, "epoch": 0.23529411764705882, "step": 80 }, { "loss": 0.12589149475097655, "grad_norm": 5.46875, "learning_rate": 1.8352941176470587e-05, "entropy": 0.13958239406347275, "num_tokens": 53455.0, "mean_token_accuracy": 0.9665216684341431, "epoch": 0.25, "step": 85 }, { "loss": 0.12024720907211303, "grad_norm": 4.53125, "learning_rate": 1.8254901960784315e-05, "entropy": 0.13711344972252845, "num_tokens": 56595.0, "mean_token_accuracy": 0.9648710668087006, "epoch": 0.2647058823529412, "step": 90 }, { "loss": 0.10167303085327148, "grad_norm": 4.8125, "learning_rate": 1.815686274509804e-05, "entropy": 0.13078619986772538, "num_tokens": 59674.0, "mean_token_accuracy": 0.9712324619293213, "epoch": 0.27941176470588236, "step": 95 }, { "loss": 0.08662314414978027, "grad_norm": 3.671875, "learning_rate": 1.8058823529411767e-05, "entropy": 0.10740345045924186, "num_tokens": 62774.0, "mean_token_accuracy": 0.9719909071922302, "epoch": 0.29411764705882354, "step": 100 }, { "loss": 0.09073780775070191, "grad_norm": 4.15625, "learning_rate": 1.796078431372549e-05, "entropy": 0.09185975939035415, "num_tokens": 65866.0, "mean_token_accuracy": 0.9742748856544494, "epoch": 0.3088235294117647, "step": 105 }, { "loss": 0.07408615350723266, "grad_norm": 2.734375, "learning_rate": 1.786274509803922e-05, "entropy": 0.10024651288986205, "num_tokens": 68995.0, "mean_token_accuracy": 0.9773713290691376, "epoch": 0.3235294117647059, "step": 110 }, { "loss": 0.08644189834594726, "grad_norm": 6.71875, "learning_rate": 1.776470588235294e-05, "entropy": 0.09930562153458596, "num_tokens": 72160.0, "mean_token_accuracy": 0.9748322486877441, "epoch": 0.3382352941176471, "step": 115 }, { "loss": 0.11685197353363037, "grad_norm": 10.3125, "learning_rate": 1.7666666666666668e-05, "entropy": 0.11419346779584885, "num_tokens": 75262.0, "mean_token_accuracy": 0.9695464611053467, "epoch": 0.35294117647058826, "step": 120 }, { "loss": 0.10757300853729249, "grad_norm": 8.9375, "learning_rate": 1.7568627450980392e-05, "entropy": 0.12836654633283615, "num_tokens": 78384.0, "mean_token_accuracy": 0.9728550255298615, "epoch": 0.36764705882352944, "step": 125 }, { "loss": 0.07711289525032043, "grad_norm": 3.015625, "learning_rate": 1.747058823529412e-05, "entropy": 0.10070741027593613, "num_tokens": 81583.0, "mean_token_accuracy": 0.9778402209281921, "epoch": 0.38235294117647056, "step": 130 }, { "loss": 0.08512116074562073, "grad_norm": 5.375, "learning_rate": 1.7372549019607845e-05, "entropy": 0.09163436144590378, "num_tokens": 84729.0, "mean_token_accuracy": 0.9748329102993012, "epoch": 0.39705882352941174, "step": 135 }, { "loss": 0.09534031748771668, "grad_norm": 3.40625, "learning_rate": 1.7274509803921572e-05, "entropy": 0.09555450975894927, "num_tokens": 87916.0, "mean_token_accuracy": 0.9727975726127625, "epoch": 0.4117647058823529, "step": 140 }, { "loss": 0.0699828803539276, "grad_norm": 2.828125, "learning_rate": 1.7176470588235293e-05, "entropy": 0.089533219486475, "num_tokens": 90982.0, "mean_token_accuracy": 0.9772566497325897, "epoch": 0.4264705882352941, "step": 145 }, { "loss": 0.06004565954208374, "grad_norm": 4.28125, "learning_rate": 1.707843137254902e-05, "entropy": 0.07979470491409302, "num_tokens": 94197.0, "mean_token_accuracy": 0.980064970254898, "epoch": 0.4411764705882353, "step": 150 }, { "loss": 0.07095102667808532, "grad_norm": 3.8125, "learning_rate": 1.6980392156862745e-05, "entropy": 0.07709958106279373, "num_tokens": 97332.0, "mean_token_accuracy": 0.9785419166088104, "epoch": 0.45588235294117646, "step": 155 }, { "loss": 0.05590643882751465, "grad_norm": 1.671875, "learning_rate": 1.6882352941176473e-05, "entropy": 0.07423891946673393, "num_tokens": 100515.0, "mean_token_accuracy": 0.9827289760112763, "epoch": 0.47058823529411764, "step": 160 }, { "loss": 0.06335585117340088, "grad_norm": 2.390625, "learning_rate": 1.6784313725490198e-05, "entropy": 0.08311136476695538, "num_tokens": 103630.0, "mean_token_accuracy": 0.9795481741428376, "epoch": 0.4852941176470588, "step": 165 }, { "loss": 0.06994503140449523, "grad_norm": 3.625, "learning_rate": 1.6686274509803922e-05, "entropy": 0.07972728088498116, "num_tokens": 106741.0, "mean_token_accuracy": 0.9786823868751526, "epoch": 0.5, "step": 170 }, { "loss": 0.047742915153503415, "grad_norm": 5.71875, "learning_rate": 1.658823529411765e-05, "entropy": 0.059984054416418076, "num_tokens": 109921.0, "mean_token_accuracy": 0.9847357928752899, "epoch": 0.5147058823529411, "step": 175 }, { "loss": 0.05979984998703003, "grad_norm": 7.0625, "learning_rate": 1.6490196078431374e-05, "entropy": 0.06703888289630414, "num_tokens": 112994.0, "mean_token_accuracy": 0.9824592292308807, "epoch": 0.5294117647058824, "step": 180 }, { "loss": 0.04938005805015564, "grad_norm": 2.90625, "learning_rate": 1.63921568627451e-05, "entropy": 0.054279588535428046, "num_tokens": 116201.0, "mean_token_accuracy": 0.9846667230129242, "epoch": 0.5441176470588235, "step": 185 }, { "loss": 0.06785057783126831, "grad_norm": 7.4375, "learning_rate": 1.6294117647058826e-05, "entropy": 0.06177988387644291, "num_tokens": 119381.0, "mean_token_accuracy": 0.9796367406845092, "epoch": 0.5588235294117647, "step": 190 }, { "loss": 0.05383546352386474, "grad_norm": 5.40625, "learning_rate": 1.619607843137255e-05, "entropy": 0.0636073287576437, "num_tokens": 122517.0, "mean_token_accuracy": 0.9798873722553253, "epoch": 0.5735294117647058, "step": 195 }, { "loss": 0.0490637868642807, "grad_norm": 1.96875, "learning_rate": 1.6098039215686275e-05, "entropy": 0.0639917254447937, "num_tokens": 125663.0, "mean_token_accuracy": 0.9849890351295472, "epoch": 0.5882352941176471, "step": 200 }, { "loss": 0.06412197351455688, "grad_norm": 6.84375, "learning_rate": 1.6000000000000003e-05, "entropy": 0.06784685887396336, "num_tokens": 128856.0, "mean_token_accuracy": 0.9818105876445771, "epoch": 0.6029411764705882, "step": 205 }, { "loss": 0.04346465170383453, "grad_norm": 4.375, "learning_rate": 1.5901960784313727e-05, "entropy": 0.06049864292144776, "num_tokens": 131995.0, "mean_token_accuracy": 0.9882112145423889, "epoch": 0.6176470588235294, "step": 210 }, { "loss": 0.04320838153362274, "grad_norm": 2.015625, "learning_rate": 1.580392156862745e-05, "entropy": 0.047596517577767374, "num_tokens": 135181.0, "mean_token_accuracy": 0.985132920742035, "epoch": 0.6323529411764706, "step": 215 }, { "loss": 0.06799347996711731, "grad_norm": 8.5625, "learning_rate": 1.570588235294118e-05, "entropy": 0.06635901145637035, "num_tokens": 138254.0, "mean_token_accuracy": 0.9791639804840088, "epoch": 0.6470588235294118, "step": 220 }, { "loss": 0.041108173131942746, "grad_norm": 2.859375, "learning_rate": 1.5607843137254904e-05, "entropy": 0.051696383953094484, "num_tokens": 141381.0, "mean_token_accuracy": 0.9862416744232178, "epoch": 0.6617647058823529, "step": 225 }, { "loss": 0.045146191120147706, "grad_norm": 3.078125, "learning_rate": 1.5509803921568628e-05, "entropy": 0.055339107289910316, "num_tokens": 144583.0, "mean_token_accuracy": 0.9822882294654847, "epoch": 0.6764705882352942, "step": 230 }, { "loss": 0.04143168330192566, "grad_norm": 1.578125, "learning_rate": 1.5411764705882356e-05, "entropy": 0.05063906572759151, "num_tokens": 147764.0, "mean_token_accuracy": 0.9831606447696686, "epoch": 0.6911764705882353, "step": 235 }, { "loss": 0.03947827816009521, "grad_norm": 1.9921875, "learning_rate": 1.531372549019608e-05, "entropy": 0.05209046043455601, "num_tokens": 150961.0, "mean_token_accuracy": 0.9848346650600434, "epoch": 0.7058823529411765, "step": 240 }, { "loss": 0.034212198853492734, "grad_norm": 1.8984375, "learning_rate": 1.5215686274509804e-05, "entropy": 0.04912327118217945, "num_tokens": 154174.0, "mean_token_accuracy": 0.9855735838413239, "epoch": 0.7205882352941176, "step": 245 }, { "loss": 0.03223183453083038, "grad_norm": 1.7265625, "learning_rate": 1.511764705882353e-05, "entropy": 0.045325061306357384, "num_tokens": 157374.0, "mean_token_accuracy": 0.9866909861564637, "epoch": 0.7352941176470589, "step": 250 }, { "loss": 0.04085415601730347, "grad_norm": 2.625, "learning_rate": 1.5019607843137257e-05, "entropy": 0.045074894279241565, "num_tokens": 160519.0, "mean_token_accuracy": 0.9865182876586914, "epoch": 0.75, "step": 255 }, { "loss": 0.03927797079086304, "grad_norm": 2.671875, "learning_rate": 1.4921568627450983e-05, "entropy": 0.039533843845129014, "num_tokens": 163756.0, "mean_token_accuracy": 0.9872985363006592, "epoch": 0.7647058823529411, "step": 260 }, { "loss": 0.042234039306640624, "grad_norm": 1.7109375, "learning_rate": 1.4823529411764707e-05, "entropy": 0.043326519429683685, "num_tokens": 166884.0, "mean_token_accuracy": 0.9839499652385711, "epoch": 0.7794117647058824, "step": 265 }, { "loss": 0.04218446910381317, "grad_norm": 3.671875, "learning_rate": 1.4725490196078433e-05, "entropy": 0.05446031875908375, "num_tokens": 170021.0, "mean_token_accuracy": 0.983331423997879, "epoch": 0.7941176470588235, "step": 270 }, { "loss": 0.031345850229263304, "grad_norm": 1.375, "learning_rate": 1.4627450980392157e-05, "entropy": 0.044994413107633593, "num_tokens": 173138.0, "mean_token_accuracy": 0.9864144027233124, "epoch": 0.8088235294117647, "step": 275 }, { "loss": 0.03718245923519135, "grad_norm": 2.03125, "learning_rate": 1.4529411764705883e-05, "entropy": 0.04372772537171841, "num_tokens": 176269.0, "mean_token_accuracy": 0.9855779051780701, "epoch": 0.8235294117647058, "step": 280 }, { "loss": 0.038416677713394166, "grad_norm": 3.234375, "learning_rate": 1.443137254901961e-05, "entropy": 0.04306882936507463, "num_tokens": 179436.0, "mean_token_accuracy": 0.9847787022590637, "epoch": 0.8382352941176471, "step": 285 }, { "loss": 0.03612026274204254, "grad_norm": 4.28125, "learning_rate": 1.4333333333333334e-05, "entropy": 0.04190887995064259, "num_tokens": 182619.0, "mean_token_accuracy": 0.9853791892528534, "epoch": 0.8529411764705882, "step": 290 }, { "loss": 0.03549243807792664, "grad_norm": 1.5546875, "learning_rate": 1.423529411764706e-05, "entropy": 0.041007821820676325, "num_tokens": 185835.0, "mean_token_accuracy": 0.987481951713562, "epoch": 0.8676470588235294, "step": 295 }, { "loss": 0.03658969700336456, "grad_norm": 1.9921875, "learning_rate": 1.4137254901960786e-05, "entropy": 0.03911938704550266, "num_tokens": 189059.0, "mean_token_accuracy": 0.9859034955501557, "epoch": 0.8823529411764706, "step": 300 }, { "loss": 0.03189299702644348, "grad_norm": 1.3984375, "learning_rate": 1.403921568627451e-05, "entropy": 0.04015427939593792, "num_tokens": 192245.0, "mean_token_accuracy": 0.9858013272285462, "epoch": 0.8970588235294118, "step": 305 }, { "loss": 0.04162760376930237, "grad_norm": 4.6875, "learning_rate": 1.3941176470588236e-05, "entropy": 0.04337671361863613, "num_tokens": 195334.0, "mean_token_accuracy": 0.9834910809993744, "epoch": 0.9117647058823529, "step": 310 }, { "loss": 0.03357888162136078, "grad_norm": 1.515625, "learning_rate": 1.384313725490196e-05, "entropy": 0.043437547981739044, "num_tokens": 198482.0, "mean_token_accuracy": 0.9839794993400574, "epoch": 0.9264705882352942, "step": 315 }, { "loss": 0.03252431154251099, "grad_norm": 2.390625, "learning_rate": 1.3745098039215687e-05, "entropy": 0.041450836881995204, "num_tokens": 201737.0, "mean_token_accuracy": 0.9883051753044129, "epoch": 0.9411764705882353, "step": 320 }, { "loss": 0.03779064118862152, "grad_norm": 2.953125, "learning_rate": 1.3647058823529413e-05, "entropy": 0.03566624131053686, "num_tokens": 204889.0, "mean_token_accuracy": 0.9875539124011994, "epoch": 0.9558823529411765, "step": 325 }, { "loss": 0.0329700767993927, "grad_norm": 2.15625, "learning_rate": 1.3549019607843139e-05, "entropy": 0.03808465227484703, "num_tokens": 208114.0, "mean_token_accuracy": 0.986751276254654, "epoch": 0.9705882352941176, "step": 330 }, { "loss": 0.031173259019851685, "grad_norm": 1.546875, "learning_rate": 1.3450980392156865e-05, "entropy": 0.04065078347921371, "num_tokens": 211217.0, "mean_token_accuracy": 0.9860772728919983, "epoch": 0.9852941176470589, "step": 335 }, { "loss": 0.03390420079231262, "grad_norm": 1.515625, "learning_rate": 1.3352941176470588e-05, "entropy": 0.04108036197721958, "num_tokens": 214368.0, "mean_token_accuracy": 0.9871271908283233, "epoch": 1.0, "step": 340 }, { "loss": 0.03671025633811951, "grad_norm": 1.5625, "learning_rate": 1.3254901960784314e-05, "entropy": 0.04091338850557804, "num_tokens": 217480.0, "mean_token_accuracy": 0.9861762046813964, "epoch": 1.0147058823529411, "step": 345 }, { "loss": 0.030594143271446227, "grad_norm": 1.5546875, "learning_rate": 1.315686274509804e-05, "entropy": 0.040245630964636805, "num_tokens": 220615.0, "mean_token_accuracy": 0.9881528139114379, "epoch": 1.0294117647058822, "step": 350 }, { "loss": 0.027347692847251893, "grad_norm": 1.7734375, "learning_rate": 1.3058823529411766e-05, "entropy": 0.03420254942029714, "num_tokens": 223751.0, "mean_token_accuracy": 0.989202469587326, "epoch": 1.0441176470588236, "step": 355 }, { "loss": 0.03148679435253143, "grad_norm": 1.9609375, "learning_rate": 1.2960784313725492e-05, "entropy": 0.03210772704333067, "num_tokens": 226948.0, "mean_token_accuracy": 0.9868246436119079, "epoch": 1.0588235294117647, "step": 360 }, { "loss": 0.031260594725608826, "grad_norm": 1.8046875, "learning_rate": 1.2862745098039218e-05, "entropy": 0.033671201393008235, "num_tokens": 230088.0, "mean_token_accuracy": 0.9856015264987945, "epoch": 1.0735294117647058, "step": 365 }, { "loss": 0.028061491250991822, "grad_norm": 1.2890625, "learning_rate": 1.276470588235294e-05, "entropy": 0.03639122284948826, "num_tokens": 233247.0, "mean_token_accuracy": 0.9885319888591766, "epoch": 1.088235294117647, "step": 370 }, { "loss": 0.0304165780544281, "grad_norm": 2.203125, "learning_rate": 1.2666666666666667e-05, "entropy": 0.03107942212373018, "num_tokens": 236423.0, "mean_token_accuracy": 0.9864429414272309, "epoch": 1.1029411764705883, "step": 375 }, { "loss": 0.028667458891868593, "grad_norm": 1.4453125, "learning_rate": 1.2568627450980393e-05, "entropy": 0.03269361965358257, "num_tokens": 239698.0, "mean_token_accuracy": 0.9882214546203614, "epoch": 1.1176470588235294, "step": 380 }, { "loss": 0.03024893403053284, "grad_norm": 1.4375, "learning_rate": 1.2470588235294119e-05, "entropy": 0.036648140475153926, "num_tokens": 242904.0, "mean_token_accuracy": 0.9854198694229126, "epoch": 1.1323529411764706, "step": 385 }, { "loss": 0.03237654864788055, "grad_norm": 1.140625, "learning_rate": 1.2372549019607845e-05, "entropy": 0.036488327011466024, "num_tokens": 246044.0, "mean_token_accuracy": 0.9868141651153565, "epoch": 1.1470588235294117, "step": 390 }, { "loss": 0.026534423232078552, "grad_norm": 1.2890625, "learning_rate": 1.2274509803921571e-05, "entropy": 0.03317699953913689, "num_tokens": 249199.0, "mean_token_accuracy": 0.9891056835651397, "epoch": 1.161764705882353, "step": 395 }, { "loss": 0.02918187975883484, "grad_norm": 1.546875, "learning_rate": 1.2176470588235294e-05, "entropy": 0.033053198270499705, "num_tokens": 252416.0, "mean_token_accuracy": 0.9872093260288238, "epoch": 1.1764705882352942, "step": 400 }, { "loss": 0.027815410494804384, "grad_norm": 1.5, "learning_rate": 1.207843137254902e-05, "entropy": 0.03630108144134283, "num_tokens": 255505.0, "mean_token_accuracy": 0.9886294066905975, "epoch": 1.1911764705882353, "step": 405 }, { "loss": 0.029119834303855896, "grad_norm": 1.640625, "learning_rate": 1.1980392156862746e-05, "entropy": 0.0321140518411994, "num_tokens": 258679.0, "mean_token_accuracy": 0.9888967990875244, "epoch": 1.2058823529411764, "step": 410 }, { "loss": 0.025961104035377502, "grad_norm": 1.8203125, "learning_rate": 1.1882352941176472e-05, "entropy": 0.02944366242736578, "num_tokens": 261856.0, "mean_token_accuracy": 0.9895209610462189, "epoch": 1.2205882352941178, "step": 415 }, { "loss": 0.03058839440345764, "grad_norm": 2.390625, "learning_rate": 1.1784313725490198e-05, "entropy": 0.03461700212210417, "num_tokens": 264960.0, "mean_token_accuracy": 0.9882765769958496, "epoch": 1.2352941176470589, "step": 420 }, { "loss": 0.028424999117851256, "grad_norm": 1.28125, "learning_rate": 1.1686274509803922e-05, "entropy": 0.02985447719693184, "num_tokens": 268114.0, "mean_token_accuracy": 0.9882177650928498, "epoch": 1.25, "step": 425 }, { "loss": 0.03086719512939453, "grad_norm": 2.265625, "learning_rate": 1.1588235294117648e-05, "entropy": 0.03250212036073208, "num_tokens": 271274.0, "mean_token_accuracy": 0.9888392806053161, "epoch": 1.2647058823529411, "step": 430 }, { "loss": 0.027977922558784486, "grad_norm": 1.3046875, "learning_rate": 1.1490196078431373e-05, "entropy": 0.034127247892320155, "num_tokens": 274452.0, "mean_token_accuracy": 0.9908244907855988, "epoch": 1.2794117647058822, "step": 435 }, { "loss": 0.02676369547843933, "grad_norm": 1.09375, "learning_rate": 1.1392156862745099e-05, "entropy": 0.03699512742459774, "num_tokens": 277562.0, "mean_token_accuracy": 0.9871235430240631, "epoch": 1.2941176470588236, "step": 440 }, { "loss": 0.02789466977119446, "grad_norm": 2.203125, "learning_rate": 1.1294117647058825e-05, "entropy": 0.03514884728938341, "num_tokens": 280635.0, "mean_token_accuracy": 0.990158212184906, "epoch": 1.3088235294117647, "step": 445 }, { "loss": 0.03088509142398834, "grad_norm": 1.8359375, "learning_rate": 1.119607843137255e-05, "entropy": 0.034746605530381204, "num_tokens": 283725.0, "mean_token_accuracy": 0.9876766622066497, "epoch": 1.3235294117647058, "step": 450 }, { "loss": 0.03232976496219635, "grad_norm": 1.734375, "learning_rate": 1.1098039215686275e-05, "entropy": 0.031742793321609494, "num_tokens": 286888.0, "mean_token_accuracy": 0.9871384859085083, "epoch": 1.3382352941176472, "step": 455 }, { "loss": 0.02845146059989929, "grad_norm": 2.0, "learning_rate": 1.1000000000000001e-05, "entropy": 0.03175645042210817, "num_tokens": 290064.0, "mean_token_accuracy": 0.9873914003372193, "epoch": 1.3529411764705883, "step": 460 }, { "loss": 0.029486137628555297, "grad_norm": 1.265625, "learning_rate": 1.0901960784313726e-05, "entropy": 0.03463620245456696, "num_tokens": 293189.0, "mean_token_accuracy": 0.9874814569950103, "epoch": 1.3676470588235294, "step": 465 }, { "loss": 0.02618069648742676, "grad_norm": 1.109375, "learning_rate": 1.0803921568627452e-05, "entropy": 0.033889508619904515, "num_tokens": 296268.0, "mean_token_accuracy": 0.9882802128791809, "epoch": 1.3823529411764706, "step": 470 }, { "loss": 0.025544488430023195, "grad_norm": 0.8984375, "learning_rate": 1.0705882352941178e-05, "entropy": 0.03317532502114773, "num_tokens": 299418.0, "mean_token_accuracy": 0.9891822457313537, "epoch": 1.3970588235294117, "step": 475 }, { "loss": 0.02922942042350769, "grad_norm": 1.5859375, "learning_rate": 1.0607843137254902e-05, "entropy": 0.03228537701070309, "num_tokens": 302608.0, "mean_token_accuracy": 0.9864252746105194, "epoch": 1.4117647058823528, "step": 480 }, { "loss": 0.025081342458724974, "grad_norm": 1.4140625, "learning_rate": 1.0509803921568628e-05, "entropy": 0.033559339493513106, "num_tokens": 305748.0, "mean_token_accuracy": 0.9891697466373444, "epoch": 1.4264705882352942, "step": 485 }, { "loss": 0.028987354040145873, "grad_norm": 1.2109375, "learning_rate": 1.0411764705882354e-05, "entropy": 0.029655468463897706, "num_tokens": 308946.0, "mean_token_accuracy": 0.9884015321731567, "epoch": 1.4411764705882353, "step": 490 }, { "loss": 0.022376981377601624, "grad_norm": 1.5859375, "learning_rate": 1.031372549019608e-05, "entropy": 0.030257853865623473, "num_tokens": 312060.0, "mean_token_accuracy": 0.990349942445755, "epoch": 1.4558823529411764, "step": 495 }, { "loss": 0.027941384911537172, "grad_norm": 1.2734375, "learning_rate": 1.0215686274509805e-05, "entropy": 0.029427625238895416, "num_tokens": 315202.0, "mean_token_accuracy": 0.9894903540611267, "epoch": 1.4705882352941178, "step": 500 }, { "loss": 0.02513147294521332, "grad_norm": 1.8828125, "learning_rate": 1.011764705882353e-05, "entropy": 0.029220272414386274, "num_tokens": 318423.0, "mean_token_accuracy": 0.9887598037719727, "epoch": 1.4852941176470589, "step": 505 }, { "loss": 0.024520005285739898, "grad_norm": 1.3515625, "learning_rate": 1.0019607843137255e-05, "entropy": 0.027622674778103828, "num_tokens": 321643.0, "mean_token_accuracy": 0.9881017684936524, "epoch": 1.5, "step": 510 }, { "loss": 0.022774545848369597, "grad_norm": 0.96875, "learning_rate": 9.921568627450981e-06, "entropy": 0.027344943769276143, "num_tokens": 324896.0, "mean_token_accuracy": 0.9891824662685395, "epoch": 1.5147058823529411, "step": 515 }, { "loss": 0.026902440190315246, "grad_norm": 1.34375, "learning_rate": 9.823529411764706e-06, "entropy": 0.03210813459008932, "num_tokens": 327953.0, "mean_token_accuracy": 0.9872022986412048, "epoch": 1.5294117647058822, "step": 520 }, { "loss": 0.02404342144727707, "grad_norm": 1.34375, "learning_rate": 9.725490196078432e-06, "entropy": 0.03047515023499727, "num_tokens": 331110.0, "mean_token_accuracy": 0.9887873768806458, "epoch": 1.5441176470588234, "step": 525 }, { "loss": 0.022797247767448424, "grad_norm": 1.2265625, "learning_rate": 9.627450980392158e-06, "entropy": 0.03160413987934589, "num_tokens": 334226.0, "mean_token_accuracy": 0.9889481067657471, "epoch": 1.5588235294117647, "step": 530 }, { "loss": 0.023706996440887453, "grad_norm": 1.078125, "learning_rate": 9.529411764705882e-06, "entropy": 0.0283035334199667, "num_tokens": 337371.0, "mean_token_accuracy": 0.9890589594841004, "epoch": 1.5735294117647058, "step": 535 }, { "loss": 0.023340512812137604, "grad_norm": 2.5625, "learning_rate": 9.431372549019608e-06, "entropy": 0.029125319607555867, "num_tokens": 340563.0, "mean_token_accuracy": 0.9882973015308381, "epoch": 1.5882352941176472, "step": 540 }, { "loss": 0.025814762711524962, "grad_norm": 1.8046875, "learning_rate": 9.333333333333334e-06, "entropy": 0.029474343173205853, "num_tokens": 343715.0, "mean_token_accuracy": 0.9888520836830139, "epoch": 1.6029411764705883, "step": 545 }, { "loss": 0.024609880149364473, "grad_norm": 1.359375, "learning_rate": 9.23529411764706e-06, "entropy": 0.02793533504009247, "num_tokens": 346928.0, "mean_token_accuracy": 0.9896528542041778, "epoch": 1.6176470588235294, "step": 550 }, { "loss": 0.024091285467147828, "grad_norm": 1.171875, "learning_rate": 9.137254901960785e-06, "entropy": 0.03169798478484154, "num_tokens": 349942.0, "mean_token_accuracy": 0.9896469593048096, "epoch": 1.6323529411764706, "step": 555 }, { "loss": 0.022402273118495943, "grad_norm": 1.3203125, "learning_rate": 9.03921568627451e-06, "entropy": 0.02854564245790243, "num_tokens": 353063.0, "mean_token_accuracy": 0.9894876420497895, "epoch": 1.6470588235294117, "step": 560 }, { "loss": 0.023489847779273987, "grad_norm": 1.8359375, "learning_rate": 8.941176470588237e-06, "entropy": 0.028600608371198176, "num_tokens": 356180.0, "mean_token_accuracy": 0.9890201330184937, "epoch": 1.6617647058823528, "step": 565 }, { "loss": 0.02147035002708435, "grad_norm": 1.0859375, "learning_rate": 8.843137254901961e-06, "entropy": 0.026650307327508928, "num_tokens": 359351.0, "mean_token_accuracy": 0.9898578941822052, "epoch": 1.6764705882352942, "step": 570 }, { "loss": 0.022052311897277833, "grad_norm": 1.3515625, "learning_rate": 8.745098039215687e-06, "entropy": 0.027873093821108343, "num_tokens": 362470.0, "mean_token_accuracy": 0.989058256149292, "epoch": 1.6911764705882353, "step": 575 }, { "loss": 0.023864805698394775, "grad_norm": 1.5859375, "learning_rate": 8.647058823529413e-06, "entropy": 0.027629780396819115, "num_tokens": 365614.0, "mean_token_accuracy": 0.9894056558609009, "epoch": 1.7058823529411766, "step": 580 }, { "loss": 0.027744096517562867, "grad_norm": 1.6875, "learning_rate": 8.549019607843138e-06, "entropy": 0.028794774785637856, "num_tokens": 368805.0, "mean_token_accuracy": 0.9880473792552948, "epoch": 1.7205882352941178, "step": 585 }, { "loss": 0.021863000094890596, "grad_norm": 1.1796875, "learning_rate": 8.450980392156864e-06, "entropy": 0.028252063691616057, "num_tokens": 371947.0, "mean_token_accuracy": 0.9904429137706756, "epoch": 1.7352941176470589, "step": 590 }, { "loss": 0.021520544588565827, "grad_norm": 1.3203125, "learning_rate": 8.35294117647059e-06, "entropy": 0.028264945745468138, "num_tokens": 375103.0, "mean_token_accuracy": 0.9904776751995087, "epoch": 1.75, "step": 595 }, { "loss": 0.026353719830513, "grad_norm": 1.1953125, "learning_rate": 8.254901960784314e-06, "entropy": 0.027113928645849227, "num_tokens": 378317.0, "mean_token_accuracy": 0.9884898960590363, "epoch": 1.7647058823529411, "step": 600 }, { "loss": 0.026097461581230164, "grad_norm": 1.421875, "learning_rate": 8.15686274509804e-06, "entropy": 0.028313294425606726, "num_tokens": 381417.0, "mean_token_accuracy": 0.9879869103431702, "epoch": 1.7794117647058822, "step": 605 }, { "loss": 0.02049378156661987, "grad_norm": 1.0546875, "learning_rate": 8.058823529411766e-06, "entropy": 0.026570411399006844, "num_tokens": 384632.0, "mean_token_accuracy": 0.9887495577335358, "epoch": 1.7941176470588234, "step": 610 }, { "loss": 0.022221173346042632, "grad_norm": 1.1171875, "learning_rate": 7.96078431372549e-06, "entropy": 0.02754255346953869, "num_tokens": 387836.0, "mean_token_accuracy": 0.9899809181690216, "epoch": 1.8088235294117647, "step": 615 }, { "loss": 0.023856499791145326, "grad_norm": 1.3203125, "learning_rate": 7.862745098039217e-06, "entropy": 0.031241112016141416, "num_tokens": 390887.0, "mean_token_accuracy": 0.9897979915142059, "epoch": 1.8235294117647058, "step": 620 }, { "loss": 0.0225734680891037, "grad_norm": 1.40625, "learning_rate": 7.764705882352941e-06, "entropy": 0.02798519879579544, "num_tokens": 394027.0, "mean_token_accuracy": 0.9890839040279389, "epoch": 1.8382352941176472, "step": 625 }, { "loss": 0.022729092836380006, "grad_norm": 1.25, "learning_rate": 7.666666666666667e-06, "entropy": 0.02719390895217657, "num_tokens": 397202.0, "mean_token_accuracy": 0.9886514127254487, "epoch": 1.8529411764705883, "step": 630 }, { "loss": 0.021688875555992127, "grad_norm": 1.0859375, "learning_rate": 7.5686274509803925e-06, "entropy": 0.027222988195717335, "num_tokens": 400378.0, "mean_token_accuracy": 0.9908071339130402, "epoch": 1.8676470588235294, "step": 635 }, { "loss": 0.023884420096874238, "grad_norm": 1.4296875, "learning_rate": 7.4705882352941185e-06, "entropy": 0.028057356551289558, "num_tokens": 403503.0, "mean_token_accuracy": 0.9900456726551056, "epoch": 1.8823529411764706, "step": 640 }, { "loss": 0.020375268161296846, "grad_norm": 1.6953125, "learning_rate": 7.372549019607845e-06, "entropy": 0.02543655373156071, "num_tokens": 406768.0, "mean_token_accuracy": 0.9911065042018891, "epoch": 1.8970588235294117, "step": 645 }, { "loss": 0.020015493035316467, "grad_norm": 1.7421875, "learning_rate": 7.274509803921569e-06, "entropy": 0.027230485714972018, "num_tokens": 409875.0, "mean_token_accuracy": 0.9906234502792358, "epoch": 1.9117647058823528, "step": 650 }, { "loss": 0.022530680894851683, "grad_norm": 1.421875, "learning_rate": 7.176470588235295e-06, "entropy": 0.028223772905766963, "num_tokens": 412987.0, "mean_token_accuracy": 0.9903216242790223, "epoch": 1.9264705882352942, "step": 655 }, { "loss": 0.021129874885082243, "grad_norm": 1.109375, "learning_rate": 7.07843137254902e-06, "entropy": 0.02674291282892227, "num_tokens": 416181.0, "mean_token_accuracy": 0.9886639952659607, "epoch": 1.9411764705882353, "step": 660 }, { "loss": 0.021244224905967713, "grad_norm": 0.9453125, "learning_rate": 6.9803921568627454e-06, "entropy": 0.028005971759557723, "num_tokens": 419323.0, "mean_token_accuracy": 0.9905200719833374, "epoch": 1.9558823529411766, "step": 665 }, { "loss": 0.022309188544750214, "grad_norm": 1.375, "learning_rate": 6.8823529411764715e-06, "entropy": 0.027272411435842515, "num_tokens": 422484.0, "mean_token_accuracy": 0.9878733932971955, "epoch": 1.9705882352941178, "step": 670 }, { "loss": 0.022459632158279418, "grad_norm": 1.203125, "learning_rate": 6.784313725490197e-06, "entropy": 0.026817415095865726, "num_tokens": 425583.0, "mean_token_accuracy": 0.9908780753612518, "epoch": 1.9852941176470589, "step": 675 }, { "loss": 0.021811096370220183, "grad_norm": 1.265625, "learning_rate": 6.686274509803922e-06, "entropy": 0.026038615591824056, "num_tokens": 428736.0, "mean_token_accuracy": 0.9897907853126526, "epoch": 2.0, "step": 680 }, { "loss": 0.019171090424060823, "grad_norm": 1.078125, "learning_rate": 6.588235294117647e-06, "entropy": 0.02475190218538046, "num_tokens": 431976.0, "mean_token_accuracy": 0.989355844259262, "epoch": 2.014705882352941, "step": 685 }, { "loss": 0.023474155366420744, "grad_norm": 1.1640625, "learning_rate": 6.490196078431373e-06, "entropy": 0.026115396432578562, "num_tokens": 435142.0, "mean_token_accuracy": 0.9885824680328369, "epoch": 2.0294117647058822, "step": 690 }, { "loss": 0.020176805555820465, "grad_norm": 1.0, "learning_rate": 6.3921568627450984e-06, "entropy": 0.026907235756516455, "num_tokens": 438259.0, "mean_token_accuracy": 0.9919745445251464, "epoch": 2.0441176470588234, "step": 695 }, { "loss": 0.022543656826019286, "grad_norm": 1.34375, "learning_rate": 6.294117647058824e-06, "entropy": 0.02749718502163887, "num_tokens": 441366.0, "mean_token_accuracy": 0.9880188047885895, "epoch": 2.0588235294117645, "step": 700 }, { "loss": 0.019685085117816924, "grad_norm": 0.9453125, "learning_rate": 6.19607843137255e-06, "entropy": 0.024849089048802852, "num_tokens": 444474.0, "mean_token_accuracy": 0.9906105160713196, "epoch": 2.073529411764706, "step": 705 }, { "loss": 0.020225000381469727, "grad_norm": 1.234375, "learning_rate": 6.098039215686276e-06, "entropy": 0.023934758827090265, "num_tokens": 447652.0, "mean_token_accuracy": 0.9896179974079132, "epoch": 2.088235294117647, "step": 710 }, { "loss": 0.02128472626209259, "grad_norm": 1.078125, "learning_rate": 6e-06, "entropy": 0.02389440070837736, "num_tokens": 450833.0, "mean_token_accuracy": 0.9899099349975586, "epoch": 2.1029411764705883, "step": 715 }, { "loss": 0.021367147564888, "grad_norm": 1.6015625, "learning_rate": 5.901960784313726e-06, "entropy": 0.02620517127215862, "num_tokens": 453949.0, "mean_token_accuracy": 0.988726532459259, "epoch": 2.1176470588235294, "step": 720 }, { "loss": 0.01960753947496414, "grad_norm": 1.03125, "learning_rate": 5.803921568627452e-06, "entropy": 0.02435927651822567, "num_tokens": 457147.0, "mean_token_accuracy": 0.9908569097518921, "epoch": 2.1323529411764706, "step": 725 }, { "loss": 0.022167882323265074, "grad_norm": 1.234375, "learning_rate": 5.705882352941177e-06, "entropy": 0.02521121110767126, "num_tokens": 460308.0, "mean_token_accuracy": 0.9891940593719483, "epoch": 2.1470588235294117, "step": 730 }, { "loss": 0.0210279181599617, "grad_norm": 1.359375, "learning_rate": 5.607843137254903e-06, "entropy": 0.02500821612775326, "num_tokens": 463449.0, "mean_token_accuracy": 0.9884547054767608, "epoch": 2.161764705882353, "step": 735 }, { "loss": 0.01987575888633728, "grad_norm": 1.03125, "learning_rate": 5.509803921568628e-06, "entropy": 0.025977463461458683, "num_tokens": 466590.0, "mean_token_accuracy": 0.9888093769550323, "epoch": 2.176470588235294, "step": 740 }, { "loss": 0.019111356139183043, "grad_norm": 1.25, "learning_rate": 5.411764705882353e-06, "entropy": 0.02638601940125227, "num_tokens": 469726.0, "mean_token_accuracy": 0.9917258858680725, "epoch": 2.1911764705882355, "step": 745 }, { "loss": 0.020354922115802764, "grad_norm": 1.171875, "learning_rate": 5.313725490196079e-06, "entropy": 0.026662386767566205, "num_tokens": 472853.0, "mean_token_accuracy": 0.99064000248909, "epoch": 2.2058823529411766, "step": 750 }, { "loss": 0.01959734410047531, "grad_norm": 0.80859375, "learning_rate": 5.2156862745098044e-06, "entropy": 0.02579411044716835, "num_tokens": 476008.0, "mean_token_accuracy": 0.9904728531837463, "epoch": 2.2205882352941178, "step": 755 }, { "loss": 0.020466303825378417, "grad_norm": 1.3828125, "learning_rate": 5.11764705882353e-06, "entropy": 0.0256651122123003, "num_tokens": 479150.0, "mean_token_accuracy": 0.9903539717197418, "epoch": 2.235294117647059, "step": 760 }, { "loss": 0.01983775794506073, "grad_norm": 0.99609375, "learning_rate": 5.019607843137255e-06, "entropy": 0.02584236618131399, "num_tokens": 482321.0, "mean_token_accuracy": 0.9914842903614044, "epoch": 2.25, "step": 765 }, { "loss": 0.020100761950016022, "grad_norm": 1.046875, "learning_rate": 4.921568627450981e-06, "entropy": 0.02499296572059393, "num_tokens": 485510.0, "mean_token_accuracy": 0.991219836473465, "epoch": 2.264705882352941, "step": 770 }, { "loss": 0.02088477313518524, "grad_norm": 1.328125, "learning_rate": 4.823529411764706e-06, "entropy": 0.024959737621247768, "num_tokens": 488698.0, "mean_token_accuracy": 0.9898148238658905, "epoch": 2.2794117647058822, "step": 775 }, { "loss": 0.0195361465215683, "grad_norm": 1.2421875, "learning_rate": 4.725490196078431e-06, "entropy": 0.023672481067478657, "num_tokens": 491906.0, "mean_token_accuracy": 0.9900302290916443, "epoch": 2.2941176470588234, "step": 780 }, { "loss": 0.019702821969985962, "grad_norm": 1.265625, "learning_rate": 4.627450980392157e-06, "entropy": 0.025737580843269825, "num_tokens": 494997.0, "mean_token_accuracy": 0.9905776441097259, "epoch": 2.3088235294117645, "step": 785 }, { "loss": 0.018527360260486604, "grad_norm": 1.078125, "learning_rate": 4.529411764705883e-06, "entropy": 0.02454463895410299, "num_tokens": 498138.0, "mean_token_accuracy": 0.9910318195819855, "epoch": 2.323529411764706, "step": 790 }, { "loss": 0.018923106789588928, "grad_norm": 1.359375, "learning_rate": 4.431372549019608e-06, "entropy": 0.0245100449770689, "num_tokens": 501316.0, "mean_token_accuracy": 0.9911953806877136, "epoch": 2.338235294117647, "step": 795 }, { "loss": 0.01874026209115982, "grad_norm": 1.140625, "learning_rate": 4.333333333333334e-06, "entropy": 0.023334310948848726, "num_tokens": 504533.0, "mean_token_accuracy": 0.9910171329975128, "epoch": 2.3529411764705883, "step": 800 }, { "loss": 0.022160655260086058, "grad_norm": 1.2578125, "learning_rate": 4.235294117647059e-06, "entropy": 0.026187057420611382, "num_tokens": 507616.0, "mean_token_accuracy": 0.9876076638698578, "epoch": 2.3676470588235294, "step": 805 }, { "loss": 0.018640576303005217, "grad_norm": 1.03125, "learning_rate": 4.137254901960784e-06, "entropy": 0.02308085039258003, "num_tokens": 510793.0, "mean_token_accuracy": 0.9908162891864777, "epoch": 2.3823529411764706, "step": 810 }, { "loss": 0.019237047433853148, "grad_norm": 0.8984375, "learning_rate": 4.03921568627451e-06, "entropy": 0.024417817965149878, "num_tokens": 513995.0, "mean_token_accuracy": 0.9902299284934998, "epoch": 2.3970588235294117, "step": 815 }, { "loss": 0.020626239478588104, "grad_norm": 1.1640625, "learning_rate": 3.941176470588236e-06, "entropy": 0.025944224931299685, "num_tokens": 517128.0, "mean_token_accuracy": 0.9896773338317871, "epoch": 2.411764705882353, "step": 820 }, { "loss": 0.018906430900096895, "grad_norm": 1.0546875, "learning_rate": 3.843137254901962e-06, "entropy": 0.02529167104512453, "num_tokens": 520219.0, "mean_token_accuracy": 0.9905548214912414, "epoch": 2.426470588235294, "step": 825 }, { "loss": 0.01989607810974121, "grad_norm": 1.171875, "learning_rate": 3.7450980392156865e-06, "entropy": 0.025429282896220685, "num_tokens": 523368.0, "mean_token_accuracy": 0.9910161614418029, "epoch": 2.4411764705882355, "step": 830 }, { "loss": 0.019511505961418152, "grad_norm": 1.046875, "learning_rate": 3.6470588235294117e-06, "entropy": 0.026134114153683184, "num_tokens": 526516.0, "mean_token_accuracy": 0.9898114144802094, "epoch": 2.4558823529411766, "step": 835 }, { "loss": 0.018582092225551607, "grad_norm": 1.1328125, "learning_rate": 3.5490196078431378e-06, "entropy": 0.02343358173966408, "num_tokens": 529660.0, "mean_token_accuracy": 0.9904271245002747, "epoch": 2.4705882352941178, "step": 840 }, { "loss": 0.020261451601982117, "grad_norm": 1.453125, "learning_rate": 3.450980392156863e-06, "entropy": 0.024460323713719846, "num_tokens": 532778.0, "mean_token_accuracy": 0.9899402976036071, "epoch": 2.485294117647059, "step": 845 }, { "loss": 0.020383948087692262, "grad_norm": 1.1796875, "learning_rate": 3.352941176470588e-06, "entropy": 0.024987665377557276, "num_tokens": 535932.0, "mean_token_accuracy": 0.9898059248924256, "epoch": 2.5, "step": 850 }, { "loss": 0.019448164105415344, "grad_norm": 1.3515625, "learning_rate": 3.2549019607843143e-06, "entropy": 0.02465162370353937, "num_tokens": 539037.0, "mean_token_accuracy": 0.9913235783576966, "epoch": 2.514705882352941, "step": 855 }, { "loss": 0.018925553560256957, "grad_norm": 1.046875, "learning_rate": 3.1568627450980395e-06, "entropy": 0.025184641405940057, "num_tokens": 542197.0, "mean_token_accuracy": 0.991470605134964, "epoch": 2.5294117647058822, "step": 860 }, { "loss": 0.01913969814777374, "grad_norm": 1.0546875, "learning_rate": 3.058823529411765e-06, "entropy": 0.024113286659121512, "num_tokens": 545387.0, "mean_token_accuracy": 0.9914486467838287, "epoch": 2.5441176470588234, "step": 865 }, { "loss": 0.018765930831432343, "grad_norm": 1.0703125, "learning_rate": 2.9607843137254903e-06, "entropy": 0.02413007989525795, "num_tokens": 548534.0, "mean_token_accuracy": 0.9907777428627014, "epoch": 2.5588235294117645, "step": 870 }, { "loss": 0.019279350340366364, "grad_norm": 2.1875, "learning_rate": 2.8627450980392155e-06, "entropy": 0.024522659182548524, "num_tokens": 551721.0, "mean_token_accuracy": 0.9905555963516235, "epoch": 2.5735294117647056, "step": 875 }, { "loss": 0.019660860300064087, "grad_norm": 1.1015625, "learning_rate": 2.7647058823529416e-06, "entropy": 0.024852845631539822, "num_tokens": 554912.0, "mean_token_accuracy": 0.9898727238178253, "epoch": 2.588235294117647, "step": 880 }, { "loss": 0.018780362606048585, "grad_norm": 1.0703125, "learning_rate": 2.666666666666667e-06, "entropy": 0.02551023568958044, "num_tokens": 558028.0, "mean_token_accuracy": 0.99192915558815, "epoch": 2.6029411764705883, "step": 885 }, { "loss": 0.01949601024389267, "grad_norm": 1.1953125, "learning_rate": 2.568627450980392e-06, "entropy": 0.025155650451779366, "num_tokens": 561189.0, "mean_token_accuracy": 0.990712708234787, "epoch": 2.6176470588235294, "step": 890 }, { "loss": 0.019716159999370576, "grad_norm": 1.296875, "learning_rate": 2.470588235294118e-06, "entropy": 0.024883992783725262, "num_tokens": 564374.0, "mean_token_accuracy": 0.989579439163208, "epoch": 2.6323529411764706, "step": 895 }, { "loss": 0.017295162379741668, "grad_norm": 0.97265625, "learning_rate": 2.3725490196078433e-06, "entropy": 0.0241273645311594, "num_tokens": 567550.0, "mean_token_accuracy": 0.9934020042419434, "epoch": 2.6470588235294117, "step": 900 }, { "loss": 0.020695842802524567, "grad_norm": 1.109375, "learning_rate": 2.274509803921569e-06, "entropy": 0.02697849553078413, "num_tokens": 570611.0, "mean_token_accuracy": 0.9914706110954284, "epoch": 2.661764705882353, "step": 905 }, { "loss": 0.017908445000648497, "grad_norm": 1.2734375, "learning_rate": 2.176470588235294e-06, "entropy": 0.022997986152768136, "num_tokens": 573767.0, "mean_token_accuracy": 0.9898150980472564, "epoch": 2.6764705882352944, "step": 910 }, { "loss": 0.020641934871673585, "grad_norm": 1.4921875, "learning_rate": 2.07843137254902e-06, "entropy": 0.027346356958150863, "num_tokens": 576830.0, "mean_token_accuracy": 0.9897843182086945, "epoch": 2.6911764705882355, "step": 915 }, { "loss": 0.019691270589828492, "grad_norm": 1.2890625, "learning_rate": 1.980392156862745e-06, "entropy": 0.023718219250440598, "num_tokens": 580065.0, "mean_token_accuracy": 0.9901076138019562, "epoch": 2.7058823529411766, "step": 920 }, { "loss": 0.02009253352880478, "grad_norm": 1.2109375, "learning_rate": 1.8823529411764707e-06, "entropy": 0.024860053882002832, "num_tokens": 583200.0, "mean_token_accuracy": 0.9894306361675262, "epoch": 2.7205882352941178, "step": 925 }, { "loss": 0.019820311665534975, "grad_norm": 1.1796875, "learning_rate": 1.7843137254901963e-06, "entropy": 0.02641481179744005, "num_tokens": 586247.0, "mean_token_accuracy": 0.9888152658939362, "epoch": 2.735294117647059, "step": 930 }, { "loss": 0.020238989591598512, "grad_norm": 1.34375, "learning_rate": 1.6862745098039217e-06, "entropy": 0.025426279939711093, "num_tokens": 589348.0, "mean_token_accuracy": 0.9893324971199036, "epoch": 2.75, "step": 935 }, { "loss": 0.020529073476791383, "grad_norm": 1.1953125, "learning_rate": 1.5882352941176472e-06, "entropy": 0.025489212945103645, "num_tokens": 592483.0, "mean_token_accuracy": 0.9883848607540131, "epoch": 2.764705882352941, "step": 940 }, { "loss": 0.019503119587898254, "grad_norm": 1.875, "learning_rate": 1.4901960784313726e-06, "entropy": 0.025844238512218, "num_tokens": 595654.0, "mean_token_accuracy": 0.9898752987384796, "epoch": 2.7794117647058822, "step": 945 }, { "loss": 0.020725423097610475, "grad_norm": 1.3359375, "learning_rate": 1.3921568627450982e-06, "entropy": 0.025542815588414668, "num_tokens": 598757.0, "mean_token_accuracy": 0.9899684190750122, "epoch": 2.7941176470588234, "step": 950 }, { "loss": 0.020795242488384248, "grad_norm": 1.1640625, "learning_rate": 1.2941176470588237e-06, "entropy": 0.023506213910877705, "num_tokens": 602069.0, "mean_token_accuracy": 0.9894281327724457, "epoch": 2.8088235294117645, "step": 955 }, { "loss": 0.01915638893842697, "grad_norm": 1.21875, "learning_rate": 1.196078431372549e-06, "entropy": 0.024655142053961753, "num_tokens": 605286.0, "mean_token_accuracy": 0.9900248169898986, "epoch": 2.8235294117647056, "step": 960 }, { "loss": 0.01975841522216797, "grad_norm": 1.1484375, "learning_rate": 1.0980392156862745e-06, "entropy": 0.025551106408238412, "num_tokens": 608374.0, "mean_token_accuracy": 0.9892638444900512, "epoch": 2.838235294117647, "step": 965 }, { "loss": 0.020852866768836974, "grad_norm": 1.2421875, "learning_rate": 1.0000000000000002e-06, "entropy": 0.02480896282941103, "num_tokens": 611577.0, "mean_token_accuracy": 0.9892595648765564, "epoch": 2.8529411764705883, "step": 970 }, { "loss": 0.019326749444007873, "grad_norm": 0.875, "learning_rate": 9.019607843137256e-07, "entropy": 0.02385783474892378, "num_tokens": 614761.0, "mean_token_accuracy": 0.9904800593852997, "epoch": 2.8676470588235294, "step": 975 }, { "loss": 0.019405061006546022, "grad_norm": 1.1875, "learning_rate": 8.039215686274511e-07, "entropy": 0.026029090210795403, "num_tokens": 617870.0, "mean_token_accuracy": 0.9896216452121734, "epoch": 2.8823529411764706, "step": 980 }, { "loss": 0.019337351620197295, "grad_norm": 0.9921875, "learning_rate": 7.058823529411766e-07, "entropy": 0.026062553003430366, "num_tokens": 620943.0, "mean_token_accuracy": 0.9899002552032471, "epoch": 2.8970588235294117, "step": 985 }, { "loss": 0.01972263157367706, "grad_norm": 1.5625, "learning_rate": 6.07843137254902e-07, "entropy": 0.025324805453419686, "num_tokens": 624094.0, "mean_token_accuracy": 0.9898600101470947, "epoch": 2.911764705882353, "step": 990 }, { "loss": 0.017833781242370606, "grad_norm": 1.2265625, "learning_rate": 5.098039215686275e-07, "entropy": 0.023284821771085262, "num_tokens": 627253.0, "mean_token_accuracy": 0.9910983681678772, "epoch": 2.9264705882352944, "step": 995 }, { "loss": 0.020137375593185423, "grad_norm": 1.3984375, "learning_rate": 4.1176470588235295e-07, "entropy": 0.024203809909522533, "num_tokens": 630427.0, "mean_token_accuracy": 0.9907480180263519, "epoch": 2.9411764705882355, "step": 1000 }, { "loss": 0.019109995663166048, "grad_norm": 1.21875, "learning_rate": 3.1372549019607843e-07, "entropy": 0.02416255362331867, "num_tokens": 633632.0, "mean_token_accuracy": 0.9915190756320953, "epoch": 2.9558823529411766, "step": 1005 }, { "loss": 0.02000269144773483, "grad_norm": 1.859375, "learning_rate": 2.1568627450980394e-07, "entropy": 0.024217843264341354, "num_tokens": 636805.0, "mean_token_accuracy": 0.9894875824451447, "epoch": 2.9705882352941178, "step": 1010 }, { "loss": 0.020338763296604157, "grad_norm": 1.546875, "learning_rate": 1.1764705882352942e-07, "entropy": 0.024258859269320966, "num_tokens": 639984.0, "mean_token_accuracy": 0.9892021059989929, "epoch": 2.985294117647059, "step": 1015 }, { "loss": 0.020995336771011352, "grad_norm": 1.046875, "learning_rate": 1.9607843137254902e-08, "entropy": 0.025342148169875144, "num_tokens": 643104.0, "mean_token_accuracy": 0.9887544453144074, "epoch": 3.0, "step": 1020 }, { "train_runtime": 3944.5682, "train_samples_per_second": 0.517, "train_steps_per_second": 0.259, "total_flos": 5056111718203392.0, "train_loss": 0.07629515403041652, "epoch": 3.0, "step": 1020 } ]