{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.5197568389057752, "eval_steps": 500, "global_step": 2000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0007598784194528875, "grad_norm": 11.283177375793457, "learning_rate": 0.0, "loss": 0.7971611022949219, "mean_token_accuracy": 0.7827156782150269, "num_tokens": 9945.0, "step": 1 }, { "epoch": 0.001519756838905775, "grad_norm": 15.171019554138184, "learning_rate": 2.5252525252525256e-08, "loss": 0.7045958042144775, "mean_token_accuracy": 0.8420767784118652, "num_tokens": 13584.0, "step": 2 }, { "epoch": 0.0022796352583586625, "grad_norm": 13.268917083740234, "learning_rate": 5.050505050505051e-08, "loss": 0.9431202411651611, "mean_token_accuracy": 0.7105640172958374, "num_tokens": 25466.0, "step": 3 }, { "epoch": 0.00303951367781155, "grad_norm": 16.681028366088867, "learning_rate": 7.575757575757576e-08, "loss": 1.0039623975753784, "mean_token_accuracy": 0.7251508235931396, "num_tokens": 31349.0, "step": 4 }, { "epoch": 0.003799392097264438, "grad_norm": 14.952352523803711, "learning_rate": 1.0101010101010103e-07, "loss": 0.9401828050613403, "mean_token_accuracy": 0.7458977699279785, "num_tokens": 38127.0, "step": 5 }, { "epoch": 0.004559270516717325, "grad_norm": 12.397281646728516, "learning_rate": 1.2626262626262626e-07, "loss": 0.7461763620376587, "mean_token_accuracy": 0.7800794839859009, "num_tokens": 44277.0, "step": 6 }, { "epoch": 0.005319148936170213, "grad_norm": 13.733500480651855, "learning_rate": 1.5151515151515152e-07, "loss": 0.845583438873291, "mean_token_accuracy": 0.7497140169143677, "num_tokens": 52421.0, "step": 7 }, { "epoch": 0.0060790273556231, "grad_norm": 11.571980476379395, "learning_rate": 1.767676767676768e-07, "loss": 0.8373166918754578, "mean_token_accuracy": 0.760189414024353, "num_tokens": 62813.0, "step": 8 }, { "epoch": 0.006838905775075988, "grad_norm": 14.639892578125, "learning_rate": 2.0202020202020205e-07, "loss": 0.9515388607978821, "mean_token_accuracy": 0.7438177466392517, "num_tokens": 69119.0, "step": 9 }, { "epoch": 0.007598784194528876, "grad_norm": 17.526134490966797, "learning_rate": 2.2727272727272729e-07, "loss": 1.0442354679107666, "mean_token_accuracy": 0.7132344245910645, "num_tokens": 74791.0, "step": 10 }, { "epoch": 0.008358662613981762, "grad_norm": 11.106691360473633, "learning_rate": 2.525252525252525e-07, "loss": 0.8264291286468506, "mean_token_accuracy": 0.7412869334220886, "num_tokens": 89934.0, "step": 11 }, { "epoch": 0.00911854103343465, "grad_norm": 16.4799861907959, "learning_rate": 2.7777777777777776e-07, "loss": 1.0191465616226196, "mean_token_accuracy": 0.7320127487182617, "num_tokens": 96056.0, "step": 12 }, { "epoch": 0.009878419452887538, "grad_norm": 12.276870727539062, "learning_rate": 3.0303030303030305e-07, "loss": 0.9538564682006836, "mean_token_accuracy": 0.7254630327224731, "num_tokens": 106049.0, "step": 13 }, { "epoch": 0.010638297872340425, "grad_norm": 13.591197967529297, "learning_rate": 3.2828282828282834e-07, "loss": 0.8274934887886047, "mean_token_accuracy": 0.7830080986022949, "num_tokens": 111456.0, "step": 14 }, { "epoch": 0.011398176291793313, "grad_norm": 15.956825256347656, "learning_rate": 3.535353535353536e-07, "loss": 0.8980859518051147, "mean_token_accuracy": 0.7673473954200745, "num_tokens": 117730.0, "step": 15 }, { "epoch": 0.0121580547112462, "grad_norm": 11.615748405456543, "learning_rate": 3.787878787878788e-07, "loss": 0.8425506949424744, "mean_token_accuracy": 0.7624208927154541, "num_tokens": 127723.0, "step": 16 }, { "epoch": 0.012917933130699088, "grad_norm": 13.266345024108887, "learning_rate": 4.040404040404041e-07, "loss": 0.955736517906189, "mean_token_accuracy": 0.7301814556121826, "num_tokens": 140758.0, "step": 17 }, { "epoch": 0.013677811550151976, "grad_norm": 12.722447395324707, "learning_rate": 4.2929292929292934e-07, "loss": 0.8487507700920105, "mean_token_accuracy": 0.7661495804786682, "num_tokens": 148284.0, "step": 18 }, { "epoch": 0.014437689969604863, "grad_norm": 20.19617462158203, "learning_rate": 4.5454545454545457e-07, "loss": 1.1628165245056152, "mean_token_accuracy": 0.7297697067260742, "num_tokens": 151360.0, "step": 19 }, { "epoch": 0.015197568389057751, "grad_norm": 10.173198699951172, "learning_rate": 4.797979797979798e-07, "loss": 0.7091408967971802, "mean_token_accuracy": 0.8006219267845154, "num_tokens": 161563.0, "step": 20 }, { "epoch": 0.015957446808510637, "grad_norm": 10.829021453857422, "learning_rate": 5.05050505050505e-07, "loss": 0.7268050909042358, "mean_token_accuracy": 0.7929083108901978, "num_tokens": 171492.0, "step": 21 }, { "epoch": 0.016717325227963525, "grad_norm": 13.791851997375488, "learning_rate": 5.303030303030304e-07, "loss": 1.0046508312225342, "mean_token_accuracy": 0.7021903991699219, "num_tokens": 180054.0, "step": 22 }, { "epoch": 0.017477203647416412, "grad_norm": 12.147807121276855, "learning_rate": 5.555555555555555e-07, "loss": 0.868274450302124, "mean_token_accuracy": 0.7546091675758362, "num_tokens": 187940.0, "step": 23 }, { "epoch": 0.0182370820668693, "grad_norm": 11.01634407043457, "learning_rate": 5.808080808080809e-07, "loss": 0.6816753149032593, "mean_token_accuracy": 0.7865602970123291, "num_tokens": 204159.0, "step": 24 }, { "epoch": 0.018996960486322188, "grad_norm": 25.296483993530273, "learning_rate": 6.060606060606061e-07, "loss": 0.9021725654602051, "mean_token_accuracy": 0.8104009628295898, "num_tokens": 206425.0, "step": 25 }, { "epoch": 0.019756838905775075, "grad_norm": 12.985673904418945, "learning_rate": 6.313131313131314e-07, "loss": 0.9623971581459045, "mean_token_accuracy": 0.7304720878601074, "num_tokens": 216035.0, "step": 26 }, { "epoch": 0.020516717325227963, "grad_norm": 10.757186889648438, "learning_rate": 6.565656565656567e-07, "loss": 0.8544430136680603, "mean_token_accuracy": 0.7558294534683228, "num_tokens": 221961.0, "step": 27 }, { "epoch": 0.02127659574468085, "grad_norm": 14.1519775390625, "learning_rate": 6.818181818181818e-07, "loss": 0.9835999011993408, "mean_token_accuracy": 0.7347756028175354, "num_tokens": 226096.0, "step": 28 }, { "epoch": 0.022036474164133738, "grad_norm": 10.439614295959473, "learning_rate": 7.070707070707071e-07, "loss": 0.7926623821258545, "mean_token_accuracy": 0.7580159306526184, "num_tokens": 233847.0, "step": 29 }, { "epoch": 0.022796352583586626, "grad_norm": 7.497498512268066, "learning_rate": 7.323232323232324e-07, "loss": 0.6091005206108093, "mean_token_accuracy": 0.8185515999794006, "num_tokens": 250007.0, "step": 30 }, { "epoch": 0.023556231003039513, "grad_norm": 7.350061893463135, "learning_rate": 7.575757575757576e-07, "loss": 0.6764670610427856, "mean_token_accuracy": 0.8003748655319214, "num_tokens": 266452.0, "step": 31 }, { "epoch": 0.0243161094224924, "grad_norm": 8.805062294006348, "learning_rate": 7.82828282828283e-07, "loss": 0.9537851810455322, "mean_token_accuracy": 0.7211984992027283, "num_tokens": 273503.0, "step": 32 }, { "epoch": 0.02507598784194529, "grad_norm": 7.508991718292236, "learning_rate": 8.080808080808082e-07, "loss": 0.7078614830970764, "mean_token_accuracy": 0.8025475144386292, "num_tokens": 279366.0, "step": 33 }, { "epoch": 0.025835866261398176, "grad_norm": 7.2268242835998535, "learning_rate": 8.333333333333333e-07, "loss": 0.724380373954773, "mean_token_accuracy": 0.7933604717254639, "num_tokens": 286850.0, "step": 34 }, { "epoch": 0.026595744680851064, "grad_norm": 5.610812664031982, "learning_rate": 8.585858585858587e-07, "loss": 0.5734965801239014, "mean_token_accuracy": 0.8215349912643433, "num_tokens": 298456.0, "step": 35 }, { "epoch": 0.02735562310030395, "grad_norm": 5.60352087020874, "learning_rate": 8.838383838383839e-07, "loss": 0.58527672290802, "mean_token_accuracy": 0.8147624731063843, "num_tokens": 308843.0, "step": 36 }, { "epoch": 0.02811550151975684, "grad_norm": 7.149941444396973, "learning_rate": 9.090909090909091e-07, "loss": 0.743807315826416, "mean_token_accuracy": 0.7675718665122986, "num_tokens": 314275.0, "step": 37 }, { "epoch": 0.028875379939209727, "grad_norm": 4.390667915344238, "learning_rate": 9.343434343434345e-07, "loss": 0.6498252749443054, "mean_token_accuracy": 0.7975111603736877, "num_tokens": 326556.0, "step": 38 }, { "epoch": 0.029635258358662615, "grad_norm": 4.54934549331665, "learning_rate": 9.595959595959596e-07, "loss": 0.6274977326393127, "mean_token_accuracy": 0.7976137399673462, "num_tokens": 332575.0, "step": 39 }, { "epoch": 0.030395136778115502, "grad_norm": 3.5230090618133545, "learning_rate": 9.84848484848485e-07, "loss": 0.5257169008255005, "mean_token_accuracy": 0.8215709924697876, "num_tokens": 344979.0, "step": 40 }, { "epoch": 0.03115501519756839, "grad_norm": 5.243790626525879, "learning_rate": 1.01010101010101e-06, "loss": 0.6588603854179382, "mean_token_accuracy": 0.8016726970672607, "num_tokens": 352625.0, "step": 41 }, { "epoch": 0.031914893617021274, "grad_norm": 4.898980617523193, "learning_rate": 1.0353535353535354e-06, "loss": 0.48198166489601135, "mean_token_accuracy": 0.8494296073913574, "num_tokens": 356906.0, "step": 42 }, { "epoch": 0.03267477203647416, "grad_norm": 4.83416748046875, "learning_rate": 1.0606060606060608e-06, "loss": 0.7198941707611084, "mean_token_accuracy": 0.7772096991539001, "num_tokens": 362345.0, "step": 43 }, { "epoch": 0.03343465045592705, "grad_norm": 3.0984978675842285, "learning_rate": 1.085858585858586e-06, "loss": 0.5374331474304199, "mean_token_accuracy": 0.8272683620452881, "num_tokens": 369624.0, "step": 44 }, { "epoch": 0.03419452887537994, "grad_norm": 2.5063867568969727, "learning_rate": 1.111111111111111e-06, "loss": 0.5425165891647339, "mean_token_accuracy": 0.8135398626327515, "num_tokens": 383627.0, "step": 45 }, { "epoch": 0.034954407294832825, "grad_norm": 2.2525131702423096, "learning_rate": 1.1363636363636364e-06, "loss": 0.4416641891002655, "mean_token_accuracy": 0.8437566757202148, "num_tokens": 401569.0, "step": 46 }, { "epoch": 0.03571428571428571, "grad_norm": 3.5199527740478516, "learning_rate": 1.1616161616161617e-06, "loss": 0.5858573913574219, "mean_token_accuracy": 0.7998367547988892, "num_tokens": 408392.0, "step": 47 }, { "epoch": 0.0364741641337386, "grad_norm": 3.3617491722106934, "learning_rate": 1.186868686868687e-06, "loss": 0.4014914929866791, "mean_token_accuracy": 0.8768728971481323, "num_tokens": 413042.0, "step": 48 }, { "epoch": 0.03723404255319149, "grad_norm": 6.081437110900879, "learning_rate": 1.2121212121212122e-06, "loss": 0.74919193983078, "mean_token_accuracy": 0.7965025901794434, "num_tokens": 415571.0, "step": 49 }, { "epoch": 0.037993920972644375, "grad_norm": 5.349213600158691, "learning_rate": 1.2373737373737375e-06, "loss": 0.5735707879066467, "mean_token_accuracy": 0.8414558172225952, "num_tokens": 418323.0, "step": 50 }, { "epoch": 0.03875379939209726, "grad_norm": 3.1204047203063965, "learning_rate": 1.2626262626262629e-06, "loss": 0.5516555309295654, "mean_token_accuracy": 0.8204443454742432, "num_tokens": 431515.0, "step": 51 }, { "epoch": 0.03951367781155015, "grad_norm": 3.1404378414154053, "learning_rate": 1.287878787878788e-06, "loss": 0.640109658241272, "mean_token_accuracy": 0.7860654592514038, "num_tokens": 441654.0, "step": 52 }, { "epoch": 0.04027355623100304, "grad_norm": 2.680227518081665, "learning_rate": 1.3131313131313134e-06, "loss": 0.5188562273979187, "mean_token_accuracy": 0.8292064666748047, "num_tokens": 451848.0, "step": 53 }, { "epoch": 0.041033434650455926, "grad_norm": 3.0411198139190674, "learning_rate": 1.3383838383838385e-06, "loss": 0.6637120842933655, "mean_token_accuracy": 0.7974024415016174, "num_tokens": 463258.0, "step": 54 }, { "epoch": 0.04179331306990881, "grad_norm": 1.9518145322799683, "learning_rate": 1.3636363636363636e-06, "loss": 0.5045453310012817, "mean_token_accuracy": 0.8279316425323486, "num_tokens": 482025.0, "step": 55 }, { "epoch": 0.0425531914893617, "grad_norm": 2.809535503387451, "learning_rate": 1.3888888888888892e-06, "loss": 0.6189742684364319, "mean_token_accuracy": 0.788748025894165, "num_tokens": 491561.0, "step": 56 }, { "epoch": 0.04331306990881459, "grad_norm": 2.19962477684021, "learning_rate": 1.4141414141414143e-06, "loss": 0.5295487642288208, "mean_token_accuracy": 0.8143942356109619, "num_tokens": 513580.0, "step": 57 }, { "epoch": 0.044072948328267476, "grad_norm": 3.321911096572876, "learning_rate": 1.4393939393939396e-06, "loss": 0.5833899974822998, "mean_token_accuracy": 0.8207724690437317, "num_tokens": 518608.0, "step": 58 }, { "epoch": 0.044832826747720364, "grad_norm": 3.252626657485962, "learning_rate": 1.4646464646464648e-06, "loss": 0.6266637444496155, "mean_token_accuracy": 0.7806040048599243, "num_tokens": 526028.0, "step": 59 }, { "epoch": 0.04559270516717325, "grad_norm": 2.391918182373047, "learning_rate": 1.48989898989899e-06, "loss": 0.4990342855453491, "mean_token_accuracy": 0.8362030982971191, "num_tokens": 536515.0, "step": 60 }, { "epoch": 0.04635258358662614, "grad_norm": 4.2576985359191895, "learning_rate": 1.5151515151515152e-06, "loss": 0.6036560535430908, "mean_token_accuracy": 0.8048122525215149, "num_tokens": 539997.0, "step": 61 }, { "epoch": 0.04711246200607903, "grad_norm": 3.1281285285949707, "learning_rate": 1.5404040404040404e-06, "loss": 0.5602238178253174, "mean_token_accuracy": 0.8046067357063293, "num_tokens": 546192.0, "step": 62 }, { "epoch": 0.047872340425531915, "grad_norm": 2.5616486072540283, "learning_rate": 1.565656565656566e-06, "loss": 0.563162624835968, "mean_token_accuracy": 0.8025326728820801, "num_tokens": 555153.0, "step": 63 }, { "epoch": 0.0486322188449848, "grad_norm": 2.0530178546905518, "learning_rate": 1.590909090909091e-06, "loss": 0.5288445353507996, "mean_token_accuracy": 0.8235469460487366, "num_tokens": 570107.0, "step": 64 }, { "epoch": 0.04939209726443769, "grad_norm": 6.312800884246826, "learning_rate": 1.6161616161616164e-06, "loss": 0.4401736855506897, "mean_token_accuracy": 0.8390042185783386, "num_tokens": 573715.0, "step": 65 }, { "epoch": 0.05015197568389058, "grad_norm": 2.2594375610351562, "learning_rate": 1.6414141414141415e-06, "loss": 0.6218773126602173, "mean_token_accuracy": 0.782254695892334, "num_tokens": 586651.0, "step": 66 }, { "epoch": 0.050911854103343465, "grad_norm": 2.471200466156006, "learning_rate": 1.6666666666666667e-06, "loss": 0.601629376411438, "mean_token_accuracy": 0.819005012512207, "num_tokens": 599682.0, "step": 67 }, { "epoch": 0.05167173252279635, "grad_norm": 3.020212411880493, "learning_rate": 1.6919191919191922e-06, "loss": 0.47155898809432983, "mean_token_accuracy": 0.8229066729545593, "num_tokens": 604964.0, "step": 68 }, { "epoch": 0.05243161094224924, "grad_norm": 2.80106520652771, "learning_rate": 1.7171717171717173e-06, "loss": 0.6624096632003784, "mean_token_accuracy": 0.7921645641326904, "num_tokens": 614910.0, "step": 69 }, { "epoch": 0.05319148936170213, "grad_norm": 3.308406114578247, "learning_rate": 1.7424242424242427e-06, "loss": 0.5815398693084717, "mean_token_accuracy": 0.8116977214813232, "num_tokens": 619735.0, "step": 70 }, { "epoch": 0.053951367781155016, "grad_norm": 2.4876530170440674, "learning_rate": 1.7676767676767678e-06, "loss": 0.5332903861999512, "mean_token_accuracy": 0.816318154335022, "num_tokens": 627667.0, "step": 71 }, { "epoch": 0.0547112462006079, "grad_norm": 2.404669761657715, "learning_rate": 1.792929292929293e-06, "loss": 0.6185491681098938, "mean_token_accuracy": 0.7874469757080078, "num_tokens": 637711.0, "step": 72 }, { "epoch": 0.05547112462006079, "grad_norm": 2.0979228019714355, "learning_rate": 1.8181818181818183e-06, "loss": 0.4259670376777649, "mean_token_accuracy": 0.8593729734420776, "num_tokens": 646713.0, "step": 73 }, { "epoch": 0.05623100303951368, "grad_norm": 3.3549416065216064, "learning_rate": 1.8434343434343434e-06, "loss": 0.5811284780502319, "mean_token_accuracy": 0.8014984130859375, "num_tokens": 652156.0, "step": 74 }, { "epoch": 0.056990881458966566, "grad_norm": 2.107623338699341, "learning_rate": 1.868686868686869e-06, "loss": 0.41120773553848267, "mean_token_accuracy": 0.8120651841163635, "num_tokens": 660078.0, "step": 75 }, { "epoch": 0.057750759878419454, "grad_norm": 2.8542380332946777, "learning_rate": 1.8939393939393941e-06, "loss": 0.5268816351890564, "mean_token_accuracy": 0.8322579860687256, "num_tokens": 665346.0, "step": 76 }, { "epoch": 0.05851063829787234, "grad_norm": 1.578070878982544, "learning_rate": 1.9191919191919192e-06, "loss": 0.5522144436836243, "mean_token_accuracy": 0.8128998279571533, "num_tokens": 686660.0, "step": 77 }, { "epoch": 0.05927051671732523, "grad_norm": 1.591410756111145, "learning_rate": 1.944444444444445e-06, "loss": 0.5557606220245361, "mean_token_accuracy": 0.8052172660827637, "num_tokens": 707484.0, "step": 78 }, { "epoch": 0.06003039513677812, "grad_norm": 2.8112404346466064, "learning_rate": 1.96969696969697e-06, "loss": 0.5988024473190308, "mean_token_accuracy": 0.8061349987983704, "num_tokens": 714318.0, "step": 79 }, { "epoch": 0.060790273556231005, "grad_norm": 2.6897754669189453, "learning_rate": 1.994949494949495e-06, "loss": 0.5176495313644409, "mean_token_accuracy": 0.8129833936691284, "num_tokens": 720917.0, "step": 80 }, { "epoch": 0.06155015197568389, "grad_norm": 3.3799517154693604, "learning_rate": 2.02020202020202e-06, "loss": 0.43894606828689575, "mean_token_accuracy": 0.8361872434616089, "num_tokens": 733885.0, "step": 81 }, { "epoch": 0.06231003039513678, "grad_norm": 3.728245496749878, "learning_rate": 2.0454545454545457e-06, "loss": 0.613950252532959, "mean_token_accuracy": 0.8073053359985352, "num_tokens": 737662.0, "step": 82 }, { "epoch": 0.06306990881458967, "grad_norm": 1.8278634548187256, "learning_rate": 2.070707070707071e-06, "loss": 0.4874405264854431, "mean_token_accuracy": 0.8348642587661743, "num_tokens": 749980.0, "step": 83 }, { "epoch": 0.06382978723404255, "grad_norm": 1.807234764099121, "learning_rate": 2.095959595959596e-06, "loss": 0.4484546482563019, "mean_token_accuracy": 0.8600111603736877, "num_tokens": 762881.0, "step": 84 }, { "epoch": 0.06458966565349544, "grad_norm": 2.55515456199646, "learning_rate": 2.1212121212121216e-06, "loss": 0.5734638571739197, "mean_token_accuracy": 0.8466952443122864, "num_tokens": 769672.0, "step": 85 }, { "epoch": 0.06534954407294832, "grad_norm": 7.3995842933654785, "learning_rate": 2.1464646464646467e-06, "loss": 0.5208925008773804, "mean_token_accuracy": 0.8223283886909485, "num_tokens": 776196.0, "step": 86 }, { "epoch": 0.06610942249240122, "grad_norm": 2.283025026321411, "learning_rate": 2.171717171717172e-06, "loss": 0.5865270495414734, "mean_token_accuracy": 0.8025071620941162, "num_tokens": 784754.0, "step": 87 }, { "epoch": 0.0668693009118541, "grad_norm": 2.318805456161499, "learning_rate": 2.196969696969697e-06, "loss": 0.613620400428772, "mean_token_accuracy": 0.7853732109069824, "num_tokens": 792804.0, "step": 88 }, { "epoch": 0.067629179331307, "grad_norm": 1.811884880065918, "learning_rate": 2.222222222222222e-06, "loss": 0.5702196955680847, "mean_token_accuracy": 0.8059632778167725, "num_tokens": 809277.0, "step": 89 }, { "epoch": 0.06838905775075987, "grad_norm": 2.4894015789031982, "learning_rate": 2.2474747474747476e-06, "loss": 0.5306090116500854, "mean_token_accuracy": 0.8193257451057434, "num_tokens": 815089.0, "step": 90 }, { "epoch": 0.06914893617021277, "grad_norm": 1.965124487876892, "learning_rate": 2.2727272727272728e-06, "loss": 0.5051044225692749, "mean_token_accuracy": 0.8207112550735474, "num_tokens": 824557.0, "step": 91 }, { "epoch": 0.06990881458966565, "grad_norm": 1.7140872478485107, "learning_rate": 2.2979797979797983e-06, "loss": 0.5126519203186035, "mean_token_accuracy": 0.8226314783096313, "num_tokens": 836722.0, "step": 92 }, { "epoch": 0.07066869300911854, "grad_norm": 1.8673690557479858, "learning_rate": 2.3232323232323234e-06, "loss": 0.4990379512310028, "mean_token_accuracy": 0.8230464458465576, "num_tokens": 847964.0, "step": 93 }, { "epoch": 0.07142857142857142, "grad_norm": 3.2861623764038086, "learning_rate": 2.348484848484849e-06, "loss": 0.48529326915740967, "mean_token_accuracy": 0.8435841202735901, "num_tokens": 851748.0, "step": 94 }, { "epoch": 0.07218844984802432, "grad_norm": 1.7555155754089355, "learning_rate": 2.373737373737374e-06, "loss": 0.4947798252105713, "mean_token_accuracy": 0.8226956129074097, "num_tokens": 862745.0, "step": 95 }, { "epoch": 0.0729483282674772, "grad_norm": 1.8702484369277954, "learning_rate": 2.3989898989898993e-06, "loss": 0.556981086730957, "mean_token_accuracy": 0.8084474802017212, "num_tokens": 875938.0, "step": 96 }, { "epoch": 0.0737082066869301, "grad_norm": 5.505680084228516, "learning_rate": 2.4242424242424244e-06, "loss": 0.7146786451339722, "mean_token_accuracy": 0.7729057669639587, "num_tokens": 879032.0, "step": 97 }, { "epoch": 0.07446808510638298, "grad_norm": 2.6841628551483154, "learning_rate": 2.4494949494949495e-06, "loss": 0.5092146396636963, "mean_token_accuracy": 0.8317261934280396, "num_tokens": 884771.0, "step": 98 }, { "epoch": 0.07522796352583587, "grad_norm": 2.146594524383545, "learning_rate": 2.474747474747475e-06, "loss": 0.46619778871536255, "mean_token_accuracy": 0.829513669013977, "num_tokens": 899646.0, "step": 99 }, { "epoch": 0.07598784194528875, "grad_norm": 2.7602176666259766, "learning_rate": 2.5e-06, "loss": 0.6693596839904785, "mean_token_accuracy": 0.7876315116882324, "num_tokens": 906491.0, "step": 100 }, { "epoch": 0.07674772036474165, "grad_norm": 2.2387099266052246, "learning_rate": 2.5252525252525258e-06, "loss": 0.6272501945495605, "mean_token_accuracy": 0.7843590974807739, "num_tokens": 917335.0, "step": 101 }, { "epoch": 0.07750759878419453, "grad_norm": 1.5419353246688843, "learning_rate": 2.5505050505050505e-06, "loss": 0.4267471730709076, "mean_token_accuracy": 0.8374665975570679, "num_tokens": 933355.0, "step": 102 }, { "epoch": 0.07826747720364742, "grad_norm": 2.0402910709381104, "learning_rate": 2.575757575757576e-06, "loss": 0.6042163968086243, "mean_token_accuracy": 0.7968152761459351, "num_tokens": 945204.0, "step": 103 }, { "epoch": 0.0790273556231003, "grad_norm": 1.4149705171585083, "learning_rate": 2.601010101010101e-06, "loss": 0.3748396933078766, "mean_token_accuracy": 0.847211480140686, "num_tokens": 958612.0, "step": 104 }, { "epoch": 0.0797872340425532, "grad_norm": 2.3728153705596924, "learning_rate": 2.6262626262626267e-06, "loss": 0.5331054925918579, "mean_token_accuracy": 0.8208682537078857, "num_tokens": 965752.0, "step": 105 }, { "epoch": 0.08054711246200608, "grad_norm": 3.549079179763794, "learning_rate": 2.6515151515151514e-06, "loss": 0.42785799503326416, "mean_token_accuracy": 0.8573940992355347, "num_tokens": 969027.0, "step": 106 }, { "epoch": 0.08130699088145897, "grad_norm": 4.202904224395752, "learning_rate": 2.676767676767677e-06, "loss": 0.5676337480545044, "mean_token_accuracy": 0.8095206022262573, "num_tokens": 971972.0, "step": 107 }, { "epoch": 0.08206686930091185, "grad_norm": 2.4428415298461914, "learning_rate": 2.7020202020202025e-06, "loss": 0.6118494272232056, "mean_token_accuracy": 0.7927250862121582, "num_tokens": 980420.0, "step": 108 }, { "epoch": 0.08282674772036475, "grad_norm": 1.3100357055664062, "learning_rate": 2.7272727272727272e-06, "loss": 0.38054990768432617, "mean_token_accuracy": 0.8546916842460632, "num_tokens": 999025.0, "step": 109 }, { "epoch": 0.08358662613981763, "grad_norm": 3.113126039505005, "learning_rate": 2.7525252525252528e-06, "loss": 0.6473406553268433, "mean_token_accuracy": 0.7861988544464111, "num_tokens": 1003949.0, "step": 110 }, { "epoch": 0.08434650455927052, "grad_norm": 1.9397801160812378, "learning_rate": 2.7777777777777783e-06, "loss": 0.5103553533554077, "mean_token_accuracy": 0.8258156180381775, "num_tokens": 1015959.0, "step": 111 }, { "epoch": 0.0851063829787234, "grad_norm": 2.4072656631469727, "learning_rate": 2.803030303030303e-06, "loss": 0.5189981460571289, "mean_token_accuracy": 0.8395966291427612, "num_tokens": 1023545.0, "step": 112 }, { "epoch": 0.0858662613981763, "grad_norm": 2.847546339035034, "learning_rate": 2.8282828282828286e-06, "loss": 0.48659196496009827, "mean_token_accuracy": 0.8372319340705872, "num_tokens": 1028486.0, "step": 113 }, { "epoch": 0.08662613981762918, "grad_norm": 2.4680769443511963, "learning_rate": 2.8535353535353537e-06, "loss": 0.46832722425460815, "mean_token_accuracy": 0.8386073112487793, "num_tokens": 1034730.0, "step": 114 }, { "epoch": 0.08738601823708207, "grad_norm": 2.969630718231201, "learning_rate": 2.8787878787878793e-06, "loss": 0.5375088453292847, "mean_token_accuracy": 0.8141192197799683, "num_tokens": 1041427.0, "step": 115 }, { "epoch": 0.08814589665653495, "grad_norm": 1.9104801416397095, "learning_rate": 2.904040404040404e-06, "loss": 0.4691570997238159, "mean_token_accuracy": 0.8179234266281128, "num_tokens": 1052158.0, "step": 116 }, { "epoch": 0.08890577507598785, "grad_norm": 1.835992455482483, "learning_rate": 2.9292929292929295e-06, "loss": 0.519229531288147, "mean_token_accuracy": 0.823078989982605, "num_tokens": 1064753.0, "step": 117 }, { "epoch": 0.08966565349544073, "grad_norm": 1.4507238864898682, "learning_rate": 2.954545454545455e-06, "loss": 0.44041338562965393, "mean_token_accuracy": 0.8542196750640869, "num_tokens": 1080811.0, "step": 118 }, { "epoch": 0.09042553191489362, "grad_norm": 2.6786766052246094, "learning_rate": 2.97979797979798e-06, "loss": 0.611121654510498, "mean_token_accuracy": 0.7847856283187866, "num_tokens": 1087186.0, "step": 119 }, { "epoch": 0.0911854103343465, "grad_norm": 2.6856000423431396, "learning_rate": 3.0050505050505054e-06, "loss": 0.48068684339523315, "mean_token_accuracy": 0.8335654139518738, "num_tokens": 1092987.0, "step": 120 }, { "epoch": 0.0919452887537994, "grad_norm": 2.734081745147705, "learning_rate": 3.0303030303030305e-06, "loss": 0.5843905806541443, "mean_token_accuracy": 0.8087998628616333, "num_tokens": 1098814.0, "step": 121 }, { "epoch": 0.09270516717325228, "grad_norm": 2.8224918842315674, "learning_rate": 3.055555555555556e-06, "loss": 0.5544137954711914, "mean_token_accuracy": 0.8101569414138794, "num_tokens": 1104380.0, "step": 122 }, { "epoch": 0.09346504559270517, "grad_norm": 2.3097996711730957, "learning_rate": 3.0808080808080807e-06, "loss": 0.5377509593963623, "mean_token_accuracy": 0.8328956961631775, "num_tokens": 1112904.0, "step": 123 }, { "epoch": 0.09422492401215805, "grad_norm": 1.8075451850891113, "learning_rate": 3.1060606060606063e-06, "loss": 0.44285160303115845, "mean_token_accuracy": 0.8430694341659546, "num_tokens": 1122381.0, "step": 124 }, { "epoch": 0.09498480243161095, "grad_norm": 1.7106983661651611, "learning_rate": 3.131313131313132e-06, "loss": 0.5341939926147461, "mean_token_accuracy": 0.833850622177124, "num_tokens": 1135861.0, "step": 125 }, { "epoch": 0.09574468085106383, "grad_norm": 2.5285379886627197, "learning_rate": 3.1565656565656566e-06, "loss": 0.4692184031009674, "mean_token_accuracy": 0.8595664501190186, "num_tokens": 1142032.0, "step": 126 }, { "epoch": 0.09650455927051672, "grad_norm": 1.740753412246704, "learning_rate": 3.181818181818182e-06, "loss": 0.4924545884132385, "mean_token_accuracy": 0.8333353996276855, "num_tokens": 1155508.0, "step": 127 }, { "epoch": 0.0972644376899696, "grad_norm": 3.07169246673584, "learning_rate": 3.2070707070707072e-06, "loss": 0.6479271054267883, "mean_token_accuracy": 0.7937667965888977, "num_tokens": 1160594.0, "step": 128 }, { "epoch": 0.0980243161094225, "grad_norm": 2.6075611114501953, "learning_rate": 3.232323232323233e-06, "loss": 0.5685735940933228, "mean_token_accuracy": 0.8052327632904053, "num_tokens": 1169828.0, "step": 129 }, { "epoch": 0.09878419452887538, "grad_norm": 2.055074453353882, "learning_rate": 3.257575757575758e-06, "loss": 0.401486873626709, "mean_token_accuracy": 0.864121675491333, "num_tokens": 1177760.0, "step": 130 }, { "epoch": 0.09954407294832827, "grad_norm": 1.610193133354187, "learning_rate": 3.282828282828283e-06, "loss": 0.4663908779621124, "mean_token_accuracy": 0.8454406261444092, "num_tokens": 1190196.0, "step": 131 }, { "epoch": 0.10030395136778116, "grad_norm": 3.7052183151245117, "learning_rate": 3.3080808080808086e-06, "loss": 0.5785504579544067, "mean_token_accuracy": 0.7985049486160278, "num_tokens": 1194569.0, "step": 132 }, { "epoch": 0.10106382978723404, "grad_norm": 2.4421536922454834, "learning_rate": 3.3333333333333333e-06, "loss": 0.47366881370544434, "mean_token_accuracy": 0.8285588026046753, "num_tokens": 1200246.0, "step": 133 }, { "epoch": 0.10182370820668693, "grad_norm": 2.562891721725464, "learning_rate": 3.358585858585859e-06, "loss": 0.4463881552219391, "mean_token_accuracy": 0.874564528465271, "num_tokens": 1204990.0, "step": 134 }, { "epoch": 0.10258358662613981, "grad_norm": 2.4984731674194336, "learning_rate": 3.3838383838383844e-06, "loss": 0.36950692534446716, "mean_token_accuracy": 0.8726234436035156, "num_tokens": 1209798.0, "step": 135 }, { "epoch": 0.1033434650455927, "grad_norm": 2.000546455383301, "learning_rate": 3.409090909090909e-06, "loss": 0.5601820945739746, "mean_token_accuracy": 0.8426068425178528, "num_tokens": 1222406.0, "step": 136 }, { "epoch": 0.10410334346504559, "grad_norm": 3.5904040336608887, "learning_rate": 3.4343434343434347e-06, "loss": 0.46701300144195557, "mean_token_accuracy": 0.8508556485176086, "num_tokens": 1225928.0, "step": 137 }, { "epoch": 0.10486322188449848, "grad_norm": 2.627048969268799, "learning_rate": 3.45959595959596e-06, "loss": 0.5123917460441589, "mean_token_accuracy": 0.8308509588241577, "num_tokens": 1232582.0, "step": 138 }, { "epoch": 0.10562310030395136, "grad_norm": 1.6476247310638428, "learning_rate": 3.4848484848484854e-06, "loss": 0.37991654872894287, "mean_token_accuracy": 0.8649545907974243, "num_tokens": 1242846.0, "step": 139 }, { "epoch": 0.10638297872340426, "grad_norm": 1.9283066987991333, "learning_rate": 3.51010101010101e-06, "loss": 0.44996580481529236, "mean_token_accuracy": 0.8297461867332458, "num_tokens": 1251870.0, "step": 140 }, { "epoch": 0.10714285714285714, "grad_norm": 2.539581298828125, "learning_rate": 3.5353535353535356e-06, "loss": 0.5692148804664612, "mean_token_accuracy": 0.8007944226264954, "num_tokens": 1259323.0, "step": 141 }, { "epoch": 0.10790273556231003, "grad_norm": 3.483673572540283, "learning_rate": 3.560606060606061e-06, "loss": 0.4150466322898865, "mean_token_accuracy": 0.8599950671195984, "num_tokens": 1262353.0, "step": 142 }, { "epoch": 0.10866261398176291, "grad_norm": 2.72830867767334, "learning_rate": 3.585858585858586e-06, "loss": 0.5083350539207458, "mean_token_accuracy": 0.8165005445480347, "num_tokens": 1267258.0, "step": 143 }, { "epoch": 0.1094224924012158, "grad_norm": 3.0948173999786377, "learning_rate": 3.6111111111111115e-06, "loss": 0.6016761064529419, "mean_token_accuracy": 0.7953758239746094, "num_tokens": 1274814.0, "step": 144 }, { "epoch": 0.11018237082066869, "grad_norm": 1.8370214700698853, "learning_rate": 3.6363636363636366e-06, "loss": 0.46101880073547363, "mean_token_accuracy": 0.842634916305542, "num_tokens": 1285621.0, "step": 145 }, { "epoch": 0.11094224924012158, "grad_norm": 2.6534411907196045, "learning_rate": 3.661616161616162e-06, "loss": 0.49813517928123474, "mean_token_accuracy": 0.8288999199867249, "num_tokens": 1291286.0, "step": 146 }, { "epoch": 0.11170212765957446, "grad_norm": 2.3179194927215576, "learning_rate": 3.686868686868687e-06, "loss": 0.37569794058799744, "mean_token_accuracy": 0.8637095093727112, "num_tokens": 1296590.0, "step": 147 }, { "epoch": 0.11246200607902736, "grad_norm": 2.950357675552368, "learning_rate": 3.7121212121212124e-06, "loss": 0.36906760931015015, "mean_token_accuracy": 0.8722575306892395, "num_tokens": 1300285.0, "step": 148 }, { "epoch": 0.11322188449848024, "grad_norm": 2.5531680583953857, "learning_rate": 3.737373737373738e-06, "loss": 0.6222835779190063, "mean_token_accuracy": 0.8069770336151123, "num_tokens": 1308128.0, "step": 149 }, { "epoch": 0.11398176291793313, "grad_norm": 1.5450068712234497, "learning_rate": 3.7626262626262627e-06, "loss": 0.5184916257858276, "mean_token_accuracy": 0.8111289143562317, "num_tokens": 1322203.0, "step": 150 }, { "epoch": 0.11474164133738601, "grad_norm": 2.367525815963745, "learning_rate": 3.7878787878787882e-06, "loss": 0.5896013975143433, "mean_token_accuracy": 0.8134846687316895, "num_tokens": 1330319.0, "step": 151 }, { "epoch": 0.11550151975683891, "grad_norm": 2.4617726802825928, "learning_rate": 3.8131313131313138e-06, "loss": 0.48776859045028687, "mean_token_accuracy": 0.8270028829574585, "num_tokens": 1336441.0, "step": 152 }, { "epoch": 0.11626139817629179, "grad_norm": 2.1757259368896484, "learning_rate": 3.8383838383838385e-06, "loss": 0.46739453077316284, "mean_token_accuracy": 0.8506530523300171, "num_tokens": 1344429.0, "step": 153 }, { "epoch": 0.11702127659574468, "grad_norm": 1.836829662322998, "learning_rate": 3.863636363636364e-06, "loss": 0.39413154125213623, "mean_token_accuracy": 0.8306484222412109, "num_tokens": 1353819.0, "step": 154 }, { "epoch": 0.11778115501519756, "grad_norm": 1.9097377061843872, "learning_rate": 3.88888888888889e-06, "loss": 0.5854921340942383, "mean_token_accuracy": 0.808419942855835, "num_tokens": 1367394.0, "step": 155 }, { "epoch": 0.11854103343465046, "grad_norm": 1.3445755243301392, "learning_rate": 3.914141414141415e-06, "loss": 0.424688458442688, "mean_token_accuracy": 0.8546179533004761, "num_tokens": 1385196.0, "step": 156 }, { "epoch": 0.11930091185410334, "grad_norm": 3.0801331996917725, "learning_rate": 3.93939393939394e-06, "loss": 0.5556809306144714, "mean_token_accuracy": 0.839694619178772, "num_tokens": 1389204.0, "step": 157 }, { "epoch": 0.12006079027355623, "grad_norm": 2.406383991241455, "learning_rate": 3.964646464646465e-06, "loss": 0.577480673789978, "mean_token_accuracy": 0.7948997020721436, "num_tokens": 1395989.0, "step": 158 }, { "epoch": 0.12082066869300911, "grad_norm": 2.291191339492798, "learning_rate": 3.98989898989899e-06, "loss": 0.47489291429519653, "mean_token_accuracy": 0.8395655751228333, "num_tokens": 1403502.0, "step": 159 }, { "epoch": 0.12158054711246201, "grad_norm": 2.4482150077819824, "learning_rate": 4.015151515151515e-06, "loss": 0.4630856215953827, "mean_token_accuracy": 0.8551818132400513, "num_tokens": 1409583.0, "step": 160 }, { "epoch": 0.12234042553191489, "grad_norm": 2.2204723358154297, "learning_rate": 4.04040404040404e-06, "loss": 0.5246984958648682, "mean_token_accuracy": 0.8199984431266785, "num_tokens": 1417105.0, "step": 161 }, { "epoch": 0.12310030395136778, "grad_norm": 2.2438621520996094, "learning_rate": 4.065656565656566e-06, "loss": 0.5016493797302246, "mean_token_accuracy": 0.8170363903045654, "num_tokens": 1424524.0, "step": 162 }, { "epoch": 0.12386018237082067, "grad_norm": 1.9608901739120483, "learning_rate": 4.0909090909090915e-06, "loss": 0.4367106556892395, "mean_token_accuracy": 0.866153359413147, "num_tokens": 1433634.0, "step": 163 }, { "epoch": 0.12462006079027356, "grad_norm": 1.6554137468338013, "learning_rate": 4.116161616161617e-06, "loss": 0.5264513492584229, "mean_token_accuracy": 0.8171229958534241, "num_tokens": 1449026.0, "step": 164 }, { "epoch": 0.12537993920972645, "grad_norm": 2.085062265396118, "learning_rate": 4.141414141414142e-06, "loss": 0.5241636037826538, "mean_token_accuracy": 0.8255504965782166, "num_tokens": 1463885.0, "step": 165 }, { "epoch": 0.12613981762917933, "grad_norm": 3.6241228580474854, "learning_rate": 4.166666666666667e-06, "loss": 0.47209346294403076, "mean_token_accuracy": 0.848996102809906, "num_tokens": 1467858.0, "step": 166 }, { "epoch": 0.12689969604863222, "grad_norm": 1.6649004220962524, "learning_rate": 4.191919191919192e-06, "loss": 0.49254295229911804, "mean_token_accuracy": 0.8267063498497009, "num_tokens": 1478671.0, "step": 167 }, { "epoch": 0.1276595744680851, "grad_norm": 1.5069276094436646, "learning_rate": 4.217171717171717e-06, "loss": 0.49602210521698, "mean_token_accuracy": 0.8417288661003113, "num_tokens": 1499120.0, "step": 168 }, { "epoch": 0.128419452887538, "grad_norm": 1.7290006875991821, "learning_rate": 4.242424242424243e-06, "loss": 0.4862136244773865, "mean_token_accuracy": 0.8200680017471313, "num_tokens": 1511400.0, "step": 169 }, { "epoch": 0.12917933130699089, "grad_norm": 2.7758901119232178, "learning_rate": 4.267676767676767e-06, "loss": 0.5446156859397888, "mean_token_accuracy": 0.8082899451255798, "num_tokens": 1518381.0, "step": 170 }, { "epoch": 0.12993920972644377, "grad_norm": 2.673225164413452, "learning_rate": 4.292929292929293e-06, "loss": 0.603697657585144, "mean_token_accuracy": 0.814541757106781, "num_tokens": 1525862.0, "step": 171 }, { "epoch": 0.13069908814589665, "grad_norm": 3.221341133117676, "learning_rate": 4.3181818181818185e-06, "loss": 0.3855879604816437, "mean_token_accuracy": 0.8715401291847229, "num_tokens": 1529082.0, "step": 172 }, { "epoch": 0.13145896656534956, "grad_norm": 2.2676260471343994, "learning_rate": 4.343434343434344e-06, "loss": 0.39173901081085205, "mean_token_accuracy": 0.8528841733932495, "num_tokens": 1535440.0, "step": 173 }, { "epoch": 0.13221884498480244, "grad_norm": 1.9669594764709473, "learning_rate": 4.368686868686869e-06, "loss": 0.4901638627052307, "mean_token_accuracy": 0.8228653073310852, "num_tokens": 1544007.0, "step": 174 }, { "epoch": 0.13297872340425532, "grad_norm": 2.5451693534851074, "learning_rate": 4.393939393939394e-06, "loss": 0.5373168587684631, "mean_token_accuracy": 0.8103193044662476, "num_tokens": 1550958.0, "step": 175 }, { "epoch": 0.1337386018237082, "grad_norm": 1.6717054843902588, "learning_rate": 4.41919191919192e-06, "loss": 0.5187485218048096, "mean_token_accuracy": 0.8374402523040771, "num_tokens": 1565548.0, "step": 176 }, { "epoch": 0.1344984802431611, "grad_norm": 1.7334532737731934, "learning_rate": 4.444444444444444e-06, "loss": 0.49591949582099915, "mean_token_accuracy": 0.8237836360931396, "num_tokens": 1577431.0, "step": 177 }, { "epoch": 0.135258358662614, "grad_norm": 3.037680149078369, "learning_rate": 4.46969696969697e-06, "loss": 0.3759227395057678, "mean_token_accuracy": 0.8660281896591187, "num_tokens": 1580945.0, "step": 178 }, { "epoch": 0.13601823708206687, "grad_norm": 2.263552188873291, "learning_rate": 4.494949494949495e-06, "loss": 0.519679069519043, "mean_token_accuracy": 0.8363398313522339, "num_tokens": 1588403.0, "step": 179 }, { "epoch": 0.13677811550151975, "grad_norm": 2.6347336769104004, "learning_rate": 4.520202020202021e-06, "loss": 0.5427145957946777, "mean_token_accuracy": 0.8300544023513794, "num_tokens": 1594552.0, "step": 180 }, { "epoch": 0.13753799392097266, "grad_norm": 2.181586503982544, "learning_rate": 4.5454545454545455e-06, "loss": 0.5583125352859497, "mean_token_accuracy": 0.8262332081794739, "num_tokens": 1603003.0, "step": 181 }, { "epoch": 0.13829787234042554, "grad_norm": 2.5012893676757812, "learning_rate": 4.5707070707070715e-06, "loss": 0.3074784278869629, "mean_token_accuracy": 0.8823067545890808, "num_tokens": 1607891.0, "step": 182 }, { "epoch": 0.13905775075987842, "grad_norm": 2.5653810501098633, "learning_rate": 4.595959595959597e-06, "loss": 0.5661969184875488, "mean_token_accuracy": 0.812934398651123, "num_tokens": 1614430.0, "step": 183 }, { "epoch": 0.1398176291793313, "grad_norm": 2.126523017883301, "learning_rate": 4.621212121212122e-06, "loss": 0.49272066354751587, "mean_token_accuracy": 0.8277758359909058, "num_tokens": 1621646.0, "step": 184 }, { "epoch": 0.1405775075987842, "grad_norm": 2.557128429412842, "learning_rate": 4.646464646464647e-06, "loss": 0.45818793773651123, "mean_token_accuracy": 0.8401618003845215, "num_tokens": 1627007.0, "step": 185 }, { "epoch": 0.1413373860182371, "grad_norm": 3.263108491897583, "learning_rate": 4.671717171717172e-06, "loss": 0.36023402214050293, "mean_token_accuracy": 0.8685399293899536, "num_tokens": 1629994.0, "step": 186 }, { "epoch": 0.14209726443768997, "grad_norm": 1.8945348262786865, "learning_rate": 4.696969696969698e-06, "loss": 0.5250070095062256, "mean_token_accuracy": 0.8402786254882812, "num_tokens": 1642363.0, "step": 187 }, { "epoch": 0.14285714285714285, "grad_norm": 3.2461509704589844, "learning_rate": 4.722222222222222e-06, "loss": 0.4553906321525574, "mean_token_accuracy": 0.8446127772331238, "num_tokens": 1646341.0, "step": 188 }, { "epoch": 0.14361702127659576, "grad_norm": 1.7286163568496704, "learning_rate": 4.747474747474748e-06, "loss": 0.47110071778297424, "mean_token_accuracy": 0.8438384532928467, "num_tokens": 1657009.0, "step": 189 }, { "epoch": 0.14437689969604864, "grad_norm": 2.5611977577209473, "learning_rate": 4.772727272727273e-06, "loss": 0.40714097023010254, "mean_token_accuracy": 0.8551381230354309, "num_tokens": 1662133.0, "step": 190 }, { "epoch": 0.14513677811550152, "grad_norm": 2.824475049972534, "learning_rate": 4.7979797979797985e-06, "loss": 0.37457284331321716, "mean_token_accuracy": 0.8744102716445923, "num_tokens": 1666071.0, "step": 191 }, { "epoch": 0.1458966565349544, "grad_norm": 2.905911445617676, "learning_rate": 4.823232323232324e-06, "loss": 0.4000989496707916, "mean_token_accuracy": 0.86374431848526, "num_tokens": 1670783.0, "step": 192 }, { "epoch": 0.1466565349544073, "grad_norm": 2.0857510566711426, "learning_rate": 4.848484848484849e-06, "loss": 0.5203642845153809, "mean_token_accuracy": 0.8152116537094116, "num_tokens": 1681211.0, "step": 193 }, { "epoch": 0.1474164133738602, "grad_norm": 2.348444938659668, "learning_rate": 4.873737373737374e-06, "loss": 0.47104156017303467, "mean_token_accuracy": 0.8531544208526611, "num_tokens": 1688386.0, "step": 194 }, { "epoch": 0.14817629179331307, "grad_norm": 2.4826736450195312, "learning_rate": 4.898989898989899e-06, "loss": 0.38890179991722107, "mean_token_accuracy": 0.8573125004768372, "num_tokens": 1693788.0, "step": 195 }, { "epoch": 0.14893617021276595, "grad_norm": 2.54610276222229, "learning_rate": 4.924242424242425e-06, "loss": 0.513835072517395, "mean_token_accuracy": 0.8168267011642456, "num_tokens": 1700459.0, "step": 196 }, { "epoch": 0.14969604863221886, "grad_norm": 2.144178867340088, "learning_rate": 4.94949494949495e-06, "loss": 0.39389657974243164, "mean_token_accuracy": 0.860227108001709, "num_tokens": 1707935.0, "step": 197 }, { "epoch": 0.15045592705167174, "grad_norm": 2.5684738159179688, "learning_rate": 4.974747474747475e-06, "loss": 0.4755311608314514, "mean_token_accuracy": 0.8305763602256775, "num_tokens": 1713174.0, "step": 198 }, { "epoch": 0.15121580547112462, "grad_norm": 1.8828866481781006, "learning_rate": 5e-06, "loss": 0.4542219638824463, "mean_token_accuracy": 0.8394654393196106, "num_tokens": 1721615.0, "step": 199 }, { "epoch": 0.1519756838905775, "grad_norm": 3.436140775680542, "learning_rate": 4.999999122701883e-06, "loss": 0.3606000542640686, "mean_token_accuracy": 0.8709638118743896, "num_tokens": 1724591.0, "step": 200 }, { "epoch": 0.15273556231003038, "grad_norm": 3.222001791000366, "learning_rate": 4.999996490808146e-06, "loss": 0.43145138025283813, "mean_token_accuracy": 0.8614907264709473, "num_tokens": 1728551.0, "step": 201 }, { "epoch": 0.1534954407294833, "grad_norm": 1.425344705581665, "learning_rate": 4.9999921043206356e-06, "loss": 0.39088189601898193, "mean_token_accuracy": 0.8567240238189697, "num_tokens": 1742833.0, "step": 202 }, { "epoch": 0.15425531914893617, "grad_norm": 4.0360260009765625, "learning_rate": 4.999985963242432e-06, "loss": 0.5054274797439575, "mean_token_accuracy": 0.830514669418335, "num_tokens": 1745849.0, "step": 203 }, { "epoch": 0.15501519756838905, "grad_norm": 3.223965644836426, "learning_rate": 4.999978067577844e-06, "loss": 0.40649789571762085, "mean_token_accuracy": 0.8544188141822815, "num_tokens": 1749247.0, "step": 204 }, { "epoch": 0.15577507598784193, "grad_norm": 2.2814829349517822, "learning_rate": 4.999968417332415e-06, "loss": 0.5317715406417847, "mean_token_accuracy": 0.8255351781845093, "num_tokens": 1756221.0, "step": 205 }, { "epoch": 0.15653495440729484, "grad_norm": 2.2380337715148926, "learning_rate": 4.999957012512916e-06, "loss": 0.4584760069847107, "mean_token_accuracy": 0.836658239364624, "num_tokens": 1762609.0, "step": 206 }, { "epoch": 0.15729483282674772, "grad_norm": 1.8422174453735352, "learning_rate": 4.999943853127351e-06, "loss": 0.4300195574760437, "mean_token_accuracy": 0.8459518551826477, "num_tokens": 1771234.0, "step": 207 }, { "epoch": 0.1580547112462006, "grad_norm": 2.113293170928955, "learning_rate": 4.999928939184958e-06, "loss": 0.3882524371147156, "mean_token_accuracy": 0.8600642681121826, "num_tokens": 1778111.0, "step": 208 }, { "epoch": 0.15881458966565348, "grad_norm": 3.6378543376922607, "learning_rate": 4.999912270696202e-06, "loss": 0.5725066661834717, "mean_token_accuracy": 0.8156357407569885, "num_tokens": 1781576.0, "step": 209 }, { "epoch": 0.1595744680851064, "grad_norm": 2.112945079803467, "learning_rate": 4.999893847672783e-06, "loss": 0.5687650442123413, "mean_token_accuracy": 0.80972820520401, "num_tokens": 1790799.0, "step": 210 }, { "epoch": 0.16033434650455927, "grad_norm": 2.2433907985687256, "learning_rate": 4.99987367012763e-06, "loss": 0.599341094493866, "mean_token_accuracy": 0.7983472347259521, "num_tokens": 1800040.0, "step": 211 }, { "epoch": 0.16109422492401215, "grad_norm": 2.1451005935668945, "learning_rate": 4.999851738074904e-06, "loss": 0.6137303709983826, "mean_token_accuracy": 0.7854923605918884, "num_tokens": 1816346.0, "step": 212 }, { "epoch": 0.16185410334346503, "grad_norm": 2.982390880584717, "learning_rate": 4.9998280515300006e-06, "loss": 0.5502551794052124, "mean_token_accuracy": 0.8052335977554321, "num_tokens": 1821568.0, "step": 213 }, { "epoch": 0.16261398176291794, "grad_norm": 3.35490345954895, "learning_rate": 4.999802610509541e-06, "loss": 0.545773983001709, "mean_token_accuracy": 0.8195146322250366, "num_tokens": 1825494.0, "step": 214 }, { "epoch": 0.16337386018237082, "grad_norm": 3.035769462585449, "learning_rate": 4.999775415031381e-06, "loss": 0.5717880129814148, "mean_token_accuracy": 0.8186711668968201, "num_tokens": 1829808.0, "step": 215 }, { "epoch": 0.1641337386018237, "grad_norm": 2.9388792514801025, "learning_rate": 4.999746465114609e-06, "loss": 0.5311149954795837, "mean_token_accuracy": 0.8200923204421997, "num_tokens": 1834365.0, "step": 216 }, { "epoch": 0.16489361702127658, "grad_norm": 1.7520580291748047, "learning_rate": 4.999715760779541e-06, "loss": 0.4936062693595886, "mean_token_accuracy": 0.8098561763763428, "num_tokens": 1846298.0, "step": 217 }, { "epoch": 0.1656534954407295, "grad_norm": 1.4501854181289673, "learning_rate": 4.999683302047729e-06, "loss": 0.4379549026489258, "mean_token_accuracy": 0.8455474376678467, "num_tokens": 1862718.0, "step": 218 }, { "epoch": 0.16641337386018237, "grad_norm": 1.5712978839874268, "learning_rate": 4.999649088941951e-06, "loss": 0.36806052923202515, "mean_token_accuracy": 0.8413015604019165, "num_tokens": 1873162.0, "step": 219 }, { "epoch": 0.16717325227963525, "grad_norm": 3.3667683601379395, "learning_rate": 4.999613121486222e-06, "loss": 0.5865274667739868, "mean_token_accuracy": 0.8232425451278687, "num_tokens": 1877383.0, "step": 220 }, { "epoch": 0.16793313069908813, "grad_norm": 2.027570962905884, "learning_rate": 4.999575399705782e-06, "loss": 0.47757071256637573, "mean_token_accuracy": 0.8435803651809692, "num_tokens": 1885443.0, "step": 221 }, { "epoch": 0.16869300911854104, "grad_norm": 1.9477081298828125, "learning_rate": 4.9995359236271094e-06, "loss": 0.49632883071899414, "mean_token_accuracy": 0.8447240591049194, "num_tokens": 1897117.0, "step": 222 }, { "epoch": 0.16945288753799392, "grad_norm": 2.1391282081604004, "learning_rate": 4.9994946932779076e-06, "loss": 0.6010444164276123, "mean_token_accuracy": 0.8133578300476074, "num_tokens": 1907743.0, "step": 223 }, { "epoch": 0.1702127659574468, "grad_norm": 3.3081393241882324, "learning_rate": 4.999451708687114e-06, "loss": 0.5357567071914673, "mean_token_accuracy": 0.8079873323440552, "num_tokens": 1911565.0, "step": 224 }, { "epoch": 0.17097264437689969, "grad_norm": 2.4006707668304443, "learning_rate": 4.999406969884897e-06, "loss": 0.5457048416137695, "mean_token_accuracy": 0.8097658753395081, "num_tokens": 1918799.0, "step": 225 }, { "epoch": 0.1717325227963526, "grad_norm": 1.8322361707687378, "learning_rate": 4.999360476902656e-06, "loss": 0.4240890145301819, "mean_token_accuracy": 0.8516747951507568, "num_tokens": 1927555.0, "step": 226 }, { "epoch": 0.17249240121580547, "grad_norm": 3.136608600616455, "learning_rate": 4.999312229773022e-06, "loss": 0.4659491181373596, "mean_token_accuracy": 0.838162899017334, "num_tokens": 1931836.0, "step": 227 }, { "epoch": 0.17325227963525835, "grad_norm": 2.2767984867095947, "learning_rate": 4.999262228529855e-06, "loss": 0.563478946685791, "mean_token_accuracy": 0.8091976642608643, "num_tokens": 1939554.0, "step": 228 }, { "epoch": 0.17401215805471124, "grad_norm": 1.4322718381881714, "learning_rate": 4.99921047320825e-06, "loss": 0.4094346761703491, "mean_token_accuracy": 0.8564238548278809, "num_tokens": 1954282.0, "step": 229 }, { "epoch": 0.17477203647416414, "grad_norm": 3.2467753887176514, "learning_rate": 4.99915696384453e-06, "loss": 0.5694391131401062, "mean_token_accuracy": 0.8160352110862732, "num_tokens": 1958627.0, "step": 230 }, { "epoch": 0.17553191489361702, "grad_norm": 1.9246042966842651, "learning_rate": 4.99910170047625e-06, "loss": 0.5620714426040649, "mean_token_accuracy": 0.8041449189186096, "num_tokens": 1969365.0, "step": 231 }, { "epoch": 0.1762917933130699, "grad_norm": 2.924678087234497, "learning_rate": 4.999044683142196e-06, "loss": 0.4853099286556244, "mean_token_accuracy": 0.8256953954696655, "num_tokens": 1973371.0, "step": 232 }, { "epoch": 0.1770516717325228, "grad_norm": 2.098088026046753, "learning_rate": 4.998985911882383e-06, "loss": 0.5587087869644165, "mean_token_accuracy": 0.7966718673706055, "num_tokens": 1983843.0, "step": 233 }, { "epoch": 0.1778115501519757, "grad_norm": 2.514716386795044, "learning_rate": 4.998925386738063e-06, "loss": 0.47832345962524414, "mean_token_accuracy": 0.8339341878890991, "num_tokens": 1989182.0, "step": 234 }, { "epoch": 0.17857142857142858, "grad_norm": 3.023073196411133, "learning_rate": 4.998863107751711e-06, "loss": 0.49698033928871155, "mean_token_accuracy": 0.8559197187423706, "num_tokens": 1993536.0, "step": 235 }, { "epoch": 0.17933130699088146, "grad_norm": 3.1792895793914795, "learning_rate": 4.99879907496704e-06, "loss": 0.5702434778213501, "mean_token_accuracy": 0.8000571131706238, "num_tokens": 1998042.0, "step": 236 }, { "epoch": 0.18009118541033434, "grad_norm": 2.1569433212280273, "learning_rate": 4.998733288428987e-06, "loss": 0.5830904245376587, "mean_token_accuracy": 0.8168346285820007, "num_tokens": 2009376.0, "step": 237 }, { "epoch": 0.18085106382978725, "grad_norm": 2.4051129817962646, "learning_rate": 4.998665748183727e-06, "loss": 0.5650715231895447, "mean_token_accuracy": 0.8147177696228027, "num_tokens": 2017048.0, "step": 238 }, { "epoch": 0.18161094224924013, "grad_norm": 1.5866661071777344, "learning_rate": 4.998596454278661e-06, "loss": 0.5082447528839111, "mean_token_accuracy": 0.8248941898345947, "num_tokens": 2031366.0, "step": 239 }, { "epoch": 0.182370820668693, "grad_norm": 1.99013090133667, "learning_rate": 4.998525406762422e-06, "loss": 0.5089194774627686, "mean_token_accuracy": 0.8197280168533325, "num_tokens": 2040568.0, "step": 240 }, { "epoch": 0.1831306990881459, "grad_norm": 2.6105995178222656, "learning_rate": 4.998452605684874e-06, "loss": 0.4203953146934509, "mean_token_accuracy": 0.8532576560974121, "num_tokens": 2045583.0, "step": 241 }, { "epoch": 0.1838905775075988, "grad_norm": 2.277846097946167, "learning_rate": 4.998378051097111e-06, "loss": 0.5560142397880554, "mean_token_accuracy": 0.8047254085540771, "num_tokens": 2053500.0, "step": 242 }, { "epoch": 0.18465045592705168, "grad_norm": 1.656392216682434, "learning_rate": 4.998301743051459e-06, "loss": 0.602260947227478, "mean_token_accuracy": 0.7868745923042297, "num_tokens": 2069389.0, "step": 243 }, { "epoch": 0.18541033434650456, "grad_norm": 2.1323728561401367, "learning_rate": 4.9982236816014735e-06, "loss": 0.4376535415649414, "mean_token_accuracy": 0.8592725992202759, "num_tokens": 2077138.0, "step": 244 }, { "epoch": 0.18617021276595744, "grad_norm": 2.616633176803589, "learning_rate": 4.998143866801941e-06, "loss": 0.5682218670845032, "mean_token_accuracy": 0.8179980516433716, "num_tokens": 2083970.0, "step": 245 }, { "epoch": 0.18693009118541035, "grad_norm": 2.6488561630249023, "learning_rate": 4.99806229870888e-06, "loss": 0.47162926197052, "mean_token_accuracy": 0.8438947200775146, "num_tokens": 2089457.0, "step": 246 }, { "epoch": 0.18768996960486323, "grad_norm": 2.019186496734619, "learning_rate": 4.9979789773795365e-06, "loss": 0.408061683177948, "mean_token_accuracy": 0.8576331734657288, "num_tokens": 2097206.0, "step": 247 }, { "epoch": 0.1884498480243161, "grad_norm": 2.302354335784912, "learning_rate": 4.997893902872389e-06, "loss": 0.5505059361457825, "mean_token_accuracy": 0.8150410652160645, "num_tokens": 2105214.0, "step": 248 }, { "epoch": 0.189209726443769, "grad_norm": 1.7864950895309448, "learning_rate": 4.997807075247147e-06, "loss": 0.41310858726501465, "mean_token_accuracy": 0.8526202440261841, "num_tokens": 2114182.0, "step": 249 }, { "epoch": 0.1899696048632219, "grad_norm": 1.6269062757492065, "learning_rate": 4.997718494564747e-06, "loss": 0.39904171228408813, "mean_token_accuracy": 0.8582673072814941, "num_tokens": 2124312.0, "step": 250 }, { "epoch": 0.19072948328267478, "grad_norm": 1.565595269203186, "learning_rate": 4.997628160887361e-06, "loss": 0.49161577224731445, "mean_token_accuracy": 0.821890115737915, "num_tokens": 2146690.0, "step": 251 }, { "epoch": 0.19148936170212766, "grad_norm": 3.404179096221924, "learning_rate": 4.997536074278388e-06, "loss": 0.5341525673866272, "mean_token_accuracy": 0.8176189661026001, "num_tokens": 2150460.0, "step": 252 }, { "epoch": 0.19224924012158054, "grad_norm": 2.601632595062256, "learning_rate": 4.9974422348024565e-06, "loss": 0.5186881422996521, "mean_token_accuracy": 0.8306224346160889, "num_tokens": 2158218.0, "step": 253 }, { "epoch": 0.19300911854103345, "grad_norm": 2.3589515686035156, "learning_rate": 4.997346642525429e-06, "loss": 0.45730939507484436, "mean_token_accuracy": 0.8451381921768188, "num_tokens": 2164297.0, "step": 254 }, { "epoch": 0.19376899696048633, "grad_norm": 3.0846846103668213, "learning_rate": 4.9972492975143936e-06, "loss": 0.4467453360557556, "mean_token_accuracy": 0.8379011154174805, "num_tokens": 2169760.0, "step": 255 }, { "epoch": 0.1945288753799392, "grad_norm": 1.7366052865982056, "learning_rate": 4.997150199837671e-06, "loss": 0.4394836723804474, "mean_token_accuracy": 0.8398878574371338, "num_tokens": 2180146.0, "step": 256 }, { "epoch": 0.1952887537993921, "grad_norm": 2.384813070297241, "learning_rate": 4.997049349564814e-06, "loss": 0.47991105914115906, "mean_token_accuracy": 0.8374294638633728, "num_tokens": 2188162.0, "step": 257 }, { "epoch": 0.196048632218845, "grad_norm": 2.590132236480713, "learning_rate": 4.996946746766602e-06, "loss": 0.41144347190856934, "mean_token_accuracy": 0.8605992794036865, "num_tokens": 2193311.0, "step": 258 }, { "epoch": 0.19680851063829788, "grad_norm": 1.7227436304092407, "learning_rate": 4.996842391515045e-06, "loss": 0.5073963403701782, "mean_token_accuracy": 0.8359246253967285, "num_tokens": 2206332.0, "step": 259 }, { "epoch": 0.19756838905775076, "grad_norm": 1.2812135219573975, "learning_rate": 4.996736283883382e-06, "loss": 0.40696483850479126, "mean_token_accuracy": 0.8485742211341858, "num_tokens": 2226343.0, "step": 260 }, { "epoch": 0.19832826747720364, "grad_norm": 2.5326876640319824, "learning_rate": 4.9966284239460875e-06, "loss": 0.46859997510910034, "mean_token_accuracy": 0.8571674823760986, "num_tokens": 2231825.0, "step": 261 }, { "epoch": 0.19908814589665655, "grad_norm": 1.9771672487258911, "learning_rate": 4.996518811778858e-06, "loss": 0.41854721307754517, "mean_token_accuracy": 0.8526982665061951, "num_tokens": 2239051.0, "step": 262 }, { "epoch": 0.19984802431610943, "grad_norm": 2.1683223247528076, "learning_rate": 4.996407447458626e-06, "loss": 0.5161755681037903, "mean_token_accuracy": 0.8490742444992065, "num_tokens": 2247131.0, "step": 263 }, { "epoch": 0.2006079027355623, "grad_norm": 2.5736196041107178, "learning_rate": 4.99629433106355e-06, "loss": 0.4735284745693207, "mean_token_accuracy": 0.8350207209587097, "num_tokens": 2253296.0, "step": 264 }, { "epoch": 0.2013677811550152, "grad_norm": 1.7842241525650024, "learning_rate": 4.99617946267302e-06, "loss": 0.47157543897628784, "mean_token_accuracy": 0.833238959312439, "num_tokens": 2264773.0, "step": 265 }, { "epoch": 0.20212765957446807, "grad_norm": 2.9247069358825684, "learning_rate": 4.996062842367655e-06, "loss": 0.4112093448638916, "mean_token_accuracy": 0.8585200309753418, "num_tokens": 2268575.0, "step": 266 }, { "epoch": 0.20288753799392098, "grad_norm": 2.306239604949951, "learning_rate": 4.9959444702293025e-06, "loss": 0.4145011305809021, "mean_token_accuracy": 0.8556011915206909, "num_tokens": 2274508.0, "step": 267 }, { "epoch": 0.20364741641337386, "grad_norm": 2.7466163635253906, "learning_rate": 4.995824346341041e-06, "loss": 0.37871870398521423, "mean_token_accuracy": 0.8531461358070374, "num_tokens": 2279159.0, "step": 268 }, { "epoch": 0.20440729483282674, "grad_norm": 2.0082874298095703, "learning_rate": 4.99570247078718e-06, "loss": 0.5860568881034851, "mean_token_accuracy": 0.8031082153320312, "num_tokens": 2291049.0, "step": 269 }, { "epoch": 0.20516717325227962, "grad_norm": 2.280381917953491, "learning_rate": 4.995578843653255e-06, "loss": 0.467026948928833, "mean_token_accuracy": 0.8364914059638977, "num_tokens": 2297400.0, "step": 270 }, { "epoch": 0.20592705167173253, "grad_norm": 1.8473336696624756, "learning_rate": 4.995453465026033e-06, "loss": 0.4944884777069092, "mean_token_accuracy": 0.838844895362854, "num_tokens": 2307492.0, "step": 271 }, { "epoch": 0.2066869300911854, "grad_norm": 2.4100840091705322, "learning_rate": 4.995326334993508e-06, "loss": 0.5006181001663208, "mean_token_accuracy": 0.8268105983734131, "num_tokens": 2313273.0, "step": 272 }, { "epoch": 0.2074468085106383, "grad_norm": 2.236138343811035, "learning_rate": 4.9951974536449055e-06, "loss": 0.4911901354789734, "mean_token_accuracy": 0.8307386040687561, "num_tokens": 2320363.0, "step": 273 }, { "epoch": 0.20820668693009117, "grad_norm": 3.372265577316284, "learning_rate": 4.9950668210706795e-06, "loss": 0.37246087193489075, "mean_token_accuracy": 0.8783556818962097, "num_tokens": 2323307.0, "step": 274 }, { "epoch": 0.20896656534954408, "grad_norm": 2.1185147762298584, "learning_rate": 4.994934437362513e-06, "loss": 0.5806586146354675, "mean_token_accuracy": 0.8004832863807678, "num_tokens": 2333048.0, "step": 275 }, { "epoch": 0.20972644376899696, "grad_norm": 1.9632351398468018, "learning_rate": 4.994800302613318e-06, "loss": 0.4413146376609802, "mean_token_accuracy": 0.8510923981666565, "num_tokens": 2340860.0, "step": 276 }, { "epoch": 0.21048632218844984, "grad_norm": 2.279758930206299, "learning_rate": 4.994664416917236e-06, "loss": 0.5088690519332886, "mean_token_accuracy": 0.8209558725357056, "num_tokens": 2348974.0, "step": 277 }, { "epoch": 0.21124620060790272, "grad_norm": 1.6696852445602417, "learning_rate": 4.994526780369636e-06, "loss": 0.4469617009162903, "mean_token_accuracy": 0.8400031328201294, "num_tokens": 2370327.0, "step": 278 }, { "epoch": 0.21200607902735563, "grad_norm": 2.841850757598877, "learning_rate": 4.9943873930671175e-06, "loss": 0.5606095790863037, "mean_token_accuracy": 0.807634711265564, "num_tokens": 2375131.0, "step": 279 }, { "epoch": 0.2127659574468085, "grad_norm": 2.82785701751709, "learning_rate": 4.994246255107506e-06, "loss": 0.410330593585968, "mean_token_accuracy": 0.8566482067108154, "num_tokens": 2378971.0, "step": 280 }, { "epoch": 0.2135258358662614, "grad_norm": 2.8596596717834473, "learning_rate": 4.994103366589859e-06, "loss": 0.3787328600883484, "mean_token_accuracy": 0.874786376953125, "num_tokens": 2382513.0, "step": 281 }, { "epoch": 0.21428571428571427, "grad_norm": 1.7845677137374878, "learning_rate": 4.993958727614462e-06, "loss": 0.4610300660133362, "mean_token_accuracy": 0.8375419974327087, "num_tokens": 2393147.0, "step": 282 }, { "epoch": 0.21504559270516718, "grad_norm": 2.3245997428894043, "learning_rate": 4.993812338282826e-06, "loss": 0.4142797589302063, "mean_token_accuracy": 0.8542178869247437, "num_tokens": 2398916.0, "step": 283 }, { "epoch": 0.21580547112462006, "grad_norm": 1.692062258720398, "learning_rate": 4.993664198697694e-06, "loss": 0.44197604060173035, "mean_token_accuracy": 0.8350081443786621, "num_tokens": 2411836.0, "step": 284 }, { "epoch": 0.21656534954407294, "grad_norm": 2.157754898071289, "learning_rate": 4.993514308963037e-06, "loss": 0.5904836058616638, "mean_token_accuracy": 0.7999725341796875, "num_tokens": 2420611.0, "step": 285 }, { "epoch": 0.21732522796352582, "grad_norm": 3.618769884109497, "learning_rate": 4.993362669184051e-06, "loss": 0.5695822238922119, "mean_token_accuracy": 0.7965190410614014, "num_tokens": 2424041.0, "step": 286 }, { "epoch": 0.21808510638297873, "grad_norm": 2.031795024871826, "learning_rate": 4.993209279467164e-06, "loss": 0.5142983198165894, "mean_token_accuracy": 0.8019678592681885, "num_tokens": 2434620.0, "step": 287 }, { "epoch": 0.2188449848024316, "grad_norm": 1.7205814123153687, "learning_rate": 4.993054139920031e-06, "loss": 0.4392193853855133, "mean_token_accuracy": 0.8336896896362305, "num_tokens": 2444965.0, "step": 288 }, { "epoch": 0.2196048632218845, "grad_norm": 1.7577887773513794, "learning_rate": 4.992897250651535e-06, "loss": 0.5540003180503845, "mean_token_accuracy": 0.7990663051605225, "num_tokens": 2457544.0, "step": 289 }, { "epoch": 0.22036474164133737, "grad_norm": 1.802450180053711, "learning_rate": 4.992738611771787e-06, "loss": 0.5318427085876465, "mean_token_accuracy": 0.8389760851860046, "num_tokens": 2467875.0, "step": 290 }, { "epoch": 0.22112462006079028, "grad_norm": 2.1131720542907715, "learning_rate": 4.992578223392124e-06, "loss": 0.557640790939331, "mean_token_accuracy": 0.8158830404281616, "num_tokens": 2475554.0, "step": 291 }, { "epoch": 0.22188449848024316, "grad_norm": 3.0931613445281982, "learning_rate": 4.992416085625115e-06, "loss": 0.4821542501449585, "mean_token_accuracy": 0.8444563746452332, "num_tokens": 2479755.0, "step": 292 }, { "epoch": 0.22264437689969604, "grad_norm": 2.84855055809021, "learning_rate": 4.992252198584554e-06, "loss": 0.48346447944641113, "mean_token_accuracy": 0.8550156354904175, "num_tokens": 2483838.0, "step": 293 }, { "epoch": 0.22340425531914893, "grad_norm": 1.8370510339736938, "learning_rate": 4.992086562385462e-06, "loss": 0.5358907580375671, "mean_token_accuracy": 0.8083988428115845, "num_tokens": 2497336.0, "step": 294 }, { "epoch": 0.22416413373860183, "grad_norm": 1.771071195602417, "learning_rate": 4.9919191771440905e-06, "loss": 0.5313449501991272, "mean_token_accuracy": 0.8171120882034302, "num_tokens": 2513012.0, "step": 295 }, { "epoch": 0.22492401215805471, "grad_norm": 2.845015048980713, "learning_rate": 4.9917500429779165e-06, "loss": 0.5177006125450134, "mean_token_accuracy": 0.8264347314834595, "num_tokens": 2517827.0, "step": 296 }, { "epoch": 0.2256838905775076, "grad_norm": 2.6123650074005127, "learning_rate": 4.991579160005644e-06, "loss": 0.4545767605304718, "mean_token_accuracy": 0.8568557500839233, "num_tokens": 2522988.0, "step": 297 }, { "epoch": 0.22644376899696048, "grad_norm": 1.7086553573608398, "learning_rate": 4.991406528347206e-06, "loss": 0.44247299432754517, "mean_token_accuracy": 0.8628532886505127, "num_tokens": 2534893.0, "step": 298 }, { "epoch": 0.22720364741641338, "grad_norm": 2.7990589141845703, "learning_rate": 4.9912321481237616e-06, "loss": 0.5641108751296997, "mean_token_accuracy": 0.8045810461044312, "num_tokens": 2541195.0, "step": 299 }, { "epoch": 0.22796352583586627, "grad_norm": 3.0234270095825195, "learning_rate": 4.991056019457697e-06, "loss": 0.4119265675544739, "mean_token_accuracy": 0.864372730255127, "num_tokens": 2545007.0, "step": 300 }, { "epoch": 0.22872340425531915, "grad_norm": 2.3037335872650146, "learning_rate": 4.990878142472628e-06, "loss": 0.4908141493797302, "mean_token_accuracy": 0.8342020511627197, "num_tokens": 2552092.0, "step": 301 }, { "epoch": 0.22948328267477203, "grad_norm": 1.9956134557724, "learning_rate": 4.990698517293394e-06, "loss": 0.45161187648773193, "mean_token_accuracy": 0.844601035118103, "num_tokens": 2560369.0, "step": 302 }, { "epoch": 0.23024316109422494, "grad_norm": 3.525520086288452, "learning_rate": 4.9905171440460645e-06, "loss": 0.43304887413978577, "mean_token_accuracy": 0.8541014790534973, "num_tokens": 2563256.0, "step": 303 }, { "epoch": 0.23100303951367782, "grad_norm": 4.261448383331299, "learning_rate": 4.990334022857932e-06, "loss": 0.5139227509498596, "mean_token_accuracy": 0.8354399800300598, "num_tokens": 2565879.0, "step": 304 }, { "epoch": 0.2317629179331307, "grad_norm": 2.440788507461548, "learning_rate": 4.990149153857519e-06, "loss": 0.4294750690460205, "mean_token_accuracy": 0.8510936498641968, "num_tokens": 2572210.0, "step": 305 }, { "epoch": 0.23252279635258358, "grad_norm": 1.7491639852523804, "learning_rate": 4.989962537174573e-06, "loss": 0.47826993465423584, "mean_token_accuracy": 0.8374208807945251, "num_tokens": 2584135.0, "step": 306 }, { "epoch": 0.23328267477203649, "grad_norm": 3.845266342163086, "learning_rate": 4.989774172940071e-06, "loss": 0.5992549657821655, "mean_token_accuracy": 0.7800809144973755, "num_tokens": 2587386.0, "step": 307 }, { "epoch": 0.23404255319148937, "grad_norm": 2.2111220359802246, "learning_rate": 4.989584061286211e-06, "loss": 0.4815753996372223, "mean_token_accuracy": 0.8330761194229126, "num_tokens": 2593992.0, "step": 308 }, { "epoch": 0.23480243161094225, "grad_norm": 1.858041524887085, "learning_rate": 4.989392202346423e-06, "loss": 0.4118471145629883, "mean_token_accuracy": 0.8520488739013672, "num_tokens": 2604174.0, "step": 309 }, { "epoch": 0.23556231003039513, "grad_norm": 2.405632734298706, "learning_rate": 4.989198596255361e-06, "loss": 0.38085171580314636, "mean_token_accuracy": 0.854683518409729, "num_tokens": 2609186.0, "step": 310 }, { "epoch": 0.23632218844984804, "grad_norm": 3.8427865505218506, "learning_rate": 4.989003243148904e-06, "loss": 0.4445143938064575, "mean_token_accuracy": 0.8356641530990601, "num_tokens": 2611913.0, "step": 311 }, { "epoch": 0.23708206686930092, "grad_norm": 2.3617193698883057, "learning_rate": 4.988806143164159e-06, "loss": 0.4281064569950104, "mean_token_accuracy": 0.8460803031921387, "num_tokens": 2621405.0, "step": 312 }, { "epoch": 0.2378419452887538, "grad_norm": 2.439340353012085, "learning_rate": 4.988607296439459e-06, "loss": 0.5275991559028625, "mean_token_accuracy": 0.8325222730636597, "num_tokens": 2629060.0, "step": 313 }, { "epoch": 0.23860182370820668, "grad_norm": 1.5777690410614014, "learning_rate": 4.98840670311436e-06, "loss": 0.4817584455013275, "mean_token_accuracy": 0.8332220911979675, "num_tokens": 2642411.0, "step": 314 }, { "epoch": 0.2393617021276596, "grad_norm": 2.179872989654541, "learning_rate": 4.988204363329648e-06, "loss": 0.6049296855926514, "mean_token_accuracy": 0.7860822677612305, "num_tokens": 2652538.0, "step": 315 }, { "epoch": 0.24012158054711247, "grad_norm": 3.253547430038452, "learning_rate": 4.988000277227334e-06, "loss": 0.46995067596435547, "mean_token_accuracy": 0.8340871930122375, "num_tokens": 2655887.0, "step": 316 }, { "epoch": 0.24088145896656535, "grad_norm": 3.441596508026123, "learning_rate": 4.987794444950651e-06, "loss": 0.3157607316970825, "mean_token_accuracy": 0.8920344114303589, "num_tokens": 2658686.0, "step": 317 }, { "epoch": 0.24164133738601823, "grad_norm": 1.8112664222717285, "learning_rate": 4.987586866644061e-06, "loss": 0.502338171005249, "mean_token_accuracy": 0.8305802345275879, "num_tokens": 2669691.0, "step": 318 }, { "epoch": 0.24240121580547114, "grad_norm": 1.8285833597183228, "learning_rate": 4.9873775424532515e-06, "loss": 0.4544355869293213, "mean_token_accuracy": 0.8415238261222839, "num_tokens": 2678956.0, "step": 319 }, { "epoch": 0.24316109422492402, "grad_norm": 2.1771512031555176, "learning_rate": 4.9871664725251314e-06, "loss": 0.454698383808136, "mean_token_accuracy": 0.8483167886734009, "num_tokens": 2686411.0, "step": 320 }, { "epoch": 0.2439209726443769, "grad_norm": 1.6493504047393799, "learning_rate": 4.986953657007841e-06, "loss": 0.42326417565345764, "mean_token_accuracy": 0.8452361822128296, "num_tokens": 2698617.0, "step": 321 }, { "epoch": 0.24468085106382978, "grad_norm": 1.1489403247833252, "learning_rate": 4.98673909605074e-06, "loss": 0.36659368872642517, "mean_token_accuracy": 0.8352444171905518, "num_tokens": 2717933.0, "step": 322 }, { "epoch": 0.2454407294832827, "grad_norm": 2.295814275741577, "learning_rate": 4.986522789804417e-06, "loss": 0.5098875164985657, "mean_token_accuracy": 0.8190979957580566, "num_tokens": 2723970.0, "step": 323 }, { "epoch": 0.24620060790273557, "grad_norm": 2.3241398334503174, "learning_rate": 4.986304738420684e-06, "loss": 0.42751336097717285, "mean_token_accuracy": 0.8547185659408569, "num_tokens": 2729390.0, "step": 324 }, { "epoch": 0.24696048632218845, "grad_norm": 2.8512768745422363, "learning_rate": 4.986084942052577e-06, "loss": 0.33234283328056335, "mean_token_accuracy": 0.8766615986824036, "num_tokens": 2733610.0, "step": 325 }, { "epoch": 0.24772036474164133, "grad_norm": 2.4620141983032227, "learning_rate": 4.9858634008543574e-06, "loss": 0.5473405122756958, "mean_token_accuracy": 0.8383826017379761, "num_tokens": 2740063.0, "step": 326 }, { "epoch": 0.24848024316109424, "grad_norm": 1.984655737876892, "learning_rate": 4.985640114981513e-06, "loss": 0.4946171045303345, "mean_token_accuracy": 0.839870274066925, "num_tokens": 2750741.0, "step": 327 }, { "epoch": 0.24924012158054712, "grad_norm": 2.5328571796417236, "learning_rate": 4.985415084590752e-06, "loss": 0.5796651840209961, "mean_token_accuracy": 0.7959966659545898, "num_tokens": 2756165.0, "step": 328 }, { "epoch": 0.25, "grad_norm": 2.400641441345215, "learning_rate": 4.985188309840012e-06, "loss": 0.48668015003204346, "mean_token_accuracy": 0.8382711410522461, "num_tokens": 2761417.0, "step": 329 }, { "epoch": 0.2507598784194529, "grad_norm": 2.7159430980682373, "learning_rate": 4.984959790888451e-06, "loss": 0.5163013935089111, "mean_token_accuracy": 0.8203201293945312, "num_tokens": 2766549.0, "step": 330 }, { "epoch": 0.25151975683890576, "grad_norm": 2.537278652191162, "learning_rate": 4.984729527896451e-06, "loss": 0.5671283602714539, "mean_token_accuracy": 0.8172976970672607, "num_tokens": 2772817.0, "step": 331 }, { "epoch": 0.25227963525835867, "grad_norm": 3.192000150680542, "learning_rate": 4.984497521025622e-06, "loss": 0.37886548042297363, "mean_token_accuracy": 0.8669577836990356, "num_tokens": 2775795.0, "step": 332 }, { "epoch": 0.2530395136778115, "grad_norm": 2.5922412872314453, "learning_rate": 4.984263770438793e-06, "loss": 0.4395058751106262, "mean_token_accuracy": 0.847874104976654, "num_tokens": 2781033.0, "step": 333 }, { "epoch": 0.25379939209726443, "grad_norm": 1.9635506868362427, "learning_rate": 4.984028276300021e-06, "loss": 0.4364502429962158, "mean_token_accuracy": 0.8494650721549988, "num_tokens": 2787582.0, "step": 334 }, { "epoch": 0.25455927051671734, "grad_norm": 2.2944114208221436, "learning_rate": 4.983791038774585e-06, "loss": 0.4683613181114197, "mean_token_accuracy": 0.8264153599739075, "num_tokens": 2794158.0, "step": 335 }, { "epoch": 0.2553191489361702, "grad_norm": 1.778643012046814, "learning_rate": 4.983552058028985e-06, "loss": 0.46646493673324585, "mean_token_accuracy": 0.8360317945480347, "num_tokens": 2808408.0, "step": 336 }, { "epoch": 0.2560790273556231, "grad_norm": 3.0198330879211426, "learning_rate": 4.9833113342309495e-06, "loss": 0.5558529496192932, "mean_token_accuracy": 0.8131203651428223, "num_tokens": 2813796.0, "step": 337 }, { "epoch": 0.256838905775076, "grad_norm": 2.508333683013916, "learning_rate": 4.983068867549427e-06, "loss": 0.4780687093734741, "mean_token_accuracy": 0.8369977474212646, "num_tokens": 2818909.0, "step": 338 }, { "epoch": 0.25759878419452886, "grad_norm": 2.1583943367004395, "learning_rate": 4.982824658154589e-06, "loss": 0.6210355162620544, "mean_token_accuracy": 0.786374568939209, "num_tokens": 2831458.0, "step": 339 }, { "epoch": 0.25835866261398177, "grad_norm": 2.619635581970215, "learning_rate": 4.9825787062178315e-06, "loss": 0.5546093583106995, "mean_token_accuracy": 0.8163464069366455, "num_tokens": 2843726.0, "step": 340 }, { "epoch": 0.2591185410334346, "grad_norm": 1.9398376941680908, "learning_rate": 4.982331011911774e-06, "loss": 0.409252405166626, "mean_token_accuracy": 0.8436061143875122, "num_tokens": 2864157.0, "step": 341 }, { "epoch": 0.25987841945288753, "grad_norm": 2.2148611545562744, "learning_rate": 4.982081575410256e-06, "loss": 0.44126778841018677, "mean_token_accuracy": 0.8487811088562012, "num_tokens": 2870701.0, "step": 342 }, { "epoch": 0.26063829787234044, "grad_norm": 3.5705149173736572, "learning_rate": 4.9818303968883445e-06, "loss": 0.7215286493301392, "mean_token_accuracy": 0.7655127048492432, "num_tokens": 2874745.0, "step": 343 }, { "epoch": 0.2613981762917933, "grad_norm": 1.8558040857315063, "learning_rate": 4.981577476522323e-06, "loss": 0.5530655384063721, "mean_token_accuracy": 0.8249034881591797, "num_tokens": 2887052.0, "step": 344 }, { "epoch": 0.2621580547112462, "grad_norm": 2.4575531482696533, "learning_rate": 4.981322814489703e-06, "loss": 0.49899396300315857, "mean_token_accuracy": 0.828569233417511, "num_tokens": 2892473.0, "step": 345 }, { "epoch": 0.2629179331306991, "grad_norm": 1.9310275316238403, "learning_rate": 4.981066410969215e-06, "loss": 0.47420281171798706, "mean_token_accuracy": 0.8402732610702515, "num_tokens": 2900453.0, "step": 346 }, { "epoch": 0.26367781155015196, "grad_norm": 2.21812105178833, "learning_rate": 4.980808266140813e-06, "loss": 0.4555210769176483, "mean_token_accuracy": 0.8437404632568359, "num_tokens": 2906741.0, "step": 347 }, { "epoch": 0.26443768996960487, "grad_norm": 2.7364001274108887, "learning_rate": 4.9805483801856744e-06, "loss": 0.5119813680648804, "mean_token_accuracy": 0.8283645510673523, "num_tokens": 2911988.0, "step": 348 }, { "epoch": 0.2651975683890577, "grad_norm": 3.0648796558380127, "learning_rate": 4.980286753286196e-06, "loss": 0.3780750334262848, "mean_token_accuracy": 0.8826199769973755, "num_tokens": 2915193.0, "step": 349 }, { "epoch": 0.26595744680851063, "grad_norm": 1.5473605394363403, "learning_rate": 4.980023385625996e-06, "loss": 0.38594651222229004, "mean_token_accuracy": 0.8553473949432373, "num_tokens": 2929454.0, "step": 350 }, { "epoch": 0.26671732522796354, "grad_norm": 2.988367795944214, "learning_rate": 4.979758277389919e-06, "loss": 0.4946131706237793, "mean_token_accuracy": 0.8174018859863281, "num_tokens": 2934240.0, "step": 351 }, { "epoch": 0.2674772036474164, "grad_norm": 2.039886236190796, "learning_rate": 4.9794914287640264e-06, "loss": 0.5628063678741455, "mean_token_accuracy": 0.8034824132919312, "num_tokens": 2945842.0, "step": 352 }, { "epoch": 0.2682370820668693, "grad_norm": 2.3829421997070312, "learning_rate": 4.979222839935602e-06, "loss": 0.6037441492080688, "mean_token_accuracy": 0.7891867160797119, "num_tokens": 2953898.0, "step": 353 }, { "epoch": 0.2689969604863222, "grad_norm": 1.959708571434021, "learning_rate": 4.9789525110931545e-06, "loss": 0.5061007738113403, "mean_token_accuracy": 0.8236986994743347, "num_tokens": 2962281.0, "step": 354 }, { "epoch": 0.26975683890577506, "grad_norm": 2.61007022857666, "learning_rate": 4.978680442426409e-06, "loss": 0.6019249558448792, "mean_token_accuracy": 0.7874947190284729, "num_tokens": 2969603.0, "step": 355 }, { "epoch": 0.270516717325228, "grad_norm": 1.9074316024780273, "learning_rate": 4.978406634126315e-06, "loss": 0.48542970418930054, "mean_token_accuracy": 0.8402796983718872, "num_tokens": 2979413.0, "step": 356 }, { "epoch": 0.2712765957446808, "grad_norm": 1.517846941947937, "learning_rate": 4.978131086385041e-06, "loss": 0.4455636143684387, "mean_token_accuracy": 0.838932991027832, "num_tokens": 2992495.0, "step": 357 }, { "epoch": 0.27203647416413373, "grad_norm": 2.1448144912719727, "learning_rate": 4.977853799395976e-06, "loss": 0.4592750072479248, "mean_token_accuracy": 0.8330867290496826, "num_tokens": 3000007.0, "step": 358 }, { "epoch": 0.27279635258358664, "grad_norm": 3.2687668800354004, "learning_rate": 4.977574773353732e-06, "loss": 0.511050820350647, "mean_token_accuracy": 0.8200520277023315, "num_tokens": 3003758.0, "step": 359 }, { "epoch": 0.2735562310030395, "grad_norm": 2.8292200565338135, "learning_rate": 4.97729400845414e-06, "loss": 0.41245830059051514, "mean_token_accuracy": 0.8344168663024902, "num_tokens": 3007678.0, "step": 360 }, { "epoch": 0.2743161094224924, "grad_norm": 1.8785876035690308, "learning_rate": 4.977011504894253e-06, "loss": 0.46520695090293884, "mean_token_accuracy": 0.8308258652687073, "num_tokens": 3015743.0, "step": 361 }, { "epoch": 0.2750759878419453, "grad_norm": 1.6335071325302124, "learning_rate": 4.97672726287234e-06, "loss": 0.4188292324542999, "mean_token_accuracy": 0.865784764289856, "num_tokens": 3026661.0, "step": 362 }, { "epoch": 0.27583586626139817, "grad_norm": 3.381009101867676, "learning_rate": 4.976441282587894e-06, "loss": 0.5134732723236084, "mean_token_accuracy": 0.8187953233718872, "num_tokens": 3030422.0, "step": 363 }, { "epoch": 0.2765957446808511, "grad_norm": 1.3476839065551758, "learning_rate": 4.9761535642416284e-06, "loss": 0.4297364056110382, "mean_token_accuracy": 0.8355786800384521, "num_tokens": 3047763.0, "step": 364 }, { "epoch": 0.2773556231003039, "grad_norm": 2.3485515117645264, "learning_rate": 4.9758641080354745e-06, "loss": 0.4887448251247406, "mean_token_accuracy": 0.849709689617157, "num_tokens": 3053751.0, "step": 365 }, { "epoch": 0.27811550151975684, "grad_norm": 2.869009256362915, "learning_rate": 4.975572914172581e-06, "loss": 0.5443825721740723, "mean_token_accuracy": 0.8108746409416199, "num_tokens": 3058079.0, "step": 366 }, { "epoch": 0.27887537993920974, "grad_norm": 2.337939977645874, "learning_rate": 4.975279982857324e-06, "loss": 0.5280558466911316, "mean_token_accuracy": 0.8160051703453064, "num_tokens": 3065547.0, "step": 367 }, { "epoch": 0.2796352583586626, "grad_norm": 1.421703577041626, "learning_rate": 4.97498531429529e-06, "loss": 0.39594563841819763, "mean_token_accuracy": 0.8643261194229126, "num_tokens": 3078113.0, "step": 368 }, { "epoch": 0.2803951367781155, "grad_norm": 2.1441762447357178, "learning_rate": 4.97468890869329e-06, "loss": 0.4584254324436188, "mean_token_accuracy": 0.8336924314498901, "num_tokens": 3085266.0, "step": 369 }, { "epoch": 0.2811550151975684, "grad_norm": 1.343610405921936, "learning_rate": 4.974390766259353e-06, "loss": 0.4358016848564148, "mean_token_accuracy": 0.8309500217437744, "num_tokens": 3100672.0, "step": 370 }, { "epoch": 0.28191489361702127, "grad_norm": 2.636687994003296, "learning_rate": 4.974090887202726e-06, "loss": 0.5070590376853943, "mean_token_accuracy": 0.8223599791526794, "num_tokens": 3106740.0, "step": 371 }, { "epoch": 0.2826747720364742, "grad_norm": 2.0654895305633545, "learning_rate": 4.973789271733877e-06, "loss": 0.6026565432548523, "mean_token_accuracy": 0.7863624095916748, "num_tokens": 3117948.0, "step": 372 }, { "epoch": 0.28343465045592703, "grad_norm": 4.953190326690674, "learning_rate": 4.973485920064491e-06, "loss": 0.5796823501586914, "mean_token_accuracy": 0.8124111890792847, "num_tokens": 3120338.0, "step": 373 }, { "epoch": 0.28419452887537994, "grad_norm": 1.2652311325073242, "learning_rate": 4.973180832407471e-06, "loss": 0.3768846392631531, "mean_token_accuracy": 0.8423436880111694, "num_tokens": 3135718.0, "step": 374 }, { "epoch": 0.28495440729483285, "grad_norm": 2.609652280807495, "learning_rate": 4.97287400897694e-06, "loss": 0.5162647366523743, "mean_token_accuracy": 0.8220236897468567, "num_tokens": 3141392.0, "step": 375 }, { "epoch": 0.2857142857142857, "grad_norm": 3.0013082027435303, "learning_rate": 4.972565449988238e-06, "loss": 0.3151981830596924, "mean_token_accuracy": 0.8900531530380249, "num_tokens": 3144656.0, "step": 376 }, { "epoch": 0.2864741641337386, "grad_norm": 1.9808810949325562, "learning_rate": 4.972255155657925e-06, "loss": 0.4985666275024414, "mean_token_accuracy": 0.8261170387268066, "num_tokens": 3152536.0, "step": 377 }, { "epoch": 0.2872340425531915, "grad_norm": 7.131393909454346, "learning_rate": 4.9719431262037755e-06, "loss": 0.5039874911308289, "mean_token_accuracy": 0.8170304298400879, "num_tokens": 3157134.0, "step": 378 }, { "epoch": 0.28799392097264437, "grad_norm": 1.4525601863861084, "learning_rate": 4.971629361844785e-06, "loss": 0.3949277400970459, "mean_token_accuracy": 0.8594280481338501, "num_tokens": 3171764.0, "step": 379 }, { "epoch": 0.2887537993920973, "grad_norm": 1.983331322669983, "learning_rate": 4.971313862801166e-06, "loss": 0.414550244808197, "mean_token_accuracy": 0.8540629744529724, "num_tokens": 3179444.0, "step": 380 }, { "epoch": 0.28951367781155013, "grad_norm": 1.9733079671859741, "learning_rate": 4.9709966292943455e-06, "loss": 0.4406163692474365, "mean_token_accuracy": 0.8342312574386597, "num_tokens": 3186957.0, "step": 381 }, { "epoch": 0.29027355623100304, "grad_norm": 1.652886152267456, "learning_rate": 4.970677661546972e-06, "loss": 0.5227410197257996, "mean_token_accuracy": 0.8188983201980591, "num_tokens": 3201482.0, "step": 382 }, { "epoch": 0.29103343465045595, "grad_norm": 3.3413736820220947, "learning_rate": 4.970356959782909e-06, "loss": 0.5933316946029663, "mean_token_accuracy": 0.7997614145278931, "num_tokens": 3206201.0, "step": 383 }, { "epoch": 0.2917933130699088, "grad_norm": 1.6980115175247192, "learning_rate": 4.970034524227239e-06, "loss": 0.35611239075660706, "mean_token_accuracy": 0.871821939945221, "num_tokens": 3214696.0, "step": 384 }, { "epoch": 0.2925531914893617, "grad_norm": 1.4026317596435547, "learning_rate": 4.969710355106256e-06, "loss": 0.42110762000083923, "mean_token_accuracy": 0.843463659286499, "num_tokens": 3227516.0, "step": 385 }, { "epoch": 0.2933130699088146, "grad_norm": 2.508169651031494, "learning_rate": 4.969384452647477e-06, "loss": 0.4792252779006958, "mean_token_accuracy": 0.8330808877944946, "num_tokens": 3233895.0, "step": 386 }, { "epoch": 0.29407294832826747, "grad_norm": 1.7341818809509277, "learning_rate": 4.969056817079633e-06, "loss": 0.4872246980667114, "mean_token_accuracy": 0.8220229148864746, "num_tokens": 3244349.0, "step": 387 }, { "epoch": 0.2948328267477204, "grad_norm": 2.6779842376708984, "learning_rate": 4.968727448632669e-06, "loss": 0.3885750472545624, "mean_token_accuracy": 0.8585678339004517, "num_tokens": 3248613.0, "step": 388 }, { "epoch": 0.29559270516717323, "grad_norm": 1.7146910429000854, "learning_rate": 4.968396347537751e-06, "loss": 0.3956252932548523, "mean_token_accuracy": 0.8555857539176941, "num_tokens": 3259996.0, "step": 389 }, { "epoch": 0.29635258358662614, "grad_norm": 2.9935412406921387, "learning_rate": 4.968063514027258e-06, "loss": 0.37694051861763, "mean_token_accuracy": 0.8573237657546997, "num_tokens": 3263195.0, "step": 390 }, { "epoch": 0.29711246200607905, "grad_norm": 2.540135145187378, "learning_rate": 4.967728948334784e-06, "loss": 0.47736361622810364, "mean_token_accuracy": 0.830003023147583, "num_tokens": 3267650.0, "step": 391 }, { "epoch": 0.2978723404255319, "grad_norm": 1.7454566955566406, "learning_rate": 4.967392650695141e-06, "loss": 0.37880340218544006, "mean_token_accuracy": 0.8590790629386902, "num_tokens": 3279001.0, "step": 392 }, { "epoch": 0.2986322188449848, "grad_norm": 2.264423370361328, "learning_rate": 4.967054621344356e-06, "loss": 0.557185709476471, "mean_token_accuracy": 0.834559977054596, "num_tokens": 3287231.0, "step": 393 }, { "epoch": 0.2993920972644377, "grad_norm": 1.8944240808486938, "learning_rate": 4.96671486051967e-06, "loss": 0.5128063559532166, "mean_token_accuracy": 0.8286118507385254, "num_tokens": 3295986.0, "step": 394 }, { "epoch": 0.30015197568389057, "grad_norm": 2.953937530517578, "learning_rate": 4.966373368459542e-06, "loss": 0.6272998452186584, "mean_token_accuracy": 0.7938471436500549, "num_tokens": 3301681.0, "step": 395 }, { "epoch": 0.3009118541033435, "grad_norm": 2.085981845855713, "learning_rate": 4.966030145403642e-06, "loss": 0.5305740237236023, "mean_token_accuracy": 0.8177378177642822, "num_tokens": 3310738.0, "step": 396 }, { "epoch": 0.30167173252279633, "grad_norm": 1.54762864112854, "learning_rate": 4.965685191592859e-06, "loss": 0.40358829498291016, "mean_token_accuracy": 0.8470232486724854, "num_tokens": 3321604.0, "step": 397 }, { "epoch": 0.30243161094224924, "grad_norm": 4.049755573272705, "learning_rate": 4.9653385072692935e-06, "loss": 0.4449229836463928, "mean_token_accuracy": 0.8261799812316895, "num_tokens": 3324318.0, "step": 398 }, { "epoch": 0.30319148936170215, "grad_norm": 2.5733447074890137, "learning_rate": 4.964990092676263e-06, "loss": 0.48942190408706665, "mean_token_accuracy": 0.8347821235656738, "num_tokens": 3329603.0, "step": 399 }, { "epoch": 0.303951367781155, "grad_norm": 2.338879108428955, "learning_rate": 4.964639948058297e-06, "loss": 0.3268873393535614, "mean_token_accuracy": 0.881216287612915, "num_tokens": 3334802.0, "step": 400 }, { "epoch": 0.3047112462006079, "grad_norm": 1.804551124572754, "learning_rate": 4.964288073661142e-06, "loss": 0.3644765317440033, "mean_token_accuracy": 0.8437183499336243, "num_tokens": 3342948.0, "step": 401 }, { "epoch": 0.30547112462006076, "grad_norm": 1.6682343482971191, "learning_rate": 4.963934469731756e-06, "loss": 0.45854973793029785, "mean_token_accuracy": 0.8507781624794006, "num_tokens": 3353740.0, "step": 402 }, { "epoch": 0.30623100303951367, "grad_norm": 4.343244552612305, "learning_rate": 4.963579136518312e-06, "loss": 0.47984182834625244, "mean_token_accuracy": 0.8383501768112183, "num_tokens": 3357648.0, "step": 403 }, { "epoch": 0.3069908814589666, "grad_norm": 2.8490872383117676, "learning_rate": 4.963222074270197e-06, "loss": 0.5992711782455444, "mean_token_accuracy": 0.8178967237472534, "num_tokens": 3363050.0, "step": 404 }, { "epoch": 0.30775075987841943, "grad_norm": 2.543656587600708, "learning_rate": 4.962863283238011e-06, "loss": 0.5602473020553589, "mean_token_accuracy": 0.8076410293579102, "num_tokens": 3369067.0, "step": 405 }, { "epoch": 0.30851063829787234, "grad_norm": 1.5536396503448486, "learning_rate": 4.962502763673566e-06, "loss": 0.477932870388031, "mean_token_accuracy": 0.8217586278915405, "num_tokens": 3382570.0, "step": 406 }, { "epoch": 0.30927051671732525, "grad_norm": 2.4474048614501953, "learning_rate": 4.96214051582989e-06, "loss": 0.48681867122650146, "mean_token_accuracy": 0.8442000150680542, "num_tokens": 3389192.0, "step": 407 }, { "epoch": 0.3100303951367781, "grad_norm": 2.3083410263061523, "learning_rate": 4.961776539961222e-06, "loss": 0.5324057340621948, "mean_token_accuracy": 0.8179268836975098, "num_tokens": 3398668.0, "step": 408 }, { "epoch": 0.310790273556231, "grad_norm": 2.712888240814209, "learning_rate": 4.961410836323014e-06, "loss": 0.5375503301620483, "mean_token_accuracy": 0.8183479309082031, "num_tokens": 3403341.0, "step": 409 }, { "epoch": 0.31155015197568386, "grad_norm": 1.5305988788604736, "learning_rate": 4.961043405171931e-06, "loss": 0.5159702301025391, "mean_token_accuracy": 0.8177282214164734, "num_tokens": 3418953.0, "step": 410 }, { "epoch": 0.3123100303951368, "grad_norm": 1.5801697969436646, "learning_rate": 4.9606742467658505e-06, "loss": 0.5146126747131348, "mean_token_accuracy": 0.8196765184402466, "num_tokens": 3437744.0, "step": 411 }, { "epoch": 0.3130699088145897, "grad_norm": 2.3317625522613525, "learning_rate": 4.960303361363863e-06, "loss": 0.5206908583641052, "mean_token_accuracy": 0.821370542049408, "num_tokens": 3444442.0, "step": 412 }, { "epoch": 0.31382978723404253, "grad_norm": 1.645398736000061, "learning_rate": 4.959930749226269e-06, "loss": 0.40867871046066284, "mean_token_accuracy": 0.8548761606216431, "num_tokens": 3456458.0, "step": 413 }, { "epoch": 0.31458966565349544, "grad_norm": 2.6546099185943604, "learning_rate": 4.9595564106145825e-06, "loss": 0.42953741550445557, "mean_token_accuracy": 0.8524375557899475, "num_tokens": 3460874.0, "step": 414 }, { "epoch": 0.31534954407294835, "grad_norm": 1.6146862506866455, "learning_rate": 4.959180345791528e-06, "loss": 0.45501887798309326, "mean_token_accuracy": 0.8171894550323486, "num_tokens": 3475225.0, "step": 415 }, { "epoch": 0.3161094224924012, "grad_norm": 1.3038263320922852, "learning_rate": 4.958802555021042e-06, "loss": 0.42623579502105713, "mean_token_accuracy": 0.8478569984436035, "num_tokens": 3493549.0, "step": 416 }, { "epoch": 0.3168693009118541, "grad_norm": 2.0775184631347656, "learning_rate": 4.958423038568274e-06, "loss": 0.370103120803833, "mean_token_accuracy": 0.8694682717323303, "num_tokens": 3499222.0, "step": 417 }, { "epoch": 0.31762917933130697, "grad_norm": 1.9842660427093506, "learning_rate": 4.958041796699583e-06, "loss": 0.4976680278778076, "mean_token_accuracy": 0.8439270853996277, "num_tokens": 3507686.0, "step": 418 }, { "epoch": 0.3183890577507599, "grad_norm": 2.6495370864868164, "learning_rate": 4.957658829682539e-06, "loss": 0.4992824196815491, "mean_token_accuracy": 0.8215071558952332, "num_tokens": 3512261.0, "step": 419 }, { "epoch": 0.3191489361702128, "grad_norm": 1.758331537246704, "learning_rate": 4.9572741377859225e-06, "loss": 0.5421558618545532, "mean_token_accuracy": 0.8140615224838257, "num_tokens": 3522889.0, "step": 420 }, { "epoch": 0.31990881458966564, "grad_norm": 2.9629287719726562, "learning_rate": 4.956887721279726e-06, "loss": 0.49583905935287476, "mean_token_accuracy": 0.81712406873703, "num_tokens": 3527387.0, "step": 421 }, { "epoch": 0.32066869300911854, "grad_norm": 1.8280107975006104, "learning_rate": 4.95649958043515e-06, "loss": 0.3644634783267975, "mean_token_accuracy": 0.8655364513397217, "num_tokens": 3534002.0, "step": 422 }, { "epoch": 0.32142857142857145, "grad_norm": 2.3438503742218018, "learning_rate": 4.956109715524609e-06, "loss": 0.5218960046768188, "mean_token_accuracy": 0.8156325817108154, "num_tokens": 3540378.0, "step": 423 }, { "epoch": 0.3221884498480243, "grad_norm": 2.914623737335205, "learning_rate": 4.9557181268217225e-06, "loss": 0.5090000629425049, "mean_token_accuracy": 0.8220853805541992, "num_tokens": 3544791.0, "step": 424 }, { "epoch": 0.3229483282674772, "grad_norm": 1.8533551692962646, "learning_rate": 4.955324814601324e-06, "loss": 0.4710542559623718, "mean_token_accuracy": 0.8278185129165649, "num_tokens": 3554244.0, "step": 425 }, { "epoch": 0.32370820668693007, "grad_norm": 2.895254135131836, "learning_rate": 4.954929779139455e-06, "loss": 0.5684993863105774, "mean_token_accuracy": 0.8432695269584656, "num_tokens": 3560409.0, "step": 426 }, { "epoch": 0.324468085106383, "grad_norm": 2.5141751766204834, "learning_rate": 4.954533020713367e-06, "loss": 0.48398154973983765, "mean_token_accuracy": 0.8218153119087219, "num_tokens": 3567275.0, "step": 427 }, { "epoch": 0.3252279635258359, "grad_norm": 3.102905511856079, "learning_rate": 4.954134539601519e-06, "loss": 0.5117533206939697, "mean_token_accuracy": 0.8482083082199097, "num_tokens": 3572195.0, "step": 428 }, { "epoch": 0.32598784194528874, "grad_norm": 1.5614527463912964, "learning_rate": 4.953734336083582e-06, "loss": 0.39276060461997986, "mean_token_accuracy": 0.8795406818389893, "num_tokens": 3583260.0, "step": 429 }, { "epoch": 0.32674772036474165, "grad_norm": 2.461669921875, "learning_rate": 4.953332410440434e-06, "loss": 0.6011022329330444, "mean_token_accuracy": 0.790380597114563, "num_tokens": 3593519.0, "step": 430 }, { "epoch": 0.32750759878419455, "grad_norm": 1.4988338947296143, "learning_rate": 4.952928762954161e-06, "loss": 0.3363340198993683, "mean_token_accuracy": 0.882912278175354, "num_tokens": 3603554.0, "step": 431 }, { "epoch": 0.3282674772036474, "grad_norm": 2.019150733947754, "learning_rate": 4.952523393908059e-06, "loss": 0.4858176112174988, "mean_token_accuracy": 0.8212261199951172, "num_tokens": 3611867.0, "step": 432 }, { "epoch": 0.3290273556231003, "grad_norm": 2.2393953800201416, "learning_rate": 4.952116303586631e-06, "loss": 0.4052902162075043, "mean_token_accuracy": 0.851185142993927, "num_tokens": 3617158.0, "step": 433 }, { "epoch": 0.32978723404255317, "grad_norm": 2.0428338050842285, "learning_rate": 4.951707492275589e-06, "loss": 0.4827128052711487, "mean_token_accuracy": 0.8306541442871094, "num_tokens": 3625958.0, "step": 434 }, { "epoch": 0.3305471124620061, "grad_norm": 3.0937533378601074, "learning_rate": 4.951296960261853e-06, "loss": 0.31141477823257446, "mean_token_accuracy": 0.894458532333374, "num_tokens": 3629275.0, "step": 435 }, { "epoch": 0.331306990881459, "grad_norm": 2.3901596069335938, "learning_rate": 4.95088470783355e-06, "loss": 0.5179769992828369, "mean_token_accuracy": 0.8250888586044312, "num_tokens": 3634963.0, "step": 436 }, { "epoch": 0.33206686930091184, "grad_norm": 2.4738881587982178, "learning_rate": 4.950470735280013e-06, "loss": 0.45892447233200073, "mean_token_accuracy": 0.8635761737823486, "num_tokens": 3640657.0, "step": 437 }, { "epoch": 0.33282674772036475, "grad_norm": 2.332380771636963, "learning_rate": 4.950055042891786e-06, "loss": 0.47390294075012207, "mean_token_accuracy": 0.86452317237854, "num_tokens": 3646819.0, "step": 438 }, { "epoch": 0.33358662613981765, "grad_norm": 4.826568126678467, "learning_rate": 4.949637630960618e-06, "loss": 0.48327547311782837, "mean_token_accuracy": 0.8322122097015381, "num_tokens": 3648870.0, "step": 439 }, { "epoch": 0.3343465045592705, "grad_norm": 2.105173349380493, "learning_rate": 4.949218499779462e-06, "loss": 0.5252559185028076, "mean_token_accuracy": 0.8192916512489319, "num_tokens": 3657851.0, "step": 440 }, { "epoch": 0.3351063829787234, "grad_norm": 1.8412903547286987, "learning_rate": 4.948797649642484e-06, "loss": 0.49293676018714905, "mean_token_accuracy": 0.8469193577766418, "num_tokens": 3669262.0, "step": 441 }, { "epoch": 0.33586626139817627, "grad_norm": 3.4044852256774902, "learning_rate": 4.94837508084505e-06, "loss": 0.6744300127029419, "mean_token_accuracy": 0.7866237163543701, "num_tokens": 3673313.0, "step": 442 }, { "epoch": 0.3366261398176292, "grad_norm": 1.8514925241470337, "learning_rate": 4.9479507936837364e-06, "loss": 0.43219512701034546, "mean_token_accuracy": 0.8449255228042603, "num_tokens": 3681998.0, "step": 443 }, { "epoch": 0.3373860182370821, "grad_norm": 2.966836452484131, "learning_rate": 4.947524788456325e-06, "loss": 0.5979588031768799, "mean_token_accuracy": 0.8045225143432617, "num_tokens": 3686556.0, "step": 444 }, { "epoch": 0.33814589665653494, "grad_norm": 1.5906144380569458, "learning_rate": 4.947097065461801e-06, "loss": 0.48026829957962036, "mean_token_accuracy": 0.8438354134559631, "num_tokens": 3698564.0, "step": 445 }, { "epoch": 0.33890577507598785, "grad_norm": 1.947380542755127, "learning_rate": 4.946667625000358e-06, "loss": 0.4435119032859802, "mean_token_accuracy": 0.8234341740608215, "num_tokens": 3705749.0, "step": 446 }, { "epoch": 0.33966565349544076, "grad_norm": 1.73146390914917, "learning_rate": 4.946236467373392e-06, "loss": 0.5239195227622986, "mean_token_accuracy": 0.8134052157402039, "num_tokens": 3716071.0, "step": 447 }, { "epoch": 0.3404255319148936, "grad_norm": 1.9863660335540771, "learning_rate": 4.945803592883509e-06, "loss": 0.5005546808242798, "mean_token_accuracy": 0.8310836553573608, "num_tokens": 3724284.0, "step": 448 }, { "epoch": 0.3411854103343465, "grad_norm": 1.7305761575698853, "learning_rate": 4.9453690018345144e-06, "loss": 0.40787550806999207, "mean_token_accuracy": 0.8583958148956299, "num_tokens": 3734627.0, "step": 449 }, { "epoch": 0.34194528875379937, "grad_norm": 1.4045218229293823, "learning_rate": 4.944932694531423e-06, "loss": 0.5022330284118652, "mean_token_accuracy": 0.8324989080429077, "num_tokens": 3754357.0, "step": 450 }, { "epoch": 0.3427051671732523, "grad_norm": 1.7415108680725098, "learning_rate": 4.94449467128045e-06, "loss": 0.39325833320617676, "mean_token_accuracy": 0.8628256320953369, "num_tokens": 3763300.0, "step": 451 }, { "epoch": 0.3434650455927052, "grad_norm": 2.2774908542633057, "learning_rate": 4.944054932389018e-06, "loss": 0.5054275393486023, "mean_token_accuracy": 0.856291651725769, "num_tokens": 3769233.0, "step": 452 }, { "epoch": 0.34422492401215804, "grad_norm": 1.5996630191802979, "learning_rate": 4.943613478165753e-06, "loss": 0.39869362115859985, "mean_token_accuracy": 0.8556894659996033, "num_tokens": 3779668.0, "step": 453 }, { "epoch": 0.34498480243161095, "grad_norm": 2.8231725692749023, "learning_rate": 4.943170308920484e-06, "loss": 0.4610729217529297, "mean_token_accuracy": 0.8591855764389038, "num_tokens": 3783629.0, "step": 454 }, { "epoch": 0.34574468085106386, "grad_norm": 2.540994882583618, "learning_rate": 4.9427254249642445e-06, "loss": 0.5615667104721069, "mean_token_accuracy": 0.8084384202957153, "num_tokens": 3790579.0, "step": 455 }, { "epoch": 0.3465045592705167, "grad_norm": 1.7328214645385742, "learning_rate": 4.942278826609272e-06, "loss": 0.5150455236434937, "mean_token_accuracy": 0.8214741349220276, "num_tokens": 3800795.0, "step": 456 }, { "epoch": 0.3472644376899696, "grad_norm": 1.6073330640792847, "learning_rate": 4.9418305141690045e-06, "loss": 0.48729372024536133, "mean_token_accuracy": 0.8309370875358582, "num_tokens": 3813738.0, "step": 457 }, { "epoch": 0.34802431610942247, "grad_norm": 3.1176810264587402, "learning_rate": 4.9413804879580865e-06, "loss": 0.5165641903877258, "mean_token_accuracy": 0.8539294600486755, "num_tokens": 3818088.0, "step": 458 }, { "epoch": 0.3487841945288754, "grad_norm": 1.476922869682312, "learning_rate": 4.940928748292363e-06, "loss": 0.5822708606719971, "mean_token_accuracy": 0.8083155751228333, "num_tokens": 3839183.0, "step": 459 }, { "epoch": 0.3495440729483283, "grad_norm": 2.4246726036071777, "learning_rate": 4.940475295488882e-06, "loss": 0.42251867055892944, "mean_token_accuracy": 0.8503992557525635, "num_tokens": 3844849.0, "step": 460 }, { "epoch": 0.35030395136778114, "grad_norm": 1.3491480350494385, "learning_rate": 4.940020129865895e-06, "loss": 0.4598064124584198, "mean_token_accuracy": 0.830859899520874, "num_tokens": 3862108.0, "step": 461 }, { "epoch": 0.35106382978723405, "grad_norm": 2.066025495529175, "learning_rate": 4.9395632517428546e-06, "loss": 0.5363115072250366, "mean_token_accuracy": 0.8228449821472168, "num_tokens": 3870682.0, "step": 462 }, { "epoch": 0.3518237082066869, "grad_norm": 1.7449887990951538, "learning_rate": 4.939104661440415e-06, "loss": 0.42669913172721863, "mean_token_accuracy": 0.8581840395927429, "num_tokens": 3885309.0, "step": 463 }, { "epoch": 0.3525835866261398, "grad_norm": 2.282083749771118, "learning_rate": 4.938644359280433e-06, "loss": 0.528269350528717, "mean_token_accuracy": 0.8524715900421143, "num_tokens": 3892692.0, "step": 464 }, { "epoch": 0.3533434650455927, "grad_norm": 1.9782079458236694, "learning_rate": 4.938182345585967e-06, "loss": 0.5342779755592346, "mean_token_accuracy": 0.8000766038894653, "num_tokens": 3901731.0, "step": 465 }, { "epoch": 0.3541033434650456, "grad_norm": 2.3067269325256348, "learning_rate": 4.937718620681273e-06, "loss": 0.4966881275177002, "mean_token_accuracy": 0.8279182314872742, "num_tokens": 3908966.0, "step": 466 }, { "epoch": 0.3548632218844985, "grad_norm": 1.9411311149597168, "learning_rate": 4.9372531848918145e-06, "loss": 0.5149158239364624, "mean_token_accuracy": 0.8420406579971313, "num_tokens": 3919028.0, "step": 467 }, { "epoch": 0.3556231003039514, "grad_norm": 1.9435569047927856, "learning_rate": 4.936786038544251e-06, "loss": 0.5169678926467896, "mean_token_accuracy": 0.8235641121864319, "num_tokens": 3927936.0, "step": 468 }, { "epoch": 0.35638297872340424, "grad_norm": 1.3978698253631592, "learning_rate": 4.9363171819664434e-06, "loss": 0.5187251567840576, "mean_token_accuracy": 0.8063663244247437, "num_tokens": 3952117.0, "step": 469 }, { "epoch": 0.35714285714285715, "grad_norm": 2.639873743057251, "learning_rate": 4.9358466154874535e-06, "loss": 0.4771063029766083, "mean_token_accuracy": 0.8208592534065247, "num_tokens": 3957083.0, "step": 470 }, { "epoch": 0.35790273556231, "grad_norm": 1.6088488101959229, "learning_rate": 4.935374339437543e-06, "loss": 0.5243949294090271, "mean_token_accuracy": 0.8492621779441833, "num_tokens": 3972633.0, "step": 471 }, { "epoch": 0.3586626139817629, "grad_norm": 3.3320486545562744, "learning_rate": 4.934900354148173e-06, "loss": 0.4870304763317108, "mean_token_accuracy": 0.8500401973724365, "num_tokens": 3975536.0, "step": 472 }, { "epoch": 0.3594224924012158, "grad_norm": 2.7519044876098633, "learning_rate": 4.934424659952006e-06, "loss": 0.3919612467288971, "mean_token_accuracy": 0.8723220825195312, "num_tokens": 3979855.0, "step": 473 }, { "epoch": 0.3601823708206687, "grad_norm": 1.1771601438522339, "learning_rate": 4.933947257182901e-06, "loss": 0.38711655139923096, "mean_token_accuracy": 0.8588876128196716, "num_tokens": 4004167.0, "step": 474 }, { "epoch": 0.3609422492401216, "grad_norm": 1.7675265073776245, "learning_rate": 4.933468146175918e-06, "loss": 0.5844885110855103, "mean_token_accuracy": 0.8076567649841309, "num_tokens": 4016801.0, "step": 475 }, { "epoch": 0.3617021276595745, "grad_norm": 3.0058584213256836, "learning_rate": 4.932987327267317e-06, "loss": 0.4400174021720886, "mean_token_accuracy": 0.8469029664993286, "num_tokens": 4022567.0, "step": 476 }, { "epoch": 0.36246200607902734, "grad_norm": 1.3611799478530884, "learning_rate": 4.932504800794553e-06, "loss": 0.4285426139831543, "mean_token_accuracy": 0.8450878858566284, "num_tokens": 4036585.0, "step": 477 }, { "epoch": 0.36322188449848025, "grad_norm": 1.4490348100662231, "learning_rate": 4.9320205670962815e-06, "loss": 0.5200105309486389, "mean_token_accuracy": 0.816506028175354, "num_tokens": 4052671.0, "step": 478 }, { "epoch": 0.3639817629179331, "grad_norm": 2.0383307933807373, "learning_rate": 4.931534626512359e-06, "loss": 0.4381054937839508, "mean_token_accuracy": 0.8396817445755005, "num_tokens": 4061884.0, "step": 479 }, { "epoch": 0.364741641337386, "grad_norm": 1.854593276977539, "learning_rate": 4.931046979383836e-06, "loss": 0.4555840492248535, "mean_token_accuracy": 0.84235680103302, "num_tokens": 4070797.0, "step": 480 }, { "epoch": 0.3655015197568389, "grad_norm": 2.12614107131958, "learning_rate": 4.930557626052961e-06, "loss": 0.3838217854499817, "mean_token_accuracy": 0.8676179051399231, "num_tokens": 4076345.0, "step": 481 }, { "epoch": 0.3662613981762918, "grad_norm": 1.612610936164856, "learning_rate": 4.930066566863182e-06, "loss": 0.5174338817596436, "mean_token_accuracy": 0.8290661573410034, "num_tokens": 4092149.0, "step": 482 }, { "epoch": 0.3670212765957447, "grad_norm": 2.1137144565582275, "learning_rate": 4.929573802159143e-06, "loss": 0.4602130651473999, "mean_token_accuracy": 0.8441717624664307, "num_tokens": 4098977.0, "step": 483 }, { "epoch": 0.3677811550151976, "grad_norm": 1.9106091260910034, "learning_rate": 4.929079332286685e-06, "loss": 0.42526084184646606, "mean_token_accuracy": 0.8522722721099854, "num_tokens": 4106471.0, "step": 484 }, { "epoch": 0.36854103343465044, "grad_norm": 1.719895601272583, "learning_rate": 4.928583157592846e-06, "loss": 0.38735371828079224, "mean_token_accuracy": 0.8658885955810547, "num_tokens": 4116297.0, "step": 485 }, { "epoch": 0.36930091185410335, "grad_norm": 1.820185899734497, "learning_rate": 4.928085278425862e-06, "loss": 0.5133121609687805, "mean_token_accuracy": 0.8316508531570435, "num_tokens": 4127473.0, "step": 486 }, { "epoch": 0.3700607902735562, "grad_norm": 1.9347177743911743, "learning_rate": 4.927585695135162e-06, "loss": 0.5389706492424011, "mean_token_accuracy": 0.8159632682800293, "num_tokens": 4136883.0, "step": 487 }, { "epoch": 0.3708206686930091, "grad_norm": 2.309093713760376, "learning_rate": 4.9270844080713735e-06, "loss": 0.5580676198005676, "mean_token_accuracy": 0.8078963160514832, "num_tokens": 4143537.0, "step": 488 }, { "epoch": 0.371580547112462, "grad_norm": 1.7023398876190186, "learning_rate": 4.926581417586319e-06, "loss": 0.49399808049201965, "mean_token_accuracy": 0.8330680131912231, "num_tokens": 4155279.0, "step": 489 }, { "epoch": 0.3723404255319149, "grad_norm": 1.7478828430175781, "learning_rate": 4.926076724033016e-06, "loss": 0.4861065149307251, "mean_token_accuracy": 0.8224774599075317, "num_tokens": 4165624.0, "step": 490 }, { "epoch": 0.3731003039513678, "grad_norm": 1.8368672132492065, "learning_rate": 4.925570327765678e-06, "loss": 0.5203143358230591, "mean_token_accuracy": 0.8500494956970215, "num_tokens": 4179106.0, "step": 491 }, { "epoch": 0.3738601823708207, "grad_norm": 1.7900545597076416, "learning_rate": 4.9250622291397144e-06, "loss": 0.29317501187324524, "mean_token_accuracy": 0.8932639360427856, "num_tokens": 4185792.0, "step": 492 }, { "epoch": 0.37462006079027355, "grad_norm": 1.993884563446045, "learning_rate": 4.924552428511727e-06, "loss": 0.41911810636520386, "mean_token_accuracy": 0.8486036062240601, "num_tokens": 4193481.0, "step": 493 }, { "epoch": 0.37537993920972645, "grad_norm": 1.8426238298416138, "learning_rate": 4.924040926239515e-06, "loss": 0.5417478084564209, "mean_token_accuracy": 0.7899638414382935, "num_tokens": 4206182.0, "step": 494 }, { "epoch": 0.3761398176291793, "grad_norm": 2.032972812652588, "learning_rate": 4.92352772268207e-06, "loss": 0.4417288899421692, "mean_token_accuracy": 0.8448511362075806, "num_tokens": 4212604.0, "step": 495 }, { "epoch": 0.3768996960486322, "grad_norm": 2.371108293533325, "learning_rate": 4.923012818199576e-06, "loss": 0.4979432225227356, "mean_token_accuracy": 0.8591092824935913, "num_tokens": 4217960.0, "step": 496 }, { "epoch": 0.3776595744680851, "grad_norm": 2.846374750137329, "learning_rate": 4.922496213153416e-06, "loss": 0.4680670201778412, "mean_token_accuracy": 0.8293017148971558, "num_tokens": 4222783.0, "step": 497 }, { "epoch": 0.378419452887538, "grad_norm": 1.91952645778656, "learning_rate": 4.921977907906161e-06, "loss": 0.46989706158638, "mean_token_accuracy": 0.8414992094039917, "num_tokens": 4230617.0, "step": 498 }, { "epoch": 0.3791793313069909, "grad_norm": 2.1629347801208496, "learning_rate": 4.921457902821578e-06, "loss": 0.40130868554115295, "mean_token_accuracy": 0.8518390655517578, "num_tokens": 4235935.0, "step": 499 }, { "epoch": 0.3799392097264438, "grad_norm": 1.874174952507019, "learning_rate": 4.9209361982646275e-06, "loss": 0.48775261640548706, "mean_token_accuracy": 0.8302479982376099, "num_tokens": 4244450.0, "step": 500 }, { "epoch": 0.38069908814589665, "grad_norm": 2.055781126022339, "learning_rate": 4.920412794601461e-06, "loss": 0.47624891996383667, "mean_token_accuracy": 0.8339136242866516, "num_tokens": 4251400.0, "step": 501 }, { "epoch": 0.38145896656534956, "grad_norm": 2.230872392654419, "learning_rate": 4.919887692199423e-06, "loss": 0.4844909906387329, "mean_token_accuracy": 0.8168134689331055, "num_tokens": 4258101.0, "step": 502 }, { "epoch": 0.3822188449848024, "grad_norm": 2.1640610694885254, "learning_rate": 4.9193608914270515e-06, "loss": 0.563758134841919, "mean_token_accuracy": 0.8058563470840454, "num_tokens": 4267449.0, "step": 503 }, { "epoch": 0.3829787234042553, "grad_norm": 2.2596869468688965, "learning_rate": 4.918832392654075e-06, "loss": 0.497257798910141, "mean_token_accuracy": 0.8271679878234863, "num_tokens": 4274005.0, "step": 504 }, { "epoch": 0.3837386018237082, "grad_norm": 1.68129563331604, "learning_rate": 4.9183021962514145e-06, "loss": 0.5896461606025696, "mean_token_accuracy": 0.796101450920105, "num_tokens": 4289095.0, "step": 505 }, { "epoch": 0.3844984802431611, "grad_norm": 1.684326410293579, "learning_rate": 4.917770302591183e-06, "loss": 0.3357805609703064, "mean_token_accuracy": 0.877285361289978, "num_tokens": 4298089.0, "step": 506 }, { "epoch": 0.385258358662614, "grad_norm": 1.5425621271133423, "learning_rate": 4.917236712046682e-06, "loss": 0.5053616762161255, "mean_token_accuracy": 0.8094472289085388, "num_tokens": 4315325.0, "step": 507 }, { "epoch": 0.3860182370820669, "grad_norm": 1.8255196809768677, "learning_rate": 4.9167014249924075e-06, "loss": 0.3369247317314148, "mean_token_accuracy": 0.8630348443984985, "num_tokens": 4322731.0, "step": 508 }, { "epoch": 0.38677811550151975, "grad_norm": 2.2271909713745117, "learning_rate": 4.916164441804044e-06, "loss": 0.5025190711021423, "mean_token_accuracy": 0.8220493197441101, "num_tokens": 4329541.0, "step": 509 }, { "epoch": 0.38753799392097266, "grad_norm": 2.1169731616973877, "learning_rate": 4.915625762858467e-06, "loss": 0.49157875776290894, "mean_token_accuracy": 0.8319511413574219, "num_tokens": 4335972.0, "step": 510 }, { "epoch": 0.3882978723404255, "grad_norm": 1.2977492809295654, "learning_rate": 4.915085388533743e-06, "loss": 0.4655773341655731, "mean_token_accuracy": 0.8238399028778076, "num_tokens": 4355657.0, "step": 511 }, { "epoch": 0.3890577507598784, "grad_norm": 2.400388479232788, "learning_rate": 4.914543319209126e-06, "loss": 0.510635256767273, "mean_token_accuracy": 0.8391962051391602, "num_tokens": 4361302.0, "step": 512 }, { "epoch": 0.3898176291793313, "grad_norm": 2.808622121810913, "learning_rate": 4.913999555265062e-06, "loss": 0.3918214440345764, "mean_token_accuracy": 0.854076623916626, "num_tokens": 4365314.0, "step": 513 }, { "epoch": 0.3905775075987842, "grad_norm": 2.3694703578948975, "learning_rate": 4.913454097083185e-06, "loss": 0.4798097312450409, "mean_token_accuracy": 0.8340897560119629, "num_tokens": 4370592.0, "step": 514 }, { "epoch": 0.3913373860182371, "grad_norm": 2.32905650138855, "learning_rate": 4.912906945046319e-06, "loss": 0.5024875998497009, "mean_token_accuracy": 0.8495163917541504, "num_tokens": 4376713.0, "step": 515 }, { "epoch": 0.39209726443769, "grad_norm": 1.4975775480270386, "learning_rate": 4.912358099538476e-06, "loss": 0.44349104166030884, "mean_token_accuracy": 0.8256516456604004, "num_tokens": 4392145.0, "step": 516 }, { "epoch": 0.39285714285714285, "grad_norm": 1.3479279279708862, "learning_rate": 4.911807560944858e-06, "loss": 0.39952075481414795, "mean_token_accuracy": 0.8625344038009644, "num_tokens": 4407508.0, "step": 517 }, { "epoch": 0.39361702127659576, "grad_norm": 2.503182888031006, "learning_rate": 4.911255329651852e-06, "loss": 0.5685954689979553, "mean_token_accuracy": 0.8361418843269348, "num_tokens": 4413300.0, "step": 518 }, { "epoch": 0.3943768996960486, "grad_norm": 1.550897479057312, "learning_rate": 4.910701406047037e-06, "loss": 0.5214766263961792, "mean_token_accuracy": 0.8029816150665283, "num_tokens": 4432131.0, "step": 519 }, { "epoch": 0.3951367781155015, "grad_norm": 2.2570624351501465, "learning_rate": 4.910145790519177e-06, "loss": 0.5039506554603577, "mean_token_accuracy": 0.8227694034576416, "num_tokens": 4439014.0, "step": 520 }, { "epoch": 0.3958966565349544, "grad_norm": 1.2405915260314941, "learning_rate": 4.9095884834582256e-06, "loss": 0.43927496671676636, "mean_token_accuracy": 0.8406883478164673, "num_tokens": 4456142.0, "step": 521 }, { "epoch": 0.3966565349544073, "grad_norm": 2.8999664783477783, "learning_rate": 4.909029485255321e-06, "loss": 0.4566405415534973, "mean_token_accuracy": 0.8406169414520264, "num_tokens": 4460155.0, "step": 522 }, { "epoch": 0.3974164133738602, "grad_norm": 2.3775827884674072, "learning_rate": 4.90846879630279e-06, "loss": 0.47347691655158997, "mean_token_accuracy": 0.8391696214675903, "num_tokens": 4466962.0, "step": 523 }, { "epoch": 0.3981762917933131, "grad_norm": 2.705432176589966, "learning_rate": 4.907906416994146e-06, "loss": 0.34877750277519226, "mean_token_accuracy": 0.8586087822914124, "num_tokens": 4472049.0, "step": 524 }, { "epoch": 0.39893617021276595, "grad_norm": 2.1355597972869873, "learning_rate": 4.907342347724088e-06, "loss": 0.5352267026901245, "mean_token_accuracy": 0.8132659196853638, "num_tokens": 4479469.0, "step": 525 }, { "epoch": 0.39969604863221886, "grad_norm": 2.642930269241333, "learning_rate": 4.906776588888502e-06, "loss": 0.5417162179946899, "mean_token_accuracy": 0.8195326328277588, "num_tokens": 4484888.0, "step": 526 }, { "epoch": 0.4004559270516717, "grad_norm": 1.9653481245040894, "learning_rate": 4.906209140884459e-06, "loss": 0.5188787579536438, "mean_token_accuracy": 0.8164578080177307, "num_tokens": 4494126.0, "step": 527 }, { "epoch": 0.4012158054711246, "grad_norm": 2.1649622917175293, "learning_rate": 4.905640004110216e-06, "loss": 0.5350884199142456, "mean_token_accuracy": 0.8144514560699463, "num_tokens": 4500747.0, "step": 528 }, { "epoch": 0.40197568389057753, "grad_norm": 1.780606985092163, "learning_rate": 4.905069178965215e-06, "loss": 0.49227356910705566, "mean_token_accuracy": 0.8300670385360718, "num_tokens": 4511324.0, "step": 529 }, { "epoch": 0.4027355623100304, "grad_norm": 2.383995771408081, "learning_rate": 4.904496665850083e-06, "loss": 0.5728126764297485, "mean_token_accuracy": 0.7982839345932007, "num_tokens": 4518388.0, "step": 530 }, { "epoch": 0.4034954407294833, "grad_norm": 2.1188619136810303, "learning_rate": 4.903922465166633e-06, "loss": 0.5170526504516602, "mean_token_accuracy": 0.8247905969619751, "num_tokens": 4524956.0, "step": 531 }, { "epoch": 0.40425531914893614, "grad_norm": 1.3629902601242065, "learning_rate": 4.903346577317859e-06, "loss": 0.4504483938217163, "mean_token_accuracy": 0.8357315063476562, "num_tokens": 4542947.0, "step": 532 }, { "epoch": 0.40501519756838905, "grad_norm": 1.911316156387329, "learning_rate": 4.902769002707942e-06, "loss": 0.3103373944759369, "mean_token_accuracy": 0.8955463767051697, "num_tokens": 4548663.0, "step": 533 }, { "epoch": 0.40577507598784196, "grad_norm": 1.58231782913208, "learning_rate": 4.902189741742247e-06, "loss": 0.4453960359096527, "mean_token_accuracy": 0.8380752205848694, "num_tokens": 4561920.0, "step": 534 }, { "epoch": 0.4065349544072948, "grad_norm": 2.2610323429107666, "learning_rate": 4.901608794827321e-06, "loss": 0.37087973952293396, "mean_token_accuracy": 0.8731696605682373, "num_tokens": 4566566.0, "step": 535 }, { "epoch": 0.4072948328267477, "grad_norm": 2.357311248779297, "learning_rate": 4.9010261623708945e-06, "loss": 0.4292754530906677, "mean_token_accuracy": 0.8524476289749146, "num_tokens": 4572060.0, "step": 536 }, { "epoch": 0.40805471124620063, "grad_norm": 1.5455151796340942, "learning_rate": 4.900441844781882e-06, "loss": 0.49441856145858765, "mean_token_accuracy": 0.8392636775970459, "num_tokens": 4583971.0, "step": 537 }, { "epoch": 0.4088145896656535, "grad_norm": 2.362008810043335, "learning_rate": 4.89985584247038e-06, "loss": 0.45737266540527344, "mean_token_accuracy": 0.862938642501831, "num_tokens": 4590152.0, "step": 538 }, { "epoch": 0.4095744680851064, "grad_norm": 1.7617286443710327, "learning_rate": 4.899268155847667e-06, "loss": 0.45554471015930176, "mean_token_accuracy": 0.8370910882949829, "num_tokens": 4600587.0, "step": 539 }, { "epoch": 0.41033434650455924, "grad_norm": 2.217749834060669, "learning_rate": 4.898678785326205e-06, "loss": 0.4922047257423401, "mean_token_accuracy": 0.8233155608177185, "num_tokens": 4608944.0, "step": 540 }, { "epoch": 0.41109422492401215, "grad_norm": 2.5413386821746826, "learning_rate": 4.898087731319637e-06, "loss": 0.40927654504776, "mean_token_accuracy": 0.87587571144104, "num_tokens": 4613298.0, "step": 541 }, { "epoch": 0.41185410334346506, "grad_norm": 4.016610145568848, "learning_rate": 4.8974949942427854e-06, "loss": 0.44536292552948, "mean_token_accuracy": 0.8415871858596802, "num_tokens": 4615879.0, "step": 542 }, { "epoch": 0.4126139817629179, "grad_norm": 1.7483938932418823, "learning_rate": 4.896900574511657e-06, "loss": 0.4535871148109436, "mean_token_accuracy": 0.840785026550293, "num_tokens": 4625246.0, "step": 543 }, { "epoch": 0.4133738601823708, "grad_norm": 2.8072991371154785, "learning_rate": 4.89630447254344e-06, "loss": 0.6105838418006897, "mean_token_accuracy": 0.8333185911178589, "num_tokens": 4636022.0, "step": 544 }, { "epoch": 0.41413373860182373, "grad_norm": 1.475715160369873, "learning_rate": 4.8957066887565005e-06, "loss": 0.43646636605262756, "mean_token_accuracy": 0.8444384932518005, "num_tokens": 4649774.0, "step": 545 }, { "epoch": 0.4148936170212766, "grad_norm": 2.4876935482025146, "learning_rate": 4.895107223570386e-06, "loss": 0.4014703035354614, "mean_token_accuracy": 0.8722496628761292, "num_tokens": 4654593.0, "step": 546 }, { "epoch": 0.4156534954407295, "grad_norm": 2.698641538619995, "learning_rate": 4.894506077405824e-06, "loss": 0.5129483938217163, "mean_token_accuracy": 0.8329781889915466, "num_tokens": 4660158.0, "step": 547 }, { "epoch": 0.41641337386018235, "grad_norm": 2.892404317855835, "learning_rate": 4.893903250684723e-06, "loss": 0.435560405254364, "mean_token_accuracy": 0.8472626209259033, "num_tokens": 4663705.0, "step": 548 }, { "epoch": 0.41717325227963525, "grad_norm": 2.3079450130462646, "learning_rate": 4.893298743830168e-06, "loss": 0.4949483871459961, "mean_token_accuracy": 0.820269763469696, "num_tokens": 4669885.0, "step": 549 }, { "epoch": 0.41793313069908816, "grad_norm": 2.3712430000305176, "learning_rate": 4.892692557266429e-06, "loss": 0.46948671340942383, "mean_token_accuracy": 0.8335106372833252, "num_tokens": 4675845.0, "step": 550 }, { "epoch": 0.418693009118541, "grad_norm": 3.5355911254882812, "learning_rate": 4.8920846914189465e-06, "loss": 0.5023574233055115, "mean_token_accuracy": 0.8397154808044434, "num_tokens": 4678778.0, "step": 551 }, { "epoch": 0.4194528875379939, "grad_norm": 1.6729077100753784, "learning_rate": 4.891475146714348e-06, "loss": 0.5844266414642334, "mean_token_accuracy": 0.7964673042297363, "num_tokens": 4692971.0, "step": 552 }, { "epoch": 0.42021276595744683, "grad_norm": 1.5726003646850586, "learning_rate": 4.8908639235804324e-06, "loss": 0.4634174406528473, "mean_token_accuracy": 0.8316428661346436, "num_tokens": 4706483.0, "step": 553 }, { "epoch": 0.4209726443768997, "grad_norm": 1.5735766887664795, "learning_rate": 4.890251022446181e-06, "loss": 0.5243015289306641, "mean_token_accuracy": 0.8176264762878418, "num_tokens": 4721006.0, "step": 554 }, { "epoch": 0.4217325227963526, "grad_norm": 1.8711097240447998, "learning_rate": 4.889636443741752e-06, "loss": 0.42648065090179443, "mean_token_accuracy": 0.8540663719177246, "num_tokens": 4731234.0, "step": 555 }, { "epoch": 0.42249240121580545, "grad_norm": 2.1808011531829834, "learning_rate": 4.88902018789848e-06, "loss": 0.4090477228164673, "mean_token_accuracy": 0.8533409833908081, "num_tokens": 4736982.0, "step": 556 }, { "epoch": 0.42325227963525835, "grad_norm": 1.9000643491744995, "learning_rate": 4.888402255348877e-06, "loss": 0.4920252561569214, "mean_token_accuracy": 0.8278638124465942, "num_tokens": 4746015.0, "step": 557 }, { "epoch": 0.42401215805471126, "grad_norm": 1.6835788488388062, "learning_rate": 4.887782646526631e-06, "loss": 0.5171347856521606, "mean_token_accuracy": 0.8305040001869202, "num_tokens": 4757995.0, "step": 558 }, { "epoch": 0.4247720364741641, "grad_norm": 2.4674644470214844, "learning_rate": 4.887161361866608e-06, "loss": 0.5379098653793335, "mean_token_accuracy": 0.805732011795044, "num_tokens": 4766333.0, "step": 559 }, { "epoch": 0.425531914893617, "grad_norm": 2.1818697452545166, "learning_rate": 4.8865384018048494e-06, "loss": 0.5358027219772339, "mean_token_accuracy": 0.8157109022140503, "num_tokens": 4773302.0, "step": 560 }, { "epoch": 0.42629179331306993, "grad_norm": 1.5563722848892212, "learning_rate": 4.8859137667785735e-06, "loss": 0.48180875182151794, "mean_token_accuracy": 0.8297841548919678, "num_tokens": 4785130.0, "step": 561 }, { "epoch": 0.4270516717325228, "grad_norm": 2.0417134761810303, "learning_rate": 4.8852874572261715e-06, "loss": 0.482379674911499, "mean_token_accuracy": 0.8305231332778931, "num_tokens": 4791899.0, "step": 562 }, { "epoch": 0.4278115501519757, "grad_norm": 1.6289424896240234, "learning_rate": 4.884659473587213e-06, "loss": 0.5225020051002502, "mean_token_accuracy": 0.8188822865486145, "num_tokens": 4807666.0, "step": 563 }, { "epoch": 0.42857142857142855, "grad_norm": 2.3250341415405273, "learning_rate": 4.884029816302441e-06, "loss": 0.48849189281463623, "mean_token_accuracy": 0.8139042854309082, "num_tokens": 4813566.0, "step": 564 }, { "epoch": 0.42933130699088146, "grad_norm": 1.750071406364441, "learning_rate": 4.883398485813772e-06, "loss": 0.44057148694992065, "mean_token_accuracy": 0.8574005365371704, "num_tokens": 4822765.0, "step": 565 }, { "epoch": 0.43009118541033436, "grad_norm": 1.522481083869934, "learning_rate": 4.8827654825642984e-06, "loss": 0.4549819827079773, "mean_token_accuracy": 0.8268336057662964, "num_tokens": 4835230.0, "step": 566 }, { "epoch": 0.4308510638297872, "grad_norm": 1.2659013271331787, "learning_rate": 4.882130806998287e-06, "loss": 0.44574666023254395, "mean_token_accuracy": 0.8060784339904785, "num_tokens": 4851748.0, "step": 567 }, { "epoch": 0.4316109422492401, "grad_norm": 1.9666274785995483, "learning_rate": 4.881494459561177e-06, "loss": 0.553135871887207, "mean_token_accuracy": 0.8108686208724976, "num_tokens": 4860511.0, "step": 568 }, { "epoch": 0.43237082066869303, "grad_norm": 1.1471658945083618, "learning_rate": 4.880856440699582e-06, "loss": 0.373235821723938, "mean_token_accuracy": 0.8662744760513306, "num_tokens": 4881952.0, "step": 569 }, { "epoch": 0.4331306990881459, "grad_norm": 1.6941676139831543, "learning_rate": 4.880216750861288e-06, "loss": 0.536140501499176, "mean_token_accuracy": 0.8076417446136475, "num_tokens": 4893817.0, "step": 570 }, { "epoch": 0.4338905775075988, "grad_norm": 1.8708007335662842, "learning_rate": 4.879575390495254e-06, "loss": 0.37522369623184204, "mean_token_accuracy": 0.8634734749794006, "num_tokens": 4900702.0, "step": 571 }, { "epoch": 0.43465045592705165, "grad_norm": 3.032951593399048, "learning_rate": 4.878932360051611e-06, "loss": 0.5705235004425049, "mean_token_accuracy": 0.822039008140564, "num_tokens": 4905360.0, "step": 572 }, { "epoch": 0.43541033434650456, "grad_norm": 2.2931487560272217, "learning_rate": 4.878287659981663e-06, "loss": 0.4667856693267822, "mean_token_accuracy": 0.8671571612358093, "num_tokens": 4911258.0, "step": 573 }, { "epoch": 0.43617021276595747, "grad_norm": 1.5880272388458252, "learning_rate": 4.8776412907378845e-06, "loss": 0.5333184003829956, "mean_token_accuracy": 0.8448289632797241, "num_tokens": 4929347.0, "step": 574 }, { "epoch": 0.4369300911854103, "grad_norm": 1.7592260837554932, "learning_rate": 4.876993252773923e-06, "loss": 0.42056071758270264, "mean_token_accuracy": 0.8493223190307617, "num_tokens": 4937999.0, "step": 575 }, { "epoch": 0.4376899696048632, "grad_norm": 1.3128409385681152, "learning_rate": 4.876343546544596e-06, "loss": 0.43090301752090454, "mean_token_accuracy": 0.8431683778762817, "num_tokens": 4951924.0, "step": 576 }, { "epoch": 0.43844984802431614, "grad_norm": 2.2033936977386475, "learning_rate": 4.8756921725058935e-06, "loss": 0.5168547630310059, "mean_token_accuracy": 0.82608562707901, "num_tokens": 4960473.0, "step": 577 }, { "epoch": 0.439209726443769, "grad_norm": 1.54935622215271, "learning_rate": 4.875039131114975e-06, "loss": 0.3442407250404358, "mean_token_accuracy": 0.8524424433708191, "num_tokens": 4970205.0, "step": 578 }, { "epoch": 0.4399696048632219, "grad_norm": 1.621962070465088, "learning_rate": 4.8743844228301676e-06, "loss": 0.4696016311645508, "mean_token_accuracy": 0.8362908363342285, "num_tokens": 4981997.0, "step": 579 }, { "epoch": 0.44072948328267475, "grad_norm": 1.776847004890442, "learning_rate": 4.873728048110973e-06, "loss": 0.5784905552864075, "mean_token_accuracy": 0.7997252941131592, "num_tokens": 4996539.0, "step": 580 }, { "epoch": 0.44148936170212766, "grad_norm": 2.043445587158203, "learning_rate": 4.873070007418059e-06, "loss": 0.49215561151504517, "mean_token_accuracy": 0.8145220279693604, "num_tokens": 5005113.0, "step": 581 }, { "epoch": 0.44224924012158057, "grad_norm": 1.3814793825149536, "learning_rate": 4.872410301213265e-06, "loss": 0.47962701320648193, "mean_token_accuracy": 0.8382116556167603, "num_tokens": 5022068.0, "step": 582 }, { "epoch": 0.4430091185410334, "grad_norm": 1.8251920938491821, "learning_rate": 4.871748929959598e-06, "loss": 0.34540295600891113, "mean_token_accuracy": 0.8747892379760742, "num_tokens": 5031947.0, "step": 583 }, { "epoch": 0.44376899696048633, "grad_norm": 1.7698776721954346, "learning_rate": 4.871085894121234e-06, "loss": 0.5506308078765869, "mean_token_accuracy": 0.8076712489128113, "num_tokens": 5046125.0, "step": 584 }, { "epoch": 0.44452887537993924, "grad_norm": 2.159858226776123, "learning_rate": 4.870421194163515e-06, "loss": 0.4155213236808777, "mean_token_accuracy": 0.8612204194068909, "num_tokens": 5050994.0, "step": 585 }, { "epoch": 0.4452887537993921, "grad_norm": 2.544252634048462, "learning_rate": 4.869754830552956e-06, "loss": 0.4318983554840088, "mean_token_accuracy": 0.853102445602417, "num_tokens": 5055745.0, "step": 586 }, { "epoch": 0.446048632218845, "grad_norm": 2.085960626602173, "learning_rate": 4.869086803757235e-06, "loss": 0.5072901248931885, "mean_token_accuracy": 0.8226085305213928, "num_tokens": 5062649.0, "step": 587 }, { "epoch": 0.44680851063829785, "grad_norm": 2.9260871410369873, "learning_rate": 4.868417114245199e-06, "loss": 0.5853766798973083, "mean_token_accuracy": 0.8455843329429626, "num_tokens": 5067887.0, "step": 588 }, { "epoch": 0.44756838905775076, "grad_norm": 1.7980996370315552, "learning_rate": 4.867745762486862e-06, "loss": 0.496432900428772, "mean_token_accuracy": 0.8240994811058044, "num_tokens": 5077445.0, "step": 589 }, { "epoch": 0.44832826747720367, "grad_norm": 1.5280243158340454, "learning_rate": 4.8670727489534035e-06, "loss": 0.4973749816417694, "mean_token_accuracy": 0.8444130420684814, "num_tokens": 5090569.0, "step": 590 }, { "epoch": 0.4490881458966565, "grad_norm": 2.85929012298584, "learning_rate": 4.866398074117173e-06, "loss": 0.3670230507850647, "mean_token_accuracy": 0.8688805103302002, "num_tokens": 5093853.0, "step": 591 }, { "epoch": 0.44984802431610943, "grad_norm": 2.1087558269500732, "learning_rate": 4.86572173845168e-06, "loss": 0.5709018707275391, "mean_token_accuracy": 0.8056541085243225, "num_tokens": 5102248.0, "step": 592 }, { "epoch": 0.4506079027355623, "grad_norm": 2.267425537109375, "learning_rate": 4.865043742431605e-06, "loss": 0.5483532547950745, "mean_token_accuracy": 0.8169167637825012, "num_tokens": 5110453.0, "step": 593 }, { "epoch": 0.4513677811550152, "grad_norm": 1.7768651247024536, "learning_rate": 4.864364086532792e-06, "loss": 0.4660247564315796, "mean_token_accuracy": 0.8425134420394897, "num_tokens": 5122417.0, "step": 594 }, { "epoch": 0.4521276595744681, "grad_norm": 1.4207671880722046, "learning_rate": 4.863682771232249e-06, "loss": 0.44873684644699097, "mean_token_accuracy": 0.8296250104904175, "num_tokens": 5137626.0, "step": 595 }, { "epoch": 0.45288753799392095, "grad_norm": 1.976282000541687, "learning_rate": 4.862999797008149e-06, "loss": 0.5604327321052551, "mean_token_accuracy": 0.8158591985702515, "num_tokens": 5147887.0, "step": 596 }, { "epoch": 0.45364741641337386, "grad_norm": 3.4956934452056885, "learning_rate": 4.862315164339829e-06, "loss": 0.4131582975387573, "mean_token_accuracy": 0.852203905582428, "num_tokens": 5150992.0, "step": 597 }, { "epoch": 0.45440729483282677, "grad_norm": 3.1881635189056396, "learning_rate": 4.861628873707792e-06, "loss": 0.6443498134613037, "mean_token_accuracy": 0.7832992672920227, "num_tokens": 5154764.0, "step": 598 }, { "epoch": 0.4551671732522796, "grad_norm": 2.0504488945007324, "learning_rate": 4.860940925593703e-06, "loss": 0.4550248384475708, "mean_token_accuracy": 0.8533322811126709, "num_tokens": 5162569.0, "step": 599 }, { "epoch": 0.45592705167173253, "grad_norm": 3.422691822052002, "learning_rate": 4.86025132048039e-06, "loss": 0.48267173767089844, "mean_token_accuracy": 0.8282521963119507, "num_tokens": 5167056.0, "step": 600 }, { "epoch": 0.4566869300911854, "grad_norm": 1.7003215551376343, "learning_rate": 4.859560058851844e-06, "loss": 0.4646824300289154, "mean_token_accuracy": 0.8488043546676636, "num_tokens": 5177752.0, "step": 601 }, { "epoch": 0.4574468085106383, "grad_norm": 3.094937324523926, "learning_rate": 4.8588671411932195e-06, "loss": 0.47665974497795105, "mean_token_accuracy": 0.822475790977478, "num_tokens": 5181059.0, "step": 602 }, { "epoch": 0.4582066869300912, "grad_norm": 2.6230075359344482, "learning_rate": 4.858172567990832e-06, "loss": 0.5276778936386108, "mean_token_accuracy": 0.8277568817138672, "num_tokens": 5186541.0, "step": 603 }, { "epoch": 0.45896656534954405, "grad_norm": 2.0146803855895996, "learning_rate": 4.857476339732162e-06, "loss": 0.41573643684387207, "mean_token_accuracy": 0.8459112644195557, "num_tokens": 5193208.0, "step": 604 }, { "epoch": 0.45972644376899696, "grad_norm": 2.2832047939300537, "learning_rate": 4.856778456905846e-06, "loss": 0.44260644912719727, "mean_token_accuracy": 0.837356448173523, "num_tokens": 5198558.0, "step": 605 }, { "epoch": 0.46048632218844987, "grad_norm": 2.215421676635742, "learning_rate": 4.856078920001689e-06, "loss": 0.541540265083313, "mean_token_accuracy": 0.8139248490333557, "num_tokens": 5204515.0, "step": 606 }, { "epoch": 0.4612462006079027, "grad_norm": 2.1481690406799316, "learning_rate": 4.855377729510648e-06, "loss": 0.5875097513198853, "mean_token_accuracy": 0.8083330988883972, "num_tokens": 5212099.0, "step": 607 }, { "epoch": 0.46200607902735563, "grad_norm": 2.555629253387451, "learning_rate": 4.8546748859248504e-06, "loss": 0.6058074235916138, "mean_token_accuracy": 0.7870136499404907, "num_tokens": 5217974.0, "step": 608 }, { "epoch": 0.4627659574468085, "grad_norm": 2.7553253173828125, "learning_rate": 4.853970389737576e-06, "loss": 0.2978392541408539, "mean_token_accuracy": 0.8897851705551147, "num_tokens": 5221306.0, "step": 609 }, { "epoch": 0.4635258358662614, "grad_norm": 2.716369390487671, "learning_rate": 4.8532642414432675e-06, "loss": 0.6190924644470215, "mean_token_accuracy": 0.7923913598060608, "num_tokens": 5226958.0, "step": 610 }, { "epoch": 0.4642857142857143, "grad_norm": 1.7936944961547852, "learning_rate": 4.852556441537528e-06, "loss": 0.33577677607536316, "mean_token_accuracy": 0.8650237917900085, "num_tokens": 5234392.0, "step": 611 }, { "epoch": 0.46504559270516715, "grad_norm": 1.6506803035736084, "learning_rate": 4.851846990517118e-06, "loss": 0.5909202098846436, "mean_token_accuracy": 0.7970037460327148, "num_tokens": 5247013.0, "step": 612 }, { "epoch": 0.46580547112462006, "grad_norm": 1.8104954957962036, "learning_rate": 4.851135888879958e-06, "loss": 0.432063490152359, "mean_token_accuracy": 0.8540003299713135, "num_tokens": 5256835.0, "step": 613 }, { "epoch": 0.46656534954407297, "grad_norm": 2.3342862129211426, "learning_rate": 4.850423137125126e-06, "loss": 0.5327414870262146, "mean_token_accuracy": 0.8305746912956238, "num_tokens": 5264369.0, "step": 614 }, { "epoch": 0.4673252279635258, "grad_norm": 2.3439366817474365, "learning_rate": 4.8497087357528585e-06, "loss": 0.6383283138275146, "mean_token_accuracy": 0.8068422079086304, "num_tokens": 5273232.0, "step": 615 }, { "epoch": 0.46808510638297873, "grad_norm": 2.5633602142333984, "learning_rate": 4.8489926852645505e-06, "loss": 0.43189486861228943, "mean_token_accuracy": 0.8446429371833801, "num_tokens": 5278349.0, "step": 616 }, { "epoch": 0.4688449848024316, "grad_norm": 1.5836342573165894, "learning_rate": 4.848274986162754e-06, "loss": 0.4738016128540039, "mean_token_accuracy": 0.8230966329574585, "num_tokens": 5292550.0, "step": 617 }, { "epoch": 0.4696048632218845, "grad_norm": 2.2851970195770264, "learning_rate": 4.847555638951177e-06, "loss": 0.4870763123035431, "mean_token_accuracy": 0.8352062702178955, "num_tokens": 5299266.0, "step": 618 }, { "epoch": 0.4703647416413374, "grad_norm": 1.6537147760391235, "learning_rate": 4.846834644134686e-06, "loss": 0.4185401201248169, "mean_token_accuracy": 0.8508622646331787, "num_tokens": 5309226.0, "step": 619 }, { "epoch": 0.47112462006079026, "grad_norm": 2.48115873336792, "learning_rate": 4.846112002219301e-06, "loss": 0.5209293365478516, "mean_token_accuracy": 0.8153672218322754, "num_tokens": 5315891.0, "step": 620 }, { "epoch": 0.47188449848024316, "grad_norm": 2.528759002685547, "learning_rate": 4.845387713712203e-06, "loss": 0.4290841817855835, "mean_token_accuracy": 0.8516919612884521, "num_tokens": 5320410.0, "step": 621 }, { "epoch": 0.4726443768996961, "grad_norm": 1.7299134731292725, "learning_rate": 4.844661779121723e-06, "loss": 0.5427767634391785, "mean_token_accuracy": 0.809242844581604, "num_tokens": 5333369.0, "step": 622 }, { "epoch": 0.4734042553191489, "grad_norm": 2.600541353225708, "learning_rate": 4.843934198957351e-06, "loss": 0.5924317836761475, "mean_token_accuracy": 0.834161102771759, "num_tokens": 5338959.0, "step": 623 }, { "epoch": 0.47416413373860183, "grad_norm": 2.5299830436706543, "learning_rate": 4.84320497372973e-06, "loss": 0.5819499492645264, "mean_token_accuracy": 0.7921682596206665, "num_tokens": 5345088.0, "step": 624 }, { "epoch": 0.4749240121580547, "grad_norm": 2.7185213565826416, "learning_rate": 4.842474103950658e-06, "loss": 0.3976823687553406, "mean_token_accuracy": 0.8676345348358154, "num_tokens": 5349649.0, "step": 625 }, { "epoch": 0.4756838905775076, "grad_norm": 3.488968849182129, "learning_rate": 4.841741590133089e-06, "loss": 0.6358038783073425, "mean_token_accuracy": 0.8013798594474792, "num_tokens": 5353806.0, "step": 626 }, { "epoch": 0.4764437689969605, "grad_norm": 2.1854147911071777, "learning_rate": 4.841007432791129e-06, "loss": 0.4694806635379791, "mean_token_accuracy": 0.8429578542709351, "num_tokens": 5359814.0, "step": 627 }, { "epoch": 0.47720364741641336, "grad_norm": 2.2328732013702393, "learning_rate": 4.8402716324400375e-06, "loss": 0.3461613953113556, "mean_token_accuracy": 0.8803620338439941, "num_tokens": 5365183.0, "step": 628 }, { "epoch": 0.47796352583586627, "grad_norm": 1.5411367416381836, "learning_rate": 4.839534189596228e-06, "loss": 0.3935459554195404, "mean_token_accuracy": 0.8545909523963928, "num_tokens": 5375512.0, "step": 629 }, { "epoch": 0.4787234042553192, "grad_norm": 2.197303533554077, "learning_rate": 4.8387951047772656e-06, "loss": 0.4462829828262329, "mean_token_accuracy": 0.8534374237060547, "num_tokens": 5381628.0, "step": 630 }, { "epoch": 0.479483282674772, "grad_norm": 1.4957845211029053, "learning_rate": 4.838054378501868e-06, "loss": 0.45892927050590515, "mean_token_accuracy": 0.8348727822303772, "num_tokens": 5394563.0, "step": 631 }, { "epoch": 0.48024316109422494, "grad_norm": 1.5337835550308228, "learning_rate": 4.837312011289907e-06, "loss": 0.4064037799835205, "mean_token_accuracy": 0.8595039248466492, "num_tokens": 5406922.0, "step": 632 }, { "epoch": 0.4810030395136778, "grad_norm": 3.764256238937378, "learning_rate": 4.836568003662403e-06, "loss": 0.4333711564540863, "mean_token_accuracy": 0.8405383825302124, "num_tokens": 5409256.0, "step": 633 }, { "epoch": 0.4817629179331307, "grad_norm": 1.2404766082763672, "learning_rate": 4.8358223561415304e-06, "loss": 0.3710271120071411, "mean_token_accuracy": 0.8664895296096802, "num_tokens": 5424772.0, "step": 634 }, { "epoch": 0.4825227963525836, "grad_norm": 1.9933364391326904, "learning_rate": 4.835075069250613e-06, "loss": 0.389653742313385, "mean_token_accuracy": 0.8555202484130859, "num_tokens": 5431807.0, "step": 635 }, { "epoch": 0.48328267477203646, "grad_norm": 1.4354065656661987, "learning_rate": 4.8343261435141245e-06, "loss": 0.45983028411865234, "mean_token_accuracy": 0.8382909297943115, "num_tokens": 5448859.0, "step": 636 }, { "epoch": 0.48404255319148937, "grad_norm": 1.6717485189437866, "learning_rate": 4.833575579457691e-06, "loss": 0.35919392108917236, "mean_token_accuracy": 0.8889127969741821, "num_tokens": 5456548.0, "step": 637 }, { "epoch": 0.4848024316109423, "grad_norm": 1.7048540115356445, "learning_rate": 4.832823377608088e-06, "loss": 0.4005333483219147, "mean_token_accuracy": 0.86216139793396, "num_tokens": 5468141.0, "step": 638 }, { "epoch": 0.48556231003039513, "grad_norm": 1.9860360622406006, "learning_rate": 4.832069538493237e-06, "loss": 0.3747888207435608, "mean_token_accuracy": 0.8657118082046509, "num_tokens": 5474833.0, "step": 639 }, { "epoch": 0.48632218844984804, "grad_norm": 1.6138876676559448, "learning_rate": 4.831314062642213e-06, "loss": 0.47836577892303467, "mean_token_accuracy": 0.8378514051437378, "num_tokens": 5486456.0, "step": 640 }, { "epoch": 0.4870820668693009, "grad_norm": 1.9518992900848389, "learning_rate": 4.830556950585239e-06, "loss": 0.40961670875549316, "mean_token_accuracy": 0.8537701368331909, "num_tokens": 5493926.0, "step": 641 }, { "epoch": 0.4878419452887538, "grad_norm": 2.9804043769836426, "learning_rate": 4.829798202853683e-06, "loss": 0.5683507323265076, "mean_token_accuracy": 0.8166797161102295, "num_tokens": 5498797.0, "step": 642 }, { "epoch": 0.4886018237082067, "grad_norm": 1.910904049873352, "learning_rate": 4.829037819980065e-06, "loss": 0.4286502003669739, "mean_token_accuracy": 0.8568353652954102, "num_tokens": 5506365.0, "step": 643 }, { "epoch": 0.48936170212765956, "grad_norm": 2.3651235103607178, "learning_rate": 4.828275802498051e-06, "loss": 0.4485647976398468, "mean_token_accuracy": 0.8471283316612244, "num_tokens": 5511944.0, "step": 644 }, { "epoch": 0.49012158054711247, "grad_norm": 1.9755531549453735, "learning_rate": 4.827512150942454e-06, "loss": 0.4045594334602356, "mean_token_accuracy": 0.8545803427696228, "num_tokens": 5520331.0, "step": 645 }, { "epoch": 0.4908814589665654, "grad_norm": 1.8841909170150757, "learning_rate": 4.8267468658492335e-06, "loss": 0.4773695468902588, "mean_token_accuracy": 0.8486886024475098, "num_tokens": 5528605.0, "step": 646 }, { "epoch": 0.49164133738601823, "grad_norm": 1.8051406145095825, "learning_rate": 4.825979947755496e-06, "loss": 0.5543516278266907, "mean_token_accuracy": 0.7976117134094238, "num_tokens": 5540464.0, "step": 647 }, { "epoch": 0.49240121580547114, "grad_norm": 3.1752281188964844, "learning_rate": 4.8252113971994955e-06, "loss": 0.5746598243713379, "mean_token_accuracy": 0.825099766254425, "num_tokens": 5546292.0, "step": 648 }, { "epoch": 0.493161094224924, "grad_norm": 2.9637610912323, "learning_rate": 4.824441214720629e-06, "loss": 0.40092965960502625, "mean_token_accuracy": 0.8745595812797546, "num_tokens": 5549430.0, "step": 649 }, { "epoch": 0.4939209726443769, "grad_norm": 2.0234932899475098, "learning_rate": 4.823669400859441e-06, "loss": 0.579008162021637, "mean_token_accuracy": 0.8241140842437744, "num_tokens": 5557865.0, "step": 650 }, { "epoch": 0.4946808510638298, "grad_norm": 1.1711043119430542, "learning_rate": 4.8228959561576195e-06, "loss": 0.40490004420280457, "mean_token_accuracy": 0.8479105234146118, "num_tokens": 5577337.0, "step": 651 }, { "epoch": 0.49544072948328266, "grad_norm": 1.9951932430267334, "learning_rate": 4.822120881157998e-06, "loss": 0.4962886571884155, "mean_token_accuracy": 0.8239437341690063, "num_tokens": 5586382.0, "step": 652 }, { "epoch": 0.49620060790273557, "grad_norm": 3.4275267124176025, "learning_rate": 4.821344176404554e-06, "loss": 0.4348936080932617, "mean_token_accuracy": 0.8483643531799316, "num_tokens": 5589227.0, "step": 653 }, { "epoch": 0.4969604863221885, "grad_norm": 3.089677095413208, "learning_rate": 4.820565842442408e-06, "loss": 0.4969814419746399, "mean_token_accuracy": 0.820526123046875, "num_tokens": 5593318.0, "step": 654 }, { "epoch": 0.49772036474164133, "grad_norm": 2.433255434036255, "learning_rate": 4.819785879817827e-06, "loss": 0.49912628531455994, "mean_token_accuracy": 0.8462561368942261, "num_tokens": 5598313.0, "step": 655 }, { "epoch": 0.49848024316109424, "grad_norm": 2.3052635192871094, "learning_rate": 4.819004289078217e-06, "loss": 0.5517863035202026, "mean_token_accuracy": 0.8038468360900879, "num_tokens": 5604827.0, "step": 656 }, { "epoch": 0.4992401215805471, "grad_norm": 2.087714672088623, "learning_rate": 4.818221070772129e-06, "loss": 0.5179933309555054, "mean_token_accuracy": 0.8126649260520935, "num_tokens": 5612345.0, "step": 657 }, { "epoch": 0.5, "grad_norm": 1.5718315839767456, "learning_rate": 4.8174362254492555e-06, "loss": 0.49791502952575684, "mean_token_accuracy": 0.8147987127304077, "num_tokens": 5624688.0, "step": 658 }, { "epoch": 0.5007598784194529, "grad_norm": 2.023894786834717, "learning_rate": 4.816649753660431e-06, "loss": 0.3900100290775299, "mean_token_accuracy": 0.8653963804244995, "num_tokens": 5630774.0, "step": 659 }, { "epoch": 0.5015197568389058, "grad_norm": 3.1195034980773926, "learning_rate": 4.815861655957632e-06, "loss": 0.38084471225738525, "mean_token_accuracy": 0.8551889657974243, "num_tokens": 5634806.0, "step": 660 }, { "epoch": 0.5022796352583586, "grad_norm": 1.1556870937347412, "learning_rate": 4.815071932893976e-06, "loss": 0.41517752408981323, "mean_token_accuracy": 0.8434287905693054, "num_tokens": 5652308.0, "step": 661 }, { "epoch": 0.5030395136778115, "grad_norm": 1.3827934265136719, "learning_rate": 4.81428058502372e-06, "loss": 0.5281240940093994, "mean_token_accuracy": 0.8140045404434204, "num_tokens": 5670572.0, "step": 662 }, { "epoch": 0.5037993920972644, "grad_norm": 1.8974567651748657, "learning_rate": 4.813487612902265e-06, "loss": 0.5226365923881531, "mean_token_accuracy": 0.837066650390625, "num_tokens": 5679684.0, "step": 663 }, { "epoch": 0.5045592705167173, "grad_norm": 2.4013209342956543, "learning_rate": 4.812693017086145e-06, "loss": 0.470547616481781, "mean_token_accuracy": 0.821993350982666, "num_tokens": 5685769.0, "step": 664 }, { "epoch": 0.5053191489361702, "grad_norm": 1.8950684070587158, "learning_rate": 4.811896798133042e-06, "loss": 0.5226761102676392, "mean_token_accuracy": 0.8077384233474731, "num_tokens": 5696182.0, "step": 665 }, { "epoch": 0.506079027355623, "grad_norm": 2.1453075408935547, "learning_rate": 4.811098956601772e-06, "loss": 0.4317760765552521, "mean_token_accuracy": 0.8489662408828735, "num_tokens": 5702487.0, "step": 666 }, { "epoch": 0.506838905775076, "grad_norm": 1.4667294025421143, "learning_rate": 4.810299493052289e-06, "loss": 0.39067623019218445, "mean_token_accuracy": 0.8580642938613892, "num_tokens": 5714003.0, "step": 667 }, { "epoch": 0.5075987841945289, "grad_norm": 2.8400089740753174, "learning_rate": 4.809498408045691e-06, "loss": 0.4783066511154175, "mean_token_accuracy": 0.8334233164787292, "num_tokens": 5718153.0, "step": 668 }, { "epoch": 0.5083586626139818, "grad_norm": 1.5703561305999756, "learning_rate": 4.808695702144206e-06, "loss": 0.4625510573387146, "mean_token_accuracy": 0.8410191535949707, "num_tokens": 5730026.0, "step": 669 }, { "epoch": 0.5091185410334347, "grad_norm": 1.2351874113082886, "learning_rate": 4.807891375911207e-06, "loss": 0.37267962098121643, "mean_token_accuracy": 0.8449434041976929, "num_tokens": 5745666.0, "step": 670 }, { "epoch": 0.5098784194528876, "grad_norm": 2.6481969356536865, "learning_rate": 4.8070854299112e-06, "loss": 0.6104316711425781, "mean_token_accuracy": 0.8011107444763184, "num_tokens": 5751746.0, "step": 671 }, { "epoch": 0.5106382978723404, "grad_norm": 2.7330830097198486, "learning_rate": 4.806277864709828e-06, "loss": 0.5459984540939331, "mean_token_accuracy": 0.814912736415863, "num_tokens": 5756602.0, "step": 672 }, { "epoch": 0.5113981762917933, "grad_norm": 2.625068187713623, "learning_rate": 4.805468680873874e-06, "loss": 0.4850236177444458, "mean_token_accuracy": 0.8303148150444031, "num_tokens": 5761466.0, "step": 673 }, { "epoch": 0.5121580547112462, "grad_norm": 2.8971197605133057, "learning_rate": 4.804657878971252e-06, "loss": 0.3686336874961853, "mean_token_accuracy": 0.8712918758392334, "num_tokens": 5764858.0, "step": 674 }, { "epoch": 0.5129179331306991, "grad_norm": 2.4213671684265137, "learning_rate": 4.803845459571014e-06, "loss": 0.4329441487789154, "mean_token_accuracy": 0.8345562219619751, "num_tokens": 5769167.0, "step": 675 }, { "epoch": 0.513677811550152, "grad_norm": 2.9091386795043945, "learning_rate": 4.803031423243349e-06, "loss": 0.5625981688499451, "mean_token_accuracy": 0.8446449041366577, "num_tokens": 5773945.0, "step": 676 }, { "epoch": 0.5144376899696048, "grad_norm": 1.6716400384902954, "learning_rate": 4.802215770559578e-06, "loss": 0.5100817680358887, "mean_token_accuracy": 0.8298100233078003, "num_tokens": 5785497.0, "step": 677 }, { "epoch": 0.5151975683890577, "grad_norm": 2.1554148197174072, "learning_rate": 4.801398502092156e-06, "loss": 0.4152962565422058, "mean_token_accuracy": 0.8570256233215332, "num_tokens": 5792592.0, "step": 678 }, { "epoch": 0.5159574468085106, "grad_norm": 2.435845136642456, "learning_rate": 4.800579618414677e-06, "loss": 0.4497343897819519, "mean_token_accuracy": 0.8423111438751221, "num_tokens": 5798486.0, "step": 679 }, { "epoch": 0.5167173252279635, "grad_norm": 1.989006519317627, "learning_rate": 4.799759120101861e-06, "loss": 0.5540581345558167, "mean_token_accuracy": 0.8367617726325989, "num_tokens": 5805441.0, "step": 680 }, { "epoch": 0.5174772036474165, "grad_norm": 1.5651880502700806, "learning_rate": 4.798937007729568e-06, "loss": 0.4808984398841858, "mean_token_accuracy": 0.8281278610229492, "num_tokens": 5819267.0, "step": 681 }, { "epoch": 0.5182370820668692, "grad_norm": 1.9812109470367432, "learning_rate": 4.798113281874788e-06, "loss": 0.4705607295036316, "mean_token_accuracy": 0.8278742432594299, "num_tokens": 5827812.0, "step": 682 }, { "epoch": 0.5189969604863222, "grad_norm": 1.6913940906524658, "learning_rate": 4.797287943115642e-06, "loss": 0.5208501815795898, "mean_token_accuracy": 0.8249417543411255, "num_tokens": 5839235.0, "step": 683 }, { "epoch": 0.5197568389057751, "grad_norm": 1.81809401512146, "learning_rate": 4.796460992031386e-06, "loss": 0.4658868908882141, "mean_token_accuracy": 0.8420040607452393, "num_tokens": 5848972.0, "step": 684 }, { "epoch": 0.520516717325228, "grad_norm": 2.1637492179870605, "learning_rate": 4.7956324292024045e-06, "loss": 0.5366767644882202, "mean_token_accuracy": 0.8110297918319702, "num_tokens": 5856875.0, "step": 685 }, { "epoch": 0.5212765957446809, "grad_norm": 2.5795931816101074, "learning_rate": 4.794802255210217e-06, "loss": 0.5069275498390198, "mean_token_accuracy": 0.8302167654037476, "num_tokens": 5861708.0, "step": 686 }, { "epoch": 0.5220364741641338, "grad_norm": 2.643785238265991, "learning_rate": 4.793970470637469e-06, "loss": 0.592145562171936, "mean_token_accuracy": 0.790871262550354, "num_tokens": 5868283.0, "step": 687 }, { "epoch": 0.5227963525835866, "grad_norm": 1.5663838386535645, "learning_rate": 4.7931370760679415e-06, "loss": 0.4538711905479431, "mean_token_accuracy": 0.8419860005378723, "num_tokens": 5878884.0, "step": 688 }, { "epoch": 0.5235562310030395, "grad_norm": 2.236764669418335, "learning_rate": 4.792302072086542e-06, "loss": 0.5106793642044067, "mean_token_accuracy": 0.8331024646759033, "num_tokens": 5886047.0, "step": 689 }, { "epoch": 0.5243161094224924, "grad_norm": 2.964491128921509, "learning_rate": 4.7914654592793065e-06, "loss": 0.4634789824485779, "mean_token_accuracy": 0.8439992666244507, "num_tokens": 5889693.0, "step": 690 }, { "epoch": 0.5250759878419453, "grad_norm": 1.6666783094406128, "learning_rate": 4.790627238233405e-06, "loss": 0.3990614414215088, "mean_token_accuracy": 0.853827953338623, "num_tokens": 5898664.0, "step": 691 }, { "epoch": 0.5258358662613982, "grad_norm": 2.42938494682312, "learning_rate": 4.789787409537131e-06, "loss": 0.5084211230278015, "mean_token_accuracy": 0.8411228060722351, "num_tokens": 5905294.0, "step": 692 }, { "epoch": 0.526595744680851, "grad_norm": 1.7794736623764038, "learning_rate": 4.7889459737799105e-06, "loss": 0.43089574575424194, "mean_token_accuracy": 0.8505694270133972, "num_tokens": 5914131.0, "step": 693 }, { "epoch": 0.5273556231003039, "grad_norm": 2.302791118621826, "learning_rate": 4.788102931552294e-06, "loss": 0.5151532292366028, "mean_token_accuracy": 0.8154581785202026, "num_tokens": 5919937.0, "step": 694 }, { "epoch": 0.5281155015197568, "grad_norm": 2.371701717376709, "learning_rate": 4.787258283445962e-06, "loss": 0.3740626573562622, "mean_token_accuracy": 0.8779724836349487, "num_tokens": 5924778.0, "step": 695 }, { "epoch": 0.5288753799392097, "grad_norm": 2.1565825939178467, "learning_rate": 4.786412030053721e-06, "loss": 0.4572535455226898, "mean_token_accuracy": 0.8557048439979553, "num_tokens": 5931744.0, "step": 696 }, { "epoch": 0.5296352583586627, "grad_norm": 1.9793959856033325, "learning_rate": 4.785564171969503e-06, "loss": 0.46432819962501526, "mean_token_accuracy": 0.8585522174835205, "num_tokens": 5942595.0, "step": 697 }, { "epoch": 0.5303951367781155, "grad_norm": 2.6405293941497803, "learning_rate": 4.784714709788368e-06, "loss": 0.577790379524231, "mean_token_accuracy": 0.8005146980285645, "num_tokens": 5947159.0, "step": 698 }, { "epoch": 0.5311550151975684, "grad_norm": 1.6467243432998657, "learning_rate": 4.783863644106502e-06, "loss": 0.378523588180542, "mean_token_accuracy": 0.8665074706077576, "num_tokens": 5955503.0, "step": 699 }, { "epoch": 0.5319148936170213, "grad_norm": 1.6351271867752075, "learning_rate": 4.783010975521216e-06, "loss": 0.42142003774642944, "mean_token_accuracy": 0.8479681611061096, "num_tokens": 5965049.0, "step": 700 }, { "epoch": 0.5326747720364742, "grad_norm": 1.679734706878662, "learning_rate": 4.782156704630944e-06, "loss": 0.42243868112564087, "mean_token_accuracy": 0.8485743999481201, "num_tokens": 5975572.0, "step": 701 }, { "epoch": 0.5334346504559271, "grad_norm": 1.7572410106658936, "learning_rate": 4.7813008320352475e-06, "loss": 0.3124234080314636, "mean_token_accuracy": 0.8955981135368347, "num_tokens": 5982331.0, "step": 702 }, { "epoch": 0.53419452887538, "grad_norm": 2.038093328475952, "learning_rate": 4.78044335833481e-06, "loss": 0.3492271602153778, "mean_token_accuracy": 0.8749583959579468, "num_tokens": 5988044.0, "step": 703 }, { "epoch": 0.5349544072948328, "grad_norm": 1.502953290939331, "learning_rate": 4.77958428413144e-06, "loss": 0.45608213543891907, "mean_token_accuracy": 0.8436861634254456, "num_tokens": 5999360.0, "step": 704 }, { "epoch": 0.5357142857142857, "grad_norm": 1.2919143438339233, "learning_rate": 4.7787236100280685e-06, "loss": 0.35781624913215637, "mean_token_accuracy": 0.857500433921814, "num_tokens": 6014319.0, "step": 705 }, { "epoch": 0.5364741641337386, "grad_norm": 1.4746363162994385, "learning_rate": 4.777861336628751e-06, "loss": 0.4553064703941345, "mean_token_accuracy": 0.8644794821739197, "num_tokens": 6032012.0, "step": 706 }, { "epoch": 0.5372340425531915, "grad_norm": 1.2029075622558594, "learning_rate": 4.7769974645386616e-06, "loss": 0.36050671339035034, "mean_token_accuracy": 0.8742889165878296, "num_tokens": 6053982.0, "step": 707 }, { "epoch": 0.5379939209726444, "grad_norm": 1.7229902744293213, "learning_rate": 4.776131994364102e-06, "loss": 0.4012393653392792, "mean_token_accuracy": 0.8494868278503418, "num_tokens": 6062608.0, "step": 708 }, { "epoch": 0.5387537993920972, "grad_norm": 1.6619466543197632, "learning_rate": 4.775264926712489e-06, "loss": 0.5714209675788879, "mean_token_accuracy": 0.8079773187637329, "num_tokens": 6074800.0, "step": 709 }, { "epoch": 0.5395136778115501, "grad_norm": 1.8739644289016724, "learning_rate": 4.774396262192368e-06, "loss": 0.5224334001541138, "mean_token_accuracy": 0.8226214647293091, "num_tokens": 6084801.0, "step": 710 }, { "epoch": 0.540273556231003, "grad_norm": 1.7326252460479736, "learning_rate": 4.7735260014133986e-06, "loss": 0.4534417390823364, "mean_token_accuracy": 0.85335773229599, "num_tokens": 6095510.0, "step": 711 }, { "epoch": 0.541033434650456, "grad_norm": 1.5122230052947998, "learning_rate": 4.772654144986364e-06, "loss": 0.361116886138916, "mean_token_accuracy": 0.8660293817520142, "num_tokens": 6106354.0, "step": 712 }, { "epoch": 0.5417933130699089, "grad_norm": 2.6396396160125732, "learning_rate": 4.7717806935231665e-06, "loss": 0.3782613277435303, "mean_token_accuracy": 0.8628933429718018, "num_tokens": 6110425.0, "step": 713 }, { "epoch": 0.5425531914893617, "grad_norm": 1.488351583480835, "learning_rate": 4.770905647636828e-06, "loss": 0.5653847455978394, "mean_token_accuracy": 0.7892078161239624, "num_tokens": 6126792.0, "step": 714 }, { "epoch": 0.5433130699088146, "grad_norm": 2.223890781402588, "learning_rate": 4.77002900794149e-06, "loss": 0.5461349487304688, "mean_token_accuracy": 0.805544376373291, "num_tokens": 6134544.0, "step": 715 }, { "epoch": 0.5440729483282675, "grad_norm": 2.103342056274414, "learning_rate": 4.769150775052411e-06, "loss": 0.5122923254966736, "mean_token_accuracy": 0.8244349360466003, "num_tokens": 6141102.0, "step": 716 }, { "epoch": 0.5448328267477204, "grad_norm": 3.34637451171875, "learning_rate": 4.768270949585968e-06, "loss": 0.6029807329177856, "mean_token_accuracy": 0.7977179884910583, "num_tokens": 6145033.0, "step": 717 }, { "epoch": 0.5455927051671733, "grad_norm": 2.35310959815979, "learning_rate": 4.767389532159659e-06, "loss": 0.36193016171455383, "mean_token_accuracy": 0.8708676099777222, "num_tokens": 6149618.0, "step": 718 }, { "epoch": 0.5463525835866262, "grad_norm": 2.06655216217041, "learning_rate": 4.766506523392095e-06, "loss": 0.3700219690799713, "mean_token_accuracy": 0.8714411854743958, "num_tokens": 6155510.0, "step": 719 }, { "epoch": 0.547112462006079, "grad_norm": 1.1178799867630005, "learning_rate": 4.765621923903005e-06, "loss": 0.44793474674224854, "mean_token_accuracy": 0.8378269672393799, "num_tokens": 6178674.0, "step": 720 }, { "epoch": 0.5478723404255319, "grad_norm": 3.2518749237060547, "learning_rate": 4.764735734313236e-06, "loss": 0.4045289158821106, "mean_token_accuracy": 0.8469523191452026, "num_tokens": 6183091.0, "step": 721 }, { "epoch": 0.5486322188449848, "grad_norm": 2.1426641941070557, "learning_rate": 4.763847955244749e-06, "loss": 0.5390356779098511, "mean_token_accuracy": 0.819678783416748, "num_tokens": 6190703.0, "step": 722 }, { "epoch": 0.5493920972644377, "grad_norm": 2.606250762939453, "learning_rate": 4.762958587320623e-06, "loss": 0.526504635810852, "mean_token_accuracy": 0.8235678672790527, "num_tokens": 6196950.0, "step": 723 }, { "epoch": 0.5501519756838906, "grad_norm": 1.9263027906417847, "learning_rate": 4.762067631165049e-06, "loss": 0.48520591855049133, "mean_token_accuracy": 0.8337453603744507, "num_tokens": 6205760.0, "step": 724 }, { "epoch": 0.5509118541033434, "grad_norm": 4.06183385848999, "learning_rate": 4.761175087403336e-06, "loss": 0.4972907602787018, "mean_token_accuracy": 0.8399640917778015, "num_tokens": 6208851.0, "step": 725 }, { "epoch": 0.5516717325227963, "grad_norm": 2.033979654312134, "learning_rate": 4.760280956661904e-06, "loss": 0.43614792823791504, "mean_token_accuracy": 0.837505578994751, "num_tokens": 6216024.0, "step": 726 }, { "epoch": 0.5524316109422492, "grad_norm": 2.0920820236206055, "learning_rate": 4.75938523956829e-06, "loss": 0.45022889971733093, "mean_token_accuracy": 0.8302348256111145, "num_tokens": 6223527.0, "step": 727 }, { "epoch": 0.5531914893617021, "grad_norm": 1.4943269491195679, "learning_rate": 4.75848793675114e-06, "loss": 0.4882992208003998, "mean_token_accuracy": 0.8428292274475098, "num_tokens": 6240470.0, "step": 728 }, { "epoch": 0.5539513677811551, "grad_norm": 2.4450228214263916, "learning_rate": 4.757589048840219e-06, "loss": 0.36317628622055054, "mean_token_accuracy": 0.8800690174102783, "num_tokens": 6244432.0, "step": 729 }, { "epoch": 0.5547112462006079, "grad_norm": 2.691455841064453, "learning_rate": 4.756688576466398e-06, "loss": 0.4764796495437622, "mean_token_accuracy": 0.8537558913230896, "num_tokens": 6248845.0, "step": 730 }, { "epoch": 0.5554711246200608, "grad_norm": 1.594687819480896, "learning_rate": 4.755786520261666e-06, "loss": 0.45443981885910034, "mean_token_accuracy": 0.8340262174606323, "num_tokens": 6261384.0, "step": 731 }, { "epoch": 0.5562310030395137, "grad_norm": 1.464937686920166, "learning_rate": 4.75488288085912e-06, "loss": 0.37749332189559937, "mean_token_accuracy": 0.862264096736908, "num_tokens": 6272961.0, "step": 732 }, { "epoch": 0.5569908814589666, "grad_norm": 2.8826513290405273, "learning_rate": 4.753977658892967e-06, "loss": 0.4915273189544678, "mean_token_accuracy": 0.8192765712738037, "num_tokens": 6277003.0, "step": 733 }, { "epoch": 0.5577507598784195, "grad_norm": 1.9049111604690552, "learning_rate": 4.753070854998529e-06, "loss": 0.4396716356277466, "mean_token_accuracy": 0.8403393030166626, "num_tokens": 6284281.0, "step": 734 }, { "epoch": 0.5585106382978723, "grad_norm": 2.1077311038970947, "learning_rate": 4.752162469812234e-06, "loss": 0.4790551960468292, "mean_token_accuracy": 0.8353488445281982, "num_tokens": 6291830.0, "step": 735 }, { "epoch": 0.5592705167173252, "grad_norm": 1.2300493717193604, "learning_rate": 4.751252503971624e-06, "loss": 0.3813965916633606, "mean_token_accuracy": 0.8325374126434326, "num_tokens": 6308423.0, "step": 736 }, { "epoch": 0.5600303951367781, "grad_norm": 1.8665045499801636, "learning_rate": 4.750340958115346e-06, "loss": 0.5871419310569763, "mean_token_accuracy": 0.8048850893974304, "num_tokens": 6320081.0, "step": 737 }, { "epoch": 0.560790273556231, "grad_norm": 1.7575510740280151, "learning_rate": 4.749427832883158e-06, "loss": 0.4706610441207886, "mean_token_accuracy": 0.8373581171035767, "num_tokens": 6330330.0, "step": 738 }, { "epoch": 0.5615501519756839, "grad_norm": 2.0865676403045654, "learning_rate": 4.748513128915928e-06, "loss": 0.4866131544113159, "mean_token_accuracy": 0.8118366003036499, "num_tokens": 6337844.0, "step": 739 }, { "epoch": 0.5623100303951368, "grad_norm": 2.1978392601013184, "learning_rate": 4.747596846855629e-06, "loss": 0.4845651686191559, "mean_token_accuracy": 0.8276708126068115, "num_tokens": 6344039.0, "step": 740 }, { "epoch": 0.5630699088145896, "grad_norm": 1.6874626874923706, "learning_rate": 4.7466789873453446e-06, "loss": 0.41359972953796387, "mean_token_accuracy": 0.8586591482162476, "num_tokens": 6355767.0, "step": 741 }, { "epoch": 0.5638297872340425, "grad_norm": 1.4732517004013062, "learning_rate": 4.7457595510292615e-06, "loss": 0.5254936218261719, "mean_token_accuracy": 0.8229131698608398, "num_tokens": 6369705.0, "step": 742 }, { "epoch": 0.5645896656534954, "grad_norm": 1.4983631372451782, "learning_rate": 4.744838538552678e-06, "loss": 0.4143417477607727, "mean_token_accuracy": 0.8397407531738281, "num_tokens": 6382034.0, "step": 743 }, { "epoch": 0.5653495440729484, "grad_norm": 3.680663824081421, "learning_rate": 4.7439159505619946e-06, "loss": 0.397993266582489, "mean_token_accuracy": 0.8769915699958801, "num_tokens": 6384625.0, "step": 744 }, { "epoch": 0.5661094224924013, "grad_norm": 2.1235272884368896, "learning_rate": 4.74299178770472e-06, "loss": 0.5371411442756653, "mean_token_accuracy": 0.8123522400856018, "num_tokens": 6392977.0, "step": 745 }, { "epoch": 0.5668693009118541, "grad_norm": 4.376061916351318, "learning_rate": 4.742066050629465e-06, "loss": 0.5314540863037109, "mean_token_accuracy": 0.8173398375511169, "num_tokens": 6398374.0, "step": 746 }, { "epoch": 0.567629179331307, "grad_norm": 1.3401854038238525, "learning_rate": 4.741138739985951e-06, "loss": 0.37147653102874756, "mean_token_accuracy": 0.8691932559013367, "num_tokens": 6409794.0, "step": 747 }, { "epoch": 0.5683890577507599, "grad_norm": 1.9703563451766968, "learning_rate": 4.740209856424998e-06, "loss": 0.5025161504745483, "mean_token_accuracy": 0.8211580514907837, "num_tokens": 6424118.0, "step": 748 }, { "epoch": 0.5691489361702128, "grad_norm": 1.3038517236709595, "learning_rate": 4.7392794005985324e-06, "loss": 0.3963512182235718, "mean_token_accuracy": 0.8580185174942017, "num_tokens": 6440944.0, "step": 749 }, { "epoch": 0.5699088145896657, "grad_norm": 1.4175636768341064, "learning_rate": 4.738347373159585e-06, "loss": 0.5285158157348633, "mean_token_accuracy": 0.8222070336341858, "num_tokens": 6456143.0, "step": 750 }, { "epoch": 0.5706686930091185, "grad_norm": 2.14648175239563, "learning_rate": 4.737413774762287e-06, "loss": 0.3865134119987488, "mean_token_accuracy": 0.8414736986160278, "num_tokens": 6461740.0, "step": 751 }, { "epoch": 0.5714285714285714, "grad_norm": 1.5074299573898315, "learning_rate": 4.736478606061876e-06, "loss": 0.43085965514183044, "mean_token_accuracy": 0.8482084274291992, "num_tokens": 6473155.0, "step": 752 }, { "epoch": 0.5721884498480243, "grad_norm": 3.027317523956299, "learning_rate": 4.735541867714687e-06, "loss": 0.3914988934993744, "mean_token_accuracy": 0.8739659786224365, "num_tokens": 6476832.0, "step": 753 }, { "epoch": 0.5729483282674772, "grad_norm": 2.3828823566436768, "learning_rate": 4.73460356037816e-06, "loss": 0.6306310892105103, "mean_token_accuracy": 0.7872239351272583, "num_tokens": 6483810.0, "step": 754 }, { "epoch": 0.5737082066869301, "grad_norm": 2.0841176509857178, "learning_rate": 4.733663684710835e-06, "loss": 0.5214060544967651, "mean_token_accuracy": 0.8292238712310791, "num_tokens": 6491510.0, "step": 755 }, { "epoch": 0.574468085106383, "grad_norm": 1.90464186668396, "learning_rate": 4.732722241372354e-06, "loss": 0.613938570022583, "mean_token_accuracy": 0.8029822111129761, "num_tokens": 6502176.0, "step": 756 }, { "epoch": 0.5752279635258358, "grad_norm": 1.47279953956604, "learning_rate": 4.731779231023456e-06, "loss": 0.5272638201713562, "mean_token_accuracy": 0.8153349757194519, "num_tokens": 6520491.0, "step": 757 }, { "epoch": 0.5759878419452887, "grad_norm": 2.267606258392334, "learning_rate": 4.730834654325984e-06, "loss": 0.4156489670276642, "mean_token_accuracy": 0.8580739498138428, "num_tokens": 6526044.0, "step": 758 }, { "epoch": 0.5767477203647416, "grad_norm": 2.421715021133423, "learning_rate": 4.729888511942877e-06, "loss": 0.48345330357551575, "mean_token_accuracy": 0.825847327709198, "num_tokens": 6531553.0, "step": 759 }, { "epoch": 0.5775075987841946, "grad_norm": 1.7170379161834717, "learning_rate": 4.728940804538176e-06, "loss": 0.5726071000099182, "mean_token_accuracy": 0.8010392189025879, "num_tokens": 6542336.0, "step": 760 }, { "epoch": 0.5782674772036475, "grad_norm": 1.1816002130508423, "learning_rate": 4.727991532777016e-06, "loss": 0.3512741029262543, "mean_token_accuracy": 0.8428674936294556, "num_tokens": 6557847.0, "step": 761 }, { "epoch": 0.5790273556231003, "grad_norm": 1.617094874382019, "learning_rate": 4.727040697325634e-06, "loss": 0.5462164878845215, "mean_token_accuracy": 0.8163310289382935, "num_tokens": 6571545.0, "step": 762 }, { "epoch": 0.5797872340425532, "grad_norm": 2.4354491233825684, "learning_rate": 4.726088298851362e-06, "loss": 0.43055254220962524, "mean_token_accuracy": 0.8534346222877502, "num_tokens": 6576328.0, "step": 763 }, { "epoch": 0.5805471124620061, "grad_norm": 2.1651365756988525, "learning_rate": 4.725134338022631e-06, "loss": 0.5344648361206055, "mean_token_accuracy": 0.8242179155349731, "num_tokens": 6582717.0, "step": 764 }, { "epoch": 0.581306990881459, "grad_norm": 1.4754136800765991, "learning_rate": 4.724178815508967e-06, "loss": 0.3312758505344391, "mean_token_accuracy": 0.8697600364685059, "num_tokens": 6592119.0, "step": 765 }, { "epoch": 0.5820668693009119, "grad_norm": 2.2281758785247803, "learning_rate": 4.723221731980993e-06, "loss": 0.3964259624481201, "mean_token_accuracy": 0.8576456308364868, "num_tokens": 6596928.0, "step": 766 }, { "epoch": 0.5828267477203647, "grad_norm": 2.6792006492614746, "learning_rate": 4.722263088110426e-06, "loss": 0.42256325483322144, "mean_token_accuracy": 0.855099618434906, "num_tokens": 6600901.0, "step": 767 }, { "epoch": 0.5835866261398176, "grad_norm": 2.104128837585449, "learning_rate": 4.721302884570079e-06, "loss": 0.49544280767440796, "mean_token_accuracy": 0.8161357641220093, "num_tokens": 6607808.0, "step": 768 }, { "epoch": 0.5843465045592705, "grad_norm": 3.050861358642578, "learning_rate": 4.720341122033862e-06, "loss": 0.4813803732395172, "mean_token_accuracy": 0.858250617980957, "num_tokens": 6613518.0, "step": 769 }, { "epoch": 0.5851063829787234, "grad_norm": 2.0112383365631104, "learning_rate": 4.719377801176774e-06, "loss": 0.5249748826026917, "mean_token_accuracy": 0.8174425363540649, "num_tokens": 6621703.0, "step": 770 }, { "epoch": 0.5858662613981763, "grad_norm": 1.5677523612976074, "learning_rate": 4.718412922674913e-06, "loss": 0.41387608647346497, "mean_token_accuracy": 0.8520937561988831, "num_tokens": 6631490.0, "step": 771 }, { "epoch": 0.5866261398176292, "grad_norm": 1.5996551513671875, "learning_rate": 4.717446487205466e-06, "loss": 0.41482317447662354, "mean_token_accuracy": 0.8514248132705688, "num_tokens": 6644765.0, "step": 772 }, { "epoch": 0.587386018237082, "grad_norm": 1.649708867073059, "learning_rate": 4.716478495446717e-06, "loss": 0.5045386552810669, "mean_token_accuracy": 0.8259719610214233, "num_tokens": 6661065.0, "step": 773 }, { "epoch": 0.5881458966565349, "grad_norm": 2.277923345565796, "learning_rate": 4.715508948078037e-06, "loss": 0.4413490891456604, "mean_token_accuracy": 0.8452578186988831, "num_tokens": 6667280.0, "step": 774 }, { "epoch": 0.5889057750759878, "grad_norm": 1.5648988485336304, "learning_rate": 4.714537845779894e-06, "loss": 0.3617437779903412, "mean_token_accuracy": 0.8841532468795776, "num_tokens": 6677761.0, "step": 775 }, { "epoch": 0.5896656534954408, "grad_norm": 2.465161085128784, "learning_rate": 4.7135651892338445e-06, "loss": 0.49160271883010864, "mean_token_accuracy": 0.8205541372299194, "num_tokens": 6686636.0, "step": 776 }, { "epoch": 0.5904255319148937, "grad_norm": 1.30703604221344, "learning_rate": 4.712590979122534e-06, "loss": 0.3521465063095093, "mean_token_accuracy": 0.8771336078643799, "num_tokens": 6701018.0, "step": 777 }, { "epoch": 0.5911854103343465, "grad_norm": 1.6883575916290283, "learning_rate": 4.7116152161297045e-06, "loss": 0.47732865810394287, "mean_token_accuracy": 0.8221392631530762, "num_tokens": 6710766.0, "step": 778 }, { "epoch": 0.5919452887537994, "grad_norm": 1.2618685960769653, "learning_rate": 4.710637900940181e-06, "loss": 0.39039328694343567, "mean_token_accuracy": 0.8336860537528992, "num_tokens": 6727218.0, "step": 779 }, { "epoch": 0.5927051671732523, "grad_norm": 2.3619794845581055, "learning_rate": 4.7096590342398825e-06, "loss": 0.425787091255188, "mean_token_accuracy": 0.8544785976409912, "num_tokens": 6732694.0, "step": 780 }, { "epoch": 0.5934650455927052, "grad_norm": 1.4645812511444092, "learning_rate": 4.708678616715815e-06, "loss": 0.4684664309024811, "mean_token_accuracy": 0.8648834228515625, "num_tokens": 6750728.0, "step": 781 }, { "epoch": 0.5942249240121581, "grad_norm": 3.441596269607544, "learning_rate": 4.707696649056073e-06, "loss": 0.4963645935058594, "mean_token_accuracy": 0.8329465389251709, "num_tokens": 6753618.0, "step": 782 }, { "epoch": 0.5949848024316109, "grad_norm": 1.2299189567565918, "learning_rate": 4.706713131949839e-06, "loss": 0.36298274993896484, "mean_token_accuracy": 0.8557830452919006, "num_tokens": 6771659.0, "step": 783 }, { "epoch": 0.5957446808510638, "grad_norm": 1.6346250772476196, "learning_rate": 4.705728066087384e-06, "loss": 0.409068763256073, "mean_token_accuracy": 0.8497388362884521, "num_tokens": 6783168.0, "step": 784 }, { "epoch": 0.5965045592705167, "grad_norm": 2.380134344100952, "learning_rate": 4.704741452160064e-06, "loss": 0.49994900822639465, "mean_token_accuracy": 0.8468945026397705, "num_tokens": 6789224.0, "step": 785 }, { "epoch": 0.5972644376899696, "grad_norm": 2.102268695831299, "learning_rate": 4.703753290860323e-06, "loss": 0.45378783345222473, "mean_token_accuracy": 0.8378987312316895, "num_tokens": 6794990.0, "step": 786 }, { "epoch": 0.5980243161094225, "grad_norm": 1.8500839471817017, "learning_rate": 4.702763582881692e-06, "loss": 0.5022721886634827, "mean_token_accuracy": 0.8514347076416016, "num_tokens": 6803051.0, "step": 787 }, { "epoch": 0.5987841945288754, "grad_norm": 1.4423117637634277, "learning_rate": 4.701772328918784e-06, "loss": 0.40599411725997925, "mean_token_accuracy": 0.8476099967956543, "num_tokens": 6815345.0, "step": 788 }, { "epoch": 0.5995440729483282, "grad_norm": 2.578526735305786, "learning_rate": 4.700779529667301e-06, "loss": 0.48335227370262146, "mean_token_accuracy": 0.8485510945320129, "num_tokens": 6820018.0, "step": 789 }, { "epoch": 0.6003039513677811, "grad_norm": 1.7311351299285889, "learning_rate": 4.699785185824026e-06, "loss": 0.5113512277603149, "mean_token_accuracy": 0.8215739130973816, "num_tokens": 6830820.0, "step": 790 }, { "epoch": 0.601063829787234, "grad_norm": 1.6932601928710938, "learning_rate": 4.69878929808683e-06, "loss": 0.43179547786712646, "mean_token_accuracy": 0.8416491746902466, "num_tokens": 6840564.0, "step": 791 }, { "epoch": 0.601823708206687, "grad_norm": 1.9616183042526245, "learning_rate": 4.6977918671546635e-06, "loss": 0.5646926760673523, "mean_token_accuracy": 0.803817629814148, "num_tokens": 6848603.0, "step": 792 }, { "epoch": 0.6025835866261399, "grad_norm": 1.9959920644760132, "learning_rate": 4.696792893727562e-06, "loss": 0.3411133587360382, "mean_token_accuracy": 0.8736921548843384, "num_tokens": 6854574.0, "step": 793 }, { "epoch": 0.6033434650455927, "grad_norm": 2.021311044692993, "learning_rate": 4.695792378506645e-06, "loss": 0.3923317790031433, "mean_token_accuracy": 0.8675966262817383, "num_tokens": 6861597.0, "step": 794 }, { "epoch": 0.6041033434650456, "grad_norm": 3.0559072494506836, "learning_rate": 4.694790322194111e-06, "loss": 0.5951039791107178, "mean_token_accuracy": 0.7780908346176147, "num_tokens": 6866525.0, "step": 795 }, { "epoch": 0.6048632218844985, "grad_norm": 2.475194215774536, "learning_rate": 4.693786725493242e-06, "loss": 0.45655012130737305, "mean_token_accuracy": 0.8477139472961426, "num_tokens": 6872470.0, "step": 796 }, { "epoch": 0.6056231003039514, "grad_norm": 1.5484919548034668, "learning_rate": 4.692781589108402e-06, "loss": 0.404802143573761, "mean_token_accuracy": 0.8471429347991943, "num_tokens": 6882431.0, "step": 797 }, { "epoch": 0.6063829787234043, "grad_norm": 2.4598426818847656, "learning_rate": 4.691774913745033e-06, "loss": 0.42373165488243103, "mean_token_accuracy": 0.865800142288208, "num_tokens": 6888356.0, "step": 798 }, { "epoch": 0.6071428571428571, "grad_norm": 2.0124335289001465, "learning_rate": 4.690766700109659e-06, "loss": 0.35996782779693604, "mean_token_accuracy": 0.8779679536819458, "num_tokens": 6894205.0, "step": 799 }, { "epoch": 0.60790273556231, "grad_norm": 1.958358645439148, "learning_rate": 4.689756948909884e-06, "loss": 0.5029857158660889, "mean_token_accuracy": 0.8120522499084473, "num_tokens": 6902740.0, "step": 800 }, { "epoch": 0.6086626139817629, "grad_norm": 2.249605417251587, "learning_rate": 4.688745660854388e-06, "loss": 0.5539613962173462, "mean_token_accuracy": 0.8212680816650391, "num_tokens": 6916616.0, "step": 801 }, { "epoch": 0.6094224924012158, "grad_norm": 2.2435097694396973, "learning_rate": 4.687732836652935e-06, "loss": 0.49224361777305603, "mean_token_accuracy": 0.8426158428192139, "num_tokens": 6922671.0, "step": 802 }, { "epoch": 0.6101823708206687, "grad_norm": 1.9101262092590332, "learning_rate": 4.686718477016361e-06, "loss": 0.46180349588394165, "mean_token_accuracy": 0.8320032358169556, "num_tokens": 6930135.0, "step": 803 }, { "epoch": 0.6109422492401215, "grad_norm": 2.6720311641693115, "learning_rate": 4.6857025826565845e-06, "loss": 0.520714282989502, "mean_token_accuracy": 0.8307292461395264, "num_tokens": 6934983.0, "step": 804 }, { "epoch": 0.6117021276595744, "grad_norm": 2.0964162349700928, "learning_rate": 4.684685154286599e-06, "loss": 0.4943925440311432, "mean_token_accuracy": 0.8484556674957275, "num_tokens": 6940735.0, "step": 805 }, { "epoch": 0.6124620060790273, "grad_norm": 2.457338571548462, "learning_rate": 4.683666192620474e-06, "loss": 0.5134413242340088, "mean_token_accuracy": 0.8112465739250183, "num_tokens": 6946038.0, "step": 806 }, { "epoch": 0.6132218844984803, "grad_norm": 2.413060188293457, "learning_rate": 4.682645698373357e-06, "loss": 0.5169984102249146, "mean_token_accuracy": 0.8218802213668823, "num_tokens": 6952279.0, "step": 807 }, { "epoch": 0.6139817629179332, "grad_norm": 1.692279577255249, "learning_rate": 4.6816236722614694e-06, "loss": 0.5951623916625977, "mean_token_accuracy": 0.7930692434310913, "num_tokens": 6963872.0, "step": 808 }, { "epoch": 0.6147416413373861, "grad_norm": 1.7568124532699585, "learning_rate": 4.680600115002109e-06, "loss": 0.47554802894592285, "mean_token_accuracy": 0.8246628046035767, "num_tokens": 6974580.0, "step": 809 }, { "epoch": 0.6155015197568389, "grad_norm": 2.079179525375366, "learning_rate": 4.679575027313649e-06, "loss": 0.48674464225769043, "mean_token_accuracy": 0.8292477130889893, "num_tokens": 6981685.0, "step": 810 }, { "epoch": 0.6162613981762918, "grad_norm": 2.254335641860962, "learning_rate": 4.6785484099155324e-06, "loss": 0.49801093339920044, "mean_token_accuracy": 0.8244355320930481, "num_tokens": 6987356.0, "step": 811 }, { "epoch": 0.6170212765957447, "grad_norm": 1.6479203701019287, "learning_rate": 4.67752026352828e-06, "loss": 0.39767441153526306, "mean_token_accuracy": 0.8694275617599487, "num_tokens": 6996099.0, "step": 812 }, { "epoch": 0.6177811550151976, "grad_norm": 2.450373649597168, "learning_rate": 4.676490588873486e-06, "loss": 0.48279088735580444, "mean_token_accuracy": 0.8324604034423828, "num_tokens": 7001601.0, "step": 813 }, { "epoch": 0.6185410334346505, "grad_norm": 1.4855589866638184, "learning_rate": 4.675459386673815e-06, "loss": 0.3707486391067505, "mean_token_accuracy": 0.864487886428833, "num_tokens": 7013474.0, "step": 814 }, { "epoch": 0.6193009118541033, "grad_norm": 2.662778615951538, "learning_rate": 4.674426657653003e-06, "loss": 0.5028292536735535, "mean_token_accuracy": 0.8172339797019958, "num_tokens": 7018471.0, "step": 815 }, { "epoch": 0.6200607902735562, "grad_norm": 1.637942910194397, "learning_rate": 4.67339240253586e-06, "loss": 0.6092144250869751, "mean_token_accuracy": 0.7874794006347656, "num_tokens": 7033095.0, "step": 816 }, { "epoch": 0.6208206686930091, "grad_norm": 2.1327226161956787, "learning_rate": 4.672356622048266e-06, "loss": 0.49753472208976746, "mean_token_accuracy": 0.813976526260376, "num_tokens": 7039967.0, "step": 817 }, { "epoch": 0.621580547112462, "grad_norm": 1.6137940883636475, "learning_rate": 4.671319316917172e-06, "loss": 0.430887371301651, "mean_token_accuracy": 0.8568265438079834, "num_tokens": 7050297.0, "step": 818 }, { "epoch": 0.6223404255319149, "grad_norm": 2.464230537414551, "learning_rate": 4.670280487870599e-06, "loss": 0.548923134803772, "mean_token_accuracy": 0.8170759677886963, "num_tokens": 7055652.0, "step": 819 }, { "epoch": 0.6231003039513677, "grad_norm": 2.051084041595459, "learning_rate": 4.669240135637635e-06, "loss": 0.4992865324020386, "mean_token_accuracy": 0.8311952948570251, "num_tokens": 7061622.0, "step": 820 }, { "epoch": 0.6238601823708206, "grad_norm": 2.2122182846069336, "learning_rate": 4.668198260948442e-06, "loss": 0.5934240818023682, "mean_token_accuracy": 0.7990015149116516, "num_tokens": 7069896.0, "step": 821 }, { "epoch": 0.6246200607902735, "grad_norm": 1.972354531288147, "learning_rate": 4.667154864534245e-06, "loss": 0.5896181464195251, "mean_token_accuracy": 0.7959399819374084, "num_tokens": 7079704.0, "step": 822 }, { "epoch": 0.6253799392097265, "grad_norm": 2.049687385559082, "learning_rate": 4.666109947127343e-06, "loss": 0.385894775390625, "mean_token_accuracy": 0.8725259900093079, "num_tokens": 7085692.0, "step": 823 }, { "epoch": 0.6261398176291794, "grad_norm": 2.5467441082000732, "learning_rate": 4.665063509461098e-06, "loss": 0.5672459006309509, "mean_token_accuracy": 0.7989083528518677, "num_tokens": 7091340.0, "step": 824 }, { "epoch": 0.6268996960486323, "grad_norm": 2.4866766929626465, "learning_rate": 4.664015552269938e-06, "loss": 0.5028612613677979, "mean_token_accuracy": 0.8485326766967773, "num_tokens": 7097836.0, "step": 825 }, { "epoch": 0.6276595744680851, "grad_norm": 2.9302313327789307, "learning_rate": 4.662966076289363e-06, "loss": 0.42695996165275574, "mean_token_accuracy": 0.8490408658981323, "num_tokens": 7101593.0, "step": 826 }, { "epoch": 0.628419452887538, "grad_norm": 1.5770741701126099, "learning_rate": 4.661915082255932e-06, "loss": 0.4660522937774658, "mean_token_accuracy": 0.8398847579956055, "num_tokens": 7113852.0, "step": 827 }, { "epoch": 0.6291793313069909, "grad_norm": 1.4753056764602661, "learning_rate": 4.6608625709072766e-06, "loss": 0.45473548769950867, "mean_token_accuracy": 0.8189343214035034, "num_tokens": 7126685.0, "step": 828 }, { "epoch": 0.6299392097264438, "grad_norm": 2.210510015487671, "learning_rate": 4.659808542982089e-06, "loss": 0.447899729013443, "mean_token_accuracy": 0.8310917615890503, "num_tokens": 7132681.0, "step": 829 }, { "epoch": 0.6306990881458967, "grad_norm": 2.2032580375671387, "learning_rate": 4.658752999220125e-06, "loss": 0.3581208288669586, "mean_token_accuracy": 0.8732799291610718, "num_tokens": 7137537.0, "step": 830 }, { "epoch": 0.6314589665653495, "grad_norm": 2.3107855319976807, "learning_rate": 4.657695940362207e-06, "loss": 0.4941805601119995, "mean_token_accuracy": 0.8240172266960144, "num_tokens": 7142803.0, "step": 831 }, { "epoch": 0.6322188449848024, "grad_norm": 1.485609531402588, "learning_rate": 4.65663736715022e-06, "loss": 0.5009396076202393, "mean_token_accuracy": 0.8271695375442505, "num_tokens": 7157184.0, "step": 832 }, { "epoch": 0.6329787234042553, "grad_norm": 3.340023994445801, "learning_rate": 4.65557728032711e-06, "loss": 0.6149337291717529, "mean_token_accuracy": 0.7965115308761597, "num_tokens": 7161281.0, "step": 833 }, { "epoch": 0.6337386018237082, "grad_norm": 2.0645856857299805, "learning_rate": 4.654515680636888e-06, "loss": 0.5461238622665405, "mean_token_accuracy": 0.8267207145690918, "num_tokens": 7168836.0, "step": 834 }, { "epoch": 0.6344984802431611, "grad_norm": 1.063380479812622, "learning_rate": 4.653452568824625e-06, "loss": 0.3376588225364685, "mean_token_accuracy": 0.8802155256271362, "num_tokens": 7194170.0, "step": 835 }, { "epoch": 0.6352583586626139, "grad_norm": 3.4659011363983154, "learning_rate": 4.652387945636454e-06, "loss": 0.30550917983055115, "mean_token_accuracy": 0.8895115852355957, "num_tokens": 7196575.0, "step": 836 }, { "epoch": 0.6360182370820668, "grad_norm": 2.0651421546936035, "learning_rate": 4.651321811819568e-06, "loss": 0.4748995006084442, "mean_token_accuracy": 0.8297156095504761, "num_tokens": 7203948.0, "step": 837 }, { "epoch": 0.6367781155015197, "grad_norm": 2.468873977661133, "learning_rate": 4.650254168122222e-06, "loss": 0.5160819292068481, "mean_token_accuracy": 0.8209799528121948, "num_tokens": 7209727.0, "step": 838 }, { "epoch": 0.6375379939209727, "grad_norm": 2.0467090606689453, "learning_rate": 4.649185015293728e-06, "loss": 0.4545784592628479, "mean_token_accuracy": 0.8582319021224976, "num_tokens": 7216605.0, "step": 839 }, { "epoch": 0.6382978723404256, "grad_norm": 2.2342143058776855, "learning_rate": 4.64811435408446e-06, "loss": 0.5191316604614258, "mean_token_accuracy": 0.8482456207275391, "num_tokens": 7227223.0, "step": 840 }, { "epoch": 0.6390577507598785, "grad_norm": 3.1990416049957275, "learning_rate": 4.647042185245848e-06, "loss": 0.4573180377483368, "mean_token_accuracy": 0.8417835235595703, "num_tokens": 7230437.0, "step": 841 }, { "epoch": 0.6398176291793313, "grad_norm": 1.5837587118148804, "learning_rate": 4.645968509530381e-06, "loss": 0.42081165313720703, "mean_token_accuracy": 0.8477587103843689, "num_tokens": 7240325.0, "step": 842 }, { "epoch": 0.6405775075987842, "grad_norm": 2.400709390640259, "learning_rate": 4.644893327691608e-06, "loss": 0.4639664590358734, "mean_token_accuracy": 0.8325745463371277, "num_tokens": 7245949.0, "step": 843 }, { "epoch": 0.6413373860182371, "grad_norm": 2.0829503536224365, "learning_rate": 4.6438166404841316e-06, "loss": 0.5718370676040649, "mean_token_accuracy": 0.8071532249450684, "num_tokens": 7253218.0, "step": 844 }, { "epoch": 0.64209726443769, "grad_norm": 1.9976121187210083, "learning_rate": 4.6427384486636115e-06, "loss": 0.46519768238067627, "mean_token_accuracy": 0.8393628597259521, "num_tokens": 7260104.0, "step": 845 }, { "epoch": 0.6428571428571429, "grad_norm": 2.5303242206573486, "learning_rate": 4.6416587529867665e-06, "loss": 0.5093944668769836, "mean_token_accuracy": 0.8208208084106445, "num_tokens": 7265076.0, "step": 846 }, { "epoch": 0.6436170212765957, "grad_norm": 2.624427556991577, "learning_rate": 4.640577554211366e-06, "loss": 0.49459028244018555, "mean_token_accuracy": 0.834679365158081, "num_tokens": 7272422.0, "step": 847 }, { "epoch": 0.6443768996960486, "grad_norm": 2.0631775856018066, "learning_rate": 4.63949485309624e-06, "loss": 0.447976291179657, "mean_token_accuracy": 0.8618238568305969, "num_tokens": 7279399.0, "step": 848 }, { "epoch": 0.6451367781155015, "grad_norm": 1.6001992225646973, "learning_rate": 4.638410650401267e-06, "loss": 0.423392653465271, "mean_token_accuracy": 0.8587884306907654, "num_tokens": 7289378.0, "step": 849 }, { "epoch": 0.6458966565349544, "grad_norm": 1.8436834812164307, "learning_rate": 4.637324946887384e-06, "loss": 0.35557836294174194, "mean_token_accuracy": 0.869373083114624, "num_tokens": 7295777.0, "step": 850 }, { "epoch": 0.6466565349544073, "grad_norm": 3.5771102905273438, "learning_rate": 4.636237743316578e-06, "loss": 0.45969358086586, "mean_token_accuracy": 0.8549720048904419, "num_tokens": 7299083.0, "step": 851 }, { "epoch": 0.6474164133738601, "grad_norm": 2.865243673324585, "learning_rate": 4.635149040451891e-06, "loss": 0.34760022163391113, "mean_token_accuracy": 0.8827086687088013, "num_tokens": 7302325.0, "step": 852 }, { "epoch": 0.648176291793313, "grad_norm": 2.42984938621521, "learning_rate": 4.634058839057417e-06, "loss": 0.2833346128463745, "mean_token_accuracy": 0.8909599781036377, "num_tokens": 7307772.0, "step": 853 }, { "epoch": 0.648936170212766, "grad_norm": 1.3870996236801147, "learning_rate": 4.632967139898301e-06, "loss": 0.42592889070510864, "mean_token_accuracy": 0.8465644717216492, "num_tokens": 7321536.0, "step": 854 }, { "epoch": 0.6496960486322189, "grad_norm": 1.687943458557129, "learning_rate": 4.63187394374074e-06, "loss": 0.3329618275165558, "mean_token_accuracy": 0.8819146752357483, "num_tokens": 7329124.0, "step": 855 }, { "epoch": 0.6504559270516718, "grad_norm": 2.380872964859009, "learning_rate": 4.63077925135198e-06, "loss": 0.4892173111438751, "mean_token_accuracy": 0.8454505205154419, "num_tokens": 7334484.0, "step": 856 }, { "epoch": 0.6512158054711246, "grad_norm": 2.4188196659088135, "learning_rate": 4.629683063500319e-06, "loss": 0.47374117374420166, "mean_token_accuracy": 0.8230743408203125, "num_tokens": 7339580.0, "step": 857 }, { "epoch": 0.6519756838905775, "grad_norm": 1.7876373529434204, "learning_rate": 4.628585380955104e-06, "loss": 0.5236937999725342, "mean_token_accuracy": 0.8137438297271729, "num_tokens": 7347062.0, "step": 858 }, { "epoch": 0.6527355623100304, "grad_norm": 1.5910003185272217, "learning_rate": 4.62748620448673e-06, "loss": 0.4025757312774658, "mean_token_accuracy": 0.861012876033783, "num_tokens": 7357483.0, "step": 859 }, { "epoch": 0.6534954407294833, "grad_norm": 3.214264392852783, "learning_rate": 4.626385534866642e-06, "loss": 0.5050101280212402, "mean_token_accuracy": 0.8393588066101074, "num_tokens": 7360993.0, "step": 860 }, { "epoch": 0.6542553191489362, "grad_norm": 2.415461778640747, "learning_rate": 4.625283372867333e-06, "loss": 0.5053150653839111, "mean_token_accuracy": 0.8293525576591492, "num_tokens": 7367228.0, "step": 861 }, { "epoch": 0.6550151975683891, "grad_norm": 2.4207515716552734, "learning_rate": 4.624179719262342e-06, "loss": 0.5185045003890991, "mean_token_accuracy": 0.8156779408454895, "num_tokens": 7372746.0, "step": 862 }, { "epoch": 0.6557750759878419, "grad_norm": 3.787724018096924, "learning_rate": 4.623074574826254e-06, "loss": 0.5163221955299377, "mean_token_accuracy": 0.8288628458976746, "num_tokens": 7375505.0, "step": 863 }, { "epoch": 0.6565349544072948, "grad_norm": 1.5809223651885986, "learning_rate": 4.621967940334705e-06, "loss": 0.39990508556365967, "mean_token_accuracy": 0.8566436767578125, "num_tokens": 7384984.0, "step": 864 }, { "epoch": 0.6572948328267477, "grad_norm": 1.6312175989151, "learning_rate": 4.620859816564371e-06, "loss": 0.4413212537765503, "mean_token_accuracy": 0.8325690627098083, "num_tokens": 7396416.0, "step": 865 }, { "epoch": 0.6580547112462006, "grad_norm": 2.2401390075683594, "learning_rate": 4.619750204292978e-06, "loss": 0.4971809983253479, "mean_token_accuracy": 0.8272864818572998, "num_tokens": 7402822.0, "step": 866 }, { "epoch": 0.6588145896656535, "grad_norm": 2.277994155883789, "learning_rate": 4.618639104299294e-06, "loss": 0.5069500207901001, "mean_token_accuracy": 0.8187509179115295, "num_tokens": 7411273.0, "step": 867 }, { "epoch": 0.6595744680851063, "grad_norm": 1.414273738861084, "learning_rate": 4.6175265173631304e-06, "loss": 0.4307935833930969, "mean_token_accuracy": 0.8525159358978271, "num_tokens": 7424737.0, "step": 868 }, { "epoch": 0.6603343465045592, "grad_norm": 2.125316858291626, "learning_rate": 4.616412444265344e-06, "loss": 0.39343100786209106, "mean_token_accuracy": 0.8679004311561584, "num_tokens": 7430512.0, "step": 869 }, { "epoch": 0.6610942249240122, "grad_norm": 2.3756308555603027, "learning_rate": 4.6152968857878365e-06, "loss": 0.3074539005756378, "mean_token_accuracy": 0.8972070217132568, "num_tokens": 7434135.0, "step": 870 }, { "epoch": 0.6618541033434651, "grad_norm": 3.055863857269287, "learning_rate": 4.6141798427135475e-06, "loss": 0.4804600477218628, "mean_token_accuracy": 0.8362303376197815, "num_tokens": 7437704.0, "step": 871 }, { "epoch": 0.662613981762918, "grad_norm": 2.5049126148223877, "learning_rate": 4.6130613158264605e-06, "loss": 0.5290058851242065, "mean_token_accuracy": 0.8319494724273682, "num_tokens": 7443605.0, "step": 872 }, { "epoch": 0.6633738601823708, "grad_norm": 4.058017253875732, "learning_rate": 4.611941305911602e-06, "loss": 0.5475609302520752, "mean_token_accuracy": 0.8565441370010376, "num_tokens": 7446226.0, "step": 873 }, { "epoch": 0.6641337386018237, "grad_norm": 2.6561121940612793, "learning_rate": 4.610819813755038e-06, "loss": 0.4614630937576294, "mean_token_accuracy": 0.8371453285217285, "num_tokens": 7450880.0, "step": 874 }, { "epoch": 0.6648936170212766, "grad_norm": 2.3695688247680664, "learning_rate": 4.609696840143875e-06, "loss": 0.44823721051216125, "mean_token_accuracy": 0.8478057384490967, "num_tokens": 7455475.0, "step": 875 }, { "epoch": 0.6656534954407295, "grad_norm": 2.188896894454956, "learning_rate": 4.6085723858662575e-06, "loss": 0.540969967842102, "mean_token_accuracy": 0.8183263540267944, "num_tokens": 7462136.0, "step": 876 }, { "epoch": 0.6664133738601824, "grad_norm": 2.090606689453125, "learning_rate": 4.607446451711372e-06, "loss": 0.4938020408153534, "mean_token_accuracy": 0.8282856941223145, "num_tokens": 7468881.0, "step": 877 }, { "epoch": 0.6671732522796353, "grad_norm": 1.4223014116287231, "learning_rate": 4.606319038469443e-06, "loss": 0.4104450047016144, "mean_token_accuracy": 0.8580802083015442, "num_tokens": 7480008.0, "step": 878 }, { "epoch": 0.6679331306990881, "grad_norm": 1.8259541988372803, "learning_rate": 4.605190146931731e-06, "loss": 0.45610904693603516, "mean_token_accuracy": 0.8348138332366943, "num_tokens": 7488740.0, "step": 879 }, { "epoch": 0.668693009118541, "grad_norm": 1.455940842628479, "learning_rate": 4.604059777890537e-06, "loss": 0.5581926107406616, "mean_token_accuracy": 0.8333281874656677, "num_tokens": 7505385.0, "step": 880 }, { "epoch": 0.6694528875379939, "grad_norm": 1.9790794849395752, "learning_rate": 4.602927932139197e-06, "loss": 0.37450742721557617, "mean_token_accuracy": 0.8820561766624451, "num_tokens": 7511950.0, "step": 881 }, { "epoch": 0.6702127659574468, "grad_norm": 2.1166131496429443, "learning_rate": 4.601794610472083e-06, "loss": 0.6722922921180725, "mean_token_accuracy": 0.7877264618873596, "num_tokens": 7520971.0, "step": 882 }, { "epoch": 0.6709726443768997, "grad_norm": 2.0491039752960205, "learning_rate": 4.6006598136846056e-06, "loss": 0.506068766117096, "mean_token_accuracy": 0.8298271894454956, "num_tokens": 7528372.0, "step": 883 }, { "epoch": 0.6717325227963525, "grad_norm": 1.723015308380127, "learning_rate": 4.599523542573207e-06, "loss": 0.4729573130607605, "mean_token_accuracy": 0.833440899848938, "num_tokens": 7539519.0, "step": 884 }, { "epoch": 0.6724924012158054, "grad_norm": 2.212226152420044, "learning_rate": 4.598385797935368e-06, "loss": 0.5158262848854065, "mean_token_accuracy": 0.8289275169372559, "num_tokens": 7547301.0, "step": 885 }, { "epoch": 0.6732522796352584, "grad_norm": 2.41896653175354, "learning_rate": 4.5972465805696e-06, "loss": 0.426039457321167, "mean_token_accuracy": 0.8530604243278503, "num_tokens": 7552041.0, "step": 886 }, { "epoch": 0.6740121580547113, "grad_norm": 2.7718119621276855, "learning_rate": 4.596105891275449e-06, "loss": 0.4317045211791992, "mean_token_accuracy": 0.8486195802688599, "num_tokens": 7556954.0, "step": 887 }, { "epoch": 0.6747720364741642, "grad_norm": 2.2367255687713623, "learning_rate": 4.594963730853497e-06, "loss": 0.5955407619476318, "mean_token_accuracy": 0.7937953472137451, "num_tokens": 7563810.0, "step": 888 }, { "epoch": 0.675531914893617, "grad_norm": 2.594902753829956, "learning_rate": 4.593820100105355e-06, "loss": 0.4984428584575653, "mean_token_accuracy": 0.8338798880577087, "num_tokens": 7568348.0, "step": 889 }, { "epoch": 0.6762917933130699, "grad_norm": 1.9873307943344116, "learning_rate": 4.5926749998336665e-06, "loss": 0.4970153570175171, "mean_token_accuracy": 0.8113185167312622, "num_tokens": 7575923.0, "step": 890 }, { "epoch": 0.6770516717325228, "grad_norm": 1.8077143430709839, "learning_rate": 4.5915284308421075e-06, "loss": 0.4166252017021179, "mean_token_accuracy": 0.8618727326393127, "num_tokens": 7584114.0, "step": 891 }, { "epoch": 0.6778115501519757, "grad_norm": 2.633857011795044, "learning_rate": 4.590380393935383e-06, "loss": 0.3649597764015198, "mean_token_accuracy": 0.8753523826599121, "num_tokens": 7587749.0, "step": 892 }, { "epoch": 0.6785714285714286, "grad_norm": 1.1693453788757324, "learning_rate": 4.589230889919232e-06, "loss": 0.38153892755508423, "mean_token_accuracy": 0.8587384223937988, "num_tokens": 7609112.0, "step": 893 }, { "epoch": 0.6793313069908815, "grad_norm": 2.939741611480713, "learning_rate": 4.588079919600419e-06, "loss": 0.4910273849964142, "mean_token_accuracy": 0.821663498878479, "num_tokens": 7612862.0, "step": 894 }, { "epoch": 0.6800911854103343, "grad_norm": 1.179184079170227, "learning_rate": 4.586927483786739e-06, "loss": 0.4377683401107788, "mean_token_accuracy": 0.84492427110672, "num_tokens": 7634802.0, "step": 895 }, { "epoch": 0.6808510638297872, "grad_norm": 1.5825812816619873, "learning_rate": 4.585773583287017e-06, "loss": 0.4963203966617584, "mean_token_accuracy": 0.848434329032898, "num_tokens": 7651060.0, "step": 896 }, { "epoch": 0.6816109422492401, "grad_norm": 2.5651516914367676, "learning_rate": 4.584618218911104e-06, "loss": 0.4816139340400696, "mean_token_accuracy": 0.8224426507949829, "num_tokens": 7655421.0, "step": 897 }, { "epoch": 0.682370820668693, "grad_norm": 1.8367772102355957, "learning_rate": 4.583461391469879e-06, "loss": 0.4980762302875519, "mean_token_accuracy": 0.8256858587265015, "num_tokens": 7663953.0, "step": 898 }, { "epoch": 0.6831306990881459, "grad_norm": 3.117048740386963, "learning_rate": 4.582303101775249e-06, "loss": 0.4483739733695984, "mean_token_accuracy": 0.8520689010620117, "num_tokens": 7666958.0, "step": 899 }, { "epoch": 0.6838905775075987, "grad_norm": 1.429996371269226, "learning_rate": 4.581143350640146e-06, "loss": 0.4799046814441681, "mean_token_accuracy": 0.8313760161399841, "num_tokens": 7681046.0, "step": 900 }, { "epoch": 0.6846504559270516, "grad_norm": 1.665168046951294, "learning_rate": 4.579982138878527e-06, "loss": 0.49619922041893005, "mean_token_accuracy": 0.830579400062561, "num_tokens": 7696336.0, "step": 901 }, { "epoch": 0.6854103343465046, "grad_norm": 2.4953956604003906, "learning_rate": 4.578819467305375e-06, "loss": 0.45367807149887085, "mean_token_accuracy": 0.8557197451591492, "num_tokens": 7700768.0, "step": 902 }, { "epoch": 0.6861702127659575, "grad_norm": 1.9500032663345337, "learning_rate": 4.5776553367367e-06, "loss": 0.5965266227722168, "mean_token_accuracy": 0.7919489741325378, "num_tokens": 7708938.0, "step": 903 }, { "epoch": 0.6869300911854104, "grad_norm": 1.8939558267593384, "learning_rate": 4.576489747989532e-06, "loss": 0.4667394161224365, "mean_token_accuracy": 0.8221625685691833, "num_tokens": 7715954.0, "step": 904 }, { "epoch": 0.6876899696048632, "grad_norm": 1.2655456066131592, "learning_rate": 4.575322701881926e-06, "loss": 0.3889008164405823, "mean_token_accuracy": 0.8734138011932373, "num_tokens": 7733929.0, "step": 905 }, { "epoch": 0.6884498480243161, "grad_norm": 1.730330228805542, "learning_rate": 4.57415419923296e-06, "loss": 0.5634262561798096, "mean_token_accuracy": 0.8046697378158569, "num_tokens": 7747283.0, "step": 906 }, { "epoch": 0.689209726443769, "grad_norm": 2.4916765689849854, "learning_rate": 4.572984240862733e-06, "loss": 0.5775842666625977, "mean_token_accuracy": 0.803133487701416, "num_tokens": 7753457.0, "step": 907 }, { "epoch": 0.6899696048632219, "grad_norm": 2.133382797241211, "learning_rate": 4.57181282759237e-06, "loss": 0.5159484148025513, "mean_token_accuracy": 0.822996973991394, "num_tokens": 7761284.0, "step": 908 }, { "epoch": 0.6907294832826748, "grad_norm": 2.3237338066101074, "learning_rate": 4.570639960244011e-06, "loss": 0.49258583784103394, "mean_token_accuracy": 0.8296655416488647, "num_tokens": 7766944.0, "step": 909 }, { "epoch": 0.6914893617021277, "grad_norm": 2.0109713077545166, "learning_rate": 4.56946563964082e-06, "loss": 0.5125564336776733, "mean_token_accuracy": 0.8213399648666382, "num_tokens": 7775310.0, "step": 910 }, { "epoch": 0.6922492401215805, "grad_norm": 1.2341805696487427, "learning_rate": 4.5682898666069815e-06, "loss": 0.4266202449798584, "mean_token_accuracy": 0.8618178367614746, "num_tokens": 7792810.0, "step": 911 }, { "epoch": 0.6930091185410334, "grad_norm": 1.2585805654525757, "learning_rate": 4.567112641967697e-06, "loss": 0.388641893863678, "mean_token_accuracy": 0.8769892454147339, "num_tokens": 7805949.0, "step": 912 }, { "epoch": 0.6937689969604863, "grad_norm": 1.2041066884994507, "learning_rate": 4.5659339665491894e-06, "loss": 0.3482680916786194, "mean_token_accuracy": 0.8561577796936035, "num_tokens": 7821316.0, "step": 913 }, { "epoch": 0.6945288753799392, "grad_norm": 2.319331645965576, "learning_rate": 4.5647538411786965e-06, "loss": 0.41378292441368103, "mean_token_accuracy": 0.8446224331855774, "num_tokens": 7826557.0, "step": 914 }, { "epoch": 0.6952887537993921, "grad_norm": 1.283704161643982, "learning_rate": 4.563572266684478e-06, "loss": 0.5009961128234863, "mean_token_accuracy": 0.8147847652435303, "num_tokens": 7842836.0, "step": 915 }, { "epoch": 0.6960486322188449, "grad_norm": 2.496107816696167, "learning_rate": 4.562389243895807e-06, "loss": 0.42707979679107666, "mean_token_accuracy": 0.8460291624069214, "num_tokens": 7847201.0, "step": 916 }, { "epoch": 0.6968085106382979, "grad_norm": 1.5308650732040405, "learning_rate": 4.561204773642974e-06, "loss": 0.39804404973983765, "mean_token_accuracy": 0.8602392673492432, "num_tokens": 7858273.0, "step": 917 }, { "epoch": 0.6975683890577508, "grad_norm": 2.9474008083343506, "learning_rate": 4.5600188567572874e-06, "loss": 0.26960092782974243, "mean_token_accuracy": 0.9006294012069702, "num_tokens": 7860928.0, "step": 918 }, { "epoch": 0.6983282674772037, "grad_norm": 1.4389863014221191, "learning_rate": 4.558831494071069e-06, "loss": 0.4172559380531311, "mean_token_accuracy": 0.8557401895523071, "num_tokens": 7873967.0, "step": 919 }, { "epoch": 0.6990881458966566, "grad_norm": 1.7400329113006592, "learning_rate": 4.557642686417654e-06, "loss": 0.47256314754486084, "mean_token_accuracy": 0.82423996925354, "num_tokens": 7883639.0, "step": 920 }, { "epoch": 0.6998480243161094, "grad_norm": 3.0388031005859375, "learning_rate": 4.556452434631396e-06, "loss": 0.5846735239028931, "mean_token_accuracy": 0.8104480504989624, "num_tokens": 7888177.0, "step": 921 }, { "epoch": 0.7006079027355623, "grad_norm": 2.3434362411499023, "learning_rate": 4.555260739547657e-06, "loss": 0.35388419032096863, "mean_token_accuracy": 0.8845078349113464, "num_tokens": 7892527.0, "step": 922 }, { "epoch": 0.7013677811550152, "grad_norm": 1.5241142511367798, "learning_rate": 4.554067602002815e-06, "loss": 0.37504011392593384, "mean_token_accuracy": 0.8647040128707886, "num_tokens": 7903340.0, "step": 923 }, { "epoch": 0.7021276595744681, "grad_norm": 3.6781864166259766, "learning_rate": 4.55287302283426e-06, "loss": 0.5525227785110474, "mean_token_accuracy": 0.8196807503700256, "num_tokens": 7906254.0, "step": 924 }, { "epoch": 0.702887537993921, "grad_norm": 2.2237837314605713, "learning_rate": 4.551677002880395e-06, "loss": 0.4853675663471222, "mean_token_accuracy": 0.828433632850647, "num_tokens": 7912814.0, "step": 925 }, { "epoch": 0.7036474164133738, "grad_norm": 2.5017545223236084, "learning_rate": 4.550479542980632e-06, "loss": 0.4966946244239807, "mean_token_accuracy": 0.833811342716217, "num_tokens": 7917653.0, "step": 926 }, { "epoch": 0.7044072948328267, "grad_norm": 3.7602856159210205, "learning_rate": 4.549280643975394e-06, "loss": 0.424625426530838, "mean_token_accuracy": 0.8503231406211853, "num_tokens": 7920568.0, "step": 927 }, { "epoch": 0.7051671732522796, "grad_norm": 2.4364452362060547, "learning_rate": 4.548080306706114e-06, "loss": 0.26582738757133484, "mean_token_accuracy": 0.9119530916213989, "num_tokens": 7924113.0, "step": 928 }, { "epoch": 0.7059270516717325, "grad_norm": 1.36539626121521, "learning_rate": 4.5468785320152365e-06, "loss": 0.4289470911026001, "mean_token_accuracy": 0.8364672660827637, "num_tokens": 7939372.0, "step": 929 }, { "epoch": 0.7066869300911854, "grad_norm": 2.37292742729187, "learning_rate": 4.545675320746212e-06, "loss": 0.483722984790802, "mean_token_accuracy": 0.8319920897483826, "num_tokens": 7946336.0, "step": 930 }, { "epoch": 0.7074468085106383, "grad_norm": 1.7913908958435059, "learning_rate": 4.544470673743502e-06, "loss": 0.37959009408950806, "mean_token_accuracy": 0.8628261089324951, "num_tokens": 7955001.0, "step": 931 }, { "epoch": 0.7082066869300911, "grad_norm": 1.531957983970642, "learning_rate": 4.543264591852572e-06, "loss": 0.4729848802089691, "mean_token_accuracy": 0.8368238806724548, "num_tokens": 7968205.0, "step": 932 }, { "epoch": 0.708966565349544, "grad_norm": 2.206059217453003, "learning_rate": 4.542057075919898e-06, "loss": 0.4628652334213257, "mean_token_accuracy": 0.843146562576294, "num_tokens": 7974491.0, "step": 933 }, { "epoch": 0.709726443768997, "grad_norm": 1.9408152103424072, "learning_rate": 4.54084812679296e-06, "loss": 0.43363600969314575, "mean_token_accuracy": 0.843762993812561, "num_tokens": 7982082.0, "step": 934 }, { "epoch": 0.7104863221884499, "grad_norm": 1.485535979270935, "learning_rate": 4.539637745320247e-06, "loss": 0.32511797547340393, "mean_token_accuracy": 0.8863018751144409, "num_tokens": 7991296.0, "step": 935 }, { "epoch": 0.7112462006079028, "grad_norm": 2.0763707160949707, "learning_rate": 4.53842593235125e-06, "loss": 0.4504709839820862, "mean_token_accuracy": 0.8516234159469604, "num_tokens": 7997781.0, "step": 936 }, { "epoch": 0.7120060790273556, "grad_norm": 2.7200214862823486, "learning_rate": 4.537212688736466e-06, "loss": 0.44011425971984863, "mean_token_accuracy": 0.8511619567871094, "num_tokens": 8001314.0, "step": 937 }, { "epoch": 0.7127659574468085, "grad_norm": 2.2340095043182373, "learning_rate": 4.535998015327396e-06, "loss": 0.41737306118011475, "mean_token_accuracy": 0.8512250185012817, "num_tokens": 8006164.0, "step": 938 }, { "epoch": 0.7135258358662614, "grad_norm": 1.9181195497512817, "learning_rate": 4.534781912976546e-06, "loss": 0.4302041232585907, "mean_token_accuracy": 0.8511664867401123, "num_tokens": 8012893.0, "step": 939 }, { "epoch": 0.7142857142857143, "grad_norm": 1.5707837343215942, "learning_rate": 4.533564382537421e-06, "loss": 0.5106822848320007, "mean_token_accuracy": 0.8417707681655884, "num_tokens": 8025062.0, "step": 940 }, { "epoch": 0.7150455927051672, "grad_norm": 1.4295225143432617, "learning_rate": 4.532345424864533e-06, "loss": 0.37428560853004456, "mean_token_accuracy": 0.8544988036155701, "num_tokens": 8036677.0, "step": 941 }, { "epoch": 0.71580547112462, "grad_norm": 1.5033446550369263, "learning_rate": 4.531125040813392e-06, "loss": 0.45774930715560913, "mean_token_accuracy": 0.8385022878646851, "num_tokens": 8050721.0, "step": 942 }, { "epoch": 0.7165653495440729, "grad_norm": 2.19572377204895, "learning_rate": 4.529903231240511e-06, "loss": 0.45330873131752014, "mean_token_accuracy": 0.8385785818099976, "num_tokens": 8058815.0, "step": 943 }, { "epoch": 0.7173252279635258, "grad_norm": 1.722461223602295, "learning_rate": 4.528679997003403e-06, "loss": 0.49621498584747314, "mean_token_accuracy": 0.8423264026641846, "num_tokens": 8069413.0, "step": 944 }, { "epoch": 0.7180851063829787, "grad_norm": 2.1492364406585693, "learning_rate": 4.52745533896058e-06, "loss": 0.37706953287124634, "mean_token_accuracy": 0.8711401224136353, "num_tokens": 8075029.0, "step": 945 }, { "epoch": 0.7188449848024316, "grad_norm": 2.8799679279327393, "learning_rate": 4.526229257971556e-06, "loss": 0.4596155285835266, "mean_token_accuracy": 0.8330371379852295, "num_tokens": 8078633.0, "step": 946 }, { "epoch": 0.7196048632218845, "grad_norm": 2.301872491836548, "learning_rate": 4.52500175489684e-06, "loss": 0.4968586564064026, "mean_token_accuracy": 0.8335995078086853, "num_tokens": 8085216.0, "step": 947 }, { "epoch": 0.7203647416413373, "grad_norm": 1.8690073490142822, "learning_rate": 4.523772830597942e-06, "loss": 0.5353140234947205, "mean_token_accuracy": 0.8132718205451965, "num_tokens": 8093957.0, "step": 948 }, { "epoch": 0.7211246200607903, "grad_norm": 2.8340227603912354, "learning_rate": 4.522542485937369e-06, "loss": 0.4271763563156128, "mean_token_accuracy": 0.8580093383789062, "num_tokens": 8097511.0, "step": 949 }, { "epoch": 0.7218844984802432, "grad_norm": 3.47814679145813, "learning_rate": 4.521310721778622e-06, "loss": 0.41557565331459045, "mean_token_accuracy": 0.8628487586975098, "num_tokens": 8100442.0, "step": 950 }, { "epoch": 0.7226443768996961, "grad_norm": 1.4269204139709473, "learning_rate": 4.520077538986203e-06, "loss": 0.45576968789100647, "mean_token_accuracy": 0.8401131629943848, "num_tokens": 8113153.0, "step": 951 }, { "epoch": 0.723404255319149, "grad_norm": 2.2714176177978516, "learning_rate": 4.518842938425606e-06, "loss": 0.39482250809669495, "mean_token_accuracy": 0.8544554710388184, "num_tokens": 8119530.0, "step": 952 }, { "epoch": 0.7241641337386018, "grad_norm": 1.3205335140228271, "learning_rate": 4.51760692096332e-06, "loss": 0.37603840231895447, "mean_token_accuracy": 0.8616449236869812, "num_tokens": 8131305.0, "step": 953 }, { "epoch": 0.7249240121580547, "grad_norm": 2.023127317428589, "learning_rate": 4.516369487466832e-06, "loss": 0.3578413128852844, "mean_token_accuracy": 0.874262809753418, "num_tokens": 8137653.0, "step": 954 }, { "epoch": 0.7256838905775076, "grad_norm": 2.0011065006256104, "learning_rate": 4.5151306388046175e-06, "loss": 0.5531020164489746, "mean_token_accuracy": 0.8257242441177368, "num_tokens": 8147206.0, "step": 955 }, { "epoch": 0.7264437689969605, "grad_norm": 2.169484853744507, "learning_rate": 4.513890375846152e-06, "loss": 0.4228977560997009, "mean_token_accuracy": 0.8543480634689331, "num_tokens": 8152364.0, "step": 956 }, { "epoch": 0.7272036474164134, "grad_norm": 1.9234753847122192, "learning_rate": 4.512648699461897e-06, "loss": 0.5490481853485107, "mean_token_accuracy": 0.8181719779968262, "num_tokens": 8159948.0, "step": 957 }, { "epoch": 0.7279635258358662, "grad_norm": 2.429049491882324, "learning_rate": 4.511405610523309e-06, "loss": 0.5078240633010864, "mean_token_accuracy": 0.8245970606803894, "num_tokens": 8165597.0, "step": 958 }, { "epoch": 0.7287234042553191, "grad_norm": 2.5267844200134277, "learning_rate": 4.510161109902837e-06, "loss": 0.3450012803077698, "mean_token_accuracy": 0.8559510707855225, "num_tokens": 8169587.0, "step": 959 }, { "epoch": 0.729483282674772, "grad_norm": 1.9822341203689575, "learning_rate": 4.508915198473919e-06, "loss": 0.4455550014972687, "mean_token_accuracy": 0.8660366535186768, "num_tokens": 8175935.0, "step": 960 }, { "epoch": 0.7302431610942249, "grad_norm": 3.0034801959991455, "learning_rate": 4.507667877110982e-06, "loss": 0.482852041721344, "mean_token_accuracy": 0.8437433242797852, "num_tokens": 8179454.0, "step": 961 }, { "epoch": 0.7310030395136778, "grad_norm": 2.021489143371582, "learning_rate": 4.506419146689445e-06, "loss": 0.3777191638946533, "mean_token_accuracy": 0.8720347285270691, "num_tokens": 8185708.0, "step": 962 }, { "epoch": 0.7317629179331308, "grad_norm": 3.0223734378814697, "learning_rate": 4.505169008085717e-06, "loss": 0.33351024985313416, "mean_token_accuracy": 0.8853825926780701, "num_tokens": 8188627.0, "step": 963 }, { "epoch": 0.7325227963525835, "grad_norm": 1.4128164052963257, "learning_rate": 4.503917462177192e-06, "loss": 0.41878461837768555, "mean_token_accuracy": 0.845172643661499, "num_tokens": 8200765.0, "step": 964 }, { "epoch": 0.7332826747720365, "grad_norm": 2.15291690826416, "learning_rate": 4.5026645098422515e-06, "loss": 0.41664183139801025, "mean_token_accuracy": 0.8549957275390625, "num_tokens": 8206151.0, "step": 965 }, { "epoch": 0.7340425531914894, "grad_norm": 1.979921579360962, "learning_rate": 4.5014101519602684e-06, "loss": 0.47765952348709106, "mean_token_accuracy": 0.8220652341842651, "num_tokens": 8212960.0, "step": 966 }, { "epoch": 0.7348024316109423, "grad_norm": 1.9223300218582153, "learning_rate": 4.500154389411598e-06, "loss": 0.47046923637390137, "mean_token_accuracy": 0.8354085683822632, "num_tokens": 8220174.0, "step": 967 }, { "epoch": 0.7355623100303952, "grad_norm": 2.914621591567993, "learning_rate": 4.498897223077582e-06, "loss": 0.3779117465019226, "mean_token_accuracy": 0.8938813805580139, "num_tokens": 8223426.0, "step": 968 }, { "epoch": 0.736322188449848, "grad_norm": 2.284940242767334, "learning_rate": 4.49763865384055e-06, "loss": 0.4823833703994751, "mean_token_accuracy": 0.8280587196350098, "num_tokens": 8229111.0, "step": 969 }, { "epoch": 0.7370820668693009, "grad_norm": 1.9567904472351074, "learning_rate": 4.496378682583813e-06, "loss": 0.478855162858963, "mean_token_accuracy": 0.858295202255249, "num_tokens": 8236746.0, "step": 970 }, { "epoch": 0.7378419452887538, "grad_norm": 1.2472765445709229, "learning_rate": 4.495117310191667e-06, "loss": 0.46173644065856934, "mean_token_accuracy": 0.8231217861175537, "num_tokens": 8256189.0, "step": 971 }, { "epoch": 0.7386018237082067, "grad_norm": 1.8555713891983032, "learning_rate": 4.493854537549393e-06, "loss": 0.44676172733306885, "mean_token_accuracy": 0.842460036277771, "num_tokens": 8263689.0, "step": 972 }, { "epoch": 0.7393617021276596, "grad_norm": 2.5443015098571777, "learning_rate": 4.492590365543253e-06, "loss": 0.4607488214969635, "mean_token_accuracy": 0.8574792146682739, "num_tokens": 8268125.0, "step": 973 }, { "epoch": 0.7401215805471124, "grad_norm": 2.232205390930176, "learning_rate": 4.491324795060491e-06, "loss": 0.35018014907836914, "mean_token_accuracy": 0.8818938732147217, "num_tokens": 8273045.0, "step": 974 }, { "epoch": 0.7408814589665653, "grad_norm": 3.099548101425171, "learning_rate": 4.490057826989333e-06, "loss": 0.5345156788825989, "mean_token_accuracy": 0.8188414573669434, "num_tokens": 8277421.0, "step": 975 }, { "epoch": 0.7416413373860182, "grad_norm": 2.6421279907226562, "learning_rate": 4.488789462218988e-06, "loss": 0.3364614248275757, "mean_token_accuracy": 0.8776024580001831, "num_tokens": 8280573.0, "step": 976 }, { "epoch": 0.7424012158054711, "grad_norm": 3.1140081882476807, "learning_rate": 4.487519701639641e-06, "loss": 0.5701988339424133, "mean_token_accuracy": 0.803310215473175, "num_tokens": 8284623.0, "step": 977 }, { "epoch": 0.743161094224924, "grad_norm": 1.7440085411071777, "learning_rate": 4.486248546142459e-06, "loss": 0.46152663230895996, "mean_token_accuracy": 0.8350806832313538, "num_tokens": 8292830.0, "step": 978 }, { "epoch": 0.743920972644377, "grad_norm": 1.977761149406433, "learning_rate": 4.4849759966195885e-06, "loss": 0.5126093626022339, "mean_token_accuracy": 0.8274800777435303, "num_tokens": 8301102.0, "step": 979 }, { "epoch": 0.7446808510638298, "grad_norm": 1.3579951524734497, "learning_rate": 4.483702053964154e-06, "loss": 0.4015064835548401, "mean_token_accuracy": 0.8514711260795593, "num_tokens": 8315479.0, "step": 980 }, { "epoch": 0.7454407294832827, "grad_norm": 1.7891197204589844, "learning_rate": 4.482426719070258e-06, "loss": 0.5263936519622803, "mean_token_accuracy": 0.8251112699508667, "num_tokens": 8326814.0, "step": 981 }, { "epoch": 0.7462006079027356, "grad_norm": 2.727473497390747, "learning_rate": 4.4811499928329775e-06, "loss": 0.3493611514568329, "mean_token_accuracy": 0.8714612722396851, "num_tokens": 8330310.0, "step": 982 }, { "epoch": 0.7469604863221885, "grad_norm": 2.1080844402313232, "learning_rate": 4.479871876148368e-06, "loss": 0.3950042724609375, "mean_token_accuracy": 0.8624709844589233, "num_tokens": 8336144.0, "step": 983 }, { "epoch": 0.7477203647416414, "grad_norm": 1.2591725587844849, "learning_rate": 4.478592369913464e-06, "loss": 0.38214361667633057, "mean_token_accuracy": 0.8684597015380859, "num_tokens": 8353340.0, "step": 984 }, { "epoch": 0.7484802431610942, "grad_norm": 2.859177827835083, "learning_rate": 4.477311475026271e-06, "loss": 0.39489829540252686, "mean_token_accuracy": 0.8582780361175537, "num_tokens": 8357108.0, "step": 985 }, { "epoch": 0.7492401215805471, "grad_norm": 1.7800242900848389, "learning_rate": 4.476029192385769e-06, "loss": 0.4666605591773987, "mean_token_accuracy": 0.8337945938110352, "num_tokens": 8364556.0, "step": 986 }, { "epoch": 0.75, "grad_norm": 2.1390371322631836, "learning_rate": 4.474745522891915e-06, "loss": 0.4520495533943176, "mean_token_accuracy": 0.8416281938552856, "num_tokens": 8370207.0, "step": 987 }, { "epoch": 0.7507598784194529, "grad_norm": 2.019336223602295, "learning_rate": 4.473460467445637e-06, "loss": 0.5331957340240479, "mean_token_accuracy": 0.8425475358963013, "num_tokens": 8379486.0, "step": 988 }, { "epoch": 0.7515197568389058, "grad_norm": 1.9489482641220093, "learning_rate": 4.472174026948836e-06, "loss": 0.4950482249259949, "mean_token_accuracy": 0.8206159472465515, "num_tokens": 8387177.0, "step": 989 }, { "epoch": 0.7522796352583586, "grad_norm": 3.1013834476470947, "learning_rate": 4.470886202304385e-06, "loss": 0.4596092104911804, "mean_token_accuracy": 0.843962550163269, "num_tokens": 8391049.0, "step": 990 }, { "epoch": 0.7530395136778115, "grad_norm": 1.6871871948242188, "learning_rate": 4.469596994416131e-06, "loss": 0.4753928780555725, "mean_token_accuracy": 0.8504481315612793, "num_tokens": 8399793.0, "step": 991 }, { "epoch": 0.7537993920972644, "grad_norm": 2.4964523315429688, "learning_rate": 4.468306404188887e-06, "loss": 0.48669153451919556, "mean_token_accuracy": 0.8231956958770752, "num_tokens": 8405953.0, "step": 992 }, { "epoch": 0.7545592705167173, "grad_norm": 1.5213518142700195, "learning_rate": 4.467014432528441e-06, "loss": 0.42779433727264404, "mean_token_accuracy": 0.8506912589073181, "num_tokens": 8416035.0, "step": 993 }, { "epoch": 0.7553191489361702, "grad_norm": 1.985685110092163, "learning_rate": 4.465721080341547e-06, "loss": 0.5653210878372192, "mean_token_accuracy": 0.8135318756103516, "num_tokens": 8424522.0, "step": 994 }, { "epoch": 0.756079027355623, "grad_norm": 2.5402510166168213, "learning_rate": 4.4644263485359316e-06, "loss": 0.5187326669692993, "mean_token_accuracy": 0.8405015468597412, "num_tokens": 8428560.0, "step": 995 }, { "epoch": 0.756838905775076, "grad_norm": 2.289832592010498, "learning_rate": 4.463130238020284e-06, "loss": 0.5355351567268372, "mean_token_accuracy": 0.8116629123687744, "num_tokens": 8434082.0, "step": 996 }, { "epoch": 0.7575987841945289, "grad_norm": 1.4917112588882446, "learning_rate": 4.4618327497042676e-06, "loss": 0.3724251687526703, "mean_token_accuracy": 0.8679091334342957, "num_tokens": 8445348.0, "step": 997 }, { "epoch": 0.7583586626139818, "grad_norm": 2.608022451400757, "learning_rate": 4.460533884498509e-06, "loss": 0.43493184447288513, "mean_token_accuracy": 0.8561485409736633, "num_tokens": 8449571.0, "step": 998 }, { "epoch": 0.7591185410334347, "grad_norm": 3.3838589191436768, "learning_rate": 4.4592336433146e-06, "loss": 0.41346901655197144, "mean_token_accuracy": 0.8535987138748169, "num_tokens": 8453137.0, "step": 999 }, { "epoch": 0.7598784194528876, "grad_norm": 2.02105975151062, "learning_rate": 4.457932027065102e-06, "loss": 0.5325049757957458, "mean_token_accuracy": 0.8375140428543091, "num_tokens": 8459634.0, "step": 1000 }, { "epoch": 0.7606382978723404, "grad_norm": 2.77596378326416, "learning_rate": 4.456629036663537e-06, "loss": 0.41893303394317627, "mean_token_accuracy": 0.8545857667922974, "num_tokens": 8463573.0, "step": 1001 }, { "epoch": 0.7613981762917933, "grad_norm": 1.8876289129257202, "learning_rate": 4.455324673024396e-06, "loss": 0.5548145771026611, "mean_token_accuracy": 0.8121346235275269, "num_tokens": 8472536.0, "step": 1002 }, { "epoch": 0.7621580547112462, "grad_norm": 3.1031622886657715, "learning_rate": 4.4540189370631315e-06, "loss": 0.530841588973999, "mean_token_accuracy": 0.8466066122055054, "num_tokens": 8478261.0, "step": 1003 }, { "epoch": 0.7629179331306991, "grad_norm": 1.8441965579986572, "learning_rate": 4.452711829696158e-06, "loss": 0.4597780108451843, "mean_token_accuracy": 0.8342568278312683, "num_tokens": 8486498.0, "step": 1004 }, { "epoch": 0.763677811550152, "grad_norm": 1.2919553518295288, "learning_rate": 4.451403351840855e-06, "loss": 0.4109460115432739, "mean_token_accuracy": 0.8469996452331543, "num_tokens": 8500456.0, "step": 1005 }, { "epoch": 0.7644376899696048, "grad_norm": 1.2351306676864624, "learning_rate": 4.450093504415562e-06, "loss": 0.36019328236579895, "mean_token_accuracy": 0.8635528683662415, "num_tokens": 8517117.0, "step": 1006 }, { "epoch": 0.7651975683890577, "grad_norm": 2.5349855422973633, "learning_rate": 4.44878228833958e-06, "loss": 0.5035796165466309, "mean_token_accuracy": 0.8254295587539673, "num_tokens": 8522401.0, "step": 1007 }, { "epoch": 0.7659574468085106, "grad_norm": 1.387760877609253, "learning_rate": 4.447469704533172e-06, "loss": 0.5678063035011292, "mean_token_accuracy": 0.8099559545516968, "num_tokens": 8537160.0, "step": 1008 }, { "epoch": 0.7667173252279635, "grad_norm": 2.6500985622406006, "learning_rate": 4.446155753917559e-06, "loss": 0.6535372734069824, "mean_token_accuracy": 0.7783317565917969, "num_tokens": 8543657.0, "step": 1009 }, { "epoch": 0.7674772036474165, "grad_norm": 1.8321892023086548, "learning_rate": 4.444840437414923e-06, "loss": 0.4645255208015442, "mean_token_accuracy": 0.8463554382324219, "num_tokens": 8550253.0, "step": 1010 }, { "epoch": 0.7682370820668692, "grad_norm": 1.5419100522994995, "learning_rate": 4.443523755948401e-06, "loss": 0.41844701766967773, "mean_token_accuracy": 0.8535705804824829, "num_tokens": 8559762.0, "step": 1011 }, { "epoch": 0.7689969604863222, "grad_norm": 1.7755532264709473, "learning_rate": 4.442205710442095e-06, "loss": 0.5324689745903015, "mean_token_accuracy": 0.8203273415565491, "num_tokens": 8569460.0, "step": 1012 }, { "epoch": 0.7697568389057751, "grad_norm": 2.2584824562072754, "learning_rate": 4.4408863018210564e-06, "loss": 0.4767489731311798, "mean_token_accuracy": 0.8403046727180481, "num_tokens": 8575156.0, "step": 1013 }, { "epoch": 0.770516717325228, "grad_norm": 1.8031688928604126, "learning_rate": 4.439565531011299e-06, "loss": 0.43814486265182495, "mean_token_accuracy": 0.8509267568588257, "num_tokens": 8582442.0, "step": 1014 }, { "epoch": 0.7712765957446809, "grad_norm": 1.7348157167434692, "learning_rate": 4.43824339893979e-06, "loss": 0.5052890777587891, "mean_token_accuracy": 0.8190545439720154, "num_tokens": 8592320.0, "step": 1015 }, { "epoch": 0.7720364741641338, "grad_norm": 2.2615408897399902, "learning_rate": 4.436919906534452e-06, "loss": 0.4714162051677704, "mean_token_accuracy": 0.8341861963272095, "num_tokens": 8598848.0, "step": 1016 }, { "epoch": 0.7727963525835866, "grad_norm": 2.795295238494873, "learning_rate": 4.4355950547241645e-06, "loss": 0.44224533438682556, "mean_token_accuracy": 0.8643772602081299, "num_tokens": 8602394.0, "step": 1017 }, { "epoch": 0.7735562310030395, "grad_norm": 2.220775604248047, "learning_rate": 4.434268844438758e-06, "loss": 0.49567878246307373, "mean_token_accuracy": 0.8372000455856323, "num_tokens": 8608698.0, "step": 1018 }, { "epoch": 0.7743161094224924, "grad_norm": 2.073723077774048, "learning_rate": 4.432941276609018e-06, "loss": 0.5072222352027893, "mean_token_accuracy": 0.8330106735229492, "num_tokens": 8616280.0, "step": 1019 }, { "epoch": 0.7750759878419453, "grad_norm": 1.7301781177520752, "learning_rate": 4.431612352166684e-06, "loss": 0.47175338864326477, "mean_token_accuracy": 0.8372178077697754, "num_tokens": 8627292.0, "step": 1020 }, { "epoch": 0.7758358662613982, "grad_norm": 2.203221559524536, "learning_rate": 4.4302820720444454e-06, "loss": 0.42932671308517456, "mean_token_accuracy": 0.8473650217056274, "num_tokens": 8632233.0, "step": 1021 }, { "epoch": 0.776595744680851, "grad_norm": 2.7094364166259766, "learning_rate": 4.428950437175944e-06, "loss": 0.370113343000412, "mean_token_accuracy": 0.8693164587020874, "num_tokens": 8636254.0, "step": 1022 }, { "epoch": 0.7773556231003039, "grad_norm": 2.210756540298462, "learning_rate": 4.427617448495772e-06, "loss": 0.5489422082901001, "mean_token_accuracy": 0.7966009378433228, "num_tokens": 8643263.0, "step": 1023 }, { "epoch": 0.7781155015197568, "grad_norm": 1.7067797183990479, "learning_rate": 4.426283106939474e-06, "loss": 0.38478705286979675, "mean_token_accuracy": 0.8717148303985596, "num_tokens": 8652406.0, "step": 1024 }, { "epoch": 0.7788753799392097, "grad_norm": 2.3071329593658447, "learning_rate": 4.424947413443539e-06, "loss": 0.41938164830207825, "mean_token_accuracy": 0.8462989926338196, "num_tokens": 8657739.0, "step": 1025 }, { "epoch": 0.7796352583586627, "grad_norm": 2.193040609359741, "learning_rate": 4.423610368945411e-06, "loss": 0.4957553446292877, "mean_token_accuracy": 0.82379150390625, "num_tokens": 8664744.0, "step": 1026 }, { "epoch": 0.7803951367781155, "grad_norm": 1.880054235458374, "learning_rate": 4.422271974383479e-06, "loss": 0.4128783047199249, "mean_token_accuracy": 0.8549532890319824, "num_tokens": 8671598.0, "step": 1027 }, { "epoch": 0.7811550151975684, "grad_norm": 1.9795271158218384, "learning_rate": 4.420932230697079e-06, "loss": 0.4144800305366516, "mean_token_accuracy": 0.8507047891616821, "num_tokens": 8679119.0, "step": 1028 }, { "epoch": 0.7819148936170213, "grad_norm": 1.9185072183609009, "learning_rate": 4.419591138826495e-06, "loss": 0.4691133499145508, "mean_token_accuracy": 0.8370275497436523, "num_tokens": 8686545.0, "step": 1029 }, { "epoch": 0.7826747720364742, "grad_norm": 1.33665931224823, "learning_rate": 4.418248699712955e-06, "loss": 0.44059011340141296, "mean_token_accuracy": 0.8270549178123474, "num_tokens": 8701344.0, "step": 1030 }, { "epoch": 0.7834346504559271, "grad_norm": 1.1207163333892822, "learning_rate": 4.416904914298637e-06, "loss": 0.3606961965560913, "mean_token_accuracy": 0.8682348132133484, "num_tokens": 8719907.0, "step": 1031 }, { "epoch": 0.78419452887538, "grad_norm": 1.9580340385437012, "learning_rate": 4.415559783526661e-06, "loss": 0.46530643105506897, "mean_token_accuracy": 0.8319792747497559, "num_tokens": 8726106.0, "step": 1032 }, { "epoch": 0.7849544072948328, "grad_norm": 2.0062975883483887, "learning_rate": 4.414213308341092e-06, "loss": 0.5419950485229492, "mean_token_accuracy": 0.8142098784446716, "num_tokens": 8734352.0, "step": 1033 }, { "epoch": 0.7857142857142857, "grad_norm": 3.937150716781616, "learning_rate": 4.412865489686936e-06, "loss": 0.5458130836486816, "mean_token_accuracy": 0.8083989024162292, "num_tokens": 8737291.0, "step": 1034 }, { "epoch": 0.7864741641337386, "grad_norm": 1.8860409259796143, "learning_rate": 4.411516328510145e-06, "loss": 0.5704396963119507, "mean_token_accuracy": 0.8274439573287964, "num_tokens": 8746161.0, "step": 1035 }, { "epoch": 0.7872340425531915, "grad_norm": 1.9952261447906494, "learning_rate": 4.410165825757613e-06, "loss": 0.4698999524116516, "mean_token_accuracy": 0.8273038864135742, "num_tokens": 8753039.0, "step": 1036 }, { "epoch": 0.7879939209726444, "grad_norm": 3.1650075912475586, "learning_rate": 4.408813982377175e-06, "loss": 0.39484354853630066, "mean_token_accuracy": 0.8779162168502808, "num_tokens": 8755485.0, "step": 1037 }, { "epoch": 0.7887537993920972, "grad_norm": 1.2232600450515747, "learning_rate": 4.407460799317605e-06, "loss": 0.3866608440876007, "mean_token_accuracy": 0.8689858317375183, "num_tokens": 8772899.0, "step": 1038 }, { "epoch": 0.7895136778115501, "grad_norm": 2.056734561920166, "learning_rate": 4.40610627752862e-06, "loss": 0.3973802626132965, "mean_token_accuracy": 0.8627678751945496, "num_tokens": 8778953.0, "step": 1039 }, { "epoch": 0.790273556231003, "grad_norm": 2.970404624938965, "learning_rate": 4.404750417960876e-06, "loss": 0.4091486632823944, "mean_token_accuracy": 0.8623145222663879, "num_tokens": 8782389.0, "step": 1040 }, { "epoch": 0.791033434650456, "grad_norm": 1.9702332019805908, "learning_rate": 4.403393221565966e-06, "loss": 0.40283408761024475, "mean_token_accuracy": 0.8535085916519165, "num_tokens": 8788644.0, "step": 1041 }, { "epoch": 0.7917933130699089, "grad_norm": 2.926893472671509, "learning_rate": 4.402034689296425e-06, "loss": 0.2875128388404846, "mean_token_accuracy": 0.9035156965255737, "num_tokens": 8791369.0, "step": 1042 }, { "epoch": 0.7925531914893617, "grad_norm": 2.9396777153015137, "learning_rate": 4.400674822105721e-06, "loss": 0.6436929106712341, "mean_token_accuracy": 0.824264407157898, "num_tokens": 8796588.0, "step": 1043 }, { "epoch": 0.7933130699088146, "grad_norm": 1.320375919342041, "learning_rate": 4.399313620948262e-06, "loss": 0.4132855534553528, "mean_token_accuracy": 0.8429237604141235, "num_tokens": 8811276.0, "step": 1044 }, { "epoch": 0.7940729483282675, "grad_norm": 1.8580036163330078, "learning_rate": 4.397951086779392e-06, "loss": 0.4479951858520508, "mean_token_accuracy": 0.845382571220398, "num_tokens": 8819439.0, "step": 1045 }, { "epoch": 0.7948328267477204, "grad_norm": 3.21520733833313, "learning_rate": 4.396587220555389e-06, "loss": 0.5257807970046997, "mean_token_accuracy": 0.8095862865447998, "num_tokens": 8823368.0, "step": 1046 }, { "epoch": 0.7955927051671733, "grad_norm": 3.1673340797424316, "learning_rate": 4.395222023233467e-06, "loss": 0.4084981381893158, "mean_token_accuracy": 0.8585314154624939, "num_tokens": 8826906.0, "step": 1047 }, { "epoch": 0.7963525835866262, "grad_norm": 1.7790806293487549, "learning_rate": 4.393855495771774e-06, "loss": 0.4280048906803131, "mean_token_accuracy": 0.8463689088821411, "num_tokens": 8835341.0, "step": 1048 }, { "epoch": 0.797112462006079, "grad_norm": 2.5598952770233154, "learning_rate": 4.3924876391293915e-06, "loss": 0.5665745735168457, "mean_token_accuracy": 0.8169650435447693, "num_tokens": 8841602.0, "step": 1049 }, { "epoch": 0.7978723404255319, "grad_norm": 1.6590951681137085, "learning_rate": 4.391118454266335e-06, "loss": 0.4466911852359772, "mean_token_accuracy": 0.8155986070632935, "num_tokens": 8853154.0, "step": 1050 }, { "epoch": 0.7986322188449848, "grad_norm": 1.603003740310669, "learning_rate": 4.389747942143549e-06, "loss": 0.43882766366004944, "mean_token_accuracy": 0.8452494740486145, "num_tokens": 8864131.0, "step": 1051 }, { "epoch": 0.7993920972644377, "grad_norm": 2.633272647857666, "learning_rate": 4.388376103722914e-06, "loss": 0.5252490639686584, "mean_token_accuracy": 0.8409363031387329, "num_tokens": 8868637.0, "step": 1052 }, { "epoch": 0.8001519756838906, "grad_norm": 2.0235087871551514, "learning_rate": 4.387002939967237e-06, "loss": 0.2776068449020386, "mean_token_accuracy": 0.8970445990562439, "num_tokens": 8873360.0, "step": 1053 }, { "epoch": 0.8009118541033434, "grad_norm": 1.4838424921035767, "learning_rate": 4.38562845184026e-06, "loss": 0.48971569538116455, "mean_token_accuracy": 0.8425699472427368, "num_tokens": 8886948.0, "step": 1054 }, { "epoch": 0.8016717325227963, "grad_norm": 1.7026574611663818, "learning_rate": 4.384252640306649e-06, "loss": 0.5599839687347412, "mean_token_accuracy": 0.787269651889801, "num_tokens": 8897829.0, "step": 1055 }, { "epoch": 0.8024316109422492, "grad_norm": 2.503035306930542, "learning_rate": 4.382875506332002e-06, "loss": 0.467684268951416, "mean_token_accuracy": 0.8441770076751709, "num_tokens": 8902875.0, "step": 1056 }, { "epoch": 0.8031914893617021, "grad_norm": 1.7335060834884644, "learning_rate": 4.381497050882845e-06, "loss": 0.5217864513397217, "mean_token_accuracy": 0.8167104721069336, "num_tokens": 8913852.0, "step": 1057 }, { "epoch": 0.8039513677811551, "grad_norm": 2.078587055206299, "learning_rate": 4.380117274926632e-06, "loss": 0.44632554054260254, "mean_token_accuracy": 0.8510191440582275, "num_tokens": 8920663.0, "step": 1058 }, { "epoch": 0.8047112462006079, "grad_norm": 1.6764146089553833, "learning_rate": 4.3787361794317405e-06, "loss": 0.419901967048645, "mean_token_accuracy": 0.8311997056007385, "num_tokens": 8930204.0, "step": 1059 }, { "epoch": 0.8054711246200608, "grad_norm": 1.9987337589263916, "learning_rate": 4.377353765367479e-06, "loss": 0.45824307203292847, "mean_token_accuracy": 0.8364956378936768, "num_tokens": 8937846.0, "step": 1060 }, { "epoch": 0.8062310030395137, "grad_norm": 1.9666810035705566, "learning_rate": 4.375970033704078e-06, "loss": 0.3154428005218506, "mean_token_accuracy": 0.8946245908737183, "num_tokens": 8943081.0, "step": 1061 }, { "epoch": 0.8069908814589666, "grad_norm": 1.9755457639694214, "learning_rate": 4.374584985412692e-06, "loss": 0.34014445543289185, "mean_token_accuracy": 0.861365020275116, "num_tokens": 8949014.0, "step": 1062 }, { "epoch": 0.8077507598784195, "grad_norm": 2.0280346870422363, "learning_rate": 4.373198621465405e-06, "loss": 0.5190823078155518, "mean_token_accuracy": 0.8221094608306885, "num_tokens": 8958383.0, "step": 1063 }, { "epoch": 0.8085106382978723, "grad_norm": 2.427828311920166, "learning_rate": 4.3718109428352155e-06, "loss": 0.4847556948661804, "mean_token_accuracy": 0.821616530418396, "num_tokens": 8963923.0, "step": 1064 }, { "epoch": 0.8092705167173252, "grad_norm": 3.5203857421875, "learning_rate": 4.370421950496055e-06, "loss": 0.5317807197570801, "mean_token_accuracy": 0.8062059283256531, "num_tokens": 8967473.0, "step": 1065 }, { "epoch": 0.8100303951367781, "grad_norm": 2.141909122467041, "learning_rate": 4.369031645422768e-06, "loss": 0.4012424647808075, "mean_token_accuracy": 0.8550806045532227, "num_tokens": 8973907.0, "step": 1066 }, { "epoch": 0.810790273556231, "grad_norm": 2.4285736083984375, "learning_rate": 4.367640028591126e-06, "loss": 0.27693629264831543, "mean_token_accuracy": 0.8945943117141724, "num_tokens": 8977178.0, "step": 1067 }, { "epoch": 0.8115501519756839, "grad_norm": 2.164745330810547, "learning_rate": 4.366247100977818e-06, "loss": 0.3762974143028259, "mean_token_accuracy": 0.8787487745285034, "num_tokens": 8982342.0, "step": 1068 }, { "epoch": 0.8123100303951368, "grad_norm": 2.1378557682037354, "learning_rate": 4.364852863560456e-06, "loss": 0.5135717391967773, "mean_token_accuracy": 0.8224875926971436, "num_tokens": 8989107.0, "step": 1069 }, { "epoch": 0.8130699088145896, "grad_norm": 2.2960023880004883, "learning_rate": 4.363457317317568e-06, "loss": 0.3824809193611145, "mean_token_accuracy": 0.8663628101348877, "num_tokens": 8994299.0, "step": 1070 }, { "epoch": 0.8138297872340425, "grad_norm": 1.7643877267837524, "learning_rate": 4.362060463228603e-06, "loss": 0.48205146193504333, "mean_token_accuracy": 0.8588734269142151, "num_tokens": 9003419.0, "step": 1071 }, { "epoch": 0.8145896656534954, "grad_norm": 2.886695623397827, "learning_rate": 4.360662302273926e-06, "loss": 0.273180216550827, "mean_token_accuracy": 0.8931835889816284, "num_tokens": 9005973.0, "step": 1072 }, { "epoch": 0.8153495440729484, "grad_norm": 1.8112608194351196, "learning_rate": 4.35926283543482e-06, "loss": 0.4354448914527893, "mean_token_accuracy": 0.8486064076423645, "num_tokens": 9014010.0, "step": 1073 }, { "epoch": 0.8161094224924013, "grad_norm": 3.5569543838500977, "learning_rate": 4.357862063693486e-06, "loss": 0.35450923442840576, "mean_token_accuracy": 0.8832963705062866, "num_tokens": 9016394.0, "step": 1074 }, { "epoch": 0.8168693009118541, "grad_norm": 2.805382490158081, "learning_rate": 4.356459988033039e-06, "loss": 0.38273581862449646, "mean_token_accuracy": 0.8612306118011475, "num_tokens": 9020497.0, "step": 1075 }, { "epoch": 0.817629179331307, "grad_norm": 2.2936694622039795, "learning_rate": 4.355056609437509e-06, "loss": 0.4462985396385193, "mean_token_accuracy": 0.8588640689849854, "num_tokens": 9025139.0, "step": 1076 }, { "epoch": 0.8183890577507599, "grad_norm": 2.225205183029175, "learning_rate": 4.353651928891842e-06, "loss": 0.5039516687393188, "mean_token_accuracy": 0.8332420587539673, "num_tokens": 9031096.0, "step": 1077 }, { "epoch": 0.8191489361702128, "grad_norm": 2.485123634338379, "learning_rate": 4.352245947381897e-06, "loss": 0.49850213527679443, "mean_token_accuracy": 0.8178091645240784, "num_tokens": 9037259.0, "step": 1078 }, { "epoch": 0.8199088145896657, "grad_norm": 1.774471402168274, "learning_rate": 4.3508386658944455e-06, "loss": 0.4455867111682892, "mean_token_accuracy": 0.8448972702026367, "num_tokens": 9045700.0, "step": 1079 }, { "epoch": 0.8206686930091185, "grad_norm": 1.8009142875671387, "learning_rate": 4.349430085417171e-06, "loss": 0.4392588138580322, "mean_token_accuracy": 0.8517820835113525, "num_tokens": 9054699.0, "step": 1080 }, { "epoch": 0.8214285714285714, "grad_norm": 2.368156909942627, "learning_rate": 4.348020206938672e-06, "loss": 0.45366334915161133, "mean_token_accuracy": 0.858634352684021, "num_tokens": 9059454.0, "step": 1081 }, { "epoch": 0.8221884498480243, "grad_norm": 2.1474950313568115, "learning_rate": 4.3466090314484526e-06, "loss": 0.44049710035324097, "mean_token_accuracy": 0.845537543296814, "num_tokens": 9065079.0, "step": 1082 }, { "epoch": 0.8229483282674772, "grad_norm": 2.7859013080596924, "learning_rate": 4.345196559936931e-06, "loss": 0.43630853295326233, "mean_token_accuracy": 0.8819534182548523, "num_tokens": 9068719.0, "step": 1083 }, { "epoch": 0.8237082066869301, "grad_norm": 1.7428752183914185, "learning_rate": 4.343782793395435e-06, "loss": 0.37707966566085815, "mean_token_accuracy": 0.8731701374053955, "num_tokens": 9075993.0, "step": 1084 }, { "epoch": 0.824468085106383, "grad_norm": 1.6565921306610107, "learning_rate": 4.3423677328162e-06, "loss": 0.4826427400112152, "mean_token_accuracy": 0.8478366136550903, "num_tokens": 9085708.0, "step": 1085 }, { "epoch": 0.8252279635258358, "grad_norm": 1.4606108665466309, "learning_rate": 4.340951379192369e-06, "loss": 0.41244298219680786, "mean_token_accuracy": 0.8356068730354309, "num_tokens": 9097843.0, "step": 1086 }, { "epoch": 0.8259878419452887, "grad_norm": 1.7329881191253662, "learning_rate": 4.3395337335179945e-06, "loss": 0.5335365533828735, "mean_token_accuracy": 0.815739095211029, "num_tokens": 9108888.0, "step": 1087 }, { "epoch": 0.8267477203647416, "grad_norm": 3.0382602214813232, "learning_rate": 4.338114796788035e-06, "loss": 0.47978851199150085, "mean_token_accuracy": 0.8394724726676941, "num_tokens": 9111906.0, "step": 1088 }, { "epoch": 0.8275075987841946, "grad_norm": 1.3464313745498657, "learning_rate": 4.336694569998354e-06, "loss": 0.42660221457481384, "mean_token_accuracy": 0.8681737780570984, "num_tokens": 9124341.0, "step": 1089 }, { "epoch": 0.8282674772036475, "grad_norm": 2.205402135848999, "learning_rate": 4.3352730541457215e-06, "loss": 0.5092664957046509, "mean_token_accuracy": 0.8146024942398071, "num_tokens": 9129920.0, "step": 1090 }, { "epoch": 0.8290273556231003, "grad_norm": 1.405417561531067, "learning_rate": 4.333850250227814e-06, "loss": 0.4491054117679596, "mean_token_accuracy": 0.8378802537918091, "num_tokens": 9143034.0, "step": 1091 }, { "epoch": 0.8297872340425532, "grad_norm": 1.9030550718307495, "learning_rate": 4.332426159243206e-06, "loss": 0.530014157295227, "mean_token_accuracy": 0.817804753780365, "num_tokens": 9156080.0, "step": 1092 }, { "epoch": 0.8305471124620061, "grad_norm": 3.0464563369750977, "learning_rate": 4.331000782191384e-06, "loss": 0.46515169739723206, "mean_token_accuracy": 0.8385719060897827, "num_tokens": 9159028.0, "step": 1093 }, { "epoch": 0.831306990881459, "grad_norm": 2.133287191390991, "learning_rate": 4.329574120072728e-06, "loss": 0.4124572277069092, "mean_token_accuracy": 0.8583981990814209, "num_tokens": 9164342.0, "step": 1094 }, { "epoch": 0.8320668693009119, "grad_norm": 2.03017520904541, "learning_rate": 4.328146173888528e-06, "loss": 0.44622802734375, "mean_token_accuracy": 0.8378459215164185, "num_tokens": 9171744.0, "step": 1095 }, { "epoch": 0.8328267477203647, "grad_norm": 1.4623960256576538, "learning_rate": 4.32671694464097e-06, "loss": 0.3354308009147644, "mean_token_accuracy": 0.8694335222244263, "num_tokens": 9180649.0, "step": 1096 }, { "epoch": 0.8335866261398176, "grad_norm": 1.2103843688964844, "learning_rate": 4.3252864333331424e-06, "loss": 0.3614499866962433, "mean_token_accuracy": 0.8707097172737122, "num_tokens": 9195818.0, "step": 1097 }, { "epoch": 0.8343465045592705, "grad_norm": 1.5913515090942383, "learning_rate": 4.323854640969033e-06, "loss": 0.5261038541793823, "mean_token_accuracy": 0.8217508792877197, "num_tokens": 9206040.0, "step": 1098 }, { "epoch": 0.8351063829787234, "grad_norm": 1.853654384613037, "learning_rate": 4.322421568553529e-06, "loss": 0.45574983954429626, "mean_token_accuracy": 0.8306976556777954, "num_tokens": 9213265.0, "step": 1099 }, { "epoch": 0.8358662613981763, "grad_norm": 2.886070728302002, "learning_rate": 4.320987217092416e-06, "loss": 0.38911131024360657, "mean_token_accuracy": 0.8574584722518921, "num_tokens": 9216882.0, "step": 1100 }, { "epoch": 0.8366261398176292, "grad_norm": 1.6736514568328857, "learning_rate": 4.319551587592377e-06, "loss": 0.6185226440429688, "mean_token_accuracy": 0.7903167009353638, "num_tokens": 9228826.0, "step": 1101 }, { "epoch": 0.837386018237082, "grad_norm": 2.6079232692718506, "learning_rate": 4.318114681060989e-06, "loss": 0.5082242488861084, "mean_token_accuracy": 0.8450819253921509, "num_tokens": 9233334.0, "step": 1102 }, { "epoch": 0.8381458966565349, "grad_norm": 1.2644526958465576, "learning_rate": 4.316676498506735e-06, "loss": 0.3367535471916199, "mean_token_accuracy": 0.8658517599105835, "num_tokens": 9244703.0, "step": 1103 }, { "epoch": 0.8389057750759878, "grad_norm": 1.4803882837295532, "learning_rate": 4.3152370409389795e-06, "loss": 0.5120146870613098, "mean_token_accuracy": 0.819484531879425, "num_tokens": 9257823.0, "step": 1104 }, { "epoch": 0.8396656534954408, "grad_norm": 2.6995160579681396, "learning_rate": 4.3137963093679945e-06, "loss": 0.25613880157470703, "mean_token_accuracy": 0.908172607421875, "num_tokens": 9260866.0, "step": 1105 }, { "epoch": 0.8404255319148937, "grad_norm": 3.1766152381896973, "learning_rate": 4.3123543048049395e-06, "loss": 0.5877197980880737, "mean_token_accuracy": 0.8498363494873047, "num_tokens": 9265243.0, "step": 1106 }, { "epoch": 0.8411854103343465, "grad_norm": 1.38240385055542, "learning_rate": 4.310911028261867e-06, "loss": 0.38877570629119873, "mean_token_accuracy": 0.859251081943512, "num_tokens": 9278366.0, "step": 1107 }, { "epoch": 0.8419452887537994, "grad_norm": 2.567765951156616, "learning_rate": 4.309466480751726e-06, "loss": 0.3730832636356354, "mean_token_accuracy": 0.8739770650863647, "num_tokens": 9282079.0, "step": 1108 }, { "epoch": 0.8427051671732523, "grad_norm": 1.8774040937423706, "learning_rate": 4.308020663288356e-06, "loss": 0.4465939998626709, "mean_token_accuracy": 0.851172685623169, "num_tokens": 9288929.0, "step": 1109 }, { "epoch": 0.8434650455927052, "grad_norm": 1.6784414052963257, "learning_rate": 4.306573576886485e-06, "loss": 0.41513025760650635, "mean_token_accuracy": 0.8432379364967346, "num_tokens": 9298475.0, "step": 1110 }, { "epoch": 0.8442249240121581, "grad_norm": 2.5437793731689453, "learning_rate": 4.305125222561736e-06, "loss": 0.5041637420654297, "mean_token_accuracy": 0.8487101793289185, "num_tokens": 9302877.0, "step": 1111 }, { "epoch": 0.8449848024316109, "grad_norm": 1.8304115533828735, "learning_rate": 4.303675601330618e-06, "loss": 0.3474113941192627, "mean_token_accuracy": 0.8625503182411194, "num_tokens": 9309182.0, "step": 1112 }, { "epoch": 0.8457446808510638, "grad_norm": 2.3505935668945312, "learning_rate": 4.302224714210532e-06, "loss": 0.49418818950653076, "mean_token_accuracy": 0.83094322681427, "num_tokens": 9313790.0, "step": 1113 }, { "epoch": 0.8465045592705167, "grad_norm": 1.8783690929412842, "learning_rate": 4.3007725622197675e-06, "loss": 0.5591700077056885, "mean_token_accuracy": 0.7972525954246521, "num_tokens": 9323041.0, "step": 1114 }, { "epoch": 0.8472644376899696, "grad_norm": 2.3615951538085938, "learning_rate": 4.2993191463775e-06, "loss": 0.35907042026519775, "mean_token_accuracy": 0.8688554167747498, "num_tokens": 9327223.0, "step": 1115 }, { "epoch": 0.8480243161094225, "grad_norm": 1.6533325910568237, "learning_rate": 4.29786446770379e-06, "loss": 0.39073771238327026, "mean_token_accuracy": 0.8625138401985168, "num_tokens": 9335931.0, "step": 1116 }, { "epoch": 0.8487841945288754, "grad_norm": 2.0792975425720215, "learning_rate": 4.296408527219592e-06, "loss": 0.5227385759353638, "mean_token_accuracy": 0.8199836015701294, "num_tokens": 9345266.0, "step": 1117 }, { "epoch": 0.8495440729483282, "grad_norm": 1.550100326538086, "learning_rate": 4.294951325946737e-06, "loss": 0.43780404329299927, "mean_token_accuracy": 0.8482115268707275, "num_tokens": 9355750.0, "step": 1118 }, { "epoch": 0.8503039513677811, "grad_norm": 1.7280632257461548, "learning_rate": 4.293492864907947e-06, "loss": 0.5182158946990967, "mean_token_accuracy": 0.8175588846206665, "num_tokens": 9367116.0, "step": 1119 }, { "epoch": 0.851063829787234, "grad_norm": 2.0790135860443115, "learning_rate": 4.2920331451268246e-06, "loss": 0.5188535451889038, "mean_token_accuracy": 0.829055905342102, "num_tokens": 9373996.0, "step": 1120 }, { "epoch": 0.851823708206687, "grad_norm": 2.044121026992798, "learning_rate": 4.2905721676278585e-06, "loss": 0.44214338064193726, "mean_token_accuracy": 0.8467649221420288, "num_tokens": 9380188.0, "step": 1121 }, { "epoch": 0.8525835866261399, "grad_norm": 2.081540822982788, "learning_rate": 4.28910993343642e-06, "loss": 0.4633803963661194, "mean_token_accuracy": 0.8333624005317688, "num_tokens": 9387116.0, "step": 1122 }, { "epoch": 0.8533434650455927, "grad_norm": 2.5046215057373047, "learning_rate": 4.2876464435787576e-06, "loss": 0.45873305201530457, "mean_token_accuracy": 0.8383087515830994, "num_tokens": 9391493.0, "step": 1123 }, { "epoch": 0.8541033434650456, "grad_norm": 1.7710175514221191, "learning_rate": 4.286181699082008e-06, "loss": 0.4575156271457672, "mean_token_accuracy": 0.8370684385299683, "num_tokens": 9400244.0, "step": 1124 }, { "epoch": 0.8548632218844985, "grad_norm": 1.600028395652771, "learning_rate": 4.284715700974186e-06, "loss": 0.46320274472236633, "mean_token_accuracy": 0.8326346278190613, "num_tokens": 9410726.0, "step": 1125 }, { "epoch": 0.8556231003039514, "grad_norm": 1.8016510009765625, "learning_rate": 4.283248450284182e-06, "loss": 0.5639452338218689, "mean_token_accuracy": 0.797832727432251, "num_tokens": 9421552.0, "step": 1126 }, { "epoch": 0.8563829787234043, "grad_norm": 1.537709355354309, "learning_rate": 4.281779948041772e-06, "loss": 0.43862688541412354, "mean_token_accuracy": 0.8395617008209229, "num_tokens": 9432254.0, "step": 1127 }, { "epoch": 0.8571428571428571, "grad_norm": 1.6650274991989136, "learning_rate": 4.280310195277606e-06, "loss": 0.43908995389938354, "mean_token_accuracy": 0.8384156227111816, "num_tokens": 9443706.0, "step": 1128 }, { "epoch": 0.85790273556231, "grad_norm": 1.6471102237701416, "learning_rate": 4.278839193023214e-06, "loss": 0.4040481150150299, "mean_token_accuracy": 0.854622483253479, "num_tokens": 9453625.0, "step": 1129 }, { "epoch": 0.8586626139817629, "grad_norm": 1.708938717842102, "learning_rate": 4.277366942311001e-06, "loss": 0.36128726601600647, "mean_token_accuracy": 0.8723762631416321, "num_tokens": 9461228.0, "step": 1130 }, { "epoch": 0.8594224924012158, "grad_norm": 2.156235456466675, "learning_rate": 4.2758934441742494e-06, "loss": 0.360919326543808, "mean_token_accuracy": 0.8592379093170166, "num_tokens": 9466453.0, "step": 1131 }, { "epoch": 0.8601823708206687, "grad_norm": 2.0993404388427734, "learning_rate": 4.274418699647117e-06, "loss": 0.4781187176704407, "mean_token_accuracy": 0.8291562795639038, "num_tokens": 9473082.0, "step": 1132 }, { "epoch": 0.8609422492401215, "grad_norm": 1.4520338773727417, "learning_rate": 4.272942709764638e-06, "loss": 0.4698053300380707, "mean_token_accuracy": 0.8345690369606018, "num_tokens": 9485702.0, "step": 1133 }, { "epoch": 0.8617021276595744, "grad_norm": 1.8159806728363037, "learning_rate": 4.271465475562716e-06, "loss": 0.5262610912322998, "mean_token_accuracy": 0.8145499229431152, "num_tokens": 9494114.0, "step": 1134 }, { "epoch": 0.8624620060790273, "grad_norm": 2.170057535171509, "learning_rate": 4.269986998078132e-06, "loss": 0.5047216415405273, "mean_token_accuracy": 0.8333064317703247, "num_tokens": 9501322.0, "step": 1135 }, { "epoch": 0.8632218844984803, "grad_norm": 2.1477274894714355, "learning_rate": 4.268507278348539e-06, "loss": 0.571904182434082, "mean_token_accuracy": 0.786806046962738, "num_tokens": 9509542.0, "step": 1136 }, { "epoch": 0.8639817629179332, "grad_norm": 2.0406765937805176, "learning_rate": 4.2670263174124615e-06, "loss": 0.5546085238456726, "mean_token_accuracy": 0.8048046231269836, "num_tokens": 9516747.0, "step": 1137 }, { "epoch": 0.8647416413373861, "grad_norm": 1.9809539318084717, "learning_rate": 4.265544116309294e-06, "loss": 0.5239611864089966, "mean_token_accuracy": 0.8208121061325073, "num_tokens": 9524735.0, "step": 1138 }, { "epoch": 0.8655015197568389, "grad_norm": 2.7376081943511963, "learning_rate": 4.264060676079302e-06, "loss": 0.40633490681648254, "mean_token_accuracy": 0.8522694110870361, "num_tokens": 9528625.0, "step": 1139 }, { "epoch": 0.8662613981762918, "grad_norm": 2.4009523391723633, "learning_rate": 4.262575997763622e-06, "loss": 0.4452684223651886, "mean_token_accuracy": 0.8643718957901001, "num_tokens": 9533440.0, "step": 1140 }, { "epoch": 0.8670212765957447, "grad_norm": 1.9708255529403687, "learning_rate": 4.2610900824042575e-06, "loss": 0.44666150212287903, "mean_token_accuracy": 0.8368004560470581, "num_tokens": 9540148.0, "step": 1141 }, { "epoch": 0.8677811550151976, "grad_norm": 2.6039879322052, "learning_rate": 4.2596029310440826e-06, "loss": 0.5472103357315063, "mean_token_accuracy": 0.8208951354026794, "num_tokens": 9544815.0, "step": 1142 }, { "epoch": 0.8685410334346505, "grad_norm": 2.0362653732299805, "learning_rate": 4.258114544726835e-06, "loss": 0.37390074133872986, "mean_token_accuracy": 0.8741003274917603, "num_tokens": 9550444.0, "step": 1143 }, { "epoch": 0.8693009118541033, "grad_norm": 2.0087006092071533, "learning_rate": 4.256624924497124e-06, "loss": 0.389972448348999, "mean_token_accuracy": 0.865405261516571, "num_tokens": 9556925.0, "step": 1144 }, { "epoch": 0.8700607902735562, "grad_norm": 1.8859443664550781, "learning_rate": 4.25513407140042e-06, "loss": 0.3937550485134125, "mean_token_accuracy": 0.8506488800048828, "num_tokens": 9563105.0, "step": 1145 }, { "epoch": 0.8708206686930091, "grad_norm": 1.919185996055603, "learning_rate": 4.253641986483063e-06, "loss": 0.536502480506897, "mean_token_accuracy": 0.8274021148681641, "num_tokens": 9573580.0, "step": 1146 }, { "epoch": 0.871580547112462, "grad_norm": 1.3758544921875, "learning_rate": 4.2521486707922545e-06, "loss": 0.3626449406147003, "mean_token_accuracy": 0.8701262474060059, "num_tokens": 9583590.0, "step": 1147 }, { "epoch": 0.8723404255319149, "grad_norm": 1.4506094455718994, "learning_rate": 4.250654125376062e-06, "loss": 0.44432333111763, "mean_token_accuracy": 0.8455360531806946, "num_tokens": 9595693.0, "step": 1148 }, { "epoch": 0.8731003039513677, "grad_norm": 2.1039600372314453, "learning_rate": 4.249158351283414e-06, "loss": 0.38472849130630493, "mean_token_accuracy": 0.8649969100952148, "num_tokens": 9601707.0, "step": 1149 }, { "epoch": 0.8738601823708206, "grad_norm": 2.532032012939453, "learning_rate": 4.247661349564103e-06, "loss": 0.3970704972743988, "mean_token_accuracy": 0.8700840473175049, "num_tokens": 9606087.0, "step": 1150 }, { "epoch": 0.8746200607902735, "grad_norm": 1.7671585083007812, "learning_rate": 4.246163121268782e-06, "loss": 0.6075464487075806, "mean_token_accuracy": 0.8013469576835632, "num_tokens": 9618421.0, "step": 1151 }, { "epoch": 0.8753799392097265, "grad_norm": 2.5799081325531006, "learning_rate": 4.244663667448965e-06, "loss": 0.4843537509441376, "mean_token_accuracy": 0.8334331512451172, "num_tokens": 9623155.0, "step": 1152 }, { "epoch": 0.8761398176291794, "grad_norm": 1.2230721712112427, "learning_rate": 4.243162989157027e-06, "loss": 0.4277368485927582, "mean_token_accuracy": 0.8391464948654175, "num_tokens": 9639910.0, "step": 1153 }, { "epoch": 0.8768996960486323, "grad_norm": 1.9651098251342773, "learning_rate": 4.241661087446202e-06, "loss": 0.41046345233917236, "mean_token_accuracy": 0.8746967315673828, "num_tokens": 9645752.0, "step": 1154 }, { "epoch": 0.8776595744680851, "grad_norm": 2.1911747455596924, "learning_rate": 4.240157963370583e-06, "loss": 0.4212362766265869, "mean_token_accuracy": 0.8835186958312988, "num_tokens": 9651471.0, "step": 1155 }, { "epoch": 0.878419452887538, "grad_norm": 2.775716781616211, "learning_rate": 4.2386536179851175e-06, "loss": 0.48673486709594727, "mean_token_accuracy": 0.8401803374290466, "num_tokens": 9655139.0, "step": 1156 }, { "epoch": 0.8791793313069909, "grad_norm": 1.8965184688568115, "learning_rate": 4.2371480523456156e-06, "loss": 0.43706369400024414, "mean_token_accuracy": 0.8456496000289917, "num_tokens": 9662543.0, "step": 1157 }, { "epoch": 0.8799392097264438, "grad_norm": 2.1818747520446777, "learning_rate": 4.235641267508741e-06, "loss": 0.4406173825263977, "mean_token_accuracy": 0.8320701122283936, "num_tokens": 9668634.0, "step": 1158 }, { "epoch": 0.8806990881458967, "grad_norm": 2.274165153503418, "learning_rate": 4.234133264532012e-06, "loss": 0.3783491551876068, "mean_token_accuracy": 0.8656293749809265, "num_tokens": 9673012.0, "step": 1159 }, { "epoch": 0.8814589665653495, "grad_norm": 1.2731448411941528, "learning_rate": 4.232624044473805e-06, "loss": 0.3853245675563812, "mean_token_accuracy": 0.8399971127510071, "num_tokens": 9688294.0, "step": 1160 }, { "epoch": 0.8822188449848024, "grad_norm": 2.39668345451355, "learning_rate": 4.231113608393348e-06, "loss": 0.4661729633808136, "mean_token_accuracy": 0.840674638748169, "num_tokens": 9692995.0, "step": 1161 }, { "epoch": 0.8829787234042553, "grad_norm": 1.8885360956192017, "learning_rate": 4.229601957350722e-06, "loss": 0.5057909488677979, "mean_token_accuracy": 0.8230470418930054, "num_tokens": 9702390.0, "step": 1162 }, { "epoch": 0.8837386018237082, "grad_norm": 2.929244041442871, "learning_rate": 4.228089092406863e-06, "loss": 0.4609951674938202, "mean_token_accuracy": 0.8461986184120178, "num_tokens": 9705747.0, "step": 1163 }, { "epoch": 0.8844984802431611, "grad_norm": 2.2482638359069824, "learning_rate": 4.226575014623557e-06, "loss": 0.423616886138916, "mean_token_accuracy": 0.8468259572982788, "num_tokens": 9710930.0, "step": 1164 }, { "epoch": 0.8852583586626139, "grad_norm": 1.9508283138275146, "learning_rate": 4.225059725063444e-06, "loss": 0.5117239952087402, "mean_token_accuracy": 0.8230769038200378, "num_tokens": 9721006.0, "step": 1165 }, { "epoch": 0.8860182370820668, "grad_norm": 1.3771848678588867, "learning_rate": 4.22354322479001e-06, "loss": 0.42557036876678467, "mean_token_accuracy": 0.8458951711654663, "num_tokens": 9734669.0, "step": 1166 }, { "epoch": 0.8867781155015197, "grad_norm": 1.4104970693588257, "learning_rate": 4.222025514867596e-06, "loss": 0.42501896619796753, "mean_token_accuracy": 0.847164511680603, "num_tokens": 9749068.0, "step": 1167 }, { "epoch": 0.8875379939209727, "grad_norm": 3.016329050064087, "learning_rate": 4.220506596361387e-06, "loss": 0.5849587321281433, "mean_token_accuracy": 0.8366367816925049, "num_tokens": 9752946.0, "step": 1168 }, { "epoch": 0.8882978723404256, "grad_norm": 2.2328603267669678, "learning_rate": 4.218986470337419e-06, "loss": 0.5236795544624329, "mean_token_accuracy": 0.8177340626716614, "num_tokens": 9759186.0, "step": 1169 }, { "epoch": 0.8890577507598785, "grad_norm": 1.9724984169006348, "learning_rate": 4.217465137862575e-06, "loss": 0.5107414126396179, "mean_token_accuracy": 0.8174212574958801, "num_tokens": 9769609.0, "step": 1170 }, { "epoch": 0.8898176291793313, "grad_norm": 2.0076792240142822, "learning_rate": 4.215942600004586e-06, "loss": 0.4352399408817291, "mean_token_accuracy": 0.8545210361480713, "num_tokens": 9775924.0, "step": 1171 }, { "epoch": 0.8905775075987842, "grad_norm": 3.449615240097046, "learning_rate": 4.214418857832025e-06, "loss": 0.4250393211841583, "mean_token_accuracy": 0.8417866826057434, "num_tokens": 9778538.0, "step": 1172 }, { "epoch": 0.8913373860182371, "grad_norm": 1.9759560823440552, "learning_rate": 4.212893912414316e-06, "loss": 0.363790899515152, "mean_token_accuracy": 0.8873265981674194, "num_tokens": 9785470.0, "step": 1173 }, { "epoch": 0.89209726443769, "grad_norm": 1.957425594329834, "learning_rate": 4.211367764821722e-06, "loss": 0.5132498145103455, "mean_token_accuracy": 0.8276957273483276, "num_tokens": 9793590.0, "step": 1174 }, { "epoch": 0.8928571428571429, "grad_norm": 1.4993329048156738, "learning_rate": 4.209840416125353e-06, "loss": 0.4122268855571747, "mean_token_accuracy": 0.8521242737770081, "num_tokens": 9808072.0, "step": 1175 }, { "epoch": 0.8936170212765957, "grad_norm": 3.785543203353882, "learning_rate": 4.208311867397162e-06, "loss": 0.500885546207428, "mean_token_accuracy": 0.823648989200592, "num_tokens": 9810695.0, "step": 1176 }, { "epoch": 0.8943768996960486, "grad_norm": 2.0256595611572266, "learning_rate": 4.206782119709942e-06, "loss": 0.46186625957489014, "mean_token_accuracy": 0.8353452682495117, "num_tokens": 9817139.0, "step": 1177 }, { "epoch": 0.8951367781155015, "grad_norm": 1.8804218769073486, "learning_rate": 4.205251174137329e-06, "loss": 0.4735252261161804, "mean_token_accuracy": 0.829591691493988, "num_tokens": 9824537.0, "step": 1178 }, { "epoch": 0.8958966565349544, "grad_norm": 1.2467740774154663, "learning_rate": 4.2037190317538e-06, "loss": 0.48929882049560547, "mean_token_accuracy": 0.8184918165206909, "num_tokens": 9842195.0, "step": 1179 }, { "epoch": 0.8966565349544073, "grad_norm": 1.892195463180542, "learning_rate": 4.202185693634671e-06, "loss": 0.4691667854785919, "mean_token_accuracy": 0.8320895433425903, "num_tokens": 9850414.0, "step": 1180 }, { "epoch": 0.8974164133738601, "grad_norm": 1.490969181060791, "learning_rate": 4.200651160856099e-06, "loss": 0.39653098583221436, "mean_token_accuracy": 0.8524215817451477, "num_tokens": 9860301.0, "step": 1181 }, { "epoch": 0.898176291793313, "grad_norm": 2.427473545074463, "learning_rate": 4.1991154344950755e-06, "loss": 0.6368988156318665, "mean_token_accuracy": 0.7824795842170715, "num_tokens": 9868461.0, "step": 1182 }, { "epoch": 0.898936170212766, "grad_norm": 1.3861969709396362, "learning_rate": 4.197578515629435e-06, "loss": 0.43705660104751587, "mean_token_accuracy": 0.8433641195297241, "num_tokens": 9881573.0, "step": 1183 }, { "epoch": 0.8996960486322189, "grad_norm": 2.453946828842163, "learning_rate": 4.196040405337846e-06, "loss": 0.5822976231575012, "mean_token_accuracy": 0.8046828508377075, "num_tokens": 9887644.0, "step": 1184 }, { "epoch": 0.9004559270516718, "grad_norm": 2.364750385284424, "learning_rate": 4.194501104699813e-06, "loss": 0.43474334478378296, "mean_token_accuracy": 0.8531765937805176, "num_tokens": 9892590.0, "step": 1185 }, { "epoch": 0.9012158054711246, "grad_norm": 1.9483357667922974, "learning_rate": 4.192960614795676e-06, "loss": 0.46638405323028564, "mean_token_accuracy": 0.8380693197250366, "num_tokens": 9900462.0, "step": 1186 }, { "epoch": 0.9019756838905775, "grad_norm": 2.304919719696045, "learning_rate": 4.19141893670661e-06, "loss": 0.34754201769828796, "mean_token_accuracy": 0.8617918491363525, "num_tokens": 9905206.0, "step": 1187 }, { "epoch": 0.9027355623100304, "grad_norm": 2.311220645904541, "learning_rate": 4.189876071514624e-06, "loss": 0.47741419076919556, "mean_token_accuracy": 0.8241099119186401, "num_tokens": 9912405.0, "step": 1188 }, { "epoch": 0.9034954407294833, "grad_norm": 1.6040949821472168, "learning_rate": 4.188332020302561e-06, "loss": 0.4612511098384857, "mean_token_accuracy": 0.8551813364028931, "num_tokens": 9924688.0, "step": 1189 }, { "epoch": 0.9042553191489362, "grad_norm": 1.1393001079559326, "learning_rate": 4.186786784154096e-06, "loss": 0.3188606798648834, "mean_token_accuracy": 0.8754400610923767, "num_tokens": 9941307.0, "step": 1190 }, { "epoch": 0.9050151975683891, "grad_norm": 2.5400614738464355, "learning_rate": 4.1852403641537344e-06, "loss": 0.6303950548171997, "mean_token_accuracy": 0.7868539094924927, "num_tokens": 9948126.0, "step": 1191 }, { "epoch": 0.9057750759878419, "grad_norm": 2.0462563037872314, "learning_rate": 4.183692761386813e-06, "loss": 0.5419400930404663, "mean_token_accuracy": 0.8054205179214478, "num_tokens": 9956872.0, "step": 1192 }, { "epoch": 0.9065349544072948, "grad_norm": 1.4389076232910156, "learning_rate": 4.1821439769395e-06, "loss": 0.535871148109436, "mean_token_accuracy": 0.8235303163528442, "num_tokens": 9972294.0, "step": 1193 }, { "epoch": 0.9072948328267477, "grad_norm": 1.9653375148773193, "learning_rate": 4.180594011898791e-06, "loss": 0.45112964510917664, "mean_token_accuracy": 0.8362979888916016, "num_tokens": 9979369.0, "step": 1194 }, { "epoch": 0.9080547112462006, "grad_norm": 1.4190444946289062, "learning_rate": 4.1790428673525104e-06, "loss": 0.46389588713645935, "mean_token_accuracy": 0.8349757194519043, "num_tokens": 9993119.0, "step": 1195 }, { "epoch": 0.9088145896656535, "grad_norm": 2.262644052505493, "learning_rate": 4.177490544389313e-06, "loss": 0.4918975234031677, "mean_token_accuracy": 0.8344092965126038, "num_tokens": 9999478.0, "step": 1196 }, { "epoch": 0.9095744680851063, "grad_norm": 2.223912239074707, "learning_rate": 4.175937044098678e-06, "loss": 0.4718816578388214, "mean_token_accuracy": 0.8661066293716431, "num_tokens": 10004471.0, "step": 1197 }, { "epoch": 0.9103343465045592, "grad_norm": 1.6830681562423706, "learning_rate": 4.1743823675709115e-06, "loss": 0.3257449269294739, "mean_token_accuracy": 0.8767329454421997, "num_tokens": 10011467.0, "step": 1198 }, { "epoch": 0.9110942249240122, "grad_norm": 1.4707611799240112, "learning_rate": 4.172826515897146e-06, "loss": 0.38312655687332153, "mean_token_accuracy": 0.8530467748641968, "num_tokens": 10020594.0, "step": 1199 }, { "epoch": 0.9118541033434651, "grad_norm": 1.79310941696167, "learning_rate": 4.171269490169337e-06, "loss": 0.4557180404663086, "mean_token_accuracy": 0.8370726108551025, "num_tokens": 10028665.0, "step": 1200 }, { "epoch": 0.912613981762918, "grad_norm": 1.792206048965454, "learning_rate": 4.1697112914802665e-06, "loss": 0.5198315382003784, "mean_token_accuracy": 0.8252081871032715, "num_tokens": 10038161.0, "step": 1201 }, { "epoch": 0.9133738601823708, "grad_norm": 2.550187826156616, "learning_rate": 4.168151920923536e-06, "loss": 0.3525851368904114, "mean_token_accuracy": 0.8683363199234009, "num_tokens": 10041933.0, "step": 1202 }, { "epoch": 0.9141337386018237, "grad_norm": 2.041205406188965, "learning_rate": 4.1665913795935755e-06, "loss": 0.5075835585594177, "mean_token_accuracy": 0.8277581334114075, "num_tokens": 10052746.0, "step": 1203 }, { "epoch": 0.9148936170212766, "grad_norm": 2.9608535766601562, "learning_rate": 4.16502966858563e-06, "loss": 0.548503041267395, "mean_token_accuracy": 0.8154154419898987, "num_tokens": 10056917.0, "step": 1204 }, { "epoch": 0.9156534954407295, "grad_norm": 1.7683377265930176, "learning_rate": 4.163466788995768e-06, "loss": 0.5241387486457825, "mean_token_accuracy": 0.8099660873413086, "num_tokens": 10066813.0, "step": 1205 }, { "epoch": 0.9164133738601824, "grad_norm": 2.1658616065979004, "learning_rate": 4.161902741920881e-06, "loss": 0.47508829832077026, "mean_token_accuracy": 0.8315448760986328, "num_tokens": 10073419.0, "step": 1206 }, { "epoch": 0.9171732522796353, "grad_norm": 2.8079240322113037, "learning_rate": 4.160337528458676e-06, "loss": 0.46324583888053894, "mean_token_accuracy": 0.839920163154602, "num_tokens": 10077462.0, "step": 1207 }, { "epoch": 0.9179331306990881, "grad_norm": 1.479222297668457, "learning_rate": 4.15877114970768e-06, "loss": 0.4864335060119629, "mean_token_accuracy": 0.8332942724227905, "num_tokens": 10088906.0, "step": 1208 }, { "epoch": 0.918693009118541, "grad_norm": 1.8758645057678223, "learning_rate": 4.1572036067672386e-06, "loss": 0.5495859384536743, "mean_token_accuracy": 0.8107678294181824, "num_tokens": 10098744.0, "step": 1209 }, { "epoch": 0.9194528875379939, "grad_norm": 2.0267460346221924, "learning_rate": 4.155634900737513e-06, "loss": 0.5154146552085876, "mean_token_accuracy": 0.821948766708374, "num_tokens": 10106158.0, "step": 1210 }, { "epoch": 0.9202127659574468, "grad_norm": 2.3938241004943848, "learning_rate": 4.154065032719482e-06, "loss": 0.6861064434051514, "mean_token_accuracy": 0.7609241008758545, "num_tokens": 10113098.0, "step": 1211 }, { "epoch": 0.9209726443768997, "grad_norm": 1.5178842544555664, "learning_rate": 4.152494003814939e-06, "loss": 0.5226979851722717, "mean_token_accuracy": 0.8066086173057556, "num_tokens": 10129990.0, "step": 1212 }, { "epoch": 0.9217325227963525, "grad_norm": 1.6676033735275269, "learning_rate": 4.150921815126493e-06, "loss": 0.5749239325523376, "mean_token_accuracy": 0.8054403066635132, "num_tokens": 10142231.0, "step": 1213 }, { "epoch": 0.9224924012158054, "grad_norm": 2.393019914627075, "learning_rate": 4.149348467757566e-06, "loss": 0.37678688764572144, "mean_token_accuracy": 0.872439980506897, "num_tokens": 10146097.0, "step": 1214 }, { "epoch": 0.9232522796352584, "grad_norm": 1.5756078958511353, "learning_rate": 4.147773962812393e-06, "loss": 0.4058607220649719, "mean_token_accuracy": 0.8462677001953125, "num_tokens": 10158948.0, "step": 1215 }, { "epoch": 0.9240121580547113, "grad_norm": 2.3917694091796875, "learning_rate": 4.146198301396025e-06, "loss": 0.30616143345832825, "mean_token_accuracy": 0.8909944295883179, "num_tokens": 10163049.0, "step": 1216 }, { "epoch": 0.9247720364741642, "grad_norm": 1.7957452535629272, "learning_rate": 4.14462148461432e-06, "loss": 0.4330103099346161, "mean_token_accuracy": 0.8597830533981323, "num_tokens": 10170827.0, "step": 1217 }, { "epoch": 0.925531914893617, "grad_norm": 1.7771528959274292, "learning_rate": 4.143043513573949e-06, "loss": 0.48676225543022156, "mean_token_accuracy": 0.8317562937736511, "num_tokens": 10179928.0, "step": 1218 }, { "epoch": 0.9262917933130699, "grad_norm": 1.4172714948654175, "learning_rate": 4.141464389382392e-06, "loss": 0.5394001603126526, "mean_token_accuracy": 0.8138734102249146, "num_tokens": 10195894.0, "step": 1219 }, { "epoch": 0.9270516717325228, "grad_norm": 2.5023019313812256, "learning_rate": 4.13988411314794e-06, "loss": 0.5614622831344604, "mean_token_accuracy": 0.8114494681358337, "num_tokens": 10201574.0, "step": 1220 }, { "epoch": 0.9278115501519757, "grad_norm": 1.193745493888855, "learning_rate": 4.13830268597969e-06, "loss": 0.3519720137119293, "mean_token_accuracy": 0.8516223430633545, "num_tokens": 10217089.0, "step": 1221 }, { "epoch": 0.9285714285714286, "grad_norm": 2.1776585578918457, "learning_rate": 4.136720108987552e-06, "loss": 0.411466121673584, "mean_token_accuracy": 0.8498533964157104, "num_tokens": 10222290.0, "step": 1222 }, { "epoch": 0.9293313069908815, "grad_norm": 2.0673067569732666, "learning_rate": 4.1351363832822364e-06, "loss": 0.49273067712783813, "mean_token_accuracy": 0.8337151408195496, "num_tokens": 10230839.0, "step": 1223 }, { "epoch": 0.9300911854103343, "grad_norm": 1.2920328378677368, "learning_rate": 4.133551509975264e-06, "loss": 0.37604668736457825, "mean_token_accuracy": 0.8674043416976929, "num_tokens": 10244007.0, "step": 1224 }, { "epoch": 0.9308510638297872, "grad_norm": 1.5773303508758545, "learning_rate": 4.13196549017896e-06, "loss": 0.4213399887084961, "mean_token_accuracy": 0.846172034740448, "num_tokens": 10252530.0, "step": 1225 }, { "epoch": 0.9316109422492401, "grad_norm": 2.4205141067504883, "learning_rate": 4.130378325006453e-06, "loss": 0.47577419877052307, "mean_token_accuracy": 0.8168601989746094, "num_tokens": 10257253.0, "step": 1226 }, { "epoch": 0.932370820668693, "grad_norm": 1.8071359395980835, "learning_rate": 4.128790015571679e-06, "loss": 0.4768773317337036, "mean_token_accuracy": 0.8396903276443481, "num_tokens": 10264393.0, "step": 1227 }, { "epoch": 0.9331306990881459, "grad_norm": 1.382286548614502, "learning_rate": 4.127200562989372e-06, "loss": 0.3791172504425049, "mean_token_accuracy": 0.8651294112205505, "num_tokens": 10275987.0, "step": 1228 }, { "epoch": 0.9338905775075987, "grad_norm": 1.3842641115188599, "learning_rate": 4.125609968375073e-06, "loss": 0.47957515716552734, "mean_token_accuracy": 0.836072564125061, "num_tokens": 10292697.0, "step": 1229 }, { "epoch": 0.9346504559270516, "grad_norm": 1.9120399951934814, "learning_rate": 4.12401823284512e-06, "loss": 0.47652554512023926, "mean_token_accuracy": 0.8334099054336548, "num_tokens": 10300240.0, "step": 1230 }, { "epoch": 0.9354103343465046, "grad_norm": 1.3549633026123047, "learning_rate": 4.122425357516658e-06, "loss": 0.4168136715888977, "mean_token_accuracy": 0.8630322217941284, "num_tokens": 10314813.0, "step": 1231 }, { "epoch": 0.9361702127659575, "grad_norm": 2.5783302783966064, "learning_rate": 4.1208313435076255e-06, "loss": 0.35690945386886597, "mean_token_accuracy": 0.8721753358840942, "num_tokens": 10319712.0, "step": 1232 }, { "epoch": 0.9369300911854104, "grad_norm": 1.4351831674575806, "learning_rate": 4.119236191936764e-06, "loss": 0.5177080631256104, "mean_token_accuracy": 0.8338629007339478, "num_tokens": 10332993.0, "step": 1233 }, { "epoch": 0.9376899696048632, "grad_norm": 2.584526777267456, "learning_rate": 4.117639903923611e-06, "loss": 0.4224364757537842, "mean_token_accuracy": 0.8632140159606934, "num_tokens": 10336596.0, "step": 1234 }, { "epoch": 0.9384498480243161, "grad_norm": 1.6222294569015503, "learning_rate": 4.116042480588505e-06, "loss": 0.41391122341156006, "mean_token_accuracy": 0.8508138060569763, "num_tokens": 10345818.0, "step": 1235 }, { "epoch": 0.939209726443769, "grad_norm": 1.2619924545288086, "learning_rate": 4.114443923052577e-06, "loss": 0.3215915858745575, "mean_token_accuracy": 0.8670998811721802, "num_tokens": 10357409.0, "step": 1236 }, { "epoch": 0.9399696048632219, "grad_norm": 2.04646372795105, "learning_rate": 4.112844232437757e-06, "loss": 0.5404696464538574, "mean_token_accuracy": 0.8163861036300659, "num_tokens": 10364746.0, "step": 1237 }, { "epoch": 0.9407294832826748, "grad_norm": 1.3311513662338257, "learning_rate": 4.11124340986677e-06, "loss": 0.413362592458725, "mean_token_accuracy": 0.8577026724815369, "num_tokens": 10378665.0, "step": 1238 }, { "epoch": 0.9414893617021277, "grad_norm": 2.222412347793579, "learning_rate": 4.109641456463135e-06, "loss": 0.520854651927948, "mean_token_accuracy": 0.8256869912147522, "num_tokens": 10384012.0, "step": 1239 }, { "epoch": 0.9422492401215805, "grad_norm": 1.3949522972106934, "learning_rate": 4.108038373351163e-06, "loss": 0.4833412766456604, "mean_token_accuracy": 0.8231813907623291, "num_tokens": 10399015.0, "step": 1240 }, { "epoch": 0.9430091185410334, "grad_norm": 1.9517524242401123, "learning_rate": 4.106434161655962e-06, "loss": 0.45570656657218933, "mean_token_accuracy": 0.8405855298042297, "num_tokens": 10406191.0, "step": 1241 }, { "epoch": 0.9437689969604863, "grad_norm": 2.067592144012451, "learning_rate": 4.104828822503427e-06, "loss": 0.37799692153930664, "mean_token_accuracy": 0.8638370037078857, "num_tokens": 10412511.0, "step": 1242 }, { "epoch": 0.9445288753799392, "grad_norm": 1.7214694023132324, "learning_rate": 4.103222357020248e-06, "loss": 0.5492304563522339, "mean_token_accuracy": 0.8060051202774048, "num_tokens": 10424108.0, "step": 1243 }, { "epoch": 0.9452887537993921, "grad_norm": 2.3554961681365967, "learning_rate": 4.101614766333904e-06, "loss": 0.5702956914901733, "mean_token_accuracy": 0.7996999621391296, "num_tokens": 10430318.0, "step": 1244 }, { "epoch": 0.9460486322188449, "grad_norm": 1.6170350313186646, "learning_rate": 4.100006051572664e-06, "loss": 0.5266674160957336, "mean_token_accuracy": 0.8128297924995422, "num_tokens": 10440753.0, "step": 1245 }, { "epoch": 0.9468085106382979, "grad_norm": 1.9185911417007446, "learning_rate": 4.098396213865587e-06, "loss": 0.4864170551300049, "mean_token_accuracy": 0.8325300216674805, "num_tokens": 10448735.0, "step": 1246 }, { "epoch": 0.9475683890577508, "grad_norm": 1.6527823209762573, "learning_rate": 4.096785254342518e-06, "loss": 0.554766058921814, "mean_token_accuracy": 0.8109785318374634, "num_tokens": 10459911.0, "step": 1247 }, { "epoch": 0.9483282674772037, "grad_norm": 1.9447654485702515, "learning_rate": 4.095173174134091e-06, "loss": 0.4387738108634949, "mean_token_accuracy": 0.8387192487716675, "num_tokens": 10467265.0, "step": 1248 }, { "epoch": 0.9490881458966566, "grad_norm": 2.8531999588012695, "learning_rate": 4.093559974371725e-06, "loss": 0.4558791220188141, "mean_token_accuracy": 0.847366452217102, "num_tokens": 10470943.0, "step": 1249 }, { "epoch": 0.9498480243161094, "grad_norm": 2.2909798622131348, "learning_rate": 4.091945656187626e-06, "loss": 0.5094764828681946, "mean_token_accuracy": 0.8262864351272583, "num_tokens": 10476760.0, "step": 1250 }, { "epoch": 0.9506079027355623, "grad_norm": 1.796873688697815, "learning_rate": 4.090330220714785e-06, "loss": 0.40243974328041077, "mean_token_accuracy": 0.8680180311203003, "num_tokens": 10484044.0, "step": 1251 }, { "epoch": 0.9513677811550152, "grad_norm": 2.9503610134124756, "learning_rate": 4.0887136690869774e-06, "loss": 0.3909955620765686, "mean_token_accuracy": 0.8607697486877441, "num_tokens": 10487252.0, "step": 1252 }, { "epoch": 0.9521276595744681, "grad_norm": 2.81465744972229, "learning_rate": 4.08709600243876e-06, "loss": 0.35796794295310974, "mean_token_accuracy": 0.8860852718353271, "num_tokens": 10490424.0, "step": 1253 }, { "epoch": 0.952887537993921, "grad_norm": 1.9864373207092285, "learning_rate": 4.0854772219054735e-06, "loss": 0.5162959098815918, "mean_token_accuracy": 0.8130340576171875, "num_tokens": 10497965.0, "step": 1254 }, { "epoch": 0.9536474164133738, "grad_norm": 2.133432626724243, "learning_rate": 4.083857328623243e-06, "loss": 0.43794286251068115, "mean_token_accuracy": 0.8515105247497559, "num_tokens": 10503301.0, "step": 1255 }, { "epoch": 0.9544072948328267, "grad_norm": 1.7956633567810059, "learning_rate": 4.082236323728969e-06, "loss": 0.5198944211006165, "mean_token_accuracy": 0.8098677396774292, "num_tokens": 10511885.0, "step": 1256 }, { "epoch": 0.9551671732522796, "grad_norm": 1.925458550453186, "learning_rate": 4.0806142083603365e-06, "loss": 0.5314708352088928, "mean_token_accuracy": 0.8127579689025879, "num_tokens": 10519853.0, "step": 1257 }, { "epoch": 0.9559270516717325, "grad_norm": 1.7214282751083374, "learning_rate": 4.078990983655807e-06, "loss": 0.4490503668785095, "mean_token_accuracy": 0.8384820222854614, "num_tokens": 10528267.0, "step": 1258 }, { "epoch": 0.9566869300911854, "grad_norm": 1.8310840129852295, "learning_rate": 4.077366650754624e-06, "loss": 0.37812933325767517, "mean_token_accuracy": 0.8502515554428101, "num_tokens": 10535052.0, "step": 1259 }, { "epoch": 0.9574468085106383, "grad_norm": 1.756135106086731, "learning_rate": 4.075741210796806e-06, "loss": 0.39727646112442017, "mean_token_accuracy": 0.8540193438529968, "num_tokens": 10544095.0, "step": 1260 }, { "epoch": 0.9582066869300911, "grad_norm": 2.4405837059020996, "learning_rate": 4.07411466492315e-06, "loss": 0.4290313422679901, "mean_token_accuracy": 0.8591489791870117, "num_tokens": 10548589.0, "step": 1261 }, { "epoch": 0.958966565349544, "grad_norm": 2.362673759460449, "learning_rate": 4.072487014275228e-06, "loss": 0.39794155955314636, "mean_token_accuracy": 0.8577547073364258, "num_tokens": 10552947.0, "step": 1262 }, { "epoch": 0.959726443768997, "grad_norm": 1.691258192062378, "learning_rate": 4.070858259995388e-06, "loss": 0.5157690048217773, "mean_token_accuracy": 0.8210811614990234, "num_tokens": 10565805.0, "step": 1263 }, { "epoch": 0.9604863221884499, "grad_norm": 2.1436381340026855, "learning_rate": 4.069228403226751e-06, "loss": 0.44898298382759094, "mean_token_accuracy": 0.8381158113479614, "num_tokens": 10571611.0, "step": 1264 }, { "epoch": 0.9612462006079028, "grad_norm": 2.1539013385772705, "learning_rate": 4.067597445113216e-06, "loss": 0.4755209684371948, "mean_token_accuracy": 0.8289396166801453, "num_tokens": 10577419.0, "step": 1265 }, { "epoch": 0.9620060790273556, "grad_norm": 1.4557626247406006, "learning_rate": 4.06596538679945e-06, "loss": 0.4612187147140503, "mean_token_accuracy": 0.8334665298461914, "num_tokens": 10588902.0, "step": 1266 }, { "epoch": 0.9627659574468085, "grad_norm": 1.5206162929534912, "learning_rate": 4.064332229430895e-06, "loss": 0.35257554054260254, "mean_token_accuracy": 0.8940200805664062, "num_tokens": 10597638.0, "step": 1267 }, { "epoch": 0.9635258358662614, "grad_norm": 1.2261472940444946, "learning_rate": 4.062697974153764e-06, "loss": 0.33114510774612427, "mean_token_accuracy": 0.8718854784965515, "num_tokens": 10611311.0, "step": 1268 }, { "epoch": 0.9642857142857143, "grad_norm": 1.5283033847808838, "learning_rate": 4.06106262211504e-06, "loss": 0.41054391860961914, "mean_token_accuracy": 0.8406581878662109, "num_tokens": 10620846.0, "step": 1269 }, { "epoch": 0.9650455927051672, "grad_norm": 1.9345989227294922, "learning_rate": 4.059426174462476e-06, "loss": 0.5803008079528809, "mean_token_accuracy": 0.7999672889709473, "num_tokens": 10633034.0, "step": 1270 }, { "epoch": 0.96580547112462, "grad_norm": 1.9317878484725952, "learning_rate": 4.057788632344594e-06, "loss": 0.45455628633499146, "mean_token_accuracy": 0.8372423648834229, "num_tokens": 10640401.0, "step": 1271 }, { "epoch": 0.9665653495440729, "grad_norm": 1.8291925191879272, "learning_rate": 4.056149996910683e-06, "loss": 0.331752747297287, "mean_token_accuracy": 0.8972792625427246, "num_tokens": 10646491.0, "step": 1272 }, { "epoch": 0.9673252279635258, "grad_norm": 1.5805989503860474, "learning_rate": 4.054510269310803e-06, "loss": 0.5045905709266663, "mean_token_accuracy": 0.8284016251564026, "num_tokens": 10657822.0, "step": 1273 }, { "epoch": 0.9680851063829787, "grad_norm": 1.5921201705932617, "learning_rate": 4.052869450695776e-06, "loss": 0.43583858013153076, "mean_token_accuracy": 0.8400107622146606, "num_tokens": 10670025.0, "step": 1274 }, { "epoch": 0.9688449848024316, "grad_norm": 1.4225271940231323, "learning_rate": 4.051227542217192e-06, "loss": 0.5606710910797119, "mean_token_accuracy": 0.8063828945159912, "num_tokens": 10685524.0, "step": 1275 }, { "epoch": 0.9696048632218845, "grad_norm": 1.4348243474960327, "learning_rate": 4.049584545027406e-06, "loss": 0.4206877648830414, "mean_token_accuracy": 0.8815262317657471, "num_tokens": 10697750.0, "step": 1276 }, { "epoch": 0.9703647416413373, "grad_norm": 1.8710068464279175, "learning_rate": 4.047940460279537e-06, "loss": 0.48352116346359253, "mean_token_accuracy": 0.8380029201507568, "num_tokens": 10706279.0, "step": 1277 }, { "epoch": 0.9711246200607903, "grad_norm": 2.2742087841033936, "learning_rate": 4.046295289127466e-06, "loss": 0.5446169376373291, "mean_token_accuracy": 0.8434430360794067, "num_tokens": 10711875.0, "step": 1278 }, { "epoch": 0.9718844984802432, "grad_norm": 2.637559175491333, "learning_rate": 4.044649032725836e-06, "loss": 0.5013607144355774, "mean_token_accuracy": 0.8225820064544678, "num_tokens": 10717345.0, "step": 1279 }, { "epoch": 0.9726443768996961, "grad_norm": 2.215750217437744, "learning_rate": 4.0430016922300566e-06, "loss": 0.4070378243923187, "mean_token_accuracy": 0.8528624773025513, "num_tokens": 10723521.0, "step": 1280 }, { "epoch": 0.973404255319149, "grad_norm": 2.581561803817749, "learning_rate": 4.0413532687962926e-06, "loss": 0.4934103488922119, "mean_token_accuracy": 0.8133847713470459, "num_tokens": 10728399.0, "step": 1281 }, { "epoch": 0.9741641337386018, "grad_norm": 2.823772430419922, "learning_rate": 4.039703763581472e-06, "loss": 0.47714829444885254, "mean_token_accuracy": 0.8327613472938538, "num_tokens": 10732057.0, "step": 1282 }, { "epoch": 0.9749240121580547, "grad_norm": 2.436262845993042, "learning_rate": 4.038053177743279e-06, "loss": 0.40875184535980225, "mean_token_accuracy": 0.8475111722946167, "num_tokens": 10736191.0, "step": 1283 }, { "epoch": 0.9756838905775076, "grad_norm": 2.2619588375091553, "learning_rate": 4.036401512440161e-06, "loss": 0.574619472026825, "mean_token_accuracy": 0.8066798448562622, "num_tokens": 10743105.0, "step": 1284 }, { "epoch": 0.9764437689969605, "grad_norm": 1.8488695621490479, "learning_rate": 4.034748768831319e-06, "loss": 0.5067553520202637, "mean_token_accuracy": 0.82176673412323, "num_tokens": 10750550.0, "step": 1285 }, { "epoch": 0.9772036474164134, "grad_norm": 2.710092306137085, "learning_rate": 4.033094948076713e-06, "loss": 0.506095290184021, "mean_token_accuracy": 0.8233880996704102, "num_tokens": 10754890.0, "step": 1286 }, { "epoch": 0.9779635258358662, "grad_norm": 1.3619569540023804, "learning_rate": 4.031440051337056e-06, "loss": 0.42224937677383423, "mean_token_accuracy": 0.8432115912437439, "num_tokens": 10765736.0, "step": 1287 }, { "epoch": 0.9787234042553191, "grad_norm": 1.2954745292663574, "learning_rate": 4.02978407977382e-06, "loss": 0.4383229911327362, "mean_token_accuracy": 0.8429867029190063, "num_tokens": 10782590.0, "step": 1288 }, { "epoch": 0.979483282674772, "grad_norm": 1.589167594909668, "learning_rate": 4.02812703454923e-06, "loss": 0.5876135230064392, "mean_token_accuracy": 0.7890599966049194, "num_tokens": 10795779.0, "step": 1289 }, { "epoch": 0.9802431610942249, "grad_norm": 1.7081713676452637, "learning_rate": 4.026468916826262e-06, "loss": 0.3387969732284546, "mean_token_accuracy": 0.8774687051773071, "num_tokens": 10803312.0, "step": 1290 }, { "epoch": 0.9810030395136778, "grad_norm": 3.253678798675537, "learning_rate": 4.024809727768648e-06, "loss": 0.46856486797332764, "mean_token_accuracy": 0.8459595441818237, "num_tokens": 10806215.0, "step": 1291 }, { "epoch": 0.9817629179331308, "grad_norm": 1.3559294939041138, "learning_rate": 4.023149468540871e-06, "loss": 0.4177839159965515, "mean_token_accuracy": 0.8389544486999512, "num_tokens": 10822416.0, "step": 1292 }, { "epoch": 0.9825227963525835, "grad_norm": 1.7460740804672241, "learning_rate": 4.021488140308165e-06, "loss": 0.5268415808677673, "mean_token_accuracy": 0.8185181617736816, "num_tokens": 10833907.0, "step": 1293 }, { "epoch": 0.9832826747720365, "grad_norm": 4.290836811065674, "learning_rate": 4.019825744236514e-06, "loss": 0.37098920345306396, "mean_token_accuracy": 0.8725089430809021, "num_tokens": 10835692.0, "step": 1294 }, { "epoch": 0.9840425531914894, "grad_norm": 3.064326047897339, "learning_rate": 4.018162281492651e-06, "loss": 0.46853017807006836, "mean_token_accuracy": 0.8357318043708801, "num_tokens": 10839162.0, "step": 1295 }, { "epoch": 0.9848024316109423, "grad_norm": 2.7019288539886475, "learning_rate": 4.016497753244058e-06, "loss": 0.5403878688812256, "mean_token_accuracy": 0.8161486983299255, "num_tokens": 10843988.0, "step": 1296 }, { "epoch": 0.9855623100303952, "grad_norm": 1.5569218397140503, "learning_rate": 4.014832160658966e-06, "loss": 0.5340887308120728, "mean_token_accuracy": 0.8095526099205017, "num_tokens": 10861045.0, "step": 1297 }, { "epoch": 0.986322188449848, "grad_norm": 2.463092565536499, "learning_rate": 4.013165504906352e-06, "loss": 0.6377466917037964, "mean_token_accuracy": 0.7809834480285645, "num_tokens": 10867137.0, "step": 1298 }, { "epoch": 0.9870820668693009, "grad_norm": 1.9024714231491089, "learning_rate": 4.011497787155938e-06, "loss": 0.4147411584854126, "mean_token_accuracy": 0.8482648134231567, "num_tokens": 10874063.0, "step": 1299 }, { "epoch": 0.9878419452887538, "grad_norm": 2.0938241481781006, "learning_rate": 4.009829008578192e-06, "loss": 0.5130795240402222, "mean_token_accuracy": 0.8200770616531372, "num_tokens": 10881753.0, "step": 1300 }, { "epoch": 0.9886018237082067, "grad_norm": 3.5043177604675293, "learning_rate": 4.00815917034433e-06, "loss": 0.4879762530326843, "mean_token_accuracy": 0.8426535129547119, "num_tokens": 10884516.0, "step": 1301 }, { "epoch": 0.9893617021276596, "grad_norm": 2.2752530574798584, "learning_rate": 4.006488273626307e-06, "loss": 0.41869473457336426, "mean_token_accuracy": 0.8499717712402344, "num_tokens": 10889694.0, "step": 1302 }, { "epoch": 0.9901215805471124, "grad_norm": 1.936284065246582, "learning_rate": 4.004816319596822e-06, "loss": 0.5037603378295898, "mean_token_accuracy": 0.8333492279052734, "num_tokens": 10897549.0, "step": 1303 }, { "epoch": 0.9908814589665653, "grad_norm": 2.469581127166748, "learning_rate": 4.003143309429317e-06, "loss": 0.4547468423843384, "mean_token_accuracy": 0.8385593891143799, "num_tokens": 10902379.0, "step": 1304 }, { "epoch": 0.9916413373860182, "grad_norm": 2.388094425201416, "learning_rate": 4.0014692442979756e-06, "loss": 0.4164527654647827, "mean_token_accuracy": 0.8527310490608215, "num_tokens": 10906702.0, "step": 1305 }, { "epoch": 0.9924012158054711, "grad_norm": 2.7968716621398926, "learning_rate": 3.999794125377721e-06, "loss": 0.4521843194961548, "mean_token_accuracy": 0.8433413505554199, "num_tokens": 10910721.0, "step": 1306 }, { "epoch": 0.993161094224924, "grad_norm": 1.9788912534713745, "learning_rate": 3.998117953844215e-06, "loss": 0.42720502614974976, "mean_token_accuracy": 0.8394173979759216, "num_tokens": 10917495.0, "step": 1307 }, { "epoch": 0.993920972644377, "grad_norm": 1.477830410003662, "learning_rate": 3.996440730873861e-06, "loss": 0.518704891204834, "mean_token_accuracy": 0.8174512386322021, "num_tokens": 10930219.0, "step": 1308 }, { "epoch": 0.9946808510638298, "grad_norm": 1.4123990535736084, "learning_rate": 3.9947624576437975e-06, "loss": 0.38803890347480774, "mean_token_accuracy": 0.8713104724884033, "num_tokens": 10941779.0, "step": 1309 }, { "epoch": 0.9954407294832827, "grad_norm": 1.3544560670852661, "learning_rate": 3.9930831353319025e-06, "loss": 0.4507453739643097, "mean_token_accuracy": 0.8438398838043213, "num_tokens": 10957494.0, "step": 1310 }, { "epoch": 0.9962006079027356, "grad_norm": 2.010869026184082, "learning_rate": 3.9914027651167866e-06, "loss": 0.45589905977249146, "mean_token_accuracy": 0.8406643867492676, "num_tokens": 10963872.0, "step": 1311 }, { "epoch": 0.9969604863221885, "grad_norm": 1.9608005285263062, "learning_rate": 3.989721348177801e-06, "loss": 0.49592703580856323, "mean_token_accuracy": 0.8264990448951721, "num_tokens": 10970813.0, "step": 1312 }, { "epoch": 0.9977203647416414, "grad_norm": 2.632451057434082, "learning_rate": 3.988038885695028e-06, "loss": 0.3902280032634735, "mean_token_accuracy": 0.8589959144592285, "num_tokens": 10974356.0, "step": 1313 }, { "epoch": 0.9984802431610942, "grad_norm": 1.9065864086151123, "learning_rate": 3.986355378849284e-06, "loss": 0.4082326292991638, "mean_token_accuracy": 0.8369756937026978, "num_tokens": 10980809.0, "step": 1314 }, { "epoch": 0.9992401215805471, "grad_norm": 1.3190429210662842, "learning_rate": 3.984670828822118e-06, "loss": 0.48575127124786377, "mean_token_accuracy": 0.8632767200469971, "num_tokens": 10994050.0, "step": 1315 }, { "epoch": 1.0, "grad_norm": 1.5336809158325195, "learning_rate": 3.982985236795815e-06, "loss": 0.4253654181957245, "mean_token_accuracy": 0.8600625991821289, "num_tokens": 11005404.0, "step": 1316 }, { "epoch": 1.000759878419453, "grad_norm": 2.7663536071777344, "learning_rate": 3.981298603953385e-06, "loss": 0.3451944589614868, "mean_token_accuracy": 0.8792824149131775, "num_tokens": 11008382.0, "step": 1317 }, { "epoch": 1.0015197568389058, "grad_norm": 1.3342564105987549, "learning_rate": 3.979610931478574e-06, "loss": 0.3363626301288605, "mean_token_accuracy": 0.8803983926773071, "num_tokens": 11020879.0, "step": 1318 }, { "epoch": 1.0022796352583587, "grad_norm": 1.6832343339920044, "learning_rate": 3.977922220555855e-06, "loss": 0.2637443542480469, "mean_token_accuracy": 0.9024307727813721, "num_tokens": 11026895.0, "step": 1319 }, { "epoch": 1.0030395136778116, "grad_norm": 1.7672643661499023, "learning_rate": 3.976232472370431e-06, "loss": 0.5292864441871643, "mean_token_accuracy": 0.8412349224090576, "num_tokens": 11035794.0, "step": 1320 }, { "epoch": 1.0037993920972645, "grad_norm": 1.377899408340454, "learning_rate": 3.97454168810823e-06, "loss": 0.4089219272136688, "mean_token_accuracy": 0.8595932722091675, "num_tokens": 11046512.0, "step": 1321 }, { "epoch": 1.0045592705167172, "grad_norm": 1.5499588251113892, "learning_rate": 3.972849868955913e-06, "loss": 0.42919856309890747, "mean_token_accuracy": 0.8448736667633057, "num_tokens": 11057799.0, "step": 1322 }, { "epoch": 1.0053191489361701, "grad_norm": 2.09816312789917, "learning_rate": 3.97115701610086e-06, "loss": 0.3673965632915497, "mean_token_accuracy": 0.8695160150527954, "num_tokens": 11063522.0, "step": 1323 }, { "epoch": 1.006079027355623, "grad_norm": 1.5931336879730225, "learning_rate": 3.969463130731183e-06, "loss": 0.44143545627593994, "mean_token_accuracy": 0.8706151247024536, "num_tokens": 11073948.0, "step": 1324 }, { "epoch": 1.006838905775076, "grad_norm": 1.7318394184112549, "learning_rate": 3.967768214035716e-06, "loss": 0.4496203064918518, "mean_token_accuracy": 0.8377890586853027, "num_tokens": 11085080.0, "step": 1325 }, { "epoch": 1.0075987841945289, "grad_norm": 2.3798575401306152, "learning_rate": 3.966072267204014e-06, "loss": 0.42652463912963867, "mean_token_accuracy": 0.8440729379653931, "num_tokens": 11090131.0, "step": 1326 }, { "epoch": 1.0083586626139818, "grad_norm": 1.6186951398849487, "learning_rate": 3.964375291426361e-06, "loss": 0.34542495012283325, "mean_token_accuracy": 0.8747560977935791, "num_tokens": 11100936.0, "step": 1327 }, { "epoch": 1.0091185410334347, "grad_norm": 1.7011088132858276, "learning_rate": 3.962677287893758e-06, "loss": 0.3462398052215576, "mean_token_accuracy": 0.9064668416976929, "num_tokens": 11110193.0, "step": 1328 }, { "epoch": 1.0098784194528876, "grad_norm": 1.5807863473892212, "learning_rate": 3.9609782577979305e-06, "loss": 0.3543960750102997, "mean_token_accuracy": 0.8595926761627197, "num_tokens": 11118880.0, "step": 1329 }, { "epoch": 1.0106382978723405, "grad_norm": 2.4265849590301514, "learning_rate": 3.959278202331323e-06, "loss": 0.33727410435676575, "mean_token_accuracy": 0.8853530883789062, "num_tokens": 11123173.0, "step": 1330 }, { "epoch": 1.0113981762917934, "grad_norm": 2.9867868423461914, "learning_rate": 3.9575771226870986e-06, "loss": 0.3446230888366699, "mean_token_accuracy": 0.9053781032562256, "num_tokens": 11125970.0, "step": 1331 }, { "epoch": 1.012158054711246, "grad_norm": 1.580752968788147, "learning_rate": 3.955875020059141e-06, "loss": 0.3119526505470276, "mean_token_accuracy": 0.9117881059646606, "num_tokens": 11136056.0, "step": 1332 }, { "epoch": 1.012917933130699, "grad_norm": 1.9530012607574463, "learning_rate": 3.954171895642052e-06, "loss": 0.3199872076511383, "mean_token_accuracy": 0.8856668472290039, "num_tokens": 11141814.0, "step": 1333 }, { "epoch": 1.013677811550152, "grad_norm": 3.0786478519439697, "learning_rate": 3.9524677506311505e-06, "loss": 0.3572729825973511, "mean_token_accuracy": 0.877776026725769, "num_tokens": 11145375.0, "step": 1334 }, { "epoch": 1.0144376899696048, "grad_norm": 2.7258169651031494, "learning_rate": 3.950762586222469e-06, "loss": 0.37295839190483093, "mean_token_accuracy": 0.8641630411148071, "num_tokens": 11149481.0, "step": 1335 }, { "epoch": 1.0151975683890577, "grad_norm": 2.414124011993408, "learning_rate": 3.949056403612758e-06, "loss": 0.3907613754272461, "mean_token_accuracy": 0.8710864782333374, "num_tokens": 11155653.0, "step": 1336 }, { "epoch": 1.0159574468085106, "grad_norm": 2.250965118408203, "learning_rate": 3.947349203999485e-06, "loss": 0.33239802718162537, "mean_token_accuracy": 0.8823547959327698, "num_tokens": 11160610.0, "step": 1337 }, { "epoch": 1.0167173252279635, "grad_norm": 1.3921382427215576, "learning_rate": 3.945640988580824e-06, "loss": 0.39622247219085693, "mean_token_accuracy": 0.8702440857887268, "num_tokens": 11178180.0, "step": 1338 }, { "epoch": 1.0174772036474165, "grad_norm": 2.3083655834198, "learning_rate": 3.943931758555669e-06, "loss": 0.4329550862312317, "mean_token_accuracy": 0.8389226794242859, "num_tokens": 11184723.0, "step": 1339 }, { "epoch": 1.0182370820668694, "grad_norm": 2.50821852684021, "learning_rate": 3.942221515123624e-06, "loss": 0.25572317838668823, "mean_token_accuracy": 0.9061814546585083, "num_tokens": 11189124.0, "step": 1340 }, { "epoch": 1.0189969604863223, "grad_norm": 2.570591688156128, "learning_rate": 3.940510259485002e-06, "loss": 0.394476056098938, "mean_token_accuracy": 0.8653090000152588, "num_tokens": 11193574.0, "step": 1341 }, { "epoch": 1.0197568389057752, "grad_norm": 1.735337257385254, "learning_rate": 3.938797992840828e-06, "loss": 0.24641478061676025, "mean_token_accuracy": 0.9057514667510986, "num_tokens": 11201246.0, "step": 1342 }, { "epoch": 1.0205167173252279, "grad_norm": 2.720719575881958, "learning_rate": 3.937084716392839e-06, "loss": 0.4411257803440094, "mean_token_accuracy": 0.8552188873291016, "num_tokens": 11205736.0, "step": 1343 }, { "epoch": 1.0212765957446808, "grad_norm": 2.100266933441162, "learning_rate": 3.935370431343475e-06, "loss": 0.35856977105140686, "mean_token_accuracy": 0.8867555856704712, "num_tokens": 11213190.0, "step": 1344 }, { "epoch": 1.0220364741641337, "grad_norm": 1.8791909217834473, "learning_rate": 3.933655138895889e-06, "loss": 0.38474807143211365, "mean_token_accuracy": 0.870961606502533, "num_tokens": 11220148.0, "step": 1345 }, { "epoch": 1.0227963525835866, "grad_norm": 2.893890857696533, "learning_rate": 3.9319388402539395e-06, "loss": 0.5053108930587769, "mean_token_accuracy": 0.8277154564857483, "num_tokens": 11225104.0, "step": 1346 }, { "epoch": 1.0235562310030395, "grad_norm": 1.7271082401275635, "learning_rate": 3.930221536622192e-06, "loss": 0.449672669172287, "mean_token_accuracy": 0.8532121181488037, "num_tokens": 11235758.0, "step": 1347 }, { "epoch": 1.0243161094224924, "grad_norm": 1.3542752265930176, "learning_rate": 3.928503229205913e-06, "loss": 0.4087134599685669, "mean_token_accuracy": 0.8513451218605042, "num_tokens": 11250337.0, "step": 1348 }, { "epoch": 1.0250759878419453, "grad_norm": 2.0496902465820312, "learning_rate": 3.92678391921108e-06, "loss": 0.40581014752388, "mean_token_accuracy": 0.8489689826965332, "num_tokens": 11257845.0, "step": 1349 }, { "epoch": 1.0258358662613982, "grad_norm": 2.3670401573181152, "learning_rate": 3.92506360784437e-06, "loss": 0.28677645325660706, "mean_token_accuracy": 0.9186335802078247, "num_tokens": 11261746.0, "step": 1350 }, { "epoch": 1.0265957446808511, "grad_norm": 2.026749849319458, "learning_rate": 3.923342296313162e-06, "loss": 0.3303615152835846, "mean_token_accuracy": 0.8901066780090332, "num_tokens": 11267790.0, "step": 1351 }, { "epoch": 1.027355623100304, "grad_norm": 1.7830196619033813, "learning_rate": 3.92161998582554e-06, "loss": 0.5805932879447937, "mean_token_accuracy": 0.8018084764480591, "num_tokens": 11282571.0, "step": 1352 }, { "epoch": 1.028115501519757, "grad_norm": 0.9216806292533875, "learning_rate": 3.919896677590289e-06, "loss": 0.2916361689567566, "mean_token_accuracy": 0.8907076716423035, "num_tokens": 11307246.0, "step": 1353 }, { "epoch": 1.0288753799392096, "grad_norm": 1.757852554321289, "learning_rate": 3.918172372816892e-06, "loss": 0.36220526695251465, "mean_token_accuracy": 0.8659965991973877, "num_tokens": 11317266.0, "step": 1354 }, { "epoch": 1.0296352583586625, "grad_norm": 1.3394414186477661, "learning_rate": 3.916447072715531e-06, "loss": 0.34884899854660034, "mean_token_accuracy": 0.8732426762580872, "num_tokens": 11330861.0, "step": 1355 }, { "epoch": 1.0303951367781155, "grad_norm": 2.245530605316162, "learning_rate": 3.914720778497091e-06, "loss": 0.3577055335044861, "mean_token_accuracy": 0.869037926197052, "num_tokens": 11336723.0, "step": 1356 }, { "epoch": 1.0311550151975684, "grad_norm": 1.6682467460632324, "learning_rate": 3.91299349137315e-06, "loss": 0.4702274799346924, "mean_token_accuracy": 0.830884575843811, "num_tokens": 11347779.0, "step": 1357 }, { "epoch": 1.0319148936170213, "grad_norm": 1.6815375089645386, "learning_rate": 3.9112652125559845e-06, "loss": 0.4380429983139038, "mean_token_accuracy": 0.8415952920913696, "num_tokens": 11360489.0, "step": 1358 }, { "epoch": 1.0326747720364742, "grad_norm": 2.273190975189209, "learning_rate": 3.909535943258567e-06, "loss": 0.30491122603416443, "mean_token_accuracy": 0.8839948773384094, "num_tokens": 11365273.0, "step": 1359 }, { "epoch": 1.033434650455927, "grad_norm": 1.9378234148025513, "learning_rate": 3.907805684694567e-06, "loss": 0.31097856163978577, "mean_token_accuracy": 0.886725127696991, "num_tokens": 11372495.0, "step": 1360 }, { "epoch": 1.03419452887538, "grad_norm": 2.395353317260742, "learning_rate": 3.906074438078343e-06, "loss": 0.35972052812576294, "mean_token_accuracy": 0.8858376741409302, "num_tokens": 11377907.0, "step": 1361 }, { "epoch": 1.034954407294833, "grad_norm": 2.0552561283111572, "learning_rate": 3.904342204624955e-06, "loss": 0.3261054456233978, "mean_token_accuracy": 0.8893187046051025, "num_tokens": 11383410.0, "step": 1362 }, { "epoch": 1.0357142857142858, "grad_norm": 2.3459837436676025, "learning_rate": 3.9026089855501475e-06, "loss": 0.3857288360595703, "mean_token_accuracy": 0.8603960275650024, "num_tokens": 11389728.0, "step": 1363 }, { "epoch": 1.0364741641337385, "grad_norm": 2.31752872467041, "learning_rate": 3.900874782070362e-06, "loss": 0.27967965602874756, "mean_token_accuracy": 0.8794403672218323, "num_tokens": 11394794.0, "step": 1364 }, { "epoch": 1.0372340425531914, "grad_norm": 2.9405572414398193, "learning_rate": 3.899139595402729e-06, "loss": 0.31394150853157043, "mean_token_accuracy": 0.8815574645996094, "num_tokens": 11398298.0, "step": 1365 }, { "epoch": 1.0379939209726443, "grad_norm": 2.282053232192993, "learning_rate": 3.8974034267650695e-06, "loss": 0.20856636762619019, "mean_token_accuracy": 0.9048457741737366, "num_tokens": 11402352.0, "step": 1366 }, { "epoch": 1.0387537993920972, "grad_norm": 1.5712087154388428, "learning_rate": 3.895666277375892e-06, "loss": 0.30795225501060486, "mean_token_accuracy": 0.8704116344451904, "num_tokens": 11411693.0, "step": 1367 }, { "epoch": 1.0395136778115501, "grad_norm": 1.274933934211731, "learning_rate": 3.893928148454398e-06, "loss": 0.4024714529514313, "mean_token_accuracy": 0.8475641012191772, "num_tokens": 11431415.0, "step": 1368 }, { "epoch": 1.040273556231003, "grad_norm": 2.504887342453003, "learning_rate": 3.89218904122047e-06, "loss": 0.41850197315216064, "mean_token_accuracy": 0.8508123159408569, "num_tokens": 11436678.0, "step": 1369 }, { "epoch": 1.041033434650456, "grad_norm": 4.004100322723389, "learning_rate": 3.890448956894682e-06, "loss": 0.29856380820274353, "mean_token_accuracy": 0.8966588973999023, "num_tokens": 11439022.0, "step": 1370 }, { "epoch": 1.0417933130699089, "grad_norm": 2.8129076957702637, "learning_rate": 3.888707896698293e-06, "loss": 0.44774293899536133, "mean_token_accuracy": 0.8567836284637451, "num_tokens": 11443905.0, "step": 1371 }, { "epoch": 1.0425531914893618, "grad_norm": 2.1379714012145996, "learning_rate": 3.886965861853243e-06, "loss": 0.40404194593429565, "mean_token_accuracy": 0.8536176681518555, "num_tokens": 11450570.0, "step": 1372 }, { "epoch": 1.0433130699088147, "grad_norm": 2.6106762886047363, "learning_rate": 3.885222853582163e-06, "loss": 0.27475211024284363, "mean_token_accuracy": 0.9190499782562256, "num_tokens": 11454132.0, "step": 1373 }, { "epoch": 1.0440729483282676, "grad_norm": 2.5278308391571045, "learning_rate": 3.88347887310836e-06, "loss": 0.38727957010269165, "mean_token_accuracy": 0.8602001667022705, "num_tokens": 11459484.0, "step": 1374 }, { "epoch": 1.0448328267477203, "grad_norm": 1.303685188293457, "learning_rate": 3.881733921655829e-06, "loss": 0.32009151577949524, "mean_token_accuracy": 0.8958115577697754, "num_tokens": 11473571.0, "step": 1375 }, { "epoch": 1.0455927051671732, "grad_norm": 1.6037408113479614, "learning_rate": 3.879988000449243e-06, "loss": 0.31941795349121094, "mean_token_accuracy": 0.8864378929138184, "num_tokens": 11483496.0, "step": 1376 }, { "epoch": 1.046352583586626, "grad_norm": 1.6573742628097534, "learning_rate": 3.878241110713957e-06, "loss": 0.47611358761787415, "mean_token_accuracy": 0.8232378959655762, "num_tokens": 11495001.0, "step": 1377 }, { "epoch": 1.047112462006079, "grad_norm": 2.734983444213867, "learning_rate": 3.876493253676004e-06, "loss": 0.35185158252716064, "mean_token_accuracy": 0.8742201924324036, "num_tokens": 11498780.0, "step": 1378 }, { "epoch": 1.047872340425532, "grad_norm": 1.8115712404251099, "learning_rate": 3.8747444305621e-06, "loss": 0.2538357675075531, "mean_token_accuracy": 0.9031987190246582, "num_tokens": 11505072.0, "step": 1379 }, { "epoch": 1.0486322188449848, "grad_norm": 2.2728946208953857, "learning_rate": 3.872994642599635e-06, "loss": 0.4449647068977356, "mean_token_accuracy": 0.8539146184921265, "num_tokens": 11512160.0, "step": 1380 }, { "epoch": 1.0493920972644377, "grad_norm": 2.083383560180664, "learning_rate": 3.871243891016676e-06, "loss": 0.5337638258934021, "mean_token_accuracy": 0.8453105092048645, "num_tokens": 11522237.0, "step": 1381 }, { "epoch": 1.0501519756838906, "grad_norm": 1.7587594985961914, "learning_rate": 3.869492177041971e-06, "loss": 0.3790983557701111, "mean_token_accuracy": 0.8734872341156006, "num_tokens": 11533522.0, "step": 1382 }, { "epoch": 1.0509118541033435, "grad_norm": 1.866719126701355, "learning_rate": 3.867739501904938e-06, "loss": 0.2584952414035797, "mean_token_accuracy": 0.9075443744659424, "num_tokens": 11539160.0, "step": 1383 }, { "epoch": 1.0516717325227964, "grad_norm": 1.4472569227218628, "learning_rate": 3.8659858668356735e-06, "loss": 0.3662617802619934, "mean_token_accuracy": 0.8581643104553223, "num_tokens": 11552167.0, "step": 1384 }, { "epoch": 1.0524316109422491, "grad_norm": 1.7949128150939941, "learning_rate": 3.864231273064944e-06, "loss": 0.363717645406723, "mean_token_accuracy": 0.8737319707870483, "num_tokens": 11559745.0, "step": 1385 }, { "epoch": 1.053191489361702, "grad_norm": 1.84903883934021, "learning_rate": 3.862475721824193e-06, "loss": 0.2613283395767212, "mean_token_accuracy": 0.9048352837562561, "num_tokens": 11566155.0, "step": 1386 }, { "epoch": 1.053951367781155, "grad_norm": 1.7276867628097534, "learning_rate": 3.8607192143455325e-06, "loss": 0.34503084421157837, "mean_token_accuracy": 0.89206463098526, "num_tokens": 11574487.0, "step": 1387 }, { "epoch": 1.0547112462006079, "grad_norm": 1.379459261894226, "learning_rate": 3.858961751861748e-06, "loss": 0.400322288274765, "mean_token_accuracy": 0.8522929549217224, "num_tokens": 11587310.0, "step": 1388 }, { "epoch": 1.0554711246200608, "grad_norm": 2.2029714584350586, "learning_rate": 3.857203335606294e-06, "loss": 0.3725644052028656, "mean_token_accuracy": 0.8578725457191467, "num_tokens": 11593127.0, "step": 1389 }, { "epoch": 1.0562310030395137, "grad_norm": 2.7535955905914307, "learning_rate": 3.855443966813295e-06, "loss": 0.17230795323848724, "mean_token_accuracy": 0.9371508359909058, "num_tokens": 11595763.0, "step": 1390 }, { "epoch": 1.0569908814589666, "grad_norm": 2.1638691425323486, "learning_rate": 3.853683646717543e-06, "loss": 0.30789250135421753, "mean_token_accuracy": 0.903866171836853, "num_tokens": 11601895.0, "step": 1391 }, { "epoch": 1.0577507598784195, "grad_norm": 2.335332155227661, "learning_rate": 3.8519223765544985e-06, "loss": 0.3670365512371063, "mean_token_accuracy": 0.8685783743858337, "num_tokens": 11606881.0, "step": 1392 }, { "epoch": 1.0585106382978724, "grad_norm": 2.1893110275268555, "learning_rate": 3.85016015756029e-06, "loss": 0.33741292357444763, "mean_token_accuracy": 0.8927346467971802, "num_tokens": 11612317.0, "step": 1393 }, { "epoch": 1.0592705167173253, "grad_norm": 1.1562182903289795, "learning_rate": 3.848396990971709e-06, "loss": 0.29777291417121887, "mean_token_accuracy": 0.8729532361030579, "num_tokens": 11629791.0, "step": 1394 }, { "epoch": 1.0600303951367782, "grad_norm": 2.560303211212158, "learning_rate": 3.846632878026214e-06, "loss": 0.44697558879852295, "mean_token_accuracy": 0.8525060415267944, "num_tokens": 11635448.0, "step": 1395 }, { "epoch": 1.060790273556231, "grad_norm": 1.6030131578445435, "learning_rate": 3.844867819961928e-06, "loss": 0.46937382221221924, "mean_token_accuracy": 0.8522174954414368, "num_tokens": 11648569.0, "step": 1396 }, { "epoch": 1.0615501519756838, "grad_norm": 2.391331911087036, "learning_rate": 3.843101818017637e-06, "loss": 0.30766090750694275, "mean_token_accuracy": 0.881351113319397, "num_tokens": 11653765.0, "step": 1397 }, { "epoch": 1.0623100303951367, "grad_norm": 1.8748983144760132, "learning_rate": 3.841334873432789e-06, "loss": 0.4563617408275604, "mean_token_accuracy": 0.8368140459060669, "num_tokens": 11662469.0, "step": 1398 }, { "epoch": 1.0630699088145896, "grad_norm": 1.7024413347244263, "learning_rate": 3.839566987447492e-06, "loss": 0.39883697032928467, "mean_token_accuracy": 0.851905882358551, "num_tokens": 11672422.0, "step": 1399 }, { "epoch": 1.0638297872340425, "grad_norm": 1.9254841804504395, "learning_rate": 3.837798161302518e-06, "loss": 0.3974847197532654, "mean_token_accuracy": 0.8470984697341919, "num_tokens": 11679748.0, "step": 1400 }, { "epoch": 1.0645896656534954, "grad_norm": 2.4537670612335205, "learning_rate": 3.836028396239297e-06, "loss": 0.4169984757900238, "mean_token_accuracy": 0.891655683517456, "num_tokens": 11685377.0, "step": 1401 }, { "epoch": 1.0653495440729484, "grad_norm": 2.5760183334350586, "learning_rate": 3.8342576934999184e-06, "loss": 0.3128034472465515, "mean_token_accuracy": 0.8775366544723511, "num_tokens": 11689860.0, "step": 1402 }, { "epoch": 1.0661094224924013, "grad_norm": 2.4413959980010986, "learning_rate": 3.832486054327131e-06, "loss": 0.38623762130737305, "mean_token_accuracy": 0.8748388886451721, "num_tokens": 11694100.0, "step": 1403 }, { "epoch": 1.0668693009118542, "grad_norm": 2.822531223297119, "learning_rate": 3.830713479964335e-06, "loss": 0.3522868752479553, "mean_token_accuracy": 0.8735017776489258, "num_tokens": 11698136.0, "step": 1404 }, { "epoch": 1.067629179331307, "grad_norm": 1.5611909627914429, "learning_rate": 3.828939971655595e-06, "loss": 0.23979251086711884, "mean_token_accuracy": 0.9187682271003723, "num_tokens": 11706949.0, "step": 1405 }, { "epoch": 1.06838905775076, "grad_norm": 1.4100412130355835, "learning_rate": 3.827165530645627e-06, "loss": 0.40331876277923584, "mean_token_accuracy": 0.8594472408294678, "num_tokens": 11722728.0, "step": 1406 }, { "epoch": 1.0691489361702127, "grad_norm": 1.9237185716629028, "learning_rate": 3.825390158179802e-06, "loss": 0.3863624334335327, "mean_token_accuracy": 0.8645744323730469, "num_tokens": 11730364.0, "step": 1407 }, { "epoch": 1.0699088145896656, "grad_norm": 1.8858916759490967, "learning_rate": 3.823613855504144e-06, "loss": 0.4017430543899536, "mean_token_accuracy": 0.8647187352180481, "num_tokens": 11738849.0, "step": 1408 }, { "epoch": 1.0706686930091185, "grad_norm": 2.2345480918884277, "learning_rate": 3.82183662386533e-06, "loss": 0.2748807668685913, "mean_token_accuracy": 0.9032738208770752, "num_tokens": 11743345.0, "step": 1409 }, { "epoch": 1.0714285714285714, "grad_norm": 2.4631876945495605, "learning_rate": 3.82005846451069e-06, "loss": 0.39308279752731323, "mean_token_accuracy": 0.8704702854156494, "num_tokens": 11748876.0, "step": 1410 }, { "epoch": 1.0721884498480243, "grad_norm": 2.1964359283447266, "learning_rate": 3.8182793786882065e-06, "loss": 0.42337566614151, "mean_token_accuracy": 0.8442268967628479, "num_tokens": 11756352.0, "step": 1411 }, { "epoch": 1.0729483282674772, "grad_norm": 2.060384511947632, "learning_rate": 3.816499367646508e-06, "loss": 0.3529473841190338, "mean_token_accuracy": 0.8795508146286011, "num_tokens": 11763932.0, "step": 1412 }, { "epoch": 1.0737082066869301, "grad_norm": 2.659391164779663, "learning_rate": 3.814718432634877e-06, "loss": 0.4001883268356323, "mean_token_accuracy": 0.8584083914756775, "num_tokens": 11768794.0, "step": 1413 }, { "epoch": 1.074468085106383, "grad_norm": 2.447317600250244, "learning_rate": 3.8129365749032398e-06, "loss": 0.35926005244255066, "mean_token_accuracy": 0.8771983981132507, "num_tokens": 11773076.0, "step": 1414 }, { "epoch": 1.075227963525836, "grad_norm": 3.427628993988037, "learning_rate": 3.8111537957021736e-06, "loss": 0.4036615490913391, "mean_token_accuracy": 0.8481444120407104, "num_tokens": 11775866.0, "step": 1415 }, { "epoch": 1.0759878419452888, "grad_norm": 2.7665066719055176, "learning_rate": 3.809370096282903e-06, "loss": 0.4091006815433502, "mean_token_accuracy": 0.8559435606002808, "num_tokens": 11780729.0, "step": 1416 }, { "epoch": 1.0767477203647418, "grad_norm": 1.8218594789505005, "learning_rate": 3.807585477897296e-06, "loss": 0.4649350643157959, "mean_token_accuracy": 0.8379872441291809, "num_tokens": 11789674.0, "step": 1417 }, { "epoch": 1.0775075987841944, "grad_norm": 1.4760321378707886, "learning_rate": 3.8057999417978654e-06, "loss": 0.3743603229522705, "mean_token_accuracy": 0.8585041761398315, "num_tokens": 11802490.0, "step": 1418 }, { "epoch": 1.0782674772036474, "grad_norm": 2.617767572402954, "learning_rate": 3.8040134892377702e-06, "loss": 0.20920544862747192, "mean_token_accuracy": 0.9186487197875977, "num_tokens": 11807567.0, "step": 1419 }, { "epoch": 1.0790273556231003, "grad_norm": 1.4821984767913818, "learning_rate": 3.802226121470811e-06, "loss": 0.41163522005081177, "mean_token_accuracy": 0.8519705533981323, "num_tokens": 11820757.0, "step": 1420 }, { "epoch": 1.0797872340425532, "grad_norm": 2.1887447834014893, "learning_rate": 3.800437839751432e-06, "loss": 0.36980634927749634, "mean_token_accuracy": 0.8542169332504272, "num_tokens": 11827426.0, "step": 1421 }, { "epoch": 1.080547112462006, "grad_norm": 1.8914815187454224, "learning_rate": 3.7986486453347183e-06, "loss": 0.44559159874916077, "mean_token_accuracy": 0.8481492400169373, "num_tokens": 11839579.0, "step": 1422 }, { "epoch": 1.081306990881459, "grad_norm": 1.4687765836715698, "learning_rate": 3.796858539476394e-06, "loss": 0.31115007400512695, "mean_token_accuracy": 0.8857030868530273, "num_tokens": 11849308.0, "step": 1423 }, { "epoch": 1.082066869300912, "grad_norm": 2.736539840698242, "learning_rate": 3.795067523432826e-06, "loss": 0.32111749053001404, "mean_token_accuracy": 0.8889198303222656, "num_tokens": 11853640.0, "step": 1424 }, { "epoch": 1.0828267477203648, "grad_norm": 1.2518724203109741, "learning_rate": 3.793275598461017e-06, "loss": 0.23590418696403503, "mean_token_accuracy": 0.9257948398590088, "num_tokens": 11865904.0, "step": 1425 }, { "epoch": 1.0835866261398177, "grad_norm": 1.4249815940856934, "learning_rate": 3.7914827658186104e-06, "loss": 0.47661063075065613, "mean_token_accuracy": 0.8478943705558777, "num_tokens": 11884266.0, "step": 1426 }, { "epoch": 1.0843465045592706, "grad_norm": 2.8841235637664795, "learning_rate": 3.7896890267638832e-06, "loss": 0.2311479151248932, "mean_token_accuracy": 0.9119724631309509, "num_tokens": 11887514.0, "step": 1427 }, { "epoch": 1.0851063829787233, "grad_norm": 2.929975986480713, "learning_rate": 3.787894382555752e-06, "loss": 0.316008985042572, "mean_token_accuracy": 0.8895289897918701, "num_tokens": 11890898.0, "step": 1428 }, { "epoch": 1.0858662613981762, "grad_norm": 2.674365997314453, "learning_rate": 3.7860988344537664e-06, "loss": 0.3950662314891815, "mean_token_accuracy": 0.857719898223877, "num_tokens": 11896259.0, "step": 1429 }, { "epoch": 1.0866261398176291, "grad_norm": 1.3362135887145996, "learning_rate": 3.7843023837181126e-06, "loss": 0.3880857825279236, "mean_token_accuracy": 0.8618451356887817, "num_tokens": 11912015.0, "step": 1430 }, { "epoch": 1.087386018237082, "grad_norm": 2.290315628051758, "learning_rate": 3.782505031609607e-06, "loss": 0.30825555324554443, "mean_token_accuracy": 0.8877434134483337, "num_tokens": 11917137.0, "step": 1431 }, { "epoch": 1.088145896656535, "grad_norm": 1.926459550857544, "learning_rate": 3.7807067793897006e-06, "loss": 0.24383944272994995, "mean_token_accuracy": 0.8988088369369507, "num_tokens": 11922794.0, "step": 1432 }, { "epoch": 1.0889057750759878, "grad_norm": 1.746978759765625, "learning_rate": 3.778907628320477e-06, "loss": 0.3701493740081787, "mean_token_accuracy": 0.8676667213439941, "num_tokens": 11931487.0, "step": 1433 }, { "epoch": 1.0896656534954408, "grad_norm": 2.1272482872009277, "learning_rate": 3.77710757966465e-06, "loss": 0.4992450177669525, "mean_token_accuracy": 0.8437712788581848, "num_tokens": 11939122.0, "step": 1434 }, { "epoch": 1.0904255319148937, "grad_norm": 1.8471245765686035, "learning_rate": 3.775306634685562e-06, "loss": 0.281790554523468, "mean_token_accuracy": 0.9027552604675293, "num_tokens": 11946959.0, "step": 1435 }, { "epoch": 1.0911854103343466, "grad_norm": 2.189923048019409, "learning_rate": 3.773504794647187e-06, "loss": 0.3650098443031311, "mean_token_accuracy": 0.8969080448150635, "num_tokens": 11952688.0, "step": 1436 }, { "epoch": 1.0919452887537995, "grad_norm": 2.885392665863037, "learning_rate": 3.771702060814123e-06, "loss": 0.2931934893131256, "mean_token_accuracy": 0.9101849794387817, "num_tokens": 11956847.0, "step": 1437 }, { "epoch": 1.0927051671732522, "grad_norm": 4.421146392822266, "learning_rate": 3.7698984344516e-06, "loss": 0.3534395098686218, "mean_token_accuracy": 0.8911710381507874, "num_tokens": 11959312.0, "step": 1438 }, { "epoch": 1.093465045592705, "grad_norm": 1.5125991106033325, "learning_rate": 3.7680939168254733e-06, "loss": 0.36164867877960205, "mean_token_accuracy": 0.8670996427536011, "num_tokens": 11972229.0, "step": 1439 }, { "epoch": 1.094224924012158, "grad_norm": 3.1464555263519287, "learning_rate": 3.7662885092022206e-06, "loss": 0.33704444766044617, "mean_token_accuracy": 0.8891934156417847, "num_tokens": 11975935.0, "step": 1440 }, { "epoch": 1.094984802431611, "grad_norm": 2.365015745162964, "learning_rate": 3.7644822128489476e-06, "loss": 0.38000303506851196, "mean_token_accuracy": 0.8699986934661865, "num_tokens": 11982432.0, "step": 1441 }, { "epoch": 1.0957446808510638, "grad_norm": 1.6171151399612427, "learning_rate": 3.7626750290333824e-06, "loss": 0.3563687801361084, "mean_token_accuracy": 0.8732521533966064, "num_tokens": 11992314.0, "step": 1442 }, { "epoch": 1.0965045592705167, "grad_norm": 1.715600609779358, "learning_rate": 3.7608669590238765e-06, "loss": 0.37600451707839966, "mean_token_accuracy": 0.8685288429260254, "num_tokens": 12001711.0, "step": 1443 }, { "epoch": 1.0972644376899696, "grad_norm": 1.5423352718353271, "learning_rate": 3.7590580040894025e-06, "loss": 0.3473413586616516, "mean_token_accuracy": 0.8610173463821411, "num_tokens": 12013918.0, "step": 1444 }, { "epoch": 1.0980243161094225, "grad_norm": 2.108903169631958, "learning_rate": 3.7572481654995554e-06, "loss": 0.38471078872680664, "mean_token_accuracy": 0.8662124276161194, "num_tokens": 12020161.0, "step": 1445 }, { "epoch": 1.0987841945288754, "grad_norm": 1.656293511390686, "learning_rate": 3.755437444524548e-06, "loss": 0.4608612656593323, "mean_token_accuracy": 0.8732191324234009, "num_tokens": 12034852.0, "step": 1446 }, { "epoch": 1.0995440729483283, "grad_norm": 1.6599282026290894, "learning_rate": 3.7536258424352164e-06, "loss": 0.45298242568969727, "mean_token_accuracy": 0.8398016095161438, "num_tokens": 12045340.0, "step": 1447 }, { "epoch": 1.1003039513677813, "grad_norm": 2.3555991649627686, "learning_rate": 3.75181336050301e-06, "loss": 0.3949180245399475, "mean_token_accuracy": 0.8541672229766846, "num_tokens": 12051005.0, "step": 1448 }, { "epoch": 1.101063829787234, "grad_norm": 1.4173258543014526, "learning_rate": 3.7500000000000005e-06, "loss": 0.3684481978416443, "mean_token_accuracy": 0.8532872200012207, "num_tokens": 12063517.0, "step": 1449 }, { "epoch": 1.1018237082066868, "grad_norm": 1.5571858882904053, "learning_rate": 3.7481857621988734e-06, "loss": 0.46177220344543457, "mean_token_accuracy": 0.8411046266555786, "num_tokens": 12075681.0, "step": 1450 }, { "epoch": 1.1025835866261398, "grad_norm": 2.3117988109588623, "learning_rate": 3.74637064837293e-06, "loss": 0.29612571001052856, "mean_token_accuracy": 0.9062831401824951, "num_tokens": 12081149.0, "step": 1451 }, { "epoch": 1.1033434650455927, "grad_norm": 1.2898906469345093, "learning_rate": 3.7445546597960882e-06, "loss": 0.390147864818573, "mean_token_accuracy": 0.8732152581214905, "num_tokens": 12099444.0, "step": 1452 }, { "epoch": 1.1041033434650456, "grad_norm": 2.3161065578460693, "learning_rate": 3.742737797742878e-06, "loss": 0.40039196610450745, "mean_token_accuracy": 0.853137731552124, "num_tokens": 12106202.0, "step": 1453 }, { "epoch": 1.1048632218844985, "grad_norm": 2.513338804244995, "learning_rate": 3.7409200634884425e-06, "loss": 0.45786619186401367, "mean_token_accuracy": 0.8235888481140137, "num_tokens": 12112595.0, "step": 1454 }, { "epoch": 1.1056231003039514, "grad_norm": 2.342236042022705, "learning_rate": 3.7391014583085384e-06, "loss": 0.3247642517089844, "mean_token_accuracy": 0.898206889629364, "num_tokens": 12117566.0, "step": 1455 }, { "epoch": 1.1063829787234043, "grad_norm": 1.6730194091796875, "learning_rate": 3.737281983479534e-06, "loss": 0.45829376578330994, "mean_token_accuracy": 0.8496729731559753, "num_tokens": 12131120.0, "step": 1456 }, { "epoch": 1.1071428571428572, "grad_norm": 1.4153143167495728, "learning_rate": 3.735461640278404e-06, "loss": 0.40482401847839355, "mean_token_accuracy": 0.8541895747184753, "num_tokens": 12147115.0, "step": 1457 }, { "epoch": 1.1079027355623101, "grad_norm": 2.6954190731048584, "learning_rate": 3.733640429982738e-06, "loss": 0.448765367269516, "mean_token_accuracy": 0.8386822938919067, "num_tokens": 12151884.0, "step": 1458 }, { "epoch": 1.108662613981763, "grad_norm": 1.4589649438858032, "learning_rate": 3.731818353870729e-06, "loss": 0.37637197971343994, "mean_token_accuracy": 0.861752986907959, "num_tokens": 12166117.0, "step": 1459 }, { "epoch": 1.1094224924012157, "grad_norm": 1.8451684713363647, "learning_rate": 3.729995413221183e-06, "loss": 0.4089604616165161, "mean_token_accuracy": 0.8519954681396484, "num_tokens": 12175467.0, "step": 1460 }, { "epoch": 1.1101823708206686, "grad_norm": 2.5690343379974365, "learning_rate": 3.7281716093135068e-06, "loss": 0.3374151885509491, "mean_token_accuracy": 0.8890966176986694, "num_tokens": 12179807.0, "step": 1461 }, { "epoch": 1.1109422492401215, "grad_norm": 1.2950849533081055, "learning_rate": 3.726346943427719e-06, "loss": 0.3300512731075287, "mean_token_accuracy": 0.8742559552192688, "num_tokens": 12195456.0, "step": 1462 }, { "epoch": 1.1117021276595744, "grad_norm": 2.2271523475646973, "learning_rate": 3.7245214168444388e-06, "loss": 0.37156787514686584, "mean_token_accuracy": 0.8629398345947266, "num_tokens": 12201306.0, "step": 1463 }, { "epoch": 1.1124620060790273, "grad_norm": 3.0538721084594727, "learning_rate": 3.722695030844891e-06, "loss": 0.28872793912887573, "mean_token_accuracy": 0.9002166986465454, "num_tokens": 12204478.0, "step": 1464 }, { "epoch": 1.1132218844984803, "grad_norm": 1.3828390836715698, "learning_rate": 3.7208677867109042e-06, "loss": 0.35568177700042725, "mean_token_accuracy": 0.8716834783554077, "num_tokens": 12218527.0, "step": 1465 }, { "epoch": 1.1139817629179332, "grad_norm": 2.493626594543457, "learning_rate": 3.7190396857249087e-06, "loss": 0.2706877887248993, "mean_token_accuracy": 0.9051929712295532, "num_tokens": 12223262.0, "step": 1466 }, { "epoch": 1.114741641337386, "grad_norm": 1.7273808717727661, "learning_rate": 3.7172107291699356e-06, "loss": 0.48286527395248413, "mean_token_accuracy": 0.8309057354927063, "num_tokens": 12232377.0, "step": 1467 }, { "epoch": 1.115501519756839, "grad_norm": 1.6607234477996826, "learning_rate": 3.7153809183296174e-06, "loss": 0.3631202280521393, "mean_token_accuracy": 0.8650490641593933, "num_tokens": 12243610.0, "step": 1468 }, { "epoch": 1.1162613981762919, "grad_norm": 2.3145484924316406, "learning_rate": 3.713550254488185e-06, "loss": 0.3859679400920868, "mean_token_accuracy": 0.8705494999885559, "num_tokens": 12248827.0, "step": 1469 }, { "epoch": 1.1170212765957448, "grad_norm": 1.723557710647583, "learning_rate": 3.7117187389304703e-06, "loss": 0.4872702956199646, "mean_token_accuracy": 0.8282966613769531, "num_tokens": 12259659.0, "step": 1470 }, { "epoch": 1.1177811550151975, "grad_norm": 3.102405309677124, "learning_rate": 3.7098863729418997e-06, "loss": 0.5511173009872437, "mean_token_accuracy": 0.8274194598197937, "num_tokens": 12264079.0, "step": 1471 }, { "epoch": 1.1185410334346504, "grad_norm": 1.495301365852356, "learning_rate": 3.7080531578085e-06, "loss": 0.3733213245868683, "mean_token_accuracy": 0.8546344041824341, "num_tokens": 12275920.0, "step": 1472 }, { "epoch": 1.1193009118541033, "grad_norm": 2.0601508617401123, "learning_rate": 3.7062190948168906e-06, "loss": 0.4016646146774292, "mean_token_accuracy": 0.8534223437309265, "num_tokens": 12283896.0, "step": 1473 }, { "epoch": 1.1200607902735562, "grad_norm": 2.1475353240966797, "learning_rate": 3.7043841852542884e-06, "loss": 0.4101397395133972, "mean_token_accuracy": 0.8393585681915283, "num_tokens": 12290774.0, "step": 1474 }, { "epoch": 1.1208206686930091, "grad_norm": 1.7980974912643433, "learning_rate": 3.7025484304085035e-06, "loss": 0.33343690633773804, "mean_token_accuracy": 0.897619366645813, "num_tokens": 12297760.0, "step": 1475 }, { "epoch": 1.121580547112462, "grad_norm": 2.538846254348755, "learning_rate": 3.7007118315679384e-06, "loss": 0.42348384857177734, "mean_token_accuracy": 0.8347535133361816, "num_tokens": 12303315.0, "step": 1476 }, { "epoch": 1.122340425531915, "grad_norm": 2.996518135070801, "learning_rate": 3.6988743900215895e-06, "loss": 0.35087358951568604, "mean_token_accuracy": 0.86887526512146, "num_tokens": 12306620.0, "step": 1477 }, { "epoch": 1.1231003039513678, "grad_norm": 3.1563260555267334, "learning_rate": 3.6970361070590443e-06, "loss": 0.39446836709976196, "mean_token_accuracy": 0.8532922267913818, "num_tokens": 12309850.0, "step": 1478 }, { "epoch": 1.1238601823708207, "grad_norm": 2.834465265274048, "learning_rate": 3.695196983970481e-06, "loss": 0.42086079716682434, "mean_token_accuracy": 0.8679853677749634, "num_tokens": 12314281.0, "step": 1479 }, { "epoch": 1.1246200607902737, "grad_norm": 1.9354487657546997, "learning_rate": 3.6933570220466654e-06, "loss": 0.4136340916156769, "mean_token_accuracy": 0.8501224517822266, "num_tokens": 12321531.0, "step": 1480 }, { "epoch": 1.1253799392097266, "grad_norm": 1.4114911556243896, "learning_rate": 3.6915162225789546e-06, "loss": 0.3551082909107208, "mean_token_accuracy": 0.8727633953094482, "num_tokens": 12335668.0, "step": 1481 }, { "epoch": 1.1261398176291793, "grad_norm": 1.948442816734314, "learning_rate": 3.6896745868592924e-06, "loss": 0.4041626751422882, "mean_token_accuracy": 0.8544884920120239, "num_tokens": 12343898.0, "step": 1482 }, { "epoch": 1.1268996960486322, "grad_norm": 2.7664108276367188, "learning_rate": 3.6878321161802106e-06, "loss": 0.25600776076316833, "mean_token_accuracy": 0.9070765972137451, "num_tokens": 12347466.0, "step": 1483 }, { "epoch": 1.127659574468085, "grad_norm": 2.0137786865234375, "learning_rate": 3.685988811834823e-06, "loss": 0.3179168403148651, "mean_token_accuracy": 0.8870049715042114, "num_tokens": 12355198.0, "step": 1484 }, { "epoch": 1.128419452887538, "grad_norm": 1.713525414466858, "learning_rate": 3.684144675116836e-06, "loss": 0.4507277011871338, "mean_token_accuracy": 0.8427691459655762, "num_tokens": 12366218.0, "step": 1485 }, { "epoch": 1.1291793313069909, "grad_norm": 1.6192375421524048, "learning_rate": 3.682299707320532e-06, "loss": 0.35650795698165894, "mean_token_accuracy": 0.8660146594047546, "num_tokens": 12379221.0, "step": 1486 }, { "epoch": 1.1299392097264438, "grad_norm": 2.271754503250122, "learning_rate": 3.680453909740782e-06, "loss": 0.40090543031692505, "mean_token_accuracy": 0.8541368246078491, "num_tokens": 12384826.0, "step": 1487 }, { "epoch": 1.1306990881458967, "grad_norm": 1.2213362455368042, "learning_rate": 3.6786072836730376e-06, "loss": 0.5232692360877991, "mean_token_accuracy": 0.8178102970123291, "num_tokens": 12408601.0, "step": 1488 }, { "epoch": 1.1314589665653496, "grad_norm": 2.0131008625030518, "learning_rate": 3.6767598304133325e-06, "loss": 0.43543386459350586, "mean_token_accuracy": 0.8439689874649048, "num_tokens": 12415830.0, "step": 1489 }, { "epoch": 1.1322188449848025, "grad_norm": 2.3572237491607666, "learning_rate": 3.674911551258279e-06, "loss": 0.4469905495643616, "mean_token_accuracy": 0.8547621965408325, "num_tokens": 12421872.0, "step": 1490 }, { "epoch": 1.1329787234042552, "grad_norm": 2.353266477584839, "learning_rate": 3.673062447505072e-06, "loss": 0.38335856795310974, "mean_token_accuracy": 0.8657129406929016, "num_tokens": 12426635.0, "step": 1491 }, { "epoch": 1.1337386018237081, "grad_norm": 1.1553866863250732, "learning_rate": 3.6712125204514836e-06, "loss": 0.3752373456954956, "mean_token_accuracy": 0.8691136837005615, "num_tokens": 12446778.0, "step": 1492 }, { "epoch": 1.134498480243161, "grad_norm": 2.294605016708374, "learning_rate": 3.6693617713958633e-06, "loss": 0.30367785692214966, "mean_token_accuracy": 0.8980157971382141, "num_tokens": 12451416.0, "step": 1493 }, { "epoch": 1.135258358662614, "grad_norm": 2.3219408988952637, "learning_rate": 3.6675102016371387e-06, "loss": 0.5061910152435303, "mean_token_accuracy": 0.8320388793945312, "num_tokens": 12457554.0, "step": 1494 }, { "epoch": 1.1360182370820668, "grad_norm": 2.217759132385254, "learning_rate": 3.665657812474812e-06, "loss": 0.47638267278671265, "mean_token_accuracy": 0.8277538418769836, "num_tokens": 12464994.0, "step": 1495 }, { "epoch": 1.1367781155015197, "grad_norm": 2.3062057495117188, "learning_rate": 3.6638046052089614e-06, "loss": 0.29094308614730835, "mean_token_accuracy": 0.8934022188186646, "num_tokens": 12469558.0, "step": 1496 }, { "epoch": 1.1375379939209727, "grad_norm": 3.5667340755462646, "learning_rate": 3.661950581140239e-06, "loss": 0.34060972929000854, "mean_token_accuracy": 0.8843199610710144, "num_tokens": 12472104.0, "step": 1497 }, { "epoch": 1.1382978723404256, "grad_norm": 2.2694637775421143, "learning_rate": 3.660095741569871e-06, "loss": 0.3833288550376892, "mean_token_accuracy": 0.8596650958061218, "num_tokens": 12477887.0, "step": 1498 }, { "epoch": 1.1390577507598785, "grad_norm": 1.8261189460754395, "learning_rate": 3.658240087799655e-06, "loss": 0.49372974038124084, "mean_token_accuracy": 0.8439922332763672, "num_tokens": 12490967.0, "step": 1499 }, { "epoch": 1.1398176291793314, "grad_norm": 2.4423866271972656, "learning_rate": 3.6563836211319593e-06, "loss": 0.37636855244636536, "mean_token_accuracy": 0.8861795663833618, "num_tokens": 12496192.0, "step": 1500 }, { "epoch": 1.1405775075987843, "grad_norm": 2.023763418197632, "learning_rate": 3.654526342869724e-06, "loss": 0.5067221522331238, "mean_token_accuracy": 0.8322572112083435, "num_tokens": 12505282.0, "step": 1501 }, { "epoch": 1.141337386018237, "grad_norm": 1.6983009576797485, "learning_rate": 3.65266825431646e-06, "loss": 0.38836103677749634, "mean_token_accuracy": 0.8607903718948364, "num_tokens": 12516463.0, "step": 1502 }, { "epoch": 1.1420972644376899, "grad_norm": 1.591574788093567, "learning_rate": 3.6508093567762425e-06, "loss": 0.3887024521827698, "mean_token_accuracy": 0.8567622900009155, "num_tokens": 12527111.0, "step": 1503 }, { "epoch": 1.1428571428571428, "grad_norm": 2.6263785362243652, "learning_rate": 3.6489496515537204e-06, "loss": 0.43836307525634766, "mean_token_accuracy": 0.8442060947418213, "num_tokens": 12535232.0, "step": 1504 }, { "epoch": 1.1436170212765957, "grad_norm": 2.653672218322754, "learning_rate": 3.647089139954104e-06, "loss": 0.4552849233150482, "mean_token_accuracy": 0.8436453342437744, "num_tokens": 12539707.0, "step": 1505 }, { "epoch": 1.1443768996960486, "grad_norm": 1.7937148809432983, "learning_rate": 3.6452278232831734e-06, "loss": 0.4441622793674469, "mean_token_accuracy": 0.8511509895324707, "num_tokens": 12550275.0, "step": 1506 }, { "epoch": 1.1451367781155015, "grad_norm": 1.7905175685882568, "learning_rate": 3.643365702847272e-06, "loss": 0.4939506947994232, "mean_token_accuracy": 0.8230897188186646, "num_tokens": 12559454.0, "step": 1507 }, { "epoch": 1.1458966565349544, "grad_norm": 1.7709134817123413, "learning_rate": 3.641502779953307e-06, "loss": 0.48544806241989136, "mean_token_accuracy": 0.8287543058395386, "num_tokens": 12569781.0, "step": 1508 }, { "epoch": 1.1466565349544073, "grad_norm": 1.3864372968673706, "learning_rate": 3.639639055908751e-06, "loss": 0.45125365257263184, "mean_token_accuracy": 0.83880615234375, "num_tokens": 12588955.0, "step": 1509 }, { "epoch": 1.1474164133738602, "grad_norm": 2.5531187057495117, "learning_rate": 3.6377745320216346e-06, "loss": 0.4347698390483856, "mean_token_accuracy": 0.8424967527389526, "num_tokens": 12594235.0, "step": 1510 }, { "epoch": 1.1481762917933132, "grad_norm": 1.8271414041519165, "learning_rate": 3.635909209600555e-06, "loss": 0.5132643580436707, "mean_token_accuracy": 0.8271039724349976, "num_tokens": 12605439.0, "step": 1511 }, { "epoch": 1.148936170212766, "grad_norm": 1.590615153312683, "learning_rate": 3.6340430899546656e-06, "loss": 0.42570042610168457, "mean_token_accuracy": 0.8479509353637695, "num_tokens": 12615211.0, "step": 1512 }, { "epoch": 1.1496960486322187, "grad_norm": 2.7645864486694336, "learning_rate": 3.632176174393682e-06, "loss": 0.18166863918304443, "mean_token_accuracy": 0.9362800121307373, "num_tokens": 12618107.0, "step": 1513 }, { "epoch": 1.1504559270516717, "grad_norm": 1.9422248601913452, "learning_rate": 3.630308464227877e-06, "loss": 0.47589507699012756, "mean_token_accuracy": 0.8552196621894836, "num_tokens": 12625911.0, "step": 1514 }, { "epoch": 1.1512158054711246, "grad_norm": 1.1898369789123535, "learning_rate": 3.628439960768082e-06, "loss": 0.31726109981536865, "mean_token_accuracy": 0.881921112537384, "num_tokens": 12642063.0, "step": 1515 }, { "epoch": 1.1519756838905775, "grad_norm": 1.3746618032455444, "learning_rate": 3.6265706653256837e-06, "loss": 0.426246702671051, "mean_token_accuracy": 0.8429036140441895, "num_tokens": 12660573.0, "step": 1516 }, { "epoch": 1.1527355623100304, "grad_norm": 1.4866091012954712, "learning_rate": 3.624700579212626e-06, "loss": 0.2841907739639282, "mean_token_accuracy": 0.8877781629562378, "num_tokens": 12670045.0, "step": 1517 }, { "epoch": 1.1534954407294833, "grad_norm": 2.1897432804107666, "learning_rate": 3.6228297037414077e-06, "loss": 0.38316023349761963, "mean_token_accuracy": 0.8632174134254456, "num_tokens": 12675733.0, "step": 1518 }, { "epoch": 1.1542553191489362, "grad_norm": 1.7656269073486328, "learning_rate": 3.6209580402250816e-06, "loss": 0.3879716992378235, "mean_token_accuracy": 0.8587035536766052, "num_tokens": 12683632.0, "step": 1519 }, { "epoch": 1.155015197568389, "grad_norm": 1.569054365158081, "learning_rate": 3.619085589977251e-06, "loss": 0.42267853021621704, "mean_token_accuracy": 0.8511590361595154, "num_tokens": 12694600.0, "step": 1520 }, { "epoch": 1.155775075987842, "grad_norm": 1.8671791553497314, "learning_rate": 3.617212354312076e-06, "loss": 0.2932406961917877, "mean_token_accuracy": 0.8919585943222046, "num_tokens": 12701164.0, "step": 1521 }, { "epoch": 1.156534954407295, "grad_norm": 2.223289728164673, "learning_rate": 3.615338334544265e-06, "loss": 0.41690003871917725, "mean_token_accuracy": 0.8450136184692383, "num_tokens": 12708007.0, "step": 1522 }, { "epoch": 1.1572948328267478, "grad_norm": 2.271348476409912, "learning_rate": 3.6134635319890763e-06, "loss": 0.4478079676628113, "mean_token_accuracy": 0.8406678438186646, "num_tokens": 12715241.0, "step": 1523 }, { "epoch": 1.1580547112462005, "grad_norm": 2.334249973297119, "learning_rate": 3.611587947962319e-06, "loss": 0.33422547578811646, "mean_token_accuracy": 0.8784642219543457, "num_tokens": 12720260.0, "step": 1524 }, { "epoch": 1.1588145896656534, "grad_norm": 3.5312418937683105, "learning_rate": 3.6097115837803504e-06, "loss": 0.2890721559524536, "mean_token_accuracy": 0.8949326276779175, "num_tokens": 12723120.0, "step": 1525 }, { "epoch": 1.1595744680851063, "grad_norm": 2.656132698059082, "learning_rate": 3.6078344407600744e-06, "loss": 0.34816238284111023, "mean_token_accuracy": 0.8654837608337402, "num_tokens": 12727976.0, "step": 1526 }, { "epoch": 1.1603343465045592, "grad_norm": 2.117017984390259, "learning_rate": 3.6059565202189433e-06, "loss": 0.43055999279022217, "mean_token_accuracy": 0.847792387008667, "num_tokens": 12735368.0, "step": 1527 }, { "epoch": 1.1610942249240122, "grad_norm": 1.604445457458496, "learning_rate": 3.604077823474954e-06, "loss": 0.43434152007102966, "mean_token_accuracy": 0.8434398770332336, "num_tokens": 12747930.0, "step": 1528 }, { "epoch": 1.161854103343465, "grad_norm": 1.9640765190124512, "learning_rate": 3.6021983518466468e-06, "loss": 0.24849727749824524, "mean_token_accuracy": 0.9091773629188538, "num_tokens": 12752619.0, "step": 1529 }, { "epoch": 1.162613981762918, "grad_norm": 2.7943356037139893, "learning_rate": 3.600318106653108e-06, "loss": 0.268229603767395, "mean_token_accuracy": 0.8996462821960449, "num_tokens": 12756414.0, "step": 1530 }, { "epoch": 1.1633738601823709, "grad_norm": 2.2691352367401123, "learning_rate": 3.5984370892139663e-06, "loss": 0.44619977474212646, "mean_token_accuracy": 0.8387969732284546, "num_tokens": 12763480.0, "step": 1531 }, { "epoch": 1.1641337386018238, "grad_norm": 2.27956223487854, "learning_rate": 3.5965553008493924e-06, "loss": 0.2888392210006714, "mean_token_accuracy": 0.8888444900512695, "num_tokens": 12768335.0, "step": 1532 }, { "epoch": 1.1648936170212765, "grad_norm": 1.9309260845184326, "learning_rate": 3.594672742880097e-06, "loss": 0.38395166397094727, "mean_token_accuracy": 0.8667852282524109, "num_tokens": 12775625.0, "step": 1533 }, { "epoch": 1.1656534954407296, "grad_norm": 1.5114457607269287, "learning_rate": 3.5927894166273324e-06, "loss": 0.35699984431266785, "mean_token_accuracy": 0.8701331615447998, "num_tokens": 12787244.0, "step": 1534 }, { "epoch": 1.1664133738601823, "grad_norm": 2.800849676132202, "learning_rate": 3.5909053234128893e-06, "loss": 0.2240539789199829, "mean_token_accuracy": 0.9137411117553711, "num_tokens": 12790149.0, "step": 1535 }, { "epoch": 1.1671732522796352, "grad_norm": 2.260073661804199, "learning_rate": 3.5890204645590964e-06, "loss": 0.4261082112789154, "mean_token_accuracy": 0.8642333149909973, "num_tokens": 12796262.0, "step": 1536 }, { "epoch": 1.167933130699088, "grad_norm": 2.069962501525879, "learning_rate": 3.5871348413888207e-06, "loss": 0.3802388608455658, "mean_token_accuracy": 0.8630022406578064, "num_tokens": 12804049.0, "step": 1537 }, { "epoch": 1.168693009118541, "grad_norm": 1.6988741159439087, "learning_rate": 3.585248455225466e-06, "loss": 0.36427056789398193, "mean_token_accuracy": 0.862297773361206, "num_tokens": 12812781.0, "step": 1538 }, { "epoch": 1.169452887537994, "grad_norm": 2.3438217639923096, "learning_rate": 3.5833613073929684e-06, "loss": 0.1965729296207428, "mean_token_accuracy": 0.929794192314148, "num_tokens": 12816363.0, "step": 1539 }, { "epoch": 1.1702127659574468, "grad_norm": 2.2811391353607178, "learning_rate": 3.5814733992158025e-06, "loss": 0.32205697894096375, "mean_token_accuracy": 0.8993955850601196, "num_tokens": 12821773.0, "step": 1540 }, { "epoch": 1.1709726443768997, "grad_norm": 1.383322834968567, "learning_rate": 3.579584732018975e-06, "loss": 0.3190337121486664, "mean_token_accuracy": 0.8888030052185059, "num_tokens": 12835836.0, "step": 1541 }, { "epoch": 1.1717325227963526, "grad_norm": 2.503678798675537, "learning_rate": 3.577695307128024e-06, "loss": 0.46180155873298645, "mean_token_accuracy": 0.8356181383132935, "num_tokens": 12842284.0, "step": 1542 }, { "epoch": 1.1724924012158056, "grad_norm": 2.1780099868774414, "learning_rate": 3.5758051258690223e-06, "loss": 0.4707275629043579, "mean_token_accuracy": 0.8370652794837952, "num_tokens": 12849792.0, "step": 1543 }, { "epoch": 1.1732522796352582, "grad_norm": 1.5368309020996094, "learning_rate": 3.5739141895685708e-06, "loss": 0.44536107778549194, "mean_token_accuracy": 0.85077965259552, "num_tokens": 12868190.0, "step": 1544 }, { "epoch": 1.1740121580547112, "grad_norm": 1.8734257221221924, "learning_rate": 3.5720224995538023e-06, "loss": 0.2679333984851837, "mean_token_accuracy": 0.8966549038887024, "num_tokens": 12876667.0, "step": 1545 }, { "epoch": 1.174772036474164, "grad_norm": 2.169668197631836, "learning_rate": 3.5701300571523757e-06, "loss": 0.5504401922225952, "mean_token_accuracy": 0.827259361743927, "num_tokens": 12885912.0, "step": 1546 }, { "epoch": 1.175531914893617, "grad_norm": 1.9190294742584229, "learning_rate": 3.5682368636924825e-06, "loss": 0.5136854648590088, "mean_token_accuracy": 0.8493822813034058, "num_tokens": 12894106.0, "step": 1547 }, { "epoch": 1.1762917933130699, "grad_norm": 1.4628456830978394, "learning_rate": 3.566342920502837e-06, "loss": 0.3796519935131073, "mean_token_accuracy": 0.8672927618026733, "num_tokens": 12914428.0, "step": 1548 }, { "epoch": 1.1770516717325228, "grad_norm": 2.9135849475860596, "learning_rate": 3.564448228912682e-06, "loss": 0.3505942225456238, "mean_token_accuracy": 0.8779292106628418, "num_tokens": 12917886.0, "step": 1549 }, { "epoch": 1.1778115501519757, "grad_norm": 1.5976498126983643, "learning_rate": 3.562552790251785e-06, "loss": 0.398990273475647, "mean_token_accuracy": 0.8543448448181152, "num_tokens": 12930962.0, "step": 1550 }, { "epoch": 1.1785714285714286, "grad_norm": 2.1510820388793945, "learning_rate": 3.5606566058504377e-06, "loss": 0.37958794832229614, "mean_token_accuracy": 0.8734434843063354, "num_tokens": 12936608.0, "step": 1551 }, { "epoch": 1.1793313069908815, "grad_norm": 1.5784356594085693, "learning_rate": 3.558759677039455e-06, "loss": 0.3462204337120056, "mean_token_accuracy": 0.8716239929199219, "num_tokens": 12945146.0, "step": 1552 }, { "epoch": 1.1800911854103344, "grad_norm": 1.5204803943634033, "learning_rate": 3.5568620051501755e-06, "loss": 0.38687846064567566, "mean_token_accuracy": 0.8537560701370239, "num_tokens": 12958381.0, "step": 1553 }, { "epoch": 1.1808510638297873, "grad_norm": 1.442669153213501, "learning_rate": 3.5549635915144578e-06, "loss": 0.4486425518989563, "mean_token_accuracy": 0.8538786172866821, "num_tokens": 12974355.0, "step": 1554 }, { "epoch": 1.18161094224924, "grad_norm": 2.7090024948120117, "learning_rate": 3.553064437464682e-06, "loss": 0.3457731604576111, "mean_token_accuracy": 0.8701649904251099, "num_tokens": 12978498.0, "step": 1555 }, { "epoch": 1.182370820668693, "grad_norm": 2.0256686210632324, "learning_rate": 3.551164544333745e-06, "loss": 0.46301382780075073, "mean_token_accuracy": 0.8393809199333191, "num_tokens": 12986049.0, "step": 1556 }, { "epoch": 1.1831306990881458, "grad_norm": 2.809532642364502, "learning_rate": 3.549263913455069e-06, "loss": 0.4033644497394562, "mean_token_accuracy": 0.8553173542022705, "num_tokens": 12990012.0, "step": 1557 }, { "epoch": 1.1838905775075987, "grad_norm": 1.6126118898391724, "learning_rate": 3.5473625461625884e-06, "loss": 0.4213772118091583, "mean_token_accuracy": 0.8686687350273132, "num_tokens": 13000640.0, "step": 1558 }, { "epoch": 1.1846504559270516, "grad_norm": 2.4230797290802, "learning_rate": 3.5454604437907535e-06, "loss": 0.4578772783279419, "mean_token_accuracy": 0.8681545257568359, "num_tokens": 13005672.0, "step": 1559 }, { "epoch": 1.1854103343465046, "grad_norm": 1.5677014589309692, "learning_rate": 3.543557607674537e-06, "loss": 0.2732740044593811, "mean_token_accuracy": 0.9040316343307495, "num_tokens": 13014546.0, "step": 1560 }, { "epoch": 1.1861702127659575, "grad_norm": 2.0466675758361816, "learning_rate": 3.54165403914942e-06, "loss": 0.4171028137207031, "mean_token_accuracy": 0.8539668321609497, "num_tokens": 13021946.0, "step": 1561 }, { "epoch": 1.1869300911854104, "grad_norm": 1.8882291316986084, "learning_rate": 3.539749739551401e-06, "loss": 0.34301382303237915, "mean_token_accuracy": 0.8830934762954712, "num_tokens": 13029709.0, "step": 1562 }, { "epoch": 1.1876899696048633, "grad_norm": 2.285297393798828, "learning_rate": 3.53784471021699e-06, "loss": 0.42049992084503174, "mean_token_accuracy": 0.8538215160369873, "num_tokens": 13035680.0, "step": 1563 }, { "epoch": 1.1884498480243162, "grad_norm": 0.9660508036613464, "learning_rate": 3.535938952483211e-06, "loss": 0.30757755041122437, "mean_token_accuracy": 0.8770003914833069, "num_tokens": 13057545.0, "step": 1564 }, { "epoch": 1.189209726443769, "grad_norm": 3.033686399459839, "learning_rate": 3.534032467687597e-06, "loss": 0.2783232629299164, "mean_token_accuracy": 0.9065004587173462, "num_tokens": 13060350.0, "step": 1565 }, { "epoch": 1.1899696048632218, "grad_norm": 2.065147638320923, "learning_rate": 3.532125257168193e-06, "loss": 0.28573065996170044, "mean_token_accuracy": 0.9196664094924927, "num_tokens": 13066230.0, "step": 1566 }, { "epoch": 1.1907294832826747, "grad_norm": 3.1456873416900635, "learning_rate": 3.5302173222635526e-06, "loss": 0.402212917804718, "mean_token_accuracy": 0.8547881841659546, "num_tokens": 13070164.0, "step": 1567 }, { "epoch": 1.1914893617021276, "grad_norm": 1.7763887643814087, "learning_rate": 3.5283086643127396e-06, "loss": 0.41406041383743286, "mean_token_accuracy": 0.9024406671524048, "num_tokens": 13078746.0, "step": 1568 }, { "epoch": 1.1922492401215805, "grad_norm": 1.8647089004516602, "learning_rate": 3.5263992846553203e-06, "loss": 0.33638501167297363, "mean_token_accuracy": 0.8759017586708069, "num_tokens": 13087153.0, "step": 1569 }, { "epoch": 1.1930091185410334, "grad_norm": 1.7408785820007324, "learning_rate": 3.5244891846313733e-06, "loss": 0.38687846064567566, "mean_token_accuracy": 0.886833131313324, "num_tokens": 13095873.0, "step": 1570 }, { "epoch": 1.1937689969604863, "grad_norm": 1.063403844833374, "learning_rate": 3.5225783655814798e-06, "loss": 0.29847463965415955, "mean_token_accuracy": 0.8943065404891968, "num_tokens": 13112494.0, "step": 1571 }, { "epoch": 1.1945288753799392, "grad_norm": 1.3383631706237793, "learning_rate": 3.520666828846726e-06, "loss": 0.40959814190864563, "mean_token_accuracy": 0.8485267758369446, "num_tokens": 13132542.0, "step": 1572 }, { "epoch": 1.1952887537993921, "grad_norm": 2.8511970043182373, "learning_rate": 3.518754575768702e-06, "loss": 0.3631041646003723, "mean_token_accuracy": 0.8702604174613953, "num_tokens": 13136675.0, "step": 1573 }, { "epoch": 1.196048632218845, "grad_norm": 1.216565489768982, "learning_rate": 3.516841607689501e-06, "loss": 0.33183571696281433, "mean_token_accuracy": 0.8591024875640869, "num_tokens": 13155704.0, "step": 1574 }, { "epoch": 1.196808510638298, "grad_norm": 1.2260510921478271, "learning_rate": 3.5149279259517165e-06, "loss": 0.32951709628105164, "mean_token_accuracy": 0.8645073175430298, "num_tokens": 13169652.0, "step": 1575 }, { "epoch": 1.1975683890577509, "grad_norm": 3.730445146560669, "learning_rate": 3.5130135318984454e-06, "loss": 0.273306667804718, "mean_token_accuracy": 0.9016726016998291, "num_tokens": 13171823.0, "step": 1576 }, { "epoch": 1.1983282674772036, "grad_norm": 2.6083767414093018, "learning_rate": 3.5110984268732827e-06, "loss": 0.32591867446899414, "mean_token_accuracy": 0.8788018226623535, "num_tokens": 13175952.0, "step": 1577 }, { "epoch": 1.1990881458966565, "grad_norm": 1.3573346138000488, "learning_rate": 3.509182612220322e-06, "loss": 0.33707496523857117, "mean_token_accuracy": 0.8733367919921875, "num_tokens": 13191672.0, "step": 1578 }, { "epoch": 1.1998480243161094, "grad_norm": 1.689840316772461, "learning_rate": 3.507266089284157e-06, "loss": 0.3744190037250519, "mean_token_accuracy": 0.8731797933578491, "num_tokens": 13201516.0, "step": 1579 }, { "epoch": 1.2006079027355623, "grad_norm": 2.6112749576568604, "learning_rate": 3.5053488594098763e-06, "loss": 0.3127969801425934, "mean_token_accuracy": 0.8845540285110474, "num_tokens": 13205957.0, "step": 1580 }, { "epoch": 1.2013677811550152, "grad_norm": 2.285753011703491, "learning_rate": 3.5034309239430664e-06, "loss": 0.32707083225250244, "mean_token_accuracy": 0.909648597240448, "num_tokens": 13212577.0, "step": 1581 }, { "epoch": 1.202127659574468, "grad_norm": 1.7919425964355469, "learning_rate": 3.501512284229807e-06, "loss": 0.5249142646789551, "mean_token_accuracy": 0.8211127519607544, "num_tokens": 13223202.0, "step": 1582 }, { "epoch": 1.202887537993921, "grad_norm": 2.458132028579712, "learning_rate": 3.4995929416166756e-06, "loss": 0.40370824933052063, "mean_token_accuracy": 0.8594874143600464, "num_tokens": 13229225.0, "step": 1583 }, { "epoch": 1.203647416413374, "grad_norm": 2.1395130157470703, "learning_rate": 3.4976728974507387e-06, "loss": 0.4650583863258362, "mean_token_accuracy": 0.8297191858291626, "num_tokens": 13237506.0, "step": 1584 }, { "epoch": 1.2044072948328268, "grad_norm": 2.948958158493042, "learning_rate": 3.4957521530795576e-06, "loss": 0.282042533159256, "mean_token_accuracy": 0.9001186490058899, "num_tokens": 13240932.0, "step": 1585 }, { "epoch": 1.2051671732522795, "grad_norm": 1.5559227466583252, "learning_rate": 3.493830709851185e-06, "loss": 0.35081470012664795, "mean_token_accuracy": 0.8794466853141785, "num_tokens": 13250998.0, "step": 1586 }, { "epoch": 1.2059270516717326, "grad_norm": 2.4289605617523193, "learning_rate": 3.4919085691141636e-06, "loss": 0.31316375732421875, "mean_token_accuracy": 0.8789317607879639, "num_tokens": 13255583.0, "step": 1587 }, { "epoch": 1.2066869300911853, "grad_norm": 2.565730333328247, "learning_rate": 3.4899857322175252e-06, "loss": 0.43467023968696594, "mean_token_accuracy": 0.838298499584198, "num_tokens": 13260481.0, "step": 1588 }, { "epoch": 1.2074468085106382, "grad_norm": 1.783970832824707, "learning_rate": 3.4880622005107916e-06, "loss": 0.30073225498199463, "mean_token_accuracy": 0.8852391242980957, "num_tokens": 13268285.0, "step": 1589 }, { "epoch": 1.2082066869300911, "grad_norm": 1.9684637784957886, "learning_rate": 3.486137975343971e-06, "loss": 0.375322163105011, "mean_token_accuracy": 0.859891951084137, "num_tokens": 13276064.0, "step": 1590 }, { "epoch": 1.208966565349544, "grad_norm": 1.6288427114486694, "learning_rate": 3.484213058067559e-06, "loss": 0.45555630326271057, "mean_token_accuracy": 0.8430992364883423, "num_tokens": 13290148.0, "step": 1591 }, { "epoch": 1.209726443768997, "grad_norm": 2.10591983795166, "learning_rate": 3.482287450032536e-06, "loss": 0.543901801109314, "mean_token_accuracy": 0.8474050164222717, "num_tokens": 13299726.0, "step": 1592 }, { "epoch": 1.2104863221884499, "grad_norm": 3.46626615524292, "learning_rate": 3.4803611525903687e-06, "loss": 0.4644322991371155, "mean_token_accuracy": 0.857314944267273, "num_tokens": 13302704.0, "step": 1593 }, { "epoch": 1.2112462006079028, "grad_norm": 2.2779464721679688, "learning_rate": 3.4784341670930067e-06, "loss": 0.3991627097129822, "mean_token_accuracy": 0.8659536242485046, "num_tokens": 13310116.0, "step": 1594 }, { "epoch": 1.2120060790273557, "grad_norm": 1.9925066232681274, "learning_rate": 3.4765064948928813e-06, "loss": 0.2951751947402954, "mean_token_accuracy": 0.8920532464981079, "num_tokens": 13315511.0, "step": 1595 }, { "epoch": 1.2127659574468086, "grad_norm": 2.477402687072754, "learning_rate": 3.474578137342909e-06, "loss": 0.46756991744041443, "mean_token_accuracy": 0.8387883305549622, "num_tokens": 13323178.0, "step": 1596 }, { "epoch": 1.2135258358662613, "grad_norm": 2.5522773265838623, "learning_rate": 3.4726490957964836e-06, "loss": 0.34505167603492737, "mean_token_accuracy": 0.8730485439300537, "num_tokens": 13327193.0, "step": 1597 }, { "epoch": 1.2142857142857142, "grad_norm": 1.5694233179092407, "learning_rate": 3.4707193716074816e-06, "loss": 0.35508453845977783, "mean_token_accuracy": 0.8807560205459595, "num_tokens": 13338898.0, "step": 1598 }, { "epoch": 1.215045592705167, "grad_norm": 1.827397108078003, "learning_rate": 3.4687889661302577e-06, "loss": 0.40622538328170776, "mean_token_accuracy": 0.8506995439529419, "num_tokens": 13346959.0, "step": 1599 }, { "epoch": 1.21580547112462, "grad_norm": 1.4659578800201416, "learning_rate": 3.466857880719645e-06, "loss": 0.25169551372528076, "mean_token_accuracy": 0.9014549255371094, "num_tokens": 13356587.0, "step": 1600 }, { "epoch": 1.216565349544073, "grad_norm": 1.263599157333374, "learning_rate": 3.464926116730953e-06, "loss": 0.3273811638355255, "mean_token_accuracy": 0.9007604122161865, "num_tokens": 13372300.0, "step": 1601 }, { "epoch": 1.2173252279635258, "grad_norm": 1.838417649269104, "learning_rate": 3.462993675519968e-06, "loss": 0.39801910519599915, "mean_token_accuracy": 0.858196496963501, "num_tokens": 13379767.0, "step": 1602 }, { "epoch": 1.2180851063829787, "grad_norm": 2.71407413482666, "learning_rate": 3.4610605584429526e-06, "loss": 0.3743388056755066, "mean_token_accuracy": 0.8653231859207153, "num_tokens": 13384150.0, "step": 1603 }, { "epoch": 1.2188449848024316, "grad_norm": 1.1756614446640015, "learning_rate": 3.4591267668566412e-06, "loss": 0.3520668148994446, "mean_token_accuracy": 0.8701305389404297, "num_tokens": 13405806.0, "step": 1604 }, { "epoch": 1.2196048632218845, "grad_norm": 1.686341643333435, "learning_rate": 3.457192302118244e-06, "loss": 0.4127233028411865, "mean_token_accuracy": 0.8571315407752991, "num_tokens": 13415915.0, "step": 1605 }, { "epoch": 1.2203647416413375, "grad_norm": 2.196047067642212, "learning_rate": 3.455257165585444e-06, "loss": 0.5065249800682068, "mean_token_accuracy": 0.8422767519950867, "num_tokens": 13426334.0, "step": 1606 }, { "epoch": 1.2211246200607904, "grad_norm": 1.7319685220718384, "learning_rate": 3.453321358616393e-06, "loss": 0.34298816323280334, "mean_token_accuracy": 0.8779832124710083, "num_tokens": 13435776.0, "step": 1607 }, { "epoch": 1.221884498480243, "grad_norm": 2.2992677688598633, "learning_rate": 3.4513848825697145e-06, "loss": 0.3317287266254425, "mean_token_accuracy": 0.8957628011703491, "num_tokens": 13440786.0, "step": 1608 }, { "epoch": 1.222644376899696, "grad_norm": 1.7123690843582153, "learning_rate": 3.4494477388045035e-06, "loss": 0.35504674911499023, "mean_token_accuracy": 0.8597263097763062, "num_tokens": 13449507.0, "step": 1609 }, { "epoch": 1.2234042553191489, "grad_norm": 1.574321985244751, "learning_rate": 3.4475099286803204e-06, "loss": 0.48743829131126404, "mean_token_accuracy": 0.873678982257843, "num_tokens": 13463113.0, "step": 1610 }, { "epoch": 1.2241641337386018, "grad_norm": 2.6681430339813232, "learning_rate": 3.445571453557196e-06, "loss": 0.3304022252559662, "mean_token_accuracy": 0.8895198106765747, "num_tokens": 13467552.0, "step": 1611 }, { "epoch": 1.2249240121580547, "grad_norm": 2.2467198371887207, "learning_rate": 3.443632314795627e-06, "loss": 0.38827815651893616, "mean_token_accuracy": 0.8692886829376221, "num_tokens": 13472912.0, "step": 1612 }, { "epoch": 1.2256838905775076, "grad_norm": 2.747021198272705, "learning_rate": 3.4416925137565756e-06, "loss": 0.16032622754573822, "mean_token_accuracy": 0.9457395076751709, "num_tokens": 13475439.0, "step": 1613 }, { "epoch": 1.2264437689969605, "grad_norm": 1.470200777053833, "learning_rate": 3.439752051801467e-06, "loss": 0.335122287273407, "mean_token_accuracy": 0.8727607131004333, "num_tokens": 13486219.0, "step": 1614 }, { "epoch": 1.2272036474164134, "grad_norm": 2.033778667449951, "learning_rate": 3.4378109302921946e-06, "loss": 0.38197797536849976, "mean_token_accuracy": 0.8645455837249756, "num_tokens": 13493366.0, "step": 1615 }, { "epoch": 1.2279635258358663, "grad_norm": 2.0289220809936523, "learning_rate": 3.4358691505911105e-06, "loss": 0.4446212649345398, "mean_token_accuracy": 0.842170238494873, "num_tokens": 13501142.0, "step": 1616 }, { "epoch": 1.2287234042553192, "grad_norm": 1.7993971109390259, "learning_rate": 3.4339267140610317e-06, "loss": 0.37922149896621704, "mean_token_accuracy": 0.8607279062271118, "num_tokens": 13508587.0, "step": 1617 }, { "epoch": 1.2294832826747721, "grad_norm": 2.305236339569092, "learning_rate": 3.4319836220652334e-06, "loss": 0.27437761425971985, "mean_token_accuracy": 0.8981660604476929, "num_tokens": 13512963.0, "step": 1618 }, { "epoch": 1.2302431610942248, "grad_norm": 1.6349776983261108, "learning_rate": 3.430039875967454e-06, "loss": 0.5090376734733582, "mean_token_accuracy": 0.8304173946380615, "num_tokens": 13524459.0, "step": 1619 }, { "epoch": 1.2310030395136777, "grad_norm": 2.3295493125915527, "learning_rate": 3.428095477131888e-06, "loss": 0.2945078909397125, "mean_token_accuracy": 0.8936595320701599, "num_tokens": 13529346.0, "step": 1620 }, { "epoch": 1.2317629179331306, "grad_norm": 1.8379199504852295, "learning_rate": 3.4261504269231904e-06, "loss": 0.4806104600429535, "mean_token_accuracy": 0.8301663398742676, "num_tokens": 13539201.0, "step": 1621 }, { "epoch": 1.2325227963525835, "grad_norm": 2.7224607467651367, "learning_rate": 3.4242047267064714e-06, "loss": 0.444391667842865, "mean_token_accuracy": 0.8456785678863525, "num_tokens": 13544262.0, "step": 1622 }, { "epoch": 1.2332826747720365, "grad_norm": 2.8765504360198975, "learning_rate": 3.4222583778472997e-06, "loss": 0.5504393577575684, "mean_token_accuracy": 0.8232147693634033, "num_tokens": 13550692.0, "step": 1623 }, { "epoch": 1.2340425531914894, "grad_norm": 2.5914840698242188, "learning_rate": 3.4203113817116955e-06, "loss": 0.2604985237121582, "mean_token_accuracy": 0.8972113728523254, "num_tokens": 13554638.0, "step": 1624 }, { "epoch": 1.2348024316109423, "grad_norm": 2.6579203605651855, "learning_rate": 3.4183637396661372e-06, "loss": 0.23776915669441223, "mean_token_accuracy": 0.911636471748352, "num_tokens": 13558184.0, "step": 1625 }, { "epoch": 1.2355623100303952, "grad_norm": 2.339467763900757, "learning_rate": 3.4164154530775552e-06, "loss": 0.4007004201412201, "mean_token_accuracy": 0.8565382957458496, "num_tokens": 13563310.0, "step": 1626 }, { "epoch": 1.236322188449848, "grad_norm": 2.0212485790252686, "learning_rate": 3.4144665233133318e-06, "loss": 0.3323586583137512, "mean_token_accuracy": 0.8828034996986389, "num_tokens": 13570269.0, "step": 1627 }, { "epoch": 1.237082066869301, "grad_norm": 2.3259403705596924, "learning_rate": 3.4125169517413005e-06, "loss": 0.4316944479942322, "mean_token_accuracy": 0.8496184349060059, "num_tokens": 13576065.0, "step": 1628 }, { "epoch": 1.237841945288754, "grad_norm": 1.7273739576339722, "learning_rate": 3.410566739729746e-06, "loss": 0.26066160202026367, "mean_token_accuracy": 0.8986301422119141, "num_tokens": 13583920.0, "step": 1629 }, { "epoch": 1.2386018237082066, "grad_norm": 1.881059169769287, "learning_rate": 3.408615888647402e-06, "loss": 0.288002610206604, "mean_token_accuracy": 0.8981817960739136, "num_tokens": 13596055.0, "step": 1630 }, { "epoch": 1.2393617021276595, "grad_norm": 2.1503989696502686, "learning_rate": 3.4066643998634506e-06, "loss": 0.36795467138290405, "mean_token_accuracy": 0.8714635372161865, "num_tokens": 13602604.0, "step": 1631 }, { "epoch": 1.2401215805471124, "grad_norm": 1.5344990491867065, "learning_rate": 3.4047122747475227e-06, "loss": 0.3198670744895935, "mean_token_accuracy": 0.8748607635498047, "num_tokens": 13613180.0, "step": 1632 }, { "epoch": 1.2408814589665653, "grad_norm": 3.926814317703247, "learning_rate": 3.402759514669694e-06, "loss": 0.39719194173812866, "mean_token_accuracy": 0.8592313528060913, "num_tokens": 13615825.0, "step": 1633 }, { "epoch": 1.2416413373860182, "grad_norm": 1.7342312335968018, "learning_rate": 3.4008061210004872e-06, "loss": 0.38172054290771484, "mean_token_accuracy": 0.8590462803840637, "num_tokens": 13624355.0, "step": 1634 }, { "epoch": 1.2424012158054711, "grad_norm": 2.3119945526123047, "learning_rate": 3.3988520951108683e-06, "loss": 0.30511754751205444, "mean_token_accuracy": 0.8917601108551025, "num_tokens": 13629227.0, "step": 1635 }, { "epoch": 1.243161094224924, "grad_norm": 1.5961447954177856, "learning_rate": 3.3968974383722497e-06, "loss": 0.4232184886932373, "mean_token_accuracy": 0.8434830904006958, "num_tokens": 13642781.0, "step": 1636 }, { "epoch": 1.243920972644377, "grad_norm": 1.9487611055374146, "learning_rate": 3.3949421521564825e-06, "loss": 0.48407718539237976, "mean_token_accuracy": 0.8348262310028076, "num_tokens": 13653455.0, "step": 1637 }, { "epoch": 1.2446808510638299, "grad_norm": 2.154454469680786, "learning_rate": 3.392986237835863e-06, "loss": 0.27183955907821655, "mean_token_accuracy": 0.905956506729126, "num_tokens": 13659169.0, "step": 1638 }, { "epoch": 1.2454407294832828, "grad_norm": 3.525848150253296, "learning_rate": 3.391029696783127e-06, "loss": 0.38543254137039185, "mean_token_accuracy": 0.8582839965820312, "num_tokens": 13662773.0, "step": 1639 }, { "epoch": 1.2462006079027357, "grad_norm": 3.2537992000579834, "learning_rate": 3.389072530371451e-06, "loss": 0.3429376780986786, "mean_token_accuracy": 0.8879032135009766, "num_tokens": 13665988.0, "step": 1640 }, { "epoch": 1.2469604863221884, "grad_norm": 1.8951609134674072, "learning_rate": 3.3871147399744482e-06, "loss": 0.3352762460708618, "mean_token_accuracy": 0.8795855045318604, "num_tokens": 13672638.0, "step": 1641 }, { "epoch": 1.2477203647416413, "grad_norm": 1.778814673423767, "learning_rate": 3.385156326966173e-06, "loss": 0.4749578833580017, "mean_token_accuracy": 0.8496826887130737, "num_tokens": 13681794.0, "step": 1642 }, { "epoch": 1.2484802431610942, "grad_norm": 2.0423994064331055, "learning_rate": 3.383197292721114e-06, "loss": 0.47299206256866455, "mean_token_accuracy": 0.838668167591095, "num_tokens": 13689925.0, "step": 1643 }, { "epoch": 1.249240121580547, "grad_norm": 2.104271411895752, "learning_rate": 3.3812376386141966e-06, "loss": 0.4408146142959595, "mean_token_accuracy": 0.8485715389251709, "num_tokens": 13698399.0, "step": 1644 }, { "epoch": 1.25, "grad_norm": 1.7226276397705078, "learning_rate": 3.379277366020782e-06, "loss": 0.3539600670337677, "mean_token_accuracy": 0.8850942850112915, "num_tokens": 13706076.0, "step": 1645 }, { "epoch": 1.250759878419453, "grad_norm": 2.4249346256256104, "learning_rate": 3.3773164763166653e-06, "loss": 0.20374828577041626, "mean_token_accuracy": 0.9257269501686096, "num_tokens": 13709509.0, "step": 1646 }, { "epoch": 1.2515197568389058, "grad_norm": 1.7691107988357544, "learning_rate": 3.3753549708780736e-06, "loss": 0.3723805844783783, "mean_token_accuracy": 0.8693065643310547, "num_tokens": 13718910.0, "step": 1647 }, { "epoch": 1.2522796352583587, "grad_norm": 2.578094720840454, "learning_rate": 3.3733928510816677e-06, "loss": 0.40901440382003784, "mean_token_accuracy": 0.855756402015686, "num_tokens": 13723782.0, "step": 1648 }, { "epoch": 1.2530395136778116, "grad_norm": 1.9573343992233276, "learning_rate": 3.3714301183045382e-06, "loss": 0.3774934709072113, "mean_token_accuracy": 0.8821815252304077, "num_tokens": 13732124.0, "step": 1649 }, { "epoch": 1.2537993920972643, "grad_norm": 1.7424739599227905, "learning_rate": 3.369466773924207e-06, "loss": 0.4050336182117462, "mean_token_accuracy": 0.8513686656951904, "num_tokens": 13740868.0, "step": 1650 }, { "epoch": 1.2545592705167175, "grad_norm": 1.319283127784729, "learning_rate": 3.3675028193186243e-06, "loss": 0.39144033193588257, "mean_token_accuracy": 0.8555953502655029, "num_tokens": 13757942.0, "step": 1651 }, { "epoch": 1.2553191489361701, "grad_norm": 1.8625404834747314, "learning_rate": 3.365538255866169e-06, "loss": 0.4031578302383423, "mean_token_accuracy": 0.8467690348625183, "num_tokens": 13766442.0, "step": 1652 }, { "epoch": 1.256079027355623, "grad_norm": 1.417905330657959, "learning_rate": 3.3635730849456484e-06, "loss": 0.2881859540939331, "mean_token_accuracy": 0.8885266780853271, "num_tokens": 13779245.0, "step": 1653 }, { "epoch": 1.256838905775076, "grad_norm": 1.1930468082427979, "learning_rate": 3.3616073079362925e-06, "loss": 0.28738901019096375, "mean_token_accuracy": 0.8943818807601929, "num_tokens": 13794251.0, "step": 1654 }, { "epoch": 1.2575987841945289, "grad_norm": 2.1434309482574463, "learning_rate": 3.3596409262177633e-06, "loss": 0.4327430725097656, "mean_token_accuracy": 0.8654351234436035, "num_tokens": 13802331.0, "step": 1655 }, { "epoch": 1.2583586626139818, "grad_norm": 1.432978630065918, "learning_rate": 3.357673941170139e-06, "loss": 0.3437100946903229, "mean_token_accuracy": 0.8954833745956421, "num_tokens": 13813168.0, "step": 1656 }, { "epoch": 1.2591185410334347, "grad_norm": 1.627819299697876, "learning_rate": 3.3557063541739283e-06, "loss": 0.3996168375015259, "mean_token_accuracy": 0.8546762466430664, "num_tokens": 13823288.0, "step": 1657 }, { "epoch": 1.2598784194528876, "grad_norm": 1.5285189151763916, "learning_rate": 3.353738166610058e-06, "loss": 0.38533344864845276, "mean_token_accuracy": 0.8551039099693298, "num_tokens": 13835081.0, "step": 1658 }, { "epoch": 1.2606382978723405, "grad_norm": 1.3429980278015137, "learning_rate": 3.35176937985988e-06, "loss": 0.3437798321247101, "mean_token_accuracy": 0.8686226010322571, "num_tokens": 13847338.0, "step": 1659 }, { "epoch": 1.2613981762917934, "grad_norm": 1.9459609985351562, "learning_rate": 3.349799995305162e-06, "loss": 0.32939398288726807, "mean_token_accuracy": 0.8858831524848938, "num_tokens": 13854349.0, "step": 1660 }, { "epoch": 1.262158054711246, "grad_norm": 2.653934955596924, "learning_rate": 3.3478300143280946e-06, "loss": 0.24988865852355957, "mean_token_accuracy": 0.925298810005188, "num_tokens": 13858210.0, "step": 1661 }, { "epoch": 1.2629179331306992, "grad_norm": 3.8534038066864014, "learning_rate": 3.3458594383112868e-06, "loss": 0.26958444714546204, "mean_token_accuracy": 0.9060179591178894, "num_tokens": 13860545.0, "step": 1662 }, { "epoch": 1.263677811550152, "grad_norm": 2.214930772781372, "learning_rate": 3.343888268637765e-06, "loss": 0.43907588720321655, "mean_token_accuracy": 0.8396751880645752, "num_tokens": 13868946.0, "step": 1663 }, { "epoch": 1.2644376899696048, "grad_norm": 1.800555944442749, "learning_rate": 3.341916506690971e-06, "loss": 0.46563518047332764, "mean_token_accuracy": 0.8329964876174927, "num_tokens": 13879459.0, "step": 1664 }, { "epoch": 1.2651975683890577, "grad_norm": 2.1766855716705322, "learning_rate": 3.3399441538547638e-06, "loss": 0.4407610297203064, "mean_token_accuracy": 0.8443752527236938, "num_tokens": 13885767.0, "step": 1665 }, { "epoch": 1.2659574468085106, "grad_norm": 2.222520112991333, "learning_rate": 3.337971211513417e-06, "loss": 0.3715386688709259, "mean_token_accuracy": 0.8759685754776001, "num_tokens": 13891838.0, "step": 1666 }, { "epoch": 1.2667173252279635, "grad_norm": 2.146975040435791, "learning_rate": 3.3359976810516164e-06, "loss": 0.3351247310638428, "mean_token_accuracy": 0.8825492262840271, "num_tokens": 13896868.0, "step": 1667 }, { "epoch": 1.2674772036474165, "grad_norm": 1.9139604568481445, "learning_rate": 3.3340235638544633e-06, "loss": 0.42799127101898193, "mean_token_accuracy": 0.8379320502281189, "num_tokens": 13905054.0, "step": 1668 }, { "epoch": 1.2682370820668694, "grad_norm": 2.103579521179199, "learning_rate": 3.332048861307467e-06, "loss": 0.4146561026573181, "mean_token_accuracy": 0.852485716342926, "num_tokens": 13912535.0, "step": 1669 }, { "epoch": 1.2689969604863223, "grad_norm": 1.7120500802993774, "learning_rate": 3.330073574796551e-06, "loss": 0.4229850769042969, "mean_token_accuracy": 0.8472385406494141, "num_tokens": 13923376.0, "step": 1670 }, { "epoch": 1.2697568389057752, "grad_norm": 1.9959901571273804, "learning_rate": 3.328097705708047e-06, "loss": 0.33442965149879456, "mean_token_accuracy": 0.8832216262817383, "num_tokens": 13928997.0, "step": 1671 }, { "epoch": 1.2705167173252279, "grad_norm": 1.9133622646331787, "learning_rate": 3.3261212554286977e-06, "loss": 0.5105348825454712, "mean_token_accuracy": 0.8361932039260864, "num_tokens": 13939742.0, "step": 1672 }, { "epoch": 1.2712765957446808, "grad_norm": 1.307115077972412, "learning_rate": 3.324144225345649e-06, "loss": 0.46023130416870117, "mean_token_accuracy": 0.8184223175048828, "num_tokens": 13956997.0, "step": 1673 }, { "epoch": 1.2720364741641337, "grad_norm": 1.9953432083129883, "learning_rate": 3.3221666168464584e-06, "loss": 0.3181626796722412, "mean_token_accuracy": 0.8694620132446289, "num_tokens": 13962965.0, "step": 1674 }, { "epoch": 1.2727963525835866, "grad_norm": 2.7137930393218994, "learning_rate": 3.320188431319088e-06, "loss": 0.36620837450027466, "mean_token_accuracy": 0.873613178730011, "num_tokens": 13967092.0, "step": 1675 }, { "epoch": 1.2735562310030395, "grad_norm": 1.409003496170044, "learning_rate": 3.318209670151904e-06, "loss": 0.3414056599140167, "mean_token_accuracy": 0.8718935251235962, "num_tokens": 13979150.0, "step": 1676 }, { "epoch": 1.2743161094224924, "grad_norm": 2.4496495723724365, "learning_rate": 3.3162303347336765e-06, "loss": 0.4553738236427307, "mean_token_accuracy": 0.8575567007064819, "num_tokens": 13984489.0, "step": 1677 }, { "epoch": 1.2750759878419453, "grad_norm": 1.3116135597229004, "learning_rate": 3.3142504264535808e-06, "loss": 0.282420814037323, "mean_token_accuracy": 0.8997855186462402, "num_tokens": 13996714.0, "step": 1678 }, { "epoch": 1.2758358662613982, "grad_norm": 1.2699991464614868, "learning_rate": 3.3122699467011913e-06, "loss": 0.2890617549419403, "mean_token_accuracy": 0.8953261375427246, "num_tokens": 14009837.0, "step": 1679 }, { "epoch": 1.2765957446808511, "grad_norm": 2.578388214111328, "learning_rate": 3.3102888968664857e-06, "loss": 0.42454755306243896, "mean_token_accuracy": 0.8443878889083862, "num_tokens": 14015436.0, "step": 1680 }, { "epoch": 1.2773556231003038, "grad_norm": 2.735137701034546, "learning_rate": 3.308307278339842e-06, "loss": 0.31889480352401733, "mean_token_accuracy": 0.8984297513961792, "num_tokens": 14018764.0, "step": 1681 }, { "epoch": 1.278115501519757, "grad_norm": 1.8130135536193848, "learning_rate": 3.306325092512034e-06, "loss": 0.3066205680370331, "mean_token_accuracy": 0.8954950571060181, "num_tokens": 14027518.0, "step": 1682 }, { "epoch": 1.2788753799392096, "grad_norm": 2.2724480628967285, "learning_rate": 3.3043423407742374e-06, "loss": 0.33755242824554443, "mean_token_accuracy": 0.8864662647247314, "num_tokens": 14032851.0, "step": 1683 }, { "epoch": 1.2796352583586625, "grad_norm": 2.8843767642974854, "learning_rate": 3.3023590245180237e-06, "loss": 0.3719598650932312, "mean_token_accuracy": 0.8820307850837708, "num_tokens": 14036970.0, "step": 1684 }, { "epoch": 1.2803951367781155, "grad_norm": 1.650910496711731, "learning_rate": 3.300375145135361e-06, "loss": 0.444785475730896, "mean_token_accuracy": 0.8410971164703369, "num_tokens": 14052369.0, "step": 1685 }, { "epoch": 1.2811550151975684, "grad_norm": 1.6646356582641602, "learning_rate": 3.2983907040186112e-06, "loss": 0.3199513852596283, "mean_token_accuracy": 0.8973816633224487, "num_tokens": 14060686.0, "step": 1686 }, { "epoch": 1.2819148936170213, "grad_norm": 1.729041576385498, "learning_rate": 3.296405702560532e-06, "loss": 0.3635617792606354, "mean_token_accuracy": 0.8692331314086914, "num_tokens": 14069611.0, "step": 1687 }, { "epoch": 1.2826747720364742, "grad_norm": 2.2488558292388916, "learning_rate": 3.294420142154274e-06, "loss": 0.42253026366233826, "mean_token_accuracy": 0.8746185898780823, "num_tokens": 14077524.0, "step": 1688 }, { "epoch": 1.283434650455927, "grad_norm": 2.913764238357544, "learning_rate": 3.29243402419338e-06, "loss": 0.3872144818305969, "mean_token_accuracy": 0.8600287437438965, "num_tokens": 14081589.0, "step": 1689 }, { "epoch": 1.28419452887538, "grad_norm": 1.8794227838516235, "learning_rate": 3.2904473500717826e-06, "loss": 0.33545660972595215, "mean_token_accuracy": 0.876190185546875, "num_tokens": 14088370.0, "step": 1690 }, { "epoch": 1.284954407294833, "grad_norm": 2.5781097412109375, "learning_rate": 3.2884601211838087e-06, "loss": 0.36813071370124817, "mean_token_accuracy": 0.8578311800956726, "num_tokens": 14093087.0, "step": 1691 }, { "epoch": 1.2857142857142856, "grad_norm": 1.504655361175537, "learning_rate": 3.2864723389241697e-06, "loss": 0.4485331177711487, "mean_token_accuracy": 0.8405745029449463, "num_tokens": 14107052.0, "step": 1692 }, { "epoch": 1.2864741641337387, "grad_norm": 1.7300686836242676, "learning_rate": 3.284484004687969e-06, "loss": 0.3415663540363312, "mean_token_accuracy": 0.8776445984840393, "num_tokens": 14115446.0, "step": 1693 }, { "epoch": 1.2872340425531914, "grad_norm": 1.678526759147644, "learning_rate": 3.2824951198706958e-06, "loss": 0.35631367564201355, "mean_token_accuracy": 0.8998591899871826, "num_tokens": 14122606.0, "step": 1694 }, { "epoch": 1.2879939209726443, "grad_norm": 1.7024660110473633, "learning_rate": 3.280505685868226e-06, "loss": 0.3771167993545532, "mean_token_accuracy": 0.879543125629425, "num_tokens": 14132744.0, "step": 1695 }, { "epoch": 1.2887537993920972, "grad_norm": 2.8386805057525635, "learning_rate": 3.278515704076821e-06, "loss": 0.2524474859237671, "mean_token_accuracy": 0.9063973426818848, "num_tokens": 14135915.0, "step": 1696 }, { "epoch": 1.2895136778115501, "grad_norm": 1.7393723726272583, "learning_rate": 3.276525175893126e-06, "loss": 0.3635467290878296, "mean_token_accuracy": 0.8641912937164307, "num_tokens": 14144138.0, "step": 1697 }, { "epoch": 1.290273556231003, "grad_norm": 1.1936280727386475, "learning_rate": 3.274534102714172e-06, "loss": 0.33632004261016846, "mean_token_accuracy": 0.8785254955291748, "num_tokens": 14161433.0, "step": 1698 }, { "epoch": 1.291033434650456, "grad_norm": 1.6649963855743408, "learning_rate": 3.272542485937369e-06, "loss": 0.37544432282447815, "mean_token_accuracy": 0.8841269016265869, "num_tokens": 14174524.0, "step": 1699 }, { "epoch": 1.2917933130699089, "grad_norm": 2.386367082595825, "learning_rate": 3.270550326960511e-06, "loss": 0.3602409362792969, "mean_token_accuracy": 0.8775192499160767, "num_tokens": 14179780.0, "step": 1700 }, { "epoch": 1.2925531914893618, "grad_norm": 1.2309118509292603, "learning_rate": 3.268557627181772e-06, "loss": 0.2998388707637787, "mean_token_accuracy": 0.870904803276062, "num_tokens": 14192302.0, "step": 1701 }, { "epoch": 1.2933130699088147, "grad_norm": 1.7909103631973267, "learning_rate": 3.2665643879997054e-06, "loss": 0.4640781879425049, "mean_token_accuracy": 0.8320716619491577, "num_tokens": 14202725.0, "step": 1702 }, { "epoch": 1.2940729483282674, "grad_norm": 2.7360482215881348, "learning_rate": 3.2645706108132426e-06, "loss": 0.31890684366226196, "mean_token_accuracy": 0.8809678554534912, "num_tokens": 14206481.0, "step": 1703 }, { "epoch": 1.2948328267477205, "grad_norm": 2.320089817047119, "learning_rate": 3.2625762970216944e-06, "loss": 0.37968605756759644, "mean_token_accuracy": 0.8768818378448486, "num_tokens": 14211873.0, "step": 1704 }, { "epoch": 1.2955927051671732, "grad_norm": 3.1481642723083496, "learning_rate": 3.2605814480247454e-06, "loss": 0.4423736333847046, "mean_token_accuracy": 0.8536900281906128, "num_tokens": 14216007.0, "step": 1705 }, { "epoch": 1.296352583586626, "grad_norm": 2.2253670692443848, "learning_rate": 3.258586065222459e-06, "loss": 0.5184187889099121, "mean_token_accuracy": 0.818327784538269, "num_tokens": 14222702.0, "step": 1706 }, { "epoch": 1.297112462006079, "grad_norm": 1.931569218635559, "learning_rate": 3.2565901500152702e-06, "loss": 0.4868287742137909, "mean_token_accuracy": 0.8749524354934692, "num_tokens": 14234081.0, "step": 1707 }, { "epoch": 1.297872340425532, "grad_norm": 2.0268192291259766, "learning_rate": 3.2545937038039904e-06, "loss": 0.4449063241481781, "mean_token_accuracy": 0.843561589717865, "num_tokens": 14242453.0, "step": 1708 }, { "epoch": 1.2986322188449848, "grad_norm": 2.513216257095337, "learning_rate": 3.2525967279898017e-06, "loss": 0.4250633716583252, "mean_token_accuracy": 0.8536792993545532, "num_tokens": 14247496.0, "step": 1709 }, { "epoch": 1.2993920972644377, "grad_norm": 2.7836897373199463, "learning_rate": 3.2505992239742582e-06, "loss": 0.23209503293037415, "mean_token_accuracy": 0.9099798798561096, "num_tokens": 14250408.0, "step": 1710 }, { "epoch": 1.3001519756838906, "grad_norm": 2.2901854515075684, "learning_rate": 3.2486011931592863e-06, "loss": 0.468505322933197, "mean_token_accuracy": 0.8332347869873047, "num_tokens": 14256017.0, "step": 1711 }, { "epoch": 1.3009118541033435, "grad_norm": 3.0638043880462646, "learning_rate": 3.2466026369471804e-06, "loss": 0.33800238370895386, "mean_token_accuracy": 0.8722471594810486, "num_tokens": 14260053.0, "step": 1712 }, { "epoch": 1.3016717325227964, "grad_norm": 1.4896595478057861, "learning_rate": 3.2446035567406033e-06, "loss": 0.4121527671813965, "mean_token_accuracy": 0.8510831594467163, "num_tokens": 14271294.0, "step": 1713 }, { "epoch": 1.3024316109422491, "grad_norm": 1.671916127204895, "learning_rate": 3.2426039539425875e-06, "loss": 0.5107783079147339, "mean_token_accuracy": 0.8538745641708374, "num_tokens": 14283049.0, "step": 1714 }, { "epoch": 1.3031914893617023, "grad_norm": 2.4864914417266846, "learning_rate": 3.240603829956531e-06, "loss": 0.40643227100372314, "mean_token_accuracy": 0.8515803813934326, "num_tokens": 14288411.0, "step": 1715 }, { "epoch": 1.303951367781155, "grad_norm": 1.6524593830108643, "learning_rate": 3.238603186186198e-06, "loss": 0.3950223922729492, "mean_token_accuracy": 0.8688688278198242, "num_tokens": 14298571.0, "step": 1716 }, { "epoch": 1.3047112462006079, "grad_norm": 2.279806613922119, "learning_rate": 3.2366020240357166e-06, "loss": 0.2904418110847473, "mean_token_accuracy": 0.8918064832687378, "num_tokens": 14304475.0, "step": 1717 }, { "epoch": 1.3054711246200608, "grad_norm": 3.1041128635406494, "learning_rate": 3.2346003449095803e-06, "loss": 0.3866184949874878, "mean_token_accuracy": 0.868927001953125, "num_tokens": 14308401.0, "step": 1718 }, { "epoch": 1.3062310030395137, "grad_norm": 3.3894896507263184, "learning_rate": 3.2325981502126434e-06, "loss": 0.28965574502944946, "mean_token_accuracy": 0.9132376909255981, "num_tokens": 14311343.0, "step": 1719 }, { "epoch": 1.3069908814589666, "grad_norm": 2.2272822856903076, "learning_rate": 3.2305954413501252e-06, "loss": 0.3365956246852875, "mean_token_accuracy": 0.8895664215087891, "num_tokens": 14316804.0, "step": 1720 }, { "epoch": 1.3077507598784195, "grad_norm": 1.9720094203948975, "learning_rate": 3.228592219727602e-06, "loss": 0.4122162461280823, "mean_token_accuracy": 0.8482868671417236, "num_tokens": 14323714.0, "step": 1721 }, { "epoch": 1.3085106382978724, "grad_norm": 1.7257472276687622, "learning_rate": 3.226588486751012e-06, "loss": 0.5060480833053589, "mean_token_accuracy": 0.8220949172973633, "num_tokens": 14338019.0, "step": 1722 }, { "epoch": 1.3092705167173253, "grad_norm": 1.4680986404418945, "learning_rate": 3.2245842438266526e-06, "loss": 0.31893390417099, "mean_token_accuracy": 0.8578824996948242, "num_tokens": 14349015.0, "step": 1723 }, { "epoch": 1.3100303951367782, "grad_norm": 1.8801017999649048, "learning_rate": 3.222579492361179e-06, "loss": 0.449579119682312, "mean_token_accuracy": 0.8501745462417603, "num_tokens": 14358579.0, "step": 1724 }, { "epoch": 1.310790273556231, "grad_norm": 1.298132300376892, "learning_rate": 3.220574233761603e-06, "loss": 0.31765592098236084, "mean_token_accuracy": 0.902048647403717, "num_tokens": 14374065.0, "step": 1725 }, { "epoch": 1.3115501519756838, "grad_norm": 3.380974769592285, "learning_rate": 3.2185684694352913e-06, "loss": 0.32877063751220703, "mean_token_accuracy": 0.8823810815811157, "num_tokens": 14377135.0, "step": 1726 }, { "epoch": 1.3123100303951367, "grad_norm": 2.0686631202697754, "learning_rate": 3.216562200789968e-06, "loss": 0.35204291343688965, "mean_token_accuracy": 0.8630719184875488, "num_tokens": 14383776.0, "step": 1727 }, { "epoch": 1.3130699088145896, "grad_norm": 3.423368453979492, "learning_rate": 3.214555429233707e-06, "loss": 0.475549578666687, "mean_token_accuracy": 0.8287918567657471, "num_tokens": 14387388.0, "step": 1728 }, { "epoch": 1.3138297872340425, "grad_norm": 2.5058200359344482, "learning_rate": 3.2125481561749406e-06, "loss": 0.4823055565357208, "mean_token_accuracy": 0.8550118207931519, "num_tokens": 14392665.0, "step": 1729 }, { "epoch": 1.3145896656534954, "grad_norm": 2.4830386638641357, "learning_rate": 3.210540383022449e-06, "loss": 0.5080981254577637, "mean_token_accuracy": 0.8158162832260132, "num_tokens": 14398026.0, "step": 1730 }, { "epoch": 1.3153495440729484, "grad_norm": 1.8622243404388428, "learning_rate": 3.208532111185365e-06, "loss": 0.5277630686759949, "mean_token_accuracy": 0.8199248909950256, "num_tokens": 14407727.0, "step": 1731 }, { "epoch": 1.3161094224924013, "grad_norm": 1.4649666547775269, "learning_rate": 3.2065233420731717e-06, "loss": 0.24797789752483368, "mean_token_accuracy": 0.9193159341812134, "num_tokens": 14416980.0, "step": 1732 }, { "epoch": 1.3168693009118542, "grad_norm": 1.850425124168396, "learning_rate": 3.2045140770956987e-06, "loss": 0.37115979194641113, "mean_token_accuracy": 0.8730438351631165, "num_tokens": 14425281.0, "step": 1733 }, { "epoch": 1.3176291793313069, "grad_norm": 1.763343095779419, "learning_rate": 3.2025043176631283e-06, "loss": 0.45721665024757385, "mean_token_accuracy": 0.8386655449867249, "num_tokens": 14437703.0, "step": 1734 }, { "epoch": 1.31838905775076, "grad_norm": 1.9258220195770264, "learning_rate": 3.2004940651859844e-06, "loss": 0.3945969045162201, "mean_token_accuracy": 0.8956382274627686, "num_tokens": 14444062.0, "step": 1735 }, { "epoch": 1.3191489361702127, "grad_norm": 2.4880564212799072, "learning_rate": 3.198483321075141e-06, "loss": 0.5082302093505859, "mean_token_accuracy": 0.8306169509887695, "num_tokens": 14449797.0, "step": 1736 }, { "epoch": 1.3199088145896656, "grad_norm": 1.7109283208847046, "learning_rate": 3.196472086741815e-06, "loss": 0.5045887231826782, "mean_token_accuracy": 0.8213133811950684, "num_tokens": 14462272.0, "step": 1737 }, { "epoch": 1.3206686930091185, "grad_norm": 2.0488271713256836, "learning_rate": 3.194460363597569e-06, "loss": 0.325813889503479, "mean_token_accuracy": 0.8903666734695435, "num_tokens": 14468209.0, "step": 1738 }, { "epoch": 1.3214285714285714, "grad_norm": 3.7279975414276123, "learning_rate": 3.192448153054306e-06, "loss": 0.4074842929840088, "mean_token_accuracy": 0.8528945446014404, "num_tokens": 14471290.0, "step": 1739 }, { "epoch": 1.3221884498480243, "grad_norm": 2.103818655014038, "learning_rate": 3.190435456524275e-06, "loss": 0.4232693910598755, "mean_token_accuracy": 0.8500773906707764, "num_tokens": 14478533.0, "step": 1740 }, { "epoch": 1.3229483282674772, "grad_norm": 1.581458330154419, "learning_rate": 3.188422275420063e-06, "loss": 0.3913007080554962, "mean_token_accuracy": 0.8680640459060669, "num_tokens": 14489235.0, "step": 1741 }, { "epoch": 1.3237082066869301, "grad_norm": 2.702343702316284, "learning_rate": 3.186408611154597e-06, "loss": 0.20923522114753723, "mean_token_accuracy": 0.9195057153701782, "num_tokens": 14492362.0, "step": 1742 }, { "epoch": 1.324468085106383, "grad_norm": 2.3749828338623047, "learning_rate": 3.184394465141146e-06, "loss": 0.4011795222759247, "mean_token_accuracy": 0.8654810190200806, "num_tokens": 14497605.0, "step": 1743 }, { "epoch": 1.325227963525836, "grad_norm": 2.670440912246704, "learning_rate": 3.1823798387933134e-06, "loss": 0.3667614459991455, "mean_token_accuracy": 0.8844491243362427, "num_tokens": 14501926.0, "step": 1744 }, { "epoch": 1.3259878419452886, "grad_norm": 1.5772322416305542, "learning_rate": 3.180364733525043e-06, "loss": 0.42633625864982605, "mean_token_accuracy": 0.8359421491622925, "num_tokens": 14515260.0, "step": 1745 }, { "epoch": 1.3267477203647418, "grad_norm": 1.6032065153121948, "learning_rate": 3.178349150750612e-06, "loss": 0.3195420801639557, "mean_token_accuracy": 0.8810466527938843, "num_tokens": 14524066.0, "step": 1746 }, { "epoch": 1.3275075987841944, "grad_norm": 2.1783196926116943, "learning_rate": 3.1763330918846347e-06, "loss": 0.36809593439102173, "mean_token_accuracy": 0.869278073310852, "num_tokens": 14530135.0, "step": 1747 }, { "epoch": 1.3282674772036474, "grad_norm": 2.3493285179138184, "learning_rate": 3.1743165583420586e-06, "loss": 0.37988102436065674, "mean_token_accuracy": 0.868124783039093, "num_tokens": 14535033.0, "step": 1748 }, { "epoch": 1.3290273556231003, "grad_norm": 2.1477203369140625, "learning_rate": 3.1722995515381644e-06, "loss": 0.46648144721984863, "mean_token_accuracy": 0.8559156656265259, "num_tokens": 14542345.0, "step": 1749 }, { "epoch": 1.3297872340425532, "grad_norm": 1.4608255624771118, "learning_rate": 3.1702820728885657e-06, "loss": 0.39337849617004395, "mean_token_accuracy": 0.8639435172080994, "num_tokens": 14554915.0, "step": 1750 }, { "epoch": 1.330547112462006, "grad_norm": 2.3651328086853027, "learning_rate": 3.1682641238092064e-06, "loss": 0.49681341648101807, "mean_token_accuracy": 0.8158421516418457, "num_tokens": 14562006.0, "step": 1751 }, { "epoch": 1.331306990881459, "grad_norm": 0.9801863431930542, "learning_rate": 3.1662457057163603e-06, "loss": 0.31877395510673523, "mean_token_accuracy": 0.8827112317085266, "num_tokens": 14584483.0, "step": 1752 }, { "epoch": 1.332066869300912, "grad_norm": 2.4509572982788086, "learning_rate": 3.164226820026632e-06, "loss": 0.3388911187648773, "mean_token_accuracy": 0.902089536190033, "num_tokens": 14589532.0, "step": 1753 }, { "epoch": 1.3328267477203648, "grad_norm": 1.3197529315948486, "learning_rate": 3.162207468156952e-06, "loss": 0.3266242742538452, "mean_token_accuracy": 0.8774392604827881, "num_tokens": 14601366.0, "step": 1754 }, { "epoch": 1.3335866261398177, "grad_norm": 2.1562647819519043, "learning_rate": 3.16018765152458e-06, "loss": 0.37000200152397156, "mean_token_accuracy": 0.8871214389801025, "num_tokens": 14606888.0, "step": 1755 }, { "epoch": 1.3343465045592704, "grad_norm": 1.3734146356582642, "learning_rate": 3.1581673715471007e-06, "loss": 0.3569720685482025, "mean_token_accuracy": 0.8710688352584839, "num_tokens": 14620336.0, "step": 1756 }, { "epoch": 1.3351063829787235, "grad_norm": 1.3672215938568115, "learning_rate": 3.1561466296424247e-06, "loss": 0.35540711879730225, "mean_token_accuracy": 0.8681244850158691, "num_tokens": 14632580.0, "step": 1757 }, { "epoch": 1.3358662613981762, "grad_norm": 1.8354918956756592, "learning_rate": 3.154125427228786e-06, "loss": 0.37910211086273193, "mean_token_accuracy": 0.8560112118721008, "num_tokens": 14640649.0, "step": 1758 }, { "epoch": 1.3366261398176291, "grad_norm": 1.393344521522522, "learning_rate": 3.152103765724743e-06, "loss": 0.4192149043083191, "mean_token_accuracy": 0.8449965119361877, "num_tokens": 14655152.0, "step": 1759 }, { "epoch": 1.337386018237082, "grad_norm": 2.574110984802246, "learning_rate": 3.150081646549174e-06, "loss": 0.32852211594581604, "mean_token_accuracy": 0.8917709589004517, "num_tokens": 14659084.0, "step": 1760 }, { "epoch": 1.338145896656535, "grad_norm": 2.0882370471954346, "learning_rate": 3.1480590711212823e-06, "loss": 0.36409103870391846, "mean_token_accuracy": 0.8699878454208374, "num_tokens": 14665483.0, "step": 1761 }, { "epoch": 1.3389057750759878, "grad_norm": 2.1478257179260254, "learning_rate": 3.1460360408605866e-06, "loss": 0.2700880765914917, "mean_token_accuracy": 0.8920501470565796, "num_tokens": 14669978.0, "step": 1762 }, { "epoch": 1.3396656534954408, "grad_norm": 2.0100364685058594, "learning_rate": 3.144012557186931e-06, "loss": 0.40936514735221863, "mean_token_accuracy": 0.8698817491531372, "num_tokens": 14678772.0, "step": 1763 }, { "epoch": 1.3404255319148937, "grad_norm": 2.555269241333008, "learning_rate": 3.14198862152047e-06, "loss": 0.3917113244533539, "mean_token_accuracy": 0.86394202709198, "num_tokens": 14683231.0, "step": 1764 }, { "epoch": 1.3411854103343466, "grad_norm": 2.343047857284546, "learning_rate": 3.1399642352816825e-06, "loss": 0.2664611041545868, "mean_token_accuracy": 0.9000940322875977, "num_tokens": 14687546.0, "step": 1765 }, { "epoch": 1.3419452887537995, "grad_norm": 1.2528942823410034, "learning_rate": 3.1379393998913594e-06, "loss": 0.275432825088501, "mean_token_accuracy": 0.8993785381317139, "num_tokens": 14700255.0, "step": 1766 }, { "epoch": 1.3427051671732522, "grad_norm": 2.1546859741210938, "learning_rate": 3.135914116770609e-06, "loss": 0.3017679750919342, "mean_token_accuracy": 0.8885464668273926, "num_tokens": 14704839.0, "step": 1767 }, { "epoch": 1.3434650455927053, "grad_norm": 4.276898384094238, "learning_rate": 3.1338883873408517e-06, "loss": 0.41410398483276367, "mean_token_accuracy": 0.8653874397277832, "num_tokens": 14707413.0, "step": 1768 }, { "epoch": 1.344224924012158, "grad_norm": 1.9052214622497559, "learning_rate": 3.1318622130238237e-06, "loss": 0.4214959144592285, "mean_token_accuracy": 0.8464792966842651, "num_tokens": 14716010.0, "step": 1769 }, { "epoch": 1.344984802431611, "grad_norm": 2.422417640686035, "learning_rate": 3.1298355952415714e-06, "loss": 0.35174620151519775, "mean_token_accuracy": 0.88395756483078, "num_tokens": 14720396.0, "step": 1770 }, { "epoch": 1.3457446808510638, "grad_norm": 1.6704987287521362, "learning_rate": 3.127808535416454e-06, "loss": 0.47611168026924133, "mean_token_accuracy": 0.8508312702178955, "num_tokens": 14731637.0, "step": 1771 }, { "epoch": 1.3465045592705167, "grad_norm": 2.176877021789551, "learning_rate": 3.1257810349711388e-06, "loss": 0.4700402021408081, "mean_token_accuracy": 0.832779586315155, "num_tokens": 14738647.0, "step": 1772 }, { "epoch": 1.3472644376899696, "grad_norm": 1.3342349529266357, "learning_rate": 3.1237530953286046e-06, "loss": 0.4833040237426758, "mean_token_accuracy": 0.829606831073761, "num_tokens": 14757952.0, "step": 1773 }, { "epoch": 1.3480243161094225, "grad_norm": 2.3696649074554443, "learning_rate": 3.121724717912138e-06, "loss": 0.3039613962173462, "mean_token_accuracy": 0.8929629325866699, "num_tokens": 14763167.0, "step": 1774 }, { "epoch": 1.3487841945288754, "grad_norm": 1.9273014068603516, "learning_rate": 3.11969590414533e-06, "loss": 0.3786233961582184, "mean_token_accuracy": 0.8837813138961792, "num_tokens": 14769502.0, "step": 1775 }, { "epoch": 1.3495440729483283, "grad_norm": 1.7184932231903076, "learning_rate": 3.1176666554520827e-06, "loss": 0.37997254729270935, "mean_token_accuracy": 0.8764146566390991, "num_tokens": 14778180.0, "step": 1776 }, { "epoch": 1.3503039513677813, "grad_norm": 1.775872826576233, "learning_rate": 3.1156369732566006e-06, "loss": 0.42275550961494446, "mean_token_accuracy": 0.8486601114273071, "num_tokens": 14789174.0, "step": 1777 }, { "epoch": 1.351063829787234, "grad_norm": 1.774247169494629, "learning_rate": 3.113606858983391e-06, "loss": 0.35329151153564453, "mean_token_accuracy": 0.856008768081665, "num_tokens": 14797422.0, "step": 1778 }, { "epoch": 1.3518237082066868, "grad_norm": 1.4550470113754272, "learning_rate": 3.1115763140572686e-06, "loss": 0.4446291923522949, "mean_token_accuracy": 0.8477867841720581, "num_tokens": 14815582.0, "step": 1779 }, { "epoch": 1.3525835866261398, "grad_norm": 1.740195870399475, "learning_rate": 3.109545339903347e-06, "loss": 0.447843998670578, "mean_token_accuracy": 0.857703685760498, "num_tokens": 14826124.0, "step": 1780 }, { "epoch": 1.3533434650455927, "grad_norm": 2.020663261413574, "learning_rate": 3.107513937947041e-06, "loss": 0.4222099184989929, "mean_token_accuracy": 0.8516832590103149, "num_tokens": 14834100.0, "step": 1781 }, { "epoch": 1.3541033434650456, "grad_norm": 1.5809416770935059, "learning_rate": 3.1054821096140675e-06, "loss": 0.39603835344314575, "mean_token_accuracy": 0.8741145133972168, "num_tokens": 14843046.0, "step": 1782 }, { "epoch": 1.3548632218844985, "grad_norm": 2.231323480606079, "learning_rate": 3.1034498563304435e-06, "loss": 0.39556390047073364, "mean_token_accuracy": 0.8448240160942078, "num_tokens": 14849025.0, "step": 1783 }, { "epoch": 1.3556231003039514, "grad_norm": 2.2446820735931396, "learning_rate": 3.1014171795224794e-06, "loss": 0.3440389037132263, "mean_token_accuracy": 0.8767511248588562, "num_tokens": 14854541.0, "step": 1784 }, { "epoch": 1.3563829787234043, "grad_norm": 1.986039161682129, "learning_rate": 3.0993840806167884e-06, "loss": 0.4236282706260681, "mean_token_accuracy": 0.8400864601135254, "num_tokens": 14861567.0, "step": 1785 }, { "epoch": 1.3571428571428572, "grad_norm": 1.8986533880233765, "learning_rate": 3.0973505610402767e-06, "loss": 0.40832850337028503, "mean_token_accuracy": 0.8496479988098145, "num_tokens": 14869300.0, "step": 1786 }, { "epoch": 1.35790273556231, "grad_norm": 1.7468228340148926, "learning_rate": 3.0953166222201474e-06, "loss": 0.41121214628219604, "mean_token_accuracy": 0.8485229015350342, "num_tokens": 14879250.0, "step": 1787 }, { "epoch": 1.358662613981763, "grad_norm": 2.4853055477142334, "learning_rate": 3.093282265583895e-06, "loss": 0.42357271909713745, "mean_token_accuracy": 0.8484030961990356, "num_tokens": 14884531.0, "step": 1788 }, { "epoch": 1.3594224924012157, "grad_norm": 2.867727041244507, "learning_rate": 3.0912474925593124e-06, "loss": 0.3592284917831421, "mean_token_accuracy": 0.8797116875648499, "num_tokens": 14888143.0, "step": 1789 }, { "epoch": 1.3601823708206686, "grad_norm": 1.6568496227264404, "learning_rate": 3.0892123045744787e-06, "loss": 0.41129639744758606, "mean_token_accuracy": 0.854656457901001, "num_tokens": 14899614.0, "step": 1790 }, { "epoch": 1.3609422492401215, "grad_norm": 1.9425686597824097, "learning_rate": 3.0871767030577686e-06, "loss": 0.48112088441848755, "mean_token_accuracy": 0.8330042958259583, "num_tokens": 14909249.0, "step": 1791 }, { "epoch": 1.3617021276595744, "grad_norm": 2.1373918056488037, "learning_rate": 3.085140689437846e-06, "loss": 0.39633214473724365, "mean_token_accuracy": 0.8634995222091675, "num_tokens": 14916515.0, "step": 1792 }, { "epoch": 1.3624620060790273, "grad_norm": 2.1554017066955566, "learning_rate": 3.0831042651436634e-06, "loss": 0.3575013279914856, "mean_token_accuracy": 0.8727827668190002, "num_tokens": 14923508.0, "step": 1793 }, { "epoch": 1.3632218844984803, "grad_norm": 1.430115818977356, "learning_rate": 3.0810674316044602e-06, "loss": 0.33787524700164795, "mean_token_accuracy": 0.8799197673797607, "num_tokens": 14936192.0, "step": 1794 }, { "epoch": 1.3639817629179332, "grad_norm": 1.474648356437683, "learning_rate": 3.0790301902497664e-06, "loss": 0.3898437023162842, "mean_token_accuracy": 0.8537415266036987, "num_tokens": 14950322.0, "step": 1795 }, { "epoch": 1.364741641337386, "grad_norm": 1.9821518659591675, "learning_rate": 3.076992542509396e-06, "loss": 0.39159440994262695, "mean_token_accuracy": 0.8630361557006836, "num_tokens": 14958273.0, "step": 1796 }, { "epoch": 1.365501519756839, "grad_norm": 1.9144015312194824, "learning_rate": 3.0749544898134487e-06, "loss": 0.3024989664554596, "mean_token_accuracy": 0.891775369644165, "num_tokens": 14965118.0, "step": 1797 }, { "epoch": 1.3662613981762917, "grad_norm": 2.0583138465881348, "learning_rate": 3.072916033592307e-06, "loss": 0.31312304735183716, "mean_token_accuracy": 0.887916624546051, "num_tokens": 14970934.0, "step": 1798 }, { "epoch": 1.3670212765957448, "grad_norm": 3.4647998809814453, "learning_rate": 3.0708771752766397e-06, "loss": 0.441008061170578, "mean_token_accuracy": 0.8497233390808105, "num_tokens": 14974516.0, "step": 1799 }, { "epoch": 1.3677811550151975, "grad_norm": 1.6139730215072632, "learning_rate": 3.068837916297396e-06, "loss": 0.3950765132904053, "mean_token_accuracy": 0.8398228883743286, "num_tokens": 14986619.0, "step": 1800 }, { "epoch": 1.3685410334346504, "grad_norm": 1.6129515171051025, "learning_rate": 3.0667982580858047e-06, "loss": 0.3644065856933594, "mean_token_accuracy": 0.8787021636962891, "num_tokens": 14998569.0, "step": 1801 }, { "epoch": 1.3693009118541033, "grad_norm": 2.4186716079711914, "learning_rate": 3.0647582020733773e-06, "loss": 0.38788801431655884, "mean_token_accuracy": 0.8616076707839966, "num_tokens": 15003921.0, "step": 1802 }, { "epoch": 1.3700607902735562, "grad_norm": 1.8819490671157837, "learning_rate": 3.062717749691904e-06, "loss": 0.4204309582710266, "mean_token_accuracy": 0.8535496592521667, "num_tokens": 15011899.0, "step": 1803 }, { "epoch": 1.3708206686930091, "grad_norm": 1.4631839990615845, "learning_rate": 3.0606769023734535e-06, "loss": 0.37897083163261414, "mean_token_accuracy": 0.8629422187805176, "num_tokens": 15024013.0, "step": 1804 }, { "epoch": 1.371580547112462, "grad_norm": 1.628963828086853, "learning_rate": 3.0586356615503693e-06, "loss": 0.37859752774238586, "mean_token_accuracy": 0.8950241804122925, "num_tokens": 15034232.0, "step": 1805 }, { "epoch": 1.372340425531915, "grad_norm": 1.771040678024292, "learning_rate": 3.056594028655274e-06, "loss": 0.37309446930885315, "mean_token_accuracy": 0.9000084400177002, "num_tokens": 15042946.0, "step": 1806 }, { "epoch": 1.3731003039513678, "grad_norm": 1.9946256875991821, "learning_rate": 3.0545520051210637e-06, "loss": 0.44396111369132996, "mean_token_accuracy": 0.8430798053741455, "num_tokens": 15050514.0, "step": 1807 }, { "epoch": 1.3738601823708207, "grad_norm": 3.171607732772827, "learning_rate": 3.052509592380909e-06, "loss": 0.22399668395519257, "mean_token_accuracy": 0.9169777035713196, "num_tokens": 15053075.0, "step": 1808 }, { "epoch": 1.3746200607902734, "grad_norm": 1.8868348598480225, "learning_rate": 3.050466791868254e-06, "loss": 0.4898882806301117, "mean_token_accuracy": 0.8336533904075623, "num_tokens": 15063419.0, "step": 1809 }, { "epoch": 1.3753799392097266, "grad_norm": 2.0656440258026123, "learning_rate": 3.048423605016815e-06, "loss": 0.49566611647605896, "mean_token_accuracy": 0.8341299891471863, "num_tokens": 15077604.0, "step": 1810 }, { "epoch": 1.3761398176291793, "grad_norm": 3.5017685890197754, "learning_rate": 3.0463800332605787e-06, "loss": 0.249919593334198, "mean_token_accuracy": 0.90781170129776, "num_tokens": 15080125.0, "step": 1811 }, { "epoch": 1.3768996960486322, "grad_norm": 1.782050609588623, "learning_rate": 3.0443360780338034e-06, "loss": 0.3976958096027374, "mean_token_accuracy": 0.8589468002319336, "num_tokens": 15090178.0, "step": 1812 }, { "epoch": 1.377659574468085, "grad_norm": 2.2711188793182373, "learning_rate": 3.042291740771014e-06, "loss": 0.37493959069252014, "mean_token_accuracy": 0.8776636123657227, "num_tokens": 15095079.0, "step": 1813 }, { "epoch": 1.378419452887538, "grad_norm": 2.524413585662842, "learning_rate": 3.0402470229070057e-06, "loss": 0.3941831588745117, "mean_token_accuracy": 0.8824131488800049, "num_tokens": 15099432.0, "step": 1814 }, { "epoch": 1.3791793313069909, "grad_norm": 1.2691515684127808, "learning_rate": 3.03820192587684e-06, "loss": 0.39009249210357666, "mean_token_accuracy": 0.8554127216339111, "num_tokens": 15112827.0, "step": 1815 }, { "epoch": 1.3799392097264438, "grad_norm": 2.1586501598358154, "learning_rate": 3.036156451115846e-06, "loss": 0.38009151816368103, "mean_token_accuracy": 0.8648550510406494, "num_tokens": 15119109.0, "step": 1816 }, { "epoch": 1.3806990881458967, "grad_norm": 1.965774416923523, "learning_rate": 3.034110600059616e-06, "loss": 0.309835284948349, "mean_token_accuracy": 0.8951961994171143, "num_tokens": 15125762.0, "step": 1817 }, { "epoch": 1.3814589665653496, "grad_norm": 1.9553396701812744, "learning_rate": 3.0320643741440052e-06, "loss": 0.45383524894714355, "mean_token_accuracy": 0.840209424495697, "num_tokens": 15132489.0, "step": 1818 }, { "epoch": 1.3822188449848025, "grad_norm": 2.563767910003662, "learning_rate": 3.0300177748051375e-06, "loss": 0.3535853624343872, "mean_token_accuracy": 0.8712579607963562, "num_tokens": 15137164.0, "step": 1819 }, { "epoch": 1.3829787234042552, "grad_norm": 1.504638671875, "learning_rate": 3.0279708034793907e-06, "loss": 0.31597834825515747, "mean_token_accuracy": 0.8812779188156128, "num_tokens": 15147658.0, "step": 1820 }, { "epoch": 1.3837386018237083, "grad_norm": 2.439223289489746, "learning_rate": 3.025923461603412e-06, "loss": 0.3924642503261566, "mean_token_accuracy": 0.8627969026565552, "num_tokens": 15153472.0, "step": 1821 }, { "epoch": 1.384498480243161, "grad_norm": 2.8051810264587402, "learning_rate": 3.0238757506141013e-06, "loss": 0.4051172137260437, "mean_token_accuracy": 0.8733717203140259, "num_tokens": 15157547.0, "step": 1822 }, { "epoch": 1.385258358662614, "grad_norm": 2.0708088874816895, "learning_rate": 3.0218276719486245e-06, "loss": 0.45973724126815796, "mean_token_accuracy": 0.8369711637496948, "num_tokens": 15164383.0, "step": 1823 }, { "epoch": 1.3860182370820668, "grad_norm": 1.0042283535003662, "learning_rate": 3.019779227044398e-06, "loss": 0.34986358880996704, "mean_token_accuracy": 0.876350998878479, "num_tokens": 15188456.0, "step": 1824 }, { "epoch": 1.3867781155015197, "grad_norm": 2.0959692001342773, "learning_rate": 3.0177304173391038e-06, "loss": 0.4919646978378296, "mean_token_accuracy": 0.8274773359298706, "num_tokens": 15195464.0, "step": 1825 }, { "epoch": 1.3875379939209727, "grad_norm": 2.186288833618164, "learning_rate": 3.015681244270672e-06, "loss": 0.333156555891037, "mean_token_accuracy": 0.899429202079773, "num_tokens": 15200535.0, "step": 1826 }, { "epoch": 1.3882978723404256, "grad_norm": 1.7858690023422241, "learning_rate": 3.0136317092772923e-06, "loss": 0.438323438167572, "mean_token_accuracy": 0.8366807103157043, "num_tokens": 15218834.0, "step": 1827 }, { "epoch": 1.3890577507598785, "grad_norm": 2.220658302307129, "learning_rate": 3.0115818137974066e-06, "loss": 0.3475607633590698, "mean_token_accuracy": 0.8729928135871887, "num_tokens": 15223895.0, "step": 1828 }, { "epoch": 1.3898176291793314, "grad_norm": 1.1260886192321777, "learning_rate": 3.0095315592697126e-06, "loss": 0.3431466221809387, "mean_token_accuracy": 0.860142707824707, "num_tokens": 15244250.0, "step": 1829 }, { "epoch": 1.3905775075987843, "grad_norm": 1.217446208000183, "learning_rate": 3.007480947133155e-06, "loss": 0.3288293182849884, "mean_token_accuracy": 0.8959083557128906, "num_tokens": 15259570.0, "step": 1830 }, { "epoch": 1.391337386018237, "grad_norm": 1.2599458694458008, "learning_rate": 3.0054299788269343e-06, "loss": 0.3832264244556427, "mean_token_accuracy": 0.8614006042480469, "num_tokens": 15275008.0, "step": 1831 }, { "epoch": 1.39209726443769, "grad_norm": 2.2150626182556152, "learning_rate": 3.0033786557904982e-06, "loss": 0.44553741812705994, "mean_token_accuracy": 0.8316951990127563, "num_tokens": 15295288.0, "step": 1832 }, { "epoch": 1.3928571428571428, "grad_norm": 2.0414984226226807, "learning_rate": 3.001326979463545e-06, "loss": 0.364509642124176, "mean_token_accuracy": 0.9008793830871582, "num_tokens": 15301214.0, "step": 1833 }, { "epoch": 1.3936170212765957, "grad_norm": 1.7991291284561157, "learning_rate": 2.9992749512860177e-06, "loss": 0.39313066005706787, "mean_token_accuracy": 0.8565069437026978, "num_tokens": 15309650.0, "step": 1834 }, { "epoch": 1.3943768996960486, "grad_norm": 1.9310777187347412, "learning_rate": 2.9972225726981114e-06, "loss": 0.4557550549507141, "mean_token_accuracy": 0.8550512194633484, "num_tokens": 15318234.0, "step": 1835 }, { "epoch": 1.3951367781155015, "grad_norm": 1.3212847709655762, "learning_rate": 2.995169845140264e-06, "loss": 0.3883739113807678, "mean_token_accuracy": 0.8541952967643738, "num_tokens": 15337261.0, "step": 1836 }, { "epoch": 1.3958966565349544, "grad_norm": 1.2498537302017212, "learning_rate": 2.9931167700531575e-06, "loss": 0.30161699652671814, "mean_token_accuracy": 0.8842223286628723, "num_tokens": 15351679.0, "step": 1837 }, { "epoch": 1.3966565349544073, "grad_norm": 1.9505163431167603, "learning_rate": 2.9910633488777198e-06, "loss": 0.4927945137023926, "mean_token_accuracy": 0.8548860549926758, "num_tokens": 15362150.0, "step": 1838 }, { "epoch": 1.3974164133738602, "grad_norm": 2.329378843307495, "learning_rate": 2.989009583055121e-06, "loss": 0.40655839443206787, "mean_token_accuracy": 0.8434528112411499, "num_tokens": 15369470.0, "step": 1839 }, { "epoch": 1.3981762917933132, "grad_norm": 1.8739807605743408, "learning_rate": 2.9869554740267726e-06, "loss": 0.4090898036956787, "mean_token_accuracy": 0.8572638034820557, "num_tokens": 15378003.0, "step": 1840 }, { "epoch": 1.398936170212766, "grad_norm": 1.9508702754974365, "learning_rate": 2.9849010232343274e-06, "loss": 0.5067851543426514, "mean_token_accuracy": 0.8205057382583618, "num_tokens": 15388561.0, "step": 1841 }, { "epoch": 1.3996960486322187, "grad_norm": 2.0264744758605957, "learning_rate": 2.982846232119679e-06, "loss": 0.5016608834266663, "mean_token_accuracy": 0.8606590628623962, "num_tokens": 15397132.0, "step": 1842 }, { "epoch": 1.4004559270516717, "grad_norm": 1.6243107318878174, "learning_rate": 2.9807911021249573e-06, "loss": 0.3411807715892792, "mean_token_accuracy": 0.890472412109375, "num_tokens": 15408078.0, "step": 1843 }, { "epoch": 1.4012158054711246, "grad_norm": 1.7738771438598633, "learning_rate": 2.9787356346925327e-06, "loss": 0.38648271560668945, "mean_token_accuracy": 0.8597084283828735, "num_tokens": 15418190.0, "step": 1844 }, { "epoch": 1.4019756838905775, "grad_norm": 1.7822409868240356, "learning_rate": 2.9766798312650112e-06, "loss": 0.41192826628685, "mean_token_accuracy": 0.8659167289733887, "num_tokens": 15426242.0, "step": 1845 }, { "epoch": 1.4027355623100304, "grad_norm": 2.4507217407226562, "learning_rate": 2.9746236932852355e-06, "loss": 0.4982181787490845, "mean_token_accuracy": 0.8272942304611206, "num_tokens": 15433261.0, "step": 1846 }, { "epoch": 1.4034954407294833, "grad_norm": 2.3375535011291504, "learning_rate": 2.9725672221962804e-06, "loss": 0.39207589626312256, "mean_token_accuracy": 0.856019139289856, "num_tokens": 15439354.0, "step": 1847 }, { "epoch": 1.4042553191489362, "grad_norm": 1.8062119483947754, "learning_rate": 2.9705104194414587e-06, "loss": 0.29061275720596313, "mean_token_accuracy": 0.9044786691665649, "num_tokens": 15445097.0, "step": 1848 }, { "epoch": 1.405015197568389, "grad_norm": 1.4321762323379517, "learning_rate": 2.9684532864643123e-06, "loss": 0.42219293117523193, "mean_token_accuracy": 0.8495569229125977, "num_tokens": 15458391.0, "step": 1849 }, { "epoch": 1.405775075987842, "grad_norm": 2.064546823501587, "learning_rate": 2.9663958247086165e-06, "loss": 0.3725713789463043, "mean_token_accuracy": 0.8662878274917603, "num_tokens": 15464066.0, "step": 1850 }, { "epoch": 1.4065349544072947, "grad_norm": 1.4412040710449219, "learning_rate": 2.964338035618378e-06, "loss": 0.4512884020805359, "mean_token_accuracy": 0.8348442316055298, "num_tokens": 15479965.0, "step": 1851 }, { "epoch": 1.4072948328267478, "grad_norm": 1.3971494436264038, "learning_rate": 2.9622799206378306e-06, "loss": 0.5281205177307129, "mean_token_accuracy": 0.817984938621521, "num_tokens": 15501815.0, "step": 1852 }, { "epoch": 1.4080547112462005, "grad_norm": 1.50556480884552, "learning_rate": 2.9602214812114414e-06, "loss": 0.4778776168823242, "mean_token_accuracy": 0.829122006893158, "num_tokens": 15515157.0, "step": 1853 }, { "epoch": 1.4088145896656534, "grad_norm": 1.504889726638794, "learning_rate": 2.9581627187838997e-06, "loss": 0.3931974172592163, "mean_token_accuracy": 0.8585784435272217, "num_tokens": 15526324.0, "step": 1854 }, { "epoch": 1.4095744680851063, "grad_norm": 2.069134473800659, "learning_rate": 2.956103634800126e-06, "loss": 0.30524685978889465, "mean_token_accuracy": 0.8911583423614502, "num_tokens": 15532215.0, "step": 1855 }, { "epoch": 1.4103343465045592, "grad_norm": 2.549525499343872, "learning_rate": 2.9540442307052643e-06, "loss": 0.3061476945877075, "mean_token_accuracy": 0.8933674097061157, "num_tokens": 15535826.0, "step": 1856 }, { "epoch": 1.4110942249240122, "grad_norm": 2.0561094284057617, "learning_rate": 2.9519845079446824e-06, "loss": 0.4954822063446045, "mean_token_accuracy": 0.8636343479156494, "num_tokens": 15545732.0, "step": 1857 }, { "epoch": 1.411854103343465, "grad_norm": 2.1085472106933594, "learning_rate": 2.949924467963975e-06, "loss": 0.4520289897918701, "mean_token_accuracy": 0.8424645662307739, "num_tokens": 15553396.0, "step": 1858 }, { "epoch": 1.412613981762918, "grad_norm": 2.583547830581665, "learning_rate": 2.9478641122089563e-06, "loss": 0.29951080679893494, "mean_token_accuracy": 0.8957923650741577, "num_tokens": 15556950.0, "step": 1859 }, { "epoch": 1.4133738601823709, "grad_norm": 1.3569830656051636, "learning_rate": 2.945803442125663e-06, "loss": 0.3536510765552521, "mean_token_accuracy": 0.8722060918807983, "num_tokens": 15569489.0, "step": 1860 }, { "epoch": 1.4141337386018238, "grad_norm": 1.899677038192749, "learning_rate": 2.943742459160354e-06, "loss": 0.526835560798645, "mean_token_accuracy": 0.8511170148849487, "num_tokens": 15578790.0, "step": 1861 }, { "epoch": 1.4148936170212765, "grad_norm": 2.8634719848632812, "learning_rate": 2.9416811647595052e-06, "loss": 0.48145440220832825, "mean_token_accuracy": 0.8436572551727295, "num_tokens": 15583427.0, "step": 1862 }, { "epoch": 1.4156534954407296, "grad_norm": 4.073660373687744, "learning_rate": 2.939619560369813e-06, "loss": 0.5063945055007935, "mean_token_accuracy": 0.8542901277542114, "num_tokens": 15586849.0, "step": 1863 }, { "epoch": 1.4164133738601823, "grad_norm": 1.6887764930725098, "learning_rate": 2.9375576474381907e-06, "loss": 0.32744795083999634, "mean_token_accuracy": 0.8610579967498779, "num_tokens": 15595500.0, "step": 1864 }, { "epoch": 1.4171732522796352, "grad_norm": 1.9122443199157715, "learning_rate": 2.9354954274117683e-06, "loss": 0.3644459843635559, "mean_token_accuracy": 0.866673469543457, "num_tokens": 15601957.0, "step": 1865 }, { "epoch": 1.417933130699088, "grad_norm": 2.753112316131592, "learning_rate": 2.9334329017378898e-06, "loss": 0.38707786798477173, "mean_token_accuracy": 0.8688068389892578, "num_tokens": 15605494.0, "step": 1866 }, { "epoch": 1.418693009118541, "grad_norm": 1.7371296882629395, "learning_rate": 2.9313700718641167e-06, "loss": 0.33663976192474365, "mean_token_accuracy": 0.865202784538269, "num_tokens": 15614920.0, "step": 1867 }, { "epoch": 1.419452887537994, "grad_norm": 2.8042995929718018, "learning_rate": 2.9293069392382224e-06, "loss": 0.44439682364463806, "mean_token_accuracy": 0.8492491245269775, "num_tokens": 15619340.0, "step": 1868 }, { "epoch": 1.4202127659574468, "grad_norm": 1.58261239528656, "learning_rate": 2.927243505308192e-06, "loss": 0.39830970764160156, "mean_token_accuracy": 0.8567156195640564, "num_tokens": 15628486.0, "step": 1869 }, { "epoch": 1.4209726443768997, "grad_norm": 1.3645081520080566, "learning_rate": 2.925179771522223e-06, "loss": 0.343808650970459, "mean_token_accuracy": 0.8631642460823059, "num_tokens": 15642751.0, "step": 1870 }, { "epoch": 1.4217325227963526, "grad_norm": 1.8958383798599243, "learning_rate": 2.9231157393287234e-06, "loss": 0.47441768646240234, "mean_token_accuracy": 0.8252970576286316, "num_tokens": 15655089.0, "step": 1871 }, { "epoch": 1.4224924012158056, "grad_norm": 1.7967755794525146, "learning_rate": 2.9210514101763116e-06, "loss": 0.4615456461906433, "mean_token_accuracy": 0.8358234167098999, "num_tokens": 15665946.0, "step": 1872 }, { "epoch": 1.4232522796352582, "grad_norm": 2.618018627166748, "learning_rate": 2.9189867855138103e-06, "loss": 0.44436317682266235, "mean_token_accuracy": 0.8618506193161011, "num_tokens": 15670172.0, "step": 1873 }, { "epoch": 1.4240121580547114, "grad_norm": 1.7442498207092285, "learning_rate": 2.9169218667902562e-06, "loss": 0.3142424523830414, "mean_token_accuracy": 0.8770287036895752, "num_tokens": 15676637.0, "step": 1874 }, { "epoch": 1.424772036474164, "grad_norm": 1.9286264181137085, "learning_rate": 2.9148566554548857e-06, "loss": 0.3573610186576843, "mean_token_accuracy": 0.8665582537651062, "num_tokens": 15684046.0, "step": 1875 }, { "epoch": 1.425531914893617, "grad_norm": 1.3110337257385254, "learning_rate": 2.912791152957145e-06, "loss": 0.329259991645813, "mean_token_accuracy": 0.8943178057670593, "num_tokens": 15697223.0, "step": 1876 }, { "epoch": 1.4262917933130699, "grad_norm": 2.4158987998962402, "learning_rate": 2.9107253607466833e-06, "loss": 0.32241585850715637, "mean_token_accuracy": 0.8957223892211914, "num_tokens": 15701549.0, "step": 1877 }, { "epoch": 1.4270516717325228, "grad_norm": 1.9663068056106567, "learning_rate": 2.908659280273354e-06, "loss": 0.322094589471817, "mean_token_accuracy": 0.8781799077987671, "num_tokens": 15708071.0, "step": 1878 }, { "epoch": 1.4278115501519757, "grad_norm": 1.9788851737976074, "learning_rate": 2.9065929129872097e-06, "loss": 0.3762337565422058, "mean_token_accuracy": 0.8626716136932373, "num_tokens": 15715366.0, "step": 1879 }, { "epoch": 1.4285714285714286, "grad_norm": 3.9467108249664307, "learning_rate": 2.9045262603385073e-06, "loss": 0.3579118847846985, "mean_token_accuracy": 0.8858233690261841, "num_tokens": 15717845.0, "step": 1880 }, { "epoch": 1.4293313069908815, "grad_norm": 1.714511513710022, "learning_rate": 2.902459323777704e-06, "loss": 0.36425089836120605, "mean_token_accuracy": 0.8702815771102905, "num_tokens": 15725813.0, "step": 1881 }, { "epoch": 1.4300911854103344, "grad_norm": 2.570223331451416, "learning_rate": 2.900392104755455e-06, "loss": 0.5598096251487732, "mean_token_accuracy": 0.8479335308074951, "num_tokens": 15733482.0, "step": 1882 }, { "epoch": 1.4308510638297873, "grad_norm": 1.3402973413467407, "learning_rate": 2.8983246047226137e-06, "loss": 0.3499920964241028, "mean_token_accuracy": 0.8723243474960327, "num_tokens": 15749354.0, "step": 1883 }, { "epoch": 1.43161094224924, "grad_norm": 1.8155218362808228, "learning_rate": 2.8962568251302327e-06, "loss": 0.338306725025177, "mean_token_accuracy": 0.8830790519714355, "num_tokens": 15757418.0, "step": 1884 }, { "epoch": 1.4323708206686931, "grad_norm": 2.2927651405334473, "learning_rate": 2.8941887674295573e-06, "loss": 0.5045862197875977, "mean_token_accuracy": 0.8278255462646484, "num_tokens": 15765086.0, "step": 1885 }, { "epoch": 1.4331306990881458, "grad_norm": 2.3207974433898926, "learning_rate": 2.892120433072031e-06, "loss": 0.2704503536224365, "mean_token_accuracy": 0.8956273794174194, "num_tokens": 15769933.0, "step": 1886 }, { "epoch": 1.4338905775075987, "grad_norm": 2.5109899044036865, "learning_rate": 2.8900518235092908e-06, "loss": 0.2550089657306671, "mean_token_accuracy": 0.9256435632705688, "num_tokens": 15774152.0, "step": 1887 }, { "epoch": 1.4346504559270516, "grad_norm": 1.9047918319702148, "learning_rate": 2.887982940193165e-06, "loss": 0.4147410988807678, "mean_token_accuracy": 0.8526647090911865, "num_tokens": 15781255.0, "step": 1888 }, { "epoch": 1.4354103343465046, "grad_norm": 1.492797613143921, "learning_rate": 2.8859137845756785e-06, "loss": 0.3823188543319702, "mean_token_accuracy": 0.8586028814315796, "num_tokens": 15793922.0, "step": 1889 }, { "epoch": 1.4361702127659575, "grad_norm": 2.5470523834228516, "learning_rate": 2.8838443581090415e-06, "loss": 0.3789905905723572, "mean_token_accuracy": 0.8810428380966187, "num_tokens": 15798268.0, "step": 1890 }, { "epoch": 1.4369300911854104, "grad_norm": 2.785818338394165, "learning_rate": 2.8817746622456585e-06, "loss": 0.44963404536247253, "mean_token_accuracy": 0.841977596282959, "num_tokens": 15802752.0, "step": 1891 }, { "epoch": 1.4376899696048633, "grad_norm": 2.3368546962738037, "learning_rate": 2.879704698438121e-06, "loss": 0.33760276436805725, "mean_token_accuracy": 0.8854647278785706, "num_tokens": 15808045.0, "step": 1892 }, { "epoch": 1.4384498480243162, "grad_norm": 2.0127737522125244, "learning_rate": 2.8776344681392106e-06, "loss": 0.30963221192359924, "mean_token_accuracy": 0.8868530988693237, "num_tokens": 15814195.0, "step": 1893 }, { "epoch": 1.439209726443769, "grad_norm": 3.230771541595459, "learning_rate": 2.875563972801893e-06, "loss": 0.324097603559494, "mean_token_accuracy": 0.8920680284500122, "num_tokens": 15816913.0, "step": 1894 }, { "epoch": 1.4399696048632218, "grad_norm": 1.4154688119888306, "learning_rate": 2.8734932138793226e-06, "loss": 0.3350130617618561, "mean_token_accuracy": 0.8855552673339844, "num_tokens": 15829343.0, "step": 1895 }, { "epoch": 1.4407294832826747, "grad_norm": 2.111506938934326, "learning_rate": 2.871422192824837e-06, "loss": 0.41803890466690063, "mean_token_accuracy": 0.8492453098297119, "num_tokens": 15835531.0, "step": 1896 }, { "epoch": 1.4414893617021276, "grad_norm": 1.582686424255371, "learning_rate": 2.8693509110919597e-06, "loss": 0.4903678894042969, "mean_token_accuracy": 0.8195500373840332, "num_tokens": 15849740.0, "step": 1897 }, { "epoch": 1.4422492401215805, "grad_norm": 3.802279233932495, "learning_rate": 2.867279370134395e-06, "loss": 0.5214452743530273, "mean_token_accuracy": 0.818822979927063, "num_tokens": 15853137.0, "step": 1898 }, { "epoch": 1.4430091185410334, "grad_norm": 1.3670995235443115, "learning_rate": 2.8652075714060296e-06, "loss": 0.4218589961528778, "mean_token_accuracy": 0.8668398857116699, "num_tokens": 15870212.0, "step": 1899 }, { "epoch": 1.4437689969604863, "grad_norm": 2.2895567417144775, "learning_rate": 2.863135516360932e-06, "loss": 0.37892046570777893, "mean_token_accuracy": 0.8825801610946655, "num_tokens": 15875666.0, "step": 1900 }, { "epoch": 1.4445288753799392, "grad_norm": 2.0398786067962646, "learning_rate": 2.8610632064533517e-06, "loss": 0.4677078425884247, "mean_token_accuracy": 0.8766300678253174, "num_tokens": 15886718.0, "step": 1901 }, { "epoch": 1.4452887537993921, "grad_norm": 2.618565559387207, "learning_rate": 2.8589906431377133e-06, "loss": 0.4118839204311371, "mean_token_accuracy": 0.8575683832168579, "num_tokens": 15891464.0, "step": 1902 }, { "epoch": 1.446048632218845, "grad_norm": 1.0620054006576538, "learning_rate": 2.8569178278686222e-06, "loss": 0.3860771059989929, "mean_token_accuracy": 0.8628644347190857, "num_tokens": 15914150.0, "step": 1903 }, { "epoch": 1.4468085106382977, "grad_norm": 1.4800416231155396, "learning_rate": 2.8548447621008614e-06, "loss": 0.4064670205116272, "mean_token_accuracy": 0.8496098518371582, "num_tokens": 15927895.0, "step": 1904 }, { "epoch": 1.4475683890577509, "grad_norm": 2.1188619136810303, "learning_rate": 2.8527714472893866e-06, "loss": 0.408307284116745, "mean_token_accuracy": 0.8925206065177917, "num_tokens": 15934208.0, "step": 1905 }, { "epoch": 1.4483282674772036, "grad_norm": 1.1466377973556519, "learning_rate": 2.85069788488933e-06, "loss": 0.35651272535324097, "mean_token_accuracy": 0.8710312843322754, "num_tokens": 15953347.0, "step": 1906 }, { "epoch": 1.4490881458966565, "grad_norm": 2.1419358253479004, "learning_rate": 2.8486240763559984e-06, "loss": 0.3315538167953491, "mean_token_accuracy": 0.8849161863327026, "num_tokens": 15959607.0, "step": 1907 }, { "epoch": 1.4498480243161094, "grad_norm": 2.5072624683380127, "learning_rate": 2.8465500231448707e-06, "loss": 0.45812320709228516, "mean_token_accuracy": 0.8458243608474731, "num_tokens": 15965924.0, "step": 1908 }, { "epoch": 1.4506079027355623, "grad_norm": 2.3009610176086426, "learning_rate": 2.844475726711595e-06, "loss": 0.39409273862838745, "mean_token_accuracy": 0.8610217571258545, "num_tokens": 15970713.0, "step": 1909 }, { "epoch": 1.4513677811550152, "grad_norm": 2.4585795402526855, "learning_rate": 2.8424011885119956e-06, "loss": 0.4896971583366394, "mean_token_accuracy": 0.8256454467773438, "num_tokens": 15981217.0, "step": 1910 }, { "epoch": 1.452127659574468, "grad_norm": 1.3017839193344116, "learning_rate": 2.8403264100020613e-06, "loss": 0.453654408454895, "mean_token_accuracy": 0.8386960029602051, "num_tokens": 15997472.0, "step": 1911 }, { "epoch": 1.452887537993921, "grad_norm": 1.647536039352417, "learning_rate": 2.8382513926379508e-06, "loss": 0.40560781955718994, "mean_token_accuracy": 0.8591917753219604, "num_tokens": 16007709.0, "step": 1912 }, { "epoch": 1.453647416413374, "grad_norm": 1.7488670349121094, "learning_rate": 2.836176137875993e-06, "loss": 0.40125876665115356, "mean_token_accuracy": 0.8709259033203125, "num_tokens": 16018132.0, "step": 1913 }, { "epoch": 1.4544072948328268, "grad_norm": 1.5565074682235718, "learning_rate": 2.8341006471726817e-06, "loss": 0.4727994203567505, "mean_token_accuracy": 0.8342990875244141, "num_tokens": 16033168.0, "step": 1914 }, { "epoch": 1.4551671732522795, "grad_norm": 3.4736406803131104, "learning_rate": 2.832024921984674e-06, "loss": 0.3157728314399719, "mean_token_accuracy": 0.8842766284942627, "num_tokens": 16036722.0, "step": 1915 }, { "epoch": 1.4559270516717326, "grad_norm": 2.395650863647461, "learning_rate": 2.8299489637687955e-06, "loss": 0.4264276623725891, "mean_token_accuracy": 0.8467907905578613, "num_tokens": 16042989.0, "step": 1916 }, { "epoch": 1.4566869300911853, "grad_norm": 1.7940794229507446, "learning_rate": 2.8278727739820334e-06, "loss": 0.3641016483306885, "mean_token_accuracy": 0.8568278551101685, "num_tokens": 16051323.0, "step": 1917 }, { "epoch": 1.4574468085106382, "grad_norm": 1.6434123516082764, "learning_rate": 2.825796354081537e-06, "loss": 0.529184103012085, "mean_token_accuracy": 0.8344453573226929, "num_tokens": 16063281.0, "step": 1918 }, { "epoch": 1.4582066869300911, "grad_norm": 2.559065580368042, "learning_rate": 2.8237197055246175e-06, "loss": 0.22185541689395905, "mean_token_accuracy": 0.9208902716636658, "num_tokens": 16066978.0, "step": 1919 }, { "epoch": 1.458966565349544, "grad_norm": 2.988894462585449, "learning_rate": 2.821642829768748e-06, "loss": 0.4099072813987732, "mean_token_accuracy": 0.8567482829093933, "num_tokens": 16071109.0, "step": 1920 }, { "epoch": 1.459726443768997, "grad_norm": 1.7724629640579224, "learning_rate": 2.8195657282715595e-06, "loss": 0.4945113956928253, "mean_token_accuracy": 0.836738109588623, "num_tokens": 16081897.0, "step": 1921 }, { "epoch": 1.4604863221884499, "grad_norm": 2.0314419269561768, "learning_rate": 2.817488402490841e-06, "loss": 0.44073134660720825, "mean_token_accuracy": 0.8651165962219238, "num_tokens": 16088350.0, "step": 1922 }, { "epoch": 1.4612462006079028, "grad_norm": 2.509838819503784, "learning_rate": 2.8154108538845405e-06, "loss": 0.39285334944725037, "mean_token_accuracy": 0.8604426383972168, "num_tokens": 16093244.0, "step": 1923 }, { "epoch": 1.4620060790273557, "grad_norm": 2.098116636276245, "learning_rate": 2.813333083910761e-06, "loss": 0.4771330952644348, "mean_token_accuracy": 0.8538521528244019, "num_tokens": 16100235.0, "step": 1924 }, { "epoch": 1.4627659574468086, "grad_norm": 2.2465813159942627, "learning_rate": 2.8112550940277615e-06, "loss": 0.5117051601409912, "mean_token_accuracy": 0.8564461469650269, "num_tokens": 16109806.0, "step": 1925 }, { "epoch": 1.4635258358662613, "grad_norm": 1.8483238220214844, "learning_rate": 2.809176885693956e-06, "loss": 0.43735378980636597, "mean_token_accuracy": 0.842054009437561, "num_tokens": 16119316.0, "step": 1926 }, { "epoch": 1.4642857142857144, "grad_norm": 1.8036853075027466, "learning_rate": 2.807098460367911e-06, "loss": 0.3516720235347748, "mean_token_accuracy": 0.8832477331161499, "num_tokens": 16126608.0, "step": 1927 }, { "epoch": 1.465045592705167, "grad_norm": 2.5542640686035156, "learning_rate": 2.8050198195083445e-06, "loss": 0.3301466703414917, "mean_token_accuracy": 0.8843127489089966, "num_tokens": 16130773.0, "step": 1928 }, { "epoch": 1.46580547112462, "grad_norm": 1.845220685005188, "learning_rate": 2.802940964574127e-06, "loss": 0.39519134163856506, "mean_token_accuracy": 0.8583899140357971, "num_tokens": 16138435.0, "step": 1929 }, { "epoch": 1.466565349544073, "grad_norm": 2.773728609085083, "learning_rate": 2.800861897024279e-06, "loss": 0.38028937578201294, "mean_token_accuracy": 0.8659144639968872, "num_tokens": 16142567.0, "step": 1930 }, { "epoch": 1.4673252279635258, "grad_norm": 1.8423194885253906, "learning_rate": 2.798782618317971e-06, "loss": 0.358018696308136, "mean_token_accuracy": 0.8637189865112305, "num_tokens": 16150186.0, "step": 1931 }, { "epoch": 1.4680851063829787, "grad_norm": 1.7041927576065063, "learning_rate": 2.796703129914519e-06, "loss": 0.4962831139564514, "mean_token_accuracy": 0.8265417814254761, "num_tokens": 16161565.0, "step": 1932 }, { "epoch": 1.4688449848024316, "grad_norm": 2.301258087158203, "learning_rate": 2.79462343327339e-06, "loss": 0.3374050259590149, "mean_token_accuracy": 0.8793303966522217, "num_tokens": 16166339.0, "step": 1933 }, { "epoch": 1.4696048632218845, "grad_norm": 1.748000144958496, "learning_rate": 2.7925435298541944e-06, "loss": 0.33927470445632935, "mean_token_accuracy": 0.866838812828064, "num_tokens": 16176455.0, "step": 1934 }, { "epoch": 1.4703647416413375, "grad_norm": 3.1367924213409424, "learning_rate": 2.7904634211166877e-06, "loss": 0.41375264525413513, "mean_token_accuracy": 0.8599265813827515, "num_tokens": 16179911.0, "step": 1935 }, { "epoch": 1.4711246200607904, "grad_norm": 1.6053460836410522, "learning_rate": 2.7883831085207707e-06, "loss": 0.4377453327178955, "mean_token_accuracy": 0.8478385806083679, "num_tokens": 16189949.0, "step": 1936 }, { "epoch": 1.471884498480243, "grad_norm": 1.9328675270080566, "learning_rate": 2.7863025935264876e-06, "loss": 0.38693612813949585, "mean_token_accuracy": 0.8666559457778931, "num_tokens": 16198597.0, "step": 1937 }, { "epoch": 1.4726443768996962, "grad_norm": 1.2157680988311768, "learning_rate": 2.784221877594024e-06, "loss": 0.26671192049980164, "mean_token_accuracy": 0.9015446901321411, "num_tokens": 16211167.0, "step": 1938 }, { "epoch": 1.4734042553191489, "grad_norm": 1.8417600393295288, "learning_rate": 2.7821409621837042e-06, "loss": 0.41459742188453674, "mean_token_accuracy": 0.8567095398902893, "num_tokens": 16219849.0, "step": 1939 }, { "epoch": 1.4741641337386018, "grad_norm": 1.286196231842041, "learning_rate": 2.7800598487559976e-06, "loss": 0.3595050871372223, "mean_token_accuracy": 0.8591323494911194, "num_tokens": 16234188.0, "step": 1940 }, { "epoch": 1.4749240121580547, "grad_norm": 2.4813997745513916, "learning_rate": 2.777978538771508e-06, "loss": 0.36460646986961365, "mean_token_accuracy": 0.8691693544387817, "num_tokens": 16238916.0, "step": 1941 }, { "epoch": 1.4756838905775076, "grad_norm": 1.79752779006958, "learning_rate": 2.7758970336909795e-06, "loss": 0.30292239785194397, "mean_token_accuracy": 0.8843097686767578, "num_tokens": 16245841.0, "step": 1942 }, { "epoch": 1.4764437689969605, "grad_norm": 3.395206928253174, "learning_rate": 2.7738153349752923e-06, "loss": 0.46020257472991943, "mean_token_accuracy": 0.8431230783462524, "num_tokens": 16249518.0, "step": 1943 }, { "epoch": 1.4772036474164134, "grad_norm": 1.3478224277496338, "learning_rate": 2.7717334440854634e-06, "loss": 0.3035249710083008, "mean_token_accuracy": 0.9098293781280518, "num_tokens": 16263867.0, "step": 1944 }, { "epoch": 1.4779635258358663, "grad_norm": 1.9379981756210327, "learning_rate": 2.7696513624826422e-06, "loss": 0.37313729524612427, "mean_token_accuracy": 0.8760120868682861, "num_tokens": 16270788.0, "step": 1945 }, { "epoch": 1.4787234042553192, "grad_norm": 1.0166492462158203, "learning_rate": 2.7675690916281158e-06, "loss": 0.3120017349720001, "mean_token_accuracy": 0.8847176432609558, "num_tokens": 16292365.0, "step": 1946 }, { "epoch": 1.4794832826747721, "grad_norm": 1.5873897075653076, "learning_rate": 2.7654866329833e-06, "loss": 0.44397222995758057, "mean_token_accuracy": 0.8387264013290405, "num_tokens": 16303479.0, "step": 1947 }, { "epoch": 1.4802431610942248, "grad_norm": 1.8133329153060913, "learning_rate": 2.763403988009746e-06, "loss": 0.34557533264160156, "mean_token_accuracy": 0.8739964962005615, "num_tokens": 16311121.0, "step": 1948 }, { "epoch": 1.4810030395136777, "grad_norm": 1.2594037055969238, "learning_rate": 2.761321158169134e-06, "loss": 0.30497661232948303, "mean_token_accuracy": 0.894805908203125, "num_tokens": 16324552.0, "step": 1949 }, { "epoch": 1.4817629179331306, "grad_norm": 1.2903532981872559, "learning_rate": 2.759238144923274e-06, "loss": 0.4864066243171692, "mean_token_accuracy": 0.8282386064529419, "num_tokens": 16346950.0, "step": 1950 }, { "epoch": 1.4825227963525835, "grad_norm": 1.625597596168518, "learning_rate": 2.7571549497341044e-06, "loss": 0.3709399104118347, "mean_token_accuracy": 0.8715173006057739, "num_tokens": 16356114.0, "step": 1951 }, { "epoch": 1.4832826747720365, "grad_norm": 2.446568489074707, "learning_rate": 2.755071574063692e-06, "loss": 0.42299988865852356, "mean_token_accuracy": 0.8574379682540894, "num_tokens": 16361599.0, "step": 1952 }, { "epoch": 1.4840425531914894, "grad_norm": 1.3055740594863892, "learning_rate": 2.7529880193742297e-06, "loss": 0.3423617482185364, "mean_token_accuracy": 0.8742436766624451, "num_tokens": 16378558.0, "step": 1953 }, { "epoch": 1.4848024316109423, "grad_norm": 1.1686105728149414, "learning_rate": 2.7509042871280373e-06, "loss": 0.3703901767730713, "mean_token_accuracy": 0.8907407522201538, "num_tokens": 16397037.0, "step": 1954 }, { "epoch": 1.4855623100303952, "grad_norm": 1.523542881011963, "learning_rate": 2.748820378787558e-06, "loss": 0.4682610034942627, "mean_token_accuracy": 0.8278110027313232, "num_tokens": 16407254.0, "step": 1955 }, { "epoch": 1.486322188449848, "grad_norm": 2.0760133266448975, "learning_rate": 2.7467362958153585e-06, "loss": 0.345708429813385, "mean_token_accuracy": 0.8976961374282837, "num_tokens": 16412915.0, "step": 1956 }, { "epoch": 1.4870820668693008, "grad_norm": 2.341548442840576, "learning_rate": 2.7446520396741293e-06, "loss": 0.25041747093200684, "mean_token_accuracy": 0.9052972793579102, "num_tokens": 16416750.0, "step": 1957 }, { "epoch": 1.487841945288754, "grad_norm": 1.6976364850997925, "learning_rate": 2.742567611826681e-06, "loss": 0.5091716051101685, "mean_token_accuracy": 0.8378287553787231, "num_tokens": 16429385.0, "step": 1958 }, { "epoch": 1.4886018237082066, "grad_norm": 2.6599128246307373, "learning_rate": 2.7404830137359445e-06, "loss": 0.29068994522094727, "mean_token_accuracy": 0.8973856568336487, "num_tokens": 16433274.0, "step": 1959 }, { "epoch": 1.4893617021276595, "grad_norm": 1.741882562637329, "learning_rate": 2.7383982468649715e-06, "loss": 0.2991126775741577, "mean_token_accuracy": 0.889869213104248, "num_tokens": 16441176.0, "step": 1960 }, { "epoch": 1.4901215805471124, "grad_norm": 1.852423071861267, "learning_rate": 2.7363133126769326e-06, "loss": 0.4106128513813019, "mean_token_accuracy": 0.874710202217102, "num_tokens": 16450511.0, "step": 1961 }, { "epoch": 1.4908814589665653, "grad_norm": 1.583237648010254, "learning_rate": 2.7342282126351145e-06, "loss": 0.41411587595939636, "mean_token_accuracy": 0.8860340714454651, "num_tokens": 16463294.0, "step": 1962 }, { "epoch": 1.4916413373860182, "grad_norm": 1.888327956199646, "learning_rate": 2.73214294820292e-06, "loss": 0.3756563663482666, "mean_token_accuracy": 0.8713725805282593, "num_tokens": 16470764.0, "step": 1963 }, { "epoch": 1.4924012158054711, "grad_norm": 1.119258165359497, "learning_rate": 2.7300575208438684e-06, "loss": 0.30670613050460815, "mean_token_accuracy": 0.879364013671875, "num_tokens": 16487861.0, "step": 1964 }, { "epoch": 1.493161094224924, "grad_norm": 2.9257161617279053, "learning_rate": 2.7279719320215924e-06, "loss": 0.44703471660614014, "mean_token_accuracy": 0.861371636390686, "num_tokens": 16491887.0, "step": 1965 }, { "epoch": 1.493920972644377, "grad_norm": 3.007188081741333, "learning_rate": 2.725886183199839e-06, "loss": 0.33418571949005127, "mean_token_accuracy": 0.878244936466217, "num_tokens": 16496098.0, "step": 1966 }, { "epoch": 1.4946808510638299, "grad_norm": 2.144409418106079, "learning_rate": 2.723800275842468e-06, "loss": 0.3434603810310364, "mean_token_accuracy": 0.8721754550933838, "num_tokens": 16501182.0, "step": 1967 }, { "epoch": 1.4954407294832825, "grad_norm": 2.13578724861145, "learning_rate": 2.7217142114134466e-06, "loss": 0.4266355633735657, "mean_token_accuracy": 0.8814791440963745, "num_tokens": 16507485.0, "step": 1968 }, { "epoch": 1.4962006079027357, "grad_norm": 2.9423434734344482, "learning_rate": 2.7196279913768587e-06, "loss": 0.4060331881046295, "mean_token_accuracy": 0.8787740468978882, "num_tokens": 16511426.0, "step": 1969 }, { "epoch": 1.4969604863221884, "grad_norm": 3.289017915725708, "learning_rate": 2.717541617196891e-06, "loss": 0.33741700649261475, "mean_token_accuracy": 0.8868415951728821, "num_tokens": 16514408.0, "step": 1970 }, { "epoch": 1.4977203647416413, "grad_norm": 1.8096510171890259, "learning_rate": 2.7154550903378425e-06, "loss": 0.3549671769142151, "mean_token_accuracy": 0.8869922161102295, "num_tokens": 16521381.0, "step": 1971 }, { "epoch": 1.4984802431610942, "grad_norm": 1.2091960906982422, "learning_rate": 2.713368412264118e-06, "loss": 0.34742820262908936, "mean_token_accuracy": 0.8682405352592468, "num_tokens": 16537294.0, "step": 1972 }, { "epoch": 1.499240121580547, "grad_norm": 2.287986993789673, "learning_rate": 2.711281584440228e-06, "loss": 0.3794403672218323, "mean_token_accuracy": 0.8575073480606079, "num_tokens": 16543464.0, "step": 1973 }, { "epoch": 1.5, "grad_norm": 2.6913583278656006, "learning_rate": 2.70919460833079e-06, "loss": 0.36028215289115906, "mean_token_accuracy": 0.8803811073303223, "num_tokens": 16547520.0, "step": 1974 }, { "epoch": 1.500759878419453, "grad_norm": 2.9595229625701904, "learning_rate": 2.7071074854005206e-06, "loss": 0.3840835988521576, "mean_token_accuracy": 0.8786554336547852, "num_tokens": 16551315.0, "step": 1975 }, { "epoch": 1.5015197568389058, "grad_norm": 2.2881007194519043, "learning_rate": 2.705020217114248e-06, "loss": 0.5320546627044678, "mean_token_accuracy": 0.8157056570053101, "num_tokens": 16559170.0, "step": 1976 }, { "epoch": 1.5022796352583585, "grad_norm": 1.665455937385559, "learning_rate": 2.7029328049368942e-06, "loss": 0.4646413028240204, "mean_token_accuracy": 0.8227518796920776, "num_tokens": 16574934.0, "step": 1977 }, { "epoch": 1.5030395136778116, "grad_norm": 1.3689357042312622, "learning_rate": 2.700845250333486e-06, "loss": 0.4389103651046753, "mean_token_accuracy": 0.8416414260864258, "num_tokens": 16592144.0, "step": 1978 }, { "epoch": 1.5037993920972643, "grad_norm": 3.0704047679901123, "learning_rate": 2.69875755476915e-06, "loss": 0.4418138265609741, "mean_token_accuracy": 0.8455275297164917, "num_tokens": 16595924.0, "step": 1979 }, { "epoch": 1.5045592705167175, "grad_norm": 1.0455708503723145, "learning_rate": 2.696669719709111e-06, "loss": 0.3289058804512024, "mean_token_accuracy": 0.8618285655975342, "num_tokens": 16613609.0, "step": 1980 }, { "epoch": 1.5053191489361701, "grad_norm": 2.383582830429077, "learning_rate": 2.694581746618691e-06, "loss": 0.38633477687835693, "mean_token_accuracy": 0.8829542398452759, "num_tokens": 16618808.0, "step": 1981 }, { "epoch": 1.506079027355623, "grad_norm": 1.9131858348846436, "learning_rate": 2.6924936369633126e-06, "loss": 0.49194538593292236, "mean_token_accuracy": 0.8105896711349487, "num_tokens": 16626996.0, "step": 1982 }, { "epoch": 1.506838905775076, "grad_norm": 2.7373666763305664, "learning_rate": 2.6904053922084893e-06, "loss": 0.35638999938964844, "mean_token_accuracy": 0.872461199760437, "num_tokens": 16631870.0, "step": 1983 }, { "epoch": 1.5075987841945289, "grad_norm": 1.8863484859466553, "learning_rate": 2.688317013819832e-06, "loss": 0.4183836579322815, "mean_token_accuracy": 0.8601176738739014, "num_tokens": 16639556.0, "step": 1984 }, { "epoch": 1.5083586626139818, "grad_norm": 1.6627907752990723, "learning_rate": 2.686228503263045e-06, "loss": 0.3258771002292633, "mean_token_accuracy": 0.9052190780639648, "num_tokens": 16646993.0, "step": 1985 }, { "epoch": 1.5091185410334347, "grad_norm": 1.9558030366897583, "learning_rate": 2.684139862003927e-06, "loss": 0.3531185984611511, "mean_token_accuracy": 0.8705329298973083, "num_tokens": 16654185.0, "step": 1986 }, { "epoch": 1.5098784194528876, "grad_norm": 1.874787449836731, "learning_rate": 2.682051091508365e-06, "loss": 0.44819605350494385, "mean_token_accuracy": 0.8374806642532349, "num_tokens": 16663099.0, "step": 1987 }, { "epoch": 1.5106382978723403, "grad_norm": 2.3010215759277344, "learning_rate": 2.679962193242338e-06, "loss": 0.5586352944374084, "mean_token_accuracy": 0.8052853345870972, "num_tokens": 16670041.0, "step": 1988 }, { "epoch": 1.5113981762917934, "grad_norm": 2.7647526264190674, "learning_rate": 2.6778731686719177e-06, "loss": 0.4261942505836487, "mean_token_accuracy": 0.8694008588790894, "num_tokens": 16675951.0, "step": 1989 }, { "epoch": 1.512158054711246, "grad_norm": 2.916961193084717, "learning_rate": 2.67578401926326e-06, "loss": 0.3339785039424896, "mean_token_accuracy": 0.8789801597595215, "num_tokens": 16679304.0, "step": 1990 }, { "epoch": 1.5129179331306992, "grad_norm": 2.0234289169311523, "learning_rate": 2.6736947464826107e-06, "loss": 0.2022038698196411, "mean_token_accuracy": 0.9253173470497131, "num_tokens": 16684163.0, "step": 1991 }, { "epoch": 1.513677811550152, "grad_norm": 1.172646403312683, "learning_rate": 2.671605351796302e-06, "loss": 0.3492104411125183, "mean_token_accuracy": 0.8837358951568604, "num_tokens": 16701669.0, "step": 1992 }, { "epoch": 1.5144376899696048, "grad_norm": 2.756132125854492, "learning_rate": 2.6695158366707526e-06, "loss": 0.23893260955810547, "mean_token_accuracy": 0.9052976369857788, "num_tokens": 16705352.0, "step": 1993 }, { "epoch": 1.5151975683890577, "grad_norm": 2.3800320625305176, "learning_rate": 2.667426202572463e-06, "loss": 0.4487471580505371, "mean_token_accuracy": 0.8390192985534668, "num_tokens": 16711602.0, "step": 1994 }, { "epoch": 1.5159574468085106, "grad_norm": 1.13377046585083, "learning_rate": 2.665336450968019e-06, "loss": 0.3422296941280365, "mean_token_accuracy": 0.8793874979019165, "num_tokens": 16728915.0, "step": 1995 }, { "epoch": 1.5167173252279635, "grad_norm": 2.0686185359954834, "learning_rate": 2.6632465833240895e-06, "loss": 0.4688038229942322, "mean_token_accuracy": 0.8314379453659058, "num_tokens": 16736942.0, "step": 1996 }, { "epoch": 1.5174772036474165, "grad_norm": 1.8590772151947021, "learning_rate": 2.661156601107424e-06, "loss": 0.4437877833843231, "mean_token_accuracy": 0.8946475982666016, "num_tokens": 16746545.0, "step": 1997 }, { "epoch": 1.5182370820668694, "grad_norm": 2.7952499389648438, "learning_rate": 2.659066505784852e-06, "loss": 0.40627649426460266, "mean_token_accuracy": 0.8735514283180237, "num_tokens": 16751044.0, "step": 1998 }, { "epoch": 1.518996960486322, "grad_norm": 1.9548351764678955, "learning_rate": 2.6569762988232838e-06, "loss": 0.44635626673698425, "mean_token_accuracy": 0.8447756767272949, "num_tokens": 16760141.0, "step": 1999 }, { "epoch": 1.5197568389057752, "grad_norm": 2.000869035720825, "learning_rate": 2.654885981689706e-06, "loss": 0.4197946786880493, "mean_token_accuracy": 0.8619030714035034, "num_tokens": 16768881.0, "step": 2000 } ], "logging_steps": 1.0, "max_steps": 3948, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.8278978880222003e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }