| { |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 2.0, |
| "eval_steps": 500, |
| "global_step": 456, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.0043859649122807015, |
| "grad_norm": 17.878957220715776, |
| "learning_rate": 2.173913043478261e-07, |
| "loss": 2.0387, |
| "mean_token_accuracy": 0.6579925417900085, |
| "step": 1 |
| }, |
| { |
| "epoch": 0.008771929824561403, |
| "grad_norm": 17.78486726138721, |
| "learning_rate": 4.347826086956522e-07, |
| "loss": 1.9416, |
| "mean_token_accuracy": 0.6654135584831238, |
| "step": 2 |
| }, |
| { |
| "epoch": 0.013157894736842105, |
| "grad_norm": 17.13164166662652, |
| "learning_rate": 6.521739130434783e-07, |
| "loss": 2.0399, |
| "mean_token_accuracy": 0.6561939120292664, |
| "step": 3 |
| }, |
| { |
| "epoch": 0.017543859649122806, |
| "grad_norm": 18.209458341936777, |
| "learning_rate": 8.695652173913044e-07, |
| "loss": 1.9996, |
| "mean_token_accuracy": 0.6516432166099548, |
| "step": 4 |
| }, |
| { |
| "epoch": 0.021929824561403508, |
| "grad_norm": 18.703331955593068, |
| "learning_rate": 1.0869565217391306e-06, |
| "loss": 1.9857, |
| "mean_token_accuracy": 0.6365422606468201, |
| "step": 5 |
| }, |
| { |
| "epoch": 0.02631578947368421, |
| "grad_norm": 17.001106678403392, |
| "learning_rate": 1.3043478260869566e-06, |
| "loss": 1.9133, |
| "mean_token_accuracy": 0.667553186416626, |
| "step": 6 |
| }, |
| { |
| "epoch": 0.03070175438596491, |
| "grad_norm": 17.916495118145253, |
| "learning_rate": 1.521739130434783e-06, |
| "loss": 1.9784, |
| "mean_token_accuracy": 0.6571428775787354, |
| "step": 7 |
| }, |
| { |
| "epoch": 0.03508771929824561, |
| "grad_norm": 17.07134015461412, |
| "learning_rate": 1.7391304347826088e-06, |
| "loss": 1.8324, |
| "mean_token_accuracy": 0.6795937418937683, |
| "step": 8 |
| }, |
| { |
| "epoch": 0.039473684210526314, |
| "grad_norm": 16.863789707771556, |
| "learning_rate": 1.956521739130435e-06, |
| "loss": 1.8859, |
| "mean_token_accuracy": 0.6639639735221863, |
| "step": 9 |
| }, |
| { |
| "epoch": 0.043859649122807015, |
| "grad_norm": 16.459181504788187, |
| "learning_rate": 2.173913043478261e-06, |
| "loss": 1.6146, |
| "mean_token_accuracy": 0.6719184517860413, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.04824561403508772, |
| "grad_norm": 14.215585551374044, |
| "learning_rate": 2.391304347826087e-06, |
| "loss": 1.4177, |
| "mean_token_accuracy": 0.7108108401298523, |
| "step": 11 |
| }, |
| { |
| "epoch": 0.05263157894736842, |
| "grad_norm": 12.682396225360572, |
| "learning_rate": 2.6086956521739132e-06, |
| "loss": 1.43, |
| "mean_token_accuracy": 0.6857386827468872, |
| "step": 12 |
| }, |
| { |
| "epoch": 0.05701754385964912, |
| "grad_norm": 13.508325699297039, |
| "learning_rate": 2.8260869565217393e-06, |
| "loss": 1.4064, |
| "mean_token_accuracy": 0.6750240921974182, |
| "step": 13 |
| }, |
| { |
| "epoch": 0.06140350877192982, |
| "grad_norm": 9.935998030626173, |
| "learning_rate": 3.043478260869566e-06, |
| "loss": 1.0087, |
| "mean_token_accuracy": 0.7492850422859192, |
| "step": 14 |
| }, |
| { |
| "epoch": 0.06578947368421052, |
| "grad_norm": 8.536092339229146, |
| "learning_rate": 3.2608695652173914e-06, |
| "loss": 0.9524, |
| "mean_token_accuracy": 0.7992565035820007, |
| "step": 15 |
| }, |
| { |
| "epoch": 0.07017543859649122, |
| "grad_norm": 7.900358663532035, |
| "learning_rate": 3.4782608695652175e-06, |
| "loss": 0.8887, |
| "mean_token_accuracy": 0.8127272725105286, |
| "step": 16 |
| }, |
| { |
| "epoch": 0.07456140350877193, |
| "grad_norm": 7.421526688992339, |
| "learning_rate": 3.6956521739130436e-06, |
| "loss": 0.7852, |
| "mean_token_accuracy": 0.8202459812164307, |
| "step": 17 |
| }, |
| { |
| "epoch": 0.07894736842105263, |
| "grad_norm": 6.19323362471202, |
| "learning_rate": 3.91304347826087e-06, |
| "loss": 0.7339, |
| "mean_token_accuracy": 0.8502627015113831, |
| "step": 18 |
| }, |
| { |
| "epoch": 0.08333333333333333, |
| "grad_norm": 5.59844982489824, |
| "learning_rate": 4.130434782608696e-06, |
| "loss": 0.5627, |
| "mean_token_accuracy": 0.885026752948761, |
| "step": 19 |
| }, |
| { |
| "epoch": 0.08771929824561403, |
| "grad_norm": 5.45658138679211, |
| "learning_rate": 4.347826086956522e-06, |
| "loss": 0.4144, |
| "mean_token_accuracy": 0.9038642644882202, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.09210526315789473, |
| "grad_norm": 5.253428579590723, |
| "learning_rate": 4.565217391304348e-06, |
| "loss": 0.327, |
| "mean_token_accuracy": 0.9230769276618958, |
| "step": 21 |
| }, |
| { |
| "epoch": 0.09649122807017543, |
| "grad_norm": 4.989829903200284, |
| "learning_rate": 4.782608695652174e-06, |
| "loss": 0.2937, |
| "mean_token_accuracy": 0.9300885200500488, |
| "step": 22 |
| }, |
| { |
| "epoch": 0.10087719298245613, |
| "grad_norm": 3.4550261000622537, |
| "learning_rate": 5e-06, |
| "loss": 0.2323, |
| "mean_token_accuracy": 0.9402319192886353, |
| "step": 23 |
| }, |
| { |
| "epoch": 0.10526315789473684, |
| "grad_norm": 4.1099270545591, |
| "learning_rate": 5.2173913043478265e-06, |
| "loss": 0.1775, |
| "mean_token_accuracy": 0.9578651785850525, |
| "step": 24 |
| }, |
| { |
| "epoch": 0.10964912280701754, |
| "grad_norm": 3.699625690166745, |
| "learning_rate": 5.4347826086956525e-06, |
| "loss": 0.1628, |
| "mean_token_accuracy": 0.9545023441314697, |
| "step": 25 |
| }, |
| { |
| "epoch": 0.11403508771929824, |
| "grad_norm": 5.205866567059522, |
| "learning_rate": 5.652173913043479e-06, |
| "loss": 0.1228, |
| "mean_token_accuracy": 0.9606372714042664, |
| "step": 26 |
| }, |
| { |
| "epoch": 0.11842105263157894, |
| "grad_norm": 4.900178286181551, |
| "learning_rate": 5.8695652173913055e-06, |
| "loss": 0.209, |
| "mean_token_accuracy": 0.944903552532196, |
| "step": 27 |
| }, |
| { |
| "epoch": 0.12280701754385964, |
| "grad_norm": 4.105692760093034, |
| "learning_rate": 6.086956521739132e-06, |
| "loss": 0.1665, |
| "mean_token_accuracy": 0.955974817276001, |
| "step": 28 |
| }, |
| { |
| "epoch": 0.12719298245614036, |
| "grad_norm": 3.1141993454804733, |
| "learning_rate": 6.304347826086958e-06, |
| "loss": 0.1197, |
| "mean_token_accuracy": 0.9584513902664185, |
| "step": 29 |
| }, |
| { |
| "epoch": 0.13157894736842105, |
| "grad_norm": 5.2875691960660705, |
| "learning_rate": 6.521739130434783e-06, |
| "loss": 0.1841, |
| "mean_token_accuracy": 0.9461756348609924, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.13596491228070176, |
| "grad_norm": 3.536157589286655, |
| "learning_rate": 6.739130434782609e-06, |
| "loss": 0.1466, |
| "mean_token_accuracy": 0.9586777091026306, |
| "step": 31 |
| }, |
| { |
| "epoch": 0.14035087719298245, |
| "grad_norm": 3.150422102382113, |
| "learning_rate": 6.956521739130435e-06, |
| "loss": 0.143, |
| "mean_token_accuracy": 0.9620253443717957, |
| "step": 32 |
| }, |
| { |
| "epoch": 0.14473684210526316, |
| "grad_norm": 2.918394399375414, |
| "learning_rate": 7.173913043478261e-06, |
| "loss": 0.1276, |
| "mean_token_accuracy": 0.9589040875434875, |
| "step": 33 |
| }, |
| { |
| "epoch": 0.14912280701754385, |
| "grad_norm": 2.4545761003364377, |
| "learning_rate": 7.391304347826087e-06, |
| "loss": 0.1287, |
| "mean_token_accuracy": 0.9599659442901611, |
| "step": 34 |
| }, |
| { |
| "epoch": 0.15350877192982457, |
| "grad_norm": 2.1082156725260237, |
| "learning_rate": 7.608695652173914e-06, |
| "loss": 0.1151, |
| "mean_token_accuracy": 0.9640427827835083, |
| "step": 35 |
| }, |
| { |
| "epoch": 0.15789473684210525, |
| "grad_norm": 1.8138097404916538, |
| "learning_rate": 7.82608695652174e-06, |
| "loss": 0.0763, |
| "mean_token_accuracy": 0.975704550743103, |
| "step": 36 |
| }, |
| { |
| "epoch": 0.16228070175438597, |
| "grad_norm": 1.7985617861288523, |
| "learning_rate": 8.043478260869566e-06, |
| "loss": 0.0702, |
| "mean_token_accuracy": 0.9836363792419434, |
| "step": 37 |
| }, |
| { |
| "epoch": 0.16666666666666666, |
| "grad_norm": 2.3492670275067824, |
| "learning_rate": 8.260869565217392e-06, |
| "loss": 0.1244, |
| "mean_token_accuracy": 0.9643192291259766, |
| "step": 38 |
| }, |
| { |
| "epoch": 0.17105263157894737, |
| "grad_norm": 2.1665390236265605, |
| "learning_rate": 8.478260869565218e-06, |
| "loss": 0.0756, |
| "mean_token_accuracy": 0.9758551120758057, |
| "step": 39 |
| }, |
| { |
| "epoch": 0.17543859649122806, |
| "grad_norm": 2.557810654370321, |
| "learning_rate": 8.695652173913044e-06, |
| "loss": 0.0872, |
| "mean_token_accuracy": 0.9704604744911194, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.17982456140350878, |
| "grad_norm": 2.268965321261875, |
| "learning_rate": 8.91304347826087e-06, |
| "loss": 0.0898, |
| "mean_token_accuracy": 0.9727187156677246, |
| "step": 41 |
| }, |
| { |
| "epoch": 0.18421052631578946, |
| "grad_norm": 2.0973990791113684, |
| "learning_rate": 9.130434782608697e-06, |
| "loss": 0.0866, |
| "mean_token_accuracy": 0.9679193496704102, |
| "step": 42 |
| }, |
| { |
| "epoch": 0.18859649122807018, |
| "grad_norm": 1.626633363405833, |
| "learning_rate": 9.347826086956523e-06, |
| "loss": 0.0724, |
| "mean_token_accuracy": 0.9760368466377258, |
| "step": 43 |
| }, |
| { |
| "epoch": 0.19298245614035087, |
| "grad_norm": 2.1620117949707756, |
| "learning_rate": 9.565217391304349e-06, |
| "loss": 0.0999, |
| "mean_token_accuracy": 0.9680150747299194, |
| "step": 44 |
| }, |
| { |
| "epoch": 0.19736842105263158, |
| "grad_norm": 1.2520032061405733, |
| "learning_rate": 9.782608695652175e-06, |
| "loss": 0.0554, |
| "mean_token_accuracy": 0.9850606918334961, |
| "step": 45 |
| }, |
| { |
| "epoch": 0.20175438596491227, |
| "grad_norm": 1.4309898053018073, |
| "learning_rate": 1e-05, |
| "loss": 0.064, |
| "mean_token_accuracy": 0.9765917658805847, |
| "step": 46 |
| }, |
| { |
| "epoch": 0.20614035087719298, |
| "grad_norm": 2.333730871053151, |
| "learning_rate": 9.999867897077623e-06, |
| "loss": 0.0744, |
| "mean_token_accuracy": 0.9745222926139832, |
| "step": 47 |
| }, |
| { |
| "epoch": 0.21052631578947367, |
| "grad_norm": 1.3385680704854448, |
| "learning_rate": 9.999471596066567e-06, |
| "loss": 0.0786, |
| "mean_token_accuracy": 0.9768304228782654, |
| "step": 48 |
| }, |
| { |
| "epoch": 0.2149122807017544, |
| "grad_norm": 1.266546478812543, |
| "learning_rate": 9.998811120234624e-06, |
| "loss": 0.064, |
| "mean_token_accuracy": 0.9820923805236816, |
| "step": 49 |
| }, |
| { |
| "epoch": 0.21929824561403508, |
| "grad_norm": 1.3816615110748307, |
| "learning_rate": 9.99788650835992e-06, |
| "loss": 0.0596, |
| "mean_token_accuracy": 0.985659658908844, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.2236842105263158, |
| "grad_norm": 1.5563786480997326, |
| "learning_rate": 9.996697814728646e-06, |
| "loss": 0.0771, |
| "mean_token_accuracy": 0.9780316352844238, |
| "step": 51 |
| }, |
| { |
| "epoch": 0.22807017543859648, |
| "grad_norm": 1.5481654213912595, |
| "learning_rate": 9.99524510913187e-06, |
| "loss": 0.0592, |
| "mean_token_accuracy": 0.9859022498130798, |
| "step": 52 |
| }, |
| { |
| "epoch": 0.2324561403508772, |
| "grad_norm": 1.813189588194494, |
| "learning_rate": 9.99352847686144e-06, |
| "loss": 0.073, |
| "mean_token_accuracy": 0.9773585200309753, |
| "step": 53 |
| }, |
| { |
| "epoch": 0.23684210526315788, |
| "grad_norm": 2.5030818918624806, |
| "learning_rate": 9.991548018704971e-06, |
| "loss": 0.0672, |
| "mean_token_accuracy": 0.9824047088623047, |
| "step": 54 |
| }, |
| { |
| "epoch": 0.2412280701754386, |
| "grad_norm": 1.2244158695036425, |
| "learning_rate": 9.989303850939937e-06, |
| "loss": 0.0386, |
| "mean_token_accuracy": 0.9875119924545288, |
| "step": 55 |
| }, |
| { |
| "epoch": 0.24561403508771928, |
| "grad_norm": 1.6953761327080115, |
| "learning_rate": 9.986796105326832e-06, |
| "loss": 0.0592, |
| "mean_token_accuracy": 0.979651153087616, |
| "step": 56 |
| }, |
| { |
| "epoch": 0.25, |
| "grad_norm": 3.319235269562587, |
| "learning_rate": 9.98402492910145e-06, |
| "loss": 0.097, |
| "mean_token_accuracy": 0.9707762598991394, |
| "step": 57 |
| }, |
| { |
| "epoch": 0.2543859649122807, |
| "grad_norm": 2.988331178966144, |
| "learning_rate": 9.98099048496622e-06, |
| "loss": 0.1136, |
| "mean_token_accuracy": 0.9684684872627258, |
| "step": 58 |
| }, |
| { |
| "epoch": 0.25877192982456143, |
| "grad_norm": 0.9470715725380437, |
| "learning_rate": 9.977692951080673e-06, |
| "loss": 0.0347, |
| "mean_token_accuracy": 0.9885764718055725, |
| "step": 59 |
| }, |
| { |
| "epoch": 0.2631578947368421, |
| "grad_norm": 1.3283555319515876, |
| "learning_rate": 9.97413252105097e-06, |
| "loss": 0.068, |
| "mean_token_accuracy": 0.9776785969734192, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.2675438596491228, |
| "grad_norm": 2.4045685350806028, |
| "learning_rate": 9.970309403918538e-06, |
| "loss": 0.0974, |
| "mean_token_accuracy": 0.9745222926139832, |
| "step": 61 |
| }, |
| { |
| "epoch": 0.2719298245614035, |
| "grad_norm": 1.8162228200169925, |
| "learning_rate": 9.966223824147798e-06, |
| "loss": 0.0909, |
| "mean_token_accuracy": 0.9758812785148621, |
| "step": 62 |
| }, |
| { |
| "epoch": 0.27631578947368424, |
| "grad_norm": 1.6185325805561612, |
| "learning_rate": 9.961876021612984e-06, |
| "loss": 0.066, |
| "mean_token_accuracy": 0.9776951670646667, |
| "step": 63 |
| }, |
| { |
| "epoch": 0.2807017543859649, |
| "grad_norm": 1.379345140925176, |
| "learning_rate": 9.957266251584061e-06, |
| "loss": 0.0527, |
| "mean_token_accuracy": 0.984644889831543, |
| "step": 64 |
| }, |
| { |
| "epoch": 0.2850877192982456, |
| "grad_norm": 1.4797188916318302, |
| "learning_rate": 9.952394784711736e-06, |
| "loss": 0.0913, |
| "mean_token_accuracy": 0.9706994295120239, |
| "step": 65 |
| }, |
| { |
| "epoch": 0.2894736842105263, |
| "grad_norm": 1.0148882534661667, |
| "learning_rate": 9.94726190701157e-06, |
| "loss": 0.0514, |
| "mean_token_accuracy": 0.9857684969902039, |
| "step": 66 |
| }, |
| { |
| "epoch": 0.29385964912280704, |
| "grad_norm": 1.0503887931527798, |
| "learning_rate": 9.94186791984718e-06, |
| "loss": 0.0451, |
| "mean_token_accuracy": 0.9852805733680725, |
| "step": 67 |
| }, |
| { |
| "epoch": 0.2982456140350877, |
| "grad_norm": 1.2474420239255015, |
| "learning_rate": 9.936213139912555e-06, |
| "loss": 0.0478, |
| "mean_token_accuracy": 0.9841713309288025, |
| "step": 68 |
| }, |
| { |
| "epoch": 0.3026315789473684, |
| "grad_norm": 1.2646846287992852, |
| "learning_rate": 9.930297899213454e-06, |
| "loss": 0.0569, |
| "mean_token_accuracy": 0.9820585250854492, |
| "step": 69 |
| }, |
| { |
| "epoch": 0.30701754385964913, |
| "grad_norm": 1.2066314306641774, |
| "learning_rate": 9.924122545047908e-06, |
| "loss": 0.0745, |
| "mean_token_accuracy": 0.9806451797485352, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.31140350877192985, |
| "grad_norm": 1.3075919008059358, |
| "learning_rate": 9.917687439985848e-06, |
| "loss": 0.0597, |
| "mean_token_accuracy": 0.982243001461029, |
| "step": 71 |
| }, |
| { |
| "epoch": 0.3157894736842105, |
| "grad_norm": 1.4572206129371754, |
| "learning_rate": 9.910992961847798e-06, |
| "loss": 0.0736, |
| "mean_token_accuracy": 0.9745596647262573, |
| "step": 72 |
| }, |
| { |
| "epoch": 0.3201754385964912, |
| "grad_norm": 0.9945014238542652, |
| "learning_rate": 9.904039503682701e-06, |
| "loss": 0.0528, |
| "mean_token_accuracy": 0.9817017316818237, |
| "step": 73 |
| }, |
| { |
| "epoch": 0.32456140350877194, |
| "grad_norm": 1.381291482904335, |
| "learning_rate": 9.896827473744848e-06, |
| "loss": 0.0621, |
| "mean_token_accuracy": 0.9814453125, |
| "step": 74 |
| }, |
| { |
| "epoch": 0.32894736842105265, |
| "grad_norm": 1.5125438742028228, |
| "learning_rate": 9.889357295469893e-06, |
| "loss": 0.0693, |
| "mean_token_accuracy": 0.9787036776542664, |
| "step": 75 |
| }, |
| { |
| "epoch": 0.3333333333333333, |
| "grad_norm": 1.2105292866557482, |
| "learning_rate": 9.881629407450007e-06, |
| "loss": 0.0799, |
| "mean_token_accuracy": 0.9771144390106201, |
| "step": 76 |
| }, |
| { |
| "epoch": 0.33771929824561403, |
| "grad_norm": 0.8733717386433665, |
| "learning_rate": 9.873644263408119e-06, |
| "loss": 0.03, |
| "mean_token_accuracy": 0.9908629655838013, |
| "step": 77 |
| }, |
| { |
| "epoch": 0.34210526315789475, |
| "grad_norm": 1.4745954914973345, |
| "learning_rate": 9.86540233217128e-06, |
| "loss": 0.0677, |
| "mean_token_accuracy": 0.9771327972412109, |
| "step": 78 |
| }, |
| { |
| "epoch": 0.34649122807017546, |
| "grad_norm": 1.424001661198713, |
| "learning_rate": 9.856904097643136e-06, |
| "loss": 0.0691, |
| "mean_token_accuracy": 0.978821337223053, |
| "step": 79 |
| }, |
| { |
| "epoch": 0.3508771929824561, |
| "grad_norm": 1.7925503325482246, |
| "learning_rate": 9.848150058775514e-06, |
| "loss": 0.0765, |
| "mean_token_accuracy": 0.9796333909034729, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.35526315789473684, |
| "grad_norm": 1.1225955509979852, |
| "learning_rate": 9.839140729539135e-06, |
| "loss": 0.0544, |
| "mean_token_accuracy": 0.9839622378349304, |
| "step": 81 |
| }, |
| { |
| "epoch": 0.35964912280701755, |
| "grad_norm": 0.9510612378316599, |
| "learning_rate": 9.829876638893432e-06, |
| "loss": 0.0412, |
| "mean_token_accuracy": 0.9782823324203491, |
| "step": 82 |
| }, |
| { |
| "epoch": 0.36403508771929827, |
| "grad_norm": 1.13177229230189, |
| "learning_rate": 9.820358330755487e-06, |
| "loss": 0.049, |
| "mean_token_accuracy": 0.9879879951477051, |
| "step": 83 |
| }, |
| { |
| "epoch": 0.3684210526315789, |
| "grad_norm": 1.9754486801271534, |
| "learning_rate": 9.810586363968115e-06, |
| "loss": 0.0729, |
| "mean_token_accuracy": 0.97609943151474, |
| "step": 84 |
| }, |
| { |
| "epoch": 0.37280701754385964, |
| "grad_norm": 1.601019897333522, |
| "learning_rate": 9.800561312267033e-06, |
| "loss": 0.0658, |
| "mean_token_accuracy": 0.9810298085212708, |
| "step": 85 |
| }, |
| { |
| "epoch": 0.37719298245614036, |
| "grad_norm": 1.1101099115321518, |
| "learning_rate": 9.790283764247188e-06, |
| "loss": 0.0482, |
| "mean_token_accuracy": 0.9835013747215271, |
| "step": 86 |
| }, |
| { |
| "epoch": 0.3815789473684211, |
| "grad_norm": 1.376077328474648, |
| "learning_rate": 9.779754323328192e-06, |
| "loss": 0.0758, |
| "mean_token_accuracy": 0.9761450290679932, |
| "step": 87 |
| }, |
| { |
| "epoch": 0.38596491228070173, |
| "grad_norm": 1.3674306296849106, |
| "learning_rate": 9.768973607718896e-06, |
| "loss": 0.083, |
| "mean_token_accuracy": 0.9726027250289917, |
| "step": 88 |
| }, |
| { |
| "epoch": 0.39035087719298245, |
| "grad_norm": 1.4732542476353192, |
| "learning_rate": 9.757942250381094e-06, |
| "loss": 0.0518, |
| "mean_token_accuracy": 0.9811498522758484, |
| "step": 89 |
| }, |
| { |
| "epoch": 0.39473684210526316, |
| "grad_norm": 1.3482578721824612, |
| "learning_rate": 9.746660898992362e-06, |
| "loss": 0.0671, |
| "mean_token_accuracy": 0.979835033416748, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.3991228070175439, |
| "grad_norm": 1.316168999392029, |
| "learning_rate": 9.735130215908027e-06, |
| "loss": 0.0722, |
| "mean_token_accuracy": 0.9818676114082336, |
| "step": 91 |
| }, |
| { |
| "epoch": 0.40350877192982454, |
| "grad_norm": 1.379574602848112, |
| "learning_rate": 9.723350878122283e-06, |
| "loss": 0.0458, |
| "mean_token_accuracy": 0.9851852059364319, |
| "step": 92 |
| }, |
| { |
| "epoch": 0.40789473684210525, |
| "grad_norm": 1.0071824523272006, |
| "learning_rate": 9.711323577228433e-06, |
| "loss": 0.0698, |
| "mean_token_accuracy": 0.9760589599609375, |
| "step": 93 |
| }, |
| { |
| "epoch": 0.41228070175438597, |
| "grad_norm": 1.266466321678705, |
| "learning_rate": 9.699049019378303e-06, |
| "loss": 0.0819, |
| "mean_token_accuracy": 0.975806474685669, |
| "step": 94 |
| }, |
| { |
| "epoch": 0.4166666666666667, |
| "grad_norm": 1.3455135307156914, |
| "learning_rate": 9.686527925240763e-06, |
| "loss": 0.0601, |
| "mean_token_accuracy": 0.9810181260108948, |
| "step": 95 |
| }, |
| { |
| "epoch": 0.42105263157894735, |
| "grad_norm": 0.9456900510734121, |
| "learning_rate": 9.673761029959427e-06, |
| "loss": 0.0342, |
| "mean_token_accuracy": 0.9915013909339905, |
| "step": 96 |
| }, |
| { |
| "epoch": 0.42543859649122806, |
| "grad_norm": 1.0670984903024676, |
| "learning_rate": 9.660749083109483e-06, |
| "loss": 0.0559, |
| "mean_token_accuracy": 0.982041597366333, |
| "step": 97 |
| }, |
| { |
| "epoch": 0.4298245614035088, |
| "grad_norm": 0.9511608807323504, |
| "learning_rate": 9.647492848653689e-06, |
| "loss": 0.0422, |
| "mean_token_accuracy": 0.9856459498405457, |
| "step": 98 |
| }, |
| { |
| "epoch": 0.4342105263157895, |
| "grad_norm": 0.855470902108151, |
| "learning_rate": 9.633993104897516e-06, |
| "loss": 0.0328, |
| "mean_token_accuracy": 0.9910793900489807, |
| "step": 99 |
| }, |
| { |
| "epoch": 0.43859649122807015, |
| "grad_norm": 1.1964255904299725, |
| "learning_rate": 9.620250644443454e-06, |
| "loss": 0.0507, |
| "mean_token_accuracy": 0.9825206995010376, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.44298245614035087, |
| "grad_norm": 0.9481898254248041, |
| "learning_rate": 9.606266274144475e-06, |
| "loss": 0.0409, |
| "mean_token_accuracy": 0.9877126812934875, |
| "step": 101 |
| }, |
| { |
| "epoch": 0.4473684210526316, |
| "grad_norm": 1.3401489394569477, |
| "learning_rate": 9.592040815056662e-06, |
| "loss": 0.041, |
| "mean_token_accuracy": 0.98247230052948, |
| "step": 102 |
| }, |
| { |
| "epoch": 0.4517543859649123, |
| "grad_norm": 0.9998470993447407, |
| "learning_rate": 9.577575102390999e-06, |
| "loss": 0.0417, |
| "mean_token_accuracy": 0.9850187301635742, |
| "step": 103 |
| }, |
| { |
| "epoch": 0.45614035087719296, |
| "grad_norm": 1.3562634098474966, |
| "learning_rate": 9.562869985464341e-06, |
| "loss": 0.0568, |
| "mean_token_accuracy": 0.9845559597015381, |
| "step": 104 |
| }, |
| { |
| "epoch": 0.4605263157894737, |
| "grad_norm": 1.3140006713189099, |
| "learning_rate": 9.547926327649535e-06, |
| "loss": 0.0769, |
| "mean_token_accuracy": 0.980751633644104, |
| "step": 105 |
| }, |
| { |
| "epoch": 0.4649122807017544, |
| "grad_norm": 1.32380632494124, |
| "learning_rate": 9.53274500632475e-06, |
| "loss": 0.0577, |
| "mean_token_accuracy": 0.9852941036224365, |
| "step": 106 |
| }, |
| { |
| "epoch": 0.4692982456140351, |
| "grad_norm": 1.036691120819973, |
| "learning_rate": 9.517326912821948e-06, |
| "loss": 0.0457, |
| "mean_token_accuracy": 0.9857777953147888, |
| "step": 107 |
| }, |
| { |
| "epoch": 0.47368421052631576, |
| "grad_norm": 0.9264599901876123, |
| "learning_rate": 9.501672952374551e-06, |
| "loss": 0.0259, |
| "mean_token_accuracy": 0.9909999966621399, |
| "step": 108 |
| }, |
| { |
| "epoch": 0.4780701754385965, |
| "grad_norm": 1.0463433222244278, |
| "learning_rate": 9.485784044064305e-06, |
| "loss": 0.065, |
| "mean_token_accuracy": 0.9845559597015381, |
| "step": 109 |
| }, |
| { |
| "epoch": 0.4824561403508772, |
| "grad_norm": 1.9769264713279622, |
| "learning_rate": 9.469661120767308e-06, |
| "loss": 0.0642, |
| "mean_token_accuracy": 0.9832746386528015, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.4868421052631579, |
| "grad_norm": 1.3001866387052001, |
| "learning_rate": 9.453305129099241e-06, |
| "loss": 0.0622, |
| "mean_token_accuracy": 0.9802371263504028, |
| "step": 111 |
| }, |
| { |
| "epoch": 0.49122807017543857, |
| "grad_norm": 0.8345354842642208, |
| "learning_rate": 9.436717029359794e-06, |
| "loss": 0.035, |
| "mean_token_accuracy": 0.9845559597015381, |
| "step": 112 |
| }, |
| { |
| "epoch": 0.4956140350877193, |
| "grad_norm": 1.06099019714702, |
| "learning_rate": 9.419897795476276e-06, |
| "loss": 0.048, |
| "mean_token_accuracy": 0.9877819418907166, |
| "step": 113 |
| }, |
| { |
| "epoch": 0.5, |
| "grad_norm": 0.7879601442619304, |
| "learning_rate": 9.402848414946445e-06, |
| "loss": 0.0446, |
| "mean_token_accuracy": 0.9885260462760925, |
| "step": 114 |
| }, |
| { |
| "epoch": 0.5043859649122807, |
| "grad_norm": 0.9234673024179365, |
| "learning_rate": 9.385569888780517e-06, |
| "loss": 0.0577, |
| "mean_token_accuracy": 0.982758641242981, |
| "step": 115 |
| }, |
| { |
| "epoch": 0.5087719298245614, |
| "grad_norm": 0.8127855916721984, |
| "learning_rate": 9.368063231442406e-06, |
| "loss": 0.0482, |
| "mean_token_accuracy": 0.9851632118225098, |
| "step": 116 |
| }, |
| { |
| "epoch": 0.5131578947368421, |
| "grad_norm": 0.97018286252887, |
| "learning_rate": 9.350329470790153e-06, |
| "loss": 0.0602, |
| "mean_token_accuracy": 0.9847763776779175, |
| "step": 117 |
| }, |
| { |
| "epoch": 0.5175438596491229, |
| "grad_norm": 0.9321617403855113, |
| "learning_rate": 9.332369648015583e-06, |
| "loss": 0.0444, |
| "mean_token_accuracy": 0.9858890175819397, |
| "step": 118 |
| }, |
| { |
| "epoch": 0.5219298245614035, |
| "grad_norm": 1.081330247660032, |
| "learning_rate": 9.314184817583176e-06, |
| "loss": 0.0642, |
| "mean_token_accuracy": 0.9779693484306335, |
| "step": 119 |
| }, |
| { |
| "epoch": 0.5263157894736842, |
| "grad_norm": 0.7551549278614351, |
| "learning_rate": 9.295776047168149e-06, |
| "loss": 0.0406, |
| "mean_token_accuracy": 0.9873303174972534, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.5307017543859649, |
| "grad_norm": 0.7859022034827321, |
| "learning_rate": 9.277144417593777e-06, |
| "loss": 0.0389, |
| "mean_token_accuracy": 0.9890210628509521, |
| "step": 121 |
| }, |
| { |
| "epoch": 0.5350877192982456, |
| "grad_norm": 0.885505455969031, |
| "learning_rate": 9.258291022767932e-06, |
| "loss": 0.0406, |
| "mean_token_accuracy": 0.9828734397888184, |
| "step": 122 |
| }, |
| { |
| "epoch": 0.5394736842105263, |
| "grad_norm": 0.669353006888733, |
| "learning_rate": 9.239216969618862e-06, |
| "loss": 0.0374, |
| "mean_token_accuracy": 0.9884341359138489, |
| "step": 123 |
| }, |
| { |
| "epoch": 0.543859649122807, |
| "grad_norm": 1.1567427530378693, |
| "learning_rate": 9.219923378030197e-06, |
| "loss": 0.0401, |
| "mean_token_accuracy": 0.9858782291412354, |
| "step": 124 |
| }, |
| { |
| "epoch": 0.5482456140350878, |
| "grad_norm": 0.6500109168701329, |
| "learning_rate": 9.200411380775192e-06, |
| "loss": 0.0295, |
| "mean_token_accuracy": 0.992445707321167, |
| "step": 125 |
| }, |
| { |
| "epoch": 0.5526315789473685, |
| "grad_norm": 0.8300271536745402, |
| "learning_rate": 9.180682123450232e-06, |
| "loss": 0.0275, |
| "mean_token_accuracy": 0.992530345916748, |
| "step": 126 |
| }, |
| { |
| "epoch": 0.5570175438596491, |
| "grad_norm": 1.1489645191041542, |
| "learning_rate": 9.160736764407555e-06, |
| "loss": 0.041, |
| "mean_token_accuracy": 0.9854147434234619, |
| "step": 127 |
| }, |
| { |
| "epoch": 0.5614035087719298, |
| "grad_norm": 1.0725337888722772, |
| "learning_rate": 9.140576474687263e-06, |
| "loss": 0.0474, |
| "mean_token_accuracy": 0.9825412034988403, |
| "step": 128 |
| }, |
| { |
| "epoch": 0.5657894736842105, |
| "grad_norm": 1.1915828601208973, |
| "learning_rate": 9.120202437948551e-06, |
| "loss": 0.0521, |
| "mean_token_accuracy": 0.984236478805542, |
| "step": 129 |
| }, |
| { |
| "epoch": 0.5701754385964912, |
| "grad_norm": 0.7742645797594888, |
| "learning_rate": 9.099615850400214e-06, |
| "loss": 0.0326, |
| "mean_token_accuracy": 0.9884169697761536, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.5745614035087719, |
| "grad_norm": 0.7845461767632265, |
| "learning_rate": 9.078817920730421e-06, |
| "loss": 0.0368, |
| "mean_token_accuracy": 0.9894737005233765, |
| "step": 131 |
| }, |
| { |
| "epoch": 0.5789473684210527, |
| "grad_norm": 1.0369284341370864, |
| "learning_rate": 9.057809870035743e-06, |
| "loss": 0.0507, |
| "mean_token_accuracy": 0.9851778745651245, |
| "step": 132 |
| }, |
| { |
| "epoch": 0.5833333333333334, |
| "grad_norm": 1.0335832998709262, |
| "learning_rate": 9.036592931749463e-06, |
| "loss": 0.0659, |
| "mean_token_accuracy": 0.9821428656578064, |
| "step": 133 |
| }, |
| { |
| "epoch": 0.5877192982456141, |
| "grad_norm": 0.9622307462607973, |
| "learning_rate": 9.015168351569165e-06, |
| "loss": 0.0534, |
| "mean_token_accuracy": 0.9873987436294556, |
| "step": 134 |
| }, |
| { |
| "epoch": 0.5921052631578947, |
| "grad_norm": 1.120938657518319, |
| "learning_rate": 8.993537387383579e-06, |
| "loss": 0.0493, |
| "mean_token_accuracy": 0.9802445769309998, |
| "step": 135 |
| }, |
| { |
| "epoch": 0.5964912280701754, |
| "grad_norm": 1.4457156627458978, |
| "learning_rate": 8.971701309198744e-06, |
| "loss": 0.0412, |
| "mean_token_accuracy": 0.9885167479515076, |
| "step": 136 |
| }, |
| { |
| "epoch": 0.6008771929824561, |
| "grad_norm": 0.8696354320513997, |
| "learning_rate": 8.949661399063432e-06, |
| "loss": 0.0245, |
| "mean_token_accuracy": 0.9923954606056213, |
| "step": 137 |
| }, |
| { |
| "epoch": 0.6052631578947368, |
| "grad_norm": 1.1258249899716837, |
| "learning_rate": 8.927418950993885e-06, |
| "loss": 0.0587, |
| "mean_token_accuracy": 0.9820466637611389, |
| "step": 138 |
| }, |
| { |
| "epoch": 0.6096491228070176, |
| "grad_norm": 0.9496445295698097, |
| "learning_rate": 8.90497527089783e-06, |
| "loss": 0.0407, |
| "mean_token_accuracy": 0.9872379302978516, |
| "step": 139 |
| }, |
| { |
| "epoch": 0.6140350877192983, |
| "grad_norm": 0.879391678700531, |
| "learning_rate": 8.882331676497813e-06, |
| "loss": 0.0585, |
| "mean_token_accuracy": 0.9859859943389893, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.618421052631579, |
| "grad_norm": 1.0109081929029509, |
| "learning_rate": 8.859489497253833e-06, |
| "loss": 0.0596, |
| "mean_token_accuracy": 0.9808428883552551, |
| "step": 141 |
| }, |
| { |
| "epoch": 0.6228070175438597, |
| "grad_norm": 0.8199798067128391, |
| "learning_rate": 8.83645007428528e-06, |
| "loss": 0.0358, |
| "mean_token_accuracy": 0.9920159578323364, |
| "step": 142 |
| }, |
| { |
| "epoch": 0.6271929824561403, |
| "grad_norm": 1.1335033794657405, |
| "learning_rate": 8.813214760292202e-06, |
| "loss": 0.0582, |
| "mean_token_accuracy": 0.9835748672485352, |
| "step": 143 |
| }, |
| { |
| "epoch": 0.631578947368421, |
| "grad_norm": 0.8568132844393954, |
| "learning_rate": 8.789784919475878e-06, |
| "loss": 0.0434, |
| "mean_token_accuracy": 0.987261176109314, |
| "step": 144 |
| }, |
| { |
| "epoch": 0.6359649122807017, |
| "grad_norm": 0.9459516671423277, |
| "learning_rate": 8.766161927458726e-06, |
| "loss": 0.047, |
| "mean_token_accuracy": 0.9838420152664185, |
| "step": 145 |
| }, |
| { |
| "epoch": 0.6403508771929824, |
| "grad_norm": 0.8784141085743796, |
| "learning_rate": 8.742347171203542e-06, |
| "loss": 0.042, |
| "mean_token_accuracy": 0.982775092124939, |
| "step": 146 |
| }, |
| { |
| "epoch": 0.6447368421052632, |
| "grad_norm": 0.6588438281546535, |
| "learning_rate": 8.718342048932054e-06, |
| "loss": 0.0287, |
| "mean_token_accuracy": 0.9916045069694519, |
| "step": 147 |
| }, |
| { |
| "epoch": 0.6491228070175439, |
| "grad_norm": 1.133653130578679, |
| "learning_rate": 8.694147970042842e-06, |
| "loss": 0.0476, |
| "mean_token_accuracy": 0.9840674996376038, |
| "step": 148 |
| }, |
| { |
| "epoch": 0.6535087719298246, |
| "grad_norm": 0.7672844205547247, |
| "learning_rate": 8.669766355028584e-06, |
| "loss": 0.0398, |
| "mean_token_accuracy": 0.987261176109314, |
| "step": 149 |
| }, |
| { |
| "epoch": 0.6578947368421053, |
| "grad_norm": 0.811385457927767, |
| "learning_rate": 8.645198635392659e-06, |
| "loss": 0.0334, |
| "mean_token_accuracy": 0.9871677160263062, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.6622807017543859, |
| "grad_norm": 0.6941411012141862, |
| "learning_rate": 8.620446253565088e-06, |
| "loss": 0.0293, |
| "mean_token_accuracy": 0.9859550595283508, |
| "step": 151 |
| }, |
| { |
| "epoch": 0.6666666666666666, |
| "grad_norm": 0.9284599146300014, |
| "learning_rate": 8.595510662817865e-06, |
| "loss": 0.0541, |
| "mean_token_accuracy": 0.9869739413261414, |
| "step": 152 |
| }, |
| { |
| "epoch": 0.6710526315789473, |
| "grad_norm": 0.556672992651389, |
| "learning_rate": 8.570393327179614e-06, |
| "loss": 0.0184, |
| "mean_token_accuracy": 0.9922928810119629, |
| "step": 153 |
| }, |
| { |
| "epoch": 0.6754385964912281, |
| "grad_norm": 0.6433995398591961, |
| "learning_rate": 8.545095721349641e-06, |
| "loss": 0.0299, |
| "mean_token_accuracy": 0.9886956810951233, |
| "step": 154 |
| }, |
| { |
| "epoch": 0.6798245614035088, |
| "grad_norm": 0.9442804314136596, |
| "learning_rate": 8.519619330611353e-06, |
| "loss": 0.0456, |
| "mean_token_accuracy": 0.9879629611968994, |
| "step": 155 |
| }, |
| { |
| "epoch": 0.6842105263157895, |
| "grad_norm": 0.9331416915582066, |
| "learning_rate": 8.493965650745043e-06, |
| "loss": 0.0343, |
| "mean_token_accuracy": 0.9870370626449585, |
| "step": 156 |
| }, |
| { |
| "epoch": 0.6885964912280702, |
| "grad_norm": 0.9196023329834228, |
| "learning_rate": 8.468136187940087e-06, |
| "loss": 0.0502, |
| "mean_token_accuracy": 0.9874638319015503, |
| "step": 157 |
| }, |
| { |
| "epoch": 0.6929824561403509, |
| "grad_norm": 0.6985907739475705, |
| "learning_rate": 8.442132458706484e-06, |
| "loss": 0.0298, |
| "mean_token_accuracy": 0.9891794323921204, |
| "step": 158 |
| }, |
| { |
| "epoch": 0.6973684210526315, |
| "grad_norm": 0.9290371151262443, |
| "learning_rate": 8.415955989785852e-06, |
| "loss": 0.0654, |
| "mean_token_accuracy": 0.9832869172096252, |
| "step": 159 |
| }, |
| { |
| "epoch": 0.7017543859649122, |
| "grad_norm": 0.8835977906232552, |
| "learning_rate": 8.389608318061761e-06, |
| "loss": 0.0442, |
| "mean_token_accuracy": 0.9884058237075806, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.706140350877193, |
| "grad_norm": 0.7674572545805198, |
| "learning_rate": 8.36309099046952e-06, |
| "loss": 0.0305, |
| "mean_token_accuracy": 0.9847618937492371, |
| "step": 161 |
| }, |
| { |
| "epoch": 0.7105263157894737, |
| "grad_norm": 0.6630131510438095, |
| "learning_rate": 8.336405563905333e-06, |
| "loss": 0.0257, |
| "mean_token_accuracy": 0.9909008145332336, |
| "step": 162 |
| }, |
| { |
| "epoch": 0.7149122807017544, |
| "grad_norm": 0.6691346582591647, |
| "learning_rate": 8.309553605134904e-06, |
| "loss": 0.0366, |
| "mean_token_accuracy": 0.9871441721916199, |
| "step": 163 |
| }, |
| { |
| "epoch": 0.7192982456140351, |
| "grad_norm": 0.6583372683857567, |
| "learning_rate": 8.282536690701446e-06, |
| "loss": 0.031, |
| "mean_token_accuracy": 0.9904761910438538, |
| "step": 164 |
| }, |
| { |
| "epoch": 0.7236842105263158, |
| "grad_norm": 0.7417399430737569, |
| "learning_rate": 8.25535640683311e-06, |
| "loss": 0.034, |
| "mean_token_accuracy": 0.9887640476226807, |
| "step": 165 |
| }, |
| { |
| "epoch": 0.7280701754385965, |
| "grad_norm": 0.6086863615930597, |
| "learning_rate": 8.228014349349872e-06, |
| "loss": 0.0294, |
| "mean_token_accuracy": 0.9897769689559937, |
| "step": 166 |
| }, |
| { |
| "epoch": 0.7324561403508771, |
| "grad_norm": 1.1514596247668607, |
| "learning_rate": 8.200512123569817e-06, |
| "loss": 0.0636, |
| "mean_token_accuracy": 0.9883093237876892, |
| "step": 167 |
| }, |
| { |
| "epoch": 0.7368421052631579, |
| "grad_norm": 0.7244088560914517, |
| "learning_rate": 8.172851344214896e-06, |
| "loss": 0.0351, |
| "mean_token_accuracy": 0.9887217879295349, |
| "step": 168 |
| }, |
| { |
| "epoch": 0.7412280701754386, |
| "grad_norm": 0.8439610334789659, |
| "learning_rate": 8.14503363531613e-06, |
| "loss": 0.0307, |
| "mean_token_accuracy": 0.992445707321167, |
| "step": 169 |
| }, |
| { |
| "epoch": 0.7456140350877193, |
| "grad_norm": 0.705102312181603, |
| "learning_rate": 8.117060630118246e-06, |
| "loss": 0.0549, |
| "mean_token_accuracy": 0.98591548204422, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.75, |
| "grad_norm": 0.6473372518720657, |
| "learning_rate": 8.088933970983793e-06, |
| "loss": 0.0297, |
| "mean_token_accuracy": 0.9898167252540588, |
| "step": 171 |
| }, |
| { |
| "epoch": 0.7543859649122807, |
| "grad_norm": 1.069972895235383, |
| "learning_rate": 8.060655309296712e-06, |
| "loss": 0.0633, |
| "mean_token_accuracy": 0.9842007160186768, |
| "step": 172 |
| }, |
| { |
| "epoch": 0.7587719298245614, |
| "grad_norm": 0.5841693770396578, |
| "learning_rate": 8.032226305365383e-06, |
| "loss": 0.0272, |
| "mean_token_accuracy": 0.9884058237075806, |
| "step": 173 |
| }, |
| { |
| "epoch": 0.7631578947368421, |
| "grad_norm": 0.5182109728386859, |
| "learning_rate": 8.003648628325136e-06, |
| "loss": 0.0224, |
| "mean_token_accuracy": 0.993096649646759, |
| "step": 174 |
| }, |
| { |
| "epoch": 0.7675438596491229, |
| "grad_norm": 0.7775371151032232, |
| "learning_rate": 7.974923956040262e-06, |
| "loss": 0.0393, |
| "mean_token_accuracy": 0.986251175403595, |
| "step": 175 |
| }, |
| { |
| "epoch": 0.7719298245614035, |
| "grad_norm": 0.933580913520088, |
| "learning_rate": 7.946053975005495e-06, |
| "loss": 0.042, |
| "mean_token_accuracy": 0.9868173003196716, |
| "step": 176 |
| }, |
| { |
| "epoch": 0.7763157894736842, |
| "grad_norm": 0.6857001865915958, |
| "learning_rate": 7.917040380247e-06, |
| "loss": 0.0281, |
| "mean_token_accuracy": 0.9934024214744568, |
| "step": 177 |
| }, |
| { |
| "epoch": 0.7807017543859649, |
| "grad_norm": 0.6748697636742754, |
| "learning_rate": 7.887884875222841e-06, |
| "loss": 0.0319, |
| "mean_token_accuracy": 0.9888476133346558, |
| "step": 178 |
| }, |
| { |
| "epoch": 0.7850877192982456, |
| "grad_norm": 1.013709230159994, |
| "learning_rate": 7.858589171722985e-06, |
| "loss": 0.0529, |
| "mean_token_accuracy": 0.9868287444114685, |
| "step": 179 |
| }, |
| { |
| "epoch": 0.7894736842105263, |
| "grad_norm": 0.9817957568839512, |
| "learning_rate": 7.829154989768784e-06, |
| "loss": 0.027, |
| "mean_token_accuracy": 0.991525411605835, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.793859649122807, |
| "grad_norm": 0.9368613475334713, |
| "learning_rate": 7.799584057511997e-06, |
| "loss": 0.0406, |
| "mean_token_accuracy": 0.9888682961463928, |
| "step": 181 |
| }, |
| { |
| "epoch": 0.7982456140350878, |
| "grad_norm": 0.7264920776084144, |
| "learning_rate": 7.76987811113332e-06, |
| "loss": 0.0403, |
| "mean_token_accuracy": 0.9908842444419861, |
| "step": 182 |
| }, |
| { |
| "epoch": 0.8026315789473685, |
| "grad_norm": 1.0812661563717334, |
| "learning_rate": 7.740038894740454e-06, |
| "loss": 0.0261, |
| "mean_token_accuracy": 0.9920239448547363, |
| "step": 183 |
| }, |
| { |
| "epoch": 0.8070175438596491, |
| "grad_norm": 0.9401638962859927, |
| "learning_rate": 7.710068160265705e-06, |
| "loss": 0.0435, |
| "mean_token_accuracy": 0.9872298836708069, |
| "step": 184 |
| }, |
| { |
| "epoch": 0.8114035087719298, |
| "grad_norm": 0.6197655701622428, |
| "learning_rate": 7.679967667363121e-06, |
| "loss": 0.0213, |
| "mean_token_accuracy": 0.9943342804908752, |
| "step": 185 |
| }, |
| { |
| "epoch": 0.8157894736842105, |
| "grad_norm": 0.8832634701622661, |
| "learning_rate": 7.649739183305184e-06, |
| "loss": 0.0558, |
| "mean_token_accuracy": 0.9873303174972534, |
| "step": 186 |
| }, |
| { |
| "epoch": 0.8201754385964912, |
| "grad_norm": 0.8044268841352189, |
| "learning_rate": 7.619384482879039e-06, |
| "loss": 0.0324, |
| "mean_token_accuracy": 0.9906191229820251, |
| "step": 187 |
| }, |
| { |
| "epoch": 0.8245614035087719, |
| "grad_norm": 0.6162089893164087, |
| "learning_rate": 7.5889053482823015e-06, |
| "loss": 0.0246, |
| "mean_token_accuracy": 0.9921875, |
| "step": 188 |
| }, |
| { |
| "epoch": 0.8289473684210527, |
| "grad_norm": 0.40807361993747604, |
| "learning_rate": 7.558303569018417e-06, |
| "loss": 0.0228, |
| "mean_token_accuracy": 0.9942802786827087, |
| "step": 189 |
| }, |
| { |
| "epoch": 0.8333333333333334, |
| "grad_norm": 0.6324599964546919, |
| "learning_rate": 7.527580941791595e-06, |
| "loss": 0.0383, |
| "mean_token_accuracy": 0.9879406094551086, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.8377192982456141, |
| "grad_norm": 0.6016959509244331, |
| "learning_rate": 7.49673927040132e-06, |
| "loss": 0.0283, |
| "mean_token_accuracy": 0.9933142066001892, |
| "step": 191 |
| }, |
| { |
| "epoch": 0.8421052631578947, |
| "grad_norm": 0.6772586941548877, |
| "learning_rate": 7.465780365636445e-06, |
| "loss": 0.0353, |
| "mean_token_accuracy": 0.9960707426071167, |
| "step": 192 |
| }, |
| { |
| "epoch": 0.8464912280701754, |
| "grad_norm": 0.7609750902432352, |
| "learning_rate": 7.4347060451688805e-06, |
| "loss": 0.0333, |
| "mean_token_accuracy": 0.9926335215568542, |
| "step": 193 |
| }, |
| { |
| "epoch": 0.8508771929824561, |
| "grad_norm": 0.7304010964303218, |
| "learning_rate": 7.403518133446866e-06, |
| "loss": 0.0433, |
| "mean_token_accuracy": 0.9891008138656616, |
| "step": 194 |
| }, |
| { |
| "epoch": 0.8552631578947368, |
| "grad_norm": 1.0060964313822855, |
| "learning_rate": 7.37221846158786e-06, |
| "loss": 0.0518, |
| "mean_token_accuracy": 0.982616662979126, |
| "step": 195 |
| }, |
| { |
| "epoch": 0.8596491228070176, |
| "grad_norm": 0.6662204121323226, |
| "learning_rate": 7.340808867271031e-06, |
| "loss": 0.0403, |
| "mean_token_accuracy": 0.9893100261688232, |
| "step": 196 |
| }, |
| { |
| "epoch": 0.8640350877192983, |
| "grad_norm": 0.8400104072773938, |
| "learning_rate": 7.309291194629352e-06, |
| "loss": 0.0401, |
| "mean_token_accuracy": 0.9888888597488403, |
| "step": 197 |
| }, |
| { |
| "epoch": 0.868421052631579, |
| "grad_norm": 0.7928254172033133, |
| "learning_rate": 7.277667294141345e-06, |
| "loss": 0.0299, |
| "mean_token_accuracy": 0.9878731369972229, |
| "step": 198 |
| }, |
| { |
| "epoch": 0.8728070175438597, |
| "grad_norm": 0.8653511665696442, |
| "learning_rate": 7.245939022522413e-06, |
| "loss": 0.0258, |
| "mean_token_accuracy": 0.9939271211624146, |
| "step": 199 |
| }, |
| { |
| "epoch": 0.8771929824561403, |
| "grad_norm": 0.914431454360587, |
| "learning_rate": 7.214108242615852e-06, |
| "loss": 0.0402, |
| "mean_token_accuracy": 0.9838998317718506, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.881578947368421, |
| "grad_norm": 0.6770210818024482, |
| "learning_rate": 7.1821768232834595e-06, |
| "loss": 0.028, |
| "mean_token_accuracy": 0.9887111783027649, |
| "step": 201 |
| }, |
| { |
| "epoch": 0.8859649122807017, |
| "grad_norm": 0.8616249497631141, |
| "learning_rate": 7.150146639295816e-06, |
| "loss": 0.0399, |
| "mean_token_accuracy": 0.9885495901107788, |
| "step": 202 |
| }, |
| { |
| "epoch": 0.8903508771929824, |
| "grad_norm": 0.801695448458751, |
| "learning_rate": 7.118019571222216e-06, |
| "loss": 0.0529, |
| "mean_token_accuracy": 0.9849765300750732, |
| "step": 203 |
| }, |
| { |
| "epoch": 0.8947368421052632, |
| "grad_norm": 0.5835820009568224, |
| "learning_rate": 7.0857975053202485e-06, |
| "loss": 0.027, |
| "mean_token_accuracy": 0.9934518337249756, |
| "step": 204 |
| }, |
| { |
| "epoch": 0.8991228070175439, |
| "grad_norm": 0.7313001066910451, |
| "learning_rate": 7.053482333425057e-06, |
| "loss": 0.0269, |
| "mean_token_accuracy": 0.9902152419090271, |
| "step": 205 |
| }, |
| { |
| "epoch": 0.9035087719298246, |
| "grad_norm": 0.7941941178076102, |
| "learning_rate": 7.021075952838262e-06, |
| "loss": 0.0495, |
| "mean_token_accuracy": 0.9869646430015564, |
| "step": 206 |
| }, |
| { |
| "epoch": 0.9078947368421053, |
| "grad_norm": 0.7172681790983623, |
| "learning_rate": 6.988580266216566e-06, |
| "loss": 0.0273, |
| "mean_token_accuracy": 0.990230917930603, |
| "step": 207 |
| }, |
| { |
| "epoch": 0.9122807017543859, |
| "grad_norm": 1.049292501966634, |
| "learning_rate": 6.955997181460041e-06, |
| "loss": 0.0529, |
| "mean_token_accuracy": 0.9861751198768616, |
| "step": 208 |
| }, |
| { |
| "epoch": 0.9166666666666666, |
| "grad_norm": 0.620678758091928, |
| "learning_rate": 6.9233286116001194e-06, |
| "loss": 0.036, |
| "mean_token_accuracy": 0.9889094233512878, |
| "step": 209 |
| }, |
| { |
| "epoch": 0.9210526315789473, |
| "grad_norm": 0.5342928244609167, |
| "learning_rate": 6.890576474687264e-06, |
| "loss": 0.0219, |
| "mean_token_accuracy": 0.9959473013877869, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.9254385964912281, |
| "grad_norm": 0.5747154500522089, |
| "learning_rate": 6.857742693678367e-06, |
| "loss": 0.0252, |
| "mean_token_accuracy": 0.9907321333885193, |
| "step": 211 |
| }, |
| { |
| "epoch": 0.9298245614035088, |
| "grad_norm": 1.2876451230601493, |
| "learning_rate": 6.824829196323836e-06, |
| "loss": 0.0621, |
| "mean_token_accuracy": 0.9871541261672974, |
| "step": 212 |
| }, |
| { |
| "epoch": 0.9342105263157895, |
| "grad_norm": 1.1262611377865053, |
| "learning_rate": 6.791837915054422e-06, |
| "loss": 0.0518, |
| "mean_token_accuracy": 0.9827761054039001, |
| "step": 213 |
| }, |
| { |
| "epoch": 0.9385964912280702, |
| "grad_norm": 0.6956429370497834, |
| "learning_rate": 6.7587707868677566e-06, |
| "loss": 0.033, |
| "mean_token_accuracy": 0.9896519184112549, |
| "step": 214 |
| }, |
| { |
| "epoch": 0.9429824561403509, |
| "grad_norm": 0.8530747842494603, |
| "learning_rate": 6.725629753214624e-06, |
| "loss": 0.0301, |
| "mean_token_accuracy": 0.9885386824607849, |
| "step": 215 |
| }, |
| { |
| "epoch": 0.9473684210526315, |
| "grad_norm": 0.6692765498837409, |
| "learning_rate": 6.692416759884978e-06, |
| "loss": 0.0339, |
| "mean_token_accuracy": 0.9897483587265015, |
| "step": 216 |
| }, |
| { |
| "epoch": 0.9517543859649122, |
| "grad_norm": 0.6287573979814297, |
| "learning_rate": 6.659133756893701e-06, |
| "loss": 0.0358, |
| "mean_token_accuracy": 0.9917657971382141, |
| "step": 217 |
| }, |
| { |
| "epoch": 0.956140350877193, |
| "grad_norm": 0.57663263683149, |
| "learning_rate": 6.6257826983661044e-06, |
| "loss": 0.023, |
| "mean_token_accuracy": 0.9901477694511414, |
| "step": 218 |
| }, |
| { |
| "epoch": 0.9605263157894737, |
| "grad_norm": 0.8347370699421348, |
| "learning_rate": 6.592365542423213e-06, |
| "loss": 0.0408, |
| "mean_token_accuracy": 0.9883198738098145, |
| "step": 219 |
| }, |
| { |
| "epoch": 0.9649122807017544, |
| "grad_norm": 0.7318453288398662, |
| "learning_rate": 6.558884251066784e-06, |
| "loss": 0.0357, |
| "mean_token_accuracy": 0.9897292256355286, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.9692982456140351, |
| "grad_norm": 0.4737198820827406, |
| "learning_rate": 6.5253407900641195e-06, |
| "loss": 0.0281, |
| "mean_token_accuracy": 0.9944030046463013, |
| "step": 221 |
| }, |
| { |
| "epoch": 0.9736842105263158, |
| "grad_norm": 0.6234815572575056, |
| "learning_rate": 6.4917371288326554e-06, |
| "loss": 0.0299, |
| "mean_token_accuracy": 0.9908814430236816, |
| "step": 222 |
| }, |
| { |
| "epoch": 0.9780701754385965, |
| "grad_norm": 0.581581532274982, |
| "learning_rate": 6.458075240324324e-06, |
| "loss": 0.0251, |
| "mean_token_accuracy": 0.9943609237670898, |
| "step": 223 |
| }, |
| { |
| "epoch": 0.9824561403508771, |
| "grad_norm": 0.6354748439200917, |
| "learning_rate": 6.424357100909724e-06, |
| "loss": 0.0317, |
| "mean_token_accuracy": 0.9881495237350464, |
| "step": 224 |
| }, |
| { |
| "epoch": 0.9868421052631579, |
| "grad_norm": 0.4664519308151362, |
| "learning_rate": 6.390584690262079e-06, |
| "loss": 0.0211, |
| "mean_token_accuracy": 0.9965724349021912, |
| "step": 225 |
| }, |
| { |
| "epoch": 0.9912280701754386, |
| "grad_norm": 0.8781913223857738, |
| "learning_rate": 6.356759991241008e-06, |
| "loss": 0.0531, |
| "mean_token_accuracy": 0.9873303174972534, |
| "step": 226 |
| }, |
| { |
| "epoch": 0.9956140350877193, |
| "grad_norm": 0.5682681118336651, |
| "learning_rate": 6.3228849897761055e-06, |
| "loss": 0.0216, |
| "mean_token_accuracy": 0.9929328560829163, |
| "step": 227 |
| }, |
| { |
| "epoch": 1.0, |
| "grad_norm": 1.0872008037177538, |
| "learning_rate": 6.288961674750346e-06, |
| "loss": 0.0384, |
| "mean_token_accuracy": 0.9876072406768799, |
| "step": 228 |
| }, |
| { |
| "epoch": 1.0043859649122806, |
| "grad_norm": 0.6778017238896538, |
| "learning_rate": 6.2549920378833055e-06, |
| "loss": 0.0296, |
| "mean_token_accuracy": 0.9917203187942505, |
| "step": 229 |
| }, |
| { |
| "epoch": 1.0087719298245614, |
| "grad_norm": 0.49526269027622344, |
| "learning_rate": 6.22097807361423e-06, |
| "loss": 0.0152, |
| "mean_token_accuracy": 0.9934518337249756, |
| "step": 230 |
| }, |
| { |
| "epoch": 1.013157894736842, |
| "grad_norm": 0.5779155041074402, |
| "learning_rate": 6.186921778984936e-06, |
| "loss": 0.0194, |
| "mean_token_accuracy": 0.9950932264328003, |
| "step": 231 |
| }, |
| { |
| "epoch": 1.0175438596491229, |
| "grad_norm": 0.5812684468864261, |
| "learning_rate": 6.152825153522552e-06, |
| "loss": 0.0264, |
| "mean_token_accuracy": 0.9928379654884338, |
| "step": 232 |
| }, |
| { |
| "epoch": 1.0219298245614035, |
| "grad_norm": 0.5270217195515672, |
| "learning_rate": 6.118690199122133e-06, |
| "loss": 0.0169, |
| "mean_token_accuracy": 0.9944186210632324, |
| "step": 233 |
| }, |
| { |
| "epoch": 1.0263157894736843, |
| "grad_norm": 0.4996575132136591, |
| "learning_rate": 6.084518919929112e-06, |
| "loss": 0.0229, |
| "mean_token_accuracy": 0.9940828680992126, |
| "step": 234 |
| }, |
| { |
| "epoch": 1.030701754385965, |
| "grad_norm": 0.5730226858591222, |
| "learning_rate": 6.050313322221645e-06, |
| "loss": 0.0133, |
| "mean_token_accuracy": 0.9962228536605835, |
| "step": 235 |
| }, |
| { |
| "epoch": 1.0350877192982457, |
| "grad_norm": 0.9581304650383392, |
| "learning_rate": 6.016075414292804e-06, |
| "loss": 0.045, |
| "mean_token_accuracy": 0.9886578321456909, |
| "step": 236 |
| }, |
| { |
| "epoch": 1.0394736842105263, |
| "grad_norm": 0.3136176703992604, |
| "learning_rate": 5.981807206332674e-06, |
| "loss": 0.0096, |
| "mean_token_accuracy": 0.9961868524551392, |
| "step": 237 |
| }, |
| { |
| "epoch": 1.043859649122807, |
| "grad_norm": 0.45706642002705294, |
| "learning_rate": 5.947510710310332e-06, |
| "loss": 0.0129, |
| "mean_token_accuracy": 0.9944238066673279, |
| "step": 238 |
| }, |
| { |
| "epoch": 1.0482456140350878, |
| "grad_norm": 0.6072914080253883, |
| "learning_rate": 5.9131879398557125e-06, |
| "loss": 0.0178, |
| "mean_token_accuracy": 0.9964189529418945, |
| "step": 239 |
| }, |
| { |
| "epoch": 1.0526315789473684, |
| "grad_norm": 0.6771658494331031, |
| "learning_rate": 5.878840910141382e-06, |
| "loss": 0.0234, |
| "mean_token_accuracy": 0.9927536249160767, |
| "step": 240 |
| }, |
| { |
| "epoch": 1.0570175438596492, |
| "grad_norm": 0.32445288412115003, |
| "learning_rate": 5.844471637764232e-06, |
| "loss": 0.0128, |
| "mean_token_accuracy": 0.9971883893013, |
| "step": 241 |
| }, |
| { |
| "epoch": 1.0614035087719298, |
| "grad_norm": 0.48218150765611467, |
| "learning_rate": 5.810082140627069e-06, |
| "loss": 0.0136, |
| "mean_token_accuracy": 0.9963167309761047, |
| "step": 242 |
| }, |
| { |
| "epoch": 1.0657894736842106, |
| "grad_norm": 0.43433151465079756, |
| "learning_rate": 5.77567443782015e-06, |
| "loss": 0.0119, |
| "mean_token_accuracy": 0.9950347542762756, |
| "step": 243 |
| }, |
| { |
| "epoch": 1.0701754385964912, |
| "grad_norm": 0.6643588541223486, |
| "learning_rate": 5.7412505495026265e-06, |
| "loss": 0.0203, |
| "mean_token_accuracy": 0.9926131367683411, |
| "step": 244 |
| }, |
| { |
| "epoch": 1.0745614035087718, |
| "grad_norm": 0.5696929267601732, |
| "learning_rate": 5.70681249678394e-06, |
| "loss": 0.0238, |
| "mean_token_accuracy": 0.9919928908348083, |
| "step": 245 |
| }, |
| { |
| "epoch": 1.0789473684210527, |
| "grad_norm": 0.25484916931066137, |
| "learning_rate": 5.67236230160516e-06, |
| "loss": 0.0086, |
| "mean_token_accuracy": 0.9981042742729187, |
| "step": 246 |
| }, |
| { |
| "epoch": 1.0833333333333333, |
| "grad_norm": 0.7101457051113592, |
| "learning_rate": 5.63790198662027e-06, |
| "loss": 0.0234, |
| "mean_token_accuracy": 0.9931707382202148, |
| "step": 247 |
| }, |
| { |
| "epoch": 1.087719298245614, |
| "grad_norm": 0.648214372305923, |
| "learning_rate": 5.6034335750774086e-06, |
| "loss": 0.016, |
| "mean_token_accuracy": 0.9934456944465637, |
| "step": 248 |
| }, |
| { |
| "epoch": 1.0921052631578947, |
| "grad_norm": 0.5724766455404428, |
| "learning_rate": 5.568959090700085e-06, |
| "loss": 0.0118, |
| "mean_token_accuracy": 0.9961649179458618, |
| "step": 249 |
| }, |
| { |
| "epoch": 1.0964912280701755, |
| "grad_norm": 0.8253701949033893, |
| "learning_rate": 5.534480557568358e-06, |
| "loss": 0.0269, |
| "mean_token_accuracy": 0.989833652973175, |
| "step": 250 |
| }, |
| { |
| "epoch": 1.1008771929824561, |
| "grad_norm": 0.5040799834368035, |
| "learning_rate": 5.500000000000001e-06, |
| "loss": 0.0129, |
| "mean_token_accuracy": 0.9972652792930603, |
| "step": 251 |
| }, |
| { |
| "epoch": 1.1052631578947367, |
| "grad_norm": 0.459377873641933, |
| "learning_rate": 5.465519442431644e-06, |
| "loss": 0.0203, |
| "mean_token_accuracy": 0.993630588054657, |
| "step": 252 |
| }, |
| { |
| "epoch": 1.1096491228070176, |
| "grad_norm": 0.40702825872800735, |
| "learning_rate": 5.431040909299917e-06, |
| "loss": 0.0093, |
| "mean_token_accuracy": 0.9972602725028992, |
| "step": 253 |
| }, |
| { |
| "epoch": 1.1140350877192982, |
| "grad_norm": 0.46228223336937235, |
| "learning_rate": 5.3965664249225945e-06, |
| "loss": 0.0214, |
| "mean_token_accuracy": 0.9941324591636658, |
| "step": 254 |
| }, |
| { |
| "epoch": 1.118421052631579, |
| "grad_norm": 0.4411876525278596, |
| "learning_rate": 5.362098013379732e-06, |
| "loss": 0.0167, |
| "mean_token_accuracy": 0.9962894320487976, |
| "step": 255 |
| }, |
| { |
| "epoch": 1.1228070175438596, |
| "grad_norm": 0.7544855770102694, |
| "learning_rate": 5.327637698394842e-06, |
| "loss": 0.0263, |
| "mean_token_accuracy": 0.9925373196601868, |
| "step": 256 |
| }, |
| { |
| "epoch": 1.1271929824561404, |
| "grad_norm": 0.5840731893496927, |
| "learning_rate": 5.293187503216062e-06, |
| "loss": 0.0107, |
| "mean_token_accuracy": 0.9954545497894287, |
| "step": 257 |
| }, |
| { |
| "epoch": 1.131578947368421, |
| "grad_norm": 0.6414004448774665, |
| "learning_rate": 5.258749450497376e-06, |
| "loss": 0.0254, |
| "mean_token_accuracy": 0.9913544654846191, |
| "step": 258 |
| }, |
| { |
| "epoch": 1.1359649122807018, |
| "grad_norm": 0.4158255471134415, |
| "learning_rate": 5.224325562179852e-06, |
| "loss": 0.0159, |
| "mean_token_accuracy": 0.9964881539344788, |
| "step": 259 |
| }, |
| { |
| "epoch": 1.1403508771929824, |
| "grad_norm": 0.18478665803226857, |
| "learning_rate": 5.189917859372933e-06, |
| "loss": 0.0048, |
| "mean_token_accuracy": 1.0, |
| "step": 260 |
| }, |
| { |
| "epoch": 1.1447368421052633, |
| "grad_norm": 0.2666431930478866, |
| "learning_rate": 5.15552836223577e-06, |
| "loss": 0.0061, |
| "mean_token_accuracy": 0.9980695247650146, |
| "step": 261 |
| }, |
| { |
| "epoch": 1.1491228070175439, |
| "grad_norm": 0.7684837823554583, |
| "learning_rate": 5.121159089858619e-06, |
| "loss": 0.0348, |
| "mean_token_accuracy": 0.990205705165863, |
| "step": 262 |
| }, |
| { |
| "epoch": 1.1535087719298245, |
| "grad_norm": 0.6624627199940151, |
| "learning_rate": 5.08681206014429e-06, |
| "loss": 0.0133, |
| "mean_token_accuracy": 0.9964507818222046, |
| "step": 263 |
| }, |
| { |
| "epoch": 1.1578947368421053, |
| "grad_norm": 0.3398128691735022, |
| "learning_rate": 5.0524892896896685e-06, |
| "loss": 0.0075, |
| "mean_token_accuracy": 0.9981516003608704, |
| "step": 264 |
| }, |
| { |
| "epoch": 1.162280701754386, |
| "grad_norm": 0.5944097967599022, |
| "learning_rate": 5.0181927936673265e-06, |
| "loss": 0.0186, |
| "mean_token_accuracy": 0.9935244917869568, |
| "step": 265 |
| }, |
| { |
| "epoch": 1.1666666666666667, |
| "grad_norm": 0.7886022659384564, |
| "learning_rate": 4.983924585707199e-06, |
| "loss": 0.0201, |
| "mean_token_accuracy": 0.9925788640975952, |
| "step": 266 |
| }, |
| { |
| "epoch": 1.1710526315789473, |
| "grad_norm": 0.7115292914262498, |
| "learning_rate": 4.949686677778357e-06, |
| "loss": 0.0237, |
| "mean_token_accuracy": 0.9924812316894531, |
| "step": 267 |
| }, |
| { |
| "epoch": 1.1754385964912282, |
| "grad_norm": 0.6122761215358518, |
| "learning_rate": 4.915481080070887e-06, |
| "loss": 0.0201, |
| "mean_token_accuracy": 0.9907063245773315, |
| "step": 268 |
| }, |
| { |
| "epoch": 1.1798245614035088, |
| "grad_norm": 0.6709777220718868, |
| "learning_rate": 4.8813098008778685e-06, |
| "loss": 0.0237, |
| "mean_token_accuracy": 0.9934822916984558, |
| "step": 269 |
| }, |
| { |
| "epoch": 1.1842105263157894, |
| "grad_norm": 0.6986740509144801, |
| "learning_rate": 4.847174846477448e-06, |
| "loss": 0.0185, |
| "mean_token_accuracy": 0.9915730357170105, |
| "step": 270 |
| }, |
| { |
| "epoch": 1.1885964912280702, |
| "grad_norm": 0.3588944023181387, |
| "learning_rate": 4.813078221015065e-06, |
| "loss": 0.0078, |
| "mean_token_accuracy": 0.9962928891181946, |
| "step": 271 |
| }, |
| { |
| "epoch": 1.1929824561403508, |
| "grad_norm": 0.5636673419187608, |
| "learning_rate": 4.779021926385771e-06, |
| "loss": 0.0132, |
| "mean_token_accuracy": 0.994140625, |
| "step": 272 |
| }, |
| { |
| "epoch": 1.1973684210526316, |
| "grad_norm": 0.6289458998165135, |
| "learning_rate": 4.745007962116697e-06, |
| "loss": 0.0118, |
| "mean_token_accuracy": 0.9961277842521667, |
| "step": 273 |
| }, |
| { |
| "epoch": 1.2017543859649122, |
| "grad_norm": 0.7398248895377864, |
| "learning_rate": 4.711038325249655e-06, |
| "loss": 0.0173, |
| "mean_token_accuracy": 0.9910634756088257, |
| "step": 274 |
| }, |
| { |
| "epoch": 1.206140350877193, |
| "grad_norm": 0.6623117208186542, |
| "learning_rate": 4.677115010223895e-06, |
| "loss": 0.0202, |
| "mean_token_accuracy": 0.992707371711731, |
| "step": 275 |
| }, |
| { |
| "epoch": 1.2105263157894737, |
| "grad_norm": 0.6400411135674129, |
| "learning_rate": 4.6432400087589925e-06, |
| "loss": 0.016, |
| "mean_token_accuracy": 0.9927797913551331, |
| "step": 276 |
| }, |
| { |
| "epoch": 1.2149122807017543, |
| "grad_norm": 0.30557170779284765, |
| "learning_rate": 4.609415309737922e-06, |
| "loss": 0.0079, |
| "mean_token_accuracy": 0.9980879426002502, |
| "step": 277 |
| }, |
| { |
| "epoch": 1.219298245614035, |
| "grad_norm": 0.6777294570314061, |
| "learning_rate": 4.5756428990902765e-06, |
| "loss": 0.0178, |
| "mean_token_accuracy": 0.9942085146903992, |
| "step": 278 |
| }, |
| { |
| "epoch": 1.2236842105263157, |
| "grad_norm": 0.5415821904029386, |
| "learning_rate": 4.541924759675677e-06, |
| "loss": 0.0177, |
| "mean_token_accuracy": 0.9953574538230896, |
| "step": 279 |
| }, |
| { |
| "epoch": 1.2280701754385965, |
| "grad_norm": 0.7130606446070428, |
| "learning_rate": 4.508262871167347e-06, |
| "loss": 0.0229, |
| "mean_token_accuracy": 0.9914934039115906, |
| "step": 280 |
| }, |
| { |
| "epoch": 1.2324561403508771, |
| "grad_norm": 0.32701646159840236, |
| "learning_rate": 4.474659209935882e-06, |
| "loss": 0.0066, |
| "mean_token_accuracy": 0.9979487061500549, |
| "step": 281 |
| }, |
| { |
| "epoch": 1.236842105263158, |
| "grad_norm": 0.6693039444309381, |
| "learning_rate": 4.441115748933219e-06, |
| "loss": 0.0242, |
| "mean_token_accuracy": 0.995979905128479, |
| "step": 282 |
| }, |
| { |
| "epoch": 1.2412280701754386, |
| "grad_norm": 0.4096607438625313, |
| "learning_rate": 4.4076344575767895e-06, |
| "loss": 0.0101, |
| "mean_token_accuracy": 0.9961685538291931, |
| "step": 283 |
| }, |
| { |
| "epoch": 1.2456140350877192, |
| "grad_norm": 0.7845521868223057, |
| "learning_rate": 4.374217301633897e-06, |
| "loss": 0.0107, |
| "mean_token_accuracy": 0.9960591197013855, |
| "step": 284 |
| }, |
| { |
| "epoch": 1.25, |
| "grad_norm": 0.7841482813803873, |
| "learning_rate": 4.340866243106302e-06, |
| "loss": 0.0306, |
| "mean_token_accuracy": 0.9917279481887817, |
| "step": 285 |
| }, |
| { |
| "epoch": 1.2543859649122808, |
| "grad_norm": 0.5488307373930096, |
| "learning_rate": 4.307583240115024e-06, |
| "loss": 0.0219, |
| "mean_token_accuracy": 0.995417058467865, |
| "step": 286 |
| }, |
| { |
| "epoch": 1.2587719298245614, |
| "grad_norm": 0.3877661782228482, |
| "learning_rate": 4.274370246785379e-06, |
| "loss": 0.0089, |
| "mean_token_accuracy": 0.9960474371910095, |
| "step": 287 |
| }, |
| { |
| "epoch": 1.263157894736842, |
| "grad_norm": 0.6508169046635337, |
| "learning_rate": 4.241229213132245e-06, |
| "loss": 0.0103, |
| "mean_token_accuracy": 0.9951028227806091, |
| "step": 288 |
| }, |
| { |
| "epoch": 1.2675438596491229, |
| "grad_norm": 0.5599716503548583, |
| "learning_rate": 4.208162084945579e-06, |
| "loss": 0.021, |
| "mean_token_accuracy": 0.9927927851676941, |
| "step": 289 |
| }, |
| { |
| "epoch": 1.2719298245614035, |
| "grad_norm": 0.4318249642417072, |
| "learning_rate": 4.175170803676166e-06, |
| "loss": 0.0235, |
| "mean_token_accuracy": 0.9941747784614563, |
| "step": 290 |
| }, |
| { |
| "epoch": 1.2763157894736843, |
| "grad_norm": 1.0333406448865334, |
| "learning_rate": 4.142257306321635e-06, |
| "loss": 0.0124, |
| "mean_token_accuracy": 0.9951597452163696, |
| "step": 291 |
| }, |
| { |
| "epoch": 1.280701754385965, |
| "grad_norm": 0.7210585510735594, |
| "learning_rate": 4.109423525312738e-06, |
| "loss": 0.0263, |
| "mean_token_accuracy": 0.9915094375610352, |
| "step": 292 |
| }, |
| { |
| "epoch": 1.2850877192982457, |
| "grad_norm": 0.5173158790354765, |
| "learning_rate": 4.076671388399882e-06, |
| "loss": 0.0179, |
| "mean_token_accuracy": 0.9944081902503967, |
| "step": 293 |
| }, |
| { |
| "epoch": 1.2894736842105263, |
| "grad_norm": 0.8832118890157983, |
| "learning_rate": 4.044002818539959e-06, |
| "loss": 0.0246, |
| "mean_token_accuracy": 0.9924599528312683, |
| "step": 294 |
| }, |
| { |
| "epoch": 1.293859649122807, |
| "grad_norm": 0.5907877656509859, |
| "learning_rate": 4.011419733783436e-06, |
| "loss": 0.0187, |
| "mean_token_accuracy": 0.9934395551681519, |
| "step": 295 |
| }, |
| { |
| "epoch": 1.2982456140350878, |
| "grad_norm": 0.6982528194787686, |
| "learning_rate": 3.978924047161738e-06, |
| "loss": 0.0166, |
| "mean_token_accuracy": 0.9952696561813354, |
| "step": 296 |
| }, |
| { |
| "epoch": 1.3026315789473684, |
| "grad_norm": 0.4582539295427508, |
| "learning_rate": 3.946517666574944e-06, |
| "loss": 0.0174, |
| "mean_token_accuracy": 0.9960707426071167, |
| "step": 297 |
| }, |
| { |
| "epoch": 1.3070175438596492, |
| "grad_norm": 0.6545454504118773, |
| "learning_rate": 3.914202494679753e-06, |
| "loss": 0.0206, |
| "mean_token_accuracy": 0.9941973090171814, |
| "step": 298 |
| }, |
| { |
| "epoch": 1.3114035087719298, |
| "grad_norm": 0.4866923986222573, |
| "learning_rate": 3.8819804287777855e-06, |
| "loss": 0.0156, |
| "mean_token_accuracy": 0.9936651587486267, |
| "step": 299 |
| }, |
| { |
| "epoch": 1.3157894736842106, |
| "grad_norm": 0.4951798307541836, |
| "learning_rate": 3.849853360704185e-06, |
| "loss": 0.0094, |
| "mean_token_accuracy": 0.9953183531761169, |
| "step": 300 |
| }, |
| { |
| "epoch": 1.3201754385964912, |
| "grad_norm": 0.7048315801989936, |
| "learning_rate": 3.817823176716541e-06, |
| "loss": 0.0194, |
| "mean_token_accuracy": 0.9947460889816284, |
| "step": 301 |
| }, |
| { |
| "epoch": 1.3245614035087718, |
| "grad_norm": 0.40086208568003023, |
| "learning_rate": 3.785891757384148e-06, |
| "loss": 0.0137, |
| "mean_token_accuracy": 0.9954586625099182, |
| "step": 302 |
| }, |
| { |
| "epoch": 1.3289473684210527, |
| "grad_norm": 0.8771840219521843, |
| "learning_rate": 3.7540609774775872e-06, |
| "loss": 0.0199, |
| "mean_token_accuracy": 0.9922555685043335, |
| "step": 303 |
| }, |
| { |
| "epoch": 1.3333333333333333, |
| "grad_norm": 0.622615960469209, |
| "learning_rate": 3.7223327058586566e-06, |
| "loss": 0.0231, |
| "mean_token_accuracy": 0.9940770268440247, |
| "step": 304 |
| }, |
| { |
| "epoch": 1.337719298245614, |
| "grad_norm": 0.5441892845955472, |
| "learning_rate": 3.6907088053706486e-06, |
| "loss": 0.0168, |
| "mean_token_accuracy": 0.9918991923332214, |
| "step": 305 |
| }, |
| { |
| "epoch": 1.3421052631578947, |
| "grad_norm": 0.29321683329824944, |
| "learning_rate": 3.659191132728971e-06, |
| "loss": 0.009, |
| "mean_token_accuracy": 0.998039186000824, |
| "step": 306 |
| }, |
| { |
| "epoch": 1.3464912280701755, |
| "grad_norm": 0.24532683318309317, |
| "learning_rate": 3.6277815384121408e-06, |
| "loss": 0.0052, |
| "mean_token_accuracy": 0.9981867671012878, |
| "step": 307 |
| }, |
| { |
| "epoch": 1.3508771929824561, |
| "grad_norm": 0.4808197975505922, |
| "learning_rate": 3.5964818665531365e-06, |
| "loss": 0.0141, |
| "mean_token_accuracy": 0.9946091771125793, |
| "step": 308 |
| }, |
| { |
| "epoch": 1.3552631578947367, |
| "grad_norm": 0.37774311490805496, |
| "learning_rate": 3.5652939548311217e-06, |
| "loss": 0.0116, |
| "mean_token_accuracy": 0.9947961568832397, |
| "step": 309 |
| }, |
| { |
| "epoch": 1.3596491228070176, |
| "grad_norm": 0.4940114670959615, |
| "learning_rate": 3.534219634363557e-06, |
| "loss": 0.0144, |
| "mean_token_accuracy": 0.99623703956604, |
| "step": 310 |
| }, |
| { |
| "epoch": 1.3640350877192984, |
| "grad_norm": 0.5518926837640857, |
| "learning_rate": 3.503260729598681e-06, |
| "loss": 0.023, |
| "mean_token_accuracy": 0.9912959337234497, |
| "step": 311 |
| }, |
| { |
| "epoch": 1.368421052631579, |
| "grad_norm": 0.8090679564523366, |
| "learning_rate": 3.4724190582084073e-06, |
| "loss": 0.0323, |
| "mean_token_accuracy": 0.9867172837257385, |
| "step": 312 |
| }, |
| { |
| "epoch": 1.3728070175438596, |
| "grad_norm": 0.3065894751037074, |
| "learning_rate": 3.441696430981585e-06, |
| "loss": 0.0103, |
| "mean_token_accuracy": 0.9972875118255615, |
| "step": 313 |
| }, |
| { |
| "epoch": 1.3771929824561404, |
| "grad_norm": 0.8231322871128265, |
| "learning_rate": 3.4110946517176995e-06, |
| "loss": 0.0216, |
| "mean_token_accuracy": 0.9934148788452148, |
| "step": 314 |
| }, |
| { |
| "epoch": 1.381578947368421, |
| "grad_norm": 0.8555278348494451, |
| "learning_rate": 3.3806155171209632e-06, |
| "loss": 0.0266, |
| "mean_token_accuracy": 0.9924882650375366, |
| "step": 315 |
| }, |
| { |
| "epoch": 1.3859649122807016, |
| "grad_norm": 0.6560521157167256, |
| "learning_rate": 3.3502608166948166e-06, |
| "loss": 0.0271, |
| "mean_token_accuracy": 0.9927206635475159, |
| "step": 316 |
| }, |
| { |
| "epoch": 1.3903508771929824, |
| "grad_norm": 0.26121211234912833, |
| "learning_rate": 3.320032332636879e-06, |
| "loss": 0.0084, |
| "mean_token_accuracy": 0.9980952143669128, |
| "step": 317 |
| }, |
| { |
| "epoch": 1.3947368421052633, |
| "grad_norm": 0.2837900051915755, |
| "learning_rate": 3.2899318397342954e-06, |
| "loss": 0.006, |
| "mean_token_accuracy": 0.9990653991699219, |
| "step": 318 |
| }, |
| { |
| "epoch": 1.3991228070175439, |
| "grad_norm": 0.6256688623570252, |
| "learning_rate": 3.2599611052595474e-06, |
| "loss": 0.0091, |
| "mean_token_accuracy": 0.9981883764266968, |
| "step": 319 |
| }, |
| { |
| "epoch": 1.4035087719298245, |
| "grad_norm": 0.36276300816015317, |
| "learning_rate": 3.2301218888666807e-06, |
| "loss": 0.0107, |
| "mean_token_accuracy": 0.9962335228919983, |
| "step": 320 |
| }, |
| { |
| "epoch": 1.4078947368421053, |
| "grad_norm": 0.5006047512208177, |
| "learning_rate": 3.200415942488003e-06, |
| "loss": 0.0137, |
| "mean_token_accuracy": 0.9942857027053833, |
| "step": 321 |
| }, |
| { |
| "epoch": 1.412280701754386, |
| "grad_norm": 0.725945111661614, |
| "learning_rate": 3.170845010231216e-06, |
| "loss": 0.0095, |
| "mean_token_accuracy": 0.9972375631332397, |
| "step": 322 |
| }, |
| { |
| "epoch": 1.4166666666666667, |
| "grad_norm": 0.45609016947704295, |
| "learning_rate": 3.141410828277015e-06, |
| "loss": 0.0102, |
| "mean_token_accuracy": 0.9961977005004883, |
| "step": 323 |
| }, |
| { |
| "epoch": 1.4210526315789473, |
| "grad_norm": 0.3198956997347105, |
| "learning_rate": 3.1121151247771595e-06, |
| "loss": 0.0097, |
| "mean_token_accuracy": 0.9972375631332397, |
| "step": 324 |
| }, |
| { |
| "epoch": 1.4254385964912282, |
| "grad_norm": 0.9264679650548988, |
| "learning_rate": 3.082959619753001e-06, |
| "loss": 0.0273, |
| "mean_token_accuracy": 0.9910873174667358, |
| "step": 325 |
| }, |
| { |
| "epoch": 1.4298245614035088, |
| "grad_norm": 0.49728527546862633, |
| "learning_rate": 3.053946024994506e-06, |
| "loss": 0.0234, |
| "mean_token_accuracy": 0.9936651587486267, |
| "step": 326 |
| }, |
| { |
| "epoch": 1.4342105263157894, |
| "grad_norm": 0.6314728887589547, |
| "learning_rate": 3.025076043959739e-06, |
| "loss": 0.0352, |
| "mean_token_accuracy": 0.9896907210350037, |
| "step": 327 |
| }, |
| { |
| "epoch": 1.4385964912280702, |
| "grad_norm": 0.3933091506390692, |
| "learning_rate": 2.9963513716748656e-06, |
| "loss": 0.0076, |
| "mean_token_accuracy": 0.9970530271530151, |
| "step": 328 |
| }, |
| { |
| "epoch": 1.4429824561403508, |
| "grad_norm": 0.3664404826042077, |
| "learning_rate": 2.96777369463462e-06, |
| "loss": 0.0129, |
| "mean_token_accuracy": 0.9954954981803894, |
| "step": 329 |
| }, |
| { |
| "epoch": 1.4473684210526316, |
| "grad_norm": 0.5379329921409455, |
| "learning_rate": 2.9393446907032886e-06, |
| "loss": 0.0186, |
| "mean_token_accuracy": 0.9926267266273499, |
| "step": 330 |
| }, |
| { |
| "epoch": 1.4517543859649122, |
| "grad_norm": 0.8780996501931602, |
| "learning_rate": 2.911066029016208e-06, |
| "loss": 0.0158, |
| "mean_token_accuracy": 0.9956101775169373, |
| "step": 331 |
| }, |
| { |
| "epoch": 1.456140350877193, |
| "grad_norm": 0.49371246943648667, |
| "learning_rate": 2.8829393698817566e-06, |
| "loss": 0.009, |
| "mean_token_accuracy": 0.9972248077392578, |
| "step": 332 |
| }, |
| { |
| "epoch": 1.4605263157894737, |
| "grad_norm": 0.4423810999976139, |
| "learning_rate": 2.854966364683872e-06, |
| "loss": 0.0142, |
| "mean_token_accuracy": 0.9964125752449036, |
| "step": 333 |
| }, |
| { |
| "epoch": 1.4649122807017543, |
| "grad_norm": 0.9132158500482946, |
| "learning_rate": 2.827148655785107e-06, |
| "loss": 0.0259, |
| "mean_token_accuracy": 0.9909665584564209, |
| "step": 334 |
| }, |
| { |
| "epoch": 1.469298245614035, |
| "grad_norm": 1.6119293927233194, |
| "learning_rate": 2.7994878764301857e-06, |
| "loss": 0.0226, |
| "mean_token_accuracy": 0.9924599528312683, |
| "step": 335 |
| }, |
| { |
| "epoch": 1.4736842105263157, |
| "grad_norm": 0.5274591538518822, |
| "learning_rate": 2.771985650650131e-06, |
| "loss": 0.0085, |
| "mean_token_accuracy": 0.9970986247062683, |
| "step": 336 |
| }, |
| { |
| "epoch": 1.4780701754385965, |
| "grad_norm": 0.605748603282285, |
| "learning_rate": 2.7446435931668913e-06, |
| "loss": 0.0205, |
| "mean_token_accuracy": 0.9942362904548645, |
| "step": 337 |
| }, |
| { |
| "epoch": 1.4824561403508771, |
| "grad_norm": 0.7075623867905496, |
| "learning_rate": 2.717463309298557e-06, |
| "loss": 0.0235, |
| "mean_token_accuracy": 0.9916666746139526, |
| "step": 338 |
| }, |
| { |
| "epoch": 1.486842105263158, |
| "grad_norm": 0.28166596804565136, |
| "learning_rate": 2.6904463948650994e-06, |
| "loss": 0.0055, |
| "mean_token_accuracy": 0.999035656452179, |
| "step": 339 |
| }, |
| { |
| "epoch": 1.4912280701754386, |
| "grad_norm": 0.33805944213251504, |
| "learning_rate": 2.663594436094669e-06, |
| "loss": 0.0107, |
| "mean_token_accuracy": 0.9962013363838196, |
| "step": 340 |
| }, |
| { |
| "epoch": 1.4956140350877192, |
| "grad_norm": 0.5994224387229937, |
| "learning_rate": 2.6369090095304824e-06, |
| "loss": 0.0125, |
| "mean_token_accuracy": 0.9953746795654297, |
| "step": 341 |
| }, |
| { |
| "epoch": 1.5, |
| "grad_norm": 0.5233983565235873, |
| "learning_rate": 2.610391681938239e-06, |
| "loss": 0.0164, |
| "mean_token_accuracy": 0.9934701323509216, |
| "step": 342 |
| }, |
| { |
| "epoch": 1.5043859649122808, |
| "grad_norm": 1.1315809472617842, |
| "learning_rate": 2.5840440102141506e-06, |
| "loss": 0.0202, |
| "mean_token_accuracy": 0.9942802786827087, |
| "step": 343 |
| }, |
| { |
| "epoch": 1.5087719298245614, |
| "grad_norm": 0.5517316206836442, |
| "learning_rate": 2.5578675412935172e-06, |
| "loss": 0.0161, |
| "mean_token_accuracy": 0.994140625, |
| "step": 344 |
| }, |
| { |
| "epoch": 1.513157894736842, |
| "grad_norm": 0.7683517902888619, |
| "learning_rate": 2.531863812059916e-06, |
| "loss": 0.0257, |
| "mean_token_accuracy": 0.9907407164573669, |
| "step": 345 |
| }, |
| { |
| "epoch": 1.5175438596491229, |
| "grad_norm": 0.4653946397853578, |
| "learning_rate": 2.5060343492549567e-06, |
| "loss": 0.017, |
| "mean_token_accuracy": 0.9947961568832397, |
| "step": 346 |
| }, |
| { |
| "epoch": 1.5219298245614035, |
| "grad_norm": 0.30618577904649025, |
| "learning_rate": 2.480380669388648e-06, |
| "loss": 0.0083, |
| "mean_token_accuracy": 0.9983079433441162, |
| "step": 347 |
| }, |
| { |
| "epoch": 1.526315789473684, |
| "grad_norm": 0.5356782742501742, |
| "learning_rate": 2.45490427865036e-06, |
| "loss": 0.017, |
| "mean_token_accuracy": 0.9943556189537048, |
| "step": 348 |
| }, |
| { |
| "epoch": 1.530701754385965, |
| "grad_norm": 0.225639418877882, |
| "learning_rate": 2.429606672820387e-06, |
| "loss": 0.0062, |
| "mean_token_accuracy": 0.9981900453567505, |
| "step": 349 |
| }, |
| { |
| "epoch": 1.5350877192982457, |
| "grad_norm": 0.5597419284796115, |
| "learning_rate": 2.4044893371821373e-06, |
| "loss": 0.0181, |
| "mean_token_accuracy": 0.9933962225914001, |
| "step": 350 |
| }, |
| { |
| "epoch": 1.5394736842105263, |
| "grad_norm": 0.4295764753737047, |
| "learning_rate": 2.379553746434913e-06, |
| "loss": 0.0199, |
| "mean_token_accuracy": 0.9962859749794006, |
| "step": 351 |
| }, |
| { |
| "epoch": 1.543859649122807, |
| "grad_norm": 0.6197050955725708, |
| "learning_rate": 2.3548013646073427e-06, |
| "loss": 0.0218, |
| "mean_token_accuracy": 0.9951876997947693, |
| "step": 352 |
| }, |
| { |
| "epoch": 1.5482456140350878, |
| "grad_norm": 0.45999201575473875, |
| "learning_rate": 2.3302336449714166e-06, |
| "loss": 0.0154, |
| "mean_token_accuracy": 0.9934579730033875, |
| "step": 353 |
| }, |
| { |
| "epoch": 1.5526315789473686, |
| "grad_norm": 0.4929601659829713, |
| "learning_rate": 2.305852029957159e-06, |
| "loss": 0.0171, |
| "mean_token_accuracy": 0.9949545860290527, |
| "step": 354 |
| }, |
| { |
| "epoch": 1.557017543859649, |
| "grad_norm": 0.4522203350950366, |
| "learning_rate": 2.281657951067948e-06, |
| "loss": 0.0093, |
| "mean_token_accuracy": 0.9972602725028992, |
| "step": 355 |
| }, |
| { |
| "epoch": 1.5614035087719298, |
| "grad_norm": 0.5505306502642867, |
| "learning_rate": 2.257652828796459e-06, |
| "loss": 0.0166, |
| "mean_token_accuracy": 0.9952107071876526, |
| "step": 356 |
| }, |
| { |
| "epoch": 1.5657894736842106, |
| "grad_norm": 0.49496442265468876, |
| "learning_rate": 2.233838072541273e-06, |
| "loss": 0.0213, |
| "mean_token_accuracy": 0.9945105314254761, |
| "step": 357 |
| }, |
| { |
| "epoch": 1.5701754385964912, |
| "grad_norm": 0.28922810086730755, |
| "learning_rate": 2.2102150805241233e-06, |
| "loss": 0.0113, |
| "mean_token_accuracy": 0.9981290698051453, |
| "step": 358 |
| }, |
| { |
| "epoch": 1.5745614035087718, |
| "grad_norm": 0.2925026090864194, |
| "learning_rate": 2.186785239707799e-06, |
| "loss": 0.0074, |
| "mean_token_accuracy": 0.9962157011032104, |
| "step": 359 |
| }, |
| { |
| "epoch": 1.5789473684210527, |
| "grad_norm": 0.5586576399302614, |
| "learning_rate": 2.163549925714721e-06, |
| "loss": 0.0208, |
| "mean_token_accuracy": 0.9933775067329407, |
| "step": 360 |
| }, |
| { |
| "epoch": 1.5833333333333335, |
| "grad_norm": 0.4175057313306584, |
| "learning_rate": 2.140510502746168e-06, |
| "loss": 0.0115, |
| "mean_token_accuracy": 0.9951643943786621, |
| "step": 361 |
| }, |
| { |
| "epoch": 1.587719298245614, |
| "grad_norm": 0.542862948574824, |
| "learning_rate": 2.1176683235021885e-06, |
| "loss": 0.0209, |
| "mean_token_accuracy": 0.9954832792282104, |
| "step": 362 |
| }, |
| { |
| "epoch": 1.5921052631578947, |
| "grad_norm": 0.2985027096957698, |
| "learning_rate": 2.0950247291021713e-06, |
| "loss": 0.0058, |
| "mean_token_accuracy": 0.9972476959228516, |
| "step": 363 |
| }, |
| { |
| "epoch": 1.5964912280701755, |
| "grad_norm": 0.5796895006295526, |
| "learning_rate": 2.0725810490061156e-06, |
| "loss": 0.0172, |
| "mean_token_accuracy": 0.9945651888847351, |
| "step": 364 |
| }, |
| { |
| "epoch": 1.6008771929824561, |
| "grad_norm": 0.35351291362110343, |
| "learning_rate": 2.0503386009365685e-06, |
| "loss": 0.0138, |
| "mean_token_accuracy": 0.9954462647438049, |
| "step": 365 |
| }, |
| { |
| "epoch": 1.6052631578947367, |
| "grad_norm": 0.6443815145624668, |
| "learning_rate": 2.028298690801257e-06, |
| "loss": 0.028, |
| "mean_token_accuracy": 0.9928250908851624, |
| "step": 366 |
| }, |
| { |
| "epoch": 1.6096491228070176, |
| "grad_norm": 0.4538736187308062, |
| "learning_rate": 2.006462612616422e-06, |
| "loss": 0.0151, |
| "mean_token_accuracy": 0.9925719499588013, |
| "step": 367 |
| }, |
| { |
| "epoch": 1.6140350877192984, |
| "grad_norm": 0.5476787939926033, |
| "learning_rate": 1.984831648430836e-06, |
| "loss": 0.0232, |
| "mean_token_accuracy": 0.9917808175086975, |
| "step": 368 |
| }, |
| { |
| "epoch": 1.618421052631579, |
| "grad_norm": 0.3245890489294466, |
| "learning_rate": 1.963407068250538e-06, |
| "loss": 0.0092, |
| "mean_token_accuracy": 0.9971209168434143, |
| "step": 369 |
| }, |
| { |
| "epoch": 1.6228070175438596, |
| "grad_norm": 0.578747539147958, |
| "learning_rate": 1.9421901299642597e-06, |
| "loss": 0.0203, |
| "mean_token_accuracy": 0.9970119595527649, |
| "step": 370 |
| }, |
| { |
| "epoch": 1.6271929824561404, |
| "grad_norm": 0.4173581930638192, |
| "learning_rate": 1.9211820792695808e-06, |
| "loss": 0.0114, |
| "mean_token_accuracy": 0.9971618056297302, |
| "step": 371 |
| }, |
| { |
| "epoch": 1.631578947368421, |
| "grad_norm": 0.7267456190502942, |
| "learning_rate": 1.900384149599787e-06, |
| "loss": 0.0223, |
| "mean_token_accuracy": 0.9934518337249756, |
| "step": 372 |
| }, |
| { |
| "epoch": 1.6359649122807016, |
| "grad_norm": 0.38589883521358226, |
| "learning_rate": 1.8797975620514497e-06, |
| "loss": 0.0175, |
| "mean_token_accuracy": 0.9953007698059082, |
| "step": 373 |
| }, |
| { |
| "epoch": 1.6403508771929824, |
| "grad_norm": 0.40035132787035105, |
| "learning_rate": 1.8594235253127373e-06, |
| "loss": 0.0137, |
| "mean_token_accuracy": 0.994434118270874, |
| "step": 374 |
| }, |
| { |
| "epoch": 1.6447368421052633, |
| "grad_norm": 0.5734469679745319, |
| "learning_rate": 1.8392632355924454e-06, |
| "loss": 0.0159, |
| "mean_token_accuracy": 0.9916589260101318, |
| "step": 375 |
| }, |
| { |
| "epoch": 1.6491228070175439, |
| "grad_norm": 0.6383721891614731, |
| "learning_rate": 1.819317876549771e-06, |
| "loss": 0.0151, |
| "mean_token_accuracy": 0.9907321333885193, |
| "step": 376 |
| }, |
| { |
| "epoch": 1.6535087719298245, |
| "grad_norm": 0.43622588490055814, |
| "learning_rate": 1.7995886192248091e-06, |
| "loss": 0.0155, |
| "mean_token_accuracy": 0.9936363697052002, |
| "step": 377 |
| }, |
| { |
| "epoch": 1.6578947368421053, |
| "grad_norm": 0.3258458186053117, |
| "learning_rate": 1.7800766219698033e-06, |
| "loss": 0.0119, |
| "mean_token_accuracy": 0.9991079568862915, |
| "step": 378 |
| }, |
| { |
| "epoch": 1.662280701754386, |
| "grad_norm": 0.5063266761884702, |
| "learning_rate": 1.760783030381138e-06, |
| "loss": 0.0109, |
| "mean_token_accuracy": 0.9925719499588013, |
| "step": 379 |
| }, |
| { |
| "epoch": 1.6666666666666665, |
| "grad_norm": 0.5575160996667514, |
| "learning_rate": 1.74170897723207e-06, |
| "loss": 0.0167, |
| "mean_token_accuracy": 0.996333658695221, |
| "step": 380 |
| }, |
| { |
| "epoch": 1.6710526315789473, |
| "grad_norm": 0.6211241604322575, |
| "learning_rate": 1.7228555824062254e-06, |
| "loss": 0.0267, |
| "mean_token_accuracy": 0.9909583926200867, |
| "step": 381 |
| }, |
| { |
| "epoch": 1.6754385964912282, |
| "grad_norm": 0.714618690583619, |
| "learning_rate": 1.7042239528318539e-06, |
| "loss": 0.0203, |
| "mean_token_accuracy": 0.9926537871360779, |
| "step": 382 |
| }, |
| { |
| "epoch": 1.6798245614035088, |
| "grad_norm": 0.2669222225952594, |
| "learning_rate": 1.6858151824168254e-06, |
| "loss": 0.0119, |
| "mean_token_accuracy": 0.9990466833114624, |
| "step": 383 |
| }, |
| { |
| "epoch": 1.6842105263157894, |
| "grad_norm": 0.29847381639325926, |
| "learning_rate": 1.6676303519844179e-06, |
| "loss": 0.006, |
| "mean_token_accuracy": 0.9990671873092651, |
| "step": 384 |
| }, |
| { |
| "epoch": 1.6885964912280702, |
| "grad_norm": 0.36428171744835536, |
| "learning_rate": 1.649670529209848e-06, |
| "loss": 0.0124, |
| "mean_token_accuracy": 0.9961165189743042, |
| "step": 385 |
| }, |
| { |
| "epoch": 1.692982456140351, |
| "grad_norm": 0.489211354298264, |
| "learning_rate": 1.631936768557596e-06, |
| "loss": 0.0189, |
| "mean_token_accuracy": 0.9953007698059082, |
| "step": 386 |
| }, |
| { |
| "epoch": 1.6973684210526314, |
| "grad_norm": 0.46355789334396114, |
| "learning_rate": 1.6144301112194843e-06, |
| "loss": 0.0125, |
| "mean_token_accuracy": 0.994584858417511, |
| "step": 387 |
| }, |
| { |
| "epoch": 1.7017543859649122, |
| "grad_norm": 0.31243439716929977, |
| "learning_rate": 1.5971515850535568e-06, |
| "loss": 0.0107, |
| "mean_token_accuracy": 0.9961758852005005, |
| "step": 388 |
| }, |
| { |
| "epoch": 1.706140350877193, |
| "grad_norm": 0.5176572531228848, |
| "learning_rate": 1.5801022045237252e-06, |
| "loss": 0.0117, |
| "mean_token_accuracy": 0.9981150031089783, |
| "step": 389 |
| }, |
| { |
| "epoch": 1.7105263157894737, |
| "grad_norm": 0.37327685375323494, |
| "learning_rate": 1.5632829706402076e-06, |
| "loss": 0.0126, |
| "mean_token_accuracy": 0.9952696561813354, |
| "step": 390 |
| }, |
| { |
| "epoch": 1.7149122807017543, |
| "grad_norm": 0.26160936430658405, |
| "learning_rate": 1.5466948709007604e-06, |
| "loss": 0.0069, |
| "mean_token_accuracy": 0.9971671104431152, |
| "step": 391 |
| }, |
| { |
| "epoch": 1.719298245614035, |
| "grad_norm": 0.35241727464587724, |
| "learning_rate": 1.5303388792326934e-06, |
| "loss": 0.0075, |
| "mean_token_accuracy": 0.9981532692909241, |
| "step": 392 |
| }, |
| { |
| "epoch": 1.723684210526316, |
| "grad_norm": 0.5556914854139079, |
| "learning_rate": 1.5142159559356961e-06, |
| "loss": 0.0193, |
| "mean_token_accuracy": 0.9950932264328003, |
| "step": 393 |
| }, |
| { |
| "epoch": 1.7280701754385965, |
| "grad_norm": 0.8639947324185812, |
| "learning_rate": 1.4983270476254503e-06, |
| "loss": 0.032, |
| "mean_token_accuracy": 0.9915730357170105, |
| "step": 394 |
| }, |
| { |
| "epoch": 1.7324561403508771, |
| "grad_norm": 0.31336779217522975, |
| "learning_rate": 1.4826730871780534e-06, |
| "loss": 0.0072, |
| "mean_token_accuracy": 0.998123824596405, |
| "step": 395 |
| }, |
| { |
| "epoch": 1.736842105263158, |
| "grad_norm": 1.5330194630862561, |
| "learning_rate": 1.4672549936752507e-06, |
| "loss": 0.012, |
| "mean_token_accuracy": 0.9962049126625061, |
| "step": 396 |
| }, |
| { |
| "epoch": 1.7412280701754386, |
| "grad_norm": 0.5912672902683355, |
| "learning_rate": 1.4520736723504658e-06, |
| "loss": 0.0242, |
| "mean_token_accuracy": 0.9886040091514587, |
| "step": 397 |
| }, |
| { |
| "epoch": 1.7456140350877192, |
| "grad_norm": 0.5369897437242712, |
| "learning_rate": 1.437130014535662e-06, |
| "loss": 0.0118, |
| "mean_token_accuracy": 0.9961941242218018, |
| "step": 398 |
| }, |
| { |
| "epoch": 1.75, |
| "grad_norm": 0.6390109242572387, |
| "learning_rate": 1.4224248976090016e-06, |
| "loss": 0.013, |
| "mean_token_accuracy": 0.9962335228919983, |
| "step": 399 |
| }, |
| { |
| "epoch": 1.7543859649122808, |
| "grad_norm": 0.5112711325187154, |
| "learning_rate": 1.4079591849433383e-06, |
| "loss": 0.0159, |
| "mean_token_accuracy": 0.9962581992149353, |
| "step": 400 |
| }, |
| { |
| "epoch": 1.7587719298245614, |
| "grad_norm": 0.47387031096608817, |
| "learning_rate": 1.3937337258555252e-06, |
| "loss": 0.0094, |
| "mean_token_accuracy": 0.9941747784614563, |
| "step": 401 |
| }, |
| { |
| "epoch": 1.763157894736842, |
| "grad_norm": 0.43213096410090324, |
| "learning_rate": 1.379749355556547e-06, |
| "loss": 0.0134, |
| "mean_token_accuracy": 0.9954627752304077, |
| "step": 402 |
| }, |
| { |
| "epoch": 1.7675438596491229, |
| "grad_norm": 0.43937840590330046, |
| "learning_rate": 1.3660068951024857e-06, |
| "loss": 0.0135, |
| "mean_token_accuracy": 0.9980787634849548, |
| "step": 403 |
| }, |
| { |
| "epoch": 1.7719298245614035, |
| "grad_norm": 0.5245888096165663, |
| "learning_rate": 1.3525071513463128e-06, |
| "loss": 0.0181, |
| "mean_token_accuracy": 0.9941349029541016, |
| "step": 404 |
| }, |
| { |
| "epoch": 1.776315789473684, |
| "grad_norm": 0.4657997466315775, |
| "learning_rate": 1.339250916890519e-06, |
| "loss": 0.0169, |
| "mean_token_accuracy": 0.9923954606056213, |
| "step": 405 |
| }, |
| { |
| "epoch": 1.780701754385965, |
| "grad_norm": 0.6524917431466828, |
| "learning_rate": 1.3262389700405746e-06, |
| "loss": 0.0134, |
| "mean_token_accuracy": 0.996370255947113, |
| "step": 406 |
| }, |
| { |
| "epoch": 1.7850877192982457, |
| "grad_norm": 0.41484319767884376, |
| "learning_rate": 1.3134720747592373e-06, |
| "loss": 0.0186, |
| "mean_token_accuracy": 0.99434494972229, |
| "step": 407 |
| }, |
| { |
| "epoch": 1.7894736842105263, |
| "grad_norm": 0.3574688039673247, |
| "learning_rate": 1.3009509806216986e-06, |
| "loss": 0.0078, |
| "mean_token_accuracy": 0.9971428513526917, |
| "step": 408 |
| }, |
| { |
| "epoch": 1.793859649122807, |
| "grad_norm": 0.5517641237252472, |
| "learning_rate": 1.2886764227715679e-06, |
| "loss": 0.017, |
| "mean_token_accuracy": 0.9924026727676392, |
| "step": 409 |
| }, |
| { |
| "epoch": 1.7982456140350878, |
| "grad_norm": 0.6210804604514915, |
| "learning_rate": 1.2766491218777197e-06, |
| "loss": 0.0177, |
| "mean_token_accuracy": 0.9936651587486267, |
| "step": 410 |
| }, |
| { |
| "epoch": 1.8026315789473686, |
| "grad_norm": 2.4294857443530464, |
| "learning_rate": 1.2648697840919732e-06, |
| "loss": 0.0125, |
| "mean_token_accuracy": 0.9951737523078918, |
| "step": 411 |
| }, |
| { |
| "epoch": 1.807017543859649, |
| "grad_norm": 0.3722752940922616, |
| "learning_rate": 1.2533391010076381e-06, |
| "loss": 0.0122, |
| "mean_token_accuracy": 0.9944801926612854, |
| "step": 412 |
| }, |
| { |
| "epoch": 1.8114035087719298, |
| "grad_norm": 0.8476964704888881, |
| "learning_rate": 1.2420577496189063e-06, |
| "loss": 0.0353, |
| "mean_token_accuracy": 0.9891989231109619, |
| "step": 413 |
| }, |
| { |
| "epoch": 1.8157894736842106, |
| "grad_norm": 0.6562461456351483, |
| "learning_rate": 1.2310263922811048e-06, |
| "loss": 0.0211, |
| "mean_token_accuracy": 0.9942583441734314, |
| "step": 414 |
| }, |
| { |
| "epoch": 1.8201754385964912, |
| "grad_norm": 0.5477711599928562, |
| "learning_rate": 1.2202456766718092e-06, |
| "loss": 0.0107, |
| "mean_token_accuracy": 0.997029721736908, |
| "step": 415 |
| }, |
| { |
| "epoch": 1.8245614035087718, |
| "grad_norm": 0.5604958443191792, |
| "learning_rate": 1.2097162357528128e-06, |
| "loss": 0.0146, |
| "mean_token_accuracy": 0.994434118270874, |
| "step": 416 |
| }, |
| { |
| "epoch": 1.8289473684210527, |
| "grad_norm": 0.4382700133398788, |
| "learning_rate": 1.1994386877329678e-06, |
| "loss": 0.0171, |
| "mean_token_accuracy": 0.9952963590621948, |
| "step": 417 |
| }, |
| { |
| "epoch": 1.8333333333333335, |
| "grad_norm": 0.6397393090109242, |
| "learning_rate": 1.189413636031886e-06, |
| "loss": 0.0144, |
| "mean_token_accuracy": 0.9972923994064331, |
| "step": 418 |
| }, |
| { |
| "epoch": 1.837719298245614, |
| "grad_norm": 0.6606608620943953, |
| "learning_rate": 1.179641669244514e-06, |
| "loss": 0.0304, |
| "mean_token_accuracy": 0.9915174245834351, |
| "step": 419 |
| }, |
| { |
| "epoch": 1.8421052631578947, |
| "grad_norm": 0.4095919407160312, |
| "learning_rate": 1.1701233611065705e-06, |
| "loss": 0.0098, |
| "mean_token_accuracy": 0.9973404407501221, |
| "step": 420 |
| }, |
| { |
| "epoch": 1.8464912280701755, |
| "grad_norm": 0.3529222243911339, |
| "learning_rate": 1.1608592704608656e-06, |
| "loss": 0.0045, |
| "mean_token_accuracy": 0.9980545043945312, |
| "step": 421 |
| }, |
| { |
| "epoch": 1.8508771929824561, |
| "grad_norm": 0.6953822895763759, |
| "learning_rate": 1.1518499412244872e-06, |
| "loss": 0.0183, |
| "mean_token_accuracy": 0.9934640526771545, |
| "step": 422 |
| }, |
| { |
| "epoch": 1.8552631578947367, |
| "grad_norm": 0.4343088081260666, |
| "learning_rate": 1.1430959023568654e-06, |
| "loss": 0.0179, |
| "mean_token_accuracy": 0.9962085485458374, |
| "step": 423 |
| }, |
| { |
| "epoch": 1.8596491228070176, |
| "grad_norm": 0.6173117872666708, |
| "learning_rate": 1.1345976678287216e-06, |
| "loss": 0.0167, |
| "mean_token_accuracy": 0.9961722493171692, |
| "step": 424 |
| }, |
| { |
| "epoch": 1.8640350877192984, |
| "grad_norm": 0.6092676283120322, |
| "learning_rate": 1.126355736591882e-06, |
| "loss": 0.0137, |
| "mean_token_accuracy": 0.9952874779701233, |
| "step": 425 |
| }, |
| { |
| "epoch": 1.868421052631579, |
| "grad_norm": 0.4729725595882896, |
| "learning_rate": 1.1183705925499948e-06, |
| "loss": 0.01, |
| "mean_token_accuracy": 0.9973070025444031, |
| "step": 426 |
| }, |
| { |
| "epoch": 1.8728070175438596, |
| "grad_norm": 0.3344399601407711, |
| "learning_rate": 1.1106427045301085e-06, |
| "loss": 0.0114, |
| "mean_token_accuracy": 0.9961977005004883, |
| "step": 427 |
| }, |
| { |
| "epoch": 1.8771929824561404, |
| "grad_norm": 0.45221349611594724, |
| "learning_rate": 1.1031725262551536e-06, |
| "loss": 0.0073, |
| "mean_token_accuracy": 0.9981273412704468, |
| "step": 428 |
| }, |
| { |
| "epoch": 1.881578947368421, |
| "grad_norm": 0.3416923912648374, |
| "learning_rate": 1.0959604963172996e-06, |
| "loss": 0.0095, |
| "mean_token_accuracy": 0.9972375631332397, |
| "step": 429 |
| }, |
| { |
| "epoch": 1.8859649122807016, |
| "grad_norm": 0.38149092846328136, |
| "learning_rate": 1.0890070381522038e-06, |
| "loss": 0.011, |
| "mean_token_accuracy": 0.996303141117096, |
| "step": 430 |
| }, |
| { |
| "epoch": 1.8903508771929824, |
| "grad_norm": 0.8536603803361408, |
| "learning_rate": 1.0823125600141529e-06, |
| "loss": 0.0235, |
| "mean_token_accuracy": 0.9946380853652954, |
| "step": 431 |
| }, |
| { |
| "epoch": 1.8947368421052633, |
| "grad_norm": 0.3976782329418253, |
| "learning_rate": 1.0758774549520922e-06, |
| "loss": 0.0137, |
| "mean_token_accuracy": 0.9965841174125671, |
| "step": 432 |
| }, |
| { |
| "epoch": 1.8991228070175439, |
| "grad_norm": 0.46382523978391604, |
| "learning_rate": 1.069702100786548e-06, |
| "loss": 0.0131, |
| "mean_token_accuracy": 0.9942418336868286, |
| "step": 433 |
| }, |
| { |
| "epoch": 1.9035087719298245, |
| "grad_norm": 0.528519902056741, |
| "learning_rate": 1.0637868600874448e-06, |
| "loss": 0.023, |
| "mean_token_accuracy": 0.9981167316436768, |
| "step": 434 |
| }, |
| { |
| "epoch": 1.9078947368421053, |
| "grad_norm": 0.35317819715643745, |
| "learning_rate": 1.0581320801528202e-06, |
| "loss": 0.0066, |
| "mean_token_accuracy": 0.9981042742729187, |
| "step": 435 |
| }, |
| { |
| "epoch": 1.912280701754386, |
| "grad_norm": 0.4507905865887121, |
| "learning_rate": 1.0527380929884324e-06, |
| "loss": 0.0069, |
| "mean_token_accuracy": 0.998039186000824, |
| "step": 436 |
| }, |
| { |
| "epoch": 1.9166666666666665, |
| "grad_norm": 0.500641990169249, |
| "learning_rate": 1.0476052152882653e-06, |
| "loss": 0.0145, |
| "mean_token_accuracy": 0.9944393038749695, |
| "step": 437 |
| }, |
| { |
| "epoch": 1.9210526315789473, |
| "grad_norm": 0.8669158687204845, |
| "learning_rate": 1.0427337484159404e-06, |
| "loss": 0.03, |
| "mean_token_accuracy": 0.989130437374115, |
| "step": 438 |
| }, |
| { |
| "epoch": 1.9254385964912282, |
| "grad_norm": 0.5362124896672962, |
| "learning_rate": 1.0381239783870168e-06, |
| "loss": 0.0225, |
| "mean_token_accuracy": 0.9942747950553894, |
| "step": 439 |
| }, |
| { |
| "epoch": 1.9298245614035088, |
| "grad_norm": 0.1461222960998511, |
| "learning_rate": 1.0337761758522028e-06, |
| "loss": 0.0027, |
| "mean_token_accuracy": 1.0, |
| "step": 440 |
| }, |
| { |
| "epoch": 1.9342105263157894, |
| "grad_norm": 0.29712582406163224, |
| "learning_rate": 1.0296905960814626e-06, |
| "loss": 0.0089, |
| "mean_token_accuracy": 0.9958890080451965, |
| "step": 441 |
| }, |
| { |
| "epoch": 1.9385964912280702, |
| "grad_norm": 0.2219473407255181, |
| "learning_rate": 1.025867478949031e-06, |
| "loss": 0.0048, |
| "mean_token_accuracy": 0.9990272521972656, |
| "step": 442 |
| }, |
| { |
| "epoch": 1.942982456140351, |
| "grad_norm": 0.4560723679672652, |
| "learning_rate": 1.0223070489193277e-06, |
| "loss": 0.0093, |
| "mean_token_accuracy": 0.9981203079223633, |
| "step": 443 |
| }, |
| { |
| "epoch": 1.9473684210526314, |
| "grad_norm": 0.6337477945075195, |
| "learning_rate": 1.0190095150337812e-06, |
| "loss": 0.017, |
| "mean_token_accuracy": 0.99349445104599, |
| "step": 444 |
| }, |
| { |
| "epoch": 1.9517543859649122, |
| "grad_norm": 0.5007232841068624, |
| "learning_rate": 1.015975070898552e-06, |
| "loss": 0.0118, |
| "mean_token_accuracy": 0.9970703125, |
| "step": 445 |
| }, |
| { |
| "epoch": 1.956140350877193, |
| "grad_norm": 0.5302154860753227, |
| "learning_rate": 1.0132038946731682e-06, |
| "loss": 0.0143, |
| "mean_token_accuracy": 0.9944751262664795, |
| "step": 446 |
| }, |
| { |
| "epoch": 1.9605263157894737, |
| "grad_norm": 0.7692077638980895, |
| "learning_rate": 1.0106961490600648e-06, |
| "loss": 0.0265, |
| "mean_token_accuracy": 0.9904943108558655, |
| "step": 447 |
| }, |
| { |
| "epoch": 1.9649122807017543, |
| "grad_norm": 0.5438313639387815, |
| "learning_rate": 1.0084519812950302e-06, |
| "loss": 0.0181, |
| "mean_token_accuracy": 0.9956446290016174, |
| "step": 448 |
| }, |
| { |
| "epoch": 1.969298245614035, |
| "grad_norm": 0.30314286668045737, |
| "learning_rate": 1.0064715231385614e-06, |
| "loss": 0.0084, |
| "mean_token_accuracy": 0.9970501661300659, |
| "step": 449 |
| }, |
| { |
| "epoch": 1.973684210526316, |
| "grad_norm": 0.32670339704650764, |
| "learning_rate": 1.0047548908681308e-06, |
| "loss": 0.0044, |
| "mean_token_accuracy": 0.9989878535270691, |
| "step": 450 |
| }, |
| { |
| "epoch": 1.9780701754385965, |
| "grad_norm": 0.34554361863363464, |
| "learning_rate": 1.003302185271355e-06, |
| "loss": 0.0073, |
| "mean_token_accuracy": 0.9962476491928101, |
| "step": 451 |
| }, |
| { |
| "epoch": 1.9824561403508771, |
| "grad_norm": 0.34751292548425655, |
| "learning_rate": 1.002113491640081e-06, |
| "loss": 0.0105, |
| "mean_token_accuracy": 0.99717777967453, |
| "step": 452 |
| }, |
| { |
| "epoch": 1.986842105263158, |
| "grad_norm": 0.7336171250026763, |
| "learning_rate": 1.001188879765377e-06, |
| "loss": 0.022, |
| "mean_token_accuracy": 0.9952107071876526, |
| "step": 453 |
| }, |
| { |
| "epoch": 1.9912280701754386, |
| "grad_norm": 0.4023427958490199, |
| "learning_rate": 1.000528403933433e-06, |
| "loss": 0.0165, |
| "mean_token_accuracy": 0.9971482753753662, |
| "step": 454 |
| }, |
| { |
| "epoch": 1.9956140350877192, |
| "grad_norm": 0.6576966458583471, |
| "learning_rate": 1.0001321029223788e-06, |
| "loss": 0.0171, |
| "mean_token_accuracy": 0.9943661689758301, |
| "step": 455 |
| }, |
| { |
| "epoch": 2.0, |
| "grad_norm": 0.5242992313867519, |
| "learning_rate": 1.0000000000000002e-06, |
| "loss": 0.0144, |
| "mean_token_accuracy": 0.9952606558799744, |
| "step": 456 |
| }, |
| { |
| "epoch": 2.0, |
| "step": 456, |
| "total_flos": 925661847552.0, |
| "train_loss": 0.09788520415946678, |
| "train_runtime": 7879.858, |
| "train_samples_per_second": 1.848, |
| "train_steps_per_second": 0.058 |
| } |
| ], |
| "logging_steps": 1, |
| "max_steps": 456, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 2, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": false, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 925661847552.0, |
| "train_batch_size": 4, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|