diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,19291 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.0, + "eval_steps": 500, + "global_step": 2406, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0012476606363069245, + "grad_norm": 9.34911917109977, + "learning_rate": 0.0, + "loss": 2.4049, + "num_tokens": 79590.0, + "step": 1 + }, + { + "epoch": 0.002495321272613849, + "grad_norm": 9.364768973646816, + "learning_rate": 1.36986301369863e-07, + "loss": 2.4134, + "num_tokens": 159103.0, + "step": 2 + }, + { + "epoch": 0.0037429819089207735, + "grad_norm": 9.456536966619405, + "learning_rate": 2.73972602739726e-07, + "loss": 2.4175, + "num_tokens": 237799.0, + "step": 3 + }, + { + "epoch": 0.004990642545227698, + "grad_norm": 9.426354062444751, + "learning_rate": 4.1095890410958903e-07, + "loss": 2.4288, + "num_tokens": 316493.0, + "step": 4 + }, + { + "epoch": 0.006238303181534623, + "grad_norm": 9.24236293603858, + "learning_rate": 5.47945205479452e-07, + "loss": 2.3781, + "num_tokens": 396816.0, + "step": 5 + }, + { + "epoch": 0.007485963817841547, + "grad_norm": 9.125681681424858, + "learning_rate": 6.849315068493151e-07, + "loss": 2.3607, + "num_tokens": 477827.0, + "step": 6 + }, + { + "epoch": 0.008733624454148471, + "grad_norm": 9.096924762031028, + "learning_rate": 8.219178082191781e-07, + "loss": 2.3668, + "num_tokens": 557522.0, + "step": 7 + }, + { + "epoch": 0.009981285090455396, + "grad_norm": 9.119042405502713, + "learning_rate": 9.589041095890411e-07, + "loss": 2.3608, + "num_tokens": 636975.0, + "step": 8 + }, + { + "epoch": 0.011228945726762321, + "grad_norm": 8.92660720049316, + "learning_rate": 1.095890410958904e-06, + "loss": 2.3189, + "num_tokens": 715688.0, + "step": 9 + }, + { + "epoch": 0.012476606363069246, + "grad_norm": 8.341929678010592, + "learning_rate": 1.2328767123287673e-06, + "loss": 2.2439, + "num_tokens": 796141.0, + "step": 10 + }, + { + "epoch": 0.01372426699937617, + "grad_norm": 8.261055619190312, + "learning_rate": 1.3698630136986302e-06, + "loss": 2.2317, + "num_tokens": 875804.0, + "step": 11 + }, + { + "epoch": 0.014971927635683094, + "grad_norm": 8.045610516077126, + "learning_rate": 1.5068493150684932e-06, + "loss": 2.1818, + "num_tokens": 957377.0, + "step": 12 + }, + { + "epoch": 0.016219588271990017, + "grad_norm": 6.709132143557291, + "learning_rate": 1.6438356164383561e-06, + "loss": 1.9262, + "num_tokens": 1038132.0, + "step": 13 + }, + { + "epoch": 0.017467248908296942, + "grad_norm": 6.562072632018573, + "learning_rate": 1.7808219178082193e-06, + "loss": 1.8848, + "num_tokens": 1119984.0, + "step": 14 + }, + { + "epoch": 0.018714909544603867, + "grad_norm": 6.461084025016272, + "learning_rate": 1.9178082191780823e-06, + "loss": 1.8372, + "num_tokens": 1200787.0, + "step": 15 + }, + { + "epoch": 0.019962570180910792, + "grad_norm": 6.432679419207876, + "learning_rate": 2.0547945205479454e-06, + "loss": 1.8135, + "num_tokens": 1281286.0, + "step": 16 + }, + { + "epoch": 0.021210230817217717, + "grad_norm": 6.869674718914819, + "learning_rate": 2.191780821917808e-06, + "loss": 1.2571, + "num_tokens": 1360214.0, + "step": 17 + }, + { + "epoch": 0.022457891453524642, + "grad_norm": 6.548310858256031, + "learning_rate": 2.3287671232876713e-06, + "loss": 1.2354, + "num_tokens": 1442460.0, + "step": 18 + }, + { + "epoch": 0.023705552089831567, + "grad_norm": 7.14680245757635, + "learning_rate": 2.4657534246575345e-06, + "loss": 1.1882, + "num_tokens": 1522593.0, + "step": 19 + }, + { + "epoch": 0.024953212726138492, + "grad_norm": 7.930859427939441, + "learning_rate": 2.6027397260273973e-06, + "loss": 1.0258, + "num_tokens": 1603158.0, + "step": 20 + }, + { + "epoch": 0.026200873362445413, + "grad_norm": 7.129618686033834, + "learning_rate": 2.7397260273972604e-06, + "loss": 0.9557, + "num_tokens": 1682215.0, + "step": 21 + }, + { + "epoch": 0.02744853399875234, + "grad_norm": 8.351617658742393, + "learning_rate": 2.876712328767123e-06, + "loss": 0.8064, + "num_tokens": 1762875.0, + "step": 22 + }, + { + "epoch": 0.028696194635059263, + "grad_norm": 6.202115696809869, + "learning_rate": 3.0136986301369864e-06, + "loss": 0.4053, + "num_tokens": 1842280.0, + "step": 23 + }, + { + "epoch": 0.02994385527136619, + "grad_norm": 3.015415758599779, + "learning_rate": 3.1506849315068495e-06, + "loss": 0.2977, + "num_tokens": 1922017.0, + "step": 24 + }, + { + "epoch": 0.031191515907673113, + "grad_norm": 1.7548078686272264, + "learning_rate": 3.2876712328767123e-06, + "loss": 0.2561, + "num_tokens": 2001171.0, + "step": 25 + }, + { + "epoch": 0.032439176543980035, + "grad_norm": 1.2570888602541386, + "learning_rate": 3.4246575342465754e-06, + "loss": 0.243, + "num_tokens": 2082788.0, + "step": 26 + }, + { + "epoch": 0.03368683718028696, + "grad_norm": 0.9858243334964526, + "learning_rate": 3.5616438356164386e-06, + "loss": 0.2084, + "num_tokens": 2163533.0, + "step": 27 + }, + { + "epoch": 0.034934497816593885, + "grad_norm": 0.8941120107804094, + "learning_rate": 3.6986301369863014e-06, + "loss": 0.2072, + "num_tokens": 2242151.0, + "step": 28 + }, + { + "epoch": 0.03618215845290081, + "grad_norm": 0.834578793765266, + "learning_rate": 3.8356164383561645e-06, + "loss": 0.2113, + "num_tokens": 2321655.0, + "step": 29 + }, + { + "epoch": 0.037429819089207735, + "grad_norm": 0.7958460007482222, + "learning_rate": 3.972602739726027e-06, + "loss": 0.1935, + "num_tokens": 2400547.0, + "step": 30 + }, + { + "epoch": 0.03867747972551466, + "grad_norm": 0.7476709118804037, + "learning_rate": 4.109589041095891e-06, + "loss": 0.1932, + "num_tokens": 2481040.0, + "step": 31 + }, + { + "epoch": 0.039925140361821584, + "grad_norm": 0.7336501580269786, + "learning_rate": 4.246575342465754e-06, + "loss": 0.1869, + "num_tokens": 2561815.0, + "step": 32 + }, + { + "epoch": 0.041172800998128506, + "grad_norm": 0.7276050156656503, + "learning_rate": 4.383561643835616e-06, + "loss": 0.1819, + "num_tokens": 2642280.0, + "step": 33 + }, + { + "epoch": 0.042420461634435434, + "grad_norm": 0.7320713436530883, + "learning_rate": 4.52054794520548e-06, + "loss": 0.1776, + "num_tokens": 2722505.0, + "step": 34 + }, + { + "epoch": 0.043668122270742356, + "grad_norm": 0.7211851177445922, + "learning_rate": 4.657534246575343e-06, + "loss": 0.1626, + "num_tokens": 2801483.0, + "step": 35 + }, + { + "epoch": 0.044915782907049284, + "grad_norm": 0.7706879408631792, + "learning_rate": 4.7945205479452054e-06, + "loss": 0.1652, + "num_tokens": 2880749.0, + "step": 36 + }, + { + "epoch": 0.046163443543356206, + "grad_norm": 0.6310470242010754, + "learning_rate": 4.931506849315069e-06, + "loss": 0.1578, + "num_tokens": 2960025.0, + "step": 37 + }, + { + "epoch": 0.047411104179663134, + "grad_norm": 0.5719157846400421, + "learning_rate": 5.068493150684932e-06, + "loss": 0.1559, + "num_tokens": 3040006.0, + "step": 38 + }, + { + "epoch": 0.048658764815970056, + "grad_norm": 0.5556127515496487, + "learning_rate": 5.2054794520547945e-06, + "loss": 0.1445, + "num_tokens": 3118779.0, + "step": 39 + }, + { + "epoch": 0.049906425452276984, + "grad_norm": 0.47419374873726, + "learning_rate": 5.342465753424658e-06, + "loss": 0.1494, + "num_tokens": 3200031.0, + "step": 40 + }, + { + "epoch": 0.051154086088583905, + "grad_norm": 0.4421370793504471, + "learning_rate": 5.479452054794521e-06, + "loss": 0.1523, + "num_tokens": 3281300.0, + "step": 41 + }, + { + "epoch": 0.05240174672489083, + "grad_norm": 0.43669518921865735, + "learning_rate": 5.6164383561643845e-06, + "loss": 0.1403, + "num_tokens": 3361534.0, + "step": 42 + }, + { + "epoch": 0.053649407361197755, + "grad_norm": 0.4277843089034562, + "learning_rate": 5.753424657534246e-06, + "loss": 0.1432, + "num_tokens": 3441863.0, + "step": 43 + }, + { + "epoch": 0.05489706799750468, + "grad_norm": 0.41630114035277377, + "learning_rate": 5.89041095890411e-06, + "loss": 0.1315, + "num_tokens": 3521074.0, + "step": 44 + }, + { + "epoch": 0.056144728633811605, + "grad_norm": 0.41370843939103547, + "learning_rate": 6.027397260273973e-06, + "loss": 0.1387, + "num_tokens": 3602411.0, + "step": 45 + }, + { + "epoch": 0.05739238927011853, + "grad_norm": 0.43876538412653404, + "learning_rate": 6.164383561643836e-06, + "loss": 0.1361, + "num_tokens": 3682928.0, + "step": 46 + }, + { + "epoch": 0.058640049906425455, + "grad_norm": 0.34801092467574696, + "learning_rate": 6.301369863013699e-06, + "loss": 0.1258, + "num_tokens": 3763667.0, + "step": 47 + }, + { + "epoch": 0.05988771054273238, + "grad_norm": 0.2939900530144378, + "learning_rate": 6.438356164383563e-06, + "loss": 0.1322, + "num_tokens": 3845378.0, + "step": 48 + }, + { + "epoch": 0.0611353711790393, + "grad_norm": 0.24786140273528653, + "learning_rate": 6.5753424657534245e-06, + "loss": 0.1267, + "num_tokens": 3925502.0, + "step": 49 + }, + { + "epoch": 0.06238303181534623, + "grad_norm": 0.20173255384894515, + "learning_rate": 6.712328767123288e-06, + "loss": 0.1208, + "num_tokens": 4004936.0, + "step": 50 + }, + { + "epoch": 0.06363069245165315, + "grad_norm": 0.18388818103544632, + "learning_rate": 6.849315068493151e-06, + "loss": 0.1225, + "num_tokens": 4084695.0, + "step": 51 + }, + { + "epoch": 0.06487835308796007, + "grad_norm": 0.21045301465823563, + "learning_rate": 6.9863013698630145e-06, + "loss": 0.1216, + "num_tokens": 4163992.0, + "step": 52 + }, + { + "epoch": 0.066126013724267, + "grad_norm": 0.18341190230418852, + "learning_rate": 7.123287671232877e-06, + "loss": 0.124, + "num_tokens": 4244593.0, + "step": 53 + }, + { + "epoch": 0.06737367436057393, + "grad_norm": 0.19088058421518178, + "learning_rate": 7.260273972602741e-06, + "loss": 0.1145, + "num_tokens": 4324390.0, + "step": 54 + }, + { + "epoch": 0.06862133499688085, + "grad_norm": 0.19103912807016007, + "learning_rate": 7.397260273972603e-06, + "loss": 0.1271, + "num_tokens": 4405698.0, + "step": 55 + }, + { + "epoch": 0.06986899563318777, + "grad_norm": 0.18970020622501074, + "learning_rate": 7.534246575342466e-06, + "loss": 0.112, + "num_tokens": 4485053.0, + "step": 56 + }, + { + "epoch": 0.07111665626949469, + "grad_norm": 0.19164305174626092, + "learning_rate": 7.671232876712329e-06, + "loss": 0.1098, + "num_tokens": 4565926.0, + "step": 57 + }, + { + "epoch": 0.07236431690580163, + "grad_norm": 0.19460729939643484, + "learning_rate": 7.808219178082192e-06, + "loss": 0.1087, + "num_tokens": 4644896.0, + "step": 58 + }, + { + "epoch": 0.07361197754210855, + "grad_norm": 0.18559860342670173, + "learning_rate": 7.945205479452055e-06, + "loss": 0.1101, + "num_tokens": 4724338.0, + "step": 59 + }, + { + "epoch": 0.07485963817841547, + "grad_norm": 0.20159007460588826, + "learning_rate": 8.082191780821919e-06, + "loss": 0.1194, + "num_tokens": 4805035.0, + "step": 60 + }, + { + "epoch": 0.07610729881472239, + "grad_norm": 0.19163951373300692, + "learning_rate": 8.219178082191782e-06, + "loss": 0.1101, + "num_tokens": 4886497.0, + "step": 61 + }, + { + "epoch": 0.07735495945102933, + "grad_norm": 0.1792411438909808, + "learning_rate": 8.356164383561644e-06, + "loss": 0.1052, + "num_tokens": 4966066.0, + "step": 62 + }, + { + "epoch": 0.07860262008733625, + "grad_norm": 0.19871949594852764, + "learning_rate": 8.493150684931507e-06, + "loss": 0.1107, + "num_tokens": 5046294.0, + "step": 63 + }, + { + "epoch": 0.07985028072364317, + "grad_norm": 0.2009426482501604, + "learning_rate": 8.63013698630137e-06, + "loss": 0.111, + "num_tokens": 5127200.0, + "step": 64 + }, + { + "epoch": 0.08109794135995009, + "grad_norm": 0.18854634659363081, + "learning_rate": 8.767123287671233e-06, + "loss": 0.1062, + "num_tokens": 5207375.0, + "step": 65 + }, + { + "epoch": 0.08234560199625701, + "grad_norm": 0.1909614838823602, + "learning_rate": 8.904109589041097e-06, + "loss": 0.1055, + "num_tokens": 5288077.0, + "step": 66 + }, + { + "epoch": 0.08359326263256395, + "grad_norm": 0.18966488054923936, + "learning_rate": 9.04109589041096e-06, + "loss": 0.0965, + "num_tokens": 5368986.0, + "step": 67 + }, + { + "epoch": 0.08484092326887087, + "grad_norm": 0.19600407468281897, + "learning_rate": 9.178082191780823e-06, + "loss": 0.0989, + "num_tokens": 5448866.0, + "step": 68 + }, + { + "epoch": 0.08608858390517779, + "grad_norm": 0.19248220731163004, + "learning_rate": 9.315068493150685e-06, + "loss": 0.1021, + "num_tokens": 5529761.0, + "step": 69 + }, + { + "epoch": 0.08733624454148471, + "grad_norm": 0.1790396798178214, + "learning_rate": 9.452054794520548e-06, + "loss": 0.0977, + "num_tokens": 5610079.0, + "step": 70 + }, + { + "epoch": 0.08858390517779165, + "grad_norm": 0.20227512996028574, + "learning_rate": 9.589041095890411e-06, + "loss": 0.103, + "num_tokens": 5690121.0, + "step": 71 + }, + { + "epoch": 0.08983156581409857, + "grad_norm": 0.1804340417389163, + "learning_rate": 9.726027397260275e-06, + "loss": 0.0988, + "num_tokens": 5770807.0, + "step": 72 + }, + { + "epoch": 0.09107922645040549, + "grad_norm": 0.18708792441875738, + "learning_rate": 9.863013698630138e-06, + "loss": 0.1007, + "num_tokens": 5851162.0, + "step": 73 + }, + { + "epoch": 0.09232688708671241, + "grad_norm": 0.18580134464623252, + "learning_rate": 1e-05, + "loss": 0.1015, + "num_tokens": 5933144.0, + "step": 74 + }, + { + "epoch": 0.09357454772301933, + "grad_norm": 0.1777511395438126, + "learning_rate": 9.999995920069922e-06, + "loss": 0.0925, + "num_tokens": 6013167.0, + "step": 75 + }, + { + "epoch": 0.09482220835932627, + "grad_norm": 0.18453230882301527, + "learning_rate": 9.999983680287084e-06, + "loss": 0.0995, + "num_tokens": 6092429.0, + "step": 76 + }, + { + "epoch": 0.09606986899563319, + "grad_norm": 0.17990626138096272, + "learning_rate": 9.99996328067368e-06, + "loss": 0.0894, + "num_tokens": 6171395.0, + "step": 77 + }, + { + "epoch": 0.09731752963194011, + "grad_norm": 0.18671247100201174, + "learning_rate": 9.999934721266702e-06, + "loss": 0.0954, + "num_tokens": 6252161.0, + "step": 78 + }, + { + "epoch": 0.09856519026824703, + "grad_norm": 0.17367676216662678, + "learning_rate": 9.999898002117937e-06, + "loss": 0.0882, + "num_tokens": 6331946.0, + "step": 79 + }, + { + "epoch": 0.09981285090455397, + "grad_norm": 0.17790787231369953, + "learning_rate": 9.999853123293967e-06, + "loss": 0.0948, + "num_tokens": 6412878.0, + "step": 80 + }, + { + "epoch": 0.10106051154086089, + "grad_norm": 0.17825720009899704, + "learning_rate": 9.99980008487617e-06, + "loss": 0.0883, + "num_tokens": 6492625.0, + "step": 81 + }, + { + "epoch": 0.10230817217716781, + "grad_norm": 0.177633671805786, + "learning_rate": 9.999738886960724e-06, + "loss": 0.0958, + "num_tokens": 6572706.0, + "step": 82 + }, + { + "epoch": 0.10355583281347473, + "grad_norm": 0.1781514598121185, + "learning_rate": 9.999669529658596e-06, + "loss": 0.1016, + "num_tokens": 6654066.0, + "step": 83 + }, + { + "epoch": 0.10480349344978165, + "grad_norm": 0.1797123476723708, + "learning_rate": 9.999592013095553e-06, + "loss": 0.0889, + "num_tokens": 6733703.0, + "step": 84 + }, + { + "epoch": 0.10605115408608859, + "grad_norm": 0.17668230595097628, + "learning_rate": 9.999506337412157e-06, + "loss": 0.0905, + "num_tokens": 6813311.0, + "step": 85 + }, + { + "epoch": 0.10729881472239551, + "grad_norm": 0.1635782657626139, + "learning_rate": 9.99941250276376e-06, + "loss": 0.0891, + "num_tokens": 6892900.0, + "step": 86 + }, + { + "epoch": 0.10854647535870243, + "grad_norm": 0.17152001517029147, + "learning_rate": 9.999310509320518e-06, + "loss": 0.0852, + "num_tokens": 6971684.0, + "step": 87 + }, + { + "epoch": 0.10979413599500935, + "grad_norm": 0.16674351199678547, + "learning_rate": 9.999200357267373e-06, + "loss": 0.0844, + "num_tokens": 7050727.0, + "step": 88 + }, + { + "epoch": 0.11104179663131628, + "grad_norm": 0.16284064715514623, + "learning_rate": 9.999082046804062e-06, + "loss": 0.0894, + "num_tokens": 7130506.0, + "step": 89 + }, + { + "epoch": 0.11228945726762321, + "grad_norm": 0.17699815453244996, + "learning_rate": 9.998955578145124e-06, + "loss": 0.0896, + "num_tokens": 7210539.0, + "step": 90 + }, + { + "epoch": 0.11353711790393013, + "grad_norm": 0.17175993286806868, + "learning_rate": 9.998820951519877e-06, + "loss": 0.0909, + "num_tokens": 7291662.0, + "step": 91 + }, + { + "epoch": 0.11478477854023705, + "grad_norm": 0.16895654572874105, + "learning_rate": 9.998678167172446e-06, + "loss": 0.0866, + "num_tokens": 7371708.0, + "step": 92 + }, + { + "epoch": 0.11603243917654397, + "grad_norm": 0.15898095818806993, + "learning_rate": 9.99852722536174e-06, + "loss": 0.0891, + "num_tokens": 7451637.0, + "step": 93 + }, + { + "epoch": 0.11728009981285091, + "grad_norm": 0.1655714312412077, + "learning_rate": 9.998368126361459e-06, + "loss": 0.0855, + "num_tokens": 7532024.0, + "step": 94 + }, + { + "epoch": 0.11852776044915783, + "grad_norm": 0.16278257907586435, + "learning_rate": 9.998200870460103e-06, + "loss": 0.0855, + "num_tokens": 7611489.0, + "step": 95 + }, + { + "epoch": 0.11977542108546475, + "grad_norm": 0.17618441557702208, + "learning_rate": 9.998025457960955e-06, + "loss": 0.0963, + "num_tokens": 7693716.0, + "step": 96 + }, + { + "epoch": 0.12102308172177167, + "grad_norm": 0.15790344775477788, + "learning_rate": 9.997841889182091e-06, + "loss": 0.0856, + "num_tokens": 7774645.0, + "step": 97 + }, + { + "epoch": 0.1222707423580786, + "grad_norm": 0.16185678156409003, + "learning_rate": 9.997650164456375e-06, + "loss": 0.0793, + "num_tokens": 7855390.0, + "step": 98 + }, + { + "epoch": 0.12351840299438553, + "grad_norm": 0.16681775638759538, + "learning_rate": 9.997450284131465e-06, + "loss": 0.0826, + "num_tokens": 7935403.0, + "step": 99 + }, + { + "epoch": 0.12476606363069245, + "grad_norm": 0.16550888127393662, + "learning_rate": 9.997242248569802e-06, + "loss": 0.0836, + "num_tokens": 8015879.0, + "step": 100 + }, + { + "epoch": 0.1260137242669994, + "grad_norm": 0.1771249194917155, + "learning_rate": 9.997026058148617e-06, + "loss": 0.0863, + "num_tokens": 8096492.0, + "step": 101 + }, + { + "epoch": 0.1272613849033063, + "grad_norm": 0.18698309428988277, + "learning_rate": 9.996801713259933e-06, + "loss": 0.0949, + "num_tokens": 8177949.0, + "step": 102 + }, + { + "epoch": 0.12850904553961323, + "grad_norm": 0.15901318998957345, + "learning_rate": 9.996569214310549e-06, + "loss": 0.0819, + "num_tokens": 8256684.0, + "step": 103 + }, + { + "epoch": 0.12975670617592014, + "grad_norm": 0.1659574005563422, + "learning_rate": 9.99632856172206e-06, + "loss": 0.0755, + "num_tokens": 8335236.0, + "step": 104 + }, + { + "epoch": 0.13100436681222707, + "grad_norm": 0.16236837894961287, + "learning_rate": 9.99607975593084e-06, + "loss": 0.0837, + "num_tokens": 8414924.0, + "step": 105 + }, + { + "epoch": 0.132252027448534, + "grad_norm": 0.16857841750351044, + "learning_rate": 9.995822797388052e-06, + "loss": 0.0832, + "num_tokens": 8494719.0, + "step": 106 + }, + { + "epoch": 0.13349968808484092, + "grad_norm": 0.16077102501626883, + "learning_rate": 9.995557686559635e-06, + "loss": 0.0825, + "num_tokens": 8574385.0, + "step": 107 + }, + { + "epoch": 0.13474734872114785, + "grad_norm": 0.16504694201517098, + "learning_rate": 9.995284423926318e-06, + "loss": 0.0823, + "num_tokens": 8655947.0, + "step": 108 + }, + { + "epoch": 0.13599500935745476, + "grad_norm": 0.16670618256633662, + "learning_rate": 9.995003009983608e-06, + "loss": 0.0859, + "num_tokens": 8735660.0, + "step": 109 + }, + { + "epoch": 0.1372426699937617, + "grad_norm": 0.18503209596000766, + "learning_rate": 9.994713445241793e-06, + "loss": 0.087, + "num_tokens": 8816074.0, + "step": 110 + }, + { + "epoch": 0.13849033063006863, + "grad_norm": 0.16851469586707762, + "learning_rate": 9.994415730225943e-06, + "loss": 0.0949, + "num_tokens": 8897733.0, + "step": 111 + }, + { + "epoch": 0.13973799126637554, + "grad_norm": 0.16695446108820516, + "learning_rate": 9.994109865475903e-06, + "loss": 0.0848, + "num_tokens": 8977866.0, + "step": 112 + }, + { + "epoch": 0.14098565190268247, + "grad_norm": 0.16886821971011903, + "learning_rate": 9.993795851546302e-06, + "loss": 0.0847, + "num_tokens": 9059979.0, + "step": 113 + }, + { + "epoch": 0.14223331253898938, + "grad_norm": 0.15825031266233286, + "learning_rate": 9.993473689006538e-06, + "loss": 0.0797, + "num_tokens": 9139827.0, + "step": 114 + }, + { + "epoch": 0.14348097317529632, + "grad_norm": 0.1628042779419236, + "learning_rate": 9.99314337844079e-06, + "loss": 0.0866, + "num_tokens": 9220225.0, + "step": 115 + }, + { + "epoch": 0.14472863381160325, + "grad_norm": 0.17318475399282052, + "learning_rate": 9.992804920448013e-06, + "loss": 0.0835, + "num_tokens": 9300879.0, + "step": 116 + }, + { + "epoch": 0.14597629444791016, + "grad_norm": 0.15657831342283274, + "learning_rate": 9.992458315641932e-06, + "loss": 0.0763, + "num_tokens": 9380164.0, + "step": 117 + }, + { + "epoch": 0.1472239550842171, + "grad_norm": 0.1716611418125263, + "learning_rate": 9.992103564651048e-06, + "loss": 0.0864, + "num_tokens": 9460543.0, + "step": 118 + }, + { + "epoch": 0.14847161572052403, + "grad_norm": 0.15729587536249917, + "learning_rate": 9.991740668118629e-06, + "loss": 0.078, + "num_tokens": 9540063.0, + "step": 119 + }, + { + "epoch": 0.14971927635683094, + "grad_norm": 0.17115153204306796, + "learning_rate": 9.991369626702717e-06, + "loss": 0.0838, + "num_tokens": 9620959.0, + "step": 120 + }, + { + "epoch": 0.15096693699313787, + "grad_norm": 0.16228121857689287, + "learning_rate": 9.990990441076125e-06, + "loss": 0.082, + "num_tokens": 9702803.0, + "step": 121 + }, + { + "epoch": 0.15221459762944478, + "grad_norm": 0.1635063935531281, + "learning_rate": 9.990603111926424e-06, + "loss": 0.0788, + "num_tokens": 9782410.0, + "step": 122 + }, + { + "epoch": 0.15346225826575172, + "grad_norm": 0.17181078017382084, + "learning_rate": 9.990207639955969e-06, + "loss": 0.0819, + "num_tokens": 9863350.0, + "step": 123 + }, + { + "epoch": 0.15470991890205865, + "grad_norm": 0.1643811410282833, + "learning_rate": 9.989804025881862e-06, + "loss": 0.077, + "num_tokens": 9942485.0, + "step": 124 + }, + { + "epoch": 0.15595757953836556, + "grad_norm": 0.1659276911169034, + "learning_rate": 9.98939227043598e-06, + "loss": 0.0785, + "num_tokens": 10022555.0, + "step": 125 + }, + { + "epoch": 0.1572052401746725, + "grad_norm": 0.1652036681004015, + "learning_rate": 9.988972374364961e-06, + "loss": 0.0802, + "num_tokens": 10102391.0, + "step": 126 + }, + { + "epoch": 0.1584529008109794, + "grad_norm": 0.17240153860438123, + "learning_rate": 9.988544338430203e-06, + "loss": 0.0796, + "num_tokens": 10183708.0, + "step": 127 + }, + { + "epoch": 0.15970056144728634, + "grad_norm": 0.16529523837288534, + "learning_rate": 9.988108163407865e-06, + "loss": 0.0809, + "num_tokens": 10265029.0, + "step": 128 + }, + { + "epoch": 0.16094822208359327, + "grad_norm": 0.1684552284249565, + "learning_rate": 9.987663850088862e-06, + "loss": 0.0787, + "num_tokens": 10344489.0, + "step": 129 + }, + { + "epoch": 0.16219588271990018, + "grad_norm": 0.15609693921109477, + "learning_rate": 9.987211399278871e-06, + "loss": 0.0765, + "num_tokens": 10423520.0, + "step": 130 + }, + { + "epoch": 0.16344354335620712, + "grad_norm": 0.16318291937030596, + "learning_rate": 9.98675081179832e-06, + "loss": 0.071, + "num_tokens": 10502900.0, + "step": 131 + }, + { + "epoch": 0.16469120399251402, + "grad_norm": 0.2007608468972681, + "learning_rate": 9.986282088482397e-06, + "loss": 0.0767, + "num_tokens": 10583282.0, + "step": 132 + }, + { + "epoch": 0.16593886462882096, + "grad_norm": 0.16935616163873396, + "learning_rate": 9.985805230181031e-06, + "loss": 0.0749, + "num_tokens": 10662589.0, + "step": 133 + }, + { + "epoch": 0.1671865252651279, + "grad_norm": 0.171740825216556, + "learning_rate": 9.985320237758918e-06, + "loss": 0.0775, + "num_tokens": 10742307.0, + "step": 134 + }, + { + "epoch": 0.1684341859014348, + "grad_norm": 0.15828176697546295, + "learning_rate": 9.984827112095495e-06, + "loss": 0.0753, + "num_tokens": 10821872.0, + "step": 135 + }, + { + "epoch": 0.16968184653774174, + "grad_norm": 0.16873525559334726, + "learning_rate": 9.984325854084946e-06, + "loss": 0.0786, + "num_tokens": 10907937.0, + "step": 136 + }, + { + "epoch": 0.17092950717404864, + "grad_norm": 0.16380536060958564, + "learning_rate": 9.983816464636203e-06, + "loss": 0.0784, + "num_tokens": 10988958.0, + "step": 137 + }, + { + "epoch": 0.17217716781035558, + "grad_norm": 0.160963929320833, + "learning_rate": 9.983298944672942e-06, + "loss": 0.0817, + "num_tokens": 11070498.0, + "step": 138 + }, + { + "epoch": 0.17342482844666252, + "grad_norm": 0.164166377631778, + "learning_rate": 9.982773295133585e-06, + "loss": 0.0754, + "num_tokens": 11150195.0, + "step": 139 + }, + { + "epoch": 0.17467248908296942, + "grad_norm": 0.18357918297872183, + "learning_rate": 9.982239516971295e-06, + "loss": 0.0783, + "num_tokens": 11231131.0, + "step": 140 + }, + { + "epoch": 0.17592014971927636, + "grad_norm": 0.15755220760124228, + "learning_rate": 9.98169761115397e-06, + "loss": 0.0786, + "num_tokens": 11311740.0, + "step": 141 + }, + { + "epoch": 0.1771678103555833, + "grad_norm": 0.16447695668867712, + "learning_rate": 9.98114757866425e-06, + "loss": 0.0784, + "num_tokens": 11392420.0, + "step": 142 + }, + { + "epoch": 0.1784154709918902, + "grad_norm": 0.15694055102021415, + "learning_rate": 9.980589420499512e-06, + "loss": 0.0841, + "num_tokens": 11472491.0, + "step": 143 + }, + { + "epoch": 0.17966313162819714, + "grad_norm": 0.1614277416464966, + "learning_rate": 9.980023137671862e-06, + "loss": 0.072, + "num_tokens": 11552759.0, + "step": 144 + }, + { + "epoch": 0.18091079226450404, + "grad_norm": 0.1581923947848541, + "learning_rate": 9.979448731208145e-06, + "loss": 0.0711, + "num_tokens": 11632535.0, + "step": 145 + }, + { + "epoch": 0.18215845290081098, + "grad_norm": 0.16298717987923186, + "learning_rate": 9.978866202149931e-06, + "loss": 0.0731, + "num_tokens": 11712101.0, + "step": 146 + }, + { + "epoch": 0.18340611353711792, + "grad_norm": 0.17368683828180292, + "learning_rate": 9.978275551553526e-06, + "loss": 0.0791, + "num_tokens": 11791963.0, + "step": 147 + }, + { + "epoch": 0.18465377417342482, + "grad_norm": 0.16144191529310298, + "learning_rate": 9.977676780489953e-06, + "loss": 0.0777, + "num_tokens": 11872345.0, + "step": 148 + }, + { + "epoch": 0.18590143480973176, + "grad_norm": 0.1651729281394997, + "learning_rate": 9.977069890044965e-06, + "loss": 0.0809, + "num_tokens": 11954469.0, + "step": 149 + }, + { + "epoch": 0.18714909544603867, + "grad_norm": 0.16467083171937716, + "learning_rate": 9.976454881319041e-06, + "loss": 0.0724, + "num_tokens": 12033673.0, + "step": 150 + }, + { + "epoch": 0.1883967560823456, + "grad_norm": 0.15270182358741702, + "learning_rate": 9.975831755427376e-06, + "loss": 0.0719, + "num_tokens": 12113393.0, + "step": 151 + }, + { + "epoch": 0.18964441671865254, + "grad_norm": 0.1552933576757993, + "learning_rate": 9.975200513499886e-06, + "loss": 0.0769, + "num_tokens": 12194535.0, + "step": 152 + }, + { + "epoch": 0.19089207735495944, + "grad_norm": 0.15708292865325066, + "learning_rate": 9.974561156681203e-06, + "loss": 0.076, + "num_tokens": 12275521.0, + "step": 153 + }, + { + "epoch": 0.19213973799126638, + "grad_norm": 0.15744491473566746, + "learning_rate": 9.973913686130674e-06, + "loss": 0.0718, + "num_tokens": 12355487.0, + "step": 154 + }, + { + "epoch": 0.1933873986275733, + "grad_norm": 0.1684494744054102, + "learning_rate": 9.973258103022361e-06, + "loss": 0.077, + "num_tokens": 12435557.0, + "step": 155 + }, + { + "epoch": 0.19463505926388022, + "grad_norm": 0.17092559295973017, + "learning_rate": 9.97259440854503e-06, + "loss": 0.078, + "num_tokens": 12516421.0, + "step": 156 + }, + { + "epoch": 0.19588271990018716, + "grad_norm": 0.16467948975450056, + "learning_rate": 9.971922603902164e-06, + "loss": 0.0792, + "num_tokens": 12596956.0, + "step": 157 + }, + { + "epoch": 0.19713038053649407, + "grad_norm": 0.16129266645735674, + "learning_rate": 9.971242690311944e-06, + "loss": 0.0715, + "num_tokens": 12677329.0, + "step": 158 + }, + { + "epoch": 0.198378041172801, + "grad_norm": 0.15566121358014243, + "learning_rate": 9.970554669007264e-06, + "loss": 0.071, + "num_tokens": 12757136.0, + "step": 159 + }, + { + "epoch": 0.19962570180910794, + "grad_norm": 0.15418083568009736, + "learning_rate": 9.969858541235708e-06, + "loss": 0.0707, + "num_tokens": 12837208.0, + "step": 160 + }, + { + "epoch": 0.20087336244541484, + "grad_norm": 0.15281662930143763, + "learning_rate": 9.969154308259572e-06, + "loss": 0.072, + "num_tokens": 12916423.0, + "step": 161 + }, + { + "epoch": 0.20212102308172178, + "grad_norm": 0.15806417128594089, + "learning_rate": 9.968441971355839e-06, + "loss": 0.0697, + "num_tokens": 12995763.0, + "step": 162 + }, + { + "epoch": 0.2033686837180287, + "grad_norm": 0.16247347666054926, + "learning_rate": 9.967721531816194e-06, + "loss": 0.069, + "num_tokens": 13075036.0, + "step": 163 + }, + { + "epoch": 0.20461634435433562, + "grad_norm": 0.16120489496865675, + "learning_rate": 9.96699299094701e-06, + "loss": 0.0657, + "num_tokens": 13153140.0, + "step": 164 + }, + { + "epoch": 0.20586400499064256, + "grad_norm": 0.1654367494643062, + "learning_rate": 9.966256350069355e-06, + "loss": 0.0719, + "num_tokens": 13233496.0, + "step": 165 + }, + { + "epoch": 0.20711166562694946, + "grad_norm": 0.15797090901409847, + "learning_rate": 9.965511610518975e-06, + "loss": 0.0741, + "num_tokens": 13313688.0, + "step": 166 + }, + { + "epoch": 0.2083593262632564, + "grad_norm": 0.17318092994565018, + "learning_rate": 9.964758773646314e-06, + "loss": 0.0705, + "num_tokens": 13392817.0, + "step": 167 + }, + { + "epoch": 0.2096069868995633, + "grad_norm": 0.16326342203924862, + "learning_rate": 9.963997840816491e-06, + "loss": 0.0694, + "num_tokens": 13472052.0, + "step": 168 + }, + { + "epoch": 0.21085464753587024, + "grad_norm": 0.1649032874821036, + "learning_rate": 9.963228813409307e-06, + "loss": 0.0718, + "num_tokens": 13552229.0, + "step": 169 + }, + { + "epoch": 0.21210230817217718, + "grad_norm": 0.14746862120200496, + "learning_rate": 9.962451692819238e-06, + "loss": 0.0674, + "num_tokens": 13631487.0, + "step": 170 + }, + { + "epoch": 0.2133499688084841, + "grad_norm": 0.16140552300758973, + "learning_rate": 9.961666480455445e-06, + "loss": 0.0711, + "num_tokens": 13710876.0, + "step": 171 + }, + { + "epoch": 0.21459762944479102, + "grad_norm": 0.1561744394885393, + "learning_rate": 9.96087317774175e-06, + "loss": 0.0678, + "num_tokens": 13790236.0, + "step": 172 + }, + { + "epoch": 0.21584529008109793, + "grad_norm": 0.1532654032002478, + "learning_rate": 9.960071786116652e-06, + "loss": 0.0701, + "num_tokens": 13869459.0, + "step": 173 + }, + { + "epoch": 0.21709295071740486, + "grad_norm": 0.16418271391935854, + "learning_rate": 9.959262307033318e-06, + "loss": 0.0702, + "num_tokens": 13949850.0, + "step": 174 + }, + { + "epoch": 0.2183406113537118, + "grad_norm": 0.16417748724525755, + "learning_rate": 9.958444741959577e-06, + "loss": 0.0794, + "num_tokens": 14030886.0, + "step": 175 + }, + { + "epoch": 0.2195882719900187, + "grad_norm": 0.1475411082063503, + "learning_rate": 9.957619092377921e-06, + "loss": 0.0697, + "num_tokens": 14110548.0, + "step": 176 + }, + { + "epoch": 0.22083593262632564, + "grad_norm": 0.15200242109370143, + "learning_rate": 9.956785359785501e-06, + "loss": 0.0725, + "num_tokens": 14191162.0, + "step": 177 + }, + { + "epoch": 0.22208359326263255, + "grad_norm": 0.14951724955021337, + "learning_rate": 9.95594354569413e-06, + "loss": 0.0713, + "num_tokens": 14270373.0, + "step": 178 + }, + { + "epoch": 0.22333125389893949, + "grad_norm": 0.15437761740995362, + "learning_rate": 9.955093651630271e-06, + "loss": 0.0694, + "num_tokens": 14350521.0, + "step": 179 + }, + { + "epoch": 0.22457891453524642, + "grad_norm": 0.16318758670731984, + "learning_rate": 9.954235679135035e-06, + "loss": 0.0648, + "num_tokens": 14430117.0, + "step": 180 + }, + { + "epoch": 0.22582657517155333, + "grad_norm": 0.1509213212454131, + "learning_rate": 9.953369629764187e-06, + "loss": 0.0659, + "num_tokens": 14510184.0, + "step": 181 + }, + { + "epoch": 0.22707423580786026, + "grad_norm": 0.15811847612755178, + "learning_rate": 9.952495505088138e-06, + "loss": 0.069, + "num_tokens": 14589696.0, + "step": 182 + }, + { + "epoch": 0.2283218964441672, + "grad_norm": 0.17426294289031205, + "learning_rate": 9.95161330669194e-06, + "loss": 0.0721, + "num_tokens": 14669642.0, + "step": 183 + }, + { + "epoch": 0.2295695570804741, + "grad_norm": 0.17769882981769738, + "learning_rate": 9.950723036175282e-06, + "loss": 0.0703, + "num_tokens": 14749847.0, + "step": 184 + }, + { + "epoch": 0.23081721771678104, + "grad_norm": 0.1674097835401589, + "learning_rate": 9.9498246951525e-06, + "loss": 0.0747, + "num_tokens": 14829647.0, + "step": 185 + }, + { + "epoch": 0.23206487835308795, + "grad_norm": 0.15100509686307984, + "learning_rate": 9.948918285252551e-06, + "loss": 0.0745, + "num_tokens": 14910343.0, + "step": 186 + }, + { + "epoch": 0.23331253898939489, + "grad_norm": 0.15360581031528767, + "learning_rate": 9.948003808119034e-06, + "loss": 0.0714, + "num_tokens": 14990734.0, + "step": 187 + }, + { + "epoch": 0.23456019962570182, + "grad_norm": 0.16940538023361718, + "learning_rate": 9.94708126541017e-06, + "loss": 0.0684, + "num_tokens": 15070750.0, + "step": 188 + }, + { + "epoch": 0.23580786026200873, + "grad_norm": 0.15548216290846645, + "learning_rate": 9.94615065879881e-06, + "loss": 0.069, + "num_tokens": 15150020.0, + "step": 189 + }, + { + "epoch": 0.23705552089831566, + "grad_norm": 0.14993741595806287, + "learning_rate": 9.945211989972425e-06, + "loss": 0.0703, + "num_tokens": 15231989.0, + "step": 190 + }, + { + "epoch": 0.23830318153462257, + "grad_norm": 0.1474419032756179, + "learning_rate": 9.944265260633105e-06, + "loss": 0.0711, + "num_tokens": 15312107.0, + "step": 191 + }, + { + "epoch": 0.2395508421709295, + "grad_norm": 0.1601090310495295, + "learning_rate": 9.943310472497556e-06, + "loss": 0.0696, + "num_tokens": 15391755.0, + "step": 192 + }, + { + "epoch": 0.24079850280723644, + "grad_norm": 0.16372600424581513, + "learning_rate": 9.942347627297095e-06, + "loss": 0.0719, + "num_tokens": 15472703.0, + "step": 193 + }, + { + "epoch": 0.24204616344354335, + "grad_norm": 0.15885711149872717, + "learning_rate": 9.941376726777656e-06, + "loss": 0.0703, + "num_tokens": 15552902.0, + "step": 194 + }, + { + "epoch": 0.24329382407985028, + "grad_norm": 0.14374818433177697, + "learning_rate": 9.940397772699773e-06, + "loss": 0.0674, + "num_tokens": 15633405.0, + "step": 195 + }, + { + "epoch": 0.2445414847161572, + "grad_norm": 0.14857277972441196, + "learning_rate": 9.939410766838586e-06, + "loss": 0.0715, + "num_tokens": 15714298.0, + "step": 196 + }, + { + "epoch": 0.24578914535246413, + "grad_norm": 0.15728113670824326, + "learning_rate": 9.938415710983834e-06, + "loss": 0.0655, + "num_tokens": 15793415.0, + "step": 197 + }, + { + "epoch": 0.24703680598877106, + "grad_norm": 0.15942515909588884, + "learning_rate": 9.937412606939854e-06, + "loss": 0.0725, + "num_tokens": 15874136.0, + "step": 198 + }, + { + "epoch": 0.24828446662507797, + "grad_norm": 0.13933287189995225, + "learning_rate": 9.936401456525578e-06, + "loss": 0.0686, + "num_tokens": 15953965.0, + "step": 199 + }, + { + "epoch": 0.2495321272613849, + "grad_norm": 0.14826313565340965, + "learning_rate": 9.935382261574527e-06, + "loss": 0.0646, + "num_tokens": 16034405.0, + "step": 200 + }, + { + "epoch": 0.25077978789769184, + "grad_norm": 0.14090118375202848, + "learning_rate": 9.934355023934808e-06, + "loss": 0.0595, + "num_tokens": 16112634.0, + "step": 201 + }, + { + "epoch": 0.2520274485339988, + "grad_norm": 0.15692035946489286, + "learning_rate": 9.933319745469117e-06, + "loss": 0.0713, + "num_tokens": 16193908.0, + "step": 202 + }, + { + "epoch": 0.25327510917030566, + "grad_norm": 0.1656646737047489, + "learning_rate": 9.932276428054723e-06, + "loss": 0.0748, + "num_tokens": 16275142.0, + "step": 203 + }, + { + "epoch": 0.2545227698066126, + "grad_norm": 0.1680255993051908, + "learning_rate": 9.931225073583476e-06, + "loss": 0.0718, + "num_tokens": 16355412.0, + "step": 204 + }, + { + "epoch": 0.2557704304429195, + "grad_norm": 0.1646716198896169, + "learning_rate": 9.930165683961803e-06, + "loss": 0.0661, + "num_tokens": 16435568.0, + "step": 205 + }, + { + "epoch": 0.25701809107922646, + "grad_norm": 0.1565441365205928, + "learning_rate": 9.929098261110694e-06, + "loss": 0.0653, + "num_tokens": 16516116.0, + "step": 206 + }, + { + "epoch": 0.2582657517155334, + "grad_norm": 0.15611179423469423, + "learning_rate": 9.92802280696571e-06, + "loss": 0.0689, + "num_tokens": 16596327.0, + "step": 207 + }, + { + "epoch": 0.2595134123518403, + "grad_norm": 0.16424170862613105, + "learning_rate": 9.926939323476976e-06, + "loss": 0.0714, + "num_tokens": 16675904.0, + "step": 208 + }, + { + "epoch": 0.2607610729881472, + "grad_norm": 0.1496338642109506, + "learning_rate": 9.925847812609174e-06, + "loss": 0.0647, + "num_tokens": 16754483.0, + "step": 209 + }, + { + "epoch": 0.26200873362445415, + "grad_norm": 0.14882331263651497, + "learning_rate": 9.924748276341541e-06, + "loss": 0.0667, + "num_tokens": 16834373.0, + "step": 210 + }, + { + "epoch": 0.2632563942607611, + "grad_norm": 0.14757123106619455, + "learning_rate": 9.923640716667872e-06, + "loss": 0.0624, + "num_tokens": 16914378.0, + "step": 211 + }, + { + "epoch": 0.264504054897068, + "grad_norm": 0.15327214179668633, + "learning_rate": 9.922525135596507e-06, + "loss": 0.0731, + "num_tokens": 16995299.0, + "step": 212 + }, + { + "epoch": 0.2657517155333749, + "grad_norm": 0.15329303036152214, + "learning_rate": 9.92140153515033e-06, + "loss": 0.0699, + "num_tokens": 17074695.0, + "step": 213 + }, + { + "epoch": 0.26699937616968183, + "grad_norm": 0.14973454931210878, + "learning_rate": 9.92026991736677e-06, + "loss": 0.0644, + "num_tokens": 17154211.0, + "step": 214 + }, + { + "epoch": 0.26824703680598877, + "grad_norm": 0.15013258739242202, + "learning_rate": 9.919130284297791e-06, + "loss": 0.0661, + "num_tokens": 17234490.0, + "step": 215 + }, + { + "epoch": 0.2694946974422957, + "grad_norm": 0.15536001180184006, + "learning_rate": 9.917982638009891e-06, + "loss": 0.0727, + "num_tokens": 17314715.0, + "step": 216 + }, + { + "epoch": 0.27074235807860264, + "grad_norm": 0.15435069913626115, + "learning_rate": 9.916826980584103e-06, + "loss": 0.0657, + "num_tokens": 17395497.0, + "step": 217 + }, + { + "epoch": 0.2719900187149095, + "grad_norm": 0.15469096812383332, + "learning_rate": 9.91566331411598e-06, + "loss": 0.0626, + "num_tokens": 17474577.0, + "step": 218 + }, + { + "epoch": 0.27323767935121646, + "grad_norm": 0.1600274953046015, + "learning_rate": 9.914491640715603e-06, + "loss": 0.0676, + "num_tokens": 17555477.0, + "step": 219 + }, + { + "epoch": 0.2744853399875234, + "grad_norm": 0.1406463527512994, + "learning_rate": 9.913311962507569e-06, + "loss": 0.0592, + "num_tokens": 17635011.0, + "step": 220 + }, + { + "epoch": 0.2757330006238303, + "grad_norm": 0.16023616480511607, + "learning_rate": 9.912124281630991e-06, + "loss": 0.069, + "num_tokens": 17714394.0, + "step": 221 + }, + { + "epoch": 0.27698066126013726, + "grad_norm": 0.14706360693121406, + "learning_rate": 9.910928600239493e-06, + "loss": 0.0672, + "num_tokens": 17795018.0, + "step": 222 + }, + { + "epoch": 0.27822832189644414, + "grad_norm": 0.15220399575934448, + "learning_rate": 9.909724920501207e-06, + "loss": 0.0657, + "num_tokens": 17874644.0, + "step": 223 + }, + { + "epoch": 0.2794759825327511, + "grad_norm": 0.15590253629720327, + "learning_rate": 9.90851324459877e-06, + "loss": 0.0692, + "num_tokens": 17954798.0, + "step": 224 + }, + { + "epoch": 0.280723643169058, + "grad_norm": 0.15484431097082246, + "learning_rate": 9.907293574729317e-06, + "loss": 0.0645, + "num_tokens": 18034230.0, + "step": 225 + }, + { + "epoch": 0.28197130380536495, + "grad_norm": 0.1449336777934227, + "learning_rate": 9.906065913104474e-06, + "loss": 0.0665, + "num_tokens": 18113515.0, + "step": 226 + }, + { + "epoch": 0.2832189644416719, + "grad_norm": 0.14201172287080596, + "learning_rate": 9.904830261950366e-06, + "loss": 0.0615, + "num_tokens": 18193510.0, + "step": 227 + }, + { + "epoch": 0.28446662507797876, + "grad_norm": 0.1422769788921952, + "learning_rate": 9.903586623507603e-06, + "loss": 0.0584, + "num_tokens": 18273926.0, + "step": 228 + }, + { + "epoch": 0.2857142857142857, + "grad_norm": 0.1567259496080909, + "learning_rate": 9.902335000031273e-06, + "loss": 0.0622, + "num_tokens": 18354062.0, + "step": 229 + }, + { + "epoch": 0.28696194635059263, + "grad_norm": 0.1534870201147952, + "learning_rate": 9.901075393790953e-06, + "loss": 0.0666, + "num_tokens": 18434373.0, + "step": 230 + }, + { + "epoch": 0.28820960698689957, + "grad_norm": 0.14650419642450943, + "learning_rate": 9.899807807070684e-06, + "loss": 0.0635, + "num_tokens": 18513274.0, + "step": 231 + }, + { + "epoch": 0.2894572676232065, + "grad_norm": 0.14778200561533844, + "learning_rate": 9.898532242168987e-06, + "loss": 0.0615, + "num_tokens": 18592541.0, + "step": 232 + }, + { + "epoch": 0.29070492825951344, + "grad_norm": 0.14114387988548585, + "learning_rate": 9.897248701398848e-06, + "loss": 0.0588, + "num_tokens": 18671892.0, + "step": 233 + }, + { + "epoch": 0.2919525888958203, + "grad_norm": 0.16178673205752156, + "learning_rate": 9.895957187087713e-06, + "loss": 0.0635, + "num_tokens": 18751886.0, + "step": 234 + }, + { + "epoch": 0.29320024953212726, + "grad_norm": 0.14871524378309123, + "learning_rate": 9.894657701577488e-06, + "loss": 0.0658, + "num_tokens": 18832351.0, + "step": 235 + }, + { + "epoch": 0.2944479101684342, + "grad_norm": 0.1548609260643329, + "learning_rate": 9.893350247224532e-06, + "loss": 0.0675, + "num_tokens": 18912756.0, + "step": 236 + }, + { + "epoch": 0.2956955708047411, + "grad_norm": 0.1507526419619637, + "learning_rate": 9.892034826399657e-06, + "loss": 0.0601, + "num_tokens": 18993197.0, + "step": 237 + }, + { + "epoch": 0.29694323144104806, + "grad_norm": 0.156742989410175, + "learning_rate": 9.890711441488117e-06, + "loss": 0.0669, + "num_tokens": 19074321.0, + "step": 238 + }, + { + "epoch": 0.29819089207735494, + "grad_norm": 0.15435161952476084, + "learning_rate": 9.889380094889609e-06, + "loss": 0.0659, + "num_tokens": 19153748.0, + "step": 239 + }, + { + "epoch": 0.2994385527136619, + "grad_norm": 0.14843331611676852, + "learning_rate": 9.888040789018267e-06, + "loss": 0.0608, + "num_tokens": 19232902.0, + "step": 240 + }, + { + "epoch": 0.3006862133499688, + "grad_norm": 0.15991403846542185, + "learning_rate": 9.886693526302657e-06, + "loss": 0.0699, + "num_tokens": 19314211.0, + "step": 241 + }, + { + "epoch": 0.30193387398627575, + "grad_norm": 0.14343130811237734, + "learning_rate": 9.885338309185775e-06, + "loss": 0.0593, + "num_tokens": 19393486.0, + "step": 242 + }, + { + "epoch": 0.3031815346225827, + "grad_norm": 0.14976887951301165, + "learning_rate": 9.883975140125035e-06, + "loss": 0.063, + "num_tokens": 19474289.0, + "step": 243 + }, + { + "epoch": 0.30442919525888956, + "grad_norm": 0.15391003667967385, + "learning_rate": 9.88260402159228e-06, + "loss": 0.0682, + "num_tokens": 19554117.0, + "step": 244 + }, + { + "epoch": 0.3056768558951965, + "grad_norm": 0.1558428875604292, + "learning_rate": 9.88122495607376e-06, + "loss": 0.0653, + "num_tokens": 19634331.0, + "step": 245 + }, + { + "epoch": 0.30692451653150343, + "grad_norm": 0.15426290191639358, + "learning_rate": 9.879837946070138e-06, + "loss": 0.0593, + "num_tokens": 19713085.0, + "step": 246 + }, + { + "epoch": 0.30817217716781037, + "grad_norm": 0.14767406997007507, + "learning_rate": 9.878442994096481e-06, + "loss": 0.0578, + "num_tokens": 19792400.0, + "step": 247 + }, + { + "epoch": 0.3094198378041173, + "grad_norm": 0.14741065221457425, + "learning_rate": 9.87704010268226e-06, + "loss": 0.0644, + "num_tokens": 19873279.0, + "step": 248 + }, + { + "epoch": 0.3106674984404242, + "grad_norm": 0.1529316875881466, + "learning_rate": 9.87562927437134e-06, + "loss": 0.0628, + "num_tokens": 19953306.0, + "step": 249 + }, + { + "epoch": 0.3119151590767311, + "grad_norm": 0.15478552257088712, + "learning_rate": 9.87421051172198e-06, + "loss": 0.0656, + "num_tokens": 20033613.0, + "step": 250 + }, + { + "epoch": 0.31316281971303805, + "grad_norm": 0.1469468822446913, + "learning_rate": 9.872783817306827e-06, + "loss": 0.0617, + "num_tokens": 20113360.0, + "step": 251 + }, + { + "epoch": 0.314410480349345, + "grad_norm": 0.1539836863937035, + "learning_rate": 9.871349193712905e-06, + "loss": 0.0654, + "num_tokens": 20192829.0, + "step": 252 + }, + { + "epoch": 0.3156581409856519, + "grad_norm": 0.15325635042763217, + "learning_rate": 9.869906643541625e-06, + "loss": 0.0577, + "num_tokens": 20271785.0, + "step": 253 + }, + { + "epoch": 0.3169058016219588, + "grad_norm": 0.14852576321041067, + "learning_rate": 9.868456169408763e-06, + "loss": 0.0591, + "num_tokens": 20351250.0, + "step": 254 + }, + { + "epoch": 0.31815346225826574, + "grad_norm": 0.14853921696894695, + "learning_rate": 9.866997773944469e-06, + "loss": 0.0649, + "num_tokens": 20432591.0, + "step": 255 + }, + { + "epoch": 0.3194011228945727, + "grad_norm": 0.15093199632607562, + "learning_rate": 9.865531459793254e-06, + "loss": 0.0616, + "num_tokens": 20511910.0, + "step": 256 + }, + { + "epoch": 0.3206487835308796, + "grad_norm": 0.14360562555088788, + "learning_rate": 9.864057229613988e-06, + "loss": 0.0587, + "num_tokens": 20591960.0, + "step": 257 + }, + { + "epoch": 0.32189644416718655, + "grad_norm": 0.15453328885789705, + "learning_rate": 9.862575086079897e-06, + "loss": 0.0646, + "num_tokens": 20673004.0, + "step": 258 + }, + { + "epoch": 0.3231441048034934, + "grad_norm": 0.14025760262079745, + "learning_rate": 9.861085031878556e-06, + "loss": 0.0566, + "num_tokens": 20751778.0, + "step": 259 + }, + { + "epoch": 0.32439176543980036, + "grad_norm": 0.154221736145205, + "learning_rate": 9.859587069711883e-06, + "loss": 0.0677, + "num_tokens": 20833796.0, + "step": 260 + }, + { + "epoch": 0.3256394260761073, + "grad_norm": 0.16473194510891925, + "learning_rate": 9.858081202296133e-06, + "loss": 0.0628, + "num_tokens": 20913685.0, + "step": 261 + }, + { + "epoch": 0.32688708671241423, + "grad_norm": 0.1477312778238374, + "learning_rate": 9.856567432361903e-06, + "loss": 0.0608, + "num_tokens": 20995255.0, + "step": 262 + }, + { + "epoch": 0.32813474734872117, + "grad_norm": 0.15295805096246043, + "learning_rate": 9.855045762654115e-06, + "loss": 0.0631, + "num_tokens": 21077034.0, + "step": 263 + }, + { + "epoch": 0.32938240798502805, + "grad_norm": 0.1447398547624195, + "learning_rate": 9.853516195932014e-06, + "loss": 0.0579, + "num_tokens": 21156349.0, + "step": 264 + }, + { + "epoch": 0.330630068621335, + "grad_norm": 0.14589297009135765, + "learning_rate": 9.851978734969168e-06, + "loss": 0.0583, + "num_tokens": 21236413.0, + "step": 265 + }, + { + "epoch": 0.3318777292576419, + "grad_norm": 0.143764406382231, + "learning_rate": 9.850433382553457e-06, + "loss": 0.062, + "num_tokens": 21318138.0, + "step": 266 + }, + { + "epoch": 0.33312538989394885, + "grad_norm": 0.16011843739089876, + "learning_rate": 9.848880141487076e-06, + "loss": 0.0695, + "num_tokens": 21399813.0, + "step": 267 + }, + { + "epoch": 0.3343730505302558, + "grad_norm": 0.14438380491620342, + "learning_rate": 9.847319014586517e-06, + "loss": 0.0598, + "num_tokens": 21482627.0, + "step": 268 + }, + { + "epoch": 0.33562071116656267, + "grad_norm": 0.14004050344567048, + "learning_rate": 9.845750004682576e-06, + "loss": 0.0591, + "num_tokens": 21561686.0, + "step": 269 + }, + { + "epoch": 0.3368683718028696, + "grad_norm": 0.15403062197709771, + "learning_rate": 9.844173114620342e-06, + "loss": 0.0595, + "num_tokens": 21641830.0, + "step": 270 + }, + { + "epoch": 0.33811603243917654, + "grad_norm": 0.15466717690067627, + "learning_rate": 9.842588347259192e-06, + "loss": 0.0568, + "num_tokens": 21721224.0, + "step": 271 + }, + { + "epoch": 0.3393636930754835, + "grad_norm": 0.13856966189349137, + "learning_rate": 9.84099570547279e-06, + "loss": 0.0598, + "num_tokens": 21801699.0, + "step": 272 + }, + { + "epoch": 0.3406113537117904, + "grad_norm": 0.15061899204640403, + "learning_rate": 9.839395192149077e-06, + "loss": 0.0591, + "num_tokens": 21881771.0, + "step": 273 + }, + { + "epoch": 0.3418590143480973, + "grad_norm": 0.14228872056371486, + "learning_rate": 9.837786810190268e-06, + "loss": 0.0608, + "num_tokens": 21961323.0, + "step": 274 + }, + { + "epoch": 0.3431066749844042, + "grad_norm": 0.16567790027277843, + "learning_rate": 9.836170562512844e-06, + "loss": 0.0608, + "num_tokens": 22041013.0, + "step": 275 + }, + { + "epoch": 0.34435433562071116, + "grad_norm": 0.14395086051397554, + "learning_rate": 9.83454645204755e-06, + "loss": 0.0581, + "num_tokens": 22119949.0, + "step": 276 + }, + { + "epoch": 0.3456019962570181, + "grad_norm": 0.1516359303870716, + "learning_rate": 9.832914481739391e-06, + "loss": 0.0634, + "num_tokens": 22200178.0, + "step": 277 + }, + { + "epoch": 0.34684965689332503, + "grad_norm": 0.16105443817398513, + "learning_rate": 9.831274654547623e-06, + "loss": 0.0626, + "num_tokens": 22279674.0, + "step": 278 + }, + { + "epoch": 0.34809731752963197, + "grad_norm": 0.1590038648782175, + "learning_rate": 9.829626973445745e-06, + "loss": 0.0635, + "num_tokens": 22360285.0, + "step": 279 + }, + { + "epoch": 0.34934497816593885, + "grad_norm": 0.14937561999750967, + "learning_rate": 9.827971441421504e-06, + "loss": 0.0613, + "num_tokens": 22442292.0, + "step": 280 + }, + { + "epoch": 0.3505926388022458, + "grad_norm": 0.1474165532101385, + "learning_rate": 9.826308061476878e-06, + "loss": 0.0565, + "num_tokens": 22521661.0, + "step": 281 + }, + { + "epoch": 0.3518402994385527, + "grad_norm": 0.15035582630080754, + "learning_rate": 9.824636836628078e-06, + "loss": 0.0624, + "num_tokens": 22601095.0, + "step": 282 + }, + { + "epoch": 0.35308796007485965, + "grad_norm": 0.14438687507583545, + "learning_rate": 9.822957769905544e-06, + "loss": 0.0573, + "num_tokens": 22681539.0, + "step": 283 + }, + { + "epoch": 0.3543356207111666, + "grad_norm": 0.14358016685030078, + "learning_rate": 9.821270864353924e-06, + "loss": 0.0532, + "num_tokens": 22760996.0, + "step": 284 + }, + { + "epoch": 0.35558328134747347, + "grad_norm": 0.14707615741317387, + "learning_rate": 9.819576123032092e-06, + "loss": 0.0603, + "num_tokens": 22841748.0, + "step": 285 + }, + { + "epoch": 0.3568309419837804, + "grad_norm": 0.14812718809993014, + "learning_rate": 9.817873549013127e-06, + "loss": 0.0592, + "num_tokens": 22922667.0, + "step": 286 + }, + { + "epoch": 0.35807860262008734, + "grad_norm": 0.1649467927673721, + "learning_rate": 9.816163145384308e-06, + "loss": 0.0554, + "num_tokens": 23002249.0, + "step": 287 + }, + { + "epoch": 0.3593262632563943, + "grad_norm": 0.15358093663493297, + "learning_rate": 9.814444915247115e-06, + "loss": 0.0632, + "num_tokens": 23083209.0, + "step": 288 + }, + { + "epoch": 0.3605739238927012, + "grad_norm": 0.15692040771846372, + "learning_rate": 9.81271886171722e-06, + "loss": 0.0592, + "num_tokens": 23163343.0, + "step": 289 + }, + { + "epoch": 0.3618215845290081, + "grad_norm": 0.1580641913112163, + "learning_rate": 9.810984987924477e-06, + "loss": 0.0579, + "num_tokens": 23243258.0, + "step": 290 + }, + { + "epoch": 0.363069245165315, + "grad_norm": 0.147171020680957, + "learning_rate": 9.809243297012923e-06, + "loss": 0.0622, + "num_tokens": 23325453.0, + "step": 291 + }, + { + "epoch": 0.36431690580162196, + "grad_norm": 0.15029752697042956, + "learning_rate": 9.807493792140774e-06, + "loss": 0.0575, + "num_tokens": 23406618.0, + "step": 292 + }, + { + "epoch": 0.3655645664379289, + "grad_norm": 0.1394169020427776, + "learning_rate": 9.805736476480407e-06, + "loss": 0.0565, + "num_tokens": 23485923.0, + "step": 293 + }, + { + "epoch": 0.36681222707423583, + "grad_norm": 0.14818497445465476, + "learning_rate": 9.803971353218367e-06, + "loss": 0.0526, + "num_tokens": 23565444.0, + "step": 294 + }, + { + "epoch": 0.3680598877105427, + "grad_norm": 0.14971154575642173, + "learning_rate": 9.802198425555358e-06, + "loss": 0.0636, + "num_tokens": 23647126.0, + "step": 295 + }, + { + "epoch": 0.36930754834684965, + "grad_norm": 0.16149541778586904, + "learning_rate": 9.800417696706234e-06, + "loss": 0.0636, + "num_tokens": 23727284.0, + "step": 296 + }, + { + "epoch": 0.3705552089831566, + "grad_norm": 0.14820265948993624, + "learning_rate": 9.798629169899992e-06, + "loss": 0.056, + "num_tokens": 23807739.0, + "step": 297 + }, + { + "epoch": 0.3718028696194635, + "grad_norm": 0.15131350009349487, + "learning_rate": 9.796832848379775e-06, + "loss": 0.0591, + "num_tokens": 23887839.0, + "step": 298 + }, + { + "epoch": 0.37305053025577045, + "grad_norm": 0.14986708525327327, + "learning_rate": 9.795028735402853e-06, + "loss": 0.0598, + "num_tokens": 23968437.0, + "step": 299 + }, + { + "epoch": 0.37429819089207733, + "grad_norm": 0.15808081173486005, + "learning_rate": 9.79321683424063e-06, + "loss": 0.0571, + "num_tokens": 24047656.0, + "step": 300 + }, + { + "epoch": 0.37554585152838427, + "grad_norm": 0.14958052977119102, + "learning_rate": 9.791397148178632e-06, + "loss": 0.0554, + "num_tokens": 24127038.0, + "step": 301 + }, + { + "epoch": 0.3767935121646912, + "grad_norm": 0.152499732455963, + "learning_rate": 9.789569680516497e-06, + "loss": 0.057, + "num_tokens": 24207038.0, + "step": 302 + }, + { + "epoch": 0.37804117280099814, + "grad_norm": 0.14559783006117594, + "learning_rate": 9.78773443456798e-06, + "loss": 0.0621, + "num_tokens": 24287925.0, + "step": 303 + }, + { + "epoch": 0.3792888334373051, + "grad_norm": 0.1452527426722152, + "learning_rate": 9.785891413660931e-06, + "loss": 0.0548, + "num_tokens": 24367076.0, + "step": 304 + }, + { + "epoch": 0.38053649407361195, + "grad_norm": 0.14896918649468194, + "learning_rate": 9.784040621137308e-06, + "loss": 0.0547, + "num_tokens": 24446865.0, + "step": 305 + }, + { + "epoch": 0.3817841547099189, + "grad_norm": 0.1390970861000212, + "learning_rate": 9.78218206035316e-06, + "loss": 0.0545, + "num_tokens": 24526102.0, + "step": 306 + }, + { + "epoch": 0.3830318153462258, + "grad_norm": 0.15617459548241075, + "learning_rate": 9.780315734678612e-06, + "loss": 0.0619, + "num_tokens": 24606039.0, + "step": 307 + }, + { + "epoch": 0.38427947598253276, + "grad_norm": 0.14418794697812448, + "learning_rate": 9.778441647497882e-06, + "loss": 0.0565, + "num_tokens": 24685400.0, + "step": 308 + }, + { + "epoch": 0.3855271366188397, + "grad_norm": 0.14701646779160854, + "learning_rate": 9.776559802209255e-06, + "loss": 0.0579, + "num_tokens": 24765381.0, + "step": 309 + }, + { + "epoch": 0.3867747972551466, + "grad_norm": 0.1631595272698652, + "learning_rate": 9.774670202225084e-06, + "loss": 0.0615, + "num_tokens": 24845699.0, + "step": 310 + }, + { + "epoch": 0.3880224578914535, + "grad_norm": 0.1545918579556817, + "learning_rate": 9.772772850971788e-06, + "loss": 0.0645, + "num_tokens": 24925987.0, + "step": 311 + }, + { + "epoch": 0.38927011852776044, + "grad_norm": 0.1441116111201304, + "learning_rate": 9.770867751889837e-06, + "loss": 0.0572, + "num_tokens": 25005272.0, + "step": 312 + }, + { + "epoch": 0.3905177791640674, + "grad_norm": 0.13781567161395594, + "learning_rate": 9.76895490843375e-06, + "loss": 0.0557, + "num_tokens": 25085235.0, + "step": 313 + }, + { + "epoch": 0.3917654398003743, + "grad_norm": 0.14477971945325008, + "learning_rate": 9.767034324072091e-06, + "loss": 0.0574, + "num_tokens": 25164489.0, + "step": 314 + }, + { + "epoch": 0.3930131004366812, + "grad_norm": 0.15372533695861323, + "learning_rate": 9.76510600228746e-06, + "loss": 0.0573, + "num_tokens": 25243328.0, + "step": 315 + }, + { + "epoch": 0.39426076107298813, + "grad_norm": 0.1501403033694261, + "learning_rate": 9.763169946576488e-06, + "loss": 0.0612, + "num_tokens": 25323644.0, + "step": 316 + }, + { + "epoch": 0.39550842170929507, + "grad_norm": 0.15008446129724604, + "learning_rate": 9.76122616044983e-06, + "loss": 0.0601, + "num_tokens": 25402837.0, + "step": 317 + }, + { + "epoch": 0.396756082345602, + "grad_norm": 0.15958664875841405, + "learning_rate": 9.759274647432156e-06, + "loss": 0.055, + "num_tokens": 25481850.0, + "step": 318 + }, + { + "epoch": 0.39800374298190894, + "grad_norm": 0.15101688409063593, + "learning_rate": 9.75731541106215e-06, + "loss": 0.0561, + "num_tokens": 25562517.0, + "step": 319 + }, + { + "epoch": 0.39925140361821587, + "grad_norm": 0.15012352120224384, + "learning_rate": 9.755348454892498e-06, + "loss": 0.0559, + "num_tokens": 25642299.0, + "step": 320 + }, + { + "epoch": 0.40049906425452275, + "grad_norm": 0.15137439858210344, + "learning_rate": 9.753373782489887e-06, + "loss": 0.0576, + "num_tokens": 25722205.0, + "step": 321 + }, + { + "epoch": 0.4017467248908297, + "grad_norm": 0.1506690147271384, + "learning_rate": 9.751391397434996e-06, + "loss": 0.0577, + "num_tokens": 25801893.0, + "step": 322 + }, + { + "epoch": 0.4029943855271366, + "grad_norm": 0.16103781809588258, + "learning_rate": 9.74940130332249e-06, + "loss": 0.0591, + "num_tokens": 25882061.0, + "step": 323 + }, + { + "epoch": 0.40424204616344356, + "grad_norm": 0.14444822336784965, + "learning_rate": 9.747403503761006e-06, + "loss": 0.0525, + "num_tokens": 25960184.0, + "step": 324 + }, + { + "epoch": 0.4054897067997505, + "grad_norm": 0.15166828709851793, + "learning_rate": 9.74539800237316e-06, + "loss": 0.0565, + "num_tokens": 26040338.0, + "step": 325 + }, + { + "epoch": 0.4067373674360574, + "grad_norm": 0.1504474187463855, + "learning_rate": 9.743384802795535e-06, + "loss": 0.0597, + "num_tokens": 26121905.0, + "step": 326 + }, + { + "epoch": 0.4079850280723643, + "grad_norm": 0.13344139836523075, + "learning_rate": 9.741363908678669e-06, + "loss": 0.0546, + "num_tokens": 26202255.0, + "step": 327 + }, + { + "epoch": 0.40923268870867124, + "grad_norm": 0.14377591000899273, + "learning_rate": 9.739335323687052e-06, + "loss": 0.0628, + "num_tokens": 26283068.0, + "step": 328 + }, + { + "epoch": 0.4104803493449782, + "grad_norm": 0.15940785319665565, + "learning_rate": 9.737299051499125e-06, + "loss": 0.0563, + "num_tokens": 26363408.0, + "step": 329 + }, + { + "epoch": 0.4117280099812851, + "grad_norm": 0.14599902917885316, + "learning_rate": 9.735255095807263e-06, + "loss": 0.059, + "num_tokens": 26444199.0, + "step": 330 + }, + { + "epoch": 0.412975670617592, + "grad_norm": 0.1351984864753291, + "learning_rate": 9.733203460317777e-06, + "loss": 0.0584, + "num_tokens": 26524527.0, + "step": 331 + }, + { + "epoch": 0.41422333125389893, + "grad_norm": 0.1397398203048671, + "learning_rate": 9.731144148750898e-06, + "loss": 0.0502, + "num_tokens": 26604842.0, + "step": 332 + }, + { + "epoch": 0.41547099189020587, + "grad_norm": 0.1502653858474547, + "learning_rate": 9.729077164840784e-06, + "loss": 0.0617, + "num_tokens": 26685166.0, + "step": 333 + }, + { + "epoch": 0.4167186525265128, + "grad_norm": 0.15884597841687997, + "learning_rate": 9.727002512335502e-06, + "loss": 0.0579, + "num_tokens": 26766562.0, + "step": 334 + }, + { + "epoch": 0.41796631316281974, + "grad_norm": 0.15085968982512207, + "learning_rate": 9.724920194997022e-06, + "loss": 0.0563, + "num_tokens": 26845821.0, + "step": 335 + }, + { + "epoch": 0.4192139737991266, + "grad_norm": 0.14636626725897792, + "learning_rate": 9.722830216601217e-06, + "loss": 0.0573, + "num_tokens": 26925802.0, + "step": 336 + }, + { + "epoch": 0.42046163443543355, + "grad_norm": 0.15213895413855882, + "learning_rate": 9.720732580937848e-06, + "loss": 0.0573, + "num_tokens": 27004757.0, + "step": 337 + }, + { + "epoch": 0.4217092950717405, + "grad_norm": 0.15478235225939369, + "learning_rate": 9.718627291810561e-06, + "loss": 0.0558, + "num_tokens": 27085488.0, + "step": 338 + }, + { + "epoch": 0.4229569557080474, + "grad_norm": 0.14333668524575108, + "learning_rate": 9.716514353036884e-06, + "loss": 0.0529, + "num_tokens": 27165382.0, + "step": 339 + }, + { + "epoch": 0.42420461634435436, + "grad_norm": 0.13816120021039435, + "learning_rate": 9.714393768448214e-06, + "loss": 0.055, + "num_tokens": 27244847.0, + "step": 340 + }, + { + "epoch": 0.42545227698066124, + "grad_norm": 0.1516209836461398, + "learning_rate": 9.712265541889809e-06, + "loss": 0.0571, + "num_tokens": 27326623.0, + "step": 341 + }, + { + "epoch": 0.4266999376169682, + "grad_norm": 0.14387239194414297, + "learning_rate": 9.710129677220788e-06, + "loss": 0.057, + "num_tokens": 27408759.0, + "step": 342 + }, + { + "epoch": 0.4279475982532751, + "grad_norm": 0.14913332717460548, + "learning_rate": 9.707986178314123e-06, + "loss": 0.0604, + "num_tokens": 27489615.0, + "step": 343 + }, + { + "epoch": 0.42919525888958204, + "grad_norm": 0.14600137001097122, + "learning_rate": 9.705835049056621e-06, + "loss": 0.0577, + "num_tokens": 27569276.0, + "step": 344 + }, + { + "epoch": 0.430442919525889, + "grad_norm": 0.16253517587195007, + "learning_rate": 9.70367629334893e-06, + "loss": 0.0593, + "num_tokens": 27650089.0, + "step": 345 + }, + { + "epoch": 0.43169058016219586, + "grad_norm": 0.15144799683797194, + "learning_rate": 9.701509915105527e-06, + "loss": 0.0551, + "num_tokens": 27731133.0, + "step": 346 + }, + { + "epoch": 0.4329382407985028, + "grad_norm": 0.1457681651429185, + "learning_rate": 9.699335918254714e-06, + "loss": 0.0539, + "num_tokens": 27810902.0, + "step": 347 + }, + { + "epoch": 0.43418590143480973, + "grad_norm": 0.1463053074053871, + "learning_rate": 9.6971543067386e-06, + "loss": 0.0588, + "num_tokens": 27892340.0, + "step": 348 + }, + { + "epoch": 0.43543356207111666, + "grad_norm": 0.1520505948725208, + "learning_rate": 9.694965084513106e-06, + "loss": 0.0562, + "num_tokens": 27973118.0, + "step": 349 + }, + { + "epoch": 0.4366812227074236, + "grad_norm": 0.15820851987269624, + "learning_rate": 9.692768255547957e-06, + "loss": 0.0567, + "num_tokens": 28053723.0, + "step": 350 + }, + { + "epoch": 0.4379288833437305, + "grad_norm": 0.1515133004587777, + "learning_rate": 9.690563823826666e-06, + "loss": 0.06, + "num_tokens": 28134449.0, + "step": 351 + }, + { + "epoch": 0.4391765439800374, + "grad_norm": 0.14702095630330445, + "learning_rate": 9.688351793346533e-06, + "loss": 0.0561, + "num_tokens": 28214373.0, + "step": 352 + }, + { + "epoch": 0.44042420461634435, + "grad_norm": 0.14263520426345258, + "learning_rate": 9.68613216811864e-06, + "loss": 0.0581, + "num_tokens": 28294827.0, + "step": 353 + }, + { + "epoch": 0.4416718652526513, + "grad_norm": 0.15433805512072737, + "learning_rate": 9.683904952167837e-06, + "loss": 0.0536, + "num_tokens": 28374441.0, + "step": 354 + }, + { + "epoch": 0.4429195258889582, + "grad_norm": 0.14704915295293733, + "learning_rate": 9.681670149532739e-06, + "loss": 0.0535, + "num_tokens": 28452997.0, + "step": 355 + }, + { + "epoch": 0.4441671865252651, + "grad_norm": 0.14891895587500484, + "learning_rate": 9.67942776426572e-06, + "loss": 0.0576, + "num_tokens": 28533011.0, + "step": 356 + }, + { + "epoch": 0.44541484716157204, + "grad_norm": 0.14800382217071825, + "learning_rate": 9.677177800432903e-06, + "loss": 0.056, + "num_tokens": 28612714.0, + "step": 357 + }, + { + "epoch": 0.44666250779787897, + "grad_norm": 0.1484302906683845, + "learning_rate": 9.67492026211415e-06, + "loss": 0.0541, + "num_tokens": 28692174.0, + "step": 358 + }, + { + "epoch": 0.4479101684341859, + "grad_norm": 0.1473133403449387, + "learning_rate": 9.672655153403064e-06, + "loss": 0.0556, + "num_tokens": 28771843.0, + "step": 359 + }, + { + "epoch": 0.44915782907049284, + "grad_norm": 0.13463944988028403, + "learning_rate": 9.670382478406967e-06, + "loss": 0.0562, + "num_tokens": 28851787.0, + "step": 360 + }, + { + "epoch": 0.4504054897067998, + "grad_norm": 0.14571672370885358, + "learning_rate": 9.66810224124691e-06, + "loss": 0.0561, + "num_tokens": 28931886.0, + "step": 361 + }, + { + "epoch": 0.45165315034310666, + "grad_norm": 0.13982592828240858, + "learning_rate": 9.665814446057652e-06, + "loss": 0.0527, + "num_tokens": 29011364.0, + "step": 362 + }, + { + "epoch": 0.4529008109794136, + "grad_norm": 0.1468605138111856, + "learning_rate": 9.663519096987653e-06, + "loss": 0.0569, + "num_tokens": 29091203.0, + "step": 363 + }, + { + "epoch": 0.45414847161572053, + "grad_norm": 0.15284313337192354, + "learning_rate": 9.661216198199078e-06, + "loss": 0.0553, + "num_tokens": 29172304.0, + "step": 364 + }, + { + "epoch": 0.45539613225202746, + "grad_norm": 0.15300814933585585, + "learning_rate": 9.658905753867778e-06, + "loss": 0.0608, + "num_tokens": 29252117.0, + "step": 365 + }, + { + "epoch": 0.4566437928883344, + "grad_norm": 0.16189213439882416, + "learning_rate": 9.656587768183287e-06, + "loss": 0.0546, + "num_tokens": 29332772.0, + "step": 366 + }, + { + "epoch": 0.4578914535246413, + "grad_norm": 0.14196790848023397, + "learning_rate": 9.654262245348813e-06, + "loss": 0.0517, + "num_tokens": 29414702.0, + "step": 367 + }, + { + "epoch": 0.4591391141609482, + "grad_norm": 0.14672195499400306, + "learning_rate": 9.651929189581233e-06, + "loss": 0.0595, + "num_tokens": 29495754.0, + "step": 368 + }, + { + "epoch": 0.46038677479725515, + "grad_norm": 0.14633512516697522, + "learning_rate": 9.649588605111082e-06, + "loss": 0.0554, + "num_tokens": 29575118.0, + "step": 369 + }, + { + "epoch": 0.4616344354335621, + "grad_norm": 0.14073644904651714, + "learning_rate": 9.647240496182545e-06, + "loss": 0.0556, + "num_tokens": 29655784.0, + "step": 370 + }, + { + "epoch": 0.462882096069869, + "grad_norm": 0.15125788764011439, + "learning_rate": 9.644884867053455e-06, + "loss": 0.0549, + "num_tokens": 29736212.0, + "step": 371 + }, + { + "epoch": 0.4641297567061759, + "grad_norm": 0.1498730140799902, + "learning_rate": 9.64252172199528e-06, + "loss": 0.0567, + "num_tokens": 29816732.0, + "step": 372 + }, + { + "epoch": 0.46537741734248284, + "grad_norm": 0.13559146848095813, + "learning_rate": 9.640151065293117e-06, + "loss": 0.0546, + "num_tokens": 29896880.0, + "step": 373 + }, + { + "epoch": 0.46662507797878977, + "grad_norm": 0.1640344913303149, + "learning_rate": 9.63777290124568e-06, + "loss": 0.0701, + "num_tokens": 29980295.0, + "step": 374 + }, + { + "epoch": 0.4678727386150967, + "grad_norm": 0.14536897470480084, + "learning_rate": 9.635387234165303e-06, + "loss": 0.0564, + "num_tokens": 30061645.0, + "step": 375 + }, + { + "epoch": 0.46912039925140364, + "grad_norm": 0.13332481481430616, + "learning_rate": 9.632994068377916e-06, + "loss": 0.0485, + "num_tokens": 30141179.0, + "step": 376 + }, + { + "epoch": 0.4703680598877105, + "grad_norm": 0.15492389456107522, + "learning_rate": 9.63059340822306e-06, + "loss": 0.0559, + "num_tokens": 30220539.0, + "step": 377 + }, + { + "epoch": 0.47161572052401746, + "grad_norm": 0.13840934949647143, + "learning_rate": 9.628185258053852e-06, + "loss": 0.0566, + "num_tokens": 30301422.0, + "step": 378 + }, + { + "epoch": 0.4728633811603244, + "grad_norm": 0.14291506474305504, + "learning_rate": 9.625769622236995e-06, + "loss": 0.0545, + "num_tokens": 30381142.0, + "step": 379 + }, + { + "epoch": 0.4741110417966313, + "grad_norm": 0.14000086512375753, + "learning_rate": 9.623346505152771e-06, + "loss": 0.0508, + "num_tokens": 30460521.0, + "step": 380 + }, + { + "epoch": 0.47535870243293826, + "grad_norm": 0.14400483775828635, + "learning_rate": 9.620915911195021e-06, + "loss": 0.0504, + "num_tokens": 30539451.0, + "step": 381 + }, + { + "epoch": 0.47660636306924514, + "grad_norm": 0.15315155933409735, + "learning_rate": 9.618477844771147e-06, + "loss": 0.0558, + "num_tokens": 30618847.0, + "step": 382 + }, + { + "epoch": 0.4778540237055521, + "grad_norm": 0.13374765928710877, + "learning_rate": 9.6160323103021e-06, + "loss": 0.0584, + "num_tokens": 30697880.0, + "step": 383 + }, + { + "epoch": 0.479101684341859, + "grad_norm": 0.13578880192469092, + "learning_rate": 9.613579312222377e-06, + "loss": 0.0495, + "num_tokens": 30776740.0, + "step": 384 + }, + { + "epoch": 0.48034934497816595, + "grad_norm": 0.14830879266525465, + "learning_rate": 9.611118854979998e-06, + "loss": 0.0588, + "num_tokens": 30858807.0, + "step": 385 + }, + { + "epoch": 0.4815970056144729, + "grad_norm": 0.13811959093377515, + "learning_rate": 9.608650943036522e-06, + "loss": 0.0563, + "num_tokens": 30938478.0, + "step": 386 + }, + { + "epoch": 0.48284466625077976, + "grad_norm": 0.152755680501879, + "learning_rate": 9.606175580867016e-06, + "loss": 0.0564, + "num_tokens": 31019990.0, + "step": 387 + }, + { + "epoch": 0.4840923268870867, + "grad_norm": 0.14052581035511705, + "learning_rate": 9.60369277296006e-06, + "loss": 0.0559, + "num_tokens": 31100307.0, + "step": 388 + }, + { + "epoch": 0.48533998752339363, + "grad_norm": 0.1486718670745852, + "learning_rate": 9.601202523817735e-06, + "loss": 0.0573, + "num_tokens": 31179820.0, + "step": 389 + }, + { + "epoch": 0.48658764815970057, + "grad_norm": 0.14296008439795943, + "learning_rate": 9.598704837955618e-06, + "loss": 0.0492, + "num_tokens": 31258626.0, + "step": 390 + }, + { + "epoch": 0.4878353087960075, + "grad_norm": 0.15097962419838978, + "learning_rate": 9.596199719902765e-06, + "loss": 0.0541, + "num_tokens": 31339814.0, + "step": 391 + }, + { + "epoch": 0.4890829694323144, + "grad_norm": 0.1379963039474492, + "learning_rate": 9.593687174201715e-06, + "loss": 0.053, + "num_tokens": 31419517.0, + "step": 392 + }, + { + "epoch": 0.4903306300686213, + "grad_norm": 0.13238251476670143, + "learning_rate": 9.59116720540847e-06, + "loss": 0.0492, + "num_tokens": 31498667.0, + "step": 393 + }, + { + "epoch": 0.49157829070492826, + "grad_norm": 0.1577098734056635, + "learning_rate": 9.588639818092498e-06, + "loss": 0.0574, + "num_tokens": 31579341.0, + "step": 394 + }, + { + "epoch": 0.4928259513412352, + "grad_norm": 0.1400848617210015, + "learning_rate": 9.586105016836713e-06, + "loss": 0.0547, + "num_tokens": 31660051.0, + "step": 395 + }, + { + "epoch": 0.4940736119775421, + "grad_norm": 0.14347417674815124, + "learning_rate": 9.58356280623748e-06, + "loss": 0.0561, + "num_tokens": 31740543.0, + "step": 396 + }, + { + "epoch": 0.495321272613849, + "grad_norm": 0.146326148226546, + "learning_rate": 9.58101319090459e-06, + "loss": 0.0534, + "num_tokens": 31821119.0, + "step": 397 + }, + { + "epoch": 0.49656893325015594, + "grad_norm": 0.13794827612031768, + "learning_rate": 9.578456175461272e-06, + "loss": 0.0517, + "num_tokens": 31901059.0, + "step": 398 + }, + { + "epoch": 0.4978165938864629, + "grad_norm": 0.1335465918733181, + "learning_rate": 9.575891764544162e-06, + "loss": 0.0501, + "num_tokens": 31981392.0, + "step": 399 + }, + { + "epoch": 0.4990642545227698, + "grad_norm": 0.14668973583020378, + "learning_rate": 9.573319962803317e-06, + "loss": 0.0534, + "num_tokens": 32061494.0, + "step": 400 + }, + { + "epoch": 0.5003119151590767, + "grad_norm": 0.15441275896158793, + "learning_rate": 9.570740774902189e-06, + "loss": 0.0616, + "num_tokens": 32142304.0, + "step": 401 + }, + { + "epoch": 0.5015595757953837, + "grad_norm": 0.14955081145369936, + "learning_rate": 9.568154205517623e-06, + "loss": 0.0522, + "num_tokens": 32222526.0, + "step": 402 + }, + { + "epoch": 0.5028072364316906, + "grad_norm": 0.1403875094386206, + "learning_rate": 9.565560259339856e-06, + "loss": 0.0485, + "num_tokens": 32302513.0, + "step": 403 + }, + { + "epoch": 0.5040548970679976, + "grad_norm": 0.13683297238023728, + "learning_rate": 9.562958941072491e-06, + "loss": 0.0536, + "num_tokens": 32382934.0, + "step": 404 + }, + { + "epoch": 0.5053025577043044, + "grad_norm": 0.14711235416131968, + "learning_rate": 9.560350255432508e-06, + "loss": 0.0542, + "num_tokens": 32464107.0, + "step": 405 + }, + { + "epoch": 0.5065502183406113, + "grad_norm": 0.14916320835261607, + "learning_rate": 9.557734207150243e-06, + "loss": 0.0539, + "num_tokens": 32543923.0, + "step": 406 + }, + { + "epoch": 0.5077978789769183, + "grad_norm": 0.13980467839315625, + "learning_rate": 9.55511080096938e-06, + "loss": 0.049, + "num_tokens": 32623438.0, + "step": 407 + }, + { + "epoch": 0.5090455396132252, + "grad_norm": 0.15152035984072354, + "learning_rate": 9.552480041646949e-06, + "loss": 0.055, + "num_tokens": 32703162.0, + "step": 408 + }, + { + "epoch": 0.5102932002495322, + "grad_norm": 0.1388088923573849, + "learning_rate": 9.549841933953308e-06, + "loss": 0.0487, + "num_tokens": 32782526.0, + "step": 409 + }, + { + "epoch": 0.511540860885839, + "grad_norm": 0.1384190124870507, + "learning_rate": 9.547196482672148e-06, + "loss": 0.0571, + "num_tokens": 32863740.0, + "step": 410 + }, + { + "epoch": 0.5127885215221459, + "grad_norm": 0.14052473157074324, + "learning_rate": 9.544543692600473e-06, + "loss": 0.0534, + "num_tokens": 32942880.0, + "step": 411 + }, + { + "epoch": 0.5140361821584529, + "grad_norm": 0.13353121925025832, + "learning_rate": 9.541883568548588e-06, + "loss": 0.0504, + "num_tokens": 33022952.0, + "step": 412 + }, + { + "epoch": 0.5152838427947598, + "grad_norm": 0.14750782511179328, + "learning_rate": 9.539216115340106e-06, + "loss": 0.0523, + "num_tokens": 33101857.0, + "step": 413 + }, + { + "epoch": 0.5165315034310668, + "grad_norm": 0.14466406906288587, + "learning_rate": 9.536541337811923e-06, + "loss": 0.0558, + "num_tokens": 33181714.0, + "step": 414 + }, + { + "epoch": 0.5177791640673737, + "grad_norm": 0.15553123717162787, + "learning_rate": 9.533859240814221e-06, + "loss": 0.055, + "num_tokens": 33261500.0, + "step": 415 + }, + { + "epoch": 0.5190268247036806, + "grad_norm": 0.13310331465324876, + "learning_rate": 9.531169829210452e-06, + "loss": 0.0501, + "num_tokens": 33340399.0, + "step": 416 + }, + { + "epoch": 0.5202744853399875, + "grad_norm": 0.14079149042098654, + "learning_rate": 9.528473107877333e-06, + "loss": 0.0497, + "num_tokens": 33420893.0, + "step": 417 + }, + { + "epoch": 0.5215221459762944, + "grad_norm": 0.1401444099939849, + "learning_rate": 9.525769081704835e-06, + "loss": 0.0526, + "num_tokens": 33500831.0, + "step": 418 + }, + { + "epoch": 0.5227698066126014, + "grad_norm": 0.13674566510520145, + "learning_rate": 9.523057755596174e-06, + "loss": 0.0524, + "num_tokens": 33581334.0, + "step": 419 + }, + { + "epoch": 0.5240174672489083, + "grad_norm": 0.14204922606844075, + "learning_rate": 9.520339134467803e-06, + "loss": 0.0538, + "num_tokens": 33661326.0, + "step": 420 + }, + { + "epoch": 0.5252651278852152, + "grad_norm": 0.1558664826050407, + "learning_rate": 9.517613223249402e-06, + "loss": 0.0572, + "num_tokens": 33741261.0, + "step": 421 + }, + { + "epoch": 0.5265127885215222, + "grad_norm": 0.15461038514035907, + "learning_rate": 9.514880026883877e-06, + "loss": 0.0575, + "num_tokens": 33822933.0, + "step": 422 + }, + { + "epoch": 0.527760449157829, + "grad_norm": 0.1607295696047527, + "learning_rate": 9.512139550327338e-06, + "loss": 0.0586, + "num_tokens": 33903895.0, + "step": 423 + }, + { + "epoch": 0.529008109794136, + "grad_norm": 0.1498493476829081, + "learning_rate": 9.509391798549091e-06, + "loss": 0.0579, + "num_tokens": 33984607.0, + "step": 424 + }, + { + "epoch": 0.5302557704304429, + "grad_norm": 0.14848887896859914, + "learning_rate": 9.50663677653165e-06, + "loss": 0.0526, + "num_tokens": 34064151.0, + "step": 425 + }, + { + "epoch": 0.5315034310667498, + "grad_norm": 0.14388277136012045, + "learning_rate": 9.503874489270697e-06, + "loss": 0.0507, + "num_tokens": 34144372.0, + "step": 426 + }, + { + "epoch": 0.5327510917030568, + "grad_norm": 0.14645441650969096, + "learning_rate": 9.501104941775094e-06, + "loss": 0.0582, + "num_tokens": 34224279.0, + "step": 427 + }, + { + "epoch": 0.5339987523393637, + "grad_norm": 0.14459393929071007, + "learning_rate": 9.49832813906687e-06, + "loss": 0.0536, + "num_tokens": 34303390.0, + "step": 428 + }, + { + "epoch": 0.5352464129756707, + "grad_norm": 0.12857512112686587, + "learning_rate": 9.495544086181204e-06, + "loss": 0.0485, + "num_tokens": 34383555.0, + "step": 429 + }, + { + "epoch": 0.5364940736119775, + "grad_norm": 0.1314484816771736, + "learning_rate": 9.49275278816643e-06, + "loss": 0.0498, + "num_tokens": 34463812.0, + "step": 430 + }, + { + "epoch": 0.5377417342482844, + "grad_norm": 0.13947556078476245, + "learning_rate": 9.489954250084011e-06, + "loss": 0.0545, + "num_tokens": 34544321.0, + "step": 431 + }, + { + "epoch": 0.5389893948845914, + "grad_norm": 0.1436721443948016, + "learning_rate": 9.487148477008545e-06, + "loss": 0.0525, + "num_tokens": 34624266.0, + "step": 432 + }, + { + "epoch": 0.5402370555208983, + "grad_norm": 0.13998074131625612, + "learning_rate": 9.484335474027744e-06, + "loss": 0.0514, + "num_tokens": 34704134.0, + "step": 433 + }, + { + "epoch": 0.5414847161572053, + "grad_norm": 0.14871361303836866, + "learning_rate": 9.481515246242435e-06, + "loss": 0.053, + "num_tokens": 34784190.0, + "step": 434 + }, + { + "epoch": 0.5427323767935122, + "grad_norm": 0.14624886222954445, + "learning_rate": 9.478687798766544e-06, + "loss": 0.0531, + "num_tokens": 34864130.0, + "step": 435 + }, + { + "epoch": 0.543980037429819, + "grad_norm": 0.13641332408321277, + "learning_rate": 9.475853136727086e-06, + "loss": 0.0492, + "num_tokens": 34944204.0, + "step": 436 + }, + { + "epoch": 0.545227698066126, + "grad_norm": 0.14134926386463173, + "learning_rate": 9.473011265264159e-06, + "loss": 0.0526, + "num_tokens": 35024607.0, + "step": 437 + }, + { + "epoch": 0.5464753587024329, + "grad_norm": 0.14553055880700336, + "learning_rate": 9.470162189530938e-06, + "loss": 0.0534, + "num_tokens": 35104390.0, + "step": 438 + }, + { + "epoch": 0.5477230193387399, + "grad_norm": 0.1437000622603961, + "learning_rate": 9.467305914693658e-06, + "loss": 0.0513, + "num_tokens": 35184822.0, + "step": 439 + }, + { + "epoch": 0.5489706799750468, + "grad_norm": 0.14612828451027657, + "learning_rate": 9.464442445931605e-06, + "loss": 0.0497, + "num_tokens": 35264499.0, + "step": 440 + }, + { + "epoch": 0.5502183406113537, + "grad_norm": 0.14828985114792528, + "learning_rate": 9.461571788437119e-06, + "loss": 0.0533, + "num_tokens": 35345078.0, + "step": 441 + }, + { + "epoch": 0.5514660012476607, + "grad_norm": 0.1374551082004164, + "learning_rate": 9.458693947415564e-06, + "loss": 0.0466, + "num_tokens": 35424570.0, + "step": 442 + }, + { + "epoch": 0.5527136618839675, + "grad_norm": 0.147677054111561, + "learning_rate": 9.455808928085339e-06, + "loss": 0.0542, + "num_tokens": 35505946.0, + "step": 443 + }, + { + "epoch": 0.5539613225202745, + "grad_norm": 0.13849172697877132, + "learning_rate": 9.452916735677857e-06, + "loss": 0.0485, + "num_tokens": 35586346.0, + "step": 444 + }, + { + "epoch": 0.5552089831565814, + "grad_norm": 0.15297999265119042, + "learning_rate": 9.450017375437534e-06, + "loss": 0.0531, + "num_tokens": 35665891.0, + "step": 445 + }, + { + "epoch": 0.5564566437928883, + "grad_norm": 0.15501802210849228, + "learning_rate": 9.44711085262179e-06, + "loss": 0.0573, + "num_tokens": 35746479.0, + "step": 446 + }, + { + "epoch": 0.5577043044291953, + "grad_norm": 0.13088367378601265, + "learning_rate": 9.444197172501025e-06, + "loss": 0.0484, + "num_tokens": 35826159.0, + "step": 447 + }, + { + "epoch": 0.5589519650655022, + "grad_norm": 0.16063425009788931, + "learning_rate": 9.441276340358624e-06, + "loss": 0.0576, + "num_tokens": 35906837.0, + "step": 448 + }, + { + "epoch": 0.5601996257018091, + "grad_norm": 0.1307271068905107, + "learning_rate": 9.438348361490938e-06, + "loss": 0.0525, + "num_tokens": 35987331.0, + "step": 449 + }, + { + "epoch": 0.561447286338116, + "grad_norm": 0.13171725020454944, + "learning_rate": 9.43541324120728e-06, + "loss": 0.0474, + "num_tokens": 36066069.0, + "step": 450 + }, + { + "epoch": 0.5626949469744229, + "grad_norm": 0.1309581854874849, + "learning_rate": 9.432470984829908e-06, + "loss": 0.0465, + "num_tokens": 36144807.0, + "step": 451 + }, + { + "epoch": 0.5639426076107299, + "grad_norm": 0.14475267528300384, + "learning_rate": 9.429521597694023e-06, + "loss": 0.0553, + "num_tokens": 36226188.0, + "step": 452 + }, + { + "epoch": 0.5651902682470368, + "grad_norm": 0.1425625122178402, + "learning_rate": 9.426565085147755e-06, + "loss": 0.0511, + "num_tokens": 36305107.0, + "step": 453 + }, + { + "epoch": 0.5664379288833438, + "grad_norm": 0.13616136813599408, + "learning_rate": 9.423601452552153e-06, + "loss": 0.0555, + "num_tokens": 36386076.0, + "step": 454 + }, + { + "epoch": 0.5676855895196506, + "grad_norm": 0.14376718432927843, + "learning_rate": 9.420630705281182e-06, + "loss": 0.0501, + "num_tokens": 36466132.0, + "step": 455 + }, + { + "epoch": 0.5689332501559575, + "grad_norm": 0.13639207933313066, + "learning_rate": 9.417652848721704e-06, + "loss": 0.0549, + "num_tokens": 36547317.0, + "step": 456 + }, + { + "epoch": 0.5701809107922645, + "grad_norm": 0.13858764737031773, + "learning_rate": 9.41466788827347e-06, + "loss": 0.0545, + "num_tokens": 36626984.0, + "step": 457 + }, + { + "epoch": 0.5714285714285714, + "grad_norm": 0.14505131114240344, + "learning_rate": 9.411675829349119e-06, + "loss": 0.0521, + "num_tokens": 36707170.0, + "step": 458 + }, + { + "epoch": 0.5726762320648784, + "grad_norm": 0.1475234995649541, + "learning_rate": 9.408676677374158e-06, + "loss": 0.0506, + "num_tokens": 36787101.0, + "step": 459 + }, + { + "epoch": 0.5739238927011853, + "grad_norm": 0.13633501610880747, + "learning_rate": 9.405670437786953e-06, + "loss": 0.0493, + "num_tokens": 36867353.0, + "step": 460 + }, + { + "epoch": 0.5751715533374921, + "grad_norm": 0.14436718039231583, + "learning_rate": 9.402657116038728e-06, + "loss": 0.0546, + "num_tokens": 36948883.0, + "step": 461 + }, + { + "epoch": 0.5764192139737991, + "grad_norm": 0.14203475439871666, + "learning_rate": 9.399636717593545e-06, + "loss": 0.0476, + "num_tokens": 37028885.0, + "step": 462 + }, + { + "epoch": 0.577666874610106, + "grad_norm": 0.14533730733608163, + "learning_rate": 9.3966092479283e-06, + "loss": 0.0516, + "num_tokens": 37108694.0, + "step": 463 + }, + { + "epoch": 0.578914535246413, + "grad_norm": 0.13808604624352258, + "learning_rate": 9.39357471253271e-06, + "loss": 0.0488, + "num_tokens": 37188517.0, + "step": 464 + }, + { + "epoch": 0.5801621958827199, + "grad_norm": 0.1461795781132845, + "learning_rate": 9.390533116909305e-06, + "loss": 0.054, + "num_tokens": 37269409.0, + "step": 465 + }, + { + "epoch": 0.5814098565190269, + "grad_norm": 0.14456406121879972, + "learning_rate": 9.387484466573417e-06, + "loss": 0.0513, + "num_tokens": 37349786.0, + "step": 466 + }, + { + "epoch": 0.5826575171553338, + "grad_norm": 0.15015842796580345, + "learning_rate": 9.38442876705317e-06, + "loss": 0.0488, + "num_tokens": 37428973.0, + "step": 467 + }, + { + "epoch": 0.5839051777916406, + "grad_norm": 0.14230447835176302, + "learning_rate": 9.381366023889475e-06, + "loss": 0.052, + "num_tokens": 37509348.0, + "step": 468 + }, + { + "epoch": 0.5851528384279476, + "grad_norm": 0.1450811926352791, + "learning_rate": 9.378296242636002e-06, + "loss": 0.0545, + "num_tokens": 37590500.0, + "step": 469 + }, + { + "epoch": 0.5864004990642545, + "grad_norm": 0.13517059402851564, + "learning_rate": 9.375219428859202e-06, + "loss": 0.05, + "num_tokens": 37672765.0, + "step": 470 + }, + { + "epoch": 0.5876481597005615, + "grad_norm": 0.14229031493976188, + "learning_rate": 9.372135588138262e-06, + "loss": 0.0508, + "num_tokens": 37753260.0, + "step": 471 + }, + { + "epoch": 0.5888958203368684, + "grad_norm": 0.13875205292970882, + "learning_rate": 9.369044726065121e-06, + "loss": 0.0515, + "num_tokens": 37832754.0, + "step": 472 + }, + { + "epoch": 0.5901434809731753, + "grad_norm": 0.13790837237053832, + "learning_rate": 9.365946848244445e-06, + "loss": 0.0508, + "num_tokens": 37911845.0, + "step": 473 + }, + { + "epoch": 0.5913911416094823, + "grad_norm": 0.140360358316561, + "learning_rate": 9.362841960293622e-06, + "loss": 0.0499, + "num_tokens": 37991693.0, + "step": 474 + }, + { + "epoch": 0.5926388022457891, + "grad_norm": 0.13689888549603996, + "learning_rate": 9.359730067842753e-06, + "loss": 0.0528, + "num_tokens": 38072283.0, + "step": 475 + }, + { + "epoch": 0.5938864628820961, + "grad_norm": 0.13291064397987987, + "learning_rate": 9.35661117653464e-06, + "loss": 0.0473, + "num_tokens": 38150942.0, + "step": 476 + }, + { + "epoch": 0.595134123518403, + "grad_norm": 0.1404556622746765, + "learning_rate": 9.353485292024775e-06, + "loss": 0.0554, + "num_tokens": 38232536.0, + "step": 477 + }, + { + "epoch": 0.5963817841547099, + "grad_norm": 0.13890649535466057, + "learning_rate": 9.35035241998133e-06, + "loss": 0.0487, + "num_tokens": 38312747.0, + "step": 478 + }, + { + "epoch": 0.5976294447910169, + "grad_norm": 0.13585175607106556, + "learning_rate": 9.347212566085153e-06, + "loss": 0.0512, + "num_tokens": 38392202.0, + "step": 479 + }, + { + "epoch": 0.5988771054273238, + "grad_norm": 0.14380652025332846, + "learning_rate": 9.344065736029746e-06, + "loss": 0.0542, + "num_tokens": 38472582.0, + "step": 480 + }, + { + "epoch": 0.6001247660636307, + "grad_norm": 0.1524558297733905, + "learning_rate": 9.34091193552126e-06, + "loss": 0.0522, + "num_tokens": 38553322.0, + "step": 481 + }, + { + "epoch": 0.6013724266999376, + "grad_norm": 0.1568778299979335, + "learning_rate": 9.337751170278495e-06, + "loss": 0.0541, + "num_tokens": 38633844.0, + "step": 482 + }, + { + "epoch": 0.6026200873362445, + "grad_norm": 0.14243398281550446, + "learning_rate": 9.334583446032866e-06, + "loss": 0.0528, + "num_tokens": 38713546.0, + "step": 483 + }, + { + "epoch": 0.6038677479725515, + "grad_norm": 0.14281972240999116, + "learning_rate": 9.331408768528423e-06, + "loss": 0.0504, + "num_tokens": 38792792.0, + "step": 484 + }, + { + "epoch": 0.6051154086088584, + "grad_norm": 0.1357329955012292, + "learning_rate": 9.328227143521809e-06, + "loss": 0.0511, + "num_tokens": 38872893.0, + "step": 485 + }, + { + "epoch": 0.6063630692451654, + "grad_norm": 0.12679897769025358, + "learning_rate": 9.325038576782275e-06, + "loss": 0.0476, + "num_tokens": 38952415.0, + "step": 486 + }, + { + "epoch": 0.6076107298814722, + "grad_norm": 0.1482843741539947, + "learning_rate": 9.321843074091654e-06, + "loss": 0.0524, + "num_tokens": 39033272.0, + "step": 487 + }, + { + "epoch": 0.6088583905177791, + "grad_norm": 0.14975071143827606, + "learning_rate": 9.318640641244362e-06, + "loss": 0.0488, + "num_tokens": 39111891.0, + "step": 488 + }, + { + "epoch": 0.6101060511540861, + "grad_norm": 0.1342258373618748, + "learning_rate": 9.315431284047375e-06, + "loss": 0.0505, + "num_tokens": 39192041.0, + "step": 489 + }, + { + "epoch": 0.611353711790393, + "grad_norm": 0.1285046989054189, + "learning_rate": 9.312215008320228e-06, + "loss": 0.0497, + "num_tokens": 39271930.0, + "step": 490 + }, + { + "epoch": 0.6126013724267, + "grad_norm": 0.1379379252162495, + "learning_rate": 9.308991819895001e-06, + "loss": 0.0529, + "num_tokens": 39352986.0, + "step": 491 + }, + { + "epoch": 0.6138490330630069, + "grad_norm": 0.14610092835159, + "learning_rate": 9.30576172461631e-06, + "loss": 0.0642, + "num_tokens": 39433237.0, + "step": 492 + }, + { + "epoch": 0.6150966936993137, + "grad_norm": 0.13695664278195094, + "learning_rate": 9.302524728341292e-06, + "loss": 0.0526, + "num_tokens": 39514407.0, + "step": 493 + }, + { + "epoch": 0.6163443543356207, + "grad_norm": 0.14695174818928397, + "learning_rate": 9.299280836939602e-06, + "loss": 0.0527, + "num_tokens": 39594636.0, + "step": 494 + }, + { + "epoch": 0.6175920149719276, + "grad_norm": 0.13598009895197866, + "learning_rate": 9.296030056293394e-06, + "loss": 0.0464, + "num_tokens": 39674169.0, + "step": 495 + }, + { + "epoch": 0.6188396756082346, + "grad_norm": 0.14132411207723297, + "learning_rate": 9.292772392297316e-06, + "loss": 0.0518, + "num_tokens": 39754876.0, + "step": 496 + }, + { + "epoch": 0.6200873362445415, + "grad_norm": 0.13790154255779888, + "learning_rate": 9.289507850858498e-06, + "loss": 0.0481, + "num_tokens": 39835143.0, + "step": 497 + }, + { + "epoch": 0.6213349968808484, + "grad_norm": 0.136274128237269, + "learning_rate": 9.286236437896538e-06, + "loss": 0.0487, + "num_tokens": 39914731.0, + "step": 498 + }, + { + "epoch": 0.6225826575171554, + "grad_norm": 0.13960062998109465, + "learning_rate": 9.282958159343502e-06, + "loss": 0.051, + "num_tokens": 39994714.0, + "step": 499 + }, + { + "epoch": 0.6238303181534622, + "grad_norm": 0.13253521456358003, + "learning_rate": 9.279673021143895e-06, + "loss": 0.0476, + "num_tokens": 40075018.0, + "step": 500 + }, + { + "epoch": 0.6250779787897692, + "grad_norm": 0.13428832406148858, + "learning_rate": 9.276381029254668e-06, + "loss": 0.0488, + "num_tokens": 40155183.0, + "step": 501 + }, + { + "epoch": 0.6263256394260761, + "grad_norm": 0.14564770785457162, + "learning_rate": 9.273082189645197e-06, + "loss": 0.053, + "num_tokens": 40236110.0, + "step": 502 + }, + { + "epoch": 0.627573300062383, + "grad_norm": 0.13321390812262743, + "learning_rate": 9.269776508297272e-06, + "loss": 0.0494, + "num_tokens": 40316306.0, + "step": 503 + }, + { + "epoch": 0.62882096069869, + "grad_norm": 0.13863622088015634, + "learning_rate": 9.266463991205096e-06, + "loss": 0.0507, + "num_tokens": 40396697.0, + "step": 504 + }, + { + "epoch": 0.6300686213349969, + "grad_norm": 0.12373924281291247, + "learning_rate": 9.263144644375264e-06, + "loss": 0.0453, + "num_tokens": 40476696.0, + "step": 505 + }, + { + "epoch": 0.6313162819713038, + "grad_norm": 0.14046491029490663, + "learning_rate": 9.259818473826753e-06, + "loss": 0.0463, + "num_tokens": 40555802.0, + "step": 506 + }, + { + "epoch": 0.6325639426076107, + "grad_norm": 0.14152092405005381, + "learning_rate": 9.256485485590916e-06, + "loss": 0.0485, + "num_tokens": 40636230.0, + "step": 507 + }, + { + "epoch": 0.6338116032439176, + "grad_norm": 0.1417103083447363, + "learning_rate": 9.25314568571147e-06, + "loss": 0.0475, + "num_tokens": 40716545.0, + "step": 508 + }, + { + "epoch": 0.6350592638802246, + "grad_norm": 0.14299937687535086, + "learning_rate": 9.24979908024448e-06, + "loss": 0.05, + "num_tokens": 40795828.0, + "step": 509 + }, + { + "epoch": 0.6363069245165315, + "grad_norm": 0.17469460318552915, + "learning_rate": 9.246445675258353e-06, + "loss": 0.0485, + "num_tokens": 40876181.0, + "step": 510 + }, + { + "epoch": 0.6375545851528385, + "grad_norm": 0.1416974505661574, + "learning_rate": 9.243085476833827e-06, + "loss": 0.0486, + "num_tokens": 40956815.0, + "step": 511 + }, + { + "epoch": 0.6388022457891454, + "grad_norm": 0.13184766297464506, + "learning_rate": 9.239718491063956e-06, + "loss": 0.0452, + "num_tokens": 41035477.0, + "step": 512 + }, + { + "epoch": 0.6400499064254522, + "grad_norm": 0.14293335757694378, + "learning_rate": 9.236344724054104e-06, + "loss": 0.0481, + "num_tokens": 41114840.0, + "step": 513 + }, + { + "epoch": 0.6412975670617592, + "grad_norm": 0.1346618868296363, + "learning_rate": 9.232964181921931e-06, + "loss": 0.0469, + "num_tokens": 41194660.0, + "step": 514 + }, + { + "epoch": 0.6425452276980661, + "grad_norm": 0.14380477524244686, + "learning_rate": 9.22957687079738e-06, + "loss": 0.0503, + "num_tokens": 41274732.0, + "step": 515 + }, + { + "epoch": 0.6437928883343731, + "grad_norm": 0.1548153138877275, + "learning_rate": 9.22618279682267e-06, + "loss": 0.0534, + "num_tokens": 41355231.0, + "step": 516 + }, + { + "epoch": 0.64504054897068, + "grad_norm": 0.12901632746395453, + "learning_rate": 9.222781966152284e-06, + "loss": 0.0487, + "num_tokens": 41435607.0, + "step": 517 + }, + { + "epoch": 0.6462882096069869, + "grad_norm": 0.13407720979842325, + "learning_rate": 9.219374384952955e-06, + "loss": 0.0459, + "num_tokens": 41514560.0, + "step": 518 + }, + { + "epoch": 0.6475358702432938, + "grad_norm": 0.12463947466497512, + "learning_rate": 9.215960059403657e-06, + "loss": 0.0473, + "num_tokens": 41594491.0, + "step": 519 + }, + { + "epoch": 0.6487835308796007, + "grad_norm": 0.1279220794254339, + "learning_rate": 9.212538995695597e-06, + "loss": 0.0445, + "num_tokens": 41673375.0, + "step": 520 + }, + { + "epoch": 0.6500311915159077, + "grad_norm": 0.1422667742985825, + "learning_rate": 9.209111200032197e-06, + "loss": 0.0457, + "num_tokens": 41752931.0, + "step": 521 + }, + { + "epoch": 0.6512788521522146, + "grad_norm": 0.13547316557303143, + "learning_rate": 9.205676678629084e-06, + "loss": 0.0514, + "num_tokens": 41833139.0, + "step": 522 + }, + { + "epoch": 0.6525265127885215, + "grad_norm": 0.1294378459458407, + "learning_rate": 9.202235437714085e-06, + "loss": 0.0452, + "num_tokens": 41912284.0, + "step": 523 + }, + { + "epoch": 0.6537741734248285, + "grad_norm": 0.13400192606267253, + "learning_rate": 9.198787483527211e-06, + "loss": 0.0499, + "num_tokens": 41993387.0, + "step": 524 + }, + { + "epoch": 0.6550218340611353, + "grad_norm": 0.1374789626351036, + "learning_rate": 9.195332822320643e-06, + "loss": 0.0507, + "num_tokens": 42073397.0, + "step": 525 + }, + { + "epoch": 0.6562694946974423, + "grad_norm": 0.14254850808958336, + "learning_rate": 9.191871460358727e-06, + "loss": 0.0498, + "num_tokens": 42153736.0, + "step": 526 + }, + { + "epoch": 0.6575171553337492, + "grad_norm": 0.1492013090788707, + "learning_rate": 9.188403403917959e-06, + "loss": 0.0508, + "num_tokens": 42233576.0, + "step": 527 + }, + { + "epoch": 0.6587648159700561, + "grad_norm": 0.13528472298759806, + "learning_rate": 9.184928659286972e-06, + "loss": 0.0461, + "num_tokens": 42312824.0, + "step": 528 + }, + { + "epoch": 0.6600124766063631, + "grad_norm": 0.14462582085339648, + "learning_rate": 9.181447232766531e-06, + "loss": 0.0541, + "num_tokens": 42394280.0, + "step": 529 + }, + { + "epoch": 0.66126013724267, + "grad_norm": 0.1363211647052413, + "learning_rate": 9.177959130669512e-06, + "loss": 0.0489, + "num_tokens": 42473681.0, + "step": 530 + }, + { + "epoch": 0.662507797878977, + "grad_norm": 0.13485083339287968, + "learning_rate": 9.174464359320898e-06, + "loss": 0.0471, + "num_tokens": 42552772.0, + "step": 531 + }, + { + "epoch": 0.6637554585152838, + "grad_norm": 0.1255665390873853, + "learning_rate": 9.170962925057769e-06, + "loss": 0.0472, + "num_tokens": 42632415.0, + "step": 532 + }, + { + "epoch": 0.6650031191515907, + "grad_norm": 0.1355451583887428, + "learning_rate": 9.167454834229281e-06, + "loss": 0.048, + "num_tokens": 42712289.0, + "step": 533 + }, + { + "epoch": 0.6662507797878977, + "grad_norm": 0.1350858530193199, + "learning_rate": 9.163940093196663e-06, + "loss": 0.0506, + "num_tokens": 42792392.0, + "step": 534 + }, + { + "epoch": 0.6674984404242046, + "grad_norm": 0.136136562989076, + "learning_rate": 9.160418708333203e-06, + "loss": 0.0478, + "num_tokens": 42872872.0, + "step": 535 + }, + { + "epoch": 0.6687461010605116, + "grad_norm": 0.146077765520201, + "learning_rate": 9.156890686024239e-06, + "loss": 0.0498, + "num_tokens": 42953883.0, + "step": 536 + }, + { + "epoch": 0.6699937616968185, + "grad_norm": 0.12797176999362384, + "learning_rate": 9.153356032667138e-06, + "loss": 0.046, + "num_tokens": 43033437.0, + "step": 537 + }, + { + "epoch": 0.6712414223331253, + "grad_norm": 0.12545897578255247, + "learning_rate": 9.149814754671296e-06, + "loss": 0.0495, + "num_tokens": 43113703.0, + "step": 538 + }, + { + "epoch": 0.6724890829694323, + "grad_norm": 0.13008384107108037, + "learning_rate": 9.14626685845812e-06, + "loss": 0.045, + "num_tokens": 43192777.0, + "step": 539 + }, + { + "epoch": 0.6737367436057392, + "grad_norm": 0.13576570983520106, + "learning_rate": 9.142712350461021e-06, + "loss": 0.0504, + "num_tokens": 43272684.0, + "step": 540 + }, + { + "epoch": 0.6749844042420462, + "grad_norm": 0.1429459018435017, + "learning_rate": 9.139151237125393e-06, + "loss": 0.052, + "num_tokens": 43354053.0, + "step": 541 + }, + { + "epoch": 0.6762320648783531, + "grad_norm": 0.11983657237309615, + "learning_rate": 9.135583524908614e-06, + "loss": 0.0441, + "num_tokens": 43435103.0, + "step": 542 + }, + { + "epoch": 0.67747972551466, + "grad_norm": 0.14035766138431027, + "learning_rate": 9.132009220280021e-06, + "loss": 0.0498, + "num_tokens": 43514717.0, + "step": 543 + }, + { + "epoch": 0.678727386150967, + "grad_norm": 0.1374244928438365, + "learning_rate": 9.128428329720911e-06, + "loss": 0.0507, + "num_tokens": 43595017.0, + "step": 544 + }, + { + "epoch": 0.6799750467872738, + "grad_norm": 0.14354836890668102, + "learning_rate": 9.12484085972452e-06, + "loss": 0.0477, + "num_tokens": 43674479.0, + "step": 545 + }, + { + "epoch": 0.6812227074235808, + "grad_norm": 0.14516354150067737, + "learning_rate": 9.121246816796017e-06, + "loss": 0.0507, + "num_tokens": 43755079.0, + "step": 546 + }, + { + "epoch": 0.6824703680598877, + "grad_norm": 0.1400453547205689, + "learning_rate": 9.117646207452487e-06, + "loss": 0.0465, + "num_tokens": 43834370.0, + "step": 547 + }, + { + "epoch": 0.6837180286961946, + "grad_norm": 0.14040265790324466, + "learning_rate": 9.114039038222922e-06, + "loss": 0.045, + "num_tokens": 43914052.0, + "step": 548 + }, + { + "epoch": 0.6849656893325016, + "grad_norm": 0.14521374141163426, + "learning_rate": 9.110425315648212e-06, + "loss": 0.0489, + "num_tokens": 43993792.0, + "step": 549 + }, + { + "epoch": 0.6862133499688085, + "grad_norm": 0.142707572861237, + "learning_rate": 9.106805046281127e-06, + "loss": 0.047, + "num_tokens": 44073401.0, + "step": 550 + }, + { + "epoch": 0.6874610106051154, + "grad_norm": 0.13023090116652908, + "learning_rate": 9.103178236686309e-06, + "loss": 0.0465, + "num_tokens": 44152510.0, + "step": 551 + }, + { + "epoch": 0.6887086712414223, + "grad_norm": 0.12890465812404028, + "learning_rate": 9.099544893440265e-06, + "loss": 0.0489, + "num_tokens": 44233765.0, + "step": 552 + }, + { + "epoch": 0.6899563318777293, + "grad_norm": 0.13134548120051465, + "learning_rate": 9.095905023131337e-06, + "loss": 0.0464, + "num_tokens": 44313193.0, + "step": 553 + }, + { + "epoch": 0.6912039925140362, + "grad_norm": 0.1392402498661132, + "learning_rate": 9.092258632359714e-06, + "loss": 0.0523, + "num_tokens": 44393864.0, + "step": 554 + }, + { + "epoch": 0.6924516531503431, + "grad_norm": 0.150967684448626, + "learning_rate": 9.088605727737405e-06, + "loss": 0.0491, + "num_tokens": 44472774.0, + "step": 555 + }, + { + "epoch": 0.6936993137866501, + "grad_norm": 0.13327113149933534, + "learning_rate": 9.08494631588823e-06, + "loss": 0.0479, + "num_tokens": 44552481.0, + "step": 556 + }, + { + "epoch": 0.6949469744229569, + "grad_norm": 0.15639101173467318, + "learning_rate": 9.08128040344781e-06, + "loss": 0.0481, + "num_tokens": 44632296.0, + "step": 557 + }, + { + "epoch": 0.6961946350592639, + "grad_norm": 0.15006175503991592, + "learning_rate": 9.077607997063546e-06, + "loss": 0.0513, + "num_tokens": 44711761.0, + "step": 558 + }, + { + "epoch": 0.6974422956955708, + "grad_norm": 0.13626963289805946, + "learning_rate": 9.073929103394627e-06, + "loss": 0.045, + "num_tokens": 44790965.0, + "step": 559 + }, + { + "epoch": 0.6986899563318777, + "grad_norm": 0.13584036137479072, + "learning_rate": 9.070243729111998e-06, + "loss": 0.0499, + "num_tokens": 44871764.0, + "step": 560 + }, + { + "epoch": 0.6999376169681847, + "grad_norm": 0.12333589383269866, + "learning_rate": 9.066551880898356e-06, + "loss": 0.0449, + "num_tokens": 44951455.0, + "step": 561 + }, + { + "epoch": 0.7011852776044916, + "grad_norm": 0.13494757593024734, + "learning_rate": 9.062853565448137e-06, + "loss": 0.0468, + "num_tokens": 45030780.0, + "step": 562 + }, + { + "epoch": 0.7024329382407986, + "grad_norm": 0.13272773332650525, + "learning_rate": 9.059148789467508e-06, + "loss": 0.0471, + "num_tokens": 45110544.0, + "step": 563 + }, + { + "epoch": 0.7036805988771054, + "grad_norm": 0.13404989182349922, + "learning_rate": 9.055437559674343e-06, + "loss": 0.05, + "num_tokens": 45190997.0, + "step": 564 + }, + { + "epoch": 0.7049282595134123, + "grad_norm": 0.13081012181879706, + "learning_rate": 9.051719882798226e-06, + "loss": 0.0466, + "num_tokens": 45270643.0, + "step": 565 + }, + { + "epoch": 0.7061759201497193, + "grad_norm": 0.125779618664369, + "learning_rate": 9.047995765580428e-06, + "loss": 0.0464, + "num_tokens": 45351906.0, + "step": 566 + }, + { + "epoch": 0.7074235807860262, + "grad_norm": 0.1318104484558839, + "learning_rate": 9.044265214773901e-06, + "loss": 0.0485, + "num_tokens": 45431479.0, + "step": 567 + }, + { + "epoch": 0.7086712414223332, + "grad_norm": 0.13586766601016315, + "learning_rate": 9.040528237143258e-06, + "loss": 0.0508, + "num_tokens": 45511600.0, + "step": 568 + }, + { + "epoch": 0.7099189020586401, + "grad_norm": 0.1493946566599566, + "learning_rate": 9.036784839464771e-06, + "loss": 0.0482, + "num_tokens": 45591552.0, + "step": 569 + }, + { + "epoch": 0.7111665626949469, + "grad_norm": 0.13417312397284684, + "learning_rate": 9.033035028526352e-06, + "loss": 0.0424, + "num_tokens": 45670440.0, + "step": 570 + }, + { + "epoch": 0.7124142233312539, + "grad_norm": 0.13228632139209826, + "learning_rate": 9.029278811127539e-06, + "loss": 0.0462, + "num_tokens": 45750706.0, + "step": 571 + }, + { + "epoch": 0.7136618839675608, + "grad_norm": 0.12684341302244245, + "learning_rate": 9.025516194079493e-06, + "loss": 0.0447, + "num_tokens": 45830615.0, + "step": 572 + }, + { + "epoch": 0.7149095446038678, + "grad_norm": 0.14484313305712285, + "learning_rate": 9.021747184204974e-06, + "loss": 0.0502, + "num_tokens": 45912425.0, + "step": 573 + }, + { + "epoch": 0.7161572052401747, + "grad_norm": 0.13886894398435345, + "learning_rate": 9.017971788338338e-06, + "loss": 0.0519, + "num_tokens": 45994773.0, + "step": 574 + }, + { + "epoch": 0.7174048658764816, + "grad_norm": 0.13735085430234897, + "learning_rate": 9.014190013325514e-06, + "loss": 0.0486, + "num_tokens": 46075176.0, + "step": 575 + }, + { + "epoch": 0.7186525265127885, + "grad_norm": 0.14128392290391406, + "learning_rate": 9.010401866024007e-06, + "loss": 0.0463, + "num_tokens": 46155352.0, + "step": 576 + }, + { + "epoch": 0.7199001871490954, + "grad_norm": 0.129131515512014, + "learning_rate": 9.006607353302874e-06, + "loss": 0.0489, + "num_tokens": 46236369.0, + "step": 577 + }, + { + "epoch": 0.7211478477854024, + "grad_norm": 0.1388535780094133, + "learning_rate": 9.00280648204271e-06, + "loss": 0.0457, + "num_tokens": 46315492.0, + "step": 578 + }, + { + "epoch": 0.7223955084217093, + "grad_norm": 0.13207054260238488, + "learning_rate": 8.998999259135648e-06, + "loss": 0.0484, + "num_tokens": 46395923.0, + "step": 579 + }, + { + "epoch": 0.7236431690580162, + "grad_norm": 0.13220982688358388, + "learning_rate": 8.99518569148533e-06, + "loss": 0.0477, + "num_tokens": 46476015.0, + "step": 580 + }, + { + "epoch": 0.7248908296943232, + "grad_norm": 0.13584168006721206, + "learning_rate": 8.991365786006908e-06, + "loss": 0.0457, + "num_tokens": 46555654.0, + "step": 581 + }, + { + "epoch": 0.72613849033063, + "grad_norm": 0.13797132921454844, + "learning_rate": 8.987539549627026e-06, + "loss": 0.0488, + "num_tokens": 46635834.0, + "step": 582 + }, + { + "epoch": 0.727386150966937, + "grad_norm": 0.12983941108044414, + "learning_rate": 8.983706989283804e-06, + "loss": 0.0439, + "num_tokens": 46715099.0, + "step": 583 + }, + { + "epoch": 0.7286338116032439, + "grad_norm": 0.12724057093818048, + "learning_rate": 8.979868111926836e-06, + "loss": 0.0453, + "num_tokens": 46794497.0, + "step": 584 + }, + { + "epoch": 0.7298814722395508, + "grad_norm": 0.14712794756597866, + "learning_rate": 8.976022924517167e-06, + "loss": 0.0523, + "num_tokens": 46875764.0, + "step": 585 + }, + { + "epoch": 0.7311291328758578, + "grad_norm": 0.1260016012184683, + "learning_rate": 8.972171434027283e-06, + "loss": 0.0467, + "num_tokens": 46954348.0, + "step": 586 + }, + { + "epoch": 0.7323767935121647, + "grad_norm": 0.13638875135524814, + "learning_rate": 8.968313647441098e-06, + "loss": 0.0495, + "num_tokens": 47035156.0, + "step": 587 + }, + { + "epoch": 0.7336244541484717, + "grad_norm": 0.13598723506181007, + "learning_rate": 8.964449571753949e-06, + "loss": 0.048, + "num_tokens": 47114933.0, + "step": 588 + }, + { + "epoch": 0.7348721147847785, + "grad_norm": 0.13957131093949476, + "learning_rate": 8.96057921397257e-06, + "loss": 0.0476, + "num_tokens": 47195699.0, + "step": 589 + }, + { + "epoch": 0.7361197754210854, + "grad_norm": 0.13018188693022992, + "learning_rate": 8.95670258111509e-06, + "loss": 0.0463, + "num_tokens": 47275340.0, + "step": 590 + }, + { + "epoch": 0.7373674360573924, + "grad_norm": 0.12927934262951574, + "learning_rate": 8.95281968021102e-06, + "loss": 0.0472, + "num_tokens": 47355190.0, + "step": 591 + }, + { + "epoch": 0.7386150966936993, + "grad_norm": 0.13407866372594857, + "learning_rate": 8.948930518301228e-06, + "loss": 0.047, + "num_tokens": 47435372.0, + "step": 592 + }, + { + "epoch": 0.7398627573300063, + "grad_norm": 0.13953419467602407, + "learning_rate": 8.945035102437943e-06, + "loss": 0.0457, + "num_tokens": 47515076.0, + "step": 593 + }, + { + "epoch": 0.7411104179663132, + "grad_norm": 0.13257678892969102, + "learning_rate": 8.94113343968473e-06, + "loss": 0.0472, + "num_tokens": 47594913.0, + "step": 594 + }, + { + "epoch": 0.74235807860262, + "grad_norm": 0.13956156798849514, + "learning_rate": 8.937225537116482e-06, + "loss": 0.0499, + "num_tokens": 47674616.0, + "step": 595 + }, + { + "epoch": 0.743605739238927, + "grad_norm": 0.12720381017536367, + "learning_rate": 8.93331140181941e-06, + "loss": 0.0488, + "num_tokens": 47754372.0, + "step": 596 + }, + { + "epoch": 0.7448533998752339, + "grad_norm": 0.13600897445694435, + "learning_rate": 8.929391040891022e-06, + "loss": 0.0521, + "num_tokens": 47834920.0, + "step": 597 + }, + { + "epoch": 0.7461010605115409, + "grad_norm": 0.1392796000732699, + "learning_rate": 8.92546446144012e-06, + "loss": 0.0472, + "num_tokens": 47914382.0, + "step": 598 + }, + { + "epoch": 0.7473487211478478, + "grad_norm": 0.1385112153330734, + "learning_rate": 8.921531670586778e-06, + "loss": 0.0447, + "num_tokens": 47993846.0, + "step": 599 + }, + { + "epoch": 0.7485963817841547, + "grad_norm": 0.12658122859507898, + "learning_rate": 8.917592675462333e-06, + "loss": 0.0464, + "num_tokens": 48073646.0, + "step": 600 + }, + { + "epoch": 0.7498440424204617, + "grad_norm": 0.1419041010382823, + "learning_rate": 8.913647483209376e-06, + "loss": 0.0466, + "num_tokens": 48153764.0, + "step": 601 + }, + { + "epoch": 0.7510917030567685, + "grad_norm": 0.12852069420000675, + "learning_rate": 8.909696100981734e-06, + "loss": 0.046, + "num_tokens": 48235033.0, + "step": 602 + }, + { + "epoch": 0.7523393636930755, + "grad_norm": 0.13929453816389586, + "learning_rate": 8.905738535944453e-06, + "loss": 0.0456, + "num_tokens": 48315153.0, + "step": 603 + }, + { + "epoch": 0.7535870243293824, + "grad_norm": 0.1294892482761023, + "learning_rate": 8.901774795273799e-06, + "loss": 0.0487, + "num_tokens": 48394989.0, + "step": 604 + }, + { + "epoch": 0.7548346849656893, + "grad_norm": 0.1265516497561457, + "learning_rate": 8.897804886157229e-06, + "loss": 0.0447, + "num_tokens": 48475313.0, + "step": 605 + }, + { + "epoch": 0.7560823456019963, + "grad_norm": 0.1250026876119754, + "learning_rate": 8.893828815793389e-06, + "loss": 0.0445, + "num_tokens": 48554294.0, + "step": 606 + }, + { + "epoch": 0.7573300062383032, + "grad_norm": 0.1395621069701436, + "learning_rate": 8.889846591392097e-06, + "loss": 0.045, + "num_tokens": 48633837.0, + "step": 607 + }, + { + "epoch": 0.7585776668746101, + "grad_norm": 0.1292703654270298, + "learning_rate": 8.88585822017433e-06, + "loss": 0.0438, + "num_tokens": 48712902.0, + "step": 608 + }, + { + "epoch": 0.759825327510917, + "grad_norm": 0.12675734039745323, + "learning_rate": 8.881863709372207e-06, + "loss": 0.0461, + "num_tokens": 48792522.0, + "step": 609 + }, + { + "epoch": 0.7610729881472239, + "grad_norm": 0.12830442879917106, + "learning_rate": 8.877863066228987e-06, + "loss": 0.0481, + "num_tokens": 48873543.0, + "step": 610 + }, + { + "epoch": 0.7623206487835309, + "grad_norm": 0.13244561051443332, + "learning_rate": 8.873856297999045e-06, + "loss": 0.047, + "num_tokens": 48952745.0, + "step": 611 + }, + { + "epoch": 0.7635683094198378, + "grad_norm": 0.1615375485783518, + "learning_rate": 8.869843411947862e-06, + "loss": 0.0496, + "num_tokens": 49033599.0, + "step": 612 + }, + { + "epoch": 0.7648159700561448, + "grad_norm": 0.13780128644772976, + "learning_rate": 8.865824415352014e-06, + "loss": 0.0461, + "num_tokens": 49113085.0, + "step": 613 + }, + { + "epoch": 0.7660636306924516, + "grad_norm": 0.13586985951384964, + "learning_rate": 8.861799315499157e-06, + "loss": 0.046, + "num_tokens": 49191966.0, + "step": 614 + }, + { + "epoch": 0.7673112913287585, + "grad_norm": 0.1271021845898844, + "learning_rate": 8.85776811968801e-06, + "loss": 0.0435, + "num_tokens": 49271798.0, + "step": 615 + }, + { + "epoch": 0.7685589519650655, + "grad_norm": 0.13241677295783946, + "learning_rate": 8.853730835228354e-06, + "loss": 0.0462, + "num_tokens": 49351085.0, + "step": 616 + }, + { + "epoch": 0.7698066126013724, + "grad_norm": 0.1473649979705641, + "learning_rate": 8.849687469441003e-06, + "loss": 0.0512, + "num_tokens": 49432116.0, + "step": 617 + }, + { + "epoch": 0.7710542732376794, + "grad_norm": 0.13824389423293423, + "learning_rate": 8.845638029657804e-06, + "loss": 0.0437, + "num_tokens": 49511692.0, + "step": 618 + }, + { + "epoch": 0.7723019338739863, + "grad_norm": 0.14209836646808868, + "learning_rate": 8.841582523221614e-06, + "loss": 0.0491, + "num_tokens": 49592183.0, + "step": 619 + }, + { + "epoch": 0.7735495945102931, + "grad_norm": 0.12494073093259925, + "learning_rate": 8.83752095748629e-06, + "loss": 0.0456, + "num_tokens": 49672496.0, + "step": 620 + }, + { + "epoch": 0.7747972551466001, + "grad_norm": 0.1494311981433102, + "learning_rate": 8.833453339816682e-06, + "loss": 0.0544, + "num_tokens": 49752281.0, + "step": 621 + }, + { + "epoch": 0.776044915782907, + "grad_norm": 0.14004390380525242, + "learning_rate": 8.829379677588607e-06, + "loss": 0.0486, + "num_tokens": 49831426.0, + "step": 622 + }, + { + "epoch": 0.777292576419214, + "grad_norm": 0.1339521325499603, + "learning_rate": 8.825299978188847e-06, + "loss": 0.0462, + "num_tokens": 49911746.0, + "step": 623 + }, + { + "epoch": 0.7785402370555209, + "grad_norm": 0.12799070765367507, + "learning_rate": 8.821214249015133e-06, + "loss": 0.0427, + "num_tokens": 49990124.0, + "step": 624 + }, + { + "epoch": 0.7797878976918278, + "grad_norm": 0.14563763920223902, + "learning_rate": 8.817122497476122e-06, + "loss": 0.0433, + "num_tokens": 50069850.0, + "step": 625 + }, + { + "epoch": 0.7810355583281348, + "grad_norm": 0.1412962344681427, + "learning_rate": 8.8130247309914e-06, + "loss": 0.0465, + "num_tokens": 50148928.0, + "step": 626 + }, + { + "epoch": 0.7822832189644416, + "grad_norm": 0.12673793649670753, + "learning_rate": 8.808920956991455e-06, + "loss": 0.0486, + "num_tokens": 50229320.0, + "step": 627 + }, + { + "epoch": 0.7835308796007486, + "grad_norm": 0.12112230001153174, + "learning_rate": 8.80481118291767e-06, + "loss": 0.0467, + "num_tokens": 50310309.0, + "step": 628 + }, + { + "epoch": 0.7847785402370555, + "grad_norm": 0.12671902753289974, + "learning_rate": 8.800695416222305e-06, + "loss": 0.0455, + "num_tokens": 50389538.0, + "step": 629 + }, + { + "epoch": 0.7860262008733624, + "grad_norm": 0.12959073046474556, + "learning_rate": 8.796573664368492e-06, + "loss": 0.0461, + "num_tokens": 50469795.0, + "step": 630 + }, + { + "epoch": 0.7872738615096694, + "grad_norm": 0.13262046702793368, + "learning_rate": 8.792445934830215e-06, + "loss": 0.0475, + "num_tokens": 50550218.0, + "step": 631 + }, + { + "epoch": 0.7885215221459763, + "grad_norm": 0.13536729420156757, + "learning_rate": 8.78831223509229e-06, + "loss": 0.0448, + "num_tokens": 50630135.0, + "step": 632 + }, + { + "epoch": 0.7897691827822833, + "grad_norm": 0.1342625728159916, + "learning_rate": 8.784172572650366e-06, + "loss": 0.0425, + "num_tokens": 50709708.0, + "step": 633 + }, + { + "epoch": 0.7910168434185901, + "grad_norm": 0.1359058372010772, + "learning_rate": 8.780026955010903e-06, + "loss": 0.0441, + "num_tokens": 50789219.0, + "step": 634 + }, + { + "epoch": 0.7922645040548971, + "grad_norm": 0.12874679521506371, + "learning_rate": 8.77587538969116e-06, + "loss": 0.0437, + "num_tokens": 50869389.0, + "step": 635 + }, + { + "epoch": 0.793512164691204, + "grad_norm": 0.1369083708111066, + "learning_rate": 8.771717884219177e-06, + "loss": 0.0493, + "num_tokens": 50949396.0, + "step": 636 + }, + { + "epoch": 0.7947598253275109, + "grad_norm": 0.14129983597974075, + "learning_rate": 8.767554446133771e-06, + "loss": 0.0451, + "num_tokens": 51029592.0, + "step": 637 + }, + { + "epoch": 0.7960074859638179, + "grad_norm": 0.1446719706863878, + "learning_rate": 8.763385082984511e-06, + "loss": 0.0506, + "num_tokens": 51110547.0, + "step": 638 + }, + { + "epoch": 0.7972551466001248, + "grad_norm": 0.13560344797018603, + "learning_rate": 8.759209802331714e-06, + "loss": 0.0472, + "num_tokens": 51191617.0, + "step": 639 + }, + { + "epoch": 0.7985028072364317, + "grad_norm": 0.15251529644301726, + "learning_rate": 8.755028611746426e-06, + "loss": 0.048, + "num_tokens": 51271860.0, + "step": 640 + }, + { + "epoch": 0.7997504678727386, + "grad_norm": 0.12215214673056587, + "learning_rate": 8.750841518810407e-06, + "loss": 0.0476, + "num_tokens": 51351464.0, + "step": 641 + }, + { + "epoch": 0.8009981285090455, + "grad_norm": 0.1295074754396241, + "learning_rate": 8.746648531116126e-06, + "loss": 0.0443, + "num_tokens": 51432841.0, + "step": 642 + }, + { + "epoch": 0.8022457891453525, + "grad_norm": 0.12895613658486638, + "learning_rate": 8.742449656266733e-06, + "loss": 0.0456, + "num_tokens": 51512108.0, + "step": 643 + }, + { + "epoch": 0.8034934497816594, + "grad_norm": 0.12440364391857861, + "learning_rate": 8.738244901876061e-06, + "loss": 0.0447, + "num_tokens": 51592279.0, + "step": 644 + }, + { + "epoch": 0.8047411104179664, + "grad_norm": 0.1326702881926524, + "learning_rate": 8.7340342755686e-06, + "loss": 0.0447, + "num_tokens": 51672173.0, + "step": 645 + }, + { + "epoch": 0.8059887710542732, + "grad_norm": 0.13915957948227095, + "learning_rate": 8.729817784979485e-06, + "loss": 0.049, + "num_tokens": 51753477.0, + "step": 646 + }, + { + "epoch": 0.8072364316905801, + "grad_norm": 0.14093605623183406, + "learning_rate": 8.725595437754489e-06, + "loss": 0.0649, + "num_tokens": 51834370.0, + "step": 647 + }, + { + "epoch": 0.8084840923268871, + "grad_norm": 0.1257616499498055, + "learning_rate": 8.721367241550007e-06, + "loss": 0.0433, + "num_tokens": 51913279.0, + "step": 648 + }, + { + "epoch": 0.809731752963194, + "grad_norm": 0.14095744719546538, + "learning_rate": 8.717133204033034e-06, + "loss": 0.0418, + "num_tokens": 51991782.0, + "step": 649 + }, + { + "epoch": 0.810979413599501, + "grad_norm": 0.13700683189419194, + "learning_rate": 8.71289333288116e-06, + "loss": 0.0469, + "num_tokens": 52071863.0, + "step": 650 + }, + { + "epoch": 0.8122270742358079, + "grad_norm": 0.13167335028118232, + "learning_rate": 8.708647635782553e-06, + "loss": 0.0469, + "num_tokens": 52151659.0, + "step": 651 + }, + { + "epoch": 0.8134747348721147, + "grad_norm": 0.13212682360066533, + "learning_rate": 8.704396120435944e-06, + "loss": 0.0418, + "num_tokens": 52230329.0, + "step": 652 + }, + { + "epoch": 0.8147223955084217, + "grad_norm": 0.13460259814153436, + "learning_rate": 8.700138794550617e-06, + "loss": 0.0477, + "num_tokens": 52310621.0, + "step": 653 + }, + { + "epoch": 0.8159700561447286, + "grad_norm": 0.1262576220568161, + "learning_rate": 8.695875665846392e-06, + "loss": 0.043, + "num_tokens": 52390363.0, + "step": 654 + }, + { + "epoch": 0.8172177167810356, + "grad_norm": 0.13451324588776792, + "learning_rate": 8.691606742053608e-06, + "loss": 0.0445, + "num_tokens": 52470407.0, + "step": 655 + }, + { + "epoch": 0.8184653774173425, + "grad_norm": 0.13838060943839892, + "learning_rate": 8.687332030913114e-06, + "loss": 0.0455, + "num_tokens": 52550801.0, + "step": 656 + }, + { + "epoch": 0.8197130380536494, + "grad_norm": 0.12544846461435052, + "learning_rate": 8.683051540176252e-06, + "loss": 0.0453, + "num_tokens": 52630184.0, + "step": 657 + }, + { + "epoch": 0.8209606986899564, + "grad_norm": 0.1227925470813221, + "learning_rate": 8.67876527760485e-06, + "loss": 0.0449, + "num_tokens": 52710226.0, + "step": 658 + }, + { + "epoch": 0.8222083593262632, + "grad_norm": 0.14125944393926942, + "learning_rate": 8.674473250971194e-06, + "loss": 0.0479, + "num_tokens": 52789646.0, + "step": 659 + }, + { + "epoch": 0.8234560199625702, + "grad_norm": 0.12898991776625454, + "learning_rate": 8.670175468058027e-06, + "loss": 0.0453, + "num_tokens": 52870777.0, + "step": 660 + }, + { + "epoch": 0.8247036805988771, + "grad_norm": 0.12662076248706405, + "learning_rate": 8.665871936658525e-06, + "loss": 0.0464, + "num_tokens": 52950874.0, + "step": 661 + }, + { + "epoch": 0.825951341235184, + "grad_norm": 0.1246338732142322, + "learning_rate": 8.661562664576297e-06, + "loss": 0.0449, + "num_tokens": 53030308.0, + "step": 662 + }, + { + "epoch": 0.827199001871491, + "grad_norm": 0.12386450841807353, + "learning_rate": 8.65724765962535e-06, + "loss": 0.0453, + "num_tokens": 53110905.0, + "step": 663 + }, + { + "epoch": 0.8284466625077979, + "grad_norm": 0.1252925833827586, + "learning_rate": 8.652926929630097e-06, + "loss": 0.0448, + "num_tokens": 53190924.0, + "step": 664 + }, + { + "epoch": 0.8296943231441049, + "grad_norm": 0.13199276345367864, + "learning_rate": 8.648600482425325e-06, + "loss": 0.0469, + "num_tokens": 53271193.0, + "step": 665 + }, + { + "epoch": 0.8309419837804117, + "grad_norm": 0.1226282221225014, + "learning_rate": 8.644268325856193e-06, + "loss": 0.0434, + "num_tokens": 53350537.0, + "step": 666 + }, + { + "epoch": 0.8321896444167186, + "grad_norm": 0.12053686374626792, + "learning_rate": 8.639930467778206e-06, + "loss": 0.0438, + "num_tokens": 53432655.0, + "step": 667 + }, + { + "epoch": 0.8334373050530256, + "grad_norm": 0.12451975349671367, + "learning_rate": 8.635586916057214e-06, + "loss": 0.0445, + "num_tokens": 53512112.0, + "step": 668 + }, + { + "epoch": 0.8346849656893325, + "grad_norm": 0.12083730426276222, + "learning_rate": 8.631237678569391e-06, + "loss": 0.0462, + "num_tokens": 53593075.0, + "step": 669 + }, + { + "epoch": 0.8359326263256395, + "grad_norm": 0.12769247867470077, + "learning_rate": 8.626882763201215e-06, + "loss": 0.0429, + "num_tokens": 53672672.0, + "step": 670 + }, + { + "epoch": 0.8371802869619464, + "grad_norm": 0.1238630262805468, + "learning_rate": 8.62252217784947e-06, + "loss": 0.0427, + "num_tokens": 53751669.0, + "step": 671 + }, + { + "epoch": 0.8384279475982532, + "grad_norm": 0.1391950829315506, + "learning_rate": 8.61815593042121e-06, + "loss": 0.0437, + "num_tokens": 53831916.0, + "step": 672 + }, + { + "epoch": 0.8396756082345602, + "grad_norm": 0.12832813628154105, + "learning_rate": 8.61378402883376e-06, + "loss": 0.0454, + "num_tokens": 53911127.0, + "step": 673 + }, + { + "epoch": 0.8409232688708671, + "grad_norm": 0.12869289586194874, + "learning_rate": 8.609406481014704e-06, + "loss": 0.0493, + "num_tokens": 53992015.0, + "step": 674 + }, + { + "epoch": 0.8421709295071741, + "grad_norm": 0.1295243174606551, + "learning_rate": 8.605023294901857e-06, + "loss": 0.0453, + "num_tokens": 54074688.0, + "step": 675 + }, + { + "epoch": 0.843418590143481, + "grad_norm": 0.14327157160951173, + "learning_rate": 8.600634478443262e-06, + "loss": 0.0475, + "num_tokens": 54154762.0, + "step": 676 + }, + { + "epoch": 0.8446662507797879, + "grad_norm": 0.13689413605347608, + "learning_rate": 8.596240039597168e-06, + "loss": 0.0487, + "num_tokens": 54234083.0, + "step": 677 + }, + { + "epoch": 0.8459139114160948, + "grad_norm": 0.12476227243203361, + "learning_rate": 8.59183998633202e-06, + "loss": 0.043, + "num_tokens": 54313838.0, + "step": 678 + }, + { + "epoch": 0.8471615720524017, + "grad_norm": 0.12843919174639454, + "learning_rate": 8.587434326626446e-06, + "loss": 0.0439, + "num_tokens": 54393140.0, + "step": 679 + }, + { + "epoch": 0.8484092326887087, + "grad_norm": 0.13686412036721382, + "learning_rate": 8.58302306846924e-06, + "loss": 0.0501, + "num_tokens": 54474299.0, + "step": 680 + }, + { + "epoch": 0.8496568933250156, + "grad_norm": 0.13617664891691592, + "learning_rate": 8.57860621985934e-06, + "loss": 0.0445, + "num_tokens": 54553494.0, + "step": 681 + }, + { + "epoch": 0.8509045539613225, + "grad_norm": 0.1271657490150743, + "learning_rate": 8.574183788805838e-06, + "loss": 0.044, + "num_tokens": 54633507.0, + "step": 682 + }, + { + "epoch": 0.8521522145976295, + "grad_norm": 0.13384106872272458, + "learning_rate": 8.56975578332793e-06, + "loss": 0.0443, + "num_tokens": 54713372.0, + "step": 683 + }, + { + "epoch": 0.8533998752339363, + "grad_norm": 0.11560773243774969, + "learning_rate": 8.56532221145493e-06, + "loss": 0.0429, + "num_tokens": 54792488.0, + "step": 684 + }, + { + "epoch": 0.8546475358702433, + "grad_norm": 0.1212428276738136, + "learning_rate": 8.560883081226246e-06, + "loss": 0.0443, + "num_tokens": 54873959.0, + "step": 685 + }, + { + "epoch": 0.8558951965065502, + "grad_norm": 0.14025896849225925, + "learning_rate": 8.55643840069136e-06, + "loss": 0.0466, + "num_tokens": 54954820.0, + "step": 686 + }, + { + "epoch": 0.8571428571428571, + "grad_norm": 0.13808021632034995, + "learning_rate": 8.551988177909825e-06, + "loss": 0.0462, + "num_tokens": 55035474.0, + "step": 687 + }, + { + "epoch": 0.8583905177791641, + "grad_norm": 0.14688685012575073, + "learning_rate": 8.547532420951236e-06, + "loss": 0.0474, + "num_tokens": 55115987.0, + "step": 688 + }, + { + "epoch": 0.859638178415471, + "grad_norm": 0.13519377075289518, + "learning_rate": 8.543071137895231e-06, + "loss": 0.0486, + "num_tokens": 55195473.0, + "step": 689 + }, + { + "epoch": 0.860885839051778, + "grad_norm": 0.1383342982212882, + "learning_rate": 8.538604336831463e-06, + "loss": 0.0456, + "num_tokens": 55274897.0, + "step": 690 + }, + { + "epoch": 0.8621334996880848, + "grad_norm": 0.12302270516387563, + "learning_rate": 8.53413202585959e-06, + "loss": 0.042, + "num_tokens": 55354684.0, + "step": 691 + }, + { + "epoch": 0.8633811603243917, + "grad_norm": 0.12034339097373062, + "learning_rate": 8.529654213089266e-06, + "loss": 0.0416, + "num_tokens": 55434479.0, + "step": 692 + }, + { + "epoch": 0.8646288209606987, + "grad_norm": 0.12441949815381428, + "learning_rate": 8.52517090664012e-06, + "loss": 0.0434, + "num_tokens": 55513816.0, + "step": 693 + }, + { + "epoch": 0.8658764815970056, + "grad_norm": 0.13357252884483028, + "learning_rate": 8.520682114641739e-06, + "loss": 0.0441, + "num_tokens": 55593435.0, + "step": 694 + }, + { + "epoch": 0.8671241422333126, + "grad_norm": 0.12424156938040647, + "learning_rate": 8.51618784523366e-06, + "loss": 0.0445, + "num_tokens": 55673866.0, + "step": 695 + }, + { + "epoch": 0.8683718028696195, + "grad_norm": 0.13488286343188147, + "learning_rate": 8.511688106565356e-06, + "loss": 0.0462, + "num_tokens": 55754183.0, + "step": 696 + }, + { + "epoch": 0.8696194635059263, + "grad_norm": 0.12902239020926456, + "learning_rate": 8.507182906796209e-06, + "loss": 0.0479, + "num_tokens": 55835122.0, + "step": 697 + }, + { + "epoch": 0.8708671241422333, + "grad_norm": 0.12700265076298542, + "learning_rate": 8.50267225409551e-06, + "loss": 0.0487, + "num_tokens": 55916059.0, + "step": 698 + }, + { + "epoch": 0.8721147847785402, + "grad_norm": 0.12439806501667146, + "learning_rate": 8.498156156642434e-06, + "loss": 0.0482, + "num_tokens": 55996309.0, + "step": 699 + }, + { + "epoch": 0.8733624454148472, + "grad_norm": 0.13662835946719407, + "learning_rate": 8.493634622626031e-06, + "loss": 0.0487, + "num_tokens": 56076989.0, + "step": 700 + }, + { + "epoch": 0.8746101060511541, + "grad_norm": 0.13097355494242618, + "learning_rate": 8.489107660245208e-06, + "loss": 0.0455, + "num_tokens": 56156600.0, + "step": 701 + }, + { + "epoch": 0.875857766687461, + "grad_norm": 0.12619811815079715, + "learning_rate": 8.484575277708718e-06, + "loss": 0.0482, + "num_tokens": 56237662.0, + "step": 702 + }, + { + "epoch": 0.877105427323768, + "grad_norm": 0.13272981560160393, + "learning_rate": 8.480037483235142e-06, + "loss": 0.0443, + "num_tokens": 56318037.0, + "step": 703 + }, + { + "epoch": 0.8783530879600748, + "grad_norm": 0.13876316053346943, + "learning_rate": 8.475494285052873e-06, + "loss": 0.0433, + "num_tokens": 56397397.0, + "step": 704 + }, + { + "epoch": 0.8796007485963818, + "grad_norm": 0.12990102861221764, + "learning_rate": 8.470945691400095e-06, + "loss": 0.0491, + "num_tokens": 56478651.0, + "step": 705 + }, + { + "epoch": 0.8808484092326887, + "grad_norm": 0.12177730095213436, + "learning_rate": 8.466391710524792e-06, + "loss": 0.0454, + "num_tokens": 56559439.0, + "step": 706 + }, + { + "epoch": 0.8820960698689956, + "grad_norm": 0.13073319119141943, + "learning_rate": 8.461832350684701e-06, + "loss": 0.044, + "num_tokens": 56639419.0, + "step": 707 + }, + { + "epoch": 0.8833437305053026, + "grad_norm": 0.13039116346051194, + "learning_rate": 8.457267620147326e-06, + "loss": 0.0479, + "num_tokens": 56720177.0, + "step": 708 + }, + { + "epoch": 0.8845913911416095, + "grad_norm": 0.11392658067382258, + "learning_rate": 8.452697527189901e-06, + "loss": 0.0417, + "num_tokens": 56799870.0, + "step": 709 + }, + { + "epoch": 0.8858390517779164, + "grad_norm": 0.12172332991038214, + "learning_rate": 8.448122080099384e-06, + "loss": 0.0436, + "num_tokens": 56879737.0, + "step": 710 + }, + { + "epoch": 0.8870867124142233, + "grad_norm": 0.12348030891715099, + "learning_rate": 8.443541287172443e-06, + "loss": 0.0464, + "num_tokens": 56960161.0, + "step": 711 + }, + { + "epoch": 0.8883343730505302, + "grad_norm": 0.1367197777768623, + "learning_rate": 8.438955156715443e-06, + "loss": 0.0472, + "num_tokens": 57040569.0, + "step": 712 + }, + { + "epoch": 0.8895820336868372, + "grad_norm": 0.12968055372030557, + "learning_rate": 8.434363697044423e-06, + "loss": 0.0475, + "num_tokens": 57120700.0, + "step": 713 + }, + { + "epoch": 0.8908296943231441, + "grad_norm": 0.10708595724786292, + "learning_rate": 8.429766916485087e-06, + "loss": 0.0398, + "num_tokens": 57199169.0, + "step": 714 + }, + { + "epoch": 0.8920773549594511, + "grad_norm": 0.1289490659534409, + "learning_rate": 8.42516482337279e-06, + "loss": 0.0419, + "num_tokens": 57278512.0, + "step": 715 + }, + { + "epoch": 0.8933250155957579, + "grad_norm": 0.13480571497635496, + "learning_rate": 8.420557426052513e-06, + "loss": 0.0432, + "num_tokens": 57358212.0, + "step": 716 + }, + { + "epoch": 0.8945726762320648, + "grad_norm": 0.1402455870237538, + "learning_rate": 8.415944732878863e-06, + "loss": 0.0413, + "num_tokens": 57437516.0, + "step": 717 + }, + { + "epoch": 0.8958203368683718, + "grad_norm": 0.13195092592426141, + "learning_rate": 8.411326752216048e-06, + "loss": 0.0469, + "num_tokens": 57518129.0, + "step": 718 + }, + { + "epoch": 0.8970679975046787, + "grad_norm": 0.12111826822571906, + "learning_rate": 8.406703492437863e-06, + "loss": 0.0464, + "num_tokens": 57598680.0, + "step": 719 + }, + { + "epoch": 0.8983156581409857, + "grad_norm": 0.1316599285653508, + "learning_rate": 8.402074961927674e-06, + "loss": 0.0467, + "num_tokens": 57679535.0, + "step": 720 + }, + { + "epoch": 0.8995633187772926, + "grad_norm": 0.12648224011362696, + "learning_rate": 8.397441169078404e-06, + "loss": 0.0474, + "num_tokens": 57761803.0, + "step": 721 + }, + { + "epoch": 0.9008109794135996, + "grad_norm": 0.13349369075280076, + "learning_rate": 8.392802122292522e-06, + "loss": 0.0453, + "num_tokens": 57841711.0, + "step": 722 + }, + { + "epoch": 0.9020586400499064, + "grad_norm": 0.12412030984325427, + "learning_rate": 8.388157829982023e-06, + "loss": 0.0462, + "num_tokens": 57921862.0, + "step": 723 + }, + { + "epoch": 0.9033063006862133, + "grad_norm": 0.12191290188001579, + "learning_rate": 8.383508300568409e-06, + "loss": 0.0423, + "num_tokens": 58001170.0, + "step": 724 + }, + { + "epoch": 0.9045539613225203, + "grad_norm": 0.12947888801849444, + "learning_rate": 8.378853542482687e-06, + "loss": 0.0444, + "num_tokens": 58078720.0, + "step": 725 + }, + { + "epoch": 0.9058016219588272, + "grad_norm": 0.12187109531791596, + "learning_rate": 8.374193564165338e-06, + "loss": 0.0417, + "num_tokens": 58158057.0, + "step": 726 + }, + { + "epoch": 0.9070492825951342, + "grad_norm": 0.1303865282661451, + "learning_rate": 8.36952837406631e-06, + "loss": 0.044, + "num_tokens": 58237021.0, + "step": 727 + }, + { + "epoch": 0.9082969432314411, + "grad_norm": 0.12384626240849539, + "learning_rate": 8.364857980645006e-06, + "loss": 0.0436, + "num_tokens": 58318537.0, + "step": 728 + }, + { + "epoch": 0.9095446038677479, + "grad_norm": 0.123878366797376, + "learning_rate": 8.360182392370258e-06, + "loss": 0.0463, + "num_tokens": 58398712.0, + "step": 729 + }, + { + "epoch": 0.9107922645040549, + "grad_norm": 0.1252594849614702, + "learning_rate": 8.355501617720321e-06, + "loss": 0.0469, + "num_tokens": 58480120.0, + "step": 730 + }, + { + "epoch": 0.9120399251403618, + "grad_norm": 0.12212319000991989, + "learning_rate": 8.350815665182855e-06, + "loss": 0.0415, + "num_tokens": 58559337.0, + "step": 731 + }, + { + "epoch": 0.9132875857766688, + "grad_norm": 0.13115254226617012, + "learning_rate": 8.34612454325491e-06, + "loss": 0.045, + "num_tokens": 58639403.0, + "step": 732 + }, + { + "epoch": 0.9145352464129757, + "grad_norm": 0.1153903682923885, + "learning_rate": 8.341428260442907e-06, + "loss": 0.0421, + "num_tokens": 58719196.0, + "step": 733 + }, + { + "epoch": 0.9157829070492826, + "grad_norm": 0.1350993442001407, + "learning_rate": 8.336726825262622e-06, + "loss": 0.0458, + "num_tokens": 58798792.0, + "step": 734 + }, + { + "epoch": 0.9170305676855895, + "grad_norm": 0.116827202506226, + "learning_rate": 8.332020246239183e-06, + "loss": 0.0454, + "num_tokens": 58878585.0, + "step": 735 + }, + { + "epoch": 0.9182782283218964, + "grad_norm": 0.13063470525394222, + "learning_rate": 8.327308531907039e-06, + "loss": 0.0429, + "num_tokens": 58957398.0, + "step": 736 + }, + { + "epoch": 0.9195258889582034, + "grad_norm": 0.11943276059553698, + "learning_rate": 8.322591690809952e-06, + "loss": 0.0436, + "num_tokens": 59036436.0, + "step": 737 + }, + { + "epoch": 0.9207735495945103, + "grad_norm": 0.13311888551517942, + "learning_rate": 8.317869731500981e-06, + "loss": 0.0472, + "num_tokens": 59117727.0, + "step": 738 + }, + { + "epoch": 0.9220212102308172, + "grad_norm": 0.13441754023620334, + "learning_rate": 8.313142662542465e-06, + "loss": 0.0427, + "num_tokens": 59198600.0, + "step": 739 + }, + { + "epoch": 0.9232688708671242, + "grad_norm": 0.12903006803949557, + "learning_rate": 8.30841049250601e-06, + "loss": 0.043, + "num_tokens": 59276825.0, + "step": 740 + }, + { + "epoch": 0.924516531503431, + "grad_norm": 0.11892900054116198, + "learning_rate": 8.303673229972468e-06, + "loss": 0.0428, + "num_tokens": 59356479.0, + "step": 741 + }, + { + "epoch": 0.925764192139738, + "grad_norm": 0.11014129765319786, + "learning_rate": 8.298930883531932e-06, + "loss": 0.0402, + "num_tokens": 59435634.0, + "step": 742 + }, + { + "epoch": 0.9270118527760449, + "grad_norm": 0.12790379344343886, + "learning_rate": 8.294183461783704e-06, + "loss": 0.0479, + "num_tokens": 59518043.0, + "step": 743 + }, + { + "epoch": 0.9282595134123518, + "grad_norm": 0.1271886081838466, + "learning_rate": 8.2894309733363e-06, + "loss": 0.0449, + "num_tokens": 59598023.0, + "step": 744 + }, + { + "epoch": 0.9295071740486588, + "grad_norm": 0.12608804603183602, + "learning_rate": 8.284673426807413e-06, + "loss": 0.0442, + "num_tokens": 59677990.0, + "step": 745 + }, + { + "epoch": 0.9307548346849657, + "grad_norm": 0.12197353808866439, + "learning_rate": 8.279910830823917e-06, + "loss": 0.0428, + "num_tokens": 59757003.0, + "step": 746 + }, + { + "epoch": 0.9320024953212727, + "grad_norm": 0.1183743614396383, + "learning_rate": 8.275143194021837e-06, + "loss": 0.0421, + "num_tokens": 59835942.0, + "step": 747 + }, + { + "epoch": 0.9332501559575795, + "grad_norm": 0.11788779371119068, + "learning_rate": 8.270370525046338e-06, + "loss": 0.0387, + "num_tokens": 59915228.0, + "step": 748 + }, + { + "epoch": 0.9344978165938864, + "grad_norm": 0.12838666597907625, + "learning_rate": 8.265592832551714e-06, + "loss": 0.0459, + "num_tokens": 59997067.0, + "step": 749 + }, + { + "epoch": 0.9357454772301934, + "grad_norm": 0.12163620589626947, + "learning_rate": 8.260810125201363e-06, + "loss": 0.0441, + "num_tokens": 60076744.0, + "step": 750 + }, + { + "epoch": 0.9369931378665003, + "grad_norm": 0.1208216904975396, + "learning_rate": 8.25602241166778e-06, + "loss": 0.0434, + "num_tokens": 60156803.0, + "step": 751 + }, + { + "epoch": 0.9382407985028073, + "grad_norm": 0.12583135298289463, + "learning_rate": 8.251229700632536e-06, + "loss": 0.0439, + "num_tokens": 60237132.0, + "step": 752 + }, + { + "epoch": 0.9394884591391142, + "grad_norm": 0.12011312406139918, + "learning_rate": 8.246432000786267e-06, + "loss": 0.0409, + "num_tokens": 60316759.0, + "step": 753 + }, + { + "epoch": 0.940736119775421, + "grad_norm": 0.1257483923597958, + "learning_rate": 8.241629320828652e-06, + "loss": 0.0431, + "num_tokens": 60395865.0, + "step": 754 + }, + { + "epoch": 0.941983780411728, + "grad_norm": 0.12598050416310153, + "learning_rate": 8.2368216694684e-06, + "loss": 0.0448, + "num_tokens": 60476033.0, + "step": 755 + }, + { + "epoch": 0.9432314410480349, + "grad_norm": 0.1371990547888002, + "learning_rate": 8.232009055423236e-06, + "loss": 0.0429, + "num_tokens": 60555833.0, + "step": 756 + }, + { + "epoch": 0.9444791016843419, + "grad_norm": 0.12968495911088507, + "learning_rate": 8.227191487419887e-06, + "loss": 0.0431, + "num_tokens": 60635691.0, + "step": 757 + }, + { + "epoch": 0.9457267623206488, + "grad_norm": 0.1222666448004795, + "learning_rate": 8.222368974194057e-06, + "loss": 0.0423, + "num_tokens": 60715830.0, + "step": 758 + }, + { + "epoch": 0.9469744229569557, + "grad_norm": 0.13249737757353294, + "learning_rate": 8.217541524490422e-06, + "loss": 0.0504, + "num_tokens": 60796607.0, + "step": 759 + }, + { + "epoch": 0.9482220835932627, + "grad_norm": 0.12046136010955967, + "learning_rate": 8.212709147062604e-06, + "loss": 0.0407, + "num_tokens": 60875388.0, + "step": 760 + }, + { + "epoch": 0.9494697442295695, + "grad_norm": 0.1347014144528205, + "learning_rate": 8.207871850673168e-06, + "loss": 0.0418, + "num_tokens": 60954745.0, + "step": 761 + }, + { + "epoch": 0.9507174048658765, + "grad_norm": 0.13217879759164464, + "learning_rate": 8.203029644093593e-06, + "loss": 0.0473, + "num_tokens": 61035577.0, + "step": 762 + }, + { + "epoch": 0.9519650655021834, + "grad_norm": 0.12642961452954143, + "learning_rate": 8.198182536104262e-06, + "loss": 0.0438, + "num_tokens": 61114939.0, + "step": 763 + }, + { + "epoch": 0.9532127261384903, + "grad_norm": 0.12699546277574592, + "learning_rate": 8.193330535494448e-06, + "loss": 0.0433, + "num_tokens": 61194941.0, + "step": 764 + }, + { + "epoch": 0.9544603867747973, + "grad_norm": 0.1248723416107508, + "learning_rate": 8.188473651062296e-06, + "loss": 0.0444, + "num_tokens": 61275037.0, + "step": 765 + }, + { + "epoch": 0.9557080474111042, + "grad_norm": 0.1224075580549069, + "learning_rate": 8.183611891614803e-06, + "loss": 0.0413, + "num_tokens": 61354585.0, + "step": 766 + }, + { + "epoch": 0.9569557080474111, + "grad_norm": 0.12023974168888206, + "learning_rate": 8.178745265967808e-06, + "loss": 0.0413, + "num_tokens": 61434070.0, + "step": 767 + }, + { + "epoch": 0.958203368683718, + "grad_norm": 0.11768365656858075, + "learning_rate": 8.173873782945976e-06, + "loss": 0.044, + "num_tokens": 61513480.0, + "step": 768 + }, + { + "epoch": 0.9594510293200249, + "grad_norm": 0.12252686093661061, + "learning_rate": 8.168997451382778e-06, + "loss": 0.0466, + "num_tokens": 61593111.0, + "step": 769 + }, + { + "epoch": 0.9606986899563319, + "grad_norm": 0.12444242831191968, + "learning_rate": 8.164116280120478e-06, + "loss": 0.0467, + "num_tokens": 61673108.0, + "step": 770 + }, + { + "epoch": 0.9619463505926388, + "grad_norm": 0.1170074311782113, + "learning_rate": 8.159230278010113e-06, + "loss": 0.0408, + "num_tokens": 61752840.0, + "step": 771 + }, + { + "epoch": 0.9631940112289458, + "grad_norm": 0.11877585272337694, + "learning_rate": 8.154339453911483e-06, + "loss": 0.0429, + "num_tokens": 61832555.0, + "step": 772 + }, + { + "epoch": 0.9644416718652526, + "grad_norm": 0.12480036823767689, + "learning_rate": 8.14944381669313e-06, + "loss": 0.0452, + "num_tokens": 61912425.0, + "step": 773 + }, + { + "epoch": 0.9656893325015595, + "grad_norm": 0.11828195611774657, + "learning_rate": 8.144543375232322e-06, + "loss": 0.0408, + "num_tokens": 61992459.0, + "step": 774 + }, + { + "epoch": 0.9669369931378665, + "grad_norm": 0.1326374355179406, + "learning_rate": 8.139638138415041e-06, + "loss": 0.0431, + "num_tokens": 62073365.0, + "step": 775 + }, + { + "epoch": 0.9681846537741734, + "grad_norm": 0.13258216878493195, + "learning_rate": 8.134728115135967e-06, + "loss": 0.042, + "num_tokens": 62153344.0, + "step": 776 + }, + { + "epoch": 0.9694323144104804, + "grad_norm": 0.11425515280955807, + "learning_rate": 8.129813314298457e-06, + "loss": 0.0429, + "num_tokens": 62233445.0, + "step": 777 + }, + { + "epoch": 0.9706799750467873, + "grad_norm": 0.12422677837162528, + "learning_rate": 8.124893744814524e-06, + "loss": 0.0483, + "num_tokens": 62313446.0, + "step": 778 + }, + { + "epoch": 0.9719276356830941, + "grad_norm": 0.12548120606989233, + "learning_rate": 8.11996941560484e-06, + "loss": 0.0423, + "num_tokens": 62392794.0, + "step": 779 + }, + { + "epoch": 0.9731752963194011, + "grad_norm": 0.11634689195370958, + "learning_rate": 8.115040335598701e-06, + "loss": 0.0441, + "num_tokens": 62472294.0, + "step": 780 + }, + { + "epoch": 0.974422956955708, + "grad_norm": 0.12855102743082578, + "learning_rate": 8.110106513734019e-06, + "loss": 0.0467, + "num_tokens": 62552135.0, + "step": 781 + }, + { + "epoch": 0.975670617592015, + "grad_norm": 0.12511638236191897, + "learning_rate": 8.105167958957302e-06, + "loss": 0.0425, + "num_tokens": 62632686.0, + "step": 782 + }, + { + "epoch": 0.9769182782283219, + "grad_norm": 0.1235968437568558, + "learning_rate": 8.100224680223647e-06, + "loss": 0.0458, + "num_tokens": 62712757.0, + "step": 783 + }, + { + "epoch": 0.9781659388646288, + "grad_norm": 0.12105321822365409, + "learning_rate": 8.09527668649671e-06, + "loss": 0.0455, + "num_tokens": 62792452.0, + "step": 784 + }, + { + "epoch": 0.9794135995009358, + "grad_norm": 0.12052561809276435, + "learning_rate": 8.090323986748696e-06, + "loss": 0.0433, + "num_tokens": 62872571.0, + "step": 785 + }, + { + "epoch": 0.9806612601372426, + "grad_norm": 0.11630478819939606, + "learning_rate": 8.085366589960353e-06, + "loss": 0.0434, + "num_tokens": 62954093.0, + "step": 786 + }, + { + "epoch": 0.9819089207735496, + "grad_norm": 0.12090710790599722, + "learning_rate": 8.080404505120936e-06, + "loss": 0.0415, + "num_tokens": 63033837.0, + "step": 787 + }, + { + "epoch": 0.9831565814098565, + "grad_norm": 0.11999218162825294, + "learning_rate": 8.075437741228205e-06, + "loss": 0.044, + "num_tokens": 63113726.0, + "step": 788 + }, + { + "epoch": 0.9844042420461634, + "grad_norm": 0.12012264791201754, + "learning_rate": 8.070466307288404e-06, + "loss": 0.045, + "num_tokens": 63195437.0, + "step": 789 + }, + { + "epoch": 0.9856519026824704, + "grad_norm": 0.10297347591984726, + "learning_rate": 8.065490212316245e-06, + "loss": 0.0385, + "num_tokens": 63274120.0, + "step": 790 + }, + { + "epoch": 0.9868995633187773, + "grad_norm": 0.1159425491489533, + "learning_rate": 8.060509465334895e-06, + "loss": 0.0421, + "num_tokens": 63353768.0, + "step": 791 + }, + { + "epoch": 0.9881472239550843, + "grad_norm": 0.12583816039244836, + "learning_rate": 8.055524075375951e-06, + "loss": 0.044, + "num_tokens": 63433687.0, + "step": 792 + }, + { + "epoch": 0.9893948845913911, + "grad_norm": 0.1300665820006064, + "learning_rate": 8.050534051479432e-06, + "loss": 0.0634, + "num_tokens": 63514928.0, + "step": 793 + }, + { + "epoch": 0.990642545227698, + "grad_norm": 0.1198989461072627, + "learning_rate": 8.045539402693759e-06, + "loss": 0.0435, + "num_tokens": 63594238.0, + "step": 794 + }, + { + "epoch": 0.991890205864005, + "grad_norm": 0.12871787019951422, + "learning_rate": 8.040540138075743e-06, + "loss": 0.044, + "num_tokens": 63674352.0, + "step": 795 + }, + { + "epoch": 0.9931378665003119, + "grad_norm": 0.14279234821369916, + "learning_rate": 8.035536266690561e-06, + "loss": 0.0437, + "num_tokens": 63755466.0, + "step": 796 + }, + { + "epoch": 0.9943855271366189, + "grad_norm": 0.12668717371746102, + "learning_rate": 8.030527797611742e-06, + "loss": 0.0472, + "num_tokens": 63835541.0, + "step": 797 + }, + { + "epoch": 0.9956331877729258, + "grad_norm": 0.12504493154631438, + "learning_rate": 8.025514739921155e-06, + "loss": 0.0439, + "num_tokens": 63914999.0, + "step": 798 + }, + { + "epoch": 0.9968808484092326, + "grad_norm": 0.12250309041796634, + "learning_rate": 8.02049710270899e-06, + "loss": 0.0408, + "num_tokens": 63995141.0, + "step": 799 + }, + { + "epoch": 0.9981285090455396, + "grad_norm": 0.11677292603996331, + "learning_rate": 8.015474895073739e-06, + "loss": 0.042, + "num_tokens": 64074961.0, + "step": 800 + }, + { + "epoch": 0.9993761696818465, + "grad_norm": 0.11305611872055077, + "learning_rate": 8.010448126122183e-06, + "loss": 0.0424, + "num_tokens": 64156160.0, + "step": 801 + }, + { + "epoch": 1.0, + "grad_norm": 0.11305611872055077, + "learning_rate": 8.005416804969374e-06, + "loss": 0.0453, + "num_tokens": 64196778.0, + "step": 802 + }, + { + "epoch": 1.001247660636307, + "grad_norm": 0.19297241332841056, + "learning_rate": 8.000380940738616e-06, + "loss": 0.0365, + "num_tokens": 64276858.0, + "step": 803 + }, + { + "epoch": 1.0024953212726138, + "grad_norm": 0.12339605814727976, + "learning_rate": 7.995340542561453e-06, + "loss": 0.0365, + "num_tokens": 64356382.0, + "step": 804 + }, + { + "epoch": 1.0037429819089208, + "grad_norm": 0.10655882436094036, + "learning_rate": 7.990295619577653e-06, + "loss": 0.0403, + "num_tokens": 64437312.0, + "step": 805 + }, + { + "epoch": 1.0049906425452277, + "grad_norm": 0.11516009325890297, + "learning_rate": 7.985246180935184e-06, + "loss": 0.0414, + "num_tokens": 64518748.0, + "step": 806 + }, + { + "epoch": 1.0062383031815347, + "grad_norm": 0.11139160659856995, + "learning_rate": 7.980192235790207e-06, + "loss": 0.0367, + "num_tokens": 64598711.0, + "step": 807 + }, + { + "epoch": 1.0074859638178415, + "grad_norm": 0.12064118588838565, + "learning_rate": 7.97513379330705e-06, + "loss": 0.0375, + "num_tokens": 64678489.0, + "step": 808 + }, + { + "epoch": 1.0087336244541485, + "grad_norm": 0.11389522139854903, + "learning_rate": 7.970070862658198e-06, + "loss": 0.036, + "num_tokens": 64758290.0, + "step": 809 + }, + { + "epoch": 1.0099812850904555, + "grad_norm": 0.11582742904776548, + "learning_rate": 7.965003453024273e-06, + "loss": 0.0387, + "num_tokens": 64838965.0, + "step": 810 + }, + { + "epoch": 1.0112289457267623, + "grad_norm": 0.11758744952524129, + "learning_rate": 7.959931573594025e-06, + "loss": 0.0373, + "num_tokens": 64919317.0, + "step": 811 + }, + { + "epoch": 1.0124766063630692, + "grad_norm": 0.1087404021707414, + "learning_rate": 7.954855233564301e-06, + "loss": 0.036, + "num_tokens": 65000106.0, + "step": 812 + }, + { + "epoch": 1.0137242669993762, + "grad_norm": 0.13115802969604487, + "learning_rate": 7.949774442140043e-06, + "loss": 0.0387, + "num_tokens": 65079674.0, + "step": 813 + }, + { + "epoch": 1.014971927635683, + "grad_norm": 0.1266899812719372, + "learning_rate": 7.944689208534257e-06, + "loss": 0.0383, + "num_tokens": 65160522.0, + "step": 814 + }, + { + "epoch": 1.01621958827199, + "grad_norm": 0.12285583224697105, + "learning_rate": 7.939599541968012e-06, + "loss": 0.0383, + "num_tokens": 65240870.0, + "step": 815 + }, + { + "epoch": 1.017467248908297, + "grad_norm": 0.1190497077658083, + "learning_rate": 7.93450545167041e-06, + "loss": 0.0352, + "num_tokens": 65319646.0, + "step": 816 + }, + { + "epoch": 1.018714909544604, + "grad_norm": 0.12563652698818017, + "learning_rate": 7.929406946878576e-06, + "loss": 0.0388, + "num_tokens": 65400645.0, + "step": 817 + }, + { + "epoch": 1.0199625701809107, + "grad_norm": 0.1234239723405108, + "learning_rate": 7.924304036837643e-06, + "loss": 0.0373, + "num_tokens": 65479114.0, + "step": 818 + }, + { + "epoch": 1.0212102308172177, + "grad_norm": 0.13005122038538955, + "learning_rate": 7.919196730800727e-06, + "loss": 0.0373, + "num_tokens": 65559516.0, + "step": 819 + }, + { + "epoch": 1.0224578914535247, + "grad_norm": 0.11725724234923955, + "learning_rate": 7.914085038028918e-06, + "loss": 0.0357, + "num_tokens": 65639488.0, + "step": 820 + }, + { + "epoch": 1.0237055520898315, + "grad_norm": 0.11014632164428752, + "learning_rate": 7.908968967791262e-06, + "loss": 0.0358, + "num_tokens": 65719075.0, + "step": 821 + }, + { + "epoch": 1.0249532127261385, + "grad_norm": 0.11721508584086246, + "learning_rate": 7.903848529364738e-06, + "loss": 0.0373, + "num_tokens": 65799000.0, + "step": 822 + }, + { + "epoch": 1.0262008733624455, + "grad_norm": 0.11517241066377722, + "learning_rate": 7.89872373203425e-06, + "loss": 0.0389, + "num_tokens": 65878502.0, + "step": 823 + }, + { + "epoch": 1.0274485339987522, + "grad_norm": 0.12088239271003214, + "learning_rate": 7.893594585092601e-06, + "loss": 0.0374, + "num_tokens": 65959219.0, + "step": 824 + }, + { + "epoch": 1.0286961946350592, + "grad_norm": 0.12193974539111146, + "learning_rate": 7.888461097840494e-06, + "loss": 0.0358, + "num_tokens": 66039818.0, + "step": 825 + }, + { + "epoch": 1.0299438552713662, + "grad_norm": 0.12182216187687943, + "learning_rate": 7.883323279586483e-06, + "loss": 0.0374, + "num_tokens": 66119441.0, + "step": 826 + }, + { + "epoch": 1.0311915159076732, + "grad_norm": 0.11841925845138539, + "learning_rate": 7.87818113964699e-06, + "loss": 0.0377, + "num_tokens": 66199667.0, + "step": 827 + }, + { + "epoch": 1.03243917654398, + "grad_norm": 0.12421286713634912, + "learning_rate": 7.873034687346268e-06, + "loss": 0.0371, + "num_tokens": 66279952.0, + "step": 828 + }, + { + "epoch": 1.033686837180287, + "grad_norm": 0.11425196856934319, + "learning_rate": 7.86788393201639e-06, + "loss": 0.0366, + "num_tokens": 66359882.0, + "step": 829 + }, + { + "epoch": 1.034934497816594, + "grad_norm": 0.12393074676184707, + "learning_rate": 7.862728882997236e-06, + "loss": 0.0372, + "num_tokens": 66439831.0, + "step": 830 + }, + { + "epoch": 1.0361821584529007, + "grad_norm": 0.11799704195610591, + "learning_rate": 7.857569549636462e-06, + "loss": 0.0416, + "num_tokens": 66519952.0, + "step": 831 + }, + { + "epoch": 1.0374298190892077, + "grad_norm": 0.12409491334059666, + "learning_rate": 7.852405941289503e-06, + "loss": 0.0372, + "num_tokens": 66598948.0, + "step": 832 + }, + { + "epoch": 1.0386774797255147, + "grad_norm": 0.12220754153408792, + "learning_rate": 7.847238067319542e-06, + "loss": 0.0372, + "num_tokens": 66680599.0, + "step": 833 + }, + { + "epoch": 1.0399251403618215, + "grad_norm": 0.12805122770002453, + "learning_rate": 7.842065937097495e-06, + "loss": 0.0357, + "num_tokens": 66759842.0, + "step": 834 + }, + { + "epoch": 1.0411728009981285, + "grad_norm": 0.11193160429999634, + "learning_rate": 7.836889560001997e-06, + "loss": 0.0372, + "num_tokens": 66839975.0, + "step": 835 + }, + { + "epoch": 1.0424204616344355, + "grad_norm": 0.11705280167012153, + "learning_rate": 7.831708945419383e-06, + "loss": 0.0354, + "num_tokens": 66919473.0, + "step": 836 + }, + { + "epoch": 1.0436681222707425, + "grad_norm": 0.11882428882714241, + "learning_rate": 7.826524102743678e-06, + "loss": 0.0469, + "num_tokens": 67000971.0, + "step": 837 + }, + { + "epoch": 1.0449157829070492, + "grad_norm": 0.11949649165411871, + "learning_rate": 7.821335041376565e-06, + "loss": 0.0399, + "num_tokens": 67082398.0, + "step": 838 + }, + { + "epoch": 1.0461634435433562, + "grad_norm": 0.13222049645536524, + "learning_rate": 7.816141770727381e-06, + "loss": 0.038, + "num_tokens": 67162807.0, + "step": 839 + }, + { + "epoch": 1.0474111041796632, + "grad_norm": 0.1287747273510056, + "learning_rate": 7.810944300213095e-06, + "loss": 0.0365, + "num_tokens": 67243036.0, + "step": 840 + }, + { + "epoch": 1.04865876481597, + "grad_norm": 0.12359681273704244, + "learning_rate": 7.805742639258297e-06, + "loss": 0.0377, + "num_tokens": 67324171.0, + "step": 841 + }, + { + "epoch": 1.049906425452277, + "grad_norm": 0.11791544460990037, + "learning_rate": 7.800536797295164e-06, + "loss": 0.0385, + "num_tokens": 67406821.0, + "step": 842 + }, + { + "epoch": 1.051154086088584, + "grad_norm": 0.11669683358736094, + "learning_rate": 7.795326783763463e-06, + "loss": 0.036, + "num_tokens": 67486421.0, + "step": 843 + }, + { + "epoch": 1.0524017467248907, + "grad_norm": 0.1145704300717487, + "learning_rate": 7.790112608110523e-06, + "loss": 0.0375, + "num_tokens": 67566397.0, + "step": 844 + }, + { + "epoch": 1.0536494073611977, + "grad_norm": 0.11192160356785108, + "learning_rate": 7.784894279791224e-06, + "loss": 0.0355, + "num_tokens": 67646842.0, + "step": 845 + }, + { + "epoch": 1.0548970679975047, + "grad_norm": 0.11562580889859264, + "learning_rate": 7.779671808267968e-06, + "loss": 0.0378, + "num_tokens": 67727482.0, + "step": 846 + }, + { + "epoch": 1.0561447286338117, + "grad_norm": 0.12153279759646565, + "learning_rate": 7.774445203010676e-06, + "loss": 0.0359, + "num_tokens": 67807939.0, + "step": 847 + }, + { + "epoch": 1.0573923892701185, + "grad_norm": 0.11470566792658915, + "learning_rate": 7.769214473496766e-06, + "loss": 0.0407, + "num_tokens": 67888052.0, + "step": 848 + }, + { + "epoch": 1.0586400499064255, + "grad_norm": 0.12383359576287097, + "learning_rate": 7.763979629211127e-06, + "loss": 0.038, + "num_tokens": 67968032.0, + "step": 849 + }, + { + "epoch": 1.0598877105427325, + "grad_norm": 0.11881454127188035, + "learning_rate": 7.758740679646115e-06, + "loss": 0.0371, + "num_tokens": 68046937.0, + "step": 850 + }, + { + "epoch": 1.0611353711790392, + "grad_norm": 0.11464903476697275, + "learning_rate": 7.753497634301532e-06, + "loss": 0.0364, + "num_tokens": 68127203.0, + "step": 851 + }, + { + "epoch": 1.0623830318153462, + "grad_norm": 0.12119441827724774, + "learning_rate": 7.748250502684601e-06, + "loss": 0.0367, + "num_tokens": 68207782.0, + "step": 852 + }, + { + "epoch": 1.0636306924516532, + "grad_norm": 0.13058645569693353, + "learning_rate": 7.742999294309959e-06, + "loss": 0.037, + "num_tokens": 68287141.0, + "step": 853 + }, + { + "epoch": 1.06487835308796, + "grad_norm": 0.12137449605751832, + "learning_rate": 7.737744018699634e-06, + "loss": 0.0386, + "num_tokens": 68367278.0, + "step": 854 + }, + { + "epoch": 1.066126013724267, + "grad_norm": 0.1175051411674423, + "learning_rate": 7.732484685383027e-06, + "loss": 0.0378, + "num_tokens": 68448023.0, + "step": 855 + }, + { + "epoch": 1.067373674360574, + "grad_norm": 0.11774104211220471, + "learning_rate": 7.7272213038969e-06, + "loss": 0.0362, + "num_tokens": 68527504.0, + "step": 856 + }, + { + "epoch": 1.068621334996881, + "grad_norm": 0.12107302317693551, + "learning_rate": 7.72195388378536e-06, + "loss": 0.0364, + "num_tokens": 68607013.0, + "step": 857 + }, + { + "epoch": 1.0698689956331877, + "grad_norm": 0.1227886444188267, + "learning_rate": 7.716682434599823e-06, + "loss": 0.0379, + "num_tokens": 68687882.0, + "step": 858 + }, + { + "epoch": 1.0711166562694947, + "grad_norm": 0.11421078198972955, + "learning_rate": 7.711406965899026e-06, + "loss": 0.0357, + "num_tokens": 68767520.0, + "step": 859 + }, + { + "epoch": 1.0723643169058017, + "grad_norm": 0.13139694750578262, + "learning_rate": 7.706127487248984e-06, + "loss": 0.0397, + "num_tokens": 68848548.0, + "step": 860 + }, + { + "epoch": 1.0736119775421085, + "grad_norm": 0.12048027098934615, + "learning_rate": 7.70084400822299e-06, + "loss": 0.0394, + "num_tokens": 68929107.0, + "step": 861 + }, + { + "epoch": 1.0748596381784155, + "grad_norm": 0.11688196802345287, + "learning_rate": 7.695556538401588e-06, + "loss": 0.0337, + "num_tokens": 69008833.0, + "step": 862 + }, + { + "epoch": 1.0761072988147224, + "grad_norm": 0.1178101906162924, + "learning_rate": 7.690265087372559e-06, + "loss": 0.0392, + "num_tokens": 69089477.0, + "step": 863 + }, + { + "epoch": 1.0773549594510294, + "grad_norm": 0.11901289710556308, + "learning_rate": 7.684969664730903e-06, + "loss": 0.0395, + "num_tokens": 69170299.0, + "step": 864 + }, + { + "epoch": 1.0786026200873362, + "grad_norm": 0.1131133914969235, + "learning_rate": 7.679670280078823e-06, + "loss": 0.0381, + "num_tokens": 69251770.0, + "step": 865 + }, + { + "epoch": 1.0798502807236432, + "grad_norm": 0.11744420013358728, + "learning_rate": 7.674366943025705e-06, + "loss": 0.0363, + "num_tokens": 69331910.0, + "step": 866 + }, + { + "epoch": 1.0810979413599502, + "grad_norm": 0.11404106643899889, + "learning_rate": 7.669059663188099e-06, + "loss": 0.037, + "num_tokens": 69411473.0, + "step": 867 + }, + { + "epoch": 1.082345601996257, + "grad_norm": 0.11319859025103475, + "learning_rate": 7.66374845018971e-06, + "loss": 0.0348, + "num_tokens": 69491703.0, + "step": 868 + }, + { + "epoch": 1.083593262632564, + "grad_norm": 0.112683453177615, + "learning_rate": 7.658433313661372e-06, + "loss": 0.0392, + "num_tokens": 69572447.0, + "step": 869 + }, + { + "epoch": 1.084840923268871, + "grad_norm": 0.1276133280655265, + "learning_rate": 7.653114263241034e-06, + "loss": 0.0388, + "num_tokens": 69653823.0, + "step": 870 + }, + { + "epoch": 1.0860885839051777, + "grad_norm": 0.1240984988092244, + "learning_rate": 7.647791308573744e-06, + "loss": 0.0387, + "num_tokens": 69734055.0, + "step": 871 + }, + { + "epoch": 1.0873362445414847, + "grad_norm": 0.12356452504447533, + "learning_rate": 7.642464459311623e-06, + "loss": 0.0347, + "num_tokens": 69813965.0, + "step": 872 + }, + { + "epoch": 1.0885839051777917, + "grad_norm": 0.11514715620013154, + "learning_rate": 7.637133725113864e-06, + "loss": 0.0366, + "num_tokens": 69894363.0, + "step": 873 + }, + { + "epoch": 1.0898315658140985, + "grad_norm": 0.12779474123132986, + "learning_rate": 7.631799115646697e-06, + "loss": 0.0357, + "num_tokens": 69973323.0, + "step": 874 + }, + { + "epoch": 1.0910792264504054, + "grad_norm": 0.11790975872627733, + "learning_rate": 7.6264606405833805e-06, + "loss": 0.0363, + "num_tokens": 70054250.0, + "step": 875 + }, + { + "epoch": 1.0923268870867124, + "grad_norm": 0.11641886596060783, + "learning_rate": 7.621118309604186e-06, + "loss": 0.0422, + "num_tokens": 70133988.0, + "step": 876 + }, + { + "epoch": 1.0935745477230194, + "grad_norm": 0.11751888094011356, + "learning_rate": 7.615772132396373e-06, + "loss": 0.035, + "num_tokens": 70213674.0, + "step": 877 + }, + { + "epoch": 1.0948222083593262, + "grad_norm": 0.10553263064141878, + "learning_rate": 7.6104221186541745e-06, + "loss": 0.0356, + "num_tokens": 70292896.0, + "step": 878 + }, + { + "epoch": 1.0960698689956332, + "grad_norm": 0.12384471705795567, + "learning_rate": 7.6050682780787865e-06, + "loss": 0.0397, + "num_tokens": 70372902.0, + "step": 879 + }, + { + "epoch": 1.0973175296319402, + "grad_norm": 0.1290651741195708, + "learning_rate": 7.599710620378337e-06, + "loss": 0.0362, + "num_tokens": 70453829.0, + "step": 880 + }, + { + "epoch": 1.098565190268247, + "grad_norm": 0.12342303328201779, + "learning_rate": 7.594349155267879e-06, + "loss": 0.0362, + "num_tokens": 70533001.0, + "step": 881 + }, + { + "epoch": 1.099812850904554, + "grad_norm": 0.12141677579551534, + "learning_rate": 7.588983892469372e-06, + "loss": 0.0367, + "num_tokens": 70613271.0, + "step": 882 + }, + { + "epoch": 1.101060511540861, + "grad_norm": 0.12196685769280396, + "learning_rate": 7.583614841711657e-06, + "loss": 0.0365, + "num_tokens": 70692565.0, + "step": 883 + }, + { + "epoch": 1.102308172177168, + "grad_norm": 0.12354073131658985, + "learning_rate": 7.5782420127304466e-06, + "loss": 0.0384, + "num_tokens": 70772857.0, + "step": 884 + }, + { + "epoch": 1.1035558328134747, + "grad_norm": 0.11720083779315267, + "learning_rate": 7.572865415268303e-06, + "loss": 0.0355, + "num_tokens": 70852777.0, + "step": 885 + }, + { + "epoch": 1.1048034934497817, + "grad_norm": 0.11247611644561628, + "learning_rate": 7.567485059074623e-06, + "loss": 0.0367, + "num_tokens": 70933435.0, + "step": 886 + }, + { + "epoch": 1.1060511540860887, + "grad_norm": 0.12333260571429218, + "learning_rate": 7.5621009539056175e-06, + "loss": 0.0374, + "num_tokens": 71013136.0, + "step": 887 + }, + { + "epoch": 1.1072988147223954, + "grad_norm": 0.12451469169995692, + "learning_rate": 7.556713109524301e-06, + "loss": 0.0372, + "num_tokens": 71093256.0, + "step": 888 + }, + { + "epoch": 1.1085464753587024, + "grad_norm": 0.13359967657120056, + "learning_rate": 7.551321535700456e-06, + "loss": 0.0358, + "num_tokens": 71172930.0, + "step": 889 + }, + { + "epoch": 1.1097941359950094, + "grad_norm": 0.11691502751945572, + "learning_rate": 7.545926242210643e-06, + "loss": 0.0362, + "num_tokens": 71252476.0, + "step": 890 + }, + { + "epoch": 1.1110417966313162, + "grad_norm": 0.12287538546531206, + "learning_rate": 7.540527238838156e-06, + "loss": 0.0352, + "num_tokens": 71331645.0, + "step": 891 + }, + { + "epoch": 1.1122894572676232, + "grad_norm": 0.12355870380253296, + "learning_rate": 7.535124535373019e-06, + "loss": 0.0352, + "num_tokens": 71410967.0, + "step": 892 + }, + { + "epoch": 1.1135371179039302, + "grad_norm": 0.11947008075731919, + "learning_rate": 7.529718141611972e-06, + "loss": 0.0369, + "num_tokens": 71491316.0, + "step": 893 + }, + { + "epoch": 1.114784778540237, + "grad_norm": 0.11449370018994913, + "learning_rate": 7.5243080673584345e-06, + "loss": 0.0338, + "num_tokens": 71572312.0, + "step": 894 + }, + { + "epoch": 1.116032439176544, + "grad_norm": 0.10417902495607685, + "learning_rate": 7.51889432242251e-06, + "loss": 0.0351, + "num_tokens": 71652300.0, + "step": 895 + }, + { + "epoch": 1.117280099812851, + "grad_norm": 0.11908753377459247, + "learning_rate": 7.513476916620952e-06, + "loss": 0.0387, + "num_tokens": 71733471.0, + "step": 896 + }, + { + "epoch": 1.118527760449158, + "grad_norm": 0.11776124309241255, + "learning_rate": 7.508055859777157e-06, + "loss": 0.0347, + "num_tokens": 71812889.0, + "step": 897 + }, + { + "epoch": 1.1197754210854647, + "grad_norm": 0.11842774970819654, + "learning_rate": 7.502631161721139e-06, + "loss": 0.0361, + "num_tokens": 71892941.0, + "step": 898 + }, + { + "epoch": 1.1210230817217717, + "grad_norm": 0.11809125594779071, + "learning_rate": 7.497202832289514e-06, + "loss": 0.0376, + "num_tokens": 71972718.0, + "step": 899 + }, + { + "epoch": 1.1222707423580787, + "grad_norm": 0.11729161979135838, + "learning_rate": 7.4917708813254865e-06, + "loss": 0.0387, + "num_tokens": 72054222.0, + "step": 900 + }, + { + "epoch": 1.1235184029943854, + "grad_norm": 0.11425658550363159, + "learning_rate": 7.4863353186788234e-06, + "loss": 0.0363, + "num_tokens": 72133579.0, + "step": 901 + }, + { + "epoch": 1.1247660636306924, + "grad_norm": 0.11436527897915852, + "learning_rate": 7.480896154205844e-06, + "loss": 0.0362, + "num_tokens": 72213206.0, + "step": 902 + }, + { + "epoch": 1.1260137242669994, + "grad_norm": 0.11730584452702315, + "learning_rate": 7.475453397769396e-06, + "loss": 0.0391, + "num_tokens": 72292700.0, + "step": 903 + }, + { + "epoch": 1.1272613849033064, + "grad_norm": 0.11830533017335358, + "learning_rate": 7.470007059238842e-06, + "loss": 0.0351, + "num_tokens": 72371412.0, + "step": 904 + }, + { + "epoch": 1.1285090455396132, + "grad_norm": 0.11213629131896848, + "learning_rate": 7.464557148490041e-06, + "loss": 0.035, + "num_tokens": 72451362.0, + "step": 905 + }, + { + "epoch": 1.1297567061759202, + "grad_norm": 0.14744282107332352, + "learning_rate": 7.459103675405328e-06, + "loss": 0.0384, + "num_tokens": 72531571.0, + "step": 906 + }, + { + "epoch": 1.1310043668122272, + "grad_norm": 0.11322054203080899, + "learning_rate": 7.4536466498735e-06, + "loss": 0.0362, + "num_tokens": 72611893.0, + "step": 907 + }, + { + "epoch": 1.132252027448534, + "grad_norm": 0.10786998375465344, + "learning_rate": 7.44818608178979e-06, + "loss": 0.0368, + "num_tokens": 72691853.0, + "step": 908 + }, + { + "epoch": 1.133499688084841, + "grad_norm": 0.1213664887001441, + "learning_rate": 7.442721981055862e-06, + "loss": 0.04, + "num_tokens": 72773392.0, + "step": 909 + }, + { + "epoch": 1.134747348721148, + "grad_norm": 0.11598720811101557, + "learning_rate": 7.43725435757978e-06, + "loss": 0.0359, + "num_tokens": 72852913.0, + "step": 910 + }, + { + "epoch": 1.1359950093574547, + "grad_norm": 0.11009547084534818, + "learning_rate": 7.431783221275997e-06, + "loss": 0.0372, + "num_tokens": 72932495.0, + "step": 911 + }, + { + "epoch": 1.1372426699937617, + "grad_norm": 0.13290296128464088, + "learning_rate": 7.426308582065339e-06, + "loss": 0.0375, + "num_tokens": 73013678.0, + "step": 912 + }, + { + "epoch": 1.1384903306300687, + "grad_norm": 0.12616925947987406, + "learning_rate": 7.4208304498749825e-06, + "loss": 0.0379, + "num_tokens": 73095054.0, + "step": 913 + }, + { + "epoch": 1.1397379912663754, + "grad_norm": 0.11425464035364839, + "learning_rate": 7.415348834638433e-06, + "loss": 0.0372, + "num_tokens": 73175046.0, + "step": 914 + }, + { + "epoch": 1.1409856519026824, + "grad_norm": 0.1245543910672972, + "learning_rate": 7.40986374629552e-06, + "loss": 0.0369, + "num_tokens": 73254678.0, + "step": 915 + }, + { + "epoch": 1.1422333125389894, + "grad_norm": 0.1179981926555406, + "learning_rate": 7.404375194792365e-06, + "loss": 0.0374, + "num_tokens": 73334623.0, + "step": 916 + }, + { + "epoch": 1.1434809731752964, + "grad_norm": 0.11126346028333928, + "learning_rate": 7.398883190081368e-06, + "loss": 0.0353, + "num_tokens": 73414712.0, + "step": 917 + }, + { + "epoch": 1.1447286338116032, + "grad_norm": 0.11760773052330453, + "learning_rate": 7.3933877421211986e-06, + "loss": 0.0356, + "num_tokens": 73495679.0, + "step": 918 + }, + { + "epoch": 1.1459762944479102, + "grad_norm": 0.11803137365247664, + "learning_rate": 7.387888860876763e-06, + "loss": 0.0362, + "num_tokens": 73575931.0, + "step": 919 + }, + { + "epoch": 1.1472239550842172, + "grad_norm": 0.11691088438783126, + "learning_rate": 7.382386556319193e-06, + "loss": 0.0357, + "num_tokens": 73656607.0, + "step": 920 + }, + { + "epoch": 1.1484716157205241, + "grad_norm": 0.11592560296885844, + "learning_rate": 7.376880838425832e-06, + "loss": 0.0366, + "num_tokens": 73736234.0, + "step": 921 + }, + { + "epoch": 1.149719276356831, + "grad_norm": 0.1211126889771943, + "learning_rate": 7.3713717171802106e-06, + "loss": 0.0354, + "num_tokens": 73816380.0, + "step": 922 + }, + { + "epoch": 1.150966936993138, + "grad_norm": 0.11810769499767688, + "learning_rate": 7.3658592025720285e-06, + "loss": 0.04, + "num_tokens": 73897698.0, + "step": 923 + }, + { + "epoch": 1.152214597629445, + "grad_norm": 0.11484038275799247, + "learning_rate": 7.360343304597144e-06, + "loss": 0.0354, + "num_tokens": 73977453.0, + "step": 924 + }, + { + "epoch": 1.1534622582657517, + "grad_norm": 0.10850730121509818, + "learning_rate": 7.354824033257546e-06, + "loss": 0.0365, + "num_tokens": 74056422.0, + "step": 925 + }, + { + "epoch": 1.1547099189020587, + "grad_norm": 0.11974147584903187, + "learning_rate": 7.349301398561342e-06, + "loss": 0.0351, + "num_tokens": 74136845.0, + "step": 926 + }, + { + "epoch": 1.1559575795383656, + "grad_norm": 0.11829958356397421, + "learning_rate": 7.3437754105227365e-06, + "loss": 0.0358, + "num_tokens": 74217377.0, + "step": 927 + }, + { + "epoch": 1.1572052401746724, + "grad_norm": 0.1119802739133821, + "learning_rate": 7.3382460791620165e-06, + "loss": 0.0365, + "num_tokens": 74299632.0, + "step": 928 + }, + { + "epoch": 1.1584529008109794, + "grad_norm": 0.11642309337985378, + "learning_rate": 7.332713414505534e-06, + "loss": 0.0373, + "num_tokens": 74379725.0, + "step": 929 + }, + { + "epoch": 1.1597005614472864, + "grad_norm": 0.12355861507215243, + "learning_rate": 7.32717742658568e-06, + "loss": 0.0355, + "num_tokens": 74459397.0, + "step": 930 + }, + { + "epoch": 1.1609482220835932, + "grad_norm": 0.11263553582694089, + "learning_rate": 7.321638125440872e-06, + "loss": 0.0338, + "num_tokens": 74539162.0, + "step": 931 + }, + { + "epoch": 1.1621958827199002, + "grad_norm": 0.11483802072503761, + "learning_rate": 7.316095521115541e-06, + "loss": 0.0395, + "num_tokens": 74619166.0, + "step": 932 + }, + { + "epoch": 1.1634435433562071, + "grad_norm": 0.12246171184256209, + "learning_rate": 7.310549623660101e-06, + "loss": 0.0378, + "num_tokens": 74699097.0, + "step": 933 + }, + { + "epoch": 1.164691203992514, + "grad_norm": 0.10951266608078988, + "learning_rate": 7.305000443130943e-06, + "loss": 0.0359, + "num_tokens": 74778723.0, + "step": 934 + }, + { + "epoch": 1.165938864628821, + "grad_norm": 0.1296103420820968, + "learning_rate": 7.299447989590406e-06, + "loss": 0.0379, + "num_tokens": 74857957.0, + "step": 935 + }, + { + "epoch": 1.167186525265128, + "grad_norm": 0.1106739790929476, + "learning_rate": 7.293892273106768e-06, + "loss": 0.0339, + "num_tokens": 74937533.0, + "step": 936 + }, + { + "epoch": 1.1684341859014349, + "grad_norm": 0.1159745943163753, + "learning_rate": 7.2883333037542205e-06, + "loss": 0.0361, + "num_tokens": 75017116.0, + "step": 937 + }, + { + "epoch": 1.1696818465377417, + "grad_norm": 0.11565795695947093, + "learning_rate": 7.282771091612858e-06, + "loss": 0.037, + "num_tokens": 75097805.0, + "step": 938 + }, + { + "epoch": 1.1709295071740486, + "grad_norm": 0.12436659888574522, + "learning_rate": 7.27720564676865e-06, + "loss": 0.0358, + "num_tokens": 75177008.0, + "step": 939 + }, + { + "epoch": 1.1721771678103556, + "grad_norm": 0.11305973816370699, + "learning_rate": 7.271636979313432e-06, + "loss": 0.0338, + "num_tokens": 75256060.0, + "step": 940 + }, + { + "epoch": 1.1734248284466626, + "grad_norm": 0.11468359036363136, + "learning_rate": 7.266065099344881e-06, + "loss": 0.0371, + "num_tokens": 75336135.0, + "step": 941 + }, + { + "epoch": 1.1746724890829694, + "grad_norm": 0.11929128098454908, + "learning_rate": 7.260490016966497e-06, + "loss": 0.0373, + "num_tokens": 75416812.0, + "step": 942 + }, + { + "epoch": 1.1759201497192764, + "grad_norm": 0.130142686177145, + "learning_rate": 7.2549117422875925e-06, + "loss": 0.0396, + "num_tokens": 75496993.0, + "step": 943 + }, + { + "epoch": 1.1771678103555834, + "grad_norm": 0.12008704233696307, + "learning_rate": 7.249330285423265e-06, + "loss": 0.0389, + "num_tokens": 75577776.0, + "step": 944 + }, + { + "epoch": 1.1784154709918901, + "grad_norm": 0.1220975661519173, + "learning_rate": 7.243745656494382e-06, + "loss": 0.038, + "num_tokens": 75657576.0, + "step": 945 + }, + { + "epoch": 1.1796631316281971, + "grad_norm": 0.12744332708746905, + "learning_rate": 7.238157865627562e-06, + "loss": 0.0364, + "num_tokens": 75737747.0, + "step": 946 + }, + { + "epoch": 1.1809107922645041, + "grad_norm": 0.11326510598111456, + "learning_rate": 7.2325669229551636e-06, + "loss": 0.0364, + "num_tokens": 75819395.0, + "step": 947 + }, + { + "epoch": 1.182158452900811, + "grad_norm": 0.1213820127222462, + "learning_rate": 7.226972838615251e-06, + "loss": 0.0384, + "num_tokens": 75898696.0, + "step": 948 + }, + { + "epoch": 1.1834061135371179, + "grad_norm": 0.1226667585479789, + "learning_rate": 7.221375622751593e-06, + "loss": 0.0407, + "num_tokens": 75978883.0, + "step": 949 + }, + { + "epoch": 1.1846537741734249, + "grad_norm": 0.1193275551364265, + "learning_rate": 7.215775285513633e-06, + "loss": 0.037, + "num_tokens": 76058141.0, + "step": 950 + }, + { + "epoch": 1.1859014348097316, + "grad_norm": 0.12089332711649621, + "learning_rate": 7.210171837056474e-06, + "loss": 0.0373, + "num_tokens": 76138153.0, + "step": 951 + }, + { + "epoch": 1.1871490954460386, + "grad_norm": 0.12206397250048755, + "learning_rate": 7.2045652875408614e-06, + "loss": 0.0362, + "num_tokens": 76218972.0, + "step": 952 + }, + { + "epoch": 1.1883967560823456, + "grad_norm": 0.11543786673607577, + "learning_rate": 7.198955647133167e-06, + "loss": 0.0364, + "num_tokens": 76298129.0, + "step": 953 + }, + { + "epoch": 1.1896444167186526, + "grad_norm": 0.1170017252794963, + "learning_rate": 7.193342926005362e-06, + "loss": 0.0359, + "num_tokens": 76377939.0, + "step": 954 + }, + { + "epoch": 1.1908920773549594, + "grad_norm": 0.12112960322552498, + "learning_rate": 7.187727134335006e-06, + "loss": 0.0386, + "num_tokens": 76458143.0, + "step": 955 + }, + { + "epoch": 1.1921397379912664, + "grad_norm": 0.12276416511613138, + "learning_rate": 7.182108282305231e-06, + "loss": 0.0366, + "num_tokens": 76537173.0, + "step": 956 + }, + { + "epoch": 1.1933873986275734, + "grad_norm": 0.12474176247002251, + "learning_rate": 7.176486380104707e-06, + "loss": 0.0372, + "num_tokens": 76617763.0, + "step": 957 + }, + { + "epoch": 1.1946350592638801, + "grad_norm": 0.12126829791671767, + "learning_rate": 7.1708614379276485e-06, + "loss": 0.0374, + "num_tokens": 76698109.0, + "step": 958 + }, + { + "epoch": 1.1958827199001871, + "grad_norm": 0.1248231725076693, + "learning_rate": 7.165233465973771e-06, + "loss": 0.0375, + "num_tokens": 76777864.0, + "step": 959 + }, + { + "epoch": 1.1971303805364941, + "grad_norm": 0.12569048840462274, + "learning_rate": 7.159602474448292e-06, + "loss": 0.0369, + "num_tokens": 76857197.0, + "step": 960 + }, + { + "epoch": 1.1983780411728011, + "grad_norm": 0.12332798310694695, + "learning_rate": 7.1539684735618995e-06, + "loss": 0.0364, + "num_tokens": 76937469.0, + "step": 961 + }, + { + "epoch": 1.1996257018091079, + "grad_norm": 0.11706852204502789, + "learning_rate": 7.148331473530741e-06, + "loss": 0.0383, + "num_tokens": 77017833.0, + "step": 962 + }, + { + "epoch": 1.2008733624454149, + "grad_norm": 0.12424916506541904, + "learning_rate": 7.142691484576399e-06, + "loss": 0.0361, + "num_tokens": 77097219.0, + "step": 963 + }, + { + "epoch": 1.2021210230817219, + "grad_norm": 0.10771753138390724, + "learning_rate": 7.137048516925882e-06, + "loss": 0.0335, + "num_tokens": 77176147.0, + "step": 964 + }, + { + "epoch": 1.2033686837180286, + "grad_norm": 0.11534715229087208, + "learning_rate": 7.131402580811593e-06, + "loss": 0.0373, + "num_tokens": 77256126.0, + "step": 965 + }, + { + "epoch": 1.2046163443543356, + "grad_norm": 0.12314227105562962, + "learning_rate": 7.125753686471322e-06, + "loss": 0.0361, + "num_tokens": 77335910.0, + "step": 966 + }, + { + "epoch": 1.2058640049906426, + "grad_norm": 0.12340324578863827, + "learning_rate": 7.120101844148222e-06, + "loss": 0.0358, + "num_tokens": 77416109.0, + "step": 967 + }, + { + "epoch": 1.2071116656269494, + "grad_norm": 0.12119177535967239, + "learning_rate": 7.1144470640907906e-06, + "loss": 0.0418, + "num_tokens": 77497402.0, + "step": 968 + }, + { + "epoch": 1.2083593262632564, + "grad_norm": 0.12173802469100313, + "learning_rate": 7.1087893565528545e-06, + "loss": 0.0372, + "num_tokens": 77577501.0, + "step": 969 + }, + { + "epoch": 1.2096069868995634, + "grad_norm": 0.11142850106634652, + "learning_rate": 7.103128731793546e-06, + "loss": 0.0365, + "num_tokens": 77657098.0, + "step": 970 + }, + { + "epoch": 1.2108546475358701, + "grad_norm": 0.11827410294214857, + "learning_rate": 7.097465200077289e-06, + "loss": 0.0358, + "num_tokens": 77735468.0, + "step": 971 + }, + { + "epoch": 1.2121023081721771, + "grad_norm": 0.12499420528672263, + "learning_rate": 7.0917987716737795e-06, + "loss": 0.0384, + "num_tokens": 77815844.0, + "step": 972 + }, + { + "epoch": 1.2133499688084841, + "grad_norm": 0.12225834238347827, + "learning_rate": 7.086129456857963e-06, + "loss": 0.0358, + "num_tokens": 77895631.0, + "step": 973 + }, + { + "epoch": 1.214597629444791, + "grad_norm": 0.11545282158638288, + "learning_rate": 7.080457265910022e-06, + "loss": 0.0364, + "num_tokens": 77976656.0, + "step": 974 + }, + { + "epoch": 1.2158452900810979, + "grad_norm": 0.12462671166142329, + "learning_rate": 7.074782209115356e-06, + "loss": 0.036, + "num_tokens": 78058340.0, + "step": 975 + }, + { + "epoch": 1.2170929507174049, + "grad_norm": 0.11756188080542514, + "learning_rate": 7.069104296764553e-06, + "loss": 0.0382, + "num_tokens": 78139204.0, + "step": 976 + }, + { + "epoch": 1.2183406113537119, + "grad_norm": 0.12216323940445717, + "learning_rate": 7.0634235391533874e-06, + "loss": 0.0383, + "num_tokens": 78219057.0, + "step": 977 + }, + { + "epoch": 1.2195882719900186, + "grad_norm": 0.12667133266309244, + "learning_rate": 7.05773994658279e-06, + "loss": 0.0365, + "num_tokens": 78299547.0, + "step": 978 + }, + { + "epoch": 1.2208359326263256, + "grad_norm": 0.13620206727181836, + "learning_rate": 7.052053529358831e-06, + "loss": 0.0352, + "num_tokens": 78378421.0, + "step": 979 + }, + { + "epoch": 1.2220835932626326, + "grad_norm": 0.11745647885605227, + "learning_rate": 7.046364297792703e-06, + "loss": 0.0348, + "num_tokens": 78458843.0, + "step": 980 + }, + { + "epoch": 1.2233312538989396, + "grad_norm": 0.11499104455275264, + "learning_rate": 7.040672262200705e-06, + "loss": 0.0407, + "num_tokens": 78539916.0, + "step": 981 + }, + { + "epoch": 1.2245789145352464, + "grad_norm": 0.11717217325806314, + "learning_rate": 7.0349774329042135e-06, + "loss": 0.0355, + "num_tokens": 78619130.0, + "step": 982 + }, + { + "epoch": 1.2258265751715534, + "grad_norm": 0.12516375230993684, + "learning_rate": 7.02927982022968e-06, + "loss": 0.0377, + "num_tokens": 78699034.0, + "step": 983 + }, + { + "epoch": 1.2270742358078603, + "grad_norm": 0.15353076061722065, + "learning_rate": 7.023579434508596e-06, + "loss": 0.0345, + "num_tokens": 78777947.0, + "step": 984 + }, + { + "epoch": 1.2283218964441671, + "grad_norm": 0.1197038982406782, + "learning_rate": 7.017876286077484e-06, + "loss": 0.0557, + "num_tokens": 78859554.0, + "step": 985 + }, + { + "epoch": 1.229569557080474, + "grad_norm": 0.13053665449746876, + "learning_rate": 7.012170385277877e-06, + "loss": 0.0347, + "num_tokens": 78939749.0, + "step": 986 + }, + { + "epoch": 1.230817217716781, + "grad_norm": 0.11902958957924019, + "learning_rate": 7.006461742456297e-06, + "loss": 0.0356, + "num_tokens": 79019918.0, + "step": 987 + }, + { + "epoch": 1.2320648783530879, + "grad_norm": 0.11340880688683018, + "learning_rate": 7.000750367964239e-06, + "loss": 0.0379, + "num_tokens": 79099464.0, + "step": 988 + }, + { + "epoch": 1.2333125389893949, + "grad_norm": 0.11875742504896669, + "learning_rate": 6.99503627215815e-06, + "loss": 0.0349, + "num_tokens": 79178900.0, + "step": 989 + }, + { + "epoch": 1.2345601996257018, + "grad_norm": 0.11112107319089891, + "learning_rate": 6.989319465399415e-06, + "loss": 0.0368, + "num_tokens": 79258330.0, + "step": 990 + }, + { + "epoch": 1.2358078602620086, + "grad_norm": 0.1127792821863249, + "learning_rate": 6.983599958054331e-06, + "loss": 0.0377, + "num_tokens": 79337995.0, + "step": 991 + }, + { + "epoch": 1.2370555208983156, + "grad_norm": 0.115241478566488, + "learning_rate": 6.977877760494094e-06, + "loss": 0.0348, + "num_tokens": 79419296.0, + "step": 992 + }, + { + "epoch": 1.2383031815346226, + "grad_norm": 0.12502572470635195, + "learning_rate": 6.972152883094778e-06, + "loss": 0.0355, + "num_tokens": 79498279.0, + "step": 993 + }, + { + "epoch": 1.2395508421709296, + "grad_norm": 0.12440205175969768, + "learning_rate": 6.966425336237317e-06, + "loss": 0.037, + "num_tokens": 79578630.0, + "step": 994 + }, + { + "epoch": 1.2407985028072364, + "grad_norm": 0.1251089531609647, + "learning_rate": 6.960695130307484e-06, + "loss": 0.039, + "num_tokens": 79659951.0, + "step": 995 + }, + { + "epoch": 1.2420461634435433, + "grad_norm": 0.12293216602459224, + "learning_rate": 6.954962275695871e-06, + "loss": 0.0372, + "num_tokens": 79740063.0, + "step": 996 + }, + { + "epoch": 1.2432938240798503, + "grad_norm": 0.12394497640341638, + "learning_rate": 6.9492267827978824e-06, + "loss": 0.0374, + "num_tokens": 79821223.0, + "step": 997 + }, + { + "epoch": 1.244541484716157, + "grad_norm": 0.1129659823407307, + "learning_rate": 6.943488662013697e-06, + "loss": 0.0355, + "num_tokens": 79901255.0, + "step": 998 + }, + { + "epoch": 1.245789145352464, + "grad_norm": 0.1191576382470901, + "learning_rate": 6.93774792374826e-06, + "loss": 0.0366, + "num_tokens": 79981164.0, + "step": 999 + }, + { + "epoch": 1.247036805988771, + "grad_norm": 0.12363634596962561, + "learning_rate": 6.93200457841127e-06, + "loss": 0.0345, + "num_tokens": 80060941.0, + "step": 1000 + }, + { + "epoch": 1.248284466625078, + "grad_norm": 0.11752492427361626, + "learning_rate": 6.9262586364171455e-06, + "loss": 0.0355, + "num_tokens": 80140169.0, + "step": 1001 + }, + { + "epoch": 1.2495321272613849, + "grad_norm": 0.12154555071949472, + "learning_rate": 6.920510108185016e-06, + "loss": 0.0398, + "num_tokens": 80219606.0, + "step": 1002 + }, + { + "epoch": 1.2507797878976918, + "grad_norm": 0.13878040487943977, + "learning_rate": 6.9147590041387e-06, + "loss": 0.0393, + "num_tokens": 80300978.0, + "step": 1003 + }, + { + "epoch": 1.2520274485339988, + "grad_norm": 0.11693892617963454, + "learning_rate": 6.909005334706688e-06, + "loss": 0.0357, + "num_tokens": 80380717.0, + "step": 1004 + }, + { + "epoch": 1.2532751091703056, + "grad_norm": 0.11483400032407586, + "learning_rate": 6.903249110322123e-06, + "loss": 0.0393, + "num_tokens": 80461525.0, + "step": 1005 + }, + { + "epoch": 1.2545227698066126, + "grad_norm": 0.1176675268573737, + "learning_rate": 6.897490341422779e-06, + "loss": 0.0337, + "num_tokens": 80540527.0, + "step": 1006 + }, + { + "epoch": 1.2557704304429196, + "grad_norm": 0.12167676830713421, + "learning_rate": 6.8917290384510435e-06, + "loss": 0.0375, + "num_tokens": 80619663.0, + "step": 1007 + }, + { + "epoch": 1.2570180910792264, + "grad_norm": 0.12421371269284932, + "learning_rate": 6.885965211853902e-06, + "loss": 0.035, + "num_tokens": 80700557.0, + "step": 1008 + }, + { + "epoch": 1.2582657517155333, + "grad_norm": 0.11922957080231958, + "learning_rate": 6.8801988720829134e-06, + "loss": 0.0369, + "num_tokens": 80780369.0, + "step": 1009 + }, + { + "epoch": 1.2595134123518403, + "grad_norm": 0.12633099452240243, + "learning_rate": 6.874430029594194e-06, + "loss": 0.0393, + "num_tokens": 80859727.0, + "step": 1010 + }, + { + "epoch": 1.260761072988147, + "grad_norm": 0.10977750675097879, + "learning_rate": 6.8686586948483995e-06, + "loss": 0.0385, + "num_tokens": 80940815.0, + "step": 1011 + }, + { + "epoch": 1.262008733624454, + "grad_norm": 0.11227691696830043, + "learning_rate": 6.862884878310705e-06, + "loss": 0.0361, + "num_tokens": 81019729.0, + "step": 1012 + }, + { + "epoch": 1.263256394260761, + "grad_norm": 0.11693513455086187, + "learning_rate": 6.8571085904507825e-06, + "loss": 0.0344, + "num_tokens": 81099372.0, + "step": 1013 + }, + { + "epoch": 1.264504054897068, + "grad_norm": 0.11256114481937485, + "learning_rate": 6.8513298417427895e-06, + "loss": 0.0363, + "num_tokens": 81179368.0, + "step": 1014 + }, + { + "epoch": 1.2657517155333748, + "grad_norm": 0.11521215818800695, + "learning_rate": 6.845548642665347e-06, + "loss": 0.0342, + "num_tokens": 81257916.0, + "step": 1015 + }, + { + "epoch": 1.2669993761696818, + "grad_norm": 0.10695999901129719, + "learning_rate": 6.839765003701511e-06, + "loss": 0.037, + "num_tokens": 81337952.0, + "step": 1016 + }, + { + "epoch": 1.2682470368059888, + "grad_norm": 0.12118215671319715, + "learning_rate": 6.833978935338772e-06, + "loss": 0.0363, + "num_tokens": 81416824.0, + "step": 1017 + }, + { + "epoch": 1.2694946974422958, + "grad_norm": 0.10660530253287213, + "learning_rate": 6.828190448069016e-06, + "loss": 0.035, + "num_tokens": 81496879.0, + "step": 1018 + }, + { + "epoch": 1.2707423580786026, + "grad_norm": 0.11394529803643212, + "learning_rate": 6.822399552388523e-06, + "loss": 0.0363, + "num_tokens": 81576199.0, + "step": 1019 + }, + { + "epoch": 1.2719900187149096, + "grad_norm": 0.12045967866524018, + "learning_rate": 6.816606258797936e-06, + "loss": 0.0347, + "num_tokens": 81655945.0, + "step": 1020 + }, + { + "epoch": 1.2732376793512166, + "grad_norm": 0.12925247193109857, + "learning_rate": 6.810810577802249e-06, + "loss": 0.0403, + "num_tokens": 81736714.0, + "step": 1021 + }, + { + "epoch": 1.2744853399875233, + "grad_norm": 0.12389301275829777, + "learning_rate": 6.8050125199107835e-06, + "loss": 0.038, + "num_tokens": 81816119.0, + "step": 1022 + }, + { + "epoch": 1.2757330006238303, + "grad_norm": 0.1167932325177084, + "learning_rate": 6.799212095637169e-06, + "loss": 0.0363, + "num_tokens": 81896630.0, + "step": 1023 + }, + { + "epoch": 1.2769806612601373, + "grad_norm": 0.11848171062553153, + "learning_rate": 6.7934093154993285e-06, + "loss": 0.0366, + "num_tokens": 81977134.0, + "step": 1024 + }, + { + "epoch": 1.278228321896444, + "grad_norm": 0.12149988422451896, + "learning_rate": 6.787604190019456e-06, + "loss": 0.0349, + "num_tokens": 82057209.0, + "step": 1025 + }, + { + "epoch": 1.279475982532751, + "grad_norm": 0.12548818987766705, + "learning_rate": 6.781796729724001e-06, + "loss": 0.0352, + "num_tokens": 82136947.0, + "step": 1026 + }, + { + "epoch": 1.280723643169058, + "grad_norm": 0.13025681794055913, + "learning_rate": 6.775986945143641e-06, + "loss": 0.0366, + "num_tokens": 82217559.0, + "step": 1027 + }, + { + "epoch": 1.2819713038053648, + "grad_norm": 0.12011534736589388, + "learning_rate": 6.770174846813273e-06, + "loss": 0.0372, + "num_tokens": 82296722.0, + "step": 1028 + }, + { + "epoch": 1.2832189644416718, + "grad_norm": 0.11817718721722607, + "learning_rate": 6.7643604452719894e-06, + "loss": 0.0366, + "num_tokens": 82376898.0, + "step": 1029 + }, + { + "epoch": 1.2844666250779788, + "grad_norm": 0.11923904159473753, + "learning_rate": 6.758543751063055e-06, + "loss": 0.0349, + "num_tokens": 82457760.0, + "step": 1030 + }, + { + "epoch": 1.2857142857142856, + "grad_norm": 0.11244501964085687, + "learning_rate": 6.752724774733899e-06, + "loss": 0.0348, + "num_tokens": 82537611.0, + "step": 1031 + }, + { + "epoch": 1.2869619463505926, + "grad_norm": 0.115025011194697, + "learning_rate": 6.746903526836079e-06, + "loss": 0.0359, + "num_tokens": 82618396.0, + "step": 1032 + }, + { + "epoch": 1.2882096069868996, + "grad_norm": 0.12002180306264341, + "learning_rate": 6.741080017925279e-06, + "loss": 0.0363, + "num_tokens": 82698629.0, + "step": 1033 + }, + { + "epoch": 1.2894572676232066, + "grad_norm": 0.1226967210363595, + "learning_rate": 6.735254258561281e-06, + "loss": 0.0376, + "num_tokens": 82777184.0, + "step": 1034 + }, + { + "epoch": 1.2907049282595136, + "grad_norm": 0.10358231550003588, + "learning_rate": 6.729426259307948e-06, + "loss": 0.0333, + "num_tokens": 82856190.0, + "step": 1035 + }, + { + "epoch": 1.2919525888958203, + "grad_norm": 0.11578090273117891, + "learning_rate": 6.723596030733204e-06, + "loss": 0.0347, + "num_tokens": 82936794.0, + "step": 1036 + }, + { + "epoch": 1.2932002495321273, + "grad_norm": 0.10081923729427175, + "learning_rate": 6.717763583409016e-06, + "loss": 0.0346, + "num_tokens": 83016097.0, + "step": 1037 + }, + { + "epoch": 1.2944479101684343, + "grad_norm": 0.12233503099838965, + "learning_rate": 6.711928927911373e-06, + "loss": 0.0376, + "num_tokens": 83095632.0, + "step": 1038 + }, + { + "epoch": 1.295695570804741, + "grad_norm": 0.11462973160105773, + "learning_rate": 6.7060920748202674e-06, + "loss": 0.0369, + "num_tokens": 83177302.0, + "step": 1039 + }, + { + "epoch": 1.296943231441048, + "grad_norm": 0.11946539311970528, + "learning_rate": 6.700253034719684e-06, + "loss": 0.0386, + "num_tokens": 83258689.0, + "step": 1040 + }, + { + "epoch": 1.298190892077355, + "grad_norm": 0.12359336236001878, + "learning_rate": 6.694411818197561e-06, + "loss": 0.0351, + "num_tokens": 83338185.0, + "step": 1041 + }, + { + "epoch": 1.2994385527136618, + "grad_norm": 0.1139374127755781, + "learning_rate": 6.688568435845792e-06, + "loss": 0.0347, + "num_tokens": 83417497.0, + "step": 1042 + }, + { + "epoch": 1.3006862133499688, + "grad_norm": 0.11205952042329616, + "learning_rate": 6.682722898260195e-06, + "loss": 0.0378, + "num_tokens": 83498065.0, + "step": 1043 + }, + { + "epoch": 1.3019338739862758, + "grad_norm": 0.1083104938066509, + "learning_rate": 6.676875216040498e-06, + "loss": 0.0339, + "num_tokens": 83577372.0, + "step": 1044 + }, + { + "epoch": 1.3031815346225826, + "grad_norm": 0.10828476825816279, + "learning_rate": 6.671025399790315e-06, + "loss": 0.0385, + "num_tokens": 83657938.0, + "step": 1045 + }, + { + "epoch": 1.3044291952588896, + "grad_norm": 0.11019045987410468, + "learning_rate": 6.66517346011713e-06, + "loss": 0.0365, + "num_tokens": 83738524.0, + "step": 1046 + }, + { + "epoch": 1.3056768558951966, + "grad_norm": 0.12276236466671721, + "learning_rate": 6.659319407632282e-06, + "loss": 0.0398, + "num_tokens": 83818548.0, + "step": 1047 + }, + { + "epoch": 1.3069245165315033, + "grad_norm": 0.1263772382661588, + "learning_rate": 6.653463252950933e-06, + "loss": 0.0378, + "num_tokens": 83898937.0, + "step": 1048 + }, + { + "epoch": 1.3081721771678103, + "grad_norm": 0.12027367867659687, + "learning_rate": 6.647605006692066e-06, + "loss": 0.037, + "num_tokens": 83979503.0, + "step": 1049 + }, + { + "epoch": 1.3094198378041173, + "grad_norm": 0.11001899385006926, + "learning_rate": 6.641744679478448e-06, + "loss": 0.0352, + "num_tokens": 84058957.0, + "step": 1050 + }, + { + "epoch": 1.310667498440424, + "grad_norm": 0.11280548803095132, + "learning_rate": 6.635882281936625e-06, + "loss": 0.0354, + "num_tokens": 84138073.0, + "step": 1051 + }, + { + "epoch": 1.311915159076731, + "grad_norm": 0.11780560554733235, + "learning_rate": 6.630017824696898e-06, + "loss": 0.0347, + "num_tokens": 84218047.0, + "step": 1052 + }, + { + "epoch": 1.313162819713038, + "grad_norm": 0.12058333417344305, + "learning_rate": 6.624151318393298e-06, + "loss": 0.0373, + "num_tokens": 84298783.0, + "step": 1053 + }, + { + "epoch": 1.314410480349345, + "grad_norm": 0.11911044876914684, + "learning_rate": 6.618282773663576e-06, + "loss": 0.0356, + "num_tokens": 84378667.0, + "step": 1054 + }, + { + "epoch": 1.315658140985652, + "grad_norm": 0.11647453576092717, + "learning_rate": 6.612412201149175e-06, + "loss": 0.037, + "num_tokens": 84459833.0, + "step": 1055 + }, + { + "epoch": 1.3169058016219588, + "grad_norm": 0.1381556034687667, + "learning_rate": 6.6065396114952195e-06, + "loss": 0.0345, + "num_tokens": 84538415.0, + "step": 1056 + }, + { + "epoch": 1.3181534622582658, + "grad_norm": 0.11291795464976989, + "learning_rate": 6.600665015350487e-06, + "loss": 0.0358, + "num_tokens": 84618356.0, + "step": 1057 + }, + { + "epoch": 1.3194011228945728, + "grad_norm": 0.11428531136103644, + "learning_rate": 6.594788423367399e-06, + "loss": 0.0379, + "num_tokens": 84699392.0, + "step": 1058 + }, + { + "epoch": 1.3206487835308796, + "grad_norm": 0.12559230820363362, + "learning_rate": 6.588909846201992e-06, + "loss": 0.0357, + "num_tokens": 84780174.0, + "step": 1059 + }, + { + "epoch": 1.3218964441671865, + "grad_norm": 0.11157729667744716, + "learning_rate": 6.583029294513902e-06, + "loss": 0.0374, + "num_tokens": 84861023.0, + "step": 1060 + }, + { + "epoch": 1.3231441048034935, + "grad_norm": 0.10532360001946126, + "learning_rate": 6.577146778966347e-06, + "loss": 0.0347, + "num_tokens": 84941536.0, + "step": 1061 + }, + { + "epoch": 1.3243917654398003, + "grad_norm": 0.11380661535983909, + "learning_rate": 6.571262310226108e-06, + "loss": 0.0361, + "num_tokens": 85021693.0, + "step": 1062 + }, + { + "epoch": 1.3256394260761073, + "grad_norm": 0.11751173019320106, + "learning_rate": 6.565375898963503e-06, + "loss": 0.0367, + "num_tokens": 85101820.0, + "step": 1063 + }, + { + "epoch": 1.3268870867124143, + "grad_norm": 0.11556897548657508, + "learning_rate": 6.5594875558523755e-06, + "loss": 0.0366, + "num_tokens": 85182245.0, + "step": 1064 + }, + { + "epoch": 1.328134747348721, + "grad_norm": 0.12290166180442655, + "learning_rate": 6.553597291570071e-06, + "loss": 0.034, + "num_tokens": 85261741.0, + "step": 1065 + }, + { + "epoch": 1.329382407985028, + "grad_norm": 0.11300496986222, + "learning_rate": 6.547705116797422e-06, + "loss": 0.0385, + "num_tokens": 85341243.0, + "step": 1066 + }, + { + "epoch": 1.330630068621335, + "grad_norm": 0.12054222224762111, + "learning_rate": 6.5418110422187156e-06, + "loss": 0.037, + "num_tokens": 85421844.0, + "step": 1067 + }, + { + "epoch": 1.3318777292576418, + "grad_norm": 0.11600190728796984, + "learning_rate": 6.535915078521697e-06, + "loss": 0.0364, + "num_tokens": 85500760.0, + "step": 1068 + }, + { + "epoch": 1.3331253898939488, + "grad_norm": 0.12283600030048854, + "learning_rate": 6.530017236397529e-06, + "loss": 0.0365, + "num_tokens": 85580795.0, + "step": 1069 + }, + { + "epoch": 1.3343730505302558, + "grad_norm": 0.11450545067136071, + "learning_rate": 6.52411752654078e-06, + "loss": 0.0347, + "num_tokens": 85661620.0, + "step": 1070 + }, + { + "epoch": 1.3356207111665626, + "grad_norm": 0.1114808904800754, + "learning_rate": 6.518215959649409e-06, + "loss": 0.0363, + "num_tokens": 85742723.0, + "step": 1071 + }, + { + "epoch": 1.3368683718028695, + "grad_norm": 0.12107476538251831, + "learning_rate": 6.512312546424739e-06, + "loss": 0.0332, + "num_tokens": 85821771.0, + "step": 1072 + }, + { + "epoch": 1.3381160324391765, + "grad_norm": 0.11709745580529317, + "learning_rate": 6.506407297571445e-06, + "loss": 0.0385, + "num_tokens": 85901823.0, + "step": 1073 + }, + { + "epoch": 1.3393636930754835, + "grad_norm": 0.12275293011077212, + "learning_rate": 6.500500223797526e-06, + "loss": 0.0357, + "num_tokens": 85981915.0, + "step": 1074 + }, + { + "epoch": 1.3406113537117905, + "grad_norm": 0.11678308605913426, + "learning_rate": 6.494591335814292e-06, + "loss": 0.0329, + "num_tokens": 86061069.0, + "step": 1075 + }, + { + "epoch": 1.3418590143480973, + "grad_norm": 0.11176605025879648, + "learning_rate": 6.488680644336344e-06, + "loss": 0.0379, + "num_tokens": 86141683.0, + "step": 1076 + }, + { + "epoch": 1.3431066749844043, + "grad_norm": 0.11363151432726346, + "learning_rate": 6.482768160081553e-06, + "loss": 0.0348, + "num_tokens": 86222086.0, + "step": 1077 + }, + { + "epoch": 1.3443543356207113, + "grad_norm": 0.11445806649743549, + "learning_rate": 6.4768538937710364e-06, + "loss": 0.0382, + "num_tokens": 86302703.0, + "step": 1078 + }, + { + "epoch": 1.345601996257018, + "grad_norm": 0.13464785228692652, + "learning_rate": 6.470937856129152e-06, + "loss": 0.0347, + "num_tokens": 86382450.0, + "step": 1079 + }, + { + "epoch": 1.346849656893325, + "grad_norm": 0.1280907328510425, + "learning_rate": 6.465020057883461e-06, + "loss": 0.0337, + "num_tokens": 86462384.0, + "step": 1080 + }, + { + "epoch": 1.348097317529632, + "grad_norm": 0.10759760692353552, + "learning_rate": 6.45910050976472e-06, + "loss": 0.0343, + "num_tokens": 86541989.0, + "step": 1081 + }, + { + "epoch": 1.3493449781659388, + "grad_norm": 0.11223790651092914, + "learning_rate": 6.45317922250686e-06, + "loss": 0.0383, + "num_tokens": 86623690.0, + "step": 1082 + }, + { + "epoch": 1.3505926388022458, + "grad_norm": 0.11168721545726104, + "learning_rate": 6.447256206846963e-06, + "loss": 0.0356, + "num_tokens": 86703459.0, + "step": 1083 + }, + { + "epoch": 1.3518402994385528, + "grad_norm": 0.11578985731239276, + "learning_rate": 6.44133147352525e-06, + "loss": 0.0351, + "num_tokens": 86787758.0, + "step": 1084 + }, + { + "epoch": 1.3530879600748595, + "grad_norm": 0.11806982316043722, + "learning_rate": 6.4354050332850505e-06, + "loss": 0.0351, + "num_tokens": 86868231.0, + "step": 1085 + }, + { + "epoch": 1.3543356207111665, + "grad_norm": 0.13776869218741045, + "learning_rate": 6.429476896872793e-06, + "loss": 0.0355, + "num_tokens": 86948392.0, + "step": 1086 + }, + { + "epoch": 1.3555832813474735, + "grad_norm": 0.1143170779368215, + "learning_rate": 6.4235470750379794e-06, + "loss": 0.0352, + "num_tokens": 87028370.0, + "step": 1087 + }, + { + "epoch": 1.3568309419837803, + "grad_norm": 0.10825819045832565, + "learning_rate": 6.4176155785331705e-06, + "loss": 0.0363, + "num_tokens": 87109397.0, + "step": 1088 + }, + { + "epoch": 1.3580786026200873, + "grad_norm": 0.12553276062809313, + "learning_rate": 6.411682418113961e-06, + "loss": 0.0401, + "num_tokens": 87191884.0, + "step": 1089 + }, + { + "epoch": 1.3593262632563943, + "grad_norm": 0.1190623682061882, + "learning_rate": 6.405747604538965e-06, + "loss": 0.036, + "num_tokens": 87272846.0, + "step": 1090 + }, + { + "epoch": 1.3605739238927013, + "grad_norm": 0.13120836782794848, + "learning_rate": 6.399811148569794e-06, + "loss": 0.0387, + "num_tokens": 87354474.0, + "step": 1091 + }, + { + "epoch": 1.361821584529008, + "grad_norm": 0.12749448720764697, + "learning_rate": 6.393873060971036e-06, + "loss": 0.0354, + "num_tokens": 87434034.0, + "step": 1092 + }, + { + "epoch": 1.363069245165315, + "grad_norm": 0.11265346187198873, + "learning_rate": 6.3879333525102375e-06, + "loss": 0.0383, + "num_tokens": 87514380.0, + "step": 1093 + }, + { + "epoch": 1.364316905801622, + "grad_norm": 0.10396098070933056, + "learning_rate": 6.381992033957889e-06, + "loss": 0.0374, + "num_tokens": 87594266.0, + "step": 1094 + }, + { + "epoch": 1.365564566437929, + "grad_norm": 0.12007723700394293, + "learning_rate": 6.376049116087393e-06, + "loss": 0.0358, + "num_tokens": 87675016.0, + "step": 1095 + }, + { + "epoch": 1.3668122270742358, + "grad_norm": 0.11088738920455023, + "learning_rate": 6.370104609675058e-06, + "loss": 0.0365, + "num_tokens": 87755275.0, + "step": 1096 + }, + { + "epoch": 1.3680598877105428, + "grad_norm": 0.12947725342366095, + "learning_rate": 6.364158525500069e-06, + "loss": 0.0386, + "num_tokens": 87835968.0, + "step": 1097 + }, + { + "epoch": 1.3693075483468498, + "grad_norm": 0.10743601396658202, + "learning_rate": 6.358210874344476e-06, + "loss": 0.0359, + "num_tokens": 87916756.0, + "step": 1098 + }, + { + "epoch": 1.3705552089831565, + "grad_norm": 0.10538537489377399, + "learning_rate": 6.352261666993167e-06, + "loss": 0.0344, + "num_tokens": 87997097.0, + "step": 1099 + }, + { + "epoch": 1.3718028696194635, + "grad_norm": 0.10920904020216766, + "learning_rate": 6.346310914233854e-06, + "loss": 0.0337, + "num_tokens": 88075564.0, + "step": 1100 + }, + { + "epoch": 1.3730505302557705, + "grad_norm": 0.12064678113049096, + "learning_rate": 6.340358626857049e-06, + "loss": 0.0374, + "num_tokens": 88155637.0, + "step": 1101 + }, + { + "epoch": 1.3742981908920773, + "grad_norm": 0.11366242121995487, + "learning_rate": 6.334404815656049e-06, + "loss": 0.034, + "num_tokens": 88234184.0, + "step": 1102 + }, + { + "epoch": 1.3755458515283843, + "grad_norm": 0.10541533458396993, + "learning_rate": 6.328449491426914e-06, + "loss": 0.0334, + "num_tokens": 88313988.0, + "step": 1103 + }, + { + "epoch": 1.3767935121646913, + "grad_norm": 0.1126212487247659, + "learning_rate": 6.322492664968446e-06, + "loss": 0.0361, + "num_tokens": 88394035.0, + "step": 1104 + }, + { + "epoch": 1.378041172800998, + "grad_norm": 0.10415858729571198, + "learning_rate": 6.316534347082173e-06, + "loss": 0.0353, + "num_tokens": 88473457.0, + "step": 1105 + }, + { + "epoch": 1.379288833437305, + "grad_norm": 0.11311924737695511, + "learning_rate": 6.310574548572325e-06, + "loss": 0.0396, + "num_tokens": 88554028.0, + "step": 1106 + }, + { + "epoch": 1.380536494073612, + "grad_norm": 0.11641164289365231, + "learning_rate": 6.304613280245816e-06, + "loss": 0.0351, + "num_tokens": 88633282.0, + "step": 1107 + }, + { + "epoch": 1.3817841547099188, + "grad_norm": 0.1238656036368708, + "learning_rate": 6.298650552912233e-06, + "loss": 0.0369, + "num_tokens": 88713446.0, + "step": 1108 + }, + { + "epoch": 1.3830318153462258, + "grad_norm": 0.10828413683130844, + "learning_rate": 6.292686377383797e-06, + "loss": 0.0366, + "num_tokens": 88793591.0, + "step": 1109 + }, + { + "epoch": 1.3842794759825328, + "grad_norm": 0.11180313140533374, + "learning_rate": 6.286720764475365e-06, + "loss": 0.0355, + "num_tokens": 88872762.0, + "step": 1110 + }, + { + "epoch": 1.3855271366188397, + "grad_norm": 0.11370487600194723, + "learning_rate": 6.280753725004395e-06, + "loss": 0.0362, + "num_tokens": 88955457.0, + "step": 1111 + }, + { + "epoch": 1.3867747972551465, + "grad_norm": 0.09617328020740629, + "learning_rate": 6.274785269790932e-06, + "loss": 0.0336, + "num_tokens": 89035406.0, + "step": 1112 + }, + { + "epoch": 1.3880224578914535, + "grad_norm": 0.11176348044222259, + "learning_rate": 6.268815409657592e-06, + "loss": 0.0342, + "num_tokens": 89116507.0, + "step": 1113 + }, + { + "epoch": 1.3892701185277605, + "grad_norm": 0.11325296873554795, + "learning_rate": 6.262844155429533e-06, + "loss": 0.0374, + "num_tokens": 89199614.0, + "step": 1114 + }, + { + "epoch": 1.3905177791640675, + "grad_norm": 0.11704030265578563, + "learning_rate": 6.256871517934445e-06, + "loss": 0.0357, + "num_tokens": 89279144.0, + "step": 1115 + }, + { + "epoch": 1.3917654398003743, + "grad_norm": 0.11952799829012306, + "learning_rate": 6.2508975080025254e-06, + "loss": 0.0353, + "num_tokens": 89359708.0, + "step": 1116 + }, + { + "epoch": 1.3930131004366813, + "grad_norm": 0.11502731870682785, + "learning_rate": 6.24492213646646e-06, + "loss": 0.0368, + "num_tokens": 89439326.0, + "step": 1117 + }, + { + "epoch": 1.3942607610729882, + "grad_norm": 0.12252081179674802, + "learning_rate": 6.2389454141614024e-06, + "loss": 0.0345, + "num_tokens": 89518867.0, + "step": 1118 + }, + { + "epoch": 1.395508421709295, + "grad_norm": 0.18070152071501802, + "learning_rate": 6.232967351924959e-06, + "loss": 0.0355, + "num_tokens": 89598243.0, + "step": 1119 + }, + { + "epoch": 1.396756082345602, + "grad_norm": 0.1228366552270244, + "learning_rate": 6.226987960597161e-06, + "loss": 0.0363, + "num_tokens": 89678232.0, + "step": 1120 + }, + { + "epoch": 1.398003742981909, + "grad_norm": 0.13722290208459134, + "learning_rate": 6.22100725102045e-06, + "loss": 0.0347, + "num_tokens": 89758532.0, + "step": 1121 + }, + { + "epoch": 1.3992514036182158, + "grad_norm": 0.11700538827862798, + "learning_rate": 6.215025234039667e-06, + "loss": 0.0371, + "num_tokens": 89838761.0, + "step": 1122 + }, + { + "epoch": 1.4004990642545228, + "grad_norm": 0.11749579319296499, + "learning_rate": 6.209041920502012e-06, + "loss": 0.0362, + "num_tokens": 89919068.0, + "step": 1123 + }, + { + "epoch": 1.4017467248908297, + "grad_norm": 0.1239771498120876, + "learning_rate": 6.203057321257041e-06, + "loss": 0.0347, + "num_tokens": 89999454.0, + "step": 1124 + }, + { + "epoch": 1.4029943855271365, + "grad_norm": 0.10457854348428894, + "learning_rate": 6.197071447156643e-06, + "loss": 0.0369, + "num_tokens": 90079489.0, + "step": 1125 + }, + { + "epoch": 1.4042420461634435, + "grad_norm": 0.12132963451268676, + "learning_rate": 6.191084309055018e-06, + "loss": 0.0359, + "num_tokens": 90160079.0, + "step": 1126 + }, + { + "epoch": 1.4054897067997505, + "grad_norm": 0.1149651839306163, + "learning_rate": 6.185095917808654e-06, + "loss": 0.0367, + "num_tokens": 90239067.0, + "step": 1127 + }, + { + "epoch": 1.4067373674360573, + "grad_norm": 0.11185660034071362, + "learning_rate": 6.179106284276315e-06, + "loss": 0.0345, + "num_tokens": 90320588.0, + "step": 1128 + }, + { + "epoch": 1.4079850280723643, + "grad_norm": 0.11791295713574668, + "learning_rate": 6.173115419319019e-06, + "loss": 0.0357, + "num_tokens": 90400737.0, + "step": 1129 + }, + { + "epoch": 1.4092326887086712, + "grad_norm": 0.12005035519918307, + "learning_rate": 6.167123333800014e-06, + "loss": 0.0372, + "num_tokens": 90481553.0, + "step": 1130 + }, + { + "epoch": 1.4104803493449782, + "grad_norm": 0.11280371649250795, + "learning_rate": 6.161130038584762e-06, + "loss": 0.0359, + "num_tokens": 90560492.0, + "step": 1131 + }, + { + "epoch": 1.4117280099812852, + "grad_norm": 0.11483105002319907, + "learning_rate": 6.155135544540917e-06, + "loss": 0.0329, + "num_tokens": 90640526.0, + "step": 1132 + }, + { + "epoch": 1.412975670617592, + "grad_norm": 0.10610511431168547, + "learning_rate": 6.1491398625383116e-06, + "loss": 0.0365, + "num_tokens": 90720931.0, + "step": 1133 + }, + { + "epoch": 1.414223331253899, + "grad_norm": 0.11403045781580023, + "learning_rate": 6.143143003448929e-06, + "loss": 0.0334, + "num_tokens": 90799876.0, + "step": 1134 + }, + { + "epoch": 1.415470991890206, + "grad_norm": 0.1128226484451764, + "learning_rate": 6.1371449781468835e-06, + "loss": 0.036, + "num_tokens": 90879955.0, + "step": 1135 + }, + { + "epoch": 1.4167186525265127, + "grad_norm": 0.1339643064995737, + "learning_rate": 6.131145797508414e-06, + "loss": 0.0362, + "num_tokens": 90960140.0, + "step": 1136 + }, + { + "epoch": 1.4179663131628197, + "grad_norm": 0.11977396725939997, + "learning_rate": 6.125145472411845e-06, + "loss": 0.0408, + "num_tokens": 91040880.0, + "step": 1137 + }, + { + "epoch": 1.4192139737991267, + "grad_norm": 0.1304144374986512, + "learning_rate": 6.1191440137375775e-06, + "loss": 0.0356, + "num_tokens": 91120578.0, + "step": 1138 + }, + { + "epoch": 1.4204616344354335, + "grad_norm": 0.10952816992518201, + "learning_rate": 6.113141432368075e-06, + "loss": 0.0342, + "num_tokens": 91199968.0, + "step": 1139 + }, + { + "epoch": 1.4217092950717405, + "grad_norm": 0.11923470924014716, + "learning_rate": 6.107137739187827e-06, + "loss": 0.0382, + "num_tokens": 91280534.0, + "step": 1140 + }, + { + "epoch": 1.4229569557080475, + "grad_norm": 0.13414248321277958, + "learning_rate": 6.101132945083347e-06, + "loss": 0.0353, + "num_tokens": 91359718.0, + "step": 1141 + }, + { + "epoch": 1.4242046163443542, + "grad_norm": 0.11441510307386335, + "learning_rate": 6.095127060943141e-06, + "loss": 0.0355, + "num_tokens": 91439021.0, + "step": 1142 + }, + { + "epoch": 1.4254522769806612, + "grad_norm": 0.11361047483263015, + "learning_rate": 6.089120097657692e-06, + "loss": 0.0399, + "num_tokens": 91520278.0, + "step": 1143 + }, + { + "epoch": 1.4266999376169682, + "grad_norm": 0.11528784116675486, + "learning_rate": 6.083112066119439e-06, + "loss": 0.0379, + "num_tokens": 91600857.0, + "step": 1144 + }, + { + "epoch": 1.427947598253275, + "grad_norm": 0.12339012360670368, + "learning_rate": 6.077102977222763e-06, + "loss": 0.0363, + "num_tokens": 91681068.0, + "step": 1145 + }, + { + "epoch": 1.429195258889582, + "grad_norm": 0.11822266945602426, + "learning_rate": 6.0710928418639515e-06, + "loss": 0.0369, + "num_tokens": 91762429.0, + "step": 1146 + }, + { + "epoch": 1.430442919525889, + "grad_norm": 0.10934047203309372, + "learning_rate": 6.065081670941204e-06, + "loss": 0.0343, + "num_tokens": 91842442.0, + "step": 1147 + }, + { + "epoch": 1.4316905801621957, + "grad_norm": 0.11563946305646088, + "learning_rate": 6.059069475354586e-06, + "loss": 0.0371, + "num_tokens": 91921815.0, + "step": 1148 + }, + { + "epoch": 1.4329382407985027, + "grad_norm": 0.13902235644402097, + "learning_rate": 6.0530562660060276e-06, + "loss": 0.035, + "num_tokens": 92001388.0, + "step": 1149 + }, + { + "epoch": 1.4341859014348097, + "grad_norm": 0.11365381032155394, + "learning_rate": 6.0470420537992915e-06, + "loss": 0.0361, + "num_tokens": 92080682.0, + "step": 1150 + }, + { + "epoch": 1.4354335620711167, + "grad_norm": 0.1215204822957711, + "learning_rate": 6.041026849639966e-06, + "loss": 0.0367, + "num_tokens": 92160919.0, + "step": 1151 + }, + { + "epoch": 1.4366812227074237, + "grad_norm": 0.10228268537749387, + "learning_rate": 6.035010664435434e-06, + "loss": 0.0361, + "num_tokens": 92241085.0, + "step": 1152 + }, + { + "epoch": 1.4379288833437305, + "grad_norm": 0.11290899169878665, + "learning_rate": 6.0289935090948536e-06, + "loss": 0.0339, + "num_tokens": 92320066.0, + "step": 1153 + }, + { + "epoch": 1.4391765439800375, + "grad_norm": 0.12929287703536887, + "learning_rate": 6.022975394529149e-06, + "loss": 0.0344, + "num_tokens": 92399898.0, + "step": 1154 + }, + { + "epoch": 1.4404242046163445, + "grad_norm": 0.10350159451251419, + "learning_rate": 6.016956331650984e-06, + "loss": 0.0338, + "num_tokens": 92479871.0, + "step": 1155 + }, + { + "epoch": 1.4416718652526512, + "grad_norm": 0.11525710135617265, + "learning_rate": 6.010936331374735e-06, + "loss": 0.0359, + "num_tokens": 92560206.0, + "step": 1156 + }, + { + "epoch": 1.4429195258889582, + "grad_norm": 0.11743945426514996, + "learning_rate": 6.00491540461648e-06, + "loss": 0.034, + "num_tokens": 92639628.0, + "step": 1157 + }, + { + "epoch": 1.4441671865252652, + "grad_norm": 0.10670559077717189, + "learning_rate": 5.998893562293986e-06, + "loss": 0.0377, + "num_tokens": 92719681.0, + "step": 1158 + }, + { + "epoch": 1.445414847161572, + "grad_norm": 0.11601172863515272, + "learning_rate": 5.992870815326667e-06, + "loss": 0.0366, + "num_tokens": 92799584.0, + "step": 1159 + }, + { + "epoch": 1.446662507797879, + "grad_norm": 0.11460386722320819, + "learning_rate": 5.986847174635586e-06, + "loss": 0.0332, + "num_tokens": 92879565.0, + "step": 1160 + }, + { + "epoch": 1.447910168434186, + "grad_norm": 0.10697055130942412, + "learning_rate": 5.980822651143426e-06, + "loss": 0.0365, + "num_tokens": 92959785.0, + "step": 1161 + }, + { + "epoch": 1.4491578290704927, + "grad_norm": 0.12723407944880333, + "learning_rate": 5.9747972557744675e-06, + "loss": 0.0382, + "num_tokens": 93040108.0, + "step": 1162 + }, + { + "epoch": 1.4504054897067997, + "grad_norm": 0.11841204743560867, + "learning_rate": 5.968770999454572e-06, + "loss": 0.036, + "num_tokens": 93121058.0, + "step": 1163 + }, + { + "epoch": 1.4516531503431067, + "grad_norm": 0.11569156196248792, + "learning_rate": 5.962743893111165e-06, + "loss": 0.0353, + "num_tokens": 93200814.0, + "step": 1164 + }, + { + "epoch": 1.4529008109794135, + "grad_norm": 0.10777418932429268, + "learning_rate": 5.956715947673212e-06, + "loss": 0.0348, + "num_tokens": 93281213.0, + "step": 1165 + }, + { + "epoch": 1.4541484716157205, + "grad_norm": 0.10994616619439264, + "learning_rate": 5.950687174071201e-06, + "loss": 0.0356, + "num_tokens": 93360403.0, + "step": 1166 + }, + { + "epoch": 1.4553961322520275, + "grad_norm": 0.10558957609563456, + "learning_rate": 5.944657583237119e-06, + "loss": 0.0359, + "num_tokens": 93440112.0, + "step": 1167 + }, + { + "epoch": 1.4566437928883345, + "grad_norm": 0.10340454479634237, + "learning_rate": 5.938627186104438e-06, + "loss": 0.0341, + "num_tokens": 93519997.0, + "step": 1168 + }, + { + "epoch": 1.4578914535246412, + "grad_norm": 0.11216617986802824, + "learning_rate": 5.932595993608092e-06, + "loss": 0.0367, + "num_tokens": 93601531.0, + "step": 1169 + }, + { + "epoch": 1.4591391141609482, + "grad_norm": 0.11118613571047507, + "learning_rate": 5.926564016684453e-06, + "loss": 0.0369, + "num_tokens": 93684506.0, + "step": 1170 + }, + { + "epoch": 1.4603867747972552, + "grad_norm": 0.11180759900830808, + "learning_rate": 5.920531266271317e-06, + "loss": 0.0345, + "num_tokens": 93765144.0, + "step": 1171 + }, + { + "epoch": 1.4616344354335622, + "grad_norm": 0.10849133876831726, + "learning_rate": 5.9144977533078885e-06, + "loss": 0.0338, + "num_tokens": 93844866.0, + "step": 1172 + }, + { + "epoch": 1.462882096069869, + "grad_norm": 0.10146908320691236, + "learning_rate": 5.90846348873475e-06, + "loss": 0.0376, + "num_tokens": 93925678.0, + "step": 1173 + }, + { + "epoch": 1.464129756706176, + "grad_norm": 0.11406035504111364, + "learning_rate": 5.902428483493845e-06, + "loss": 0.0354, + "num_tokens": 94005650.0, + "step": 1174 + }, + { + "epoch": 1.465377417342483, + "grad_norm": 0.10629587740727439, + "learning_rate": 5.89639274852846e-06, + "loss": 0.0338, + "num_tokens": 94085102.0, + "step": 1175 + }, + { + "epoch": 1.4666250779787897, + "grad_norm": 0.10967958676982019, + "learning_rate": 5.890356294783213e-06, + "loss": 0.0346, + "num_tokens": 94163932.0, + "step": 1176 + }, + { + "epoch": 1.4678727386150967, + "grad_norm": 0.1110479738340064, + "learning_rate": 5.8843191332040125e-06, + "loss": 0.0366, + "num_tokens": 94244813.0, + "step": 1177 + }, + { + "epoch": 1.4691203992514037, + "grad_norm": 0.11487895996237814, + "learning_rate": 5.878281274738061e-06, + "loss": 0.044, + "num_tokens": 94326986.0, + "step": 1178 + }, + { + "epoch": 1.4703680598877105, + "grad_norm": 0.12426013311708847, + "learning_rate": 5.872242730333822e-06, + "loss": 0.0373, + "num_tokens": 94407345.0, + "step": 1179 + }, + { + "epoch": 1.4716157205240175, + "grad_norm": 0.10392684893138183, + "learning_rate": 5.866203510940998e-06, + "loss": 0.0341, + "num_tokens": 94486495.0, + "step": 1180 + }, + { + "epoch": 1.4728633811603244, + "grad_norm": 0.11624920561782613, + "learning_rate": 5.860163627510521e-06, + "loss": 0.0354, + "num_tokens": 94566567.0, + "step": 1181 + }, + { + "epoch": 1.4741110417966312, + "grad_norm": 0.11632421080295033, + "learning_rate": 5.854123090994524e-06, + "loss": 0.0351, + "num_tokens": 94646328.0, + "step": 1182 + }, + { + "epoch": 1.4753587024329382, + "grad_norm": 0.1085474029571278, + "learning_rate": 5.848081912346329e-06, + "loss": 0.0357, + "num_tokens": 94726254.0, + "step": 1183 + }, + { + "epoch": 1.4766063630692452, + "grad_norm": 0.13928705664750285, + "learning_rate": 5.842040102520416e-06, + "loss": 0.0345, + "num_tokens": 94806492.0, + "step": 1184 + }, + { + "epoch": 1.477854023705552, + "grad_norm": 0.11956422118762197, + "learning_rate": 5.8359976724724146e-06, + "loss": 0.0373, + "num_tokens": 94888343.0, + "step": 1185 + }, + { + "epoch": 1.479101684341859, + "grad_norm": 0.10678348906652803, + "learning_rate": 5.829954633159073e-06, + "loss": 0.0365, + "num_tokens": 94968750.0, + "step": 1186 + }, + { + "epoch": 1.480349344978166, + "grad_norm": 0.10950504983347997, + "learning_rate": 5.823910995538251e-06, + "loss": 0.0363, + "num_tokens": 95048007.0, + "step": 1187 + }, + { + "epoch": 1.481597005614473, + "grad_norm": 0.10871513034226521, + "learning_rate": 5.8178667705688895e-06, + "loss": 0.034, + "num_tokens": 95127214.0, + "step": 1188 + }, + { + "epoch": 1.4828446662507797, + "grad_norm": 0.11813024759210294, + "learning_rate": 5.811821969210995e-06, + "loss": 0.0378, + "num_tokens": 95207769.0, + "step": 1189 + }, + { + "epoch": 1.4840923268870867, + "grad_norm": 0.12334121149788137, + "learning_rate": 5.8057766024256205e-06, + "loss": 0.0339, + "num_tokens": 95286918.0, + "step": 1190 + }, + { + "epoch": 1.4853399875233937, + "grad_norm": 0.11137548828632113, + "learning_rate": 5.799730681174842e-06, + "loss": 0.0345, + "num_tokens": 95367783.0, + "step": 1191 + }, + { + "epoch": 1.4865876481597007, + "grad_norm": 0.11258420176536157, + "learning_rate": 5.793684216421744e-06, + "loss": 0.0379, + "num_tokens": 95449134.0, + "step": 1192 + }, + { + "epoch": 1.4878353087960074, + "grad_norm": 0.1283974468707094, + "learning_rate": 5.787637219130392e-06, + "loss": 0.0373, + "num_tokens": 95528492.0, + "step": 1193 + }, + { + "epoch": 1.4890829694323144, + "grad_norm": 0.10883979183962499, + "learning_rate": 5.781589700265823e-06, + "loss": 0.0343, + "num_tokens": 95608208.0, + "step": 1194 + }, + { + "epoch": 1.4903306300686214, + "grad_norm": 0.10270640891807907, + "learning_rate": 5.7755416707940135e-06, + "loss": 0.0348, + "num_tokens": 95687611.0, + "step": 1195 + }, + { + "epoch": 1.4915782907049282, + "grad_norm": 0.12695450443859627, + "learning_rate": 5.76949314168187e-06, + "loss": 0.0359, + "num_tokens": 95767108.0, + "step": 1196 + }, + { + "epoch": 1.4928259513412352, + "grad_norm": 0.11114393977783613, + "learning_rate": 5.763444123897206e-06, + "loss": 0.0345, + "num_tokens": 95846696.0, + "step": 1197 + }, + { + "epoch": 1.4940736119775422, + "grad_norm": 0.10051611126921174, + "learning_rate": 5.757394628408716e-06, + "loss": 0.0355, + "num_tokens": 95927423.0, + "step": 1198 + }, + { + "epoch": 1.495321272613849, + "grad_norm": 0.10698918904729461, + "learning_rate": 5.7513446661859664e-06, + "loss": 0.0334, + "num_tokens": 96008401.0, + "step": 1199 + }, + { + "epoch": 1.496568933250156, + "grad_norm": 0.11515052950318992, + "learning_rate": 5.7452942481993655e-06, + "loss": 0.0329, + "num_tokens": 96087128.0, + "step": 1200 + }, + { + "epoch": 1.497816593886463, + "grad_norm": 0.10667262101407343, + "learning_rate": 5.739243385420151e-06, + "loss": 0.0358, + "num_tokens": 96167367.0, + "step": 1201 + }, + { + "epoch": 1.4990642545227697, + "grad_norm": 0.12405579945874308, + "learning_rate": 5.7331920888203655e-06, + "loss": 0.0346, + "num_tokens": 96246922.0, + "step": 1202 + }, + { + "epoch": 1.5003119151590767, + "grad_norm": 0.10489860643653304, + "learning_rate": 5.727140369372838e-06, + "loss": 0.0376, + "num_tokens": 96327807.0, + "step": 1203 + }, + { + "epoch": 1.5015595757953837, + "grad_norm": 0.12443731067183812, + "learning_rate": 5.721088238051168e-06, + "loss": 0.0356, + "num_tokens": 96408288.0, + "step": 1204 + }, + { + "epoch": 1.5028072364316905, + "grad_norm": 0.10422931405567512, + "learning_rate": 5.715035705829696e-06, + "loss": 0.0378, + "num_tokens": 96488747.0, + "step": 1205 + }, + { + "epoch": 1.5040548970679977, + "grad_norm": 0.11583240842588108, + "learning_rate": 5.708982783683492e-06, + "loss": 0.0351, + "num_tokens": 96567394.0, + "step": 1206 + }, + { + "epoch": 1.5053025577043044, + "grad_norm": 0.11679015463888166, + "learning_rate": 5.7029294825883365e-06, + "loss": 0.035, + "num_tokens": 96646566.0, + "step": 1207 + }, + { + "epoch": 1.5065502183406112, + "grad_norm": 0.11437606283844103, + "learning_rate": 5.696875813520691e-06, + "loss": 0.0392, + "num_tokens": 96727492.0, + "step": 1208 + }, + { + "epoch": 1.5077978789769184, + "grad_norm": 0.11691760587370684, + "learning_rate": 5.69082178745769e-06, + "loss": 0.0352, + "num_tokens": 96807931.0, + "step": 1209 + }, + { + "epoch": 1.5090455396132252, + "grad_norm": 0.10574935510817819, + "learning_rate": 5.68476741537711e-06, + "loss": 0.0346, + "num_tokens": 96887154.0, + "step": 1210 + }, + { + "epoch": 1.5102932002495322, + "grad_norm": 0.11625074019798143, + "learning_rate": 5.678712708257358e-06, + "loss": 0.039, + "num_tokens": 96969029.0, + "step": 1211 + }, + { + "epoch": 1.5115408608858392, + "grad_norm": 0.11630274180853753, + "learning_rate": 5.672657677077449e-06, + "loss": 0.0363, + "num_tokens": 97049726.0, + "step": 1212 + }, + { + "epoch": 1.512788521522146, + "grad_norm": 0.1127526825270542, + "learning_rate": 5.666602332816985e-06, + "loss": 0.0372, + "num_tokens": 97130469.0, + "step": 1213 + }, + { + "epoch": 1.514036182158453, + "grad_norm": 0.11863761031881935, + "learning_rate": 5.6605466864561344e-06, + "loss": 0.0344, + "num_tokens": 97210798.0, + "step": 1214 + }, + { + "epoch": 1.51528384279476, + "grad_norm": 0.11747556688292905, + "learning_rate": 5.654490748975615e-06, + "loss": 0.0368, + "num_tokens": 97290820.0, + "step": 1215 + }, + { + "epoch": 1.5165315034310667, + "grad_norm": 0.11740884642076882, + "learning_rate": 5.648434531356671e-06, + "loss": 0.0341, + "num_tokens": 97370963.0, + "step": 1216 + }, + { + "epoch": 1.5177791640673737, + "grad_norm": 0.1121718619003604, + "learning_rate": 5.642378044581057e-06, + "loss": 0.0372, + "num_tokens": 97451787.0, + "step": 1217 + }, + { + "epoch": 1.5190268247036807, + "grad_norm": 0.11028715028041086, + "learning_rate": 5.636321299631015e-06, + "loss": 0.0355, + "num_tokens": 97531107.0, + "step": 1218 + }, + { + "epoch": 1.5202744853399874, + "grad_norm": 0.11526797879935653, + "learning_rate": 5.630264307489251e-06, + "loss": 0.0356, + "num_tokens": 97610596.0, + "step": 1219 + }, + { + "epoch": 1.5215221459762944, + "grad_norm": 0.11697834812620382, + "learning_rate": 5.624207079138922e-06, + "loss": 0.0372, + "num_tokens": 97692010.0, + "step": 1220 + }, + { + "epoch": 1.5227698066126014, + "grad_norm": 0.11361340804878213, + "learning_rate": 5.6181496255636195e-06, + "loss": 0.038, + "num_tokens": 97771259.0, + "step": 1221 + }, + { + "epoch": 1.5240174672489082, + "grad_norm": 0.11948073802819091, + "learning_rate": 5.612091957747333e-06, + "loss": 0.0362, + "num_tokens": 97851776.0, + "step": 1222 + }, + { + "epoch": 1.5252651278852152, + "grad_norm": 0.11933952019877841, + "learning_rate": 5.606034086674447e-06, + "loss": 0.0347, + "num_tokens": 97931323.0, + "step": 1223 + }, + { + "epoch": 1.5265127885215222, + "grad_norm": 0.1119341984186723, + "learning_rate": 5.5999760233297115e-06, + "loss": 0.0355, + "num_tokens": 98012414.0, + "step": 1224 + }, + { + "epoch": 1.527760449157829, + "grad_norm": 0.12570139447794026, + "learning_rate": 5.593917778698227e-06, + "loss": 0.0351, + "num_tokens": 98092865.0, + "step": 1225 + }, + { + "epoch": 1.5290081097941361, + "grad_norm": 0.12255701579140926, + "learning_rate": 5.5878593637654226e-06, + "loss": 0.0372, + "num_tokens": 98173575.0, + "step": 1226 + }, + { + "epoch": 1.530255770430443, + "grad_norm": 0.12045998667186913, + "learning_rate": 5.581800789517036e-06, + "loss": 0.0338, + "num_tokens": 98253478.0, + "step": 1227 + }, + { + "epoch": 1.5315034310667497, + "grad_norm": 0.10599310040291675, + "learning_rate": 5.5757420669390925e-06, + "loss": 0.0333, + "num_tokens": 98331761.0, + "step": 1228 + }, + { + "epoch": 1.532751091703057, + "grad_norm": 0.1189636777802236, + "learning_rate": 5.5696832070178885e-06, + "loss": 0.0353, + "num_tokens": 98412258.0, + "step": 1229 + }, + { + "epoch": 1.5339987523393637, + "grad_norm": 0.11265547375888633, + "learning_rate": 5.563624220739969e-06, + "loss": 0.0369, + "num_tokens": 98492861.0, + "step": 1230 + }, + { + "epoch": 1.5352464129756707, + "grad_norm": 0.1135501412896236, + "learning_rate": 5.557565119092106e-06, + "loss": 0.034, + "num_tokens": 98572091.0, + "step": 1231 + }, + { + "epoch": 1.5364940736119777, + "grad_norm": 0.12706095636153494, + "learning_rate": 5.551505913061281e-06, + "loss": 0.0386, + "num_tokens": 98652747.0, + "step": 1232 + }, + { + "epoch": 1.5377417342482844, + "grad_norm": 0.11767249883496335, + "learning_rate": 5.54544661363467e-06, + "loss": 0.0356, + "num_tokens": 98732307.0, + "step": 1233 + }, + { + "epoch": 1.5389893948845914, + "grad_norm": 0.11861760143450235, + "learning_rate": 5.53938723179961e-06, + "loss": 0.0337, + "num_tokens": 98811668.0, + "step": 1234 + }, + { + "epoch": 1.5402370555208984, + "grad_norm": 0.1110579082090557, + "learning_rate": 5.533327778543593e-06, + "loss": 0.0356, + "num_tokens": 98890773.0, + "step": 1235 + }, + { + "epoch": 1.5414847161572052, + "grad_norm": 0.10918115524283789, + "learning_rate": 5.527268264854241e-06, + "loss": 0.0354, + "num_tokens": 98970768.0, + "step": 1236 + }, + { + "epoch": 1.5427323767935122, + "grad_norm": 0.1120508500986546, + "learning_rate": 5.521208701719284e-06, + "loss": 0.0371, + "num_tokens": 99052179.0, + "step": 1237 + }, + { + "epoch": 1.5439800374298192, + "grad_norm": 0.11689356960540866, + "learning_rate": 5.515149100126539e-06, + "loss": 0.0364, + "num_tokens": 99132493.0, + "step": 1238 + }, + { + "epoch": 1.545227698066126, + "grad_norm": 0.10663726507181617, + "learning_rate": 5.509089471063897e-06, + "loss": 0.0338, + "num_tokens": 99212713.0, + "step": 1239 + }, + { + "epoch": 1.546475358702433, + "grad_norm": 0.10825998214250622, + "learning_rate": 5.503029825519296e-06, + "loss": 0.0346, + "num_tokens": 99292651.0, + "step": 1240 + }, + { + "epoch": 1.54772301933874, + "grad_norm": 0.11313988408089551, + "learning_rate": 5.496970174480706e-06, + "loss": 0.0339, + "num_tokens": 99372261.0, + "step": 1241 + }, + { + "epoch": 1.5489706799750467, + "grad_norm": 0.12649997908530414, + "learning_rate": 5.4909105289361055e-06, + "loss": 0.0539, + "num_tokens": 99453192.0, + "step": 1242 + }, + { + "epoch": 1.5502183406113537, + "grad_norm": 0.1186247978835761, + "learning_rate": 5.4848508998734626e-06, + "loss": 0.0362, + "num_tokens": 99534693.0, + "step": 1243 + }, + { + "epoch": 1.5514660012476607, + "grad_norm": 0.11032319195795326, + "learning_rate": 5.478791298280719e-06, + "loss": 0.0325, + "num_tokens": 99613614.0, + "step": 1244 + }, + { + "epoch": 1.5527136618839674, + "grad_norm": 0.10331362127146462, + "learning_rate": 5.47273173514576e-06, + "loss": 0.0366, + "num_tokens": 99694144.0, + "step": 1245 + }, + { + "epoch": 1.5539613225202746, + "grad_norm": 0.11441502157583171, + "learning_rate": 5.466672221456408e-06, + "loss": 0.0352, + "num_tokens": 99772396.0, + "step": 1246 + }, + { + "epoch": 1.5552089831565814, + "grad_norm": 0.1384519685906425, + "learning_rate": 5.4606127682003915e-06, + "loss": 0.0364, + "num_tokens": 99853878.0, + "step": 1247 + }, + { + "epoch": 1.5564566437928882, + "grad_norm": 0.1187428026312172, + "learning_rate": 5.454553386365333e-06, + "loss": 0.0362, + "num_tokens": 99933199.0, + "step": 1248 + }, + { + "epoch": 1.5577043044291954, + "grad_norm": 0.1154288559693241, + "learning_rate": 5.44849408693872e-06, + "loss": 0.0355, + "num_tokens": 100013822.0, + "step": 1249 + }, + { + "epoch": 1.5589519650655022, + "grad_norm": 0.10939017703608667, + "learning_rate": 5.4424348809078974e-06, + "loss": 0.0364, + "num_tokens": 100093850.0, + "step": 1250 + }, + { + "epoch": 1.5601996257018091, + "grad_norm": 0.11577706451313442, + "learning_rate": 5.436375779260034e-06, + "loss": 0.0348, + "num_tokens": 100174014.0, + "step": 1251 + }, + { + "epoch": 1.5614472863381161, + "grad_norm": 0.11552930897371735, + "learning_rate": 5.430316792982112e-06, + "loss": 0.0364, + "num_tokens": 100254096.0, + "step": 1252 + }, + { + "epoch": 1.562694946974423, + "grad_norm": 0.12091413731054657, + "learning_rate": 5.424257933060908e-06, + "loss": 0.036, + "num_tokens": 100335736.0, + "step": 1253 + }, + { + "epoch": 1.56394260761073, + "grad_norm": 0.11349954972674088, + "learning_rate": 5.418199210482965e-06, + "loss": 0.0339, + "num_tokens": 100415770.0, + "step": 1254 + }, + { + "epoch": 1.5651902682470369, + "grad_norm": 0.11257539896050413, + "learning_rate": 5.412140636234579e-06, + "loss": 0.0365, + "num_tokens": 100496239.0, + "step": 1255 + }, + { + "epoch": 1.5664379288833437, + "grad_norm": 0.10433088685065614, + "learning_rate": 5.4060822213017745e-06, + "loss": 0.033, + "num_tokens": 100575751.0, + "step": 1256 + }, + { + "epoch": 1.5676855895196506, + "grad_norm": 0.11128220666515805, + "learning_rate": 5.400023976670291e-06, + "loss": 0.0362, + "num_tokens": 100655896.0, + "step": 1257 + }, + { + "epoch": 1.5689332501559576, + "grad_norm": 0.11665278503733945, + "learning_rate": 5.393965913325555e-06, + "loss": 0.036, + "num_tokens": 100736726.0, + "step": 1258 + }, + { + "epoch": 1.5701809107922644, + "grad_norm": 0.11372703831180105, + "learning_rate": 5.387908042252667e-06, + "loss": 0.0521, + "num_tokens": 100817144.0, + "step": 1259 + }, + { + "epoch": 1.5714285714285714, + "grad_norm": 0.11403138474441783, + "learning_rate": 5.381850374436383e-06, + "loss": 0.0349, + "num_tokens": 100896712.0, + "step": 1260 + }, + { + "epoch": 1.5726762320648784, + "grad_norm": 0.11114301741148946, + "learning_rate": 5.3757929208610784e-06, + "loss": 0.0338, + "num_tokens": 100975384.0, + "step": 1261 + }, + { + "epoch": 1.5739238927011852, + "grad_norm": 0.15901465459498454, + "learning_rate": 5.3697356925107514e-06, + "loss": 0.0353, + "num_tokens": 101055790.0, + "step": 1262 + }, + { + "epoch": 1.5751715533374921, + "grad_norm": 0.10627021014563413, + "learning_rate": 5.363678700368987e-06, + "loss": 0.0369, + "num_tokens": 101136479.0, + "step": 1263 + }, + { + "epoch": 1.5764192139737991, + "grad_norm": 0.12456078400858275, + "learning_rate": 5.3576219554189445e-06, + "loss": 0.0401, + "num_tokens": 101217634.0, + "step": 1264 + }, + { + "epoch": 1.577666874610106, + "grad_norm": 0.1210041167801264, + "learning_rate": 5.35156546864333e-06, + "loss": 0.037, + "num_tokens": 101297872.0, + "step": 1265 + }, + { + "epoch": 1.5789145352464131, + "grad_norm": 0.1070252120913601, + "learning_rate": 5.345509251024387e-06, + "loss": 0.0374, + "num_tokens": 101380176.0, + "step": 1266 + }, + { + "epoch": 1.5801621958827199, + "grad_norm": 0.10983324132682443, + "learning_rate": 5.339453313543868e-06, + "loss": 0.0331, + "num_tokens": 101459436.0, + "step": 1267 + }, + { + "epoch": 1.5814098565190269, + "grad_norm": 0.10244262198712618, + "learning_rate": 5.3333976671830165e-06, + "loss": 0.0343, + "num_tokens": 101538263.0, + "step": 1268 + }, + { + "epoch": 1.5826575171553339, + "grad_norm": 0.10678644753096563, + "learning_rate": 5.327342322922553e-06, + "loss": 0.0333, + "num_tokens": 101618020.0, + "step": 1269 + }, + { + "epoch": 1.5839051777916406, + "grad_norm": 0.10303177226381595, + "learning_rate": 5.321287291742645e-06, + "loss": 0.0335, + "num_tokens": 101696955.0, + "step": 1270 + }, + { + "epoch": 1.5851528384279476, + "grad_norm": 0.10922923647766163, + "learning_rate": 5.315232584622893e-06, + "loss": 0.0332, + "num_tokens": 101776118.0, + "step": 1271 + }, + { + "epoch": 1.5864004990642546, + "grad_norm": 0.11106303953873586, + "learning_rate": 5.309178212542313e-06, + "loss": 0.0342, + "num_tokens": 101855486.0, + "step": 1272 + }, + { + "epoch": 1.5876481597005614, + "grad_norm": 0.11146880211359997, + "learning_rate": 5.303124186479309e-06, + "loss": 0.0325, + "num_tokens": 101933520.0, + "step": 1273 + }, + { + "epoch": 1.5888958203368684, + "grad_norm": 0.11066657207520725, + "learning_rate": 5.297070517411664e-06, + "loss": 0.037, + "num_tokens": 102015666.0, + "step": 1274 + }, + { + "epoch": 1.5901434809731754, + "grad_norm": 0.12070337142463965, + "learning_rate": 5.2910172163165096e-06, + "loss": 0.0355, + "num_tokens": 102095242.0, + "step": 1275 + }, + { + "epoch": 1.5913911416094821, + "grad_norm": 0.11240092383283054, + "learning_rate": 5.284964294170306e-06, + "loss": 0.0362, + "num_tokens": 102174205.0, + "step": 1276 + }, + { + "epoch": 1.5926388022457891, + "grad_norm": 0.12576136429390572, + "learning_rate": 5.278911761948834e-06, + "loss": 0.0355, + "num_tokens": 102255437.0, + "step": 1277 + }, + { + "epoch": 1.5938864628820961, + "grad_norm": 0.11710384991621879, + "learning_rate": 5.272859630627164e-06, + "loss": 0.0347, + "num_tokens": 102334946.0, + "step": 1278 + }, + { + "epoch": 1.595134123518403, + "grad_norm": 0.10521059038838904, + "learning_rate": 5.266807911179638e-06, + "loss": 0.0354, + "num_tokens": 102414226.0, + "step": 1279 + }, + { + "epoch": 1.5963817841547099, + "grad_norm": 0.11308055443975817, + "learning_rate": 5.260756614579851e-06, + "loss": 0.0327, + "num_tokens": 102493991.0, + "step": 1280 + }, + { + "epoch": 1.5976294447910169, + "grad_norm": 0.1186031956772661, + "learning_rate": 5.254705751800636e-06, + "loss": 0.0348, + "num_tokens": 102574527.0, + "step": 1281 + }, + { + "epoch": 1.5988771054273236, + "grad_norm": 0.10866075596998348, + "learning_rate": 5.248655333814036e-06, + "loss": 0.0333, + "num_tokens": 102654519.0, + "step": 1282 + }, + { + "epoch": 1.6001247660636309, + "grad_norm": 0.10696137668864042, + "learning_rate": 5.242605371591286e-06, + "loss": 0.0352, + "num_tokens": 102734795.0, + "step": 1283 + }, + { + "epoch": 1.6013724266999376, + "grad_norm": 0.10337102406850418, + "learning_rate": 5.236555876102797e-06, + "loss": 0.0375, + "num_tokens": 102814405.0, + "step": 1284 + }, + { + "epoch": 1.6026200873362444, + "grad_norm": 0.12757465999262096, + "learning_rate": 5.2305068583181314e-06, + "loss": 0.036, + "num_tokens": 102894980.0, + "step": 1285 + }, + { + "epoch": 1.6038677479725516, + "grad_norm": 0.13690644302329974, + "learning_rate": 5.2244583292059896e-06, + "loss": 0.0366, + "num_tokens": 102977017.0, + "step": 1286 + }, + { + "epoch": 1.6051154086088584, + "grad_norm": 0.11930944617293818, + "learning_rate": 5.218410299734181e-06, + "loss": 0.034, + "num_tokens": 103056779.0, + "step": 1287 + }, + { + "epoch": 1.6063630692451654, + "grad_norm": 0.11012326303635499, + "learning_rate": 5.2123627808696084e-06, + "loss": 0.0339, + "num_tokens": 103137283.0, + "step": 1288 + }, + { + "epoch": 1.6076107298814724, + "grad_norm": 0.10957703717797697, + "learning_rate": 5.206315783578258e-06, + "loss": 0.0336, + "num_tokens": 103216776.0, + "step": 1289 + }, + { + "epoch": 1.6088583905177791, + "grad_norm": 0.10906712477554352, + "learning_rate": 5.20026931882516e-06, + "loss": 0.0341, + "num_tokens": 103296399.0, + "step": 1290 + }, + { + "epoch": 1.6101060511540861, + "grad_norm": 0.11440229595698308, + "learning_rate": 5.194223397574381e-06, + "loss": 0.0376, + "num_tokens": 103376203.0, + "step": 1291 + }, + { + "epoch": 1.611353711790393, + "grad_norm": 0.12588773683905502, + "learning_rate": 5.188178030789008e-06, + "loss": 0.0346, + "num_tokens": 103456735.0, + "step": 1292 + }, + { + "epoch": 1.6126013724266999, + "grad_norm": 0.12042374456895473, + "learning_rate": 5.1821332294311136e-06, + "loss": 0.0356, + "num_tokens": 103537257.0, + "step": 1293 + }, + { + "epoch": 1.6138490330630069, + "grad_norm": 0.11853035765383917, + "learning_rate": 5.176089004461752e-06, + "loss": 0.0361, + "num_tokens": 103617737.0, + "step": 1294 + }, + { + "epoch": 1.6150966936993139, + "grad_norm": 0.1084054829939777, + "learning_rate": 5.170045366840929e-06, + "loss": 0.0373, + "num_tokens": 103698813.0, + "step": 1295 + }, + { + "epoch": 1.6163443543356206, + "grad_norm": 0.12273703655358631, + "learning_rate": 5.164002327527588e-06, + "loss": 0.0374, + "num_tokens": 103782172.0, + "step": 1296 + }, + { + "epoch": 1.6175920149719276, + "grad_norm": 0.10996795677144779, + "learning_rate": 5.157959897479587e-06, + "loss": 0.0359, + "num_tokens": 103862312.0, + "step": 1297 + }, + { + "epoch": 1.6188396756082346, + "grad_norm": 0.11103120135245384, + "learning_rate": 5.151918087653672e-06, + "loss": 0.0336, + "num_tokens": 103943466.0, + "step": 1298 + }, + { + "epoch": 1.6200873362445414, + "grad_norm": 0.10467769656583091, + "learning_rate": 5.145876909005477e-06, + "loss": 0.0335, + "num_tokens": 104023154.0, + "step": 1299 + }, + { + "epoch": 1.6213349968808484, + "grad_norm": 0.10318413558631827, + "learning_rate": 5.139836372489481e-06, + "loss": 0.0318, + "num_tokens": 104102426.0, + "step": 1300 + }, + { + "epoch": 1.6225826575171554, + "grad_norm": 0.10919271565885186, + "learning_rate": 5.133796489059005e-06, + "loss": 0.0355, + "num_tokens": 104182188.0, + "step": 1301 + }, + { + "epoch": 1.6238303181534621, + "grad_norm": 0.11310842463169009, + "learning_rate": 5.1277572696661806e-06, + "loss": 0.0351, + "num_tokens": 104261762.0, + "step": 1302 + }, + { + "epoch": 1.6250779787897693, + "grad_norm": 0.1165076029660535, + "learning_rate": 5.12171872526194e-06, + "loss": 0.0355, + "num_tokens": 104342372.0, + "step": 1303 + }, + { + "epoch": 1.626325639426076, + "grad_norm": 0.11376695615925705, + "learning_rate": 5.115680866795989e-06, + "loss": 0.0342, + "num_tokens": 104422382.0, + "step": 1304 + }, + { + "epoch": 1.6275733000623829, + "grad_norm": 0.11767626403403031, + "learning_rate": 5.109643705216789e-06, + "loss": 0.0354, + "num_tokens": 104503015.0, + "step": 1305 + }, + { + "epoch": 1.62882096069869, + "grad_norm": 0.1181534867539231, + "learning_rate": 5.103607251471541e-06, + "loss": 0.0321, + "num_tokens": 104582079.0, + "step": 1306 + }, + { + "epoch": 1.6300686213349969, + "grad_norm": 0.09517674089409651, + "learning_rate": 5.097571516506158e-06, + "loss": 0.0331, + "num_tokens": 104661780.0, + "step": 1307 + }, + { + "epoch": 1.6313162819713038, + "grad_norm": 0.11931275868780851, + "learning_rate": 5.091536511265253e-06, + "loss": 0.0348, + "num_tokens": 104741752.0, + "step": 1308 + }, + { + "epoch": 1.6325639426076108, + "grad_norm": 0.10940449179708552, + "learning_rate": 5.085502246692111e-06, + "loss": 0.0354, + "num_tokens": 104822380.0, + "step": 1309 + }, + { + "epoch": 1.6338116032439176, + "grad_norm": 0.12053843559094413, + "learning_rate": 5.079468733728684e-06, + "loss": 0.0369, + "num_tokens": 104902595.0, + "step": 1310 + }, + { + "epoch": 1.6350592638802246, + "grad_norm": 0.11265796315863995, + "learning_rate": 5.07343598331555e-06, + "loss": 0.0336, + "num_tokens": 104981340.0, + "step": 1311 + }, + { + "epoch": 1.6363069245165316, + "grad_norm": 0.1109231726527133, + "learning_rate": 5.0674040063919114e-06, + "loss": 0.0349, + "num_tokens": 105060850.0, + "step": 1312 + }, + { + "epoch": 1.6375545851528384, + "grad_norm": 0.11588144786485258, + "learning_rate": 5.0613728138955644e-06, + "loss": 0.0343, + "num_tokens": 105140234.0, + "step": 1313 + }, + { + "epoch": 1.6388022457891454, + "grad_norm": 0.10922220587388409, + "learning_rate": 5.055342416762883e-06, + "loss": 0.0333, + "num_tokens": 105219670.0, + "step": 1314 + }, + { + "epoch": 1.6400499064254523, + "grad_norm": 0.10863070032451541, + "learning_rate": 5.0493128259288025e-06, + "loss": 0.0348, + "num_tokens": 105300783.0, + "step": 1315 + }, + { + "epoch": 1.641297567061759, + "grad_norm": 0.10544356260119463, + "learning_rate": 5.043284052326789e-06, + "loss": 0.0337, + "num_tokens": 105380432.0, + "step": 1316 + }, + { + "epoch": 1.642545227698066, + "grad_norm": 0.10945480005987275, + "learning_rate": 5.037256106888837e-06, + "loss": 0.0337, + "num_tokens": 105459281.0, + "step": 1317 + }, + { + "epoch": 1.643792888334373, + "grad_norm": 0.10513668226551459, + "learning_rate": 5.03122900054543e-06, + "loss": 0.0364, + "num_tokens": 105539043.0, + "step": 1318 + }, + { + "epoch": 1.6450405489706799, + "grad_norm": 0.11889358390219724, + "learning_rate": 5.025202744225535e-06, + "loss": 0.0323, + "num_tokens": 105618097.0, + "step": 1319 + }, + { + "epoch": 1.6462882096069869, + "grad_norm": 0.10819389235655742, + "learning_rate": 5.019177348856576e-06, + "loss": 0.0384, + "num_tokens": 105700246.0, + "step": 1320 + }, + { + "epoch": 1.6475358702432938, + "grad_norm": 0.12285064335570973, + "learning_rate": 5.013152825364416e-06, + "loss": 0.0335, + "num_tokens": 105779956.0, + "step": 1321 + }, + { + "epoch": 1.6487835308796006, + "grad_norm": 0.10457229323362498, + "learning_rate": 5.007129184673335e-06, + "loss": 0.0342, + "num_tokens": 105859422.0, + "step": 1322 + }, + { + "epoch": 1.6500311915159078, + "grad_norm": 0.10418232593169255, + "learning_rate": 5.001106437706016e-06, + "loss": 0.0354, + "num_tokens": 105939798.0, + "step": 1323 + }, + { + "epoch": 1.6512788521522146, + "grad_norm": 0.10621297870744945, + "learning_rate": 4.99508459538352e-06, + "loss": 0.035, + "num_tokens": 106019008.0, + "step": 1324 + }, + { + "epoch": 1.6525265127885214, + "grad_norm": 0.11055886736084558, + "learning_rate": 4.989063668625267e-06, + "loss": 0.0331, + "num_tokens": 106099014.0, + "step": 1325 + }, + { + "epoch": 1.6537741734248286, + "grad_norm": 0.12055990828923857, + "learning_rate": 4.983043668349018e-06, + "loss": 0.035, + "num_tokens": 106178538.0, + "step": 1326 + }, + { + "epoch": 1.6550218340611353, + "grad_norm": 0.11615411814602732, + "learning_rate": 4.977024605470851e-06, + "loss": 0.0342, + "num_tokens": 106257051.0, + "step": 1327 + }, + { + "epoch": 1.6562694946974423, + "grad_norm": 0.11242812128073622, + "learning_rate": 4.971006490905148e-06, + "loss": 0.0326, + "num_tokens": 106337027.0, + "step": 1328 + }, + { + "epoch": 1.6575171553337493, + "grad_norm": 0.10451594006379315, + "learning_rate": 4.964989335564571e-06, + "loss": 0.035, + "num_tokens": 106415989.0, + "step": 1329 + }, + { + "epoch": 1.658764815970056, + "grad_norm": 0.12008210058517821, + "learning_rate": 4.958973150360034e-06, + "loss": 0.0338, + "num_tokens": 106496902.0, + "step": 1330 + }, + { + "epoch": 1.660012476606363, + "grad_norm": 0.10283103060613702, + "learning_rate": 4.952957946200709e-06, + "loss": 0.0316, + "num_tokens": 106576276.0, + "step": 1331 + }, + { + "epoch": 1.66126013724267, + "grad_norm": 0.10840588788185893, + "learning_rate": 4.946943733993974e-06, + "loss": 0.0342, + "num_tokens": 106656074.0, + "step": 1332 + }, + { + "epoch": 1.6625077978789768, + "grad_norm": 0.1151122733175202, + "learning_rate": 4.940930524645414e-06, + "loss": 0.0359, + "num_tokens": 106737048.0, + "step": 1333 + }, + { + "epoch": 1.6637554585152838, + "grad_norm": 0.12138327404493408, + "learning_rate": 4.934918329058798e-06, + "loss": 0.0329, + "num_tokens": 106817115.0, + "step": 1334 + }, + { + "epoch": 1.6650031191515908, + "grad_norm": 0.10347193468215583, + "learning_rate": 4.928907158136049e-06, + "loss": 0.0351, + "num_tokens": 106896600.0, + "step": 1335 + }, + { + "epoch": 1.6662507797878976, + "grad_norm": 0.10818319318418472, + "learning_rate": 4.922897022777241e-06, + "loss": 0.0342, + "num_tokens": 106976257.0, + "step": 1336 + }, + { + "epoch": 1.6674984404242046, + "grad_norm": 0.125943792765203, + "learning_rate": 4.916887933880562e-06, + "loss": 0.0364, + "num_tokens": 107056103.0, + "step": 1337 + }, + { + "epoch": 1.6687461010605116, + "grad_norm": 0.10714932245614212, + "learning_rate": 4.910879902342309e-06, + "loss": 0.0328, + "num_tokens": 107135177.0, + "step": 1338 + }, + { + "epoch": 1.6699937616968183, + "grad_norm": 0.10022422578809588, + "learning_rate": 4.904872939056859e-06, + "loss": 0.0327, + "num_tokens": 107215076.0, + "step": 1339 + }, + { + "epoch": 1.6712414223331253, + "grad_norm": 0.11904997859265425, + "learning_rate": 4.898867054916655e-06, + "loss": 0.0331, + "num_tokens": 107294670.0, + "step": 1340 + }, + { + "epoch": 1.6724890829694323, + "grad_norm": 0.10346330508751622, + "learning_rate": 4.892862260812174e-06, + "loss": 0.0355, + "num_tokens": 107375743.0, + "step": 1341 + }, + { + "epoch": 1.673736743605739, + "grad_norm": 0.10320313489980291, + "learning_rate": 4.886858567631927e-06, + "loss": 0.037, + "num_tokens": 107456245.0, + "step": 1342 + }, + { + "epoch": 1.6749844042420463, + "grad_norm": 0.11376647833132077, + "learning_rate": 4.880855986262424e-06, + "loss": 0.0336, + "num_tokens": 107535076.0, + "step": 1343 + }, + { + "epoch": 1.676232064878353, + "grad_norm": 0.11251959987852904, + "learning_rate": 4.874854527588159e-06, + "loss": 0.0348, + "num_tokens": 107615448.0, + "step": 1344 + }, + { + "epoch": 1.6774797255146598, + "grad_norm": 0.11732474632394792, + "learning_rate": 4.868854202491587e-06, + "loss": 0.0343, + "num_tokens": 107695214.0, + "step": 1345 + }, + { + "epoch": 1.678727386150967, + "grad_norm": 0.11672350989227481, + "learning_rate": 4.862855021853117e-06, + "loss": 0.0352, + "num_tokens": 107775647.0, + "step": 1346 + }, + { + "epoch": 1.6799750467872738, + "grad_norm": 0.11684779926230927, + "learning_rate": 4.856856996551074e-06, + "loss": 0.0343, + "num_tokens": 107855586.0, + "step": 1347 + }, + { + "epoch": 1.6812227074235808, + "grad_norm": 0.11194528364428014, + "learning_rate": 4.850860137461691e-06, + "loss": 0.0349, + "num_tokens": 107933910.0, + "step": 1348 + }, + { + "epoch": 1.6824703680598878, + "grad_norm": 0.12241649866099316, + "learning_rate": 4.844864455459085e-06, + "loss": 0.0331, + "num_tokens": 108013285.0, + "step": 1349 + }, + { + "epoch": 1.6837180286961946, + "grad_norm": 0.10701404402743815, + "learning_rate": 4.83886996141524e-06, + "loss": 0.0355, + "num_tokens": 108093741.0, + "step": 1350 + }, + { + "epoch": 1.6849656893325016, + "grad_norm": 0.12427674118245957, + "learning_rate": 4.8328766661999885e-06, + "loss": 0.0351, + "num_tokens": 108174452.0, + "step": 1351 + }, + { + "epoch": 1.6862133499688086, + "grad_norm": 0.11109278194963701, + "learning_rate": 4.826884580680981e-06, + "loss": 0.0363, + "num_tokens": 108255274.0, + "step": 1352 + }, + { + "epoch": 1.6874610106051153, + "grad_norm": 0.11471172964381411, + "learning_rate": 4.8208937157236855e-06, + "loss": 0.033, + "num_tokens": 108335095.0, + "step": 1353 + }, + { + "epoch": 1.6887086712414223, + "grad_norm": 0.12341901134238756, + "learning_rate": 4.814904082191349e-06, + "loss": 0.0342, + "num_tokens": 108415877.0, + "step": 1354 + }, + { + "epoch": 1.6899563318777293, + "grad_norm": 0.1050413188206562, + "learning_rate": 4.8089156909449845e-06, + "loss": 0.0342, + "num_tokens": 108495262.0, + "step": 1355 + }, + { + "epoch": 1.691203992514036, + "grad_norm": 0.10900307986971748, + "learning_rate": 4.802928552843358e-06, + "loss": 0.0351, + "num_tokens": 108574803.0, + "step": 1356 + }, + { + "epoch": 1.692451653150343, + "grad_norm": 0.12184670852461642, + "learning_rate": 4.79694267874296e-06, + "loss": 0.0373, + "num_tokens": 108655482.0, + "step": 1357 + }, + { + "epoch": 1.69369931378665, + "grad_norm": 0.10975468138719802, + "learning_rate": 4.790958079497991e-06, + "loss": 0.0342, + "num_tokens": 108735411.0, + "step": 1358 + }, + { + "epoch": 1.6949469744229568, + "grad_norm": 0.10818231872378661, + "learning_rate": 4.784974765960335e-06, + "loss": 0.0361, + "num_tokens": 108815263.0, + "step": 1359 + }, + { + "epoch": 1.696194635059264, + "grad_norm": 0.10258324984418032, + "learning_rate": 4.77899274897955e-06, + "loss": 0.0342, + "num_tokens": 108894762.0, + "step": 1360 + }, + { + "epoch": 1.6974422956955708, + "grad_norm": 0.11081946376785735, + "learning_rate": 4.773012039402841e-06, + "loss": 0.0368, + "num_tokens": 108975844.0, + "step": 1361 + }, + { + "epoch": 1.6986899563318776, + "grad_norm": 0.10763088262915656, + "learning_rate": 4.767032648075043e-06, + "loss": 0.0352, + "num_tokens": 109056110.0, + "step": 1362 + }, + { + "epoch": 1.6999376169681848, + "grad_norm": 0.10826684816224101, + "learning_rate": 4.761054585838599e-06, + "loss": 0.0341, + "num_tokens": 109136888.0, + "step": 1363 + }, + { + "epoch": 1.7011852776044916, + "grad_norm": 0.11349832649415174, + "learning_rate": 4.755077863533541e-06, + "loss": 0.0342, + "num_tokens": 109216345.0, + "step": 1364 + }, + { + "epoch": 1.7024329382407986, + "grad_norm": 0.10721063931317605, + "learning_rate": 4.749102491997476e-06, + "loss": 0.0304, + "num_tokens": 109297726.0, + "step": 1365 + }, + { + "epoch": 1.7036805988771055, + "grad_norm": 0.11814753856914108, + "learning_rate": 4.743128482065555e-06, + "loss": 0.0368, + "num_tokens": 109377771.0, + "step": 1366 + }, + { + "epoch": 1.7049282595134123, + "grad_norm": 0.10923227600327748, + "learning_rate": 4.737155844570468e-06, + "loss": 0.0358, + "num_tokens": 109457147.0, + "step": 1367 + }, + { + "epoch": 1.7061759201497193, + "grad_norm": 0.1180774047748652, + "learning_rate": 4.7311845903424104e-06, + "loss": 0.0326, + "num_tokens": 109536870.0, + "step": 1368 + }, + { + "epoch": 1.7074235807860263, + "grad_norm": 0.10414052812314961, + "learning_rate": 4.725214730209069e-06, + "loss": 0.0348, + "num_tokens": 109617166.0, + "step": 1369 + }, + { + "epoch": 1.708671241422333, + "grad_norm": 0.11325120099720434, + "learning_rate": 4.719246274995607e-06, + "loss": 0.0373, + "num_tokens": 109697533.0, + "step": 1370 + }, + { + "epoch": 1.70991890205864, + "grad_norm": 0.10237282354895524, + "learning_rate": 4.713279235524637e-06, + "loss": 0.0339, + "num_tokens": 109778192.0, + "step": 1371 + }, + { + "epoch": 1.711166562694947, + "grad_norm": 0.1495589045820829, + "learning_rate": 4.707313622616205e-06, + "loss": 0.0348, + "num_tokens": 109858008.0, + "step": 1372 + }, + { + "epoch": 1.7124142233312538, + "grad_norm": 0.1155569950547421, + "learning_rate": 4.701349447087769e-06, + "loss": 0.0338, + "num_tokens": 109938620.0, + "step": 1373 + }, + { + "epoch": 1.7136618839675608, + "grad_norm": 0.1085633556890907, + "learning_rate": 4.695386719754184e-06, + "loss": 0.0344, + "num_tokens": 110018354.0, + "step": 1374 + }, + { + "epoch": 1.7149095446038678, + "grad_norm": 0.11234843873728942, + "learning_rate": 4.689425451427677e-06, + "loss": 0.0359, + "num_tokens": 110098749.0, + "step": 1375 + }, + { + "epoch": 1.7161572052401746, + "grad_norm": 0.11707214723093819, + "learning_rate": 4.683465652917828e-06, + "loss": 0.0358, + "num_tokens": 110178574.0, + "step": 1376 + }, + { + "epoch": 1.7174048658764816, + "grad_norm": 0.11575033343150132, + "learning_rate": 4.677507335031555e-06, + "loss": 0.0387, + "num_tokens": 110258909.0, + "step": 1377 + }, + { + "epoch": 1.7186525265127885, + "grad_norm": 0.1219047452029518, + "learning_rate": 4.671550508573087e-06, + "loss": 0.0364, + "num_tokens": 110338876.0, + "step": 1378 + }, + { + "epoch": 1.7199001871490953, + "grad_norm": 0.11606054592764724, + "learning_rate": 4.6655951843439514e-06, + "loss": 0.034, + "num_tokens": 110417753.0, + "step": 1379 + }, + { + "epoch": 1.7211478477854025, + "grad_norm": 0.11058642589780793, + "learning_rate": 4.659641373142953e-06, + "loss": 0.0369, + "num_tokens": 110497691.0, + "step": 1380 + }, + { + "epoch": 1.7223955084217093, + "grad_norm": 0.10502695534031538, + "learning_rate": 4.653689085766147e-06, + "loss": 0.0338, + "num_tokens": 110578084.0, + "step": 1381 + }, + { + "epoch": 1.723643169058016, + "grad_norm": 0.10033432947560307, + "learning_rate": 4.6477383330068335e-06, + "loss": 0.0324, + "num_tokens": 110656605.0, + "step": 1382 + }, + { + "epoch": 1.7248908296943233, + "grad_norm": 0.10347951460818469, + "learning_rate": 4.641789125655526e-06, + "loss": 0.0343, + "num_tokens": 110736629.0, + "step": 1383 + }, + { + "epoch": 1.72613849033063, + "grad_norm": 0.10404769094997354, + "learning_rate": 4.6358414744999324e-06, + "loss": 0.0322, + "num_tokens": 110816051.0, + "step": 1384 + }, + { + "epoch": 1.727386150966937, + "grad_norm": 0.10284553692602667, + "learning_rate": 4.6298953903249455e-06, + "loss": 0.0331, + "num_tokens": 110894941.0, + "step": 1385 + }, + { + "epoch": 1.728633811603244, + "grad_norm": 0.10288209629612745, + "learning_rate": 4.623950883912609e-06, + "loss": 0.0353, + "num_tokens": 110975460.0, + "step": 1386 + }, + { + "epoch": 1.7298814722395508, + "grad_norm": 0.10789958526274712, + "learning_rate": 4.618007966042114e-06, + "loss": 0.0342, + "num_tokens": 111054782.0, + "step": 1387 + }, + { + "epoch": 1.7311291328758578, + "grad_norm": 0.10876892596056022, + "learning_rate": 4.612066647489762e-06, + "loss": 0.0355, + "num_tokens": 111135170.0, + "step": 1388 + }, + { + "epoch": 1.7323767935121648, + "grad_norm": 0.12136812485887387, + "learning_rate": 4.606126939028965e-06, + "loss": 0.0338, + "num_tokens": 111214768.0, + "step": 1389 + }, + { + "epoch": 1.7336244541484715, + "grad_norm": 0.10679814813875112, + "learning_rate": 4.600188851430206e-06, + "loss": 0.0324, + "num_tokens": 111294207.0, + "step": 1390 + }, + { + "epoch": 1.7348721147847785, + "grad_norm": 0.1032000235074082, + "learning_rate": 4.594252395461036e-06, + "loss": 0.034, + "num_tokens": 111374632.0, + "step": 1391 + }, + { + "epoch": 1.7361197754210855, + "grad_norm": 0.10918063307642732, + "learning_rate": 4.588317581886041e-06, + "loss": 0.0344, + "num_tokens": 111454535.0, + "step": 1392 + }, + { + "epoch": 1.7373674360573923, + "grad_norm": 0.10999773787711306, + "learning_rate": 4.5823844214668326e-06, + "loss": 0.0352, + "num_tokens": 111534940.0, + "step": 1393 + }, + { + "epoch": 1.7386150966936993, + "grad_norm": 0.12310258864133658, + "learning_rate": 4.576452924962024e-06, + "loss": 0.0379, + "num_tokens": 111616583.0, + "step": 1394 + }, + { + "epoch": 1.7398627573300063, + "grad_norm": 0.11330748509366423, + "learning_rate": 4.570523103127209e-06, + "loss": 0.0334, + "num_tokens": 111694802.0, + "step": 1395 + }, + { + "epoch": 1.741110417966313, + "grad_norm": 0.11642883285973465, + "learning_rate": 4.564594966714952e-06, + "loss": 0.0344, + "num_tokens": 111774583.0, + "step": 1396 + }, + { + "epoch": 1.74235807860262, + "grad_norm": 0.11628781515429629, + "learning_rate": 4.558668526474751e-06, + "loss": 0.0361, + "num_tokens": 111854564.0, + "step": 1397 + }, + { + "epoch": 1.743605739238927, + "grad_norm": 0.11553952117653499, + "learning_rate": 4.552743793153037e-06, + "loss": 0.0342, + "num_tokens": 111934145.0, + "step": 1398 + }, + { + "epoch": 1.7448533998752338, + "grad_norm": 0.1130454484695648, + "learning_rate": 4.5468207774931414e-06, + "loss": 0.0362, + "num_tokens": 112014599.0, + "step": 1399 + }, + { + "epoch": 1.746101060511541, + "grad_norm": 0.10424941012798582, + "learning_rate": 4.540899490235282e-06, + "loss": 0.0353, + "num_tokens": 112095898.0, + "step": 1400 + }, + { + "epoch": 1.7473487211478478, + "grad_norm": 0.12176082233686349, + "learning_rate": 4.534979942116542e-06, + "loss": 0.0335, + "num_tokens": 112174805.0, + "step": 1401 + }, + { + "epoch": 1.7485963817841546, + "grad_norm": 0.10725290738018774, + "learning_rate": 4.529062143870849e-06, + "loss": 0.035, + "num_tokens": 112254844.0, + "step": 1402 + }, + { + "epoch": 1.7498440424204618, + "grad_norm": 0.11613612803991162, + "learning_rate": 4.5231461062289624e-06, + "loss": 0.0364, + "num_tokens": 112336687.0, + "step": 1403 + }, + { + "epoch": 1.7510917030567685, + "grad_norm": 0.11248286120432541, + "learning_rate": 4.5172318399184485e-06, + "loss": 0.0335, + "num_tokens": 112416437.0, + "step": 1404 + }, + { + "epoch": 1.7523393636930755, + "grad_norm": 0.11498685047252416, + "learning_rate": 4.511319355663657e-06, + "loss": 0.0363, + "num_tokens": 112496544.0, + "step": 1405 + }, + { + "epoch": 1.7535870243293825, + "grad_norm": 0.11584733211628963, + "learning_rate": 4.50540866418571e-06, + "loss": 0.0385, + "num_tokens": 112577418.0, + "step": 1406 + }, + { + "epoch": 1.7548346849656893, + "grad_norm": 0.10971791007225389, + "learning_rate": 4.499499776202476e-06, + "loss": 0.0327, + "num_tokens": 112655726.0, + "step": 1407 + }, + { + "epoch": 1.7560823456019963, + "grad_norm": 0.11202865802168581, + "learning_rate": 4.493592702428558e-06, + "loss": 0.0365, + "num_tokens": 112736130.0, + "step": 1408 + }, + { + "epoch": 1.7573300062383033, + "grad_norm": 0.11590791221388194, + "learning_rate": 4.487687453575261e-06, + "loss": 0.0359, + "num_tokens": 112816354.0, + "step": 1409 + }, + { + "epoch": 1.75857766687461, + "grad_norm": 0.11455918942737528, + "learning_rate": 4.481784040350593e-06, + "loss": 0.0357, + "num_tokens": 112895741.0, + "step": 1410 + }, + { + "epoch": 1.759825327510917, + "grad_norm": 0.11184953434294027, + "learning_rate": 4.475882473459221e-06, + "loss": 0.0323, + "num_tokens": 112975009.0, + "step": 1411 + }, + { + "epoch": 1.761072988147224, + "grad_norm": 0.1043909165249291, + "learning_rate": 4.469982763602473e-06, + "loss": 0.0376, + "num_tokens": 113056299.0, + "step": 1412 + }, + { + "epoch": 1.7623206487835308, + "grad_norm": 0.11796014666013199, + "learning_rate": 4.464084921478303e-06, + "loss": 0.0345, + "num_tokens": 113135339.0, + "step": 1413 + }, + { + "epoch": 1.7635683094198378, + "grad_norm": 0.10655530028085919, + "learning_rate": 4.458188957781285e-06, + "loss": 0.0332, + "num_tokens": 113215391.0, + "step": 1414 + }, + { + "epoch": 1.7648159700561448, + "grad_norm": 0.1137944705272993, + "learning_rate": 4.452294883202581e-06, + "loss": 0.0326, + "num_tokens": 113294604.0, + "step": 1415 + }, + { + "epoch": 1.7660636306924515, + "grad_norm": 0.10386535837108864, + "learning_rate": 4.44640270842993e-06, + "loss": 0.0355, + "num_tokens": 113373702.0, + "step": 1416 + }, + { + "epoch": 1.7673112913287585, + "grad_norm": 0.1124153813219037, + "learning_rate": 4.440512444147626e-06, + "loss": 0.0338, + "num_tokens": 113454253.0, + "step": 1417 + }, + { + "epoch": 1.7685589519650655, + "grad_norm": 0.11173982165813126, + "learning_rate": 4.434624101036498e-06, + "loss": 0.034, + "num_tokens": 113534170.0, + "step": 1418 + }, + { + "epoch": 1.7698066126013723, + "grad_norm": 0.10766159191175759, + "learning_rate": 4.4287376897738945e-06, + "loss": 0.0337, + "num_tokens": 113613505.0, + "step": 1419 + }, + { + "epoch": 1.7710542732376795, + "grad_norm": 0.11662200449928516, + "learning_rate": 4.4228532210336535e-06, + "loss": 0.0351, + "num_tokens": 113694356.0, + "step": 1420 + }, + { + "epoch": 1.7723019338739863, + "grad_norm": 0.12239528048779576, + "learning_rate": 4.4169707054861e-06, + "loss": 0.0348, + "num_tokens": 113773772.0, + "step": 1421 + }, + { + "epoch": 1.773549594510293, + "grad_norm": 0.11495501032768879, + "learning_rate": 4.411090153798011e-06, + "loss": 0.0361, + "num_tokens": 113854394.0, + "step": 1422 + }, + { + "epoch": 1.7747972551466002, + "grad_norm": 0.10793305404835776, + "learning_rate": 4.405211576632602e-06, + "loss": 0.0331, + "num_tokens": 113934601.0, + "step": 1423 + }, + { + "epoch": 1.776044915782907, + "grad_norm": 0.10521475892016557, + "learning_rate": 4.3993349846495136e-06, + "loss": 0.0336, + "num_tokens": 114014077.0, + "step": 1424 + }, + { + "epoch": 1.777292576419214, + "grad_norm": 0.11667480494909614, + "learning_rate": 4.393460388504784e-06, + "loss": 0.0364, + "num_tokens": 114095540.0, + "step": 1425 + }, + { + "epoch": 1.778540237055521, + "grad_norm": 0.11816708505706751, + "learning_rate": 4.387587798850826e-06, + "loss": 0.036, + "num_tokens": 114175449.0, + "step": 1426 + }, + { + "epoch": 1.7797878976918278, + "grad_norm": 0.11547381713071538, + "learning_rate": 4.381717226336426e-06, + "loss": 0.033, + "num_tokens": 114255254.0, + "step": 1427 + }, + { + "epoch": 1.7810355583281348, + "grad_norm": 0.11558449024822219, + "learning_rate": 4.375848681606704e-06, + "loss": 0.0355, + "num_tokens": 114335582.0, + "step": 1428 + }, + { + "epoch": 1.7822832189644418, + "grad_norm": 0.11501299427526183, + "learning_rate": 4.369982175303104e-06, + "loss": 0.0356, + "num_tokens": 114417492.0, + "step": 1429 + }, + { + "epoch": 1.7835308796007485, + "grad_norm": 0.11575029405727906, + "learning_rate": 4.364117718063375e-06, + "loss": 0.0342, + "num_tokens": 114498411.0, + "step": 1430 + }, + { + "epoch": 1.7847785402370555, + "grad_norm": 0.10471073262325635, + "learning_rate": 4.358255320521553e-06, + "loss": 0.0335, + "num_tokens": 114579592.0, + "step": 1431 + }, + { + "epoch": 1.7860262008733625, + "grad_norm": 0.10800367249362049, + "learning_rate": 4.352394993307935e-06, + "loss": 0.0346, + "num_tokens": 114660132.0, + "step": 1432 + }, + { + "epoch": 1.7872738615096693, + "grad_norm": 0.11784628615301575, + "learning_rate": 4.346536747049068e-06, + "loss": 0.035, + "num_tokens": 114742718.0, + "step": 1433 + }, + { + "epoch": 1.7885215221459763, + "grad_norm": 0.11047252826803172, + "learning_rate": 4.340680592367721e-06, + "loss": 0.0353, + "num_tokens": 114823729.0, + "step": 1434 + }, + { + "epoch": 1.7897691827822833, + "grad_norm": 0.10863771684339983, + "learning_rate": 4.33482653988287e-06, + "loss": 0.0352, + "num_tokens": 114903277.0, + "step": 1435 + }, + { + "epoch": 1.79101684341859, + "grad_norm": 0.11748528208682778, + "learning_rate": 4.328974600209687e-06, + "loss": 0.0333, + "num_tokens": 114983776.0, + "step": 1436 + }, + { + "epoch": 1.7922645040548972, + "grad_norm": 0.10380189194122497, + "learning_rate": 4.3231247839595045e-06, + "loss": 0.0334, + "num_tokens": 115063423.0, + "step": 1437 + }, + { + "epoch": 1.793512164691204, + "grad_norm": 0.1021667642296284, + "learning_rate": 4.317277101739806e-06, + "loss": 0.0338, + "num_tokens": 115143251.0, + "step": 1438 + }, + { + "epoch": 1.7947598253275108, + "grad_norm": 0.10887837383935073, + "learning_rate": 4.3114315641542105e-06, + "loss": 0.0342, + "num_tokens": 115224078.0, + "step": 1439 + }, + { + "epoch": 1.796007485963818, + "grad_norm": 0.12498802533911925, + "learning_rate": 4.305588181802441e-06, + "loss": 0.0333, + "num_tokens": 115304392.0, + "step": 1440 + }, + { + "epoch": 1.7972551466001248, + "grad_norm": 0.10562913529548969, + "learning_rate": 4.2997469652803185e-06, + "loss": 0.0359, + "num_tokens": 115384699.0, + "step": 1441 + }, + { + "epoch": 1.7985028072364317, + "grad_norm": 0.10968417412970875, + "learning_rate": 4.293907925179733e-06, + "loss": 0.0343, + "num_tokens": 115465341.0, + "step": 1442 + }, + { + "epoch": 1.7997504678727387, + "grad_norm": 0.10397756346348985, + "learning_rate": 4.28807107208863e-06, + "loss": 0.0328, + "num_tokens": 115544709.0, + "step": 1443 + }, + { + "epoch": 1.8009981285090455, + "grad_norm": 0.11506875540961697, + "learning_rate": 4.282236416590986e-06, + "loss": 0.0375, + "num_tokens": 115625949.0, + "step": 1444 + }, + { + "epoch": 1.8022457891453525, + "grad_norm": 0.10835660954726115, + "learning_rate": 4.276403969266797e-06, + "loss": 0.0336, + "num_tokens": 115705144.0, + "step": 1445 + }, + { + "epoch": 1.8034934497816595, + "grad_norm": 0.1125488424807153, + "learning_rate": 4.270573740692053e-06, + "loss": 0.0359, + "num_tokens": 115786232.0, + "step": 1446 + }, + { + "epoch": 1.8047411104179663, + "grad_norm": 0.11179280220174558, + "learning_rate": 4.2647457414387205e-06, + "loss": 0.0324, + "num_tokens": 115865733.0, + "step": 1447 + }, + { + "epoch": 1.8059887710542732, + "grad_norm": 0.10438418516083571, + "learning_rate": 4.2589199820747226e-06, + "loss": 0.0335, + "num_tokens": 115946117.0, + "step": 1448 + }, + { + "epoch": 1.8072364316905802, + "grad_norm": 0.12496207865854495, + "learning_rate": 4.253096473163923e-06, + "loss": 0.038, + "num_tokens": 116028253.0, + "step": 1449 + }, + { + "epoch": 1.808484092326887, + "grad_norm": 0.10562424147281688, + "learning_rate": 4.247275225266103e-06, + "loss": 0.0347, + "num_tokens": 116109099.0, + "step": 1450 + }, + { + "epoch": 1.809731752963194, + "grad_norm": 0.11476927473343183, + "learning_rate": 4.241456248936946e-06, + "loss": 0.0326, + "num_tokens": 116189316.0, + "step": 1451 + }, + { + "epoch": 1.810979413599501, + "grad_norm": 0.10456867355165096, + "learning_rate": 4.23563955472801e-06, + "loss": 0.035, + "num_tokens": 116270657.0, + "step": 1452 + }, + { + "epoch": 1.8122270742358078, + "grad_norm": 0.10972404579793639, + "learning_rate": 4.229825153186727e-06, + "loss": 0.035, + "num_tokens": 116351453.0, + "step": 1453 + }, + { + "epoch": 1.8134747348721147, + "grad_norm": 0.11614985559784176, + "learning_rate": 4.22401305485636e-06, + "loss": 0.035, + "num_tokens": 116432720.0, + "step": 1454 + }, + { + "epoch": 1.8147223955084217, + "grad_norm": 0.10838486449752817, + "learning_rate": 4.218203270276e-06, + "loss": 0.035, + "num_tokens": 116512799.0, + "step": 1455 + }, + { + "epoch": 1.8159700561447285, + "grad_norm": 0.11198739360311537, + "learning_rate": 4.2123958099805466e-06, + "loss": 0.0334, + "num_tokens": 116592644.0, + "step": 1456 + }, + { + "epoch": 1.8172177167810357, + "grad_norm": 0.11215477815079397, + "learning_rate": 4.206590684500675e-06, + "loss": 0.0348, + "num_tokens": 116672720.0, + "step": 1457 + }, + { + "epoch": 1.8184653774173425, + "grad_norm": 0.1130464870503315, + "learning_rate": 4.200787904362833e-06, + "loss": 0.0344, + "num_tokens": 116753067.0, + "step": 1458 + }, + { + "epoch": 1.8197130380536493, + "grad_norm": 0.10319308148117898, + "learning_rate": 4.194987480089218e-06, + "loss": 0.0333, + "num_tokens": 116832122.0, + "step": 1459 + }, + { + "epoch": 1.8209606986899565, + "grad_norm": 0.10789295427643053, + "learning_rate": 4.189189422197751e-06, + "loss": 0.0349, + "num_tokens": 116911422.0, + "step": 1460 + }, + { + "epoch": 1.8222083593262632, + "grad_norm": 0.10741052441160895, + "learning_rate": 4.183393741202065e-06, + "loss": 0.0346, + "num_tokens": 116991316.0, + "step": 1461 + }, + { + "epoch": 1.8234560199625702, + "grad_norm": 0.1149851515920513, + "learning_rate": 4.177600447611478e-06, + "loss": 0.0335, + "num_tokens": 117069959.0, + "step": 1462 + }, + { + "epoch": 1.8247036805988772, + "grad_norm": 0.11670396653884813, + "learning_rate": 4.171809551930985e-06, + "loss": 0.0344, + "num_tokens": 117149818.0, + "step": 1463 + }, + { + "epoch": 1.825951341235184, + "grad_norm": 0.11711057074934993, + "learning_rate": 4.166021064661231e-06, + "loss": 0.035, + "num_tokens": 117230247.0, + "step": 1464 + }, + { + "epoch": 1.827199001871491, + "grad_norm": 0.1163081603973351, + "learning_rate": 4.160234996298491e-06, + "loss": 0.0341, + "num_tokens": 117310424.0, + "step": 1465 + }, + { + "epoch": 1.828446662507798, + "grad_norm": 0.10458937874158106, + "learning_rate": 4.154451357334654e-06, + "loss": 0.0344, + "num_tokens": 117389859.0, + "step": 1466 + }, + { + "epoch": 1.8296943231441047, + "grad_norm": 0.10574896551270638, + "learning_rate": 4.148670158257211e-06, + "loss": 0.0341, + "num_tokens": 117469420.0, + "step": 1467 + }, + { + "epoch": 1.8309419837804117, + "grad_norm": 0.10899400352647608, + "learning_rate": 4.142891409549219e-06, + "loss": 0.0337, + "num_tokens": 117548721.0, + "step": 1468 + }, + { + "epoch": 1.8321896444167187, + "grad_norm": 0.10611750813836142, + "learning_rate": 4.137115121689297e-06, + "loss": 0.0348, + "num_tokens": 117628895.0, + "step": 1469 + }, + { + "epoch": 1.8334373050530255, + "grad_norm": 0.1111805298125393, + "learning_rate": 4.131341305151603e-06, + "loss": 0.0348, + "num_tokens": 117710011.0, + "step": 1470 + }, + { + "epoch": 1.8346849656893325, + "grad_norm": 0.10266067632336437, + "learning_rate": 4.1255699704058085e-06, + "loss": 0.0332, + "num_tokens": 117790110.0, + "step": 1471 + }, + { + "epoch": 1.8359326263256395, + "grad_norm": 0.12013831863359381, + "learning_rate": 4.119801127917089e-06, + "loss": 0.0329, + "num_tokens": 117870583.0, + "step": 1472 + }, + { + "epoch": 1.8371802869619462, + "grad_norm": 0.10173281550009515, + "learning_rate": 4.114034788146101e-06, + "loss": 0.0344, + "num_tokens": 117949961.0, + "step": 1473 + }, + { + "epoch": 1.8384279475982532, + "grad_norm": 0.10191141790449527, + "learning_rate": 4.108270961548957e-06, + "loss": 0.033, + "num_tokens": 118028504.0, + "step": 1474 + }, + { + "epoch": 1.8396756082345602, + "grad_norm": 0.12183190605596082, + "learning_rate": 4.102509658577223e-06, + "loss": 0.0338, + "num_tokens": 118108384.0, + "step": 1475 + }, + { + "epoch": 1.840923268870867, + "grad_norm": 0.10416883300068461, + "learning_rate": 4.096750889677878e-06, + "loss": 0.0349, + "num_tokens": 118188162.0, + "step": 1476 + }, + { + "epoch": 1.8421709295071742, + "grad_norm": 0.11986832551794753, + "learning_rate": 4.090994665293313e-06, + "loss": 0.0346, + "num_tokens": 118268028.0, + "step": 1477 + }, + { + "epoch": 1.843418590143481, + "grad_norm": 0.10172966968166275, + "learning_rate": 4.085240995861301e-06, + "loss": 0.0342, + "num_tokens": 118348423.0, + "step": 1478 + }, + { + "epoch": 1.8446662507797877, + "grad_norm": 0.11569226583939839, + "learning_rate": 4.079489891814986e-06, + "loss": 0.0352, + "num_tokens": 118429210.0, + "step": 1479 + }, + { + "epoch": 1.845913911416095, + "grad_norm": 0.10916278732424804, + "learning_rate": 4.073741363582856e-06, + "loss": 0.0352, + "num_tokens": 118508578.0, + "step": 1480 + }, + { + "epoch": 1.8471615720524017, + "grad_norm": 0.11293663723584749, + "learning_rate": 4.06799542158873e-06, + "loss": 0.0355, + "num_tokens": 118588744.0, + "step": 1481 + }, + { + "epoch": 1.8484092326887087, + "grad_norm": 0.11722344024028651, + "learning_rate": 4.062252076251739e-06, + "loss": 0.0328, + "num_tokens": 118667906.0, + "step": 1482 + }, + { + "epoch": 1.8496568933250157, + "grad_norm": 0.10122769351963658, + "learning_rate": 4.056511337986304e-06, + "loss": 0.0318, + "num_tokens": 118746761.0, + "step": 1483 + }, + { + "epoch": 1.8509045539613225, + "grad_norm": 0.10229816908665479, + "learning_rate": 4.05077321720212e-06, + "loss": 0.032, + "num_tokens": 118826981.0, + "step": 1484 + }, + { + "epoch": 1.8521522145976295, + "grad_norm": 0.11918185823359788, + "learning_rate": 4.045037724304129e-06, + "loss": 0.0338, + "num_tokens": 118906395.0, + "step": 1485 + }, + { + "epoch": 1.8533998752339365, + "grad_norm": 0.10337247858601341, + "learning_rate": 4.039304869692518e-06, + "loss": 0.0333, + "num_tokens": 118985392.0, + "step": 1486 + }, + { + "epoch": 1.8546475358702432, + "grad_norm": 0.1019281333615369, + "learning_rate": 4.033574663762685e-06, + "loss": 0.0354, + "num_tokens": 119065923.0, + "step": 1487 + }, + { + "epoch": 1.8558951965065502, + "grad_norm": 0.11895729478200164, + "learning_rate": 4.0278471169052224e-06, + "loss": 0.0333, + "num_tokens": 119144893.0, + "step": 1488 + }, + { + "epoch": 1.8571428571428572, + "grad_norm": 0.10167459971375013, + "learning_rate": 4.022122239505906e-06, + "loss": 0.0335, + "num_tokens": 119224641.0, + "step": 1489 + }, + { + "epoch": 1.858390517779164, + "grad_norm": 0.10149185182575064, + "learning_rate": 4.0164000419456715e-06, + "loss": 0.0341, + "num_tokens": 119305051.0, + "step": 1490 + }, + { + "epoch": 1.859638178415471, + "grad_norm": 0.11077520841456237, + "learning_rate": 4.010680534600587e-06, + "loss": 0.034, + "num_tokens": 119385471.0, + "step": 1491 + }, + { + "epoch": 1.860885839051778, + "grad_norm": 0.1120329287780735, + "learning_rate": 4.004963727841852e-06, + "loss": 0.0358, + "num_tokens": 119465985.0, + "step": 1492 + }, + { + "epoch": 1.8621334996880847, + "grad_norm": 0.11813850687069626, + "learning_rate": 3.9992496320357645e-06, + "loss": 0.0351, + "num_tokens": 119546271.0, + "step": 1493 + }, + { + "epoch": 1.8633811603243917, + "grad_norm": 0.11886113375271443, + "learning_rate": 3.993538257543706e-06, + "loss": 0.0316, + "num_tokens": 119627060.0, + "step": 1494 + }, + { + "epoch": 1.8646288209606987, + "grad_norm": 0.09874498532250679, + "learning_rate": 3.987829614722124e-06, + "loss": 0.0335, + "num_tokens": 119707551.0, + "step": 1495 + }, + { + "epoch": 1.8658764815970055, + "grad_norm": 0.10786226744861373, + "learning_rate": 3.982123713922517e-06, + "loss": 0.0344, + "num_tokens": 119787384.0, + "step": 1496 + }, + { + "epoch": 1.8671241422333127, + "grad_norm": 0.10917423418357436, + "learning_rate": 3.976420565491404e-06, + "loss": 0.0322, + "num_tokens": 119866556.0, + "step": 1497 + }, + { + "epoch": 1.8683718028696195, + "grad_norm": 0.10509609392688785, + "learning_rate": 3.970720179770322e-06, + "loss": 0.032, + "num_tokens": 119946177.0, + "step": 1498 + }, + { + "epoch": 1.8696194635059262, + "grad_norm": 0.11086500849159564, + "learning_rate": 3.965022567095788e-06, + "loss": 0.0382, + "num_tokens": 120026777.0, + "step": 1499 + }, + { + "epoch": 1.8708671241422334, + "grad_norm": 0.1026552922683715, + "learning_rate": 3.959327737799298e-06, + "loss": 0.0349, + "num_tokens": 120106915.0, + "step": 1500 + }, + { + "epoch": 1.8721147847785402, + "grad_norm": 0.10134968732729031, + "learning_rate": 3.953635702207299e-06, + "loss": 0.0316, + "num_tokens": 120186044.0, + "step": 1501 + }, + { + "epoch": 1.8733624454148472, + "grad_norm": 0.09477996979178789, + "learning_rate": 3.947946470641169e-06, + "loss": 0.0314, + "num_tokens": 120265104.0, + "step": 1502 + }, + { + "epoch": 1.8746101060511542, + "grad_norm": 0.10341753477077234, + "learning_rate": 3.9422600534172105e-06, + "loss": 0.0347, + "num_tokens": 120345399.0, + "step": 1503 + }, + { + "epoch": 1.875857766687461, + "grad_norm": 0.109955990836735, + "learning_rate": 3.936576460846614e-06, + "loss": 0.0343, + "num_tokens": 120424888.0, + "step": 1504 + }, + { + "epoch": 1.877105427323768, + "grad_norm": 0.1067874469666549, + "learning_rate": 3.930895703235448e-06, + "loss": 0.0316, + "num_tokens": 120504351.0, + "step": 1505 + }, + { + "epoch": 1.878353087960075, + "grad_norm": 0.10876424328262234, + "learning_rate": 3.925217790884646e-06, + "loss": 0.0325, + "num_tokens": 120583924.0, + "step": 1506 + }, + { + "epoch": 1.8796007485963817, + "grad_norm": 0.11262768159157971, + "learning_rate": 3.919542734089978e-06, + "loss": 0.0354, + "num_tokens": 120664606.0, + "step": 1507 + }, + { + "epoch": 1.8808484092326887, + "grad_norm": 0.10063627694670048, + "learning_rate": 3.913870543142038e-06, + "loss": 0.0359, + "num_tokens": 120744208.0, + "step": 1508 + }, + { + "epoch": 1.8820960698689957, + "grad_norm": 0.12382898842560677, + "learning_rate": 3.908201228326222e-06, + "loss": 0.0328, + "num_tokens": 120824125.0, + "step": 1509 + }, + { + "epoch": 1.8833437305053025, + "grad_norm": 0.10492818441358094, + "learning_rate": 3.902534799922713e-06, + "loss": 0.0318, + "num_tokens": 120902940.0, + "step": 1510 + }, + { + "epoch": 1.8845913911416095, + "grad_norm": 0.10403111315609859, + "learning_rate": 3.896871268206456e-06, + "loss": 0.0364, + "num_tokens": 120985013.0, + "step": 1511 + }, + { + "epoch": 1.8858390517779164, + "grad_norm": 0.11769575525186173, + "learning_rate": 3.8912106434471486e-06, + "loss": 0.0325, + "num_tokens": 121064553.0, + "step": 1512 + }, + { + "epoch": 1.8870867124142232, + "grad_norm": 0.10389306641364818, + "learning_rate": 3.885552935909212e-06, + "loss": 0.0344, + "num_tokens": 121144893.0, + "step": 1513 + }, + { + "epoch": 1.8883343730505302, + "grad_norm": 0.10981301470560886, + "learning_rate": 3.879898155851779e-06, + "loss": 0.0337, + "num_tokens": 121225640.0, + "step": 1514 + }, + { + "epoch": 1.8895820336868372, + "grad_norm": 0.10151306602448709, + "learning_rate": 3.874246313528679e-06, + "loss": 0.034, + "num_tokens": 121307302.0, + "step": 1515 + }, + { + "epoch": 1.890829694323144, + "grad_norm": 0.11075900606639615, + "learning_rate": 3.868597419188409e-06, + "loss": 0.0356, + "num_tokens": 121388078.0, + "step": 1516 + }, + { + "epoch": 1.8920773549594512, + "grad_norm": 0.11142810402559525, + "learning_rate": 3.862951483074119e-06, + "loss": 0.0329, + "num_tokens": 121467954.0, + "step": 1517 + }, + { + "epoch": 1.893325015595758, + "grad_norm": 0.10044572617648408, + "learning_rate": 3.857308515423601e-06, + "loss": 0.0354, + "num_tokens": 121547246.0, + "step": 1518 + }, + { + "epoch": 1.8945726762320647, + "grad_norm": 0.12058986292961471, + "learning_rate": 3.851668526469261e-06, + "loss": 0.0361, + "num_tokens": 121628571.0, + "step": 1519 + }, + { + "epoch": 1.895820336868372, + "grad_norm": 0.10836004623444401, + "learning_rate": 3.846031526438102e-06, + "loss": 0.0332, + "num_tokens": 121709487.0, + "step": 1520 + }, + { + "epoch": 1.8970679975046787, + "grad_norm": 0.10878927022806904, + "learning_rate": 3.84039752555171e-06, + "loss": 0.0332, + "num_tokens": 121789703.0, + "step": 1521 + }, + { + "epoch": 1.8983156581409857, + "grad_norm": 0.10824955993736991, + "learning_rate": 3.834766534026231e-06, + "loss": 0.0332, + "num_tokens": 121869985.0, + "step": 1522 + }, + { + "epoch": 1.8995633187772927, + "grad_norm": 0.10955981205776032, + "learning_rate": 3.829138562072353e-06, + "loss": 0.0335, + "num_tokens": 121948903.0, + "step": 1523 + }, + { + "epoch": 1.9008109794135994, + "grad_norm": 0.10837260665067358, + "learning_rate": 3.823513619895293e-06, + "loss": 0.034, + "num_tokens": 122028802.0, + "step": 1524 + }, + { + "epoch": 1.9020586400499064, + "grad_norm": 0.11392485556540904, + "learning_rate": 3.81789171769477e-06, + "loss": 0.0339, + "num_tokens": 122110650.0, + "step": 1525 + }, + { + "epoch": 1.9033063006862134, + "grad_norm": 0.10471350453492868, + "learning_rate": 3.812272865664994e-06, + "loss": 0.0349, + "num_tokens": 122190750.0, + "step": 1526 + }, + { + "epoch": 1.9045539613225202, + "grad_norm": 0.11294540539518536, + "learning_rate": 3.8066570739946394e-06, + "loss": 0.0346, + "num_tokens": 122271273.0, + "step": 1527 + }, + { + "epoch": 1.9058016219588272, + "grad_norm": 0.11267496722591189, + "learning_rate": 3.801044352866834e-06, + "loss": 0.0358, + "num_tokens": 122351564.0, + "step": 1528 + }, + { + "epoch": 1.9070492825951342, + "grad_norm": 0.11401020747834047, + "learning_rate": 3.7954347124591395e-06, + "loss": 0.0344, + "num_tokens": 122431176.0, + "step": 1529 + }, + { + "epoch": 1.908296943231441, + "grad_norm": 0.10875035338383428, + "learning_rate": 3.7898281629435286e-06, + "loss": 0.0348, + "num_tokens": 122512805.0, + "step": 1530 + }, + { + "epoch": 1.909544603867748, + "grad_norm": 0.1104918835133121, + "learning_rate": 3.7842247144863686e-06, + "loss": 0.0321, + "num_tokens": 122592405.0, + "step": 1531 + }, + { + "epoch": 1.910792264504055, + "grad_norm": 0.10420246611454374, + "learning_rate": 3.778624377248409e-06, + "loss": 0.0336, + "num_tokens": 122672154.0, + "step": 1532 + }, + { + "epoch": 1.9120399251403617, + "grad_norm": 0.10975968736912756, + "learning_rate": 3.77302716138475e-06, + "loss": 0.0355, + "num_tokens": 122753896.0, + "step": 1533 + }, + { + "epoch": 1.913287585776669, + "grad_norm": 0.11562096811664528, + "learning_rate": 3.7674330770448374e-06, + "loss": 0.0339, + "num_tokens": 122833339.0, + "step": 1534 + }, + { + "epoch": 1.9145352464129757, + "grad_norm": 0.09559889402594825, + "learning_rate": 3.7618421343724386e-06, + "loss": 0.0343, + "num_tokens": 122913131.0, + "step": 1535 + }, + { + "epoch": 1.9157829070492824, + "grad_norm": 0.12632567082780488, + "learning_rate": 3.756254343505621e-06, + "loss": 0.0326, + "num_tokens": 122992473.0, + "step": 1536 + }, + { + "epoch": 1.9170305676855897, + "grad_norm": 0.10173152865169789, + "learning_rate": 3.7506697145767367e-06, + "loss": 0.0334, + "num_tokens": 123072318.0, + "step": 1537 + }, + { + "epoch": 1.9182782283218964, + "grad_norm": 0.11406574908341054, + "learning_rate": 3.745088257712408e-06, + "loss": 0.0337, + "num_tokens": 123151348.0, + "step": 1538 + }, + { + "epoch": 1.9195258889582034, + "grad_norm": 0.10892304795841887, + "learning_rate": 3.7395099830335034e-06, + "loss": 0.0342, + "num_tokens": 123231936.0, + "step": 1539 + }, + { + "epoch": 1.9207735495945104, + "grad_norm": 0.10694417012259536, + "learning_rate": 3.7339349006551193e-06, + "loss": 0.0337, + "num_tokens": 123312950.0, + "step": 1540 + }, + { + "epoch": 1.9220212102308172, + "grad_norm": 0.10017582315819759, + "learning_rate": 3.7283630206865696e-06, + "loss": 0.0333, + "num_tokens": 123392780.0, + "step": 1541 + }, + { + "epoch": 1.9232688708671242, + "grad_norm": 0.10747239898877138, + "learning_rate": 3.7227943532313504e-06, + "loss": 0.0341, + "num_tokens": 123472235.0, + "step": 1542 + }, + { + "epoch": 1.9245165315034312, + "grad_norm": 0.11544815418654572, + "learning_rate": 3.7172289083871436e-06, + "loss": 0.0362, + "num_tokens": 123552579.0, + "step": 1543 + }, + { + "epoch": 1.925764192139738, + "grad_norm": 0.11739967817563109, + "learning_rate": 3.7116666962457813e-06, + "loss": 0.033, + "num_tokens": 123631233.0, + "step": 1544 + }, + { + "epoch": 1.927011852776045, + "grad_norm": 0.10369089138250741, + "learning_rate": 3.7061077268932333e-06, + "loss": 0.0344, + "num_tokens": 123711026.0, + "step": 1545 + }, + { + "epoch": 1.928259513412352, + "grad_norm": 0.11479151752423149, + "learning_rate": 3.700552010409596e-06, + "loss": 0.0358, + "num_tokens": 123790909.0, + "step": 1546 + }, + { + "epoch": 1.9295071740486587, + "grad_norm": 0.1028773833805819, + "learning_rate": 3.694999556869059e-06, + "loss": 0.0351, + "num_tokens": 123872098.0, + "step": 1547 + }, + { + "epoch": 1.9307548346849657, + "grad_norm": 0.10697413999677229, + "learning_rate": 3.6894503763399003e-06, + "loss": 0.033, + "num_tokens": 123952070.0, + "step": 1548 + }, + { + "epoch": 1.9320024953212727, + "grad_norm": 0.10029417018146669, + "learning_rate": 3.683904478884461e-06, + "loss": 0.0324, + "num_tokens": 124032234.0, + "step": 1549 + }, + { + "epoch": 1.9332501559575794, + "grad_norm": 0.11027618652006071, + "learning_rate": 3.67836187455913e-06, + "loss": 0.0326, + "num_tokens": 124111121.0, + "step": 1550 + }, + { + "epoch": 1.9344978165938864, + "grad_norm": 0.1056614113686575, + "learning_rate": 3.672822573414323e-06, + "loss": 0.0369, + "num_tokens": 124191867.0, + "step": 1551 + }, + { + "epoch": 1.9357454772301934, + "grad_norm": 0.11091294820691895, + "learning_rate": 3.6672865854944673e-06, + "loss": 0.0356, + "num_tokens": 124272599.0, + "step": 1552 + }, + { + "epoch": 1.9369931378665002, + "grad_norm": 0.1084522698863579, + "learning_rate": 3.6617539208379836e-06, + "loss": 0.0336, + "num_tokens": 124352094.0, + "step": 1553 + }, + { + "epoch": 1.9382407985028074, + "grad_norm": 0.10585047276937648, + "learning_rate": 3.656224589477264e-06, + "loss": 0.0352, + "num_tokens": 124433325.0, + "step": 1554 + }, + { + "epoch": 1.9394884591391142, + "grad_norm": 0.10935995370656627, + "learning_rate": 3.65069860143866e-06, + "loss": 0.0312, + "num_tokens": 124511945.0, + "step": 1555 + }, + { + "epoch": 1.940736119775421, + "grad_norm": 0.10900465709590437, + "learning_rate": 3.645175966742456e-06, + "loss": 0.0355, + "num_tokens": 124592313.0, + "step": 1556 + }, + { + "epoch": 1.9419837804117281, + "grad_norm": 0.11473015794303712, + "learning_rate": 3.639656695402858e-06, + "loss": 0.0349, + "num_tokens": 124672598.0, + "step": 1557 + }, + { + "epoch": 1.943231441048035, + "grad_norm": 0.10421687435100782, + "learning_rate": 3.634140797427974e-06, + "loss": 0.0343, + "num_tokens": 124752029.0, + "step": 1558 + }, + { + "epoch": 1.944479101684342, + "grad_norm": 0.11638448293053776, + "learning_rate": 3.6286282828197904e-06, + "loss": 0.0361, + "num_tokens": 124832634.0, + "step": 1559 + }, + { + "epoch": 1.945726762320649, + "grad_norm": 0.11391845107383987, + "learning_rate": 3.623119161574169e-06, + "loss": 0.0327, + "num_tokens": 124912364.0, + "step": 1560 + }, + { + "epoch": 1.9469744229569557, + "grad_norm": 0.10616429489708827, + "learning_rate": 3.6176134436808074e-06, + "loss": 0.0344, + "num_tokens": 124991270.0, + "step": 1561 + }, + { + "epoch": 1.9482220835932627, + "grad_norm": 0.11167759034779555, + "learning_rate": 3.612111139123239e-06, + "loss": 0.0348, + "num_tokens": 125070833.0, + "step": 1562 + }, + { + "epoch": 1.9494697442295696, + "grad_norm": 0.1127208715547816, + "learning_rate": 3.6066122578788033e-06, + "loss": 0.0381, + "num_tokens": 125152435.0, + "step": 1563 + }, + { + "epoch": 1.9507174048658764, + "grad_norm": 0.10729694888144387, + "learning_rate": 3.6011168099186322e-06, + "loss": 0.0335, + "num_tokens": 125233146.0, + "step": 1564 + }, + { + "epoch": 1.9519650655021834, + "grad_norm": 0.11577479373886601, + "learning_rate": 3.5956248052076383e-06, + "loss": 0.0332, + "num_tokens": 125312477.0, + "step": 1565 + }, + { + "epoch": 1.9532127261384904, + "grad_norm": 0.10816544241229983, + "learning_rate": 3.5901362537044826e-06, + "loss": 0.0353, + "num_tokens": 125393204.0, + "step": 1566 + }, + { + "epoch": 1.9544603867747972, + "grad_norm": 0.1144921783827139, + "learning_rate": 3.584651165361568e-06, + "loss": 0.0339, + "num_tokens": 125473311.0, + "step": 1567 + }, + { + "epoch": 1.9557080474111042, + "grad_norm": 0.1023826741374561, + "learning_rate": 3.579169550125019e-06, + "loss": 0.0314, + "num_tokens": 125553173.0, + "step": 1568 + }, + { + "epoch": 1.9569557080474111, + "grad_norm": 0.1081903875181614, + "learning_rate": 3.5736914179346626e-06, + "loss": 0.0359, + "num_tokens": 125633344.0, + "step": 1569 + }, + { + "epoch": 1.958203368683718, + "grad_norm": 0.11814223600506968, + "learning_rate": 3.5682167787240053e-06, + "loss": 0.0333, + "num_tokens": 125713700.0, + "step": 1570 + }, + { + "epoch": 1.959451029320025, + "grad_norm": 0.11106552128597881, + "learning_rate": 3.5627456424202223e-06, + "loss": 0.0336, + "num_tokens": 125793802.0, + "step": 1571 + }, + { + "epoch": 1.960698689956332, + "grad_norm": 0.11282020876952419, + "learning_rate": 3.55727801894414e-06, + "loss": 0.0328, + "num_tokens": 125873850.0, + "step": 1572 + }, + { + "epoch": 1.9619463505926387, + "grad_norm": 0.10618294835959388, + "learning_rate": 3.5518139182102106e-06, + "loss": 0.033, + "num_tokens": 125953640.0, + "step": 1573 + }, + { + "epoch": 1.9631940112289459, + "grad_norm": 0.09834808850932374, + "learning_rate": 3.5463533501265e-06, + "loss": 0.032, + "num_tokens": 126033564.0, + "step": 1574 + }, + { + "epoch": 1.9644416718652526, + "grad_norm": 0.11284124916966394, + "learning_rate": 3.5408963245946714e-06, + "loss": 0.0348, + "num_tokens": 126114330.0, + "step": 1575 + }, + { + "epoch": 1.9656893325015594, + "grad_norm": 0.09978958639176037, + "learning_rate": 3.53544285150996e-06, + "loss": 0.0329, + "num_tokens": 126194064.0, + "step": 1576 + }, + { + "epoch": 1.9669369931378666, + "grad_norm": 0.11717035362702442, + "learning_rate": 3.529992940761159e-06, + "loss": 0.0397, + "num_tokens": 126274303.0, + "step": 1577 + }, + { + "epoch": 1.9681846537741734, + "grad_norm": 0.11953639846307562, + "learning_rate": 3.524546602230606e-06, + "loss": 0.0351, + "num_tokens": 126355527.0, + "step": 1578 + }, + { + "epoch": 1.9694323144104804, + "grad_norm": 0.10799542403102128, + "learning_rate": 3.5191038457941596e-06, + "loss": 0.0324, + "num_tokens": 126434438.0, + "step": 1579 + }, + { + "epoch": 1.9706799750467874, + "grad_norm": 0.11164008102679486, + "learning_rate": 3.5136646813211784e-06, + "loss": 0.0338, + "num_tokens": 126513806.0, + "step": 1580 + }, + { + "epoch": 1.9719276356830941, + "grad_norm": 0.11042889340863964, + "learning_rate": 3.5082291186745145e-06, + "loss": 0.0342, + "num_tokens": 126594160.0, + "step": 1581 + }, + { + "epoch": 1.9731752963194011, + "grad_norm": 0.11255018277654384, + "learning_rate": 3.5027971677104867e-06, + "loss": 0.0349, + "num_tokens": 126674625.0, + "step": 1582 + }, + { + "epoch": 1.9744229569557081, + "grad_norm": 0.11010249880686576, + "learning_rate": 3.497368838278862e-06, + "loss": 0.0343, + "num_tokens": 126754334.0, + "step": 1583 + }, + { + "epoch": 1.975670617592015, + "grad_norm": 0.10466844945374801, + "learning_rate": 3.491944140222845e-06, + "loss": 0.0327, + "num_tokens": 126834485.0, + "step": 1584 + }, + { + "epoch": 1.976918278228322, + "grad_norm": 0.10973920592988463, + "learning_rate": 3.486523083379051e-06, + "loss": 0.0336, + "num_tokens": 126913919.0, + "step": 1585 + }, + { + "epoch": 1.9781659388646289, + "grad_norm": 0.09624968559215073, + "learning_rate": 3.481105677577493e-06, + "loss": 0.0322, + "num_tokens": 126992768.0, + "step": 1586 + }, + { + "epoch": 1.9794135995009356, + "grad_norm": 0.10348850398918999, + "learning_rate": 3.475691932641569e-06, + "loss": 0.0327, + "num_tokens": 127073558.0, + "step": 1587 + }, + { + "epoch": 1.9806612601372426, + "grad_norm": 0.11008522746530837, + "learning_rate": 3.4702818583880305e-06, + "loss": 0.0329, + "num_tokens": 127154390.0, + "step": 1588 + }, + { + "epoch": 1.9819089207735496, + "grad_norm": 0.10576324521968579, + "learning_rate": 3.46487546462698e-06, + "loss": 0.0335, + "num_tokens": 127233684.0, + "step": 1589 + }, + { + "epoch": 1.9831565814098564, + "grad_norm": 0.10909609236650647, + "learning_rate": 3.4594727611618462e-06, + "loss": 0.0354, + "num_tokens": 127314072.0, + "step": 1590 + }, + { + "epoch": 1.9844042420461634, + "grad_norm": 0.10226819882059832, + "learning_rate": 3.454073757789359e-06, + "loss": 0.0344, + "num_tokens": 127393809.0, + "step": 1591 + }, + { + "epoch": 1.9856519026824704, + "grad_norm": 0.11124791645732714, + "learning_rate": 3.4486784642995442e-06, + "loss": 0.0338, + "num_tokens": 127474232.0, + "step": 1592 + }, + { + "epoch": 1.9868995633187772, + "grad_norm": 0.10680204301961628, + "learning_rate": 3.4432868904757024e-06, + "loss": 0.0342, + "num_tokens": 127554705.0, + "step": 1593 + }, + { + "epoch": 1.9881472239550844, + "grad_norm": 0.10744418843654158, + "learning_rate": 3.437899046094384e-06, + "loss": 0.0334, + "num_tokens": 127634236.0, + "step": 1594 + }, + { + "epoch": 1.9893948845913911, + "grad_norm": 0.10963223394836877, + "learning_rate": 3.432514940925378e-06, + "loss": 0.0344, + "num_tokens": 127714557.0, + "step": 1595 + }, + { + "epoch": 1.990642545227698, + "grad_norm": 0.11012697826983461, + "learning_rate": 3.4271345847316974e-06, + "loss": 0.0364, + "num_tokens": 127795159.0, + "step": 1596 + }, + { + "epoch": 1.9918902058640051, + "grad_norm": 0.11016272568594698, + "learning_rate": 3.421757987269554e-06, + "loss": 0.0362, + "num_tokens": 127875081.0, + "step": 1597 + }, + { + "epoch": 1.9931378665003119, + "grad_norm": 0.10065770585753678, + "learning_rate": 3.416385158288343e-06, + "loss": 0.0327, + "num_tokens": 127954573.0, + "step": 1598 + }, + { + "epoch": 1.9943855271366189, + "grad_norm": 0.10974734302658783, + "learning_rate": 3.411016107530628e-06, + "loss": 0.033, + "num_tokens": 128034668.0, + "step": 1599 + }, + { + "epoch": 1.9956331877729259, + "grad_norm": 0.11061221640528077, + "learning_rate": 3.405650844732122e-06, + "loss": 0.0351, + "num_tokens": 128114461.0, + "step": 1600 + }, + { + "epoch": 1.9968808484092326, + "grad_norm": 0.09898172330167038, + "learning_rate": 3.400289379621664e-06, + "loss": 0.0334, + "num_tokens": 128194681.0, + "step": 1601 + }, + { + "epoch": 1.9981285090455396, + "grad_norm": 0.10720532173976896, + "learning_rate": 3.394931721921214e-06, + "loss": 0.0323, + "num_tokens": 128274005.0, + "step": 1602 + }, + { + "epoch": 1.9993761696818466, + "grad_norm": 0.10715909655170884, + "learning_rate": 3.3895778813458256e-06, + "loss": 0.0339, + "num_tokens": 128353693.0, + "step": 1603 + }, + { + "epoch": 2.0, + "grad_norm": 0.15351115677037117, + "learning_rate": 3.3842278676036293e-06, + "loss": 0.0295, + "num_tokens": 128394204.0, + "step": 1604 + }, + { + "epoch": 2.0012476606363068, + "grad_norm": 0.09668414870141348, + "learning_rate": 3.3788816903958145e-06, + "loss": 0.0292, + "num_tokens": 128474132.0, + "step": 1605 + }, + { + "epoch": 2.002495321272614, + "grad_norm": 0.09284121837219293, + "learning_rate": 3.37353935941662e-06, + "loss": 0.0277, + "num_tokens": 128554051.0, + "step": 1606 + }, + { + "epoch": 2.0037429819089208, + "grad_norm": 0.09339384544922638, + "learning_rate": 3.3682008843533055e-06, + "loss": 0.029, + "num_tokens": 128634209.0, + "step": 1607 + }, + { + "epoch": 2.0049906425452275, + "grad_norm": 0.09435077752847291, + "learning_rate": 3.3628662748861374e-06, + "loss": 0.0282, + "num_tokens": 128715069.0, + "step": 1608 + }, + { + "epoch": 2.0062383031815347, + "grad_norm": 0.09614281235112, + "learning_rate": 3.357535540688379e-06, + "loss": 0.0278, + "num_tokens": 128795504.0, + "step": 1609 + }, + { + "epoch": 2.0074859638178415, + "grad_norm": 0.09264669561245602, + "learning_rate": 3.3522086914262585e-06, + "loss": 0.0271, + "num_tokens": 128876123.0, + "step": 1610 + }, + { + "epoch": 2.0087336244541483, + "grad_norm": 0.11140936885389302, + "learning_rate": 3.3468857367589665e-06, + "loss": 0.0275, + "num_tokens": 128955991.0, + "step": 1611 + }, + { + "epoch": 2.0099812850904555, + "grad_norm": 0.09169587047179813, + "learning_rate": 3.3415666863386298e-06, + "loss": 0.0275, + "num_tokens": 129039016.0, + "step": 1612 + }, + { + "epoch": 2.0112289457267623, + "grad_norm": 0.1042070178139685, + "learning_rate": 3.3362515498102934e-06, + "loss": 0.0275, + "num_tokens": 129120041.0, + "step": 1613 + }, + { + "epoch": 2.0124766063630695, + "grad_norm": 0.11233635245764659, + "learning_rate": 3.330940336811903e-06, + "loss": 0.0281, + "num_tokens": 129200867.0, + "step": 1614 + }, + { + "epoch": 2.0137242669993762, + "grad_norm": 0.09294626630897553, + "learning_rate": 3.325633056974298e-06, + "loss": 0.0259, + "num_tokens": 129279446.0, + "step": 1615 + }, + { + "epoch": 2.014971927635683, + "grad_norm": 0.10425624673925903, + "learning_rate": 3.3203297199211794e-06, + "loss": 0.0273, + "num_tokens": 129359517.0, + "step": 1616 + }, + { + "epoch": 2.01621958827199, + "grad_norm": 0.10772022702288936, + "learning_rate": 3.315030335269096e-06, + "loss": 0.0272, + "num_tokens": 129439678.0, + "step": 1617 + }, + { + "epoch": 2.017467248908297, + "grad_norm": 0.11804003684908118, + "learning_rate": 3.309734912627441e-06, + "loss": 0.0282, + "num_tokens": 129519376.0, + "step": 1618 + }, + { + "epoch": 2.0187149095446038, + "grad_norm": 0.12381607955705823, + "learning_rate": 3.304443461598413e-06, + "loss": 0.0288, + "num_tokens": 129600651.0, + "step": 1619 + }, + { + "epoch": 2.019962570180911, + "grad_norm": 0.13319111452989643, + "learning_rate": 3.299155991777011e-06, + "loss": 0.0297, + "num_tokens": 129680386.0, + "step": 1620 + }, + { + "epoch": 2.0212102308172177, + "grad_norm": 0.11432348701501038, + "learning_rate": 3.2938725127510185e-06, + "loss": 0.0282, + "num_tokens": 129760859.0, + "step": 1621 + }, + { + "epoch": 2.0224578914535245, + "grad_norm": 0.10791791050792644, + "learning_rate": 3.2885930341009774e-06, + "loss": 0.0277, + "num_tokens": 129840051.0, + "step": 1622 + }, + { + "epoch": 2.0237055520898317, + "grad_norm": 0.11686951819864136, + "learning_rate": 3.2833175654001787e-06, + "loss": 0.0283, + "num_tokens": 129919701.0, + "step": 1623 + }, + { + "epoch": 2.0249532127261385, + "grad_norm": 0.11359351664175828, + "learning_rate": 3.278046116214642e-06, + "loss": 0.0269, + "num_tokens": 129999877.0, + "step": 1624 + }, + { + "epoch": 2.0262008733624453, + "grad_norm": 0.1023939359316259, + "learning_rate": 3.272778696103099e-06, + "loss": 0.0275, + "num_tokens": 130079968.0, + "step": 1625 + }, + { + "epoch": 2.0274485339987525, + "grad_norm": 0.10892544064999989, + "learning_rate": 3.2675153146169736e-06, + "loss": 0.0275, + "num_tokens": 130160624.0, + "step": 1626 + }, + { + "epoch": 2.0286961946350592, + "grad_norm": 0.10430749700068655, + "learning_rate": 3.2622559813003684e-06, + "loss": 0.0288, + "num_tokens": 130239925.0, + "step": 1627 + }, + { + "epoch": 2.029943855271366, + "grad_norm": 0.10317672945567567, + "learning_rate": 3.2570007056900437e-06, + "loss": 0.0271, + "num_tokens": 130320799.0, + "step": 1628 + }, + { + "epoch": 2.031191515907673, + "grad_norm": 0.10963736701978923, + "learning_rate": 3.2517494973154008e-06, + "loss": 0.0277, + "num_tokens": 130400099.0, + "step": 1629 + }, + { + "epoch": 2.03243917654398, + "grad_norm": 0.12238499931140737, + "learning_rate": 3.2465023656984707e-06, + "loss": 0.0292, + "num_tokens": 130480729.0, + "step": 1630 + }, + { + "epoch": 2.0336868371802868, + "grad_norm": 0.10971492853164319, + "learning_rate": 3.2412593203538857e-06, + "loss": 0.0297, + "num_tokens": 130560371.0, + "step": 1631 + }, + { + "epoch": 2.034934497816594, + "grad_norm": 0.1230436141198275, + "learning_rate": 3.236020370788876e-06, + "loss": 0.0273, + "num_tokens": 130639731.0, + "step": 1632 + }, + { + "epoch": 2.0361821584529007, + "grad_norm": 0.10170244549018313, + "learning_rate": 3.230785526503236e-06, + "loss": 0.0269, + "num_tokens": 130719595.0, + "step": 1633 + }, + { + "epoch": 2.037429819089208, + "grad_norm": 0.11160175846037224, + "learning_rate": 3.225554796989325e-06, + "loss": 0.0279, + "num_tokens": 130800181.0, + "step": 1634 + }, + { + "epoch": 2.0386774797255147, + "grad_norm": 0.09813935826310065, + "learning_rate": 3.2203281917320328e-06, + "loss": 0.0261, + "num_tokens": 130879513.0, + "step": 1635 + }, + { + "epoch": 2.0399251403618215, + "grad_norm": 0.11117192931618461, + "learning_rate": 3.2151057202087783e-06, + "loss": 0.0283, + "num_tokens": 130958850.0, + "step": 1636 + }, + { + "epoch": 2.0411728009981287, + "grad_norm": 0.10698263215463055, + "learning_rate": 3.209887391889479e-06, + "loss": 0.027, + "num_tokens": 131038985.0, + "step": 1637 + }, + { + "epoch": 2.0424204616344355, + "grad_norm": 0.10820076619681371, + "learning_rate": 3.204673216236539e-06, + "loss": 0.0275, + "num_tokens": 131118757.0, + "step": 1638 + }, + { + "epoch": 2.0436681222707422, + "grad_norm": 0.11467484503330413, + "learning_rate": 3.199463202704838e-06, + "loss": 0.0273, + "num_tokens": 131201221.0, + "step": 1639 + }, + { + "epoch": 2.0449157829070495, + "grad_norm": 0.12247988636033476, + "learning_rate": 3.194257360741706e-06, + "loss": 0.0288, + "num_tokens": 131282719.0, + "step": 1640 + }, + { + "epoch": 2.046163443543356, + "grad_norm": 0.1057593800426446, + "learning_rate": 3.189055699786906e-06, + "loss": 0.0274, + "num_tokens": 131362232.0, + "step": 1641 + }, + { + "epoch": 2.047411104179663, + "grad_norm": 0.12364375320242353, + "learning_rate": 3.1838582292726206e-06, + "loss": 0.0289, + "num_tokens": 131442046.0, + "step": 1642 + }, + { + "epoch": 2.04865876481597, + "grad_norm": 0.1145129297133959, + "learning_rate": 3.1786649586234373e-06, + "loss": 0.0282, + "num_tokens": 131523135.0, + "step": 1643 + }, + { + "epoch": 2.049906425452277, + "grad_norm": 0.10215949526592619, + "learning_rate": 3.173475897256325e-06, + "loss": 0.0266, + "num_tokens": 131603284.0, + "step": 1644 + }, + { + "epoch": 2.0511540860885837, + "grad_norm": 0.10761560661797925, + "learning_rate": 3.1682910545806167e-06, + "loss": 0.028, + "num_tokens": 131683908.0, + "step": 1645 + }, + { + "epoch": 2.052401746724891, + "grad_norm": 0.11121482292743783, + "learning_rate": 3.1631104399980053e-06, + "loss": 0.0267, + "num_tokens": 131762852.0, + "step": 1646 + }, + { + "epoch": 2.0536494073611977, + "grad_norm": 0.11153961622318409, + "learning_rate": 3.157934062902508e-06, + "loss": 0.0276, + "num_tokens": 131842654.0, + "step": 1647 + }, + { + "epoch": 2.0548970679975045, + "grad_norm": 0.11202792323594137, + "learning_rate": 3.1527619326804594e-06, + "loss": 0.0278, + "num_tokens": 131921403.0, + "step": 1648 + }, + { + "epoch": 2.0561447286338117, + "grad_norm": 0.11878747584208416, + "learning_rate": 3.147594058710498e-06, + "loss": 0.0279, + "num_tokens": 132003424.0, + "step": 1649 + }, + { + "epoch": 2.0573923892701185, + "grad_norm": 0.11600844083004452, + "learning_rate": 3.14243045036354e-06, + "loss": 0.0276, + "num_tokens": 132082840.0, + "step": 1650 + }, + { + "epoch": 2.0586400499064252, + "grad_norm": 0.11293539536069673, + "learning_rate": 3.1372711170027666e-06, + "loss": 0.0275, + "num_tokens": 132162541.0, + "step": 1651 + }, + { + "epoch": 2.0598877105427325, + "grad_norm": 0.11026505266606129, + "learning_rate": 3.13211606798361e-06, + "loss": 0.0274, + "num_tokens": 132242631.0, + "step": 1652 + }, + { + "epoch": 2.061135371179039, + "grad_norm": 0.12188319048183703, + "learning_rate": 3.1269653126537344e-06, + "loss": 0.0278, + "num_tokens": 132323488.0, + "step": 1653 + }, + { + "epoch": 2.0623830318153464, + "grad_norm": 0.12082041385469403, + "learning_rate": 3.121818860353011e-06, + "loss": 0.0275, + "num_tokens": 132403983.0, + "step": 1654 + }, + { + "epoch": 2.063630692451653, + "grad_norm": 0.10629866280671935, + "learning_rate": 3.116676720413519e-06, + "loss": 0.0267, + "num_tokens": 132483686.0, + "step": 1655 + }, + { + "epoch": 2.06487835308796, + "grad_norm": 0.10618934252287057, + "learning_rate": 3.11153890215951e-06, + "loss": 0.028, + "num_tokens": 132562728.0, + "step": 1656 + }, + { + "epoch": 2.066126013724267, + "grad_norm": 0.1179037811248608, + "learning_rate": 3.1064054149073984e-06, + "loss": 0.0287, + "num_tokens": 132643301.0, + "step": 1657 + }, + { + "epoch": 2.067373674360574, + "grad_norm": 0.1068883102584125, + "learning_rate": 3.1012762679657525e-06, + "loss": 0.0271, + "num_tokens": 132722314.0, + "step": 1658 + }, + { + "epoch": 2.0686213349968807, + "grad_norm": 0.10906974578938657, + "learning_rate": 3.0961514706352654e-06, + "loss": 0.028, + "num_tokens": 132801325.0, + "step": 1659 + }, + { + "epoch": 2.069868995633188, + "grad_norm": 0.11050438055442657, + "learning_rate": 3.09103103220874e-06, + "loss": 0.0273, + "num_tokens": 132881040.0, + "step": 1660 + }, + { + "epoch": 2.0711166562694947, + "grad_norm": 0.11682521580597607, + "learning_rate": 3.085914961971082e-06, + "loss": 0.0283, + "num_tokens": 132960595.0, + "step": 1661 + }, + { + "epoch": 2.0723643169058015, + "grad_norm": 0.11450793460923425, + "learning_rate": 3.080803269199275e-06, + "loss": 0.0277, + "num_tokens": 133039931.0, + "step": 1662 + }, + { + "epoch": 2.0736119775421087, + "grad_norm": 0.11467759010106195, + "learning_rate": 3.0756959631623583e-06, + "loss": 0.0276, + "num_tokens": 133120290.0, + "step": 1663 + }, + { + "epoch": 2.0748596381784155, + "grad_norm": 0.11713560955703137, + "learning_rate": 3.0705930531214255e-06, + "loss": 0.0284, + "num_tokens": 133201093.0, + "step": 1664 + }, + { + "epoch": 2.0761072988147222, + "grad_norm": 0.11798027964717171, + "learning_rate": 3.065494548329594e-06, + "loss": 0.0283, + "num_tokens": 133280732.0, + "step": 1665 + }, + { + "epoch": 2.0773549594510294, + "grad_norm": 0.11006281058166327, + "learning_rate": 3.060400458031991e-06, + "loss": 0.0267, + "num_tokens": 133360752.0, + "step": 1666 + }, + { + "epoch": 2.078602620087336, + "grad_norm": 0.10693659229092088, + "learning_rate": 3.055310791465744e-06, + "loss": 0.0281, + "num_tokens": 133440361.0, + "step": 1667 + }, + { + "epoch": 2.079850280723643, + "grad_norm": 0.10276904415759595, + "learning_rate": 3.0502255578599594e-06, + "loss": 0.0266, + "num_tokens": 133520074.0, + "step": 1668 + }, + { + "epoch": 2.08109794135995, + "grad_norm": 0.10682226267908494, + "learning_rate": 3.0451447664357005e-06, + "loss": 0.0274, + "num_tokens": 133599454.0, + "step": 1669 + }, + { + "epoch": 2.082345601996257, + "grad_norm": 0.11642340085158942, + "learning_rate": 3.040068426405976e-06, + "loss": 0.028, + "num_tokens": 133679535.0, + "step": 1670 + }, + { + "epoch": 2.083593262632564, + "grad_norm": 0.10439274580853032, + "learning_rate": 3.0349965469757283e-06, + "loss": 0.0276, + "num_tokens": 133759276.0, + "step": 1671 + }, + { + "epoch": 2.084840923268871, + "grad_norm": 0.10178900758944974, + "learning_rate": 3.0299291373418038e-06, + "loss": 0.0271, + "num_tokens": 133839238.0, + "step": 1672 + }, + { + "epoch": 2.0860885839051777, + "grad_norm": 0.10108576053246532, + "learning_rate": 3.024866206692953e-06, + "loss": 0.0266, + "num_tokens": 133918423.0, + "step": 1673 + }, + { + "epoch": 2.087336244541485, + "grad_norm": 0.10890004660532689, + "learning_rate": 3.0198077642097945e-06, + "loss": 0.0281, + "num_tokens": 133999314.0, + "step": 1674 + }, + { + "epoch": 2.0885839051777917, + "grad_norm": 0.11949459892569592, + "learning_rate": 3.014753819064817e-06, + "loss": 0.0285, + "num_tokens": 134079417.0, + "step": 1675 + }, + { + "epoch": 2.0898315658140985, + "grad_norm": 0.11019307261367436, + "learning_rate": 3.009704380422348e-06, + "loss": 0.0268, + "num_tokens": 134159336.0, + "step": 1676 + }, + { + "epoch": 2.0910792264504057, + "grad_norm": 0.1094076338739407, + "learning_rate": 3.004659457438548e-06, + "loss": 0.0282, + "num_tokens": 134238687.0, + "step": 1677 + }, + { + "epoch": 2.0923268870867124, + "grad_norm": 0.11203035809174719, + "learning_rate": 2.999619059261387e-06, + "loss": 0.0277, + "num_tokens": 134319431.0, + "step": 1678 + }, + { + "epoch": 2.093574547723019, + "grad_norm": 0.10125109772296438, + "learning_rate": 2.9945831950306285e-06, + "loss": 0.0267, + "num_tokens": 134399747.0, + "step": 1679 + }, + { + "epoch": 2.0948222083593264, + "grad_norm": 0.10479161190075127, + "learning_rate": 2.9895518738778196e-06, + "loss": 0.0271, + "num_tokens": 134479438.0, + "step": 1680 + }, + { + "epoch": 2.096069868995633, + "grad_norm": 0.11262694348153912, + "learning_rate": 2.984525104926262e-06, + "loss": 0.0285, + "num_tokens": 134559824.0, + "step": 1681 + }, + { + "epoch": 2.09731752963194, + "grad_norm": 0.11572630284775189, + "learning_rate": 2.97950289729101e-06, + "loss": 0.0286, + "num_tokens": 134639963.0, + "step": 1682 + }, + { + "epoch": 2.098565190268247, + "grad_norm": 0.11746731799240802, + "learning_rate": 2.974485260078846e-06, + "loss": 0.0277, + "num_tokens": 134719925.0, + "step": 1683 + }, + { + "epoch": 2.099812850904554, + "grad_norm": 0.121581880657992, + "learning_rate": 2.9694722023882607e-06, + "loss": 0.0269, + "num_tokens": 134799277.0, + "step": 1684 + }, + { + "epoch": 2.1010605115408607, + "grad_norm": 0.11598556908383914, + "learning_rate": 2.9644637333094404e-06, + "loss": 0.0284, + "num_tokens": 134879892.0, + "step": 1685 + }, + { + "epoch": 2.102308172177168, + "grad_norm": 0.14223573137275083, + "learning_rate": 2.959459861924258e-06, + "loss": 0.028, + "num_tokens": 134959367.0, + "step": 1686 + }, + { + "epoch": 2.1035558328134747, + "grad_norm": 0.12003504700797399, + "learning_rate": 2.954460597306242e-06, + "loss": 0.0291, + "num_tokens": 135039815.0, + "step": 1687 + }, + { + "epoch": 2.1048034934497815, + "grad_norm": 0.12915254648172153, + "learning_rate": 2.9494659485205683e-06, + "loss": 0.0459, + "num_tokens": 135121468.0, + "step": 1688 + }, + { + "epoch": 2.1060511540860887, + "grad_norm": 0.10985724087657672, + "learning_rate": 2.9444759246240505e-06, + "loss": 0.0271, + "num_tokens": 135200652.0, + "step": 1689 + }, + { + "epoch": 2.1072988147223954, + "grad_norm": 0.11936091256114177, + "learning_rate": 2.939490534665107e-06, + "loss": 0.0278, + "num_tokens": 135281590.0, + "step": 1690 + }, + { + "epoch": 2.108546475358702, + "grad_norm": 0.11222455840445227, + "learning_rate": 2.934509787683755e-06, + "loss": 0.0281, + "num_tokens": 135361375.0, + "step": 1691 + }, + { + "epoch": 2.1097941359950094, + "grad_norm": 0.11147040706020958, + "learning_rate": 2.929533692711598e-06, + "loss": 0.0274, + "num_tokens": 135442925.0, + "step": 1692 + }, + { + "epoch": 2.111041796631316, + "grad_norm": 0.10467852886453667, + "learning_rate": 2.9245622587717982e-06, + "loss": 0.0275, + "num_tokens": 135523906.0, + "step": 1693 + }, + { + "epoch": 2.1122894572676234, + "grad_norm": 0.10869697183862845, + "learning_rate": 2.919595494879065e-06, + "loss": 0.0276, + "num_tokens": 135603783.0, + "step": 1694 + }, + { + "epoch": 2.11353711790393, + "grad_norm": 0.1155482691293836, + "learning_rate": 2.9146334100396474e-06, + "loss": 0.0282, + "num_tokens": 135684084.0, + "step": 1695 + }, + { + "epoch": 2.114784778540237, + "grad_norm": 0.11252805380717851, + "learning_rate": 2.9096760132513036e-06, + "loss": 0.0286, + "num_tokens": 135765048.0, + "step": 1696 + }, + { + "epoch": 2.116032439176544, + "grad_norm": 0.11033818934026106, + "learning_rate": 2.9047233135032927e-06, + "loss": 0.0275, + "num_tokens": 135845335.0, + "step": 1697 + }, + { + "epoch": 2.117280099812851, + "grad_norm": 0.13859448472332372, + "learning_rate": 2.8997753197763532e-06, + "loss": 0.03, + "num_tokens": 135925643.0, + "step": 1698 + }, + { + "epoch": 2.1185277604491577, + "grad_norm": 0.11224407005748382, + "learning_rate": 2.894832041042699e-06, + "loss": 0.0288, + "num_tokens": 136005781.0, + "step": 1699 + }, + { + "epoch": 2.119775421085465, + "grad_norm": 0.10019855101966049, + "learning_rate": 2.8898934862659823e-06, + "loss": 0.0266, + "num_tokens": 136085338.0, + "step": 1700 + }, + { + "epoch": 2.1210230817217717, + "grad_norm": 0.12238851973022535, + "learning_rate": 2.8849596644013e-06, + "loss": 0.0277, + "num_tokens": 136166090.0, + "step": 1701 + }, + { + "epoch": 2.1222707423580784, + "grad_norm": 0.11577197457908402, + "learning_rate": 2.880030584395162e-06, + "loss": 0.0281, + "num_tokens": 136246455.0, + "step": 1702 + }, + { + "epoch": 2.1235184029943857, + "grad_norm": 0.11575314696378225, + "learning_rate": 2.8751062551854775e-06, + "loss": 0.0292, + "num_tokens": 136326020.0, + "step": 1703 + }, + { + "epoch": 2.1247660636306924, + "grad_norm": 0.1236602811227467, + "learning_rate": 2.870186685701545e-06, + "loss": 0.0278, + "num_tokens": 136405902.0, + "step": 1704 + }, + { + "epoch": 2.126013724266999, + "grad_norm": 0.10698357238776467, + "learning_rate": 2.8652718848640337e-06, + "loss": 0.0271, + "num_tokens": 136484304.0, + "step": 1705 + }, + { + "epoch": 2.1272613849033064, + "grad_norm": 0.11565417571999578, + "learning_rate": 2.8603618615849603e-06, + "loss": 0.0289, + "num_tokens": 136563783.0, + "step": 1706 + }, + { + "epoch": 2.128509045539613, + "grad_norm": 0.11178198694056778, + "learning_rate": 2.8554566247676806e-06, + "loss": 0.027, + "num_tokens": 136643585.0, + "step": 1707 + }, + { + "epoch": 2.12975670617592, + "grad_norm": 0.10484411194496365, + "learning_rate": 2.850556183306874e-06, + "loss": 0.0277, + "num_tokens": 136723910.0, + "step": 1708 + }, + { + "epoch": 2.131004366812227, + "grad_norm": 0.11008252356620116, + "learning_rate": 2.845660546088519e-06, + "loss": 0.0271, + "num_tokens": 136802746.0, + "step": 1709 + }, + { + "epoch": 2.132252027448534, + "grad_norm": 0.1244585389706024, + "learning_rate": 2.8407697219898865e-06, + "loss": 0.0277, + "num_tokens": 136883051.0, + "step": 1710 + }, + { + "epoch": 2.133499688084841, + "grad_norm": 0.11761109934284099, + "learning_rate": 2.8358837198795223e-06, + "loss": 0.0285, + "num_tokens": 136963474.0, + "step": 1711 + }, + { + "epoch": 2.134747348721148, + "grad_norm": 0.11248525039399156, + "learning_rate": 2.8310025486172223e-06, + "loss": 0.0288, + "num_tokens": 137043833.0, + "step": 1712 + }, + { + "epoch": 2.1359950093574547, + "grad_norm": 0.11548415099531208, + "learning_rate": 2.8261262170540242e-06, + "loss": 0.0274, + "num_tokens": 137123180.0, + "step": 1713 + }, + { + "epoch": 2.137242669993762, + "grad_norm": 0.10709230843633225, + "learning_rate": 2.821254734032194e-06, + "loss": 0.0275, + "num_tokens": 137203580.0, + "step": 1714 + }, + { + "epoch": 2.1384903306300687, + "grad_norm": 0.11759009663698383, + "learning_rate": 2.8163881083852e-06, + "loss": 0.0299, + "num_tokens": 137284710.0, + "step": 1715 + }, + { + "epoch": 2.1397379912663754, + "grad_norm": 0.10669561409225682, + "learning_rate": 2.811526348937706e-06, + "loss": 0.0275, + "num_tokens": 137363810.0, + "step": 1716 + }, + { + "epoch": 2.1409856519026826, + "grad_norm": 0.10185154190042917, + "learning_rate": 2.806669464505552e-06, + "loss": 0.0274, + "num_tokens": 137443227.0, + "step": 1717 + }, + { + "epoch": 2.1422333125389894, + "grad_norm": 0.11279381093375922, + "learning_rate": 2.80181746389574e-06, + "loss": 0.0277, + "num_tokens": 137522857.0, + "step": 1718 + }, + { + "epoch": 2.143480973175296, + "grad_norm": 0.11315608721853433, + "learning_rate": 2.7969703559064076e-06, + "loss": 0.0278, + "num_tokens": 137602845.0, + "step": 1719 + }, + { + "epoch": 2.1447286338116034, + "grad_norm": 0.11922283733338566, + "learning_rate": 2.792128149326833e-06, + "loss": 0.0289, + "num_tokens": 137683630.0, + "step": 1720 + }, + { + "epoch": 2.14597629444791, + "grad_norm": 0.115407862616771, + "learning_rate": 2.7872908529373976e-06, + "loss": 0.0286, + "num_tokens": 137765456.0, + "step": 1721 + }, + { + "epoch": 2.147223955084217, + "grad_norm": 0.11208353296927773, + "learning_rate": 2.782458475509581e-06, + "loss": 0.0271, + "num_tokens": 137845544.0, + "step": 1722 + }, + { + "epoch": 2.148471615720524, + "grad_norm": 0.1068207673083175, + "learning_rate": 2.7776310258059447e-06, + "loss": 0.0271, + "num_tokens": 137924567.0, + "step": 1723 + }, + { + "epoch": 2.149719276356831, + "grad_norm": 0.1144766801038503, + "learning_rate": 2.772808512580114e-06, + "loss": 0.0311, + "num_tokens": 138004671.0, + "step": 1724 + }, + { + "epoch": 2.1509669369931377, + "grad_norm": 0.11230257045378114, + "learning_rate": 2.767990944576763e-06, + "loss": 0.0281, + "num_tokens": 138085655.0, + "step": 1725 + }, + { + "epoch": 2.152214597629445, + "grad_norm": 0.10991165872168095, + "learning_rate": 2.7631783305316017e-06, + "loss": 0.0268, + "num_tokens": 138166694.0, + "step": 1726 + }, + { + "epoch": 2.1534622582657517, + "grad_norm": 0.10971797219708118, + "learning_rate": 2.7583706791713503e-06, + "loss": 0.0273, + "num_tokens": 138246414.0, + "step": 1727 + }, + { + "epoch": 2.154709918902059, + "grad_norm": 0.11270715704229847, + "learning_rate": 2.7535679992137338e-06, + "loss": 0.0271, + "num_tokens": 138326768.0, + "step": 1728 + }, + { + "epoch": 2.1559575795383656, + "grad_norm": 0.12328542886456291, + "learning_rate": 2.7487702993674647e-06, + "loss": 0.029, + "num_tokens": 138406486.0, + "step": 1729 + }, + { + "epoch": 2.1572052401746724, + "grad_norm": 0.1190445079709797, + "learning_rate": 2.7439775883322228e-06, + "loss": 0.0284, + "num_tokens": 138488315.0, + "step": 1730 + }, + { + "epoch": 2.158452900810979, + "grad_norm": 0.11340655933746487, + "learning_rate": 2.739189874798639e-06, + "loss": 0.0286, + "num_tokens": 138569358.0, + "step": 1731 + }, + { + "epoch": 2.1597005614472864, + "grad_norm": 0.11303296924070988, + "learning_rate": 2.7344071674482874e-06, + "loss": 0.0281, + "num_tokens": 138648911.0, + "step": 1732 + }, + { + "epoch": 2.160948222083593, + "grad_norm": 0.1034480761800008, + "learning_rate": 2.729629474953662e-06, + "loss": 0.0266, + "num_tokens": 138727334.0, + "step": 1733 + }, + { + "epoch": 2.1621958827199004, + "grad_norm": 0.11156242541516115, + "learning_rate": 2.7248568059781654e-06, + "loss": 0.0263, + "num_tokens": 138807463.0, + "step": 1734 + }, + { + "epoch": 2.163443543356207, + "grad_norm": 0.10227201302760246, + "learning_rate": 2.7200891691760838e-06, + "loss": 0.0272, + "num_tokens": 138888573.0, + "step": 1735 + }, + { + "epoch": 2.164691203992514, + "grad_norm": 0.11926979754093582, + "learning_rate": 2.715326573192588e-06, + "loss": 0.028, + "num_tokens": 138967599.0, + "step": 1736 + }, + { + "epoch": 2.165938864628821, + "grad_norm": 0.10590335435742637, + "learning_rate": 2.710569026663702e-06, + "loss": 0.027, + "num_tokens": 139048151.0, + "step": 1737 + }, + { + "epoch": 2.167186525265128, + "grad_norm": 0.1148169854925881, + "learning_rate": 2.705816538216296e-06, + "loss": 0.0272, + "num_tokens": 139128924.0, + "step": 1738 + }, + { + "epoch": 2.1684341859014347, + "grad_norm": 0.1019205633049057, + "learning_rate": 2.7010691164680696e-06, + "loss": 0.0268, + "num_tokens": 139210922.0, + "step": 1739 + }, + { + "epoch": 2.169681846537742, + "grad_norm": 0.12286991978648502, + "learning_rate": 2.696326770027533e-06, + "loss": 0.0352, + "num_tokens": 139292094.0, + "step": 1740 + }, + { + "epoch": 2.1709295071740486, + "grad_norm": 0.11680777294553368, + "learning_rate": 2.6915895074939912e-06, + "loss": 0.0274, + "num_tokens": 139372386.0, + "step": 1741 + }, + { + "epoch": 2.1721771678103554, + "grad_norm": 0.11434415621297753, + "learning_rate": 2.6868573374575356e-06, + "loss": 0.028, + "num_tokens": 139451840.0, + "step": 1742 + }, + { + "epoch": 2.1734248284466626, + "grad_norm": 0.11068475985426603, + "learning_rate": 2.6821302684990204e-06, + "loss": 0.0282, + "num_tokens": 139531179.0, + "step": 1743 + }, + { + "epoch": 2.1746724890829694, + "grad_norm": 0.10765013051927665, + "learning_rate": 2.677408309190049e-06, + "loss": 0.0273, + "num_tokens": 139611340.0, + "step": 1744 + }, + { + "epoch": 2.175920149719276, + "grad_norm": 0.1047064628578538, + "learning_rate": 2.672691468092963e-06, + "loss": 0.0266, + "num_tokens": 139690494.0, + "step": 1745 + }, + { + "epoch": 2.1771678103555834, + "grad_norm": 0.12852423753495817, + "learning_rate": 2.6679797537608184e-06, + "loss": 0.0283, + "num_tokens": 139773131.0, + "step": 1746 + }, + { + "epoch": 2.17841547099189, + "grad_norm": 0.10882310129634558, + "learning_rate": 2.6632731747373785e-06, + "loss": 0.0281, + "num_tokens": 139853287.0, + "step": 1747 + }, + { + "epoch": 2.179663131628197, + "grad_norm": 0.11131589806757092, + "learning_rate": 2.658571739557096e-06, + "loss": 0.0278, + "num_tokens": 139934168.0, + "step": 1748 + }, + { + "epoch": 2.180910792264504, + "grad_norm": 0.11163023096979792, + "learning_rate": 2.653875456745092e-06, + "loss": 0.0274, + "num_tokens": 140014255.0, + "step": 1749 + }, + { + "epoch": 2.182158452900811, + "grad_norm": 0.10614675078257936, + "learning_rate": 2.6491843348171455e-06, + "loss": 0.0275, + "num_tokens": 140094036.0, + "step": 1750 + }, + { + "epoch": 2.183406113537118, + "grad_norm": 0.11255684585168828, + "learning_rate": 2.644498382279681e-06, + "loss": 0.0279, + "num_tokens": 140174691.0, + "step": 1751 + }, + { + "epoch": 2.184653774173425, + "grad_norm": 0.11526184776808238, + "learning_rate": 2.639817607629745e-06, + "loss": 0.028, + "num_tokens": 140254358.0, + "step": 1752 + }, + { + "epoch": 2.1859014348097316, + "grad_norm": 0.11014270003322352, + "learning_rate": 2.635142019354995e-06, + "loss": 0.0279, + "num_tokens": 140335203.0, + "step": 1753 + }, + { + "epoch": 2.187149095446039, + "grad_norm": 0.11726190195059955, + "learning_rate": 2.6304716259336903e-06, + "loss": 0.027, + "num_tokens": 140415001.0, + "step": 1754 + }, + { + "epoch": 2.1883967560823456, + "grad_norm": 0.11551789258082427, + "learning_rate": 2.6258064358346642e-06, + "loss": 0.0276, + "num_tokens": 140495012.0, + "step": 1755 + }, + { + "epoch": 2.1896444167186524, + "grad_norm": 0.12911904078324782, + "learning_rate": 2.621146457517314e-06, + "loss": 0.0298, + "num_tokens": 140577112.0, + "step": 1756 + }, + { + "epoch": 2.1908920773549596, + "grad_norm": 0.10893674109666582, + "learning_rate": 2.6164916994315916e-06, + "loss": 0.0269, + "num_tokens": 140657288.0, + "step": 1757 + }, + { + "epoch": 2.1921397379912664, + "grad_norm": 0.11561232996328151, + "learning_rate": 2.6118421700179795e-06, + "loss": 0.0275, + "num_tokens": 140737502.0, + "step": 1758 + }, + { + "epoch": 2.193387398627573, + "grad_norm": 0.11811526803531466, + "learning_rate": 2.6071978777074796e-06, + "loss": 0.0278, + "num_tokens": 140816956.0, + "step": 1759 + }, + { + "epoch": 2.1946350592638804, + "grad_norm": 0.10604896561401701, + "learning_rate": 2.6025588309215975e-06, + "loss": 0.0277, + "num_tokens": 140898099.0, + "step": 1760 + }, + { + "epoch": 2.195882719900187, + "grad_norm": 0.11100890813179569, + "learning_rate": 2.5979250380723287e-06, + "loss": 0.0281, + "num_tokens": 140977581.0, + "step": 1761 + }, + { + "epoch": 2.197130380536494, + "grad_norm": 0.12574865966231316, + "learning_rate": 2.5932965075621376e-06, + "loss": 0.0291, + "num_tokens": 141058023.0, + "step": 1762 + }, + { + "epoch": 2.198378041172801, + "grad_norm": 0.11094390713862573, + "learning_rate": 2.5886732477839514e-06, + "loss": 0.0278, + "num_tokens": 141137869.0, + "step": 1763 + }, + { + "epoch": 2.199625701809108, + "grad_norm": 0.10875999873552089, + "learning_rate": 2.584055267121137e-06, + "loss": 0.0264, + "num_tokens": 141216853.0, + "step": 1764 + }, + { + "epoch": 2.2008733624454146, + "grad_norm": 0.11359059997751579, + "learning_rate": 2.579442573947488e-06, + "loss": 0.0302, + "num_tokens": 141298589.0, + "step": 1765 + }, + { + "epoch": 2.202121023081722, + "grad_norm": 0.12697222632331867, + "learning_rate": 2.5748351766272127e-06, + "loss": 0.0289, + "num_tokens": 141378590.0, + "step": 1766 + }, + { + "epoch": 2.2033686837180286, + "grad_norm": 0.10545644958622358, + "learning_rate": 2.5702330835149137e-06, + "loss": 0.0263, + "num_tokens": 141457825.0, + "step": 1767 + }, + { + "epoch": 2.204616344354336, + "grad_norm": 0.11016392505811934, + "learning_rate": 2.5656363029555788e-06, + "loss": 0.0289, + "num_tokens": 141538133.0, + "step": 1768 + }, + { + "epoch": 2.2058640049906426, + "grad_norm": 0.10965895021224552, + "learning_rate": 2.561044843284558e-06, + "loss": 0.0281, + "num_tokens": 141617180.0, + "step": 1769 + }, + { + "epoch": 2.2071116656269494, + "grad_norm": 0.1133356513749835, + "learning_rate": 2.556458712827558e-06, + "loss": 0.0287, + "num_tokens": 141695468.0, + "step": 1770 + }, + { + "epoch": 2.2083593262632566, + "grad_norm": 0.11395886557137493, + "learning_rate": 2.551877919900619e-06, + "loss": 0.0282, + "num_tokens": 141775903.0, + "step": 1771 + }, + { + "epoch": 2.2096069868995634, + "grad_norm": 0.11822802385416521, + "learning_rate": 2.5473024728101004e-06, + "loss": 0.0278, + "num_tokens": 141856125.0, + "step": 1772 + }, + { + "epoch": 2.21085464753587, + "grad_norm": 0.11611618297885314, + "learning_rate": 2.5427323798526747e-06, + "loss": 0.0277, + "num_tokens": 141936379.0, + "step": 1773 + }, + { + "epoch": 2.2121023081721773, + "grad_norm": 0.11559137208466257, + "learning_rate": 2.538167649315298e-06, + "loss": 0.0287, + "num_tokens": 142018427.0, + "step": 1774 + }, + { + "epoch": 2.213349968808484, + "grad_norm": 0.11659924816734041, + "learning_rate": 2.5336082894752084e-06, + "loss": 0.0285, + "num_tokens": 142098768.0, + "step": 1775 + }, + { + "epoch": 2.214597629444791, + "grad_norm": 0.11963881975246271, + "learning_rate": 2.529054308599906e-06, + "loss": 0.0308, + "num_tokens": 142178577.0, + "step": 1776 + }, + { + "epoch": 2.215845290081098, + "grad_norm": 0.11090511560379994, + "learning_rate": 2.524505714947131e-06, + "loss": 0.0281, + "num_tokens": 142258299.0, + "step": 1777 + }, + { + "epoch": 2.217092950717405, + "grad_norm": 0.11296744141624795, + "learning_rate": 2.5199625167648576e-06, + "loss": 0.028, + "num_tokens": 142339109.0, + "step": 1778 + }, + { + "epoch": 2.2183406113537116, + "grad_norm": 0.1257848652662134, + "learning_rate": 2.515424722291282e-06, + "loss": 0.0268, + "num_tokens": 142421110.0, + "step": 1779 + }, + { + "epoch": 2.219588271990019, + "grad_norm": 0.10900698032162932, + "learning_rate": 2.5108923397547934e-06, + "loss": 0.027, + "num_tokens": 142501998.0, + "step": 1780 + }, + { + "epoch": 2.2208359326263256, + "grad_norm": 0.11575161471868187, + "learning_rate": 2.5063653773739705e-06, + "loss": 0.0278, + "num_tokens": 142581395.0, + "step": 1781 + }, + { + "epoch": 2.2220835932626324, + "grad_norm": 0.10090543842040466, + "learning_rate": 2.501843843357568e-06, + "loss": 0.0263, + "num_tokens": 142659673.0, + "step": 1782 + }, + { + "epoch": 2.2233312538989396, + "grad_norm": 0.1115252060659669, + "learning_rate": 2.4973277459044927e-06, + "loss": 0.0282, + "num_tokens": 142741046.0, + "step": 1783 + }, + { + "epoch": 2.2245789145352464, + "grad_norm": 0.1133673733213164, + "learning_rate": 2.4928170932037916e-06, + "loss": 0.0277, + "num_tokens": 142820299.0, + "step": 1784 + }, + { + "epoch": 2.225826575171553, + "grad_norm": 0.1127136967529174, + "learning_rate": 2.4883118934346446e-06, + "loss": 0.0273, + "num_tokens": 142900381.0, + "step": 1785 + }, + { + "epoch": 2.2270742358078603, + "grad_norm": 0.10491586464405452, + "learning_rate": 2.48381215476634e-06, + "loss": 0.0265, + "num_tokens": 142980799.0, + "step": 1786 + }, + { + "epoch": 2.228321896444167, + "grad_norm": 0.11469618848304418, + "learning_rate": 2.4793178853582624e-06, + "loss": 0.0273, + "num_tokens": 143061287.0, + "step": 1787 + }, + { + "epoch": 2.229569557080474, + "grad_norm": 0.1087127525924716, + "learning_rate": 2.474829093359881e-06, + "loss": 0.0275, + "num_tokens": 143141303.0, + "step": 1788 + }, + { + "epoch": 2.230817217716781, + "grad_norm": 0.10390821955698, + "learning_rate": 2.4703457869107346e-06, + "loss": 0.0272, + "num_tokens": 143221934.0, + "step": 1789 + }, + { + "epoch": 2.232064878353088, + "grad_norm": 0.11473477655662885, + "learning_rate": 2.4658679741404106e-06, + "loss": 0.0287, + "num_tokens": 143303459.0, + "step": 1790 + }, + { + "epoch": 2.233312538989395, + "grad_norm": 0.11105580740416413, + "learning_rate": 2.461395663168539e-06, + "loss": 0.0281, + "num_tokens": 143383014.0, + "step": 1791 + }, + { + "epoch": 2.234560199625702, + "grad_norm": 0.11200067622309487, + "learning_rate": 2.4569288621047704e-06, + "loss": 0.0284, + "num_tokens": 143462866.0, + "step": 1792 + }, + { + "epoch": 2.2358078602620086, + "grad_norm": 0.10838875549676315, + "learning_rate": 2.452467579048764e-06, + "loss": 0.0267, + "num_tokens": 143541856.0, + "step": 1793 + }, + { + "epoch": 2.237055520898316, + "grad_norm": 0.11004549583890312, + "learning_rate": 2.4480118220901764e-06, + "loss": 0.0268, + "num_tokens": 143621766.0, + "step": 1794 + }, + { + "epoch": 2.2383031815346226, + "grad_norm": 0.10774371386318345, + "learning_rate": 2.4435615993086414e-06, + "loss": 0.0281, + "num_tokens": 143700863.0, + "step": 1795 + }, + { + "epoch": 2.2395508421709294, + "grad_norm": 0.10262726213219055, + "learning_rate": 2.4391169187737555e-06, + "loss": 0.0264, + "num_tokens": 143780027.0, + "step": 1796 + }, + { + "epoch": 2.2407985028072366, + "grad_norm": 0.10594867992251468, + "learning_rate": 2.434677788545071e-06, + "loss": 0.0278, + "num_tokens": 143859671.0, + "step": 1797 + }, + { + "epoch": 2.2420461634435433, + "grad_norm": 0.10587436147761728, + "learning_rate": 2.4302442166720723e-06, + "loss": 0.0275, + "num_tokens": 143940423.0, + "step": 1798 + }, + { + "epoch": 2.24329382407985, + "grad_norm": 0.11934655967345954, + "learning_rate": 2.4258162111941634e-06, + "loss": 0.0276, + "num_tokens": 144021103.0, + "step": 1799 + }, + { + "epoch": 2.2445414847161573, + "grad_norm": 0.11108721213520488, + "learning_rate": 2.42139378014066e-06, + "loss": 0.028, + "num_tokens": 144101887.0, + "step": 1800 + }, + { + "epoch": 2.245789145352464, + "grad_norm": 0.11656736214212844, + "learning_rate": 2.416976931530764e-06, + "loss": 0.0287, + "num_tokens": 144182215.0, + "step": 1801 + }, + { + "epoch": 2.247036805988771, + "grad_norm": 0.11510680943141609, + "learning_rate": 2.4125656733735554e-06, + "loss": 0.0281, + "num_tokens": 144263091.0, + "step": 1802 + }, + { + "epoch": 2.248284466625078, + "grad_norm": 0.10345650824569999, + "learning_rate": 2.4081600136679805e-06, + "loss": 0.0276, + "num_tokens": 144342322.0, + "step": 1803 + }, + { + "epoch": 2.249532127261385, + "grad_norm": 0.11225842570308209, + "learning_rate": 2.403759960402834e-06, + "loss": 0.028, + "num_tokens": 144422662.0, + "step": 1804 + }, + { + "epoch": 2.2507797878976916, + "grad_norm": 0.14563905711788522, + "learning_rate": 2.39936552155674e-06, + "loss": 0.0276, + "num_tokens": 144502890.0, + "step": 1805 + }, + { + "epoch": 2.252027448533999, + "grad_norm": 0.11458859468044612, + "learning_rate": 2.394976705098143e-06, + "loss": 0.0277, + "num_tokens": 144583307.0, + "step": 1806 + }, + { + "epoch": 2.2532751091703056, + "grad_norm": 0.11282706392587076, + "learning_rate": 2.3905935189852967e-06, + "loss": 0.0286, + "num_tokens": 144664063.0, + "step": 1807 + }, + { + "epoch": 2.254522769806613, + "grad_norm": 0.10716579111382028, + "learning_rate": 2.386215971166242e-06, + "loss": 0.0273, + "num_tokens": 144744193.0, + "step": 1808 + }, + { + "epoch": 2.2557704304429196, + "grad_norm": 0.10816884263366056, + "learning_rate": 2.381844069578793e-06, + "loss": 0.0273, + "num_tokens": 144825420.0, + "step": 1809 + }, + { + "epoch": 2.2570180910792264, + "grad_norm": 0.11079261493952917, + "learning_rate": 2.3774778221505316e-06, + "loss": 0.0269, + "num_tokens": 144904705.0, + "step": 1810 + }, + { + "epoch": 2.2582657517155336, + "grad_norm": 0.12304447536252638, + "learning_rate": 2.3731172367987856e-06, + "loss": 0.0279, + "num_tokens": 144984409.0, + "step": 1811 + }, + { + "epoch": 2.2595134123518403, + "grad_norm": 0.12073914506900349, + "learning_rate": 2.3687623214306096e-06, + "loss": 0.0281, + "num_tokens": 145065436.0, + "step": 1812 + }, + { + "epoch": 2.260761072988147, + "grad_norm": 0.11025114886878515, + "learning_rate": 2.364413083942787e-06, + "loss": 0.0278, + "num_tokens": 145145726.0, + "step": 1813 + }, + { + "epoch": 2.2620087336244543, + "grad_norm": 0.10678041974245804, + "learning_rate": 2.3600695322217965e-06, + "loss": 0.0278, + "num_tokens": 145225716.0, + "step": 1814 + }, + { + "epoch": 2.263256394260761, + "grad_norm": 0.1094478383949689, + "learning_rate": 2.355731674143809e-06, + "loss": 0.0266, + "num_tokens": 145304672.0, + "step": 1815 + }, + { + "epoch": 2.264504054897068, + "grad_norm": 0.11066469315589064, + "learning_rate": 2.3513995175746757e-06, + "loss": 0.0276, + "num_tokens": 145383597.0, + "step": 1816 + }, + { + "epoch": 2.265751715533375, + "grad_norm": 0.11081072087024889, + "learning_rate": 2.3470730703699034e-06, + "loss": 0.0264, + "num_tokens": 145463533.0, + "step": 1817 + }, + { + "epoch": 2.266999376169682, + "grad_norm": 0.12335653809318686, + "learning_rate": 2.3427523403746496e-06, + "loss": 0.0286, + "num_tokens": 145543691.0, + "step": 1818 + }, + { + "epoch": 2.2682470368059886, + "grad_norm": 0.10000215802185779, + "learning_rate": 2.338437335423705e-06, + "loss": 0.0264, + "num_tokens": 145622759.0, + "step": 1819 + }, + { + "epoch": 2.269494697442296, + "grad_norm": 0.11700316815610115, + "learning_rate": 2.3341280633414763e-06, + "loss": 0.0286, + "num_tokens": 145703874.0, + "step": 1820 + }, + { + "epoch": 2.2707423580786026, + "grad_norm": 0.11974933764453737, + "learning_rate": 2.3298245319419755e-06, + "loss": 0.0281, + "num_tokens": 145784643.0, + "step": 1821 + }, + { + "epoch": 2.2719900187149094, + "grad_norm": 0.11752237392767895, + "learning_rate": 2.325526749028808e-06, + "loss": 0.0278, + "num_tokens": 145865466.0, + "step": 1822 + }, + { + "epoch": 2.2732376793512166, + "grad_norm": 0.12283401566553695, + "learning_rate": 2.321234722395152e-06, + "loss": 0.0282, + "num_tokens": 145944342.0, + "step": 1823 + }, + { + "epoch": 2.2744853399875233, + "grad_norm": 0.11195250793318184, + "learning_rate": 2.3169484598237484e-06, + "loss": 0.0276, + "num_tokens": 146023270.0, + "step": 1824 + }, + { + "epoch": 2.2757330006238305, + "grad_norm": 0.11013517177727546, + "learning_rate": 2.312667969086887e-06, + "loss": 0.0267, + "num_tokens": 146102006.0, + "step": 1825 + }, + { + "epoch": 2.2769806612601373, + "grad_norm": 0.11787583993905608, + "learning_rate": 2.308393257946393e-06, + "loss": 0.0274, + "num_tokens": 146181867.0, + "step": 1826 + }, + { + "epoch": 2.278228321896444, + "grad_norm": 0.11819294612785929, + "learning_rate": 2.304124334153608e-06, + "loss": 0.0269, + "num_tokens": 146261476.0, + "step": 1827 + }, + { + "epoch": 2.279475982532751, + "grad_norm": 0.11184828699005617, + "learning_rate": 2.2998612054493827e-06, + "loss": 0.0284, + "num_tokens": 146341515.0, + "step": 1828 + }, + { + "epoch": 2.280723643169058, + "grad_norm": 0.10777522190355769, + "learning_rate": 2.2956038795640573e-06, + "loss": 0.0274, + "num_tokens": 146420733.0, + "step": 1829 + }, + { + "epoch": 2.281971303805365, + "grad_norm": 0.10954932409855346, + "learning_rate": 2.291352364217449e-06, + "loss": 0.028, + "num_tokens": 146501895.0, + "step": 1830 + }, + { + "epoch": 2.283218964441672, + "grad_norm": 0.10884043485738872, + "learning_rate": 2.287106667118841e-06, + "loss": 0.0281, + "num_tokens": 146581796.0, + "step": 1831 + }, + { + "epoch": 2.284466625077979, + "grad_norm": 0.11517754082110432, + "learning_rate": 2.2828667959669674e-06, + "loss": 0.0286, + "num_tokens": 146663043.0, + "step": 1832 + }, + { + "epoch": 2.2857142857142856, + "grad_norm": 0.1259652230677902, + "learning_rate": 2.2786327584499944e-06, + "loss": 0.0291, + "num_tokens": 146744082.0, + "step": 1833 + }, + { + "epoch": 2.286961946350593, + "grad_norm": 0.11385926033954893, + "learning_rate": 2.2744045622455112e-06, + "loss": 0.0278, + "num_tokens": 146824514.0, + "step": 1834 + }, + { + "epoch": 2.2882096069868996, + "grad_norm": 0.10823713522415669, + "learning_rate": 2.270182215020517e-06, + "loss": 0.0275, + "num_tokens": 146905003.0, + "step": 1835 + }, + { + "epoch": 2.2894572676232063, + "grad_norm": 0.10607107777899051, + "learning_rate": 2.2659657244314017e-06, + "loss": 0.0274, + "num_tokens": 146984485.0, + "step": 1836 + }, + { + "epoch": 2.2907049282595136, + "grad_norm": 0.10959122120451005, + "learning_rate": 2.26175509812394e-06, + "loss": 0.0276, + "num_tokens": 147064157.0, + "step": 1837 + }, + { + "epoch": 2.2919525888958203, + "grad_norm": 0.10057700457691329, + "learning_rate": 2.2575503437332677e-06, + "loss": 0.0273, + "num_tokens": 147143970.0, + "step": 1838 + }, + { + "epoch": 2.293200249532127, + "grad_norm": 0.10943750560043476, + "learning_rate": 2.2533514688838755e-06, + "loss": 0.028, + "num_tokens": 147225213.0, + "step": 1839 + }, + { + "epoch": 2.2944479101684343, + "grad_norm": 0.1101560015216109, + "learning_rate": 2.2491584811895927e-06, + "loss": 0.0272, + "num_tokens": 147305029.0, + "step": 1840 + }, + { + "epoch": 2.295695570804741, + "grad_norm": 0.1108231639983885, + "learning_rate": 2.244971388253576e-06, + "loss": 0.0261, + "num_tokens": 147385812.0, + "step": 1841 + }, + { + "epoch": 2.2969432314410483, + "grad_norm": 0.11679304596070587, + "learning_rate": 2.2407901976682884e-06, + "loss": 0.0294, + "num_tokens": 147466071.0, + "step": 1842 + }, + { + "epoch": 2.298190892077355, + "grad_norm": 0.09947228828930545, + "learning_rate": 2.2366149170154907e-06, + "loss": 0.0262, + "num_tokens": 147544270.0, + "step": 1843 + }, + { + "epoch": 2.299438552713662, + "grad_norm": 0.11014547069179065, + "learning_rate": 2.232445553866231e-06, + "loss": 0.0278, + "num_tokens": 147624344.0, + "step": 1844 + }, + { + "epoch": 2.3006862133499686, + "grad_norm": 0.11618854645406114, + "learning_rate": 2.228282115780824e-06, + "loss": 0.0283, + "num_tokens": 147703871.0, + "step": 1845 + }, + { + "epoch": 2.301933873986276, + "grad_norm": 0.11530991453976354, + "learning_rate": 2.22412461030884e-06, + "loss": 0.0277, + "num_tokens": 147783896.0, + "step": 1846 + }, + { + "epoch": 2.3031815346225826, + "grad_norm": 0.10820957500421617, + "learning_rate": 2.2199730449890964e-06, + "loss": 0.027, + "num_tokens": 147864352.0, + "step": 1847 + }, + { + "epoch": 2.30442919525889, + "grad_norm": 0.11573884363992257, + "learning_rate": 2.215827427349635e-06, + "loss": 0.0282, + "num_tokens": 147944345.0, + "step": 1848 + }, + { + "epoch": 2.3056768558951966, + "grad_norm": 0.11777830036437165, + "learning_rate": 2.211687764907711e-06, + "loss": 0.0272, + "num_tokens": 148026015.0, + "step": 1849 + }, + { + "epoch": 2.3069245165315033, + "grad_norm": 0.11222239508770851, + "learning_rate": 2.2075540651697873e-06, + "loss": 0.0275, + "num_tokens": 148106133.0, + "step": 1850 + }, + { + "epoch": 2.3081721771678105, + "grad_norm": 0.10933938525610572, + "learning_rate": 2.2034263356315087e-06, + "loss": 0.0278, + "num_tokens": 148186803.0, + "step": 1851 + }, + { + "epoch": 2.3094198378041173, + "grad_norm": 0.11453260431350368, + "learning_rate": 2.1993045837776957e-06, + "loss": 0.0278, + "num_tokens": 148267327.0, + "step": 1852 + }, + { + "epoch": 2.310667498440424, + "grad_norm": 0.11836630922316033, + "learning_rate": 2.195188817082331e-06, + "loss": 0.0283, + "num_tokens": 148349060.0, + "step": 1853 + }, + { + "epoch": 2.3119151590767313, + "grad_norm": 0.12681294550992456, + "learning_rate": 2.1910790430085465e-06, + "loss": 0.0275, + "num_tokens": 148428476.0, + "step": 1854 + }, + { + "epoch": 2.313162819713038, + "grad_norm": 0.10886143919786201, + "learning_rate": 2.1869752690086e-06, + "loss": 0.0269, + "num_tokens": 148507956.0, + "step": 1855 + }, + { + "epoch": 2.314410480349345, + "grad_norm": 0.1123737906437985, + "learning_rate": 2.1828775025238787e-06, + "loss": 0.027, + "num_tokens": 148587206.0, + "step": 1856 + }, + { + "epoch": 2.315658140985652, + "grad_norm": 0.1087200879265718, + "learning_rate": 2.1787857509848693e-06, + "loss": 0.027, + "num_tokens": 148666530.0, + "step": 1857 + }, + { + "epoch": 2.316905801621959, + "grad_norm": 0.12845500495545087, + "learning_rate": 2.174700021811153e-06, + "loss": 0.0288, + "num_tokens": 148745506.0, + "step": 1858 + }, + { + "epoch": 2.3181534622582656, + "grad_norm": 0.10251335702347915, + "learning_rate": 2.1706203224113944e-06, + "loss": 0.0273, + "num_tokens": 148826801.0, + "step": 1859 + }, + { + "epoch": 2.319401122894573, + "grad_norm": 0.13207173451085918, + "learning_rate": 2.1665466601833197e-06, + "loss": 0.0282, + "num_tokens": 148907396.0, + "step": 1860 + }, + { + "epoch": 2.3206487835308796, + "grad_norm": 0.11045087789527284, + "learning_rate": 2.162479042513711e-06, + "loss": 0.0275, + "num_tokens": 148987223.0, + "step": 1861 + }, + { + "epoch": 2.3218964441671863, + "grad_norm": 0.10466374938904822, + "learning_rate": 2.158417476778388e-06, + "loss": 0.0276, + "num_tokens": 149068080.0, + "step": 1862 + }, + { + "epoch": 2.3231441048034935, + "grad_norm": 0.11264871081312319, + "learning_rate": 2.1543619703421975e-06, + "loss": 0.0278, + "num_tokens": 149147571.0, + "step": 1863 + }, + { + "epoch": 2.3243917654398003, + "grad_norm": 0.104606366045223, + "learning_rate": 2.1503125305589976e-06, + "loss": 0.0272, + "num_tokens": 149227191.0, + "step": 1864 + }, + { + "epoch": 2.3256394260761075, + "grad_norm": 0.12583404589107572, + "learning_rate": 2.146269164771648e-06, + "loss": 0.0292, + "num_tokens": 149308765.0, + "step": 1865 + }, + { + "epoch": 2.3268870867124143, + "grad_norm": 0.11179688039422762, + "learning_rate": 2.142231880311992e-06, + "loss": 0.0274, + "num_tokens": 149388926.0, + "step": 1866 + }, + { + "epoch": 2.328134747348721, + "grad_norm": 0.10361239593188457, + "learning_rate": 2.1382006845008456e-06, + "loss": 0.0267, + "num_tokens": 149468409.0, + "step": 1867 + }, + { + "epoch": 2.329382407985028, + "grad_norm": 0.10285293247205865, + "learning_rate": 2.1341755846479868e-06, + "loss": 0.0274, + "num_tokens": 149548190.0, + "step": 1868 + }, + { + "epoch": 2.330630068621335, + "grad_norm": 0.11770503261290141, + "learning_rate": 2.1301565880521387e-06, + "loss": 0.0273, + "num_tokens": 149628012.0, + "step": 1869 + }, + { + "epoch": 2.331877729257642, + "grad_norm": 0.1074927375632693, + "learning_rate": 2.1261437020009565e-06, + "loss": 0.0271, + "num_tokens": 149708217.0, + "step": 1870 + }, + { + "epoch": 2.333125389893949, + "grad_norm": 0.11030656423571202, + "learning_rate": 2.122136933771014e-06, + "loss": 0.0272, + "num_tokens": 149788673.0, + "step": 1871 + }, + { + "epoch": 2.334373050530256, + "grad_norm": 0.12162150889745205, + "learning_rate": 2.118136290627795e-06, + "loss": 0.0287, + "num_tokens": 149868514.0, + "step": 1872 + }, + { + "epoch": 2.3356207111665626, + "grad_norm": 0.126632958015695, + "learning_rate": 2.114141779825674e-06, + "loss": 0.0282, + "num_tokens": 149948614.0, + "step": 1873 + }, + { + "epoch": 2.3368683718028698, + "grad_norm": 0.10058350258071957, + "learning_rate": 2.110153408607904e-06, + "loss": 0.0262, + "num_tokens": 150028919.0, + "step": 1874 + }, + { + "epoch": 2.3381160324391765, + "grad_norm": 0.11778995124698656, + "learning_rate": 2.1061711842066124e-06, + "loss": 0.028, + "num_tokens": 150108918.0, + "step": 1875 + }, + { + "epoch": 2.3393636930754833, + "grad_norm": 0.11526712748316235, + "learning_rate": 2.1021951138427736e-06, + "loss": 0.0275, + "num_tokens": 150188111.0, + "step": 1876 + }, + { + "epoch": 2.3406113537117905, + "grad_norm": 0.10576774424191199, + "learning_rate": 2.0982252047262025e-06, + "loss": 0.0277, + "num_tokens": 150267512.0, + "step": 1877 + }, + { + "epoch": 2.3418590143480973, + "grad_norm": 0.11922101045159628, + "learning_rate": 2.094261464055548e-06, + "loss": 0.028, + "num_tokens": 150349541.0, + "step": 1878 + }, + { + "epoch": 2.343106674984404, + "grad_norm": 0.11472447341513481, + "learning_rate": 2.0903038990182684e-06, + "loss": 0.0281, + "num_tokens": 150429122.0, + "step": 1879 + }, + { + "epoch": 2.3443543356207113, + "grad_norm": 0.11865849401366725, + "learning_rate": 2.086352516790624e-06, + "loss": 0.0282, + "num_tokens": 150509545.0, + "step": 1880 + }, + { + "epoch": 2.345601996257018, + "grad_norm": 0.11861011227322517, + "learning_rate": 2.082407324537668e-06, + "loss": 0.028, + "num_tokens": 150589757.0, + "step": 1881 + }, + { + "epoch": 2.3468496568933253, + "grad_norm": 0.1151057044148094, + "learning_rate": 2.078468329413223e-06, + "loss": 0.0278, + "num_tokens": 150671338.0, + "step": 1882 + }, + { + "epoch": 2.348097317529632, + "grad_norm": 0.10255007625515707, + "learning_rate": 2.07453553855988e-06, + "loss": 0.0264, + "num_tokens": 150751508.0, + "step": 1883 + }, + { + "epoch": 2.349344978165939, + "grad_norm": 0.11660533332586445, + "learning_rate": 2.0706089591089785e-06, + "loss": 0.0282, + "num_tokens": 150832075.0, + "step": 1884 + }, + { + "epoch": 2.3505926388022456, + "grad_norm": 0.1155557180421997, + "learning_rate": 2.0666885981805916e-06, + "loss": 0.0273, + "num_tokens": 150912548.0, + "step": 1885 + }, + { + "epoch": 2.3518402994385528, + "grad_norm": 0.11188233909320953, + "learning_rate": 2.0627744628835196e-06, + "loss": 0.0278, + "num_tokens": 150991832.0, + "step": 1886 + }, + { + "epoch": 2.3530879600748595, + "grad_norm": 0.11350157709518613, + "learning_rate": 2.058866560315273e-06, + "loss": 0.0272, + "num_tokens": 151071514.0, + "step": 1887 + }, + { + "epoch": 2.3543356207111668, + "grad_norm": 0.11410767768820493, + "learning_rate": 2.054964897562061e-06, + "loss": 0.0276, + "num_tokens": 151150714.0, + "step": 1888 + }, + { + "epoch": 2.3555832813474735, + "grad_norm": 0.10872089005911116, + "learning_rate": 2.0510694816987724e-06, + "loss": 0.0279, + "num_tokens": 151231361.0, + "step": 1889 + }, + { + "epoch": 2.3568309419837803, + "grad_norm": 0.1196042703091023, + "learning_rate": 2.047180319788981e-06, + "loss": 0.0276, + "num_tokens": 151311684.0, + "step": 1890 + }, + { + "epoch": 2.3580786026200875, + "grad_norm": 0.11841309111635819, + "learning_rate": 2.0432974188849103e-06, + "loss": 0.0275, + "num_tokens": 151392783.0, + "step": 1891 + }, + { + "epoch": 2.3593262632563943, + "grad_norm": 0.11255039160218248, + "learning_rate": 2.0394207860274304e-06, + "loss": 0.0277, + "num_tokens": 151472580.0, + "step": 1892 + }, + { + "epoch": 2.360573923892701, + "grad_norm": 0.10746347740509427, + "learning_rate": 2.035550428246053e-06, + "loss": 0.0272, + "num_tokens": 151552266.0, + "step": 1893 + }, + { + "epoch": 2.3618215845290083, + "grad_norm": 0.11470087718795037, + "learning_rate": 2.0316863525589037e-06, + "loss": 0.0279, + "num_tokens": 151631911.0, + "step": 1894 + }, + { + "epoch": 2.363069245165315, + "grad_norm": 0.12367004565272316, + "learning_rate": 2.0278285659727187e-06, + "loss": 0.0273, + "num_tokens": 151710855.0, + "step": 1895 + }, + { + "epoch": 2.364316905801622, + "grad_norm": 0.12984514692522087, + "learning_rate": 2.023977075482833e-06, + "loss": 0.0307, + "num_tokens": 151793755.0, + "step": 1896 + }, + { + "epoch": 2.365564566437929, + "grad_norm": 0.12148820532988883, + "learning_rate": 2.0201318880731633e-06, + "loss": 0.0283, + "num_tokens": 151875625.0, + "step": 1897 + }, + { + "epoch": 2.3668122270742358, + "grad_norm": 0.11585673908956987, + "learning_rate": 2.0162930107161963e-06, + "loss": 0.0273, + "num_tokens": 151955662.0, + "step": 1898 + }, + { + "epoch": 2.3680598877105425, + "grad_norm": 0.12778131669298992, + "learning_rate": 2.012460450372976e-06, + "loss": 0.0296, + "num_tokens": 152037665.0, + "step": 1899 + }, + { + "epoch": 2.3693075483468498, + "grad_norm": 0.12449191375414341, + "learning_rate": 2.0086342139930932e-06, + "loss": 0.031, + "num_tokens": 152116842.0, + "step": 1900 + }, + { + "epoch": 2.3705552089831565, + "grad_norm": 0.12060369090810642, + "learning_rate": 2.004814308514671e-06, + "loss": 0.0287, + "num_tokens": 152196571.0, + "step": 1901 + }, + { + "epoch": 2.3718028696194633, + "grad_norm": 0.12138740721784497, + "learning_rate": 2.001000740864353e-06, + "loss": 0.0276, + "num_tokens": 152276228.0, + "step": 1902 + }, + { + "epoch": 2.3730505302557705, + "grad_norm": 0.10235549260740247, + "learning_rate": 1.9971935179572893e-06, + "loss": 0.0268, + "num_tokens": 152355915.0, + "step": 1903 + }, + { + "epoch": 2.3742981908920773, + "grad_norm": 0.1156977150303195, + "learning_rate": 1.993392646697127e-06, + "loss": 0.0276, + "num_tokens": 152435603.0, + "step": 1904 + }, + { + "epoch": 2.3755458515283845, + "grad_norm": 0.1134522871431333, + "learning_rate": 1.9895981339759927e-06, + "loss": 0.0276, + "num_tokens": 152516244.0, + "step": 1905 + }, + { + "epoch": 2.3767935121646913, + "grad_norm": 0.116802365651326, + "learning_rate": 1.985809986674487e-06, + "loss": 0.0281, + "num_tokens": 152595457.0, + "step": 1906 + }, + { + "epoch": 2.378041172800998, + "grad_norm": 0.1162946923631464, + "learning_rate": 1.982028211661665e-06, + "loss": 0.0275, + "num_tokens": 152675867.0, + "step": 1907 + }, + { + "epoch": 2.3792888334373052, + "grad_norm": 0.12149058207102886, + "learning_rate": 1.9782528157950266e-06, + "loss": 0.0285, + "num_tokens": 152756569.0, + "step": 1908 + }, + { + "epoch": 2.380536494073612, + "grad_norm": 0.1201975730835599, + "learning_rate": 1.974483805920508e-06, + "loss": 0.0285, + "num_tokens": 152836657.0, + "step": 1909 + }, + { + "epoch": 2.3817841547099188, + "grad_norm": 0.11308003280924744, + "learning_rate": 1.970721188872461e-06, + "loss": 0.0283, + "num_tokens": 152918160.0, + "step": 1910 + }, + { + "epoch": 2.383031815346226, + "grad_norm": 0.10654932419802848, + "learning_rate": 1.966964971473649e-06, + "loss": 0.0264, + "num_tokens": 153002339.0, + "step": 1911 + }, + { + "epoch": 2.3842794759825328, + "grad_norm": 0.10620407558672333, + "learning_rate": 1.9632151605352296e-06, + "loss": 0.0279, + "num_tokens": 153081791.0, + "step": 1912 + }, + { + "epoch": 2.3855271366188395, + "grad_norm": 0.11368468414357902, + "learning_rate": 1.9594717628567432e-06, + "loss": 0.0274, + "num_tokens": 153162006.0, + "step": 1913 + }, + { + "epoch": 2.3867747972551467, + "grad_norm": 0.11000047901719937, + "learning_rate": 1.9557347852261007e-06, + "loss": 0.0274, + "num_tokens": 153242632.0, + "step": 1914 + }, + { + "epoch": 2.3880224578914535, + "grad_norm": 0.1089512194404227, + "learning_rate": 1.9520042344195727e-06, + "loss": 0.0272, + "num_tokens": 153323418.0, + "step": 1915 + }, + { + "epoch": 2.3892701185277603, + "grad_norm": 0.10861808273136546, + "learning_rate": 1.9482801172017758e-06, + "loss": 0.0266, + "num_tokens": 153403974.0, + "step": 1916 + }, + { + "epoch": 2.3905177791640675, + "grad_norm": 0.11473505086721252, + "learning_rate": 1.9445624403256576e-06, + "loss": 0.0267, + "num_tokens": 153483141.0, + "step": 1917 + }, + { + "epoch": 2.3917654398003743, + "grad_norm": 0.10654799873398078, + "learning_rate": 1.940851210532493e-06, + "loss": 0.0279, + "num_tokens": 153563318.0, + "step": 1918 + }, + { + "epoch": 2.393013100436681, + "grad_norm": 0.10839257143529775, + "learning_rate": 1.937146434551863e-06, + "loss": 0.0269, + "num_tokens": 153643226.0, + "step": 1919 + }, + { + "epoch": 2.3942607610729882, + "grad_norm": 0.10860934191124491, + "learning_rate": 1.933448119101644e-06, + "loss": 0.0286, + "num_tokens": 153722527.0, + "step": 1920 + }, + { + "epoch": 2.395508421709295, + "grad_norm": 0.10443243666752638, + "learning_rate": 1.929756270888003e-06, + "loss": 0.0265, + "num_tokens": 153802225.0, + "step": 1921 + }, + { + "epoch": 2.3967560823456022, + "grad_norm": 0.10427755534598981, + "learning_rate": 1.9260708966053744e-06, + "loss": 0.0271, + "num_tokens": 153881820.0, + "step": 1922 + }, + { + "epoch": 2.398003742981909, + "grad_norm": 0.1177864618305411, + "learning_rate": 1.9223920029364555e-06, + "loss": 0.0278, + "num_tokens": 153961515.0, + "step": 1923 + }, + { + "epoch": 2.3992514036182158, + "grad_norm": 0.11765676375572733, + "learning_rate": 1.9187195965521934e-06, + "loss": 0.028, + "num_tokens": 154041531.0, + "step": 1924 + }, + { + "epoch": 2.4004990642545225, + "grad_norm": 0.10500942947449511, + "learning_rate": 1.9150536841117713e-06, + "loss": 0.027, + "num_tokens": 154121639.0, + "step": 1925 + }, + { + "epoch": 2.4017467248908297, + "grad_norm": 0.12216098631005727, + "learning_rate": 1.911394272262595e-06, + "loss": 0.0298, + "num_tokens": 154203019.0, + "step": 1926 + }, + { + "epoch": 2.4029943855271365, + "grad_norm": 0.10777756704694372, + "learning_rate": 1.907741367640286e-06, + "loss": 0.0273, + "num_tokens": 154283769.0, + "step": 1927 + }, + { + "epoch": 2.4042420461634437, + "grad_norm": 0.12546585298956595, + "learning_rate": 1.9040949768686646e-06, + "loss": 0.0282, + "num_tokens": 154364349.0, + "step": 1928 + }, + { + "epoch": 2.4054897067997505, + "grad_norm": 0.11144911675809406, + "learning_rate": 1.900455106559737e-06, + "loss": 0.0275, + "num_tokens": 154444248.0, + "step": 1929 + }, + { + "epoch": 2.4067373674360573, + "grad_norm": 0.10425862497450274, + "learning_rate": 1.8968217633136909e-06, + "loss": 0.0266, + "num_tokens": 154523672.0, + "step": 1930 + }, + { + "epoch": 2.4079850280723645, + "grad_norm": 0.10633358249196864, + "learning_rate": 1.893194953718875e-06, + "loss": 0.0267, + "num_tokens": 154603383.0, + "step": 1931 + }, + { + "epoch": 2.4092326887086712, + "grad_norm": 0.11025002525121674, + "learning_rate": 1.8895746843517892e-06, + "loss": 0.0273, + "num_tokens": 154683826.0, + "step": 1932 + }, + { + "epoch": 2.410480349344978, + "grad_norm": 0.12840305730755552, + "learning_rate": 1.8859609617770786e-06, + "loss": 0.0436, + "num_tokens": 154764256.0, + "step": 1933 + }, + { + "epoch": 2.4117280099812852, + "grad_norm": 0.10790452835307801, + "learning_rate": 1.8823537925475143e-06, + "loss": 0.0272, + "num_tokens": 154843472.0, + "step": 1934 + }, + { + "epoch": 2.412975670617592, + "grad_norm": 0.10970134188031404, + "learning_rate": 1.8787531832039846e-06, + "loss": 0.0278, + "num_tokens": 154923415.0, + "step": 1935 + }, + { + "epoch": 2.4142233312538988, + "grad_norm": 0.10844976031889222, + "learning_rate": 1.8751591402754802e-06, + "loss": 0.0271, + "num_tokens": 155001644.0, + "step": 1936 + }, + { + "epoch": 2.415470991890206, + "grad_norm": 0.10790804890822218, + "learning_rate": 1.8715716702790903e-06, + "loss": 0.0278, + "num_tokens": 155082689.0, + "step": 1937 + }, + { + "epoch": 2.4167186525265127, + "grad_norm": 0.10777373189548671, + "learning_rate": 1.8679907797199798e-06, + "loss": 0.0271, + "num_tokens": 155161299.0, + "step": 1938 + }, + { + "epoch": 2.41796631316282, + "grad_norm": 0.10873821982168631, + "learning_rate": 1.8644164750913868e-06, + "loss": 0.0274, + "num_tokens": 155240482.0, + "step": 1939 + }, + { + "epoch": 2.4192139737991267, + "grad_norm": 0.10820841964710465, + "learning_rate": 1.8608487628746072e-06, + "loss": 0.0272, + "num_tokens": 155320020.0, + "step": 1940 + }, + { + "epoch": 2.4204616344354335, + "grad_norm": 0.11112717770569587, + "learning_rate": 1.8572876495389808e-06, + "loss": 0.0272, + "num_tokens": 155400618.0, + "step": 1941 + }, + { + "epoch": 2.4217092950717403, + "grad_norm": 0.11217938609690513, + "learning_rate": 1.8537331415418802e-06, + "loss": 0.0274, + "num_tokens": 155480528.0, + "step": 1942 + }, + { + "epoch": 2.4229569557080475, + "grad_norm": 0.11097473906028339, + "learning_rate": 1.8501852453287056e-06, + "loss": 0.0272, + "num_tokens": 155562620.0, + "step": 1943 + }, + { + "epoch": 2.4242046163443542, + "grad_norm": 0.10818902006610549, + "learning_rate": 1.846643967332865e-06, + "loss": 0.0274, + "num_tokens": 155641589.0, + "step": 1944 + }, + { + "epoch": 2.4254522769806615, + "grad_norm": 0.11177617772827625, + "learning_rate": 1.8431093139757635e-06, + "loss": 0.028, + "num_tokens": 155723189.0, + "step": 1945 + }, + { + "epoch": 2.4266999376169682, + "grad_norm": 0.11904079761014202, + "learning_rate": 1.8395812916667974e-06, + "loss": 0.0276, + "num_tokens": 155802507.0, + "step": 1946 + }, + { + "epoch": 2.427947598253275, + "grad_norm": 0.10796895127972975, + "learning_rate": 1.836059906803339e-06, + "loss": 0.0275, + "num_tokens": 155882396.0, + "step": 1947 + }, + { + "epoch": 2.429195258889582, + "grad_norm": 0.11670361221308223, + "learning_rate": 1.832545165770721e-06, + "loss": 0.0283, + "num_tokens": 155962352.0, + "step": 1948 + }, + { + "epoch": 2.430442919525889, + "grad_norm": 0.11461518816560916, + "learning_rate": 1.8290370749422327e-06, + "loss": 0.0279, + "num_tokens": 156043890.0, + "step": 1949 + }, + { + "epoch": 2.4316905801621957, + "grad_norm": 0.10679786955474743, + "learning_rate": 1.8255356406791036e-06, + "loss": 0.0272, + "num_tokens": 156123445.0, + "step": 1950 + }, + { + "epoch": 2.432938240798503, + "grad_norm": 0.10688075962587382, + "learning_rate": 1.82204086933049e-06, + "loss": 0.0268, + "num_tokens": 156203255.0, + "step": 1951 + }, + { + "epoch": 2.4341859014348097, + "grad_norm": 0.11372182933826729, + "learning_rate": 1.8185527672334712e-06, + "loss": 0.0265, + "num_tokens": 156282984.0, + "step": 1952 + }, + { + "epoch": 2.4354335620711165, + "grad_norm": 0.11467488601047325, + "learning_rate": 1.8150713407130283e-06, + "loss": 0.0278, + "num_tokens": 156362864.0, + "step": 1953 + }, + { + "epoch": 2.4366812227074237, + "grad_norm": 0.11881203317305625, + "learning_rate": 1.8115965960820414e-06, + "loss": 0.0284, + "num_tokens": 156443925.0, + "step": 1954 + }, + { + "epoch": 2.4379288833437305, + "grad_norm": 0.11291508916496522, + "learning_rate": 1.8081285396412738e-06, + "loss": 0.0275, + "num_tokens": 156526214.0, + "step": 1955 + }, + { + "epoch": 2.4391765439800372, + "grad_norm": 0.1127357370742545, + "learning_rate": 1.8046671776793584e-06, + "loss": 0.028, + "num_tokens": 156606671.0, + "step": 1956 + }, + { + "epoch": 2.4404242046163445, + "grad_norm": 0.11312286561244898, + "learning_rate": 1.80121251647279e-06, + "loss": 0.0274, + "num_tokens": 156686884.0, + "step": 1957 + }, + { + "epoch": 2.4416718652526512, + "grad_norm": 0.11771902116431116, + "learning_rate": 1.7977645622859157e-06, + "loss": 0.0285, + "num_tokens": 156767153.0, + "step": 1958 + }, + { + "epoch": 2.442919525888958, + "grad_norm": 0.11286969218942611, + "learning_rate": 1.7943233213709173e-06, + "loss": 0.0276, + "num_tokens": 156848405.0, + "step": 1959 + }, + { + "epoch": 2.444167186525265, + "grad_norm": 0.10775615689943846, + "learning_rate": 1.7908887999678046e-06, + "loss": 0.0269, + "num_tokens": 156927640.0, + "step": 1960 + }, + { + "epoch": 2.445414847161572, + "grad_norm": 0.12023464215065506, + "learning_rate": 1.7874610043044027e-06, + "loss": 0.0288, + "num_tokens": 157009867.0, + "step": 1961 + }, + { + "epoch": 2.446662507797879, + "grad_norm": 0.1109760515355431, + "learning_rate": 1.7840399405963432e-06, + "loss": 0.027, + "num_tokens": 157090040.0, + "step": 1962 + }, + { + "epoch": 2.447910168434186, + "grad_norm": 0.11510300631971997, + "learning_rate": 1.7806256150470472e-06, + "loss": 0.0278, + "num_tokens": 157171270.0, + "step": 1963 + }, + { + "epoch": 2.4491578290704927, + "grad_norm": 0.113356826538854, + "learning_rate": 1.7772180338477173e-06, + "loss": 0.0269, + "num_tokens": 157250589.0, + "step": 1964 + }, + { + "epoch": 2.4504054897068, + "grad_norm": 0.11020161557209646, + "learning_rate": 1.7738172031773322e-06, + "loss": 0.0272, + "num_tokens": 157332005.0, + "step": 1965 + }, + { + "epoch": 2.4516531503431067, + "grad_norm": 0.10920573621039224, + "learning_rate": 1.7704231292026219e-06, + "loss": 0.0274, + "num_tokens": 157412201.0, + "step": 1966 + }, + { + "epoch": 2.4529008109794135, + "grad_norm": 0.11239699652286433, + "learning_rate": 1.76703581807807e-06, + "loss": 0.0272, + "num_tokens": 157493943.0, + "step": 1967 + }, + { + "epoch": 2.4541484716157207, + "grad_norm": 0.1082221654475512, + "learning_rate": 1.7636552759458963e-06, + "loss": 0.0269, + "num_tokens": 157572738.0, + "step": 1968 + }, + { + "epoch": 2.4553961322520275, + "grad_norm": 0.11287162279835761, + "learning_rate": 1.760281508936045e-06, + "loss": 0.0274, + "num_tokens": 157653339.0, + "step": 1969 + }, + { + "epoch": 2.4566437928883342, + "grad_norm": 0.11440457618886876, + "learning_rate": 1.7569145231661738e-06, + "loss": 0.028, + "num_tokens": 157734700.0, + "step": 1970 + }, + { + "epoch": 2.4578914535246414, + "grad_norm": 0.11118819360031555, + "learning_rate": 1.753554324741648e-06, + "loss": 0.0272, + "num_tokens": 157815324.0, + "step": 1971 + }, + { + "epoch": 2.459139114160948, + "grad_norm": 0.11370081079972753, + "learning_rate": 1.7502009197555215e-06, + "loss": 0.0287, + "num_tokens": 157896290.0, + "step": 1972 + }, + { + "epoch": 2.460386774797255, + "grad_norm": 0.13188166291275744, + "learning_rate": 1.7468543142885308e-06, + "loss": 0.0291, + "num_tokens": 157978183.0, + "step": 1973 + }, + { + "epoch": 2.461634435433562, + "grad_norm": 0.10901400798519152, + "learning_rate": 1.7435145144090852e-06, + "loss": 0.0281, + "num_tokens": 158056882.0, + "step": 1974 + }, + { + "epoch": 2.462882096069869, + "grad_norm": 0.10230118141977014, + "learning_rate": 1.740181526173248e-06, + "loss": 0.0271, + "num_tokens": 158136794.0, + "step": 1975 + }, + { + "epoch": 2.4641297567061757, + "grad_norm": 0.11403183233637615, + "learning_rate": 1.736855355624737e-06, + "loss": 0.0278, + "num_tokens": 158216836.0, + "step": 1976 + }, + { + "epoch": 2.465377417342483, + "grad_norm": 0.10903076514079771, + "learning_rate": 1.7335360087949048e-06, + "loss": 0.0275, + "num_tokens": 158297798.0, + "step": 1977 + }, + { + "epoch": 2.4666250779787897, + "grad_norm": 0.10466959609108883, + "learning_rate": 1.73022349170273e-06, + "loss": 0.0271, + "num_tokens": 158378165.0, + "step": 1978 + }, + { + "epoch": 2.467872738615097, + "grad_norm": 0.11147187512684335, + "learning_rate": 1.7269178103548057e-06, + "loss": 0.0271, + "num_tokens": 158459178.0, + "step": 1979 + }, + { + "epoch": 2.4691203992514037, + "grad_norm": 0.11409682525660257, + "learning_rate": 1.723618970745334e-06, + "loss": 0.0281, + "num_tokens": 158540418.0, + "step": 1980 + }, + { + "epoch": 2.4703680598877105, + "grad_norm": 0.10477788112973571, + "learning_rate": 1.7203269788561067e-06, + "loss": 0.027, + "num_tokens": 158619445.0, + "step": 1981 + }, + { + "epoch": 2.4716157205240172, + "grad_norm": 0.1074788211668467, + "learning_rate": 1.7170418406564982e-06, + "loss": 0.027, + "num_tokens": 158700296.0, + "step": 1982 + }, + { + "epoch": 2.4728633811603244, + "grad_norm": 0.11129094430099397, + "learning_rate": 1.7137635621034614e-06, + "loss": 0.0277, + "num_tokens": 158780253.0, + "step": 1983 + }, + { + "epoch": 2.474111041796631, + "grad_norm": 0.10164355680147183, + "learning_rate": 1.7104921491415038e-06, + "loss": 0.0268, + "num_tokens": 158860384.0, + "step": 1984 + }, + { + "epoch": 2.4753587024329384, + "grad_norm": 0.11686566136668367, + "learning_rate": 1.7072276077026856e-06, + "loss": 0.0283, + "num_tokens": 158939479.0, + "step": 1985 + }, + { + "epoch": 2.476606363069245, + "grad_norm": 0.11938137497936563, + "learning_rate": 1.7039699437066076e-06, + "loss": 0.0274, + "num_tokens": 159021019.0, + "step": 1986 + }, + { + "epoch": 2.477854023705552, + "grad_norm": 0.110769383334913, + "learning_rate": 1.7007191630604003e-06, + "loss": 0.0269, + "num_tokens": 159100911.0, + "step": 1987 + }, + { + "epoch": 2.479101684341859, + "grad_norm": 0.1066179656774663, + "learning_rate": 1.6974752716587092e-06, + "loss": 0.0276, + "num_tokens": 159180930.0, + "step": 1988 + }, + { + "epoch": 2.480349344978166, + "grad_norm": 0.12059952000176534, + "learning_rate": 1.6942382753836912e-06, + "loss": 0.0287, + "num_tokens": 159260601.0, + "step": 1989 + }, + { + "epoch": 2.4815970056144727, + "grad_norm": 0.11227342940000516, + "learning_rate": 1.691008180105e-06, + "loss": 0.0273, + "num_tokens": 159340555.0, + "step": 1990 + }, + { + "epoch": 2.48284466625078, + "grad_norm": 0.1098479649477286, + "learning_rate": 1.6877849916797728e-06, + "loss": 0.0288, + "num_tokens": 159420038.0, + "step": 1991 + }, + { + "epoch": 2.4840923268870867, + "grad_norm": 0.1111828471383589, + "learning_rate": 1.684568715952626e-06, + "loss": 0.0277, + "num_tokens": 159499863.0, + "step": 1992 + }, + { + "epoch": 2.4853399875233935, + "grad_norm": 0.11053778696407449, + "learning_rate": 1.6813593587556392e-06, + "loss": 0.0272, + "num_tokens": 159579187.0, + "step": 1993 + }, + { + "epoch": 2.4865876481597007, + "grad_norm": 0.11439406236806961, + "learning_rate": 1.6781569259083463e-06, + "loss": 0.0282, + "num_tokens": 159659711.0, + "step": 1994 + }, + { + "epoch": 2.4878353087960074, + "grad_norm": 0.10690423131408978, + "learning_rate": 1.6749614232177273e-06, + "loss": 0.0281, + "num_tokens": 159740822.0, + "step": 1995 + }, + { + "epoch": 2.489082969432314, + "grad_norm": 0.11272934482085559, + "learning_rate": 1.6717728564781927e-06, + "loss": 0.0277, + "num_tokens": 159820583.0, + "step": 1996 + }, + { + "epoch": 2.4903306300686214, + "grad_norm": 0.11712009141925543, + "learning_rate": 1.6685912314715797e-06, + "loss": 0.0276, + "num_tokens": 159901927.0, + "step": 1997 + }, + { + "epoch": 2.491578290704928, + "grad_norm": 0.11201642856684556, + "learning_rate": 1.6654165539671342e-06, + "loss": 0.0273, + "num_tokens": 159982551.0, + "step": 1998 + }, + { + "epoch": 2.492825951341235, + "grad_norm": 0.12217007620784424, + "learning_rate": 1.6622488297215079e-06, + "loss": 0.0281, + "num_tokens": 160063000.0, + "step": 1999 + }, + { + "epoch": 2.494073611977542, + "grad_norm": 0.10737268262091881, + "learning_rate": 1.6590880644787407e-06, + "loss": 0.0268, + "num_tokens": 160142294.0, + "step": 2000 + }, + { + "epoch": 2.495321272613849, + "grad_norm": 0.1250676932249431, + "learning_rate": 1.6559342639702563e-06, + "loss": 0.0306, + "num_tokens": 160220912.0, + "step": 2001 + }, + { + "epoch": 2.496568933250156, + "grad_norm": 0.11211010682205132, + "learning_rate": 1.6527874339148484e-06, + "loss": 0.0264, + "num_tokens": 160301952.0, + "step": 2002 + }, + { + "epoch": 2.497816593886463, + "grad_norm": 0.11786513203484543, + "learning_rate": 1.6496475800186702e-06, + "loss": 0.0281, + "num_tokens": 160381913.0, + "step": 2003 + }, + { + "epoch": 2.4990642545227697, + "grad_norm": 0.10893692645197386, + "learning_rate": 1.6465147079752264e-06, + "loss": 0.0275, + "num_tokens": 160462281.0, + "step": 2004 + }, + { + "epoch": 2.5003119151590765, + "grad_norm": 0.11526335706838876, + "learning_rate": 1.6433888234653614e-06, + "loss": 0.0278, + "num_tokens": 160542998.0, + "step": 2005 + }, + { + "epoch": 2.5015595757953837, + "grad_norm": 0.1094817219693971, + "learning_rate": 1.6402699321572485e-06, + "loss": 0.0267, + "num_tokens": 160621811.0, + "step": 2006 + }, + { + "epoch": 2.5028072364316905, + "grad_norm": 0.11249905162283616, + "learning_rate": 1.6371580397063788e-06, + "loss": 0.0277, + "num_tokens": 160702177.0, + "step": 2007 + }, + { + "epoch": 2.5040548970679977, + "grad_norm": 0.1218621001840924, + "learning_rate": 1.6340531517555563e-06, + "loss": 0.0294, + "num_tokens": 160783219.0, + "step": 2008 + }, + { + "epoch": 2.5053025577043044, + "grad_norm": 0.10986240798677817, + "learning_rate": 1.6309552739348804e-06, + "loss": 0.0268, + "num_tokens": 160862773.0, + "step": 2009 + }, + { + "epoch": 2.506550218340611, + "grad_norm": 0.1207523920480209, + "learning_rate": 1.6278644118617375e-06, + "loss": 0.0275, + "num_tokens": 160942474.0, + "step": 2010 + }, + { + "epoch": 2.5077978789769184, + "grad_norm": 0.11803187004719243, + "learning_rate": 1.6247805711407993e-06, + "loss": 0.0279, + "num_tokens": 161021964.0, + "step": 2011 + }, + { + "epoch": 2.509045539613225, + "grad_norm": 0.11522862481685167, + "learning_rate": 1.6217037573639983e-06, + "loss": 0.0278, + "num_tokens": 161101620.0, + "step": 2012 + }, + { + "epoch": 2.5102932002495324, + "grad_norm": 0.11566448225959479, + "learning_rate": 1.6186339761105275e-06, + "loss": 0.0288, + "num_tokens": 161180903.0, + "step": 2013 + }, + { + "epoch": 2.511540860885839, + "grad_norm": 0.10653159823487614, + "learning_rate": 1.6155712329468305e-06, + "loss": 0.0267, + "num_tokens": 161260133.0, + "step": 2014 + }, + { + "epoch": 2.512788521522146, + "grad_norm": 0.11974291187007381, + "learning_rate": 1.6125155334265846e-06, + "loss": 0.0289, + "num_tokens": 161340599.0, + "step": 2015 + }, + { + "epoch": 2.5140361821584527, + "grad_norm": 0.10950555678047849, + "learning_rate": 1.6094668830906959e-06, + "loss": 0.0281, + "num_tokens": 161421054.0, + "step": 2016 + }, + { + "epoch": 2.51528384279476, + "grad_norm": 0.11950555449558682, + "learning_rate": 1.6064252874672904e-06, + "loss": 0.028, + "num_tokens": 161501219.0, + "step": 2017 + }, + { + "epoch": 2.5165315034310667, + "grad_norm": 0.11235098261736175, + "learning_rate": 1.6033907520717008e-06, + "loss": 0.0274, + "num_tokens": 161580744.0, + "step": 2018 + }, + { + "epoch": 2.517779164067374, + "grad_norm": 0.1093365486795711, + "learning_rate": 1.6003632824064553e-06, + "loss": 0.0267, + "num_tokens": 161660539.0, + "step": 2019 + }, + { + "epoch": 2.5190268247036807, + "grad_norm": 0.11976522826322118, + "learning_rate": 1.5973428839612727e-06, + "loss": 0.028, + "num_tokens": 161741138.0, + "step": 2020 + }, + { + "epoch": 2.5202744853399874, + "grad_norm": 0.1156287357078057, + "learning_rate": 1.5943295622130483e-06, + "loss": 0.028, + "num_tokens": 161821837.0, + "step": 2021 + }, + { + "epoch": 2.521522145976294, + "grad_norm": 0.11441831410599412, + "learning_rate": 1.5913233226258437e-06, + "loss": 0.0271, + "num_tokens": 161900788.0, + "step": 2022 + }, + { + "epoch": 2.5227698066126014, + "grad_norm": 0.1112970332304599, + "learning_rate": 1.5883241706508823e-06, + "loss": 0.0274, + "num_tokens": 161980892.0, + "step": 2023 + }, + { + "epoch": 2.524017467248908, + "grad_norm": 0.12122029274976619, + "learning_rate": 1.5853321117265317e-06, + "loss": 0.0278, + "num_tokens": 162060107.0, + "step": 2024 + }, + { + "epoch": 2.5252651278852154, + "grad_norm": 0.1129961042647627, + "learning_rate": 1.5823471512782983e-06, + "loss": 0.0276, + "num_tokens": 162140320.0, + "step": 2025 + }, + { + "epoch": 2.526512788521522, + "grad_norm": 0.10694528173242357, + "learning_rate": 1.579369294718819e-06, + "loss": 0.0273, + "num_tokens": 162220263.0, + "step": 2026 + }, + { + "epoch": 2.527760449157829, + "grad_norm": 0.10649726142962046, + "learning_rate": 1.5763985474478483e-06, + "loss": 0.0265, + "num_tokens": 162301940.0, + "step": 2027 + }, + { + "epoch": 2.529008109794136, + "grad_norm": 0.11052731141682264, + "learning_rate": 1.5734349148522471e-06, + "loss": 0.0266, + "num_tokens": 162381737.0, + "step": 2028 + }, + { + "epoch": 2.530255770430443, + "grad_norm": 0.11827410827331811, + "learning_rate": 1.5704784023059788e-06, + "loss": 0.0283, + "num_tokens": 162461425.0, + "step": 2029 + }, + { + "epoch": 2.5315034310667497, + "grad_norm": 0.11322497252667106, + "learning_rate": 1.5675290151700937e-06, + "loss": 0.0272, + "num_tokens": 162541913.0, + "step": 2030 + }, + { + "epoch": 2.532751091703057, + "grad_norm": 0.1169346308768019, + "learning_rate": 1.5645867587927208e-06, + "loss": 0.0276, + "num_tokens": 162623234.0, + "step": 2031 + }, + { + "epoch": 2.5339987523393637, + "grad_norm": 0.12004814618546314, + "learning_rate": 1.561651638509062e-06, + "loss": 0.0274, + "num_tokens": 162702087.0, + "step": 2032 + }, + { + "epoch": 2.5352464129756704, + "grad_norm": 0.10868378154550547, + "learning_rate": 1.5587236596413773e-06, + "loss": 0.0276, + "num_tokens": 162782960.0, + "step": 2033 + }, + { + "epoch": 2.5364940736119777, + "grad_norm": 0.11068877689549345, + "learning_rate": 1.5558028274989778e-06, + "loss": 0.028, + "num_tokens": 162862837.0, + "step": 2034 + }, + { + "epoch": 2.5377417342482844, + "grad_norm": 0.1438726002320652, + "learning_rate": 1.5528891473782126e-06, + "loss": 0.027, + "num_tokens": 162941638.0, + "step": 2035 + }, + { + "epoch": 2.5389893948845916, + "grad_norm": 0.11403604077645453, + "learning_rate": 1.5499826245624674e-06, + "loss": 0.0285, + "num_tokens": 163021811.0, + "step": 2036 + }, + { + "epoch": 2.5402370555208984, + "grad_norm": 0.11230917197373902, + "learning_rate": 1.547083264322145e-06, + "loss": 0.0278, + "num_tokens": 163101898.0, + "step": 2037 + }, + { + "epoch": 2.541484716157205, + "grad_norm": 0.11526915003925865, + "learning_rate": 1.5441910719146616e-06, + "loss": 0.0273, + "num_tokens": 163182128.0, + "step": 2038 + }, + { + "epoch": 2.542732376793512, + "grad_norm": 0.11514119570048018, + "learning_rate": 1.541306052584437e-06, + "loss": 0.0277, + "num_tokens": 163263096.0, + "step": 2039 + }, + { + "epoch": 2.543980037429819, + "grad_norm": 0.11954122082354755, + "learning_rate": 1.5384282115628834e-06, + "loss": 0.0275, + "num_tokens": 163343548.0, + "step": 2040 + }, + { + "epoch": 2.545227698066126, + "grad_norm": 0.11066461352482032, + "learning_rate": 1.5355575540683953e-06, + "loss": 0.0278, + "num_tokens": 163423064.0, + "step": 2041 + }, + { + "epoch": 2.546475358702433, + "grad_norm": 0.11137639959635122, + "learning_rate": 1.5326940853063443e-06, + "loss": 0.0282, + "num_tokens": 163502397.0, + "step": 2042 + }, + { + "epoch": 2.54772301933874, + "grad_norm": 0.11324148029107538, + "learning_rate": 1.5298378104690636e-06, + "loss": 0.027, + "num_tokens": 163581986.0, + "step": 2043 + }, + { + "epoch": 2.5489706799750467, + "grad_norm": 0.12329020795643553, + "learning_rate": 1.5269887347358414e-06, + "loss": 0.0279, + "num_tokens": 163661772.0, + "step": 2044 + }, + { + "epoch": 2.5502183406113534, + "grad_norm": 0.11352719316396652, + "learning_rate": 1.5241468632729161e-06, + "loss": 0.028, + "num_tokens": 163742085.0, + "step": 2045 + }, + { + "epoch": 2.5514660012476607, + "grad_norm": 0.11431548220101798, + "learning_rate": 1.5213122012334572e-06, + "loss": 0.0277, + "num_tokens": 163823576.0, + "step": 2046 + }, + { + "epoch": 2.5527136618839674, + "grad_norm": 0.1151041171348859, + "learning_rate": 1.5184847537575647e-06, + "loss": 0.0273, + "num_tokens": 163903399.0, + "step": 2047 + }, + { + "epoch": 2.5539613225202746, + "grad_norm": 0.11215314990558678, + "learning_rate": 1.5156645259722565e-06, + "loss": 0.0276, + "num_tokens": 163984511.0, + "step": 2048 + }, + { + "epoch": 2.5552089831565814, + "grad_norm": 0.11132081944460806, + "learning_rate": 1.5128515229914568e-06, + "loss": 0.0288, + "num_tokens": 164064533.0, + "step": 2049 + }, + { + "epoch": 2.556456643792888, + "grad_norm": 0.11240110122489118, + "learning_rate": 1.5100457499159897e-06, + "loss": 0.0276, + "num_tokens": 164145208.0, + "step": 2050 + }, + { + "epoch": 2.5577043044291954, + "grad_norm": 0.12087806227209612, + "learning_rate": 1.507247211833572e-06, + "loss": 0.0276, + "num_tokens": 164226437.0, + "step": 2051 + }, + { + "epoch": 2.558951965065502, + "grad_norm": 0.11949968556805396, + "learning_rate": 1.5044559138187967e-06, + "loss": 0.0273, + "num_tokens": 164306701.0, + "step": 2052 + }, + { + "epoch": 2.5601996257018094, + "grad_norm": 0.10637452873416925, + "learning_rate": 1.5016718609331315e-06, + "loss": 0.0269, + "num_tokens": 164386138.0, + "step": 2053 + }, + { + "epoch": 2.561447286338116, + "grad_norm": 0.10741900874283838, + "learning_rate": 1.4988950582249061e-06, + "loss": 0.0262, + "num_tokens": 164466296.0, + "step": 2054 + }, + { + "epoch": 2.562694946974423, + "grad_norm": 0.10847356735320615, + "learning_rate": 1.4961255107293044e-06, + "loss": 0.0273, + "num_tokens": 164546034.0, + "step": 2055 + }, + { + "epoch": 2.5639426076107297, + "grad_norm": 0.10867775728226359, + "learning_rate": 1.4933632234683506e-06, + "loss": 0.0275, + "num_tokens": 164627152.0, + "step": 2056 + }, + { + "epoch": 2.565190268247037, + "grad_norm": 0.10863086055220095, + "learning_rate": 1.4906082014509088e-06, + "loss": 0.0275, + "num_tokens": 164706832.0, + "step": 2057 + }, + { + "epoch": 2.5664379288833437, + "grad_norm": 0.11004843698519694, + "learning_rate": 1.4878604496726653e-06, + "loss": 0.0281, + "num_tokens": 164786256.0, + "step": 2058 + }, + { + "epoch": 2.567685589519651, + "grad_norm": 0.11568288570947931, + "learning_rate": 1.4851199731161243e-06, + "loss": 0.027, + "num_tokens": 164865074.0, + "step": 2059 + }, + { + "epoch": 2.5689332501559576, + "grad_norm": 0.10186204742079674, + "learning_rate": 1.4823867767505981e-06, + "loss": 0.0265, + "num_tokens": 164943814.0, + "step": 2060 + }, + { + "epoch": 2.5701809107922644, + "grad_norm": 0.10521205156822269, + "learning_rate": 1.4796608655322001e-06, + "loss": 0.0265, + "num_tokens": 165022913.0, + "step": 2061 + }, + { + "epoch": 2.571428571428571, + "grad_norm": 0.11589068913118131, + "learning_rate": 1.476942244403829e-06, + "loss": 0.0289, + "num_tokens": 165102030.0, + "step": 2062 + }, + { + "epoch": 2.5726762320648784, + "grad_norm": 0.11233950248375833, + "learning_rate": 1.4742309182951663e-06, + "loss": 0.0273, + "num_tokens": 165182179.0, + "step": 2063 + }, + { + "epoch": 2.573923892701185, + "grad_norm": 0.11565035317539887, + "learning_rate": 1.4715268921226677e-06, + "loss": 0.0279, + "num_tokens": 165262779.0, + "step": 2064 + }, + { + "epoch": 2.5751715533374924, + "grad_norm": 0.11808315287951174, + "learning_rate": 1.468830170789548e-06, + "loss": 0.0278, + "num_tokens": 165344308.0, + "step": 2065 + }, + { + "epoch": 2.576419213973799, + "grad_norm": 0.11443778145606423, + "learning_rate": 1.4661407591857795e-06, + "loss": 0.0276, + "num_tokens": 165423934.0, + "step": 2066 + }, + { + "epoch": 2.577666874610106, + "grad_norm": 0.11190198300939089, + "learning_rate": 1.4634586621880786e-06, + "loss": 0.0273, + "num_tokens": 165503345.0, + "step": 2067 + }, + { + "epoch": 2.578914535246413, + "grad_norm": 0.11593353577598203, + "learning_rate": 1.4607838846598959e-06, + "loss": 0.0263, + "num_tokens": 165583590.0, + "step": 2068 + }, + { + "epoch": 2.58016219588272, + "grad_norm": 0.11449567615373328, + "learning_rate": 1.4581164314514127e-06, + "loss": 0.0279, + "num_tokens": 165663697.0, + "step": 2069 + }, + { + "epoch": 2.581409856519027, + "grad_norm": 0.10372245010081463, + "learning_rate": 1.4554563073995284e-06, + "loss": 0.027, + "num_tokens": 165744637.0, + "step": 2070 + }, + { + "epoch": 2.582657517155334, + "grad_norm": 0.10739901271925287, + "learning_rate": 1.452803517327852e-06, + "loss": 0.0269, + "num_tokens": 165825845.0, + "step": 2071 + }, + { + "epoch": 2.5839051777916406, + "grad_norm": 0.10419262399020085, + "learning_rate": 1.450158066046692e-06, + "loss": 0.0267, + "num_tokens": 165906115.0, + "step": 2072 + }, + { + "epoch": 2.5851528384279474, + "grad_norm": 0.10832482949706342, + "learning_rate": 1.4475199583530536e-06, + "loss": 0.0272, + "num_tokens": 165986633.0, + "step": 2073 + }, + { + "epoch": 2.5864004990642546, + "grad_norm": 0.11424246115691489, + "learning_rate": 1.444889199030622e-06, + "loss": 0.0284, + "num_tokens": 166066387.0, + "step": 2074 + }, + { + "epoch": 2.5876481597005614, + "grad_norm": 0.11100527216764833, + "learning_rate": 1.4422657928497572e-06, + "loss": 0.0265, + "num_tokens": 166148319.0, + "step": 2075 + }, + { + "epoch": 2.5888958203368686, + "grad_norm": 0.12099284205512863, + "learning_rate": 1.4396497445674917e-06, + "loss": 0.0296, + "num_tokens": 166228531.0, + "step": 2076 + }, + { + "epoch": 2.5901434809731754, + "grad_norm": 0.10891165530512759, + "learning_rate": 1.4370410589275096e-06, + "loss": 0.0279, + "num_tokens": 166307862.0, + "step": 2077 + }, + { + "epoch": 2.591391141609482, + "grad_norm": 0.12679386624360375, + "learning_rate": 1.4344397406601454e-06, + "loss": 0.0293, + "num_tokens": 166388213.0, + "step": 2078 + }, + { + "epoch": 2.592638802245789, + "grad_norm": 0.11178974797840838, + "learning_rate": 1.4318457944823775e-06, + "loss": 0.028, + "num_tokens": 166468608.0, + "step": 2079 + }, + { + "epoch": 2.593886462882096, + "grad_norm": 0.10698791554236746, + "learning_rate": 1.4292592250978137e-06, + "loss": 0.0268, + "num_tokens": 166548055.0, + "step": 2080 + }, + { + "epoch": 2.595134123518403, + "grad_norm": 0.10402086737935964, + "learning_rate": 1.4266800371966844e-06, + "loss": 0.0265, + "num_tokens": 166627914.0, + "step": 2081 + }, + { + "epoch": 2.59638178415471, + "grad_norm": 0.11256053343287431, + "learning_rate": 1.424108235455838e-06, + "loss": 0.0283, + "num_tokens": 166708046.0, + "step": 2082 + }, + { + "epoch": 2.597629444791017, + "grad_norm": 0.11648920137262059, + "learning_rate": 1.4215438245387303e-06, + "loss": 0.0278, + "num_tokens": 166788951.0, + "step": 2083 + }, + { + "epoch": 2.5988771054273236, + "grad_norm": 0.11412389464634487, + "learning_rate": 1.41898680909541e-06, + "loss": 0.0279, + "num_tokens": 166867908.0, + "step": 2084 + }, + { + "epoch": 2.600124766063631, + "grad_norm": 0.14616857946554868, + "learning_rate": 1.4164371937625222e-06, + "loss": 0.0274, + "num_tokens": 166947938.0, + "step": 2085 + }, + { + "epoch": 2.6013724266999376, + "grad_norm": 0.10641921005233325, + "learning_rate": 1.4138949831632879e-06, + "loss": 0.027, + "num_tokens": 167028248.0, + "step": 2086 + }, + { + "epoch": 2.6026200873362444, + "grad_norm": 0.11340157643915183, + "learning_rate": 1.4113601819075037e-06, + "loss": 0.0275, + "num_tokens": 167107712.0, + "step": 2087 + }, + { + "epoch": 2.6038677479725516, + "grad_norm": 0.11403401971381442, + "learning_rate": 1.4088327945915315e-06, + "loss": 0.0268, + "num_tokens": 167188583.0, + "step": 2088 + }, + { + "epoch": 2.6051154086088584, + "grad_norm": 0.10625095668404386, + "learning_rate": 1.4063128257982867e-06, + "loss": 0.0267, + "num_tokens": 167268518.0, + "step": 2089 + }, + { + "epoch": 2.606363069245165, + "grad_norm": 0.1171168338572617, + "learning_rate": 1.4038002800972362e-06, + "loss": 0.0275, + "num_tokens": 167348636.0, + "step": 2090 + }, + { + "epoch": 2.6076107298814724, + "grad_norm": 0.11542335973973553, + "learning_rate": 1.401295162044383e-06, + "loss": 0.0274, + "num_tokens": 167429322.0, + "step": 2091 + }, + { + "epoch": 2.608858390517779, + "grad_norm": 0.11727306920785573, + "learning_rate": 1.3987974761822656e-06, + "loss": 0.0275, + "num_tokens": 167510283.0, + "step": 2092 + }, + { + "epoch": 2.6101060511540863, + "grad_norm": 0.11766850450132074, + "learning_rate": 1.3963072270399411e-06, + "loss": 0.0273, + "num_tokens": 167590184.0, + "step": 2093 + }, + { + "epoch": 2.611353711790393, + "grad_norm": 0.11918576619487399, + "learning_rate": 1.393824419132986e-06, + "loss": 0.0279, + "num_tokens": 167669021.0, + "step": 2094 + }, + { + "epoch": 2.6126013724267, + "grad_norm": 0.11307732162265724, + "learning_rate": 1.3913490569634796e-06, + "loss": 0.0277, + "num_tokens": 167748127.0, + "step": 2095 + }, + { + "epoch": 2.6138490330630066, + "grad_norm": 0.12446227031099523, + "learning_rate": 1.388881145020002e-06, + "loss": 0.0297, + "num_tokens": 167828350.0, + "step": 2096 + }, + { + "epoch": 2.615096693699314, + "grad_norm": 0.11026469374300878, + "learning_rate": 1.3864206877776245e-06, + "loss": 0.0273, + "num_tokens": 167908530.0, + "step": 2097 + }, + { + "epoch": 2.6163443543356206, + "grad_norm": 0.11295221539796105, + "learning_rate": 1.3839676896978997e-06, + "loss": 0.0275, + "num_tokens": 167989313.0, + "step": 2098 + }, + { + "epoch": 2.617592014971928, + "grad_norm": 0.11298609559496968, + "learning_rate": 1.3815221552288541e-06, + "loss": 0.0276, + "num_tokens": 168069559.0, + "step": 2099 + }, + { + "epoch": 2.6188396756082346, + "grad_norm": 0.1141025393450742, + "learning_rate": 1.3790840888049802e-06, + "loss": 0.0272, + "num_tokens": 168148187.0, + "step": 2100 + }, + { + "epoch": 2.6200873362445414, + "grad_norm": 0.11929918883054025, + "learning_rate": 1.3766534948472307e-06, + "loss": 0.0271, + "num_tokens": 168228189.0, + "step": 2101 + }, + { + "epoch": 2.621334996880848, + "grad_norm": 0.11988289797660306, + "learning_rate": 1.3742303777630057e-06, + "loss": 0.0276, + "num_tokens": 168308932.0, + "step": 2102 + }, + { + "epoch": 2.6225826575171554, + "grad_norm": 0.11436560741581663, + "learning_rate": 1.3718147419461497e-06, + "loss": 0.0282, + "num_tokens": 168388848.0, + "step": 2103 + }, + { + "epoch": 2.623830318153462, + "grad_norm": 0.1056496402693744, + "learning_rate": 1.3694065917769414e-06, + "loss": 0.027, + "num_tokens": 168467832.0, + "step": 2104 + }, + { + "epoch": 2.6250779787897693, + "grad_norm": 0.1104079059611424, + "learning_rate": 1.367005931622084e-06, + "loss": 0.0282, + "num_tokens": 168547708.0, + "step": 2105 + }, + { + "epoch": 2.626325639426076, + "grad_norm": 0.10429616151130164, + "learning_rate": 1.3646127658346992e-06, + "loss": 0.0267, + "num_tokens": 168627496.0, + "step": 2106 + }, + { + "epoch": 2.627573300062383, + "grad_norm": 0.11808419996882835, + "learning_rate": 1.3622270987543215e-06, + "loss": 0.0275, + "num_tokens": 168706999.0, + "step": 2107 + }, + { + "epoch": 2.62882096069869, + "grad_norm": 0.11162808996935646, + "learning_rate": 1.3598489347068858e-06, + "loss": 0.0265, + "num_tokens": 168787359.0, + "step": 2108 + }, + { + "epoch": 2.630068621334997, + "grad_norm": 0.11556801052351962, + "learning_rate": 1.357478278004721e-06, + "loss": 0.027, + "num_tokens": 168868083.0, + "step": 2109 + }, + { + "epoch": 2.631316281971304, + "grad_norm": 0.11227131668663762, + "learning_rate": 1.3551151329465462e-06, + "loss": 0.0272, + "num_tokens": 168946701.0, + "step": 2110 + }, + { + "epoch": 2.632563942607611, + "grad_norm": 0.1145171495962, + "learning_rate": 1.3527595038174566e-06, + "loss": 0.0269, + "num_tokens": 169026964.0, + "step": 2111 + }, + { + "epoch": 2.6338116032439176, + "grad_norm": 0.11258767236811885, + "learning_rate": 1.35041139488892e-06, + "loss": 0.0269, + "num_tokens": 169106455.0, + "step": 2112 + }, + { + "epoch": 2.6350592638802244, + "grad_norm": 0.1269169979219692, + "learning_rate": 1.3480708104187685e-06, + "loss": 0.0278, + "num_tokens": 169187782.0, + "step": 2113 + }, + { + "epoch": 2.6363069245165316, + "grad_norm": 0.10759670867627756, + "learning_rate": 1.3457377546511882e-06, + "loss": 0.0274, + "num_tokens": 169268014.0, + "step": 2114 + }, + { + "epoch": 2.6375545851528384, + "grad_norm": 0.10677322152837186, + "learning_rate": 1.3434122318167142e-06, + "loss": 0.0267, + "num_tokens": 169347680.0, + "step": 2115 + }, + { + "epoch": 2.6388022457891456, + "grad_norm": 0.1116462775595202, + "learning_rate": 1.3410942461322236e-06, + "loss": 0.0273, + "num_tokens": 169428596.0, + "step": 2116 + }, + { + "epoch": 2.6400499064254523, + "grad_norm": 0.11029769450411615, + "learning_rate": 1.3387838018009239e-06, + "loss": 0.0267, + "num_tokens": 169509363.0, + "step": 2117 + }, + { + "epoch": 2.641297567061759, + "grad_norm": 0.10789263559722907, + "learning_rate": 1.3364809030123477e-06, + "loss": 0.0265, + "num_tokens": 169589358.0, + "step": 2118 + }, + { + "epoch": 2.642545227698066, + "grad_norm": 0.11544419735850721, + "learning_rate": 1.3341855539423499e-06, + "loss": 0.0272, + "num_tokens": 169669893.0, + "step": 2119 + }, + { + "epoch": 2.643792888334373, + "grad_norm": 0.13240480297882418, + "learning_rate": 1.3318977587530907e-06, + "loss": 0.0441, + "num_tokens": 169750499.0, + "step": 2120 + }, + { + "epoch": 2.64504054897068, + "grad_norm": 0.10788331716169906, + "learning_rate": 1.3296175215930326e-06, + "loss": 0.0266, + "num_tokens": 169829770.0, + "step": 2121 + }, + { + "epoch": 2.646288209606987, + "grad_norm": 0.10324597149645515, + "learning_rate": 1.3273448465969376e-06, + "loss": 0.0273, + "num_tokens": 169909873.0, + "step": 2122 + }, + { + "epoch": 2.647535870243294, + "grad_norm": 0.1083672497231884, + "learning_rate": 1.3250797378858507e-06, + "loss": 0.0273, + "num_tokens": 169990577.0, + "step": 2123 + }, + { + "epoch": 2.6487835308796006, + "grad_norm": 0.10796760309733955, + "learning_rate": 1.3228221995670987e-06, + "loss": 0.0272, + "num_tokens": 170069752.0, + "step": 2124 + }, + { + "epoch": 2.650031191515908, + "grad_norm": 0.11470562508991235, + "learning_rate": 1.3205722357342807e-06, + "loss": 0.0281, + "num_tokens": 170150197.0, + "step": 2125 + }, + { + "epoch": 2.6512788521522146, + "grad_norm": 0.10707306739767271, + "learning_rate": 1.3183298504672626e-06, + "loss": 0.0276, + "num_tokens": 170229547.0, + "step": 2126 + }, + { + "epoch": 2.6525265127885214, + "grad_norm": 0.10919949194770662, + "learning_rate": 1.316095047832166e-06, + "loss": 0.0274, + "num_tokens": 170309378.0, + "step": 2127 + }, + { + "epoch": 2.6537741734248286, + "grad_norm": 0.11957546966930166, + "learning_rate": 1.3138678318813618e-06, + "loss": 0.0285, + "num_tokens": 170389032.0, + "step": 2128 + }, + { + "epoch": 2.6550218340611353, + "grad_norm": 0.11617481255786126, + "learning_rate": 1.3116482066534686e-06, + "loss": 0.027, + "num_tokens": 170468247.0, + "step": 2129 + }, + { + "epoch": 2.656269494697442, + "grad_norm": 0.11357476091195207, + "learning_rate": 1.3094361761733356e-06, + "loss": 0.0275, + "num_tokens": 170548512.0, + "step": 2130 + }, + { + "epoch": 2.6575171553337493, + "grad_norm": 0.10925157715831366, + "learning_rate": 1.3072317444520449e-06, + "loss": 0.027, + "num_tokens": 170630459.0, + "step": 2131 + }, + { + "epoch": 2.658764815970056, + "grad_norm": 0.1169285409735411, + "learning_rate": 1.3050349154868946e-06, + "loss": 0.037, + "num_tokens": 170711970.0, + "step": 2132 + }, + { + "epoch": 2.6600124766063633, + "grad_norm": 0.11614548941267049, + "learning_rate": 1.3028456932614019e-06, + "loss": 0.027, + "num_tokens": 170791598.0, + "step": 2133 + }, + { + "epoch": 2.66126013724267, + "grad_norm": 0.12300026143621852, + "learning_rate": 1.3006640817452873e-06, + "loss": 0.0278, + "num_tokens": 170870743.0, + "step": 2134 + }, + { + "epoch": 2.662507797878977, + "grad_norm": 0.10949480726257627, + "learning_rate": 1.2984900848944727e-06, + "loss": 0.0268, + "num_tokens": 170950664.0, + "step": 2135 + }, + { + "epoch": 2.6637554585152836, + "grad_norm": 0.11076298472497624, + "learning_rate": 1.2963237066510715e-06, + "loss": 0.0279, + "num_tokens": 171030340.0, + "step": 2136 + }, + { + "epoch": 2.665003119151591, + "grad_norm": 0.11388272829849243, + "learning_rate": 1.2941649509433808e-06, + "loss": 0.0269, + "num_tokens": 171109325.0, + "step": 2137 + }, + { + "epoch": 2.6662507797878976, + "grad_norm": 0.12341205651411256, + "learning_rate": 1.2920138216858791e-06, + "loss": 0.0286, + "num_tokens": 171189575.0, + "step": 2138 + }, + { + "epoch": 2.667498440424205, + "grad_norm": 0.11331697834247403, + "learning_rate": 1.289870322779212e-06, + "loss": 0.0285, + "num_tokens": 171268927.0, + "step": 2139 + }, + { + "epoch": 2.6687461010605116, + "grad_norm": 0.11882020374000711, + "learning_rate": 1.2877344581101922e-06, + "loss": 0.0272, + "num_tokens": 171347477.0, + "step": 2140 + }, + { + "epoch": 2.6699937616968183, + "grad_norm": 0.10972444404388286, + "learning_rate": 1.2856062315517885e-06, + "loss": 0.0278, + "num_tokens": 171427407.0, + "step": 2141 + }, + { + "epoch": 2.671241422333125, + "grad_norm": 0.11946680862932044, + "learning_rate": 1.2834856469631174e-06, + "loss": 0.028, + "num_tokens": 171507650.0, + "step": 2142 + }, + { + "epoch": 2.6724890829694323, + "grad_norm": 0.10858577662792648, + "learning_rate": 1.28137270818944e-06, + "loss": 0.0276, + "num_tokens": 171588030.0, + "step": 2143 + }, + { + "epoch": 2.673736743605739, + "grad_norm": 0.11241775634496111, + "learning_rate": 1.279267419062155e-06, + "loss": 0.0272, + "num_tokens": 171668622.0, + "step": 2144 + }, + { + "epoch": 2.6749844042420463, + "grad_norm": 0.1138275210262026, + "learning_rate": 1.2771697833987852e-06, + "loss": 0.0274, + "num_tokens": 171747879.0, + "step": 2145 + }, + { + "epoch": 2.676232064878353, + "grad_norm": 0.12107882370247575, + "learning_rate": 1.2750798050029782e-06, + "loss": 0.0301, + "num_tokens": 171828495.0, + "step": 2146 + }, + { + "epoch": 2.67747972551466, + "grad_norm": 0.11812881048310148, + "learning_rate": 1.272997487664499e-06, + "loss": 0.0289, + "num_tokens": 171908174.0, + "step": 2147 + }, + { + "epoch": 2.678727386150967, + "grad_norm": 0.10269224386186213, + "learning_rate": 1.2709228351592167e-06, + "loss": 0.0261, + "num_tokens": 171988943.0, + "step": 2148 + }, + { + "epoch": 2.679975046787274, + "grad_norm": 0.10927504867119865, + "learning_rate": 1.2688558512491032e-06, + "loss": 0.028, + "num_tokens": 172068842.0, + "step": 2149 + }, + { + "epoch": 2.681222707423581, + "grad_norm": 0.11683785756855145, + "learning_rate": 1.2667965396822257e-06, + "loss": 0.0275, + "num_tokens": 172148664.0, + "step": 2150 + }, + { + "epoch": 2.682470368059888, + "grad_norm": 0.12167175622894594, + "learning_rate": 1.2647449041927385e-06, + "loss": 0.0278, + "num_tokens": 172229020.0, + "step": 2151 + }, + { + "epoch": 2.6837180286961946, + "grad_norm": 0.10888309628100702, + "learning_rate": 1.2627009485008754e-06, + "loss": 0.0272, + "num_tokens": 172309720.0, + "step": 2152 + }, + { + "epoch": 2.6849656893325013, + "grad_norm": 0.122912436571624, + "learning_rate": 1.2606646763129476e-06, + "loss": 0.0284, + "num_tokens": 172389961.0, + "step": 2153 + }, + { + "epoch": 2.6862133499688086, + "grad_norm": 0.10857265378270686, + "learning_rate": 1.2586360913213315e-06, + "loss": 0.0262, + "num_tokens": 172471162.0, + "step": 2154 + }, + { + "epoch": 2.6874610106051153, + "grad_norm": 0.11669471057841353, + "learning_rate": 1.256615197204465e-06, + "loss": 0.0283, + "num_tokens": 172551948.0, + "step": 2155 + }, + { + "epoch": 2.6887086712414225, + "grad_norm": 0.13329440954951594, + "learning_rate": 1.2546019976268403e-06, + "loss": 0.0292, + "num_tokens": 172632487.0, + "step": 2156 + }, + { + "epoch": 2.6899563318777293, + "grad_norm": 0.11522808892441781, + "learning_rate": 1.2525964962389961e-06, + "loss": 0.027, + "num_tokens": 172713350.0, + "step": 2157 + }, + { + "epoch": 2.691203992514036, + "grad_norm": 0.11229194690407834, + "learning_rate": 1.250598696677512e-06, + "loss": 0.0278, + "num_tokens": 172792803.0, + "step": 2158 + }, + { + "epoch": 2.692451653150343, + "grad_norm": 0.10819150987759615, + "learning_rate": 1.2486086025650045e-06, + "loss": 0.0269, + "num_tokens": 172873683.0, + "step": 2159 + }, + { + "epoch": 2.69369931378665, + "grad_norm": 0.10976961567370531, + "learning_rate": 1.246626217510114e-06, + "loss": 0.0273, + "num_tokens": 172953708.0, + "step": 2160 + }, + { + "epoch": 2.694946974422957, + "grad_norm": 0.10593975910980304, + "learning_rate": 1.244651545107503e-06, + "loss": 0.0275, + "num_tokens": 173034948.0, + "step": 2161 + }, + { + "epoch": 2.696194635059264, + "grad_norm": 0.10752240766626842, + "learning_rate": 1.2426845889378516e-06, + "loss": 0.0271, + "num_tokens": 173114456.0, + "step": 2162 + }, + { + "epoch": 2.697442295695571, + "grad_norm": 0.11100439809268697, + "learning_rate": 1.2407253525678453e-06, + "loss": 0.027, + "num_tokens": 173193677.0, + "step": 2163 + }, + { + "epoch": 2.6986899563318776, + "grad_norm": 0.11124936556530347, + "learning_rate": 1.2387738395501714e-06, + "loss": 0.028, + "num_tokens": 173273829.0, + "step": 2164 + }, + { + "epoch": 2.699937616968185, + "grad_norm": 0.11341877617750057, + "learning_rate": 1.236830053423512e-06, + "loss": 0.0279, + "num_tokens": 173354403.0, + "step": 2165 + }, + { + "epoch": 2.7011852776044916, + "grad_norm": 0.10432445599121043, + "learning_rate": 1.2348939977125412e-06, + "loss": 0.0273, + "num_tokens": 173434752.0, + "step": 2166 + }, + { + "epoch": 2.7024329382407988, + "grad_norm": 0.10349647102790291, + "learning_rate": 1.2329656759279108e-06, + "loss": 0.0265, + "num_tokens": 173513891.0, + "step": 2167 + }, + { + "epoch": 2.7036805988771055, + "grad_norm": 0.10730067378099671, + "learning_rate": 1.2310450915662516e-06, + "loss": 0.0263, + "num_tokens": 173593347.0, + "step": 2168 + }, + { + "epoch": 2.7049282595134123, + "grad_norm": 0.12358364979067543, + "learning_rate": 1.229132248110165e-06, + "loss": 0.0289, + "num_tokens": 173673697.0, + "step": 2169 + }, + { + "epoch": 2.706175920149719, + "grad_norm": 0.121969574680686, + "learning_rate": 1.2272271490282134e-06, + "loss": 0.0273, + "num_tokens": 173753524.0, + "step": 2170 + }, + { + "epoch": 2.7074235807860263, + "grad_norm": 0.10988278388736815, + "learning_rate": 1.2253297977749163e-06, + "loss": 0.0267, + "num_tokens": 173833499.0, + "step": 2171 + }, + { + "epoch": 2.708671241422333, + "grad_norm": 0.11127655885847154, + "learning_rate": 1.2234401977907468e-06, + "loss": 0.0276, + "num_tokens": 173913319.0, + "step": 2172 + }, + { + "epoch": 2.7099189020586403, + "grad_norm": 0.11247307459253601, + "learning_rate": 1.2215583525021203e-06, + "loss": 0.0273, + "num_tokens": 173992389.0, + "step": 2173 + }, + { + "epoch": 2.711166562694947, + "grad_norm": 0.11437743854655531, + "learning_rate": 1.2196842653213896e-06, + "loss": 0.0294, + "num_tokens": 174072025.0, + "step": 2174 + }, + { + "epoch": 2.712414223331254, + "grad_norm": 0.11260264853485703, + "learning_rate": 1.2178179396468428e-06, + "loss": 0.0284, + "num_tokens": 174152930.0, + "step": 2175 + }, + { + "epoch": 2.7136618839675606, + "grad_norm": 0.12047852452761347, + "learning_rate": 1.215959378862692e-06, + "loss": 0.0276, + "num_tokens": 174232157.0, + "step": 2176 + }, + { + "epoch": 2.714909544603868, + "grad_norm": 0.10620348737338733, + "learning_rate": 1.2141085863390696e-06, + "loss": 0.0267, + "num_tokens": 174312869.0, + "step": 2177 + }, + { + "epoch": 2.7161572052401746, + "grad_norm": 0.10200387743067355, + "learning_rate": 1.2122655654320225e-06, + "loss": 0.026, + "num_tokens": 174391850.0, + "step": 2178 + }, + { + "epoch": 2.717404865876482, + "grad_norm": 0.11792058541690678, + "learning_rate": 1.210430319483504e-06, + "loss": 0.0279, + "num_tokens": 174472240.0, + "step": 2179 + }, + { + "epoch": 2.7186525265127885, + "grad_norm": 0.11352708440547712, + "learning_rate": 1.2086028518213694e-06, + "loss": 0.0273, + "num_tokens": 174551750.0, + "step": 2180 + }, + { + "epoch": 2.7199001871490953, + "grad_norm": 0.11403171270711979, + "learning_rate": 1.206783165759371e-06, + "loss": 0.0273, + "num_tokens": 174631491.0, + "step": 2181 + }, + { + "epoch": 2.7211478477854025, + "grad_norm": 0.1196489902512615, + "learning_rate": 1.204971264597148e-06, + "loss": 0.0275, + "num_tokens": 174711527.0, + "step": 2182 + }, + { + "epoch": 2.7223955084217093, + "grad_norm": 0.1081192153362361, + "learning_rate": 1.2031671516202263e-06, + "loss": 0.0272, + "num_tokens": 174790208.0, + "step": 2183 + }, + { + "epoch": 2.723643169058016, + "grad_norm": 0.11976247939116517, + "learning_rate": 1.2013708301000082e-06, + "loss": 0.028, + "num_tokens": 174870561.0, + "step": 2184 + }, + { + "epoch": 2.7248908296943233, + "grad_norm": 0.11442096061556735, + "learning_rate": 1.199582303293767e-06, + "loss": 0.0274, + "num_tokens": 174951422.0, + "step": 2185 + }, + { + "epoch": 2.72613849033063, + "grad_norm": 0.11617050642869951, + "learning_rate": 1.1978015744446417e-06, + "loss": 0.0271, + "num_tokens": 175031463.0, + "step": 2186 + }, + { + "epoch": 2.727386150966937, + "grad_norm": 0.12293559783309263, + "learning_rate": 1.1960286467816331e-06, + "loss": 0.0278, + "num_tokens": 175112098.0, + "step": 2187 + }, + { + "epoch": 2.728633811603244, + "grad_norm": 0.11221752709437237, + "learning_rate": 1.1942635235195949e-06, + "loss": 0.0285, + "num_tokens": 175192431.0, + "step": 2188 + }, + { + "epoch": 2.729881472239551, + "grad_norm": 0.11210559785351049, + "learning_rate": 1.1925062078592279e-06, + "loss": 0.0267, + "num_tokens": 175272604.0, + "step": 2189 + }, + { + "epoch": 2.731129132875858, + "grad_norm": 0.10832068295079063, + "learning_rate": 1.190756702987077e-06, + "loss": 0.0269, + "num_tokens": 175352453.0, + "step": 2190 + }, + { + "epoch": 2.732376793512165, + "grad_norm": 0.12338756235671419, + "learning_rate": 1.1890150120755244e-06, + "loss": 0.0284, + "num_tokens": 175432528.0, + "step": 2191 + }, + { + "epoch": 2.7336244541484715, + "grad_norm": 0.11219089612196873, + "learning_rate": 1.1872811382827811e-06, + "loss": 0.027, + "num_tokens": 175512749.0, + "step": 2192 + }, + { + "epoch": 2.7348721147847783, + "grad_norm": 0.12417263033167164, + "learning_rate": 1.1855550847528849e-06, + "loss": 0.0271, + "num_tokens": 175593593.0, + "step": 2193 + }, + { + "epoch": 2.7361197754210855, + "grad_norm": 0.11709495692614103, + "learning_rate": 1.1838368546156924e-06, + "loss": 0.0266, + "num_tokens": 175674330.0, + "step": 2194 + }, + { + "epoch": 2.7373674360573923, + "grad_norm": 0.11715755751386901, + "learning_rate": 1.182126450986874e-06, + "loss": 0.028, + "num_tokens": 175754689.0, + "step": 2195 + }, + { + "epoch": 2.7386150966936995, + "grad_norm": 0.11080050252306903, + "learning_rate": 1.1804238769679077e-06, + "loss": 0.0273, + "num_tokens": 175834619.0, + "step": 2196 + }, + { + "epoch": 2.7398627573300063, + "grad_norm": 0.11676541458720631, + "learning_rate": 1.178729135646077e-06, + "loss": 0.0269, + "num_tokens": 175914942.0, + "step": 2197 + }, + { + "epoch": 2.741110417966313, + "grad_norm": 0.11457451396067345, + "learning_rate": 1.1770422300944586e-06, + "loss": 0.028, + "num_tokens": 175996002.0, + "step": 2198 + }, + { + "epoch": 2.74235807860262, + "grad_norm": 0.10756801142995977, + "learning_rate": 1.1753631633719217e-06, + "loss": 0.0281, + "num_tokens": 176076559.0, + "step": 2199 + }, + { + "epoch": 2.743605739238927, + "grad_norm": 0.12404043322828145, + "learning_rate": 1.1736919385231236e-06, + "loss": 0.028, + "num_tokens": 176156992.0, + "step": 2200 + }, + { + "epoch": 2.744853399875234, + "grad_norm": 0.10783216704576966, + "learning_rate": 1.1720285585784983e-06, + "loss": 0.0258, + "num_tokens": 176235147.0, + "step": 2201 + }, + { + "epoch": 2.746101060511541, + "grad_norm": 0.11227645559258542, + "learning_rate": 1.1703730265542569e-06, + "loss": 0.0275, + "num_tokens": 176315166.0, + "step": 2202 + }, + { + "epoch": 2.747348721147848, + "grad_norm": 0.11576428333299565, + "learning_rate": 1.16872534545238e-06, + "loss": 0.028, + "num_tokens": 176395152.0, + "step": 2203 + }, + { + "epoch": 2.7485963817841546, + "grad_norm": 0.11841116972911843, + "learning_rate": 1.1670855182606106e-06, + "loss": 0.0274, + "num_tokens": 176476772.0, + "step": 2204 + }, + { + "epoch": 2.7498440424204618, + "grad_norm": 0.10483183967952564, + "learning_rate": 1.1654535479524511e-06, + "loss": 0.0269, + "num_tokens": 176556474.0, + "step": 2205 + }, + { + "epoch": 2.7510917030567685, + "grad_norm": 0.11471768456732195, + "learning_rate": 1.163829437487158e-06, + "loss": 0.0277, + "num_tokens": 176637858.0, + "step": 2206 + }, + { + "epoch": 2.7523393636930757, + "grad_norm": 0.108264123677895, + "learning_rate": 1.162213189809734e-06, + "loss": 0.0271, + "num_tokens": 176717271.0, + "step": 2207 + }, + { + "epoch": 2.7535870243293825, + "grad_norm": 0.15282329766969122, + "learning_rate": 1.1606048078509235e-06, + "loss": 0.0346, + "num_tokens": 176797686.0, + "step": 2208 + }, + { + "epoch": 2.7548346849656893, + "grad_norm": 0.11649281003810305, + "learning_rate": 1.1590042945272108e-06, + "loss": 0.0272, + "num_tokens": 176876658.0, + "step": 2209 + }, + { + "epoch": 2.756082345601996, + "grad_norm": 0.10842683509303165, + "learning_rate": 1.1574116527408093e-06, + "loss": 0.0267, + "num_tokens": 176956269.0, + "step": 2210 + }, + { + "epoch": 2.7573300062383033, + "grad_norm": 0.11131932466818467, + "learning_rate": 1.1558268853796597e-06, + "loss": 0.0269, + "num_tokens": 177036068.0, + "step": 2211 + }, + { + "epoch": 2.75857766687461, + "grad_norm": 0.12181987186342921, + "learning_rate": 1.1542499953174257e-06, + "loss": 0.0282, + "num_tokens": 177117826.0, + "step": 2212 + }, + { + "epoch": 2.7598253275109172, + "grad_norm": 0.11099109738309532, + "learning_rate": 1.1526809854134844e-06, + "loss": 0.0281, + "num_tokens": 177198291.0, + "step": 2213 + }, + { + "epoch": 2.761072988147224, + "grad_norm": 0.11310340011164033, + "learning_rate": 1.151119858512925e-06, + "loss": 0.0268, + "num_tokens": 177278136.0, + "step": 2214 + }, + { + "epoch": 2.762320648783531, + "grad_norm": 0.12040084886173383, + "learning_rate": 1.149566617446543e-06, + "loss": 0.0273, + "num_tokens": 177358685.0, + "step": 2215 + }, + { + "epoch": 2.7635683094198376, + "grad_norm": 0.11023952574462588, + "learning_rate": 1.1480212650308337e-06, + "loss": 0.0277, + "num_tokens": 177439052.0, + "step": 2216 + }, + { + "epoch": 2.7648159700561448, + "grad_norm": 0.10245924815558206, + "learning_rate": 1.1464838040679876e-06, + "loss": 0.0265, + "num_tokens": 177518673.0, + "step": 2217 + }, + { + "epoch": 2.7660636306924515, + "grad_norm": 0.11896518323741509, + "learning_rate": 1.1449542373458867e-06, + "loss": 0.028, + "num_tokens": 177599813.0, + "step": 2218 + }, + { + "epoch": 2.7673112913287587, + "grad_norm": 0.11721840542278891, + "learning_rate": 1.1434325676380983e-06, + "loss": 0.0275, + "num_tokens": 177680047.0, + "step": 2219 + }, + { + "epoch": 2.7685589519650655, + "grad_norm": 0.11242112554545239, + "learning_rate": 1.141918797703868e-06, + "loss": 0.0273, + "num_tokens": 177759176.0, + "step": 2220 + }, + { + "epoch": 2.7698066126013723, + "grad_norm": 0.12213920330413136, + "learning_rate": 1.1404129302881193e-06, + "loss": 0.0276, + "num_tokens": 177840002.0, + "step": 2221 + }, + { + "epoch": 2.7710542732376795, + "grad_norm": 0.11684412169186235, + "learning_rate": 1.1389149681214456e-06, + "loss": 0.0285, + "num_tokens": 177920088.0, + "step": 2222 + }, + { + "epoch": 2.7723019338739863, + "grad_norm": 0.1119053015497875, + "learning_rate": 1.1374249139201035e-06, + "loss": 0.0274, + "num_tokens": 177999727.0, + "step": 2223 + }, + { + "epoch": 2.773549594510293, + "grad_norm": 0.11111524937162807, + "learning_rate": 1.135942770386013e-06, + "loss": 0.0268, + "num_tokens": 178078737.0, + "step": 2224 + }, + { + "epoch": 2.7747972551466002, + "grad_norm": 0.10700815561780962, + "learning_rate": 1.1344685402067475e-06, + "loss": 0.0269, + "num_tokens": 178157602.0, + "step": 2225 + }, + { + "epoch": 2.776044915782907, + "grad_norm": 0.12352603858436706, + "learning_rate": 1.1330022260555321e-06, + "loss": 0.0284, + "num_tokens": 178238153.0, + "step": 2226 + }, + { + "epoch": 2.777292576419214, + "grad_norm": 0.10878032533922675, + "learning_rate": 1.1315438305912377e-06, + "loss": 0.0265, + "num_tokens": 178317857.0, + "step": 2227 + }, + { + "epoch": 2.778540237055521, + "grad_norm": 0.11929846361121192, + "learning_rate": 1.1300933564583764e-06, + "loss": 0.0275, + "num_tokens": 178398144.0, + "step": 2228 + }, + { + "epoch": 2.7797878976918278, + "grad_norm": 0.11575836382461366, + "learning_rate": 1.1286508062870952e-06, + "loss": 0.0277, + "num_tokens": 178477765.0, + "step": 2229 + }, + { + "epoch": 2.781035558328135, + "grad_norm": 0.10742117792303336, + "learning_rate": 1.1272161826931745e-06, + "loss": 0.0265, + "num_tokens": 178558431.0, + "step": 2230 + }, + { + "epoch": 2.7822832189644418, + "grad_norm": 0.11418872199176901, + "learning_rate": 1.1257894882780206e-06, + "loss": 0.0283, + "num_tokens": 178639242.0, + "step": 2231 + }, + { + "epoch": 2.7835308796007485, + "grad_norm": 0.10723962416090788, + "learning_rate": 1.1243707256286606e-06, + "loss": 0.0271, + "num_tokens": 178719072.0, + "step": 2232 + }, + { + "epoch": 2.7847785402370553, + "grad_norm": 0.10881136487138986, + "learning_rate": 1.1229598973177407e-06, + "loss": 0.0272, + "num_tokens": 178797551.0, + "step": 2233 + }, + { + "epoch": 2.7860262008733625, + "grad_norm": 0.1033276997183416, + "learning_rate": 1.1215570059035199e-06, + "loss": 0.0261, + "num_tokens": 178876643.0, + "step": 2234 + }, + { + "epoch": 2.7872738615096693, + "grad_norm": 0.11221713077894142, + "learning_rate": 1.1201620539298636e-06, + "loss": 0.0277, + "num_tokens": 178956890.0, + "step": 2235 + }, + { + "epoch": 2.7885215221459765, + "grad_norm": 0.11497597690368146, + "learning_rate": 1.1187750439262405e-06, + "loss": 0.0274, + "num_tokens": 179036190.0, + "step": 2236 + }, + { + "epoch": 2.7897691827822833, + "grad_norm": 0.11685097421442404, + "learning_rate": 1.1173959784077207e-06, + "loss": 0.0292, + "num_tokens": 179117241.0, + "step": 2237 + }, + { + "epoch": 2.79101684341859, + "grad_norm": 0.11901364215641137, + "learning_rate": 1.1160248598749652e-06, + "loss": 0.0286, + "num_tokens": 179197576.0, + "step": 2238 + }, + { + "epoch": 2.7922645040548972, + "grad_norm": 0.10306800380453014, + "learning_rate": 1.114661690814227e-06, + "loss": 0.0266, + "num_tokens": 179276756.0, + "step": 2239 + }, + { + "epoch": 2.793512164691204, + "grad_norm": 0.11195164030232999, + "learning_rate": 1.1133064736973443e-06, + "loss": 0.0265, + "num_tokens": 179357775.0, + "step": 2240 + }, + { + "epoch": 2.7947598253275108, + "grad_norm": 0.11752175021287195, + "learning_rate": 1.1119592109817346e-06, + "loss": 0.0275, + "num_tokens": 179438867.0, + "step": 2241 + }, + { + "epoch": 2.796007485963818, + "grad_norm": 0.1140881376229996, + "learning_rate": 1.1106199051103922e-06, + "loss": 0.0271, + "num_tokens": 179518359.0, + "step": 2242 + }, + { + "epoch": 2.7972551466001248, + "grad_norm": 0.11251686289836357, + "learning_rate": 1.109288558511884e-06, + "loss": 0.0268, + "num_tokens": 179597988.0, + "step": 2243 + }, + { + "epoch": 2.7985028072364315, + "grad_norm": 0.10788731583211555, + "learning_rate": 1.1079651736003441e-06, + "loss": 0.0273, + "num_tokens": 179678144.0, + "step": 2244 + }, + { + "epoch": 2.7997504678727387, + "grad_norm": 0.11520086050466517, + "learning_rate": 1.106649752775468e-06, + "loss": 0.0267, + "num_tokens": 179758675.0, + "step": 2245 + }, + { + "epoch": 2.8009981285090455, + "grad_norm": 0.12597826940633167, + "learning_rate": 1.1053422984225127e-06, + "loss": 0.027, + "num_tokens": 179839093.0, + "step": 2246 + }, + { + "epoch": 2.8022457891453527, + "grad_norm": 0.11239946957555676, + "learning_rate": 1.1040428129122873e-06, + "loss": 0.0265, + "num_tokens": 179920234.0, + "step": 2247 + }, + { + "epoch": 2.8034934497816595, + "grad_norm": 0.11485169689579346, + "learning_rate": 1.102751298601152e-06, + "loss": 0.0272, + "num_tokens": 179999475.0, + "step": 2248 + }, + { + "epoch": 2.8047411104179663, + "grad_norm": 0.11199877474314972, + "learning_rate": 1.1014677578310128e-06, + "loss": 0.0277, + "num_tokens": 180078857.0, + "step": 2249 + }, + { + "epoch": 2.805988771054273, + "grad_norm": 0.11085777355981787, + "learning_rate": 1.1001921929293172e-06, + "loss": 0.0281, + "num_tokens": 180157620.0, + "step": 2250 + }, + { + "epoch": 2.8072364316905802, + "grad_norm": 0.11095268337610681, + "learning_rate": 1.0989246062090495e-06, + "loss": 0.0269, + "num_tokens": 180237202.0, + "step": 2251 + }, + { + "epoch": 2.808484092326887, + "grad_norm": 0.1314904657457488, + "learning_rate": 1.0976649999687282e-06, + "loss": 0.0273, + "num_tokens": 180316592.0, + "step": 2252 + }, + { + "epoch": 2.809731752963194, + "grad_norm": 0.10715724234388715, + "learning_rate": 1.096413376492399e-06, + "loss": 0.0272, + "num_tokens": 180396786.0, + "step": 2253 + }, + { + "epoch": 2.810979413599501, + "grad_norm": 0.11446876586604864, + "learning_rate": 1.0951697380496343e-06, + "loss": 0.0267, + "num_tokens": 180477208.0, + "step": 2254 + }, + { + "epoch": 2.8122270742358078, + "grad_norm": 0.10000594303521496, + "learning_rate": 1.093934086895526e-06, + "loss": 0.0262, + "num_tokens": 180557503.0, + "step": 2255 + }, + { + "epoch": 2.8134747348721145, + "grad_norm": 0.11468231454145983, + "learning_rate": 1.0927064252706845e-06, + "loss": 0.0264, + "num_tokens": 180636993.0, + "step": 2256 + }, + { + "epoch": 2.8147223955084217, + "grad_norm": 0.11326974384224862, + "learning_rate": 1.0914867554012297e-06, + "loss": 0.028, + "num_tokens": 180717357.0, + "step": 2257 + }, + { + "epoch": 2.8159700561447285, + "grad_norm": 0.12011090419081122, + "learning_rate": 1.090275079498793e-06, + "loss": 0.0286, + "num_tokens": 180796981.0, + "step": 2258 + }, + { + "epoch": 2.8172177167810357, + "grad_norm": 0.11208373588605422, + "learning_rate": 1.0890713997605085e-06, + "loss": 0.0276, + "num_tokens": 180876805.0, + "step": 2259 + }, + { + "epoch": 2.8184653774173425, + "grad_norm": 0.11929554034691514, + "learning_rate": 1.0878757183690112e-06, + "loss": 0.0276, + "num_tokens": 180956790.0, + "step": 2260 + }, + { + "epoch": 2.8197130380536493, + "grad_norm": 0.10883696590219329, + "learning_rate": 1.086688037492433e-06, + "loss": 0.027, + "num_tokens": 181037554.0, + "step": 2261 + }, + { + "epoch": 2.8209606986899565, + "grad_norm": 0.11656014791821229, + "learning_rate": 1.0855083592843985e-06, + "loss": 0.028, + "num_tokens": 181117563.0, + "step": 2262 + }, + { + "epoch": 2.8222083593262632, + "grad_norm": 0.10887563809750479, + "learning_rate": 1.0843366858840209e-06, + "loss": 0.0269, + "num_tokens": 181197130.0, + "step": 2263 + }, + { + "epoch": 2.8234560199625705, + "grad_norm": 0.10788088292257987, + "learning_rate": 1.0831730194158982e-06, + "loss": 0.0266, + "num_tokens": 181276375.0, + "step": 2264 + }, + { + "epoch": 2.824703680598877, + "grad_norm": 0.10930839605540808, + "learning_rate": 1.0820173619901093e-06, + "loss": 0.0271, + "num_tokens": 181356017.0, + "step": 2265 + }, + { + "epoch": 2.825951341235184, + "grad_norm": 0.12259984441092837, + "learning_rate": 1.08086971570221e-06, + "loss": 0.0292, + "num_tokens": 181436275.0, + "step": 2266 + }, + { + "epoch": 2.8271990018714908, + "grad_norm": 0.10639541929699158, + "learning_rate": 1.0797300826332307e-06, + "loss": 0.0268, + "num_tokens": 181516434.0, + "step": 2267 + }, + { + "epoch": 2.828446662507798, + "grad_norm": 0.12324754295042131, + "learning_rate": 1.07859846484967e-06, + "loss": 0.028, + "num_tokens": 181597476.0, + "step": 2268 + }, + { + "epoch": 2.8296943231441047, + "grad_norm": 0.10801092400541287, + "learning_rate": 1.0774748644034936e-06, + "loss": 0.0271, + "num_tokens": 181677449.0, + "step": 2269 + }, + { + "epoch": 2.830941983780412, + "grad_norm": 0.11429743139755069, + "learning_rate": 1.0763592833321277e-06, + "loss": 0.0269, + "num_tokens": 181757429.0, + "step": 2270 + }, + { + "epoch": 2.8321896444167187, + "grad_norm": 0.11295710267647807, + "learning_rate": 1.0752517236584595e-06, + "loss": 0.027, + "num_tokens": 181836252.0, + "step": 2271 + }, + { + "epoch": 2.8334373050530255, + "grad_norm": 0.11139902033265346, + "learning_rate": 1.0741521873908283e-06, + "loss": 0.0268, + "num_tokens": 181916218.0, + "step": 2272 + }, + { + "epoch": 2.8346849656893323, + "grad_norm": 0.1054002144624115, + "learning_rate": 1.0730606765230257e-06, + "loss": 0.0269, + "num_tokens": 181995689.0, + "step": 2273 + }, + { + "epoch": 2.8359326263256395, + "grad_norm": 0.10302028960792996, + "learning_rate": 1.0719771930342913e-06, + "loss": 0.0261, + "num_tokens": 182075038.0, + "step": 2274 + }, + { + "epoch": 2.8371802869619462, + "grad_norm": 0.11798628087878685, + "learning_rate": 1.0709017388893075e-06, + "loss": 0.0283, + "num_tokens": 182154597.0, + "step": 2275 + }, + { + "epoch": 2.8384279475982535, + "grad_norm": 0.11276374561736073, + "learning_rate": 1.0698343160381987e-06, + "loss": 0.0266, + "num_tokens": 182233887.0, + "step": 2276 + }, + { + "epoch": 2.8396756082345602, + "grad_norm": 0.1174576965951362, + "learning_rate": 1.0687749264165248e-06, + "loss": 0.028, + "num_tokens": 182314132.0, + "step": 2277 + }, + { + "epoch": 2.840923268870867, + "grad_norm": 0.10814129453586245, + "learning_rate": 1.067723571945279e-06, + "loss": 0.0261, + "num_tokens": 182394712.0, + "step": 2278 + }, + { + "epoch": 2.842170929507174, + "grad_norm": 0.11723270887984091, + "learning_rate": 1.0666802545308847e-06, + "loss": 0.0281, + "num_tokens": 182476047.0, + "step": 2279 + }, + { + "epoch": 2.843418590143481, + "grad_norm": 0.11081835835199184, + "learning_rate": 1.065644976065193e-06, + "loss": 0.0279, + "num_tokens": 182554762.0, + "step": 2280 + }, + { + "epoch": 2.8446662507797877, + "grad_norm": 0.10738520261965014, + "learning_rate": 1.0646177384254747e-06, + "loss": 0.027, + "num_tokens": 182633545.0, + "step": 2281 + }, + { + "epoch": 2.845913911416095, + "grad_norm": 0.10253227627415959, + "learning_rate": 1.063598543474423e-06, + "loss": 0.0263, + "num_tokens": 182711682.0, + "step": 2282 + }, + { + "epoch": 2.8471615720524017, + "grad_norm": 0.11632474054875933, + "learning_rate": 1.062587393060147e-06, + "loss": 0.0277, + "num_tokens": 182791996.0, + "step": 2283 + }, + { + "epoch": 2.8484092326887085, + "grad_norm": 0.11239301355373442, + "learning_rate": 1.0615842890161675e-06, + "loss": 0.0267, + "num_tokens": 182871570.0, + "step": 2284 + }, + { + "epoch": 2.8496568933250157, + "grad_norm": 0.1260833333963082, + "learning_rate": 1.0605892331614158e-06, + "loss": 0.0305, + "num_tokens": 182951531.0, + "step": 2285 + }, + { + "epoch": 2.8509045539613225, + "grad_norm": 0.10700130560287861, + "learning_rate": 1.0596022273002282e-06, + "loss": 0.0268, + "num_tokens": 183030679.0, + "step": 2286 + }, + { + "epoch": 2.8521522145976297, + "grad_norm": 0.11364031790238421, + "learning_rate": 1.0586232732223446e-06, + "loss": 0.0271, + "num_tokens": 183112076.0, + "step": 2287 + }, + { + "epoch": 2.8533998752339365, + "grad_norm": 0.10879548973912555, + "learning_rate": 1.0576523727029053e-06, + "loss": 0.0273, + "num_tokens": 183192328.0, + "step": 2288 + }, + { + "epoch": 2.8546475358702432, + "grad_norm": 0.11352263022080847, + "learning_rate": 1.0566895275024458e-06, + "loss": 0.0297, + "num_tokens": 183271794.0, + "step": 2289 + }, + { + "epoch": 2.85589519650655, + "grad_norm": 0.11082853865055407, + "learning_rate": 1.0557347393668966e-06, + "loss": 0.027, + "num_tokens": 183350787.0, + "step": 2290 + }, + { + "epoch": 2.857142857142857, + "grad_norm": 0.11456889825294071, + "learning_rate": 1.0547880100275755e-06, + "loss": 0.0275, + "num_tokens": 183431435.0, + "step": 2291 + }, + { + "epoch": 2.858390517779164, + "grad_norm": 0.11016237704435282, + "learning_rate": 1.0538493412011901e-06, + "loss": 0.0267, + "num_tokens": 183510859.0, + "step": 2292 + }, + { + "epoch": 2.859638178415471, + "grad_norm": 0.11417004760246512, + "learning_rate": 1.0529187345898304e-06, + "loss": 0.0277, + "num_tokens": 183591361.0, + "step": 2293 + }, + { + "epoch": 2.860885839051778, + "grad_norm": 0.11122299199182507, + "learning_rate": 1.0519961918809675e-06, + "loss": 0.0271, + "num_tokens": 183671105.0, + "step": 2294 + }, + { + "epoch": 2.8621334996880847, + "grad_norm": 0.11426790938884109, + "learning_rate": 1.05108171474745e-06, + "loss": 0.0275, + "num_tokens": 183751497.0, + "step": 2295 + }, + { + "epoch": 2.8633811603243915, + "grad_norm": 0.1132178603222439, + "learning_rate": 1.050175304847502e-06, + "loss": 0.0276, + "num_tokens": 183831206.0, + "step": 2296 + }, + { + "epoch": 2.8646288209606987, + "grad_norm": 0.11247491256932537, + "learning_rate": 1.0492769638247177e-06, + "loss": 0.0278, + "num_tokens": 183912383.0, + "step": 2297 + }, + { + "epoch": 2.8658764815970055, + "grad_norm": 0.10746653770671626, + "learning_rate": 1.0483866933080611e-06, + "loss": 0.0265, + "num_tokens": 183992514.0, + "step": 2298 + }, + { + "epoch": 2.8671241422333127, + "grad_norm": 0.11160208763303599, + "learning_rate": 1.0475044949118624e-06, + "loss": 0.0276, + "num_tokens": 184072119.0, + "step": 2299 + }, + { + "epoch": 2.8683718028696195, + "grad_norm": 0.11850309053487157, + "learning_rate": 1.0466303702358139e-06, + "loss": 0.0279, + "num_tokens": 184155215.0, + "step": 2300 + }, + { + "epoch": 2.8696194635059262, + "grad_norm": 0.12631852786380027, + "learning_rate": 1.0457643208649665e-06, + "loss": 0.0288, + "num_tokens": 184235940.0, + "step": 2301 + }, + { + "epoch": 2.8708671241422334, + "grad_norm": 0.11138287151881872, + "learning_rate": 1.044906348369731e-06, + "loss": 0.0276, + "num_tokens": 184316322.0, + "step": 2302 + }, + { + "epoch": 2.87211478477854, + "grad_norm": 0.10931209504132226, + "learning_rate": 1.0440564543058703e-06, + "loss": 0.0268, + "num_tokens": 184396446.0, + "step": 2303 + }, + { + "epoch": 2.8733624454148474, + "grad_norm": 0.11012682455226674, + "learning_rate": 1.0432146402144986e-06, + "loss": 0.0278, + "num_tokens": 184475613.0, + "step": 2304 + }, + { + "epoch": 2.874610106051154, + "grad_norm": 0.10769527893932587, + "learning_rate": 1.0423809076220805e-06, + "loss": 0.0269, + "num_tokens": 184555834.0, + "step": 2305 + }, + { + "epoch": 2.875857766687461, + "grad_norm": 0.12501985655306355, + "learning_rate": 1.041555258040425e-06, + "loss": 0.0289, + "num_tokens": 184637311.0, + "step": 2306 + }, + { + "epoch": 2.8771054273237677, + "grad_norm": 0.10611283848990914, + "learning_rate": 1.0407376929666833e-06, + "loss": 0.0272, + "num_tokens": 184717540.0, + "step": 2307 + }, + { + "epoch": 2.878353087960075, + "grad_norm": 0.11684908656337946, + "learning_rate": 1.0399282138833488e-06, + "loss": 0.0275, + "num_tokens": 184796663.0, + "step": 2308 + }, + { + "epoch": 2.8796007485963817, + "grad_norm": 0.12163581971776054, + "learning_rate": 1.039126822258252e-06, + "loss": 0.0277, + "num_tokens": 184877517.0, + "step": 2309 + }, + { + "epoch": 2.880848409232689, + "grad_norm": 0.11561664029854038, + "learning_rate": 1.0383335195445573e-06, + "loss": 0.0271, + "num_tokens": 184956700.0, + "step": 2310 + }, + { + "epoch": 2.8820960698689957, + "grad_norm": 0.11377476242109719, + "learning_rate": 1.0375483071807626e-06, + "loss": 0.0277, + "num_tokens": 185036719.0, + "step": 2311 + }, + { + "epoch": 2.8833437305053025, + "grad_norm": 0.10571550440670317, + "learning_rate": 1.036771186590696e-06, + "loss": 0.0268, + "num_tokens": 185116185.0, + "step": 2312 + }, + { + "epoch": 2.8845913911416092, + "grad_norm": 0.11078801585715989, + "learning_rate": 1.0360021591835108e-06, + "loss": 0.0274, + "num_tokens": 185195865.0, + "step": 2313 + }, + { + "epoch": 2.8858390517779164, + "grad_norm": 0.10587436172290575, + "learning_rate": 1.0352412263536868e-06, + "loss": 0.0258, + "num_tokens": 185275227.0, + "step": 2314 + }, + { + "epoch": 2.887086712414223, + "grad_norm": 0.10645437206340462, + "learning_rate": 1.0344883894810257e-06, + "loss": 0.0274, + "num_tokens": 185354901.0, + "step": 2315 + }, + { + "epoch": 2.8883343730505304, + "grad_norm": 0.11587554997670732, + "learning_rate": 1.033743649930647e-06, + "loss": 0.0263, + "num_tokens": 185433853.0, + "step": 2316 + }, + { + "epoch": 2.889582033686837, + "grad_norm": 0.11506509844410158, + "learning_rate": 1.03300700905299e-06, + "loss": 0.0274, + "num_tokens": 185514489.0, + "step": 2317 + }, + { + "epoch": 2.890829694323144, + "grad_norm": 0.10249391579856434, + "learning_rate": 1.0322784681838062e-06, + "loss": 0.0266, + "num_tokens": 185595544.0, + "step": 2318 + }, + { + "epoch": 2.892077354959451, + "grad_norm": 0.1244108209218656, + "learning_rate": 1.0315580286441616e-06, + "loss": 0.027, + "num_tokens": 185675597.0, + "step": 2319 + }, + { + "epoch": 2.893325015595758, + "grad_norm": 0.11187955851184811, + "learning_rate": 1.0308456917404294e-06, + "loss": 0.0269, + "num_tokens": 185755490.0, + "step": 2320 + }, + { + "epoch": 2.8945726762320647, + "grad_norm": 0.1118035236277393, + "learning_rate": 1.0301414587642926e-06, + "loss": 0.0267, + "num_tokens": 185833837.0, + "step": 2321 + }, + { + "epoch": 2.895820336868372, + "grad_norm": 0.11431007300598617, + "learning_rate": 1.029445330992738e-06, + "loss": 0.0285, + "num_tokens": 185915064.0, + "step": 2322 + }, + { + "epoch": 2.8970679975046787, + "grad_norm": 0.10462947510709711, + "learning_rate": 1.0287573096880566e-06, + "loss": 0.0267, + "num_tokens": 185995076.0, + "step": 2323 + }, + { + "epoch": 2.8983156581409855, + "grad_norm": 0.11570282963133109, + "learning_rate": 1.028077396097838e-06, + "loss": 0.0269, + "num_tokens": 186074872.0, + "step": 2324 + }, + { + "epoch": 2.8995633187772927, + "grad_norm": 0.12626629625181418, + "learning_rate": 1.0274055914549708e-06, + "loss": 0.0281, + "num_tokens": 186156148.0, + "step": 2325 + }, + { + "epoch": 2.9008109794135994, + "grad_norm": 0.11032478054432032, + "learning_rate": 1.0267418969776405e-06, + "loss": 0.0264, + "num_tokens": 186236795.0, + "step": 2326 + }, + { + "epoch": 2.9020586400499067, + "grad_norm": 0.108031229594549, + "learning_rate": 1.0260863138693264e-06, + "loss": 0.0282, + "num_tokens": 186316351.0, + "step": 2327 + }, + { + "epoch": 2.9033063006862134, + "grad_norm": 0.11591386223810314, + "learning_rate": 1.0254388433187975e-06, + "loss": 0.0279, + "num_tokens": 186396106.0, + "step": 2328 + }, + { + "epoch": 2.90455396132252, + "grad_norm": 0.10622785774173073, + "learning_rate": 1.0247994865001147e-06, + "loss": 0.0259, + "num_tokens": 186475119.0, + "step": 2329 + }, + { + "epoch": 2.905801621958827, + "grad_norm": 0.12280744075841177, + "learning_rate": 1.0241682445726246e-06, + "loss": 0.0279, + "num_tokens": 186556403.0, + "step": 2330 + }, + { + "epoch": 2.907049282595134, + "grad_norm": 0.12850490414436164, + "learning_rate": 1.0235451186809596e-06, + "loss": 0.0281, + "num_tokens": 186636091.0, + "step": 2331 + }, + { + "epoch": 2.908296943231441, + "grad_norm": 0.12129199138278511, + "learning_rate": 1.0229301099550352e-06, + "loss": 0.0263, + "num_tokens": 186718680.0, + "step": 2332 + }, + { + "epoch": 2.909544603867748, + "grad_norm": 0.10134475681869966, + "learning_rate": 1.0223232195100485e-06, + "loss": 0.0267, + "num_tokens": 186797276.0, + "step": 2333 + }, + { + "epoch": 2.910792264504055, + "grad_norm": 0.10862663209574905, + "learning_rate": 1.0217244484464758e-06, + "loss": 0.0269, + "num_tokens": 186877678.0, + "step": 2334 + }, + { + "epoch": 2.9120399251403617, + "grad_norm": 0.10595868824858773, + "learning_rate": 1.0211337978500687e-06, + "loss": 0.0269, + "num_tokens": 186956753.0, + "step": 2335 + }, + { + "epoch": 2.913287585776669, + "grad_norm": 0.11109899490724645, + "learning_rate": 1.0205512687918558e-06, + "loss": 0.0275, + "num_tokens": 187037968.0, + "step": 2336 + }, + { + "epoch": 2.9145352464129757, + "grad_norm": 0.10985537743814915, + "learning_rate": 1.0199768623281388e-06, + "loss": 0.028, + "num_tokens": 187117786.0, + "step": 2337 + }, + { + "epoch": 2.9157829070492824, + "grad_norm": 0.11212443137177852, + "learning_rate": 1.0194105795004896e-06, + "loss": 0.0273, + "num_tokens": 187197753.0, + "step": 2338 + }, + { + "epoch": 2.9170305676855897, + "grad_norm": 0.11357099125731306, + "learning_rate": 1.0188524213357507e-06, + "loss": 0.027, + "num_tokens": 187278814.0, + "step": 2339 + }, + { + "epoch": 2.9182782283218964, + "grad_norm": 0.11863761901671414, + "learning_rate": 1.0183023888460312e-06, + "loss": 0.0278, + "num_tokens": 187359939.0, + "step": 2340 + }, + { + "epoch": 2.919525888958203, + "grad_norm": 0.11740995441233278, + "learning_rate": 1.017760483028706e-06, + "loss": 0.0276, + "num_tokens": 187439262.0, + "step": 2341 + }, + { + "epoch": 2.9207735495945104, + "grad_norm": 0.10961685956776607, + "learning_rate": 1.017226704866415e-06, + "loss": 0.0273, + "num_tokens": 187519426.0, + "step": 2342 + }, + { + "epoch": 2.922021210230817, + "grad_norm": 0.13193569549980025, + "learning_rate": 1.0167010553270588e-06, + "loss": 0.0275, + "num_tokens": 187599140.0, + "step": 2343 + }, + { + "epoch": 2.9232688708671244, + "grad_norm": 0.1090653954416858, + "learning_rate": 1.016183535363799e-06, + "loss": 0.0263, + "num_tokens": 187679734.0, + "step": 2344 + }, + { + "epoch": 2.924516531503431, + "grad_norm": 0.11264135405112981, + "learning_rate": 1.0156741459150556e-06, + "loss": 0.0271, + "num_tokens": 187760016.0, + "step": 2345 + }, + { + "epoch": 2.925764192139738, + "grad_norm": 0.1127907828444805, + "learning_rate": 1.0151728879045057e-06, + "loss": 0.0273, + "num_tokens": 187840030.0, + "step": 2346 + }, + { + "epoch": 2.9270118527760447, + "grad_norm": 0.11654407524405933, + "learning_rate": 1.0146797622410813e-06, + "loss": 0.0266, + "num_tokens": 187918803.0, + "step": 2347 + }, + { + "epoch": 2.928259513412352, + "grad_norm": 0.11309647290317737, + "learning_rate": 1.0141947698189684e-06, + "loss": 0.0272, + "num_tokens": 187998146.0, + "step": 2348 + }, + { + "epoch": 2.9295071740486587, + "grad_norm": 0.11280706058466917, + "learning_rate": 1.0137179115176055e-06, + "loss": 0.0265, + "num_tokens": 188077340.0, + "step": 2349 + }, + { + "epoch": 2.930754834684966, + "grad_norm": 0.13076601205501484, + "learning_rate": 1.0132491882016805e-06, + "loss": 0.0262, + "num_tokens": 188157146.0, + "step": 2350 + }, + { + "epoch": 2.9320024953212727, + "grad_norm": 0.11012339063784357, + "learning_rate": 1.0127886007211298e-06, + "loss": 0.0275, + "num_tokens": 188237197.0, + "step": 2351 + }, + { + "epoch": 2.9332501559575794, + "grad_norm": 0.11371206516952323, + "learning_rate": 1.0123361499111383e-06, + "loss": 0.0272, + "num_tokens": 188316620.0, + "step": 2352 + }, + { + "epoch": 2.934497816593886, + "grad_norm": 0.10964761702423996, + "learning_rate": 1.011891836592136e-06, + "loss": 0.0278, + "num_tokens": 188397001.0, + "step": 2353 + }, + { + "epoch": 2.9357454772301934, + "grad_norm": 0.1117067743384703, + "learning_rate": 1.0114556615697971e-06, + "loss": 0.027, + "num_tokens": 188476537.0, + "step": 2354 + }, + { + "epoch": 2.9369931378665, + "grad_norm": 0.10532925727660913, + "learning_rate": 1.0110276256350393e-06, + "loss": 0.0265, + "num_tokens": 188554894.0, + "step": 2355 + }, + { + "epoch": 2.9382407985028074, + "grad_norm": 0.11464738528815109, + "learning_rate": 1.010607729564021e-06, + "loss": 0.0273, + "num_tokens": 188635007.0, + "step": 2356 + }, + { + "epoch": 2.939488459139114, + "grad_norm": 0.11411346562493145, + "learning_rate": 1.0101959741181396e-06, + "loss": 0.0273, + "num_tokens": 188714832.0, + "step": 2357 + }, + { + "epoch": 2.940736119775421, + "grad_norm": 0.11297091154604039, + "learning_rate": 1.0097923600440335e-06, + "loss": 0.0266, + "num_tokens": 188794100.0, + "step": 2358 + }, + { + "epoch": 2.941983780411728, + "grad_norm": 0.11419809264174352, + "learning_rate": 1.0093968880735762e-06, + "loss": 0.0277, + "num_tokens": 188875432.0, + "step": 2359 + }, + { + "epoch": 2.943231441048035, + "grad_norm": 0.10794783898522096, + "learning_rate": 1.009009558923878e-06, + "loss": 0.027, + "num_tokens": 188954916.0, + "step": 2360 + }, + { + "epoch": 2.944479101684342, + "grad_norm": 0.10544977589684877, + "learning_rate": 1.0086303732972843e-06, + "loss": 0.0266, + "num_tokens": 189035103.0, + "step": 2361 + }, + { + "epoch": 2.945726762320649, + "grad_norm": 0.12035892909369736, + "learning_rate": 1.0082593318813728e-06, + "loss": 0.027, + "num_tokens": 189114556.0, + "step": 2362 + }, + { + "epoch": 2.9469744229569557, + "grad_norm": 0.11275061926774883, + "learning_rate": 1.0078964353489536e-06, + "loss": 0.0268, + "num_tokens": 189194415.0, + "step": 2363 + }, + { + "epoch": 2.9482220835932624, + "grad_norm": 0.11213359019218412, + "learning_rate": 1.0075416843580687e-06, + "loss": 0.0272, + "num_tokens": 189275904.0, + "step": 2364 + }, + { + "epoch": 2.9494697442295696, + "grad_norm": 0.11713393130799933, + "learning_rate": 1.0071950795519873e-06, + "loss": 0.0279, + "num_tokens": 189355944.0, + "step": 2365 + }, + { + "epoch": 2.9507174048658764, + "grad_norm": 0.10806295055772125, + "learning_rate": 1.00685662155921e-06, + "loss": 0.0274, + "num_tokens": 189436064.0, + "step": 2366 + }, + { + "epoch": 2.9519650655021836, + "grad_norm": 0.11788280963029622, + "learning_rate": 1.0065263109934633e-06, + "loss": 0.0277, + "num_tokens": 189516906.0, + "step": 2367 + }, + { + "epoch": 2.9532127261384904, + "grad_norm": 0.11439835587046002, + "learning_rate": 1.0062041484536994e-06, + "loss": 0.0292, + "num_tokens": 189597299.0, + "step": 2368 + }, + { + "epoch": 2.954460386774797, + "grad_norm": 0.11618035546784151, + "learning_rate": 1.0058901345240967e-06, + "loss": 0.0274, + "num_tokens": 189677346.0, + "step": 2369 + }, + { + "epoch": 2.955708047411104, + "grad_norm": 0.10998886233872446, + "learning_rate": 1.0055842697740576e-06, + "loss": 0.0268, + "num_tokens": 189756509.0, + "step": 2370 + }, + { + "epoch": 2.956955708047411, + "grad_norm": 0.1102297346203138, + "learning_rate": 1.0052865547582074e-06, + "loss": 0.0273, + "num_tokens": 189837787.0, + "step": 2371 + }, + { + "epoch": 2.958203368683718, + "grad_norm": 0.11011616094968564, + "learning_rate": 1.004996990016393e-06, + "loss": 0.0278, + "num_tokens": 189916820.0, + "step": 2372 + }, + { + "epoch": 2.959451029320025, + "grad_norm": 0.1058190186745868, + "learning_rate": 1.0047155760736828e-06, + "loss": 0.0263, + "num_tokens": 189996448.0, + "step": 2373 + }, + { + "epoch": 2.960698689956332, + "grad_norm": 0.09902032964536187, + "learning_rate": 1.004442313440366e-06, + "loss": 0.0261, + "num_tokens": 190075935.0, + "step": 2374 + }, + { + "epoch": 2.9619463505926387, + "grad_norm": 0.12154175766740306, + "learning_rate": 1.0041772026119493e-06, + "loss": 0.027, + "num_tokens": 190156058.0, + "step": 2375 + }, + { + "epoch": 2.963194011228946, + "grad_norm": 0.11633889607609949, + "learning_rate": 1.0039202440691598e-06, + "loss": 0.0271, + "num_tokens": 190236260.0, + "step": 2376 + }, + { + "epoch": 2.9644416718652526, + "grad_norm": 0.10715021652513443, + "learning_rate": 1.0036714382779405e-06, + "loss": 0.0263, + "num_tokens": 190315801.0, + "step": 2377 + }, + { + "epoch": 2.9656893325015594, + "grad_norm": 0.11833748571681439, + "learning_rate": 1.0034307856894511e-06, + "loss": 0.0268, + "num_tokens": 190394686.0, + "step": 2378 + }, + { + "epoch": 2.9669369931378666, + "grad_norm": 0.11390670772540895, + "learning_rate": 1.0031982867400683e-06, + "loss": 0.0274, + "num_tokens": 190474200.0, + "step": 2379 + }, + { + "epoch": 2.9681846537741734, + "grad_norm": 0.09835116296798709, + "learning_rate": 1.0029739418513825e-06, + "loss": 0.0262, + "num_tokens": 190553083.0, + "step": 2380 + }, + { + "epoch": 2.96943231441048, + "grad_norm": 0.10781406391922578, + "learning_rate": 1.0027577514301988e-06, + "loss": 0.0264, + "num_tokens": 190632255.0, + "step": 2381 + }, + { + "epoch": 2.9706799750467874, + "grad_norm": 0.10062575360038996, + "learning_rate": 1.002549715868536e-06, + "loss": 0.0262, + "num_tokens": 190712052.0, + "step": 2382 + }, + { + "epoch": 2.971927635683094, + "grad_norm": 0.10840533051873137, + "learning_rate": 1.0023498355436255e-06, + "loss": 0.0269, + "num_tokens": 190791575.0, + "step": 2383 + }, + { + "epoch": 2.9731752963194014, + "grad_norm": 0.10941923321723614, + "learning_rate": 1.0021581108179105e-06, + "loss": 0.026, + "num_tokens": 190870712.0, + "step": 2384 + }, + { + "epoch": 2.974422956955708, + "grad_norm": 0.10761702827903147, + "learning_rate": 1.0019745420390455e-06, + "loss": 0.027, + "num_tokens": 190951038.0, + "step": 2385 + }, + { + "epoch": 2.975670617592015, + "grad_norm": 0.11368723139537162, + "learning_rate": 1.001799129539897e-06, + "loss": 0.0276, + "num_tokens": 191030954.0, + "step": 2386 + }, + { + "epoch": 2.9769182782283217, + "grad_norm": 0.10855427538159527, + "learning_rate": 1.0016318736385406e-06, + "loss": 0.0268, + "num_tokens": 191110413.0, + "step": 2387 + }, + { + "epoch": 2.978165938864629, + "grad_norm": 0.1068843734320981, + "learning_rate": 1.0014727746382615e-06, + "loss": 0.0259, + "num_tokens": 191189445.0, + "step": 2388 + }, + { + "epoch": 2.9794135995009356, + "grad_norm": 0.1257994774418835, + "learning_rate": 1.0013218328275544e-06, + "loss": 0.0279, + "num_tokens": 191270715.0, + "step": 2389 + }, + { + "epoch": 2.980661260137243, + "grad_norm": 0.11714566321070789, + "learning_rate": 1.0011790484801231e-06, + "loss": 0.0281, + "num_tokens": 191350789.0, + "step": 2390 + }, + { + "epoch": 2.9819089207735496, + "grad_norm": 0.11060106527704666, + "learning_rate": 1.0010444218548777e-06, + "loss": 0.0272, + "num_tokens": 191432051.0, + "step": 2391 + }, + { + "epoch": 2.9831565814098564, + "grad_norm": 0.11345865119840931, + "learning_rate": 1.0009179531959374e-06, + "loss": 0.0274, + "num_tokens": 191514006.0, + "step": 2392 + }, + { + "epoch": 2.984404242046163, + "grad_norm": 0.11834001788647562, + "learning_rate": 1.0007996427326282e-06, + "loss": 0.0274, + "num_tokens": 191595045.0, + "step": 2393 + }, + { + "epoch": 2.9856519026824704, + "grad_norm": 0.11736078471554756, + "learning_rate": 1.0006894906794828e-06, + "loss": 0.0281, + "num_tokens": 191674983.0, + "step": 2394 + }, + { + "epoch": 2.986899563318777, + "grad_norm": 0.10662448160761655, + "learning_rate": 1.0005874972362403e-06, + "loss": 0.0271, + "num_tokens": 191754802.0, + "step": 2395 + }, + { + "epoch": 2.9881472239550844, + "grad_norm": 0.10732557554988709, + "learning_rate": 1.000493662587845e-06, + "loss": 0.0265, + "num_tokens": 191833832.0, + "step": 2396 + }, + { + "epoch": 2.989394884591391, + "grad_norm": 0.11470784626798826, + "learning_rate": 1.0004079869044482e-06, + "loss": 0.0274, + "num_tokens": 191913223.0, + "step": 2397 + }, + { + "epoch": 2.990642545227698, + "grad_norm": 0.10039798181941487, + "learning_rate": 1.0003304703414053e-06, + "loss": 0.0255, + "num_tokens": 191993666.0, + "step": 2398 + }, + { + "epoch": 2.991890205864005, + "grad_norm": 0.10361335698424663, + "learning_rate": 1.0002611130392772e-06, + "loss": 0.0263, + "num_tokens": 192074190.0, + "step": 2399 + }, + { + "epoch": 2.993137866500312, + "grad_norm": 0.10857365381947398, + "learning_rate": 1.0001999151238303e-06, + "loss": 0.0271, + "num_tokens": 192153740.0, + "step": 2400 + }, + { + "epoch": 2.994385527136619, + "grad_norm": 0.10539286028474344, + "learning_rate": 1.0001468767060341e-06, + "loss": 0.0264, + "num_tokens": 192232957.0, + "step": 2401 + }, + { + "epoch": 2.995633187772926, + "grad_norm": 0.1103209776499511, + "learning_rate": 1.000101997882064e-06, + "loss": 0.0273, + "num_tokens": 192312730.0, + "step": 2402 + }, + { + "epoch": 2.9968808484092326, + "grad_norm": 0.11105172947776741, + "learning_rate": 1.0000652787332984e-06, + "loss": 0.0274, + "num_tokens": 192392230.0, + "step": 2403 + }, + { + "epoch": 2.9981285090455394, + "grad_norm": 0.11159122867179921, + "learning_rate": 1.0000367193263206e-06, + "loss": 0.0275, + "num_tokens": 192471730.0, + "step": 2404 + }, + { + "epoch": 2.9993761696818466, + "grad_norm": 0.10531323399329573, + "learning_rate": 1.000016319712917e-06, + "loss": 0.0264, + "num_tokens": 192551044.0, + "step": 2405 + }, + { + "epoch": 3.0, + "grad_norm": 0.10531323399329573, + "learning_rate": 1.0000040799300788e-06, + "loss": 0.0257, + "num_tokens": 192590850.0, + "step": 2406 + }, + { + "epoch": 3.0, + "step": 2406, + "total_flos": 3.699807586474721e+17, + "train_loss": 0.0587594091221814, + "train_runtime": 5162.578, + "train_samples_per_second": 59.587, + "train_steps_per_second": 0.466 + } + ], + "logging_steps": 1, + "max_steps": 2406, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 3.699807586474721e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}