{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 2406, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0012476606363069245, "grad_norm": 9.34911917109977, "learning_rate": 0.0, "loss": 2.4049, "num_tokens": 79590.0, "step": 1 }, { "epoch": 0.002495321272613849, "grad_norm": 9.364768973646816, "learning_rate": 1.36986301369863e-07, "loss": 2.4134, "num_tokens": 159103.0, "step": 2 }, { "epoch": 0.0037429819089207735, "grad_norm": 9.456536966619405, "learning_rate": 2.73972602739726e-07, "loss": 2.4175, "num_tokens": 237799.0, "step": 3 }, { "epoch": 0.004990642545227698, "grad_norm": 9.426354062444751, "learning_rate": 4.1095890410958903e-07, "loss": 2.4288, "num_tokens": 316493.0, "step": 4 }, { "epoch": 0.006238303181534623, "grad_norm": 9.24236293603858, "learning_rate": 5.47945205479452e-07, "loss": 2.3781, "num_tokens": 396816.0, "step": 5 }, { "epoch": 0.007485963817841547, "grad_norm": 9.125681681424858, "learning_rate": 6.849315068493151e-07, "loss": 2.3607, "num_tokens": 477827.0, "step": 6 }, { "epoch": 0.008733624454148471, "grad_norm": 9.096924762031028, "learning_rate": 8.219178082191781e-07, "loss": 2.3668, "num_tokens": 557522.0, "step": 7 }, { "epoch": 0.009981285090455396, "grad_norm": 9.119042405502713, "learning_rate": 9.589041095890411e-07, "loss": 2.3608, "num_tokens": 636975.0, "step": 8 }, { "epoch": 0.011228945726762321, "grad_norm": 8.92660720049316, "learning_rate": 1.095890410958904e-06, "loss": 2.3189, "num_tokens": 715688.0, "step": 9 }, { "epoch": 0.012476606363069246, "grad_norm": 8.341929678010592, "learning_rate": 1.2328767123287673e-06, "loss": 2.2439, "num_tokens": 796141.0, "step": 10 }, { "epoch": 0.01372426699937617, "grad_norm": 8.261055619190312, "learning_rate": 1.3698630136986302e-06, "loss": 2.2317, "num_tokens": 875804.0, "step": 11 }, { "epoch": 0.014971927635683094, "grad_norm": 8.045610516077126, "learning_rate": 1.5068493150684932e-06, "loss": 2.1818, "num_tokens": 957377.0, "step": 12 }, { "epoch": 0.016219588271990017, "grad_norm": 6.709132143557291, "learning_rate": 1.6438356164383561e-06, "loss": 1.9262, "num_tokens": 1038132.0, "step": 13 }, { "epoch": 0.017467248908296942, "grad_norm": 6.562072632018573, "learning_rate": 1.7808219178082193e-06, "loss": 1.8848, "num_tokens": 1119984.0, "step": 14 }, { "epoch": 0.018714909544603867, "grad_norm": 6.461084025016272, "learning_rate": 1.9178082191780823e-06, "loss": 1.8372, "num_tokens": 1200787.0, "step": 15 }, { "epoch": 0.019962570180910792, "grad_norm": 6.432679419207876, "learning_rate": 2.0547945205479454e-06, "loss": 1.8135, "num_tokens": 1281286.0, "step": 16 }, { "epoch": 0.021210230817217717, "grad_norm": 6.869674718914819, "learning_rate": 2.191780821917808e-06, "loss": 1.2571, "num_tokens": 1360214.0, "step": 17 }, { "epoch": 0.022457891453524642, "grad_norm": 6.548310858256031, "learning_rate": 2.3287671232876713e-06, "loss": 1.2354, "num_tokens": 1442460.0, "step": 18 }, { "epoch": 0.023705552089831567, "grad_norm": 7.14680245757635, "learning_rate": 2.4657534246575345e-06, "loss": 1.1882, "num_tokens": 1522593.0, "step": 19 }, { "epoch": 0.024953212726138492, "grad_norm": 7.930859427939441, "learning_rate": 2.6027397260273973e-06, "loss": 1.0258, "num_tokens": 1603158.0, "step": 20 }, { "epoch": 0.026200873362445413, "grad_norm": 7.129618686033834, "learning_rate": 2.7397260273972604e-06, "loss": 0.9557, "num_tokens": 1682215.0, "step": 21 }, { "epoch": 0.02744853399875234, "grad_norm": 8.351617658742393, "learning_rate": 2.876712328767123e-06, "loss": 0.8064, "num_tokens": 1762875.0, "step": 22 }, { "epoch": 0.028696194635059263, "grad_norm": 6.202115696809869, "learning_rate": 3.0136986301369864e-06, "loss": 0.4053, "num_tokens": 1842280.0, "step": 23 }, { "epoch": 0.02994385527136619, "grad_norm": 3.015415758599779, "learning_rate": 3.1506849315068495e-06, "loss": 0.2977, "num_tokens": 1922017.0, "step": 24 }, { "epoch": 0.031191515907673113, "grad_norm": 1.7548078686272264, "learning_rate": 3.2876712328767123e-06, "loss": 0.2561, "num_tokens": 2001171.0, "step": 25 }, { "epoch": 0.032439176543980035, "grad_norm": 1.2570888602541386, "learning_rate": 3.4246575342465754e-06, "loss": 0.243, "num_tokens": 2082788.0, "step": 26 }, { "epoch": 0.03368683718028696, "grad_norm": 0.9858243334964526, "learning_rate": 3.5616438356164386e-06, "loss": 0.2084, "num_tokens": 2163533.0, "step": 27 }, { "epoch": 0.034934497816593885, "grad_norm": 0.8941120107804094, "learning_rate": 3.6986301369863014e-06, "loss": 0.2072, "num_tokens": 2242151.0, "step": 28 }, { "epoch": 0.03618215845290081, "grad_norm": 0.834578793765266, "learning_rate": 3.8356164383561645e-06, "loss": 0.2113, "num_tokens": 2321655.0, "step": 29 }, { "epoch": 0.037429819089207735, "grad_norm": 0.7958460007482222, "learning_rate": 3.972602739726027e-06, "loss": 0.1935, "num_tokens": 2400547.0, "step": 30 }, { "epoch": 0.03867747972551466, "grad_norm": 0.7476709118804037, "learning_rate": 4.109589041095891e-06, "loss": 0.1932, "num_tokens": 2481040.0, "step": 31 }, { "epoch": 0.039925140361821584, "grad_norm": 0.7336501580269786, "learning_rate": 4.246575342465754e-06, "loss": 0.1869, "num_tokens": 2561815.0, "step": 32 }, { "epoch": 0.041172800998128506, "grad_norm": 0.7276050156656503, "learning_rate": 4.383561643835616e-06, "loss": 0.1819, "num_tokens": 2642280.0, "step": 33 }, { "epoch": 0.042420461634435434, "grad_norm": 0.7320713436530883, "learning_rate": 4.52054794520548e-06, "loss": 0.1776, "num_tokens": 2722505.0, "step": 34 }, { "epoch": 0.043668122270742356, "grad_norm": 0.7211851177445922, "learning_rate": 4.657534246575343e-06, "loss": 0.1626, "num_tokens": 2801483.0, "step": 35 }, { "epoch": 0.044915782907049284, "grad_norm": 0.7706879408631792, "learning_rate": 4.7945205479452054e-06, "loss": 0.1652, "num_tokens": 2880749.0, "step": 36 }, { "epoch": 0.046163443543356206, "grad_norm": 0.6310470242010754, "learning_rate": 4.931506849315069e-06, "loss": 0.1578, "num_tokens": 2960025.0, "step": 37 }, { "epoch": 0.047411104179663134, "grad_norm": 0.5719157846400421, "learning_rate": 5.068493150684932e-06, "loss": 0.1559, "num_tokens": 3040006.0, "step": 38 }, { "epoch": 0.048658764815970056, "grad_norm": 0.5556127515496487, "learning_rate": 5.2054794520547945e-06, "loss": 0.1445, "num_tokens": 3118779.0, "step": 39 }, { "epoch": 0.049906425452276984, "grad_norm": 0.47419374873726, "learning_rate": 5.342465753424658e-06, "loss": 0.1494, "num_tokens": 3200031.0, "step": 40 }, { "epoch": 0.051154086088583905, "grad_norm": 0.4421370793504471, "learning_rate": 5.479452054794521e-06, "loss": 0.1523, "num_tokens": 3281300.0, "step": 41 }, { "epoch": 0.05240174672489083, "grad_norm": 0.43669518921865735, "learning_rate": 5.6164383561643845e-06, "loss": 0.1403, "num_tokens": 3361534.0, "step": 42 }, { "epoch": 0.053649407361197755, "grad_norm": 0.4277843089034562, "learning_rate": 5.753424657534246e-06, "loss": 0.1432, "num_tokens": 3441863.0, "step": 43 }, { "epoch": 0.05489706799750468, "grad_norm": 0.41630114035277377, "learning_rate": 5.89041095890411e-06, "loss": 0.1315, "num_tokens": 3521074.0, "step": 44 }, { "epoch": 0.056144728633811605, "grad_norm": 0.41370843939103547, "learning_rate": 6.027397260273973e-06, "loss": 0.1387, "num_tokens": 3602411.0, "step": 45 }, { "epoch": 0.05739238927011853, "grad_norm": 0.43876538412653404, "learning_rate": 6.164383561643836e-06, "loss": 0.1361, "num_tokens": 3682928.0, "step": 46 }, { "epoch": 0.058640049906425455, "grad_norm": 0.34801092467574696, "learning_rate": 6.301369863013699e-06, "loss": 0.1258, "num_tokens": 3763667.0, "step": 47 }, { "epoch": 0.05988771054273238, "grad_norm": 0.2939900530144378, "learning_rate": 6.438356164383563e-06, "loss": 0.1322, "num_tokens": 3845378.0, "step": 48 }, { "epoch": 0.0611353711790393, "grad_norm": 0.24786140273528653, "learning_rate": 6.5753424657534245e-06, "loss": 0.1267, "num_tokens": 3925502.0, "step": 49 }, { "epoch": 0.06238303181534623, "grad_norm": 0.20173255384894515, "learning_rate": 6.712328767123288e-06, "loss": 0.1208, "num_tokens": 4004936.0, "step": 50 }, { "epoch": 0.06363069245165315, "grad_norm": 0.18388818103544632, "learning_rate": 6.849315068493151e-06, "loss": 0.1225, "num_tokens": 4084695.0, "step": 51 }, { "epoch": 0.06487835308796007, "grad_norm": 0.21045301465823563, "learning_rate": 6.9863013698630145e-06, "loss": 0.1216, "num_tokens": 4163992.0, "step": 52 }, { "epoch": 0.066126013724267, "grad_norm": 0.18341190230418852, "learning_rate": 7.123287671232877e-06, "loss": 0.124, "num_tokens": 4244593.0, "step": 53 }, { "epoch": 0.06737367436057393, "grad_norm": 0.19088058421518178, "learning_rate": 7.260273972602741e-06, "loss": 0.1145, "num_tokens": 4324390.0, "step": 54 }, { "epoch": 0.06862133499688085, "grad_norm": 0.19103912807016007, "learning_rate": 7.397260273972603e-06, "loss": 0.1271, "num_tokens": 4405698.0, "step": 55 }, { "epoch": 0.06986899563318777, "grad_norm": 0.18970020622501074, "learning_rate": 7.534246575342466e-06, "loss": 0.112, "num_tokens": 4485053.0, "step": 56 }, { "epoch": 0.07111665626949469, "grad_norm": 0.19164305174626092, "learning_rate": 7.671232876712329e-06, "loss": 0.1098, "num_tokens": 4565926.0, "step": 57 }, { "epoch": 0.07236431690580163, "grad_norm": 0.19460729939643484, "learning_rate": 7.808219178082192e-06, "loss": 0.1087, "num_tokens": 4644896.0, "step": 58 }, { "epoch": 0.07361197754210855, "grad_norm": 0.18559860342670173, "learning_rate": 7.945205479452055e-06, "loss": 0.1101, "num_tokens": 4724338.0, "step": 59 }, { "epoch": 0.07485963817841547, "grad_norm": 0.20159007460588826, "learning_rate": 8.082191780821919e-06, "loss": 0.1194, "num_tokens": 4805035.0, "step": 60 }, { "epoch": 0.07610729881472239, "grad_norm": 0.19163951373300692, "learning_rate": 8.219178082191782e-06, "loss": 0.1101, "num_tokens": 4886497.0, "step": 61 }, { "epoch": 0.07735495945102933, "grad_norm": 0.1792411438909808, "learning_rate": 8.356164383561644e-06, "loss": 0.1052, "num_tokens": 4966066.0, "step": 62 }, { "epoch": 0.07860262008733625, "grad_norm": 0.19871949594852764, "learning_rate": 8.493150684931507e-06, "loss": 0.1107, "num_tokens": 5046294.0, "step": 63 }, { "epoch": 0.07985028072364317, "grad_norm": 0.2009426482501604, "learning_rate": 8.63013698630137e-06, "loss": 0.111, "num_tokens": 5127200.0, "step": 64 }, { "epoch": 0.08109794135995009, "grad_norm": 0.18854634659363081, "learning_rate": 8.767123287671233e-06, "loss": 0.1062, "num_tokens": 5207375.0, "step": 65 }, { "epoch": 0.08234560199625701, "grad_norm": 0.1909614838823602, "learning_rate": 8.904109589041097e-06, "loss": 0.1055, "num_tokens": 5288077.0, "step": 66 }, { "epoch": 0.08359326263256395, "grad_norm": 0.18966488054923936, "learning_rate": 9.04109589041096e-06, "loss": 0.0965, "num_tokens": 5368986.0, "step": 67 }, { "epoch": 0.08484092326887087, "grad_norm": 0.19600407468281897, "learning_rate": 9.178082191780823e-06, "loss": 0.0989, "num_tokens": 5448866.0, "step": 68 }, { "epoch": 0.08608858390517779, "grad_norm": 0.19248220731163004, "learning_rate": 9.315068493150685e-06, "loss": 0.1021, "num_tokens": 5529761.0, "step": 69 }, { "epoch": 0.08733624454148471, "grad_norm": 0.1790396798178214, "learning_rate": 9.452054794520548e-06, "loss": 0.0977, "num_tokens": 5610079.0, "step": 70 }, { "epoch": 0.08858390517779165, "grad_norm": 0.20227512996028574, "learning_rate": 9.589041095890411e-06, "loss": 0.103, "num_tokens": 5690121.0, "step": 71 }, { "epoch": 0.08983156581409857, "grad_norm": 0.1804340417389163, "learning_rate": 9.726027397260275e-06, "loss": 0.0988, "num_tokens": 5770807.0, "step": 72 }, { "epoch": 0.09107922645040549, "grad_norm": 0.18708792441875738, "learning_rate": 9.863013698630138e-06, "loss": 0.1007, "num_tokens": 5851162.0, "step": 73 }, { "epoch": 0.09232688708671241, "grad_norm": 0.18580134464623252, "learning_rate": 1e-05, "loss": 0.1015, "num_tokens": 5933144.0, "step": 74 }, { "epoch": 0.09357454772301933, "grad_norm": 0.1777511395438126, "learning_rate": 9.999995920069922e-06, "loss": 0.0925, "num_tokens": 6013167.0, "step": 75 }, { "epoch": 0.09482220835932627, "grad_norm": 0.18453230882301527, "learning_rate": 9.999983680287084e-06, "loss": 0.0995, "num_tokens": 6092429.0, "step": 76 }, { "epoch": 0.09606986899563319, "grad_norm": 0.17990626138096272, "learning_rate": 9.99996328067368e-06, "loss": 0.0894, "num_tokens": 6171395.0, "step": 77 }, { "epoch": 0.09731752963194011, "grad_norm": 0.18671247100201174, "learning_rate": 9.999934721266702e-06, "loss": 0.0954, "num_tokens": 6252161.0, "step": 78 }, { "epoch": 0.09856519026824703, "grad_norm": 0.17367676216662678, "learning_rate": 9.999898002117937e-06, "loss": 0.0882, "num_tokens": 6331946.0, "step": 79 }, { "epoch": 0.09981285090455397, "grad_norm": 0.17790787231369953, "learning_rate": 9.999853123293967e-06, "loss": 0.0948, "num_tokens": 6412878.0, "step": 80 }, { "epoch": 0.10106051154086089, "grad_norm": 0.17825720009899704, "learning_rate": 9.99980008487617e-06, "loss": 0.0883, "num_tokens": 6492625.0, "step": 81 }, { "epoch": 0.10230817217716781, "grad_norm": 0.177633671805786, "learning_rate": 9.999738886960724e-06, "loss": 0.0958, "num_tokens": 6572706.0, "step": 82 }, { "epoch": 0.10355583281347473, "grad_norm": 0.1781514598121185, "learning_rate": 9.999669529658596e-06, "loss": 0.1016, "num_tokens": 6654066.0, "step": 83 }, { "epoch": 0.10480349344978165, "grad_norm": 0.1797123476723708, "learning_rate": 9.999592013095553e-06, "loss": 0.0889, "num_tokens": 6733703.0, "step": 84 }, { "epoch": 0.10605115408608859, "grad_norm": 0.17668230595097628, "learning_rate": 9.999506337412157e-06, "loss": 0.0905, "num_tokens": 6813311.0, "step": 85 }, { "epoch": 0.10729881472239551, "grad_norm": 0.1635782657626139, "learning_rate": 9.99941250276376e-06, "loss": 0.0891, "num_tokens": 6892900.0, "step": 86 }, { "epoch": 0.10854647535870243, "grad_norm": 0.17152001517029147, "learning_rate": 9.999310509320518e-06, "loss": 0.0852, "num_tokens": 6971684.0, "step": 87 }, { "epoch": 0.10979413599500935, "grad_norm": 0.16674351199678547, "learning_rate": 9.999200357267373e-06, "loss": 0.0844, "num_tokens": 7050727.0, "step": 88 }, { "epoch": 0.11104179663131628, "grad_norm": 0.16284064715514623, "learning_rate": 9.999082046804062e-06, "loss": 0.0894, "num_tokens": 7130506.0, "step": 89 }, { "epoch": 0.11228945726762321, "grad_norm": 0.17699815453244996, "learning_rate": 9.998955578145124e-06, "loss": 0.0896, "num_tokens": 7210539.0, "step": 90 }, { "epoch": 0.11353711790393013, "grad_norm": 0.17175993286806868, "learning_rate": 9.998820951519877e-06, "loss": 0.0909, "num_tokens": 7291662.0, "step": 91 }, { "epoch": 0.11478477854023705, "grad_norm": 0.16895654572874105, "learning_rate": 9.998678167172446e-06, "loss": 0.0866, "num_tokens": 7371708.0, "step": 92 }, { "epoch": 0.11603243917654397, "grad_norm": 0.15898095818806993, "learning_rate": 9.99852722536174e-06, "loss": 0.0891, "num_tokens": 7451637.0, "step": 93 }, { "epoch": 0.11728009981285091, "grad_norm": 0.1655714312412077, "learning_rate": 9.998368126361459e-06, "loss": 0.0855, "num_tokens": 7532024.0, "step": 94 }, { "epoch": 0.11852776044915783, "grad_norm": 0.16278257907586435, "learning_rate": 9.998200870460103e-06, "loss": 0.0855, "num_tokens": 7611489.0, "step": 95 }, { "epoch": 0.11977542108546475, "grad_norm": 0.17618441557702208, "learning_rate": 9.998025457960955e-06, "loss": 0.0963, "num_tokens": 7693716.0, "step": 96 }, { "epoch": 0.12102308172177167, "grad_norm": 0.15790344775477788, "learning_rate": 9.997841889182091e-06, "loss": 0.0856, "num_tokens": 7774645.0, "step": 97 }, { "epoch": 0.1222707423580786, "grad_norm": 0.16185678156409003, "learning_rate": 9.997650164456375e-06, "loss": 0.0793, "num_tokens": 7855390.0, "step": 98 }, { "epoch": 0.12351840299438553, "grad_norm": 0.16681775638759538, "learning_rate": 9.997450284131465e-06, "loss": 0.0826, "num_tokens": 7935403.0, "step": 99 }, { "epoch": 0.12476606363069245, "grad_norm": 0.16550888127393662, "learning_rate": 9.997242248569802e-06, "loss": 0.0836, "num_tokens": 8015879.0, "step": 100 }, { "epoch": 0.1260137242669994, "grad_norm": 0.1771249194917155, "learning_rate": 9.997026058148617e-06, "loss": 0.0863, "num_tokens": 8096492.0, "step": 101 }, { "epoch": 0.1272613849033063, "grad_norm": 0.18698309428988277, "learning_rate": 9.996801713259933e-06, "loss": 0.0949, "num_tokens": 8177949.0, "step": 102 }, { "epoch": 0.12850904553961323, "grad_norm": 0.15901318998957345, "learning_rate": 9.996569214310549e-06, "loss": 0.0819, "num_tokens": 8256684.0, "step": 103 }, { "epoch": 0.12975670617592014, "grad_norm": 0.1659574005563422, "learning_rate": 9.99632856172206e-06, "loss": 0.0755, "num_tokens": 8335236.0, "step": 104 }, { "epoch": 0.13100436681222707, "grad_norm": 0.16236837894961287, "learning_rate": 9.99607975593084e-06, "loss": 0.0837, "num_tokens": 8414924.0, "step": 105 }, { "epoch": 0.132252027448534, "grad_norm": 0.16857841750351044, "learning_rate": 9.995822797388052e-06, "loss": 0.0832, "num_tokens": 8494719.0, "step": 106 }, { "epoch": 0.13349968808484092, "grad_norm": 0.16077102501626883, "learning_rate": 9.995557686559635e-06, "loss": 0.0825, "num_tokens": 8574385.0, "step": 107 }, { "epoch": 0.13474734872114785, "grad_norm": 0.16504694201517098, "learning_rate": 9.995284423926318e-06, "loss": 0.0823, "num_tokens": 8655947.0, "step": 108 }, { "epoch": 0.13599500935745476, "grad_norm": 0.16670618256633662, "learning_rate": 9.995003009983608e-06, "loss": 0.0859, "num_tokens": 8735660.0, "step": 109 }, { "epoch": 0.1372426699937617, "grad_norm": 0.18503209596000766, "learning_rate": 9.994713445241793e-06, "loss": 0.087, "num_tokens": 8816074.0, "step": 110 }, { "epoch": 0.13849033063006863, "grad_norm": 0.16851469586707762, "learning_rate": 9.994415730225943e-06, "loss": 0.0949, "num_tokens": 8897733.0, "step": 111 }, { "epoch": 0.13973799126637554, "grad_norm": 0.16695446108820516, "learning_rate": 9.994109865475903e-06, "loss": 0.0848, "num_tokens": 8977866.0, "step": 112 }, { "epoch": 0.14098565190268247, "grad_norm": 0.16886821971011903, "learning_rate": 9.993795851546302e-06, "loss": 0.0847, "num_tokens": 9059979.0, "step": 113 }, { "epoch": 0.14223331253898938, "grad_norm": 0.15825031266233286, "learning_rate": 9.993473689006538e-06, "loss": 0.0797, "num_tokens": 9139827.0, "step": 114 }, { "epoch": 0.14348097317529632, "grad_norm": 0.1628042779419236, "learning_rate": 9.99314337844079e-06, "loss": 0.0866, "num_tokens": 9220225.0, "step": 115 }, { "epoch": 0.14472863381160325, "grad_norm": 0.17318475399282052, "learning_rate": 9.992804920448013e-06, "loss": 0.0835, "num_tokens": 9300879.0, "step": 116 }, { "epoch": 0.14597629444791016, "grad_norm": 0.15657831342283274, "learning_rate": 9.992458315641932e-06, "loss": 0.0763, "num_tokens": 9380164.0, "step": 117 }, { "epoch": 0.1472239550842171, "grad_norm": 0.1716611418125263, "learning_rate": 9.992103564651048e-06, "loss": 0.0864, "num_tokens": 9460543.0, "step": 118 }, { "epoch": 0.14847161572052403, "grad_norm": 0.15729587536249917, "learning_rate": 9.991740668118629e-06, "loss": 0.078, "num_tokens": 9540063.0, "step": 119 }, { "epoch": 0.14971927635683094, "grad_norm": 0.17115153204306796, "learning_rate": 9.991369626702717e-06, "loss": 0.0838, "num_tokens": 9620959.0, "step": 120 }, { "epoch": 0.15096693699313787, "grad_norm": 0.16228121857689287, "learning_rate": 9.990990441076125e-06, "loss": 0.082, "num_tokens": 9702803.0, "step": 121 }, { "epoch": 0.15221459762944478, "grad_norm": 0.1635063935531281, "learning_rate": 9.990603111926424e-06, "loss": 0.0788, "num_tokens": 9782410.0, "step": 122 }, { "epoch": 0.15346225826575172, "grad_norm": 0.17181078017382084, "learning_rate": 9.990207639955969e-06, "loss": 0.0819, "num_tokens": 9863350.0, "step": 123 }, { "epoch": 0.15470991890205865, "grad_norm": 0.1643811410282833, "learning_rate": 9.989804025881862e-06, "loss": 0.077, "num_tokens": 9942485.0, "step": 124 }, { "epoch": 0.15595757953836556, "grad_norm": 0.1659276911169034, "learning_rate": 9.98939227043598e-06, "loss": 0.0785, "num_tokens": 10022555.0, "step": 125 }, { "epoch": 0.1572052401746725, "grad_norm": 0.1652036681004015, "learning_rate": 9.988972374364961e-06, "loss": 0.0802, "num_tokens": 10102391.0, "step": 126 }, { "epoch": 0.1584529008109794, "grad_norm": 0.17240153860438123, "learning_rate": 9.988544338430203e-06, "loss": 0.0796, "num_tokens": 10183708.0, "step": 127 }, { "epoch": 0.15970056144728634, "grad_norm": 0.16529523837288534, "learning_rate": 9.988108163407865e-06, "loss": 0.0809, "num_tokens": 10265029.0, "step": 128 }, { "epoch": 0.16094822208359327, "grad_norm": 0.1684552284249565, "learning_rate": 9.987663850088862e-06, "loss": 0.0787, "num_tokens": 10344489.0, "step": 129 }, { "epoch": 0.16219588271990018, "grad_norm": 0.15609693921109477, "learning_rate": 9.987211399278871e-06, "loss": 0.0765, "num_tokens": 10423520.0, "step": 130 }, { "epoch": 0.16344354335620712, "grad_norm": 0.16318291937030596, "learning_rate": 9.98675081179832e-06, "loss": 0.071, "num_tokens": 10502900.0, "step": 131 }, { "epoch": 0.16469120399251402, "grad_norm": 0.2007608468972681, "learning_rate": 9.986282088482397e-06, "loss": 0.0767, "num_tokens": 10583282.0, "step": 132 }, { "epoch": 0.16593886462882096, "grad_norm": 0.16935616163873396, "learning_rate": 9.985805230181031e-06, "loss": 0.0749, "num_tokens": 10662589.0, "step": 133 }, { "epoch": 0.1671865252651279, "grad_norm": 0.171740825216556, "learning_rate": 9.985320237758918e-06, "loss": 0.0775, "num_tokens": 10742307.0, "step": 134 }, { "epoch": 0.1684341859014348, "grad_norm": 0.15828176697546295, "learning_rate": 9.984827112095495e-06, "loss": 0.0753, "num_tokens": 10821872.0, "step": 135 }, { "epoch": 0.16968184653774174, "grad_norm": 0.16873525559334726, "learning_rate": 9.984325854084946e-06, "loss": 0.0786, "num_tokens": 10907937.0, "step": 136 }, { "epoch": 0.17092950717404864, "grad_norm": 0.16380536060958564, "learning_rate": 9.983816464636203e-06, "loss": 0.0784, "num_tokens": 10988958.0, "step": 137 }, { "epoch": 0.17217716781035558, "grad_norm": 0.160963929320833, "learning_rate": 9.983298944672942e-06, "loss": 0.0817, "num_tokens": 11070498.0, "step": 138 }, { "epoch": 0.17342482844666252, "grad_norm": 0.164166377631778, "learning_rate": 9.982773295133585e-06, "loss": 0.0754, "num_tokens": 11150195.0, "step": 139 }, { "epoch": 0.17467248908296942, "grad_norm": 0.18357918297872183, "learning_rate": 9.982239516971295e-06, "loss": 0.0783, "num_tokens": 11231131.0, "step": 140 }, { "epoch": 0.17592014971927636, "grad_norm": 0.15755220760124228, "learning_rate": 9.98169761115397e-06, "loss": 0.0786, "num_tokens": 11311740.0, "step": 141 }, { "epoch": 0.1771678103555833, "grad_norm": 0.16447695668867712, "learning_rate": 9.98114757866425e-06, "loss": 0.0784, "num_tokens": 11392420.0, "step": 142 }, { "epoch": 0.1784154709918902, "grad_norm": 0.15694055102021415, "learning_rate": 9.980589420499512e-06, "loss": 0.0841, "num_tokens": 11472491.0, "step": 143 }, { "epoch": 0.17966313162819714, "grad_norm": 0.1614277416464966, "learning_rate": 9.980023137671862e-06, "loss": 0.072, "num_tokens": 11552759.0, "step": 144 }, { "epoch": 0.18091079226450404, "grad_norm": 0.1581923947848541, "learning_rate": 9.979448731208145e-06, "loss": 0.0711, "num_tokens": 11632535.0, "step": 145 }, { "epoch": 0.18215845290081098, "grad_norm": 0.16298717987923186, "learning_rate": 9.978866202149931e-06, "loss": 0.0731, "num_tokens": 11712101.0, "step": 146 }, { "epoch": 0.18340611353711792, "grad_norm": 0.17368683828180292, "learning_rate": 9.978275551553526e-06, "loss": 0.0791, "num_tokens": 11791963.0, "step": 147 }, { "epoch": 0.18465377417342482, "grad_norm": 0.16144191529310298, "learning_rate": 9.977676780489953e-06, "loss": 0.0777, "num_tokens": 11872345.0, "step": 148 }, { "epoch": 0.18590143480973176, "grad_norm": 0.1651729281394997, "learning_rate": 9.977069890044965e-06, "loss": 0.0809, "num_tokens": 11954469.0, "step": 149 }, { "epoch": 0.18714909544603867, "grad_norm": 0.16467083171937716, "learning_rate": 9.976454881319041e-06, "loss": 0.0724, "num_tokens": 12033673.0, "step": 150 }, { "epoch": 0.1883967560823456, "grad_norm": 0.15270182358741702, "learning_rate": 9.975831755427376e-06, "loss": 0.0719, "num_tokens": 12113393.0, "step": 151 }, { "epoch": 0.18964441671865254, "grad_norm": 0.1552933576757993, "learning_rate": 9.975200513499886e-06, "loss": 0.0769, "num_tokens": 12194535.0, "step": 152 }, { "epoch": 0.19089207735495944, "grad_norm": 0.15708292865325066, "learning_rate": 9.974561156681203e-06, "loss": 0.076, "num_tokens": 12275521.0, "step": 153 }, { "epoch": 0.19213973799126638, "grad_norm": 0.15744491473566746, "learning_rate": 9.973913686130674e-06, "loss": 0.0718, "num_tokens": 12355487.0, "step": 154 }, { "epoch": 0.1933873986275733, "grad_norm": 0.1684494744054102, "learning_rate": 9.973258103022361e-06, "loss": 0.077, "num_tokens": 12435557.0, "step": 155 }, { "epoch": 0.19463505926388022, "grad_norm": 0.17092559295973017, "learning_rate": 9.97259440854503e-06, "loss": 0.078, "num_tokens": 12516421.0, "step": 156 }, { "epoch": 0.19588271990018716, "grad_norm": 0.16467948975450056, "learning_rate": 9.971922603902164e-06, "loss": 0.0792, "num_tokens": 12596956.0, "step": 157 }, { "epoch": 0.19713038053649407, "grad_norm": 0.16129266645735674, "learning_rate": 9.971242690311944e-06, "loss": 0.0715, "num_tokens": 12677329.0, "step": 158 }, { "epoch": 0.198378041172801, "grad_norm": 0.15566121358014243, "learning_rate": 9.970554669007264e-06, "loss": 0.071, "num_tokens": 12757136.0, "step": 159 }, { "epoch": 0.19962570180910794, "grad_norm": 0.15418083568009736, "learning_rate": 9.969858541235708e-06, "loss": 0.0707, "num_tokens": 12837208.0, "step": 160 }, { "epoch": 0.20087336244541484, "grad_norm": 0.15281662930143763, "learning_rate": 9.969154308259572e-06, "loss": 0.072, "num_tokens": 12916423.0, "step": 161 }, { "epoch": 0.20212102308172178, "grad_norm": 0.15806417128594089, "learning_rate": 9.968441971355839e-06, "loss": 0.0697, "num_tokens": 12995763.0, "step": 162 }, { "epoch": 0.2033686837180287, "grad_norm": 0.16247347666054926, "learning_rate": 9.967721531816194e-06, "loss": 0.069, "num_tokens": 13075036.0, "step": 163 }, { "epoch": 0.20461634435433562, "grad_norm": 0.16120489496865675, "learning_rate": 9.96699299094701e-06, "loss": 0.0657, "num_tokens": 13153140.0, "step": 164 }, { "epoch": 0.20586400499064256, "grad_norm": 0.1654367494643062, "learning_rate": 9.966256350069355e-06, "loss": 0.0719, "num_tokens": 13233496.0, "step": 165 }, { "epoch": 0.20711166562694946, "grad_norm": 0.15797090901409847, "learning_rate": 9.965511610518975e-06, "loss": 0.0741, "num_tokens": 13313688.0, "step": 166 }, { "epoch": 0.2083593262632564, "grad_norm": 0.17318092994565018, "learning_rate": 9.964758773646314e-06, "loss": 0.0705, "num_tokens": 13392817.0, "step": 167 }, { "epoch": 0.2096069868995633, "grad_norm": 0.16326342203924862, "learning_rate": 9.963997840816491e-06, "loss": 0.0694, "num_tokens": 13472052.0, "step": 168 }, { "epoch": 0.21085464753587024, "grad_norm": 0.1649032874821036, "learning_rate": 9.963228813409307e-06, "loss": 0.0718, "num_tokens": 13552229.0, "step": 169 }, { "epoch": 0.21210230817217718, "grad_norm": 0.14746862120200496, "learning_rate": 9.962451692819238e-06, "loss": 0.0674, "num_tokens": 13631487.0, "step": 170 }, { "epoch": 0.2133499688084841, "grad_norm": 0.16140552300758973, "learning_rate": 9.961666480455445e-06, "loss": 0.0711, "num_tokens": 13710876.0, "step": 171 }, { "epoch": 0.21459762944479102, "grad_norm": 0.1561744394885393, "learning_rate": 9.96087317774175e-06, "loss": 0.0678, "num_tokens": 13790236.0, "step": 172 }, { "epoch": 0.21584529008109793, "grad_norm": 0.1532654032002478, "learning_rate": 9.960071786116652e-06, "loss": 0.0701, "num_tokens": 13869459.0, "step": 173 }, { "epoch": 0.21709295071740486, "grad_norm": 0.16418271391935854, "learning_rate": 9.959262307033318e-06, "loss": 0.0702, "num_tokens": 13949850.0, "step": 174 }, { "epoch": 0.2183406113537118, "grad_norm": 0.16417748724525755, "learning_rate": 9.958444741959577e-06, "loss": 0.0794, "num_tokens": 14030886.0, "step": 175 }, { "epoch": 0.2195882719900187, "grad_norm": 0.1475411082063503, "learning_rate": 9.957619092377921e-06, "loss": 0.0697, "num_tokens": 14110548.0, "step": 176 }, { "epoch": 0.22083593262632564, "grad_norm": 0.15200242109370143, "learning_rate": 9.956785359785501e-06, "loss": 0.0725, "num_tokens": 14191162.0, "step": 177 }, { "epoch": 0.22208359326263255, "grad_norm": 0.14951724955021337, "learning_rate": 9.95594354569413e-06, "loss": 0.0713, "num_tokens": 14270373.0, "step": 178 }, { "epoch": 0.22333125389893949, "grad_norm": 0.15437761740995362, "learning_rate": 9.955093651630271e-06, "loss": 0.0694, "num_tokens": 14350521.0, "step": 179 }, { "epoch": 0.22457891453524642, "grad_norm": 0.16318758670731984, "learning_rate": 9.954235679135035e-06, "loss": 0.0648, "num_tokens": 14430117.0, "step": 180 }, { "epoch": 0.22582657517155333, "grad_norm": 0.1509213212454131, "learning_rate": 9.953369629764187e-06, "loss": 0.0659, "num_tokens": 14510184.0, "step": 181 }, { "epoch": 0.22707423580786026, "grad_norm": 0.15811847612755178, "learning_rate": 9.952495505088138e-06, "loss": 0.069, "num_tokens": 14589696.0, "step": 182 }, { "epoch": 0.2283218964441672, "grad_norm": 0.17426294289031205, "learning_rate": 9.95161330669194e-06, "loss": 0.0721, "num_tokens": 14669642.0, "step": 183 }, { "epoch": 0.2295695570804741, "grad_norm": 0.17769882981769738, "learning_rate": 9.950723036175282e-06, "loss": 0.0703, "num_tokens": 14749847.0, "step": 184 }, { "epoch": 0.23081721771678104, "grad_norm": 0.1674097835401589, "learning_rate": 9.9498246951525e-06, "loss": 0.0747, "num_tokens": 14829647.0, "step": 185 }, { "epoch": 0.23206487835308795, "grad_norm": 0.15100509686307984, "learning_rate": 9.948918285252551e-06, "loss": 0.0745, "num_tokens": 14910343.0, "step": 186 }, { "epoch": 0.23331253898939489, "grad_norm": 0.15360581031528767, "learning_rate": 9.948003808119034e-06, "loss": 0.0714, "num_tokens": 14990734.0, "step": 187 }, { "epoch": 0.23456019962570182, "grad_norm": 0.16940538023361718, "learning_rate": 9.94708126541017e-06, "loss": 0.0684, "num_tokens": 15070750.0, "step": 188 }, { "epoch": 0.23580786026200873, "grad_norm": 0.15548216290846645, "learning_rate": 9.94615065879881e-06, "loss": 0.069, "num_tokens": 15150020.0, "step": 189 }, { "epoch": 0.23705552089831566, "grad_norm": 0.14993741595806287, "learning_rate": 9.945211989972425e-06, "loss": 0.0703, "num_tokens": 15231989.0, "step": 190 }, { "epoch": 0.23830318153462257, "grad_norm": 0.1474419032756179, "learning_rate": 9.944265260633105e-06, "loss": 0.0711, "num_tokens": 15312107.0, "step": 191 }, { "epoch": 0.2395508421709295, "grad_norm": 0.1601090310495295, "learning_rate": 9.943310472497556e-06, "loss": 0.0696, "num_tokens": 15391755.0, "step": 192 }, { "epoch": 0.24079850280723644, "grad_norm": 0.16372600424581513, "learning_rate": 9.942347627297095e-06, "loss": 0.0719, "num_tokens": 15472703.0, "step": 193 }, { "epoch": 0.24204616344354335, "grad_norm": 0.15885711149872717, "learning_rate": 9.941376726777656e-06, "loss": 0.0703, "num_tokens": 15552902.0, "step": 194 }, { "epoch": 0.24329382407985028, "grad_norm": 0.14374818433177697, "learning_rate": 9.940397772699773e-06, "loss": 0.0674, "num_tokens": 15633405.0, "step": 195 }, { "epoch": 0.2445414847161572, "grad_norm": 0.14857277972441196, "learning_rate": 9.939410766838586e-06, "loss": 0.0715, "num_tokens": 15714298.0, "step": 196 }, { "epoch": 0.24578914535246413, "grad_norm": 0.15728113670824326, "learning_rate": 9.938415710983834e-06, "loss": 0.0655, "num_tokens": 15793415.0, "step": 197 }, { "epoch": 0.24703680598877106, "grad_norm": 0.15942515909588884, "learning_rate": 9.937412606939854e-06, "loss": 0.0725, "num_tokens": 15874136.0, "step": 198 }, { "epoch": 0.24828446662507797, "grad_norm": 0.13933287189995225, "learning_rate": 9.936401456525578e-06, "loss": 0.0686, "num_tokens": 15953965.0, "step": 199 }, { "epoch": 0.2495321272613849, "grad_norm": 0.14826313565340965, "learning_rate": 9.935382261574527e-06, "loss": 0.0646, "num_tokens": 16034405.0, "step": 200 }, { "epoch": 0.25077978789769184, "grad_norm": 0.14090118375202848, "learning_rate": 9.934355023934808e-06, "loss": 0.0595, "num_tokens": 16112634.0, "step": 201 }, { "epoch": 0.2520274485339988, "grad_norm": 0.15692035946489286, "learning_rate": 9.933319745469117e-06, "loss": 0.0713, "num_tokens": 16193908.0, "step": 202 }, { "epoch": 0.25327510917030566, "grad_norm": 0.1656646737047489, "learning_rate": 9.932276428054723e-06, "loss": 0.0748, "num_tokens": 16275142.0, "step": 203 }, { "epoch": 0.2545227698066126, "grad_norm": 0.1680255993051908, "learning_rate": 9.931225073583476e-06, "loss": 0.0718, "num_tokens": 16355412.0, "step": 204 }, { "epoch": 0.2557704304429195, "grad_norm": 0.1646716198896169, "learning_rate": 9.930165683961803e-06, "loss": 0.0661, "num_tokens": 16435568.0, "step": 205 }, { "epoch": 0.25701809107922646, "grad_norm": 0.1565441365205928, "learning_rate": 9.929098261110694e-06, "loss": 0.0653, "num_tokens": 16516116.0, "step": 206 }, { "epoch": 0.2582657517155334, "grad_norm": 0.15611179423469423, "learning_rate": 9.92802280696571e-06, "loss": 0.0689, "num_tokens": 16596327.0, "step": 207 }, { "epoch": 0.2595134123518403, "grad_norm": 0.16424170862613105, "learning_rate": 9.926939323476976e-06, "loss": 0.0714, "num_tokens": 16675904.0, "step": 208 }, { "epoch": 0.2607610729881472, "grad_norm": 0.1496338642109506, "learning_rate": 9.925847812609174e-06, "loss": 0.0647, "num_tokens": 16754483.0, "step": 209 }, { "epoch": 0.26200873362445415, "grad_norm": 0.14882331263651497, "learning_rate": 9.924748276341541e-06, "loss": 0.0667, "num_tokens": 16834373.0, "step": 210 }, { "epoch": 0.2632563942607611, "grad_norm": 0.14757123106619455, "learning_rate": 9.923640716667872e-06, "loss": 0.0624, "num_tokens": 16914378.0, "step": 211 }, { "epoch": 0.264504054897068, "grad_norm": 0.15327214179668633, "learning_rate": 9.922525135596507e-06, "loss": 0.0731, "num_tokens": 16995299.0, "step": 212 }, { "epoch": 0.2657517155333749, "grad_norm": 0.15329303036152214, "learning_rate": 9.92140153515033e-06, "loss": 0.0699, "num_tokens": 17074695.0, "step": 213 }, { "epoch": 0.26699937616968183, "grad_norm": 0.14973454931210878, "learning_rate": 9.92026991736677e-06, "loss": 0.0644, "num_tokens": 17154211.0, "step": 214 }, { "epoch": 0.26824703680598877, "grad_norm": 0.15013258739242202, "learning_rate": 9.919130284297791e-06, "loss": 0.0661, "num_tokens": 17234490.0, "step": 215 }, { "epoch": 0.2694946974422957, "grad_norm": 0.15536001180184006, "learning_rate": 9.917982638009891e-06, "loss": 0.0727, "num_tokens": 17314715.0, "step": 216 }, { "epoch": 0.27074235807860264, "grad_norm": 0.15435069913626115, "learning_rate": 9.916826980584103e-06, "loss": 0.0657, "num_tokens": 17395497.0, "step": 217 }, { "epoch": 0.2719900187149095, "grad_norm": 0.15469096812383332, "learning_rate": 9.91566331411598e-06, "loss": 0.0626, "num_tokens": 17474577.0, "step": 218 }, { "epoch": 0.27323767935121646, "grad_norm": 0.1600274953046015, "learning_rate": 9.914491640715603e-06, "loss": 0.0676, "num_tokens": 17555477.0, "step": 219 }, { "epoch": 0.2744853399875234, "grad_norm": 0.1406463527512994, "learning_rate": 9.913311962507569e-06, "loss": 0.0592, "num_tokens": 17635011.0, "step": 220 }, { "epoch": 0.2757330006238303, "grad_norm": 0.16023616480511607, "learning_rate": 9.912124281630991e-06, "loss": 0.069, "num_tokens": 17714394.0, "step": 221 }, { "epoch": 0.27698066126013726, "grad_norm": 0.14706360693121406, "learning_rate": 9.910928600239493e-06, "loss": 0.0672, "num_tokens": 17795018.0, "step": 222 }, { "epoch": 0.27822832189644414, "grad_norm": 0.15220399575934448, "learning_rate": 9.909724920501207e-06, "loss": 0.0657, "num_tokens": 17874644.0, "step": 223 }, { "epoch": 0.2794759825327511, "grad_norm": 0.15590253629720327, "learning_rate": 9.90851324459877e-06, "loss": 0.0692, "num_tokens": 17954798.0, "step": 224 }, { "epoch": 0.280723643169058, "grad_norm": 0.15484431097082246, "learning_rate": 9.907293574729317e-06, "loss": 0.0645, "num_tokens": 18034230.0, "step": 225 }, { "epoch": 0.28197130380536495, "grad_norm": 0.1449336777934227, "learning_rate": 9.906065913104474e-06, "loss": 0.0665, "num_tokens": 18113515.0, "step": 226 }, { "epoch": 0.2832189644416719, "grad_norm": 0.14201172287080596, "learning_rate": 9.904830261950366e-06, "loss": 0.0615, "num_tokens": 18193510.0, "step": 227 }, { "epoch": 0.28446662507797876, "grad_norm": 0.1422769788921952, "learning_rate": 9.903586623507603e-06, "loss": 0.0584, "num_tokens": 18273926.0, "step": 228 }, { "epoch": 0.2857142857142857, "grad_norm": 0.1567259496080909, "learning_rate": 9.902335000031273e-06, "loss": 0.0622, "num_tokens": 18354062.0, "step": 229 }, { "epoch": 0.28696194635059263, "grad_norm": 0.1534870201147952, "learning_rate": 9.901075393790953e-06, "loss": 0.0666, "num_tokens": 18434373.0, "step": 230 }, { "epoch": 0.28820960698689957, "grad_norm": 0.14650419642450943, "learning_rate": 9.899807807070684e-06, "loss": 0.0635, "num_tokens": 18513274.0, "step": 231 }, { "epoch": 0.2894572676232065, "grad_norm": 0.14778200561533844, "learning_rate": 9.898532242168987e-06, "loss": 0.0615, "num_tokens": 18592541.0, "step": 232 }, { "epoch": 0.29070492825951344, "grad_norm": 0.14114387988548585, "learning_rate": 9.897248701398848e-06, "loss": 0.0588, "num_tokens": 18671892.0, "step": 233 }, { "epoch": 0.2919525888958203, "grad_norm": 0.16178673205752156, "learning_rate": 9.895957187087713e-06, "loss": 0.0635, "num_tokens": 18751886.0, "step": 234 }, { "epoch": 0.29320024953212726, "grad_norm": 0.14871524378309123, "learning_rate": 9.894657701577488e-06, "loss": 0.0658, "num_tokens": 18832351.0, "step": 235 }, { "epoch": 0.2944479101684342, "grad_norm": 0.1548609260643329, "learning_rate": 9.893350247224532e-06, "loss": 0.0675, "num_tokens": 18912756.0, "step": 236 }, { "epoch": 0.2956955708047411, "grad_norm": 0.1507526419619637, "learning_rate": 9.892034826399657e-06, "loss": 0.0601, "num_tokens": 18993197.0, "step": 237 }, { "epoch": 0.29694323144104806, "grad_norm": 0.156742989410175, "learning_rate": 9.890711441488117e-06, "loss": 0.0669, "num_tokens": 19074321.0, "step": 238 }, { "epoch": 0.29819089207735494, "grad_norm": 0.15435161952476084, "learning_rate": 9.889380094889609e-06, "loss": 0.0659, "num_tokens": 19153748.0, "step": 239 }, { "epoch": 0.2994385527136619, "grad_norm": 0.14843331611676852, "learning_rate": 9.888040789018267e-06, "loss": 0.0608, "num_tokens": 19232902.0, "step": 240 }, { "epoch": 0.3006862133499688, "grad_norm": 0.15991403846542185, "learning_rate": 9.886693526302657e-06, "loss": 0.0699, "num_tokens": 19314211.0, "step": 241 }, { "epoch": 0.30193387398627575, "grad_norm": 0.14343130811237734, "learning_rate": 9.885338309185775e-06, "loss": 0.0593, "num_tokens": 19393486.0, "step": 242 }, { "epoch": 0.3031815346225827, "grad_norm": 0.14976887951301165, "learning_rate": 9.883975140125035e-06, "loss": 0.063, "num_tokens": 19474289.0, "step": 243 }, { "epoch": 0.30442919525888956, "grad_norm": 0.15391003667967385, "learning_rate": 9.88260402159228e-06, "loss": 0.0682, "num_tokens": 19554117.0, "step": 244 }, { "epoch": 0.3056768558951965, "grad_norm": 0.1558428875604292, "learning_rate": 9.88122495607376e-06, "loss": 0.0653, "num_tokens": 19634331.0, "step": 245 }, { "epoch": 0.30692451653150343, "grad_norm": 0.15426290191639358, "learning_rate": 9.879837946070138e-06, "loss": 0.0593, "num_tokens": 19713085.0, "step": 246 }, { "epoch": 0.30817217716781037, "grad_norm": 0.14767406997007507, "learning_rate": 9.878442994096481e-06, "loss": 0.0578, "num_tokens": 19792400.0, "step": 247 }, { "epoch": 0.3094198378041173, "grad_norm": 0.14741065221457425, "learning_rate": 9.87704010268226e-06, "loss": 0.0644, "num_tokens": 19873279.0, "step": 248 }, { "epoch": 0.3106674984404242, "grad_norm": 0.1529316875881466, "learning_rate": 9.87562927437134e-06, "loss": 0.0628, "num_tokens": 19953306.0, "step": 249 }, { "epoch": 0.3119151590767311, "grad_norm": 0.15478552257088712, "learning_rate": 9.87421051172198e-06, "loss": 0.0656, "num_tokens": 20033613.0, "step": 250 }, { "epoch": 0.31316281971303805, "grad_norm": 0.1469468822446913, "learning_rate": 9.872783817306827e-06, "loss": 0.0617, "num_tokens": 20113360.0, "step": 251 }, { "epoch": 0.314410480349345, "grad_norm": 0.1539836863937035, "learning_rate": 9.871349193712905e-06, "loss": 0.0654, "num_tokens": 20192829.0, "step": 252 }, { "epoch": 0.3156581409856519, "grad_norm": 0.15325635042763217, "learning_rate": 9.869906643541625e-06, "loss": 0.0577, "num_tokens": 20271785.0, "step": 253 }, { "epoch": 0.3169058016219588, "grad_norm": 0.14852576321041067, "learning_rate": 9.868456169408763e-06, "loss": 0.0591, "num_tokens": 20351250.0, "step": 254 }, { "epoch": 0.31815346225826574, "grad_norm": 0.14853921696894695, "learning_rate": 9.866997773944469e-06, "loss": 0.0649, "num_tokens": 20432591.0, "step": 255 }, { "epoch": 0.3194011228945727, "grad_norm": 0.15093199632607562, "learning_rate": 9.865531459793254e-06, "loss": 0.0616, "num_tokens": 20511910.0, "step": 256 }, { "epoch": 0.3206487835308796, "grad_norm": 0.14360562555088788, "learning_rate": 9.864057229613988e-06, "loss": 0.0587, "num_tokens": 20591960.0, "step": 257 }, { "epoch": 0.32189644416718655, "grad_norm": 0.15453328885789705, "learning_rate": 9.862575086079897e-06, "loss": 0.0646, "num_tokens": 20673004.0, "step": 258 }, { "epoch": 0.3231441048034934, "grad_norm": 0.14025760262079745, "learning_rate": 9.861085031878556e-06, "loss": 0.0566, "num_tokens": 20751778.0, "step": 259 }, { "epoch": 0.32439176543980036, "grad_norm": 0.154221736145205, "learning_rate": 9.859587069711883e-06, "loss": 0.0677, "num_tokens": 20833796.0, "step": 260 }, { "epoch": 0.3256394260761073, "grad_norm": 0.16473194510891925, "learning_rate": 9.858081202296133e-06, "loss": 0.0628, "num_tokens": 20913685.0, "step": 261 }, { "epoch": 0.32688708671241423, "grad_norm": 0.1477312778238374, "learning_rate": 9.856567432361903e-06, "loss": 0.0608, "num_tokens": 20995255.0, "step": 262 }, { "epoch": 0.32813474734872117, "grad_norm": 0.15295805096246043, "learning_rate": 9.855045762654115e-06, "loss": 0.0631, "num_tokens": 21077034.0, "step": 263 }, { "epoch": 0.32938240798502805, "grad_norm": 0.1447398547624195, "learning_rate": 9.853516195932014e-06, "loss": 0.0579, "num_tokens": 21156349.0, "step": 264 }, { "epoch": 0.330630068621335, "grad_norm": 0.14589297009135765, "learning_rate": 9.851978734969168e-06, "loss": 0.0583, "num_tokens": 21236413.0, "step": 265 }, { "epoch": 0.3318777292576419, "grad_norm": 0.143764406382231, "learning_rate": 9.850433382553457e-06, "loss": 0.062, "num_tokens": 21318138.0, "step": 266 }, { "epoch": 0.33312538989394885, "grad_norm": 0.16011843739089876, "learning_rate": 9.848880141487076e-06, "loss": 0.0695, "num_tokens": 21399813.0, "step": 267 }, { "epoch": 0.3343730505302558, "grad_norm": 0.14438380491620342, "learning_rate": 9.847319014586517e-06, "loss": 0.0598, "num_tokens": 21482627.0, "step": 268 }, { "epoch": 0.33562071116656267, "grad_norm": 0.14004050344567048, "learning_rate": 9.845750004682576e-06, "loss": 0.0591, "num_tokens": 21561686.0, "step": 269 }, { "epoch": 0.3368683718028696, "grad_norm": 0.15403062197709771, "learning_rate": 9.844173114620342e-06, "loss": 0.0595, "num_tokens": 21641830.0, "step": 270 }, { "epoch": 0.33811603243917654, "grad_norm": 0.15466717690067627, "learning_rate": 9.842588347259192e-06, "loss": 0.0568, "num_tokens": 21721224.0, "step": 271 }, { "epoch": 0.3393636930754835, "grad_norm": 0.13856966189349137, "learning_rate": 9.84099570547279e-06, "loss": 0.0598, "num_tokens": 21801699.0, "step": 272 }, { "epoch": 0.3406113537117904, "grad_norm": 0.15061899204640403, "learning_rate": 9.839395192149077e-06, "loss": 0.0591, "num_tokens": 21881771.0, "step": 273 }, { "epoch": 0.3418590143480973, "grad_norm": 0.14228872056371486, "learning_rate": 9.837786810190268e-06, "loss": 0.0608, "num_tokens": 21961323.0, "step": 274 }, { "epoch": 0.3431066749844042, "grad_norm": 0.16567790027277843, "learning_rate": 9.836170562512844e-06, "loss": 0.0608, "num_tokens": 22041013.0, "step": 275 }, { "epoch": 0.34435433562071116, "grad_norm": 0.14395086051397554, "learning_rate": 9.83454645204755e-06, "loss": 0.0581, "num_tokens": 22119949.0, "step": 276 }, { "epoch": 0.3456019962570181, "grad_norm": 0.1516359303870716, "learning_rate": 9.832914481739391e-06, "loss": 0.0634, "num_tokens": 22200178.0, "step": 277 }, { "epoch": 0.34684965689332503, "grad_norm": 0.16105443817398513, "learning_rate": 9.831274654547623e-06, "loss": 0.0626, "num_tokens": 22279674.0, "step": 278 }, { "epoch": 0.34809731752963197, "grad_norm": 0.1590038648782175, "learning_rate": 9.829626973445745e-06, "loss": 0.0635, "num_tokens": 22360285.0, "step": 279 }, { "epoch": 0.34934497816593885, "grad_norm": 0.14937561999750967, "learning_rate": 9.827971441421504e-06, "loss": 0.0613, "num_tokens": 22442292.0, "step": 280 }, { "epoch": 0.3505926388022458, "grad_norm": 0.1474165532101385, "learning_rate": 9.826308061476878e-06, "loss": 0.0565, "num_tokens": 22521661.0, "step": 281 }, { "epoch": 0.3518402994385527, "grad_norm": 0.15035582630080754, "learning_rate": 9.824636836628078e-06, "loss": 0.0624, "num_tokens": 22601095.0, "step": 282 }, { "epoch": 0.35308796007485965, "grad_norm": 0.14438687507583545, "learning_rate": 9.822957769905544e-06, "loss": 0.0573, "num_tokens": 22681539.0, "step": 283 }, { "epoch": 0.3543356207111666, "grad_norm": 0.14358016685030078, "learning_rate": 9.821270864353924e-06, "loss": 0.0532, "num_tokens": 22760996.0, "step": 284 }, { "epoch": 0.35558328134747347, "grad_norm": 0.14707615741317387, "learning_rate": 9.819576123032092e-06, "loss": 0.0603, "num_tokens": 22841748.0, "step": 285 }, { "epoch": 0.3568309419837804, "grad_norm": 0.14812718809993014, "learning_rate": 9.817873549013127e-06, "loss": 0.0592, "num_tokens": 22922667.0, "step": 286 }, { "epoch": 0.35807860262008734, "grad_norm": 0.1649467927673721, "learning_rate": 9.816163145384308e-06, "loss": 0.0554, "num_tokens": 23002249.0, "step": 287 }, { "epoch": 0.3593262632563943, "grad_norm": 0.15358093663493297, "learning_rate": 9.814444915247115e-06, "loss": 0.0632, "num_tokens": 23083209.0, "step": 288 }, { "epoch": 0.3605739238927012, "grad_norm": 0.15692040771846372, "learning_rate": 9.81271886171722e-06, "loss": 0.0592, "num_tokens": 23163343.0, "step": 289 }, { "epoch": 0.3618215845290081, "grad_norm": 0.1580641913112163, "learning_rate": 9.810984987924477e-06, "loss": 0.0579, "num_tokens": 23243258.0, "step": 290 }, { "epoch": 0.363069245165315, "grad_norm": 0.147171020680957, "learning_rate": 9.809243297012923e-06, "loss": 0.0622, "num_tokens": 23325453.0, "step": 291 }, { "epoch": 0.36431690580162196, "grad_norm": 0.15029752697042956, "learning_rate": 9.807493792140774e-06, "loss": 0.0575, "num_tokens": 23406618.0, "step": 292 }, { "epoch": 0.3655645664379289, "grad_norm": 0.1394169020427776, "learning_rate": 9.805736476480407e-06, "loss": 0.0565, "num_tokens": 23485923.0, "step": 293 }, { "epoch": 0.36681222707423583, "grad_norm": 0.14818497445465476, "learning_rate": 9.803971353218367e-06, "loss": 0.0526, "num_tokens": 23565444.0, "step": 294 }, { "epoch": 0.3680598877105427, "grad_norm": 0.14971154575642173, "learning_rate": 9.802198425555358e-06, "loss": 0.0636, "num_tokens": 23647126.0, "step": 295 }, { "epoch": 0.36930754834684965, "grad_norm": 0.16149541778586904, "learning_rate": 9.800417696706234e-06, "loss": 0.0636, "num_tokens": 23727284.0, "step": 296 }, { "epoch": 0.3705552089831566, "grad_norm": 0.14820265948993624, "learning_rate": 9.798629169899992e-06, "loss": 0.056, "num_tokens": 23807739.0, "step": 297 }, { "epoch": 0.3718028696194635, "grad_norm": 0.15131350009349487, "learning_rate": 9.796832848379775e-06, "loss": 0.0591, "num_tokens": 23887839.0, "step": 298 }, { "epoch": 0.37305053025577045, "grad_norm": 0.14986708525327327, "learning_rate": 9.795028735402853e-06, "loss": 0.0598, "num_tokens": 23968437.0, "step": 299 }, { "epoch": 0.37429819089207733, "grad_norm": 0.15808081173486005, "learning_rate": 9.79321683424063e-06, "loss": 0.0571, "num_tokens": 24047656.0, "step": 300 }, { "epoch": 0.37554585152838427, "grad_norm": 0.14958052977119102, "learning_rate": 9.791397148178632e-06, "loss": 0.0554, "num_tokens": 24127038.0, "step": 301 }, { "epoch": 0.3767935121646912, "grad_norm": 0.152499732455963, "learning_rate": 9.789569680516497e-06, "loss": 0.057, "num_tokens": 24207038.0, "step": 302 }, { "epoch": 0.37804117280099814, "grad_norm": 0.14559783006117594, "learning_rate": 9.78773443456798e-06, "loss": 0.0621, "num_tokens": 24287925.0, "step": 303 }, { "epoch": 0.3792888334373051, "grad_norm": 0.1452527426722152, "learning_rate": 9.785891413660931e-06, "loss": 0.0548, "num_tokens": 24367076.0, "step": 304 }, { "epoch": 0.38053649407361195, "grad_norm": 0.14896918649468194, "learning_rate": 9.784040621137308e-06, "loss": 0.0547, "num_tokens": 24446865.0, "step": 305 }, { "epoch": 0.3817841547099189, "grad_norm": 0.1390970861000212, "learning_rate": 9.78218206035316e-06, "loss": 0.0545, "num_tokens": 24526102.0, "step": 306 }, { "epoch": 0.3830318153462258, "grad_norm": 0.15617459548241075, "learning_rate": 9.780315734678612e-06, "loss": 0.0619, "num_tokens": 24606039.0, "step": 307 }, { "epoch": 0.38427947598253276, "grad_norm": 0.14418794697812448, "learning_rate": 9.778441647497882e-06, "loss": 0.0565, "num_tokens": 24685400.0, "step": 308 }, { "epoch": 0.3855271366188397, "grad_norm": 0.14701646779160854, "learning_rate": 9.776559802209255e-06, "loss": 0.0579, "num_tokens": 24765381.0, "step": 309 }, { "epoch": 0.3867747972551466, "grad_norm": 0.1631595272698652, "learning_rate": 9.774670202225084e-06, "loss": 0.0615, "num_tokens": 24845699.0, "step": 310 }, { "epoch": 0.3880224578914535, "grad_norm": 0.1545918579556817, "learning_rate": 9.772772850971788e-06, "loss": 0.0645, "num_tokens": 24925987.0, "step": 311 }, { "epoch": 0.38927011852776044, "grad_norm": 0.1441116111201304, "learning_rate": 9.770867751889837e-06, "loss": 0.0572, "num_tokens": 25005272.0, "step": 312 }, { "epoch": 0.3905177791640674, "grad_norm": 0.13781567161395594, "learning_rate": 9.76895490843375e-06, "loss": 0.0557, "num_tokens": 25085235.0, "step": 313 }, { "epoch": 0.3917654398003743, "grad_norm": 0.14477971945325008, "learning_rate": 9.767034324072091e-06, "loss": 0.0574, "num_tokens": 25164489.0, "step": 314 }, { "epoch": 0.3930131004366812, "grad_norm": 0.15372533695861323, "learning_rate": 9.76510600228746e-06, "loss": 0.0573, "num_tokens": 25243328.0, "step": 315 }, { "epoch": 0.39426076107298813, "grad_norm": 0.1501403033694261, "learning_rate": 9.763169946576488e-06, "loss": 0.0612, "num_tokens": 25323644.0, "step": 316 }, { "epoch": 0.39550842170929507, "grad_norm": 0.15008446129724604, "learning_rate": 9.76122616044983e-06, "loss": 0.0601, "num_tokens": 25402837.0, "step": 317 }, { "epoch": 0.396756082345602, "grad_norm": 0.15958664875841405, "learning_rate": 9.759274647432156e-06, "loss": 0.055, "num_tokens": 25481850.0, "step": 318 }, { "epoch": 0.39800374298190894, "grad_norm": 0.15101688409063593, "learning_rate": 9.75731541106215e-06, "loss": 0.0561, "num_tokens": 25562517.0, "step": 319 }, { "epoch": 0.39925140361821587, "grad_norm": 0.15012352120224384, "learning_rate": 9.755348454892498e-06, "loss": 0.0559, "num_tokens": 25642299.0, "step": 320 }, { "epoch": 0.40049906425452275, "grad_norm": 0.15137439858210344, "learning_rate": 9.753373782489887e-06, "loss": 0.0576, "num_tokens": 25722205.0, "step": 321 }, { "epoch": 0.4017467248908297, "grad_norm": 0.1506690147271384, "learning_rate": 9.751391397434996e-06, "loss": 0.0577, "num_tokens": 25801893.0, "step": 322 }, { "epoch": 0.4029943855271366, "grad_norm": 0.16103781809588258, "learning_rate": 9.74940130332249e-06, "loss": 0.0591, "num_tokens": 25882061.0, "step": 323 }, { "epoch": 0.40424204616344356, "grad_norm": 0.14444822336784965, "learning_rate": 9.747403503761006e-06, "loss": 0.0525, "num_tokens": 25960184.0, "step": 324 }, { "epoch": 0.4054897067997505, "grad_norm": 0.15166828709851793, "learning_rate": 9.74539800237316e-06, "loss": 0.0565, "num_tokens": 26040338.0, "step": 325 }, { "epoch": 0.4067373674360574, "grad_norm": 0.1504474187463855, "learning_rate": 9.743384802795535e-06, "loss": 0.0597, "num_tokens": 26121905.0, "step": 326 }, { "epoch": 0.4079850280723643, "grad_norm": 0.13344139836523075, "learning_rate": 9.741363908678669e-06, "loss": 0.0546, "num_tokens": 26202255.0, "step": 327 }, { "epoch": 0.40923268870867124, "grad_norm": 0.14377591000899273, "learning_rate": 9.739335323687052e-06, "loss": 0.0628, "num_tokens": 26283068.0, "step": 328 }, { "epoch": 0.4104803493449782, "grad_norm": 0.15940785319665565, "learning_rate": 9.737299051499125e-06, "loss": 0.0563, "num_tokens": 26363408.0, "step": 329 }, { "epoch": 0.4117280099812851, "grad_norm": 0.14599902917885316, "learning_rate": 9.735255095807263e-06, "loss": 0.059, "num_tokens": 26444199.0, "step": 330 }, { "epoch": 0.412975670617592, "grad_norm": 0.1351984864753291, "learning_rate": 9.733203460317777e-06, "loss": 0.0584, "num_tokens": 26524527.0, "step": 331 }, { "epoch": 0.41422333125389893, "grad_norm": 0.1397398203048671, "learning_rate": 9.731144148750898e-06, "loss": 0.0502, "num_tokens": 26604842.0, "step": 332 }, { "epoch": 0.41547099189020587, "grad_norm": 0.1502653858474547, "learning_rate": 9.729077164840784e-06, "loss": 0.0617, "num_tokens": 26685166.0, "step": 333 }, { "epoch": 0.4167186525265128, "grad_norm": 0.15884597841687997, "learning_rate": 9.727002512335502e-06, "loss": 0.0579, "num_tokens": 26766562.0, "step": 334 }, { "epoch": 0.41796631316281974, "grad_norm": 0.15085968982512207, "learning_rate": 9.724920194997022e-06, "loss": 0.0563, "num_tokens": 26845821.0, "step": 335 }, { "epoch": 0.4192139737991266, "grad_norm": 0.14636626725897792, "learning_rate": 9.722830216601217e-06, "loss": 0.0573, "num_tokens": 26925802.0, "step": 336 }, { "epoch": 0.42046163443543355, "grad_norm": 0.15213895413855882, "learning_rate": 9.720732580937848e-06, "loss": 0.0573, "num_tokens": 27004757.0, "step": 337 }, { "epoch": 0.4217092950717405, "grad_norm": 0.15478235225939369, "learning_rate": 9.718627291810561e-06, "loss": 0.0558, "num_tokens": 27085488.0, "step": 338 }, { "epoch": 0.4229569557080474, "grad_norm": 0.14333668524575108, "learning_rate": 9.716514353036884e-06, "loss": 0.0529, "num_tokens": 27165382.0, "step": 339 }, { "epoch": 0.42420461634435436, "grad_norm": 0.13816120021039435, "learning_rate": 9.714393768448214e-06, "loss": 0.055, "num_tokens": 27244847.0, "step": 340 }, { "epoch": 0.42545227698066124, "grad_norm": 0.1516209836461398, "learning_rate": 9.712265541889809e-06, "loss": 0.0571, "num_tokens": 27326623.0, "step": 341 }, { "epoch": 0.4266999376169682, "grad_norm": 0.14387239194414297, "learning_rate": 9.710129677220788e-06, "loss": 0.057, "num_tokens": 27408759.0, "step": 342 }, { "epoch": 0.4279475982532751, "grad_norm": 0.14913332717460548, "learning_rate": 9.707986178314123e-06, "loss": 0.0604, "num_tokens": 27489615.0, "step": 343 }, { "epoch": 0.42919525888958204, "grad_norm": 0.14600137001097122, "learning_rate": 9.705835049056621e-06, "loss": 0.0577, "num_tokens": 27569276.0, "step": 344 }, { "epoch": 0.430442919525889, "grad_norm": 0.16253517587195007, "learning_rate": 9.70367629334893e-06, "loss": 0.0593, "num_tokens": 27650089.0, "step": 345 }, { "epoch": 0.43169058016219586, "grad_norm": 0.15144799683797194, "learning_rate": 9.701509915105527e-06, "loss": 0.0551, "num_tokens": 27731133.0, "step": 346 }, { "epoch": 0.4329382407985028, "grad_norm": 0.1457681651429185, "learning_rate": 9.699335918254714e-06, "loss": 0.0539, "num_tokens": 27810902.0, "step": 347 }, { "epoch": 0.43418590143480973, "grad_norm": 0.1463053074053871, "learning_rate": 9.6971543067386e-06, "loss": 0.0588, "num_tokens": 27892340.0, "step": 348 }, { "epoch": 0.43543356207111666, "grad_norm": 0.1520505948725208, "learning_rate": 9.694965084513106e-06, "loss": 0.0562, "num_tokens": 27973118.0, "step": 349 }, { "epoch": 0.4366812227074236, "grad_norm": 0.15820851987269624, "learning_rate": 9.692768255547957e-06, "loss": 0.0567, "num_tokens": 28053723.0, "step": 350 }, { "epoch": 0.4379288833437305, "grad_norm": 0.1515133004587777, "learning_rate": 9.690563823826666e-06, "loss": 0.06, "num_tokens": 28134449.0, "step": 351 }, { "epoch": 0.4391765439800374, "grad_norm": 0.14702095630330445, "learning_rate": 9.688351793346533e-06, "loss": 0.0561, "num_tokens": 28214373.0, "step": 352 }, { "epoch": 0.44042420461634435, "grad_norm": 0.14263520426345258, "learning_rate": 9.68613216811864e-06, "loss": 0.0581, "num_tokens": 28294827.0, "step": 353 }, { "epoch": 0.4416718652526513, "grad_norm": 0.15433805512072737, "learning_rate": 9.683904952167837e-06, "loss": 0.0536, "num_tokens": 28374441.0, "step": 354 }, { "epoch": 0.4429195258889582, "grad_norm": 0.14704915295293733, "learning_rate": 9.681670149532739e-06, "loss": 0.0535, "num_tokens": 28452997.0, "step": 355 }, { "epoch": 0.4441671865252651, "grad_norm": 0.14891895587500484, "learning_rate": 9.67942776426572e-06, "loss": 0.0576, "num_tokens": 28533011.0, "step": 356 }, { "epoch": 0.44541484716157204, "grad_norm": 0.14800382217071825, "learning_rate": 9.677177800432903e-06, "loss": 0.056, "num_tokens": 28612714.0, "step": 357 }, { "epoch": 0.44666250779787897, "grad_norm": 0.1484302906683845, "learning_rate": 9.67492026211415e-06, "loss": 0.0541, "num_tokens": 28692174.0, "step": 358 }, { "epoch": 0.4479101684341859, "grad_norm": 0.1473133403449387, "learning_rate": 9.672655153403064e-06, "loss": 0.0556, "num_tokens": 28771843.0, "step": 359 }, { "epoch": 0.44915782907049284, "grad_norm": 0.13463944988028403, "learning_rate": 9.670382478406967e-06, "loss": 0.0562, "num_tokens": 28851787.0, "step": 360 }, { "epoch": 0.4504054897067998, "grad_norm": 0.14571672370885358, "learning_rate": 9.66810224124691e-06, "loss": 0.0561, "num_tokens": 28931886.0, "step": 361 }, { "epoch": 0.45165315034310666, "grad_norm": 0.13982592828240858, "learning_rate": 9.665814446057652e-06, "loss": 0.0527, "num_tokens": 29011364.0, "step": 362 }, { "epoch": 0.4529008109794136, "grad_norm": 0.1468605138111856, "learning_rate": 9.663519096987653e-06, "loss": 0.0569, "num_tokens": 29091203.0, "step": 363 }, { "epoch": 0.45414847161572053, "grad_norm": 0.15284313337192354, "learning_rate": 9.661216198199078e-06, "loss": 0.0553, "num_tokens": 29172304.0, "step": 364 }, { "epoch": 0.45539613225202746, "grad_norm": 0.15300814933585585, "learning_rate": 9.658905753867778e-06, "loss": 0.0608, "num_tokens": 29252117.0, "step": 365 }, { "epoch": 0.4566437928883344, "grad_norm": 0.16189213439882416, "learning_rate": 9.656587768183287e-06, "loss": 0.0546, "num_tokens": 29332772.0, "step": 366 }, { "epoch": 0.4578914535246413, "grad_norm": 0.14196790848023397, "learning_rate": 9.654262245348813e-06, "loss": 0.0517, "num_tokens": 29414702.0, "step": 367 }, { "epoch": 0.4591391141609482, "grad_norm": 0.14672195499400306, "learning_rate": 9.651929189581233e-06, "loss": 0.0595, "num_tokens": 29495754.0, "step": 368 }, { "epoch": 0.46038677479725515, "grad_norm": 0.14633512516697522, "learning_rate": 9.649588605111082e-06, "loss": 0.0554, "num_tokens": 29575118.0, "step": 369 }, { "epoch": 0.4616344354335621, "grad_norm": 0.14073644904651714, "learning_rate": 9.647240496182545e-06, "loss": 0.0556, "num_tokens": 29655784.0, "step": 370 }, { "epoch": 0.462882096069869, "grad_norm": 0.15125788764011439, "learning_rate": 9.644884867053455e-06, "loss": 0.0549, "num_tokens": 29736212.0, "step": 371 }, { "epoch": 0.4641297567061759, "grad_norm": 0.1498730140799902, "learning_rate": 9.64252172199528e-06, "loss": 0.0567, "num_tokens": 29816732.0, "step": 372 }, { "epoch": 0.46537741734248284, "grad_norm": 0.13559146848095813, "learning_rate": 9.640151065293117e-06, "loss": 0.0546, "num_tokens": 29896880.0, "step": 373 }, { "epoch": 0.46662507797878977, "grad_norm": 0.1640344913303149, "learning_rate": 9.63777290124568e-06, "loss": 0.0701, "num_tokens": 29980295.0, "step": 374 }, { "epoch": 0.4678727386150967, "grad_norm": 0.14536897470480084, "learning_rate": 9.635387234165303e-06, "loss": 0.0564, "num_tokens": 30061645.0, "step": 375 }, { "epoch": 0.46912039925140364, "grad_norm": 0.13332481481430616, "learning_rate": 9.632994068377916e-06, "loss": 0.0485, "num_tokens": 30141179.0, "step": 376 }, { "epoch": 0.4703680598877105, "grad_norm": 0.15492389456107522, "learning_rate": 9.63059340822306e-06, "loss": 0.0559, "num_tokens": 30220539.0, "step": 377 }, { "epoch": 0.47161572052401746, "grad_norm": 0.13840934949647143, "learning_rate": 9.628185258053852e-06, "loss": 0.0566, "num_tokens": 30301422.0, "step": 378 }, { "epoch": 0.4728633811603244, "grad_norm": 0.14291506474305504, "learning_rate": 9.625769622236995e-06, "loss": 0.0545, "num_tokens": 30381142.0, "step": 379 }, { "epoch": 0.4741110417966313, "grad_norm": 0.14000086512375753, "learning_rate": 9.623346505152771e-06, "loss": 0.0508, "num_tokens": 30460521.0, "step": 380 }, { "epoch": 0.47535870243293826, "grad_norm": 0.14400483775828635, "learning_rate": 9.620915911195021e-06, "loss": 0.0504, "num_tokens": 30539451.0, "step": 381 }, { "epoch": 0.47660636306924514, "grad_norm": 0.15315155933409735, "learning_rate": 9.618477844771147e-06, "loss": 0.0558, "num_tokens": 30618847.0, "step": 382 }, { "epoch": 0.4778540237055521, "grad_norm": 0.13374765928710877, "learning_rate": 9.6160323103021e-06, "loss": 0.0584, "num_tokens": 30697880.0, "step": 383 }, { "epoch": 0.479101684341859, "grad_norm": 0.13578880192469092, "learning_rate": 9.613579312222377e-06, "loss": 0.0495, "num_tokens": 30776740.0, "step": 384 }, { "epoch": 0.48034934497816595, "grad_norm": 0.14830879266525465, "learning_rate": 9.611118854979998e-06, "loss": 0.0588, "num_tokens": 30858807.0, "step": 385 }, { "epoch": 0.4815970056144729, "grad_norm": 0.13811959093377515, "learning_rate": 9.608650943036522e-06, "loss": 0.0563, "num_tokens": 30938478.0, "step": 386 }, { "epoch": 0.48284466625077976, "grad_norm": 0.152755680501879, "learning_rate": 9.606175580867016e-06, "loss": 0.0564, "num_tokens": 31019990.0, "step": 387 }, { "epoch": 0.4840923268870867, "grad_norm": 0.14052581035511705, "learning_rate": 9.60369277296006e-06, "loss": 0.0559, "num_tokens": 31100307.0, "step": 388 }, { "epoch": 0.48533998752339363, "grad_norm": 0.1486718670745852, "learning_rate": 9.601202523817735e-06, "loss": 0.0573, "num_tokens": 31179820.0, "step": 389 }, { "epoch": 0.48658764815970057, "grad_norm": 0.14296008439795943, "learning_rate": 9.598704837955618e-06, "loss": 0.0492, "num_tokens": 31258626.0, "step": 390 }, { "epoch": 0.4878353087960075, "grad_norm": 0.15097962419838978, "learning_rate": 9.596199719902765e-06, "loss": 0.0541, "num_tokens": 31339814.0, "step": 391 }, { "epoch": 0.4890829694323144, "grad_norm": 0.1379963039474492, "learning_rate": 9.593687174201715e-06, "loss": 0.053, "num_tokens": 31419517.0, "step": 392 }, { "epoch": 0.4903306300686213, "grad_norm": 0.13238251476670143, "learning_rate": 9.59116720540847e-06, "loss": 0.0492, "num_tokens": 31498667.0, "step": 393 }, { "epoch": 0.49157829070492826, "grad_norm": 0.1577098734056635, "learning_rate": 9.588639818092498e-06, "loss": 0.0574, "num_tokens": 31579341.0, "step": 394 }, { "epoch": 0.4928259513412352, "grad_norm": 0.1400848617210015, "learning_rate": 9.586105016836713e-06, "loss": 0.0547, "num_tokens": 31660051.0, "step": 395 }, { "epoch": 0.4940736119775421, "grad_norm": 0.14347417674815124, "learning_rate": 9.58356280623748e-06, "loss": 0.0561, "num_tokens": 31740543.0, "step": 396 }, { "epoch": 0.495321272613849, "grad_norm": 0.146326148226546, "learning_rate": 9.58101319090459e-06, "loss": 0.0534, "num_tokens": 31821119.0, "step": 397 }, { "epoch": 0.49656893325015594, "grad_norm": 0.13794827612031768, "learning_rate": 9.578456175461272e-06, "loss": 0.0517, "num_tokens": 31901059.0, "step": 398 }, { "epoch": 0.4978165938864629, "grad_norm": 0.1335465918733181, "learning_rate": 9.575891764544162e-06, "loss": 0.0501, "num_tokens": 31981392.0, "step": 399 }, { "epoch": 0.4990642545227698, "grad_norm": 0.14668973583020378, "learning_rate": 9.573319962803317e-06, "loss": 0.0534, "num_tokens": 32061494.0, "step": 400 }, { "epoch": 0.5003119151590767, "grad_norm": 0.15441275896158793, "learning_rate": 9.570740774902189e-06, "loss": 0.0616, "num_tokens": 32142304.0, "step": 401 }, { "epoch": 0.5015595757953837, "grad_norm": 0.14955081145369936, "learning_rate": 9.568154205517623e-06, "loss": 0.0522, "num_tokens": 32222526.0, "step": 402 }, { "epoch": 0.5028072364316906, "grad_norm": 0.1403875094386206, "learning_rate": 9.565560259339856e-06, "loss": 0.0485, "num_tokens": 32302513.0, "step": 403 }, { "epoch": 0.5040548970679976, "grad_norm": 0.13683297238023728, "learning_rate": 9.562958941072491e-06, "loss": 0.0536, "num_tokens": 32382934.0, "step": 404 }, { "epoch": 0.5053025577043044, "grad_norm": 0.14711235416131968, "learning_rate": 9.560350255432508e-06, "loss": 0.0542, "num_tokens": 32464107.0, "step": 405 }, { "epoch": 0.5065502183406113, "grad_norm": 0.14916320835261607, "learning_rate": 9.557734207150243e-06, "loss": 0.0539, "num_tokens": 32543923.0, "step": 406 }, { "epoch": 0.5077978789769183, "grad_norm": 0.13980467839315625, "learning_rate": 9.55511080096938e-06, "loss": 0.049, "num_tokens": 32623438.0, "step": 407 }, { "epoch": 0.5090455396132252, "grad_norm": 0.15152035984072354, "learning_rate": 9.552480041646949e-06, "loss": 0.055, "num_tokens": 32703162.0, "step": 408 }, { "epoch": 0.5102932002495322, "grad_norm": 0.1388088923573849, "learning_rate": 9.549841933953308e-06, "loss": 0.0487, "num_tokens": 32782526.0, "step": 409 }, { "epoch": 0.511540860885839, "grad_norm": 0.1384190124870507, "learning_rate": 9.547196482672148e-06, "loss": 0.0571, "num_tokens": 32863740.0, "step": 410 }, { "epoch": 0.5127885215221459, "grad_norm": 0.14052473157074324, "learning_rate": 9.544543692600473e-06, "loss": 0.0534, "num_tokens": 32942880.0, "step": 411 }, { "epoch": 0.5140361821584529, "grad_norm": 0.13353121925025832, "learning_rate": 9.541883568548588e-06, "loss": 0.0504, "num_tokens": 33022952.0, "step": 412 }, { "epoch": 0.5152838427947598, "grad_norm": 0.14750782511179328, "learning_rate": 9.539216115340106e-06, "loss": 0.0523, "num_tokens": 33101857.0, "step": 413 }, { "epoch": 0.5165315034310668, "grad_norm": 0.14466406906288587, "learning_rate": 9.536541337811923e-06, "loss": 0.0558, "num_tokens": 33181714.0, "step": 414 }, { "epoch": 0.5177791640673737, "grad_norm": 0.15553123717162787, "learning_rate": 9.533859240814221e-06, "loss": 0.055, "num_tokens": 33261500.0, "step": 415 }, { "epoch": 0.5190268247036806, "grad_norm": 0.13310331465324876, "learning_rate": 9.531169829210452e-06, "loss": 0.0501, "num_tokens": 33340399.0, "step": 416 }, { "epoch": 0.5202744853399875, "grad_norm": 0.14079149042098654, "learning_rate": 9.528473107877333e-06, "loss": 0.0497, "num_tokens": 33420893.0, "step": 417 }, { "epoch": 0.5215221459762944, "grad_norm": 0.1401444099939849, "learning_rate": 9.525769081704835e-06, "loss": 0.0526, "num_tokens": 33500831.0, "step": 418 }, { "epoch": 0.5227698066126014, "grad_norm": 0.13674566510520145, "learning_rate": 9.523057755596174e-06, "loss": 0.0524, "num_tokens": 33581334.0, "step": 419 }, { "epoch": 0.5240174672489083, "grad_norm": 0.14204922606844075, "learning_rate": 9.520339134467803e-06, "loss": 0.0538, "num_tokens": 33661326.0, "step": 420 }, { "epoch": 0.5252651278852152, "grad_norm": 0.1558664826050407, "learning_rate": 9.517613223249402e-06, "loss": 0.0572, "num_tokens": 33741261.0, "step": 421 }, { "epoch": 0.5265127885215222, "grad_norm": 0.15461038514035907, "learning_rate": 9.514880026883877e-06, "loss": 0.0575, "num_tokens": 33822933.0, "step": 422 }, { "epoch": 0.527760449157829, "grad_norm": 0.1607295696047527, "learning_rate": 9.512139550327338e-06, "loss": 0.0586, "num_tokens": 33903895.0, "step": 423 }, { "epoch": 0.529008109794136, "grad_norm": 0.1498493476829081, "learning_rate": 9.509391798549091e-06, "loss": 0.0579, "num_tokens": 33984607.0, "step": 424 }, { "epoch": 0.5302557704304429, "grad_norm": 0.14848887896859914, "learning_rate": 9.50663677653165e-06, "loss": 0.0526, "num_tokens": 34064151.0, "step": 425 }, { "epoch": 0.5315034310667498, "grad_norm": 0.14388277136012045, "learning_rate": 9.503874489270697e-06, "loss": 0.0507, "num_tokens": 34144372.0, "step": 426 }, { "epoch": 0.5327510917030568, "grad_norm": 0.14645441650969096, "learning_rate": 9.501104941775094e-06, "loss": 0.0582, "num_tokens": 34224279.0, "step": 427 }, { "epoch": 0.5339987523393637, "grad_norm": 0.14459393929071007, "learning_rate": 9.49832813906687e-06, "loss": 0.0536, "num_tokens": 34303390.0, "step": 428 }, { "epoch": 0.5352464129756707, "grad_norm": 0.12857512112686587, "learning_rate": 9.495544086181204e-06, "loss": 0.0485, "num_tokens": 34383555.0, "step": 429 }, { "epoch": 0.5364940736119775, "grad_norm": 0.1314484816771736, "learning_rate": 9.49275278816643e-06, "loss": 0.0498, "num_tokens": 34463812.0, "step": 430 }, { "epoch": 0.5377417342482844, "grad_norm": 0.13947556078476245, "learning_rate": 9.489954250084011e-06, "loss": 0.0545, "num_tokens": 34544321.0, "step": 431 }, { "epoch": 0.5389893948845914, "grad_norm": 0.1436721443948016, "learning_rate": 9.487148477008545e-06, "loss": 0.0525, "num_tokens": 34624266.0, "step": 432 }, { "epoch": 0.5402370555208983, "grad_norm": 0.13998074131625612, "learning_rate": 9.484335474027744e-06, "loss": 0.0514, "num_tokens": 34704134.0, "step": 433 }, { "epoch": 0.5414847161572053, "grad_norm": 0.14871361303836866, "learning_rate": 9.481515246242435e-06, "loss": 0.053, "num_tokens": 34784190.0, "step": 434 }, { "epoch": 0.5427323767935122, "grad_norm": 0.14624886222954445, "learning_rate": 9.478687798766544e-06, "loss": 0.0531, "num_tokens": 34864130.0, "step": 435 }, { "epoch": 0.543980037429819, "grad_norm": 0.13641332408321277, "learning_rate": 9.475853136727086e-06, "loss": 0.0492, "num_tokens": 34944204.0, "step": 436 }, { "epoch": 0.545227698066126, "grad_norm": 0.14134926386463173, "learning_rate": 9.473011265264159e-06, "loss": 0.0526, "num_tokens": 35024607.0, "step": 437 }, { "epoch": 0.5464753587024329, "grad_norm": 0.14553055880700336, "learning_rate": 9.470162189530938e-06, "loss": 0.0534, "num_tokens": 35104390.0, "step": 438 }, { "epoch": 0.5477230193387399, "grad_norm": 0.1437000622603961, "learning_rate": 9.467305914693658e-06, "loss": 0.0513, "num_tokens": 35184822.0, "step": 439 }, { "epoch": 0.5489706799750468, "grad_norm": 0.14612828451027657, "learning_rate": 9.464442445931605e-06, "loss": 0.0497, "num_tokens": 35264499.0, "step": 440 }, { "epoch": 0.5502183406113537, "grad_norm": 0.14828985114792528, "learning_rate": 9.461571788437119e-06, "loss": 0.0533, "num_tokens": 35345078.0, "step": 441 }, { "epoch": 0.5514660012476607, "grad_norm": 0.1374551082004164, "learning_rate": 9.458693947415564e-06, "loss": 0.0466, "num_tokens": 35424570.0, "step": 442 }, { "epoch": 0.5527136618839675, "grad_norm": 0.147677054111561, "learning_rate": 9.455808928085339e-06, "loss": 0.0542, "num_tokens": 35505946.0, "step": 443 }, { "epoch": 0.5539613225202745, "grad_norm": 0.13849172697877132, "learning_rate": 9.452916735677857e-06, "loss": 0.0485, "num_tokens": 35586346.0, "step": 444 }, { "epoch": 0.5552089831565814, "grad_norm": 0.15297999265119042, "learning_rate": 9.450017375437534e-06, "loss": 0.0531, "num_tokens": 35665891.0, "step": 445 }, { "epoch": 0.5564566437928883, "grad_norm": 0.15501802210849228, "learning_rate": 9.44711085262179e-06, "loss": 0.0573, "num_tokens": 35746479.0, "step": 446 }, { "epoch": 0.5577043044291953, "grad_norm": 0.13088367378601265, "learning_rate": 9.444197172501025e-06, "loss": 0.0484, "num_tokens": 35826159.0, "step": 447 }, { "epoch": 0.5589519650655022, "grad_norm": 0.16063425009788931, "learning_rate": 9.441276340358624e-06, "loss": 0.0576, "num_tokens": 35906837.0, "step": 448 }, { "epoch": 0.5601996257018091, "grad_norm": 0.1307271068905107, "learning_rate": 9.438348361490938e-06, "loss": 0.0525, "num_tokens": 35987331.0, "step": 449 }, { "epoch": 0.561447286338116, "grad_norm": 0.13171725020454944, "learning_rate": 9.43541324120728e-06, "loss": 0.0474, "num_tokens": 36066069.0, "step": 450 }, { "epoch": 0.5626949469744229, "grad_norm": 0.1309581854874849, "learning_rate": 9.432470984829908e-06, "loss": 0.0465, "num_tokens": 36144807.0, "step": 451 }, { "epoch": 0.5639426076107299, "grad_norm": 0.14475267528300384, "learning_rate": 9.429521597694023e-06, "loss": 0.0553, "num_tokens": 36226188.0, "step": 452 }, { "epoch": 0.5651902682470368, "grad_norm": 0.1425625122178402, "learning_rate": 9.426565085147755e-06, "loss": 0.0511, "num_tokens": 36305107.0, "step": 453 }, { "epoch": 0.5664379288833438, "grad_norm": 0.13616136813599408, "learning_rate": 9.423601452552153e-06, "loss": 0.0555, "num_tokens": 36386076.0, "step": 454 }, { "epoch": 0.5676855895196506, "grad_norm": 0.14376718432927843, "learning_rate": 9.420630705281182e-06, "loss": 0.0501, "num_tokens": 36466132.0, "step": 455 }, { "epoch": 0.5689332501559575, "grad_norm": 0.13639207933313066, "learning_rate": 9.417652848721704e-06, "loss": 0.0549, "num_tokens": 36547317.0, "step": 456 }, { "epoch": 0.5701809107922645, "grad_norm": 0.13858764737031773, "learning_rate": 9.41466788827347e-06, "loss": 0.0545, "num_tokens": 36626984.0, "step": 457 }, { "epoch": 0.5714285714285714, "grad_norm": 0.14505131114240344, "learning_rate": 9.411675829349119e-06, "loss": 0.0521, "num_tokens": 36707170.0, "step": 458 }, { "epoch": 0.5726762320648784, "grad_norm": 0.1475234995649541, "learning_rate": 9.408676677374158e-06, "loss": 0.0506, "num_tokens": 36787101.0, "step": 459 }, { "epoch": 0.5739238927011853, "grad_norm": 0.13633501610880747, "learning_rate": 9.405670437786953e-06, "loss": 0.0493, "num_tokens": 36867353.0, "step": 460 }, { "epoch": 0.5751715533374921, "grad_norm": 0.14436718039231583, "learning_rate": 9.402657116038728e-06, "loss": 0.0546, "num_tokens": 36948883.0, "step": 461 }, { "epoch": 0.5764192139737991, "grad_norm": 0.14203475439871666, "learning_rate": 9.399636717593545e-06, "loss": 0.0476, "num_tokens": 37028885.0, "step": 462 }, { "epoch": 0.577666874610106, "grad_norm": 0.14533730733608163, "learning_rate": 9.3966092479283e-06, "loss": 0.0516, "num_tokens": 37108694.0, "step": 463 }, { "epoch": 0.578914535246413, "grad_norm": 0.13808604624352258, "learning_rate": 9.39357471253271e-06, "loss": 0.0488, "num_tokens": 37188517.0, "step": 464 }, { "epoch": 0.5801621958827199, "grad_norm": 0.1461795781132845, "learning_rate": 9.390533116909305e-06, "loss": 0.054, "num_tokens": 37269409.0, "step": 465 }, { "epoch": 0.5814098565190269, "grad_norm": 0.14456406121879972, "learning_rate": 9.387484466573417e-06, "loss": 0.0513, "num_tokens": 37349786.0, "step": 466 }, { "epoch": 0.5826575171553338, "grad_norm": 0.15015842796580345, "learning_rate": 9.38442876705317e-06, "loss": 0.0488, "num_tokens": 37428973.0, "step": 467 }, { "epoch": 0.5839051777916406, "grad_norm": 0.14230447835176302, "learning_rate": 9.381366023889475e-06, "loss": 0.052, "num_tokens": 37509348.0, "step": 468 }, { "epoch": 0.5851528384279476, "grad_norm": 0.1450811926352791, "learning_rate": 9.378296242636002e-06, "loss": 0.0545, "num_tokens": 37590500.0, "step": 469 }, { "epoch": 0.5864004990642545, "grad_norm": 0.13517059402851564, "learning_rate": 9.375219428859202e-06, "loss": 0.05, "num_tokens": 37672765.0, "step": 470 }, { "epoch": 0.5876481597005615, "grad_norm": 0.14229031493976188, "learning_rate": 9.372135588138262e-06, "loss": 0.0508, "num_tokens": 37753260.0, "step": 471 }, { "epoch": 0.5888958203368684, "grad_norm": 0.13875205292970882, "learning_rate": 9.369044726065121e-06, "loss": 0.0515, "num_tokens": 37832754.0, "step": 472 }, { "epoch": 0.5901434809731753, "grad_norm": 0.13790837237053832, "learning_rate": 9.365946848244445e-06, "loss": 0.0508, "num_tokens": 37911845.0, "step": 473 }, { "epoch": 0.5913911416094823, "grad_norm": 0.140360358316561, "learning_rate": 9.362841960293622e-06, "loss": 0.0499, "num_tokens": 37991693.0, "step": 474 }, { "epoch": 0.5926388022457891, "grad_norm": 0.13689888549603996, "learning_rate": 9.359730067842753e-06, "loss": 0.0528, "num_tokens": 38072283.0, "step": 475 }, { "epoch": 0.5938864628820961, "grad_norm": 0.13291064397987987, "learning_rate": 9.35661117653464e-06, "loss": 0.0473, "num_tokens": 38150942.0, "step": 476 }, { "epoch": 0.595134123518403, "grad_norm": 0.1404556622746765, "learning_rate": 9.353485292024775e-06, "loss": 0.0554, "num_tokens": 38232536.0, "step": 477 }, { "epoch": 0.5963817841547099, "grad_norm": 0.13890649535466057, "learning_rate": 9.35035241998133e-06, "loss": 0.0487, "num_tokens": 38312747.0, "step": 478 }, { "epoch": 0.5976294447910169, "grad_norm": 0.13585175607106556, "learning_rate": 9.347212566085153e-06, "loss": 0.0512, "num_tokens": 38392202.0, "step": 479 }, { "epoch": 0.5988771054273238, "grad_norm": 0.14380652025332846, "learning_rate": 9.344065736029746e-06, "loss": 0.0542, "num_tokens": 38472582.0, "step": 480 }, { "epoch": 0.6001247660636307, "grad_norm": 0.1524558297733905, "learning_rate": 9.34091193552126e-06, "loss": 0.0522, "num_tokens": 38553322.0, "step": 481 }, { "epoch": 0.6013724266999376, "grad_norm": 0.1568778299979335, "learning_rate": 9.337751170278495e-06, "loss": 0.0541, "num_tokens": 38633844.0, "step": 482 }, { "epoch": 0.6026200873362445, "grad_norm": 0.14243398281550446, "learning_rate": 9.334583446032866e-06, "loss": 0.0528, "num_tokens": 38713546.0, "step": 483 }, { "epoch": 0.6038677479725515, "grad_norm": 0.14281972240999116, "learning_rate": 9.331408768528423e-06, "loss": 0.0504, "num_tokens": 38792792.0, "step": 484 }, { "epoch": 0.6051154086088584, "grad_norm": 0.1357329955012292, "learning_rate": 9.328227143521809e-06, "loss": 0.0511, "num_tokens": 38872893.0, "step": 485 }, { "epoch": 0.6063630692451654, "grad_norm": 0.12679897769025358, "learning_rate": 9.325038576782275e-06, "loss": 0.0476, "num_tokens": 38952415.0, "step": 486 }, { "epoch": 0.6076107298814722, "grad_norm": 0.1482843741539947, "learning_rate": 9.321843074091654e-06, "loss": 0.0524, "num_tokens": 39033272.0, "step": 487 }, { "epoch": 0.6088583905177791, "grad_norm": 0.14975071143827606, "learning_rate": 9.318640641244362e-06, "loss": 0.0488, "num_tokens": 39111891.0, "step": 488 }, { "epoch": 0.6101060511540861, "grad_norm": 0.1342258373618748, "learning_rate": 9.315431284047375e-06, "loss": 0.0505, "num_tokens": 39192041.0, "step": 489 }, { "epoch": 0.611353711790393, "grad_norm": 0.1285046989054189, "learning_rate": 9.312215008320228e-06, "loss": 0.0497, "num_tokens": 39271930.0, "step": 490 }, { "epoch": 0.6126013724267, "grad_norm": 0.1379379252162495, "learning_rate": 9.308991819895001e-06, "loss": 0.0529, "num_tokens": 39352986.0, "step": 491 }, { "epoch": 0.6138490330630069, "grad_norm": 0.14610092835159, "learning_rate": 9.30576172461631e-06, "loss": 0.0642, "num_tokens": 39433237.0, "step": 492 }, { "epoch": 0.6150966936993137, "grad_norm": 0.13695664278195094, "learning_rate": 9.302524728341292e-06, "loss": 0.0526, "num_tokens": 39514407.0, "step": 493 }, { "epoch": 0.6163443543356207, "grad_norm": 0.14695174818928397, "learning_rate": 9.299280836939602e-06, "loss": 0.0527, "num_tokens": 39594636.0, "step": 494 }, { "epoch": 0.6175920149719276, "grad_norm": 0.13598009895197866, "learning_rate": 9.296030056293394e-06, "loss": 0.0464, "num_tokens": 39674169.0, "step": 495 }, { "epoch": 0.6188396756082346, "grad_norm": 0.14132411207723297, "learning_rate": 9.292772392297316e-06, "loss": 0.0518, "num_tokens": 39754876.0, "step": 496 }, { "epoch": 0.6200873362445415, "grad_norm": 0.13790154255779888, "learning_rate": 9.289507850858498e-06, "loss": 0.0481, "num_tokens": 39835143.0, "step": 497 }, { "epoch": 0.6213349968808484, "grad_norm": 0.136274128237269, "learning_rate": 9.286236437896538e-06, "loss": 0.0487, "num_tokens": 39914731.0, "step": 498 }, { "epoch": 0.6225826575171554, "grad_norm": 0.13960062998109465, "learning_rate": 9.282958159343502e-06, "loss": 0.051, "num_tokens": 39994714.0, "step": 499 }, { "epoch": 0.6238303181534622, "grad_norm": 0.13253521456358003, "learning_rate": 9.279673021143895e-06, "loss": 0.0476, "num_tokens": 40075018.0, "step": 500 }, { "epoch": 0.6250779787897692, "grad_norm": 0.13428832406148858, "learning_rate": 9.276381029254668e-06, "loss": 0.0488, "num_tokens": 40155183.0, "step": 501 }, { "epoch": 0.6263256394260761, "grad_norm": 0.14564770785457162, "learning_rate": 9.273082189645197e-06, "loss": 0.053, "num_tokens": 40236110.0, "step": 502 }, { "epoch": 0.627573300062383, "grad_norm": 0.13321390812262743, "learning_rate": 9.269776508297272e-06, "loss": 0.0494, "num_tokens": 40316306.0, "step": 503 }, { "epoch": 0.62882096069869, "grad_norm": 0.13863622088015634, "learning_rate": 9.266463991205096e-06, "loss": 0.0507, "num_tokens": 40396697.0, "step": 504 }, { "epoch": 0.6300686213349969, "grad_norm": 0.12373924281291247, "learning_rate": 9.263144644375264e-06, "loss": 0.0453, "num_tokens": 40476696.0, "step": 505 }, { "epoch": 0.6313162819713038, "grad_norm": 0.14046491029490663, "learning_rate": 9.259818473826753e-06, "loss": 0.0463, "num_tokens": 40555802.0, "step": 506 }, { "epoch": 0.6325639426076107, "grad_norm": 0.14152092405005381, "learning_rate": 9.256485485590916e-06, "loss": 0.0485, "num_tokens": 40636230.0, "step": 507 }, { "epoch": 0.6338116032439176, "grad_norm": 0.1417103083447363, "learning_rate": 9.25314568571147e-06, "loss": 0.0475, "num_tokens": 40716545.0, "step": 508 }, { "epoch": 0.6350592638802246, "grad_norm": 0.14299937687535086, "learning_rate": 9.24979908024448e-06, "loss": 0.05, "num_tokens": 40795828.0, "step": 509 }, { "epoch": 0.6363069245165315, "grad_norm": 0.17469460318552915, "learning_rate": 9.246445675258353e-06, "loss": 0.0485, "num_tokens": 40876181.0, "step": 510 }, { "epoch": 0.6375545851528385, "grad_norm": 0.1416974505661574, "learning_rate": 9.243085476833827e-06, "loss": 0.0486, "num_tokens": 40956815.0, "step": 511 }, { "epoch": 0.6388022457891454, "grad_norm": 0.13184766297464506, "learning_rate": 9.239718491063956e-06, "loss": 0.0452, "num_tokens": 41035477.0, "step": 512 }, { "epoch": 0.6400499064254522, "grad_norm": 0.14293335757694378, "learning_rate": 9.236344724054104e-06, "loss": 0.0481, "num_tokens": 41114840.0, "step": 513 }, { "epoch": 0.6412975670617592, "grad_norm": 0.1346618868296363, "learning_rate": 9.232964181921931e-06, "loss": 0.0469, "num_tokens": 41194660.0, "step": 514 }, { "epoch": 0.6425452276980661, "grad_norm": 0.14380477524244686, "learning_rate": 9.22957687079738e-06, "loss": 0.0503, "num_tokens": 41274732.0, "step": 515 }, { "epoch": 0.6437928883343731, "grad_norm": 0.1548153138877275, "learning_rate": 9.22618279682267e-06, "loss": 0.0534, "num_tokens": 41355231.0, "step": 516 }, { "epoch": 0.64504054897068, "grad_norm": 0.12901632746395453, "learning_rate": 9.222781966152284e-06, "loss": 0.0487, "num_tokens": 41435607.0, "step": 517 }, { "epoch": 0.6462882096069869, "grad_norm": 0.13407720979842325, "learning_rate": 9.219374384952955e-06, "loss": 0.0459, "num_tokens": 41514560.0, "step": 518 }, { "epoch": 0.6475358702432938, "grad_norm": 0.12463947466497512, "learning_rate": 9.215960059403657e-06, "loss": 0.0473, "num_tokens": 41594491.0, "step": 519 }, { "epoch": 0.6487835308796007, "grad_norm": 0.1279220794254339, "learning_rate": 9.212538995695597e-06, "loss": 0.0445, "num_tokens": 41673375.0, "step": 520 }, { "epoch": 0.6500311915159077, "grad_norm": 0.1422667742985825, "learning_rate": 9.209111200032197e-06, "loss": 0.0457, "num_tokens": 41752931.0, "step": 521 }, { "epoch": 0.6512788521522146, "grad_norm": 0.13547316557303143, "learning_rate": 9.205676678629084e-06, "loss": 0.0514, "num_tokens": 41833139.0, "step": 522 }, { "epoch": 0.6525265127885215, "grad_norm": 0.1294378459458407, "learning_rate": 9.202235437714085e-06, "loss": 0.0452, "num_tokens": 41912284.0, "step": 523 }, { "epoch": 0.6537741734248285, "grad_norm": 0.13400192606267253, "learning_rate": 9.198787483527211e-06, "loss": 0.0499, "num_tokens": 41993387.0, "step": 524 }, { "epoch": 0.6550218340611353, "grad_norm": 0.1374789626351036, "learning_rate": 9.195332822320643e-06, "loss": 0.0507, "num_tokens": 42073397.0, "step": 525 }, { "epoch": 0.6562694946974423, "grad_norm": 0.14254850808958336, "learning_rate": 9.191871460358727e-06, "loss": 0.0498, "num_tokens": 42153736.0, "step": 526 }, { "epoch": 0.6575171553337492, "grad_norm": 0.1492013090788707, "learning_rate": 9.188403403917959e-06, "loss": 0.0508, "num_tokens": 42233576.0, "step": 527 }, { "epoch": 0.6587648159700561, "grad_norm": 0.13528472298759806, "learning_rate": 9.184928659286972e-06, "loss": 0.0461, "num_tokens": 42312824.0, "step": 528 }, { "epoch": 0.6600124766063631, "grad_norm": 0.14462582085339648, "learning_rate": 9.181447232766531e-06, "loss": 0.0541, "num_tokens": 42394280.0, "step": 529 }, { "epoch": 0.66126013724267, "grad_norm": 0.1363211647052413, "learning_rate": 9.177959130669512e-06, "loss": 0.0489, "num_tokens": 42473681.0, "step": 530 }, { "epoch": 0.662507797878977, "grad_norm": 0.13485083339287968, "learning_rate": 9.174464359320898e-06, "loss": 0.0471, "num_tokens": 42552772.0, "step": 531 }, { "epoch": 0.6637554585152838, "grad_norm": 0.1255665390873853, "learning_rate": 9.170962925057769e-06, "loss": 0.0472, "num_tokens": 42632415.0, "step": 532 }, { "epoch": 0.6650031191515907, "grad_norm": 0.1355451583887428, "learning_rate": 9.167454834229281e-06, "loss": 0.048, "num_tokens": 42712289.0, "step": 533 }, { "epoch": 0.6662507797878977, "grad_norm": 0.1350858530193199, "learning_rate": 9.163940093196663e-06, "loss": 0.0506, "num_tokens": 42792392.0, "step": 534 }, { "epoch": 0.6674984404242046, "grad_norm": 0.136136562989076, "learning_rate": 9.160418708333203e-06, "loss": 0.0478, "num_tokens": 42872872.0, "step": 535 }, { "epoch": 0.6687461010605116, "grad_norm": 0.146077765520201, "learning_rate": 9.156890686024239e-06, "loss": 0.0498, "num_tokens": 42953883.0, "step": 536 }, { "epoch": 0.6699937616968185, "grad_norm": 0.12797176999362384, "learning_rate": 9.153356032667138e-06, "loss": 0.046, "num_tokens": 43033437.0, "step": 537 }, { "epoch": 0.6712414223331253, "grad_norm": 0.12545897578255247, "learning_rate": 9.149814754671296e-06, "loss": 0.0495, "num_tokens": 43113703.0, "step": 538 }, { "epoch": 0.6724890829694323, "grad_norm": 0.13008384107108037, "learning_rate": 9.14626685845812e-06, "loss": 0.045, "num_tokens": 43192777.0, "step": 539 }, { "epoch": 0.6737367436057392, "grad_norm": 0.13576570983520106, "learning_rate": 9.142712350461021e-06, "loss": 0.0504, "num_tokens": 43272684.0, "step": 540 }, { "epoch": 0.6749844042420462, "grad_norm": 0.1429459018435017, "learning_rate": 9.139151237125393e-06, "loss": 0.052, "num_tokens": 43354053.0, "step": 541 }, { "epoch": 0.6762320648783531, "grad_norm": 0.11983657237309615, "learning_rate": 9.135583524908614e-06, "loss": 0.0441, "num_tokens": 43435103.0, "step": 542 }, { "epoch": 0.67747972551466, "grad_norm": 0.14035766138431027, "learning_rate": 9.132009220280021e-06, "loss": 0.0498, "num_tokens": 43514717.0, "step": 543 }, { "epoch": 0.678727386150967, "grad_norm": 0.1374244928438365, "learning_rate": 9.128428329720911e-06, "loss": 0.0507, "num_tokens": 43595017.0, "step": 544 }, { "epoch": 0.6799750467872738, "grad_norm": 0.14354836890668102, "learning_rate": 9.12484085972452e-06, "loss": 0.0477, "num_tokens": 43674479.0, "step": 545 }, { "epoch": 0.6812227074235808, "grad_norm": 0.14516354150067737, "learning_rate": 9.121246816796017e-06, "loss": 0.0507, "num_tokens": 43755079.0, "step": 546 }, { "epoch": 0.6824703680598877, "grad_norm": 0.1400453547205689, "learning_rate": 9.117646207452487e-06, "loss": 0.0465, "num_tokens": 43834370.0, "step": 547 }, { "epoch": 0.6837180286961946, "grad_norm": 0.14040265790324466, "learning_rate": 9.114039038222922e-06, "loss": 0.045, "num_tokens": 43914052.0, "step": 548 }, { "epoch": 0.6849656893325016, "grad_norm": 0.14521374141163426, "learning_rate": 9.110425315648212e-06, "loss": 0.0489, "num_tokens": 43993792.0, "step": 549 }, { "epoch": 0.6862133499688085, "grad_norm": 0.142707572861237, "learning_rate": 9.106805046281127e-06, "loss": 0.047, "num_tokens": 44073401.0, "step": 550 }, { "epoch": 0.6874610106051154, "grad_norm": 0.13023090116652908, "learning_rate": 9.103178236686309e-06, "loss": 0.0465, "num_tokens": 44152510.0, "step": 551 }, { "epoch": 0.6887086712414223, "grad_norm": 0.12890465812404028, "learning_rate": 9.099544893440265e-06, "loss": 0.0489, "num_tokens": 44233765.0, "step": 552 }, { "epoch": 0.6899563318777293, "grad_norm": 0.13134548120051465, "learning_rate": 9.095905023131337e-06, "loss": 0.0464, "num_tokens": 44313193.0, "step": 553 }, { "epoch": 0.6912039925140362, "grad_norm": 0.1392402498661132, "learning_rate": 9.092258632359714e-06, "loss": 0.0523, "num_tokens": 44393864.0, "step": 554 }, { "epoch": 0.6924516531503431, "grad_norm": 0.150967684448626, "learning_rate": 9.088605727737405e-06, "loss": 0.0491, "num_tokens": 44472774.0, "step": 555 }, { "epoch": 0.6936993137866501, "grad_norm": 0.13327113149933534, "learning_rate": 9.08494631588823e-06, "loss": 0.0479, "num_tokens": 44552481.0, "step": 556 }, { "epoch": 0.6949469744229569, "grad_norm": 0.15639101173467318, "learning_rate": 9.08128040344781e-06, "loss": 0.0481, "num_tokens": 44632296.0, "step": 557 }, { "epoch": 0.6961946350592639, "grad_norm": 0.15006175503991592, "learning_rate": 9.077607997063546e-06, "loss": 0.0513, "num_tokens": 44711761.0, "step": 558 }, { "epoch": 0.6974422956955708, "grad_norm": 0.13626963289805946, "learning_rate": 9.073929103394627e-06, "loss": 0.045, "num_tokens": 44790965.0, "step": 559 }, { "epoch": 0.6986899563318777, "grad_norm": 0.13584036137479072, "learning_rate": 9.070243729111998e-06, "loss": 0.0499, "num_tokens": 44871764.0, "step": 560 }, { "epoch": 0.6999376169681847, "grad_norm": 0.12333589383269866, "learning_rate": 9.066551880898356e-06, "loss": 0.0449, "num_tokens": 44951455.0, "step": 561 }, { "epoch": 0.7011852776044916, "grad_norm": 0.13494757593024734, "learning_rate": 9.062853565448137e-06, "loss": 0.0468, "num_tokens": 45030780.0, "step": 562 }, { "epoch": 0.7024329382407986, "grad_norm": 0.13272773332650525, "learning_rate": 9.059148789467508e-06, "loss": 0.0471, "num_tokens": 45110544.0, "step": 563 }, { "epoch": 0.7036805988771054, "grad_norm": 0.13404989182349922, "learning_rate": 9.055437559674343e-06, "loss": 0.05, "num_tokens": 45190997.0, "step": 564 }, { "epoch": 0.7049282595134123, "grad_norm": 0.13081012181879706, "learning_rate": 9.051719882798226e-06, "loss": 0.0466, "num_tokens": 45270643.0, "step": 565 }, { "epoch": 0.7061759201497193, "grad_norm": 0.125779618664369, "learning_rate": 9.047995765580428e-06, "loss": 0.0464, "num_tokens": 45351906.0, "step": 566 }, { "epoch": 0.7074235807860262, "grad_norm": 0.1318104484558839, "learning_rate": 9.044265214773901e-06, "loss": 0.0485, "num_tokens": 45431479.0, "step": 567 }, { "epoch": 0.7086712414223332, "grad_norm": 0.13586766601016315, "learning_rate": 9.040528237143258e-06, "loss": 0.0508, "num_tokens": 45511600.0, "step": 568 }, { "epoch": 0.7099189020586401, "grad_norm": 0.1493946566599566, "learning_rate": 9.036784839464771e-06, "loss": 0.0482, "num_tokens": 45591552.0, "step": 569 }, { "epoch": 0.7111665626949469, "grad_norm": 0.13417312397284684, "learning_rate": 9.033035028526352e-06, "loss": 0.0424, "num_tokens": 45670440.0, "step": 570 }, { "epoch": 0.7124142233312539, "grad_norm": 0.13228632139209826, "learning_rate": 9.029278811127539e-06, "loss": 0.0462, "num_tokens": 45750706.0, "step": 571 }, { "epoch": 0.7136618839675608, "grad_norm": 0.12684341302244245, "learning_rate": 9.025516194079493e-06, "loss": 0.0447, "num_tokens": 45830615.0, "step": 572 }, { "epoch": 0.7149095446038678, "grad_norm": 0.14484313305712285, "learning_rate": 9.021747184204974e-06, "loss": 0.0502, "num_tokens": 45912425.0, "step": 573 }, { "epoch": 0.7161572052401747, "grad_norm": 0.13886894398435345, "learning_rate": 9.017971788338338e-06, "loss": 0.0519, "num_tokens": 45994773.0, "step": 574 }, { "epoch": 0.7174048658764816, "grad_norm": 0.13735085430234897, "learning_rate": 9.014190013325514e-06, "loss": 0.0486, "num_tokens": 46075176.0, "step": 575 }, { "epoch": 0.7186525265127885, "grad_norm": 0.14128392290391406, "learning_rate": 9.010401866024007e-06, "loss": 0.0463, "num_tokens": 46155352.0, "step": 576 }, { "epoch": 0.7199001871490954, "grad_norm": 0.129131515512014, "learning_rate": 9.006607353302874e-06, "loss": 0.0489, "num_tokens": 46236369.0, "step": 577 }, { "epoch": 0.7211478477854024, "grad_norm": 0.1388535780094133, "learning_rate": 9.00280648204271e-06, "loss": 0.0457, "num_tokens": 46315492.0, "step": 578 }, { "epoch": 0.7223955084217093, "grad_norm": 0.13207054260238488, "learning_rate": 8.998999259135648e-06, "loss": 0.0484, "num_tokens": 46395923.0, "step": 579 }, { "epoch": 0.7236431690580162, "grad_norm": 0.13220982688358388, "learning_rate": 8.99518569148533e-06, "loss": 0.0477, "num_tokens": 46476015.0, "step": 580 }, { "epoch": 0.7248908296943232, "grad_norm": 0.13584168006721206, "learning_rate": 8.991365786006908e-06, "loss": 0.0457, "num_tokens": 46555654.0, "step": 581 }, { "epoch": 0.72613849033063, "grad_norm": 0.13797132921454844, "learning_rate": 8.987539549627026e-06, "loss": 0.0488, "num_tokens": 46635834.0, "step": 582 }, { "epoch": 0.727386150966937, "grad_norm": 0.12983941108044414, "learning_rate": 8.983706989283804e-06, "loss": 0.0439, "num_tokens": 46715099.0, "step": 583 }, { "epoch": 0.7286338116032439, "grad_norm": 0.12724057093818048, "learning_rate": 8.979868111926836e-06, "loss": 0.0453, "num_tokens": 46794497.0, "step": 584 }, { "epoch": 0.7298814722395508, "grad_norm": 0.14712794756597866, "learning_rate": 8.976022924517167e-06, "loss": 0.0523, "num_tokens": 46875764.0, "step": 585 }, { "epoch": 0.7311291328758578, "grad_norm": 0.1260016012184683, "learning_rate": 8.972171434027283e-06, "loss": 0.0467, "num_tokens": 46954348.0, "step": 586 }, { "epoch": 0.7323767935121647, "grad_norm": 0.13638875135524814, "learning_rate": 8.968313647441098e-06, "loss": 0.0495, "num_tokens": 47035156.0, "step": 587 }, { "epoch": 0.7336244541484717, "grad_norm": 0.13598723506181007, "learning_rate": 8.964449571753949e-06, "loss": 0.048, "num_tokens": 47114933.0, "step": 588 }, { "epoch": 0.7348721147847785, "grad_norm": 0.13957131093949476, "learning_rate": 8.96057921397257e-06, "loss": 0.0476, "num_tokens": 47195699.0, "step": 589 }, { "epoch": 0.7361197754210854, "grad_norm": 0.13018188693022992, "learning_rate": 8.95670258111509e-06, "loss": 0.0463, "num_tokens": 47275340.0, "step": 590 }, { "epoch": 0.7373674360573924, "grad_norm": 0.12927934262951574, "learning_rate": 8.95281968021102e-06, "loss": 0.0472, "num_tokens": 47355190.0, "step": 591 }, { "epoch": 0.7386150966936993, "grad_norm": 0.13407866372594857, "learning_rate": 8.948930518301228e-06, "loss": 0.047, "num_tokens": 47435372.0, "step": 592 }, { "epoch": 0.7398627573300063, "grad_norm": 0.13953419467602407, "learning_rate": 8.945035102437943e-06, "loss": 0.0457, "num_tokens": 47515076.0, "step": 593 }, { "epoch": 0.7411104179663132, "grad_norm": 0.13257678892969102, "learning_rate": 8.94113343968473e-06, "loss": 0.0472, "num_tokens": 47594913.0, "step": 594 }, { "epoch": 0.74235807860262, "grad_norm": 0.13956156798849514, "learning_rate": 8.937225537116482e-06, "loss": 0.0499, "num_tokens": 47674616.0, "step": 595 }, { "epoch": 0.743605739238927, "grad_norm": 0.12720381017536367, "learning_rate": 8.93331140181941e-06, "loss": 0.0488, "num_tokens": 47754372.0, "step": 596 }, { "epoch": 0.7448533998752339, "grad_norm": 0.13600897445694435, "learning_rate": 8.929391040891022e-06, "loss": 0.0521, "num_tokens": 47834920.0, "step": 597 }, { "epoch": 0.7461010605115409, "grad_norm": 0.1392796000732699, "learning_rate": 8.92546446144012e-06, "loss": 0.0472, "num_tokens": 47914382.0, "step": 598 }, { "epoch": 0.7473487211478478, "grad_norm": 0.1385112153330734, "learning_rate": 8.921531670586778e-06, "loss": 0.0447, "num_tokens": 47993846.0, "step": 599 }, { "epoch": 0.7485963817841547, "grad_norm": 0.12658122859507898, "learning_rate": 8.917592675462333e-06, "loss": 0.0464, "num_tokens": 48073646.0, "step": 600 }, { "epoch": 0.7498440424204617, "grad_norm": 0.1419041010382823, "learning_rate": 8.913647483209376e-06, "loss": 0.0466, "num_tokens": 48153764.0, "step": 601 }, { "epoch": 0.7510917030567685, "grad_norm": 0.12852069420000675, "learning_rate": 8.909696100981734e-06, "loss": 0.046, "num_tokens": 48235033.0, "step": 602 }, { "epoch": 0.7523393636930755, "grad_norm": 0.13929453816389586, "learning_rate": 8.905738535944453e-06, "loss": 0.0456, "num_tokens": 48315153.0, "step": 603 }, { "epoch": 0.7535870243293824, "grad_norm": 0.1294892482761023, "learning_rate": 8.901774795273799e-06, "loss": 0.0487, "num_tokens": 48394989.0, "step": 604 }, { "epoch": 0.7548346849656893, "grad_norm": 0.1265516497561457, "learning_rate": 8.897804886157229e-06, "loss": 0.0447, "num_tokens": 48475313.0, "step": 605 }, { "epoch": 0.7560823456019963, "grad_norm": 0.1250026876119754, "learning_rate": 8.893828815793389e-06, "loss": 0.0445, "num_tokens": 48554294.0, "step": 606 }, { "epoch": 0.7573300062383032, "grad_norm": 0.1395621069701436, "learning_rate": 8.889846591392097e-06, "loss": 0.045, "num_tokens": 48633837.0, "step": 607 }, { "epoch": 0.7585776668746101, "grad_norm": 0.1292703654270298, "learning_rate": 8.88585822017433e-06, "loss": 0.0438, "num_tokens": 48712902.0, "step": 608 }, { "epoch": 0.759825327510917, "grad_norm": 0.12675734039745323, "learning_rate": 8.881863709372207e-06, "loss": 0.0461, "num_tokens": 48792522.0, "step": 609 }, { "epoch": 0.7610729881472239, "grad_norm": 0.12830442879917106, "learning_rate": 8.877863066228987e-06, "loss": 0.0481, "num_tokens": 48873543.0, "step": 610 }, { "epoch": 0.7623206487835309, "grad_norm": 0.13244561051443332, "learning_rate": 8.873856297999045e-06, "loss": 0.047, "num_tokens": 48952745.0, "step": 611 }, { "epoch": 0.7635683094198378, "grad_norm": 0.1615375485783518, "learning_rate": 8.869843411947862e-06, "loss": 0.0496, "num_tokens": 49033599.0, "step": 612 }, { "epoch": 0.7648159700561448, "grad_norm": 0.13780128644772976, "learning_rate": 8.865824415352014e-06, "loss": 0.0461, "num_tokens": 49113085.0, "step": 613 }, { "epoch": 0.7660636306924516, "grad_norm": 0.13586985951384964, "learning_rate": 8.861799315499157e-06, "loss": 0.046, "num_tokens": 49191966.0, "step": 614 }, { "epoch": 0.7673112913287585, "grad_norm": 0.1271021845898844, "learning_rate": 8.85776811968801e-06, "loss": 0.0435, "num_tokens": 49271798.0, "step": 615 }, { "epoch": 0.7685589519650655, "grad_norm": 0.13241677295783946, "learning_rate": 8.853730835228354e-06, "loss": 0.0462, "num_tokens": 49351085.0, "step": 616 }, { "epoch": 0.7698066126013724, "grad_norm": 0.1473649979705641, "learning_rate": 8.849687469441003e-06, "loss": 0.0512, "num_tokens": 49432116.0, "step": 617 }, { "epoch": 0.7710542732376794, "grad_norm": 0.13824389423293423, "learning_rate": 8.845638029657804e-06, "loss": 0.0437, "num_tokens": 49511692.0, "step": 618 }, { "epoch": 0.7723019338739863, "grad_norm": 0.14209836646808868, "learning_rate": 8.841582523221614e-06, "loss": 0.0491, "num_tokens": 49592183.0, "step": 619 }, { "epoch": 0.7735495945102931, "grad_norm": 0.12494073093259925, "learning_rate": 8.83752095748629e-06, "loss": 0.0456, "num_tokens": 49672496.0, "step": 620 }, { "epoch": 0.7747972551466001, "grad_norm": 0.1494311981433102, "learning_rate": 8.833453339816682e-06, "loss": 0.0544, "num_tokens": 49752281.0, "step": 621 }, { "epoch": 0.776044915782907, "grad_norm": 0.14004390380525242, "learning_rate": 8.829379677588607e-06, "loss": 0.0486, "num_tokens": 49831426.0, "step": 622 }, { "epoch": 0.777292576419214, "grad_norm": 0.1339521325499603, "learning_rate": 8.825299978188847e-06, "loss": 0.0462, "num_tokens": 49911746.0, "step": 623 }, { "epoch": 0.7785402370555209, "grad_norm": 0.12799070765367507, "learning_rate": 8.821214249015133e-06, "loss": 0.0427, "num_tokens": 49990124.0, "step": 624 }, { "epoch": 0.7797878976918278, "grad_norm": 0.14563763920223902, "learning_rate": 8.817122497476122e-06, "loss": 0.0433, "num_tokens": 50069850.0, "step": 625 }, { "epoch": 0.7810355583281348, "grad_norm": 0.1412962344681427, "learning_rate": 8.8130247309914e-06, "loss": 0.0465, "num_tokens": 50148928.0, "step": 626 }, { "epoch": 0.7822832189644416, "grad_norm": 0.12673793649670753, "learning_rate": 8.808920956991455e-06, "loss": 0.0486, "num_tokens": 50229320.0, "step": 627 }, { "epoch": 0.7835308796007486, "grad_norm": 0.12112230001153174, "learning_rate": 8.80481118291767e-06, "loss": 0.0467, "num_tokens": 50310309.0, "step": 628 }, { "epoch": 0.7847785402370555, "grad_norm": 0.12671902753289974, "learning_rate": 8.800695416222305e-06, "loss": 0.0455, "num_tokens": 50389538.0, "step": 629 }, { "epoch": 0.7860262008733624, "grad_norm": 0.12959073046474556, "learning_rate": 8.796573664368492e-06, "loss": 0.0461, "num_tokens": 50469795.0, "step": 630 }, { "epoch": 0.7872738615096694, "grad_norm": 0.13262046702793368, "learning_rate": 8.792445934830215e-06, "loss": 0.0475, "num_tokens": 50550218.0, "step": 631 }, { "epoch": 0.7885215221459763, "grad_norm": 0.13536729420156757, "learning_rate": 8.78831223509229e-06, "loss": 0.0448, "num_tokens": 50630135.0, "step": 632 }, { "epoch": 0.7897691827822833, "grad_norm": 0.1342625728159916, "learning_rate": 8.784172572650366e-06, "loss": 0.0425, "num_tokens": 50709708.0, "step": 633 }, { "epoch": 0.7910168434185901, "grad_norm": 0.1359058372010772, "learning_rate": 8.780026955010903e-06, "loss": 0.0441, "num_tokens": 50789219.0, "step": 634 }, { "epoch": 0.7922645040548971, "grad_norm": 0.12874679521506371, "learning_rate": 8.77587538969116e-06, "loss": 0.0437, "num_tokens": 50869389.0, "step": 635 }, { "epoch": 0.793512164691204, "grad_norm": 0.1369083708111066, "learning_rate": 8.771717884219177e-06, "loss": 0.0493, "num_tokens": 50949396.0, "step": 636 }, { "epoch": 0.7947598253275109, "grad_norm": 0.14129983597974075, "learning_rate": 8.767554446133771e-06, "loss": 0.0451, "num_tokens": 51029592.0, "step": 637 }, { "epoch": 0.7960074859638179, "grad_norm": 0.1446719706863878, "learning_rate": 8.763385082984511e-06, "loss": 0.0506, "num_tokens": 51110547.0, "step": 638 }, { "epoch": 0.7972551466001248, "grad_norm": 0.13560344797018603, "learning_rate": 8.759209802331714e-06, "loss": 0.0472, "num_tokens": 51191617.0, "step": 639 }, { "epoch": 0.7985028072364317, "grad_norm": 0.15251529644301726, "learning_rate": 8.755028611746426e-06, "loss": 0.048, "num_tokens": 51271860.0, "step": 640 }, { "epoch": 0.7997504678727386, "grad_norm": 0.12215214673056587, "learning_rate": 8.750841518810407e-06, "loss": 0.0476, "num_tokens": 51351464.0, "step": 641 }, { "epoch": 0.8009981285090455, "grad_norm": 0.1295074754396241, "learning_rate": 8.746648531116126e-06, "loss": 0.0443, "num_tokens": 51432841.0, "step": 642 }, { "epoch": 0.8022457891453525, "grad_norm": 0.12895613658486638, "learning_rate": 8.742449656266733e-06, "loss": 0.0456, "num_tokens": 51512108.0, "step": 643 }, { "epoch": 0.8034934497816594, "grad_norm": 0.12440364391857861, "learning_rate": 8.738244901876061e-06, "loss": 0.0447, "num_tokens": 51592279.0, "step": 644 }, { "epoch": 0.8047411104179664, "grad_norm": 0.1326702881926524, "learning_rate": 8.7340342755686e-06, "loss": 0.0447, "num_tokens": 51672173.0, "step": 645 }, { "epoch": 0.8059887710542732, "grad_norm": 0.13915957948227095, "learning_rate": 8.729817784979485e-06, "loss": 0.049, "num_tokens": 51753477.0, "step": 646 }, { "epoch": 0.8072364316905801, "grad_norm": 0.14093605623183406, "learning_rate": 8.725595437754489e-06, "loss": 0.0649, "num_tokens": 51834370.0, "step": 647 }, { "epoch": 0.8084840923268871, "grad_norm": 0.1257616499498055, "learning_rate": 8.721367241550007e-06, "loss": 0.0433, "num_tokens": 51913279.0, "step": 648 }, { "epoch": 0.809731752963194, "grad_norm": 0.14095744719546538, "learning_rate": 8.717133204033034e-06, "loss": 0.0418, "num_tokens": 51991782.0, "step": 649 }, { "epoch": 0.810979413599501, "grad_norm": 0.13700683189419194, "learning_rate": 8.71289333288116e-06, "loss": 0.0469, "num_tokens": 52071863.0, "step": 650 }, { "epoch": 0.8122270742358079, "grad_norm": 0.13167335028118232, "learning_rate": 8.708647635782553e-06, "loss": 0.0469, "num_tokens": 52151659.0, "step": 651 }, { "epoch": 0.8134747348721147, "grad_norm": 0.13212682360066533, "learning_rate": 8.704396120435944e-06, "loss": 0.0418, "num_tokens": 52230329.0, "step": 652 }, { "epoch": 0.8147223955084217, "grad_norm": 0.13460259814153436, "learning_rate": 8.700138794550617e-06, "loss": 0.0477, "num_tokens": 52310621.0, "step": 653 }, { "epoch": 0.8159700561447286, "grad_norm": 0.1262576220568161, "learning_rate": 8.695875665846392e-06, "loss": 0.043, "num_tokens": 52390363.0, "step": 654 }, { "epoch": 0.8172177167810356, "grad_norm": 0.13451324588776792, "learning_rate": 8.691606742053608e-06, "loss": 0.0445, "num_tokens": 52470407.0, "step": 655 }, { "epoch": 0.8184653774173425, "grad_norm": 0.13838060943839892, "learning_rate": 8.687332030913114e-06, "loss": 0.0455, "num_tokens": 52550801.0, "step": 656 }, { "epoch": 0.8197130380536494, "grad_norm": 0.12544846461435052, "learning_rate": 8.683051540176252e-06, "loss": 0.0453, "num_tokens": 52630184.0, "step": 657 }, { "epoch": 0.8209606986899564, "grad_norm": 0.1227925470813221, "learning_rate": 8.67876527760485e-06, "loss": 0.0449, "num_tokens": 52710226.0, "step": 658 }, { "epoch": 0.8222083593262632, "grad_norm": 0.14125944393926942, "learning_rate": 8.674473250971194e-06, "loss": 0.0479, "num_tokens": 52789646.0, "step": 659 }, { "epoch": 0.8234560199625702, "grad_norm": 0.12898991776625454, "learning_rate": 8.670175468058027e-06, "loss": 0.0453, "num_tokens": 52870777.0, "step": 660 }, { "epoch": 0.8247036805988771, "grad_norm": 0.12662076248706405, "learning_rate": 8.665871936658525e-06, "loss": 0.0464, "num_tokens": 52950874.0, "step": 661 }, { "epoch": 0.825951341235184, "grad_norm": 0.1246338732142322, "learning_rate": 8.661562664576297e-06, "loss": 0.0449, "num_tokens": 53030308.0, "step": 662 }, { "epoch": 0.827199001871491, "grad_norm": 0.12386450841807353, "learning_rate": 8.65724765962535e-06, "loss": 0.0453, "num_tokens": 53110905.0, "step": 663 }, { "epoch": 0.8284466625077979, "grad_norm": 0.1252925833827586, "learning_rate": 8.652926929630097e-06, "loss": 0.0448, "num_tokens": 53190924.0, "step": 664 }, { "epoch": 0.8296943231441049, "grad_norm": 0.13199276345367864, "learning_rate": 8.648600482425325e-06, "loss": 0.0469, "num_tokens": 53271193.0, "step": 665 }, { "epoch": 0.8309419837804117, "grad_norm": 0.1226282221225014, "learning_rate": 8.644268325856193e-06, "loss": 0.0434, "num_tokens": 53350537.0, "step": 666 }, { "epoch": 0.8321896444167186, "grad_norm": 0.12053686374626792, "learning_rate": 8.639930467778206e-06, "loss": 0.0438, "num_tokens": 53432655.0, "step": 667 }, { "epoch": 0.8334373050530256, "grad_norm": 0.12451975349671367, "learning_rate": 8.635586916057214e-06, "loss": 0.0445, "num_tokens": 53512112.0, "step": 668 }, { "epoch": 0.8346849656893325, "grad_norm": 0.12083730426276222, "learning_rate": 8.631237678569391e-06, "loss": 0.0462, "num_tokens": 53593075.0, "step": 669 }, { "epoch": 0.8359326263256395, "grad_norm": 0.12769247867470077, "learning_rate": 8.626882763201215e-06, "loss": 0.0429, "num_tokens": 53672672.0, "step": 670 }, { "epoch": 0.8371802869619464, "grad_norm": 0.1238630262805468, "learning_rate": 8.62252217784947e-06, "loss": 0.0427, "num_tokens": 53751669.0, "step": 671 }, { "epoch": 0.8384279475982532, "grad_norm": 0.1391950829315506, "learning_rate": 8.61815593042121e-06, "loss": 0.0437, "num_tokens": 53831916.0, "step": 672 }, { "epoch": 0.8396756082345602, "grad_norm": 0.12832813628154105, "learning_rate": 8.61378402883376e-06, "loss": 0.0454, "num_tokens": 53911127.0, "step": 673 }, { "epoch": 0.8409232688708671, "grad_norm": 0.12869289586194874, "learning_rate": 8.609406481014704e-06, "loss": 0.0493, "num_tokens": 53992015.0, "step": 674 }, { "epoch": 0.8421709295071741, "grad_norm": 0.1295243174606551, "learning_rate": 8.605023294901857e-06, "loss": 0.0453, "num_tokens": 54074688.0, "step": 675 }, { "epoch": 0.843418590143481, "grad_norm": 0.14327157160951173, "learning_rate": 8.600634478443262e-06, "loss": 0.0475, "num_tokens": 54154762.0, "step": 676 }, { "epoch": 0.8446662507797879, "grad_norm": 0.13689413605347608, "learning_rate": 8.596240039597168e-06, "loss": 0.0487, "num_tokens": 54234083.0, "step": 677 }, { "epoch": 0.8459139114160948, "grad_norm": 0.12476227243203361, "learning_rate": 8.59183998633202e-06, "loss": 0.043, "num_tokens": 54313838.0, "step": 678 }, { "epoch": 0.8471615720524017, "grad_norm": 0.12843919174639454, "learning_rate": 8.587434326626446e-06, "loss": 0.0439, "num_tokens": 54393140.0, "step": 679 }, { "epoch": 0.8484092326887087, "grad_norm": 0.13686412036721382, "learning_rate": 8.58302306846924e-06, "loss": 0.0501, "num_tokens": 54474299.0, "step": 680 }, { "epoch": 0.8496568933250156, "grad_norm": 0.13617664891691592, "learning_rate": 8.57860621985934e-06, "loss": 0.0445, "num_tokens": 54553494.0, "step": 681 }, { "epoch": 0.8509045539613225, "grad_norm": 0.1271657490150743, "learning_rate": 8.574183788805838e-06, "loss": 0.044, "num_tokens": 54633507.0, "step": 682 }, { "epoch": 0.8521522145976295, "grad_norm": 0.13384106872272458, "learning_rate": 8.56975578332793e-06, "loss": 0.0443, "num_tokens": 54713372.0, "step": 683 }, { "epoch": 0.8533998752339363, "grad_norm": 0.11560773243774969, "learning_rate": 8.56532221145493e-06, "loss": 0.0429, "num_tokens": 54792488.0, "step": 684 }, { "epoch": 0.8546475358702433, "grad_norm": 0.1212428276738136, "learning_rate": 8.560883081226246e-06, "loss": 0.0443, "num_tokens": 54873959.0, "step": 685 }, { "epoch": 0.8558951965065502, "grad_norm": 0.14025896849225925, "learning_rate": 8.55643840069136e-06, "loss": 0.0466, "num_tokens": 54954820.0, "step": 686 }, { "epoch": 0.8571428571428571, "grad_norm": 0.13808021632034995, "learning_rate": 8.551988177909825e-06, "loss": 0.0462, "num_tokens": 55035474.0, "step": 687 }, { "epoch": 0.8583905177791641, "grad_norm": 0.14688685012575073, "learning_rate": 8.547532420951236e-06, "loss": 0.0474, "num_tokens": 55115987.0, "step": 688 }, { "epoch": 0.859638178415471, "grad_norm": 0.13519377075289518, "learning_rate": 8.543071137895231e-06, "loss": 0.0486, "num_tokens": 55195473.0, "step": 689 }, { "epoch": 0.860885839051778, "grad_norm": 0.1383342982212882, "learning_rate": 8.538604336831463e-06, "loss": 0.0456, "num_tokens": 55274897.0, "step": 690 }, { "epoch": 0.8621334996880848, "grad_norm": 0.12302270516387563, "learning_rate": 8.53413202585959e-06, "loss": 0.042, "num_tokens": 55354684.0, "step": 691 }, { "epoch": 0.8633811603243917, "grad_norm": 0.12034339097373062, "learning_rate": 8.529654213089266e-06, "loss": 0.0416, "num_tokens": 55434479.0, "step": 692 }, { "epoch": 0.8646288209606987, "grad_norm": 0.12441949815381428, "learning_rate": 8.52517090664012e-06, "loss": 0.0434, "num_tokens": 55513816.0, "step": 693 }, { "epoch": 0.8658764815970056, "grad_norm": 0.13357252884483028, "learning_rate": 8.520682114641739e-06, "loss": 0.0441, "num_tokens": 55593435.0, "step": 694 }, { "epoch": 0.8671241422333126, "grad_norm": 0.12424156938040647, "learning_rate": 8.51618784523366e-06, "loss": 0.0445, "num_tokens": 55673866.0, "step": 695 }, { "epoch": 0.8683718028696195, "grad_norm": 0.13488286343188147, "learning_rate": 8.511688106565356e-06, "loss": 0.0462, "num_tokens": 55754183.0, "step": 696 }, { "epoch": 0.8696194635059263, "grad_norm": 0.12902239020926456, "learning_rate": 8.507182906796209e-06, "loss": 0.0479, "num_tokens": 55835122.0, "step": 697 }, { "epoch": 0.8708671241422333, "grad_norm": 0.12700265076298542, "learning_rate": 8.50267225409551e-06, "loss": 0.0487, "num_tokens": 55916059.0, "step": 698 }, { "epoch": 0.8721147847785402, "grad_norm": 0.12439806501667146, "learning_rate": 8.498156156642434e-06, "loss": 0.0482, "num_tokens": 55996309.0, "step": 699 }, { "epoch": 0.8733624454148472, "grad_norm": 0.13662835946719407, "learning_rate": 8.493634622626031e-06, "loss": 0.0487, "num_tokens": 56076989.0, "step": 700 }, { "epoch": 0.8746101060511541, "grad_norm": 0.13097355494242618, "learning_rate": 8.489107660245208e-06, "loss": 0.0455, "num_tokens": 56156600.0, "step": 701 }, { "epoch": 0.875857766687461, "grad_norm": 0.12619811815079715, "learning_rate": 8.484575277708718e-06, "loss": 0.0482, "num_tokens": 56237662.0, "step": 702 }, { "epoch": 0.877105427323768, "grad_norm": 0.13272981560160393, "learning_rate": 8.480037483235142e-06, "loss": 0.0443, "num_tokens": 56318037.0, "step": 703 }, { "epoch": 0.8783530879600748, "grad_norm": 0.13876316053346943, "learning_rate": 8.475494285052873e-06, "loss": 0.0433, "num_tokens": 56397397.0, "step": 704 }, { "epoch": 0.8796007485963818, "grad_norm": 0.12990102861221764, "learning_rate": 8.470945691400095e-06, "loss": 0.0491, "num_tokens": 56478651.0, "step": 705 }, { "epoch": 0.8808484092326887, "grad_norm": 0.12177730095213436, "learning_rate": 8.466391710524792e-06, "loss": 0.0454, "num_tokens": 56559439.0, "step": 706 }, { "epoch": 0.8820960698689956, "grad_norm": 0.13073319119141943, "learning_rate": 8.461832350684701e-06, "loss": 0.044, "num_tokens": 56639419.0, "step": 707 }, { "epoch": 0.8833437305053026, "grad_norm": 0.13039116346051194, "learning_rate": 8.457267620147326e-06, "loss": 0.0479, "num_tokens": 56720177.0, "step": 708 }, { "epoch": 0.8845913911416095, "grad_norm": 0.11392658067382258, "learning_rate": 8.452697527189901e-06, "loss": 0.0417, "num_tokens": 56799870.0, "step": 709 }, { "epoch": 0.8858390517779164, "grad_norm": 0.12172332991038214, "learning_rate": 8.448122080099384e-06, "loss": 0.0436, "num_tokens": 56879737.0, "step": 710 }, { "epoch": 0.8870867124142233, "grad_norm": 0.12348030891715099, "learning_rate": 8.443541287172443e-06, "loss": 0.0464, "num_tokens": 56960161.0, "step": 711 }, { "epoch": 0.8883343730505302, "grad_norm": 0.1367197777768623, "learning_rate": 8.438955156715443e-06, "loss": 0.0472, "num_tokens": 57040569.0, "step": 712 }, { "epoch": 0.8895820336868372, "grad_norm": 0.12968055372030557, "learning_rate": 8.434363697044423e-06, "loss": 0.0475, "num_tokens": 57120700.0, "step": 713 }, { "epoch": 0.8908296943231441, "grad_norm": 0.10708595724786292, "learning_rate": 8.429766916485087e-06, "loss": 0.0398, "num_tokens": 57199169.0, "step": 714 }, { "epoch": 0.8920773549594511, "grad_norm": 0.1289490659534409, "learning_rate": 8.42516482337279e-06, "loss": 0.0419, "num_tokens": 57278512.0, "step": 715 }, { "epoch": 0.8933250155957579, "grad_norm": 0.13480571497635496, "learning_rate": 8.420557426052513e-06, "loss": 0.0432, "num_tokens": 57358212.0, "step": 716 }, { "epoch": 0.8945726762320648, "grad_norm": 0.1402455870237538, "learning_rate": 8.415944732878863e-06, "loss": 0.0413, "num_tokens": 57437516.0, "step": 717 }, { "epoch": 0.8958203368683718, "grad_norm": 0.13195092592426141, "learning_rate": 8.411326752216048e-06, "loss": 0.0469, "num_tokens": 57518129.0, "step": 718 }, { "epoch": 0.8970679975046787, "grad_norm": 0.12111826822571906, "learning_rate": 8.406703492437863e-06, "loss": 0.0464, "num_tokens": 57598680.0, "step": 719 }, { "epoch": 0.8983156581409857, "grad_norm": 0.1316599285653508, "learning_rate": 8.402074961927674e-06, "loss": 0.0467, "num_tokens": 57679535.0, "step": 720 }, { "epoch": 0.8995633187772926, "grad_norm": 0.12648224011362696, "learning_rate": 8.397441169078404e-06, "loss": 0.0474, "num_tokens": 57761803.0, "step": 721 }, { "epoch": 0.9008109794135996, "grad_norm": 0.13349369075280076, "learning_rate": 8.392802122292522e-06, "loss": 0.0453, "num_tokens": 57841711.0, "step": 722 }, { "epoch": 0.9020586400499064, "grad_norm": 0.12412030984325427, "learning_rate": 8.388157829982023e-06, "loss": 0.0462, "num_tokens": 57921862.0, "step": 723 }, { "epoch": 0.9033063006862133, "grad_norm": 0.12191290188001579, "learning_rate": 8.383508300568409e-06, "loss": 0.0423, "num_tokens": 58001170.0, "step": 724 }, { "epoch": 0.9045539613225203, "grad_norm": 0.12947888801849444, "learning_rate": 8.378853542482687e-06, "loss": 0.0444, "num_tokens": 58078720.0, "step": 725 }, { "epoch": 0.9058016219588272, "grad_norm": 0.12187109531791596, "learning_rate": 8.374193564165338e-06, "loss": 0.0417, "num_tokens": 58158057.0, "step": 726 }, { "epoch": 0.9070492825951342, "grad_norm": 0.1303865282661451, "learning_rate": 8.36952837406631e-06, "loss": 0.044, "num_tokens": 58237021.0, "step": 727 }, { "epoch": 0.9082969432314411, "grad_norm": 0.12384626240849539, "learning_rate": 8.364857980645006e-06, "loss": 0.0436, "num_tokens": 58318537.0, "step": 728 }, { "epoch": 0.9095446038677479, "grad_norm": 0.123878366797376, "learning_rate": 8.360182392370258e-06, "loss": 0.0463, "num_tokens": 58398712.0, "step": 729 }, { "epoch": 0.9107922645040549, "grad_norm": 0.1252594849614702, "learning_rate": 8.355501617720321e-06, "loss": 0.0469, "num_tokens": 58480120.0, "step": 730 }, { "epoch": 0.9120399251403618, "grad_norm": 0.12212319000991989, "learning_rate": 8.350815665182855e-06, "loss": 0.0415, "num_tokens": 58559337.0, "step": 731 }, { "epoch": 0.9132875857766688, "grad_norm": 0.13115254226617012, "learning_rate": 8.34612454325491e-06, "loss": 0.045, "num_tokens": 58639403.0, "step": 732 }, { "epoch": 0.9145352464129757, "grad_norm": 0.1153903682923885, "learning_rate": 8.341428260442907e-06, "loss": 0.0421, "num_tokens": 58719196.0, "step": 733 }, { "epoch": 0.9157829070492826, "grad_norm": 0.1350993442001407, "learning_rate": 8.336726825262622e-06, "loss": 0.0458, "num_tokens": 58798792.0, "step": 734 }, { "epoch": 0.9170305676855895, "grad_norm": 0.116827202506226, "learning_rate": 8.332020246239183e-06, "loss": 0.0454, "num_tokens": 58878585.0, "step": 735 }, { "epoch": 0.9182782283218964, "grad_norm": 0.13063470525394222, "learning_rate": 8.327308531907039e-06, "loss": 0.0429, "num_tokens": 58957398.0, "step": 736 }, { "epoch": 0.9195258889582034, "grad_norm": 0.11943276059553698, "learning_rate": 8.322591690809952e-06, "loss": 0.0436, "num_tokens": 59036436.0, "step": 737 }, { "epoch": 0.9207735495945103, "grad_norm": 0.13311888551517942, "learning_rate": 8.317869731500981e-06, "loss": 0.0472, "num_tokens": 59117727.0, "step": 738 }, { "epoch": 0.9220212102308172, "grad_norm": 0.13441754023620334, "learning_rate": 8.313142662542465e-06, "loss": 0.0427, "num_tokens": 59198600.0, "step": 739 }, { "epoch": 0.9232688708671242, "grad_norm": 0.12903006803949557, "learning_rate": 8.30841049250601e-06, "loss": 0.043, "num_tokens": 59276825.0, "step": 740 }, { "epoch": 0.924516531503431, "grad_norm": 0.11892900054116198, "learning_rate": 8.303673229972468e-06, "loss": 0.0428, "num_tokens": 59356479.0, "step": 741 }, { "epoch": 0.925764192139738, "grad_norm": 0.11014129765319786, "learning_rate": 8.298930883531932e-06, "loss": 0.0402, "num_tokens": 59435634.0, "step": 742 }, { "epoch": 0.9270118527760449, "grad_norm": 0.12790379344343886, "learning_rate": 8.294183461783704e-06, "loss": 0.0479, "num_tokens": 59518043.0, "step": 743 }, { "epoch": 0.9282595134123518, "grad_norm": 0.1271886081838466, "learning_rate": 8.2894309733363e-06, "loss": 0.0449, "num_tokens": 59598023.0, "step": 744 }, { "epoch": 0.9295071740486588, "grad_norm": 0.12608804603183602, "learning_rate": 8.284673426807413e-06, "loss": 0.0442, "num_tokens": 59677990.0, "step": 745 }, { "epoch": 0.9307548346849657, "grad_norm": 0.12197353808866439, "learning_rate": 8.279910830823917e-06, "loss": 0.0428, "num_tokens": 59757003.0, "step": 746 }, { "epoch": 0.9320024953212727, "grad_norm": 0.1183743614396383, "learning_rate": 8.275143194021837e-06, "loss": 0.0421, "num_tokens": 59835942.0, "step": 747 }, { "epoch": 0.9332501559575795, "grad_norm": 0.11788779371119068, "learning_rate": 8.270370525046338e-06, "loss": 0.0387, "num_tokens": 59915228.0, "step": 748 }, { "epoch": 0.9344978165938864, "grad_norm": 0.12838666597907625, "learning_rate": 8.265592832551714e-06, "loss": 0.0459, "num_tokens": 59997067.0, "step": 749 }, { "epoch": 0.9357454772301934, "grad_norm": 0.12163620589626947, "learning_rate": 8.260810125201363e-06, "loss": 0.0441, "num_tokens": 60076744.0, "step": 750 }, { "epoch": 0.9369931378665003, "grad_norm": 0.1208216904975396, "learning_rate": 8.25602241166778e-06, "loss": 0.0434, "num_tokens": 60156803.0, "step": 751 }, { "epoch": 0.9382407985028073, "grad_norm": 0.12583135298289463, "learning_rate": 8.251229700632536e-06, "loss": 0.0439, "num_tokens": 60237132.0, "step": 752 }, { "epoch": 0.9394884591391142, "grad_norm": 0.12011312406139918, "learning_rate": 8.246432000786267e-06, "loss": 0.0409, "num_tokens": 60316759.0, "step": 753 }, { "epoch": 0.940736119775421, "grad_norm": 0.1257483923597958, "learning_rate": 8.241629320828652e-06, "loss": 0.0431, "num_tokens": 60395865.0, "step": 754 }, { "epoch": 0.941983780411728, "grad_norm": 0.12598050416310153, "learning_rate": 8.2368216694684e-06, "loss": 0.0448, "num_tokens": 60476033.0, "step": 755 }, { "epoch": 0.9432314410480349, "grad_norm": 0.1371990547888002, "learning_rate": 8.232009055423236e-06, "loss": 0.0429, "num_tokens": 60555833.0, "step": 756 }, { "epoch": 0.9444791016843419, "grad_norm": 0.12968495911088507, "learning_rate": 8.227191487419887e-06, "loss": 0.0431, "num_tokens": 60635691.0, "step": 757 }, { "epoch": 0.9457267623206488, "grad_norm": 0.1222666448004795, "learning_rate": 8.222368974194057e-06, "loss": 0.0423, "num_tokens": 60715830.0, "step": 758 }, { "epoch": 0.9469744229569557, "grad_norm": 0.13249737757353294, "learning_rate": 8.217541524490422e-06, "loss": 0.0504, "num_tokens": 60796607.0, "step": 759 }, { "epoch": 0.9482220835932627, "grad_norm": 0.12046136010955967, "learning_rate": 8.212709147062604e-06, "loss": 0.0407, "num_tokens": 60875388.0, "step": 760 }, { "epoch": 0.9494697442295695, "grad_norm": 0.1347014144528205, "learning_rate": 8.207871850673168e-06, "loss": 0.0418, "num_tokens": 60954745.0, "step": 761 }, { "epoch": 0.9507174048658765, "grad_norm": 0.13217879759164464, "learning_rate": 8.203029644093593e-06, "loss": 0.0473, "num_tokens": 61035577.0, "step": 762 }, { "epoch": 0.9519650655021834, "grad_norm": 0.12642961452954143, "learning_rate": 8.198182536104262e-06, "loss": 0.0438, "num_tokens": 61114939.0, "step": 763 }, { "epoch": 0.9532127261384903, "grad_norm": 0.12699546277574592, "learning_rate": 8.193330535494448e-06, "loss": 0.0433, "num_tokens": 61194941.0, "step": 764 }, { "epoch": 0.9544603867747973, "grad_norm": 0.1248723416107508, "learning_rate": 8.188473651062296e-06, "loss": 0.0444, "num_tokens": 61275037.0, "step": 765 }, { "epoch": 0.9557080474111042, "grad_norm": 0.1224075580549069, "learning_rate": 8.183611891614803e-06, "loss": 0.0413, "num_tokens": 61354585.0, "step": 766 }, { "epoch": 0.9569557080474111, "grad_norm": 0.12023974168888206, "learning_rate": 8.178745265967808e-06, "loss": 0.0413, "num_tokens": 61434070.0, "step": 767 }, { "epoch": 0.958203368683718, "grad_norm": 0.11768365656858075, "learning_rate": 8.173873782945976e-06, "loss": 0.044, "num_tokens": 61513480.0, "step": 768 }, { "epoch": 0.9594510293200249, "grad_norm": 0.12252686093661061, "learning_rate": 8.168997451382778e-06, "loss": 0.0466, "num_tokens": 61593111.0, "step": 769 }, { "epoch": 0.9606986899563319, "grad_norm": 0.12444242831191968, "learning_rate": 8.164116280120478e-06, "loss": 0.0467, "num_tokens": 61673108.0, "step": 770 }, { "epoch": 0.9619463505926388, "grad_norm": 0.1170074311782113, "learning_rate": 8.159230278010113e-06, "loss": 0.0408, "num_tokens": 61752840.0, "step": 771 }, { "epoch": 0.9631940112289458, "grad_norm": 0.11877585272337694, "learning_rate": 8.154339453911483e-06, "loss": 0.0429, "num_tokens": 61832555.0, "step": 772 }, { "epoch": 0.9644416718652526, "grad_norm": 0.12480036823767689, "learning_rate": 8.14944381669313e-06, "loss": 0.0452, "num_tokens": 61912425.0, "step": 773 }, { "epoch": 0.9656893325015595, "grad_norm": 0.11828195611774657, "learning_rate": 8.144543375232322e-06, "loss": 0.0408, "num_tokens": 61992459.0, "step": 774 }, { "epoch": 0.9669369931378665, "grad_norm": 0.1326374355179406, "learning_rate": 8.139638138415041e-06, "loss": 0.0431, "num_tokens": 62073365.0, "step": 775 }, { "epoch": 0.9681846537741734, "grad_norm": 0.13258216878493195, "learning_rate": 8.134728115135967e-06, "loss": 0.042, "num_tokens": 62153344.0, "step": 776 }, { "epoch": 0.9694323144104804, "grad_norm": 0.11425515280955807, "learning_rate": 8.129813314298457e-06, "loss": 0.0429, "num_tokens": 62233445.0, "step": 777 }, { "epoch": 0.9706799750467873, "grad_norm": 0.12422677837162528, "learning_rate": 8.124893744814524e-06, "loss": 0.0483, "num_tokens": 62313446.0, "step": 778 }, { "epoch": 0.9719276356830941, "grad_norm": 0.12548120606989233, "learning_rate": 8.11996941560484e-06, "loss": 0.0423, "num_tokens": 62392794.0, "step": 779 }, { "epoch": 0.9731752963194011, "grad_norm": 0.11634689195370958, "learning_rate": 8.115040335598701e-06, "loss": 0.0441, "num_tokens": 62472294.0, "step": 780 }, { "epoch": 0.974422956955708, "grad_norm": 0.12855102743082578, "learning_rate": 8.110106513734019e-06, "loss": 0.0467, "num_tokens": 62552135.0, "step": 781 }, { "epoch": 0.975670617592015, "grad_norm": 0.12511638236191897, "learning_rate": 8.105167958957302e-06, "loss": 0.0425, "num_tokens": 62632686.0, "step": 782 }, { "epoch": 0.9769182782283219, "grad_norm": 0.1235968437568558, "learning_rate": 8.100224680223647e-06, "loss": 0.0458, "num_tokens": 62712757.0, "step": 783 }, { "epoch": 0.9781659388646288, "grad_norm": 0.12105321822365409, "learning_rate": 8.09527668649671e-06, "loss": 0.0455, "num_tokens": 62792452.0, "step": 784 }, { "epoch": 0.9794135995009358, "grad_norm": 0.12052561809276435, "learning_rate": 8.090323986748696e-06, "loss": 0.0433, "num_tokens": 62872571.0, "step": 785 }, { "epoch": 0.9806612601372426, "grad_norm": 0.11630478819939606, "learning_rate": 8.085366589960353e-06, "loss": 0.0434, "num_tokens": 62954093.0, "step": 786 }, { "epoch": 0.9819089207735496, "grad_norm": 0.12090710790599722, "learning_rate": 8.080404505120936e-06, "loss": 0.0415, "num_tokens": 63033837.0, "step": 787 }, { "epoch": 0.9831565814098565, "grad_norm": 0.11999218162825294, "learning_rate": 8.075437741228205e-06, "loss": 0.044, "num_tokens": 63113726.0, "step": 788 }, { "epoch": 0.9844042420461634, "grad_norm": 0.12012264791201754, "learning_rate": 8.070466307288404e-06, "loss": 0.045, "num_tokens": 63195437.0, "step": 789 }, { "epoch": 0.9856519026824704, "grad_norm": 0.10297347591984726, "learning_rate": 8.065490212316245e-06, "loss": 0.0385, "num_tokens": 63274120.0, "step": 790 }, { "epoch": 0.9868995633187773, "grad_norm": 0.1159425491489533, "learning_rate": 8.060509465334895e-06, "loss": 0.0421, "num_tokens": 63353768.0, "step": 791 }, { "epoch": 0.9881472239550843, "grad_norm": 0.12583816039244836, "learning_rate": 8.055524075375951e-06, "loss": 0.044, "num_tokens": 63433687.0, "step": 792 }, { "epoch": 0.9893948845913911, "grad_norm": 0.1300665820006064, "learning_rate": 8.050534051479432e-06, "loss": 0.0634, "num_tokens": 63514928.0, "step": 793 }, { "epoch": 0.990642545227698, "grad_norm": 0.1198989461072627, "learning_rate": 8.045539402693759e-06, "loss": 0.0435, "num_tokens": 63594238.0, "step": 794 }, { "epoch": 0.991890205864005, "grad_norm": 0.12871787019951422, "learning_rate": 8.040540138075743e-06, "loss": 0.044, "num_tokens": 63674352.0, "step": 795 }, { "epoch": 0.9931378665003119, "grad_norm": 0.14279234821369916, "learning_rate": 8.035536266690561e-06, "loss": 0.0437, "num_tokens": 63755466.0, "step": 796 }, { "epoch": 0.9943855271366189, "grad_norm": 0.12668717371746102, "learning_rate": 8.030527797611742e-06, "loss": 0.0472, "num_tokens": 63835541.0, "step": 797 }, { "epoch": 0.9956331877729258, "grad_norm": 0.12504493154631438, "learning_rate": 8.025514739921155e-06, "loss": 0.0439, "num_tokens": 63914999.0, "step": 798 }, { "epoch": 0.9968808484092326, "grad_norm": 0.12250309041796634, "learning_rate": 8.02049710270899e-06, "loss": 0.0408, "num_tokens": 63995141.0, "step": 799 }, { "epoch": 0.9981285090455396, "grad_norm": 0.11677292603996331, "learning_rate": 8.015474895073739e-06, "loss": 0.042, "num_tokens": 64074961.0, "step": 800 }, { "epoch": 0.9993761696818465, "grad_norm": 0.11305611872055077, "learning_rate": 8.010448126122183e-06, "loss": 0.0424, "num_tokens": 64156160.0, "step": 801 }, { "epoch": 1.0, "grad_norm": 0.11305611872055077, "learning_rate": 8.005416804969374e-06, "loss": 0.0453, "num_tokens": 64196778.0, "step": 802 }, { "epoch": 1.001247660636307, "grad_norm": 0.19297241332841056, "learning_rate": 8.000380940738616e-06, "loss": 0.0365, "num_tokens": 64276858.0, "step": 803 }, { "epoch": 1.0024953212726138, "grad_norm": 0.12339605814727976, "learning_rate": 7.995340542561453e-06, "loss": 0.0365, "num_tokens": 64356382.0, "step": 804 }, { "epoch": 1.0037429819089208, "grad_norm": 0.10655882436094036, "learning_rate": 7.990295619577653e-06, "loss": 0.0403, "num_tokens": 64437312.0, "step": 805 }, { "epoch": 1.0049906425452277, "grad_norm": 0.11516009325890297, "learning_rate": 7.985246180935184e-06, "loss": 0.0414, "num_tokens": 64518748.0, "step": 806 }, { "epoch": 1.0062383031815347, "grad_norm": 0.11139160659856995, "learning_rate": 7.980192235790207e-06, "loss": 0.0367, "num_tokens": 64598711.0, "step": 807 }, { "epoch": 1.0074859638178415, "grad_norm": 0.12064118588838565, "learning_rate": 7.97513379330705e-06, "loss": 0.0375, "num_tokens": 64678489.0, "step": 808 }, { "epoch": 1.0087336244541485, "grad_norm": 0.11389522139854903, "learning_rate": 7.970070862658198e-06, "loss": 0.036, "num_tokens": 64758290.0, "step": 809 }, { "epoch": 1.0099812850904555, "grad_norm": 0.11582742904776548, "learning_rate": 7.965003453024273e-06, "loss": 0.0387, "num_tokens": 64838965.0, "step": 810 }, { "epoch": 1.0112289457267623, "grad_norm": 0.11758744952524129, "learning_rate": 7.959931573594025e-06, "loss": 0.0373, "num_tokens": 64919317.0, "step": 811 }, { "epoch": 1.0124766063630692, "grad_norm": 0.1087404021707414, "learning_rate": 7.954855233564301e-06, "loss": 0.036, "num_tokens": 65000106.0, "step": 812 }, { "epoch": 1.0137242669993762, "grad_norm": 0.13115802969604487, "learning_rate": 7.949774442140043e-06, "loss": 0.0387, "num_tokens": 65079674.0, "step": 813 }, { "epoch": 1.014971927635683, "grad_norm": 0.1266899812719372, "learning_rate": 7.944689208534257e-06, "loss": 0.0383, "num_tokens": 65160522.0, "step": 814 }, { "epoch": 1.01621958827199, "grad_norm": 0.12285583224697105, "learning_rate": 7.939599541968012e-06, "loss": 0.0383, "num_tokens": 65240870.0, "step": 815 }, { "epoch": 1.017467248908297, "grad_norm": 0.1190497077658083, "learning_rate": 7.93450545167041e-06, "loss": 0.0352, "num_tokens": 65319646.0, "step": 816 }, { "epoch": 1.018714909544604, "grad_norm": 0.12563652698818017, "learning_rate": 7.929406946878576e-06, "loss": 0.0388, "num_tokens": 65400645.0, "step": 817 }, { "epoch": 1.0199625701809107, "grad_norm": 0.1234239723405108, "learning_rate": 7.924304036837643e-06, "loss": 0.0373, "num_tokens": 65479114.0, "step": 818 }, { "epoch": 1.0212102308172177, "grad_norm": 0.13005122038538955, "learning_rate": 7.919196730800727e-06, "loss": 0.0373, "num_tokens": 65559516.0, "step": 819 }, { "epoch": 1.0224578914535247, "grad_norm": 0.11725724234923955, "learning_rate": 7.914085038028918e-06, "loss": 0.0357, "num_tokens": 65639488.0, "step": 820 }, { "epoch": 1.0237055520898315, "grad_norm": 0.11014632164428752, "learning_rate": 7.908968967791262e-06, "loss": 0.0358, "num_tokens": 65719075.0, "step": 821 }, { "epoch": 1.0249532127261385, "grad_norm": 0.11721508584086246, "learning_rate": 7.903848529364738e-06, "loss": 0.0373, "num_tokens": 65799000.0, "step": 822 }, { "epoch": 1.0262008733624455, "grad_norm": 0.11517241066377722, "learning_rate": 7.89872373203425e-06, "loss": 0.0389, "num_tokens": 65878502.0, "step": 823 }, { "epoch": 1.0274485339987522, "grad_norm": 0.12088239271003214, "learning_rate": 7.893594585092601e-06, "loss": 0.0374, "num_tokens": 65959219.0, "step": 824 }, { "epoch": 1.0286961946350592, "grad_norm": 0.12193974539111146, "learning_rate": 7.888461097840494e-06, "loss": 0.0358, "num_tokens": 66039818.0, "step": 825 }, { "epoch": 1.0299438552713662, "grad_norm": 0.12182216187687943, "learning_rate": 7.883323279586483e-06, "loss": 0.0374, "num_tokens": 66119441.0, "step": 826 }, { "epoch": 1.0311915159076732, "grad_norm": 0.11841925845138539, "learning_rate": 7.87818113964699e-06, "loss": 0.0377, "num_tokens": 66199667.0, "step": 827 }, { "epoch": 1.03243917654398, "grad_norm": 0.12421286713634912, "learning_rate": 7.873034687346268e-06, "loss": 0.0371, "num_tokens": 66279952.0, "step": 828 }, { "epoch": 1.033686837180287, "grad_norm": 0.11425196856934319, "learning_rate": 7.86788393201639e-06, "loss": 0.0366, "num_tokens": 66359882.0, "step": 829 }, { "epoch": 1.034934497816594, "grad_norm": 0.12393074676184707, "learning_rate": 7.862728882997236e-06, "loss": 0.0372, "num_tokens": 66439831.0, "step": 830 }, { "epoch": 1.0361821584529007, "grad_norm": 0.11799704195610591, "learning_rate": 7.857569549636462e-06, "loss": 0.0416, "num_tokens": 66519952.0, "step": 831 }, { "epoch": 1.0374298190892077, "grad_norm": 0.12409491334059666, "learning_rate": 7.852405941289503e-06, "loss": 0.0372, "num_tokens": 66598948.0, "step": 832 }, { "epoch": 1.0386774797255147, "grad_norm": 0.12220754153408792, "learning_rate": 7.847238067319542e-06, "loss": 0.0372, "num_tokens": 66680599.0, "step": 833 }, { "epoch": 1.0399251403618215, "grad_norm": 0.12805122770002453, "learning_rate": 7.842065937097495e-06, "loss": 0.0357, "num_tokens": 66759842.0, "step": 834 }, { "epoch": 1.0411728009981285, "grad_norm": 0.11193160429999634, "learning_rate": 7.836889560001997e-06, "loss": 0.0372, "num_tokens": 66839975.0, "step": 835 }, { "epoch": 1.0424204616344355, "grad_norm": 0.11705280167012153, "learning_rate": 7.831708945419383e-06, "loss": 0.0354, "num_tokens": 66919473.0, "step": 836 }, { "epoch": 1.0436681222707425, "grad_norm": 0.11882428882714241, "learning_rate": 7.826524102743678e-06, "loss": 0.0469, "num_tokens": 67000971.0, "step": 837 }, { "epoch": 1.0449157829070492, "grad_norm": 0.11949649165411871, "learning_rate": 7.821335041376565e-06, "loss": 0.0399, "num_tokens": 67082398.0, "step": 838 }, { "epoch": 1.0461634435433562, "grad_norm": 0.13222049645536524, "learning_rate": 7.816141770727381e-06, "loss": 0.038, "num_tokens": 67162807.0, "step": 839 }, { "epoch": 1.0474111041796632, "grad_norm": 0.1287747273510056, "learning_rate": 7.810944300213095e-06, "loss": 0.0365, "num_tokens": 67243036.0, "step": 840 }, { "epoch": 1.04865876481597, "grad_norm": 0.12359681273704244, "learning_rate": 7.805742639258297e-06, "loss": 0.0377, "num_tokens": 67324171.0, "step": 841 }, { "epoch": 1.049906425452277, "grad_norm": 0.11791544460990037, "learning_rate": 7.800536797295164e-06, "loss": 0.0385, "num_tokens": 67406821.0, "step": 842 }, { "epoch": 1.051154086088584, "grad_norm": 0.11669683358736094, "learning_rate": 7.795326783763463e-06, "loss": 0.036, "num_tokens": 67486421.0, "step": 843 }, { "epoch": 1.0524017467248907, "grad_norm": 0.1145704300717487, "learning_rate": 7.790112608110523e-06, "loss": 0.0375, "num_tokens": 67566397.0, "step": 844 }, { "epoch": 1.0536494073611977, "grad_norm": 0.11192160356785108, "learning_rate": 7.784894279791224e-06, "loss": 0.0355, "num_tokens": 67646842.0, "step": 845 }, { "epoch": 1.0548970679975047, "grad_norm": 0.11562580889859264, "learning_rate": 7.779671808267968e-06, "loss": 0.0378, "num_tokens": 67727482.0, "step": 846 }, { "epoch": 1.0561447286338117, "grad_norm": 0.12153279759646565, "learning_rate": 7.774445203010676e-06, "loss": 0.0359, "num_tokens": 67807939.0, "step": 847 }, { "epoch": 1.0573923892701185, "grad_norm": 0.11470566792658915, "learning_rate": 7.769214473496766e-06, "loss": 0.0407, "num_tokens": 67888052.0, "step": 848 }, { "epoch": 1.0586400499064255, "grad_norm": 0.12383359576287097, "learning_rate": 7.763979629211127e-06, "loss": 0.038, "num_tokens": 67968032.0, "step": 849 }, { "epoch": 1.0598877105427325, "grad_norm": 0.11881454127188035, "learning_rate": 7.758740679646115e-06, "loss": 0.0371, "num_tokens": 68046937.0, "step": 850 }, { "epoch": 1.0611353711790392, "grad_norm": 0.11464903476697275, "learning_rate": 7.753497634301532e-06, "loss": 0.0364, "num_tokens": 68127203.0, "step": 851 }, { "epoch": 1.0623830318153462, "grad_norm": 0.12119441827724774, "learning_rate": 7.748250502684601e-06, "loss": 0.0367, "num_tokens": 68207782.0, "step": 852 }, { "epoch": 1.0636306924516532, "grad_norm": 0.13058645569693353, "learning_rate": 7.742999294309959e-06, "loss": 0.037, "num_tokens": 68287141.0, "step": 853 }, { "epoch": 1.06487835308796, "grad_norm": 0.12137449605751832, "learning_rate": 7.737744018699634e-06, "loss": 0.0386, "num_tokens": 68367278.0, "step": 854 }, { "epoch": 1.066126013724267, "grad_norm": 0.1175051411674423, "learning_rate": 7.732484685383027e-06, "loss": 0.0378, "num_tokens": 68448023.0, "step": 855 }, { "epoch": 1.067373674360574, "grad_norm": 0.11774104211220471, "learning_rate": 7.7272213038969e-06, "loss": 0.0362, "num_tokens": 68527504.0, "step": 856 }, { "epoch": 1.068621334996881, "grad_norm": 0.12107302317693551, "learning_rate": 7.72195388378536e-06, "loss": 0.0364, "num_tokens": 68607013.0, "step": 857 }, { "epoch": 1.0698689956331877, "grad_norm": 0.1227886444188267, "learning_rate": 7.716682434599823e-06, "loss": 0.0379, "num_tokens": 68687882.0, "step": 858 }, { "epoch": 1.0711166562694947, "grad_norm": 0.11421078198972955, "learning_rate": 7.711406965899026e-06, "loss": 0.0357, "num_tokens": 68767520.0, "step": 859 }, { "epoch": 1.0723643169058017, "grad_norm": 0.13139694750578262, "learning_rate": 7.706127487248984e-06, "loss": 0.0397, "num_tokens": 68848548.0, "step": 860 }, { "epoch": 1.0736119775421085, "grad_norm": 0.12048027098934615, "learning_rate": 7.70084400822299e-06, "loss": 0.0394, "num_tokens": 68929107.0, "step": 861 }, { "epoch": 1.0748596381784155, "grad_norm": 0.11688196802345287, "learning_rate": 7.695556538401588e-06, "loss": 0.0337, "num_tokens": 69008833.0, "step": 862 }, { "epoch": 1.0761072988147224, "grad_norm": 0.1178101906162924, "learning_rate": 7.690265087372559e-06, "loss": 0.0392, "num_tokens": 69089477.0, "step": 863 }, { "epoch": 1.0773549594510294, "grad_norm": 0.11901289710556308, "learning_rate": 7.684969664730903e-06, "loss": 0.0395, "num_tokens": 69170299.0, "step": 864 }, { "epoch": 1.0786026200873362, "grad_norm": 0.1131133914969235, "learning_rate": 7.679670280078823e-06, "loss": 0.0381, "num_tokens": 69251770.0, "step": 865 }, { "epoch": 1.0798502807236432, "grad_norm": 0.11744420013358728, "learning_rate": 7.674366943025705e-06, "loss": 0.0363, "num_tokens": 69331910.0, "step": 866 }, { "epoch": 1.0810979413599502, "grad_norm": 0.11404106643899889, "learning_rate": 7.669059663188099e-06, "loss": 0.037, "num_tokens": 69411473.0, "step": 867 }, { "epoch": 1.082345601996257, "grad_norm": 0.11319859025103475, "learning_rate": 7.66374845018971e-06, "loss": 0.0348, "num_tokens": 69491703.0, "step": 868 }, { "epoch": 1.083593262632564, "grad_norm": 0.112683453177615, "learning_rate": 7.658433313661372e-06, "loss": 0.0392, "num_tokens": 69572447.0, "step": 869 }, { "epoch": 1.084840923268871, "grad_norm": 0.1276133280655265, "learning_rate": 7.653114263241034e-06, "loss": 0.0388, "num_tokens": 69653823.0, "step": 870 }, { "epoch": 1.0860885839051777, "grad_norm": 0.1240984988092244, "learning_rate": 7.647791308573744e-06, "loss": 0.0387, "num_tokens": 69734055.0, "step": 871 }, { "epoch": 1.0873362445414847, "grad_norm": 0.12356452504447533, "learning_rate": 7.642464459311623e-06, "loss": 0.0347, "num_tokens": 69813965.0, "step": 872 }, { "epoch": 1.0885839051777917, "grad_norm": 0.11514715620013154, "learning_rate": 7.637133725113864e-06, "loss": 0.0366, "num_tokens": 69894363.0, "step": 873 }, { "epoch": 1.0898315658140985, "grad_norm": 0.12779474123132986, "learning_rate": 7.631799115646697e-06, "loss": 0.0357, "num_tokens": 69973323.0, "step": 874 }, { "epoch": 1.0910792264504054, "grad_norm": 0.11790975872627733, "learning_rate": 7.6264606405833805e-06, "loss": 0.0363, "num_tokens": 70054250.0, "step": 875 }, { "epoch": 1.0923268870867124, "grad_norm": 0.11641886596060783, "learning_rate": 7.621118309604186e-06, "loss": 0.0422, "num_tokens": 70133988.0, "step": 876 }, { "epoch": 1.0935745477230194, "grad_norm": 0.11751888094011356, "learning_rate": 7.615772132396373e-06, "loss": 0.035, "num_tokens": 70213674.0, "step": 877 }, { "epoch": 1.0948222083593262, "grad_norm": 0.10553263064141878, "learning_rate": 7.6104221186541745e-06, "loss": 0.0356, "num_tokens": 70292896.0, "step": 878 }, { "epoch": 1.0960698689956332, "grad_norm": 0.12384471705795567, "learning_rate": 7.6050682780787865e-06, "loss": 0.0397, "num_tokens": 70372902.0, "step": 879 }, { "epoch": 1.0973175296319402, "grad_norm": 0.1290651741195708, "learning_rate": 7.599710620378337e-06, "loss": 0.0362, "num_tokens": 70453829.0, "step": 880 }, { "epoch": 1.098565190268247, "grad_norm": 0.12342303328201779, "learning_rate": 7.594349155267879e-06, "loss": 0.0362, "num_tokens": 70533001.0, "step": 881 }, { "epoch": 1.099812850904554, "grad_norm": 0.12141677579551534, "learning_rate": 7.588983892469372e-06, "loss": 0.0367, "num_tokens": 70613271.0, "step": 882 }, { "epoch": 1.101060511540861, "grad_norm": 0.12196685769280396, "learning_rate": 7.583614841711657e-06, "loss": 0.0365, "num_tokens": 70692565.0, "step": 883 }, { "epoch": 1.102308172177168, "grad_norm": 0.12354073131658985, "learning_rate": 7.5782420127304466e-06, "loss": 0.0384, "num_tokens": 70772857.0, "step": 884 }, { "epoch": 1.1035558328134747, "grad_norm": 0.11720083779315267, "learning_rate": 7.572865415268303e-06, "loss": 0.0355, "num_tokens": 70852777.0, "step": 885 }, { "epoch": 1.1048034934497817, "grad_norm": 0.11247611644561628, "learning_rate": 7.567485059074623e-06, "loss": 0.0367, "num_tokens": 70933435.0, "step": 886 }, { "epoch": 1.1060511540860887, "grad_norm": 0.12333260571429218, "learning_rate": 7.5621009539056175e-06, "loss": 0.0374, "num_tokens": 71013136.0, "step": 887 }, { "epoch": 1.1072988147223954, "grad_norm": 0.12451469169995692, "learning_rate": 7.556713109524301e-06, "loss": 0.0372, "num_tokens": 71093256.0, "step": 888 }, { "epoch": 1.1085464753587024, "grad_norm": 0.13359967657120056, "learning_rate": 7.551321535700456e-06, "loss": 0.0358, "num_tokens": 71172930.0, "step": 889 }, { "epoch": 1.1097941359950094, "grad_norm": 0.11691502751945572, "learning_rate": 7.545926242210643e-06, "loss": 0.0362, "num_tokens": 71252476.0, "step": 890 }, { "epoch": 1.1110417966313162, "grad_norm": 0.12287538546531206, "learning_rate": 7.540527238838156e-06, "loss": 0.0352, "num_tokens": 71331645.0, "step": 891 }, { "epoch": 1.1122894572676232, "grad_norm": 0.12355870380253296, "learning_rate": 7.535124535373019e-06, "loss": 0.0352, "num_tokens": 71410967.0, "step": 892 }, { "epoch": 1.1135371179039302, "grad_norm": 0.11947008075731919, "learning_rate": 7.529718141611972e-06, "loss": 0.0369, "num_tokens": 71491316.0, "step": 893 }, { "epoch": 1.114784778540237, "grad_norm": 0.11449370018994913, "learning_rate": 7.5243080673584345e-06, "loss": 0.0338, "num_tokens": 71572312.0, "step": 894 }, { "epoch": 1.116032439176544, "grad_norm": 0.10417902495607685, "learning_rate": 7.51889432242251e-06, "loss": 0.0351, "num_tokens": 71652300.0, "step": 895 }, { "epoch": 1.117280099812851, "grad_norm": 0.11908753377459247, "learning_rate": 7.513476916620952e-06, "loss": 0.0387, "num_tokens": 71733471.0, "step": 896 }, { "epoch": 1.118527760449158, "grad_norm": 0.11776124309241255, "learning_rate": 7.508055859777157e-06, "loss": 0.0347, "num_tokens": 71812889.0, "step": 897 }, { "epoch": 1.1197754210854647, "grad_norm": 0.11842774970819654, "learning_rate": 7.502631161721139e-06, "loss": 0.0361, "num_tokens": 71892941.0, "step": 898 }, { "epoch": 1.1210230817217717, "grad_norm": 0.11809125594779071, "learning_rate": 7.497202832289514e-06, "loss": 0.0376, "num_tokens": 71972718.0, "step": 899 }, { "epoch": 1.1222707423580787, "grad_norm": 0.11729161979135838, "learning_rate": 7.4917708813254865e-06, "loss": 0.0387, "num_tokens": 72054222.0, "step": 900 }, { "epoch": 1.1235184029943854, "grad_norm": 0.11425658550363159, "learning_rate": 7.4863353186788234e-06, "loss": 0.0363, "num_tokens": 72133579.0, "step": 901 }, { "epoch": 1.1247660636306924, "grad_norm": 0.11436527897915852, "learning_rate": 7.480896154205844e-06, "loss": 0.0362, "num_tokens": 72213206.0, "step": 902 }, { "epoch": 1.1260137242669994, "grad_norm": 0.11730584452702315, "learning_rate": 7.475453397769396e-06, "loss": 0.0391, "num_tokens": 72292700.0, "step": 903 }, { "epoch": 1.1272613849033064, "grad_norm": 0.11830533017335358, "learning_rate": 7.470007059238842e-06, "loss": 0.0351, "num_tokens": 72371412.0, "step": 904 }, { "epoch": 1.1285090455396132, "grad_norm": 0.11213629131896848, "learning_rate": 7.464557148490041e-06, "loss": 0.035, "num_tokens": 72451362.0, "step": 905 }, { "epoch": 1.1297567061759202, "grad_norm": 0.14744282107332352, "learning_rate": 7.459103675405328e-06, "loss": 0.0384, "num_tokens": 72531571.0, "step": 906 }, { "epoch": 1.1310043668122272, "grad_norm": 0.11322054203080899, "learning_rate": 7.4536466498735e-06, "loss": 0.0362, "num_tokens": 72611893.0, "step": 907 }, { "epoch": 1.132252027448534, "grad_norm": 0.10786998375465344, "learning_rate": 7.44818608178979e-06, "loss": 0.0368, "num_tokens": 72691853.0, "step": 908 }, { "epoch": 1.133499688084841, "grad_norm": 0.1213664887001441, "learning_rate": 7.442721981055862e-06, "loss": 0.04, "num_tokens": 72773392.0, "step": 909 }, { "epoch": 1.134747348721148, "grad_norm": 0.11598720811101557, "learning_rate": 7.43725435757978e-06, "loss": 0.0359, "num_tokens": 72852913.0, "step": 910 }, { "epoch": 1.1359950093574547, "grad_norm": 0.11009547084534818, "learning_rate": 7.431783221275997e-06, "loss": 0.0372, "num_tokens": 72932495.0, "step": 911 }, { "epoch": 1.1372426699937617, "grad_norm": 0.13290296128464088, "learning_rate": 7.426308582065339e-06, "loss": 0.0375, "num_tokens": 73013678.0, "step": 912 }, { "epoch": 1.1384903306300687, "grad_norm": 0.12616925947987406, "learning_rate": 7.4208304498749825e-06, "loss": 0.0379, "num_tokens": 73095054.0, "step": 913 }, { "epoch": 1.1397379912663754, "grad_norm": 0.11425464035364839, "learning_rate": 7.415348834638433e-06, "loss": 0.0372, "num_tokens": 73175046.0, "step": 914 }, { "epoch": 1.1409856519026824, "grad_norm": 0.1245543910672972, "learning_rate": 7.40986374629552e-06, "loss": 0.0369, "num_tokens": 73254678.0, "step": 915 }, { "epoch": 1.1422333125389894, "grad_norm": 0.1179981926555406, "learning_rate": 7.404375194792365e-06, "loss": 0.0374, "num_tokens": 73334623.0, "step": 916 }, { "epoch": 1.1434809731752964, "grad_norm": 0.11126346028333928, "learning_rate": 7.398883190081368e-06, "loss": 0.0353, "num_tokens": 73414712.0, "step": 917 }, { "epoch": 1.1447286338116032, "grad_norm": 0.11760773052330453, "learning_rate": 7.3933877421211986e-06, "loss": 0.0356, "num_tokens": 73495679.0, "step": 918 }, { "epoch": 1.1459762944479102, "grad_norm": 0.11803137365247664, "learning_rate": 7.387888860876763e-06, "loss": 0.0362, "num_tokens": 73575931.0, "step": 919 }, { "epoch": 1.1472239550842172, "grad_norm": 0.11691088438783126, "learning_rate": 7.382386556319193e-06, "loss": 0.0357, "num_tokens": 73656607.0, "step": 920 }, { "epoch": 1.1484716157205241, "grad_norm": 0.11592560296885844, "learning_rate": 7.376880838425832e-06, "loss": 0.0366, "num_tokens": 73736234.0, "step": 921 }, { "epoch": 1.149719276356831, "grad_norm": 0.1211126889771943, "learning_rate": 7.3713717171802106e-06, "loss": 0.0354, "num_tokens": 73816380.0, "step": 922 }, { "epoch": 1.150966936993138, "grad_norm": 0.11810769499767688, "learning_rate": 7.3658592025720285e-06, "loss": 0.04, "num_tokens": 73897698.0, "step": 923 }, { "epoch": 1.152214597629445, "grad_norm": 0.11484038275799247, "learning_rate": 7.360343304597144e-06, "loss": 0.0354, "num_tokens": 73977453.0, "step": 924 }, { "epoch": 1.1534622582657517, "grad_norm": 0.10850730121509818, "learning_rate": 7.354824033257546e-06, "loss": 0.0365, "num_tokens": 74056422.0, "step": 925 }, { "epoch": 1.1547099189020587, "grad_norm": 0.11974147584903187, "learning_rate": 7.349301398561342e-06, "loss": 0.0351, "num_tokens": 74136845.0, "step": 926 }, { "epoch": 1.1559575795383656, "grad_norm": 0.11829958356397421, "learning_rate": 7.3437754105227365e-06, "loss": 0.0358, "num_tokens": 74217377.0, "step": 927 }, { "epoch": 1.1572052401746724, "grad_norm": 0.1119802739133821, "learning_rate": 7.3382460791620165e-06, "loss": 0.0365, "num_tokens": 74299632.0, "step": 928 }, { "epoch": 1.1584529008109794, "grad_norm": 0.11642309337985378, "learning_rate": 7.332713414505534e-06, "loss": 0.0373, "num_tokens": 74379725.0, "step": 929 }, { "epoch": 1.1597005614472864, "grad_norm": 0.12355861507215243, "learning_rate": 7.32717742658568e-06, "loss": 0.0355, "num_tokens": 74459397.0, "step": 930 }, { "epoch": 1.1609482220835932, "grad_norm": 0.11263553582694089, "learning_rate": 7.321638125440872e-06, "loss": 0.0338, "num_tokens": 74539162.0, "step": 931 }, { "epoch": 1.1621958827199002, "grad_norm": 0.11483802072503761, "learning_rate": 7.316095521115541e-06, "loss": 0.0395, "num_tokens": 74619166.0, "step": 932 }, { "epoch": 1.1634435433562071, "grad_norm": 0.12246171184256209, "learning_rate": 7.310549623660101e-06, "loss": 0.0378, "num_tokens": 74699097.0, "step": 933 }, { "epoch": 1.164691203992514, "grad_norm": 0.10951266608078988, "learning_rate": 7.305000443130943e-06, "loss": 0.0359, "num_tokens": 74778723.0, "step": 934 }, { "epoch": 1.165938864628821, "grad_norm": 0.1296103420820968, "learning_rate": 7.299447989590406e-06, "loss": 0.0379, "num_tokens": 74857957.0, "step": 935 }, { "epoch": 1.167186525265128, "grad_norm": 0.1106739790929476, "learning_rate": 7.293892273106768e-06, "loss": 0.0339, "num_tokens": 74937533.0, "step": 936 }, { "epoch": 1.1684341859014349, "grad_norm": 0.1159745943163753, "learning_rate": 7.2883333037542205e-06, "loss": 0.0361, "num_tokens": 75017116.0, "step": 937 }, { "epoch": 1.1696818465377417, "grad_norm": 0.11565795695947093, "learning_rate": 7.282771091612858e-06, "loss": 0.037, "num_tokens": 75097805.0, "step": 938 }, { "epoch": 1.1709295071740486, "grad_norm": 0.12436659888574522, "learning_rate": 7.27720564676865e-06, "loss": 0.0358, "num_tokens": 75177008.0, "step": 939 }, { "epoch": 1.1721771678103556, "grad_norm": 0.11305973816370699, "learning_rate": 7.271636979313432e-06, "loss": 0.0338, "num_tokens": 75256060.0, "step": 940 }, { "epoch": 1.1734248284466626, "grad_norm": 0.11468359036363136, "learning_rate": 7.266065099344881e-06, "loss": 0.0371, "num_tokens": 75336135.0, "step": 941 }, { "epoch": 1.1746724890829694, "grad_norm": 0.11929128098454908, "learning_rate": 7.260490016966497e-06, "loss": 0.0373, "num_tokens": 75416812.0, "step": 942 }, { "epoch": 1.1759201497192764, "grad_norm": 0.130142686177145, "learning_rate": 7.2549117422875925e-06, "loss": 0.0396, "num_tokens": 75496993.0, "step": 943 }, { "epoch": 1.1771678103555834, "grad_norm": 0.12008704233696307, "learning_rate": 7.249330285423265e-06, "loss": 0.0389, "num_tokens": 75577776.0, "step": 944 }, { "epoch": 1.1784154709918901, "grad_norm": 0.1220975661519173, "learning_rate": 7.243745656494382e-06, "loss": 0.038, "num_tokens": 75657576.0, "step": 945 }, { "epoch": 1.1796631316281971, "grad_norm": 0.12744332708746905, "learning_rate": 7.238157865627562e-06, "loss": 0.0364, "num_tokens": 75737747.0, "step": 946 }, { "epoch": 1.1809107922645041, "grad_norm": 0.11326510598111456, "learning_rate": 7.2325669229551636e-06, "loss": 0.0364, "num_tokens": 75819395.0, "step": 947 }, { "epoch": 1.182158452900811, "grad_norm": 0.1213820127222462, "learning_rate": 7.226972838615251e-06, "loss": 0.0384, "num_tokens": 75898696.0, "step": 948 }, { "epoch": 1.1834061135371179, "grad_norm": 0.1226667585479789, "learning_rate": 7.221375622751593e-06, "loss": 0.0407, "num_tokens": 75978883.0, "step": 949 }, { "epoch": 1.1846537741734249, "grad_norm": 0.1193275551364265, "learning_rate": 7.215775285513633e-06, "loss": 0.037, "num_tokens": 76058141.0, "step": 950 }, { "epoch": 1.1859014348097316, "grad_norm": 0.12089332711649621, "learning_rate": 7.210171837056474e-06, "loss": 0.0373, "num_tokens": 76138153.0, "step": 951 }, { "epoch": 1.1871490954460386, "grad_norm": 0.12206397250048755, "learning_rate": 7.2045652875408614e-06, "loss": 0.0362, "num_tokens": 76218972.0, "step": 952 }, { "epoch": 1.1883967560823456, "grad_norm": 0.11543786673607577, "learning_rate": 7.198955647133167e-06, "loss": 0.0364, "num_tokens": 76298129.0, "step": 953 }, { "epoch": 1.1896444167186526, "grad_norm": 0.1170017252794963, "learning_rate": 7.193342926005362e-06, "loss": 0.0359, "num_tokens": 76377939.0, "step": 954 }, { "epoch": 1.1908920773549594, "grad_norm": 0.12112960322552498, "learning_rate": 7.187727134335006e-06, "loss": 0.0386, "num_tokens": 76458143.0, "step": 955 }, { "epoch": 1.1921397379912664, "grad_norm": 0.12276416511613138, "learning_rate": 7.182108282305231e-06, "loss": 0.0366, "num_tokens": 76537173.0, "step": 956 }, { "epoch": 1.1933873986275734, "grad_norm": 0.12474176247002251, "learning_rate": 7.176486380104707e-06, "loss": 0.0372, "num_tokens": 76617763.0, "step": 957 }, { "epoch": 1.1946350592638801, "grad_norm": 0.12126829791671767, "learning_rate": 7.1708614379276485e-06, "loss": 0.0374, "num_tokens": 76698109.0, "step": 958 }, { "epoch": 1.1958827199001871, "grad_norm": 0.1248231725076693, "learning_rate": 7.165233465973771e-06, "loss": 0.0375, "num_tokens": 76777864.0, "step": 959 }, { "epoch": 1.1971303805364941, "grad_norm": 0.12569048840462274, "learning_rate": 7.159602474448292e-06, "loss": 0.0369, "num_tokens": 76857197.0, "step": 960 }, { "epoch": 1.1983780411728011, "grad_norm": 0.12332798310694695, "learning_rate": 7.1539684735618995e-06, "loss": 0.0364, "num_tokens": 76937469.0, "step": 961 }, { "epoch": 1.1996257018091079, "grad_norm": 0.11706852204502789, "learning_rate": 7.148331473530741e-06, "loss": 0.0383, "num_tokens": 77017833.0, "step": 962 }, { "epoch": 1.2008733624454149, "grad_norm": 0.12424916506541904, "learning_rate": 7.142691484576399e-06, "loss": 0.0361, "num_tokens": 77097219.0, "step": 963 }, { "epoch": 1.2021210230817219, "grad_norm": 0.10771753138390724, "learning_rate": 7.137048516925882e-06, "loss": 0.0335, "num_tokens": 77176147.0, "step": 964 }, { "epoch": 1.2033686837180286, "grad_norm": 0.11534715229087208, "learning_rate": 7.131402580811593e-06, "loss": 0.0373, "num_tokens": 77256126.0, "step": 965 }, { "epoch": 1.2046163443543356, "grad_norm": 0.12314227105562962, "learning_rate": 7.125753686471322e-06, "loss": 0.0361, "num_tokens": 77335910.0, "step": 966 }, { "epoch": 1.2058640049906426, "grad_norm": 0.12340324578863827, "learning_rate": 7.120101844148222e-06, "loss": 0.0358, "num_tokens": 77416109.0, "step": 967 }, { "epoch": 1.2071116656269494, "grad_norm": 0.12119177535967239, "learning_rate": 7.1144470640907906e-06, "loss": 0.0418, "num_tokens": 77497402.0, "step": 968 }, { "epoch": 1.2083593262632564, "grad_norm": 0.12173802469100313, "learning_rate": 7.1087893565528545e-06, "loss": 0.0372, "num_tokens": 77577501.0, "step": 969 }, { "epoch": 1.2096069868995634, "grad_norm": 0.11142850106634652, "learning_rate": 7.103128731793546e-06, "loss": 0.0365, "num_tokens": 77657098.0, "step": 970 }, { "epoch": 1.2108546475358701, "grad_norm": 0.11827410294214857, "learning_rate": 7.097465200077289e-06, "loss": 0.0358, "num_tokens": 77735468.0, "step": 971 }, { "epoch": 1.2121023081721771, "grad_norm": 0.12499420528672263, "learning_rate": 7.0917987716737795e-06, "loss": 0.0384, "num_tokens": 77815844.0, "step": 972 }, { "epoch": 1.2133499688084841, "grad_norm": 0.12225834238347827, "learning_rate": 7.086129456857963e-06, "loss": 0.0358, "num_tokens": 77895631.0, "step": 973 }, { "epoch": 1.214597629444791, "grad_norm": 0.11545282158638288, "learning_rate": 7.080457265910022e-06, "loss": 0.0364, "num_tokens": 77976656.0, "step": 974 }, { "epoch": 1.2158452900810979, "grad_norm": 0.12462671166142329, "learning_rate": 7.074782209115356e-06, "loss": 0.036, "num_tokens": 78058340.0, "step": 975 }, { "epoch": 1.2170929507174049, "grad_norm": 0.11756188080542514, "learning_rate": 7.069104296764553e-06, "loss": 0.0382, "num_tokens": 78139204.0, "step": 976 }, { "epoch": 1.2183406113537119, "grad_norm": 0.12216323940445717, "learning_rate": 7.0634235391533874e-06, "loss": 0.0383, "num_tokens": 78219057.0, "step": 977 }, { "epoch": 1.2195882719900186, "grad_norm": 0.12667133266309244, "learning_rate": 7.05773994658279e-06, "loss": 0.0365, "num_tokens": 78299547.0, "step": 978 }, { "epoch": 1.2208359326263256, "grad_norm": 0.13620206727181836, "learning_rate": 7.052053529358831e-06, "loss": 0.0352, "num_tokens": 78378421.0, "step": 979 }, { "epoch": 1.2220835932626326, "grad_norm": 0.11745647885605227, "learning_rate": 7.046364297792703e-06, "loss": 0.0348, "num_tokens": 78458843.0, "step": 980 }, { "epoch": 1.2233312538989396, "grad_norm": 0.11499104455275264, "learning_rate": 7.040672262200705e-06, "loss": 0.0407, "num_tokens": 78539916.0, "step": 981 }, { "epoch": 1.2245789145352464, "grad_norm": 0.11717217325806314, "learning_rate": 7.0349774329042135e-06, "loss": 0.0355, "num_tokens": 78619130.0, "step": 982 }, { "epoch": 1.2258265751715534, "grad_norm": 0.12516375230993684, "learning_rate": 7.02927982022968e-06, "loss": 0.0377, "num_tokens": 78699034.0, "step": 983 }, { "epoch": 1.2270742358078603, "grad_norm": 0.15353076061722065, "learning_rate": 7.023579434508596e-06, "loss": 0.0345, "num_tokens": 78777947.0, "step": 984 }, { "epoch": 1.2283218964441671, "grad_norm": 0.1197038982406782, "learning_rate": 7.017876286077484e-06, "loss": 0.0557, "num_tokens": 78859554.0, "step": 985 }, { "epoch": 1.229569557080474, "grad_norm": 0.13053665449746876, "learning_rate": 7.012170385277877e-06, "loss": 0.0347, "num_tokens": 78939749.0, "step": 986 }, { "epoch": 1.230817217716781, "grad_norm": 0.11902958957924019, "learning_rate": 7.006461742456297e-06, "loss": 0.0356, "num_tokens": 79019918.0, "step": 987 }, { "epoch": 1.2320648783530879, "grad_norm": 0.11340880688683018, "learning_rate": 7.000750367964239e-06, "loss": 0.0379, "num_tokens": 79099464.0, "step": 988 }, { "epoch": 1.2333125389893949, "grad_norm": 0.11875742504896669, "learning_rate": 6.99503627215815e-06, "loss": 0.0349, "num_tokens": 79178900.0, "step": 989 }, { "epoch": 1.2345601996257018, "grad_norm": 0.11112107319089891, "learning_rate": 6.989319465399415e-06, "loss": 0.0368, "num_tokens": 79258330.0, "step": 990 }, { "epoch": 1.2358078602620086, "grad_norm": 0.1127792821863249, "learning_rate": 6.983599958054331e-06, "loss": 0.0377, "num_tokens": 79337995.0, "step": 991 }, { "epoch": 1.2370555208983156, "grad_norm": 0.115241478566488, "learning_rate": 6.977877760494094e-06, "loss": 0.0348, "num_tokens": 79419296.0, "step": 992 }, { "epoch": 1.2383031815346226, "grad_norm": 0.12502572470635195, "learning_rate": 6.972152883094778e-06, "loss": 0.0355, "num_tokens": 79498279.0, "step": 993 }, { "epoch": 1.2395508421709296, "grad_norm": 0.12440205175969768, "learning_rate": 6.966425336237317e-06, "loss": 0.037, "num_tokens": 79578630.0, "step": 994 }, { "epoch": 1.2407985028072364, "grad_norm": 0.1251089531609647, "learning_rate": 6.960695130307484e-06, "loss": 0.039, "num_tokens": 79659951.0, "step": 995 }, { "epoch": 1.2420461634435433, "grad_norm": 0.12293216602459224, "learning_rate": 6.954962275695871e-06, "loss": 0.0372, "num_tokens": 79740063.0, "step": 996 }, { "epoch": 1.2432938240798503, "grad_norm": 0.12394497640341638, "learning_rate": 6.9492267827978824e-06, "loss": 0.0374, "num_tokens": 79821223.0, "step": 997 }, { "epoch": 1.244541484716157, "grad_norm": 0.1129659823407307, "learning_rate": 6.943488662013697e-06, "loss": 0.0355, "num_tokens": 79901255.0, "step": 998 }, { "epoch": 1.245789145352464, "grad_norm": 0.1191576382470901, "learning_rate": 6.93774792374826e-06, "loss": 0.0366, "num_tokens": 79981164.0, "step": 999 }, { "epoch": 1.247036805988771, "grad_norm": 0.12363634596962561, "learning_rate": 6.93200457841127e-06, "loss": 0.0345, "num_tokens": 80060941.0, "step": 1000 }, { "epoch": 1.248284466625078, "grad_norm": 0.11752492427361626, "learning_rate": 6.9262586364171455e-06, "loss": 0.0355, "num_tokens": 80140169.0, "step": 1001 }, { "epoch": 1.2495321272613849, "grad_norm": 0.12154555071949472, "learning_rate": 6.920510108185016e-06, "loss": 0.0398, "num_tokens": 80219606.0, "step": 1002 }, { "epoch": 1.2507797878976918, "grad_norm": 0.13878040487943977, "learning_rate": 6.9147590041387e-06, "loss": 0.0393, "num_tokens": 80300978.0, "step": 1003 }, { "epoch": 1.2520274485339988, "grad_norm": 0.11693892617963454, "learning_rate": 6.909005334706688e-06, "loss": 0.0357, "num_tokens": 80380717.0, "step": 1004 }, { "epoch": 1.2532751091703056, "grad_norm": 0.11483400032407586, "learning_rate": 6.903249110322123e-06, "loss": 0.0393, "num_tokens": 80461525.0, "step": 1005 }, { "epoch": 1.2545227698066126, "grad_norm": 0.1176675268573737, "learning_rate": 6.897490341422779e-06, "loss": 0.0337, "num_tokens": 80540527.0, "step": 1006 }, { "epoch": 1.2557704304429196, "grad_norm": 0.12167676830713421, "learning_rate": 6.8917290384510435e-06, "loss": 0.0375, "num_tokens": 80619663.0, "step": 1007 }, { "epoch": 1.2570180910792264, "grad_norm": 0.12421371269284932, "learning_rate": 6.885965211853902e-06, "loss": 0.035, "num_tokens": 80700557.0, "step": 1008 }, { "epoch": 1.2582657517155333, "grad_norm": 0.11922957080231958, "learning_rate": 6.8801988720829134e-06, "loss": 0.0369, "num_tokens": 80780369.0, "step": 1009 }, { "epoch": 1.2595134123518403, "grad_norm": 0.12633099452240243, "learning_rate": 6.874430029594194e-06, "loss": 0.0393, "num_tokens": 80859727.0, "step": 1010 }, { "epoch": 1.260761072988147, "grad_norm": 0.10977750675097879, "learning_rate": 6.8686586948483995e-06, "loss": 0.0385, "num_tokens": 80940815.0, "step": 1011 }, { "epoch": 1.262008733624454, "grad_norm": 0.11227691696830043, "learning_rate": 6.862884878310705e-06, "loss": 0.0361, "num_tokens": 81019729.0, "step": 1012 }, { "epoch": 1.263256394260761, "grad_norm": 0.11693513455086187, "learning_rate": 6.8571085904507825e-06, "loss": 0.0344, "num_tokens": 81099372.0, "step": 1013 }, { "epoch": 1.264504054897068, "grad_norm": 0.11256114481937485, "learning_rate": 6.8513298417427895e-06, "loss": 0.0363, "num_tokens": 81179368.0, "step": 1014 }, { "epoch": 1.2657517155333748, "grad_norm": 0.11521215818800695, "learning_rate": 6.845548642665347e-06, "loss": 0.0342, "num_tokens": 81257916.0, "step": 1015 }, { "epoch": 1.2669993761696818, "grad_norm": 0.10695999901129719, "learning_rate": 6.839765003701511e-06, "loss": 0.037, "num_tokens": 81337952.0, "step": 1016 }, { "epoch": 1.2682470368059888, "grad_norm": 0.12118215671319715, "learning_rate": 6.833978935338772e-06, "loss": 0.0363, "num_tokens": 81416824.0, "step": 1017 }, { "epoch": 1.2694946974422958, "grad_norm": 0.10660530253287213, "learning_rate": 6.828190448069016e-06, "loss": 0.035, "num_tokens": 81496879.0, "step": 1018 }, { "epoch": 1.2707423580786026, "grad_norm": 0.11394529803643212, "learning_rate": 6.822399552388523e-06, "loss": 0.0363, "num_tokens": 81576199.0, "step": 1019 }, { "epoch": 1.2719900187149096, "grad_norm": 0.12045967866524018, "learning_rate": 6.816606258797936e-06, "loss": 0.0347, "num_tokens": 81655945.0, "step": 1020 }, { "epoch": 1.2732376793512166, "grad_norm": 0.12925247193109857, "learning_rate": 6.810810577802249e-06, "loss": 0.0403, "num_tokens": 81736714.0, "step": 1021 }, { "epoch": 1.2744853399875233, "grad_norm": 0.12389301275829777, "learning_rate": 6.8050125199107835e-06, "loss": 0.038, "num_tokens": 81816119.0, "step": 1022 }, { "epoch": 1.2757330006238303, "grad_norm": 0.1167932325177084, "learning_rate": 6.799212095637169e-06, "loss": 0.0363, "num_tokens": 81896630.0, "step": 1023 }, { "epoch": 1.2769806612601373, "grad_norm": 0.11848171062553153, "learning_rate": 6.7934093154993285e-06, "loss": 0.0366, "num_tokens": 81977134.0, "step": 1024 }, { "epoch": 1.278228321896444, "grad_norm": 0.12149988422451896, "learning_rate": 6.787604190019456e-06, "loss": 0.0349, "num_tokens": 82057209.0, "step": 1025 }, { "epoch": 1.279475982532751, "grad_norm": 0.12548818987766705, "learning_rate": 6.781796729724001e-06, "loss": 0.0352, "num_tokens": 82136947.0, "step": 1026 }, { "epoch": 1.280723643169058, "grad_norm": 0.13025681794055913, "learning_rate": 6.775986945143641e-06, "loss": 0.0366, "num_tokens": 82217559.0, "step": 1027 }, { "epoch": 1.2819713038053648, "grad_norm": 0.12011534736589388, "learning_rate": 6.770174846813273e-06, "loss": 0.0372, "num_tokens": 82296722.0, "step": 1028 }, { "epoch": 1.2832189644416718, "grad_norm": 0.11817718721722607, "learning_rate": 6.7643604452719894e-06, "loss": 0.0366, "num_tokens": 82376898.0, "step": 1029 }, { "epoch": 1.2844666250779788, "grad_norm": 0.11923904159473753, "learning_rate": 6.758543751063055e-06, "loss": 0.0349, "num_tokens": 82457760.0, "step": 1030 }, { "epoch": 1.2857142857142856, "grad_norm": 0.11244501964085687, "learning_rate": 6.752724774733899e-06, "loss": 0.0348, "num_tokens": 82537611.0, "step": 1031 }, { "epoch": 1.2869619463505926, "grad_norm": 0.115025011194697, "learning_rate": 6.746903526836079e-06, "loss": 0.0359, "num_tokens": 82618396.0, "step": 1032 }, { "epoch": 1.2882096069868996, "grad_norm": 0.12002180306264341, "learning_rate": 6.741080017925279e-06, "loss": 0.0363, "num_tokens": 82698629.0, "step": 1033 }, { "epoch": 1.2894572676232066, "grad_norm": 0.1226967210363595, "learning_rate": 6.735254258561281e-06, "loss": 0.0376, "num_tokens": 82777184.0, "step": 1034 }, { "epoch": 1.2907049282595136, "grad_norm": 0.10358231550003588, "learning_rate": 6.729426259307948e-06, "loss": 0.0333, "num_tokens": 82856190.0, "step": 1035 }, { "epoch": 1.2919525888958203, "grad_norm": 0.11578090273117891, "learning_rate": 6.723596030733204e-06, "loss": 0.0347, "num_tokens": 82936794.0, "step": 1036 }, { "epoch": 1.2932002495321273, "grad_norm": 0.10081923729427175, "learning_rate": 6.717763583409016e-06, "loss": 0.0346, "num_tokens": 83016097.0, "step": 1037 }, { "epoch": 1.2944479101684343, "grad_norm": 0.12233503099838965, "learning_rate": 6.711928927911373e-06, "loss": 0.0376, "num_tokens": 83095632.0, "step": 1038 }, { "epoch": 1.295695570804741, "grad_norm": 0.11462973160105773, "learning_rate": 6.7060920748202674e-06, "loss": 0.0369, "num_tokens": 83177302.0, "step": 1039 }, { "epoch": 1.296943231441048, "grad_norm": 0.11946539311970528, "learning_rate": 6.700253034719684e-06, "loss": 0.0386, "num_tokens": 83258689.0, "step": 1040 }, { "epoch": 1.298190892077355, "grad_norm": 0.12359336236001878, "learning_rate": 6.694411818197561e-06, "loss": 0.0351, "num_tokens": 83338185.0, "step": 1041 }, { "epoch": 1.2994385527136618, "grad_norm": 0.1139374127755781, "learning_rate": 6.688568435845792e-06, "loss": 0.0347, "num_tokens": 83417497.0, "step": 1042 }, { "epoch": 1.3006862133499688, "grad_norm": 0.11205952042329616, "learning_rate": 6.682722898260195e-06, "loss": 0.0378, "num_tokens": 83498065.0, "step": 1043 }, { "epoch": 1.3019338739862758, "grad_norm": 0.1083104938066509, "learning_rate": 6.676875216040498e-06, "loss": 0.0339, "num_tokens": 83577372.0, "step": 1044 }, { "epoch": 1.3031815346225826, "grad_norm": 0.10828476825816279, "learning_rate": 6.671025399790315e-06, "loss": 0.0385, "num_tokens": 83657938.0, "step": 1045 }, { "epoch": 1.3044291952588896, "grad_norm": 0.11019045987410468, "learning_rate": 6.66517346011713e-06, "loss": 0.0365, "num_tokens": 83738524.0, "step": 1046 }, { "epoch": 1.3056768558951966, "grad_norm": 0.12276236466671721, "learning_rate": 6.659319407632282e-06, "loss": 0.0398, "num_tokens": 83818548.0, "step": 1047 }, { "epoch": 1.3069245165315033, "grad_norm": 0.1263772382661588, "learning_rate": 6.653463252950933e-06, "loss": 0.0378, "num_tokens": 83898937.0, "step": 1048 }, { "epoch": 1.3081721771678103, "grad_norm": 0.12027367867659687, "learning_rate": 6.647605006692066e-06, "loss": 0.037, "num_tokens": 83979503.0, "step": 1049 }, { "epoch": 1.3094198378041173, "grad_norm": 0.11001899385006926, "learning_rate": 6.641744679478448e-06, "loss": 0.0352, "num_tokens": 84058957.0, "step": 1050 }, { "epoch": 1.310667498440424, "grad_norm": 0.11280548803095132, "learning_rate": 6.635882281936625e-06, "loss": 0.0354, "num_tokens": 84138073.0, "step": 1051 }, { "epoch": 1.311915159076731, "grad_norm": 0.11780560554733235, "learning_rate": 6.630017824696898e-06, "loss": 0.0347, "num_tokens": 84218047.0, "step": 1052 }, { "epoch": 1.313162819713038, "grad_norm": 0.12058333417344305, "learning_rate": 6.624151318393298e-06, "loss": 0.0373, "num_tokens": 84298783.0, "step": 1053 }, { "epoch": 1.314410480349345, "grad_norm": 0.11911044876914684, "learning_rate": 6.618282773663576e-06, "loss": 0.0356, "num_tokens": 84378667.0, "step": 1054 }, { "epoch": 1.315658140985652, "grad_norm": 0.11647453576092717, "learning_rate": 6.612412201149175e-06, "loss": 0.037, "num_tokens": 84459833.0, "step": 1055 }, { "epoch": 1.3169058016219588, "grad_norm": 0.1381556034687667, "learning_rate": 6.6065396114952195e-06, "loss": 0.0345, "num_tokens": 84538415.0, "step": 1056 }, { "epoch": 1.3181534622582658, "grad_norm": 0.11291795464976989, "learning_rate": 6.600665015350487e-06, "loss": 0.0358, "num_tokens": 84618356.0, "step": 1057 }, { "epoch": 1.3194011228945728, "grad_norm": 0.11428531136103644, "learning_rate": 6.594788423367399e-06, "loss": 0.0379, "num_tokens": 84699392.0, "step": 1058 }, { "epoch": 1.3206487835308796, "grad_norm": 0.12559230820363362, "learning_rate": 6.588909846201992e-06, "loss": 0.0357, "num_tokens": 84780174.0, "step": 1059 }, { "epoch": 1.3218964441671865, "grad_norm": 0.11157729667744716, "learning_rate": 6.583029294513902e-06, "loss": 0.0374, "num_tokens": 84861023.0, "step": 1060 }, { "epoch": 1.3231441048034935, "grad_norm": 0.10532360001946126, "learning_rate": 6.577146778966347e-06, "loss": 0.0347, "num_tokens": 84941536.0, "step": 1061 }, { "epoch": 1.3243917654398003, "grad_norm": 0.11380661535983909, "learning_rate": 6.571262310226108e-06, "loss": 0.0361, "num_tokens": 85021693.0, "step": 1062 }, { "epoch": 1.3256394260761073, "grad_norm": 0.11751173019320106, "learning_rate": 6.565375898963503e-06, "loss": 0.0367, "num_tokens": 85101820.0, "step": 1063 }, { "epoch": 1.3268870867124143, "grad_norm": 0.11556897548657508, "learning_rate": 6.5594875558523755e-06, "loss": 0.0366, "num_tokens": 85182245.0, "step": 1064 }, { "epoch": 1.328134747348721, "grad_norm": 0.12290166180442655, "learning_rate": 6.553597291570071e-06, "loss": 0.034, "num_tokens": 85261741.0, "step": 1065 }, { "epoch": 1.329382407985028, "grad_norm": 0.11300496986222, "learning_rate": 6.547705116797422e-06, "loss": 0.0385, "num_tokens": 85341243.0, "step": 1066 }, { "epoch": 1.330630068621335, "grad_norm": 0.12054222224762111, "learning_rate": 6.5418110422187156e-06, "loss": 0.037, "num_tokens": 85421844.0, "step": 1067 }, { "epoch": 1.3318777292576418, "grad_norm": 0.11600190728796984, "learning_rate": 6.535915078521697e-06, "loss": 0.0364, "num_tokens": 85500760.0, "step": 1068 }, { "epoch": 1.3331253898939488, "grad_norm": 0.12283600030048854, "learning_rate": 6.530017236397529e-06, "loss": 0.0365, "num_tokens": 85580795.0, "step": 1069 }, { "epoch": 1.3343730505302558, "grad_norm": 0.11450545067136071, "learning_rate": 6.52411752654078e-06, "loss": 0.0347, "num_tokens": 85661620.0, "step": 1070 }, { "epoch": 1.3356207111665626, "grad_norm": 0.1114808904800754, "learning_rate": 6.518215959649409e-06, "loss": 0.0363, "num_tokens": 85742723.0, "step": 1071 }, { "epoch": 1.3368683718028695, "grad_norm": 0.12107476538251831, "learning_rate": 6.512312546424739e-06, "loss": 0.0332, "num_tokens": 85821771.0, "step": 1072 }, { "epoch": 1.3381160324391765, "grad_norm": 0.11709745580529317, "learning_rate": 6.506407297571445e-06, "loss": 0.0385, "num_tokens": 85901823.0, "step": 1073 }, { "epoch": 1.3393636930754835, "grad_norm": 0.12275293011077212, "learning_rate": 6.500500223797526e-06, "loss": 0.0357, "num_tokens": 85981915.0, "step": 1074 }, { "epoch": 1.3406113537117905, "grad_norm": 0.11678308605913426, "learning_rate": 6.494591335814292e-06, "loss": 0.0329, "num_tokens": 86061069.0, "step": 1075 }, { "epoch": 1.3418590143480973, "grad_norm": 0.11176605025879648, "learning_rate": 6.488680644336344e-06, "loss": 0.0379, "num_tokens": 86141683.0, "step": 1076 }, { "epoch": 1.3431066749844043, "grad_norm": 0.11363151432726346, "learning_rate": 6.482768160081553e-06, "loss": 0.0348, "num_tokens": 86222086.0, "step": 1077 }, { "epoch": 1.3443543356207113, "grad_norm": 0.11445806649743549, "learning_rate": 6.4768538937710364e-06, "loss": 0.0382, "num_tokens": 86302703.0, "step": 1078 }, { "epoch": 1.345601996257018, "grad_norm": 0.13464785228692652, "learning_rate": 6.470937856129152e-06, "loss": 0.0347, "num_tokens": 86382450.0, "step": 1079 }, { "epoch": 1.346849656893325, "grad_norm": 0.1280907328510425, "learning_rate": 6.465020057883461e-06, "loss": 0.0337, "num_tokens": 86462384.0, "step": 1080 }, { "epoch": 1.348097317529632, "grad_norm": 0.10759760692353552, "learning_rate": 6.45910050976472e-06, "loss": 0.0343, "num_tokens": 86541989.0, "step": 1081 }, { "epoch": 1.3493449781659388, "grad_norm": 0.11223790651092914, "learning_rate": 6.45317922250686e-06, "loss": 0.0383, "num_tokens": 86623690.0, "step": 1082 }, { "epoch": 1.3505926388022458, "grad_norm": 0.11168721545726104, "learning_rate": 6.447256206846963e-06, "loss": 0.0356, "num_tokens": 86703459.0, "step": 1083 }, { "epoch": 1.3518402994385528, "grad_norm": 0.11578985731239276, "learning_rate": 6.44133147352525e-06, "loss": 0.0351, "num_tokens": 86787758.0, "step": 1084 }, { "epoch": 1.3530879600748595, "grad_norm": 0.11806982316043722, "learning_rate": 6.4354050332850505e-06, "loss": 0.0351, "num_tokens": 86868231.0, "step": 1085 }, { "epoch": 1.3543356207111665, "grad_norm": 0.13776869218741045, "learning_rate": 6.429476896872793e-06, "loss": 0.0355, "num_tokens": 86948392.0, "step": 1086 }, { "epoch": 1.3555832813474735, "grad_norm": 0.1143170779368215, "learning_rate": 6.4235470750379794e-06, "loss": 0.0352, "num_tokens": 87028370.0, "step": 1087 }, { "epoch": 1.3568309419837803, "grad_norm": 0.10825819045832565, "learning_rate": 6.4176155785331705e-06, "loss": 0.0363, "num_tokens": 87109397.0, "step": 1088 }, { "epoch": 1.3580786026200873, "grad_norm": 0.12553276062809313, "learning_rate": 6.411682418113961e-06, "loss": 0.0401, "num_tokens": 87191884.0, "step": 1089 }, { "epoch": 1.3593262632563943, "grad_norm": 0.1190623682061882, "learning_rate": 6.405747604538965e-06, "loss": 0.036, "num_tokens": 87272846.0, "step": 1090 }, { "epoch": 1.3605739238927013, "grad_norm": 0.13120836782794848, "learning_rate": 6.399811148569794e-06, "loss": 0.0387, "num_tokens": 87354474.0, "step": 1091 }, { "epoch": 1.361821584529008, "grad_norm": 0.12749448720764697, "learning_rate": 6.393873060971036e-06, "loss": 0.0354, "num_tokens": 87434034.0, "step": 1092 }, { "epoch": 1.363069245165315, "grad_norm": 0.11265346187198873, "learning_rate": 6.3879333525102375e-06, "loss": 0.0383, "num_tokens": 87514380.0, "step": 1093 }, { "epoch": 1.364316905801622, "grad_norm": 0.10396098070933056, "learning_rate": 6.381992033957889e-06, "loss": 0.0374, "num_tokens": 87594266.0, "step": 1094 }, { "epoch": 1.365564566437929, "grad_norm": 0.12007723700394293, "learning_rate": 6.376049116087393e-06, "loss": 0.0358, "num_tokens": 87675016.0, "step": 1095 }, { "epoch": 1.3668122270742358, "grad_norm": 0.11088738920455023, "learning_rate": 6.370104609675058e-06, "loss": 0.0365, "num_tokens": 87755275.0, "step": 1096 }, { "epoch": 1.3680598877105428, "grad_norm": 0.12947725342366095, "learning_rate": 6.364158525500069e-06, "loss": 0.0386, "num_tokens": 87835968.0, "step": 1097 }, { "epoch": 1.3693075483468498, "grad_norm": 0.10743601396658202, "learning_rate": 6.358210874344476e-06, "loss": 0.0359, "num_tokens": 87916756.0, "step": 1098 }, { "epoch": 1.3705552089831565, "grad_norm": 0.10538537489377399, "learning_rate": 6.352261666993167e-06, "loss": 0.0344, "num_tokens": 87997097.0, "step": 1099 }, { "epoch": 1.3718028696194635, "grad_norm": 0.10920904020216766, "learning_rate": 6.346310914233854e-06, "loss": 0.0337, "num_tokens": 88075564.0, "step": 1100 }, { "epoch": 1.3730505302557705, "grad_norm": 0.12064678113049096, "learning_rate": 6.340358626857049e-06, "loss": 0.0374, "num_tokens": 88155637.0, "step": 1101 }, { "epoch": 1.3742981908920773, "grad_norm": 0.11366242121995487, "learning_rate": 6.334404815656049e-06, "loss": 0.034, "num_tokens": 88234184.0, "step": 1102 }, { "epoch": 1.3755458515283843, "grad_norm": 0.10541533458396993, "learning_rate": 6.328449491426914e-06, "loss": 0.0334, "num_tokens": 88313988.0, "step": 1103 }, { "epoch": 1.3767935121646913, "grad_norm": 0.1126212487247659, "learning_rate": 6.322492664968446e-06, "loss": 0.0361, "num_tokens": 88394035.0, "step": 1104 }, { "epoch": 1.378041172800998, "grad_norm": 0.10415858729571198, "learning_rate": 6.316534347082173e-06, "loss": 0.0353, "num_tokens": 88473457.0, "step": 1105 }, { "epoch": 1.379288833437305, "grad_norm": 0.11311924737695511, "learning_rate": 6.310574548572325e-06, "loss": 0.0396, "num_tokens": 88554028.0, "step": 1106 }, { "epoch": 1.380536494073612, "grad_norm": 0.11641164289365231, "learning_rate": 6.304613280245816e-06, "loss": 0.0351, "num_tokens": 88633282.0, "step": 1107 }, { "epoch": 1.3817841547099188, "grad_norm": 0.1238656036368708, "learning_rate": 6.298650552912233e-06, "loss": 0.0369, "num_tokens": 88713446.0, "step": 1108 }, { "epoch": 1.3830318153462258, "grad_norm": 0.10828413683130844, "learning_rate": 6.292686377383797e-06, "loss": 0.0366, "num_tokens": 88793591.0, "step": 1109 }, { "epoch": 1.3842794759825328, "grad_norm": 0.11180313140533374, "learning_rate": 6.286720764475365e-06, "loss": 0.0355, "num_tokens": 88872762.0, "step": 1110 }, { "epoch": 1.3855271366188397, "grad_norm": 0.11370487600194723, "learning_rate": 6.280753725004395e-06, "loss": 0.0362, "num_tokens": 88955457.0, "step": 1111 }, { "epoch": 1.3867747972551465, "grad_norm": 0.09617328020740629, "learning_rate": 6.274785269790932e-06, "loss": 0.0336, "num_tokens": 89035406.0, "step": 1112 }, { "epoch": 1.3880224578914535, "grad_norm": 0.11176348044222259, "learning_rate": 6.268815409657592e-06, "loss": 0.0342, "num_tokens": 89116507.0, "step": 1113 }, { "epoch": 1.3892701185277605, "grad_norm": 0.11325296873554795, "learning_rate": 6.262844155429533e-06, "loss": 0.0374, "num_tokens": 89199614.0, "step": 1114 }, { "epoch": 1.3905177791640675, "grad_norm": 0.11704030265578563, "learning_rate": 6.256871517934445e-06, "loss": 0.0357, "num_tokens": 89279144.0, "step": 1115 }, { "epoch": 1.3917654398003743, "grad_norm": 0.11952799829012306, "learning_rate": 6.2508975080025254e-06, "loss": 0.0353, "num_tokens": 89359708.0, "step": 1116 }, { "epoch": 1.3930131004366813, "grad_norm": 0.11502731870682785, "learning_rate": 6.24492213646646e-06, "loss": 0.0368, "num_tokens": 89439326.0, "step": 1117 }, { "epoch": 1.3942607610729882, "grad_norm": 0.12252081179674802, "learning_rate": 6.2389454141614024e-06, "loss": 0.0345, "num_tokens": 89518867.0, "step": 1118 }, { "epoch": 1.395508421709295, "grad_norm": 0.18070152071501802, "learning_rate": 6.232967351924959e-06, "loss": 0.0355, "num_tokens": 89598243.0, "step": 1119 }, { "epoch": 1.396756082345602, "grad_norm": 0.1228366552270244, "learning_rate": 6.226987960597161e-06, "loss": 0.0363, "num_tokens": 89678232.0, "step": 1120 }, { "epoch": 1.398003742981909, "grad_norm": 0.13722290208459134, "learning_rate": 6.22100725102045e-06, "loss": 0.0347, "num_tokens": 89758532.0, "step": 1121 }, { "epoch": 1.3992514036182158, "grad_norm": 0.11700538827862798, "learning_rate": 6.215025234039667e-06, "loss": 0.0371, "num_tokens": 89838761.0, "step": 1122 }, { "epoch": 1.4004990642545228, "grad_norm": 0.11749579319296499, "learning_rate": 6.209041920502012e-06, "loss": 0.0362, "num_tokens": 89919068.0, "step": 1123 }, { "epoch": 1.4017467248908297, "grad_norm": 0.1239771498120876, "learning_rate": 6.203057321257041e-06, "loss": 0.0347, "num_tokens": 89999454.0, "step": 1124 }, { "epoch": 1.4029943855271365, "grad_norm": 0.10457854348428894, "learning_rate": 6.197071447156643e-06, "loss": 0.0369, "num_tokens": 90079489.0, "step": 1125 }, { "epoch": 1.4042420461634435, "grad_norm": 0.12132963451268676, "learning_rate": 6.191084309055018e-06, "loss": 0.0359, "num_tokens": 90160079.0, "step": 1126 }, { "epoch": 1.4054897067997505, "grad_norm": 0.1149651839306163, "learning_rate": 6.185095917808654e-06, "loss": 0.0367, "num_tokens": 90239067.0, "step": 1127 }, { "epoch": 1.4067373674360573, "grad_norm": 0.11185660034071362, "learning_rate": 6.179106284276315e-06, "loss": 0.0345, "num_tokens": 90320588.0, "step": 1128 }, { "epoch": 1.4079850280723643, "grad_norm": 0.11791295713574668, "learning_rate": 6.173115419319019e-06, "loss": 0.0357, "num_tokens": 90400737.0, "step": 1129 }, { "epoch": 1.4092326887086712, "grad_norm": 0.12005035519918307, "learning_rate": 6.167123333800014e-06, "loss": 0.0372, "num_tokens": 90481553.0, "step": 1130 }, { "epoch": 1.4104803493449782, "grad_norm": 0.11280371649250795, "learning_rate": 6.161130038584762e-06, "loss": 0.0359, "num_tokens": 90560492.0, "step": 1131 }, { "epoch": 1.4117280099812852, "grad_norm": 0.11483105002319907, "learning_rate": 6.155135544540917e-06, "loss": 0.0329, "num_tokens": 90640526.0, "step": 1132 }, { "epoch": 1.412975670617592, "grad_norm": 0.10610511431168547, "learning_rate": 6.1491398625383116e-06, "loss": 0.0365, "num_tokens": 90720931.0, "step": 1133 }, { "epoch": 1.414223331253899, "grad_norm": 0.11403045781580023, "learning_rate": 6.143143003448929e-06, "loss": 0.0334, "num_tokens": 90799876.0, "step": 1134 }, { "epoch": 1.415470991890206, "grad_norm": 0.1128226484451764, "learning_rate": 6.1371449781468835e-06, "loss": 0.036, "num_tokens": 90879955.0, "step": 1135 }, { "epoch": 1.4167186525265127, "grad_norm": 0.1339643064995737, "learning_rate": 6.131145797508414e-06, "loss": 0.0362, "num_tokens": 90960140.0, "step": 1136 }, { "epoch": 1.4179663131628197, "grad_norm": 0.11977396725939997, "learning_rate": 6.125145472411845e-06, "loss": 0.0408, "num_tokens": 91040880.0, "step": 1137 }, { "epoch": 1.4192139737991267, "grad_norm": 0.1304144374986512, "learning_rate": 6.1191440137375775e-06, "loss": 0.0356, "num_tokens": 91120578.0, "step": 1138 }, { "epoch": 1.4204616344354335, "grad_norm": 0.10952816992518201, "learning_rate": 6.113141432368075e-06, "loss": 0.0342, "num_tokens": 91199968.0, "step": 1139 }, { "epoch": 1.4217092950717405, "grad_norm": 0.11923470924014716, "learning_rate": 6.107137739187827e-06, "loss": 0.0382, "num_tokens": 91280534.0, "step": 1140 }, { "epoch": 1.4229569557080475, "grad_norm": 0.13414248321277958, "learning_rate": 6.101132945083347e-06, "loss": 0.0353, "num_tokens": 91359718.0, "step": 1141 }, { "epoch": 1.4242046163443542, "grad_norm": 0.11441510307386335, "learning_rate": 6.095127060943141e-06, "loss": 0.0355, "num_tokens": 91439021.0, "step": 1142 }, { "epoch": 1.4254522769806612, "grad_norm": 0.11361047483263015, "learning_rate": 6.089120097657692e-06, "loss": 0.0399, "num_tokens": 91520278.0, "step": 1143 }, { "epoch": 1.4266999376169682, "grad_norm": 0.11528784116675486, "learning_rate": 6.083112066119439e-06, "loss": 0.0379, "num_tokens": 91600857.0, "step": 1144 }, { "epoch": 1.427947598253275, "grad_norm": 0.12339012360670368, "learning_rate": 6.077102977222763e-06, "loss": 0.0363, "num_tokens": 91681068.0, "step": 1145 }, { "epoch": 1.429195258889582, "grad_norm": 0.11822266945602426, "learning_rate": 6.0710928418639515e-06, "loss": 0.0369, "num_tokens": 91762429.0, "step": 1146 }, { "epoch": 1.430442919525889, "grad_norm": 0.10934047203309372, "learning_rate": 6.065081670941204e-06, "loss": 0.0343, "num_tokens": 91842442.0, "step": 1147 }, { "epoch": 1.4316905801621957, "grad_norm": 0.11563946305646088, "learning_rate": 6.059069475354586e-06, "loss": 0.0371, "num_tokens": 91921815.0, "step": 1148 }, { "epoch": 1.4329382407985027, "grad_norm": 0.13902235644402097, "learning_rate": 6.0530562660060276e-06, "loss": 0.035, "num_tokens": 92001388.0, "step": 1149 }, { "epoch": 1.4341859014348097, "grad_norm": 0.11365381032155394, "learning_rate": 6.0470420537992915e-06, "loss": 0.0361, "num_tokens": 92080682.0, "step": 1150 }, { "epoch": 1.4354335620711167, "grad_norm": 0.1215204822957711, "learning_rate": 6.041026849639966e-06, "loss": 0.0367, "num_tokens": 92160919.0, "step": 1151 }, { "epoch": 1.4366812227074237, "grad_norm": 0.10228268537749387, "learning_rate": 6.035010664435434e-06, "loss": 0.0361, "num_tokens": 92241085.0, "step": 1152 }, { "epoch": 1.4379288833437305, "grad_norm": 0.11290899169878665, "learning_rate": 6.0289935090948536e-06, "loss": 0.0339, "num_tokens": 92320066.0, "step": 1153 }, { "epoch": 1.4391765439800375, "grad_norm": 0.12929287703536887, "learning_rate": 6.022975394529149e-06, "loss": 0.0344, "num_tokens": 92399898.0, "step": 1154 }, { "epoch": 1.4404242046163445, "grad_norm": 0.10350159451251419, "learning_rate": 6.016956331650984e-06, "loss": 0.0338, "num_tokens": 92479871.0, "step": 1155 }, { "epoch": 1.4416718652526512, "grad_norm": 0.11525710135617265, "learning_rate": 6.010936331374735e-06, "loss": 0.0359, "num_tokens": 92560206.0, "step": 1156 }, { "epoch": 1.4429195258889582, "grad_norm": 0.11743945426514996, "learning_rate": 6.00491540461648e-06, "loss": 0.034, "num_tokens": 92639628.0, "step": 1157 }, { "epoch": 1.4441671865252652, "grad_norm": 0.10670559077717189, "learning_rate": 5.998893562293986e-06, "loss": 0.0377, "num_tokens": 92719681.0, "step": 1158 }, { "epoch": 1.445414847161572, "grad_norm": 0.11601172863515272, "learning_rate": 5.992870815326667e-06, "loss": 0.0366, "num_tokens": 92799584.0, "step": 1159 }, { "epoch": 1.446662507797879, "grad_norm": 0.11460386722320819, "learning_rate": 5.986847174635586e-06, "loss": 0.0332, "num_tokens": 92879565.0, "step": 1160 }, { "epoch": 1.447910168434186, "grad_norm": 0.10697055130942412, "learning_rate": 5.980822651143426e-06, "loss": 0.0365, "num_tokens": 92959785.0, "step": 1161 }, { "epoch": 1.4491578290704927, "grad_norm": 0.12723407944880333, "learning_rate": 5.9747972557744675e-06, "loss": 0.0382, "num_tokens": 93040108.0, "step": 1162 }, { "epoch": 1.4504054897067997, "grad_norm": 0.11841204743560867, "learning_rate": 5.968770999454572e-06, "loss": 0.036, "num_tokens": 93121058.0, "step": 1163 }, { "epoch": 1.4516531503431067, "grad_norm": 0.11569156196248792, "learning_rate": 5.962743893111165e-06, "loss": 0.0353, "num_tokens": 93200814.0, "step": 1164 }, { "epoch": 1.4529008109794135, "grad_norm": 0.10777418932429268, "learning_rate": 5.956715947673212e-06, "loss": 0.0348, "num_tokens": 93281213.0, "step": 1165 }, { "epoch": 1.4541484716157205, "grad_norm": 0.10994616619439264, "learning_rate": 5.950687174071201e-06, "loss": 0.0356, "num_tokens": 93360403.0, "step": 1166 }, { "epoch": 1.4553961322520275, "grad_norm": 0.10558957609563456, "learning_rate": 5.944657583237119e-06, "loss": 0.0359, "num_tokens": 93440112.0, "step": 1167 }, { "epoch": 1.4566437928883345, "grad_norm": 0.10340454479634237, "learning_rate": 5.938627186104438e-06, "loss": 0.0341, "num_tokens": 93519997.0, "step": 1168 }, { "epoch": 1.4578914535246412, "grad_norm": 0.11216617986802824, "learning_rate": 5.932595993608092e-06, "loss": 0.0367, "num_tokens": 93601531.0, "step": 1169 }, { "epoch": 1.4591391141609482, "grad_norm": 0.11118613571047507, "learning_rate": 5.926564016684453e-06, "loss": 0.0369, "num_tokens": 93684506.0, "step": 1170 }, { "epoch": 1.4603867747972552, "grad_norm": 0.11180759900830808, "learning_rate": 5.920531266271317e-06, "loss": 0.0345, "num_tokens": 93765144.0, "step": 1171 }, { "epoch": 1.4616344354335622, "grad_norm": 0.10849133876831726, "learning_rate": 5.9144977533078885e-06, "loss": 0.0338, "num_tokens": 93844866.0, "step": 1172 }, { "epoch": 1.462882096069869, "grad_norm": 0.10146908320691236, "learning_rate": 5.90846348873475e-06, "loss": 0.0376, "num_tokens": 93925678.0, "step": 1173 }, { "epoch": 1.464129756706176, "grad_norm": 0.11406035504111364, "learning_rate": 5.902428483493845e-06, "loss": 0.0354, "num_tokens": 94005650.0, "step": 1174 }, { "epoch": 1.465377417342483, "grad_norm": 0.10629587740727439, "learning_rate": 5.89639274852846e-06, "loss": 0.0338, "num_tokens": 94085102.0, "step": 1175 }, { "epoch": 1.4666250779787897, "grad_norm": 0.10967958676982019, "learning_rate": 5.890356294783213e-06, "loss": 0.0346, "num_tokens": 94163932.0, "step": 1176 }, { "epoch": 1.4678727386150967, "grad_norm": 0.1110479738340064, "learning_rate": 5.8843191332040125e-06, "loss": 0.0366, "num_tokens": 94244813.0, "step": 1177 }, { "epoch": 1.4691203992514037, "grad_norm": 0.11487895996237814, "learning_rate": 5.878281274738061e-06, "loss": 0.044, "num_tokens": 94326986.0, "step": 1178 }, { "epoch": 1.4703680598877105, "grad_norm": 0.12426013311708847, "learning_rate": 5.872242730333822e-06, "loss": 0.0373, "num_tokens": 94407345.0, "step": 1179 }, { "epoch": 1.4716157205240175, "grad_norm": 0.10392684893138183, "learning_rate": 5.866203510940998e-06, "loss": 0.0341, "num_tokens": 94486495.0, "step": 1180 }, { "epoch": 1.4728633811603244, "grad_norm": 0.11624920561782613, "learning_rate": 5.860163627510521e-06, "loss": 0.0354, "num_tokens": 94566567.0, "step": 1181 }, { "epoch": 1.4741110417966312, "grad_norm": 0.11632421080295033, "learning_rate": 5.854123090994524e-06, "loss": 0.0351, "num_tokens": 94646328.0, "step": 1182 }, { "epoch": 1.4753587024329382, "grad_norm": 0.1085474029571278, "learning_rate": 5.848081912346329e-06, "loss": 0.0357, "num_tokens": 94726254.0, "step": 1183 }, { "epoch": 1.4766063630692452, "grad_norm": 0.13928705664750285, "learning_rate": 5.842040102520416e-06, "loss": 0.0345, "num_tokens": 94806492.0, "step": 1184 }, { "epoch": 1.477854023705552, "grad_norm": 0.11956422118762197, "learning_rate": 5.8359976724724146e-06, "loss": 0.0373, "num_tokens": 94888343.0, "step": 1185 }, { "epoch": 1.479101684341859, "grad_norm": 0.10678348906652803, "learning_rate": 5.829954633159073e-06, "loss": 0.0365, "num_tokens": 94968750.0, "step": 1186 }, { "epoch": 1.480349344978166, "grad_norm": 0.10950504983347997, "learning_rate": 5.823910995538251e-06, "loss": 0.0363, "num_tokens": 95048007.0, "step": 1187 }, { "epoch": 1.481597005614473, "grad_norm": 0.10871513034226521, "learning_rate": 5.8178667705688895e-06, "loss": 0.034, "num_tokens": 95127214.0, "step": 1188 }, { "epoch": 1.4828446662507797, "grad_norm": 0.11813024759210294, "learning_rate": 5.811821969210995e-06, "loss": 0.0378, "num_tokens": 95207769.0, "step": 1189 }, { "epoch": 1.4840923268870867, "grad_norm": 0.12334121149788137, "learning_rate": 5.8057766024256205e-06, "loss": 0.0339, "num_tokens": 95286918.0, "step": 1190 }, { "epoch": 1.4853399875233937, "grad_norm": 0.11137548828632113, "learning_rate": 5.799730681174842e-06, "loss": 0.0345, "num_tokens": 95367783.0, "step": 1191 }, { "epoch": 1.4865876481597007, "grad_norm": 0.11258420176536157, "learning_rate": 5.793684216421744e-06, "loss": 0.0379, "num_tokens": 95449134.0, "step": 1192 }, { "epoch": 1.4878353087960074, "grad_norm": 0.1283974468707094, "learning_rate": 5.787637219130392e-06, "loss": 0.0373, "num_tokens": 95528492.0, "step": 1193 }, { "epoch": 1.4890829694323144, "grad_norm": 0.10883979183962499, "learning_rate": 5.781589700265823e-06, "loss": 0.0343, "num_tokens": 95608208.0, "step": 1194 }, { "epoch": 1.4903306300686214, "grad_norm": 0.10270640891807907, "learning_rate": 5.7755416707940135e-06, "loss": 0.0348, "num_tokens": 95687611.0, "step": 1195 }, { "epoch": 1.4915782907049282, "grad_norm": 0.12695450443859627, "learning_rate": 5.76949314168187e-06, "loss": 0.0359, "num_tokens": 95767108.0, "step": 1196 }, { "epoch": 1.4928259513412352, "grad_norm": 0.11114393977783613, "learning_rate": 5.763444123897206e-06, "loss": 0.0345, "num_tokens": 95846696.0, "step": 1197 }, { "epoch": 1.4940736119775422, "grad_norm": 0.10051611126921174, "learning_rate": 5.757394628408716e-06, "loss": 0.0355, "num_tokens": 95927423.0, "step": 1198 }, { "epoch": 1.495321272613849, "grad_norm": 0.10698918904729461, "learning_rate": 5.7513446661859664e-06, "loss": 0.0334, "num_tokens": 96008401.0, "step": 1199 }, { "epoch": 1.496568933250156, "grad_norm": 0.11515052950318992, "learning_rate": 5.7452942481993655e-06, "loss": 0.0329, "num_tokens": 96087128.0, "step": 1200 }, { "epoch": 1.497816593886463, "grad_norm": 0.10667262101407343, "learning_rate": 5.739243385420151e-06, "loss": 0.0358, "num_tokens": 96167367.0, "step": 1201 }, { "epoch": 1.4990642545227697, "grad_norm": 0.12405579945874308, "learning_rate": 5.7331920888203655e-06, "loss": 0.0346, "num_tokens": 96246922.0, "step": 1202 }, { "epoch": 1.5003119151590767, "grad_norm": 0.10489860643653304, "learning_rate": 5.727140369372838e-06, "loss": 0.0376, "num_tokens": 96327807.0, "step": 1203 }, { "epoch": 1.5015595757953837, "grad_norm": 0.12443731067183812, "learning_rate": 5.721088238051168e-06, "loss": 0.0356, "num_tokens": 96408288.0, "step": 1204 }, { "epoch": 1.5028072364316905, "grad_norm": 0.10422931405567512, "learning_rate": 5.715035705829696e-06, "loss": 0.0378, "num_tokens": 96488747.0, "step": 1205 }, { "epoch": 1.5040548970679977, "grad_norm": 0.11583240842588108, "learning_rate": 5.708982783683492e-06, "loss": 0.0351, "num_tokens": 96567394.0, "step": 1206 }, { "epoch": 1.5053025577043044, "grad_norm": 0.11679015463888166, "learning_rate": 5.7029294825883365e-06, "loss": 0.035, "num_tokens": 96646566.0, "step": 1207 }, { "epoch": 1.5065502183406112, "grad_norm": 0.11437606283844103, "learning_rate": 5.696875813520691e-06, "loss": 0.0392, "num_tokens": 96727492.0, "step": 1208 }, { "epoch": 1.5077978789769184, "grad_norm": 0.11691760587370684, "learning_rate": 5.69082178745769e-06, "loss": 0.0352, "num_tokens": 96807931.0, "step": 1209 }, { "epoch": 1.5090455396132252, "grad_norm": 0.10574935510817819, "learning_rate": 5.68476741537711e-06, "loss": 0.0346, "num_tokens": 96887154.0, "step": 1210 }, { "epoch": 1.5102932002495322, "grad_norm": 0.11625074019798143, "learning_rate": 5.678712708257358e-06, "loss": 0.039, "num_tokens": 96969029.0, "step": 1211 }, { "epoch": 1.5115408608858392, "grad_norm": 0.11630274180853753, "learning_rate": 5.672657677077449e-06, "loss": 0.0363, "num_tokens": 97049726.0, "step": 1212 }, { "epoch": 1.512788521522146, "grad_norm": 0.1127526825270542, "learning_rate": 5.666602332816985e-06, "loss": 0.0372, "num_tokens": 97130469.0, "step": 1213 }, { "epoch": 1.514036182158453, "grad_norm": 0.11863761031881935, "learning_rate": 5.6605466864561344e-06, "loss": 0.0344, "num_tokens": 97210798.0, "step": 1214 }, { "epoch": 1.51528384279476, "grad_norm": 0.11747556688292905, "learning_rate": 5.654490748975615e-06, "loss": 0.0368, "num_tokens": 97290820.0, "step": 1215 }, { "epoch": 1.5165315034310667, "grad_norm": 0.11740884642076882, "learning_rate": 5.648434531356671e-06, "loss": 0.0341, "num_tokens": 97370963.0, "step": 1216 }, { "epoch": 1.5177791640673737, "grad_norm": 0.1121718619003604, "learning_rate": 5.642378044581057e-06, "loss": 0.0372, "num_tokens": 97451787.0, "step": 1217 }, { "epoch": 1.5190268247036807, "grad_norm": 0.11028715028041086, "learning_rate": 5.636321299631015e-06, "loss": 0.0355, "num_tokens": 97531107.0, "step": 1218 }, { "epoch": 1.5202744853399874, "grad_norm": 0.11526797879935653, "learning_rate": 5.630264307489251e-06, "loss": 0.0356, "num_tokens": 97610596.0, "step": 1219 }, { "epoch": 1.5215221459762944, "grad_norm": 0.11697834812620382, "learning_rate": 5.624207079138922e-06, "loss": 0.0372, "num_tokens": 97692010.0, "step": 1220 }, { "epoch": 1.5227698066126014, "grad_norm": 0.11361340804878213, "learning_rate": 5.6181496255636195e-06, "loss": 0.038, "num_tokens": 97771259.0, "step": 1221 }, { "epoch": 1.5240174672489082, "grad_norm": 0.11948073802819091, "learning_rate": 5.612091957747333e-06, "loss": 0.0362, "num_tokens": 97851776.0, "step": 1222 }, { "epoch": 1.5252651278852152, "grad_norm": 0.11933952019877841, "learning_rate": 5.606034086674447e-06, "loss": 0.0347, "num_tokens": 97931323.0, "step": 1223 }, { "epoch": 1.5265127885215222, "grad_norm": 0.1119341984186723, "learning_rate": 5.5999760233297115e-06, "loss": 0.0355, "num_tokens": 98012414.0, "step": 1224 }, { "epoch": 1.527760449157829, "grad_norm": 0.12570139447794026, "learning_rate": 5.593917778698227e-06, "loss": 0.0351, "num_tokens": 98092865.0, "step": 1225 }, { "epoch": 1.5290081097941361, "grad_norm": 0.12255701579140926, "learning_rate": 5.5878593637654226e-06, "loss": 0.0372, "num_tokens": 98173575.0, "step": 1226 }, { "epoch": 1.530255770430443, "grad_norm": 0.12045998667186913, "learning_rate": 5.581800789517036e-06, "loss": 0.0338, "num_tokens": 98253478.0, "step": 1227 }, { "epoch": 1.5315034310667497, "grad_norm": 0.10599310040291675, "learning_rate": 5.5757420669390925e-06, "loss": 0.0333, "num_tokens": 98331761.0, "step": 1228 }, { "epoch": 1.532751091703057, "grad_norm": 0.1189636777802236, "learning_rate": 5.5696832070178885e-06, "loss": 0.0353, "num_tokens": 98412258.0, "step": 1229 }, { "epoch": 1.5339987523393637, "grad_norm": 0.11265547375888633, "learning_rate": 5.563624220739969e-06, "loss": 0.0369, "num_tokens": 98492861.0, "step": 1230 }, { "epoch": 1.5352464129756707, "grad_norm": 0.1135501412896236, "learning_rate": 5.557565119092106e-06, "loss": 0.034, "num_tokens": 98572091.0, "step": 1231 }, { "epoch": 1.5364940736119777, "grad_norm": 0.12706095636153494, "learning_rate": 5.551505913061281e-06, "loss": 0.0386, "num_tokens": 98652747.0, "step": 1232 }, { "epoch": 1.5377417342482844, "grad_norm": 0.11767249883496335, "learning_rate": 5.54544661363467e-06, "loss": 0.0356, "num_tokens": 98732307.0, "step": 1233 }, { "epoch": 1.5389893948845914, "grad_norm": 0.11861760143450235, "learning_rate": 5.53938723179961e-06, "loss": 0.0337, "num_tokens": 98811668.0, "step": 1234 }, { "epoch": 1.5402370555208984, "grad_norm": 0.1110579082090557, "learning_rate": 5.533327778543593e-06, "loss": 0.0356, "num_tokens": 98890773.0, "step": 1235 }, { "epoch": 1.5414847161572052, "grad_norm": 0.10918115524283789, "learning_rate": 5.527268264854241e-06, "loss": 0.0354, "num_tokens": 98970768.0, "step": 1236 }, { "epoch": 1.5427323767935122, "grad_norm": 0.1120508500986546, "learning_rate": 5.521208701719284e-06, "loss": 0.0371, "num_tokens": 99052179.0, "step": 1237 }, { "epoch": 1.5439800374298192, "grad_norm": 0.11689356960540866, "learning_rate": 5.515149100126539e-06, "loss": 0.0364, "num_tokens": 99132493.0, "step": 1238 }, { "epoch": 1.545227698066126, "grad_norm": 0.10663726507181617, "learning_rate": 5.509089471063897e-06, "loss": 0.0338, "num_tokens": 99212713.0, "step": 1239 }, { "epoch": 1.546475358702433, "grad_norm": 0.10825998214250622, "learning_rate": 5.503029825519296e-06, "loss": 0.0346, "num_tokens": 99292651.0, "step": 1240 }, { "epoch": 1.54772301933874, "grad_norm": 0.11313988408089551, "learning_rate": 5.496970174480706e-06, "loss": 0.0339, "num_tokens": 99372261.0, "step": 1241 }, { "epoch": 1.5489706799750467, "grad_norm": 0.12649997908530414, "learning_rate": 5.4909105289361055e-06, "loss": 0.0539, "num_tokens": 99453192.0, "step": 1242 }, { "epoch": 1.5502183406113537, "grad_norm": 0.1186247978835761, "learning_rate": 5.4848508998734626e-06, "loss": 0.0362, "num_tokens": 99534693.0, "step": 1243 }, { "epoch": 1.5514660012476607, "grad_norm": 0.11032319195795326, "learning_rate": 5.478791298280719e-06, "loss": 0.0325, "num_tokens": 99613614.0, "step": 1244 }, { "epoch": 1.5527136618839674, "grad_norm": 0.10331362127146462, "learning_rate": 5.47273173514576e-06, "loss": 0.0366, "num_tokens": 99694144.0, "step": 1245 }, { "epoch": 1.5539613225202746, "grad_norm": 0.11441502157583171, "learning_rate": 5.466672221456408e-06, "loss": 0.0352, "num_tokens": 99772396.0, "step": 1246 }, { "epoch": 1.5552089831565814, "grad_norm": 0.1384519685906425, "learning_rate": 5.4606127682003915e-06, "loss": 0.0364, "num_tokens": 99853878.0, "step": 1247 }, { "epoch": 1.5564566437928882, "grad_norm": 0.1187428026312172, "learning_rate": 5.454553386365333e-06, "loss": 0.0362, "num_tokens": 99933199.0, "step": 1248 }, { "epoch": 1.5577043044291954, "grad_norm": 0.1154288559693241, "learning_rate": 5.44849408693872e-06, "loss": 0.0355, "num_tokens": 100013822.0, "step": 1249 }, { "epoch": 1.5589519650655022, "grad_norm": 0.10939017703608667, "learning_rate": 5.4424348809078974e-06, "loss": 0.0364, "num_tokens": 100093850.0, "step": 1250 }, { "epoch": 1.5601996257018091, "grad_norm": 0.11577706451313442, "learning_rate": 5.436375779260034e-06, "loss": 0.0348, "num_tokens": 100174014.0, "step": 1251 }, { "epoch": 1.5614472863381161, "grad_norm": 0.11552930897371735, "learning_rate": 5.430316792982112e-06, "loss": 0.0364, "num_tokens": 100254096.0, "step": 1252 }, { "epoch": 1.562694946974423, "grad_norm": 0.12091413731054657, "learning_rate": 5.424257933060908e-06, "loss": 0.036, "num_tokens": 100335736.0, "step": 1253 }, { "epoch": 1.56394260761073, "grad_norm": 0.11349954972674088, "learning_rate": 5.418199210482965e-06, "loss": 0.0339, "num_tokens": 100415770.0, "step": 1254 }, { "epoch": 1.5651902682470369, "grad_norm": 0.11257539896050413, "learning_rate": 5.412140636234579e-06, "loss": 0.0365, "num_tokens": 100496239.0, "step": 1255 }, { "epoch": 1.5664379288833437, "grad_norm": 0.10433088685065614, "learning_rate": 5.4060822213017745e-06, "loss": 0.033, "num_tokens": 100575751.0, "step": 1256 }, { "epoch": 1.5676855895196506, "grad_norm": 0.11128220666515805, "learning_rate": 5.400023976670291e-06, "loss": 0.0362, "num_tokens": 100655896.0, "step": 1257 }, { "epoch": 1.5689332501559576, "grad_norm": 0.11665278503733945, "learning_rate": 5.393965913325555e-06, "loss": 0.036, "num_tokens": 100736726.0, "step": 1258 }, { "epoch": 1.5701809107922644, "grad_norm": 0.11372703831180105, "learning_rate": 5.387908042252667e-06, "loss": 0.0521, "num_tokens": 100817144.0, "step": 1259 }, { "epoch": 1.5714285714285714, "grad_norm": 0.11403138474441783, "learning_rate": 5.381850374436383e-06, "loss": 0.0349, "num_tokens": 100896712.0, "step": 1260 }, { "epoch": 1.5726762320648784, "grad_norm": 0.11114301741148946, "learning_rate": 5.3757929208610784e-06, "loss": 0.0338, "num_tokens": 100975384.0, "step": 1261 }, { "epoch": 1.5739238927011852, "grad_norm": 0.15901465459498454, "learning_rate": 5.3697356925107514e-06, "loss": 0.0353, "num_tokens": 101055790.0, "step": 1262 }, { "epoch": 1.5751715533374921, "grad_norm": 0.10627021014563413, "learning_rate": 5.363678700368987e-06, "loss": 0.0369, "num_tokens": 101136479.0, "step": 1263 }, { "epoch": 1.5764192139737991, "grad_norm": 0.12456078400858275, "learning_rate": 5.3576219554189445e-06, "loss": 0.0401, "num_tokens": 101217634.0, "step": 1264 }, { "epoch": 1.577666874610106, "grad_norm": 0.1210041167801264, "learning_rate": 5.35156546864333e-06, "loss": 0.037, "num_tokens": 101297872.0, "step": 1265 }, { "epoch": 1.5789145352464131, "grad_norm": 0.1070252120913601, "learning_rate": 5.345509251024387e-06, "loss": 0.0374, "num_tokens": 101380176.0, "step": 1266 }, { "epoch": 1.5801621958827199, "grad_norm": 0.10983324132682443, "learning_rate": 5.339453313543868e-06, "loss": 0.0331, "num_tokens": 101459436.0, "step": 1267 }, { "epoch": 1.5814098565190269, "grad_norm": 0.10244262198712618, "learning_rate": 5.3333976671830165e-06, "loss": 0.0343, "num_tokens": 101538263.0, "step": 1268 }, { "epoch": 1.5826575171553339, "grad_norm": 0.10678644753096563, "learning_rate": 5.327342322922553e-06, "loss": 0.0333, "num_tokens": 101618020.0, "step": 1269 }, { "epoch": 1.5839051777916406, "grad_norm": 0.10303177226381595, "learning_rate": 5.321287291742645e-06, "loss": 0.0335, "num_tokens": 101696955.0, "step": 1270 }, { "epoch": 1.5851528384279476, "grad_norm": 0.10922923647766163, "learning_rate": 5.315232584622893e-06, "loss": 0.0332, "num_tokens": 101776118.0, "step": 1271 }, { "epoch": 1.5864004990642546, "grad_norm": 0.11106303953873586, "learning_rate": 5.309178212542313e-06, "loss": 0.0342, "num_tokens": 101855486.0, "step": 1272 }, { "epoch": 1.5876481597005614, "grad_norm": 0.11146880211359997, "learning_rate": 5.303124186479309e-06, "loss": 0.0325, "num_tokens": 101933520.0, "step": 1273 }, { "epoch": 1.5888958203368684, "grad_norm": 0.11066657207520725, "learning_rate": 5.297070517411664e-06, "loss": 0.037, "num_tokens": 102015666.0, "step": 1274 }, { "epoch": 1.5901434809731754, "grad_norm": 0.12070337142463965, "learning_rate": 5.2910172163165096e-06, "loss": 0.0355, "num_tokens": 102095242.0, "step": 1275 }, { "epoch": 1.5913911416094821, "grad_norm": 0.11240092383283054, "learning_rate": 5.284964294170306e-06, "loss": 0.0362, "num_tokens": 102174205.0, "step": 1276 }, { "epoch": 1.5926388022457891, "grad_norm": 0.12576136429390572, "learning_rate": 5.278911761948834e-06, "loss": 0.0355, "num_tokens": 102255437.0, "step": 1277 }, { "epoch": 1.5938864628820961, "grad_norm": 0.11710384991621879, "learning_rate": 5.272859630627164e-06, "loss": 0.0347, "num_tokens": 102334946.0, "step": 1278 }, { "epoch": 1.595134123518403, "grad_norm": 0.10521059038838904, "learning_rate": 5.266807911179638e-06, "loss": 0.0354, "num_tokens": 102414226.0, "step": 1279 }, { "epoch": 1.5963817841547099, "grad_norm": 0.11308055443975817, "learning_rate": 5.260756614579851e-06, "loss": 0.0327, "num_tokens": 102493991.0, "step": 1280 }, { "epoch": 1.5976294447910169, "grad_norm": 0.1186031956772661, "learning_rate": 5.254705751800636e-06, "loss": 0.0348, "num_tokens": 102574527.0, "step": 1281 }, { "epoch": 1.5988771054273236, "grad_norm": 0.10866075596998348, "learning_rate": 5.248655333814036e-06, "loss": 0.0333, "num_tokens": 102654519.0, "step": 1282 }, { "epoch": 1.6001247660636309, "grad_norm": 0.10696137668864042, "learning_rate": 5.242605371591286e-06, "loss": 0.0352, "num_tokens": 102734795.0, "step": 1283 }, { "epoch": 1.6013724266999376, "grad_norm": 0.10337102406850418, "learning_rate": 5.236555876102797e-06, "loss": 0.0375, "num_tokens": 102814405.0, "step": 1284 }, { "epoch": 1.6026200873362444, "grad_norm": 0.12757465999262096, "learning_rate": 5.2305068583181314e-06, "loss": 0.036, "num_tokens": 102894980.0, "step": 1285 }, { "epoch": 1.6038677479725516, "grad_norm": 0.13690644302329974, "learning_rate": 5.2244583292059896e-06, "loss": 0.0366, "num_tokens": 102977017.0, "step": 1286 }, { "epoch": 1.6051154086088584, "grad_norm": 0.11930944617293818, "learning_rate": 5.218410299734181e-06, "loss": 0.034, "num_tokens": 103056779.0, "step": 1287 }, { "epoch": 1.6063630692451654, "grad_norm": 0.11012326303635499, "learning_rate": 5.2123627808696084e-06, "loss": 0.0339, "num_tokens": 103137283.0, "step": 1288 }, { "epoch": 1.6076107298814724, "grad_norm": 0.10957703717797697, "learning_rate": 5.206315783578258e-06, "loss": 0.0336, "num_tokens": 103216776.0, "step": 1289 }, { "epoch": 1.6088583905177791, "grad_norm": 0.10906712477554352, "learning_rate": 5.20026931882516e-06, "loss": 0.0341, "num_tokens": 103296399.0, "step": 1290 }, { "epoch": 1.6101060511540861, "grad_norm": 0.11440229595698308, "learning_rate": 5.194223397574381e-06, "loss": 0.0376, "num_tokens": 103376203.0, "step": 1291 }, { "epoch": 1.611353711790393, "grad_norm": 0.12588773683905502, "learning_rate": 5.188178030789008e-06, "loss": 0.0346, "num_tokens": 103456735.0, "step": 1292 }, { "epoch": 1.6126013724266999, "grad_norm": 0.12042374456895473, "learning_rate": 5.1821332294311136e-06, "loss": 0.0356, "num_tokens": 103537257.0, "step": 1293 }, { "epoch": 1.6138490330630069, "grad_norm": 0.11853035765383917, "learning_rate": 5.176089004461752e-06, "loss": 0.0361, "num_tokens": 103617737.0, "step": 1294 }, { "epoch": 1.6150966936993139, "grad_norm": 0.1084054829939777, "learning_rate": 5.170045366840929e-06, "loss": 0.0373, "num_tokens": 103698813.0, "step": 1295 }, { "epoch": 1.6163443543356206, "grad_norm": 0.12273703655358631, "learning_rate": 5.164002327527588e-06, "loss": 0.0374, "num_tokens": 103782172.0, "step": 1296 }, { "epoch": 1.6175920149719276, "grad_norm": 0.10996795677144779, "learning_rate": 5.157959897479587e-06, "loss": 0.0359, "num_tokens": 103862312.0, "step": 1297 }, { "epoch": 1.6188396756082346, "grad_norm": 0.11103120135245384, "learning_rate": 5.151918087653672e-06, "loss": 0.0336, "num_tokens": 103943466.0, "step": 1298 }, { "epoch": 1.6200873362445414, "grad_norm": 0.10467769656583091, "learning_rate": 5.145876909005477e-06, "loss": 0.0335, "num_tokens": 104023154.0, "step": 1299 }, { "epoch": 1.6213349968808484, "grad_norm": 0.10318413558631827, "learning_rate": 5.139836372489481e-06, "loss": 0.0318, "num_tokens": 104102426.0, "step": 1300 }, { "epoch": 1.6225826575171554, "grad_norm": 0.10919271565885186, "learning_rate": 5.133796489059005e-06, "loss": 0.0355, "num_tokens": 104182188.0, "step": 1301 }, { "epoch": 1.6238303181534621, "grad_norm": 0.11310842463169009, "learning_rate": 5.1277572696661806e-06, "loss": 0.0351, "num_tokens": 104261762.0, "step": 1302 }, { "epoch": 1.6250779787897693, "grad_norm": 0.1165076029660535, "learning_rate": 5.12171872526194e-06, "loss": 0.0355, "num_tokens": 104342372.0, "step": 1303 }, { "epoch": 1.626325639426076, "grad_norm": 0.11376695615925705, "learning_rate": 5.115680866795989e-06, "loss": 0.0342, "num_tokens": 104422382.0, "step": 1304 }, { "epoch": 1.6275733000623829, "grad_norm": 0.11767626403403031, "learning_rate": 5.109643705216789e-06, "loss": 0.0354, "num_tokens": 104503015.0, "step": 1305 }, { "epoch": 1.62882096069869, "grad_norm": 0.1181534867539231, "learning_rate": 5.103607251471541e-06, "loss": 0.0321, "num_tokens": 104582079.0, "step": 1306 }, { "epoch": 1.6300686213349969, "grad_norm": 0.09517674089409651, "learning_rate": 5.097571516506158e-06, "loss": 0.0331, "num_tokens": 104661780.0, "step": 1307 }, { "epoch": 1.6313162819713038, "grad_norm": 0.11931275868780851, "learning_rate": 5.091536511265253e-06, "loss": 0.0348, "num_tokens": 104741752.0, "step": 1308 }, { "epoch": 1.6325639426076108, "grad_norm": 0.10940449179708552, "learning_rate": 5.085502246692111e-06, "loss": 0.0354, "num_tokens": 104822380.0, "step": 1309 }, { "epoch": 1.6338116032439176, "grad_norm": 0.12053843559094413, "learning_rate": 5.079468733728684e-06, "loss": 0.0369, "num_tokens": 104902595.0, "step": 1310 }, { "epoch": 1.6350592638802246, "grad_norm": 0.11265796315863995, "learning_rate": 5.07343598331555e-06, "loss": 0.0336, "num_tokens": 104981340.0, "step": 1311 }, { "epoch": 1.6363069245165316, "grad_norm": 0.1109231726527133, "learning_rate": 5.0674040063919114e-06, "loss": 0.0349, "num_tokens": 105060850.0, "step": 1312 }, { "epoch": 1.6375545851528384, "grad_norm": 0.11588144786485258, "learning_rate": 5.0613728138955644e-06, "loss": 0.0343, "num_tokens": 105140234.0, "step": 1313 }, { "epoch": 1.6388022457891454, "grad_norm": 0.10922220587388409, "learning_rate": 5.055342416762883e-06, "loss": 0.0333, "num_tokens": 105219670.0, "step": 1314 }, { "epoch": 1.6400499064254523, "grad_norm": 0.10863070032451541, "learning_rate": 5.0493128259288025e-06, "loss": 0.0348, "num_tokens": 105300783.0, "step": 1315 }, { "epoch": 1.641297567061759, "grad_norm": 0.10544356260119463, "learning_rate": 5.043284052326789e-06, "loss": 0.0337, "num_tokens": 105380432.0, "step": 1316 }, { "epoch": 1.642545227698066, "grad_norm": 0.10945480005987275, "learning_rate": 5.037256106888837e-06, "loss": 0.0337, "num_tokens": 105459281.0, "step": 1317 }, { "epoch": 1.643792888334373, "grad_norm": 0.10513668226551459, "learning_rate": 5.03122900054543e-06, "loss": 0.0364, "num_tokens": 105539043.0, "step": 1318 }, { "epoch": 1.6450405489706799, "grad_norm": 0.11889358390219724, "learning_rate": 5.025202744225535e-06, "loss": 0.0323, "num_tokens": 105618097.0, "step": 1319 }, { "epoch": 1.6462882096069869, "grad_norm": 0.10819389235655742, "learning_rate": 5.019177348856576e-06, "loss": 0.0384, "num_tokens": 105700246.0, "step": 1320 }, { "epoch": 1.6475358702432938, "grad_norm": 0.12285064335570973, "learning_rate": 5.013152825364416e-06, "loss": 0.0335, "num_tokens": 105779956.0, "step": 1321 }, { "epoch": 1.6487835308796006, "grad_norm": 0.10457229323362498, "learning_rate": 5.007129184673335e-06, "loss": 0.0342, "num_tokens": 105859422.0, "step": 1322 }, { "epoch": 1.6500311915159078, "grad_norm": 0.10418232593169255, "learning_rate": 5.001106437706016e-06, "loss": 0.0354, "num_tokens": 105939798.0, "step": 1323 }, { "epoch": 1.6512788521522146, "grad_norm": 0.10621297870744945, "learning_rate": 4.99508459538352e-06, "loss": 0.035, "num_tokens": 106019008.0, "step": 1324 }, { "epoch": 1.6525265127885214, "grad_norm": 0.11055886736084558, "learning_rate": 4.989063668625267e-06, "loss": 0.0331, "num_tokens": 106099014.0, "step": 1325 }, { "epoch": 1.6537741734248286, "grad_norm": 0.12055990828923857, "learning_rate": 4.983043668349018e-06, "loss": 0.035, "num_tokens": 106178538.0, "step": 1326 }, { "epoch": 1.6550218340611353, "grad_norm": 0.11615411814602732, "learning_rate": 4.977024605470851e-06, "loss": 0.0342, "num_tokens": 106257051.0, "step": 1327 }, { "epoch": 1.6562694946974423, "grad_norm": 0.11242812128073622, "learning_rate": 4.971006490905148e-06, "loss": 0.0326, "num_tokens": 106337027.0, "step": 1328 }, { "epoch": 1.6575171553337493, "grad_norm": 0.10451594006379315, "learning_rate": 4.964989335564571e-06, "loss": 0.035, "num_tokens": 106415989.0, "step": 1329 }, { "epoch": 1.658764815970056, "grad_norm": 0.12008210058517821, "learning_rate": 4.958973150360034e-06, "loss": 0.0338, "num_tokens": 106496902.0, "step": 1330 }, { "epoch": 1.660012476606363, "grad_norm": 0.10283103060613702, "learning_rate": 4.952957946200709e-06, "loss": 0.0316, "num_tokens": 106576276.0, "step": 1331 }, { "epoch": 1.66126013724267, "grad_norm": 0.10840588788185893, "learning_rate": 4.946943733993974e-06, "loss": 0.0342, "num_tokens": 106656074.0, "step": 1332 }, { "epoch": 1.6625077978789768, "grad_norm": 0.1151122733175202, "learning_rate": 4.940930524645414e-06, "loss": 0.0359, "num_tokens": 106737048.0, "step": 1333 }, { "epoch": 1.6637554585152838, "grad_norm": 0.12138327404493408, "learning_rate": 4.934918329058798e-06, "loss": 0.0329, "num_tokens": 106817115.0, "step": 1334 }, { "epoch": 1.6650031191515908, "grad_norm": 0.10347193468215583, "learning_rate": 4.928907158136049e-06, "loss": 0.0351, "num_tokens": 106896600.0, "step": 1335 }, { "epoch": 1.6662507797878976, "grad_norm": 0.10818319318418472, "learning_rate": 4.922897022777241e-06, "loss": 0.0342, "num_tokens": 106976257.0, "step": 1336 }, { "epoch": 1.6674984404242046, "grad_norm": 0.125943792765203, "learning_rate": 4.916887933880562e-06, "loss": 0.0364, "num_tokens": 107056103.0, "step": 1337 }, { "epoch": 1.6687461010605116, "grad_norm": 0.10714932245614212, "learning_rate": 4.910879902342309e-06, "loss": 0.0328, "num_tokens": 107135177.0, "step": 1338 }, { "epoch": 1.6699937616968183, "grad_norm": 0.10022422578809588, "learning_rate": 4.904872939056859e-06, "loss": 0.0327, "num_tokens": 107215076.0, "step": 1339 }, { "epoch": 1.6712414223331253, "grad_norm": 0.11904997859265425, "learning_rate": 4.898867054916655e-06, "loss": 0.0331, "num_tokens": 107294670.0, "step": 1340 }, { "epoch": 1.6724890829694323, "grad_norm": 0.10346330508751622, "learning_rate": 4.892862260812174e-06, "loss": 0.0355, "num_tokens": 107375743.0, "step": 1341 }, { "epoch": 1.673736743605739, "grad_norm": 0.10320313489980291, "learning_rate": 4.886858567631927e-06, "loss": 0.037, "num_tokens": 107456245.0, "step": 1342 }, { "epoch": 1.6749844042420463, "grad_norm": 0.11376647833132077, "learning_rate": 4.880855986262424e-06, "loss": 0.0336, "num_tokens": 107535076.0, "step": 1343 }, { "epoch": 1.676232064878353, "grad_norm": 0.11251959987852904, "learning_rate": 4.874854527588159e-06, "loss": 0.0348, "num_tokens": 107615448.0, "step": 1344 }, { "epoch": 1.6774797255146598, "grad_norm": 0.11732474632394792, "learning_rate": 4.868854202491587e-06, "loss": 0.0343, "num_tokens": 107695214.0, "step": 1345 }, { "epoch": 1.678727386150967, "grad_norm": 0.11672350989227481, "learning_rate": 4.862855021853117e-06, "loss": 0.0352, "num_tokens": 107775647.0, "step": 1346 }, { "epoch": 1.6799750467872738, "grad_norm": 0.11684779926230927, "learning_rate": 4.856856996551074e-06, "loss": 0.0343, "num_tokens": 107855586.0, "step": 1347 }, { "epoch": 1.6812227074235808, "grad_norm": 0.11194528364428014, "learning_rate": 4.850860137461691e-06, "loss": 0.0349, "num_tokens": 107933910.0, "step": 1348 }, { "epoch": 1.6824703680598878, "grad_norm": 0.12241649866099316, "learning_rate": 4.844864455459085e-06, "loss": 0.0331, "num_tokens": 108013285.0, "step": 1349 }, { "epoch": 1.6837180286961946, "grad_norm": 0.10701404402743815, "learning_rate": 4.83886996141524e-06, "loss": 0.0355, "num_tokens": 108093741.0, "step": 1350 }, { "epoch": 1.6849656893325016, "grad_norm": 0.12427674118245957, "learning_rate": 4.8328766661999885e-06, "loss": 0.0351, "num_tokens": 108174452.0, "step": 1351 }, { "epoch": 1.6862133499688086, "grad_norm": 0.11109278194963701, "learning_rate": 4.826884580680981e-06, "loss": 0.0363, "num_tokens": 108255274.0, "step": 1352 }, { "epoch": 1.6874610106051153, "grad_norm": 0.11471172964381411, "learning_rate": 4.8208937157236855e-06, "loss": 0.033, "num_tokens": 108335095.0, "step": 1353 }, { "epoch": 1.6887086712414223, "grad_norm": 0.12341901134238756, "learning_rate": 4.814904082191349e-06, "loss": 0.0342, "num_tokens": 108415877.0, "step": 1354 }, { "epoch": 1.6899563318777293, "grad_norm": 0.1050413188206562, "learning_rate": 4.8089156909449845e-06, "loss": 0.0342, "num_tokens": 108495262.0, "step": 1355 }, { "epoch": 1.691203992514036, "grad_norm": 0.10900307986971748, "learning_rate": 4.802928552843358e-06, "loss": 0.0351, "num_tokens": 108574803.0, "step": 1356 }, { "epoch": 1.692451653150343, "grad_norm": 0.12184670852461642, "learning_rate": 4.79694267874296e-06, "loss": 0.0373, "num_tokens": 108655482.0, "step": 1357 }, { "epoch": 1.69369931378665, "grad_norm": 0.10975468138719802, "learning_rate": 4.790958079497991e-06, "loss": 0.0342, "num_tokens": 108735411.0, "step": 1358 }, { "epoch": 1.6949469744229568, "grad_norm": 0.10818231872378661, "learning_rate": 4.784974765960335e-06, "loss": 0.0361, "num_tokens": 108815263.0, "step": 1359 }, { "epoch": 1.696194635059264, "grad_norm": 0.10258324984418032, "learning_rate": 4.77899274897955e-06, "loss": 0.0342, "num_tokens": 108894762.0, "step": 1360 }, { "epoch": 1.6974422956955708, "grad_norm": 0.11081946376785735, "learning_rate": 4.773012039402841e-06, "loss": 0.0368, "num_tokens": 108975844.0, "step": 1361 }, { "epoch": 1.6986899563318776, "grad_norm": 0.10763088262915656, "learning_rate": 4.767032648075043e-06, "loss": 0.0352, "num_tokens": 109056110.0, "step": 1362 }, { "epoch": 1.6999376169681848, "grad_norm": 0.10826684816224101, "learning_rate": 4.761054585838599e-06, "loss": 0.0341, "num_tokens": 109136888.0, "step": 1363 }, { "epoch": 1.7011852776044916, "grad_norm": 0.11349832649415174, "learning_rate": 4.755077863533541e-06, "loss": 0.0342, "num_tokens": 109216345.0, "step": 1364 }, { "epoch": 1.7024329382407986, "grad_norm": 0.10721063931317605, "learning_rate": 4.749102491997476e-06, "loss": 0.0304, "num_tokens": 109297726.0, "step": 1365 }, { "epoch": 1.7036805988771055, "grad_norm": 0.11814753856914108, "learning_rate": 4.743128482065555e-06, "loss": 0.0368, "num_tokens": 109377771.0, "step": 1366 }, { "epoch": 1.7049282595134123, "grad_norm": 0.10923227600327748, "learning_rate": 4.737155844570468e-06, "loss": 0.0358, "num_tokens": 109457147.0, "step": 1367 }, { "epoch": 1.7061759201497193, "grad_norm": 0.1180774047748652, "learning_rate": 4.7311845903424104e-06, "loss": 0.0326, "num_tokens": 109536870.0, "step": 1368 }, { "epoch": 1.7074235807860263, "grad_norm": 0.10414052812314961, "learning_rate": 4.725214730209069e-06, "loss": 0.0348, "num_tokens": 109617166.0, "step": 1369 }, { "epoch": 1.708671241422333, "grad_norm": 0.11325120099720434, "learning_rate": 4.719246274995607e-06, "loss": 0.0373, "num_tokens": 109697533.0, "step": 1370 }, { "epoch": 1.70991890205864, "grad_norm": 0.10237282354895524, "learning_rate": 4.713279235524637e-06, "loss": 0.0339, "num_tokens": 109778192.0, "step": 1371 }, { "epoch": 1.711166562694947, "grad_norm": 0.1495589045820829, "learning_rate": 4.707313622616205e-06, "loss": 0.0348, "num_tokens": 109858008.0, "step": 1372 }, { "epoch": 1.7124142233312538, "grad_norm": 0.1155569950547421, "learning_rate": 4.701349447087769e-06, "loss": 0.0338, "num_tokens": 109938620.0, "step": 1373 }, { "epoch": 1.7136618839675608, "grad_norm": 0.1085633556890907, "learning_rate": 4.695386719754184e-06, "loss": 0.0344, "num_tokens": 110018354.0, "step": 1374 }, { "epoch": 1.7149095446038678, "grad_norm": 0.11234843873728942, "learning_rate": 4.689425451427677e-06, "loss": 0.0359, "num_tokens": 110098749.0, "step": 1375 }, { "epoch": 1.7161572052401746, "grad_norm": 0.11707214723093819, "learning_rate": 4.683465652917828e-06, "loss": 0.0358, "num_tokens": 110178574.0, "step": 1376 }, { "epoch": 1.7174048658764816, "grad_norm": 0.11575033343150132, "learning_rate": 4.677507335031555e-06, "loss": 0.0387, "num_tokens": 110258909.0, "step": 1377 }, { "epoch": 1.7186525265127885, "grad_norm": 0.1219047452029518, "learning_rate": 4.671550508573087e-06, "loss": 0.0364, "num_tokens": 110338876.0, "step": 1378 }, { "epoch": 1.7199001871490953, "grad_norm": 0.11606054592764724, "learning_rate": 4.6655951843439514e-06, "loss": 0.034, "num_tokens": 110417753.0, "step": 1379 }, { "epoch": 1.7211478477854025, "grad_norm": 0.11058642589780793, "learning_rate": 4.659641373142953e-06, "loss": 0.0369, "num_tokens": 110497691.0, "step": 1380 }, { "epoch": 1.7223955084217093, "grad_norm": 0.10502695534031538, "learning_rate": 4.653689085766147e-06, "loss": 0.0338, "num_tokens": 110578084.0, "step": 1381 }, { "epoch": 1.723643169058016, "grad_norm": 0.10033432947560307, "learning_rate": 4.6477383330068335e-06, "loss": 0.0324, "num_tokens": 110656605.0, "step": 1382 }, { "epoch": 1.7248908296943233, "grad_norm": 0.10347951460818469, "learning_rate": 4.641789125655526e-06, "loss": 0.0343, "num_tokens": 110736629.0, "step": 1383 }, { "epoch": 1.72613849033063, "grad_norm": 0.10404769094997354, "learning_rate": 4.6358414744999324e-06, "loss": 0.0322, "num_tokens": 110816051.0, "step": 1384 }, { "epoch": 1.727386150966937, "grad_norm": 0.10284553692602667, "learning_rate": 4.6298953903249455e-06, "loss": 0.0331, "num_tokens": 110894941.0, "step": 1385 }, { "epoch": 1.728633811603244, "grad_norm": 0.10288209629612745, "learning_rate": 4.623950883912609e-06, "loss": 0.0353, "num_tokens": 110975460.0, "step": 1386 }, { "epoch": 1.7298814722395508, "grad_norm": 0.10789958526274712, "learning_rate": 4.618007966042114e-06, "loss": 0.0342, "num_tokens": 111054782.0, "step": 1387 }, { "epoch": 1.7311291328758578, "grad_norm": 0.10876892596056022, "learning_rate": 4.612066647489762e-06, "loss": 0.0355, "num_tokens": 111135170.0, "step": 1388 }, { "epoch": 1.7323767935121648, "grad_norm": 0.12136812485887387, "learning_rate": 4.606126939028965e-06, "loss": 0.0338, "num_tokens": 111214768.0, "step": 1389 }, { "epoch": 1.7336244541484715, "grad_norm": 0.10679814813875112, "learning_rate": 4.600188851430206e-06, "loss": 0.0324, "num_tokens": 111294207.0, "step": 1390 }, { "epoch": 1.7348721147847785, "grad_norm": 0.1032000235074082, "learning_rate": 4.594252395461036e-06, "loss": 0.034, "num_tokens": 111374632.0, "step": 1391 }, { "epoch": 1.7361197754210855, "grad_norm": 0.10918063307642732, "learning_rate": 4.588317581886041e-06, "loss": 0.0344, "num_tokens": 111454535.0, "step": 1392 }, { "epoch": 1.7373674360573923, "grad_norm": 0.10999773787711306, "learning_rate": 4.5823844214668326e-06, "loss": 0.0352, "num_tokens": 111534940.0, "step": 1393 }, { "epoch": 1.7386150966936993, "grad_norm": 0.12310258864133658, "learning_rate": 4.576452924962024e-06, "loss": 0.0379, "num_tokens": 111616583.0, "step": 1394 }, { "epoch": 1.7398627573300063, "grad_norm": 0.11330748509366423, "learning_rate": 4.570523103127209e-06, "loss": 0.0334, "num_tokens": 111694802.0, "step": 1395 }, { "epoch": 1.741110417966313, "grad_norm": 0.11642883285973465, "learning_rate": 4.564594966714952e-06, "loss": 0.0344, "num_tokens": 111774583.0, "step": 1396 }, { "epoch": 1.74235807860262, "grad_norm": 0.11628781515429629, "learning_rate": 4.558668526474751e-06, "loss": 0.0361, "num_tokens": 111854564.0, "step": 1397 }, { "epoch": 1.743605739238927, "grad_norm": 0.11553952117653499, "learning_rate": 4.552743793153037e-06, "loss": 0.0342, "num_tokens": 111934145.0, "step": 1398 }, { "epoch": 1.7448533998752338, "grad_norm": 0.1130454484695648, "learning_rate": 4.5468207774931414e-06, "loss": 0.0362, "num_tokens": 112014599.0, "step": 1399 }, { "epoch": 1.746101060511541, "grad_norm": 0.10424941012798582, "learning_rate": 4.540899490235282e-06, "loss": 0.0353, "num_tokens": 112095898.0, "step": 1400 }, { "epoch": 1.7473487211478478, "grad_norm": 0.12176082233686349, "learning_rate": 4.534979942116542e-06, "loss": 0.0335, "num_tokens": 112174805.0, "step": 1401 }, { "epoch": 1.7485963817841546, "grad_norm": 0.10725290738018774, "learning_rate": 4.529062143870849e-06, "loss": 0.035, "num_tokens": 112254844.0, "step": 1402 }, { "epoch": 1.7498440424204618, "grad_norm": 0.11613612803991162, "learning_rate": 4.5231461062289624e-06, "loss": 0.0364, "num_tokens": 112336687.0, "step": 1403 }, { "epoch": 1.7510917030567685, "grad_norm": 0.11248286120432541, "learning_rate": 4.5172318399184485e-06, "loss": 0.0335, "num_tokens": 112416437.0, "step": 1404 }, { "epoch": 1.7523393636930755, "grad_norm": 0.11498685047252416, "learning_rate": 4.511319355663657e-06, "loss": 0.0363, "num_tokens": 112496544.0, "step": 1405 }, { "epoch": 1.7535870243293825, "grad_norm": 0.11584733211628963, "learning_rate": 4.50540866418571e-06, "loss": 0.0385, "num_tokens": 112577418.0, "step": 1406 }, { "epoch": 1.7548346849656893, "grad_norm": 0.10971791007225389, "learning_rate": 4.499499776202476e-06, "loss": 0.0327, "num_tokens": 112655726.0, "step": 1407 }, { "epoch": 1.7560823456019963, "grad_norm": 0.11202865802168581, "learning_rate": 4.493592702428558e-06, "loss": 0.0365, "num_tokens": 112736130.0, "step": 1408 }, { "epoch": 1.7573300062383033, "grad_norm": 0.11590791221388194, "learning_rate": 4.487687453575261e-06, "loss": 0.0359, "num_tokens": 112816354.0, "step": 1409 }, { "epoch": 1.75857766687461, "grad_norm": 0.11455918942737528, "learning_rate": 4.481784040350593e-06, "loss": 0.0357, "num_tokens": 112895741.0, "step": 1410 }, { "epoch": 1.759825327510917, "grad_norm": 0.11184953434294027, "learning_rate": 4.475882473459221e-06, "loss": 0.0323, "num_tokens": 112975009.0, "step": 1411 }, { "epoch": 1.761072988147224, "grad_norm": 0.1043909165249291, "learning_rate": 4.469982763602473e-06, "loss": 0.0376, "num_tokens": 113056299.0, "step": 1412 }, { "epoch": 1.7623206487835308, "grad_norm": 0.11796014666013199, "learning_rate": 4.464084921478303e-06, "loss": 0.0345, "num_tokens": 113135339.0, "step": 1413 }, { "epoch": 1.7635683094198378, "grad_norm": 0.10655530028085919, "learning_rate": 4.458188957781285e-06, "loss": 0.0332, "num_tokens": 113215391.0, "step": 1414 }, { "epoch": 1.7648159700561448, "grad_norm": 0.1137944705272993, "learning_rate": 4.452294883202581e-06, "loss": 0.0326, "num_tokens": 113294604.0, "step": 1415 }, { "epoch": 1.7660636306924515, "grad_norm": 0.10386535837108864, "learning_rate": 4.44640270842993e-06, "loss": 0.0355, "num_tokens": 113373702.0, "step": 1416 }, { "epoch": 1.7673112913287585, "grad_norm": 0.1124153813219037, "learning_rate": 4.440512444147626e-06, "loss": 0.0338, "num_tokens": 113454253.0, "step": 1417 }, { "epoch": 1.7685589519650655, "grad_norm": 0.11173982165813126, "learning_rate": 4.434624101036498e-06, "loss": 0.034, "num_tokens": 113534170.0, "step": 1418 }, { "epoch": 1.7698066126013723, "grad_norm": 0.10766159191175759, "learning_rate": 4.4287376897738945e-06, "loss": 0.0337, "num_tokens": 113613505.0, "step": 1419 }, { "epoch": 1.7710542732376795, "grad_norm": 0.11662200449928516, "learning_rate": 4.4228532210336535e-06, "loss": 0.0351, "num_tokens": 113694356.0, "step": 1420 }, { "epoch": 1.7723019338739863, "grad_norm": 0.12239528048779576, "learning_rate": 4.4169707054861e-06, "loss": 0.0348, "num_tokens": 113773772.0, "step": 1421 }, { "epoch": 1.773549594510293, "grad_norm": 0.11495501032768879, "learning_rate": 4.411090153798011e-06, "loss": 0.0361, "num_tokens": 113854394.0, "step": 1422 }, { "epoch": 1.7747972551466002, "grad_norm": 0.10793305404835776, "learning_rate": 4.405211576632602e-06, "loss": 0.0331, "num_tokens": 113934601.0, "step": 1423 }, { "epoch": 1.776044915782907, "grad_norm": 0.10521475892016557, "learning_rate": 4.3993349846495136e-06, "loss": 0.0336, "num_tokens": 114014077.0, "step": 1424 }, { "epoch": 1.777292576419214, "grad_norm": 0.11667480494909614, "learning_rate": 4.393460388504784e-06, "loss": 0.0364, "num_tokens": 114095540.0, "step": 1425 }, { "epoch": 1.778540237055521, "grad_norm": 0.11816708505706751, "learning_rate": 4.387587798850826e-06, "loss": 0.036, "num_tokens": 114175449.0, "step": 1426 }, { "epoch": 1.7797878976918278, "grad_norm": 0.11547381713071538, "learning_rate": 4.381717226336426e-06, "loss": 0.033, "num_tokens": 114255254.0, "step": 1427 }, { "epoch": 1.7810355583281348, "grad_norm": 0.11558449024822219, "learning_rate": 4.375848681606704e-06, "loss": 0.0355, "num_tokens": 114335582.0, "step": 1428 }, { "epoch": 1.7822832189644418, "grad_norm": 0.11501299427526183, "learning_rate": 4.369982175303104e-06, "loss": 0.0356, "num_tokens": 114417492.0, "step": 1429 }, { "epoch": 1.7835308796007485, "grad_norm": 0.11575029405727906, "learning_rate": 4.364117718063375e-06, "loss": 0.0342, "num_tokens": 114498411.0, "step": 1430 }, { "epoch": 1.7847785402370555, "grad_norm": 0.10471073262325635, "learning_rate": 4.358255320521553e-06, "loss": 0.0335, "num_tokens": 114579592.0, "step": 1431 }, { "epoch": 1.7860262008733625, "grad_norm": 0.10800367249362049, "learning_rate": 4.352394993307935e-06, "loss": 0.0346, "num_tokens": 114660132.0, "step": 1432 }, { "epoch": 1.7872738615096693, "grad_norm": 0.11784628615301575, "learning_rate": 4.346536747049068e-06, "loss": 0.035, "num_tokens": 114742718.0, "step": 1433 }, { "epoch": 1.7885215221459763, "grad_norm": 0.11047252826803172, "learning_rate": 4.340680592367721e-06, "loss": 0.0353, "num_tokens": 114823729.0, "step": 1434 }, { "epoch": 1.7897691827822833, "grad_norm": 0.10863771684339983, "learning_rate": 4.33482653988287e-06, "loss": 0.0352, "num_tokens": 114903277.0, "step": 1435 }, { "epoch": 1.79101684341859, "grad_norm": 0.11748528208682778, "learning_rate": 4.328974600209687e-06, "loss": 0.0333, "num_tokens": 114983776.0, "step": 1436 }, { "epoch": 1.7922645040548972, "grad_norm": 0.10380189194122497, "learning_rate": 4.3231247839595045e-06, "loss": 0.0334, "num_tokens": 115063423.0, "step": 1437 }, { "epoch": 1.793512164691204, "grad_norm": 0.1021667642296284, "learning_rate": 4.317277101739806e-06, "loss": 0.0338, "num_tokens": 115143251.0, "step": 1438 }, { "epoch": 1.7947598253275108, "grad_norm": 0.10887837383935073, "learning_rate": 4.3114315641542105e-06, "loss": 0.0342, "num_tokens": 115224078.0, "step": 1439 }, { "epoch": 1.796007485963818, "grad_norm": 0.12498802533911925, "learning_rate": 4.305588181802441e-06, "loss": 0.0333, "num_tokens": 115304392.0, "step": 1440 }, { "epoch": 1.7972551466001248, "grad_norm": 0.10562913529548969, "learning_rate": 4.2997469652803185e-06, "loss": 0.0359, "num_tokens": 115384699.0, "step": 1441 }, { "epoch": 1.7985028072364317, "grad_norm": 0.10968417412970875, "learning_rate": 4.293907925179733e-06, "loss": 0.0343, "num_tokens": 115465341.0, "step": 1442 }, { "epoch": 1.7997504678727387, "grad_norm": 0.10397756346348985, "learning_rate": 4.28807107208863e-06, "loss": 0.0328, "num_tokens": 115544709.0, "step": 1443 }, { "epoch": 1.8009981285090455, "grad_norm": 0.11506875540961697, "learning_rate": 4.282236416590986e-06, "loss": 0.0375, "num_tokens": 115625949.0, "step": 1444 }, { "epoch": 1.8022457891453525, "grad_norm": 0.10835660954726115, "learning_rate": 4.276403969266797e-06, "loss": 0.0336, "num_tokens": 115705144.0, "step": 1445 }, { "epoch": 1.8034934497816595, "grad_norm": 0.1125488424807153, "learning_rate": 4.270573740692053e-06, "loss": 0.0359, "num_tokens": 115786232.0, "step": 1446 }, { "epoch": 1.8047411104179663, "grad_norm": 0.11179280220174558, "learning_rate": 4.2647457414387205e-06, "loss": 0.0324, "num_tokens": 115865733.0, "step": 1447 }, { "epoch": 1.8059887710542732, "grad_norm": 0.10438418516083571, "learning_rate": 4.2589199820747226e-06, "loss": 0.0335, "num_tokens": 115946117.0, "step": 1448 }, { "epoch": 1.8072364316905802, "grad_norm": 0.12496207865854495, "learning_rate": 4.253096473163923e-06, "loss": 0.038, "num_tokens": 116028253.0, "step": 1449 }, { "epoch": 1.808484092326887, "grad_norm": 0.10562424147281688, "learning_rate": 4.247275225266103e-06, "loss": 0.0347, "num_tokens": 116109099.0, "step": 1450 }, { "epoch": 1.809731752963194, "grad_norm": 0.11476927473343183, "learning_rate": 4.241456248936946e-06, "loss": 0.0326, "num_tokens": 116189316.0, "step": 1451 }, { "epoch": 1.810979413599501, "grad_norm": 0.10456867355165096, "learning_rate": 4.23563955472801e-06, "loss": 0.035, "num_tokens": 116270657.0, "step": 1452 }, { "epoch": 1.8122270742358078, "grad_norm": 0.10972404579793639, "learning_rate": 4.229825153186727e-06, "loss": 0.035, "num_tokens": 116351453.0, "step": 1453 }, { "epoch": 1.8134747348721147, "grad_norm": 0.11614985559784176, "learning_rate": 4.22401305485636e-06, "loss": 0.035, "num_tokens": 116432720.0, "step": 1454 }, { "epoch": 1.8147223955084217, "grad_norm": 0.10838486449752817, "learning_rate": 4.218203270276e-06, "loss": 0.035, "num_tokens": 116512799.0, "step": 1455 }, { "epoch": 1.8159700561447285, "grad_norm": 0.11198739360311537, "learning_rate": 4.2123958099805466e-06, "loss": 0.0334, "num_tokens": 116592644.0, "step": 1456 }, { "epoch": 1.8172177167810357, "grad_norm": 0.11215477815079397, "learning_rate": 4.206590684500675e-06, "loss": 0.0348, "num_tokens": 116672720.0, "step": 1457 }, { "epoch": 1.8184653774173425, "grad_norm": 0.1130464870503315, "learning_rate": 4.200787904362833e-06, "loss": 0.0344, "num_tokens": 116753067.0, "step": 1458 }, { "epoch": 1.8197130380536493, "grad_norm": 0.10319308148117898, "learning_rate": 4.194987480089218e-06, "loss": 0.0333, "num_tokens": 116832122.0, "step": 1459 }, { "epoch": 1.8209606986899565, "grad_norm": 0.10789295427643053, "learning_rate": 4.189189422197751e-06, "loss": 0.0349, "num_tokens": 116911422.0, "step": 1460 }, { "epoch": 1.8222083593262632, "grad_norm": 0.10741052441160895, "learning_rate": 4.183393741202065e-06, "loss": 0.0346, "num_tokens": 116991316.0, "step": 1461 }, { "epoch": 1.8234560199625702, "grad_norm": 0.1149851515920513, "learning_rate": 4.177600447611478e-06, "loss": 0.0335, "num_tokens": 117069959.0, "step": 1462 }, { "epoch": 1.8247036805988772, "grad_norm": 0.11670396653884813, "learning_rate": 4.171809551930985e-06, "loss": 0.0344, "num_tokens": 117149818.0, "step": 1463 }, { "epoch": 1.825951341235184, "grad_norm": 0.11711057074934993, "learning_rate": 4.166021064661231e-06, "loss": 0.035, "num_tokens": 117230247.0, "step": 1464 }, { "epoch": 1.827199001871491, "grad_norm": 0.1163081603973351, "learning_rate": 4.160234996298491e-06, "loss": 0.0341, "num_tokens": 117310424.0, "step": 1465 }, { "epoch": 1.828446662507798, "grad_norm": 0.10458937874158106, "learning_rate": 4.154451357334654e-06, "loss": 0.0344, "num_tokens": 117389859.0, "step": 1466 }, { "epoch": 1.8296943231441047, "grad_norm": 0.10574896551270638, "learning_rate": 4.148670158257211e-06, "loss": 0.0341, "num_tokens": 117469420.0, "step": 1467 }, { "epoch": 1.8309419837804117, "grad_norm": 0.10899400352647608, "learning_rate": 4.142891409549219e-06, "loss": 0.0337, "num_tokens": 117548721.0, "step": 1468 }, { "epoch": 1.8321896444167187, "grad_norm": 0.10611750813836142, "learning_rate": 4.137115121689297e-06, "loss": 0.0348, "num_tokens": 117628895.0, "step": 1469 }, { "epoch": 1.8334373050530255, "grad_norm": 0.1111805298125393, "learning_rate": 4.131341305151603e-06, "loss": 0.0348, "num_tokens": 117710011.0, "step": 1470 }, { "epoch": 1.8346849656893325, "grad_norm": 0.10266067632336437, "learning_rate": 4.1255699704058085e-06, "loss": 0.0332, "num_tokens": 117790110.0, "step": 1471 }, { "epoch": 1.8359326263256395, "grad_norm": 0.12013831863359381, "learning_rate": 4.119801127917089e-06, "loss": 0.0329, "num_tokens": 117870583.0, "step": 1472 }, { "epoch": 1.8371802869619462, "grad_norm": 0.10173281550009515, "learning_rate": 4.114034788146101e-06, "loss": 0.0344, "num_tokens": 117949961.0, "step": 1473 }, { "epoch": 1.8384279475982532, "grad_norm": 0.10191141790449527, "learning_rate": 4.108270961548957e-06, "loss": 0.033, "num_tokens": 118028504.0, "step": 1474 }, { "epoch": 1.8396756082345602, "grad_norm": 0.12183190605596082, "learning_rate": 4.102509658577223e-06, "loss": 0.0338, "num_tokens": 118108384.0, "step": 1475 }, { "epoch": 1.840923268870867, "grad_norm": 0.10416883300068461, "learning_rate": 4.096750889677878e-06, "loss": 0.0349, "num_tokens": 118188162.0, "step": 1476 }, { "epoch": 1.8421709295071742, "grad_norm": 0.11986832551794753, "learning_rate": 4.090994665293313e-06, "loss": 0.0346, "num_tokens": 118268028.0, "step": 1477 }, { "epoch": 1.843418590143481, "grad_norm": 0.10172966968166275, "learning_rate": 4.085240995861301e-06, "loss": 0.0342, "num_tokens": 118348423.0, "step": 1478 }, { "epoch": 1.8446662507797877, "grad_norm": 0.11569226583939839, "learning_rate": 4.079489891814986e-06, "loss": 0.0352, "num_tokens": 118429210.0, "step": 1479 }, { "epoch": 1.845913911416095, "grad_norm": 0.10916278732424804, "learning_rate": 4.073741363582856e-06, "loss": 0.0352, "num_tokens": 118508578.0, "step": 1480 }, { "epoch": 1.8471615720524017, "grad_norm": 0.11293663723584749, "learning_rate": 4.06799542158873e-06, "loss": 0.0355, "num_tokens": 118588744.0, "step": 1481 }, { "epoch": 1.8484092326887087, "grad_norm": 0.11722344024028651, "learning_rate": 4.062252076251739e-06, "loss": 0.0328, "num_tokens": 118667906.0, "step": 1482 }, { "epoch": 1.8496568933250157, "grad_norm": 0.10122769351963658, "learning_rate": 4.056511337986304e-06, "loss": 0.0318, "num_tokens": 118746761.0, "step": 1483 }, { "epoch": 1.8509045539613225, "grad_norm": 0.10229816908665479, "learning_rate": 4.05077321720212e-06, "loss": 0.032, "num_tokens": 118826981.0, "step": 1484 }, { "epoch": 1.8521522145976295, "grad_norm": 0.11918185823359788, "learning_rate": 4.045037724304129e-06, "loss": 0.0338, "num_tokens": 118906395.0, "step": 1485 }, { "epoch": 1.8533998752339365, "grad_norm": 0.10337247858601341, "learning_rate": 4.039304869692518e-06, "loss": 0.0333, "num_tokens": 118985392.0, "step": 1486 }, { "epoch": 1.8546475358702432, "grad_norm": 0.1019281333615369, "learning_rate": 4.033574663762685e-06, "loss": 0.0354, "num_tokens": 119065923.0, "step": 1487 }, { "epoch": 1.8558951965065502, "grad_norm": 0.11895729478200164, "learning_rate": 4.0278471169052224e-06, "loss": 0.0333, "num_tokens": 119144893.0, "step": 1488 }, { "epoch": 1.8571428571428572, "grad_norm": 0.10167459971375013, "learning_rate": 4.022122239505906e-06, "loss": 0.0335, "num_tokens": 119224641.0, "step": 1489 }, { "epoch": 1.858390517779164, "grad_norm": 0.10149185182575064, "learning_rate": 4.0164000419456715e-06, "loss": 0.0341, "num_tokens": 119305051.0, "step": 1490 }, { "epoch": 1.859638178415471, "grad_norm": 0.11077520841456237, "learning_rate": 4.010680534600587e-06, "loss": 0.034, "num_tokens": 119385471.0, "step": 1491 }, { "epoch": 1.860885839051778, "grad_norm": 0.1120329287780735, "learning_rate": 4.004963727841852e-06, "loss": 0.0358, "num_tokens": 119465985.0, "step": 1492 }, { "epoch": 1.8621334996880847, "grad_norm": 0.11813850687069626, "learning_rate": 3.9992496320357645e-06, "loss": 0.0351, "num_tokens": 119546271.0, "step": 1493 }, { "epoch": 1.8633811603243917, "grad_norm": 0.11886113375271443, "learning_rate": 3.993538257543706e-06, "loss": 0.0316, "num_tokens": 119627060.0, "step": 1494 }, { "epoch": 1.8646288209606987, "grad_norm": 0.09874498532250679, "learning_rate": 3.987829614722124e-06, "loss": 0.0335, "num_tokens": 119707551.0, "step": 1495 }, { "epoch": 1.8658764815970055, "grad_norm": 0.10786226744861373, "learning_rate": 3.982123713922517e-06, "loss": 0.0344, "num_tokens": 119787384.0, "step": 1496 }, { "epoch": 1.8671241422333127, "grad_norm": 0.10917423418357436, "learning_rate": 3.976420565491404e-06, "loss": 0.0322, "num_tokens": 119866556.0, "step": 1497 }, { "epoch": 1.8683718028696195, "grad_norm": 0.10509609392688785, "learning_rate": 3.970720179770322e-06, "loss": 0.032, "num_tokens": 119946177.0, "step": 1498 }, { "epoch": 1.8696194635059262, "grad_norm": 0.11086500849159564, "learning_rate": 3.965022567095788e-06, "loss": 0.0382, "num_tokens": 120026777.0, "step": 1499 }, { "epoch": 1.8708671241422334, "grad_norm": 0.1026552922683715, "learning_rate": 3.959327737799298e-06, "loss": 0.0349, "num_tokens": 120106915.0, "step": 1500 }, { "epoch": 1.8721147847785402, "grad_norm": 0.10134968732729031, "learning_rate": 3.953635702207299e-06, "loss": 0.0316, "num_tokens": 120186044.0, "step": 1501 }, { "epoch": 1.8733624454148472, "grad_norm": 0.09477996979178789, "learning_rate": 3.947946470641169e-06, "loss": 0.0314, "num_tokens": 120265104.0, "step": 1502 }, { "epoch": 1.8746101060511542, "grad_norm": 0.10341753477077234, "learning_rate": 3.9422600534172105e-06, "loss": 0.0347, "num_tokens": 120345399.0, "step": 1503 }, { "epoch": 1.875857766687461, "grad_norm": 0.109955990836735, "learning_rate": 3.936576460846614e-06, "loss": 0.0343, "num_tokens": 120424888.0, "step": 1504 }, { "epoch": 1.877105427323768, "grad_norm": 0.1067874469666549, "learning_rate": 3.930895703235448e-06, "loss": 0.0316, "num_tokens": 120504351.0, "step": 1505 }, { "epoch": 1.878353087960075, "grad_norm": 0.10876424328262234, "learning_rate": 3.925217790884646e-06, "loss": 0.0325, "num_tokens": 120583924.0, "step": 1506 }, { "epoch": 1.8796007485963817, "grad_norm": 0.11262768159157971, "learning_rate": 3.919542734089978e-06, "loss": 0.0354, "num_tokens": 120664606.0, "step": 1507 }, { "epoch": 1.8808484092326887, "grad_norm": 0.10063627694670048, "learning_rate": 3.913870543142038e-06, "loss": 0.0359, "num_tokens": 120744208.0, "step": 1508 }, { "epoch": 1.8820960698689957, "grad_norm": 0.12382898842560677, "learning_rate": 3.908201228326222e-06, "loss": 0.0328, "num_tokens": 120824125.0, "step": 1509 }, { "epoch": 1.8833437305053025, "grad_norm": 0.10492818441358094, "learning_rate": 3.902534799922713e-06, "loss": 0.0318, "num_tokens": 120902940.0, "step": 1510 }, { "epoch": 1.8845913911416095, "grad_norm": 0.10403111315609859, "learning_rate": 3.896871268206456e-06, "loss": 0.0364, "num_tokens": 120985013.0, "step": 1511 }, { "epoch": 1.8858390517779164, "grad_norm": 0.11769575525186173, "learning_rate": 3.8912106434471486e-06, "loss": 0.0325, "num_tokens": 121064553.0, "step": 1512 }, { "epoch": 1.8870867124142232, "grad_norm": 0.10389306641364818, "learning_rate": 3.885552935909212e-06, "loss": 0.0344, "num_tokens": 121144893.0, "step": 1513 }, { "epoch": 1.8883343730505302, "grad_norm": 0.10981301470560886, "learning_rate": 3.879898155851779e-06, "loss": 0.0337, "num_tokens": 121225640.0, "step": 1514 }, { "epoch": 1.8895820336868372, "grad_norm": 0.10151306602448709, "learning_rate": 3.874246313528679e-06, "loss": 0.034, "num_tokens": 121307302.0, "step": 1515 }, { "epoch": 1.890829694323144, "grad_norm": 0.11075900606639615, "learning_rate": 3.868597419188409e-06, "loss": 0.0356, "num_tokens": 121388078.0, "step": 1516 }, { "epoch": 1.8920773549594512, "grad_norm": 0.11142810402559525, "learning_rate": 3.862951483074119e-06, "loss": 0.0329, "num_tokens": 121467954.0, "step": 1517 }, { "epoch": 1.893325015595758, "grad_norm": 0.10044572617648408, "learning_rate": 3.857308515423601e-06, "loss": 0.0354, "num_tokens": 121547246.0, "step": 1518 }, { "epoch": 1.8945726762320647, "grad_norm": 0.12058986292961471, "learning_rate": 3.851668526469261e-06, "loss": 0.0361, "num_tokens": 121628571.0, "step": 1519 }, { "epoch": 1.895820336868372, "grad_norm": 0.10836004623444401, "learning_rate": 3.846031526438102e-06, "loss": 0.0332, "num_tokens": 121709487.0, "step": 1520 }, { "epoch": 1.8970679975046787, "grad_norm": 0.10878927022806904, "learning_rate": 3.84039752555171e-06, "loss": 0.0332, "num_tokens": 121789703.0, "step": 1521 }, { "epoch": 1.8983156581409857, "grad_norm": 0.10824955993736991, "learning_rate": 3.834766534026231e-06, "loss": 0.0332, "num_tokens": 121869985.0, "step": 1522 }, { "epoch": 1.8995633187772927, "grad_norm": 0.10955981205776032, "learning_rate": 3.829138562072353e-06, "loss": 0.0335, "num_tokens": 121948903.0, "step": 1523 }, { "epoch": 1.9008109794135994, "grad_norm": 0.10837260665067358, "learning_rate": 3.823513619895293e-06, "loss": 0.034, "num_tokens": 122028802.0, "step": 1524 }, { "epoch": 1.9020586400499064, "grad_norm": 0.11392485556540904, "learning_rate": 3.81789171769477e-06, "loss": 0.0339, "num_tokens": 122110650.0, "step": 1525 }, { "epoch": 1.9033063006862134, "grad_norm": 0.10471350453492868, "learning_rate": 3.812272865664994e-06, "loss": 0.0349, "num_tokens": 122190750.0, "step": 1526 }, { "epoch": 1.9045539613225202, "grad_norm": 0.11294540539518536, "learning_rate": 3.8066570739946394e-06, "loss": 0.0346, "num_tokens": 122271273.0, "step": 1527 }, { "epoch": 1.9058016219588272, "grad_norm": 0.11267496722591189, "learning_rate": 3.801044352866834e-06, "loss": 0.0358, "num_tokens": 122351564.0, "step": 1528 }, { "epoch": 1.9070492825951342, "grad_norm": 0.11401020747834047, "learning_rate": 3.7954347124591395e-06, "loss": 0.0344, "num_tokens": 122431176.0, "step": 1529 }, { "epoch": 1.908296943231441, "grad_norm": 0.10875035338383428, "learning_rate": 3.7898281629435286e-06, "loss": 0.0348, "num_tokens": 122512805.0, "step": 1530 }, { "epoch": 1.909544603867748, "grad_norm": 0.1104918835133121, "learning_rate": 3.7842247144863686e-06, "loss": 0.0321, "num_tokens": 122592405.0, "step": 1531 }, { "epoch": 1.910792264504055, "grad_norm": 0.10420246611454374, "learning_rate": 3.778624377248409e-06, "loss": 0.0336, "num_tokens": 122672154.0, "step": 1532 }, { "epoch": 1.9120399251403617, "grad_norm": 0.10975968736912756, "learning_rate": 3.77302716138475e-06, "loss": 0.0355, "num_tokens": 122753896.0, "step": 1533 }, { "epoch": 1.913287585776669, "grad_norm": 0.11562096811664528, "learning_rate": 3.7674330770448374e-06, "loss": 0.0339, "num_tokens": 122833339.0, "step": 1534 }, { "epoch": 1.9145352464129757, "grad_norm": 0.09559889402594825, "learning_rate": 3.7618421343724386e-06, "loss": 0.0343, "num_tokens": 122913131.0, "step": 1535 }, { "epoch": 1.9157829070492824, "grad_norm": 0.12632567082780488, "learning_rate": 3.756254343505621e-06, "loss": 0.0326, "num_tokens": 122992473.0, "step": 1536 }, { "epoch": 1.9170305676855897, "grad_norm": 0.10173152865169789, "learning_rate": 3.7506697145767367e-06, "loss": 0.0334, "num_tokens": 123072318.0, "step": 1537 }, { "epoch": 1.9182782283218964, "grad_norm": 0.11406574908341054, "learning_rate": 3.745088257712408e-06, "loss": 0.0337, "num_tokens": 123151348.0, "step": 1538 }, { "epoch": 1.9195258889582034, "grad_norm": 0.10892304795841887, "learning_rate": 3.7395099830335034e-06, "loss": 0.0342, "num_tokens": 123231936.0, "step": 1539 }, { "epoch": 1.9207735495945104, "grad_norm": 0.10694417012259536, "learning_rate": 3.7339349006551193e-06, "loss": 0.0337, "num_tokens": 123312950.0, "step": 1540 }, { "epoch": 1.9220212102308172, "grad_norm": 0.10017582315819759, "learning_rate": 3.7283630206865696e-06, "loss": 0.0333, "num_tokens": 123392780.0, "step": 1541 }, { "epoch": 1.9232688708671242, "grad_norm": 0.10747239898877138, "learning_rate": 3.7227943532313504e-06, "loss": 0.0341, "num_tokens": 123472235.0, "step": 1542 }, { "epoch": 1.9245165315034312, "grad_norm": 0.11544815418654572, "learning_rate": 3.7172289083871436e-06, "loss": 0.0362, "num_tokens": 123552579.0, "step": 1543 }, { "epoch": 1.925764192139738, "grad_norm": 0.11739967817563109, "learning_rate": 3.7116666962457813e-06, "loss": 0.033, "num_tokens": 123631233.0, "step": 1544 }, { "epoch": 1.927011852776045, "grad_norm": 0.10369089138250741, "learning_rate": 3.7061077268932333e-06, "loss": 0.0344, "num_tokens": 123711026.0, "step": 1545 }, { "epoch": 1.928259513412352, "grad_norm": 0.11479151752423149, "learning_rate": 3.700552010409596e-06, "loss": 0.0358, "num_tokens": 123790909.0, "step": 1546 }, { "epoch": 1.9295071740486587, "grad_norm": 0.1028773833805819, "learning_rate": 3.694999556869059e-06, "loss": 0.0351, "num_tokens": 123872098.0, "step": 1547 }, { "epoch": 1.9307548346849657, "grad_norm": 0.10697413999677229, "learning_rate": 3.6894503763399003e-06, "loss": 0.033, "num_tokens": 123952070.0, "step": 1548 }, { "epoch": 1.9320024953212727, "grad_norm": 0.10029417018146669, "learning_rate": 3.683904478884461e-06, "loss": 0.0324, "num_tokens": 124032234.0, "step": 1549 }, { "epoch": 1.9332501559575794, "grad_norm": 0.11027618652006071, "learning_rate": 3.67836187455913e-06, "loss": 0.0326, "num_tokens": 124111121.0, "step": 1550 }, { "epoch": 1.9344978165938864, "grad_norm": 0.1056614113686575, "learning_rate": 3.672822573414323e-06, "loss": 0.0369, "num_tokens": 124191867.0, "step": 1551 }, { "epoch": 1.9357454772301934, "grad_norm": 0.11091294820691895, "learning_rate": 3.6672865854944673e-06, "loss": 0.0356, "num_tokens": 124272599.0, "step": 1552 }, { "epoch": 1.9369931378665002, "grad_norm": 0.1084522698863579, "learning_rate": 3.6617539208379836e-06, "loss": 0.0336, "num_tokens": 124352094.0, "step": 1553 }, { "epoch": 1.9382407985028074, "grad_norm": 0.10585047276937648, "learning_rate": 3.656224589477264e-06, "loss": 0.0352, "num_tokens": 124433325.0, "step": 1554 }, { "epoch": 1.9394884591391142, "grad_norm": 0.10935995370656627, "learning_rate": 3.65069860143866e-06, "loss": 0.0312, "num_tokens": 124511945.0, "step": 1555 }, { "epoch": 1.940736119775421, "grad_norm": 0.10900465709590437, "learning_rate": 3.645175966742456e-06, "loss": 0.0355, "num_tokens": 124592313.0, "step": 1556 }, { "epoch": 1.9419837804117281, "grad_norm": 0.11473015794303712, "learning_rate": 3.639656695402858e-06, "loss": 0.0349, "num_tokens": 124672598.0, "step": 1557 }, { "epoch": 1.943231441048035, "grad_norm": 0.10421687435100782, "learning_rate": 3.634140797427974e-06, "loss": 0.0343, "num_tokens": 124752029.0, "step": 1558 }, { "epoch": 1.944479101684342, "grad_norm": 0.11638448293053776, "learning_rate": 3.6286282828197904e-06, "loss": 0.0361, "num_tokens": 124832634.0, "step": 1559 }, { "epoch": 1.945726762320649, "grad_norm": 0.11391845107383987, "learning_rate": 3.623119161574169e-06, "loss": 0.0327, "num_tokens": 124912364.0, "step": 1560 }, { "epoch": 1.9469744229569557, "grad_norm": 0.10616429489708827, "learning_rate": 3.6176134436808074e-06, "loss": 0.0344, "num_tokens": 124991270.0, "step": 1561 }, { "epoch": 1.9482220835932627, "grad_norm": 0.11167759034779555, "learning_rate": 3.612111139123239e-06, "loss": 0.0348, "num_tokens": 125070833.0, "step": 1562 }, { "epoch": 1.9494697442295696, "grad_norm": 0.1127208715547816, "learning_rate": 3.6066122578788033e-06, "loss": 0.0381, "num_tokens": 125152435.0, "step": 1563 }, { "epoch": 1.9507174048658764, "grad_norm": 0.10729694888144387, "learning_rate": 3.6011168099186322e-06, "loss": 0.0335, "num_tokens": 125233146.0, "step": 1564 }, { "epoch": 1.9519650655021834, "grad_norm": 0.11577479373886601, "learning_rate": 3.5956248052076383e-06, "loss": 0.0332, "num_tokens": 125312477.0, "step": 1565 }, { "epoch": 1.9532127261384904, "grad_norm": 0.10816544241229983, "learning_rate": 3.5901362537044826e-06, "loss": 0.0353, "num_tokens": 125393204.0, "step": 1566 }, { "epoch": 1.9544603867747972, "grad_norm": 0.1144921783827139, "learning_rate": 3.584651165361568e-06, "loss": 0.0339, "num_tokens": 125473311.0, "step": 1567 }, { "epoch": 1.9557080474111042, "grad_norm": 0.1023826741374561, "learning_rate": 3.579169550125019e-06, "loss": 0.0314, "num_tokens": 125553173.0, "step": 1568 }, { "epoch": 1.9569557080474111, "grad_norm": 0.1081903875181614, "learning_rate": 3.5736914179346626e-06, "loss": 0.0359, "num_tokens": 125633344.0, "step": 1569 }, { "epoch": 1.958203368683718, "grad_norm": 0.11814223600506968, "learning_rate": 3.5682167787240053e-06, "loss": 0.0333, "num_tokens": 125713700.0, "step": 1570 }, { "epoch": 1.959451029320025, "grad_norm": 0.11106552128597881, "learning_rate": 3.5627456424202223e-06, "loss": 0.0336, "num_tokens": 125793802.0, "step": 1571 }, { "epoch": 1.960698689956332, "grad_norm": 0.11282020876952419, "learning_rate": 3.55727801894414e-06, "loss": 0.0328, "num_tokens": 125873850.0, "step": 1572 }, { "epoch": 1.9619463505926387, "grad_norm": 0.10618294835959388, "learning_rate": 3.5518139182102106e-06, "loss": 0.033, "num_tokens": 125953640.0, "step": 1573 }, { "epoch": 1.9631940112289459, "grad_norm": 0.09834808850932374, "learning_rate": 3.5463533501265e-06, "loss": 0.032, "num_tokens": 126033564.0, "step": 1574 }, { "epoch": 1.9644416718652526, "grad_norm": 0.11284124916966394, "learning_rate": 3.5408963245946714e-06, "loss": 0.0348, "num_tokens": 126114330.0, "step": 1575 }, { "epoch": 1.9656893325015594, "grad_norm": 0.09978958639176037, "learning_rate": 3.53544285150996e-06, "loss": 0.0329, "num_tokens": 126194064.0, "step": 1576 }, { "epoch": 1.9669369931378666, "grad_norm": 0.11717035362702442, "learning_rate": 3.529992940761159e-06, "loss": 0.0397, "num_tokens": 126274303.0, "step": 1577 }, { "epoch": 1.9681846537741734, "grad_norm": 0.11953639846307562, "learning_rate": 3.524546602230606e-06, "loss": 0.0351, "num_tokens": 126355527.0, "step": 1578 }, { "epoch": 1.9694323144104804, "grad_norm": 0.10799542403102128, "learning_rate": 3.5191038457941596e-06, "loss": 0.0324, "num_tokens": 126434438.0, "step": 1579 }, { "epoch": 1.9706799750467874, "grad_norm": 0.11164008102679486, "learning_rate": 3.5136646813211784e-06, "loss": 0.0338, "num_tokens": 126513806.0, "step": 1580 }, { "epoch": 1.9719276356830941, "grad_norm": 0.11042889340863964, "learning_rate": 3.5082291186745145e-06, "loss": 0.0342, "num_tokens": 126594160.0, "step": 1581 }, { "epoch": 1.9731752963194011, "grad_norm": 0.11255018277654384, "learning_rate": 3.5027971677104867e-06, "loss": 0.0349, "num_tokens": 126674625.0, "step": 1582 }, { "epoch": 1.9744229569557081, "grad_norm": 0.11010249880686576, "learning_rate": 3.497368838278862e-06, "loss": 0.0343, "num_tokens": 126754334.0, "step": 1583 }, { "epoch": 1.975670617592015, "grad_norm": 0.10466844945374801, "learning_rate": 3.491944140222845e-06, "loss": 0.0327, "num_tokens": 126834485.0, "step": 1584 }, { "epoch": 1.976918278228322, "grad_norm": 0.10973920592988463, "learning_rate": 3.486523083379051e-06, "loss": 0.0336, "num_tokens": 126913919.0, "step": 1585 }, { "epoch": 1.9781659388646289, "grad_norm": 0.09624968559215073, "learning_rate": 3.481105677577493e-06, "loss": 0.0322, "num_tokens": 126992768.0, "step": 1586 }, { "epoch": 1.9794135995009356, "grad_norm": 0.10348850398918999, "learning_rate": 3.475691932641569e-06, "loss": 0.0327, "num_tokens": 127073558.0, "step": 1587 }, { "epoch": 1.9806612601372426, "grad_norm": 0.11008522746530837, "learning_rate": 3.4702818583880305e-06, "loss": 0.0329, "num_tokens": 127154390.0, "step": 1588 }, { "epoch": 1.9819089207735496, "grad_norm": 0.10576324521968579, "learning_rate": 3.46487546462698e-06, "loss": 0.0335, "num_tokens": 127233684.0, "step": 1589 }, { "epoch": 1.9831565814098564, "grad_norm": 0.10909609236650647, "learning_rate": 3.4594727611618462e-06, "loss": 0.0354, "num_tokens": 127314072.0, "step": 1590 }, { "epoch": 1.9844042420461634, "grad_norm": 0.10226819882059832, "learning_rate": 3.454073757789359e-06, "loss": 0.0344, "num_tokens": 127393809.0, "step": 1591 }, { "epoch": 1.9856519026824704, "grad_norm": 0.11124791645732714, "learning_rate": 3.4486784642995442e-06, "loss": 0.0338, "num_tokens": 127474232.0, "step": 1592 }, { "epoch": 1.9868995633187772, "grad_norm": 0.10680204301961628, "learning_rate": 3.4432868904757024e-06, "loss": 0.0342, "num_tokens": 127554705.0, "step": 1593 }, { "epoch": 1.9881472239550844, "grad_norm": 0.10744418843654158, "learning_rate": 3.437899046094384e-06, "loss": 0.0334, "num_tokens": 127634236.0, "step": 1594 }, { "epoch": 1.9893948845913911, "grad_norm": 0.10963223394836877, "learning_rate": 3.432514940925378e-06, "loss": 0.0344, "num_tokens": 127714557.0, "step": 1595 }, { "epoch": 1.990642545227698, "grad_norm": 0.11012697826983461, "learning_rate": 3.4271345847316974e-06, "loss": 0.0364, "num_tokens": 127795159.0, "step": 1596 }, { "epoch": 1.9918902058640051, "grad_norm": 0.11016272568594698, "learning_rate": 3.421757987269554e-06, "loss": 0.0362, "num_tokens": 127875081.0, "step": 1597 }, { "epoch": 1.9931378665003119, "grad_norm": 0.10065770585753678, "learning_rate": 3.416385158288343e-06, "loss": 0.0327, "num_tokens": 127954573.0, "step": 1598 }, { "epoch": 1.9943855271366189, "grad_norm": 0.10974734302658783, "learning_rate": 3.411016107530628e-06, "loss": 0.033, "num_tokens": 128034668.0, "step": 1599 }, { "epoch": 1.9956331877729259, "grad_norm": 0.11061221640528077, "learning_rate": 3.405650844732122e-06, "loss": 0.0351, "num_tokens": 128114461.0, "step": 1600 }, { "epoch": 1.9968808484092326, "grad_norm": 0.09898172330167038, "learning_rate": 3.400289379621664e-06, "loss": 0.0334, "num_tokens": 128194681.0, "step": 1601 }, { "epoch": 1.9981285090455396, "grad_norm": 0.10720532173976896, "learning_rate": 3.394931721921214e-06, "loss": 0.0323, "num_tokens": 128274005.0, "step": 1602 }, { "epoch": 1.9993761696818466, "grad_norm": 0.10715909655170884, "learning_rate": 3.3895778813458256e-06, "loss": 0.0339, "num_tokens": 128353693.0, "step": 1603 }, { "epoch": 2.0, "grad_norm": 0.15351115677037117, "learning_rate": 3.3842278676036293e-06, "loss": 0.0295, "num_tokens": 128394204.0, "step": 1604 }, { "epoch": 2.0012476606363068, "grad_norm": 0.09668414870141348, "learning_rate": 3.3788816903958145e-06, "loss": 0.0292, "num_tokens": 128474132.0, "step": 1605 }, { "epoch": 2.002495321272614, "grad_norm": 0.09284121837219293, "learning_rate": 3.37353935941662e-06, "loss": 0.0277, "num_tokens": 128554051.0, "step": 1606 }, { "epoch": 2.0037429819089208, "grad_norm": 0.09339384544922638, "learning_rate": 3.3682008843533055e-06, "loss": 0.029, "num_tokens": 128634209.0, "step": 1607 }, { "epoch": 2.0049906425452275, "grad_norm": 0.09435077752847291, "learning_rate": 3.3628662748861374e-06, "loss": 0.0282, "num_tokens": 128715069.0, "step": 1608 }, { "epoch": 2.0062383031815347, "grad_norm": 0.09614281235112, "learning_rate": 3.357535540688379e-06, "loss": 0.0278, "num_tokens": 128795504.0, "step": 1609 }, { "epoch": 2.0074859638178415, "grad_norm": 0.09264669561245602, "learning_rate": 3.3522086914262585e-06, "loss": 0.0271, "num_tokens": 128876123.0, "step": 1610 }, { "epoch": 2.0087336244541483, "grad_norm": 0.11140936885389302, "learning_rate": 3.3468857367589665e-06, "loss": 0.0275, "num_tokens": 128955991.0, "step": 1611 }, { "epoch": 2.0099812850904555, "grad_norm": 0.09169587047179813, "learning_rate": 3.3415666863386298e-06, "loss": 0.0275, "num_tokens": 129039016.0, "step": 1612 }, { "epoch": 2.0112289457267623, "grad_norm": 0.1042070178139685, "learning_rate": 3.3362515498102934e-06, "loss": 0.0275, "num_tokens": 129120041.0, "step": 1613 }, { "epoch": 2.0124766063630695, "grad_norm": 0.11233635245764659, "learning_rate": 3.330940336811903e-06, "loss": 0.0281, "num_tokens": 129200867.0, "step": 1614 }, { "epoch": 2.0137242669993762, "grad_norm": 0.09294626630897553, "learning_rate": 3.325633056974298e-06, "loss": 0.0259, "num_tokens": 129279446.0, "step": 1615 }, { "epoch": 2.014971927635683, "grad_norm": 0.10425624673925903, "learning_rate": 3.3203297199211794e-06, "loss": 0.0273, "num_tokens": 129359517.0, "step": 1616 }, { "epoch": 2.01621958827199, "grad_norm": 0.10772022702288936, "learning_rate": 3.315030335269096e-06, "loss": 0.0272, "num_tokens": 129439678.0, "step": 1617 }, { "epoch": 2.017467248908297, "grad_norm": 0.11804003684908118, "learning_rate": 3.309734912627441e-06, "loss": 0.0282, "num_tokens": 129519376.0, "step": 1618 }, { "epoch": 2.0187149095446038, "grad_norm": 0.12381607955705823, "learning_rate": 3.304443461598413e-06, "loss": 0.0288, "num_tokens": 129600651.0, "step": 1619 }, { "epoch": 2.019962570180911, "grad_norm": 0.13319111452989643, "learning_rate": 3.299155991777011e-06, "loss": 0.0297, "num_tokens": 129680386.0, "step": 1620 }, { "epoch": 2.0212102308172177, "grad_norm": 0.11432348701501038, "learning_rate": 3.2938725127510185e-06, "loss": 0.0282, "num_tokens": 129760859.0, "step": 1621 }, { "epoch": 2.0224578914535245, "grad_norm": 0.10791791050792644, "learning_rate": 3.2885930341009774e-06, "loss": 0.0277, "num_tokens": 129840051.0, "step": 1622 }, { "epoch": 2.0237055520898317, "grad_norm": 0.11686951819864136, "learning_rate": 3.2833175654001787e-06, "loss": 0.0283, "num_tokens": 129919701.0, "step": 1623 }, { "epoch": 2.0249532127261385, "grad_norm": 0.11359351664175828, "learning_rate": 3.278046116214642e-06, "loss": 0.0269, "num_tokens": 129999877.0, "step": 1624 }, { "epoch": 2.0262008733624453, "grad_norm": 0.1023939359316259, "learning_rate": 3.272778696103099e-06, "loss": 0.0275, "num_tokens": 130079968.0, "step": 1625 }, { "epoch": 2.0274485339987525, "grad_norm": 0.10892544064999989, "learning_rate": 3.2675153146169736e-06, "loss": 0.0275, "num_tokens": 130160624.0, "step": 1626 }, { "epoch": 2.0286961946350592, "grad_norm": 0.10430749700068655, "learning_rate": 3.2622559813003684e-06, "loss": 0.0288, "num_tokens": 130239925.0, "step": 1627 }, { "epoch": 2.029943855271366, "grad_norm": 0.10317672945567567, "learning_rate": 3.2570007056900437e-06, "loss": 0.0271, "num_tokens": 130320799.0, "step": 1628 }, { "epoch": 2.031191515907673, "grad_norm": 0.10963736701978923, "learning_rate": 3.2517494973154008e-06, "loss": 0.0277, "num_tokens": 130400099.0, "step": 1629 }, { "epoch": 2.03243917654398, "grad_norm": 0.12238499931140737, "learning_rate": 3.2465023656984707e-06, "loss": 0.0292, "num_tokens": 130480729.0, "step": 1630 }, { "epoch": 2.0336868371802868, "grad_norm": 0.10971492853164319, "learning_rate": 3.2412593203538857e-06, "loss": 0.0297, "num_tokens": 130560371.0, "step": 1631 }, { "epoch": 2.034934497816594, "grad_norm": 0.1230436141198275, "learning_rate": 3.236020370788876e-06, "loss": 0.0273, "num_tokens": 130639731.0, "step": 1632 }, { "epoch": 2.0361821584529007, "grad_norm": 0.10170244549018313, "learning_rate": 3.230785526503236e-06, "loss": 0.0269, "num_tokens": 130719595.0, "step": 1633 }, { "epoch": 2.037429819089208, "grad_norm": 0.11160175846037224, "learning_rate": 3.225554796989325e-06, "loss": 0.0279, "num_tokens": 130800181.0, "step": 1634 }, { "epoch": 2.0386774797255147, "grad_norm": 0.09813935826310065, "learning_rate": 3.2203281917320328e-06, "loss": 0.0261, "num_tokens": 130879513.0, "step": 1635 }, { "epoch": 2.0399251403618215, "grad_norm": 0.11117192931618461, "learning_rate": 3.2151057202087783e-06, "loss": 0.0283, "num_tokens": 130958850.0, "step": 1636 }, { "epoch": 2.0411728009981287, "grad_norm": 0.10698263215463055, "learning_rate": 3.209887391889479e-06, "loss": 0.027, "num_tokens": 131038985.0, "step": 1637 }, { "epoch": 2.0424204616344355, "grad_norm": 0.10820076619681371, "learning_rate": 3.204673216236539e-06, "loss": 0.0275, "num_tokens": 131118757.0, "step": 1638 }, { "epoch": 2.0436681222707422, "grad_norm": 0.11467484503330413, "learning_rate": 3.199463202704838e-06, "loss": 0.0273, "num_tokens": 131201221.0, "step": 1639 }, { "epoch": 2.0449157829070495, "grad_norm": 0.12247988636033476, "learning_rate": 3.194257360741706e-06, "loss": 0.0288, "num_tokens": 131282719.0, "step": 1640 }, { "epoch": 2.046163443543356, "grad_norm": 0.1057593800426446, "learning_rate": 3.189055699786906e-06, "loss": 0.0274, "num_tokens": 131362232.0, "step": 1641 }, { "epoch": 2.047411104179663, "grad_norm": 0.12364375320242353, "learning_rate": 3.1838582292726206e-06, "loss": 0.0289, "num_tokens": 131442046.0, "step": 1642 }, { "epoch": 2.04865876481597, "grad_norm": 0.1145129297133959, "learning_rate": 3.1786649586234373e-06, "loss": 0.0282, "num_tokens": 131523135.0, "step": 1643 }, { "epoch": 2.049906425452277, "grad_norm": 0.10215949526592619, "learning_rate": 3.173475897256325e-06, "loss": 0.0266, "num_tokens": 131603284.0, "step": 1644 }, { "epoch": 2.0511540860885837, "grad_norm": 0.10761560661797925, "learning_rate": 3.1682910545806167e-06, "loss": 0.028, "num_tokens": 131683908.0, "step": 1645 }, { "epoch": 2.052401746724891, "grad_norm": 0.11121482292743783, "learning_rate": 3.1631104399980053e-06, "loss": 0.0267, "num_tokens": 131762852.0, "step": 1646 }, { "epoch": 2.0536494073611977, "grad_norm": 0.11153961622318409, "learning_rate": 3.157934062902508e-06, "loss": 0.0276, "num_tokens": 131842654.0, "step": 1647 }, { "epoch": 2.0548970679975045, "grad_norm": 0.11202792323594137, "learning_rate": 3.1527619326804594e-06, "loss": 0.0278, "num_tokens": 131921403.0, "step": 1648 }, { "epoch": 2.0561447286338117, "grad_norm": 0.11878747584208416, "learning_rate": 3.147594058710498e-06, "loss": 0.0279, "num_tokens": 132003424.0, "step": 1649 }, { "epoch": 2.0573923892701185, "grad_norm": 0.11600844083004452, "learning_rate": 3.14243045036354e-06, "loss": 0.0276, "num_tokens": 132082840.0, "step": 1650 }, { "epoch": 2.0586400499064252, "grad_norm": 0.11293539536069673, "learning_rate": 3.1372711170027666e-06, "loss": 0.0275, "num_tokens": 132162541.0, "step": 1651 }, { "epoch": 2.0598877105427325, "grad_norm": 0.11026505266606129, "learning_rate": 3.13211606798361e-06, "loss": 0.0274, "num_tokens": 132242631.0, "step": 1652 }, { "epoch": 2.061135371179039, "grad_norm": 0.12188319048183703, "learning_rate": 3.1269653126537344e-06, "loss": 0.0278, "num_tokens": 132323488.0, "step": 1653 }, { "epoch": 2.0623830318153464, "grad_norm": 0.12082041385469403, "learning_rate": 3.121818860353011e-06, "loss": 0.0275, "num_tokens": 132403983.0, "step": 1654 }, { "epoch": 2.063630692451653, "grad_norm": 0.10629866280671935, "learning_rate": 3.116676720413519e-06, "loss": 0.0267, "num_tokens": 132483686.0, "step": 1655 }, { "epoch": 2.06487835308796, "grad_norm": 0.10618934252287057, "learning_rate": 3.11153890215951e-06, "loss": 0.028, "num_tokens": 132562728.0, "step": 1656 }, { "epoch": 2.066126013724267, "grad_norm": 0.1179037811248608, "learning_rate": 3.1064054149073984e-06, "loss": 0.0287, "num_tokens": 132643301.0, "step": 1657 }, { "epoch": 2.067373674360574, "grad_norm": 0.1068883102584125, "learning_rate": 3.1012762679657525e-06, "loss": 0.0271, "num_tokens": 132722314.0, "step": 1658 }, { "epoch": 2.0686213349968807, "grad_norm": 0.10906974578938657, "learning_rate": 3.0961514706352654e-06, "loss": 0.028, "num_tokens": 132801325.0, "step": 1659 }, { "epoch": 2.069868995633188, "grad_norm": 0.11050438055442657, "learning_rate": 3.09103103220874e-06, "loss": 0.0273, "num_tokens": 132881040.0, "step": 1660 }, { "epoch": 2.0711166562694947, "grad_norm": 0.11682521580597607, "learning_rate": 3.085914961971082e-06, "loss": 0.0283, "num_tokens": 132960595.0, "step": 1661 }, { "epoch": 2.0723643169058015, "grad_norm": 0.11450793460923425, "learning_rate": 3.080803269199275e-06, "loss": 0.0277, "num_tokens": 133039931.0, "step": 1662 }, { "epoch": 2.0736119775421087, "grad_norm": 0.11467759010106195, "learning_rate": 3.0756959631623583e-06, "loss": 0.0276, "num_tokens": 133120290.0, "step": 1663 }, { "epoch": 2.0748596381784155, "grad_norm": 0.11713560955703137, "learning_rate": 3.0705930531214255e-06, "loss": 0.0284, "num_tokens": 133201093.0, "step": 1664 }, { "epoch": 2.0761072988147222, "grad_norm": 0.11798027964717171, "learning_rate": 3.065494548329594e-06, "loss": 0.0283, "num_tokens": 133280732.0, "step": 1665 }, { "epoch": 2.0773549594510294, "grad_norm": 0.11006281058166327, "learning_rate": 3.060400458031991e-06, "loss": 0.0267, "num_tokens": 133360752.0, "step": 1666 }, { "epoch": 2.078602620087336, "grad_norm": 0.10693659229092088, "learning_rate": 3.055310791465744e-06, "loss": 0.0281, "num_tokens": 133440361.0, "step": 1667 }, { "epoch": 2.079850280723643, "grad_norm": 0.10276904415759595, "learning_rate": 3.0502255578599594e-06, "loss": 0.0266, "num_tokens": 133520074.0, "step": 1668 }, { "epoch": 2.08109794135995, "grad_norm": 0.10682226267908494, "learning_rate": 3.0451447664357005e-06, "loss": 0.0274, "num_tokens": 133599454.0, "step": 1669 }, { "epoch": 2.082345601996257, "grad_norm": 0.11642340085158942, "learning_rate": 3.040068426405976e-06, "loss": 0.028, "num_tokens": 133679535.0, "step": 1670 }, { "epoch": 2.083593262632564, "grad_norm": 0.10439274580853032, "learning_rate": 3.0349965469757283e-06, "loss": 0.0276, "num_tokens": 133759276.0, "step": 1671 }, { "epoch": 2.084840923268871, "grad_norm": 0.10178900758944974, "learning_rate": 3.0299291373418038e-06, "loss": 0.0271, "num_tokens": 133839238.0, "step": 1672 }, { "epoch": 2.0860885839051777, "grad_norm": 0.10108576053246532, "learning_rate": 3.024866206692953e-06, "loss": 0.0266, "num_tokens": 133918423.0, "step": 1673 }, { "epoch": 2.087336244541485, "grad_norm": 0.10890004660532689, "learning_rate": 3.0198077642097945e-06, "loss": 0.0281, "num_tokens": 133999314.0, "step": 1674 }, { "epoch": 2.0885839051777917, "grad_norm": 0.11949459892569592, "learning_rate": 3.014753819064817e-06, "loss": 0.0285, "num_tokens": 134079417.0, "step": 1675 }, { "epoch": 2.0898315658140985, "grad_norm": 0.11019307261367436, "learning_rate": 3.009704380422348e-06, "loss": 0.0268, "num_tokens": 134159336.0, "step": 1676 }, { "epoch": 2.0910792264504057, "grad_norm": 0.1094076338739407, "learning_rate": 3.004659457438548e-06, "loss": 0.0282, "num_tokens": 134238687.0, "step": 1677 }, { "epoch": 2.0923268870867124, "grad_norm": 0.11203035809174719, "learning_rate": 2.999619059261387e-06, "loss": 0.0277, "num_tokens": 134319431.0, "step": 1678 }, { "epoch": 2.093574547723019, "grad_norm": 0.10125109772296438, "learning_rate": 2.9945831950306285e-06, "loss": 0.0267, "num_tokens": 134399747.0, "step": 1679 }, { "epoch": 2.0948222083593264, "grad_norm": 0.10479161190075127, "learning_rate": 2.9895518738778196e-06, "loss": 0.0271, "num_tokens": 134479438.0, "step": 1680 }, { "epoch": 2.096069868995633, "grad_norm": 0.11262694348153912, "learning_rate": 2.984525104926262e-06, "loss": 0.0285, "num_tokens": 134559824.0, "step": 1681 }, { "epoch": 2.09731752963194, "grad_norm": 0.11572630284775189, "learning_rate": 2.97950289729101e-06, "loss": 0.0286, "num_tokens": 134639963.0, "step": 1682 }, { "epoch": 2.098565190268247, "grad_norm": 0.11746731799240802, "learning_rate": 2.974485260078846e-06, "loss": 0.0277, "num_tokens": 134719925.0, "step": 1683 }, { "epoch": 2.099812850904554, "grad_norm": 0.121581880657992, "learning_rate": 2.9694722023882607e-06, "loss": 0.0269, "num_tokens": 134799277.0, "step": 1684 }, { "epoch": 2.1010605115408607, "grad_norm": 0.11598556908383914, "learning_rate": 2.9644637333094404e-06, "loss": 0.0284, "num_tokens": 134879892.0, "step": 1685 }, { "epoch": 2.102308172177168, "grad_norm": 0.14223573137275083, "learning_rate": 2.959459861924258e-06, "loss": 0.028, "num_tokens": 134959367.0, "step": 1686 }, { "epoch": 2.1035558328134747, "grad_norm": 0.12003504700797399, "learning_rate": 2.954460597306242e-06, "loss": 0.0291, "num_tokens": 135039815.0, "step": 1687 }, { "epoch": 2.1048034934497815, "grad_norm": 0.12915254648172153, "learning_rate": 2.9494659485205683e-06, "loss": 0.0459, "num_tokens": 135121468.0, "step": 1688 }, { "epoch": 2.1060511540860887, "grad_norm": 0.10985724087657672, "learning_rate": 2.9444759246240505e-06, "loss": 0.0271, "num_tokens": 135200652.0, "step": 1689 }, { "epoch": 2.1072988147223954, "grad_norm": 0.11936091256114177, "learning_rate": 2.939490534665107e-06, "loss": 0.0278, "num_tokens": 135281590.0, "step": 1690 }, { "epoch": 2.108546475358702, "grad_norm": 0.11222455840445227, "learning_rate": 2.934509787683755e-06, "loss": 0.0281, "num_tokens": 135361375.0, "step": 1691 }, { "epoch": 2.1097941359950094, "grad_norm": 0.11147040706020958, "learning_rate": 2.929533692711598e-06, "loss": 0.0274, "num_tokens": 135442925.0, "step": 1692 }, { "epoch": 2.111041796631316, "grad_norm": 0.10467852886453667, "learning_rate": 2.9245622587717982e-06, "loss": 0.0275, "num_tokens": 135523906.0, "step": 1693 }, { "epoch": 2.1122894572676234, "grad_norm": 0.10869697183862845, "learning_rate": 2.919595494879065e-06, "loss": 0.0276, "num_tokens": 135603783.0, "step": 1694 }, { "epoch": 2.11353711790393, "grad_norm": 0.1155482691293836, "learning_rate": 2.9146334100396474e-06, "loss": 0.0282, "num_tokens": 135684084.0, "step": 1695 }, { "epoch": 2.114784778540237, "grad_norm": 0.11252805380717851, "learning_rate": 2.9096760132513036e-06, "loss": 0.0286, "num_tokens": 135765048.0, "step": 1696 }, { "epoch": 2.116032439176544, "grad_norm": 0.11033818934026106, "learning_rate": 2.9047233135032927e-06, "loss": 0.0275, "num_tokens": 135845335.0, "step": 1697 }, { "epoch": 2.117280099812851, "grad_norm": 0.13859448472332372, "learning_rate": 2.8997753197763532e-06, "loss": 0.03, "num_tokens": 135925643.0, "step": 1698 }, { "epoch": 2.1185277604491577, "grad_norm": 0.11224407005748382, "learning_rate": 2.894832041042699e-06, "loss": 0.0288, "num_tokens": 136005781.0, "step": 1699 }, { "epoch": 2.119775421085465, "grad_norm": 0.10019855101966049, "learning_rate": 2.8898934862659823e-06, "loss": 0.0266, "num_tokens": 136085338.0, "step": 1700 }, { "epoch": 2.1210230817217717, "grad_norm": 0.12238851973022535, "learning_rate": 2.8849596644013e-06, "loss": 0.0277, "num_tokens": 136166090.0, "step": 1701 }, { "epoch": 2.1222707423580784, "grad_norm": 0.11577197457908402, "learning_rate": 2.880030584395162e-06, "loss": 0.0281, "num_tokens": 136246455.0, "step": 1702 }, { "epoch": 2.1235184029943857, "grad_norm": 0.11575314696378225, "learning_rate": 2.8751062551854775e-06, "loss": 0.0292, "num_tokens": 136326020.0, "step": 1703 }, { "epoch": 2.1247660636306924, "grad_norm": 0.1236602811227467, "learning_rate": 2.870186685701545e-06, "loss": 0.0278, "num_tokens": 136405902.0, "step": 1704 }, { "epoch": 2.126013724266999, "grad_norm": 0.10698357238776467, "learning_rate": 2.8652718848640337e-06, "loss": 0.0271, "num_tokens": 136484304.0, "step": 1705 }, { "epoch": 2.1272613849033064, "grad_norm": 0.11565417571999578, "learning_rate": 2.8603618615849603e-06, "loss": 0.0289, "num_tokens": 136563783.0, "step": 1706 }, { "epoch": 2.128509045539613, "grad_norm": 0.11178198694056778, "learning_rate": 2.8554566247676806e-06, "loss": 0.027, "num_tokens": 136643585.0, "step": 1707 }, { "epoch": 2.12975670617592, "grad_norm": 0.10484411194496365, "learning_rate": 2.850556183306874e-06, "loss": 0.0277, "num_tokens": 136723910.0, "step": 1708 }, { "epoch": 2.131004366812227, "grad_norm": 0.11008252356620116, "learning_rate": 2.845660546088519e-06, "loss": 0.0271, "num_tokens": 136802746.0, "step": 1709 }, { "epoch": 2.132252027448534, "grad_norm": 0.1244585389706024, "learning_rate": 2.8407697219898865e-06, "loss": 0.0277, "num_tokens": 136883051.0, "step": 1710 }, { "epoch": 2.133499688084841, "grad_norm": 0.11761109934284099, "learning_rate": 2.8358837198795223e-06, "loss": 0.0285, "num_tokens": 136963474.0, "step": 1711 }, { "epoch": 2.134747348721148, "grad_norm": 0.11248525039399156, "learning_rate": 2.8310025486172223e-06, "loss": 0.0288, "num_tokens": 137043833.0, "step": 1712 }, { "epoch": 2.1359950093574547, "grad_norm": 0.11548415099531208, "learning_rate": 2.8261262170540242e-06, "loss": 0.0274, "num_tokens": 137123180.0, "step": 1713 }, { "epoch": 2.137242669993762, "grad_norm": 0.10709230843633225, "learning_rate": 2.821254734032194e-06, "loss": 0.0275, "num_tokens": 137203580.0, "step": 1714 }, { "epoch": 2.1384903306300687, "grad_norm": 0.11759009663698383, "learning_rate": 2.8163881083852e-06, "loss": 0.0299, "num_tokens": 137284710.0, "step": 1715 }, { "epoch": 2.1397379912663754, "grad_norm": 0.10669561409225682, "learning_rate": 2.811526348937706e-06, "loss": 0.0275, "num_tokens": 137363810.0, "step": 1716 }, { "epoch": 2.1409856519026826, "grad_norm": 0.10185154190042917, "learning_rate": 2.806669464505552e-06, "loss": 0.0274, "num_tokens": 137443227.0, "step": 1717 }, { "epoch": 2.1422333125389894, "grad_norm": 0.11279381093375922, "learning_rate": 2.80181746389574e-06, "loss": 0.0277, "num_tokens": 137522857.0, "step": 1718 }, { "epoch": 2.143480973175296, "grad_norm": 0.11315608721853433, "learning_rate": 2.7969703559064076e-06, "loss": 0.0278, "num_tokens": 137602845.0, "step": 1719 }, { "epoch": 2.1447286338116034, "grad_norm": 0.11922283733338566, "learning_rate": 2.792128149326833e-06, "loss": 0.0289, "num_tokens": 137683630.0, "step": 1720 }, { "epoch": 2.14597629444791, "grad_norm": 0.115407862616771, "learning_rate": 2.7872908529373976e-06, "loss": 0.0286, "num_tokens": 137765456.0, "step": 1721 }, { "epoch": 2.147223955084217, "grad_norm": 0.11208353296927773, "learning_rate": 2.782458475509581e-06, "loss": 0.0271, "num_tokens": 137845544.0, "step": 1722 }, { "epoch": 2.148471615720524, "grad_norm": 0.1068207673083175, "learning_rate": 2.7776310258059447e-06, "loss": 0.0271, "num_tokens": 137924567.0, "step": 1723 }, { "epoch": 2.149719276356831, "grad_norm": 0.1144766801038503, "learning_rate": 2.772808512580114e-06, "loss": 0.0311, "num_tokens": 138004671.0, "step": 1724 }, { "epoch": 2.1509669369931377, "grad_norm": 0.11230257045378114, "learning_rate": 2.767990944576763e-06, "loss": 0.0281, "num_tokens": 138085655.0, "step": 1725 }, { "epoch": 2.152214597629445, "grad_norm": 0.10991165872168095, "learning_rate": 2.7631783305316017e-06, "loss": 0.0268, "num_tokens": 138166694.0, "step": 1726 }, { "epoch": 2.1534622582657517, "grad_norm": 0.10971797219708118, "learning_rate": 2.7583706791713503e-06, "loss": 0.0273, "num_tokens": 138246414.0, "step": 1727 }, { "epoch": 2.154709918902059, "grad_norm": 0.11270715704229847, "learning_rate": 2.7535679992137338e-06, "loss": 0.0271, "num_tokens": 138326768.0, "step": 1728 }, { "epoch": 2.1559575795383656, "grad_norm": 0.12328542886456291, "learning_rate": 2.7487702993674647e-06, "loss": 0.029, "num_tokens": 138406486.0, "step": 1729 }, { "epoch": 2.1572052401746724, "grad_norm": 0.1190445079709797, "learning_rate": 2.7439775883322228e-06, "loss": 0.0284, "num_tokens": 138488315.0, "step": 1730 }, { "epoch": 2.158452900810979, "grad_norm": 0.11340655933746487, "learning_rate": 2.739189874798639e-06, "loss": 0.0286, "num_tokens": 138569358.0, "step": 1731 }, { "epoch": 2.1597005614472864, "grad_norm": 0.11303296924070988, "learning_rate": 2.7344071674482874e-06, "loss": 0.0281, "num_tokens": 138648911.0, "step": 1732 }, { "epoch": 2.160948222083593, "grad_norm": 0.1034480761800008, "learning_rate": 2.729629474953662e-06, "loss": 0.0266, "num_tokens": 138727334.0, "step": 1733 }, { "epoch": 2.1621958827199004, "grad_norm": 0.11156242541516115, "learning_rate": 2.7248568059781654e-06, "loss": 0.0263, "num_tokens": 138807463.0, "step": 1734 }, { "epoch": 2.163443543356207, "grad_norm": 0.10227201302760246, "learning_rate": 2.7200891691760838e-06, "loss": 0.0272, "num_tokens": 138888573.0, "step": 1735 }, { "epoch": 2.164691203992514, "grad_norm": 0.11926979754093582, "learning_rate": 2.715326573192588e-06, "loss": 0.028, "num_tokens": 138967599.0, "step": 1736 }, { "epoch": 2.165938864628821, "grad_norm": 0.10590335435742637, "learning_rate": 2.710569026663702e-06, "loss": 0.027, "num_tokens": 139048151.0, "step": 1737 }, { "epoch": 2.167186525265128, "grad_norm": 0.1148169854925881, "learning_rate": 2.705816538216296e-06, "loss": 0.0272, "num_tokens": 139128924.0, "step": 1738 }, { "epoch": 2.1684341859014347, "grad_norm": 0.1019205633049057, "learning_rate": 2.7010691164680696e-06, "loss": 0.0268, "num_tokens": 139210922.0, "step": 1739 }, { "epoch": 2.169681846537742, "grad_norm": 0.12286991978648502, "learning_rate": 2.696326770027533e-06, "loss": 0.0352, "num_tokens": 139292094.0, "step": 1740 }, { "epoch": 2.1709295071740486, "grad_norm": 0.11680777294553368, "learning_rate": 2.6915895074939912e-06, "loss": 0.0274, "num_tokens": 139372386.0, "step": 1741 }, { "epoch": 2.1721771678103554, "grad_norm": 0.11434415621297753, "learning_rate": 2.6868573374575356e-06, "loss": 0.028, "num_tokens": 139451840.0, "step": 1742 }, { "epoch": 2.1734248284466626, "grad_norm": 0.11068475985426603, "learning_rate": 2.6821302684990204e-06, "loss": 0.0282, "num_tokens": 139531179.0, "step": 1743 }, { "epoch": 2.1746724890829694, "grad_norm": 0.10765013051927665, "learning_rate": 2.677408309190049e-06, "loss": 0.0273, "num_tokens": 139611340.0, "step": 1744 }, { "epoch": 2.175920149719276, "grad_norm": 0.1047064628578538, "learning_rate": 2.672691468092963e-06, "loss": 0.0266, "num_tokens": 139690494.0, "step": 1745 }, { "epoch": 2.1771678103555834, "grad_norm": 0.12852423753495817, "learning_rate": 2.6679797537608184e-06, "loss": 0.0283, "num_tokens": 139773131.0, "step": 1746 }, { "epoch": 2.17841547099189, "grad_norm": 0.10882310129634558, "learning_rate": 2.6632731747373785e-06, "loss": 0.0281, "num_tokens": 139853287.0, "step": 1747 }, { "epoch": 2.179663131628197, "grad_norm": 0.11131589806757092, "learning_rate": 2.658571739557096e-06, "loss": 0.0278, "num_tokens": 139934168.0, "step": 1748 }, { "epoch": 2.180910792264504, "grad_norm": 0.11163023096979792, "learning_rate": 2.653875456745092e-06, "loss": 0.0274, "num_tokens": 140014255.0, "step": 1749 }, { "epoch": 2.182158452900811, "grad_norm": 0.10614675078257936, "learning_rate": 2.6491843348171455e-06, "loss": 0.0275, "num_tokens": 140094036.0, "step": 1750 }, { "epoch": 2.183406113537118, "grad_norm": 0.11255684585168828, "learning_rate": 2.644498382279681e-06, "loss": 0.0279, "num_tokens": 140174691.0, "step": 1751 }, { "epoch": 2.184653774173425, "grad_norm": 0.11526184776808238, "learning_rate": 2.639817607629745e-06, "loss": 0.028, "num_tokens": 140254358.0, "step": 1752 }, { "epoch": 2.1859014348097316, "grad_norm": 0.11014270003322352, "learning_rate": 2.635142019354995e-06, "loss": 0.0279, "num_tokens": 140335203.0, "step": 1753 }, { "epoch": 2.187149095446039, "grad_norm": 0.11726190195059955, "learning_rate": 2.6304716259336903e-06, "loss": 0.027, "num_tokens": 140415001.0, "step": 1754 }, { "epoch": 2.1883967560823456, "grad_norm": 0.11551789258082427, "learning_rate": 2.6258064358346642e-06, "loss": 0.0276, "num_tokens": 140495012.0, "step": 1755 }, { "epoch": 2.1896444167186524, "grad_norm": 0.12911904078324782, "learning_rate": 2.621146457517314e-06, "loss": 0.0298, "num_tokens": 140577112.0, "step": 1756 }, { "epoch": 2.1908920773549596, "grad_norm": 0.10893674109666582, "learning_rate": 2.6164916994315916e-06, "loss": 0.0269, "num_tokens": 140657288.0, "step": 1757 }, { "epoch": 2.1921397379912664, "grad_norm": 0.11561232996328151, "learning_rate": 2.6118421700179795e-06, "loss": 0.0275, "num_tokens": 140737502.0, "step": 1758 }, { "epoch": 2.193387398627573, "grad_norm": 0.11811526803531466, "learning_rate": 2.6071978777074796e-06, "loss": 0.0278, "num_tokens": 140816956.0, "step": 1759 }, { "epoch": 2.1946350592638804, "grad_norm": 0.10604896561401701, "learning_rate": 2.6025588309215975e-06, "loss": 0.0277, "num_tokens": 140898099.0, "step": 1760 }, { "epoch": 2.195882719900187, "grad_norm": 0.11100890813179569, "learning_rate": 2.5979250380723287e-06, "loss": 0.0281, "num_tokens": 140977581.0, "step": 1761 }, { "epoch": 2.197130380536494, "grad_norm": 0.12574865966231316, "learning_rate": 2.5932965075621376e-06, "loss": 0.0291, "num_tokens": 141058023.0, "step": 1762 }, { "epoch": 2.198378041172801, "grad_norm": 0.11094390713862573, "learning_rate": 2.5886732477839514e-06, "loss": 0.0278, "num_tokens": 141137869.0, "step": 1763 }, { "epoch": 2.199625701809108, "grad_norm": 0.10875999873552089, "learning_rate": 2.584055267121137e-06, "loss": 0.0264, "num_tokens": 141216853.0, "step": 1764 }, { "epoch": 2.2008733624454146, "grad_norm": 0.11359059997751579, "learning_rate": 2.579442573947488e-06, "loss": 0.0302, "num_tokens": 141298589.0, "step": 1765 }, { "epoch": 2.202121023081722, "grad_norm": 0.12697222632331867, "learning_rate": 2.5748351766272127e-06, "loss": 0.0289, "num_tokens": 141378590.0, "step": 1766 }, { "epoch": 2.2033686837180286, "grad_norm": 0.10545644958622358, "learning_rate": 2.5702330835149137e-06, "loss": 0.0263, "num_tokens": 141457825.0, "step": 1767 }, { "epoch": 2.204616344354336, "grad_norm": 0.11016392505811934, "learning_rate": 2.5656363029555788e-06, "loss": 0.0289, "num_tokens": 141538133.0, "step": 1768 }, { "epoch": 2.2058640049906426, "grad_norm": 0.10965895021224552, "learning_rate": 2.561044843284558e-06, "loss": 0.0281, "num_tokens": 141617180.0, "step": 1769 }, { "epoch": 2.2071116656269494, "grad_norm": 0.1133356513749835, "learning_rate": 2.556458712827558e-06, "loss": 0.0287, "num_tokens": 141695468.0, "step": 1770 }, { "epoch": 2.2083593262632566, "grad_norm": 0.11395886557137493, "learning_rate": 2.551877919900619e-06, "loss": 0.0282, "num_tokens": 141775903.0, "step": 1771 }, { "epoch": 2.2096069868995634, "grad_norm": 0.11822802385416521, "learning_rate": 2.5473024728101004e-06, "loss": 0.0278, "num_tokens": 141856125.0, "step": 1772 }, { "epoch": 2.21085464753587, "grad_norm": 0.11611618297885314, "learning_rate": 2.5427323798526747e-06, "loss": 0.0277, "num_tokens": 141936379.0, "step": 1773 }, { "epoch": 2.2121023081721773, "grad_norm": 0.11559137208466257, "learning_rate": 2.538167649315298e-06, "loss": 0.0287, "num_tokens": 142018427.0, "step": 1774 }, { "epoch": 2.213349968808484, "grad_norm": 0.11659924816734041, "learning_rate": 2.5336082894752084e-06, "loss": 0.0285, "num_tokens": 142098768.0, "step": 1775 }, { "epoch": 2.214597629444791, "grad_norm": 0.11963881975246271, "learning_rate": 2.529054308599906e-06, "loss": 0.0308, "num_tokens": 142178577.0, "step": 1776 }, { "epoch": 2.215845290081098, "grad_norm": 0.11090511560379994, "learning_rate": 2.524505714947131e-06, "loss": 0.0281, "num_tokens": 142258299.0, "step": 1777 }, { "epoch": 2.217092950717405, "grad_norm": 0.11296744141624795, "learning_rate": 2.5199625167648576e-06, "loss": 0.028, "num_tokens": 142339109.0, "step": 1778 }, { "epoch": 2.2183406113537116, "grad_norm": 0.1257848652662134, "learning_rate": 2.515424722291282e-06, "loss": 0.0268, "num_tokens": 142421110.0, "step": 1779 }, { "epoch": 2.219588271990019, "grad_norm": 0.10900698032162932, "learning_rate": 2.5108923397547934e-06, "loss": 0.027, "num_tokens": 142501998.0, "step": 1780 }, { "epoch": 2.2208359326263256, "grad_norm": 0.11575161471868187, "learning_rate": 2.5063653773739705e-06, "loss": 0.0278, "num_tokens": 142581395.0, "step": 1781 }, { "epoch": 2.2220835932626324, "grad_norm": 0.10090543842040466, "learning_rate": 2.501843843357568e-06, "loss": 0.0263, "num_tokens": 142659673.0, "step": 1782 }, { "epoch": 2.2233312538989396, "grad_norm": 0.1115252060659669, "learning_rate": 2.4973277459044927e-06, "loss": 0.0282, "num_tokens": 142741046.0, "step": 1783 }, { "epoch": 2.2245789145352464, "grad_norm": 0.1133673733213164, "learning_rate": 2.4928170932037916e-06, "loss": 0.0277, "num_tokens": 142820299.0, "step": 1784 }, { "epoch": 2.225826575171553, "grad_norm": 0.1127136967529174, "learning_rate": 2.4883118934346446e-06, "loss": 0.0273, "num_tokens": 142900381.0, "step": 1785 }, { "epoch": 2.2270742358078603, "grad_norm": 0.10491586464405452, "learning_rate": 2.48381215476634e-06, "loss": 0.0265, "num_tokens": 142980799.0, "step": 1786 }, { "epoch": 2.228321896444167, "grad_norm": 0.11469618848304418, "learning_rate": 2.4793178853582624e-06, "loss": 0.0273, "num_tokens": 143061287.0, "step": 1787 }, { "epoch": 2.229569557080474, "grad_norm": 0.1087127525924716, "learning_rate": 2.474829093359881e-06, "loss": 0.0275, "num_tokens": 143141303.0, "step": 1788 }, { "epoch": 2.230817217716781, "grad_norm": 0.10390821955698, "learning_rate": 2.4703457869107346e-06, "loss": 0.0272, "num_tokens": 143221934.0, "step": 1789 }, { "epoch": 2.232064878353088, "grad_norm": 0.11473477655662885, "learning_rate": 2.4658679741404106e-06, "loss": 0.0287, "num_tokens": 143303459.0, "step": 1790 }, { "epoch": 2.233312538989395, "grad_norm": 0.11105580740416413, "learning_rate": 2.461395663168539e-06, "loss": 0.0281, "num_tokens": 143383014.0, "step": 1791 }, { "epoch": 2.234560199625702, "grad_norm": 0.11200067622309487, "learning_rate": 2.4569288621047704e-06, "loss": 0.0284, "num_tokens": 143462866.0, "step": 1792 }, { "epoch": 2.2358078602620086, "grad_norm": 0.10838875549676315, "learning_rate": 2.452467579048764e-06, "loss": 0.0267, "num_tokens": 143541856.0, "step": 1793 }, { "epoch": 2.237055520898316, "grad_norm": 0.11004549583890312, "learning_rate": 2.4480118220901764e-06, "loss": 0.0268, "num_tokens": 143621766.0, "step": 1794 }, { "epoch": 2.2383031815346226, "grad_norm": 0.10774371386318345, "learning_rate": 2.4435615993086414e-06, "loss": 0.0281, "num_tokens": 143700863.0, "step": 1795 }, { "epoch": 2.2395508421709294, "grad_norm": 0.10262726213219055, "learning_rate": 2.4391169187737555e-06, "loss": 0.0264, "num_tokens": 143780027.0, "step": 1796 }, { "epoch": 2.2407985028072366, "grad_norm": 0.10594867992251468, "learning_rate": 2.434677788545071e-06, "loss": 0.0278, "num_tokens": 143859671.0, "step": 1797 }, { "epoch": 2.2420461634435433, "grad_norm": 0.10587436147761728, "learning_rate": 2.4302442166720723e-06, "loss": 0.0275, "num_tokens": 143940423.0, "step": 1798 }, { "epoch": 2.24329382407985, "grad_norm": 0.11934655967345954, "learning_rate": 2.4258162111941634e-06, "loss": 0.0276, "num_tokens": 144021103.0, "step": 1799 }, { "epoch": 2.2445414847161573, "grad_norm": 0.11108721213520488, "learning_rate": 2.42139378014066e-06, "loss": 0.028, "num_tokens": 144101887.0, "step": 1800 }, { "epoch": 2.245789145352464, "grad_norm": 0.11656736214212844, "learning_rate": 2.416976931530764e-06, "loss": 0.0287, "num_tokens": 144182215.0, "step": 1801 }, { "epoch": 2.247036805988771, "grad_norm": 0.11510680943141609, "learning_rate": 2.4125656733735554e-06, "loss": 0.0281, "num_tokens": 144263091.0, "step": 1802 }, { "epoch": 2.248284466625078, "grad_norm": 0.10345650824569999, "learning_rate": 2.4081600136679805e-06, "loss": 0.0276, "num_tokens": 144342322.0, "step": 1803 }, { "epoch": 2.249532127261385, "grad_norm": 0.11225842570308209, "learning_rate": 2.403759960402834e-06, "loss": 0.028, "num_tokens": 144422662.0, "step": 1804 }, { "epoch": 2.2507797878976916, "grad_norm": 0.14563905711788522, "learning_rate": 2.39936552155674e-06, "loss": 0.0276, "num_tokens": 144502890.0, "step": 1805 }, { "epoch": 2.252027448533999, "grad_norm": 0.11458859468044612, "learning_rate": 2.394976705098143e-06, "loss": 0.0277, "num_tokens": 144583307.0, "step": 1806 }, { "epoch": 2.2532751091703056, "grad_norm": 0.11282706392587076, "learning_rate": 2.3905935189852967e-06, "loss": 0.0286, "num_tokens": 144664063.0, "step": 1807 }, { "epoch": 2.254522769806613, "grad_norm": 0.10716579111382028, "learning_rate": 2.386215971166242e-06, "loss": 0.0273, "num_tokens": 144744193.0, "step": 1808 }, { "epoch": 2.2557704304429196, "grad_norm": 0.10816884263366056, "learning_rate": 2.381844069578793e-06, "loss": 0.0273, "num_tokens": 144825420.0, "step": 1809 }, { "epoch": 2.2570180910792264, "grad_norm": 0.11079261493952917, "learning_rate": 2.3774778221505316e-06, "loss": 0.0269, "num_tokens": 144904705.0, "step": 1810 }, { "epoch": 2.2582657517155336, "grad_norm": 0.12304447536252638, "learning_rate": 2.3731172367987856e-06, "loss": 0.0279, "num_tokens": 144984409.0, "step": 1811 }, { "epoch": 2.2595134123518403, "grad_norm": 0.12073914506900349, "learning_rate": 2.3687623214306096e-06, "loss": 0.0281, "num_tokens": 145065436.0, "step": 1812 }, { "epoch": 2.260761072988147, "grad_norm": 0.11025114886878515, "learning_rate": 2.364413083942787e-06, "loss": 0.0278, "num_tokens": 145145726.0, "step": 1813 }, { "epoch": 2.2620087336244543, "grad_norm": 0.10678041974245804, "learning_rate": 2.3600695322217965e-06, "loss": 0.0278, "num_tokens": 145225716.0, "step": 1814 }, { "epoch": 2.263256394260761, "grad_norm": 0.1094478383949689, "learning_rate": 2.355731674143809e-06, "loss": 0.0266, "num_tokens": 145304672.0, "step": 1815 }, { "epoch": 2.264504054897068, "grad_norm": 0.11066469315589064, "learning_rate": 2.3513995175746757e-06, "loss": 0.0276, "num_tokens": 145383597.0, "step": 1816 }, { "epoch": 2.265751715533375, "grad_norm": 0.11081072087024889, "learning_rate": 2.3470730703699034e-06, "loss": 0.0264, "num_tokens": 145463533.0, "step": 1817 }, { "epoch": 2.266999376169682, "grad_norm": 0.12335653809318686, "learning_rate": 2.3427523403746496e-06, "loss": 0.0286, "num_tokens": 145543691.0, "step": 1818 }, { "epoch": 2.2682470368059886, "grad_norm": 0.10000215802185779, "learning_rate": 2.338437335423705e-06, "loss": 0.0264, "num_tokens": 145622759.0, "step": 1819 }, { "epoch": 2.269494697442296, "grad_norm": 0.11700316815610115, "learning_rate": 2.3341280633414763e-06, "loss": 0.0286, "num_tokens": 145703874.0, "step": 1820 }, { "epoch": 2.2707423580786026, "grad_norm": 0.11974933764453737, "learning_rate": 2.3298245319419755e-06, "loss": 0.0281, "num_tokens": 145784643.0, "step": 1821 }, { "epoch": 2.2719900187149094, "grad_norm": 0.11752237392767895, "learning_rate": 2.325526749028808e-06, "loss": 0.0278, "num_tokens": 145865466.0, "step": 1822 }, { "epoch": 2.2732376793512166, "grad_norm": 0.12283401566553695, "learning_rate": 2.321234722395152e-06, "loss": 0.0282, "num_tokens": 145944342.0, "step": 1823 }, { "epoch": 2.2744853399875233, "grad_norm": 0.11195250793318184, "learning_rate": 2.3169484598237484e-06, "loss": 0.0276, "num_tokens": 146023270.0, "step": 1824 }, { "epoch": 2.2757330006238305, "grad_norm": 0.11013517177727546, "learning_rate": 2.312667969086887e-06, "loss": 0.0267, "num_tokens": 146102006.0, "step": 1825 }, { "epoch": 2.2769806612601373, "grad_norm": 0.11787583993905608, "learning_rate": 2.308393257946393e-06, "loss": 0.0274, "num_tokens": 146181867.0, "step": 1826 }, { "epoch": 2.278228321896444, "grad_norm": 0.11819294612785929, "learning_rate": 2.304124334153608e-06, "loss": 0.0269, "num_tokens": 146261476.0, "step": 1827 }, { "epoch": 2.279475982532751, "grad_norm": 0.11184828699005617, "learning_rate": 2.2998612054493827e-06, "loss": 0.0284, "num_tokens": 146341515.0, "step": 1828 }, { "epoch": 2.280723643169058, "grad_norm": 0.10777522190355769, "learning_rate": 2.2956038795640573e-06, "loss": 0.0274, "num_tokens": 146420733.0, "step": 1829 }, { "epoch": 2.281971303805365, "grad_norm": 0.10954932409855346, "learning_rate": 2.291352364217449e-06, "loss": 0.028, "num_tokens": 146501895.0, "step": 1830 }, { "epoch": 2.283218964441672, "grad_norm": 0.10884043485738872, "learning_rate": 2.287106667118841e-06, "loss": 0.0281, "num_tokens": 146581796.0, "step": 1831 }, { "epoch": 2.284466625077979, "grad_norm": 0.11517754082110432, "learning_rate": 2.2828667959669674e-06, "loss": 0.0286, "num_tokens": 146663043.0, "step": 1832 }, { "epoch": 2.2857142857142856, "grad_norm": 0.1259652230677902, "learning_rate": 2.2786327584499944e-06, "loss": 0.0291, "num_tokens": 146744082.0, "step": 1833 }, { "epoch": 2.286961946350593, "grad_norm": 0.11385926033954893, "learning_rate": 2.2744045622455112e-06, "loss": 0.0278, "num_tokens": 146824514.0, "step": 1834 }, { "epoch": 2.2882096069868996, "grad_norm": 0.10823713522415669, "learning_rate": 2.270182215020517e-06, "loss": 0.0275, "num_tokens": 146905003.0, "step": 1835 }, { "epoch": 2.2894572676232063, "grad_norm": 0.10607107777899051, "learning_rate": 2.2659657244314017e-06, "loss": 0.0274, "num_tokens": 146984485.0, "step": 1836 }, { "epoch": 2.2907049282595136, "grad_norm": 0.10959122120451005, "learning_rate": 2.26175509812394e-06, "loss": 0.0276, "num_tokens": 147064157.0, "step": 1837 }, { "epoch": 2.2919525888958203, "grad_norm": 0.10057700457691329, "learning_rate": 2.2575503437332677e-06, "loss": 0.0273, "num_tokens": 147143970.0, "step": 1838 }, { "epoch": 2.293200249532127, "grad_norm": 0.10943750560043476, "learning_rate": 2.2533514688838755e-06, "loss": 0.028, "num_tokens": 147225213.0, "step": 1839 }, { "epoch": 2.2944479101684343, "grad_norm": 0.1101560015216109, "learning_rate": 2.2491584811895927e-06, "loss": 0.0272, "num_tokens": 147305029.0, "step": 1840 }, { "epoch": 2.295695570804741, "grad_norm": 0.1108231639983885, "learning_rate": 2.244971388253576e-06, "loss": 0.0261, "num_tokens": 147385812.0, "step": 1841 }, { "epoch": 2.2969432314410483, "grad_norm": 0.11679304596070587, "learning_rate": 2.2407901976682884e-06, "loss": 0.0294, "num_tokens": 147466071.0, "step": 1842 }, { "epoch": 2.298190892077355, "grad_norm": 0.09947228828930545, "learning_rate": 2.2366149170154907e-06, "loss": 0.0262, "num_tokens": 147544270.0, "step": 1843 }, { "epoch": 2.299438552713662, "grad_norm": 0.11014547069179065, "learning_rate": 2.232445553866231e-06, "loss": 0.0278, "num_tokens": 147624344.0, "step": 1844 }, { "epoch": 2.3006862133499686, "grad_norm": 0.11618854645406114, "learning_rate": 2.228282115780824e-06, "loss": 0.0283, "num_tokens": 147703871.0, "step": 1845 }, { "epoch": 2.301933873986276, "grad_norm": 0.11530991453976354, "learning_rate": 2.22412461030884e-06, "loss": 0.0277, "num_tokens": 147783896.0, "step": 1846 }, { "epoch": 2.3031815346225826, "grad_norm": 0.10820957500421617, "learning_rate": 2.2199730449890964e-06, "loss": 0.027, "num_tokens": 147864352.0, "step": 1847 }, { "epoch": 2.30442919525889, "grad_norm": 0.11573884363992257, "learning_rate": 2.215827427349635e-06, "loss": 0.0282, "num_tokens": 147944345.0, "step": 1848 }, { "epoch": 2.3056768558951966, "grad_norm": 0.11777830036437165, "learning_rate": 2.211687764907711e-06, "loss": 0.0272, "num_tokens": 148026015.0, "step": 1849 }, { "epoch": 2.3069245165315033, "grad_norm": 0.11222239508770851, "learning_rate": 2.2075540651697873e-06, "loss": 0.0275, "num_tokens": 148106133.0, "step": 1850 }, { "epoch": 2.3081721771678105, "grad_norm": 0.10933938525610572, "learning_rate": 2.2034263356315087e-06, "loss": 0.0278, "num_tokens": 148186803.0, "step": 1851 }, { "epoch": 2.3094198378041173, "grad_norm": 0.11453260431350368, "learning_rate": 2.1993045837776957e-06, "loss": 0.0278, "num_tokens": 148267327.0, "step": 1852 }, { "epoch": 2.310667498440424, "grad_norm": 0.11836630922316033, "learning_rate": 2.195188817082331e-06, "loss": 0.0283, "num_tokens": 148349060.0, "step": 1853 }, { "epoch": 2.3119151590767313, "grad_norm": 0.12681294550992456, "learning_rate": 2.1910790430085465e-06, "loss": 0.0275, "num_tokens": 148428476.0, "step": 1854 }, { "epoch": 2.313162819713038, "grad_norm": 0.10886143919786201, "learning_rate": 2.1869752690086e-06, "loss": 0.0269, "num_tokens": 148507956.0, "step": 1855 }, { "epoch": 2.314410480349345, "grad_norm": 0.1123737906437985, "learning_rate": 2.1828775025238787e-06, "loss": 0.027, "num_tokens": 148587206.0, "step": 1856 }, { "epoch": 2.315658140985652, "grad_norm": 0.1087200879265718, "learning_rate": 2.1787857509848693e-06, "loss": 0.027, "num_tokens": 148666530.0, "step": 1857 }, { "epoch": 2.316905801621959, "grad_norm": 0.12845500495545087, "learning_rate": 2.174700021811153e-06, "loss": 0.0288, "num_tokens": 148745506.0, "step": 1858 }, { "epoch": 2.3181534622582656, "grad_norm": 0.10251335702347915, "learning_rate": 2.1706203224113944e-06, "loss": 0.0273, "num_tokens": 148826801.0, "step": 1859 }, { "epoch": 2.319401122894573, "grad_norm": 0.13207173451085918, "learning_rate": 2.1665466601833197e-06, "loss": 0.0282, "num_tokens": 148907396.0, "step": 1860 }, { "epoch": 2.3206487835308796, "grad_norm": 0.11045087789527284, "learning_rate": 2.162479042513711e-06, "loss": 0.0275, "num_tokens": 148987223.0, "step": 1861 }, { "epoch": 2.3218964441671863, "grad_norm": 0.10466374938904822, "learning_rate": 2.158417476778388e-06, "loss": 0.0276, "num_tokens": 149068080.0, "step": 1862 }, { "epoch": 2.3231441048034935, "grad_norm": 0.11264871081312319, "learning_rate": 2.1543619703421975e-06, "loss": 0.0278, "num_tokens": 149147571.0, "step": 1863 }, { "epoch": 2.3243917654398003, "grad_norm": 0.104606366045223, "learning_rate": 2.1503125305589976e-06, "loss": 0.0272, "num_tokens": 149227191.0, "step": 1864 }, { "epoch": 2.3256394260761075, "grad_norm": 0.12583404589107572, "learning_rate": 2.146269164771648e-06, "loss": 0.0292, "num_tokens": 149308765.0, "step": 1865 }, { "epoch": 2.3268870867124143, "grad_norm": 0.11179688039422762, "learning_rate": 2.142231880311992e-06, "loss": 0.0274, "num_tokens": 149388926.0, "step": 1866 }, { "epoch": 2.328134747348721, "grad_norm": 0.10361239593188457, "learning_rate": 2.1382006845008456e-06, "loss": 0.0267, "num_tokens": 149468409.0, "step": 1867 }, { "epoch": 2.329382407985028, "grad_norm": 0.10285293247205865, "learning_rate": 2.1341755846479868e-06, "loss": 0.0274, "num_tokens": 149548190.0, "step": 1868 }, { "epoch": 2.330630068621335, "grad_norm": 0.11770503261290141, "learning_rate": 2.1301565880521387e-06, "loss": 0.0273, "num_tokens": 149628012.0, "step": 1869 }, { "epoch": 2.331877729257642, "grad_norm": 0.1074927375632693, "learning_rate": 2.1261437020009565e-06, "loss": 0.0271, "num_tokens": 149708217.0, "step": 1870 }, { "epoch": 2.333125389893949, "grad_norm": 0.11030656423571202, "learning_rate": 2.122136933771014e-06, "loss": 0.0272, "num_tokens": 149788673.0, "step": 1871 }, { "epoch": 2.334373050530256, "grad_norm": 0.12162150889745205, "learning_rate": 2.118136290627795e-06, "loss": 0.0287, "num_tokens": 149868514.0, "step": 1872 }, { "epoch": 2.3356207111665626, "grad_norm": 0.126632958015695, "learning_rate": 2.114141779825674e-06, "loss": 0.0282, "num_tokens": 149948614.0, "step": 1873 }, { "epoch": 2.3368683718028698, "grad_norm": 0.10058350258071957, "learning_rate": 2.110153408607904e-06, "loss": 0.0262, "num_tokens": 150028919.0, "step": 1874 }, { "epoch": 2.3381160324391765, "grad_norm": 0.11778995124698656, "learning_rate": 2.1061711842066124e-06, "loss": 0.028, "num_tokens": 150108918.0, "step": 1875 }, { "epoch": 2.3393636930754833, "grad_norm": 0.11526712748316235, "learning_rate": 2.1021951138427736e-06, "loss": 0.0275, "num_tokens": 150188111.0, "step": 1876 }, { "epoch": 2.3406113537117905, "grad_norm": 0.10576774424191199, "learning_rate": 2.0982252047262025e-06, "loss": 0.0277, "num_tokens": 150267512.0, "step": 1877 }, { "epoch": 2.3418590143480973, "grad_norm": 0.11922101045159628, "learning_rate": 2.094261464055548e-06, "loss": 0.028, "num_tokens": 150349541.0, "step": 1878 }, { "epoch": 2.343106674984404, "grad_norm": 0.11472447341513481, "learning_rate": 2.0903038990182684e-06, "loss": 0.0281, "num_tokens": 150429122.0, "step": 1879 }, { "epoch": 2.3443543356207113, "grad_norm": 0.11865849401366725, "learning_rate": 2.086352516790624e-06, "loss": 0.0282, "num_tokens": 150509545.0, "step": 1880 }, { "epoch": 2.345601996257018, "grad_norm": 0.11861011227322517, "learning_rate": 2.082407324537668e-06, "loss": 0.028, "num_tokens": 150589757.0, "step": 1881 }, { "epoch": 2.3468496568933253, "grad_norm": 0.1151057044148094, "learning_rate": 2.078468329413223e-06, "loss": 0.0278, "num_tokens": 150671338.0, "step": 1882 }, { "epoch": 2.348097317529632, "grad_norm": 0.10255007625515707, "learning_rate": 2.07453553855988e-06, "loss": 0.0264, "num_tokens": 150751508.0, "step": 1883 }, { "epoch": 2.349344978165939, "grad_norm": 0.11660533332586445, "learning_rate": 2.0706089591089785e-06, "loss": 0.0282, "num_tokens": 150832075.0, "step": 1884 }, { "epoch": 2.3505926388022456, "grad_norm": 0.1155557180421997, "learning_rate": 2.0666885981805916e-06, "loss": 0.0273, "num_tokens": 150912548.0, "step": 1885 }, { "epoch": 2.3518402994385528, "grad_norm": 0.11188233909320953, "learning_rate": 2.0627744628835196e-06, "loss": 0.0278, "num_tokens": 150991832.0, "step": 1886 }, { "epoch": 2.3530879600748595, "grad_norm": 0.11350157709518613, "learning_rate": 2.058866560315273e-06, "loss": 0.0272, "num_tokens": 151071514.0, "step": 1887 }, { "epoch": 2.3543356207111668, "grad_norm": 0.11410767768820493, "learning_rate": 2.054964897562061e-06, "loss": 0.0276, "num_tokens": 151150714.0, "step": 1888 }, { "epoch": 2.3555832813474735, "grad_norm": 0.10872089005911116, "learning_rate": 2.0510694816987724e-06, "loss": 0.0279, "num_tokens": 151231361.0, "step": 1889 }, { "epoch": 2.3568309419837803, "grad_norm": 0.1196042703091023, "learning_rate": 2.047180319788981e-06, "loss": 0.0276, "num_tokens": 151311684.0, "step": 1890 }, { "epoch": 2.3580786026200875, "grad_norm": 0.11841309111635819, "learning_rate": 2.0432974188849103e-06, "loss": 0.0275, "num_tokens": 151392783.0, "step": 1891 }, { "epoch": 2.3593262632563943, "grad_norm": 0.11255039160218248, "learning_rate": 2.0394207860274304e-06, "loss": 0.0277, "num_tokens": 151472580.0, "step": 1892 }, { "epoch": 2.360573923892701, "grad_norm": 0.10746347740509427, "learning_rate": 2.035550428246053e-06, "loss": 0.0272, "num_tokens": 151552266.0, "step": 1893 }, { "epoch": 2.3618215845290083, "grad_norm": 0.11470087718795037, "learning_rate": 2.0316863525589037e-06, "loss": 0.0279, "num_tokens": 151631911.0, "step": 1894 }, { "epoch": 2.363069245165315, "grad_norm": 0.12367004565272316, "learning_rate": 2.0278285659727187e-06, "loss": 0.0273, "num_tokens": 151710855.0, "step": 1895 }, { "epoch": 2.364316905801622, "grad_norm": 0.12984514692522087, "learning_rate": 2.023977075482833e-06, "loss": 0.0307, "num_tokens": 151793755.0, "step": 1896 }, { "epoch": 2.365564566437929, "grad_norm": 0.12148820532988883, "learning_rate": 2.0201318880731633e-06, "loss": 0.0283, "num_tokens": 151875625.0, "step": 1897 }, { "epoch": 2.3668122270742358, "grad_norm": 0.11585673908956987, "learning_rate": 2.0162930107161963e-06, "loss": 0.0273, "num_tokens": 151955662.0, "step": 1898 }, { "epoch": 2.3680598877105425, "grad_norm": 0.12778131669298992, "learning_rate": 2.012460450372976e-06, "loss": 0.0296, "num_tokens": 152037665.0, "step": 1899 }, { "epoch": 2.3693075483468498, "grad_norm": 0.12449191375414341, "learning_rate": 2.0086342139930932e-06, "loss": 0.031, "num_tokens": 152116842.0, "step": 1900 }, { "epoch": 2.3705552089831565, "grad_norm": 0.12060369090810642, "learning_rate": 2.004814308514671e-06, "loss": 0.0287, "num_tokens": 152196571.0, "step": 1901 }, { "epoch": 2.3718028696194633, "grad_norm": 0.12138740721784497, "learning_rate": 2.001000740864353e-06, "loss": 0.0276, "num_tokens": 152276228.0, "step": 1902 }, { "epoch": 2.3730505302557705, "grad_norm": 0.10235549260740247, "learning_rate": 1.9971935179572893e-06, "loss": 0.0268, "num_tokens": 152355915.0, "step": 1903 }, { "epoch": 2.3742981908920773, "grad_norm": 0.1156977150303195, "learning_rate": 1.993392646697127e-06, "loss": 0.0276, "num_tokens": 152435603.0, "step": 1904 }, { "epoch": 2.3755458515283845, "grad_norm": 0.1134522871431333, "learning_rate": 1.9895981339759927e-06, "loss": 0.0276, "num_tokens": 152516244.0, "step": 1905 }, { "epoch": 2.3767935121646913, "grad_norm": 0.116802365651326, "learning_rate": 1.985809986674487e-06, "loss": 0.0281, "num_tokens": 152595457.0, "step": 1906 }, { "epoch": 2.378041172800998, "grad_norm": 0.1162946923631464, "learning_rate": 1.982028211661665e-06, "loss": 0.0275, "num_tokens": 152675867.0, "step": 1907 }, { "epoch": 2.3792888334373052, "grad_norm": 0.12149058207102886, "learning_rate": 1.9782528157950266e-06, "loss": 0.0285, "num_tokens": 152756569.0, "step": 1908 }, { "epoch": 2.380536494073612, "grad_norm": 0.1201975730835599, "learning_rate": 1.974483805920508e-06, "loss": 0.0285, "num_tokens": 152836657.0, "step": 1909 }, { "epoch": 2.3817841547099188, "grad_norm": 0.11308003280924744, "learning_rate": 1.970721188872461e-06, "loss": 0.0283, "num_tokens": 152918160.0, "step": 1910 }, { "epoch": 2.383031815346226, "grad_norm": 0.10654932419802848, "learning_rate": 1.966964971473649e-06, "loss": 0.0264, "num_tokens": 153002339.0, "step": 1911 }, { "epoch": 2.3842794759825328, "grad_norm": 0.10620407558672333, "learning_rate": 1.9632151605352296e-06, "loss": 0.0279, "num_tokens": 153081791.0, "step": 1912 }, { "epoch": 2.3855271366188395, "grad_norm": 0.11368468414357902, "learning_rate": 1.9594717628567432e-06, "loss": 0.0274, "num_tokens": 153162006.0, "step": 1913 }, { "epoch": 2.3867747972551467, "grad_norm": 0.11000047901719937, "learning_rate": 1.9557347852261007e-06, "loss": 0.0274, "num_tokens": 153242632.0, "step": 1914 }, { "epoch": 2.3880224578914535, "grad_norm": 0.1089512194404227, "learning_rate": 1.9520042344195727e-06, "loss": 0.0272, "num_tokens": 153323418.0, "step": 1915 }, { "epoch": 2.3892701185277603, "grad_norm": 0.10861808273136546, "learning_rate": 1.9482801172017758e-06, "loss": 0.0266, "num_tokens": 153403974.0, "step": 1916 }, { "epoch": 2.3905177791640675, "grad_norm": 0.11473505086721252, "learning_rate": 1.9445624403256576e-06, "loss": 0.0267, "num_tokens": 153483141.0, "step": 1917 }, { "epoch": 2.3917654398003743, "grad_norm": 0.10654799873398078, "learning_rate": 1.940851210532493e-06, "loss": 0.0279, "num_tokens": 153563318.0, "step": 1918 }, { "epoch": 2.393013100436681, "grad_norm": 0.10839257143529775, "learning_rate": 1.937146434551863e-06, "loss": 0.0269, "num_tokens": 153643226.0, "step": 1919 }, { "epoch": 2.3942607610729882, "grad_norm": 0.10860934191124491, "learning_rate": 1.933448119101644e-06, "loss": 0.0286, "num_tokens": 153722527.0, "step": 1920 }, { "epoch": 2.395508421709295, "grad_norm": 0.10443243666752638, "learning_rate": 1.929756270888003e-06, "loss": 0.0265, "num_tokens": 153802225.0, "step": 1921 }, { "epoch": 2.3967560823456022, "grad_norm": 0.10427755534598981, "learning_rate": 1.9260708966053744e-06, "loss": 0.0271, "num_tokens": 153881820.0, "step": 1922 }, { "epoch": 2.398003742981909, "grad_norm": 0.1177864618305411, "learning_rate": 1.9223920029364555e-06, "loss": 0.0278, "num_tokens": 153961515.0, "step": 1923 }, { "epoch": 2.3992514036182158, "grad_norm": 0.11765676375572733, "learning_rate": 1.9187195965521934e-06, "loss": 0.028, "num_tokens": 154041531.0, "step": 1924 }, { "epoch": 2.4004990642545225, "grad_norm": 0.10500942947449511, "learning_rate": 1.9150536841117713e-06, "loss": 0.027, "num_tokens": 154121639.0, "step": 1925 }, { "epoch": 2.4017467248908297, "grad_norm": 0.12216098631005727, "learning_rate": 1.911394272262595e-06, "loss": 0.0298, "num_tokens": 154203019.0, "step": 1926 }, { "epoch": 2.4029943855271365, "grad_norm": 0.10777756704694372, "learning_rate": 1.907741367640286e-06, "loss": 0.0273, "num_tokens": 154283769.0, "step": 1927 }, { "epoch": 2.4042420461634437, "grad_norm": 0.12546585298956595, "learning_rate": 1.9040949768686646e-06, "loss": 0.0282, "num_tokens": 154364349.0, "step": 1928 }, { "epoch": 2.4054897067997505, "grad_norm": 0.11144911675809406, "learning_rate": 1.900455106559737e-06, "loss": 0.0275, "num_tokens": 154444248.0, "step": 1929 }, { "epoch": 2.4067373674360573, "grad_norm": 0.10425862497450274, "learning_rate": 1.8968217633136909e-06, "loss": 0.0266, "num_tokens": 154523672.0, "step": 1930 }, { "epoch": 2.4079850280723645, "grad_norm": 0.10633358249196864, "learning_rate": 1.893194953718875e-06, "loss": 0.0267, "num_tokens": 154603383.0, "step": 1931 }, { "epoch": 2.4092326887086712, "grad_norm": 0.11025002525121674, "learning_rate": 1.8895746843517892e-06, "loss": 0.0273, "num_tokens": 154683826.0, "step": 1932 }, { "epoch": 2.410480349344978, "grad_norm": 0.12840305730755552, "learning_rate": 1.8859609617770786e-06, "loss": 0.0436, "num_tokens": 154764256.0, "step": 1933 }, { "epoch": 2.4117280099812852, "grad_norm": 0.10790452835307801, "learning_rate": 1.8823537925475143e-06, "loss": 0.0272, "num_tokens": 154843472.0, "step": 1934 }, { "epoch": 2.412975670617592, "grad_norm": 0.10970134188031404, "learning_rate": 1.8787531832039846e-06, "loss": 0.0278, "num_tokens": 154923415.0, "step": 1935 }, { "epoch": 2.4142233312538988, "grad_norm": 0.10844976031889222, "learning_rate": 1.8751591402754802e-06, "loss": 0.0271, "num_tokens": 155001644.0, "step": 1936 }, { "epoch": 2.415470991890206, "grad_norm": 0.10790804890822218, "learning_rate": 1.8715716702790903e-06, "loss": 0.0278, "num_tokens": 155082689.0, "step": 1937 }, { "epoch": 2.4167186525265127, "grad_norm": 0.10777373189548671, "learning_rate": 1.8679907797199798e-06, "loss": 0.0271, "num_tokens": 155161299.0, "step": 1938 }, { "epoch": 2.41796631316282, "grad_norm": 0.10873821982168631, "learning_rate": 1.8644164750913868e-06, "loss": 0.0274, "num_tokens": 155240482.0, "step": 1939 }, { "epoch": 2.4192139737991267, "grad_norm": 0.10820841964710465, "learning_rate": 1.8608487628746072e-06, "loss": 0.0272, "num_tokens": 155320020.0, "step": 1940 }, { "epoch": 2.4204616344354335, "grad_norm": 0.11112717770569587, "learning_rate": 1.8572876495389808e-06, "loss": 0.0272, "num_tokens": 155400618.0, "step": 1941 }, { "epoch": 2.4217092950717403, "grad_norm": 0.11217938609690513, "learning_rate": 1.8537331415418802e-06, "loss": 0.0274, "num_tokens": 155480528.0, "step": 1942 }, { "epoch": 2.4229569557080475, "grad_norm": 0.11097473906028339, "learning_rate": 1.8501852453287056e-06, "loss": 0.0272, "num_tokens": 155562620.0, "step": 1943 }, { "epoch": 2.4242046163443542, "grad_norm": 0.10818902006610549, "learning_rate": 1.846643967332865e-06, "loss": 0.0274, "num_tokens": 155641589.0, "step": 1944 }, { "epoch": 2.4254522769806615, "grad_norm": 0.11177617772827625, "learning_rate": 1.8431093139757635e-06, "loss": 0.028, "num_tokens": 155723189.0, "step": 1945 }, { "epoch": 2.4266999376169682, "grad_norm": 0.11904079761014202, "learning_rate": 1.8395812916667974e-06, "loss": 0.0276, "num_tokens": 155802507.0, "step": 1946 }, { "epoch": 2.427947598253275, "grad_norm": 0.10796895127972975, "learning_rate": 1.836059906803339e-06, "loss": 0.0275, "num_tokens": 155882396.0, "step": 1947 }, { "epoch": 2.429195258889582, "grad_norm": 0.11670361221308223, "learning_rate": 1.832545165770721e-06, "loss": 0.0283, "num_tokens": 155962352.0, "step": 1948 }, { "epoch": 2.430442919525889, "grad_norm": 0.11461518816560916, "learning_rate": 1.8290370749422327e-06, "loss": 0.0279, "num_tokens": 156043890.0, "step": 1949 }, { "epoch": 2.4316905801621957, "grad_norm": 0.10679786955474743, "learning_rate": 1.8255356406791036e-06, "loss": 0.0272, "num_tokens": 156123445.0, "step": 1950 }, { "epoch": 2.432938240798503, "grad_norm": 0.10688075962587382, "learning_rate": 1.82204086933049e-06, "loss": 0.0268, "num_tokens": 156203255.0, "step": 1951 }, { "epoch": 2.4341859014348097, "grad_norm": 0.11372182933826729, "learning_rate": 1.8185527672334712e-06, "loss": 0.0265, "num_tokens": 156282984.0, "step": 1952 }, { "epoch": 2.4354335620711165, "grad_norm": 0.11467488601047325, "learning_rate": 1.8150713407130283e-06, "loss": 0.0278, "num_tokens": 156362864.0, "step": 1953 }, { "epoch": 2.4366812227074237, "grad_norm": 0.11881203317305625, "learning_rate": 1.8115965960820414e-06, "loss": 0.0284, "num_tokens": 156443925.0, "step": 1954 }, { "epoch": 2.4379288833437305, "grad_norm": 0.11291508916496522, "learning_rate": 1.8081285396412738e-06, "loss": 0.0275, "num_tokens": 156526214.0, "step": 1955 }, { "epoch": 2.4391765439800372, "grad_norm": 0.1127357370742545, "learning_rate": 1.8046671776793584e-06, "loss": 0.028, "num_tokens": 156606671.0, "step": 1956 }, { "epoch": 2.4404242046163445, "grad_norm": 0.11312286561244898, "learning_rate": 1.80121251647279e-06, "loss": 0.0274, "num_tokens": 156686884.0, "step": 1957 }, { "epoch": 2.4416718652526512, "grad_norm": 0.11771902116431116, "learning_rate": 1.7977645622859157e-06, "loss": 0.0285, "num_tokens": 156767153.0, "step": 1958 }, { "epoch": 2.442919525888958, "grad_norm": 0.11286969218942611, "learning_rate": 1.7943233213709173e-06, "loss": 0.0276, "num_tokens": 156848405.0, "step": 1959 }, { "epoch": 2.444167186525265, "grad_norm": 0.10775615689943846, "learning_rate": 1.7908887999678046e-06, "loss": 0.0269, "num_tokens": 156927640.0, "step": 1960 }, { "epoch": 2.445414847161572, "grad_norm": 0.12023464215065506, "learning_rate": 1.7874610043044027e-06, "loss": 0.0288, "num_tokens": 157009867.0, "step": 1961 }, { "epoch": 2.446662507797879, "grad_norm": 0.1109760515355431, "learning_rate": 1.7840399405963432e-06, "loss": 0.027, "num_tokens": 157090040.0, "step": 1962 }, { "epoch": 2.447910168434186, "grad_norm": 0.11510300631971997, "learning_rate": 1.7806256150470472e-06, "loss": 0.0278, "num_tokens": 157171270.0, "step": 1963 }, { "epoch": 2.4491578290704927, "grad_norm": 0.113356826538854, "learning_rate": 1.7772180338477173e-06, "loss": 0.0269, "num_tokens": 157250589.0, "step": 1964 }, { "epoch": 2.4504054897068, "grad_norm": 0.11020161557209646, "learning_rate": 1.7738172031773322e-06, "loss": 0.0272, "num_tokens": 157332005.0, "step": 1965 }, { "epoch": 2.4516531503431067, "grad_norm": 0.10920573621039224, "learning_rate": 1.7704231292026219e-06, "loss": 0.0274, "num_tokens": 157412201.0, "step": 1966 }, { "epoch": 2.4529008109794135, "grad_norm": 0.11239699652286433, "learning_rate": 1.76703581807807e-06, "loss": 0.0272, "num_tokens": 157493943.0, "step": 1967 }, { "epoch": 2.4541484716157207, "grad_norm": 0.1082221654475512, "learning_rate": 1.7636552759458963e-06, "loss": 0.0269, "num_tokens": 157572738.0, "step": 1968 }, { "epoch": 2.4553961322520275, "grad_norm": 0.11287162279835761, "learning_rate": 1.760281508936045e-06, "loss": 0.0274, "num_tokens": 157653339.0, "step": 1969 }, { "epoch": 2.4566437928883342, "grad_norm": 0.11440457618886876, "learning_rate": 1.7569145231661738e-06, "loss": 0.028, "num_tokens": 157734700.0, "step": 1970 }, { "epoch": 2.4578914535246414, "grad_norm": 0.11118819360031555, "learning_rate": 1.753554324741648e-06, "loss": 0.0272, "num_tokens": 157815324.0, "step": 1971 }, { "epoch": 2.459139114160948, "grad_norm": 0.11370081079972753, "learning_rate": 1.7502009197555215e-06, "loss": 0.0287, "num_tokens": 157896290.0, "step": 1972 }, { "epoch": 2.460386774797255, "grad_norm": 0.13188166291275744, "learning_rate": 1.7468543142885308e-06, "loss": 0.0291, "num_tokens": 157978183.0, "step": 1973 }, { "epoch": 2.461634435433562, "grad_norm": 0.10901400798519152, "learning_rate": 1.7435145144090852e-06, "loss": 0.0281, "num_tokens": 158056882.0, "step": 1974 }, { "epoch": 2.462882096069869, "grad_norm": 0.10230118141977014, "learning_rate": 1.740181526173248e-06, "loss": 0.0271, "num_tokens": 158136794.0, "step": 1975 }, { "epoch": 2.4641297567061757, "grad_norm": 0.11403183233637615, "learning_rate": 1.736855355624737e-06, "loss": 0.0278, "num_tokens": 158216836.0, "step": 1976 }, { "epoch": 2.465377417342483, "grad_norm": 0.10903076514079771, "learning_rate": 1.7335360087949048e-06, "loss": 0.0275, "num_tokens": 158297798.0, "step": 1977 }, { "epoch": 2.4666250779787897, "grad_norm": 0.10466959609108883, "learning_rate": 1.73022349170273e-06, "loss": 0.0271, "num_tokens": 158378165.0, "step": 1978 }, { "epoch": 2.467872738615097, "grad_norm": 0.11147187512684335, "learning_rate": 1.7269178103548057e-06, "loss": 0.0271, "num_tokens": 158459178.0, "step": 1979 }, { "epoch": 2.4691203992514037, "grad_norm": 0.11409682525660257, "learning_rate": 1.723618970745334e-06, "loss": 0.0281, "num_tokens": 158540418.0, "step": 1980 }, { "epoch": 2.4703680598877105, "grad_norm": 0.10477788112973571, "learning_rate": 1.7203269788561067e-06, "loss": 0.027, "num_tokens": 158619445.0, "step": 1981 }, { "epoch": 2.4716157205240172, "grad_norm": 0.1074788211668467, "learning_rate": 1.7170418406564982e-06, "loss": 0.027, "num_tokens": 158700296.0, "step": 1982 }, { "epoch": 2.4728633811603244, "grad_norm": 0.11129094430099397, "learning_rate": 1.7137635621034614e-06, "loss": 0.0277, "num_tokens": 158780253.0, "step": 1983 }, { "epoch": 2.474111041796631, "grad_norm": 0.10164355680147183, "learning_rate": 1.7104921491415038e-06, "loss": 0.0268, "num_tokens": 158860384.0, "step": 1984 }, { "epoch": 2.4753587024329384, "grad_norm": 0.11686566136668367, "learning_rate": 1.7072276077026856e-06, "loss": 0.0283, "num_tokens": 158939479.0, "step": 1985 }, { "epoch": 2.476606363069245, "grad_norm": 0.11938137497936563, "learning_rate": 1.7039699437066076e-06, "loss": 0.0274, "num_tokens": 159021019.0, "step": 1986 }, { "epoch": 2.477854023705552, "grad_norm": 0.110769383334913, "learning_rate": 1.7007191630604003e-06, "loss": 0.0269, "num_tokens": 159100911.0, "step": 1987 }, { "epoch": 2.479101684341859, "grad_norm": 0.1066179656774663, "learning_rate": 1.6974752716587092e-06, "loss": 0.0276, "num_tokens": 159180930.0, "step": 1988 }, { "epoch": 2.480349344978166, "grad_norm": 0.12059952000176534, "learning_rate": 1.6942382753836912e-06, "loss": 0.0287, "num_tokens": 159260601.0, "step": 1989 }, { "epoch": 2.4815970056144727, "grad_norm": 0.11227342940000516, "learning_rate": 1.691008180105e-06, "loss": 0.0273, "num_tokens": 159340555.0, "step": 1990 }, { "epoch": 2.48284466625078, "grad_norm": 0.1098479649477286, "learning_rate": 1.6877849916797728e-06, "loss": 0.0288, "num_tokens": 159420038.0, "step": 1991 }, { "epoch": 2.4840923268870867, "grad_norm": 0.1111828471383589, "learning_rate": 1.684568715952626e-06, "loss": 0.0277, "num_tokens": 159499863.0, "step": 1992 }, { "epoch": 2.4853399875233935, "grad_norm": 0.11053778696407449, "learning_rate": 1.6813593587556392e-06, "loss": 0.0272, "num_tokens": 159579187.0, "step": 1993 }, { "epoch": 2.4865876481597007, "grad_norm": 0.11439406236806961, "learning_rate": 1.6781569259083463e-06, "loss": 0.0282, "num_tokens": 159659711.0, "step": 1994 }, { "epoch": 2.4878353087960074, "grad_norm": 0.10690423131408978, "learning_rate": 1.6749614232177273e-06, "loss": 0.0281, "num_tokens": 159740822.0, "step": 1995 }, { "epoch": 2.489082969432314, "grad_norm": 0.11272934482085559, "learning_rate": 1.6717728564781927e-06, "loss": 0.0277, "num_tokens": 159820583.0, "step": 1996 }, { "epoch": 2.4903306300686214, "grad_norm": 0.11712009141925543, "learning_rate": 1.6685912314715797e-06, "loss": 0.0276, "num_tokens": 159901927.0, "step": 1997 }, { "epoch": 2.491578290704928, "grad_norm": 0.11201642856684556, "learning_rate": 1.6654165539671342e-06, "loss": 0.0273, "num_tokens": 159982551.0, "step": 1998 }, { "epoch": 2.492825951341235, "grad_norm": 0.12217007620784424, "learning_rate": 1.6622488297215079e-06, "loss": 0.0281, "num_tokens": 160063000.0, "step": 1999 }, { "epoch": 2.494073611977542, "grad_norm": 0.10737268262091881, "learning_rate": 1.6590880644787407e-06, "loss": 0.0268, "num_tokens": 160142294.0, "step": 2000 }, { "epoch": 2.495321272613849, "grad_norm": 0.1250676932249431, "learning_rate": 1.6559342639702563e-06, "loss": 0.0306, "num_tokens": 160220912.0, "step": 2001 }, { "epoch": 2.496568933250156, "grad_norm": 0.11211010682205132, "learning_rate": 1.6527874339148484e-06, "loss": 0.0264, "num_tokens": 160301952.0, "step": 2002 }, { "epoch": 2.497816593886463, "grad_norm": 0.11786513203484543, "learning_rate": 1.6496475800186702e-06, "loss": 0.0281, "num_tokens": 160381913.0, "step": 2003 }, { "epoch": 2.4990642545227697, "grad_norm": 0.10893692645197386, "learning_rate": 1.6465147079752264e-06, "loss": 0.0275, "num_tokens": 160462281.0, "step": 2004 }, { "epoch": 2.5003119151590765, "grad_norm": 0.11526335706838876, "learning_rate": 1.6433888234653614e-06, "loss": 0.0278, "num_tokens": 160542998.0, "step": 2005 }, { "epoch": 2.5015595757953837, "grad_norm": 0.1094817219693971, "learning_rate": 1.6402699321572485e-06, "loss": 0.0267, "num_tokens": 160621811.0, "step": 2006 }, { "epoch": 2.5028072364316905, "grad_norm": 0.11249905162283616, "learning_rate": 1.6371580397063788e-06, "loss": 0.0277, "num_tokens": 160702177.0, "step": 2007 }, { "epoch": 2.5040548970679977, "grad_norm": 0.1218621001840924, "learning_rate": 1.6340531517555563e-06, "loss": 0.0294, "num_tokens": 160783219.0, "step": 2008 }, { "epoch": 2.5053025577043044, "grad_norm": 0.10986240798677817, "learning_rate": 1.6309552739348804e-06, "loss": 0.0268, "num_tokens": 160862773.0, "step": 2009 }, { "epoch": 2.506550218340611, "grad_norm": 0.1207523920480209, "learning_rate": 1.6278644118617375e-06, "loss": 0.0275, "num_tokens": 160942474.0, "step": 2010 }, { "epoch": 2.5077978789769184, "grad_norm": 0.11803187004719243, "learning_rate": 1.6247805711407993e-06, "loss": 0.0279, "num_tokens": 161021964.0, "step": 2011 }, { "epoch": 2.509045539613225, "grad_norm": 0.11522862481685167, "learning_rate": 1.6217037573639983e-06, "loss": 0.0278, "num_tokens": 161101620.0, "step": 2012 }, { "epoch": 2.5102932002495324, "grad_norm": 0.11566448225959479, "learning_rate": 1.6186339761105275e-06, "loss": 0.0288, "num_tokens": 161180903.0, "step": 2013 }, { "epoch": 2.511540860885839, "grad_norm": 0.10653159823487614, "learning_rate": 1.6155712329468305e-06, "loss": 0.0267, "num_tokens": 161260133.0, "step": 2014 }, { "epoch": 2.512788521522146, "grad_norm": 0.11974291187007381, "learning_rate": 1.6125155334265846e-06, "loss": 0.0289, "num_tokens": 161340599.0, "step": 2015 }, { "epoch": 2.5140361821584527, "grad_norm": 0.10950555678047849, "learning_rate": 1.6094668830906959e-06, "loss": 0.0281, "num_tokens": 161421054.0, "step": 2016 }, { "epoch": 2.51528384279476, "grad_norm": 0.11950555449558682, "learning_rate": 1.6064252874672904e-06, "loss": 0.028, "num_tokens": 161501219.0, "step": 2017 }, { "epoch": 2.5165315034310667, "grad_norm": 0.11235098261736175, "learning_rate": 1.6033907520717008e-06, "loss": 0.0274, "num_tokens": 161580744.0, "step": 2018 }, { "epoch": 2.517779164067374, "grad_norm": 0.1093365486795711, "learning_rate": 1.6003632824064553e-06, "loss": 0.0267, "num_tokens": 161660539.0, "step": 2019 }, { "epoch": 2.5190268247036807, "grad_norm": 0.11976522826322118, "learning_rate": 1.5973428839612727e-06, "loss": 0.028, "num_tokens": 161741138.0, "step": 2020 }, { "epoch": 2.5202744853399874, "grad_norm": 0.1156287357078057, "learning_rate": 1.5943295622130483e-06, "loss": 0.028, "num_tokens": 161821837.0, "step": 2021 }, { "epoch": 2.521522145976294, "grad_norm": 0.11441831410599412, "learning_rate": 1.5913233226258437e-06, "loss": 0.0271, "num_tokens": 161900788.0, "step": 2022 }, { "epoch": 2.5227698066126014, "grad_norm": 0.1112970332304599, "learning_rate": 1.5883241706508823e-06, "loss": 0.0274, "num_tokens": 161980892.0, "step": 2023 }, { "epoch": 2.524017467248908, "grad_norm": 0.12122029274976619, "learning_rate": 1.5853321117265317e-06, "loss": 0.0278, "num_tokens": 162060107.0, "step": 2024 }, { "epoch": 2.5252651278852154, "grad_norm": 0.1129961042647627, "learning_rate": 1.5823471512782983e-06, "loss": 0.0276, "num_tokens": 162140320.0, "step": 2025 }, { "epoch": 2.526512788521522, "grad_norm": 0.10694528173242357, "learning_rate": 1.579369294718819e-06, "loss": 0.0273, "num_tokens": 162220263.0, "step": 2026 }, { "epoch": 2.527760449157829, "grad_norm": 0.10649726142962046, "learning_rate": 1.5763985474478483e-06, "loss": 0.0265, "num_tokens": 162301940.0, "step": 2027 }, { "epoch": 2.529008109794136, "grad_norm": 0.11052731141682264, "learning_rate": 1.5734349148522471e-06, "loss": 0.0266, "num_tokens": 162381737.0, "step": 2028 }, { "epoch": 2.530255770430443, "grad_norm": 0.11827410827331811, "learning_rate": 1.5704784023059788e-06, "loss": 0.0283, "num_tokens": 162461425.0, "step": 2029 }, { "epoch": 2.5315034310667497, "grad_norm": 0.11322497252667106, "learning_rate": 1.5675290151700937e-06, "loss": 0.0272, "num_tokens": 162541913.0, "step": 2030 }, { "epoch": 2.532751091703057, "grad_norm": 0.1169346308768019, "learning_rate": 1.5645867587927208e-06, "loss": 0.0276, "num_tokens": 162623234.0, "step": 2031 }, { "epoch": 2.5339987523393637, "grad_norm": 0.12004814618546314, "learning_rate": 1.561651638509062e-06, "loss": 0.0274, "num_tokens": 162702087.0, "step": 2032 }, { "epoch": 2.5352464129756704, "grad_norm": 0.10868378154550547, "learning_rate": 1.5587236596413773e-06, "loss": 0.0276, "num_tokens": 162782960.0, "step": 2033 }, { "epoch": 2.5364940736119777, "grad_norm": 0.11068877689549345, "learning_rate": 1.5558028274989778e-06, "loss": 0.028, "num_tokens": 162862837.0, "step": 2034 }, { "epoch": 2.5377417342482844, "grad_norm": 0.1438726002320652, "learning_rate": 1.5528891473782126e-06, "loss": 0.027, "num_tokens": 162941638.0, "step": 2035 }, { "epoch": 2.5389893948845916, "grad_norm": 0.11403604077645453, "learning_rate": 1.5499826245624674e-06, "loss": 0.0285, "num_tokens": 163021811.0, "step": 2036 }, { "epoch": 2.5402370555208984, "grad_norm": 0.11230917197373902, "learning_rate": 1.547083264322145e-06, "loss": 0.0278, "num_tokens": 163101898.0, "step": 2037 }, { "epoch": 2.541484716157205, "grad_norm": 0.11526915003925865, "learning_rate": 1.5441910719146616e-06, "loss": 0.0273, "num_tokens": 163182128.0, "step": 2038 }, { "epoch": 2.542732376793512, "grad_norm": 0.11514119570048018, "learning_rate": 1.541306052584437e-06, "loss": 0.0277, "num_tokens": 163263096.0, "step": 2039 }, { "epoch": 2.543980037429819, "grad_norm": 0.11954122082354755, "learning_rate": 1.5384282115628834e-06, "loss": 0.0275, "num_tokens": 163343548.0, "step": 2040 }, { "epoch": 2.545227698066126, "grad_norm": 0.11066461352482032, "learning_rate": 1.5355575540683953e-06, "loss": 0.0278, "num_tokens": 163423064.0, "step": 2041 }, { "epoch": 2.546475358702433, "grad_norm": 0.11137639959635122, "learning_rate": 1.5326940853063443e-06, "loss": 0.0282, "num_tokens": 163502397.0, "step": 2042 }, { "epoch": 2.54772301933874, "grad_norm": 0.11324148029107538, "learning_rate": 1.5298378104690636e-06, "loss": 0.027, "num_tokens": 163581986.0, "step": 2043 }, { "epoch": 2.5489706799750467, "grad_norm": 0.12329020795643553, "learning_rate": 1.5269887347358414e-06, "loss": 0.0279, "num_tokens": 163661772.0, "step": 2044 }, { "epoch": 2.5502183406113534, "grad_norm": 0.11352719316396652, "learning_rate": 1.5241468632729161e-06, "loss": 0.028, "num_tokens": 163742085.0, "step": 2045 }, { "epoch": 2.5514660012476607, "grad_norm": 0.11431548220101798, "learning_rate": 1.5213122012334572e-06, "loss": 0.0277, "num_tokens": 163823576.0, "step": 2046 }, { "epoch": 2.5527136618839674, "grad_norm": 0.1151041171348859, "learning_rate": 1.5184847537575647e-06, "loss": 0.0273, "num_tokens": 163903399.0, "step": 2047 }, { "epoch": 2.5539613225202746, "grad_norm": 0.11215314990558678, "learning_rate": 1.5156645259722565e-06, "loss": 0.0276, "num_tokens": 163984511.0, "step": 2048 }, { "epoch": 2.5552089831565814, "grad_norm": 0.11132081944460806, "learning_rate": 1.5128515229914568e-06, "loss": 0.0288, "num_tokens": 164064533.0, "step": 2049 }, { "epoch": 2.556456643792888, "grad_norm": 0.11240110122489118, "learning_rate": 1.5100457499159897e-06, "loss": 0.0276, "num_tokens": 164145208.0, "step": 2050 }, { "epoch": 2.5577043044291954, "grad_norm": 0.12087806227209612, "learning_rate": 1.507247211833572e-06, "loss": 0.0276, "num_tokens": 164226437.0, "step": 2051 }, { "epoch": 2.558951965065502, "grad_norm": 0.11949968556805396, "learning_rate": 1.5044559138187967e-06, "loss": 0.0273, "num_tokens": 164306701.0, "step": 2052 }, { "epoch": 2.5601996257018094, "grad_norm": 0.10637452873416925, "learning_rate": 1.5016718609331315e-06, "loss": 0.0269, "num_tokens": 164386138.0, "step": 2053 }, { "epoch": 2.561447286338116, "grad_norm": 0.10741900874283838, "learning_rate": 1.4988950582249061e-06, "loss": 0.0262, "num_tokens": 164466296.0, "step": 2054 }, { "epoch": 2.562694946974423, "grad_norm": 0.10847356735320615, "learning_rate": 1.4961255107293044e-06, "loss": 0.0273, "num_tokens": 164546034.0, "step": 2055 }, { "epoch": 2.5639426076107297, "grad_norm": 0.10867775728226359, "learning_rate": 1.4933632234683506e-06, "loss": 0.0275, "num_tokens": 164627152.0, "step": 2056 }, { "epoch": 2.565190268247037, "grad_norm": 0.10863086055220095, "learning_rate": 1.4906082014509088e-06, "loss": 0.0275, "num_tokens": 164706832.0, "step": 2057 }, { "epoch": 2.5664379288833437, "grad_norm": 0.11004843698519694, "learning_rate": 1.4878604496726653e-06, "loss": 0.0281, "num_tokens": 164786256.0, "step": 2058 }, { "epoch": 2.567685589519651, "grad_norm": 0.11568288570947931, "learning_rate": 1.4851199731161243e-06, "loss": 0.027, "num_tokens": 164865074.0, "step": 2059 }, { "epoch": 2.5689332501559576, "grad_norm": 0.10186204742079674, "learning_rate": 1.4823867767505981e-06, "loss": 0.0265, "num_tokens": 164943814.0, "step": 2060 }, { "epoch": 2.5701809107922644, "grad_norm": 0.10521205156822269, "learning_rate": 1.4796608655322001e-06, "loss": 0.0265, "num_tokens": 165022913.0, "step": 2061 }, { "epoch": 2.571428571428571, "grad_norm": 0.11589068913118131, "learning_rate": 1.476942244403829e-06, "loss": 0.0289, "num_tokens": 165102030.0, "step": 2062 }, { "epoch": 2.5726762320648784, "grad_norm": 0.11233950248375833, "learning_rate": 1.4742309182951663e-06, "loss": 0.0273, "num_tokens": 165182179.0, "step": 2063 }, { "epoch": 2.573923892701185, "grad_norm": 0.11565035317539887, "learning_rate": 1.4715268921226677e-06, "loss": 0.0279, "num_tokens": 165262779.0, "step": 2064 }, { "epoch": 2.5751715533374924, "grad_norm": 0.11808315287951174, "learning_rate": 1.468830170789548e-06, "loss": 0.0278, "num_tokens": 165344308.0, "step": 2065 }, { "epoch": 2.576419213973799, "grad_norm": 0.11443778145606423, "learning_rate": 1.4661407591857795e-06, "loss": 0.0276, "num_tokens": 165423934.0, "step": 2066 }, { "epoch": 2.577666874610106, "grad_norm": 0.11190198300939089, "learning_rate": 1.4634586621880786e-06, "loss": 0.0273, "num_tokens": 165503345.0, "step": 2067 }, { "epoch": 2.578914535246413, "grad_norm": 0.11593353577598203, "learning_rate": 1.4607838846598959e-06, "loss": 0.0263, "num_tokens": 165583590.0, "step": 2068 }, { "epoch": 2.58016219588272, "grad_norm": 0.11449567615373328, "learning_rate": 1.4581164314514127e-06, "loss": 0.0279, "num_tokens": 165663697.0, "step": 2069 }, { "epoch": 2.581409856519027, "grad_norm": 0.10372245010081463, "learning_rate": 1.4554563073995284e-06, "loss": 0.027, "num_tokens": 165744637.0, "step": 2070 }, { "epoch": 2.582657517155334, "grad_norm": 0.10739901271925287, "learning_rate": 1.452803517327852e-06, "loss": 0.0269, "num_tokens": 165825845.0, "step": 2071 }, { "epoch": 2.5839051777916406, "grad_norm": 0.10419262399020085, "learning_rate": 1.450158066046692e-06, "loss": 0.0267, "num_tokens": 165906115.0, "step": 2072 }, { "epoch": 2.5851528384279474, "grad_norm": 0.10832482949706342, "learning_rate": 1.4475199583530536e-06, "loss": 0.0272, "num_tokens": 165986633.0, "step": 2073 }, { "epoch": 2.5864004990642546, "grad_norm": 0.11424246115691489, "learning_rate": 1.444889199030622e-06, "loss": 0.0284, "num_tokens": 166066387.0, "step": 2074 }, { "epoch": 2.5876481597005614, "grad_norm": 0.11100527216764833, "learning_rate": 1.4422657928497572e-06, "loss": 0.0265, "num_tokens": 166148319.0, "step": 2075 }, { "epoch": 2.5888958203368686, "grad_norm": 0.12099284205512863, "learning_rate": 1.4396497445674917e-06, "loss": 0.0296, "num_tokens": 166228531.0, "step": 2076 }, { "epoch": 2.5901434809731754, "grad_norm": 0.10891165530512759, "learning_rate": 1.4370410589275096e-06, "loss": 0.0279, "num_tokens": 166307862.0, "step": 2077 }, { "epoch": 2.591391141609482, "grad_norm": 0.12679386624360375, "learning_rate": 1.4344397406601454e-06, "loss": 0.0293, "num_tokens": 166388213.0, "step": 2078 }, { "epoch": 2.592638802245789, "grad_norm": 0.11178974797840838, "learning_rate": 1.4318457944823775e-06, "loss": 0.028, "num_tokens": 166468608.0, "step": 2079 }, { "epoch": 2.593886462882096, "grad_norm": 0.10698791554236746, "learning_rate": 1.4292592250978137e-06, "loss": 0.0268, "num_tokens": 166548055.0, "step": 2080 }, { "epoch": 2.595134123518403, "grad_norm": 0.10402086737935964, "learning_rate": 1.4266800371966844e-06, "loss": 0.0265, "num_tokens": 166627914.0, "step": 2081 }, { "epoch": 2.59638178415471, "grad_norm": 0.11256053343287431, "learning_rate": 1.424108235455838e-06, "loss": 0.0283, "num_tokens": 166708046.0, "step": 2082 }, { "epoch": 2.597629444791017, "grad_norm": 0.11648920137262059, "learning_rate": 1.4215438245387303e-06, "loss": 0.0278, "num_tokens": 166788951.0, "step": 2083 }, { "epoch": 2.5988771054273236, "grad_norm": 0.11412389464634487, "learning_rate": 1.41898680909541e-06, "loss": 0.0279, "num_tokens": 166867908.0, "step": 2084 }, { "epoch": 2.600124766063631, "grad_norm": 0.14616857946554868, "learning_rate": 1.4164371937625222e-06, "loss": 0.0274, "num_tokens": 166947938.0, "step": 2085 }, { "epoch": 2.6013724266999376, "grad_norm": 0.10641921005233325, "learning_rate": 1.4138949831632879e-06, "loss": 0.027, "num_tokens": 167028248.0, "step": 2086 }, { "epoch": 2.6026200873362444, "grad_norm": 0.11340157643915183, "learning_rate": 1.4113601819075037e-06, "loss": 0.0275, "num_tokens": 167107712.0, "step": 2087 }, { "epoch": 2.6038677479725516, "grad_norm": 0.11403401971381442, "learning_rate": 1.4088327945915315e-06, "loss": 0.0268, "num_tokens": 167188583.0, "step": 2088 }, { "epoch": 2.6051154086088584, "grad_norm": 0.10625095668404386, "learning_rate": 1.4063128257982867e-06, "loss": 0.0267, "num_tokens": 167268518.0, "step": 2089 }, { "epoch": 2.606363069245165, "grad_norm": 0.1171168338572617, "learning_rate": 1.4038002800972362e-06, "loss": 0.0275, "num_tokens": 167348636.0, "step": 2090 }, { "epoch": 2.6076107298814724, "grad_norm": 0.11542335973973553, "learning_rate": 1.401295162044383e-06, "loss": 0.0274, "num_tokens": 167429322.0, "step": 2091 }, { "epoch": 2.608858390517779, "grad_norm": 0.11727306920785573, "learning_rate": 1.3987974761822656e-06, "loss": 0.0275, "num_tokens": 167510283.0, "step": 2092 }, { "epoch": 2.6101060511540863, "grad_norm": 0.11766850450132074, "learning_rate": 1.3963072270399411e-06, "loss": 0.0273, "num_tokens": 167590184.0, "step": 2093 }, { "epoch": 2.611353711790393, "grad_norm": 0.11918576619487399, "learning_rate": 1.393824419132986e-06, "loss": 0.0279, "num_tokens": 167669021.0, "step": 2094 }, { "epoch": 2.6126013724267, "grad_norm": 0.11307732162265724, "learning_rate": 1.3913490569634796e-06, "loss": 0.0277, "num_tokens": 167748127.0, "step": 2095 }, { "epoch": 2.6138490330630066, "grad_norm": 0.12446227031099523, "learning_rate": 1.388881145020002e-06, "loss": 0.0297, "num_tokens": 167828350.0, "step": 2096 }, { "epoch": 2.615096693699314, "grad_norm": 0.11026469374300878, "learning_rate": 1.3864206877776245e-06, "loss": 0.0273, "num_tokens": 167908530.0, "step": 2097 }, { "epoch": 2.6163443543356206, "grad_norm": 0.11295221539796105, "learning_rate": 1.3839676896978997e-06, "loss": 0.0275, "num_tokens": 167989313.0, "step": 2098 }, { "epoch": 2.617592014971928, "grad_norm": 0.11298609559496968, "learning_rate": 1.3815221552288541e-06, "loss": 0.0276, "num_tokens": 168069559.0, "step": 2099 }, { "epoch": 2.6188396756082346, "grad_norm": 0.1141025393450742, "learning_rate": 1.3790840888049802e-06, "loss": 0.0272, "num_tokens": 168148187.0, "step": 2100 }, { "epoch": 2.6200873362445414, "grad_norm": 0.11929918883054025, "learning_rate": 1.3766534948472307e-06, "loss": 0.0271, "num_tokens": 168228189.0, "step": 2101 }, { "epoch": 2.621334996880848, "grad_norm": 0.11988289797660306, "learning_rate": 1.3742303777630057e-06, "loss": 0.0276, "num_tokens": 168308932.0, "step": 2102 }, { "epoch": 2.6225826575171554, "grad_norm": 0.11436560741581663, "learning_rate": 1.3718147419461497e-06, "loss": 0.0282, "num_tokens": 168388848.0, "step": 2103 }, { "epoch": 2.623830318153462, "grad_norm": 0.1056496402693744, "learning_rate": 1.3694065917769414e-06, "loss": 0.027, "num_tokens": 168467832.0, "step": 2104 }, { "epoch": 2.6250779787897693, "grad_norm": 0.1104079059611424, "learning_rate": 1.367005931622084e-06, "loss": 0.0282, "num_tokens": 168547708.0, "step": 2105 }, { "epoch": 2.626325639426076, "grad_norm": 0.10429616151130164, "learning_rate": 1.3646127658346992e-06, "loss": 0.0267, "num_tokens": 168627496.0, "step": 2106 }, { "epoch": 2.627573300062383, "grad_norm": 0.11808419996882835, "learning_rate": 1.3622270987543215e-06, "loss": 0.0275, "num_tokens": 168706999.0, "step": 2107 }, { "epoch": 2.62882096069869, "grad_norm": 0.11162808996935646, "learning_rate": 1.3598489347068858e-06, "loss": 0.0265, "num_tokens": 168787359.0, "step": 2108 }, { "epoch": 2.630068621334997, "grad_norm": 0.11556801052351962, "learning_rate": 1.357478278004721e-06, "loss": 0.027, "num_tokens": 168868083.0, "step": 2109 }, { "epoch": 2.631316281971304, "grad_norm": 0.11227131668663762, "learning_rate": 1.3551151329465462e-06, "loss": 0.0272, "num_tokens": 168946701.0, "step": 2110 }, { "epoch": 2.632563942607611, "grad_norm": 0.1145171495962, "learning_rate": 1.3527595038174566e-06, "loss": 0.0269, "num_tokens": 169026964.0, "step": 2111 }, { "epoch": 2.6338116032439176, "grad_norm": 0.11258767236811885, "learning_rate": 1.35041139488892e-06, "loss": 0.0269, "num_tokens": 169106455.0, "step": 2112 }, { "epoch": 2.6350592638802244, "grad_norm": 0.1269169979219692, "learning_rate": 1.3480708104187685e-06, "loss": 0.0278, "num_tokens": 169187782.0, "step": 2113 }, { "epoch": 2.6363069245165316, "grad_norm": 0.10759670867627756, "learning_rate": 1.3457377546511882e-06, "loss": 0.0274, "num_tokens": 169268014.0, "step": 2114 }, { "epoch": 2.6375545851528384, "grad_norm": 0.10677322152837186, "learning_rate": 1.3434122318167142e-06, "loss": 0.0267, "num_tokens": 169347680.0, "step": 2115 }, { "epoch": 2.6388022457891456, "grad_norm": 0.1116462775595202, "learning_rate": 1.3410942461322236e-06, "loss": 0.0273, "num_tokens": 169428596.0, "step": 2116 }, { "epoch": 2.6400499064254523, "grad_norm": 0.11029769450411615, "learning_rate": 1.3387838018009239e-06, "loss": 0.0267, "num_tokens": 169509363.0, "step": 2117 }, { "epoch": 2.641297567061759, "grad_norm": 0.10789263559722907, "learning_rate": 1.3364809030123477e-06, "loss": 0.0265, "num_tokens": 169589358.0, "step": 2118 }, { "epoch": 2.642545227698066, "grad_norm": 0.11544419735850721, "learning_rate": 1.3341855539423499e-06, "loss": 0.0272, "num_tokens": 169669893.0, "step": 2119 }, { "epoch": 2.643792888334373, "grad_norm": 0.13240480297882418, "learning_rate": 1.3318977587530907e-06, "loss": 0.0441, "num_tokens": 169750499.0, "step": 2120 }, { "epoch": 2.64504054897068, "grad_norm": 0.10788331716169906, "learning_rate": 1.3296175215930326e-06, "loss": 0.0266, "num_tokens": 169829770.0, "step": 2121 }, { "epoch": 2.646288209606987, "grad_norm": 0.10324597149645515, "learning_rate": 1.3273448465969376e-06, "loss": 0.0273, "num_tokens": 169909873.0, "step": 2122 }, { "epoch": 2.647535870243294, "grad_norm": 0.1083672497231884, "learning_rate": 1.3250797378858507e-06, "loss": 0.0273, "num_tokens": 169990577.0, "step": 2123 }, { "epoch": 2.6487835308796006, "grad_norm": 0.10796760309733955, "learning_rate": 1.3228221995670987e-06, "loss": 0.0272, "num_tokens": 170069752.0, "step": 2124 }, { "epoch": 2.650031191515908, "grad_norm": 0.11470562508991235, "learning_rate": 1.3205722357342807e-06, "loss": 0.0281, "num_tokens": 170150197.0, "step": 2125 }, { "epoch": 2.6512788521522146, "grad_norm": 0.10707306739767271, "learning_rate": 1.3183298504672626e-06, "loss": 0.0276, "num_tokens": 170229547.0, "step": 2126 }, { "epoch": 2.6525265127885214, "grad_norm": 0.10919949194770662, "learning_rate": 1.316095047832166e-06, "loss": 0.0274, "num_tokens": 170309378.0, "step": 2127 }, { "epoch": 2.6537741734248286, "grad_norm": 0.11957546966930166, "learning_rate": 1.3138678318813618e-06, "loss": 0.0285, "num_tokens": 170389032.0, "step": 2128 }, { "epoch": 2.6550218340611353, "grad_norm": 0.11617481255786126, "learning_rate": 1.3116482066534686e-06, "loss": 0.027, "num_tokens": 170468247.0, "step": 2129 }, { "epoch": 2.656269494697442, "grad_norm": 0.11357476091195207, "learning_rate": 1.3094361761733356e-06, "loss": 0.0275, "num_tokens": 170548512.0, "step": 2130 }, { "epoch": 2.6575171553337493, "grad_norm": 0.10925157715831366, "learning_rate": 1.3072317444520449e-06, "loss": 0.027, "num_tokens": 170630459.0, "step": 2131 }, { "epoch": 2.658764815970056, "grad_norm": 0.1169285409735411, "learning_rate": 1.3050349154868946e-06, "loss": 0.037, "num_tokens": 170711970.0, "step": 2132 }, { "epoch": 2.6600124766063633, "grad_norm": 0.11614548941267049, "learning_rate": 1.3028456932614019e-06, "loss": 0.027, "num_tokens": 170791598.0, "step": 2133 }, { "epoch": 2.66126013724267, "grad_norm": 0.12300026143621852, "learning_rate": 1.3006640817452873e-06, "loss": 0.0278, "num_tokens": 170870743.0, "step": 2134 }, { "epoch": 2.662507797878977, "grad_norm": 0.10949480726257627, "learning_rate": 1.2984900848944727e-06, "loss": 0.0268, "num_tokens": 170950664.0, "step": 2135 }, { "epoch": 2.6637554585152836, "grad_norm": 0.11076298472497624, "learning_rate": 1.2963237066510715e-06, "loss": 0.0279, "num_tokens": 171030340.0, "step": 2136 }, { "epoch": 2.665003119151591, "grad_norm": 0.11388272829849243, "learning_rate": 1.2941649509433808e-06, "loss": 0.0269, "num_tokens": 171109325.0, "step": 2137 }, { "epoch": 2.6662507797878976, "grad_norm": 0.12341205651411256, "learning_rate": 1.2920138216858791e-06, "loss": 0.0286, "num_tokens": 171189575.0, "step": 2138 }, { "epoch": 2.667498440424205, "grad_norm": 0.11331697834247403, "learning_rate": 1.289870322779212e-06, "loss": 0.0285, "num_tokens": 171268927.0, "step": 2139 }, { "epoch": 2.6687461010605116, "grad_norm": 0.11882020374000711, "learning_rate": 1.2877344581101922e-06, "loss": 0.0272, "num_tokens": 171347477.0, "step": 2140 }, { "epoch": 2.6699937616968183, "grad_norm": 0.10972444404388286, "learning_rate": 1.2856062315517885e-06, "loss": 0.0278, "num_tokens": 171427407.0, "step": 2141 }, { "epoch": 2.671241422333125, "grad_norm": 0.11946680862932044, "learning_rate": 1.2834856469631174e-06, "loss": 0.028, "num_tokens": 171507650.0, "step": 2142 }, { "epoch": 2.6724890829694323, "grad_norm": 0.10858577662792648, "learning_rate": 1.28137270818944e-06, "loss": 0.0276, "num_tokens": 171588030.0, "step": 2143 }, { "epoch": 2.673736743605739, "grad_norm": 0.11241775634496111, "learning_rate": 1.279267419062155e-06, "loss": 0.0272, "num_tokens": 171668622.0, "step": 2144 }, { "epoch": 2.6749844042420463, "grad_norm": 0.1138275210262026, "learning_rate": 1.2771697833987852e-06, "loss": 0.0274, "num_tokens": 171747879.0, "step": 2145 }, { "epoch": 2.676232064878353, "grad_norm": 0.12107882370247575, "learning_rate": 1.2750798050029782e-06, "loss": 0.0301, "num_tokens": 171828495.0, "step": 2146 }, { "epoch": 2.67747972551466, "grad_norm": 0.11812881048310148, "learning_rate": 1.272997487664499e-06, "loss": 0.0289, "num_tokens": 171908174.0, "step": 2147 }, { "epoch": 2.678727386150967, "grad_norm": 0.10269224386186213, "learning_rate": 1.2709228351592167e-06, "loss": 0.0261, "num_tokens": 171988943.0, "step": 2148 }, { "epoch": 2.679975046787274, "grad_norm": 0.10927504867119865, "learning_rate": 1.2688558512491032e-06, "loss": 0.028, "num_tokens": 172068842.0, "step": 2149 }, { "epoch": 2.681222707423581, "grad_norm": 0.11683785756855145, "learning_rate": 1.2667965396822257e-06, "loss": 0.0275, "num_tokens": 172148664.0, "step": 2150 }, { "epoch": 2.682470368059888, "grad_norm": 0.12167175622894594, "learning_rate": 1.2647449041927385e-06, "loss": 0.0278, "num_tokens": 172229020.0, "step": 2151 }, { "epoch": 2.6837180286961946, "grad_norm": 0.10888309628100702, "learning_rate": 1.2627009485008754e-06, "loss": 0.0272, "num_tokens": 172309720.0, "step": 2152 }, { "epoch": 2.6849656893325013, "grad_norm": 0.122912436571624, "learning_rate": 1.2606646763129476e-06, "loss": 0.0284, "num_tokens": 172389961.0, "step": 2153 }, { "epoch": 2.6862133499688086, "grad_norm": 0.10857265378270686, "learning_rate": 1.2586360913213315e-06, "loss": 0.0262, "num_tokens": 172471162.0, "step": 2154 }, { "epoch": 2.6874610106051153, "grad_norm": 0.11669471057841353, "learning_rate": 1.256615197204465e-06, "loss": 0.0283, "num_tokens": 172551948.0, "step": 2155 }, { "epoch": 2.6887086712414225, "grad_norm": 0.13329440954951594, "learning_rate": 1.2546019976268403e-06, "loss": 0.0292, "num_tokens": 172632487.0, "step": 2156 }, { "epoch": 2.6899563318777293, "grad_norm": 0.11522808892441781, "learning_rate": 1.2525964962389961e-06, "loss": 0.027, "num_tokens": 172713350.0, "step": 2157 }, { "epoch": 2.691203992514036, "grad_norm": 0.11229194690407834, "learning_rate": 1.250598696677512e-06, "loss": 0.0278, "num_tokens": 172792803.0, "step": 2158 }, { "epoch": 2.692451653150343, "grad_norm": 0.10819150987759615, "learning_rate": 1.2486086025650045e-06, "loss": 0.0269, "num_tokens": 172873683.0, "step": 2159 }, { "epoch": 2.69369931378665, "grad_norm": 0.10976961567370531, "learning_rate": 1.246626217510114e-06, "loss": 0.0273, "num_tokens": 172953708.0, "step": 2160 }, { "epoch": 2.694946974422957, "grad_norm": 0.10593975910980304, "learning_rate": 1.244651545107503e-06, "loss": 0.0275, "num_tokens": 173034948.0, "step": 2161 }, { "epoch": 2.696194635059264, "grad_norm": 0.10752240766626842, "learning_rate": 1.2426845889378516e-06, "loss": 0.0271, "num_tokens": 173114456.0, "step": 2162 }, { "epoch": 2.697442295695571, "grad_norm": 0.11100439809268697, "learning_rate": 1.2407253525678453e-06, "loss": 0.027, "num_tokens": 173193677.0, "step": 2163 }, { "epoch": 2.6986899563318776, "grad_norm": 0.11124936556530347, "learning_rate": 1.2387738395501714e-06, "loss": 0.028, "num_tokens": 173273829.0, "step": 2164 }, { "epoch": 2.699937616968185, "grad_norm": 0.11341877617750057, "learning_rate": 1.236830053423512e-06, "loss": 0.0279, "num_tokens": 173354403.0, "step": 2165 }, { "epoch": 2.7011852776044916, "grad_norm": 0.10432445599121043, "learning_rate": 1.2348939977125412e-06, "loss": 0.0273, "num_tokens": 173434752.0, "step": 2166 }, { "epoch": 2.7024329382407988, "grad_norm": 0.10349647102790291, "learning_rate": 1.2329656759279108e-06, "loss": 0.0265, "num_tokens": 173513891.0, "step": 2167 }, { "epoch": 2.7036805988771055, "grad_norm": 0.10730067378099671, "learning_rate": 1.2310450915662516e-06, "loss": 0.0263, "num_tokens": 173593347.0, "step": 2168 }, { "epoch": 2.7049282595134123, "grad_norm": 0.12358364979067543, "learning_rate": 1.229132248110165e-06, "loss": 0.0289, "num_tokens": 173673697.0, "step": 2169 }, { "epoch": 2.706175920149719, "grad_norm": 0.121969574680686, "learning_rate": 1.2272271490282134e-06, "loss": 0.0273, "num_tokens": 173753524.0, "step": 2170 }, { "epoch": 2.7074235807860263, "grad_norm": 0.10988278388736815, "learning_rate": 1.2253297977749163e-06, "loss": 0.0267, "num_tokens": 173833499.0, "step": 2171 }, { "epoch": 2.708671241422333, "grad_norm": 0.11127655885847154, "learning_rate": 1.2234401977907468e-06, "loss": 0.0276, "num_tokens": 173913319.0, "step": 2172 }, { "epoch": 2.7099189020586403, "grad_norm": 0.11247307459253601, "learning_rate": 1.2215583525021203e-06, "loss": 0.0273, "num_tokens": 173992389.0, "step": 2173 }, { "epoch": 2.711166562694947, "grad_norm": 0.11437743854655531, "learning_rate": 1.2196842653213896e-06, "loss": 0.0294, "num_tokens": 174072025.0, "step": 2174 }, { "epoch": 2.712414223331254, "grad_norm": 0.11260264853485703, "learning_rate": 1.2178179396468428e-06, "loss": 0.0284, "num_tokens": 174152930.0, "step": 2175 }, { "epoch": 2.7136618839675606, "grad_norm": 0.12047852452761347, "learning_rate": 1.215959378862692e-06, "loss": 0.0276, "num_tokens": 174232157.0, "step": 2176 }, { "epoch": 2.714909544603868, "grad_norm": 0.10620348737338733, "learning_rate": 1.2141085863390696e-06, "loss": 0.0267, "num_tokens": 174312869.0, "step": 2177 }, { "epoch": 2.7161572052401746, "grad_norm": 0.10200387743067355, "learning_rate": 1.2122655654320225e-06, "loss": 0.026, "num_tokens": 174391850.0, "step": 2178 }, { "epoch": 2.717404865876482, "grad_norm": 0.11792058541690678, "learning_rate": 1.210430319483504e-06, "loss": 0.0279, "num_tokens": 174472240.0, "step": 2179 }, { "epoch": 2.7186525265127885, "grad_norm": 0.11352708440547712, "learning_rate": 1.2086028518213694e-06, "loss": 0.0273, "num_tokens": 174551750.0, "step": 2180 }, { "epoch": 2.7199001871490953, "grad_norm": 0.11403171270711979, "learning_rate": 1.206783165759371e-06, "loss": 0.0273, "num_tokens": 174631491.0, "step": 2181 }, { "epoch": 2.7211478477854025, "grad_norm": 0.1196489902512615, "learning_rate": 1.204971264597148e-06, "loss": 0.0275, "num_tokens": 174711527.0, "step": 2182 }, { "epoch": 2.7223955084217093, "grad_norm": 0.1081192153362361, "learning_rate": 1.2031671516202263e-06, "loss": 0.0272, "num_tokens": 174790208.0, "step": 2183 }, { "epoch": 2.723643169058016, "grad_norm": 0.11976247939116517, "learning_rate": 1.2013708301000082e-06, "loss": 0.028, "num_tokens": 174870561.0, "step": 2184 }, { "epoch": 2.7248908296943233, "grad_norm": 0.11442096061556735, "learning_rate": 1.199582303293767e-06, "loss": 0.0274, "num_tokens": 174951422.0, "step": 2185 }, { "epoch": 2.72613849033063, "grad_norm": 0.11617050642869951, "learning_rate": 1.1978015744446417e-06, "loss": 0.0271, "num_tokens": 175031463.0, "step": 2186 }, { "epoch": 2.727386150966937, "grad_norm": 0.12293559783309263, "learning_rate": 1.1960286467816331e-06, "loss": 0.0278, "num_tokens": 175112098.0, "step": 2187 }, { "epoch": 2.728633811603244, "grad_norm": 0.11221752709437237, "learning_rate": 1.1942635235195949e-06, "loss": 0.0285, "num_tokens": 175192431.0, "step": 2188 }, { "epoch": 2.729881472239551, "grad_norm": 0.11210559785351049, "learning_rate": 1.1925062078592279e-06, "loss": 0.0267, "num_tokens": 175272604.0, "step": 2189 }, { "epoch": 2.731129132875858, "grad_norm": 0.10832068295079063, "learning_rate": 1.190756702987077e-06, "loss": 0.0269, "num_tokens": 175352453.0, "step": 2190 }, { "epoch": 2.732376793512165, "grad_norm": 0.12338756235671419, "learning_rate": 1.1890150120755244e-06, "loss": 0.0284, "num_tokens": 175432528.0, "step": 2191 }, { "epoch": 2.7336244541484715, "grad_norm": 0.11219089612196873, "learning_rate": 1.1872811382827811e-06, "loss": 0.027, "num_tokens": 175512749.0, "step": 2192 }, { "epoch": 2.7348721147847783, "grad_norm": 0.12417263033167164, "learning_rate": 1.1855550847528849e-06, "loss": 0.0271, "num_tokens": 175593593.0, "step": 2193 }, { "epoch": 2.7361197754210855, "grad_norm": 0.11709495692614103, "learning_rate": 1.1838368546156924e-06, "loss": 0.0266, "num_tokens": 175674330.0, "step": 2194 }, { "epoch": 2.7373674360573923, "grad_norm": 0.11715755751386901, "learning_rate": 1.182126450986874e-06, "loss": 0.028, "num_tokens": 175754689.0, "step": 2195 }, { "epoch": 2.7386150966936995, "grad_norm": 0.11080050252306903, "learning_rate": 1.1804238769679077e-06, "loss": 0.0273, "num_tokens": 175834619.0, "step": 2196 }, { "epoch": 2.7398627573300063, "grad_norm": 0.11676541458720631, "learning_rate": 1.178729135646077e-06, "loss": 0.0269, "num_tokens": 175914942.0, "step": 2197 }, { "epoch": 2.741110417966313, "grad_norm": 0.11457451396067345, "learning_rate": 1.1770422300944586e-06, "loss": 0.028, "num_tokens": 175996002.0, "step": 2198 }, { "epoch": 2.74235807860262, "grad_norm": 0.10756801142995977, "learning_rate": 1.1753631633719217e-06, "loss": 0.0281, "num_tokens": 176076559.0, "step": 2199 }, { "epoch": 2.743605739238927, "grad_norm": 0.12404043322828145, "learning_rate": 1.1736919385231236e-06, "loss": 0.028, "num_tokens": 176156992.0, "step": 2200 }, { "epoch": 2.744853399875234, "grad_norm": 0.10783216704576966, "learning_rate": 1.1720285585784983e-06, "loss": 0.0258, "num_tokens": 176235147.0, "step": 2201 }, { "epoch": 2.746101060511541, "grad_norm": 0.11227645559258542, "learning_rate": 1.1703730265542569e-06, "loss": 0.0275, "num_tokens": 176315166.0, "step": 2202 }, { "epoch": 2.747348721147848, "grad_norm": 0.11576428333299565, "learning_rate": 1.16872534545238e-06, "loss": 0.028, "num_tokens": 176395152.0, "step": 2203 }, { "epoch": 2.7485963817841546, "grad_norm": 0.11841116972911843, "learning_rate": 1.1670855182606106e-06, "loss": 0.0274, "num_tokens": 176476772.0, "step": 2204 }, { "epoch": 2.7498440424204618, "grad_norm": 0.10483183967952564, "learning_rate": 1.1654535479524511e-06, "loss": 0.0269, "num_tokens": 176556474.0, "step": 2205 }, { "epoch": 2.7510917030567685, "grad_norm": 0.11471768456732195, "learning_rate": 1.163829437487158e-06, "loss": 0.0277, "num_tokens": 176637858.0, "step": 2206 }, { "epoch": 2.7523393636930757, "grad_norm": 0.108264123677895, "learning_rate": 1.162213189809734e-06, "loss": 0.0271, "num_tokens": 176717271.0, "step": 2207 }, { "epoch": 2.7535870243293825, "grad_norm": 0.15282329766969122, "learning_rate": 1.1606048078509235e-06, "loss": 0.0346, "num_tokens": 176797686.0, "step": 2208 }, { "epoch": 2.7548346849656893, "grad_norm": 0.11649281003810305, "learning_rate": 1.1590042945272108e-06, "loss": 0.0272, "num_tokens": 176876658.0, "step": 2209 }, { "epoch": 2.756082345601996, "grad_norm": 0.10842683509303165, "learning_rate": 1.1574116527408093e-06, "loss": 0.0267, "num_tokens": 176956269.0, "step": 2210 }, { "epoch": 2.7573300062383033, "grad_norm": 0.11131932466818467, "learning_rate": 1.1558268853796597e-06, "loss": 0.0269, "num_tokens": 177036068.0, "step": 2211 }, { "epoch": 2.75857766687461, "grad_norm": 0.12181987186342921, "learning_rate": 1.1542499953174257e-06, "loss": 0.0282, "num_tokens": 177117826.0, "step": 2212 }, { "epoch": 2.7598253275109172, "grad_norm": 0.11099109738309532, "learning_rate": 1.1526809854134844e-06, "loss": 0.0281, "num_tokens": 177198291.0, "step": 2213 }, { "epoch": 2.761072988147224, "grad_norm": 0.11310340011164033, "learning_rate": 1.151119858512925e-06, "loss": 0.0268, "num_tokens": 177278136.0, "step": 2214 }, { "epoch": 2.762320648783531, "grad_norm": 0.12040084886173383, "learning_rate": 1.149566617446543e-06, "loss": 0.0273, "num_tokens": 177358685.0, "step": 2215 }, { "epoch": 2.7635683094198376, "grad_norm": 0.11023952574462588, "learning_rate": 1.1480212650308337e-06, "loss": 0.0277, "num_tokens": 177439052.0, "step": 2216 }, { "epoch": 2.7648159700561448, "grad_norm": 0.10245924815558206, "learning_rate": 1.1464838040679876e-06, "loss": 0.0265, "num_tokens": 177518673.0, "step": 2217 }, { "epoch": 2.7660636306924515, "grad_norm": 0.11896518323741509, "learning_rate": 1.1449542373458867e-06, "loss": 0.028, "num_tokens": 177599813.0, "step": 2218 }, { "epoch": 2.7673112913287587, "grad_norm": 0.11721840542278891, "learning_rate": 1.1434325676380983e-06, "loss": 0.0275, "num_tokens": 177680047.0, "step": 2219 }, { "epoch": 2.7685589519650655, "grad_norm": 0.11242112554545239, "learning_rate": 1.141918797703868e-06, "loss": 0.0273, "num_tokens": 177759176.0, "step": 2220 }, { "epoch": 2.7698066126013723, "grad_norm": 0.12213920330413136, "learning_rate": 1.1404129302881193e-06, "loss": 0.0276, "num_tokens": 177840002.0, "step": 2221 }, { "epoch": 2.7710542732376795, "grad_norm": 0.11684412169186235, "learning_rate": 1.1389149681214456e-06, "loss": 0.0285, "num_tokens": 177920088.0, "step": 2222 }, { "epoch": 2.7723019338739863, "grad_norm": 0.1119053015497875, "learning_rate": 1.1374249139201035e-06, "loss": 0.0274, "num_tokens": 177999727.0, "step": 2223 }, { "epoch": 2.773549594510293, "grad_norm": 0.11111524937162807, "learning_rate": 1.135942770386013e-06, "loss": 0.0268, "num_tokens": 178078737.0, "step": 2224 }, { "epoch": 2.7747972551466002, "grad_norm": 0.10700815561780962, "learning_rate": 1.1344685402067475e-06, "loss": 0.0269, "num_tokens": 178157602.0, "step": 2225 }, { "epoch": 2.776044915782907, "grad_norm": 0.12352603858436706, "learning_rate": 1.1330022260555321e-06, "loss": 0.0284, "num_tokens": 178238153.0, "step": 2226 }, { "epoch": 2.777292576419214, "grad_norm": 0.10878032533922675, "learning_rate": 1.1315438305912377e-06, "loss": 0.0265, "num_tokens": 178317857.0, "step": 2227 }, { "epoch": 2.778540237055521, "grad_norm": 0.11929846361121192, "learning_rate": 1.1300933564583764e-06, "loss": 0.0275, "num_tokens": 178398144.0, "step": 2228 }, { "epoch": 2.7797878976918278, "grad_norm": 0.11575836382461366, "learning_rate": 1.1286508062870952e-06, "loss": 0.0277, "num_tokens": 178477765.0, "step": 2229 }, { "epoch": 2.781035558328135, "grad_norm": 0.10742117792303336, "learning_rate": 1.1272161826931745e-06, "loss": 0.0265, "num_tokens": 178558431.0, "step": 2230 }, { "epoch": 2.7822832189644418, "grad_norm": 0.11418872199176901, "learning_rate": 1.1257894882780206e-06, "loss": 0.0283, "num_tokens": 178639242.0, "step": 2231 }, { "epoch": 2.7835308796007485, "grad_norm": 0.10723962416090788, "learning_rate": 1.1243707256286606e-06, "loss": 0.0271, "num_tokens": 178719072.0, "step": 2232 }, { "epoch": 2.7847785402370553, "grad_norm": 0.10881136487138986, "learning_rate": 1.1229598973177407e-06, "loss": 0.0272, "num_tokens": 178797551.0, "step": 2233 }, { "epoch": 2.7860262008733625, "grad_norm": 0.1033276997183416, "learning_rate": 1.1215570059035199e-06, "loss": 0.0261, "num_tokens": 178876643.0, "step": 2234 }, { "epoch": 2.7872738615096693, "grad_norm": 0.11221713077894142, "learning_rate": 1.1201620539298636e-06, "loss": 0.0277, "num_tokens": 178956890.0, "step": 2235 }, { "epoch": 2.7885215221459765, "grad_norm": 0.11497597690368146, "learning_rate": 1.1187750439262405e-06, "loss": 0.0274, "num_tokens": 179036190.0, "step": 2236 }, { "epoch": 2.7897691827822833, "grad_norm": 0.11685097421442404, "learning_rate": 1.1173959784077207e-06, "loss": 0.0292, "num_tokens": 179117241.0, "step": 2237 }, { "epoch": 2.79101684341859, "grad_norm": 0.11901364215641137, "learning_rate": 1.1160248598749652e-06, "loss": 0.0286, "num_tokens": 179197576.0, "step": 2238 }, { "epoch": 2.7922645040548972, "grad_norm": 0.10306800380453014, "learning_rate": 1.114661690814227e-06, "loss": 0.0266, "num_tokens": 179276756.0, "step": 2239 }, { "epoch": 2.793512164691204, "grad_norm": 0.11195164030232999, "learning_rate": 1.1133064736973443e-06, "loss": 0.0265, "num_tokens": 179357775.0, "step": 2240 }, { "epoch": 2.7947598253275108, "grad_norm": 0.11752175021287195, "learning_rate": 1.1119592109817346e-06, "loss": 0.0275, "num_tokens": 179438867.0, "step": 2241 }, { "epoch": 2.796007485963818, "grad_norm": 0.1140881376229996, "learning_rate": 1.1106199051103922e-06, "loss": 0.0271, "num_tokens": 179518359.0, "step": 2242 }, { "epoch": 2.7972551466001248, "grad_norm": 0.11251686289836357, "learning_rate": 1.109288558511884e-06, "loss": 0.0268, "num_tokens": 179597988.0, "step": 2243 }, { "epoch": 2.7985028072364315, "grad_norm": 0.10788731583211555, "learning_rate": 1.1079651736003441e-06, "loss": 0.0273, "num_tokens": 179678144.0, "step": 2244 }, { "epoch": 2.7997504678727387, "grad_norm": 0.11520086050466517, "learning_rate": 1.106649752775468e-06, "loss": 0.0267, "num_tokens": 179758675.0, "step": 2245 }, { "epoch": 2.8009981285090455, "grad_norm": 0.12597826940633167, "learning_rate": 1.1053422984225127e-06, "loss": 0.027, "num_tokens": 179839093.0, "step": 2246 }, { "epoch": 2.8022457891453527, "grad_norm": 0.11239946957555676, "learning_rate": 1.1040428129122873e-06, "loss": 0.0265, "num_tokens": 179920234.0, "step": 2247 }, { "epoch": 2.8034934497816595, "grad_norm": 0.11485169689579346, "learning_rate": 1.102751298601152e-06, "loss": 0.0272, "num_tokens": 179999475.0, "step": 2248 }, { "epoch": 2.8047411104179663, "grad_norm": 0.11199877474314972, "learning_rate": 1.1014677578310128e-06, "loss": 0.0277, "num_tokens": 180078857.0, "step": 2249 }, { "epoch": 2.805988771054273, "grad_norm": 0.11085777355981787, "learning_rate": 1.1001921929293172e-06, "loss": 0.0281, "num_tokens": 180157620.0, "step": 2250 }, { "epoch": 2.8072364316905802, "grad_norm": 0.11095268337610681, "learning_rate": 1.0989246062090495e-06, "loss": 0.0269, "num_tokens": 180237202.0, "step": 2251 }, { "epoch": 2.808484092326887, "grad_norm": 0.1314904657457488, "learning_rate": 1.0976649999687282e-06, "loss": 0.0273, "num_tokens": 180316592.0, "step": 2252 }, { "epoch": 2.809731752963194, "grad_norm": 0.10715724234388715, "learning_rate": 1.096413376492399e-06, "loss": 0.0272, "num_tokens": 180396786.0, "step": 2253 }, { "epoch": 2.810979413599501, "grad_norm": 0.11446876586604864, "learning_rate": 1.0951697380496343e-06, "loss": 0.0267, "num_tokens": 180477208.0, "step": 2254 }, { "epoch": 2.8122270742358078, "grad_norm": 0.10000594303521496, "learning_rate": 1.093934086895526e-06, "loss": 0.0262, "num_tokens": 180557503.0, "step": 2255 }, { "epoch": 2.8134747348721145, "grad_norm": 0.11468231454145983, "learning_rate": 1.0927064252706845e-06, "loss": 0.0264, "num_tokens": 180636993.0, "step": 2256 }, { "epoch": 2.8147223955084217, "grad_norm": 0.11326974384224862, "learning_rate": 1.0914867554012297e-06, "loss": 0.028, "num_tokens": 180717357.0, "step": 2257 }, { "epoch": 2.8159700561447285, "grad_norm": 0.12011090419081122, "learning_rate": 1.090275079498793e-06, "loss": 0.0286, "num_tokens": 180796981.0, "step": 2258 }, { "epoch": 2.8172177167810357, "grad_norm": 0.11208373588605422, "learning_rate": 1.0890713997605085e-06, "loss": 0.0276, "num_tokens": 180876805.0, "step": 2259 }, { "epoch": 2.8184653774173425, "grad_norm": 0.11929554034691514, "learning_rate": 1.0878757183690112e-06, "loss": 0.0276, "num_tokens": 180956790.0, "step": 2260 }, { "epoch": 2.8197130380536493, "grad_norm": 0.10883696590219329, "learning_rate": 1.086688037492433e-06, "loss": 0.027, "num_tokens": 181037554.0, "step": 2261 }, { "epoch": 2.8209606986899565, "grad_norm": 0.11656014791821229, "learning_rate": 1.0855083592843985e-06, "loss": 0.028, "num_tokens": 181117563.0, "step": 2262 }, { "epoch": 2.8222083593262632, "grad_norm": 0.10887563809750479, "learning_rate": 1.0843366858840209e-06, "loss": 0.0269, "num_tokens": 181197130.0, "step": 2263 }, { "epoch": 2.8234560199625705, "grad_norm": 0.10788088292257987, "learning_rate": 1.0831730194158982e-06, "loss": 0.0266, "num_tokens": 181276375.0, "step": 2264 }, { "epoch": 2.824703680598877, "grad_norm": 0.10930839605540808, "learning_rate": 1.0820173619901093e-06, "loss": 0.0271, "num_tokens": 181356017.0, "step": 2265 }, { "epoch": 2.825951341235184, "grad_norm": 0.12259984441092837, "learning_rate": 1.08086971570221e-06, "loss": 0.0292, "num_tokens": 181436275.0, "step": 2266 }, { "epoch": 2.8271990018714908, "grad_norm": 0.10639541929699158, "learning_rate": 1.0797300826332307e-06, "loss": 0.0268, "num_tokens": 181516434.0, "step": 2267 }, { "epoch": 2.828446662507798, "grad_norm": 0.12324754295042131, "learning_rate": 1.07859846484967e-06, "loss": 0.028, "num_tokens": 181597476.0, "step": 2268 }, { "epoch": 2.8296943231441047, "grad_norm": 0.10801092400541287, "learning_rate": 1.0774748644034936e-06, "loss": 0.0271, "num_tokens": 181677449.0, "step": 2269 }, { "epoch": 2.830941983780412, "grad_norm": 0.11429743139755069, "learning_rate": 1.0763592833321277e-06, "loss": 0.0269, "num_tokens": 181757429.0, "step": 2270 }, { "epoch": 2.8321896444167187, "grad_norm": 0.11295710267647807, "learning_rate": 1.0752517236584595e-06, "loss": 0.027, "num_tokens": 181836252.0, "step": 2271 }, { "epoch": 2.8334373050530255, "grad_norm": 0.11139902033265346, "learning_rate": 1.0741521873908283e-06, "loss": 0.0268, "num_tokens": 181916218.0, "step": 2272 }, { "epoch": 2.8346849656893323, "grad_norm": 0.1054002144624115, "learning_rate": 1.0730606765230257e-06, "loss": 0.0269, "num_tokens": 181995689.0, "step": 2273 }, { "epoch": 2.8359326263256395, "grad_norm": 0.10302028960792996, "learning_rate": 1.0719771930342913e-06, "loss": 0.0261, "num_tokens": 182075038.0, "step": 2274 }, { "epoch": 2.8371802869619462, "grad_norm": 0.11798628087878685, "learning_rate": 1.0709017388893075e-06, "loss": 0.0283, "num_tokens": 182154597.0, "step": 2275 }, { "epoch": 2.8384279475982535, "grad_norm": 0.11276374561736073, "learning_rate": 1.0698343160381987e-06, "loss": 0.0266, "num_tokens": 182233887.0, "step": 2276 }, { "epoch": 2.8396756082345602, "grad_norm": 0.1174576965951362, "learning_rate": 1.0687749264165248e-06, "loss": 0.028, "num_tokens": 182314132.0, "step": 2277 }, { "epoch": 2.840923268870867, "grad_norm": 0.10814129453586245, "learning_rate": 1.067723571945279e-06, "loss": 0.0261, "num_tokens": 182394712.0, "step": 2278 }, { "epoch": 2.842170929507174, "grad_norm": 0.11723270887984091, "learning_rate": 1.0666802545308847e-06, "loss": 0.0281, "num_tokens": 182476047.0, "step": 2279 }, { "epoch": 2.843418590143481, "grad_norm": 0.11081835835199184, "learning_rate": 1.065644976065193e-06, "loss": 0.0279, "num_tokens": 182554762.0, "step": 2280 }, { "epoch": 2.8446662507797877, "grad_norm": 0.10738520261965014, "learning_rate": 1.0646177384254747e-06, "loss": 0.027, "num_tokens": 182633545.0, "step": 2281 }, { "epoch": 2.845913911416095, "grad_norm": 0.10253227627415959, "learning_rate": 1.063598543474423e-06, "loss": 0.0263, "num_tokens": 182711682.0, "step": 2282 }, { "epoch": 2.8471615720524017, "grad_norm": 0.11632474054875933, "learning_rate": 1.062587393060147e-06, "loss": 0.0277, "num_tokens": 182791996.0, "step": 2283 }, { "epoch": 2.8484092326887085, "grad_norm": 0.11239301355373442, "learning_rate": 1.0615842890161675e-06, "loss": 0.0267, "num_tokens": 182871570.0, "step": 2284 }, { "epoch": 2.8496568933250157, "grad_norm": 0.1260833333963082, "learning_rate": 1.0605892331614158e-06, "loss": 0.0305, "num_tokens": 182951531.0, "step": 2285 }, { "epoch": 2.8509045539613225, "grad_norm": 0.10700130560287861, "learning_rate": 1.0596022273002282e-06, "loss": 0.0268, "num_tokens": 183030679.0, "step": 2286 }, { "epoch": 2.8521522145976297, "grad_norm": 0.11364031790238421, "learning_rate": 1.0586232732223446e-06, "loss": 0.0271, "num_tokens": 183112076.0, "step": 2287 }, { "epoch": 2.8533998752339365, "grad_norm": 0.10879548973912555, "learning_rate": 1.0576523727029053e-06, "loss": 0.0273, "num_tokens": 183192328.0, "step": 2288 }, { "epoch": 2.8546475358702432, "grad_norm": 0.11352263022080847, "learning_rate": 1.0566895275024458e-06, "loss": 0.0297, "num_tokens": 183271794.0, "step": 2289 }, { "epoch": 2.85589519650655, "grad_norm": 0.11082853865055407, "learning_rate": 1.0557347393668966e-06, "loss": 0.027, "num_tokens": 183350787.0, "step": 2290 }, { "epoch": 2.857142857142857, "grad_norm": 0.11456889825294071, "learning_rate": 1.0547880100275755e-06, "loss": 0.0275, "num_tokens": 183431435.0, "step": 2291 }, { "epoch": 2.858390517779164, "grad_norm": 0.11016237704435282, "learning_rate": 1.0538493412011901e-06, "loss": 0.0267, "num_tokens": 183510859.0, "step": 2292 }, { "epoch": 2.859638178415471, "grad_norm": 0.11417004760246512, "learning_rate": 1.0529187345898304e-06, "loss": 0.0277, "num_tokens": 183591361.0, "step": 2293 }, { "epoch": 2.860885839051778, "grad_norm": 0.11122299199182507, "learning_rate": 1.0519961918809675e-06, "loss": 0.0271, "num_tokens": 183671105.0, "step": 2294 }, { "epoch": 2.8621334996880847, "grad_norm": 0.11426790938884109, "learning_rate": 1.05108171474745e-06, "loss": 0.0275, "num_tokens": 183751497.0, "step": 2295 }, { "epoch": 2.8633811603243915, "grad_norm": 0.1132178603222439, "learning_rate": 1.050175304847502e-06, "loss": 0.0276, "num_tokens": 183831206.0, "step": 2296 }, { "epoch": 2.8646288209606987, "grad_norm": 0.11247491256932537, "learning_rate": 1.0492769638247177e-06, "loss": 0.0278, "num_tokens": 183912383.0, "step": 2297 }, { "epoch": 2.8658764815970055, "grad_norm": 0.10746653770671626, "learning_rate": 1.0483866933080611e-06, "loss": 0.0265, "num_tokens": 183992514.0, "step": 2298 }, { "epoch": 2.8671241422333127, "grad_norm": 0.11160208763303599, "learning_rate": 1.0475044949118624e-06, "loss": 0.0276, "num_tokens": 184072119.0, "step": 2299 }, { "epoch": 2.8683718028696195, "grad_norm": 0.11850309053487157, "learning_rate": 1.0466303702358139e-06, "loss": 0.0279, "num_tokens": 184155215.0, "step": 2300 }, { "epoch": 2.8696194635059262, "grad_norm": 0.12631852786380027, "learning_rate": 1.0457643208649665e-06, "loss": 0.0288, "num_tokens": 184235940.0, "step": 2301 }, { "epoch": 2.8708671241422334, "grad_norm": 0.11138287151881872, "learning_rate": 1.044906348369731e-06, "loss": 0.0276, "num_tokens": 184316322.0, "step": 2302 }, { "epoch": 2.87211478477854, "grad_norm": 0.10931209504132226, "learning_rate": 1.0440564543058703e-06, "loss": 0.0268, "num_tokens": 184396446.0, "step": 2303 }, { "epoch": 2.8733624454148474, "grad_norm": 0.11012682455226674, "learning_rate": 1.0432146402144986e-06, "loss": 0.0278, "num_tokens": 184475613.0, "step": 2304 }, { "epoch": 2.874610106051154, "grad_norm": 0.10769527893932587, "learning_rate": 1.0423809076220805e-06, "loss": 0.0269, "num_tokens": 184555834.0, "step": 2305 }, { "epoch": 2.875857766687461, "grad_norm": 0.12501985655306355, "learning_rate": 1.041555258040425e-06, "loss": 0.0289, "num_tokens": 184637311.0, "step": 2306 }, { "epoch": 2.8771054273237677, "grad_norm": 0.10611283848990914, "learning_rate": 1.0407376929666833e-06, "loss": 0.0272, "num_tokens": 184717540.0, "step": 2307 }, { "epoch": 2.878353087960075, "grad_norm": 0.11684908656337946, "learning_rate": 1.0399282138833488e-06, "loss": 0.0275, "num_tokens": 184796663.0, "step": 2308 }, { "epoch": 2.8796007485963817, "grad_norm": 0.12163581971776054, "learning_rate": 1.039126822258252e-06, "loss": 0.0277, "num_tokens": 184877517.0, "step": 2309 }, { "epoch": 2.880848409232689, "grad_norm": 0.11561664029854038, "learning_rate": 1.0383335195445573e-06, "loss": 0.0271, "num_tokens": 184956700.0, "step": 2310 }, { "epoch": 2.8820960698689957, "grad_norm": 0.11377476242109719, "learning_rate": 1.0375483071807626e-06, "loss": 0.0277, "num_tokens": 185036719.0, "step": 2311 }, { "epoch": 2.8833437305053025, "grad_norm": 0.10571550440670317, "learning_rate": 1.036771186590696e-06, "loss": 0.0268, "num_tokens": 185116185.0, "step": 2312 }, { "epoch": 2.8845913911416092, "grad_norm": 0.11078801585715989, "learning_rate": 1.0360021591835108e-06, "loss": 0.0274, "num_tokens": 185195865.0, "step": 2313 }, { "epoch": 2.8858390517779164, "grad_norm": 0.10587436172290575, "learning_rate": 1.0352412263536868e-06, "loss": 0.0258, "num_tokens": 185275227.0, "step": 2314 }, { "epoch": 2.887086712414223, "grad_norm": 0.10645437206340462, "learning_rate": 1.0344883894810257e-06, "loss": 0.0274, "num_tokens": 185354901.0, "step": 2315 }, { "epoch": 2.8883343730505304, "grad_norm": 0.11587554997670732, "learning_rate": 1.033743649930647e-06, "loss": 0.0263, "num_tokens": 185433853.0, "step": 2316 }, { "epoch": 2.889582033686837, "grad_norm": 0.11506509844410158, "learning_rate": 1.03300700905299e-06, "loss": 0.0274, "num_tokens": 185514489.0, "step": 2317 }, { "epoch": 2.890829694323144, "grad_norm": 0.10249391579856434, "learning_rate": 1.0322784681838062e-06, "loss": 0.0266, "num_tokens": 185595544.0, "step": 2318 }, { "epoch": 2.892077354959451, "grad_norm": 0.1244108209218656, "learning_rate": 1.0315580286441616e-06, "loss": 0.027, "num_tokens": 185675597.0, "step": 2319 }, { "epoch": 2.893325015595758, "grad_norm": 0.11187955851184811, "learning_rate": 1.0308456917404294e-06, "loss": 0.0269, "num_tokens": 185755490.0, "step": 2320 }, { "epoch": 2.8945726762320647, "grad_norm": 0.1118035236277393, "learning_rate": 1.0301414587642926e-06, "loss": 0.0267, "num_tokens": 185833837.0, "step": 2321 }, { "epoch": 2.895820336868372, "grad_norm": 0.11431007300598617, "learning_rate": 1.029445330992738e-06, "loss": 0.0285, "num_tokens": 185915064.0, "step": 2322 }, { "epoch": 2.8970679975046787, "grad_norm": 0.10462947510709711, "learning_rate": 1.0287573096880566e-06, "loss": 0.0267, "num_tokens": 185995076.0, "step": 2323 }, { "epoch": 2.8983156581409855, "grad_norm": 0.11570282963133109, "learning_rate": 1.028077396097838e-06, "loss": 0.0269, "num_tokens": 186074872.0, "step": 2324 }, { "epoch": 2.8995633187772927, "grad_norm": 0.12626629625181418, "learning_rate": 1.0274055914549708e-06, "loss": 0.0281, "num_tokens": 186156148.0, "step": 2325 }, { "epoch": 2.9008109794135994, "grad_norm": 0.11032478054432032, "learning_rate": 1.0267418969776405e-06, "loss": 0.0264, "num_tokens": 186236795.0, "step": 2326 }, { "epoch": 2.9020586400499067, "grad_norm": 0.108031229594549, "learning_rate": 1.0260863138693264e-06, "loss": 0.0282, "num_tokens": 186316351.0, "step": 2327 }, { "epoch": 2.9033063006862134, "grad_norm": 0.11591386223810314, "learning_rate": 1.0254388433187975e-06, "loss": 0.0279, "num_tokens": 186396106.0, "step": 2328 }, { "epoch": 2.90455396132252, "grad_norm": 0.10622785774173073, "learning_rate": 1.0247994865001147e-06, "loss": 0.0259, "num_tokens": 186475119.0, "step": 2329 }, { "epoch": 2.905801621958827, "grad_norm": 0.12280744075841177, "learning_rate": 1.0241682445726246e-06, "loss": 0.0279, "num_tokens": 186556403.0, "step": 2330 }, { "epoch": 2.907049282595134, "grad_norm": 0.12850490414436164, "learning_rate": 1.0235451186809596e-06, "loss": 0.0281, "num_tokens": 186636091.0, "step": 2331 }, { "epoch": 2.908296943231441, "grad_norm": 0.12129199138278511, "learning_rate": 1.0229301099550352e-06, "loss": 0.0263, "num_tokens": 186718680.0, "step": 2332 }, { "epoch": 2.909544603867748, "grad_norm": 0.10134475681869966, "learning_rate": 1.0223232195100485e-06, "loss": 0.0267, "num_tokens": 186797276.0, "step": 2333 }, { "epoch": 2.910792264504055, "grad_norm": 0.10862663209574905, "learning_rate": 1.0217244484464758e-06, "loss": 0.0269, "num_tokens": 186877678.0, "step": 2334 }, { "epoch": 2.9120399251403617, "grad_norm": 0.10595868824858773, "learning_rate": 1.0211337978500687e-06, "loss": 0.0269, "num_tokens": 186956753.0, "step": 2335 }, { "epoch": 2.913287585776669, "grad_norm": 0.11109899490724645, "learning_rate": 1.0205512687918558e-06, "loss": 0.0275, "num_tokens": 187037968.0, "step": 2336 }, { "epoch": 2.9145352464129757, "grad_norm": 0.10985537743814915, "learning_rate": 1.0199768623281388e-06, "loss": 0.028, "num_tokens": 187117786.0, "step": 2337 }, { "epoch": 2.9157829070492824, "grad_norm": 0.11212443137177852, "learning_rate": 1.0194105795004896e-06, "loss": 0.0273, "num_tokens": 187197753.0, "step": 2338 }, { "epoch": 2.9170305676855897, "grad_norm": 0.11357099125731306, "learning_rate": 1.0188524213357507e-06, "loss": 0.027, "num_tokens": 187278814.0, "step": 2339 }, { "epoch": 2.9182782283218964, "grad_norm": 0.11863761901671414, "learning_rate": 1.0183023888460312e-06, "loss": 0.0278, "num_tokens": 187359939.0, "step": 2340 }, { "epoch": 2.919525888958203, "grad_norm": 0.11740995441233278, "learning_rate": 1.017760483028706e-06, "loss": 0.0276, "num_tokens": 187439262.0, "step": 2341 }, { "epoch": 2.9207735495945104, "grad_norm": 0.10961685956776607, "learning_rate": 1.017226704866415e-06, "loss": 0.0273, "num_tokens": 187519426.0, "step": 2342 }, { "epoch": 2.922021210230817, "grad_norm": 0.13193569549980025, "learning_rate": 1.0167010553270588e-06, "loss": 0.0275, "num_tokens": 187599140.0, "step": 2343 }, { "epoch": 2.9232688708671244, "grad_norm": 0.1090653954416858, "learning_rate": 1.016183535363799e-06, "loss": 0.0263, "num_tokens": 187679734.0, "step": 2344 }, { "epoch": 2.924516531503431, "grad_norm": 0.11264135405112981, "learning_rate": 1.0156741459150556e-06, "loss": 0.0271, "num_tokens": 187760016.0, "step": 2345 }, { "epoch": 2.925764192139738, "grad_norm": 0.1127907828444805, "learning_rate": 1.0151728879045057e-06, "loss": 0.0273, "num_tokens": 187840030.0, "step": 2346 }, { "epoch": 2.9270118527760447, "grad_norm": 0.11654407524405933, "learning_rate": 1.0146797622410813e-06, "loss": 0.0266, "num_tokens": 187918803.0, "step": 2347 }, { "epoch": 2.928259513412352, "grad_norm": 0.11309647290317737, "learning_rate": 1.0141947698189684e-06, "loss": 0.0272, "num_tokens": 187998146.0, "step": 2348 }, { "epoch": 2.9295071740486587, "grad_norm": 0.11280706058466917, "learning_rate": 1.0137179115176055e-06, "loss": 0.0265, "num_tokens": 188077340.0, "step": 2349 }, { "epoch": 2.930754834684966, "grad_norm": 0.13076601205501484, "learning_rate": 1.0132491882016805e-06, "loss": 0.0262, "num_tokens": 188157146.0, "step": 2350 }, { "epoch": 2.9320024953212727, "grad_norm": 0.11012339063784357, "learning_rate": 1.0127886007211298e-06, "loss": 0.0275, "num_tokens": 188237197.0, "step": 2351 }, { "epoch": 2.9332501559575794, "grad_norm": 0.11371206516952323, "learning_rate": 1.0123361499111383e-06, "loss": 0.0272, "num_tokens": 188316620.0, "step": 2352 }, { "epoch": 2.934497816593886, "grad_norm": 0.10964761702423996, "learning_rate": 1.011891836592136e-06, "loss": 0.0278, "num_tokens": 188397001.0, "step": 2353 }, { "epoch": 2.9357454772301934, "grad_norm": 0.1117067743384703, "learning_rate": 1.0114556615697971e-06, "loss": 0.027, "num_tokens": 188476537.0, "step": 2354 }, { "epoch": 2.9369931378665, "grad_norm": 0.10532925727660913, "learning_rate": 1.0110276256350393e-06, "loss": 0.0265, "num_tokens": 188554894.0, "step": 2355 }, { "epoch": 2.9382407985028074, "grad_norm": 0.11464738528815109, "learning_rate": 1.010607729564021e-06, "loss": 0.0273, "num_tokens": 188635007.0, "step": 2356 }, { "epoch": 2.939488459139114, "grad_norm": 0.11411346562493145, "learning_rate": 1.0101959741181396e-06, "loss": 0.0273, "num_tokens": 188714832.0, "step": 2357 }, { "epoch": 2.940736119775421, "grad_norm": 0.11297091154604039, "learning_rate": 1.0097923600440335e-06, "loss": 0.0266, "num_tokens": 188794100.0, "step": 2358 }, { "epoch": 2.941983780411728, "grad_norm": 0.11419809264174352, "learning_rate": 1.0093968880735762e-06, "loss": 0.0277, "num_tokens": 188875432.0, "step": 2359 }, { "epoch": 2.943231441048035, "grad_norm": 0.10794783898522096, "learning_rate": 1.009009558923878e-06, "loss": 0.027, "num_tokens": 188954916.0, "step": 2360 }, { "epoch": 2.944479101684342, "grad_norm": 0.10544977589684877, "learning_rate": 1.0086303732972843e-06, "loss": 0.0266, "num_tokens": 189035103.0, "step": 2361 }, { "epoch": 2.945726762320649, "grad_norm": 0.12035892909369736, "learning_rate": 1.0082593318813728e-06, "loss": 0.027, "num_tokens": 189114556.0, "step": 2362 }, { "epoch": 2.9469744229569557, "grad_norm": 0.11275061926774883, "learning_rate": 1.0078964353489536e-06, "loss": 0.0268, "num_tokens": 189194415.0, "step": 2363 }, { "epoch": 2.9482220835932624, "grad_norm": 0.11213359019218412, "learning_rate": 1.0075416843580687e-06, "loss": 0.0272, "num_tokens": 189275904.0, "step": 2364 }, { "epoch": 2.9494697442295696, "grad_norm": 0.11713393130799933, "learning_rate": 1.0071950795519873e-06, "loss": 0.0279, "num_tokens": 189355944.0, "step": 2365 }, { "epoch": 2.9507174048658764, "grad_norm": 0.10806295055772125, "learning_rate": 1.00685662155921e-06, "loss": 0.0274, "num_tokens": 189436064.0, "step": 2366 }, { "epoch": 2.9519650655021836, "grad_norm": 0.11788280963029622, "learning_rate": 1.0065263109934633e-06, "loss": 0.0277, "num_tokens": 189516906.0, "step": 2367 }, { "epoch": 2.9532127261384904, "grad_norm": 0.11439835587046002, "learning_rate": 1.0062041484536994e-06, "loss": 0.0292, "num_tokens": 189597299.0, "step": 2368 }, { "epoch": 2.954460386774797, "grad_norm": 0.11618035546784151, "learning_rate": 1.0058901345240967e-06, "loss": 0.0274, "num_tokens": 189677346.0, "step": 2369 }, { "epoch": 2.955708047411104, "grad_norm": 0.10998886233872446, "learning_rate": 1.0055842697740576e-06, "loss": 0.0268, "num_tokens": 189756509.0, "step": 2370 }, { "epoch": 2.956955708047411, "grad_norm": 0.1102297346203138, "learning_rate": 1.0052865547582074e-06, "loss": 0.0273, "num_tokens": 189837787.0, "step": 2371 }, { "epoch": 2.958203368683718, "grad_norm": 0.11011616094968564, "learning_rate": 1.004996990016393e-06, "loss": 0.0278, "num_tokens": 189916820.0, "step": 2372 }, { "epoch": 2.959451029320025, "grad_norm": 0.1058190186745868, "learning_rate": 1.0047155760736828e-06, "loss": 0.0263, "num_tokens": 189996448.0, "step": 2373 }, { "epoch": 2.960698689956332, "grad_norm": 0.09902032964536187, "learning_rate": 1.004442313440366e-06, "loss": 0.0261, "num_tokens": 190075935.0, "step": 2374 }, { "epoch": 2.9619463505926387, "grad_norm": 0.12154175766740306, "learning_rate": 1.0041772026119493e-06, "loss": 0.027, "num_tokens": 190156058.0, "step": 2375 }, { "epoch": 2.963194011228946, "grad_norm": 0.11633889607609949, "learning_rate": 1.0039202440691598e-06, "loss": 0.0271, "num_tokens": 190236260.0, "step": 2376 }, { "epoch": 2.9644416718652526, "grad_norm": 0.10715021652513443, "learning_rate": 1.0036714382779405e-06, "loss": 0.0263, "num_tokens": 190315801.0, "step": 2377 }, { "epoch": 2.9656893325015594, "grad_norm": 0.11833748571681439, "learning_rate": 1.0034307856894511e-06, "loss": 0.0268, "num_tokens": 190394686.0, "step": 2378 }, { "epoch": 2.9669369931378666, "grad_norm": 0.11390670772540895, "learning_rate": 1.0031982867400683e-06, "loss": 0.0274, "num_tokens": 190474200.0, "step": 2379 }, { "epoch": 2.9681846537741734, "grad_norm": 0.09835116296798709, "learning_rate": 1.0029739418513825e-06, "loss": 0.0262, "num_tokens": 190553083.0, "step": 2380 }, { "epoch": 2.96943231441048, "grad_norm": 0.10781406391922578, "learning_rate": 1.0027577514301988e-06, "loss": 0.0264, "num_tokens": 190632255.0, "step": 2381 }, { "epoch": 2.9706799750467874, "grad_norm": 0.10062575360038996, "learning_rate": 1.002549715868536e-06, "loss": 0.0262, "num_tokens": 190712052.0, "step": 2382 }, { "epoch": 2.971927635683094, "grad_norm": 0.10840533051873137, "learning_rate": 1.0023498355436255e-06, "loss": 0.0269, "num_tokens": 190791575.0, "step": 2383 }, { "epoch": 2.9731752963194014, "grad_norm": 0.10941923321723614, "learning_rate": 1.0021581108179105e-06, "loss": 0.026, "num_tokens": 190870712.0, "step": 2384 }, { "epoch": 2.974422956955708, "grad_norm": 0.10761702827903147, "learning_rate": 1.0019745420390455e-06, "loss": 0.027, "num_tokens": 190951038.0, "step": 2385 }, { "epoch": 2.975670617592015, "grad_norm": 0.11368723139537162, "learning_rate": 1.001799129539897e-06, "loss": 0.0276, "num_tokens": 191030954.0, "step": 2386 }, { "epoch": 2.9769182782283217, "grad_norm": 0.10855427538159527, "learning_rate": 1.0016318736385406e-06, "loss": 0.0268, "num_tokens": 191110413.0, "step": 2387 }, { "epoch": 2.978165938864629, "grad_norm": 0.1068843734320981, "learning_rate": 1.0014727746382615e-06, "loss": 0.0259, "num_tokens": 191189445.0, "step": 2388 }, { "epoch": 2.9794135995009356, "grad_norm": 0.1257994774418835, "learning_rate": 1.0013218328275544e-06, "loss": 0.0279, "num_tokens": 191270715.0, "step": 2389 }, { "epoch": 2.980661260137243, "grad_norm": 0.11714566321070789, "learning_rate": 1.0011790484801231e-06, "loss": 0.0281, "num_tokens": 191350789.0, "step": 2390 }, { "epoch": 2.9819089207735496, "grad_norm": 0.11060106527704666, "learning_rate": 1.0010444218548777e-06, "loss": 0.0272, "num_tokens": 191432051.0, "step": 2391 }, { "epoch": 2.9831565814098564, "grad_norm": 0.11345865119840931, "learning_rate": 1.0009179531959374e-06, "loss": 0.0274, "num_tokens": 191514006.0, "step": 2392 }, { "epoch": 2.984404242046163, "grad_norm": 0.11834001788647562, "learning_rate": 1.0007996427326282e-06, "loss": 0.0274, "num_tokens": 191595045.0, "step": 2393 }, { "epoch": 2.9856519026824704, "grad_norm": 0.11736078471554756, "learning_rate": 1.0006894906794828e-06, "loss": 0.0281, "num_tokens": 191674983.0, "step": 2394 }, { "epoch": 2.986899563318777, "grad_norm": 0.10662448160761655, "learning_rate": 1.0005874972362403e-06, "loss": 0.0271, "num_tokens": 191754802.0, "step": 2395 }, { "epoch": 2.9881472239550844, "grad_norm": 0.10732557554988709, "learning_rate": 1.000493662587845e-06, "loss": 0.0265, "num_tokens": 191833832.0, "step": 2396 }, { "epoch": 2.989394884591391, "grad_norm": 0.11470784626798826, "learning_rate": 1.0004079869044482e-06, "loss": 0.0274, "num_tokens": 191913223.0, "step": 2397 }, { "epoch": 2.990642545227698, "grad_norm": 0.10039798181941487, "learning_rate": 1.0003304703414053e-06, "loss": 0.0255, "num_tokens": 191993666.0, "step": 2398 }, { "epoch": 2.991890205864005, "grad_norm": 0.10361335698424663, "learning_rate": 1.0002611130392772e-06, "loss": 0.0263, "num_tokens": 192074190.0, "step": 2399 }, { "epoch": 2.993137866500312, "grad_norm": 0.10857365381947398, "learning_rate": 1.0001999151238303e-06, "loss": 0.0271, "num_tokens": 192153740.0, "step": 2400 }, { "epoch": 2.994385527136619, "grad_norm": 0.10539286028474344, "learning_rate": 1.0001468767060341e-06, "loss": 0.0264, "num_tokens": 192232957.0, "step": 2401 }, { "epoch": 2.995633187772926, "grad_norm": 0.1103209776499511, "learning_rate": 1.000101997882064e-06, "loss": 0.0273, "num_tokens": 192312730.0, "step": 2402 }, { "epoch": 2.9968808484092326, "grad_norm": 0.11105172947776741, "learning_rate": 1.0000652787332984e-06, "loss": 0.0274, "num_tokens": 192392230.0, "step": 2403 }, { "epoch": 2.9981285090455394, "grad_norm": 0.11159122867179921, "learning_rate": 1.0000367193263206e-06, "loss": 0.0275, "num_tokens": 192471730.0, "step": 2404 }, { "epoch": 2.9993761696818466, "grad_norm": 0.10531323399329573, "learning_rate": 1.000016319712917e-06, "loss": 0.0264, "num_tokens": 192551044.0, "step": 2405 }, { "epoch": 3.0, "grad_norm": 0.10531323399329573, "learning_rate": 1.0000040799300788e-06, "loss": 0.0257, "num_tokens": 192590850.0, "step": 2406 }, { "epoch": 3.0, "step": 2406, "total_flos": 3.699807586474721e+17, "train_loss": 0.0587594091221814, "train_runtime": 5162.578, "train_samples_per_second": 59.587, "train_steps_per_second": 0.466 } ], "logging_steps": 1, "max_steps": 2406, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.699807586474721e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null }