diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -2,4470 +2,1037 @@ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, - "epoch": 3.0, + "epoch": 2.0, "eval_steps": 500, - "global_step": 552, + "global_step": 124, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { - "epoch": 0.005434782608695652, - "grad_norm": 30.931470023815386, + "epoch": 0.016129032258064516, + "grad_norm": 19.043760587831713, "learning_rate": 0.0, - "loss": 2.463, - "num_tokens": 934637.0, + "loss": 2.0614, + "num_tokens": 313347.0, "step": 1 }, { - "epoch": 0.010869565217391304, - "grad_norm": 31.489623321141185, - "learning_rate": 5.882352941176471e-07, - "loss": 2.474, - "num_tokens": 1760595.0, + "epoch": 0.03225806451612903, + "grad_norm": 19.191061349201824, + "learning_rate": 2.5e-06, + "loss": 1.9809, + "num_tokens": 580861.0, "step": 2 }, { - "epoch": 0.016304347826086956, - "grad_norm": 31.108447609992588, - "learning_rate": 1.1764705882352942e-06, - "loss": 2.4621, - "num_tokens": 2596146.0, + "epoch": 0.04838709677419355, + "grad_norm": 17.128791951644, + "learning_rate": 5e-06, + "loss": 1.9838, + "num_tokens": 877737.0, "step": 3 }, { - "epoch": 0.021739130434782608, - "grad_norm": 30.000375004622274, - "learning_rate": 1.7647058823529414e-06, - "loss": 2.4652, - "num_tokens": 3464061.0, + "epoch": 0.06451612903225806, + "grad_norm": 12.56253606523345, + "learning_rate": 7.500000000000001e-06, + "loss": 1.9359, + "num_tokens": 1169090.0, "step": 4 }, { - "epoch": 0.02717391304347826, - "grad_norm": 26.594555017169228, - "learning_rate": 2.3529411764705885e-06, - "loss": 2.4123, - "num_tokens": 4302551.0, + "epoch": 0.08064516129032258, + "grad_norm": 6.340013165071523, + "learning_rate": 1e-05, + "loss": 1.8001, + "num_tokens": 1421402.0, "step": 5 }, { - "epoch": 0.03260869565217391, - "grad_norm": 18.864816379816464, - "learning_rate": 2.9411764705882355e-06, - "loss": 2.3219, - "num_tokens": 5154623.0, + "epoch": 0.0967741935483871, + "grad_norm": 2.5907459760185483, + "learning_rate": 9.998457962390009e-06, + "loss": 1.7182, + "num_tokens": 1748430.0, "step": 6 }, { - "epoch": 0.03804347826086957, - "grad_norm": 10.446852684747173, - "learning_rate": 3.529411764705883e-06, - "loss": 2.2219, - "num_tokens": 5954398.0, + "epoch": 0.11290322580645161, + "grad_norm": 1.993397891811303, + "learning_rate": 9.993832906395583e-06, + "loss": 1.6561, + "num_tokens": 2026425.0, "step": 7 }, { - "epoch": 0.043478260869565216, - "grad_norm": 9.024830472433766, - "learning_rate": 4.11764705882353e-06, - "loss": 2.1842, - "num_tokens": 6758441.0, + "epoch": 0.12903225806451613, + "grad_norm": 1.4145167358442785, + "learning_rate": 9.986128001799077e-06, + "loss": 1.6065, + "num_tokens": 2320763.0, "step": 8 }, { - "epoch": 0.04891304347826087, - "grad_norm": 4.294804553874004, - "learning_rate": 4.705882352941177e-06, - "loss": 2.0795, - "num_tokens": 7651800.0, + "epoch": 0.14516129032258066, + "grad_norm": 1.1519464875365162, + "learning_rate": 9.97534852915723e-06, + "loss": 1.5791, + "num_tokens": 2606598.0, "step": 9 }, { - "epoch": 0.05434782608695652, - "grad_norm": 3.831516223584368, - "learning_rate": 5.294117647058824e-06, - "loss": 2.0564, - "num_tokens": 8431972.0, + "epoch": 0.16129032258064516, + "grad_norm": 1.009580165383821, + "learning_rate": 9.961501876182148e-06, + "loss": 1.5296, + "num_tokens": 2882213.0, "step": 10 }, { - "epoch": 0.059782608695652176, - "grad_norm": 3.1066148849044355, - "learning_rate": 5.882352941176471e-06, - "loss": 2.005, - "num_tokens": 9325646.0, + "epoch": 0.1774193548387097, + "grad_norm": 1.2058759135949697, + "learning_rate": 9.94459753267812e-06, + "loss": 1.3391, + "num_tokens": 3073573.0, "step": 11 }, { - "epoch": 0.06521739130434782, - "grad_norm": 1.9811391246411751, - "learning_rate": 6.470588235294119e-06, - "loss": 1.9594, - "num_tokens": 10123238.0, + "epoch": 0.1935483870967742, + "grad_norm": 0.7676255805716512, + "learning_rate": 9.924647084037798e-06, + "loss": 1.5614, + "num_tokens": 3373574.0, "step": 12 }, { - "epoch": 0.07065217391304347, - "grad_norm": 1.815307583357642, - "learning_rate": 7.058823529411766e-06, - "loss": 1.933, - "num_tokens": 10986330.0, + "epoch": 0.20967741935483872, + "grad_norm": 0.6826412328593823, + "learning_rate": 9.901664203302126e-06, + "loss": 1.4283, + "num_tokens": 3631840.0, "step": 13 }, { - "epoch": 0.07608695652173914, - "grad_norm": 1.5197953846158025, - "learning_rate": 7.647058823529411e-06, - "loss": 1.932, - "num_tokens": 11858835.0, + "epoch": 0.22580645161290322, + "grad_norm": 0.7077083320322445, + "learning_rate": 9.875664641789545e-06, + "loss": 1.3923, + "num_tokens": 3915934.0, "step": 14 }, { - "epoch": 0.08152173913043478, - "grad_norm": 1.2531688120792746, - "learning_rate": 8.23529411764706e-06, - "loss": 1.8809, - "num_tokens": 12632696.0, + "epoch": 0.24193548387096775, + "grad_norm": 0.6141603112481281, + "learning_rate": 9.846666218300808e-06, + "loss": 1.4541, + "num_tokens": 4202285.0, "step": 15 }, { - "epoch": 0.08695652173913043, - "grad_norm": 1.0095986761399403, - "learning_rate": 8.823529411764707e-06, - "loss": 1.8721, - "num_tokens": 13473339.0, + "epoch": 0.25806451612903225, + "grad_norm": 0.5548222636117033, + "learning_rate": 9.814688806906869e-06, + "loss": 1.4786, + "num_tokens": 4507975.0, "step": 16 }, { - "epoch": 0.09239130434782608, - "grad_norm": 0.9721665109368234, - "learning_rate": 9.411764705882354e-06, - "loss": 1.808, - "num_tokens": 14333010.0, + "epoch": 0.27419354838709675, + "grad_norm": 0.5658935043666198, + "learning_rate": 9.779754323328192e-06, + "loss": 1.3057, + "num_tokens": 4739046.0, "step": 17 }, { - "epoch": 0.09782608695652174, - "grad_norm": 0.8709709682954093, - "learning_rate": 1e-05, - "loss": 1.7815, - "num_tokens": 15173517.0, + "epoch": 0.2903225806451613, + "grad_norm": 0.5037446088693822, + "learning_rate": 9.741886709914804e-06, + "loss": 1.4294, + "num_tokens": 5076962.0, "step": 18 }, { - "epoch": 0.10326086956521739, - "grad_norm": 0.6803540631317997, - "learning_rate": 9.99992241577049e-06, - "loss": 1.758, - "num_tokens": 15957365.0, + "epoch": 0.3064516129032258, + "grad_norm": 0.5378930507735589, + "learning_rate": 9.701111919237408e-06, + "loss": 1.3543, + "num_tokens": 5358491.0, "step": 19 }, { - "epoch": 0.10869565217391304, - "grad_norm": 0.6679029150502125, - "learning_rate": 9.999689665757205e-06, - "loss": 1.7524, - "num_tokens": 16745072.0, + "epoch": 0.3225806451612903, + "grad_norm": 0.5184640141452085, + "learning_rate": 9.65745789630079e-06, + "loss": 1.3506, + "num_tokens": 5604193.0, "step": 20 }, { - "epoch": 0.11413043478260869, - "grad_norm": 0.6618774203762685, - "learning_rate": 9.999301757985807e-06, - "loss": 1.7412, - "num_tokens": 17580494.0, + "epoch": 0.3387096774193548, + "grad_norm": 0.46831532530186054, + "learning_rate": 9.610954559391704e-06, + "loss": 1.4229, + "num_tokens": 5933679.0, "step": 21 }, { - "epoch": 0.11956521739130435, - "grad_norm": 0.5913838446441821, - "learning_rate": 9.998758705832084e-06, - "loss": 1.6931, - "num_tokens": 18417515.0, + "epoch": 0.3548387096774194, + "grad_norm": 0.4399291891190999, + "learning_rate": 9.561633779574375e-06, + "loss": 1.5099, + "num_tokens": 6300341.0, "step": 22 }, { - "epoch": 0.125, - "grad_norm": 0.5660660798912789, - "learning_rate": 9.998060528021493e-06, - "loss": 1.6828, - "num_tokens": 19260295.0, + "epoch": 0.3709677419354839, + "grad_norm": 0.5384364821541251, + "learning_rate": 9.509529358847655e-06, + "loss": 1.2766, + "num_tokens": 6565793.0, "step": 23 }, { - "epoch": 0.13043478260869565, - "grad_norm": 0.5724367315512363, - "learning_rate": 9.99720724862852e-06, - "loss": 1.6306, - "num_tokens": 20067769.0, + "epoch": 0.3870967741935484, + "grad_norm": 0.4763787750395734, + "learning_rate": 9.454677006978843e-06, + "loss": 1.3702, + "num_tokens": 6848147.0, "step": 24 }, { - "epoch": 0.1358695652173913, - "grad_norm": 0.4987897240832889, - "learning_rate": 9.996198897075842e-06, - "loss": 1.6535, - "num_tokens": 20923659.0, + "epoch": 0.4032258064516129, + "grad_norm": 0.524933436689644, + "learning_rate": 9.397114317029975e-06, + "loss": 1.1909, + "num_tokens": 7079129.0, "step": 25 }, { - "epoch": 0.14130434782608695, - "grad_norm": 0.569279701787871, - "learning_rate": 9.995035508133316e-06, - "loss": 1.5989, - "num_tokens": 21753912.0, + "epoch": 0.41935483870967744, + "grad_norm": 0.5589595769627278, + "learning_rate": 9.336880739593415e-06, + "loss": 1.2973, + "num_tokens": 7292676.0, "step": 26 }, { - "epoch": 0.14673913043478262, - "grad_norm": 0.5527506476811272, - "learning_rate": 9.993717121916778e-06, - "loss": 1.6212, - "num_tokens": 22671460.0, + "epoch": 0.43548387096774194, + "grad_norm": 0.5148871595386001, + "learning_rate": 9.274017555754408e-06, + "loss": 1.365, + "num_tokens": 7544904.0, "step": 27 }, { - "epoch": 0.15217391304347827, - "grad_norm": 0.4626809535443287, - "learning_rate": 9.992243783886663e-06, - "loss": 1.5866, - "num_tokens": 23397618.0, + "epoch": 0.45161290322580644, + "grad_norm": 0.48028169158196776, + "learning_rate": 9.20856784879907e-06, + "loss": 1.3684, + "num_tokens": 7857723.0, "step": 28 }, { - "epoch": 0.15760869565217392, - "grad_norm": 0.5294266278853005, - "learning_rate": 9.990615544846439e-06, - "loss": 1.5405, - "num_tokens": 24241586.0, + "epoch": 0.46774193548387094, + "grad_norm": 0.4808194286278664, + "learning_rate": 9.140576474687263e-06, + "loss": 1.3565, + "num_tokens": 8147374.0, "step": 29 }, { - "epoch": 0.16304347826086957, - "grad_norm": 0.45086474263260157, - "learning_rate": 9.988832460940846e-06, - "loss": 1.5416, - "num_tokens": 25109486.0, + "epoch": 0.4838709677419355, + "grad_norm": 0.506071943414548, + "learning_rate": 9.070090031310558e-06, + "loss": 1.2919, + "num_tokens": 8416061.0, "step": 30 }, { - "epoch": 0.16847826086956522, - "grad_norm": 0.5287629110395582, - "learning_rate": 9.986894593653969e-06, - "loss": 1.4971, - "num_tokens": 25837433.0, + "epoch": 0.5, + "grad_norm": 0.5021781927784371, + "learning_rate": 8.99715682655637e-06, + "loss": 1.2801, + "num_tokens": 8664701.0, "step": 31 }, { - "epoch": 0.17391304347826086, - "grad_norm": 0.5444997747567997, - "learning_rate": 9.984802009807117e-06, - "loss": 1.5018, - "num_tokens": 26717387.0, + "epoch": 0.5161290322580645, + "grad_norm": 0.42732442561710704, + "learning_rate": 8.92182684520014e-06, + "loss": 1.3522, + "num_tokens": 8978731.0, "step": 32 }, { - "epoch": 0.1793478260869565, - "grad_norm": 0.421750120715681, - "learning_rate": 9.982554781556512e-06, - "loss": 1.4965, - "num_tokens": 27566062.0, + "epoch": 0.532258064516129, + "grad_norm": 0.44909325150666163, + "learning_rate": 8.844151714648274e-06, + "loss": 1.3614, + "num_tokens": 9282179.0, "step": 33 }, { - "epoch": 0.18478260869565216, - "grad_norm": 0.4443148972342859, - "learning_rate": 9.98015298639081e-06, - "loss": 1.4654, - "num_tokens": 28399076.0, + "epoch": 0.5483870967741935, + "grad_norm": 0.4824455673507225, + "learning_rate": 8.764184669555295e-06, + "loss": 1.2084, + "num_tokens": 9502743.0, "step": 34 }, { - "epoch": 0.19021739130434784, - "grad_norm": 0.45877290987079933, - "learning_rate": 9.977596707128424e-06, - "loss": 1.4661, - "num_tokens": 29213011.0, + "epoch": 0.5645161290322581, + "grad_norm": 0.47992163887297146, + "learning_rate": 8.681980515339464e-06, + "loss": 1.1844, + "num_tokens": 9743949.0, "step": 35 }, { - "epoch": 0.1956521739130435, - "grad_norm": 0.41374557301191983, - "learning_rate": 9.974886031914665e-06, - "loss": 1.4237, - "num_tokens": 30060053.0, + "epoch": 0.5806451612903226, + "grad_norm": 0.46178742851005894, + "learning_rate": 8.597595590621893e-06, + "loss": 1.1921, + "num_tokens": 9959600.0, "step": 36 }, { - "epoch": 0.20108695652173914, - "grad_norm": 0.39953873131094225, - "learning_rate": 9.972021054218712e-06, - "loss": 1.4068, - "num_tokens": 30907525.0, + "epoch": 0.5967741935483871, + "grad_norm": 0.4340374656500575, + "learning_rate": 8.511087728614863e-06, + "loss": 1.3161, + "num_tokens": 10262291.0, "step": 37 }, { - "epoch": 0.20652173913043478, - "grad_norm": 0.35550524219246343, - "learning_rate": 9.969001872830383e-06, - "loss": 1.4106, - "num_tokens": 31678105.0, + "epoch": 0.6129032258064516, + "grad_norm": 0.45433902955394256, + "learning_rate": 8.422516217485826e-06, + "loss": 1.2079, + "num_tokens": 10543614.0, "step": 38 }, { - "epoch": 0.21195652173913043, - "grad_norm": 0.39325204679064013, - "learning_rate": 9.965828591856725e-06, - "loss": 1.3806, - "num_tokens": 32502810.0, + "epoch": 0.6290322580645161, + "grad_norm": 0.46311608539915283, + "learning_rate": 8.331941759724268e-06, + "loss": 1.2788, + "num_tokens": 10811522.0, "step": 39 }, { - "epoch": 0.21739130434782608, - "grad_norm": 0.3292339428008424, - "learning_rate": 9.962501320718432e-06, - "loss": 1.4045, - "num_tokens": 33366912.0, + "epoch": 0.6451612903225806, + "grad_norm": 0.4366093034406423, + "learning_rate": 8.239426430539243e-06, + "loss": 1.305, + "num_tokens": 11120108.0, "step": 40 }, { - "epoch": 0.22282608695652173, - "grad_norm": 0.3056101339882414, - "learning_rate": 9.959020174146066e-06, - "loss": 1.3811, - "num_tokens": 34235993.0, + "epoch": 0.6612903225806451, + "grad_norm": 0.5072981753583605, + "learning_rate": 8.14503363531613e-06, + "loss": 1.1433, + "num_tokens": 11325841.0, "step": 41 }, { - "epoch": 0.22826086956521738, - "grad_norm": 0.3913174080963275, - "learning_rate": 9.955385272176108e-06, - "loss": 1.4036, - "num_tokens": 35033428.0, + "epoch": 0.6774193548387096, + "grad_norm": 0.477037782285179, + "learning_rate": 8.048828066161748e-06, + "loss": 1.1719, + "num_tokens": 11569462.0, "step": 42 }, { - "epoch": 0.23369565217391305, - "grad_norm": 0.31162982854342974, - "learning_rate": 9.951596740146809e-06, - "loss": 1.4087, - "num_tokens": 35858670.0, + "epoch": 0.6935483870967742, + "grad_norm": 0.46609436274208466, + "learning_rate": 7.950875657567624e-06, + "loss": 1.1795, + "num_tokens": 11814487.0, "step": 43 }, { - "epoch": 0.2391304347826087, - "grad_norm": 0.3449184872880199, - "learning_rate": 9.947654708693872e-06, - "loss": 1.3565, - "num_tokens": 36640735.0, + "epoch": 0.7096774193548387, + "grad_norm": 0.4298159483980554, + "learning_rate": 7.85124354122177e-06, + "loss": 1.375, + "num_tokens": 12144553.0, "step": 44 }, { - "epoch": 0.24456521739130435, - "grad_norm": 0.35069789860103123, - "learning_rate": 9.943559313745957e-06, - "loss": 1.3857, - "num_tokens": 37442617.0, + "epoch": 0.7258064516129032, + "grad_norm": 0.4708649466235671, + "learning_rate": 7.75e-06, + "loss": 1.1701, + "num_tokens": 12400951.0, "step": 45 }, { - "epoch": 0.25, - "grad_norm": 0.3357337683263222, - "learning_rate": 9.939310696519977e-06, - "loss": 1.3612, - "num_tokens": 38262668.0, + "epoch": 0.7419354838709677, + "grad_norm": 0.44791488338817836, + "learning_rate": 7.64721442116824e-06, + "loss": 1.3548, + "num_tokens": 12693220.0, "step": 46 }, { - "epoch": 0.2554347826086957, - "grad_norm": 0.3269706093287553, - "learning_rate": 9.93490900351624e-06, - "loss": 1.4312, - "num_tokens": 39043577.0, + "epoch": 0.7580645161290323, + "grad_norm": 0.4433596715974645, + "learning_rate": 7.5429572488279615e-06, + "loss": 1.2491, + "num_tokens": 12977731.0, "step": 47 }, { - "epoch": 0.2608695652173913, - "grad_norm": 0.31667080944485854, - "learning_rate": 9.930354386513399e-06, - "loss": 1.3592, - "num_tokens": 39916985.0, + "epoch": 0.7741935483870968, + "grad_norm": 0.44185232442897254, + "learning_rate": 7.437299935637329e-06, + "loss": 1.2102, + "num_tokens": 13259223.0, "step": 48 }, { - "epoch": 0.266304347826087, - "grad_norm": 0.32866308366553426, - "learning_rate": 9.925647002563205e-06, - "loss": 1.3687, - "num_tokens": 40728183.0, + "epoch": 0.7903225806451613, + "grad_norm": 0.4472937078922799, + "learning_rate": 7.330314893841102e-06, + "loss": 1.3116, + "num_tokens": 13573794.0, "step": 49 }, { - "epoch": 0.2717391304347826, - "grad_norm": 0.3417326900026665, - "learning_rate": 9.920787013985106e-06, - "loss": 1.3411, - "num_tokens": 41591690.0, + "epoch": 0.8064516129032258, + "grad_norm": 0.41299180492717147, + "learning_rate": 7.222075445642904e-06, + "loss": 1.2618, + "num_tokens": 13899767.0, "step": 50 }, { - "epoch": 0.27717391304347827, - "grad_norm": 0.3143667375050653, - "learning_rate": 9.915774588360649e-06, - "loss": 1.3622, - "num_tokens": 42474792.0, + "epoch": 0.8225806451612904, + "grad_norm": 0.46038863199961155, + "learning_rate": 7.11265577295385e-06, + "loss": 1.1971, + "num_tokens": 14151819.0, "step": 51 }, { - "epoch": 0.2826086956521739, - "grad_norm": 0.39102018844666386, - "learning_rate": 9.910609898527686e-06, - "loss": 1.3498, - "num_tokens": 43338238.0, + "epoch": 0.8387096774193549, + "grad_norm": 0.450651782222284, + "learning_rate": 7.002130866551969e-06, + "loss": 1.2098, + "num_tokens": 14410241.0, "step": 52 }, { - "epoch": 0.28804347826086957, - "grad_norm": 0.32307873691348704, - "learning_rate": 9.905293122574433e-06, - "loss": 1.3416, - "num_tokens": 44166705.0, + "epoch": 0.8548387096774194, + "grad_norm": 0.4863917693028348, + "learning_rate": 6.890576474687264e-06, + "loss": 1.2556, + "num_tokens": 14667870.0, "step": 53 }, { - "epoch": 0.29347826086956524, - "grad_norm": 0.39172034819128265, - "learning_rate": 9.89982444383332e-06, - "loss": 1.3893, - "num_tokens": 45043592.0, + "epoch": 0.8709677419354839, + "grad_norm": 0.4556360391443774, + "learning_rate": 6.778069051167654e-06, + "loss": 1.227, + "num_tokens": 14958182.0, "step": 54 }, { - "epoch": 0.29891304347826086, - "grad_norm": 0.3754744196617686, - "learning_rate": 9.89420405087467e-06, - "loss": 1.339, - "num_tokens": 45896379.0, + "epoch": 0.8870967741935484, + "grad_norm": 0.46399595849428665, + "learning_rate": 6.664685702961344e-06, + "loss": 1.2075, + "num_tokens": 15186149.0, "step": 55 }, { - "epoch": 0.30434782608695654, - "grad_norm": 0.4073887677998757, - "learning_rate": 9.888432137500194e-06, - "loss": 1.3368, - "num_tokens": 46824934.0, + "epoch": 0.9032258064516129, + "grad_norm": 0.46467115415892357, + "learning_rate": 6.550504137351576e-06, + "loss": 1.1141, + "num_tokens": 15418340.0, "step": 56 }, { - "epoch": 0.30978260869565216, - "grad_norm": 0.4034319004576986, - "learning_rate": 9.88250890273632e-06, - "loss": 1.3016, - "num_tokens": 47724850.0, + "epoch": 0.9193548387096774, + "grad_norm": 0.4479031632733105, + "learning_rate": 6.4356026086799176e-06, + "loss": 1.231, + "num_tokens": 15687896.0, "step": 57 }, { - "epoch": 0.31521739130434784, - "grad_norm": 0.3540569448635205, - "learning_rate": 9.876434550827315e-06, - "loss": 1.2675, - "num_tokens": 48481161.0, + "epoch": 0.9354838709677419, + "grad_norm": 0.4787645012496271, + "learning_rate": 6.3200598647146645e-06, + "loss": 1.2224, + "num_tokens": 15939836.0, "step": 58 }, { - "epoch": 0.32065217391304346, - "grad_norm": 0.37996194073137046, - "learning_rate": 9.87020929122825e-06, - "loss": 1.3083, - "num_tokens": 49281414.0, + "epoch": 0.9516129032258065, + "grad_norm": 0.4784147917094902, + "learning_rate": 6.20395509268104e-06, + "loss": 1.1783, + "num_tokens": 16182264.0, "step": 59 }, { - "epoch": 0.32608695652173914, - "grad_norm": 0.2886408351426047, - "learning_rate": 9.86383333859778e-06, - "loss": 1.313, - "num_tokens": 50122650.0, + "epoch": 0.967741935483871, + "grad_norm": 0.4409279029067165, + "learning_rate": 6.087367864990234e-06, + "loss": 1.2301, + "num_tokens": 16479812.0, "step": 60 }, { - "epoch": 0.33152173913043476, - "grad_norm": 0.49520282031068646, - "learning_rate": 9.857306912790737e-06, - "loss": 1.2704, - "num_tokens": 50931600.0, + "epoch": 0.9838709677419355, + "grad_norm": 0.46090210524534153, + "learning_rate": 5.970378084704441e-06, + "loss": 1.1717, + "num_tokens": 16745204.0, "step": 61 }, { - "epoch": 0.33695652173913043, - "grad_norm": 0.2845258505703522, - "learning_rate": 9.850630238850549e-06, - "loss": 1.3057, - "num_tokens": 51863119.0, + "epoch": 1.0, + "grad_norm": 0.44871916967703795, + "learning_rate": 5.8530659307753034e-06, + "loss": 1.2622, + "num_tokens": 17062353.0, + "step": 62 + }, + { + "epoch": 1.0, + "eval_loss": 0.7913813591003418, + "eval_num_tokens": 17062353.0, + "eval_runtime": 16.1766, + "eval_samples_per_second": 26.952, + "eval_steps_per_second": 3.4, "step": 62 }, { - "epoch": 0.3423913043478261, - "grad_norm": 0.3736561127913658, - "learning_rate": 9.843803547001487e-06, - "loss": 1.3444, - "num_tokens": 52720131.0, + "epoch": 1.0161290322580645, + "grad_norm": 0.5678161695951662, + "learning_rate": 5.735511803093249e-06, + "loss": 1.164, + "num_tokens": 17330838.0, "step": 63 }, { - "epoch": 0.34782608695652173, - "grad_norm": 0.2938598936747867, - "learning_rate": 9.836827072640716e-06, - "loss": 1.3123, - "num_tokens": 53562926.0, + "epoch": 1.032258064516129, + "grad_norm": 0.49714776946648626, + "learning_rate": 5.61779626738543e-06, + "loss": 1.1099, + "num_tokens": 17584811.0, "step": 64 }, { - "epoch": 0.3532608695652174, - "grad_norm": 0.3009801939956506, - "learning_rate": 9.829701056330188e-06, - "loss": 1.3823, - "num_tokens": 54385006.0, + "epoch": 1.0483870967741935, + "grad_norm": 0.4952614524607988, + "learning_rate": 5.500000000000001e-06, + "loss": 1.068, + "num_tokens": 17816065.0, "step": 65 }, { - "epoch": 0.358695652173913, - "grad_norm": 0.3418977440700839, - "learning_rate": 9.82242574378834e-06, - "loss": 1.3006, - "num_tokens": 55136819.0, + "epoch": 1.064516129032258, + "grad_norm": 0.47737829878536614, + "learning_rate": 5.382203732614573e-06, + "loss": 1.1192, + "num_tokens": 18058289.0, "step": 66 }, { - "epoch": 0.3641304347826087, - "grad_norm": 0.297311298187414, - "learning_rate": 9.815001385881624e-06, - "loss": 1.282, - "num_tokens": 55938417.0, + "epoch": 1.0806451612903225, + "grad_norm": 0.5366710942774546, + "learning_rate": 5.264488196906753e-06, + "loss": 1.1419, + "num_tokens": 18306075.0, "step": 67 }, { - "epoch": 0.3695652173913043, - "grad_norm": 0.35597318816694135, - "learning_rate": 9.807428238615858e-06, - "loss": 1.3081, - "num_tokens": 56858823.0, + "epoch": 1.096774193548387, + "grad_norm": 0.5257565227179216, + "learning_rate": 5.1469340692247e-06, + "loss": 1.0779, + "num_tokens": 18565439.0, "step": 68 }, { - "epoch": 0.375, - "grad_norm": 0.3366963789989372, - "learning_rate": 9.799706563127395e-06, - "loss": 1.2703, - "num_tokens": 57649612.0, + "epoch": 1.1129032258064515, + "grad_norm": 0.45901117062143715, + "learning_rate": 5.02962191529556e-06, + "loss": 1.1919, + "num_tokens": 18888202.0, "step": 69 }, { - "epoch": 0.3804347826086957, - "grad_norm": 0.3242176442515732, - "learning_rate": 9.79183662567412e-06, - "loss": 1.2013, - "num_tokens": 58532874.0, + "epoch": 1.129032258064516, + "grad_norm": 0.4499403181957758, + "learning_rate": 4.912632135009769e-06, + "loss": 1.1876, + "num_tokens": 19227461.0, "step": 70 }, { - "epoch": 0.3858695652173913, - "grad_norm": 0.3089960724382554, - "learning_rate": 9.783818697626273e-06, - "loss": 1.3188, - "num_tokens": 59347395.0, + "epoch": 1.1451612903225807, + "grad_norm": 0.4457809525628867, + "learning_rate": 4.796044907318961e-06, + "loss": 1.1825, + "num_tokens": 19526976.0, "step": 71 }, { - "epoch": 0.391304347826087, - "grad_norm": 0.3566100658076413, - "learning_rate": 9.775653055457082e-06, - "loss": 1.2759, - "num_tokens": 60223045.0, + "epoch": 1.1612903225806452, + "grad_norm": 0.4844392819779952, + "learning_rate": 4.679940135285337e-06, + "loss": 1.1624, + "num_tokens": 19827305.0, "step": 72 }, { - "epoch": 0.3967391304347826, - "grad_norm": 0.31607856919293054, - "learning_rate": 9.76733998073324e-06, - "loss": 1.2761, - "num_tokens": 61112287.0, + "epoch": 1.1774193548387097, + "grad_norm": 0.49981501606286877, + "learning_rate": 4.564397391320085e-06, + "loss": 1.0475, + "num_tokens": 20060320.0, "step": 73 }, { - "epoch": 0.40217391304347827, - "grad_norm": 0.33524520211204156, - "learning_rate": 9.75887976010519e-06, - "loss": 1.2214, - "num_tokens": 61999584.0, + "epoch": 1.1935483870967742, + "grad_norm": 0.5223696586794989, + "learning_rate": 4.449495862648427e-06, + "loss": 1.1767, + "num_tokens": 20360669.0, "step": 74 }, { - "epoch": 0.4076086956521739, - "grad_norm": 0.3318957781485302, - "learning_rate": 9.750272685297241e-06, - "loss": 1.3252, - "num_tokens": 62836474.0, + "epoch": 1.2096774193548387, + "grad_norm": 0.4714460348035537, + "learning_rate": 4.335314297038656e-06, + "loss": 1.0923, + "num_tokens": 20626073.0, "step": 75 }, { - "epoch": 0.41304347826086957, - "grad_norm": 0.3405291896634781, - "learning_rate": 9.741519053097516e-06, - "loss": 1.2536, - "num_tokens": 63680479.0, + "epoch": 1.2258064516129032, + "grad_norm": 0.4642610991815464, + "learning_rate": 4.221930948832349e-06, + "loss": 1.0713, + "num_tokens": 20880729.0, "step": 76 }, { - "epoch": 0.41847826086956524, - "grad_norm": 0.31652450850198227, - "learning_rate": 9.732619165347705e-06, - "loss": 1.2756, - "num_tokens": 64500924.0, + "epoch": 1.2419354838709677, + "grad_norm": 0.6040299115460407, + "learning_rate": 4.109423525312738e-06, + "loss": 1.1555, + "num_tokens": 21158635.0, "step": 77 }, { - "epoch": 0.42391304347826086, - "grad_norm": 0.42768097965959784, - "learning_rate": 9.723573328932669e-06, - "loss": 1.2416, - "num_tokens": 65381318.0, + "epoch": 1.2580645161290323, + "grad_norm": 0.4520754012286213, + "learning_rate": 3.997869133448031e-06, + "loss": 1.1952, + "num_tokens": 21478814.0, "step": 78 }, { - "epoch": 0.42934782608695654, - "grad_norm": 0.3175011196248488, - "learning_rate": 9.71438185576985e-06, - "loss": 1.3265, - "num_tokens": 66223314.0, + "epoch": 1.2741935483870968, + "grad_norm": 0.46470644722571786, + "learning_rate": 3.887344227046149e-06, + "loss": 1.1584, + "num_tokens": 21746719.0, "step": 79 }, { - "epoch": 0.43478260869565216, - "grad_norm": 0.35099150469597595, - "learning_rate": 9.705045062798519e-06, - "loss": 1.2465, - "num_tokens": 67069071.0, + "epoch": 1.2903225806451613, + "grad_norm": 0.5174274295638124, + "learning_rate": 3.777924554357096e-06, + "loss": 1.1286, + "num_tokens": 22019852.0, "step": 80 }, { - "epoch": 0.44021739130434784, - "grad_norm": 0.31259023838571287, - "learning_rate": 9.695563271968853e-06, - "loss": 1.302, - "num_tokens": 67908788.0, + "epoch": 1.3064516129032258, + "grad_norm": 0.5667364164421924, + "learning_rate": 3.6696851061589e-06, + "loss": 0.9772, + "num_tokens": 22214967.0, "step": 81 }, { - "epoch": 0.44565217391304346, - "grad_norm": 0.342434777113354, - "learning_rate": 9.685936810230824e-06, - "loss": 1.1891, - "num_tokens": 68788781.0, + "epoch": 1.3225806451612903, + "grad_norm": 0.49744086185801495, + "learning_rate": 3.5627000643626707e-06, + "loss": 1.0362, + "num_tokens": 22435176.0, "step": 82 }, { - "epoch": 0.45108695652173914, - "grad_norm": 0.35249522091625607, - "learning_rate": 9.676166009522925e-06, - "loss": 1.2201, - "num_tokens": 69568499.0, + "epoch": 1.3387096774193548, + "grad_norm": 0.449325096358852, + "learning_rate": 3.45704275117204e-06, + "loss": 1.152, + "num_tokens": 22690775.0, "step": 83 }, { - "epoch": 0.45652173913043476, - "grad_norm": 0.3281902496464512, - "learning_rate": 9.666251206760732e-06, - "loss": 1.1964, - "num_tokens": 70417513.0, + "epoch": 1.3548387096774195, + "grad_norm": 0.4841951045590905, + "learning_rate": 3.352785578831762e-06, + "loss": 1.0755, + "num_tokens": 22922696.0, "step": 84 }, { - "epoch": 0.46195652173913043, - "grad_norm": 0.33253990126652816, - "learning_rate": 9.656192743825283e-06, - "loss": 1.3254, - "num_tokens": 71232903.0, + "epoch": 1.370967741935484, + "grad_norm": 0.5063933022129122, + "learning_rate": 3.2500000000000015e-06, + "loss": 0.9671, + "num_tokens": 23153591.0, "step": 85 }, { - "epoch": 0.4673913043478261, - "grad_norm": 0.34464296190071064, - "learning_rate": 9.645990967551287e-06, - "loss": 1.2181, - "num_tokens": 72117100.0, + "epoch": 1.3870967741935485, + "grad_norm": 0.461942743253413, + "learning_rate": 3.1487564587782306e-06, + "loss": 1.172, + "num_tokens": 23441243.0, "step": 86 }, { - "epoch": 0.47282608695652173, - "grad_norm": 0.32303008461614624, - "learning_rate": 9.635646229715168e-06, - "loss": 1.2541, - "num_tokens": 72955942.0, + "epoch": 1.403225806451613, + "grad_norm": 0.4634253227239053, + "learning_rate": 3.0491243424323787e-06, + "loss": 1.1146, + "num_tokens": 23737358.0, "step": 87 }, { - "epoch": 0.4782608695652174, - "grad_norm": 0.4087448612559883, - "learning_rate": 9.62515888702293e-06, - "loss": 1.2175, - "num_tokens": 73799335.0, + "epoch": 1.4193548387096775, + "grad_norm": 0.4751346862673272, + "learning_rate": 2.951171933838254e-06, + "loss": 1.0479, + "num_tokens": 23968817.0, "step": 88 }, { - "epoch": 0.483695652173913, - "grad_norm": 0.32968875759796035, - "learning_rate": 9.614529301097867e-06, - "loss": 1.3086, - "num_tokens": 74583507.0, + "epoch": 1.435483870967742, + "grad_norm": 0.45323636924397753, + "learning_rate": 2.854966364683872e-06, + "loss": 1.0562, + "num_tokens": 24256864.0, "step": 89 }, { - "epoch": 0.4891304347826087, - "grad_norm": 0.4968077240893464, - "learning_rate": 9.603757838468079e-06, - "loss": 1.2153, - "num_tokens": 75427148.0, + "epoch": 1.4516129032258065, + "grad_norm": 0.46528677302886307, + "learning_rate": 2.760573569460757e-06, + "loss": 1.0695, + "num_tokens": 24501706.0, "step": 90 }, { - "epoch": 0.4945652173913043, - "grad_norm": 0.30442420617213245, - "learning_rate": 9.592844870553849e-06, - "loss": 1.2644, - "num_tokens": 76235709.0, + "epoch": 1.467741935483871, + "grad_norm": 0.505657466750309, + "learning_rate": 2.6680582402757326e-06, + "loss": 1.1085, + "num_tokens": 24786181.0, "step": 91 }, { - "epoch": 0.5, - "grad_norm": 0.42123695977036113, - "learning_rate": 9.581790773654821e-06, - "loss": 1.2912, - "num_tokens": 77072728.0, + "epoch": 1.4838709677419355, + "grad_norm": 0.5206793115485164, + "learning_rate": 2.577483782514174e-06, + "loss": 1.1014, + "num_tokens": 25088184.0, "step": 92 }, { - "epoch": 0.5054347826086957, - "grad_norm": 0.3233765627590572, - "learning_rate": 9.57059592893704e-06, - "loss": 1.206, - "num_tokens": 77937646.0, + "epoch": 1.5, + "grad_norm": 0.45895212888032455, + "learning_rate": 2.4889122713851397e-06, + "loss": 1.1286, + "num_tokens": 25383583.0, "step": 93 }, { - "epoch": 0.5108695652173914, - "grad_norm": 0.5054335403354332, - "learning_rate": 9.55926072241979e-06, - "loss": 1.2503, - "num_tokens": 78761723.0, + "epoch": 1.5161290322580645, + "grad_norm": 0.475442254498189, + "learning_rate": 2.402404409378107e-06, + "loss": 1.1804, + "num_tokens": 25680306.0, "step": 94 }, { - "epoch": 0.5163043478260869, - "grad_norm": 0.31425650067487326, - "learning_rate": 9.547785544962303e-06, - "loss": 1.2237, - "num_tokens": 79555511.0, + "epoch": 1.532258064516129, + "grad_norm": 0.4681465978823353, + "learning_rate": 2.3180194846605367e-06, + "loss": 1.0562, + "num_tokens": 25916706.0, "step": 95 }, { - "epoch": 0.5217391304347826, - "grad_norm": 0.3394176534060976, - "learning_rate": 9.53617079225027e-06, - "loss": 1.2688, - "num_tokens": 80432436.0, + "epoch": 1.5483870967741935, + "grad_norm": 0.4698329009277512, + "learning_rate": 2.235815330444707e-06, + "loss": 1.0496, + "num_tokens": 26203702.0, "step": 96 }, { - "epoch": 0.5271739130434783, - "grad_norm": 0.3282087898215183, - "learning_rate": 9.524416864782196e-06, - "loss": 1.2155, - "num_tokens": 81254903.0, + "epoch": 1.564516129032258, + "grad_norm": 0.4620776407761363, + "learning_rate": 2.1558482853517257e-06, + "loss": 1.1171, + "num_tokens": 26454432.0, "step": 97 }, { - "epoch": 0.532608695652174, - "grad_norm": 0.40745731372236615, - "learning_rate": 9.51252416785559e-06, - "loss": 1.2437, - "num_tokens": 82021702.0, + "epoch": 1.5806451612903225, + "grad_norm": 0.4778576114278719, + "learning_rate": 2.0781731547998614e-06, + "loss": 1.1412, + "num_tokens": 26757353.0, "step": 98 }, { - "epoch": 0.5380434782608695, - "grad_norm": 0.3296122949882556, - "learning_rate": 9.500493111553007e-06, - "loss": 1.1678, - "num_tokens": 82863748.0, + "epoch": 1.596774193548387, + "grad_norm": 0.49222323971075366, + "learning_rate": 2.002843173443631e-06, + "loss": 1.1074, + "num_tokens": 27011103.0, "step": 99 }, { - "epoch": 0.5434782608695652, - "grad_norm": 0.36733750482254623, - "learning_rate": 9.488324110727878e-06, - "loss": 1.1795, - "num_tokens": 83720349.0, + "epoch": 1.6129032258064515, + "grad_norm": 0.4793979551450915, + "learning_rate": 1.9299099686894426e-06, + "loss": 1.0936, + "num_tokens": 27292511.0, "step": 100 }, { - "epoch": 0.5489130434782609, - "grad_norm": 0.32480120236031906, - "learning_rate": 9.476017584990229e-06, - "loss": 1.2254, - "num_tokens": 84577086.0, + "epoch": 1.629032258064516, + "grad_norm": 0.4685219627346812, + "learning_rate": 1.8594235253127373e-06, + "loss": 1.1612, + "num_tokens": 27582747.0, "step": 101 }, { - "epoch": 0.5543478260869565, - "grad_norm": 0.3456700140954568, - "learning_rate": 9.4635739586922e-06, - "loss": 1.2249, - "num_tokens": 85379172.0, + "epoch": 1.6451612903225805, + "grad_norm": 0.43357949601614376, + "learning_rate": 1.7914321512009297e-06, + "loss": 1.1014, + "num_tokens": 27890298.0, "step": 102 }, { - "epoch": 0.5597826086956522, - "grad_norm": 0.3327804028738996, - "learning_rate": 9.450993660913418e-06, - "loss": 1.246, - "num_tokens": 86178628.0, + "epoch": 1.661290322580645, + "grad_norm": 0.47228828800176886, + "learning_rate": 1.7259824442455925e-06, + "loss": 1.1231, + "num_tokens": 28189675.0, "step": 103 }, { - "epoch": 0.5652173913043478, - "grad_norm": 0.33610320319412545, - "learning_rate": 9.438277125446194e-06, - "loss": 1.2872, - "num_tokens": 86888597.0, + "epoch": 1.6774193548387095, + "grad_norm": 0.47005836494248976, + "learning_rate": 1.6631192604065856e-06, + "loss": 1.1725, + "num_tokens": 28467191.0, "step": 104 }, { - "epoch": 0.5706521739130435, - "grad_norm": 0.35507846358914824, - "learning_rate": 9.425424790780581e-06, - "loss": 1.231, - "num_tokens": 87680751.0, + "epoch": 1.6935483870967742, + "grad_norm": 0.45748103277177316, + "learning_rate": 1.602885682970026e-06, + "loss": 1.0356, + "num_tokens": 28721358.0, "step": 105 }, { - "epoch": 0.5760869565217391, - "grad_norm": 0.37608759046628953, - "learning_rate": 9.412437100089236e-06, - "loss": 1.2717, - "num_tokens": 88523003.0, + "epoch": 1.7096774193548387, + "grad_norm": 0.4704671291960182, + "learning_rate": 1.5453229930211567e-06, + "loss": 0.9526, + "num_tokens": 28961647.0, "step": 106 }, { - "epoch": 0.5815217391304348, - "grad_norm": 0.3371473758227733, - "learning_rate": 9.39931450121215e-06, - "loss": 1.1768, - "num_tokens": 89341610.0, + "epoch": 1.7258064516129032, + "grad_norm": 0.46468131549229014, + "learning_rate": 1.490470641152345e-06, + "loss": 1.0949, + "num_tokens": 29240678.0, "step": 107 }, { - "epoch": 0.5869565217391305, - "grad_norm": 0.4098885996751854, - "learning_rate": 9.386057446641195e-06, - "loss": 1.207, - "num_tokens": 90208259.0, + "epoch": 1.7419354838709677, + "grad_norm": 0.4560208901940087, + "learning_rate": 1.4383662204256283e-06, + "loss": 1.0717, + "num_tokens": 29494619.0, "step": 108 }, { - "epoch": 0.592391304347826, - "grad_norm": 0.3292910941182598, - "learning_rate": 9.372666393504537e-06, - "loss": 1.1822, - "num_tokens": 91019847.0, + "epoch": 1.7580645161290323, + "grad_norm": 0.46298104044213245, + "learning_rate": 1.389045440608296e-06, + "loss": 1.0458, + "num_tokens": 29743614.0, "step": 109 }, { - "epoch": 0.5978260869565217, - "grad_norm": 0.4063196810849176, - "learning_rate": 9.35914180355086e-06, - "loss": 1.1848, - "num_tokens": 91876225.0, + "epoch": 1.7741935483870968, + "grad_norm": 0.4638150820016886, + "learning_rate": 1.3425421036992098e-06, + "loss": 1.013, + "num_tokens": 30007700.0, "step": 110 }, { - "epoch": 0.6032608695652174, - "grad_norm": 0.37489279113107293, - "learning_rate": 9.345484143133447e-06, - "loss": 1.1911, - "num_tokens": 92726215.0, + "epoch": 1.7903225806451613, + "grad_norm": 0.5022999342160732, + "learning_rate": 1.2988880807625928e-06, + "loss": 1.0493, + "num_tokens": 30271127.0, "step": 111 }, { - "epoch": 0.6086956521739131, - "grad_norm": 0.3347930475232912, - "learning_rate": 9.331693883194105e-06, - "loss": 1.2572, - "num_tokens": 93557294.0, + "epoch": 1.8064516129032258, + "grad_norm": 0.47530583419287215, + "learning_rate": 1.2581132900851973e-06, + "loss": 1.0934, + "num_tokens": 30544026.0, "step": 112 }, { - "epoch": 0.6141304347826086, - "grad_norm": 0.42217331484514864, - "learning_rate": 9.317771499246918e-06, - "loss": 1.0868, - "num_tokens": 94355208.0, + "epoch": 1.8225806451612905, + "grad_norm": 0.4654618652730785, + "learning_rate": 1.2202456766718092e-06, + "loss": 1.2132, + "num_tokens": 30845167.0, "step": 113 }, { - "epoch": 0.6195652173913043, - "grad_norm": 0.35824817626614924, - "learning_rate": 9.303717471361855e-06, - "loss": 1.1502, - "num_tokens": 95180197.0, + "epoch": 1.838709677419355, + "grad_norm": 0.4758858120756182, + "learning_rate": 1.1853111930931314e-06, + "loss": 1.0661, + "num_tokens": 31116528.0, "step": 114 }, { - "epoch": 0.625, - "grad_norm": 0.3577504011359328, - "learning_rate": 9.289532284148218e-06, - "loss": 1.1584, - "num_tokens": 95965657.0, + "epoch": 1.8548387096774195, + "grad_norm": 0.4266705527526466, + "learning_rate": 1.1533337816991932e-06, + "loss": 1.1427, + "num_tokens": 31449920.0, "step": 115 }, { - "epoch": 0.6304347826086957, - "grad_norm": 0.4118880839491574, - "learning_rate": 9.275216426737924e-06, - "loss": 1.184, - "num_tokens": 96711264.0, + "epoch": 1.870967741935484, + "grad_norm": 0.46950406426161967, + "learning_rate": 1.1243353582104556e-06, + "loss": 1.0575, + "num_tokens": 31725049.0, "step": 116 }, { - "epoch": 0.6358695652173914, - "grad_norm": 0.34956250639415626, - "learning_rate": 9.260770392768652e-06, - "loss": 1.2688, - "num_tokens": 97532279.0, + "epoch": 1.8870967741935485, + "grad_norm": 0.4136012379656415, + "learning_rate": 1.0983357966978747e-06, + "loss": 1.1996, + "num_tokens": 32069007.0, "step": 117 }, { - "epoch": 0.6413043478260869, - "grad_norm": 0.3691660640244806, - "learning_rate": 9.246194680366802e-06, - "loss": 1.1405, - "num_tokens": 98354575.0, + "epoch": 1.903225806451613, + "grad_norm": 0.4517949911395405, + "learning_rate": 1.075352915962205e-06, + "loss": 1.1149, + "num_tokens": 32326796.0, "step": 118 }, { - "epoch": 0.6467391304347826, - "grad_norm": 0.39892059252243106, - "learning_rate": 9.231489792130343e-06, - "loss": 1.1899, - "num_tokens": 99122392.0, + "epoch": 1.9193548387096775, + "grad_norm": 0.4603464019921881, + "learning_rate": 1.0554024673218808e-06, + "loss": 1.1207, + "num_tokens": 32608767.0, "step": 119 }, { - "epoch": 0.6521739130434783, - "grad_norm": 0.3473137462183092, - "learning_rate": 9.216656235111463e-06, - "loss": 1.1618, - "num_tokens": 100066934.0, + "epoch": 1.935483870967742, + "grad_norm": 0.4679596836349307, + "learning_rate": 1.0384981238178535e-06, + "loss": 1.0565, + "num_tokens": 32850972.0, "step": 120 }, { - "epoch": 0.657608695652174, - "grad_norm": 0.34159360019061497, - "learning_rate": 9.201694520799086e-06, - "loss": 1.2785, - "num_tokens": 100968865.0, + "epoch": 1.9516129032258065, + "grad_norm": 0.463299490417067, + "learning_rate": 1.0246514708427703e-06, + "loss": 1.005, + "num_tokens": 33141047.0, "step": 121 }, { - "epoch": 0.6630434782608695, - "grad_norm": 0.349168816341091, - "learning_rate": 9.186605165101253e-06, - "loss": 1.1744, - "num_tokens": 101861480.0, + "epoch": 1.967741935483871, + "grad_norm": 0.4104285038193735, + "learning_rate": 1.0138719982009243e-06, + "loss": 1.1878, + "num_tokens": 33479683.0, "step": 122 }, { - "epoch": 0.6684782608695652, - "grad_norm": 0.48715571757970294, - "learning_rate": 9.171388688327307e-06, - "loss": 1.0673, - "num_tokens": 102672098.0, + "epoch": 1.9838709677419355, + "grad_norm": 0.4425250017797949, + "learning_rate": 1.0061670936044179e-06, + "loss": 1.1142, + "num_tokens": 33799628.0, "step": 123 }, { - "epoch": 0.6739130434782609, - "grad_norm": 0.40462894394274823, - "learning_rate": 9.156045615169978e-06, - "loss": 1.1947, - "num_tokens": 103464153.0, + "epoch": 2.0, + "grad_norm": 0.49001700049858166, + "learning_rate": 1.0015420376099925e-06, + "loss": 1.05, + "num_tokens": 34065558.0, "step": 124 }, { - "epoch": 0.6793478260869565, - "grad_norm": 0.400412668537012, - "learning_rate": 9.140576474687263e-06, - "loss": 1.2133, - "num_tokens": 104296723.0, - "step": 125 - }, - { - "epoch": 0.6847826086956522, - "grad_norm": 0.4267731447100407, - "learning_rate": 9.12498180028421e-06, - "loss": 1.1219, - "num_tokens": 105122812.0, - "step": 126 - }, - { - "epoch": 0.6902173913043478, - "grad_norm": 0.36403725435544143, - "learning_rate": 9.109262129694506e-06, - "loss": 1.1643, - "num_tokens": 105977186.0, - "step": 127 - }, - { - "epoch": 0.6956521739130435, - "grad_norm": 0.3689438135593094, - "learning_rate": 9.093418004961939e-06, - "loss": 1.1379, - "num_tokens": 106853500.0, - "step": 128 - }, - { - "epoch": 0.7010869565217391, - "grad_norm": 0.3699932926490634, - "learning_rate": 9.077449972421716e-06, - "loss": 1.0791, - "num_tokens": 107702626.0, - "step": 129 - }, - { - "epoch": 0.7065217391304348, - "grad_norm": 0.3557575331468489, - "learning_rate": 9.061358582681614e-06, - "loss": 1.1377, - "num_tokens": 108617243.0, - "step": 130 - }, - { - "epoch": 0.7119565217391305, - "grad_norm": 0.41864057524706433, - "learning_rate": 9.045144390603e-06, - "loss": 1.1609, - "num_tokens": 109434573.0, - "step": 131 - }, - { - "epoch": 0.717391304347826, - "grad_norm": 0.3985478205432592, - "learning_rate": 9.028807955281701e-06, - "loss": 1.1596, - "num_tokens": 110290620.0, - "step": 132 - }, - { - "epoch": 0.7228260869565217, - "grad_norm": 0.38174230253717656, - "learning_rate": 9.012349840028707e-06, - "loss": 1.2587, - "num_tokens": 111137178.0, - "step": 133 - }, - { - "epoch": 0.7282608695652174, - "grad_norm": 0.4429844876484063, - "learning_rate": 8.995770612350778e-06, - "loss": 1.0897, - "num_tokens": 111957193.0, - "step": 134 - }, - { - "epoch": 0.7336956521739131, - "grad_norm": 0.36878198276025403, - "learning_rate": 8.979070843930841e-06, - "loss": 1.1068, - "num_tokens": 112822859.0, - "step": 135 - }, - { - "epoch": 0.7391304347826086, - "grad_norm": 0.41569112518405493, - "learning_rate": 8.96225111060831e-06, - "loss": 1.1478, - "num_tokens": 113629995.0, - "step": 136 - }, - { - "epoch": 0.7445652173913043, - "grad_norm": 0.3991002343202174, - "learning_rate": 8.945311992359206e-06, - "loss": 1.149, - "num_tokens": 114406929.0, - "step": 137 - }, - { - "epoch": 0.75, - "grad_norm": 0.3851422262699403, - "learning_rate": 8.928254073276166e-06, - "loss": 1.134, - "num_tokens": 115289074.0, - "step": 138 - }, - { - "epoch": 0.7554347826086957, - "grad_norm": 0.3958650986478213, - "learning_rate": 8.911077941548306e-06, - "loss": 1.0919, - "num_tokens": 116124554.0, - "step": 139 - }, - { - "epoch": 0.7608695652173914, - "grad_norm": 0.42020377168887346, - "learning_rate": 8.893784189440937e-06, - "loss": 1.1666, - "num_tokens": 116865404.0, - "step": 140 - }, - { - "epoch": 0.7663043478260869, - "grad_norm": 0.39659340251721403, - "learning_rate": 8.876373413275139e-06, - "loss": 1.1569, - "num_tokens": 117700490.0, - "step": 141 - }, - { - "epoch": 0.7717391304347826, - "grad_norm": 0.4092121559904955, - "learning_rate": 8.858846213407201e-06, - "loss": 1.1584, - "num_tokens": 118525192.0, - "step": 142 - }, - { - "epoch": 0.7771739130434783, - "grad_norm": 0.5943065877313993, - "learning_rate": 8.841203194207925e-06, - "loss": 0.9875, - "num_tokens": 119327143.0, - "step": 143 - }, - { - "epoch": 0.782608695652174, - "grad_norm": 0.3884971352039889, - "learning_rate": 8.823444964041777e-06, - "loss": 1.1211, - "num_tokens": 120179598.0, - "step": 144 - }, - { - "epoch": 0.7880434782608695, - "grad_norm": 0.7599341798230562, - "learning_rate": 8.805572135245911e-06, - "loss": 1.0971, - "num_tokens": 120962600.0, - "step": 145 - }, - { - "epoch": 0.7934782608695652, - "grad_norm": 0.41080230187979083, - "learning_rate": 8.787585324109067e-06, - "loss": 1.1224, - "num_tokens": 121848370.0, - "step": 146 - }, - { - "epoch": 0.7989130434782609, - "grad_norm": 0.6507551224599566, - "learning_rate": 8.7694851508503e-06, - "loss": 1.0679, - "num_tokens": 122669064.0, - "step": 147 - }, - { - "epoch": 0.8043478260869565, - "grad_norm": 0.397919573924152, - "learning_rate": 8.751272239597612e-06, - "loss": 1.1958, - "num_tokens": 123524731.0, - "step": 148 - }, - { - "epoch": 0.8097826086956522, - "grad_norm": 0.7009649855156489, - "learning_rate": 8.732947218366414e-06, - "loss": 1.1231, - "num_tokens": 124370704.0, - "step": 149 - }, - { - "epoch": 0.8152173913043478, - "grad_norm": 0.4438846321792291, - "learning_rate": 8.71451071903789e-06, - "loss": 1.1334, - "num_tokens": 125139083.0, - "step": 150 - }, - { - "epoch": 0.8206521739130435, - "grad_norm": 0.5807316774912559, - "learning_rate": 8.695963377337191e-06, - "loss": 1.1102, - "num_tokens": 125968663.0, - "step": 151 - }, - { - "epoch": 0.8260869565217391, - "grad_norm": 0.47495874069754923, - "learning_rate": 8.677305832811524e-06, - "loss": 1.093, - "num_tokens": 126778083.0, - "step": 152 - }, - { - "epoch": 0.8315217391304348, - "grad_norm": 0.5997580007680087, - "learning_rate": 8.658538728808097e-06, - "loss": 1.2028, - "num_tokens": 127678753.0, - "step": 153 - }, - { - "epoch": 0.8369565217391305, - "grad_norm": 0.40279919197392205, - "learning_rate": 8.639662712451935e-06, - "loss": 1.1701, - "num_tokens": 128502938.0, - "step": 154 - }, - { - "epoch": 0.842391304347826, - "grad_norm": 0.5526326204068822, - "learning_rate": 8.620678434623563e-06, - "loss": 1.1252, - "num_tokens": 129345404.0, - "step": 155 - }, - { - "epoch": 0.8478260869565217, - "grad_norm": 0.419237231246158, - "learning_rate": 8.601586549936567e-06, - "loss": 1.0567, - "num_tokens": 130127627.0, - "step": 156 - }, - { - "epoch": 0.8532608695652174, - "grad_norm": 0.6590374996771516, - "learning_rate": 8.582387716715021e-06, - "loss": 1.096, - "num_tokens": 130908779.0, - "step": 157 - }, - { - "epoch": 0.8586956521739131, - "grad_norm": 0.4116437048227146, - "learning_rate": 8.563082596970785e-06, - "loss": 1.0645, - "num_tokens": 131740109.0, - "step": 158 - }, - { - "epoch": 0.8641304347826086, - "grad_norm": 0.7235012628969889, - "learning_rate": 8.543671856380672e-06, - "loss": 1.1546, - "num_tokens": 132563707.0, - "step": 159 - }, - { - "epoch": 0.8695652173913043, - "grad_norm": 0.4489313897309195, - "learning_rate": 8.524156164263509e-06, - "loss": 1.1447, - "num_tokens": 133376972.0, - "step": 160 - }, - { - "epoch": 0.875, - "grad_norm": 0.7048899032217469, - "learning_rate": 8.504536193557049e-06, - "loss": 1.0747, - "num_tokens": 134196485.0, - "step": 161 - }, - { - "epoch": 0.8804347826086957, - "grad_norm": 0.7655990331835255, - "learning_rate": 8.484812620794757e-06, - "loss": 1.0732, - "num_tokens": 134961911.0, - "step": 162 - }, - { - "epoch": 0.8858695652173914, - "grad_norm": 0.7306102642093804, - "learning_rate": 8.4649861260825e-06, - "loss": 1.1207, - "num_tokens": 135841447.0, - "step": 163 - }, - { - "epoch": 0.8913043478260869, - "grad_norm": 0.7318731588610641, - "learning_rate": 8.445057393075088e-06, - "loss": 1.1429, - "num_tokens": 136657977.0, - "step": 164 - }, - { - "epoch": 0.8967391304347826, - "grad_norm": 0.5519109731700101, - "learning_rate": 8.425027108952693e-06, - "loss": 1.0926, - "num_tokens": 137439821.0, - "step": 165 - }, - { - "epoch": 0.9021739130434783, - "grad_norm": 0.729877195905496, - "learning_rate": 8.404895964397166e-06, - "loss": 1.0711, - "num_tokens": 138324575.0, - "step": 166 - }, - { - "epoch": 0.907608695652174, - "grad_norm": 0.4841610288318797, - "learning_rate": 8.384664653568213e-06, - "loss": 1.0236, - "num_tokens": 139110177.0, - "step": 167 - }, - { - "epoch": 0.9130434782608695, - "grad_norm": 0.6132670925841823, - "learning_rate": 8.364333874079462e-06, - "loss": 1.1489, - "num_tokens": 139984515.0, - "step": 168 - }, - { - "epoch": 0.9184782608695652, - "grad_norm": 0.4702511572635688, - "learning_rate": 8.343904326974409e-06, - "loss": 1.0838, - "num_tokens": 140794223.0, - "step": 169 - }, - { - "epoch": 0.9239130434782609, - "grad_norm": 0.47600806420109454, - "learning_rate": 8.323376716702236e-06, - "loss": 1.1584, - "num_tokens": 141671790.0, - "step": 170 - }, - { - "epoch": 0.9293478260869565, - "grad_norm": 0.4892484481019011, - "learning_rate": 8.302751751093539e-06, - "loss": 1.087, - "num_tokens": 142589193.0, - "step": 171 - }, - { - "epoch": 0.9347826086956522, - "grad_norm": 0.39811210734784225, - "learning_rate": 8.282030141335899e-06, - "loss": 1.1127, - "num_tokens": 143424641.0, - "step": 172 - }, - { - "epoch": 0.9402173913043478, - "grad_norm": 0.4735646138056289, - "learning_rate": 8.261212601949374e-06, - "loss": 1.1186, - "num_tokens": 144254056.0, - "step": 173 - }, - { - "epoch": 0.9456521739130435, - "grad_norm": 0.43457638568705226, - "learning_rate": 8.240299850761851e-06, - "loss": 1.1558, - "num_tokens": 145058100.0, - "step": 174 - }, - { - "epoch": 0.9510869565217391, - "grad_norm": 0.5474669838467183, - "learning_rate": 8.219292608884309e-06, - "loss": 1.0842, - "num_tokens": 145874986.0, - "step": 175 - }, - { - "epoch": 0.9565217391304348, - "grad_norm": 0.42042241204376524, - "learning_rate": 8.198191600685931e-06, - "loss": 1.0286, - "num_tokens": 146747131.0, - "step": 176 - }, - { - "epoch": 0.9619565217391305, - "grad_norm": 0.4401784756495304, - "learning_rate": 8.176997553769146e-06, - "loss": 1.0931, - "num_tokens": 147579942.0, - "step": 177 - }, - { - "epoch": 0.967391304347826, - "grad_norm": 0.4198879718494924, - "learning_rate": 8.155711198944536e-06, - "loss": 1.0821, - "num_tokens": 148454246.0, - "step": 178 - }, - { - "epoch": 0.9728260869565217, - "grad_norm": 0.42262883519991834, - "learning_rate": 8.134333270205624e-06, - "loss": 1.1195, - "num_tokens": 149217392.0, - "step": 179 - }, - { - "epoch": 0.9782608695652174, - "grad_norm": 0.43257345855354584, - "learning_rate": 8.112864504703582e-06, - "loss": 1.1016, - "num_tokens": 150120592.0, - "step": 180 - }, - { - "epoch": 0.9836956521739131, - "grad_norm": 0.5443709834488111, - "learning_rate": 8.0913056427218e-06, - "loss": 1.0348, - "num_tokens": 151030177.0, - "step": 181 - }, - { - "epoch": 0.9891304347826086, - "grad_norm": 0.45036072810937827, - "learning_rate": 8.069657427650364e-06, - "loss": 1.0514, - "num_tokens": 151868928.0, - "step": 182 - }, - { - "epoch": 0.9945652173913043, - "grad_norm": 0.6115163980654007, - "learning_rate": 8.047920605960428e-06, - "loss": 1.0633, - "num_tokens": 152709677.0, - "step": 183 - }, - { - "epoch": 1.0, - "grad_norm": 0.4423384270708878, - "learning_rate": 8.026095927178458e-06, - "loss": 1.0377, - "num_tokens": 153598466.0, - "step": 184 - }, - { - "epoch": 1.0, - "eval_loss": 0.8116728663444519, - "eval_num_tokens": 153598466.0, - "eval_runtime": 116.3987, - "eval_samples_per_second": 44.94, - "eval_steps_per_second": 5.619, - "step": 184 - }, - { - "epoch": 1.0054347826086956, - "grad_norm": 0.7729891726078799, - "learning_rate": 8.004184143860408e-06, - "loss": 1.0648, - "num_tokens": 154435241.0, - "step": 185 - }, - { - "epoch": 1.0108695652173914, - "grad_norm": 0.5867866688003616, - "learning_rate": 7.982186011565755e-06, - "loss": 0.9857, - "num_tokens": 155233012.0, - "step": 186 - }, - { - "epoch": 1.016304347826087, - "grad_norm": 0.748318392981897, - "learning_rate": 7.960102288831454e-06, - "loss": 1.0936, - "num_tokens": 156068245.0, - "step": 187 - }, - { - "epoch": 1.0217391304347827, - "grad_norm": 0.6766886272953564, - "learning_rate": 7.937933737145777e-06, - "loss": 1.0318, - "num_tokens": 156917237.0, - "step": 188 - }, - { - "epoch": 1.0271739130434783, - "grad_norm": 0.5751547728567178, - "learning_rate": 7.915681120922055e-06, - "loss": 1.0895, - "num_tokens": 157726124.0, - "step": 189 - }, - { - "epoch": 1.0326086956521738, - "grad_norm": 0.652421513757397, - "learning_rate": 7.893345207472329e-06, - "loss": 1.0166, - "num_tokens": 158541791.0, - "step": 190 - }, - { - "epoch": 1.0380434782608696, - "grad_norm": 0.7072500365040744, - "learning_rate": 7.870926766980879e-06, - "loss": 1.0475, - "num_tokens": 159364181.0, - "step": 191 - }, - { - "epoch": 1.0434782608695652, - "grad_norm": 0.6233691645635457, - "learning_rate": 7.848426572477677e-06, - "loss": 1.0351, - "num_tokens": 160208993.0, - "step": 192 - }, - { - "epoch": 1.048913043478261, - "grad_norm": 0.48198089839824576, - "learning_rate": 7.825845399811723e-06, - "loss": 1.0098, - "num_tokens": 161011393.0, - "step": 193 - }, - { - "epoch": 1.0543478260869565, - "grad_norm": 0.4595646765612945, - "learning_rate": 7.8031840276243e-06, - "loss": 1.0633, - "num_tokens": 161890267.0, - "step": 194 - }, - { - "epoch": 1.059782608695652, - "grad_norm": 0.5917347479597387, - "learning_rate": 7.78044323732212e-06, - "loss": 0.9305, - "num_tokens": 162691891.0, - "step": 195 - }, - { - "epoch": 1.065217391304348, - "grad_norm": 0.5153644484556954, - "learning_rate": 7.75762381305038e-06, - "loss": 1.0327, - "num_tokens": 163497163.0, - "step": 196 - }, - { - "epoch": 1.0706521739130435, - "grad_norm": 0.6049212399144008, - "learning_rate": 7.734726541665722e-06, - "loss": 0.9907, - "num_tokens": 164379270.0, - "step": 197 - }, - { - "epoch": 1.0760869565217392, - "grad_norm": 0.4380348492738434, - "learning_rate": 7.711752212709106e-06, - "loss": 1.0875, - "num_tokens": 165182653.0, - "step": 198 - }, - { - "epoch": 1.0815217391304348, - "grad_norm": 0.644663054623725, - "learning_rate": 7.688701618378583e-06, - "loss": 0.9748, - "num_tokens": 165982495.0, - "step": 199 - }, - { - "epoch": 1.0869565217391304, - "grad_norm": 0.4810295772680435, - "learning_rate": 7.665575553501973e-06, - "loss": 1.0002, - "num_tokens": 166812144.0, - "step": 200 - }, - { - "epoch": 1.0923913043478262, - "grad_norm": 0.632025835493392, - "learning_rate": 7.64237481550947e-06, - "loss": 1.0258, - "num_tokens": 167689643.0, - "step": 201 - }, - { - "epoch": 1.0978260869565217, - "grad_norm": 0.47775863822859244, - "learning_rate": 7.619100204406127e-06, - "loss": 0.9575, - "num_tokens": 168572405.0, - "step": 202 - }, - { - "epoch": 1.1032608695652173, - "grad_norm": 0.7719138228541088, - "learning_rate": 7.595752522744287e-06, - "loss": 0.9942, - "num_tokens": 169473211.0, - "step": 203 - }, - { - "epoch": 1.108695652173913, - "grad_norm": 0.4601408289120345, - "learning_rate": 7.572332575595904e-06, - "loss": 0.98, - "num_tokens": 170315046.0, - "step": 204 - }, - { - "epoch": 1.1141304347826086, - "grad_norm": 0.6828651716457399, - "learning_rate": 7.548841170524779e-06, - "loss": 0.8964, - "num_tokens": 171200672.0, - "step": 205 - }, - { - "epoch": 1.1195652173913044, - "grad_norm": 0.5713003323730678, - "learning_rate": 7.525279117558719e-06, - "loss": 0.998, - "num_tokens": 172044295.0, - "step": 206 - }, - { - "epoch": 1.125, - "grad_norm": 0.7426722544915143, - "learning_rate": 7.501647229161599e-06, - "loss": 1.0206, - "num_tokens": 172855232.0, - "step": 207 - }, - { - "epoch": 1.1304347826086956, - "grad_norm": 0.7673163483132489, - "learning_rate": 7.477946320205358e-06, - "loss": 1.0895, - "num_tokens": 173728391.0, - "step": 208 - }, - { - "epoch": 1.1358695652173914, - "grad_norm": 0.5298469653563895, - "learning_rate": 7.454177207941884e-06, - "loss": 0.9346, - "num_tokens": 174587763.0, - "step": 209 - }, - { - "epoch": 1.141304347826087, - "grad_norm": 0.6057350246674399, - "learning_rate": 7.430340711974855e-06, - "loss": 1.0299, - "num_tokens": 175483652.0, - "step": 210 - }, - { - "epoch": 1.1467391304347827, - "grad_norm": 0.46922407617378503, - "learning_rate": 7.406437654231453e-06, - "loss": 1.0501, - "num_tokens": 176272672.0, - "step": 211 - }, - { - "epoch": 1.1521739130434783, - "grad_norm": 0.6030403553630953, - "learning_rate": 7.382468858934046e-06, - "loss": 1.017, - "num_tokens": 177048310.0, - "step": 212 - }, - { - "epoch": 1.1576086956521738, - "grad_norm": 0.43230362489103435, - "learning_rate": 7.358435152571749e-06, - "loss": 1.008, - "num_tokens": 177839107.0, - "step": 213 - }, - { - "epoch": 1.1630434782608696, - "grad_norm": 0.5448245892571197, - "learning_rate": 7.334337363871936e-06, - "loss": 0.8768, - "num_tokens": 178618954.0, - "step": 214 - }, - { - "epoch": 1.1684782608695652, - "grad_norm": 0.4880077220539455, - "learning_rate": 7.310176323771663e-06, - "loss": 0.9951, - "num_tokens": 179430065.0, - "step": 215 - }, - { - "epoch": 1.1739130434782608, - "grad_norm": 0.4876168134427255, - "learning_rate": 7.285952865389007e-06, - "loss": 0.9202, - "num_tokens": 180289162.0, - "step": 216 - }, - { - "epoch": 1.1793478260869565, - "grad_norm": 0.469527959973456, - "learning_rate": 7.261667823994351e-06, - "loss": 0.9602, - "num_tokens": 181130219.0, - "step": 217 - }, - { - "epoch": 1.184782608695652, - "grad_norm": 0.4953909092855739, - "learning_rate": 7.237322036981568e-06, - "loss": 1.0315, - "num_tokens": 181946236.0, - "step": 218 - }, - { - "epoch": 1.190217391304348, - "grad_norm": 0.4774488084639226, - "learning_rate": 7.212916343839163e-06, - "loss": 0.9541, - "num_tokens": 182783428.0, - "step": 219 - }, - { - "epoch": 1.1956521739130435, - "grad_norm": 0.47379668683218334, - "learning_rate": 7.188451586121312e-06, - "loss": 0.9369, - "num_tokens": 183616914.0, - "step": 220 - }, - { - "epoch": 1.2010869565217392, - "grad_norm": 0.511952905506674, - "learning_rate": 7.163928607418849e-06, - "loss": 0.9844, - "num_tokens": 184505786.0, - "step": 221 - }, - { - "epoch": 1.2065217391304348, - "grad_norm": 0.49063209058524954, - "learning_rate": 7.139348253330177e-06, - "loss": 0.9794, - "num_tokens": 185345903.0, - "step": 222 - }, - { - "epoch": 1.2119565217391304, - "grad_norm": 0.4803261309055045, - "learning_rate": 7.114711371432113e-06, - "loss": 0.9172, - "num_tokens": 186214745.0, - "step": 223 - }, - { - "epoch": 1.2173913043478262, - "grad_norm": 0.4684467286790229, - "learning_rate": 7.090018811250653e-06, - "loss": 1.0251, - "num_tokens": 187078658.0, - "step": 224 - }, - { - "epoch": 1.2228260869565217, - "grad_norm": 0.45476529917577446, - "learning_rate": 7.065271424231694e-06, - "loss": 0.9518, - "num_tokens": 187868285.0, - "step": 225 - }, - { - "epoch": 1.2282608695652173, - "grad_norm": 0.46693532592731146, - "learning_rate": 7.040470063711655e-06, - "loss": 0.9018, - "num_tokens": 188693201.0, - "step": 226 - }, - { - "epoch": 1.233695652173913, - "grad_norm": 0.4930658690295571, - "learning_rate": 7.015615584888072e-06, - "loss": 0.982, - "num_tokens": 189553180.0, - "step": 227 - }, - { - "epoch": 1.2391304347826086, - "grad_norm": 0.4835466860532625, - "learning_rate": 6.990708844790093e-06, - "loss": 0.9711, - "num_tokens": 190386572.0, - "step": 228 - }, - { - "epoch": 1.2445652173913044, - "grad_norm": 0.4852934677176225, - "learning_rate": 6.965750702248936e-06, - "loss": 0.9742, - "num_tokens": 191209747.0, - "step": 229 - }, - { - "epoch": 1.25, - "grad_norm": 0.41281861954385113, - "learning_rate": 6.940742017868274e-06, - "loss": 1.0304, - "num_tokens": 192083591.0, - "step": 230 + "epoch": 2.0, + "eval_loss": 0.7506336569786072, + "eval_num_tokens": 34065558.0, + "eval_runtime": 15.5922, + "eval_samples_per_second": 27.963, + "eval_steps_per_second": 3.527, + "step": 124 }, { - "epoch": 1.2554347826086958, - "grad_norm": 0.5379830623809766, - "learning_rate": 6.9156836539945535e-06, - "loss": 0.9603, - "num_tokens": 192952509.0, - "step": 231 - }, - { - "epoch": 1.2608695652173914, - "grad_norm": 0.4583357149101248, - "learning_rate": 6.890576474687264e-06, - "loss": 0.9592, - "num_tokens": 193784622.0, - "step": 232 - }, - { - "epoch": 1.266304347826087, - "grad_norm": 0.49248565076137285, - "learning_rate": 6.865421345689147e-06, - "loss": 1.007, - "num_tokens": 194645421.0, - "step": 233 - }, - { - "epoch": 1.2717391304347827, - "grad_norm": 0.503639933708953, - "learning_rate": 6.840219134396334e-06, - "loss": 1.0084, - "num_tokens": 195464421.0, - "step": 234 - }, - { - "epoch": 1.2771739130434783, - "grad_norm": 0.5216304382891217, - "learning_rate": 6.814970709828448e-06, - "loss": 0.9359, - "num_tokens": 196232757.0, - "step": 235 - }, - { - "epoch": 1.2826086956521738, - "grad_norm": 0.45832000640398235, - "learning_rate": 6.789676942598626e-06, - "loss": 1.019, - "num_tokens": 196993328.0, - "step": 236 - }, - { - "epoch": 1.2880434782608696, - "grad_norm": 0.4888056205645728, - "learning_rate": 6.764338704883511e-06, - "loss": 0.9833, - "num_tokens": 197842972.0, - "step": 237 - }, - { - "epoch": 1.2934782608695652, - "grad_norm": 0.424259471078176, - "learning_rate": 6.73895687039317e-06, - "loss": 1.0477, - "num_tokens": 198775071.0, - "step": 238 - }, - { - "epoch": 1.2989130434782608, - "grad_norm": 0.4738026617794314, - "learning_rate": 6.713532314340968e-06, - "loss": 1.0088, - "num_tokens": 199605517.0, - "step": 239 - }, - { - "epoch": 1.3043478260869565, - "grad_norm": 0.4492262228667904, - "learning_rate": 6.688065913413391e-06, - "loss": 1.0076, - "num_tokens": 200462948.0, - "step": 240 - }, - { - "epoch": 1.309782608695652, - "grad_norm": 0.5266897333579397, - "learning_rate": 6.662558545739812e-06, - "loss": 0.9553, - "num_tokens": 201254553.0, - "step": 241 - }, - { - "epoch": 1.315217391304348, - "grad_norm": 0.5044655819460336, - "learning_rate": 6.637011090862219e-06, - "loss": 0.9178, - "num_tokens": 202117833.0, - "step": 242 - }, - { - "epoch": 1.3206521739130435, - "grad_norm": 0.4926047411840709, - "learning_rate": 6.611424429704879e-06, - "loss": 1.0102, - "num_tokens": 202958199.0, - "step": 243 - }, - { - "epoch": 1.3260869565217392, - "grad_norm": 0.44629993398334267, - "learning_rate": 6.585799444543967e-06, - "loss": 1.0296, - "num_tokens": 203865043.0, - "step": 244 - }, - { - "epoch": 1.3315217391304348, - "grad_norm": 0.48718773409967614, - "learning_rate": 6.560137018977139e-06, - "loss": 0.9706, - "num_tokens": 204697445.0, - "step": 245 - }, - { - "epoch": 1.3369565217391304, - "grad_norm": 0.5347335066897773, - "learning_rate": 6.53443803789307e-06, - "loss": 0.931, - "num_tokens": 205550423.0, - "step": 246 - }, - { - "epoch": 1.3423913043478262, - "grad_norm": 0.4835007860499471, - "learning_rate": 6.5087033874409354e-06, - "loss": 1.0485, - "num_tokens": 206361818.0, - "step": 247 - }, - { - "epoch": 1.3478260869565217, - "grad_norm": 0.49682989878458295, - "learning_rate": 6.482933954999858e-06, - "loss": 0.9569, - "num_tokens": 207162202.0, - "step": 248 - }, - { - "epoch": 1.3532608695652173, - "grad_norm": 0.48604607203541333, - "learning_rate": 6.457130629148312e-06, - "loss": 1.0242, - "num_tokens": 207982118.0, - "step": 249 - }, - { - "epoch": 1.358695652173913, - "grad_norm": 0.49136925224395633, - "learning_rate": 6.431294299633473e-06, - "loss": 0.9358, - "num_tokens": 208798264.0, - "step": 250 - }, - { - "epoch": 1.3641304347826086, - "grad_norm": 0.5135733708571742, - "learning_rate": 6.405425857340554e-06, - "loss": 0.8883, - "num_tokens": 209645647.0, - "step": 251 - }, - { - "epoch": 1.3695652173913042, - "grad_norm": 0.5748699167895962, - "learning_rate": 6.3795261942620665e-06, - "loss": 0.831, - "num_tokens": 210500505.0, - "step": 252 - }, - { - "epoch": 1.375, - "grad_norm": 0.4347647256494062, - "learning_rate": 6.353596203467085e-06, - "loss": 1.0276, - "num_tokens": 211352989.0, - "step": 253 - }, - { - "epoch": 1.3804347826086958, - "grad_norm": 0.5593692195039036, - "learning_rate": 6.3276367790704315e-06, - "loss": 0.9188, - "num_tokens": 212210323.0, - "step": 254 - }, - { - "epoch": 1.3858695652173914, - "grad_norm": 0.501586708029447, - "learning_rate": 6.30164881620186e-06, - "loss": 0.9177, - "num_tokens": 212999101.0, - "step": 255 - }, - { - "epoch": 1.391304347826087, - "grad_norm": 0.4761388526416261, - "learning_rate": 6.275633210975179e-06, - "loss": 0.9635, - "num_tokens": 213807539.0, - "step": 256 - }, - { - "epoch": 1.3967391304347827, - "grad_norm": 0.7079104884965576, - "learning_rate": 6.249590860457362e-06, - "loss": 0.936, - "num_tokens": 214563435.0, - "step": 257 - }, - { - "epoch": 1.4021739130434783, - "grad_norm": 0.4607025462309026, - "learning_rate": 6.2235226626376075e-06, - "loss": 0.977, - "num_tokens": 215403936.0, - "step": 258 - }, - { - "epoch": 1.4076086956521738, - "grad_norm": 0.5992927631339661, - "learning_rate": 6.19742951639638e-06, - "loss": 0.8826, - "num_tokens": 216186119.0, - "step": 259 - }, - { - "epoch": 1.4130434782608696, - "grad_norm": 0.47309177952769493, - "learning_rate": 6.171312321474413e-06, - "loss": 1.0086, - "num_tokens": 217057725.0, - "step": 260 - }, - { - "epoch": 1.4184782608695652, - "grad_norm": 0.5947429552853162, - "learning_rate": 6.1451719784416775e-06, - "loss": 0.9028, - "num_tokens": 217861129.0, - "step": 261 - }, - { - "epoch": 1.4239130434782608, - "grad_norm": 0.4605662311729455, - "learning_rate": 6.119009388666344e-06, - "loss": 0.9635, - "num_tokens": 218696929.0, - "step": 262 - }, - { - "epoch": 1.4293478260869565, - "grad_norm": 0.4539223081529101, - "learning_rate": 6.0928254542836855e-06, - "loss": 1.0757, - "num_tokens": 219507650.0, - "step": 263 - }, - { - "epoch": 1.434782608695652, - "grad_norm": 0.5214888309662288, - "learning_rate": 6.066621078164979e-06, - "loss": 0.8576, - "num_tokens": 220352475.0, - "step": 264 - }, - { - "epoch": 1.440217391304348, - "grad_norm": 0.48316529011400333, - "learning_rate": 6.040397163886376e-06, - "loss": 0.9293, - "num_tokens": 221149474.0, - "step": 265 - }, - { - "epoch": 1.4456521739130435, - "grad_norm": 0.5089006578572763, - "learning_rate": 6.014154615697729e-06, - "loss": 0.9412, - "num_tokens": 221981897.0, - "step": 266 - }, - { - "epoch": 1.4510869565217392, - "grad_norm": 0.46809528790737637, - "learning_rate": 5.987894338491438e-06, - "loss": 0.9386, - "num_tokens": 222825554.0, - "step": 267 - }, - { - "epoch": 1.4565217391304348, - "grad_norm": 0.5183630856980103, - "learning_rate": 5.961617237771217e-06, - "loss": 1.0011, - "num_tokens": 223614846.0, - "step": 268 - }, - { - "epoch": 1.4619565217391304, - "grad_norm": 0.5281469620794269, - "learning_rate": 5.935324219620897e-06, - "loss": 0.9369, - "num_tokens": 224416773.0, - "step": 269 - }, - { - "epoch": 1.4673913043478262, - "grad_norm": 0.5452128676586095, - "learning_rate": 5.909016190673173e-06, - "loss": 0.8694, - "num_tokens": 225257332.0, - "step": 270 - }, - { - "epoch": 1.4728260869565217, - "grad_norm": 0.5366647661362368, - "learning_rate": 5.88269405807833e-06, - "loss": 0.8403, - "num_tokens": 226117463.0, - "step": 271 - }, - { - "epoch": 1.4782608695652173, - "grad_norm": 0.4785272172063298, - "learning_rate": 5.856358729472984e-06, - "loss": 0.9872, - "num_tokens": 227057124.0, - "step": 272 - }, - { - "epoch": 1.483695652173913, - "grad_norm": 0.48058788852849177, - "learning_rate": 5.830011112948768e-06, - "loss": 0.9887, - "num_tokens": 227914008.0, - "step": 273 - }, - { - "epoch": 1.4891304347826086, - "grad_norm": 0.45796547344368305, - "learning_rate": 5.803652117021029e-06, - "loss": 0.9978, - "num_tokens": 228787533.0, - "step": 274 - }, - { - "epoch": 1.4945652173913042, - "grad_norm": 0.4754307783345142, - "learning_rate": 5.777282650597496e-06, - "loss": 1.0031, - "num_tokens": 229610626.0, - "step": 275 - }, - { - "epoch": 1.5, - "grad_norm": 0.5886367778567009, - "learning_rate": 5.750903622946938e-06, - "loss": 0.8612, - "num_tokens": 230450267.0, - "step": 276 - }, - { - "epoch": 1.5054347826086958, - "grad_norm": 0.48579086470030963, - "learning_rate": 5.724515943667818e-06, - "loss": 0.9728, - "num_tokens": 231229938.0, - "step": 277 - }, - { - "epoch": 1.5108695652173914, - "grad_norm": 0.5456006186458885, - "learning_rate": 5.698120522656916e-06, - "loss": 0.8451, - "num_tokens": 232151140.0, - "step": 278 - }, - { - "epoch": 1.516304347826087, - "grad_norm": 0.4723944904567319, - "learning_rate": 5.671718270077971e-06, - "loss": 0.9205, - "num_tokens": 232968829.0, - "step": 279 - }, - { - "epoch": 1.5217391304347827, - "grad_norm": 0.4955922792130881, - "learning_rate": 5.645310096330281e-06, - "loss": 1.0536, - "num_tokens": 233792405.0, - "step": 280 - }, - { - "epoch": 1.5271739130434783, - "grad_norm": 0.48095668095634725, - "learning_rate": 5.618896912017318e-06, - "loss": 0.9331, - "num_tokens": 234642312.0, - "step": 281 - }, - { - "epoch": 1.5326086956521738, - "grad_norm": 0.43956394618563654, - "learning_rate": 5.592479627915329e-06, - "loss": 0.9626, - "num_tokens": 235509729.0, - "step": 282 - }, - { - "epoch": 1.5380434782608696, - "grad_norm": 0.48850505034509745, - "learning_rate": 5.566059154941925e-06, - "loss": 0.9131, - "num_tokens": 236303967.0, - "step": 283 - }, - { - "epoch": 1.5434782608695652, - "grad_norm": 0.45628959831143406, - "learning_rate": 5.539636404124684e-06, - "loss": 0.9329, - "num_tokens": 237145438.0, - "step": 284 - }, - { - "epoch": 1.5489130434782608, - "grad_norm": 0.49901605657674153, - "learning_rate": 5.513212286569721e-06, - "loss": 0.9331, - "num_tokens": 237961859.0, - "step": 285 - }, - { - "epoch": 1.5543478260869565, - "grad_norm": 0.4929646508453616, - "learning_rate": 5.48678771343028e-06, - "loss": 0.9098, - "num_tokens": 238765601.0, - "step": 286 - }, - { - "epoch": 1.5597826086956523, - "grad_norm": 0.4998589986156106, - "learning_rate": 5.4603635958753175e-06, - "loss": 1.0031, - "num_tokens": 239549930.0, - "step": 287 - }, - { - "epoch": 1.5652173913043477, - "grad_norm": 0.46124830740660894, - "learning_rate": 5.433940845058076e-06, - "loss": 0.9741, - "num_tokens": 240380529.0, - "step": 288 - }, - { - "epoch": 1.5706521739130435, - "grad_norm": 0.5486007366307898, - "learning_rate": 5.407520372084675e-06, - "loss": 0.9231, - "num_tokens": 241226627.0, - "step": 289 - }, - { - "epoch": 1.5760869565217392, - "grad_norm": 0.5184363624069811, - "learning_rate": 5.381103087982684e-06, - "loss": 0.8461, - "num_tokens": 242094535.0, - "step": 290 - }, - { - "epoch": 1.5815217391304348, - "grad_norm": 0.4837976811372812, - "learning_rate": 5.354689903669721e-06, - "loss": 0.9587, - "num_tokens": 242992792.0, - "step": 291 - }, - { - "epoch": 1.5869565217391304, - "grad_norm": 0.4916359390877864, - "learning_rate": 5.3282817299220305e-06, - "loss": 0.986, - "num_tokens": 243778201.0, - "step": 292 - }, - { - "epoch": 1.5923913043478262, - "grad_norm": 0.5075475546171477, - "learning_rate": 5.301879477343086e-06, - "loss": 0.9105, - "num_tokens": 244637452.0, - "step": 293 - }, - { - "epoch": 1.5978260869565217, - "grad_norm": 0.547518662376535, - "learning_rate": 5.2754840563321855e-06, - "loss": 0.8231, - "num_tokens": 245474932.0, - "step": 294 - }, - { - "epoch": 1.6032608695652173, - "grad_norm": 0.48841687453221216, - "learning_rate": 5.249096377053064e-06, - "loss": 0.8715, - "num_tokens": 246385864.0, - "step": 295 - }, - { - "epoch": 1.608695652173913, - "grad_norm": 0.48332832949034804, - "learning_rate": 5.222717349402506e-06, - "loss": 0.8894, - "num_tokens": 247233868.0, - "step": 296 - }, - { - "epoch": 1.6141304347826086, - "grad_norm": 0.5186215837517858, - "learning_rate": 5.196347882978971e-06, - "loss": 0.8804, - "num_tokens": 248038839.0, - "step": 297 - }, - { - "epoch": 1.6195652173913042, - "grad_norm": 0.4794963510671903, - "learning_rate": 5.169988887051234e-06, - "loss": 0.9761, - "num_tokens": 248838659.0, - "step": 298 - }, - { - "epoch": 1.625, - "grad_norm": 0.4764998146716315, - "learning_rate": 5.143641270527018e-06, - "loss": 0.9302, - "num_tokens": 249673557.0, - "step": 299 - }, - { - "epoch": 1.6304347826086958, - "grad_norm": 0.5208992798322531, - "learning_rate": 5.117305941921672e-06, - "loss": 0.8405, - "num_tokens": 250541800.0, - "step": 300 - }, - { - "epoch": 1.6358695652173914, - "grad_norm": 0.4707803571978035, - "learning_rate": 5.0909838093268294e-06, - "loss": 0.9375, - "num_tokens": 251432104.0, - "step": 301 - }, - { - "epoch": 1.641304347826087, - "grad_norm": 0.5327648983863379, - "learning_rate": 5.064675780379104e-06, - "loss": 0.9696, - "num_tokens": 252202097.0, - "step": 302 - }, - { - "epoch": 1.6467391304347827, - "grad_norm": 0.5032050282566293, - "learning_rate": 5.038382762228786e-06, - "loss": 0.8875, - "num_tokens": 253012007.0, - "step": 303 - }, - { - "epoch": 1.6521739130434783, - "grad_norm": 0.5056029507702218, - "learning_rate": 5.012105661508566e-06, - "loss": 0.9087, - "num_tokens": 253785330.0, - "step": 304 - }, - { - "epoch": 1.6576086956521738, - "grad_norm": 0.4789485330661956, - "learning_rate": 4.985845384302271e-06, - "loss": 0.9499, - "num_tokens": 254643671.0, - "step": 305 - }, - { - "epoch": 1.6630434782608696, - "grad_norm": 0.45360594618306144, - "learning_rate": 4.9596028361136265e-06, - "loss": 0.9342, - "num_tokens": 255565652.0, - "step": 306 - }, - { - "epoch": 1.6684782608695652, - "grad_norm": 0.49148936631218604, - "learning_rate": 4.933378921835021e-06, - "loss": 0.9425, - "num_tokens": 256415218.0, - "step": 307 - }, - { - "epoch": 1.6739130434782608, - "grad_norm": 0.4727296330572373, - "learning_rate": 4.907174545716317e-06, - "loss": 0.9524, - "num_tokens": 257293576.0, - "step": 308 - }, - { - "epoch": 1.6793478260869565, - "grad_norm": 0.4294077987646295, - "learning_rate": 4.8809906113336584e-06, - "loss": 0.9699, - "num_tokens": 258139940.0, - "step": 309 - }, - { - "epoch": 1.6847826086956523, - "grad_norm": 0.46396440371930237, - "learning_rate": 4.854828021558323e-06, - "loss": 0.908, - "num_tokens": 258995189.0, - "step": 310 - }, - { - "epoch": 1.6902173913043477, - "grad_norm": 0.4451003909507794, - "learning_rate": 4.8286876785255895e-06, - "loss": 0.9113, - "num_tokens": 259849182.0, - "step": 311 - }, - { - "epoch": 1.6956521739130435, - "grad_norm": 0.4644149101702856, - "learning_rate": 4.80257048360362e-06, - "loss": 0.9047, - "num_tokens": 260728692.0, - "step": 312 - }, - { - "epoch": 1.7010869565217392, - "grad_norm": 0.4765302230895247, - "learning_rate": 4.776477337362394e-06, - "loss": 0.9364, - "num_tokens": 261498550.0, - "step": 313 - }, - { - "epoch": 1.7065217391304348, - "grad_norm": 0.4799480621946747, - "learning_rate": 4.75040913954264e-06, - "loss": 0.8931, - "num_tokens": 262339383.0, - "step": 314 - }, - { - "epoch": 1.7119565217391304, - "grad_norm": 0.4330158349077731, - "learning_rate": 4.724366789024822e-06, - "loss": 0.9221, - "num_tokens": 263211486.0, - "step": 315 - }, - { - "epoch": 1.7173913043478262, - "grad_norm": 0.4719343700469422, - "learning_rate": 4.698351183798141e-06, - "loss": 0.9044, - "num_tokens": 264046091.0, - "step": 316 - }, - { - "epoch": 1.7228260869565217, - "grad_norm": 0.5784427415196676, - "learning_rate": 4.672363220929567e-06, - "loss": 0.8772, - "num_tokens": 264888538.0, - "step": 317 - }, - { - "epoch": 1.7282608695652173, - "grad_norm": 0.5091837295220749, - "learning_rate": 4.646403796532916e-06, - "loss": 0.8942, - "num_tokens": 265630637.0, - "step": 318 - }, - { - "epoch": 1.733695652173913, - "grad_norm": 0.4667527874733642, - "learning_rate": 4.620473805737934e-06, - "loss": 0.9195, - "num_tokens": 266433532.0, - "step": 319 - }, - { - "epoch": 1.7391304347826086, - "grad_norm": 0.46506831938872817, - "learning_rate": 4.594574142659448e-06, - "loss": 0.9369, - "num_tokens": 267259816.0, - "step": 320 - }, - { - "epoch": 1.7445652173913042, - "grad_norm": 0.43003978346559196, - "learning_rate": 4.568705700366527e-06, - "loss": 0.9196, - "num_tokens": 268110996.0, - "step": 321 - }, - { - "epoch": 1.75, - "grad_norm": 0.41920568386474366, - "learning_rate": 4.542869370851689e-06, - "loss": 1.013, - "num_tokens": 268972337.0, - "step": 322 - }, - { - "epoch": 1.7554347826086958, - "grad_norm": 0.42507556465528384, - "learning_rate": 4.517066045000142e-06, - "loss": 0.9282, - "num_tokens": 269834146.0, - "step": 323 - }, - { - "epoch": 1.7608695652173914, - "grad_norm": 0.4422683354025743, - "learning_rate": 4.491296612559066e-06, - "loss": 0.9635, - "num_tokens": 270706702.0, - "step": 324 - }, - { - "epoch": 1.766304347826087, - "grad_norm": 0.4622516150145145, - "learning_rate": 4.465561962106931e-06, - "loss": 0.917, - "num_tokens": 271556924.0, - "step": 325 - }, - { - "epoch": 1.7717391304347827, - "grad_norm": 0.4707369643560396, - "learning_rate": 4.439862981022862e-06, - "loss": 0.8909, - "num_tokens": 272426695.0, - "step": 326 - }, - { - "epoch": 1.7771739130434783, - "grad_norm": 0.46585694696977864, - "learning_rate": 4.4142005554560345e-06, - "loss": 0.8839, - "num_tokens": 273265387.0, - "step": 327 - }, - { - "epoch": 1.7826086956521738, - "grad_norm": 0.4788713880587738, - "learning_rate": 4.388575570295123e-06, - "loss": 0.9002, - "num_tokens": 274049150.0, - "step": 328 - }, - { - "epoch": 1.7880434782608696, - "grad_norm": 0.4531241832723311, - "learning_rate": 4.362988909137783e-06, - "loss": 0.871, - "num_tokens": 274875505.0, - "step": 329 - }, - { - "epoch": 1.7934782608695652, - "grad_norm": 0.4925895978493023, - "learning_rate": 4.33744145426019e-06, - "loss": 0.869, - "num_tokens": 275693800.0, - "step": 330 - }, - { - "epoch": 1.7989130434782608, - "grad_norm": 0.47581574588692704, - "learning_rate": 4.311934086586611e-06, - "loss": 0.9176, - "num_tokens": 276516215.0, - "step": 331 - }, - { - "epoch": 1.8043478260869565, - "grad_norm": 0.5145400148096132, - "learning_rate": 4.286467685659034e-06, - "loss": 0.8774, - "num_tokens": 277343626.0, - "step": 332 - }, - { - "epoch": 1.8097826086956523, - "grad_norm": 0.4628475433186417, - "learning_rate": 4.261043129606832e-06, - "loss": 0.9669, - "num_tokens": 278089598.0, - "step": 333 - }, - { - "epoch": 1.8152173913043477, - "grad_norm": 0.4271094269757471, - "learning_rate": 4.23566129511649e-06, - "loss": 0.9878, - "num_tokens": 278943252.0, - "step": 334 - }, - { - "epoch": 1.8206521739130435, - "grad_norm": 0.4662067117686951, - "learning_rate": 4.210323057401375e-06, - "loss": 0.9906, - "num_tokens": 279813500.0, - "step": 335 - }, - { - "epoch": 1.8260869565217392, - "grad_norm": 0.4546197238056592, - "learning_rate": 4.185029290171554e-06, - "loss": 0.8521, - "num_tokens": 280615503.0, - "step": 336 - }, - { - "epoch": 1.8315217391304348, - "grad_norm": 0.4202719918715804, - "learning_rate": 4.159780865603667e-06, - "loss": 0.9647, - "num_tokens": 281423980.0, - "step": 337 - }, - { - "epoch": 1.8369565217391304, - "grad_norm": 0.40457263606820165, - "learning_rate": 4.134578654310854e-06, - "loss": 0.9852, - "num_tokens": 282282019.0, - "step": 338 - }, - { - "epoch": 1.8423913043478262, - "grad_norm": 0.4560747998505136, - "learning_rate": 4.109423525312738e-06, - "loss": 0.9767, - "num_tokens": 283156303.0, - "step": 339 - }, - { - "epoch": 1.8478260869565217, - "grad_norm": 0.45185534245989895, - "learning_rate": 4.084316346005449e-06, - "loss": 0.8952, - "num_tokens": 283990835.0, - "step": 340 - }, - { - "epoch": 1.8532608695652173, - "grad_norm": 0.5026480004340574, - "learning_rate": 4.059257982131728e-06, - "loss": 0.7806, - "num_tokens": 284772502.0, - "step": 341 - }, - { - "epoch": 1.858695652173913, - "grad_norm": 0.4862030095553426, - "learning_rate": 4.034249297751064e-06, - "loss": 0.8633, - "num_tokens": 285540737.0, - "step": 342 - }, - { - "epoch": 1.8641304347826086, - "grad_norm": 0.8798921866611329, - "learning_rate": 4.009291155209909e-06, - "loss": 0.8267, - "num_tokens": 286344793.0, - "step": 343 - }, - { - "epoch": 1.8695652173913042, - "grad_norm": 0.44507904137666465, - "learning_rate": 3.9843844151119306e-06, - "loss": 0.9401, - "num_tokens": 287183890.0, - "step": 344 - }, - { - "epoch": 1.875, - "grad_norm": 0.48470270510022156, - "learning_rate": 3.959529936288345e-06, - "loss": 0.8448, - "num_tokens": 287987343.0, - "step": 345 - }, - { - "epoch": 1.8804347826086958, - "grad_norm": 0.46250239356468154, - "learning_rate": 3.934728575768307e-06, - "loss": 0.8711, - "num_tokens": 288888769.0, - "step": 346 - }, - { - "epoch": 1.8858695652173914, - "grad_norm": 0.4402667150189706, - "learning_rate": 3.909981188749347e-06, - "loss": 0.9335, - "num_tokens": 289748009.0, - "step": 347 - }, - { - "epoch": 1.891304347826087, - "grad_norm": 0.438457735442828, - "learning_rate": 3.8852886285678896e-06, - "loss": 0.8716, - "num_tokens": 290622938.0, - "step": 348 - }, - { - "epoch": 1.8967391304347827, - "grad_norm": 0.4461771933881913, - "learning_rate": 3.8606517466698246e-06, - "loss": 0.8591, - "num_tokens": 291485055.0, - "step": 349 - }, - { - "epoch": 1.9021739130434783, - "grad_norm": 0.40547876560113, - "learning_rate": 3.8360713925811535e-06, - "loss": 0.9276, - "num_tokens": 292402125.0, - "step": 350 - }, - { - "epoch": 1.9076086956521738, - "grad_norm": 0.4652174044644762, - "learning_rate": 3.8115484138786896e-06, - "loss": 0.7969, - "num_tokens": 293187972.0, - "step": 351 - }, - { - "epoch": 1.9130434782608696, - "grad_norm": 0.4114735123923004, - "learning_rate": 3.787083656160838e-06, - "loss": 0.9328, - "num_tokens": 294085825.0, - "step": 352 - }, - { - "epoch": 1.9184782608695652, - "grad_norm": 0.4182275804025475, - "learning_rate": 3.762677963018433e-06, - "loss": 0.9367, - "num_tokens": 294887669.0, - "step": 353 - }, - { - "epoch": 1.9239130434782608, - "grad_norm": 0.45051073241068795, - "learning_rate": 3.7383321760056524e-06, - "loss": 0.8766, - "num_tokens": 295722993.0, - "step": 354 - }, - { - "epoch": 1.9293478260869565, - "grad_norm": 0.43307154681774906, - "learning_rate": 3.714047134610994e-06, - "loss": 0.8914, - "num_tokens": 296620729.0, - "step": 355 - }, - { - "epoch": 1.9347826086956523, - "grad_norm": 0.5477818107241225, - "learning_rate": 3.6898236762283378e-06, - "loss": 0.7846, - "num_tokens": 297404600.0, - "step": 356 - }, - { - "epoch": 1.9402173913043477, - "grad_norm": 0.41249150875190044, - "learning_rate": 3.6656626361280645e-06, - "loss": 0.9446, - "num_tokens": 298233764.0, - "step": 357 - }, - { - "epoch": 1.9456521739130435, - "grad_norm": 0.42525055229176784, - "learning_rate": 3.641564847428254e-06, - "loss": 0.942, - "num_tokens": 299127521.0, - "step": 358 - }, - { - "epoch": 1.9510869565217392, - "grad_norm": 0.449431576288473, - "learning_rate": 3.617531141065956e-06, - "loss": 0.8601, - "num_tokens": 299982434.0, - "step": 359 - }, - { - "epoch": 1.9565217391304348, - "grad_norm": 0.4064111339005786, - "learning_rate": 3.593562345768549e-06, - "loss": 0.8646, - "num_tokens": 300818439.0, - "step": 360 - }, - { - "epoch": 1.9619565217391304, - "grad_norm": 0.4307163312053481, - "learning_rate": 3.5696592880251467e-06, - "loss": 0.9335, - "num_tokens": 301635342.0, - "step": 361 - }, - { - "epoch": 1.9673913043478262, - "grad_norm": 0.4073183602601777, - "learning_rate": 3.5458227920581154e-06, - "loss": 0.9036, - "num_tokens": 302440399.0, - "step": 362 - }, - { - "epoch": 1.9728260869565217, - "grad_norm": 0.40367545222463547, - "learning_rate": 3.5220536797946447e-06, - "loss": 0.8582, - "num_tokens": 303203873.0, - "step": 363 - }, - { - "epoch": 1.9782608695652173, - "grad_norm": 0.4445278734784835, - "learning_rate": 3.4983527708384023e-06, - "loss": 0.9542, - "num_tokens": 304034628.0, - "step": 364 - }, - { - "epoch": 1.983695652173913, - "grad_norm": 0.4219555341031118, - "learning_rate": 3.4747208824412827e-06, - "loss": 0.8915, - "num_tokens": 304839253.0, - "step": 365 - }, - { - "epoch": 1.9891304347826086, - "grad_norm": 0.4506145971755832, - "learning_rate": 3.451158829475222e-06, - "loss": 0.8492, - "num_tokens": 305555931.0, - "step": 366 - }, - { - "epoch": 1.9945652173913042, - "grad_norm": 0.447095773856711, - "learning_rate": 3.4276674244040976e-06, - "loss": 0.8216, - "num_tokens": 306343455.0, - "step": 367 - }, - { - "epoch": 2.0, - "grad_norm": 0.44739997510719476, - "learning_rate": 3.4042474772557143e-06, - "loss": 0.8574, - "num_tokens": 307150552.0, - "step": 368 - }, - { - "epoch": 2.0, - "eval_loss": 0.6972317099571228, - "eval_num_tokens": 307150552.0, - "eval_runtime": 115.9725, - "eval_samples_per_second": 45.106, - "eval_steps_per_second": 5.639, - "step": 368 - }, - { - "epoch": 2.005434782608696, - "grad_norm": 0.38636761878459847, - "learning_rate": 3.3808997955938754e-06, - "loss": 0.892, - "num_tokens": 307989807.0, - "step": 369 - }, - { - "epoch": 2.010869565217391, - "grad_norm": 0.3829451714194781, - "learning_rate": 3.3576251844905317e-06, - "loss": 0.8861, - "num_tokens": 308868273.0, - "step": 370 - }, - { - "epoch": 2.016304347826087, - "grad_norm": 0.43968135329318825, - "learning_rate": 3.3344244464980267e-06, - "loss": 0.8656, - "num_tokens": 309656308.0, - "step": 371 - }, - { - "epoch": 2.0217391304347827, - "grad_norm": 0.4093528483136871, - "learning_rate": 3.3112983816214184e-06, - "loss": 0.8591, - "num_tokens": 310461076.0, - "step": 372 - }, - { - "epoch": 2.027173913043478, - "grad_norm": 0.38976210096491537, - "learning_rate": 3.2882477872908965e-06, - "loss": 0.9011, - "num_tokens": 311276939.0, - "step": 373 - }, - { - "epoch": 2.032608695652174, - "grad_norm": 0.39368206902618713, - "learning_rate": 3.2652734583342815e-06, - "loss": 0.8588, - "num_tokens": 312165932.0, - "step": 374 - }, - { - "epoch": 2.0380434782608696, - "grad_norm": 0.3983831266526713, - "learning_rate": 3.242376186949623e-06, - "loss": 0.8093, - "num_tokens": 312978583.0, - "step": 375 - }, - { - "epoch": 2.0434782608695654, - "grad_norm": 0.4135612842166327, - "learning_rate": 3.219556762677881e-06, - "loss": 0.8531, - "num_tokens": 313816716.0, - "step": 376 - }, - { - "epoch": 2.0489130434782608, - "grad_norm": 0.39779756390210336, - "learning_rate": 3.1968159723756997e-06, - "loss": 0.8901, - "num_tokens": 314622313.0, - "step": 377 - }, - { - "epoch": 2.0543478260869565, - "grad_norm": 0.3919197553570745, - "learning_rate": 3.1741546001882773e-06, - "loss": 0.8571, - "num_tokens": 315395623.0, - "step": 378 - }, - { - "epoch": 2.0597826086956523, - "grad_norm": 0.3781577490653326, - "learning_rate": 3.151573427522324e-06, - "loss": 0.8994, - "num_tokens": 316215321.0, - "step": 379 - }, - { - "epoch": 2.0652173913043477, - "grad_norm": 0.43117494158316355, - "learning_rate": 3.1290732330191222e-06, - "loss": 0.7714, - "num_tokens": 317020574.0, - "step": 380 - }, - { - "epoch": 2.0706521739130435, - "grad_norm": 0.39659443790663584, - "learning_rate": 3.1066547925276725e-06, - "loss": 0.871, - "num_tokens": 317847775.0, - "step": 381 - }, - { - "epoch": 2.0760869565217392, - "grad_norm": 0.4255646668460152, - "learning_rate": 3.0843188790779455e-06, - "loss": 0.8323, - "num_tokens": 318656420.0, - "step": 382 - }, - { - "epoch": 2.0815217391304346, - "grad_norm": 0.3419689090065395, - "learning_rate": 3.0620662628542256e-06, - "loss": 1.0455, - "num_tokens": 319517679.0, - "step": 383 - }, - { - "epoch": 2.0869565217391304, - "grad_norm": 0.38761705361585463, - "learning_rate": 3.039897711168547e-06, - "loss": 0.7977, - "num_tokens": 320311242.0, - "step": 384 - }, - { - "epoch": 2.092391304347826, - "grad_norm": 0.4398300188788278, - "learning_rate": 3.017813988434245e-06, - "loss": 0.7794, - "num_tokens": 321075317.0, - "step": 385 - }, - { - "epoch": 2.097826086956522, - "grad_norm": 0.38030062397688985, - "learning_rate": 2.9958158561395933e-06, - "loss": 0.9574, - "num_tokens": 321944658.0, - "step": 386 - }, - { - "epoch": 2.1032608695652173, - "grad_norm": 0.37914375055949284, - "learning_rate": 2.9739040728215427e-06, - "loss": 0.9026, - "num_tokens": 322795388.0, - "step": 387 - }, - { - "epoch": 2.108695652173913, - "grad_norm": 0.3681406688591765, - "learning_rate": 2.9520793940395735e-06, - "loss": 0.7614, - "num_tokens": 323620956.0, - "step": 388 - }, - { - "epoch": 2.114130434782609, - "grad_norm": 0.40820419568639127, - "learning_rate": 2.9303425723496353e-06, - "loss": 0.8671, - "num_tokens": 324370926.0, - "step": 389 - }, - { - "epoch": 2.119565217391304, - "grad_norm": 0.40450291077944434, - "learning_rate": 2.9086943572782e-06, - "loss": 0.9099, - "num_tokens": 325196703.0, - "step": 390 - }, - { - "epoch": 2.125, - "grad_norm": 0.38129385362446383, - "learning_rate": 2.8871354952964183e-06, - "loss": 0.9411, - "num_tokens": 326000243.0, - "step": 391 - }, - { - "epoch": 2.130434782608696, - "grad_norm": 0.3799222952796977, - "learning_rate": 2.8656667297943757e-06, - "loss": 0.9011, - "num_tokens": 326819387.0, - "step": 392 - }, - { - "epoch": 2.135869565217391, - "grad_norm": 0.3927989353518067, - "learning_rate": 2.8442888010554658e-06, - "loss": 0.8171, - "num_tokens": 327589016.0, - "step": 393 - }, - { - "epoch": 2.141304347826087, - "grad_norm": 0.37907717159131554, - "learning_rate": 2.8230024462308547e-06, - "loss": 0.93, - "num_tokens": 328465302.0, - "step": 394 - }, - { - "epoch": 2.1467391304347827, - "grad_norm": 0.37822166658254774, - "learning_rate": 2.801808399314071e-06, - "loss": 0.9274, - "num_tokens": 329345660.0, - "step": 395 - }, - { - "epoch": 2.1521739130434785, - "grad_norm": 0.42108543525700537, - "learning_rate": 2.7807073911156934e-06, - "loss": 0.827, - "num_tokens": 330139715.0, - "step": 396 - }, - { - "epoch": 2.157608695652174, - "grad_norm": 0.4008797341414234, - "learning_rate": 2.7597001492381493e-06, - "loss": 0.7955, - "num_tokens": 330949482.0, - "step": 397 - }, - { - "epoch": 2.1630434782608696, - "grad_norm": 0.3777857356482234, - "learning_rate": 2.7387873980506286e-06, - "loss": 0.789, - "num_tokens": 331830357.0, - "step": 398 - }, - { - "epoch": 2.1684782608695654, - "grad_norm": 0.450815446987175, - "learning_rate": 2.7179698586641024e-06, - "loss": 0.7961, - "num_tokens": 332668804.0, - "step": 399 - }, - { - "epoch": 2.1739130434782608, - "grad_norm": 0.3655309894339068, - "learning_rate": 2.6972482489064615e-06, - "loss": 0.9005, - "num_tokens": 333523686.0, - "step": 400 - }, - { - "epoch": 2.1793478260869565, - "grad_norm": 0.35812024427404354, - "learning_rate": 2.6766232832977636e-06, - "loss": 0.9119, - "num_tokens": 334381666.0, - "step": 401 - }, - { - "epoch": 2.1847826086956523, - "grad_norm": 0.3659167904477761, - "learning_rate": 2.6560956730255937e-06, - "loss": 0.9706, - "num_tokens": 335307725.0, - "step": 402 - }, - { - "epoch": 2.1902173913043477, - "grad_norm": 0.3429215811966493, - "learning_rate": 2.6356661259205396e-06, - "loss": 0.908, - "num_tokens": 336161428.0, - "step": 403 - }, - { - "epoch": 2.1956521739130435, - "grad_norm": 0.3824382924417251, - "learning_rate": 2.615335346431789e-06, - "loss": 0.8392, - "num_tokens": 337029640.0, - "step": 404 - }, - { - "epoch": 2.2010869565217392, - "grad_norm": 0.40449862861918684, - "learning_rate": 2.5951040356028357e-06, - "loss": 0.8373, - "num_tokens": 337838364.0, - "step": 405 - }, - { - "epoch": 2.2065217391304346, - "grad_norm": 0.3481749039841372, - "learning_rate": 2.574972891047308e-06, - "loss": 0.9422, - "num_tokens": 338651556.0, - "step": 406 - }, - { - "epoch": 2.2119565217391304, - "grad_norm": 0.3924961725704576, - "learning_rate": 2.554942606924914e-06, - "loss": 0.8228, - "num_tokens": 339511406.0, - "step": 407 - }, - { - "epoch": 2.217391304347826, - "grad_norm": 0.34049826315688747, - "learning_rate": 2.535013873917501e-06, - "loss": 0.8667, - "num_tokens": 340443629.0, - "step": 408 - }, - { - "epoch": 2.2228260869565215, - "grad_norm": 0.4117155572499064, - "learning_rate": 2.515187379205245e-06, - "loss": 0.8342, - "num_tokens": 341279653.0, - "step": 409 - }, - { - "epoch": 2.2282608695652173, - "grad_norm": 0.4441595850045378, - "learning_rate": 2.495463806442953e-06, - "loss": 0.7691, - "num_tokens": 342195766.0, - "step": 410 - }, - { - "epoch": 2.233695652173913, - "grad_norm": 0.35169984350005207, - "learning_rate": 2.4758438357364913e-06, - "loss": 0.9449, - "num_tokens": 343057767.0, - "step": 411 - }, - { - "epoch": 2.239130434782609, - "grad_norm": 0.37934796213532046, - "learning_rate": 2.4563281436193304e-06, - "loss": 0.8667, - "num_tokens": 343872498.0, - "step": 412 - }, - { - "epoch": 2.244565217391304, - "grad_norm": 0.34470012483067713, - "learning_rate": 2.436917403029219e-06, - "loss": 0.9304, - "num_tokens": 344702347.0, - "step": 413 - }, - { - "epoch": 2.25, - "grad_norm": 0.3869353927249258, - "learning_rate": 2.4176122832849806e-06, - "loss": 0.9261, - "num_tokens": 345556219.0, - "step": 414 - }, - { - "epoch": 2.255434782608696, - "grad_norm": 0.376280379959303, - "learning_rate": 2.3984134500634344e-06, - "loss": 0.8187, - "num_tokens": 346341189.0, - "step": 415 - }, - { - "epoch": 2.260869565217391, - "grad_norm": 0.36254799864472526, - "learning_rate": 2.379321565376439e-06, - "loss": 0.9454, - "num_tokens": 347121836.0, - "step": 416 - }, - { - "epoch": 2.266304347826087, - "grad_norm": 0.37665052955616984, - "learning_rate": 2.3603372875480662e-06, - "loss": 0.8902, - "num_tokens": 347915123.0, - "step": 417 - }, - { - "epoch": 2.2717391304347827, - "grad_norm": 0.3619099925225754, - "learning_rate": 2.341461271191903e-06, - "loss": 0.871, - "num_tokens": 348720454.0, - "step": 418 - }, - { - "epoch": 2.2771739130434785, - "grad_norm": 0.38144845503617586, - "learning_rate": 2.3226941671884766e-06, - "loss": 0.8471, - "num_tokens": 349564775.0, - "step": 419 - }, - { - "epoch": 2.282608695652174, - "grad_norm": 0.37803485943173243, - "learning_rate": 2.3040366226628104e-06, - "loss": 0.874, - "num_tokens": 350336647.0, - "step": 420 - }, - { - "epoch": 2.2880434782608696, - "grad_norm": 0.36406159830794205, - "learning_rate": 2.2854892809621113e-06, - "loss": 0.7889, - "num_tokens": 351237635.0, - "step": 421 - }, - { - "epoch": 2.2934782608695654, - "grad_norm": 0.38102789495537004, - "learning_rate": 2.267052781633588e-06, - "loss": 0.7828, - "num_tokens": 352050882.0, - "step": 422 - }, - { - "epoch": 2.2989130434782608, - "grad_norm": 0.35887276733788503, - "learning_rate": 2.248727760402391e-06, - "loss": 0.8599, - "num_tokens": 352878475.0, - "step": 423 - }, - { - "epoch": 2.3043478260869565, - "grad_norm": 0.35052707875020117, - "learning_rate": 2.2305148491497013e-06, - "loss": 0.935, - "num_tokens": 353696090.0, - "step": 424 - }, - { - "epoch": 2.3097826086956523, - "grad_norm": 0.3796455050371005, - "learning_rate": 2.2124146758909344e-06, - "loss": 0.7871, - "num_tokens": 354551580.0, - "step": 425 - }, - { - "epoch": 2.3152173913043477, - "grad_norm": 0.35413248660813723, - "learning_rate": 2.1944278647540897e-06, - "loss": 0.9125, - "num_tokens": 355427660.0, - "step": 426 - }, - { - "epoch": 2.3206521739130435, - "grad_norm": 0.3550372496306149, - "learning_rate": 2.176555035958225e-06, - "loss": 0.9098, - "num_tokens": 356260823.0, - "step": 427 - }, - { - "epoch": 2.3260869565217392, - "grad_norm": 0.3579466405075418, - "learning_rate": 2.158796805792076e-06, - "loss": 0.9144, - "num_tokens": 357121483.0, - "step": 428 - }, - { - "epoch": 2.3315217391304346, - "grad_norm": 0.3529362149072696, - "learning_rate": 2.1411537865927996e-06, - "loss": 0.8598, - "num_tokens": 357940148.0, - "step": 429 - }, - { - "epoch": 2.3369565217391304, - "grad_norm": 0.3715063084651279, - "learning_rate": 2.1236265867248624e-06, - "loss": 0.7717, - "num_tokens": 358802710.0, - "step": 430 - }, - { - "epoch": 2.342391304347826, - "grad_norm": 0.3576601357221509, - "learning_rate": 2.106215810559064e-06, - "loss": 0.8561, - "num_tokens": 359630097.0, - "step": 431 - }, - { - "epoch": 2.3478260869565215, - "grad_norm": 0.3483196895578794, - "learning_rate": 2.0889220584516953e-06, - "loss": 0.9144, - "num_tokens": 360480329.0, - "step": 432 - }, - { - "epoch": 2.3532608695652173, - "grad_norm": 0.3447961481134239, - "learning_rate": 2.071745926723836e-06, - "loss": 0.8636, - "num_tokens": 361308682.0, - "step": 433 - }, - { - "epoch": 2.358695652173913, - "grad_norm": 0.3766304528077358, - "learning_rate": 2.054688007640796e-06, - "loss": 0.7479, - "num_tokens": 362169219.0, - "step": 434 - }, - { - "epoch": 2.364130434782609, - "grad_norm": 0.3746268395382482, - "learning_rate": 2.0377488893916915e-06, - "loss": 0.8258, - "num_tokens": 363005241.0, - "step": 435 - }, - { - "epoch": 2.369565217391304, - "grad_norm": 0.36014765575775626, - "learning_rate": 2.02092915606916e-06, - "loss": 0.8801, - "num_tokens": 363883118.0, - "step": 436 - }, - { - "epoch": 2.375, - "grad_norm": 0.33453942334522974, - "learning_rate": 2.004229387649225e-06, - "loss": 0.86, - "num_tokens": 364788569.0, - "step": 437 - }, - { - "epoch": 2.380434782608696, - "grad_norm": 0.38679043535672375, - "learning_rate": 1.9876501599712933e-06, - "loss": 0.7929, - "num_tokens": 365606670.0, - "step": 438 - }, - { - "epoch": 2.385869565217391, - "grad_norm": 0.3359349574746618, - "learning_rate": 1.9711920447183007e-06, - "loss": 0.8163, - "num_tokens": 366444635.0, - "step": 439 - }, - { - "epoch": 2.391304347826087, - "grad_norm": 0.3644533679055179, - "learning_rate": 1.9548556093969988e-06, - "loss": 0.7937, - "num_tokens": 367323309.0, - "step": 440 - }, - { - "epoch": 2.3967391304347827, - "grad_norm": 0.3771715348384541, - "learning_rate": 1.9386414173183867e-06, - "loss": 0.8485, - "num_tokens": 368064890.0, - "step": 441 - }, - { - "epoch": 2.4021739130434785, - "grad_norm": 0.3551550392055021, - "learning_rate": 1.9225500275782865e-06, - "loss": 0.83, - "num_tokens": 369002766.0, - "step": 442 - }, - { - "epoch": 2.407608695652174, - "grad_norm": 0.35673807977602134, - "learning_rate": 1.9065819950380634e-06, - "loss": 0.8095, - "num_tokens": 369842266.0, - "step": 443 - }, - { - "epoch": 2.4130434782608696, - "grad_norm": 0.3678005067941679, - "learning_rate": 1.8907378703054965e-06, - "loss": 0.916, - "num_tokens": 370684795.0, - "step": 444 - }, - { - "epoch": 2.4184782608695654, - "grad_norm": 0.3550346657051313, - "learning_rate": 1.8750181997157906e-06, - "loss": 0.9113, - "num_tokens": 371537705.0, - "step": 445 - }, - { - "epoch": 2.4239130434782608, - "grad_norm": 0.354949703749135, - "learning_rate": 1.8594235253127373e-06, - "loss": 0.8938, - "num_tokens": 372305511.0, - "step": 446 - }, - { - "epoch": 2.4293478260869565, - "grad_norm": 0.36864809186508585, - "learning_rate": 1.8439543848300234e-06, - "loss": 0.7865, - "num_tokens": 373111650.0, - "step": 447 - }, - { - "epoch": 2.4347826086956523, - "grad_norm": 0.37352388254746816, - "learning_rate": 1.8286113116726928e-06, - "loss": 0.7825, - "num_tokens": 373957500.0, - "step": 448 - }, - { - "epoch": 2.4402173913043477, - "grad_norm": 0.3695867321551594, - "learning_rate": 1.813394834898749e-06, - "loss": 0.8468, - "num_tokens": 374732469.0, - "step": 449 - }, - { - "epoch": 2.4456521739130435, - "grad_norm": 0.3455694762603206, - "learning_rate": 1.7983054792009146e-06, - "loss": 0.8205, - "num_tokens": 375590847.0, - "step": 450 - }, - { - "epoch": 2.4510869565217392, - "grad_norm": 0.34383273093647165, - "learning_rate": 1.7833437648885391e-06, - "loss": 0.9337, - "num_tokens": 376472609.0, - "step": 451 - }, - { - "epoch": 2.4565217391304346, - "grad_norm": 0.34396344325737654, - "learning_rate": 1.768510207869658e-06, - "loss": 0.8295, - "num_tokens": 377360835.0, - "step": 452 - }, - { - "epoch": 2.4619565217391304, - "grad_norm": 0.36772931369842965, - "learning_rate": 1.7538053196331988e-06, - "loss": 0.8552, - "num_tokens": 378129547.0, - "step": 453 - }, - { - "epoch": 2.467391304347826, - "grad_norm": 0.3721073295136887, - "learning_rate": 1.739229607231351e-06, - "loss": 0.8254, - "num_tokens": 378985755.0, - "step": 454 - }, - { - "epoch": 2.4728260869565215, - "grad_norm": 0.375300924188451, - "learning_rate": 1.724783573262077e-06, - "loss": 0.8427, - "num_tokens": 379827893.0, - "step": 455 - }, - { - "epoch": 2.4782608695652173, - "grad_norm": 0.37674482179096, - "learning_rate": 1.7104677158517838e-06, - "loss": 0.6842, - "num_tokens": 380654729.0, - "step": 456 - }, - { - "epoch": 2.483695652173913, - "grad_norm": 0.3476948995891136, - "learning_rate": 1.6962825286381456e-06, - "loss": 0.823, - "num_tokens": 381422237.0, - "step": 457 - }, - { - "epoch": 2.489130434782609, - "grad_norm": 0.3638801948734728, - "learning_rate": 1.682228500753083e-06, - "loss": 0.8774, - "num_tokens": 382221856.0, - "step": 458 - }, - { - "epoch": 2.494565217391304, - "grad_norm": 0.3358560532553637, - "learning_rate": 1.6683061168058957e-06, - "loss": 0.8696, - "num_tokens": 383048436.0, - "step": 459 - }, - { - "epoch": 2.5, - "grad_norm": 0.3280496068759726, - "learning_rate": 1.6545158568665525e-06, - "loss": 0.8815, - "num_tokens": 383884603.0, - "step": 460 - }, - { - "epoch": 2.505434782608696, - "grad_norm": 0.33002937132151383, - "learning_rate": 1.6408581964491405e-06, - "loss": 0.8951, - "num_tokens": 384785328.0, - "step": 461 - }, - { - "epoch": 2.5108695652173916, - "grad_norm": 0.3532135137719775, - "learning_rate": 1.6273336064954637e-06, - "loss": 0.8446, - "num_tokens": 385597787.0, - "step": 462 - }, - { - "epoch": 2.516304347826087, - "grad_norm": 0.3197220986983031, - "learning_rate": 1.6139425533588055e-06, - "loss": 0.9167, - "num_tokens": 386427545.0, - "step": 463 - }, - { - "epoch": 2.5217391304347827, - "grad_norm": 0.3413188251047705, - "learning_rate": 1.6006854987878517e-06, - "loss": 0.8567, - "num_tokens": 387313898.0, - "step": 464 - }, - { - "epoch": 2.5271739130434785, - "grad_norm": 0.3444895369070734, - "learning_rate": 1.5875628999107633e-06, - "loss": 0.8591, - "num_tokens": 388118530.0, - "step": 465 - }, - { - "epoch": 2.532608695652174, - "grad_norm": 0.3310567768494456, - "learning_rate": 1.5745752092194187e-06, - "loss": 0.8368, - "num_tokens": 388980173.0, - "step": 466 - }, - { - "epoch": 2.5380434782608696, - "grad_norm": 0.3640834859217282, - "learning_rate": 1.561722874553806e-06, - "loss": 0.7713, - "num_tokens": 389804181.0, - "step": 467 - }, - { - "epoch": 2.5434782608695654, - "grad_norm": 0.32818875673224424, - "learning_rate": 1.5490063390865845e-06, - "loss": 0.8075, - "num_tokens": 390738842.0, - "step": 468 - }, - { - "epoch": 2.5489130434782608, - "grad_norm": 0.3370297039341805, - "learning_rate": 1.536426041307801e-06, - "loss": 0.8709, - "num_tokens": 391585641.0, - "step": 469 - }, - { - "epoch": 2.5543478260869565, - "grad_norm": 0.3326492694705219, - "learning_rate": 1.5239824150097712e-06, - "loss": 0.8091, - "num_tokens": 392461130.0, - "step": 470 - }, - { - "epoch": 2.5597826086956523, - "grad_norm": 0.3246932898412604, - "learning_rate": 1.5116758892721214e-06, - "loss": 0.86, - "num_tokens": 393303819.0, - "step": 471 - }, - { - "epoch": 2.5652173913043477, - "grad_norm": 0.3358790775832897, - "learning_rate": 1.4995068884469941e-06, - "loss": 0.8047, - "num_tokens": 394139329.0, - "step": 472 - }, - { - "epoch": 2.5706521739130435, - "grad_norm": 0.34951877174632945, - "learning_rate": 1.4874758321444091e-06, - "loss": 0.8456, - "num_tokens": 394970536.0, - "step": 473 - }, - { - "epoch": 2.5760869565217392, - "grad_norm": 0.33912836478591646, - "learning_rate": 1.475583135217807e-06, - "loss": 0.865, - "num_tokens": 395802220.0, - "step": 474 - }, - { - "epoch": 2.5815217391304346, - "grad_norm": 0.33546628623087094, - "learning_rate": 1.4638292077497313e-06, - "loss": 0.8554, - "num_tokens": 396631158.0, - "step": 475 - }, - { - "epoch": 2.5869565217391304, - "grad_norm": 0.3521105782958677, - "learning_rate": 1.4522144550376968e-06, - "loss": 0.8423, - "num_tokens": 397478676.0, - "step": 476 - }, - { - "epoch": 2.592391304347826, - "grad_norm": 0.3509341949307193, - "learning_rate": 1.4407392775802109e-06, - "loss": 0.82, - "num_tokens": 398246760.0, - "step": 477 - }, - { - "epoch": 2.5978260869565215, - "grad_norm": 0.32911402990375643, - "learning_rate": 1.4294040710629617e-06, - "loss": 0.8769, - "num_tokens": 399099437.0, - "step": 478 - }, - { - "epoch": 2.6032608695652173, - "grad_norm": 0.3186571796247337, - "learning_rate": 1.418209226345179e-06, - "loss": 0.8793, - "num_tokens": 399969697.0, - "step": 479 - }, - { - "epoch": 2.608695652173913, - "grad_norm": 0.3279788778493384, - "learning_rate": 1.407155129446152e-06, - "loss": 0.9707, - "num_tokens": 400797805.0, - "step": 480 - }, - { - "epoch": 2.6141304347826084, - "grad_norm": 0.34256729343775694, - "learning_rate": 1.396242161531921e-06, - "loss": 0.8696, - "num_tokens": 401547630.0, - "step": 481 - }, - { - "epoch": 2.619565217391304, - "grad_norm": 0.3172284147790305, - "learning_rate": 1.385470698902134e-06, - "loss": 0.832, - "num_tokens": 402392182.0, - "step": 482 - }, - { - "epoch": 2.625, - "grad_norm": 0.47594779079448835, - "learning_rate": 1.3748411129770703e-06, - "loss": 0.8696, - "num_tokens": 403183866.0, - "step": 483 - }, - { - "epoch": 2.630434782608696, - "grad_norm": 0.3276956523974404, - "learning_rate": 1.3643537702848333e-06, - "loss": 0.894, - "num_tokens": 404038755.0, - "step": 484 - }, - { - "epoch": 2.6358695652173916, - "grad_norm": 0.34310596614188354, - "learning_rate": 1.3540090324487142e-06, - "loss": 0.8598, - "num_tokens": 404864717.0, - "step": 485 - }, - { - "epoch": 2.641304347826087, - "grad_norm": 0.3352550272338416, - "learning_rate": 1.343807256174718e-06, - "loss": 0.8273, - "num_tokens": 405697412.0, - "step": 486 - }, - { - "epoch": 2.6467391304347827, - "grad_norm": 0.35497871766230493, - "learning_rate": 1.333748793239269e-06, - "loss": 0.8314, - "num_tokens": 406541909.0, - "step": 487 - }, - { - "epoch": 2.6521739130434785, - "grad_norm": 0.3063939735113217, - "learning_rate": 1.323833990477076e-06, - "loss": 0.8449, - "num_tokens": 407434141.0, - "step": 488 - }, - { - "epoch": 2.657608695652174, - "grad_norm": 0.2918078346539291, - "learning_rate": 1.3140631897691767e-06, - "loss": 0.9203, - "num_tokens": 408213064.0, - "step": 489 - }, - { - "epoch": 2.6630434782608696, - "grad_norm": 0.3306410671065819, - "learning_rate": 1.3044367280311462e-06, - "loss": 0.8499, - "num_tokens": 409071350.0, - "step": 490 - }, - { - "epoch": 2.6684782608695654, - "grad_norm": 0.31425658255388933, - "learning_rate": 1.2949549372014806e-06, - "loss": 0.9014, - "num_tokens": 409903386.0, - "step": 491 - }, - { - "epoch": 2.6739130434782608, - "grad_norm": 0.31317506453276645, - "learning_rate": 1.2856181442301524e-06, - "loss": 0.8652, - "num_tokens": 410734160.0, - "step": 492 - }, - { - "epoch": 2.6793478260869565, - "grad_norm": 0.31381245027317006, - "learning_rate": 1.2764266710673335e-06, - "loss": 0.9083, - "num_tokens": 411551118.0, - "step": 493 - }, - { - "epoch": 2.6847826086956523, - "grad_norm": 0.3165302151758142, - "learning_rate": 1.267380834652296e-06, - "loss": 0.9255, - "num_tokens": 412396952.0, - "step": 494 - }, - { - "epoch": 2.6902173913043477, - "grad_norm": 0.32292463903234464, - "learning_rate": 1.2584809469024848e-06, - "loss": 0.8961, - "num_tokens": 413158720.0, - "step": 495 - }, - { - "epoch": 2.6956521739130435, - "grad_norm": 0.31354859521048606, - "learning_rate": 1.249727314702759e-06, - "loss": 0.9043, - "num_tokens": 414025866.0, - "step": 496 - }, - { - "epoch": 2.7010869565217392, - "grad_norm": 0.33837050467306035, - "learning_rate": 1.2411202398948116e-06, - "loss": 0.8551, - "num_tokens": 414826744.0, - "step": 497 - }, - { - "epoch": 2.7065217391304346, - "grad_norm": 0.31857152993791094, - "learning_rate": 1.2326600192667612e-06, - "loss": 0.8835, - "num_tokens": 415658618.0, - "step": 498 - }, - { - "epoch": 2.7119565217391304, - "grad_norm": 0.3030421658445399, - "learning_rate": 1.2243469445429192e-06, - "loss": 0.8971, - "num_tokens": 416525614.0, - "step": 499 - }, - { - "epoch": 2.717391304347826, - "grad_norm": 0.30567436323419056, - "learning_rate": 1.2161813023737283e-06, - "loss": 0.8611, - "num_tokens": 417373841.0, - "step": 500 - }, - { - "epoch": 2.7228260869565215, - "grad_norm": 0.3302663704352623, - "learning_rate": 1.2081633743258807e-06, - "loss": 0.8105, - "num_tokens": 418207244.0, - "step": 501 - }, - { - "epoch": 2.7282608695652173, - "grad_norm": 0.33433073358856535, - "learning_rate": 1.2002934368726062e-06, - "loss": 0.764, - "num_tokens": 419000665.0, - "step": 502 - }, - { - "epoch": 2.733695652173913, - "grad_norm": 0.3241604502595133, - "learning_rate": 1.1925717613841432e-06, - "loss": 0.9584, - "num_tokens": 419794625.0, - "step": 503 - }, - { - "epoch": 2.7391304347826084, - "grad_norm": 0.32461465405581225, - "learning_rate": 1.184998614118377e-06, - "loss": 0.8969, - "num_tokens": 420639455.0, - "step": 504 - }, - { - "epoch": 2.744565217391304, - "grad_norm": 0.30608737482567067, - "learning_rate": 1.1775742562116616e-06, - "loss": 0.8808, - "num_tokens": 421467203.0, - "step": 505 - }, - { - "epoch": 2.75, - "grad_norm": 0.33056114911881734, - "learning_rate": 1.1702989436698139e-06, - "loss": 0.896, - "num_tokens": 422328867.0, - "step": 506 - }, - { - "epoch": 2.755434782608696, - "grad_norm": 0.3255646424875819, - "learning_rate": 1.163172927359285e-06, - "loss": 0.8577, - "num_tokens": 423189748.0, - "step": 507 - }, - { - "epoch": 2.7608695652173916, - "grad_norm": 0.33584968403928006, - "learning_rate": 1.1561964529985143e-06, - "loss": 0.8156, - "num_tokens": 424005796.0, - "step": 508 - }, - { - "epoch": 2.766304347826087, - "grad_norm": 0.323421018923075, - "learning_rate": 1.1493697611494512e-06, - "loss": 0.8255, - "num_tokens": 424799841.0, - "step": 509 - }, - { - "epoch": 2.7717391304347827, - "grad_norm": 0.3295272495710808, - "learning_rate": 1.142693087209264e-06, - "loss": 0.9751, - "num_tokens": 425631055.0, - "step": 510 - }, - { - "epoch": 2.7771739130434785, - "grad_norm": 0.3279812282350535, - "learning_rate": 1.13616666140222e-06, - "loss": 0.8055, - "num_tokens": 426533170.0, - "step": 511 - }, - { - "epoch": 2.782608695652174, - "grad_norm": 0.29896410886279173, - "learning_rate": 1.1297907087717499e-06, - "loss": 0.8991, - "num_tokens": 427315655.0, - "step": 512 - }, - { - "epoch": 2.7880434782608696, - "grad_norm": 0.3163307617484885, - "learning_rate": 1.1235654491726853e-06, - "loss": 0.788, - "num_tokens": 428164717.0, - "step": 513 - }, - { - "epoch": 2.7934782608695654, - "grad_norm": 0.2931770479467324, - "learning_rate": 1.11749109726368e-06, - "loss": 0.8384, - "num_tokens": 429023017.0, - "step": 514 - }, - { - "epoch": 2.7989130434782608, - "grad_norm": 0.31236832418646115, - "learning_rate": 1.1115678624998057e-06, - "loss": 0.9362, - "num_tokens": 429866574.0, - "step": 515 - }, - { - "epoch": 2.8043478260869565, - "grad_norm": 0.3242237659649276, - "learning_rate": 1.1057959491253322e-06, - "loss": 0.7995, - "num_tokens": 430676238.0, - "step": 516 - }, - { - "epoch": 2.8097826086956523, - "grad_norm": 0.3036166991407892, - "learning_rate": 1.1001755561666812e-06, - "loss": 0.8371, - "num_tokens": 431514288.0, - "step": 517 - }, - { - "epoch": 2.8152173913043477, - "grad_norm": 0.32280595452436234, - "learning_rate": 1.0947068774255675e-06, - "loss": 0.807, - "num_tokens": 432392393.0, - "step": 518 - }, - { - "epoch": 2.8206521739130435, - "grad_norm": 0.32712965260938454, - "learning_rate": 1.0893901014723154e-06, - "loss": 0.8215, - "num_tokens": 433253184.0, - "step": 519 - }, - { - "epoch": 2.8260869565217392, - "grad_norm": 0.3227790299421114, - "learning_rate": 1.0842254116393524e-06, - "loss": 0.821, - "num_tokens": 434053516.0, - "step": 520 - }, - { - "epoch": 2.8315217391304346, - "grad_norm": 0.32611632405660934, - "learning_rate": 1.0792129860148939e-06, - "loss": 0.8623, - "num_tokens": 434790448.0, - "step": 521 - }, - { - "epoch": 2.8369565217391304, - "grad_norm": 0.3555686123895394, - "learning_rate": 1.074352997436797e-06, - "loss": 0.7353, - "num_tokens": 435603778.0, - "step": 522 - }, - { - "epoch": 2.842391304347826, - "grad_norm": 0.36002880926614983, - "learning_rate": 1.0696456134866027e-06, - "loss": 0.8001, - "num_tokens": 436440204.0, - "step": 523 - }, - { - "epoch": 2.8478260869565215, - "grad_norm": 0.3080276331188948, - "learning_rate": 1.06509099648376e-06, - "loss": 0.8874, - "num_tokens": 437277908.0, - "step": 524 - }, - { - "epoch": 2.8532608695652173, - "grad_norm": 0.3038579198576587, - "learning_rate": 1.0606893034800243e-06, - "loss": 0.8889, - "num_tokens": 438200019.0, - "step": 525 - }, - { - "epoch": 2.858695652173913, - "grad_norm": 0.3188499078923347, - "learning_rate": 1.0564406862540442e-06, - "loss": 0.8234, - "num_tokens": 439031032.0, - "step": 526 - }, - { - "epoch": 2.8641304347826084, - "grad_norm": 0.3163232661276132, - "learning_rate": 1.0523452913061287e-06, - "loss": 0.9376, - "num_tokens": 439908258.0, - "step": 527 - }, - { - "epoch": 2.869565217391304, - "grad_norm": 0.3205748856172401, - "learning_rate": 1.0484032598531933e-06, - "loss": 0.8969, - "num_tokens": 440770759.0, - "step": 528 - }, - { - "epoch": 2.875, - "grad_norm": 0.3336093178600353, - "learning_rate": 1.044614727823893e-06, - "loss": 0.8814, - "num_tokens": 441610467.0, - "step": 529 - }, - { - "epoch": 2.880434782608696, - "grad_norm": 0.339579008072357, - "learning_rate": 1.0409798258539342e-06, - "loss": 0.7824, - "num_tokens": 442442060.0, - "step": 530 - }, - { - "epoch": 2.8858695652173916, - "grad_norm": 0.30343658556813535, - "learning_rate": 1.0374986792815698e-06, - "loss": 0.8269, - "num_tokens": 443296175.0, - "step": 531 - }, - { - "epoch": 2.891304347826087, - "grad_norm": 0.3402667170917275, - "learning_rate": 1.0341714081432765e-06, - "loss": 0.7638, - "num_tokens": 444156784.0, - "step": 532 - }, - { - "epoch": 2.8967391304347827, - "grad_norm": 0.31169236296214, - "learning_rate": 1.0309981271696186e-06, - "loss": 0.8148, - "num_tokens": 445007467.0, - "step": 533 - }, - { - "epoch": 2.9021739130434785, - "grad_norm": 0.30949456956927557, - "learning_rate": 1.0279789457812883e-06, - "loss": 0.8734, - "num_tokens": 445915302.0, - "step": 534 - }, - { - "epoch": 2.907608695652174, - "grad_norm": 0.32662740565262155, - "learning_rate": 1.0251139680853362e-06, - "loss": 0.9046, - "num_tokens": 446684613.0, - "step": 535 - }, - { - "epoch": 2.9130434782608696, - "grad_norm": 0.3039883271970662, - "learning_rate": 1.0224032928715779e-06, - "loss": 0.8139, - "num_tokens": 447529851.0, - "step": 536 - }, - { - "epoch": 2.9184782608695654, - "grad_norm": 0.29995513771563415, - "learning_rate": 1.0198470136091907e-06, - "loss": 0.906, - "num_tokens": 448361652.0, - "step": 537 - }, - { - "epoch": 2.9239130434782608, - "grad_norm": 0.32922775379683195, - "learning_rate": 1.0174452184434888e-06, - "loss": 0.8472, - "num_tokens": 449206270.0, - "step": 538 - }, - { - "epoch": 2.9293478260869565, - "grad_norm": 0.31634541908199004, - "learning_rate": 1.015197990192884e-06, - "loss": 0.7621, - "num_tokens": 450074620.0, - "step": 539 - }, - { - "epoch": 2.9347826086956523, - "grad_norm": 0.2973593789509411, - "learning_rate": 1.0131054063460314e-06, - "loss": 0.9702, - "num_tokens": 450858066.0, - "step": 540 - }, - { - "epoch": 2.9402173913043477, - "grad_norm": 0.3108474113687747, - "learning_rate": 1.0111675390591551e-06, - "loss": 0.9196, - "num_tokens": 451684385.0, - "step": 541 - }, - { - "epoch": 2.9456521739130435, - "grad_norm": 0.30825641786035934, - "learning_rate": 1.0093844551535627e-06, - "loss": 0.8517, - "num_tokens": 452500740.0, - "step": 542 - }, - { - "epoch": 2.9510869565217392, - "grad_norm": 0.31975962016702786, - "learning_rate": 1.0077562161133376e-06, - "loss": 0.8561, - "num_tokens": 453311676.0, - "step": 543 - }, - { - "epoch": 2.9565217391304346, - "grad_norm": 0.3171145276717419, - "learning_rate": 1.006282878083224e-06, - "loss": 0.8614, - "num_tokens": 454161473.0, - "step": 544 - }, - { - "epoch": 2.9619565217391304, - "grad_norm": 0.3330844602709494, - "learning_rate": 1.0049644918666862e-06, - "loss": 0.8472, - "num_tokens": 454913322.0, - "step": 545 - }, - { - "epoch": 2.967391304347826, - "grad_norm": 0.3218989363030988, - "learning_rate": 1.003801102924159e-06, - "loss": 0.9203, - "num_tokens": 455798549.0, - "step": 546 - }, - { - "epoch": 2.9728260869565215, - "grad_norm": 0.329031606721965, - "learning_rate": 1.0027927513714805e-06, - "loss": 0.8401, - "num_tokens": 456581900.0, - "step": 547 - }, - { - "epoch": 2.9782608695652173, - "grad_norm": 0.32291601500960876, - "learning_rate": 1.0019394719785073e-06, - "loss": 0.8265, - "num_tokens": 457468475.0, - "step": 548 - }, - { - "epoch": 2.983695652173913, - "grad_norm": 0.35665352121026894, - "learning_rate": 1.0012412941679172e-06, - "loss": 0.8668, - "num_tokens": 458212148.0, - "step": 549 - }, - { - "epoch": 2.9891304347826084, - "grad_norm": 0.31822874209028135, - "learning_rate": 1.0006982420141937e-06, - "loss": 0.8655, - "num_tokens": 459035504.0, - "step": 550 - }, - { - "epoch": 2.994565217391304, - "grad_norm": 0.32078359060378214, - "learning_rate": 1.0003103342427952e-06, - "loss": 0.8234, - "num_tokens": 459883895.0, - "step": 551 - }, - { - "epoch": 3.0, - "grad_norm": 0.31075975332177497, - "learning_rate": 1.0000775842295116e-06, - "loss": 0.8399, - "num_tokens": 460733269.0, - "step": 552 - }, - { - "epoch": 3.0, - "eval_loss": 0.665989875793457, - "eval_num_tokens": 460733269.0, - "eval_runtime": 117.721, - "eval_samples_per_second": 44.436, - "eval_steps_per_second": 5.556, - "step": 552 - }, - { - "epoch": 3.0, - "step": 552, - "total_flos": 967966479089664.0, - "train_loss": 1.0463442883413772, - "train_runtime": 10114.1342, - "train_samples_per_second": 13.962, - "train_steps_per_second": 0.055 + "epoch": 2.0, + "step": 124, + "total_flos": 94118804062208.0, + "train_loss": 1.2374288660864676, + "train_runtime": 1146.56, + "train_samples_per_second": 6.843, + "train_steps_per_second": 0.108 } ], "logging_steps": 1, - "max_steps": 552, + "max_steps": 124, "num_input_tokens_seen": 0, - "num_train_epochs": 3, + "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { @@ -4479,7 +1046,7 @@ "attributes": {} } }, - "total_flos": 967966479089664.0, + "total_flos": 94118804062208.0, "train_batch_size": 8, "trial_name": null, "trial_params": null