diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -3,2670 +3,4467 @@ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, - "eval_steps": 51, - "global_step": 324, + "eval_steps": 500, + "global_step": 552, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { - "epoch": 0.009324009324009324, - "grad_norm": 15.097446706645455, + "epoch": 0.005434782608695652, + "grad_norm": 30.931470023815386, "learning_rate": 0.0, - "loss": 1.8292, - "num_tokens": 224382.0, + "loss": 2.463, + "num_tokens": 934637.0, "step": 1 }, { - "epoch": 0.018648018648018648, - "grad_norm": 15.427115110106058, - "learning_rate": 1.0000000000000002e-06, - "loss": 1.8351, - "num_tokens": 438803.0, + "epoch": 0.010869565217391304, + "grad_norm": 31.489623321141185, + "learning_rate": 5.882352941176471e-07, + "loss": 2.474, + "num_tokens": 1760595.0, "step": 2 }, { - "epoch": 0.027972027972027972, - "grad_norm": 14.875101522845485, - "learning_rate": 2.0000000000000003e-06, - "loss": 1.8224, - "num_tokens": 657344.0, + "epoch": 0.016304347826086956, + "grad_norm": 31.108447609992588, + "learning_rate": 1.1764705882352942e-06, + "loss": 2.4621, + "num_tokens": 2596146.0, "step": 3 }, { - "epoch": 0.037296037296037296, - "grad_norm": 13.4290730870021, - "learning_rate": 3e-06, - "loss": 1.7588, - "num_tokens": 881653.0, + "epoch": 0.021739130434782608, + "grad_norm": 30.000375004622274, + "learning_rate": 1.7647058823529414e-06, + "loss": 2.4652, + "num_tokens": 3464061.0, "step": 4 }, { - "epoch": 0.046620046620046623, - "grad_norm": 10.499006101838235, - "learning_rate": 4.000000000000001e-06, - "loss": 1.6163, - "num_tokens": 1113061.0, + "epoch": 0.02717391304347826, + "grad_norm": 26.594555017169228, + "learning_rate": 2.3529411764705885e-06, + "loss": 2.4123, + "num_tokens": 4302551.0, "step": 5 }, { - "epoch": 0.055944055944055944, - "grad_norm": 8.049414704255772, - "learning_rate": 5e-06, - "loss": 1.4542, - "num_tokens": 1331772.0, + "epoch": 0.03260869565217391, + "grad_norm": 18.864816379816464, + "learning_rate": 2.9411764705882355e-06, + "loss": 2.3219, + "num_tokens": 5154623.0, "step": 6 }, { - "epoch": 0.06526806526806526, - "grad_norm": 7.722374086770013, - "learning_rate": 6e-06, - "loss": 1.3668, - "num_tokens": 1562018.0, + "epoch": 0.03804347826086957, + "grad_norm": 10.446852684747173, + "learning_rate": 3.529411764705883e-06, + "loss": 2.2219, + "num_tokens": 5954398.0, "step": 7 }, { - "epoch": 0.07459207459207459, - "grad_norm": 5.947276769620558, - "learning_rate": 7e-06, - "loss": 1.1882, - "num_tokens": 1788639.0, + "epoch": 0.043478260869565216, + "grad_norm": 9.024830472433766, + "learning_rate": 4.11764705882353e-06, + "loss": 2.1842, + "num_tokens": 6758441.0, "step": 8 }, { - "epoch": 0.08391608391608392, - "grad_norm": 5.113219943739578, - "learning_rate": 8.000000000000001e-06, - "loss": 1.1126, - "num_tokens": 2014673.0, + "epoch": 0.04891304347826087, + "grad_norm": 4.294804553874004, + "learning_rate": 4.705882352941177e-06, + "loss": 2.0795, + "num_tokens": 7651800.0, "step": 9 }, { - "epoch": 0.09324009324009325, - "grad_norm": 14.715304846179434, - "learning_rate": 9e-06, - "loss": 1.0331, - "num_tokens": 2250746.0, + "epoch": 0.05434782608695652, + "grad_norm": 3.831516223584368, + "learning_rate": 5.294117647058824e-06, + "loss": 2.0564, + "num_tokens": 8431972.0, "step": 10 }, { - "epoch": 0.10256410256410256, - "grad_norm": 6.82907493160171, - "learning_rate": 1e-05, - "loss": 0.9996, - "num_tokens": 2478394.0, + "epoch": 0.059782608695652176, + "grad_norm": 3.1066148849044355, + "learning_rate": 5.882352941176471e-06, + "loss": 2.005, + "num_tokens": 9325646.0, "step": 11 }, { - "epoch": 0.11188811188811189, - "grad_norm": 3.6488282737279203, - "learning_rate": 9.999774773574383e-06, - "loss": 0.9297, - "num_tokens": 2713916.0, + "epoch": 0.06521739130434782, + "grad_norm": 1.9811391246411751, + "learning_rate": 6.470588235294119e-06, + "loss": 1.9594, + "num_tokens": 10123238.0, "step": 12 }, { - "epoch": 0.12121212121212122, - "grad_norm": 2.8908805137859828, - "learning_rate": 9.999099116842838e-06, - "loss": 0.8859, - "num_tokens": 2943638.0, + "epoch": 0.07065217391304347, + "grad_norm": 1.815307583357642, + "learning_rate": 7.058823529411766e-06, + "loss": 1.933, + "num_tokens": 10986330.0, "step": 13 }, { - "epoch": 0.13053613053613053, - "grad_norm": 2.593442104323103, - "learning_rate": 9.99797309743903e-06, - "loss": 0.898, - "num_tokens": 3158482.0, + "epoch": 0.07608695652173914, + "grad_norm": 1.5197953846158025, + "learning_rate": 7.647058823529411e-06, + "loss": 1.932, + "num_tokens": 11858835.0, "step": 14 }, { - "epoch": 0.13986013986013987, - "grad_norm": 2.374961152220559, - "learning_rate": 9.99639682807822e-06, - "loss": 0.8464, - "num_tokens": 3398166.0, + "epoch": 0.08152173913043478, + "grad_norm": 1.2531688120792746, + "learning_rate": 8.23529411764706e-06, + "loss": 1.8809, + "num_tokens": 12632696.0, "step": 15 }, { - "epoch": 0.14918414918414918, - "grad_norm": 1.8725167978976107, - "learning_rate": 9.994370466545966e-06, - "loss": 0.829, - "num_tokens": 3631244.0, + "epoch": 0.08695652173913043, + "grad_norm": 1.0095986761399403, + "learning_rate": 8.823529411764707e-06, + "loss": 1.8721, + "num_tokens": 13473339.0, "step": 16 }, { - "epoch": 0.1585081585081585, - "grad_norm": 1.3376641050560698, - "learning_rate": 9.99189421568234e-06, - "loss": 0.7943, - "num_tokens": 3851898.0, + "epoch": 0.09239130434782608, + "grad_norm": 0.9721665109368234, + "learning_rate": 9.411764705882354e-06, + "loss": 1.808, + "num_tokens": 14333010.0, "step": 17 }, { - "epoch": 0.16783216783216784, - "grad_norm": 1.4995925443300326, - "learning_rate": 9.988968323361627e-06, - "loss": 0.7756, - "num_tokens": 4079798.0, + "epoch": 0.09782608695652174, + "grad_norm": 0.8709709682954093, + "learning_rate": 1e-05, + "loss": 1.7815, + "num_tokens": 15173517.0, "step": 18 }, { - "epoch": 0.17715617715617715, - "grad_norm": 1.275240322623827, - "learning_rate": 9.985593082467498e-06, - "loss": 0.7681, - "num_tokens": 4308418.0, + "epoch": 0.10326086956521739, + "grad_norm": 0.6803540631317997, + "learning_rate": 9.99992241577049e-06, + "loss": 1.758, + "num_tokens": 15957365.0, "step": 19 }, { - "epoch": 0.1864801864801865, - "grad_norm": 1.049158733271031, - "learning_rate": 9.981768830863707e-06, - "loss": 0.7208, - "num_tokens": 4528712.0, + "epoch": 0.10869565217391304, + "grad_norm": 0.6679029150502125, + "learning_rate": 9.999689665757205e-06, + "loss": 1.7524, + "num_tokens": 16745072.0, "step": 20 }, { - "epoch": 0.1958041958041958, - "grad_norm": 1.0263177055889612, - "learning_rate": 9.977495951360264e-06, - "loss": 0.7459, - "num_tokens": 4756886.0, + "epoch": 0.11413043478260869, + "grad_norm": 0.6618774203762685, + "learning_rate": 9.999301757985807e-06, + "loss": 1.7412, + "num_tokens": 17580494.0, "step": 21 }, { - "epoch": 0.20512820512820512, - "grad_norm": 1.0822926194764306, - "learning_rate": 9.97277487167511e-06, - "loss": 0.7052, - "num_tokens": 4984115.0, + "epoch": 0.11956521739130435, + "grad_norm": 0.5913838446441821, + "learning_rate": 9.998758705832084e-06, + "loss": 1.6931, + "num_tokens": 18417515.0, "step": 22 }, { - "epoch": 0.21445221445221446, - "grad_norm": 1.0092686382544667, - "learning_rate": 9.967606064391318e-06, - "loss": 0.7186, - "num_tokens": 5204818.0, + "epoch": 0.125, + "grad_norm": 0.5660660798912789, + "learning_rate": 9.998060528021493e-06, + "loss": 1.6828, + "num_tokens": 19260295.0, "step": 23 }, { - "epoch": 0.22377622377622378, - "grad_norm": 0.7782398807672024, - "learning_rate": 9.96199004690977e-06, - "loss": 0.6978, - "num_tokens": 5431406.0, + "epoch": 0.13043478260869565, + "grad_norm": 0.5724367315512363, + "learning_rate": 9.99720724862852e-06, + "loss": 1.6306, + "num_tokens": 20067769.0, "step": 24 }, { - "epoch": 0.2331002331002331, - "grad_norm": 0.9300259353100147, - "learning_rate": 9.955927381397374e-06, - "loss": 0.6841, - "num_tokens": 5661807.0, + "epoch": 0.1358695652173913, + "grad_norm": 0.4987897240832889, + "learning_rate": 9.996198897075842e-06, + "loss": 1.6535, + "num_tokens": 20923659.0, "step": 25 }, { - "epoch": 0.24242424242424243, - "grad_norm": 0.8985698420847705, - "learning_rate": 9.949418674730787e-06, - "loss": 0.6456, - "num_tokens": 5883328.0, + "epoch": 0.14130434782608695, + "grad_norm": 0.569279701787871, + "learning_rate": 9.995035508133316e-06, + "loss": 1.5989, + "num_tokens": 21753912.0, "step": 26 }, { - "epoch": 0.2517482517482518, - "grad_norm": 0.7655861536606542, - "learning_rate": 9.942464578435674e-06, - "loss": 0.6666, - "num_tokens": 6116065.0, + "epoch": 0.14673913043478262, + "grad_norm": 0.5527506476811272, + "learning_rate": 9.993717121916778e-06, + "loss": 1.6212, + "num_tokens": 22671460.0, "step": 27 }, { - "epoch": 0.26107226107226106, - "grad_norm": 0.8426013594218673, - "learning_rate": 9.935065788621479e-06, - "loss": 0.653, - "num_tokens": 6337735.0, + "epoch": 0.15217391304347827, + "grad_norm": 0.4626809535443287, + "learning_rate": 9.992243783886663e-06, + "loss": 1.5866, + "num_tokens": 23397618.0, "step": 28 }, { - "epoch": 0.2703962703962704, - "grad_norm": 0.7979514227426084, - "learning_rate": 9.92722304591175e-06, - "loss": 0.649, - "num_tokens": 6559217.0, + "epoch": 0.15760869565217392, + "grad_norm": 0.5294266278853005, + "learning_rate": 9.990615544846439e-06, + "loss": 1.5405, + "num_tokens": 24241586.0, "step": 29 }, { - "epoch": 0.27972027972027974, - "grad_norm": 0.7580072355627283, - "learning_rate": 9.918937135370002e-06, - "loss": 0.6319, - "num_tokens": 6789052.0, + "epoch": 0.16304347826086957, + "grad_norm": 0.45086474263260157, + "learning_rate": 9.988832460940846e-06, + "loss": 1.5416, + "num_tokens": 25109486.0, "step": 30 }, { - "epoch": 0.289044289044289, - "grad_norm": 0.8286474240201317, - "learning_rate": 9.91020888642113e-06, - "loss": 0.6591, - "num_tokens": 7026823.0, + "epoch": 0.16847826086956522, + "grad_norm": 0.5287629110395582, + "learning_rate": 9.986894593653969e-06, + "loss": 1.4971, + "num_tokens": 25837433.0, "step": 31 }, { - "epoch": 0.29836829836829837, - "grad_norm": 0.7702640758775066, - "learning_rate": 9.901039172768384e-06, - "loss": 0.6296, - "num_tokens": 7254546.0, + "epoch": 0.17391304347826086, + "grad_norm": 0.5444997747567997, + "learning_rate": 9.984802009807117e-06, + "loss": 1.5018, + "num_tokens": 26717387.0, "step": 32 }, { - "epoch": 0.3076923076923077, - "grad_norm": 0.7489368196898364, - "learning_rate": 9.89142891230591e-06, - "loss": 0.5884, - "num_tokens": 7490975.0, + "epoch": 0.1793478260869565, + "grad_norm": 0.421750120715681, + "learning_rate": 9.982554781556512e-06, + "loss": 1.4965, + "num_tokens": 27566062.0, "step": 33 }, { - "epoch": 0.317016317016317, - "grad_norm": 0.8438034724677627, - "learning_rate": 9.88137906702687e-06, - "loss": 0.5961, - "num_tokens": 7716958.0, + "epoch": 0.18478260869565216, + "grad_norm": 0.4443148972342859, + "learning_rate": 9.98015298639081e-06, + "loss": 1.4654, + "num_tokens": 28399076.0, "step": 34 }, { - "epoch": 0.32634032634032634, - "grad_norm": 0.8465427563269303, - "learning_rate": 9.870890642927145e-06, - "loss": 0.5883, - "num_tokens": 7945144.0, + "epoch": 0.19021739130434784, + "grad_norm": 0.45877290987079933, + "learning_rate": 9.977596707128424e-06, + "loss": 1.4661, + "num_tokens": 29213011.0, "step": 35 }, { - "epoch": 0.3356643356643357, - "grad_norm": 0.6949632315845249, - "learning_rate": 9.859964689904631e-06, - "loss": 0.566, - "num_tokens": 8163469.0, + "epoch": 0.1956521739130435, + "grad_norm": 0.41374557301191983, + "learning_rate": 9.974886031914665e-06, + "loss": 1.4237, + "num_tokens": 30060053.0, "step": 36 }, { - "epoch": 0.34498834498834496, - "grad_norm": 0.7572473510759273, - "learning_rate": 9.848602301654151e-06, - "loss": 0.5746, - "num_tokens": 8388850.0, + "epoch": 0.20108695652173914, + "grad_norm": 0.39953873131094225, + "learning_rate": 9.972021054218712e-06, + "loss": 1.4068, + "num_tokens": 30907525.0, "step": 37 }, { - "epoch": 0.3543123543123543, - "grad_norm": 0.8258450110130665, - "learning_rate": 9.836804615557965e-06, - "loss": 0.5932, - "num_tokens": 8606930.0, + "epoch": 0.20652173913043478, + "grad_norm": 0.35550524219246343, + "learning_rate": 9.969001872830383e-06, + "loss": 1.4106, + "num_tokens": 31678105.0, "step": 38 }, { - "epoch": 0.36363636363636365, - "grad_norm": 0.736881354400413, - "learning_rate": 9.82457281257193e-06, - "loss": 0.5718, - "num_tokens": 8833247.0, + "epoch": 0.21195652173913043, + "grad_norm": 0.39325204679064013, + "learning_rate": 9.965828591856725e-06, + "loss": 1.3806, + "num_tokens": 32502810.0, "step": 39 }, { - "epoch": 0.372960372960373, - "grad_norm": 0.6874213684175794, - "learning_rate": 9.811908117107269e-06, - "loss": 0.5685, - "num_tokens": 9065155.0, + "epoch": 0.21739130434782608, + "grad_norm": 0.3292339428008424, + "learning_rate": 9.962501320718432e-06, + "loss": 1.4045, + "num_tokens": 33366912.0, "step": 40 }, { - "epoch": 0.3822843822843823, - "grad_norm": 0.6917539701436399, - "learning_rate": 9.79881179690803e-06, - "loss": 0.5615, - "num_tokens": 9293680.0, + "epoch": 0.22282608695652173, + "grad_norm": 0.3056101339882414, + "learning_rate": 9.959020174146066e-06, + "loss": 1.3811, + "num_tokens": 34235993.0, "step": 41 }, { - "epoch": 0.3916083916083916, - "grad_norm": 0.7525097359649862, - "learning_rate": 9.78528516292416e-06, - "loss": 0.5655, - "num_tokens": 9511799.0, + "epoch": 0.22826086956521738, + "grad_norm": 0.3913174080963275, + "learning_rate": 9.955385272176108e-06, + "loss": 1.4036, + "num_tokens": 35033428.0, "step": 42 }, { - "epoch": 0.40093240093240096, - "grad_norm": 0.7242611329025388, - "learning_rate": 9.771329569180288e-06, - "loss": 0.5473, - "num_tokens": 9737324.0, + "epoch": 0.23369565217391305, + "grad_norm": 0.31162982854342974, + "learning_rate": 9.951596740146809e-06, + "loss": 1.4087, + "num_tokens": 35858670.0, "step": 43 }, { - "epoch": 0.41025641025641024, - "grad_norm": 0.6542100040132239, - "learning_rate": 9.756946412640193e-06, - "loss": 0.5499, - "num_tokens": 9981387.0, + "epoch": 0.2391304347826087, + "grad_norm": 0.3449184872880199, + "learning_rate": 9.947654708693872e-06, + "loss": 1.3565, + "num_tokens": 36640735.0, "step": 44 }, { - "epoch": 0.4195804195804196, - "grad_norm": 0.6916418171976277, - "learning_rate": 9.742137133066959e-06, - "loss": 0.5377, - "num_tokens": 10208518.0, + "epoch": 0.24456521739130435, + "grad_norm": 0.35069789860103123, + "learning_rate": 9.943559313745957e-06, + "loss": 1.3857, + "num_tokens": 37442617.0, "step": 45 }, { - "epoch": 0.4289044289044289, - "grad_norm": 0.798650044046329, - "learning_rate": 9.726903212878853e-06, - "loss": 0.5762, - "num_tokens": 10424661.0, + "epoch": 0.25, + "grad_norm": 0.3357337683263222, + "learning_rate": 9.939310696519977e-06, + "loss": 1.3612, + "num_tokens": 38262668.0, "step": 46 }, { - "epoch": 0.4382284382284382, - "grad_norm": 0.7072203438915419, - "learning_rate": 9.711246177000938e-06, - "loss": 0.5368, - "num_tokens": 10657919.0, + "epoch": 0.2554347826086957, + "grad_norm": 0.3269706093287553, + "learning_rate": 9.93490900351624e-06, + "loss": 1.4312, + "num_tokens": 39043577.0, "step": 47 }, { - "epoch": 0.44755244755244755, - "grad_norm": 0.6637477276125824, - "learning_rate": 9.695167592712426e-06, - "loss": 0.5341, - "num_tokens": 10885634.0, + "epoch": 0.2608695652173913, + "grad_norm": 0.31667080944485854, + "learning_rate": 9.930354386513399e-06, + "loss": 1.3592, + "num_tokens": 39916985.0, "step": 48 }, { - "epoch": 0.4568764568764569, - "grad_norm": 0.7461742922095873, - "learning_rate": 9.678669069489793e-06, - "loss": 0.5477, - "num_tokens": 11121932.0, + "epoch": 0.266304347826087, + "grad_norm": 0.32866308366553426, + "learning_rate": 9.925647002563205e-06, + "loss": 1.3687, + "num_tokens": 40728183.0, "step": 49 }, { - "epoch": 0.4662004662004662, - "grad_norm": 0.7083145131153268, - "learning_rate": 9.661752258845667e-06, - "loss": 0.5411, - "num_tokens": 11346099.0, + "epoch": 0.2717391304347826, + "grad_norm": 0.3417326900026665, + "learning_rate": 9.920787013985106e-06, + "loss": 1.3411, + "num_tokens": 41591690.0, "step": 50 }, { - "epoch": 0.4755244755244755, - "grad_norm": 0.6804629701224661, - "learning_rate": 9.644418854163509e-06, - "loss": 0.5495, - "num_tokens": 11573878.0, - "step": 51 - }, - { - "epoch": 0.4755244755244755, - "eval_loss": 0.5184547305107117, - "eval_num_tokens": 11573878.0, - "eval_runtime": 46.9117, - "eval_samples_per_second": 65.037, - "eval_steps_per_second": 8.143, + "epoch": 0.27717391304347827, + "grad_norm": 0.3143667375050653, + "learning_rate": 9.915774588360649e-06, + "loss": 1.3622, + "num_tokens": 42474792.0, "step": 51 }, { - "epoch": 0.48484848484848486, - "grad_norm": 0.670194145666963, - "learning_rate": 9.626670590528115e-06, - "loss": 0.5251, - "num_tokens": 11809080.0, + "epoch": 0.2826086956521739, + "grad_norm": 0.39102018844666386, + "learning_rate": 9.910609898527686e-06, + "loss": 1.3498, + "num_tokens": 43338238.0, "step": 52 }, { - "epoch": 0.49417249417249415, - "grad_norm": 0.6532238552343984, - "learning_rate": 9.608509244551916e-06, - "loss": 0.5283, - "num_tokens": 12033466.0, + "epoch": 0.28804347826086957, + "grad_norm": 0.32307873691348704, + "learning_rate": 9.905293122574433e-06, + "loss": 1.3416, + "num_tokens": 44166705.0, "step": 53 }, { - "epoch": 0.5034965034965035, - "grad_norm": 0.6540950806759922, - "learning_rate": 9.589936634197155e-06, - "loss": 0.525, - "num_tokens": 12266159.0, + "epoch": 0.29347826086956524, + "grad_norm": 0.39172034819128265, + "learning_rate": 9.89982444383332e-06, + "loss": 1.3893, + "num_tokens": 45043592.0, "step": 54 }, { - "epoch": 0.5128205128205128, - "grad_norm": 0.6803327517194798, - "learning_rate": 9.570954618593895e-06, - "loss": 0.5118, - "num_tokens": 12481983.0, + "epoch": 0.29891304347826086, + "grad_norm": 0.3754744196617686, + "learning_rate": 9.89420405087467e-06, + "loss": 1.339, + "num_tokens": 45896379.0, "step": 55 }, { - "epoch": 0.5221445221445221, - "grad_norm": 0.6856024969619839, - "learning_rate": 9.551565097853923e-06, - "loss": 0.4929, - "num_tokens": 12708247.0, + "epoch": 0.30434782608695654, + "grad_norm": 0.4073887677998757, + "learning_rate": 9.888432137500194e-06, + "loss": 1.3368, + "num_tokens": 46824934.0, "step": 56 }, { - "epoch": 0.5314685314685315, - "grad_norm": 0.6101426047566378, - "learning_rate": 9.531770012880554e-06, - "loss": 0.5095, - "num_tokens": 12939945.0, + "epoch": 0.30978260869565216, + "grad_norm": 0.4034319004576986, + "learning_rate": 9.88250890273632e-06, + "loss": 1.3016, + "num_tokens": 47724850.0, "step": 57 }, { - "epoch": 0.5407925407925408, - "grad_norm": 0.6475609980018124, - "learning_rate": 9.511571345174331e-06, - "loss": 0.4989, - "num_tokens": 13159254.0, + "epoch": 0.31521739130434784, + "grad_norm": 0.3540569448635205, + "learning_rate": 9.876434550827315e-06, + "loss": 1.2675, + "num_tokens": 48481161.0, "step": 58 }, { - "epoch": 0.5501165501165501, - "grad_norm": 0.6732344762060629, - "learning_rate": 9.490971116634695e-06, - "loss": 0.5106, - "num_tokens": 13379346.0, + "epoch": 0.32065217391304346, + "grad_norm": 0.37996194073137046, + "learning_rate": 9.87020929122825e-06, + "loss": 1.3083, + "num_tokens": 49281414.0, "step": 59 }, { - "epoch": 0.5594405594405595, - "grad_norm": 0.6389095283638084, - "learning_rate": 9.469971389357574e-06, - "loss": 0.4976, - "num_tokens": 13612453.0, + "epoch": 0.32608695652173914, + "grad_norm": 0.2886408351426047, + "learning_rate": 9.86383333859778e-06, + "loss": 1.313, + "num_tokens": 50122650.0, "step": 60 }, { - "epoch": 0.5687645687645687, - "grad_norm": 0.7102859144485985, - "learning_rate": 9.448574265428972e-06, - "loss": 0.4926, - "num_tokens": 13843528.0, + "epoch": 0.33152173913043476, + "grad_norm": 0.49520282031068646, + "learning_rate": 9.857306912790737e-06, + "loss": 1.2704, + "num_tokens": 50931600.0, "step": 61 }, { - "epoch": 0.578088578088578, - "grad_norm": 0.5992429263255332, - "learning_rate": 9.426781886714551e-06, - "loss": 0.4936, - "num_tokens": 14077000.0, + "epoch": 0.33695652173913043, + "grad_norm": 0.2845258505703522, + "learning_rate": 9.850630238850549e-06, + "loss": 1.3057, + "num_tokens": 51863119.0, "step": 62 }, { - "epoch": 0.5874125874125874, - "grad_norm": 0.7166387603738363, - "learning_rate": 9.404596434645232e-06, - "loss": 0.4889, - "num_tokens": 14294350.0, + "epoch": 0.3423913043478261, + "grad_norm": 0.3736561127913658, + "learning_rate": 9.843803547001487e-06, + "loss": 1.3444, + "num_tokens": 52720131.0, "step": 63 }, { - "epoch": 0.5967365967365967, - "grad_norm": 0.6561978843481102, - "learning_rate": 9.382020129998821e-06, - "loss": 0.4979, - "num_tokens": 14532343.0, + "epoch": 0.34782608695652173, + "grad_norm": 0.2938598936747867, + "learning_rate": 9.836827072640716e-06, + "loss": 1.3123, + "num_tokens": 53562926.0, "step": 64 }, { - "epoch": 0.6060606060606061, - "grad_norm": 0.61526807109734, - "learning_rate": 9.359055232677718e-06, - "loss": 0.4759, - "num_tokens": 14756340.0, + "epoch": 0.3532608695652174, + "grad_norm": 0.3009801939956506, + "learning_rate": 9.829701056330188e-06, + "loss": 1.3823, + "num_tokens": 54385006.0, "step": 65 }, { - "epoch": 0.6153846153846154, - "grad_norm": 0.658194861071045, - "learning_rate": 9.335704041482697e-06, - "loss": 0.471, - "num_tokens": 14968956.0, + "epoch": 0.358695652173913, + "grad_norm": 0.3418977440700839, + "learning_rate": 9.82242574378834e-06, + "loss": 1.3006, + "num_tokens": 55136819.0, "step": 66 }, { - "epoch": 0.6247086247086248, - "grad_norm": 0.616416691332377, - "learning_rate": 9.311968893882791e-06, - "loss": 0.4912, - "num_tokens": 15200242.0, + "epoch": 0.3641304347826087, + "grad_norm": 0.297311298187414, + "learning_rate": 9.815001385881624e-06, + "loss": 1.282, + "num_tokens": 55938417.0, "step": 67 }, { - "epoch": 0.634032634032634, - "grad_norm": 0.6343339828665332, - "learning_rate": 9.287852165781312e-06, - "loss": 0.4919, - "num_tokens": 15426840.0, + "epoch": 0.3695652173913043, + "grad_norm": 0.35597318816694135, + "learning_rate": 9.807428238615858e-06, + "loss": 1.3081, + "num_tokens": 56858823.0, "step": 68 }, { - "epoch": 0.6433566433566433, - "grad_norm": 0.6564153295890287, - "learning_rate": 9.263356271278027e-06, - "loss": 0.4793, - "num_tokens": 15658816.0, + "epoch": 0.375, + "grad_norm": 0.3366963789989372, + "learning_rate": 9.799706563127395e-06, + "loss": 1.2703, + "num_tokens": 57649612.0, "step": 69 }, { - "epoch": 0.6526806526806527, - "grad_norm": 0.6425664251133348, - "learning_rate": 9.238483662427493e-06, - "loss": 0.4823, - "num_tokens": 15885811.0, + "epoch": 0.3804347826086957, + "grad_norm": 0.3242176442515732, + "learning_rate": 9.79183662567412e-06, + "loss": 1.2013, + "num_tokens": 58532874.0, "step": 70 }, { - "epoch": 0.662004662004662, - "grad_norm": 0.628140689158425, - "learning_rate": 9.213236828993619e-06, - "loss": 0.4749, - "num_tokens": 16118002.0, + "epoch": 0.3858695652173913, + "grad_norm": 0.3089960724382554, + "learning_rate": 9.783818697626273e-06, + "loss": 1.3188, + "num_tokens": 59347395.0, "step": 71 }, { - "epoch": 0.6713286713286714, - "grad_norm": 0.6485184833730361, - "learning_rate": 9.187618298200425e-06, - "loss": 0.4786, - "num_tokens": 16346221.0, + "epoch": 0.391304347826087, + "grad_norm": 0.3566100658076413, + "learning_rate": 9.775653055457082e-06, + "loss": 1.2759, + "num_tokens": 60223045.0, "step": 72 }, { - "epoch": 0.6806526806526807, - "grad_norm": 0.5979662338785944, - "learning_rate": 9.16163063447908e-06, - "loss": 0.4879, - "num_tokens": 16575378.0, + "epoch": 0.3967391304347826, + "grad_norm": 0.31607856919293054, + "learning_rate": 9.76733998073324e-06, + "loss": 1.2761, + "num_tokens": 61112287.0, "step": 73 }, { - "epoch": 0.6899766899766899, - "grad_norm": 0.675557026688775, - "learning_rate": 9.13527643921118e-06, - "loss": 0.4661, - "num_tokens": 16809130.0, + "epoch": 0.40217391304347827, + "grad_norm": 0.33524520211204156, + "learning_rate": 9.75887976010519e-06, + "loss": 1.2214, + "num_tokens": 61999584.0, "step": 74 }, { - "epoch": 0.6993006993006993, - "grad_norm": 0.6422885724710128, - "learning_rate": 9.10855835046838e-06, - "loss": 0.4679, - "num_tokens": 17026726.0, + "epoch": 0.4076086956521739, + "grad_norm": 0.3318957781485302, + "learning_rate": 9.750272685297241e-06, + "loss": 1.3252, + "num_tokens": 62836474.0, "step": 75 }, { - "epoch": 0.7086247086247086, - "grad_norm": 0.6114995628017877, - "learning_rate": 9.081479042748286e-06, - "loss": 0.465, - "num_tokens": 17257265.0, + "epoch": 0.41304347826086957, + "grad_norm": 0.3405291896634781, + "learning_rate": 9.741519053097516e-06, + "loss": 1.2536, + "num_tokens": 63680479.0, "step": 76 }, { - "epoch": 0.717948717948718, - "grad_norm": 0.6151916913104115, - "learning_rate": 9.05404122670676e-06, - "loss": 0.4646, - "num_tokens": 17478618.0, + "epoch": 0.41847826086956524, + "grad_norm": 0.31652450850198227, + "learning_rate": 9.732619165347705e-06, + "loss": 1.2756, + "num_tokens": 64500924.0, "step": 77 }, { - "epoch": 0.7272727272727273, - "grad_norm": 0.5936620988012073, - "learning_rate": 9.026247648886567e-06, - "loss": 0.461, - "num_tokens": 17704974.0, + "epoch": 0.42391304347826086, + "grad_norm": 0.42768097965959784, + "learning_rate": 9.723573328932669e-06, + "loss": 1.2416, + "num_tokens": 65381318.0, "step": 78 }, { - "epoch": 0.7365967365967366, - "grad_norm": 0.6780504989722205, - "learning_rate": 8.998101091442469e-06, - "loss": 0.4558, - "num_tokens": 17930277.0, + "epoch": 0.42934782608695654, + "grad_norm": 0.3175011196248488, + "learning_rate": 9.71438185576985e-06, + "loss": 1.3265, + "num_tokens": 66223314.0, "step": 79 }, { - "epoch": 0.745920745920746, - "grad_norm": 0.660954300165115, - "learning_rate": 8.969604371862689e-06, - "loss": 0.4718, - "num_tokens": 18152384.0, + "epoch": 0.43478260869565216, + "grad_norm": 0.35099150469597595, + "learning_rate": 9.705045062798519e-06, + "loss": 1.2465, + "num_tokens": 67069071.0, "step": 80 }, { - "epoch": 0.7552447552447552, - "grad_norm": 0.6094153908408396, - "learning_rate": 8.940760342686918e-06, - "loss": 0.4721, - "num_tokens": 18378793.0, + "epoch": 0.44021739130434784, + "grad_norm": 0.31259023838571287, + "learning_rate": 9.695563271968853e-06, + "loss": 1.302, + "num_tokens": 67908788.0, "step": 81 }, { - "epoch": 0.7645687645687645, - "grad_norm": 0.7092610082946307, - "learning_rate": 8.911571891220749e-06, - "loss": 0.4547, - "num_tokens": 18595092.0, + "epoch": 0.44565217391304346, + "grad_norm": 0.342434777113354, + "learning_rate": 9.685936810230824e-06, + "loss": 1.1891, + "num_tokens": 68788781.0, "step": 82 }, { - "epoch": 0.7738927738927739, - "grad_norm": 0.6629720414327388, - "learning_rate": 8.882041939246671e-06, - "loss": 0.4705, - "num_tokens": 18817832.0, + "epoch": 0.45108695652173914, + "grad_norm": 0.35249522091625607, + "learning_rate": 9.676166009522925e-06, + "loss": 1.2201, + "num_tokens": 69568499.0, "step": 83 }, { - "epoch": 0.7832167832167832, - "grad_norm": 0.6373290745696255, - "learning_rate": 8.852173442731586e-06, - "loss": 0.4454, - "num_tokens": 19032032.0, + "epoch": 0.45652173913043476, + "grad_norm": 0.3281902496464512, + "learning_rate": 9.666251206760732e-06, + "loss": 1.1964, + "num_tokens": 70417513.0, "step": 84 }, { - "epoch": 0.7925407925407926, - "grad_norm": 0.6617499162396578, - "learning_rate": 8.821969391530922e-06, - "loss": 0.4508, - "num_tokens": 19267954.0, + "epoch": 0.46195652173913043, + "grad_norm": 0.33253990126652816, + "learning_rate": 9.656192743825283e-06, + "loss": 1.3254, + "num_tokens": 71232903.0, "step": 85 }, { - "epoch": 0.8018648018648019, - "grad_norm": 0.6918672339945613, - "learning_rate": 8.791432809089337e-06, - "loss": 0.459, - "num_tokens": 19500998.0, + "epoch": 0.4673913043478261, + "grad_norm": 0.34464296190071064, + "learning_rate": 9.645990967551287e-06, + "loss": 1.2181, + "num_tokens": 72117100.0, "step": 86 }, { - "epoch": 0.8111888111888111, - "grad_norm": 0.7075903033564215, - "learning_rate": 8.760566752138085e-06, - "loss": 0.4594, - "num_tokens": 19727754.0, + "epoch": 0.47282608695652173, + "grad_norm": 0.32303008461614624, + "learning_rate": 9.635646229715168e-06, + "loss": 1.2541, + "num_tokens": 72955942.0, "step": 87 }, { - "epoch": 0.8205128205128205, - "grad_norm": 0.5902673136102629, - "learning_rate": 8.729374310389024e-06, - "loss": 0.433, - "num_tokens": 19949582.0, + "epoch": 0.4782608695652174, + "grad_norm": 0.4087448612559883, + "learning_rate": 9.62515888702293e-06, + "loss": 1.2175, + "num_tokens": 73799335.0, "step": 88 }, { - "epoch": 0.8298368298368298, - "grad_norm": 0.6005303704900206, - "learning_rate": 8.697858606225336e-06, - "loss": 0.4524, - "num_tokens": 20179845.0, + "epoch": 0.483695652173913, + "grad_norm": 0.32968875759796035, + "learning_rate": 9.614529301097867e-06, + "loss": 1.3086, + "num_tokens": 74583507.0, "step": 89 }, { - "epoch": 0.8391608391608392, - "grad_norm": 0.6267051648548888, - "learning_rate": 8.666022794388975e-06, - "loss": 0.423, - "num_tokens": 20398810.0, + "epoch": 0.4891304347826087, + "grad_norm": 0.4968077240893464, + "learning_rate": 9.603757838468079e-06, + "loss": 1.2153, + "num_tokens": 75427148.0, "step": 90 }, { - "epoch": 0.8484848484848485, - "grad_norm": 0.613848865649995, - "learning_rate": 8.633870061664878e-06, - "loss": 0.4327, - "num_tokens": 20621339.0, + "epoch": 0.4945652173913043, + "grad_norm": 0.30442420617213245, + "learning_rate": 9.592844870553849e-06, + "loss": 1.2644, + "num_tokens": 76235709.0, "step": 91 }, { - "epoch": 0.8578088578088578, - "grad_norm": 0.6527947401981106, - "learning_rate": 8.601403626561965e-06, - "loss": 0.4299, - "num_tokens": 20850876.0, + "epoch": 0.5, + "grad_norm": 0.42123695977036113, + "learning_rate": 9.581790773654821e-06, + "loss": 1.2912, + "num_tokens": 77072728.0, "step": 92 }, { - "epoch": 0.8671328671328671, - "grad_norm": 0.6168809536564613, - "learning_rate": 8.568626738990958e-06, - "loss": 0.4293, - "num_tokens": 21073850.0, + "epoch": 0.5054347826086957, + "grad_norm": 0.3233765627590572, + "learning_rate": 9.57059592893704e-06, + "loss": 1.206, + "num_tokens": 77937646.0, "step": 93 }, { - "epoch": 0.8764568764568764, - "grad_norm": 0.6122685135496609, - "learning_rate": 8.535542679939074e-06, - "loss": 0.4565, - "num_tokens": 21309531.0, + "epoch": 0.5108695652173914, + "grad_norm": 0.5054335403354332, + "learning_rate": 9.55926072241979e-06, + "loss": 1.2503, + "num_tokens": 78761723.0, "step": 94 }, { - "epoch": 0.8857808857808858, - "grad_norm": 0.633692832704968, - "learning_rate": 8.502154761141581e-06, - "loss": 0.4284, - "num_tokens": 21536877.0, + "epoch": 0.5163043478260869, + "grad_norm": 0.31425650067487326, + "learning_rate": 9.547785544962303e-06, + "loss": 1.2237, + "num_tokens": 79555511.0, "step": 95 }, { - "epoch": 0.8951048951048951, - "grad_norm": 0.6287354994581217, - "learning_rate": 8.46846632475031e-06, - "loss": 0.4424, - "num_tokens": 21759224.0, + "epoch": 0.5217391304347826, + "grad_norm": 0.3394176534060976, + "learning_rate": 9.53617079225027e-06, + "loss": 1.2688, + "num_tokens": 80432436.0, "step": 96 }, { - "epoch": 0.9044289044289044, - "grad_norm": 0.6127026636463726, - "learning_rate": 8.434480742999089e-06, - "loss": 0.4381, - "num_tokens": 21988805.0, + "epoch": 0.5271739130434783, + "grad_norm": 0.3282087898215183, + "learning_rate": 9.524416864782196e-06, + "loss": 1.2155, + "num_tokens": 81254903.0, "step": 97 }, { - "epoch": 0.9137529137529138, - "grad_norm": 0.684690186227573, - "learning_rate": 8.400201417866184e-06, - "loss": 0.4503, - "num_tokens": 22205757.0, + "epoch": 0.532608695652174, + "grad_norm": 0.40745731372236615, + "learning_rate": 9.51252416785559e-06, + "loss": 1.2437, + "num_tokens": 82021702.0, "step": 98 }, { - "epoch": 0.9230769230769231, - "grad_norm": 0.6105227864598858, - "learning_rate": 8.365631780733757e-06, - "loss": 0.4226, - "num_tokens": 22437039.0, + "epoch": 0.5380434782608695, + "grad_norm": 0.3296122949882556, + "learning_rate": 9.500493111553007e-06, + "loss": 1.1678, + "num_tokens": 82863748.0, "step": 99 }, { - "epoch": 0.9324009324009324, - "grad_norm": 0.5934626007869728, - "learning_rate": 8.330775292044395e-06, - "loss": 0.4349, - "num_tokens": 22665513.0, + "epoch": 0.5434782608695652, + "grad_norm": 0.36733750482254623, + "learning_rate": 9.488324110727878e-06, + "loss": 1.1795, + "num_tokens": 83720349.0, "step": 100 }, { - "epoch": 0.9417249417249417, - "grad_norm": 0.6001619283133106, - "learning_rate": 8.295635440954696e-06, - "loss": 0.4346, - "num_tokens": 22894964.0, + "epoch": 0.5489130434782609, + "grad_norm": 0.32480120236031906, + "learning_rate": 9.476017584990229e-06, + "loss": 1.2254, + "num_tokens": 84577086.0, "step": 101 }, { - "epoch": 0.951048951048951, - "grad_norm": 0.5567734223853613, - "learning_rate": 8.260215744986021e-06, - "loss": 0.435, - "num_tokens": 23133799.0, - "step": 102 - }, - { - "epoch": 0.951048951048951, - "eval_loss": 0.4339418113231659, - "eval_num_tokens": 23133799.0, - "eval_runtime": 44.5965, - "eval_samples_per_second": 68.413, - "eval_steps_per_second": 8.566, + "epoch": 0.5543478260869565, + "grad_norm": 0.3456700140954568, + "learning_rate": 9.4635739586922e-06, + "loss": 1.2249, + "num_tokens": 85379172.0, "step": 102 }, { - "epoch": 0.9603729603729604, - "grad_norm": 0.5734597599242294, - "learning_rate": 8.224519749672377e-06, - "loss": 0.4206, - "num_tokens": 23358871.0, + "epoch": 0.5597826086956522, + "grad_norm": 0.3327804028738996, + "learning_rate": 9.450993660913418e-06, + "loss": 1.246, + "num_tokens": 86178628.0, "step": 103 }, { - "epoch": 0.9696969696969697, - "grad_norm": 0.6086368650839594, - "learning_rate": 8.188551028205515e-06, - "loss": 0.4354, - "num_tokens": 23585729.0, + "epoch": 0.5652173913043478, + "grad_norm": 0.33610320319412545, + "learning_rate": 9.438277125446194e-06, + "loss": 1.2872, + "num_tokens": 86888597.0, "step": 104 }, { - "epoch": 0.9790209790209791, - "grad_norm": 0.5954657007627898, - "learning_rate": 8.152313181077242e-06, - "loss": 0.4146, - "num_tokens": 23805562.0, + "epoch": 0.5706521739130435, + "grad_norm": 0.35507846358914824, + "learning_rate": 9.425424790780581e-06, + "loss": 1.231, + "num_tokens": 87680751.0, "step": 105 }, { - "epoch": 0.9883449883449883, - "grad_norm": 0.6138199768988847, - "learning_rate": 8.115809835719015e-06, - "loss": 0.425, - "num_tokens": 24029901.0, + "epoch": 0.5760869565217391, + "grad_norm": 0.37608759046628953, + "learning_rate": 9.412437100089236e-06, + "loss": 1.2717, + "num_tokens": 88523003.0, "step": 106 }, { - "epoch": 0.9976689976689976, - "grad_norm": 0.546520390136051, - "learning_rate": 8.079044646138837e-06, - "loss": 0.428, - "num_tokens": 24269754.0, + "epoch": 0.5815217391304348, + "grad_norm": 0.3371473758227733, + "learning_rate": 9.39931450121215e-06, + "loss": 1.1768, + "num_tokens": 89341610.0, "step": 107 }, { - "epoch": 1.0, - "grad_norm": 0.546520390136051, - "learning_rate": 8.042021292555477e-06, - "loss": 0.4373, - "num_tokens": 24325562.0, + "epoch": 0.5869565217391305, + "grad_norm": 0.4098885996751854, + "learning_rate": 9.386057446641195e-06, + "loss": 1.207, + "num_tokens": 90208259.0, "step": 108 }, { - "epoch": 1.0093240093240092, - "grad_norm": 1.2798009950935532, - "learning_rate": 8.004743481030088e-06, - "loss": 0.3592, - "num_tokens": 24569588.0, + "epoch": 0.592391304347826, + "grad_norm": 0.3292910941182598, + "learning_rate": 9.372666393504537e-06, + "loss": 1.1822, + "num_tokens": 91019847.0, "step": 109 }, { - "epoch": 1.0186480186480187, - "grad_norm": 0.6696134953932688, - "learning_rate": 7.967214943095222e-06, - "loss": 0.3744, - "num_tokens": 24794492.0, + "epoch": 0.5978260869565217, + "grad_norm": 0.4063196810849176, + "learning_rate": 9.35914180355086e-06, + "loss": 1.1848, + "num_tokens": 91876225.0, "step": 110 }, { - "epoch": 1.027972027972028, - "grad_norm": 0.5214295855483705, - "learning_rate": 7.929439435381305e-06, - "loss": 0.3423, - "num_tokens": 25023843.0, + "epoch": 0.6032608695652174, + "grad_norm": 0.37489279113107293, + "learning_rate": 9.345484143133447e-06, + "loss": 1.1911, + "num_tokens": 92726215.0, "step": 111 }, { - "epoch": 1.0372960372960374, - "grad_norm": 0.5791254604356956, - "learning_rate": 7.891420739240593e-06, - "loss": 0.3585, - "num_tokens": 25255674.0, + "epoch": 0.6086956521739131, + "grad_norm": 0.3347930475232912, + "learning_rate": 9.331693883194105e-06, + "loss": 1.2572, + "num_tokens": 93557294.0, "step": 112 }, { - "epoch": 1.0466200466200466, - "grad_norm": 0.5476766811139614, - "learning_rate": 7.853162660368664e-06, - "loss": 0.3516, - "num_tokens": 25486215.0, + "epoch": 0.6141304347826086, + "grad_norm": 0.42217331484514864, + "learning_rate": 9.317771499246918e-06, + "loss": 1.0868, + "num_tokens": 94355208.0, "step": 113 }, { - "epoch": 1.055944055944056, - "grad_norm": 0.5758914973043039, - "learning_rate": 7.814669028423444e-06, - "loss": 0.3425, - "num_tokens": 25712704.0, + "epoch": 0.6195652173913043, + "grad_norm": 0.35824817626614924, + "learning_rate": 9.303717471361855e-06, + "loss": 1.1502, + "num_tokens": 95180197.0, "step": 114 }, { - "epoch": 1.0652680652680653, - "grad_norm": 0.5760123728030753, - "learning_rate": 7.775943696641889e-06, - "loss": 0.3435, - "num_tokens": 25938915.0, + "epoch": 0.625, + "grad_norm": 0.3577504011359328, + "learning_rate": 9.289532284148218e-06, + "loss": 1.1584, + "num_tokens": 95965657.0, "step": 115 }, { - "epoch": 1.0745920745920745, - "grad_norm": 0.576701480722012, - "learning_rate": 7.736990541454244e-06, - "loss": 0.3543, - "num_tokens": 26163159.0, + "epoch": 0.6304347826086957, + "grad_norm": 0.4118880839491574, + "learning_rate": 9.275216426737924e-06, + "loss": 1.184, + "num_tokens": 96711264.0, "step": 116 }, { - "epoch": 1.083916083916084, - "grad_norm": 0.576650945358382, - "learning_rate": 7.697813462096026e-06, - "loss": 0.3453, - "num_tokens": 26382888.0, + "epoch": 0.6358695652173914, + "grad_norm": 0.34956250639415626, + "learning_rate": 9.260770392768652e-06, + "loss": 1.2688, + "num_tokens": 97532279.0, "step": 117 }, { - "epoch": 1.0932400932400932, - "grad_norm": 0.5276923197878292, - "learning_rate": 7.658416380217698e-06, - "loss": 0.3527, - "num_tokens": 26611322.0, + "epoch": 0.6413043478260869, + "grad_norm": 0.3691660640244806, + "learning_rate": 9.246194680366802e-06, + "loss": 1.1405, + "num_tokens": 98354575.0, "step": 118 }, { - "epoch": 1.1025641025641026, - "grad_norm": 0.5201043693489312, - "learning_rate": 7.618803239492122e-06, - "loss": 0.329, - "num_tokens": 26833150.0, + "epoch": 0.6467391304347826, + "grad_norm": 0.39892059252243106, + "learning_rate": 9.231489792130343e-06, + "loss": 1.1899, + "num_tokens": 99122392.0, "step": 119 }, { - "epoch": 1.1118881118881119, - "grad_norm": 0.5333967888348916, - "learning_rate": 7.57897800521978e-06, - "loss": 0.3533, - "num_tokens": 27061630.0, + "epoch": 0.6521739130434783, + "grad_norm": 0.3473137462183092, + "learning_rate": 9.216656235111463e-06, + "loss": 1.1618, + "num_tokens": 100066934.0, "step": 120 }, { - "epoch": 1.121212121212121, - "grad_norm": 0.5362717013397229, - "learning_rate": 7.538944663931862e-06, - "loss": 0.3577, - "num_tokens": 27285796.0, + "epoch": 0.657608695652174, + "grad_norm": 0.34159360019061497, + "learning_rate": 9.201694520799086e-06, + "loss": 1.2785, + "num_tokens": 100968865.0, "step": 121 }, { - "epoch": 1.1305361305361306, - "grad_norm": 0.5541137828108857, - "learning_rate": 7.49870722299119e-06, - "loss": 0.352, - "num_tokens": 27517383.0, + "epoch": 0.6630434782608695, + "grad_norm": 0.349168816341091, + "learning_rate": 9.186605165101253e-06, + "loss": 1.1744, + "num_tokens": 101861480.0, "step": 122 }, { - "epoch": 1.1398601398601398, - "grad_norm": 0.5232896976757799, - "learning_rate": 7.4582697101911015e-06, - "loss": 0.3368, - "num_tokens": 27756017.0, + "epoch": 0.6684782608695652, + "grad_norm": 0.48715571757970294, + "learning_rate": 9.171388688327307e-06, + "loss": 1.0673, + "num_tokens": 102672098.0, "step": 123 }, { - "epoch": 1.1491841491841492, - "grad_norm": 0.5586660395107629, - "learning_rate": 7.417636173352247e-06, - "loss": 0.3512, - "num_tokens": 27988182.0, + "epoch": 0.6739130434782609, + "grad_norm": 0.40462894394274823, + "learning_rate": 9.156045615169978e-06, + "loss": 1.1947, + "num_tokens": 103464153.0, "step": 124 }, { - "epoch": 1.1585081585081585, - "grad_norm": 0.5275698359391946, - "learning_rate": 7.376810679917411e-06, - "loss": 0.3367, - "num_tokens": 28216063.0, + "epoch": 0.6793478260869565, + "grad_norm": 0.400412668537012, + "learning_rate": 9.140576474687263e-06, + "loss": 1.2133, + "num_tokens": 104296723.0, "step": 125 }, { - "epoch": 1.167832167832168, - "grad_norm": 0.5330631408961283, - "learning_rate": 7.335797316544352e-06, - "loss": 0.3405, - "num_tokens": 28444175.0, + "epoch": 0.6847826086956522, + "grad_norm": 0.4267731447100407, + "learning_rate": 9.12498180028421e-06, + "loss": 1.1219, + "num_tokens": 105122812.0, "step": 126 }, { - "epoch": 1.1771561771561772, - "grad_norm": 0.5300170588990584, - "learning_rate": 7.2946001886967336e-06, - "loss": 0.3385, - "num_tokens": 28679208.0, + "epoch": 0.6902173913043478, + "grad_norm": 0.36403725435544143, + "learning_rate": 9.109262129694506e-06, + "loss": 1.1643, + "num_tokens": 105977186.0, "step": 127 }, { - "epoch": 1.1864801864801864, - "grad_norm": 0.535737598999273, - "learning_rate": 7.253223420233151e-06, - "loss": 0.3267, - "num_tokens": 28915039.0, + "epoch": 0.6956521739130435, + "grad_norm": 0.3689438135593094, + "learning_rate": 9.093418004961939e-06, + "loss": 1.1379, + "num_tokens": 106853500.0, "step": 128 }, { - "epoch": 1.1958041958041958, - "grad_norm": 0.5305723111392093, - "learning_rate": 7.211671152994348e-06, - "loss": 0.3585, - "num_tokens": 29128873.0, + "epoch": 0.7010869565217391, + "grad_norm": 0.3699932926490634, + "learning_rate": 9.077449972421716e-06, + "loss": 1.0791, + "num_tokens": 107702626.0, "step": 129 }, { - "epoch": 1.205128205128205, - "grad_norm": 0.490754554204438, - "learning_rate": 7.169947546388602e-06, - "loss": 0.3285, - "num_tokens": 29357646.0, + "epoch": 0.7065217391304348, + "grad_norm": 0.3557575331468489, + "learning_rate": 9.061358582681614e-06, + "loss": 1.1377, + "num_tokens": 108617243.0, "step": 130 }, { - "epoch": 1.2144522144522145, - "grad_norm": 0.5126971866608595, - "learning_rate": 7.12805677697537e-06, - "loss": 0.3452, - "num_tokens": 29588680.0, + "epoch": 0.7119565217391305, + "grad_norm": 0.41864057524706433, + "learning_rate": 9.045144390603e-06, + "loss": 1.1609, + "num_tokens": 109434573.0, "step": 131 }, { - "epoch": 1.2237762237762237, - "grad_norm": 0.5626164047551704, - "learning_rate": 7.086003038047213e-06, - "loss": 0.3485, - "num_tokens": 29804673.0, + "epoch": 0.717391304347826, + "grad_norm": 0.3985478205432592, + "learning_rate": 9.028807955281701e-06, + "loss": 1.1596, + "num_tokens": 110290620.0, "step": 132 }, { - "epoch": 1.2331002331002332, - "grad_norm": 0.5161966131055838, - "learning_rate": 7.043790539210045e-06, - "loss": 0.3361, - "num_tokens": 30034232.0, + "epoch": 0.7228260869565217, + "grad_norm": 0.38174230253717656, + "learning_rate": 9.012349840028707e-06, + "loss": 1.2587, + "num_tokens": 111137178.0, "step": 133 }, { - "epoch": 1.2424242424242424, - "grad_norm": 0.5359714333297103, - "learning_rate": 7.001423505961742e-06, - "loss": 0.3465, - "num_tokens": 30256381.0, + "epoch": 0.7282608695652174, + "grad_norm": 0.4429844876484063, + "learning_rate": 8.995770612350778e-06, + "loss": 1.0897, + "num_tokens": 111957193.0, "step": 134 }, { - "epoch": 1.2517482517482517, - "grad_norm": 0.5204156892409427, - "learning_rate": 6.95890617926918e-06, - "loss": 0.3342, - "num_tokens": 30481505.0, + "epoch": 0.7336956521739131, + "grad_norm": 0.36878198276025403, + "learning_rate": 8.979070843930841e-06, + "loss": 1.1068, + "num_tokens": 112822859.0, "step": 135 }, { - "epoch": 1.2610722610722611, - "grad_norm": 0.5228157991201735, - "learning_rate": 6.916242815143697e-06, - "loss": 0.3307, - "num_tokens": 30714212.0, + "epoch": 0.7391304347826086, + "grad_norm": 0.41569112518405493, + "learning_rate": 8.96225111060831e-06, + "loss": 1.1478, + "num_tokens": 113629995.0, "step": 136 }, { - "epoch": 1.2703962703962703, - "grad_norm": 0.5506261937038931, - "learning_rate": 6.873437684215078e-06, - "loss": 0.3478, - "num_tokens": 30933792.0, + "epoch": 0.7445652173913043, + "grad_norm": 0.3991002343202174, + "learning_rate": 8.945311992359206e-06, + "loss": 1.149, + "num_tokens": 114406929.0, "step": 137 }, { - "epoch": 1.2797202797202798, - "grad_norm": 0.4972019266946924, - "learning_rate": 6.830495071304046e-06, - "loss": 0.3363, - "num_tokens": 31166629.0, + "epoch": 0.75, + "grad_norm": 0.3851422262699403, + "learning_rate": 8.928254073276166e-06, + "loss": 1.134, + "num_tokens": 115289074.0, "step": 138 }, { - "epoch": 1.289044289044289, - "grad_norm": 0.5120634677161241, - "learning_rate": 6.787419274993365e-06, - "loss": 0.3363, - "num_tokens": 31399523.0, + "epoch": 0.7554347826086957, + "grad_norm": 0.3958650986478213, + "learning_rate": 8.911077941548306e-06, + "loss": 1.0919, + "num_tokens": 116124554.0, "step": 139 }, { - "epoch": 1.2983682983682985, - "grad_norm": 0.49782505687695744, - "learning_rate": 6.744214607197539e-06, - "loss": 0.3354, - "num_tokens": 31636222.0, + "epoch": 0.7608695652173914, + "grad_norm": 0.42020377168887346, + "learning_rate": 8.893784189440937e-06, + "loss": 1.1666, + "num_tokens": 116865404.0, "step": 140 }, { - "epoch": 1.3076923076923077, - "grad_norm": 0.5390261779041863, - "learning_rate": 6.700885392731188e-06, - "loss": 0.3425, - "num_tokens": 31869574.0, + "epoch": 0.7663043478260869, + "grad_norm": 0.39659340251721403, + "learning_rate": 8.876373413275139e-06, + "loss": 1.1569, + "num_tokens": 117700490.0, "step": 141 }, { - "epoch": 1.317016317016317, - "grad_norm": 0.509309660832688, - "learning_rate": 6.657435968876133e-06, - "loss": 0.3602, - "num_tokens": 32089479.0, + "epoch": 0.7717391304347826, + "grad_norm": 0.4092121559904955, + "learning_rate": 8.858846213407201e-06, + "loss": 1.1584, + "num_tokens": 118525192.0, "step": 142 }, { - "epoch": 1.3263403263403264, - "grad_norm": 0.48678955681043784, - "learning_rate": 6.613870684947232e-06, - "loss": 0.3304, - "num_tokens": 32323608.0, + "epoch": 0.7771739130434783, + "grad_norm": 0.5943065877313993, + "learning_rate": 8.841203194207925e-06, + "loss": 0.9875, + "num_tokens": 119327143.0, "step": 143 }, { - "epoch": 1.3356643356643356, - "grad_norm": 0.5260358713582577, - "learning_rate": 6.570193901857013e-06, - "loss": 0.3479, - "num_tokens": 32544996.0, + "epoch": 0.782608695652174, + "grad_norm": 0.3884971352039889, + "learning_rate": 8.823444964041777e-06, + "loss": 1.1211, + "num_tokens": 120179598.0, "step": 144 }, { - "epoch": 1.3449883449883449, - "grad_norm": 0.5197126640451202, - "learning_rate": 6.526409991679134e-06, - "loss": 0.3448, - "num_tokens": 32769167.0, + "epoch": 0.7880434782608695, + "grad_norm": 0.7599341798230562, + "learning_rate": 8.805572135245911e-06, + "loss": 1.0971, + "num_tokens": 120962600.0, "step": 145 }, { - "epoch": 1.3543123543123543, - "grad_norm": 0.5311482925052395, - "learning_rate": 6.482523337210746e-06, - "loss": 0.3442, - "num_tokens": 32992656.0, + "epoch": 0.7934782608695652, + "grad_norm": 0.41080230187979083, + "learning_rate": 8.787585324109067e-06, + "loss": 1.1224, + "num_tokens": 121848370.0, "step": 146 }, { - "epoch": 1.3636363636363638, - "grad_norm": 0.5132102539844359, - "learning_rate": 6.438538331533769e-06, - "loss": 0.3401, - "num_tokens": 33219244.0, + "epoch": 0.7989130434782609, + "grad_norm": 0.6507551224599566, + "learning_rate": 8.7694851508503e-06, + "loss": 1.0679, + "num_tokens": 122669064.0, "step": 147 }, { - "epoch": 1.372960372960373, - "grad_norm": 0.5013721607406705, - "learning_rate": 6.3944593775751395e-06, - "loss": 0.336, - "num_tokens": 33448947.0, + "epoch": 0.8043478260869565, + "grad_norm": 0.397919573924152, + "learning_rate": 8.751272239597612e-06, + "loss": 1.1958, + "num_tokens": 123524731.0, "step": 148 }, { - "epoch": 1.3822843822843822, - "grad_norm": 0.5043281141721423, - "learning_rate": 6.350290887666078e-06, - "loss": 0.3289, - "num_tokens": 33677944.0, + "epoch": 0.8097826086956522, + "grad_norm": 0.7009649855156489, + "learning_rate": 8.732947218366414e-06, + "loss": 1.1231, + "num_tokens": 124370704.0, "step": 149 }, { - "epoch": 1.3916083916083917, - "grad_norm": 0.5044274541117446, - "learning_rate": 6.306037283100412e-06, - "loss": 0.34, - "num_tokens": 33909419.0, + "epoch": 0.8152173913043478, + "grad_norm": 0.4438846321792291, + "learning_rate": 8.71451071903789e-06, + "loss": 1.1334, + "num_tokens": 125139083.0, "step": 150 }, { - "epoch": 1.400932400932401, - "grad_norm": 0.5210082282628941, - "learning_rate": 6.261702993691994e-06, - "loss": 0.3297, - "num_tokens": 34130758.0, + "epoch": 0.8206521739130435, + "grad_norm": 0.5807316774912559, + "learning_rate": 8.695963377337191e-06, + "loss": 1.1102, + "num_tokens": 125968663.0, "step": 151 }, { - "epoch": 1.4102564102564101, - "grad_norm": 0.5330978559123218, - "learning_rate": 6.217292457331286e-06, - "loss": 0.3459, - "num_tokens": 34354750.0, + "epoch": 0.8260869565217391, + "grad_norm": 0.47495874069754923, + "learning_rate": 8.677305832811524e-06, + "loss": 1.093, + "num_tokens": 126778083.0, "step": 152 }, { - "epoch": 1.4195804195804196, - "grad_norm": 0.5271964829991576, - "learning_rate": 6.172810119541118e-06, - "loss": 0.3299, - "num_tokens": 34570483.0, - "step": 153 - }, - { - "epoch": 1.4195804195804196, - "eval_loss": 0.4102487564086914, - "eval_num_tokens": 34570483.0, - "eval_runtime": 44.6685, - "eval_samples_per_second": 68.303, - "eval_steps_per_second": 8.552, + "epoch": 0.8315217391304348, + "grad_norm": 0.5997580007680087, + "learning_rate": 8.658538728808097e-06, + "loss": 1.2028, + "num_tokens": 127678753.0, "step": 153 }, { - "epoch": 1.428904428904429, - "grad_norm": 0.5344459993409891, - "learning_rate": 6.128260433031688e-06, - "loss": 0.329, - "num_tokens": 34795100.0, + "epoch": 0.8369565217391305, + "grad_norm": 0.40279919197392205, + "learning_rate": 8.639662712451935e-06, + "loss": 1.1701, + "num_tokens": 128502938.0, "step": 154 }, { - "epoch": 1.4382284382284383, - "grad_norm": 0.5033120854345594, - "learning_rate": 6.083647857254837e-06, - "loss": 0.3389, - "num_tokens": 35019260.0, + "epoch": 0.842391304347826, + "grad_norm": 0.5526326204068822, + "learning_rate": 8.620678434623563e-06, + "loss": 1.1252, + "num_tokens": 129345404.0, "step": 155 }, { - "epoch": 1.4475524475524475, - "grad_norm": 0.5519614788540163, - "learning_rate": 6.038976857957674e-06, - "loss": 0.3326, - "num_tokens": 35232059.0, + "epoch": 0.8478260869565217, + "grad_norm": 0.419237231246158, + "learning_rate": 8.601586549936567e-06, + "loss": 1.0567, + "num_tokens": 130127627.0, "step": 156 }, { - "epoch": 1.456876456876457, - "grad_norm": 0.49066707028364026, - "learning_rate": 5.994251906735529e-06, - "loss": 0.318, - "num_tokens": 35452738.0, + "epoch": 0.8532608695652174, + "grad_norm": 0.6590374996771516, + "learning_rate": 8.582387716715021e-06, + "loss": 1.096, + "num_tokens": 130908779.0, "step": 157 }, { - "epoch": 1.4662004662004662, - "grad_norm": 0.5407716108214161, - "learning_rate": 5.949477480584356e-06, - "loss": 0.3434, - "num_tokens": 35687577.0, + "epoch": 0.8586956521739131, + "grad_norm": 0.4116437048227146, + "learning_rate": 8.563082596970785e-06, + "loss": 1.0645, + "num_tokens": 131740109.0, "step": 158 }, { - "epoch": 1.4755244755244754, - "grad_norm": 0.510940703001653, - "learning_rate": 5.904658061452585e-06, - "loss": 0.3268, - "num_tokens": 35921313.0, + "epoch": 0.8641304347826086, + "grad_norm": 0.7235012628969889, + "learning_rate": 8.543671856380672e-06, + "loss": 1.1546, + "num_tokens": 132563707.0, "step": 159 }, { - "epoch": 1.4848484848484849, - "grad_norm": 0.5033518297300301, - "learning_rate": 5.859798135792469e-06, - "loss": 0.3388, - "num_tokens": 36141640.0, + "epoch": 0.8695652173913043, + "grad_norm": 0.4489313897309195, + "learning_rate": 8.524156164263509e-06, + "loss": 1.1447, + "num_tokens": 133376972.0, "step": 160 }, { - "epoch": 1.494172494172494, - "grad_norm": 0.5277378237245701, - "learning_rate": 5.8149021941109886e-06, - "loss": 0.3432, - "num_tokens": 36374008.0, + "epoch": 0.875, + "grad_norm": 0.7048899032217469, + "learning_rate": 8.504536193557049e-06, + "loss": 1.0747, + "num_tokens": 134196485.0, "step": 161 }, { - "epoch": 1.5034965034965035, - "grad_norm": 0.5059008621994278, - "learning_rate": 5.769974730520352e-06, - "loss": 0.3332, - "num_tokens": 36596425.0, + "epoch": 0.8804347826086957, + "grad_norm": 0.7655990331835255, + "learning_rate": 8.484812620794757e-06, + "loss": 1.0732, + "num_tokens": 134961911.0, "step": 162 }, { - "epoch": 1.5128205128205128, - "grad_norm": 0.5153843221684504, - "learning_rate": 5.725020242288134e-06, - "loss": 0.349, - "num_tokens": 36828175.0, + "epoch": 0.8858695652173914, + "grad_norm": 0.7306102642093804, + "learning_rate": 8.4649861260825e-06, + "loss": 1.1207, + "num_tokens": 135841447.0, "step": 163 }, { - "epoch": 1.5221445221445222, - "grad_norm": 0.49021702361667463, - "learning_rate": 5.680043229387086e-06, - "loss": 0.3336, - "num_tokens": 37063422.0, + "epoch": 0.8913043478260869, + "grad_norm": 0.7318731588610641, + "learning_rate": 8.445057393075088e-06, + "loss": 1.1429, + "num_tokens": 136657977.0, "step": 164 }, { - "epoch": 1.5314685314685315, - "grad_norm": 0.5136418808781229, - "learning_rate": 5.6350481940447025e-06, - "loss": 0.3572, - "num_tokens": 37291951.0, + "epoch": 0.8967391304347826, + "grad_norm": 0.5519109731700101, + "learning_rate": 8.425027108952693e-06, + "loss": 1.0926, + "num_tokens": 137439821.0, "step": 165 }, { - "epoch": 1.5407925407925407, - "grad_norm": 0.5200442031961835, - "learning_rate": 5.590039640292525e-06, - "loss": 0.3333, - "num_tokens": 37522766.0, + "epoch": 0.9021739130434783, + "grad_norm": 0.729877195905496, + "learning_rate": 8.404895964397166e-06, + "loss": 1.0711, + "num_tokens": 138324575.0, "step": 166 }, { - "epoch": 1.5501165501165501, - "grad_norm": 0.5314710688003101, - "learning_rate": 5.545022073515306e-06, - "loss": 0.3343, - "num_tokens": 37748838.0, + "epoch": 0.907608695652174, + "grad_norm": 0.4841610288318797, + "learning_rate": 8.384664653568213e-06, + "loss": 1.0236, + "num_tokens": 139110177.0, "step": 167 }, { - "epoch": 1.5594405594405596, - "grad_norm": 0.5404598630521976, - "learning_rate": 5.500000000000001e-06, - "loss": 0.3286, - "num_tokens": 37969252.0, + "epoch": 0.9130434782608695, + "grad_norm": 0.6132670925841823, + "learning_rate": 8.364333874079462e-06, + "loss": 1.1489, + "num_tokens": 139984515.0, "step": 168 }, { - "epoch": 1.5687645687645686, - "grad_norm": 0.5248227901946765, - "learning_rate": 5.454977926484696e-06, - "loss": 0.3348, - "num_tokens": 38196458.0, + "epoch": 0.9184782608695652, + "grad_norm": 0.4702511572635688, + "learning_rate": 8.343904326974409e-06, + "loss": 1.0838, + "num_tokens": 140794223.0, "step": 169 }, { - "epoch": 1.578088578088578, - "grad_norm": 0.5333527697636569, - "learning_rate": 5.409960359707476e-06, - "loss": 0.3518, - "num_tokens": 38427692.0, + "epoch": 0.9239130434782609, + "grad_norm": 0.47600806420109454, + "learning_rate": 8.323376716702236e-06, + "loss": 1.1584, + "num_tokens": 141671790.0, "step": 170 }, { - "epoch": 1.5874125874125875, - "grad_norm": 0.5209184747461538, - "learning_rate": 5.3649518059553e-06, - "loss": 0.327, - "num_tokens": 38652447.0, + "epoch": 0.9293478260869565, + "grad_norm": 0.4892484481019011, + "learning_rate": 8.302751751093539e-06, + "loss": 1.087, + "num_tokens": 142589193.0, "step": 171 }, { - "epoch": 1.5967365967365967, - "grad_norm": 0.5072928544219357, - "learning_rate": 5.319956770612915e-06, - "loss": 0.3249, - "num_tokens": 38882101.0, + "epoch": 0.9347826086956522, + "grad_norm": 0.39811210734784225, + "learning_rate": 8.282030141335899e-06, + "loss": 1.1127, + "num_tokens": 143424641.0, "step": 172 }, { - "epoch": 1.606060606060606, - "grad_norm": 0.5186576324632163, - "learning_rate": 5.274979757711868e-06, - "loss": 0.3378, - "num_tokens": 39103980.0, + "epoch": 0.9402173913043478, + "grad_norm": 0.4735646138056289, + "learning_rate": 8.261212601949374e-06, + "loss": 1.1186, + "num_tokens": 144254056.0, "step": 173 }, { - "epoch": 1.6153846153846154, - "grad_norm": 0.5457391949139522, - "learning_rate": 5.230025269479649e-06, - "loss": 0.3495, - "num_tokens": 39327215.0, + "epoch": 0.9456521739130435, + "grad_norm": 0.43457638568705226, + "learning_rate": 8.240299850761851e-06, + "loss": 1.1558, + "num_tokens": 145058100.0, "step": 174 }, { - "epoch": 1.6247086247086249, - "grad_norm": 0.5257683547837523, - "learning_rate": 5.185097805889014e-06, - "loss": 0.324, - "num_tokens": 39543065.0, + "epoch": 0.9510869565217391, + "grad_norm": 0.5474669838467183, + "learning_rate": 8.219292608884309e-06, + "loss": 1.0842, + "num_tokens": 145874986.0, "step": 175 }, { - "epoch": 1.6340326340326339, - "grad_norm": 0.5184260973571639, - "learning_rate": 5.1402018642075336e-06, - "loss": 0.3494, - "num_tokens": 39767880.0, + "epoch": 0.9565217391304348, + "grad_norm": 0.42042241204376524, + "learning_rate": 8.198191600685931e-06, + "loss": 1.0286, + "num_tokens": 146747131.0, "step": 176 }, { - "epoch": 1.6433566433566433, - "grad_norm": 0.5190968681145691, - "learning_rate": 5.095341938547416e-06, - "loss": 0.3211, - "num_tokens": 39995732.0, + "epoch": 0.9619565217391305, + "grad_norm": 0.4401784756495304, + "learning_rate": 8.176997553769146e-06, + "loss": 1.0931, + "num_tokens": 147579942.0, "step": 177 }, { - "epoch": 1.6526806526806528, - "grad_norm": 0.4893090655080212, - "learning_rate": 5.050522519415646e-06, - "loss": 0.3272, - "num_tokens": 40222806.0, + "epoch": 0.967391304347826, + "grad_norm": 0.4198879718494924, + "learning_rate": 8.155711198944536e-06, + "loss": 1.0821, + "num_tokens": 148454246.0, "step": 178 }, { - "epoch": 1.662004662004662, - "grad_norm": 0.4951936701558321, - "learning_rate": 5.005748093264473e-06, - "loss": 0.3265, - "num_tokens": 40450154.0, + "epoch": 0.9728260869565217, + "grad_norm": 0.42262883519991834, + "learning_rate": 8.134333270205624e-06, + "loss": 1.1195, + "num_tokens": 149217392.0, "step": 179 }, { - "epoch": 1.6713286713286712, - "grad_norm": 0.4922175483307558, - "learning_rate": 4.961023142042329e-06, - "loss": 0.3305, - "num_tokens": 40679571.0, + "epoch": 0.9782608695652174, + "grad_norm": 0.43257345855354584, + "learning_rate": 8.112864504703582e-06, + "loss": 1.1016, + "num_tokens": 150120592.0, "step": 180 }, { - "epoch": 1.6806526806526807, - "grad_norm": 0.5225798586624328, - "learning_rate": 4.916352142745163e-06, - "loss": 0.3344, - "num_tokens": 40915005.0, + "epoch": 0.9836956521739131, + "grad_norm": 0.5443709834488111, + "learning_rate": 8.0913056427218e-06, + "loss": 1.0348, + "num_tokens": 151030177.0, "step": 181 }, { - "epoch": 1.68997668997669, - "grad_norm": 0.5228426979014617, - "learning_rate": 4.871739566968315e-06, - "loss": 0.3192, - "num_tokens": 41145279.0, + "epoch": 0.9891304347826086, + "grad_norm": 0.45036072810937827, + "learning_rate": 8.069657427650364e-06, + "loss": 1.0514, + "num_tokens": 151868928.0, "step": 182 }, { - "epoch": 1.6993006993006992, - "grad_norm": 0.5083022419487268, - "learning_rate": 4.8271898804588825e-06, - "loss": 0.3254, - "num_tokens": 41370598.0, + "epoch": 0.9945652173913043, + "grad_norm": 0.6115163980654007, + "learning_rate": 8.047920605960428e-06, + "loss": 1.0633, + "num_tokens": 152709677.0, "step": 183 }, { - "epoch": 1.7086247086247086, - "grad_norm": 0.5201836986924885, - "learning_rate": 4.782707542668715e-06, - "loss": 0.3311, - "num_tokens": 41603620.0, + "epoch": 1.0, + "grad_norm": 0.4423384270708878, + "learning_rate": 8.026095927178458e-06, + "loss": 1.0377, + "num_tokens": 153598466.0, + "step": 184 + }, + { + "epoch": 1.0, + "eval_loss": 0.8116728663444519, + "eval_num_tokens": 153598466.0, + "eval_runtime": 116.3987, + "eval_samples_per_second": 44.94, + "eval_steps_per_second": 5.619, "step": 184 }, { - "epoch": 1.717948717948718, - "grad_norm": 0.48151780775578606, - "learning_rate": 4.738297006308008e-06, - "loss": 0.3208, - "num_tokens": 41838817.0, + "epoch": 1.0054347826086956, + "grad_norm": 0.7729891726078799, + "learning_rate": 8.004184143860408e-06, + "loss": 1.0648, + "num_tokens": 154435241.0, "step": 185 }, { - "epoch": 1.7272727272727273, - "grad_norm": 0.5356315914755293, - "learning_rate": 4.6939627168995915e-06, - "loss": 0.3288, - "num_tokens": 42060021.0, + "epoch": 1.0108695652173914, + "grad_norm": 0.5867866688003616, + "learning_rate": 7.982186011565755e-06, + "loss": 0.9857, + "num_tokens": 155233012.0, "step": 186 }, { - "epoch": 1.7365967365967365, - "grad_norm": 0.514889519867123, - "learning_rate": 4.649709112333923e-06, - "loss": 0.328, - "num_tokens": 42284510.0, + "epoch": 1.016304347826087, + "grad_norm": 0.748318392981897, + "learning_rate": 7.960102288831454e-06, + "loss": 1.0936, + "num_tokens": 156068245.0, "step": 187 }, { - "epoch": 1.745920745920746, - "grad_norm": 0.4724010610876581, - "learning_rate": 4.605540622424862e-06, - "loss": 0.3287, - "num_tokens": 42518962.0, + "epoch": 1.0217391304347827, + "grad_norm": 0.6766886272953564, + "learning_rate": 7.937933737145777e-06, + "loss": 1.0318, + "num_tokens": 156917237.0, "step": 188 }, { - "epoch": 1.7552447552447552, - "grad_norm": 0.49538080834069576, - "learning_rate": 4.561461668466233e-06, - "loss": 0.3316, - "num_tokens": 42744627.0, + "epoch": 1.0271739130434783, + "grad_norm": 0.5751547728567178, + "learning_rate": 7.915681120922055e-06, + "loss": 1.0895, + "num_tokens": 157726124.0, "step": 189 }, { - "epoch": 1.7645687645687644, - "grad_norm": 0.5024879109051644, - "learning_rate": 4.517476662789257e-06, - "loss": 0.3219, - "num_tokens": 42968537.0, + "epoch": 1.0326086956521738, + "grad_norm": 0.652421513757397, + "learning_rate": 7.893345207472329e-06, + "loss": 1.0166, + "num_tokens": 158541791.0, "step": 190 }, { - "epoch": 1.7738927738927739, - "grad_norm": 0.4886882856604164, - "learning_rate": 4.473590008320868e-06, - "loss": 0.3311, - "num_tokens": 43186024.0, + "epoch": 1.0380434782608696, + "grad_norm": 0.7072500365040744, + "learning_rate": 7.870926766980879e-06, + "loss": 1.0475, + "num_tokens": 159364181.0, "step": 191 }, { - "epoch": 1.7832167832167833, - "grad_norm": 0.5277124654943276, - "learning_rate": 4.429806098142989e-06, - "loss": 0.3259, - "num_tokens": 43406230.0, + "epoch": 1.0434782608695652, + "grad_norm": 0.6233691645635457, + "learning_rate": 7.848426572477677e-06, + "loss": 1.0351, + "num_tokens": 160208993.0, "step": 192 }, { - "epoch": 1.7925407925407926, - "grad_norm": 0.49430765763937234, - "learning_rate": 4.386129315052768e-06, - "loss": 0.3263, - "num_tokens": 43632452.0, + "epoch": 1.048913043478261, + "grad_norm": 0.48198089839824576, + "learning_rate": 7.825845399811723e-06, + "loss": 1.0098, + "num_tokens": 161011393.0, "step": 193 }, { - "epoch": 1.8018648018648018, - "grad_norm": 0.4916236071607527, - "learning_rate": 4.3425640311238695e-06, - "loss": 0.3281, - "num_tokens": 43860663.0, + "epoch": 1.0543478260869565, + "grad_norm": 0.4595646765612945, + "learning_rate": 7.8031840276243e-06, + "loss": 1.0633, + "num_tokens": 161890267.0, "step": 194 }, { - "epoch": 1.8111888111888113, - "grad_norm": 0.5515259138593007, - "learning_rate": 4.299114607268814e-06, - "loss": 0.3434, - "num_tokens": 44085405.0, + "epoch": 1.059782608695652, + "grad_norm": 0.5917347479597387, + "learning_rate": 7.78044323732212e-06, + "loss": 0.9305, + "num_tokens": 162691891.0, "step": 195 }, { - "epoch": 1.8205128205128205, - "grad_norm": 0.4896559926348576, - "learning_rate": 4.255785392802464e-06, - "loss": 0.3309, - "num_tokens": 44311745.0, + "epoch": 1.065217391304348, + "grad_norm": 0.5153644484556954, + "learning_rate": 7.75762381305038e-06, + "loss": 1.0327, + "num_tokens": 163497163.0, "step": 196 }, { - "epoch": 1.8298368298368297, - "grad_norm": 0.4748622553446698, - "learning_rate": 4.212580725006635e-06, - "loss": 0.3122, - "num_tokens": 44541583.0, + "epoch": 1.0706521739130435, + "grad_norm": 0.6049212399144008, + "learning_rate": 7.734726541665722e-06, + "loss": 0.9907, + "num_tokens": 164379270.0, "step": 197 }, { - "epoch": 1.8391608391608392, - "grad_norm": 0.5235696028853547, - "learning_rate": 4.169504928695956e-06, - "loss": 0.3343, - "num_tokens": 44758719.0, + "epoch": 1.0760869565217392, + "grad_norm": 0.4380348492738434, + "learning_rate": 7.711752212709106e-06, + "loss": 1.0875, + "num_tokens": 165182653.0, "step": 198 }, { - "epoch": 1.8484848484848486, - "grad_norm": 0.4799565738729803, - "learning_rate": 4.126562315784924e-06, - "loss": 0.3335, - "num_tokens": 44984400.0, + "epoch": 1.0815217391304348, + "grad_norm": 0.644663054623725, + "learning_rate": 7.688701618378583e-06, + "loss": 0.9748, + "num_tokens": 165982495.0, "step": 199 }, { - "epoch": 1.8578088578088578, - "grad_norm": 0.47269486609129163, - "learning_rate": 4.083757184856304e-06, - "loss": 0.3316, - "num_tokens": 45202481.0, + "epoch": 1.0869565217391304, + "grad_norm": 0.4810295772680435, + "learning_rate": 7.665575553501973e-06, + "loss": 1.0002, + "num_tokens": 166812144.0, "step": 200 }, { - "epoch": 1.867132867132867, - "grad_norm": 0.5004916428890924, - "learning_rate": 4.041093820730821e-06, - "loss": 0.3305, - "num_tokens": 45424342.0, + "epoch": 1.0923913043478262, + "grad_norm": 0.632025835493392, + "learning_rate": 7.64237481550947e-06, + "loss": 1.0258, + "num_tokens": 167689643.0, "step": 201 }, { - "epoch": 1.8764568764568765, - "grad_norm": 0.49094397631517767, - "learning_rate": 3.99857649403826e-06, - "loss": 0.3252, - "num_tokens": 45646541.0, + "epoch": 1.0978260869565217, + "grad_norm": 0.47775863822859244, + "learning_rate": 7.619100204406127e-06, + "loss": 0.9575, + "num_tokens": 168572405.0, "step": 202 }, { - "epoch": 1.8857808857808858, - "grad_norm": 0.48010716713043344, - "learning_rate": 3.956209460789957e-06, - "loss": 0.3201, - "num_tokens": 45865887.0, + "epoch": 1.1032608695652173, + "grad_norm": 0.7719138228541088, + "learning_rate": 7.595752522744287e-06, + "loss": 0.9942, + "num_tokens": 169473211.0, "step": 203 }, { - "epoch": 1.895104895104895, - "grad_norm": 0.5003078282639417, - "learning_rate": 3.913996961952789e-06, - "loss": 0.3345, - "num_tokens": 46090891.0, - "step": 204 - }, - { - "epoch": 1.895104895104895, - "eval_loss": 0.3945937752723694, - "eval_num_tokens": 46090891.0, - "eval_runtime": 44.9767, - "eval_samples_per_second": 67.835, - "eval_steps_per_second": 8.493, + "epoch": 1.108695652173913, + "grad_norm": 0.4601408289120345, + "learning_rate": 7.572332575595904e-06, + "loss": 0.98, + "num_tokens": 170315046.0, "step": 204 }, { - "epoch": 1.9044289044289044, - "grad_norm": 0.5030448753628592, - "learning_rate": 3.871943223024632e-06, - "loss": 0.3273, - "num_tokens": 46333399.0, + "epoch": 1.1141304347826086, + "grad_norm": 0.6828651716457399, + "learning_rate": 7.548841170524779e-06, + "loss": 0.8964, + "num_tokens": 171200672.0, "step": 205 }, { - "epoch": 1.913752913752914, - "grad_norm": 0.4936076223940164, - "learning_rate": 3.8300524536114004e-06, - "loss": 0.3376, - "num_tokens": 46551999.0, + "epoch": 1.1195652173913044, + "grad_norm": 0.5713003323730678, + "learning_rate": 7.525279117558719e-06, + "loss": 0.998, + "num_tokens": 172044295.0, "step": 206 }, { - "epoch": 1.9230769230769231, - "grad_norm": 0.46513542076706943, - "learning_rate": 3.7883288470056543e-06, - "loss": 0.3184, - "num_tokens": 46782533.0, + "epoch": 1.125, + "grad_norm": 0.7426722544915143, + "learning_rate": 7.501647229161599e-06, + "loss": 1.0206, + "num_tokens": 172855232.0, "step": 207 }, { - "epoch": 1.9324009324009324, - "grad_norm": 0.5034814125152446, - "learning_rate": 3.746776579766851e-06, - "loss": 0.3301, - "num_tokens": 47010120.0, + "epoch": 1.1304347826086956, + "grad_norm": 0.7673163483132489, + "learning_rate": 7.477946320205358e-06, + "loss": 1.0895, + "num_tokens": 173728391.0, "step": 208 }, { - "epoch": 1.9417249417249418, - "grad_norm": 0.5131605114056567, - "learning_rate": 3.7053998113032695e-06, - "loss": 0.3218, - "num_tokens": 47236233.0, + "epoch": 1.1358695652173914, + "grad_norm": 0.5298469653563895, + "learning_rate": 7.454177207941884e-06, + "loss": 0.9346, + "num_tokens": 174587763.0, "step": 209 }, { - "epoch": 1.951048951048951, - "grad_norm": 0.47485454111075154, - "learning_rate": 3.6642026834556488e-06, - "loss": 0.3238, - "num_tokens": 47461465.0, + "epoch": 1.141304347826087, + "grad_norm": 0.6057350246674399, + "learning_rate": 7.430340711974855e-06, + "loss": 1.0299, + "num_tokens": 175483652.0, "step": 210 }, { - "epoch": 1.9603729603729603, - "grad_norm": 0.5481840012126614, - "learning_rate": 3.6231893200825917e-06, - "loss": 0.3354, - "num_tokens": 47679615.0, + "epoch": 1.1467391304347827, + "grad_norm": 0.46922407617378503, + "learning_rate": 7.406437654231453e-06, + "loss": 1.0501, + "num_tokens": 176272672.0, "step": 211 }, { - "epoch": 1.9696969696969697, - "grad_norm": 0.5149177715727978, - "learning_rate": 3.582363826647756e-06, - "loss": 0.3281, - "num_tokens": 47906581.0, + "epoch": 1.1521739130434783, + "grad_norm": 0.6030403553630953, + "learning_rate": 7.382468858934046e-06, + "loss": 1.017, + "num_tokens": 177048310.0, "step": 212 }, { - "epoch": 1.9790209790209792, - "grad_norm": 0.488341341221377, - "learning_rate": 3.5417302898089e-06, - "loss": 0.3084, - "num_tokens": 48135155.0, + "epoch": 1.1576086956521738, + "grad_norm": 0.43230362489103435, + "learning_rate": 7.358435152571749e-06, + "loss": 1.008, + "num_tokens": 177839107.0, "step": 213 }, { - "epoch": 1.9883449883449882, - "grad_norm": 0.4755923848180776, - "learning_rate": 3.501292777008811e-06, - "loss": 0.3251, - "num_tokens": 48356662.0, + "epoch": 1.1630434782608696, + "grad_norm": 0.5448245892571197, + "learning_rate": 7.334337363871936e-06, + "loss": 0.8768, + "num_tokens": 178618954.0, "step": 214 }, { - "epoch": 1.9976689976689976, - "grad_norm": 0.4928939994589957, - "learning_rate": 3.461055336068141e-06, - "loss": 0.319, - "num_tokens": 48592154.0, + "epoch": 1.1684782608695652, + "grad_norm": 0.4880077220539455, + "learning_rate": 7.310176323771663e-06, + "loss": 0.9951, + "num_tokens": 179430065.0, "step": 215 }, { - "epoch": 2.0, - "grad_norm": 0.4928939994589957, - "learning_rate": 3.4210219947802214e-06, - "loss": 0.3182, - "num_tokens": 48651124.0, + "epoch": 1.1739130434782608, + "grad_norm": 0.4876168134427255, + "learning_rate": 7.285952865389007e-06, + "loss": 0.9202, + "num_tokens": 180289162.0, "step": 216 }, { - "epoch": 2.0093240093240095, - "grad_norm": 1.0077926731304525, - "learning_rate": 3.38119676050788e-06, - "loss": 0.2658, - "num_tokens": 48884071.0, + "epoch": 1.1793478260869565, + "grad_norm": 0.469527959973456, + "learning_rate": 7.261667823994351e-06, + "loss": 0.9602, + "num_tokens": 181130219.0, "step": 217 }, { - "epoch": 2.0186480186480185, - "grad_norm": 0.536857443533564, - "learning_rate": 3.341583619782304e-06, - "loss": 0.2712, - "num_tokens": 49121191.0, + "epoch": 1.184782608695652, + "grad_norm": 0.4953909092855739, + "learning_rate": 7.237322036981568e-06, + "loss": 1.0315, + "num_tokens": 181946236.0, "step": 218 }, { - "epoch": 2.027972027972028, - "grad_norm": 0.4931471166108591, - "learning_rate": 3.3021865379039765e-06, - "loss": 0.265, - "num_tokens": 49336073.0, + "epoch": 1.190217391304348, + "grad_norm": 0.4774488084639226, + "learning_rate": 7.212916343839163e-06, + "loss": 0.9541, + "num_tokens": 182783428.0, "step": 219 }, { - "epoch": 2.0372960372960374, - "grad_norm": 0.4722559359039849, - "learning_rate": 3.2630094585457583e-06, - "loss": 0.2754, - "num_tokens": 49557586.0, + "epoch": 1.1956521739130435, + "grad_norm": 0.47379668683218334, + "learning_rate": 7.188451586121312e-06, + "loss": 0.9369, + "num_tokens": 183616914.0, "step": 220 }, { - "epoch": 2.046620046620047, - "grad_norm": 0.45498374408979997, - "learning_rate": 3.2240563033581117e-06, - "loss": 0.2676, - "num_tokens": 49783374.0, + "epoch": 1.2010869565217392, + "grad_norm": 0.511952905506674, + "learning_rate": 7.163928607418849e-06, + "loss": 0.9844, + "num_tokens": 184505786.0, "step": 221 }, { - "epoch": 2.055944055944056, - "grad_norm": 0.443089196299474, - "learning_rate": 3.1853309715765567e-06, - "loss": 0.2628, - "num_tokens": 50015143.0, + "epoch": 1.2065217391304348, + "grad_norm": 0.49063209058524954, + "learning_rate": 7.139348253330177e-06, + "loss": 0.9794, + "num_tokens": 185345903.0, "step": 222 }, { - "epoch": 2.0652680652680653, - "grad_norm": 0.4593634390850791, - "learning_rate": 3.14683733963134e-06, - "loss": 0.2724, - "num_tokens": 50248084.0, + "epoch": 1.2119565217391304, + "grad_norm": 0.4803261309055045, + "learning_rate": 7.114711371432113e-06, + "loss": 0.9172, + "num_tokens": 186214745.0, "step": 223 }, { - "epoch": 2.0745920745920747, - "grad_norm": 0.4968173973247625, - "learning_rate": 3.108579260759409e-06, - "loss": 0.2718, - "num_tokens": 50466092.0, + "epoch": 1.2173913043478262, + "grad_norm": 0.4684467286790229, + "learning_rate": 7.090018811250653e-06, + "loss": 1.0251, + "num_tokens": 187078658.0, "step": 224 }, { - "epoch": 2.0839160839160837, - "grad_norm": 0.48173251591172095, - "learning_rate": 3.0705605646186966e-06, - "loss": 0.2705, - "num_tokens": 50700030.0, + "epoch": 1.2228260869565217, + "grad_norm": 0.45476529917577446, + "learning_rate": 7.065271424231694e-06, + "loss": 0.9518, + "num_tokens": 187868285.0, "step": 225 }, { - "epoch": 2.093240093240093, - "grad_norm": 0.47536341261873954, - "learning_rate": 3.0327850569047803e-06, - "loss": 0.2754, - "num_tokens": 50926029.0, + "epoch": 1.2282608695652173, + "grad_norm": 0.46693532592731146, + "learning_rate": 7.040470063711655e-06, + "loss": 0.9018, + "num_tokens": 188693201.0, "step": 226 }, { - "epoch": 2.1025641025641026, - "grad_norm": 0.48555103105685965, - "learning_rate": 2.995256518969914e-06, - "loss": 0.2665, - "num_tokens": 51151982.0, + "epoch": 1.233695652173913, + "grad_norm": 0.4930658690295571, + "learning_rate": 7.015615584888072e-06, + "loss": 0.982, + "num_tokens": 189553180.0, "step": 227 }, { - "epoch": 2.111888111888112, - "grad_norm": 0.44975846542574716, - "learning_rate": 2.9579787074445244e-06, - "loss": 0.2662, - "num_tokens": 51365485.0, + "epoch": 1.2391304347826086, + "grad_norm": 0.4835466860532625, + "learning_rate": 6.990708844790093e-06, + "loss": 0.9711, + "num_tokens": 190386572.0, "step": 228 }, { - "epoch": 2.121212121212121, - "grad_norm": 0.44850869772061025, - "learning_rate": 2.9209553538611634e-06, - "loss": 0.269, - "num_tokens": 51594394.0, + "epoch": 1.2445652173913044, + "grad_norm": 0.4852934677176225, + "learning_rate": 6.965750702248936e-06, + "loss": 0.9742, + "num_tokens": 191209747.0, "step": 229 }, { - "epoch": 2.1305361305361306, - "grad_norm": 0.4388553942308307, - "learning_rate": 2.8841901642809843e-06, - "loss": 0.2691, - "num_tokens": 51823497.0, + "epoch": 1.25, + "grad_norm": 0.41281861954385113, + "learning_rate": 6.940742017868274e-06, + "loss": 1.0304, + "num_tokens": 192083591.0, "step": 230 }, { - "epoch": 2.13986013986014, - "grad_norm": 0.433932495158785, - "learning_rate": 2.8476868189227603e-06, - "loss": 0.267, - "num_tokens": 52059777.0, + "epoch": 1.2554347826086958, + "grad_norm": 0.5379830623809766, + "learning_rate": 6.9156836539945535e-06, + "loss": 0.9603, + "num_tokens": 192952509.0, "step": 231 }, { - "epoch": 2.149184149184149, - "grad_norm": 0.4570113864425228, - "learning_rate": 2.811448971794487e-06, - "loss": 0.2628, - "num_tokens": 52291004.0, + "epoch": 1.2608695652173914, + "grad_norm": 0.4583357149101248, + "learning_rate": 6.890576474687264e-06, + "loss": 0.9592, + "num_tokens": 193784622.0, "step": 232 }, { - "epoch": 2.1585081585081585, - "grad_norm": 0.4454627004878239, - "learning_rate": 2.7754802503276235e-06, - "loss": 0.2661, - "num_tokens": 52510106.0, + "epoch": 1.266304347826087, + "grad_norm": 0.49248565076137285, + "learning_rate": 6.865421345689147e-06, + "loss": 1.007, + "num_tokens": 194645421.0, "step": 233 }, { - "epoch": 2.167832167832168, - "grad_norm": 0.4266944564191006, - "learning_rate": 2.7397842550139813e-06, - "loss": 0.2762, - "num_tokens": 52740260.0, + "epoch": 1.2717391304347827, + "grad_norm": 0.503639933708953, + "learning_rate": 6.840219134396334e-06, + "loss": 1.0084, + "num_tokens": 195464421.0, "step": 234 }, { - "epoch": 2.177156177156177, - "grad_norm": 0.4828957968087209, - "learning_rate": 2.7043645590453067e-06, - "loss": 0.2713, - "num_tokens": 52965642.0, + "epoch": 1.2771739130434783, + "grad_norm": 0.5216304382891217, + "learning_rate": 6.814970709828448e-06, + "loss": 0.9359, + "num_tokens": 196232757.0, "step": 235 }, { - "epoch": 2.1864801864801864, - "grad_norm": 0.4377873737923911, - "learning_rate": 2.669224707955608e-06, - "loss": 0.2592, - "num_tokens": 53204133.0, + "epoch": 1.2826086956521738, + "grad_norm": 0.45832000640398235, + "learning_rate": 6.789676942598626e-06, + "loss": 1.019, + "num_tokens": 196993328.0, "step": 236 }, { - "epoch": 2.195804195804196, - "grad_norm": 0.4356864024809861, - "learning_rate": 2.6343682192662434e-06, - "loss": 0.2638, - "num_tokens": 53442257.0, + "epoch": 1.2880434782608696, + "grad_norm": 0.4888056205645728, + "learning_rate": 6.764338704883511e-06, + "loss": 0.9833, + "num_tokens": 197842972.0, "step": 237 }, { - "epoch": 2.2051282051282053, - "grad_norm": 0.44278157572380683, - "learning_rate": 2.5997985821338183e-06, - "loss": 0.2693, - "num_tokens": 53668484.0, + "epoch": 1.2934782608695652, + "grad_norm": 0.424259471078176, + "learning_rate": 6.73895687039317e-06, + "loss": 1.0477, + "num_tokens": 198775071.0, "step": 238 }, { - "epoch": 2.2144522144522143, - "grad_norm": 0.4620625252037188, - "learning_rate": 2.5655192570009124e-06, - "loss": 0.2758, - "num_tokens": 53888930.0, + "epoch": 1.2989130434782608, + "grad_norm": 0.4738026617794314, + "learning_rate": 6.713532314340968e-06, + "loss": 1.0088, + "num_tokens": 199605517.0, "step": 239 }, { - "epoch": 2.2237762237762237, - "grad_norm": 0.45332648933552794, - "learning_rate": 2.531533675249691e-06, - "loss": 0.2593, - "num_tokens": 54104360.0, + "epoch": 1.3043478260869565, + "grad_norm": 0.4492262228667904, + "learning_rate": 6.688065913413391e-06, + "loss": 1.0076, + "num_tokens": 200462948.0, "step": 240 }, { - "epoch": 2.233100233100233, - "grad_norm": 0.438761293853775, - "learning_rate": 2.4978452388584192e-06, - "loss": 0.2653, - "num_tokens": 54344051.0, + "epoch": 1.309782608695652, + "grad_norm": 0.5266897333579397, + "learning_rate": 6.662558545739812e-06, + "loss": 0.9553, + "num_tokens": 201254553.0, "step": 241 }, { - "epoch": 2.242424242424242, - "grad_norm": 0.42897612399239216, - "learning_rate": 2.464457320060929e-06, - "loss": 0.2567, - "num_tokens": 54575177.0, + "epoch": 1.315217391304348, + "grad_norm": 0.5044655819460336, + "learning_rate": 6.637011090862219e-06, + "loss": 0.9178, + "num_tokens": 202117833.0, "step": 242 }, { - "epoch": 2.2517482517482517, - "grad_norm": 0.45297972683885784, - "learning_rate": 2.4313732610090438e-06, - "loss": 0.268, - "num_tokens": 54806777.0, + "epoch": 1.3206521739130435, + "grad_norm": 0.4926047411840709, + "learning_rate": 6.611424429704879e-06, + "loss": 1.0102, + "num_tokens": 202958199.0, "step": 243 }, { - "epoch": 2.261072261072261, - "grad_norm": 0.4411649453120093, - "learning_rate": 2.398596373438038e-06, - "loss": 0.2583, - "num_tokens": 55037822.0, + "epoch": 1.3260869565217392, + "grad_norm": 0.44629993398334267, + "learning_rate": 6.585799444543967e-06, + "loss": 1.0296, + "num_tokens": 203865043.0, "step": 244 }, { - "epoch": 2.2703962703962706, - "grad_norm": 0.4345572710831335, - "learning_rate": 2.366129938335123e-06, - "loss": 0.2629, - "num_tokens": 55265922.0, + "epoch": 1.3315217391304348, + "grad_norm": 0.48718773409967614, + "learning_rate": 6.560137018977139e-06, + "loss": 0.9706, + "num_tokens": 204697445.0, "step": 245 }, { - "epoch": 2.2797202797202796, - "grad_norm": 0.42869651807401005, - "learning_rate": 2.3339772056110278e-06, - "loss": 0.2793, - "num_tokens": 55492290.0, + "epoch": 1.3369565217391304, + "grad_norm": 0.5347335066897773, + "learning_rate": 6.53443803789307e-06, + "loss": 0.931, + "num_tokens": 205550423.0, "step": 246 }, { - "epoch": 2.289044289044289, - "grad_norm": 0.4420299559105195, - "learning_rate": 2.302141393774666e-06, - "loss": 0.2615, - "num_tokens": 55725914.0, + "epoch": 1.3423913043478262, + "grad_norm": 0.4835007860499471, + "learning_rate": 6.5087033874409354e-06, + "loss": 1.0485, + "num_tokens": 206361818.0, "step": 247 }, { - "epoch": 2.2983682983682985, - "grad_norm": 0.4321678479969795, - "learning_rate": 2.2706256896109774e-06, - "loss": 0.2716, - "num_tokens": 55951887.0, + "epoch": 1.3478260869565217, + "grad_norm": 0.49682989878458295, + "learning_rate": 6.482933954999858e-06, + "loss": 0.9569, + "num_tokens": 207162202.0, "step": 248 }, { - "epoch": 2.3076923076923075, - "grad_norm": 0.441066204708253, - "learning_rate": 2.239433247861915e-06, - "loss": 0.259, - "num_tokens": 56172209.0, + "epoch": 1.3532608695652173, + "grad_norm": 0.48604607203541333, + "learning_rate": 6.457130629148312e-06, + "loss": 1.0242, + "num_tokens": 207982118.0, "step": 249 }, { - "epoch": 2.317016317016317, - "grad_norm": 0.43938915815345186, - "learning_rate": 2.208567190910663e-06, - "loss": 0.2687, - "num_tokens": 56389635.0, + "epoch": 1.358695652173913, + "grad_norm": 0.49136925224395633, + "learning_rate": 6.431294299633473e-06, + "loss": 0.9358, + "num_tokens": 208798264.0, "step": 250 }, { - "epoch": 2.3263403263403264, - "grad_norm": 0.4540274939571687, - "learning_rate": 2.1780306084690794e-06, - "loss": 0.2661, - "num_tokens": 56609219.0, + "epoch": 1.3641304347826086, + "grad_norm": 0.5135733708571742, + "learning_rate": 6.405425857340554e-06, + "loss": 0.8883, + "num_tokens": 209645647.0, "step": 251 }, { - "epoch": 2.335664335664336, - "grad_norm": 0.45455353071523924, - "learning_rate": 2.1478265572684142e-06, - "loss": 0.2647, - "num_tokens": 56831374.0, + "epoch": 1.3695652173913042, + "grad_norm": 0.5748699167895962, + "learning_rate": 6.3795261942620665e-06, + "loss": 0.831, + "num_tokens": 210500505.0, "step": 252 }, { - "epoch": 2.344988344988345, - "grad_norm": 0.4143941270620651, - "learning_rate": 2.1179580607533284e-06, - "loss": 0.2608, - "num_tokens": 57060957.0, + "epoch": 1.375, + "grad_norm": 0.4347647256494062, + "learning_rate": 6.353596203467085e-06, + "loss": 1.0276, + "num_tokens": 211352989.0, "step": 253 }, { - "epoch": 2.3543123543123543, - "grad_norm": 0.4388946585859175, - "learning_rate": 2.088428108779251e-06, - "loss": 0.2678, - "num_tokens": 57297316.0, + "epoch": 1.3804347826086958, + "grad_norm": 0.5593692195039036, + "learning_rate": 6.3276367790704315e-06, + "loss": 0.9188, + "num_tokens": 212210323.0, "step": 254 }, { - "epoch": 2.3636363636363638, - "grad_norm": 0.4171222019484084, - "learning_rate": 2.059239657313084e-06, - "loss": 0.2686, - "num_tokens": 57528400.0, + "epoch": 1.3858695652173914, + "grad_norm": 0.501586708029447, + "learning_rate": 6.30164881620186e-06, + "loss": 0.9177, + "num_tokens": 212999101.0, "step": 255 }, { - "epoch": 2.3636363636363638, - "eval_loss": 0.4000723958015442, - "eval_num_tokens": 57528400.0, - "eval_runtime": 45.0359, - "eval_samples_per_second": 67.746, - "eval_steps_per_second": 8.482, - "step": 255 - }, - { - "epoch": 2.3729603729603728, - "grad_norm": 0.42332245702440635, - "learning_rate": 2.0303956281373132e-06, - "loss": 0.2688, - "num_tokens": 57762038.0, + "epoch": 1.391304347826087, + "grad_norm": 0.4761388526416261, + "learning_rate": 6.275633210975179e-06, + "loss": 0.9635, + "num_tokens": 213807539.0, "step": 256 }, { - "epoch": 2.382284382284382, - "grad_norm": 0.4274471409323921, - "learning_rate": 2.001898908557533e-06, - "loss": 0.2671, - "num_tokens": 57993540.0, + "epoch": 1.3967391304347827, + "grad_norm": 0.7079104884965576, + "learning_rate": 6.249590860457362e-06, + "loss": 0.936, + "num_tokens": 214563435.0, "step": 257 }, { - "epoch": 2.3916083916083917, - "grad_norm": 0.44507562857654975, - "learning_rate": 1.9737523511134322e-06, - "loss": 0.2643, - "num_tokens": 58216266.0, + "epoch": 1.4021739130434783, + "grad_norm": 0.4607025462309026, + "learning_rate": 6.2235226626376075e-06, + "loss": 0.977, + "num_tokens": 215403936.0, "step": 258 }, { - "epoch": 2.400932400932401, - "grad_norm": 0.4466201275079433, - "learning_rate": 1.9459587732932427e-06, - "loss": 0.2621, - "num_tokens": 58449705.0, + "epoch": 1.4076086956521738, + "grad_norm": 0.5992927631339661, + "learning_rate": 6.19742951639638e-06, + "loss": 0.8826, + "num_tokens": 216186119.0, "step": 259 }, { - "epoch": 2.41025641025641, - "grad_norm": 0.40869783871580784, - "learning_rate": 1.918520957251716e-06, - "loss": 0.2647, - "num_tokens": 58680399.0, + "epoch": 1.4130434782608696, + "grad_norm": 0.47309177952769493, + "learning_rate": 6.171312321474413e-06, + "loss": 1.0086, + "num_tokens": 217057725.0, "step": 260 }, { - "epoch": 2.4195804195804196, - "grad_norm": 0.4537001488141776, - "learning_rate": 1.8914416495316201e-06, - "loss": 0.2621, - "num_tokens": 58906596.0, + "epoch": 1.4184782608695652, + "grad_norm": 0.5947429552853162, + "learning_rate": 6.1451719784416775e-06, + "loss": 0.9028, + "num_tokens": 217861129.0, "step": 261 }, { - "epoch": 2.428904428904429, - "grad_norm": 0.4337917513928828, - "learning_rate": 1.8647235607888192e-06, - "loss": 0.2598, - "num_tokens": 59141325.0, + "epoch": 1.4239130434782608, + "grad_norm": 0.4605662311729455, + "learning_rate": 6.119009388666344e-06, + "loss": 0.9635, + "num_tokens": 218696929.0, "step": 262 }, { - "epoch": 2.438228438228438, - "grad_norm": 0.41191514199500184, - "learning_rate": 1.8383693655209223e-06, - "loss": 0.2684, - "num_tokens": 59368701.0, + "epoch": 1.4293478260869565, + "grad_norm": 0.4539223081529101, + "learning_rate": 6.0928254542836855e-06, + "loss": 1.0757, + "num_tokens": 219507650.0, "step": 263 }, { - "epoch": 2.4475524475524475, - "grad_norm": 0.4342780664561112, - "learning_rate": 1.8123817017995754e-06, - "loss": 0.2628, - "num_tokens": 59597922.0, + "epoch": 1.434782608695652, + "grad_norm": 0.5214888309662288, + "learning_rate": 6.066621078164979e-06, + "loss": 0.8576, + "num_tokens": 220352475.0, "step": 264 }, { - "epoch": 2.456876456876457, - "grad_norm": 0.42508370862489697, - "learning_rate": 1.7867631710063814e-06, - "loss": 0.2797, - "num_tokens": 59816592.0, + "epoch": 1.440217391304348, + "grad_norm": 0.48316529011400333, + "learning_rate": 6.040397163886376e-06, + "loss": 0.9293, + "num_tokens": 221149474.0, "step": 265 }, { - "epoch": 2.4662004662004664, - "grad_norm": 0.43487947849731023, - "learning_rate": 1.7615163375725069e-06, - "loss": 0.2713, - "num_tokens": 60036267.0, + "epoch": 1.4456521739130435, + "grad_norm": 0.5089006578572763, + "learning_rate": 6.014154615697729e-06, + "loss": 0.9412, + "num_tokens": 221981897.0, "step": 266 }, { - "epoch": 2.4755244755244754, - "grad_norm": 0.4317316527617555, - "learning_rate": 1.7366437287219745e-06, - "loss": 0.2741, - "num_tokens": 60267834.0, + "epoch": 1.4510869565217392, + "grad_norm": 0.46809528790737637, + "learning_rate": 5.987894338491438e-06, + "loss": 0.9386, + "num_tokens": 222825554.0, "step": 267 }, { - "epoch": 2.484848484848485, - "grad_norm": 0.43452295527728907, - "learning_rate": 1.7121478342186893e-06, - "loss": 0.2621, - "num_tokens": 60493778.0, + "epoch": 1.4565217391304348, + "grad_norm": 0.5183630856980103, + "learning_rate": 5.961617237771217e-06, + "loss": 1.0011, + "num_tokens": 223614846.0, "step": 268 }, { - "epoch": 2.4941724941724943, - "grad_norm": 0.43615413703439787, - "learning_rate": 1.6880311061172105e-06, - "loss": 0.2699, - "num_tokens": 60714160.0, + "epoch": 1.4619565217391304, + "grad_norm": 0.5281469620794269, + "learning_rate": 5.935324219620897e-06, + "loss": 0.9369, + "num_tokens": 224416773.0, "step": 269 }, { - "epoch": 2.5034965034965033, - "grad_norm": 0.4170018950625692, - "learning_rate": 1.664295958517304e-06, - "loss": 0.2689, - "num_tokens": 60945537.0, + "epoch": 1.4673913043478262, + "grad_norm": 0.5452128676586095, + "learning_rate": 5.909016190673173e-06, + "loss": 0.8694, + "num_tokens": 225257332.0, "step": 270 }, { - "epoch": 2.5128205128205128, - "grad_norm": 0.4265907439907391, - "learning_rate": 1.6409447673222828e-06, - "loss": 0.2636, - "num_tokens": 61165444.0, + "epoch": 1.4728260869565217, + "grad_norm": 0.5366647661362368, + "learning_rate": 5.88269405807833e-06, + "loss": 0.8403, + "num_tokens": 226117463.0, "step": 271 }, { - "epoch": 2.5221445221445222, - "grad_norm": 0.4318980822762591, - "learning_rate": 1.6179798700011806e-06, - "loss": 0.256, - "num_tokens": 61406513.0, + "epoch": 1.4782608695652173, + "grad_norm": 0.4785272172063298, + "learning_rate": 5.856358729472984e-06, + "loss": 0.9872, + "num_tokens": 227057124.0, "step": 272 }, { - "epoch": 2.5314685314685317, - "grad_norm": 0.44694267169277563, - "learning_rate": 1.5954035653547689e-06, - "loss": 0.2694, - "num_tokens": 61632914.0, + "epoch": 1.483695652173913, + "grad_norm": 0.48058788852849177, + "learning_rate": 5.830011112948768e-06, + "loss": 0.9887, + "num_tokens": 227914008.0, "step": 273 }, { - "epoch": 2.5407925407925407, - "grad_norm": 0.42882361209303854, - "learning_rate": 1.5732181132854492e-06, - "loss": 0.2769, - "num_tokens": 61860817.0, + "epoch": 1.4891304347826086, + "grad_norm": 0.45796547344368305, + "learning_rate": 5.803652117021029e-06, + "loss": 0.9978, + "num_tokens": 228787533.0, "step": 274 }, { - "epoch": 2.55011655011655, - "grad_norm": 0.41313799908685594, - "learning_rate": 1.55142573457103e-06, - "loss": 0.2575, - "num_tokens": 62096041.0, + "epoch": 1.4945652173913042, + "grad_norm": 0.4754307783345142, + "learning_rate": 5.777282650597496e-06, + "loss": 1.0031, + "num_tokens": 229610626.0, "step": 275 }, { - "epoch": 2.5594405594405596, - "grad_norm": 0.4374318776570311, - "learning_rate": 1.5300286106424279e-06, - "loss": 0.2605, - "num_tokens": 62325189.0, + "epoch": 1.5, + "grad_norm": 0.5886367778567009, + "learning_rate": 5.750903622946938e-06, + "loss": 0.8612, + "num_tokens": 230450267.0, "step": 276 }, { - "epoch": 2.5687645687645686, - "grad_norm": 0.4235604710951482, - "learning_rate": 1.509028883365305e-06, - "loss": 0.2695, - "num_tokens": 62553758.0, + "epoch": 1.5054347826086958, + "grad_norm": 0.48579086470030963, + "learning_rate": 5.724515943667818e-06, + "loss": 0.9728, + "num_tokens": 231229938.0, "step": 277 }, { - "epoch": 2.578088578088578, - "grad_norm": 0.4297349959495502, - "learning_rate": 1.488428654825669e-06, - "loss": 0.2661, - "num_tokens": 62781233.0, + "epoch": 1.5108695652173914, + "grad_norm": 0.5456006186458885, + "learning_rate": 5.698120522656916e-06, + "loss": 0.8451, + "num_tokens": 232151140.0, "step": 278 }, { - "epoch": 2.5874125874125875, - "grad_norm": 0.4460197650538502, - "learning_rate": 1.468229987119448e-06, - "loss": 0.2749, - "num_tokens": 63013487.0, + "epoch": 1.516304347826087, + "grad_norm": 0.4723944904567319, + "learning_rate": 5.671718270077971e-06, + "loss": 0.9205, + "num_tokens": 232968829.0, "step": 279 }, { - "epoch": 2.596736596736597, - "grad_norm": 0.4286182611725574, - "learning_rate": 1.4484349021460784e-06, - "loss": 0.2599, - "num_tokens": 63242597.0, + "epoch": 1.5217391304347827, + "grad_norm": 0.4955922792130881, + "learning_rate": 5.645310096330281e-06, + "loss": 1.0536, + "num_tokens": 233792405.0, "step": 280 }, { - "epoch": 2.606060606060606, - "grad_norm": 0.4511458824539744, - "learning_rate": 1.4290453814061065e-06, - "loss": 0.2676, - "num_tokens": 63465524.0, + "epoch": 1.5271739130434783, + "grad_norm": 0.48095668095634725, + "learning_rate": 5.618896912017318e-06, + "loss": 0.9331, + "num_tokens": 234642312.0, "step": 281 }, { - "epoch": 2.6153846153846154, - "grad_norm": 0.4357813194887112, - "learning_rate": 1.4100633658028456e-06, - "loss": 0.2685, - "num_tokens": 63685410.0, + "epoch": 1.5326086956521738, + "grad_norm": 0.43956394618563654, + "learning_rate": 5.592479627915329e-06, + "loss": 0.9626, + "num_tokens": 235509729.0, "step": 282 }, { - "epoch": 2.624708624708625, - "grad_norm": 0.42120478929241784, - "learning_rate": 1.3914907554480842e-06, - "loss": 0.2612, - "num_tokens": 63915223.0, + "epoch": 1.5380434782608696, + "grad_norm": 0.48850505034509745, + "learning_rate": 5.566059154941925e-06, + "loss": 0.9131, + "num_tokens": 236303967.0, "step": 283 }, { - "epoch": 2.634032634032634, - "grad_norm": 0.4389076820558484, - "learning_rate": 1.3733294094718866e-06, - "loss": 0.2709, - "num_tokens": 64138023.0, + "epoch": 1.5434782608695652, + "grad_norm": 0.45628959831143406, + "learning_rate": 5.539636404124684e-06, + "loss": 0.9329, + "num_tokens": 237145438.0, "step": 284 }, { - "epoch": 2.6433566433566433, - "grad_norm": 0.4374182234608829, - "learning_rate": 1.3555811458364907e-06, - "loss": 0.2704, - "num_tokens": 64364256.0, + "epoch": 1.5489130434782608, + "grad_norm": 0.49901605657674153, + "learning_rate": 5.513212286569721e-06, + "loss": 0.9331, + "num_tokens": 237961859.0, "step": 285 }, { - "epoch": 2.652680652680653, - "grad_norm": 0.44236834709549855, - "learning_rate": 1.3382477411543343e-06, - "loss": 0.2655, - "num_tokens": 64587438.0, + "epoch": 1.5543478260869565, + "grad_norm": 0.4929646508453616, + "learning_rate": 5.48678771343028e-06, + "loss": 0.9098, + "num_tokens": 238765601.0, "step": 286 }, { - "epoch": 2.6620046620046622, - "grad_norm": 0.3962866587873811, - "learning_rate": 1.3213309305102079e-06, - "loss": 0.2667, - "num_tokens": 64816847.0, + "epoch": 1.5597826086956523, + "grad_norm": 0.4998589986156106, + "learning_rate": 5.4603635958753175e-06, + "loss": 1.0031, + "num_tokens": 239549930.0, "step": 287 }, { - "epoch": 2.6713286713286712, - "grad_norm": 0.44441366707237095, - "learning_rate": 1.304832407287574e-06, - "loss": 0.2728, - "num_tokens": 65039513.0, + "epoch": 1.5652173913043477, + "grad_norm": 0.46124830740660894, + "learning_rate": 5.433940845058076e-06, + "loss": 0.9741, + "num_tokens": 240380529.0, "step": 288 }, { - "epoch": 2.6806526806526807, - "grad_norm": 0.4221320298969624, - "learning_rate": 1.2887538229990627e-06, - "loss": 0.2631, - "num_tokens": 65275781.0, + "epoch": 1.5706521739130435, + "grad_norm": 0.5486007366307898, + "learning_rate": 5.407520372084675e-06, + "loss": 0.9231, + "num_tokens": 241226627.0, "step": 289 }, { - "epoch": 2.6899766899766897, - "grad_norm": 0.41457174464701124, - "learning_rate": 1.2730967871211484e-06, - "loss": 0.2588, - "num_tokens": 65502493.0, + "epoch": 1.5760869565217392, + "grad_norm": 0.5184363624069811, + "learning_rate": 5.381103087982684e-06, + "loss": 0.8461, + "num_tokens": 242094535.0, "step": 290 }, { - "epoch": 2.699300699300699, - "grad_norm": 0.41598874521296114, - "learning_rate": 1.2578628669330422e-06, - "loss": 0.2596, - "num_tokens": 65736335.0, + "epoch": 1.5815217391304348, + "grad_norm": 0.4837976811372812, + "learning_rate": 5.354689903669721e-06, + "loss": 0.9587, + "num_tokens": 242992792.0, "step": 291 }, { - "epoch": 2.7086247086247086, - "grad_norm": 0.42296533421373433, - "learning_rate": 1.2430535873598074e-06, - "loss": 0.2704, - "num_tokens": 65950812.0, + "epoch": 1.5869565217391304, + "grad_norm": 0.4916359390877864, + "learning_rate": 5.3282817299220305e-06, + "loss": 0.986, + "num_tokens": 243778201.0, "step": 292 }, { - "epoch": 2.717948717948718, - "grad_norm": 0.45062440552954913, - "learning_rate": 1.2286704308197135e-06, - "loss": 0.2832, - "num_tokens": 66165588.0, + "epoch": 1.5923913043478262, + "grad_norm": 0.5075475546171477, + "learning_rate": 5.301879477343086e-06, + "loss": 0.9105, + "num_tokens": 244637452.0, "step": 293 }, { - "epoch": 2.7272727272727275, - "grad_norm": 0.4445602504728877, - "learning_rate": 1.2147148370758422e-06, - "loss": 0.2616, - "num_tokens": 66393469.0, + "epoch": 1.5978260869565217, + "grad_norm": 0.547518662376535, + "learning_rate": 5.2754840563321855e-06, + "loss": 0.8231, + "num_tokens": 245474932.0, "step": 294 }, { - "epoch": 2.7365967365967365, - "grad_norm": 0.4223681741501879, - "learning_rate": 1.2011882030919707e-06, - "loss": 0.2603, - "num_tokens": 66617146.0, + "epoch": 1.6032608695652173, + "grad_norm": 0.48841687453221216, + "learning_rate": 5.249096377053064e-06, + "loss": 0.8715, + "num_tokens": 246385864.0, "step": 295 }, { - "epoch": 2.745920745920746, - "grad_norm": 0.42062867718418007, - "learning_rate": 1.1880918828927305e-06, - "loss": 0.2559, - "num_tokens": 66847475.0, + "epoch": 1.608695652173913, + "grad_norm": 0.48332832949034804, + "learning_rate": 5.222717349402506e-06, + "loss": 0.8894, + "num_tokens": 247233868.0, "step": 296 }, { - "epoch": 2.755244755244755, - "grad_norm": 0.4065760757827556, - "learning_rate": 1.175427187428072e-06, - "loss": 0.2574, - "num_tokens": 67082325.0, + "epoch": 1.6141304347826086, + "grad_norm": 0.5186215837517858, + "learning_rate": 5.196347882978971e-06, + "loss": 0.8804, + "num_tokens": 248038839.0, "step": 297 }, { - "epoch": 2.7645687645687644, - "grad_norm": 0.4455630867473245, - "learning_rate": 1.163195384442036e-06, - "loss": 0.2738, - "num_tokens": 67299520.0, + "epoch": 1.6195652173913042, + "grad_norm": 0.4794963510671903, + "learning_rate": 5.169988887051234e-06, + "loss": 0.9761, + "num_tokens": 248838659.0, "step": 298 }, { - "epoch": 2.773892773892774, - "grad_norm": 0.4327619189440975, - "learning_rate": 1.1513976983458506e-06, - "loss": 0.2647, - "num_tokens": 67526642.0, + "epoch": 1.625, + "grad_norm": 0.4764998146716315, + "learning_rate": 5.143641270527018e-06, + "loss": 0.9302, + "num_tokens": 249673557.0, "step": 299 }, { - "epoch": 2.7832167832167833, - "grad_norm": 0.4244253793482913, - "learning_rate": 1.1400353100953692e-06, - "loss": 0.2594, - "num_tokens": 67753621.0, + "epoch": 1.6304347826086958, + "grad_norm": 0.5208992798322531, + "learning_rate": 5.117305941921672e-06, + "loss": 0.8405, + "num_tokens": 250541800.0, "step": 300 }, { - "epoch": 2.792540792540793, - "grad_norm": 0.42422473241175773, - "learning_rate": 1.1291093570728561e-06, - "loss": 0.2695, - "num_tokens": 67971387.0, + "epoch": 1.6358695652173914, + "grad_norm": 0.4707803571978035, + "learning_rate": 5.0909838093268294e-06, + "loss": 0.9375, + "num_tokens": 251432104.0, "step": 301 }, { - "epoch": 2.801864801864802, - "grad_norm": 0.4247831741758565, - "learning_rate": 1.1186209329731306e-06, - "loss": 0.2728, - "num_tokens": 68191217.0, + "epoch": 1.641304347826087, + "grad_norm": 0.5327648983863379, + "learning_rate": 5.064675780379104e-06, + "loss": 0.9696, + "num_tokens": 252202097.0, "step": 302 }, { - "epoch": 2.8111888111888113, - "grad_norm": 0.4408896307276524, - "learning_rate": 1.1085710876940913e-06, - "loss": 0.262, - "num_tokens": 68415882.0, + "epoch": 1.6467391304347827, + "grad_norm": 0.5032050282566293, + "learning_rate": 5.038382762228786e-06, + "loss": 0.8875, + "num_tokens": 253012007.0, "step": 303 }, { - "epoch": 2.8205128205128203, - "grad_norm": 0.41385534921176376, - "learning_rate": 1.0989608272316172e-06, - "loss": 0.2528, - "num_tokens": 68641200.0, + "epoch": 1.6521739130434783, + "grad_norm": 0.5056029507702218, + "learning_rate": 5.012105661508566e-06, + "loss": 0.9087, + "num_tokens": 253785330.0, "step": 304 }, { - "epoch": 2.8298368298368297, - "grad_norm": 0.4300973675335551, - "learning_rate": 1.089791113578871e-06, - "loss": 0.2583, - "num_tokens": 68869654.0, + "epoch": 1.6576086956521738, + "grad_norm": 0.4789485330661956, + "learning_rate": 4.985845384302271e-06, + "loss": 0.9499, + "num_tokens": 254643671.0, "step": 305 }, { - "epoch": 2.839160839160839, - "grad_norm": 0.40220366787982387, - "learning_rate": 1.0810628646299988e-06, - "loss": 0.2673, - "num_tokens": 69098611.0, - "step": 306 - }, - { - "epoch": 2.839160839160839, - "eval_loss": 0.3964461386203766, - "eval_num_tokens": 69098611.0, - "eval_runtime": 44.935, - "eval_samples_per_second": 67.898, - "eval_steps_per_second": 8.501, + "epoch": 1.6630434782608696, + "grad_norm": 0.45360594618306144, + "learning_rate": 4.9596028361136265e-06, + "loss": 0.9342, + "num_tokens": 255565652.0, "step": 306 }, { - "epoch": 2.8484848484848486, - "grad_norm": 0.4343795644115535, - "learning_rate": 1.072776954088251e-06, - "loss": 0.2716, - "num_tokens": 69322088.0, + "epoch": 1.6684782608695652, + "grad_norm": 0.49148936631218604, + "learning_rate": 4.933378921835021e-06, + "loss": 0.9425, + "num_tokens": 256415218.0, "step": 307 }, { - "epoch": 2.857808857808858, - "grad_norm": 0.42045510778209694, - "learning_rate": 1.0649342113785217e-06, - "loss": 0.2656, - "num_tokens": 69538824.0, + "epoch": 1.6739130434782608, + "grad_norm": 0.4727296330572373, + "learning_rate": 4.907174545716317e-06, + "loss": 0.9524, + "num_tokens": 257293576.0, "step": 308 }, { - "epoch": 2.867132867132867, - "grad_norm": 0.4257152687835886, - "learning_rate": 1.057535421564327e-06, - "loss": 0.2697, - "num_tokens": 69770210.0, + "epoch": 1.6793478260869565, + "grad_norm": 0.4294077987646295, + "learning_rate": 4.8809906113336584e-06, + "loss": 0.9699, + "num_tokens": 258139940.0, "step": 309 }, { - "epoch": 2.8764568764568765, - "grad_norm": 0.4182840339935867, - "learning_rate": 1.0505813252692142e-06, - "loss": 0.2705, - "num_tokens": 69993710.0, + "epoch": 1.6847826086956523, + "grad_norm": 0.46396440371930237, + "learning_rate": 4.854828021558323e-06, + "loss": 0.908, + "num_tokens": 258995189.0, "step": 310 }, { - "epoch": 2.8857808857808855, - "grad_norm": 0.4270654163158855, - "learning_rate": 1.0440726186026289e-06, - "loss": 0.2824, - "num_tokens": 70216546.0, + "epoch": 1.6902173913043477, + "grad_norm": 0.4451003909507794, + "learning_rate": 4.8286876785255895e-06, + "loss": 0.9113, + "num_tokens": 259849182.0, "step": 311 }, { - "epoch": 2.895104895104895, - "grad_norm": 0.45311248568170176, - "learning_rate": 1.038009953090232e-06, - "loss": 0.2749, - "num_tokens": 70439614.0, + "epoch": 1.6956521739130435, + "grad_norm": 0.4644149101702856, + "learning_rate": 4.80257048360362e-06, + "loss": 0.9047, + "num_tokens": 260728692.0, "step": 312 }, { - "epoch": 2.9044289044289044, - "grad_norm": 0.41047094656547684, - "learning_rate": 1.032393935608683e-06, - "loss": 0.2626, - "num_tokens": 70670380.0, + "epoch": 1.7010869565217392, + "grad_norm": 0.4765302230895247, + "learning_rate": 4.776477337362394e-06, + "loss": 0.9364, + "num_tokens": 261498550.0, "step": 313 }, { - "epoch": 2.913752913752914, - "grad_norm": 0.42280895869632795, - "learning_rate": 1.0272251283248903e-06, - "loss": 0.264, - "num_tokens": 70900553.0, + "epoch": 1.7065217391304348, + "grad_norm": 0.4799480621946747, + "learning_rate": 4.75040913954264e-06, + "loss": 0.8931, + "num_tokens": 262339383.0, "step": 314 }, { - "epoch": 2.9230769230769234, - "grad_norm": 0.420152194916759, - "learning_rate": 1.022504048639738e-06, - "loss": 0.2698, - "num_tokens": 71129446.0, + "epoch": 1.7119565217391304, + "grad_norm": 0.4330158349077731, + "learning_rate": 4.724366789024822e-06, + "loss": 0.9221, + "num_tokens": 263211486.0, "step": 315 }, { - "epoch": 2.9324009324009324, - "grad_norm": 0.4252339208960209, - "learning_rate": 1.0182311691362935e-06, - "loss": 0.2574, - "num_tokens": 71364044.0, + "epoch": 1.7173913043478262, + "grad_norm": 0.4719343700469422, + "learning_rate": 4.698351183798141e-06, + "loss": 0.9044, + "num_tokens": 264046091.0, "step": 316 }, { - "epoch": 2.941724941724942, - "grad_norm": 0.4146036330803457, - "learning_rate": 1.014406917532503e-06, - "loss": 0.2686, - "num_tokens": 71582926.0, + "epoch": 1.7228260869565217, + "grad_norm": 0.5784427415196676, + "learning_rate": 4.672363220929567e-06, + "loss": 0.8772, + "num_tokens": 264888538.0, "step": 317 }, { - "epoch": 2.951048951048951, - "grad_norm": 0.4266092057892508, - "learning_rate": 1.0110316766383745e-06, - "loss": 0.2625, - "num_tokens": 71805330.0, + "epoch": 1.7282608695652173, + "grad_norm": 0.5091837295220749, + "learning_rate": 4.646403796532916e-06, + "loss": 0.8942, + "num_tokens": 265630637.0, "step": 318 }, { - "epoch": 2.9603729603729603, - "grad_norm": 0.4217243793579386, - "learning_rate": 1.00810578431766e-06, - "loss": 0.2587, - "num_tokens": 72035215.0, + "epoch": 1.733695652173913, + "grad_norm": 0.4667527874733642, + "learning_rate": 4.620473805737934e-06, + "loss": 0.9195, + "num_tokens": 266433532.0, "step": 319 }, { - "epoch": 2.9696969696969697, - "grad_norm": 0.4199077117513419, - "learning_rate": 1.0056295334540357e-06, - "loss": 0.2596, - "num_tokens": 72263057.0, + "epoch": 1.7391304347826086, + "grad_norm": 0.46506831938872817, + "learning_rate": 4.594574142659448e-06, + "loss": 0.9369, + "num_tokens": 267259816.0, "step": 320 }, { - "epoch": 2.979020979020979, - "grad_norm": 0.4533624252529699, - "learning_rate": 1.0036031719217808e-06, - "loss": 0.2663, - "num_tokens": 72485545.0, + "epoch": 1.7445652173913042, + "grad_norm": 0.43003978346559196, + "learning_rate": 4.568705700366527e-06, + "loss": 0.9196, + "num_tokens": 268110996.0, "step": 321 }, { - "epoch": 2.988344988344988, - "grad_norm": 0.4453560519473016, - "learning_rate": 1.0020269025609697e-06, - "loss": 0.2657, - "num_tokens": 72698615.0, + "epoch": 1.75, + "grad_norm": 0.41920568386474366, + "learning_rate": 4.542869370851689e-06, + "loss": 1.013, + "num_tokens": 268972337.0, "step": 322 }, { - "epoch": 2.9976689976689976, - "grad_norm": 0.44895378041312567, - "learning_rate": 1.0009008831571635e-06, - "loss": 0.2785, - "num_tokens": 72922319.0, + "epoch": 1.7554347826086958, + "grad_norm": 0.42507556465528384, + "learning_rate": 4.517066045000142e-06, + "loss": 0.9282, + "num_tokens": 269834146.0, "step": 323 }, { - "epoch": 3.0, - "grad_norm": 0.44895378041312567, - "learning_rate": 1.000225226425618e-06, - "loss": 0.2816, - "num_tokens": 72976686.0, + "epoch": 1.7608695652173914, + "grad_norm": 0.4422683354025743, + "learning_rate": 4.491296612559066e-06, + "loss": 0.9635, + "num_tokens": 270706702.0, "step": 324 }, + { + "epoch": 1.766304347826087, + "grad_norm": 0.4622516150145145, + "learning_rate": 4.465561962106931e-06, + "loss": 0.917, + "num_tokens": 271556924.0, + "step": 325 + }, + { + "epoch": 1.7717391304347827, + "grad_norm": 0.4707369643560396, + "learning_rate": 4.439862981022862e-06, + "loss": 0.8909, + "num_tokens": 272426695.0, + "step": 326 + }, + { + "epoch": 1.7771739130434783, + "grad_norm": 0.46585694696977864, + "learning_rate": 4.4142005554560345e-06, + "loss": 0.8839, + "num_tokens": 273265387.0, + "step": 327 + }, + { + "epoch": 1.7826086956521738, + "grad_norm": 0.4788713880587738, + "learning_rate": 4.388575570295123e-06, + "loss": 0.9002, + "num_tokens": 274049150.0, + "step": 328 + }, + { + "epoch": 1.7880434782608696, + "grad_norm": 0.4531241832723311, + "learning_rate": 4.362988909137783e-06, + "loss": 0.871, + "num_tokens": 274875505.0, + "step": 329 + }, + { + "epoch": 1.7934782608695652, + "grad_norm": 0.4925895978493023, + "learning_rate": 4.33744145426019e-06, + "loss": 0.869, + "num_tokens": 275693800.0, + "step": 330 + }, + { + "epoch": 1.7989130434782608, + "grad_norm": 0.47581574588692704, + "learning_rate": 4.311934086586611e-06, + "loss": 0.9176, + "num_tokens": 276516215.0, + "step": 331 + }, + { + "epoch": 1.8043478260869565, + "grad_norm": 0.5145400148096132, + "learning_rate": 4.286467685659034e-06, + "loss": 0.8774, + "num_tokens": 277343626.0, + "step": 332 + }, + { + "epoch": 1.8097826086956523, + "grad_norm": 0.4628475433186417, + "learning_rate": 4.261043129606832e-06, + "loss": 0.9669, + "num_tokens": 278089598.0, + "step": 333 + }, + { + "epoch": 1.8152173913043477, + "grad_norm": 0.4271094269757471, + "learning_rate": 4.23566129511649e-06, + "loss": 0.9878, + "num_tokens": 278943252.0, + "step": 334 + }, + { + "epoch": 1.8206521739130435, + "grad_norm": 0.4662067117686951, + "learning_rate": 4.210323057401375e-06, + "loss": 0.9906, + "num_tokens": 279813500.0, + "step": 335 + }, + { + "epoch": 1.8260869565217392, + "grad_norm": 0.4546197238056592, + "learning_rate": 4.185029290171554e-06, + "loss": 0.8521, + "num_tokens": 280615503.0, + "step": 336 + }, + { + "epoch": 1.8315217391304348, + "grad_norm": 0.4202719918715804, + "learning_rate": 4.159780865603667e-06, + "loss": 0.9647, + "num_tokens": 281423980.0, + "step": 337 + }, + { + "epoch": 1.8369565217391304, + "grad_norm": 0.40457263606820165, + "learning_rate": 4.134578654310854e-06, + "loss": 0.9852, + "num_tokens": 282282019.0, + "step": 338 + }, + { + "epoch": 1.8423913043478262, + "grad_norm": 0.4560747998505136, + "learning_rate": 4.109423525312738e-06, + "loss": 0.9767, + "num_tokens": 283156303.0, + "step": 339 + }, + { + "epoch": 1.8478260869565217, + "grad_norm": 0.45185534245989895, + "learning_rate": 4.084316346005449e-06, + "loss": 0.8952, + "num_tokens": 283990835.0, + "step": 340 + }, + { + "epoch": 1.8532608695652173, + "grad_norm": 0.5026480004340574, + "learning_rate": 4.059257982131728e-06, + "loss": 0.7806, + "num_tokens": 284772502.0, + "step": 341 + }, + { + "epoch": 1.858695652173913, + "grad_norm": 0.4862030095553426, + "learning_rate": 4.034249297751064e-06, + "loss": 0.8633, + "num_tokens": 285540737.0, + "step": 342 + }, + { + "epoch": 1.8641304347826086, + "grad_norm": 0.8798921866611329, + "learning_rate": 4.009291155209909e-06, + "loss": 0.8267, + "num_tokens": 286344793.0, + "step": 343 + }, + { + "epoch": 1.8695652173913042, + "grad_norm": 0.44507904137666465, + "learning_rate": 3.9843844151119306e-06, + "loss": 0.9401, + "num_tokens": 287183890.0, + "step": 344 + }, + { + "epoch": 1.875, + "grad_norm": 0.48470270510022156, + "learning_rate": 3.959529936288345e-06, + "loss": 0.8448, + "num_tokens": 287987343.0, + "step": 345 + }, + { + "epoch": 1.8804347826086958, + "grad_norm": 0.46250239356468154, + "learning_rate": 3.934728575768307e-06, + "loss": 0.8711, + "num_tokens": 288888769.0, + "step": 346 + }, + { + "epoch": 1.8858695652173914, + "grad_norm": 0.4402667150189706, + "learning_rate": 3.909981188749347e-06, + "loss": 0.9335, + "num_tokens": 289748009.0, + "step": 347 + }, + { + "epoch": 1.891304347826087, + "grad_norm": 0.438457735442828, + "learning_rate": 3.8852886285678896e-06, + "loss": 0.8716, + "num_tokens": 290622938.0, + "step": 348 + }, + { + "epoch": 1.8967391304347827, + "grad_norm": 0.4461771933881913, + "learning_rate": 3.8606517466698246e-06, + "loss": 0.8591, + "num_tokens": 291485055.0, + "step": 349 + }, + { + "epoch": 1.9021739130434783, + "grad_norm": 0.40547876560113, + "learning_rate": 3.8360713925811535e-06, + "loss": 0.9276, + "num_tokens": 292402125.0, + "step": 350 + }, + { + "epoch": 1.9076086956521738, + "grad_norm": 0.4652174044644762, + "learning_rate": 3.8115484138786896e-06, + "loss": 0.7969, + "num_tokens": 293187972.0, + "step": 351 + }, + { + "epoch": 1.9130434782608696, + "grad_norm": 0.4114735123923004, + "learning_rate": 3.787083656160838e-06, + "loss": 0.9328, + "num_tokens": 294085825.0, + "step": 352 + }, + { + "epoch": 1.9184782608695652, + "grad_norm": 0.4182275804025475, + "learning_rate": 3.762677963018433e-06, + "loss": 0.9367, + "num_tokens": 294887669.0, + "step": 353 + }, + { + "epoch": 1.9239130434782608, + "grad_norm": 0.45051073241068795, + "learning_rate": 3.7383321760056524e-06, + "loss": 0.8766, + "num_tokens": 295722993.0, + "step": 354 + }, + { + "epoch": 1.9293478260869565, + "grad_norm": 0.43307154681774906, + "learning_rate": 3.714047134610994e-06, + "loss": 0.8914, + "num_tokens": 296620729.0, + "step": 355 + }, + { + "epoch": 1.9347826086956523, + "grad_norm": 0.5477818107241225, + "learning_rate": 3.6898236762283378e-06, + "loss": 0.7846, + "num_tokens": 297404600.0, + "step": 356 + }, + { + "epoch": 1.9402173913043477, + "grad_norm": 0.41249150875190044, + "learning_rate": 3.6656626361280645e-06, + "loss": 0.9446, + "num_tokens": 298233764.0, + "step": 357 + }, + { + "epoch": 1.9456521739130435, + "grad_norm": 0.42525055229176784, + "learning_rate": 3.641564847428254e-06, + "loss": 0.942, + "num_tokens": 299127521.0, + "step": 358 + }, + { + "epoch": 1.9510869565217392, + "grad_norm": 0.449431576288473, + "learning_rate": 3.617531141065956e-06, + "loss": 0.8601, + "num_tokens": 299982434.0, + "step": 359 + }, + { + "epoch": 1.9565217391304348, + "grad_norm": 0.4064111339005786, + "learning_rate": 3.593562345768549e-06, + "loss": 0.8646, + "num_tokens": 300818439.0, + "step": 360 + }, + { + "epoch": 1.9619565217391304, + "grad_norm": 0.4307163312053481, + "learning_rate": 3.5696592880251467e-06, + "loss": 0.9335, + "num_tokens": 301635342.0, + "step": 361 + }, + { + "epoch": 1.9673913043478262, + "grad_norm": 0.4073183602601777, + "learning_rate": 3.5458227920581154e-06, + "loss": 0.9036, + "num_tokens": 302440399.0, + "step": 362 + }, + { + "epoch": 1.9728260869565217, + "grad_norm": 0.40367545222463547, + "learning_rate": 3.5220536797946447e-06, + "loss": 0.8582, + "num_tokens": 303203873.0, + "step": 363 + }, + { + "epoch": 1.9782608695652173, + "grad_norm": 0.4445278734784835, + "learning_rate": 3.4983527708384023e-06, + "loss": 0.9542, + "num_tokens": 304034628.0, + "step": 364 + }, + { + "epoch": 1.983695652173913, + "grad_norm": 0.4219555341031118, + "learning_rate": 3.4747208824412827e-06, + "loss": 0.8915, + "num_tokens": 304839253.0, + "step": 365 + }, + { + "epoch": 1.9891304347826086, + "grad_norm": 0.4506145971755832, + "learning_rate": 3.451158829475222e-06, + "loss": 0.8492, + "num_tokens": 305555931.0, + "step": 366 + }, + { + "epoch": 1.9945652173913042, + "grad_norm": 0.447095773856711, + "learning_rate": 3.4276674244040976e-06, + "loss": 0.8216, + "num_tokens": 306343455.0, + "step": 367 + }, + { + "epoch": 2.0, + "grad_norm": 0.44739997510719476, + "learning_rate": 3.4042474772557143e-06, + "loss": 0.8574, + "num_tokens": 307150552.0, + "step": 368 + }, + { + "epoch": 2.0, + "eval_loss": 0.6972317099571228, + "eval_num_tokens": 307150552.0, + "eval_runtime": 115.9725, + "eval_samples_per_second": 45.106, + "eval_steps_per_second": 5.639, + "step": 368 + }, + { + "epoch": 2.005434782608696, + "grad_norm": 0.38636761878459847, + "learning_rate": 3.3808997955938754e-06, + "loss": 0.892, + "num_tokens": 307989807.0, + "step": 369 + }, + { + "epoch": 2.010869565217391, + "grad_norm": 0.3829451714194781, + "learning_rate": 3.3576251844905317e-06, + "loss": 0.8861, + "num_tokens": 308868273.0, + "step": 370 + }, + { + "epoch": 2.016304347826087, + "grad_norm": 0.43968135329318825, + "learning_rate": 3.3344244464980267e-06, + "loss": 0.8656, + "num_tokens": 309656308.0, + "step": 371 + }, + { + "epoch": 2.0217391304347827, + "grad_norm": 0.4093528483136871, + "learning_rate": 3.3112983816214184e-06, + "loss": 0.8591, + "num_tokens": 310461076.0, + "step": 372 + }, + { + "epoch": 2.027173913043478, + "grad_norm": 0.38976210096491537, + "learning_rate": 3.2882477872908965e-06, + "loss": 0.9011, + "num_tokens": 311276939.0, + "step": 373 + }, + { + "epoch": 2.032608695652174, + "grad_norm": 0.39368206902618713, + "learning_rate": 3.2652734583342815e-06, + "loss": 0.8588, + "num_tokens": 312165932.0, + "step": 374 + }, + { + "epoch": 2.0380434782608696, + "grad_norm": 0.3983831266526713, + "learning_rate": 3.242376186949623e-06, + "loss": 0.8093, + "num_tokens": 312978583.0, + "step": 375 + }, + { + "epoch": 2.0434782608695654, + "grad_norm": 0.4135612842166327, + "learning_rate": 3.219556762677881e-06, + "loss": 0.8531, + "num_tokens": 313816716.0, + "step": 376 + }, + { + "epoch": 2.0489130434782608, + "grad_norm": 0.39779756390210336, + "learning_rate": 3.1968159723756997e-06, + "loss": 0.8901, + "num_tokens": 314622313.0, + "step": 377 + }, + { + "epoch": 2.0543478260869565, + "grad_norm": 0.3919197553570745, + "learning_rate": 3.1741546001882773e-06, + "loss": 0.8571, + "num_tokens": 315395623.0, + "step": 378 + }, + { + "epoch": 2.0597826086956523, + "grad_norm": 0.3781577490653326, + "learning_rate": 3.151573427522324e-06, + "loss": 0.8994, + "num_tokens": 316215321.0, + "step": 379 + }, + { + "epoch": 2.0652173913043477, + "grad_norm": 0.43117494158316355, + "learning_rate": 3.1290732330191222e-06, + "loss": 0.7714, + "num_tokens": 317020574.0, + "step": 380 + }, + { + "epoch": 2.0706521739130435, + "grad_norm": 0.39659443790663584, + "learning_rate": 3.1066547925276725e-06, + "loss": 0.871, + "num_tokens": 317847775.0, + "step": 381 + }, + { + "epoch": 2.0760869565217392, + "grad_norm": 0.4255646668460152, + "learning_rate": 3.0843188790779455e-06, + "loss": 0.8323, + "num_tokens": 318656420.0, + "step": 382 + }, + { + "epoch": 2.0815217391304346, + "grad_norm": 0.3419689090065395, + "learning_rate": 3.0620662628542256e-06, + "loss": 1.0455, + "num_tokens": 319517679.0, + "step": 383 + }, + { + "epoch": 2.0869565217391304, + "grad_norm": 0.38761705361585463, + "learning_rate": 3.039897711168547e-06, + "loss": 0.7977, + "num_tokens": 320311242.0, + "step": 384 + }, + { + "epoch": 2.092391304347826, + "grad_norm": 0.4398300188788278, + "learning_rate": 3.017813988434245e-06, + "loss": 0.7794, + "num_tokens": 321075317.0, + "step": 385 + }, + { + "epoch": 2.097826086956522, + "grad_norm": 0.38030062397688985, + "learning_rate": 2.9958158561395933e-06, + "loss": 0.9574, + "num_tokens": 321944658.0, + "step": 386 + }, + { + "epoch": 2.1032608695652173, + "grad_norm": 0.37914375055949284, + "learning_rate": 2.9739040728215427e-06, + "loss": 0.9026, + "num_tokens": 322795388.0, + "step": 387 + }, + { + "epoch": 2.108695652173913, + "grad_norm": 0.3681406688591765, + "learning_rate": 2.9520793940395735e-06, + "loss": 0.7614, + "num_tokens": 323620956.0, + "step": 388 + }, + { + "epoch": 2.114130434782609, + "grad_norm": 0.40820419568639127, + "learning_rate": 2.9303425723496353e-06, + "loss": 0.8671, + "num_tokens": 324370926.0, + "step": 389 + }, + { + "epoch": 2.119565217391304, + "grad_norm": 0.40450291077944434, + "learning_rate": 2.9086943572782e-06, + "loss": 0.9099, + "num_tokens": 325196703.0, + "step": 390 + }, + { + "epoch": 2.125, + "grad_norm": 0.38129385362446383, + "learning_rate": 2.8871354952964183e-06, + "loss": 0.9411, + "num_tokens": 326000243.0, + "step": 391 + }, + { + "epoch": 2.130434782608696, + "grad_norm": 0.3799222952796977, + "learning_rate": 2.8656667297943757e-06, + "loss": 0.9011, + "num_tokens": 326819387.0, + "step": 392 + }, + { + "epoch": 2.135869565217391, + "grad_norm": 0.3927989353518067, + "learning_rate": 2.8442888010554658e-06, + "loss": 0.8171, + "num_tokens": 327589016.0, + "step": 393 + }, + { + "epoch": 2.141304347826087, + "grad_norm": 0.37907717159131554, + "learning_rate": 2.8230024462308547e-06, + "loss": 0.93, + "num_tokens": 328465302.0, + "step": 394 + }, + { + "epoch": 2.1467391304347827, + "grad_norm": 0.37822166658254774, + "learning_rate": 2.801808399314071e-06, + "loss": 0.9274, + "num_tokens": 329345660.0, + "step": 395 + }, + { + "epoch": 2.1521739130434785, + "grad_norm": 0.42108543525700537, + "learning_rate": 2.7807073911156934e-06, + "loss": 0.827, + "num_tokens": 330139715.0, + "step": 396 + }, + { + "epoch": 2.157608695652174, + "grad_norm": 0.4008797341414234, + "learning_rate": 2.7597001492381493e-06, + "loss": 0.7955, + "num_tokens": 330949482.0, + "step": 397 + }, + { + "epoch": 2.1630434782608696, + "grad_norm": 0.3777857356482234, + "learning_rate": 2.7387873980506286e-06, + "loss": 0.789, + "num_tokens": 331830357.0, + "step": 398 + }, + { + "epoch": 2.1684782608695654, + "grad_norm": 0.450815446987175, + "learning_rate": 2.7179698586641024e-06, + "loss": 0.7961, + "num_tokens": 332668804.0, + "step": 399 + }, + { + "epoch": 2.1739130434782608, + "grad_norm": 0.3655309894339068, + "learning_rate": 2.6972482489064615e-06, + "loss": 0.9005, + "num_tokens": 333523686.0, + "step": 400 + }, + { + "epoch": 2.1793478260869565, + "grad_norm": 0.35812024427404354, + "learning_rate": 2.6766232832977636e-06, + "loss": 0.9119, + "num_tokens": 334381666.0, + "step": 401 + }, + { + "epoch": 2.1847826086956523, + "grad_norm": 0.3659167904477761, + "learning_rate": 2.6560956730255937e-06, + "loss": 0.9706, + "num_tokens": 335307725.0, + "step": 402 + }, + { + "epoch": 2.1902173913043477, + "grad_norm": 0.3429215811966493, + "learning_rate": 2.6356661259205396e-06, + "loss": 0.908, + "num_tokens": 336161428.0, + "step": 403 + }, + { + "epoch": 2.1956521739130435, + "grad_norm": 0.3824382924417251, + "learning_rate": 2.615335346431789e-06, + "loss": 0.8392, + "num_tokens": 337029640.0, + "step": 404 + }, + { + "epoch": 2.2010869565217392, + "grad_norm": 0.40449862861918684, + "learning_rate": 2.5951040356028357e-06, + "loss": 0.8373, + "num_tokens": 337838364.0, + "step": 405 + }, + { + "epoch": 2.2065217391304346, + "grad_norm": 0.3481749039841372, + "learning_rate": 2.574972891047308e-06, + "loss": 0.9422, + "num_tokens": 338651556.0, + "step": 406 + }, + { + "epoch": 2.2119565217391304, + "grad_norm": 0.3924961725704576, + "learning_rate": 2.554942606924914e-06, + "loss": 0.8228, + "num_tokens": 339511406.0, + "step": 407 + }, + { + "epoch": 2.217391304347826, + "grad_norm": 0.34049826315688747, + "learning_rate": 2.535013873917501e-06, + "loss": 0.8667, + "num_tokens": 340443629.0, + "step": 408 + }, + { + "epoch": 2.2228260869565215, + "grad_norm": 0.4117155572499064, + "learning_rate": 2.515187379205245e-06, + "loss": 0.8342, + "num_tokens": 341279653.0, + "step": 409 + }, + { + "epoch": 2.2282608695652173, + "grad_norm": 0.4441595850045378, + "learning_rate": 2.495463806442953e-06, + "loss": 0.7691, + "num_tokens": 342195766.0, + "step": 410 + }, + { + "epoch": 2.233695652173913, + "grad_norm": 0.35169984350005207, + "learning_rate": 2.4758438357364913e-06, + "loss": 0.9449, + "num_tokens": 343057767.0, + "step": 411 + }, + { + "epoch": 2.239130434782609, + "grad_norm": 0.37934796213532046, + "learning_rate": 2.4563281436193304e-06, + "loss": 0.8667, + "num_tokens": 343872498.0, + "step": 412 + }, + { + "epoch": 2.244565217391304, + "grad_norm": 0.34470012483067713, + "learning_rate": 2.436917403029219e-06, + "loss": 0.9304, + "num_tokens": 344702347.0, + "step": 413 + }, + { + "epoch": 2.25, + "grad_norm": 0.3869353927249258, + "learning_rate": 2.4176122832849806e-06, + "loss": 0.9261, + "num_tokens": 345556219.0, + "step": 414 + }, + { + "epoch": 2.255434782608696, + "grad_norm": 0.376280379959303, + "learning_rate": 2.3984134500634344e-06, + "loss": 0.8187, + "num_tokens": 346341189.0, + "step": 415 + }, + { + "epoch": 2.260869565217391, + "grad_norm": 0.36254799864472526, + "learning_rate": 2.379321565376439e-06, + "loss": 0.9454, + "num_tokens": 347121836.0, + "step": 416 + }, + { + "epoch": 2.266304347826087, + "grad_norm": 0.37665052955616984, + "learning_rate": 2.3603372875480662e-06, + "loss": 0.8902, + "num_tokens": 347915123.0, + "step": 417 + }, + { + "epoch": 2.2717391304347827, + "grad_norm": 0.3619099925225754, + "learning_rate": 2.341461271191903e-06, + "loss": 0.871, + "num_tokens": 348720454.0, + "step": 418 + }, + { + "epoch": 2.2771739130434785, + "grad_norm": 0.38144845503617586, + "learning_rate": 2.3226941671884766e-06, + "loss": 0.8471, + "num_tokens": 349564775.0, + "step": 419 + }, + { + "epoch": 2.282608695652174, + "grad_norm": 0.37803485943173243, + "learning_rate": 2.3040366226628104e-06, + "loss": 0.874, + "num_tokens": 350336647.0, + "step": 420 + }, + { + "epoch": 2.2880434782608696, + "grad_norm": 0.36406159830794205, + "learning_rate": 2.2854892809621113e-06, + "loss": 0.7889, + "num_tokens": 351237635.0, + "step": 421 + }, + { + "epoch": 2.2934782608695654, + "grad_norm": 0.38102789495537004, + "learning_rate": 2.267052781633588e-06, + "loss": 0.7828, + "num_tokens": 352050882.0, + "step": 422 + }, + { + "epoch": 2.2989130434782608, + "grad_norm": 0.35887276733788503, + "learning_rate": 2.248727760402391e-06, + "loss": 0.8599, + "num_tokens": 352878475.0, + "step": 423 + }, + { + "epoch": 2.3043478260869565, + "grad_norm": 0.35052707875020117, + "learning_rate": 2.2305148491497013e-06, + "loss": 0.935, + "num_tokens": 353696090.0, + "step": 424 + }, + { + "epoch": 2.3097826086956523, + "grad_norm": 0.3796455050371005, + "learning_rate": 2.2124146758909344e-06, + "loss": 0.7871, + "num_tokens": 354551580.0, + "step": 425 + }, + { + "epoch": 2.3152173913043477, + "grad_norm": 0.35413248660813723, + "learning_rate": 2.1944278647540897e-06, + "loss": 0.9125, + "num_tokens": 355427660.0, + "step": 426 + }, + { + "epoch": 2.3206521739130435, + "grad_norm": 0.3550372496306149, + "learning_rate": 2.176555035958225e-06, + "loss": 0.9098, + "num_tokens": 356260823.0, + "step": 427 + }, + { + "epoch": 2.3260869565217392, + "grad_norm": 0.3579466405075418, + "learning_rate": 2.158796805792076e-06, + "loss": 0.9144, + "num_tokens": 357121483.0, + "step": 428 + }, + { + "epoch": 2.3315217391304346, + "grad_norm": 0.3529362149072696, + "learning_rate": 2.1411537865927996e-06, + "loss": 0.8598, + "num_tokens": 357940148.0, + "step": 429 + }, + { + "epoch": 2.3369565217391304, + "grad_norm": 0.3715063084651279, + "learning_rate": 2.1236265867248624e-06, + "loss": 0.7717, + "num_tokens": 358802710.0, + "step": 430 + }, + { + "epoch": 2.342391304347826, + "grad_norm": 0.3576601357221509, + "learning_rate": 2.106215810559064e-06, + "loss": 0.8561, + "num_tokens": 359630097.0, + "step": 431 + }, + { + "epoch": 2.3478260869565215, + "grad_norm": 0.3483196895578794, + "learning_rate": 2.0889220584516953e-06, + "loss": 0.9144, + "num_tokens": 360480329.0, + "step": 432 + }, + { + "epoch": 2.3532608695652173, + "grad_norm": 0.3447961481134239, + "learning_rate": 2.071745926723836e-06, + "loss": 0.8636, + "num_tokens": 361308682.0, + "step": 433 + }, + { + "epoch": 2.358695652173913, + "grad_norm": 0.3766304528077358, + "learning_rate": 2.054688007640796e-06, + "loss": 0.7479, + "num_tokens": 362169219.0, + "step": 434 + }, + { + "epoch": 2.364130434782609, + "grad_norm": 0.3746268395382482, + "learning_rate": 2.0377488893916915e-06, + "loss": 0.8258, + "num_tokens": 363005241.0, + "step": 435 + }, + { + "epoch": 2.369565217391304, + "grad_norm": 0.36014765575775626, + "learning_rate": 2.02092915606916e-06, + "loss": 0.8801, + "num_tokens": 363883118.0, + "step": 436 + }, + { + "epoch": 2.375, + "grad_norm": 0.33453942334522974, + "learning_rate": 2.004229387649225e-06, + "loss": 0.86, + "num_tokens": 364788569.0, + "step": 437 + }, + { + "epoch": 2.380434782608696, + "grad_norm": 0.38679043535672375, + "learning_rate": 1.9876501599712933e-06, + "loss": 0.7929, + "num_tokens": 365606670.0, + "step": 438 + }, + { + "epoch": 2.385869565217391, + "grad_norm": 0.3359349574746618, + "learning_rate": 1.9711920447183007e-06, + "loss": 0.8163, + "num_tokens": 366444635.0, + "step": 439 + }, + { + "epoch": 2.391304347826087, + "grad_norm": 0.3644533679055179, + "learning_rate": 1.9548556093969988e-06, + "loss": 0.7937, + "num_tokens": 367323309.0, + "step": 440 + }, + { + "epoch": 2.3967391304347827, + "grad_norm": 0.3771715348384541, + "learning_rate": 1.9386414173183867e-06, + "loss": 0.8485, + "num_tokens": 368064890.0, + "step": 441 + }, + { + "epoch": 2.4021739130434785, + "grad_norm": 0.3551550392055021, + "learning_rate": 1.9225500275782865e-06, + "loss": 0.83, + "num_tokens": 369002766.0, + "step": 442 + }, + { + "epoch": 2.407608695652174, + "grad_norm": 0.35673807977602134, + "learning_rate": 1.9065819950380634e-06, + "loss": 0.8095, + "num_tokens": 369842266.0, + "step": 443 + }, + { + "epoch": 2.4130434782608696, + "grad_norm": 0.3678005067941679, + "learning_rate": 1.8907378703054965e-06, + "loss": 0.916, + "num_tokens": 370684795.0, + "step": 444 + }, + { + "epoch": 2.4184782608695654, + "grad_norm": 0.3550346657051313, + "learning_rate": 1.8750181997157906e-06, + "loss": 0.9113, + "num_tokens": 371537705.0, + "step": 445 + }, + { + "epoch": 2.4239130434782608, + "grad_norm": 0.354949703749135, + "learning_rate": 1.8594235253127373e-06, + "loss": 0.8938, + "num_tokens": 372305511.0, + "step": 446 + }, + { + "epoch": 2.4293478260869565, + "grad_norm": 0.36864809186508585, + "learning_rate": 1.8439543848300234e-06, + "loss": 0.7865, + "num_tokens": 373111650.0, + "step": 447 + }, + { + "epoch": 2.4347826086956523, + "grad_norm": 0.37352388254746816, + "learning_rate": 1.8286113116726928e-06, + "loss": 0.7825, + "num_tokens": 373957500.0, + "step": 448 + }, + { + "epoch": 2.4402173913043477, + "grad_norm": 0.3695867321551594, + "learning_rate": 1.813394834898749e-06, + "loss": 0.8468, + "num_tokens": 374732469.0, + "step": 449 + }, + { + "epoch": 2.4456521739130435, + "grad_norm": 0.3455694762603206, + "learning_rate": 1.7983054792009146e-06, + "loss": 0.8205, + "num_tokens": 375590847.0, + "step": 450 + }, + { + "epoch": 2.4510869565217392, + "grad_norm": 0.34383273093647165, + "learning_rate": 1.7833437648885391e-06, + "loss": 0.9337, + "num_tokens": 376472609.0, + "step": 451 + }, + { + "epoch": 2.4565217391304346, + "grad_norm": 0.34396344325737654, + "learning_rate": 1.768510207869658e-06, + "loss": 0.8295, + "num_tokens": 377360835.0, + "step": 452 + }, + { + "epoch": 2.4619565217391304, + "grad_norm": 0.36772931369842965, + "learning_rate": 1.7538053196331988e-06, + "loss": 0.8552, + "num_tokens": 378129547.0, + "step": 453 + }, + { + "epoch": 2.467391304347826, + "grad_norm": 0.3721073295136887, + "learning_rate": 1.739229607231351e-06, + "loss": 0.8254, + "num_tokens": 378985755.0, + "step": 454 + }, + { + "epoch": 2.4728260869565215, + "grad_norm": 0.375300924188451, + "learning_rate": 1.724783573262077e-06, + "loss": 0.8427, + "num_tokens": 379827893.0, + "step": 455 + }, + { + "epoch": 2.4782608695652173, + "grad_norm": 0.37674482179096, + "learning_rate": 1.7104677158517838e-06, + "loss": 0.6842, + "num_tokens": 380654729.0, + "step": 456 + }, + { + "epoch": 2.483695652173913, + "grad_norm": 0.3476948995891136, + "learning_rate": 1.6962825286381456e-06, + "loss": 0.823, + "num_tokens": 381422237.0, + "step": 457 + }, + { + "epoch": 2.489130434782609, + "grad_norm": 0.3638801948734728, + "learning_rate": 1.682228500753083e-06, + "loss": 0.8774, + "num_tokens": 382221856.0, + "step": 458 + }, + { + "epoch": 2.494565217391304, + "grad_norm": 0.3358560532553637, + "learning_rate": 1.6683061168058957e-06, + "loss": 0.8696, + "num_tokens": 383048436.0, + "step": 459 + }, + { + "epoch": 2.5, + "grad_norm": 0.3280496068759726, + "learning_rate": 1.6545158568665525e-06, + "loss": 0.8815, + "num_tokens": 383884603.0, + "step": 460 + }, + { + "epoch": 2.505434782608696, + "grad_norm": 0.33002937132151383, + "learning_rate": 1.6408581964491405e-06, + "loss": 0.8951, + "num_tokens": 384785328.0, + "step": 461 + }, + { + "epoch": 2.5108695652173916, + "grad_norm": 0.3532135137719775, + "learning_rate": 1.6273336064954637e-06, + "loss": 0.8446, + "num_tokens": 385597787.0, + "step": 462 + }, + { + "epoch": 2.516304347826087, + "grad_norm": 0.3197220986983031, + "learning_rate": 1.6139425533588055e-06, + "loss": 0.9167, + "num_tokens": 386427545.0, + "step": 463 + }, + { + "epoch": 2.5217391304347827, + "grad_norm": 0.3413188251047705, + "learning_rate": 1.6006854987878517e-06, + "loss": 0.8567, + "num_tokens": 387313898.0, + "step": 464 + }, + { + "epoch": 2.5271739130434785, + "grad_norm": 0.3444895369070734, + "learning_rate": 1.5875628999107633e-06, + "loss": 0.8591, + "num_tokens": 388118530.0, + "step": 465 + }, + { + "epoch": 2.532608695652174, + "grad_norm": 0.3310567768494456, + "learning_rate": 1.5745752092194187e-06, + "loss": 0.8368, + "num_tokens": 388980173.0, + "step": 466 + }, + { + "epoch": 2.5380434782608696, + "grad_norm": 0.3640834859217282, + "learning_rate": 1.561722874553806e-06, + "loss": 0.7713, + "num_tokens": 389804181.0, + "step": 467 + }, + { + "epoch": 2.5434782608695654, + "grad_norm": 0.32818875673224424, + "learning_rate": 1.5490063390865845e-06, + "loss": 0.8075, + "num_tokens": 390738842.0, + "step": 468 + }, + { + "epoch": 2.5489130434782608, + "grad_norm": 0.3370297039341805, + "learning_rate": 1.536426041307801e-06, + "loss": 0.8709, + "num_tokens": 391585641.0, + "step": 469 + }, + { + "epoch": 2.5543478260869565, + "grad_norm": 0.3326492694705219, + "learning_rate": 1.5239824150097712e-06, + "loss": 0.8091, + "num_tokens": 392461130.0, + "step": 470 + }, + { + "epoch": 2.5597826086956523, + "grad_norm": 0.3246932898412604, + "learning_rate": 1.5116758892721214e-06, + "loss": 0.86, + "num_tokens": 393303819.0, + "step": 471 + }, + { + "epoch": 2.5652173913043477, + "grad_norm": 0.3358790775832897, + "learning_rate": 1.4995068884469941e-06, + "loss": 0.8047, + "num_tokens": 394139329.0, + "step": 472 + }, + { + "epoch": 2.5706521739130435, + "grad_norm": 0.34951877174632945, + "learning_rate": 1.4874758321444091e-06, + "loss": 0.8456, + "num_tokens": 394970536.0, + "step": 473 + }, + { + "epoch": 2.5760869565217392, + "grad_norm": 0.33912836478591646, + "learning_rate": 1.475583135217807e-06, + "loss": 0.865, + "num_tokens": 395802220.0, + "step": 474 + }, + { + "epoch": 2.5815217391304346, + "grad_norm": 0.33546628623087094, + "learning_rate": 1.4638292077497313e-06, + "loss": 0.8554, + "num_tokens": 396631158.0, + "step": 475 + }, + { + "epoch": 2.5869565217391304, + "grad_norm": 0.3521105782958677, + "learning_rate": 1.4522144550376968e-06, + "loss": 0.8423, + "num_tokens": 397478676.0, + "step": 476 + }, + { + "epoch": 2.592391304347826, + "grad_norm": 0.3509341949307193, + "learning_rate": 1.4407392775802109e-06, + "loss": 0.82, + "num_tokens": 398246760.0, + "step": 477 + }, + { + "epoch": 2.5978260869565215, + "grad_norm": 0.32911402990375643, + "learning_rate": 1.4294040710629617e-06, + "loss": 0.8769, + "num_tokens": 399099437.0, + "step": 478 + }, + { + "epoch": 2.6032608695652173, + "grad_norm": 0.3186571796247337, + "learning_rate": 1.418209226345179e-06, + "loss": 0.8793, + "num_tokens": 399969697.0, + "step": 479 + }, + { + "epoch": 2.608695652173913, + "grad_norm": 0.3279788778493384, + "learning_rate": 1.407155129446152e-06, + "loss": 0.9707, + "num_tokens": 400797805.0, + "step": 480 + }, + { + "epoch": 2.6141304347826084, + "grad_norm": 0.34256729343775694, + "learning_rate": 1.396242161531921e-06, + "loss": 0.8696, + "num_tokens": 401547630.0, + "step": 481 + }, + { + "epoch": 2.619565217391304, + "grad_norm": 0.3172284147790305, + "learning_rate": 1.385470698902134e-06, + "loss": 0.832, + "num_tokens": 402392182.0, + "step": 482 + }, + { + "epoch": 2.625, + "grad_norm": 0.47594779079448835, + "learning_rate": 1.3748411129770703e-06, + "loss": 0.8696, + "num_tokens": 403183866.0, + "step": 483 + }, + { + "epoch": 2.630434782608696, + "grad_norm": 0.3276956523974404, + "learning_rate": 1.3643537702848333e-06, + "loss": 0.894, + "num_tokens": 404038755.0, + "step": 484 + }, + { + "epoch": 2.6358695652173916, + "grad_norm": 0.34310596614188354, + "learning_rate": 1.3540090324487142e-06, + "loss": 0.8598, + "num_tokens": 404864717.0, + "step": 485 + }, + { + "epoch": 2.641304347826087, + "grad_norm": 0.3352550272338416, + "learning_rate": 1.343807256174718e-06, + "loss": 0.8273, + "num_tokens": 405697412.0, + "step": 486 + }, + { + "epoch": 2.6467391304347827, + "grad_norm": 0.35497871766230493, + "learning_rate": 1.333748793239269e-06, + "loss": 0.8314, + "num_tokens": 406541909.0, + "step": 487 + }, + { + "epoch": 2.6521739130434785, + "grad_norm": 0.3063939735113217, + "learning_rate": 1.323833990477076e-06, + "loss": 0.8449, + "num_tokens": 407434141.0, + "step": 488 + }, + { + "epoch": 2.657608695652174, + "grad_norm": 0.2918078346539291, + "learning_rate": 1.3140631897691767e-06, + "loss": 0.9203, + "num_tokens": 408213064.0, + "step": 489 + }, + { + "epoch": 2.6630434782608696, + "grad_norm": 0.3306410671065819, + "learning_rate": 1.3044367280311462e-06, + "loss": 0.8499, + "num_tokens": 409071350.0, + "step": 490 + }, + { + "epoch": 2.6684782608695654, + "grad_norm": 0.31425658255388933, + "learning_rate": 1.2949549372014806e-06, + "loss": 0.9014, + "num_tokens": 409903386.0, + "step": 491 + }, + { + "epoch": 2.6739130434782608, + "grad_norm": 0.31317506453276645, + "learning_rate": 1.2856181442301524e-06, + "loss": 0.8652, + "num_tokens": 410734160.0, + "step": 492 + }, + { + "epoch": 2.6793478260869565, + "grad_norm": 0.31381245027317006, + "learning_rate": 1.2764266710673335e-06, + "loss": 0.9083, + "num_tokens": 411551118.0, + "step": 493 + }, + { + "epoch": 2.6847826086956523, + "grad_norm": 0.3165302151758142, + "learning_rate": 1.267380834652296e-06, + "loss": 0.9255, + "num_tokens": 412396952.0, + "step": 494 + }, + { + "epoch": 2.6902173913043477, + "grad_norm": 0.32292463903234464, + "learning_rate": 1.2584809469024848e-06, + "loss": 0.8961, + "num_tokens": 413158720.0, + "step": 495 + }, + { + "epoch": 2.6956521739130435, + "grad_norm": 0.31354859521048606, + "learning_rate": 1.249727314702759e-06, + "loss": 0.9043, + "num_tokens": 414025866.0, + "step": 496 + }, + { + "epoch": 2.7010869565217392, + "grad_norm": 0.33837050467306035, + "learning_rate": 1.2411202398948116e-06, + "loss": 0.8551, + "num_tokens": 414826744.0, + "step": 497 + }, + { + "epoch": 2.7065217391304346, + "grad_norm": 0.31857152993791094, + "learning_rate": 1.2326600192667612e-06, + "loss": 0.8835, + "num_tokens": 415658618.0, + "step": 498 + }, + { + "epoch": 2.7119565217391304, + "grad_norm": 0.3030421658445399, + "learning_rate": 1.2243469445429192e-06, + "loss": 0.8971, + "num_tokens": 416525614.0, + "step": 499 + }, + { + "epoch": 2.717391304347826, + "grad_norm": 0.30567436323419056, + "learning_rate": 1.2161813023737283e-06, + "loss": 0.8611, + "num_tokens": 417373841.0, + "step": 500 + }, + { + "epoch": 2.7228260869565215, + "grad_norm": 0.3302663704352623, + "learning_rate": 1.2081633743258807e-06, + "loss": 0.8105, + "num_tokens": 418207244.0, + "step": 501 + }, + { + "epoch": 2.7282608695652173, + "grad_norm": 0.33433073358856535, + "learning_rate": 1.2002934368726062e-06, + "loss": 0.764, + "num_tokens": 419000665.0, + "step": 502 + }, + { + "epoch": 2.733695652173913, + "grad_norm": 0.3241604502595133, + "learning_rate": 1.1925717613841432e-06, + "loss": 0.9584, + "num_tokens": 419794625.0, + "step": 503 + }, + { + "epoch": 2.7391304347826084, + "grad_norm": 0.32461465405581225, + "learning_rate": 1.184998614118377e-06, + "loss": 0.8969, + "num_tokens": 420639455.0, + "step": 504 + }, + { + "epoch": 2.744565217391304, + "grad_norm": 0.30608737482567067, + "learning_rate": 1.1775742562116616e-06, + "loss": 0.8808, + "num_tokens": 421467203.0, + "step": 505 + }, + { + "epoch": 2.75, + "grad_norm": 0.33056114911881734, + "learning_rate": 1.1702989436698139e-06, + "loss": 0.896, + "num_tokens": 422328867.0, + "step": 506 + }, + { + "epoch": 2.755434782608696, + "grad_norm": 0.3255646424875819, + "learning_rate": 1.163172927359285e-06, + "loss": 0.8577, + "num_tokens": 423189748.0, + "step": 507 + }, + { + "epoch": 2.7608695652173916, + "grad_norm": 0.33584968403928006, + "learning_rate": 1.1561964529985143e-06, + "loss": 0.8156, + "num_tokens": 424005796.0, + "step": 508 + }, + { + "epoch": 2.766304347826087, + "grad_norm": 0.323421018923075, + "learning_rate": 1.1493697611494512e-06, + "loss": 0.8255, + "num_tokens": 424799841.0, + "step": 509 + }, + { + "epoch": 2.7717391304347827, + "grad_norm": 0.3295272495710808, + "learning_rate": 1.142693087209264e-06, + "loss": 0.9751, + "num_tokens": 425631055.0, + "step": 510 + }, + { + "epoch": 2.7771739130434785, + "grad_norm": 0.3279812282350535, + "learning_rate": 1.13616666140222e-06, + "loss": 0.8055, + "num_tokens": 426533170.0, + "step": 511 + }, + { + "epoch": 2.782608695652174, + "grad_norm": 0.29896410886279173, + "learning_rate": 1.1297907087717499e-06, + "loss": 0.8991, + "num_tokens": 427315655.0, + "step": 512 + }, + { + "epoch": 2.7880434782608696, + "grad_norm": 0.3163307617484885, + "learning_rate": 1.1235654491726853e-06, + "loss": 0.788, + "num_tokens": 428164717.0, + "step": 513 + }, + { + "epoch": 2.7934782608695654, + "grad_norm": 0.2931770479467324, + "learning_rate": 1.11749109726368e-06, + "loss": 0.8384, + "num_tokens": 429023017.0, + "step": 514 + }, + { + "epoch": 2.7989130434782608, + "grad_norm": 0.31236832418646115, + "learning_rate": 1.1115678624998057e-06, + "loss": 0.9362, + "num_tokens": 429866574.0, + "step": 515 + }, + { + "epoch": 2.8043478260869565, + "grad_norm": 0.3242237659649276, + "learning_rate": 1.1057959491253322e-06, + "loss": 0.7995, + "num_tokens": 430676238.0, + "step": 516 + }, + { + "epoch": 2.8097826086956523, + "grad_norm": 0.3036166991407892, + "learning_rate": 1.1001755561666812e-06, + "loss": 0.8371, + "num_tokens": 431514288.0, + "step": 517 + }, + { + "epoch": 2.8152173913043477, + "grad_norm": 0.32280595452436234, + "learning_rate": 1.0947068774255675e-06, + "loss": 0.807, + "num_tokens": 432392393.0, + "step": 518 + }, + { + "epoch": 2.8206521739130435, + "grad_norm": 0.32712965260938454, + "learning_rate": 1.0893901014723154e-06, + "loss": 0.8215, + "num_tokens": 433253184.0, + "step": 519 + }, + { + "epoch": 2.8260869565217392, + "grad_norm": 0.3227790299421114, + "learning_rate": 1.0842254116393524e-06, + "loss": 0.821, + "num_tokens": 434053516.0, + "step": 520 + }, + { + "epoch": 2.8315217391304346, + "grad_norm": 0.32611632405660934, + "learning_rate": 1.0792129860148939e-06, + "loss": 0.8623, + "num_tokens": 434790448.0, + "step": 521 + }, + { + "epoch": 2.8369565217391304, + "grad_norm": 0.3555686123895394, + "learning_rate": 1.074352997436797e-06, + "loss": 0.7353, + "num_tokens": 435603778.0, + "step": 522 + }, + { + "epoch": 2.842391304347826, + "grad_norm": 0.36002880926614983, + "learning_rate": 1.0696456134866027e-06, + "loss": 0.8001, + "num_tokens": 436440204.0, + "step": 523 + }, + { + "epoch": 2.8478260869565215, + "grad_norm": 0.3080276331188948, + "learning_rate": 1.06509099648376e-06, + "loss": 0.8874, + "num_tokens": 437277908.0, + "step": 524 + }, + { + "epoch": 2.8532608695652173, + "grad_norm": 0.3038579198576587, + "learning_rate": 1.0606893034800243e-06, + "loss": 0.8889, + "num_tokens": 438200019.0, + "step": 525 + }, + { + "epoch": 2.858695652173913, + "grad_norm": 0.3188499078923347, + "learning_rate": 1.0564406862540442e-06, + "loss": 0.8234, + "num_tokens": 439031032.0, + "step": 526 + }, + { + "epoch": 2.8641304347826084, + "grad_norm": 0.3163232661276132, + "learning_rate": 1.0523452913061287e-06, + "loss": 0.9376, + "num_tokens": 439908258.0, + "step": 527 + }, + { + "epoch": 2.869565217391304, + "grad_norm": 0.3205748856172401, + "learning_rate": 1.0484032598531933e-06, + "loss": 0.8969, + "num_tokens": 440770759.0, + "step": 528 + }, + { + "epoch": 2.875, + "grad_norm": 0.3336093178600353, + "learning_rate": 1.044614727823893e-06, + "loss": 0.8814, + "num_tokens": 441610467.0, + "step": 529 + }, + { + "epoch": 2.880434782608696, + "grad_norm": 0.339579008072357, + "learning_rate": 1.0409798258539342e-06, + "loss": 0.7824, + "num_tokens": 442442060.0, + "step": 530 + }, + { + "epoch": 2.8858695652173916, + "grad_norm": 0.30343658556813535, + "learning_rate": 1.0374986792815698e-06, + "loss": 0.8269, + "num_tokens": 443296175.0, + "step": 531 + }, + { + "epoch": 2.891304347826087, + "grad_norm": 0.3402667170917275, + "learning_rate": 1.0341714081432765e-06, + "loss": 0.7638, + "num_tokens": 444156784.0, + "step": 532 + }, + { + "epoch": 2.8967391304347827, + "grad_norm": 0.31169236296214, + "learning_rate": 1.0309981271696186e-06, + "loss": 0.8148, + "num_tokens": 445007467.0, + "step": 533 + }, + { + "epoch": 2.9021739130434785, + "grad_norm": 0.30949456956927557, + "learning_rate": 1.0279789457812883e-06, + "loss": 0.8734, + "num_tokens": 445915302.0, + "step": 534 + }, + { + "epoch": 2.907608695652174, + "grad_norm": 0.32662740565262155, + "learning_rate": 1.0251139680853362e-06, + "loss": 0.9046, + "num_tokens": 446684613.0, + "step": 535 + }, + { + "epoch": 2.9130434782608696, + "grad_norm": 0.3039883271970662, + "learning_rate": 1.0224032928715779e-06, + "loss": 0.8139, + "num_tokens": 447529851.0, + "step": 536 + }, + { + "epoch": 2.9184782608695654, + "grad_norm": 0.29995513771563415, + "learning_rate": 1.0198470136091907e-06, + "loss": 0.906, + "num_tokens": 448361652.0, + "step": 537 + }, + { + "epoch": 2.9239130434782608, + "grad_norm": 0.32922775379683195, + "learning_rate": 1.0174452184434888e-06, + "loss": 0.8472, + "num_tokens": 449206270.0, + "step": 538 + }, + { + "epoch": 2.9293478260869565, + "grad_norm": 0.31634541908199004, + "learning_rate": 1.015197990192884e-06, + "loss": 0.7621, + "num_tokens": 450074620.0, + "step": 539 + }, + { + "epoch": 2.9347826086956523, + "grad_norm": 0.2973593789509411, + "learning_rate": 1.0131054063460314e-06, + "loss": 0.9702, + "num_tokens": 450858066.0, + "step": 540 + }, + { + "epoch": 2.9402173913043477, + "grad_norm": 0.3108474113687747, + "learning_rate": 1.0111675390591551e-06, + "loss": 0.9196, + "num_tokens": 451684385.0, + "step": 541 + }, + { + "epoch": 2.9456521739130435, + "grad_norm": 0.30825641786035934, + "learning_rate": 1.0093844551535627e-06, + "loss": 0.8517, + "num_tokens": 452500740.0, + "step": 542 + }, + { + "epoch": 2.9510869565217392, + "grad_norm": 0.31975962016702786, + "learning_rate": 1.0077562161133376e-06, + "loss": 0.8561, + "num_tokens": 453311676.0, + "step": 543 + }, + { + "epoch": 2.9565217391304346, + "grad_norm": 0.3171145276717419, + "learning_rate": 1.006282878083224e-06, + "loss": 0.8614, + "num_tokens": 454161473.0, + "step": 544 + }, + { + "epoch": 2.9619565217391304, + "grad_norm": 0.3330844602709494, + "learning_rate": 1.0049644918666862e-06, + "loss": 0.8472, + "num_tokens": 454913322.0, + "step": 545 + }, + { + "epoch": 2.967391304347826, + "grad_norm": 0.3218989363030988, + "learning_rate": 1.003801102924159e-06, + "loss": 0.9203, + "num_tokens": 455798549.0, + "step": 546 + }, + { + "epoch": 2.9728260869565215, + "grad_norm": 0.329031606721965, + "learning_rate": 1.0027927513714805e-06, + "loss": 0.8401, + "num_tokens": 456581900.0, + "step": 547 + }, + { + "epoch": 2.9782608695652173, + "grad_norm": 0.32291601500960876, + "learning_rate": 1.0019394719785073e-06, + "loss": 0.8265, + "num_tokens": 457468475.0, + "step": 548 + }, + { + "epoch": 2.983695652173913, + "grad_norm": 0.35665352121026894, + "learning_rate": 1.0012412941679172e-06, + "loss": 0.8668, + "num_tokens": 458212148.0, + "step": 549 + }, + { + "epoch": 2.9891304347826084, + "grad_norm": 0.31822874209028135, + "learning_rate": 1.0006982420141937e-06, + "loss": 0.8655, + "num_tokens": 459035504.0, + "step": 550 + }, + { + "epoch": 2.994565217391304, + "grad_norm": 0.32078359060378214, + "learning_rate": 1.0003103342427952e-06, + "loss": 0.8234, + "num_tokens": 459883895.0, + "step": 551 + }, + { + "epoch": 3.0, + "grad_norm": 0.31075975332177497, + "learning_rate": 1.0000775842295116e-06, + "loss": 0.8399, + "num_tokens": 460733269.0, + "step": 552 + }, + { + "epoch": 3.0, + "eval_loss": 0.665989875793457, + "eval_num_tokens": 460733269.0, + "eval_runtime": 117.721, + "eval_samples_per_second": 44.436, + "eval_steps_per_second": 5.556, + "step": 552 + }, { "epoch": 3.0, - "step": 324, - "total_flos": 3.064163325664297e+17, - "train_loss": 0.4129233885510468, - "train_runtime": 3035.4475, - "train_samples_per_second": 27.135, - "train_steps_per_second": 0.107 + "step": 552, + "total_flos": 967966479089664.0, + "train_loss": 1.0463442883413772, + "train_runtime": 10114.1342, + "train_samples_per_second": 13.962, + "train_steps_per_second": 0.055 } ], "logging_steps": 1, - "max_steps": 324, + "max_steps": 552, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, @@ -2682,8 +4479,8 @@ "attributes": {} } }, - "total_flos": 3.064163325664297e+17, - "train_batch_size": 4, + "total_flos": 967966479089664.0, + "train_batch_size": 8, "trial_name": null, "trial_params": null }