diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -2,5037 +2,2673 @@ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, - "epoch": 2.0, - "eval_steps": 500, - "global_step": 624, + "epoch": 3.0, + "eval_steps": 51, + "global_step": 324, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { - "epoch": 0.0032147880249146074, - "grad_norm": 5.018370779973616, + "epoch": 0.009324009324009324, + "grad_norm": 15.097446706645455, "learning_rate": 0.0, - "loss": 1.8005, - "num_tokens": 834234.0, + "loss": 1.8292, + "num_tokens": 224382.0, "step": 1 }, { - "epoch": 0.006429576049829215, - "grad_norm": 5.285795629185287, - "learning_rate": 5.263157894736843e-07, - "loss": 1.7416, - "num_tokens": 1582170.0, + "epoch": 0.018648018648018648, + "grad_norm": 15.427115110106058, + "learning_rate": 1.0000000000000002e-06, + "loss": 1.8351, + "num_tokens": 438803.0, "step": 2 }, { - "epoch": 0.009644364074743821, - "grad_norm": 5.355946444791632, - "learning_rate": 1.0526315789473685e-06, - "loss": 1.8605, - "num_tokens": 2310664.0, + "epoch": 0.027972027972027972, + "grad_norm": 14.875101522845485, + "learning_rate": 2.0000000000000003e-06, + "loss": 1.8224, + "num_tokens": 657344.0, "step": 3 }, { - "epoch": 0.01285915209965843, - "grad_norm": 4.630568418836809, - "learning_rate": 1.5789473684210526e-06, - "loss": 1.6951, - "num_tokens": 3218804.0, + "epoch": 0.037296037296037296, + "grad_norm": 13.4290730870021, + "learning_rate": 3e-06, + "loss": 1.7588, + "num_tokens": 881653.0, "step": 4 }, { - "epoch": 0.016073940124573034, - "grad_norm": 4.606087537452017, - "learning_rate": 2.105263157894737e-06, - "loss": 1.8162, - "num_tokens": 4012626.0, + "epoch": 0.046620046620046623, + "grad_norm": 10.499006101838235, + "learning_rate": 4.000000000000001e-06, + "loss": 1.6163, + "num_tokens": 1113061.0, "step": 5 }, { - "epoch": 0.019288728149487643, - "grad_norm": 3.446983316569978, - "learning_rate": 2.631578947368421e-06, - "loss": 1.6542, - "num_tokens": 4844317.0, + "epoch": 0.055944055944055944, + "grad_norm": 8.049414704255772, + "learning_rate": 5e-06, + "loss": 1.4542, + "num_tokens": 1331772.0, "step": 6 }, { - "epoch": 0.02250351617440225, - "grad_norm": 2.467317650305565, - "learning_rate": 3.157894736842105e-06, - "loss": 1.7126, - "num_tokens": 5729512.0, + "epoch": 0.06526806526806526, + "grad_norm": 7.722374086770013, + "learning_rate": 6e-06, + "loss": 1.3668, + "num_tokens": 1562018.0, "step": 7 }, { - "epoch": 0.02571830419931686, - "grad_norm": 2.4184789170509697, - "learning_rate": 3.6842105263157896e-06, - "loss": 1.5641, - "num_tokens": 6511297.0, + "epoch": 0.07459207459207459, + "grad_norm": 5.947276769620558, + "learning_rate": 7e-06, + "loss": 1.1882, + "num_tokens": 1788639.0, "step": 8 }, { - "epoch": 0.028933092224231464, - "grad_norm": 2.0433418789267157, - "learning_rate": 4.210526315789474e-06, - "loss": 1.5966, - "num_tokens": 7267796.0, + "epoch": 0.08391608391608392, + "grad_norm": 5.113219943739578, + "learning_rate": 8.000000000000001e-06, + "loss": 1.1126, + "num_tokens": 2014673.0, "step": 9 }, { - "epoch": 0.03214788024914607, - "grad_norm": 2.043560232971009, - "learning_rate": 4.736842105263158e-06, - "loss": 1.5521, - "num_tokens": 8175090.0, + "epoch": 0.09324009324009325, + "grad_norm": 14.715304846179434, + "learning_rate": 9e-06, + "loss": 1.0331, + "num_tokens": 2250746.0, "step": 10 }, { - "epoch": 0.03536266827406068, - "grad_norm": 2.0005277319771237, - "learning_rate": 5.263157894736842e-06, - "loss": 1.5093, - "num_tokens": 9009584.0, + "epoch": 0.10256410256410256, + "grad_norm": 6.82907493160171, + "learning_rate": 1e-05, + "loss": 0.9996, + "num_tokens": 2478394.0, "step": 11 }, { - "epoch": 0.038577456298975285, - "grad_norm": 1.7470627630108726, - "learning_rate": 5.789473684210527e-06, - "loss": 1.3749, - "num_tokens": 9829075.0, + "epoch": 0.11188811188811189, + "grad_norm": 3.6488282737279203, + "learning_rate": 9.999774773574383e-06, + "loss": 0.9297, + "num_tokens": 2713916.0, "step": 12 }, { - "epoch": 0.04179224432388989, - "grad_norm": 1.6870904358184513, - "learning_rate": 6.31578947368421e-06, - "loss": 1.406, - "num_tokens": 10698507.0, + "epoch": 0.12121212121212122, + "grad_norm": 2.8908805137859828, + "learning_rate": 9.999099116842838e-06, + "loss": 0.8859, + "num_tokens": 2943638.0, "step": 13 }, { - "epoch": 0.0450070323488045, - "grad_norm": 1.5086244299682119, - "learning_rate": 6.842105263157896e-06, - "loss": 1.3892, - "num_tokens": 11558592.0, + "epoch": 0.13053613053613053, + "grad_norm": 2.593442104323103, + "learning_rate": 9.99797309743903e-06, + "loss": 0.898, + "num_tokens": 3158482.0, "step": 14 }, { - "epoch": 0.04822182037371911, - "grad_norm": 1.2676270261507918, - "learning_rate": 7.368421052631579e-06, - "loss": 1.4179, - "num_tokens": 12402138.0, + "epoch": 0.13986013986013987, + "grad_norm": 2.374961152220559, + "learning_rate": 9.99639682807822e-06, + "loss": 0.8464, + "num_tokens": 3398166.0, "step": 15 }, { - "epoch": 0.05143660839863372, - "grad_norm": 1.1825447622786052, - "learning_rate": 7.894736842105265e-06, - "loss": 1.3664, - "num_tokens": 13178988.0, + "epoch": 0.14918414918414918, + "grad_norm": 1.8725167978976107, + "learning_rate": 9.994370466545966e-06, + "loss": 0.829, + "num_tokens": 3631244.0, "step": 16 }, { - "epoch": 0.05465139642354832, - "grad_norm": 1.0410376181586634, - "learning_rate": 8.421052631578948e-06, - "loss": 1.2379, - "num_tokens": 14083529.0, + "epoch": 0.1585081585081585, + "grad_norm": 1.3376641050560698, + "learning_rate": 9.99189421568234e-06, + "loss": 0.7943, + "num_tokens": 3851898.0, "step": 17 }, { - "epoch": 0.05786618444846293, - "grad_norm": 0.965106740214503, - "learning_rate": 8.947368421052632e-06, - "loss": 1.2962, - "num_tokens": 14838407.0, + "epoch": 0.16783216783216784, + "grad_norm": 1.4995925443300326, + "learning_rate": 9.988968323361627e-06, + "loss": 0.7756, + "num_tokens": 4079798.0, "step": 18 }, { - "epoch": 0.061080972473377536, - "grad_norm": 1.2039075649568454, - "learning_rate": 9.473684210526315e-06, - "loss": 1.4121, - "num_tokens": 15594653.0, + "epoch": 0.17715617715617715, + "grad_norm": 1.275240322623827, + "learning_rate": 9.985593082467498e-06, + "loss": 0.7681, + "num_tokens": 4308418.0, "step": 19 }, { - "epoch": 0.06429576049829214, - "grad_norm": 0.7854097257840835, - "learning_rate": 1e-05, - "loss": 1.2706, - "num_tokens": 16474437.0, + "epoch": 0.1864801864801865, + "grad_norm": 1.049158733271031, + "learning_rate": 9.981768830863707e-06, + "loss": 0.7208, + "num_tokens": 4528712.0, "step": 20 }, { - "epoch": 0.06751054852320675, - "grad_norm": 0.8320199433925605, - "learning_rate": 9.99993933048288e-06, - "loss": 1.2198, - "num_tokens": 17245976.0, + "epoch": 0.1958041958041958, + "grad_norm": 1.0263177055889612, + "learning_rate": 9.977495951360264e-06, + "loss": 0.7459, + "num_tokens": 4756886.0, "step": 21 }, { - "epoch": 0.07072533654812135, - "grad_norm": 0.7129188000282078, - "learning_rate": 9.999757323567429e-06, - "loss": 1.1134, - "num_tokens": 18127984.0, + "epoch": 0.20512820512820512, + "grad_norm": 1.0822926194764306, + "learning_rate": 9.97277487167511e-06, + "loss": 0.7052, + "num_tokens": 4984115.0, "step": 22 }, { - "epoch": 0.07394012457303596, - "grad_norm": 0.6722845582272569, - "learning_rate": 9.999453984161321e-06, - "loss": 1.1478, - "num_tokens": 18970588.0, + "epoch": 0.21445221445221446, + "grad_norm": 1.0092686382544667, + "learning_rate": 9.967606064391318e-06, + "loss": 0.7186, + "num_tokens": 5204818.0, "step": 23 }, { - "epoch": 0.07715491259795057, - "grad_norm": 0.6030801292784839, - "learning_rate": 9.99902932044387e-06, - "loss": 1.1103, - "num_tokens": 19800781.0, + "epoch": 0.22377622377622378, + "grad_norm": 0.7782398807672024, + "learning_rate": 9.96199004690977e-06, + "loss": 0.6978, + "num_tokens": 5431406.0, "step": 24 }, { - "epoch": 0.08036970062286518, - "grad_norm": 0.6372911582485848, - "learning_rate": 9.998483343865806e-06, - "loss": 1.1283, - "num_tokens": 20631221.0, + "epoch": 0.2331002331002331, + "grad_norm": 0.9300259353100147, + "learning_rate": 9.955927381397374e-06, + "loss": 0.6841, + "num_tokens": 5661807.0, "step": 25 }, { - "epoch": 0.08358448864777979, - "grad_norm": 0.6405070539836101, - "learning_rate": 9.99781606914897e-06, - "loss": 1.1999, - "num_tokens": 21400785.0, + "epoch": 0.24242424242424243, + "grad_norm": 0.8985698420847705, + "learning_rate": 9.949418674730787e-06, + "loss": 0.6456, + "num_tokens": 5883328.0, "step": 26 }, { - "epoch": 0.0867992766726944, - "grad_norm": 0.5061595936280484, - "learning_rate": 9.997027514285905e-06, - "loss": 1.0864, - "num_tokens": 22260021.0, + "epoch": 0.2517482517482518, + "grad_norm": 0.7655861536606542, + "learning_rate": 9.942464578435674e-06, + "loss": 0.6666, + "num_tokens": 6116065.0, "step": 27 }, { - "epoch": 0.090014064697609, - "grad_norm": 0.49092433177759565, - "learning_rate": 9.996117700539389e-06, - "loss": 1.0715, - "num_tokens": 23083303.0, + "epoch": 0.26107226107226106, + "grad_norm": 0.8426013594218673, + "learning_rate": 9.935065788621479e-06, + "loss": 0.653, + "num_tokens": 6337735.0, "step": 28 }, { - "epoch": 0.09322885272252361, - "grad_norm": 0.44643949398460303, - "learning_rate": 9.99508665244185e-06, - "loss": 0.9406, - "num_tokens": 23865289.0, + "epoch": 0.2703962703962704, + "grad_norm": 0.7979514227426084, + "learning_rate": 9.92722304591175e-06, + "loss": 0.649, + "num_tokens": 6559217.0, "step": 29 }, { - "epoch": 0.09644364074743822, - "grad_norm": 0.4530732634846802, - "learning_rate": 9.993934397794704e-06, - "loss": 1.053, - "num_tokens": 24747706.0, + "epoch": 0.27972027972027974, + "grad_norm": 0.7580072355627283, + "learning_rate": 9.918937135370002e-06, + "loss": 0.6319, + "num_tokens": 6789052.0, "step": 30 }, { - "epoch": 0.09965842877235283, - "grad_norm": 0.44781482088612473, - "learning_rate": 9.99266096766761e-06, - "loss": 1.0716, - "num_tokens": 25568954.0, + "epoch": 0.289044289044289, + "grad_norm": 0.8286474240201317, + "learning_rate": 9.91020888642113e-06, + "loss": 0.6591, + "num_tokens": 7026823.0, "step": 31 }, { - "epoch": 0.10287321679726744, - "grad_norm": 0.3864901699187842, - "learning_rate": 9.991266396397634e-06, - "loss": 0.9772, - "num_tokens": 26357204.0, + "epoch": 0.29836829836829837, + "grad_norm": 0.7702640758775066, + "learning_rate": 9.901039172768384e-06, + "loss": 0.6296, + "num_tokens": 7254546.0, "step": 32 }, { - "epoch": 0.10608800482218204, - "grad_norm": 0.41856495008454664, - "learning_rate": 9.98975072158831e-06, - "loss": 1.0565, - "num_tokens": 27226911.0, + "epoch": 0.3076923076923077, + "grad_norm": 0.7489368196898364, + "learning_rate": 9.89142891230591e-06, + "loss": 0.5884, + "num_tokens": 7490975.0, "step": 33 }, { - "epoch": 0.10930279284709664, - "grad_norm": 0.404558177581084, - "learning_rate": 9.988113984108649e-06, - "loss": 1.005, - "num_tokens": 28025969.0, + "epoch": 0.317016317016317, + "grad_norm": 0.8438034724677627, + "learning_rate": 9.88137906702687e-06, + "loss": 0.5961, + "num_tokens": 7716958.0, "step": 34 }, { - "epoch": 0.11251758087201125, - "grad_norm": 0.38269257479493757, - "learning_rate": 9.986356228092011e-06, - "loss": 1.0499, - "num_tokens": 28879552.0, + "epoch": 0.32634032634032634, + "grad_norm": 0.8465427563269303, + "learning_rate": 9.870890642927145e-06, + "loss": 0.5883, + "num_tokens": 7945144.0, "step": 35 }, { - "epoch": 0.11573236889692586, - "grad_norm": 0.368432044258959, - "learning_rate": 9.984477500934935e-06, - "loss": 0.9644, - "num_tokens": 29711995.0, + "epoch": 0.3356643356643357, + "grad_norm": 0.6949632315845249, + "learning_rate": 9.859964689904631e-06, + "loss": 0.566, + "num_tokens": 8163469.0, "step": 36 }, { - "epoch": 0.11894715692184046, - "grad_norm": 0.3776367028480142, - "learning_rate": 9.982477853295853e-06, - "loss": 0.9161, - "num_tokens": 30587748.0, + "epoch": 0.34498834498834496, + "grad_norm": 0.7572473510759273, + "learning_rate": 9.848602301654151e-06, + "loss": 0.5746, + "num_tokens": 8388850.0, "step": 37 }, { - "epoch": 0.12216194494675507, - "grad_norm": 0.3894030927742351, - "learning_rate": 9.980357339093722e-06, - "loss": 1.0458, - "num_tokens": 31409239.0, + "epoch": 0.3543123543123543, + "grad_norm": 0.8258450110130665, + "learning_rate": 9.836804615557965e-06, + "loss": 0.5932, + "num_tokens": 8606930.0, "step": 38 }, { - "epoch": 0.12537673297166968, - "grad_norm": 0.35161842166771995, - "learning_rate": 9.978116015506575e-06, - "loss": 0.9182, - "num_tokens": 32188445.0, + "epoch": 0.36363636363636365, + "grad_norm": 0.736881354400413, + "learning_rate": 9.82457281257193e-06, + "loss": 0.5718, + "num_tokens": 8833247.0, "step": 39 }, { - "epoch": 0.12859152099658427, - "grad_norm": 0.3565210905884514, - "learning_rate": 9.975753942969978e-06, - "loss": 0.9141, - "num_tokens": 32989758.0, + "epoch": 0.372960372960373, + "grad_norm": 0.6874213684175794, + "learning_rate": 9.811908117107269e-06, + "loss": 0.5685, + "num_tokens": 9065155.0, "step": 40 }, { - "epoch": 0.1318063090214989, - "grad_norm": 0.36427320051129425, - "learning_rate": 9.973271185175394e-06, - "loss": 1.0101, - "num_tokens": 33808924.0, + "epoch": 0.3822843822843823, + "grad_norm": 0.6917539701436399, + "learning_rate": 9.79881179690803e-06, + "loss": 0.5615, + "num_tokens": 9293680.0, "step": 41 }, { - "epoch": 0.1350210970464135, - "grad_norm": 0.36994663947393924, - "learning_rate": 9.970667809068476e-06, - "loss": 1.0084, - "num_tokens": 34673373.0, + "epoch": 0.3916083916083916, + "grad_norm": 0.7525097359649862, + "learning_rate": 9.78528516292416e-06, + "loss": 0.5655, + "num_tokens": 9511799.0, "step": 42 }, { - "epoch": 0.1382358850713281, - "grad_norm": 0.3320100538768802, - "learning_rate": 9.967943884847259e-06, - "loss": 0.8867, - "num_tokens": 35555852.0, + "epoch": 0.40093240093240096, + "grad_norm": 0.7242611329025388, + "learning_rate": 9.771329569180288e-06, + "loss": 0.5473, + "num_tokens": 9737324.0, "step": 43 }, { - "epoch": 0.1414506730962427, - "grad_norm": 0.3488125662115648, - "learning_rate": 9.965099485960257e-06, - "loss": 1.0164, - "num_tokens": 36414846.0, + "epoch": 0.41025641025641024, + "grad_norm": 0.6542100040132239, + "learning_rate": 9.756946412640193e-06, + "loss": 0.5499, + "num_tokens": 9981387.0, "step": 44 }, { - "epoch": 0.14466546112115733, - "grad_norm": 0.3026831580817585, - "learning_rate": 9.962134689104498e-06, - "loss": 0.9572, - "num_tokens": 37288272.0, + "epoch": 0.4195804195804196, + "grad_norm": 0.6916418171976277, + "learning_rate": 9.742137133066959e-06, + "loss": 0.5377, + "num_tokens": 10208518.0, "step": 45 }, { - "epoch": 0.14788024914607192, - "grad_norm": 0.333752952947364, - "learning_rate": 9.959049574223447e-06, - "loss": 0.9021, - "num_tokens": 38184064.0, + "epoch": 0.4289044289044289, + "grad_norm": 0.798650044046329, + "learning_rate": 9.726903212878853e-06, + "loss": 0.5762, + "num_tokens": 10424661.0, "step": 46 }, { - "epoch": 0.15109503717098655, - "grad_norm": 0.36416859139573193, - "learning_rate": 9.955844224504849e-06, - "loss": 1.0018, - "num_tokens": 39017031.0, + "epoch": 0.4382284382284382, + "grad_norm": 0.7072203438915419, + "learning_rate": 9.711246177000938e-06, + "loss": 0.5368, + "num_tokens": 10657919.0, "step": 47 }, { - "epoch": 0.15430982519590114, - "grad_norm": 0.3924003711313513, - "learning_rate": 9.95251872637849e-06, - "loss": 1.1063, - "num_tokens": 39904372.0, + "epoch": 0.44755244755244755, + "grad_norm": 0.6637477276125824, + "learning_rate": 9.695167592712426e-06, + "loss": 0.5341, + "num_tokens": 10885634.0, "step": 48 }, { - "epoch": 0.15752461322081576, - "grad_norm": 0.3842477493485173, - "learning_rate": 9.949073169513865e-06, - "loss": 0.9811, - "num_tokens": 40724694.0, + "epoch": 0.4568764568764569, + "grad_norm": 0.7461742922095873, + "learning_rate": 9.678669069489793e-06, + "loss": 0.5477, + "num_tokens": 11121932.0, "step": 49 }, { - "epoch": 0.16073940124573036, - "grad_norm": 0.34280723864927665, - "learning_rate": 9.945507646817764e-06, - "loss": 0.9856, - "num_tokens": 41595935.0, + "epoch": 0.4662004662004662, + "grad_norm": 0.7083145131153268, + "learning_rate": 9.661752258845667e-06, + "loss": 0.5411, + "num_tokens": 11346099.0, "step": 50 }, { - "epoch": 0.16395418927064498, - "grad_norm": 0.38335048202794086, - "learning_rate": 9.941822254431759e-06, - "loss": 1.1081, - "num_tokens": 42410254.0, + "epoch": 0.4755244755244755, + "grad_norm": 0.6804629701224661, + "learning_rate": 9.644418854163509e-06, + "loss": 0.5495, + "num_tokens": 11573878.0, + "step": 51 + }, + { + "epoch": 0.4755244755244755, + "eval_loss": 0.5184547305107117, + "eval_num_tokens": 11573878.0, + "eval_runtime": 46.9117, + "eval_samples_per_second": 65.037, + "eval_steps_per_second": 8.143, "step": 51 }, { - "epoch": 0.16716897729555957, - "grad_norm": 0.35909501963913254, - "learning_rate": 9.938017091729616e-06, - "loss": 0.9258, - "num_tokens": 43232271.0, + "epoch": 0.48484848484848486, + "grad_norm": 0.670194145666963, + "learning_rate": 9.626670590528115e-06, + "loss": 0.5251, + "num_tokens": 11809080.0, "step": 52 }, { - "epoch": 0.17038376532047417, - "grad_norm": 0.3497836040749439, - "learning_rate": 9.93409226131462e-06, - "loss": 0.9528, - "num_tokens": 44042404.0, + "epoch": 0.49417249417249415, + "grad_norm": 0.6532238552343984, + "learning_rate": 9.608509244551916e-06, + "loss": 0.5283, + "num_tokens": 12033466.0, "step": 53 }, { - "epoch": 0.1735985533453888, - "grad_norm": 0.40119263957232615, - "learning_rate": 9.930047869016796e-06, - "loss": 1.0352, - "num_tokens": 44856500.0, + "epoch": 0.5034965034965035, + "grad_norm": 0.6540950806759922, + "learning_rate": 9.589936634197155e-06, + "loss": 0.525, + "num_tokens": 12266159.0, "step": 54 }, { - "epoch": 0.17681334137030338, - "grad_norm": 0.36866920413699433, - "learning_rate": 9.925884023890072e-06, - "loss": 0.9902, - "num_tokens": 45696584.0, + "epoch": 0.5128205128205128, + "grad_norm": 0.6803327517194798, + "learning_rate": 9.570954618593895e-06, + "loss": 0.5118, + "num_tokens": 12481983.0, "step": 55 }, { - "epoch": 0.180028129395218, - "grad_norm": 0.39414649999947265, - "learning_rate": 9.92160083820932e-06, - "loss": 1.0943, - "num_tokens": 46538123.0, + "epoch": 0.5221445221445221, + "grad_norm": 0.6856024969619839, + "learning_rate": 9.551565097853923e-06, + "loss": 0.4929, + "num_tokens": 12708247.0, "step": 56 }, { - "epoch": 0.1832429174201326, - "grad_norm": 0.5439161215339307, - "learning_rate": 9.917198427467349e-06, - "loss": 1.0921, - "num_tokens": 47293261.0, + "epoch": 0.5314685314685315, + "grad_norm": 0.6101426047566378, + "learning_rate": 9.531770012880554e-06, + "loss": 0.5095, + "num_tokens": 12939945.0, "step": 57 }, { - "epoch": 0.18645770544504722, - "grad_norm": 0.3717714491902534, - "learning_rate": 9.912676910371769e-06, - "loss": 1.0704, - "num_tokens": 48163851.0, + "epoch": 0.5407925407925408, + "grad_norm": 0.6475609980018124, + "learning_rate": 9.511571345174331e-06, + "loss": 0.4989, + "num_tokens": 13159254.0, "step": 58 }, { - "epoch": 0.18967249346996182, - "grad_norm": 0.4108510169114107, - "learning_rate": 9.908036408841808e-06, - "loss": 0.9808, - "num_tokens": 49049880.0, + "epoch": 0.5501165501165501, + "grad_norm": 0.6732344762060629, + "learning_rate": 9.490971116634695e-06, + "loss": 0.5106, + "num_tokens": 13379346.0, "step": 59 }, { - "epoch": 0.19288728149487644, - "grad_norm": 0.3756801961132257, - "learning_rate": 9.903277048005017e-06, - "loss": 0.9794, - "num_tokens": 49877687.0, + "epoch": 0.5594405594405595, + "grad_norm": 0.6389095283638084, + "learning_rate": 9.469971389357574e-06, + "loss": 0.4976, + "num_tokens": 13612453.0, "step": 60 }, { - "epoch": 0.19610206951979103, - "grad_norm": 0.3680823897076448, - "learning_rate": 9.898398956193896e-06, - "loss": 1.0079, - "num_tokens": 50760445.0, + "epoch": 0.5687645687645687, + "grad_norm": 0.7102859144485985, + "learning_rate": 9.448574265428972e-06, + "loss": 0.4926, + "num_tokens": 13843528.0, "step": 61 }, { - "epoch": 0.19931685754470566, - "grad_norm": 0.3588649642897562, - "learning_rate": 9.893402264942427e-06, - "loss": 1.0316, - "num_tokens": 51558687.0, + "epoch": 0.578088578088578, + "grad_norm": 0.5992429263255332, + "learning_rate": 9.426781886714551e-06, + "loss": 0.4936, + "num_tokens": 14077000.0, "step": 62 }, { - "epoch": 0.20253164556962025, - "grad_norm": 0.37244130989853, - "learning_rate": 9.88828710898255e-06, - "loss": 0.8763, - "num_tokens": 52413025.0, + "epoch": 0.5874125874125874, + "grad_norm": 0.7166387603738363, + "learning_rate": 9.404596434645232e-06, + "loss": 0.4889, + "num_tokens": 14294350.0, "step": 63 }, { - "epoch": 0.20574643359453487, - "grad_norm": 0.3366971836092278, - "learning_rate": 9.883053626240503e-06, - "loss": 0.9735, - "num_tokens": 53244993.0, + "epoch": 0.5967365967365967, + "grad_norm": 0.6561978843481102, + "learning_rate": 9.382020129998821e-06, + "loss": 0.4979, + "num_tokens": 14532343.0, "step": 64 }, { - "epoch": 0.20896122161944947, - "grad_norm": 0.36086635391864136, - "learning_rate": 9.877701957833113e-06, - "loss": 0.9933, - "num_tokens": 54087376.0, + "epoch": 0.6060606060606061, + "grad_norm": 0.61526807109734, + "learning_rate": 9.359055232677718e-06, + "loss": 0.4759, + "num_tokens": 14756340.0, "step": 65 }, { - "epoch": 0.2121760096443641, - "grad_norm": 0.3362733367365835, - "learning_rate": 9.872232248064005e-06, - "loss": 0.9259, - "num_tokens": 54917212.0, + "epoch": 0.6153846153846154, + "grad_norm": 0.658194861071045, + "learning_rate": 9.335704041482697e-06, + "loss": 0.471, + "num_tokens": 14968956.0, "step": 66 }, { - "epoch": 0.21539079766927868, - "grad_norm": 0.39480077981213596, - "learning_rate": 9.866644644419681e-06, - "loss": 0.9974, - "num_tokens": 55724847.0, + "epoch": 0.6247086247086248, + "grad_norm": 0.616416691332377, + "learning_rate": 9.311968893882791e-06, + "loss": 0.4912, + "num_tokens": 15200242.0, "step": 67 }, { - "epoch": 0.21860558569419328, - "grad_norm": 0.33268267716701183, - "learning_rate": 9.860939297565579e-06, - "loss": 0.9181, - "num_tokens": 56496956.0, + "epoch": 0.634032634032634, + "grad_norm": 0.6343339828665332, + "learning_rate": 9.287852165781312e-06, + "loss": 0.4919, + "num_tokens": 15426840.0, "step": 68 }, { - "epoch": 0.2218203737191079, - "grad_norm": 0.3256850104196996, - "learning_rate": 9.855116361341977e-06, - "loss": 0.925, - "num_tokens": 57322615.0, + "epoch": 0.6433566433566433, + "grad_norm": 0.6564153295890287, + "learning_rate": 9.263356271278027e-06, + "loss": 0.4793, + "num_tokens": 15658816.0, "step": 69 }, { - "epoch": 0.2250351617440225, - "grad_norm": 0.3292952414692179, - "learning_rate": 9.849175992759867e-06, - "loss": 0.8195, - "num_tokens": 58133079.0, + "epoch": 0.6526806526806527, + "grad_norm": 0.6425664251133348, + "learning_rate": 9.238483662427493e-06, + "loss": 0.4823, + "num_tokens": 15885811.0, "step": 70 }, { - "epoch": 0.22824994976893712, - "grad_norm": 0.3714186436003435, - "learning_rate": 9.843118351996715e-06, - "loss": 1.002, - "num_tokens": 58959915.0, + "epoch": 0.662004662004662, + "grad_norm": 0.628140689158425, + "learning_rate": 9.213236828993619e-06, + "loss": 0.4749, + "num_tokens": 16118002.0, "step": 71 }, { - "epoch": 0.2314647377938517, - "grad_norm": 0.37277494767147934, - "learning_rate": 9.836943602392134e-06, - "loss": 0.9215, - "num_tokens": 59859503.0, + "epoch": 0.6713286713286714, + "grad_norm": 0.6485184833730361, + "learning_rate": 9.187618298200425e-06, + "loss": 0.4786, + "num_tokens": 16346221.0, "step": 72 }, { - "epoch": 0.23467952581876633, - "grad_norm": 0.35690842786835025, - "learning_rate": 9.830651910443495e-06, - "loss": 0.9979, - "num_tokens": 60693073.0, + "epoch": 0.6806526806526807, + "grad_norm": 0.5979662338785944, + "learning_rate": 9.16163063447908e-06, + "loss": 0.4879, + "num_tokens": 16575378.0, "step": 73 }, { - "epoch": 0.23789431384368093, - "grad_norm": 0.4008076785449049, - "learning_rate": 9.824243445801427e-06, - "loss": 1.0384, - "num_tokens": 61536239.0, + "epoch": 0.6899766899766899, + "grad_norm": 0.675557026688775, + "learning_rate": 9.13527643921118e-06, + "loss": 0.4661, + "num_tokens": 16809130.0, "step": 74 }, { - "epoch": 0.24110910186859555, - "grad_norm": 0.33201196901193847, - "learning_rate": 9.81771838126524e-06, - "loss": 0.9466, - "num_tokens": 62374068.0, + "epoch": 0.6993006993006993, + "grad_norm": 0.6422885724710128, + "learning_rate": 9.10855835046838e-06, + "loss": 0.4679, + "num_tokens": 17026726.0, "step": 75 }, { - "epoch": 0.24432388989351014, - "grad_norm": 0.34366584744174794, - "learning_rate": 9.811076892778274e-06, - "loss": 0.9766, - "num_tokens": 63171704.0, + "epoch": 0.7086247086247086, + "grad_norm": 0.6114995628017877, + "learning_rate": 9.081479042748286e-06, + "loss": 0.465, + "num_tokens": 17257265.0, "step": 76 }, { - "epoch": 0.24753867791842477, - "grad_norm": 0.31472988083332465, - "learning_rate": 9.804319159423153e-06, - "loss": 0.9824, - "num_tokens": 64119499.0, + "epoch": 0.717948717948718, + "grad_norm": 0.6151916913104115, + "learning_rate": 9.05404122670676e-06, + "loss": 0.4646, + "num_tokens": 17478618.0, "step": 77 }, { - "epoch": 0.25075346594333936, - "grad_norm": 0.349249603256903, - "learning_rate": 9.797445363416954e-06, - "loss": 0.9869, - "num_tokens": 65007373.0, + "epoch": 0.7272727272727273, + "grad_norm": 0.5936620988012073, + "learning_rate": 9.026247648886567e-06, + "loss": 0.461, + "num_tokens": 17704974.0, "step": 78 }, { - "epoch": 0.25396825396825395, - "grad_norm": 0.3225294656226075, - "learning_rate": 9.790455690106288e-06, - "loss": 0.9655, - "num_tokens": 65856713.0, + "epoch": 0.7365967365967366, + "grad_norm": 0.6780504989722205, + "learning_rate": 8.998101091442469e-06, + "loss": 0.4558, + "num_tokens": 17930277.0, "step": 79 }, { - "epoch": 0.25718304199316855, - "grad_norm": 0.32874833994115543, - "learning_rate": 9.783350327962313e-06, - "loss": 0.9478, - "num_tokens": 66702200.0, + "epoch": 0.745920745920746, + "grad_norm": 0.660954300165115, + "learning_rate": 8.969604371862689e-06, + "loss": 0.4718, + "num_tokens": 18152384.0, "step": 80 }, { - "epoch": 0.2603978300180832, - "grad_norm": 0.3957648430833569, - "learning_rate": 9.77612946857565e-06, - "loss": 1.0101, - "num_tokens": 67594499.0, + "epoch": 0.7552447552447552, + "grad_norm": 0.6094153908408396, + "learning_rate": 8.940760342686918e-06, + "loss": 0.4721, + "num_tokens": 18378793.0, "step": 81 }, { - "epoch": 0.2636126180429978, - "grad_norm": 0.3202828135095863, - "learning_rate": 9.76879330665121e-06, - "loss": 0.8986, - "num_tokens": 68445250.0, + "epoch": 0.7645687645687645, + "grad_norm": 0.7092610082946307, + "learning_rate": 8.911571891220749e-06, + "loss": 0.4547, + "num_tokens": 18595092.0, "step": 82 }, { - "epoch": 0.2668274060679124, - "grad_norm": 0.32514848411939723, - "learning_rate": 9.76134204000295e-06, - "loss": 0.937, - "num_tokens": 69245032.0, + "epoch": 0.7738927738927739, + "grad_norm": 0.6629720414327388, + "learning_rate": 8.882041939246671e-06, + "loss": 0.4705, + "num_tokens": 18817832.0, "step": 83 }, { - "epoch": 0.270042194092827, - "grad_norm": 0.33091920640015715, - "learning_rate": 9.753775869548531e-06, - "loss": 0.9132, - "num_tokens": 70143909.0, + "epoch": 0.7832167832167832, + "grad_norm": 0.6373290745696255, + "learning_rate": 8.852173442731586e-06, + "loss": 0.4454, + "num_tokens": 19032032.0, "step": 84 }, { - "epoch": 0.27325698211774163, - "grad_norm": 0.3507015129833903, - "learning_rate": 9.74609499930392e-06, - "loss": 0.926, - "num_tokens": 70920896.0, + "epoch": 0.7925407925407926, + "grad_norm": 0.6617499162396578, + "learning_rate": 8.821969391530922e-06, + "loss": 0.4508, + "num_tokens": 19267954.0, "step": 85 }, { - "epoch": 0.2764717701426562, - "grad_norm": 0.3657803712498244, - "learning_rate": 9.738299636377863e-06, - "loss": 1.0303, - "num_tokens": 71765297.0, + "epoch": 0.8018648018648019, + "grad_norm": 0.6918672339945613, + "learning_rate": 8.791432809089337e-06, + "loss": 0.459, + "num_tokens": 19500998.0, "step": 86 }, { - "epoch": 0.2796865581675708, - "grad_norm": 0.3127658249666305, - "learning_rate": 9.730389990966316e-06, - "loss": 0.9874, - "num_tokens": 72625810.0, + "epoch": 0.8111888111888111, + "grad_norm": 0.7075903033564215, + "learning_rate": 8.760566752138085e-06, + "loss": 0.4594, + "num_tokens": 19727754.0, "step": 87 }, { - "epoch": 0.2829013461924854, - "grad_norm": 0.3075107299296093, - "learning_rate": 9.722366276346782e-06, - "loss": 0.8407, - "num_tokens": 73425163.0, + "epoch": 0.8205128205128205, + "grad_norm": 0.5902673136102629, + "learning_rate": 8.729374310389024e-06, + "loss": 0.433, + "num_tokens": 19949582.0, "step": 88 }, { - "epoch": 0.28611613421740006, - "grad_norm": 0.34876414592481475, - "learning_rate": 9.714228708872538e-06, - "loss": 1.0851, - "num_tokens": 74306863.0, + "epoch": 0.8298368298368298, + "grad_norm": 0.6005303704900206, + "learning_rate": 8.697858606225336e-06, + "loss": 0.4524, + "num_tokens": 20179845.0, "step": 89 }, { - "epoch": 0.28933092224231466, - "grad_norm": 0.3445381243110137, - "learning_rate": 9.70597750796683e-06, - "loss": 0.8917, - "num_tokens": 75117773.0, + "epoch": 0.8391608391608392, + "grad_norm": 0.6267051648548888, + "learning_rate": 8.666022794388975e-06, + "loss": 0.423, + "num_tokens": 20398810.0, "step": 90 }, { - "epoch": 0.29254571026722925, - "grad_norm": 0.31343080108381277, - "learning_rate": 9.697612896116932e-06, - "loss": 0.8662, - "num_tokens": 75916954.0, + "epoch": 0.8484848484848485, + "grad_norm": 0.613848865649995, + "learning_rate": 8.633870061664878e-06, + "loss": 0.4327, + "num_tokens": 20621339.0, "step": 91 }, { - "epoch": 0.29576049829214385, - "grad_norm": 0.3617828667633587, - "learning_rate": 9.689135098868165e-06, - "loss": 1.0487, - "num_tokens": 76784257.0, + "epoch": 0.8578088578088578, + "grad_norm": 0.6527947401981106, + "learning_rate": 8.601403626561965e-06, + "loss": 0.4299, + "num_tokens": 20850876.0, "step": 92 }, { - "epoch": 0.29897528631705844, - "grad_norm": 0.30353817201679567, - "learning_rate": 9.680544344817796e-06, - "loss": 0.9209, - "num_tokens": 77676398.0, + "epoch": 0.8671328671328671, + "grad_norm": 0.6168809536564613, + "learning_rate": 8.568626738990958e-06, + "loss": 0.4293, + "num_tokens": 21073850.0, "step": 93 }, { - "epoch": 0.3021900743419731, - "grad_norm": 0.3358065065228029, - "learning_rate": 9.671840865608895e-06, - "loss": 0.8786, - "num_tokens": 78472781.0, + "epoch": 0.8764568764568764, + "grad_norm": 0.6122685135496609, + "learning_rate": 8.535542679939074e-06, + "loss": 0.4565, + "num_tokens": 21309531.0, "step": 94 }, { - "epoch": 0.3054048623668877, - "grad_norm": 0.3245495143319221, - "learning_rate": 9.663024895924078e-06, - "loss": 0.9495, - "num_tokens": 79258318.0, + "epoch": 0.8857808857808858, + "grad_norm": 0.633692832704968, + "learning_rate": 8.502154761141581e-06, + "loss": 0.4284, + "num_tokens": 21536877.0, "step": 95 }, { - "epoch": 0.3086196503918023, - "grad_norm": 0.35123175846989657, - "learning_rate": 9.654096673479175e-06, - "loss": 0.9088, - "num_tokens": 80099527.0, + "epoch": 0.8951048951048951, + "grad_norm": 0.6287354994581217, + "learning_rate": 8.46846632475031e-06, + "loss": 0.4424, + "num_tokens": 21759224.0, "step": 96 }, { - "epoch": 0.3118344384167169, - "grad_norm": 0.333569564544308, - "learning_rate": 9.645056439016827e-06, - "loss": 1.0341, - "num_tokens": 80894502.0, + "epoch": 0.9044289044289044, + "grad_norm": 0.6127026636463726, + "learning_rate": 8.434480742999089e-06, + "loss": 0.4381, + "num_tokens": 21988805.0, "step": 97 }, { - "epoch": 0.3150492264416315, - "grad_norm": 0.3342225039327973, - "learning_rate": 9.635904436299996e-06, - "loss": 0.9264, - "num_tokens": 81676277.0, + "epoch": 0.9137529137529138, + "grad_norm": 0.684690186227573, + "learning_rate": 8.400201417866184e-06, + "loss": 0.4503, + "num_tokens": 22205757.0, "step": 98 }, { - "epoch": 0.3182640144665461, - "grad_norm": 0.3162516881881075, - "learning_rate": 9.626640912105384e-06, - "loss": 0.8747, - "num_tokens": 82505285.0, + "epoch": 0.9230769230769231, + "grad_norm": 0.6105227864598858, + "learning_rate": 8.365631780733757e-06, + "loss": 0.4226, + "num_tokens": 22437039.0, "step": 99 }, { - "epoch": 0.3214788024914607, - "grad_norm": 0.3596687823408207, - "learning_rate": 9.61726611621679e-06, - "loss": 0.9256, - "num_tokens": 83384349.0, + "epoch": 0.9324009324009324, + "grad_norm": 0.5934626007869728, + "learning_rate": 8.330775292044395e-06, + "loss": 0.4349, + "num_tokens": 22665513.0, "step": 100 }, { - "epoch": 0.3246935905163753, - "grad_norm": 0.3256384184342662, - "learning_rate": 9.607780301418363e-06, - "loss": 0.888, - "num_tokens": 84222793.0, + "epoch": 0.9417249417249417, + "grad_norm": 0.6001619283133106, + "learning_rate": 8.295635440954696e-06, + "loss": 0.4346, + "num_tokens": 22894964.0, "step": 101 }, { - "epoch": 0.32790837854128996, - "grad_norm": 0.324981337065647, - "learning_rate": 9.598183723487792e-06, - "loss": 0.8409, - "num_tokens": 85013797.0, + "epoch": 0.951048951048951, + "grad_norm": 0.5567734223853613, + "learning_rate": 8.260215744986021e-06, + "loss": 0.435, + "num_tokens": 23133799.0, + "step": 102 + }, + { + "epoch": 0.951048951048951, + "eval_loss": 0.4339418113231659, + "eval_num_tokens": 23133799.0, + "eval_runtime": 44.5965, + "eval_samples_per_second": 68.413, + "eval_steps_per_second": 8.566, "step": 102 }, { - "epoch": 0.33112316656620455, - "grad_norm": 0.2937739692357729, - "learning_rate": 9.588476641189414e-06, - "loss": 0.7813, - "num_tokens": 85869670.0, + "epoch": 0.9603729603729604, + "grad_norm": 0.5734597599242294, + "learning_rate": 8.224519749672377e-06, + "loss": 0.4206, + "num_tokens": 23358871.0, "step": 103 }, { - "epoch": 0.33433795459111915, - "grad_norm": 0.3812633866725124, - "learning_rate": 9.578659316267223e-06, - "loss": 0.9707, - "num_tokens": 86695185.0, + "epoch": 0.9696969696969697, + "grad_norm": 0.6086368650839594, + "learning_rate": 8.188551028205515e-06, + "loss": 0.4354, + "num_tokens": 23585729.0, "step": 104 }, { - "epoch": 0.33755274261603374, - "grad_norm": 0.36860579813580074, - "learning_rate": 9.568732013437827e-06, - "loss": 0.945, - "num_tokens": 87502060.0, + "epoch": 0.9790209790209791, + "grad_norm": 0.5954657007627898, + "learning_rate": 8.152313181077242e-06, + "loss": 0.4146, + "num_tokens": 23805562.0, "step": 105 }, { - "epoch": 0.34076753064094834, - "grad_norm": 0.3601782476233201, - "learning_rate": 9.5586950003833e-06, - "loss": 0.9642, - "num_tokens": 88358884.0, + "epoch": 0.9883449883449883, + "grad_norm": 0.6138199768988847, + "learning_rate": 8.115809835719015e-06, + "loss": 0.425, + "num_tokens": 24029901.0, "step": 106 }, { - "epoch": 0.343982318665863, - "grad_norm": 0.3169451260952309, - "learning_rate": 9.548548547743967e-06, - "loss": 0.8514, - "num_tokens": 89187281.0, + "epoch": 0.9976689976689976, + "grad_norm": 0.546520390136051, + "learning_rate": 8.079044646138837e-06, + "loss": 0.428, + "num_tokens": 24269754.0, "step": 107 }, { - "epoch": 0.3471971066907776, - "grad_norm": 0.31394701858353663, - "learning_rate": 9.538292929111114e-06, - "loss": 0.954, - "num_tokens": 90057815.0, + "epoch": 1.0, + "grad_norm": 0.546520390136051, + "learning_rate": 8.042021292555477e-06, + "loss": 0.4373, + "num_tokens": 24325562.0, "step": 108 }, { - "epoch": 0.3504118947156922, - "grad_norm": 0.2884039477403369, - "learning_rate": 9.527928421019594e-06, - "loss": 0.9003, - "num_tokens": 90908813.0, + "epoch": 1.0093240093240092, + "grad_norm": 1.2798009950935532, + "learning_rate": 8.004743481030088e-06, + "loss": 0.3592, + "num_tokens": 24569588.0, "step": 109 }, { - "epoch": 0.35362668274060677, - "grad_norm": 0.3005820615872394, - "learning_rate": 9.517455302940388e-06, - "loss": 0.9265, - "num_tokens": 91808335.0, + "epoch": 1.0186480186480187, + "grad_norm": 0.6696134953932688, + "learning_rate": 7.967214943095222e-06, + "loss": 0.3744, + "num_tokens": 24794492.0, "step": 110 }, { - "epoch": 0.3568414707655214, - "grad_norm": 0.33829940710424095, - "learning_rate": 9.506873857273057e-06, - "loss": 0.8822, - "num_tokens": 92562034.0, + "epoch": 1.027972027972028, + "grad_norm": 0.5214295855483705, + "learning_rate": 7.929439435381305e-06, + "loss": 0.3423, + "num_tokens": 25023843.0, "step": 111 }, { - "epoch": 0.360056258790436, - "grad_norm": 0.388165575804309, - "learning_rate": 9.496184369338136e-06, - "loss": 0.9504, - "num_tokens": 93401780.0, + "epoch": 1.0372960372960374, + "grad_norm": 0.5791254604356956, + "learning_rate": 7.891420739240593e-06, + "loss": 0.3585, + "num_tokens": 25255674.0, "step": 112 }, { - "epoch": 0.3632710468153506, - "grad_norm": 0.30828600079194246, - "learning_rate": 9.48538712736943e-06, - "loss": 0.8456, - "num_tokens": 94223464.0, + "epoch": 1.0466200466200466, + "grad_norm": 0.5476766811139614, + "learning_rate": 7.853162660368664e-06, + "loss": 0.3516, + "num_tokens": 25486215.0, "step": 113 }, { - "epoch": 0.3664858348402652, - "grad_norm": 0.32678478073274536, - "learning_rate": 9.474482422506261e-06, - "loss": 0.9946, - "num_tokens": 95045568.0, + "epoch": 1.055944055944056, + "grad_norm": 0.5758914973043039, + "learning_rate": 7.814669028423444e-06, + "loss": 0.3425, + "num_tokens": 25712704.0, "step": 114 }, { - "epoch": 0.36970062286517985, - "grad_norm": 0.3074550684319162, - "learning_rate": 9.46347054878559e-06, - "loss": 0.8969, - "num_tokens": 95900328.0, + "epoch": 1.0652680652680653, + "grad_norm": 0.5760123728030753, + "learning_rate": 7.775943696641889e-06, + "loss": 0.3435, + "num_tokens": 25938915.0, "step": 115 }, { - "epoch": 0.37291541089009445, - "grad_norm": 0.33732267078935047, - "learning_rate": 9.452351803134115e-06, - "loss": 0.9139, - "num_tokens": 96730746.0, + "epoch": 1.0745920745920745, + "grad_norm": 0.576701480722012, + "learning_rate": 7.736990541454244e-06, + "loss": 0.3543, + "num_tokens": 26163159.0, "step": 116 }, { - "epoch": 0.37613019891500904, - "grad_norm": 0.31201329273532136, - "learning_rate": 9.441126485360246e-06, - "loss": 0.8488, - "num_tokens": 97610362.0, + "epoch": 1.083916083916084, + "grad_norm": 0.576650945358382, + "learning_rate": 7.697813462096026e-06, + "loss": 0.3453, + "num_tokens": 26382888.0, "step": 117 }, { - "epoch": 0.37934498693992363, - "grad_norm": 0.3182555002023676, - "learning_rate": 9.429794898146033e-06, - "loss": 0.9738, - "num_tokens": 98486690.0, + "epoch": 1.0932400932400932, + "grad_norm": 0.5276923197878292, + "learning_rate": 7.658416380217698e-06, + "loss": 0.3527, + "num_tokens": 26611322.0, "step": 118 }, { - "epoch": 0.38255977496483823, - "grad_norm": 0.2985517590920871, - "learning_rate": 9.418357347038999e-06, - "loss": 0.9262, - "num_tokens": 99360561.0, + "epoch": 1.1025641025641026, + "grad_norm": 0.5201043693489312, + "learning_rate": 7.618803239492122e-06, + "loss": 0.329, + "num_tokens": 26833150.0, "step": 119 }, { - "epoch": 0.3857745629897529, - "grad_norm": 0.3444165605200549, - "learning_rate": 9.406814140443898e-06, - "loss": 0.9422, - "num_tokens": 100160618.0, + "epoch": 1.1118881118881119, + "grad_norm": 0.5333967888348916, + "learning_rate": 7.57897800521978e-06, + "loss": 0.3533, + "num_tokens": 27061630.0, "step": 120 }, { - "epoch": 0.3889893510146675, - "grad_norm": 0.3090687093250579, - "learning_rate": 9.395165589614409e-06, - "loss": 0.8525, - "num_tokens": 100947113.0, + "epoch": 1.121212121212121, + "grad_norm": 0.5362717013397229, + "learning_rate": 7.538944663931862e-06, + "loss": 0.3577, + "num_tokens": 27285796.0, "step": 121 }, { - "epoch": 0.39220413903958207, - "grad_norm": 0.3634132717775097, - "learning_rate": 9.38341200864473e-06, - "loss": 0.9269, - "num_tokens": 101769806.0, + "epoch": 1.1305361305361306, + "grad_norm": 0.5541137828108857, + "learning_rate": 7.49870722299119e-06, + "loss": 0.352, + "num_tokens": 27517383.0, "step": 122 }, { - "epoch": 0.39541892706449666, - "grad_norm": 0.3200786763228011, - "learning_rate": 9.371553714461124e-06, - "loss": 0.9373, - "num_tokens": 102675009.0, + "epoch": 1.1398601398601398, + "grad_norm": 0.5232896976757799, + "learning_rate": 7.4582697101911015e-06, + "loss": 0.3368, + "num_tokens": 27756017.0, "step": 123 }, { - "epoch": 0.3986337150894113, - "grad_norm": 0.3383864598548103, - "learning_rate": 9.359591026813358e-06, - "loss": 0.8685, - "num_tokens": 103518516.0, + "epoch": 1.1491841491841492, + "grad_norm": 0.5586660395107629, + "learning_rate": 7.417636173352247e-06, + "loss": 0.3512, + "num_tokens": 27988182.0, "step": 124 }, { - "epoch": 0.4018485031143259, - "grad_norm": 0.3308647282436344, - "learning_rate": 9.347524268266092e-06, - "loss": 0.8594, - "num_tokens": 104353765.0, + "epoch": 1.1585081585081585, + "grad_norm": 0.5275698359391946, + "learning_rate": 7.376810679917411e-06, + "loss": 0.3367, + "num_tokens": 28216063.0, "step": 125 }, { - "epoch": 0.4050632911392405, - "grad_norm": 0.35477967488608536, - "learning_rate": 9.335353764190174e-06, - "loss": 0.9134, - "num_tokens": 105168371.0, + "epoch": 1.167832167832168, + "grad_norm": 0.5330631408961283, + "learning_rate": 7.335797316544352e-06, + "loss": 0.3405, + "num_tokens": 28444175.0, "step": 126 }, { - "epoch": 0.4082780791641551, - "grad_norm": 0.321423423959591, - "learning_rate": 9.323079842753876e-06, - "loss": 0.8584, - "num_tokens": 106003766.0, + "epoch": 1.1771561771561772, + "grad_norm": 0.5300170588990584, + "learning_rate": 7.2946001886967336e-06, + "loss": 0.3385, + "num_tokens": 28679208.0, "step": 127 }, { - "epoch": 0.41149286718906974, - "grad_norm": 0.34234527852927943, - "learning_rate": 9.310702834914038e-06, - "loss": 0.889, - "num_tokens": 106841038.0, + "epoch": 1.1864801864801864, + "grad_norm": 0.535737598999273, + "learning_rate": 7.253223420233151e-06, + "loss": 0.3267, + "num_tokens": 28915039.0, "step": 128 }, { - "epoch": 0.41470765521398434, - "grad_norm": 0.34516181476757696, - "learning_rate": 9.29822307440714e-06, - "loss": 0.9519, - "num_tokens": 107659159.0, + "epoch": 1.1958041958041958, + "grad_norm": 0.5305723111392093, + "learning_rate": 7.211671152994348e-06, + "loss": 0.3585, + "num_tokens": 29128873.0, "step": 129 }, { - "epoch": 0.41792244323889893, - "grad_norm": 0.35189021805312765, - "learning_rate": 9.285640897740316e-06, - "loss": 0.967, - "num_tokens": 108585780.0, + "epoch": 1.205128205128205, + "grad_norm": 0.490754554204438, + "learning_rate": 7.169947546388602e-06, + "loss": 0.3285, + "num_tokens": 29357646.0, "step": 130 }, { - "epoch": 0.4211372312638135, - "grad_norm": 0.33186488122192076, - "learning_rate": 9.272956644182267e-06, - "loss": 0.891, - "num_tokens": 109484031.0, + "epoch": 1.2144522144522145, + "grad_norm": 0.5126971866608595, + "learning_rate": 7.12805677697537e-06, + "loss": 0.3452, + "num_tokens": 29588680.0, "step": 131 }, { - "epoch": 0.4243520192887282, - "grad_norm": 0.2873985309573661, - "learning_rate": 9.260170655754123e-06, - "loss": 0.8424, - "num_tokens": 110291460.0, + "epoch": 1.2237762237762237, + "grad_norm": 0.5626164047551704, + "learning_rate": 7.086003038047213e-06, + "loss": 0.3485, + "num_tokens": 29804673.0, "step": 132 }, { - "epoch": 0.42756680731364277, - "grad_norm": 0.3464704529977045, - "learning_rate": 9.247283277220213e-06, - "loss": 0.9191, - "num_tokens": 111094610.0, + "epoch": 1.2331002331002332, + "grad_norm": 0.5161966131055838, + "learning_rate": 7.043790539210045e-06, + "loss": 0.3361, + "num_tokens": 30034232.0, "step": 133 }, { - "epoch": 0.43078159533855737, - "grad_norm": 0.3386965294874816, - "learning_rate": 9.234294856078774e-06, - "loss": 0.8931, - "num_tokens": 111892626.0, + "epoch": 1.2424242424242424, + "grad_norm": 0.5359714333297103, + "learning_rate": 7.001423505961742e-06, + "loss": 0.3465, + "num_tokens": 30256381.0, "step": 134 }, { - "epoch": 0.43399638336347196, - "grad_norm": 0.3152468479446592, - "learning_rate": 9.22120574255258e-06, - "loss": 0.9255, - "num_tokens": 112707646.0, + "epoch": 1.2517482517482517, + "grad_norm": 0.5204156892409427, + "learning_rate": 6.95890617926918e-06, + "loss": 0.3342, + "num_tokens": 30481505.0, "step": 135 }, { - "epoch": 0.43721117138838655, - "grad_norm": 0.3195736989613159, - "learning_rate": 9.208016289579495e-06, - "loss": 0.8758, - "num_tokens": 113527822.0, + "epoch": 1.2610722610722611, + "grad_norm": 0.5228157991201735, + "learning_rate": 6.916242815143697e-06, + "loss": 0.3307, + "num_tokens": 30714212.0, "step": 136 }, { - "epoch": 0.4404259594133012, - "grad_norm": 0.3194701690902287, - "learning_rate": 9.194726852802962e-06, - "loss": 0.9334, - "num_tokens": 114341936.0, + "epoch": 1.2703962703962703, + "grad_norm": 0.5506261937038931, + "learning_rate": 6.873437684215078e-06, + "loss": 0.3478, + "num_tokens": 30933792.0, "step": 137 }, { - "epoch": 0.4436407474382158, - "grad_norm": 0.32150273836540016, - "learning_rate": 9.181337790562407e-06, - "loss": 0.7614, - "num_tokens": 115124756.0, + "epoch": 1.2797202797202798, + "grad_norm": 0.4972019266946924, + "learning_rate": 6.830495071304046e-06, + "loss": 0.3363, + "num_tokens": 31166629.0, "step": 138 }, { - "epoch": 0.4468555354631304, - "grad_norm": 0.33425672075500795, - "learning_rate": 9.16784946388358e-06, - "loss": 0.929, - "num_tokens": 115961832.0, + "epoch": 1.289044289044289, + "grad_norm": 0.5120634677161241, + "learning_rate": 6.787419274993365e-06, + "loss": 0.3363, + "num_tokens": 31399523.0, "step": 139 }, { - "epoch": 0.450070323488045, - "grad_norm": 0.29361887385145263, - "learning_rate": 9.154262236468826e-06, - "loss": 0.8765, - "num_tokens": 116808705.0, + "epoch": 1.2983682983682985, + "grad_norm": 0.49782505687695744, + "learning_rate": 6.744214607197539e-06, + "loss": 0.3354, + "num_tokens": 31636222.0, "step": 140 }, { - "epoch": 0.45328511151295964, - "grad_norm": 0.29842042549715203, - "learning_rate": 9.140576474687263e-06, - "loss": 0.8668, - "num_tokens": 117596498.0, + "epoch": 1.3076923076923077, + "grad_norm": 0.5390261779041863, + "learning_rate": 6.700885392731188e-06, + "loss": 0.3425, + "num_tokens": 31869574.0, "step": 141 }, { - "epoch": 0.45649989953787423, - "grad_norm": 0.3296883611990118, - "learning_rate": 9.126792547564922e-06, - "loss": 0.8515, - "num_tokens": 118390987.0, + "epoch": 1.317016317016317, + "grad_norm": 0.509309660832688, + "learning_rate": 6.657435968876133e-06, + "loss": 0.3602, + "num_tokens": 32089479.0, "step": 142 }, { - "epoch": 0.4597146875627888, - "grad_norm": 0.31150647632686185, - "learning_rate": 9.112910826774778e-06, - "loss": 0.8278, - "num_tokens": 119162382.0, + "epoch": 1.3263403263403264, + "grad_norm": 0.48678955681043784, + "learning_rate": 6.613870684947232e-06, + "loss": 0.3304, + "num_tokens": 32323608.0, "step": 143 }, { - "epoch": 0.4629294755877034, - "grad_norm": 0.33540408530666066, - "learning_rate": 9.098931686626744e-06, - "loss": 0.91, - "num_tokens": 119936224.0, + "epoch": 1.3356643356643356, + "grad_norm": 0.5260358713582577, + "learning_rate": 6.570193901857013e-06, + "loss": 0.3479, + "num_tokens": 32544996.0, "step": 144 }, { - "epoch": 0.46614426361261807, - "grad_norm": 0.3370795977253573, - "learning_rate": 9.084855504057562e-06, - "loss": 0.8773, - "num_tokens": 120686511.0, + "epoch": 1.3449883449883449, + "grad_norm": 0.5197126640451202, + "learning_rate": 6.526409991679134e-06, + "loss": 0.3448, + "num_tokens": 32769167.0, "step": 145 }, { - "epoch": 0.46935905163753266, - "grad_norm": 0.32697044456244534, - "learning_rate": 9.070682658620662e-06, - "loss": 0.9322, - "num_tokens": 121542139.0, + "epoch": 1.3543123543123543, + "grad_norm": 0.5311482925052395, + "learning_rate": 6.482523337210746e-06, + "loss": 0.3442, + "num_tokens": 32992656.0, "step": 146 }, { - "epoch": 0.47257383966244726, - "grad_norm": 0.3122506402407834, - "learning_rate": 9.0564135324759e-06, - "loss": 0.8789, - "num_tokens": 122320053.0, + "epoch": 1.3636363636363638, + "grad_norm": 0.5132102539844359, + "learning_rate": 6.438538331533769e-06, + "loss": 0.3401, + "num_tokens": 33219244.0, "step": 147 }, { - "epoch": 0.47578862768736185, - "grad_norm": 0.3275301400023383, - "learning_rate": 9.042048510379273e-06, - "loss": 0.9306, - "num_tokens": 123168338.0, + "epoch": 1.372960372960373, + "grad_norm": 0.5013721607406705, + "learning_rate": 6.3944593775751395e-06, + "loss": 0.336, + "num_tokens": 33448947.0, "step": 148 }, { - "epoch": 0.47900341571227645, - "grad_norm": 0.3307298586719574, - "learning_rate": 9.027587979672541e-06, - "loss": 0.9568, - "num_tokens": 124021285.0, + "epoch": 1.3822843822843822, + "grad_norm": 0.5043281141721423, + "learning_rate": 6.350290887666078e-06, + "loss": 0.3289, + "num_tokens": 33677944.0, "step": 149 }, { - "epoch": 0.4822182037371911, - "grad_norm": 0.3222274080061541, - "learning_rate": 9.013032330272777e-06, - "loss": 0.8757, - "num_tokens": 124874169.0, + "epoch": 1.3916083916083917, + "grad_norm": 0.5044274541117446, + "learning_rate": 6.306037283100412e-06, + "loss": 0.34, + "num_tokens": 33909419.0, "step": 150 }, { - "epoch": 0.4854329917621057, - "grad_norm": 0.33114181109039703, - "learning_rate": 8.998381954661854e-06, - "loss": 0.8985, - "num_tokens": 125713976.0, + "epoch": 1.400932400932401, + "grad_norm": 0.5210082282628941, + "learning_rate": 6.261702993691994e-06, + "loss": 0.3297, + "num_tokens": 34130758.0, "step": 151 }, { - "epoch": 0.4886477797870203, - "grad_norm": 0.31360733324141654, - "learning_rate": 8.983637247875872e-06, - "loss": 0.8989, - "num_tokens": 126619669.0, + "epoch": 1.4102564102564101, + "grad_norm": 0.5330978559123218, + "learning_rate": 6.217292457331286e-06, + "loss": 0.3459, + "num_tokens": 34354750.0, "step": 152 }, { - "epoch": 0.4918625678119349, - "grad_norm": 0.33257699154027903, - "learning_rate": 8.968798607494489e-06, - "loss": 0.8759, - "num_tokens": 127448351.0, + "epoch": 1.4195804195804196, + "grad_norm": 0.5271964829991576, + "learning_rate": 6.172810119541118e-06, + "loss": 0.3299, + "num_tokens": 34570483.0, + "step": 153 + }, + { + "epoch": 1.4195804195804196, + "eval_loss": 0.4102487564086914, + "eval_num_tokens": 34570483.0, + "eval_runtime": 44.6685, + "eval_samples_per_second": 68.303, + "eval_steps_per_second": 8.552, "step": 153 }, { - "epoch": 0.49507735583684953, - "grad_norm": 0.37198646104493516, - "learning_rate": 8.953866433630216e-06, - "loss": 0.9051, - "num_tokens": 128208302.0, + "epoch": 1.428904428904429, + "grad_norm": 0.5344459993409891, + "learning_rate": 6.128260433031688e-06, + "loss": 0.329, + "num_tokens": 34795100.0, "step": 154 }, { - "epoch": 0.4982921438617641, - "grad_norm": 0.3192818068542491, - "learning_rate": 8.938841128917622e-06, - "loss": 0.8564, - "num_tokens": 129034843.0, + "epoch": 1.4382284382284383, + "grad_norm": 0.5033120854345594, + "learning_rate": 6.083647857254837e-06, + "loss": 0.3389, + "num_tokens": 35019260.0, "step": 155 }, { - "epoch": 0.5015069318866787, - "grad_norm": 0.3592609826251884, - "learning_rate": 8.923723098502475e-06, - "loss": 0.9851, - "num_tokens": 129825111.0, + "epoch": 1.4475524475524475, + "grad_norm": 0.5519614788540163, + "learning_rate": 6.038976857957674e-06, + "loss": 0.3326, + "num_tokens": 35232059.0, "step": 156 }, { - "epoch": 0.5047217199115933, - "grad_norm": 0.34008283178223125, - "learning_rate": 8.908512750030823e-06, - "loss": 0.9246, - "num_tokens": 130711423.0, + "epoch": 1.456876456876457, + "grad_norm": 0.49066707028364026, + "learning_rate": 5.994251906735529e-06, + "loss": 0.318, + "num_tokens": 35452738.0, "step": 157 }, { - "epoch": 0.5079365079365079, - "grad_norm": 0.32873657916310983, - "learning_rate": 8.893210493637997e-06, - "loss": 0.9849, - "num_tokens": 131540140.0, + "epoch": 1.4662004662004662, + "grad_norm": 0.5407716108214161, + "learning_rate": 5.949477480584356e-06, + "loss": 0.3434, + "num_tokens": 35687577.0, "step": 158 }, { - "epoch": 0.5111512959614225, - "grad_norm": 0.38816281593290697, - "learning_rate": 8.877816741937557e-06, - "loss": 0.8654, - "num_tokens": 132358532.0, + "epoch": 1.4755244755244754, + "grad_norm": 0.510940703001653, + "learning_rate": 5.904658061452585e-06, + "loss": 0.3268, + "num_tokens": 35921313.0, "step": 159 }, { - "epoch": 0.5143660839863371, - "grad_norm": 0.3544660654548133, - "learning_rate": 8.86233191001016e-06, - "loss": 0.899, - "num_tokens": 133139458.0, + "epoch": 1.4848484848484849, + "grad_norm": 0.5033518297300301, + "learning_rate": 5.859798135792469e-06, + "loss": 0.3388, + "num_tokens": 36141640.0, "step": 160 }, { - "epoch": 0.5175808720112518, - "grad_norm": 0.3560879964112565, - "learning_rate": 8.846756415392372e-06, - "loss": 0.8814, - "num_tokens": 134001828.0, + "epoch": 1.494172494172494, + "grad_norm": 0.5277378237245701, + "learning_rate": 5.8149021941109886e-06, + "loss": 0.3432, + "num_tokens": 36374008.0, "step": 161 }, { - "epoch": 0.5207956600361664, - "grad_norm": 0.3169268496559368, - "learning_rate": 8.831090678065414e-06, - "loss": 0.9115, - "num_tokens": 134767371.0, + "epoch": 1.5034965034965035, + "grad_norm": 0.5059008621994278, + "learning_rate": 5.769974730520352e-06, + "loss": 0.3332, + "num_tokens": 36596425.0, "step": 162 }, { - "epoch": 0.524010448061081, - "grad_norm": 0.36550764106666367, - "learning_rate": 8.815335120443822e-06, - "loss": 0.9042, - "num_tokens": 135577779.0, + "epoch": 1.5128205128205128, + "grad_norm": 0.5153843221684504, + "learning_rate": 5.725020242288134e-06, + "loss": 0.349, + "num_tokens": 36828175.0, "step": 163 }, { - "epoch": 0.5272252360859956, - "grad_norm": 0.3335129692567876, - "learning_rate": 8.799490167364078e-06, - "loss": 0.9059, - "num_tokens": 136402897.0, + "epoch": 1.5221445221445222, + "grad_norm": 0.49021702361667463, + "learning_rate": 5.680043229387086e-06, + "loss": 0.3336, + "num_tokens": 37063422.0, "step": 164 }, { - "epoch": 0.5304400241109102, - "grad_norm": 0.35711909878553505, - "learning_rate": 8.783556246073135e-06, - "loss": 0.9577, - "num_tokens": 137163591.0, + "epoch": 1.5314685314685315, + "grad_norm": 0.5136418808781229, + "learning_rate": 5.6350481940447025e-06, + "loss": 0.3572, + "num_tokens": 37291951.0, "step": 165 }, { - "epoch": 0.5336548121358248, - "grad_norm": 0.2862081289833857, - "learning_rate": 8.76753378621691e-06, - "loss": 0.6896, - "num_tokens": 137959576.0, + "epoch": 1.5407925407925407, + "grad_norm": 0.5200442031961835, + "learning_rate": 5.590039640292525e-06, + "loss": 0.3333, + "num_tokens": 37522766.0, "step": 166 }, { - "epoch": 0.5368696001607394, - "grad_norm": 0.30807513132168196, - "learning_rate": 8.751423219828696e-06, - "loss": 0.8403, - "num_tokens": 138797654.0, + "epoch": 1.5501165501165501, + "grad_norm": 0.5314710688003101, + "learning_rate": 5.545022073515306e-06, + "loss": 0.3343, + "num_tokens": 37748838.0, "step": 167 }, { - "epoch": 0.540084388185654, - "grad_norm": 0.3564973067565867, - "learning_rate": 8.735224981317502e-06, - "loss": 0.9099, - "num_tokens": 139621700.0, + "epoch": 1.5594405594405596, + "grad_norm": 0.5404598630521976, + "learning_rate": 5.500000000000001e-06, + "loss": 0.3286, + "num_tokens": 37969252.0, "step": 168 }, { - "epoch": 0.5432991762105687, - "grad_norm": 0.3406387053575515, - "learning_rate": 8.718939507456358e-06, - "loss": 0.8667, - "num_tokens": 140419502.0, + "epoch": 1.5687645687645686, + "grad_norm": 0.5248227901946765, + "learning_rate": 5.454977926484696e-06, + "loss": 0.3348, + "num_tokens": 38196458.0, "step": 169 }, { - "epoch": 0.5465139642354833, - "grad_norm": 0.32964271245166193, - "learning_rate": 8.702567237370521e-06, - "loss": 0.8848, - "num_tokens": 141315581.0, + "epoch": 1.578088578088578, + "grad_norm": 0.5333527697636569, + "learning_rate": 5.409960359707476e-06, + "loss": 0.3518, + "num_tokens": 38427692.0, "step": 170 }, { - "epoch": 0.5497287522603979, - "grad_norm": 0.32777806479826777, - "learning_rate": 8.686108612525648e-06, - "loss": 0.8581, - "num_tokens": 142095306.0, + "epoch": 1.5874125874125875, + "grad_norm": 0.5209184747461538, + "learning_rate": 5.3649518059553e-06, + "loss": 0.327, + "num_tokens": 38652447.0, "step": 171 }, { - "epoch": 0.5529435402853125, - "grad_norm": 0.33829242730399295, - "learning_rate": 8.669564076715881e-06, - "loss": 0.7832, - "num_tokens": 142939289.0, + "epoch": 1.5967365967365967, + "grad_norm": 0.5072928544219357, + "learning_rate": 5.319956770612915e-06, + "loss": 0.3249, + "num_tokens": 38882101.0, "step": 172 }, { - "epoch": 0.556158328310227, - "grad_norm": 0.34739576784183324, - "learning_rate": 8.652934076051884e-06, - "loss": 0.8408, - "num_tokens": 143817969.0, + "epoch": 1.606060606060606, + "grad_norm": 0.5186576324632163, + "learning_rate": 5.274979757711868e-06, + "loss": 0.3378, + "num_tokens": 39103980.0, "step": 173 }, { - "epoch": 0.5593731163351416, - "grad_norm": 0.3395314300498932, - "learning_rate": 8.636219058948823e-06, - "loss": 0.9149, - "num_tokens": 144680954.0, + "epoch": 1.6153846153846154, + "grad_norm": 0.5457391949139522, + "learning_rate": 5.230025269479649e-06, + "loss": 0.3495, + "num_tokens": 39327215.0, "step": 174 }, { - "epoch": 0.5625879043600562, - "grad_norm": 0.3063402802877196, - "learning_rate": 8.619419476114251e-06, - "loss": 0.9248, - "num_tokens": 145532678.0, + "epoch": 1.6247086247086249, + "grad_norm": 0.5257683547837523, + "learning_rate": 5.185097805889014e-06, + "loss": 0.324, + "num_tokens": 39543065.0, "step": 175 }, { - "epoch": 0.5658026923849708, - "grad_norm": 0.32790237683570705, - "learning_rate": 8.602535780535987e-06, - "loss": 0.9122, - "num_tokens": 146389287.0, + "epoch": 1.6340326340326339, + "grad_norm": 0.5184260973571639, + "learning_rate": 5.1402018642075336e-06, + "loss": 0.3494, + "num_tokens": 39767880.0, "step": 176 }, { - "epoch": 0.5690174804098854, - "grad_norm": 0.3144730611122465, - "learning_rate": 8.585568427469882e-06, - "loss": 0.8896, - "num_tokens": 147236962.0, + "epoch": 1.6433566433566433, + "grad_norm": 0.5190968681145691, + "learning_rate": 5.095341938547416e-06, + "loss": 0.3211, + "num_tokens": 39995732.0, "step": 177 }, { - "epoch": 0.5722322684348001, - "grad_norm": 0.33145037136611205, - "learning_rate": 8.568517874427535e-06, - "loss": 0.8768, - "num_tokens": 147990396.0, + "epoch": 1.6526806526806528, + "grad_norm": 0.4893090655080212, + "learning_rate": 5.050522519415646e-06, + "loss": 0.3272, + "num_tokens": 40222806.0, "step": 178 }, { - "epoch": 0.5754470564597147, - "grad_norm": 0.31845069665494014, - "learning_rate": 8.551384581163983e-06, - "loss": 0.771, - "num_tokens": 148793930.0, + "epoch": 1.662004662004662, + "grad_norm": 0.4951936701558321, + "learning_rate": 5.005748093264473e-06, + "loss": 0.3265, + "num_tokens": 40450154.0, "step": 179 }, { - "epoch": 0.5786618444846293, - "grad_norm": 0.3166657284585529, - "learning_rate": 8.534169009665282e-06, - "loss": 0.8269, - "num_tokens": 149609271.0, + "epoch": 1.6713286713286712, + "grad_norm": 0.4922175483307558, + "learning_rate": 4.961023142042329e-06, + "loss": 0.3305, + "num_tokens": 40679571.0, "step": 180 }, { - "epoch": 0.5818766325095439, - "grad_norm": 0.2956485070981934, - "learning_rate": 8.516871624136058e-06, - "loss": 0.8999, - "num_tokens": 150479022.0, + "epoch": 1.6806526806526807, + "grad_norm": 0.5225798586624328, + "learning_rate": 4.916352142745163e-06, + "loss": 0.3344, + "num_tokens": 40915005.0, "step": 181 }, { - "epoch": 0.5850914205344585, - "grad_norm": 0.2974221126194666, - "learning_rate": 8.49949289098699e-06, - "loss": 0.7071, - "num_tokens": 151320271.0, + "epoch": 1.68997668997669, + "grad_norm": 0.5228426979014617, + "learning_rate": 4.871739566968315e-06, + "loss": 0.3192, + "num_tokens": 41145279.0, "step": 182 }, { - "epoch": 0.5883062085593731, - "grad_norm": 0.3301439989773727, - "learning_rate": 8.482033278822236e-06, - "loss": 0.9202, - "num_tokens": 152115374.0, + "epoch": 1.6993006993006992, + "grad_norm": 0.5083022419487268, + "learning_rate": 4.8271898804588825e-06, + "loss": 0.3254, + "num_tokens": 41370598.0, "step": 183 }, { - "epoch": 0.5915209965842877, - "grad_norm": 0.33170766358561987, - "learning_rate": 8.464493258426785e-06, - "loss": 0.8821, - "num_tokens": 152924348.0, + "epoch": 1.7086247086247086, + "grad_norm": 0.5201836986924885, + "learning_rate": 4.782707542668715e-06, + "loss": 0.3311, + "num_tokens": 41603620.0, "step": 184 }, { - "epoch": 0.5947357846092023, - "grad_norm": 0.3351466260753276, - "learning_rate": 8.446873302753783e-06, - "loss": 0.8675, - "num_tokens": 153698494.0, + "epoch": 1.717948717948718, + "grad_norm": 0.48151780775578606, + "learning_rate": 4.738297006308008e-06, + "loss": 0.3208, + "num_tokens": 41838817.0, "step": 185 }, { - "epoch": 0.5979505726341169, - "grad_norm": 0.3207278813809398, - "learning_rate": 8.429173886911765e-06, - "loss": 0.8676, - "num_tokens": 154532465.0, + "epoch": 1.7272727272727273, + "grad_norm": 0.5356315914755293, + "learning_rate": 4.6939627168995915e-06, + "loss": 0.3288, + "num_tokens": 42060021.0, "step": 186 }, { - "epoch": 0.6011653606590316, - "grad_norm": 0.32377017847811457, - "learning_rate": 8.411395488151842e-06, - "loss": 0.8454, - "num_tokens": 155323002.0, + "epoch": 1.7365967365967365, + "grad_norm": 0.514889519867123, + "learning_rate": 4.649709112333923e-06, + "loss": 0.328, + "num_tokens": 42284510.0, "step": 187 }, { - "epoch": 0.6043801486839462, - "grad_norm": 0.3301964081819988, - "learning_rate": 8.393538585854848e-06, - "loss": 0.8586, - "num_tokens": 156112815.0, + "epoch": 1.745920745920746, + "grad_norm": 0.4724010610876581, + "learning_rate": 4.605540622424862e-06, + "loss": 0.3287, + "num_tokens": 42518962.0, "step": 188 }, { - "epoch": 0.6075949367088608, - "grad_norm": 0.30558428223830236, - "learning_rate": 8.375603661518401e-06, - "loss": 0.8794, - "num_tokens": 157026982.0, + "epoch": 1.7552447552447552, + "grad_norm": 0.49538080834069576, + "learning_rate": 4.561461668466233e-06, + "loss": 0.3316, + "num_tokens": 42744627.0, "step": 189 }, { - "epoch": 0.6108097247337754, - "grad_norm": 0.344723262215173, - "learning_rate": 8.357591198743923e-06, - "loss": 0.8404, - "num_tokens": 157811261.0, + "epoch": 1.7645687645687644, + "grad_norm": 0.5024879109051644, + "learning_rate": 4.517476662789257e-06, + "loss": 0.3219, + "num_tokens": 42968537.0, "step": 190 }, { - "epoch": 0.61402451275869, - "grad_norm": 0.3892249539503774, - "learning_rate": 8.339501683223599e-06, - "loss": 0.9379, - "num_tokens": 158634889.0, + "epoch": 1.7738927738927739, + "grad_norm": 0.4886882856604164, + "learning_rate": 4.473590008320868e-06, + "loss": 0.3311, + "num_tokens": 43186024.0, "step": 191 }, { - "epoch": 0.6172393007836046, - "grad_norm": 0.40778600299450035, - "learning_rate": 8.321335602727283e-06, - "loss": 0.9513, - "num_tokens": 159433565.0, + "epoch": 1.7832167832167833, + "grad_norm": 0.5277124654943276, + "learning_rate": 4.429806098142989e-06, + "loss": 0.3259, + "num_tokens": 43406230.0, "step": 192 }, { - "epoch": 0.6204540888085192, - "grad_norm": 0.3077472501112009, - "learning_rate": 8.303093447089346e-06, - "loss": 0.8628, - "num_tokens": 160311360.0, + "epoch": 1.7925407925407926, + "grad_norm": 0.49430765763937234, + "learning_rate": 4.386129315052768e-06, + "loss": 0.3263, + "num_tokens": 43632452.0, "step": 193 }, { - "epoch": 0.6236688768334337, - "grad_norm": 0.3483135339241395, - "learning_rate": 8.284775708195462e-06, - "loss": 0.9275, - "num_tokens": 161169144.0, + "epoch": 1.8018648018648018, + "grad_norm": 0.4916236071607527, + "learning_rate": 4.3425640311238695e-06, + "loss": 0.3281, + "num_tokens": 43860663.0, "step": 194 }, { - "epoch": 0.6268836648583485, - "grad_norm": 0.28509163051039826, - "learning_rate": 8.266382879969356e-06, - "loss": 0.9117, - "num_tokens": 162067745.0, + "epoch": 1.8111888111888113, + "grad_norm": 0.5515259138593007, + "learning_rate": 4.299114607268814e-06, + "loss": 0.3434, + "num_tokens": 44085405.0, "step": 195 }, { - "epoch": 0.630098452883263, - "grad_norm": 0.35298749290163184, - "learning_rate": 8.247915458359473e-06, - "loss": 0.7792, - "num_tokens": 162943979.0, + "epoch": 1.8205128205128205, + "grad_norm": 0.4896559926348576, + "learning_rate": 4.255785392802464e-06, + "loss": 0.3309, + "num_tokens": 44311745.0, "step": 196 }, { - "epoch": 0.6333132409081776, - "grad_norm": 0.3791342196243438, - "learning_rate": 8.229373941325616e-06, - "loss": 0.9286, - "num_tokens": 163719744.0, + "epoch": 1.8298368298368297, + "grad_norm": 0.4748622553446698, + "learning_rate": 4.212580725006635e-06, + "loss": 0.3122, + "num_tokens": 44541583.0, "step": 197 }, { - "epoch": 0.6365280289330922, - "grad_norm": 0.35334178330548377, - "learning_rate": 8.210758828825508e-06, - "loss": 0.9461, - "num_tokens": 164616226.0, + "epoch": 1.8391608391608392, + "grad_norm": 0.5235696028853547, + "learning_rate": 4.169504928695956e-06, + "loss": 0.3343, + "num_tokens": 44758719.0, "step": 198 }, { - "epoch": 0.6397428169580068, - "grad_norm": 0.35384850999679074, - "learning_rate": 8.192070622801326e-06, - "loss": 0.9685, - "num_tokens": 165473490.0, + "epoch": 1.8484848484848486, + "grad_norm": 0.4799565738729803, + "learning_rate": 4.126562315784924e-06, + "loss": 0.3335, + "num_tokens": 44984400.0, "step": 199 }, { - "epoch": 0.6429576049829214, - "grad_norm": 0.28922867998230706, - "learning_rate": 8.17330982716615e-06, - "loss": 0.8125, - "num_tokens": 166332405.0, + "epoch": 1.8578088578088578, + "grad_norm": 0.47269486609129163, + "learning_rate": 4.083757184856304e-06, + "loss": 0.3316, + "num_tokens": 45202481.0, "step": 200 }, { - "epoch": 0.646172393007836, - "grad_norm": 0.37006397791361156, - "learning_rate": 8.154476947790383e-06, - "loss": 0.9404, - "num_tokens": 167147449.0, + "epoch": 1.867132867132867, + "grad_norm": 0.5004916428890924, + "learning_rate": 4.041093820730821e-06, + "loss": 0.3305, + "num_tokens": 45424342.0, "step": 201 }, { - "epoch": 0.6493871810327506, - "grad_norm": 0.26503009165389946, - "learning_rate": 8.135572492488114e-06, - "loss": 0.7629, - "num_tokens": 168013738.0, + "epoch": 1.8764568764568765, + "grad_norm": 0.49094397631517767, + "learning_rate": 3.99857649403826e-06, + "loss": 0.3252, + "num_tokens": 45646541.0, "step": 202 }, { - "epoch": 0.6526019690576652, - "grad_norm": 0.3381608286411406, - "learning_rate": 8.116596971003422e-06, - "loss": 0.8069, - "num_tokens": 168805606.0, + "epoch": 1.8857808857808858, + "grad_norm": 0.48010716713043344, + "learning_rate": 3.956209460789957e-06, + "loss": 0.3201, + "num_tokens": 45865887.0, "step": 203 }, { - "epoch": 0.6558167570825799, - "grad_norm": 0.38031100642269483, - "learning_rate": 8.097550894996632e-06, - "loss": 0.8917, - "num_tokens": 169661272.0, + "epoch": 1.895104895104895, + "grad_norm": 0.5003078282639417, + "learning_rate": 3.913996961952789e-06, + "loss": 0.3345, + "num_tokens": 46090891.0, + "step": 204 + }, + { + "epoch": 1.895104895104895, + "eval_loss": 0.3945937752723694, + "eval_num_tokens": 46090891.0, + "eval_runtime": 44.9767, + "eval_samples_per_second": 67.835, + "eval_steps_per_second": 8.493, "step": 204 }, { - "epoch": 0.6590315451074945, - "grad_norm": 0.30146809296785787, - "learning_rate": 8.078434778030511e-06, - "loss": 0.7977, - "num_tokens": 170563774.0, + "epoch": 1.9044289044289044, + "grad_norm": 0.5030448753628592, + "learning_rate": 3.871943223024632e-06, + "loss": 0.3273, + "num_tokens": 46333399.0, "step": 205 }, { - "epoch": 0.6622463331324091, - "grad_norm": 0.37991768275367294, - "learning_rate": 8.05924913555643e-06, - "loss": 0.8179, - "num_tokens": 171338738.0, + "epoch": 1.913752913752914, + "grad_norm": 0.4936076223940164, + "learning_rate": 3.8300524536114004e-06, + "loss": 0.3376, + "num_tokens": 46551999.0, "step": 206 }, { - "epoch": 0.6654611211573237, - "grad_norm": 0.2849543425380391, - "learning_rate": 8.039994484900463e-06, - "loss": 0.8189, - "num_tokens": 172219480.0, + "epoch": 1.9230769230769231, + "grad_norm": 0.46513542076706943, + "learning_rate": 3.7883288470056543e-06, + "loss": 0.3184, + "num_tokens": 46782533.0, "step": 207 }, { - "epoch": 0.6686759091822383, - "grad_norm": 0.3273036502963336, - "learning_rate": 8.020671345249435e-06, - "loss": 0.8182, - "num_tokens": 173109293.0, + "epoch": 1.9324009324009324, + "grad_norm": 0.5034814125152446, + "learning_rate": 3.746776579766851e-06, + "loss": 0.3301, + "num_tokens": 47010120.0, "step": 208 }, { - "epoch": 0.6718906972071529, - "grad_norm": 0.32889391343217705, - "learning_rate": 8.001280237636926e-06, - "loss": 0.8277, - "num_tokens": 173947806.0, + "epoch": 1.9417249417249418, + "grad_norm": 0.5131605114056567, + "learning_rate": 3.7053998113032695e-06, + "loss": 0.3218, + "num_tokens": 47236233.0, "step": 209 }, { - "epoch": 0.6751054852320675, - "grad_norm": 0.30290794267040716, - "learning_rate": 7.981821684929218e-06, - "loss": 0.8817, - "num_tokens": 174782714.0, + "epoch": 1.951048951048951, + "grad_norm": 0.47485454111075154, + "learning_rate": 3.6642026834556488e-06, + "loss": 0.3238, + "num_tokens": 47461465.0, "step": 210 }, { - "epoch": 0.6783202732569821, - "grad_norm": 0.33781385572391937, - "learning_rate": 7.962296211811197e-06, - "loss": 0.8541, - "num_tokens": 175598688.0, + "epoch": 1.9603729603729603, + "grad_norm": 0.5481840012126614, + "learning_rate": 3.6231893200825917e-06, + "loss": 0.3354, + "num_tokens": 47679615.0, "step": 211 }, { - "epoch": 0.6815350612818967, - "grad_norm": 0.31043268498077026, - "learning_rate": 7.942704344772212e-06, - "loss": 0.8446, - "num_tokens": 176421999.0, + "epoch": 1.9696969696969697, + "grad_norm": 0.5149177715727978, + "learning_rate": 3.582363826647756e-06, + "loss": 0.3281, + "num_tokens": 47906581.0, "step": 212 }, { - "epoch": 0.6847498493068114, - "grad_norm": 0.3106756258118995, - "learning_rate": 7.923046612091864e-06, - "loss": 0.8417, - "num_tokens": 177231214.0, + "epoch": 1.9790209790209792, + "grad_norm": 0.488341341221377, + "learning_rate": 3.5417302898089e-06, + "loss": 0.3084, + "num_tokens": 48135155.0, "step": 213 }, { - "epoch": 0.687964637331726, - "grad_norm": 0.3455688936932671, - "learning_rate": 7.903323543825776e-06, - "loss": 0.8823, - "num_tokens": 178047310.0, + "epoch": 1.9883449883449882, + "grad_norm": 0.4755923848180776, + "learning_rate": 3.501292777008811e-06, + "loss": 0.3251, + "num_tokens": 48356662.0, "step": 214 }, { - "epoch": 0.6911794253566406, - "grad_norm": 0.32419064722798224, - "learning_rate": 7.883535671791294e-06, - "loss": 0.9333, - "num_tokens": 178857355.0, + "epoch": 1.9976689976689976, + "grad_norm": 0.4928939994589957, + "learning_rate": 3.461055336068141e-06, + "loss": 0.319, + "num_tokens": 48592154.0, "step": 215 }, { - "epoch": 0.6943942133815552, - "grad_norm": 0.31666623739149313, - "learning_rate": 7.863683529553148e-06, - "loss": 0.8679, - "num_tokens": 179701817.0, + "epoch": 2.0, + "grad_norm": 0.4928939994589957, + "learning_rate": 3.4210219947802214e-06, + "loss": 0.3182, + "num_tokens": 48651124.0, "step": 216 }, { - "epoch": 0.6976090014064698, - "grad_norm": 0.2857763487302272, - "learning_rate": 7.843767652409063e-06, - "loss": 0.8154, - "num_tokens": 180509118.0, + "epoch": 2.0093240093240095, + "grad_norm": 1.0077926731304525, + "learning_rate": 3.38119676050788e-06, + "loss": 0.2658, + "num_tokens": 48884071.0, "step": 217 }, { - "epoch": 0.7008237894313843, - "grad_norm": 0.290558414238153, - "learning_rate": 7.82378857737533e-06, - "loss": 0.8237, - "num_tokens": 181345716.0, + "epoch": 2.0186480186480185, + "grad_norm": 0.536857443533564, + "learning_rate": 3.341583619782304e-06, + "loss": 0.2712, + "num_tokens": 49121191.0, "step": 218 }, { - "epoch": 0.7040385774562989, - "grad_norm": 0.32249354070765945, - "learning_rate": 7.803746843172315e-06, - "loss": 0.8869, - "num_tokens": 182163997.0, + "epoch": 2.027972027972028, + "grad_norm": 0.4931471166108591, + "learning_rate": 3.3021865379039765e-06, + "loss": 0.265, + "num_tokens": 49336073.0, "step": 219 }, { - "epoch": 0.7072533654812135, - "grad_norm": 0.3858415339157623, - "learning_rate": 7.783642990209951e-06, - "loss": 0.8885, - "num_tokens": 182919306.0, + "epoch": 2.0372960372960374, + "grad_norm": 0.4722559359039849, + "learning_rate": 3.2630094585457583e-06, + "loss": 0.2754, + "num_tokens": 49557586.0, "step": 220 }, { - "epoch": 0.7104681535061282, - "grad_norm": 0.29093268616759727, - "learning_rate": 7.763477560573147e-06, - "loss": 0.8844, - "num_tokens": 183787300.0, + "epoch": 2.046620046620047, + "grad_norm": 0.45498374408979997, + "learning_rate": 3.2240563033581117e-06, + "loss": 0.2676, + "num_tokens": 49783374.0, "step": 221 }, { - "epoch": 0.7136829415310428, - "grad_norm": 0.33440898051637297, - "learning_rate": 7.743251098007182e-06, - "loss": 0.9026, - "num_tokens": 184598822.0, + "epoch": 2.055944055944056, + "grad_norm": 0.443089196299474, + "learning_rate": 3.1853309715765567e-06, + "loss": 0.2628, + "num_tokens": 50015143.0, "step": 222 }, { - "epoch": 0.7168977295559574, - "grad_norm": 0.3445014289155674, - "learning_rate": 7.72296414790304e-06, - "loss": 0.9325, - "num_tokens": 185406155.0, + "epoch": 2.0652680652680653, + "grad_norm": 0.4593634390850791, + "learning_rate": 3.14683733963134e-06, + "loss": 0.2724, + "num_tokens": 50248084.0, "step": 223 }, { - "epoch": 0.720112517580872, - "grad_norm": 0.29287073604860037, - "learning_rate": 7.702617257282712e-06, - "loss": 0.7298, - "num_tokens": 186253978.0, + "epoch": 2.0745920745920747, + "grad_norm": 0.4968173973247625, + "learning_rate": 3.108579260759409e-06, + "loss": 0.2718, + "num_tokens": 50466092.0, "step": 224 }, { - "epoch": 0.7233273056057866, - "grad_norm": 0.39430801798012816, - "learning_rate": 7.682210974784426e-06, - "loss": 0.9014, - "num_tokens": 187090941.0, + "epoch": 2.0839160839160837, + "grad_norm": 0.48173251591172095, + "learning_rate": 3.0705605646186966e-06, + "loss": 0.2705, + "num_tokens": 50700030.0, "step": 225 }, { - "epoch": 0.7265420936307012, - "grad_norm": 0.29969786644842994, - "learning_rate": 7.661745850647877e-06, - "loss": 0.9243, - "num_tokens": 187943635.0, + "epoch": 2.093240093240093, + "grad_norm": 0.47536341261873954, + "learning_rate": 3.0327850569047803e-06, + "loss": 0.2754, + "num_tokens": 50926029.0, "step": 226 }, { - "epoch": 0.7297568816556158, - "grad_norm": 0.3062505801174923, - "learning_rate": 7.641222436699377e-06, - "loss": 0.8008, - "num_tokens": 188739652.0, + "epoch": 2.1025641025641026, + "grad_norm": 0.48555103105685965, + "learning_rate": 2.995256518969914e-06, + "loss": 0.2665, + "num_tokens": 51151982.0, "step": 227 }, { - "epoch": 0.7329716696805304, - "grad_norm": 0.327288677089715, - "learning_rate": 7.6206412863369704e-06, - "loss": 0.898, - "num_tokens": 189473807.0, + "epoch": 2.111888111888112, + "grad_norm": 0.44975846542574716, + "learning_rate": 2.9579787074445244e-06, + "loss": 0.2662, + "num_tokens": 51365485.0, "step": 228 }, { - "epoch": 0.736186457705445, - "grad_norm": 0.3229456954085576, - "learning_rate": 7.600002954515532e-06, - "loss": 0.9412, - "num_tokens": 190318599.0, + "epoch": 2.121212121212121, + "grad_norm": 0.44850869772061025, + "learning_rate": 2.9209553538611634e-06, + "loss": 0.269, + "num_tokens": 51594394.0, "step": 229 }, { - "epoch": 0.7394012457303597, - "grad_norm": 0.2988851973450552, - "learning_rate": 7.579307997731783e-06, - "loss": 0.8377, - "num_tokens": 191182296.0, + "epoch": 2.1305361305361306, + "grad_norm": 0.4388553942308307, + "learning_rate": 2.8841901642809843e-06, + "loss": 0.2691, + "num_tokens": 51823497.0, "step": 230 }, { - "epoch": 0.7426160337552743, - "grad_norm": 0.2874083850336822, - "learning_rate": 7.558556974009294e-06, - "loss": 0.8527, - "num_tokens": 192005140.0, + "epoch": 2.13986013986014, + "grad_norm": 0.433932495158785, + "learning_rate": 2.8476868189227603e-06, + "loss": 0.267, + "num_tokens": 52059777.0, "step": 231 }, { - "epoch": 0.7458308217801889, - "grad_norm": 0.28049026034864727, - "learning_rate": 7.537750442883437e-06, - "loss": 0.844, - "num_tokens": 192820565.0, + "epoch": 2.149184149184149, + "grad_norm": 0.4570113864425228, + "learning_rate": 2.811448971794487e-06, + "loss": 0.2628, + "num_tokens": 52291004.0, "step": 232 }, { - "epoch": 0.7490456098051035, - "grad_norm": 0.28337752398616894, - "learning_rate": 7.5168889653863e-06, - "loss": 0.7871, - "num_tokens": 193684911.0, + "epoch": 2.1585081585081585, + "grad_norm": 0.4454627004878239, + "learning_rate": 2.7754802503276235e-06, + "loss": 0.2661, + "num_tokens": 52510106.0, "step": 233 }, { - "epoch": 0.7522603978300181, - "grad_norm": 0.31751723853872776, - "learning_rate": 7.495973104031558e-06, - "loss": 0.8705, - "num_tokens": 194488942.0, + "epoch": 2.167832167832168, + "grad_norm": 0.4266944564191006, + "learning_rate": 2.7397842550139813e-06, + "loss": 0.2762, + "num_tokens": 52740260.0, "step": 234 }, { - "epoch": 0.7554751858549327, - "grad_norm": 0.2818519811862278, - "learning_rate": 7.475003422799302e-06, - "loss": 0.7078, - "num_tokens": 195301487.0, + "epoch": 2.177156177156177, + "grad_norm": 0.4828957968087209, + "learning_rate": 2.7043645590453067e-06, + "loss": 0.2713, + "num_tokens": 52965642.0, "step": 235 }, { - "epoch": 0.7586899738798473, - "grad_norm": 0.2761706034040925, - "learning_rate": 7.453980487120835e-06, - "loss": 0.8615, - "num_tokens": 196175699.0, + "epoch": 2.1864801864801864, + "grad_norm": 0.4377873737923911, + "learning_rate": 2.669224707955608e-06, + "loss": 0.2592, + "num_tokens": 53204133.0, "step": 236 }, { - "epoch": 0.7619047619047619, - "grad_norm": 0.28243630620576576, - "learning_rate": 7.432904863863431e-06, - "loss": 0.9808, - "num_tokens": 197025302.0, + "epoch": 2.195804195804196, + "grad_norm": 0.4356864024809861, + "learning_rate": 2.6343682192662434e-06, + "loss": 0.2638, + "num_tokens": 53442257.0, "step": 237 }, { - "epoch": 0.7651195499296765, - "grad_norm": 0.2888524553949164, - "learning_rate": 7.411777121315035e-06, - "loss": 0.7977, - "num_tokens": 197838734.0, + "epoch": 2.2051282051282053, + "grad_norm": 0.44278157572380683, + "learning_rate": 2.5997985821338183e-06, + "loss": 0.2693, + "num_tokens": 53668484.0, "step": 238 }, { - "epoch": 0.7683343379545912, - "grad_norm": 0.2963619165027587, - "learning_rate": 7.390597829168953e-06, - "loss": 0.8945, - "num_tokens": 198668326.0, + "epoch": 2.2144522144522143, + "grad_norm": 0.4620625252037188, + "learning_rate": 2.5655192570009124e-06, + "loss": 0.2758, + "num_tokens": 53888930.0, "step": 239 }, { - "epoch": 0.7715491259795058, - "grad_norm": 0.3261034728364785, - "learning_rate": 7.36936755850849e-06, - "loss": 0.9067, - "num_tokens": 199465622.0, + "epoch": 2.2237762237762237, + "grad_norm": 0.45332648933552794, + "learning_rate": 2.531533675249691e-06, + "loss": 0.2593, + "num_tokens": 54104360.0, "step": 240 }, { - "epoch": 0.7747639140044204, - "grad_norm": 0.2897864871860178, - "learning_rate": 7.348086881791539e-06, - "loss": 0.8255, - "num_tokens": 200335191.0, + "epoch": 2.233100233100233, + "grad_norm": 0.438761293853775, + "learning_rate": 2.4978452388584192e-06, + "loss": 0.2653, + "num_tokens": 54344051.0, "step": 241 }, { - "epoch": 0.777978702029335, - "grad_norm": 0.3165574100318601, - "learning_rate": 7.3267563728351605e-06, - "loss": 0.8945, - "num_tokens": 201088412.0, + "epoch": 2.242424242424242, + "grad_norm": 0.42897612399239216, + "learning_rate": 2.464457320060929e-06, + "loss": 0.2567, + "num_tokens": 54575177.0, "step": 242 }, { - "epoch": 0.7811934900542495, - "grad_norm": 0.2823042361212196, - "learning_rate": 7.305376606800101e-06, - "loss": 0.7548, - "num_tokens": 201956278.0, + "epoch": 2.2517482517482517, + "grad_norm": 0.45297972683885784, + "learning_rate": 2.4313732610090438e-06, + "loss": 0.268, + "num_tokens": 54806777.0, "step": 243 }, { - "epoch": 0.7844082780791641, - "grad_norm": 0.28833163746728885, - "learning_rate": 7.2839481601752825e-06, - "loss": 0.8268, - "num_tokens": 202809315.0, + "epoch": 2.261072261072261, + "grad_norm": 0.4411649453120093, + "learning_rate": 2.398596373438038e-06, + "loss": 0.2583, + "num_tokens": 55037822.0, "step": 244 }, { - "epoch": 0.7876230661040787, - "grad_norm": 0.29376467474453444, - "learning_rate": 7.2624716107622675e-06, - "loss": 0.8253, - "num_tokens": 203672135.0, + "epoch": 2.2703962703962706, + "grad_norm": 0.4345572710831335, + "learning_rate": 2.366129938335123e-06, + "loss": 0.2629, + "num_tokens": 55265922.0, "step": 245 }, { - "epoch": 0.7908378541289933, - "grad_norm": 0.32038814580048, - "learning_rate": 7.24094753765967e-06, - "loss": 0.8887, - "num_tokens": 204487150.0, + "epoch": 2.2797202797202796, + "grad_norm": 0.42869651807401005, + "learning_rate": 2.3339772056110278e-06, + "loss": 0.2793, + "num_tokens": 55492290.0, "step": 246 }, { - "epoch": 0.794052642153908, - "grad_norm": 0.27214835334751064, - "learning_rate": 7.219376521247541e-06, - "loss": 0.8535, - "num_tokens": 205370971.0, + "epoch": 2.289044289044289, + "grad_norm": 0.4420299559105195, + "learning_rate": 2.302141393774666e-06, + "loss": 0.2615, + "num_tokens": 55725914.0, "step": 247 }, { - "epoch": 0.7972674301788226, - "grad_norm": 0.3002279544094117, - "learning_rate": 7.197759143171728e-06, - "loss": 0.9061, - "num_tokens": 206188591.0, + "epoch": 2.2983682983682985, + "grad_norm": 0.4321678479969795, + "learning_rate": 2.2706256896109774e-06, + "loss": 0.2716, + "num_tokens": 55951887.0, "step": 248 }, { - "epoch": 0.8004822182037372, - "grad_norm": 0.2909079731328829, - "learning_rate": 7.176095986328181e-06, - "loss": 0.8594, - "num_tokens": 207059837.0, + "epoch": 2.3076923076923075, + "grad_norm": 0.441066204708253, + "learning_rate": 2.239433247861915e-06, + "loss": 0.259, + "num_tokens": 56172209.0, "step": 249 }, { - "epoch": 0.8036970062286518, - "grad_norm": 0.29787302805929866, - "learning_rate": 7.154387634847241e-06, - "loss": 0.8447, - "num_tokens": 207937966.0, + "epoch": 2.317016317016317, + "grad_norm": 0.43938915815345186, + "learning_rate": 2.208567190910663e-06, + "loss": 0.2687, + "num_tokens": 56389635.0, "step": 250 }, { - "epoch": 0.8069117942535664, - "grad_norm": 0.28215441474127373, - "learning_rate": 7.132634674077884e-06, - "loss": 0.8734, - "num_tokens": 208785297.0, + "epoch": 2.3263403263403264, + "grad_norm": 0.4540274939571687, + "learning_rate": 2.1780306084690794e-06, + "loss": 0.2661, + "num_tokens": 56609219.0, "step": 251 }, { - "epoch": 0.810126582278481, - "grad_norm": 0.29733898843701784, - "learning_rate": 7.110837690571947e-06, - "loss": 0.8839, - "num_tokens": 209650153.0, + "epoch": 2.335664335664336, + "grad_norm": 0.45455353071523924, + "learning_rate": 2.1478265572684142e-06, + "loss": 0.2647, + "num_tokens": 56831374.0, "step": 252 }, { - "epoch": 0.8133413703033956, - "grad_norm": 0.3279464405497076, - "learning_rate": 7.0889972720683e-06, - "loss": 0.8464, - "num_tokens": 210450707.0, + "epoch": 2.344988344988345, + "grad_norm": 0.4143941270620651, + "learning_rate": 2.1179580607533284e-06, + "loss": 0.2608, + "num_tokens": 57060957.0, "step": 253 }, { - "epoch": 0.8165561583283102, - "grad_norm": 0.30433136921138526, - "learning_rate": 7.067114007477009e-06, - "loss": 0.9101, - "num_tokens": 211221469.0, + "epoch": 2.3543123543123543, + "grad_norm": 0.4388946585859175, + "learning_rate": 2.088428108779251e-06, + "loss": 0.2678, + "num_tokens": 57297316.0, "step": 254 }, { - "epoch": 0.8197709463532248, - "grad_norm": 0.28931341627132034, - "learning_rate": 7.045188486863449e-06, - "loss": 0.7987, - "num_tokens": 212024751.0, + "epoch": 2.3636363636363638, + "grad_norm": 0.4171222019484084, + "learning_rate": 2.059239657313084e-06, + "loss": 0.2686, + "num_tokens": 57528400.0, + "step": 255 + }, + { + "epoch": 2.3636363636363638, + "eval_loss": 0.4000723958015442, + "eval_num_tokens": 57528400.0, + "eval_runtime": 45.0359, + "eval_samples_per_second": 67.746, + "eval_steps_per_second": 8.482, "step": 255 }, { - "epoch": 0.8229857343781395, - "grad_norm": 0.2722844305631139, - "learning_rate": 7.023221301432397e-06, - "loss": 0.7881, - "num_tokens": 212906392.0, + "epoch": 2.3729603729603728, + "grad_norm": 0.42332245702440635, + "learning_rate": 2.0303956281373132e-06, + "loss": 0.2688, + "num_tokens": 57762038.0, "step": 256 }, { - "epoch": 0.8262005224030541, - "grad_norm": 0.3420065759680841, - "learning_rate": 7.00121304351209e-06, - "loss": 0.9015, - "num_tokens": 213683660.0, + "epoch": 2.382284382284382, + "grad_norm": 0.4274471409323921, + "learning_rate": 2.001898908557533e-06, + "loss": 0.2671, + "num_tokens": 57993540.0, "step": 257 }, { - "epoch": 0.8294153104279687, - "grad_norm": 0.3206190389503571, - "learning_rate": 6.979164306538252e-06, - "loss": 0.8941, - "num_tokens": 214533793.0, + "epoch": 2.3916083916083917, + "grad_norm": 0.44507562857654975, + "learning_rate": 1.9737523511134322e-06, + "loss": 0.2643, + "num_tokens": 58216266.0, "step": 258 }, { - "epoch": 0.8326300984528833, - "grad_norm": 0.3055841437669692, - "learning_rate": 6.957075685038094e-06, - "loss": 0.8125, - "num_tokens": 215274343.0, + "epoch": 2.400932400932401, + "grad_norm": 0.4466201275079433, + "learning_rate": 1.9459587732932427e-06, + "loss": 0.2621, + "num_tokens": 58449705.0, "step": 259 }, { - "epoch": 0.8358448864777979, - "grad_norm": 0.3341643753455828, - "learning_rate": 6.9349477746142846e-06, - "loss": 0.9064, - "num_tokens": 216106096.0, + "epoch": 2.41025641025641, + "grad_norm": 0.40869783871580784, + "learning_rate": 1.918520957251716e-06, + "loss": 0.2647, + "num_tokens": 58680399.0, "step": 260 }, { - "epoch": 0.8390596745027125, - "grad_norm": 0.28585103983713367, - "learning_rate": 6.912781171928884e-06, - "loss": 0.8646, - "num_tokens": 216981264.0, + "epoch": 2.4195804195804196, + "grad_norm": 0.4537001488141776, + "learning_rate": 1.8914416495316201e-06, + "loss": 0.2621, + "num_tokens": 58906596.0, "step": 261 }, { - "epoch": 0.842274462527627, - "grad_norm": 0.34110071465678155, - "learning_rate": 6.890576474687264e-06, - "loss": 0.8987, - "num_tokens": 217787433.0, + "epoch": 2.428904428904429, + "grad_norm": 0.4337917513928828, + "learning_rate": 1.8647235607888192e-06, + "loss": 0.2598, + "num_tokens": 59141325.0, "step": 262 }, { - "epoch": 0.8454892505525416, - "grad_norm": 0.28947459223036504, - "learning_rate": 6.868334281621983e-06, - "loss": 0.9221, - "num_tokens": 218688233.0, + "epoch": 2.438228438228438, + "grad_norm": 0.41191514199500184, + "learning_rate": 1.8383693655209223e-06, + "loss": 0.2684, + "num_tokens": 59368701.0, "step": 263 }, { - "epoch": 0.8487040385774564, - "grad_norm": 0.2837683679653108, - "learning_rate": 6.846055192476646e-06, - "loss": 0.7313, - "num_tokens": 219547242.0, + "epoch": 2.4475524475524475, + "grad_norm": 0.4342780664561112, + "learning_rate": 1.8123817017995754e-06, + "loss": 0.2628, + "num_tokens": 59597922.0, "step": 264 }, { - "epoch": 0.851918826602371, - "grad_norm": 0.28398009309233047, - "learning_rate": 6.823739807989734e-06, - "loss": 0.851, - "num_tokens": 220433013.0, + "epoch": 2.456876456876457, + "grad_norm": 0.42508370862489697, + "learning_rate": 1.7867631710063814e-06, + "loss": 0.2797, + "num_tokens": 59816592.0, "step": 265 }, { - "epoch": 0.8551336146272855, - "grad_norm": 0.29371486394777635, - "learning_rate": 6.8013887298784035e-06, - "loss": 0.9102, - "num_tokens": 221327336.0, + "epoch": 2.4662004662004664, + "grad_norm": 0.43487947849731023, + "learning_rate": 1.7615163375725069e-06, + "loss": 0.2713, + "num_tokens": 60036267.0, "step": 266 }, { - "epoch": 0.8583484026522001, - "grad_norm": 0.3172199224731009, - "learning_rate": 6.779002560822261e-06, - "loss": 0.8892, - "num_tokens": 222176464.0, + "epoch": 2.4755244755244754, + "grad_norm": 0.4317316527617555, + "learning_rate": 1.7366437287219745e-06, + "loss": 0.2741, + "num_tokens": 60267834.0, "step": 267 }, { - "epoch": 0.8615631906771147, - "grad_norm": 0.27792773622583583, - "learning_rate": 6.756581904447113e-06, - "loss": 0.7646, - "num_tokens": 223016527.0, + "epoch": 2.484848484848485, + "grad_norm": 0.43452295527728907, + "learning_rate": 1.7121478342186893e-06, + "loss": 0.2621, + "num_tokens": 60493778.0, "step": 268 }, { - "epoch": 0.8647779787020293, - "grad_norm": 0.2789104323015457, - "learning_rate": 6.734127365308695e-06, - "loss": 0.665, - "num_tokens": 223811029.0, + "epoch": 2.4941724941724943, + "grad_norm": 0.43615413703439787, + "learning_rate": 1.6880311061172105e-06, + "loss": 0.2699, + "num_tokens": 60714160.0, "step": 269 }, { - "epoch": 0.8679927667269439, - "grad_norm": 0.2785680982095492, - "learning_rate": 6.7116395488763565e-06, - "loss": 0.7606, - "num_tokens": 224678282.0, + "epoch": 2.5034965034965033, + "grad_norm": 0.4170018950625692, + "learning_rate": 1.664295958517304e-06, + "loss": 0.2689, + "num_tokens": 60945537.0, "step": 270 }, { - "epoch": 0.8712075547518585, - "grad_norm": 0.3293017781857914, - "learning_rate": 6.689119061516749e-06, - "loss": 1.0157, - "num_tokens": 225550320.0, + "epoch": 2.5128205128205128, + "grad_norm": 0.4265907439907391, + "learning_rate": 1.6409447673222828e-06, + "loss": 0.2636, + "num_tokens": 61165444.0, "step": 271 }, { - "epoch": 0.8744223427767731, - "grad_norm": 0.3117563817753623, - "learning_rate": 6.666566510477471e-06, - "loss": 0.9501, - "num_tokens": 226406243.0, + "epoch": 2.5221445221445222, + "grad_norm": 0.4318980822762591, + "learning_rate": 1.6179798700011806e-06, + "loss": 0.256, + "num_tokens": 61406513.0, "step": 272 }, { - "epoch": 0.8776371308016878, - "grad_norm": 0.26885284363376966, - "learning_rate": 6.643982503870693e-06, - "loss": 0.8067, - "num_tokens": 227204455.0, + "epoch": 2.5314685314685317, + "grad_norm": 0.44694267169277563, + "learning_rate": 1.5954035653547689e-06, + "loss": 0.2694, + "num_tokens": 61632914.0, "step": 273 }, { - "epoch": 0.8808519188266024, - "grad_norm": 0.3233869593222032, - "learning_rate": 6.621367650656756e-06, - "loss": 0.8647, - "num_tokens": 228060801.0, + "epoch": 2.5407925407925407, + "grad_norm": 0.42882361209303854, + "learning_rate": 1.5732181132854492e-06, + "loss": 0.2769, + "num_tokens": 61860817.0, "step": 274 }, { - "epoch": 0.884066706851517, - "grad_norm": 0.3121880629149497, - "learning_rate": 6.598722560627761e-06, - "loss": 0.8517, - "num_tokens": 228913422.0, + "epoch": 2.55011655011655, + "grad_norm": 0.41313799908685594, + "learning_rate": 1.55142573457103e-06, + "loss": 0.2575, + "num_tokens": 62096041.0, "step": 275 }, { - "epoch": 0.8872814948764316, - "grad_norm": 0.3029973272208078, - "learning_rate": 6.576047844391124e-06, - "loss": 0.7623, - "num_tokens": 229711312.0, + "epoch": 2.5594405594405596, + "grad_norm": 0.4374318776570311, + "learning_rate": 1.5300286106424279e-06, + "loss": 0.2605, + "num_tokens": 62325189.0, "step": 276 }, { - "epoch": 0.8904962829013462, - "grad_norm": 0.2960915793153165, - "learning_rate": 6.553344113353098e-06, - "loss": 0.8361, - "num_tokens": 230503685.0, + "epoch": 2.5687645687645686, + "grad_norm": 0.4235604710951482, + "learning_rate": 1.509028883365305e-06, + "loss": 0.2695, + "num_tokens": 62553758.0, "step": 277 }, { - "epoch": 0.8937110709262608, - "grad_norm": 0.26462306514841755, - "learning_rate": 6.530611979702312e-06, - "loss": 0.8982, - "num_tokens": 231396081.0, + "epoch": 2.578088578088578, + "grad_norm": 0.4297349959495502, + "learning_rate": 1.488428654825669e-06, + "loss": 0.2661, + "num_tokens": 62781233.0, "step": 278 }, { - "epoch": 0.8969258589511754, - "grad_norm": 0.2908358901422216, - "learning_rate": 6.507852056393236e-06, - "loss": 0.9777, - "num_tokens": 232219037.0, + "epoch": 2.5874125874125875, + "grad_norm": 0.4460197650538502, + "learning_rate": 1.468229987119448e-06, + "loss": 0.2749, + "num_tokens": 63013487.0, "step": 279 }, { - "epoch": 0.90014064697609, - "grad_norm": 0.2919167758688302, - "learning_rate": 6.485064957129677e-06, - "loss": 0.8767, - "num_tokens": 233115729.0, + "epoch": 2.596736596736597, + "grad_norm": 0.4286182611725574, + "learning_rate": 1.4484349021460784e-06, + "loss": 0.2599, + "num_tokens": 63242597.0, "step": 280 }, { - "epoch": 0.9033554350010046, - "grad_norm": 0.2977335786151108, - "learning_rate": 6.462251296348216e-06, - "loss": 0.9373, - "num_tokens": 233868597.0, + "epoch": 2.606060606060606, + "grad_norm": 0.4511458824539744, + "learning_rate": 1.4290453814061065e-06, + "loss": 0.2676, + "num_tokens": 63465524.0, "step": 281 }, { - "epoch": 0.9065702230259193, - "grad_norm": 0.27321689364158774, - "learning_rate": 6.439411689201643e-06, - "loss": 0.7251, - "num_tokens": 234679187.0, + "epoch": 2.6153846153846154, + "grad_norm": 0.4357813194887112, + "learning_rate": 1.4100633658028456e-06, + "loss": 0.2685, + "num_tokens": 63685410.0, "step": 282 }, { - "epoch": 0.9097850110508339, - "grad_norm": 0.31365396892322, - "learning_rate": 6.41654675154238e-06, - "loss": 0.8985, - "num_tokens": 235491399.0, + "epoch": 2.624708624708625, + "grad_norm": 0.42120478929241784, + "learning_rate": 1.3914907554480842e-06, + "loss": 0.2612, + "num_tokens": 63915223.0, "step": 283 }, { - "epoch": 0.9129997990757485, - "grad_norm": 0.2773849701735814, - "learning_rate": 6.393657099905854e-06, - "loss": 0.9344, - "num_tokens": 236352672.0, + "epoch": 2.634032634032634, + "grad_norm": 0.4389076820558484, + "learning_rate": 1.3733294094718866e-06, + "loss": 0.2709, + "num_tokens": 64138023.0, "step": 284 }, { - "epoch": 0.9162145871006631, - "grad_norm": 0.2645401551925225, - "learning_rate": 6.370743351493899e-06, - "loss": 0.8071, - "num_tokens": 237180799.0, + "epoch": 2.6433566433566433, + "grad_norm": 0.4374182234608829, + "learning_rate": 1.3555811458364907e-06, + "loss": 0.2704, + "num_tokens": 64364256.0, "step": 285 }, { - "epoch": 0.9194293751255777, - "grad_norm": 0.28851610288381874, - "learning_rate": 6.3478061241580904e-06, - "loss": 0.9002, - "num_tokens": 237988043.0, + "epoch": 2.652680652680653, + "grad_norm": 0.44236834709549855, + "learning_rate": 1.3382477411543343e-06, + "loss": 0.2655, + "num_tokens": 64587438.0, "step": 286 }, { - "epoch": 0.9226441631504922, - "grad_norm": 0.3274610499229697, - "learning_rate": 6.324846036383096e-06, - "loss": 0.8437, - "num_tokens": 238745687.0, + "epoch": 2.6620046620046622, + "grad_norm": 0.3962866587873811, + "learning_rate": 1.3213309305102079e-06, + "loss": 0.2667, + "num_tokens": 64816847.0, "step": 287 }, { - "epoch": 0.9258589511754068, - "grad_norm": 0.2814370905653731, - "learning_rate": 6.301863707270003e-06, - "loss": 0.8168, - "num_tokens": 239522057.0, + "epoch": 2.6713286713286712, + "grad_norm": 0.44441366707237095, + "learning_rate": 1.304832407287574e-06, + "loss": 0.2728, + "num_tokens": 65039513.0, "step": 288 }, { - "epoch": 0.9290737392003214, - "grad_norm": 0.2640287762511696, - "learning_rate": 6.278859756519613e-06, - "loss": 0.8278, - "num_tokens": 240414986.0, + "epoch": 2.6806526806526807, + "grad_norm": 0.4221320298969624, + "learning_rate": 1.2887538229990627e-06, + "loss": 0.2631, + "num_tokens": 65275781.0, "step": 289 }, { - "epoch": 0.9322885272252361, - "grad_norm": 0.2786543918308066, - "learning_rate": 6.255834804415742e-06, - "loss": 0.7807, - "num_tokens": 241197257.0, + "epoch": 2.6899766899766897, + "grad_norm": 0.41457174464701124, + "learning_rate": 1.2730967871211484e-06, + "loss": 0.2588, + "num_tokens": 65502493.0, "step": 290 }, { - "epoch": 0.9355033152501507, - "grad_norm": 0.27966431552094895, - "learning_rate": 6.2327894718084916e-06, - "loss": 0.839, - "num_tokens": 242056898.0, + "epoch": 2.699300699300699, + "grad_norm": 0.41598874521296114, + "learning_rate": 1.2578628669330422e-06, + "loss": 0.2596, + "num_tokens": 65736335.0, "step": 291 }, { - "epoch": 0.9387181032750653, - "grad_norm": 0.28404029598767777, - "learning_rate": 6.209724380097505e-06, - "loss": 0.8483, - "num_tokens": 242881406.0, + "epoch": 2.7086247086247086, + "grad_norm": 0.42296533421373433, + "learning_rate": 1.2430535873598074e-06, + "loss": 0.2704, + "num_tokens": 65950812.0, "step": 292 }, { - "epoch": 0.9419328912999799, - "grad_norm": 0.2662025528578104, - "learning_rate": 6.186640151215217e-06, - "loss": 0.6669, - "num_tokens": 243716610.0, + "epoch": 2.717948717948718, + "grad_norm": 0.45062440552954913, + "learning_rate": 1.2286704308197135e-06, + "loss": 0.2832, + "num_tokens": 66165588.0, "step": 293 }, { - "epoch": 0.9451476793248945, - "grad_norm": 0.28618097691672517, - "learning_rate": 6.163537407610081e-06, - "loss": 0.8233, - "num_tokens": 244588893.0, + "epoch": 2.7272727272727275, + "grad_norm": 0.4445602504728877, + "learning_rate": 1.2147148370758422e-06, + "loss": 0.2616, + "num_tokens": 66393469.0, "step": 294 }, { - "epoch": 0.9483624673498091, - "grad_norm": 0.2814023456928436, - "learning_rate": 6.140416772229785e-06, - "loss": 0.7837, - "num_tokens": 245427971.0, + "epoch": 2.7365967365967365, + "grad_norm": 0.4223681741501879, + "learning_rate": 1.2011882030919707e-06, + "loss": 0.2603, + "num_tokens": 66617146.0, "step": 295 }, { - "epoch": 0.9515772553747237, - "grad_norm": 0.3280677142624181, - "learning_rate": 6.117278868504454e-06, - "loss": 0.9404, - "num_tokens": 246259750.0, + "epoch": 2.745920745920746, + "grad_norm": 0.42062867718418007, + "learning_rate": 1.1880918828927305e-06, + "loss": 0.2559, + "num_tokens": 66847475.0, "step": 296 }, { - "epoch": 0.9547920433996383, - "grad_norm": 0.28497232475973866, - "learning_rate": 6.094124320329843e-06, - "loss": 0.868, - "num_tokens": 247136022.0, + "epoch": 2.755244755244755, + "grad_norm": 0.4065760757827556, + "learning_rate": 1.175427187428072e-06, + "loss": 0.2574, + "num_tokens": 67082325.0, "step": 297 }, { - "epoch": 0.9580068314245529, - "grad_norm": 0.2758861944067013, - "learning_rate": 6.070953752050509e-06, - "loss": 0.8361, - "num_tokens": 247958906.0, + "epoch": 2.7645687645687644, + "grad_norm": 0.4455630867473245, + "learning_rate": 1.163195384442036e-06, + "loss": 0.2738, + "num_tokens": 67299520.0, "step": 298 }, { - "epoch": 0.9612216194494676, - "grad_norm": 0.28802455190621595, - "learning_rate": 6.047767788442983e-06, - "loss": 0.7433, - "num_tokens": 248704826.0, + "epoch": 2.773892773892774, + "grad_norm": 0.4327619189440975, + "learning_rate": 1.1513976983458506e-06, + "loss": 0.2647, + "num_tokens": 67526642.0, "step": 299 }, { - "epoch": 0.9644364074743822, - "grad_norm": 0.31056071926580564, - "learning_rate": 6.0245670546989165e-06, - "loss": 0.8575, - "num_tokens": 249485746.0, + "epoch": 2.7832167832167833, + "grad_norm": 0.4244253793482913, + "learning_rate": 1.1400353100953692e-06, + "loss": 0.2594, + "num_tokens": 67753621.0, "step": 300 }, { - "epoch": 0.9676511954992968, - "grad_norm": 0.3204096864163064, - "learning_rate": 6.001352176408225e-06, - "loss": 0.8137, - "num_tokens": 250237095.0, + "epoch": 2.792540792540793, + "grad_norm": 0.42422473241175773, + "learning_rate": 1.1291093570728561e-06, + "loss": 0.2695, + "num_tokens": 67971387.0, "step": 301 }, { - "epoch": 0.9708659835242114, - "grad_norm": 0.3704473764934116, - "learning_rate": 5.9781237795422234e-06, - "loss": 0.8937, - "num_tokens": 251047039.0, + "epoch": 2.801864801864802, + "grad_norm": 0.4247831741758565, + "learning_rate": 1.1186209329731306e-06, + "loss": 0.2728, + "num_tokens": 68191217.0, "step": 302 }, { - "epoch": 0.974080771549126, - "grad_norm": 0.26873972748467573, - "learning_rate": 5.954882490436742e-06, - "loss": 0.7545, - "num_tokens": 251893123.0, + "epoch": 2.8111888111888113, + "grad_norm": 0.4408896307276524, + "learning_rate": 1.1085710876940913e-06, + "loss": 0.262, + "num_tokens": 68415882.0, "step": 303 }, { - "epoch": 0.9772955595740406, - "grad_norm": 0.30581639989328346, - "learning_rate": 5.931628935775241e-06, - "loss": 0.8054, - "num_tokens": 252724270.0, + "epoch": 2.8205128205128203, + "grad_norm": 0.41385534921176376, + "learning_rate": 1.0989608272316172e-06, + "loss": 0.2528, + "num_tokens": 68641200.0, "step": 304 }, { - "epoch": 0.9805103475989552, - "grad_norm": 0.3289733614982222, - "learning_rate": 5.908363742571915e-06, - "loss": 0.8718, - "num_tokens": 253551425.0, + "epoch": 2.8298368298368297, + "grad_norm": 0.4300973675335551, + "learning_rate": 1.089791113578871e-06, + "loss": 0.2583, + "num_tokens": 68869654.0, "step": 305 }, { - "epoch": 0.9837251356238698, - "grad_norm": 0.2993193044129647, - "learning_rate": 5.88508753815478e-06, - "loss": 0.8546, - "num_tokens": 254374504.0, + "epoch": 2.839160839160839, + "grad_norm": 0.40220366787982387, + "learning_rate": 1.0810628646299988e-06, + "loss": 0.2673, + "num_tokens": 69098611.0, + "step": 306 + }, + { + "epoch": 2.839160839160839, + "eval_loss": 0.3964461386203766, + "eval_num_tokens": 69098611.0, + "eval_runtime": 44.935, + "eval_samples_per_second": 67.898, + "eval_steps_per_second": 8.501, "step": 306 }, { - "epoch": 0.9869399236487844, - "grad_norm": 0.267911025679125, - "learning_rate": 5.861800950148761e-06, - "loss": 0.7953, - "num_tokens": 255216423.0, + "epoch": 2.8484848484848486, + "grad_norm": 0.4343795644115535, + "learning_rate": 1.072776954088251e-06, + "loss": 0.2716, + "num_tokens": 69322088.0, "step": 307 }, { - "epoch": 0.9901547116736991, - "grad_norm": 0.3650782265894403, - "learning_rate": 5.838504606458768e-06, - "loss": 0.9558, - "num_tokens": 255964424.0, + "epoch": 2.857808857808858, + "grad_norm": 0.42045510778209694, + "learning_rate": 1.0649342113785217e-06, + "loss": 0.2656, + "num_tokens": 69538824.0, "step": 308 }, { - "epoch": 0.9933694996986137, - "grad_norm": 0.3114000873229984, - "learning_rate": 5.81519913525277e-06, - "loss": 0.9134, - "num_tokens": 256829564.0, + "epoch": 2.867132867132867, + "grad_norm": 0.4257152687835886, + "learning_rate": 1.057535421564327e-06, + "loss": 0.2697, + "num_tokens": 69770210.0, "step": 309 }, { - "epoch": 0.9965842877235283, - "grad_norm": 0.33158523687081076, - "learning_rate": 5.791885164944844e-06, - "loss": 0.8301, - "num_tokens": 257604305.0, + "epoch": 2.8764568764568765, + "grad_norm": 0.4182840339935867, + "learning_rate": 1.0505813252692142e-06, + "loss": 0.2705, + "num_tokens": 69993710.0, "step": 310 }, { - "epoch": 0.9997990757484428, - "grad_norm": 0.2853174073030489, - "learning_rate": 5.768563324178247e-06, - "loss": 0.8099, - "num_tokens": 258422101.0, + "epoch": 2.8857808857808855, + "grad_norm": 0.4270654163158855, + "learning_rate": 1.0440726186026289e-06, + "loss": 0.2824, + "num_tokens": 70216546.0, "step": 311 }, { - "epoch": 1.0, - "grad_norm": 0.2853174073030489, - "learning_rate": 5.745234241808454e-06, - "loss": 0.6789, - "num_tokens": 258474863.0, - "step": 312 - }, - { - "epoch": 1.0, - "eval_loss": 0.6563695669174194, - "eval_num_tokens": 258474863.0, - "eval_runtime": 198.7553, - "eval_samples_per_second": 44.517, - "eval_steps_per_second": 5.565, + "epoch": 2.895104895104895, + "grad_norm": 0.45311248568170176, + "learning_rate": 1.038009953090232e-06, + "loss": 0.2749, + "num_tokens": 70439614.0, "step": 312 }, { - "epoch": 1.0032147880249147, - "grad_norm": 0.29773732604881553, - "learning_rate": 5.721898546886201e-06, - "loss": 0.7732, - "num_tokens": 744214.0, + "epoch": 2.9044289044289044, + "grad_norm": 0.41047094656547684, + "learning_rate": 1.032393935608683e-06, + "loss": 0.2626, + "num_tokens": 70670380.0, "step": 313 }, { - "epoch": 1.0064295760498292, - "grad_norm": 0.29997749253731704, - "learning_rate": 5.698556868640531e-06, - "loss": 0.7741, - "num_tokens": 1573854.0, + "epoch": 2.913752913752914, + "grad_norm": 0.42280895869632795, + "learning_rate": 1.0272251283248903e-06, + "loss": 0.264, + "num_tokens": 70900553.0, "step": 314 }, { - "epoch": 1.009644364074744, - "grad_norm": 0.28340782585886687, - "learning_rate": 5.67520983646182e-06, - "loss": 0.8009, - "num_tokens": 2435877.0, + "epoch": 2.9230769230769234, + "grad_norm": 0.420152194916759, + "learning_rate": 1.022504048639738e-06, + "loss": 0.2698, + "num_tokens": 71129446.0, "step": 315 }, { - "epoch": 1.0128591520996584, - "grad_norm": 0.3048030231342332, - "learning_rate": 5.651858079884811e-06, - "loss": 0.841, - "num_tokens": 3256874.0, + "epoch": 2.9324009324009324, + "grad_norm": 0.4252339208960209, + "learning_rate": 1.0182311691362935e-06, + "loss": 0.2574, + "num_tokens": 71364044.0, "step": 316 }, { - "epoch": 1.016073940124573, - "grad_norm": 0.31361002129198445, - "learning_rate": 5.6285022285716325e-06, - "loss": 0.7919, - "num_tokens": 4040615.0, + "epoch": 2.941724941724942, + "grad_norm": 0.4146036330803457, + "learning_rate": 1.014406917532503e-06, + "loss": 0.2686, + "num_tokens": 71582926.0, "step": 317 }, { - "epoch": 1.0192887281494876, - "grad_norm": 0.2641359904470198, - "learning_rate": 5.605142912294831e-06, - "loss": 0.741, - "num_tokens": 4873583.0, + "epoch": 2.951048951048951, + "grad_norm": 0.4266092057892508, + "learning_rate": 1.0110316766383745e-06, + "loss": 0.2625, + "num_tokens": 71805330.0, "step": 318 }, { - "epoch": 1.0225035161744023, - "grad_norm": 0.2828199296344485, - "learning_rate": 5.581780760920378e-06, - "loss": 0.763, - "num_tokens": 5707319.0, + "epoch": 2.9603729603729603, + "grad_norm": 0.4217243793579386, + "learning_rate": 1.00810578431766e-06, + "loss": 0.2587, + "num_tokens": 72035215.0, "step": 319 }, { - "epoch": 1.0257183041993168, - "grad_norm": 0.2950193785016559, - "learning_rate": 5.5584164043906895e-06, - "loss": 0.8235, - "num_tokens": 6516170.0, + "epoch": 2.9696969696969697, + "grad_norm": 0.4199077117513419, + "learning_rate": 1.0056295334540357e-06, + "loss": 0.2596, + "num_tokens": 72263057.0, "step": 320 }, { - "epoch": 1.0289330922242315, - "grad_norm": 0.2548931046045369, - "learning_rate": 5.535050472707648e-06, - "loss": 0.6627, - "num_tokens": 7287585.0, + "epoch": 2.979020979020979, + "grad_norm": 0.4533624252529699, + "learning_rate": 1.0036031719217808e-06, + "loss": 0.2663, + "num_tokens": 72485545.0, "step": 321 }, { - "epoch": 1.0321478802491462, - "grad_norm": 0.3185973954451004, - "learning_rate": 5.5116835959156045e-06, - "loss": 0.7707, - "num_tokens": 8122985.0, + "epoch": 2.988344988344988, + "grad_norm": 0.4453560519473016, + "learning_rate": 1.0020269025609697e-06, + "loss": 0.2657, + "num_tokens": 72698615.0, "step": 322 }, { - "epoch": 1.0353626682740606, - "grad_norm": 0.32478485338090357, - "learning_rate": 5.488316404084397e-06, - "loss": 0.8762, - "num_tokens": 8943399.0, + "epoch": 2.9976689976689976, + "grad_norm": 0.44895378041312567, + "learning_rate": 1.0009008831571635e-06, + "loss": 0.2785, + "num_tokens": 72922319.0, "step": 323 }, { - "epoch": 1.0385774562989754, - "grad_norm": 0.30167349789091147, - "learning_rate": 5.464949527292352e-06, - "loss": 0.8779, - "num_tokens": 9847975.0, + "epoch": 3.0, + "grad_norm": 0.44895378041312567, + "learning_rate": 1.000225226425618e-06, + "loss": 0.2816, + "num_tokens": 72976686.0, "step": 324 }, { - "epoch": 1.0417922443238898, - "grad_norm": 0.29450857709602724, - "learning_rate": 5.441583595609312e-06, - "loss": 0.9535, - "num_tokens": 10654365.0, - "step": 325 - }, - { - "epoch": 1.0450070323488045, - "grad_norm": 0.29182225025218944, - "learning_rate": 5.418219239079624e-06, - "loss": 0.7577, - "num_tokens": 11420194.0, - "step": 326 - }, - { - "epoch": 1.048221820373719, - "grad_norm": 0.3041378234576701, - "learning_rate": 5.39485708770517e-06, - "loss": 0.821, - "num_tokens": 12222162.0, - "step": 327 - }, - { - "epoch": 1.0514366083986337, - "grad_norm": 0.2957533117853562, - "learning_rate": 5.371497771428368e-06, - "loss": 0.884, - "num_tokens": 12990790.0, - "step": 328 - }, - { - "epoch": 1.0546513964235482, - "grad_norm": 0.26985723996826116, - "learning_rate": 5.348141920115191e-06, - "loss": 0.7335, - "num_tokens": 13813577.0, - "step": 329 - }, - { - "epoch": 1.057866184448463, - "grad_norm": 0.28012407668708217, - "learning_rate": 5.324790163538181e-06, - "loss": 0.8132, - "num_tokens": 14679304.0, - "step": 330 - }, - { - "epoch": 1.0610809724733776, - "grad_norm": 0.26703763473284864, - "learning_rate": 5.30144313135947e-06, - "loss": 0.7919, - "num_tokens": 15520191.0, - "step": 331 - }, - { - "epoch": 1.064295760498292, - "grad_norm": 0.3279895776464466, - "learning_rate": 5.278101453113801e-06, - "loss": 0.8726, - "num_tokens": 16347999.0, - "step": 332 - }, - { - "epoch": 1.0675105485232068, - "grad_norm": 0.29100239315593257, - "learning_rate": 5.254765758191547e-06, - "loss": 0.853, - "num_tokens": 17179863.0, - "step": 333 - }, - { - "epoch": 1.0707253365481213, - "grad_norm": 0.25436005411053014, - "learning_rate": 5.231436675821754e-06, - "loss": 0.7805, - "num_tokens": 18010698.0, - "step": 334 - }, - { - "epoch": 1.073940124573036, - "grad_norm": 0.39468232284937566, - "learning_rate": 5.208114835055157e-06, - "loss": 0.7931, - "num_tokens": 18800675.0, - "step": 335 - }, - { - "epoch": 1.0771549125979505, - "grad_norm": 0.2609325367233198, - "learning_rate": 5.184800864747233e-06, - "loss": 0.6901, - "num_tokens": 19629163.0, - "step": 336 - }, - { - "epoch": 1.0803697006228652, - "grad_norm": 0.2851800406052326, - "learning_rate": 5.161495393541233e-06, - "loss": 0.7736, - "num_tokens": 20442399.0, - "step": 337 - }, - { - "epoch": 1.0835844886477797, - "grad_norm": 0.283041770428445, - "learning_rate": 5.1381990498512415e-06, - "loss": 0.7906, - "num_tokens": 21292500.0, - "step": 338 - }, - { - "epoch": 1.0867992766726944, - "grad_norm": 0.2861114841812242, - "learning_rate": 5.114912461845223e-06, - "loss": 0.7966, - "num_tokens": 22132998.0, - "step": 339 - }, - { - "epoch": 1.090014064697609, - "grad_norm": 0.2708986154860288, - "learning_rate": 5.0916362574280864e-06, - "loss": 0.7789, - "num_tokens": 23019322.0, - "step": 340 - }, - { - "epoch": 1.0932288527225236, - "grad_norm": 0.28793375034832, - "learning_rate": 5.0683710642247616e-06, - "loss": 0.8376, - "num_tokens": 23940510.0, - "step": 341 - }, - { - "epoch": 1.0964436407474383, - "grad_norm": 0.30225075622837805, - "learning_rate": 5.04511750956326e-06, - "loss": 0.8916, - "num_tokens": 24713209.0, - "step": 342 - }, - { - "epoch": 1.0996584287723528, - "grad_norm": 0.26655917817006813, - "learning_rate": 5.02187622045778e-06, - "loss": 0.8194, - "num_tokens": 25533514.0, - "step": 343 - }, - { - "epoch": 1.1028732167972675, - "grad_norm": 0.28538979814112303, - "learning_rate": 4.998647823591776e-06, - "loss": 0.7943, - "num_tokens": 26366593.0, - "step": 344 - }, - { - "epoch": 1.106088004822182, - "grad_norm": 0.28523185501933707, - "learning_rate": 4.975432945301085e-06, - "loss": 0.8115, - "num_tokens": 27171515.0, - "step": 345 - }, - { - "epoch": 1.1093027928470967, - "grad_norm": 0.2760152558029347, - "learning_rate": 4.952232211557016e-06, - "loss": 0.8237, - "num_tokens": 27986461.0, - "step": 346 - }, - { - "epoch": 1.1125175808720114, - "grad_norm": 0.30683324246726074, - "learning_rate": 4.929046247949493e-06, - "loss": 0.867, - "num_tokens": 28806721.0, - "step": 347 - }, - { - "epoch": 1.1157323688969258, - "grad_norm": 0.3219809931238027, - "learning_rate": 4.90587567967016e-06, - "loss": 0.897, - "num_tokens": 29686799.0, - "step": 348 - }, - { - "epoch": 1.1189471569218405, - "grad_norm": 0.2885549341938194, - "learning_rate": 4.882721131495548e-06, - "loss": 0.8561, - "num_tokens": 30473321.0, - "step": 349 - }, - { - "epoch": 1.122161944946755, - "grad_norm": 0.31219249749001193, - "learning_rate": 4.859583227770218e-06, - "loss": 0.8746, - "num_tokens": 31346354.0, - "step": 350 - }, - { - "epoch": 1.1253767329716697, - "grad_norm": 0.2744719455872982, - "learning_rate": 4.83646259238992e-06, - "loss": 0.7586, - "num_tokens": 32151169.0, - "step": 351 - }, - { - "epoch": 1.1285915209965842, - "grad_norm": 0.2719465259878642, - "learning_rate": 4.813359848784784e-06, - "loss": 0.8296, - "num_tokens": 33000318.0, - "step": 352 - }, - { - "epoch": 1.131806309021499, - "grad_norm": 0.2731499386230732, - "learning_rate": 4.790275619902496e-06, - "loss": 0.828, - "num_tokens": 33855212.0, - "step": 353 - }, - { - "epoch": 1.1350210970464134, - "grad_norm": 0.24491242680730593, - "learning_rate": 4.767210528191511e-06, - "loss": 0.718, - "num_tokens": 34674479.0, - "step": 354 - }, - { - "epoch": 1.1382358850713281, - "grad_norm": 0.241364209695039, - "learning_rate": 4.744165195584258e-06, - "loss": 0.7271, - "num_tokens": 35497170.0, - "step": 355 - }, - { - "epoch": 1.1414506730962426, - "grad_norm": 0.2699288005061086, - "learning_rate": 4.721140243480389e-06, - "loss": 0.7729, - "num_tokens": 36361086.0, - "step": 356 - }, - { - "epoch": 1.1446654611211573, - "grad_norm": 0.2747502099174342, - "learning_rate": 4.6981362927299975e-06, - "loss": 0.7253, - "num_tokens": 37181714.0, - "step": 357 - }, - { - "epoch": 1.147880249146072, - "grad_norm": 0.2640456488027195, - "learning_rate": 4.675153963616905e-06, - "loss": 0.8073, - "num_tokens": 38017495.0, - "step": 358 - }, - { - "epoch": 1.1510950371709865, - "grad_norm": 0.24720621846535545, - "learning_rate": 4.652193875841913e-06, - "loss": 0.794, - "num_tokens": 38922269.0, - "step": 359 - }, - { - "epoch": 1.1543098251959012, - "grad_norm": 0.2898206291501093, - "learning_rate": 4.6292566485061015e-06, - "loss": 0.8139, - "num_tokens": 39757816.0, - "step": 360 - }, - { - "epoch": 1.1575246132208157, - "grad_norm": 0.2833878551097471, - "learning_rate": 4.606342900094147e-06, - "loss": 0.7894, - "num_tokens": 40618751.0, - "step": 361 - }, - { - "epoch": 1.1607394012457304, - "grad_norm": 0.26539693106847995, - "learning_rate": 4.583453248457622e-06, - "loss": 0.76, - "num_tokens": 41455981.0, - "step": 362 - }, - { - "epoch": 1.1639541892706449, - "grad_norm": 0.2556129492651888, - "learning_rate": 4.5605883107983575e-06, - "loss": 0.7165, - "num_tokens": 42294822.0, - "step": 363 - }, - { - "epoch": 1.1671689772955596, - "grad_norm": 0.26430920444715233, - "learning_rate": 4.537748703651785e-06, - "loss": 0.7099, - "num_tokens": 43189461.0, - "step": 364 - }, - { - "epoch": 1.1703837653204743, - "grad_norm": 0.2559203970581411, - "learning_rate": 4.514935042870324e-06, - "loss": 0.6911, - "num_tokens": 44030419.0, - "step": 365 - }, - { - "epoch": 1.1735985533453888, - "grad_norm": 0.2689858272909754, - "learning_rate": 4.492147943606765e-06, - "loss": 0.731, - "num_tokens": 44808020.0, - "step": 366 - }, - { - "epoch": 1.1768133413703035, - "grad_norm": 0.2715780338321884, - "learning_rate": 4.469388020297691e-06, - "loss": 0.8605, - "num_tokens": 45663922.0, - "step": 367 - }, - { - "epoch": 1.180028129395218, - "grad_norm": 0.26052636065101675, - "learning_rate": 4.446655886646903e-06, - "loss": 0.7554, - "num_tokens": 46453986.0, - "step": 368 - }, - { - "epoch": 1.1832429174201327, - "grad_norm": 0.27167375999843213, - "learning_rate": 4.423952155608878e-06, - "loss": 0.8279, - "num_tokens": 47306541.0, - "step": 369 - }, - { - "epoch": 1.1864577054450471, - "grad_norm": 0.2710756521672715, - "learning_rate": 4.40127743937224e-06, - "loss": 0.719, - "num_tokens": 48090427.0, - "step": 370 - }, - { - "epoch": 1.1896724934699618, - "grad_norm": 0.2535254786000629, - "learning_rate": 4.378632349343245e-06, - "loss": 0.7535, - "num_tokens": 48961866.0, - "step": 371 - }, - { - "epoch": 1.1928872814948766, - "grad_norm": 0.24076007447445372, - "learning_rate": 4.35601749612931e-06, - "loss": 0.7612, - "num_tokens": 49886554.0, - "step": 372 - }, - { - "epoch": 1.196102069519791, - "grad_norm": 0.25901260483629757, - "learning_rate": 4.333433489522529e-06, - "loss": 0.7665, - "num_tokens": 50728645.0, - "step": 373 - }, - { - "epoch": 1.1993168575447057, - "grad_norm": 0.26783353876505317, - "learning_rate": 4.310880938483253e-06, - "loss": 0.7812, - "num_tokens": 51574362.0, - "step": 374 - }, - { - "epoch": 1.2025316455696202, - "grad_norm": 0.26460932826536054, - "learning_rate": 4.288360451123646e-06, - "loss": 0.7804, - "num_tokens": 52386191.0, - "step": 375 - }, - { - "epoch": 1.205746433594535, - "grad_norm": 0.2474357198912021, - "learning_rate": 4.265872634691307e-06, - "loss": 0.7532, - "num_tokens": 53260334.0, - "step": 376 - }, - { - "epoch": 1.2089612216194494, - "grad_norm": 0.23913128576917642, - "learning_rate": 4.2434180955528855e-06, - "loss": 0.7552, - "num_tokens": 54146391.0, - "step": 377 - }, - { - "epoch": 1.2121760096443641, - "grad_norm": 0.2544585227812706, - "learning_rate": 4.220997439177741e-06, - "loss": 0.8663, - "num_tokens": 55041956.0, - "step": 378 - }, - { - "epoch": 1.2153907976692786, - "grad_norm": 0.2791457877110213, - "learning_rate": 4.198611270121598e-06, - "loss": 0.8836, - "num_tokens": 55853888.0, - "step": 379 - }, - { - "epoch": 1.2186055856941933, - "grad_norm": 0.2601427563819937, - "learning_rate": 4.1762601920102675e-06, - "loss": 0.8338, - "num_tokens": 56667891.0, - "step": 380 - }, - { - "epoch": 1.2218203737191078, - "grad_norm": 0.2881042786165002, - "learning_rate": 4.153944807523356e-06, - "loss": 0.7515, - "num_tokens": 57494051.0, - "step": 381 - }, - { - "epoch": 1.2250351617440225, - "grad_norm": 0.2717376099655538, - "learning_rate": 4.13166571837802e-06, - "loss": 0.7304, - "num_tokens": 58273438.0, - "step": 382 - }, - { - "epoch": 1.2282499497689372, - "grad_norm": 0.24796160518353433, - "learning_rate": 4.109423525312738e-06, - "loss": 0.7527, - "num_tokens": 59084119.0, - "step": 383 - }, - { - "epoch": 1.2314647377938517, - "grad_norm": 0.24443369548659674, - "learning_rate": 4.087218828071116e-06, - "loss": 0.7157, - "num_tokens": 59973616.0, - "step": 384 - }, - { - "epoch": 1.2346795258187664, - "grad_norm": 0.28019239779197574, - "learning_rate": 4.065052225385717e-06, - "loss": 0.8255, - "num_tokens": 60858377.0, - "step": 385 - }, - { - "epoch": 1.2378943138436809, - "grad_norm": 0.2552782179183279, - "learning_rate": 4.0429243149619065e-06, - "loss": 0.8367, - "num_tokens": 61731411.0, - "step": 386 - }, - { - "epoch": 1.2411091018685956, - "grad_norm": 0.2666120199534677, - "learning_rate": 4.020835693461751e-06, - "loss": 0.7578, - "num_tokens": 62498701.0, - "step": 387 - }, - { - "epoch": 1.24432388989351, - "grad_norm": 0.2936643180030148, - "learning_rate": 3.998786956487913e-06, - "loss": 0.8695, - "num_tokens": 63360468.0, - "step": 388 - }, - { - "epoch": 1.2475386779184248, - "grad_norm": 0.2820327965592649, - "learning_rate": 3.976778698567605e-06, - "loss": 0.7907, - "num_tokens": 64140584.0, - "step": 389 - }, - { - "epoch": 1.2507534659433395, - "grad_norm": 0.2793026613841628, - "learning_rate": 3.954811513136554e-06, - "loss": 0.848, - "num_tokens": 65012995.0, - "step": 390 - }, - { - "epoch": 1.253968253968254, - "grad_norm": 0.2958719965667329, - "learning_rate": 3.932885992522992e-06, - "loss": 0.7252, - "num_tokens": 65798860.0, - "step": 391 - }, - { - "epoch": 1.2571830419931684, - "grad_norm": 0.24632028953303942, - "learning_rate": 3.911002727931701e-06, - "loss": 0.7348, - "num_tokens": 66669823.0, - "step": 392 - }, - { - "epoch": 1.2603978300180831, - "grad_norm": 0.23797767648678955, - "learning_rate": 3.8891623094280535e-06, - "loss": 0.7345, - "num_tokens": 67493654.0, - "step": 393 - }, - { - "epoch": 1.2636126180429978, - "grad_norm": 0.3073836669686047, - "learning_rate": 3.867365325922116e-06, - "loss": 0.8871, - "num_tokens": 68347303.0, - "step": 394 - }, - { - "epoch": 1.2668274060679123, - "grad_norm": 0.2605636404088573, - "learning_rate": 3.84561236515276e-06, - "loss": 0.7849, - "num_tokens": 69200332.0, - "step": 395 - }, - { - "epoch": 1.270042194092827, - "grad_norm": 0.2561929964279003, - "learning_rate": 3.82390401367182e-06, - "loss": 0.8371, - "num_tokens": 70012951.0, - "step": 396 - }, - { - "epoch": 1.2732569821177417, - "grad_norm": 0.26123463361452404, - "learning_rate": 3.802240856828273e-06, - "loss": 0.7643, - "num_tokens": 70802232.0, - "step": 397 - }, - { - "epoch": 1.2764717701426562, - "grad_norm": 0.2452305457154863, - "learning_rate": 3.7806234787524622e-06, - "loss": 0.7797, - "num_tokens": 71692197.0, - "step": 398 - }, - { - "epoch": 1.2796865581675707, - "grad_norm": 0.24446776284318306, - "learning_rate": 3.7590524623403335e-06, - "loss": 0.8281, - "num_tokens": 72565305.0, - "step": 399 - }, - { - "epoch": 1.2829013461924854, - "grad_norm": 0.2532276733394788, - "learning_rate": 3.7375283892377344e-06, - "loss": 0.7878, - "num_tokens": 73399383.0, - "step": 400 - }, - { - "epoch": 1.2861161342174001, - "grad_norm": 0.25233679329333114, - "learning_rate": 3.7160518398247193e-06, - "loss": 0.7846, - "num_tokens": 74304601.0, - "step": 401 - }, - { - "epoch": 1.2893309222423146, - "grad_norm": 0.2505753606884865, - "learning_rate": 3.694623393199901e-06, - "loss": 0.775, - "num_tokens": 75160936.0, - "step": 402 - }, - { - "epoch": 1.2925457102672293, - "grad_norm": 0.27831371811506406, - "learning_rate": 3.673243627164841e-06, - "loss": 0.875, - "num_tokens": 75939207.0, - "step": 403 - }, - { - "epoch": 1.2957604982921438, - "grad_norm": 0.27209809805527896, - "learning_rate": 3.6519131182084612e-06, - "loss": 0.7547, - "num_tokens": 76745423.0, - "step": 404 - }, - { - "epoch": 1.2989752863170585, - "grad_norm": 0.2770005326111012, - "learning_rate": 3.630632441491512e-06, - "loss": 0.8654, - "num_tokens": 77546229.0, - "step": 405 - }, - { - "epoch": 1.302190074341973, - "grad_norm": 0.25112243918761823, - "learning_rate": 3.609402170831048e-06, - "loss": 0.7995, - "num_tokens": 78409533.0, - "step": 406 - }, - { - "epoch": 1.3054048623668877, - "grad_norm": 0.2786376316429162, - "learning_rate": 3.588222878684966e-06, - "loss": 0.8676, - "num_tokens": 79213285.0, - "step": 407 - }, - { - "epoch": 1.3086196503918024, - "grad_norm": 0.2539894759870622, - "learning_rate": 3.567095136136571e-06, - "loss": 0.8167, - "num_tokens": 80048188.0, - "step": 408 - }, - { - "epoch": 1.3118344384167169, - "grad_norm": 0.2561947101428561, - "learning_rate": 3.546019512879164e-06, - "loss": 0.7443, - "num_tokens": 80859952.0, - "step": 409 - }, - { - "epoch": 1.3150492264416316, - "grad_norm": 0.269421047983167, - "learning_rate": 3.5249965772007e-06, - "loss": 0.8164, - "num_tokens": 81678179.0, - "step": 410 - }, - { - "epoch": 1.318264014466546, - "grad_norm": 0.2512906728263445, - "learning_rate": 3.5040268959684433e-06, - "loss": 0.7971, - "num_tokens": 82537286.0, - "step": 411 - }, - { - "epoch": 1.3214788024914608, - "grad_norm": 0.2739804437734352, - "learning_rate": 3.4831110346137005e-06, - "loss": 0.792, - "num_tokens": 83337690.0, - "step": 412 - }, - { - "epoch": 1.3246935905163753, - "grad_norm": 0.2695497497338681, - "learning_rate": 3.4622495571165633e-06, - "loss": 0.8549, - "num_tokens": 84190187.0, - "step": 413 - }, - { - "epoch": 1.32790837854129, - "grad_norm": 0.26419092284628565, - "learning_rate": 3.4414430259907074e-06, - "loss": 0.832, - "num_tokens": 85019526.0, - "step": 414 - }, - { - "epoch": 1.3311231665662047, - "grad_norm": 0.2666306611252099, - "learning_rate": 3.4206920022682173e-06, - "loss": 0.8098, - "num_tokens": 85824148.0, - "step": 415 - }, - { - "epoch": 1.3343379545911191, - "grad_norm": 0.2490145705647061, - "learning_rate": 3.3999970454844688e-06, - "loss": 0.8554, - "num_tokens": 86723516.0, - "step": 416 - }, - { - "epoch": 1.3375527426160336, - "grad_norm": 0.2871288045632187, - "learning_rate": 3.3793587136630314e-06, - "loss": 0.8226, - "num_tokens": 87502869.0, - "step": 417 - }, - { - "epoch": 1.3407675306409483, - "grad_norm": 0.25797074273590526, - "learning_rate": 3.3587775633006264e-06, - "loss": 0.7654, - "num_tokens": 88342215.0, - "step": 418 - }, - { - "epoch": 1.343982318665863, - "grad_norm": 0.23681829418935663, - "learning_rate": 3.3382541493521255e-06, - "loss": 0.6972, - "num_tokens": 89191542.0, - "step": 419 - }, - { - "epoch": 1.3471971066907775, - "grad_norm": 0.2561406920096222, - "learning_rate": 3.3177890252155755e-06, - "loss": 0.798, - "num_tokens": 90018973.0, - "step": 420 - }, - { - "epoch": 1.3504118947156922, - "grad_norm": 0.24957187541274106, - "learning_rate": 3.297382742717291e-06, - "loss": 0.8241, - "num_tokens": 90843777.0, - "step": 421 - }, - { - "epoch": 1.3536266827406067, - "grad_norm": 0.24711605699464417, - "learning_rate": 3.2770358520969596e-06, - "loss": 0.8288, - "num_tokens": 91668578.0, - "step": 422 - }, - { - "epoch": 1.3568414707655214, - "grad_norm": 0.28458496860745003, - "learning_rate": 3.2567489019928204e-06, - "loss": 0.8581, - "num_tokens": 92456165.0, - "step": 423 - }, - { - "epoch": 1.360056258790436, - "grad_norm": 0.26372609997872326, - "learning_rate": 3.2365224394268542e-06, - "loss": 0.8301, - "num_tokens": 93288010.0, - "step": 424 - }, - { - "epoch": 1.3632710468153506, - "grad_norm": 0.25880756448236597, - "learning_rate": 3.2163570097900497e-06, - "loss": 0.7003, - "num_tokens": 94076772.0, - "step": 425 - }, - { - "epoch": 1.3664858348402653, - "grad_norm": 0.26509850977489957, - "learning_rate": 3.1962531568276843e-06, - "loss": 0.8705, - "num_tokens": 94898037.0, - "step": 426 - }, - { - "epoch": 1.3697006228651798, - "grad_norm": 0.26982337864045847, - "learning_rate": 3.176211422624672e-06, - "loss": 0.7821, - "num_tokens": 95736365.0, - "step": 427 - }, - { - "epoch": 1.3729154108900945, - "grad_norm": 0.25481825059494023, - "learning_rate": 3.1562323475909385e-06, - "loss": 0.7911, - "num_tokens": 96560443.0, - "step": 428 - }, - { - "epoch": 1.376130198915009, - "grad_norm": 0.26608889624846666, - "learning_rate": 3.136316470446853e-06, - "loss": 0.8219, - "num_tokens": 97384378.0, - "step": 429 - }, - { - "epoch": 1.3793449869399237, - "grad_norm": 0.2702314901178419, - "learning_rate": 3.116464328208708e-06, - "loss": 0.8838, - "num_tokens": 98197907.0, - "step": 430 - }, - { - "epoch": 1.3825597749648382, - "grad_norm": 0.27233781178375016, - "learning_rate": 3.096676456174225e-06, - "loss": 0.8457, - "num_tokens": 99017573.0, - "step": 431 - }, - { - "epoch": 1.3857745629897529, - "grad_norm": 0.2720355778698873, - "learning_rate": 3.0769533879081383e-06, - "loss": 0.8099, - "num_tokens": 99815790.0, - "step": 432 - }, - { - "epoch": 1.3889893510146676, - "grad_norm": 0.2709262045643404, - "learning_rate": 3.0572956552277896e-06, - "loss": 0.7968, - "num_tokens": 100629678.0, - "step": 433 - }, - { - "epoch": 1.392204139039582, - "grad_norm": 0.2778417968495386, - "learning_rate": 3.0377037881888027e-06, - "loss": 0.8568, - "num_tokens": 101442913.0, - "step": 434 - }, - { - "epoch": 1.3954189270644966, - "grad_norm": 0.2678724711104578, - "learning_rate": 3.0181783150707827e-06, - "loss": 0.7567, - "num_tokens": 102238177.0, - "step": 435 - }, - { - "epoch": 1.3986337150894113, - "grad_norm": 0.24691610325723734, - "learning_rate": 2.998719762363076e-06, - "loss": 0.7975, - "num_tokens": 103093757.0, - "step": 436 - }, - { - "epoch": 1.401848503114326, - "grad_norm": 0.24504796916539434, - "learning_rate": 2.979328654750566e-06, - "loss": 0.7806, - "num_tokens": 103922686.0, - "step": 437 - }, - { - "epoch": 1.4050632911392404, - "grad_norm": 0.27204309845099944, - "learning_rate": 2.9600055150995397e-06, - "loss": 0.8973, - "num_tokens": 104756269.0, - "step": 438 - }, - { - "epoch": 1.4082780791641551, - "grad_norm": 0.2765510124268507, - "learning_rate": 2.9407508644435724e-06, - "loss": 0.7893, - "num_tokens": 105498163.0, - "step": 439 - }, - { - "epoch": 1.4114928671890699, - "grad_norm": 0.2649491430779883, - "learning_rate": 2.921565221969492e-06, - "loss": 0.8025, - "num_tokens": 106291959.0, - "step": 440 - }, - { - "epoch": 1.4147076552139843, - "grad_norm": 0.2547139708059402, - "learning_rate": 2.9024491050033687e-06, - "loss": 0.8436, - "num_tokens": 107148793.0, - "step": 441 - }, - { - "epoch": 1.4179224432388988, - "grad_norm": 0.26401470785437514, - "learning_rate": 2.8834030289965776e-06, - "loss": 0.7897, - "num_tokens": 107936426.0, - "step": 442 - }, - { - "epoch": 1.4211372312638135, - "grad_norm": 0.2683748440212363, - "learning_rate": 2.864427507511886e-06, - "loss": 0.8697, - "num_tokens": 108802309.0, - "step": 443 - }, - { - "epoch": 1.4243520192887282, - "grad_norm": 0.2604773606257542, - "learning_rate": 2.8455230522096177e-06, - "loss": 0.8515, - "num_tokens": 109677615.0, - "step": 444 - }, - { - "epoch": 1.4275668073136427, - "grad_norm": 0.2502932437830269, - "learning_rate": 2.8266901728338526e-06, - "loss": 0.8426, - "num_tokens": 110486407.0, - "step": 445 - }, - { - "epoch": 1.4307815953385574, - "grad_norm": 0.25314189849323304, - "learning_rate": 2.807929377198674e-06, - "loss": 0.7229, - "num_tokens": 111360404.0, - "step": 446 - }, - { - "epoch": 1.433996383363472, - "grad_norm": 0.2379397383151164, - "learning_rate": 2.7892411711744925e-06, - "loss": 0.7642, - "num_tokens": 112295752.0, - "step": 447 - }, - { - "epoch": 1.4372111713883866, - "grad_norm": 0.25377452590631056, - "learning_rate": 2.770626058674387e-06, - "loss": 0.6995, - "num_tokens": 113100488.0, - "step": 448 - }, - { - "epoch": 1.440425959413301, - "grad_norm": 0.25454746865211153, - "learning_rate": 2.7520845416405285e-06, - "loss": 0.7284, - "num_tokens": 113936981.0, - "step": 449 - }, - { - "epoch": 1.4436407474382158, - "grad_norm": 0.25876333986082656, - "learning_rate": 2.7336171200306467e-06, - "loss": 0.8231, - "num_tokens": 114730281.0, - "step": 450 - }, - { - "epoch": 1.4468555354631305, - "grad_norm": 0.25167097343840633, - "learning_rate": 2.715224291804539e-06, - "loss": 0.7068, - "num_tokens": 115574422.0, - "step": 451 - }, - { - "epoch": 1.450070323488045, - "grad_norm": 0.2596150797991358, - "learning_rate": 2.696906552910657e-06, - "loss": 0.7978, - "num_tokens": 116435503.0, - "step": 452 - }, - { - "epoch": 1.4532851115129597, - "grad_norm": 0.24201584887018057, - "learning_rate": 2.6786643972727177e-06, - "loss": 0.7482, - "num_tokens": 117326599.0, - "step": 453 - }, - { - "epoch": 1.4564998995378742, - "grad_norm": 0.25878521706358387, - "learning_rate": 2.660498316776402e-06, - "loss": 0.7912, - "num_tokens": 118155674.0, - "step": 454 - }, - { - "epoch": 1.4597146875627889, - "grad_norm": 0.24812071994107007, - "learning_rate": 2.6424088012560766e-06, - "loss": 0.685, - "num_tokens": 118875393.0, - "step": 455 - }, - { - "epoch": 1.4629294755877034, - "grad_norm": 0.23254966421936232, - "learning_rate": 2.6243963384815995e-06, - "loss": 0.824, - "num_tokens": 119750240.0, - "step": 456 - }, - { - "epoch": 1.466144263612618, - "grad_norm": 0.2519370291416487, - "learning_rate": 2.606461414145154e-06, - "loss": 0.6745, - "num_tokens": 120597196.0, - "step": 457 - }, - { - "epoch": 1.4693590516375328, - "grad_norm": 0.2647912210058911, - "learning_rate": 2.58860451184816e-06, - "loss": 0.805, - "num_tokens": 121468747.0, - "step": 458 - }, - { - "epoch": 1.4725738396624473, - "grad_norm": 0.27507392498483907, - "learning_rate": 2.570826113088239e-06, - "loss": 0.8446, - "num_tokens": 122267886.0, - "step": 459 - }, - { - "epoch": 1.4757886276873617, - "grad_norm": 0.26362477049123234, - "learning_rate": 2.5531266972462176e-06, - "loss": 0.7366, - "num_tokens": 123126664.0, - "step": 460 - }, - { - "epoch": 1.4790034157122764, - "grad_norm": 0.2680464229105509, - "learning_rate": 2.5355067415732163e-06, - "loss": 0.7676, - "num_tokens": 123973743.0, - "step": 461 - }, - { - "epoch": 1.4822182037371912, - "grad_norm": 0.2643530238558276, - "learning_rate": 2.5179667211777657e-06, - "loss": 0.8623, - "num_tokens": 124781051.0, - "step": 462 - }, - { - "epoch": 1.4854329917621056, - "grad_norm": 0.24815465020380179, - "learning_rate": 2.5005071090130107e-06, - "loss": 0.8306, - "num_tokens": 125677143.0, - "step": 463 - }, - { - "epoch": 1.4886477797870203, - "grad_norm": 0.2324232229434005, - "learning_rate": 2.483128375863943e-06, - "loss": 0.6697, - "num_tokens": 126527714.0, - "step": 464 - }, - { - "epoch": 1.4918625678119348, - "grad_norm": 0.2820743394326073, - "learning_rate": 2.4658309903347196e-06, - "loss": 0.9259, - "num_tokens": 127426604.0, - "step": 465 - }, - { - "epoch": 1.4950773558368495, - "grad_norm": 0.27261471827159156, - "learning_rate": 2.4486154188360177e-06, - "loss": 0.7822, - "num_tokens": 128249974.0, - "step": 466 - }, - { - "epoch": 1.498292143861764, - "grad_norm": 0.2916059802121358, - "learning_rate": 2.431482125572467e-06, - "loss": 0.9665, - "num_tokens": 129020832.0, - "step": 467 - }, - { - "epoch": 1.5015069318866787, - "grad_norm": 0.26475467982906703, - "learning_rate": 2.4144315725301222e-06, - "loss": 0.7683, - "num_tokens": 129798805.0, - "step": 468 - }, - { - "epoch": 1.5047217199115934, - "grad_norm": 0.2691414211654332, - "learning_rate": 2.3974642194640134e-06, - "loss": 0.7628, - "num_tokens": 130612319.0, - "step": 469 - }, - { - "epoch": 1.507936507936508, - "grad_norm": 0.2546651120004282, - "learning_rate": 2.380580523885751e-06, - "loss": 0.7445, - "num_tokens": 131384964.0, - "step": 470 - }, - { - "epoch": 1.5111512959614224, - "grad_norm": 0.2432355476567615, - "learning_rate": 2.36378094105118e-06, - "loss": 0.7524, - "num_tokens": 132220717.0, - "step": 471 - }, - { - "epoch": 1.514366083986337, - "grad_norm": 0.25601280838595436, - "learning_rate": 2.3470659239481167e-06, - "loss": 0.714, - "num_tokens": 133035706.0, - "step": 472 - }, - { - "epoch": 1.5175808720112518, - "grad_norm": 0.25038445325570996, - "learning_rate": 2.3304359232841204e-06, - "loss": 0.8312, - "num_tokens": 133847183.0, - "step": 473 - }, - { - "epoch": 1.5207956600361663, - "grad_norm": 0.2781111772639142, - "learning_rate": 2.313891387474352e-06, - "loss": 0.8699, - "num_tokens": 134714334.0, - "step": 474 - }, - { - "epoch": 1.524010448061081, - "grad_norm": 0.2485311264124574, - "learning_rate": 2.29743276262948e-06, - "loss": 0.8261, - "num_tokens": 135580687.0, - "step": 475 - }, - { - "epoch": 1.5272252360859957, - "grad_norm": 0.25039628992685026, - "learning_rate": 2.281060492543644e-06, - "loss": 0.7426, - "num_tokens": 136417913.0, - "step": 476 - }, - { - "epoch": 1.5304400241109102, - "grad_norm": 0.243784045014063, - "learning_rate": 2.2647750186825e-06, - "loss": 0.7438, - "num_tokens": 137252131.0, - "step": 477 - }, - { - "epoch": 1.5336548121358247, - "grad_norm": 0.2560213448308454, - "learning_rate": 2.248576780171306e-06, - "loss": 0.7877, - "num_tokens": 138129191.0, - "step": 478 - }, - { - "epoch": 1.5368696001607394, - "grad_norm": 0.2431314320507105, - "learning_rate": 2.23246621378309e-06, - "loss": 0.7381, - "num_tokens": 138953656.0, - "step": 479 - }, - { - "epoch": 1.540084388185654, - "grad_norm": 0.24955952074382184, - "learning_rate": 2.2164437539268652e-06, - "loss": 0.7119, - "num_tokens": 139772354.0, - "step": 480 - }, - { - "epoch": 1.5432991762105686, - "grad_norm": 0.2584531368303038, - "learning_rate": 2.200509832635923e-06, - "loss": 0.8268, - "num_tokens": 140667739.0, - "step": 481 - }, - { - "epoch": 1.5465139642354833, - "grad_norm": 0.23814643088642898, - "learning_rate": 2.1846648795561777e-06, - "loss": 0.7639, - "num_tokens": 141492108.0, - "step": 482 - }, - { - "epoch": 1.549728752260398, - "grad_norm": 0.24286140193720698, - "learning_rate": 2.168909321934588e-06, - "loss": 0.7297, - "num_tokens": 142264138.0, - "step": 483 - }, - { - "epoch": 1.5529435402853125, - "grad_norm": 0.2501679856605912, - "learning_rate": 2.1532435846076277e-06, - "loss": 0.7924, - "num_tokens": 143088713.0, - "step": 484 - }, - { - "epoch": 1.556158328310227, - "grad_norm": 0.2512659042513529, - "learning_rate": 2.1376680899898415e-06, - "loss": 0.6957, - "num_tokens": 143858894.0, - "step": 485 - }, - { - "epoch": 1.5593731163351416, - "grad_norm": 0.24980910632142586, - "learning_rate": 2.1221832580624435e-06, - "loss": 0.8104, - "num_tokens": 144728257.0, - "step": 486 - }, - { - "epoch": 1.5625879043600563, - "grad_norm": 0.22308542819009722, - "learning_rate": 2.1067895063620034e-06, - "loss": 0.7067, - "num_tokens": 145513192.0, - "step": 487 - }, - { - "epoch": 1.5658026923849708, - "grad_norm": 0.2594272068599245, - "learning_rate": 2.0914872499691785e-06, - "loss": 0.7728, - "num_tokens": 146318597.0, - "step": 488 - }, - { - "epoch": 1.5690174804098853, - "grad_norm": 0.23649552319624165, - "learning_rate": 2.076276901497526e-06, - "loss": 0.6542, - "num_tokens": 147186539.0, - "step": 489 - }, - { - "epoch": 1.5722322684348002, - "grad_norm": 0.2846339774513336, - "learning_rate": 2.0611588710823797e-06, - "loss": 0.8653, - "num_tokens": 148026310.0, - "step": 490 - }, - { - "epoch": 1.5754470564597147, - "grad_norm": 0.25610814139845334, - "learning_rate": 2.0461335663697847e-06, - "loss": 0.8409, - "num_tokens": 148893677.0, - "step": 491 - }, - { - "epoch": 1.5786618444846292, - "grad_norm": 0.24262741212524863, - "learning_rate": 2.0312013925055128e-06, - "loss": 0.7497, - "num_tokens": 149712167.0, - "step": 492 - }, - { - "epoch": 1.581876632509544, - "grad_norm": 0.2433345711035905, - "learning_rate": 2.016362752124129e-06, - "loss": 0.8424, - "num_tokens": 150535790.0, - "step": 493 - }, - { - "epoch": 1.5850914205344586, - "grad_norm": 0.24482974313873548, - "learning_rate": 2.0016180453381463e-06, - "loss": 0.8348, - "num_tokens": 151323036.0, - "step": 494 - }, - { - "epoch": 1.588306208559373, - "grad_norm": 0.2456621341387075, - "learning_rate": 1.986967669727224e-06, - "loss": 0.748, - "num_tokens": 152144736.0, - "step": 495 - }, - { - "epoch": 1.5915209965842876, - "grad_norm": 0.24569608941122856, - "learning_rate": 1.9724120203274595e-06, - "loss": 0.7493, - "num_tokens": 152935889.0, - "step": 496 - }, - { - "epoch": 1.5947357846092023, - "grad_norm": 0.26166602781955245, - "learning_rate": 1.9579514896207284e-06, - "loss": 0.7749, - "num_tokens": 153768543.0, - "step": 497 - }, - { - "epoch": 1.597950572634117, - "grad_norm": 0.2696877412236291, - "learning_rate": 1.943586467524102e-06, - "loss": 0.8786, - "num_tokens": 154653874.0, - "step": 498 - }, - { - "epoch": 1.6011653606590315, - "grad_norm": 0.24259561016088813, - "learning_rate": 1.9293173413793408e-06, - "loss": 0.7054, - "num_tokens": 155503587.0, - "step": 499 - }, - { - "epoch": 1.6043801486839462, - "grad_norm": 0.23770858403564227, - "learning_rate": 1.9151444959424383e-06, - "loss": 0.7879, - "num_tokens": 156333121.0, - "step": 500 - }, - { - "epoch": 1.6075949367088609, - "grad_norm": 0.23325112314910393, - "learning_rate": 1.9010683133732593e-06, - "loss": 0.7238, - "num_tokens": 157193419.0, - "step": 501 - }, - { - "epoch": 1.6108097247337754, - "grad_norm": 0.2499729759276442, - "learning_rate": 1.8870891732252228e-06, - "loss": 0.7663, - "num_tokens": 158107661.0, - "step": 502 - }, - { - "epoch": 1.6140245127586899, - "grad_norm": 0.2903409999289307, - "learning_rate": 1.8732074524350798e-06, - "loss": 0.8387, - "num_tokens": 158916357.0, - "step": 503 - }, - { - "epoch": 1.6172393007836046, - "grad_norm": 0.2384683337038407, - "learning_rate": 1.8594235253127373e-06, - "loss": 0.7074, - "num_tokens": 159750104.0, - "step": 504 - }, - { - "epoch": 1.6204540888085193, - "grad_norm": 0.2436839893133206, - "learning_rate": 1.8457377635311763e-06, - "loss": 0.81, - "num_tokens": 160623884.0, - "step": 505 - }, - { - "epoch": 1.6236688768334337, - "grad_norm": 0.24572502160484339, - "learning_rate": 1.832150536116421e-06, - "loss": 0.668, - "num_tokens": 161451622.0, - "step": 506 - }, - { - "epoch": 1.6268836648583485, - "grad_norm": 0.25824900709932386, - "learning_rate": 1.8186622094375955e-06, - "loss": 0.8017, - "num_tokens": 162294153.0, - "step": 507 - }, - { - "epoch": 1.6300984528832632, - "grad_norm": 0.2607051667213561, - "learning_rate": 1.8052731471970398e-06, - "loss": 0.7992, - "num_tokens": 163126511.0, - "step": 508 - }, - { - "epoch": 1.6333132409081776, - "grad_norm": 0.2457788665067117, - "learning_rate": 1.7919837104205056e-06, - "loss": 0.8138, - "num_tokens": 163919233.0, - "step": 509 - }, - { - "epoch": 1.6365280289330921, - "grad_norm": 0.28345935653263943, - "learning_rate": 1.7787942574474215e-06, - "loss": 0.919, - "num_tokens": 164739051.0, - "step": 510 - }, - { - "epoch": 1.6397428169580068, - "grad_norm": 0.2340875650229455, - "learning_rate": 1.7657051439212265e-06, - "loss": 0.6995, - "num_tokens": 165535828.0, - "step": 511 - }, - { - "epoch": 1.6429576049829215, - "grad_norm": 0.2422390254042746, - "learning_rate": 1.7527167227797881e-06, - "loss": 0.7921, - "num_tokens": 166387479.0, - "step": 512 - }, - { - "epoch": 1.646172393007836, - "grad_norm": 0.26618051546177984, - "learning_rate": 1.739829344245878e-06, - "loss": 0.8081, - "num_tokens": 167195962.0, - "step": 513 - }, - { - "epoch": 1.6493871810327505, - "grad_norm": 0.2371858854989563, - "learning_rate": 1.7270433558177334e-06, - "loss": 0.7764, - "num_tokens": 168030941.0, - "step": 514 - }, - { - "epoch": 1.6526019690576652, - "grad_norm": 0.25632355048937316, - "learning_rate": 1.7143591022596846e-06, - "loss": 0.7869, - "num_tokens": 168881073.0, - "step": 515 - }, - { - "epoch": 1.65581675708258, - "grad_norm": 0.25495604284857626, - "learning_rate": 1.7017769255928602e-06, - "loss": 0.8732, - "num_tokens": 169661744.0, - "step": 516 - }, - { - "epoch": 1.6590315451074944, - "grad_norm": 0.24334649844883566, - "learning_rate": 1.6892971650859635e-06, - "loss": 0.7352, - "num_tokens": 170493380.0, - "step": 517 - }, - { - "epoch": 1.662246333132409, - "grad_norm": 0.28195748354750644, - "learning_rate": 1.6769201572461242e-06, - "loss": 0.8909, - "num_tokens": 171220301.0, - "step": 518 - }, - { - "epoch": 1.6654611211573238, - "grad_norm": 0.27470825914141944, - "learning_rate": 1.6646462358098275e-06, - "loss": 0.8595, - "num_tokens": 172050191.0, - "step": 519 - }, - { - "epoch": 1.6686759091822383, - "grad_norm": 0.23364869043368322, - "learning_rate": 1.6524757317339102e-06, - "loss": 0.7429, - "num_tokens": 172900564.0, - "step": 520 - }, - { - "epoch": 1.6718906972071528, - "grad_norm": 0.24939702130984195, - "learning_rate": 1.6404089731866435e-06, - "loss": 0.7664, - "num_tokens": 173783873.0, - "step": 521 - }, - { - "epoch": 1.6751054852320675, - "grad_norm": 0.2468256468691575, - "learning_rate": 1.6284462855388769e-06, - "loss": 0.7848, - "num_tokens": 174619064.0, - "step": 522 - }, - { - "epoch": 1.6783202732569822, - "grad_norm": 0.2489884682065826, - "learning_rate": 1.6165879913552705e-06, - "loss": 0.7318, - "num_tokens": 175412868.0, - "step": 523 - }, - { - "epoch": 1.6815350612818967, - "grad_norm": 0.24471261266913003, - "learning_rate": 1.6048344103855927e-06, - "loss": 0.7574, - "num_tokens": 176276633.0, - "step": 524 - }, - { - "epoch": 1.6847498493068114, - "grad_norm": 0.25351042609637503, - "learning_rate": 1.593185859556103e-06, - "loss": 0.7928, - "num_tokens": 177151925.0, - "step": 525 - }, - { - "epoch": 1.687964637331726, - "grad_norm": 0.2566146976154411, - "learning_rate": 1.5816426529610035e-06, - "loss": 0.8449, - "num_tokens": 177955624.0, - "step": 526 - }, - { - "epoch": 1.6911794253566406, - "grad_norm": 0.23538792436540756, - "learning_rate": 1.5702051018539684e-06, - "loss": 0.7615, - "num_tokens": 178855199.0, - "step": 527 - }, - { - "epoch": 1.694394213381555, - "grad_norm": 0.23491490633366938, - "learning_rate": 1.558873514639756e-06, - "loss": 0.7572, - "num_tokens": 179670207.0, - "step": 528 - }, - { - "epoch": 1.6976090014064698, - "grad_norm": 0.23501024625345632, - "learning_rate": 1.5476481968658874e-06, - "loss": 0.8104, - "num_tokens": 180484701.0, - "step": 529 - }, - { - "epoch": 1.7008237894313845, - "grad_norm": 0.22863229610491123, - "learning_rate": 1.5365294512144114e-06, - "loss": 0.7156, - "num_tokens": 181352190.0, - "step": 530 - }, - { - "epoch": 1.704038577456299, - "grad_norm": 0.22522585509472975, - "learning_rate": 1.5255175774937406e-06, - "loss": 0.775, - "num_tokens": 182330861.0, - "step": 531 - }, - { - "epoch": 1.7072533654812134, - "grad_norm": 0.22627930960209444, - "learning_rate": 1.51461287263057e-06, - "loss": 0.7562, - "num_tokens": 183212938.0, - "step": 532 - }, - { - "epoch": 1.7104681535061284, - "grad_norm": 0.2377422877172429, - "learning_rate": 1.503815630661866e-06, - "loss": 0.708, - "num_tokens": 184032551.0, - "step": 533 - }, - { - "epoch": 1.7136829415310428, - "grad_norm": 0.24691075863376308, - "learning_rate": 1.493126142726945e-06, - "loss": 0.86, - "num_tokens": 184891245.0, - "step": 534 - }, - { - "epoch": 1.7168977295559573, - "grad_norm": 0.25442519284305526, - "learning_rate": 1.4825446970596136e-06, - "loss": 0.7899, - "num_tokens": 185765708.0, - "step": 535 - }, - { - "epoch": 1.720112517580872, - "grad_norm": 0.25917934944331666, - "learning_rate": 1.4720715789804077e-06, - "loss": 0.7783, - "num_tokens": 186613876.0, - "step": 536 - }, - { - "epoch": 1.7233273056057867, - "grad_norm": 0.25196089901644464, - "learning_rate": 1.4617070708888882e-06, - "loss": 0.8441, - "num_tokens": 187423862.0, - "step": 537 - }, - { - "epoch": 1.7265420936307012, - "grad_norm": 0.2512883060345931, - "learning_rate": 1.4514514522560336e-06, - "loss": 0.79, - "num_tokens": 188280375.0, - "step": 538 - }, - { - "epoch": 1.7297568816556157, - "grad_norm": 0.24987887637768286, - "learning_rate": 1.4413049996167017e-06, - "loss": 0.8442, - "num_tokens": 189144121.0, - "step": 539 - }, - { - "epoch": 1.7329716696805304, - "grad_norm": 0.23853952082735316, - "learning_rate": 1.4312679865621742e-06, - "loss": 0.7863, - "num_tokens": 190022157.0, - "step": 540 - }, - { - "epoch": 1.736186457705445, - "grad_norm": 0.24004351197968307, - "learning_rate": 1.4213406837327777e-06, - "loss": 0.8271, - "num_tokens": 190896241.0, - "step": 541 - }, - { - "epoch": 1.7394012457303596, - "grad_norm": 0.23354559253718268, - "learning_rate": 1.4115233588105876e-06, - "loss": 0.7371, - "num_tokens": 191760127.0, - "step": 542 - }, - { - "epoch": 1.7426160337552743, - "grad_norm": 0.2403295880993666, - "learning_rate": 1.4018162765122076e-06, - "loss": 0.7675, - "num_tokens": 192567114.0, - "step": 543 - }, - { - "epoch": 1.745830821780189, - "grad_norm": 0.2530600725115969, - "learning_rate": 1.3922196985816381e-06, - "loss": 0.7938, - "num_tokens": 193329370.0, - "step": 544 - }, - { - "epoch": 1.7490456098051035, - "grad_norm": 0.24939871292349367, - "learning_rate": 1.382733883783211e-06, - "loss": 0.8283, - "num_tokens": 194251343.0, - "step": 545 - }, - { - "epoch": 1.752260397830018, - "grad_norm": 0.23924721564709042, - "learning_rate": 1.373359087894617e-06, - "loss": 0.8244, - "num_tokens": 195043302.0, - "step": 546 - }, - { - "epoch": 1.7554751858549327, - "grad_norm": 0.22711116182469693, - "learning_rate": 1.3640955637000061e-06, - "loss": 0.5745, - "num_tokens": 195837695.0, - "step": 547 - }, - { - "epoch": 1.7586899738798474, - "grad_norm": 0.25948939092932555, - "learning_rate": 1.354943560983175e-06, - "loss": 0.7893, - "num_tokens": 196642158.0, - "step": 548 - }, - { - "epoch": 1.7619047619047619, - "grad_norm": 0.25344477070087834, - "learning_rate": 1.345903326520827e-06, - "loss": 0.7948, - "num_tokens": 197393763.0, - "step": 549 - }, - { - "epoch": 1.7651195499296763, - "grad_norm": 0.24360768653209378, - "learning_rate": 1.3369751040759236e-06, - "loss": 0.7838, - "num_tokens": 198205381.0, - "step": 550 - }, - { - "epoch": 1.7683343379545913, - "grad_norm": 0.25595992970482323, - "learning_rate": 1.3281591343911046e-06, - "loss": 0.8622, - "num_tokens": 199041320.0, - "step": 551 - }, - { - "epoch": 1.7715491259795058, - "grad_norm": 0.2516640546487355, - "learning_rate": 1.3194556551822053e-06, - "loss": 0.7459, - "num_tokens": 199844172.0, - "step": 552 - }, - { - "epoch": 1.7747639140044202, - "grad_norm": 0.26565624817624645, - "learning_rate": 1.3108649011318371e-06, - "loss": 0.806, - "num_tokens": 200675342.0, - "step": 553 - }, - { - "epoch": 1.777978702029335, - "grad_norm": 0.26056007547368304, - "learning_rate": 1.302387103883068e-06, - "loss": 0.8373, - "num_tokens": 201492910.0, - "step": 554 - }, - { - "epoch": 1.7811934900542497, - "grad_norm": 0.2193075675884426, - "learning_rate": 1.2940224920331707e-06, - "loss": 0.6554, - "num_tokens": 202371415.0, - "step": 555 - }, - { - "epoch": 1.7844082780791641, - "grad_norm": 0.24476127070381667, - "learning_rate": 1.2857712911274628e-06, - "loss": 0.8789, - "num_tokens": 203234229.0, - "step": 556 - }, - { - "epoch": 1.7876230661040786, - "grad_norm": 0.24744383482240603, - "learning_rate": 1.2776337236532213e-06, - "loss": 0.8397, - "num_tokens": 204091625.0, - "step": 557 - }, - { - "epoch": 1.7908378541289933, - "grad_norm": 0.24190730441647348, - "learning_rate": 1.2696100090336847e-06, - "loss": 0.7299, - "num_tokens": 204826007.0, - "step": 558 - }, - { - "epoch": 1.794052642153908, - "grad_norm": 0.24746376469158593, - "learning_rate": 1.2617003636221394e-06, - "loss": 0.8083, - "num_tokens": 205666409.0, - "step": 559 - }, - { - "epoch": 1.7972674301788225, - "grad_norm": 0.26176272109587745, - "learning_rate": 1.2539050006960814e-06, - "loss": 0.8342, - "num_tokens": 206462474.0, - "step": 560 - }, - { - "epoch": 1.8004822182037372, - "grad_norm": 0.2791374382580208, - "learning_rate": 1.2462241304514693e-06, - "loss": 0.9057, - "num_tokens": 207288020.0, - "step": 561 - }, - { - "epoch": 1.803697006228652, - "grad_norm": 0.2612041235973392, - "learning_rate": 1.2386579599970524e-06, - "loss": 0.764, - "num_tokens": 208093649.0, - "step": 562 - }, - { - "epoch": 1.8069117942535664, - "grad_norm": 0.2509749476948631, - "learning_rate": 1.2312066933487906e-06, - "loss": 0.8823, - "num_tokens": 208918969.0, - "step": 563 - }, - { - "epoch": 1.810126582278481, - "grad_norm": 0.2639759111678885, - "learning_rate": 1.22387053142435e-06, - "loss": 0.7374, - "num_tokens": 209693371.0, - "step": 564 - }, - { - "epoch": 1.8133413703033956, - "grad_norm": 0.24891741992023017, - "learning_rate": 1.2166496720376874e-06, - "loss": 0.8994, - "num_tokens": 210552556.0, - "step": 565 - }, - { - "epoch": 1.8165561583283103, - "grad_norm": 0.22683549151420104, - "learning_rate": 1.2095443098937143e-06, - "loss": 0.7893, - "num_tokens": 211475509.0, - "step": 566 - }, - { - "epoch": 1.8197709463532248, - "grad_norm": 0.21465240595326388, - "learning_rate": 1.2025546365830483e-06, - "loss": 0.6999, - "num_tokens": 212382390.0, - "step": 567 - }, - { - "epoch": 1.8229857343781395, - "grad_norm": 0.22512728472555754, - "learning_rate": 1.1956808405768472e-06, - "loss": 0.7317, - "num_tokens": 213188571.0, - "step": 568 - }, - { - "epoch": 1.8262005224030542, - "grad_norm": 0.25247522555353324, - "learning_rate": 1.1889231072217266e-06, - "loss": 0.7595, - "num_tokens": 213951357.0, - "step": 569 - }, - { - "epoch": 1.8294153104279687, - "grad_norm": 0.2444488058654175, - "learning_rate": 1.1822816187347625e-06, - "loss": 0.6918, - "num_tokens": 214686689.0, - "step": 570 - }, - { - "epoch": 1.8326300984528832, - "grad_norm": 0.24901080106942608, - "learning_rate": 1.1757565541985754e-06, - "loss": 0.8472, - "num_tokens": 215531950.0, - "step": 571 - }, - { - "epoch": 1.8358448864777979, - "grad_norm": 0.22992063557767975, - "learning_rate": 1.1693480895565062e-06, - "loss": 0.7387, - "num_tokens": 216422632.0, - "step": 572 - }, - { - "epoch": 1.8390596745027126, - "grad_norm": 0.24102476056232364, - "learning_rate": 1.163056397607867e-06, - "loss": 0.827, - "num_tokens": 217297785.0, - "step": 573 - }, - { - "epoch": 1.842274462527627, - "grad_norm": 0.2334995596964159, - "learning_rate": 1.1568816480032876e-06, - "loss": 0.6935, - "num_tokens": 218118571.0, - "step": 574 - }, - { - "epoch": 1.8454892505525415, - "grad_norm": 0.23168654270538455, - "learning_rate": 1.1508240072401336e-06, - "loss": 0.6726, - "num_tokens": 218895980.0, - "step": 575 - }, - { - "epoch": 1.8487040385774565, - "grad_norm": 0.23849726753023398, - "learning_rate": 1.1448836386580239e-06, - "loss": 0.849, - "num_tokens": 219768969.0, - "step": 576 - }, - { - "epoch": 1.851918826602371, - "grad_norm": 0.23973496720376417, - "learning_rate": 1.1390607024344224e-06, - "loss": 0.829, - "num_tokens": 220576609.0, - "step": 577 - }, - { - "epoch": 1.8551336146272854, - "grad_norm": 0.24311858709639553, - "learning_rate": 1.1333553555803188e-06, - "loss": 0.835, - "num_tokens": 221400443.0, - "step": 578 - }, - { - "epoch": 1.8583484026522001, - "grad_norm": 0.23184083017439597, - "learning_rate": 1.127767751935998e-06, - "loss": 0.7064, - "num_tokens": 222141188.0, - "step": 579 - }, - { - "epoch": 1.8615631906771148, - "grad_norm": 0.24110607700241163, - "learning_rate": 1.1222980421668874e-06, - "loss": 0.7592, - "num_tokens": 222974799.0, - "step": 580 - }, - { - "epoch": 1.8647779787020293, - "grad_norm": 0.25518420435896805, - "learning_rate": 1.1169463737594995e-06, - "loss": 0.7689, - "num_tokens": 223746143.0, - "step": 581 - }, - { - "epoch": 1.8679927667269438, - "grad_norm": 0.2576552077825468, - "learning_rate": 1.1117128910174505e-06, - "loss": 0.8438, - "num_tokens": 224555219.0, - "step": 582 - }, - { - "epoch": 1.8712075547518585, - "grad_norm": 0.2191561890293986, - "learning_rate": 1.106597735057573e-06, - "loss": 0.6908, - "num_tokens": 225411414.0, - "step": 583 - }, - { - "epoch": 1.8744223427767732, - "grad_norm": 0.2572906172176427, - "learning_rate": 1.1016010438061063e-06, - "loss": 0.7564, - "num_tokens": 226151777.0, - "step": 584 - }, - { - "epoch": 1.8776371308016877, - "grad_norm": 0.2310182251613661, - "learning_rate": 1.0967229519949833e-06, - "loss": 0.8158, - "num_tokens": 227029894.0, - "step": 585 - }, - { - "epoch": 1.8808519188266024, - "grad_norm": 0.2630064861411062, - "learning_rate": 1.091963591158192e-06, - "loss": 0.8907, - "num_tokens": 227803854.0, - "step": 586 - }, - { - "epoch": 1.8840667068515171, - "grad_norm": 0.22235886880583858, - "learning_rate": 1.0873230896282314e-06, - "loss": 0.6793, - "num_tokens": 228677483.0, - "step": 587 - }, - { - "epoch": 1.8872814948764316, - "grad_norm": 0.2634912036918554, - "learning_rate": 1.082801572532652e-06, - "loss": 0.7726, - "num_tokens": 229498024.0, - "step": 588 - }, - { - "epoch": 1.890496282901346, - "grad_norm": 0.257794453465977, - "learning_rate": 1.0783991617906798e-06, - "loss": 0.8223, - "num_tokens": 230276429.0, - "step": 589 - }, - { - "epoch": 1.8937110709262608, - "grad_norm": 0.23100780709249996, - "learning_rate": 1.0741159761099294e-06, - "loss": 0.7438, - "num_tokens": 231127189.0, - "step": 590 - }, - { - "epoch": 1.8969258589511755, - "grad_norm": 0.24293781210631205, - "learning_rate": 1.0699521309832042e-06, - "loss": 0.7482, - "num_tokens": 231932725.0, - "step": 591 - }, - { - "epoch": 1.90014064697609, - "grad_norm": 0.222633670642957, - "learning_rate": 1.0659077386853817e-06, - "loss": 0.7483, - "num_tokens": 232798962.0, - "step": 592 - }, - { - "epoch": 1.9033554350010045, - "grad_norm": 0.2369979453803826, - "learning_rate": 1.0619829082703846e-06, - "loss": 0.7682, - "num_tokens": 233641760.0, - "step": 593 - }, - { - "epoch": 1.9065702230259194, - "grad_norm": 0.2348139409515845, - "learning_rate": 1.0581777455682428e-06, - "loss": 0.8232, - "num_tokens": 234523716.0, - "step": 594 - }, - { - "epoch": 1.9097850110508339, - "grad_norm": 0.21543079662902476, - "learning_rate": 1.054492353182237e-06, - "loss": 0.6764, - "num_tokens": 235441766.0, - "step": 595 - }, - { - "epoch": 1.9129997990757484, - "grad_norm": 0.28135712263939683, - "learning_rate": 1.0509268304861358e-06, - "loss": 0.9252, - "num_tokens": 236215520.0, - "step": 596 - }, - { - "epoch": 1.916214587100663, - "grad_norm": 0.252768897893859, - "learning_rate": 1.0474812736215122e-06, - "loss": 0.8575, - "num_tokens": 237091607.0, - "step": 597 - }, - { - "epoch": 1.9194293751255778, - "grad_norm": 0.235775188354417, - "learning_rate": 1.0441557754951527e-06, - "loss": 0.7563, - "num_tokens": 237891695.0, - "step": 598 - }, - { - "epoch": 1.9226441631504922, - "grad_norm": 0.2521745823178246, - "learning_rate": 1.0409504257765536e-06, - "loss": 0.8244, - "num_tokens": 238710354.0, - "step": 599 - }, - { - "epoch": 1.9258589511754067, - "grad_norm": 0.24875563980994658, - "learning_rate": 1.0378653108955017e-06, - "loss": 0.7996, - "num_tokens": 239538696.0, - "step": 600 - }, - { - "epoch": 1.9290737392003214, - "grad_norm": 0.2649578867494801, - "learning_rate": 1.0349005140397438e-06, - "loss": 0.7845, - "num_tokens": 240277767.0, - "step": 601 - }, - { - "epoch": 1.9322885272252361, - "grad_norm": 0.23865127900655456, - "learning_rate": 1.0320561151527425e-06, - "loss": 0.7832, - "num_tokens": 241108706.0, - "step": 602 - }, - { - "epoch": 1.9355033152501506, - "grad_norm": 0.24189323771249807, - "learning_rate": 1.0293321909315242e-06, - "loss": 0.6937, - "num_tokens": 241891368.0, - "step": 603 - }, - { - "epoch": 1.9387181032750653, - "grad_norm": 0.23018796376232412, - "learning_rate": 1.0267288148246075e-06, - "loss": 0.8141, - "num_tokens": 242731976.0, - "step": 604 - }, - { - "epoch": 1.94193289129998, - "grad_norm": 0.24387982865367927, - "learning_rate": 1.0242460570300241e-06, - "loss": 0.7372, - "num_tokens": 243562131.0, - "step": 605 - }, - { - "epoch": 1.9451476793248945, - "grad_norm": 0.26888274887148733, - "learning_rate": 1.021883984493426e-06, - "loss": 0.9365, - "num_tokens": 244329865.0, - "step": 606 - }, - { - "epoch": 1.948362467349809, - "grad_norm": 0.24490289163791748, - "learning_rate": 1.0196426609062788e-06, - "loss": 0.7716, - "num_tokens": 245164637.0, - "step": 607 - }, - { - "epoch": 1.9515772553747237, - "grad_norm": 0.2608625536338709, - "learning_rate": 1.0175221467041479e-06, - "loss": 0.8335, - "num_tokens": 245933494.0, - "step": 608 - }, - { - "epoch": 1.9547920433996384, - "grad_norm": 0.2659841954395964, - "learning_rate": 1.015522499065066e-06, - "loss": 0.7835, - "num_tokens": 246688927.0, - "step": 609 - }, - { - "epoch": 1.958006831424553, - "grad_norm": 0.22433644772055011, - "learning_rate": 1.01364377190799e-06, - "loss": 0.7819, - "num_tokens": 247552395.0, - "step": 610 - }, - { - "epoch": 1.9612216194494676, - "grad_norm": 0.236779865903178, - "learning_rate": 1.0118860158913527e-06, - "loss": 0.7382, - "num_tokens": 248410260.0, - "step": 611 - }, - { - "epoch": 1.9644364074743823, - "grad_norm": 0.2426766158302053, - "learning_rate": 1.0102492784116897e-06, - "loss": 0.8233, - "num_tokens": 249239516.0, - "step": 612 - }, - { - "epoch": 1.9676511954992968, - "grad_norm": 0.22655059778311104, - "learning_rate": 1.0087336036023673e-06, - "loss": 0.671, - "num_tokens": 250040321.0, - "step": 613 - }, - { - "epoch": 1.9708659835242113, - "grad_norm": 0.24100866478105012, - "learning_rate": 1.0073390323323897e-06, - "loss": 0.7513, - "num_tokens": 250836977.0, - "step": 614 - }, - { - "epoch": 1.974080771549126, - "grad_norm": 0.22926333112269273, - "learning_rate": 1.0060656022052966e-06, - "loss": 0.749, - "num_tokens": 251711943.0, - "step": 615 - }, - { - "epoch": 1.9772955595740407, - "grad_norm": 0.23260077066414575, - "learning_rate": 1.0049133475581504e-06, - "loss": 0.7723, - "num_tokens": 252585708.0, - "step": 616 - }, - { - "epoch": 1.9805103475989552, - "grad_norm": 0.2605281342579115, - "learning_rate": 1.0038822994606109e-06, - "loss": 0.8502, - "num_tokens": 253417041.0, - "step": 617 - }, - { - "epoch": 1.9837251356238697, - "grad_norm": 0.2412497429281566, - "learning_rate": 1.0029724857140962e-06, - "loss": 0.6234, - "num_tokens": 254171628.0, - "step": 618 - }, - { - "epoch": 1.9869399236487844, - "grad_norm": 0.25746351281176827, - "learning_rate": 1.002183930851032e-06, - "loss": 0.7873, - "num_tokens": 254956082.0, - "step": 619 - }, - { - "epoch": 1.990154711673699, - "grad_norm": 0.2458754330245484, - "learning_rate": 1.0015166561341943e-06, - "loss": 0.7761, - "num_tokens": 255850029.0, - "step": 620 - }, - { - "epoch": 1.9933694996986135, - "grad_norm": 0.24289535830110504, - "learning_rate": 1.0009706795561308e-06, - "loss": 0.8013, - "num_tokens": 256733081.0, - "step": 621 - }, - { - "epoch": 1.9965842877235283, - "grad_norm": 0.25614737146528116, - "learning_rate": 1.0005460158386799e-06, - "loss": 0.8623, - "num_tokens": 257547044.0, - "step": 622 - }, - { - "epoch": 1.999799075748443, - "grad_norm": 0.24015035103986382, - "learning_rate": 1.0002426764325719e-06, - "loss": 0.8037, - "num_tokens": 258413945.0, - "step": 623 - }, - { - "epoch": 2.0, - "grad_norm": 0.24015035103986382, - "learning_rate": 1.0000606695171197e-06, - "loss": 0.5419, - "num_tokens": 258470904.0, - "step": 624 - }, - { - "epoch": 2.0, - "eval_loss": 0.5947001576423645, - "eval_num_tokens": 258470904.0, - "eval_runtime": 198.9138, - "eval_samples_per_second": 44.482, - "eval_steps_per_second": 5.56, - "step": 624 - }, - { - "epoch": 2.0, - "step": 624, - "total_flos": 2.2203599102840668e+18, - "train_loss": 0.3945140190995656, - "train_runtime": 7938.1861, - "train_samples_per_second": 20.062, - "train_steps_per_second": 0.079 + "epoch": 3.0, + "step": 324, + "total_flos": 3.064163325664297e+17, + "train_loss": 0.4129233885510468, + "train_runtime": 3035.4475, + "train_samples_per_second": 27.135, + "train_steps_per_second": 0.107 } ], "logging_steps": 1, - "max_steps": 624, + "max_steps": 324, "num_input_tokens_seen": 0, - "num_train_epochs": 2, + "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { @@ -5046,8 +2682,8 @@ "attributes": {} } }, - "total_flos": 2.2203599102840668e+18, - "train_batch_size": 2, + "total_flos": 3.064163325664297e+17, + "train_batch_size": 4, "trial_name": null, "trial_params": null }