{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 6.553351909523029, "eval_steps": 500, "global_step": 5000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.06556302245533518, "grad_norm": 0.2032165825366974, "learning_rate": 9.999745598795031e-08, "loss": 0.4359, "num_input_tokens_seen": 3794784, "step": 50, "train_runtime": 288.0643, "train_tokens_per_second": 13173.393 }, { "epoch": 0.13112604491067037, "grad_norm": 0.12323546409606934, "learning_rate": 9.998961548920028e-08, "loss": 0.5461, "num_input_tokens_seen": 7656384, "step": 100, "train_runtime": 563.6125, "train_tokens_per_second": 13584.481 }, { "epoch": 0.19668906736600558, "grad_norm": 0.6143731474876404, "learning_rate": 9.997647827492774e-08, "loss": 0.4957, "num_input_tokens_seen": 11324688, "step": 150, "train_runtime": 840.9128, "train_tokens_per_second": 13467.137 }, { "epoch": 0.26225208982134074, "grad_norm": 5.337645053863525, "learning_rate": 9.995804573710351e-08, "loss": 0.4317, "num_input_tokens_seen": 14915760, "step": 200, "train_runtime": 1098.5331, "train_tokens_per_second": 13577.889 }, { "epoch": 0.327815112276676, "grad_norm": 9.094950675964355, "learning_rate": 9.993431982877141e-08, "loss": 0.3758, "num_input_tokens_seen": 18760920, "step": 250, "train_runtime": 1370.436, "train_tokens_per_second": 13689.745 }, { "epoch": 0.39337813473201116, "grad_norm": 8.739721298217773, "learning_rate": 9.990530306384132e-08, "loss": 0.4875, "num_input_tokens_seen": 22666272, "step": 300, "train_runtime": 1675.777, "train_tokens_per_second": 13525.828 }, { "epoch": 0.45894115718734635, "grad_norm": 5.213091850280762, "learning_rate": 9.987099851682273e-08, "loss": 0.5377, "num_input_tokens_seen": 26550816, "step": 350, "train_runtime": 1969.5671, "train_tokens_per_second": 13480.534 }, { "epoch": 0.5245041796426815, "grad_norm": 6.928552627563477, "learning_rate": 9.983140982249912e-08, "loss": 0.5284, "num_input_tokens_seen": 30502512, "step": 400, "train_runtime": 2276.9451, "train_tokens_per_second": 13396.244 }, { "epoch": 0.5900672020980167, "grad_norm": 7.398318290710449, "learning_rate": 9.978654117554268e-08, "loss": 0.3501, "num_input_tokens_seen": 34219392, "step": 450, "train_runtime": 2557.2601, "train_tokens_per_second": 13381.272 }, { "epoch": 0.655630224553352, "grad_norm": 0.12503379583358765, "learning_rate": 9.973639733006998e-08, "loss": 0.4336, "num_input_tokens_seen": 38231808, "step": 500, "train_runtime": 2907.4779, "train_tokens_per_second": 13149.475 }, { "epoch": 0.7211932470086871, "grad_norm": 1.006555199623108, "learning_rate": 9.968098359913822e-08, "loss": 0.382, "num_input_tokens_seen": 42037704, "step": 550, "train_runtime": 3185.8277, "train_tokens_per_second": 13195.222 }, { "epoch": 0.7867562694640223, "grad_norm": 7.536371231079102, "learning_rate": 9.962030585418215e-08, "loss": 0.3866, "num_input_tokens_seen": 46037664, "step": 600, "train_runtime": 3488.8435, "train_tokens_per_second": 13195.681 }, { "epoch": 0.8523192919193575, "grad_norm": 0.24487841129302979, "learning_rate": 9.955437052439219e-08, "loss": 0.4026, "num_input_tokens_seen": 49944816, "step": 650, "train_runtime": 3776.75, "train_tokens_per_second": 13224.284 }, { "epoch": 0.9178823143746927, "grad_norm": 1.2577345371246338, "learning_rate": 9.948318459603297e-08, "loss": 0.3547, "num_input_tokens_seen": 53838960, "step": 700, "train_runtime": 4095.3801, "train_tokens_per_second": 13146.267 }, { "epoch": 0.9834453368300279, "grad_norm": 0.23559170961380005, "learning_rate": 9.940675561170326e-08, "loss": 0.3269, "num_input_tokens_seen": 57703848, "step": 750, "train_runtime": 4401.0597, "train_tokens_per_second": 13111.353 }, { "epoch": 1.048516636616948, "grad_norm": 2.2465434074401855, "learning_rate": 9.932509166953673e-08, "loss": 0.38, "num_input_tokens_seen": 61456680, "step": 800, "train_runtime": 4678.4603, "train_tokens_per_second": 13136.091 }, { "epoch": 1.1140796590722832, "grad_norm": 0.8857269287109375, "learning_rate": 9.923820142234384e-08, "loss": 0.3671, "num_input_tokens_seen": 65352192, "step": 850, "train_runtime": 4987.8785, "train_tokens_per_second": 13102.202 }, { "epoch": 1.1796426815276184, "grad_norm": 2.611070394515991, "learning_rate": 9.914609407669518e-08, "loss": 0.2795, "num_input_tokens_seen": 69406008, "step": 900, "train_runtime": 5331.3796, "train_tokens_per_second": 13018.395 }, { "epoch": 1.2452057039829536, "grad_norm": 0.18760572373867035, "learning_rate": 9.904877939194582e-08, "loss": 0.3224, "num_input_tokens_seen": 73152336, "step": 950, "train_runtime": 5603.6792, "train_tokens_per_second": 13054.341 }, { "epoch": 1.3107687264382888, "grad_norm": 7.031470775604248, "learning_rate": 9.894626767920125e-08, "loss": 0.2581, "num_input_tokens_seen": 76955160, "step": 1000, "train_runtime": 5891.4617, "train_tokens_per_second": 13062.151 }, { "epoch": 1.376331748893624, "grad_norm": 3.1105947494506836, "learning_rate": 9.883856980022501e-08, "loss": 0.2146, "num_input_tokens_seen": 80682888, "step": 1050, "train_runtime": 6172.6315, "train_tokens_per_second": 13071.068 }, { "epoch": 1.4418947713489592, "grad_norm": 3.3154454231262207, "learning_rate": 9.872569716628762e-08, "loss": 0.1974, "num_input_tokens_seen": 84505128, "step": 1100, "train_runtime": 6464.0066, "train_tokens_per_second": 13073.181 }, { "epoch": 1.5074577938042943, "grad_norm": 2.295762062072754, "learning_rate": 9.860766173695762e-08, "loss": 0.331, "num_input_tokens_seen": 88457640, "step": 1150, "train_runtime": 6787.6545, "train_tokens_per_second": 13032.137 }, { "epoch": 1.5730208162596295, "grad_norm": 3.430027484893799, "learning_rate": 9.848447601883434e-08, "loss": 0.2295, "num_input_tokens_seen": 92425752, "step": 1200, "train_runtime": 7110.9534, "train_tokens_per_second": 12997.66 }, { "epoch": 1.6385838387149647, "grad_norm": 5.2876200675964355, "learning_rate": 9.83561530642227e-08, "loss": 0.3534, "num_input_tokens_seen": 96447384, "step": 1250, "train_runtime": 7430.2464, "train_tokens_per_second": 12980.375 }, { "epoch": 1.7041468611703, "grad_norm": 2.3764872550964355, "learning_rate": 9.822270646975031e-08, "loss": 0.2875, "num_input_tokens_seen": 100202232, "step": 1300, "train_runtime": 7704.6648, "train_tokens_per_second": 13005.398 }, { "epoch": 1.769709883625635, "grad_norm": 0.5971184968948364, "learning_rate": 9.808415037492677e-08, "loss": 0.1869, "num_input_tokens_seen": 103938744, "step": 1350, "train_runtime": 7967.1016, "train_tokens_per_second": 13045.992 }, { "epoch": 1.8352729060809705, "grad_norm": 1.1916333436965942, "learning_rate": 9.794049946064551e-08, "loss": 0.2173, "num_input_tokens_seen": 107626320, "step": 1400, "train_runtime": 8229.9563, "train_tokens_per_second": 13077.387 }, { "epoch": 1.9008359285363055, "grad_norm": 1.6566100120544434, "learning_rate": 9.779176894762831e-08, "loss": 0.2168, "num_input_tokens_seen": 111377760, "step": 1450, "train_runtime": 8503.7645, "train_tokens_per_second": 13097.465 }, { "epoch": 1.9663989509916409, "grad_norm": 3.6912384033203125, "learning_rate": 9.763797459481244e-08, "loss": 0.2844, "num_input_tokens_seen": 115314840, "step": 1500, "train_runtime": 8803.7543, "train_tokens_per_second": 13098.371 }, { "epoch": 2.0314702507785607, "grad_norm": 0.7536889910697937, "learning_rate": 9.747913269768107e-08, "loss": 0.1743, "num_input_tokens_seen": 118930008, "step": 1550, "train_runtime": 9062.5998, "train_tokens_per_second": 13123.167 }, { "epoch": 2.097033273233896, "grad_norm": 4.382725715637207, "learning_rate": 9.731526008653652e-08, "loss": 0.1793, "num_input_tokens_seen": 122730384, "step": 1600, "train_runtime": 9342.1738, "train_tokens_per_second": 13137.24 }, { "epoch": 2.162596295689231, "grad_norm": 1.2656387090682983, "learning_rate": 9.714637412471703e-08, "loss": 0.2939, "num_input_tokens_seen": 126529800, "step": 1650, "train_runtime": 9635.6982, "train_tokens_per_second": 13131.358 }, { "epoch": 2.2281593181445665, "grad_norm": 2.5040361881256104, "learning_rate": 9.697249270675705e-08, "loss": 0.2434, "num_input_tokens_seen": 130443600, "step": 1700, "train_runtime": 9927.959, "train_tokens_per_second": 13139.015 }, { "epoch": 2.293722340599902, "grad_norm": 0.9235166311264038, "learning_rate": 9.679363425649115e-08, "loss": 0.2993, "num_input_tokens_seen": 134517072, "step": 1750, "train_runtime": 10260.6078, "train_tokens_per_second": 13110.049 }, { "epoch": 2.359285363055237, "grad_norm": 1.0807639360427856, "learning_rate": 9.660981772510188e-08, "loss": 0.192, "num_input_tokens_seen": 138214584, "step": 1800, "train_runtime": 10530.3372, "train_tokens_per_second": 13125.371 }, { "epoch": 2.424848385510572, "grad_norm": 1.5869427919387817, "learning_rate": 9.642106258911184e-08, "loss": 0.2412, "num_input_tokens_seen": 142113144, "step": 1850, "train_runtime": 10835.7191, "train_tokens_per_second": 13115.248 }, { "epoch": 2.490411407965907, "grad_norm": 1.165739893913269, "learning_rate": 9.622738884831996e-08, "loss": 0.2425, "num_input_tokens_seen": 146119920, "step": 1900, "train_runtime": 11148.5673, "train_tokens_per_second": 13106.61 }, { "epoch": 2.5559744304212426, "grad_norm": 1.7617275714874268, "learning_rate": 9.602881702368242e-08, "loss": 0.2262, "num_input_tokens_seen": 150087360, "step": 1950, "train_runtime": 11458.6247, "train_tokens_per_second": 13098.2 }, { "epoch": 2.6215374528765776, "grad_norm": 0.4497505724430084, "learning_rate": 9.582536815513833e-08, "loss": 0.1427, "num_input_tokens_seen": 153908160, "step": 2000, "train_runtime": 11749.4731, "train_tokens_per_second": 13099.154 }, { "epoch": 2.6871004753319125, "grad_norm": 0.7155716419219971, "learning_rate": 9.561706379938041e-08, "loss": 0.222, "num_input_tokens_seen": 157607040, "step": 2050, "train_runtime": 12052.7614, "train_tokens_per_second": 13076.426 }, { "epoch": 2.752663497787248, "grad_norm": 1.3807727098464966, "learning_rate": 9.540392602757093e-08, "loss": 0.1474, "num_input_tokens_seen": 161453160, "step": 2100, "train_runtime": 12357.3875, "train_tokens_per_second": 13065.315 }, { "epoch": 2.8182265202425834, "grad_norm": 0.739932119846344, "learning_rate": 9.518597742300308e-08, "loss": 0.265, "num_input_tokens_seen": 165287904, "step": 2150, "train_runtime": 12651.7227, "train_tokens_per_second": 13064.458 }, { "epoch": 2.8837895426979183, "grad_norm": 0.4396991431713104, "learning_rate": 9.496324107870821e-08, "loss": 0.2944, "num_input_tokens_seen": 169326888, "step": 2200, "train_runtime": 12967.7154, "train_tokens_per_second": 13057.573 }, { "epoch": 2.9493525651532537, "grad_norm": 0.38162505626678467, "learning_rate": 9.47357405950089e-08, "loss": 0.2348, "num_input_tokens_seen": 173020800, "step": 2250, "train_runtime": 13223.1088, "train_tokens_per_second": 13084.729 }, { "epoch": 3.0144238649401736, "grad_norm": 3.874674081802368, "learning_rate": 9.450350007701847e-08, "loss": 0.2311, "num_input_tokens_seen": 176668584, "step": 2300, "train_runtime": 13516.1565, "train_tokens_per_second": 13070.919 }, { "epoch": 3.079986887395509, "grad_norm": 0.7723739743232727, "learning_rate": 9.426654413208668e-08, "loss": 0.2964, "num_input_tokens_seen": 180729120, "step": 2350, "train_runtime": 13841.6367, "train_tokens_per_second": 13056.918 }, { "epoch": 3.145549909850844, "grad_norm": 1.5033811330795288, "learning_rate": 9.40248978671927e-08, "loss": 0.2084, "num_input_tokens_seen": 184677672, "step": 2400, "train_runtime": 14150.4953, "train_tokens_per_second": 13050.969 }, { "epoch": 3.2111129323061793, "grad_norm": 1.8196630477905273, "learning_rate": 9.377858688628464e-08, "loss": 0.1717, "num_input_tokens_seen": 188404488, "step": 2450, "train_runtime": 14408.9636, "train_tokens_per_second": 13075.506 }, { "epoch": 3.2766759547615143, "grad_norm": 0.9214364290237427, "learning_rate": 9.352763728756675e-08, "loss": 0.23, "num_input_tokens_seen": 192323616, "step": 2500, "train_runtime": 14710.1132, "train_tokens_per_second": 13074.244 }, { "epoch": 3.3422389772168497, "grad_norm": 1.0862064361572266, "learning_rate": 9.327207566073416e-08, "loss": 0.2271, "num_input_tokens_seen": 196108272, "step": 2550, "train_runtime": 14979.1529, "train_tokens_per_second": 13092.08 }, { "epoch": 3.407801999672185, "grad_norm": 0.8413626551628113, "learning_rate": 9.301192908415552e-08, "loss": 0.2193, "num_input_tokens_seen": 199941432, "step": 2600, "train_runtime": 15282.6531, "train_tokens_per_second": 13082.901 }, { "epoch": 3.47336502212752, "grad_norm": 1.531718134880066, "learning_rate": 9.274722512200379e-08, "loss": 0.1382, "num_input_tokens_seen": 203779920, "step": 2650, "train_runtime": 15565.7388, "train_tokens_per_second": 13091.568 }, { "epoch": 3.538928044582855, "grad_norm": 0.0838296189904213, "learning_rate": 9.247799182133582e-08, "loss": 0.2191, "num_input_tokens_seen": 207633384, "step": 2700, "train_runtime": 15868.2059, "train_tokens_per_second": 13084.868 }, { "epoch": 3.6044910670381904, "grad_norm": 1.1013773679733276, "learning_rate": 9.220425770912042e-08, "loss": 0.1988, "num_input_tokens_seen": 211368360, "step": 2750, "train_runtime": 16143.9244, "train_tokens_per_second": 13092.75 }, { "epoch": 3.670054089493526, "grad_norm": 0.40529268980026245, "learning_rate": 9.192605178921584e-08, "loss": 0.3072, "num_input_tokens_seen": 215149128, "step": 2800, "train_runtime": 16445.4494, "train_tokens_per_second": 13082.593 }, { "epoch": 3.735617111948861, "grad_norm": 1.5882924795150757, "learning_rate": 9.164340353929659e-08, "loss": 0.1822, "num_input_tokens_seen": 218796552, "step": 2850, "train_runtime": 16707.491, "train_tokens_per_second": 13095.716 }, { "epoch": 3.8011801344041958, "grad_norm": 0.862838089466095, "learning_rate": 9.13563429077301e-08, "loss": 0.2437, "num_input_tokens_seen": 222623832, "step": 2900, "train_runtime": 16994.843, "train_tokens_per_second": 13099.493 }, { "epoch": 3.866743156859531, "grad_norm": 0.7801971435546875, "learning_rate": 9.106490031040353e-08, "loss": 0.3174, "num_input_tokens_seen": 226629408, "step": 2950, "train_runtime": 17320.4321, "train_tokens_per_second": 13084.512 }, { "epoch": 3.9323061793148666, "grad_norm": 0.4492790699005127, "learning_rate": 9.076910662750096e-08, "loss": 0.199, "num_input_tokens_seen": 230444736, "step": 3000, "train_runtime": 17612.3894, "train_tokens_per_second": 13084.24 }, { "epoch": 3.9978692017702016, "grad_norm": 4.88616418838501, "learning_rate": 9.04689932002315e-08, "loss": 0.1764, "num_input_tokens_seen": 234389904, "step": 3050, "train_runtime": 17949.0057, "train_tokens_per_second": 13058.657 }, { "epoch": 4.062940501557121, "grad_norm": 0.597968339920044, "learning_rate": 9.016459182750843e-08, "loss": 0.209, "num_input_tokens_seen": 238124880, "step": 3100, "train_runtime": 18244.7826, "train_tokens_per_second": 13051.67 }, { "epoch": 4.128503524012457, "grad_norm": 0.8793305158615112, "learning_rate": 8.985593476257997e-08, "loss": 0.2686, "num_input_tokens_seen": 241758864, "step": 3150, "train_runtime": 18507.4406, "train_tokens_per_second": 13062.793 }, { "epoch": 4.194066546467792, "grad_norm": 7.551540851593018, "learning_rate": 8.954305470961178e-08, "loss": 0.2529, "num_input_tokens_seen": 245698488, "step": 3200, "train_runtime": 18827.139, "train_tokens_per_second": 13050.23 }, { "epoch": 4.259629568923128, "grad_norm": 0.4505975842475891, "learning_rate": 8.922598482022182e-08, "loss": 0.2384, "num_input_tokens_seen": 249595968, "step": 3250, "train_runtime": 19129.2909, "train_tokens_per_second": 13047.842 }, { "epoch": 4.325192591378462, "grad_norm": 2.2207558155059814, "learning_rate": 8.890475868996762e-08, "loss": 0.1867, "num_input_tokens_seen": 253481304, "step": 3300, "train_runtime": 19419.7804, "train_tokens_per_second": 13052.738 }, { "epoch": 4.3907556138337975, "grad_norm": 0.9266397356987, "learning_rate": 8.857941035478673e-08, "loss": 0.1763, "num_input_tokens_seen": 257255976, "step": 3350, "train_runtime": 19702.252, "train_tokens_per_second": 13057.186 }, { "epoch": 4.456318636289133, "grad_norm": 0.29596129059791565, "learning_rate": 8.824997428739036e-08, "loss": 0.2278, "num_input_tokens_seen": 261064368, "step": 3400, "train_runtime": 19998.9663, "train_tokens_per_second": 13053.893 }, { "epoch": 4.521881658744468, "grad_norm": 0.9699137210845947, "learning_rate": 8.791648539361072e-08, "loss": 0.201, "num_input_tokens_seen": 264944352, "step": 3450, "train_runtime": 20299.7802, "train_tokens_per_second": 13051.587 }, { "epoch": 4.587444681199804, "grad_norm": 1.298768401145935, "learning_rate": 8.757897900870261e-08, "loss": 0.2057, "num_input_tokens_seen": 268791072, "step": 3500, "train_runtime": 20594.4257, "train_tokens_per_second": 13051.642 }, { "epoch": 4.653007703655138, "grad_norm": 12.011015892028809, "learning_rate": 8.72374908935994e-08, "loss": 0.2351, "num_input_tokens_seen": 272495832, "step": 3550, "train_runtime": 20885.3413, "train_tokens_per_second": 13047.229 }, { "epoch": 4.718570726110474, "grad_norm": 0.24729423224925995, "learning_rate": 8.689205723112387e-08, "loss": 0.2065, "num_input_tokens_seen": 276393408, "step": 3600, "train_runtime": 21206.0433, "train_tokens_per_second": 13033.71 }, { "epoch": 4.784133748565809, "grad_norm": 2.150505781173706, "learning_rate": 8.654271462215454e-08, "loss": 0.158, "num_input_tokens_seen": 280197624, "step": 3650, "train_runtime": 21488.9397, "train_tokens_per_second": 13039.155 }, { "epoch": 4.849696771021144, "grad_norm": 0.4875163435935974, "learning_rate": 8.618950008174746e-08, "loss": 0.1832, "num_input_tokens_seen": 284031624, "step": 3700, "train_runtime": 21778.1233, "train_tokens_per_second": 13042.062 }, { "epoch": 4.915259793476479, "grad_norm": 0.5430140495300293, "learning_rate": 8.583245103521428e-08, "loss": 0.2566, "num_input_tokens_seen": 287936280, "step": 3750, "train_runtime": 22067.8249, "train_tokens_per_second": 13047.787 }, { "epoch": 4.980822815931814, "grad_norm": 0.3734208941459656, "learning_rate": 8.547160531415679e-08, "loss": 0.2775, "num_input_tokens_seen": 291838584, "step": 3800, "train_runtime": 22359.3364, "train_tokens_per_second": 13052.202 }, { "epoch": 5.045894115718735, "grad_norm": 0.9905921220779419, "learning_rate": 8.510700115245841e-08, "loss": 0.1971, "num_input_tokens_seen": 295643712, "step": 3850, "train_runtime": 22653.1086, "train_tokens_per_second": 13050.911 }, { "epoch": 5.11145713817407, "grad_norm": 0.0872701108455658, "learning_rate": 8.473867718223315e-08, "loss": 0.3142, "num_input_tokens_seen": 299528016, "step": 3900, "train_runtime": 22970.1152, "train_tokens_per_second": 13039.9 }, { "epoch": 5.177020160629405, "grad_norm": 0.7591832876205444, "learning_rate": 8.436667242973218e-08, "loss": 0.2291, "num_input_tokens_seen": 303643632, "step": 3950, "train_runtime": 23324.5779, "train_tokens_per_second": 13018.183 }, { "epoch": 5.24258318308474, "grad_norm": 0.44477882981300354, "learning_rate": 8.399102631120877e-08, "loss": 0.2128, "num_input_tokens_seen": 307574184, "step": 4000, "train_runtime": 23603.684, "train_tokens_per_second": 13030.77 }, { "epoch": 5.308146205540075, "grad_norm": 0.48096030950546265, "learning_rate": 8.361177862874202e-08, "loss": 0.1472, "num_input_tokens_seen": 311323584, "step": 4050, "train_runtime": 23888.6512, "train_tokens_per_second": 13032.28 }, { "epoch": 5.373709227995411, "grad_norm": 0.9138302206993103, "learning_rate": 8.32289695660194e-08, "loss": 0.1981, "num_input_tokens_seen": 315158328, "step": 4100, "train_runtime": 24182.2327, "train_tokens_per_second": 13032.64 }, { "epoch": 5.439272250450745, "grad_norm": 0.3333579897880554, "learning_rate": 8.284263968407912e-08, "loss": 0.1837, "num_input_tokens_seen": 318844944, "step": 4150, "train_runtime": 24456.5915, "train_tokens_per_second": 13037.178 }, { "epoch": 5.504835272906081, "grad_norm": 0.9484214782714844, "learning_rate": 8.245282991701243e-08, "loss": 0.2015, "num_input_tokens_seen": 322685568, "step": 4200, "train_runtime": 24723.1173, "train_tokens_per_second": 13051.977 }, { "epoch": 5.570398295361416, "grad_norm": 0.4100230634212494, "learning_rate": 8.205958156762646e-08, "loss": 0.2554, "num_input_tokens_seen": 326275680, "step": 4250, "train_runtime": 24984.4942, "train_tokens_per_second": 13059.127 }, { "epoch": 5.635961317816752, "grad_norm": 0.9571174383163452, "learning_rate": 8.166293630306773e-08, "loss": 0.2039, "num_input_tokens_seen": 330026184, "step": 4300, "train_runtime": 25280.2384, "train_tokens_per_second": 13054.71 }, { "epoch": 5.701524340272087, "grad_norm": 0.5215702652931213, "learning_rate": 8.126293615040747e-08, "loss": 0.2277, "num_input_tokens_seen": 333968520, "step": 4350, "train_runtime": 25565.1364, "train_tokens_per_second": 13063.436 }, { "epoch": 5.7670873627274215, "grad_norm": 0.4471840560436249, "learning_rate": 8.085962349218847e-08, "loss": 0.2104, "num_input_tokens_seen": 337707624, "step": 4400, "train_runtime": 25841.3753, "train_tokens_per_second": 13068.485 }, { "epoch": 5.832650385182757, "grad_norm": 1.0097142457962036, "learning_rate": 8.04530410619344e-08, "loss": 0.2524, "num_input_tokens_seen": 341503488, "step": 4450, "train_runtime": 26137.1854, "train_tokens_per_second": 13065.81 }, { "epoch": 5.898213407638092, "grad_norm": 1.6211527585983276, "learning_rate": 8.004323193962197e-08, "loss": 0.1595, "num_input_tokens_seen": 345388440, "step": 4500, "train_runtime": 26453.2756, "train_tokens_per_second": 13056.547 }, { "epoch": 5.963776430093427, "grad_norm": 0.25499045848846436, "learning_rate": 7.963023954711624e-08, "loss": 0.2721, "num_input_tokens_seen": 349216920, "step": 4550, "train_runtime": 26741.8598, "train_tokens_per_second": 13058.812 }, { "epoch": 6.028847729880347, "grad_norm": 0.6265522837638855, "learning_rate": 7.921410764356988e-08, "loss": 0.1993, "num_input_tokens_seen": 353096424, "step": 4600, "train_runtime": 27061.8507, "train_tokens_per_second": 13047.756 }, { "epoch": 6.0944107523356825, "grad_norm": 0.06899835169315338, "learning_rate": 7.87948803207866e-08, "loss": 0.2228, "num_input_tokens_seen": 356829384, "step": 4650, "train_runtime": 27330.1966, "train_tokens_per_second": 13056.232 }, { "epoch": 6.159973774791018, "grad_norm": 0.8082672953605652, "learning_rate": 7.837260199854929e-08, "loss": 0.1859, "num_input_tokens_seen": 360447864, "step": 4700, "train_runtime": 27571.8796, "train_tokens_per_second": 13073.025 }, { "epoch": 6.225536797246353, "grad_norm": 0.6293157339096069, "learning_rate": 7.794731741991355e-08, "loss": 0.2223, "num_input_tokens_seen": 364279296, "step": 4750, "train_runtime": 27852.8113, "train_tokens_per_second": 13078.726 }, { "epoch": 6.291099819701688, "grad_norm": 0.7018508315086365, "learning_rate": 7.751907164646682e-08, "loss": 0.1709, "num_input_tokens_seen": 368000976, "step": 4800, "train_runtime": 28103.875, "train_tokens_per_second": 13094.314 }, { "epoch": 6.356662842157023, "grad_norm": 0.3939789831638336, "learning_rate": 7.70879100535538e-08, "loss": 0.1903, "num_input_tokens_seen": 371666208, "step": 4850, "train_runtime": 28370.3397, "train_tokens_per_second": 13100.52 }, { "epoch": 6.422225864612359, "grad_norm": 0.07075575739145279, "learning_rate": 7.665387832546873e-08, "loss": 0.1653, "num_input_tokens_seen": 375530976, "step": 4900, "train_runtime": 28672.2738, "train_tokens_per_second": 13097.356 }, { "epoch": 6.487788887067694, "grad_norm": 1.4741820096969604, "learning_rate": 7.621702245061479e-08, "loss": 0.2247, "num_input_tokens_seen": 379400040, "step": 4950, "train_runtime": 28956.4169, "train_tokens_per_second": 13102.451 }, { "epoch": 6.553351909523029, "grad_norm": 0.756077229976654, "learning_rate": 7.577738871663131e-08, "loss": 0.2299, "num_input_tokens_seen": 383417568, "step": 5000, "train_runtime": 29294.8197, "train_tokens_per_second": 13088.238 } ], "logging_steps": 50, "max_steps": 15260, "num_input_tokens_seen": 383417568, "num_train_epochs": 20, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 8.358913850245906e+18, "train_batch_size": 3, "trial_name": null, "trial_params": null }