| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 6.553351909523029, |
| "eval_steps": 500, |
| "global_step": 5000, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.06556302245533518, |
| "grad_norm": 0.2032165825366974, |
| "learning_rate": 9.999745598795031e-08, |
| "loss": 0.4359, |
| "num_input_tokens_seen": 3794784, |
| "step": 50, |
| "train_runtime": 288.0643, |
| "train_tokens_per_second": 13173.393 |
| }, |
| { |
| "epoch": 0.13112604491067037, |
| "grad_norm": 0.12323546409606934, |
| "learning_rate": 9.998961548920028e-08, |
| "loss": 0.5461, |
| "num_input_tokens_seen": 7656384, |
| "step": 100, |
| "train_runtime": 563.6125, |
| "train_tokens_per_second": 13584.481 |
| }, |
| { |
| "epoch": 0.19668906736600558, |
| "grad_norm": 0.6143731474876404, |
| "learning_rate": 9.997647827492774e-08, |
| "loss": 0.4957, |
| "num_input_tokens_seen": 11324688, |
| "step": 150, |
| "train_runtime": 840.9128, |
| "train_tokens_per_second": 13467.137 |
| }, |
| { |
| "epoch": 0.26225208982134074, |
| "grad_norm": 5.337645053863525, |
| "learning_rate": 9.995804573710351e-08, |
| "loss": 0.4317, |
| "num_input_tokens_seen": 14915760, |
| "step": 200, |
| "train_runtime": 1098.5331, |
| "train_tokens_per_second": 13577.889 |
| }, |
| { |
| "epoch": 0.327815112276676, |
| "grad_norm": 9.094950675964355, |
| "learning_rate": 9.993431982877141e-08, |
| "loss": 0.3758, |
| "num_input_tokens_seen": 18760920, |
| "step": 250, |
| "train_runtime": 1370.436, |
| "train_tokens_per_second": 13689.745 |
| }, |
| { |
| "epoch": 0.39337813473201116, |
| "grad_norm": 8.739721298217773, |
| "learning_rate": 9.990530306384132e-08, |
| "loss": 0.4875, |
| "num_input_tokens_seen": 22666272, |
| "step": 300, |
| "train_runtime": 1675.777, |
| "train_tokens_per_second": 13525.828 |
| }, |
| { |
| "epoch": 0.45894115718734635, |
| "grad_norm": 5.213091850280762, |
| "learning_rate": 9.987099851682273e-08, |
| "loss": 0.5377, |
| "num_input_tokens_seen": 26550816, |
| "step": 350, |
| "train_runtime": 1969.5671, |
| "train_tokens_per_second": 13480.534 |
| }, |
| { |
| "epoch": 0.5245041796426815, |
| "grad_norm": 6.928552627563477, |
| "learning_rate": 9.983140982249912e-08, |
| "loss": 0.5284, |
| "num_input_tokens_seen": 30502512, |
| "step": 400, |
| "train_runtime": 2276.9451, |
| "train_tokens_per_second": 13396.244 |
| }, |
| { |
| "epoch": 0.5900672020980167, |
| "grad_norm": 7.398318290710449, |
| "learning_rate": 9.978654117554268e-08, |
| "loss": 0.3501, |
| "num_input_tokens_seen": 34219392, |
| "step": 450, |
| "train_runtime": 2557.2601, |
| "train_tokens_per_second": 13381.272 |
| }, |
| { |
| "epoch": 0.655630224553352, |
| "grad_norm": 0.12503379583358765, |
| "learning_rate": 9.973639733006998e-08, |
| "loss": 0.4336, |
| "num_input_tokens_seen": 38231808, |
| "step": 500, |
| "train_runtime": 2907.4779, |
| "train_tokens_per_second": 13149.475 |
| }, |
| { |
| "epoch": 0.7211932470086871, |
| "grad_norm": 1.006555199623108, |
| "learning_rate": 9.968098359913822e-08, |
| "loss": 0.382, |
| "num_input_tokens_seen": 42037704, |
| "step": 550, |
| "train_runtime": 3185.8277, |
| "train_tokens_per_second": 13195.222 |
| }, |
| { |
| "epoch": 0.7867562694640223, |
| "grad_norm": 7.536371231079102, |
| "learning_rate": 9.962030585418215e-08, |
| "loss": 0.3866, |
| "num_input_tokens_seen": 46037664, |
| "step": 600, |
| "train_runtime": 3488.8435, |
| "train_tokens_per_second": 13195.681 |
| }, |
| { |
| "epoch": 0.8523192919193575, |
| "grad_norm": 0.24487841129302979, |
| "learning_rate": 9.955437052439219e-08, |
| "loss": 0.4026, |
| "num_input_tokens_seen": 49944816, |
| "step": 650, |
| "train_runtime": 3776.75, |
| "train_tokens_per_second": 13224.284 |
| }, |
| { |
| "epoch": 0.9178823143746927, |
| "grad_norm": 1.2577345371246338, |
| "learning_rate": 9.948318459603297e-08, |
| "loss": 0.3547, |
| "num_input_tokens_seen": 53838960, |
| "step": 700, |
| "train_runtime": 4095.3801, |
| "train_tokens_per_second": 13146.267 |
| }, |
| { |
| "epoch": 0.9834453368300279, |
| "grad_norm": 0.23559170961380005, |
| "learning_rate": 9.940675561170326e-08, |
| "loss": 0.3269, |
| "num_input_tokens_seen": 57703848, |
| "step": 750, |
| "train_runtime": 4401.0597, |
| "train_tokens_per_second": 13111.353 |
| }, |
| { |
| "epoch": 1.048516636616948, |
| "grad_norm": 2.2465434074401855, |
| "learning_rate": 9.932509166953673e-08, |
| "loss": 0.38, |
| "num_input_tokens_seen": 61456680, |
| "step": 800, |
| "train_runtime": 4678.4603, |
| "train_tokens_per_second": 13136.091 |
| }, |
| { |
| "epoch": 1.1140796590722832, |
| "grad_norm": 0.8857269287109375, |
| "learning_rate": 9.923820142234384e-08, |
| "loss": 0.3671, |
| "num_input_tokens_seen": 65352192, |
| "step": 850, |
| "train_runtime": 4987.8785, |
| "train_tokens_per_second": 13102.202 |
| }, |
| { |
| "epoch": 1.1796426815276184, |
| "grad_norm": 2.611070394515991, |
| "learning_rate": 9.914609407669518e-08, |
| "loss": 0.2795, |
| "num_input_tokens_seen": 69406008, |
| "step": 900, |
| "train_runtime": 5331.3796, |
| "train_tokens_per_second": 13018.395 |
| }, |
| { |
| "epoch": 1.2452057039829536, |
| "grad_norm": 0.18760572373867035, |
| "learning_rate": 9.904877939194582e-08, |
| "loss": 0.3224, |
| "num_input_tokens_seen": 73152336, |
| "step": 950, |
| "train_runtime": 5603.6792, |
| "train_tokens_per_second": 13054.341 |
| }, |
| { |
| "epoch": 1.3107687264382888, |
| "grad_norm": 7.031470775604248, |
| "learning_rate": 9.894626767920125e-08, |
| "loss": 0.2581, |
| "num_input_tokens_seen": 76955160, |
| "step": 1000, |
| "train_runtime": 5891.4617, |
| "train_tokens_per_second": 13062.151 |
| }, |
| { |
| "epoch": 1.376331748893624, |
| "grad_norm": 3.1105947494506836, |
| "learning_rate": 9.883856980022501e-08, |
| "loss": 0.2146, |
| "num_input_tokens_seen": 80682888, |
| "step": 1050, |
| "train_runtime": 6172.6315, |
| "train_tokens_per_second": 13071.068 |
| }, |
| { |
| "epoch": 1.4418947713489592, |
| "grad_norm": 3.3154454231262207, |
| "learning_rate": 9.872569716628762e-08, |
| "loss": 0.1974, |
| "num_input_tokens_seen": 84505128, |
| "step": 1100, |
| "train_runtime": 6464.0066, |
| "train_tokens_per_second": 13073.181 |
| }, |
| { |
| "epoch": 1.5074577938042943, |
| "grad_norm": 2.295762062072754, |
| "learning_rate": 9.860766173695762e-08, |
| "loss": 0.331, |
| "num_input_tokens_seen": 88457640, |
| "step": 1150, |
| "train_runtime": 6787.6545, |
| "train_tokens_per_second": 13032.137 |
| }, |
| { |
| "epoch": 1.5730208162596295, |
| "grad_norm": 3.430027484893799, |
| "learning_rate": 9.848447601883434e-08, |
| "loss": 0.2295, |
| "num_input_tokens_seen": 92425752, |
| "step": 1200, |
| "train_runtime": 7110.9534, |
| "train_tokens_per_second": 12997.66 |
| }, |
| { |
| "epoch": 1.6385838387149647, |
| "grad_norm": 5.2876200675964355, |
| "learning_rate": 9.83561530642227e-08, |
| "loss": 0.3534, |
| "num_input_tokens_seen": 96447384, |
| "step": 1250, |
| "train_runtime": 7430.2464, |
| "train_tokens_per_second": 12980.375 |
| }, |
| { |
| "epoch": 1.7041468611703, |
| "grad_norm": 2.3764872550964355, |
| "learning_rate": 9.822270646975031e-08, |
| "loss": 0.2875, |
| "num_input_tokens_seen": 100202232, |
| "step": 1300, |
| "train_runtime": 7704.6648, |
| "train_tokens_per_second": 13005.398 |
| }, |
| { |
| "epoch": 1.769709883625635, |
| "grad_norm": 0.5971184968948364, |
| "learning_rate": 9.808415037492677e-08, |
| "loss": 0.1869, |
| "num_input_tokens_seen": 103938744, |
| "step": 1350, |
| "train_runtime": 7967.1016, |
| "train_tokens_per_second": 13045.992 |
| }, |
| { |
| "epoch": 1.8352729060809705, |
| "grad_norm": 1.1916333436965942, |
| "learning_rate": 9.794049946064551e-08, |
| "loss": 0.2173, |
| "num_input_tokens_seen": 107626320, |
| "step": 1400, |
| "train_runtime": 8229.9563, |
| "train_tokens_per_second": 13077.387 |
| }, |
| { |
| "epoch": 1.9008359285363055, |
| "grad_norm": 1.6566100120544434, |
| "learning_rate": 9.779176894762831e-08, |
| "loss": 0.2168, |
| "num_input_tokens_seen": 111377760, |
| "step": 1450, |
| "train_runtime": 8503.7645, |
| "train_tokens_per_second": 13097.465 |
| }, |
| { |
| "epoch": 1.9663989509916409, |
| "grad_norm": 3.6912384033203125, |
| "learning_rate": 9.763797459481244e-08, |
| "loss": 0.2844, |
| "num_input_tokens_seen": 115314840, |
| "step": 1500, |
| "train_runtime": 8803.7543, |
| "train_tokens_per_second": 13098.371 |
| }, |
| { |
| "epoch": 2.0314702507785607, |
| "grad_norm": 0.7536889910697937, |
| "learning_rate": 9.747913269768107e-08, |
| "loss": 0.1743, |
| "num_input_tokens_seen": 118930008, |
| "step": 1550, |
| "train_runtime": 9062.5998, |
| "train_tokens_per_second": 13123.167 |
| }, |
| { |
| "epoch": 2.097033273233896, |
| "grad_norm": 4.382725715637207, |
| "learning_rate": 9.731526008653652e-08, |
| "loss": 0.1793, |
| "num_input_tokens_seen": 122730384, |
| "step": 1600, |
| "train_runtime": 9342.1738, |
| "train_tokens_per_second": 13137.24 |
| }, |
| { |
| "epoch": 2.162596295689231, |
| "grad_norm": 1.2656387090682983, |
| "learning_rate": 9.714637412471703e-08, |
| "loss": 0.2939, |
| "num_input_tokens_seen": 126529800, |
| "step": 1650, |
| "train_runtime": 9635.6982, |
| "train_tokens_per_second": 13131.358 |
| }, |
| { |
| "epoch": 2.2281593181445665, |
| "grad_norm": 2.5040361881256104, |
| "learning_rate": 9.697249270675705e-08, |
| "loss": 0.2434, |
| "num_input_tokens_seen": 130443600, |
| "step": 1700, |
| "train_runtime": 9927.959, |
| "train_tokens_per_second": 13139.015 |
| }, |
| { |
| "epoch": 2.293722340599902, |
| "grad_norm": 0.9235166311264038, |
| "learning_rate": 9.679363425649115e-08, |
| "loss": 0.2993, |
| "num_input_tokens_seen": 134517072, |
| "step": 1750, |
| "train_runtime": 10260.6078, |
| "train_tokens_per_second": 13110.049 |
| }, |
| { |
| "epoch": 2.359285363055237, |
| "grad_norm": 1.0807639360427856, |
| "learning_rate": 9.660981772510188e-08, |
| "loss": 0.192, |
| "num_input_tokens_seen": 138214584, |
| "step": 1800, |
| "train_runtime": 10530.3372, |
| "train_tokens_per_second": 13125.371 |
| }, |
| { |
| "epoch": 2.424848385510572, |
| "grad_norm": 1.5869427919387817, |
| "learning_rate": 9.642106258911184e-08, |
| "loss": 0.2412, |
| "num_input_tokens_seen": 142113144, |
| "step": 1850, |
| "train_runtime": 10835.7191, |
| "train_tokens_per_second": 13115.248 |
| }, |
| { |
| "epoch": 2.490411407965907, |
| "grad_norm": 1.165739893913269, |
| "learning_rate": 9.622738884831996e-08, |
| "loss": 0.2425, |
| "num_input_tokens_seen": 146119920, |
| "step": 1900, |
| "train_runtime": 11148.5673, |
| "train_tokens_per_second": 13106.61 |
| }, |
| { |
| "epoch": 2.5559744304212426, |
| "grad_norm": 1.7617275714874268, |
| "learning_rate": 9.602881702368242e-08, |
| "loss": 0.2262, |
| "num_input_tokens_seen": 150087360, |
| "step": 1950, |
| "train_runtime": 11458.6247, |
| "train_tokens_per_second": 13098.2 |
| }, |
| { |
| "epoch": 2.6215374528765776, |
| "grad_norm": 0.4497505724430084, |
| "learning_rate": 9.582536815513833e-08, |
| "loss": 0.1427, |
| "num_input_tokens_seen": 153908160, |
| "step": 2000, |
| "train_runtime": 11749.4731, |
| "train_tokens_per_second": 13099.154 |
| }, |
| { |
| "epoch": 2.6871004753319125, |
| "grad_norm": 0.7155716419219971, |
| "learning_rate": 9.561706379938041e-08, |
| "loss": 0.222, |
| "num_input_tokens_seen": 157607040, |
| "step": 2050, |
| "train_runtime": 12052.7614, |
| "train_tokens_per_second": 13076.426 |
| }, |
| { |
| "epoch": 2.752663497787248, |
| "grad_norm": 1.3807727098464966, |
| "learning_rate": 9.540392602757093e-08, |
| "loss": 0.1474, |
| "num_input_tokens_seen": 161453160, |
| "step": 2100, |
| "train_runtime": 12357.3875, |
| "train_tokens_per_second": 13065.315 |
| }, |
| { |
| "epoch": 2.8182265202425834, |
| "grad_norm": 0.739932119846344, |
| "learning_rate": 9.518597742300308e-08, |
| "loss": 0.265, |
| "num_input_tokens_seen": 165287904, |
| "step": 2150, |
| "train_runtime": 12651.7227, |
| "train_tokens_per_second": 13064.458 |
| }, |
| { |
| "epoch": 2.8837895426979183, |
| "grad_norm": 0.4396991431713104, |
| "learning_rate": 9.496324107870821e-08, |
| "loss": 0.2944, |
| "num_input_tokens_seen": 169326888, |
| "step": 2200, |
| "train_runtime": 12967.7154, |
| "train_tokens_per_second": 13057.573 |
| }, |
| { |
| "epoch": 2.9493525651532537, |
| "grad_norm": 0.38162505626678467, |
| "learning_rate": 9.47357405950089e-08, |
| "loss": 0.2348, |
| "num_input_tokens_seen": 173020800, |
| "step": 2250, |
| "train_runtime": 13223.1088, |
| "train_tokens_per_second": 13084.729 |
| }, |
| { |
| "epoch": 3.0144238649401736, |
| "grad_norm": 3.874674081802368, |
| "learning_rate": 9.450350007701847e-08, |
| "loss": 0.2311, |
| "num_input_tokens_seen": 176668584, |
| "step": 2300, |
| "train_runtime": 13516.1565, |
| "train_tokens_per_second": 13070.919 |
| }, |
| { |
| "epoch": 3.079986887395509, |
| "grad_norm": 0.7723739743232727, |
| "learning_rate": 9.426654413208668e-08, |
| "loss": 0.2964, |
| "num_input_tokens_seen": 180729120, |
| "step": 2350, |
| "train_runtime": 13841.6367, |
| "train_tokens_per_second": 13056.918 |
| }, |
| { |
| "epoch": 3.145549909850844, |
| "grad_norm": 1.5033811330795288, |
| "learning_rate": 9.40248978671927e-08, |
| "loss": 0.2084, |
| "num_input_tokens_seen": 184677672, |
| "step": 2400, |
| "train_runtime": 14150.4953, |
| "train_tokens_per_second": 13050.969 |
| }, |
| { |
| "epoch": 3.2111129323061793, |
| "grad_norm": 1.8196630477905273, |
| "learning_rate": 9.377858688628464e-08, |
| "loss": 0.1717, |
| "num_input_tokens_seen": 188404488, |
| "step": 2450, |
| "train_runtime": 14408.9636, |
| "train_tokens_per_second": 13075.506 |
| }, |
| { |
| "epoch": 3.2766759547615143, |
| "grad_norm": 0.9214364290237427, |
| "learning_rate": 9.352763728756675e-08, |
| "loss": 0.23, |
| "num_input_tokens_seen": 192323616, |
| "step": 2500, |
| "train_runtime": 14710.1132, |
| "train_tokens_per_second": 13074.244 |
| }, |
| { |
| "epoch": 3.3422389772168497, |
| "grad_norm": 1.0862064361572266, |
| "learning_rate": 9.327207566073416e-08, |
| "loss": 0.2271, |
| "num_input_tokens_seen": 196108272, |
| "step": 2550, |
| "train_runtime": 14979.1529, |
| "train_tokens_per_second": 13092.08 |
| }, |
| { |
| "epoch": 3.407801999672185, |
| "grad_norm": 0.8413626551628113, |
| "learning_rate": 9.301192908415552e-08, |
| "loss": 0.2193, |
| "num_input_tokens_seen": 199941432, |
| "step": 2600, |
| "train_runtime": 15282.6531, |
| "train_tokens_per_second": 13082.901 |
| }, |
| { |
| "epoch": 3.47336502212752, |
| "grad_norm": 1.531718134880066, |
| "learning_rate": 9.274722512200379e-08, |
| "loss": 0.1382, |
| "num_input_tokens_seen": 203779920, |
| "step": 2650, |
| "train_runtime": 15565.7388, |
| "train_tokens_per_second": 13091.568 |
| }, |
| { |
| "epoch": 3.538928044582855, |
| "grad_norm": 0.0838296189904213, |
| "learning_rate": 9.247799182133582e-08, |
| "loss": 0.2191, |
| "num_input_tokens_seen": 207633384, |
| "step": 2700, |
| "train_runtime": 15868.2059, |
| "train_tokens_per_second": 13084.868 |
| }, |
| { |
| "epoch": 3.6044910670381904, |
| "grad_norm": 1.1013773679733276, |
| "learning_rate": 9.220425770912042e-08, |
| "loss": 0.1988, |
| "num_input_tokens_seen": 211368360, |
| "step": 2750, |
| "train_runtime": 16143.9244, |
| "train_tokens_per_second": 13092.75 |
| }, |
| { |
| "epoch": 3.670054089493526, |
| "grad_norm": 0.40529268980026245, |
| "learning_rate": 9.192605178921584e-08, |
| "loss": 0.3072, |
| "num_input_tokens_seen": 215149128, |
| "step": 2800, |
| "train_runtime": 16445.4494, |
| "train_tokens_per_second": 13082.593 |
| }, |
| { |
| "epoch": 3.735617111948861, |
| "grad_norm": 1.5882924795150757, |
| "learning_rate": 9.164340353929659e-08, |
| "loss": 0.1822, |
| "num_input_tokens_seen": 218796552, |
| "step": 2850, |
| "train_runtime": 16707.491, |
| "train_tokens_per_second": 13095.716 |
| }, |
| { |
| "epoch": 3.8011801344041958, |
| "grad_norm": 0.862838089466095, |
| "learning_rate": 9.13563429077301e-08, |
| "loss": 0.2437, |
| "num_input_tokens_seen": 222623832, |
| "step": 2900, |
| "train_runtime": 16994.843, |
| "train_tokens_per_second": 13099.493 |
| }, |
| { |
| "epoch": 3.866743156859531, |
| "grad_norm": 0.7801971435546875, |
| "learning_rate": 9.106490031040353e-08, |
| "loss": 0.3174, |
| "num_input_tokens_seen": 226629408, |
| "step": 2950, |
| "train_runtime": 17320.4321, |
| "train_tokens_per_second": 13084.512 |
| }, |
| { |
| "epoch": 3.9323061793148666, |
| "grad_norm": 0.4492790699005127, |
| "learning_rate": 9.076910662750096e-08, |
| "loss": 0.199, |
| "num_input_tokens_seen": 230444736, |
| "step": 3000, |
| "train_runtime": 17612.3894, |
| "train_tokens_per_second": 13084.24 |
| }, |
| { |
| "epoch": 3.9978692017702016, |
| "grad_norm": 4.88616418838501, |
| "learning_rate": 9.04689932002315e-08, |
| "loss": 0.1764, |
| "num_input_tokens_seen": 234389904, |
| "step": 3050, |
| "train_runtime": 17949.0057, |
| "train_tokens_per_second": 13058.657 |
| }, |
| { |
| "epoch": 4.062940501557121, |
| "grad_norm": 0.597968339920044, |
| "learning_rate": 9.016459182750843e-08, |
| "loss": 0.209, |
| "num_input_tokens_seen": 238124880, |
| "step": 3100, |
| "train_runtime": 18244.7826, |
| "train_tokens_per_second": 13051.67 |
| }, |
| { |
| "epoch": 4.128503524012457, |
| "grad_norm": 0.8793305158615112, |
| "learning_rate": 8.985593476257997e-08, |
| "loss": 0.2686, |
| "num_input_tokens_seen": 241758864, |
| "step": 3150, |
| "train_runtime": 18507.4406, |
| "train_tokens_per_second": 13062.793 |
| }, |
| { |
| "epoch": 4.194066546467792, |
| "grad_norm": 7.551540851593018, |
| "learning_rate": 8.954305470961178e-08, |
| "loss": 0.2529, |
| "num_input_tokens_seen": 245698488, |
| "step": 3200, |
| "train_runtime": 18827.139, |
| "train_tokens_per_second": 13050.23 |
| }, |
| { |
| "epoch": 4.259629568923128, |
| "grad_norm": 0.4505975842475891, |
| "learning_rate": 8.922598482022182e-08, |
| "loss": 0.2384, |
| "num_input_tokens_seen": 249595968, |
| "step": 3250, |
| "train_runtime": 19129.2909, |
| "train_tokens_per_second": 13047.842 |
| }, |
| { |
| "epoch": 4.325192591378462, |
| "grad_norm": 2.2207558155059814, |
| "learning_rate": 8.890475868996762e-08, |
| "loss": 0.1867, |
| "num_input_tokens_seen": 253481304, |
| "step": 3300, |
| "train_runtime": 19419.7804, |
| "train_tokens_per_second": 13052.738 |
| }, |
| { |
| "epoch": 4.3907556138337975, |
| "grad_norm": 0.9266397356987, |
| "learning_rate": 8.857941035478673e-08, |
| "loss": 0.1763, |
| "num_input_tokens_seen": 257255976, |
| "step": 3350, |
| "train_runtime": 19702.252, |
| "train_tokens_per_second": 13057.186 |
| }, |
| { |
| "epoch": 4.456318636289133, |
| "grad_norm": 0.29596129059791565, |
| "learning_rate": 8.824997428739036e-08, |
| "loss": 0.2278, |
| "num_input_tokens_seen": 261064368, |
| "step": 3400, |
| "train_runtime": 19998.9663, |
| "train_tokens_per_second": 13053.893 |
| }, |
| { |
| "epoch": 4.521881658744468, |
| "grad_norm": 0.9699137210845947, |
| "learning_rate": 8.791648539361072e-08, |
| "loss": 0.201, |
| "num_input_tokens_seen": 264944352, |
| "step": 3450, |
| "train_runtime": 20299.7802, |
| "train_tokens_per_second": 13051.587 |
| }, |
| { |
| "epoch": 4.587444681199804, |
| "grad_norm": 1.298768401145935, |
| "learning_rate": 8.757897900870261e-08, |
| "loss": 0.2057, |
| "num_input_tokens_seen": 268791072, |
| "step": 3500, |
| "train_runtime": 20594.4257, |
| "train_tokens_per_second": 13051.642 |
| }, |
| { |
| "epoch": 4.653007703655138, |
| "grad_norm": 12.011015892028809, |
| "learning_rate": 8.72374908935994e-08, |
| "loss": 0.2351, |
| "num_input_tokens_seen": 272495832, |
| "step": 3550, |
| "train_runtime": 20885.3413, |
| "train_tokens_per_second": 13047.229 |
| }, |
| { |
| "epoch": 4.718570726110474, |
| "grad_norm": 0.24729423224925995, |
| "learning_rate": 8.689205723112387e-08, |
| "loss": 0.2065, |
| "num_input_tokens_seen": 276393408, |
| "step": 3600, |
| "train_runtime": 21206.0433, |
| "train_tokens_per_second": 13033.71 |
| }, |
| { |
| "epoch": 4.784133748565809, |
| "grad_norm": 2.150505781173706, |
| "learning_rate": 8.654271462215454e-08, |
| "loss": 0.158, |
| "num_input_tokens_seen": 280197624, |
| "step": 3650, |
| "train_runtime": 21488.9397, |
| "train_tokens_per_second": 13039.155 |
| }, |
| { |
| "epoch": 4.849696771021144, |
| "grad_norm": 0.4875163435935974, |
| "learning_rate": 8.618950008174746e-08, |
| "loss": 0.1832, |
| "num_input_tokens_seen": 284031624, |
| "step": 3700, |
| "train_runtime": 21778.1233, |
| "train_tokens_per_second": 13042.062 |
| }, |
| { |
| "epoch": 4.915259793476479, |
| "grad_norm": 0.5430140495300293, |
| "learning_rate": 8.583245103521428e-08, |
| "loss": 0.2566, |
| "num_input_tokens_seen": 287936280, |
| "step": 3750, |
| "train_runtime": 22067.8249, |
| "train_tokens_per_second": 13047.787 |
| }, |
| { |
| "epoch": 4.980822815931814, |
| "grad_norm": 0.3734208941459656, |
| "learning_rate": 8.547160531415679e-08, |
| "loss": 0.2775, |
| "num_input_tokens_seen": 291838584, |
| "step": 3800, |
| "train_runtime": 22359.3364, |
| "train_tokens_per_second": 13052.202 |
| }, |
| { |
| "epoch": 5.045894115718735, |
| "grad_norm": 0.9905921220779419, |
| "learning_rate": 8.510700115245841e-08, |
| "loss": 0.1971, |
| "num_input_tokens_seen": 295643712, |
| "step": 3850, |
| "train_runtime": 22653.1086, |
| "train_tokens_per_second": 13050.911 |
| }, |
| { |
| "epoch": 5.11145713817407, |
| "grad_norm": 0.0872701108455658, |
| "learning_rate": 8.473867718223315e-08, |
| "loss": 0.3142, |
| "num_input_tokens_seen": 299528016, |
| "step": 3900, |
| "train_runtime": 22970.1152, |
| "train_tokens_per_second": 13039.9 |
| }, |
| { |
| "epoch": 5.177020160629405, |
| "grad_norm": 0.7591832876205444, |
| "learning_rate": 8.436667242973218e-08, |
| "loss": 0.2291, |
| "num_input_tokens_seen": 303643632, |
| "step": 3950, |
| "train_runtime": 23324.5779, |
| "train_tokens_per_second": 13018.183 |
| }, |
| { |
| "epoch": 5.24258318308474, |
| "grad_norm": 0.44477882981300354, |
| "learning_rate": 8.399102631120877e-08, |
| "loss": 0.2128, |
| "num_input_tokens_seen": 307574184, |
| "step": 4000, |
| "train_runtime": 23603.684, |
| "train_tokens_per_second": 13030.77 |
| }, |
| { |
| "epoch": 5.308146205540075, |
| "grad_norm": 0.48096030950546265, |
| "learning_rate": 8.361177862874202e-08, |
| "loss": 0.1472, |
| "num_input_tokens_seen": 311323584, |
| "step": 4050, |
| "train_runtime": 23888.6512, |
| "train_tokens_per_second": 13032.28 |
| }, |
| { |
| "epoch": 5.373709227995411, |
| "grad_norm": 0.9138302206993103, |
| "learning_rate": 8.32289695660194e-08, |
| "loss": 0.1981, |
| "num_input_tokens_seen": 315158328, |
| "step": 4100, |
| "train_runtime": 24182.2327, |
| "train_tokens_per_second": 13032.64 |
| }, |
| { |
| "epoch": 5.439272250450745, |
| "grad_norm": 0.3333579897880554, |
| "learning_rate": 8.284263968407912e-08, |
| "loss": 0.1837, |
| "num_input_tokens_seen": 318844944, |
| "step": 4150, |
| "train_runtime": 24456.5915, |
| "train_tokens_per_second": 13037.178 |
| }, |
| { |
| "epoch": 5.504835272906081, |
| "grad_norm": 0.9484214782714844, |
| "learning_rate": 8.245282991701243e-08, |
| "loss": 0.2015, |
| "num_input_tokens_seen": 322685568, |
| "step": 4200, |
| "train_runtime": 24723.1173, |
| "train_tokens_per_second": 13051.977 |
| }, |
| { |
| "epoch": 5.570398295361416, |
| "grad_norm": 0.4100230634212494, |
| "learning_rate": 8.205958156762646e-08, |
| "loss": 0.2554, |
| "num_input_tokens_seen": 326275680, |
| "step": 4250, |
| "train_runtime": 24984.4942, |
| "train_tokens_per_second": 13059.127 |
| }, |
| { |
| "epoch": 5.635961317816752, |
| "grad_norm": 0.9571174383163452, |
| "learning_rate": 8.166293630306773e-08, |
| "loss": 0.2039, |
| "num_input_tokens_seen": 330026184, |
| "step": 4300, |
| "train_runtime": 25280.2384, |
| "train_tokens_per_second": 13054.71 |
| }, |
| { |
| "epoch": 5.701524340272087, |
| "grad_norm": 0.5215702652931213, |
| "learning_rate": 8.126293615040747e-08, |
| "loss": 0.2277, |
| "num_input_tokens_seen": 333968520, |
| "step": 4350, |
| "train_runtime": 25565.1364, |
| "train_tokens_per_second": 13063.436 |
| }, |
| { |
| "epoch": 5.7670873627274215, |
| "grad_norm": 0.4471840560436249, |
| "learning_rate": 8.085962349218847e-08, |
| "loss": 0.2104, |
| "num_input_tokens_seen": 337707624, |
| "step": 4400, |
| "train_runtime": 25841.3753, |
| "train_tokens_per_second": 13068.485 |
| }, |
| { |
| "epoch": 5.832650385182757, |
| "grad_norm": 1.0097142457962036, |
| "learning_rate": 8.04530410619344e-08, |
| "loss": 0.2524, |
| "num_input_tokens_seen": 341503488, |
| "step": 4450, |
| "train_runtime": 26137.1854, |
| "train_tokens_per_second": 13065.81 |
| }, |
| { |
| "epoch": 5.898213407638092, |
| "grad_norm": 1.6211527585983276, |
| "learning_rate": 8.004323193962197e-08, |
| "loss": 0.1595, |
| "num_input_tokens_seen": 345388440, |
| "step": 4500, |
| "train_runtime": 26453.2756, |
| "train_tokens_per_second": 13056.547 |
| }, |
| { |
| "epoch": 5.963776430093427, |
| "grad_norm": 0.25499045848846436, |
| "learning_rate": 7.963023954711624e-08, |
| "loss": 0.2721, |
| "num_input_tokens_seen": 349216920, |
| "step": 4550, |
| "train_runtime": 26741.8598, |
| "train_tokens_per_second": 13058.812 |
| }, |
| { |
| "epoch": 6.028847729880347, |
| "grad_norm": 0.6265522837638855, |
| "learning_rate": 7.921410764356988e-08, |
| "loss": 0.1993, |
| "num_input_tokens_seen": 353096424, |
| "step": 4600, |
| "train_runtime": 27061.8507, |
| "train_tokens_per_second": 13047.756 |
| }, |
| { |
| "epoch": 6.0944107523356825, |
| "grad_norm": 0.06899835169315338, |
| "learning_rate": 7.87948803207866e-08, |
| "loss": 0.2228, |
| "num_input_tokens_seen": 356829384, |
| "step": 4650, |
| "train_runtime": 27330.1966, |
| "train_tokens_per_second": 13056.232 |
| }, |
| { |
| "epoch": 6.159973774791018, |
| "grad_norm": 0.8082672953605652, |
| "learning_rate": 7.837260199854929e-08, |
| "loss": 0.1859, |
| "num_input_tokens_seen": 360447864, |
| "step": 4700, |
| "train_runtime": 27571.8796, |
| "train_tokens_per_second": 13073.025 |
| }, |
| { |
| "epoch": 6.225536797246353, |
| "grad_norm": 0.6293157339096069, |
| "learning_rate": 7.794731741991355e-08, |
| "loss": 0.2223, |
| "num_input_tokens_seen": 364279296, |
| "step": 4750, |
| "train_runtime": 27852.8113, |
| "train_tokens_per_second": 13078.726 |
| }, |
| { |
| "epoch": 6.291099819701688, |
| "grad_norm": 0.7018508315086365, |
| "learning_rate": 7.751907164646682e-08, |
| "loss": 0.1709, |
| "num_input_tokens_seen": 368000976, |
| "step": 4800, |
| "train_runtime": 28103.875, |
| "train_tokens_per_second": 13094.314 |
| }, |
| { |
| "epoch": 6.356662842157023, |
| "grad_norm": 0.3939789831638336, |
| "learning_rate": 7.70879100535538e-08, |
| "loss": 0.1903, |
| "num_input_tokens_seen": 371666208, |
| "step": 4850, |
| "train_runtime": 28370.3397, |
| "train_tokens_per_second": 13100.52 |
| }, |
| { |
| "epoch": 6.422225864612359, |
| "grad_norm": 0.07075575739145279, |
| "learning_rate": 7.665387832546873e-08, |
| "loss": 0.1653, |
| "num_input_tokens_seen": 375530976, |
| "step": 4900, |
| "train_runtime": 28672.2738, |
| "train_tokens_per_second": 13097.356 |
| }, |
| { |
| "epoch": 6.487788887067694, |
| "grad_norm": 1.4741820096969604, |
| "learning_rate": 7.621702245061479e-08, |
| "loss": 0.2247, |
| "num_input_tokens_seen": 379400040, |
| "step": 4950, |
| "train_runtime": 28956.4169, |
| "train_tokens_per_second": 13102.451 |
| }, |
| { |
| "epoch": 6.553351909523029, |
| "grad_norm": 0.756077229976654, |
| "learning_rate": 7.577738871663131e-08, |
| "loss": 0.2299, |
| "num_input_tokens_seen": 383417568, |
| "step": 5000, |
| "train_runtime": 29294.8197, |
| "train_tokens_per_second": 13088.238 |
| } |
| ], |
| "logging_steps": 50, |
| "max_steps": 15260, |
| "num_input_tokens_seen": 383417568, |
| "num_train_epochs": 20, |
| "save_steps": 1000, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 8.358913850245906e+18, |
| "train_batch_size": 3, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|