{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 6.553351909523029,
"eval_steps": 500,
"global_step": 5000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.06556302245533518,
"grad_norm": 0.2032165825366974,
"learning_rate": 9.999745598795031e-08,
"loss": 0.4359,
"num_input_tokens_seen": 3794784,
"step": 50,
"train_runtime": 288.0643,
"train_tokens_per_second": 13173.393
},
{
"epoch": 0.13112604491067037,
"grad_norm": 0.12323546409606934,
"learning_rate": 9.998961548920028e-08,
"loss": 0.5461,
"num_input_tokens_seen": 7656384,
"step": 100,
"train_runtime": 563.6125,
"train_tokens_per_second": 13584.481
},
{
"epoch": 0.19668906736600558,
"grad_norm": 0.6143731474876404,
"learning_rate": 9.997647827492774e-08,
"loss": 0.4957,
"num_input_tokens_seen": 11324688,
"step": 150,
"train_runtime": 840.9128,
"train_tokens_per_second": 13467.137
},
{
"epoch": 0.26225208982134074,
"grad_norm": 5.337645053863525,
"learning_rate": 9.995804573710351e-08,
"loss": 0.4317,
"num_input_tokens_seen": 14915760,
"step": 200,
"train_runtime": 1098.5331,
"train_tokens_per_second": 13577.889
},
{
"epoch": 0.327815112276676,
"grad_norm": 9.094950675964355,
"learning_rate": 9.993431982877141e-08,
"loss": 0.3758,
"num_input_tokens_seen": 18760920,
"step": 250,
"train_runtime": 1370.436,
"train_tokens_per_second": 13689.745
},
{
"epoch": 0.39337813473201116,
"grad_norm": 8.739721298217773,
"learning_rate": 9.990530306384132e-08,
"loss": 0.4875,
"num_input_tokens_seen": 22666272,
"step": 300,
"train_runtime": 1675.777,
"train_tokens_per_second": 13525.828
},
{
"epoch": 0.45894115718734635,
"grad_norm": 5.213091850280762,
"learning_rate": 9.987099851682273e-08,
"loss": 0.5377,
"num_input_tokens_seen": 26550816,
"step": 350,
"train_runtime": 1969.5671,
"train_tokens_per_second": 13480.534
},
{
"epoch": 0.5245041796426815,
"grad_norm": 6.928552627563477,
"learning_rate": 9.983140982249912e-08,
"loss": 0.5284,
"num_input_tokens_seen": 30502512,
"step": 400,
"train_runtime": 2276.9451,
"train_tokens_per_second": 13396.244
},
{
"epoch": 0.5900672020980167,
"grad_norm": 7.398318290710449,
"learning_rate": 9.978654117554268e-08,
"loss": 0.3501,
"num_input_tokens_seen": 34219392,
"step": 450,
"train_runtime": 2557.2601,
"train_tokens_per_second": 13381.272
},
{
"epoch": 0.655630224553352,
"grad_norm": 0.12503379583358765,
"learning_rate": 9.973639733006998e-08,
"loss": 0.4336,
"num_input_tokens_seen": 38231808,
"step": 500,
"train_runtime": 2907.4779,
"train_tokens_per_second": 13149.475
},
{
"epoch": 0.7211932470086871,
"grad_norm": 1.006555199623108,
"learning_rate": 9.968098359913822e-08,
"loss": 0.382,
"num_input_tokens_seen": 42037704,
"step": 550,
"train_runtime": 3185.8277,
"train_tokens_per_second": 13195.222
},
{
"epoch": 0.7867562694640223,
"grad_norm": 7.536371231079102,
"learning_rate": 9.962030585418215e-08,
"loss": 0.3866,
"num_input_tokens_seen": 46037664,
"step": 600,
"train_runtime": 3488.8435,
"train_tokens_per_second": 13195.681
},
{
"epoch": 0.8523192919193575,
"grad_norm": 0.24487841129302979,
"learning_rate": 9.955437052439219e-08,
"loss": 0.4026,
"num_input_tokens_seen": 49944816,
"step": 650,
"train_runtime": 3776.75,
"train_tokens_per_second": 13224.284
},
{
"epoch": 0.9178823143746927,
"grad_norm": 1.2577345371246338,
"learning_rate": 9.948318459603297e-08,
"loss": 0.3547,
"num_input_tokens_seen": 53838960,
"step": 700,
"train_runtime": 4095.3801,
"train_tokens_per_second": 13146.267
},
{
"epoch": 0.9834453368300279,
"grad_norm": 0.23559170961380005,
"learning_rate": 9.940675561170326e-08,
"loss": 0.3269,
"num_input_tokens_seen": 57703848,
"step": 750,
"train_runtime": 4401.0597,
"train_tokens_per_second": 13111.353
},
{
"epoch": 1.048516636616948,
"grad_norm": 2.2465434074401855,
"learning_rate": 9.932509166953673e-08,
"loss": 0.38,
"num_input_tokens_seen": 61456680,
"step": 800,
"train_runtime": 4678.4603,
"train_tokens_per_second": 13136.091
},
{
"epoch": 1.1140796590722832,
"grad_norm": 0.8857269287109375,
"learning_rate": 9.923820142234384e-08,
"loss": 0.3671,
"num_input_tokens_seen": 65352192,
"step": 850,
"train_runtime": 4987.8785,
"train_tokens_per_second": 13102.202
},
{
"epoch": 1.1796426815276184,
"grad_norm": 2.611070394515991,
"learning_rate": 9.914609407669518e-08,
"loss": 0.2795,
"num_input_tokens_seen": 69406008,
"step": 900,
"train_runtime": 5331.3796,
"train_tokens_per_second": 13018.395
},
{
"epoch": 1.2452057039829536,
"grad_norm": 0.18760572373867035,
"learning_rate": 9.904877939194582e-08,
"loss": 0.3224,
"num_input_tokens_seen": 73152336,
"step": 950,
"train_runtime": 5603.6792,
"train_tokens_per_second": 13054.341
},
{
"epoch": 1.3107687264382888,
"grad_norm": 7.031470775604248,
"learning_rate": 9.894626767920125e-08,
"loss": 0.2581,
"num_input_tokens_seen": 76955160,
"step": 1000,
"train_runtime": 5891.4617,
"train_tokens_per_second": 13062.151
},
{
"epoch": 1.376331748893624,
"grad_norm": 3.1105947494506836,
"learning_rate": 9.883856980022501e-08,
"loss": 0.2146,
"num_input_tokens_seen": 80682888,
"step": 1050,
"train_runtime": 6172.6315,
"train_tokens_per_second": 13071.068
},
{
"epoch": 1.4418947713489592,
"grad_norm": 3.3154454231262207,
"learning_rate": 9.872569716628762e-08,
"loss": 0.1974,
"num_input_tokens_seen": 84505128,
"step": 1100,
"train_runtime": 6464.0066,
"train_tokens_per_second": 13073.181
},
{
"epoch": 1.5074577938042943,
"grad_norm": 2.295762062072754,
"learning_rate": 9.860766173695762e-08,
"loss": 0.331,
"num_input_tokens_seen": 88457640,
"step": 1150,
"train_runtime": 6787.6545,
"train_tokens_per_second": 13032.137
},
{
"epoch": 1.5730208162596295,
"grad_norm": 3.430027484893799,
"learning_rate": 9.848447601883434e-08,
"loss": 0.2295,
"num_input_tokens_seen": 92425752,
"step": 1200,
"train_runtime": 7110.9534,
"train_tokens_per_second": 12997.66
},
{
"epoch": 1.6385838387149647,
"grad_norm": 5.2876200675964355,
"learning_rate": 9.83561530642227e-08,
"loss": 0.3534,
"num_input_tokens_seen": 96447384,
"step": 1250,
"train_runtime": 7430.2464,
"train_tokens_per_second": 12980.375
},
{
"epoch": 1.7041468611703,
"grad_norm": 2.3764872550964355,
"learning_rate": 9.822270646975031e-08,
"loss": 0.2875,
"num_input_tokens_seen": 100202232,
"step": 1300,
"train_runtime": 7704.6648,
"train_tokens_per_second": 13005.398
},
{
"epoch": 1.769709883625635,
"grad_norm": 0.5971184968948364,
"learning_rate": 9.808415037492677e-08,
"loss": 0.1869,
"num_input_tokens_seen": 103938744,
"step": 1350,
"train_runtime": 7967.1016,
"train_tokens_per_second": 13045.992
},
{
"epoch": 1.8352729060809705,
"grad_norm": 1.1916333436965942,
"learning_rate": 9.794049946064551e-08,
"loss": 0.2173,
"num_input_tokens_seen": 107626320,
"step": 1400,
"train_runtime": 8229.9563,
"train_tokens_per_second": 13077.387
},
{
"epoch": 1.9008359285363055,
"grad_norm": 1.6566100120544434,
"learning_rate": 9.779176894762831e-08,
"loss": 0.2168,
"num_input_tokens_seen": 111377760,
"step": 1450,
"train_runtime": 8503.7645,
"train_tokens_per_second": 13097.465
},
{
"epoch": 1.9663989509916409,
"grad_norm": 3.6912384033203125,
"learning_rate": 9.763797459481244e-08,
"loss": 0.2844,
"num_input_tokens_seen": 115314840,
"step": 1500,
"train_runtime": 8803.7543,
"train_tokens_per_second": 13098.371
},
{
"epoch": 2.0314702507785607,
"grad_norm": 0.7536889910697937,
"learning_rate": 9.747913269768107e-08,
"loss": 0.1743,
"num_input_tokens_seen": 118930008,
"step": 1550,
"train_runtime": 9062.5998,
"train_tokens_per_second": 13123.167
},
{
"epoch": 2.097033273233896,
"grad_norm": 4.382725715637207,
"learning_rate": 9.731526008653652e-08,
"loss": 0.1793,
"num_input_tokens_seen": 122730384,
"step": 1600,
"train_runtime": 9342.1738,
"train_tokens_per_second": 13137.24
},
{
"epoch": 2.162596295689231,
"grad_norm": 1.2656387090682983,
"learning_rate": 9.714637412471703e-08,
"loss": 0.2939,
"num_input_tokens_seen": 126529800,
"step": 1650,
"train_runtime": 9635.6982,
"train_tokens_per_second": 13131.358
},
{
"epoch": 2.2281593181445665,
"grad_norm": 2.5040361881256104,
"learning_rate": 9.697249270675705e-08,
"loss": 0.2434,
"num_input_tokens_seen": 130443600,
"step": 1700,
"train_runtime": 9927.959,
"train_tokens_per_second": 13139.015
},
{
"epoch": 2.293722340599902,
"grad_norm": 0.9235166311264038,
"learning_rate": 9.679363425649115e-08,
"loss": 0.2993,
"num_input_tokens_seen": 134517072,
"step": 1750,
"train_runtime": 10260.6078,
"train_tokens_per_second": 13110.049
},
{
"epoch": 2.359285363055237,
"grad_norm": 1.0807639360427856,
"learning_rate": 9.660981772510188e-08,
"loss": 0.192,
"num_input_tokens_seen": 138214584,
"step": 1800,
"train_runtime": 10530.3372,
"train_tokens_per_second": 13125.371
},
{
"epoch": 2.424848385510572,
"grad_norm": 1.5869427919387817,
"learning_rate": 9.642106258911184e-08,
"loss": 0.2412,
"num_input_tokens_seen": 142113144,
"step": 1850,
"train_runtime": 10835.7191,
"train_tokens_per_second": 13115.248
},
{
"epoch": 2.490411407965907,
"grad_norm": 1.165739893913269,
"learning_rate": 9.622738884831996e-08,
"loss": 0.2425,
"num_input_tokens_seen": 146119920,
"step": 1900,
"train_runtime": 11148.5673,
"train_tokens_per_second": 13106.61
},
{
"epoch": 2.5559744304212426,
"grad_norm": 1.7617275714874268,
"learning_rate": 9.602881702368242e-08,
"loss": 0.2262,
"num_input_tokens_seen": 150087360,
"step": 1950,
"train_runtime": 11458.6247,
"train_tokens_per_second": 13098.2
},
{
"epoch": 2.6215374528765776,
"grad_norm": 0.4497505724430084,
"learning_rate": 9.582536815513833e-08,
"loss": 0.1427,
"num_input_tokens_seen": 153908160,
"step": 2000,
"train_runtime": 11749.4731,
"train_tokens_per_second": 13099.154
},
{
"epoch": 2.6871004753319125,
"grad_norm": 0.7155716419219971,
"learning_rate": 9.561706379938041e-08,
"loss": 0.222,
"num_input_tokens_seen": 157607040,
"step": 2050,
"train_runtime": 12052.7614,
"train_tokens_per_second": 13076.426
},
{
"epoch": 2.752663497787248,
"grad_norm": 1.3807727098464966,
"learning_rate": 9.540392602757093e-08,
"loss": 0.1474,
"num_input_tokens_seen": 161453160,
"step": 2100,
"train_runtime": 12357.3875,
"train_tokens_per_second": 13065.315
},
{
"epoch": 2.8182265202425834,
"grad_norm": 0.739932119846344,
"learning_rate": 9.518597742300308e-08,
"loss": 0.265,
"num_input_tokens_seen": 165287904,
"step": 2150,
"train_runtime": 12651.7227,
"train_tokens_per_second": 13064.458
},
{
"epoch": 2.8837895426979183,
"grad_norm": 0.4396991431713104,
"learning_rate": 9.496324107870821e-08,
"loss": 0.2944,
"num_input_tokens_seen": 169326888,
"step": 2200,
"train_runtime": 12967.7154,
"train_tokens_per_second": 13057.573
},
{
"epoch": 2.9493525651532537,
"grad_norm": 0.38162505626678467,
"learning_rate": 9.47357405950089e-08,
"loss": 0.2348,
"num_input_tokens_seen": 173020800,
"step": 2250,
"train_runtime": 13223.1088,
"train_tokens_per_second": 13084.729
},
{
"epoch": 3.0144238649401736,
"grad_norm": 3.874674081802368,
"learning_rate": 9.450350007701847e-08,
"loss": 0.2311,
"num_input_tokens_seen": 176668584,
"step": 2300,
"train_runtime": 13516.1565,
"train_tokens_per_second": 13070.919
},
{
"epoch": 3.079986887395509,
"grad_norm": 0.7723739743232727,
"learning_rate": 9.426654413208668e-08,
"loss": 0.2964,
"num_input_tokens_seen": 180729120,
"step": 2350,
"train_runtime": 13841.6367,
"train_tokens_per_second": 13056.918
},
{
"epoch": 3.145549909850844,
"grad_norm": 1.5033811330795288,
"learning_rate": 9.40248978671927e-08,
"loss": 0.2084,
"num_input_tokens_seen": 184677672,
"step": 2400,
"train_runtime": 14150.4953,
"train_tokens_per_second": 13050.969
},
{
"epoch": 3.2111129323061793,
"grad_norm": 1.8196630477905273,
"learning_rate": 9.377858688628464e-08,
"loss": 0.1717,
"num_input_tokens_seen": 188404488,
"step": 2450,
"train_runtime": 14408.9636,
"train_tokens_per_second": 13075.506
},
{
"epoch": 3.2766759547615143,
"grad_norm": 0.9214364290237427,
"learning_rate": 9.352763728756675e-08,
"loss": 0.23,
"num_input_tokens_seen": 192323616,
"step": 2500,
"train_runtime": 14710.1132,
"train_tokens_per_second": 13074.244
},
{
"epoch": 3.3422389772168497,
"grad_norm": 1.0862064361572266,
"learning_rate": 9.327207566073416e-08,
"loss": 0.2271,
"num_input_tokens_seen": 196108272,
"step": 2550,
"train_runtime": 14979.1529,
"train_tokens_per_second": 13092.08
},
{
"epoch": 3.407801999672185,
"grad_norm": 0.8413626551628113,
"learning_rate": 9.301192908415552e-08,
"loss": 0.2193,
"num_input_tokens_seen": 199941432,
"step": 2600,
"train_runtime": 15282.6531,
"train_tokens_per_second": 13082.901
},
{
"epoch": 3.47336502212752,
"grad_norm": 1.531718134880066,
"learning_rate": 9.274722512200379e-08,
"loss": 0.1382,
"num_input_tokens_seen": 203779920,
"step": 2650,
"train_runtime": 15565.7388,
"train_tokens_per_second": 13091.568
},
{
"epoch": 3.538928044582855,
"grad_norm": 0.0838296189904213,
"learning_rate": 9.247799182133582e-08,
"loss": 0.2191,
"num_input_tokens_seen": 207633384,
"step": 2700,
"train_runtime": 15868.2059,
"train_tokens_per_second": 13084.868
},
{
"epoch": 3.6044910670381904,
"grad_norm": 1.1013773679733276,
"learning_rate": 9.220425770912042e-08,
"loss": 0.1988,
"num_input_tokens_seen": 211368360,
"step": 2750,
"train_runtime": 16143.9244,
"train_tokens_per_second": 13092.75
},
{
"epoch": 3.670054089493526,
"grad_norm": 0.40529268980026245,
"learning_rate": 9.192605178921584e-08,
"loss": 0.3072,
"num_input_tokens_seen": 215149128,
"step": 2800,
"train_runtime": 16445.4494,
"train_tokens_per_second": 13082.593
},
{
"epoch": 3.735617111948861,
"grad_norm": 1.5882924795150757,
"learning_rate": 9.164340353929659e-08,
"loss": 0.1822,
"num_input_tokens_seen": 218796552,
"step": 2850,
"train_runtime": 16707.491,
"train_tokens_per_second": 13095.716
},
{
"epoch": 3.8011801344041958,
"grad_norm": 0.862838089466095,
"learning_rate": 9.13563429077301e-08,
"loss": 0.2437,
"num_input_tokens_seen": 222623832,
"step": 2900,
"train_runtime": 16994.843,
"train_tokens_per_second": 13099.493
},
{
"epoch": 3.866743156859531,
"grad_norm": 0.7801971435546875,
"learning_rate": 9.106490031040353e-08,
"loss": 0.3174,
"num_input_tokens_seen": 226629408,
"step": 2950,
"train_runtime": 17320.4321,
"train_tokens_per_second": 13084.512
},
{
"epoch": 3.9323061793148666,
"grad_norm": 0.4492790699005127,
"learning_rate": 9.076910662750096e-08,
"loss": 0.199,
"num_input_tokens_seen": 230444736,
"step": 3000,
"train_runtime": 17612.3894,
"train_tokens_per_second": 13084.24
},
{
"epoch": 3.9978692017702016,
"grad_norm": 4.88616418838501,
"learning_rate": 9.04689932002315e-08,
"loss": 0.1764,
"num_input_tokens_seen": 234389904,
"step": 3050,
"train_runtime": 17949.0057,
"train_tokens_per_second": 13058.657
},
{
"epoch": 4.062940501557121,
"grad_norm": 0.597968339920044,
"learning_rate": 9.016459182750843e-08,
"loss": 0.209,
"num_input_tokens_seen": 238124880,
"step": 3100,
"train_runtime": 18244.7826,
"train_tokens_per_second": 13051.67
},
{
"epoch": 4.128503524012457,
"grad_norm": 0.8793305158615112,
"learning_rate": 8.985593476257997e-08,
"loss": 0.2686,
"num_input_tokens_seen": 241758864,
"step": 3150,
"train_runtime": 18507.4406,
"train_tokens_per_second": 13062.793
},
{
"epoch": 4.194066546467792,
"grad_norm": 7.551540851593018,
"learning_rate": 8.954305470961178e-08,
"loss": 0.2529,
"num_input_tokens_seen": 245698488,
"step": 3200,
"train_runtime": 18827.139,
"train_tokens_per_second": 13050.23
},
{
"epoch": 4.259629568923128,
"grad_norm": 0.4505975842475891,
"learning_rate": 8.922598482022182e-08,
"loss": 0.2384,
"num_input_tokens_seen": 249595968,
"step": 3250,
"train_runtime": 19129.2909,
"train_tokens_per_second": 13047.842
},
{
"epoch": 4.325192591378462,
"grad_norm": 2.2207558155059814,
"learning_rate": 8.890475868996762e-08,
"loss": 0.1867,
"num_input_tokens_seen": 253481304,
"step": 3300,
"train_runtime": 19419.7804,
"train_tokens_per_second": 13052.738
},
{
"epoch": 4.3907556138337975,
"grad_norm": 0.9266397356987,
"learning_rate": 8.857941035478673e-08,
"loss": 0.1763,
"num_input_tokens_seen": 257255976,
"step": 3350,
"train_runtime": 19702.252,
"train_tokens_per_second": 13057.186
},
{
"epoch": 4.456318636289133,
"grad_norm": 0.29596129059791565,
"learning_rate": 8.824997428739036e-08,
"loss": 0.2278,
"num_input_tokens_seen": 261064368,
"step": 3400,
"train_runtime": 19998.9663,
"train_tokens_per_second": 13053.893
},
{
"epoch": 4.521881658744468,
"grad_norm": 0.9699137210845947,
"learning_rate": 8.791648539361072e-08,
"loss": 0.201,
"num_input_tokens_seen": 264944352,
"step": 3450,
"train_runtime": 20299.7802,
"train_tokens_per_second": 13051.587
},
{
"epoch": 4.587444681199804,
"grad_norm": 1.298768401145935,
"learning_rate": 8.757897900870261e-08,
"loss": 0.2057,
"num_input_tokens_seen": 268791072,
"step": 3500,
"train_runtime": 20594.4257,
"train_tokens_per_second": 13051.642
},
{
"epoch": 4.653007703655138,
"grad_norm": 12.011015892028809,
"learning_rate": 8.72374908935994e-08,
"loss": 0.2351,
"num_input_tokens_seen": 272495832,
"step": 3550,
"train_runtime": 20885.3413,
"train_tokens_per_second": 13047.229
},
{
"epoch": 4.718570726110474,
"grad_norm": 0.24729423224925995,
"learning_rate": 8.689205723112387e-08,
"loss": 0.2065,
"num_input_tokens_seen": 276393408,
"step": 3600,
"train_runtime": 21206.0433,
"train_tokens_per_second": 13033.71
},
{
"epoch": 4.784133748565809,
"grad_norm": 2.150505781173706,
"learning_rate": 8.654271462215454e-08,
"loss": 0.158,
"num_input_tokens_seen": 280197624,
"step": 3650,
"train_runtime": 21488.9397,
"train_tokens_per_second": 13039.155
},
{
"epoch": 4.849696771021144,
"grad_norm": 0.4875163435935974,
"learning_rate": 8.618950008174746e-08,
"loss": 0.1832,
"num_input_tokens_seen": 284031624,
"step": 3700,
"train_runtime": 21778.1233,
"train_tokens_per_second": 13042.062
},
{
"epoch": 4.915259793476479,
"grad_norm": 0.5430140495300293,
"learning_rate": 8.583245103521428e-08,
"loss": 0.2566,
"num_input_tokens_seen": 287936280,
"step": 3750,
"train_runtime": 22067.8249,
"train_tokens_per_second": 13047.787
},
{
"epoch": 4.980822815931814,
"grad_norm": 0.3734208941459656,
"learning_rate": 8.547160531415679e-08,
"loss": 0.2775,
"num_input_tokens_seen": 291838584,
"step": 3800,
"train_runtime": 22359.3364,
"train_tokens_per_second": 13052.202
},
{
"epoch": 5.045894115718735,
"grad_norm": 0.9905921220779419,
"learning_rate": 8.510700115245841e-08,
"loss": 0.1971,
"num_input_tokens_seen": 295643712,
"step": 3850,
"train_runtime": 22653.1086,
"train_tokens_per_second": 13050.911
},
{
"epoch": 5.11145713817407,
"grad_norm": 0.0872701108455658,
"learning_rate": 8.473867718223315e-08,
"loss": 0.3142,
"num_input_tokens_seen": 299528016,
"step": 3900,
"train_runtime": 22970.1152,
"train_tokens_per_second": 13039.9
},
{
"epoch": 5.177020160629405,
"grad_norm": 0.7591832876205444,
"learning_rate": 8.436667242973218e-08,
"loss": 0.2291,
"num_input_tokens_seen": 303643632,
"step": 3950,
"train_runtime": 23324.5779,
"train_tokens_per_second": 13018.183
},
{
"epoch": 5.24258318308474,
"grad_norm": 0.44477882981300354,
"learning_rate": 8.399102631120877e-08,
"loss": 0.2128,
"num_input_tokens_seen": 307574184,
"step": 4000,
"train_runtime": 23603.684,
"train_tokens_per_second": 13030.77
},
{
"epoch": 5.308146205540075,
"grad_norm": 0.48096030950546265,
"learning_rate": 8.361177862874202e-08,
"loss": 0.1472,
"num_input_tokens_seen": 311323584,
"step": 4050,
"train_runtime": 23888.6512,
"train_tokens_per_second": 13032.28
},
{
"epoch": 5.373709227995411,
"grad_norm": 0.9138302206993103,
"learning_rate": 8.32289695660194e-08,
"loss": 0.1981,
"num_input_tokens_seen": 315158328,
"step": 4100,
"train_runtime": 24182.2327,
"train_tokens_per_second": 13032.64
},
{
"epoch": 5.439272250450745,
"grad_norm": 0.3333579897880554,
"learning_rate": 8.284263968407912e-08,
"loss": 0.1837,
"num_input_tokens_seen": 318844944,
"step": 4150,
"train_runtime": 24456.5915,
"train_tokens_per_second": 13037.178
},
{
"epoch": 5.504835272906081,
"grad_norm": 0.9484214782714844,
"learning_rate": 8.245282991701243e-08,
"loss": 0.2015,
"num_input_tokens_seen": 322685568,
"step": 4200,
"train_runtime": 24723.1173,
"train_tokens_per_second": 13051.977
},
{
"epoch": 5.570398295361416,
"grad_norm": 0.4100230634212494,
"learning_rate": 8.205958156762646e-08,
"loss": 0.2554,
"num_input_tokens_seen": 326275680,
"step": 4250,
"train_runtime": 24984.4942,
"train_tokens_per_second": 13059.127
},
{
"epoch": 5.635961317816752,
"grad_norm": 0.9571174383163452,
"learning_rate": 8.166293630306773e-08,
"loss": 0.2039,
"num_input_tokens_seen": 330026184,
"step": 4300,
"train_runtime": 25280.2384,
"train_tokens_per_second": 13054.71
},
{
"epoch": 5.701524340272087,
"grad_norm": 0.5215702652931213,
"learning_rate": 8.126293615040747e-08,
"loss": 0.2277,
"num_input_tokens_seen": 333968520,
"step": 4350,
"train_runtime": 25565.1364,
"train_tokens_per_second": 13063.436
},
{
"epoch": 5.7670873627274215,
"grad_norm": 0.4471840560436249,
"learning_rate": 8.085962349218847e-08,
"loss": 0.2104,
"num_input_tokens_seen": 337707624,
"step": 4400,
"train_runtime": 25841.3753,
"train_tokens_per_second": 13068.485
},
{
"epoch": 5.832650385182757,
"grad_norm": 1.0097142457962036,
"learning_rate": 8.04530410619344e-08,
"loss": 0.2524,
"num_input_tokens_seen": 341503488,
"step": 4450,
"train_runtime": 26137.1854,
"train_tokens_per_second": 13065.81
},
{
"epoch": 5.898213407638092,
"grad_norm": 1.6211527585983276,
"learning_rate": 8.004323193962197e-08,
"loss": 0.1595,
"num_input_tokens_seen": 345388440,
"step": 4500,
"train_runtime": 26453.2756,
"train_tokens_per_second": 13056.547
},
{
"epoch": 5.963776430093427,
"grad_norm": 0.25499045848846436,
"learning_rate": 7.963023954711624e-08,
"loss": 0.2721,
"num_input_tokens_seen": 349216920,
"step": 4550,
"train_runtime": 26741.8598,
"train_tokens_per_second": 13058.812
},
{
"epoch": 6.028847729880347,
"grad_norm": 0.6265522837638855,
"learning_rate": 7.921410764356988e-08,
"loss": 0.1993,
"num_input_tokens_seen": 353096424,
"step": 4600,
"train_runtime": 27061.8507,
"train_tokens_per_second": 13047.756
},
{
"epoch": 6.0944107523356825,
"grad_norm": 0.06899835169315338,
"learning_rate": 7.87948803207866e-08,
"loss": 0.2228,
"num_input_tokens_seen": 356829384,
"step": 4650,
"train_runtime": 27330.1966,
"train_tokens_per_second": 13056.232
},
{
"epoch": 6.159973774791018,
"grad_norm": 0.8082672953605652,
"learning_rate": 7.837260199854929e-08,
"loss": 0.1859,
"num_input_tokens_seen": 360447864,
"step": 4700,
"train_runtime": 27571.8796,
"train_tokens_per_second": 13073.025
},
{
"epoch": 6.225536797246353,
"grad_norm": 0.6293157339096069,
"learning_rate": 7.794731741991355e-08,
"loss": 0.2223,
"num_input_tokens_seen": 364279296,
"step": 4750,
"train_runtime": 27852.8113,
"train_tokens_per_second": 13078.726
},
{
"epoch": 6.291099819701688,
"grad_norm": 0.7018508315086365,
"learning_rate": 7.751907164646682e-08,
"loss": 0.1709,
"num_input_tokens_seen": 368000976,
"step": 4800,
"train_runtime": 28103.875,
"train_tokens_per_second": 13094.314
},
{
"epoch": 6.356662842157023,
"grad_norm": 0.3939789831638336,
"learning_rate": 7.70879100535538e-08,
"loss": 0.1903,
"num_input_tokens_seen": 371666208,
"step": 4850,
"train_runtime": 28370.3397,
"train_tokens_per_second": 13100.52
},
{
"epoch": 6.422225864612359,
"grad_norm": 0.07075575739145279,
"learning_rate": 7.665387832546873e-08,
"loss": 0.1653,
"num_input_tokens_seen": 375530976,
"step": 4900,
"train_runtime": 28672.2738,
"train_tokens_per_second": 13097.356
},
{
"epoch": 6.487788887067694,
"grad_norm": 1.4741820096969604,
"learning_rate": 7.621702245061479e-08,
"loss": 0.2247,
"num_input_tokens_seen": 379400040,
"step": 4950,
"train_runtime": 28956.4169,
"train_tokens_per_second": 13102.451
},
{
"epoch": 6.553351909523029,
"grad_norm": 0.756077229976654,
"learning_rate": 7.577738871663131e-08,
"loss": 0.2299,
"num_input_tokens_seen": 383417568,
"step": 5000,
"train_runtime": 29294.8197,
"train_tokens_per_second": 13088.238
}
],
"logging_steps": 50,
"max_steps": 15260,
"num_input_tokens_seen": 383417568,
"num_train_epochs": 20,
"save_steps": 1000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 8.358913850245906e+18,
"train_batch_size": 3,
"trial_name": null,
"trial_params": null
}