StepCount-7B-SFT-1M / trainer_state.json
zhangchenhao
Upload StepCount-7B-SFT-1M checkpoint-476 remaining files
6088da0
Raw
History Blame Contribute Delete
36.3 kB
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.24257867244234935,
"eval_steps": 238,
"global_step": 476,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.002038476239011339,
"grad_norm": 0.6477837651252902,
"learning_rate": 2.699735382882792e-06,
"loss": 0.5815,
"num_input_tokens_seen": 6936832,
"step": 4,
"train_runtime": 594.7401,
"train_tokens_per_second": 11663.637
},
{
"epoch": 0.004076952478022678,
"grad_norm": 0.6182898594326122,
"learning_rate": 2.698559515983203e-06,
"loss": 0.5691,
"num_input_tokens_seen": 14270848,
"step": 8,
"train_runtime": 1203.7054,
"train_tokens_per_second": 11855.764
},
{
"epoch": 0.006115428717034017,
"grad_norm": 0.6559666459303339,
"learning_rate": 2.696443816026242e-06,
"loss": 0.5597,
"num_input_tokens_seen": 21129216,
"step": 12,
"train_runtime": 1762.6999,
"train_tokens_per_second": 11986.848
},
{
"epoch": 0.008153904956045356,
"grad_norm": 0.6460312307750392,
"learning_rate": 2.693389757477617e-06,
"loss": 0.564,
"num_input_tokens_seen": 28254528,
"step": 16,
"train_runtime": 2338.7069,
"train_tokens_per_second": 12081.261
},
{
"epoch": 0.010192381195056695,
"grad_norm": 0.7820470969086647,
"learning_rate": 2.689399468760395e-06,
"loss": 0.5386,
"num_input_tokens_seen": 35623168,
"step": 20,
"train_runtime": 2958.6163,
"train_tokens_per_second": 12040.482
},
{
"epoch": 0.012230857434068034,
"grad_norm": 0.7662986210694266,
"learning_rate": 2.6844757307716743e-06,
"loss": 0.5589,
"num_input_tokens_seen": 42908288,
"step": 24,
"train_runtime": 3591.5632,
"train_tokens_per_second": 11946.967
},
{
"epoch": 0.014269333673079374,
"grad_norm": 0.7264783091425356,
"learning_rate": 2.678621974944531e-06,
"loss": 0.5349,
"num_input_tokens_seen": 49922816,
"step": 28,
"train_runtime": 4214.8304,
"train_tokens_per_second": 11844.561
},
{
"epoch": 0.01630780991209071,
"grad_norm": 0.7961355791031945,
"learning_rate": 2.6718422808565973e-06,
"loss": 0.5432,
"num_input_tokens_seen": 56972736,
"step": 32,
"train_runtime": 4806.2524,
"train_tokens_per_second": 11853.879
},
{
"epoch": 0.01834628615110205,
"grad_norm": 0.8138139768273569,
"learning_rate": 2.6641413733869353e-06,
"loss": 0.538,
"num_input_tokens_seen": 64012032,
"step": 36,
"train_runtime": 5375.3791,
"train_tokens_per_second": 11908.375
},
{
"epoch": 0.02038476239011339,
"grad_norm": 0.7766770677480328,
"learning_rate": 2.6555246194231957e-06,
"loss": 0.5368,
"num_input_tokens_seen": 71145984,
"step": 40,
"train_runtime": 5983.1229,
"train_tokens_per_second": 11891.112
},
{
"epoch": 0.02242323862912473,
"grad_norm": 0.786674093556373,
"learning_rate": 2.6459980241213444e-06,
"loss": 0.5267,
"num_input_tokens_seen": 78403392,
"step": 44,
"train_runtime": 6587.3777,
"train_tokens_per_second": 11902.064
},
{
"epoch": 0.02446171486813607,
"grad_norm": 0.895151868173495,
"learning_rate": 2.6355682267205705e-06,
"loss": 0.5334,
"num_input_tokens_seen": 85316672,
"step": 48,
"train_runtime": 7184.1032,
"train_tokens_per_second": 11875.758
},
{
"epoch": 0.02650019110714741,
"grad_norm": 0.7705686339787958,
"learning_rate": 2.6242424959162964e-06,
"loss": 0.531,
"num_input_tokens_seen": 92412736,
"step": 52,
"train_runtime": 7797.6138,
"train_tokens_per_second": 11851.412
},
{
"epoch": 0.028538667346158748,
"grad_norm": 0.819091795569789,
"learning_rate": 2.612028724794501e-06,
"loss": 0.5213,
"num_input_tokens_seen": 99543616,
"step": 56,
"train_runtime": 8349.8154,
"train_tokens_per_second": 11921.655
},
{
"epoch": 0.030577143585170087,
"grad_norm": 0.8450961282572036,
"learning_rate": 2.598935425330904e-06,
"loss": 0.5254,
"num_input_tokens_seen": 106489728,
"step": 60,
"train_runtime": 8960.4265,
"train_tokens_per_second": 11884.449
},
{
"epoch": 0.03261561982418142,
"grad_norm": 0.9361937413311264,
"learning_rate": 2.5849717224588284e-06,
"loss": 0.5376,
"num_input_tokens_seen": 113524032,
"step": 64,
"train_runtime": 9543.0991,
"train_tokens_per_second": 11895.929
},
{
"epoch": 0.034654096063192766,
"grad_norm": 0.8847320129841505,
"learning_rate": 2.5701473477098874e-06,
"loss": 0.5651,
"num_input_tokens_seen": 120475456,
"step": 68,
"train_runtime": 10086.9254,
"train_tokens_per_second": 11943.724
},
{
"epoch": 0.0366925723022041,
"grad_norm": 0.8777390400419234,
"learning_rate": 2.5544726324319225e-06,
"loss": 0.5692,
"num_input_tokens_seen": 127670080,
"step": 72,
"train_runtime": 10665.8117,
"train_tokens_per_second": 11970.029
},
{
"epoch": 0.038731048541215445,
"grad_norm": 0.9576210756092853,
"learning_rate": 2.5379585005889178e-06,
"loss": 0.5625,
"num_input_tokens_seen": 134768512,
"step": 76,
"train_runtime": 11283.1943,
"train_tokens_per_second": 11944.181
},
{
"epoch": 0.04076952478022678,
"grad_norm": 0.8753248708570508,
"learning_rate": 2.5206164611479122e-06,
"loss": 0.5805,
"num_input_tokens_seen": 141573760,
"step": 80,
"train_runtime": 11827.0251,
"train_tokens_per_second": 11970.361
},
{
"epoch": 0.042808001019238116,
"grad_norm": 0.8684000412026938,
"learning_rate": 2.502458600058214e-06,
"loss": 0.5426,
"num_input_tokens_seen": 148565440,
"step": 84,
"train_runtime": 12416.0172,
"train_tokens_per_second": 11965.628
},
{
"epoch": 0.04484647725824946,
"grad_norm": 0.7849759417456085,
"learning_rate": 2.4834975718285047e-06,
"loss": 0.5676,
"num_input_tokens_seen": 155693632,
"step": 88,
"train_runtime": 13027.3428,
"train_tokens_per_second": 11951.296
},
{
"epoch": 0.046884953497260795,
"grad_norm": 0.8527233643540989,
"learning_rate": 2.463746590707708e-06,
"loss": 0.5706,
"num_input_tokens_seen": 162884544,
"step": 92,
"train_runtime": 13621.8968,
"train_tokens_per_second": 11957.552
},
{
"epoch": 0.04892342973627214,
"grad_norm": 0.8245948514984758,
"learning_rate": 2.4432194214757634e-06,
"loss": 0.5601,
"num_input_tokens_seen": 169884480,
"step": 96,
"train_runtime": 14201.6379,
"train_tokens_per_second": 11962.316
},
{
"epoch": 0.050961905975283474,
"grad_norm": 0.9463896487710859,
"learning_rate": 2.4219303698507273e-06,
"loss": 0.578,
"num_input_tokens_seen": 177144768,
"step": 100,
"train_runtime": 14812.6402,
"train_tokens_per_second": 11959.027
},
{
"epoch": 0.05300038221429482,
"grad_norm": 0.8528440593847427,
"learning_rate": 2.399894272518887e-06,
"loss": 0.5695,
"num_input_tokens_seen": 184355904,
"step": 104,
"train_runtime": 15392.8793,
"train_tokens_per_second": 11976.7
},
{
"epoch": 0.05503885845330615,
"grad_norm": 0.8983751041750461,
"learning_rate": 2.3771264867948297e-06,
"loss": 0.5643,
"num_input_tokens_seen": 191461120,
"step": 108,
"train_runtime": 15962.3103,
"train_tokens_per_second": 11994.574
},
{
"epoch": 0.057077334692317495,
"grad_norm": 0.9738501561599475,
"learning_rate": 2.353642879918684e-06,
"loss": 0.5611,
"num_input_tokens_seen": 198512640,
"step": 112,
"train_runtime": 16542.875,
"train_tokens_per_second": 11999.888
},
{
"epoch": 0.05911581093132883,
"grad_norm": 0.9069372359842974,
"learning_rate": 2.329459817997979e-06,
"loss": 0.555,
"num_input_tokens_seen": 205184640,
"step": 116,
"train_runtime": 17132.5621,
"train_tokens_per_second": 11976.296
},
{
"epoch": 0.061154287170340174,
"grad_norm": 0.8535229191703974,
"learning_rate": 2.304594154601839e-06,
"loss": 0.5802,
"num_input_tokens_seen": 212192768,
"step": 120,
"train_runtime": 17695.1167,
"train_tokens_per_second": 11991.6
},
{
"epoch": 0.06319276340935151,
"grad_norm": 0.9235254276307827,
"learning_rate": 2.2790632190154588e-06,
"loss": 0.5602,
"num_input_tokens_seen": 219343424,
"step": 124,
"train_runtime": 18276.554,
"train_tokens_per_second": 12001.356
},
{
"epoch": 0.06523123964836285,
"grad_norm": 0.8469907739766386,
"learning_rate": 2.2528848041630394e-06,
"loss": 0.5726,
"num_input_tokens_seen": 226825024,
"step": 128,
"train_runtime": 18867.9805,
"train_tokens_per_second": 12021.691
},
{
"epoch": 0.06726971588737418,
"grad_norm": 0.863302212361952,
"learning_rate": 2.226077154207613e-06,
"loss": 0.5458,
"num_input_tokens_seen": 233873600,
"step": 132,
"train_runtime": 19428.0179,
"train_tokens_per_second": 12037.955
},
{
"epoch": 0.06930819212638553,
"grad_norm": 0.890826252542842,
"learning_rate": 2.1986589518363884e-06,
"loss": 0.5683,
"num_input_tokens_seen": 241046016,
"step": 136,
"train_runtime": 20027.9177,
"train_tokens_per_second": 12035.501
},
{
"epoch": 0.07134666836539687,
"grad_norm": 0.8897818157637029,
"learning_rate": 2.17064930524048e-06,
"loss": 0.5753,
"num_input_tokens_seen": 248445952,
"step": 140,
"train_runtime": 20655.1895,
"train_tokens_per_second": 12028.258
},
{
"epoch": 0.0733851446044082,
"grad_norm": 0.7970770860102688,
"learning_rate": 2.1420677347981022e-06,
"loss": 0.5492,
"num_input_tokens_seen": 255691072,
"step": 144,
"train_runtime": 21246.8799,
"train_tokens_per_second": 12034.288
},
{
"epoch": 0.07542362084341954,
"grad_norm": 1.114229924043429,
"learning_rate": 2.112934159470499e-06,
"loss": 0.5696,
"num_input_tokens_seen": 263081280,
"step": 148,
"train_runtime": 21841.1341,
"train_tokens_per_second": 12045.221
},
{
"epoch": 0.07746209708243089,
"grad_norm": 0.8591461034263972,
"learning_rate": 2.083268882920095e-06,
"loss": 0.5765,
"num_input_tokens_seen": 270227584,
"step": 152,
"train_runtime": 22468.8462,
"train_tokens_per_second": 12026.767
},
{
"epoch": 0.07950057332144222,
"grad_norm": 0.8676767698372961,
"learning_rate": 2.053092579360543e-06,
"loss": 0.5706,
"num_input_tokens_seen": 277353664,
"step": 156,
"train_runtime": 23053.2043,
"train_tokens_per_second": 12031.024
},
{
"epoch": 0.08153904956045356,
"grad_norm": 0.9224293367262005,
"learning_rate": 2.0224262791485315e-06,
"loss": 0.5608,
"num_input_tokens_seen": 284646784,
"step": 160,
"train_runtime": 23643.9972,
"train_tokens_per_second": 12038.86
},
{
"epoch": 0.0835775257994649,
"grad_norm": 0.887751023260673,
"learning_rate": 1.991291354127381e-06,
"loss": 0.5636,
"num_input_tokens_seen": 291840192,
"step": 164,
"train_runtime": 24251.2033,
"train_tokens_per_second": 12034.05
},
{
"epoch": 0.08561600203847623,
"grad_norm": 0.8971166373496055,
"learning_rate": 1.959709502732666e-06,
"loss": 0.5624,
"num_input_tokens_seen": 298899456,
"step": 168,
"train_runtime": 24847.2173,
"train_tokens_per_second": 12029.494
},
{
"epoch": 0.08765447827748758,
"grad_norm": 0.884570439459004,
"learning_rate": 1.927702734870216e-06,
"loss": 0.5802,
"num_input_tokens_seen": 305987520,
"step": 172,
"train_runtime": 25438.1181,
"train_tokens_per_second": 12028.701
},
{
"epoch": 0.08969295451649892,
"grad_norm": 0.7905104567457971,
"learning_rate": 1.895293356577058e-06,
"loss": 0.5557,
"num_input_tokens_seen": 312989312,
"step": 176,
"train_runtime": 26002.8183,
"train_tokens_per_second": 12036.746
},
{
"epoch": 0.09173143075551025,
"grad_norm": 0.9424718927507695,
"learning_rate": 1.8625039544759767e-06,
"loss": 0.5663,
"num_input_tokens_seen": 320097152,
"step": 180,
"train_runtime": 26586.1368,
"train_tokens_per_second": 12040.002
},
{
"epoch": 0.09376990699452159,
"grad_norm": 0.8615504446489718,
"learning_rate": 1.8293573800345261e-06,
"loss": 0.5729,
"num_input_tokens_seen": 327245696,
"step": 184,
"train_runtime": 27215.267,
"train_tokens_per_second": 12024.343
},
{
"epoch": 0.09580838323353294,
"grad_norm": 0.9699743298030914,
"learning_rate": 1.7958767336394758e-06,
"loss": 0.5737,
"num_input_tokens_seen": 334634880,
"step": 188,
"train_runtime": 27830.7489,
"train_tokens_per_second": 12023.927
},
{
"epoch": 0.09784685947254428,
"grad_norm": 0.8699479610963319,
"learning_rate": 1.7620853484977693e-06,
"loss": 0.5578,
"num_input_tokens_seen": 341804032,
"step": 192,
"train_runtime": 28418.8119,
"train_tokens_per_second": 12027.386
},
{
"epoch": 0.09988533571155561,
"grad_norm": 0.9257049516328657,
"learning_rate": 1.7280067743752384e-06,
"loss": 0.5521,
"num_input_tokens_seen": 348671424,
"step": 196,
"train_runtime": 29007.5671,
"train_tokens_per_second": 12020.016
},
{
"epoch": 0.10192381195056695,
"grad_norm": 0.9131171296688921,
"learning_rate": 1.6936647611843846e-06,
"loss": 0.5682,
"num_input_tokens_seen": 355675520,
"step": 200,
"train_runtime": 29580.1297,
"train_tokens_per_second": 12024.137
},
{
"epoch": 0.10396228818957828,
"grad_norm": 0.8769808506815606,
"learning_rate": 1.659083242432681e-06,
"loss": 0.5584,
"num_input_tokens_seen": 362651648,
"step": 204,
"train_runtime": 30152.9255,
"train_tokens_per_second": 12027.08
},
{
"epoch": 0.10600076442858963,
"grad_norm": 0.9497619416730099,
"learning_rate": 1.6242863185429212e-06,
"loss": 0.5879,
"num_input_tokens_seen": 369726720,
"step": 208,
"train_runtime": 30751.1104,
"train_tokens_per_second": 12023.199
},
{
"epoch": 0.10803924066760097,
"grad_norm": 0.9230137333974207,
"learning_rate": 1.5892982400572422e-06,
"loss": 0.5681,
"num_input_tokens_seen": 376862016,
"step": 212,
"train_runtime": 31362.1922,
"train_tokens_per_second": 12016.444
},
{
"epoch": 0.1100777169066123,
"grad_norm": 0.8995334199668418,
"learning_rate": 1.5541433907365264e-06,
"loss": 0.5455,
"num_input_tokens_seen": 383921152,
"step": 216,
"train_runtime": 31914.7502,
"train_tokens_per_second": 12029.583
},
{
"epoch": 0.11211619314562364,
"grad_norm": 0.9043735392416916,
"learning_rate": 1.5188462705669648e-06,
"loss": 0.5641,
"num_input_tokens_seen": 390988416,
"step": 220,
"train_runtime": 32509.882,
"train_tokens_per_second": 12026.756
},
{
"epoch": 0.11415466938463499,
"grad_norm": 0.9729636398442392,
"learning_rate": 1.4834314786856161e-06,
"loss": 0.5607,
"num_input_tokens_seen": 398152576,
"step": 224,
"train_runtime": 33085.7673,
"train_tokens_per_second": 12033.953
},
{
"epoch": 0.11619314562364633,
"grad_norm": 0.9539597912248131,
"learning_rate": 1.4479236962368684e-06,
"loss": 0.5589,
"num_input_tokens_seen": 405173888,
"step": 228,
"train_runtime": 33682.3182,
"train_tokens_per_second": 12029.276
},
{
"epoch": 0.11823162186265766,
"grad_norm": 0.8598081231037438,
"learning_rate": 1.4123476691717487e-06,
"loss": 0.5518,
"num_input_tokens_seen": 412070528,
"step": 232,
"train_runtime": 34252.7917,
"train_tokens_per_second": 12030.276
},
{
"epoch": 0.120270098101669,
"grad_norm": 0.8860637699401077,
"learning_rate": 1.376728191002066e-06,
"loss": 0.558,
"num_input_tokens_seen": 419423744,
"step": 236,
"train_runtime": 34844.2912,
"train_tokens_per_second": 12037.086
},
{
"epoch": 0.12128933622117467,
"eval_loss": 0.7941220998764038,
"eval_runtime": 213.2901,
"eval_samples_per_second": 4.717,
"eval_steps_per_second": 0.075,
"num_input_tokens_seen": 422911872,
"step": 238
},
{
"epoch": 0.12230857434068035,
"grad_norm": 0.9547518291189157,
"learning_rate": 1.3410900855214124e-06,
"loss": 0.5732,
"num_input_tokens_seen": 426493248,
"step": 240,
"train_runtime": 35670.5805,
"train_tokens_per_second": 11956.443
},
{
"epoch": 0.12434705057969168,
"grad_norm": 0.8725219735566059,
"learning_rate": 1.305458189505055e-06,
"loss": 0.5605,
"num_input_tokens_seen": 433674688,
"step": 244,
"train_runtime": 36270.2542,
"train_tokens_per_second": 11956.759
},
{
"epoch": 0.12638552681870302,
"grad_norm": 0.9381950485311329,
"learning_rate": 1.269857335400783e-06,
"loss": 0.5758,
"num_input_tokens_seen": 441063552,
"step": 248,
"train_runtime": 36895.0601,
"train_tokens_per_second": 11954.542
},
{
"epoch": 0.12842400305771437,
"grad_norm": 0.8095871776257958,
"learning_rate": 1.2343123340227753e-06,
"loss": 0.5682,
"num_input_tokens_seen": 448549632,
"step": 252,
"train_runtime": 37565.6709,
"train_tokens_per_second": 11940.413
},
{
"epoch": 0.1304624792967257,
"grad_norm": 0.8656130470620739,
"learning_rate": 1.1988479572605345e-06,
"loss": 0.5545,
"num_input_tokens_seen": 455502912,
"step": 256,
"train_runtime": 38151.4724,
"train_tokens_per_second": 11939.327
},
{
"epoch": 0.13250095553573704,
"grad_norm": 0.9346351864084386,
"learning_rate": 1.1634889208149594e-06,
"loss": 0.5664,
"num_input_tokens_seen": 462435200,
"step": 260,
"train_runtime": 38714.2142,
"train_tokens_per_second": 11944.843
},
{
"epoch": 0.13453943177474836,
"grad_norm": 0.899249765870996,
"learning_rate": 1.1282598669735671e-06,
"loss": 0.558,
"num_input_tokens_seen": 469336640,
"step": 264,
"train_runtime": 39254.3889,
"train_tokens_per_second": 11956.284
},
{
"epoch": 0.1365779080137597,
"grad_norm": 0.8458171655451426,
"learning_rate": 1.093185347436887e-06,
"loss": 0.5538,
"num_input_tokens_seen": 476310656,
"step": 268,
"train_runtime": 39827.3684,
"train_tokens_per_second": 11959.381
},
{
"epoch": 0.13861638425277106,
"grad_norm": 0.9179027627040293,
"learning_rate": 1.058289806207975e-06,
"loss": 0.5698,
"num_input_tokens_seen": 483771840,
"step": 272,
"train_runtime": 40459.3108,
"train_tokens_per_second": 11956.997
},
{
"epoch": 0.14065486049178239,
"grad_norm": 0.9442703488926768,
"learning_rate": 1.0235975625569967e-06,
"loss": 0.5649,
"num_input_tokens_seen": 490750336,
"step": 276,
"train_runtime": 41048.4541,
"train_tokens_per_second": 11955.391
},
{
"epoch": 0.14269333673079374,
"grad_norm": 0.8974082301624304,
"learning_rate": 9.891327940727266e-07,
"loss": 0.5657,
"num_input_tokens_seen": 497702912,
"step": 280,
"train_runtime": 41610.7519,
"train_tokens_per_second": 11960.921
},
{
"epoch": 0.14473181296980506,
"grad_norm": 0.9898213671868435,
"learning_rate": 9.549195198127994e-07,
"loss": 0.58,
"num_input_tokens_seen": 505099584,
"step": 284,
"train_runtime": 42219.4154,
"train_tokens_per_second": 11963.68
},
{
"epoch": 0.1467702892088164,
"grad_norm": 0.9363039019647847,
"learning_rate": 9.209815835644328e-07,
"loss": 0.5512,
"num_input_tokens_seen": 512152896,
"step": 288,
"train_runtime": 42766.508,
"train_tokens_per_second": 11975.56
},
{
"epoch": 0.14880876544782776,
"grad_norm": 0.9589553041250718,
"learning_rate": 8.873426372273072e-07,
"loss": 0.5747,
"num_input_tokens_seen": 519187584,
"step": 292,
"train_runtime": 43380.0418,
"train_tokens_per_second": 11968.351
},
{
"epoch": 0.15084724168683908,
"grad_norm": 0.9433972448251747,
"learning_rate": 8.540261243301721e-07,
"loss": 0.5509,
"num_input_tokens_seen": 526140736,
"step": 296,
"train_runtime": 43995.5053,
"train_tokens_per_second": 11958.966
},
{
"epoch": 0.15288571792585043,
"grad_norm": 0.9393953597111547,
"learning_rate": 8.210552636926686e-07,
"loss": 0.5629,
"num_input_tokens_seen": 532992384,
"step": 300,
"train_runtime": 44595.3555,
"train_tokens_per_second": 11951.746
},
{
"epoch": 0.15492419416486178,
"grad_norm": 0.9091057919029727,
"learning_rate": 7.884530332437565e-07,
"loss": 0.5663,
"num_input_tokens_seen": 540379392,
"step": 304,
"train_runtime": 45193.3818,
"train_tokens_per_second": 11957.047
},
{
"epoch": 0.1569626704038731,
"grad_norm": 0.8832084768576383,
"learning_rate": 7.562421540080231e-07,
"loss": 0.5613,
"num_input_tokens_seen": 547461440,
"step": 308,
"train_runtime": 45756.6672,
"train_tokens_per_second": 11964.627
},
{
"epoch": 0.15900114664288445,
"grad_norm": 0.8587618292954599,
"learning_rate": 7.244450742710321e-07,
"loss": 0.561,
"num_input_tokens_seen": 554617984,
"step": 312,
"train_runtime": 46381.4062,
"train_tokens_per_second": 11957.766
},
{
"epoch": 0.16103962288189577,
"grad_norm": 0.9446856546867352,
"learning_rate": 6.930839539347442e-07,
"loss": 0.5532,
"num_input_tokens_seen": 561693888,
"step": 316,
"train_runtime": 46994.8669,
"train_tokens_per_second": 11952.239
},
{
"epoch": 0.16307809912090712,
"grad_norm": 0.9429683753374553,
"learning_rate": 6.621806490739267e-07,
"loss": 0.5595,
"num_input_tokens_seen": 568833664,
"step": 320,
"train_runtime": 47586.3302,
"train_tokens_per_second": 11953.72
},
{
"epoch": 0.16511657535991847,
"grad_norm": 0.9035859891449433,
"learning_rate": 6.317566967042958e-07,
"loss": 0.5504,
"num_input_tokens_seen": 576161408,
"step": 324,
"train_runtime": 48187.0813,
"train_tokens_per_second": 11956.761
},
{
"epoch": 0.1671550515989298,
"grad_norm": 0.9033151093018099,
"learning_rate": 6.018332997730213e-07,
"loss": 0.5605,
"num_input_tokens_seen": 583395328,
"step": 328,
"train_runtime": 48788.5572,
"train_tokens_per_second": 11957.626
},
{
"epoch": 0.16919352783794114,
"grad_norm": 0.9609542284697502,
"learning_rate": 5.724313123820482e-07,
"loss": 0.5557,
"num_input_tokens_seen": 590636544,
"step": 332,
"train_runtime": 49400.1577,
"train_tokens_per_second": 11956.167
},
{
"epoch": 0.17123200407695247,
"grad_norm": 0.8861118611359287,
"learning_rate": 5.435712252545331e-07,
"loss": 0.5608,
"num_input_tokens_seen": 597723776,
"step": 336,
"train_runtime": 50003.6729,
"train_tokens_per_second": 11953.597
},
{
"epoch": 0.17327048031596382,
"grad_norm": 0.9761110376120566,
"learning_rate": 5.152731514545266e-07,
"loss": 0.5478,
"num_input_tokens_seen": 604770752,
"step": 340,
"train_runtime": 50553.9712,
"train_tokens_per_second": 11962.873
},
{
"epoch": 0.17530895655497516,
"grad_norm": 0.9053070583607986,
"learning_rate": 4.875568123698525e-07,
"loss": 0.55,
"num_input_tokens_seen": 612028800,
"step": 344,
"train_runtime": 51128.5271,
"train_tokens_per_second": 11970.398
},
{
"epoch": 0.1773474327939865,
"grad_norm": 0.901496752129611,
"learning_rate": 4.604415239679492e-07,
"loss": 0.5682,
"num_input_tokens_seen": 619137792,
"step": 348,
"train_runtime": 51752.3485,
"train_tokens_per_second": 11963.472
},
{
"epoch": 0.17938590903299784,
"grad_norm": 0.9438945381441597,
"learning_rate": 4.3394618333426135e-07,
"loss": 0.5652,
"num_input_tokens_seen": 625854848,
"step": 352,
"train_runtime": 52310.1207,
"train_tokens_per_second": 11964.317
},
{
"epoch": 0.1814243852720092,
"grad_norm": 0.9543610627021044,
"learning_rate": 4.080892555025522e-07,
"loss": 0.5581,
"num_input_tokens_seen": 632626752,
"step": 356,
"train_runtime": 52857.0375,
"train_tokens_per_second": 11968.638
},
{
"epoch": 0.1834628615110205,
"grad_norm": 0.9118473255578559,
"learning_rate": 3.8288876058632056e-07,
"loss": 0.56,
"num_input_tokens_seen": 639638080,
"step": 360,
"train_runtime": 53443.4994,
"train_tokens_per_second": 11968.492
},
{
"epoch": 0.18550133775003186,
"grad_norm": 0.9212497452458377,
"learning_rate": 3.5836226122029165e-07,
"loss": 0.557,
"num_input_tokens_seen": 646705152,
"step": 364,
"train_runtime": 54052.7201,
"train_tokens_per_second": 11964.341
},
{
"epoch": 0.18753981398904318,
"grad_norm": 0.9056019341991234,
"learning_rate": 3.34526850320731e-07,
"loss": 0.5602,
"num_input_tokens_seen": 653674880,
"step": 368,
"train_runtime": 54641.6155,
"train_tokens_per_second": 11962.949
},
{
"epoch": 0.18957829022805453,
"grad_norm": 1.0036743775927282,
"learning_rate": 3.1139913917311347e-07,
"loss": 0.5787,
"num_input_tokens_seen": 660674240,
"step": 372,
"train_runtime": 55222.9846,
"train_tokens_per_second": 11963.755
},
{
"epoch": 0.19161676646706588,
"grad_norm": 0.9225731699510716,
"learning_rate": 2.889952458554475e-07,
"loss": 0.5515,
"num_input_tokens_seen": 667601792,
"step": 376,
"train_runtime": 55825.7562,
"train_tokens_per_second": 11958.67
},
{
"epoch": 0.1936552427060772,
"grad_norm": 0.9067845518246624,
"learning_rate": 2.6733078400532475e-07,
"loss": 0.5621,
"num_input_tokens_seen": 674622976,
"step": 380,
"train_runtime": 56384.2096,
"train_tokens_per_second": 11964.75
},
{
"epoch": 0.19569371894508855,
"grad_norm": 0.9543732534991831,
"learning_rate": 2.464208519385228e-07,
"loss": 0.5708,
"num_input_tokens_seen": 681708672,
"step": 384,
"train_runtime": 56981.2121,
"train_tokens_per_second": 11963.745
},
{
"epoch": 0.19773219518409987,
"grad_norm": 0.8996552227811947,
"learning_rate": 2.2628002212674264e-07,
"loss": 0.5671,
"num_input_tokens_seen": 689109376,
"step": 388,
"train_runtime": 57593.2165,
"train_tokens_per_second": 11965.114
},
{
"epoch": 0.19977067142311122,
"grad_norm": 0.955255917559042,
"learning_rate": 2.0692233104181644e-07,
"loss": 0.5449,
"num_input_tokens_seen": 696091456,
"step": 392,
"train_runtime": 58166.8582,
"train_tokens_per_second": 11967.149
},
{
"epoch": 0.20180914766212257,
"grad_norm": 0.9040308153396052,
"learning_rate": 1.8836126937346177e-07,
"loss": 0.5618,
"num_input_tokens_seen": 703111168,
"step": 396,
"train_runtime": 58738.4377,
"train_tokens_per_second": 11970.205
},
{
"epoch": 0.2038476239011339,
"grad_norm": 0.97944459658204,
"learning_rate": 1.706097726274012e-07,
"loss": 0.5622,
"num_input_tokens_seen": 710207168,
"step": 400,
"train_runtime": 59356.9508,
"train_tokens_per_second": 11965.021
},
{
"epoch": 0.20588610014014525,
"grad_norm": 0.9381590971188796,
"learning_rate": 1.5368021211039678e-07,
"loss": 0.5411,
"num_input_tokens_seen": 717098944,
"step": 404,
"train_runtime": 59911.4198,
"train_tokens_per_second": 11969.32
},
{
"epoch": 0.20792457637915657,
"grad_norm": 0.8878878520807448,
"learning_rate": 1.3758438630848725e-07,
"loss": 0.5585,
"num_input_tokens_seen": 723993152,
"step": 408,
"train_runtime": 60475.8972,
"train_tokens_per_second": 11971.598
},
{
"epoch": 0.20996305261816792,
"grad_norm": 0.8806187323037155,
"learning_rate": 1.2233351266442794e-07,
"loss": 0.5541,
"num_input_tokens_seen": 731057472,
"step": 412,
"train_runtime": 61037.4217,
"train_tokens_per_second": 11977.201
},
{
"epoch": 0.21200152885717927,
"grad_norm": 0.9150900155470338,
"learning_rate": 1.0793821976007693e-07,
"loss": 0.566,
"num_input_tokens_seen": 737872960,
"step": 416,
"train_runtime": 61615.4698,
"train_tokens_per_second": 11975.45
},
{
"epoch": 0.2140400050961906,
"grad_norm": 0.9022599301148059,
"learning_rate": 9.440853990915897e-08,
"loss": 0.5454,
"num_input_tokens_seen": 744627648,
"step": 420,
"train_runtime": 62161.1667,
"train_tokens_per_second": 11978.984
},
{
"epoch": 0.21607848133520194,
"grad_norm": 0.9035906517868586,
"learning_rate": 8.17539021655864e-08,
"loss": 0.5571,
"num_input_tokens_seen": 751584960,
"step": 424,
"train_runtime": 62697.4235,
"train_tokens_per_second": 11987.494
},
{
"epoch": 0.2181169575742133,
"grad_norm": 0.8993779939634194,
"learning_rate": 6.99831257521961e-08,
"loss": 0.5444,
"num_input_tokens_seen": 758504960,
"step": 428,
"train_runtime": 63274.1778,
"train_tokens_per_second": 11987.591
},
{
"epoch": 0.2201554338132246,
"grad_norm": 0.9210930898367543,
"learning_rate": 5.9104413914490546e-08,
"loss": 0.5625,
"num_input_tokens_seen": 765707712,
"step": 432,
"train_runtime": 63862.7303,
"train_tokens_per_second": 11989.899
},
{
"epoch": 0.22219391005223596,
"grad_norm": 0.9280791017512283,
"learning_rate": 4.912534820366224e-08,
"loss": 0.5613,
"num_input_tokens_seen": 772952256,
"step": 436,
"train_runtime": 64479.6745,
"train_tokens_per_second": 11987.533
},
{
"epoch": 0.22423238629124728,
"grad_norm": 0.907069920744252,
"learning_rate": 4.005288319288777e-08,
"loss": 0.566,
"num_input_tokens_seen": 780056832,
"step": 440,
"train_runtime": 65078.7995,
"train_tokens_per_second": 11986.343
},
{
"epoch": 0.22627086253025863,
"grad_norm": 0.9315099081560159,
"learning_rate": 3.189334163057219e-08,
"loss": 0.5666,
"num_input_tokens_seen": 786978752,
"step": 444,
"train_runtime": 65667.9097,
"train_tokens_per_second": 11984.221
},
{
"epoch": 0.22830933876926998,
"grad_norm": 0.9383224396822377,
"learning_rate": 2.4652410033923543e-08,
"loss": 0.5541,
"num_input_tokens_seen": 793887616,
"step": 448,
"train_runtime": 66257.6668,
"train_tokens_per_second": 11981.823
},
{
"epoch": 0.2303478150082813,
"grad_norm": 0.8944859848575912,
"learning_rate": 1.8335134725925177e-08,
"loss": 0.5526,
"num_input_tokens_seen": 800645760,
"step": 452,
"train_runtime": 66822.0683,
"train_tokens_per_second": 11981.757
},
{
"epoch": 0.23238629124729265,
"grad_norm": 0.9050253510819353,
"learning_rate": 1.2945918318473138e-08,
"loss": 0.5634,
"num_input_tokens_seen": 807760064,
"step": 456,
"train_runtime": 67425.8238,
"train_tokens_per_second": 11979.981
},
{
"epoch": 0.23442476748630398,
"grad_norm": 0.9352706962645487,
"learning_rate": 8.488516644122484e-09,
"loss": 0.5618,
"num_input_tokens_seen": 814702080,
"step": 460,
"train_runtime": 67994.9171,
"train_tokens_per_second": 11981.809
},
{
"epoch": 0.23646324372531533,
"grad_norm": 0.8645415996226098,
"learning_rate": 4.966036138587982e-09,
"loss": 0.5553,
"num_input_tokens_seen": 821691008,
"step": 464,
"train_runtime": 68583.3469,
"train_tokens_per_second": 11980.911
},
{
"epoch": 0.23850171996432667,
"grad_norm": 0.9706971370356248,
"learning_rate": 2.380931675817649e-09,
"loss": 0.5601,
"num_input_tokens_seen": 828902464,
"step": 468,
"train_runtime": 69210.2105,
"train_tokens_per_second": 11976.592
},
{
"epoch": 0.240540196203338,
"grad_norm": 0.9160395400629753,
"learning_rate": 7.350048571510504e-10,
"loss": 0.5567,
"num_input_tokens_seen": 835703232,
"step": 472,
"train_runtime": 69768.608,
"train_tokens_per_second": 11978.213
},
{
"epoch": 0.24257867244234935,
"grad_norm": 0.839011112693872,
"learning_rate": 2.9402755754737166e-11,
"loss": 0.5519,
"num_input_tokens_seen": 842742656,
"step": 476,
"train_runtime": 70378.9536,
"train_tokens_per_second": 11974.356
},
{
"epoch": 0.24257867244234935,
"eval_loss": 0.7951585054397583,
"eval_runtime": 212.309,
"eval_samples_per_second": 4.738,
"eval_steps_per_second": 0.075,
"num_input_tokens_seen": 842742656,
"step": 476
}
],
"logging_steps": 4,
"max_steps": 476,
"num_input_tokens_seen": 842742656,
"num_train_epochs": 1,
"save_steps": 238,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 4291443279527936.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}