{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.24257867244234935, "eval_steps": 238, "global_step": 476, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.002038476239011339, "grad_norm": 0.6477837651252902, "learning_rate": 2.699735382882792e-06, "loss": 0.5815, "num_input_tokens_seen": 6936832, "step": 4, "train_runtime": 594.7401, "train_tokens_per_second": 11663.637 }, { "epoch": 0.004076952478022678, "grad_norm": 0.6182898594326122, "learning_rate": 2.698559515983203e-06, "loss": 0.5691, "num_input_tokens_seen": 14270848, "step": 8, "train_runtime": 1203.7054, "train_tokens_per_second": 11855.764 }, { "epoch": 0.006115428717034017, "grad_norm": 0.6559666459303339, "learning_rate": 2.696443816026242e-06, "loss": 0.5597, "num_input_tokens_seen": 21129216, "step": 12, "train_runtime": 1762.6999, "train_tokens_per_second": 11986.848 }, { "epoch": 0.008153904956045356, "grad_norm": 0.6460312307750392, "learning_rate": 2.693389757477617e-06, "loss": 0.564, "num_input_tokens_seen": 28254528, "step": 16, "train_runtime": 2338.7069, "train_tokens_per_second": 12081.261 }, { "epoch": 0.010192381195056695, "grad_norm": 0.7820470969086647, "learning_rate": 2.689399468760395e-06, "loss": 0.5386, "num_input_tokens_seen": 35623168, "step": 20, "train_runtime": 2958.6163, "train_tokens_per_second": 12040.482 }, { "epoch": 0.012230857434068034, "grad_norm": 0.7662986210694266, "learning_rate": 2.6844757307716743e-06, "loss": 0.5589, "num_input_tokens_seen": 42908288, "step": 24, "train_runtime": 3591.5632, "train_tokens_per_second": 11946.967 }, { "epoch": 0.014269333673079374, "grad_norm": 0.7264783091425356, "learning_rate": 2.678621974944531e-06, "loss": 0.5349, "num_input_tokens_seen": 49922816, "step": 28, "train_runtime": 4214.8304, "train_tokens_per_second": 11844.561 }, { "epoch": 0.01630780991209071, "grad_norm": 0.7961355791031945, "learning_rate": 2.6718422808565973e-06, "loss": 0.5432, "num_input_tokens_seen": 56972736, "step": 32, "train_runtime": 4806.2524, "train_tokens_per_second": 11853.879 }, { "epoch": 0.01834628615110205, "grad_norm": 0.8138139768273569, "learning_rate": 2.6641413733869353e-06, "loss": 0.538, "num_input_tokens_seen": 64012032, "step": 36, "train_runtime": 5375.3791, "train_tokens_per_second": 11908.375 }, { "epoch": 0.02038476239011339, "grad_norm": 0.7766770677480328, "learning_rate": 2.6555246194231957e-06, "loss": 0.5368, "num_input_tokens_seen": 71145984, "step": 40, "train_runtime": 5983.1229, "train_tokens_per_second": 11891.112 }, { "epoch": 0.02242323862912473, "grad_norm": 0.786674093556373, "learning_rate": 2.6459980241213444e-06, "loss": 0.5267, "num_input_tokens_seen": 78403392, "step": 44, "train_runtime": 6587.3777, "train_tokens_per_second": 11902.064 }, { "epoch": 0.02446171486813607, "grad_norm": 0.895151868173495, "learning_rate": 2.6355682267205705e-06, "loss": 0.5334, "num_input_tokens_seen": 85316672, "step": 48, "train_runtime": 7184.1032, "train_tokens_per_second": 11875.758 }, { "epoch": 0.02650019110714741, "grad_norm": 0.7705686339787958, "learning_rate": 2.6242424959162964e-06, "loss": 0.531, "num_input_tokens_seen": 92412736, "step": 52, "train_runtime": 7797.6138, "train_tokens_per_second": 11851.412 }, { "epoch": 0.028538667346158748, "grad_norm": 0.819091795569789, "learning_rate": 2.612028724794501e-06, "loss": 0.5213, "num_input_tokens_seen": 99543616, "step": 56, "train_runtime": 8349.8154, "train_tokens_per_second": 11921.655 }, { "epoch": 0.030577143585170087, "grad_norm": 0.8450961282572036, "learning_rate": 2.598935425330904e-06, "loss": 0.5254, "num_input_tokens_seen": 106489728, "step": 60, "train_runtime": 8960.4265, "train_tokens_per_second": 11884.449 }, { "epoch": 0.03261561982418142, "grad_norm": 0.9361937413311264, "learning_rate": 2.5849717224588284e-06, "loss": 0.5376, "num_input_tokens_seen": 113524032, "step": 64, "train_runtime": 9543.0991, "train_tokens_per_second": 11895.929 }, { "epoch": 0.034654096063192766, "grad_norm": 0.8847320129841505, "learning_rate": 2.5701473477098874e-06, "loss": 0.5651, "num_input_tokens_seen": 120475456, "step": 68, "train_runtime": 10086.9254, "train_tokens_per_second": 11943.724 }, { "epoch": 0.0366925723022041, "grad_norm": 0.8777390400419234, "learning_rate": 2.5544726324319225e-06, "loss": 0.5692, "num_input_tokens_seen": 127670080, "step": 72, "train_runtime": 10665.8117, "train_tokens_per_second": 11970.029 }, { "epoch": 0.038731048541215445, "grad_norm": 0.9576210756092853, "learning_rate": 2.5379585005889178e-06, "loss": 0.5625, "num_input_tokens_seen": 134768512, "step": 76, "train_runtime": 11283.1943, "train_tokens_per_second": 11944.181 }, { "epoch": 0.04076952478022678, "grad_norm": 0.8753248708570508, "learning_rate": 2.5206164611479122e-06, "loss": 0.5805, "num_input_tokens_seen": 141573760, "step": 80, "train_runtime": 11827.0251, "train_tokens_per_second": 11970.361 }, { "epoch": 0.042808001019238116, "grad_norm": 0.8684000412026938, "learning_rate": 2.502458600058214e-06, "loss": 0.5426, "num_input_tokens_seen": 148565440, "step": 84, "train_runtime": 12416.0172, "train_tokens_per_second": 11965.628 }, { "epoch": 0.04484647725824946, "grad_norm": 0.7849759417456085, "learning_rate": 2.4834975718285047e-06, "loss": 0.5676, "num_input_tokens_seen": 155693632, "step": 88, "train_runtime": 13027.3428, "train_tokens_per_second": 11951.296 }, { "epoch": 0.046884953497260795, "grad_norm": 0.8527233643540989, "learning_rate": 2.463746590707708e-06, "loss": 0.5706, "num_input_tokens_seen": 162884544, "step": 92, "train_runtime": 13621.8968, "train_tokens_per_second": 11957.552 }, { "epoch": 0.04892342973627214, "grad_norm": 0.8245948514984758, "learning_rate": 2.4432194214757634e-06, "loss": 0.5601, "num_input_tokens_seen": 169884480, "step": 96, "train_runtime": 14201.6379, "train_tokens_per_second": 11962.316 }, { "epoch": 0.050961905975283474, "grad_norm": 0.9463896487710859, "learning_rate": 2.4219303698507273e-06, "loss": 0.578, "num_input_tokens_seen": 177144768, "step": 100, "train_runtime": 14812.6402, "train_tokens_per_second": 11959.027 }, { "epoch": 0.05300038221429482, "grad_norm": 0.8528440593847427, "learning_rate": 2.399894272518887e-06, "loss": 0.5695, "num_input_tokens_seen": 184355904, "step": 104, "train_runtime": 15392.8793, "train_tokens_per_second": 11976.7 }, { "epoch": 0.05503885845330615, "grad_norm": 0.8983751041750461, "learning_rate": 2.3771264867948297e-06, "loss": 0.5643, "num_input_tokens_seen": 191461120, "step": 108, "train_runtime": 15962.3103, "train_tokens_per_second": 11994.574 }, { "epoch": 0.057077334692317495, "grad_norm": 0.9738501561599475, "learning_rate": 2.353642879918684e-06, "loss": 0.5611, "num_input_tokens_seen": 198512640, "step": 112, "train_runtime": 16542.875, "train_tokens_per_second": 11999.888 }, { "epoch": 0.05911581093132883, "grad_norm": 0.9069372359842974, "learning_rate": 2.329459817997979e-06, "loss": 0.555, "num_input_tokens_seen": 205184640, "step": 116, "train_runtime": 17132.5621, "train_tokens_per_second": 11976.296 }, { "epoch": 0.061154287170340174, "grad_norm": 0.8535229191703974, "learning_rate": 2.304594154601839e-06, "loss": 0.5802, "num_input_tokens_seen": 212192768, "step": 120, "train_runtime": 17695.1167, "train_tokens_per_second": 11991.6 }, { "epoch": 0.06319276340935151, "grad_norm": 0.9235254276307827, "learning_rate": 2.2790632190154588e-06, "loss": 0.5602, "num_input_tokens_seen": 219343424, "step": 124, "train_runtime": 18276.554, "train_tokens_per_second": 12001.356 }, { "epoch": 0.06523123964836285, "grad_norm": 0.8469907739766386, "learning_rate": 2.2528848041630394e-06, "loss": 0.5726, "num_input_tokens_seen": 226825024, "step": 128, "train_runtime": 18867.9805, "train_tokens_per_second": 12021.691 }, { "epoch": 0.06726971588737418, "grad_norm": 0.863302212361952, "learning_rate": 2.226077154207613e-06, "loss": 0.5458, "num_input_tokens_seen": 233873600, "step": 132, "train_runtime": 19428.0179, "train_tokens_per_second": 12037.955 }, { "epoch": 0.06930819212638553, "grad_norm": 0.890826252542842, "learning_rate": 2.1986589518363884e-06, "loss": 0.5683, "num_input_tokens_seen": 241046016, "step": 136, "train_runtime": 20027.9177, "train_tokens_per_second": 12035.501 }, { "epoch": 0.07134666836539687, "grad_norm": 0.8897818157637029, "learning_rate": 2.17064930524048e-06, "loss": 0.5753, "num_input_tokens_seen": 248445952, "step": 140, "train_runtime": 20655.1895, "train_tokens_per_second": 12028.258 }, { "epoch": 0.0733851446044082, "grad_norm": 0.7970770860102688, "learning_rate": 2.1420677347981022e-06, "loss": 0.5492, "num_input_tokens_seen": 255691072, "step": 144, "train_runtime": 21246.8799, "train_tokens_per_second": 12034.288 }, { "epoch": 0.07542362084341954, "grad_norm": 1.114229924043429, "learning_rate": 2.112934159470499e-06, "loss": 0.5696, "num_input_tokens_seen": 263081280, "step": 148, "train_runtime": 21841.1341, "train_tokens_per_second": 12045.221 }, { "epoch": 0.07746209708243089, "grad_norm": 0.8591461034263972, "learning_rate": 2.083268882920095e-06, "loss": 0.5765, "num_input_tokens_seen": 270227584, "step": 152, "train_runtime": 22468.8462, "train_tokens_per_second": 12026.767 }, { "epoch": 0.07950057332144222, "grad_norm": 0.8676767698372961, "learning_rate": 2.053092579360543e-06, "loss": 0.5706, "num_input_tokens_seen": 277353664, "step": 156, "train_runtime": 23053.2043, "train_tokens_per_second": 12031.024 }, { "epoch": 0.08153904956045356, "grad_norm": 0.9224293367262005, "learning_rate": 2.0224262791485315e-06, "loss": 0.5608, "num_input_tokens_seen": 284646784, "step": 160, "train_runtime": 23643.9972, "train_tokens_per_second": 12038.86 }, { "epoch": 0.0835775257994649, "grad_norm": 0.887751023260673, "learning_rate": 1.991291354127381e-06, "loss": 0.5636, "num_input_tokens_seen": 291840192, "step": 164, "train_runtime": 24251.2033, "train_tokens_per_second": 12034.05 }, { "epoch": 0.08561600203847623, "grad_norm": 0.8971166373496055, "learning_rate": 1.959709502732666e-06, "loss": 0.5624, "num_input_tokens_seen": 298899456, "step": 168, "train_runtime": 24847.2173, "train_tokens_per_second": 12029.494 }, { "epoch": 0.08765447827748758, "grad_norm": 0.884570439459004, "learning_rate": 1.927702734870216e-06, "loss": 0.5802, "num_input_tokens_seen": 305987520, "step": 172, "train_runtime": 25438.1181, "train_tokens_per_second": 12028.701 }, { "epoch": 0.08969295451649892, "grad_norm": 0.7905104567457971, "learning_rate": 1.895293356577058e-06, "loss": 0.5557, "num_input_tokens_seen": 312989312, "step": 176, "train_runtime": 26002.8183, "train_tokens_per_second": 12036.746 }, { "epoch": 0.09173143075551025, "grad_norm": 0.9424718927507695, "learning_rate": 1.8625039544759767e-06, "loss": 0.5663, "num_input_tokens_seen": 320097152, "step": 180, "train_runtime": 26586.1368, "train_tokens_per_second": 12040.002 }, { "epoch": 0.09376990699452159, "grad_norm": 0.8615504446489718, "learning_rate": 1.8293573800345261e-06, "loss": 0.5729, "num_input_tokens_seen": 327245696, "step": 184, "train_runtime": 27215.267, "train_tokens_per_second": 12024.343 }, { "epoch": 0.09580838323353294, "grad_norm": 0.9699743298030914, "learning_rate": 1.7958767336394758e-06, "loss": 0.5737, "num_input_tokens_seen": 334634880, "step": 188, "train_runtime": 27830.7489, "train_tokens_per_second": 12023.927 }, { "epoch": 0.09784685947254428, "grad_norm": 0.8699479610963319, "learning_rate": 1.7620853484977693e-06, "loss": 0.5578, "num_input_tokens_seen": 341804032, "step": 192, "train_runtime": 28418.8119, "train_tokens_per_second": 12027.386 }, { "epoch": 0.09988533571155561, "grad_norm": 0.9257049516328657, "learning_rate": 1.7280067743752384e-06, "loss": 0.5521, "num_input_tokens_seen": 348671424, "step": 196, "train_runtime": 29007.5671, "train_tokens_per_second": 12020.016 }, { "epoch": 0.10192381195056695, "grad_norm": 0.9131171296688921, "learning_rate": 1.6936647611843846e-06, "loss": 0.5682, "num_input_tokens_seen": 355675520, "step": 200, "train_runtime": 29580.1297, "train_tokens_per_second": 12024.137 }, { "epoch": 0.10396228818957828, "grad_norm": 0.8769808506815606, "learning_rate": 1.659083242432681e-06, "loss": 0.5584, "num_input_tokens_seen": 362651648, "step": 204, "train_runtime": 30152.9255, "train_tokens_per_second": 12027.08 }, { "epoch": 0.10600076442858963, "grad_norm": 0.9497619416730099, "learning_rate": 1.6242863185429212e-06, "loss": 0.5879, "num_input_tokens_seen": 369726720, "step": 208, "train_runtime": 30751.1104, "train_tokens_per_second": 12023.199 }, { "epoch": 0.10803924066760097, "grad_norm": 0.9230137333974207, "learning_rate": 1.5892982400572422e-06, "loss": 0.5681, "num_input_tokens_seen": 376862016, "step": 212, "train_runtime": 31362.1922, "train_tokens_per_second": 12016.444 }, { "epoch": 0.1100777169066123, "grad_norm": 0.8995334199668418, "learning_rate": 1.5541433907365264e-06, "loss": 0.5455, "num_input_tokens_seen": 383921152, "step": 216, "train_runtime": 31914.7502, "train_tokens_per_second": 12029.583 }, { "epoch": 0.11211619314562364, "grad_norm": 0.9043735392416916, "learning_rate": 1.5188462705669648e-06, "loss": 0.5641, "num_input_tokens_seen": 390988416, "step": 220, "train_runtime": 32509.882, "train_tokens_per_second": 12026.756 }, { "epoch": 0.11415466938463499, "grad_norm": 0.9729636398442392, "learning_rate": 1.4834314786856161e-06, "loss": 0.5607, "num_input_tokens_seen": 398152576, "step": 224, "train_runtime": 33085.7673, "train_tokens_per_second": 12033.953 }, { "epoch": 0.11619314562364633, "grad_norm": 0.9539597912248131, "learning_rate": 1.4479236962368684e-06, "loss": 0.5589, "num_input_tokens_seen": 405173888, "step": 228, "train_runtime": 33682.3182, "train_tokens_per_second": 12029.276 }, { "epoch": 0.11823162186265766, "grad_norm": 0.8598081231037438, "learning_rate": 1.4123476691717487e-06, "loss": 0.5518, "num_input_tokens_seen": 412070528, "step": 232, "train_runtime": 34252.7917, "train_tokens_per_second": 12030.276 }, { "epoch": 0.120270098101669, "grad_norm": 0.8860637699401077, "learning_rate": 1.376728191002066e-06, "loss": 0.558, "num_input_tokens_seen": 419423744, "step": 236, "train_runtime": 34844.2912, "train_tokens_per_second": 12037.086 }, { "epoch": 0.12128933622117467, "eval_loss": 0.7941220998764038, "eval_runtime": 213.2901, "eval_samples_per_second": 4.717, "eval_steps_per_second": 0.075, "num_input_tokens_seen": 422911872, "step": 238 }, { "epoch": 0.12230857434068035, "grad_norm": 0.9547518291189157, "learning_rate": 1.3410900855214124e-06, "loss": 0.5732, "num_input_tokens_seen": 426493248, "step": 240, "train_runtime": 35670.5805, "train_tokens_per_second": 11956.443 }, { "epoch": 0.12434705057969168, "grad_norm": 0.8725219735566059, "learning_rate": 1.305458189505055e-06, "loss": 0.5605, "num_input_tokens_seen": 433674688, "step": 244, "train_runtime": 36270.2542, "train_tokens_per_second": 11956.759 }, { "epoch": 0.12638552681870302, "grad_norm": 0.9381950485311329, "learning_rate": 1.269857335400783e-06, "loss": 0.5758, "num_input_tokens_seen": 441063552, "step": 248, "train_runtime": 36895.0601, "train_tokens_per_second": 11954.542 }, { "epoch": 0.12842400305771437, "grad_norm": 0.8095871776257958, "learning_rate": 1.2343123340227753e-06, "loss": 0.5682, "num_input_tokens_seen": 448549632, "step": 252, "train_runtime": 37565.6709, "train_tokens_per_second": 11940.413 }, { "epoch": 0.1304624792967257, "grad_norm": 0.8656130470620739, "learning_rate": 1.1988479572605345e-06, "loss": 0.5545, "num_input_tokens_seen": 455502912, "step": 256, "train_runtime": 38151.4724, "train_tokens_per_second": 11939.327 }, { "epoch": 0.13250095553573704, "grad_norm": 0.9346351864084386, "learning_rate": 1.1634889208149594e-06, "loss": 0.5664, "num_input_tokens_seen": 462435200, "step": 260, "train_runtime": 38714.2142, "train_tokens_per_second": 11944.843 }, { "epoch": 0.13453943177474836, "grad_norm": 0.899249765870996, "learning_rate": 1.1282598669735671e-06, "loss": 0.558, "num_input_tokens_seen": 469336640, "step": 264, "train_runtime": 39254.3889, "train_tokens_per_second": 11956.284 }, { "epoch": 0.1365779080137597, "grad_norm": 0.8458171655451426, "learning_rate": 1.093185347436887e-06, "loss": 0.5538, "num_input_tokens_seen": 476310656, "step": 268, "train_runtime": 39827.3684, "train_tokens_per_second": 11959.381 }, { "epoch": 0.13861638425277106, "grad_norm": 0.9179027627040293, "learning_rate": 1.058289806207975e-06, "loss": 0.5698, "num_input_tokens_seen": 483771840, "step": 272, "train_runtime": 40459.3108, "train_tokens_per_second": 11956.997 }, { "epoch": 0.14065486049178239, "grad_norm": 0.9442703488926768, "learning_rate": 1.0235975625569967e-06, "loss": 0.5649, "num_input_tokens_seen": 490750336, "step": 276, "train_runtime": 41048.4541, "train_tokens_per_second": 11955.391 }, { "epoch": 0.14269333673079374, "grad_norm": 0.8974082301624304, "learning_rate": 9.891327940727266e-07, "loss": 0.5657, "num_input_tokens_seen": 497702912, "step": 280, "train_runtime": 41610.7519, "train_tokens_per_second": 11960.921 }, { "epoch": 0.14473181296980506, "grad_norm": 0.9898213671868435, "learning_rate": 9.549195198127994e-07, "loss": 0.58, "num_input_tokens_seen": 505099584, "step": 284, "train_runtime": 42219.4154, "train_tokens_per_second": 11963.68 }, { "epoch": 0.1467702892088164, "grad_norm": 0.9363039019647847, "learning_rate": 9.209815835644328e-07, "loss": 0.5512, "num_input_tokens_seen": 512152896, "step": 288, "train_runtime": 42766.508, "train_tokens_per_second": 11975.56 }, { "epoch": 0.14880876544782776, "grad_norm": 0.9589553041250718, "learning_rate": 8.873426372273072e-07, "loss": 0.5747, "num_input_tokens_seen": 519187584, "step": 292, "train_runtime": 43380.0418, "train_tokens_per_second": 11968.351 }, { "epoch": 0.15084724168683908, "grad_norm": 0.9433972448251747, "learning_rate": 8.540261243301721e-07, "loss": 0.5509, "num_input_tokens_seen": 526140736, "step": 296, "train_runtime": 43995.5053, "train_tokens_per_second": 11958.966 }, { "epoch": 0.15288571792585043, "grad_norm": 0.9393953597111547, "learning_rate": 8.210552636926686e-07, "loss": 0.5629, "num_input_tokens_seen": 532992384, "step": 300, "train_runtime": 44595.3555, "train_tokens_per_second": 11951.746 }, { "epoch": 0.15492419416486178, "grad_norm": 0.9091057919029727, "learning_rate": 7.884530332437565e-07, "loss": 0.5663, "num_input_tokens_seen": 540379392, "step": 304, "train_runtime": 45193.3818, "train_tokens_per_second": 11957.047 }, { "epoch": 0.1569626704038731, "grad_norm": 0.8832084768576383, "learning_rate": 7.562421540080231e-07, "loss": 0.5613, "num_input_tokens_seen": 547461440, "step": 308, "train_runtime": 45756.6672, "train_tokens_per_second": 11964.627 }, { "epoch": 0.15900114664288445, "grad_norm": 0.8587618292954599, "learning_rate": 7.244450742710321e-07, "loss": 0.561, "num_input_tokens_seen": 554617984, "step": 312, "train_runtime": 46381.4062, "train_tokens_per_second": 11957.766 }, { "epoch": 0.16103962288189577, "grad_norm": 0.9446856546867352, "learning_rate": 6.930839539347442e-07, "loss": 0.5532, "num_input_tokens_seen": 561693888, "step": 316, "train_runtime": 46994.8669, "train_tokens_per_second": 11952.239 }, { "epoch": 0.16307809912090712, "grad_norm": 0.9429683753374553, "learning_rate": 6.621806490739267e-07, "loss": 0.5595, "num_input_tokens_seen": 568833664, "step": 320, "train_runtime": 47586.3302, "train_tokens_per_second": 11953.72 }, { "epoch": 0.16511657535991847, "grad_norm": 0.9035859891449433, "learning_rate": 6.317566967042958e-07, "loss": 0.5504, "num_input_tokens_seen": 576161408, "step": 324, "train_runtime": 48187.0813, "train_tokens_per_second": 11956.761 }, { "epoch": 0.1671550515989298, "grad_norm": 0.9033151093018099, "learning_rate": 6.018332997730213e-07, "loss": 0.5605, "num_input_tokens_seen": 583395328, "step": 328, "train_runtime": 48788.5572, "train_tokens_per_second": 11957.626 }, { "epoch": 0.16919352783794114, "grad_norm": 0.9609542284697502, "learning_rate": 5.724313123820482e-07, "loss": 0.5557, "num_input_tokens_seen": 590636544, "step": 332, "train_runtime": 49400.1577, "train_tokens_per_second": 11956.167 }, { "epoch": 0.17123200407695247, "grad_norm": 0.8861118611359287, "learning_rate": 5.435712252545331e-07, "loss": 0.5608, "num_input_tokens_seen": 597723776, "step": 336, "train_runtime": 50003.6729, "train_tokens_per_second": 11953.597 }, { "epoch": 0.17327048031596382, "grad_norm": 0.9761110376120566, "learning_rate": 5.152731514545266e-07, "loss": 0.5478, "num_input_tokens_seen": 604770752, "step": 340, "train_runtime": 50553.9712, "train_tokens_per_second": 11962.873 }, { "epoch": 0.17530895655497516, "grad_norm": 0.9053070583607986, "learning_rate": 4.875568123698525e-07, "loss": 0.55, "num_input_tokens_seen": 612028800, "step": 344, "train_runtime": 51128.5271, "train_tokens_per_second": 11970.398 }, { "epoch": 0.1773474327939865, "grad_norm": 0.901496752129611, "learning_rate": 4.604415239679492e-07, "loss": 0.5682, "num_input_tokens_seen": 619137792, "step": 348, "train_runtime": 51752.3485, "train_tokens_per_second": 11963.472 }, { "epoch": 0.17938590903299784, "grad_norm": 0.9438945381441597, "learning_rate": 4.3394618333426135e-07, "loss": 0.5652, "num_input_tokens_seen": 625854848, "step": 352, "train_runtime": 52310.1207, "train_tokens_per_second": 11964.317 }, { "epoch": 0.1814243852720092, "grad_norm": 0.9543610627021044, "learning_rate": 4.080892555025522e-07, "loss": 0.5581, "num_input_tokens_seen": 632626752, "step": 356, "train_runtime": 52857.0375, "train_tokens_per_second": 11968.638 }, { "epoch": 0.1834628615110205, "grad_norm": 0.9118473255578559, "learning_rate": 3.8288876058632056e-07, "loss": 0.56, "num_input_tokens_seen": 639638080, "step": 360, "train_runtime": 53443.4994, "train_tokens_per_second": 11968.492 }, { "epoch": 0.18550133775003186, "grad_norm": 0.9212497452458377, "learning_rate": 3.5836226122029165e-07, "loss": 0.557, "num_input_tokens_seen": 646705152, "step": 364, "train_runtime": 54052.7201, "train_tokens_per_second": 11964.341 }, { "epoch": 0.18753981398904318, "grad_norm": 0.9056019341991234, "learning_rate": 3.34526850320731e-07, "loss": 0.5602, "num_input_tokens_seen": 653674880, "step": 368, "train_runtime": 54641.6155, "train_tokens_per_second": 11962.949 }, { "epoch": 0.18957829022805453, "grad_norm": 1.0036743775927282, "learning_rate": 3.1139913917311347e-07, "loss": 0.5787, "num_input_tokens_seen": 660674240, "step": 372, "train_runtime": 55222.9846, "train_tokens_per_second": 11963.755 }, { "epoch": 0.19161676646706588, "grad_norm": 0.9225731699510716, "learning_rate": 2.889952458554475e-07, "loss": 0.5515, "num_input_tokens_seen": 667601792, "step": 376, "train_runtime": 55825.7562, "train_tokens_per_second": 11958.67 }, { "epoch": 0.1936552427060772, "grad_norm": 0.9067845518246624, "learning_rate": 2.6733078400532475e-07, "loss": 0.5621, "num_input_tokens_seen": 674622976, "step": 380, "train_runtime": 56384.2096, "train_tokens_per_second": 11964.75 }, { "epoch": 0.19569371894508855, "grad_norm": 0.9543732534991831, "learning_rate": 2.464208519385228e-07, "loss": 0.5708, "num_input_tokens_seen": 681708672, "step": 384, "train_runtime": 56981.2121, "train_tokens_per_second": 11963.745 }, { "epoch": 0.19773219518409987, "grad_norm": 0.8996552227811947, "learning_rate": 2.2628002212674264e-07, "loss": 0.5671, "num_input_tokens_seen": 689109376, "step": 388, "train_runtime": 57593.2165, "train_tokens_per_second": 11965.114 }, { "epoch": 0.19977067142311122, "grad_norm": 0.955255917559042, "learning_rate": 2.0692233104181644e-07, "loss": 0.5449, "num_input_tokens_seen": 696091456, "step": 392, "train_runtime": 58166.8582, "train_tokens_per_second": 11967.149 }, { "epoch": 0.20180914766212257, "grad_norm": 0.9040308153396052, "learning_rate": 1.8836126937346177e-07, "loss": 0.5618, "num_input_tokens_seen": 703111168, "step": 396, "train_runtime": 58738.4377, "train_tokens_per_second": 11970.205 }, { "epoch": 0.2038476239011339, "grad_norm": 0.97944459658204, "learning_rate": 1.706097726274012e-07, "loss": 0.5622, "num_input_tokens_seen": 710207168, "step": 400, "train_runtime": 59356.9508, "train_tokens_per_second": 11965.021 }, { "epoch": 0.20588610014014525, "grad_norm": 0.9381590971188796, "learning_rate": 1.5368021211039678e-07, "loss": 0.5411, "num_input_tokens_seen": 717098944, "step": 404, "train_runtime": 59911.4198, "train_tokens_per_second": 11969.32 }, { "epoch": 0.20792457637915657, "grad_norm": 0.8878878520807448, "learning_rate": 1.3758438630848725e-07, "loss": 0.5585, "num_input_tokens_seen": 723993152, "step": 408, "train_runtime": 60475.8972, "train_tokens_per_second": 11971.598 }, { "epoch": 0.20996305261816792, "grad_norm": 0.8806187323037155, "learning_rate": 1.2233351266442794e-07, "loss": 0.5541, "num_input_tokens_seen": 731057472, "step": 412, "train_runtime": 61037.4217, "train_tokens_per_second": 11977.201 }, { "epoch": 0.21200152885717927, "grad_norm": 0.9150900155470338, "learning_rate": 1.0793821976007693e-07, "loss": 0.566, "num_input_tokens_seen": 737872960, "step": 416, "train_runtime": 61615.4698, "train_tokens_per_second": 11975.45 }, { "epoch": 0.2140400050961906, "grad_norm": 0.9022599301148059, "learning_rate": 9.440853990915897e-08, "loss": 0.5454, "num_input_tokens_seen": 744627648, "step": 420, "train_runtime": 62161.1667, "train_tokens_per_second": 11978.984 }, { "epoch": 0.21607848133520194, "grad_norm": 0.9035906517868586, "learning_rate": 8.17539021655864e-08, "loss": 0.5571, "num_input_tokens_seen": 751584960, "step": 424, "train_runtime": 62697.4235, "train_tokens_per_second": 11987.494 }, { "epoch": 0.2181169575742133, "grad_norm": 0.8993779939634194, "learning_rate": 6.99831257521961e-08, "loss": 0.5444, "num_input_tokens_seen": 758504960, "step": 428, "train_runtime": 63274.1778, "train_tokens_per_second": 11987.591 }, { "epoch": 0.2201554338132246, "grad_norm": 0.9210930898367543, "learning_rate": 5.9104413914490546e-08, "loss": 0.5625, "num_input_tokens_seen": 765707712, "step": 432, "train_runtime": 63862.7303, "train_tokens_per_second": 11989.899 }, { "epoch": 0.22219391005223596, "grad_norm": 0.9280791017512283, "learning_rate": 4.912534820366224e-08, "loss": 0.5613, "num_input_tokens_seen": 772952256, "step": 436, "train_runtime": 64479.6745, "train_tokens_per_second": 11987.533 }, { "epoch": 0.22423238629124728, "grad_norm": 0.907069920744252, "learning_rate": 4.005288319288777e-08, "loss": 0.566, "num_input_tokens_seen": 780056832, "step": 440, "train_runtime": 65078.7995, "train_tokens_per_second": 11986.343 }, { "epoch": 0.22627086253025863, "grad_norm": 0.9315099081560159, "learning_rate": 3.189334163057219e-08, "loss": 0.5666, "num_input_tokens_seen": 786978752, "step": 444, "train_runtime": 65667.9097, "train_tokens_per_second": 11984.221 }, { "epoch": 0.22830933876926998, "grad_norm": 0.9383224396822377, "learning_rate": 2.4652410033923543e-08, "loss": 0.5541, "num_input_tokens_seen": 793887616, "step": 448, "train_runtime": 66257.6668, "train_tokens_per_second": 11981.823 }, { "epoch": 0.2303478150082813, "grad_norm": 0.8944859848575912, "learning_rate": 1.8335134725925177e-08, "loss": 0.5526, "num_input_tokens_seen": 800645760, "step": 452, "train_runtime": 66822.0683, "train_tokens_per_second": 11981.757 }, { "epoch": 0.23238629124729265, "grad_norm": 0.9050253510819353, "learning_rate": 1.2945918318473138e-08, "loss": 0.5634, "num_input_tokens_seen": 807760064, "step": 456, "train_runtime": 67425.8238, "train_tokens_per_second": 11979.981 }, { "epoch": 0.23442476748630398, "grad_norm": 0.9352706962645487, "learning_rate": 8.488516644122484e-09, "loss": 0.5618, "num_input_tokens_seen": 814702080, "step": 460, "train_runtime": 67994.9171, "train_tokens_per_second": 11981.809 }, { "epoch": 0.23646324372531533, "grad_norm": 0.8645415996226098, "learning_rate": 4.966036138587982e-09, "loss": 0.5553, "num_input_tokens_seen": 821691008, "step": 464, "train_runtime": 68583.3469, "train_tokens_per_second": 11980.911 }, { "epoch": 0.23850171996432667, "grad_norm": 0.9706971370356248, "learning_rate": 2.380931675817649e-09, "loss": 0.5601, "num_input_tokens_seen": 828902464, "step": 468, "train_runtime": 69210.2105, "train_tokens_per_second": 11976.592 }, { "epoch": 0.240540196203338, "grad_norm": 0.9160395400629753, "learning_rate": 7.350048571510504e-10, "loss": 0.5567, "num_input_tokens_seen": 835703232, "step": 472, "train_runtime": 69768.608, "train_tokens_per_second": 11978.213 }, { "epoch": 0.24257867244234935, "grad_norm": 0.839011112693872, "learning_rate": 2.9402755754737166e-11, "loss": 0.5519, "num_input_tokens_seen": 842742656, "step": 476, "train_runtime": 70378.9536, "train_tokens_per_second": 11974.356 }, { "epoch": 0.24257867244234935, "eval_loss": 0.7951585054397583, "eval_runtime": 212.309, "eval_samples_per_second": 4.738, "eval_steps_per_second": 0.075, "num_input_tokens_seen": 842742656, "step": 476 } ], "logging_steps": 4, "max_steps": 476, "num_input_tokens_seen": 842742656, "num_train_epochs": 1, "save_steps": 238, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4291443279527936.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }