{ "best_metric": 2.6091578006744385, "best_model_checkpoint": "learning_source_20260316/protein_sequence/bert-output/protein_sequence-small/checkpoint-44000", "epoch": 3505.9075391180654, "eval_steps": 100, "global_step": 60000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 5.689900426742532, "grad_norm": 0.5827894806861877, "learning_rate": 3e-06, "loss": 3.1299, "step": 100 }, { "epoch": 5.689900426742532, "eval_loss": 2.7554705142974854, "eval_runtime": 13.2804, "eval_samples_per_second": 104.967, "eval_steps_per_second": 104.967, "step": 100 }, { "epoch": 11.379800853485063, "grad_norm": 0.47490769624710083, "learning_rate": 6e-06, "loss": 2.7301, "step": 200 }, { "epoch": 11.379800853485063, "eval_loss": 2.695699453353882, "eval_runtime": 13.2496, "eval_samples_per_second": 105.211, "eval_steps_per_second": 105.211, "step": 200 }, { "epoch": 17.069701280227594, "grad_norm": 0.27892372012138367, "learning_rate": 5.998999666555519e-06, "loss": 2.6967, "step": 300 }, { "epoch": 17.069701280227594, "eval_loss": 2.6834542751312256, "eval_runtime": 13.2537, "eval_samples_per_second": 105.178, "eval_steps_per_second": 105.178, "step": 300 }, { "epoch": 22.759601706970127, "grad_norm": 0.26072826981544495, "learning_rate": 5.997999333111037e-06, "loss": 2.6876, "step": 400 }, { "epoch": 22.759601706970127, "eval_loss": 2.6789743900299072, "eval_runtime": 13.6654, "eval_samples_per_second": 102.01, "eval_steps_per_second": 102.01, "step": 400 }, { "epoch": 28.44950213371266, "grad_norm": 0.29876092076301575, "learning_rate": 5.9969989996665554e-06, "loss": 2.6822, "step": 500 }, { "epoch": 28.44950213371266, "eval_loss": 2.6772079467773438, "eval_runtime": 13.27, "eval_samples_per_second": 105.049, "eval_steps_per_second": 105.049, "step": 500 }, { "epoch": 34.13940256045519, "grad_norm": 0.3491186499595642, "learning_rate": 5.995998666222074e-06, "loss": 2.6793, "step": 600 }, { "epoch": 34.13940256045519, "eval_loss": 2.6701366901397705, "eval_runtime": 13.2631, "eval_samples_per_second": 105.104, "eval_steps_per_second": 105.104, "step": 600 }, { "epoch": 39.82930298719772, "grad_norm": 0.38635119795799255, "learning_rate": 5.994998332777593e-06, "loss": 2.6775, "step": 700 }, { "epoch": 39.82930298719772, "eval_loss": 2.675004243850708, "eval_runtime": 13.7168, "eval_samples_per_second": 101.627, "eval_steps_per_second": 101.627, "step": 700 }, { "epoch": 45.519203413940254, "grad_norm": 0.34463900327682495, "learning_rate": 5.9939979993331115e-06, "loss": 2.6754, "step": 800 }, { "epoch": 45.519203413940254, "eval_loss": 2.673832416534424, "eval_runtime": 13.2859, "eval_samples_per_second": 104.923, "eval_steps_per_second": 104.923, "step": 800 }, { "epoch": 51.209103840682786, "grad_norm": 0.35753050446510315, "learning_rate": 5.992997665888629e-06, "loss": 2.6743, "step": 900 }, { "epoch": 51.209103840682786, "eval_loss": 2.670893907546997, "eval_runtime": 13.2644, "eval_samples_per_second": 105.093, "eval_steps_per_second": 105.093, "step": 900 }, { "epoch": 56.89900426742532, "grad_norm": 0.30704179406166077, "learning_rate": 5.991997332444148e-06, "loss": 2.6733, "step": 1000 }, { "epoch": 56.89900426742532, "eval_loss": 2.6703639030456543, "eval_runtime": 13.274, "eval_samples_per_second": 105.017, "eval_steps_per_second": 105.017, "step": 1000 }, { "epoch": 62.58890469416785, "grad_norm": 0.20322857797145844, "learning_rate": 5.990996998999667e-06, "loss": 2.6718, "step": 1100 }, { "epoch": 62.58890469416785, "eval_loss": 2.6725869178771973, "eval_runtime": 13.6525, "eval_samples_per_second": 102.106, "eval_steps_per_second": 102.106, "step": 1100 }, { "epoch": 68.27880512091038, "grad_norm": 0.29705750942230225, "learning_rate": 5.989996665555185e-06, "loss": 2.6712, "step": 1200 }, { "epoch": 68.27880512091038, "eval_loss": 2.6705477237701416, "eval_runtime": 13.7099, "eval_samples_per_second": 101.678, "eval_steps_per_second": 101.678, "step": 1200 }, { "epoch": 73.96870554765292, "grad_norm": 0.2920830249786377, "learning_rate": 5.988996332110703e-06, "loss": 2.671, "step": 1300 }, { "epoch": 73.96870554765292, "eval_loss": 2.6691176891326904, "eval_runtime": 13.276, "eval_samples_per_second": 105.002, "eval_steps_per_second": 105.002, "step": 1300 }, { "epoch": 79.65860597439544, "grad_norm": 0.38358381390571594, "learning_rate": 5.987995998666222e-06, "loss": 2.6703, "step": 1400 }, { "epoch": 79.65860597439544, "eval_loss": 2.6659200191497803, "eval_runtime": 13.2877, "eval_samples_per_second": 104.909, "eval_steps_per_second": 104.909, "step": 1400 }, { "epoch": 85.34850640113798, "grad_norm": 0.23219753801822662, "learning_rate": 5.986995665221741e-06, "loss": 2.6704, "step": 1500 }, { "epoch": 85.34850640113798, "eval_loss": 2.6674177646636963, "eval_runtime": 13.666, "eval_samples_per_second": 102.005, "eval_steps_per_second": 102.005, "step": 1500 }, { "epoch": 91.03840682788051, "grad_norm": 0.23956173658370972, "learning_rate": 5.9859953317772595e-06, "loss": 2.6704, "step": 1600 }, { "epoch": 91.03840682788051, "eval_loss": 2.667738199234009, "eval_runtime": 13.2689, "eval_samples_per_second": 105.058, "eval_steps_per_second": 105.058, "step": 1600 }, { "epoch": 96.72830725462305, "grad_norm": 0.22576624155044556, "learning_rate": 5.984994998332777e-06, "loss": 2.6696, "step": 1700 }, { "epoch": 96.72830725462305, "eval_loss": 2.666335344314575, "eval_runtime": 13.2674, "eval_samples_per_second": 105.069, "eval_steps_per_second": 105.069, "step": 1700 }, { "epoch": 102.41820768136557, "grad_norm": 0.2869977653026581, "learning_rate": 5.983994664888296e-06, "loss": 2.6692, "step": 1800 }, { "epoch": 102.41820768136557, "eval_loss": 2.667121171951294, "eval_runtime": 13.2883, "eval_samples_per_second": 104.905, "eval_steps_per_second": 104.905, "step": 1800 }, { "epoch": 108.10810810810811, "grad_norm": 0.24629302322864532, "learning_rate": 5.982994331443815e-06, "loss": 2.6685, "step": 1900 }, { "epoch": 108.10810810810811, "eval_loss": 2.6667563915252686, "eval_runtime": 13.6825, "eval_samples_per_second": 101.882, "eval_steps_per_second": 101.882, "step": 1900 }, { "epoch": 113.79800853485064, "grad_norm": 0.23221346735954285, "learning_rate": 5.981993997999333e-06, "loss": 2.6683, "step": 2000 }, { "epoch": 113.79800853485064, "eval_loss": 2.6664071083068848, "eval_runtime": 13.2695, "eval_samples_per_second": 105.053, "eval_steps_per_second": 105.053, "step": 2000 }, { "epoch": 119.48790896159318, "grad_norm": 0.24480201303958893, "learning_rate": 5.980993664554851e-06, "loss": 2.668, "step": 2100 }, { "epoch": 119.48790896159318, "eval_loss": 2.6675596237182617, "eval_runtime": 13.6602, "eval_samples_per_second": 102.048, "eval_steps_per_second": 102.048, "step": 2100 }, { "epoch": 125.1778093883357, "grad_norm": 0.2695687413215637, "learning_rate": 5.979993331110371e-06, "loss": 2.6683, "step": 2200 }, { "epoch": 125.1778093883357, "eval_loss": 2.6677987575531006, "eval_runtime": 13.2773, "eval_samples_per_second": 104.991, "eval_steps_per_second": 104.991, "step": 2200 }, { "epoch": 130.86770981507823, "grad_norm": 0.2357303947210312, "learning_rate": 5.978992997665889e-06, "loss": 2.6678, "step": 2300 }, { "epoch": 130.86770981507823, "eval_loss": 2.6650021076202393, "eval_runtime": 13.256, "eval_samples_per_second": 105.16, "eval_steps_per_second": 105.16, "step": 2300 }, { "epoch": 136.55761024182075, "grad_norm": 0.23957480490207672, "learning_rate": 5.9779926642214075e-06, "loss": 2.6679, "step": 2400 }, { "epoch": 136.55761024182075, "eval_loss": 2.6645851135253906, "eval_runtime": 13.7249, "eval_samples_per_second": 101.567, "eval_steps_per_second": 101.567, "step": 2400 }, { "epoch": 146.86059743954482, "grad_norm": 0.19333045184612274, "learning_rate": 5.976992330776926e-06, "loss": 2.6671, "step": 2500 }, { "epoch": 146.86059743954482, "eval_loss": 2.666929244995117, "eval_runtime": 8.8729, "eval_samples_per_second": 157.107, "eval_steps_per_second": 19.723, "step": 2500 }, { "epoch": 152.55049786628734, "grad_norm": 0.3093737065792084, "learning_rate": 5.975991997332444e-06, "loss": 2.6674, "step": 2600 }, { "epoch": 152.55049786628734, "eval_loss": 2.6641287803649902, "eval_runtime": 8.544, "eval_samples_per_second": 163.155, "eval_steps_per_second": 20.482, "step": 2600 }, { "epoch": 158.24039829302987, "grad_norm": 0.2492215484380722, "learning_rate": 5.974991663887963e-06, "loss": 2.6675, "step": 2700 }, { "epoch": 158.24039829302987, "eval_loss": 2.6672415733337402, "eval_runtime": 8.5402, "eval_samples_per_second": 163.229, "eval_steps_per_second": 20.491, "step": 2700 }, { "epoch": 163.9302987197724, "grad_norm": 0.3064326047897339, "learning_rate": 5.973991330443481e-06, "loss": 2.6674, "step": 2800 }, { "epoch": 163.9302987197724, "eval_loss": 2.667715072631836, "eval_runtime": 8.861, "eval_samples_per_second": 157.319, "eval_steps_per_second": 19.75, "step": 2800 }, { "epoch": 169.62019914651495, "grad_norm": 0.2401367574930191, "learning_rate": 5.972990996999e-06, "loss": 2.6668, "step": 2900 }, { "epoch": 169.62019914651495, "eval_loss": 2.663231611251831, "eval_runtime": 8.5263, "eval_samples_per_second": 163.495, "eval_steps_per_second": 20.525, "step": 2900 }, { "epoch": 175.31009957325747, "grad_norm": 0.26518478989601135, "learning_rate": 5.971990663554519e-06, "loss": 2.6664, "step": 3000 }, { "epoch": 175.31009957325747, "eval_loss": 2.66806960105896, "eval_runtime": 8.5256, "eval_samples_per_second": 163.508, "eval_steps_per_second": 20.526, "step": 3000 }, { "epoch": 181.0, "grad_norm": 0.21279650926589966, "learning_rate": 5.970990330110037e-06, "loss": 2.6662, "step": 3100 }, { "epoch": 181.0, "eval_loss": 2.66540789604187, "eval_runtime": 8.9087, "eval_samples_per_second": 156.477, "eval_steps_per_second": 19.644, "step": 3100 }, { "epoch": 186.68990042674253, "grad_norm": 0.20601896941661835, "learning_rate": 5.9699899966655554e-06, "loss": 2.6662, "step": 3200 }, { "epoch": 186.68990042674253, "eval_loss": 2.661759614944458, "eval_runtime": 8.5361, "eval_samples_per_second": 163.306, "eval_steps_per_second": 20.501, "step": 3200 }, { "epoch": 192.37980085348505, "grad_norm": 0.30063194036483765, "learning_rate": 5.968989663221074e-06, "loss": 2.666, "step": 3300 }, { "epoch": 192.37980085348505, "eval_loss": 2.6638128757476807, "eval_runtime": 8.5253, "eval_samples_per_second": 163.514, "eval_steps_per_second": 20.527, "step": 3300 }, { "epoch": 198.0697012802276, "grad_norm": 0.17756374180316925, "learning_rate": 5.967989329776592e-06, "loss": 2.6652, "step": 3400 }, { "epoch": 198.0697012802276, "eval_loss": 2.6624886989593506, "eval_runtime": 8.9634, "eval_samples_per_second": 155.521, "eval_steps_per_second": 19.524, "step": 3400 }, { "epoch": 203.75960170697013, "grad_norm": 0.3183553218841553, "learning_rate": 5.966988996332111e-06, "loss": 2.6656, "step": 3500 }, { "epoch": 203.75960170697013, "eval_loss": 2.666609764099121, "eval_runtime": 8.5304, "eval_samples_per_second": 163.416, "eval_steps_per_second": 20.515, "step": 3500 }, { "epoch": 209.44950213371266, "grad_norm": 0.23746278882026672, "learning_rate": 5.965988662887629e-06, "loss": 2.6656, "step": 3600 }, { "epoch": 209.44950213371266, "eval_loss": 2.664607048034668, "eval_runtime": 8.5297, "eval_samples_per_second": 163.429, "eval_steps_per_second": 20.516, "step": 3600 }, { "epoch": 215.13940256045518, "grad_norm": 0.2566852271556854, "learning_rate": 5.964988329443148e-06, "loss": 2.6652, "step": 3700 }, { "epoch": 215.13940256045518, "eval_loss": 2.663752794265747, "eval_runtime": 8.5306, "eval_samples_per_second": 163.412, "eval_steps_per_second": 20.514, "step": 3700 }, { "epoch": 220.82930298719774, "grad_norm": 0.19710654020309448, "learning_rate": 5.963987995998667e-06, "loss": 2.6657, "step": 3800 }, { "epoch": 220.82930298719774, "eval_loss": 2.66432785987854, "eval_runtime": 8.9192, "eval_samples_per_second": 156.293, "eval_steps_per_second": 19.621, "step": 3800 }, { "epoch": 226.51920341394026, "grad_norm": 0.20113052427768707, "learning_rate": 5.962987662554185e-06, "loss": 2.6655, "step": 3900 }, { "epoch": 226.51920341394026, "eval_loss": 2.662318706512451, "eval_runtime": 8.5269, "eval_samples_per_second": 163.483, "eval_steps_per_second": 20.523, "step": 3900 }, { "epoch": 232.2091038406828, "grad_norm": 0.24698683619499207, "learning_rate": 5.961987329109703e-06, "loss": 2.6652, "step": 4000 }, { "epoch": 232.2091038406828, "eval_loss": 2.6657159328460693, "eval_runtime": 8.5292, "eval_samples_per_second": 163.438, "eval_steps_per_second": 20.518, "step": 4000 }, { "epoch": 237.8990042674253, "grad_norm": 0.24947816133499146, "learning_rate": 5.960986995665222e-06, "loss": 2.6652, "step": 4100 }, { "epoch": 237.8990042674253, "eval_loss": 2.6663217544555664, "eval_runtime": 8.5354, "eval_samples_per_second": 163.32, "eval_steps_per_second": 20.503, "step": 4100 }, { "epoch": 243.58890469416787, "grad_norm": 0.2810859680175781, "learning_rate": 5.95998666222074e-06, "loss": 2.6649, "step": 4200 }, { "epoch": 243.58890469416787, "eval_loss": 2.6661739349365234, "eval_runtime": 8.8623, "eval_samples_per_second": 157.295, "eval_steps_per_second": 19.747, "step": 4200 }, { "epoch": 252.74679943100995, "grad_norm": 0.18688435852527618, "learning_rate": 5.588628762541806e-06, "loss": 2.6646, "step": 4300 }, { "epoch": 252.74679943100995, "eval_loss": 2.664121389389038, "eval_runtime": 9.162, "eval_samples_per_second": 152.149, "eval_steps_per_second": 19.101, "step": 4300 }, { "epoch": 258.4366998577525, "grad_norm": 0.19968199729919434, "learning_rate": 5.578595317725753e-06, "loss": 2.6649, "step": 4400 }, { "epoch": 258.4366998577525, "eval_loss": 2.666635274887085, "eval_runtime": 11.7784, "eval_samples_per_second": 118.352, "eval_steps_per_second": 14.858, "step": 4400 }, { "epoch": 264.126600284495, "grad_norm": 0.18012067675590515, "learning_rate": 5.568561872909699e-06, "loss": 2.6653, "step": 4500 }, { "epoch": 264.126600284495, "eval_loss": 2.662325143814087, "eval_runtime": 11.7766, "eval_samples_per_second": 118.371, "eval_steps_per_second": 14.86, "step": 4500 }, { "epoch": 269.81650071123755, "grad_norm": 0.18739238381385803, "learning_rate": 5.558528428093646e-06, "loss": 2.6652, "step": 4600 }, { "epoch": 269.81650071123755, "eval_loss": 2.664424419403076, "eval_runtime": 9.739, "eval_samples_per_second": 143.137, "eval_steps_per_second": 17.969, "step": 4600 }, { "epoch": 275.5064011379801, "grad_norm": 0.2488318383693695, "learning_rate": 5.548494983277593e-06, "loss": 2.6648, "step": 4700 }, { "epoch": 275.5064011379801, "eval_loss": 2.6640284061431885, "eval_runtime": 8.9011, "eval_samples_per_second": 156.609, "eval_steps_per_second": 19.66, "step": 4700 }, { "epoch": 281.1963015647226, "grad_norm": 0.22808881103992462, "learning_rate": 5.5384615384615385e-06, "loss": 2.6651, "step": 4800 }, { "epoch": 281.1963015647226, "eval_loss": 2.6617281436920166, "eval_runtime": 8.5632, "eval_samples_per_second": 162.79, "eval_steps_per_second": 20.436, "step": 4800 }, { "epoch": 286.88620199146516, "grad_norm": 0.1917983591556549, "learning_rate": 5.528428093645485e-06, "loss": 2.6647, "step": 4900 }, { "epoch": 286.88620199146516, "eval_loss": 2.6639668941497803, "eval_runtime": 8.5741, "eval_samples_per_second": 162.583, "eval_steps_per_second": 20.41, "step": 4900 }, { "epoch": 292.57610241820765, "grad_norm": 0.247116819024086, "learning_rate": 5.518394648829432e-06, "loss": 2.6648, "step": 5000 }, { "epoch": 292.57610241820765, "eval_loss": 2.660776376724243, "eval_runtime": 8.9014, "eval_samples_per_second": 156.605, "eval_steps_per_second": 19.66, "step": 5000 }, { "epoch": 298.2660028449502, "grad_norm": 0.18090835213661194, "learning_rate": 5.508361204013378e-06, "loss": 2.6643, "step": 5100 }, { "epoch": 298.2660028449502, "eval_loss": 2.6607048511505127, "eval_runtime": 8.5599, "eval_samples_per_second": 162.853, "eval_steps_per_second": 20.444, "step": 5100 }, { "epoch": 303.95590327169276, "grad_norm": 0.1796797215938568, "learning_rate": 5.498327759197324e-06, "loss": 2.6645, "step": 5200 }, { "epoch": 303.95590327169276, "eval_loss": 2.6626744270324707, "eval_runtime": 8.9139, "eval_samples_per_second": 156.385, "eval_steps_per_second": 19.632, "step": 5200 }, { "epoch": 309.64580369843526, "grad_norm": 0.19111952185630798, "learning_rate": 5.488294314381271e-06, "loss": 2.6647, "step": 5300 }, { "epoch": 309.64580369843526, "eval_loss": 2.6617257595062256, "eval_runtime": 8.5801, "eval_samples_per_second": 162.47, "eval_steps_per_second": 20.396, "step": 5300 }, { "epoch": 315.3357041251778, "grad_norm": 0.17278283834457397, "learning_rate": 5.478260869565217e-06, "loss": 2.6645, "step": 5400 }, { "epoch": 315.3357041251778, "eval_loss": 2.6651811599731445, "eval_runtime": 8.8919, "eval_samples_per_second": 156.771, "eval_steps_per_second": 19.681, "step": 5400 }, { "epoch": 321.02560455192037, "grad_norm": 0.24506501853466034, "learning_rate": 5.468227424749163e-06, "loss": 2.6644, "step": 5500 }, { "epoch": 321.02560455192037, "eval_loss": 2.6612024307250977, "eval_runtime": 8.5651, "eval_samples_per_second": 162.754, "eval_steps_per_second": 20.432, "step": 5500 }, { "epoch": 326.71550497866286, "grad_norm": 0.17717023193836212, "learning_rate": 5.45819397993311e-06, "loss": 2.6644, "step": 5600 }, { "epoch": 326.71550497866286, "eval_loss": 2.661200523376465, "eval_runtime": 8.5626, "eval_samples_per_second": 162.801, "eval_steps_per_second": 20.438, "step": 5600 }, { "epoch": 332.4054054054054, "grad_norm": 0.12661577761173248, "learning_rate": 5.448160535117057e-06, "loss": 2.6641, "step": 5700 }, { "epoch": 332.4054054054054, "eval_loss": 2.6609702110290527, "eval_runtime": 8.883, "eval_samples_per_second": 156.929, "eval_steps_per_second": 19.701, "step": 5700 }, { "epoch": 338.0953058321479, "grad_norm": 0.199785977602005, "learning_rate": 5.438127090301003e-06, "loss": 2.6643, "step": 5800 }, { "epoch": 338.0953058321479, "eval_loss": 2.660168409347534, "eval_runtime": 8.5745, "eval_samples_per_second": 162.574, "eval_steps_per_second": 20.409, "step": 5800 }, { "epoch": 343.78520625889047, "grad_norm": 0.2726210057735443, "learning_rate": 5.4280936454849495e-06, "loss": 2.6646, "step": 5900 }, { "epoch": 343.78520625889047, "eval_loss": 2.664670944213867, "eval_runtime": 8.5589, "eval_samples_per_second": 162.871, "eval_steps_per_second": 20.447, "step": 5900 }, { "epoch": 349.475106685633, "grad_norm": 0.3512348234653473, "learning_rate": 5.418060200668896e-06, "loss": 2.664, "step": 6000 }, { "epoch": 349.475106685633, "eval_loss": 2.665173292160034, "eval_runtime": 8.5571, "eval_samples_per_second": 162.905, "eval_steps_per_second": 20.451, "step": 6000 }, { "epoch": 355.1650071123755, "grad_norm": 0.20835170149803162, "learning_rate": 5.408026755852843e-06, "loss": 2.6641, "step": 6100 }, { "epoch": 355.1650071123755, "eval_loss": 2.662048101425171, "eval_runtime": 8.9087, "eval_samples_per_second": 156.476, "eval_steps_per_second": 19.644, "step": 6100 }, { "epoch": 360.8549075391181, "grad_norm": 0.11575555801391602, "learning_rate": 5.397993311036789e-06, "loss": 2.6645, "step": 6200 }, { "epoch": 360.8549075391181, "eval_loss": 2.6617300510406494, "eval_runtime": 8.5709, "eval_samples_per_second": 162.643, "eval_steps_per_second": 20.418, "step": 6200 }, { "epoch": 366.54480796586057, "grad_norm": 0.18948699533939362, "learning_rate": 5.387959866220736e-06, "loss": 2.6639, "step": 6300 }, { "epoch": 366.54480796586057, "eval_loss": 2.6628897190093994, "eval_runtime": 8.9034, "eval_samples_per_second": 156.569, "eval_steps_per_second": 19.655, "step": 6300 }, { "epoch": 372.2347083926031, "grad_norm": 0.12320856750011444, "learning_rate": 5.3779264214046825e-06, "loss": 2.6647, "step": 6400 }, { "epoch": 372.2347083926031, "eval_loss": 2.663992166519165, "eval_runtime": 8.5612, "eval_samples_per_second": 162.828, "eval_steps_per_second": 20.441, "step": 6400 }, { "epoch": 377.9246088193457, "grad_norm": 0.26067054271698, "learning_rate": 5.367892976588628e-06, "loss": 2.6643, "step": 6500 }, { "epoch": 377.9246088193457, "eval_loss": 2.6624581813812256, "eval_runtime": 8.572, "eval_samples_per_second": 162.623, "eval_steps_per_second": 20.415, "step": 6500 }, { "epoch": 383.6145092460882, "grad_norm": 0.18116046488285065, "learning_rate": 5.357859531772575e-06, "loss": 2.664, "step": 6600 }, { "epoch": 383.6145092460882, "eval_loss": 2.662827491760254, "eval_runtime": 8.8871, "eval_samples_per_second": 156.857, "eval_steps_per_second": 19.692, "step": 6600 }, { "epoch": 389.30440967283073, "grad_norm": 0.21489782631397247, "learning_rate": 5.347826086956522e-06, "loss": 2.6635, "step": 6700 }, { "epoch": 389.30440967283073, "eval_loss": 2.6603777408599854, "eval_runtime": 8.5641, "eval_samples_per_second": 162.772, "eval_steps_per_second": 20.434, "step": 6700 }, { "epoch": 394.9943100995733, "grad_norm": 0.1781698316335678, "learning_rate": 5.337792642140468e-06, "loss": 2.6645, "step": 6800 }, { "epoch": 394.9943100995733, "eval_loss": 2.661527156829834, "eval_runtime": 8.5625, "eval_samples_per_second": 162.802, "eval_steps_per_second": 20.438, "step": 6800 }, { "epoch": 400.6842105263158, "grad_norm": 0.18622642755508423, "learning_rate": 5.327759197324415e-06, "loss": 2.6647, "step": 6900 }, { "epoch": 400.6842105263158, "eval_loss": 2.661090135574341, "eval_runtime": 8.5509, "eval_samples_per_second": 163.023, "eval_steps_per_second": 20.466, "step": 6900 }, { "epoch": 406.37411095305833, "grad_norm": 0.15774820744991302, "learning_rate": 5.317725752508361e-06, "loss": 2.6636, "step": 7000 }, { "epoch": 406.37411095305833, "eval_loss": 2.66558575630188, "eval_runtime": 8.898, "eval_samples_per_second": 156.665, "eval_steps_per_second": 19.667, "step": 7000 }, { "epoch": 412.06401137980083, "grad_norm": 0.18330508470535278, "learning_rate": 5.307692307692307e-06, "loss": 2.6645, "step": 7100 }, { "epoch": 412.06401137980083, "eval_loss": 2.6627676486968994, "eval_runtime": 8.5744, "eval_samples_per_second": 162.576, "eval_steps_per_second": 20.409, "step": 7100 }, { "epoch": 417.7539118065434, "grad_norm": 0.23223190009593964, "learning_rate": 5.297658862876254e-06, "loss": 2.6636, "step": 7200 }, { "epoch": 417.7539118065434, "eval_loss": 2.661203145980835, "eval_runtime": 8.5675, "eval_samples_per_second": 162.708, "eval_steps_per_second": 20.426, "step": 7200 }, { "epoch": 423.44381223328594, "grad_norm": 0.15261903405189514, "learning_rate": 5.287625418060201e-06, "loss": 2.6641, "step": 7300 }, { "epoch": 423.44381223328594, "eval_loss": 2.6625847816467285, "eval_runtime": 8.9059, "eval_samples_per_second": 156.525, "eval_steps_per_second": 19.65, "step": 7300 }, { "epoch": 429.13371266002844, "grad_norm": 0.1654181033372879, "learning_rate": 5.277591973244147e-06, "loss": 2.6641, "step": 7400 }, { "epoch": 429.13371266002844, "eval_loss": 2.6628565788269043, "eval_runtime": 8.555, "eval_samples_per_second": 162.946, "eval_steps_per_second": 20.456, "step": 7400 }, { "epoch": 434.823613086771, "grad_norm": 0.2062557488679886, "learning_rate": 5.2675585284280935e-06, "loss": 2.6634, "step": 7500 }, { "epoch": 434.823613086771, "eval_loss": 2.6651501655578613, "eval_runtime": 8.9081, "eval_samples_per_second": 156.487, "eval_steps_per_second": 19.645, "step": 7500 }, { "epoch": 440.5135135135135, "grad_norm": 0.21824122965335846, "learning_rate": 5.25752508361204e-06, "loss": 2.6637, "step": 7600 }, { "epoch": 440.5135135135135, "eval_loss": 2.6624720096588135, "eval_runtime": 8.5608, "eval_samples_per_second": 162.836, "eval_steps_per_second": 20.442, "step": 7600 }, { "epoch": 446.20341394025604, "grad_norm": 0.2458944469690323, "learning_rate": 5.247491638795986e-06, "loss": 2.6637, "step": 7700 }, { "epoch": 446.20341394025604, "eval_loss": 2.661086082458496, "eval_runtime": 8.8963, "eval_samples_per_second": 156.694, "eval_steps_per_second": 19.671, "step": 7700 }, { "epoch": 451.8933143669986, "grad_norm": 0.1574467271566391, "learning_rate": 5.237458193979933e-06, "loss": 2.6639, "step": 7800 }, { "epoch": 451.8933143669986, "eval_loss": 2.6646134853363037, "eval_runtime": 8.5514, "eval_samples_per_second": 163.014, "eval_steps_per_second": 20.464, "step": 7800 }, { "epoch": 457.5832147937411, "grad_norm": 0.1982835829257965, "learning_rate": 5.22742474916388e-06, "loss": 2.664, "step": 7900 }, { "epoch": 457.5832147937411, "eval_loss": 2.6606264114379883, "eval_runtime": 8.5483, "eval_samples_per_second": 163.073, "eval_steps_per_second": 20.472, "step": 7900 }, { "epoch": 463.27311522048365, "grad_norm": 0.19593903422355652, "learning_rate": 5.2173913043478265e-06, "loss": 2.6632, "step": 8000 }, { "epoch": 463.27311522048365, "eval_loss": 2.664707899093628, "eval_runtime": 8.5595, "eval_samples_per_second": 162.861, "eval_steps_per_second": 20.445, "step": 8000 }, { "epoch": 468.9630156472262, "grad_norm": 0.22343507409095764, "learning_rate": 5.207357859531772e-06, "loss": 2.6634, "step": 8100 }, { "epoch": 468.9630156472262, "eval_loss": 2.6642260551452637, "eval_runtime": 8.9066, "eval_samples_per_second": 156.513, "eval_steps_per_second": 19.648, "step": 8100 }, { "epoch": 474.6529160739687, "grad_norm": 0.16728109121322632, "learning_rate": 5.197324414715719e-06, "loss": 2.6633, "step": 8200 }, { "epoch": 474.6529160739687, "eval_loss": 2.6625194549560547, "eval_runtime": 8.5593, "eval_samples_per_second": 162.863, "eval_steps_per_second": 20.445, "step": 8200 }, { "epoch": 480.34281650071125, "grad_norm": 0.23255111277103424, "learning_rate": 5.187290969899666e-06, "loss": 2.6634, "step": 8300 }, { "epoch": 480.34281650071125, "eval_loss": 2.6606099605560303, "eval_runtime": 8.559, "eval_samples_per_second": 162.87, "eval_steps_per_second": 20.446, "step": 8300 }, { "epoch": 486.03271692745375, "grad_norm": 0.118553027510643, "learning_rate": 5.177257525083612e-06, "loss": 2.6632, "step": 8400 }, { "epoch": 486.03271692745375, "eval_loss": 2.663628101348877, "eval_runtime": 8.912, "eval_samples_per_second": 156.418, "eval_steps_per_second": 19.636, "step": 8400 }, { "epoch": 491.7226173541963, "grad_norm": 0.23464259505271912, "learning_rate": 5.167224080267559e-06, "loss": 2.6636, "step": 8500 }, { "epoch": 491.7226173541963, "eval_loss": 2.6618993282318115, "eval_runtime": 8.5674, "eval_samples_per_second": 162.711, "eval_steps_per_second": 20.426, "step": 8500 }, { "epoch": 497.41251778093886, "grad_norm": 0.14757351577281952, "learning_rate": 5.157190635451505e-06, "loss": 2.6634, "step": 8600 }, { "epoch": 497.41251778093886, "eval_loss": 2.6627461910247803, "eval_runtime": 8.553, "eval_samples_per_second": 162.984, "eval_steps_per_second": 20.461, "step": 8600 }, { "epoch": 503.10241820768135, "grad_norm": 0.16491751372814178, "learning_rate": 5.147157190635451e-06, "loss": 2.6634, "step": 8700 }, { "epoch": 503.10241820768135, "eval_loss": 2.6589674949645996, "eval_runtime": 8.8861, "eval_samples_per_second": 156.874, "eval_steps_per_second": 19.694, "step": 8700 }, { "epoch": 508.7923186344239, "grad_norm": 0.17845740914344788, "learning_rate": 5.137123745819398e-06, "loss": 2.6634, "step": 8800 }, { "epoch": 508.7923186344239, "eval_loss": 2.6661596298217773, "eval_runtime": 8.5694, "eval_samples_per_second": 162.672, "eval_steps_per_second": 20.422, "step": 8800 }, { "epoch": 514.4822190611665, "grad_norm": 0.11282111704349518, "learning_rate": 5.127090301003345e-06, "loss": 2.6635, "step": 8900 }, { "epoch": 514.4822190611665, "eval_loss": 2.6612164974212646, "eval_runtime": 8.9016, "eval_samples_per_second": 156.6, "eval_steps_per_second": 19.659, "step": 8900 }, { "epoch": 520.172119487909, "grad_norm": 0.11933238804340363, "learning_rate": 5.117056856187291e-06, "loss": 2.6629, "step": 9000 }, { "epoch": 520.172119487909, "eval_loss": 2.663548707962036, "eval_runtime": 8.5756, "eval_samples_per_second": 162.553, "eval_steps_per_second": 20.407, "step": 9000 }, { "epoch": 525.8620199146515, "grad_norm": 0.16832073032855988, "learning_rate": 5.1070234113712375e-06, "loss": 2.6632, "step": 9100 }, { "epoch": 525.8620199146515, "eval_loss": 2.665459156036377, "eval_runtime": 8.8932, "eval_samples_per_second": 156.75, "eval_steps_per_second": 19.678, "step": 9100 }, { "epoch": 531.5519203413941, "grad_norm": 0.1491301953792572, "learning_rate": 5.096989966555184e-06, "loss": 2.6633, "step": 9200 }, { "epoch": 531.5519203413941, "eval_loss": 2.6649389266967773, "eval_runtime": 8.5727, "eval_samples_per_second": 162.609, "eval_steps_per_second": 20.414, "step": 9200 }, { "epoch": 537.2418207681366, "grad_norm": 0.20299378037452698, "learning_rate": 5.08695652173913e-06, "loss": 2.663, "step": 9300 }, { "epoch": 537.2418207681366, "eval_loss": 2.662057638168335, "eval_runtime": 8.5522, "eval_samples_per_second": 162.999, "eval_steps_per_second": 20.463, "step": 9300 }, { "epoch": 542.9317211948791, "grad_norm": 0.1609990894794464, "learning_rate": 5.076923076923077e-06, "loss": 2.6631, "step": 9400 }, { "epoch": 542.9317211948791, "eval_loss": 2.6604907512664795, "eval_runtime": 8.8909, "eval_samples_per_second": 156.79, "eval_steps_per_second": 19.683, "step": 9400 }, { "epoch": 548.6216216216217, "grad_norm": 0.18364398181438446, "learning_rate": 5.066889632107024e-06, "loss": 2.663, "step": 9500 }, { "epoch": 548.6216216216217, "eval_loss": 2.660076856613159, "eval_runtime": 8.5556, "eval_samples_per_second": 162.934, "eval_steps_per_second": 20.454, "step": 9500 }, { "epoch": 554.3115220483642, "grad_norm": 0.15186648070812225, "learning_rate": 5.05685618729097e-06, "loss": 2.6631, "step": 9600 }, { "epoch": 554.3115220483642, "eval_loss": 2.6639227867126465, "eval_runtime": 8.551, "eval_samples_per_second": 163.021, "eval_steps_per_second": 20.465, "step": 9600 }, { "epoch": 560.0014224751067, "grad_norm": 0.14984333515167236, "learning_rate": 5.046822742474916e-06, "loss": 2.6632, "step": 9700 }, { "epoch": 560.0014224751067, "eval_loss": 2.6611454486846924, "eval_runtime": 8.9892, "eval_samples_per_second": 155.075, "eval_steps_per_second": 19.468, "step": 9700 }, { "epoch": 565.6913229018492, "grad_norm": 0.1124359741806984, "learning_rate": 5.036789297658863e-06, "loss": 2.663, "step": 9800 }, { "epoch": 565.6913229018492, "eval_loss": 2.661329746246338, "eval_runtime": 8.5548, "eval_samples_per_second": 162.95, "eval_steps_per_second": 20.456, "step": 9800 }, { "epoch": 571.3812233285918, "grad_norm": 0.230003222823143, "learning_rate": 5.02675585284281e-06, "loss": 2.6631, "step": 9900 }, { "epoch": 571.3812233285918, "eval_loss": 2.6644487380981445, "eval_runtime": 8.563, "eval_samples_per_second": 162.793, "eval_steps_per_second": 20.437, "step": 9900 }, { "epoch": 577.0711237553343, "grad_norm": 0.172781303524971, "learning_rate": 5.016722408026756e-06, "loss": 2.6626, "step": 10000 }, { "epoch": 577.0711237553343, "eval_loss": 2.662069082260132, "eval_runtime": 8.892, "eval_samples_per_second": 156.769, "eval_steps_per_second": 19.681, "step": 10000 }, { "epoch": 582.7610241820768, "grad_norm": 0.15369383990764618, "learning_rate": 5.0066889632107026e-06, "loss": 2.663, "step": 10100 }, { "epoch": 582.7610241820768, "eval_loss": 2.6648526191711426, "eval_runtime": 8.5714, "eval_samples_per_second": 162.633, "eval_steps_per_second": 20.417, "step": 10100 }, { "epoch": 588.4509246088194, "grad_norm": 0.1935221403837204, "learning_rate": 4.996655518394649e-06, "loss": 2.6632, "step": 10200 }, { "epoch": 588.4509246088194, "eval_loss": 2.6587936878204346, "eval_runtime": 8.5618, "eval_samples_per_second": 162.816, "eval_steps_per_second": 20.44, "step": 10200 }, { "epoch": 594.1408250355619, "grad_norm": 0.14302797615528107, "learning_rate": 4.986622073578595e-06, "loss": 2.6626, "step": 10300 }, { "epoch": 594.1408250355619, "eval_loss": 2.662747383117676, "eval_runtime": 8.8896, "eval_samples_per_second": 156.812, "eval_steps_per_second": 19.686, "step": 10300 }, { "epoch": 599.8307254623044, "grad_norm": 0.18007439374923706, "learning_rate": 4.976588628762542e-06, "loss": 2.6631, "step": 10400 }, { "epoch": 599.8307254623044, "eval_loss": 2.6642062664031982, "eval_runtime": 8.5619, "eval_samples_per_second": 162.814, "eval_steps_per_second": 20.439, "step": 10400 }, { "epoch": 605.520625889047, "grad_norm": 0.2200157195329666, "learning_rate": 4.966555183946489e-06, "loss": 2.6625, "step": 10500 }, { "epoch": 605.520625889047, "eval_loss": 2.6608235836029053, "eval_runtime": 8.5591, "eval_samples_per_second": 162.868, "eval_steps_per_second": 20.446, "step": 10500 }, { "epoch": 611.2105263157895, "grad_norm": 0.1693902462720871, "learning_rate": 4.956521739130435e-06, "loss": 2.6629, "step": 10600 }, { "epoch": 611.2105263157895, "eval_loss": 2.6646671295166016, "eval_runtime": 8.5548, "eval_samples_per_second": 162.95, "eval_steps_per_second": 20.456, "step": 10600 }, { "epoch": 616.900426742532, "grad_norm": 0.17042887210845947, "learning_rate": 4.9464882943143815e-06, "loss": 2.663, "step": 10700 }, { "epoch": 616.900426742532, "eval_loss": 2.6628501415252686, "eval_runtime": 8.9184, "eval_samples_per_second": 156.306, "eval_steps_per_second": 19.622, "step": 10700 }, { "epoch": 622.5903271692746, "grad_norm": 0.15105395019054413, "learning_rate": 4.936454849498328e-06, "loss": 2.6622, "step": 10800 }, { "epoch": 622.5903271692746, "eval_loss": 2.663177251815796, "eval_runtime": 8.5551, "eval_samples_per_second": 162.943, "eval_steps_per_second": 20.456, "step": 10800 }, { "epoch": 628.2802275960171, "grad_norm": 0.16232497990131378, "learning_rate": 4.926421404682274e-06, "loss": 2.662, "step": 10900 }, { "epoch": 628.2802275960171, "eval_loss": 2.662868022918701, "eval_runtime": 8.5562, "eval_samples_per_second": 162.923, "eval_steps_per_second": 20.453, "step": 10900 }, { "epoch": 633.9701280227596, "grad_norm": 0.19268840551376343, "learning_rate": 4.916387959866221e-06, "loss": 2.6616, "step": 11000 }, { "epoch": 633.9701280227596, "eval_loss": 2.664335250854492, "eval_runtime": 8.5889, "eval_samples_per_second": 162.302, "eval_steps_per_second": 20.375, "step": 11000 }, { "epoch": 639.6600284495021, "grad_norm": 0.11089065670967102, "learning_rate": 4.906354515050168e-06, "loss": 2.6604, "step": 11100 }, { "epoch": 639.6600284495021, "eval_loss": 2.656398057937622, "eval_runtime": 8.9031, "eval_samples_per_second": 156.575, "eval_steps_per_second": 19.656, "step": 11100 }, { "epoch": 645.3499288762447, "grad_norm": 0.1336248517036438, "learning_rate": 4.8963210702341136e-06, "loss": 2.6599, "step": 11200 }, { "epoch": 645.3499288762447, "eval_loss": 2.6563735008239746, "eval_runtime": 8.5539, "eval_samples_per_second": 162.966, "eval_steps_per_second": 20.458, "step": 11200 }, { "epoch": 651.0398293029872, "grad_norm": 0.12397616356611252, "learning_rate": 4.88628762541806e-06, "loss": 2.6581, "step": 11300 }, { "epoch": 651.0398293029872, "eval_loss": 2.65476131439209, "eval_runtime": 8.5563, "eval_samples_per_second": 162.92, "eval_steps_per_second": 20.453, "step": 11300 }, { "epoch": 656.7297297297297, "grad_norm": 0.2090333253145218, "learning_rate": 4.876254180602007e-06, "loss": 2.6553, "step": 11400 }, { "epoch": 656.7297297297297, "eval_loss": 2.6521565914154053, "eval_runtime": 8.892, "eval_samples_per_second": 156.77, "eval_steps_per_second": 19.681, "step": 11400 }, { "epoch": 662.4196301564723, "grad_norm": 0.22825314104557037, "learning_rate": 4.866220735785953e-06, "loss": 2.654, "step": 11500 }, { "epoch": 662.4196301564723, "eval_loss": 2.649017095565796, "eval_runtime": 8.5668, "eval_samples_per_second": 162.721, "eval_steps_per_second": 20.428, "step": 11500 }, { "epoch": 668.1095305832148, "grad_norm": 0.19265511631965637, "learning_rate": 4.8561872909699e-06, "loss": 2.6522, "step": 11600 }, { "epoch": 668.1095305832148, "eval_loss": 2.650679588317871, "eval_runtime": 8.5489, "eval_samples_per_second": 163.061, "eval_steps_per_second": 20.47, "step": 11600 }, { "epoch": 673.7994310099573, "grad_norm": 0.1772225797176361, "learning_rate": 4.8461538461538465e-06, "loss": 2.6506, "step": 11700 }, { "epoch": 673.7994310099573, "eval_loss": 2.644835948944092, "eval_runtime": 8.8953, "eval_samples_per_second": 156.711, "eval_steps_per_second": 19.673, "step": 11700 }, { "epoch": 679.4893314366999, "grad_norm": 0.21952596306800842, "learning_rate": 4.8361204013377925e-06, "loss": 2.6495, "step": 11800 }, { "epoch": 679.4893314366999, "eval_loss": 2.6477487087249756, "eval_runtime": 8.5582, "eval_samples_per_second": 162.885, "eval_steps_per_second": 20.448, "step": 11800 }, { "epoch": 685.1792318634424, "grad_norm": 0.15563735365867615, "learning_rate": 4.826086956521739e-06, "loss": 2.6488, "step": 11900 }, { "epoch": 685.1792318634424, "eval_loss": 2.6446661949157715, "eval_runtime": 8.5731, "eval_samples_per_second": 162.602, "eval_steps_per_second": 20.413, "step": 11900 }, { "epoch": 690.8691322901849, "grad_norm": 0.19501689076423645, "learning_rate": 4.816053511705686e-06, "loss": 2.6477, "step": 12000 }, { "epoch": 690.8691322901849, "eval_loss": 2.644357442855835, "eval_runtime": 8.9024, "eval_samples_per_second": 156.587, "eval_steps_per_second": 19.658, "step": 12000 }, { "epoch": 696.5590327169275, "grad_norm": 0.18384377658367157, "learning_rate": 4.806020066889633e-06, "loss": 2.6469, "step": 12100 }, { "epoch": 696.5590327169275, "eval_loss": 2.6418209075927734, "eval_runtime": 8.5626, "eval_samples_per_second": 162.801, "eval_steps_per_second": 20.438, "step": 12100 }, { "epoch": 702.24893314367, "grad_norm": 0.1915460228919983, "learning_rate": 4.795986622073579e-06, "loss": 2.6454, "step": 12200 }, { "epoch": 702.24893314367, "eval_loss": 2.641967535018921, "eval_runtime": 8.5587, "eval_samples_per_second": 162.875, "eval_steps_per_second": 20.447, "step": 12200 }, { "epoch": 707.9388335704125, "grad_norm": 0.18700934946537018, "learning_rate": 4.785953177257525e-06, "loss": 2.6448, "step": 12300 }, { "epoch": 707.9388335704125, "eval_loss": 2.638808012008667, "eval_runtime": 8.5628, "eval_samples_per_second": 162.798, "eval_steps_per_second": 20.437, "step": 12300 }, { "epoch": 713.628733997155, "grad_norm": 0.17106923460960388, "learning_rate": 4.775919732441472e-06, "loss": 2.6446, "step": 12400 }, { "epoch": 713.628733997155, "eval_loss": 2.6404778957366943, "eval_runtime": 8.8985, "eval_samples_per_second": 156.655, "eval_steps_per_second": 19.666, "step": 12400 }, { "epoch": 719.3186344238976, "grad_norm": 0.17941860854625702, "learning_rate": 4.765886287625418e-06, "loss": 2.6436, "step": 12500 }, { "epoch": 719.3186344238976, "eval_loss": 2.6373584270477295, "eval_runtime": 8.557, "eval_samples_per_second": 162.907, "eval_steps_per_second": 20.451, "step": 12500 }, { "epoch": 725.0085348506401, "grad_norm": 0.17565137147903442, "learning_rate": 4.755852842809365e-06, "loss": 2.6434, "step": 12600 }, { "epoch": 725.0085348506401, "eval_loss": 2.639042377471924, "eval_runtime": 8.5557, "eval_samples_per_second": 162.932, "eval_steps_per_second": 20.454, "step": 12600 }, { "epoch": 730.6984352773826, "grad_norm": 0.18980301916599274, "learning_rate": 4.745819397993312e-06, "loss": 2.6428, "step": 12700 }, { "epoch": 730.6984352773826, "eval_loss": 2.6368398666381836, "eval_runtime": 8.9007, "eval_samples_per_second": 156.617, "eval_steps_per_second": 19.661, "step": 12700 }, { "epoch": 736.3883357041252, "grad_norm": 0.1572832465171814, "learning_rate": 4.7357859531772575e-06, "loss": 2.6423, "step": 12800 }, { "epoch": 736.3883357041252, "eval_loss": 2.6357386112213135, "eval_runtime": 8.5632, "eval_samples_per_second": 162.79, "eval_steps_per_second": 20.436, "step": 12800 }, { "epoch": 742.0782361308677, "grad_norm": 0.17804701626300812, "learning_rate": 4.725752508361204e-06, "loss": 2.6415, "step": 12900 }, { "epoch": 742.0782361308677, "eval_loss": 2.636728525161743, "eval_runtime": 8.5558, "eval_samples_per_second": 162.931, "eval_steps_per_second": 20.454, "step": 12900 }, { "epoch": 747.7681365576102, "grad_norm": 0.14196521043777466, "learning_rate": 4.715719063545151e-06, "loss": 2.6415, "step": 13000 }, { "epoch": 747.7681365576102, "eval_loss": 2.6351287364959717, "eval_runtime": 8.5495, "eval_samples_per_second": 163.05, "eval_steps_per_second": 20.469, "step": 13000 }, { "epoch": 753.4580369843528, "grad_norm": 0.16282819211483002, "learning_rate": 4.705685618729097e-06, "loss": 2.6409, "step": 13100 }, { "epoch": 753.4580369843528, "eval_loss": 2.6369380950927734, "eval_runtime": 8.8961, "eval_samples_per_second": 156.698, "eval_steps_per_second": 19.672, "step": 13100 }, { "epoch": 759.1479374110953, "grad_norm": 0.1580921709537506, "learning_rate": 4.695652173913044e-06, "loss": 2.6404, "step": 13200 }, { "epoch": 759.1479374110953, "eval_loss": 2.6370317935943604, "eval_runtime": 8.5558, "eval_samples_per_second": 162.93, "eval_steps_per_second": 20.454, "step": 13200 }, { "epoch": 764.8378378378378, "grad_norm": 0.23563043773174286, "learning_rate": 4.6856187290969905e-06, "loss": 2.6394, "step": 13300 }, { "epoch": 764.8378378378378, "eval_loss": 2.6321442127227783, "eval_runtime": 8.5496, "eval_samples_per_second": 163.048, "eval_steps_per_second": 20.469, "step": 13300 }, { "epoch": 770.5277382645804, "grad_norm": 0.16354724764823914, "learning_rate": 4.675585284280936e-06, "loss": 2.639, "step": 13400 }, { "epoch": 770.5277382645804, "eval_loss": 2.6348910331726074, "eval_runtime": 8.9264, "eval_samples_per_second": 156.166, "eval_steps_per_second": 19.605, "step": 13400 }, { "epoch": 776.2176386913229, "grad_norm": 0.1707228273153305, "learning_rate": 4.665551839464883e-06, "loss": 2.639, "step": 13500 }, { "epoch": 776.2176386913229, "eval_loss": 2.635204792022705, "eval_runtime": 8.5691, "eval_samples_per_second": 162.678, "eval_steps_per_second": 20.422, "step": 13500 }, { "epoch": 781.9075391180654, "grad_norm": 0.16934677958488464, "learning_rate": 4.65551839464883e-06, "loss": 2.6385, "step": 13600 }, { "epoch": 781.9075391180654, "eval_loss": 2.633455276489258, "eval_runtime": 8.557, "eval_samples_per_second": 162.907, "eval_steps_per_second": 20.451, "step": 13600 }, { "epoch": 787.5974395448079, "grad_norm": 0.1871781051158905, "learning_rate": 4.645484949832776e-06, "loss": 2.6379, "step": 13700 }, { "epoch": 787.5974395448079, "eval_loss": 2.633129119873047, "eval_runtime": 8.9094, "eval_samples_per_second": 156.463, "eval_steps_per_second": 19.642, "step": 13700 }, { "epoch": 793.2873399715505, "grad_norm": 0.20615407824516296, "learning_rate": 4.635451505016723e-06, "loss": 2.6376, "step": 13800 }, { "epoch": 793.2873399715505, "eval_loss": 2.634012222290039, "eval_runtime": 8.551, "eval_samples_per_second": 163.021, "eval_steps_per_second": 20.465, "step": 13800 }, { "epoch": 798.977240398293, "grad_norm": 0.21352247893810272, "learning_rate": 4.625418060200669e-06, "loss": 2.6374, "step": 13900 }, { "epoch": 798.977240398293, "eval_loss": 2.6326115131378174, "eval_runtime": 8.5835, "eval_samples_per_second": 162.405, "eval_steps_per_second": 20.388, "step": 13900 }, { "epoch": 804.6671408250355, "grad_norm": 0.21041567623615265, "learning_rate": 4.615384615384616e-06, "loss": 2.6373, "step": 14000 }, { "epoch": 804.6671408250355, "eval_loss": 2.632585287094116, "eval_runtime": 8.9033, "eval_samples_per_second": 156.571, "eval_steps_per_second": 19.656, "step": 14000 }, { "epoch": 810.3570412517781, "grad_norm": 0.16558390855789185, "learning_rate": 4.605351170568562e-06, "loss": 2.637, "step": 14100 }, { "epoch": 810.3570412517781, "eval_loss": 2.6330647468566895, "eval_runtime": 8.5657, "eval_samples_per_second": 162.743, "eval_steps_per_second": 20.43, "step": 14100 }, { "epoch": 816.0469416785206, "grad_norm": 0.14121714234352112, "learning_rate": 4.595317725752509e-06, "loss": 2.6369, "step": 14200 }, { "epoch": 816.0469416785206, "eval_loss": 2.633366823196411, "eval_runtime": 8.5562, "eval_samples_per_second": 162.924, "eval_steps_per_second": 20.453, "step": 14200 }, { "epoch": 821.7368421052631, "grad_norm": 0.18725652992725372, "learning_rate": 4.585284280936456e-06, "loss": 2.6366, "step": 14300 }, { "epoch": 821.7368421052631, "eval_loss": 2.633021116256714, "eval_runtime": 8.547, "eval_samples_per_second": 163.099, "eval_steps_per_second": 20.475, "step": 14300 }, { "epoch": 827.4267425320057, "grad_norm": 0.17320464551448822, "learning_rate": 4.5752508361204015e-06, "loss": 2.6362, "step": 14400 }, { "epoch": 827.4267425320057, "eval_loss": 2.6336045265197754, "eval_runtime": 8.9019, "eval_samples_per_second": 156.595, "eval_steps_per_second": 19.659, "step": 14400 }, { "epoch": 833.1166429587482, "grad_norm": 0.25663965940475464, "learning_rate": 4.565217391304348e-06, "loss": 2.6358, "step": 14500 }, { "epoch": 833.1166429587482, "eval_loss": 2.629626750946045, "eval_runtime": 8.555, "eval_samples_per_second": 162.946, "eval_steps_per_second": 20.456, "step": 14500 }, { "epoch": 838.8065433854907, "grad_norm": 0.19742050766944885, "learning_rate": 4.555183946488295e-06, "loss": 2.6359, "step": 14600 }, { "epoch": 838.8065433854907, "eval_loss": 2.6323554515838623, "eval_runtime": 8.5537, "eval_samples_per_second": 162.97, "eval_steps_per_second": 20.459, "step": 14600 }, { "epoch": 844.4964438122333, "grad_norm": 0.17131681740283966, "learning_rate": 4.545150501672241e-06, "loss": 2.636, "step": 14700 }, { "epoch": 844.4964438122333, "eval_loss": 2.628143787384033, "eval_runtime": 8.8882, "eval_samples_per_second": 156.837, "eval_steps_per_second": 19.689, "step": 14700 }, { "epoch": 850.1863442389758, "grad_norm": 0.12929615378379822, "learning_rate": 4.535117056856188e-06, "loss": 2.6355, "step": 14800 }, { "epoch": 850.1863442389758, "eval_loss": 2.62906813621521, "eval_runtime": 8.5719, "eval_samples_per_second": 162.624, "eval_steps_per_second": 20.415, "step": 14800 }, { "epoch": 855.8762446657183, "grad_norm": 0.1839623749256134, "learning_rate": 4.5250836120401345e-06, "loss": 2.6357, "step": 14900 }, { "epoch": 855.8762446657183, "eval_loss": 2.6294586658477783, "eval_runtime": 8.556, "eval_samples_per_second": 162.926, "eval_steps_per_second": 20.453, "step": 14900 }, { "epoch": 861.5661450924608, "grad_norm": 0.16717371344566345, "learning_rate": 4.51505016722408e-06, "loss": 2.6344, "step": 15000 }, { "epoch": 861.5661450924608, "eval_loss": 2.63043475151062, "eval_runtime": 8.8856, "eval_samples_per_second": 156.883, "eval_steps_per_second": 19.695, "step": 15000 }, { "epoch": 867.2560455192034, "grad_norm": 0.1823185533285141, "learning_rate": 4.505016722408027e-06, "loss": 2.6348, "step": 15100 }, { "epoch": 867.2560455192034, "eval_loss": 2.6305038928985596, "eval_runtime": 8.5615, "eval_samples_per_second": 162.822, "eval_steps_per_second": 20.44, "step": 15100 }, { "epoch": 872.9459459459459, "grad_norm": 0.1823842078447342, "learning_rate": 4.494983277591973e-06, "loss": 2.6348, "step": 15200 }, { "epoch": 872.9459459459459, "eval_loss": 2.6309924125671387, "eval_runtime": 8.5581, "eval_samples_per_second": 162.887, "eval_steps_per_second": 20.449, "step": 15200 }, { "epoch": 878.6358463726884, "grad_norm": 0.20153598487377167, "learning_rate": 4.48494983277592e-06, "loss": 2.6342, "step": 15300 }, { "epoch": 878.6358463726884, "eval_loss": 2.6329071521759033, "eval_runtime": 8.8917, "eval_samples_per_second": 156.776, "eval_steps_per_second": 19.681, "step": 15300 }, { "epoch": 884.325746799431, "grad_norm": 0.18218009173870087, "learning_rate": 4.474916387959866e-06, "loss": 2.6344, "step": 15400 }, { "epoch": 884.325746799431, "eval_loss": 2.6302568912506104, "eval_runtime": 8.5652, "eval_samples_per_second": 162.752, "eval_steps_per_second": 20.432, "step": 15400 }, { "epoch": 890.0156472261735, "grad_norm": 0.16739265620708466, "learning_rate": 4.4648829431438125e-06, "loss": 2.6343, "step": 15500 }, { "epoch": 890.0156472261735, "eval_loss": 2.6310319900512695, "eval_runtime": 8.5584, "eval_samples_per_second": 162.88, "eval_steps_per_second": 20.448, "step": 15500 }, { "epoch": 895.705547652916, "grad_norm": 0.1390063315629959, "learning_rate": 4.454849498327759e-06, "loss": 2.6339, "step": 15600 }, { "epoch": 895.705547652916, "eval_loss": 2.6301069259643555, "eval_runtime": 8.9049, "eval_samples_per_second": 156.544, "eval_steps_per_second": 19.652, "step": 15600 }, { "epoch": 901.3954480796586, "grad_norm": 0.18924345076084137, "learning_rate": 4.444816053511705e-06, "loss": 2.6339, "step": 15700 }, { "epoch": 901.3954480796586, "eval_loss": 2.6323258876800537, "eval_runtime": 8.5547, "eval_samples_per_second": 162.952, "eval_steps_per_second": 20.457, "step": 15700 }, { "epoch": 907.0853485064011, "grad_norm": 0.18514582514762878, "learning_rate": 4.434782608695652e-06, "loss": 2.6338, "step": 15800 }, { "epoch": 907.0853485064011, "eval_loss": 2.629317045211792, "eval_runtime": 8.5557, "eval_samples_per_second": 162.933, "eval_steps_per_second": 20.454, "step": 15800 }, { "epoch": 912.7752489331436, "grad_norm": 0.16134916245937347, "learning_rate": 4.424749163879599e-06, "loss": 2.6332, "step": 15900 }, { "epoch": 912.7752489331436, "eval_loss": 2.6283786296844482, "eval_runtime": 8.8916, "eval_samples_per_second": 156.778, "eval_steps_per_second": 19.682, "step": 15900 }, { "epoch": 918.4651493598863, "grad_norm": 0.15325242280960083, "learning_rate": 4.414715719063545e-06, "loss": 2.6327, "step": 16000 }, { "epoch": 918.4651493598863, "eval_loss": 2.628596305847168, "eval_runtime": 8.5648, "eval_samples_per_second": 162.759, "eval_steps_per_second": 20.432, "step": 16000 }, { "epoch": 924.1550497866288, "grad_norm": 0.16646109521389008, "learning_rate": 4.404682274247491e-06, "loss": 2.6334, "step": 16100 }, { "epoch": 924.1550497866288, "eval_loss": 2.6277356147766113, "eval_runtime": 8.5578, "eval_samples_per_second": 162.891, "eval_steps_per_second": 20.449, "step": 16100 }, { "epoch": 929.8449502133712, "grad_norm": 0.190487802028656, "learning_rate": 4.394648829431438e-06, "loss": 2.6324, "step": 16200 }, { "epoch": 929.8449502133712, "eval_loss": 2.632991075515747, "eval_runtime": 8.8984, "eval_samples_per_second": 156.657, "eval_steps_per_second": 19.666, "step": 16200 }, { "epoch": 935.5348506401137, "grad_norm": 0.12819956243038177, "learning_rate": 4.384615384615384e-06, "loss": 2.6329, "step": 16300 }, { "epoch": 935.5348506401137, "eval_loss": 2.6287131309509277, "eval_runtime": 8.5512, "eval_samples_per_second": 163.019, "eval_steps_per_second": 20.465, "step": 16300 }, { "epoch": 941.2247510668564, "grad_norm": 0.1414095014333725, "learning_rate": 4.374581939799331e-06, "loss": 2.6329, "step": 16400 }, { "epoch": 941.2247510668564, "eval_loss": 2.6280200481414795, "eval_runtime": 8.5543, "eval_samples_per_second": 162.96, "eval_steps_per_second": 20.458, "step": 16400 }, { "epoch": 946.9146514935989, "grad_norm": 0.1598784625530243, "learning_rate": 4.364548494983278e-06, "loss": 2.6321, "step": 16500 }, { "epoch": 946.9146514935989, "eval_loss": 2.627798080444336, "eval_runtime": 8.5617, "eval_samples_per_second": 162.818, "eval_steps_per_second": 20.44, "step": 16500 }, { "epoch": 952.6045519203414, "grad_norm": 0.16925720870494843, "learning_rate": 4.354515050167224e-06, "loss": 2.6323, "step": 16600 }, { "epoch": 952.6045519203414, "eval_loss": 2.627779722213745, "eval_runtime": 8.8962, "eval_samples_per_second": 156.697, "eval_steps_per_second": 19.671, "step": 16600 }, { "epoch": 958.294452347084, "grad_norm": 0.14368008077144623, "learning_rate": 4.34448160535117e-06, "loss": 2.6325, "step": 16700 }, { "epoch": 958.294452347084, "eval_loss": 2.6313493251800537, "eval_runtime": 8.5564, "eval_samples_per_second": 162.919, "eval_steps_per_second": 20.452, "step": 16700 }, { "epoch": 963.9843527738265, "grad_norm": 0.17267128825187683, "learning_rate": 4.334448160535117e-06, "loss": 2.6323, "step": 16800 }, { "epoch": 963.9843527738265, "eval_loss": 2.628115653991699, "eval_runtime": 8.9, "eval_samples_per_second": 156.628, "eval_steps_per_second": 19.663, "step": 16800 }, { "epoch": 969.674253200569, "grad_norm": 0.19119863212108612, "learning_rate": 4.324414715719064e-06, "loss": 2.6318, "step": 16900 }, { "epoch": 969.674253200569, "eval_loss": 2.627437114715576, "eval_runtime": 8.5548, "eval_samples_per_second": 162.95, "eval_steps_per_second": 20.456, "step": 16900 }, { "epoch": 975.3641536273116, "grad_norm": 0.13695764541625977, "learning_rate": 4.31438127090301e-06, "loss": 2.6315, "step": 17000 }, { "epoch": 975.3641536273116, "eval_loss": 2.6262221336364746, "eval_runtime": 8.8997, "eval_samples_per_second": 156.634, "eval_steps_per_second": 19.664, "step": 17000 }, { "epoch": 981.0540540540541, "grad_norm": 0.14241984486579895, "learning_rate": 4.3043478260869565e-06, "loss": 2.6318, "step": 17100 }, { "epoch": 981.0540540540541, "eval_loss": 2.6269607543945312, "eval_runtime": 8.5513, "eval_samples_per_second": 163.017, "eval_steps_per_second": 20.465, "step": 17100 }, { "epoch": 986.7439544807966, "grad_norm": 0.15792237222194672, "learning_rate": 4.294314381270903e-06, "loss": 2.6315, "step": 17200 }, { "epoch": 986.7439544807966, "eval_loss": 2.6278719902038574, "eval_runtime": 8.6117, "eval_samples_per_second": 161.873, "eval_steps_per_second": 20.321, "step": 17200 }, { "epoch": 992.4338549075392, "grad_norm": 0.17118434607982635, "learning_rate": 4.284280936454849e-06, "loss": 2.6316, "step": 17300 }, { "epoch": 992.4338549075392, "eval_loss": 2.6280527114868164, "eval_runtime": 8.9035, "eval_samples_per_second": 156.568, "eval_steps_per_second": 19.655, "step": 17300 }, { "epoch": 998.1237553342817, "grad_norm": 0.15846611559391022, "learning_rate": 4.274247491638796e-06, "loss": 2.6313, "step": 17400 }, { "epoch": 998.1237553342817, "eval_loss": 2.6250662803649902, "eval_runtime": 8.6057, "eval_samples_per_second": 161.986, "eval_steps_per_second": 20.335, "step": 17400 }, { "epoch": 1003.8136557610242, "grad_norm": 0.17078837752342224, "learning_rate": 4.264214046822743e-06, "loss": 2.6312, "step": 17500 }, { "epoch": 1003.8136557610242, "eval_loss": 2.6274046897888184, "eval_runtime": 8.564, "eval_samples_per_second": 162.775, "eval_steps_per_second": 20.434, "step": 17500 }, { "epoch": 1009.5035561877667, "grad_norm": 0.1965128779411316, "learning_rate": 4.254180602006689e-06, "loss": 2.6312, "step": 17600 }, { "epoch": 1009.5035561877667, "eval_loss": 2.6278066635131836, "eval_runtime": 8.9009, "eval_samples_per_second": 156.614, "eval_steps_per_second": 19.661, "step": 17600 }, { "epoch": 1015.1934566145093, "grad_norm": 0.19483456015586853, "learning_rate": 4.244147157190635e-06, "loss": 2.6311, "step": 17700 }, { "epoch": 1015.1934566145093, "eval_loss": 2.623715400695801, "eval_runtime": 8.5597, "eval_samples_per_second": 162.857, "eval_steps_per_second": 20.445, "step": 17700 }, { "epoch": 1020.8833570412518, "grad_norm": 0.14647985994815826, "learning_rate": 4.234113712374582e-06, "loss": 2.6309, "step": 17800 }, { "epoch": 1020.8833570412518, "eval_loss": 2.625011920928955, "eval_runtime": 8.8958, "eval_samples_per_second": 156.704, "eval_steps_per_second": 19.672, "step": 17800 }, { "epoch": 1026.5732574679944, "grad_norm": 0.1495138704776764, "learning_rate": 4.224080267558528e-06, "loss": 2.6303, "step": 17900 }, { "epoch": 1026.5732574679944, "eval_loss": 2.6249139308929443, "eval_runtime": 8.5668, "eval_samples_per_second": 162.72, "eval_steps_per_second": 20.428, "step": 17900 }, { "epoch": 1032.2631578947369, "grad_norm": 0.1665605753660202, "learning_rate": 4.214046822742475e-06, "loss": 2.6305, "step": 18000 }, { "epoch": 1032.2631578947369, "eval_loss": 2.6271395683288574, "eval_runtime": 8.8963, "eval_samples_per_second": 156.694, "eval_steps_per_second": 19.671, "step": 18000 }, { "epoch": 1037.9530583214794, "grad_norm": 0.1886260211467743, "learning_rate": 4.2040133779264216e-06, "loss": 2.6307, "step": 18100 }, { "epoch": 1037.9530583214794, "eval_loss": 2.6232030391693115, "eval_runtime": 8.5585, "eval_samples_per_second": 162.879, "eval_steps_per_second": 20.447, "step": 18100 }, { "epoch": 1043.6429587482219, "grad_norm": 0.1451101154088974, "learning_rate": 4.1939799331103675e-06, "loss": 2.6304, "step": 18200 }, { "epoch": 1043.6429587482219, "eval_loss": 2.624784469604492, "eval_runtime": 8.5566, "eval_samples_per_second": 162.916, "eval_steps_per_second": 20.452, "step": 18200 }, { "epoch": 1049.3328591749644, "grad_norm": 0.13841372728347778, "learning_rate": 4.183946488294314e-06, "loss": 2.6305, "step": 18300 }, { "epoch": 1049.3328591749644, "eval_loss": 2.626993179321289, "eval_runtime": 8.9131, "eval_samples_per_second": 156.398, "eval_steps_per_second": 19.634, "step": 18300 }, { "epoch": 1055.0227596017069, "grad_norm": 0.1455683559179306, "learning_rate": 4.173913043478261e-06, "loss": 2.6301, "step": 18400 }, { "epoch": 1055.0227596017069, "eval_loss": 2.6284282207489014, "eval_runtime": 8.555, "eval_samples_per_second": 162.946, "eval_steps_per_second": 20.456, "step": 18400 }, { "epoch": 1060.7126600284496, "grad_norm": 0.14764897525310516, "learning_rate": 4.163879598662208e-06, "loss": 2.6305, "step": 18500 }, { "epoch": 1060.7126600284496, "eval_loss": 2.626128673553467, "eval_runtime": 8.5552, "eval_samples_per_second": 162.942, "eval_steps_per_second": 20.455, "step": 18500 }, { "epoch": 1066.402560455192, "grad_norm": 0.15602290630340576, "learning_rate": 4.153846153846154e-06, "loss": 2.6303, "step": 18600 }, { "epoch": 1066.402560455192, "eval_loss": 2.6236324310302734, "eval_runtime": 8.5623, "eval_samples_per_second": 162.806, "eval_steps_per_second": 20.438, "step": 18600 }, { "epoch": 1099.8036984352773, "grad_norm": 0.1434181034564972, "learning_rate": 4.1438127090301005e-06, "loss": 2.6299, "step": 18700 }, { "epoch": 1099.8036984352773, "eval_loss": 2.6240837574005127, "eval_runtime": 8.8817, "eval_samples_per_second": 156.952, "eval_steps_per_second": 19.703, "step": 18700 }, { "epoch": 1105.49359886202, "grad_norm": 0.17055080831050873, "learning_rate": 4.133779264214047e-06, "loss": 2.6296, "step": 18800 }, { "epoch": 1105.49359886202, "eval_loss": 2.627481698989868, "eval_runtime": 8.5387, "eval_samples_per_second": 163.257, "eval_steps_per_second": 20.495, "step": 18800 }, { "epoch": 1111.1834992887625, "grad_norm": 0.15118207037448883, "learning_rate": 4.123745819397993e-06, "loss": 2.6295, "step": 18900 }, { "epoch": 1111.1834992887625, "eval_loss": 2.6243932247161865, "eval_runtime": 8.5529, "eval_samples_per_second": 162.985, "eval_steps_per_second": 20.461, "step": 18900 }, { "epoch": 1116.873399715505, "grad_norm": 0.14333444833755493, "learning_rate": 4.11371237458194e-06, "loss": 2.6294, "step": 19000 }, { "epoch": 1116.873399715505, "eval_loss": 2.6264147758483887, "eval_runtime": 8.8805, "eval_samples_per_second": 156.973, "eval_steps_per_second": 19.706, "step": 19000 }, { "epoch": 1122.5633001422475, "grad_norm": 0.13676032423973083, "learning_rate": 4.103678929765887e-06, "loss": 2.6292, "step": 19100 }, { "epoch": 1122.5633001422475, "eval_loss": 2.6256096363067627, "eval_runtime": 8.5396, "eval_samples_per_second": 163.24, "eval_steps_per_second": 20.493, "step": 19100 }, { "epoch": 1128.25320056899, "grad_norm": 0.13608410954475403, "learning_rate": 4.0936454849498326e-06, "loss": 2.6292, "step": 19200 }, { "epoch": 1128.25320056899, "eval_loss": 2.6272470951080322, "eval_runtime": 8.5412, "eval_samples_per_second": 163.209, "eval_steps_per_second": 20.489, "step": 19200 }, { "epoch": 1133.9431009957325, "grad_norm": 0.16941364109516144, "learning_rate": 4.083612040133779e-06, "loss": 2.6294, "step": 19300 }, { "epoch": 1133.9431009957325, "eval_loss": 2.6245925426483154, "eval_runtime": 8.8711, "eval_samples_per_second": 157.14, "eval_steps_per_second": 19.727, "step": 19300 }, { "epoch": 1139.6330014224752, "grad_norm": 0.17961208522319794, "learning_rate": 4.073578595317726e-06, "loss": 2.6291, "step": 19400 }, { "epoch": 1139.6330014224752, "eval_loss": 2.6260921955108643, "eval_runtime": 8.5481, "eval_samples_per_second": 163.078, "eval_steps_per_second": 20.472, "step": 19400 }, { "epoch": 1145.3229018492177, "grad_norm": 0.15234056115150452, "learning_rate": 4.063545150501672e-06, "loss": 2.6288, "step": 19500 }, { "epoch": 1145.3229018492177, "eval_loss": 2.624178647994995, "eval_runtime": 8.5458, "eval_samples_per_second": 163.121, "eval_steps_per_second": 20.478, "step": 19500 }, { "epoch": 1151.0128022759602, "grad_norm": 0.1660071462392807, "learning_rate": 4.053511705685619e-06, "loss": 2.6289, "step": 19600 }, { "epoch": 1151.0128022759602, "eval_loss": 2.625214099884033, "eval_runtime": 8.8562, "eval_samples_per_second": 157.404, "eval_steps_per_second": 19.76, "step": 19600 }, { "epoch": 1156.7027027027027, "grad_norm": 0.1432279646396637, "learning_rate": 4.0434782608695655e-06, "loss": 2.6288, "step": 19700 }, { "epoch": 1156.7027027027027, "eval_loss": 2.6248371601104736, "eval_runtime": 8.5388, "eval_samples_per_second": 163.255, "eval_steps_per_second": 20.495, "step": 19700 }, { "epoch": 1162.3926031294452, "grad_norm": 0.13359645009040833, "learning_rate": 4.0334448160535115e-06, "loss": 2.6291, "step": 19800 }, { "epoch": 1162.3926031294452, "eval_loss": 2.6228439807891846, "eval_runtime": 8.5401, "eval_samples_per_second": 163.23, "eval_steps_per_second": 20.492, "step": 19800 }, { "epoch": 1168.0825035561877, "grad_norm": 0.18464621901512146, "learning_rate": 4.023411371237458e-06, "loss": 2.6286, "step": 19900 }, { "epoch": 1168.0825035561877, "eval_loss": 2.624844789505005, "eval_runtime": 8.5435, "eval_samples_per_second": 163.165, "eval_steps_per_second": 20.483, "step": 19900 }, { "epoch": 1173.7724039829302, "grad_norm": 0.14693519473075867, "learning_rate": 4.013377926421405e-06, "loss": 2.6282, "step": 20000 }, { "epoch": 1173.7724039829302, "eval_loss": 2.625211238861084, "eval_runtime": 8.876, "eval_samples_per_second": 157.053, "eval_steps_per_second": 19.716, "step": 20000 }, { "epoch": 1179.462304409673, "grad_norm": 0.14849957823753357, "learning_rate": 4.003344481605351e-06, "loss": 2.6281, "step": 20100 }, { "epoch": 1179.462304409673, "eval_loss": 2.6256697177886963, "eval_runtime": 8.8789, "eval_samples_per_second": 157.002, "eval_steps_per_second": 19.71, "step": 20100 }, { "epoch": 1185.1522048364154, "grad_norm": 0.1465172916650772, "learning_rate": 3.993311036789298e-06, "loss": 2.6279, "step": 20200 }, { "epoch": 1185.1522048364154, "eval_loss": 2.6242611408233643, "eval_runtime": 8.5384, "eval_samples_per_second": 163.263, "eval_steps_per_second": 20.496, "step": 20200 }, { "epoch": 1190.842105263158, "grad_norm": 0.15794384479522705, "learning_rate": 3.9832775919732444e-06, "loss": 2.6281, "step": 20300 }, { "epoch": 1190.842105263158, "eval_loss": 2.623426914215088, "eval_runtime": 8.5314, "eval_samples_per_second": 163.397, "eval_steps_per_second": 20.512, "step": 20300 }, { "epoch": 1196.5320056899004, "grad_norm": 0.1284749060869217, "learning_rate": 3.97324414715719e-06, "loss": 2.6281, "step": 20400 }, { "epoch": 1196.5320056899004, "eval_loss": 2.622859239578247, "eval_runtime": 8.863, "eval_samples_per_second": 157.283, "eval_steps_per_second": 19.745, "step": 20400 }, { "epoch": 1202.221906116643, "grad_norm": 0.1669575572013855, "learning_rate": 3.963210702341137e-06, "loss": 2.6281, "step": 20500 }, { "epoch": 1202.221906116643, "eval_loss": 2.6228220462799072, "eval_runtime": 8.5362, "eval_samples_per_second": 163.305, "eval_steps_per_second": 20.501, "step": 20500 }, { "epoch": 1207.9118065433854, "grad_norm": 0.12002875655889511, "learning_rate": 3.953177257525084e-06, "loss": 2.6284, "step": 20600 }, { "epoch": 1207.9118065433854, "eval_loss": 2.6229088306427, "eval_runtime": 8.5208, "eval_samples_per_second": 163.599, "eval_steps_per_second": 20.538, "step": 20600 }, { "epoch": 1213.6017069701281, "grad_norm": 0.14911407232284546, "learning_rate": 3.943143812709031e-06, "loss": 2.6278, "step": 20700 }, { "epoch": 1213.6017069701281, "eval_loss": 2.6207728385925293, "eval_runtime": 8.5412, "eval_samples_per_second": 163.208, "eval_steps_per_second": 20.489, "step": 20700 }, { "epoch": 1219.2916073968706, "grad_norm": 0.1687910258769989, "learning_rate": 3.9331103678929765e-06, "loss": 2.6277, "step": 20800 }, { "epoch": 1219.2916073968706, "eval_loss": 2.623382806777954, "eval_runtime": 8.8763, "eval_samples_per_second": 157.047, "eval_steps_per_second": 19.715, "step": 20800 }, { "epoch": 1224.9815078236131, "grad_norm": 0.1914646476507187, "learning_rate": 3.923076923076923e-06, "loss": 2.6271, "step": 20900 }, { "epoch": 1224.9815078236131, "eval_loss": 2.6222121715545654, "eval_runtime": 8.5242, "eval_samples_per_second": 163.535, "eval_steps_per_second": 20.53, "step": 20900 }, { "epoch": 1230.6714082503556, "grad_norm": 0.15010875463485718, "learning_rate": 3.91304347826087e-06, "loss": 2.6276, "step": 21000 }, { "epoch": 1230.6714082503556, "eval_loss": 2.6212801933288574, "eval_runtime": 8.5399, "eval_samples_per_second": 163.234, "eval_steps_per_second": 20.492, "step": 21000 }, { "epoch": 1236.3613086770981, "grad_norm": 0.1383567601442337, "learning_rate": 3.903010033444816e-06, "loss": 2.6275, "step": 21100 }, { "epoch": 1236.3613086770981, "eval_loss": 2.6240437030792236, "eval_runtime": 8.5381, "eval_samples_per_second": 163.269, "eval_steps_per_second": 20.496, "step": 21100 }, { "epoch": 1242.0512091038406, "grad_norm": 0.15790875256061554, "learning_rate": 3.892976588628763e-06, "loss": 2.6267, "step": 21200 }, { "epoch": 1242.0512091038406, "eval_loss": 2.623500108718872, "eval_runtime": 8.8811, "eval_samples_per_second": 156.963, "eval_steps_per_second": 19.705, "step": 21200 }, { "epoch": 1247.7411095305831, "grad_norm": 0.15240466594696045, "learning_rate": 3.8829431438127095e-06, "loss": 2.6269, "step": 21300 }, { "epoch": 1247.7411095305831, "eval_loss": 2.6207492351531982, "eval_runtime": 8.5211, "eval_samples_per_second": 163.594, "eval_steps_per_second": 20.537, "step": 21300 }, { "epoch": 1253.4310099573258, "grad_norm": 0.1933618187904358, "learning_rate": 3.8729096989966554e-06, "loss": 2.627, "step": 21400 }, { "epoch": 1253.4310099573258, "eval_loss": 2.62373948097229, "eval_runtime": 8.5399, "eval_samples_per_second": 163.235, "eval_steps_per_second": 20.492, "step": 21400 }, { "epoch": 1259.1209103840683, "grad_norm": 0.17298194766044617, "learning_rate": 3.862876254180602e-06, "loss": 2.6273, "step": 21500 }, { "epoch": 1259.1209103840683, "eval_loss": 2.626997232437134, "eval_runtime": 8.88, "eval_samples_per_second": 156.981, "eval_steps_per_second": 19.707, "step": 21500 }, { "epoch": 1264.8108108108108, "grad_norm": 0.15336528420448303, "learning_rate": 3.852842809364549e-06, "loss": 2.6276, "step": 21600 }, { "epoch": 1264.8108108108108, "eval_loss": 2.6227035522460938, "eval_runtime": 8.5395, "eval_samples_per_second": 163.241, "eval_steps_per_second": 20.493, "step": 21600 }, { "epoch": 1270.5007112375533, "grad_norm": 0.1456770896911621, "learning_rate": 3.842809364548495e-06, "loss": 2.6264, "step": 21700 }, { "epoch": 1270.5007112375533, "eval_loss": 2.6244804859161377, "eval_runtime": 8.5371, "eval_samples_per_second": 163.287, "eval_steps_per_second": 20.499, "step": 21700 }, { "epoch": 1276.1906116642958, "grad_norm": 0.14131468534469604, "learning_rate": 3.832775919732442e-06, "loss": 2.6261, "step": 21800 }, { "epoch": 1276.1906116642958, "eval_loss": 2.6235532760620117, "eval_runtime": 8.8848, "eval_samples_per_second": 156.898, "eval_steps_per_second": 19.697, "step": 21800 }, { "epoch": 1281.8805120910383, "grad_norm": 0.16801823675632477, "learning_rate": 3.822742474916388e-06, "loss": 2.6266, "step": 21900 }, { "epoch": 1281.8805120910383, "eval_loss": 2.6234781742095947, "eval_runtime": 8.5394, "eval_samples_per_second": 163.243, "eval_steps_per_second": 20.493, "step": 21900 }, { "epoch": 1287.570412517781, "grad_norm": 0.13501711189746857, "learning_rate": 3.8127090301003347e-06, "loss": 2.6261, "step": 22000 }, { "epoch": 1287.570412517781, "eval_loss": 2.621072292327881, "eval_runtime": 8.5307, "eval_samples_per_second": 163.41, "eval_steps_per_second": 20.514, "step": 22000 }, { "epoch": 1293.2603129445235, "grad_norm": 0.14802291989326477, "learning_rate": 3.802675585284281e-06, "loss": 2.6267, "step": 22100 }, { "epoch": 1293.2603129445235, "eval_loss": 2.625509023666382, "eval_runtime": 8.876, "eval_samples_per_second": 157.052, "eval_steps_per_second": 19.716, "step": 22100 }, { "epoch": 1298.950213371266, "grad_norm": 0.149693563580513, "learning_rate": 3.792642140468228e-06, "loss": 2.6266, "step": 22200 }, { "epoch": 1298.950213371266, "eval_loss": 2.6230156421661377, "eval_runtime": 8.5523, "eval_samples_per_second": 162.996, "eval_steps_per_second": 20.462, "step": 22200 }, { "epoch": 1304.6401137980085, "grad_norm": 0.16010881960391998, "learning_rate": 3.782608695652174e-06, "loss": 2.6263, "step": 22300 }, { "epoch": 1304.6401137980085, "eval_loss": 2.623608112335205, "eval_runtime": 8.5405, "eval_samples_per_second": 163.222, "eval_steps_per_second": 20.491, "step": 22300 }, { "epoch": 1310.330014224751, "grad_norm": 0.1507118195295334, "learning_rate": 3.7725752508361205e-06, "loss": 2.6262, "step": 22400 }, { "epoch": 1310.330014224751, "eval_loss": 2.6196281909942627, "eval_runtime": 8.8776, "eval_samples_per_second": 157.024, "eval_steps_per_second": 19.713, "step": 22400 }, { "epoch": 1316.0199146514935, "grad_norm": 0.12015032023191452, "learning_rate": 3.7625418060200673e-06, "loss": 2.6261, "step": 22500 }, { "epoch": 1316.0199146514935, "eval_loss": 2.6222877502441406, "eval_runtime": 8.5476, "eval_samples_per_second": 163.086, "eval_steps_per_second": 20.474, "step": 22500 }, { "epoch": 1321.709815078236, "grad_norm": 0.14796671271324158, "learning_rate": 3.7525083612040136e-06, "loss": 2.6261, "step": 22600 }, { "epoch": 1321.709815078236, "eval_loss": 2.623142957687378, "eval_runtime": 8.8638, "eval_samples_per_second": 157.268, "eval_steps_per_second": 19.743, "step": 22600 }, { "epoch": 1327.3997155049788, "grad_norm": 0.14206399023532867, "learning_rate": 3.74247491638796e-06, "loss": 2.6261, "step": 22700 }, { "epoch": 1327.3997155049788, "eval_loss": 2.620297431945801, "eval_runtime": 8.5358, "eval_samples_per_second": 163.313, "eval_steps_per_second": 20.502, "step": 22700 }, { "epoch": 1333.0896159317213, "grad_norm": 0.1448485553264618, "learning_rate": 3.7324414715719067e-06, "loss": 2.6258, "step": 22800 }, { "epoch": 1333.0896159317213, "eval_loss": 2.6241817474365234, "eval_runtime": 8.5523, "eval_samples_per_second": 162.998, "eval_steps_per_second": 20.462, "step": 22800 }, { "epoch": 1338.7795163584638, "grad_norm": 0.14887595176696777, "learning_rate": 3.722408026755853e-06, "loss": 2.6255, "step": 22900 }, { "epoch": 1338.7795163584638, "eval_loss": 2.622042179107666, "eval_runtime": 8.902, "eval_samples_per_second": 156.594, "eval_steps_per_second": 19.658, "step": 22900 }, { "epoch": 1344.4694167852062, "grad_norm": 0.16686739027500153, "learning_rate": 3.7123745819398e-06, "loss": 2.6258, "step": 23000 }, { "epoch": 1344.4694167852062, "eval_loss": 2.6229121685028076, "eval_runtime": 8.5332, "eval_samples_per_second": 163.362, "eval_steps_per_second": 20.508, "step": 23000 }, { "epoch": 1350.1593172119487, "grad_norm": 0.16153846681118011, "learning_rate": 3.702341137123746e-06, "loss": 2.6257, "step": 23100 }, { "epoch": 1350.1593172119487, "eval_loss": 2.6239538192749023, "eval_runtime": 8.8767, "eval_samples_per_second": 157.04, "eval_steps_per_second": 19.715, "step": 23100 }, { "epoch": 1355.8492176386912, "grad_norm": 0.1725204735994339, "learning_rate": 3.6923076923076925e-06, "loss": 2.6258, "step": 23200 }, { "epoch": 1355.8492176386912, "eval_loss": 2.6215097904205322, "eval_runtime": 8.5286, "eval_samples_per_second": 163.45, "eval_steps_per_second": 20.519, "step": 23200 }, { "epoch": 1361.539118065434, "grad_norm": 0.12999078631401062, "learning_rate": 3.6822742474916393e-06, "loss": 2.6253, "step": 23300 }, { "epoch": 1361.539118065434, "eval_loss": 2.6233925819396973, "eval_runtime": 8.8885, "eval_samples_per_second": 156.832, "eval_steps_per_second": 19.688, "step": 23300 }, { "epoch": 1367.2290184921765, "grad_norm": 0.1744973212480545, "learning_rate": 3.6722408026755856e-06, "loss": 2.6257, "step": 23400 }, { "epoch": 1367.2290184921765, "eval_loss": 2.623767614364624, "eval_runtime": 8.5312, "eval_samples_per_second": 163.401, "eval_steps_per_second": 20.513, "step": 23400 }, { "epoch": 1372.918918918919, "grad_norm": 0.13030101358890533, "learning_rate": 3.662207357859532e-06, "loss": 2.6254, "step": 23500 }, { "epoch": 1372.918918918919, "eval_loss": 2.622628927230835, "eval_runtime": 8.8974, "eval_samples_per_second": 156.675, "eval_steps_per_second": 19.669, "step": 23500 }, { "epoch": 1378.6088193456615, "grad_norm": 0.15082061290740967, "learning_rate": 3.6521739130434787e-06, "loss": 2.6258, "step": 23600 }, { "epoch": 1378.6088193456615, "eval_loss": 2.62248158454895, "eval_runtime": 8.5269, "eval_samples_per_second": 163.482, "eval_steps_per_second": 20.523, "step": 23600 }, { "epoch": 1384.298719772404, "grad_norm": 0.1196790486574173, "learning_rate": 3.642140468227425e-06, "loss": 2.6254, "step": 23700 }, { "epoch": 1384.298719772404, "eval_loss": 2.618326187133789, "eval_runtime": 8.5494, "eval_samples_per_second": 163.052, "eval_steps_per_second": 20.469, "step": 23700 }, { "epoch": 1389.9886201991465, "grad_norm": 0.168843612074852, "learning_rate": 3.6321070234113714e-06, "loss": 2.6249, "step": 23800 }, { "epoch": 1389.9886201991465, "eval_loss": 2.621375799179077, "eval_runtime": 8.859, "eval_samples_per_second": 157.355, "eval_steps_per_second": 19.754, "step": 23800 }, { "epoch": 1395.678520625889, "grad_norm": 0.1318158209323883, "learning_rate": 3.622073578595318e-06, "loss": 2.6248, "step": 23900 }, { "epoch": 1395.678520625889, "eval_loss": 2.6230545043945312, "eval_runtime": 8.5184, "eval_samples_per_second": 163.646, "eval_steps_per_second": 20.544, "step": 23900 }, { "epoch": 1401.3684210526317, "grad_norm": 0.14110194146633148, "learning_rate": 3.6120401337792645e-06, "loss": 2.6252, "step": 24000 }, { "epoch": 1401.3684210526317, "eval_loss": 2.6207733154296875, "eval_runtime": 8.5346, "eval_samples_per_second": 163.335, "eval_steps_per_second": 20.505, "step": 24000 }, { "epoch": 1407.0583214793742, "grad_norm": 0.14449109137058258, "learning_rate": 3.6020066889632112e-06, "loss": 2.6245, "step": 24100 }, { "epoch": 1407.0583214793742, "eval_loss": 2.6209616661071777, "eval_runtime": 8.5416, "eval_samples_per_second": 163.201, "eval_steps_per_second": 20.488, "step": 24100 }, { "epoch": 1412.7482219061167, "grad_norm": 0.12893743813037872, "learning_rate": 3.5919732441471576e-06, "loss": 2.6247, "step": 24200 }, { "epoch": 1412.7482219061167, "eval_loss": 2.6214792728424072, "eval_runtime": 8.8839, "eval_samples_per_second": 156.913, "eval_steps_per_second": 19.699, "step": 24200 }, { "epoch": 1418.4381223328592, "grad_norm": 0.15788990259170532, "learning_rate": 3.581939799331104e-06, "loss": 2.6249, "step": 24300 }, { "epoch": 1418.4381223328592, "eval_loss": 2.6239373683929443, "eval_runtime": 8.5329, "eval_samples_per_second": 163.368, "eval_steps_per_second": 20.509, "step": 24300 }, { "epoch": 1424.1280227596017, "grad_norm": 0.14352256059646606, "learning_rate": 3.5719063545150507e-06, "loss": 2.6244, "step": 24400 }, { "epoch": 1424.1280227596017, "eval_loss": 2.621476888656616, "eval_runtime": 8.8748, "eval_samples_per_second": 157.073, "eval_steps_per_second": 19.719, "step": 24400 }, { "epoch": 1429.8179231863442, "grad_norm": 0.1311691254377365, "learning_rate": 3.561872909698997e-06, "loss": 2.6243, "step": 24500 }, { "epoch": 1429.8179231863442, "eval_loss": 2.6242871284484863, "eval_runtime": 8.5283, "eval_samples_per_second": 163.456, "eval_steps_per_second": 20.52, "step": 24500 }, { "epoch": 1435.5078236130869, "grad_norm": 0.15464642643928528, "learning_rate": 3.5518394648829434e-06, "loss": 2.624, "step": 24600 }, { "epoch": 1435.5078236130869, "eval_loss": 2.6201913356781006, "eval_runtime": 8.8606, "eval_samples_per_second": 157.326, "eval_steps_per_second": 19.75, "step": 24600 }, { "epoch": 1441.1977240398294, "grad_norm": 0.19396920502185822, "learning_rate": 3.54180602006689e-06, "loss": 2.625, "step": 24700 }, { "epoch": 1441.1977240398294, "eval_loss": 2.619835138320923, "eval_runtime": 8.5378, "eval_samples_per_second": 163.275, "eval_steps_per_second": 20.497, "step": 24700 }, { "epoch": 1446.8876244665719, "grad_norm": 0.16594748198986053, "learning_rate": 3.5317725752508365e-06, "loss": 2.6238, "step": 24800 }, { "epoch": 1446.8876244665719, "eval_loss": 2.620967388153076, "eval_runtime": 8.5405, "eval_samples_per_second": 163.221, "eval_steps_per_second": 20.49, "step": 24800 }, { "epoch": 1452.5775248933144, "grad_norm": 0.12998247146606445, "learning_rate": 3.521739130434783e-06, "loss": 2.6237, "step": 24900 }, { "epoch": 1452.5775248933144, "eval_loss": 2.622404098510742, "eval_runtime": 8.5397, "eval_samples_per_second": 163.237, "eval_steps_per_second": 20.492, "step": 24900 }, { "epoch": 1458.2674253200569, "grad_norm": 0.15071412920951843, "learning_rate": 3.5117056856187296e-06, "loss": 2.6245, "step": 25000 }, { "epoch": 1458.2674253200569, "eval_loss": 2.6213462352752686, "eval_runtime": 8.8844, "eval_samples_per_second": 156.905, "eval_steps_per_second": 19.698, "step": 25000 }, { "epoch": 1463.9573257467994, "grad_norm": 0.1532295048236847, "learning_rate": 3.501672240802676e-06, "loss": 2.6245, "step": 25100 }, { "epoch": 1463.9573257467994, "eval_loss": 2.6207022666931152, "eval_runtime": 8.5372, "eval_samples_per_second": 163.285, "eval_steps_per_second": 20.498, "step": 25100 }, { "epoch": 1469.6472261735419, "grad_norm": 0.13699106872081757, "learning_rate": 3.491638795986622e-06, "loss": 2.6239, "step": 25200 }, { "epoch": 1469.6472261735419, "eval_loss": 2.6193158626556396, "eval_runtime": 8.8816, "eval_samples_per_second": 156.954, "eval_steps_per_second": 19.704, "step": 25200 }, { "epoch": 1475.3371266002846, "grad_norm": 0.14744792878627777, "learning_rate": 3.481605351170568e-06, "loss": 2.624, "step": 25300 }, { "epoch": 1475.3371266002846, "eval_loss": 2.6224136352539062, "eval_runtime": 8.5419, "eval_samples_per_second": 163.196, "eval_steps_per_second": 20.487, "step": 25300 }, { "epoch": 1481.027027027027, "grad_norm": 0.1340937465429306, "learning_rate": 3.471571906354515e-06, "loss": 2.624, "step": 25400 }, { "epoch": 1481.027027027027, "eval_loss": 2.620910882949829, "eval_runtime": 8.551, "eval_samples_per_second": 163.022, "eval_steps_per_second": 20.465, "step": 25400 }, { "epoch": 1486.7169274537696, "grad_norm": 0.16349473595619202, "learning_rate": 3.4615384615384613e-06, "loss": 2.6236, "step": 25500 }, { "epoch": 1486.7169274537696, "eval_loss": 2.6188619136810303, "eval_runtime": 8.8975, "eval_samples_per_second": 156.673, "eval_steps_per_second": 19.668, "step": 25500 }, { "epoch": 1492.406827880512, "grad_norm": 0.16049961745738983, "learning_rate": 3.4515050167224076e-06, "loss": 2.6236, "step": 25600 }, { "epoch": 1492.406827880512, "eval_loss": 2.6201858520507812, "eval_runtime": 8.5265, "eval_samples_per_second": 163.49, "eval_steps_per_second": 20.524, "step": 25600 }, { "epoch": 1498.0967283072546, "grad_norm": 0.1545686572790146, "learning_rate": 3.4414715719063544e-06, "loss": 2.6237, "step": 25700 }, { "epoch": 1498.0967283072546, "eval_loss": 2.6238884925842285, "eval_runtime": 8.5206, "eval_samples_per_second": 163.603, "eval_steps_per_second": 20.538, "step": 25700 }, { "epoch": 1503.786628733997, "grad_norm": 0.11945275217294693, "learning_rate": 3.4314381270903007e-06, "loss": 2.6237, "step": 25800 }, { "epoch": 1503.786628733997, "eval_loss": 2.617251396179199, "eval_runtime": 8.5374, "eval_samples_per_second": 163.282, "eval_steps_per_second": 20.498, "step": 25800 }, { "epoch": 1509.4765291607396, "grad_norm": 0.16417285799980164, "learning_rate": 3.4214046822742475e-06, "loss": 2.6234, "step": 25900 }, { "epoch": 1509.4765291607396, "eval_loss": 2.61983060836792, "eval_runtime": 8.8777, "eval_samples_per_second": 157.022, "eval_steps_per_second": 19.712, "step": 25900 }, { "epoch": 1515.1664295874823, "grad_norm": 0.1562732458114624, "learning_rate": 3.411371237458194e-06, "loss": 2.6234, "step": 26000 }, { "epoch": 1515.1664295874823, "eval_loss": 2.6201515197753906, "eval_runtime": 8.5309, "eval_samples_per_second": 163.407, "eval_steps_per_second": 20.514, "step": 26000 }, { "epoch": 1520.8563300142248, "grad_norm": 0.1490921974182129, "learning_rate": 3.40133779264214e-06, "loss": 2.6233, "step": 26100 }, { "epoch": 1520.8563300142248, "eval_loss": 2.6187028884887695, "eval_runtime": 8.5278, "eval_samples_per_second": 163.466, "eval_steps_per_second": 20.521, "step": 26100 }, { "epoch": 1526.5462304409673, "grad_norm": 0.13493777811527252, "learning_rate": 3.391304347826087e-06, "loss": 2.6232, "step": 26200 }, { "epoch": 1526.5462304409673, "eval_loss": 2.6198177337646484, "eval_runtime": 8.5368, "eval_samples_per_second": 163.293, "eval_steps_per_second": 20.5, "step": 26200 }, { "epoch": 1532.2361308677098, "grad_norm": 0.16828219592571259, "learning_rate": 3.3812709030100333e-06, "loss": 2.6235, "step": 26300 }, { "epoch": 1532.2361308677098, "eval_loss": 2.620209217071533, "eval_runtime": 8.8832, "eval_samples_per_second": 156.925, "eval_steps_per_second": 19.7, "step": 26300 }, { "epoch": 1537.9260312944523, "grad_norm": 0.13606858253479004, "learning_rate": 3.3712374581939796e-06, "loss": 2.6236, "step": 26400 }, { "epoch": 1537.9260312944523, "eval_loss": 2.620745897293091, "eval_runtime": 8.5351, "eval_samples_per_second": 163.325, "eval_steps_per_second": 20.504, "step": 26400 }, { "epoch": 1543.6159317211948, "grad_norm": 0.15643203258514404, "learning_rate": 3.3612040133779264e-06, "loss": 2.6233, "step": 26500 }, { "epoch": 1543.6159317211948, "eval_loss": 2.6203880310058594, "eval_runtime": 8.5194, "eval_samples_per_second": 163.627, "eval_steps_per_second": 20.541, "step": 26500 }, { "epoch": 1549.3058321479375, "grad_norm": 0.15990637242794037, "learning_rate": 3.3511705685618727e-06, "loss": 2.6235, "step": 26600 }, { "epoch": 1549.3058321479375, "eval_loss": 2.618859052658081, "eval_runtime": 8.8671, "eval_samples_per_second": 157.21, "eval_steps_per_second": 19.736, "step": 26600 }, { "epoch": 1554.99573257468, "grad_norm": 0.1532638967037201, "learning_rate": 3.3411371237458195e-06, "loss": 2.6227, "step": 26700 }, { "epoch": 1554.99573257468, "eval_loss": 2.621203660964966, "eval_runtime": 8.5393, "eval_samples_per_second": 163.246, "eval_steps_per_second": 20.494, "step": 26700 }, { "epoch": 1560.6856330014225, "grad_norm": 0.14362338185310364, "learning_rate": 3.331103678929766e-06, "loss": 2.6233, "step": 26800 }, { "epoch": 1560.6856330014225, "eval_loss": 2.6196324825286865, "eval_runtime": 8.5352, "eval_samples_per_second": 163.323, "eval_steps_per_second": 20.503, "step": 26800 }, { "epoch": 1566.375533428165, "grad_norm": 0.15064574778079987, "learning_rate": 3.321070234113712e-06, "loss": 2.6231, "step": 26900 }, { "epoch": 1566.375533428165, "eval_loss": 2.621459722518921, "eval_runtime": 8.8713, "eval_samples_per_second": 157.136, "eval_steps_per_second": 19.726, "step": 26900 }, { "epoch": 1572.0654338549075, "grad_norm": 0.14329403638839722, "learning_rate": 3.311036789297659e-06, "loss": 2.623, "step": 27000 }, { "epoch": 1572.0654338549075, "eval_loss": 2.619920253753662, "eval_runtime": 8.5347, "eval_samples_per_second": 163.333, "eval_steps_per_second": 20.505, "step": 27000 }, { "epoch": 1577.75533428165, "grad_norm": 0.14685587584972382, "learning_rate": 3.3010033444816052e-06, "loss": 2.6233, "step": 27100 }, { "epoch": 1577.75533428165, "eval_loss": 2.620281934738159, "eval_runtime": 8.5331, "eval_samples_per_second": 163.363, "eval_steps_per_second": 20.508, "step": 27100 }, { "epoch": 1583.4452347083925, "grad_norm": 0.14042943716049194, "learning_rate": 3.2909698996655516e-06, "loss": 2.6227, "step": 27200 }, { "epoch": 1583.4452347083925, "eval_loss": 2.6232104301452637, "eval_runtime": 8.5205, "eval_samples_per_second": 163.606, "eval_steps_per_second": 20.539, "step": 27200 }, { "epoch": 1589.1351351351352, "grad_norm": 0.15437842905521393, "learning_rate": 3.2809364548494983e-06, "loss": 2.6228, "step": 27300 }, { "epoch": 1589.1351351351352, "eval_loss": 2.6217334270477295, "eval_runtime": 8.887, "eval_samples_per_second": 156.858, "eval_steps_per_second": 19.692, "step": 27300 }, { "epoch": 1594.8250355618777, "grad_norm": 0.13956615328788757, "learning_rate": 3.2709030100334447e-06, "loss": 2.6227, "step": 27400 }, { "epoch": 1594.8250355618777, "eval_loss": 2.619623899459839, "eval_runtime": 8.5391, "eval_samples_per_second": 163.248, "eval_steps_per_second": 20.494, "step": 27400 }, { "epoch": 1600.5149359886202, "grad_norm": 0.1520717293024063, "learning_rate": 3.260869565217391e-06, "loss": 2.6224, "step": 27500 }, { "epoch": 1600.5149359886202, "eval_loss": 2.61783766746521, "eval_runtime": 8.8816, "eval_samples_per_second": 156.954, "eval_steps_per_second": 19.704, "step": 27500 }, { "epoch": 1606.2048364153627, "grad_norm": 0.12460660189390182, "learning_rate": 3.2508361204013378e-06, "loss": 2.6228, "step": 27600 }, { "epoch": 1606.2048364153627, "eval_loss": 2.6196727752685547, "eval_runtime": 8.5446, "eval_samples_per_second": 163.145, "eval_steps_per_second": 20.481, "step": 27600 }, { "epoch": 1611.8947368421052, "grad_norm": 0.14338594675064087, "learning_rate": 3.240802675585284e-06, "loss": 2.6219, "step": 27700 }, { "epoch": 1611.8947368421052, "eval_loss": 2.6190030574798584, "eval_runtime": 8.5528, "eval_samples_per_second": 162.987, "eval_steps_per_second": 20.461, "step": 27700 }, { "epoch": 1617.5846372688477, "grad_norm": 0.1541885882616043, "learning_rate": 3.230769230769231e-06, "loss": 2.6223, "step": 27800 }, { "epoch": 1617.5846372688477, "eval_loss": 2.6206130981445312, "eval_runtime": 8.5355, "eval_samples_per_second": 163.319, "eval_steps_per_second": 20.503, "step": 27800 }, { "epoch": 1623.2745376955904, "grad_norm": 0.14063502848148346, "learning_rate": 3.2207357859531772e-06, "loss": 2.6226, "step": 27900 }, { "epoch": 1623.2745376955904, "eval_loss": 2.6189980506896973, "eval_runtime": 8.8925, "eval_samples_per_second": 156.76, "eval_steps_per_second": 19.679, "step": 27900 }, { "epoch": 1628.964438122333, "grad_norm": 0.1286516785621643, "learning_rate": 3.2107023411371236e-06, "loss": 2.6221, "step": 28000 }, { "epoch": 1628.964438122333, "eval_loss": 2.620689630508423, "eval_runtime": 8.542, "eval_samples_per_second": 163.193, "eval_steps_per_second": 20.487, "step": 28000 }, { "epoch": 1634.6543385490754, "grad_norm": 0.1280793398618698, "learning_rate": 3.2006688963210703e-06, "loss": 2.6224, "step": 28100 }, { "epoch": 1634.6543385490754, "eval_loss": 2.6217143535614014, "eval_runtime": 8.8756, "eval_samples_per_second": 157.06, "eval_steps_per_second": 19.717, "step": 28100 }, { "epoch": 1640.344238975818, "grad_norm": 0.15803121030330658, "learning_rate": 3.1906354515050167e-06, "loss": 2.6219, "step": 28200 }, { "epoch": 1640.344238975818, "eval_loss": 2.6206395626068115, "eval_runtime": 8.8837, "eval_samples_per_second": 156.917, "eval_steps_per_second": 19.699, "step": 28200 }, { "epoch": 1646.0341394025604, "grad_norm": 0.1751488745212555, "learning_rate": 3.180602006688963e-06, "loss": 2.6224, "step": 28300 }, { "epoch": 1646.0341394025604, "eval_loss": 2.6214957237243652, "eval_runtime": 8.5404, "eval_samples_per_second": 163.224, "eval_steps_per_second": 20.491, "step": 28300 }, { "epoch": 1651.724039829303, "grad_norm": 0.15003472566604614, "learning_rate": 3.1705685618729098e-06, "loss": 2.6223, "step": 28400 }, { "epoch": 1651.724039829303, "eval_loss": 2.619629144668579, "eval_runtime": 8.5381, "eval_samples_per_second": 163.268, "eval_steps_per_second": 20.496, "step": 28400 }, { "epoch": 1657.4139402560454, "grad_norm": 0.13195043802261353, "learning_rate": 3.160535117056856e-06, "loss": 2.6213, "step": 28500 }, { "epoch": 1657.4139402560454, "eval_loss": 2.623068332672119, "eval_runtime": 8.5234, "eval_samples_per_second": 163.55, "eval_steps_per_second": 20.532, "step": 28500 }, { "epoch": 1663.1038406827881, "grad_norm": 0.12435358017683029, "learning_rate": 3.1505016722408024e-06, "loss": 2.6218, "step": 28600 }, { "epoch": 1663.1038406827881, "eval_loss": 2.6203911304473877, "eval_runtime": 8.8568, "eval_samples_per_second": 157.394, "eval_steps_per_second": 19.759, "step": 28600 }, { "epoch": 1668.7937411095306, "grad_norm": 0.12473925203084946, "learning_rate": 3.140468227424749e-06, "loss": 2.6219, "step": 28700 }, { "epoch": 1668.7937411095306, "eval_loss": 2.620685338973999, "eval_runtime": 8.5395, "eval_samples_per_second": 163.241, "eval_steps_per_second": 20.493, "step": 28700 }, { "epoch": 1674.4836415362731, "grad_norm": 0.14964550733566284, "learning_rate": 3.1304347826086955e-06, "loss": 2.6216, "step": 28800 }, { "epoch": 1674.4836415362731, "eval_loss": 2.619400978088379, "eval_runtime": 8.5365, "eval_samples_per_second": 163.298, "eval_steps_per_second": 20.5, "step": 28800 }, { "epoch": 1680.1735419630156, "grad_norm": 0.12900976836681366, "learning_rate": 3.1204013377926423e-06, "loss": 2.6218, "step": 28900 }, { "epoch": 1680.1735419630156, "eval_loss": 2.621912717819214, "eval_runtime": 8.5373, "eval_samples_per_second": 163.284, "eval_steps_per_second": 20.498, "step": 28900 }, { "epoch": 1685.8634423897581, "grad_norm": 0.1679168939590454, "learning_rate": 3.1103678929765886e-06, "loss": 2.622, "step": 29000 }, { "epoch": 1685.8634423897581, "eval_loss": 2.6172022819519043, "eval_runtime": 8.8725, "eval_samples_per_second": 157.114, "eval_steps_per_second": 19.724, "step": 29000 }, { "epoch": 1691.5533428165006, "grad_norm": 0.14349579811096191, "learning_rate": 3.100334448160535e-06, "loss": 2.6214, "step": 29100 }, { "epoch": 1691.5533428165006, "eval_loss": 2.6180310249328613, "eval_runtime": 8.5353, "eval_samples_per_second": 163.321, "eval_steps_per_second": 20.503, "step": 29100 }, { "epoch": 1697.2432432432433, "grad_norm": 0.11367882043123245, "learning_rate": 3.0903010033444818e-06, "loss": 2.6216, "step": 29200 }, { "epoch": 1697.2432432432433, "eval_loss": 2.6190216541290283, "eval_runtime": 8.874, "eval_samples_per_second": 157.088, "eval_steps_per_second": 19.72, "step": 29200 }, { "epoch": 1702.9331436699858, "grad_norm": 0.1360355168581009, "learning_rate": 3.080267558528428e-06, "loss": 2.6209, "step": 29300 }, { "epoch": 1702.9331436699858, "eval_loss": 2.618488311767578, "eval_runtime": 8.544, "eval_samples_per_second": 163.156, "eval_steps_per_second": 20.482, "step": 29300 }, { "epoch": 1708.6230440967283, "grad_norm": 0.15486325323581696, "learning_rate": 3.0702341137123744e-06, "loss": 2.6213, "step": 29400 }, { "epoch": 1708.6230440967283, "eval_loss": 2.6200103759765625, "eval_runtime": 8.88, "eval_samples_per_second": 156.982, "eval_steps_per_second": 19.707, "step": 29400 }, { "epoch": 1714.3129445234708, "grad_norm": 0.16179534792900085, "learning_rate": 3.060200668896321e-06, "loss": 2.6216, "step": 29500 }, { "epoch": 1714.3129445234708, "eval_loss": 2.619476795196533, "eval_runtime": 8.5238, "eval_samples_per_second": 163.542, "eval_steps_per_second": 20.531, "step": 29500 }, { "epoch": 1720.0028449502133, "grad_norm": 0.12888365983963013, "learning_rate": 3.0501672240802675e-06, "loss": 2.621, "step": 29600 }, { "epoch": 1720.0028449502133, "eval_loss": 2.6209278106689453, "eval_runtime": 8.8646, "eval_samples_per_second": 157.255, "eval_steps_per_second": 19.742, "step": 29600 }, { "epoch": 1725.6927453769558, "grad_norm": 0.1323317587375641, "learning_rate": 3.0401337792642143e-06, "loss": 2.6217, "step": 29700 }, { "epoch": 1725.6927453769558, "eval_loss": 2.6187312602996826, "eval_runtime": 8.5424, "eval_samples_per_second": 163.186, "eval_steps_per_second": 20.486, "step": 29700 }, { "epoch": 1731.3826458036983, "grad_norm": 0.13297787308692932, "learning_rate": 3.0301003344481606e-06, "loss": 2.6212, "step": 29800 }, { "epoch": 1731.3826458036983, "eval_loss": 2.6216437816619873, "eval_runtime": 8.5329, "eval_samples_per_second": 163.368, "eval_steps_per_second": 20.509, "step": 29800 }, { "epoch": 1737.072546230441, "grad_norm": 0.11761217564344406, "learning_rate": 3.020066889632107e-06, "loss": 2.6211, "step": 29900 }, { "epoch": 1737.072546230441, "eval_loss": 2.621067523956299, "eval_runtime": 8.8814, "eval_samples_per_second": 156.957, "eval_steps_per_second": 19.704, "step": 29900 }, { "epoch": 1742.7624466571835, "grad_norm": 0.13174152374267578, "learning_rate": 3.0100334448160537e-06, "loss": 2.621, "step": 30000 }, { "epoch": 1742.7624466571835, "eval_loss": 2.619697093963623, "eval_runtime": 8.5326, "eval_samples_per_second": 163.373, "eval_steps_per_second": 20.509, "step": 30000 }, { "epoch": 1748.452347083926, "grad_norm": 0.13943453133106232, "learning_rate": 3e-06, "loss": 2.6208, "step": 30100 }, { "epoch": 1748.452347083926, "eval_loss": 2.6210110187530518, "eval_runtime": 8.5178, "eval_samples_per_second": 163.658, "eval_steps_per_second": 20.545, "step": 30100 }, { "epoch": 1754.1422475106685, "grad_norm": 0.13520394265651703, "learning_rate": 2.9899665551839464e-06, "loss": 2.6213, "step": 30200 }, { "epoch": 1754.1422475106685, "eval_loss": 2.616352081298828, "eval_runtime": 8.8621, "eval_samples_per_second": 157.3, "eval_steps_per_second": 19.747, "step": 30200 }, { "epoch": 1759.832147937411, "grad_norm": 0.1447754055261612, "learning_rate": 2.979933110367893e-06, "loss": 2.6212, "step": 30300 }, { "epoch": 1759.832147937411, "eval_loss": 2.6177382469177246, "eval_runtime": 8.5425, "eval_samples_per_second": 163.184, "eval_steps_per_second": 20.486, "step": 30300 }, { "epoch": 1765.5220483641535, "grad_norm": 0.1305381804704666, "learning_rate": 2.9698996655518395e-06, "loss": 2.6207, "step": 30400 }, { "epoch": 1765.5220483641535, "eval_loss": 2.6181886196136475, "eval_runtime": 8.5323, "eval_samples_per_second": 163.379, "eval_steps_per_second": 20.51, "step": 30400 }, { "epoch": 1771.2119487908963, "grad_norm": 0.13752570748329163, "learning_rate": 2.959866220735786e-06, "loss": 2.6211, "step": 30500 }, { "epoch": 1771.2119487908963, "eval_loss": 2.6209466457366943, "eval_runtime": 8.8636, "eval_samples_per_second": 157.273, "eval_steps_per_second": 19.744, "step": 30500 }, { "epoch": 1776.9018492176388, "grad_norm": 0.15597382187843323, "learning_rate": 2.9498327759197326e-06, "loss": 2.6209, "step": 30600 }, { "epoch": 1776.9018492176388, "eval_loss": 2.6217684745788574, "eval_runtime": 8.535, "eval_samples_per_second": 163.328, "eval_steps_per_second": 20.504, "step": 30600 }, { "epoch": 1782.5917496443813, "grad_norm": 0.13857756555080414, "learning_rate": 2.939799331103679e-06, "loss": 2.6203, "step": 30700 }, { "epoch": 1782.5917496443813, "eval_loss": 2.6178483963012695, "eval_runtime": 8.5228, "eval_samples_per_second": 163.56, "eval_steps_per_second": 20.533, "step": 30700 }, { "epoch": 1788.2816500711237, "grad_norm": 0.12845158576965332, "learning_rate": 2.9297658862876257e-06, "loss": 2.6207, "step": 30800 }, { "epoch": 1788.2816500711237, "eval_loss": 2.615445137023926, "eval_runtime": 7.9709, "eval_samples_per_second": 174.886, "eval_steps_per_second": 21.955, "step": 30800 }, { "epoch": 1793.9715504978662, "grad_norm": 0.12672263383865356, "learning_rate": 2.919732441471572e-06, "loss": 2.6207, "step": 30900 }, { "epoch": 1793.9715504978662, "eval_loss": 2.621990919113159, "eval_runtime": 8.6765, "eval_samples_per_second": 160.663, "eval_steps_per_second": 20.169, "step": 30900 }, { "epoch": 1799.6614509246087, "grad_norm": 0.15212363004684448, "learning_rate": 2.9096989966555184e-06, "loss": 2.6205, "step": 31000 }, { "epoch": 1799.6614509246087, "eval_loss": 2.617125988006592, "eval_runtime": 8.5348, "eval_samples_per_second": 163.33, "eval_steps_per_second": 20.504, "step": 31000 }, { "epoch": 1805.3513513513512, "grad_norm": 0.14816269278526306, "learning_rate": 2.899665551839465e-06, "loss": 2.6204, "step": 31100 }, { "epoch": 1805.3513513513512, "eval_loss": 2.619084358215332, "eval_runtime": 8.5316, "eval_samples_per_second": 163.393, "eval_steps_per_second": 20.512, "step": 31100 }, { "epoch": 1811.041251778094, "grad_norm": 0.12133249640464783, "learning_rate": 2.8896321070234115e-06, "loss": 2.6202, "step": 31200 }, { "epoch": 1811.041251778094, "eval_loss": 2.6200191974639893, "eval_runtime": 8.8693, "eval_samples_per_second": 157.171, "eval_steps_per_second": 19.731, "step": 31200 }, { "epoch": 1816.7311522048365, "grad_norm": 0.122464619576931, "learning_rate": 2.879598662207358e-06, "loss": 2.6206, "step": 31300 }, { "epoch": 1816.7311522048365, "eval_loss": 2.6166939735412598, "eval_runtime": 8.5416, "eval_samples_per_second": 163.202, "eval_steps_per_second": 20.488, "step": 31300 }, { "epoch": 1822.421052631579, "grad_norm": 0.12631458044052124, "learning_rate": 2.8695652173913046e-06, "loss": 2.6207, "step": 31400 }, { "epoch": 1822.421052631579, "eval_loss": 2.619025468826294, "eval_runtime": 8.8769, "eval_samples_per_second": 157.038, "eval_steps_per_second": 19.714, "step": 31400 }, { "epoch": 1828.1109530583215, "grad_norm": 0.13460245728492737, "learning_rate": 2.859531772575251e-06, "loss": 2.6205, "step": 31500 }, { "epoch": 1828.1109530583215, "eval_loss": 2.6193785667419434, "eval_runtime": 8.5355, "eval_samples_per_second": 163.318, "eval_steps_per_second": 20.503, "step": 31500 }, { "epoch": 1833.800853485064, "grad_norm": 0.1309368908405304, "learning_rate": 2.8494983277591977e-06, "loss": 2.6202, "step": 31600 }, { "epoch": 1833.800853485064, "eval_loss": 2.6178736686706543, "eval_runtime": 8.5249, "eval_samples_per_second": 163.52, "eval_steps_per_second": 20.528, "step": 31600 }, { "epoch": 1839.4907539118065, "grad_norm": 0.13755999505519867, "learning_rate": 2.839464882943144e-06, "loss": 2.6202, "step": 31700 }, { "epoch": 1839.4907539118065, "eval_loss": 2.6197922229766846, "eval_runtime": 8.8802, "eval_samples_per_second": 156.978, "eval_steps_per_second": 19.707, "step": 31700 }, { "epoch": 1845.1806543385492, "grad_norm": 0.15736857056617737, "learning_rate": 2.8294314381270904e-06, "loss": 2.6205, "step": 31800 }, { "epoch": 1845.1806543385492, "eval_loss": 2.617283344268799, "eval_runtime": 8.5421, "eval_samples_per_second": 163.192, "eval_steps_per_second": 20.487, "step": 31800 }, { "epoch": 1850.8705547652917, "grad_norm": 0.13804545998573303, "learning_rate": 2.819397993311037e-06, "loss": 2.6202, "step": 31900 }, { "epoch": 1850.8705547652917, "eval_loss": 2.619748592376709, "eval_runtime": 8.5172, "eval_samples_per_second": 163.669, "eval_steps_per_second": 20.547, "step": 31900 }, { "epoch": 1856.5604551920342, "grad_norm": 0.14598102867603302, "learning_rate": 2.8093645484949835e-06, "loss": 2.6198, "step": 32000 }, { "epoch": 1856.5604551920342, "eval_loss": 2.6171820163726807, "eval_runtime": 8.5287, "eval_samples_per_second": 163.449, "eval_steps_per_second": 20.519, "step": 32000 }, { "epoch": 1862.2503556187767, "grad_norm": 0.1180824562907219, "learning_rate": 2.79933110367893e-06, "loss": 2.62, "step": 32100 }, { "epoch": 1862.2503556187767, "eval_loss": 2.6209444999694824, "eval_runtime": 8.8778, "eval_samples_per_second": 157.021, "eval_steps_per_second": 19.712, "step": 32100 }, { "epoch": 1867.9402560455192, "grad_norm": 0.13339059054851532, "learning_rate": 2.7892976588628766e-06, "loss": 2.6199, "step": 32200 }, { "epoch": 1867.9402560455192, "eval_loss": 2.6180646419525146, "eval_runtime": 8.5333, "eval_samples_per_second": 163.36, "eval_steps_per_second": 20.508, "step": 32200 }, { "epoch": 1873.6301564722617, "grad_norm": 0.13802410662174225, "learning_rate": 2.779264214046823e-06, "loss": 2.6199, "step": 32300 }, { "epoch": 1873.6301564722617, "eval_loss": 2.6181840896606445, "eval_runtime": 8.5319, "eval_samples_per_second": 163.386, "eval_steps_per_second": 20.511, "step": 32300 }, { "epoch": 1879.3200568990042, "grad_norm": 0.1414729207754135, "learning_rate": 2.7692307692307693e-06, "loss": 2.6204, "step": 32400 }, { "epoch": 1879.3200568990042, "eval_loss": 2.6203253269195557, "eval_runtime": 8.8692, "eval_samples_per_second": 157.174, "eval_steps_per_second": 19.731, "step": 32400 }, { "epoch": 1885.0099573257469, "grad_norm": 0.14050759375095367, "learning_rate": 2.759197324414716e-06, "loss": 2.6198, "step": 32500 }, { "epoch": 1885.0099573257469, "eval_loss": 2.6230709552764893, "eval_runtime": 8.5278, "eval_samples_per_second": 163.466, "eval_steps_per_second": 20.521, "step": 32500 }, { "epoch": 1890.6998577524894, "grad_norm": 0.12877824902534485, "learning_rate": 2.749163879598662e-06, "loss": 2.6196, "step": 32600 }, { "epoch": 1890.6998577524894, "eval_loss": 2.617913246154785, "eval_runtime": 8.8665, "eval_samples_per_second": 157.221, "eval_steps_per_second": 19.737, "step": 32600 }, { "epoch": 1896.3897581792319, "grad_norm": 0.14117339253425598, "learning_rate": 2.7391304347826087e-06, "loss": 2.6201, "step": 32700 }, { "epoch": 1896.3897581792319, "eval_loss": 2.6172854900360107, "eval_runtime": 8.5203, "eval_samples_per_second": 163.609, "eval_steps_per_second": 20.539, "step": 32700 }, { "epoch": 1902.0796586059744, "grad_norm": 0.13245785236358643, "learning_rate": 2.729096989966555e-06, "loss": 2.6195, "step": 32800 }, { "epoch": 1902.0796586059744, "eval_loss": 2.618534564971924, "eval_runtime": 8.5363, "eval_samples_per_second": 163.303, "eval_steps_per_second": 20.501, "step": 32800 }, { "epoch": 1907.7695590327169, "grad_norm": 0.1612655520439148, "learning_rate": 2.7190635451505014e-06, "loss": 2.6196, "step": 32900 }, { "epoch": 1907.7695590327169, "eval_loss": 2.619488477706909, "eval_runtime": 8.5526, "eval_samples_per_second": 162.992, "eval_steps_per_second": 20.462, "step": 32900 }, { "epoch": 1913.4594594594594, "grad_norm": 0.12668026983737946, "learning_rate": 2.709030100334448e-06, "loss": 2.6193, "step": 33000 }, { "epoch": 1913.4594594594594, "eval_loss": 2.617690086364746, "eval_runtime": 8.8781, "eval_samples_per_second": 157.015, "eval_steps_per_second": 19.711, "step": 33000 }, { "epoch": 1919.149359886202, "grad_norm": 0.1749388575553894, "learning_rate": 2.6989966555183945e-06, "loss": 2.6198, "step": 33100 }, { "epoch": 1919.149359886202, "eval_loss": 2.6185007095336914, "eval_runtime": 8.5203, "eval_samples_per_second": 163.61, "eval_steps_per_second": 20.539, "step": 33100 }, { "epoch": 1924.8392603129446, "grad_norm": 0.14269417524337769, "learning_rate": 2.6889632107023413e-06, "loss": 2.6195, "step": 33200 }, { "epoch": 1924.8392603129446, "eval_loss": 2.6211161613464355, "eval_runtime": 8.869, "eval_samples_per_second": 157.176, "eval_steps_per_second": 19.732, "step": 33200 }, { "epoch": 1958.6002844950212, "grad_norm": 0.13085490465164185, "learning_rate": 2.6789297658862876e-06, "loss": 2.619, "step": 33300 }, { "epoch": 1958.6002844950212, "eval_loss": 2.6180379390716553, "eval_runtime": 9.0762, "eval_samples_per_second": 153.589, "eval_steps_per_second": 19.281, "step": 33300 }, { "epoch": 1964.290184921764, "grad_norm": 0.14272978901863098, "learning_rate": 2.668896321070234e-06, "loss": 2.6189, "step": 33400 }, { "epoch": 1964.290184921764, "eval_loss": 2.617365598678589, "eval_runtime": 8.6925, "eval_samples_per_second": 160.369, "eval_steps_per_second": 20.132, "step": 33400 }, { "epoch": 1969.9800853485065, "grad_norm": 0.1391880363225937, "learning_rate": 2.6588628762541807e-06, "loss": 2.6194, "step": 33500 }, { "epoch": 1969.9800853485065, "eval_loss": 2.620480537414551, "eval_runtime": 8.6935, "eval_samples_per_second": 160.349, "eval_steps_per_second": 20.13, "step": 33500 }, { "epoch": 1975.669985775249, "grad_norm": 0.13617493212223053, "learning_rate": 2.648829431438127e-06, "loss": 2.619, "step": 33600 }, { "epoch": 1975.669985775249, "eval_loss": 2.6200404167175293, "eval_runtime": 8.5474, "eval_samples_per_second": 163.091, "eval_steps_per_second": 20.474, "step": 33600 }, { "epoch": 1981.3598862019915, "grad_norm": 0.14002011716365814, "learning_rate": 2.6387959866220734e-06, "loss": 2.6193, "step": 33700 }, { "epoch": 1981.3598862019915, "eval_loss": 2.619243621826172, "eval_runtime": 8.852, "eval_samples_per_second": 157.478, "eval_steps_per_second": 19.77, "step": 33700 }, { "epoch": 1987.049786628734, "grad_norm": 0.12899306416511536, "learning_rate": 2.62876254180602e-06, "loss": 2.6187, "step": 33800 }, { "epoch": 1987.049786628734, "eval_loss": 2.6176578998565674, "eval_runtime": 8.5349, "eval_samples_per_second": 163.329, "eval_steps_per_second": 20.504, "step": 33800 }, { "epoch": 1992.7396870554765, "grad_norm": 0.13901114463806152, "learning_rate": 2.6187290969899665e-06, "loss": 2.619, "step": 33900 }, { "epoch": 1992.7396870554765, "eval_loss": 2.6168863773345947, "eval_runtime": 8.5326, "eval_samples_per_second": 163.373, "eval_steps_per_second": 20.509, "step": 33900 }, { "epoch": 1998.429587482219, "grad_norm": 0.15283076465129852, "learning_rate": 2.6086956521739132e-06, "loss": 2.6185, "step": 34000 }, { "epoch": 1998.429587482219, "eval_loss": 2.6182827949523926, "eval_runtime": 8.8501, "eval_samples_per_second": 157.512, "eval_steps_per_second": 19.774, "step": 34000 }, { "epoch": 2004.1194879089617, "grad_norm": 0.1270897537469864, "learning_rate": 2.5986622073578596e-06, "loss": 2.6191, "step": 34100 }, { "epoch": 2004.1194879089617, "eval_loss": 2.616523265838623, "eval_runtime": 8.545, "eval_samples_per_second": 163.136, "eval_steps_per_second": 20.48, "step": 34100 }, { "epoch": 2009.8093883357042, "grad_norm": 0.11230363696813583, "learning_rate": 2.588628762541806e-06, "loss": 2.6187, "step": 34200 }, { "epoch": 2009.8093883357042, "eval_loss": 2.619399309158325, "eval_runtime": 8.8553, "eval_samples_per_second": 157.42, "eval_steps_per_second": 19.762, "step": 34200 }, { "epoch": 2015.4992887624467, "grad_norm": 0.14034995436668396, "learning_rate": 2.5785953177257527e-06, "loss": 2.6187, "step": 34300 }, { "epoch": 2015.4992887624467, "eval_loss": 2.6191158294677734, "eval_runtime": 8.5383, "eval_samples_per_second": 163.263, "eval_steps_per_second": 20.496, "step": 34300 }, { "epoch": 2021.1891891891892, "grad_norm": 0.14701803028583527, "learning_rate": 2.568561872909699e-06, "loss": 2.6183, "step": 34400 }, { "epoch": 2021.1891891891892, "eval_loss": 2.620706796646118, "eval_runtime": 8.5271, "eval_samples_per_second": 163.479, "eval_steps_per_second": 20.523, "step": 34400 }, { "epoch": 2026.8790896159317, "grad_norm": 0.15207096934318542, "learning_rate": 2.5585284280936454e-06, "loss": 2.6189, "step": 34500 }, { "epoch": 2026.8790896159317, "eval_loss": 2.619361400604248, "eval_runtime": 8.5356, "eval_samples_per_second": 163.316, "eval_steps_per_second": 20.502, "step": 34500 }, { "epoch": 2032.5689900426742, "grad_norm": 0.1416121870279312, "learning_rate": 2.548494983277592e-06, "loss": 2.6182, "step": 34600 }, { "epoch": 2032.5689900426742, "eval_loss": 2.6184678077697754, "eval_runtime": 8.8549, "eval_samples_per_second": 157.428, "eval_steps_per_second": 19.763, "step": 34600 }, { "epoch": 2038.2588904694169, "grad_norm": 0.1294640153646469, "learning_rate": 2.5384615384615385e-06, "loss": 2.6185, "step": 34700 }, { "epoch": 2038.2588904694169, "eval_loss": 2.618467330932617, "eval_runtime": 8.5393, "eval_samples_per_second": 163.246, "eval_steps_per_second": 20.494, "step": 34700 }, { "epoch": 2043.9487908961594, "grad_norm": 0.1140933409333229, "learning_rate": 2.528428093645485e-06, "loss": 2.618, "step": 34800 }, { "epoch": 2043.9487908961594, "eval_loss": 2.6173062324523926, "eval_runtime": 8.8708, "eval_samples_per_second": 157.146, "eval_steps_per_second": 19.728, "step": 34800 }, { "epoch": 2049.6386913229016, "grad_norm": 0.15031367540359497, "learning_rate": 2.5183946488294316e-06, "loss": 2.6185, "step": 34900 }, { "epoch": 2049.6386913229016, "eval_loss": 2.6177406311035156, "eval_runtime": 8.5417, "eval_samples_per_second": 163.2, "eval_steps_per_second": 20.488, "step": 34900 }, { "epoch": 2055.3285917496446, "grad_norm": 0.11632242053747177, "learning_rate": 2.508361204013378e-06, "loss": 2.6181, "step": 35000 }, { "epoch": 2055.3285917496446, "eval_loss": 2.6203091144561768, "eval_runtime": 8.8703, "eval_samples_per_second": 157.154, "eval_steps_per_second": 19.729, "step": 35000 }, { "epoch": 2061.018492176387, "grad_norm": 0.14546014368534088, "learning_rate": 2.4983277591973247e-06, "loss": 2.6182, "step": 35100 }, { "epoch": 2061.018492176387, "eval_loss": 2.6166837215423584, "eval_runtime": 8.5363, "eval_samples_per_second": 163.303, "eval_steps_per_second": 20.501, "step": 35100 }, { "epoch": 2066.7083926031296, "grad_norm": 0.15595249831676483, "learning_rate": 2.488294314381271e-06, "loss": 2.618, "step": 35200 }, { "epoch": 2066.7083926031296, "eval_loss": 2.620990514755249, "eval_runtime": 8.8529, "eval_samples_per_second": 157.463, "eval_steps_per_second": 19.768, "step": 35200 }, { "epoch": 2072.398293029872, "grad_norm": 0.15020006895065308, "learning_rate": 2.4782608695652173e-06, "loss": 2.6181, "step": 35300 }, { "epoch": 2072.398293029872, "eval_loss": 2.617979049682617, "eval_runtime": 8.8415, "eval_samples_per_second": 157.666, "eval_steps_per_second": 19.793, "step": 35300 }, { "epoch": 2078.0881934566146, "grad_norm": 0.12532344460487366, "learning_rate": 2.468227424749164e-06, "loss": 2.618, "step": 35400 }, { "epoch": 2078.0881934566146, "eval_loss": 2.6182172298431396, "eval_runtime": 8.8375, "eval_samples_per_second": 157.737, "eval_steps_per_second": 19.802, "step": 35400 }, { "epoch": 2083.778093883357, "grad_norm": 0.13622809946537018, "learning_rate": 2.4581939799331104e-06, "loss": 2.6178, "step": 35500 }, { "epoch": 2083.778093883357, "eval_loss": 2.6147806644439697, "eval_runtime": 8.8411, "eval_samples_per_second": 157.674, "eval_steps_per_second": 19.794, "step": 35500 }, { "epoch": 2089.4679943100996, "grad_norm": 0.11295317858457565, "learning_rate": 2.4481605351170568e-06, "loss": 2.6185, "step": 35600 }, { "epoch": 2089.4679943100996, "eval_loss": 2.617539167404175, "eval_runtime": 8.8516, "eval_samples_per_second": 157.485, "eval_steps_per_second": 19.77, "step": 35600 }, { "epoch": 2095.157894736842, "grad_norm": 0.12703397870063782, "learning_rate": 2.4381270903010035e-06, "loss": 2.6178, "step": 35700 }, { "epoch": 2095.157894736842, "eval_loss": 2.6192715167999268, "eval_runtime": 8.8438, "eval_samples_per_second": 157.624, "eval_steps_per_second": 19.788, "step": 35700 }, { "epoch": 2100.8477951635846, "grad_norm": 0.13047580420970917, "learning_rate": 2.42809364548495e-06, "loss": 2.6175, "step": 35800 }, { "epoch": 2100.8477951635846, "eval_loss": 2.6149110794067383, "eval_runtime": 8.5422, "eval_samples_per_second": 163.189, "eval_steps_per_second": 20.486, "step": 35800 }, { "epoch": 2106.537695590327, "grad_norm": 0.1494310200214386, "learning_rate": 2.4180602006688962e-06, "loss": 2.6183, "step": 35900 }, { "epoch": 2106.537695590327, "eval_loss": 2.617572069168091, "eval_runtime": 8.5269, "eval_samples_per_second": 163.483, "eval_steps_per_second": 20.523, "step": 35900 }, { "epoch": 2112.2275960170696, "grad_norm": 0.14913226664066315, "learning_rate": 2.408026755852843e-06, "loss": 2.6175, "step": 36000 }, { "epoch": 2112.2275960170696, "eval_loss": 2.6157870292663574, "eval_runtime": 8.5322, "eval_samples_per_second": 163.381, "eval_steps_per_second": 20.511, "step": 36000 }, { "epoch": 2117.917496443812, "grad_norm": 0.12804996967315674, "learning_rate": 2.3979933110367893e-06, "loss": 2.6175, "step": 36100 }, { "epoch": 2117.917496443812, "eval_loss": 2.6161787509918213, "eval_runtime": 8.8464, "eval_samples_per_second": 157.578, "eval_steps_per_second": 19.782, "step": 36100 }, { "epoch": 2123.6073968705546, "grad_norm": 0.1311938613653183, "learning_rate": 2.387959866220736e-06, "loss": 2.6177, "step": 36200 }, { "epoch": 2123.6073968705546, "eval_loss": 2.6184916496276855, "eval_runtime": 8.5276, "eval_samples_per_second": 163.47, "eval_steps_per_second": 20.522, "step": 36200 }, { "epoch": 2129.2972972972975, "grad_norm": 0.14833857119083405, "learning_rate": 2.3779264214046824e-06, "loss": 2.618, "step": 36300 }, { "epoch": 2129.2972972972975, "eval_loss": 2.616685628890991, "eval_runtime": 8.5313, "eval_samples_per_second": 163.399, "eval_steps_per_second": 20.513, "step": 36300 }, { "epoch": 2134.98719772404, "grad_norm": 0.14459851384162903, "learning_rate": 2.3678929765886288e-06, "loss": 2.6173, "step": 36400 }, { "epoch": 2134.98719772404, "eval_loss": 2.6192727088928223, "eval_runtime": 8.5314, "eval_samples_per_second": 163.397, "eval_steps_per_second": 20.513, "step": 36400 }, { "epoch": 2140.6770981507825, "grad_norm": 0.12654992938041687, "learning_rate": 2.3578595317725755e-06, "loss": 2.6174, "step": 36500 }, { "epoch": 2140.6770981507825, "eval_loss": 2.614757537841797, "eval_runtime": 8.8498, "eval_samples_per_second": 157.517, "eval_steps_per_second": 19.774, "step": 36500 }, { "epoch": 2146.366998577525, "grad_norm": 0.16258764266967773, "learning_rate": 2.347826086956522e-06, "loss": 2.618, "step": 36600 }, { "epoch": 2146.366998577525, "eval_loss": 2.61818528175354, "eval_runtime": 8.5339, "eval_samples_per_second": 163.349, "eval_steps_per_second": 20.507, "step": 36600 }, { "epoch": 2152.0568990042675, "grad_norm": 0.1515471637248993, "learning_rate": 2.337792642140468e-06, "loss": 2.6177, "step": 36700 }, { "epoch": 2152.0568990042675, "eval_loss": 2.6178441047668457, "eval_runtime": 8.8414, "eval_samples_per_second": 157.668, "eval_steps_per_second": 19.793, "step": 36700 }, { "epoch": 2157.74679943101, "grad_norm": 0.1283411979675293, "learning_rate": 2.327759197324415e-06, "loss": 2.6173, "step": 36800 }, { "epoch": 2157.74679943101, "eval_loss": 2.6143412590026855, "eval_runtime": 8.5345, "eval_samples_per_second": 163.338, "eval_steps_per_second": 20.505, "step": 36800 }, { "epoch": 2163.4366998577525, "grad_norm": 0.13093768060207367, "learning_rate": 2.3177257525083613e-06, "loss": 2.6175, "step": 36900 }, { "epoch": 2163.4366998577525, "eval_loss": 2.6168148517608643, "eval_runtime": 8.5431, "eval_samples_per_second": 163.172, "eval_steps_per_second": 20.484, "step": 36900 }, { "epoch": 2169.126600284495, "grad_norm": 0.12476625293493271, "learning_rate": 2.307692307692308e-06, "loss": 2.6174, "step": 37000 }, { "epoch": 2169.126600284495, "eval_loss": 2.6173489093780518, "eval_runtime": 8.8493, "eval_samples_per_second": 157.526, "eval_steps_per_second": 19.776, "step": 37000 }, { "epoch": 2174.8165007112375, "grad_norm": 0.11948033422231674, "learning_rate": 2.2976588628762544e-06, "loss": 2.617, "step": 37100 }, { "epoch": 2174.8165007112375, "eval_loss": 2.616060972213745, "eval_runtime": 8.8543, "eval_samples_per_second": 157.438, "eval_steps_per_second": 19.764, "step": 37100 }, { "epoch": 2180.50640113798, "grad_norm": 0.12949152290821075, "learning_rate": 2.2876254180602008e-06, "loss": 2.6175, "step": 37200 }, { "epoch": 2180.50640113798, "eval_loss": 2.619767427444458, "eval_runtime": 8.5433, "eval_samples_per_second": 163.169, "eval_steps_per_second": 20.484, "step": 37200 }, { "epoch": 2186.1963015647225, "grad_norm": 0.14393049478530884, "learning_rate": 2.2775919732441475e-06, "loss": 2.6173, "step": 37300 }, { "epoch": 2186.1963015647225, "eval_loss": 2.614513635635376, "eval_runtime": 8.5464, "eval_samples_per_second": 163.11, "eval_steps_per_second": 20.477, "step": 37300 }, { "epoch": 2191.886201991465, "grad_norm": 0.12848299741744995, "learning_rate": 2.267558528428094e-06, "loss": 2.6171, "step": 37400 }, { "epoch": 2191.886201991465, "eval_loss": 2.615206241607666, "eval_runtime": 8.8388, "eval_samples_per_second": 157.714, "eval_steps_per_second": 19.799, "step": 37400 }, { "epoch": 2197.5761024182075, "grad_norm": 0.13800281286239624, "learning_rate": 2.25752508361204e-06, "loss": 2.6171, "step": 37500 }, { "epoch": 2197.5761024182075, "eval_loss": 2.616520404815674, "eval_runtime": 8.5307, "eval_samples_per_second": 163.41, "eval_steps_per_second": 20.514, "step": 37500 }, { "epoch": 2203.2660028449504, "grad_norm": 0.1414160281419754, "learning_rate": 2.2474916387959865e-06, "loss": 2.617, "step": 37600 }, { "epoch": 2203.2660028449504, "eval_loss": 2.617866039276123, "eval_runtime": 8.8509, "eval_samples_per_second": 157.499, "eval_steps_per_second": 19.772, "step": 37600 }, { "epoch": 2208.955903271693, "grad_norm": 0.129195898771286, "learning_rate": 2.237458193979933e-06, "loss": 2.617, "step": 37700 }, { "epoch": 2208.955903271693, "eval_loss": 2.616370677947998, "eval_runtime": 8.5271, "eval_samples_per_second": 163.479, "eval_steps_per_second": 20.523, "step": 37700 }, { "epoch": 2214.6458036984354, "grad_norm": 0.12701831758022308, "learning_rate": 2.2274247491638796e-06, "loss": 2.6172, "step": 37800 }, { "epoch": 2214.6458036984354, "eval_loss": 2.619422197341919, "eval_runtime": 8.5388, "eval_samples_per_second": 163.255, "eval_steps_per_second": 20.495, "step": 37800 }, { "epoch": 2220.335704125178, "grad_norm": 0.1434861570596695, "learning_rate": 2.217391304347826e-06, "loss": 2.6168, "step": 37900 }, { "epoch": 2220.335704125178, "eval_loss": 2.6175920963287354, "eval_runtime": 8.528, "eval_samples_per_second": 163.461, "eval_steps_per_second": 20.521, "step": 37900 }, { "epoch": 2226.0256045519204, "grad_norm": 0.1319652646780014, "learning_rate": 2.2073578595317723e-06, "loss": 2.6169, "step": 38000 }, { "epoch": 2226.0256045519204, "eval_loss": 2.6176187992095947, "eval_runtime": 8.8424, "eval_samples_per_second": 157.65, "eval_steps_per_second": 19.791, "step": 38000 }, { "epoch": 2231.715504978663, "grad_norm": 0.13358598947525024, "learning_rate": 2.197324414715719e-06, "loss": 2.6167, "step": 38100 }, { "epoch": 2231.715504978663, "eval_loss": 2.616727828979492, "eval_runtime": 8.5384, "eval_samples_per_second": 163.263, "eval_steps_per_second": 20.496, "step": 38100 }, { "epoch": 2237.4054054054054, "grad_norm": 0.12551608681678772, "learning_rate": 2.1872909698996654e-06, "loss": 2.617, "step": 38200 }, { "epoch": 2237.4054054054054, "eval_loss": 2.616206645965576, "eval_runtime": 8.8419, "eval_samples_per_second": 157.658, "eval_steps_per_second": 19.792, "step": 38200 }, { "epoch": 2243.095305832148, "grad_norm": 0.1412065029144287, "learning_rate": 2.177257525083612e-06, "loss": 2.6172, "step": 38300 }, { "epoch": 2243.095305832148, "eval_loss": 2.618215799331665, "eval_runtime": 8.54, "eval_samples_per_second": 163.232, "eval_steps_per_second": 20.492, "step": 38300 }, { "epoch": 2248.7852062588904, "grad_norm": 0.16305094957351685, "learning_rate": 2.1672240802675585e-06, "loss": 2.6166, "step": 38400 }, { "epoch": 2248.7852062588904, "eval_loss": 2.618960380554199, "eval_runtime": 8.5327, "eval_samples_per_second": 163.371, "eval_steps_per_second": 20.509, "step": 38400 }, { "epoch": 2254.475106685633, "grad_norm": 0.14737871289253235, "learning_rate": 2.157190635451505e-06, "loss": 2.6165, "step": 38500 }, { "epoch": 2254.475106685633, "eval_loss": 2.618856906890869, "eval_runtime": 8.8618, "eval_samples_per_second": 157.305, "eval_steps_per_second": 19.748, "step": 38500 }, { "epoch": 2260.1650071123754, "grad_norm": 0.11627591401338577, "learning_rate": 2.1471571906354516e-06, "loss": 2.6169, "step": 38600 }, { "epoch": 2260.1650071123754, "eval_loss": 2.6156229972839355, "eval_runtime": 8.5295, "eval_samples_per_second": 163.432, "eval_steps_per_second": 20.517, "step": 38600 }, { "epoch": 2265.854907539118, "grad_norm": 0.1361280232667923, "learning_rate": 2.137123745819398e-06, "loss": 2.6168, "step": 38700 }, { "epoch": 2265.854907539118, "eval_loss": 2.6178250312805176, "eval_runtime": 8.8561, "eval_samples_per_second": 157.405, "eval_steps_per_second": 19.76, "step": 38700 }, { "epoch": 2271.5448079658604, "grad_norm": 0.13634426891803741, "learning_rate": 2.1270903010033443e-06, "loss": 2.6168, "step": 38800 }, { "epoch": 2271.5448079658604, "eval_loss": 2.620987892150879, "eval_runtime": 8.542, "eval_samples_per_second": 163.194, "eval_steps_per_second": 20.487, "step": 38800 }, { "epoch": 2277.2347083926034, "grad_norm": 0.11851690709590912, "learning_rate": 2.117056856187291e-06, "loss": 2.6169, "step": 38900 }, { "epoch": 2277.2347083926034, "eval_loss": 2.6175451278686523, "eval_runtime": 8.5298, "eval_samples_per_second": 163.428, "eval_steps_per_second": 20.516, "step": 38900 }, { "epoch": 2282.924608819346, "grad_norm": 0.15516361594200134, "learning_rate": 2.1070234113712374e-06, "loss": 2.6164, "step": 39000 }, { "epoch": 2282.924608819346, "eval_loss": 2.6171224117279053, "eval_runtime": 8.5317, "eval_samples_per_second": 163.391, "eval_steps_per_second": 20.512, "step": 39000 }, { "epoch": 2288.6145092460883, "grad_norm": 0.14551801979541779, "learning_rate": 2.0969899665551837e-06, "loss": 2.6166, "step": 39100 }, { "epoch": 2288.6145092460883, "eval_loss": 2.618269920349121, "eval_runtime": 8.8607, "eval_samples_per_second": 157.324, "eval_steps_per_second": 19.75, "step": 39100 }, { "epoch": 2294.304409672831, "grad_norm": 0.13568130135536194, "learning_rate": 2.0869565217391305e-06, "loss": 2.6162, "step": 39200 }, { "epoch": 2294.304409672831, "eval_loss": 2.6169166564941406, "eval_runtime": 8.5331, "eval_samples_per_second": 163.363, "eval_steps_per_second": 20.508, "step": 39200 }, { "epoch": 2299.9943100995733, "grad_norm": 0.1397295743227005, "learning_rate": 2.076923076923077e-06, "loss": 2.6165, "step": 39300 }, { "epoch": 2299.9943100995733, "eval_loss": 2.617077589035034, "eval_runtime": 8.8589, "eval_samples_per_second": 157.356, "eval_steps_per_second": 19.754, "step": 39300 }, { "epoch": 2305.684210526316, "grad_norm": 0.1272270530462265, "learning_rate": 2.0668896321070236e-06, "loss": 2.6167, "step": 39400 }, { "epoch": 2305.684210526316, "eval_loss": 2.6153969764709473, "eval_runtime": 8.5514, "eval_samples_per_second": 163.015, "eval_steps_per_second": 20.465, "step": 39400 }, { "epoch": 2311.3741109530583, "grad_norm": 0.13360774517059326, "learning_rate": 2.05685618729097e-06, "loss": 2.616, "step": 39500 }, { "epoch": 2311.3741109530583, "eval_loss": 2.6174352169036865, "eval_runtime": 8.851, "eval_samples_per_second": 157.496, "eval_steps_per_second": 19.772, "step": 39500 }, { "epoch": 2317.064011379801, "grad_norm": 0.14483892917633057, "learning_rate": 2.0468227424749163e-06, "loss": 2.6158, "step": 39600 }, { "epoch": 2317.064011379801, "eval_loss": 2.617931842803955, "eval_runtime": 8.5303, "eval_samples_per_second": 163.418, "eval_steps_per_second": 20.515, "step": 39600 }, { "epoch": 2322.7539118065433, "grad_norm": 0.12557685375213623, "learning_rate": 2.036789297658863e-06, "loss": 2.6163, "step": 39700 }, { "epoch": 2322.7539118065433, "eval_loss": 2.616457462310791, "eval_runtime": 8.8454, "eval_samples_per_second": 157.596, "eval_steps_per_second": 19.784, "step": 39700 }, { "epoch": 2328.443812233286, "grad_norm": 0.14481040835380554, "learning_rate": 2.0267558528428094e-06, "loss": 2.6161, "step": 39800 }, { "epoch": 2328.443812233286, "eval_loss": 2.6148922443389893, "eval_runtime": 8.5419, "eval_samples_per_second": 163.195, "eval_steps_per_second": 20.487, "step": 39800 }, { "epoch": 2334.1337126600283, "grad_norm": 0.1371890753507614, "learning_rate": 2.0167224080267557e-06, "loss": 2.6156, "step": 39900 }, { "epoch": 2334.1337126600283, "eval_loss": 2.6165366172790527, "eval_runtime": 8.5312, "eval_samples_per_second": 163.401, "eval_steps_per_second": 20.513, "step": 39900 }, { "epoch": 2339.823613086771, "grad_norm": 0.11908498406410217, "learning_rate": 2.0066889632107025e-06, "loss": 2.6161, "step": 40000 }, { "epoch": 2339.823613086771, "eval_loss": 2.6168572902679443, "eval_runtime": 8.5312, "eval_samples_per_second": 163.4, "eval_steps_per_second": 20.513, "step": 40000 }, { "epoch": 2345.5135135135133, "grad_norm": 0.15776848793029785, "learning_rate": 1.996655518394649e-06, "loss": 2.6161, "step": 40100 }, { "epoch": 2345.5135135135133, "eval_loss": 2.61922550201416, "eval_runtime": 8.8387, "eval_samples_per_second": 157.716, "eval_steps_per_second": 19.799, "step": 40100 }, { "epoch": 2351.2034139402563, "grad_norm": 0.13650420308113098, "learning_rate": 1.986622073578595e-06, "loss": 2.6157, "step": 40200 }, { "epoch": 2351.2034139402563, "eval_loss": 2.6171460151672363, "eval_runtime": 8.8588, "eval_samples_per_second": 157.357, "eval_steps_per_second": 19.754, "step": 40200 }, { "epoch": 2356.8933143669988, "grad_norm": 0.14394904673099518, "learning_rate": 1.976588628762542e-06, "loss": 2.6156, "step": 40300 }, { "epoch": 2356.8933143669988, "eval_loss": 2.617033004760742, "eval_runtime": 8.5275, "eval_samples_per_second": 163.472, "eval_steps_per_second": 20.522, "step": 40300 }, { "epoch": 2362.5832147937413, "grad_norm": 0.14980724453926086, "learning_rate": 1.9665551839464883e-06, "loss": 2.6163, "step": 40400 }, { "epoch": 2362.5832147937413, "eval_loss": 2.614140510559082, "eval_runtime": 8.5229, "eval_samples_per_second": 163.56, "eval_steps_per_second": 20.533, "step": 40400 }, { "epoch": 2368.2731152204838, "grad_norm": 0.13233982026576996, "learning_rate": 1.956521739130435e-06, "loss": 2.6156, "step": 40500 }, { "epoch": 2368.2731152204838, "eval_loss": 2.615586042404175, "eval_runtime": 8.847, "eval_samples_per_second": 157.568, "eval_steps_per_second": 19.781, "step": 40500 }, { "epoch": 2373.9630156472263, "grad_norm": 0.13586369156837463, "learning_rate": 1.9464882943143814e-06, "loss": 2.6159, "step": 40600 }, { "epoch": 2373.9630156472263, "eval_loss": 2.61820650100708, "eval_runtime": 8.8392, "eval_samples_per_second": 157.707, "eval_steps_per_second": 19.798, "step": 40600 }, { "epoch": 2379.6529160739688, "grad_norm": 0.13869047164916992, "learning_rate": 1.9364548494983277e-06, "loss": 2.6152, "step": 40700 }, { "epoch": 2379.6529160739688, "eval_loss": 2.614039897918701, "eval_runtime": 8.5304, "eval_samples_per_second": 163.415, "eval_steps_per_second": 20.515, "step": 40700 }, { "epoch": 2385.3428165007113, "grad_norm": 0.1269962042570114, "learning_rate": 1.9264214046822745e-06, "loss": 2.6152, "step": 40800 }, { "epoch": 2385.3428165007113, "eval_loss": 2.614192247390747, "eval_runtime": 8.5419, "eval_samples_per_second": 163.195, "eval_steps_per_second": 20.487, "step": 40800 }, { "epoch": 2391.0327169274537, "grad_norm": 0.14708365499973297, "learning_rate": 1.916387959866221e-06, "loss": 2.6155, "step": 40900 }, { "epoch": 2391.0327169274537, "eval_loss": 2.615812301635742, "eval_runtime": 8.8721, "eval_samples_per_second": 157.122, "eval_steps_per_second": 19.725, "step": 40900 }, { "epoch": 2396.7226173541962, "grad_norm": 0.11788502335548401, "learning_rate": 1.9063545150501674e-06, "loss": 2.6158, "step": 41000 }, { "epoch": 2396.7226173541962, "eval_loss": 2.615337610244751, "eval_runtime": 8.8548, "eval_samples_per_second": 157.429, "eval_steps_per_second": 19.763, "step": 41000 }, { "epoch": 2402.4125177809387, "grad_norm": 0.14130190014839172, "learning_rate": 1.896321070234114e-06, "loss": 2.6153, "step": 41100 }, { "epoch": 2402.4125177809387, "eval_loss": 2.6168124675750732, "eval_runtime": 8.5314, "eval_samples_per_second": 163.396, "eval_steps_per_second": 20.512, "step": 41100 }, { "epoch": 2408.1024182076812, "grad_norm": 0.14463502168655396, "learning_rate": 1.8862876254180603e-06, "loss": 2.6155, "step": 41200 }, { "epoch": 2408.1024182076812, "eval_loss": 2.6179542541503906, "eval_runtime": 8.5339, "eval_samples_per_second": 163.348, "eval_steps_per_second": 20.506, "step": 41200 }, { "epoch": 2413.7923186344237, "grad_norm": 0.12708818912506104, "learning_rate": 1.8762541806020068e-06, "loss": 2.6155, "step": 41300 }, { "epoch": 2413.7923186344237, "eval_loss": 2.616238832473755, "eval_runtime": 8.5448, "eval_samples_per_second": 163.139, "eval_steps_per_second": 20.48, "step": 41300 }, { "epoch": 2419.4822190611662, "grad_norm": 0.1303997039794922, "learning_rate": 1.8662207357859534e-06, "loss": 2.616, "step": 41400 }, { "epoch": 2419.4822190611662, "eval_loss": 2.615610122680664, "eval_runtime": 8.8552, "eval_samples_per_second": 157.421, "eval_steps_per_second": 19.762, "step": 41400 }, { "epoch": 2425.172119487909, "grad_norm": 0.14887328445911407, "learning_rate": 1.8561872909699e-06, "loss": 2.6158, "step": 41500 }, { "epoch": 2425.172119487909, "eval_loss": 2.6136879920959473, "eval_runtime": 8.5262, "eval_samples_per_second": 163.496, "eval_steps_per_second": 20.525, "step": 41500 }, { "epoch": 2430.8620199146517, "grad_norm": 0.12649740278720856, "learning_rate": 1.8461538461538462e-06, "loss": 2.6156, "step": 41600 }, { "epoch": 2430.8620199146517, "eval_loss": 2.6171023845672607, "eval_runtime": 8.8681, "eval_samples_per_second": 157.192, "eval_steps_per_second": 19.734, "step": 41600 }, { "epoch": 2436.551920341394, "grad_norm": 0.14125467836856842, "learning_rate": 1.8361204013377928e-06, "loss": 2.6152, "step": 41700 }, { "epoch": 2436.551920341394, "eval_loss": 2.6158018112182617, "eval_runtime": 8.5206, "eval_samples_per_second": 163.604, "eval_steps_per_second": 20.538, "step": 41700 }, { "epoch": 2442.2418207681367, "grad_norm": 0.12283240258693695, "learning_rate": 1.8260869565217394e-06, "loss": 2.6159, "step": 41800 }, { "epoch": 2442.2418207681367, "eval_loss": 2.6169424057006836, "eval_runtime": 8.5518, "eval_samples_per_second": 163.007, "eval_steps_per_second": 20.464, "step": 41800 }, { "epoch": 2447.931721194879, "grad_norm": 0.15379033982753754, "learning_rate": 1.8160535117056857e-06, "loss": 2.6152, "step": 41900 }, { "epoch": 2447.931721194879, "eval_loss": 2.6155290603637695, "eval_runtime": 8.8698, "eval_samples_per_second": 157.162, "eval_steps_per_second": 19.73, "step": 41900 }, { "epoch": 2453.6216216216217, "grad_norm": 0.15148812532424927, "learning_rate": 1.8060200668896322e-06, "loss": 2.6152, "step": 42000 }, { "epoch": 2453.6216216216217, "eval_loss": 2.6144700050354004, "eval_runtime": 8.5457, "eval_samples_per_second": 163.122, "eval_steps_per_second": 20.478, "step": 42000 }, { "epoch": 2459.311522048364, "grad_norm": 0.1490088701248169, "learning_rate": 1.7959866220735788e-06, "loss": 2.615, "step": 42100 }, { "epoch": 2459.311522048364, "eval_loss": 2.6168768405914307, "eval_runtime": 8.5451, "eval_samples_per_second": 163.135, "eval_steps_per_second": 20.48, "step": 42100 }, { "epoch": 2465.0014224751067, "grad_norm": 0.11491715162992477, "learning_rate": 1.7859531772575253e-06, "loss": 2.6157, "step": 42200 }, { "epoch": 2465.0014224751067, "eval_loss": 2.6155242919921875, "eval_runtime": 8.5538, "eval_samples_per_second": 162.969, "eval_steps_per_second": 20.459, "step": 42200 }, { "epoch": 2470.691322901849, "grad_norm": 0.15772178769111633, "learning_rate": 1.7759197324414717e-06, "loss": 2.6153, "step": 42300 }, { "epoch": 2470.691322901849, "eval_loss": 2.613830804824829, "eval_runtime": 8.8766, "eval_samples_per_second": 157.041, "eval_steps_per_second": 19.715, "step": 42300 }, { "epoch": 2476.3812233285917, "grad_norm": 0.13534432649612427, "learning_rate": 1.7658862876254182e-06, "loss": 2.6149, "step": 42400 }, { "epoch": 2476.3812233285917, "eval_loss": 2.6143946647644043, "eval_runtime": 8.5381, "eval_samples_per_second": 163.268, "eval_steps_per_second": 20.496, "step": 42400 }, { "epoch": 2482.071123755334, "grad_norm": 0.11993639171123505, "learning_rate": 1.7558528428093648e-06, "loss": 2.6154, "step": 42500 }, { "epoch": 2482.071123755334, "eval_loss": 2.6130027770996094, "eval_runtime": 8.5295, "eval_samples_per_second": 163.434, "eval_steps_per_second": 20.517, "step": 42500 }, { "epoch": 2487.7610241820767, "grad_norm": 0.12379685789346695, "learning_rate": 1.745819397993311e-06, "loss": 2.6152, "step": 42600 }, { "epoch": 2487.7610241820767, "eval_loss": 2.616774082183838, "eval_runtime": 8.854, "eval_samples_per_second": 157.444, "eval_steps_per_second": 19.765, "step": 42600 }, { "epoch": 2493.450924608819, "grad_norm": 0.11662384122610092, "learning_rate": 1.7357859531772575e-06, "loss": 2.6152, "step": 42700 }, { "epoch": 2493.450924608819, "eval_loss": 2.6169705390930176, "eval_runtime": 8.5399, "eval_samples_per_second": 163.234, "eval_steps_per_second": 20.492, "step": 42700 }, { "epoch": 2499.140825035562, "grad_norm": 0.13475127518177032, "learning_rate": 1.7257525083612038e-06, "loss": 2.6153, "step": 42800 }, { "epoch": 2499.140825035562, "eval_loss": 2.6149654388427734, "eval_runtime": 8.5276, "eval_samples_per_second": 163.468, "eval_steps_per_second": 20.522, "step": 42800 }, { "epoch": 2504.8307254623046, "grad_norm": 0.12163935601711273, "learning_rate": 1.7157190635451504e-06, "loss": 2.6146, "step": 42900 }, { "epoch": 2504.8307254623046, "eval_loss": 2.616426467895508, "eval_runtime": 8.5341, "eval_samples_per_second": 163.345, "eval_steps_per_second": 20.506, "step": 42900 }, { "epoch": 2510.520625889047, "grad_norm": 0.12904202938079834, "learning_rate": 1.705685618729097e-06, "loss": 2.615, "step": 43000 }, { "epoch": 2510.520625889047, "eval_loss": 2.6142711639404297, "eval_runtime": 8.8632, "eval_samples_per_second": 157.279, "eval_steps_per_second": 19.744, "step": 43000 }, { "epoch": 2516.2105263157896, "grad_norm": 0.14409850537776947, "learning_rate": 1.6956521739130435e-06, "loss": 2.615, "step": 43100 }, { "epoch": 2516.2105263157896, "eval_loss": 2.6173369884490967, "eval_runtime": 8.517, "eval_samples_per_second": 163.673, "eval_steps_per_second": 20.547, "step": 43100 }, { "epoch": 2521.900426742532, "grad_norm": 0.12942758202552795, "learning_rate": 1.6856187290969898e-06, "loss": 2.6147, "step": 43200 }, { "epoch": 2521.900426742532, "eval_loss": 2.6177051067352295, "eval_runtime": 8.8403, "eval_samples_per_second": 157.688, "eval_steps_per_second": 19.796, "step": 43200 }, { "epoch": 2527.5903271692746, "grad_norm": 0.14761574566364288, "learning_rate": 1.6755852842809363e-06, "loss": 2.6143, "step": 43300 }, { "epoch": 2527.5903271692746, "eval_loss": 2.6154208183288574, "eval_runtime": 8.5203, "eval_samples_per_second": 163.609, "eval_steps_per_second": 20.539, "step": 43300 }, { "epoch": 2533.280227596017, "grad_norm": 0.1361926943063736, "learning_rate": 1.665551839464883e-06, "loss": 2.615, "step": 43400 }, { "epoch": 2533.280227596017, "eval_loss": 2.617976188659668, "eval_runtime": 8.5489, "eval_samples_per_second": 163.062, "eval_steps_per_second": 20.47, "step": 43400 }, { "epoch": 2538.9701280227596, "grad_norm": 0.1490316092967987, "learning_rate": 1.6555183946488294e-06, "loss": 2.6146, "step": 43500 }, { "epoch": 2538.9701280227596, "eval_loss": 2.616652250289917, "eval_runtime": 8.5376, "eval_samples_per_second": 163.279, "eval_steps_per_second": 20.498, "step": 43500 }, { "epoch": 2544.660028449502, "grad_norm": 0.13588373363018036, "learning_rate": 1.6454849498327758e-06, "loss": 2.6152, "step": 43600 }, { "epoch": 2544.660028449502, "eval_loss": 2.6177802085876465, "eval_runtime": 8.8667, "eval_samples_per_second": 157.217, "eval_steps_per_second": 19.737, "step": 43600 }, { "epoch": 2550.3499288762446, "grad_norm": 0.12654942274093628, "learning_rate": 1.6354515050167223e-06, "loss": 2.6146, "step": 43700 }, { "epoch": 2550.3499288762446, "eval_loss": 2.615847110748291, "eval_runtime": 8.5335, "eval_samples_per_second": 163.356, "eval_steps_per_second": 20.507, "step": 43700 }, { "epoch": 2556.039829302987, "grad_norm": 0.15947924554347992, "learning_rate": 1.6254180602006689e-06, "loss": 2.6149, "step": 43800 }, { "epoch": 2556.039829302987, "eval_loss": 2.613116502761841, "eval_runtime": 8.5279, "eval_samples_per_second": 163.464, "eval_steps_per_second": 20.521, "step": 43800 }, { "epoch": 2561.7297297297296, "grad_norm": 0.11915856599807739, "learning_rate": 1.6153846153846154e-06, "loss": 2.6146, "step": 43900 }, { "epoch": 2561.7297297297296, "eval_loss": 2.614288568496704, "eval_runtime": 8.8597, "eval_samples_per_second": 157.342, "eval_steps_per_second": 19.752, "step": 43900 }, { "epoch": 2567.419630156472, "grad_norm": 0.1312067210674286, "learning_rate": 1.6053511705685618e-06, "loss": 2.6147, "step": 44000 }, { "epoch": 2567.419630156472, "eval_loss": 2.6091578006744385, "eval_runtime": 8.5457, "eval_samples_per_second": 163.122, "eval_steps_per_second": 20.478, "step": 44000 }, { "epoch": 2573.109530583215, "grad_norm": 0.14233353734016418, "learning_rate": 1.5953177257525083e-06, "loss": 2.6148, "step": 44100 }, { "epoch": 2573.109530583215, "eval_loss": 2.612126111984253, "eval_runtime": 8.5355, "eval_samples_per_second": 163.318, "eval_steps_per_second": 20.503, "step": 44100 }, { "epoch": 2578.7994310099575, "grad_norm": 0.1357184797525406, "learning_rate": 1.5852842809364549e-06, "loss": 2.6149, "step": 44200 }, { "epoch": 2578.7994310099575, "eval_loss": 2.618696928024292, "eval_runtime": 8.8741, "eval_samples_per_second": 157.087, "eval_steps_per_second": 19.72, "step": 44200 }, { "epoch": 2584.4893314367, "grad_norm": 0.14556884765625, "learning_rate": 1.5752508361204012e-06, "loss": 2.6142, "step": 44300 }, { "epoch": 2584.4893314367, "eval_loss": 2.616926908493042, "eval_runtime": 8.5359, "eval_samples_per_second": 163.311, "eval_steps_per_second": 20.502, "step": 44300 }, { "epoch": 2590.1792318634425, "grad_norm": 0.12908801436424255, "learning_rate": 1.5652173913043478e-06, "loss": 2.6145, "step": 44400 }, { "epoch": 2590.1792318634425, "eval_loss": 2.6157069206237793, "eval_runtime": 8.8877, "eval_samples_per_second": 156.846, "eval_steps_per_second": 19.69, "step": 44400 }, { "epoch": 2595.869132290185, "grad_norm": 0.14168845117092133, "learning_rate": 1.5551839464882943e-06, "loss": 2.6146, "step": 44500 }, { "epoch": 2595.869132290185, "eval_loss": 2.615161657333374, "eval_runtime": 8.5305, "eval_samples_per_second": 163.413, "eval_steps_per_second": 20.515, "step": 44500 }, { "epoch": 2601.5590327169275, "grad_norm": 0.13634611666202545, "learning_rate": 1.5451505016722409e-06, "loss": 2.6146, "step": 44600 }, { "epoch": 2601.5590327169275, "eval_loss": 2.6135544776916504, "eval_runtime": 8.54, "eval_samples_per_second": 163.233, "eval_steps_per_second": 20.492, "step": 44600 }, { "epoch": 2607.24893314367, "grad_norm": 0.14684821665287018, "learning_rate": 1.5351170568561872e-06, "loss": 2.6149, "step": 44700 }, { "epoch": 2607.24893314367, "eval_loss": 2.616076707839966, "eval_runtime": 8.517, "eval_samples_per_second": 163.672, "eval_steps_per_second": 20.547, "step": 44700 }, { "epoch": 2612.9388335704125, "grad_norm": 0.14135567843914032, "learning_rate": 1.5250836120401338e-06, "loss": 2.6142, "step": 44800 }, { "epoch": 2612.9388335704125, "eval_loss": 2.6159145832061768, "eval_runtime": 8.8492, "eval_samples_per_second": 157.529, "eval_steps_per_second": 19.776, "step": 44800 }, { "epoch": 2618.628733997155, "grad_norm": 0.1256554275751114, "learning_rate": 1.5150501672240803e-06, "loss": 2.6142, "step": 44900 }, { "epoch": 2618.628733997155, "eval_loss": 2.6102206707000732, "eval_runtime": 8.5351, "eval_samples_per_second": 163.325, "eval_steps_per_second": 20.503, "step": 44900 }, { "epoch": 2624.3186344238975, "grad_norm": 0.12723155319690704, "learning_rate": 1.5050167224080269e-06, "loss": 2.614, "step": 45000 }, { "epoch": 2624.3186344238975, "eval_loss": 2.6176397800445557, "eval_runtime": 8.5392, "eval_samples_per_second": 163.248, "eval_steps_per_second": 20.494, "step": 45000 }, { "epoch": 2630.00853485064, "grad_norm": 0.1423732191324234, "learning_rate": 1.4949832775919732e-06, "loss": 2.6145, "step": 45100 }, { "epoch": 2630.00853485064, "eval_loss": 2.613284111022949, "eval_runtime": 8.5394, "eval_samples_per_second": 163.243, "eval_steps_per_second": 20.493, "step": 45100 }, { "epoch": 2635.6984352773825, "grad_norm": 0.127468079328537, "learning_rate": 1.4849498327759198e-06, "loss": 2.6143, "step": 45200 }, { "epoch": 2635.6984352773825, "eval_loss": 2.614154100418091, "eval_runtime": 8.842, "eval_samples_per_second": 157.657, "eval_steps_per_second": 19.792, "step": 45200 }, { "epoch": 2641.388335704125, "grad_norm": 0.13406263291835785, "learning_rate": 1.4749163879598663e-06, "loss": 2.6141, "step": 45300 }, { "epoch": 2641.388335704125, "eval_loss": 2.614849328994751, "eval_runtime": 8.53, "eval_samples_per_second": 163.424, "eval_steps_per_second": 20.516, "step": 45300 }, { "epoch": 2647.078236130868, "grad_norm": 0.14327415823936462, "learning_rate": 1.4648829431438129e-06, "loss": 2.6142, "step": 45400 }, { "epoch": 2647.078236130868, "eval_loss": 2.6156599521636963, "eval_runtime": 8.5293, "eval_samples_per_second": 163.437, "eval_steps_per_second": 20.518, "step": 45400 }, { "epoch": 2652.7681365576104, "grad_norm": 0.13055378198623657, "learning_rate": 1.4548494983277592e-06, "loss": 2.6141, "step": 45500 }, { "epoch": 2652.7681365576104, "eval_loss": 2.6118390560150146, "eval_runtime": 8.5382, "eval_samples_per_second": 163.267, "eval_steps_per_second": 20.496, "step": 45500 }, { "epoch": 2658.458036984353, "grad_norm": 0.14269088208675385, "learning_rate": 1.4448160535117058e-06, "loss": 2.6149, "step": 45600 }, { "epoch": 2658.458036984353, "eval_loss": 2.616089105606079, "eval_runtime": 8.8625, "eval_samples_per_second": 157.291, "eval_steps_per_second": 19.746, "step": 45600 }, { "epoch": 2664.1479374110954, "grad_norm": 0.13923226296901703, "learning_rate": 1.4347826086956523e-06, "loss": 2.6141, "step": 45700 }, { "epoch": 2664.1479374110954, "eval_loss": 2.615753412246704, "eval_runtime": 8.5429, "eval_samples_per_second": 163.175, "eval_steps_per_second": 20.485, "step": 45700 }, { "epoch": 2669.837837837838, "grad_norm": 0.11520116031169891, "learning_rate": 1.4247491638795989e-06, "loss": 2.614, "step": 45800 }, { "epoch": 2669.837837837838, "eval_loss": 2.614213228225708, "eval_runtime": 8.8633, "eval_samples_per_second": 157.278, "eval_steps_per_second": 19.744, "step": 45800 }, { "epoch": 2675.5277382645804, "grad_norm": 0.13826854526996613, "learning_rate": 1.4147157190635452e-06, "loss": 2.6141, "step": 45900 }, { "epoch": 2675.5277382645804, "eval_loss": 2.6141257286071777, "eval_runtime": 8.531, "eval_samples_per_second": 163.403, "eval_steps_per_second": 20.513, "step": 45900 }, { "epoch": 2681.217638691323, "grad_norm": 0.1388641595840454, "learning_rate": 1.4046822742474917e-06, "loss": 2.614, "step": 46000 }, { "epoch": 2681.217638691323, "eval_loss": 2.6166601181030273, "eval_runtime": 8.8588, "eval_samples_per_second": 157.357, "eval_steps_per_second": 19.754, "step": 46000 }, { "epoch": 2686.9075391180654, "grad_norm": 0.1250719428062439, "learning_rate": 1.3946488294314383e-06, "loss": 2.6134, "step": 46100 }, { "epoch": 2686.9075391180654, "eval_loss": 2.6177427768707275, "eval_runtime": 8.53, "eval_samples_per_second": 163.424, "eval_steps_per_second": 20.516, "step": 46100 }, { "epoch": 2692.597439544808, "grad_norm": 0.1312686949968338, "learning_rate": 1.3846153846153846e-06, "loss": 2.614, "step": 46200 }, { "epoch": 2692.597439544808, "eval_loss": 2.61600399017334, "eval_runtime": 8.518, "eval_samples_per_second": 163.653, "eval_steps_per_second": 20.545, "step": 46200 }, { "epoch": 2698.2873399715504, "grad_norm": 0.1418214589357376, "learning_rate": 1.374581939799331e-06, "loss": 2.6142, "step": 46300 }, { "epoch": 2698.2873399715504, "eval_loss": 2.6169888973236084, "eval_runtime": 8.5439, "eval_samples_per_second": 163.157, "eval_steps_per_second": 20.482, "step": 46300 }, { "epoch": 2703.977240398293, "grad_norm": 0.13503268361091614, "learning_rate": 1.3645484949832775e-06, "loss": 2.6141, "step": 46400 }, { "epoch": 2703.977240398293, "eval_loss": 2.617550849914551, "eval_runtime": 8.8621, "eval_samples_per_second": 157.299, "eval_steps_per_second": 19.747, "step": 46400 }, { "epoch": 2709.6671408250354, "grad_norm": 0.13151606917381287, "learning_rate": 1.354515050167224e-06, "loss": 2.6138, "step": 46500 }, { "epoch": 2709.6671408250354, "eval_loss": 2.614605665206909, "eval_runtime": 8.5338, "eval_samples_per_second": 163.351, "eval_steps_per_second": 20.507, "step": 46500 }, { "epoch": 2715.357041251778, "grad_norm": 0.12771758437156677, "learning_rate": 1.3444816053511706e-06, "loss": 2.6141, "step": 46600 }, { "epoch": 2715.357041251778, "eval_loss": 2.6184401512145996, "eval_runtime": 8.8574, "eval_samples_per_second": 157.383, "eval_steps_per_second": 19.758, "step": 46600 }, { "epoch": 2721.046941678521, "grad_norm": 0.13841165602207184, "learning_rate": 1.334448160535117e-06, "loss": 2.6138, "step": 46700 }, { "epoch": 2721.046941678521, "eval_loss": 2.617668867111206, "eval_runtime": 8.5339, "eval_samples_per_second": 163.348, "eval_steps_per_second": 20.506, "step": 46700 }, { "epoch": 2726.7368421052633, "grad_norm": 0.12478631734848022, "learning_rate": 1.3244147157190635e-06, "loss": 2.6141, "step": 46800 }, { "epoch": 2726.7368421052633, "eval_loss": 2.61787748336792, "eval_runtime": 8.8804, "eval_samples_per_second": 156.974, "eval_steps_per_second": 19.706, "step": 46800 }, { "epoch": 2732.426742532006, "grad_norm": 0.13361801207065582, "learning_rate": 1.31438127090301e-06, "loss": 2.6136, "step": 46900 }, { "epoch": 2732.426742532006, "eval_loss": 2.6143569946289062, "eval_runtime": 8.5467, "eval_samples_per_second": 163.105, "eval_steps_per_second": 20.476, "step": 46900 }, { "epoch": 2738.1166429587483, "grad_norm": 0.1362065225839615, "learning_rate": 1.3043478260869566e-06, "loss": 2.6135, "step": 47000 }, { "epoch": 2738.1166429587483, "eval_loss": 2.613162040710449, "eval_runtime": 8.8749, "eval_samples_per_second": 157.071, "eval_steps_per_second": 19.718, "step": 47000 }, { "epoch": 2743.806543385491, "grad_norm": 0.14401383697986603, "learning_rate": 1.294314381270903e-06, "loss": 2.6142, "step": 47100 }, { "epoch": 2743.806543385491, "eval_loss": 2.6157407760620117, "eval_runtime": 8.5464, "eval_samples_per_second": 163.11, "eval_steps_per_second": 20.476, "step": 47100 }, { "epoch": 2749.4964438122333, "grad_norm": 0.14595820009708405, "learning_rate": 1.2842809364548495e-06, "loss": 2.6136, "step": 47200 }, { "epoch": 2749.4964438122333, "eval_loss": 2.6133205890655518, "eval_runtime": 8.8738, "eval_samples_per_second": 157.092, "eval_steps_per_second": 19.721, "step": 47200 }, { "epoch": 2755.186344238976, "grad_norm": 0.14186260104179382, "learning_rate": 1.274247491638796e-06, "loss": 2.6134, "step": 47300 }, { "epoch": 2755.186344238976, "eval_loss": 2.617734670639038, "eval_runtime": 8.5319, "eval_samples_per_second": 163.387, "eval_steps_per_second": 20.511, "step": 47300 }, { "epoch": 2760.8762446657183, "grad_norm": 0.13552911579608917, "learning_rate": 1.2642140468227424e-06, "loss": 2.6135, "step": 47400 }, { "epoch": 2760.8762446657183, "eval_loss": 2.6164298057556152, "eval_runtime": 8.5373, "eval_samples_per_second": 163.284, "eval_steps_per_second": 20.498, "step": 47400 }, { "epoch": 2766.566145092461, "grad_norm": 0.12871357798576355, "learning_rate": 1.254180602006689e-06, "loss": 2.6138, "step": 47500 }, { "epoch": 2766.566145092461, "eval_loss": 2.6128602027893066, "eval_runtime": 8.5494, "eval_samples_per_second": 163.052, "eval_steps_per_second": 20.469, "step": 47500 }, { "epoch": 2772.2560455192033, "grad_norm": 0.12483840435743332, "learning_rate": 1.2441471571906355e-06, "loss": 2.6138, "step": 47600 }, { "epoch": 2772.2560455192033, "eval_loss": 2.6143155097961426, "eval_runtime": 8.8607, "eval_samples_per_second": 157.325, "eval_steps_per_second": 19.75, "step": 47600 }, { "epoch": 2777.945945945946, "grad_norm": 0.13678689301013947, "learning_rate": 1.234113712374582e-06, "loss": 2.6131, "step": 47700 }, { "epoch": 2777.945945945946, "eval_loss": 2.617353916168213, "eval_runtime": 8.5357, "eval_samples_per_second": 163.314, "eval_steps_per_second": 20.502, "step": 47700 }, { "epoch": 2783.6358463726883, "grad_norm": 0.12394748628139496, "learning_rate": 1.2240802675585284e-06, "loss": 2.6136, "step": 47800 }, { "epoch": 2783.6358463726883, "eval_loss": 2.6153721809387207, "eval_runtime": 8.548, "eval_samples_per_second": 163.08, "eval_steps_per_second": 20.473, "step": 47800 }, { "epoch": 2817.429587482219, "grad_norm": 0.15023942291736603, "learning_rate": 1.214046822742475e-06, "loss": 2.613, "step": 47900 }, { "epoch": 2817.429587482219, "eval_loss": 2.6161534786224365, "eval_runtime": 8.8571, "eval_samples_per_second": 157.388, "eval_steps_per_second": 19.758, "step": 47900 }, { "epoch": 2823.1194879089617, "grad_norm": 0.140534445643425, "learning_rate": 1.2040133779264215e-06, "loss": 2.6139, "step": 48000 }, { "epoch": 2823.1194879089617, "eval_loss": 2.614151954650879, "eval_runtime": 8.5312, "eval_samples_per_second": 163.4, "eval_steps_per_second": 20.513, "step": 48000 }, { "epoch": 2828.809388335704, "grad_norm": 0.1297474205493927, "learning_rate": 1.193979933110368e-06, "loss": 2.6131, "step": 48100 }, { "epoch": 2828.809388335704, "eval_loss": 2.612234354019165, "eval_runtime": 8.5265, "eval_samples_per_second": 163.489, "eval_steps_per_second": 20.524, "step": 48100 }, { "epoch": 2834.4992887624467, "grad_norm": 0.1272091567516327, "learning_rate": 1.1839464882943144e-06, "loss": 2.613, "step": 48200 }, { "epoch": 2834.4992887624467, "eval_loss": 2.617521047592163, "eval_runtime": 8.839, "eval_samples_per_second": 157.711, "eval_steps_per_second": 19.799, "step": 48200 }, { "epoch": 2840.189189189189, "grad_norm": 0.16200745105743408, "learning_rate": 1.173913043478261e-06, "loss": 2.6134, "step": 48300 }, { "epoch": 2840.189189189189, "eval_loss": 2.614422559738159, "eval_runtime": 8.8546, "eval_samples_per_second": 157.433, "eval_steps_per_second": 19.764, "step": 48300 }, { "epoch": 2845.8790896159317, "grad_norm": 0.13503460586071014, "learning_rate": 1.1638795986622075e-06, "loss": 2.6137, "step": 48400 }, { "epoch": 2845.8790896159317, "eval_loss": 2.612483263015747, "eval_runtime": 8.8623, "eval_samples_per_second": 157.295, "eval_steps_per_second": 19.747, "step": 48400 }, { "epoch": 2851.568990042674, "grad_norm": 0.1506689339876175, "learning_rate": 1.153846153846154e-06, "loss": 2.6139, "step": 48500 }, { "epoch": 2851.568990042674, "eval_loss": 2.614689588546753, "eval_runtime": 8.539, "eval_samples_per_second": 163.251, "eval_steps_per_second": 20.494, "step": 48500 }, { "epoch": 2857.2588904694167, "grad_norm": 0.13846616446971893, "learning_rate": 1.1438127090301004e-06, "loss": 2.6135, "step": 48600 }, { "epoch": 2857.2588904694167, "eval_loss": 2.612041711807251, "eval_runtime": 8.5482, "eval_samples_per_second": 163.075, "eval_steps_per_second": 20.472, "step": 48600 }, { "epoch": 2862.948790896159, "grad_norm": 0.12145441025495529, "learning_rate": 1.133779264214047e-06, "loss": 2.6134, "step": 48700 }, { "epoch": 2862.948790896159, "eval_loss": 2.614562749862671, "eval_runtime": 8.5346, "eval_samples_per_second": 163.336, "eval_steps_per_second": 20.505, "step": 48700 }, { "epoch": 2868.6386913229016, "grad_norm": 0.1398162990808487, "learning_rate": 1.1237458193979933e-06, "loss": 2.6135, "step": 48800 }, { "epoch": 2868.6386913229016, "eval_loss": 2.6151633262634277, "eval_runtime": 8.845, "eval_samples_per_second": 157.603, "eval_steps_per_second": 19.785, "step": 48800 }, { "epoch": 2874.3285917496446, "grad_norm": 0.13078400492668152, "learning_rate": 1.1137123745819398e-06, "loss": 2.6135, "step": 48900 }, { "epoch": 2874.3285917496446, "eval_loss": 2.612288236618042, "eval_runtime": 8.5278, "eval_samples_per_second": 163.466, "eval_steps_per_second": 20.521, "step": 48900 }, { "epoch": 2880.018492176387, "grad_norm": 0.14920541644096375, "learning_rate": 1.1036789297658862e-06, "loss": 2.6135, "step": 49000 }, { "epoch": 2880.018492176387, "eval_loss": 2.615710496902466, "eval_runtime": 8.5324, "eval_samples_per_second": 163.376, "eval_steps_per_second": 20.51, "step": 49000 }, { "epoch": 2885.7083926031296, "grad_norm": 0.12429507821798325, "learning_rate": 1.0936454849498327e-06, "loss": 2.6132, "step": 49100 }, { "epoch": 2885.7083926031296, "eval_loss": 2.6171202659606934, "eval_runtime": 8.526, "eval_samples_per_second": 163.5, "eval_steps_per_second": 20.526, "step": 49100 }, { "epoch": 2891.398293029872, "grad_norm": 0.14503461122512817, "learning_rate": 1.0836120401337793e-06, "loss": 2.6126, "step": 49200 }, { "epoch": 2891.398293029872, "eval_loss": 2.617884874343872, "eval_runtime": 8.8466, "eval_samples_per_second": 157.575, "eval_steps_per_second": 19.782, "step": 49200 }, { "epoch": 2897.0881934566146, "grad_norm": 0.1336805522441864, "learning_rate": 1.0735785953177258e-06, "loss": 2.6132, "step": 49300 }, { "epoch": 2897.0881934566146, "eval_loss": 2.6135501861572266, "eval_runtime": 8.5221, "eval_samples_per_second": 163.574, "eval_steps_per_second": 20.535, "step": 49300 }, { "epoch": 2902.778093883357, "grad_norm": 0.13340643048286438, "learning_rate": 1.0635451505016722e-06, "loss": 2.6132, "step": 49400 }, { "epoch": 2902.778093883357, "eval_loss": 2.6155478954315186, "eval_runtime": 8.5297, "eval_samples_per_second": 163.428, "eval_steps_per_second": 20.516, "step": 49400 }, { "epoch": 2908.4679943100996, "grad_norm": 0.12103159725666046, "learning_rate": 1.0535117056856187e-06, "loss": 2.6128, "step": 49500 }, { "epoch": 2908.4679943100996, "eval_loss": 2.614527702331543, "eval_runtime": 8.5244, "eval_samples_per_second": 163.531, "eval_steps_per_second": 20.529, "step": 49500 }, { "epoch": 2914.157894736842, "grad_norm": 0.13566209375858307, "learning_rate": 1.0434782608695653e-06, "loss": 2.6131, "step": 49600 }, { "epoch": 2914.157894736842, "eval_loss": 2.6156363487243652, "eval_runtime": 8.8443, "eval_samples_per_second": 157.615, "eval_steps_per_second": 19.787, "step": 49600 }, { "epoch": 2919.8477951635846, "grad_norm": 0.14300596714019775, "learning_rate": 1.0334448160535118e-06, "loss": 2.6133, "step": 49700 }, { "epoch": 2919.8477951635846, "eval_loss": 2.614346742630005, "eval_runtime": 8.5557, "eval_samples_per_second": 162.933, "eval_steps_per_second": 20.454, "step": 49700 }, { "epoch": 2925.537695590327, "grad_norm": 0.1305309683084488, "learning_rate": 1.0234113712374581e-06, "loss": 2.6127, "step": 49800 }, { "epoch": 2925.537695590327, "eval_loss": 2.617161512374878, "eval_runtime": 8.5486, "eval_samples_per_second": 163.068, "eval_steps_per_second": 20.471, "step": 49800 }, { "epoch": 2931.2275960170696, "grad_norm": 0.12761159241199493, "learning_rate": 1.0133779264214047e-06, "loss": 2.6131, "step": 49900 }, { "epoch": 2931.2275960170696, "eval_loss": 2.609426259994507, "eval_runtime": 8.5282, "eval_samples_per_second": 163.459, "eval_steps_per_second": 20.52, "step": 49900 }, { "epoch": 2936.917496443812, "grad_norm": 0.14436037838459015, "learning_rate": 1.0033444816053512e-06, "loss": 2.6129, "step": 50000 }, { "epoch": 2936.917496443812, "eval_loss": 2.6124675273895264, "eval_runtime": 8.8553, "eval_samples_per_second": 157.419, "eval_steps_per_second": 19.762, "step": 50000 }, { "epoch": 2942.6073968705546, "grad_norm": 0.14199206233024597, "learning_rate": 9.933110367892976e-07, "loss": 2.6135, "step": 50100 }, { "epoch": 2942.6073968705546, "eval_loss": 2.6135404109954834, "eval_runtime": 8.5286, "eval_samples_per_second": 163.449, "eval_steps_per_second": 20.519, "step": 50100 }, { "epoch": 2948.2972972972975, "grad_norm": 0.13962940871715546, "learning_rate": 9.832775919732441e-07, "loss": 2.6124, "step": 50200 }, { "epoch": 2948.2972972972975, "eval_loss": 2.6167821884155273, "eval_runtime": 8.5303, "eval_samples_per_second": 163.418, "eval_steps_per_second": 20.515, "step": 50200 }, { "epoch": 2953.98719772404, "grad_norm": 0.13427576422691345, "learning_rate": 9.732441471571907e-07, "loss": 2.6131, "step": 50300 }, { "epoch": 2953.98719772404, "eval_loss": 2.614187240600586, "eval_runtime": 8.5272, "eval_samples_per_second": 163.476, "eval_steps_per_second": 20.522, "step": 50300 }, { "epoch": 2959.6770981507825, "grad_norm": 0.14102576673030853, "learning_rate": 9.632107023411372e-07, "loss": 2.613, "step": 50400 }, { "epoch": 2959.6770981507825, "eval_loss": 2.6135125160217285, "eval_runtime": 8.8525, "eval_samples_per_second": 157.47, "eval_steps_per_second": 19.769, "step": 50400 }, { "epoch": 2965.366998577525, "grad_norm": 0.13503779470920563, "learning_rate": 9.531772575250837e-07, "loss": 2.6128, "step": 50500 }, { "epoch": 2965.366998577525, "eval_loss": 2.616763114929199, "eval_runtime": 8.5222, "eval_samples_per_second": 163.573, "eval_steps_per_second": 20.535, "step": 50500 }, { "epoch": 2971.0568990042675, "grad_norm": 0.11660658568143845, "learning_rate": 9.431438127090301e-07, "loss": 2.6132, "step": 50600 }, { "epoch": 2971.0568990042675, "eval_loss": 2.6096742153167725, "eval_runtime": 8.5204, "eval_samples_per_second": 163.606, "eval_steps_per_second": 20.539, "step": 50600 }, { "epoch": 2976.74679943101, "grad_norm": 0.11942931264638901, "learning_rate": 9.331103678929767e-07, "loss": 2.6127, "step": 50700 }, { "epoch": 2976.74679943101, "eval_loss": 2.6175696849823, "eval_runtime": 8.5208, "eval_samples_per_second": 163.599, "eval_steps_per_second": 20.538, "step": 50700 }, { "epoch": 2982.4366998577525, "grad_norm": 0.13427217304706573, "learning_rate": 9.230769230769231e-07, "loss": 2.6129, "step": 50800 }, { "epoch": 2982.4366998577525, "eval_loss": 2.617108106613159, "eval_runtime": 8.8462, "eval_samples_per_second": 157.582, "eval_steps_per_second": 19.783, "step": 50800 }, { "epoch": 2988.126600284495, "grad_norm": 0.13947026431560516, "learning_rate": 9.130434782608697e-07, "loss": 2.6128, "step": 50900 }, { "epoch": 2988.126600284495, "eval_loss": 2.614734411239624, "eval_runtime": 8.5215, "eval_samples_per_second": 163.586, "eval_steps_per_second": 20.536, "step": 50900 }, { "epoch": 2993.8165007112375, "grad_norm": 0.12719608843326569, "learning_rate": 9.030100334448161e-07, "loss": 2.6132, "step": 51000 }, { "epoch": 2993.8165007112375, "eval_loss": 2.6145222187042236, "eval_runtime": 8.5246, "eval_samples_per_second": 163.526, "eval_steps_per_second": 20.529, "step": 51000 }, { "epoch": 2999.50640113798, "grad_norm": 0.13431696593761444, "learning_rate": 8.929765886287627e-07, "loss": 2.613, "step": 51100 }, { "epoch": 2999.50640113798, "eval_loss": 2.615795850753784, "eval_runtime": 8.5247, "eval_samples_per_second": 163.525, "eval_steps_per_second": 20.529, "step": 51100 }, { "epoch": 3005.1963015647225, "grad_norm": 0.122039295732975, "learning_rate": 8.829431438127091e-07, "loss": 2.6133, "step": 51200 }, { "epoch": 3005.1963015647225, "eval_loss": 2.613452434539795, "eval_runtime": 8.8592, "eval_samples_per_second": 157.35, "eval_steps_per_second": 19.753, "step": 51200 }, { "epoch": 3010.886201991465, "grad_norm": 0.12794600427150726, "learning_rate": 8.729096989966555e-07, "loss": 2.6125, "step": 51300 }, { "epoch": 3010.886201991465, "eval_loss": 2.613462209701538, "eval_runtime": 8.5292, "eval_samples_per_second": 163.439, "eval_steps_per_second": 20.518, "step": 51300 }, { "epoch": 3016.5761024182075, "grad_norm": 0.15235668420791626, "learning_rate": 8.628762541806019e-07, "loss": 2.6128, "step": 51400 }, { "epoch": 3016.5761024182075, "eval_loss": 2.6159415245056152, "eval_runtime": 8.5195, "eval_samples_per_second": 163.624, "eval_steps_per_second": 20.541, "step": 51400 }, { "epoch": 3022.2660028449504, "grad_norm": 0.1353672742843628, "learning_rate": 8.528428093645485e-07, "loss": 2.6126, "step": 51500 }, { "epoch": 3022.2660028449504, "eval_loss": 2.6131489276885986, "eval_runtime": 8.5212, "eval_samples_per_second": 163.593, "eval_steps_per_second": 20.537, "step": 51500 }, { "epoch": 3027.955903271693, "grad_norm": 0.1265411078929901, "learning_rate": 8.428093645484949e-07, "loss": 2.6127, "step": 51600 }, { "epoch": 3027.955903271693, "eval_loss": 2.6140012741088867, "eval_runtime": 8.8546, "eval_samples_per_second": 157.432, "eval_steps_per_second": 19.764, "step": 51600 }, { "epoch": 3033.6458036984354, "grad_norm": 0.12123577296733856, "learning_rate": 8.327759197324414e-07, "loss": 2.6123, "step": 51700 }, { "epoch": 3033.6458036984354, "eval_loss": 2.617744207382202, "eval_runtime": 8.5337, "eval_samples_per_second": 163.352, "eval_steps_per_second": 20.507, "step": 51700 }, { "epoch": 3039.335704125178, "grad_norm": 0.13425582647323608, "learning_rate": 8.227424749163879e-07, "loss": 2.6128, "step": 51800 }, { "epoch": 3039.335704125178, "eval_loss": 2.6144909858703613, "eval_runtime": 8.5251, "eval_samples_per_second": 163.518, "eval_steps_per_second": 20.528, "step": 51800 }, { "epoch": 3045.0256045519204, "grad_norm": 0.12807567417621613, "learning_rate": 8.127090301003344e-07, "loss": 2.6127, "step": 51900 }, { "epoch": 3045.0256045519204, "eval_loss": 2.6116931438446045, "eval_runtime": 8.5237, "eval_samples_per_second": 163.545, "eval_steps_per_second": 20.531, "step": 51900 }, { "epoch": 3050.715504978663, "grad_norm": 0.13802653551101685, "learning_rate": 8.026755852842809e-07, "loss": 2.6127, "step": 52000 }, { "epoch": 3050.715504978663, "eval_loss": 2.613800287246704, "eval_runtime": 8.8567, "eval_samples_per_second": 157.395, "eval_steps_per_second": 19.759, "step": 52000 }, { "epoch": 3056.4054054054054, "grad_norm": 0.13295966386795044, "learning_rate": 7.926421404682274e-07, "loss": 2.6129, "step": 52100 }, { "epoch": 3056.4054054054054, "eval_loss": 2.614452600479126, "eval_runtime": 8.5253, "eval_samples_per_second": 163.513, "eval_steps_per_second": 20.527, "step": 52100 }, { "epoch": 3062.095305832148, "grad_norm": 0.1325223296880722, "learning_rate": 7.826086956521739e-07, "loss": 2.6128, "step": 52200 }, { "epoch": 3062.095305832148, "eval_loss": 2.6134650707244873, "eval_runtime": 8.5247, "eval_samples_per_second": 163.525, "eval_steps_per_second": 20.529, "step": 52200 }, { "epoch": 3067.7852062588904, "grad_norm": 0.13717898726463318, "learning_rate": 7.725752508361204e-07, "loss": 2.613, "step": 52300 }, { "epoch": 3067.7852062588904, "eval_loss": 2.614713430404663, "eval_runtime": 8.5281, "eval_samples_per_second": 163.459, "eval_steps_per_second": 20.52, "step": 52300 }, { "epoch": 3073.475106685633, "grad_norm": 0.1319703459739685, "learning_rate": 7.625418060200669e-07, "loss": 2.6121, "step": 52400 }, { "epoch": 3073.475106685633, "eval_loss": 2.615602970123291, "eval_runtime": 8.8484, "eval_samples_per_second": 157.542, "eval_steps_per_second": 19.777, "step": 52400 }, { "epoch": 3079.1650071123754, "grad_norm": 0.14499501883983612, "learning_rate": 7.525083612040134e-07, "loss": 2.6127, "step": 52500 }, { "epoch": 3079.1650071123754, "eval_loss": 2.61905574798584, "eval_runtime": 8.5283, "eval_samples_per_second": 163.455, "eval_steps_per_second": 20.52, "step": 52500 }, { "epoch": 3084.854907539118, "grad_norm": 0.12991563975811005, "learning_rate": 7.424749163879599e-07, "loss": 2.6125, "step": 52600 }, { "epoch": 3084.854907539118, "eval_loss": 2.616900682449341, "eval_runtime": 8.5227, "eval_samples_per_second": 163.563, "eval_steps_per_second": 20.533, "step": 52600 }, { "epoch": 3090.5448079658604, "grad_norm": 0.13655343651771545, "learning_rate": 7.324414715719064e-07, "loss": 2.6125, "step": 52700 }, { "epoch": 3090.5448079658604, "eval_loss": 2.6136441230773926, "eval_runtime": 8.8771, "eval_samples_per_second": 157.033, "eval_steps_per_second": 19.714, "step": 52700 }, { "epoch": 3096.2347083926034, "grad_norm": 0.1371728628873825, "learning_rate": 7.224080267558529e-07, "loss": 2.6125, "step": 52800 }, { "epoch": 3096.2347083926034, "eval_loss": 2.6113550662994385, "eval_runtime": 8.5351, "eval_samples_per_second": 163.325, "eval_steps_per_second": 20.504, "step": 52800 }, { "epoch": 3101.924608819346, "grad_norm": 0.12911546230316162, "learning_rate": 7.123745819397994e-07, "loss": 2.6125, "step": 52900 }, { "epoch": 3101.924608819346, "eval_loss": 2.61665940284729, "eval_runtime": 8.8673, "eval_samples_per_second": 157.206, "eval_steps_per_second": 19.735, "step": 52900 }, { "epoch": 3107.6145092460883, "grad_norm": 0.13488008081912994, "learning_rate": 7.023411371237459e-07, "loss": 2.6125, "step": 53000 }, { "epoch": 3107.6145092460883, "eval_loss": 2.616480827331543, "eval_runtime": 8.5263, "eval_samples_per_second": 163.495, "eval_steps_per_second": 20.525, "step": 53000 }, { "epoch": 3113.304409672831, "grad_norm": 0.1279713660478592, "learning_rate": 6.923076923076923e-07, "loss": 2.6126, "step": 53100 }, { "epoch": 3113.304409672831, "eval_loss": 2.612837553024292, "eval_runtime": 8.8593, "eval_samples_per_second": 157.348, "eval_steps_per_second": 19.753, "step": 53100 }, { "epoch": 3118.9943100995733, "grad_norm": 0.11780209094285965, "learning_rate": 6.822742474916388e-07, "loss": 2.6129, "step": 53200 }, { "epoch": 3118.9943100995733, "eval_loss": 2.6156790256500244, "eval_runtime": 8.5184, "eval_samples_per_second": 163.645, "eval_steps_per_second": 20.544, "step": 53200 }, { "epoch": 3124.684210526316, "grad_norm": 0.1243632510304451, "learning_rate": 6.722408026755853e-07, "loss": 2.6131, "step": 53300 }, { "epoch": 3124.684210526316, "eval_loss": 2.6163458824157715, "eval_runtime": 8.8667, "eval_samples_per_second": 157.217, "eval_steps_per_second": 19.737, "step": 53300 }, { "epoch": 3130.3741109530583, "grad_norm": 0.13544081151485443, "learning_rate": 6.622073578595318e-07, "loss": 2.6124, "step": 53400 }, { "epoch": 3130.3741109530583, "eval_loss": 2.6154003143310547, "eval_runtime": 8.521, "eval_samples_per_second": 163.596, "eval_steps_per_second": 20.538, "step": 53400 }, { "epoch": 3136.064011379801, "grad_norm": 0.14009779691696167, "learning_rate": 6.521739130434783e-07, "loss": 2.6129, "step": 53500 }, { "epoch": 3136.064011379801, "eval_loss": 2.6123223304748535, "eval_runtime": 8.5157, "eval_samples_per_second": 163.698, "eval_steps_per_second": 20.55, "step": 53500 }, { "epoch": 3141.7539118065433, "grad_norm": 0.12656356394290924, "learning_rate": 6.421404682274248e-07, "loss": 2.6122, "step": 53600 }, { "epoch": 3141.7539118065433, "eval_loss": 2.6149022579193115, "eval_runtime": 8.5183, "eval_samples_per_second": 163.648, "eval_steps_per_second": 20.544, "step": 53600 }, { "epoch": 3147.443812233286, "grad_norm": 0.1256483644247055, "learning_rate": 6.321070234113712e-07, "loss": 2.6121, "step": 53700 }, { "epoch": 3147.443812233286, "eval_loss": 2.613800048828125, "eval_runtime": 8.8545, "eval_samples_per_second": 157.434, "eval_steps_per_second": 19.764, "step": 53700 }, { "epoch": 3153.1337126600283, "grad_norm": 0.11175887286663055, "learning_rate": 6.220735785953178e-07, "loss": 2.6123, "step": 53800 }, { "epoch": 3153.1337126600283, "eval_loss": 2.6164095401763916, "eval_runtime": 8.8596, "eval_samples_per_second": 157.343, "eval_steps_per_second": 19.753, "step": 53800 }, { "epoch": 3158.823613086771, "grad_norm": 0.12376561760902405, "learning_rate": 6.120401337792642e-07, "loss": 2.6125, "step": 53900 }, { "epoch": 3158.823613086771, "eval_loss": 2.612550973892212, "eval_runtime": 8.5335, "eval_samples_per_second": 163.356, "eval_steps_per_second": 20.507, "step": 53900 }, { "epoch": 3164.5135135135133, "grad_norm": 0.12542764842510223, "learning_rate": 6.020066889632107e-07, "loss": 2.612, "step": 54000 }, { "epoch": 3164.5135135135133, "eval_loss": 2.614248037338257, "eval_runtime": 8.5367, "eval_samples_per_second": 163.296, "eval_steps_per_second": 20.5, "step": 54000 }, { "epoch": 3170.2034139402563, "grad_norm": 0.12020324170589447, "learning_rate": 5.919732441471572e-07, "loss": 2.6123, "step": 54100 }, { "epoch": 3170.2034139402563, "eval_loss": 2.615945339202881, "eval_runtime": 8.8835, "eval_samples_per_second": 156.92, "eval_steps_per_second": 19.699, "step": 54100 }, { "epoch": 3175.8933143669988, "grad_norm": 0.13160724937915802, "learning_rate": 5.819397993311037e-07, "loss": 2.6125, "step": 54200 }, { "epoch": 3175.8933143669988, "eval_loss": 2.612717628479004, "eval_runtime": 8.533, "eval_samples_per_second": 163.365, "eval_steps_per_second": 20.509, "step": 54200 }, { "epoch": 3181.5832147937413, "grad_norm": 0.11064854264259338, "learning_rate": 5.719063545150502e-07, "loss": 2.6127, "step": 54300 }, { "epoch": 3181.5832147937413, "eval_loss": 2.6137535572052, "eval_runtime": 8.5351, "eval_samples_per_second": 163.326, "eval_steps_per_second": 20.504, "step": 54300 }, { "epoch": 3187.2731152204838, "grad_norm": 0.13410420715808868, "learning_rate": 5.618729096989966e-07, "loss": 2.6125, "step": 54400 }, { "epoch": 3187.2731152204838, "eval_loss": 2.6163814067840576, "eval_runtime": 8.869, "eval_samples_per_second": 157.176, "eval_steps_per_second": 19.732, "step": 54400 }, { "epoch": 3192.9630156472263, "grad_norm": 0.13291259109973907, "learning_rate": 5.518394648829431e-07, "loss": 2.6125, "step": 54500 }, { "epoch": 3192.9630156472263, "eval_loss": 2.6127355098724365, "eval_runtime": 8.5356, "eval_samples_per_second": 163.315, "eval_steps_per_second": 20.502, "step": 54500 }, { "epoch": 3198.6529160739688, "grad_norm": 0.1289217323064804, "learning_rate": 5.418060200668896e-07, "loss": 2.6122, "step": 54600 }, { "epoch": 3198.6529160739688, "eval_loss": 2.613924503326416, "eval_runtime": 8.869, "eval_samples_per_second": 157.176, "eval_steps_per_second": 19.732, "step": 54600 }, { "epoch": 3204.3428165007113, "grad_norm": 0.12402568757534027, "learning_rate": 5.317725752508361e-07, "loss": 2.6125, "step": 54700 }, { "epoch": 3204.3428165007113, "eval_loss": 2.6189987659454346, "eval_runtime": 8.5185, "eval_samples_per_second": 163.643, "eval_steps_per_second": 20.543, "step": 54700 }, { "epoch": 3210.0327169274537, "grad_norm": 0.11996253579854965, "learning_rate": 5.217391304347826e-07, "loss": 2.6128, "step": 54800 }, { "epoch": 3210.0327169274537, "eval_loss": 2.6130542755126953, "eval_runtime": 8.5256, "eval_samples_per_second": 163.508, "eval_steps_per_second": 20.526, "step": 54800 }, { "epoch": 3215.7226173541962, "grad_norm": 0.1303112506866455, "learning_rate": 5.117056856187291e-07, "loss": 2.6125, "step": 54900 }, { "epoch": 3215.7226173541962, "eval_loss": 2.614591598510742, "eval_runtime": 9.1311, "eval_samples_per_second": 152.664, "eval_steps_per_second": 19.165, "step": 54900 }, { "epoch": 3221.4125177809387, "grad_norm": 0.13109813630580902, "learning_rate": 5.016722408026756e-07, "loss": 2.612, "step": 55000 }, { "epoch": 3221.4125177809387, "eval_loss": 2.6141510009765625, "eval_runtime": 8.5232, "eval_samples_per_second": 163.554, "eval_steps_per_second": 20.532, "step": 55000 }, { "epoch": 3227.1024182076812, "grad_norm": 0.11694065481424332, "learning_rate": 4.916387959866221e-07, "loss": 2.6118, "step": 55100 }, { "epoch": 3227.1024182076812, "eval_loss": 2.613607406616211, "eval_runtime": 8.5214, "eval_samples_per_second": 163.588, "eval_steps_per_second": 20.537, "step": 55100 }, { "epoch": 3232.7923186344237, "grad_norm": 0.126685231924057, "learning_rate": 4.816053511705686e-07, "loss": 2.6121, "step": 55200 }, { "epoch": 3232.7923186344237, "eval_loss": 2.617375373840332, "eval_runtime": 8.5254, "eval_samples_per_second": 163.511, "eval_steps_per_second": 20.527, "step": 55200 }, { "epoch": 3238.4822190611662, "grad_norm": 0.1280149221420288, "learning_rate": 4.7157190635451506e-07, "loss": 2.6126, "step": 55300 }, { "epoch": 3238.4822190611662, "eval_loss": 2.6150975227355957, "eval_runtime": 8.8573, "eval_samples_per_second": 157.383, "eval_steps_per_second": 19.758, "step": 55300 }, { "epoch": 3244.172119487909, "grad_norm": 0.13586066663265228, "learning_rate": 4.6153846153846156e-07, "loss": 2.6121, "step": 55400 }, { "epoch": 3244.172119487909, "eval_loss": 2.613374710083008, "eval_runtime": 8.5339, "eval_samples_per_second": 163.349, "eval_steps_per_second": 20.506, "step": 55400 }, { "epoch": 3249.8620199146517, "grad_norm": 0.13014060258865356, "learning_rate": 4.5150501672240806e-07, "loss": 2.6122, "step": 55500 }, { "epoch": 3249.8620199146517, "eval_loss": 2.6121749877929688, "eval_runtime": 8.8553, "eval_samples_per_second": 157.419, "eval_steps_per_second": 19.762, "step": 55500 }, { "epoch": 3255.551920341394, "grad_norm": 0.1337248831987381, "learning_rate": 4.4147157190635456e-07, "loss": 2.6115, "step": 55600 }, { "epoch": 3255.551920341394, "eval_loss": 2.6143338680267334, "eval_runtime": 8.532, "eval_samples_per_second": 163.385, "eval_steps_per_second": 20.511, "step": 55600 }, { "epoch": 3261.2418207681367, "grad_norm": 0.12295526266098022, "learning_rate": 4.3143812709030095e-07, "loss": 2.6128, "step": 55700 }, { "epoch": 3261.2418207681367, "eval_loss": 2.6155290603637695, "eval_runtime": 8.5401, "eval_samples_per_second": 163.23, "eval_steps_per_second": 20.492, "step": 55700 }, { "epoch": 3266.931721194879, "grad_norm": 0.13388285040855408, "learning_rate": 4.2140468227424745e-07, "loss": 2.6121, "step": 55800 }, { "epoch": 3266.931721194879, "eval_loss": 2.6157045364379883, "eval_runtime": 8.8687, "eval_samples_per_second": 157.182, "eval_steps_per_second": 19.732, "step": 55800 }, { "epoch": 3272.6216216216217, "grad_norm": 0.1304856538772583, "learning_rate": 4.1137123745819395e-07, "loss": 2.6119, "step": 55900 }, { "epoch": 3272.6216216216217, "eval_loss": 2.614722490310669, "eval_runtime": 8.5288, "eval_samples_per_second": 163.446, "eval_steps_per_second": 20.519, "step": 55900 }, { "epoch": 3278.311522048364, "grad_norm": 0.13436032831668854, "learning_rate": 4.0133779264214045e-07, "loss": 2.6123, "step": 56000 }, { "epoch": 3278.311522048364, "eval_loss": 2.613041639328003, "eval_runtime": 8.5278, "eval_samples_per_second": 163.466, "eval_steps_per_second": 20.521, "step": 56000 }, { "epoch": 3284.0014224751067, "grad_norm": 0.14674031734466553, "learning_rate": 3.9130434782608694e-07, "loss": 2.6122, "step": 56100 }, { "epoch": 3284.0014224751067, "eval_loss": 2.611990213394165, "eval_runtime": 8.7786, "eval_samples_per_second": 158.795, "eval_steps_per_second": 19.935, "step": 56100 }, { "epoch": 3289.691322901849, "grad_norm": 0.1269470900297165, "learning_rate": 3.8127090301003344e-07, "loss": 2.6119, "step": 56200 }, { "epoch": 3289.691322901849, "eval_loss": 2.614060878753662, "eval_runtime": 8.5989, "eval_samples_per_second": 162.113, "eval_steps_per_second": 20.351, "step": 56200 }, { "epoch": 3295.3812233285917, "grad_norm": 0.13767366111278534, "learning_rate": 3.7123745819397994e-07, "loss": 2.6121, "step": 56300 }, { "epoch": 3295.3812233285917, "eval_loss": 2.616547107696533, "eval_runtime": 8.5426, "eval_samples_per_second": 163.181, "eval_steps_per_second": 20.485, "step": 56300 }, { "epoch": 3301.071123755334, "grad_norm": 0.13906554877758026, "learning_rate": 3.6120401337792644e-07, "loss": 2.6121, "step": 56400 }, { "epoch": 3301.071123755334, "eval_loss": 2.6139421463012695, "eval_runtime": 8.5234, "eval_samples_per_second": 163.55, "eval_steps_per_second": 20.532, "step": 56400 }, { "epoch": 3306.7610241820767, "grad_norm": 0.13832303881645203, "learning_rate": 3.5117056856187294e-07, "loss": 2.612, "step": 56500 }, { "epoch": 3306.7610241820767, "eval_loss": 2.6141018867492676, "eval_runtime": 8.8522, "eval_samples_per_second": 157.475, "eval_steps_per_second": 19.769, "step": 56500 }, { "epoch": 3312.450924608819, "grad_norm": 0.13443072140216827, "learning_rate": 3.411371237458194e-07, "loss": 2.6127, "step": 56600 }, { "epoch": 3312.450924608819, "eval_loss": 2.6130294799804688, "eval_runtime": 8.5356, "eval_samples_per_second": 163.317, "eval_steps_per_second": 20.502, "step": 56600 }, { "epoch": 3318.140825035562, "grad_norm": 0.1384400725364685, "learning_rate": 3.311036789297659e-07, "loss": 2.6125, "step": 56700 }, { "epoch": 3318.140825035562, "eval_loss": 2.614971876144409, "eval_runtime": 8.5276, "eval_samples_per_second": 163.468, "eval_steps_per_second": 20.521, "step": 56700 }, { "epoch": 3323.8307254623046, "grad_norm": 0.12781038880348206, "learning_rate": 3.210702341137124e-07, "loss": 2.6119, "step": 56800 }, { "epoch": 3323.8307254623046, "eval_loss": 2.6149492263793945, "eval_runtime": 8.5244, "eval_samples_per_second": 163.531, "eval_steps_per_second": 20.529, "step": 56800 }, { "epoch": 3329.520625889047, "grad_norm": 0.13229794800281525, "learning_rate": 3.110367892976589e-07, "loss": 2.6114, "step": 56900 }, { "epoch": 3329.520625889047, "eval_loss": 2.620450019836426, "eval_runtime": 8.8571, "eval_samples_per_second": 157.388, "eval_steps_per_second": 19.758, "step": 56900 }, { "epoch": 3335.2105263157896, "grad_norm": 0.13062149286270142, "learning_rate": 3.010033444816054e-07, "loss": 2.6123, "step": 57000 }, { "epoch": 3335.2105263157896, "eval_loss": 2.6148345470428467, "eval_runtime": 8.5245, "eval_samples_per_second": 163.528, "eval_steps_per_second": 20.529, "step": 57000 }, { "epoch": 3340.900426742532, "grad_norm": 0.1294122189283371, "learning_rate": 2.9096989966555187e-07, "loss": 2.6121, "step": 57100 }, { "epoch": 3340.900426742532, "eval_loss": 2.6161153316497803, "eval_runtime": 8.5288, "eval_samples_per_second": 163.446, "eval_steps_per_second": 20.519, "step": 57100 }, { "epoch": 3346.5903271692746, "grad_norm": 0.1416897028684616, "learning_rate": 2.809364548494983e-07, "loss": 2.6121, "step": 57200 }, { "epoch": 3346.5903271692746, "eval_loss": 2.610884428024292, "eval_runtime": 8.8659, "eval_samples_per_second": 157.232, "eval_steps_per_second": 19.739, "step": 57200 }, { "epoch": 3352.280227596017, "grad_norm": 0.13414239883422852, "learning_rate": 2.709030100334448e-07, "loss": 2.6117, "step": 57300 }, { "epoch": 3352.280227596017, "eval_loss": 2.613905906677246, "eval_runtime": 8.523, "eval_samples_per_second": 163.557, "eval_steps_per_second": 20.533, "step": 57300 }, { "epoch": 3357.9701280227596, "grad_norm": 0.11113996803760529, "learning_rate": 2.608695652173913e-07, "loss": 2.6123, "step": 57400 }, { "epoch": 3357.9701280227596, "eval_loss": 2.6124770641326904, "eval_runtime": 8.5319, "eval_samples_per_second": 163.387, "eval_steps_per_second": 20.511, "step": 57400 }, { "epoch": 3363.660028449502, "grad_norm": 0.12642131745815277, "learning_rate": 2.508361204013378e-07, "loss": 2.6121, "step": 57500 }, { "epoch": 3363.660028449502, "eval_loss": 2.6125006675720215, "eval_runtime": 8.5321, "eval_samples_per_second": 163.382, "eval_steps_per_second": 20.511, "step": 57500 }, { "epoch": 3369.3499288762446, "grad_norm": 0.12002536654472351, "learning_rate": 2.408026755852843e-07, "loss": 2.6119, "step": 57600 }, { "epoch": 3369.3499288762446, "eval_loss": 2.6137232780456543, "eval_runtime": 8.8563, "eval_samples_per_second": 157.402, "eval_steps_per_second": 19.76, "step": 57600 }, { "epoch": 3375.039829302987, "grad_norm": 0.12281110137701035, "learning_rate": 2.3076923076923078e-07, "loss": 2.6117, "step": 57700 }, { "epoch": 3375.039829302987, "eval_loss": 2.6145715713500977, "eval_runtime": 8.5313, "eval_samples_per_second": 163.399, "eval_steps_per_second": 20.513, "step": 57700 }, { "epoch": 3380.7297297297296, "grad_norm": 0.14482566714286804, "learning_rate": 2.2073578595317728e-07, "loss": 2.6118, "step": 57800 }, { "epoch": 3380.7297297297296, "eval_loss": 2.613739490509033, "eval_runtime": 8.5223, "eval_samples_per_second": 163.572, "eval_steps_per_second": 20.534, "step": 57800 }, { "epoch": 3386.419630156472, "grad_norm": 0.1368427276611328, "learning_rate": 2.1070234113712372e-07, "loss": 2.6119, "step": 57900 }, { "epoch": 3386.419630156472, "eval_loss": 2.6152491569519043, "eval_runtime": 8.8583, "eval_samples_per_second": 157.366, "eval_steps_per_second": 19.755, "step": 57900 }, { "epoch": 3392.109530583215, "grad_norm": 0.13695128262043, "learning_rate": 2.0066889632107022e-07, "loss": 2.6116, "step": 58000 }, { "epoch": 3392.109530583215, "eval_loss": 2.6153528690338135, "eval_runtime": 8.5368, "eval_samples_per_second": 163.292, "eval_steps_per_second": 20.499, "step": 58000 }, { "epoch": 3397.7994310099575, "grad_norm": 0.11453160643577576, "learning_rate": 1.9063545150501672e-07, "loss": 2.612, "step": 58100 }, { "epoch": 3397.7994310099575, "eval_loss": 2.615257740020752, "eval_runtime": 8.5272, "eval_samples_per_second": 163.477, "eval_steps_per_second": 20.523, "step": 58100 }, { "epoch": 3403.4893314367, "grad_norm": 0.13847880065441132, "learning_rate": 1.8060200668896322e-07, "loss": 2.6125, "step": 58200 }, { "epoch": 3403.4893314367, "eval_loss": 2.6181981563568115, "eval_runtime": 8.8493, "eval_samples_per_second": 157.527, "eval_steps_per_second": 19.776, "step": 58200 }, { "epoch": 3409.1792318634425, "grad_norm": 0.13308827579021454, "learning_rate": 1.705685618729097e-07, "loss": 2.6124, "step": 58300 }, { "epoch": 3409.1792318634425, "eval_loss": 2.6120095252990723, "eval_runtime": 8.5246, "eval_samples_per_second": 163.526, "eval_steps_per_second": 20.529, "step": 58300 }, { "epoch": 3414.869132290185, "grad_norm": 0.13217765092849731, "learning_rate": 1.605351170568562e-07, "loss": 2.6117, "step": 58400 }, { "epoch": 3414.869132290185, "eval_loss": 2.6182873249053955, "eval_runtime": 8.5321, "eval_samples_per_second": 163.384, "eval_steps_per_second": 20.511, "step": 58400 }, { "epoch": 3420.5590327169275, "grad_norm": 0.13483327627182007, "learning_rate": 1.505016722408027e-07, "loss": 2.6122, "step": 58500 }, { "epoch": 3420.5590327169275, "eval_loss": 2.614924430847168, "eval_runtime": 8.8582, "eval_samples_per_second": 157.368, "eval_steps_per_second": 19.756, "step": 58500 }, { "epoch": 3426.24893314367, "grad_norm": 0.13368582725524902, "learning_rate": 1.4046822742474916e-07, "loss": 2.6123, "step": 58600 }, { "epoch": 3426.24893314367, "eval_loss": 2.613650321960449, "eval_runtime": 8.5412, "eval_samples_per_second": 163.209, "eval_steps_per_second": 20.489, "step": 58600 }, { "epoch": 3431.9388335704125, "grad_norm": 0.13080868124961853, "learning_rate": 1.3043478260869566e-07, "loss": 2.612, "step": 58700 }, { "epoch": 3431.9388335704125, "eval_loss": 2.61566162109375, "eval_runtime": 8.8538, "eval_samples_per_second": 157.447, "eval_steps_per_second": 19.766, "step": 58700 }, { "epoch": 3437.628733997155, "grad_norm": 0.1235753670334816, "learning_rate": 1.2040133779264215e-07, "loss": 2.6127, "step": 58800 }, { "epoch": 3437.628733997155, "eval_loss": 2.612259864807129, "eval_runtime": 8.5258, "eval_samples_per_second": 163.503, "eval_steps_per_second": 20.526, "step": 58800 }, { "epoch": 3443.3186344238975, "grad_norm": 0.13656386733055115, "learning_rate": 1.1036789297658864e-07, "loss": 2.6119, "step": 58900 }, { "epoch": 3443.3186344238975, "eval_loss": 2.615288257598877, "eval_runtime": 8.8549, "eval_samples_per_second": 157.428, "eval_steps_per_second": 19.763, "step": 58900 }, { "epoch": 3449.00853485064, "grad_norm": 0.13021564483642578, "learning_rate": 1.0033444816053511e-07, "loss": 2.6123, "step": 59000 }, { "epoch": 3449.00853485064, "eval_loss": 2.6138880252838135, "eval_runtime": 8.6443, "eval_samples_per_second": 161.263, "eval_steps_per_second": 20.245, "step": 59000 }, { "epoch": 3454.6984352773825, "grad_norm": 0.12730829417705536, "learning_rate": 9.030100334448161e-08, "loss": 2.6119, "step": 59100 }, { "epoch": 3454.6984352773825, "eval_loss": 2.6168129444122314, "eval_runtime": 8.8546, "eval_samples_per_second": 157.432, "eval_steps_per_second": 19.764, "step": 59100 }, { "epoch": 3460.388335704125, "grad_norm": 0.13064709305763245, "learning_rate": 8.02675585284281e-08, "loss": 2.6119, "step": 59200 }, { "epoch": 3460.388335704125, "eval_loss": 2.616276502609253, "eval_runtime": 8.5216, "eval_samples_per_second": 163.585, "eval_steps_per_second": 20.536, "step": 59200 }, { "epoch": 3466.078236130868, "grad_norm": 0.13055041432380676, "learning_rate": 7.023411371237458e-08, "loss": 2.6118, "step": 59300 }, { "epoch": 3466.078236130868, "eval_loss": 2.614774227142334, "eval_runtime": 8.8545, "eval_samples_per_second": 157.433, "eval_steps_per_second": 19.764, "step": 59300 }, { "epoch": 3471.7681365576104, "grad_norm": 0.12495147436857224, "learning_rate": 6.020066889632108e-08, "loss": 2.6122, "step": 59400 }, { "epoch": 3471.7681365576104, "eval_loss": 2.6170403957366943, "eval_runtime": 8.5346, "eval_samples_per_second": 163.335, "eval_steps_per_second": 20.505, "step": 59400 }, { "epoch": 3477.458036984353, "grad_norm": 0.1302523910999298, "learning_rate": 5.0167224080267556e-08, "loss": 2.6119, "step": 59500 }, { "epoch": 3477.458036984353, "eval_loss": 2.609950065612793, "eval_runtime": 8.8493, "eval_samples_per_second": 157.526, "eval_steps_per_second": 19.775, "step": 59500 }, { "epoch": 3483.1479374110954, "grad_norm": 0.13452781736850739, "learning_rate": 4.013377926421405e-08, "loss": 2.612, "step": 59600 }, { "epoch": 3483.1479374110954, "eval_loss": 2.614889144897461, "eval_runtime": 8.5246, "eval_samples_per_second": 163.526, "eval_steps_per_second": 20.529, "step": 59600 }, { "epoch": 3488.837837837838, "grad_norm": 0.1290915459394455, "learning_rate": 3.010033444816054e-08, "loss": 2.6118, "step": 59700 }, { "epoch": 3488.837837837838, "eval_loss": 2.616943120956421, "eval_runtime": 8.87, "eval_samples_per_second": 157.16, "eval_steps_per_second": 19.73, "step": 59700 }, { "epoch": 3494.5277382645804, "grad_norm": 0.12313296645879745, "learning_rate": 2.0066889632107024e-08, "loss": 2.6119, "step": 59800 }, { "epoch": 3494.5277382645804, "eval_loss": 2.6143088340759277, "eval_runtime": 8.5338, "eval_samples_per_second": 163.35, "eval_steps_per_second": 20.507, "step": 59800 }, { "epoch": 3500.217638691323, "grad_norm": 0.11486466974020004, "learning_rate": 1.0033444816053512e-08, "loss": 2.6118, "step": 59900 }, { "epoch": 3500.217638691323, "eval_loss": 2.6164870262145996, "eval_runtime": 8.5248, "eval_samples_per_second": 163.523, "eval_steps_per_second": 20.528, "step": 59900 }, { "epoch": 3505.9075391180654, "grad_norm": 0.11500786989927292, "learning_rate": 0.0, "loss": 2.6119, "step": 60000 }, { "epoch": 3505.9075391180654, "eval_loss": 2.616939067840576, "eval_runtime": 8.8637, "eval_samples_per_second": 157.27, "eval_steps_per_second": 19.743, "step": 60000 } ], "logging_steps": 100, "max_steps": 60000, "num_input_tokens_seen": 0, "num_train_epochs": 3530, "save_steps": 100, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 10, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 10 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.02069363654656e+19, "train_batch_size": 8, "trial_name": null, "trial_params": null }