deskull's picture
Upload MolCrawl protein-sequence BERT small model
e8b32eb verified
{
"best_metric": 2.6091578006744385,
"best_model_checkpoint": "learning_source_20260316/protein_sequence/bert-output/protein_sequence-small/checkpoint-44000",
"epoch": 3505.9075391180654,
"eval_steps": 100,
"global_step": 60000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 5.689900426742532,
"grad_norm": 0.5827894806861877,
"learning_rate": 3e-06,
"loss": 3.1299,
"step": 100
},
{
"epoch": 5.689900426742532,
"eval_loss": 2.7554705142974854,
"eval_runtime": 13.2804,
"eval_samples_per_second": 104.967,
"eval_steps_per_second": 104.967,
"step": 100
},
{
"epoch": 11.379800853485063,
"grad_norm": 0.47490769624710083,
"learning_rate": 6e-06,
"loss": 2.7301,
"step": 200
},
{
"epoch": 11.379800853485063,
"eval_loss": 2.695699453353882,
"eval_runtime": 13.2496,
"eval_samples_per_second": 105.211,
"eval_steps_per_second": 105.211,
"step": 200
},
{
"epoch": 17.069701280227594,
"grad_norm": 0.27892372012138367,
"learning_rate": 5.998999666555519e-06,
"loss": 2.6967,
"step": 300
},
{
"epoch": 17.069701280227594,
"eval_loss": 2.6834542751312256,
"eval_runtime": 13.2537,
"eval_samples_per_second": 105.178,
"eval_steps_per_second": 105.178,
"step": 300
},
{
"epoch": 22.759601706970127,
"grad_norm": 0.26072826981544495,
"learning_rate": 5.997999333111037e-06,
"loss": 2.6876,
"step": 400
},
{
"epoch": 22.759601706970127,
"eval_loss": 2.6789743900299072,
"eval_runtime": 13.6654,
"eval_samples_per_second": 102.01,
"eval_steps_per_second": 102.01,
"step": 400
},
{
"epoch": 28.44950213371266,
"grad_norm": 0.29876092076301575,
"learning_rate": 5.9969989996665554e-06,
"loss": 2.6822,
"step": 500
},
{
"epoch": 28.44950213371266,
"eval_loss": 2.6772079467773438,
"eval_runtime": 13.27,
"eval_samples_per_second": 105.049,
"eval_steps_per_second": 105.049,
"step": 500
},
{
"epoch": 34.13940256045519,
"grad_norm": 0.3491186499595642,
"learning_rate": 5.995998666222074e-06,
"loss": 2.6793,
"step": 600
},
{
"epoch": 34.13940256045519,
"eval_loss": 2.6701366901397705,
"eval_runtime": 13.2631,
"eval_samples_per_second": 105.104,
"eval_steps_per_second": 105.104,
"step": 600
},
{
"epoch": 39.82930298719772,
"grad_norm": 0.38635119795799255,
"learning_rate": 5.994998332777593e-06,
"loss": 2.6775,
"step": 700
},
{
"epoch": 39.82930298719772,
"eval_loss": 2.675004243850708,
"eval_runtime": 13.7168,
"eval_samples_per_second": 101.627,
"eval_steps_per_second": 101.627,
"step": 700
},
{
"epoch": 45.519203413940254,
"grad_norm": 0.34463900327682495,
"learning_rate": 5.9939979993331115e-06,
"loss": 2.6754,
"step": 800
},
{
"epoch": 45.519203413940254,
"eval_loss": 2.673832416534424,
"eval_runtime": 13.2859,
"eval_samples_per_second": 104.923,
"eval_steps_per_second": 104.923,
"step": 800
},
{
"epoch": 51.209103840682786,
"grad_norm": 0.35753050446510315,
"learning_rate": 5.992997665888629e-06,
"loss": 2.6743,
"step": 900
},
{
"epoch": 51.209103840682786,
"eval_loss": 2.670893907546997,
"eval_runtime": 13.2644,
"eval_samples_per_second": 105.093,
"eval_steps_per_second": 105.093,
"step": 900
},
{
"epoch": 56.89900426742532,
"grad_norm": 0.30704179406166077,
"learning_rate": 5.991997332444148e-06,
"loss": 2.6733,
"step": 1000
},
{
"epoch": 56.89900426742532,
"eval_loss": 2.6703639030456543,
"eval_runtime": 13.274,
"eval_samples_per_second": 105.017,
"eval_steps_per_second": 105.017,
"step": 1000
},
{
"epoch": 62.58890469416785,
"grad_norm": 0.20322857797145844,
"learning_rate": 5.990996998999667e-06,
"loss": 2.6718,
"step": 1100
},
{
"epoch": 62.58890469416785,
"eval_loss": 2.6725869178771973,
"eval_runtime": 13.6525,
"eval_samples_per_second": 102.106,
"eval_steps_per_second": 102.106,
"step": 1100
},
{
"epoch": 68.27880512091038,
"grad_norm": 0.29705750942230225,
"learning_rate": 5.989996665555185e-06,
"loss": 2.6712,
"step": 1200
},
{
"epoch": 68.27880512091038,
"eval_loss": 2.6705477237701416,
"eval_runtime": 13.7099,
"eval_samples_per_second": 101.678,
"eval_steps_per_second": 101.678,
"step": 1200
},
{
"epoch": 73.96870554765292,
"grad_norm": 0.2920830249786377,
"learning_rate": 5.988996332110703e-06,
"loss": 2.671,
"step": 1300
},
{
"epoch": 73.96870554765292,
"eval_loss": 2.6691176891326904,
"eval_runtime": 13.276,
"eval_samples_per_second": 105.002,
"eval_steps_per_second": 105.002,
"step": 1300
},
{
"epoch": 79.65860597439544,
"grad_norm": 0.38358381390571594,
"learning_rate": 5.987995998666222e-06,
"loss": 2.6703,
"step": 1400
},
{
"epoch": 79.65860597439544,
"eval_loss": 2.6659200191497803,
"eval_runtime": 13.2877,
"eval_samples_per_second": 104.909,
"eval_steps_per_second": 104.909,
"step": 1400
},
{
"epoch": 85.34850640113798,
"grad_norm": 0.23219753801822662,
"learning_rate": 5.986995665221741e-06,
"loss": 2.6704,
"step": 1500
},
{
"epoch": 85.34850640113798,
"eval_loss": 2.6674177646636963,
"eval_runtime": 13.666,
"eval_samples_per_second": 102.005,
"eval_steps_per_second": 102.005,
"step": 1500
},
{
"epoch": 91.03840682788051,
"grad_norm": 0.23956173658370972,
"learning_rate": 5.9859953317772595e-06,
"loss": 2.6704,
"step": 1600
},
{
"epoch": 91.03840682788051,
"eval_loss": 2.667738199234009,
"eval_runtime": 13.2689,
"eval_samples_per_second": 105.058,
"eval_steps_per_second": 105.058,
"step": 1600
},
{
"epoch": 96.72830725462305,
"grad_norm": 0.22576624155044556,
"learning_rate": 5.984994998332777e-06,
"loss": 2.6696,
"step": 1700
},
{
"epoch": 96.72830725462305,
"eval_loss": 2.666335344314575,
"eval_runtime": 13.2674,
"eval_samples_per_second": 105.069,
"eval_steps_per_second": 105.069,
"step": 1700
},
{
"epoch": 102.41820768136557,
"grad_norm": 0.2869977653026581,
"learning_rate": 5.983994664888296e-06,
"loss": 2.6692,
"step": 1800
},
{
"epoch": 102.41820768136557,
"eval_loss": 2.667121171951294,
"eval_runtime": 13.2883,
"eval_samples_per_second": 104.905,
"eval_steps_per_second": 104.905,
"step": 1800
},
{
"epoch": 108.10810810810811,
"grad_norm": 0.24629302322864532,
"learning_rate": 5.982994331443815e-06,
"loss": 2.6685,
"step": 1900
},
{
"epoch": 108.10810810810811,
"eval_loss": 2.6667563915252686,
"eval_runtime": 13.6825,
"eval_samples_per_second": 101.882,
"eval_steps_per_second": 101.882,
"step": 1900
},
{
"epoch": 113.79800853485064,
"grad_norm": 0.23221346735954285,
"learning_rate": 5.981993997999333e-06,
"loss": 2.6683,
"step": 2000
},
{
"epoch": 113.79800853485064,
"eval_loss": 2.6664071083068848,
"eval_runtime": 13.2695,
"eval_samples_per_second": 105.053,
"eval_steps_per_second": 105.053,
"step": 2000
},
{
"epoch": 119.48790896159318,
"grad_norm": 0.24480201303958893,
"learning_rate": 5.980993664554851e-06,
"loss": 2.668,
"step": 2100
},
{
"epoch": 119.48790896159318,
"eval_loss": 2.6675596237182617,
"eval_runtime": 13.6602,
"eval_samples_per_second": 102.048,
"eval_steps_per_second": 102.048,
"step": 2100
},
{
"epoch": 125.1778093883357,
"grad_norm": 0.2695687413215637,
"learning_rate": 5.979993331110371e-06,
"loss": 2.6683,
"step": 2200
},
{
"epoch": 125.1778093883357,
"eval_loss": 2.6677987575531006,
"eval_runtime": 13.2773,
"eval_samples_per_second": 104.991,
"eval_steps_per_second": 104.991,
"step": 2200
},
{
"epoch": 130.86770981507823,
"grad_norm": 0.2357303947210312,
"learning_rate": 5.978992997665889e-06,
"loss": 2.6678,
"step": 2300
},
{
"epoch": 130.86770981507823,
"eval_loss": 2.6650021076202393,
"eval_runtime": 13.256,
"eval_samples_per_second": 105.16,
"eval_steps_per_second": 105.16,
"step": 2300
},
{
"epoch": 136.55761024182075,
"grad_norm": 0.23957480490207672,
"learning_rate": 5.9779926642214075e-06,
"loss": 2.6679,
"step": 2400
},
{
"epoch": 136.55761024182075,
"eval_loss": 2.6645851135253906,
"eval_runtime": 13.7249,
"eval_samples_per_second": 101.567,
"eval_steps_per_second": 101.567,
"step": 2400
},
{
"epoch": 146.86059743954482,
"grad_norm": 0.19333045184612274,
"learning_rate": 5.976992330776926e-06,
"loss": 2.6671,
"step": 2500
},
{
"epoch": 146.86059743954482,
"eval_loss": 2.666929244995117,
"eval_runtime": 8.8729,
"eval_samples_per_second": 157.107,
"eval_steps_per_second": 19.723,
"step": 2500
},
{
"epoch": 152.55049786628734,
"grad_norm": 0.3093737065792084,
"learning_rate": 5.975991997332444e-06,
"loss": 2.6674,
"step": 2600
},
{
"epoch": 152.55049786628734,
"eval_loss": 2.6641287803649902,
"eval_runtime": 8.544,
"eval_samples_per_second": 163.155,
"eval_steps_per_second": 20.482,
"step": 2600
},
{
"epoch": 158.24039829302987,
"grad_norm": 0.2492215484380722,
"learning_rate": 5.974991663887963e-06,
"loss": 2.6675,
"step": 2700
},
{
"epoch": 158.24039829302987,
"eval_loss": 2.6672415733337402,
"eval_runtime": 8.5402,
"eval_samples_per_second": 163.229,
"eval_steps_per_second": 20.491,
"step": 2700
},
{
"epoch": 163.9302987197724,
"grad_norm": 0.3064326047897339,
"learning_rate": 5.973991330443481e-06,
"loss": 2.6674,
"step": 2800
},
{
"epoch": 163.9302987197724,
"eval_loss": 2.667715072631836,
"eval_runtime": 8.861,
"eval_samples_per_second": 157.319,
"eval_steps_per_second": 19.75,
"step": 2800
},
{
"epoch": 169.62019914651495,
"grad_norm": 0.2401367574930191,
"learning_rate": 5.972990996999e-06,
"loss": 2.6668,
"step": 2900
},
{
"epoch": 169.62019914651495,
"eval_loss": 2.663231611251831,
"eval_runtime": 8.5263,
"eval_samples_per_second": 163.495,
"eval_steps_per_second": 20.525,
"step": 2900
},
{
"epoch": 175.31009957325747,
"grad_norm": 0.26518478989601135,
"learning_rate": 5.971990663554519e-06,
"loss": 2.6664,
"step": 3000
},
{
"epoch": 175.31009957325747,
"eval_loss": 2.66806960105896,
"eval_runtime": 8.5256,
"eval_samples_per_second": 163.508,
"eval_steps_per_second": 20.526,
"step": 3000
},
{
"epoch": 181.0,
"grad_norm": 0.21279650926589966,
"learning_rate": 5.970990330110037e-06,
"loss": 2.6662,
"step": 3100
},
{
"epoch": 181.0,
"eval_loss": 2.66540789604187,
"eval_runtime": 8.9087,
"eval_samples_per_second": 156.477,
"eval_steps_per_second": 19.644,
"step": 3100
},
{
"epoch": 186.68990042674253,
"grad_norm": 0.20601896941661835,
"learning_rate": 5.9699899966655554e-06,
"loss": 2.6662,
"step": 3200
},
{
"epoch": 186.68990042674253,
"eval_loss": 2.661759614944458,
"eval_runtime": 8.5361,
"eval_samples_per_second": 163.306,
"eval_steps_per_second": 20.501,
"step": 3200
},
{
"epoch": 192.37980085348505,
"grad_norm": 0.30063194036483765,
"learning_rate": 5.968989663221074e-06,
"loss": 2.666,
"step": 3300
},
{
"epoch": 192.37980085348505,
"eval_loss": 2.6638128757476807,
"eval_runtime": 8.5253,
"eval_samples_per_second": 163.514,
"eval_steps_per_second": 20.527,
"step": 3300
},
{
"epoch": 198.0697012802276,
"grad_norm": 0.17756374180316925,
"learning_rate": 5.967989329776592e-06,
"loss": 2.6652,
"step": 3400
},
{
"epoch": 198.0697012802276,
"eval_loss": 2.6624886989593506,
"eval_runtime": 8.9634,
"eval_samples_per_second": 155.521,
"eval_steps_per_second": 19.524,
"step": 3400
},
{
"epoch": 203.75960170697013,
"grad_norm": 0.3183553218841553,
"learning_rate": 5.966988996332111e-06,
"loss": 2.6656,
"step": 3500
},
{
"epoch": 203.75960170697013,
"eval_loss": 2.666609764099121,
"eval_runtime": 8.5304,
"eval_samples_per_second": 163.416,
"eval_steps_per_second": 20.515,
"step": 3500
},
{
"epoch": 209.44950213371266,
"grad_norm": 0.23746278882026672,
"learning_rate": 5.965988662887629e-06,
"loss": 2.6656,
"step": 3600
},
{
"epoch": 209.44950213371266,
"eval_loss": 2.664607048034668,
"eval_runtime": 8.5297,
"eval_samples_per_second": 163.429,
"eval_steps_per_second": 20.516,
"step": 3600
},
{
"epoch": 215.13940256045518,
"grad_norm": 0.2566852271556854,
"learning_rate": 5.964988329443148e-06,
"loss": 2.6652,
"step": 3700
},
{
"epoch": 215.13940256045518,
"eval_loss": 2.663752794265747,
"eval_runtime": 8.5306,
"eval_samples_per_second": 163.412,
"eval_steps_per_second": 20.514,
"step": 3700
},
{
"epoch": 220.82930298719774,
"grad_norm": 0.19710654020309448,
"learning_rate": 5.963987995998667e-06,
"loss": 2.6657,
"step": 3800
},
{
"epoch": 220.82930298719774,
"eval_loss": 2.66432785987854,
"eval_runtime": 8.9192,
"eval_samples_per_second": 156.293,
"eval_steps_per_second": 19.621,
"step": 3800
},
{
"epoch": 226.51920341394026,
"grad_norm": 0.20113052427768707,
"learning_rate": 5.962987662554185e-06,
"loss": 2.6655,
"step": 3900
},
{
"epoch": 226.51920341394026,
"eval_loss": 2.662318706512451,
"eval_runtime": 8.5269,
"eval_samples_per_second": 163.483,
"eval_steps_per_second": 20.523,
"step": 3900
},
{
"epoch": 232.2091038406828,
"grad_norm": 0.24698683619499207,
"learning_rate": 5.961987329109703e-06,
"loss": 2.6652,
"step": 4000
},
{
"epoch": 232.2091038406828,
"eval_loss": 2.6657159328460693,
"eval_runtime": 8.5292,
"eval_samples_per_second": 163.438,
"eval_steps_per_second": 20.518,
"step": 4000
},
{
"epoch": 237.8990042674253,
"grad_norm": 0.24947816133499146,
"learning_rate": 5.960986995665222e-06,
"loss": 2.6652,
"step": 4100
},
{
"epoch": 237.8990042674253,
"eval_loss": 2.6663217544555664,
"eval_runtime": 8.5354,
"eval_samples_per_second": 163.32,
"eval_steps_per_second": 20.503,
"step": 4100
},
{
"epoch": 243.58890469416787,
"grad_norm": 0.2810859680175781,
"learning_rate": 5.95998666222074e-06,
"loss": 2.6649,
"step": 4200
},
{
"epoch": 243.58890469416787,
"eval_loss": 2.6661739349365234,
"eval_runtime": 8.8623,
"eval_samples_per_second": 157.295,
"eval_steps_per_second": 19.747,
"step": 4200
},
{
"epoch": 252.74679943100995,
"grad_norm": 0.18688435852527618,
"learning_rate": 5.588628762541806e-06,
"loss": 2.6646,
"step": 4300
},
{
"epoch": 252.74679943100995,
"eval_loss": 2.664121389389038,
"eval_runtime": 9.162,
"eval_samples_per_second": 152.149,
"eval_steps_per_second": 19.101,
"step": 4300
},
{
"epoch": 258.4366998577525,
"grad_norm": 0.19968199729919434,
"learning_rate": 5.578595317725753e-06,
"loss": 2.6649,
"step": 4400
},
{
"epoch": 258.4366998577525,
"eval_loss": 2.666635274887085,
"eval_runtime": 11.7784,
"eval_samples_per_second": 118.352,
"eval_steps_per_second": 14.858,
"step": 4400
},
{
"epoch": 264.126600284495,
"grad_norm": 0.18012067675590515,
"learning_rate": 5.568561872909699e-06,
"loss": 2.6653,
"step": 4500
},
{
"epoch": 264.126600284495,
"eval_loss": 2.662325143814087,
"eval_runtime": 11.7766,
"eval_samples_per_second": 118.371,
"eval_steps_per_second": 14.86,
"step": 4500
},
{
"epoch": 269.81650071123755,
"grad_norm": 0.18739238381385803,
"learning_rate": 5.558528428093646e-06,
"loss": 2.6652,
"step": 4600
},
{
"epoch": 269.81650071123755,
"eval_loss": 2.664424419403076,
"eval_runtime": 9.739,
"eval_samples_per_second": 143.137,
"eval_steps_per_second": 17.969,
"step": 4600
},
{
"epoch": 275.5064011379801,
"grad_norm": 0.2488318383693695,
"learning_rate": 5.548494983277593e-06,
"loss": 2.6648,
"step": 4700
},
{
"epoch": 275.5064011379801,
"eval_loss": 2.6640284061431885,
"eval_runtime": 8.9011,
"eval_samples_per_second": 156.609,
"eval_steps_per_second": 19.66,
"step": 4700
},
{
"epoch": 281.1963015647226,
"grad_norm": 0.22808881103992462,
"learning_rate": 5.5384615384615385e-06,
"loss": 2.6651,
"step": 4800
},
{
"epoch": 281.1963015647226,
"eval_loss": 2.6617281436920166,
"eval_runtime": 8.5632,
"eval_samples_per_second": 162.79,
"eval_steps_per_second": 20.436,
"step": 4800
},
{
"epoch": 286.88620199146516,
"grad_norm": 0.1917983591556549,
"learning_rate": 5.528428093645485e-06,
"loss": 2.6647,
"step": 4900
},
{
"epoch": 286.88620199146516,
"eval_loss": 2.6639668941497803,
"eval_runtime": 8.5741,
"eval_samples_per_second": 162.583,
"eval_steps_per_second": 20.41,
"step": 4900
},
{
"epoch": 292.57610241820765,
"grad_norm": 0.247116819024086,
"learning_rate": 5.518394648829432e-06,
"loss": 2.6648,
"step": 5000
},
{
"epoch": 292.57610241820765,
"eval_loss": 2.660776376724243,
"eval_runtime": 8.9014,
"eval_samples_per_second": 156.605,
"eval_steps_per_second": 19.66,
"step": 5000
},
{
"epoch": 298.2660028449502,
"grad_norm": 0.18090835213661194,
"learning_rate": 5.508361204013378e-06,
"loss": 2.6643,
"step": 5100
},
{
"epoch": 298.2660028449502,
"eval_loss": 2.6607048511505127,
"eval_runtime": 8.5599,
"eval_samples_per_second": 162.853,
"eval_steps_per_second": 20.444,
"step": 5100
},
{
"epoch": 303.95590327169276,
"grad_norm": 0.1796797215938568,
"learning_rate": 5.498327759197324e-06,
"loss": 2.6645,
"step": 5200
},
{
"epoch": 303.95590327169276,
"eval_loss": 2.6626744270324707,
"eval_runtime": 8.9139,
"eval_samples_per_second": 156.385,
"eval_steps_per_second": 19.632,
"step": 5200
},
{
"epoch": 309.64580369843526,
"grad_norm": 0.19111952185630798,
"learning_rate": 5.488294314381271e-06,
"loss": 2.6647,
"step": 5300
},
{
"epoch": 309.64580369843526,
"eval_loss": 2.6617257595062256,
"eval_runtime": 8.5801,
"eval_samples_per_second": 162.47,
"eval_steps_per_second": 20.396,
"step": 5300
},
{
"epoch": 315.3357041251778,
"grad_norm": 0.17278283834457397,
"learning_rate": 5.478260869565217e-06,
"loss": 2.6645,
"step": 5400
},
{
"epoch": 315.3357041251778,
"eval_loss": 2.6651811599731445,
"eval_runtime": 8.8919,
"eval_samples_per_second": 156.771,
"eval_steps_per_second": 19.681,
"step": 5400
},
{
"epoch": 321.02560455192037,
"grad_norm": 0.24506501853466034,
"learning_rate": 5.468227424749163e-06,
"loss": 2.6644,
"step": 5500
},
{
"epoch": 321.02560455192037,
"eval_loss": 2.6612024307250977,
"eval_runtime": 8.5651,
"eval_samples_per_second": 162.754,
"eval_steps_per_second": 20.432,
"step": 5500
},
{
"epoch": 326.71550497866286,
"grad_norm": 0.17717023193836212,
"learning_rate": 5.45819397993311e-06,
"loss": 2.6644,
"step": 5600
},
{
"epoch": 326.71550497866286,
"eval_loss": 2.661200523376465,
"eval_runtime": 8.5626,
"eval_samples_per_second": 162.801,
"eval_steps_per_second": 20.438,
"step": 5600
},
{
"epoch": 332.4054054054054,
"grad_norm": 0.12661577761173248,
"learning_rate": 5.448160535117057e-06,
"loss": 2.6641,
"step": 5700
},
{
"epoch": 332.4054054054054,
"eval_loss": 2.6609702110290527,
"eval_runtime": 8.883,
"eval_samples_per_second": 156.929,
"eval_steps_per_second": 19.701,
"step": 5700
},
{
"epoch": 338.0953058321479,
"grad_norm": 0.199785977602005,
"learning_rate": 5.438127090301003e-06,
"loss": 2.6643,
"step": 5800
},
{
"epoch": 338.0953058321479,
"eval_loss": 2.660168409347534,
"eval_runtime": 8.5745,
"eval_samples_per_second": 162.574,
"eval_steps_per_second": 20.409,
"step": 5800
},
{
"epoch": 343.78520625889047,
"grad_norm": 0.2726210057735443,
"learning_rate": 5.4280936454849495e-06,
"loss": 2.6646,
"step": 5900
},
{
"epoch": 343.78520625889047,
"eval_loss": 2.664670944213867,
"eval_runtime": 8.5589,
"eval_samples_per_second": 162.871,
"eval_steps_per_second": 20.447,
"step": 5900
},
{
"epoch": 349.475106685633,
"grad_norm": 0.3512348234653473,
"learning_rate": 5.418060200668896e-06,
"loss": 2.664,
"step": 6000
},
{
"epoch": 349.475106685633,
"eval_loss": 2.665173292160034,
"eval_runtime": 8.5571,
"eval_samples_per_second": 162.905,
"eval_steps_per_second": 20.451,
"step": 6000
},
{
"epoch": 355.1650071123755,
"grad_norm": 0.20835170149803162,
"learning_rate": 5.408026755852843e-06,
"loss": 2.6641,
"step": 6100
},
{
"epoch": 355.1650071123755,
"eval_loss": 2.662048101425171,
"eval_runtime": 8.9087,
"eval_samples_per_second": 156.476,
"eval_steps_per_second": 19.644,
"step": 6100
},
{
"epoch": 360.8549075391181,
"grad_norm": 0.11575555801391602,
"learning_rate": 5.397993311036789e-06,
"loss": 2.6645,
"step": 6200
},
{
"epoch": 360.8549075391181,
"eval_loss": 2.6617300510406494,
"eval_runtime": 8.5709,
"eval_samples_per_second": 162.643,
"eval_steps_per_second": 20.418,
"step": 6200
},
{
"epoch": 366.54480796586057,
"grad_norm": 0.18948699533939362,
"learning_rate": 5.387959866220736e-06,
"loss": 2.6639,
"step": 6300
},
{
"epoch": 366.54480796586057,
"eval_loss": 2.6628897190093994,
"eval_runtime": 8.9034,
"eval_samples_per_second": 156.569,
"eval_steps_per_second": 19.655,
"step": 6300
},
{
"epoch": 372.2347083926031,
"grad_norm": 0.12320856750011444,
"learning_rate": 5.3779264214046825e-06,
"loss": 2.6647,
"step": 6400
},
{
"epoch": 372.2347083926031,
"eval_loss": 2.663992166519165,
"eval_runtime": 8.5612,
"eval_samples_per_second": 162.828,
"eval_steps_per_second": 20.441,
"step": 6400
},
{
"epoch": 377.9246088193457,
"grad_norm": 0.26067054271698,
"learning_rate": 5.367892976588628e-06,
"loss": 2.6643,
"step": 6500
},
{
"epoch": 377.9246088193457,
"eval_loss": 2.6624581813812256,
"eval_runtime": 8.572,
"eval_samples_per_second": 162.623,
"eval_steps_per_second": 20.415,
"step": 6500
},
{
"epoch": 383.6145092460882,
"grad_norm": 0.18116046488285065,
"learning_rate": 5.357859531772575e-06,
"loss": 2.664,
"step": 6600
},
{
"epoch": 383.6145092460882,
"eval_loss": 2.662827491760254,
"eval_runtime": 8.8871,
"eval_samples_per_second": 156.857,
"eval_steps_per_second": 19.692,
"step": 6600
},
{
"epoch": 389.30440967283073,
"grad_norm": 0.21489782631397247,
"learning_rate": 5.347826086956522e-06,
"loss": 2.6635,
"step": 6700
},
{
"epoch": 389.30440967283073,
"eval_loss": 2.6603777408599854,
"eval_runtime": 8.5641,
"eval_samples_per_second": 162.772,
"eval_steps_per_second": 20.434,
"step": 6700
},
{
"epoch": 394.9943100995733,
"grad_norm": 0.1781698316335678,
"learning_rate": 5.337792642140468e-06,
"loss": 2.6645,
"step": 6800
},
{
"epoch": 394.9943100995733,
"eval_loss": 2.661527156829834,
"eval_runtime": 8.5625,
"eval_samples_per_second": 162.802,
"eval_steps_per_second": 20.438,
"step": 6800
},
{
"epoch": 400.6842105263158,
"grad_norm": 0.18622642755508423,
"learning_rate": 5.327759197324415e-06,
"loss": 2.6647,
"step": 6900
},
{
"epoch": 400.6842105263158,
"eval_loss": 2.661090135574341,
"eval_runtime": 8.5509,
"eval_samples_per_second": 163.023,
"eval_steps_per_second": 20.466,
"step": 6900
},
{
"epoch": 406.37411095305833,
"grad_norm": 0.15774820744991302,
"learning_rate": 5.317725752508361e-06,
"loss": 2.6636,
"step": 7000
},
{
"epoch": 406.37411095305833,
"eval_loss": 2.66558575630188,
"eval_runtime": 8.898,
"eval_samples_per_second": 156.665,
"eval_steps_per_second": 19.667,
"step": 7000
},
{
"epoch": 412.06401137980083,
"grad_norm": 0.18330508470535278,
"learning_rate": 5.307692307692307e-06,
"loss": 2.6645,
"step": 7100
},
{
"epoch": 412.06401137980083,
"eval_loss": 2.6627676486968994,
"eval_runtime": 8.5744,
"eval_samples_per_second": 162.576,
"eval_steps_per_second": 20.409,
"step": 7100
},
{
"epoch": 417.7539118065434,
"grad_norm": 0.23223190009593964,
"learning_rate": 5.297658862876254e-06,
"loss": 2.6636,
"step": 7200
},
{
"epoch": 417.7539118065434,
"eval_loss": 2.661203145980835,
"eval_runtime": 8.5675,
"eval_samples_per_second": 162.708,
"eval_steps_per_second": 20.426,
"step": 7200
},
{
"epoch": 423.44381223328594,
"grad_norm": 0.15261903405189514,
"learning_rate": 5.287625418060201e-06,
"loss": 2.6641,
"step": 7300
},
{
"epoch": 423.44381223328594,
"eval_loss": 2.6625847816467285,
"eval_runtime": 8.9059,
"eval_samples_per_second": 156.525,
"eval_steps_per_second": 19.65,
"step": 7300
},
{
"epoch": 429.13371266002844,
"grad_norm": 0.1654181033372879,
"learning_rate": 5.277591973244147e-06,
"loss": 2.6641,
"step": 7400
},
{
"epoch": 429.13371266002844,
"eval_loss": 2.6628565788269043,
"eval_runtime": 8.555,
"eval_samples_per_second": 162.946,
"eval_steps_per_second": 20.456,
"step": 7400
},
{
"epoch": 434.823613086771,
"grad_norm": 0.2062557488679886,
"learning_rate": 5.2675585284280935e-06,
"loss": 2.6634,
"step": 7500
},
{
"epoch": 434.823613086771,
"eval_loss": 2.6651501655578613,
"eval_runtime": 8.9081,
"eval_samples_per_second": 156.487,
"eval_steps_per_second": 19.645,
"step": 7500
},
{
"epoch": 440.5135135135135,
"grad_norm": 0.21824122965335846,
"learning_rate": 5.25752508361204e-06,
"loss": 2.6637,
"step": 7600
},
{
"epoch": 440.5135135135135,
"eval_loss": 2.6624720096588135,
"eval_runtime": 8.5608,
"eval_samples_per_second": 162.836,
"eval_steps_per_second": 20.442,
"step": 7600
},
{
"epoch": 446.20341394025604,
"grad_norm": 0.2458944469690323,
"learning_rate": 5.247491638795986e-06,
"loss": 2.6637,
"step": 7700
},
{
"epoch": 446.20341394025604,
"eval_loss": 2.661086082458496,
"eval_runtime": 8.8963,
"eval_samples_per_second": 156.694,
"eval_steps_per_second": 19.671,
"step": 7700
},
{
"epoch": 451.8933143669986,
"grad_norm": 0.1574467271566391,
"learning_rate": 5.237458193979933e-06,
"loss": 2.6639,
"step": 7800
},
{
"epoch": 451.8933143669986,
"eval_loss": 2.6646134853363037,
"eval_runtime": 8.5514,
"eval_samples_per_second": 163.014,
"eval_steps_per_second": 20.464,
"step": 7800
},
{
"epoch": 457.5832147937411,
"grad_norm": 0.1982835829257965,
"learning_rate": 5.22742474916388e-06,
"loss": 2.664,
"step": 7900
},
{
"epoch": 457.5832147937411,
"eval_loss": 2.6606264114379883,
"eval_runtime": 8.5483,
"eval_samples_per_second": 163.073,
"eval_steps_per_second": 20.472,
"step": 7900
},
{
"epoch": 463.27311522048365,
"grad_norm": 0.19593903422355652,
"learning_rate": 5.2173913043478265e-06,
"loss": 2.6632,
"step": 8000
},
{
"epoch": 463.27311522048365,
"eval_loss": 2.664707899093628,
"eval_runtime": 8.5595,
"eval_samples_per_second": 162.861,
"eval_steps_per_second": 20.445,
"step": 8000
},
{
"epoch": 468.9630156472262,
"grad_norm": 0.22343507409095764,
"learning_rate": 5.207357859531772e-06,
"loss": 2.6634,
"step": 8100
},
{
"epoch": 468.9630156472262,
"eval_loss": 2.6642260551452637,
"eval_runtime": 8.9066,
"eval_samples_per_second": 156.513,
"eval_steps_per_second": 19.648,
"step": 8100
},
{
"epoch": 474.6529160739687,
"grad_norm": 0.16728109121322632,
"learning_rate": 5.197324414715719e-06,
"loss": 2.6633,
"step": 8200
},
{
"epoch": 474.6529160739687,
"eval_loss": 2.6625194549560547,
"eval_runtime": 8.5593,
"eval_samples_per_second": 162.863,
"eval_steps_per_second": 20.445,
"step": 8200
},
{
"epoch": 480.34281650071125,
"grad_norm": 0.23255111277103424,
"learning_rate": 5.187290969899666e-06,
"loss": 2.6634,
"step": 8300
},
{
"epoch": 480.34281650071125,
"eval_loss": 2.6606099605560303,
"eval_runtime": 8.559,
"eval_samples_per_second": 162.87,
"eval_steps_per_second": 20.446,
"step": 8300
},
{
"epoch": 486.03271692745375,
"grad_norm": 0.118553027510643,
"learning_rate": 5.177257525083612e-06,
"loss": 2.6632,
"step": 8400
},
{
"epoch": 486.03271692745375,
"eval_loss": 2.663628101348877,
"eval_runtime": 8.912,
"eval_samples_per_second": 156.418,
"eval_steps_per_second": 19.636,
"step": 8400
},
{
"epoch": 491.7226173541963,
"grad_norm": 0.23464259505271912,
"learning_rate": 5.167224080267559e-06,
"loss": 2.6636,
"step": 8500
},
{
"epoch": 491.7226173541963,
"eval_loss": 2.6618993282318115,
"eval_runtime": 8.5674,
"eval_samples_per_second": 162.711,
"eval_steps_per_second": 20.426,
"step": 8500
},
{
"epoch": 497.41251778093886,
"grad_norm": 0.14757351577281952,
"learning_rate": 5.157190635451505e-06,
"loss": 2.6634,
"step": 8600
},
{
"epoch": 497.41251778093886,
"eval_loss": 2.6627461910247803,
"eval_runtime": 8.553,
"eval_samples_per_second": 162.984,
"eval_steps_per_second": 20.461,
"step": 8600
},
{
"epoch": 503.10241820768135,
"grad_norm": 0.16491751372814178,
"learning_rate": 5.147157190635451e-06,
"loss": 2.6634,
"step": 8700
},
{
"epoch": 503.10241820768135,
"eval_loss": 2.6589674949645996,
"eval_runtime": 8.8861,
"eval_samples_per_second": 156.874,
"eval_steps_per_second": 19.694,
"step": 8700
},
{
"epoch": 508.7923186344239,
"grad_norm": 0.17845740914344788,
"learning_rate": 5.137123745819398e-06,
"loss": 2.6634,
"step": 8800
},
{
"epoch": 508.7923186344239,
"eval_loss": 2.6661596298217773,
"eval_runtime": 8.5694,
"eval_samples_per_second": 162.672,
"eval_steps_per_second": 20.422,
"step": 8800
},
{
"epoch": 514.4822190611665,
"grad_norm": 0.11282111704349518,
"learning_rate": 5.127090301003345e-06,
"loss": 2.6635,
"step": 8900
},
{
"epoch": 514.4822190611665,
"eval_loss": 2.6612164974212646,
"eval_runtime": 8.9016,
"eval_samples_per_second": 156.6,
"eval_steps_per_second": 19.659,
"step": 8900
},
{
"epoch": 520.172119487909,
"grad_norm": 0.11933238804340363,
"learning_rate": 5.117056856187291e-06,
"loss": 2.6629,
"step": 9000
},
{
"epoch": 520.172119487909,
"eval_loss": 2.663548707962036,
"eval_runtime": 8.5756,
"eval_samples_per_second": 162.553,
"eval_steps_per_second": 20.407,
"step": 9000
},
{
"epoch": 525.8620199146515,
"grad_norm": 0.16832073032855988,
"learning_rate": 5.1070234113712375e-06,
"loss": 2.6632,
"step": 9100
},
{
"epoch": 525.8620199146515,
"eval_loss": 2.665459156036377,
"eval_runtime": 8.8932,
"eval_samples_per_second": 156.75,
"eval_steps_per_second": 19.678,
"step": 9100
},
{
"epoch": 531.5519203413941,
"grad_norm": 0.1491301953792572,
"learning_rate": 5.096989966555184e-06,
"loss": 2.6633,
"step": 9200
},
{
"epoch": 531.5519203413941,
"eval_loss": 2.6649389266967773,
"eval_runtime": 8.5727,
"eval_samples_per_second": 162.609,
"eval_steps_per_second": 20.414,
"step": 9200
},
{
"epoch": 537.2418207681366,
"grad_norm": 0.20299378037452698,
"learning_rate": 5.08695652173913e-06,
"loss": 2.663,
"step": 9300
},
{
"epoch": 537.2418207681366,
"eval_loss": 2.662057638168335,
"eval_runtime": 8.5522,
"eval_samples_per_second": 162.999,
"eval_steps_per_second": 20.463,
"step": 9300
},
{
"epoch": 542.9317211948791,
"grad_norm": 0.1609990894794464,
"learning_rate": 5.076923076923077e-06,
"loss": 2.6631,
"step": 9400
},
{
"epoch": 542.9317211948791,
"eval_loss": 2.6604907512664795,
"eval_runtime": 8.8909,
"eval_samples_per_second": 156.79,
"eval_steps_per_second": 19.683,
"step": 9400
},
{
"epoch": 548.6216216216217,
"grad_norm": 0.18364398181438446,
"learning_rate": 5.066889632107024e-06,
"loss": 2.663,
"step": 9500
},
{
"epoch": 548.6216216216217,
"eval_loss": 2.660076856613159,
"eval_runtime": 8.5556,
"eval_samples_per_second": 162.934,
"eval_steps_per_second": 20.454,
"step": 9500
},
{
"epoch": 554.3115220483642,
"grad_norm": 0.15186648070812225,
"learning_rate": 5.05685618729097e-06,
"loss": 2.6631,
"step": 9600
},
{
"epoch": 554.3115220483642,
"eval_loss": 2.6639227867126465,
"eval_runtime": 8.551,
"eval_samples_per_second": 163.021,
"eval_steps_per_second": 20.465,
"step": 9600
},
{
"epoch": 560.0014224751067,
"grad_norm": 0.14984333515167236,
"learning_rate": 5.046822742474916e-06,
"loss": 2.6632,
"step": 9700
},
{
"epoch": 560.0014224751067,
"eval_loss": 2.6611454486846924,
"eval_runtime": 8.9892,
"eval_samples_per_second": 155.075,
"eval_steps_per_second": 19.468,
"step": 9700
},
{
"epoch": 565.6913229018492,
"grad_norm": 0.1124359741806984,
"learning_rate": 5.036789297658863e-06,
"loss": 2.663,
"step": 9800
},
{
"epoch": 565.6913229018492,
"eval_loss": 2.661329746246338,
"eval_runtime": 8.5548,
"eval_samples_per_second": 162.95,
"eval_steps_per_second": 20.456,
"step": 9800
},
{
"epoch": 571.3812233285918,
"grad_norm": 0.230003222823143,
"learning_rate": 5.02675585284281e-06,
"loss": 2.6631,
"step": 9900
},
{
"epoch": 571.3812233285918,
"eval_loss": 2.6644487380981445,
"eval_runtime": 8.563,
"eval_samples_per_second": 162.793,
"eval_steps_per_second": 20.437,
"step": 9900
},
{
"epoch": 577.0711237553343,
"grad_norm": 0.172781303524971,
"learning_rate": 5.016722408026756e-06,
"loss": 2.6626,
"step": 10000
},
{
"epoch": 577.0711237553343,
"eval_loss": 2.662069082260132,
"eval_runtime": 8.892,
"eval_samples_per_second": 156.769,
"eval_steps_per_second": 19.681,
"step": 10000
},
{
"epoch": 582.7610241820768,
"grad_norm": 0.15369383990764618,
"learning_rate": 5.0066889632107026e-06,
"loss": 2.663,
"step": 10100
},
{
"epoch": 582.7610241820768,
"eval_loss": 2.6648526191711426,
"eval_runtime": 8.5714,
"eval_samples_per_second": 162.633,
"eval_steps_per_second": 20.417,
"step": 10100
},
{
"epoch": 588.4509246088194,
"grad_norm": 0.1935221403837204,
"learning_rate": 4.996655518394649e-06,
"loss": 2.6632,
"step": 10200
},
{
"epoch": 588.4509246088194,
"eval_loss": 2.6587936878204346,
"eval_runtime": 8.5618,
"eval_samples_per_second": 162.816,
"eval_steps_per_second": 20.44,
"step": 10200
},
{
"epoch": 594.1408250355619,
"grad_norm": 0.14302797615528107,
"learning_rate": 4.986622073578595e-06,
"loss": 2.6626,
"step": 10300
},
{
"epoch": 594.1408250355619,
"eval_loss": 2.662747383117676,
"eval_runtime": 8.8896,
"eval_samples_per_second": 156.812,
"eval_steps_per_second": 19.686,
"step": 10300
},
{
"epoch": 599.8307254623044,
"grad_norm": 0.18007439374923706,
"learning_rate": 4.976588628762542e-06,
"loss": 2.6631,
"step": 10400
},
{
"epoch": 599.8307254623044,
"eval_loss": 2.6642062664031982,
"eval_runtime": 8.5619,
"eval_samples_per_second": 162.814,
"eval_steps_per_second": 20.439,
"step": 10400
},
{
"epoch": 605.520625889047,
"grad_norm": 0.2200157195329666,
"learning_rate": 4.966555183946489e-06,
"loss": 2.6625,
"step": 10500
},
{
"epoch": 605.520625889047,
"eval_loss": 2.6608235836029053,
"eval_runtime": 8.5591,
"eval_samples_per_second": 162.868,
"eval_steps_per_second": 20.446,
"step": 10500
},
{
"epoch": 611.2105263157895,
"grad_norm": 0.1693902462720871,
"learning_rate": 4.956521739130435e-06,
"loss": 2.6629,
"step": 10600
},
{
"epoch": 611.2105263157895,
"eval_loss": 2.6646671295166016,
"eval_runtime": 8.5548,
"eval_samples_per_second": 162.95,
"eval_steps_per_second": 20.456,
"step": 10600
},
{
"epoch": 616.900426742532,
"grad_norm": 0.17042887210845947,
"learning_rate": 4.9464882943143815e-06,
"loss": 2.663,
"step": 10700
},
{
"epoch": 616.900426742532,
"eval_loss": 2.6628501415252686,
"eval_runtime": 8.9184,
"eval_samples_per_second": 156.306,
"eval_steps_per_second": 19.622,
"step": 10700
},
{
"epoch": 622.5903271692746,
"grad_norm": 0.15105395019054413,
"learning_rate": 4.936454849498328e-06,
"loss": 2.6622,
"step": 10800
},
{
"epoch": 622.5903271692746,
"eval_loss": 2.663177251815796,
"eval_runtime": 8.5551,
"eval_samples_per_second": 162.943,
"eval_steps_per_second": 20.456,
"step": 10800
},
{
"epoch": 628.2802275960171,
"grad_norm": 0.16232497990131378,
"learning_rate": 4.926421404682274e-06,
"loss": 2.662,
"step": 10900
},
{
"epoch": 628.2802275960171,
"eval_loss": 2.662868022918701,
"eval_runtime": 8.5562,
"eval_samples_per_second": 162.923,
"eval_steps_per_second": 20.453,
"step": 10900
},
{
"epoch": 633.9701280227596,
"grad_norm": 0.19268840551376343,
"learning_rate": 4.916387959866221e-06,
"loss": 2.6616,
"step": 11000
},
{
"epoch": 633.9701280227596,
"eval_loss": 2.664335250854492,
"eval_runtime": 8.5889,
"eval_samples_per_second": 162.302,
"eval_steps_per_second": 20.375,
"step": 11000
},
{
"epoch": 639.6600284495021,
"grad_norm": 0.11089065670967102,
"learning_rate": 4.906354515050168e-06,
"loss": 2.6604,
"step": 11100
},
{
"epoch": 639.6600284495021,
"eval_loss": 2.656398057937622,
"eval_runtime": 8.9031,
"eval_samples_per_second": 156.575,
"eval_steps_per_second": 19.656,
"step": 11100
},
{
"epoch": 645.3499288762447,
"grad_norm": 0.1336248517036438,
"learning_rate": 4.8963210702341136e-06,
"loss": 2.6599,
"step": 11200
},
{
"epoch": 645.3499288762447,
"eval_loss": 2.6563735008239746,
"eval_runtime": 8.5539,
"eval_samples_per_second": 162.966,
"eval_steps_per_second": 20.458,
"step": 11200
},
{
"epoch": 651.0398293029872,
"grad_norm": 0.12397616356611252,
"learning_rate": 4.88628762541806e-06,
"loss": 2.6581,
"step": 11300
},
{
"epoch": 651.0398293029872,
"eval_loss": 2.65476131439209,
"eval_runtime": 8.5563,
"eval_samples_per_second": 162.92,
"eval_steps_per_second": 20.453,
"step": 11300
},
{
"epoch": 656.7297297297297,
"grad_norm": 0.2090333253145218,
"learning_rate": 4.876254180602007e-06,
"loss": 2.6553,
"step": 11400
},
{
"epoch": 656.7297297297297,
"eval_loss": 2.6521565914154053,
"eval_runtime": 8.892,
"eval_samples_per_second": 156.77,
"eval_steps_per_second": 19.681,
"step": 11400
},
{
"epoch": 662.4196301564723,
"grad_norm": 0.22825314104557037,
"learning_rate": 4.866220735785953e-06,
"loss": 2.654,
"step": 11500
},
{
"epoch": 662.4196301564723,
"eval_loss": 2.649017095565796,
"eval_runtime": 8.5668,
"eval_samples_per_second": 162.721,
"eval_steps_per_second": 20.428,
"step": 11500
},
{
"epoch": 668.1095305832148,
"grad_norm": 0.19265511631965637,
"learning_rate": 4.8561872909699e-06,
"loss": 2.6522,
"step": 11600
},
{
"epoch": 668.1095305832148,
"eval_loss": 2.650679588317871,
"eval_runtime": 8.5489,
"eval_samples_per_second": 163.061,
"eval_steps_per_second": 20.47,
"step": 11600
},
{
"epoch": 673.7994310099573,
"grad_norm": 0.1772225797176361,
"learning_rate": 4.8461538461538465e-06,
"loss": 2.6506,
"step": 11700
},
{
"epoch": 673.7994310099573,
"eval_loss": 2.644835948944092,
"eval_runtime": 8.8953,
"eval_samples_per_second": 156.711,
"eval_steps_per_second": 19.673,
"step": 11700
},
{
"epoch": 679.4893314366999,
"grad_norm": 0.21952596306800842,
"learning_rate": 4.8361204013377925e-06,
"loss": 2.6495,
"step": 11800
},
{
"epoch": 679.4893314366999,
"eval_loss": 2.6477487087249756,
"eval_runtime": 8.5582,
"eval_samples_per_second": 162.885,
"eval_steps_per_second": 20.448,
"step": 11800
},
{
"epoch": 685.1792318634424,
"grad_norm": 0.15563735365867615,
"learning_rate": 4.826086956521739e-06,
"loss": 2.6488,
"step": 11900
},
{
"epoch": 685.1792318634424,
"eval_loss": 2.6446661949157715,
"eval_runtime": 8.5731,
"eval_samples_per_second": 162.602,
"eval_steps_per_second": 20.413,
"step": 11900
},
{
"epoch": 690.8691322901849,
"grad_norm": 0.19501689076423645,
"learning_rate": 4.816053511705686e-06,
"loss": 2.6477,
"step": 12000
},
{
"epoch": 690.8691322901849,
"eval_loss": 2.644357442855835,
"eval_runtime": 8.9024,
"eval_samples_per_second": 156.587,
"eval_steps_per_second": 19.658,
"step": 12000
},
{
"epoch": 696.5590327169275,
"grad_norm": 0.18384377658367157,
"learning_rate": 4.806020066889633e-06,
"loss": 2.6469,
"step": 12100
},
{
"epoch": 696.5590327169275,
"eval_loss": 2.6418209075927734,
"eval_runtime": 8.5626,
"eval_samples_per_second": 162.801,
"eval_steps_per_second": 20.438,
"step": 12100
},
{
"epoch": 702.24893314367,
"grad_norm": 0.1915460228919983,
"learning_rate": 4.795986622073579e-06,
"loss": 2.6454,
"step": 12200
},
{
"epoch": 702.24893314367,
"eval_loss": 2.641967535018921,
"eval_runtime": 8.5587,
"eval_samples_per_second": 162.875,
"eval_steps_per_second": 20.447,
"step": 12200
},
{
"epoch": 707.9388335704125,
"grad_norm": 0.18700934946537018,
"learning_rate": 4.785953177257525e-06,
"loss": 2.6448,
"step": 12300
},
{
"epoch": 707.9388335704125,
"eval_loss": 2.638808012008667,
"eval_runtime": 8.5628,
"eval_samples_per_second": 162.798,
"eval_steps_per_second": 20.437,
"step": 12300
},
{
"epoch": 713.628733997155,
"grad_norm": 0.17106923460960388,
"learning_rate": 4.775919732441472e-06,
"loss": 2.6446,
"step": 12400
},
{
"epoch": 713.628733997155,
"eval_loss": 2.6404778957366943,
"eval_runtime": 8.8985,
"eval_samples_per_second": 156.655,
"eval_steps_per_second": 19.666,
"step": 12400
},
{
"epoch": 719.3186344238976,
"grad_norm": 0.17941860854625702,
"learning_rate": 4.765886287625418e-06,
"loss": 2.6436,
"step": 12500
},
{
"epoch": 719.3186344238976,
"eval_loss": 2.6373584270477295,
"eval_runtime": 8.557,
"eval_samples_per_second": 162.907,
"eval_steps_per_second": 20.451,
"step": 12500
},
{
"epoch": 725.0085348506401,
"grad_norm": 0.17565137147903442,
"learning_rate": 4.755852842809365e-06,
"loss": 2.6434,
"step": 12600
},
{
"epoch": 725.0085348506401,
"eval_loss": 2.639042377471924,
"eval_runtime": 8.5557,
"eval_samples_per_second": 162.932,
"eval_steps_per_second": 20.454,
"step": 12600
},
{
"epoch": 730.6984352773826,
"grad_norm": 0.18980301916599274,
"learning_rate": 4.745819397993312e-06,
"loss": 2.6428,
"step": 12700
},
{
"epoch": 730.6984352773826,
"eval_loss": 2.6368398666381836,
"eval_runtime": 8.9007,
"eval_samples_per_second": 156.617,
"eval_steps_per_second": 19.661,
"step": 12700
},
{
"epoch": 736.3883357041252,
"grad_norm": 0.1572832465171814,
"learning_rate": 4.7357859531772575e-06,
"loss": 2.6423,
"step": 12800
},
{
"epoch": 736.3883357041252,
"eval_loss": 2.6357386112213135,
"eval_runtime": 8.5632,
"eval_samples_per_second": 162.79,
"eval_steps_per_second": 20.436,
"step": 12800
},
{
"epoch": 742.0782361308677,
"grad_norm": 0.17804701626300812,
"learning_rate": 4.725752508361204e-06,
"loss": 2.6415,
"step": 12900
},
{
"epoch": 742.0782361308677,
"eval_loss": 2.636728525161743,
"eval_runtime": 8.5558,
"eval_samples_per_second": 162.931,
"eval_steps_per_second": 20.454,
"step": 12900
},
{
"epoch": 747.7681365576102,
"grad_norm": 0.14196521043777466,
"learning_rate": 4.715719063545151e-06,
"loss": 2.6415,
"step": 13000
},
{
"epoch": 747.7681365576102,
"eval_loss": 2.6351287364959717,
"eval_runtime": 8.5495,
"eval_samples_per_second": 163.05,
"eval_steps_per_second": 20.469,
"step": 13000
},
{
"epoch": 753.4580369843528,
"grad_norm": 0.16282819211483002,
"learning_rate": 4.705685618729097e-06,
"loss": 2.6409,
"step": 13100
},
{
"epoch": 753.4580369843528,
"eval_loss": 2.6369380950927734,
"eval_runtime": 8.8961,
"eval_samples_per_second": 156.698,
"eval_steps_per_second": 19.672,
"step": 13100
},
{
"epoch": 759.1479374110953,
"grad_norm": 0.1580921709537506,
"learning_rate": 4.695652173913044e-06,
"loss": 2.6404,
"step": 13200
},
{
"epoch": 759.1479374110953,
"eval_loss": 2.6370317935943604,
"eval_runtime": 8.5558,
"eval_samples_per_second": 162.93,
"eval_steps_per_second": 20.454,
"step": 13200
},
{
"epoch": 764.8378378378378,
"grad_norm": 0.23563043773174286,
"learning_rate": 4.6856187290969905e-06,
"loss": 2.6394,
"step": 13300
},
{
"epoch": 764.8378378378378,
"eval_loss": 2.6321442127227783,
"eval_runtime": 8.5496,
"eval_samples_per_second": 163.048,
"eval_steps_per_second": 20.469,
"step": 13300
},
{
"epoch": 770.5277382645804,
"grad_norm": 0.16354724764823914,
"learning_rate": 4.675585284280936e-06,
"loss": 2.639,
"step": 13400
},
{
"epoch": 770.5277382645804,
"eval_loss": 2.6348910331726074,
"eval_runtime": 8.9264,
"eval_samples_per_second": 156.166,
"eval_steps_per_second": 19.605,
"step": 13400
},
{
"epoch": 776.2176386913229,
"grad_norm": 0.1707228273153305,
"learning_rate": 4.665551839464883e-06,
"loss": 2.639,
"step": 13500
},
{
"epoch": 776.2176386913229,
"eval_loss": 2.635204792022705,
"eval_runtime": 8.5691,
"eval_samples_per_second": 162.678,
"eval_steps_per_second": 20.422,
"step": 13500
},
{
"epoch": 781.9075391180654,
"grad_norm": 0.16934677958488464,
"learning_rate": 4.65551839464883e-06,
"loss": 2.6385,
"step": 13600
},
{
"epoch": 781.9075391180654,
"eval_loss": 2.633455276489258,
"eval_runtime": 8.557,
"eval_samples_per_second": 162.907,
"eval_steps_per_second": 20.451,
"step": 13600
},
{
"epoch": 787.5974395448079,
"grad_norm": 0.1871781051158905,
"learning_rate": 4.645484949832776e-06,
"loss": 2.6379,
"step": 13700
},
{
"epoch": 787.5974395448079,
"eval_loss": 2.633129119873047,
"eval_runtime": 8.9094,
"eval_samples_per_second": 156.463,
"eval_steps_per_second": 19.642,
"step": 13700
},
{
"epoch": 793.2873399715505,
"grad_norm": 0.20615407824516296,
"learning_rate": 4.635451505016723e-06,
"loss": 2.6376,
"step": 13800
},
{
"epoch": 793.2873399715505,
"eval_loss": 2.634012222290039,
"eval_runtime": 8.551,
"eval_samples_per_second": 163.021,
"eval_steps_per_second": 20.465,
"step": 13800
},
{
"epoch": 798.977240398293,
"grad_norm": 0.21352247893810272,
"learning_rate": 4.625418060200669e-06,
"loss": 2.6374,
"step": 13900
},
{
"epoch": 798.977240398293,
"eval_loss": 2.6326115131378174,
"eval_runtime": 8.5835,
"eval_samples_per_second": 162.405,
"eval_steps_per_second": 20.388,
"step": 13900
},
{
"epoch": 804.6671408250355,
"grad_norm": 0.21041567623615265,
"learning_rate": 4.615384615384616e-06,
"loss": 2.6373,
"step": 14000
},
{
"epoch": 804.6671408250355,
"eval_loss": 2.632585287094116,
"eval_runtime": 8.9033,
"eval_samples_per_second": 156.571,
"eval_steps_per_second": 19.656,
"step": 14000
},
{
"epoch": 810.3570412517781,
"grad_norm": 0.16558390855789185,
"learning_rate": 4.605351170568562e-06,
"loss": 2.637,
"step": 14100
},
{
"epoch": 810.3570412517781,
"eval_loss": 2.6330647468566895,
"eval_runtime": 8.5657,
"eval_samples_per_second": 162.743,
"eval_steps_per_second": 20.43,
"step": 14100
},
{
"epoch": 816.0469416785206,
"grad_norm": 0.14121714234352112,
"learning_rate": 4.595317725752509e-06,
"loss": 2.6369,
"step": 14200
},
{
"epoch": 816.0469416785206,
"eval_loss": 2.633366823196411,
"eval_runtime": 8.5562,
"eval_samples_per_second": 162.924,
"eval_steps_per_second": 20.453,
"step": 14200
},
{
"epoch": 821.7368421052631,
"grad_norm": 0.18725652992725372,
"learning_rate": 4.585284280936456e-06,
"loss": 2.6366,
"step": 14300
},
{
"epoch": 821.7368421052631,
"eval_loss": 2.633021116256714,
"eval_runtime": 8.547,
"eval_samples_per_second": 163.099,
"eval_steps_per_second": 20.475,
"step": 14300
},
{
"epoch": 827.4267425320057,
"grad_norm": 0.17320464551448822,
"learning_rate": 4.5752508361204015e-06,
"loss": 2.6362,
"step": 14400
},
{
"epoch": 827.4267425320057,
"eval_loss": 2.6336045265197754,
"eval_runtime": 8.9019,
"eval_samples_per_second": 156.595,
"eval_steps_per_second": 19.659,
"step": 14400
},
{
"epoch": 833.1166429587482,
"grad_norm": 0.25663965940475464,
"learning_rate": 4.565217391304348e-06,
"loss": 2.6358,
"step": 14500
},
{
"epoch": 833.1166429587482,
"eval_loss": 2.629626750946045,
"eval_runtime": 8.555,
"eval_samples_per_second": 162.946,
"eval_steps_per_second": 20.456,
"step": 14500
},
{
"epoch": 838.8065433854907,
"grad_norm": 0.19742050766944885,
"learning_rate": 4.555183946488295e-06,
"loss": 2.6359,
"step": 14600
},
{
"epoch": 838.8065433854907,
"eval_loss": 2.6323554515838623,
"eval_runtime": 8.5537,
"eval_samples_per_second": 162.97,
"eval_steps_per_second": 20.459,
"step": 14600
},
{
"epoch": 844.4964438122333,
"grad_norm": 0.17131681740283966,
"learning_rate": 4.545150501672241e-06,
"loss": 2.636,
"step": 14700
},
{
"epoch": 844.4964438122333,
"eval_loss": 2.628143787384033,
"eval_runtime": 8.8882,
"eval_samples_per_second": 156.837,
"eval_steps_per_second": 19.689,
"step": 14700
},
{
"epoch": 850.1863442389758,
"grad_norm": 0.12929615378379822,
"learning_rate": 4.535117056856188e-06,
"loss": 2.6355,
"step": 14800
},
{
"epoch": 850.1863442389758,
"eval_loss": 2.62906813621521,
"eval_runtime": 8.5719,
"eval_samples_per_second": 162.624,
"eval_steps_per_second": 20.415,
"step": 14800
},
{
"epoch": 855.8762446657183,
"grad_norm": 0.1839623749256134,
"learning_rate": 4.5250836120401345e-06,
"loss": 2.6357,
"step": 14900
},
{
"epoch": 855.8762446657183,
"eval_loss": 2.6294586658477783,
"eval_runtime": 8.556,
"eval_samples_per_second": 162.926,
"eval_steps_per_second": 20.453,
"step": 14900
},
{
"epoch": 861.5661450924608,
"grad_norm": 0.16717371344566345,
"learning_rate": 4.51505016722408e-06,
"loss": 2.6344,
"step": 15000
},
{
"epoch": 861.5661450924608,
"eval_loss": 2.63043475151062,
"eval_runtime": 8.8856,
"eval_samples_per_second": 156.883,
"eval_steps_per_second": 19.695,
"step": 15000
},
{
"epoch": 867.2560455192034,
"grad_norm": 0.1823185533285141,
"learning_rate": 4.505016722408027e-06,
"loss": 2.6348,
"step": 15100
},
{
"epoch": 867.2560455192034,
"eval_loss": 2.6305038928985596,
"eval_runtime": 8.5615,
"eval_samples_per_second": 162.822,
"eval_steps_per_second": 20.44,
"step": 15100
},
{
"epoch": 872.9459459459459,
"grad_norm": 0.1823842078447342,
"learning_rate": 4.494983277591973e-06,
"loss": 2.6348,
"step": 15200
},
{
"epoch": 872.9459459459459,
"eval_loss": 2.6309924125671387,
"eval_runtime": 8.5581,
"eval_samples_per_second": 162.887,
"eval_steps_per_second": 20.449,
"step": 15200
},
{
"epoch": 878.6358463726884,
"grad_norm": 0.20153598487377167,
"learning_rate": 4.48494983277592e-06,
"loss": 2.6342,
"step": 15300
},
{
"epoch": 878.6358463726884,
"eval_loss": 2.6329071521759033,
"eval_runtime": 8.8917,
"eval_samples_per_second": 156.776,
"eval_steps_per_second": 19.681,
"step": 15300
},
{
"epoch": 884.325746799431,
"grad_norm": 0.18218009173870087,
"learning_rate": 4.474916387959866e-06,
"loss": 2.6344,
"step": 15400
},
{
"epoch": 884.325746799431,
"eval_loss": 2.6302568912506104,
"eval_runtime": 8.5652,
"eval_samples_per_second": 162.752,
"eval_steps_per_second": 20.432,
"step": 15400
},
{
"epoch": 890.0156472261735,
"grad_norm": 0.16739265620708466,
"learning_rate": 4.4648829431438125e-06,
"loss": 2.6343,
"step": 15500
},
{
"epoch": 890.0156472261735,
"eval_loss": 2.6310319900512695,
"eval_runtime": 8.5584,
"eval_samples_per_second": 162.88,
"eval_steps_per_second": 20.448,
"step": 15500
},
{
"epoch": 895.705547652916,
"grad_norm": 0.1390063315629959,
"learning_rate": 4.454849498327759e-06,
"loss": 2.6339,
"step": 15600
},
{
"epoch": 895.705547652916,
"eval_loss": 2.6301069259643555,
"eval_runtime": 8.9049,
"eval_samples_per_second": 156.544,
"eval_steps_per_second": 19.652,
"step": 15600
},
{
"epoch": 901.3954480796586,
"grad_norm": 0.18924345076084137,
"learning_rate": 4.444816053511705e-06,
"loss": 2.6339,
"step": 15700
},
{
"epoch": 901.3954480796586,
"eval_loss": 2.6323258876800537,
"eval_runtime": 8.5547,
"eval_samples_per_second": 162.952,
"eval_steps_per_second": 20.457,
"step": 15700
},
{
"epoch": 907.0853485064011,
"grad_norm": 0.18514582514762878,
"learning_rate": 4.434782608695652e-06,
"loss": 2.6338,
"step": 15800
},
{
"epoch": 907.0853485064011,
"eval_loss": 2.629317045211792,
"eval_runtime": 8.5557,
"eval_samples_per_second": 162.933,
"eval_steps_per_second": 20.454,
"step": 15800
},
{
"epoch": 912.7752489331436,
"grad_norm": 0.16134916245937347,
"learning_rate": 4.424749163879599e-06,
"loss": 2.6332,
"step": 15900
},
{
"epoch": 912.7752489331436,
"eval_loss": 2.6283786296844482,
"eval_runtime": 8.8916,
"eval_samples_per_second": 156.778,
"eval_steps_per_second": 19.682,
"step": 15900
},
{
"epoch": 918.4651493598863,
"grad_norm": 0.15325242280960083,
"learning_rate": 4.414715719063545e-06,
"loss": 2.6327,
"step": 16000
},
{
"epoch": 918.4651493598863,
"eval_loss": 2.628596305847168,
"eval_runtime": 8.5648,
"eval_samples_per_second": 162.759,
"eval_steps_per_second": 20.432,
"step": 16000
},
{
"epoch": 924.1550497866288,
"grad_norm": 0.16646109521389008,
"learning_rate": 4.404682274247491e-06,
"loss": 2.6334,
"step": 16100
},
{
"epoch": 924.1550497866288,
"eval_loss": 2.6277356147766113,
"eval_runtime": 8.5578,
"eval_samples_per_second": 162.891,
"eval_steps_per_second": 20.449,
"step": 16100
},
{
"epoch": 929.8449502133712,
"grad_norm": 0.190487802028656,
"learning_rate": 4.394648829431438e-06,
"loss": 2.6324,
"step": 16200
},
{
"epoch": 929.8449502133712,
"eval_loss": 2.632991075515747,
"eval_runtime": 8.8984,
"eval_samples_per_second": 156.657,
"eval_steps_per_second": 19.666,
"step": 16200
},
{
"epoch": 935.5348506401137,
"grad_norm": 0.12819956243038177,
"learning_rate": 4.384615384615384e-06,
"loss": 2.6329,
"step": 16300
},
{
"epoch": 935.5348506401137,
"eval_loss": 2.6287131309509277,
"eval_runtime": 8.5512,
"eval_samples_per_second": 163.019,
"eval_steps_per_second": 20.465,
"step": 16300
},
{
"epoch": 941.2247510668564,
"grad_norm": 0.1414095014333725,
"learning_rate": 4.374581939799331e-06,
"loss": 2.6329,
"step": 16400
},
{
"epoch": 941.2247510668564,
"eval_loss": 2.6280200481414795,
"eval_runtime": 8.5543,
"eval_samples_per_second": 162.96,
"eval_steps_per_second": 20.458,
"step": 16400
},
{
"epoch": 946.9146514935989,
"grad_norm": 0.1598784625530243,
"learning_rate": 4.364548494983278e-06,
"loss": 2.6321,
"step": 16500
},
{
"epoch": 946.9146514935989,
"eval_loss": 2.627798080444336,
"eval_runtime": 8.5617,
"eval_samples_per_second": 162.818,
"eval_steps_per_second": 20.44,
"step": 16500
},
{
"epoch": 952.6045519203414,
"grad_norm": 0.16925720870494843,
"learning_rate": 4.354515050167224e-06,
"loss": 2.6323,
"step": 16600
},
{
"epoch": 952.6045519203414,
"eval_loss": 2.627779722213745,
"eval_runtime": 8.8962,
"eval_samples_per_second": 156.697,
"eval_steps_per_second": 19.671,
"step": 16600
},
{
"epoch": 958.294452347084,
"grad_norm": 0.14368008077144623,
"learning_rate": 4.34448160535117e-06,
"loss": 2.6325,
"step": 16700
},
{
"epoch": 958.294452347084,
"eval_loss": 2.6313493251800537,
"eval_runtime": 8.5564,
"eval_samples_per_second": 162.919,
"eval_steps_per_second": 20.452,
"step": 16700
},
{
"epoch": 963.9843527738265,
"grad_norm": 0.17267128825187683,
"learning_rate": 4.334448160535117e-06,
"loss": 2.6323,
"step": 16800
},
{
"epoch": 963.9843527738265,
"eval_loss": 2.628115653991699,
"eval_runtime": 8.9,
"eval_samples_per_second": 156.628,
"eval_steps_per_second": 19.663,
"step": 16800
},
{
"epoch": 969.674253200569,
"grad_norm": 0.19119863212108612,
"learning_rate": 4.324414715719064e-06,
"loss": 2.6318,
"step": 16900
},
{
"epoch": 969.674253200569,
"eval_loss": 2.627437114715576,
"eval_runtime": 8.5548,
"eval_samples_per_second": 162.95,
"eval_steps_per_second": 20.456,
"step": 16900
},
{
"epoch": 975.3641536273116,
"grad_norm": 0.13695764541625977,
"learning_rate": 4.31438127090301e-06,
"loss": 2.6315,
"step": 17000
},
{
"epoch": 975.3641536273116,
"eval_loss": 2.6262221336364746,
"eval_runtime": 8.8997,
"eval_samples_per_second": 156.634,
"eval_steps_per_second": 19.664,
"step": 17000
},
{
"epoch": 981.0540540540541,
"grad_norm": 0.14241984486579895,
"learning_rate": 4.3043478260869565e-06,
"loss": 2.6318,
"step": 17100
},
{
"epoch": 981.0540540540541,
"eval_loss": 2.6269607543945312,
"eval_runtime": 8.5513,
"eval_samples_per_second": 163.017,
"eval_steps_per_second": 20.465,
"step": 17100
},
{
"epoch": 986.7439544807966,
"grad_norm": 0.15792237222194672,
"learning_rate": 4.294314381270903e-06,
"loss": 2.6315,
"step": 17200
},
{
"epoch": 986.7439544807966,
"eval_loss": 2.6278719902038574,
"eval_runtime": 8.6117,
"eval_samples_per_second": 161.873,
"eval_steps_per_second": 20.321,
"step": 17200
},
{
"epoch": 992.4338549075392,
"grad_norm": 0.17118434607982635,
"learning_rate": 4.284280936454849e-06,
"loss": 2.6316,
"step": 17300
},
{
"epoch": 992.4338549075392,
"eval_loss": 2.6280527114868164,
"eval_runtime": 8.9035,
"eval_samples_per_second": 156.568,
"eval_steps_per_second": 19.655,
"step": 17300
},
{
"epoch": 998.1237553342817,
"grad_norm": 0.15846611559391022,
"learning_rate": 4.274247491638796e-06,
"loss": 2.6313,
"step": 17400
},
{
"epoch": 998.1237553342817,
"eval_loss": 2.6250662803649902,
"eval_runtime": 8.6057,
"eval_samples_per_second": 161.986,
"eval_steps_per_second": 20.335,
"step": 17400
},
{
"epoch": 1003.8136557610242,
"grad_norm": 0.17078837752342224,
"learning_rate": 4.264214046822743e-06,
"loss": 2.6312,
"step": 17500
},
{
"epoch": 1003.8136557610242,
"eval_loss": 2.6274046897888184,
"eval_runtime": 8.564,
"eval_samples_per_second": 162.775,
"eval_steps_per_second": 20.434,
"step": 17500
},
{
"epoch": 1009.5035561877667,
"grad_norm": 0.1965128779411316,
"learning_rate": 4.254180602006689e-06,
"loss": 2.6312,
"step": 17600
},
{
"epoch": 1009.5035561877667,
"eval_loss": 2.6278066635131836,
"eval_runtime": 8.9009,
"eval_samples_per_second": 156.614,
"eval_steps_per_second": 19.661,
"step": 17600
},
{
"epoch": 1015.1934566145093,
"grad_norm": 0.19483456015586853,
"learning_rate": 4.244147157190635e-06,
"loss": 2.6311,
"step": 17700
},
{
"epoch": 1015.1934566145093,
"eval_loss": 2.623715400695801,
"eval_runtime": 8.5597,
"eval_samples_per_second": 162.857,
"eval_steps_per_second": 20.445,
"step": 17700
},
{
"epoch": 1020.8833570412518,
"grad_norm": 0.14647985994815826,
"learning_rate": 4.234113712374582e-06,
"loss": 2.6309,
"step": 17800
},
{
"epoch": 1020.8833570412518,
"eval_loss": 2.625011920928955,
"eval_runtime": 8.8958,
"eval_samples_per_second": 156.704,
"eval_steps_per_second": 19.672,
"step": 17800
},
{
"epoch": 1026.5732574679944,
"grad_norm": 0.1495138704776764,
"learning_rate": 4.224080267558528e-06,
"loss": 2.6303,
"step": 17900
},
{
"epoch": 1026.5732574679944,
"eval_loss": 2.6249139308929443,
"eval_runtime": 8.5668,
"eval_samples_per_second": 162.72,
"eval_steps_per_second": 20.428,
"step": 17900
},
{
"epoch": 1032.2631578947369,
"grad_norm": 0.1665605753660202,
"learning_rate": 4.214046822742475e-06,
"loss": 2.6305,
"step": 18000
},
{
"epoch": 1032.2631578947369,
"eval_loss": 2.6271395683288574,
"eval_runtime": 8.8963,
"eval_samples_per_second": 156.694,
"eval_steps_per_second": 19.671,
"step": 18000
},
{
"epoch": 1037.9530583214794,
"grad_norm": 0.1886260211467743,
"learning_rate": 4.2040133779264216e-06,
"loss": 2.6307,
"step": 18100
},
{
"epoch": 1037.9530583214794,
"eval_loss": 2.6232030391693115,
"eval_runtime": 8.5585,
"eval_samples_per_second": 162.879,
"eval_steps_per_second": 20.447,
"step": 18100
},
{
"epoch": 1043.6429587482219,
"grad_norm": 0.1451101154088974,
"learning_rate": 4.1939799331103675e-06,
"loss": 2.6304,
"step": 18200
},
{
"epoch": 1043.6429587482219,
"eval_loss": 2.624784469604492,
"eval_runtime": 8.5566,
"eval_samples_per_second": 162.916,
"eval_steps_per_second": 20.452,
"step": 18200
},
{
"epoch": 1049.3328591749644,
"grad_norm": 0.13841372728347778,
"learning_rate": 4.183946488294314e-06,
"loss": 2.6305,
"step": 18300
},
{
"epoch": 1049.3328591749644,
"eval_loss": 2.626993179321289,
"eval_runtime": 8.9131,
"eval_samples_per_second": 156.398,
"eval_steps_per_second": 19.634,
"step": 18300
},
{
"epoch": 1055.0227596017069,
"grad_norm": 0.1455683559179306,
"learning_rate": 4.173913043478261e-06,
"loss": 2.6301,
"step": 18400
},
{
"epoch": 1055.0227596017069,
"eval_loss": 2.6284282207489014,
"eval_runtime": 8.555,
"eval_samples_per_second": 162.946,
"eval_steps_per_second": 20.456,
"step": 18400
},
{
"epoch": 1060.7126600284496,
"grad_norm": 0.14764897525310516,
"learning_rate": 4.163879598662208e-06,
"loss": 2.6305,
"step": 18500
},
{
"epoch": 1060.7126600284496,
"eval_loss": 2.626128673553467,
"eval_runtime": 8.5552,
"eval_samples_per_second": 162.942,
"eval_steps_per_second": 20.455,
"step": 18500
},
{
"epoch": 1066.402560455192,
"grad_norm": 0.15602290630340576,
"learning_rate": 4.153846153846154e-06,
"loss": 2.6303,
"step": 18600
},
{
"epoch": 1066.402560455192,
"eval_loss": 2.6236324310302734,
"eval_runtime": 8.5623,
"eval_samples_per_second": 162.806,
"eval_steps_per_second": 20.438,
"step": 18600
},
{
"epoch": 1099.8036984352773,
"grad_norm": 0.1434181034564972,
"learning_rate": 4.1438127090301005e-06,
"loss": 2.6299,
"step": 18700
},
{
"epoch": 1099.8036984352773,
"eval_loss": 2.6240837574005127,
"eval_runtime": 8.8817,
"eval_samples_per_second": 156.952,
"eval_steps_per_second": 19.703,
"step": 18700
},
{
"epoch": 1105.49359886202,
"grad_norm": 0.17055080831050873,
"learning_rate": 4.133779264214047e-06,
"loss": 2.6296,
"step": 18800
},
{
"epoch": 1105.49359886202,
"eval_loss": 2.627481698989868,
"eval_runtime": 8.5387,
"eval_samples_per_second": 163.257,
"eval_steps_per_second": 20.495,
"step": 18800
},
{
"epoch": 1111.1834992887625,
"grad_norm": 0.15118207037448883,
"learning_rate": 4.123745819397993e-06,
"loss": 2.6295,
"step": 18900
},
{
"epoch": 1111.1834992887625,
"eval_loss": 2.6243932247161865,
"eval_runtime": 8.5529,
"eval_samples_per_second": 162.985,
"eval_steps_per_second": 20.461,
"step": 18900
},
{
"epoch": 1116.873399715505,
"grad_norm": 0.14333444833755493,
"learning_rate": 4.11371237458194e-06,
"loss": 2.6294,
"step": 19000
},
{
"epoch": 1116.873399715505,
"eval_loss": 2.6264147758483887,
"eval_runtime": 8.8805,
"eval_samples_per_second": 156.973,
"eval_steps_per_second": 19.706,
"step": 19000
},
{
"epoch": 1122.5633001422475,
"grad_norm": 0.13676032423973083,
"learning_rate": 4.103678929765887e-06,
"loss": 2.6292,
"step": 19100
},
{
"epoch": 1122.5633001422475,
"eval_loss": 2.6256096363067627,
"eval_runtime": 8.5396,
"eval_samples_per_second": 163.24,
"eval_steps_per_second": 20.493,
"step": 19100
},
{
"epoch": 1128.25320056899,
"grad_norm": 0.13608410954475403,
"learning_rate": 4.0936454849498326e-06,
"loss": 2.6292,
"step": 19200
},
{
"epoch": 1128.25320056899,
"eval_loss": 2.6272470951080322,
"eval_runtime": 8.5412,
"eval_samples_per_second": 163.209,
"eval_steps_per_second": 20.489,
"step": 19200
},
{
"epoch": 1133.9431009957325,
"grad_norm": 0.16941364109516144,
"learning_rate": 4.083612040133779e-06,
"loss": 2.6294,
"step": 19300
},
{
"epoch": 1133.9431009957325,
"eval_loss": 2.6245925426483154,
"eval_runtime": 8.8711,
"eval_samples_per_second": 157.14,
"eval_steps_per_second": 19.727,
"step": 19300
},
{
"epoch": 1139.6330014224752,
"grad_norm": 0.17961208522319794,
"learning_rate": 4.073578595317726e-06,
"loss": 2.6291,
"step": 19400
},
{
"epoch": 1139.6330014224752,
"eval_loss": 2.6260921955108643,
"eval_runtime": 8.5481,
"eval_samples_per_second": 163.078,
"eval_steps_per_second": 20.472,
"step": 19400
},
{
"epoch": 1145.3229018492177,
"grad_norm": 0.15234056115150452,
"learning_rate": 4.063545150501672e-06,
"loss": 2.6288,
"step": 19500
},
{
"epoch": 1145.3229018492177,
"eval_loss": 2.624178647994995,
"eval_runtime": 8.5458,
"eval_samples_per_second": 163.121,
"eval_steps_per_second": 20.478,
"step": 19500
},
{
"epoch": 1151.0128022759602,
"grad_norm": 0.1660071462392807,
"learning_rate": 4.053511705685619e-06,
"loss": 2.6289,
"step": 19600
},
{
"epoch": 1151.0128022759602,
"eval_loss": 2.625214099884033,
"eval_runtime": 8.8562,
"eval_samples_per_second": 157.404,
"eval_steps_per_second": 19.76,
"step": 19600
},
{
"epoch": 1156.7027027027027,
"grad_norm": 0.1432279646396637,
"learning_rate": 4.0434782608695655e-06,
"loss": 2.6288,
"step": 19700
},
{
"epoch": 1156.7027027027027,
"eval_loss": 2.6248371601104736,
"eval_runtime": 8.5388,
"eval_samples_per_second": 163.255,
"eval_steps_per_second": 20.495,
"step": 19700
},
{
"epoch": 1162.3926031294452,
"grad_norm": 0.13359645009040833,
"learning_rate": 4.0334448160535115e-06,
"loss": 2.6291,
"step": 19800
},
{
"epoch": 1162.3926031294452,
"eval_loss": 2.6228439807891846,
"eval_runtime": 8.5401,
"eval_samples_per_second": 163.23,
"eval_steps_per_second": 20.492,
"step": 19800
},
{
"epoch": 1168.0825035561877,
"grad_norm": 0.18464621901512146,
"learning_rate": 4.023411371237458e-06,
"loss": 2.6286,
"step": 19900
},
{
"epoch": 1168.0825035561877,
"eval_loss": 2.624844789505005,
"eval_runtime": 8.5435,
"eval_samples_per_second": 163.165,
"eval_steps_per_second": 20.483,
"step": 19900
},
{
"epoch": 1173.7724039829302,
"grad_norm": 0.14693519473075867,
"learning_rate": 4.013377926421405e-06,
"loss": 2.6282,
"step": 20000
},
{
"epoch": 1173.7724039829302,
"eval_loss": 2.625211238861084,
"eval_runtime": 8.876,
"eval_samples_per_second": 157.053,
"eval_steps_per_second": 19.716,
"step": 20000
},
{
"epoch": 1179.462304409673,
"grad_norm": 0.14849957823753357,
"learning_rate": 4.003344481605351e-06,
"loss": 2.6281,
"step": 20100
},
{
"epoch": 1179.462304409673,
"eval_loss": 2.6256697177886963,
"eval_runtime": 8.8789,
"eval_samples_per_second": 157.002,
"eval_steps_per_second": 19.71,
"step": 20100
},
{
"epoch": 1185.1522048364154,
"grad_norm": 0.1465172916650772,
"learning_rate": 3.993311036789298e-06,
"loss": 2.6279,
"step": 20200
},
{
"epoch": 1185.1522048364154,
"eval_loss": 2.6242611408233643,
"eval_runtime": 8.5384,
"eval_samples_per_second": 163.263,
"eval_steps_per_second": 20.496,
"step": 20200
},
{
"epoch": 1190.842105263158,
"grad_norm": 0.15794384479522705,
"learning_rate": 3.9832775919732444e-06,
"loss": 2.6281,
"step": 20300
},
{
"epoch": 1190.842105263158,
"eval_loss": 2.623426914215088,
"eval_runtime": 8.5314,
"eval_samples_per_second": 163.397,
"eval_steps_per_second": 20.512,
"step": 20300
},
{
"epoch": 1196.5320056899004,
"grad_norm": 0.1284749060869217,
"learning_rate": 3.97324414715719e-06,
"loss": 2.6281,
"step": 20400
},
{
"epoch": 1196.5320056899004,
"eval_loss": 2.622859239578247,
"eval_runtime": 8.863,
"eval_samples_per_second": 157.283,
"eval_steps_per_second": 19.745,
"step": 20400
},
{
"epoch": 1202.221906116643,
"grad_norm": 0.1669575572013855,
"learning_rate": 3.963210702341137e-06,
"loss": 2.6281,
"step": 20500
},
{
"epoch": 1202.221906116643,
"eval_loss": 2.6228220462799072,
"eval_runtime": 8.5362,
"eval_samples_per_second": 163.305,
"eval_steps_per_second": 20.501,
"step": 20500
},
{
"epoch": 1207.9118065433854,
"grad_norm": 0.12002875655889511,
"learning_rate": 3.953177257525084e-06,
"loss": 2.6284,
"step": 20600
},
{
"epoch": 1207.9118065433854,
"eval_loss": 2.6229088306427,
"eval_runtime": 8.5208,
"eval_samples_per_second": 163.599,
"eval_steps_per_second": 20.538,
"step": 20600
},
{
"epoch": 1213.6017069701281,
"grad_norm": 0.14911407232284546,
"learning_rate": 3.943143812709031e-06,
"loss": 2.6278,
"step": 20700
},
{
"epoch": 1213.6017069701281,
"eval_loss": 2.6207728385925293,
"eval_runtime": 8.5412,
"eval_samples_per_second": 163.208,
"eval_steps_per_second": 20.489,
"step": 20700
},
{
"epoch": 1219.2916073968706,
"grad_norm": 0.1687910258769989,
"learning_rate": 3.9331103678929765e-06,
"loss": 2.6277,
"step": 20800
},
{
"epoch": 1219.2916073968706,
"eval_loss": 2.623382806777954,
"eval_runtime": 8.8763,
"eval_samples_per_second": 157.047,
"eval_steps_per_second": 19.715,
"step": 20800
},
{
"epoch": 1224.9815078236131,
"grad_norm": 0.1914646476507187,
"learning_rate": 3.923076923076923e-06,
"loss": 2.6271,
"step": 20900
},
{
"epoch": 1224.9815078236131,
"eval_loss": 2.6222121715545654,
"eval_runtime": 8.5242,
"eval_samples_per_second": 163.535,
"eval_steps_per_second": 20.53,
"step": 20900
},
{
"epoch": 1230.6714082503556,
"grad_norm": 0.15010875463485718,
"learning_rate": 3.91304347826087e-06,
"loss": 2.6276,
"step": 21000
},
{
"epoch": 1230.6714082503556,
"eval_loss": 2.6212801933288574,
"eval_runtime": 8.5399,
"eval_samples_per_second": 163.234,
"eval_steps_per_second": 20.492,
"step": 21000
},
{
"epoch": 1236.3613086770981,
"grad_norm": 0.1383567601442337,
"learning_rate": 3.903010033444816e-06,
"loss": 2.6275,
"step": 21100
},
{
"epoch": 1236.3613086770981,
"eval_loss": 2.6240437030792236,
"eval_runtime": 8.5381,
"eval_samples_per_second": 163.269,
"eval_steps_per_second": 20.496,
"step": 21100
},
{
"epoch": 1242.0512091038406,
"grad_norm": 0.15790875256061554,
"learning_rate": 3.892976588628763e-06,
"loss": 2.6267,
"step": 21200
},
{
"epoch": 1242.0512091038406,
"eval_loss": 2.623500108718872,
"eval_runtime": 8.8811,
"eval_samples_per_second": 156.963,
"eval_steps_per_second": 19.705,
"step": 21200
},
{
"epoch": 1247.7411095305831,
"grad_norm": 0.15240466594696045,
"learning_rate": 3.8829431438127095e-06,
"loss": 2.6269,
"step": 21300
},
{
"epoch": 1247.7411095305831,
"eval_loss": 2.6207492351531982,
"eval_runtime": 8.5211,
"eval_samples_per_second": 163.594,
"eval_steps_per_second": 20.537,
"step": 21300
},
{
"epoch": 1253.4310099573258,
"grad_norm": 0.1933618187904358,
"learning_rate": 3.8729096989966554e-06,
"loss": 2.627,
"step": 21400
},
{
"epoch": 1253.4310099573258,
"eval_loss": 2.62373948097229,
"eval_runtime": 8.5399,
"eval_samples_per_second": 163.235,
"eval_steps_per_second": 20.492,
"step": 21400
},
{
"epoch": 1259.1209103840683,
"grad_norm": 0.17298194766044617,
"learning_rate": 3.862876254180602e-06,
"loss": 2.6273,
"step": 21500
},
{
"epoch": 1259.1209103840683,
"eval_loss": 2.626997232437134,
"eval_runtime": 8.88,
"eval_samples_per_second": 156.981,
"eval_steps_per_second": 19.707,
"step": 21500
},
{
"epoch": 1264.8108108108108,
"grad_norm": 0.15336528420448303,
"learning_rate": 3.852842809364549e-06,
"loss": 2.6276,
"step": 21600
},
{
"epoch": 1264.8108108108108,
"eval_loss": 2.6227035522460938,
"eval_runtime": 8.5395,
"eval_samples_per_second": 163.241,
"eval_steps_per_second": 20.493,
"step": 21600
},
{
"epoch": 1270.5007112375533,
"grad_norm": 0.1456770896911621,
"learning_rate": 3.842809364548495e-06,
"loss": 2.6264,
"step": 21700
},
{
"epoch": 1270.5007112375533,
"eval_loss": 2.6244804859161377,
"eval_runtime": 8.5371,
"eval_samples_per_second": 163.287,
"eval_steps_per_second": 20.499,
"step": 21700
},
{
"epoch": 1276.1906116642958,
"grad_norm": 0.14131468534469604,
"learning_rate": 3.832775919732442e-06,
"loss": 2.6261,
"step": 21800
},
{
"epoch": 1276.1906116642958,
"eval_loss": 2.6235532760620117,
"eval_runtime": 8.8848,
"eval_samples_per_second": 156.898,
"eval_steps_per_second": 19.697,
"step": 21800
},
{
"epoch": 1281.8805120910383,
"grad_norm": 0.16801823675632477,
"learning_rate": 3.822742474916388e-06,
"loss": 2.6266,
"step": 21900
},
{
"epoch": 1281.8805120910383,
"eval_loss": 2.6234781742095947,
"eval_runtime": 8.5394,
"eval_samples_per_second": 163.243,
"eval_steps_per_second": 20.493,
"step": 21900
},
{
"epoch": 1287.570412517781,
"grad_norm": 0.13501711189746857,
"learning_rate": 3.8127090301003347e-06,
"loss": 2.6261,
"step": 22000
},
{
"epoch": 1287.570412517781,
"eval_loss": 2.621072292327881,
"eval_runtime": 8.5307,
"eval_samples_per_second": 163.41,
"eval_steps_per_second": 20.514,
"step": 22000
},
{
"epoch": 1293.2603129445235,
"grad_norm": 0.14802291989326477,
"learning_rate": 3.802675585284281e-06,
"loss": 2.6267,
"step": 22100
},
{
"epoch": 1293.2603129445235,
"eval_loss": 2.625509023666382,
"eval_runtime": 8.876,
"eval_samples_per_second": 157.052,
"eval_steps_per_second": 19.716,
"step": 22100
},
{
"epoch": 1298.950213371266,
"grad_norm": 0.149693563580513,
"learning_rate": 3.792642140468228e-06,
"loss": 2.6266,
"step": 22200
},
{
"epoch": 1298.950213371266,
"eval_loss": 2.6230156421661377,
"eval_runtime": 8.5523,
"eval_samples_per_second": 162.996,
"eval_steps_per_second": 20.462,
"step": 22200
},
{
"epoch": 1304.6401137980085,
"grad_norm": 0.16010881960391998,
"learning_rate": 3.782608695652174e-06,
"loss": 2.6263,
"step": 22300
},
{
"epoch": 1304.6401137980085,
"eval_loss": 2.623608112335205,
"eval_runtime": 8.5405,
"eval_samples_per_second": 163.222,
"eval_steps_per_second": 20.491,
"step": 22300
},
{
"epoch": 1310.330014224751,
"grad_norm": 0.1507118195295334,
"learning_rate": 3.7725752508361205e-06,
"loss": 2.6262,
"step": 22400
},
{
"epoch": 1310.330014224751,
"eval_loss": 2.6196281909942627,
"eval_runtime": 8.8776,
"eval_samples_per_second": 157.024,
"eval_steps_per_second": 19.713,
"step": 22400
},
{
"epoch": 1316.0199146514935,
"grad_norm": 0.12015032023191452,
"learning_rate": 3.7625418060200673e-06,
"loss": 2.6261,
"step": 22500
},
{
"epoch": 1316.0199146514935,
"eval_loss": 2.6222877502441406,
"eval_runtime": 8.5476,
"eval_samples_per_second": 163.086,
"eval_steps_per_second": 20.474,
"step": 22500
},
{
"epoch": 1321.709815078236,
"grad_norm": 0.14796671271324158,
"learning_rate": 3.7525083612040136e-06,
"loss": 2.6261,
"step": 22600
},
{
"epoch": 1321.709815078236,
"eval_loss": 2.623142957687378,
"eval_runtime": 8.8638,
"eval_samples_per_second": 157.268,
"eval_steps_per_second": 19.743,
"step": 22600
},
{
"epoch": 1327.3997155049788,
"grad_norm": 0.14206399023532867,
"learning_rate": 3.74247491638796e-06,
"loss": 2.6261,
"step": 22700
},
{
"epoch": 1327.3997155049788,
"eval_loss": 2.620297431945801,
"eval_runtime": 8.5358,
"eval_samples_per_second": 163.313,
"eval_steps_per_second": 20.502,
"step": 22700
},
{
"epoch": 1333.0896159317213,
"grad_norm": 0.1448485553264618,
"learning_rate": 3.7324414715719067e-06,
"loss": 2.6258,
"step": 22800
},
{
"epoch": 1333.0896159317213,
"eval_loss": 2.6241817474365234,
"eval_runtime": 8.5523,
"eval_samples_per_second": 162.998,
"eval_steps_per_second": 20.462,
"step": 22800
},
{
"epoch": 1338.7795163584638,
"grad_norm": 0.14887595176696777,
"learning_rate": 3.722408026755853e-06,
"loss": 2.6255,
"step": 22900
},
{
"epoch": 1338.7795163584638,
"eval_loss": 2.622042179107666,
"eval_runtime": 8.902,
"eval_samples_per_second": 156.594,
"eval_steps_per_second": 19.658,
"step": 22900
},
{
"epoch": 1344.4694167852062,
"grad_norm": 0.16686739027500153,
"learning_rate": 3.7123745819398e-06,
"loss": 2.6258,
"step": 23000
},
{
"epoch": 1344.4694167852062,
"eval_loss": 2.6229121685028076,
"eval_runtime": 8.5332,
"eval_samples_per_second": 163.362,
"eval_steps_per_second": 20.508,
"step": 23000
},
{
"epoch": 1350.1593172119487,
"grad_norm": 0.16153846681118011,
"learning_rate": 3.702341137123746e-06,
"loss": 2.6257,
"step": 23100
},
{
"epoch": 1350.1593172119487,
"eval_loss": 2.6239538192749023,
"eval_runtime": 8.8767,
"eval_samples_per_second": 157.04,
"eval_steps_per_second": 19.715,
"step": 23100
},
{
"epoch": 1355.8492176386912,
"grad_norm": 0.1725204735994339,
"learning_rate": 3.6923076923076925e-06,
"loss": 2.6258,
"step": 23200
},
{
"epoch": 1355.8492176386912,
"eval_loss": 2.6215097904205322,
"eval_runtime": 8.5286,
"eval_samples_per_second": 163.45,
"eval_steps_per_second": 20.519,
"step": 23200
},
{
"epoch": 1361.539118065434,
"grad_norm": 0.12999078631401062,
"learning_rate": 3.6822742474916393e-06,
"loss": 2.6253,
"step": 23300
},
{
"epoch": 1361.539118065434,
"eval_loss": 2.6233925819396973,
"eval_runtime": 8.8885,
"eval_samples_per_second": 156.832,
"eval_steps_per_second": 19.688,
"step": 23300
},
{
"epoch": 1367.2290184921765,
"grad_norm": 0.1744973212480545,
"learning_rate": 3.6722408026755856e-06,
"loss": 2.6257,
"step": 23400
},
{
"epoch": 1367.2290184921765,
"eval_loss": 2.623767614364624,
"eval_runtime": 8.5312,
"eval_samples_per_second": 163.401,
"eval_steps_per_second": 20.513,
"step": 23400
},
{
"epoch": 1372.918918918919,
"grad_norm": 0.13030101358890533,
"learning_rate": 3.662207357859532e-06,
"loss": 2.6254,
"step": 23500
},
{
"epoch": 1372.918918918919,
"eval_loss": 2.622628927230835,
"eval_runtime": 8.8974,
"eval_samples_per_second": 156.675,
"eval_steps_per_second": 19.669,
"step": 23500
},
{
"epoch": 1378.6088193456615,
"grad_norm": 0.15082061290740967,
"learning_rate": 3.6521739130434787e-06,
"loss": 2.6258,
"step": 23600
},
{
"epoch": 1378.6088193456615,
"eval_loss": 2.62248158454895,
"eval_runtime": 8.5269,
"eval_samples_per_second": 163.482,
"eval_steps_per_second": 20.523,
"step": 23600
},
{
"epoch": 1384.298719772404,
"grad_norm": 0.1196790486574173,
"learning_rate": 3.642140468227425e-06,
"loss": 2.6254,
"step": 23700
},
{
"epoch": 1384.298719772404,
"eval_loss": 2.618326187133789,
"eval_runtime": 8.5494,
"eval_samples_per_second": 163.052,
"eval_steps_per_second": 20.469,
"step": 23700
},
{
"epoch": 1389.9886201991465,
"grad_norm": 0.168843612074852,
"learning_rate": 3.6321070234113714e-06,
"loss": 2.6249,
"step": 23800
},
{
"epoch": 1389.9886201991465,
"eval_loss": 2.621375799179077,
"eval_runtime": 8.859,
"eval_samples_per_second": 157.355,
"eval_steps_per_second": 19.754,
"step": 23800
},
{
"epoch": 1395.678520625889,
"grad_norm": 0.1318158209323883,
"learning_rate": 3.622073578595318e-06,
"loss": 2.6248,
"step": 23900
},
{
"epoch": 1395.678520625889,
"eval_loss": 2.6230545043945312,
"eval_runtime": 8.5184,
"eval_samples_per_second": 163.646,
"eval_steps_per_second": 20.544,
"step": 23900
},
{
"epoch": 1401.3684210526317,
"grad_norm": 0.14110194146633148,
"learning_rate": 3.6120401337792645e-06,
"loss": 2.6252,
"step": 24000
},
{
"epoch": 1401.3684210526317,
"eval_loss": 2.6207733154296875,
"eval_runtime": 8.5346,
"eval_samples_per_second": 163.335,
"eval_steps_per_second": 20.505,
"step": 24000
},
{
"epoch": 1407.0583214793742,
"grad_norm": 0.14449109137058258,
"learning_rate": 3.6020066889632112e-06,
"loss": 2.6245,
"step": 24100
},
{
"epoch": 1407.0583214793742,
"eval_loss": 2.6209616661071777,
"eval_runtime": 8.5416,
"eval_samples_per_second": 163.201,
"eval_steps_per_second": 20.488,
"step": 24100
},
{
"epoch": 1412.7482219061167,
"grad_norm": 0.12893743813037872,
"learning_rate": 3.5919732441471576e-06,
"loss": 2.6247,
"step": 24200
},
{
"epoch": 1412.7482219061167,
"eval_loss": 2.6214792728424072,
"eval_runtime": 8.8839,
"eval_samples_per_second": 156.913,
"eval_steps_per_second": 19.699,
"step": 24200
},
{
"epoch": 1418.4381223328592,
"grad_norm": 0.15788990259170532,
"learning_rate": 3.581939799331104e-06,
"loss": 2.6249,
"step": 24300
},
{
"epoch": 1418.4381223328592,
"eval_loss": 2.6239373683929443,
"eval_runtime": 8.5329,
"eval_samples_per_second": 163.368,
"eval_steps_per_second": 20.509,
"step": 24300
},
{
"epoch": 1424.1280227596017,
"grad_norm": 0.14352256059646606,
"learning_rate": 3.5719063545150507e-06,
"loss": 2.6244,
"step": 24400
},
{
"epoch": 1424.1280227596017,
"eval_loss": 2.621476888656616,
"eval_runtime": 8.8748,
"eval_samples_per_second": 157.073,
"eval_steps_per_second": 19.719,
"step": 24400
},
{
"epoch": 1429.8179231863442,
"grad_norm": 0.1311691254377365,
"learning_rate": 3.561872909698997e-06,
"loss": 2.6243,
"step": 24500
},
{
"epoch": 1429.8179231863442,
"eval_loss": 2.6242871284484863,
"eval_runtime": 8.5283,
"eval_samples_per_second": 163.456,
"eval_steps_per_second": 20.52,
"step": 24500
},
{
"epoch": 1435.5078236130869,
"grad_norm": 0.15464642643928528,
"learning_rate": 3.5518394648829434e-06,
"loss": 2.624,
"step": 24600
},
{
"epoch": 1435.5078236130869,
"eval_loss": 2.6201913356781006,
"eval_runtime": 8.8606,
"eval_samples_per_second": 157.326,
"eval_steps_per_second": 19.75,
"step": 24600
},
{
"epoch": 1441.1977240398294,
"grad_norm": 0.19396920502185822,
"learning_rate": 3.54180602006689e-06,
"loss": 2.625,
"step": 24700
},
{
"epoch": 1441.1977240398294,
"eval_loss": 2.619835138320923,
"eval_runtime": 8.5378,
"eval_samples_per_second": 163.275,
"eval_steps_per_second": 20.497,
"step": 24700
},
{
"epoch": 1446.8876244665719,
"grad_norm": 0.16594748198986053,
"learning_rate": 3.5317725752508365e-06,
"loss": 2.6238,
"step": 24800
},
{
"epoch": 1446.8876244665719,
"eval_loss": 2.620967388153076,
"eval_runtime": 8.5405,
"eval_samples_per_second": 163.221,
"eval_steps_per_second": 20.49,
"step": 24800
},
{
"epoch": 1452.5775248933144,
"grad_norm": 0.12998247146606445,
"learning_rate": 3.521739130434783e-06,
"loss": 2.6237,
"step": 24900
},
{
"epoch": 1452.5775248933144,
"eval_loss": 2.622404098510742,
"eval_runtime": 8.5397,
"eval_samples_per_second": 163.237,
"eval_steps_per_second": 20.492,
"step": 24900
},
{
"epoch": 1458.2674253200569,
"grad_norm": 0.15071412920951843,
"learning_rate": 3.5117056856187296e-06,
"loss": 2.6245,
"step": 25000
},
{
"epoch": 1458.2674253200569,
"eval_loss": 2.6213462352752686,
"eval_runtime": 8.8844,
"eval_samples_per_second": 156.905,
"eval_steps_per_second": 19.698,
"step": 25000
},
{
"epoch": 1463.9573257467994,
"grad_norm": 0.1532295048236847,
"learning_rate": 3.501672240802676e-06,
"loss": 2.6245,
"step": 25100
},
{
"epoch": 1463.9573257467994,
"eval_loss": 2.6207022666931152,
"eval_runtime": 8.5372,
"eval_samples_per_second": 163.285,
"eval_steps_per_second": 20.498,
"step": 25100
},
{
"epoch": 1469.6472261735419,
"grad_norm": 0.13699106872081757,
"learning_rate": 3.491638795986622e-06,
"loss": 2.6239,
"step": 25200
},
{
"epoch": 1469.6472261735419,
"eval_loss": 2.6193158626556396,
"eval_runtime": 8.8816,
"eval_samples_per_second": 156.954,
"eval_steps_per_second": 19.704,
"step": 25200
},
{
"epoch": 1475.3371266002846,
"grad_norm": 0.14744792878627777,
"learning_rate": 3.481605351170568e-06,
"loss": 2.624,
"step": 25300
},
{
"epoch": 1475.3371266002846,
"eval_loss": 2.6224136352539062,
"eval_runtime": 8.5419,
"eval_samples_per_second": 163.196,
"eval_steps_per_second": 20.487,
"step": 25300
},
{
"epoch": 1481.027027027027,
"grad_norm": 0.1340937465429306,
"learning_rate": 3.471571906354515e-06,
"loss": 2.624,
"step": 25400
},
{
"epoch": 1481.027027027027,
"eval_loss": 2.620910882949829,
"eval_runtime": 8.551,
"eval_samples_per_second": 163.022,
"eval_steps_per_second": 20.465,
"step": 25400
},
{
"epoch": 1486.7169274537696,
"grad_norm": 0.16349473595619202,
"learning_rate": 3.4615384615384613e-06,
"loss": 2.6236,
"step": 25500
},
{
"epoch": 1486.7169274537696,
"eval_loss": 2.6188619136810303,
"eval_runtime": 8.8975,
"eval_samples_per_second": 156.673,
"eval_steps_per_second": 19.668,
"step": 25500
},
{
"epoch": 1492.406827880512,
"grad_norm": 0.16049961745738983,
"learning_rate": 3.4515050167224076e-06,
"loss": 2.6236,
"step": 25600
},
{
"epoch": 1492.406827880512,
"eval_loss": 2.6201858520507812,
"eval_runtime": 8.5265,
"eval_samples_per_second": 163.49,
"eval_steps_per_second": 20.524,
"step": 25600
},
{
"epoch": 1498.0967283072546,
"grad_norm": 0.1545686572790146,
"learning_rate": 3.4414715719063544e-06,
"loss": 2.6237,
"step": 25700
},
{
"epoch": 1498.0967283072546,
"eval_loss": 2.6238884925842285,
"eval_runtime": 8.5206,
"eval_samples_per_second": 163.603,
"eval_steps_per_second": 20.538,
"step": 25700
},
{
"epoch": 1503.786628733997,
"grad_norm": 0.11945275217294693,
"learning_rate": 3.4314381270903007e-06,
"loss": 2.6237,
"step": 25800
},
{
"epoch": 1503.786628733997,
"eval_loss": 2.617251396179199,
"eval_runtime": 8.5374,
"eval_samples_per_second": 163.282,
"eval_steps_per_second": 20.498,
"step": 25800
},
{
"epoch": 1509.4765291607396,
"grad_norm": 0.16417285799980164,
"learning_rate": 3.4214046822742475e-06,
"loss": 2.6234,
"step": 25900
},
{
"epoch": 1509.4765291607396,
"eval_loss": 2.61983060836792,
"eval_runtime": 8.8777,
"eval_samples_per_second": 157.022,
"eval_steps_per_second": 19.712,
"step": 25900
},
{
"epoch": 1515.1664295874823,
"grad_norm": 0.1562732458114624,
"learning_rate": 3.411371237458194e-06,
"loss": 2.6234,
"step": 26000
},
{
"epoch": 1515.1664295874823,
"eval_loss": 2.6201515197753906,
"eval_runtime": 8.5309,
"eval_samples_per_second": 163.407,
"eval_steps_per_second": 20.514,
"step": 26000
},
{
"epoch": 1520.8563300142248,
"grad_norm": 0.1490921974182129,
"learning_rate": 3.40133779264214e-06,
"loss": 2.6233,
"step": 26100
},
{
"epoch": 1520.8563300142248,
"eval_loss": 2.6187028884887695,
"eval_runtime": 8.5278,
"eval_samples_per_second": 163.466,
"eval_steps_per_second": 20.521,
"step": 26100
},
{
"epoch": 1526.5462304409673,
"grad_norm": 0.13493777811527252,
"learning_rate": 3.391304347826087e-06,
"loss": 2.6232,
"step": 26200
},
{
"epoch": 1526.5462304409673,
"eval_loss": 2.6198177337646484,
"eval_runtime": 8.5368,
"eval_samples_per_second": 163.293,
"eval_steps_per_second": 20.5,
"step": 26200
},
{
"epoch": 1532.2361308677098,
"grad_norm": 0.16828219592571259,
"learning_rate": 3.3812709030100333e-06,
"loss": 2.6235,
"step": 26300
},
{
"epoch": 1532.2361308677098,
"eval_loss": 2.620209217071533,
"eval_runtime": 8.8832,
"eval_samples_per_second": 156.925,
"eval_steps_per_second": 19.7,
"step": 26300
},
{
"epoch": 1537.9260312944523,
"grad_norm": 0.13606858253479004,
"learning_rate": 3.3712374581939796e-06,
"loss": 2.6236,
"step": 26400
},
{
"epoch": 1537.9260312944523,
"eval_loss": 2.620745897293091,
"eval_runtime": 8.5351,
"eval_samples_per_second": 163.325,
"eval_steps_per_second": 20.504,
"step": 26400
},
{
"epoch": 1543.6159317211948,
"grad_norm": 0.15643203258514404,
"learning_rate": 3.3612040133779264e-06,
"loss": 2.6233,
"step": 26500
},
{
"epoch": 1543.6159317211948,
"eval_loss": 2.6203880310058594,
"eval_runtime": 8.5194,
"eval_samples_per_second": 163.627,
"eval_steps_per_second": 20.541,
"step": 26500
},
{
"epoch": 1549.3058321479375,
"grad_norm": 0.15990637242794037,
"learning_rate": 3.3511705685618727e-06,
"loss": 2.6235,
"step": 26600
},
{
"epoch": 1549.3058321479375,
"eval_loss": 2.618859052658081,
"eval_runtime": 8.8671,
"eval_samples_per_second": 157.21,
"eval_steps_per_second": 19.736,
"step": 26600
},
{
"epoch": 1554.99573257468,
"grad_norm": 0.1532638967037201,
"learning_rate": 3.3411371237458195e-06,
"loss": 2.6227,
"step": 26700
},
{
"epoch": 1554.99573257468,
"eval_loss": 2.621203660964966,
"eval_runtime": 8.5393,
"eval_samples_per_second": 163.246,
"eval_steps_per_second": 20.494,
"step": 26700
},
{
"epoch": 1560.6856330014225,
"grad_norm": 0.14362338185310364,
"learning_rate": 3.331103678929766e-06,
"loss": 2.6233,
"step": 26800
},
{
"epoch": 1560.6856330014225,
"eval_loss": 2.6196324825286865,
"eval_runtime": 8.5352,
"eval_samples_per_second": 163.323,
"eval_steps_per_second": 20.503,
"step": 26800
},
{
"epoch": 1566.375533428165,
"grad_norm": 0.15064574778079987,
"learning_rate": 3.321070234113712e-06,
"loss": 2.6231,
"step": 26900
},
{
"epoch": 1566.375533428165,
"eval_loss": 2.621459722518921,
"eval_runtime": 8.8713,
"eval_samples_per_second": 157.136,
"eval_steps_per_second": 19.726,
"step": 26900
},
{
"epoch": 1572.0654338549075,
"grad_norm": 0.14329403638839722,
"learning_rate": 3.311036789297659e-06,
"loss": 2.623,
"step": 27000
},
{
"epoch": 1572.0654338549075,
"eval_loss": 2.619920253753662,
"eval_runtime": 8.5347,
"eval_samples_per_second": 163.333,
"eval_steps_per_second": 20.505,
"step": 27000
},
{
"epoch": 1577.75533428165,
"grad_norm": 0.14685587584972382,
"learning_rate": 3.3010033444816052e-06,
"loss": 2.6233,
"step": 27100
},
{
"epoch": 1577.75533428165,
"eval_loss": 2.620281934738159,
"eval_runtime": 8.5331,
"eval_samples_per_second": 163.363,
"eval_steps_per_second": 20.508,
"step": 27100
},
{
"epoch": 1583.4452347083925,
"grad_norm": 0.14042943716049194,
"learning_rate": 3.2909698996655516e-06,
"loss": 2.6227,
"step": 27200
},
{
"epoch": 1583.4452347083925,
"eval_loss": 2.6232104301452637,
"eval_runtime": 8.5205,
"eval_samples_per_second": 163.606,
"eval_steps_per_second": 20.539,
"step": 27200
},
{
"epoch": 1589.1351351351352,
"grad_norm": 0.15437842905521393,
"learning_rate": 3.2809364548494983e-06,
"loss": 2.6228,
"step": 27300
},
{
"epoch": 1589.1351351351352,
"eval_loss": 2.6217334270477295,
"eval_runtime": 8.887,
"eval_samples_per_second": 156.858,
"eval_steps_per_second": 19.692,
"step": 27300
},
{
"epoch": 1594.8250355618777,
"grad_norm": 0.13956615328788757,
"learning_rate": 3.2709030100334447e-06,
"loss": 2.6227,
"step": 27400
},
{
"epoch": 1594.8250355618777,
"eval_loss": 2.619623899459839,
"eval_runtime": 8.5391,
"eval_samples_per_second": 163.248,
"eval_steps_per_second": 20.494,
"step": 27400
},
{
"epoch": 1600.5149359886202,
"grad_norm": 0.1520717293024063,
"learning_rate": 3.260869565217391e-06,
"loss": 2.6224,
"step": 27500
},
{
"epoch": 1600.5149359886202,
"eval_loss": 2.61783766746521,
"eval_runtime": 8.8816,
"eval_samples_per_second": 156.954,
"eval_steps_per_second": 19.704,
"step": 27500
},
{
"epoch": 1606.2048364153627,
"grad_norm": 0.12460660189390182,
"learning_rate": 3.2508361204013378e-06,
"loss": 2.6228,
"step": 27600
},
{
"epoch": 1606.2048364153627,
"eval_loss": 2.6196727752685547,
"eval_runtime": 8.5446,
"eval_samples_per_second": 163.145,
"eval_steps_per_second": 20.481,
"step": 27600
},
{
"epoch": 1611.8947368421052,
"grad_norm": 0.14338594675064087,
"learning_rate": 3.240802675585284e-06,
"loss": 2.6219,
"step": 27700
},
{
"epoch": 1611.8947368421052,
"eval_loss": 2.6190030574798584,
"eval_runtime": 8.5528,
"eval_samples_per_second": 162.987,
"eval_steps_per_second": 20.461,
"step": 27700
},
{
"epoch": 1617.5846372688477,
"grad_norm": 0.1541885882616043,
"learning_rate": 3.230769230769231e-06,
"loss": 2.6223,
"step": 27800
},
{
"epoch": 1617.5846372688477,
"eval_loss": 2.6206130981445312,
"eval_runtime": 8.5355,
"eval_samples_per_second": 163.319,
"eval_steps_per_second": 20.503,
"step": 27800
},
{
"epoch": 1623.2745376955904,
"grad_norm": 0.14063502848148346,
"learning_rate": 3.2207357859531772e-06,
"loss": 2.6226,
"step": 27900
},
{
"epoch": 1623.2745376955904,
"eval_loss": 2.6189980506896973,
"eval_runtime": 8.8925,
"eval_samples_per_second": 156.76,
"eval_steps_per_second": 19.679,
"step": 27900
},
{
"epoch": 1628.964438122333,
"grad_norm": 0.1286516785621643,
"learning_rate": 3.2107023411371236e-06,
"loss": 2.6221,
"step": 28000
},
{
"epoch": 1628.964438122333,
"eval_loss": 2.620689630508423,
"eval_runtime": 8.542,
"eval_samples_per_second": 163.193,
"eval_steps_per_second": 20.487,
"step": 28000
},
{
"epoch": 1634.6543385490754,
"grad_norm": 0.1280793398618698,
"learning_rate": 3.2006688963210703e-06,
"loss": 2.6224,
"step": 28100
},
{
"epoch": 1634.6543385490754,
"eval_loss": 2.6217143535614014,
"eval_runtime": 8.8756,
"eval_samples_per_second": 157.06,
"eval_steps_per_second": 19.717,
"step": 28100
},
{
"epoch": 1640.344238975818,
"grad_norm": 0.15803121030330658,
"learning_rate": 3.1906354515050167e-06,
"loss": 2.6219,
"step": 28200
},
{
"epoch": 1640.344238975818,
"eval_loss": 2.6206395626068115,
"eval_runtime": 8.8837,
"eval_samples_per_second": 156.917,
"eval_steps_per_second": 19.699,
"step": 28200
},
{
"epoch": 1646.0341394025604,
"grad_norm": 0.1751488745212555,
"learning_rate": 3.180602006688963e-06,
"loss": 2.6224,
"step": 28300
},
{
"epoch": 1646.0341394025604,
"eval_loss": 2.6214957237243652,
"eval_runtime": 8.5404,
"eval_samples_per_second": 163.224,
"eval_steps_per_second": 20.491,
"step": 28300
},
{
"epoch": 1651.724039829303,
"grad_norm": 0.15003472566604614,
"learning_rate": 3.1705685618729098e-06,
"loss": 2.6223,
"step": 28400
},
{
"epoch": 1651.724039829303,
"eval_loss": 2.619629144668579,
"eval_runtime": 8.5381,
"eval_samples_per_second": 163.268,
"eval_steps_per_second": 20.496,
"step": 28400
},
{
"epoch": 1657.4139402560454,
"grad_norm": 0.13195043802261353,
"learning_rate": 3.160535117056856e-06,
"loss": 2.6213,
"step": 28500
},
{
"epoch": 1657.4139402560454,
"eval_loss": 2.623068332672119,
"eval_runtime": 8.5234,
"eval_samples_per_second": 163.55,
"eval_steps_per_second": 20.532,
"step": 28500
},
{
"epoch": 1663.1038406827881,
"grad_norm": 0.12435358017683029,
"learning_rate": 3.1505016722408024e-06,
"loss": 2.6218,
"step": 28600
},
{
"epoch": 1663.1038406827881,
"eval_loss": 2.6203911304473877,
"eval_runtime": 8.8568,
"eval_samples_per_second": 157.394,
"eval_steps_per_second": 19.759,
"step": 28600
},
{
"epoch": 1668.7937411095306,
"grad_norm": 0.12473925203084946,
"learning_rate": 3.140468227424749e-06,
"loss": 2.6219,
"step": 28700
},
{
"epoch": 1668.7937411095306,
"eval_loss": 2.620685338973999,
"eval_runtime": 8.5395,
"eval_samples_per_second": 163.241,
"eval_steps_per_second": 20.493,
"step": 28700
},
{
"epoch": 1674.4836415362731,
"grad_norm": 0.14964550733566284,
"learning_rate": 3.1304347826086955e-06,
"loss": 2.6216,
"step": 28800
},
{
"epoch": 1674.4836415362731,
"eval_loss": 2.619400978088379,
"eval_runtime": 8.5365,
"eval_samples_per_second": 163.298,
"eval_steps_per_second": 20.5,
"step": 28800
},
{
"epoch": 1680.1735419630156,
"grad_norm": 0.12900976836681366,
"learning_rate": 3.1204013377926423e-06,
"loss": 2.6218,
"step": 28900
},
{
"epoch": 1680.1735419630156,
"eval_loss": 2.621912717819214,
"eval_runtime": 8.5373,
"eval_samples_per_second": 163.284,
"eval_steps_per_second": 20.498,
"step": 28900
},
{
"epoch": 1685.8634423897581,
"grad_norm": 0.1679168939590454,
"learning_rate": 3.1103678929765886e-06,
"loss": 2.622,
"step": 29000
},
{
"epoch": 1685.8634423897581,
"eval_loss": 2.6172022819519043,
"eval_runtime": 8.8725,
"eval_samples_per_second": 157.114,
"eval_steps_per_second": 19.724,
"step": 29000
},
{
"epoch": 1691.5533428165006,
"grad_norm": 0.14349579811096191,
"learning_rate": 3.100334448160535e-06,
"loss": 2.6214,
"step": 29100
},
{
"epoch": 1691.5533428165006,
"eval_loss": 2.6180310249328613,
"eval_runtime": 8.5353,
"eval_samples_per_second": 163.321,
"eval_steps_per_second": 20.503,
"step": 29100
},
{
"epoch": 1697.2432432432433,
"grad_norm": 0.11367882043123245,
"learning_rate": 3.0903010033444818e-06,
"loss": 2.6216,
"step": 29200
},
{
"epoch": 1697.2432432432433,
"eval_loss": 2.6190216541290283,
"eval_runtime": 8.874,
"eval_samples_per_second": 157.088,
"eval_steps_per_second": 19.72,
"step": 29200
},
{
"epoch": 1702.9331436699858,
"grad_norm": 0.1360355168581009,
"learning_rate": 3.080267558528428e-06,
"loss": 2.6209,
"step": 29300
},
{
"epoch": 1702.9331436699858,
"eval_loss": 2.618488311767578,
"eval_runtime": 8.544,
"eval_samples_per_second": 163.156,
"eval_steps_per_second": 20.482,
"step": 29300
},
{
"epoch": 1708.6230440967283,
"grad_norm": 0.15486325323581696,
"learning_rate": 3.0702341137123744e-06,
"loss": 2.6213,
"step": 29400
},
{
"epoch": 1708.6230440967283,
"eval_loss": 2.6200103759765625,
"eval_runtime": 8.88,
"eval_samples_per_second": 156.982,
"eval_steps_per_second": 19.707,
"step": 29400
},
{
"epoch": 1714.3129445234708,
"grad_norm": 0.16179534792900085,
"learning_rate": 3.060200668896321e-06,
"loss": 2.6216,
"step": 29500
},
{
"epoch": 1714.3129445234708,
"eval_loss": 2.619476795196533,
"eval_runtime": 8.5238,
"eval_samples_per_second": 163.542,
"eval_steps_per_second": 20.531,
"step": 29500
},
{
"epoch": 1720.0028449502133,
"grad_norm": 0.12888365983963013,
"learning_rate": 3.0501672240802675e-06,
"loss": 2.621,
"step": 29600
},
{
"epoch": 1720.0028449502133,
"eval_loss": 2.6209278106689453,
"eval_runtime": 8.8646,
"eval_samples_per_second": 157.255,
"eval_steps_per_second": 19.742,
"step": 29600
},
{
"epoch": 1725.6927453769558,
"grad_norm": 0.1323317587375641,
"learning_rate": 3.0401337792642143e-06,
"loss": 2.6217,
"step": 29700
},
{
"epoch": 1725.6927453769558,
"eval_loss": 2.6187312602996826,
"eval_runtime": 8.5424,
"eval_samples_per_second": 163.186,
"eval_steps_per_second": 20.486,
"step": 29700
},
{
"epoch": 1731.3826458036983,
"grad_norm": 0.13297787308692932,
"learning_rate": 3.0301003344481606e-06,
"loss": 2.6212,
"step": 29800
},
{
"epoch": 1731.3826458036983,
"eval_loss": 2.6216437816619873,
"eval_runtime": 8.5329,
"eval_samples_per_second": 163.368,
"eval_steps_per_second": 20.509,
"step": 29800
},
{
"epoch": 1737.072546230441,
"grad_norm": 0.11761217564344406,
"learning_rate": 3.020066889632107e-06,
"loss": 2.6211,
"step": 29900
},
{
"epoch": 1737.072546230441,
"eval_loss": 2.621067523956299,
"eval_runtime": 8.8814,
"eval_samples_per_second": 156.957,
"eval_steps_per_second": 19.704,
"step": 29900
},
{
"epoch": 1742.7624466571835,
"grad_norm": 0.13174152374267578,
"learning_rate": 3.0100334448160537e-06,
"loss": 2.621,
"step": 30000
},
{
"epoch": 1742.7624466571835,
"eval_loss": 2.619697093963623,
"eval_runtime": 8.5326,
"eval_samples_per_second": 163.373,
"eval_steps_per_second": 20.509,
"step": 30000
},
{
"epoch": 1748.452347083926,
"grad_norm": 0.13943453133106232,
"learning_rate": 3e-06,
"loss": 2.6208,
"step": 30100
},
{
"epoch": 1748.452347083926,
"eval_loss": 2.6210110187530518,
"eval_runtime": 8.5178,
"eval_samples_per_second": 163.658,
"eval_steps_per_second": 20.545,
"step": 30100
},
{
"epoch": 1754.1422475106685,
"grad_norm": 0.13520394265651703,
"learning_rate": 2.9899665551839464e-06,
"loss": 2.6213,
"step": 30200
},
{
"epoch": 1754.1422475106685,
"eval_loss": 2.616352081298828,
"eval_runtime": 8.8621,
"eval_samples_per_second": 157.3,
"eval_steps_per_second": 19.747,
"step": 30200
},
{
"epoch": 1759.832147937411,
"grad_norm": 0.1447754055261612,
"learning_rate": 2.979933110367893e-06,
"loss": 2.6212,
"step": 30300
},
{
"epoch": 1759.832147937411,
"eval_loss": 2.6177382469177246,
"eval_runtime": 8.5425,
"eval_samples_per_second": 163.184,
"eval_steps_per_second": 20.486,
"step": 30300
},
{
"epoch": 1765.5220483641535,
"grad_norm": 0.1305381804704666,
"learning_rate": 2.9698996655518395e-06,
"loss": 2.6207,
"step": 30400
},
{
"epoch": 1765.5220483641535,
"eval_loss": 2.6181886196136475,
"eval_runtime": 8.5323,
"eval_samples_per_second": 163.379,
"eval_steps_per_second": 20.51,
"step": 30400
},
{
"epoch": 1771.2119487908963,
"grad_norm": 0.13752570748329163,
"learning_rate": 2.959866220735786e-06,
"loss": 2.6211,
"step": 30500
},
{
"epoch": 1771.2119487908963,
"eval_loss": 2.6209466457366943,
"eval_runtime": 8.8636,
"eval_samples_per_second": 157.273,
"eval_steps_per_second": 19.744,
"step": 30500
},
{
"epoch": 1776.9018492176388,
"grad_norm": 0.15597382187843323,
"learning_rate": 2.9498327759197326e-06,
"loss": 2.6209,
"step": 30600
},
{
"epoch": 1776.9018492176388,
"eval_loss": 2.6217684745788574,
"eval_runtime": 8.535,
"eval_samples_per_second": 163.328,
"eval_steps_per_second": 20.504,
"step": 30600
},
{
"epoch": 1782.5917496443813,
"grad_norm": 0.13857756555080414,
"learning_rate": 2.939799331103679e-06,
"loss": 2.6203,
"step": 30700
},
{
"epoch": 1782.5917496443813,
"eval_loss": 2.6178483963012695,
"eval_runtime": 8.5228,
"eval_samples_per_second": 163.56,
"eval_steps_per_second": 20.533,
"step": 30700
},
{
"epoch": 1788.2816500711237,
"grad_norm": 0.12845158576965332,
"learning_rate": 2.9297658862876257e-06,
"loss": 2.6207,
"step": 30800
},
{
"epoch": 1788.2816500711237,
"eval_loss": 2.615445137023926,
"eval_runtime": 7.9709,
"eval_samples_per_second": 174.886,
"eval_steps_per_second": 21.955,
"step": 30800
},
{
"epoch": 1793.9715504978662,
"grad_norm": 0.12672263383865356,
"learning_rate": 2.919732441471572e-06,
"loss": 2.6207,
"step": 30900
},
{
"epoch": 1793.9715504978662,
"eval_loss": 2.621990919113159,
"eval_runtime": 8.6765,
"eval_samples_per_second": 160.663,
"eval_steps_per_second": 20.169,
"step": 30900
},
{
"epoch": 1799.6614509246087,
"grad_norm": 0.15212363004684448,
"learning_rate": 2.9096989966555184e-06,
"loss": 2.6205,
"step": 31000
},
{
"epoch": 1799.6614509246087,
"eval_loss": 2.617125988006592,
"eval_runtime": 8.5348,
"eval_samples_per_second": 163.33,
"eval_steps_per_second": 20.504,
"step": 31000
},
{
"epoch": 1805.3513513513512,
"grad_norm": 0.14816269278526306,
"learning_rate": 2.899665551839465e-06,
"loss": 2.6204,
"step": 31100
},
{
"epoch": 1805.3513513513512,
"eval_loss": 2.619084358215332,
"eval_runtime": 8.5316,
"eval_samples_per_second": 163.393,
"eval_steps_per_second": 20.512,
"step": 31100
},
{
"epoch": 1811.041251778094,
"grad_norm": 0.12133249640464783,
"learning_rate": 2.8896321070234115e-06,
"loss": 2.6202,
"step": 31200
},
{
"epoch": 1811.041251778094,
"eval_loss": 2.6200191974639893,
"eval_runtime": 8.8693,
"eval_samples_per_second": 157.171,
"eval_steps_per_second": 19.731,
"step": 31200
},
{
"epoch": 1816.7311522048365,
"grad_norm": 0.122464619576931,
"learning_rate": 2.879598662207358e-06,
"loss": 2.6206,
"step": 31300
},
{
"epoch": 1816.7311522048365,
"eval_loss": 2.6166939735412598,
"eval_runtime": 8.5416,
"eval_samples_per_second": 163.202,
"eval_steps_per_second": 20.488,
"step": 31300
},
{
"epoch": 1822.421052631579,
"grad_norm": 0.12631458044052124,
"learning_rate": 2.8695652173913046e-06,
"loss": 2.6207,
"step": 31400
},
{
"epoch": 1822.421052631579,
"eval_loss": 2.619025468826294,
"eval_runtime": 8.8769,
"eval_samples_per_second": 157.038,
"eval_steps_per_second": 19.714,
"step": 31400
},
{
"epoch": 1828.1109530583215,
"grad_norm": 0.13460245728492737,
"learning_rate": 2.859531772575251e-06,
"loss": 2.6205,
"step": 31500
},
{
"epoch": 1828.1109530583215,
"eval_loss": 2.6193785667419434,
"eval_runtime": 8.5355,
"eval_samples_per_second": 163.318,
"eval_steps_per_second": 20.503,
"step": 31500
},
{
"epoch": 1833.800853485064,
"grad_norm": 0.1309368908405304,
"learning_rate": 2.8494983277591977e-06,
"loss": 2.6202,
"step": 31600
},
{
"epoch": 1833.800853485064,
"eval_loss": 2.6178736686706543,
"eval_runtime": 8.5249,
"eval_samples_per_second": 163.52,
"eval_steps_per_second": 20.528,
"step": 31600
},
{
"epoch": 1839.4907539118065,
"grad_norm": 0.13755999505519867,
"learning_rate": 2.839464882943144e-06,
"loss": 2.6202,
"step": 31700
},
{
"epoch": 1839.4907539118065,
"eval_loss": 2.6197922229766846,
"eval_runtime": 8.8802,
"eval_samples_per_second": 156.978,
"eval_steps_per_second": 19.707,
"step": 31700
},
{
"epoch": 1845.1806543385492,
"grad_norm": 0.15736857056617737,
"learning_rate": 2.8294314381270904e-06,
"loss": 2.6205,
"step": 31800
},
{
"epoch": 1845.1806543385492,
"eval_loss": 2.617283344268799,
"eval_runtime": 8.5421,
"eval_samples_per_second": 163.192,
"eval_steps_per_second": 20.487,
"step": 31800
},
{
"epoch": 1850.8705547652917,
"grad_norm": 0.13804545998573303,
"learning_rate": 2.819397993311037e-06,
"loss": 2.6202,
"step": 31900
},
{
"epoch": 1850.8705547652917,
"eval_loss": 2.619748592376709,
"eval_runtime": 8.5172,
"eval_samples_per_second": 163.669,
"eval_steps_per_second": 20.547,
"step": 31900
},
{
"epoch": 1856.5604551920342,
"grad_norm": 0.14598102867603302,
"learning_rate": 2.8093645484949835e-06,
"loss": 2.6198,
"step": 32000
},
{
"epoch": 1856.5604551920342,
"eval_loss": 2.6171820163726807,
"eval_runtime": 8.5287,
"eval_samples_per_second": 163.449,
"eval_steps_per_second": 20.519,
"step": 32000
},
{
"epoch": 1862.2503556187767,
"grad_norm": 0.1180824562907219,
"learning_rate": 2.79933110367893e-06,
"loss": 2.62,
"step": 32100
},
{
"epoch": 1862.2503556187767,
"eval_loss": 2.6209444999694824,
"eval_runtime": 8.8778,
"eval_samples_per_second": 157.021,
"eval_steps_per_second": 19.712,
"step": 32100
},
{
"epoch": 1867.9402560455192,
"grad_norm": 0.13339059054851532,
"learning_rate": 2.7892976588628766e-06,
"loss": 2.6199,
"step": 32200
},
{
"epoch": 1867.9402560455192,
"eval_loss": 2.6180646419525146,
"eval_runtime": 8.5333,
"eval_samples_per_second": 163.36,
"eval_steps_per_second": 20.508,
"step": 32200
},
{
"epoch": 1873.6301564722617,
"grad_norm": 0.13802410662174225,
"learning_rate": 2.779264214046823e-06,
"loss": 2.6199,
"step": 32300
},
{
"epoch": 1873.6301564722617,
"eval_loss": 2.6181840896606445,
"eval_runtime": 8.5319,
"eval_samples_per_second": 163.386,
"eval_steps_per_second": 20.511,
"step": 32300
},
{
"epoch": 1879.3200568990042,
"grad_norm": 0.1414729207754135,
"learning_rate": 2.7692307692307693e-06,
"loss": 2.6204,
"step": 32400
},
{
"epoch": 1879.3200568990042,
"eval_loss": 2.6203253269195557,
"eval_runtime": 8.8692,
"eval_samples_per_second": 157.174,
"eval_steps_per_second": 19.731,
"step": 32400
},
{
"epoch": 1885.0099573257469,
"grad_norm": 0.14050759375095367,
"learning_rate": 2.759197324414716e-06,
"loss": 2.6198,
"step": 32500
},
{
"epoch": 1885.0099573257469,
"eval_loss": 2.6230709552764893,
"eval_runtime": 8.5278,
"eval_samples_per_second": 163.466,
"eval_steps_per_second": 20.521,
"step": 32500
},
{
"epoch": 1890.6998577524894,
"grad_norm": 0.12877824902534485,
"learning_rate": 2.749163879598662e-06,
"loss": 2.6196,
"step": 32600
},
{
"epoch": 1890.6998577524894,
"eval_loss": 2.617913246154785,
"eval_runtime": 8.8665,
"eval_samples_per_second": 157.221,
"eval_steps_per_second": 19.737,
"step": 32600
},
{
"epoch": 1896.3897581792319,
"grad_norm": 0.14117339253425598,
"learning_rate": 2.7391304347826087e-06,
"loss": 2.6201,
"step": 32700
},
{
"epoch": 1896.3897581792319,
"eval_loss": 2.6172854900360107,
"eval_runtime": 8.5203,
"eval_samples_per_second": 163.609,
"eval_steps_per_second": 20.539,
"step": 32700
},
{
"epoch": 1902.0796586059744,
"grad_norm": 0.13245785236358643,
"learning_rate": 2.729096989966555e-06,
"loss": 2.6195,
"step": 32800
},
{
"epoch": 1902.0796586059744,
"eval_loss": 2.618534564971924,
"eval_runtime": 8.5363,
"eval_samples_per_second": 163.303,
"eval_steps_per_second": 20.501,
"step": 32800
},
{
"epoch": 1907.7695590327169,
"grad_norm": 0.1612655520439148,
"learning_rate": 2.7190635451505014e-06,
"loss": 2.6196,
"step": 32900
},
{
"epoch": 1907.7695590327169,
"eval_loss": 2.619488477706909,
"eval_runtime": 8.5526,
"eval_samples_per_second": 162.992,
"eval_steps_per_second": 20.462,
"step": 32900
},
{
"epoch": 1913.4594594594594,
"grad_norm": 0.12668026983737946,
"learning_rate": 2.709030100334448e-06,
"loss": 2.6193,
"step": 33000
},
{
"epoch": 1913.4594594594594,
"eval_loss": 2.617690086364746,
"eval_runtime": 8.8781,
"eval_samples_per_second": 157.015,
"eval_steps_per_second": 19.711,
"step": 33000
},
{
"epoch": 1919.149359886202,
"grad_norm": 0.1749388575553894,
"learning_rate": 2.6989966555183945e-06,
"loss": 2.6198,
"step": 33100
},
{
"epoch": 1919.149359886202,
"eval_loss": 2.6185007095336914,
"eval_runtime": 8.5203,
"eval_samples_per_second": 163.61,
"eval_steps_per_second": 20.539,
"step": 33100
},
{
"epoch": 1924.8392603129446,
"grad_norm": 0.14269417524337769,
"learning_rate": 2.6889632107023413e-06,
"loss": 2.6195,
"step": 33200
},
{
"epoch": 1924.8392603129446,
"eval_loss": 2.6211161613464355,
"eval_runtime": 8.869,
"eval_samples_per_second": 157.176,
"eval_steps_per_second": 19.732,
"step": 33200
},
{
"epoch": 1958.6002844950212,
"grad_norm": 0.13085490465164185,
"learning_rate": 2.6789297658862876e-06,
"loss": 2.619,
"step": 33300
},
{
"epoch": 1958.6002844950212,
"eval_loss": 2.6180379390716553,
"eval_runtime": 9.0762,
"eval_samples_per_second": 153.589,
"eval_steps_per_second": 19.281,
"step": 33300
},
{
"epoch": 1964.290184921764,
"grad_norm": 0.14272978901863098,
"learning_rate": 2.668896321070234e-06,
"loss": 2.6189,
"step": 33400
},
{
"epoch": 1964.290184921764,
"eval_loss": 2.617365598678589,
"eval_runtime": 8.6925,
"eval_samples_per_second": 160.369,
"eval_steps_per_second": 20.132,
"step": 33400
},
{
"epoch": 1969.9800853485065,
"grad_norm": 0.1391880363225937,
"learning_rate": 2.6588628762541807e-06,
"loss": 2.6194,
"step": 33500
},
{
"epoch": 1969.9800853485065,
"eval_loss": 2.620480537414551,
"eval_runtime": 8.6935,
"eval_samples_per_second": 160.349,
"eval_steps_per_second": 20.13,
"step": 33500
},
{
"epoch": 1975.669985775249,
"grad_norm": 0.13617493212223053,
"learning_rate": 2.648829431438127e-06,
"loss": 2.619,
"step": 33600
},
{
"epoch": 1975.669985775249,
"eval_loss": 2.6200404167175293,
"eval_runtime": 8.5474,
"eval_samples_per_second": 163.091,
"eval_steps_per_second": 20.474,
"step": 33600
},
{
"epoch": 1981.3598862019915,
"grad_norm": 0.14002011716365814,
"learning_rate": 2.6387959866220734e-06,
"loss": 2.6193,
"step": 33700
},
{
"epoch": 1981.3598862019915,
"eval_loss": 2.619243621826172,
"eval_runtime": 8.852,
"eval_samples_per_second": 157.478,
"eval_steps_per_second": 19.77,
"step": 33700
},
{
"epoch": 1987.049786628734,
"grad_norm": 0.12899306416511536,
"learning_rate": 2.62876254180602e-06,
"loss": 2.6187,
"step": 33800
},
{
"epoch": 1987.049786628734,
"eval_loss": 2.6176578998565674,
"eval_runtime": 8.5349,
"eval_samples_per_second": 163.329,
"eval_steps_per_second": 20.504,
"step": 33800
},
{
"epoch": 1992.7396870554765,
"grad_norm": 0.13901114463806152,
"learning_rate": 2.6187290969899665e-06,
"loss": 2.619,
"step": 33900
},
{
"epoch": 1992.7396870554765,
"eval_loss": 2.6168863773345947,
"eval_runtime": 8.5326,
"eval_samples_per_second": 163.373,
"eval_steps_per_second": 20.509,
"step": 33900
},
{
"epoch": 1998.429587482219,
"grad_norm": 0.15283076465129852,
"learning_rate": 2.6086956521739132e-06,
"loss": 2.6185,
"step": 34000
},
{
"epoch": 1998.429587482219,
"eval_loss": 2.6182827949523926,
"eval_runtime": 8.8501,
"eval_samples_per_second": 157.512,
"eval_steps_per_second": 19.774,
"step": 34000
},
{
"epoch": 2004.1194879089617,
"grad_norm": 0.1270897537469864,
"learning_rate": 2.5986622073578596e-06,
"loss": 2.6191,
"step": 34100
},
{
"epoch": 2004.1194879089617,
"eval_loss": 2.616523265838623,
"eval_runtime": 8.545,
"eval_samples_per_second": 163.136,
"eval_steps_per_second": 20.48,
"step": 34100
},
{
"epoch": 2009.8093883357042,
"grad_norm": 0.11230363696813583,
"learning_rate": 2.588628762541806e-06,
"loss": 2.6187,
"step": 34200
},
{
"epoch": 2009.8093883357042,
"eval_loss": 2.619399309158325,
"eval_runtime": 8.8553,
"eval_samples_per_second": 157.42,
"eval_steps_per_second": 19.762,
"step": 34200
},
{
"epoch": 2015.4992887624467,
"grad_norm": 0.14034995436668396,
"learning_rate": 2.5785953177257527e-06,
"loss": 2.6187,
"step": 34300
},
{
"epoch": 2015.4992887624467,
"eval_loss": 2.6191158294677734,
"eval_runtime": 8.5383,
"eval_samples_per_second": 163.263,
"eval_steps_per_second": 20.496,
"step": 34300
},
{
"epoch": 2021.1891891891892,
"grad_norm": 0.14701803028583527,
"learning_rate": 2.568561872909699e-06,
"loss": 2.6183,
"step": 34400
},
{
"epoch": 2021.1891891891892,
"eval_loss": 2.620706796646118,
"eval_runtime": 8.5271,
"eval_samples_per_second": 163.479,
"eval_steps_per_second": 20.523,
"step": 34400
},
{
"epoch": 2026.8790896159317,
"grad_norm": 0.15207096934318542,
"learning_rate": 2.5585284280936454e-06,
"loss": 2.6189,
"step": 34500
},
{
"epoch": 2026.8790896159317,
"eval_loss": 2.619361400604248,
"eval_runtime": 8.5356,
"eval_samples_per_second": 163.316,
"eval_steps_per_second": 20.502,
"step": 34500
},
{
"epoch": 2032.5689900426742,
"grad_norm": 0.1416121870279312,
"learning_rate": 2.548494983277592e-06,
"loss": 2.6182,
"step": 34600
},
{
"epoch": 2032.5689900426742,
"eval_loss": 2.6184678077697754,
"eval_runtime": 8.8549,
"eval_samples_per_second": 157.428,
"eval_steps_per_second": 19.763,
"step": 34600
},
{
"epoch": 2038.2588904694169,
"grad_norm": 0.1294640153646469,
"learning_rate": 2.5384615384615385e-06,
"loss": 2.6185,
"step": 34700
},
{
"epoch": 2038.2588904694169,
"eval_loss": 2.618467330932617,
"eval_runtime": 8.5393,
"eval_samples_per_second": 163.246,
"eval_steps_per_second": 20.494,
"step": 34700
},
{
"epoch": 2043.9487908961594,
"grad_norm": 0.1140933409333229,
"learning_rate": 2.528428093645485e-06,
"loss": 2.618,
"step": 34800
},
{
"epoch": 2043.9487908961594,
"eval_loss": 2.6173062324523926,
"eval_runtime": 8.8708,
"eval_samples_per_second": 157.146,
"eval_steps_per_second": 19.728,
"step": 34800
},
{
"epoch": 2049.6386913229016,
"grad_norm": 0.15031367540359497,
"learning_rate": 2.5183946488294316e-06,
"loss": 2.6185,
"step": 34900
},
{
"epoch": 2049.6386913229016,
"eval_loss": 2.6177406311035156,
"eval_runtime": 8.5417,
"eval_samples_per_second": 163.2,
"eval_steps_per_second": 20.488,
"step": 34900
},
{
"epoch": 2055.3285917496446,
"grad_norm": 0.11632242053747177,
"learning_rate": 2.508361204013378e-06,
"loss": 2.6181,
"step": 35000
},
{
"epoch": 2055.3285917496446,
"eval_loss": 2.6203091144561768,
"eval_runtime": 8.8703,
"eval_samples_per_second": 157.154,
"eval_steps_per_second": 19.729,
"step": 35000
},
{
"epoch": 2061.018492176387,
"grad_norm": 0.14546014368534088,
"learning_rate": 2.4983277591973247e-06,
"loss": 2.6182,
"step": 35100
},
{
"epoch": 2061.018492176387,
"eval_loss": 2.6166837215423584,
"eval_runtime": 8.5363,
"eval_samples_per_second": 163.303,
"eval_steps_per_second": 20.501,
"step": 35100
},
{
"epoch": 2066.7083926031296,
"grad_norm": 0.15595249831676483,
"learning_rate": 2.488294314381271e-06,
"loss": 2.618,
"step": 35200
},
{
"epoch": 2066.7083926031296,
"eval_loss": 2.620990514755249,
"eval_runtime": 8.8529,
"eval_samples_per_second": 157.463,
"eval_steps_per_second": 19.768,
"step": 35200
},
{
"epoch": 2072.398293029872,
"grad_norm": 0.15020006895065308,
"learning_rate": 2.4782608695652173e-06,
"loss": 2.6181,
"step": 35300
},
{
"epoch": 2072.398293029872,
"eval_loss": 2.617979049682617,
"eval_runtime": 8.8415,
"eval_samples_per_second": 157.666,
"eval_steps_per_second": 19.793,
"step": 35300
},
{
"epoch": 2078.0881934566146,
"grad_norm": 0.12532344460487366,
"learning_rate": 2.468227424749164e-06,
"loss": 2.618,
"step": 35400
},
{
"epoch": 2078.0881934566146,
"eval_loss": 2.6182172298431396,
"eval_runtime": 8.8375,
"eval_samples_per_second": 157.737,
"eval_steps_per_second": 19.802,
"step": 35400
},
{
"epoch": 2083.778093883357,
"grad_norm": 0.13622809946537018,
"learning_rate": 2.4581939799331104e-06,
"loss": 2.6178,
"step": 35500
},
{
"epoch": 2083.778093883357,
"eval_loss": 2.6147806644439697,
"eval_runtime": 8.8411,
"eval_samples_per_second": 157.674,
"eval_steps_per_second": 19.794,
"step": 35500
},
{
"epoch": 2089.4679943100996,
"grad_norm": 0.11295317858457565,
"learning_rate": 2.4481605351170568e-06,
"loss": 2.6185,
"step": 35600
},
{
"epoch": 2089.4679943100996,
"eval_loss": 2.617539167404175,
"eval_runtime": 8.8516,
"eval_samples_per_second": 157.485,
"eval_steps_per_second": 19.77,
"step": 35600
},
{
"epoch": 2095.157894736842,
"grad_norm": 0.12703397870063782,
"learning_rate": 2.4381270903010035e-06,
"loss": 2.6178,
"step": 35700
},
{
"epoch": 2095.157894736842,
"eval_loss": 2.6192715167999268,
"eval_runtime": 8.8438,
"eval_samples_per_second": 157.624,
"eval_steps_per_second": 19.788,
"step": 35700
},
{
"epoch": 2100.8477951635846,
"grad_norm": 0.13047580420970917,
"learning_rate": 2.42809364548495e-06,
"loss": 2.6175,
"step": 35800
},
{
"epoch": 2100.8477951635846,
"eval_loss": 2.6149110794067383,
"eval_runtime": 8.5422,
"eval_samples_per_second": 163.189,
"eval_steps_per_second": 20.486,
"step": 35800
},
{
"epoch": 2106.537695590327,
"grad_norm": 0.1494310200214386,
"learning_rate": 2.4180602006688962e-06,
"loss": 2.6183,
"step": 35900
},
{
"epoch": 2106.537695590327,
"eval_loss": 2.617572069168091,
"eval_runtime": 8.5269,
"eval_samples_per_second": 163.483,
"eval_steps_per_second": 20.523,
"step": 35900
},
{
"epoch": 2112.2275960170696,
"grad_norm": 0.14913226664066315,
"learning_rate": 2.408026755852843e-06,
"loss": 2.6175,
"step": 36000
},
{
"epoch": 2112.2275960170696,
"eval_loss": 2.6157870292663574,
"eval_runtime": 8.5322,
"eval_samples_per_second": 163.381,
"eval_steps_per_second": 20.511,
"step": 36000
},
{
"epoch": 2117.917496443812,
"grad_norm": 0.12804996967315674,
"learning_rate": 2.3979933110367893e-06,
"loss": 2.6175,
"step": 36100
},
{
"epoch": 2117.917496443812,
"eval_loss": 2.6161787509918213,
"eval_runtime": 8.8464,
"eval_samples_per_second": 157.578,
"eval_steps_per_second": 19.782,
"step": 36100
},
{
"epoch": 2123.6073968705546,
"grad_norm": 0.1311938613653183,
"learning_rate": 2.387959866220736e-06,
"loss": 2.6177,
"step": 36200
},
{
"epoch": 2123.6073968705546,
"eval_loss": 2.6184916496276855,
"eval_runtime": 8.5276,
"eval_samples_per_second": 163.47,
"eval_steps_per_second": 20.522,
"step": 36200
},
{
"epoch": 2129.2972972972975,
"grad_norm": 0.14833857119083405,
"learning_rate": 2.3779264214046824e-06,
"loss": 2.618,
"step": 36300
},
{
"epoch": 2129.2972972972975,
"eval_loss": 2.616685628890991,
"eval_runtime": 8.5313,
"eval_samples_per_second": 163.399,
"eval_steps_per_second": 20.513,
"step": 36300
},
{
"epoch": 2134.98719772404,
"grad_norm": 0.14459851384162903,
"learning_rate": 2.3678929765886288e-06,
"loss": 2.6173,
"step": 36400
},
{
"epoch": 2134.98719772404,
"eval_loss": 2.6192727088928223,
"eval_runtime": 8.5314,
"eval_samples_per_second": 163.397,
"eval_steps_per_second": 20.513,
"step": 36400
},
{
"epoch": 2140.6770981507825,
"grad_norm": 0.12654992938041687,
"learning_rate": 2.3578595317725755e-06,
"loss": 2.6174,
"step": 36500
},
{
"epoch": 2140.6770981507825,
"eval_loss": 2.614757537841797,
"eval_runtime": 8.8498,
"eval_samples_per_second": 157.517,
"eval_steps_per_second": 19.774,
"step": 36500
},
{
"epoch": 2146.366998577525,
"grad_norm": 0.16258764266967773,
"learning_rate": 2.347826086956522e-06,
"loss": 2.618,
"step": 36600
},
{
"epoch": 2146.366998577525,
"eval_loss": 2.61818528175354,
"eval_runtime": 8.5339,
"eval_samples_per_second": 163.349,
"eval_steps_per_second": 20.507,
"step": 36600
},
{
"epoch": 2152.0568990042675,
"grad_norm": 0.1515471637248993,
"learning_rate": 2.337792642140468e-06,
"loss": 2.6177,
"step": 36700
},
{
"epoch": 2152.0568990042675,
"eval_loss": 2.6178441047668457,
"eval_runtime": 8.8414,
"eval_samples_per_second": 157.668,
"eval_steps_per_second": 19.793,
"step": 36700
},
{
"epoch": 2157.74679943101,
"grad_norm": 0.1283411979675293,
"learning_rate": 2.327759197324415e-06,
"loss": 2.6173,
"step": 36800
},
{
"epoch": 2157.74679943101,
"eval_loss": 2.6143412590026855,
"eval_runtime": 8.5345,
"eval_samples_per_second": 163.338,
"eval_steps_per_second": 20.505,
"step": 36800
},
{
"epoch": 2163.4366998577525,
"grad_norm": 0.13093768060207367,
"learning_rate": 2.3177257525083613e-06,
"loss": 2.6175,
"step": 36900
},
{
"epoch": 2163.4366998577525,
"eval_loss": 2.6168148517608643,
"eval_runtime": 8.5431,
"eval_samples_per_second": 163.172,
"eval_steps_per_second": 20.484,
"step": 36900
},
{
"epoch": 2169.126600284495,
"grad_norm": 0.12476625293493271,
"learning_rate": 2.307692307692308e-06,
"loss": 2.6174,
"step": 37000
},
{
"epoch": 2169.126600284495,
"eval_loss": 2.6173489093780518,
"eval_runtime": 8.8493,
"eval_samples_per_second": 157.526,
"eval_steps_per_second": 19.776,
"step": 37000
},
{
"epoch": 2174.8165007112375,
"grad_norm": 0.11948033422231674,
"learning_rate": 2.2976588628762544e-06,
"loss": 2.617,
"step": 37100
},
{
"epoch": 2174.8165007112375,
"eval_loss": 2.616060972213745,
"eval_runtime": 8.8543,
"eval_samples_per_second": 157.438,
"eval_steps_per_second": 19.764,
"step": 37100
},
{
"epoch": 2180.50640113798,
"grad_norm": 0.12949152290821075,
"learning_rate": 2.2876254180602008e-06,
"loss": 2.6175,
"step": 37200
},
{
"epoch": 2180.50640113798,
"eval_loss": 2.619767427444458,
"eval_runtime": 8.5433,
"eval_samples_per_second": 163.169,
"eval_steps_per_second": 20.484,
"step": 37200
},
{
"epoch": 2186.1963015647225,
"grad_norm": 0.14393049478530884,
"learning_rate": 2.2775919732441475e-06,
"loss": 2.6173,
"step": 37300
},
{
"epoch": 2186.1963015647225,
"eval_loss": 2.614513635635376,
"eval_runtime": 8.5464,
"eval_samples_per_second": 163.11,
"eval_steps_per_second": 20.477,
"step": 37300
},
{
"epoch": 2191.886201991465,
"grad_norm": 0.12848299741744995,
"learning_rate": 2.267558528428094e-06,
"loss": 2.6171,
"step": 37400
},
{
"epoch": 2191.886201991465,
"eval_loss": 2.615206241607666,
"eval_runtime": 8.8388,
"eval_samples_per_second": 157.714,
"eval_steps_per_second": 19.799,
"step": 37400
},
{
"epoch": 2197.5761024182075,
"grad_norm": 0.13800281286239624,
"learning_rate": 2.25752508361204e-06,
"loss": 2.6171,
"step": 37500
},
{
"epoch": 2197.5761024182075,
"eval_loss": 2.616520404815674,
"eval_runtime": 8.5307,
"eval_samples_per_second": 163.41,
"eval_steps_per_second": 20.514,
"step": 37500
},
{
"epoch": 2203.2660028449504,
"grad_norm": 0.1414160281419754,
"learning_rate": 2.2474916387959865e-06,
"loss": 2.617,
"step": 37600
},
{
"epoch": 2203.2660028449504,
"eval_loss": 2.617866039276123,
"eval_runtime": 8.8509,
"eval_samples_per_second": 157.499,
"eval_steps_per_second": 19.772,
"step": 37600
},
{
"epoch": 2208.955903271693,
"grad_norm": 0.129195898771286,
"learning_rate": 2.237458193979933e-06,
"loss": 2.617,
"step": 37700
},
{
"epoch": 2208.955903271693,
"eval_loss": 2.616370677947998,
"eval_runtime": 8.5271,
"eval_samples_per_second": 163.479,
"eval_steps_per_second": 20.523,
"step": 37700
},
{
"epoch": 2214.6458036984354,
"grad_norm": 0.12701831758022308,
"learning_rate": 2.2274247491638796e-06,
"loss": 2.6172,
"step": 37800
},
{
"epoch": 2214.6458036984354,
"eval_loss": 2.619422197341919,
"eval_runtime": 8.5388,
"eval_samples_per_second": 163.255,
"eval_steps_per_second": 20.495,
"step": 37800
},
{
"epoch": 2220.335704125178,
"grad_norm": 0.1434861570596695,
"learning_rate": 2.217391304347826e-06,
"loss": 2.6168,
"step": 37900
},
{
"epoch": 2220.335704125178,
"eval_loss": 2.6175920963287354,
"eval_runtime": 8.528,
"eval_samples_per_second": 163.461,
"eval_steps_per_second": 20.521,
"step": 37900
},
{
"epoch": 2226.0256045519204,
"grad_norm": 0.1319652646780014,
"learning_rate": 2.2073578595317723e-06,
"loss": 2.6169,
"step": 38000
},
{
"epoch": 2226.0256045519204,
"eval_loss": 2.6176187992095947,
"eval_runtime": 8.8424,
"eval_samples_per_second": 157.65,
"eval_steps_per_second": 19.791,
"step": 38000
},
{
"epoch": 2231.715504978663,
"grad_norm": 0.13358598947525024,
"learning_rate": 2.197324414715719e-06,
"loss": 2.6167,
"step": 38100
},
{
"epoch": 2231.715504978663,
"eval_loss": 2.616727828979492,
"eval_runtime": 8.5384,
"eval_samples_per_second": 163.263,
"eval_steps_per_second": 20.496,
"step": 38100
},
{
"epoch": 2237.4054054054054,
"grad_norm": 0.12551608681678772,
"learning_rate": 2.1872909698996654e-06,
"loss": 2.617,
"step": 38200
},
{
"epoch": 2237.4054054054054,
"eval_loss": 2.616206645965576,
"eval_runtime": 8.8419,
"eval_samples_per_second": 157.658,
"eval_steps_per_second": 19.792,
"step": 38200
},
{
"epoch": 2243.095305832148,
"grad_norm": 0.1412065029144287,
"learning_rate": 2.177257525083612e-06,
"loss": 2.6172,
"step": 38300
},
{
"epoch": 2243.095305832148,
"eval_loss": 2.618215799331665,
"eval_runtime": 8.54,
"eval_samples_per_second": 163.232,
"eval_steps_per_second": 20.492,
"step": 38300
},
{
"epoch": 2248.7852062588904,
"grad_norm": 0.16305094957351685,
"learning_rate": 2.1672240802675585e-06,
"loss": 2.6166,
"step": 38400
},
{
"epoch": 2248.7852062588904,
"eval_loss": 2.618960380554199,
"eval_runtime": 8.5327,
"eval_samples_per_second": 163.371,
"eval_steps_per_second": 20.509,
"step": 38400
},
{
"epoch": 2254.475106685633,
"grad_norm": 0.14737871289253235,
"learning_rate": 2.157190635451505e-06,
"loss": 2.6165,
"step": 38500
},
{
"epoch": 2254.475106685633,
"eval_loss": 2.618856906890869,
"eval_runtime": 8.8618,
"eval_samples_per_second": 157.305,
"eval_steps_per_second": 19.748,
"step": 38500
},
{
"epoch": 2260.1650071123754,
"grad_norm": 0.11627591401338577,
"learning_rate": 2.1471571906354516e-06,
"loss": 2.6169,
"step": 38600
},
{
"epoch": 2260.1650071123754,
"eval_loss": 2.6156229972839355,
"eval_runtime": 8.5295,
"eval_samples_per_second": 163.432,
"eval_steps_per_second": 20.517,
"step": 38600
},
{
"epoch": 2265.854907539118,
"grad_norm": 0.1361280232667923,
"learning_rate": 2.137123745819398e-06,
"loss": 2.6168,
"step": 38700
},
{
"epoch": 2265.854907539118,
"eval_loss": 2.6178250312805176,
"eval_runtime": 8.8561,
"eval_samples_per_second": 157.405,
"eval_steps_per_second": 19.76,
"step": 38700
},
{
"epoch": 2271.5448079658604,
"grad_norm": 0.13634426891803741,
"learning_rate": 2.1270903010033443e-06,
"loss": 2.6168,
"step": 38800
},
{
"epoch": 2271.5448079658604,
"eval_loss": 2.620987892150879,
"eval_runtime": 8.542,
"eval_samples_per_second": 163.194,
"eval_steps_per_second": 20.487,
"step": 38800
},
{
"epoch": 2277.2347083926034,
"grad_norm": 0.11851690709590912,
"learning_rate": 2.117056856187291e-06,
"loss": 2.6169,
"step": 38900
},
{
"epoch": 2277.2347083926034,
"eval_loss": 2.6175451278686523,
"eval_runtime": 8.5298,
"eval_samples_per_second": 163.428,
"eval_steps_per_second": 20.516,
"step": 38900
},
{
"epoch": 2282.924608819346,
"grad_norm": 0.15516361594200134,
"learning_rate": 2.1070234113712374e-06,
"loss": 2.6164,
"step": 39000
},
{
"epoch": 2282.924608819346,
"eval_loss": 2.6171224117279053,
"eval_runtime": 8.5317,
"eval_samples_per_second": 163.391,
"eval_steps_per_second": 20.512,
"step": 39000
},
{
"epoch": 2288.6145092460883,
"grad_norm": 0.14551801979541779,
"learning_rate": 2.0969899665551837e-06,
"loss": 2.6166,
"step": 39100
},
{
"epoch": 2288.6145092460883,
"eval_loss": 2.618269920349121,
"eval_runtime": 8.8607,
"eval_samples_per_second": 157.324,
"eval_steps_per_second": 19.75,
"step": 39100
},
{
"epoch": 2294.304409672831,
"grad_norm": 0.13568130135536194,
"learning_rate": 2.0869565217391305e-06,
"loss": 2.6162,
"step": 39200
},
{
"epoch": 2294.304409672831,
"eval_loss": 2.6169166564941406,
"eval_runtime": 8.5331,
"eval_samples_per_second": 163.363,
"eval_steps_per_second": 20.508,
"step": 39200
},
{
"epoch": 2299.9943100995733,
"grad_norm": 0.1397295743227005,
"learning_rate": 2.076923076923077e-06,
"loss": 2.6165,
"step": 39300
},
{
"epoch": 2299.9943100995733,
"eval_loss": 2.617077589035034,
"eval_runtime": 8.8589,
"eval_samples_per_second": 157.356,
"eval_steps_per_second": 19.754,
"step": 39300
},
{
"epoch": 2305.684210526316,
"grad_norm": 0.1272270530462265,
"learning_rate": 2.0668896321070236e-06,
"loss": 2.6167,
"step": 39400
},
{
"epoch": 2305.684210526316,
"eval_loss": 2.6153969764709473,
"eval_runtime": 8.5514,
"eval_samples_per_second": 163.015,
"eval_steps_per_second": 20.465,
"step": 39400
},
{
"epoch": 2311.3741109530583,
"grad_norm": 0.13360774517059326,
"learning_rate": 2.05685618729097e-06,
"loss": 2.616,
"step": 39500
},
{
"epoch": 2311.3741109530583,
"eval_loss": 2.6174352169036865,
"eval_runtime": 8.851,
"eval_samples_per_second": 157.496,
"eval_steps_per_second": 19.772,
"step": 39500
},
{
"epoch": 2317.064011379801,
"grad_norm": 0.14483892917633057,
"learning_rate": 2.0468227424749163e-06,
"loss": 2.6158,
"step": 39600
},
{
"epoch": 2317.064011379801,
"eval_loss": 2.617931842803955,
"eval_runtime": 8.5303,
"eval_samples_per_second": 163.418,
"eval_steps_per_second": 20.515,
"step": 39600
},
{
"epoch": 2322.7539118065433,
"grad_norm": 0.12557685375213623,
"learning_rate": 2.036789297658863e-06,
"loss": 2.6163,
"step": 39700
},
{
"epoch": 2322.7539118065433,
"eval_loss": 2.616457462310791,
"eval_runtime": 8.8454,
"eval_samples_per_second": 157.596,
"eval_steps_per_second": 19.784,
"step": 39700
},
{
"epoch": 2328.443812233286,
"grad_norm": 0.14481040835380554,
"learning_rate": 2.0267558528428094e-06,
"loss": 2.6161,
"step": 39800
},
{
"epoch": 2328.443812233286,
"eval_loss": 2.6148922443389893,
"eval_runtime": 8.5419,
"eval_samples_per_second": 163.195,
"eval_steps_per_second": 20.487,
"step": 39800
},
{
"epoch": 2334.1337126600283,
"grad_norm": 0.1371890753507614,
"learning_rate": 2.0167224080267557e-06,
"loss": 2.6156,
"step": 39900
},
{
"epoch": 2334.1337126600283,
"eval_loss": 2.6165366172790527,
"eval_runtime": 8.5312,
"eval_samples_per_second": 163.401,
"eval_steps_per_second": 20.513,
"step": 39900
},
{
"epoch": 2339.823613086771,
"grad_norm": 0.11908498406410217,
"learning_rate": 2.0066889632107025e-06,
"loss": 2.6161,
"step": 40000
},
{
"epoch": 2339.823613086771,
"eval_loss": 2.6168572902679443,
"eval_runtime": 8.5312,
"eval_samples_per_second": 163.4,
"eval_steps_per_second": 20.513,
"step": 40000
},
{
"epoch": 2345.5135135135133,
"grad_norm": 0.15776848793029785,
"learning_rate": 1.996655518394649e-06,
"loss": 2.6161,
"step": 40100
},
{
"epoch": 2345.5135135135133,
"eval_loss": 2.61922550201416,
"eval_runtime": 8.8387,
"eval_samples_per_second": 157.716,
"eval_steps_per_second": 19.799,
"step": 40100
},
{
"epoch": 2351.2034139402563,
"grad_norm": 0.13650420308113098,
"learning_rate": 1.986622073578595e-06,
"loss": 2.6157,
"step": 40200
},
{
"epoch": 2351.2034139402563,
"eval_loss": 2.6171460151672363,
"eval_runtime": 8.8588,
"eval_samples_per_second": 157.357,
"eval_steps_per_second": 19.754,
"step": 40200
},
{
"epoch": 2356.8933143669988,
"grad_norm": 0.14394904673099518,
"learning_rate": 1.976588628762542e-06,
"loss": 2.6156,
"step": 40300
},
{
"epoch": 2356.8933143669988,
"eval_loss": 2.617033004760742,
"eval_runtime": 8.5275,
"eval_samples_per_second": 163.472,
"eval_steps_per_second": 20.522,
"step": 40300
},
{
"epoch": 2362.5832147937413,
"grad_norm": 0.14980724453926086,
"learning_rate": 1.9665551839464883e-06,
"loss": 2.6163,
"step": 40400
},
{
"epoch": 2362.5832147937413,
"eval_loss": 2.614140510559082,
"eval_runtime": 8.5229,
"eval_samples_per_second": 163.56,
"eval_steps_per_second": 20.533,
"step": 40400
},
{
"epoch": 2368.2731152204838,
"grad_norm": 0.13233982026576996,
"learning_rate": 1.956521739130435e-06,
"loss": 2.6156,
"step": 40500
},
{
"epoch": 2368.2731152204838,
"eval_loss": 2.615586042404175,
"eval_runtime": 8.847,
"eval_samples_per_second": 157.568,
"eval_steps_per_second": 19.781,
"step": 40500
},
{
"epoch": 2373.9630156472263,
"grad_norm": 0.13586369156837463,
"learning_rate": 1.9464882943143814e-06,
"loss": 2.6159,
"step": 40600
},
{
"epoch": 2373.9630156472263,
"eval_loss": 2.61820650100708,
"eval_runtime": 8.8392,
"eval_samples_per_second": 157.707,
"eval_steps_per_second": 19.798,
"step": 40600
},
{
"epoch": 2379.6529160739688,
"grad_norm": 0.13869047164916992,
"learning_rate": 1.9364548494983277e-06,
"loss": 2.6152,
"step": 40700
},
{
"epoch": 2379.6529160739688,
"eval_loss": 2.614039897918701,
"eval_runtime": 8.5304,
"eval_samples_per_second": 163.415,
"eval_steps_per_second": 20.515,
"step": 40700
},
{
"epoch": 2385.3428165007113,
"grad_norm": 0.1269962042570114,
"learning_rate": 1.9264214046822745e-06,
"loss": 2.6152,
"step": 40800
},
{
"epoch": 2385.3428165007113,
"eval_loss": 2.614192247390747,
"eval_runtime": 8.5419,
"eval_samples_per_second": 163.195,
"eval_steps_per_second": 20.487,
"step": 40800
},
{
"epoch": 2391.0327169274537,
"grad_norm": 0.14708365499973297,
"learning_rate": 1.916387959866221e-06,
"loss": 2.6155,
"step": 40900
},
{
"epoch": 2391.0327169274537,
"eval_loss": 2.615812301635742,
"eval_runtime": 8.8721,
"eval_samples_per_second": 157.122,
"eval_steps_per_second": 19.725,
"step": 40900
},
{
"epoch": 2396.7226173541962,
"grad_norm": 0.11788502335548401,
"learning_rate": 1.9063545150501674e-06,
"loss": 2.6158,
"step": 41000
},
{
"epoch": 2396.7226173541962,
"eval_loss": 2.615337610244751,
"eval_runtime": 8.8548,
"eval_samples_per_second": 157.429,
"eval_steps_per_second": 19.763,
"step": 41000
},
{
"epoch": 2402.4125177809387,
"grad_norm": 0.14130190014839172,
"learning_rate": 1.896321070234114e-06,
"loss": 2.6153,
"step": 41100
},
{
"epoch": 2402.4125177809387,
"eval_loss": 2.6168124675750732,
"eval_runtime": 8.5314,
"eval_samples_per_second": 163.396,
"eval_steps_per_second": 20.512,
"step": 41100
},
{
"epoch": 2408.1024182076812,
"grad_norm": 0.14463502168655396,
"learning_rate": 1.8862876254180603e-06,
"loss": 2.6155,
"step": 41200
},
{
"epoch": 2408.1024182076812,
"eval_loss": 2.6179542541503906,
"eval_runtime": 8.5339,
"eval_samples_per_second": 163.348,
"eval_steps_per_second": 20.506,
"step": 41200
},
{
"epoch": 2413.7923186344237,
"grad_norm": 0.12708818912506104,
"learning_rate": 1.8762541806020068e-06,
"loss": 2.6155,
"step": 41300
},
{
"epoch": 2413.7923186344237,
"eval_loss": 2.616238832473755,
"eval_runtime": 8.5448,
"eval_samples_per_second": 163.139,
"eval_steps_per_second": 20.48,
"step": 41300
},
{
"epoch": 2419.4822190611662,
"grad_norm": 0.1303997039794922,
"learning_rate": 1.8662207357859534e-06,
"loss": 2.616,
"step": 41400
},
{
"epoch": 2419.4822190611662,
"eval_loss": 2.615610122680664,
"eval_runtime": 8.8552,
"eval_samples_per_second": 157.421,
"eval_steps_per_second": 19.762,
"step": 41400
},
{
"epoch": 2425.172119487909,
"grad_norm": 0.14887328445911407,
"learning_rate": 1.8561872909699e-06,
"loss": 2.6158,
"step": 41500
},
{
"epoch": 2425.172119487909,
"eval_loss": 2.6136879920959473,
"eval_runtime": 8.5262,
"eval_samples_per_second": 163.496,
"eval_steps_per_second": 20.525,
"step": 41500
},
{
"epoch": 2430.8620199146517,
"grad_norm": 0.12649740278720856,
"learning_rate": 1.8461538461538462e-06,
"loss": 2.6156,
"step": 41600
},
{
"epoch": 2430.8620199146517,
"eval_loss": 2.6171023845672607,
"eval_runtime": 8.8681,
"eval_samples_per_second": 157.192,
"eval_steps_per_second": 19.734,
"step": 41600
},
{
"epoch": 2436.551920341394,
"grad_norm": 0.14125467836856842,
"learning_rate": 1.8361204013377928e-06,
"loss": 2.6152,
"step": 41700
},
{
"epoch": 2436.551920341394,
"eval_loss": 2.6158018112182617,
"eval_runtime": 8.5206,
"eval_samples_per_second": 163.604,
"eval_steps_per_second": 20.538,
"step": 41700
},
{
"epoch": 2442.2418207681367,
"grad_norm": 0.12283240258693695,
"learning_rate": 1.8260869565217394e-06,
"loss": 2.6159,
"step": 41800
},
{
"epoch": 2442.2418207681367,
"eval_loss": 2.6169424057006836,
"eval_runtime": 8.5518,
"eval_samples_per_second": 163.007,
"eval_steps_per_second": 20.464,
"step": 41800
},
{
"epoch": 2447.931721194879,
"grad_norm": 0.15379033982753754,
"learning_rate": 1.8160535117056857e-06,
"loss": 2.6152,
"step": 41900
},
{
"epoch": 2447.931721194879,
"eval_loss": 2.6155290603637695,
"eval_runtime": 8.8698,
"eval_samples_per_second": 157.162,
"eval_steps_per_second": 19.73,
"step": 41900
},
{
"epoch": 2453.6216216216217,
"grad_norm": 0.15148812532424927,
"learning_rate": 1.8060200668896322e-06,
"loss": 2.6152,
"step": 42000
},
{
"epoch": 2453.6216216216217,
"eval_loss": 2.6144700050354004,
"eval_runtime": 8.5457,
"eval_samples_per_second": 163.122,
"eval_steps_per_second": 20.478,
"step": 42000
},
{
"epoch": 2459.311522048364,
"grad_norm": 0.1490088701248169,
"learning_rate": 1.7959866220735788e-06,
"loss": 2.615,
"step": 42100
},
{
"epoch": 2459.311522048364,
"eval_loss": 2.6168768405914307,
"eval_runtime": 8.5451,
"eval_samples_per_second": 163.135,
"eval_steps_per_second": 20.48,
"step": 42100
},
{
"epoch": 2465.0014224751067,
"grad_norm": 0.11491715162992477,
"learning_rate": 1.7859531772575253e-06,
"loss": 2.6157,
"step": 42200
},
{
"epoch": 2465.0014224751067,
"eval_loss": 2.6155242919921875,
"eval_runtime": 8.5538,
"eval_samples_per_second": 162.969,
"eval_steps_per_second": 20.459,
"step": 42200
},
{
"epoch": 2470.691322901849,
"grad_norm": 0.15772178769111633,
"learning_rate": 1.7759197324414717e-06,
"loss": 2.6153,
"step": 42300
},
{
"epoch": 2470.691322901849,
"eval_loss": 2.613830804824829,
"eval_runtime": 8.8766,
"eval_samples_per_second": 157.041,
"eval_steps_per_second": 19.715,
"step": 42300
},
{
"epoch": 2476.3812233285917,
"grad_norm": 0.13534432649612427,
"learning_rate": 1.7658862876254182e-06,
"loss": 2.6149,
"step": 42400
},
{
"epoch": 2476.3812233285917,
"eval_loss": 2.6143946647644043,
"eval_runtime": 8.5381,
"eval_samples_per_second": 163.268,
"eval_steps_per_second": 20.496,
"step": 42400
},
{
"epoch": 2482.071123755334,
"grad_norm": 0.11993639171123505,
"learning_rate": 1.7558528428093648e-06,
"loss": 2.6154,
"step": 42500
},
{
"epoch": 2482.071123755334,
"eval_loss": 2.6130027770996094,
"eval_runtime": 8.5295,
"eval_samples_per_second": 163.434,
"eval_steps_per_second": 20.517,
"step": 42500
},
{
"epoch": 2487.7610241820767,
"grad_norm": 0.12379685789346695,
"learning_rate": 1.745819397993311e-06,
"loss": 2.6152,
"step": 42600
},
{
"epoch": 2487.7610241820767,
"eval_loss": 2.616774082183838,
"eval_runtime": 8.854,
"eval_samples_per_second": 157.444,
"eval_steps_per_second": 19.765,
"step": 42600
},
{
"epoch": 2493.450924608819,
"grad_norm": 0.11662384122610092,
"learning_rate": 1.7357859531772575e-06,
"loss": 2.6152,
"step": 42700
},
{
"epoch": 2493.450924608819,
"eval_loss": 2.6169705390930176,
"eval_runtime": 8.5399,
"eval_samples_per_second": 163.234,
"eval_steps_per_second": 20.492,
"step": 42700
},
{
"epoch": 2499.140825035562,
"grad_norm": 0.13475127518177032,
"learning_rate": 1.7257525083612038e-06,
"loss": 2.6153,
"step": 42800
},
{
"epoch": 2499.140825035562,
"eval_loss": 2.6149654388427734,
"eval_runtime": 8.5276,
"eval_samples_per_second": 163.468,
"eval_steps_per_second": 20.522,
"step": 42800
},
{
"epoch": 2504.8307254623046,
"grad_norm": 0.12163935601711273,
"learning_rate": 1.7157190635451504e-06,
"loss": 2.6146,
"step": 42900
},
{
"epoch": 2504.8307254623046,
"eval_loss": 2.616426467895508,
"eval_runtime": 8.5341,
"eval_samples_per_second": 163.345,
"eval_steps_per_second": 20.506,
"step": 42900
},
{
"epoch": 2510.520625889047,
"grad_norm": 0.12904202938079834,
"learning_rate": 1.705685618729097e-06,
"loss": 2.615,
"step": 43000
},
{
"epoch": 2510.520625889047,
"eval_loss": 2.6142711639404297,
"eval_runtime": 8.8632,
"eval_samples_per_second": 157.279,
"eval_steps_per_second": 19.744,
"step": 43000
},
{
"epoch": 2516.2105263157896,
"grad_norm": 0.14409850537776947,
"learning_rate": 1.6956521739130435e-06,
"loss": 2.615,
"step": 43100
},
{
"epoch": 2516.2105263157896,
"eval_loss": 2.6173369884490967,
"eval_runtime": 8.517,
"eval_samples_per_second": 163.673,
"eval_steps_per_second": 20.547,
"step": 43100
},
{
"epoch": 2521.900426742532,
"grad_norm": 0.12942758202552795,
"learning_rate": 1.6856187290969898e-06,
"loss": 2.6147,
"step": 43200
},
{
"epoch": 2521.900426742532,
"eval_loss": 2.6177051067352295,
"eval_runtime": 8.8403,
"eval_samples_per_second": 157.688,
"eval_steps_per_second": 19.796,
"step": 43200
},
{
"epoch": 2527.5903271692746,
"grad_norm": 0.14761574566364288,
"learning_rate": 1.6755852842809363e-06,
"loss": 2.6143,
"step": 43300
},
{
"epoch": 2527.5903271692746,
"eval_loss": 2.6154208183288574,
"eval_runtime": 8.5203,
"eval_samples_per_second": 163.609,
"eval_steps_per_second": 20.539,
"step": 43300
},
{
"epoch": 2533.280227596017,
"grad_norm": 0.1361926943063736,
"learning_rate": 1.665551839464883e-06,
"loss": 2.615,
"step": 43400
},
{
"epoch": 2533.280227596017,
"eval_loss": 2.617976188659668,
"eval_runtime": 8.5489,
"eval_samples_per_second": 163.062,
"eval_steps_per_second": 20.47,
"step": 43400
},
{
"epoch": 2538.9701280227596,
"grad_norm": 0.1490316092967987,
"learning_rate": 1.6555183946488294e-06,
"loss": 2.6146,
"step": 43500
},
{
"epoch": 2538.9701280227596,
"eval_loss": 2.616652250289917,
"eval_runtime": 8.5376,
"eval_samples_per_second": 163.279,
"eval_steps_per_second": 20.498,
"step": 43500
},
{
"epoch": 2544.660028449502,
"grad_norm": 0.13588373363018036,
"learning_rate": 1.6454849498327758e-06,
"loss": 2.6152,
"step": 43600
},
{
"epoch": 2544.660028449502,
"eval_loss": 2.6177802085876465,
"eval_runtime": 8.8667,
"eval_samples_per_second": 157.217,
"eval_steps_per_second": 19.737,
"step": 43600
},
{
"epoch": 2550.3499288762446,
"grad_norm": 0.12654942274093628,
"learning_rate": 1.6354515050167223e-06,
"loss": 2.6146,
"step": 43700
},
{
"epoch": 2550.3499288762446,
"eval_loss": 2.615847110748291,
"eval_runtime": 8.5335,
"eval_samples_per_second": 163.356,
"eval_steps_per_second": 20.507,
"step": 43700
},
{
"epoch": 2556.039829302987,
"grad_norm": 0.15947924554347992,
"learning_rate": 1.6254180602006689e-06,
"loss": 2.6149,
"step": 43800
},
{
"epoch": 2556.039829302987,
"eval_loss": 2.613116502761841,
"eval_runtime": 8.5279,
"eval_samples_per_second": 163.464,
"eval_steps_per_second": 20.521,
"step": 43800
},
{
"epoch": 2561.7297297297296,
"grad_norm": 0.11915856599807739,
"learning_rate": 1.6153846153846154e-06,
"loss": 2.6146,
"step": 43900
},
{
"epoch": 2561.7297297297296,
"eval_loss": 2.614288568496704,
"eval_runtime": 8.8597,
"eval_samples_per_second": 157.342,
"eval_steps_per_second": 19.752,
"step": 43900
},
{
"epoch": 2567.419630156472,
"grad_norm": 0.1312067210674286,
"learning_rate": 1.6053511705685618e-06,
"loss": 2.6147,
"step": 44000
},
{
"epoch": 2567.419630156472,
"eval_loss": 2.6091578006744385,
"eval_runtime": 8.5457,
"eval_samples_per_second": 163.122,
"eval_steps_per_second": 20.478,
"step": 44000
},
{
"epoch": 2573.109530583215,
"grad_norm": 0.14233353734016418,
"learning_rate": 1.5953177257525083e-06,
"loss": 2.6148,
"step": 44100
},
{
"epoch": 2573.109530583215,
"eval_loss": 2.612126111984253,
"eval_runtime": 8.5355,
"eval_samples_per_second": 163.318,
"eval_steps_per_second": 20.503,
"step": 44100
},
{
"epoch": 2578.7994310099575,
"grad_norm": 0.1357184797525406,
"learning_rate": 1.5852842809364549e-06,
"loss": 2.6149,
"step": 44200
},
{
"epoch": 2578.7994310099575,
"eval_loss": 2.618696928024292,
"eval_runtime": 8.8741,
"eval_samples_per_second": 157.087,
"eval_steps_per_second": 19.72,
"step": 44200
},
{
"epoch": 2584.4893314367,
"grad_norm": 0.14556884765625,
"learning_rate": 1.5752508361204012e-06,
"loss": 2.6142,
"step": 44300
},
{
"epoch": 2584.4893314367,
"eval_loss": 2.616926908493042,
"eval_runtime": 8.5359,
"eval_samples_per_second": 163.311,
"eval_steps_per_second": 20.502,
"step": 44300
},
{
"epoch": 2590.1792318634425,
"grad_norm": 0.12908801436424255,
"learning_rate": 1.5652173913043478e-06,
"loss": 2.6145,
"step": 44400
},
{
"epoch": 2590.1792318634425,
"eval_loss": 2.6157069206237793,
"eval_runtime": 8.8877,
"eval_samples_per_second": 156.846,
"eval_steps_per_second": 19.69,
"step": 44400
},
{
"epoch": 2595.869132290185,
"grad_norm": 0.14168845117092133,
"learning_rate": 1.5551839464882943e-06,
"loss": 2.6146,
"step": 44500
},
{
"epoch": 2595.869132290185,
"eval_loss": 2.615161657333374,
"eval_runtime": 8.5305,
"eval_samples_per_second": 163.413,
"eval_steps_per_second": 20.515,
"step": 44500
},
{
"epoch": 2601.5590327169275,
"grad_norm": 0.13634611666202545,
"learning_rate": 1.5451505016722409e-06,
"loss": 2.6146,
"step": 44600
},
{
"epoch": 2601.5590327169275,
"eval_loss": 2.6135544776916504,
"eval_runtime": 8.54,
"eval_samples_per_second": 163.233,
"eval_steps_per_second": 20.492,
"step": 44600
},
{
"epoch": 2607.24893314367,
"grad_norm": 0.14684821665287018,
"learning_rate": 1.5351170568561872e-06,
"loss": 2.6149,
"step": 44700
},
{
"epoch": 2607.24893314367,
"eval_loss": 2.616076707839966,
"eval_runtime": 8.517,
"eval_samples_per_second": 163.672,
"eval_steps_per_second": 20.547,
"step": 44700
},
{
"epoch": 2612.9388335704125,
"grad_norm": 0.14135567843914032,
"learning_rate": 1.5250836120401338e-06,
"loss": 2.6142,
"step": 44800
},
{
"epoch": 2612.9388335704125,
"eval_loss": 2.6159145832061768,
"eval_runtime": 8.8492,
"eval_samples_per_second": 157.529,
"eval_steps_per_second": 19.776,
"step": 44800
},
{
"epoch": 2618.628733997155,
"grad_norm": 0.1256554275751114,
"learning_rate": 1.5150501672240803e-06,
"loss": 2.6142,
"step": 44900
},
{
"epoch": 2618.628733997155,
"eval_loss": 2.6102206707000732,
"eval_runtime": 8.5351,
"eval_samples_per_second": 163.325,
"eval_steps_per_second": 20.503,
"step": 44900
},
{
"epoch": 2624.3186344238975,
"grad_norm": 0.12723155319690704,
"learning_rate": 1.5050167224080269e-06,
"loss": 2.614,
"step": 45000
},
{
"epoch": 2624.3186344238975,
"eval_loss": 2.6176397800445557,
"eval_runtime": 8.5392,
"eval_samples_per_second": 163.248,
"eval_steps_per_second": 20.494,
"step": 45000
},
{
"epoch": 2630.00853485064,
"grad_norm": 0.1423732191324234,
"learning_rate": 1.4949832775919732e-06,
"loss": 2.6145,
"step": 45100
},
{
"epoch": 2630.00853485064,
"eval_loss": 2.613284111022949,
"eval_runtime": 8.5394,
"eval_samples_per_second": 163.243,
"eval_steps_per_second": 20.493,
"step": 45100
},
{
"epoch": 2635.6984352773825,
"grad_norm": 0.127468079328537,
"learning_rate": 1.4849498327759198e-06,
"loss": 2.6143,
"step": 45200
},
{
"epoch": 2635.6984352773825,
"eval_loss": 2.614154100418091,
"eval_runtime": 8.842,
"eval_samples_per_second": 157.657,
"eval_steps_per_second": 19.792,
"step": 45200
},
{
"epoch": 2641.388335704125,
"grad_norm": 0.13406263291835785,
"learning_rate": 1.4749163879598663e-06,
"loss": 2.6141,
"step": 45300
},
{
"epoch": 2641.388335704125,
"eval_loss": 2.614849328994751,
"eval_runtime": 8.53,
"eval_samples_per_second": 163.424,
"eval_steps_per_second": 20.516,
"step": 45300
},
{
"epoch": 2647.078236130868,
"grad_norm": 0.14327415823936462,
"learning_rate": 1.4648829431438129e-06,
"loss": 2.6142,
"step": 45400
},
{
"epoch": 2647.078236130868,
"eval_loss": 2.6156599521636963,
"eval_runtime": 8.5293,
"eval_samples_per_second": 163.437,
"eval_steps_per_second": 20.518,
"step": 45400
},
{
"epoch": 2652.7681365576104,
"grad_norm": 0.13055378198623657,
"learning_rate": 1.4548494983277592e-06,
"loss": 2.6141,
"step": 45500
},
{
"epoch": 2652.7681365576104,
"eval_loss": 2.6118390560150146,
"eval_runtime": 8.5382,
"eval_samples_per_second": 163.267,
"eval_steps_per_second": 20.496,
"step": 45500
},
{
"epoch": 2658.458036984353,
"grad_norm": 0.14269088208675385,
"learning_rate": 1.4448160535117058e-06,
"loss": 2.6149,
"step": 45600
},
{
"epoch": 2658.458036984353,
"eval_loss": 2.616089105606079,
"eval_runtime": 8.8625,
"eval_samples_per_second": 157.291,
"eval_steps_per_second": 19.746,
"step": 45600
},
{
"epoch": 2664.1479374110954,
"grad_norm": 0.13923226296901703,
"learning_rate": 1.4347826086956523e-06,
"loss": 2.6141,
"step": 45700
},
{
"epoch": 2664.1479374110954,
"eval_loss": 2.615753412246704,
"eval_runtime": 8.5429,
"eval_samples_per_second": 163.175,
"eval_steps_per_second": 20.485,
"step": 45700
},
{
"epoch": 2669.837837837838,
"grad_norm": 0.11520116031169891,
"learning_rate": 1.4247491638795989e-06,
"loss": 2.614,
"step": 45800
},
{
"epoch": 2669.837837837838,
"eval_loss": 2.614213228225708,
"eval_runtime": 8.8633,
"eval_samples_per_second": 157.278,
"eval_steps_per_second": 19.744,
"step": 45800
},
{
"epoch": 2675.5277382645804,
"grad_norm": 0.13826854526996613,
"learning_rate": 1.4147157190635452e-06,
"loss": 2.6141,
"step": 45900
},
{
"epoch": 2675.5277382645804,
"eval_loss": 2.6141257286071777,
"eval_runtime": 8.531,
"eval_samples_per_second": 163.403,
"eval_steps_per_second": 20.513,
"step": 45900
},
{
"epoch": 2681.217638691323,
"grad_norm": 0.1388641595840454,
"learning_rate": 1.4046822742474917e-06,
"loss": 2.614,
"step": 46000
},
{
"epoch": 2681.217638691323,
"eval_loss": 2.6166601181030273,
"eval_runtime": 8.8588,
"eval_samples_per_second": 157.357,
"eval_steps_per_second": 19.754,
"step": 46000
},
{
"epoch": 2686.9075391180654,
"grad_norm": 0.1250719428062439,
"learning_rate": 1.3946488294314383e-06,
"loss": 2.6134,
"step": 46100
},
{
"epoch": 2686.9075391180654,
"eval_loss": 2.6177427768707275,
"eval_runtime": 8.53,
"eval_samples_per_second": 163.424,
"eval_steps_per_second": 20.516,
"step": 46100
},
{
"epoch": 2692.597439544808,
"grad_norm": 0.1312686949968338,
"learning_rate": 1.3846153846153846e-06,
"loss": 2.614,
"step": 46200
},
{
"epoch": 2692.597439544808,
"eval_loss": 2.61600399017334,
"eval_runtime": 8.518,
"eval_samples_per_second": 163.653,
"eval_steps_per_second": 20.545,
"step": 46200
},
{
"epoch": 2698.2873399715504,
"grad_norm": 0.1418214589357376,
"learning_rate": 1.374581939799331e-06,
"loss": 2.6142,
"step": 46300
},
{
"epoch": 2698.2873399715504,
"eval_loss": 2.6169888973236084,
"eval_runtime": 8.5439,
"eval_samples_per_second": 163.157,
"eval_steps_per_second": 20.482,
"step": 46300
},
{
"epoch": 2703.977240398293,
"grad_norm": 0.13503268361091614,
"learning_rate": 1.3645484949832775e-06,
"loss": 2.6141,
"step": 46400
},
{
"epoch": 2703.977240398293,
"eval_loss": 2.617550849914551,
"eval_runtime": 8.8621,
"eval_samples_per_second": 157.299,
"eval_steps_per_second": 19.747,
"step": 46400
},
{
"epoch": 2709.6671408250354,
"grad_norm": 0.13151606917381287,
"learning_rate": 1.354515050167224e-06,
"loss": 2.6138,
"step": 46500
},
{
"epoch": 2709.6671408250354,
"eval_loss": 2.614605665206909,
"eval_runtime": 8.5338,
"eval_samples_per_second": 163.351,
"eval_steps_per_second": 20.507,
"step": 46500
},
{
"epoch": 2715.357041251778,
"grad_norm": 0.12771758437156677,
"learning_rate": 1.3444816053511706e-06,
"loss": 2.6141,
"step": 46600
},
{
"epoch": 2715.357041251778,
"eval_loss": 2.6184401512145996,
"eval_runtime": 8.8574,
"eval_samples_per_second": 157.383,
"eval_steps_per_second": 19.758,
"step": 46600
},
{
"epoch": 2721.046941678521,
"grad_norm": 0.13841165602207184,
"learning_rate": 1.334448160535117e-06,
"loss": 2.6138,
"step": 46700
},
{
"epoch": 2721.046941678521,
"eval_loss": 2.617668867111206,
"eval_runtime": 8.5339,
"eval_samples_per_second": 163.348,
"eval_steps_per_second": 20.506,
"step": 46700
},
{
"epoch": 2726.7368421052633,
"grad_norm": 0.12478631734848022,
"learning_rate": 1.3244147157190635e-06,
"loss": 2.6141,
"step": 46800
},
{
"epoch": 2726.7368421052633,
"eval_loss": 2.61787748336792,
"eval_runtime": 8.8804,
"eval_samples_per_second": 156.974,
"eval_steps_per_second": 19.706,
"step": 46800
},
{
"epoch": 2732.426742532006,
"grad_norm": 0.13361801207065582,
"learning_rate": 1.31438127090301e-06,
"loss": 2.6136,
"step": 46900
},
{
"epoch": 2732.426742532006,
"eval_loss": 2.6143569946289062,
"eval_runtime": 8.5467,
"eval_samples_per_second": 163.105,
"eval_steps_per_second": 20.476,
"step": 46900
},
{
"epoch": 2738.1166429587483,
"grad_norm": 0.1362065225839615,
"learning_rate": 1.3043478260869566e-06,
"loss": 2.6135,
"step": 47000
},
{
"epoch": 2738.1166429587483,
"eval_loss": 2.613162040710449,
"eval_runtime": 8.8749,
"eval_samples_per_second": 157.071,
"eval_steps_per_second": 19.718,
"step": 47000
},
{
"epoch": 2743.806543385491,
"grad_norm": 0.14401383697986603,
"learning_rate": 1.294314381270903e-06,
"loss": 2.6142,
"step": 47100
},
{
"epoch": 2743.806543385491,
"eval_loss": 2.6157407760620117,
"eval_runtime": 8.5464,
"eval_samples_per_second": 163.11,
"eval_steps_per_second": 20.476,
"step": 47100
},
{
"epoch": 2749.4964438122333,
"grad_norm": 0.14595820009708405,
"learning_rate": 1.2842809364548495e-06,
"loss": 2.6136,
"step": 47200
},
{
"epoch": 2749.4964438122333,
"eval_loss": 2.6133205890655518,
"eval_runtime": 8.8738,
"eval_samples_per_second": 157.092,
"eval_steps_per_second": 19.721,
"step": 47200
},
{
"epoch": 2755.186344238976,
"grad_norm": 0.14186260104179382,
"learning_rate": 1.274247491638796e-06,
"loss": 2.6134,
"step": 47300
},
{
"epoch": 2755.186344238976,
"eval_loss": 2.617734670639038,
"eval_runtime": 8.5319,
"eval_samples_per_second": 163.387,
"eval_steps_per_second": 20.511,
"step": 47300
},
{
"epoch": 2760.8762446657183,
"grad_norm": 0.13552911579608917,
"learning_rate": 1.2642140468227424e-06,
"loss": 2.6135,
"step": 47400
},
{
"epoch": 2760.8762446657183,
"eval_loss": 2.6164298057556152,
"eval_runtime": 8.5373,
"eval_samples_per_second": 163.284,
"eval_steps_per_second": 20.498,
"step": 47400
},
{
"epoch": 2766.566145092461,
"grad_norm": 0.12871357798576355,
"learning_rate": 1.254180602006689e-06,
"loss": 2.6138,
"step": 47500
},
{
"epoch": 2766.566145092461,
"eval_loss": 2.6128602027893066,
"eval_runtime": 8.5494,
"eval_samples_per_second": 163.052,
"eval_steps_per_second": 20.469,
"step": 47500
},
{
"epoch": 2772.2560455192033,
"grad_norm": 0.12483840435743332,
"learning_rate": 1.2441471571906355e-06,
"loss": 2.6138,
"step": 47600
},
{
"epoch": 2772.2560455192033,
"eval_loss": 2.6143155097961426,
"eval_runtime": 8.8607,
"eval_samples_per_second": 157.325,
"eval_steps_per_second": 19.75,
"step": 47600
},
{
"epoch": 2777.945945945946,
"grad_norm": 0.13678689301013947,
"learning_rate": 1.234113712374582e-06,
"loss": 2.6131,
"step": 47700
},
{
"epoch": 2777.945945945946,
"eval_loss": 2.617353916168213,
"eval_runtime": 8.5357,
"eval_samples_per_second": 163.314,
"eval_steps_per_second": 20.502,
"step": 47700
},
{
"epoch": 2783.6358463726883,
"grad_norm": 0.12394748628139496,
"learning_rate": 1.2240802675585284e-06,
"loss": 2.6136,
"step": 47800
},
{
"epoch": 2783.6358463726883,
"eval_loss": 2.6153721809387207,
"eval_runtime": 8.548,
"eval_samples_per_second": 163.08,
"eval_steps_per_second": 20.473,
"step": 47800
},
{
"epoch": 2817.429587482219,
"grad_norm": 0.15023942291736603,
"learning_rate": 1.214046822742475e-06,
"loss": 2.613,
"step": 47900
},
{
"epoch": 2817.429587482219,
"eval_loss": 2.6161534786224365,
"eval_runtime": 8.8571,
"eval_samples_per_second": 157.388,
"eval_steps_per_second": 19.758,
"step": 47900
},
{
"epoch": 2823.1194879089617,
"grad_norm": 0.140534445643425,
"learning_rate": 1.2040133779264215e-06,
"loss": 2.6139,
"step": 48000
},
{
"epoch": 2823.1194879089617,
"eval_loss": 2.614151954650879,
"eval_runtime": 8.5312,
"eval_samples_per_second": 163.4,
"eval_steps_per_second": 20.513,
"step": 48000
},
{
"epoch": 2828.809388335704,
"grad_norm": 0.1297474205493927,
"learning_rate": 1.193979933110368e-06,
"loss": 2.6131,
"step": 48100
},
{
"epoch": 2828.809388335704,
"eval_loss": 2.612234354019165,
"eval_runtime": 8.5265,
"eval_samples_per_second": 163.489,
"eval_steps_per_second": 20.524,
"step": 48100
},
{
"epoch": 2834.4992887624467,
"grad_norm": 0.1272091567516327,
"learning_rate": 1.1839464882943144e-06,
"loss": 2.613,
"step": 48200
},
{
"epoch": 2834.4992887624467,
"eval_loss": 2.617521047592163,
"eval_runtime": 8.839,
"eval_samples_per_second": 157.711,
"eval_steps_per_second": 19.799,
"step": 48200
},
{
"epoch": 2840.189189189189,
"grad_norm": 0.16200745105743408,
"learning_rate": 1.173913043478261e-06,
"loss": 2.6134,
"step": 48300
},
{
"epoch": 2840.189189189189,
"eval_loss": 2.614422559738159,
"eval_runtime": 8.8546,
"eval_samples_per_second": 157.433,
"eval_steps_per_second": 19.764,
"step": 48300
},
{
"epoch": 2845.8790896159317,
"grad_norm": 0.13503460586071014,
"learning_rate": 1.1638795986622075e-06,
"loss": 2.6137,
"step": 48400
},
{
"epoch": 2845.8790896159317,
"eval_loss": 2.612483263015747,
"eval_runtime": 8.8623,
"eval_samples_per_second": 157.295,
"eval_steps_per_second": 19.747,
"step": 48400
},
{
"epoch": 2851.568990042674,
"grad_norm": 0.1506689339876175,
"learning_rate": 1.153846153846154e-06,
"loss": 2.6139,
"step": 48500
},
{
"epoch": 2851.568990042674,
"eval_loss": 2.614689588546753,
"eval_runtime": 8.539,
"eval_samples_per_second": 163.251,
"eval_steps_per_second": 20.494,
"step": 48500
},
{
"epoch": 2857.2588904694167,
"grad_norm": 0.13846616446971893,
"learning_rate": 1.1438127090301004e-06,
"loss": 2.6135,
"step": 48600
},
{
"epoch": 2857.2588904694167,
"eval_loss": 2.612041711807251,
"eval_runtime": 8.5482,
"eval_samples_per_second": 163.075,
"eval_steps_per_second": 20.472,
"step": 48600
},
{
"epoch": 2862.948790896159,
"grad_norm": 0.12145441025495529,
"learning_rate": 1.133779264214047e-06,
"loss": 2.6134,
"step": 48700
},
{
"epoch": 2862.948790896159,
"eval_loss": 2.614562749862671,
"eval_runtime": 8.5346,
"eval_samples_per_second": 163.336,
"eval_steps_per_second": 20.505,
"step": 48700
},
{
"epoch": 2868.6386913229016,
"grad_norm": 0.1398162990808487,
"learning_rate": 1.1237458193979933e-06,
"loss": 2.6135,
"step": 48800
},
{
"epoch": 2868.6386913229016,
"eval_loss": 2.6151633262634277,
"eval_runtime": 8.845,
"eval_samples_per_second": 157.603,
"eval_steps_per_second": 19.785,
"step": 48800
},
{
"epoch": 2874.3285917496446,
"grad_norm": 0.13078400492668152,
"learning_rate": 1.1137123745819398e-06,
"loss": 2.6135,
"step": 48900
},
{
"epoch": 2874.3285917496446,
"eval_loss": 2.612288236618042,
"eval_runtime": 8.5278,
"eval_samples_per_second": 163.466,
"eval_steps_per_second": 20.521,
"step": 48900
},
{
"epoch": 2880.018492176387,
"grad_norm": 0.14920541644096375,
"learning_rate": 1.1036789297658862e-06,
"loss": 2.6135,
"step": 49000
},
{
"epoch": 2880.018492176387,
"eval_loss": 2.615710496902466,
"eval_runtime": 8.5324,
"eval_samples_per_second": 163.376,
"eval_steps_per_second": 20.51,
"step": 49000
},
{
"epoch": 2885.7083926031296,
"grad_norm": 0.12429507821798325,
"learning_rate": 1.0936454849498327e-06,
"loss": 2.6132,
"step": 49100
},
{
"epoch": 2885.7083926031296,
"eval_loss": 2.6171202659606934,
"eval_runtime": 8.526,
"eval_samples_per_second": 163.5,
"eval_steps_per_second": 20.526,
"step": 49100
},
{
"epoch": 2891.398293029872,
"grad_norm": 0.14503461122512817,
"learning_rate": 1.0836120401337793e-06,
"loss": 2.6126,
"step": 49200
},
{
"epoch": 2891.398293029872,
"eval_loss": 2.617884874343872,
"eval_runtime": 8.8466,
"eval_samples_per_second": 157.575,
"eval_steps_per_second": 19.782,
"step": 49200
},
{
"epoch": 2897.0881934566146,
"grad_norm": 0.1336805522441864,
"learning_rate": 1.0735785953177258e-06,
"loss": 2.6132,
"step": 49300
},
{
"epoch": 2897.0881934566146,
"eval_loss": 2.6135501861572266,
"eval_runtime": 8.5221,
"eval_samples_per_second": 163.574,
"eval_steps_per_second": 20.535,
"step": 49300
},
{
"epoch": 2902.778093883357,
"grad_norm": 0.13340643048286438,
"learning_rate": 1.0635451505016722e-06,
"loss": 2.6132,
"step": 49400
},
{
"epoch": 2902.778093883357,
"eval_loss": 2.6155478954315186,
"eval_runtime": 8.5297,
"eval_samples_per_second": 163.428,
"eval_steps_per_second": 20.516,
"step": 49400
},
{
"epoch": 2908.4679943100996,
"grad_norm": 0.12103159725666046,
"learning_rate": 1.0535117056856187e-06,
"loss": 2.6128,
"step": 49500
},
{
"epoch": 2908.4679943100996,
"eval_loss": 2.614527702331543,
"eval_runtime": 8.5244,
"eval_samples_per_second": 163.531,
"eval_steps_per_second": 20.529,
"step": 49500
},
{
"epoch": 2914.157894736842,
"grad_norm": 0.13566209375858307,
"learning_rate": 1.0434782608695653e-06,
"loss": 2.6131,
"step": 49600
},
{
"epoch": 2914.157894736842,
"eval_loss": 2.6156363487243652,
"eval_runtime": 8.8443,
"eval_samples_per_second": 157.615,
"eval_steps_per_second": 19.787,
"step": 49600
},
{
"epoch": 2919.8477951635846,
"grad_norm": 0.14300596714019775,
"learning_rate": 1.0334448160535118e-06,
"loss": 2.6133,
"step": 49700
},
{
"epoch": 2919.8477951635846,
"eval_loss": 2.614346742630005,
"eval_runtime": 8.5557,
"eval_samples_per_second": 162.933,
"eval_steps_per_second": 20.454,
"step": 49700
},
{
"epoch": 2925.537695590327,
"grad_norm": 0.1305309683084488,
"learning_rate": 1.0234113712374581e-06,
"loss": 2.6127,
"step": 49800
},
{
"epoch": 2925.537695590327,
"eval_loss": 2.617161512374878,
"eval_runtime": 8.5486,
"eval_samples_per_second": 163.068,
"eval_steps_per_second": 20.471,
"step": 49800
},
{
"epoch": 2931.2275960170696,
"grad_norm": 0.12761159241199493,
"learning_rate": 1.0133779264214047e-06,
"loss": 2.6131,
"step": 49900
},
{
"epoch": 2931.2275960170696,
"eval_loss": 2.609426259994507,
"eval_runtime": 8.5282,
"eval_samples_per_second": 163.459,
"eval_steps_per_second": 20.52,
"step": 49900
},
{
"epoch": 2936.917496443812,
"grad_norm": 0.14436037838459015,
"learning_rate": 1.0033444816053512e-06,
"loss": 2.6129,
"step": 50000
},
{
"epoch": 2936.917496443812,
"eval_loss": 2.6124675273895264,
"eval_runtime": 8.8553,
"eval_samples_per_second": 157.419,
"eval_steps_per_second": 19.762,
"step": 50000
},
{
"epoch": 2942.6073968705546,
"grad_norm": 0.14199206233024597,
"learning_rate": 9.933110367892976e-07,
"loss": 2.6135,
"step": 50100
},
{
"epoch": 2942.6073968705546,
"eval_loss": 2.6135404109954834,
"eval_runtime": 8.5286,
"eval_samples_per_second": 163.449,
"eval_steps_per_second": 20.519,
"step": 50100
},
{
"epoch": 2948.2972972972975,
"grad_norm": 0.13962940871715546,
"learning_rate": 9.832775919732441e-07,
"loss": 2.6124,
"step": 50200
},
{
"epoch": 2948.2972972972975,
"eval_loss": 2.6167821884155273,
"eval_runtime": 8.5303,
"eval_samples_per_second": 163.418,
"eval_steps_per_second": 20.515,
"step": 50200
},
{
"epoch": 2953.98719772404,
"grad_norm": 0.13427576422691345,
"learning_rate": 9.732441471571907e-07,
"loss": 2.6131,
"step": 50300
},
{
"epoch": 2953.98719772404,
"eval_loss": 2.614187240600586,
"eval_runtime": 8.5272,
"eval_samples_per_second": 163.476,
"eval_steps_per_second": 20.522,
"step": 50300
},
{
"epoch": 2959.6770981507825,
"grad_norm": 0.14102576673030853,
"learning_rate": 9.632107023411372e-07,
"loss": 2.613,
"step": 50400
},
{
"epoch": 2959.6770981507825,
"eval_loss": 2.6135125160217285,
"eval_runtime": 8.8525,
"eval_samples_per_second": 157.47,
"eval_steps_per_second": 19.769,
"step": 50400
},
{
"epoch": 2965.366998577525,
"grad_norm": 0.13503779470920563,
"learning_rate": 9.531772575250837e-07,
"loss": 2.6128,
"step": 50500
},
{
"epoch": 2965.366998577525,
"eval_loss": 2.616763114929199,
"eval_runtime": 8.5222,
"eval_samples_per_second": 163.573,
"eval_steps_per_second": 20.535,
"step": 50500
},
{
"epoch": 2971.0568990042675,
"grad_norm": 0.11660658568143845,
"learning_rate": 9.431438127090301e-07,
"loss": 2.6132,
"step": 50600
},
{
"epoch": 2971.0568990042675,
"eval_loss": 2.6096742153167725,
"eval_runtime": 8.5204,
"eval_samples_per_second": 163.606,
"eval_steps_per_second": 20.539,
"step": 50600
},
{
"epoch": 2976.74679943101,
"grad_norm": 0.11942931264638901,
"learning_rate": 9.331103678929767e-07,
"loss": 2.6127,
"step": 50700
},
{
"epoch": 2976.74679943101,
"eval_loss": 2.6175696849823,
"eval_runtime": 8.5208,
"eval_samples_per_second": 163.599,
"eval_steps_per_second": 20.538,
"step": 50700
},
{
"epoch": 2982.4366998577525,
"grad_norm": 0.13427217304706573,
"learning_rate": 9.230769230769231e-07,
"loss": 2.6129,
"step": 50800
},
{
"epoch": 2982.4366998577525,
"eval_loss": 2.617108106613159,
"eval_runtime": 8.8462,
"eval_samples_per_second": 157.582,
"eval_steps_per_second": 19.783,
"step": 50800
},
{
"epoch": 2988.126600284495,
"grad_norm": 0.13947026431560516,
"learning_rate": 9.130434782608697e-07,
"loss": 2.6128,
"step": 50900
},
{
"epoch": 2988.126600284495,
"eval_loss": 2.614734411239624,
"eval_runtime": 8.5215,
"eval_samples_per_second": 163.586,
"eval_steps_per_second": 20.536,
"step": 50900
},
{
"epoch": 2993.8165007112375,
"grad_norm": 0.12719608843326569,
"learning_rate": 9.030100334448161e-07,
"loss": 2.6132,
"step": 51000
},
{
"epoch": 2993.8165007112375,
"eval_loss": 2.6145222187042236,
"eval_runtime": 8.5246,
"eval_samples_per_second": 163.526,
"eval_steps_per_second": 20.529,
"step": 51000
},
{
"epoch": 2999.50640113798,
"grad_norm": 0.13431696593761444,
"learning_rate": 8.929765886287627e-07,
"loss": 2.613,
"step": 51100
},
{
"epoch": 2999.50640113798,
"eval_loss": 2.615795850753784,
"eval_runtime": 8.5247,
"eval_samples_per_second": 163.525,
"eval_steps_per_second": 20.529,
"step": 51100
},
{
"epoch": 3005.1963015647225,
"grad_norm": 0.122039295732975,
"learning_rate": 8.829431438127091e-07,
"loss": 2.6133,
"step": 51200
},
{
"epoch": 3005.1963015647225,
"eval_loss": 2.613452434539795,
"eval_runtime": 8.8592,
"eval_samples_per_second": 157.35,
"eval_steps_per_second": 19.753,
"step": 51200
},
{
"epoch": 3010.886201991465,
"grad_norm": 0.12794600427150726,
"learning_rate": 8.729096989966555e-07,
"loss": 2.6125,
"step": 51300
},
{
"epoch": 3010.886201991465,
"eval_loss": 2.613462209701538,
"eval_runtime": 8.5292,
"eval_samples_per_second": 163.439,
"eval_steps_per_second": 20.518,
"step": 51300
},
{
"epoch": 3016.5761024182075,
"grad_norm": 0.15235668420791626,
"learning_rate": 8.628762541806019e-07,
"loss": 2.6128,
"step": 51400
},
{
"epoch": 3016.5761024182075,
"eval_loss": 2.6159415245056152,
"eval_runtime": 8.5195,
"eval_samples_per_second": 163.624,
"eval_steps_per_second": 20.541,
"step": 51400
},
{
"epoch": 3022.2660028449504,
"grad_norm": 0.1353672742843628,
"learning_rate": 8.528428093645485e-07,
"loss": 2.6126,
"step": 51500
},
{
"epoch": 3022.2660028449504,
"eval_loss": 2.6131489276885986,
"eval_runtime": 8.5212,
"eval_samples_per_second": 163.593,
"eval_steps_per_second": 20.537,
"step": 51500
},
{
"epoch": 3027.955903271693,
"grad_norm": 0.1265411078929901,
"learning_rate": 8.428093645484949e-07,
"loss": 2.6127,
"step": 51600
},
{
"epoch": 3027.955903271693,
"eval_loss": 2.6140012741088867,
"eval_runtime": 8.8546,
"eval_samples_per_second": 157.432,
"eval_steps_per_second": 19.764,
"step": 51600
},
{
"epoch": 3033.6458036984354,
"grad_norm": 0.12123577296733856,
"learning_rate": 8.327759197324414e-07,
"loss": 2.6123,
"step": 51700
},
{
"epoch": 3033.6458036984354,
"eval_loss": 2.617744207382202,
"eval_runtime": 8.5337,
"eval_samples_per_second": 163.352,
"eval_steps_per_second": 20.507,
"step": 51700
},
{
"epoch": 3039.335704125178,
"grad_norm": 0.13425582647323608,
"learning_rate": 8.227424749163879e-07,
"loss": 2.6128,
"step": 51800
},
{
"epoch": 3039.335704125178,
"eval_loss": 2.6144909858703613,
"eval_runtime": 8.5251,
"eval_samples_per_second": 163.518,
"eval_steps_per_second": 20.528,
"step": 51800
},
{
"epoch": 3045.0256045519204,
"grad_norm": 0.12807567417621613,
"learning_rate": 8.127090301003344e-07,
"loss": 2.6127,
"step": 51900
},
{
"epoch": 3045.0256045519204,
"eval_loss": 2.6116931438446045,
"eval_runtime": 8.5237,
"eval_samples_per_second": 163.545,
"eval_steps_per_second": 20.531,
"step": 51900
},
{
"epoch": 3050.715504978663,
"grad_norm": 0.13802653551101685,
"learning_rate": 8.026755852842809e-07,
"loss": 2.6127,
"step": 52000
},
{
"epoch": 3050.715504978663,
"eval_loss": 2.613800287246704,
"eval_runtime": 8.8567,
"eval_samples_per_second": 157.395,
"eval_steps_per_second": 19.759,
"step": 52000
},
{
"epoch": 3056.4054054054054,
"grad_norm": 0.13295966386795044,
"learning_rate": 7.926421404682274e-07,
"loss": 2.6129,
"step": 52100
},
{
"epoch": 3056.4054054054054,
"eval_loss": 2.614452600479126,
"eval_runtime": 8.5253,
"eval_samples_per_second": 163.513,
"eval_steps_per_second": 20.527,
"step": 52100
},
{
"epoch": 3062.095305832148,
"grad_norm": 0.1325223296880722,
"learning_rate": 7.826086956521739e-07,
"loss": 2.6128,
"step": 52200
},
{
"epoch": 3062.095305832148,
"eval_loss": 2.6134650707244873,
"eval_runtime": 8.5247,
"eval_samples_per_second": 163.525,
"eval_steps_per_second": 20.529,
"step": 52200
},
{
"epoch": 3067.7852062588904,
"grad_norm": 0.13717898726463318,
"learning_rate": 7.725752508361204e-07,
"loss": 2.613,
"step": 52300
},
{
"epoch": 3067.7852062588904,
"eval_loss": 2.614713430404663,
"eval_runtime": 8.5281,
"eval_samples_per_second": 163.459,
"eval_steps_per_second": 20.52,
"step": 52300
},
{
"epoch": 3073.475106685633,
"grad_norm": 0.1319703459739685,
"learning_rate": 7.625418060200669e-07,
"loss": 2.6121,
"step": 52400
},
{
"epoch": 3073.475106685633,
"eval_loss": 2.615602970123291,
"eval_runtime": 8.8484,
"eval_samples_per_second": 157.542,
"eval_steps_per_second": 19.777,
"step": 52400
},
{
"epoch": 3079.1650071123754,
"grad_norm": 0.14499501883983612,
"learning_rate": 7.525083612040134e-07,
"loss": 2.6127,
"step": 52500
},
{
"epoch": 3079.1650071123754,
"eval_loss": 2.61905574798584,
"eval_runtime": 8.5283,
"eval_samples_per_second": 163.455,
"eval_steps_per_second": 20.52,
"step": 52500
},
{
"epoch": 3084.854907539118,
"grad_norm": 0.12991563975811005,
"learning_rate": 7.424749163879599e-07,
"loss": 2.6125,
"step": 52600
},
{
"epoch": 3084.854907539118,
"eval_loss": 2.616900682449341,
"eval_runtime": 8.5227,
"eval_samples_per_second": 163.563,
"eval_steps_per_second": 20.533,
"step": 52600
},
{
"epoch": 3090.5448079658604,
"grad_norm": 0.13655343651771545,
"learning_rate": 7.324414715719064e-07,
"loss": 2.6125,
"step": 52700
},
{
"epoch": 3090.5448079658604,
"eval_loss": 2.6136441230773926,
"eval_runtime": 8.8771,
"eval_samples_per_second": 157.033,
"eval_steps_per_second": 19.714,
"step": 52700
},
{
"epoch": 3096.2347083926034,
"grad_norm": 0.1371728628873825,
"learning_rate": 7.224080267558529e-07,
"loss": 2.6125,
"step": 52800
},
{
"epoch": 3096.2347083926034,
"eval_loss": 2.6113550662994385,
"eval_runtime": 8.5351,
"eval_samples_per_second": 163.325,
"eval_steps_per_second": 20.504,
"step": 52800
},
{
"epoch": 3101.924608819346,
"grad_norm": 0.12911546230316162,
"learning_rate": 7.123745819397994e-07,
"loss": 2.6125,
"step": 52900
},
{
"epoch": 3101.924608819346,
"eval_loss": 2.61665940284729,
"eval_runtime": 8.8673,
"eval_samples_per_second": 157.206,
"eval_steps_per_second": 19.735,
"step": 52900
},
{
"epoch": 3107.6145092460883,
"grad_norm": 0.13488008081912994,
"learning_rate": 7.023411371237459e-07,
"loss": 2.6125,
"step": 53000
},
{
"epoch": 3107.6145092460883,
"eval_loss": 2.616480827331543,
"eval_runtime": 8.5263,
"eval_samples_per_second": 163.495,
"eval_steps_per_second": 20.525,
"step": 53000
},
{
"epoch": 3113.304409672831,
"grad_norm": 0.1279713660478592,
"learning_rate": 6.923076923076923e-07,
"loss": 2.6126,
"step": 53100
},
{
"epoch": 3113.304409672831,
"eval_loss": 2.612837553024292,
"eval_runtime": 8.8593,
"eval_samples_per_second": 157.348,
"eval_steps_per_second": 19.753,
"step": 53100
},
{
"epoch": 3118.9943100995733,
"grad_norm": 0.11780209094285965,
"learning_rate": 6.822742474916388e-07,
"loss": 2.6129,
"step": 53200
},
{
"epoch": 3118.9943100995733,
"eval_loss": 2.6156790256500244,
"eval_runtime": 8.5184,
"eval_samples_per_second": 163.645,
"eval_steps_per_second": 20.544,
"step": 53200
},
{
"epoch": 3124.684210526316,
"grad_norm": 0.1243632510304451,
"learning_rate": 6.722408026755853e-07,
"loss": 2.6131,
"step": 53300
},
{
"epoch": 3124.684210526316,
"eval_loss": 2.6163458824157715,
"eval_runtime": 8.8667,
"eval_samples_per_second": 157.217,
"eval_steps_per_second": 19.737,
"step": 53300
},
{
"epoch": 3130.3741109530583,
"grad_norm": 0.13544081151485443,
"learning_rate": 6.622073578595318e-07,
"loss": 2.6124,
"step": 53400
},
{
"epoch": 3130.3741109530583,
"eval_loss": 2.6154003143310547,
"eval_runtime": 8.521,
"eval_samples_per_second": 163.596,
"eval_steps_per_second": 20.538,
"step": 53400
},
{
"epoch": 3136.064011379801,
"grad_norm": 0.14009779691696167,
"learning_rate": 6.521739130434783e-07,
"loss": 2.6129,
"step": 53500
},
{
"epoch": 3136.064011379801,
"eval_loss": 2.6123223304748535,
"eval_runtime": 8.5157,
"eval_samples_per_second": 163.698,
"eval_steps_per_second": 20.55,
"step": 53500
},
{
"epoch": 3141.7539118065433,
"grad_norm": 0.12656356394290924,
"learning_rate": 6.421404682274248e-07,
"loss": 2.6122,
"step": 53600
},
{
"epoch": 3141.7539118065433,
"eval_loss": 2.6149022579193115,
"eval_runtime": 8.5183,
"eval_samples_per_second": 163.648,
"eval_steps_per_second": 20.544,
"step": 53600
},
{
"epoch": 3147.443812233286,
"grad_norm": 0.1256483644247055,
"learning_rate": 6.321070234113712e-07,
"loss": 2.6121,
"step": 53700
},
{
"epoch": 3147.443812233286,
"eval_loss": 2.613800048828125,
"eval_runtime": 8.8545,
"eval_samples_per_second": 157.434,
"eval_steps_per_second": 19.764,
"step": 53700
},
{
"epoch": 3153.1337126600283,
"grad_norm": 0.11175887286663055,
"learning_rate": 6.220735785953178e-07,
"loss": 2.6123,
"step": 53800
},
{
"epoch": 3153.1337126600283,
"eval_loss": 2.6164095401763916,
"eval_runtime": 8.8596,
"eval_samples_per_second": 157.343,
"eval_steps_per_second": 19.753,
"step": 53800
},
{
"epoch": 3158.823613086771,
"grad_norm": 0.12376561760902405,
"learning_rate": 6.120401337792642e-07,
"loss": 2.6125,
"step": 53900
},
{
"epoch": 3158.823613086771,
"eval_loss": 2.612550973892212,
"eval_runtime": 8.5335,
"eval_samples_per_second": 163.356,
"eval_steps_per_second": 20.507,
"step": 53900
},
{
"epoch": 3164.5135135135133,
"grad_norm": 0.12542764842510223,
"learning_rate": 6.020066889632107e-07,
"loss": 2.612,
"step": 54000
},
{
"epoch": 3164.5135135135133,
"eval_loss": 2.614248037338257,
"eval_runtime": 8.5367,
"eval_samples_per_second": 163.296,
"eval_steps_per_second": 20.5,
"step": 54000
},
{
"epoch": 3170.2034139402563,
"grad_norm": 0.12020324170589447,
"learning_rate": 5.919732441471572e-07,
"loss": 2.6123,
"step": 54100
},
{
"epoch": 3170.2034139402563,
"eval_loss": 2.615945339202881,
"eval_runtime": 8.8835,
"eval_samples_per_second": 156.92,
"eval_steps_per_second": 19.699,
"step": 54100
},
{
"epoch": 3175.8933143669988,
"grad_norm": 0.13160724937915802,
"learning_rate": 5.819397993311037e-07,
"loss": 2.6125,
"step": 54200
},
{
"epoch": 3175.8933143669988,
"eval_loss": 2.612717628479004,
"eval_runtime": 8.533,
"eval_samples_per_second": 163.365,
"eval_steps_per_second": 20.509,
"step": 54200
},
{
"epoch": 3181.5832147937413,
"grad_norm": 0.11064854264259338,
"learning_rate": 5.719063545150502e-07,
"loss": 2.6127,
"step": 54300
},
{
"epoch": 3181.5832147937413,
"eval_loss": 2.6137535572052,
"eval_runtime": 8.5351,
"eval_samples_per_second": 163.326,
"eval_steps_per_second": 20.504,
"step": 54300
},
{
"epoch": 3187.2731152204838,
"grad_norm": 0.13410420715808868,
"learning_rate": 5.618729096989966e-07,
"loss": 2.6125,
"step": 54400
},
{
"epoch": 3187.2731152204838,
"eval_loss": 2.6163814067840576,
"eval_runtime": 8.869,
"eval_samples_per_second": 157.176,
"eval_steps_per_second": 19.732,
"step": 54400
},
{
"epoch": 3192.9630156472263,
"grad_norm": 0.13291259109973907,
"learning_rate": 5.518394648829431e-07,
"loss": 2.6125,
"step": 54500
},
{
"epoch": 3192.9630156472263,
"eval_loss": 2.6127355098724365,
"eval_runtime": 8.5356,
"eval_samples_per_second": 163.315,
"eval_steps_per_second": 20.502,
"step": 54500
},
{
"epoch": 3198.6529160739688,
"grad_norm": 0.1289217323064804,
"learning_rate": 5.418060200668896e-07,
"loss": 2.6122,
"step": 54600
},
{
"epoch": 3198.6529160739688,
"eval_loss": 2.613924503326416,
"eval_runtime": 8.869,
"eval_samples_per_second": 157.176,
"eval_steps_per_second": 19.732,
"step": 54600
},
{
"epoch": 3204.3428165007113,
"grad_norm": 0.12402568757534027,
"learning_rate": 5.317725752508361e-07,
"loss": 2.6125,
"step": 54700
},
{
"epoch": 3204.3428165007113,
"eval_loss": 2.6189987659454346,
"eval_runtime": 8.5185,
"eval_samples_per_second": 163.643,
"eval_steps_per_second": 20.543,
"step": 54700
},
{
"epoch": 3210.0327169274537,
"grad_norm": 0.11996253579854965,
"learning_rate": 5.217391304347826e-07,
"loss": 2.6128,
"step": 54800
},
{
"epoch": 3210.0327169274537,
"eval_loss": 2.6130542755126953,
"eval_runtime": 8.5256,
"eval_samples_per_second": 163.508,
"eval_steps_per_second": 20.526,
"step": 54800
},
{
"epoch": 3215.7226173541962,
"grad_norm": 0.1303112506866455,
"learning_rate": 5.117056856187291e-07,
"loss": 2.6125,
"step": 54900
},
{
"epoch": 3215.7226173541962,
"eval_loss": 2.614591598510742,
"eval_runtime": 9.1311,
"eval_samples_per_second": 152.664,
"eval_steps_per_second": 19.165,
"step": 54900
},
{
"epoch": 3221.4125177809387,
"grad_norm": 0.13109813630580902,
"learning_rate": 5.016722408026756e-07,
"loss": 2.612,
"step": 55000
},
{
"epoch": 3221.4125177809387,
"eval_loss": 2.6141510009765625,
"eval_runtime": 8.5232,
"eval_samples_per_second": 163.554,
"eval_steps_per_second": 20.532,
"step": 55000
},
{
"epoch": 3227.1024182076812,
"grad_norm": 0.11694065481424332,
"learning_rate": 4.916387959866221e-07,
"loss": 2.6118,
"step": 55100
},
{
"epoch": 3227.1024182076812,
"eval_loss": 2.613607406616211,
"eval_runtime": 8.5214,
"eval_samples_per_second": 163.588,
"eval_steps_per_second": 20.537,
"step": 55100
},
{
"epoch": 3232.7923186344237,
"grad_norm": 0.126685231924057,
"learning_rate": 4.816053511705686e-07,
"loss": 2.6121,
"step": 55200
},
{
"epoch": 3232.7923186344237,
"eval_loss": 2.617375373840332,
"eval_runtime": 8.5254,
"eval_samples_per_second": 163.511,
"eval_steps_per_second": 20.527,
"step": 55200
},
{
"epoch": 3238.4822190611662,
"grad_norm": 0.1280149221420288,
"learning_rate": 4.7157190635451506e-07,
"loss": 2.6126,
"step": 55300
},
{
"epoch": 3238.4822190611662,
"eval_loss": 2.6150975227355957,
"eval_runtime": 8.8573,
"eval_samples_per_second": 157.383,
"eval_steps_per_second": 19.758,
"step": 55300
},
{
"epoch": 3244.172119487909,
"grad_norm": 0.13586066663265228,
"learning_rate": 4.6153846153846156e-07,
"loss": 2.6121,
"step": 55400
},
{
"epoch": 3244.172119487909,
"eval_loss": 2.613374710083008,
"eval_runtime": 8.5339,
"eval_samples_per_second": 163.349,
"eval_steps_per_second": 20.506,
"step": 55400
},
{
"epoch": 3249.8620199146517,
"grad_norm": 0.13014060258865356,
"learning_rate": 4.5150501672240806e-07,
"loss": 2.6122,
"step": 55500
},
{
"epoch": 3249.8620199146517,
"eval_loss": 2.6121749877929688,
"eval_runtime": 8.8553,
"eval_samples_per_second": 157.419,
"eval_steps_per_second": 19.762,
"step": 55500
},
{
"epoch": 3255.551920341394,
"grad_norm": 0.1337248831987381,
"learning_rate": 4.4147157190635456e-07,
"loss": 2.6115,
"step": 55600
},
{
"epoch": 3255.551920341394,
"eval_loss": 2.6143338680267334,
"eval_runtime": 8.532,
"eval_samples_per_second": 163.385,
"eval_steps_per_second": 20.511,
"step": 55600
},
{
"epoch": 3261.2418207681367,
"grad_norm": 0.12295526266098022,
"learning_rate": 4.3143812709030095e-07,
"loss": 2.6128,
"step": 55700
},
{
"epoch": 3261.2418207681367,
"eval_loss": 2.6155290603637695,
"eval_runtime": 8.5401,
"eval_samples_per_second": 163.23,
"eval_steps_per_second": 20.492,
"step": 55700
},
{
"epoch": 3266.931721194879,
"grad_norm": 0.13388285040855408,
"learning_rate": 4.2140468227424745e-07,
"loss": 2.6121,
"step": 55800
},
{
"epoch": 3266.931721194879,
"eval_loss": 2.6157045364379883,
"eval_runtime": 8.8687,
"eval_samples_per_second": 157.182,
"eval_steps_per_second": 19.732,
"step": 55800
},
{
"epoch": 3272.6216216216217,
"grad_norm": 0.1304856538772583,
"learning_rate": 4.1137123745819395e-07,
"loss": 2.6119,
"step": 55900
},
{
"epoch": 3272.6216216216217,
"eval_loss": 2.614722490310669,
"eval_runtime": 8.5288,
"eval_samples_per_second": 163.446,
"eval_steps_per_second": 20.519,
"step": 55900
},
{
"epoch": 3278.311522048364,
"grad_norm": 0.13436032831668854,
"learning_rate": 4.0133779264214045e-07,
"loss": 2.6123,
"step": 56000
},
{
"epoch": 3278.311522048364,
"eval_loss": 2.613041639328003,
"eval_runtime": 8.5278,
"eval_samples_per_second": 163.466,
"eval_steps_per_second": 20.521,
"step": 56000
},
{
"epoch": 3284.0014224751067,
"grad_norm": 0.14674031734466553,
"learning_rate": 3.9130434782608694e-07,
"loss": 2.6122,
"step": 56100
},
{
"epoch": 3284.0014224751067,
"eval_loss": 2.611990213394165,
"eval_runtime": 8.7786,
"eval_samples_per_second": 158.795,
"eval_steps_per_second": 19.935,
"step": 56100
},
{
"epoch": 3289.691322901849,
"grad_norm": 0.1269470900297165,
"learning_rate": 3.8127090301003344e-07,
"loss": 2.6119,
"step": 56200
},
{
"epoch": 3289.691322901849,
"eval_loss": 2.614060878753662,
"eval_runtime": 8.5989,
"eval_samples_per_second": 162.113,
"eval_steps_per_second": 20.351,
"step": 56200
},
{
"epoch": 3295.3812233285917,
"grad_norm": 0.13767366111278534,
"learning_rate": 3.7123745819397994e-07,
"loss": 2.6121,
"step": 56300
},
{
"epoch": 3295.3812233285917,
"eval_loss": 2.616547107696533,
"eval_runtime": 8.5426,
"eval_samples_per_second": 163.181,
"eval_steps_per_second": 20.485,
"step": 56300
},
{
"epoch": 3301.071123755334,
"grad_norm": 0.13906554877758026,
"learning_rate": 3.6120401337792644e-07,
"loss": 2.6121,
"step": 56400
},
{
"epoch": 3301.071123755334,
"eval_loss": 2.6139421463012695,
"eval_runtime": 8.5234,
"eval_samples_per_second": 163.55,
"eval_steps_per_second": 20.532,
"step": 56400
},
{
"epoch": 3306.7610241820767,
"grad_norm": 0.13832303881645203,
"learning_rate": 3.5117056856187294e-07,
"loss": 2.612,
"step": 56500
},
{
"epoch": 3306.7610241820767,
"eval_loss": 2.6141018867492676,
"eval_runtime": 8.8522,
"eval_samples_per_second": 157.475,
"eval_steps_per_second": 19.769,
"step": 56500
},
{
"epoch": 3312.450924608819,
"grad_norm": 0.13443072140216827,
"learning_rate": 3.411371237458194e-07,
"loss": 2.6127,
"step": 56600
},
{
"epoch": 3312.450924608819,
"eval_loss": 2.6130294799804688,
"eval_runtime": 8.5356,
"eval_samples_per_second": 163.317,
"eval_steps_per_second": 20.502,
"step": 56600
},
{
"epoch": 3318.140825035562,
"grad_norm": 0.1384400725364685,
"learning_rate": 3.311036789297659e-07,
"loss": 2.6125,
"step": 56700
},
{
"epoch": 3318.140825035562,
"eval_loss": 2.614971876144409,
"eval_runtime": 8.5276,
"eval_samples_per_second": 163.468,
"eval_steps_per_second": 20.521,
"step": 56700
},
{
"epoch": 3323.8307254623046,
"grad_norm": 0.12781038880348206,
"learning_rate": 3.210702341137124e-07,
"loss": 2.6119,
"step": 56800
},
{
"epoch": 3323.8307254623046,
"eval_loss": 2.6149492263793945,
"eval_runtime": 8.5244,
"eval_samples_per_second": 163.531,
"eval_steps_per_second": 20.529,
"step": 56800
},
{
"epoch": 3329.520625889047,
"grad_norm": 0.13229794800281525,
"learning_rate": 3.110367892976589e-07,
"loss": 2.6114,
"step": 56900
},
{
"epoch": 3329.520625889047,
"eval_loss": 2.620450019836426,
"eval_runtime": 8.8571,
"eval_samples_per_second": 157.388,
"eval_steps_per_second": 19.758,
"step": 56900
},
{
"epoch": 3335.2105263157896,
"grad_norm": 0.13062149286270142,
"learning_rate": 3.010033444816054e-07,
"loss": 2.6123,
"step": 57000
},
{
"epoch": 3335.2105263157896,
"eval_loss": 2.6148345470428467,
"eval_runtime": 8.5245,
"eval_samples_per_second": 163.528,
"eval_steps_per_second": 20.529,
"step": 57000
},
{
"epoch": 3340.900426742532,
"grad_norm": 0.1294122189283371,
"learning_rate": 2.9096989966555187e-07,
"loss": 2.6121,
"step": 57100
},
{
"epoch": 3340.900426742532,
"eval_loss": 2.6161153316497803,
"eval_runtime": 8.5288,
"eval_samples_per_second": 163.446,
"eval_steps_per_second": 20.519,
"step": 57100
},
{
"epoch": 3346.5903271692746,
"grad_norm": 0.1416897028684616,
"learning_rate": 2.809364548494983e-07,
"loss": 2.6121,
"step": 57200
},
{
"epoch": 3346.5903271692746,
"eval_loss": 2.610884428024292,
"eval_runtime": 8.8659,
"eval_samples_per_second": 157.232,
"eval_steps_per_second": 19.739,
"step": 57200
},
{
"epoch": 3352.280227596017,
"grad_norm": 0.13414239883422852,
"learning_rate": 2.709030100334448e-07,
"loss": 2.6117,
"step": 57300
},
{
"epoch": 3352.280227596017,
"eval_loss": 2.613905906677246,
"eval_runtime": 8.523,
"eval_samples_per_second": 163.557,
"eval_steps_per_second": 20.533,
"step": 57300
},
{
"epoch": 3357.9701280227596,
"grad_norm": 0.11113996803760529,
"learning_rate": 2.608695652173913e-07,
"loss": 2.6123,
"step": 57400
},
{
"epoch": 3357.9701280227596,
"eval_loss": 2.6124770641326904,
"eval_runtime": 8.5319,
"eval_samples_per_second": 163.387,
"eval_steps_per_second": 20.511,
"step": 57400
},
{
"epoch": 3363.660028449502,
"grad_norm": 0.12642131745815277,
"learning_rate": 2.508361204013378e-07,
"loss": 2.6121,
"step": 57500
},
{
"epoch": 3363.660028449502,
"eval_loss": 2.6125006675720215,
"eval_runtime": 8.5321,
"eval_samples_per_second": 163.382,
"eval_steps_per_second": 20.511,
"step": 57500
},
{
"epoch": 3369.3499288762446,
"grad_norm": 0.12002536654472351,
"learning_rate": 2.408026755852843e-07,
"loss": 2.6119,
"step": 57600
},
{
"epoch": 3369.3499288762446,
"eval_loss": 2.6137232780456543,
"eval_runtime": 8.8563,
"eval_samples_per_second": 157.402,
"eval_steps_per_second": 19.76,
"step": 57600
},
{
"epoch": 3375.039829302987,
"grad_norm": 0.12281110137701035,
"learning_rate": 2.3076923076923078e-07,
"loss": 2.6117,
"step": 57700
},
{
"epoch": 3375.039829302987,
"eval_loss": 2.6145715713500977,
"eval_runtime": 8.5313,
"eval_samples_per_second": 163.399,
"eval_steps_per_second": 20.513,
"step": 57700
},
{
"epoch": 3380.7297297297296,
"grad_norm": 0.14482566714286804,
"learning_rate": 2.2073578595317728e-07,
"loss": 2.6118,
"step": 57800
},
{
"epoch": 3380.7297297297296,
"eval_loss": 2.613739490509033,
"eval_runtime": 8.5223,
"eval_samples_per_second": 163.572,
"eval_steps_per_second": 20.534,
"step": 57800
},
{
"epoch": 3386.419630156472,
"grad_norm": 0.1368427276611328,
"learning_rate": 2.1070234113712372e-07,
"loss": 2.6119,
"step": 57900
},
{
"epoch": 3386.419630156472,
"eval_loss": 2.6152491569519043,
"eval_runtime": 8.8583,
"eval_samples_per_second": 157.366,
"eval_steps_per_second": 19.755,
"step": 57900
},
{
"epoch": 3392.109530583215,
"grad_norm": 0.13695128262043,
"learning_rate": 2.0066889632107022e-07,
"loss": 2.6116,
"step": 58000
},
{
"epoch": 3392.109530583215,
"eval_loss": 2.6153528690338135,
"eval_runtime": 8.5368,
"eval_samples_per_second": 163.292,
"eval_steps_per_second": 20.499,
"step": 58000
},
{
"epoch": 3397.7994310099575,
"grad_norm": 0.11453160643577576,
"learning_rate": 1.9063545150501672e-07,
"loss": 2.612,
"step": 58100
},
{
"epoch": 3397.7994310099575,
"eval_loss": 2.615257740020752,
"eval_runtime": 8.5272,
"eval_samples_per_second": 163.477,
"eval_steps_per_second": 20.523,
"step": 58100
},
{
"epoch": 3403.4893314367,
"grad_norm": 0.13847880065441132,
"learning_rate": 1.8060200668896322e-07,
"loss": 2.6125,
"step": 58200
},
{
"epoch": 3403.4893314367,
"eval_loss": 2.6181981563568115,
"eval_runtime": 8.8493,
"eval_samples_per_second": 157.527,
"eval_steps_per_second": 19.776,
"step": 58200
},
{
"epoch": 3409.1792318634425,
"grad_norm": 0.13308827579021454,
"learning_rate": 1.705685618729097e-07,
"loss": 2.6124,
"step": 58300
},
{
"epoch": 3409.1792318634425,
"eval_loss": 2.6120095252990723,
"eval_runtime": 8.5246,
"eval_samples_per_second": 163.526,
"eval_steps_per_second": 20.529,
"step": 58300
},
{
"epoch": 3414.869132290185,
"grad_norm": 0.13217765092849731,
"learning_rate": 1.605351170568562e-07,
"loss": 2.6117,
"step": 58400
},
{
"epoch": 3414.869132290185,
"eval_loss": 2.6182873249053955,
"eval_runtime": 8.5321,
"eval_samples_per_second": 163.384,
"eval_steps_per_second": 20.511,
"step": 58400
},
{
"epoch": 3420.5590327169275,
"grad_norm": 0.13483327627182007,
"learning_rate": 1.505016722408027e-07,
"loss": 2.6122,
"step": 58500
},
{
"epoch": 3420.5590327169275,
"eval_loss": 2.614924430847168,
"eval_runtime": 8.8582,
"eval_samples_per_second": 157.368,
"eval_steps_per_second": 19.756,
"step": 58500
},
{
"epoch": 3426.24893314367,
"grad_norm": 0.13368582725524902,
"learning_rate": 1.4046822742474916e-07,
"loss": 2.6123,
"step": 58600
},
{
"epoch": 3426.24893314367,
"eval_loss": 2.613650321960449,
"eval_runtime": 8.5412,
"eval_samples_per_second": 163.209,
"eval_steps_per_second": 20.489,
"step": 58600
},
{
"epoch": 3431.9388335704125,
"grad_norm": 0.13080868124961853,
"learning_rate": 1.3043478260869566e-07,
"loss": 2.612,
"step": 58700
},
{
"epoch": 3431.9388335704125,
"eval_loss": 2.61566162109375,
"eval_runtime": 8.8538,
"eval_samples_per_second": 157.447,
"eval_steps_per_second": 19.766,
"step": 58700
},
{
"epoch": 3437.628733997155,
"grad_norm": 0.1235753670334816,
"learning_rate": 1.2040133779264215e-07,
"loss": 2.6127,
"step": 58800
},
{
"epoch": 3437.628733997155,
"eval_loss": 2.612259864807129,
"eval_runtime": 8.5258,
"eval_samples_per_second": 163.503,
"eval_steps_per_second": 20.526,
"step": 58800
},
{
"epoch": 3443.3186344238975,
"grad_norm": 0.13656386733055115,
"learning_rate": 1.1036789297658864e-07,
"loss": 2.6119,
"step": 58900
},
{
"epoch": 3443.3186344238975,
"eval_loss": 2.615288257598877,
"eval_runtime": 8.8549,
"eval_samples_per_second": 157.428,
"eval_steps_per_second": 19.763,
"step": 58900
},
{
"epoch": 3449.00853485064,
"grad_norm": 0.13021564483642578,
"learning_rate": 1.0033444816053511e-07,
"loss": 2.6123,
"step": 59000
},
{
"epoch": 3449.00853485064,
"eval_loss": 2.6138880252838135,
"eval_runtime": 8.6443,
"eval_samples_per_second": 161.263,
"eval_steps_per_second": 20.245,
"step": 59000
},
{
"epoch": 3454.6984352773825,
"grad_norm": 0.12730829417705536,
"learning_rate": 9.030100334448161e-08,
"loss": 2.6119,
"step": 59100
},
{
"epoch": 3454.6984352773825,
"eval_loss": 2.6168129444122314,
"eval_runtime": 8.8546,
"eval_samples_per_second": 157.432,
"eval_steps_per_second": 19.764,
"step": 59100
},
{
"epoch": 3460.388335704125,
"grad_norm": 0.13064709305763245,
"learning_rate": 8.02675585284281e-08,
"loss": 2.6119,
"step": 59200
},
{
"epoch": 3460.388335704125,
"eval_loss": 2.616276502609253,
"eval_runtime": 8.5216,
"eval_samples_per_second": 163.585,
"eval_steps_per_second": 20.536,
"step": 59200
},
{
"epoch": 3466.078236130868,
"grad_norm": 0.13055041432380676,
"learning_rate": 7.023411371237458e-08,
"loss": 2.6118,
"step": 59300
},
{
"epoch": 3466.078236130868,
"eval_loss": 2.614774227142334,
"eval_runtime": 8.8545,
"eval_samples_per_second": 157.433,
"eval_steps_per_second": 19.764,
"step": 59300
},
{
"epoch": 3471.7681365576104,
"grad_norm": 0.12495147436857224,
"learning_rate": 6.020066889632108e-08,
"loss": 2.6122,
"step": 59400
},
{
"epoch": 3471.7681365576104,
"eval_loss": 2.6170403957366943,
"eval_runtime": 8.5346,
"eval_samples_per_second": 163.335,
"eval_steps_per_second": 20.505,
"step": 59400
},
{
"epoch": 3477.458036984353,
"grad_norm": 0.1302523910999298,
"learning_rate": 5.0167224080267556e-08,
"loss": 2.6119,
"step": 59500
},
{
"epoch": 3477.458036984353,
"eval_loss": 2.609950065612793,
"eval_runtime": 8.8493,
"eval_samples_per_second": 157.526,
"eval_steps_per_second": 19.775,
"step": 59500
},
{
"epoch": 3483.1479374110954,
"grad_norm": 0.13452781736850739,
"learning_rate": 4.013377926421405e-08,
"loss": 2.612,
"step": 59600
},
{
"epoch": 3483.1479374110954,
"eval_loss": 2.614889144897461,
"eval_runtime": 8.5246,
"eval_samples_per_second": 163.526,
"eval_steps_per_second": 20.529,
"step": 59600
},
{
"epoch": 3488.837837837838,
"grad_norm": 0.1290915459394455,
"learning_rate": 3.010033444816054e-08,
"loss": 2.6118,
"step": 59700
},
{
"epoch": 3488.837837837838,
"eval_loss": 2.616943120956421,
"eval_runtime": 8.87,
"eval_samples_per_second": 157.16,
"eval_steps_per_second": 19.73,
"step": 59700
},
{
"epoch": 3494.5277382645804,
"grad_norm": 0.12313296645879745,
"learning_rate": 2.0066889632107024e-08,
"loss": 2.6119,
"step": 59800
},
{
"epoch": 3494.5277382645804,
"eval_loss": 2.6143088340759277,
"eval_runtime": 8.5338,
"eval_samples_per_second": 163.35,
"eval_steps_per_second": 20.507,
"step": 59800
},
{
"epoch": 3500.217638691323,
"grad_norm": 0.11486466974020004,
"learning_rate": 1.0033444816053512e-08,
"loss": 2.6118,
"step": 59900
},
{
"epoch": 3500.217638691323,
"eval_loss": 2.6164870262145996,
"eval_runtime": 8.5248,
"eval_samples_per_second": 163.523,
"eval_steps_per_second": 20.528,
"step": 59900
},
{
"epoch": 3505.9075391180654,
"grad_norm": 0.11500786989927292,
"learning_rate": 0.0,
"loss": 2.6119,
"step": 60000
},
{
"epoch": 3505.9075391180654,
"eval_loss": 2.616939067840576,
"eval_runtime": 8.8637,
"eval_samples_per_second": 157.27,
"eval_steps_per_second": 19.743,
"step": 60000
}
],
"logging_steps": 100,
"max_steps": 60000,
"num_input_tokens_seen": 0,
"num_train_epochs": 3530,
"save_steps": 100,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 10,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 10
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.02069363654656e+19,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}