deskull's picture
Upload genome_sequence BERT medium model (60k steps)
346c427 verified
{
"best_metric": 6.378762722015381,
"best_model_checkpoint": "learning_source_20260316/genome_sequence/bert-output/genome_sequence-medium/checkpoint-59000",
"epoch": 133.07457721097865,
"eval_steps": 100,
"global_step": 60000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.22179096201829776,
"grad_norm": 0.9846197366714478,
"learning_rate": 3e-06,
"loss": 8.0812,
"step": 100
},
{
"epoch": 0.22179096201829776,
"eval_loss": 7.739930629730225,
"eval_runtime": 175.2691,
"eval_samples_per_second": 57.055,
"eval_steps_per_second": 7.132,
"step": 100
},
{
"epoch": 0.4435819240365955,
"grad_norm": 0.7575621604919434,
"learning_rate": 6e-06,
"loss": 7.5841,
"step": 200
},
{
"epoch": 0.4435819240365955,
"eval_loss": 7.357375144958496,
"eval_runtime": 174.8425,
"eval_samples_per_second": 57.194,
"eval_steps_per_second": 7.149,
"step": 200
},
{
"epoch": 0.6653728860548933,
"grad_norm": 2.5145933628082275,
"learning_rate": 5.989966555183947e-06,
"loss": 7.2475,
"step": 300
},
{
"epoch": 0.6653728860548933,
"eval_loss": 7.029321193695068,
"eval_runtime": 174.7596,
"eval_samples_per_second": 57.221,
"eval_steps_per_second": 7.153,
"step": 300
},
{
"epoch": 0.887163848073191,
"grad_norm": 2.1445820331573486,
"learning_rate": 5.979933110367893e-06,
"loss": 7.0137,
"step": 400
},
{
"epoch": 0.887163848073191,
"eval_loss": 6.827342987060547,
"eval_runtime": 174.7661,
"eval_samples_per_second": 57.219,
"eval_steps_per_second": 7.152,
"step": 400
},
{
"epoch": 1.1089548100914888,
"grad_norm": 2.264369010925293,
"learning_rate": 5.96989966555184e-06,
"loss": 6.8546,
"step": 500
},
{
"epoch": 1.1089548100914888,
"eval_loss": 6.712888240814209,
"eval_runtime": 174.751,
"eval_samples_per_second": 57.224,
"eval_steps_per_second": 7.153,
"step": 500
},
{
"epoch": 1.3307457721097866,
"grad_norm": 2.162890911102295,
"learning_rate": 5.959866220735786e-06,
"loss": 6.7576,
"step": 600
},
{
"epoch": 1.3307457721097866,
"eval_loss": 6.646862506866455,
"eval_runtime": 174.794,
"eval_samples_per_second": 57.21,
"eval_steps_per_second": 7.151,
"step": 600
},
{
"epoch": 1.5525367341280842,
"grad_norm": 1.8144651651382446,
"learning_rate": 5.949832775919732e-06,
"loss": 6.6931,
"step": 700
},
{
"epoch": 1.5525367341280842,
"eval_loss": 6.603611946105957,
"eval_runtime": 174.6927,
"eval_samples_per_second": 57.243,
"eval_steps_per_second": 7.155,
"step": 700
},
{
"epoch": 1.774327696146382,
"grad_norm": 1.877691626548767,
"learning_rate": 5.939799331103679e-06,
"loss": 6.6514,
"step": 800
},
{
"epoch": 1.774327696146382,
"eval_loss": 6.573204517364502,
"eval_runtime": 174.7097,
"eval_samples_per_second": 57.238,
"eval_steps_per_second": 7.155,
"step": 800
},
{
"epoch": 1.9961186581646797,
"grad_norm": 0.6093182563781738,
"learning_rate": 5.929765886287626e-06,
"loss": 6.6173,
"step": 900
},
{
"epoch": 1.9961186581646797,
"eval_loss": 6.551618576049805,
"eval_runtime": 174.7562,
"eval_samples_per_second": 57.223,
"eval_steps_per_second": 7.153,
"step": 900
},
{
"epoch": 2.2179096201829775,
"grad_norm": 0.6819909811019897,
"learning_rate": 5.919732441471572e-06,
"loss": 6.5884,
"step": 1000
},
{
"epoch": 2.2179096201829775,
"eval_loss": 6.52918004989624,
"eval_runtime": 174.672,
"eval_samples_per_second": 57.25,
"eval_steps_per_second": 7.156,
"step": 1000
},
{
"epoch": 2.4397005822012754,
"grad_norm": 1.7704071998596191,
"learning_rate": 5.9096989966555185e-06,
"loss": 6.5703,
"step": 1100
},
{
"epoch": 2.4397005822012754,
"eval_loss": 6.522487640380859,
"eval_runtime": 173.2328,
"eval_samples_per_second": 57.726,
"eval_steps_per_second": 7.216,
"step": 1100
},
{
"epoch": 2.6614915442195732,
"grad_norm": 0.34777677059173584,
"learning_rate": 5.899665551839465e-06,
"loss": 6.5571,
"step": 1200
},
{
"epoch": 2.6614915442195732,
"eval_loss": 6.512077808380127,
"eval_runtime": 174.7576,
"eval_samples_per_second": 57.222,
"eval_steps_per_second": 7.153,
"step": 1200
},
{
"epoch": 2.8832825062378706,
"grad_norm": 0.40927115082740784,
"learning_rate": 5.889632107023412e-06,
"loss": 6.5403,
"step": 1300
},
{
"epoch": 2.8832825062378706,
"eval_loss": 6.502514839172363,
"eval_runtime": 174.8661,
"eval_samples_per_second": 57.187,
"eval_steps_per_second": 7.148,
"step": 1300
},
{
"epoch": 3.1050734682561685,
"grad_norm": 1.628187894821167,
"learning_rate": 5.879598662207358e-06,
"loss": 6.5271,
"step": 1400
},
{
"epoch": 3.1050734682561685,
"eval_loss": 6.495711803436279,
"eval_runtime": 173.1482,
"eval_samples_per_second": 57.754,
"eval_steps_per_second": 7.219,
"step": 1400
},
{
"epoch": 3.3268644302744663,
"grad_norm": 0.2986718416213989,
"learning_rate": 5.869565217391305e-06,
"loss": 6.5149,
"step": 1500
},
{
"epoch": 3.3268644302744663,
"eval_loss": 6.48654317855835,
"eval_runtime": 174.8407,
"eval_samples_per_second": 57.195,
"eval_steps_per_second": 7.149,
"step": 1500
},
{
"epoch": 3.548655392292764,
"grad_norm": 0.2633047103881836,
"learning_rate": 5.8595317725752514e-06,
"loss": 6.5072,
"step": 1600
},
{
"epoch": 3.548655392292764,
"eval_loss": 6.482588291168213,
"eval_runtime": 172.4593,
"eval_samples_per_second": 57.985,
"eval_steps_per_second": 7.248,
"step": 1600
},
{
"epoch": 3.770446354311062,
"grad_norm": 1.4255868196487427,
"learning_rate": 5.849498327759197e-06,
"loss": 6.5022,
"step": 1700
},
{
"epoch": 3.770446354311062,
"eval_loss": 6.47553825378418,
"eval_runtime": 174.7719,
"eval_samples_per_second": 57.217,
"eval_steps_per_second": 7.152,
"step": 1700
},
{
"epoch": 3.9922373163293594,
"grad_norm": 0.9065702557563782,
"learning_rate": 5.839464882943144e-06,
"loss": 6.4959,
"step": 1800
},
{
"epoch": 3.9922373163293594,
"eval_loss": 6.473917484283447,
"eval_runtime": 174.8124,
"eval_samples_per_second": 57.204,
"eval_steps_per_second": 7.151,
"step": 1800
},
{
"epoch": 4.214028278347658,
"grad_norm": 0.566608190536499,
"learning_rate": 5.829431438127091e-06,
"loss": 6.4886,
"step": 1900
},
{
"epoch": 4.214028278347658,
"eval_loss": 6.469518184661865,
"eval_runtime": 174.705,
"eval_samples_per_second": 57.239,
"eval_steps_per_second": 7.155,
"step": 1900
},
{
"epoch": 4.435819240365955,
"grad_norm": 0.48451030254364014,
"learning_rate": 5.819397993311037e-06,
"loss": 6.4833,
"step": 2000
},
{
"epoch": 4.435819240365955,
"eval_loss": 6.466635704040527,
"eval_runtime": 172.7376,
"eval_samples_per_second": 57.891,
"eval_steps_per_second": 7.236,
"step": 2000
},
{
"epoch": 4.6576102023842525,
"grad_norm": 0.9523207545280457,
"learning_rate": 5.8093645484949836e-06,
"loss": 6.4787,
"step": 2100
},
{
"epoch": 4.6576102023842525,
"eval_loss": 6.461461544036865,
"eval_runtime": 175.0633,
"eval_samples_per_second": 57.122,
"eval_steps_per_second": 7.14,
"step": 2100
},
{
"epoch": 4.879401164402551,
"grad_norm": 0.5693651437759399,
"learning_rate": 5.79933110367893e-06,
"loss": 6.4728,
"step": 2200
},
{
"epoch": 4.879401164402551,
"eval_loss": 6.459517478942871,
"eval_runtime": 172.5038,
"eval_samples_per_second": 57.97,
"eval_steps_per_second": 7.246,
"step": 2200
},
{
"epoch": 5.101192126420848,
"grad_norm": 0.5901357531547546,
"learning_rate": 5.789297658862876e-06,
"loss": 6.4696,
"step": 2300
},
{
"epoch": 5.101192126420848,
"eval_loss": 6.457067012786865,
"eval_runtime": 174.7951,
"eval_samples_per_second": 57.21,
"eval_steps_per_second": 7.151,
"step": 2300
},
{
"epoch": 5.3229830884391465,
"grad_norm": 1.0042142868041992,
"learning_rate": 5.779264214046823e-06,
"loss": 6.4653,
"step": 2400
},
{
"epoch": 5.3229830884391465,
"eval_loss": 6.453999042510986,
"eval_runtime": 172.3564,
"eval_samples_per_second": 58.019,
"eval_steps_per_second": 7.252,
"step": 2400
},
{
"epoch": 5.544774050457444,
"grad_norm": 0.7791227102279663,
"learning_rate": 5.76923076923077e-06,
"loss": 6.4633,
"step": 2500
},
{
"epoch": 5.544774050457444,
"eval_loss": 6.451336860656738,
"eval_runtime": 174.7261,
"eval_samples_per_second": 57.232,
"eval_steps_per_second": 7.154,
"step": 2500
},
{
"epoch": 5.766565012475741,
"grad_norm": 0.8784928321838379,
"learning_rate": 5.759197324414716e-06,
"loss": 6.4583,
"step": 2600
},
{
"epoch": 5.766565012475741,
"eval_loss": 6.448914051055908,
"eval_runtime": 174.6875,
"eval_samples_per_second": 57.245,
"eval_steps_per_second": 7.156,
"step": 2600
},
{
"epoch": 5.98835597449404,
"grad_norm": 0.5964264869689941,
"learning_rate": 5.7491638795986624e-06,
"loss": 6.4565,
"step": 2700
},
{
"epoch": 5.98835597449404,
"eval_loss": 6.4472856521606445,
"eval_runtime": 174.8096,
"eval_samples_per_second": 57.205,
"eval_steps_per_second": 7.151,
"step": 2700
},
{
"epoch": 6.210146936512337,
"grad_norm": 0.9274541735649109,
"learning_rate": 5.739130434782609e-06,
"loss": 6.4532,
"step": 2800
},
{
"epoch": 6.210146936512337,
"eval_loss": 6.4447126388549805,
"eval_runtime": 174.8407,
"eval_samples_per_second": 57.195,
"eval_steps_per_second": 7.149,
"step": 2800
},
{
"epoch": 6.431937898530635,
"grad_norm": 1.001717209815979,
"learning_rate": 5.729096989966555e-06,
"loss": 6.4502,
"step": 2900
},
{
"epoch": 6.431937898530635,
"eval_loss": 6.442678928375244,
"eval_runtime": 174.8426,
"eval_samples_per_second": 57.194,
"eval_steps_per_second": 7.149,
"step": 2900
},
{
"epoch": 6.653728860548933,
"grad_norm": 1.0303460359573364,
"learning_rate": 5.719063545150502e-06,
"loss": 6.4461,
"step": 3000
},
{
"epoch": 6.653728860548933,
"eval_loss": 6.441711902618408,
"eval_runtime": 174.9081,
"eval_samples_per_second": 57.173,
"eval_steps_per_second": 7.147,
"step": 3000
},
{
"epoch": 6.87551982256723,
"grad_norm": 0.8993558287620544,
"learning_rate": 5.709030100334449e-06,
"loss": 6.4441,
"step": 3100
},
{
"epoch": 6.87551982256723,
"eval_loss": 6.4411845207214355,
"eval_runtime": 174.9473,
"eval_samples_per_second": 57.16,
"eval_steps_per_second": 7.145,
"step": 3100
},
{
"epoch": 7.097310784585528,
"grad_norm": 0.8197622299194336,
"learning_rate": 5.698996655518395e-06,
"loss": 6.4423,
"step": 3200
},
{
"epoch": 7.097310784585528,
"eval_loss": 6.436513900756836,
"eval_runtime": 174.8498,
"eval_samples_per_second": 57.192,
"eval_steps_per_second": 7.149,
"step": 3200
},
{
"epoch": 7.319101746603826,
"grad_norm": 0.8674586415290833,
"learning_rate": 5.688963210702341e-06,
"loss": 6.4396,
"step": 3300
},
{
"epoch": 7.319101746603826,
"eval_loss": 6.435162544250488,
"eval_runtime": 174.7933,
"eval_samples_per_second": 57.21,
"eval_steps_per_second": 7.151,
"step": 3300
},
{
"epoch": 7.540892708622124,
"grad_norm": 1.1237138509750366,
"learning_rate": 5.678929765886288e-06,
"loss": 6.436,
"step": 3400
},
{
"epoch": 7.540892708622124,
"eval_loss": 6.435031414031982,
"eval_runtime": 175.3834,
"eval_samples_per_second": 57.018,
"eval_steps_per_second": 7.127,
"step": 3400
},
{
"epoch": 7.762683670640421,
"grad_norm": 0.8178996443748474,
"learning_rate": 5.668896321070235e-06,
"loss": 6.436,
"step": 3500
},
{
"epoch": 7.762683670640421,
"eval_loss": 6.435057640075684,
"eval_runtime": 174.7238,
"eval_samples_per_second": 57.233,
"eval_steps_per_second": 7.154,
"step": 3500
},
{
"epoch": 7.984474632658719,
"grad_norm": 1.035356044769287,
"learning_rate": 5.658862876254181e-06,
"loss": 6.4349,
"step": 3600
},
{
"epoch": 7.984474632658719,
"eval_loss": 6.434642791748047,
"eval_runtime": 174.772,
"eval_samples_per_second": 57.217,
"eval_steps_per_second": 7.152,
"step": 3600
},
{
"epoch": 8.206265594677017,
"grad_norm": 0.5910846590995789,
"learning_rate": 5.6488294314381275e-06,
"loss": 6.4321,
"step": 3700
},
{
"epoch": 8.206265594677017,
"eval_loss": 6.431816577911377,
"eval_runtime": 174.7565,
"eval_samples_per_second": 57.222,
"eval_steps_per_second": 7.153,
"step": 3700
},
{
"epoch": 8.428056556695315,
"grad_norm": 1.0821483135223389,
"learning_rate": 5.638795986622074e-06,
"loss": 6.4311,
"step": 3800
},
{
"epoch": 8.428056556695315,
"eval_loss": 6.432049751281738,
"eval_runtime": 174.8038,
"eval_samples_per_second": 57.207,
"eval_steps_per_second": 7.151,
"step": 3800
},
{
"epoch": 8.649847518713612,
"grad_norm": 0.3947916328907013,
"learning_rate": 5.62876254180602e-06,
"loss": 6.4274,
"step": 3900
},
{
"epoch": 8.649847518713612,
"eval_loss": 6.434264183044434,
"eval_runtime": 174.9692,
"eval_samples_per_second": 57.153,
"eval_steps_per_second": 7.144,
"step": 3900
},
{
"epoch": 8.87163848073191,
"grad_norm": 0.9494003653526306,
"learning_rate": 5.618729096989967e-06,
"loss": 6.4274,
"step": 4000
},
{
"epoch": 8.87163848073191,
"eval_loss": 6.430780410766602,
"eval_runtime": 172.2523,
"eval_samples_per_second": 58.054,
"eval_steps_per_second": 7.257,
"step": 4000
},
{
"epoch": 9.093429442750208,
"grad_norm": 1.1131881475448608,
"learning_rate": 5.608695652173914e-06,
"loss": 6.4257,
"step": 4100
},
{
"epoch": 9.093429442750208,
"eval_loss": 6.429464817047119,
"eval_runtime": 174.8834,
"eval_samples_per_second": 57.181,
"eval_steps_per_second": 7.148,
"step": 4100
},
{
"epoch": 9.315220404768505,
"grad_norm": 1.5252963304519653,
"learning_rate": 5.59866220735786e-06,
"loss": 6.4234,
"step": 4200
},
{
"epoch": 9.315220404768505,
"eval_loss": 6.428813934326172,
"eval_runtime": 174.7709,
"eval_samples_per_second": 57.218,
"eval_steps_per_second": 7.152,
"step": 4200
},
{
"epoch": 9.537011366786803,
"grad_norm": 0.7536811828613281,
"learning_rate": 5.588628762541806e-06,
"loss": 6.4234,
"step": 4300
},
{
"epoch": 9.537011366786803,
"eval_loss": 6.428880214691162,
"eval_runtime": 174.7826,
"eval_samples_per_second": 57.214,
"eval_steps_per_second": 7.152,
"step": 4300
},
{
"epoch": 9.758802328805102,
"grad_norm": 0.6803523302078247,
"learning_rate": 5.578595317725753e-06,
"loss": 6.4212,
"step": 4400
},
{
"epoch": 9.758802328805102,
"eval_loss": 6.426270484924316,
"eval_runtime": 174.8236,
"eval_samples_per_second": 57.201,
"eval_steps_per_second": 7.15,
"step": 4400
},
{
"epoch": 9.9805932908234,
"grad_norm": 0.8163429498672485,
"learning_rate": 5.568561872909699e-06,
"loss": 6.4165,
"step": 4500
},
{
"epoch": 9.9805932908234,
"eval_loss": 6.428164005279541,
"eval_runtime": 174.8809,
"eval_samples_per_second": 57.182,
"eval_steps_per_second": 7.148,
"step": 4500
},
{
"epoch": 10.202384252841696,
"grad_norm": 0.630403459072113,
"learning_rate": 5.558528428093646e-06,
"loss": 6.4189,
"step": 4600
},
{
"epoch": 10.202384252841696,
"eval_loss": 6.428719520568848,
"eval_runtime": 174.7591,
"eval_samples_per_second": 57.222,
"eval_steps_per_second": 7.153,
"step": 4600
},
{
"epoch": 10.424175214859995,
"grad_norm": 0.8704747557640076,
"learning_rate": 5.548494983277593e-06,
"loss": 6.4192,
"step": 4700
},
{
"epoch": 10.424175214859995,
"eval_loss": 6.423656463623047,
"eval_runtime": 174.707,
"eval_samples_per_second": 57.239,
"eval_steps_per_second": 7.155,
"step": 4700
},
{
"epoch": 10.645966176878293,
"grad_norm": 1.2153334617614746,
"learning_rate": 5.5384615384615385e-06,
"loss": 6.4176,
"step": 4800
},
{
"epoch": 10.645966176878293,
"eval_loss": 6.427283763885498,
"eval_runtime": 174.8703,
"eval_samples_per_second": 57.185,
"eval_steps_per_second": 7.148,
"step": 4800
},
{
"epoch": 10.86775713889659,
"grad_norm": 0.9878360629081726,
"learning_rate": 5.528428093645485e-06,
"loss": 6.4147,
"step": 4900
},
{
"epoch": 10.86775713889659,
"eval_loss": 6.424483776092529,
"eval_runtime": 174.7677,
"eval_samples_per_second": 57.219,
"eval_steps_per_second": 7.152,
"step": 4900
},
{
"epoch": 11.089548100914888,
"grad_norm": 1.1536431312561035,
"learning_rate": 5.518394648829432e-06,
"loss": 6.4141,
"step": 5000
},
{
"epoch": 11.089548100914888,
"eval_loss": 6.423103332519531,
"eval_runtime": 174.7198,
"eval_samples_per_second": 57.235,
"eval_steps_per_second": 7.154,
"step": 5000
},
{
"epoch": 11.311339062933186,
"grad_norm": 0.5233383774757385,
"learning_rate": 5.508361204013378e-06,
"loss": 6.4143,
"step": 5100
},
{
"epoch": 11.311339062933186,
"eval_loss": 6.426151275634766,
"eval_runtime": 174.8201,
"eval_samples_per_second": 57.202,
"eval_steps_per_second": 7.15,
"step": 5100
},
{
"epoch": 11.533130024951483,
"grad_norm": 0.4546308219432831,
"learning_rate": 5.498327759197324e-06,
"loss": 6.4131,
"step": 5200
},
{
"epoch": 11.533130024951483,
"eval_loss": 6.41951322555542,
"eval_runtime": 174.8531,
"eval_samples_per_second": 57.191,
"eval_steps_per_second": 7.149,
"step": 5200
},
{
"epoch": 11.75492098696978,
"grad_norm": 0.7687248587608337,
"learning_rate": 5.488294314381271e-06,
"loss": 6.4127,
"step": 5300
},
{
"epoch": 11.75492098696978,
"eval_loss": 6.421510696411133,
"eval_runtime": 174.8268,
"eval_samples_per_second": 57.199,
"eval_steps_per_second": 7.15,
"step": 5300
},
{
"epoch": 11.97671194898808,
"grad_norm": 0.6706124544143677,
"learning_rate": 5.478260869565217e-06,
"loss": 6.4114,
"step": 5400
},
{
"epoch": 11.97671194898808,
"eval_loss": 6.42447566986084,
"eval_runtime": 174.8755,
"eval_samples_per_second": 57.184,
"eval_steps_per_second": 7.148,
"step": 5400
},
{
"epoch": 12.198502911006376,
"grad_norm": 1.165449619293213,
"learning_rate": 5.468227424749163e-06,
"loss": 6.4112,
"step": 5500
},
{
"epoch": 12.198502911006376,
"eval_loss": 6.423706531524658,
"eval_runtime": 174.8245,
"eval_samples_per_second": 57.2,
"eval_steps_per_second": 7.15,
"step": 5500
},
{
"epoch": 12.420293873024674,
"grad_norm": 0.614251434803009,
"learning_rate": 5.45819397993311e-06,
"loss": 6.4088,
"step": 5600
},
{
"epoch": 12.420293873024674,
"eval_loss": 6.417710304260254,
"eval_runtime": 174.7714,
"eval_samples_per_second": 57.218,
"eval_steps_per_second": 7.152,
"step": 5600
},
{
"epoch": 12.642084835042972,
"grad_norm": 0.7338353991508484,
"learning_rate": 5.448160535117057e-06,
"loss": 6.4093,
"step": 5700
},
{
"epoch": 12.642084835042972,
"eval_loss": 6.421204566955566,
"eval_runtime": 174.7739,
"eval_samples_per_second": 57.217,
"eval_steps_per_second": 7.152,
"step": 5700
},
{
"epoch": 12.86387579706127,
"grad_norm": 0.5238298773765564,
"learning_rate": 5.438127090301003e-06,
"loss": 6.4088,
"step": 5800
},
{
"epoch": 12.86387579706127,
"eval_loss": 6.418464183807373,
"eval_runtime": 174.8398,
"eval_samples_per_second": 57.195,
"eval_steps_per_second": 7.149,
"step": 5800
},
{
"epoch": 13.085666759079567,
"grad_norm": 0.8438045382499695,
"learning_rate": 5.4280936454849495e-06,
"loss": 6.4059,
"step": 5900
},
{
"epoch": 13.085666759079567,
"eval_loss": 6.41862678527832,
"eval_runtime": 174.7377,
"eval_samples_per_second": 57.229,
"eval_steps_per_second": 7.154,
"step": 5900
},
{
"epoch": 13.307457721097865,
"grad_norm": 0.6270604133605957,
"learning_rate": 5.418060200668896e-06,
"loss": 6.4083,
"step": 6000
},
{
"epoch": 13.307457721097865,
"eval_loss": 6.420100688934326,
"eval_runtime": 174.8134,
"eval_samples_per_second": 57.204,
"eval_steps_per_second": 7.15,
"step": 6000
},
{
"epoch": 13.529248683116164,
"grad_norm": 0.49625712633132935,
"learning_rate": 5.408026755852843e-06,
"loss": 6.4065,
"step": 6100
},
{
"epoch": 13.529248683116164,
"eval_loss": 6.41825008392334,
"eval_runtime": 174.9176,
"eval_samples_per_second": 57.17,
"eval_steps_per_second": 7.146,
"step": 6100
},
{
"epoch": 13.75103964513446,
"grad_norm": 0.996813178062439,
"learning_rate": 5.397993311036789e-06,
"loss": 6.4055,
"step": 6200
},
{
"epoch": 13.75103964513446,
"eval_loss": 6.419356346130371,
"eval_runtime": 174.915,
"eval_samples_per_second": 57.171,
"eval_steps_per_second": 7.146,
"step": 6200
},
{
"epoch": 13.972830607152758,
"grad_norm": 0.9816793203353882,
"learning_rate": 5.387959866220736e-06,
"loss": 6.4065,
"step": 6300
},
{
"epoch": 13.972830607152758,
"eval_loss": 6.4173455238342285,
"eval_runtime": 175.0068,
"eval_samples_per_second": 57.141,
"eval_steps_per_second": 7.143,
"step": 6300
},
{
"epoch": 14.194621569171057,
"grad_norm": 1.072190761566162,
"learning_rate": 5.3779264214046825e-06,
"loss": 6.403,
"step": 6400
},
{
"epoch": 14.194621569171057,
"eval_loss": 6.416932582855225,
"eval_runtime": 174.8129,
"eval_samples_per_second": 57.204,
"eval_steps_per_second": 7.151,
"step": 6400
},
{
"epoch": 14.416412531189353,
"grad_norm": 0.8124646544456482,
"learning_rate": 5.367892976588628e-06,
"loss": 6.4038,
"step": 6500
},
{
"epoch": 14.416412531189353,
"eval_loss": 6.417375087738037,
"eval_runtime": 174.7648,
"eval_samples_per_second": 57.22,
"eval_steps_per_second": 7.152,
"step": 6500
},
{
"epoch": 14.638203493207651,
"grad_norm": 0.6260553002357483,
"learning_rate": 5.357859531772575e-06,
"loss": 6.4045,
"step": 6600
},
{
"epoch": 14.638203493207651,
"eval_loss": 6.4163103103637695,
"eval_runtime": 173.3723,
"eval_samples_per_second": 57.679,
"eval_steps_per_second": 7.21,
"step": 6600
},
{
"epoch": 14.85999445522595,
"grad_norm": 0.6502517461776733,
"learning_rate": 5.347826086956522e-06,
"loss": 6.4039,
"step": 6700
},
{
"epoch": 14.85999445522595,
"eval_loss": 6.421817779541016,
"eval_runtime": 173.7415,
"eval_samples_per_second": 57.557,
"eval_steps_per_second": 7.195,
"step": 6700
},
{
"epoch": 15.081785417244248,
"grad_norm": 0.7852392196655273,
"learning_rate": 5.337792642140468e-06,
"loss": 6.4021,
"step": 6800
},
{
"epoch": 15.081785417244248,
"eval_loss": 6.414952278137207,
"eval_runtime": 174.7855,
"eval_samples_per_second": 57.213,
"eval_steps_per_second": 7.152,
"step": 6800
},
{
"epoch": 15.303576379262545,
"grad_norm": 0.5642409920692444,
"learning_rate": 5.327759197324415e-06,
"loss": 6.4018,
"step": 6900
},
{
"epoch": 15.303576379262545,
"eval_loss": 6.417159557342529,
"eval_runtime": 172.3834,
"eval_samples_per_second": 58.01,
"eval_steps_per_second": 7.251,
"step": 6900
},
{
"epoch": 15.525367341280843,
"grad_norm": 0.5935277938842773,
"learning_rate": 5.317725752508361e-06,
"loss": 6.4015,
"step": 7000
},
{
"epoch": 15.525367341280843,
"eval_loss": 6.419808864593506,
"eval_runtime": 174.8129,
"eval_samples_per_second": 57.204,
"eval_steps_per_second": 7.151,
"step": 7000
},
{
"epoch": 15.747158303299141,
"grad_norm": 0.8796281218528748,
"learning_rate": 5.307692307692307e-06,
"loss": 6.402,
"step": 7100
},
{
"epoch": 15.747158303299141,
"eval_loss": 6.413030624389648,
"eval_runtime": 174.991,
"eval_samples_per_second": 57.146,
"eval_steps_per_second": 7.143,
"step": 7100
},
{
"epoch": 15.968949265317438,
"grad_norm": 0.686579167842865,
"learning_rate": 5.297658862876254e-06,
"loss": 6.4003,
"step": 7200
},
{
"epoch": 15.968949265317438,
"eval_loss": 6.412362575531006,
"eval_runtime": 174.9653,
"eval_samples_per_second": 57.154,
"eval_steps_per_second": 7.144,
"step": 7200
},
{
"epoch": 16.190740227335738,
"grad_norm": 0.8254374265670776,
"learning_rate": 5.287625418060201e-06,
"loss": 6.4003,
"step": 7300
},
{
"epoch": 16.190740227335738,
"eval_loss": 6.415155410766602,
"eval_runtime": 174.7441,
"eval_samples_per_second": 57.227,
"eval_steps_per_second": 7.153,
"step": 7300
},
{
"epoch": 16.412531189354034,
"grad_norm": 1.0479621887207031,
"learning_rate": 5.277591973244147e-06,
"loss": 6.3999,
"step": 7400
},
{
"epoch": 16.412531189354034,
"eval_loss": 6.4169602394104,
"eval_runtime": 174.7973,
"eval_samples_per_second": 57.209,
"eval_steps_per_second": 7.151,
"step": 7400
},
{
"epoch": 16.63432215137233,
"grad_norm": 0.8358107805252075,
"learning_rate": 5.2675585284280935e-06,
"loss": 6.3989,
"step": 7500
},
{
"epoch": 16.63432215137233,
"eval_loss": 6.417453765869141,
"eval_runtime": 174.497,
"eval_samples_per_second": 57.308,
"eval_steps_per_second": 7.163,
"step": 7500
},
{
"epoch": 16.85611311339063,
"grad_norm": 0.6018221378326416,
"learning_rate": 5.25752508361204e-06,
"loss": 6.3991,
"step": 7600
},
{
"epoch": 16.85611311339063,
"eval_loss": 6.4126434326171875,
"eval_runtime": 172.7161,
"eval_samples_per_second": 57.898,
"eval_steps_per_second": 7.237,
"step": 7600
},
{
"epoch": 17.077904075408927,
"grad_norm": 1.0999138355255127,
"learning_rate": 5.247491638795986e-06,
"loss": 6.3981,
"step": 7700
},
{
"epoch": 17.077904075408927,
"eval_loss": 6.413776397705078,
"eval_runtime": 174.7746,
"eval_samples_per_second": 57.217,
"eval_steps_per_second": 7.152,
"step": 7700
},
{
"epoch": 17.299695037427224,
"grad_norm": 0.5430467128753662,
"learning_rate": 5.237458193979933e-06,
"loss": 6.3993,
"step": 7800
},
{
"epoch": 17.299695037427224,
"eval_loss": 6.414647102355957,
"eval_runtime": 174.8381,
"eval_samples_per_second": 57.196,
"eval_steps_per_second": 7.149,
"step": 7800
},
{
"epoch": 17.521485999445524,
"grad_norm": 0.588058352470398,
"learning_rate": 5.22742474916388e-06,
"loss": 6.3976,
"step": 7900
},
{
"epoch": 17.521485999445524,
"eval_loss": 6.413895130157471,
"eval_runtime": 174.9633,
"eval_samples_per_second": 57.155,
"eval_steps_per_second": 7.144,
"step": 7900
},
{
"epoch": 17.74327696146382,
"grad_norm": 0.365583598613739,
"learning_rate": 5.2173913043478265e-06,
"loss": 6.3966,
"step": 8000
},
{
"epoch": 17.74327696146382,
"eval_loss": 6.409445285797119,
"eval_runtime": 174.9468,
"eval_samples_per_second": 57.16,
"eval_steps_per_second": 7.145,
"step": 8000
},
{
"epoch": 17.965067923482117,
"grad_norm": 0.6981125473976135,
"learning_rate": 5.207357859531772e-06,
"loss": 6.3974,
"step": 8100
},
{
"epoch": 17.965067923482117,
"eval_loss": 6.413646221160889,
"eval_runtime": 174.9417,
"eval_samples_per_second": 57.162,
"eval_steps_per_second": 7.145,
"step": 8100
},
{
"epoch": 18.186858885500417,
"grad_norm": 0.6041765213012695,
"learning_rate": 5.197324414715719e-06,
"loss": 6.3985,
"step": 8200
},
{
"epoch": 18.186858885500417,
"eval_loss": 6.411979675292969,
"eval_runtime": 174.8191,
"eval_samples_per_second": 57.202,
"eval_steps_per_second": 7.15,
"step": 8200
},
{
"epoch": 18.408649847518713,
"grad_norm": 0.7936201095581055,
"learning_rate": 5.187290969899666e-06,
"loss": 6.3964,
"step": 8300
},
{
"epoch": 18.408649847518713,
"eval_loss": 6.40911865234375,
"eval_runtime": 174.818,
"eval_samples_per_second": 57.202,
"eval_steps_per_second": 7.15,
"step": 8300
},
{
"epoch": 18.63044080953701,
"grad_norm": 0.6278252005577087,
"learning_rate": 5.177257525083612e-06,
"loss": 6.3957,
"step": 8400
},
{
"epoch": 18.63044080953701,
"eval_loss": 6.413068771362305,
"eval_runtime": 172.3693,
"eval_samples_per_second": 58.015,
"eval_steps_per_second": 7.252,
"step": 8400
},
{
"epoch": 18.85223177155531,
"grad_norm": 0.6582921743392944,
"learning_rate": 5.167224080267559e-06,
"loss": 6.3956,
"step": 8500
},
{
"epoch": 18.85223177155531,
"eval_loss": 6.410306453704834,
"eval_runtime": 174.8171,
"eval_samples_per_second": 57.203,
"eval_steps_per_second": 7.15,
"step": 8500
},
{
"epoch": 19.074022733573607,
"grad_norm": 0.8874194622039795,
"learning_rate": 5.157190635451505e-06,
"loss": 6.3975,
"step": 8600
},
{
"epoch": 19.074022733573607,
"eval_loss": 6.409109592437744,
"eval_runtime": 172.2351,
"eval_samples_per_second": 58.06,
"eval_steps_per_second": 7.258,
"step": 8600
},
{
"epoch": 19.295813695591903,
"grad_norm": 0.589608907699585,
"learning_rate": 5.147157190635451e-06,
"loss": 6.3957,
"step": 8700
},
{
"epoch": 19.295813695591903,
"eval_loss": 6.413524150848389,
"eval_runtime": 174.767,
"eval_samples_per_second": 57.219,
"eval_steps_per_second": 7.152,
"step": 8700
},
{
"epoch": 19.517604657610203,
"grad_norm": 0.7026548385620117,
"learning_rate": 5.137123745819398e-06,
"loss": 6.3942,
"step": 8800
},
{
"epoch": 19.517604657610203,
"eval_loss": 6.41259241104126,
"eval_runtime": 174.7786,
"eval_samples_per_second": 57.215,
"eval_steps_per_second": 7.152,
"step": 8800
},
{
"epoch": 19.7393956196285,
"grad_norm": 0.7508072257041931,
"learning_rate": 5.127090301003345e-06,
"loss": 6.3936,
"step": 8900
},
{
"epoch": 19.7393956196285,
"eval_loss": 6.410432815551758,
"eval_runtime": 174.7865,
"eval_samples_per_second": 57.213,
"eval_steps_per_second": 7.152,
"step": 8900
},
{
"epoch": 19.9611865816468,
"grad_norm": 0.36028188467025757,
"learning_rate": 5.117056856187291e-06,
"loss": 6.3943,
"step": 9000
},
{
"epoch": 19.9611865816468,
"eval_loss": 6.409936904907227,
"eval_runtime": 174.852,
"eval_samples_per_second": 57.191,
"eval_steps_per_second": 7.149,
"step": 9000
},
{
"epoch": 20.182977543665096,
"grad_norm": 0.8198152184486389,
"learning_rate": 5.1070234113712375e-06,
"loss": 6.3939,
"step": 9100
},
{
"epoch": 20.182977543665096,
"eval_loss": 6.412051677703857,
"eval_runtime": 174.8255,
"eval_samples_per_second": 57.2,
"eval_steps_per_second": 7.15,
"step": 9100
},
{
"epoch": 20.404768505683393,
"grad_norm": 0.6599276065826416,
"learning_rate": 5.096989966555184e-06,
"loss": 6.3939,
"step": 9200
},
{
"epoch": 20.404768505683393,
"eval_loss": 6.411386489868164,
"eval_runtime": 174.8596,
"eval_samples_per_second": 57.189,
"eval_steps_per_second": 7.149,
"step": 9200
},
{
"epoch": 20.626559467701693,
"grad_norm": 0.736455500125885,
"learning_rate": 5.08695652173913e-06,
"loss": 6.3931,
"step": 9300
},
{
"epoch": 20.626559467701693,
"eval_loss": 6.40945291519165,
"eval_runtime": 174.8463,
"eval_samples_per_second": 57.193,
"eval_steps_per_second": 7.149,
"step": 9300
},
{
"epoch": 20.84835042971999,
"grad_norm": 0.7547162175178528,
"learning_rate": 5.076923076923077e-06,
"loss": 6.393,
"step": 9400
},
{
"epoch": 20.84835042971999,
"eval_loss": 6.409768581390381,
"eval_runtime": 175.1356,
"eval_samples_per_second": 57.099,
"eval_steps_per_second": 7.137,
"step": 9400
},
{
"epoch": 21.070141391738286,
"grad_norm": 0.4197324216365814,
"learning_rate": 5.066889632107024e-06,
"loss": 6.3943,
"step": 9500
},
{
"epoch": 21.070141391738286,
"eval_loss": 6.4077606201171875,
"eval_runtime": 174.8821,
"eval_samples_per_second": 57.181,
"eval_steps_per_second": 7.148,
"step": 9500
},
{
"epoch": 21.291932353756586,
"grad_norm": 0.6552382111549377,
"learning_rate": 5.05685618729097e-06,
"loss": 6.3927,
"step": 9600
},
{
"epoch": 21.291932353756586,
"eval_loss": 6.40675163269043,
"eval_runtime": 174.964,
"eval_samples_per_second": 57.155,
"eval_steps_per_second": 7.144,
"step": 9600
},
{
"epoch": 21.513723315774882,
"grad_norm": 0.507618248462677,
"learning_rate": 5.046822742474916e-06,
"loss": 6.3948,
"step": 9700
},
{
"epoch": 21.513723315774882,
"eval_loss": 6.4094719886779785,
"eval_runtime": 174.8996,
"eval_samples_per_second": 57.176,
"eval_steps_per_second": 7.147,
"step": 9700
},
{
"epoch": 21.73551427779318,
"grad_norm": 1.0394549369812012,
"learning_rate": 5.036789297658863e-06,
"loss": 6.3933,
"step": 9800
},
{
"epoch": 21.73551427779318,
"eval_loss": 6.411880016326904,
"eval_runtime": 174.8373,
"eval_samples_per_second": 57.196,
"eval_steps_per_second": 7.15,
"step": 9800
},
{
"epoch": 21.95730523981148,
"grad_norm": 0.852592945098877,
"learning_rate": 5.02675585284281e-06,
"loss": 6.3932,
"step": 9900
},
{
"epoch": 21.95730523981148,
"eval_loss": 6.405695915222168,
"eval_runtime": 174.9233,
"eval_samples_per_second": 57.168,
"eval_steps_per_second": 7.146,
"step": 9900
},
{
"epoch": 22.179096201829775,
"grad_norm": 0.6302698254585266,
"learning_rate": 5.016722408026756e-06,
"loss": 6.3914,
"step": 10000
},
{
"epoch": 22.179096201829775,
"eval_loss": 6.404843807220459,
"eval_runtime": 174.881,
"eval_samples_per_second": 57.182,
"eval_steps_per_second": 7.148,
"step": 10000
},
{
"epoch": 22.400887163848072,
"grad_norm": 0.5545974969863892,
"learning_rate": 5.0066889632107026e-06,
"loss": 6.3913,
"step": 10100
},
{
"epoch": 22.400887163848072,
"eval_loss": 6.4088826179504395,
"eval_runtime": 174.9362,
"eval_samples_per_second": 57.164,
"eval_steps_per_second": 7.145,
"step": 10100
},
{
"epoch": 22.622678125866372,
"grad_norm": 0.6303640007972717,
"learning_rate": 4.996655518394649e-06,
"loss": 6.3916,
"step": 10200
},
{
"epoch": 22.622678125866372,
"eval_loss": 6.406084060668945,
"eval_runtime": 174.8669,
"eval_samples_per_second": 57.186,
"eval_steps_per_second": 7.148,
"step": 10200
},
{
"epoch": 22.84446908788467,
"grad_norm": 0.6866323947906494,
"learning_rate": 4.986622073578595e-06,
"loss": 6.3922,
"step": 10300
},
{
"epoch": 22.84446908788467,
"eval_loss": 6.406491279602051,
"eval_runtime": 172.5199,
"eval_samples_per_second": 57.964,
"eval_steps_per_second": 7.246,
"step": 10300
},
{
"epoch": 23.066260049902965,
"grad_norm": 0.5681377649307251,
"learning_rate": 4.976588628762542e-06,
"loss": 6.3919,
"step": 10400
},
{
"epoch": 23.066260049902965,
"eval_loss": 6.407881259918213,
"eval_runtime": 174.7996,
"eval_samples_per_second": 57.208,
"eval_steps_per_second": 7.151,
"step": 10400
},
{
"epoch": 23.288051011921265,
"grad_norm": 0.5302285552024841,
"learning_rate": 4.966555183946489e-06,
"loss": 6.3928,
"step": 10500
},
{
"epoch": 23.288051011921265,
"eval_loss": 6.4045891761779785,
"eval_runtime": 175.2024,
"eval_samples_per_second": 57.077,
"eval_steps_per_second": 7.135,
"step": 10500
},
{
"epoch": 23.50984197393956,
"grad_norm": 0.5630497336387634,
"learning_rate": 4.956521739130435e-06,
"loss": 6.3903,
"step": 10600
},
{
"epoch": 23.50984197393956,
"eval_loss": 6.406449317932129,
"eval_runtime": 172.7598,
"eval_samples_per_second": 57.884,
"eval_steps_per_second": 7.235,
"step": 10600
},
{
"epoch": 23.731632935957858,
"grad_norm": 0.5340705513954163,
"learning_rate": 4.9464882943143815e-06,
"loss": 6.3896,
"step": 10700
},
{
"epoch": 23.731632935957858,
"eval_loss": 6.408339023590088,
"eval_runtime": 175.3018,
"eval_samples_per_second": 57.044,
"eval_steps_per_second": 7.131,
"step": 10700
},
{
"epoch": 23.95342389797616,
"grad_norm": 0.7192414402961731,
"learning_rate": 4.936454849498328e-06,
"loss": 6.3904,
"step": 10800
},
{
"epoch": 23.95342389797616,
"eval_loss": 6.408904075622559,
"eval_runtime": 175.321,
"eval_samples_per_second": 57.038,
"eval_steps_per_second": 7.13,
"step": 10800
},
{
"epoch": 24.175214859994455,
"grad_norm": 0.7297828197479248,
"learning_rate": 4.926421404682274e-06,
"loss": 6.3906,
"step": 10900
},
{
"epoch": 24.175214859994455,
"eval_loss": 6.406455993652344,
"eval_runtime": 175.3497,
"eval_samples_per_second": 57.029,
"eval_steps_per_second": 7.129,
"step": 10900
},
{
"epoch": 24.39700582201275,
"grad_norm": 0.8612614870071411,
"learning_rate": 4.916387959866221e-06,
"loss": 6.389,
"step": 11000
},
{
"epoch": 24.39700582201275,
"eval_loss": 6.4049272537231445,
"eval_runtime": 174.9884,
"eval_samples_per_second": 57.147,
"eval_steps_per_second": 7.143,
"step": 11000
},
{
"epoch": 24.61879678403105,
"grad_norm": 0.39626169204711914,
"learning_rate": 4.906354515050168e-06,
"loss": 6.3904,
"step": 11100
},
{
"epoch": 24.61879678403105,
"eval_loss": 6.399599075317383,
"eval_runtime": 175.2182,
"eval_samples_per_second": 57.072,
"eval_steps_per_second": 7.134,
"step": 11100
},
{
"epoch": 24.840587746049348,
"grad_norm": 0.47381725907325745,
"learning_rate": 4.8963210702341136e-06,
"loss": 6.3896,
"step": 11200
},
{
"epoch": 24.840587746049348,
"eval_loss": 6.405921459197998,
"eval_runtime": 174.9189,
"eval_samples_per_second": 57.169,
"eval_steps_per_second": 7.146,
"step": 11200
},
{
"epoch": 25.062378708067648,
"grad_norm": 0.567333996295929,
"learning_rate": 4.88628762541806e-06,
"loss": 6.3886,
"step": 11300
},
{
"epoch": 25.062378708067648,
"eval_loss": 6.409249782562256,
"eval_runtime": 174.8058,
"eval_samples_per_second": 57.206,
"eval_steps_per_second": 7.151,
"step": 11300
},
{
"epoch": 25.284169670085944,
"grad_norm": 0.47083523869514465,
"learning_rate": 4.876254180602007e-06,
"loss": 6.3892,
"step": 11400
},
{
"epoch": 25.284169670085944,
"eval_loss": 6.406309604644775,
"eval_runtime": 174.8008,
"eval_samples_per_second": 57.208,
"eval_steps_per_second": 7.151,
"step": 11400
},
{
"epoch": 25.50596063210424,
"grad_norm": 0.4636823832988739,
"learning_rate": 4.866220735785953e-06,
"loss": 6.3905,
"step": 11500
},
{
"epoch": 25.50596063210424,
"eval_loss": 6.4087066650390625,
"eval_runtime": 174.7802,
"eval_samples_per_second": 57.215,
"eval_steps_per_second": 7.152,
"step": 11500
},
{
"epoch": 25.72775159412254,
"grad_norm": 0.8328993916511536,
"learning_rate": 4.8561872909699e-06,
"loss": 6.3888,
"step": 11600
},
{
"epoch": 25.72775159412254,
"eval_loss": 6.405496120452881,
"eval_runtime": 172.4449,
"eval_samples_per_second": 57.99,
"eval_steps_per_second": 7.249,
"step": 11600
},
{
"epoch": 25.949542556140837,
"grad_norm": 0.5866479873657227,
"learning_rate": 4.8461538461538465e-06,
"loss": 6.3895,
"step": 11700
},
{
"epoch": 25.949542556140837,
"eval_loss": 6.4065117835998535,
"eval_runtime": 172.4094,
"eval_samples_per_second": 58.001,
"eval_steps_per_second": 7.25,
"step": 11700
},
{
"epoch": 26.171333518159134,
"grad_norm": 0.7557168006896973,
"learning_rate": 4.8361204013377925e-06,
"loss": 6.3901,
"step": 11800
},
{
"epoch": 26.171333518159134,
"eval_loss": 6.404352188110352,
"eval_runtime": 174.7894,
"eval_samples_per_second": 57.212,
"eval_steps_per_second": 7.151,
"step": 11800
},
{
"epoch": 26.393124480177434,
"grad_norm": 0.5010234117507935,
"learning_rate": 4.826086956521739e-06,
"loss": 6.3881,
"step": 11900
},
{
"epoch": 26.393124480177434,
"eval_loss": 6.406057834625244,
"eval_runtime": 174.8928,
"eval_samples_per_second": 57.178,
"eval_steps_per_second": 7.147,
"step": 11900
},
{
"epoch": 26.61491544219573,
"grad_norm": 0.5228267312049866,
"learning_rate": 4.816053511705686e-06,
"loss": 6.3893,
"step": 12000
},
{
"epoch": 26.61491544219573,
"eval_loss": 6.403919219970703,
"eval_runtime": 173.0368,
"eval_samples_per_second": 57.791,
"eval_steps_per_second": 7.224,
"step": 12000
},
{
"epoch": 26.836706404214027,
"grad_norm": 0.41645535826683044,
"learning_rate": 4.806020066889633e-06,
"loss": 6.3893,
"step": 12100
},
{
"epoch": 26.836706404214027,
"eval_loss": 6.403182029724121,
"eval_runtime": 173.7518,
"eval_samples_per_second": 57.553,
"eval_steps_per_second": 7.194,
"step": 12100
},
{
"epoch": 27.058497366232327,
"grad_norm": 0.6280103325843811,
"learning_rate": 4.795986622073579e-06,
"loss": 6.388,
"step": 12200
},
{
"epoch": 27.058497366232327,
"eval_loss": 6.406325817108154,
"eval_runtime": 172.5376,
"eval_samples_per_second": 57.958,
"eval_steps_per_second": 7.245,
"step": 12200
},
{
"epoch": 27.280288328250624,
"grad_norm": 0.4701608419418335,
"learning_rate": 4.785953177257525e-06,
"loss": 6.3891,
"step": 12300
},
{
"epoch": 27.280288328250624,
"eval_loss": 6.403144836425781,
"eval_runtime": 174.7938,
"eval_samples_per_second": 57.21,
"eval_steps_per_second": 7.151,
"step": 12300
},
{
"epoch": 27.50207929026892,
"grad_norm": 0.49227380752563477,
"learning_rate": 4.775919732441472e-06,
"loss": 6.3893,
"step": 12400
},
{
"epoch": 27.50207929026892,
"eval_loss": 6.404545783996582,
"eval_runtime": 172.6406,
"eval_samples_per_second": 57.924,
"eval_steps_per_second": 7.24,
"step": 12400
},
{
"epoch": 27.72387025228722,
"grad_norm": 0.5558980703353882,
"learning_rate": 4.765886287625418e-06,
"loss": 6.3883,
"step": 12500
},
{
"epoch": 27.72387025228722,
"eval_loss": 6.402305603027344,
"eval_runtime": 174.8403,
"eval_samples_per_second": 57.195,
"eval_steps_per_second": 7.149,
"step": 12500
},
{
"epoch": 27.945661214305517,
"grad_norm": 0.7037143707275391,
"learning_rate": 4.755852842809365e-06,
"loss": 6.3885,
"step": 12600
},
{
"epoch": 27.945661214305517,
"eval_loss": 6.403327465057373,
"eval_runtime": 172.6409,
"eval_samples_per_second": 57.924,
"eval_steps_per_second": 7.24,
"step": 12600
},
{
"epoch": 28.167452176323813,
"grad_norm": 0.8158712983131409,
"learning_rate": 4.745819397993312e-06,
"loss": 6.3858,
"step": 12700
},
{
"epoch": 28.167452176323813,
"eval_loss": 6.40453577041626,
"eval_runtime": 174.7251,
"eval_samples_per_second": 57.233,
"eval_steps_per_second": 7.154,
"step": 12700
},
{
"epoch": 28.389243138342113,
"grad_norm": 0.49727940559387207,
"learning_rate": 4.7357859531772575e-06,
"loss": 6.3882,
"step": 12800
},
{
"epoch": 28.389243138342113,
"eval_loss": 6.404928684234619,
"eval_runtime": 174.7858,
"eval_samples_per_second": 57.213,
"eval_steps_per_second": 7.152,
"step": 12800
},
{
"epoch": 28.61103410036041,
"grad_norm": 0.5173976421356201,
"learning_rate": 4.725752508361204e-06,
"loss": 6.3866,
"step": 12900
},
{
"epoch": 28.61103410036041,
"eval_loss": 6.40172815322876,
"eval_runtime": 174.5561,
"eval_samples_per_second": 57.288,
"eval_steps_per_second": 7.161,
"step": 12900
},
{
"epoch": 28.832825062378706,
"grad_norm": 0.5842565298080444,
"learning_rate": 4.715719063545151e-06,
"loss": 6.3891,
"step": 13000
},
{
"epoch": 28.832825062378706,
"eval_loss": 6.401641845703125,
"eval_runtime": 172.3935,
"eval_samples_per_second": 58.007,
"eval_steps_per_second": 7.251,
"step": 13000
},
{
"epoch": 29.054616024397006,
"grad_norm": 0.6438339948654175,
"learning_rate": 4.705685618729097e-06,
"loss": 6.3869,
"step": 13100
},
{
"epoch": 29.054616024397006,
"eval_loss": 6.403342247009277,
"eval_runtime": 174.8489,
"eval_samples_per_second": 57.192,
"eval_steps_per_second": 7.149,
"step": 13100
},
{
"epoch": 29.276406986415303,
"grad_norm": 0.5338951349258423,
"learning_rate": 4.695652173913044e-06,
"loss": 6.3882,
"step": 13200
},
{
"epoch": 29.276406986415303,
"eval_loss": 6.400930404663086,
"eval_runtime": 172.4302,
"eval_samples_per_second": 57.994,
"eval_steps_per_second": 7.249,
"step": 13200
},
{
"epoch": 29.498197948433603,
"grad_norm": 0.5359793305397034,
"learning_rate": 4.6856187290969905e-06,
"loss": 6.3878,
"step": 13300
},
{
"epoch": 29.498197948433603,
"eval_loss": 6.406982898712158,
"eval_runtime": 174.802,
"eval_samples_per_second": 57.208,
"eval_steps_per_second": 7.151,
"step": 13300
},
{
"epoch": 29.7199889104519,
"grad_norm": 0.715033233165741,
"learning_rate": 4.675585284280936e-06,
"loss": 6.3859,
"step": 13400
},
{
"epoch": 29.7199889104519,
"eval_loss": 6.40342903137207,
"eval_runtime": 174.8452,
"eval_samples_per_second": 57.193,
"eval_steps_per_second": 7.149,
"step": 13400
},
{
"epoch": 29.941779872470196,
"grad_norm": 0.934853732585907,
"learning_rate": 4.665551839464883e-06,
"loss": 6.3875,
"step": 13500
},
{
"epoch": 29.941779872470196,
"eval_loss": 6.401629447937012,
"eval_runtime": 174.7924,
"eval_samples_per_second": 57.211,
"eval_steps_per_second": 7.151,
"step": 13500
},
{
"epoch": 30.163570834488496,
"grad_norm": 0.479612797498703,
"learning_rate": 4.65551839464883e-06,
"loss": 6.3866,
"step": 13600
},
{
"epoch": 30.163570834488496,
"eval_loss": 6.399043560028076,
"eval_runtime": 176.925,
"eval_samples_per_second": 56.521,
"eval_steps_per_second": 7.065,
"step": 13600
},
{
"epoch": 30.385361796506793,
"grad_norm": 0.5256738662719727,
"learning_rate": 4.645484949832776e-06,
"loss": 6.3878,
"step": 13700
},
{
"epoch": 30.385361796506793,
"eval_loss": 6.400505065917969,
"eval_runtime": 175.3482,
"eval_samples_per_second": 57.029,
"eval_steps_per_second": 7.129,
"step": 13700
},
{
"epoch": 30.60715275852509,
"grad_norm": 0.5690653920173645,
"learning_rate": 4.635451505016723e-06,
"loss": 6.3848,
"step": 13800
},
{
"epoch": 30.60715275852509,
"eval_loss": 6.403696060180664,
"eval_runtime": 172.8083,
"eval_samples_per_second": 57.868,
"eval_steps_per_second": 7.233,
"step": 13800
},
{
"epoch": 30.82894372054339,
"grad_norm": 0.4565252363681793,
"learning_rate": 4.625418060200669e-06,
"loss": 6.3849,
"step": 13900
},
{
"epoch": 30.82894372054339,
"eval_loss": 6.403767108917236,
"eval_runtime": 175.7515,
"eval_samples_per_second": 56.899,
"eval_steps_per_second": 7.112,
"step": 13900
},
{
"epoch": 31.050734682561686,
"grad_norm": 0.4801616966724396,
"learning_rate": 4.615384615384616e-06,
"loss": 6.3869,
"step": 14000
},
{
"epoch": 31.050734682561686,
"eval_loss": 6.400508403778076,
"eval_runtime": 174.4685,
"eval_samples_per_second": 57.317,
"eval_steps_per_second": 7.165,
"step": 14000
},
{
"epoch": 31.272525644579982,
"grad_norm": 0.5834231972694397,
"learning_rate": 4.605351170568562e-06,
"loss": 6.3853,
"step": 14100
},
{
"epoch": 31.272525644579982,
"eval_loss": 6.400169849395752,
"eval_runtime": 175.4977,
"eval_samples_per_second": 56.981,
"eval_steps_per_second": 7.123,
"step": 14100
},
{
"epoch": 31.494316606598282,
"grad_norm": 0.6701497435569763,
"learning_rate": 4.595317725752509e-06,
"loss": 6.3865,
"step": 14200
},
{
"epoch": 31.494316606598282,
"eval_loss": 6.397976875305176,
"eval_runtime": 175.4612,
"eval_samples_per_second": 56.993,
"eval_steps_per_second": 7.124,
"step": 14200
},
{
"epoch": 31.71610756861658,
"grad_norm": 0.4794948697090149,
"learning_rate": 4.585284280936456e-06,
"loss": 6.3852,
"step": 14300
},
{
"epoch": 31.71610756861658,
"eval_loss": 6.403610706329346,
"eval_runtime": 176.2646,
"eval_samples_per_second": 56.733,
"eval_steps_per_second": 7.092,
"step": 14300
},
{
"epoch": 31.937898530634875,
"grad_norm": 0.6028741002082825,
"learning_rate": 4.5752508361204015e-06,
"loss": 6.3851,
"step": 14400
},
{
"epoch": 31.937898530634875,
"eval_loss": 6.400261878967285,
"eval_runtime": 174.9022,
"eval_samples_per_second": 57.175,
"eval_steps_per_second": 7.147,
"step": 14400
},
{
"epoch": 32.15968949265317,
"grad_norm": 0.7439810037612915,
"learning_rate": 4.565217391304348e-06,
"loss": 6.3839,
"step": 14500
},
{
"epoch": 32.15968949265317,
"eval_loss": 6.397915363311768,
"eval_runtime": 172.885,
"eval_samples_per_second": 57.842,
"eval_steps_per_second": 7.23,
"step": 14500
},
{
"epoch": 32.381480454671475,
"grad_norm": 0.4727949798107147,
"learning_rate": 4.555183946488295e-06,
"loss": 6.3855,
"step": 14600
},
{
"epoch": 32.381480454671475,
"eval_loss": 6.39973258972168,
"eval_runtime": 175.4295,
"eval_samples_per_second": 57.003,
"eval_steps_per_second": 7.125,
"step": 14600
},
{
"epoch": 32.60327141668977,
"grad_norm": 0.5084313154220581,
"learning_rate": 4.545150501672241e-06,
"loss": 6.3833,
"step": 14700
},
{
"epoch": 32.60327141668977,
"eval_loss": 6.39823055267334,
"eval_runtime": 173.3819,
"eval_samples_per_second": 57.676,
"eval_steps_per_second": 7.21,
"step": 14700
},
{
"epoch": 32.82506237870807,
"grad_norm": 0.36422112584114075,
"learning_rate": 4.535117056856188e-06,
"loss": 6.3854,
"step": 14800
},
{
"epoch": 32.82506237870807,
"eval_loss": 6.402724266052246,
"eval_runtime": 174.9732,
"eval_samples_per_second": 57.152,
"eval_steps_per_second": 7.144,
"step": 14800
},
{
"epoch": 33.046853340726365,
"grad_norm": 0.5722773671150208,
"learning_rate": 4.5250836120401345e-06,
"loss": 6.3859,
"step": 14900
},
{
"epoch": 33.046853340726365,
"eval_loss": 6.396421909332275,
"eval_runtime": 175.3976,
"eval_samples_per_second": 57.013,
"eval_steps_per_second": 7.127,
"step": 14900
},
{
"epoch": 33.26864430274466,
"grad_norm": 0.859866201877594,
"learning_rate": 4.51505016722408e-06,
"loss": 6.3851,
"step": 15000
},
{
"epoch": 33.26864430274466,
"eval_loss": 6.396206855773926,
"eval_runtime": 174.5321,
"eval_samples_per_second": 57.296,
"eval_steps_per_second": 7.162,
"step": 15000
},
{
"epoch": 33.49043526476296,
"grad_norm": 0.8327785134315491,
"learning_rate": 4.505016722408027e-06,
"loss": 6.3848,
"step": 15100
},
{
"epoch": 33.49043526476296,
"eval_loss": 6.403675556182861,
"eval_runtime": 172.9138,
"eval_samples_per_second": 57.832,
"eval_steps_per_second": 7.229,
"step": 15100
},
{
"epoch": 33.71222622678126,
"grad_norm": 0.4790419042110443,
"learning_rate": 4.494983277591973e-06,
"loss": 6.3843,
"step": 15200
},
{
"epoch": 33.71222622678126,
"eval_loss": 6.397605895996094,
"eval_runtime": 175.348,
"eval_samples_per_second": 57.029,
"eval_steps_per_second": 7.129,
"step": 15200
},
{
"epoch": 33.93401718879956,
"grad_norm": 0.8004974722862244,
"learning_rate": 4.48494983277592e-06,
"loss": 6.3852,
"step": 15300
},
{
"epoch": 33.93401718879956,
"eval_loss": 6.396829605102539,
"eval_runtime": 172.9108,
"eval_samples_per_second": 57.833,
"eval_steps_per_second": 7.229,
"step": 15300
},
{
"epoch": 34.155808150817855,
"grad_norm": 0.40926745533943176,
"learning_rate": 4.474916387959866e-06,
"loss": 6.3835,
"step": 15400
},
{
"epoch": 34.155808150817855,
"eval_loss": 6.400079727172852,
"eval_runtime": 175.4491,
"eval_samples_per_second": 56.997,
"eval_steps_per_second": 7.125,
"step": 15400
},
{
"epoch": 34.37759911283615,
"grad_norm": 0.3634837567806244,
"learning_rate": 4.4648829431438125e-06,
"loss": 6.3836,
"step": 15500
},
{
"epoch": 34.37759911283615,
"eval_loss": 6.399561882019043,
"eval_runtime": 173.2399,
"eval_samples_per_second": 57.723,
"eval_steps_per_second": 7.215,
"step": 15500
},
{
"epoch": 34.59939007485445,
"grad_norm": 0.4545910954475403,
"learning_rate": 4.454849498327759e-06,
"loss": 6.3836,
"step": 15600
},
{
"epoch": 34.59939007485445,
"eval_loss": 6.3967742919921875,
"eval_runtime": 175.8575,
"eval_samples_per_second": 56.864,
"eval_steps_per_second": 7.108,
"step": 15600
},
{
"epoch": 34.821181036872744,
"grad_norm": 0.5282755494117737,
"learning_rate": 4.444816053511705e-06,
"loss": 6.3851,
"step": 15700
},
{
"epoch": 34.821181036872744,
"eval_loss": 6.399077892303467,
"eval_runtime": 175.7729,
"eval_samples_per_second": 56.892,
"eval_steps_per_second": 7.111,
"step": 15700
},
{
"epoch": 35.04297199889105,
"grad_norm": 0.5991719961166382,
"learning_rate": 4.434782608695652e-06,
"loss": 6.3846,
"step": 15800
},
{
"epoch": 35.04297199889105,
"eval_loss": 6.4012532234191895,
"eval_runtime": 175.8802,
"eval_samples_per_second": 56.857,
"eval_steps_per_second": 7.107,
"step": 15800
},
{
"epoch": 35.264762960909344,
"grad_norm": 0.5155884623527527,
"learning_rate": 4.424749163879599e-06,
"loss": 6.3836,
"step": 15900
},
{
"epoch": 35.264762960909344,
"eval_loss": 6.396469593048096,
"eval_runtime": 175.4084,
"eval_samples_per_second": 57.01,
"eval_steps_per_second": 7.126,
"step": 15900
},
{
"epoch": 35.48655392292764,
"grad_norm": 0.5687472224235535,
"learning_rate": 4.414715719063545e-06,
"loss": 6.3851,
"step": 16000
},
{
"epoch": 35.48655392292764,
"eval_loss": 6.39898681640625,
"eval_runtime": 172.8397,
"eval_samples_per_second": 57.857,
"eval_steps_per_second": 7.232,
"step": 16000
},
{
"epoch": 35.70834488494594,
"grad_norm": 0.43625304102897644,
"learning_rate": 4.404682274247491e-06,
"loss": 6.3839,
"step": 16100
},
{
"epoch": 35.70834488494594,
"eval_loss": 6.397797584533691,
"eval_runtime": 175.3929,
"eval_samples_per_second": 57.015,
"eval_steps_per_second": 7.127,
"step": 16100
},
{
"epoch": 35.930135846964234,
"grad_norm": 0.45570382475852966,
"learning_rate": 4.394648829431438e-06,
"loss": 6.383,
"step": 16200
},
{
"epoch": 35.930135846964234,
"eval_loss": 6.396146774291992,
"eval_runtime": 172.944,
"eval_samples_per_second": 57.822,
"eval_steps_per_second": 7.228,
"step": 16200
},
{
"epoch": 36.15192680898254,
"grad_norm": 0.5023874044418335,
"learning_rate": 4.384615384615384e-06,
"loss": 6.3832,
"step": 16300
},
{
"epoch": 36.15192680898254,
"eval_loss": 6.394959449768066,
"eval_runtime": 175.3162,
"eval_samples_per_second": 57.04,
"eval_steps_per_second": 7.13,
"step": 16300
},
{
"epoch": 36.373717771000834,
"grad_norm": 0.6336263418197632,
"learning_rate": 4.374581939799331e-06,
"loss": 6.384,
"step": 16400
},
{
"epoch": 36.373717771000834,
"eval_loss": 6.396052360534668,
"eval_runtime": 172.9338,
"eval_samples_per_second": 57.826,
"eval_steps_per_second": 7.228,
"step": 16400
},
{
"epoch": 36.59550873301913,
"grad_norm": 0.49517419934272766,
"learning_rate": 4.364548494983278e-06,
"loss": 6.3837,
"step": 16500
},
{
"epoch": 36.59550873301913,
"eval_loss": 6.394345760345459,
"eval_runtime": 175.3695,
"eval_samples_per_second": 57.022,
"eval_steps_per_second": 7.128,
"step": 16500
},
{
"epoch": 36.81729969503743,
"grad_norm": 0.6354840993881226,
"learning_rate": 4.354515050167224e-06,
"loss": 6.3819,
"step": 16600
},
{
"epoch": 36.81729969503743,
"eval_loss": 6.399397850036621,
"eval_runtime": 172.967,
"eval_samples_per_second": 57.814,
"eval_steps_per_second": 7.227,
"step": 16600
},
{
"epoch": 37.03909065705572,
"grad_norm": 0.6154801845550537,
"learning_rate": 4.34448160535117e-06,
"loss": 6.3846,
"step": 16700
},
{
"epoch": 37.03909065705572,
"eval_loss": 6.398616313934326,
"eval_runtime": 175.382,
"eval_samples_per_second": 57.018,
"eval_steps_per_second": 7.127,
"step": 16700
},
{
"epoch": 37.26088161907402,
"grad_norm": 0.5332671999931335,
"learning_rate": 4.334448160535117e-06,
"loss": 6.3833,
"step": 16800
},
{
"epoch": 37.26088161907402,
"eval_loss": 6.400417327880859,
"eval_runtime": 172.8252,
"eval_samples_per_second": 57.862,
"eval_steps_per_second": 7.233,
"step": 16800
},
{
"epoch": 37.482672581092324,
"grad_norm": 0.4707394242286682,
"learning_rate": 4.324414715719064e-06,
"loss": 6.382,
"step": 16900
},
{
"epoch": 37.482672581092324,
"eval_loss": 6.399077415466309,
"eval_runtime": 175.3262,
"eval_samples_per_second": 57.037,
"eval_steps_per_second": 7.13,
"step": 16900
},
{
"epoch": 37.70446354311062,
"grad_norm": 0.5503630042076111,
"learning_rate": 4.31438127090301e-06,
"loss": 6.3825,
"step": 17000
},
{
"epoch": 37.70446354311062,
"eval_loss": 6.3964338302612305,
"eval_runtime": 175.3567,
"eval_samples_per_second": 57.027,
"eval_steps_per_second": 7.128,
"step": 17000
},
{
"epoch": 37.92625450512892,
"grad_norm": 0.4225850999355316,
"learning_rate": 4.3043478260869565e-06,
"loss": 6.3808,
"step": 17100
},
{
"epoch": 37.92625450512892,
"eval_loss": 6.399682998657227,
"eval_runtime": 175.5337,
"eval_samples_per_second": 56.969,
"eval_steps_per_second": 7.121,
"step": 17100
},
{
"epoch": 38.14804546714721,
"grad_norm": 0.26002365350723267,
"learning_rate": 4.294314381270903e-06,
"loss": 6.3825,
"step": 17200
},
{
"epoch": 38.14804546714721,
"eval_loss": 6.394641399383545,
"eval_runtime": 175.4187,
"eval_samples_per_second": 57.006,
"eval_steps_per_second": 7.126,
"step": 17200
},
{
"epoch": 38.36983642916551,
"grad_norm": 0.5679543614387512,
"learning_rate": 4.284280936454849e-06,
"loss": 6.381,
"step": 17300
},
{
"epoch": 38.36983642916551,
"eval_loss": 6.39400053024292,
"eval_runtime": 175.3915,
"eval_samples_per_second": 57.015,
"eval_steps_per_second": 7.127,
"step": 17300
},
{
"epoch": 38.591627391183806,
"grad_norm": 0.6668972373008728,
"learning_rate": 4.274247491638796e-06,
"loss": 6.3833,
"step": 17400
},
{
"epoch": 38.591627391183806,
"eval_loss": 6.395496845245361,
"eval_runtime": 175.3632,
"eval_samples_per_second": 57.025,
"eval_steps_per_second": 7.128,
"step": 17400
},
{
"epoch": 38.81341835320211,
"grad_norm": 0.7112624049186707,
"learning_rate": 4.264214046822743e-06,
"loss": 6.3819,
"step": 17500
},
{
"epoch": 38.81341835320211,
"eval_loss": 6.394676685333252,
"eval_runtime": 174.8435,
"eval_samples_per_second": 57.194,
"eval_steps_per_second": 7.149,
"step": 17500
},
{
"epoch": 39.035209315220406,
"grad_norm": 0.550544261932373,
"learning_rate": 4.254180602006689e-06,
"loss": 6.3826,
"step": 17600
},
{
"epoch": 39.035209315220406,
"eval_loss": 6.396825313568115,
"eval_runtime": 175.8952,
"eval_samples_per_second": 56.852,
"eval_steps_per_second": 7.107,
"step": 17600
},
{
"epoch": 39.2570002772387,
"grad_norm": 0.43430355191230774,
"learning_rate": 4.244147157190635e-06,
"loss": 6.3829,
"step": 17700
},
{
"epoch": 39.2570002772387,
"eval_loss": 6.396999835968018,
"eval_runtime": 173.2928,
"eval_samples_per_second": 57.706,
"eval_steps_per_second": 7.213,
"step": 17700
},
{
"epoch": 39.478791239257,
"grad_norm": 0.4726496636867523,
"learning_rate": 4.234113712374582e-06,
"loss": 6.3832,
"step": 17800
},
{
"epoch": 39.478791239257,
"eval_loss": 6.394546031951904,
"eval_runtime": 175.1792,
"eval_samples_per_second": 57.084,
"eval_steps_per_second": 7.136,
"step": 17800
},
{
"epoch": 39.700582201275296,
"grad_norm": 0.6477558612823486,
"learning_rate": 4.224080267558528e-06,
"loss": 6.383,
"step": 17900
},
{
"epoch": 39.700582201275296,
"eval_loss": 6.39369010925293,
"eval_runtime": 175.8821,
"eval_samples_per_second": 56.856,
"eval_steps_per_second": 7.107,
"step": 17900
},
{
"epoch": 39.92237316329359,
"grad_norm": 0.3382057845592499,
"learning_rate": 4.214046822742475e-06,
"loss": 6.3794,
"step": 18000
},
{
"epoch": 39.92237316329359,
"eval_loss": 6.394671440124512,
"eval_runtime": 175.9089,
"eval_samples_per_second": 56.848,
"eval_steps_per_second": 7.106,
"step": 18000
},
{
"epoch": 40.144164125311896,
"grad_norm": 0.32499295473098755,
"learning_rate": 4.2040133779264216e-06,
"loss": 6.3836,
"step": 18100
},
{
"epoch": 40.144164125311896,
"eval_loss": 6.393697738647461,
"eval_runtime": 173.0953,
"eval_samples_per_second": 57.772,
"eval_steps_per_second": 7.221,
"step": 18100
},
{
"epoch": 40.36595508733019,
"grad_norm": 0.4412948489189148,
"learning_rate": 4.1939799331103675e-06,
"loss": 6.382,
"step": 18200
},
{
"epoch": 40.36595508733019,
"eval_loss": 6.395814895629883,
"eval_runtime": 175.6272,
"eval_samples_per_second": 56.939,
"eval_steps_per_second": 7.117,
"step": 18200
},
{
"epoch": 40.58774604934849,
"grad_norm": 0.46561938524246216,
"learning_rate": 4.183946488294314e-06,
"loss": 6.3809,
"step": 18300
},
{
"epoch": 40.58774604934849,
"eval_loss": 6.395906448364258,
"eval_runtime": 173.1113,
"eval_samples_per_second": 57.766,
"eval_steps_per_second": 7.221,
"step": 18300
},
{
"epoch": 40.809537011366785,
"grad_norm": 0.3944660425186157,
"learning_rate": 4.173913043478261e-06,
"loss": 6.3816,
"step": 18400
},
{
"epoch": 40.809537011366785,
"eval_loss": 6.395975589752197,
"eval_runtime": 175.6877,
"eval_samples_per_second": 56.919,
"eval_steps_per_second": 7.115,
"step": 18400
},
{
"epoch": 41.03132797338508,
"grad_norm": 0.6692656874656677,
"learning_rate": 4.163879598662208e-06,
"loss": 6.3812,
"step": 18500
},
{
"epoch": 41.03132797338508,
"eval_loss": 6.39307975769043,
"eval_runtime": 173.2571,
"eval_samples_per_second": 57.718,
"eval_steps_per_second": 7.215,
"step": 18500
},
{
"epoch": 41.253118935403386,
"grad_norm": 0.5447328090667725,
"learning_rate": 4.153846153846154e-06,
"loss": 6.382,
"step": 18600
},
{
"epoch": 41.253118935403386,
"eval_loss": 6.392385005950928,
"eval_runtime": 175.7445,
"eval_samples_per_second": 56.901,
"eval_steps_per_second": 7.113,
"step": 18600
},
{
"epoch": 41.47490989742168,
"grad_norm": 0.4197390079498291,
"learning_rate": 4.1438127090301005e-06,
"loss": 6.3809,
"step": 18700
},
{
"epoch": 41.47490989742168,
"eval_loss": 6.395226001739502,
"eval_runtime": 173.3622,
"eval_samples_per_second": 57.683,
"eval_steps_per_second": 7.21,
"step": 18700
},
{
"epoch": 41.69670085943998,
"grad_norm": 0.37331509590148926,
"learning_rate": 4.133779264214047e-06,
"loss": 6.3821,
"step": 18800
},
{
"epoch": 41.69670085943998,
"eval_loss": 6.397747039794922,
"eval_runtime": 175.5873,
"eval_samples_per_second": 56.952,
"eval_steps_per_second": 7.119,
"step": 18800
},
{
"epoch": 41.918491821458275,
"grad_norm": 0.439635306596756,
"learning_rate": 4.123745819397993e-06,
"loss": 6.3802,
"step": 18900
},
{
"epoch": 41.918491821458275,
"eval_loss": 6.393184185028076,
"eval_runtime": 175.6266,
"eval_samples_per_second": 56.939,
"eval_steps_per_second": 7.117,
"step": 18900
},
{
"epoch": 42.14028278347657,
"grad_norm": 0.4135972857475281,
"learning_rate": 4.11371237458194e-06,
"loss": 6.381,
"step": 19000
},
{
"epoch": 42.14028278347657,
"eval_loss": 6.396628379821777,
"eval_runtime": 175.68,
"eval_samples_per_second": 56.922,
"eval_steps_per_second": 7.115,
"step": 19000
},
{
"epoch": 42.36207374549487,
"grad_norm": 0.3350447118282318,
"learning_rate": 4.103678929765887e-06,
"loss": 6.382,
"step": 19100
},
{
"epoch": 42.36207374549487,
"eval_loss": 6.3959784507751465,
"eval_runtime": 173.1015,
"eval_samples_per_second": 57.77,
"eval_steps_per_second": 7.221,
"step": 19100
},
{
"epoch": 42.58386470751317,
"grad_norm": 0.40015509724617004,
"learning_rate": 4.0936454849498326e-06,
"loss": 6.3793,
"step": 19200
},
{
"epoch": 42.58386470751317,
"eval_loss": 6.392791271209717,
"eval_runtime": 175.6231,
"eval_samples_per_second": 56.94,
"eval_steps_per_second": 7.118,
"step": 19200
},
{
"epoch": 42.80565566953147,
"grad_norm": 0.42993155121803284,
"learning_rate": 4.083612040133779e-06,
"loss": 6.3817,
"step": 19300
},
{
"epoch": 42.80565566953147,
"eval_loss": 6.393764495849609,
"eval_runtime": 175.7583,
"eval_samples_per_second": 56.896,
"eval_steps_per_second": 7.112,
"step": 19300
},
{
"epoch": 43.027446631549765,
"grad_norm": 0.506564199924469,
"learning_rate": 4.073578595317726e-06,
"loss": 6.3805,
"step": 19400
},
{
"epoch": 43.027446631549765,
"eval_loss": 6.395299434661865,
"eval_runtime": 172.8685,
"eval_samples_per_second": 57.847,
"eval_steps_per_second": 7.231,
"step": 19400
},
{
"epoch": 43.24923759356806,
"grad_norm": 0.34368619322776794,
"learning_rate": 4.063545150501672e-06,
"loss": 6.3791,
"step": 19500
},
{
"epoch": 43.24923759356806,
"eval_loss": 6.390516757965088,
"eval_runtime": 175.4183,
"eval_samples_per_second": 57.007,
"eval_steps_per_second": 7.126,
"step": 19500
},
{
"epoch": 43.47102855558636,
"grad_norm": 0.5442679524421692,
"learning_rate": 4.053511705685619e-06,
"loss": 6.3805,
"step": 19600
},
{
"epoch": 43.47102855558636,
"eval_loss": 6.390527248382568,
"eval_runtime": 172.8815,
"eval_samples_per_second": 57.843,
"eval_steps_per_second": 7.23,
"step": 19600
},
{
"epoch": 43.692819517604654,
"grad_norm": 0.6060280799865723,
"learning_rate": 4.0434782608695655e-06,
"loss": 6.3792,
"step": 19700
},
{
"epoch": 43.692819517604654,
"eval_loss": 6.393373489379883,
"eval_runtime": 175.3372,
"eval_samples_per_second": 57.033,
"eval_steps_per_second": 7.129,
"step": 19700
},
{
"epoch": 43.91461047962296,
"grad_norm": 0.5891469120979309,
"learning_rate": 4.0334448160535115e-06,
"loss": 6.382,
"step": 19800
},
{
"epoch": 43.91461047962296,
"eval_loss": 6.395658493041992,
"eval_runtime": 173.3068,
"eval_samples_per_second": 57.701,
"eval_steps_per_second": 7.213,
"step": 19800
},
{
"epoch": 44.136401441641254,
"grad_norm": 0.3623868525028229,
"learning_rate": 4.023411371237458e-06,
"loss": 6.3794,
"step": 19900
},
{
"epoch": 44.136401441641254,
"eval_loss": 6.394290447235107,
"eval_runtime": 175.7778,
"eval_samples_per_second": 56.89,
"eval_steps_per_second": 7.111,
"step": 19900
},
{
"epoch": 44.35819240365955,
"grad_norm": 0.6197667121887207,
"learning_rate": 4.013377926421405e-06,
"loss": 6.3798,
"step": 20000
},
{
"epoch": 44.35819240365955,
"eval_loss": 6.393582820892334,
"eval_runtime": 175.4817,
"eval_samples_per_second": 56.986,
"eval_steps_per_second": 7.123,
"step": 20000
},
{
"epoch": 44.57998336567785,
"grad_norm": 0.5198450684547424,
"learning_rate": 4.003344481605351e-06,
"loss": 6.3792,
"step": 20100
},
{
"epoch": 44.57998336567785,
"eval_loss": 6.3943023681640625,
"eval_runtime": 175.4115,
"eval_samples_per_second": 57.009,
"eval_steps_per_second": 7.126,
"step": 20100
},
{
"epoch": 44.801774327696144,
"grad_norm": 0.4044889211654663,
"learning_rate": 3.993311036789298e-06,
"loss": 6.3798,
"step": 20200
},
{
"epoch": 44.801774327696144,
"eval_loss": 6.396990776062012,
"eval_runtime": 172.8449,
"eval_samples_per_second": 57.855,
"eval_steps_per_second": 7.232,
"step": 20200
},
{
"epoch": 45.02356528971445,
"grad_norm": 0.4656885862350464,
"learning_rate": 3.9832775919732444e-06,
"loss": 6.3807,
"step": 20300
},
{
"epoch": 45.02356528971445,
"eval_loss": 6.395167350769043,
"eval_runtime": 175.2548,
"eval_samples_per_second": 57.06,
"eval_steps_per_second": 7.132,
"step": 20300
},
{
"epoch": 45.245356251732744,
"grad_norm": 0.5882771611213684,
"learning_rate": 3.97324414715719e-06,
"loss": 6.3802,
"step": 20400
},
{
"epoch": 45.245356251732744,
"eval_loss": 6.392847537994385,
"eval_runtime": 175.4165,
"eval_samples_per_second": 57.007,
"eval_steps_per_second": 7.126,
"step": 20400
},
{
"epoch": 45.46714721375104,
"grad_norm": 0.31189513206481934,
"learning_rate": 3.963210702341137e-06,
"loss": 6.3799,
"step": 20500
},
{
"epoch": 45.46714721375104,
"eval_loss": 6.391454696655273,
"eval_runtime": 175.3822,
"eval_samples_per_second": 57.018,
"eval_steps_per_second": 7.127,
"step": 20500
},
{
"epoch": 45.68893817576934,
"grad_norm": 0.7188530564308167,
"learning_rate": 3.953177257525084e-06,
"loss": 6.3775,
"step": 20600
},
{
"epoch": 45.68893817576934,
"eval_loss": 6.391802787780762,
"eval_runtime": 175.4136,
"eval_samples_per_second": 57.008,
"eval_steps_per_second": 7.126,
"step": 20600
},
{
"epoch": 45.910729137787634,
"grad_norm": 0.4235071837902069,
"learning_rate": 3.943143812709031e-06,
"loss": 6.3791,
"step": 20700
},
{
"epoch": 45.910729137787634,
"eval_loss": 6.3952836990356445,
"eval_runtime": 175.3753,
"eval_samples_per_second": 57.021,
"eval_steps_per_second": 7.128,
"step": 20700
},
{
"epoch": 46.13252009980593,
"grad_norm": 0.4977140724658966,
"learning_rate": 3.9331103678929765e-06,
"loss": 6.3807,
"step": 20800
},
{
"epoch": 46.13252009980593,
"eval_loss": 6.397064208984375,
"eval_runtime": 175.8439,
"eval_samples_per_second": 56.869,
"eval_steps_per_second": 7.109,
"step": 20800
},
{
"epoch": 46.354311061824234,
"grad_norm": 0.5896762609481812,
"learning_rate": 3.923076923076923e-06,
"loss": 6.3801,
"step": 20900
},
{
"epoch": 46.354311061824234,
"eval_loss": 6.394172191619873,
"eval_runtime": 173.449,
"eval_samples_per_second": 57.654,
"eval_steps_per_second": 7.207,
"step": 20900
},
{
"epoch": 46.57610202384253,
"grad_norm": 0.47281450033187866,
"learning_rate": 3.91304347826087e-06,
"loss": 6.3787,
"step": 21000
},
{
"epoch": 46.57610202384253,
"eval_loss": 6.3905463218688965,
"eval_runtime": 175.9964,
"eval_samples_per_second": 56.819,
"eval_steps_per_second": 7.102,
"step": 21000
},
{
"epoch": 46.79789298586083,
"grad_norm": 0.42211413383483887,
"learning_rate": 3.903010033444816e-06,
"loss": 6.3798,
"step": 21100
},
{
"epoch": 46.79789298586083,
"eval_loss": 6.39119291305542,
"eval_runtime": 175.8005,
"eval_samples_per_second": 56.883,
"eval_steps_per_second": 7.11,
"step": 21100
},
{
"epoch": 47.01968394787912,
"grad_norm": 0.7232652306556702,
"learning_rate": 3.892976588628763e-06,
"loss": 6.3795,
"step": 21200
},
{
"epoch": 47.01968394787912,
"eval_loss": 6.39454984664917,
"eval_runtime": 174.7314,
"eval_samples_per_second": 57.231,
"eval_steps_per_second": 7.154,
"step": 21200
},
{
"epoch": 47.24147490989742,
"grad_norm": 0.4875265657901764,
"learning_rate": 3.8829431438127095e-06,
"loss": 6.3798,
"step": 21300
},
{
"epoch": 47.24147490989742,
"eval_loss": 6.391242027282715,
"eval_runtime": 173.1294,
"eval_samples_per_second": 57.76,
"eval_steps_per_second": 7.22,
"step": 21300
},
{
"epoch": 47.463265871915716,
"grad_norm": 0.689365804195404,
"learning_rate": 3.8729096989966554e-06,
"loss": 6.3797,
"step": 21400
},
{
"epoch": 47.463265871915716,
"eval_loss": 6.392244338989258,
"eval_runtime": 175.7048,
"eval_samples_per_second": 56.914,
"eval_steps_per_second": 7.114,
"step": 21400
},
{
"epoch": 47.68505683393402,
"grad_norm": 0.34326601028442383,
"learning_rate": 3.862876254180602e-06,
"loss": 6.3799,
"step": 21500
},
{
"epoch": 47.68505683393402,
"eval_loss": 6.390882968902588,
"eval_runtime": 173.1981,
"eval_samples_per_second": 57.737,
"eval_steps_per_second": 7.217,
"step": 21500
},
{
"epoch": 47.90684779595232,
"grad_norm": 0.5094731450080872,
"learning_rate": 3.852842809364549e-06,
"loss": 6.3789,
"step": 21600
},
{
"epoch": 47.90684779595232,
"eval_loss": 6.391824245452881,
"eval_runtime": 175.6758,
"eval_samples_per_second": 56.923,
"eval_steps_per_second": 7.115,
"step": 21600
},
{
"epoch": 48.12863875797061,
"grad_norm": 0.5096613764762878,
"learning_rate": 3.842809364548495e-06,
"loss": 6.3788,
"step": 21700
},
{
"epoch": 48.12863875797061,
"eval_loss": 6.3908467292785645,
"eval_runtime": 175.722,
"eval_samples_per_second": 56.908,
"eval_steps_per_second": 7.114,
"step": 21700
},
{
"epoch": 48.35042971998891,
"grad_norm": 0.49328041076660156,
"learning_rate": 3.832775919732442e-06,
"loss": 6.3801,
"step": 21800
},
{
"epoch": 48.35042971998891,
"eval_loss": 6.392337322235107,
"eval_runtime": 175.7017,
"eval_samples_per_second": 56.915,
"eval_steps_per_second": 7.114,
"step": 21800
},
{
"epoch": 48.572220682007206,
"grad_norm": 0.331511914730072,
"learning_rate": 3.822742474916388e-06,
"loss": 6.3787,
"step": 21900
},
{
"epoch": 48.572220682007206,
"eval_loss": 6.392426013946533,
"eval_runtime": 175.6914,
"eval_samples_per_second": 56.918,
"eval_steps_per_second": 7.115,
"step": 21900
},
{
"epoch": 48.7940116440255,
"grad_norm": 0.5596035718917847,
"learning_rate": 3.8127090301003347e-06,
"loss": 6.3783,
"step": 22000
},
{
"epoch": 48.7940116440255,
"eval_loss": 6.396266460418701,
"eval_runtime": 175.7217,
"eval_samples_per_second": 56.908,
"eval_steps_per_second": 7.114,
"step": 22000
},
{
"epoch": 49.015802606043806,
"grad_norm": 0.42308327555656433,
"learning_rate": 3.802675585284281e-06,
"loss": 6.3788,
"step": 22100
},
{
"epoch": 49.015802606043806,
"eval_loss": 6.392462730407715,
"eval_runtime": 175.6395,
"eval_samples_per_second": 56.935,
"eval_steps_per_second": 7.117,
"step": 22100
},
{
"epoch": 49.2375935680621,
"grad_norm": 0.47657862305641174,
"learning_rate": 3.792642140468228e-06,
"loss": 6.3768,
"step": 22200
},
{
"epoch": 49.2375935680621,
"eval_loss": 6.392263412475586,
"eval_runtime": 175.6228,
"eval_samples_per_second": 56.94,
"eval_steps_per_second": 7.118,
"step": 22200
},
{
"epoch": 49.4593845300804,
"grad_norm": 0.4417143166065216,
"learning_rate": 3.782608695652174e-06,
"loss": 6.3785,
"step": 22300
},
{
"epoch": 49.4593845300804,
"eval_loss": 6.39237642288208,
"eval_runtime": 175.5904,
"eval_samples_per_second": 56.951,
"eval_steps_per_second": 7.119,
"step": 22300
},
{
"epoch": 49.681175492098696,
"grad_norm": 0.3279063105583191,
"learning_rate": 3.7725752508361205e-06,
"loss": 6.3791,
"step": 22400
},
{
"epoch": 49.681175492098696,
"eval_loss": 6.3924407958984375,
"eval_runtime": 175.6501,
"eval_samples_per_second": 56.931,
"eval_steps_per_second": 7.116,
"step": 22400
},
{
"epoch": 49.90296645411699,
"grad_norm": 0.6854652166366577,
"learning_rate": 3.7625418060200673e-06,
"loss": 6.3785,
"step": 22500
},
{
"epoch": 49.90296645411699,
"eval_loss": 6.390333652496338,
"eval_runtime": 175.1173,
"eval_samples_per_second": 57.105,
"eval_steps_per_second": 7.138,
"step": 22500
},
{
"epoch": 50.124757416135296,
"grad_norm": 0.3522402048110962,
"learning_rate": 3.7525083612040136e-06,
"loss": 6.3776,
"step": 22600
},
{
"epoch": 50.124757416135296,
"eval_loss": 6.395279884338379,
"eval_runtime": 172.8769,
"eval_samples_per_second": 57.845,
"eval_steps_per_second": 7.231,
"step": 22600
},
{
"epoch": 50.34654837815359,
"grad_norm": 0.4847201704978943,
"learning_rate": 3.74247491638796e-06,
"loss": 6.3798,
"step": 22700
},
{
"epoch": 50.34654837815359,
"eval_loss": 6.385508060455322,
"eval_runtime": 175.3898,
"eval_samples_per_second": 57.016,
"eval_steps_per_second": 7.127,
"step": 22700
},
{
"epoch": 50.56833934017189,
"grad_norm": 0.6891096234321594,
"learning_rate": 3.7324414715719067e-06,
"loss": 6.379,
"step": 22800
},
{
"epoch": 50.56833934017189,
"eval_loss": 6.389738082885742,
"eval_runtime": 172.9656,
"eval_samples_per_second": 57.815,
"eval_steps_per_second": 7.227,
"step": 22800
},
{
"epoch": 50.790130302190185,
"grad_norm": 0.5377815365791321,
"learning_rate": 3.722408026755853e-06,
"loss": 6.3781,
"step": 22900
},
{
"epoch": 50.790130302190185,
"eval_loss": 6.393865585327148,
"eval_runtime": 175.4211,
"eval_samples_per_second": 57.006,
"eval_steps_per_second": 7.126,
"step": 22900
},
{
"epoch": 51.01192126420848,
"grad_norm": 0.33496779203414917,
"learning_rate": 3.7123745819398e-06,
"loss": 6.3774,
"step": 23000
},
{
"epoch": 51.01192126420848,
"eval_loss": 6.388363838195801,
"eval_runtime": 172.9308,
"eval_samples_per_second": 57.827,
"eval_steps_per_second": 7.228,
"step": 23000
},
{
"epoch": 51.23371222622678,
"grad_norm": 0.374717116355896,
"learning_rate": 3.702341137123746e-06,
"loss": 6.3782,
"step": 23100
},
{
"epoch": 51.23371222622678,
"eval_loss": 6.3933634757995605,
"eval_runtime": 175.8194,
"eval_samples_per_second": 56.877,
"eval_steps_per_second": 7.11,
"step": 23100
},
{
"epoch": 51.45550318824508,
"grad_norm": 0.5700441002845764,
"learning_rate": 3.6923076923076925e-06,
"loss": 6.3779,
"step": 23200
},
{
"epoch": 51.45550318824508,
"eval_loss": 6.391829490661621,
"eval_runtime": 173.0462,
"eval_samples_per_second": 57.788,
"eval_steps_per_second": 7.224,
"step": 23200
},
{
"epoch": 51.67729415026338,
"grad_norm": 0.5987123250961304,
"learning_rate": 3.6822742474916393e-06,
"loss": 6.3775,
"step": 23300
},
{
"epoch": 51.67729415026338,
"eval_loss": 6.391645908355713,
"eval_runtime": 175.546,
"eval_samples_per_second": 56.965,
"eval_steps_per_second": 7.121,
"step": 23300
},
{
"epoch": 51.899085112281675,
"grad_norm": 0.6282506585121155,
"learning_rate": 3.6722408026755856e-06,
"loss": 6.3785,
"step": 23400
},
{
"epoch": 51.899085112281675,
"eval_loss": 6.394507884979248,
"eval_runtime": 175.5236,
"eval_samples_per_second": 56.972,
"eval_steps_per_second": 7.122,
"step": 23400
},
{
"epoch": 52.12087607429997,
"grad_norm": 0.4422946572303772,
"learning_rate": 3.662207357859532e-06,
"loss": 6.378,
"step": 23500
},
{
"epoch": 52.12087607429997,
"eval_loss": 6.389113903045654,
"eval_runtime": 172.8391,
"eval_samples_per_second": 57.857,
"eval_steps_per_second": 7.232,
"step": 23500
},
{
"epoch": 52.34266703631827,
"grad_norm": 0.43772438168525696,
"learning_rate": 3.6521739130434787e-06,
"loss": 6.3769,
"step": 23600
},
{
"epoch": 52.34266703631827,
"eval_loss": 6.389682292938232,
"eval_runtime": 174.37,
"eval_samples_per_second": 57.349,
"eval_steps_per_second": 7.169,
"step": 23600
},
{
"epoch": 52.564457998336565,
"grad_norm": 0.4291711449623108,
"learning_rate": 3.642140468227425e-06,
"loss": 6.3787,
"step": 23700
},
{
"epoch": 52.564457998336565,
"eval_loss": 6.387042999267578,
"eval_runtime": 175.3622,
"eval_samples_per_second": 57.025,
"eval_steps_per_second": 7.128,
"step": 23700
},
{
"epoch": 52.78624896035487,
"grad_norm": 0.3986354172229767,
"learning_rate": 3.6321070234113714e-06,
"loss": 6.378,
"step": 23800
},
{
"epoch": 52.78624896035487,
"eval_loss": 6.394027233123779,
"eval_runtime": 175.4238,
"eval_samples_per_second": 57.005,
"eval_steps_per_second": 7.126,
"step": 23800
},
{
"epoch": 53.008039922373165,
"grad_norm": 0.4198819398880005,
"learning_rate": 3.622073578595318e-06,
"loss": 6.378,
"step": 23900
},
{
"epoch": 53.008039922373165,
"eval_loss": 6.391998291015625,
"eval_runtime": 175.3995,
"eval_samples_per_second": 57.013,
"eval_steps_per_second": 7.127,
"step": 23900
},
{
"epoch": 53.22983088439146,
"grad_norm": 0.42992842197418213,
"learning_rate": 3.6120401337792645e-06,
"loss": 6.378,
"step": 24000
},
{
"epoch": 53.22983088439146,
"eval_loss": 6.391213893890381,
"eval_runtime": 175.5148,
"eval_samples_per_second": 56.975,
"eval_steps_per_second": 7.122,
"step": 24000
},
{
"epoch": 53.45162184640976,
"grad_norm": 0.3845984637737274,
"learning_rate": 3.6020066889632112e-06,
"loss": 6.3794,
"step": 24100
},
{
"epoch": 53.45162184640976,
"eval_loss": 6.395719528198242,
"eval_runtime": 175.2358,
"eval_samples_per_second": 57.066,
"eval_steps_per_second": 7.133,
"step": 24100
},
{
"epoch": 53.673412808428054,
"grad_norm": 0.4092540144920349,
"learning_rate": 3.5919732441471576e-06,
"loss": 6.3764,
"step": 24200
},
{
"epoch": 53.673412808428054,
"eval_loss": 6.392786502838135,
"eval_runtime": 173.4491,
"eval_samples_per_second": 57.654,
"eval_steps_per_second": 7.207,
"step": 24200
},
{
"epoch": 53.89520377044636,
"grad_norm": 0.4434932470321655,
"learning_rate": 3.581939799331104e-06,
"loss": 6.3784,
"step": 24300
},
{
"epoch": 53.89520377044636,
"eval_loss": 6.392944812774658,
"eval_runtime": 173.4556,
"eval_samples_per_second": 57.652,
"eval_steps_per_second": 7.206,
"step": 24300
},
{
"epoch": 54.116994732464654,
"grad_norm": 0.3644530177116394,
"learning_rate": 3.5719063545150507e-06,
"loss": 6.3777,
"step": 24400
},
{
"epoch": 54.116994732464654,
"eval_loss": 6.389293193817139,
"eval_runtime": 175.8393,
"eval_samples_per_second": 56.87,
"eval_steps_per_second": 7.109,
"step": 24400
},
{
"epoch": 54.33878569448295,
"grad_norm": 0.42048630118370056,
"learning_rate": 3.561872909698997e-06,
"loss": 6.3779,
"step": 24500
},
{
"epoch": 54.33878569448295,
"eval_loss": 6.392094612121582,
"eval_runtime": 173.3329,
"eval_samples_per_second": 57.692,
"eval_steps_per_second": 7.212,
"step": 24500
},
{
"epoch": 54.56057665650125,
"grad_norm": 0.5288220047950745,
"learning_rate": 3.5518394648829434e-06,
"loss": 6.3768,
"step": 24600
},
{
"epoch": 54.56057665650125,
"eval_loss": 6.389921188354492,
"eval_runtime": 175.5087,
"eval_samples_per_second": 56.977,
"eval_steps_per_second": 7.122,
"step": 24600
},
{
"epoch": 54.782367618519544,
"grad_norm": 0.5413895845413208,
"learning_rate": 3.54180602006689e-06,
"loss": 6.3788,
"step": 24700
},
{
"epoch": 54.782367618519544,
"eval_loss": 6.389023303985596,
"eval_runtime": 172.9846,
"eval_samples_per_second": 57.809,
"eval_steps_per_second": 7.226,
"step": 24700
},
{
"epoch": 55.00415858053784,
"grad_norm": 0.35512205958366394,
"learning_rate": 3.5317725752508365e-06,
"loss": 6.3777,
"step": 24800
},
{
"epoch": 55.00415858053784,
"eval_loss": 6.390623569488525,
"eval_runtime": 175.3777,
"eval_samples_per_second": 57.02,
"eval_steps_per_second": 7.127,
"step": 24800
},
{
"epoch": 55.225949542556144,
"grad_norm": 0.46963444352149963,
"learning_rate": 3.521739130434783e-06,
"loss": 6.3759,
"step": 24900
},
{
"epoch": 55.225949542556144,
"eval_loss": 6.392442226409912,
"eval_runtime": 173.0136,
"eval_samples_per_second": 57.799,
"eval_steps_per_second": 7.225,
"step": 24900
},
{
"epoch": 55.44774050457444,
"grad_norm": 0.4473781883716583,
"learning_rate": 3.5117056856187296e-06,
"loss": 6.3766,
"step": 25000
},
{
"epoch": 55.44774050457444,
"eval_loss": 6.392148971557617,
"eval_runtime": 175.4775,
"eval_samples_per_second": 56.987,
"eval_steps_per_second": 7.123,
"step": 25000
},
{
"epoch": 55.66953146659274,
"grad_norm": 0.4387643337249756,
"learning_rate": 3.501672240802676e-06,
"loss": 6.3768,
"step": 25100
},
{
"epoch": 55.66953146659274,
"eval_loss": 6.391911506652832,
"eval_runtime": 175.6257,
"eval_samples_per_second": 56.939,
"eval_steps_per_second": 7.117,
"step": 25100
},
{
"epoch": 55.89132242861103,
"grad_norm": 0.5157041549682617,
"learning_rate": 3.491638795986622e-06,
"loss": 6.3784,
"step": 25200
},
{
"epoch": 55.89132242861103,
"eval_loss": 6.384146690368652,
"eval_runtime": 175.6148,
"eval_samples_per_second": 56.943,
"eval_steps_per_second": 7.118,
"step": 25200
},
{
"epoch": 56.11311339062933,
"grad_norm": 0.36674726009368896,
"learning_rate": 3.481605351170568e-06,
"loss": 6.3757,
"step": 25300
},
{
"epoch": 56.11311339062933,
"eval_loss": 6.3921380043029785,
"eval_runtime": 175.3664,
"eval_samples_per_second": 57.023,
"eval_steps_per_second": 7.128,
"step": 25300
},
{
"epoch": 56.33490435264763,
"grad_norm": 0.44830092787742615,
"learning_rate": 3.471571906354515e-06,
"loss": 6.3785,
"step": 25400
},
{
"epoch": 56.33490435264763,
"eval_loss": 6.387638092041016,
"eval_runtime": 175.4426,
"eval_samples_per_second": 56.999,
"eval_steps_per_second": 7.125,
"step": 25400
},
{
"epoch": 56.55669531466593,
"grad_norm": 0.4037076532840729,
"learning_rate": 3.4615384615384613e-06,
"loss": 6.3753,
"step": 25500
},
{
"epoch": 56.55669531466593,
"eval_loss": 6.390742778778076,
"eval_runtime": 175.869,
"eval_samples_per_second": 56.861,
"eval_steps_per_second": 7.108,
"step": 25500
},
{
"epoch": 56.77848627668423,
"grad_norm": 0.5410855412483215,
"learning_rate": 3.4515050167224076e-06,
"loss": 6.3773,
"step": 25600
},
{
"epoch": 56.77848627668423,
"eval_loss": 6.388538837432861,
"eval_runtime": 175.5689,
"eval_samples_per_second": 56.958,
"eval_steps_per_second": 7.12,
"step": 25600
},
{
"epoch": 57.00027723870252,
"grad_norm": 0.6200158596038818,
"learning_rate": 3.4414715719063544e-06,
"loss": 6.3762,
"step": 25700
},
{
"epoch": 57.00027723870252,
"eval_loss": 6.392038345336914,
"eval_runtime": 172.8867,
"eval_samples_per_second": 57.841,
"eval_steps_per_second": 7.23,
"step": 25700
},
{
"epoch": 57.22206820072082,
"grad_norm": 0.33977118134498596,
"learning_rate": 3.4314381270903007e-06,
"loss": 6.3782,
"step": 25800
},
{
"epoch": 57.22206820072082,
"eval_loss": 6.390758037567139,
"eval_runtime": 172.9474,
"eval_samples_per_second": 57.821,
"eval_steps_per_second": 7.228,
"step": 25800
},
{
"epoch": 57.443859162739116,
"grad_norm": 0.396681010723114,
"learning_rate": 3.4214046822742475e-06,
"loss": 6.3766,
"step": 25900
},
{
"epoch": 57.443859162739116,
"eval_loss": 6.391767501831055,
"eval_runtime": 175.4265,
"eval_samples_per_second": 57.004,
"eval_steps_per_second": 7.125,
"step": 25900
},
{
"epoch": 57.66565012475741,
"grad_norm": 0.3652241826057434,
"learning_rate": 3.411371237458194e-06,
"loss": 6.3766,
"step": 26000
},
{
"epoch": 57.66565012475741,
"eval_loss": 6.388927936553955,
"eval_runtime": 173.1869,
"eval_samples_per_second": 57.741,
"eval_steps_per_second": 7.218,
"step": 26000
},
{
"epoch": 57.887441086775716,
"grad_norm": 0.40237948298454285,
"learning_rate": 3.40133779264214e-06,
"loss": 6.3786,
"step": 26100
},
{
"epoch": 57.887441086775716,
"eval_loss": 6.385989665985107,
"eval_runtime": 175.7809,
"eval_samples_per_second": 56.889,
"eval_steps_per_second": 7.111,
"step": 26100
},
{
"epoch": 58.10923204879401,
"grad_norm": 0.47134748101234436,
"learning_rate": 3.391304347826087e-06,
"loss": 6.3766,
"step": 26200
},
{
"epoch": 58.10923204879401,
"eval_loss": 6.388063907623291,
"eval_runtime": 172.8868,
"eval_samples_per_second": 57.841,
"eval_steps_per_second": 7.23,
"step": 26200
},
{
"epoch": 58.33102301081231,
"grad_norm": 0.35729169845581055,
"learning_rate": 3.3812709030100333e-06,
"loss": 6.376,
"step": 26300
},
{
"epoch": 58.33102301081231,
"eval_loss": 6.38781213760376,
"eval_runtime": 175.295,
"eval_samples_per_second": 57.047,
"eval_steps_per_second": 7.131,
"step": 26300
},
{
"epoch": 58.552813972830606,
"grad_norm": 0.38715028762817383,
"learning_rate": 3.3712374581939796e-06,
"loss": 6.3765,
"step": 26400
},
{
"epoch": 58.552813972830606,
"eval_loss": 6.389337539672852,
"eval_runtime": 172.8668,
"eval_samples_per_second": 57.848,
"eval_steps_per_second": 7.231,
"step": 26400
},
{
"epoch": 58.7746049348489,
"grad_norm": 0.46873271465301514,
"learning_rate": 3.3612040133779264e-06,
"loss": 6.3768,
"step": 26500
},
{
"epoch": 58.7746049348489,
"eval_loss": 6.392114162445068,
"eval_runtime": 175.4104,
"eval_samples_per_second": 57.009,
"eval_steps_per_second": 7.126,
"step": 26500
},
{
"epoch": 58.996395896867206,
"grad_norm": 0.3447762131690979,
"learning_rate": 3.3511705685618727e-06,
"loss": 6.3759,
"step": 26600
},
{
"epoch": 58.996395896867206,
"eval_loss": 6.387296676635742,
"eval_runtime": 175.3375,
"eval_samples_per_second": 57.033,
"eval_steps_per_second": 7.129,
"step": 26600
},
{
"epoch": 59.2181868588855,
"grad_norm": 0.3914731442928314,
"learning_rate": 3.3411371237458195e-06,
"loss": 6.3771,
"step": 26700
},
{
"epoch": 59.2181868588855,
"eval_loss": 6.387917995452881,
"eval_runtime": 175.4868,
"eval_samples_per_second": 56.984,
"eval_steps_per_second": 7.123,
"step": 26700
},
{
"epoch": 59.4399778209038,
"grad_norm": 0.5208538174629211,
"learning_rate": 3.331103678929766e-06,
"loss": 6.3765,
"step": 26800
},
{
"epoch": 59.4399778209038,
"eval_loss": 6.389184474945068,
"eval_runtime": 174.2169,
"eval_samples_per_second": 57.4,
"eval_steps_per_second": 7.175,
"step": 26800
},
{
"epoch": 59.661768782922096,
"grad_norm": 0.3724886178970337,
"learning_rate": 3.321070234113712e-06,
"loss": 6.3757,
"step": 26900
},
{
"epoch": 59.661768782922096,
"eval_loss": 6.392241954803467,
"eval_runtime": 175.4491,
"eval_samples_per_second": 56.997,
"eval_steps_per_second": 7.125,
"step": 26900
},
{
"epoch": 59.88355974494039,
"grad_norm": 0.33004748821258545,
"learning_rate": 3.311036789297659e-06,
"loss": 6.3759,
"step": 27000
},
{
"epoch": 59.88355974494039,
"eval_loss": 6.389077186584473,
"eval_runtime": 172.9579,
"eval_samples_per_second": 57.818,
"eval_steps_per_second": 7.227,
"step": 27000
},
{
"epoch": 60.10535070695869,
"grad_norm": 0.3995635211467743,
"learning_rate": 3.3010033444816052e-06,
"loss": 6.3774,
"step": 27100
},
{
"epoch": 60.10535070695869,
"eval_loss": 6.389009952545166,
"eval_runtime": 175.8118,
"eval_samples_per_second": 56.879,
"eval_steps_per_second": 7.11,
"step": 27100
},
{
"epoch": 60.32714166897699,
"grad_norm": 0.49882611632347107,
"learning_rate": 3.2909698996655516e-06,
"loss": 6.3762,
"step": 27200
},
{
"epoch": 60.32714166897699,
"eval_loss": 6.3899922370910645,
"eval_runtime": 175.7786,
"eval_samples_per_second": 56.89,
"eval_steps_per_second": 7.111,
"step": 27200
},
{
"epoch": 60.54893263099529,
"grad_norm": 0.46321776509284973,
"learning_rate": 3.2809364548494983e-06,
"loss": 6.3758,
"step": 27300
},
{
"epoch": 60.54893263099529,
"eval_loss": 6.389715671539307,
"eval_runtime": 175.8928,
"eval_samples_per_second": 56.853,
"eval_steps_per_second": 7.107,
"step": 27300
},
{
"epoch": 60.770723593013585,
"grad_norm": 0.4512879252433777,
"learning_rate": 3.2709030100334447e-06,
"loss": 6.3764,
"step": 27400
},
{
"epoch": 60.770723593013585,
"eval_loss": 6.388641357421875,
"eval_runtime": 175.8755,
"eval_samples_per_second": 56.858,
"eval_steps_per_second": 7.107,
"step": 27400
},
{
"epoch": 60.99251455503188,
"grad_norm": 0.5370669364929199,
"learning_rate": 3.260869565217391e-06,
"loss": 6.3764,
"step": 27500
},
{
"epoch": 60.99251455503188,
"eval_loss": 6.391347885131836,
"eval_runtime": 173.6027,
"eval_samples_per_second": 57.603,
"eval_steps_per_second": 7.2,
"step": 27500
},
{
"epoch": 61.21430551705018,
"grad_norm": 0.4362497627735138,
"learning_rate": 3.2508361204013378e-06,
"loss": 6.3747,
"step": 27600
},
{
"epoch": 61.21430551705018,
"eval_loss": 6.390707969665527,
"eval_runtime": 175.8739,
"eval_samples_per_second": 56.859,
"eval_steps_per_second": 7.107,
"step": 27600
},
{
"epoch": 61.436096479068475,
"grad_norm": 0.36759933829307556,
"learning_rate": 3.240802675585284e-06,
"loss": 6.3768,
"step": 27700
},
{
"epoch": 61.436096479068475,
"eval_loss": 6.390637397766113,
"eval_runtime": 173.3683,
"eval_samples_per_second": 57.681,
"eval_steps_per_second": 7.21,
"step": 27700
},
{
"epoch": 61.65788744108678,
"grad_norm": 0.4922894537448883,
"learning_rate": 3.230769230769231e-06,
"loss": 6.3758,
"step": 27800
},
{
"epoch": 61.65788744108678,
"eval_loss": 6.386129379272461,
"eval_runtime": 175.8295,
"eval_samples_per_second": 56.873,
"eval_steps_per_second": 7.109,
"step": 27800
},
{
"epoch": 61.879678403105075,
"grad_norm": 0.5007067918777466,
"learning_rate": 3.2207357859531772e-06,
"loss": 6.3755,
"step": 27900
},
{
"epoch": 61.879678403105075,
"eval_loss": 6.389693737030029,
"eval_runtime": 173.4229,
"eval_samples_per_second": 57.663,
"eval_steps_per_second": 7.208,
"step": 27900
},
{
"epoch": 62.10146936512337,
"grad_norm": 0.5208317041397095,
"learning_rate": 3.2107023411371236e-06,
"loss": 6.3766,
"step": 28000
},
{
"epoch": 62.10146936512337,
"eval_loss": 6.387614727020264,
"eval_runtime": 175.7473,
"eval_samples_per_second": 56.9,
"eval_steps_per_second": 7.112,
"step": 28000
},
{
"epoch": 62.32326032714167,
"grad_norm": 0.5632686614990234,
"learning_rate": 3.2006688963210703e-06,
"loss": 6.3759,
"step": 28100
},
{
"epoch": 62.32326032714167,
"eval_loss": 6.392298221588135,
"eval_runtime": 173.3859,
"eval_samples_per_second": 57.675,
"eval_steps_per_second": 7.209,
"step": 28100
},
{
"epoch": 62.545051289159964,
"grad_norm": 0.44811296463012695,
"learning_rate": 3.1906354515050167e-06,
"loss": 6.376,
"step": 28200
},
{
"epoch": 62.545051289159964,
"eval_loss": 6.388302326202393,
"eval_runtime": 175.8812,
"eval_samples_per_second": 56.857,
"eval_steps_per_second": 7.107,
"step": 28200
},
{
"epoch": 62.76684225117826,
"grad_norm": 0.434894323348999,
"learning_rate": 3.180602006688963e-06,
"loss": 6.3754,
"step": 28300
},
{
"epoch": 62.76684225117826,
"eval_loss": 6.388329982757568,
"eval_runtime": 173.1819,
"eval_samples_per_second": 57.743,
"eval_steps_per_second": 7.218,
"step": 28300
},
{
"epoch": 62.988633213196564,
"grad_norm": 0.4996633231639862,
"learning_rate": 3.1705685618729098e-06,
"loss": 6.3753,
"step": 28400
},
{
"epoch": 62.988633213196564,
"eval_loss": 6.386618614196777,
"eval_runtime": 175.5366,
"eval_samples_per_second": 56.968,
"eval_steps_per_second": 7.121,
"step": 28400
},
{
"epoch": 63.21042417521486,
"grad_norm": 0.4766680598258972,
"learning_rate": 3.160535117056856e-06,
"loss": 6.3757,
"step": 28500
},
{
"epoch": 63.21042417521486,
"eval_loss": 6.388480186462402,
"eval_runtime": 175.3311,
"eval_samples_per_second": 57.035,
"eval_steps_per_second": 7.129,
"step": 28500
},
{
"epoch": 63.43221513723316,
"grad_norm": 0.28831642866134644,
"learning_rate": 3.1505016722408024e-06,
"loss": 6.3764,
"step": 28600
},
{
"epoch": 63.43221513723316,
"eval_loss": 6.3880767822265625,
"eval_runtime": 175.3784,
"eval_samples_per_second": 57.02,
"eval_steps_per_second": 7.127,
"step": 28600
},
{
"epoch": 63.654006099251454,
"grad_norm": 0.2838084399700165,
"learning_rate": 3.140468227424749e-06,
"loss": 6.3755,
"step": 28700
},
{
"epoch": 63.654006099251454,
"eval_loss": 6.386078357696533,
"eval_runtime": 172.9388,
"eval_samples_per_second": 57.824,
"eval_steps_per_second": 7.228,
"step": 28700
},
{
"epoch": 63.87579706126975,
"grad_norm": 0.47868525981903076,
"learning_rate": 3.1304347826086955e-06,
"loss": 6.377,
"step": 28800
},
{
"epoch": 63.87579706126975,
"eval_loss": 6.387932777404785,
"eval_runtime": 175.4569,
"eval_samples_per_second": 56.994,
"eval_steps_per_second": 7.124,
"step": 28800
},
{
"epoch": 64.09758802328805,
"grad_norm": 0.5446937680244446,
"learning_rate": 3.1204013377926423e-06,
"loss": 6.3753,
"step": 28900
},
{
"epoch": 64.09758802328805,
"eval_loss": 6.388584136962891,
"eval_runtime": 172.8884,
"eval_samples_per_second": 57.841,
"eval_steps_per_second": 7.23,
"step": 28900
},
{
"epoch": 64.31937898530634,
"grad_norm": 0.41702982783317566,
"learning_rate": 3.1103678929765886e-06,
"loss": 6.3761,
"step": 29000
},
{
"epoch": 64.31937898530634,
"eval_loss": 6.3896894454956055,
"eval_runtime": 172.9657,
"eval_samples_per_second": 57.815,
"eval_steps_per_second": 7.227,
"step": 29000
},
{
"epoch": 64.54116994732465,
"grad_norm": 0.39311668276786804,
"learning_rate": 3.100334448160535e-06,
"loss": 6.3753,
"step": 29100
},
{
"epoch": 64.54116994732465,
"eval_loss": 6.3889970779418945,
"eval_runtime": 175.6814,
"eval_samples_per_second": 56.921,
"eval_steps_per_second": 7.115,
"step": 29100
},
{
"epoch": 64.76296090934295,
"grad_norm": 0.31582164764404297,
"learning_rate": 3.0903010033444818e-06,
"loss": 6.3763,
"step": 29200
},
{
"epoch": 64.76296090934295,
"eval_loss": 6.388535976409912,
"eval_runtime": 173.1769,
"eval_samples_per_second": 57.744,
"eval_steps_per_second": 7.218,
"step": 29200
},
{
"epoch": 64.98475187136124,
"grad_norm": 0.4400019347667694,
"learning_rate": 3.080267558528428e-06,
"loss": 6.3752,
"step": 29300
},
{
"epoch": 64.98475187136124,
"eval_loss": 6.38809061050415,
"eval_runtime": 175.7068,
"eval_samples_per_second": 56.913,
"eval_steps_per_second": 7.114,
"step": 29300
},
{
"epoch": 65.20654283337954,
"grad_norm": 0.3871637284755707,
"learning_rate": 3.0702341137123744e-06,
"loss": 6.3761,
"step": 29400
},
{
"epoch": 65.20654283337954,
"eval_loss": 6.3887200355529785,
"eval_runtime": 175.6633,
"eval_samples_per_second": 56.927,
"eval_steps_per_second": 7.116,
"step": 29400
},
{
"epoch": 65.42833379539783,
"grad_norm": 0.3527097702026367,
"learning_rate": 3.060200668896321e-06,
"loss": 6.375,
"step": 29500
},
{
"epoch": 65.42833379539783,
"eval_loss": 6.385637283325195,
"eval_runtime": 175.6827,
"eval_samples_per_second": 56.921,
"eval_steps_per_second": 7.115,
"step": 29500
},
{
"epoch": 65.65012475741614,
"grad_norm": 0.3956551253795624,
"learning_rate": 3.0501672240802675e-06,
"loss": 6.3763,
"step": 29600
},
{
"epoch": 65.65012475741614,
"eval_loss": 6.388696670532227,
"eval_runtime": 175.6777,
"eval_samples_per_second": 56.922,
"eval_steps_per_second": 7.115,
"step": 29600
},
{
"epoch": 65.87191571943443,
"grad_norm": 0.317006379365921,
"learning_rate": 3.0401337792642143e-06,
"loss": 6.3747,
"step": 29700
},
{
"epoch": 65.87191571943443,
"eval_loss": 6.386444568634033,
"eval_runtime": 175.0294,
"eval_samples_per_second": 57.133,
"eval_steps_per_second": 7.142,
"step": 29700
},
{
"epoch": 66.09370668145273,
"grad_norm": 0.29853495955467224,
"learning_rate": 3.0301003344481606e-06,
"loss": 6.3742,
"step": 29800
},
{
"epoch": 66.09370668145273,
"eval_loss": 6.38703727722168,
"eval_runtime": 173.1862,
"eval_samples_per_second": 57.741,
"eval_steps_per_second": 7.218,
"step": 29800
},
{
"epoch": 66.31549764347103,
"grad_norm": 0.3481820225715637,
"learning_rate": 3.020066889632107e-06,
"loss": 6.3756,
"step": 29900
},
{
"epoch": 66.31549764347103,
"eval_loss": 6.385500907897949,
"eval_runtime": 175.6985,
"eval_samples_per_second": 56.916,
"eval_steps_per_second": 7.114,
"step": 29900
},
{
"epoch": 66.53728860548932,
"grad_norm": 0.3467808961868286,
"learning_rate": 3.0100334448160537e-06,
"loss": 6.3755,
"step": 30000
},
{
"epoch": 66.53728860548932,
"eval_loss": 6.389315605163574,
"eval_runtime": 173.203,
"eval_samples_per_second": 57.736,
"eval_steps_per_second": 7.217,
"step": 30000
},
{
"epoch": 66.75907956750763,
"grad_norm": 0.3288291096687317,
"learning_rate": 3e-06,
"loss": 6.3762,
"step": 30100
},
{
"epoch": 66.75907956750763,
"eval_loss": 6.389954090118408,
"eval_runtime": 175.5948,
"eval_samples_per_second": 56.949,
"eval_steps_per_second": 7.119,
"step": 30100
},
{
"epoch": 66.98087052952592,
"grad_norm": 0.3450663387775421,
"learning_rate": 2.9899665551839464e-06,
"loss": 6.3749,
"step": 30200
},
{
"epoch": 66.98087052952592,
"eval_loss": 6.388577938079834,
"eval_runtime": 173.1084,
"eval_samples_per_second": 57.767,
"eval_steps_per_second": 7.221,
"step": 30200
},
{
"epoch": 67.20266149154422,
"grad_norm": 0.4391154646873474,
"learning_rate": 2.979933110367893e-06,
"loss": 6.3757,
"step": 30300
},
{
"epoch": 67.20266149154422,
"eval_loss": 6.3895344734191895,
"eval_runtime": 175.4784,
"eval_samples_per_second": 56.987,
"eval_steps_per_second": 7.123,
"step": 30300
},
{
"epoch": 67.42445245356252,
"grad_norm": 0.4594007730484009,
"learning_rate": 2.9698996655518395e-06,
"loss": 6.3742,
"step": 30400
},
{
"epoch": 67.42445245356252,
"eval_loss": 6.387800216674805,
"eval_runtime": 175.4386,
"eval_samples_per_second": 57.0,
"eval_steps_per_second": 7.125,
"step": 30400
},
{
"epoch": 67.64624341558081,
"grad_norm": 0.2892398238182068,
"learning_rate": 2.959866220735786e-06,
"loss": 6.3758,
"step": 30500
},
{
"epoch": 67.64624341558081,
"eval_loss": 6.3860883712768555,
"eval_runtime": 175.4706,
"eval_samples_per_second": 56.99,
"eval_steps_per_second": 7.124,
"step": 30500
},
{
"epoch": 67.86803437759912,
"grad_norm": 0.5031465888023376,
"learning_rate": 2.9498327759197326e-06,
"loss": 6.3738,
"step": 30600
},
{
"epoch": 67.86803437759912,
"eval_loss": 6.38906192779541,
"eval_runtime": 175.4554,
"eval_samples_per_second": 56.995,
"eval_steps_per_second": 7.124,
"step": 30600
},
{
"epoch": 68.0898253396174,
"grad_norm": 0.2999316453933716,
"learning_rate": 2.939799331103679e-06,
"loss": 6.3732,
"step": 30700
},
{
"epoch": 68.0898253396174,
"eval_loss": 6.387207984924316,
"eval_runtime": 172.9284,
"eval_samples_per_second": 57.827,
"eval_steps_per_second": 7.228,
"step": 30700
},
{
"epoch": 68.31161630163571,
"grad_norm": 0.3920566737651825,
"learning_rate": 2.9297658862876257e-06,
"loss": 6.3746,
"step": 30800
},
{
"epoch": 68.31161630163571,
"eval_loss": 6.388418197631836,
"eval_runtime": 175.4686,
"eval_samples_per_second": 56.99,
"eval_steps_per_second": 7.124,
"step": 30800
},
{
"epoch": 68.53340726365401,
"grad_norm": 0.3810490369796753,
"learning_rate": 2.919732441471572e-06,
"loss": 6.3736,
"step": 30900
},
{
"epoch": 68.53340726365401,
"eval_loss": 6.382778167724609,
"eval_runtime": 172.9448,
"eval_samples_per_second": 57.822,
"eval_steps_per_second": 7.228,
"step": 30900
},
{
"epoch": 68.7551982256723,
"grad_norm": 0.282163143157959,
"learning_rate": 2.9096989966555184e-06,
"loss": 6.3764,
"step": 31000
},
{
"epoch": 68.7551982256723,
"eval_loss": 6.3898420333862305,
"eval_runtime": 175.8822,
"eval_samples_per_second": 56.856,
"eval_steps_per_second": 7.107,
"step": 31000
},
{
"epoch": 68.9769891876906,
"grad_norm": 0.5345416069030762,
"learning_rate": 2.899665551839465e-06,
"loss": 6.3744,
"step": 31100
},
{
"epoch": 68.9769891876906,
"eval_loss": 6.389834880828857,
"eval_runtime": 173.048,
"eval_samples_per_second": 57.787,
"eval_steps_per_second": 7.223,
"step": 31100
},
{
"epoch": 69.1987801497089,
"grad_norm": 0.2955686151981354,
"learning_rate": 2.8896321070234115e-06,
"loss": 6.3752,
"step": 31200
},
{
"epoch": 69.1987801497089,
"eval_loss": 6.385989189147949,
"eval_runtime": 175.4356,
"eval_samples_per_second": 57.001,
"eval_steps_per_second": 7.125,
"step": 31200
},
{
"epoch": 69.4205711117272,
"grad_norm": 0.2998807430267334,
"learning_rate": 2.879598662207358e-06,
"loss": 6.3744,
"step": 31300
},
{
"epoch": 69.4205711117272,
"eval_loss": 6.3874688148498535,
"eval_runtime": 175.8432,
"eval_samples_per_second": 56.869,
"eval_steps_per_second": 7.109,
"step": 31300
},
{
"epoch": 69.64236207374549,
"grad_norm": 0.5946409702301025,
"learning_rate": 2.8695652173913046e-06,
"loss": 6.3742,
"step": 31400
},
{
"epoch": 69.64236207374549,
"eval_loss": 6.386292934417725,
"eval_runtime": 175.7657,
"eval_samples_per_second": 56.894,
"eval_steps_per_second": 7.112,
"step": 31400
},
{
"epoch": 69.86415303576379,
"grad_norm": 0.4089396595954895,
"learning_rate": 2.859531772575251e-06,
"loss": 6.3741,
"step": 31500
},
{
"epoch": 69.86415303576379,
"eval_loss": 6.386563301086426,
"eval_runtime": 175.832,
"eval_samples_per_second": 56.872,
"eval_steps_per_second": 7.109,
"step": 31500
},
{
"epoch": 70.0859439977821,
"grad_norm": 0.4220736622810364,
"learning_rate": 2.8494983277591977e-06,
"loss": 6.3761,
"step": 31600
},
{
"epoch": 70.0859439977821,
"eval_loss": 6.386425495147705,
"eval_runtime": 175.4574,
"eval_samples_per_second": 56.994,
"eval_steps_per_second": 7.124,
"step": 31600
},
{
"epoch": 70.30773495980038,
"grad_norm": 0.5009733438491821,
"learning_rate": 2.839464882943144e-06,
"loss": 6.3746,
"step": 31700
},
{
"epoch": 70.30773495980038,
"eval_loss": 6.386416435241699,
"eval_runtime": 175.5124,
"eval_samples_per_second": 56.976,
"eval_steps_per_second": 7.122,
"step": 31700
},
{
"epoch": 70.52952592181869,
"grad_norm": 0.41243863105773926,
"learning_rate": 2.8294314381270904e-06,
"loss": 6.3738,
"step": 31800
},
{
"epoch": 70.52952592181869,
"eval_loss": 6.388505935668945,
"eval_runtime": 175.5511,
"eval_samples_per_second": 56.963,
"eval_steps_per_second": 7.12,
"step": 31800
},
{
"epoch": 70.75131688383698,
"grad_norm": 0.3510850667953491,
"learning_rate": 2.819397993311037e-06,
"loss": 6.3754,
"step": 31900
},
{
"epoch": 70.75131688383698,
"eval_loss": 6.388024806976318,
"eval_runtime": 175.6891,
"eval_samples_per_second": 56.919,
"eval_steps_per_second": 7.115,
"step": 31900
},
{
"epoch": 70.97310784585528,
"grad_norm": 0.2912569046020508,
"learning_rate": 2.8093645484949835e-06,
"loss": 6.374,
"step": 32000
},
{
"epoch": 70.97310784585528,
"eval_loss": 6.385600566864014,
"eval_runtime": 175.9407,
"eval_samples_per_second": 56.837,
"eval_steps_per_second": 7.105,
"step": 32000
},
{
"epoch": 71.19489880787359,
"grad_norm": 0.3566642105579376,
"learning_rate": 2.79933110367893e-06,
"loss": 6.3728,
"step": 32100
},
{
"epoch": 71.19489880787359,
"eval_loss": 6.384610652923584,
"eval_runtime": 175.7319,
"eval_samples_per_second": 56.905,
"eval_steps_per_second": 7.113,
"step": 32100
},
{
"epoch": 71.41668976989187,
"grad_norm": 0.36077818274497986,
"learning_rate": 2.7892976588628766e-06,
"loss": 6.3742,
"step": 32200
},
{
"epoch": 71.41668976989187,
"eval_loss": 6.389194488525391,
"eval_runtime": 173.108,
"eval_samples_per_second": 57.767,
"eval_steps_per_second": 7.221,
"step": 32200
},
{
"epoch": 71.63848073191018,
"grad_norm": 0.4366162121295929,
"learning_rate": 2.779264214046823e-06,
"loss": 6.373,
"step": 32300
},
{
"epoch": 71.63848073191018,
"eval_loss": 6.388595104217529,
"eval_runtime": 175.5624,
"eval_samples_per_second": 56.96,
"eval_steps_per_second": 7.12,
"step": 32300
},
{
"epoch": 71.86027169392847,
"grad_norm": 0.3485216498374939,
"learning_rate": 2.7692307692307693e-06,
"loss": 6.3744,
"step": 32400
},
{
"epoch": 71.86027169392847,
"eval_loss": 6.38759708404541,
"eval_runtime": 173.3825,
"eval_samples_per_second": 57.676,
"eval_steps_per_second": 7.209,
"step": 32400
},
{
"epoch": 72.08206265594677,
"grad_norm": 0.41392314434051514,
"learning_rate": 2.759197324414716e-06,
"loss": 6.3733,
"step": 32500
},
{
"epoch": 72.08206265594677,
"eval_loss": 6.388287544250488,
"eval_runtime": 175.8186,
"eval_samples_per_second": 56.877,
"eval_steps_per_second": 7.11,
"step": 32500
},
{
"epoch": 72.30385361796507,
"grad_norm": 0.38669446110725403,
"learning_rate": 2.749163879598662e-06,
"loss": 6.3736,
"step": 32600
},
{
"epoch": 72.30385361796507,
"eval_loss": 6.387938499450684,
"eval_runtime": 167.9516,
"eval_samples_per_second": 59.541,
"eval_steps_per_second": 7.443,
"step": 32600
},
{
"epoch": 72.52564457998336,
"grad_norm": 0.42049235105514526,
"learning_rate": 2.7391304347826087e-06,
"loss": 6.3744,
"step": 32700
},
{
"epoch": 72.52564457998336,
"eval_loss": 6.387884140014648,
"eval_runtime": 175.7946,
"eval_samples_per_second": 56.885,
"eval_steps_per_second": 7.111,
"step": 32700
},
{
"epoch": 72.74743554200167,
"grad_norm": 0.45259612798690796,
"learning_rate": 2.729096989966555e-06,
"loss": 6.3733,
"step": 32800
},
{
"epoch": 72.74743554200167,
"eval_loss": 6.383664608001709,
"eval_runtime": 175.4633,
"eval_samples_per_second": 56.992,
"eval_steps_per_second": 7.124,
"step": 32800
},
{
"epoch": 72.96922650401996,
"grad_norm": 0.35638928413391113,
"learning_rate": 2.7190635451505014e-06,
"loss": 6.3752,
"step": 32900
},
{
"epoch": 72.96922650401996,
"eval_loss": 6.385019302368164,
"eval_runtime": 175.4207,
"eval_samples_per_second": 57.006,
"eval_steps_per_second": 7.126,
"step": 32900
},
{
"epoch": 73.19101746603826,
"grad_norm": 0.4410247206687927,
"learning_rate": 2.709030100334448e-06,
"loss": 6.3739,
"step": 33000
},
{
"epoch": 73.19101746603826,
"eval_loss": 6.385441303253174,
"eval_runtime": 175.4138,
"eval_samples_per_second": 57.008,
"eval_steps_per_second": 7.126,
"step": 33000
},
{
"epoch": 73.41280842805655,
"grad_norm": 0.2410985231399536,
"learning_rate": 2.6989966555183945e-06,
"loss": 6.3728,
"step": 33100
},
{
"epoch": 73.41280842805655,
"eval_loss": 6.38595724105835,
"eval_runtime": 175.8764,
"eval_samples_per_second": 56.858,
"eval_steps_per_second": 7.107,
"step": 33100
},
{
"epoch": 73.63459939007485,
"grad_norm": 0.43327927589416504,
"learning_rate": 2.6889632107023413e-06,
"loss": 6.3742,
"step": 33200
},
{
"epoch": 73.63459939007485,
"eval_loss": 6.387829780578613,
"eval_runtime": 175.8542,
"eval_samples_per_second": 56.865,
"eval_steps_per_second": 7.108,
"step": 33200
},
{
"epoch": 73.85639035209316,
"grad_norm": 0.2946775555610657,
"learning_rate": 2.6789297658862876e-06,
"loss": 6.3751,
"step": 33300
},
{
"epoch": 73.85639035209316,
"eval_loss": 6.385344505310059,
"eval_runtime": 173.3421,
"eval_samples_per_second": 57.689,
"eval_steps_per_second": 7.211,
"step": 33300
},
{
"epoch": 74.07818131411145,
"grad_norm": 0.33265405893325806,
"learning_rate": 2.668896321070234e-06,
"loss": 6.3737,
"step": 33400
},
{
"epoch": 74.07818131411145,
"eval_loss": 6.38824987411499,
"eval_runtime": 173.3017,
"eval_samples_per_second": 57.703,
"eval_steps_per_second": 7.213,
"step": 33400
},
{
"epoch": 74.29997227612975,
"grad_norm": 0.40044334530830383,
"learning_rate": 2.6588628762541807e-06,
"loss": 6.3752,
"step": 33500
},
{
"epoch": 74.29997227612975,
"eval_loss": 6.385106086730957,
"eval_runtime": 175.777,
"eval_samples_per_second": 56.89,
"eval_steps_per_second": 7.111,
"step": 33500
},
{
"epoch": 74.52176323814804,
"grad_norm": 0.3776157796382904,
"learning_rate": 2.648829431438127e-06,
"loss": 6.3739,
"step": 33600
},
{
"epoch": 74.52176323814804,
"eval_loss": 6.387485980987549,
"eval_runtime": 174.2601,
"eval_samples_per_second": 57.385,
"eval_steps_per_second": 7.173,
"step": 33600
},
{
"epoch": 74.74355420016634,
"grad_norm": 0.33734750747680664,
"learning_rate": 2.6387959866220734e-06,
"loss": 6.3739,
"step": 33700
},
{
"epoch": 74.74355420016634,
"eval_loss": 6.383073806762695,
"eval_runtime": 175.8418,
"eval_samples_per_second": 56.869,
"eval_steps_per_second": 7.109,
"step": 33700
},
{
"epoch": 74.96534516218465,
"grad_norm": 0.2771698534488678,
"learning_rate": 2.62876254180602e-06,
"loss": 6.3729,
"step": 33800
},
{
"epoch": 74.96534516218465,
"eval_loss": 6.388527870178223,
"eval_runtime": 174.0412,
"eval_samples_per_second": 57.458,
"eval_steps_per_second": 7.182,
"step": 33800
},
{
"epoch": 75.18713612420294,
"grad_norm": 0.3911442458629608,
"learning_rate": 2.6187290969899665e-06,
"loss": 6.374,
"step": 33900
},
{
"epoch": 75.18713612420294,
"eval_loss": 6.386963367462158,
"eval_runtime": 172.9704,
"eval_samples_per_second": 57.813,
"eval_steps_per_second": 7.227,
"step": 33900
},
{
"epoch": 75.40892708622124,
"grad_norm": 0.3304766118526459,
"learning_rate": 2.6086956521739132e-06,
"loss": 6.3746,
"step": 34000
},
{
"epoch": 75.40892708622124,
"eval_loss": 6.386199951171875,
"eval_runtime": 175.3788,
"eval_samples_per_second": 57.019,
"eval_steps_per_second": 7.127,
"step": 34000
},
{
"epoch": 75.63071804823953,
"grad_norm": 0.4422440230846405,
"learning_rate": 2.5986622073578596e-06,
"loss": 6.3737,
"step": 34100
},
{
"epoch": 75.63071804823953,
"eval_loss": 6.384350776672363,
"eval_runtime": 173.0054,
"eval_samples_per_second": 57.802,
"eval_steps_per_second": 7.225,
"step": 34100
},
{
"epoch": 75.85250901025783,
"grad_norm": 0.28921636939048767,
"learning_rate": 2.588628762541806e-06,
"loss": 6.3739,
"step": 34200
},
{
"epoch": 75.85250901025783,
"eval_loss": 6.387299537658691,
"eval_runtime": 175.5823,
"eval_samples_per_second": 56.953,
"eval_steps_per_second": 7.119,
"step": 34200
},
{
"epoch": 76.07429997227614,
"grad_norm": 0.3911747336387634,
"learning_rate": 2.5785953177257527e-06,
"loss": 6.3734,
"step": 34300
},
{
"epoch": 76.07429997227614,
"eval_loss": 6.389584541320801,
"eval_runtime": 172.7277,
"eval_samples_per_second": 57.895,
"eval_steps_per_second": 7.237,
"step": 34300
},
{
"epoch": 76.29609093429443,
"grad_norm": 0.3622056245803833,
"learning_rate": 2.568561872909699e-06,
"loss": 6.3739,
"step": 34400
},
{
"epoch": 76.29609093429443,
"eval_loss": 6.386180400848389,
"eval_runtime": 175.4099,
"eval_samples_per_second": 57.009,
"eval_steps_per_second": 7.126,
"step": 34400
},
{
"epoch": 76.51788189631273,
"grad_norm": 0.24905167520046234,
"learning_rate": 2.5585284280936454e-06,
"loss": 6.3746,
"step": 34500
},
{
"epoch": 76.51788189631273,
"eval_loss": 6.383036136627197,
"eval_runtime": 172.8585,
"eval_samples_per_second": 57.851,
"eval_steps_per_second": 7.231,
"step": 34500
},
{
"epoch": 76.73967285833102,
"grad_norm": 0.3207278549671173,
"learning_rate": 2.548494983277592e-06,
"loss": 6.3749,
"step": 34600
},
{
"epoch": 76.73967285833102,
"eval_loss": 6.3871564865112305,
"eval_runtime": 175.7463,
"eval_samples_per_second": 56.9,
"eval_steps_per_second": 7.113,
"step": 34600
},
{
"epoch": 76.96146382034932,
"grad_norm": 0.3537052273750305,
"learning_rate": 2.5384615384615385e-06,
"loss": 6.3719,
"step": 34700
},
{
"epoch": 76.96146382034932,
"eval_loss": 6.384720325469971,
"eval_runtime": 172.9583,
"eval_samples_per_second": 57.817,
"eval_steps_per_second": 7.227,
"step": 34700
},
{
"epoch": 77.18325478236761,
"grad_norm": 0.4220789074897766,
"learning_rate": 2.528428093645485e-06,
"loss": 6.3736,
"step": 34800
},
{
"epoch": 77.18325478236761,
"eval_loss": 6.384481906890869,
"eval_runtime": 175.4683,
"eval_samples_per_second": 56.99,
"eval_steps_per_second": 7.124,
"step": 34800
},
{
"epoch": 77.40504574438592,
"grad_norm": 0.3726615011692047,
"learning_rate": 2.5183946488294316e-06,
"loss": 6.3726,
"step": 34900
},
{
"epoch": 77.40504574438592,
"eval_loss": 6.383063793182373,
"eval_runtime": 175.4526,
"eval_samples_per_second": 56.995,
"eval_steps_per_second": 7.124,
"step": 34900
},
{
"epoch": 77.62683670640422,
"grad_norm": 0.3583526909351349,
"learning_rate": 2.508361204013378e-06,
"loss": 6.3742,
"step": 35000
},
{
"epoch": 77.62683670640422,
"eval_loss": 6.383593559265137,
"eval_runtime": 175.5123,
"eval_samples_per_second": 56.976,
"eval_steps_per_second": 7.122,
"step": 35000
},
{
"epoch": 77.84862766842251,
"grad_norm": 0.31663283705711365,
"learning_rate": 2.4983277591973247e-06,
"loss": 6.3746,
"step": 35100
},
{
"epoch": 77.84862766842251,
"eval_loss": 6.385804653167725,
"eval_runtime": 175.8899,
"eval_samples_per_second": 56.854,
"eval_steps_per_second": 7.107,
"step": 35100
},
{
"epoch": 78.07041863044081,
"grad_norm": 0.3281422555446625,
"learning_rate": 2.488294314381271e-06,
"loss": 6.374,
"step": 35200
},
{
"epoch": 78.07041863044081,
"eval_loss": 6.382884979248047,
"eval_runtime": 176.0174,
"eval_samples_per_second": 56.813,
"eval_steps_per_second": 7.102,
"step": 35200
},
{
"epoch": 78.2922095924591,
"grad_norm": 0.35885676741600037,
"learning_rate": 2.4782608695652173e-06,
"loss": 6.3737,
"step": 35300
},
{
"epoch": 78.2922095924591,
"eval_loss": 6.38320255279541,
"eval_runtime": 175.5121,
"eval_samples_per_second": 56.976,
"eval_steps_per_second": 7.122,
"step": 35300
},
{
"epoch": 78.5140005544774,
"grad_norm": 0.40301480889320374,
"learning_rate": 2.468227424749164e-06,
"loss": 6.3742,
"step": 35400
},
{
"epoch": 78.5140005544774,
"eval_loss": 6.386338233947754,
"eval_runtime": 175.6402,
"eval_samples_per_second": 56.935,
"eval_steps_per_second": 7.117,
"step": 35400
},
{
"epoch": 78.73579151649571,
"grad_norm": 0.3202325701713562,
"learning_rate": 2.4581939799331104e-06,
"loss": 6.3736,
"step": 35500
},
{
"epoch": 78.73579151649571,
"eval_loss": 6.385340690612793,
"eval_runtime": 173.2176,
"eval_samples_per_second": 57.731,
"eval_steps_per_second": 7.216,
"step": 35500
},
{
"epoch": 78.957582478514,
"grad_norm": 0.370046466588974,
"learning_rate": 2.4481605351170568e-06,
"loss": 6.3733,
"step": 35600
},
{
"epoch": 78.957582478514,
"eval_loss": 6.3839592933654785,
"eval_runtime": 173.1936,
"eval_samples_per_second": 57.739,
"eval_steps_per_second": 7.217,
"step": 35600
},
{
"epoch": 79.1793734405323,
"grad_norm": 0.3682570457458496,
"learning_rate": 2.4381270903010035e-06,
"loss": 6.373,
"step": 35700
},
{
"epoch": 79.1793734405323,
"eval_loss": 6.384267807006836,
"eval_runtime": 175.7512,
"eval_samples_per_second": 56.899,
"eval_steps_per_second": 7.112,
"step": 35700
},
{
"epoch": 79.40116440255059,
"grad_norm": 0.42555299401283264,
"learning_rate": 2.42809364548495e-06,
"loss": 6.3724,
"step": 35800
},
{
"epoch": 79.40116440255059,
"eval_loss": 6.386261940002441,
"eval_runtime": 173.3473,
"eval_samples_per_second": 57.688,
"eval_steps_per_second": 7.211,
"step": 35800
},
{
"epoch": 79.6229553645689,
"grad_norm": 0.4109131693840027,
"learning_rate": 2.4180602006688962e-06,
"loss": 6.3738,
"step": 35900
},
{
"epoch": 79.6229553645689,
"eval_loss": 6.385996341705322,
"eval_runtime": 175.868,
"eval_samples_per_second": 56.861,
"eval_steps_per_second": 7.108,
"step": 35900
},
{
"epoch": 79.84474632658718,
"grad_norm": 0.4770185351371765,
"learning_rate": 2.408026755852843e-06,
"loss": 6.373,
"step": 36000
},
{
"epoch": 79.84474632658718,
"eval_loss": 6.385003566741943,
"eval_runtime": 175.9258,
"eval_samples_per_second": 56.842,
"eval_steps_per_second": 7.105,
"step": 36000
},
{
"epoch": 80.06653728860549,
"grad_norm": 0.31983354687690735,
"learning_rate": 2.3979933110367893e-06,
"loss": 6.3721,
"step": 36100
},
{
"epoch": 80.06653728860549,
"eval_loss": 6.384030818939209,
"eval_runtime": 175.9559,
"eval_samples_per_second": 56.832,
"eval_steps_per_second": 7.104,
"step": 36100
},
{
"epoch": 80.28832825062379,
"grad_norm": 0.42961299419403076,
"learning_rate": 2.387959866220736e-06,
"loss": 6.3712,
"step": 36200
},
{
"epoch": 80.28832825062379,
"eval_loss": 6.385640621185303,
"eval_runtime": 173.4173,
"eval_samples_per_second": 57.664,
"eval_steps_per_second": 7.208,
"step": 36200
},
{
"epoch": 80.51011921264208,
"grad_norm": 0.31057417392730713,
"learning_rate": 2.3779264214046824e-06,
"loss": 6.3731,
"step": 36300
},
{
"epoch": 80.51011921264208,
"eval_loss": 6.384836196899414,
"eval_runtime": 175.866,
"eval_samples_per_second": 56.861,
"eval_steps_per_second": 7.108,
"step": 36300
},
{
"epoch": 80.73191017466038,
"grad_norm": 0.2894494831562042,
"learning_rate": 2.3678929765886288e-06,
"loss": 6.3741,
"step": 36400
},
{
"epoch": 80.73191017466038,
"eval_loss": 6.385368824005127,
"eval_runtime": 175.9096,
"eval_samples_per_second": 56.847,
"eval_steps_per_second": 7.106,
"step": 36400
},
{
"epoch": 80.95370113667867,
"grad_norm": 0.4780093729496002,
"learning_rate": 2.3578595317725755e-06,
"loss": 6.3749,
"step": 36500
},
{
"epoch": 80.95370113667867,
"eval_loss": 6.384347438812256,
"eval_runtime": 175.8521,
"eval_samples_per_second": 56.866,
"eval_steps_per_second": 7.108,
"step": 36500
},
{
"epoch": 81.17549209869698,
"grad_norm": 0.31205832958221436,
"learning_rate": 2.347826086956522e-06,
"loss": 6.3743,
"step": 36600
},
{
"epoch": 81.17549209869698,
"eval_loss": 6.385135173797607,
"eval_runtime": 175.8923,
"eval_samples_per_second": 56.853,
"eval_steps_per_second": 7.107,
"step": 36600
},
{
"epoch": 81.39728306071528,
"grad_norm": 0.3318498134613037,
"learning_rate": 2.337792642140468e-06,
"loss": 6.3735,
"step": 36700
},
{
"epoch": 81.39728306071528,
"eval_loss": 6.3830389976501465,
"eval_runtime": 174.576,
"eval_samples_per_second": 57.282,
"eval_steps_per_second": 7.16,
"step": 36700
},
{
"epoch": 81.61907402273357,
"grad_norm": 0.35717305541038513,
"learning_rate": 2.327759197324415e-06,
"loss": 6.3726,
"step": 36800
},
{
"epoch": 81.61907402273357,
"eval_loss": 6.384567737579346,
"eval_runtime": 173.0226,
"eval_samples_per_second": 57.796,
"eval_steps_per_second": 7.224,
"step": 36800
},
{
"epoch": 81.84086498475187,
"grad_norm": 0.36196058988571167,
"learning_rate": 2.3177257525083613e-06,
"loss": 6.3734,
"step": 36900
},
{
"epoch": 81.84086498475187,
"eval_loss": 6.385857582092285,
"eval_runtime": 175.4192,
"eval_samples_per_second": 57.006,
"eval_steps_per_second": 7.126,
"step": 36900
},
{
"epoch": 82.06265594677016,
"grad_norm": 0.34454473853111267,
"learning_rate": 2.307692307692308e-06,
"loss": 6.3732,
"step": 37000
},
{
"epoch": 82.06265594677016,
"eval_loss": 6.384513854980469,
"eval_runtime": 175.4568,
"eval_samples_per_second": 56.994,
"eval_steps_per_second": 7.124,
"step": 37000
},
{
"epoch": 82.28444690878847,
"grad_norm": 0.3330673575401306,
"learning_rate": 2.2976588628762544e-06,
"loss": 6.3717,
"step": 37100
},
{
"epoch": 82.28444690878847,
"eval_loss": 6.383497714996338,
"eval_runtime": 173.2512,
"eval_samples_per_second": 57.72,
"eval_steps_per_second": 7.215,
"step": 37100
},
{
"epoch": 82.50623787080677,
"grad_norm": 0.40681159496307373,
"learning_rate": 2.2876254180602008e-06,
"loss": 6.3728,
"step": 37200
},
{
"epoch": 82.50623787080677,
"eval_loss": 6.38515567779541,
"eval_runtime": 175.8178,
"eval_samples_per_second": 56.877,
"eval_steps_per_second": 7.11,
"step": 37200
},
{
"epoch": 82.72802883282506,
"grad_norm": 0.3258204162120819,
"learning_rate": 2.2775919732441475e-06,
"loss": 6.3743,
"step": 37300
},
{
"epoch": 82.72802883282506,
"eval_loss": 6.38502311706543,
"eval_runtime": 173.1187,
"eval_samples_per_second": 57.764,
"eval_steps_per_second": 7.22,
"step": 37300
},
{
"epoch": 82.94981979484336,
"grad_norm": 0.37041613459587097,
"learning_rate": 2.267558528428094e-06,
"loss": 6.3728,
"step": 37400
},
{
"epoch": 82.94981979484336,
"eval_loss": 6.3821611404418945,
"eval_runtime": 175.5559,
"eval_samples_per_second": 56.962,
"eval_steps_per_second": 7.12,
"step": 37400
},
{
"epoch": 83.17161075686165,
"grad_norm": 0.33911818265914917,
"learning_rate": 2.25752508361204e-06,
"loss": 6.3738,
"step": 37500
},
{
"epoch": 83.17161075686165,
"eval_loss": 6.386144638061523,
"eval_runtime": 173.2969,
"eval_samples_per_second": 57.704,
"eval_steps_per_second": 7.213,
"step": 37500
},
{
"epoch": 83.39340171887996,
"grad_norm": 0.48508045077323914,
"learning_rate": 2.2474916387959865e-06,
"loss": 6.3728,
"step": 37600
},
{
"epoch": 83.39340171887996,
"eval_loss": 6.383870601654053,
"eval_runtime": 175.8417,
"eval_samples_per_second": 56.869,
"eval_steps_per_second": 7.109,
"step": 37600
},
{
"epoch": 83.61519268089825,
"grad_norm": 0.3488113284111023,
"learning_rate": 2.237458193979933e-06,
"loss": 6.3726,
"step": 37700
},
{
"epoch": 83.61519268089825,
"eval_loss": 6.385016441345215,
"eval_runtime": 175.8197,
"eval_samples_per_second": 56.876,
"eval_steps_per_second": 7.11,
"step": 37700
},
{
"epoch": 83.83698364291655,
"grad_norm": 0.3524182438850403,
"learning_rate": 2.2274247491638796e-06,
"loss": 6.3725,
"step": 37800
},
{
"epoch": 83.83698364291655,
"eval_loss": 6.384798526763916,
"eval_runtime": 175.4988,
"eval_samples_per_second": 56.98,
"eval_steps_per_second": 7.123,
"step": 37800
},
{
"epoch": 84.05877460493485,
"grad_norm": 0.28423815965652466,
"learning_rate": 2.217391304347826e-06,
"loss": 6.374,
"step": 37900
},
{
"epoch": 84.05877460493485,
"eval_loss": 6.387665748596191,
"eval_runtime": 172.948,
"eval_samples_per_second": 57.821,
"eval_steps_per_second": 7.228,
"step": 37900
},
{
"epoch": 84.28056556695314,
"grad_norm": 0.32828596234321594,
"learning_rate": 2.2073578595317723e-06,
"loss": 6.3724,
"step": 38000
},
{
"epoch": 84.28056556695314,
"eval_loss": 6.383293628692627,
"eval_runtime": 175.4508,
"eval_samples_per_second": 56.996,
"eval_steps_per_second": 7.125,
"step": 38000
},
{
"epoch": 84.50235652897145,
"grad_norm": 0.33721184730529785,
"learning_rate": 2.197324414715719e-06,
"loss": 6.373,
"step": 38100
},
{
"epoch": 84.50235652897145,
"eval_loss": 6.385343551635742,
"eval_runtime": 175.531,
"eval_samples_per_second": 56.97,
"eval_steps_per_second": 7.121,
"step": 38100
},
{
"epoch": 84.72414749098974,
"grad_norm": 0.2766687273979187,
"learning_rate": 2.1872909698996654e-06,
"loss": 6.3728,
"step": 38200
},
{
"epoch": 84.72414749098974,
"eval_loss": 6.38714599609375,
"eval_runtime": 175.5001,
"eval_samples_per_second": 56.98,
"eval_steps_per_second": 7.123,
"step": 38200
},
{
"epoch": 84.94593845300804,
"grad_norm": 0.26238977909088135,
"learning_rate": 2.177257525083612e-06,
"loss": 6.3733,
"step": 38300
},
{
"epoch": 84.94593845300804,
"eval_loss": 6.385676383972168,
"eval_runtime": 175.6778,
"eval_samples_per_second": 56.922,
"eval_steps_per_second": 7.115,
"step": 38300
},
{
"epoch": 85.16772941502634,
"grad_norm": 0.2862393260002136,
"learning_rate": 2.1672240802675585e-06,
"loss": 6.3729,
"step": 38400
},
{
"epoch": 85.16772941502634,
"eval_loss": 6.384363174438477,
"eval_runtime": 175.3945,
"eval_samples_per_second": 57.014,
"eval_steps_per_second": 7.127,
"step": 38400
},
{
"epoch": 85.38952037704463,
"grad_norm": 0.34560856223106384,
"learning_rate": 2.157190635451505e-06,
"loss": 6.3732,
"step": 38500
},
{
"epoch": 85.38952037704463,
"eval_loss": 6.383378982543945,
"eval_runtime": 172.9454,
"eval_samples_per_second": 57.822,
"eval_steps_per_second": 7.228,
"step": 38500
},
{
"epoch": 85.61131133906294,
"grad_norm": 0.31079375743865967,
"learning_rate": 2.1471571906354516e-06,
"loss": 6.373,
"step": 38600
},
{
"epoch": 85.61131133906294,
"eval_loss": 6.383601665496826,
"eval_runtime": 175.4201,
"eval_samples_per_second": 57.006,
"eval_steps_per_second": 7.126,
"step": 38600
},
{
"epoch": 85.83310230108123,
"grad_norm": 0.3083253800868988,
"learning_rate": 2.137123745819398e-06,
"loss": 6.3731,
"step": 38700
},
{
"epoch": 85.83310230108123,
"eval_loss": 6.383668899536133,
"eval_runtime": 175.9754,
"eval_samples_per_second": 56.826,
"eval_steps_per_second": 7.103,
"step": 38700
},
{
"epoch": 86.05489326309953,
"grad_norm": 0.344168096780777,
"learning_rate": 2.1270903010033443e-06,
"loss": 6.3731,
"step": 38800
},
{
"epoch": 86.05489326309953,
"eval_loss": 6.382165431976318,
"eval_runtime": 173.0611,
"eval_samples_per_second": 57.783,
"eval_steps_per_second": 7.223,
"step": 38800
},
{
"epoch": 86.27668422511783,
"grad_norm": 0.42378509044647217,
"learning_rate": 2.117056856187291e-06,
"loss": 6.3735,
"step": 38900
},
{
"epoch": 86.27668422511783,
"eval_loss": 6.386937618255615,
"eval_runtime": 175.7527,
"eval_samples_per_second": 56.898,
"eval_steps_per_second": 7.112,
"step": 38900
},
{
"epoch": 86.49847518713612,
"grad_norm": 0.4086206555366516,
"learning_rate": 2.1070234113712374e-06,
"loss": 6.372,
"step": 39000
},
{
"epoch": 86.49847518713612,
"eval_loss": 6.385149955749512,
"eval_runtime": 172.9793,
"eval_samples_per_second": 57.81,
"eval_steps_per_second": 7.226,
"step": 39000
},
{
"epoch": 86.72026614915443,
"grad_norm": 0.3867028057575226,
"learning_rate": 2.0969899665551837e-06,
"loss": 6.371,
"step": 39100
},
{
"epoch": 86.72026614915443,
"eval_loss": 6.385136604309082,
"eval_runtime": 175.5185,
"eval_samples_per_second": 56.974,
"eval_steps_per_second": 7.122,
"step": 39100
},
{
"epoch": 86.94205711117272,
"grad_norm": 0.34638744592666626,
"learning_rate": 2.0869565217391305e-06,
"loss": 6.3723,
"step": 39200
},
{
"epoch": 86.94205711117272,
"eval_loss": 6.382205486297607,
"eval_runtime": 172.979,
"eval_samples_per_second": 57.81,
"eval_steps_per_second": 7.226,
"step": 39200
},
{
"epoch": 87.16384807319102,
"grad_norm": 0.45395034551620483,
"learning_rate": 2.076923076923077e-06,
"loss": 6.374,
"step": 39300
},
{
"epoch": 87.16384807319102,
"eval_loss": 6.383747100830078,
"eval_runtime": 175.4954,
"eval_samples_per_second": 56.982,
"eval_steps_per_second": 7.123,
"step": 39300
},
{
"epoch": 87.38563903520931,
"grad_norm": 0.2925475537776947,
"learning_rate": 2.0668896321070236e-06,
"loss": 6.3746,
"step": 39400
},
{
"epoch": 87.38563903520931,
"eval_loss": 6.3860931396484375,
"eval_runtime": 172.9827,
"eval_samples_per_second": 57.809,
"eval_steps_per_second": 7.226,
"step": 39400
},
{
"epoch": 87.60742999722761,
"grad_norm": 0.25185534358024597,
"learning_rate": 2.05685618729097e-06,
"loss": 6.3721,
"step": 39500
},
{
"epoch": 87.60742999722761,
"eval_loss": 6.383828163146973,
"eval_runtime": 175.4682,
"eval_samples_per_second": 56.99,
"eval_steps_per_second": 7.124,
"step": 39500
},
{
"epoch": 87.82922095924592,
"grad_norm": 0.35766276717185974,
"learning_rate": 2.0468227424749163e-06,
"loss": 6.3713,
"step": 39600
},
{
"epoch": 87.82922095924592,
"eval_loss": 6.383662700653076,
"eval_runtime": 173.3378,
"eval_samples_per_second": 57.691,
"eval_steps_per_second": 7.211,
"step": 39600
},
{
"epoch": 88.0510119212642,
"grad_norm": 0.31199392676353455,
"learning_rate": 2.036789297658863e-06,
"loss": 6.3717,
"step": 39700
},
{
"epoch": 88.0510119212642,
"eval_loss": 6.383730411529541,
"eval_runtime": 175.0814,
"eval_samples_per_second": 57.116,
"eval_steps_per_second": 7.14,
"step": 39700
},
{
"epoch": 88.27280288328251,
"grad_norm": 0.3334641754627228,
"learning_rate": 2.0267558528428094e-06,
"loss": 6.372,
"step": 39800
},
{
"epoch": 88.27280288328251,
"eval_loss": 6.381414890289307,
"eval_runtime": 172.9715,
"eval_samples_per_second": 57.813,
"eval_steps_per_second": 7.227,
"step": 39800
},
{
"epoch": 88.4945938453008,
"grad_norm": 0.5019832849502563,
"learning_rate": 2.0167224080267557e-06,
"loss": 6.3721,
"step": 39900
},
{
"epoch": 88.4945938453008,
"eval_loss": 6.383211612701416,
"eval_runtime": 175.5157,
"eval_samples_per_second": 56.975,
"eval_steps_per_second": 7.122,
"step": 39900
},
{
"epoch": 88.7163848073191,
"grad_norm": 0.4383368194103241,
"learning_rate": 2.0066889632107025e-06,
"loss": 6.3731,
"step": 40000
},
{
"epoch": 88.7163848073191,
"eval_loss": 6.385327339172363,
"eval_runtime": 175.457,
"eval_samples_per_second": 56.994,
"eval_steps_per_second": 7.124,
"step": 40000
},
{
"epoch": 88.9381757693374,
"grad_norm": 0.27147725224494934,
"learning_rate": 1.996655518394649e-06,
"loss": 6.3741,
"step": 40100
},
{
"epoch": 88.9381757693374,
"eval_loss": 6.383349418640137,
"eval_runtime": 173.4084,
"eval_samples_per_second": 57.667,
"eval_steps_per_second": 7.208,
"step": 40100
},
{
"epoch": 89.1599667313557,
"grad_norm": 0.2689467966556549,
"learning_rate": 1.986622073578595e-06,
"loss": 6.3719,
"step": 40200
},
{
"epoch": 89.1599667313557,
"eval_loss": 6.38576078414917,
"eval_runtime": 173.3868,
"eval_samples_per_second": 57.675,
"eval_steps_per_second": 7.209,
"step": 40200
},
{
"epoch": 89.381757693374,
"grad_norm": 0.3858400881290436,
"learning_rate": 1.976588628762542e-06,
"loss": 6.3722,
"step": 40300
},
{
"epoch": 89.381757693374,
"eval_loss": 6.38473653793335,
"eval_runtime": 174.5973,
"eval_samples_per_second": 57.275,
"eval_steps_per_second": 7.159,
"step": 40300
},
{
"epoch": 89.60354865539229,
"grad_norm": 0.372864693403244,
"learning_rate": 1.9665551839464883e-06,
"loss": 6.3727,
"step": 40400
},
{
"epoch": 89.60354865539229,
"eval_loss": 6.384860992431641,
"eval_runtime": 175.5793,
"eval_samples_per_second": 56.954,
"eval_steps_per_second": 7.119,
"step": 40400
},
{
"epoch": 89.82533961741059,
"grad_norm": 0.31050923466682434,
"learning_rate": 1.956521739130435e-06,
"loss": 6.3721,
"step": 40500
},
{
"epoch": 89.82533961741059,
"eval_loss": 6.3831257820129395,
"eval_runtime": 173.5084,
"eval_samples_per_second": 57.634,
"eval_steps_per_second": 7.204,
"step": 40500
},
{
"epoch": 90.0471305794289,
"grad_norm": 0.31580400466918945,
"learning_rate": 1.9464882943143814e-06,
"loss": 6.3716,
"step": 40600
},
{
"epoch": 90.0471305794289,
"eval_loss": 6.382096767425537,
"eval_runtime": 175.7748,
"eval_samples_per_second": 56.891,
"eval_steps_per_second": 7.111,
"step": 40600
},
{
"epoch": 90.26892154144718,
"grad_norm": 0.30445969104766846,
"learning_rate": 1.9364548494983277e-06,
"loss": 6.3738,
"step": 40700
},
{
"epoch": 90.26892154144718,
"eval_loss": 6.383363246917725,
"eval_runtime": 175.8814,
"eval_samples_per_second": 56.856,
"eval_steps_per_second": 7.107,
"step": 40700
},
{
"epoch": 90.49071250346549,
"grad_norm": 0.3509177565574646,
"learning_rate": 1.9264214046822745e-06,
"loss": 6.3711,
"step": 40800
},
{
"epoch": 90.49071250346549,
"eval_loss": 6.3791728019714355,
"eval_runtime": 175.2022,
"eval_samples_per_second": 57.077,
"eval_steps_per_second": 7.135,
"step": 40800
},
{
"epoch": 90.71250346548378,
"grad_norm": 0.2431792914867401,
"learning_rate": 1.916387959866221e-06,
"loss": 6.3717,
"step": 40900
},
{
"epoch": 90.71250346548378,
"eval_loss": 6.383620262145996,
"eval_runtime": 173.3604,
"eval_samples_per_second": 57.683,
"eval_steps_per_second": 7.21,
"step": 40900
},
{
"epoch": 90.93429442750208,
"grad_norm": 0.3652373254299164,
"learning_rate": 1.9063545150501674e-06,
"loss": 6.3702,
"step": 41000
},
{
"epoch": 90.93429442750208,
"eval_loss": 6.384062767028809,
"eval_runtime": 175.9398,
"eval_samples_per_second": 56.838,
"eval_steps_per_second": 7.105,
"step": 41000
},
{
"epoch": 91.15608538952037,
"grad_norm": 0.3120420575141907,
"learning_rate": 1.896321070234114e-06,
"loss": 6.3734,
"step": 41100
},
{
"epoch": 91.15608538952037,
"eval_loss": 6.383402347564697,
"eval_runtime": 173.0565,
"eval_samples_per_second": 57.785,
"eval_steps_per_second": 7.223,
"step": 41100
},
{
"epoch": 91.37787635153867,
"grad_norm": 0.36098653078079224,
"learning_rate": 1.8862876254180603e-06,
"loss": 6.3731,
"step": 41200
},
{
"epoch": 91.37787635153867,
"eval_loss": 6.384464263916016,
"eval_runtime": 175.6772,
"eval_samples_per_second": 56.923,
"eval_steps_per_second": 7.115,
"step": 41200
},
{
"epoch": 91.59966731355698,
"grad_norm": 0.2494172751903534,
"learning_rate": 1.8762541806020068e-06,
"loss": 6.3727,
"step": 41300
},
{
"epoch": 91.59966731355698,
"eval_loss": 6.384238243103027,
"eval_runtime": 175.6493,
"eval_samples_per_second": 56.932,
"eval_steps_per_second": 7.116,
"step": 41300
},
{
"epoch": 91.82145827557527,
"grad_norm": 0.2649492025375366,
"learning_rate": 1.8662207357859534e-06,
"loss": 6.3715,
"step": 41400
},
{
"epoch": 91.82145827557527,
"eval_loss": 6.386543273925781,
"eval_runtime": 173.0007,
"eval_samples_per_second": 57.803,
"eval_steps_per_second": 7.225,
"step": 41400
},
{
"epoch": 92.04324923759357,
"grad_norm": 0.31116828322410583,
"learning_rate": 1.8561872909699e-06,
"loss": 6.3714,
"step": 41500
},
{
"epoch": 92.04324923759357,
"eval_loss": 6.384570121765137,
"eval_runtime": 172.9737,
"eval_samples_per_second": 57.812,
"eval_steps_per_second": 7.227,
"step": 41500
},
{
"epoch": 92.26504019961186,
"grad_norm": 0.39690667390823364,
"learning_rate": 1.8461538461538462e-06,
"loss": 6.3722,
"step": 41600
},
{
"epoch": 92.26504019961186,
"eval_loss": 6.384208679199219,
"eval_runtime": 175.5344,
"eval_samples_per_second": 56.969,
"eval_steps_per_second": 7.121,
"step": 41600
},
{
"epoch": 92.48683116163016,
"grad_norm": 0.31385165452957153,
"learning_rate": 1.8361204013377928e-06,
"loss": 6.3727,
"step": 41700
},
{
"epoch": 92.48683116163016,
"eval_loss": 6.382976055145264,
"eval_runtime": 175.571,
"eval_samples_per_second": 56.957,
"eval_steps_per_second": 7.12,
"step": 41700
},
{
"epoch": 92.70862212364847,
"grad_norm": 0.2589961886405945,
"learning_rate": 1.8260869565217394e-06,
"loss": 6.373,
"step": 41800
},
{
"epoch": 92.70862212364847,
"eval_loss": 6.384578704833984,
"eval_runtime": 172.8987,
"eval_samples_per_second": 57.837,
"eval_steps_per_second": 7.23,
"step": 41800
},
{
"epoch": 92.93041308566676,
"grad_norm": 0.3754993677139282,
"learning_rate": 1.8160535117056857e-06,
"loss": 6.3716,
"step": 41900
},
{
"epoch": 92.93041308566676,
"eval_loss": 6.387712478637695,
"eval_runtime": 173.081,
"eval_samples_per_second": 57.776,
"eval_steps_per_second": 7.222,
"step": 41900
},
{
"epoch": 93.15220404768506,
"grad_norm": 0.34123027324676514,
"learning_rate": 1.8060200668896322e-06,
"loss": 6.3719,
"step": 42000
},
{
"epoch": 93.15220404768506,
"eval_loss": 6.387158393859863,
"eval_runtime": 173.0202,
"eval_samples_per_second": 57.797,
"eval_steps_per_second": 7.225,
"step": 42000
},
{
"epoch": 93.37399500970335,
"grad_norm": 0.28870150446891785,
"learning_rate": 1.7959866220735788e-06,
"loss": 6.3717,
"step": 42100
},
{
"epoch": 93.37399500970335,
"eval_loss": 6.384382247924805,
"eval_runtime": 175.9359,
"eval_samples_per_second": 56.839,
"eval_steps_per_second": 7.105,
"step": 42100
},
{
"epoch": 93.59578597172165,
"grad_norm": 0.33736997842788696,
"learning_rate": 1.7859531772575253e-06,
"loss": 6.3731,
"step": 42200
},
{
"epoch": 93.59578597172165,
"eval_loss": 6.384626865386963,
"eval_runtime": 172.9425,
"eval_samples_per_second": 57.823,
"eval_steps_per_second": 7.228,
"step": 42200
},
{
"epoch": 93.81757693373996,
"grad_norm": 0.30697163939476013,
"learning_rate": 1.7759197324414717e-06,
"loss": 6.3731,
"step": 42300
},
{
"epoch": 93.81757693373996,
"eval_loss": 6.384149074554443,
"eval_runtime": 175.533,
"eval_samples_per_second": 56.969,
"eval_steps_per_second": 7.121,
"step": 42300
},
{
"epoch": 94.03936789575825,
"grad_norm": 0.31292060017585754,
"learning_rate": 1.7658862876254182e-06,
"loss": 6.372,
"step": 42400
},
{
"epoch": 94.03936789575825,
"eval_loss": 6.38083553314209,
"eval_runtime": 173.0184,
"eval_samples_per_second": 57.797,
"eval_steps_per_second": 7.225,
"step": 42400
},
{
"epoch": 94.26115885777655,
"grad_norm": 0.3728470504283905,
"learning_rate": 1.7558528428093648e-06,
"loss": 6.3713,
"step": 42500
},
{
"epoch": 94.26115885777655,
"eval_loss": 6.381670951843262,
"eval_runtime": 175.3914,
"eval_samples_per_second": 57.015,
"eval_steps_per_second": 7.127,
"step": 42500
},
{
"epoch": 94.48294981979484,
"grad_norm": 0.44780856370925903,
"learning_rate": 1.745819397993311e-06,
"loss": 6.3718,
"step": 42600
},
{
"epoch": 94.48294981979484,
"eval_loss": 6.385097503662109,
"eval_runtime": 175.3778,
"eval_samples_per_second": 57.02,
"eval_steps_per_second": 7.127,
"step": 42600
},
{
"epoch": 94.70474078181314,
"grad_norm": 0.29420205950737,
"learning_rate": 1.7357859531772575e-06,
"loss": 6.3709,
"step": 42700
},
{
"epoch": 94.70474078181314,
"eval_loss": 6.382612705230713,
"eval_runtime": 173.3858,
"eval_samples_per_second": 57.675,
"eval_steps_per_second": 7.209,
"step": 42700
},
{
"epoch": 94.92653174383143,
"grad_norm": 0.43360549211502075,
"learning_rate": 1.7257525083612038e-06,
"loss": 6.3708,
"step": 42800
},
{
"epoch": 94.92653174383143,
"eval_loss": 6.382971286773682,
"eval_runtime": 172.9207,
"eval_samples_per_second": 57.83,
"eval_steps_per_second": 7.229,
"step": 42800
},
{
"epoch": 95.14832270584974,
"grad_norm": 0.29865312576293945,
"learning_rate": 1.7157190635451504e-06,
"loss": 6.372,
"step": 42900
},
{
"epoch": 95.14832270584974,
"eval_loss": 6.3829755783081055,
"eval_runtime": 175.5167,
"eval_samples_per_second": 56.975,
"eval_steps_per_second": 7.122,
"step": 42900
},
{
"epoch": 95.37011366786804,
"grad_norm": 0.32399508357048035,
"learning_rate": 1.705685618729097e-06,
"loss": 6.3712,
"step": 43000
},
{
"epoch": 95.37011366786804,
"eval_loss": 6.381554126739502,
"eval_runtime": 172.9843,
"eval_samples_per_second": 57.809,
"eval_steps_per_second": 7.226,
"step": 43000
},
{
"epoch": 95.59190462988633,
"grad_norm": 0.2875135540962219,
"learning_rate": 1.6956521739130435e-06,
"loss": 6.3709,
"step": 43100
},
{
"epoch": 95.59190462988633,
"eval_loss": 6.381914138793945,
"eval_runtime": 175.5546,
"eval_samples_per_second": 56.962,
"eval_steps_per_second": 7.12,
"step": 43100
},
{
"epoch": 95.81369559190463,
"grad_norm": 0.4401540756225586,
"learning_rate": 1.6856187290969898e-06,
"loss": 6.3723,
"step": 43200
},
{
"epoch": 95.81369559190463,
"eval_loss": 6.383592128753662,
"eval_runtime": 175.584,
"eval_samples_per_second": 56.953,
"eval_steps_per_second": 7.119,
"step": 43200
},
{
"epoch": 96.03548655392292,
"grad_norm": 0.2576783001422882,
"learning_rate": 1.6755852842809363e-06,
"loss": 6.3722,
"step": 43300
},
{
"epoch": 96.03548655392292,
"eval_loss": 6.383729457855225,
"eval_runtime": 175.4696,
"eval_samples_per_second": 56.99,
"eval_steps_per_second": 7.124,
"step": 43300
},
{
"epoch": 96.25727751594123,
"grad_norm": 0.3146987855434418,
"learning_rate": 1.665551839464883e-06,
"loss": 6.3716,
"step": 43400
},
{
"epoch": 96.25727751594123,
"eval_loss": 6.380384922027588,
"eval_runtime": 175.0534,
"eval_samples_per_second": 57.125,
"eval_steps_per_second": 7.141,
"step": 43400
},
{
"epoch": 96.47906847795953,
"grad_norm": 0.3195679485797882,
"learning_rate": 1.6555183946488294e-06,
"loss": 6.3714,
"step": 43500
},
{
"epoch": 96.47906847795953,
"eval_loss": 6.382904529571533,
"eval_runtime": 175.5685,
"eval_samples_per_second": 56.958,
"eval_steps_per_second": 7.12,
"step": 43500
},
{
"epoch": 96.70085943997782,
"grad_norm": 0.2415214329957962,
"learning_rate": 1.6454849498327758e-06,
"loss": 6.3711,
"step": 43600
},
{
"epoch": 96.70085943997782,
"eval_loss": 6.380964279174805,
"eval_runtime": 173.0267,
"eval_samples_per_second": 57.795,
"eval_steps_per_second": 7.224,
"step": 43600
},
{
"epoch": 96.92265040199612,
"grad_norm": 0.40489473938941956,
"learning_rate": 1.6354515050167223e-06,
"loss": 6.3726,
"step": 43700
},
{
"epoch": 96.92265040199612,
"eval_loss": 6.381808757781982,
"eval_runtime": 173.1061,
"eval_samples_per_second": 57.768,
"eval_steps_per_second": 7.221,
"step": 43700
},
{
"epoch": 97.14444136401441,
"grad_norm": 0.30804529786109924,
"learning_rate": 1.6254180602006689e-06,
"loss": 6.372,
"step": 43800
},
{
"epoch": 97.14444136401441,
"eval_loss": 6.384749889373779,
"eval_runtime": 175.6167,
"eval_samples_per_second": 56.942,
"eval_steps_per_second": 7.118,
"step": 43800
},
{
"epoch": 97.36623232603272,
"grad_norm": 0.31247368454933167,
"learning_rate": 1.6153846153846154e-06,
"loss": 6.3738,
"step": 43900
},
{
"epoch": 97.36623232603272,
"eval_loss": 6.383345127105713,
"eval_runtime": 172.9449,
"eval_samples_per_second": 57.822,
"eval_steps_per_second": 7.228,
"step": 43900
},
{
"epoch": 97.588023288051,
"grad_norm": 0.3146020174026489,
"learning_rate": 1.6053511705685618e-06,
"loss": 6.3736,
"step": 44000
},
{
"epoch": 97.588023288051,
"eval_loss": 6.38405179977417,
"eval_runtime": 175.4959,
"eval_samples_per_second": 56.981,
"eval_steps_per_second": 7.123,
"step": 44000
},
{
"epoch": 97.80981425006931,
"grad_norm": 0.30886611342430115,
"learning_rate": 1.5953177257525083e-06,
"loss": 6.3706,
"step": 44100
},
{
"epoch": 97.80981425006931,
"eval_loss": 6.381131172180176,
"eval_runtime": 172.9957,
"eval_samples_per_second": 57.805,
"eval_steps_per_second": 7.226,
"step": 44100
},
{
"epoch": 98.03160521208761,
"grad_norm": 0.3250170648097992,
"learning_rate": 1.5852842809364549e-06,
"loss": 6.3711,
"step": 44200
},
{
"epoch": 98.03160521208761,
"eval_loss": 6.382991313934326,
"eval_runtime": 175.9006,
"eval_samples_per_second": 56.85,
"eval_steps_per_second": 7.106,
"step": 44200
},
{
"epoch": 98.2533961741059,
"grad_norm": 0.2637650966644287,
"learning_rate": 1.5752508361204012e-06,
"loss": 6.3721,
"step": 44300
},
{
"epoch": 98.2533961741059,
"eval_loss": 6.385432243347168,
"eval_runtime": 175.8265,
"eval_samples_per_second": 56.874,
"eval_steps_per_second": 7.109,
"step": 44300
},
{
"epoch": 98.4751871361242,
"grad_norm": 0.3357675075531006,
"learning_rate": 1.5652173913043478e-06,
"loss": 6.371,
"step": 44400
},
{
"epoch": 98.4751871361242,
"eval_loss": 6.385194301605225,
"eval_runtime": 175.8373,
"eval_samples_per_second": 56.871,
"eval_steps_per_second": 7.109,
"step": 44400
},
{
"epoch": 98.6969780981425,
"grad_norm": 0.3793193995952606,
"learning_rate": 1.5551839464882943e-06,
"loss": 6.3717,
"step": 44500
},
{
"epoch": 98.6969780981425,
"eval_loss": 6.382778167724609,
"eval_runtime": 173.4199,
"eval_samples_per_second": 57.664,
"eval_steps_per_second": 7.208,
"step": 44500
},
{
"epoch": 98.9187690601608,
"grad_norm": 0.3075515329837799,
"learning_rate": 1.5451505016722409e-06,
"loss": 6.3705,
"step": 44600
},
{
"epoch": 98.9187690601608,
"eval_loss": 6.384821891784668,
"eval_runtime": 175.4722,
"eval_samples_per_second": 56.989,
"eval_steps_per_second": 7.124,
"step": 44600
},
{
"epoch": 99.1405600221791,
"grad_norm": 0.27654966711997986,
"learning_rate": 1.5351170568561872e-06,
"loss": 6.3725,
"step": 44700
},
{
"epoch": 99.1405600221791,
"eval_loss": 6.378158092498779,
"eval_runtime": 173.0439,
"eval_samples_per_second": 57.789,
"eval_steps_per_second": 7.224,
"step": 44700
},
{
"epoch": 99.36235098419739,
"grad_norm": 0.25358349084854126,
"learning_rate": 1.5250836120401338e-06,
"loss": 6.3718,
"step": 44800
},
{
"epoch": 99.36235098419739,
"eval_loss": 6.381252288818359,
"eval_runtime": 175.5178,
"eval_samples_per_second": 56.974,
"eval_steps_per_second": 7.122,
"step": 44800
},
{
"epoch": 99.5841419462157,
"grad_norm": 0.27983585000038147,
"learning_rate": 1.5150501672240803e-06,
"loss": 6.3709,
"step": 44900
},
{
"epoch": 99.5841419462157,
"eval_loss": 6.383197784423828,
"eval_runtime": 175.4823,
"eval_samples_per_second": 56.986,
"eval_steps_per_second": 7.123,
"step": 44900
},
{
"epoch": 99.80593290823398,
"grad_norm": 0.35121074318885803,
"learning_rate": 1.5050167224080269e-06,
"loss": 6.3726,
"step": 45000
},
{
"epoch": 99.80593290823398,
"eval_loss": 6.385370254516602,
"eval_runtime": 175.4757,
"eval_samples_per_second": 56.988,
"eval_steps_per_second": 7.123,
"step": 45000
},
{
"epoch": 100.02772387025229,
"grad_norm": 0.22111310064792633,
"learning_rate": 1.4949832775919732e-06,
"loss": 6.3716,
"step": 45100
},
{
"epoch": 100.02772387025229,
"eval_loss": 6.38284158706665,
"eval_runtime": 175.8886,
"eval_samples_per_second": 56.854,
"eval_steps_per_second": 7.107,
"step": 45100
},
{
"epoch": 100.24951483227059,
"grad_norm": 0.22795332968235016,
"learning_rate": 1.4849498327759198e-06,
"loss": 6.3721,
"step": 45200
},
{
"epoch": 100.24951483227059,
"eval_loss": 6.378814697265625,
"eval_runtime": 173.36,
"eval_samples_per_second": 57.683,
"eval_steps_per_second": 7.21,
"step": 45200
},
{
"epoch": 100.47130579428888,
"grad_norm": 0.3906308710575104,
"learning_rate": 1.4749163879598663e-06,
"loss": 6.3711,
"step": 45300
},
{
"epoch": 100.47130579428888,
"eval_loss": 6.380859375,
"eval_runtime": 175.65,
"eval_samples_per_second": 56.931,
"eval_steps_per_second": 7.116,
"step": 45300
},
{
"epoch": 100.69309675630718,
"grad_norm": 0.35361433029174805,
"learning_rate": 1.4648829431438129e-06,
"loss": 6.3689,
"step": 45400
},
{
"epoch": 100.69309675630718,
"eval_loss": 6.386940956115723,
"eval_runtime": 173.0816,
"eval_samples_per_second": 57.776,
"eval_steps_per_second": 7.222,
"step": 45400
},
{
"epoch": 100.91488771832547,
"grad_norm": 0.3520587682723999,
"learning_rate": 1.4548494983277592e-06,
"loss": 6.371,
"step": 45500
},
{
"epoch": 100.91488771832547,
"eval_loss": 6.384310245513916,
"eval_runtime": 175.5343,
"eval_samples_per_second": 56.969,
"eval_steps_per_second": 7.121,
"step": 45500
},
{
"epoch": 101.13667868034378,
"grad_norm": 0.37038084864616394,
"learning_rate": 1.4448160535117058e-06,
"loss": 6.3712,
"step": 45600
},
{
"epoch": 101.13667868034378,
"eval_loss": 6.381255626678467,
"eval_runtime": 172.9314,
"eval_samples_per_second": 57.826,
"eval_steps_per_second": 7.228,
"step": 45600
},
{
"epoch": 101.35846964236207,
"grad_norm": 0.2583162188529968,
"learning_rate": 1.4347826086956523e-06,
"loss": 6.3693,
"step": 45700
},
{
"epoch": 101.35846964236207,
"eval_loss": 6.385676383972168,
"eval_runtime": 175.4492,
"eval_samples_per_second": 56.997,
"eval_steps_per_second": 7.125,
"step": 45700
},
{
"epoch": 101.58026060438037,
"grad_norm": 0.37049952149391174,
"learning_rate": 1.4247491638795989e-06,
"loss": 6.3715,
"step": 45800
},
{
"epoch": 101.58026060438037,
"eval_loss": 6.383345603942871,
"eval_runtime": 172.9908,
"eval_samples_per_second": 57.807,
"eval_steps_per_second": 7.226,
"step": 45800
},
{
"epoch": 101.80205156639867,
"grad_norm": 0.3586992919445038,
"learning_rate": 1.4147157190635452e-06,
"loss": 6.3709,
"step": 45900
},
{
"epoch": 101.80205156639867,
"eval_loss": 6.383970260620117,
"eval_runtime": 175.5127,
"eval_samples_per_second": 56.976,
"eval_steps_per_second": 7.122,
"step": 45900
},
{
"epoch": 102.02384252841696,
"grad_norm": 0.274954229593277,
"learning_rate": 1.4046822742474917e-06,
"loss": 6.3721,
"step": 46000
},
{
"epoch": 102.02384252841696,
"eval_loss": 6.379533767700195,
"eval_runtime": 175.5086,
"eval_samples_per_second": 56.977,
"eval_steps_per_second": 7.122,
"step": 46000
},
{
"epoch": 102.24563349043527,
"grad_norm": 0.2859888970851898,
"learning_rate": 1.3946488294314383e-06,
"loss": 6.3704,
"step": 46100
},
{
"epoch": 102.24563349043527,
"eval_loss": 6.3819146156311035,
"eval_runtime": 175.6284,
"eval_samples_per_second": 56.938,
"eval_steps_per_second": 7.117,
"step": 46100
},
{
"epoch": 102.46742445245356,
"grad_norm": 0.27162763476371765,
"learning_rate": 1.3846153846153846e-06,
"loss": 6.3718,
"step": 46200
},
{
"epoch": 102.46742445245356,
"eval_loss": 6.383949279785156,
"eval_runtime": 173.0341,
"eval_samples_per_second": 57.792,
"eval_steps_per_second": 7.224,
"step": 46200
},
{
"epoch": 102.68921541447186,
"grad_norm": 0.24669644236564636,
"learning_rate": 1.374581939799331e-06,
"loss": 6.3706,
"step": 46300
},
{
"epoch": 102.68921541447186,
"eval_loss": 6.384088516235352,
"eval_runtime": 175.8327,
"eval_samples_per_second": 56.872,
"eval_steps_per_second": 7.109,
"step": 46300
},
{
"epoch": 102.91100637649016,
"grad_norm": 0.32821038365364075,
"learning_rate": 1.3645484949832775e-06,
"loss": 6.3716,
"step": 46400
},
{
"epoch": 102.91100637649016,
"eval_loss": 6.383686065673828,
"eval_runtime": 173.1011,
"eval_samples_per_second": 57.77,
"eval_steps_per_second": 7.221,
"step": 46400
},
{
"epoch": 103.13279733850845,
"grad_norm": 0.23931552469730377,
"learning_rate": 1.354515050167224e-06,
"loss": 6.3706,
"step": 46500
},
{
"epoch": 103.13279733850845,
"eval_loss": 6.379798412322998,
"eval_runtime": 175.5988,
"eval_samples_per_second": 56.948,
"eval_steps_per_second": 7.118,
"step": 46500
},
{
"epoch": 103.35458830052676,
"grad_norm": 0.2975938022136688,
"learning_rate": 1.3444816053511706e-06,
"loss": 6.3713,
"step": 46600
},
{
"epoch": 103.35458830052676,
"eval_loss": 6.3860554695129395,
"eval_runtime": 175.5887,
"eval_samples_per_second": 56.951,
"eval_steps_per_second": 7.119,
"step": 46600
},
{
"epoch": 103.57637926254505,
"grad_norm": 0.2592810392379761,
"learning_rate": 1.334448160535117e-06,
"loss": 6.3717,
"step": 46700
},
{
"epoch": 103.57637926254505,
"eval_loss": 6.3828301429748535,
"eval_runtime": 175.6957,
"eval_samples_per_second": 56.917,
"eval_steps_per_second": 7.115,
"step": 46700
},
{
"epoch": 103.79817022456335,
"grad_norm": 0.2834523320198059,
"learning_rate": 1.3244147157190635e-06,
"loss": 6.3713,
"step": 46800
},
{
"epoch": 103.79817022456335,
"eval_loss": 6.386697769165039,
"eval_runtime": 172.9159,
"eval_samples_per_second": 57.832,
"eval_steps_per_second": 7.229,
"step": 46800
},
{
"epoch": 104.01996118658165,
"grad_norm": 0.2672658860683441,
"learning_rate": 1.31438127090301e-06,
"loss": 6.3721,
"step": 46900
},
{
"epoch": 104.01996118658165,
"eval_loss": 6.381076812744141,
"eval_runtime": 173.0101,
"eval_samples_per_second": 57.8,
"eval_steps_per_second": 7.225,
"step": 46900
},
{
"epoch": 104.24175214859994,
"grad_norm": 0.29608866572380066,
"learning_rate": 1.3043478260869566e-06,
"loss": 6.3722,
"step": 47000
},
{
"epoch": 104.24175214859994,
"eval_loss": 6.383474826812744,
"eval_runtime": 175.9295,
"eval_samples_per_second": 56.841,
"eval_steps_per_second": 7.105,
"step": 47000
},
{
"epoch": 104.46354311061825,
"grad_norm": 0.31595227122306824,
"learning_rate": 1.294314381270903e-06,
"loss": 6.3715,
"step": 47100
},
{
"epoch": 104.46354311061825,
"eval_loss": 6.382750988006592,
"eval_runtime": 173.1316,
"eval_samples_per_second": 57.76,
"eval_steps_per_second": 7.22,
"step": 47100
},
{
"epoch": 104.68533407263654,
"grad_norm": 0.2782845199108124,
"learning_rate": 1.2842809364548495e-06,
"loss": 6.3715,
"step": 47200
},
{
"epoch": 104.68533407263654,
"eval_loss": 6.381110191345215,
"eval_runtime": 175.6479,
"eval_samples_per_second": 56.932,
"eval_steps_per_second": 7.117,
"step": 47200
},
{
"epoch": 104.90712503465484,
"grad_norm": 0.32985934615135193,
"learning_rate": 1.274247491638796e-06,
"loss": 6.3707,
"step": 47300
},
{
"epoch": 104.90712503465484,
"eval_loss": 6.380244731903076,
"eval_runtime": 173.0618,
"eval_samples_per_second": 57.783,
"eval_steps_per_second": 7.223,
"step": 47300
},
{
"epoch": 105.12891599667313,
"grad_norm": 0.27673158049583435,
"learning_rate": 1.2642140468227424e-06,
"loss": 6.371,
"step": 47400
},
{
"epoch": 105.12891599667313,
"eval_loss": 6.382138252258301,
"eval_runtime": 175.4509,
"eval_samples_per_second": 56.996,
"eval_steps_per_second": 7.125,
"step": 47400
},
{
"epoch": 105.35070695869143,
"grad_norm": 0.2984777092933655,
"learning_rate": 1.254180602006689e-06,
"loss": 6.3719,
"step": 47500
},
{
"epoch": 105.35070695869143,
"eval_loss": 6.382594585418701,
"eval_runtime": 173.0122,
"eval_samples_per_second": 57.799,
"eval_steps_per_second": 7.225,
"step": 47500
},
{
"epoch": 105.57249792070974,
"grad_norm": 0.29209384322166443,
"learning_rate": 1.2441471571906355e-06,
"loss": 6.3715,
"step": 47600
},
{
"epoch": 105.57249792070974,
"eval_loss": 6.38098669052124,
"eval_runtime": 175.7524,
"eval_samples_per_second": 56.898,
"eval_steps_per_second": 7.112,
"step": 47600
},
{
"epoch": 105.79428888272803,
"grad_norm": 0.35189709067344666,
"learning_rate": 1.234113712374582e-06,
"loss": 6.3701,
"step": 47700
},
{
"epoch": 105.79428888272803,
"eval_loss": 6.384945392608643,
"eval_runtime": 175.5438,
"eval_samples_per_second": 56.966,
"eval_steps_per_second": 7.121,
"step": 47700
},
{
"epoch": 106.01607984474633,
"grad_norm": 0.37181735038757324,
"learning_rate": 1.2240802675585284e-06,
"loss": 6.3703,
"step": 47800
},
{
"epoch": 106.01607984474633,
"eval_loss": 6.378709316253662,
"eval_runtime": 175.523,
"eval_samples_per_second": 56.973,
"eval_steps_per_second": 7.122,
"step": 47800
},
{
"epoch": 106.23787080676462,
"grad_norm": 0.2793137729167938,
"learning_rate": 1.214046822742475e-06,
"loss": 6.3706,
"step": 47900
},
{
"epoch": 106.23787080676462,
"eval_loss": 6.380676746368408,
"eval_runtime": 173.0355,
"eval_samples_per_second": 57.792,
"eval_steps_per_second": 7.224,
"step": 47900
},
{
"epoch": 106.45966176878292,
"grad_norm": 0.2996074855327606,
"learning_rate": 1.2040133779264215e-06,
"loss": 6.3714,
"step": 48000
},
{
"epoch": 106.45966176878292,
"eval_loss": 6.382739067077637,
"eval_runtime": 175.5807,
"eval_samples_per_second": 56.954,
"eval_steps_per_second": 7.119,
"step": 48000
},
{
"epoch": 106.68145273080123,
"grad_norm": 0.32835853099823,
"learning_rate": 1.193979933110368e-06,
"loss": 6.3717,
"step": 48100
},
{
"epoch": 106.68145273080123,
"eval_loss": 6.382002353668213,
"eval_runtime": 173.3264,
"eval_samples_per_second": 57.695,
"eval_steps_per_second": 7.212,
"step": 48100
},
{
"epoch": 106.90324369281952,
"grad_norm": 0.31071096658706665,
"learning_rate": 1.1839464882943144e-06,
"loss": 6.3715,
"step": 48200
},
{
"epoch": 106.90324369281952,
"eval_loss": 6.385354042053223,
"eval_runtime": 175.863,
"eval_samples_per_second": 56.862,
"eval_steps_per_second": 7.108,
"step": 48200
},
{
"epoch": 107.12503465483782,
"grad_norm": 0.32424595952033997,
"learning_rate": 1.173913043478261e-06,
"loss": 6.3713,
"step": 48300
},
{
"epoch": 107.12503465483782,
"eval_loss": 6.381778240203857,
"eval_runtime": 176.0254,
"eval_samples_per_second": 56.81,
"eval_steps_per_second": 7.101,
"step": 48300
},
{
"epoch": 107.34682561685611,
"grad_norm": 0.25034162402153015,
"learning_rate": 1.1638795986622075e-06,
"loss": 6.3714,
"step": 48400
},
{
"epoch": 107.34682561685611,
"eval_loss": 6.383028507232666,
"eval_runtime": 175.899,
"eval_samples_per_second": 56.851,
"eval_steps_per_second": 7.106,
"step": 48400
},
{
"epoch": 107.56861657887441,
"grad_norm": 0.2586011290550232,
"learning_rate": 1.153846153846154e-06,
"loss": 6.3722,
"step": 48500
},
{
"epoch": 107.56861657887441,
"eval_loss": 6.382985591888428,
"eval_runtime": 175.861,
"eval_samples_per_second": 56.863,
"eval_steps_per_second": 7.108,
"step": 48500
},
{
"epoch": 107.79040754089272,
"grad_norm": 0.28121402859687805,
"learning_rate": 1.1438127090301004e-06,
"loss": 6.3715,
"step": 48600
},
{
"epoch": 107.79040754089272,
"eval_loss": 6.381731986999512,
"eval_runtime": 173.3663,
"eval_samples_per_second": 57.681,
"eval_steps_per_second": 7.21,
"step": 48600
},
{
"epoch": 108.012198502911,
"grad_norm": 0.27013683319091797,
"learning_rate": 1.133779264214047e-06,
"loss": 6.3689,
"step": 48700
},
{
"epoch": 108.012198502911,
"eval_loss": 6.381706237792969,
"eval_runtime": 175.9392,
"eval_samples_per_second": 56.838,
"eval_steps_per_second": 7.105,
"step": 48700
},
{
"epoch": 108.23398946492931,
"grad_norm": 0.345570832490921,
"learning_rate": 1.1237458193979933e-06,
"loss": 6.3706,
"step": 48800
},
{
"epoch": 108.23398946492931,
"eval_loss": 6.384325981140137,
"eval_runtime": 173.2557,
"eval_samples_per_second": 57.718,
"eval_steps_per_second": 7.215,
"step": 48800
},
{
"epoch": 108.4557804269476,
"grad_norm": 0.26037341356277466,
"learning_rate": 1.1137123745819398e-06,
"loss": 6.3728,
"step": 48900
},
{
"epoch": 108.4557804269476,
"eval_loss": 6.383279323577881,
"eval_runtime": 175.662,
"eval_samples_per_second": 56.927,
"eval_steps_per_second": 7.116,
"step": 48900
},
{
"epoch": 108.6775713889659,
"grad_norm": 0.25174733996391296,
"learning_rate": 1.1036789297658862e-06,
"loss": 6.3711,
"step": 49000
},
{
"epoch": 108.6775713889659,
"eval_loss": 6.384110927581787,
"eval_runtime": 173.04,
"eval_samples_per_second": 57.79,
"eval_steps_per_second": 7.224,
"step": 49000
},
{
"epoch": 108.89936235098419,
"grad_norm": 0.22819426655769348,
"learning_rate": 1.0936454849498327e-06,
"loss": 6.3725,
"step": 49100
},
{
"epoch": 108.89936235098419,
"eval_loss": 6.383809566497803,
"eval_runtime": 175.8514,
"eval_samples_per_second": 56.866,
"eval_steps_per_second": 7.108,
"step": 49100
},
{
"epoch": 109.1211533130025,
"grad_norm": 0.3142814636230469,
"learning_rate": 1.0836120401337793e-06,
"loss": 6.3707,
"step": 49200
},
{
"epoch": 109.1211533130025,
"eval_loss": 6.38060998916626,
"eval_runtime": 172.941,
"eval_samples_per_second": 57.823,
"eval_steps_per_second": 7.228,
"step": 49200
},
{
"epoch": 109.3429442750208,
"grad_norm": 0.30092594027519226,
"learning_rate": 1.0735785953177258e-06,
"loss": 6.3699,
"step": 49300
},
{
"epoch": 109.3429442750208,
"eval_loss": 6.385983943939209,
"eval_runtime": 175.5269,
"eval_samples_per_second": 56.971,
"eval_steps_per_second": 7.121,
"step": 49300
},
{
"epoch": 109.56473523703909,
"grad_norm": 0.31821510195732117,
"learning_rate": 1.0635451505016722e-06,
"loss": 6.3694,
"step": 49400
},
{
"epoch": 109.56473523703909,
"eval_loss": 6.383793830871582,
"eval_runtime": 175.5382,
"eval_samples_per_second": 56.968,
"eval_steps_per_second": 7.121,
"step": 49400
},
{
"epoch": 109.78652619905739,
"grad_norm": 0.3837875425815582,
"learning_rate": 1.0535117056856187e-06,
"loss": 6.3701,
"step": 49500
},
{
"epoch": 109.78652619905739,
"eval_loss": 6.380537509918213,
"eval_runtime": 175.935,
"eval_samples_per_second": 56.839,
"eval_steps_per_second": 7.105,
"step": 49500
},
{
"epoch": 110.00831716107568,
"grad_norm": 0.23530994355678558,
"learning_rate": 1.0434782608695653e-06,
"loss": 6.3705,
"step": 49600
},
{
"epoch": 110.00831716107568,
"eval_loss": 6.386258602142334,
"eval_runtime": 175.9707,
"eval_samples_per_second": 56.828,
"eval_steps_per_second": 7.103,
"step": 49600
},
{
"epoch": 110.23010812309398,
"grad_norm": 0.26103320717811584,
"learning_rate": 1.0334448160535118e-06,
"loss": 6.3707,
"step": 49700
},
{
"epoch": 110.23010812309398,
"eval_loss": 6.383273124694824,
"eval_runtime": 173.4608,
"eval_samples_per_second": 57.65,
"eval_steps_per_second": 7.206,
"step": 49700
},
{
"epoch": 110.45189908511229,
"grad_norm": 0.2887881398200989,
"learning_rate": 1.0234113712374581e-06,
"loss": 6.3721,
"step": 49800
},
{
"epoch": 110.45189908511229,
"eval_loss": 6.384125709533691,
"eval_runtime": 172.9625,
"eval_samples_per_second": 57.816,
"eval_steps_per_second": 7.227,
"step": 49800
},
{
"epoch": 110.67369004713058,
"grad_norm": 0.31840309500694275,
"learning_rate": 1.0133779264214047e-06,
"loss": 6.3717,
"step": 49900
},
{
"epoch": 110.67369004713058,
"eval_loss": 6.381842136383057,
"eval_runtime": 175.4749,
"eval_samples_per_second": 56.988,
"eval_steps_per_second": 7.124,
"step": 49900
},
{
"epoch": 110.89548100914888,
"grad_norm": 0.21653781831264496,
"learning_rate": 1.0033444816053512e-06,
"loss": 6.3707,
"step": 50000
},
{
"epoch": 110.89548100914888,
"eval_loss": 6.381892204284668,
"eval_runtime": 175.5709,
"eval_samples_per_second": 56.957,
"eval_steps_per_second": 7.12,
"step": 50000
},
{
"epoch": 111.11727197116717,
"grad_norm": 0.3267481327056885,
"learning_rate": 9.933110367892976e-07,
"loss": 6.3708,
"step": 50100
},
{
"epoch": 111.11727197116717,
"eval_loss": 6.3821611404418945,
"eval_runtime": 172.9472,
"eval_samples_per_second": 57.821,
"eval_steps_per_second": 7.228,
"step": 50100
},
{
"epoch": 111.33906293318547,
"grad_norm": 0.27063196897506714,
"learning_rate": 9.832775919732441e-07,
"loss": 6.3717,
"step": 50200
},
{
"epoch": 111.33906293318547,
"eval_loss": 6.380985736846924,
"eval_runtime": 175.4969,
"eval_samples_per_second": 56.981,
"eval_steps_per_second": 7.123,
"step": 50200
},
{
"epoch": 111.56085389520376,
"grad_norm": 0.3249282240867615,
"learning_rate": 9.732441471571907e-07,
"loss": 6.3712,
"step": 50300
},
{
"epoch": 111.56085389520376,
"eval_loss": 6.380914688110352,
"eval_runtime": 172.9605,
"eval_samples_per_second": 57.817,
"eval_steps_per_second": 7.227,
"step": 50300
},
{
"epoch": 111.78264485722207,
"grad_norm": 0.23895922303199768,
"learning_rate": 9.632107023411372e-07,
"loss": 6.3703,
"step": 50400
},
{
"epoch": 111.78264485722207,
"eval_loss": 6.382885932922363,
"eval_runtime": 175.5673,
"eval_samples_per_second": 56.958,
"eval_steps_per_second": 7.12,
"step": 50400
},
{
"epoch": 112.00443581924037,
"grad_norm": 0.35966283082962036,
"learning_rate": 9.531772575250837e-07,
"loss": 6.37,
"step": 50500
},
{
"epoch": 112.00443581924037,
"eval_loss": 6.383852481842041,
"eval_runtime": 173.4529,
"eval_samples_per_second": 57.653,
"eval_steps_per_second": 7.207,
"step": 50500
},
{
"epoch": 112.22622678125866,
"grad_norm": 0.3259362578392029,
"learning_rate": 9.431438127090301e-07,
"loss": 6.371,
"step": 50600
},
{
"epoch": 112.22622678125866,
"eval_loss": 6.385578155517578,
"eval_runtime": 175.6993,
"eval_samples_per_second": 56.915,
"eval_steps_per_second": 7.114,
"step": 50600
},
{
"epoch": 112.44801774327696,
"grad_norm": 0.26321855187416077,
"learning_rate": 9.331103678929767e-07,
"loss": 6.3701,
"step": 50700
},
{
"epoch": 112.44801774327696,
"eval_loss": 6.380197048187256,
"eval_runtime": 175.6967,
"eval_samples_per_second": 56.916,
"eval_steps_per_second": 7.115,
"step": 50700
},
{
"epoch": 112.66980870529525,
"grad_norm": 0.25881582498550415,
"learning_rate": 9.230769230769231e-07,
"loss": 6.3701,
"step": 50800
},
{
"epoch": 112.66980870529525,
"eval_loss": 6.379401683807373,
"eval_runtime": 175.5648,
"eval_samples_per_second": 56.959,
"eval_steps_per_second": 7.12,
"step": 50800
},
{
"epoch": 112.89159966731356,
"grad_norm": 0.23602035641670227,
"learning_rate": 9.130434782608697e-07,
"loss": 6.3697,
"step": 50900
},
{
"epoch": 112.89159966731356,
"eval_loss": 6.380613803863525,
"eval_runtime": 172.9114,
"eval_samples_per_second": 57.833,
"eval_steps_per_second": 7.229,
"step": 50900
},
{
"epoch": 113.11339062933186,
"grad_norm": 0.3607383072376251,
"learning_rate": 9.030100334448161e-07,
"loss": 6.3717,
"step": 51000
},
{
"epoch": 113.11339062933186,
"eval_loss": 6.3821024894714355,
"eval_runtime": 175.5159,
"eval_samples_per_second": 56.975,
"eval_steps_per_second": 7.122,
"step": 51000
},
{
"epoch": 113.33518159135015,
"grad_norm": 0.313915878534317,
"learning_rate": 8.929765886287627e-07,
"loss": 6.3703,
"step": 51100
},
{
"epoch": 113.33518159135015,
"eval_loss": 6.381007194519043,
"eval_runtime": 175.5953,
"eval_samples_per_second": 56.949,
"eval_steps_per_second": 7.119,
"step": 51100
},
{
"epoch": 113.55697255336845,
"grad_norm": 0.26152804493904114,
"learning_rate": 8.829431438127091e-07,
"loss": 6.3712,
"step": 51200
},
{
"epoch": 113.55697255336845,
"eval_loss": 6.381545543670654,
"eval_runtime": 175.5198,
"eval_samples_per_second": 56.974,
"eval_steps_per_second": 7.122,
"step": 51200
},
{
"epoch": 113.77876351538674,
"grad_norm": 0.31778955459594727,
"learning_rate": 8.729096989966555e-07,
"loss": 6.3718,
"step": 51300
},
{
"epoch": 113.77876351538674,
"eval_loss": 6.384615421295166,
"eval_runtime": 175.9251,
"eval_samples_per_second": 56.842,
"eval_steps_per_second": 7.105,
"step": 51300
},
{
"epoch": 114.00055447740505,
"grad_norm": 0.2694382965564728,
"learning_rate": 8.628762541806019e-07,
"loss": 6.371,
"step": 51400
},
{
"epoch": 114.00055447740505,
"eval_loss": 6.383395671844482,
"eval_runtime": 175.9708,
"eval_samples_per_second": 56.828,
"eval_steps_per_second": 7.103,
"step": 51400
},
{
"epoch": 114.22234543942335,
"grad_norm": 0.29690447449684143,
"learning_rate": 8.528428093645485e-07,
"loss": 6.37,
"step": 51500
},
{
"epoch": 114.22234543942335,
"eval_loss": 6.382811546325684,
"eval_runtime": 173.537,
"eval_samples_per_second": 57.625,
"eval_steps_per_second": 7.203,
"step": 51500
},
{
"epoch": 114.44413640144164,
"grad_norm": 0.39484673738479614,
"learning_rate": 8.428093645484949e-07,
"loss": 6.3711,
"step": 51600
},
{
"epoch": 114.44413640144164,
"eval_loss": 6.382282257080078,
"eval_runtime": 175.9709,
"eval_samples_per_second": 56.828,
"eval_steps_per_second": 7.103,
"step": 51600
},
{
"epoch": 114.66592736345994,
"grad_norm": 0.2630254626274109,
"learning_rate": 8.327759197324414e-07,
"loss": 6.3707,
"step": 51700
},
{
"epoch": 114.66592736345994,
"eval_loss": 6.382809162139893,
"eval_runtime": 176.0003,
"eval_samples_per_second": 56.818,
"eval_steps_per_second": 7.102,
"step": 51700
},
{
"epoch": 114.88771832547823,
"grad_norm": 0.3054973781108856,
"learning_rate": 8.227424749163879e-07,
"loss": 6.3708,
"step": 51800
},
{
"epoch": 114.88771832547823,
"eval_loss": 6.3818230628967285,
"eval_runtime": 173.4807,
"eval_samples_per_second": 57.643,
"eval_steps_per_second": 7.205,
"step": 51800
},
{
"epoch": 115.10950928749654,
"grad_norm": 0.24989312887191772,
"learning_rate": 8.127090301003344e-07,
"loss": 6.3697,
"step": 51900
},
{
"epoch": 115.10950928749654,
"eval_loss": 6.3821187019348145,
"eval_runtime": 175.9917,
"eval_samples_per_second": 56.821,
"eval_steps_per_second": 7.103,
"step": 51900
},
{
"epoch": 115.33130024951483,
"grad_norm": 0.3176492750644684,
"learning_rate": 8.026755852842809e-07,
"loss": 6.3716,
"step": 52000
},
{
"epoch": 115.33130024951483,
"eval_loss": 6.3822808265686035,
"eval_runtime": 173.4423,
"eval_samples_per_second": 57.656,
"eval_steps_per_second": 7.207,
"step": 52000
},
{
"epoch": 115.55309121153313,
"grad_norm": 0.2542394995689392,
"learning_rate": 7.926421404682274e-07,
"loss": 6.3712,
"step": 52100
},
{
"epoch": 115.55309121153313,
"eval_loss": 6.380392074584961,
"eval_runtime": 175.9555,
"eval_samples_per_second": 56.833,
"eval_steps_per_second": 7.104,
"step": 52100
},
{
"epoch": 115.77488217355143,
"grad_norm": 0.2998870313167572,
"learning_rate": 7.826086956521739e-07,
"loss": 6.3699,
"step": 52200
},
{
"epoch": 115.77488217355143,
"eval_loss": 6.381204605102539,
"eval_runtime": 173.1037,
"eval_samples_per_second": 57.769,
"eval_steps_per_second": 7.221,
"step": 52200
},
{
"epoch": 115.99667313556972,
"grad_norm": 0.2524458169937134,
"learning_rate": 7.725752508361204e-07,
"loss": 6.3704,
"step": 52300
},
{
"epoch": 115.99667313556972,
"eval_loss": 6.383292198181152,
"eval_runtime": 175.5333,
"eval_samples_per_second": 56.969,
"eval_steps_per_second": 7.121,
"step": 52300
},
{
"epoch": 116.21846409758803,
"grad_norm": 0.2731904983520508,
"learning_rate": 7.625418060200669e-07,
"loss": 6.3715,
"step": 52400
},
{
"epoch": 116.21846409758803,
"eval_loss": 6.380125999450684,
"eval_runtime": 173.003,
"eval_samples_per_second": 57.802,
"eval_steps_per_second": 7.225,
"step": 52400
},
{
"epoch": 116.44025505960632,
"grad_norm": 0.3370875120162964,
"learning_rate": 7.525083612040134e-07,
"loss": 6.3702,
"step": 52500
},
{
"epoch": 116.44025505960632,
"eval_loss": 6.383055686950684,
"eval_runtime": 175.6351,
"eval_samples_per_second": 56.936,
"eval_steps_per_second": 7.117,
"step": 52500
},
{
"epoch": 116.66204602162462,
"grad_norm": 0.2853044867515564,
"learning_rate": 7.424749163879599e-07,
"loss": 6.3706,
"step": 52600
},
{
"epoch": 116.66204602162462,
"eval_loss": 6.381393909454346,
"eval_runtime": 175.6586,
"eval_samples_per_second": 56.929,
"eval_steps_per_second": 7.116,
"step": 52600
},
{
"epoch": 116.88383698364292,
"grad_norm": 0.3378102481365204,
"learning_rate": 7.324414715719064e-07,
"loss": 6.3701,
"step": 52700
},
{
"epoch": 116.88383698364292,
"eval_loss": 6.380878448486328,
"eval_runtime": 175.5156,
"eval_samples_per_second": 56.975,
"eval_steps_per_second": 7.122,
"step": 52700
},
{
"epoch": 117.10562794566121,
"grad_norm": 0.27575579285621643,
"learning_rate": 7.224080267558529e-07,
"loss": 6.3698,
"step": 52800
},
{
"epoch": 117.10562794566121,
"eval_loss": 6.381886959075928,
"eval_runtime": 175.558,
"eval_samples_per_second": 56.961,
"eval_steps_per_second": 7.12,
"step": 52800
},
{
"epoch": 117.32741890767952,
"grad_norm": 0.22469982504844666,
"learning_rate": 7.123745819397994e-07,
"loss": 6.3689,
"step": 52900
},
{
"epoch": 117.32741890767952,
"eval_loss": 6.378075122833252,
"eval_runtime": 176.0795,
"eval_samples_per_second": 56.793,
"eval_steps_per_second": 7.099,
"step": 52900
},
{
"epoch": 117.5492098696978,
"grad_norm": 0.26414427161216736,
"learning_rate": 7.023411371237459e-07,
"loss": 6.3715,
"step": 53000
},
{
"epoch": 117.5492098696978,
"eval_loss": 6.38188362121582,
"eval_runtime": 176.003,
"eval_samples_per_second": 56.817,
"eval_steps_per_second": 7.102,
"step": 53000
},
{
"epoch": 117.77100083171611,
"grad_norm": 0.2348640114068985,
"learning_rate": 6.923076923076923e-07,
"loss": 6.3699,
"step": 53100
},
{
"epoch": 117.77100083171611,
"eval_loss": 6.382396697998047,
"eval_runtime": 175.8429,
"eval_samples_per_second": 56.869,
"eval_steps_per_second": 7.109,
"step": 53100
},
{
"epoch": 117.99279179373441,
"grad_norm": 0.36397331953048706,
"learning_rate": 6.822742474916388e-07,
"loss": 6.3703,
"step": 53200
},
{
"epoch": 117.99279179373441,
"eval_loss": 6.384123802185059,
"eval_runtime": 174.4907,
"eval_samples_per_second": 57.31,
"eval_steps_per_second": 7.164,
"step": 53200
},
{
"epoch": 118.2145827557527,
"grad_norm": 0.25135567784309387,
"learning_rate": 6.722408026755853e-07,
"loss": 6.3701,
"step": 53300
},
{
"epoch": 118.2145827557527,
"eval_loss": 6.3801045417785645,
"eval_runtime": 173.0674,
"eval_samples_per_second": 57.781,
"eval_steps_per_second": 7.223,
"step": 53300
},
{
"epoch": 118.436373717771,
"grad_norm": 0.30894702672958374,
"learning_rate": 6.622073578595318e-07,
"loss": 6.3702,
"step": 53400
},
{
"epoch": 118.436373717771,
"eval_loss": 6.379894733428955,
"eval_runtime": 175.516,
"eval_samples_per_second": 56.975,
"eval_steps_per_second": 7.122,
"step": 53400
},
{
"epoch": 118.6581646797893,
"grad_norm": 0.26461485028266907,
"learning_rate": 6.521739130434783e-07,
"loss": 6.3713,
"step": 53500
},
{
"epoch": 118.6581646797893,
"eval_loss": 6.3835272789001465,
"eval_runtime": 173.4513,
"eval_samples_per_second": 57.653,
"eval_steps_per_second": 7.207,
"step": 53500
},
{
"epoch": 118.8799556418076,
"grad_norm": 0.24245497584342957,
"learning_rate": 6.421404682274248e-07,
"loss": 6.3705,
"step": 53600
},
{
"epoch": 118.8799556418076,
"eval_loss": 6.381874084472656,
"eval_runtime": 176.0158,
"eval_samples_per_second": 56.813,
"eval_steps_per_second": 7.102,
"step": 53600
},
{
"epoch": 119.10174660382589,
"grad_norm": 0.23844820261001587,
"learning_rate": 6.321070234113712e-07,
"loss": 6.3698,
"step": 53700
},
{
"epoch": 119.10174660382589,
"eval_loss": 6.381486415863037,
"eval_runtime": 173.4108,
"eval_samples_per_second": 57.667,
"eval_steps_per_second": 7.208,
"step": 53700
},
{
"epoch": 119.32353756584419,
"grad_norm": 0.2418413609266281,
"learning_rate": 6.220735785953178e-07,
"loss": 6.3712,
"step": 53800
},
{
"epoch": 119.32353756584419,
"eval_loss": 6.382267951965332,
"eval_runtime": 175.9952,
"eval_samples_per_second": 56.82,
"eval_steps_per_second": 7.102,
"step": 53800
},
{
"epoch": 119.5453285278625,
"grad_norm": 0.22692246735095978,
"learning_rate": 6.120401337792642e-07,
"loss": 6.371,
"step": 53900
},
{
"epoch": 119.5453285278625,
"eval_loss": 6.383540630340576,
"eval_runtime": 173.1726,
"eval_samples_per_second": 57.746,
"eval_steps_per_second": 7.218,
"step": 53900
},
{
"epoch": 119.76711948988078,
"grad_norm": 0.29117047786712646,
"learning_rate": 6.020066889632107e-07,
"loss": 6.3713,
"step": 54000
},
{
"epoch": 119.76711948988078,
"eval_loss": 6.382152557373047,
"eval_runtime": 175.7557,
"eval_samples_per_second": 56.897,
"eval_steps_per_second": 7.112,
"step": 54000
},
{
"epoch": 119.98891045189909,
"grad_norm": 0.21682819724082947,
"learning_rate": 5.919732441471572e-07,
"loss": 6.3702,
"step": 54100
},
{
"epoch": 119.98891045189909,
"eval_loss": 6.380878925323486,
"eval_runtime": 173.0921,
"eval_samples_per_second": 57.773,
"eval_steps_per_second": 7.222,
"step": 54100
},
{
"epoch": 120.21070141391738,
"grad_norm": 0.31245148181915283,
"learning_rate": 5.819397993311037e-07,
"loss": 6.3694,
"step": 54200
},
{
"epoch": 120.21070141391738,
"eval_loss": 6.383978843688965,
"eval_runtime": 175.5232,
"eval_samples_per_second": 56.973,
"eval_steps_per_second": 7.122,
"step": 54200
},
{
"epoch": 120.43249237593568,
"grad_norm": 0.22876819968223572,
"learning_rate": 5.719063545150502e-07,
"loss": 6.3706,
"step": 54300
},
{
"epoch": 120.43249237593568,
"eval_loss": 6.382028102874756,
"eval_runtime": 173.1291,
"eval_samples_per_second": 57.76,
"eval_steps_per_second": 7.22,
"step": 54300
},
{
"epoch": 120.65428333795398,
"grad_norm": 0.25953638553619385,
"learning_rate": 5.618729096989966e-07,
"loss": 6.3707,
"step": 54400
},
{
"epoch": 120.65428333795398,
"eval_loss": 6.381461143493652,
"eval_runtime": 175.57,
"eval_samples_per_second": 56.957,
"eval_steps_per_second": 7.12,
"step": 54400
},
{
"epoch": 120.87607429997227,
"grad_norm": 0.1654128134250641,
"learning_rate": 5.518394648829431e-07,
"loss": 6.3707,
"step": 54500
},
{
"epoch": 120.87607429997227,
"eval_loss": 6.3789753913879395,
"eval_runtime": 175.5387,
"eval_samples_per_second": 56.967,
"eval_steps_per_second": 7.121,
"step": 54500
},
{
"epoch": 121.09786526199058,
"grad_norm": 0.29274898767471313,
"learning_rate": 5.418060200668896e-07,
"loss": 6.3703,
"step": 54600
},
{
"epoch": 121.09786526199058,
"eval_loss": 6.380027770996094,
"eval_runtime": 175.4995,
"eval_samples_per_second": 56.98,
"eval_steps_per_second": 7.123,
"step": 54600
},
{
"epoch": 121.31965622400887,
"grad_norm": 0.2235456258058548,
"learning_rate": 5.317725752508361e-07,
"loss": 6.373,
"step": 54700
},
{
"epoch": 121.31965622400887,
"eval_loss": 6.380786418914795,
"eval_runtime": 175.5186,
"eval_samples_per_second": 56.974,
"eval_steps_per_second": 7.122,
"step": 54700
},
{
"epoch": 121.54144718602717,
"grad_norm": 0.30965185165405273,
"learning_rate": 5.217391304347826e-07,
"loss": 6.3714,
"step": 54800
},
{
"epoch": 121.54144718602717,
"eval_loss": 6.382297039031982,
"eval_runtime": 175.5968,
"eval_samples_per_second": 56.949,
"eval_steps_per_second": 7.119,
"step": 54800
},
{
"epoch": 121.76323814804547,
"grad_norm": 0.28793787956237793,
"learning_rate": 5.117056856187291e-07,
"loss": 6.3707,
"step": 54900
},
{
"epoch": 121.76323814804547,
"eval_loss": 6.377398490905762,
"eval_runtime": 173.0039,
"eval_samples_per_second": 57.802,
"eval_steps_per_second": 7.225,
"step": 54900
},
{
"epoch": 121.98502911006376,
"grad_norm": 0.3277120590209961,
"learning_rate": 5.016722408026756e-07,
"loss": 6.3688,
"step": 55000
},
{
"epoch": 121.98502911006376,
"eval_loss": 6.383605480194092,
"eval_runtime": 173.0503,
"eval_samples_per_second": 57.787,
"eval_steps_per_second": 7.223,
"step": 55000
},
{
"epoch": 122.20682007208207,
"grad_norm": 0.2291731834411621,
"learning_rate": 4.916387959866221e-07,
"loss": 6.3702,
"step": 55100
},
{
"epoch": 122.20682007208207,
"eval_loss": 6.385202407836914,
"eval_runtime": 175.7369,
"eval_samples_per_second": 56.903,
"eval_steps_per_second": 7.113,
"step": 55100
},
{
"epoch": 122.42861103410036,
"grad_norm": 0.23682117462158203,
"learning_rate": 4.816053511705686e-07,
"loss": 6.3711,
"step": 55200
},
{
"epoch": 122.42861103410036,
"eval_loss": 6.386002063751221,
"eval_runtime": 173.0919,
"eval_samples_per_second": 57.773,
"eval_steps_per_second": 7.222,
"step": 55200
},
{
"epoch": 122.65040199611866,
"grad_norm": 0.21502740681171417,
"learning_rate": 4.7157190635451506e-07,
"loss": 6.37,
"step": 55300
},
{
"epoch": 122.65040199611866,
"eval_loss": 6.38268518447876,
"eval_runtime": 175.5194,
"eval_samples_per_second": 56.974,
"eval_steps_per_second": 7.122,
"step": 55300
},
{
"epoch": 122.87219295813695,
"grad_norm": 0.2415875792503357,
"learning_rate": 4.6153846153846156e-07,
"loss": 6.37,
"step": 55400
},
{
"epoch": 122.87219295813695,
"eval_loss": 6.379030704498291,
"eval_runtime": 173.0644,
"eval_samples_per_second": 57.782,
"eval_steps_per_second": 7.223,
"step": 55400
},
{
"epoch": 123.09398392015525,
"grad_norm": 0.278998464345932,
"learning_rate": 4.5150501672240806e-07,
"loss": 6.3709,
"step": 55500
},
{
"epoch": 123.09398392015525,
"eval_loss": 6.381860256195068,
"eval_runtime": 175.5462,
"eval_samples_per_second": 56.965,
"eval_steps_per_second": 7.121,
"step": 55500
},
{
"epoch": 123.31577488217356,
"grad_norm": 0.27015259861946106,
"learning_rate": 4.4147157190635456e-07,
"loss": 6.37,
"step": 55600
},
{
"epoch": 123.31577488217356,
"eval_loss": 6.380960464477539,
"eval_runtime": 175.9714,
"eval_samples_per_second": 56.827,
"eval_steps_per_second": 7.103,
"step": 55600
},
{
"epoch": 123.53756584419185,
"grad_norm": 0.23815931379795074,
"learning_rate": 4.3143812709030095e-07,
"loss": 6.37,
"step": 55700
},
{
"epoch": 123.53756584419185,
"eval_loss": 6.384081840515137,
"eval_runtime": 173.1242,
"eval_samples_per_second": 57.762,
"eval_steps_per_second": 7.22,
"step": 55700
},
{
"epoch": 123.75935680621015,
"grad_norm": 0.24355483055114746,
"learning_rate": 4.2140468227424745e-07,
"loss": 6.3694,
"step": 55800
},
{
"epoch": 123.75935680621015,
"eval_loss": 6.378664016723633,
"eval_runtime": 173.0043,
"eval_samples_per_second": 57.802,
"eval_steps_per_second": 7.225,
"step": 55800
},
{
"epoch": 123.98114776822844,
"grad_norm": 0.21320495009422302,
"learning_rate": 4.1137123745819395e-07,
"loss": 6.3693,
"step": 55900
},
{
"epoch": 123.98114776822844,
"eval_loss": 6.382479190826416,
"eval_runtime": 175.5916,
"eval_samples_per_second": 56.95,
"eval_steps_per_second": 7.119,
"step": 55900
},
{
"epoch": 124.20293873024674,
"grad_norm": 0.2245740443468094,
"learning_rate": 4.0133779264214045e-07,
"loss": 6.3702,
"step": 56000
},
{
"epoch": 124.20293873024674,
"eval_loss": 6.385231971740723,
"eval_runtime": 175.6666,
"eval_samples_per_second": 56.926,
"eval_steps_per_second": 7.116,
"step": 56000
},
{
"epoch": 124.42472969226505,
"grad_norm": 0.282416969537735,
"learning_rate": 3.9130434782608694e-07,
"loss": 6.3709,
"step": 56100
},
{
"epoch": 124.42472969226505,
"eval_loss": 6.380115032196045,
"eval_runtime": 175.7632,
"eval_samples_per_second": 56.895,
"eval_steps_per_second": 7.112,
"step": 56100
},
{
"epoch": 124.64652065428334,
"grad_norm": 0.19661109149456024,
"learning_rate": 3.8127090301003344e-07,
"loss": 6.3712,
"step": 56200
},
{
"epoch": 124.64652065428334,
"eval_loss": 6.3793158531188965,
"eval_runtime": 175.59,
"eval_samples_per_second": 56.951,
"eval_steps_per_second": 7.119,
"step": 56200
},
{
"epoch": 124.86831161630164,
"grad_norm": 0.18216532468795776,
"learning_rate": 3.7123745819397994e-07,
"loss": 6.3703,
"step": 56300
},
{
"epoch": 124.86831161630164,
"eval_loss": 6.381213188171387,
"eval_runtime": 175.6076,
"eval_samples_per_second": 56.945,
"eval_steps_per_second": 7.118,
"step": 56300
},
{
"epoch": 125.09010257831993,
"grad_norm": 0.3018471598625183,
"learning_rate": 3.6120401337792644e-07,
"loss": 6.3706,
"step": 56400
},
{
"epoch": 125.09010257831993,
"eval_loss": 6.3784942626953125,
"eval_runtime": 175.6917,
"eval_samples_per_second": 56.918,
"eval_steps_per_second": 7.115,
"step": 56400
},
{
"epoch": 125.31189354033823,
"grad_norm": 0.21381452679634094,
"learning_rate": 3.5117056856187294e-07,
"loss": 6.3722,
"step": 56500
},
{
"epoch": 125.31189354033823,
"eval_loss": 6.381383419036865,
"eval_runtime": 173.1305,
"eval_samples_per_second": 57.76,
"eval_steps_per_second": 7.22,
"step": 56500
},
{
"epoch": 125.53368450235654,
"grad_norm": 0.23340944945812225,
"learning_rate": 3.411371237458194e-07,
"loss": 6.3698,
"step": 56600
},
{
"epoch": 125.53368450235654,
"eval_loss": 6.380908012390137,
"eval_runtime": 175.6729,
"eval_samples_per_second": 56.924,
"eval_steps_per_second": 7.115,
"step": 56600
},
{
"epoch": 125.75547546437483,
"grad_norm": 0.22507449984550476,
"learning_rate": 3.311036789297659e-07,
"loss": 6.3711,
"step": 56700
},
{
"epoch": 125.75547546437483,
"eval_loss": 6.37741756439209,
"eval_runtime": 172.9897,
"eval_samples_per_second": 57.807,
"eval_steps_per_second": 7.226,
"step": 56700
},
{
"epoch": 125.97726642639313,
"grad_norm": 0.21832765638828278,
"learning_rate": 3.210702341137124e-07,
"loss": 6.3716,
"step": 56800
},
{
"epoch": 125.97726642639313,
"eval_loss": 6.381014823913574,
"eval_runtime": 175.5155,
"eval_samples_per_second": 56.975,
"eval_steps_per_second": 7.122,
"step": 56800
},
{
"epoch": 126.19905738841142,
"grad_norm": 0.27440136671066284,
"learning_rate": 3.110367892976589e-07,
"loss": 6.3728,
"step": 56900
},
{
"epoch": 126.19905738841142,
"eval_loss": 6.3819074630737305,
"eval_runtime": 172.9421,
"eval_samples_per_second": 57.823,
"eval_steps_per_second": 7.228,
"step": 56900
},
{
"epoch": 126.42084835042972,
"grad_norm": 0.24798136949539185,
"learning_rate": 3.010033444816054e-07,
"loss": 6.3702,
"step": 57000
},
{
"epoch": 126.42084835042972,
"eval_loss": 6.379570484161377,
"eval_runtime": 176.0012,
"eval_samples_per_second": 56.818,
"eval_steps_per_second": 7.102,
"step": 57000
},
{
"epoch": 126.64263931244801,
"grad_norm": 0.196645587682724,
"learning_rate": 2.9096989966555187e-07,
"loss": 6.3702,
"step": 57100
},
{
"epoch": 126.64263931244801,
"eval_loss": 6.3817267417907715,
"eval_runtime": 173.0992,
"eval_samples_per_second": 57.77,
"eval_steps_per_second": 7.221,
"step": 57100
},
{
"epoch": 126.86443027446632,
"grad_norm": 0.21966499090194702,
"learning_rate": 2.809364548494983e-07,
"loss": 6.3689,
"step": 57200
},
{
"epoch": 126.86443027446632,
"eval_loss": 6.383100986480713,
"eval_runtime": 175.7334,
"eval_samples_per_second": 56.904,
"eval_steps_per_second": 7.113,
"step": 57200
},
{
"epoch": 127.08622123648462,
"grad_norm": 0.19457194209098816,
"learning_rate": 2.709030100334448e-07,
"loss": 6.371,
"step": 57300
},
{
"epoch": 127.08622123648462,
"eval_loss": 6.381374835968018,
"eval_runtime": 175.566,
"eval_samples_per_second": 56.959,
"eval_steps_per_second": 7.12,
"step": 57300
},
{
"epoch": 127.30801219850291,
"grad_norm": 0.22573208808898926,
"learning_rate": 2.608695652173913e-07,
"loss": 6.3725,
"step": 57400
},
{
"epoch": 127.30801219850291,
"eval_loss": 6.380834579467773,
"eval_runtime": 175.5891,
"eval_samples_per_second": 56.951,
"eval_steps_per_second": 7.119,
"step": 57400
},
{
"epoch": 127.52980316052121,
"grad_norm": 0.2630537748336792,
"learning_rate": 2.508361204013378e-07,
"loss": 6.3689,
"step": 57500
},
{
"epoch": 127.52980316052121,
"eval_loss": 6.380504131317139,
"eval_runtime": 175.5167,
"eval_samples_per_second": 56.975,
"eval_steps_per_second": 7.122,
"step": 57500
},
{
"epoch": 127.7515941225395,
"grad_norm": 0.2693498134613037,
"learning_rate": 2.408026755852843e-07,
"loss": 6.3711,
"step": 57600
},
{
"epoch": 127.7515941225395,
"eval_loss": 6.379264831542969,
"eval_runtime": 175.6659,
"eval_samples_per_second": 56.926,
"eval_steps_per_second": 7.116,
"step": 57600
},
{
"epoch": 127.9733850845578,
"grad_norm": 0.21430125832557678,
"learning_rate": 2.3076923076923078e-07,
"loss": 6.3701,
"step": 57700
},
{
"epoch": 127.9733850845578,
"eval_loss": 6.383444309234619,
"eval_runtime": 175.8385,
"eval_samples_per_second": 56.87,
"eval_steps_per_second": 7.109,
"step": 57700
},
{
"epoch": 128.1951760465761,
"grad_norm": 0.23632164299488068,
"learning_rate": 2.2073578595317728e-07,
"loss": 6.37,
"step": 57800
},
{
"epoch": 128.1951760465761,
"eval_loss": 6.381924152374268,
"eval_runtime": 175.9161,
"eval_samples_per_second": 56.845,
"eval_steps_per_second": 7.106,
"step": 57800
},
{
"epoch": 128.4169670085944,
"grad_norm": 0.20027929544448853,
"learning_rate": 2.1070234113712372e-07,
"loss": 6.3689,
"step": 57900
},
{
"epoch": 128.4169670085944,
"eval_loss": 6.380605697631836,
"eval_runtime": 175.7408,
"eval_samples_per_second": 56.902,
"eval_steps_per_second": 7.113,
"step": 57900
},
{
"epoch": 128.6387579706127,
"grad_norm": 0.24598795175552368,
"learning_rate": 2.0066889632107022e-07,
"loss": 6.3703,
"step": 58000
},
{
"epoch": 128.6387579706127,
"eval_loss": 6.380997180938721,
"eval_runtime": 175.8242,
"eval_samples_per_second": 56.875,
"eval_steps_per_second": 7.109,
"step": 58000
},
{
"epoch": 128.860548932631,
"grad_norm": 0.22210384905338287,
"learning_rate": 1.9063545150501672e-07,
"loss": 6.3713,
"step": 58100
},
{
"epoch": 128.860548932631,
"eval_loss": 6.379730701446533,
"eval_runtime": 175.6297,
"eval_samples_per_second": 56.938,
"eval_steps_per_second": 7.117,
"step": 58100
},
{
"epoch": 129.0823398946493,
"grad_norm": 0.21533408761024475,
"learning_rate": 1.8060200668896322e-07,
"loss": 6.369,
"step": 58200
},
{
"epoch": 129.0823398946493,
"eval_loss": 6.379825592041016,
"eval_runtime": 173.1155,
"eval_samples_per_second": 57.765,
"eval_steps_per_second": 7.221,
"step": 58200
},
{
"epoch": 129.3041308566676,
"grad_norm": 0.24441500008106232,
"learning_rate": 1.705685618729097e-07,
"loss": 6.3712,
"step": 58300
},
{
"epoch": 129.3041308566676,
"eval_loss": 6.380709171295166,
"eval_runtime": 175.6262,
"eval_samples_per_second": 56.939,
"eval_steps_per_second": 7.117,
"step": 58300
},
{
"epoch": 129.5259218186859,
"grad_norm": 0.174821138381958,
"learning_rate": 1.605351170568562e-07,
"loss": 6.3694,
"step": 58400
},
{
"epoch": 129.5259218186859,
"eval_loss": 6.3804545402526855,
"eval_runtime": 174.2415,
"eval_samples_per_second": 57.392,
"eval_steps_per_second": 7.174,
"step": 58400
},
{
"epoch": 129.74771278070418,
"grad_norm": 0.24464456737041473,
"learning_rate": 1.505016722408027e-07,
"loss": 6.3713,
"step": 58500
},
{
"epoch": 129.74771278070418,
"eval_loss": 6.379507541656494,
"eval_runtime": 175.5413,
"eval_samples_per_second": 56.967,
"eval_steps_per_second": 7.121,
"step": 58500
},
{
"epoch": 129.96950374272248,
"grad_norm": 0.1928214728832245,
"learning_rate": 1.4046822742474916e-07,
"loss": 6.37,
"step": 58600
},
{
"epoch": 129.96950374272248,
"eval_loss": 6.384742736816406,
"eval_runtime": 173.0335,
"eval_samples_per_second": 57.792,
"eval_steps_per_second": 7.224,
"step": 58600
},
{
"epoch": 130.19129470474078,
"grad_norm": 0.2976389229297638,
"learning_rate": 1.3043478260869566e-07,
"loss": 6.3717,
"step": 58700
},
{
"epoch": 130.19129470474078,
"eval_loss": 6.3786187171936035,
"eval_runtime": 175.6076,
"eval_samples_per_second": 56.945,
"eval_steps_per_second": 7.118,
"step": 58700
},
{
"epoch": 130.4130856667591,
"grad_norm": 0.25023147463798523,
"learning_rate": 1.2040133779264215e-07,
"loss": 6.3685,
"step": 58800
},
{
"epoch": 130.4130856667591,
"eval_loss": 6.383387565612793,
"eval_runtime": 175.5163,
"eval_samples_per_second": 56.975,
"eval_steps_per_second": 7.122,
"step": 58800
},
{
"epoch": 130.6348766287774,
"grad_norm": 0.21737854182720184,
"learning_rate": 1.1036789297658864e-07,
"loss": 6.3712,
"step": 58900
},
{
"epoch": 130.6348766287774,
"eval_loss": 6.379786491394043,
"eval_runtime": 175.7874,
"eval_samples_per_second": 56.887,
"eval_steps_per_second": 7.111,
"step": 58900
},
{
"epoch": 130.85666759079567,
"grad_norm": 0.18008896708488464,
"learning_rate": 1.0033444816053511e-07,
"loss": 6.3701,
"step": 59000
},
{
"epoch": 130.85666759079567,
"eval_loss": 6.378762722015381,
"eval_runtime": 175.3457,
"eval_samples_per_second": 57.03,
"eval_steps_per_second": 7.129,
"step": 59000
},
{
"epoch": 131.07845855281397,
"grad_norm": 0.26529356837272644,
"learning_rate": 9.030100334448161e-08,
"loss": 6.3706,
"step": 59100
},
{
"epoch": 131.07845855281397,
"eval_loss": 6.384096622467041,
"eval_runtime": 173.4291,
"eval_samples_per_second": 57.66,
"eval_steps_per_second": 7.208,
"step": 59100
},
{
"epoch": 131.30024951483227,
"grad_norm": 0.2854064106941223,
"learning_rate": 8.02675585284281e-08,
"loss": 6.3699,
"step": 59200
},
{
"epoch": 131.30024951483227,
"eval_loss": 6.38028621673584,
"eval_runtime": 175.6366,
"eval_samples_per_second": 56.936,
"eval_steps_per_second": 7.117,
"step": 59200
},
{
"epoch": 131.52204047685058,
"grad_norm": 0.2294512242078781,
"learning_rate": 7.023411371237458e-08,
"loss": 6.3711,
"step": 59300
},
{
"epoch": 131.52204047685058,
"eval_loss": 6.384092330932617,
"eval_runtime": 175.5659,
"eval_samples_per_second": 56.959,
"eval_steps_per_second": 7.12,
"step": 59300
},
{
"epoch": 131.74383143886885,
"grad_norm": 0.23442944884300232,
"learning_rate": 6.020066889632108e-08,
"loss": 6.3704,
"step": 59400
},
{
"epoch": 131.74383143886885,
"eval_loss": 6.382981300354004,
"eval_runtime": 175.5589,
"eval_samples_per_second": 56.961,
"eval_steps_per_second": 7.12,
"step": 59400
},
{
"epoch": 131.96562240088716,
"grad_norm": 0.216475710272789,
"learning_rate": 5.0167224080267556e-08,
"loss": 6.3708,
"step": 59500
},
{
"epoch": 131.96562240088716,
"eval_loss": 6.381463050842285,
"eval_runtime": 175.6519,
"eval_samples_per_second": 56.931,
"eval_steps_per_second": 7.116,
"step": 59500
},
{
"epoch": 132.18741336290546,
"grad_norm": 0.2338051199913025,
"learning_rate": 4.013377926421405e-08,
"loss": 6.3693,
"step": 59600
},
{
"epoch": 132.18741336290546,
"eval_loss": 6.379833698272705,
"eval_runtime": 175.5243,
"eval_samples_per_second": 56.972,
"eval_steps_per_second": 7.122,
"step": 59600
},
{
"epoch": 132.40920432492376,
"grad_norm": 0.20408721268177032,
"learning_rate": 3.010033444816054e-08,
"loss": 6.3683,
"step": 59700
},
{
"epoch": 132.40920432492376,
"eval_loss": 6.38368034362793,
"eval_runtime": 175.3937,
"eval_samples_per_second": 57.015,
"eval_steps_per_second": 7.127,
"step": 59700
},
{
"epoch": 132.63099528694207,
"grad_norm": 0.24998629093170166,
"learning_rate": 2.0066889632107024e-08,
"loss": 6.3697,
"step": 59800
},
{
"epoch": 132.63099528694207,
"eval_loss": 6.381494522094727,
"eval_runtime": 176.0167,
"eval_samples_per_second": 56.813,
"eval_steps_per_second": 7.102,
"step": 59800
},
{
"epoch": 132.85278624896034,
"grad_norm": 0.2178734391927719,
"learning_rate": 1.0033444816053512e-08,
"loss": 6.371,
"step": 59900
},
{
"epoch": 132.85278624896034,
"eval_loss": 6.382035732269287,
"eval_runtime": 173.4585,
"eval_samples_per_second": 57.651,
"eval_steps_per_second": 7.206,
"step": 59900
},
{
"epoch": 133.07457721097865,
"grad_norm": 0.24738912284374237,
"learning_rate": 0.0,
"loss": 6.3696,
"step": 60000
},
{
"epoch": 133.07457721097865,
"eval_loss": 6.382532596588135,
"eval_runtime": 175.9883,
"eval_samples_per_second": 56.822,
"eval_steps_per_second": 7.103,
"step": 60000
}
],
"logging_steps": 100,
"max_steps": 60000,
"num_input_tokens_seen": 0,
"num_train_epochs": 134,
"save_steps": 1000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 7.157205700133659e+19,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}