{ "best_metric": 6.378762722015381, "best_model_checkpoint": "learning_source_20260316/genome_sequence/bert-output/genome_sequence-medium/checkpoint-59000", "epoch": 133.07457721097865, "eval_steps": 100, "global_step": 60000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.22179096201829776, "grad_norm": 0.9846197366714478, "learning_rate": 3e-06, "loss": 8.0812, "step": 100 }, { "epoch": 0.22179096201829776, "eval_loss": 7.739930629730225, "eval_runtime": 175.2691, "eval_samples_per_second": 57.055, "eval_steps_per_second": 7.132, "step": 100 }, { "epoch": 0.4435819240365955, "grad_norm": 0.7575621604919434, "learning_rate": 6e-06, "loss": 7.5841, "step": 200 }, { "epoch": 0.4435819240365955, "eval_loss": 7.357375144958496, "eval_runtime": 174.8425, "eval_samples_per_second": 57.194, "eval_steps_per_second": 7.149, "step": 200 }, { "epoch": 0.6653728860548933, "grad_norm": 2.5145933628082275, "learning_rate": 5.989966555183947e-06, "loss": 7.2475, "step": 300 }, { "epoch": 0.6653728860548933, "eval_loss": 7.029321193695068, "eval_runtime": 174.7596, "eval_samples_per_second": 57.221, "eval_steps_per_second": 7.153, "step": 300 }, { "epoch": 0.887163848073191, "grad_norm": 2.1445820331573486, "learning_rate": 5.979933110367893e-06, "loss": 7.0137, "step": 400 }, { "epoch": 0.887163848073191, "eval_loss": 6.827342987060547, "eval_runtime": 174.7661, "eval_samples_per_second": 57.219, "eval_steps_per_second": 7.152, "step": 400 }, { "epoch": 1.1089548100914888, "grad_norm": 2.264369010925293, "learning_rate": 5.96989966555184e-06, "loss": 6.8546, "step": 500 }, { "epoch": 1.1089548100914888, "eval_loss": 6.712888240814209, "eval_runtime": 174.751, "eval_samples_per_second": 57.224, "eval_steps_per_second": 7.153, "step": 500 }, { "epoch": 1.3307457721097866, "grad_norm": 2.162890911102295, "learning_rate": 5.959866220735786e-06, "loss": 6.7576, "step": 600 }, { "epoch": 1.3307457721097866, "eval_loss": 6.646862506866455, "eval_runtime": 174.794, "eval_samples_per_second": 57.21, "eval_steps_per_second": 7.151, "step": 600 }, { "epoch": 1.5525367341280842, "grad_norm": 1.8144651651382446, "learning_rate": 5.949832775919732e-06, "loss": 6.6931, "step": 700 }, { "epoch": 1.5525367341280842, "eval_loss": 6.603611946105957, "eval_runtime": 174.6927, "eval_samples_per_second": 57.243, "eval_steps_per_second": 7.155, "step": 700 }, { "epoch": 1.774327696146382, "grad_norm": 1.877691626548767, "learning_rate": 5.939799331103679e-06, "loss": 6.6514, "step": 800 }, { "epoch": 1.774327696146382, "eval_loss": 6.573204517364502, "eval_runtime": 174.7097, "eval_samples_per_second": 57.238, "eval_steps_per_second": 7.155, "step": 800 }, { "epoch": 1.9961186581646797, "grad_norm": 0.6093182563781738, "learning_rate": 5.929765886287626e-06, "loss": 6.6173, "step": 900 }, { "epoch": 1.9961186581646797, "eval_loss": 6.551618576049805, "eval_runtime": 174.7562, "eval_samples_per_second": 57.223, "eval_steps_per_second": 7.153, "step": 900 }, { "epoch": 2.2179096201829775, "grad_norm": 0.6819909811019897, "learning_rate": 5.919732441471572e-06, "loss": 6.5884, "step": 1000 }, { "epoch": 2.2179096201829775, "eval_loss": 6.52918004989624, "eval_runtime": 174.672, "eval_samples_per_second": 57.25, "eval_steps_per_second": 7.156, "step": 1000 }, { "epoch": 2.4397005822012754, "grad_norm": 1.7704071998596191, "learning_rate": 5.9096989966555185e-06, "loss": 6.5703, "step": 1100 }, { "epoch": 2.4397005822012754, "eval_loss": 6.522487640380859, "eval_runtime": 173.2328, "eval_samples_per_second": 57.726, "eval_steps_per_second": 7.216, "step": 1100 }, { "epoch": 2.6614915442195732, "grad_norm": 0.34777677059173584, "learning_rate": 5.899665551839465e-06, "loss": 6.5571, "step": 1200 }, { "epoch": 2.6614915442195732, "eval_loss": 6.512077808380127, "eval_runtime": 174.7576, "eval_samples_per_second": 57.222, "eval_steps_per_second": 7.153, "step": 1200 }, { "epoch": 2.8832825062378706, "grad_norm": 0.40927115082740784, "learning_rate": 5.889632107023412e-06, "loss": 6.5403, "step": 1300 }, { "epoch": 2.8832825062378706, "eval_loss": 6.502514839172363, "eval_runtime": 174.8661, "eval_samples_per_second": 57.187, "eval_steps_per_second": 7.148, "step": 1300 }, { "epoch": 3.1050734682561685, "grad_norm": 1.628187894821167, "learning_rate": 5.879598662207358e-06, "loss": 6.5271, "step": 1400 }, { "epoch": 3.1050734682561685, "eval_loss": 6.495711803436279, "eval_runtime": 173.1482, "eval_samples_per_second": 57.754, "eval_steps_per_second": 7.219, "step": 1400 }, { "epoch": 3.3268644302744663, "grad_norm": 0.2986718416213989, "learning_rate": 5.869565217391305e-06, "loss": 6.5149, "step": 1500 }, { "epoch": 3.3268644302744663, "eval_loss": 6.48654317855835, "eval_runtime": 174.8407, "eval_samples_per_second": 57.195, "eval_steps_per_second": 7.149, "step": 1500 }, { "epoch": 3.548655392292764, "grad_norm": 0.2633047103881836, "learning_rate": 5.8595317725752514e-06, "loss": 6.5072, "step": 1600 }, { "epoch": 3.548655392292764, "eval_loss": 6.482588291168213, "eval_runtime": 172.4593, "eval_samples_per_second": 57.985, "eval_steps_per_second": 7.248, "step": 1600 }, { "epoch": 3.770446354311062, "grad_norm": 1.4255868196487427, "learning_rate": 5.849498327759197e-06, "loss": 6.5022, "step": 1700 }, { "epoch": 3.770446354311062, "eval_loss": 6.47553825378418, "eval_runtime": 174.7719, "eval_samples_per_second": 57.217, "eval_steps_per_second": 7.152, "step": 1700 }, { "epoch": 3.9922373163293594, "grad_norm": 0.9065702557563782, "learning_rate": 5.839464882943144e-06, "loss": 6.4959, "step": 1800 }, { "epoch": 3.9922373163293594, "eval_loss": 6.473917484283447, "eval_runtime": 174.8124, "eval_samples_per_second": 57.204, "eval_steps_per_second": 7.151, "step": 1800 }, { "epoch": 4.214028278347658, "grad_norm": 0.566608190536499, "learning_rate": 5.829431438127091e-06, "loss": 6.4886, "step": 1900 }, { "epoch": 4.214028278347658, "eval_loss": 6.469518184661865, "eval_runtime": 174.705, "eval_samples_per_second": 57.239, "eval_steps_per_second": 7.155, "step": 1900 }, { "epoch": 4.435819240365955, "grad_norm": 0.48451030254364014, "learning_rate": 5.819397993311037e-06, "loss": 6.4833, "step": 2000 }, { "epoch": 4.435819240365955, "eval_loss": 6.466635704040527, "eval_runtime": 172.7376, "eval_samples_per_second": 57.891, "eval_steps_per_second": 7.236, "step": 2000 }, { "epoch": 4.6576102023842525, "grad_norm": 0.9523207545280457, "learning_rate": 5.8093645484949836e-06, "loss": 6.4787, "step": 2100 }, { "epoch": 4.6576102023842525, "eval_loss": 6.461461544036865, "eval_runtime": 175.0633, "eval_samples_per_second": 57.122, "eval_steps_per_second": 7.14, "step": 2100 }, { "epoch": 4.879401164402551, "grad_norm": 0.5693651437759399, "learning_rate": 5.79933110367893e-06, "loss": 6.4728, "step": 2200 }, { "epoch": 4.879401164402551, "eval_loss": 6.459517478942871, "eval_runtime": 172.5038, "eval_samples_per_second": 57.97, "eval_steps_per_second": 7.246, "step": 2200 }, { "epoch": 5.101192126420848, "grad_norm": 0.5901357531547546, "learning_rate": 5.789297658862876e-06, "loss": 6.4696, "step": 2300 }, { "epoch": 5.101192126420848, "eval_loss": 6.457067012786865, "eval_runtime": 174.7951, "eval_samples_per_second": 57.21, "eval_steps_per_second": 7.151, "step": 2300 }, { "epoch": 5.3229830884391465, "grad_norm": 1.0042142868041992, "learning_rate": 5.779264214046823e-06, "loss": 6.4653, "step": 2400 }, { "epoch": 5.3229830884391465, "eval_loss": 6.453999042510986, "eval_runtime": 172.3564, "eval_samples_per_second": 58.019, "eval_steps_per_second": 7.252, "step": 2400 }, { "epoch": 5.544774050457444, "grad_norm": 0.7791227102279663, "learning_rate": 5.76923076923077e-06, "loss": 6.4633, "step": 2500 }, { "epoch": 5.544774050457444, "eval_loss": 6.451336860656738, "eval_runtime": 174.7261, "eval_samples_per_second": 57.232, "eval_steps_per_second": 7.154, "step": 2500 }, { "epoch": 5.766565012475741, "grad_norm": 0.8784928321838379, "learning_rate": 5.759197324414716e-06, "loss": 6.4583, "step": 2600 }, { "epoch": 5.766565012475741, "eval_loss": 6.448914051055908, "eval_runtime": 174.6875, "eval_samples_per_second": 57.245, "eval_steps_per_second": 7.156, "step": 2600 }, { "epoch": 5.98835597449404, "grad_norm": 0.5964264869689941, "learning_rate": 5.7491638795986624e-06, "loss": 6.4565, "step": 2700 }, { "epoch": 5.98835597449404, "eval_loss": 6.4472856521606445, "eval_runtime": 174.8096, "eval_samples_per_second": 57.205, "eval_steps_per_second": 7.151, "step": 2700 }, { "epoch": 6.210146936512337, "grad_norm": 0.9274541735649109, "learning_rate": 5.739130434782609e-06, "loss": 6.4532, "step": 2800 }, { "epoch": 6.210146936512337, "eval_loss": 6.4447126388549805, "eval_runtime": 174.8407, "eval_samples_per_second": 57.195, "eval_steps_per_second": 7.149, "step": 2800 }, { "epoch": 6.431937898530635, "grad_norm": 1.001717209815979, "learning_rate": 5.729096989966555e-06, "loss": 6.4502, "step": 2900 }, { "epoch": 6.431937898530635, "eval_loss": 6.442678928375244, "eval_runtime": 174.8426, "eval_samples_per_second": 57.194, "eval_steps_per_second": 7.149, "step": 2900 }, { "epoch": 6.653728860548933, "grad_norm": 1.0303460359573364, "learning_rate": 5.719063545150502e-06, "loss": 6.4461, "step": 3000 }, { "epoch": 6.653728860548933, "eval_loss": 6.441711902618408, "eval_runtime": 174.9081, "eval_samples_per_second": 57.173, "eval_steps_per_second": 7.147, "step": 3000 }, { "epoch": 6.87551982256723, "grad_norm": 0.8993558287620544, "learning_rate": 5.709030100334449e-06, "loss": 6.4441, "step": 3100 }, { "epoch": 6.87551982256723, "eval_loss": 6.4411845207214355, "eval_runtime": 174.9473, "eval_samples_per_second": 57.16, "eval_steps_per_second": 7.145, "step": 3100 }, { "epoch": 7.097310784585528, "grad_norm": 0.8197622299194336, "learning_rate": 5.698996655518395e-06, "loss": 6.4423, "step": 3200 }, { "epoch": 7.097310784585528, "eval_loss": 6.436513900756836, "eval_runtime": 174.8498, "eval_samples_per_second": 57.192, "eval_steps_per_second": 7.149, "step": 3200 }, { "epoch": 7.319101746603826, "grad_norm": 0.8674586415290833, "learning_rate": 5.688963210702341e-06, "loss": 6.4396, "step": 3300 }, { "epoch": 7.319101746603826, "eval_loss": 6.435162544250488, "eval_runtime": 174.7933, "eval_samples_per_second": 57.21, "eval_steps_per_second": 7.151, "step": 3300 }, { "epoch": 7.540892708622124, "grad_norm": 1.1237138509750366, "learning_rate": 5.678929765886288e-06, "loss": 6.436, "step": 3400 }, { "epoch": 7.540892708622124, "eval_loss": 6.435031414031982, "eval_runtime": 175.3834, "eval_samples_per_second": 57.018, "eval_steps_per_second": 7.127, "step": 3400 }, { "epoch": 7.762683670640421, "grad_norm": 0.8178996443748474, "learning_rate": 5.668896321070235e-06, "loss": 6.436, "step": 3500 }, { "epoch": 7.762683670640421, "eval_loss": 6.435057640075684, "eval_runtime": 174.7238, "eval_samples_per_second": 57.233, "eval_steps_per_second": 7.154, "step": 3500 }, { "epoch": 7.984474632658719, "grad_norm": 1.035356044769287, "learning_rate": 5.658862876254181e-06, "loss": 6.4349, "step": 3600 }, { "epoch": 7.984474632658719, "eval_loss": 6.434642791748047, "eval_runtime": 174.772, "eval_samples_per_second": 57.217, "eval_steps_per_second": 7.152, "step": 3600 }, { "epoch": 8.206265594677017, "grad_norm": 0.5910846590995789, "learning_rate": 5.6488294314381275e-06, "loss": 6.4321, "step": 3700 }, { "epoch": 8.206265594677017, "eval_loss": 6.431816577911377, "eval_runtime": 174.7565, "eval_samples_per_second": 57.222, "eval_steps_per_second": 7.153, "step": 3700 }, { "epoch": 8.428056556695315, "grad_norm": 1.0821483135223389, "learning_rate": 5.638795986622074e-06, "loss": 6.4311, "step": 3800 }, { "epoch": 8.428056556695315, "eval_loss": 6.432049751281738, "eval_runtime": 174.8038, "eval_samples_per_second": 57.207, "eval_steps_per_second": 7.151, "step": 3800 }, { "epoch": 8.649847518713612, "grad_norm": 0.3947916328907013, "learning_rate": 5.62876254180602e-06, "loss": 6.4274, "step": 3900 }, { "epoch": 8.649847518713612, "eval_loss": 6.434264183044434, "eval_runtime": 174.9692, "eval_samples_per_second": 57.153, "eval_steps_per_second": 7.144, "step": 3900 }, { "epoch": 8.87163848073191, "grad_norm": 0.9494003653526306, "learning_rate": 5.618729096989967e-06, "loss": 6.4274, "step": 4000 }, { "epoch": 8.87163848073191, "eval_loss": 6.430780410766602, "eval_runtime": 172.2523, "eval_samples_per_second": 58.054, "eval_steps_per_second": 7.257, "step": 4000 }, { "epoch": 9.093429442750208, "grad_norm": 1.1131881475448608, "learning_rate": 5.608695652173914e-06, "loss": 6.4257, "step": 4100 }, { "epoch": 9.093429442750208, "eval_loss": 6.429464817047119, "eval_runtime": 174.8834, "eval_samples_per_second": 57.181, "eval_steps_per_second": 7.148, "step": 4100 }, { "epoch": 9.315220404768505, "grad_norm": 1.5252963304519653, "learning_rate": 5.59866220735786e-06, "loss": 6.4234, "step": 4200 }, { "epoch": 9.315220404768505, "eval_loss": 6.428813934326172, "eval_runtime": 174.7709, "eval_samples_per_second": 57.218, "eval_steps_per_second": 7.152, "step": 4200 }, { "epoch": 9.537011366786803, "grad_norm": 0.7536811828613281, "learning_rate": 5.588628762541806e-06, "loss": 6.4234, "step": 4300 }, { "epoch": 9.537011366786803, "eval_loss": 6.428880214691162, "eval_runtime": 174.7826, "eval_samples_per_second": 57.214, "eval_steps_per_second": 7.152, "step": 4300 }, { "epoch": 9.758802328805102, "grad_norm": 0.6803523302078247, "learning_rate": 5.578595317725753e-06, "loss": 6.4212, "step": 4400 }, { "epoch": 9.758802328805102, "eval_loss": 6.426270484924316, "eval_runtime": 174.8236, "eval_samples_per_second": 57.201, "eval_steps_per_second": 7.15, "step": 4400 }, { "epoch": 9.9805932908234, "grad_norm": 0.8163429498672485, "learning_rate": 5.568561872909699e-06, "loss": 6.4165, "step": 4500 }, { "epoch": 9.9805932908234, "eval_loss": 6.428164005279541, "eval_runtime": 174.8809, "eval_samples_per_second": 57.182, "eval_steps_per_second": 7.148, "step": 4500 }, { "epoch": 10.202384252841696, "grad_norm": 0.630403459072113, "learning_rate": 5.558528428093646e-06, "loss": 6.4189, "step": 4600 }, { "epoch": 10.202384252841696, "eval_loss": 6.428719520568848, "eval_runtime": 174.7591, "eval_samples_per_second": 57.222, "eval_steps_per_second": 7.153, "step": 4600 }, { "epoch": 10.424175214859995, "grad_norm": 0.8704747557640076, "learning_rate": 5.548494983277593e-06, "loss": 6.4192, "step": 4700 }, { "epoch": 10.424175214859995, "eval_loss": 6.423656463623047, "eval_runtime": 174.707, "eval_samples_per_second": 57.239, "eval_steps_per_second": 7.155, "step": 4700 }, { "epoch": 10.645966176878293, "grad_norm": 1.2153334617614746, "learning_rate": 5.5384615384615385e-06, "loss": 6.4176, "step": 4800 }, { "epoch": 10.645966176878293, "eval_loss": 6.427283763885498, "eval_runtime": 174.8703, "eval_samples_per_second": 57.185, "eval_steps_per_second": 7.148, "step": 4800 }, { "epoch": 10.86775713889659, "grad_norm": 0.9878360629081726, "learning_rate": 5.528428093645485e-06, "loss": 6.4147, "step": 4900 }, { "epoch": 10.86775713889659, "eval_loss": 6.424483776092529, "eval_runtime": 174.7677, "eval_samples_per_second": 57.219, "eval_steps_per_second": 7.152, "step": 4900 }, { "epoch": 11.089548100914888, "grad_norm": 1.1536431312561035, "learning_rate": 5.518394648829432e-06, "loss": 6.4141, "step": 5000 }, { "epoch": 11.089548100914888, "eval_loss": 6.423103332519531, "eval_runtime": 174.7198, "eval_samples_per_second": 57.235, "eval_steps_per_second": 7.154, "step": 5000 }, { "epoch": 11.311339062933186, "grad_norm": 0.5233383774757385, "learning_rate": 5.508361204013378e-06, "loss": 6.4143, "step": 5100 }, { "epoch": 11.311339062933186, "eval_loss": 6.426151275634766, "eval_runtime": 174.8201, "eval_samples_per_second": 57.202, "eval_steps_per_second": 7.15, "step": 5100 }, { "epoch": 11.533130024951483, "grad_norm": 0.4546308219432831, "learning_rate": 5.498327759197324e-06, "loss": 6.4131, "step": 5200 }, { "epoch": 11.533130024951483, "eval_loss": 6.41951322555542, "eval_runtime": 174.8531, "eval_samples_per_second": 57.191, "eval_steps_per_second": 7.149, "step": 5200 }, { "epoch": 11.75492098696978, "grad_norm": 0.7687248587608337, "learning_rate": 5.488294314381271e-06, "loss": 6.4127, "step": 5300 }, { "epoch": 11.75492098696978, "eval_loss": 6.421510696411133, "eval_runtime": 174.8268, "eval_samples_per_second": 57.199, "eval_steps_per_second": 7.15, "step": 5300 }, { "epoch": 11.97671194898808, "grad_norm": 0.6706124544143677, "learning_rate": 5.478260869565217e-06, "loss": 6.4114, "step": 5400 }, { "epoch": 11.97671194898808, "eval_loss": 6.42447566986084, "eval_runtime": 174.8755, "eval_samples_per_second": 57.184, "eval_steps_per_second": 7.148, "step": 5400 }, { "epoch": 12.198502911006376, "grad_norm": 1.165449619293213, "learning_rate": 5.468227424749163e-06, "loss": 6.4112, "step": 5500 }, { "epoch": 12.198502911006376, "eval_loss": 6.423706531524658, "eval_runtime": 174.8245, "eval_samples_per_second": 57.2, "eval_steps_per_second": 7.15, "step": 5500 }, { "epoch": 12.420293873024674, "grad_norm": 0.614251434803009, "learning_rate": 5.45819397993311e-06, "loss": 6.4088, "step": 5600 }, { "epoch": 12.420293873024674, "eval_loss": 6.417710304260254, "eval_runtime": 174.7714, "eval_samples_per_second": 57.218, "eval_steps_per_second": 7.152, "step": 5600 }, { "epoch": 12.642084835042972, "grad_norm": 0.7338353991508484, "learning_rate": 5.448160535117057e-06, "loss": 6.4093, "step": 5700 }, { "epoch": 12.642084835042972, "eval_loss": 6.421204566955566, "eval_runtime": 174.7739, "eval_samples_per_second": 57.217, "eval_steps_per_second": 7.152, "step": 5700 }, { "epoch": 12.86387579706127, "grad_norm": 0.5238298773765564, "learning_rate": 5.438127090301003e-06, "loss": 6.4088, "step": 5800 }, { "epoch": 12.86387579706127, "eval_loss": 6.418464183807373, "eval_runtime": 174.8398, "eval_samples_per_second": 57.195, "eval_steps_per_second": 7.149, "step": 5800 }, { "epoch": 13.085666759079567, "grad_norm": 0.8438045382499695, "learning_rate": 5.4280936454849495e-06, "loss": 6.4059, "step": 5900 }, { "epoch": 13.085666759079567, "eval_loss": 6.41862678527832, "eval_runtime": 174.7377, "eval_samples_per_second": 57.229, "eval_steps_per_second": 7.154, "step": 5900 }, { "epoch": 13.307457721097865, "grad_norm": 0.6270604133605957, "learning_rate": 5.418060200668896e-06, "loss": 6.4083, "step": 6000 }, { "epoch": 13.307457721097865, "eval_loss": 6.420100688934326, "eval_runtime": 174.8134, "eval_samples_per_second": 57.204, "eval_steps_per_second": 7.15, "step": 6000 }, { "epoch": 13.529248683116164, "grad_norm": 0.49625712633132935, "learning_rate": 5.408026755852843e-06, "loss": 6.4065, "step": 6100 }, { "epoch": 13.529248683116164, "eval_loss": 6.41825008392334, "eval_runtime": 174.9176, "eval_samples_per_second": 57.17, "eval_steps_per_second": 7.146, "step": 6100 }, { "epoch": 13.75103964513446, "grad_norm": 0.996813178062439, "learning_rate": 5.397993311036789e-06, "loss": 6.4055, "step": 6200 }, { "epoch": 13.75103964513446, "eval_loss": 6.419356346130371, "eval_runtime": 174.915, "eval_samples_per_second": 57.171, "eval_steps_per_second": 7.146, "step": 6200 }, { "epoch": 13.972830607152758, "grad_norm": 0.9816793203353882, "learning_rate": 5.387959866220736e-06, "loss": 6.4065, "step": 6300 }, { "epoch": 13.972830607152758, "eval_loss": 6.4173455238342285, "eval_runtime": 175.0068, "eval_samples_per_second": 57.141, "eval_steps_per_second": 7.143, "step": 6300 }, { "epoch": 14.194621569171057, "grad_norm": 1.072190761566162, "learning_rate": 5.3779264214046825e-06, "loss": 6.403, "step": 6400 }, { "epoch": 14.194621569171057, "eval_loss": 6.416932582855225, "eval_runtime": 174.8129, "eval_samples_per_second": 57.204, "eval_steps_per_second": 7.151, "step": 6400 }, { "epoch": 14.416412531189353, "grad_norm": 0.8124646544456482, "learning_rate": 5.367892976588628e-06, "loss": 6.4038, "step": 6500 }, { "epoch": 14.416412531189353, "eval_loss": 6.417375087738037, "eval_runtime": 174.7648, "eval_samples_per_second": 57.22, "eval_steps_per_second": 7.152, "step": 6500 }, { "epoch": 14.638203493207651, "grad_norm": 0.6260553002357483, "learning_rate": 5.357859531772575e-06, "loss": 6.4045, "step": 6600 }, { "epoch": 14.638203493207651, "eval_loss": 6.4163103103637695, "eval_runtime": 173.3723, "eval_samples_per_second": 57.679, "eval_steps_per_second": 7.21, "step": 6600 }, { "epoch": 14.85999445522595, "grad_norm": 0.6502517461776733, "learning_rate": 5.347826086956522e-06, "loss": 6.4039, "step": 6700 }, { "epoch": 14.85999445522595, "eval_loss": 6.421817779541016, "eval_runtime": 173.7415, "eval_samples_per_second": 57.557, "eval_steps_per_second": 7.195, "step": 6700 }, { "epoch": 15.081785417244248, "grad_norm": 0.7852392196655273, "learning_rate": 5.337792642140468e-06, "loss": 6.4021, "step": 6800 }, { "epoch": 15.081785417244248, "eval_loss": 6.414952278137207, "eval_runtime": 174.7855, "eval_samples_per_second": 57.213, "eval_steps_per_second": 7.152, "step": 6800 }, { "epoch": 15.303576379262545, "grad_norm": 0.5642409920692444, "learning_rate": 5.327759197324415e-06, "loss": 6.4018, "step": 6900 }, { "epoch": 15.303576379262545, "eval_loss": 6.417159557342529, "eval_runtime": 172.3834, "eval_samples_per_second": 58.01, "eval_steps_per_second": 7.251, "step": 6900 }, { "epoch": 15.525367341280843, "grad_norm": 0.5935277938842773, "learning_rate": 5.317725752508361e-06, "loss": 6.4015, "step": 7000 }, { "epoch": 15.525367341280843, "eval_loss": 6.419808864593506, "eval_runtime": 174.8129, "eval_samples_per_second": 57.204, "eval_steps_per_second": 7.151, "step": 7000 }, { "epoch": 15.747158303299141, "grad_norm": 0.8796281218528748, "learning_rate": 5.307692307692307e-06, "loss": 6.402, "step": 7100 }, { "epoch": 15.747158303299141, "eval_loss": 6.413030624389648, "eval_runtime": 174.991, "eval_samples_per_second": 57.146, "eval_steps_per_second": 7.143, "step": 7100 }, { "epoch": 15.968949265317438, "grad_norm": 0.686579167842865, "learning_rate": 5.297658862876254e-06, "loss": 6.4003, "step": 7200 }, { "epoch": 15.968949265317438, "eval_loss": 6.412362575531006, "eval_runtime": 174.9653, "eval_samples_per_second": 57.154, "eval_steps_per_second": 7.144, "step": 7200 }, { "epoch": 16.190740227335738, "grad_norm": 0.8254374265670776, "learning_rate": 5.287625418060201e-06, "loss": 6.4003, "step": 7300 }, { "epoch": 16.190740227335738, "eval_loss": 6.415155410766602, "eval_runtime": 174.7441, "eval_samples_per_second": 57.227, "eval_steps_per_second": 7.153, "step": 7300 }, { "epoch": 16.412531189354034, "grad_norm": 1.0479621887207031, "learning_rate": 5.277591973244147e-06, "loss": 6.3999, "step": 7400 }, { "epoch": 16.412531189354034, "eval_loss": 6.4169602394104, "eval_runtime": 174.7973, "eval_samples_per_second": 57.209, "eval_steps_per_second": 7.151, "step": 7400 }, { "epoch": 16.63432215137233, "grad_norm": 0.8358107805252075, "learning_rate": 5.2675585284280935e-06, "loss": 6.3989, "step": 7500 }, { "epoch": 16.63432215137233, "eval_loss": 6.417453765869141, "eval_runtime": 174.497, "eval_samples_per_second": 57.308, "eval_steps_per_second": 7.163, "step": 7500 }, { "epoch": 16.85611311339063, "grad_norm": 0.6018221378326416, "learning_rate": 5.25752508361204e-06, "loss": 6.3991, "step": 7600 }, { "epoch": 16.85611311339063, "eval_loss": 6.4126434326171875, "eval_runtime": 172.7161, "eval_samples_per_second": 57.898, "eval_steps_per_second": 7.237, "step": 7600 }, { "epoch": 17.077904075408927, "grad_norm": 1.0999138355255127, "learning_rate": 5.247491638795986e-06, "loss": 6.3981, "step": 7700 }, { "epoch": 17.077904075408927, "eval_loss": 6.413776397705078, "eval_runtime": 174.7746, "eval_samples_per_second": 57.217, "eval_steps_per_second": 7.152, "step": 7700 }, { "epoch": 17.299695037427224, "grad_norm": 0.5430467128753662, "learning_rate": 5.237458193979933e-06, "loss": 6.3993, "step": 7800 }, { "epoch": 17.299695037427224, "eval_loss": 6.414647102355957, "eval_runtime": 174.8381, "eval_samples_per_second": 57.196, "eval_steps_per_second": 7.149, "step": 7800 }, { "epoch": 17.521485999445524, "grad_norm": 0.588058352470398, "learning_rate": 5.22742474916388e-06, "loss": 6.3976, "step": 7900 }, { "epoch": 17.521485999445524, "eval_loss": 6.413895130157471, "eval_runtime": 174.9633, "eval_samples_per_second": 57.155, "eval_steps_per_second": 7.144, "step": 7900 }, { "epoch": 17.74327696146382, "grad_norm": 0.365583598613739, "learning_rate": 5.2173913043478265e-06, "loss": 6.3966, "step": 8000 }, { "epoch": 17.74327696146382, "eval_loss": 6.409445285797119, "eval_runtime": 174.9468, "eval_samples_per_second": 57.16, "eval_steps_per_second": 7.145, "step": 8000 }, { "epoch": 17.965067923482117, "grad_norm": 0.6981125473976135, "learning_rate": 5.207357859531772e-06, "loss": 6.3974, "step": 8100 }, { "epoch": 17.965067923482117, "eval_loss": 6.413646221160889, "eval_runtime": 174.9417, "eval_samples_per_second": 57.162, "eval_steps_per_second": 7.145, "step": 8100 }, { "epoch": 18.186858885500417, "grad_norm": 0.6041765213012695, "learning_rate": 5.197324414715719e-06, "loss": 6.3985, "step": 8200 }, { "epoch": 18.186858885500417, "eval_loss": 6.411979675292969, "eval_runtime": 174.8191, "eval_samples_per_second": 57.202, "eval_steps_per_second": 7.15, "step": 8200 }, { "epoch": 18.408649847518713, "grad_norm": 0.7936201095581055, "learning_rate": 5.187290969899666e-06, "loss": 6.3964, "step": 8300 }, { "epoch": 18.408649847518713, "eval_loss": 6.40911865234375, "eval_runtime": 174.818, "eval_samples_per_second": 57.202, "eval_steps_per_second": 7.15, "step": 8300 }, { "epoch": 18.63044080953701, "grad_norm": 0.6278252005577087, "learning_rate": 5.177257525083612e-06, "loss": 6.3957, "step": 8400 }, { "epoch": 18.63044080953701, "eval_loss": 6.413068771362305, "eval_runtime": 172.3693, "eval_samples_per_second": 58.015, "eval_steps_per_second": 7.252, "step": 8400 }, { "epoch": 18.85223177155531, "grad_norm": 0.6582921743392944, "learning_rate": 5.167224080267559e-06, "loss": 6.3956, "step": 8500 }, { "epoch": 18.85223177155531, "eval_loss": 6.410306453704834, "eval_runtime": 174.8171, "eval_samples_per_second": 57.203, "eval_steps_per_second": 7.15, "step": 8500 }, { "epoch": 19.074022733573607, "grad_norm": 0.8874194622039795, "learning_rate": 5.157190635451505e-06, "loss": 6.3975, "step": 8600 }, { "epoch": 19.074022733573607, "eval_loss": 6.409109592437744, "eval_runtime": 172.2351, "eval_samples_per_second": 58.06, "eval_steps_per_second": 7.258, "step": 8600 }, { "epoch": 19.295813695591903, "grad_norm": 0.589608907699585, "learning_rate": 5.147157190635451e-06, "loss": 6.3957, "step": 8700 }, { "epoch": 19.295813695591903, "eval_loss": 6.413524150848389, "eval_runtime": 174.767, "eval_samples_per_second": 57.219, "eval_steps_per_second": 7.152, "step": 8700 }, { "epoch": 19.517604657610203, "grad_norm": 0.7026548385620117, "learning_rate": 5.137123745819398e-06, "loss": 6.3942, "step": 8800 }, { "epoch": 19.517604657610203, "eval_loss": 6.41259241104126, "eval_runtime": 174.7786, "eval_samples_per_second": 57.215, "eval_steps_per_second": 7.152, "step": 8800 }, { "epoch": 19.7393956196285, "grad_norm": 0.7508072257041931, "learning_rate": 5.127090301003345e-06, "loss": 6.3936, "step": 8900 }, { "epoch": 19.7393956196285, "eval_loss": 6.410432815551758, "eval_runtime": 174.7865, "eval_samples_per_second": 57.213, "eval_steps_per_second": 7.152, "step": 8900 }, { "epoch": 19.9611865816468, "grad_norm": 0.36028188467025757, "learning_rate": 5.117056856187291e-06, "loss": 6.3943, "step": 9000 }, { "epoch": 19.9611865816468, "eval_loss": 6.409936904907227, "eval_runtime": 174.852, "eval_samples_per_second": 57.191, "eval_steps_per_second": 7.149, "step": 9000 }, { "epoch": 20.182977543665096, "grad_norm": 0.8198152184486389, "learning_rate": 5.1070234113712375e-06, "loss": 6.3939, "step": 9100 }, { "epoch": 20.182977543665096, "eval_loss": 6.412051677703857, "eval_runtime": 174.8255, "eval_samples_per_second": 57.2, "eval_steps_per_second": 7.15, "step": 9100 }, { "epoch": 20.404768505683393, "grad_norm": 0.6599276065826416, "learning_rate": 5.096989966555184e-06, "loss": 6.3939, "step": 9200 }, { "epoch": 20.404768505683393, "eval_loss": 6.411386489868164, "eval_runtime": 174.8596, "eval_samples_per_second": 57.189, "eval_steps_per_second": 7.149, "step": 9200 }, { "epoch": 20.626559467701693, "grad_norm": 0.736455500125885, "learning_rate": 5.08695652173913e-06, "loss": 6.3931, "step": 9300 }, { "epoch": 20.626559467701693, "eval_loss": 6.40945291519165, "eval_runtime": 174.8463, "eval_samples_per_second": 57.193, "eval_steps_per_second": 7.149, "step": 9300 }, { "epoch": 20.84835042971999, "grad_norm": 0.7547162175178528, "learning_rate": 5.076923076923077e-06, "loss": 6.393, "step": 9400 }, { "epoch": 20.84835042971999, "eval_loss": 6.409768581390381, "eval_runtime": 175.1356, "eval_samples_per_second": 57.099, "eval_steps_per_second": 7.137, "step": 9400 }, { "epoch": 21.070141391738286, "grad_norm": 0.4197324216365814, "learning_rate": 5.066889632107024e-06, "loss": 6.3943, "step": 9500 }, { "epoch": 21.070141391738286, "eval_loss": 6.4077606201171875, "eval_runtime": 174.8821, "eval_samples_per_second": 57.181, "eval_steps_per_second": 7.148, "step": 9500 }, { "epoch": 21.291932353756586, "grad_norm": 0.6552382111549377, "learning_rate": 5.05685618729097e-06, "loss": 6.3927, "step": 9600 }, { "epoch": 21.291932353756586, "eval_loss": 6.40675163269043, "eval_runtime": 174.964, "eval_samples_per_second": 57.155, "eval_steps_per_second": 7.144, "step": 9600 }, { "epoch": 21.513723315774882, "grad_norm": 0.507618248462677, "learning_rate": 5.046822742474916e-06, "loss": 6.3948, "step": 9700 }, { "epoch": 21.513723315774882, "eval_loss": 6.4094719886779785, "eval_runtime": 174.8996, "eval_samples_per_second": 57.176, "eval_steps_per_second": 7.147, "step": 9700 }, { "epoch": 21.73551427779318, "grad_norm": 1.0394549369812012, "learning_rate": 5.036789297658863e-06, "loss": 6.3933, "step": 9800 }, { "epoch": 21.73551427779318, "eval_loss": 6.411880016326904, "eval_runtime": 174.8373, "eval_samples_per_second": 57.196, "eval_steps_per_second": 7.15, "step": 9800 }, { "epoch": 21.95730523981148, "grad_norm": 0.852592945098877, "learning_rate": 5.02675585284281e-06, "loss": 6.3932, "step": 9900 }, { "epoch": 21.95730523981148, "eval_loss": 6.405695915222168, "eval_runtime": 174.9233, "eval_samples_per_second": 57.168, "eval_steps_per_second": 7.146, "step": 9900 }, { "epoch": 22.179096201829775, "grad_norm": 0.6302698254585266, "learning_rate": 5.016722408026756e-06, "loss": 6.3914, "step": 10000 }, { "epoch": 22.179096201829775, "eval_loss": 6.404843807220459, "eval_runtime": 174.881, "eval_samples_per_second": 57.182, "eval_steps_per_second": 7.148, "step": 10000 }, { "epoch": 22.400887163848072, "grad_norm": 0.5545974969863892, "learning_rate": 5.0066889632107026e-06, "loss": 6.3913, "step": 10100 }, { "epoch": 22.400887163848072, "eval_loss": 6.4088826179504395, "eval_runtime": 174.9362, "eval_samples_per_second": 57.164, "eval_steps_per_second": 7.145, "step": 10100 }, { "epoch": 22.622678125866372, "grad_norm": 0.6303640007972717, "learning_rate": 4.996655518394649e-06, "loss": 6.3916, "step": 10200 }, { "epoch": 22.622678125866372, "eval_loss": 6.406084060668945, "eval_runtime": 174.8669, "eval_samples_per_second": 57.186, "eval_steps_per_second": 7.148, "step": 10200 }, { "epoch": 22.84446908788467, "grad_norm": 0.6866323947906494, "learning_rate": 4.986622073578595e-06, "loss": 6.3922, "step": 10300 }, { "epoch": 22.84446908788467, "eval_loss": 6.406491279602051, "eval_runtime": 172.5199, "eval_samples_per_second": 57.964, "eval_steps_per_second": 7.246, "step": 10300 }, { "epoch": 23.066260049902965, "grad_norm": 0.5681377649307251, "learning_rate": 4.976588628762542e-06, "loss": 6.3919, "step": 10400 }, { "epoch": 23.066260049902965, "eval_loss": 6.407881259918213, "eval_runtime": 174.7996, "eval_samples_per_second": 57.208, "eval_steps_per_second": 7.151, "step": 10400 }, { "epoch": 23.288051011921265, "grad_norm": 0.5302285552024841, "learning_rate": 4.966555183946489e-06, "loss": 6.3928, "step": 10500 }, { "epoch": 23.288051011921265, "eval_loss": 6.4045891761779785, "eval_runtime": 175.2024, "eval_samples_per_second": 57.077, "eval_steps_per_second": 7.135, "step": 10500 }, { "epoch": 23.50984197393956, "grad_norm": 0.5630497336387634, "learning_rate": 4.956521739130435e-06, "loss": 6.3903, "step": 10600 }, { "epoch": 23.50984197393956, "eval_loss": 6.406449317932129, "eval_runtime": 172.7598, "eval_samples_per_second": 57.884, "eval_steps_per_second": 7.235, "step": 10600 }, { "epoch": 23.731632935957858, "grad_norm": 0.5340705513954163, "learning_rate": 4.9464882943143815e-06, "loss": 6.3896, "step": 10700 }, { "epoch": 23.731632935957858, "eval_loss": 6.408339023590088, "eval_runtime": 175.3018, "eval_samples_per_second": 57.044, "eval_steps_per_second": 7.131, "step": 10700 }, { "epoch": 23.95342389797616, "grad_norm": 0.7192414402961731, "learning_rate": 4.936454849498328e-06, "loss": 6.3904, "step": 10800 }, { "epoch": 23.95342389797616, "eval_loss": 6.408904075622559, "eval_runtime": 175.321, "eval_samples_per_second": 57.038, "eval_steps_per_second": 7.13, "step": 10800 }, { "epoch": 24.175214859994455, "grad_norm": 0.7297828197479248, "learning_rate": 4.926421404682274e-06, "loss": 6.3906, "step": 10900 }, { "epoch": 24.175214859994455, "eval_loss": 6.406455993652344, "eval_runtime": 175.3497, "eval_samples_per_second": 57.029, "eval_steps_per_second": 7.129, "step": 10900 }, { "epoch": 24.39700582201275, "grad_norm": 0.8612614870071411, "learning_rate": 4.916387959866221e-06, "loss": 6.389, "step": 11000 }, { "epoch": 24.39700582201275, "eval_loss": 6.4049272537231445, "eval_runtime": 174.9884, "eval_samples_per_second": 57.147, "eval_steps_per_second": 7.143, "step": 11000 }, { "epoch": 24.61879678403105, "grad_norm": 0.39626169204711914, "learning_rate": 4.906354515050168e-06, "loss": 6.3904, "step": 11100 }, { "epoch": 24.61879678403105, "eval_loss": 6.399599075317383, "eval_runtime": 175.2182, "eval_samples_per_second": 57.072, "eval_steps_per_second": 7.134, "step": 11100 }, { "epoch": 24.840587746049348, "grad_norm": 0.47381725907325745, "learning_rate": 4.8963210702341136e-06, "loss": 6.3896, "step": 11200 }, { "epoch": 24.840587746049348, "eval_loss": 6.405921459197998, "eval_runtime": 174.9189, "eval_samples_per_second": 57.169, "eval_steps_per_second": 7.146, "step": 11200 }, { "epoch": 25.062378708067648, "grad_norm": 0.567333996295929, "learning_rate": 4.88628762541806e-06, "loss": 6.3886, "step": 11300 }, { "epoch": 25.062378708067648, "eval_loss": 6.409249782562256, "eval_runtime": 174.8058, "eval_samples_per_second": 57.206, "eval_steps_per_second": 7.151, "step": 11300 }, { "epoch": 25.284169670085944, "grad_norm": 0.47083523869514465, "learning_rate": 4.876254180602007e-06, "loss": 6.3892, "step": 11400 }, { "epoch": 25.284169670085944, "eval_loss": 6.406309604644775, "eval_runtime": 174.8008, "eval_samples_per_second": 57.208, "eval_steps_per_second": 7.151, "step": 11400 }, { "epoch": 25.50596063210424, "grad_norm": 0.4636823832988739, "learning_rate": 4.866220735785953e-06, "loss": 6.3905, "step": 11500 }, { "epoch": 25.50596063210424, "eval_loss": 6.4087066650390625, "eval_runtime": 174.7802, "eval_samples_per_second": 57.215, "eval_steps_per_second": 7.152, "step": 11500 }, { "epoch": 25.72775159412254, "grad_norm": 0.8328993916511536, "learning_rate": 4.8561872909699e-06, "loss": 6.3888, "step": 11600 }, { "epoch": 25.72775159412254, "eval_loss": 6.405496120452881, "eval_runtime": 172.4449, "eval_samples_per_second": 57.99, "eval_steps_per_second": 7.249, "step": 11600 }, { "epoch": 25.949542556140837, "grad_norm": 0.5866479873657227, "learning_rate": 4.8461538461538465e-06, "loss": 6.3895, "step": 11700 }, { "epoch": 25.949542556140837, "eval_loss": 6.4065117835998535, "eval_runtime": 172.4094, "eval_samples_per_second": 58.001, "eval_steps_per_second": 7.25, "step": 11700 }, { "epoch": 26.171333518159134, "grad_norm": 0.7557168006896973, "learning_rate": 4.8361204013377925e-06, "loss": 6.3901, "step": 11800 }, { "epoch": 26.171333518159134, "eval_loss": 6.404352188110352, "eval_runtime": 174.7894, "eval_samples_per_second": 57.212, "eval_steps_per_second": 7.151, "step": 11800 }, { "epoch": 26.393124480177434, "grad_norm": 0.5010234117507935, "learning_rate": 4.826086956521739e-06, "loss": 6.3881, "step": 11900 }, { "epoch": 26.393124480177434, "eval_loss": 6.406057834625244, "eval_runtime": 174.8928, "eval_samples_per_second": 57.178, "eval_steps_per_second": 7.147, "step": 11900 }, { "epoch": 26.61491544219573, "grad_norm": 0.5228267312049866, "learning_rate": 4.816053511705686e-06, "loss": 6.3893, "step": 12000 }, { "epoch": 26.61491544219573, "eval_loss": 6.403919219970703, "eval_runtime": 173.0368, "eval_samples_per_second": 57.791, "eval_steps_per_second": 7.224, "step": 12000 }, { "epoch": 26.836706404214027, "grad_norm": 0.41645535826683044, "learning_rate": 4.806020066889633e-06, "loss": 6.3893, "step": 12100 }, { "epoch": 26.836706404214027, "eval_loss": 6.403182029724121, "eval_runtime": 173.7518, "eval_samples_per_second": 57.553, "eval_steps_per_second": 7.194, "step": 12100 }, { "epoch": 27.058497366232327, "grad_norm": 0.6280103325843811, "learning_rate": 4.795986622073579e-06, "loss": 6.388, "step": 12200 }, { "epoch": 27.058497366232327, "eval_loss": 6.406325817108154, "eval_runtime": 172.5376, "eval_samples_per_second": 57.958, "eval_steps_per_second": 7.245, "step": 12200 }, { "epoch": 27.280288328250624, "grad_norm": 0.4701608419418335, "learning_rate": 4.785953177257525e-06, "loss": 6.3891, "step": 12300 }, { "epoch": 27.280288328250624, "eval_loss": 6.403144836425781, "eval_runtime": 174.7938, "eval_samples_per_second": 57.21, "eval_steps_per_second": 7.151, "step": 12300 }, { "epoch": 27.50207929026892, "grad_norm": 0.49227380752563477, "learning_rate": 4.775919732441472e-06, "loss": 6.3893, "step": 12400 }, { "epoch": 27.50207929026892, "eval_loss": 6.404545783996582, "eval_runtime": 172.6406, "eval_samples_per_second": 57.924, "eval_steps_per_second": 7.24, "step": 12400 }, { "epoch": 27.72387025228722, "grad_norm": 0.5558980703353882, "learning_rate": 4.765886287625418e-06, "loss": 6.3883, "step": 12500 }, { "epoch": 27.72387025228722, "eval_loss": 6.402305603027344, "eval_runtime": 174.8403, "eval_samples_per_second": 57.195, "eval_steps_per_second": 7.149, "step": 12500 }, { "epoch": 27.945661214305517, "grad_norm": 0.7037143707275391, "learning_rate": 4.755852842809365e-06, "loss": 6.3885, "step": 12600 }, { "epoch": 27.945661214305517, "eval_loss": 6.403327465057373, "eval_runtime": 172.6409, "eval_samples_per_second": 57.924, "eval_steps_per_second": 7.24, "step": 12600 }, { "epoch": 28.167452176323813, "grad_norm": 0.8158712983131409, "learning_rate": 4.745819397993312e-06, "loss": 6.3858, "step": 12700 }, { "epoch": 28.167452176323813, "eval_loss": 6.40453577041626, "eval_runtime": 174.7251, "eval_samples_per_second": 57.233, "eval_steps_per_second": 7.154, "step": 12700 }, { "epoch": 28.389243138342113, "grad_norm": 0.49727940559387207, "learning_rate": 4.7357859531772575e-06, "loss": 6.3882, "step": 12800 }, { "epoch": 28.389243138342113, "eval_loss": 6.404928684234619, "eval_runtime": 174.7858, "eval_samples_per_second": 57.213, "eval_steps_per_second": 7.152, "step": 12800 }, { "epoch": 28.61103410036041, "grad_norm": 0.5173976421356201, "learning_rate": 4.725752508361204e-06, "loss": 6.3866, "step": 12900 }, { "epoch": 28.61103410036041, "eval_loss": 6.40172815322876, "eval_runtime": 174.5561, "eval_samples_per_second": 57.288, "eval_steps_per_second": 7.161, "step": 12900 }, { "epoch": 28.832825062378706, "grad_norm": 0.5842565298080444, "learning_rate": 4.715719063545151e-06, "loss": 6.3891, "step": 13000 }, { "epoch": 28.832825062378706, "eval_loss": 6.401641845703125, "eval_runtime": 172.3935, "eval_samples_per_second": 58.007, "eval_steps_per_second": 7.251, "step": 13000 }, { "epoch": 29.054616024397006, "grad_norm": 0.6438339948654175, "learning_rate": 4.705685618729097e-06, "loss": 6.3869, "step": 13100 }, { "epoch": 29.054616024397006, "eval_loss": 6.403342247009277, "eval_runtime": 174.8489, "eval_samples_per_second": 57.192, "eval_steps_per_second": 7.149, "step": 13100 }, { "epoch": 29.276406986415303, "grad_norm": 0.5338951349258423, "learning_rate": 4.695652173913044e-06, "loss": 6.3882, "step": 13200 }, { "epoch": 29.276406986415303, "eval_loss": 6.400930404663086, "eval_runtime": 172.4302, "eval_samples_per_second": 57.994, "eval_steps_per_second": 7.249, "step": 13200 }, { "epoch": 29.498197948433603, "grad_norm": 0.5359793305397034, "learning_rate": 4.6856187290969905e-06, "loss": 6.3878, "step": 13300 }, { "epoch": 29.498197948433603, "eval_loss": 6.406982898712158, "eval_runtime": 174.802, "eval_samples_per_second": 57.208, "eval_steps_per_second": 7.151, "step": 13300 }, { "epoch": 29.7199889104519, "grad_norm": 0.715033233165741, "learning_rate": 4.675585284280936e-06, "loss": 6.3859, "step": 13400 }, { "epoch": 29.7199889104519, "eval_loss": 6.40342903137207, "eval_runtime": 174.8452, "eval_samples_per_second": 57.193, "eval_steps_per_second": 7.149, "step": 13400 }, { "epoch": 29.941779872470196, "grad_norm": 0.934853732585907, "learning_rate": 4.665551839464883e-06, "loss": 6.3875, "step": 13500 }, { "epoch": 29.941779872470196, "eval_loss": 6.401629447937012, "eval_runtime": 174.7924, "eval_samples_per_second": 57.211, "eval_steps_per_second": 7.151, "step": 13500 }, { "epoch": 30.163570834488496, "grad_norm": 0.479612797498703, "learning_rate": 4.65551839464883e-06, "loss": 6.3866, "step": 13600 }, { "epoch": 30.163570834488496, "eval_loss": 6.399043560028076, "eval_runtime": 176.925, "eval_samples_per_second": 56.521, "eval_steps_per_second": 7.065, "step": 13600 }, { "epoch": 30.385361796506793, "grad_norm": 0.5256738662719727, "learning_rate": 4.645484949832776e-06, "loss": 6.3878, "step": 13700 }, { "epoch": 30.385361796506793, "eval_loss": 6.400505065917969, "eval_runtime": 175.3482, "eval_samples_per_second": 57.029, "eval_steps_per_second": 7.129, "step": 13700 }, { "epoch": 30.60715275852509, "grad_norm": 0.5690653920173645, "learning_rate": 4.635451505016723e-06, "loss": 6.3848, "step": 13800 }, { "epoch": 30.60715275852509, "eval_loss": 6.403696060180664, "eval_runtime": 172.8083, "eval_samples_per_second": 57.868, "eval_steps_per_second": 7.233, "step": 13800 }, { "epoch": 30.82894372054339, "grad_norm": 0.4565252363681793, "learning_rate": 4.625418060200669e-06, "loss": 6.3849, "step": 13900 }, { "epoch": 30.82894372054339, "eval_loss": 6.403767108917236, "eval_runtime": 175.7515, "eval_samples_per_second": 56.899, "eval_steps_per_second": 7.112, "step": 13900 }, { "epoch": 31.050734682561686, "grad_norm": 0.4801616966724396, "learning_rate": 4.615384615384616e-06, "loss": 6.3869, "step": 14000 }, { "epoch": 31.050734682561686, "eval_loss": 6.400508403778076, "eval_runtime": 174.4685, "eval_samples_per_second": 57.317, "eval_steps_per_second": 7.165, "step": 14000 }, { "epoch": 31.272525644579982, "grad_norm": 0.5834231972694397, "learning_rate": 4.605351170568562e-06, "loss": 6.3853, "step": 14100 }, { "epoch": 31.272525644579982, "eval_loss": 6.400169849395752, "eval_runtime": 175.4977, "eval_samples_per_second": 56.981, "eval_steps_per_second": 7.123, "step": 14100 }, { "epoch": 31.494316606598282, "grad_norm": 0.6701497435569763, "learning_rate": 4.595317725752509e-06, "loss": 6.3865, "step": 14200 }, { "epoch": 31.494316606598282, "eval_loss": 6.397976875305176, "eval_runtime": 175.4612, "eval_samples_per_second": 56.993, "eval_steps_per_second": 7.124, "step": 14200 }, { "epoch": 31.71610756861658, "grad_norm": 0.4794948697090149, "learning_rate": 4.585284280936456e-06, "loss": 6.3852, "step": 14300 }, { "epoch": 31.71610756861658, "eval_loss": 6.403610706329346, "eval_runtime": 176.2646, "eval_samples_per_second": 56.733, "eval_steps_per_second": 7.092, "step": 14300 }, { "epoch": 31.937898530634875, "grad_norm": 0.6028741002082825, "learning_rate": 4.5752508361204015e-06, "loss": 6.3851, "step": 14400 }, { "epoch": 31.937898530634875, "eval_loss": 6.400261878967285, "eval_runtime": 174.9022, "eval_samples_per_second": 57.175, "eval_steps_per_second": 7.147, "step": 14400 }, { "epoch": 32.15968949265317, "grad_norm": 0.7439810037612915, "learning_rate": 4.565217391304348e-06, "loss": 6.3839, "step": 14500 }, { "epoch": 32.15968949265317, "eval_loss": 6.397915363311768, "eval_runtime": 172.885, "eval_samples_per_second": 57.842, "eval_steps_per_second": 7.23, "step": 14500 }, { "epoch": 32.381480454671475, "grad_norm": 0.4727949798107147, "learning_rate": 4.555183946488295e-06, "loss": 6.3855, "step": 14600 }, { "epoch": 32.381480454671475, "eval_loss": 6.39973258972168, "eval_runtime": 175.4295, "eval_samples_per_second": 57.003, "eval_steps_per_second": 7.125, "step": 14600 }, { "epoch": 32.60327141668977, "grad_norm": 0.5084313154220581, "learning_rate": 4.545150501672241e-06, "loss": 6.3833, "step": 14700 }, { "epoch": 32.60327141668977, "eval_loss": 6.39823055267334, "eval_runtime": 173.3819, "eval_samples_per_second": 57.676, "eval_steps_per_second": 7.21, "step": 14700 }, { "epoch": 32.82506237870807, "grad_norm": 0.36422112584114075, "learning_rate": 4.535117056856188e-06, "loss": 6.3854, "step": 14800 }, { "epoch": 32.82506237870807, "eval_loss": 6.402724266052246, "eval_runtime": 174.9732, "eval_samples_per_second": 57.152, "eval_steps_per_second": 7.144, "step": 14800 }, { "epoch": 33.046853340726365, "grad_norm": 0.5722773671150208, "learning_rate": 4.5250836120401345e-06, "loss": 6.3859, "step": 14900 }, { "epoch": 33.046853340726365, "eval_loss": 6.396421909332275, "eval_runtime": 175.3976, "eval_samples_per_second": 57.013, "eval_steps_per_second": 7.127, "step": 14900 }, { "epoch": 33.26864430274466, "grad_norm": 0.859866201877594, "learning_rate": 4.51505016722408e-06, "loss": 6.3851, "step": 15000 }, { "epoch": 33.26864430274466, "eval_loss": 6.396206855773926, "eval_runtime": 174.5321, "eval_samples_per_second": 57.296, "eval_steps_per_second": 7.162, "step": 15000 }, { "epoch": 33.49043526476296, "grad_norm": 0.8327785134315491, "learning_rate": 4.505016722408027e-06, "loss": 6.3848, "step": 15100 }, { "epoch": 33.49043526476296, "eval_loss": 6.403675556182861, "eval_runtime": 172.9138, "eval_samples_per_second": 57.832, "eval_steps_per_second": 7.229, "step": 15100 }, { "epoch": 33.71222622678126, "grad_norm": 0.4790419042110443, "learning_rate": 4.494983277591973e-06, "loss": 6.3843, "step": 15200 }, { "epoch": 33.71222622678126, "eval_loss": 6.397605895996094, "eval_runtime": 175.348, "eval_samples_per_second": 57.029, "eval_steps_per_second": 7.129, "step": 15200 }, { "epoch": 33.93401718879956, "grad_norm": 0.8004974722862244, "learning_rate": 4.48494983277592e-06, "loss": 6.3852, "step": 15300 }, { "epoch": 33.93401718879956, "eval_loss": 6.396829605102539, "eval_runtime": 172.9108, "eval_samples_per_second": 57.833, "eval_steps_per_second": 7.229, "step": 15300 }, { "epoch": 34.155808150817855, "grad_norm": 0.40926745533943176, "learning_rate": 4.474916387959866e-06, "loss": 6.3835, "step": 15400 }, { "epoch": 34.155808150817855, "eval_loss": 6.400079727172852, "eval_runtime": 175.4491, "eval_samples_per_second": 56.997, "eval_steps_per_second": 7.125, "step": 15400 }, { "epoch": 34.37759911283615, "grad_norm": 0.3634837567806244, "learning_rate": 4.4648829431438125e-06, "loss": 6.3836, "step": 15500 }, { "epoch": 34.37759911283615, "eval_loss": 6.399561882019043, "eval_runtime": 173.2399, "eval_samples_per_second": 57.723, "eval_steps_per_second": 7.215, "step": 15500 }, { "epoch": 34.59939007485445, "grad_norm": 0.4545910954475403, "learning_rate": 4.454849498327759e-06, "loss": 6.3836, "step": 15600 }, { "epoch": 34.59939007485445, "eval_loss": 6.3967742919921875, "eval_runtime": 175.8575, "eval_samples_per_second": 56.864, "eval_steps_per_second": 7.108, "step": 15600 }, { "epoch": 34.821181036872744, "grad_norm": 0.5282755494117737, "learning_rate": 4.444816053511705e-06, "loss": 6.3851, "step": 15700 }, { "epoch": 34.821181036872744, "eval_loss": 6.399077892303467, "eval_runtime": 175.7729, "eval_samples_per_second": 56.892, "eval_steps_per_second": 7.111, "step": 15700 }, { "epoch": 35.04297199889105, "grad_norm": 0.5991719961166382, "learning_rate": 4.434782608695652e-06, "loss": 6.3846, "step": 15800 }, { "epoch": 35.04297199889105, "eval_loss": 6.4012532234191895, "eval_runtime": 175.8802, "eval_samples_per_second": 56.857, "eval_steps_per_second": 7.107, "step": 15800 }, { "epoch": 35.264762960909344, "grad_norm": 0.5155884623527527, "learning_rate": 4.424749163879599e-06, "loss": 6.3836, "step": 15900 }, { "epoch": 35.264762960909344, "eval_loss": 6.396469593048096, "eval_runtime": 175.4084, "eval_samples_per_second": 57.01, "eval_steps_per_second": 7.126, "step": 15900 }, { "epoch": 35.48655392292764, "grad_norm": 0.5687472224235535, "learning_rate": 4.414715719063545e-06, "loss": 6.3851, "step": 16000 }, { "epoch": 35.48655392292764, "eval_loss": 6.39898681640625, "eval_runtime": 172.8397, "eval_samples_per_second": 57.857, "eval_steps_per_second": 7.232, "step": 16000 }, { "epoch": 35.70834488494594, "grad_norm": 0.43625304102897644, "learning_rate": 4.404682274247491e-06, "loss": 6.3839, "step": 16100 }, { "epoch": 35.70834488494594, "eval_loss": 6.397797584533691, "eval_runtime": 175.3929, "eval_samples_per_second": 57.015, "eval_steps_per_second": 7.127, "step": 16100 }, { "epoch": 35.930135846964234, "grad_norm": 0.45570382475852966, "learning_rate": 4.394648829431438e-06, "loss": 6.383, "step": 16200 }, { "epoch": 35.930135846964234, "eval_loss": 6.396146774291992, "eval_runtime": 172.944, "eval_samples_per_second": 57.822, "eval_steps_per_second": 7.228, "step": 16200 }, { "epoch": 36.15192680898254, "grad_norm": 0.5023874044418335, "learning_rate": 4.384615384615384e-06, "loss": 6.3832, "step": 16300 }, { "epoch": 36.15192680898254, "eval_loss": 6.394959449768066, "eval_runtime": 175.3162, "eval_samples_per_second": 57.04, "eval_steps_per_second": 7.13, "step": 16300 }, { "epoch": 36.373717771000834, "grad_norm": 0.6336263418197632, "learning_rate": 4.374581939799331e-06, "loss": 6.384, "step": 16400 }, { "epoch": 36.373717771000834, "eval_loss": 6.396052360534668, "eval_runtime": 172.9338, "eval_samples_per_second": 57.826, "eval_steps_per_second": 7.228, "step": 16400 }, { "epoch": 36.59550873301913, "grad_norm": 0.49517419934272766, "learning_rate": 4.364548494983278e-06, "loss": 6.3837, "step": 16500 }, { "epoch": 36.59550873301913, "eval_loss": 6.394345760345459, "eval_runtime": 175.3695, "eval_samples_per_second": 57.022, "eval_steps_per_second": 7.128, "step": 16500 }, { "epoch": 36.81729969503743, "grad_norm": 0.6354840993881226, "learning_rate": 4.354515050167224e-06, "loss": 6.3819, "step": 16600 }, { "epoch": 36.81729969503743, "eval_loss": 6.399397850036621, "eval_runtime": 172.967, "eval_samples_per_second": 57.814, "eval_steps_per_second": 7.227, "step": 16600 }, { "epoch": 37.03909065705572, "grad_norm": 0.6154801845550537, "learning_rate": 4.34448160535117e-06, "loss": 6.3846, "step": 16700 }, { "epoch": 37.03909065705572, "eval_loss": 6.398616313934326, "eval_runtime": 175.382, "eval_samples_per_second": 57.018, "eval_steps_per_second": 7.127, "step": 16700 }, { "epoch": 37.26088161907402, "grad_norm": 0.5332671999931335, "learning_rate": 4.334448160535117e-06, "loss": 6.3833, "step": 16800 }, { "epoch": 37.26088161907402, "eval_loss": 6.400417327880859, "eval_runtime": 172.8252, "eval_samples_per_second": 57.862, "eval_steps_per_second": 7.233, "step": 16800 }, { "epoch": 37.482672581092324, "grad_norm": 0.4707394242286682, "learning_rate": 4.324414715719064e-06, "loss": 6.382, "step": 16900 }, { "epoch": 37.482672581092324, "eval_loss": 6.399077415466309, "eval_runtime": 175.3262, "eval_samples_per_second": 57.037, "eval_steps_per_second": 7.13, "step": 16900 }, { "epoch": 37.70446354311062, "grad_norm": 0.5503630042076111, "learning_rate": 4.31438127090301e-06, "loss": 6.3825, "step": 17000 }, { "epoch": 37.70446354311062, "eval_loss": 6.3964338302612305, "eval_runtime": 175.3567, "eval_samples_per_second": 57.027, "eval_steps_per_second": 7.128, "step": 17000 }, { "epoch": 37.92625450512892, "grad_norm": 0.4225850999355316, "learning_rate": 4.3043478260869565e-06, "loss": 6.3808, "step": 17100 }, { "epoch": 37.92625450512892, "eval_loss": 6.399682998657227, "eval_runtime": 175.5337, "eval_samples_per_second": 56.969, "eval_steps_per_second": 7.121, "step": 17100 }, { "epoch": 38.14804546714721, "grad_norm": 0.26002365350723267, "learning_rate": 4.294314381270903e-06, "loss": 6.3825, "step": 17200 }, { "epoch": 38.14804546714721, "eval_loss": 6.394641399383545, "eval_runtime": 175.4187, "eval_samples_per_second": 57.006, "eval_steps_per_second": 7.126, "step": 17200 }, { "epoch": 38.36983642916551, "grad_norm": 0.5679543614387512, "learning_rate": 4.284280936454849e-06, "loss": 6.381, "step": 17300 }, { "epoch": 38.36983642916551, "eval_loss": 6.39400053024292, "eval_runtime": 175.3915, "eval_samples_per_second": 57.015, "eval_steps_per_second": 7.127, "step": 17300 }, { "epoch": 38.591627391183806, "grad_norm": 0.6668972373008728, "learning_rate": 4.274247491638796e-06, "loss": 6.3833, "step": 17400 }, { "epoch": 38.591627391183806, "eval_loss": 6.395496845245361, "eval_runtime": 175.3632, "eval_samples_per_second": 57.025, "eval_steps_per_second": 7.128, "step": 17400 }, { "epoch": 38.81341835320211, "grad_norm": 0.7112624049186707, "learning_rate": 4.264214046822743e-06, "loss": 6.3819, "step": 17500 }, { "epoch": 38.81341835320211, "eval_loss": 6.394676685333252, "eval_runtime": 174.8435, "eval_samples_per_second": 57.194, "eval_steps_per_second": 7.149, "step": 17500 }, { "epoch": 39.035209315220406, "grad_norm": 0.550544261932373, "learning_rate": 4.254180602006689e-06, "loss": 6.3826, "step": 17600 }, { "epoch": 39.035209315220406, "eval_loss": 6.396825313568115, "eval_runtime": 175.8952, "eval_samples_per_second": 56.852, "eval_steps_per_second": 7.107, "step": 17600 }, { "epoch": 39.2570002772387, "grad_norm": 0.43430355191230774, "learning_rate": 4.244147157190635e-06, "loss": 6.3829, "step": 17700 }, { "epoch": 39.2570002772387, "eval_loss": 6.396999835968018, "eval_runtime": 173.2928, "eval_samples_per_second": 57.706, "eval_steps_per_second": 7.213, "step": 17700 }, { "epoch": 39.478791239257, "grad_norm": 0.4726496636867523, "learning_rate": 4.234113712374582e-06, "loss": 6.3832, "step": 17800 }, { "epoch": 39.478791239257, "eval_loss": 6.394546031951904, "eval_runtime": 175.1792, "eval_samples_per_second": 57.084, "eval_steps_per_second": 7.136, "step": 17800 }, { "epoch": 39.700582201275296, "grad_norm": 0.6477558612823486, "learning_rate": 4.224080267558528e-06, "loss": 6.383, "step": 17900 }, { "epoch": 39.700582201275296, "eval_loss": 6.39369010925293, "eval_runtime": 175.8821, "eval_samples_per_second": 56.856, "eval_steps_per_second": 7.107, "step": 17900 }, { "epoch": 39.92237316329359, "grad_norm": 0.3382057845592499, "learning_rate": 4.214046822742475e-06, "loss": 6.3794, "step": 18000 }, { "epoch": 39.92237316329359, "eval_loss": 6.394671440124512, "eval_runtime": 175.9089, "eval_samples_per_second": 56.848, "eval_steps_per_second": 7.106, "step": 18000 }, { "epoch": 40.144164125311896, "grad_norm": 0.32499295473098755, "learning_rate": 4.2040133779264216e-06, "loss": 6.3836, "step": 18100 }, { "epoch": 40.144164125311896, "eval_loss": 6.393697738647461, "eval_runtime": 173.0953, "eval_samples_per_second": 57.772, "eval_steps_per_second": 7.221, "step": 18100 }, { "epoch": 40.36595508733019, "grad_norm": 0.4412948489189148, "learning_rate": 4.1939799331103675e-06, "loss": 6.382, "step": 18200 }, { "epoch": 40.36595508733019, "eval_loss": 6.395814895629883, "eval_runtime": 175.6272, "eval_samples_per_second": 56.939, "eval_steps_per_second": 7.117, "step": 18200 }, { "epoch": 40.58774604934849, "grad_norm": 0.46561938524246216, "learning_rate": 4.183946488294314e-06, "loss": 6.3809, "step": 18300 }, { "epoch": 40.58774604934849, "eval_loss": 6.395906448364258, "eval_runtime": 173.1113, "eval_samples_per_second": 57.766, "eval_steps_per_second": 7.221, "step": 18300 }, { "epoch": 40.809537011366785, "grad_norm": 0.3944660425186157, "learning_rate": 4.173913043478261e-06, "loss": 6.3816, "step": 18400 }, { "epoch": 40.809537011366785, "eval_loss": 6.395975589752197, "eval_runtime": 175.6877, "eval_samples_per_second": 56.919, "eval_steps_per_second": 7.115, "step": 18400 }, { "epoch": 41.03132797338508, "grad_norm": 0.6692656874656677, "learning_rate": 4.163879598662208e-06, "loss": 6.3812, "step": 18500 }, { "epoch": 41.03132797338508, "eval_loss": 6.39307975769043, "eval_runtime": 173.2571, "eval_samples_per_second": 57.718, "eval_steps_per_second": 7.215, "step": 18500 }, { "epoch": 41.253118935403386, "grad_norm": 0.5447328090667725, "learning_rate": 4.153846153846154e-06, "loss": 6.382, "step": 18600 }, { "epoch": 41.253118935403386, "eval_loss": 6.392385005950928, "eval_runtime": 175.7445, "eval_samples_per_second": 56.901, "eval_steps_per_second": 7.113, "step": 18600 }, { "epoch": 41.47490989742168, "grad_norm": 0.4197390079498291, "learning_rate": 4.1438127090301005e-06, "loss": 6.3809, "step": 18700 }, { "epoch": 41.47490989742168, "eval_loss": 6.395226001739502, "eval_runtime": 173.3622, "eval_samples_per_second": 57.683, "eval_steps_per_second": 7.21, "step": 18700 }, { "epoch": 41.69670085943998, "grad_norm": 0.37331509590148926, "learning_rate": 4.133779264214047e-06, "loss": 6.3821, "step": 18800 }, { "epoch": 41.69670085943998, "eval_loss": 6.397747039794922, "eval_runtime": 175.5873, "eval_samples_per_second": 56.952, "eval_steps_per_second": 7.119, "step": 18800 }, { "epoch": 41.918491821458275, "grad_norm": 0.439635306596756, "learning_rate": 4.123745819397993e-06, "loss": 6.3802, "step": 18900 }, { "epoch": 41.918491821458275, "eval_loss": 6.393184185028076, "eval_runtime": 175.6266, "eval_samples_per_second": 56.939, "eval_steps_per_second": 7.117, "step": 18900 }, { "epoch": 42.14028278347657, "grad_norm": 0.4135972857475281, "learning_rate": 4.11371237458194e-06, "loss": 6.381, "step": 19000 }, { "epoch": 42.14028278347657, "eval_loss": 6.396628379821777, "eval_runtime": 175.68, "eval_samples_per_second": 56.922, "eval_steps_per_second": 7.115, "step": 19000 }, { "epoch": 42.36207374549487, "grad_norm": 0.3350447118282318, "learning_rate": 4.103678929765887e-06, "loss": 6.382, "step": 19100 }, { "epoch": 42.36207374549487, "eval_loss": 6.3959784507751465, "eval_runtime": 173.1015, "eval_samples_per_second": 57.77, "eval_steps_per_second": 7.221, "step": 19100 }, { "epoch": 42.58386470751317, "grad_norm": 0.40015509724617004, "learning_rate": 4.0936454849498326e-06, "loss": 6.3793, "step": 19200 }, { "epoch": 42.58386470751317, "eval_loss": 6.392791271209717, "eval_runtime": 175.6231, "eval_samples_per_second": 56.94, "eval_steps_per_second": 7.118, "step": 19200 }, { "epoch": 42.80565566953147, "grad_norm": 0.42993155121803284, "learning_rate": 4.083612040133779e-06, "loss": 6.3817, "step": 19300 }, { "epoch": 42.80565566953147, "eval_loss": 6.393764495849609, "eval_runtime": 175.7583, "eval_samples_per_second": 56.896, "eval_steps_per_second": 7.112, "step": 19300 }, { "epoch": 43.027446631549765, "grad_norm": 0.506564199924469, "learning_rate": 4.073578595317726e-06, "loss": 6.3805, "step": 19400 }, { "epoch": 43.027446631549765, "eval_loss": 6.395299434661865, "eval_runtime": 172.8685, "eval_samples_per_second": 57.847, "eval_steps_per_second": 7.231, "step": 19400 }, { "epoch": 43.24923759356806, "grad_norm": 0.34368619322776794, "learning_rate": 4.063545150501672e-06, "loss": 6.3791, "step": 19500 }, { "epoch": 43.24923759356806, "eval_loss": 6.390516757965088, "eval_runtime": 175.4183, "eval_samples_per_second": 57.007, "eval_steps_per_second": 7.126, "step": 19500 }, { "epoch": 43.47102855558636, "grad_norm": 0.5442679524421692, "learning_rate": 4.053511705685619e-06, "loss": 6.3805, "step": 19600 }, { "epoch": 43.47102855558636, "eval_loss": 6.390527248382568, "eval_runtime": 172.8815, "eval_samples_per_second": 57.843, "eval_steps_per_second": 7.23, "step": 19600 }, { "epoch": 43.692819517604654, "grad_norm": 0.6060280799865723, "learning_rate": 4.0434782608695655e-06, "loss": 6.3792, "step": 19700 }, { "epoch": 43.692819517604654, "eval_loss": 6.393373489379883, "eval_runtime": 175.3372, "eval_samples_per_second": 57.033, "eval_steps_per_second": 7.129, "step": 19700 }, { "epoch": 43.91461047962296, "grad_norm": 0.5891469120979309, "learning_rate": 4.0334448160535115e-06, "loss": 6.382, "step": 19800 }, { "epoch": 43.91461047962296, "eval_loss": 6.395658493041992, "eval_runtime": 173.3068, "eval_samples_per_second": 57.701, "eval_steps_per_second": 7.213, "step": 19800 }, { "epoch": 44.136401441641254, "grad_norm": 0.3623868525028229, "learning_rate": 4.023411371237458e-06, "loss": 6.3794, "step": 19900 }, { "epoch": 44.136401441641254, "eval_loss": 6.394290447235107, "eval_runtime": 175.7778, "eval_samples_per_second": 56.89, "eval_steps_per_second": 7.111, "step": 19900 }, { "epoch": 44.35819240365955, "grad_norm": 0.6197667121887207, "learning_rate": 4.013377926421405e-06, "loss": 6.3798, "step": 20000 }, { "epoch": 44.35819240365955, "eval_loss": 6.393582820892334, "eval_runtime": 175.4817, "eval_samples_per_second": 56.986, "eval_steps_per_second": 7.123, "step": 20000 }, { "epoch": 44.57998336567785, "grad_norm": 0.5198450684547424, "learning_rate": 4.003344481605351e-06, "loss": 6.3792, "step": 20100 }, { "epoch": 44.57998336567785, "eval_loss": 6.3943023681640625, "eval_runtime": 175.4115, "eval_samples_per_second": 57.009, "eval_steps_per_second": 7.126, "step": 20100 }, { "epoch": 44.801774327696144, "grad_norm": 0.4044889211654663, "learning_rate": 3.993311036789298e-06, "loss": 6.3798, "step": 20200 }, { "epoch": 44.801774327696144, "eval_loss": 6.396990776062012, "eval_runtime": 172.8449, "eval_samples_per_second": 57.855, "eval_steps_per_second": 7.232, "step": 20200 }, { "epoch": 45.02356528971445, "grad_norm": 0.4656885862350464, "learning_rate": 3.9832775919732444e-06, "loss": 6.3807, "step": 20300 }, { "epoch": 45.02356528971445, "eval_loss": 6.395167350769043, "eval_runtime": 175.2548, "eval_samples_per_second": 57.06, "eval_steps_per_second": 7.132, "step": 20300 }, { "epoch": 45.245356251732744, "grad_norm": 0.5882771611213684, "learning_rate": 3.97324414715719e-06, "loss": 6.3802, "step": 20400 }, { "epoch": 45.245356251732744, "eval_loss": 6.392847537994385, "eval_runtime": 175.4165, "eval_samples_per_second": 57.007, "eval_steps_per_second": 7.126, "step": 20400 }, { "epoch": 45.46714721375104, "grad_norm": 0.31189513206481934, "learning_rate": 3.963210702341137e-06, "loss": 6.3799, "step": 20500 }, { "epoch": 45.46714721375104, "eval_loss": 6.391454696655273, "eval_runtime": 175.3822, "eval_samples_per_second": 57.018, "eval_steps_per_second": 7.127, "step": 20500 }, { "epoch": 45.68893817576934, "grad_norm": 0.7188530564308167, "learning_rate": 3.953177257525084e-06, "loss": 6.3775, "step": 20600 }, { "epoch": 45.68893817576934, "eval_loss": 6.391802787780762, "eval_runtime": 175.4136, "eval_samples_per_second": 57.008, "eval_steps_per_second": 7.126, "step": 20600 }, { "epoch": 45.910729137787634, "grad_norm": 0.4235071837902069, "learning_rate": 3.943143812709031e-06, "loss": 6.3791, "step": 20700 }, { "epoch": 45.910729137787634, "eval_loss": 6.3952836990356445, "eval_runtime": 175.3753, "eval_samples_per_second": 57.021, "eval_steps_per_second": 7.128, "step": 20700 }, { "epoch": 46.13252009980593, "grad_norm": 0.4977140724658966, "learning_rate": 3.9331103678929765e-06, "loss": 6.3807, "step": 20800 }, { "epoch": 46.13252009980593, "eval_loss": 6.397064208984375, "eval_runtime": 175.8439, "eval_samples_per_second": 56.869, "eval_steps_per_second": 7.109, "step": 20800 }, { "epoch": 46.354311061824234, "grad_norm": 0.5896762609481812, "learning_rate": 3.923076923076923e-06, "loss": 6.3801, "step": 20900 }, { "epoch": 46.354311061824234, "eval_loss": 6.394172191619873, "eval_runtime": 173.449, "eval_samples_per_second": 57.654, "eval_steps_per_second": 7.207, "step": 20900 }, { "epoch": 46.57610202384253, "grad_norm": 0.47281450033187866, "learning_rate": 3.91304347826087e-06, "loss": 6.3787, "step": 21000 }, { "epoch": 46.57610202384253, "eval_loss": 6.3905463218688965, "eval_runtime": 175.9964, "eval_samples_per_second": 56.819, "eval_steps_per_second": 7.102, "step": 21000 }, { "epoch": 46.79789298586083, "grad_norm": 0.42211413383483887, "learning_rate": 3.903010033444816e-06, "loss": 6.3798, "step": 21100 }, { "epoch": 46.79789298586083, "eval_loss": 6.39119291305542, "eval_runtime": 175.8005, "eval_samples_per_second": 56.883, "eval_steps_per_second": 7.11, "step": 21100 }, { "epoch": 47.01968394787912, "grad_norm": 0.7232652306556702, "learning_rate": 3.892976588628763e-06, "loss": 6.3795, "step": 21200 }, { "epoch": 47.01968394787912, "eval_loss": 6.39454984664917, "eval_runtime": 174.7314, "eval_samples_per_second": 57.231, "eval_steps_per_second": 7.154, "step": 21200 }, { "epoch": 47.24147490989742, "grad_norm": 0.4875265657901764, "learning_rate": 3.8829431438127095e-06, "loss": 6.3798, "step": 21300 }, { "epoch": 47.24147490989742, "eval_loss": 6.391242027282715, "eval_runtime": 173.1294, "eval_samples_per_second": 57.76, "eval_steps_per_second": 7.22, "step": 21300 }, { "epoch": 47.463265871915716, "grad_norm": 0.689365804195404, "learning_rate": 3.8729096989966554e-06, "loss": 6.3797, "step": 21400 }, { "epoch": 47.463265871915716, "eval_loss": 6.392244338989258, "eval_runtime": 175.7048, "eval_samples_per_second": 56.914, "eval_steps_per_second": 7.114, "step": 21400 }, { "epoch": 47.68505683393402, "grad_norm": 0.34326601028442383, "learning_rate": 3.862876254180602e-06, "loss": 6.3799, "step": 21500 }, { "epoch": 47.68505683393402, "eval_loss": 6.390882968902588, "eval_runtime": 173.1981, "eval_samples_per_second": 57.737, "eval_steps_per_second": 7.217, "step": 21500 }, { "epoch": 47.90684779595232, "grad_norm": 0.5094731450080872, "learning_rate": 3.852842809364549e-06, "loss": 6.3789, "step": 21600 }, { "epoch": 47.90684779595232, "eval_loss": 6.391824245452881, "eval_runtime": 175.6758, "eval_samples_per_second": 56.923, "eval_steps_per_second": 7.115, "step": 21600 }, { "epoch": 48.12863875797061, "grad_norm": 0.5096613764762878, "learning_rate": 3.842809364548495e-06, "loss": 6.3788, "step": 21700 }, { "epoch": 48.12863875797061, "eval_loss": 6.3908467292785645, "eval_runtime": 175.722, "eval_samples_per_second": 56.908, "eval_steps_per_second": 7.114, "step": 21700 }, { "epoch": 48.35042971998891, "grad_norm": 0.49328041076660156, "learning_rate": 3.832775919732442e-06, "loss": 6.3801, "step": 21800 }, { "epoch": 48.35042971998891, "eval_loss": 6.392337322235107, "eval_runtime": 175.7017, "eval_samples_per_second": 56.915, "eval_steps_per_second": 7.114, "step": 21800 }, { "epoch": 48.572220682007206, "grad_norm": 0.331511914730072, "learning_rate": 3.822742474916388e-06, "loss": 6.3787, "step": 21900 }, { "epoch": 48.572220682007206, "eval_loss": 6.392426013946533, "eval_runtime": 175.6914, "eval_samples_per_second": 56.918, "eval_steps_per_second": 7.115, "step": 21900 }, { "epoch": 48.7940116440255, "grad_norm": 0.5596035718917847, "learning_rate": 3.8127090301003347e-06, "loss": 6.3783, "step": 22000 }, { "epoch": 48.7940116440255, "eval_loss": 6.396266460418701, "eval_runtime": 175.7217, "eval_samples_per_second": 56.908, "eval_steps_per_second": 7.114, "step": 22000 }, { "epoch": 49.015802606043806, "grad_norm": 0.42308327555656433, "learning_rate": 3.802675585284281e-06, "loss": 6.3788, "step": 22100 }, { "epoch": 49.015802606043806, "eval_loss": 6.392462730407715, "eval_runtime": 175.6395, "eval_samples_per_second": 56.935, "eval_steps_per_second": 7.117, "step": 22100 }, { "epoch": 49.2375935680621, "grad_norm": 0.47657862305641174, "learning_rate": 3.792642140468228e-06, "loss": 6.3768, "step": 22200 }, { "epoch": 49.2375935680621, "eval_loss": 6.392263412475586, "eval_runtime": 175.6228, "eval_samples_per_second": 56.94, "eval_steps_per_second": 7.118, "step": 22200 }, { "epoch": 49.4593845300804, "grad_norm": 0.4417143166065216, "learning_rate": 3.782608695652174e-06, "loss": 6.3785, "step": 22300 }, { "epoch": 49.4593845300804, "eval_loss": 6.39237642288208, "eval_runtime": 175.5904, "eval_samples_per_second": 56.951, "eval_steps_per_second": 7.119, "step": 22300 }, { "epoch": 49.681175492098696, "grad_norm": 0.3279063105583191, "learning_rate": 3.7725752508361205e-06, "loss": 6.3791, "step": 22400 }, { "epoch": 49.681175492098696, "eval_loss": 6.3924407958984375, "eval_runtime": 175.6501, "eval_samples_per_second": 56.931, "eval_steps_per_second": 7.116, "step": 22400 }, { "epoch": 49.90296645411699, "grad_norm": 0.6854652166366577, "learning_rate": 3.7625418060200673e-06, "loss": 6.3785, "step": 22500 }, { "epoch": 49.90296645411699, "eval_loss": 6.390333652496338, "eval_runtime": 175.1173, "eval_samples_per_second": 57.105, "eval_steps_per_second": 7.138, "step": 22500 }, { "epoch": 50.124757416135296, "grad_norm": 0.3522402048110962, "learning_rate": 3.7525083612040136e-06, "loss": 6.3776, "step": 22600 }, { "epoch": 50.124757416135296, "eval_loss": 6.395279884338379, "eval_runtime": 172.8769, "eval_samples_per_second": 57.845, "eval_steps_per_second": 7.231, "step": 22600 }, { "epoch": 50.34654837815359, "grad_norm": 0.4847201704978943, "learning_rate": 3.74247491638796e-06, "loss": 6.3798, "step": 22700 }, { "epoch": 50.34654837815359, "eval_loss": 6.385508060455322, "eval_runtime": 175.3898, "eval_samples_per_second": 57.016, "eval_steps_per_second": 7.127, "step": 22700 }, { "epoch": 50.56833934017189, "grad_norm": 0.6891096234321594, "learning_rate": 3.7324414715719067e-06, "loss": 6.379, "step": 22800 }, { "epoch": 50.56833934017189, "eval_loss": 6.389738082885742, "eval_runtime": 172.9656, "eval_samples_per_second": 57.815, "eval_steps_per_second": 7.227, "step": 22800 }, { "epoch": 50.790130302190185, "grad_norm": 0.5377815365791321, "learning_rate": 3.722408026755853e-06, "loss": 6.3781, "step": 22900 }, { "epoch": 50.790130302190185, "eval_loss": 6.393865585327148, "eval_runtime": 175.4211, "eval_samples_per_second": 57.006, "eval_steps_per_second": 7.126, "step": 22900 }, { "epoch": 51.01192126420848, "grad_norm": 0.33496779203414917, "learning_rate": 3.7123745819398e-06, "loss": 6.3774, "step": 23000 }, { "epoch": 51.01192126420848, "eval_loss": 6.388363838195801, "eval_runtime": 172.9308, "eval_samples_per_second": 57.827, "eval_steps_per_second": 7.228, "step": 23000 }, { "epoch": 51.23371222622678, "grad_norm": 0.374717116355896, "learning_rate": 3.702341137123746e-06, "loss": 6.3782, "step": 23100 }, { "epoch": 51.23371222622678, "eval_loss": 6.3933634757995605, "eval_runtime": 175.8194, "eval_samples_per_second": 56.877, "eval_steps_per_second": 7.11, "step": 23100 }, { "epoch": 51.45550318824508, "grad_norm": 0.5700441002845764, "learning_rate": 3.6923076923076925e-06, "loss": 6.3779, "step": 23200 }, { "epoch": 51.45550318824508, "eval_loss": 6.391829490661621, "eval_runtime": 173.0462, "eval_samples_per_second": 57.788, "eval_steps_per_second": 7.224, "step": 23200 }, { "epoch": 51.67729415026338, "grad_norm": 0.5987123250961304, "learning_rate": 3.6822742474916393e-06, "loss": 6.3775, "step": 23300 }, { "epoch": 51.67729415026338, "eval_loss": 6.391645908355713, "eval_runtime": 175.546, "eval_samples_per_second": 56.965, "eval_steps_per_second": 7.121, "step": 23300 }, { "epoch": 51.899085112281675, "grad_norm": 0.6282506585121155, "learning_rate": 3.6722408026755856e-06, "loss": 6.3785, "step": 23400 }, { "epoch": 51.899085112281675, "eval_loss": 6.394507884979248, "eval_runtime": 175.5236, "eval_samples_per_second": 56.972, "eval_steps_per_second": 7.122, "step": 23400 }, { "epoch": 52.12087607429997, "grad_norm": 0.4422946572303772, "learning_rate": 3.662207357859532e-06, "loss": 6.378, "step": 23500 }, { "epoch": 52.12087607429997, "eval_loss": 6.389113903045654, "eval_runtime": 172.8391, "eval_samples_per_second": 57.857, "eval_steps_per_second": 7.232, "step": 23500 }, { "epoch": 52.34266703631827, "grad_norm": 0.43772438168525696, "learning_rate": 3.6521739130434787e-06, "loss": 6.3769, "step": 23600 }, { "epoch": 52.34266703631827, "eval_loss": 6.389682292938232, "eval_runtime": 174.37, "eval_samples_per_second": 57.349, "eval_steps_per_second": 7.169, "step": 23600 }, { "epoch": 52.564457998336565, "grad_norm": 0.4291711449623108, "learning_rate": 3.642140468227425e-06, "loss": 6.3787, "step": 23700 }, { "epoch": 52.564457998336565, "eval_loss": 6.387042999267578, "eval_runtime": 175.3622, "eval_samples_per_second": 57.025, "eval_steps_per_second": 7.128, "step": 23700 }, { "epoch": 52.78624896035487, "grad_norm": 0.3986354172229767, "learning_rate": 3.6321070234113714e-06, "loss": 6.378, "step": 23800 }, { "epoch": 52.78624896035487, "eval_loss": 6.394027233123779, "eval_runtime": 175.4238, "eval_samples_per_second": 57.005, "eval_steps_per_second": 7.126, "step": 23800 }, { "epoch": 53.008039922373165, "grad_norm": 0.4198819398880005, "learning_rate": 3.622073578595318e-06, "loss": 6.378, "step": 23900 }, { "epoch": 53.008039922373165, "eval_loss": 6.391998291015625, "eval_runtime": 175.3995, "eval_samples_per_second": 57.013, "eval_steps_per_second": 7.127, "step": 23900 }, { "epoch": 53.22983088439146, "grad_norm": 0.42992842197418213, "learning_rate": 3.6120401337792645e-06, "loss": 6.378, "step": 24000 }, { "epoch": 53.22983088439146, "eval_loss": 6.391213893890381, "eval_runtime": 175.5148, "eval_samples_per_second": 56.975, "eval_steps_per_second": 7.122, "step": 24000 }, { "epoch": 53.45162184640976, "grad_norm": 0.3845984637737274, "learning_rate": 3.6020066889632112e-06, "loss": 6.3794, "step": 24100 }, { "epoch": 53.45162184640976, "eval_loss": 6.395719528198242, "eval_runtime": 175.2358, "eval_samples_per_second": 57.066, "eval_steps_per_second": 7.133, "step": 24100 }, { "epoch": 53.673412808428054, "grad_norm": 0.4092540144920349, "learning_rate": 3.5919732441471576e-06, "loss": 6.3764, "step": 24200 }, { "epoch": 53.673412808428054, "eval_loss": 6.392786502838135, "eval_runtime": 173.4491, "eval_samples_per_second": 57.654, "eval_steps_per_second": 7.207, "step": 24200 }, { "epoch": 53.89520377044636, "grad_norm": 0.4434932470321655, "learning_rate": 3.581939799331104e-06, "loss": 6.3784, "step": 24300 }, { "epoch": 53.89520377044636, "eval_loss": 6.392944812774658, "eval_runtime": 173.4556, "eval_samples_per_second": 57.652, "eval_steps_per_second": 7.206, "step": 24300 }, { "epoch": 54.116994732464654, "grad_norm": 0.3644530177116394, "learning_rate": 3.5719063545150507e-06, "loss": 6.3777, "step": 24400 }, { "epoch": 54.116994732464654, "eval_loss": 6.389293193817139, "eval_runtime": 175.8393, "eval_samples_per_second": 56.87, "eval_steps_per_second": 7.109, "step": 24400 }, { "epoch": 54.33878569448295, "grad_norm": 0.42048630118370056, "learning_rate": 3.561872909698997e-06, "loss": 6.3779, "step": 24500 }, { "epoch": 54.33878569448295, "eval_loss": 6.392094612121582, "eval_runtime": 173.3329, "eval_samples_per_second": 57.692, "eval_steps_per_second": 7.212, "step": 24500 }, { "epoch": 54.56057665650125, "grad_norm": 0.5288220047950745, "learning_rate": 3.5518394648829434e-06, "loss": 6.3768, "step": 24600 }, { "epoch": 54.56057665650125, "eval_loss": 6.389921188354492, "eval_runtime": 175.5087, "eval_samples_per_second": 56.977, "eval_steps_per_second": 7.122, "step": 24600 }, { "epoch": 54.782367618519544, "grad_norm": 0.5413895845413208, "learning_rate": 3.54180602006689e-06, "loss": 6.3788, "step": 24700 }, { "epoch": 54.782367618519544, "eval_loss": 6.389023303985596, "eval_runtime": 172.9846, "eval_samples_per_second": 57.809, "eval_steps_per_second": 7.226, "step": 24700 }, { "epoch": 55.00415858053784, "grad_norm": 0.35512205958366394, "learning_rate": 3.5317725752508365e-06, "loss": 6.3777, "step": 24800 }, { "epoch": 55.00415858053784, "eval_loss": 6.390623569488525, "eval_runtime": 175.3777, "eval_samples_per_second": 57.02, "eval_steps_per_second": 7.127, "step": 24800 }, { "epoch": 55.225949542556144, "grad_norm": 0.46963444352149963, "learning_rate": 3.521739130434783e-06, "loss": 6.3759, "step": 24900 }, { "epoch": 55.225949542556144, "eval_loss": 6.392442226409912, "eval_runtime": 173.0136, "eval_samples_per_second": 57.799, "eval_steps_per_second": 7.225, "step": 24900 }, { "epoch": 55.44774050457444, "grad_norm": 0.4473781883716583, "learning_rate": 3.5117056856187296e-06, "loss": 6.3766, "step": 25000 }, { "epoch": 55.44774050457444, "eval_loss": 6.392148971557617, "eval_runtime": 175.4775, "eval_samples_per_second": 56.987, "eval_steps_per_second": 7.123, "step": 25000 }, { "epoch": 55.66953146659274, "grad_norm": 0.4387643337249756, "learning_rate": 3.501672240802676e-06, "loss": 6.3768, "step": 25100 }, { "epoch": 55.66953146659274, "eval_loss": 6.391911506652832, "eval_runtime": 175.6257, "eval_samples_per_second": 56.939, "eval_steps_per_second": 7.117, "step": 25100 }, { "epoch": 55.89132242861103, "grad_norm": 0.5157041549682617, "learning_rate": 3.491638795986622e-06, "loss": 6.3784, "step": 25200 }, { "epoch": 55.89132242861103, "eval_loss": 6.384146690368652, "eval_runtime": 175.6148, "eval_samples_per_second": 56.943, "eval_steps_per_second": 7.118, "step": 25200 }, { "epoch": 56.11311339062933, "grad_norm": 0.36674726009368896, "learning_rate": 3.481605351170568e-06, "loss": 6.3757, "step": 25300 }, { "epoch": 56.11311339062933, "eval_loss": 6.3921380043029785, "eval_runtime": 175.3664, "eval_samples_per_second": 57.023, "eval_steps_per_second": 7.128, "step": 25300 }, { "epoch": 56.33490435264763, "grad_norm": 0.44830092787742615, "learning_rate": 3.471571906354515e-06, "loss": 6.3785, "step": 25400 }, { "epoch": 56.33490435264763, "eval_loss": 6.387638092041016, "eval_runtime": 175.4426, "eval_samples_per_second": 56.999, "eval_steps_per_second": 7.125, "step": 25400 }, { "epoch": 56.55669531466593, "grad_norm": 0.4037076532840729, "learning_rate": 3.4615384615384613e-06, "loss": 6.3753, "step": 25500 }, { "epoch": 56.55669531466593, "eval_loss": 6.390742778778076, "eval_runtime": 175.869, "eval_samples_per_second": 56.861, "eval_steps_per_second": 7.108, "step": 25500 }, { "epoch": 56.77848627668423, "grad_norm": 0.5410855412483215, "learning_rate": 3.4515050167224076e-06, "loss": 6.3773, "step": 25600 }, { "epoch": 56.77848627668423, "eval_loss": 6.388538837432861, "eval_runtime": 175.5689, "eval_samples_per_second": 56.958, "eval_steps_per_second": 7.12, "step": 25600 }, { "epoch": 57.00027723870252, "grad_norm": 0.6200158596038818, "learning_rate": 3.4414715719063544e-06, "loss": 6.3762, "step": 25700 }, { "epoch": 57.00027723870252, "eval_loss": 6.392038345336914, "eval_runtime": 172.8867, "eval_samples_per_second": 57.841, "eval_steps_per_second": 7.23, "step": 25700 }, { "epoch": 57.22206820072082, "grad_norm": 0.33977118134498596, "learning_rate": 3.4314381270903007e-06, "loss": 6.3782, "step": 25800 }, { "epoch": 57.22206820072082, "eval_loss": 6.390758037567139, "eval_runtime": 172.9474, "eval_samples_per_second": 57.821, "eval_steps_per_second": 7.228, "step": 25800 }, { "epoch": 57.443859162739116, "grad_norm": 0.396681010723114, "learning_rate": 3.4214046822742475e-06, "loss": 6.3766, "step": 25900 }, { "epoch": 57.443859162739116, "eval_loss": 6.391767501831055, "eval_runtime": 175.4265, "eval_samples_per_second": 57.004, "eval_steps_per_second": 7.125, "step": 25900 }, { "epoch": 57.66565012475741, "grad_norm": 0.3652241826057434, "learning_rate": 3.411371237458194e-06, "loss": 6.3766, "step": 26000 }, { "epoch": 57.66565012475741, "eval_loss": 6.388927936553955, "eval_runtime": 173.1869, "eval_samples_per_second": 57.741, "eval_steps_per_second": 7.218, "step": 26000 }, { "epoch": 57.887441086775716, "grad_norm": 0.40237948298454285, "learning_rate": 3.40133779264214e-06, "loss": 6.3786, "step": 26100 }, { "epoch": 57.887441086775716, "eval_loss": 6.385989665985107, "eval_runtime": 175.7809, "eval_samples_per_second": 56.889, "eval_steps_per_second": 7.111, "step": 26100 }, { "epoch": 58.10923204879401, "grad_norm": 0.47134748101234436, "learning_rate": 3.391304347826087e-06, "loss": 6.3766, "step": 26200 }, { "epoch": 58.10923204879401, "eval_loss": 6.388063907623291, "eval_runtime": 172.8868, "eval_samples_per_second": 57.841, "eval_steps_per_second": 7.23, "step": 26200 }, { "epoch": 58.33102301081231, "grad_norm": 0.35729169845581055, "learning_rate": 3.3812709030100333e-06, "loss": 6.376, "step": 26300 }, { "epoch": 58.33102301081231, "eval_loss": 6.38781213760376, "eval_runtime": 175.295, "eval_samples_per_second": 57.047, "eval_steps_per_second": 7.131, "step": 26300 }, { "epoch": 58.552813972830606, "grad_norm": 0.38715028762817383, "learning_rate": 3.3712374581939796e-06, "loss": 6.3765, "step": 26400 }, { "epoch": 58.552813972830606, "eval_loss": 6.389337539672852, "eval_runtime": 172.8668, "eval_samples_per_second": 57.848, "eval_steps_per_second": 7.231, "step": 26400 }, { "epoch": 58.7746049348489, "grad_norm": 0.46873271465301514, "learning_rate": 3.3612040133779264e-06, "loss": 6.3768, "step": 26500 }, { "epoch": 58.7746049348489, "eval_loss": 6.392114162445068, "eval_runtime": 175.4104, "eval_samples_per_second": 57.009, "eval_steps_per_second": 7.126, "step": 26500 }, { "epoch": 58.996395896867206, "grad_norm": 0.3447762131690979, "learning_rate": 3.3511705685618727e-06, "loss": 6.3759, "step": 26600 }, { "epoch": 58.996395896867206, "eval_loss": 6.387296676635742, "eval_runtime": 175.3375, "eval_samples_per_second": 57.033, "eval_steps_per_second": 7.129, "step": 26600 }, { "epoch": 59.2181868588855, "grad_norm": 0.3914731442928314, "learning_rate": 3.3411371237458195e-06, "loss": 6.3771, "step": 26700 }, { "epoch": 59.2181868588855, "eval_loss": 6.387917995452881, "eval_runtime": 175.4868, "eval_samples_per_second": 56.984, "eval_steps_per_second": 7.123, "step": 26700 }, { "epoch": 59.4399778209038, "grad_norm": 0.5208538174629211, "learning_rate": 3.331103678929766e-06, "loss": 6.3765, "step": 26800 }, { "epoch": 59.4399778209038, "eval_loss": 6.389184474945068, "eval_runtime": 174.2169, "eval_samples_per_second": 57.4, "eval_steps_per_second": 7.175, "step": 26800 }, { "epoch": 59.661768782922096, "grad_norm": 0.3724886178970337, "learning_rate": 3.321070234113712e-06, "loss": 6.3757, "step": 26900 }, { "epoch": 59.661768782922096, "eval_loss": 6.392241954803467, "eval_runtime": 175.4491, "eval_samples_per_second": 56.997, "eval_steps_per_second": 7.125, "step": 26900 }, { "epoch": 59.88355974494039, "grad_norm": 0.33004748821258545, "learning_rate": 3.311036789297659e-06, "loss": 6.3759, "step": 27000 }, { "epoch": 59.88355974494039, "eval_loss": 6.389077186584473, "eval_runtime": 172.9579, "eval_samples_per_second": 57.818, "eval_steps_per_second": 7.227, "step": 27000 }, { "epoch": 60.10535070695869, "grad_norm": 0.3995635211467743, "learning_rate": 3.3010033444816052e-06, "loss": 6.3774, "step": 27100 }, { "epoch": 60.10535070695869, "eval_loss": 6.389009952545166, "eval_runtime": 175.8118, "eval_samples_per_second": 56.879, "eval_steps_per_second": 7.11, "step": 27100 }, { "epoch": 60.32714166897699, "grad_norm": 0.49882611632347107, "learning_rate": 3.2909698996655516e-06, "loss": 6.3762, "step": 27200 }, { "epoch": 60.32714166897699, "eval_loss": 6.3899922370910645, "eval_runtime": 175.7786, "eval_samples_per_second": 56.89, "eval_steps_per_second": 7.111, "step": 27200 }, { "epoch": 60.54893263099529, "grad_norm": 0.46321776509284973, "learning_rate": 3.2809364548494983e-06, "loss": 6.3758, "step": 27300 }, { "epoch": 60.54893263099529, "eval_loss": 6.389715671539307, "eval_runtime": 175.8928, "eval_samples_per_second": 56.853, "eval_steps_per_second": 7.107, "step": 27300 }, { "epoch": 60.770723593013585, "grad_norm": 0.4512879252433777, "learning_rate": 3.2709030100334447e-06, "loss": 6.3764, "step": 27400 }, { "epoch": 60.770723593013585, "eval_loss": 6.388641357421875, "eval_runtime": 175.8755, "eval_samples_per_second": 56.858, "eval_steps_per_second": 7.107, "step": 27400 }, { "epoch": 60.99251455503188, "grad_norm": 0.5370669364929199, "learning_rate": 3.260869565217391e-06, "loss": 6.3764, "step": 27500 }, { "epoch": 60.99251455503188, "eval_loss": 6.391347885131836, "eval_runtime": 173.6027, "eval_samples_per_second": 57.603, "eval_steps_per_second": 7.2, "step": 27500 }, { "epoch": 61.21430551705018, "grad_norm": 0.4362497627735138, "learning_rate": 3.2508361204013378e-06, "loss": 6.3747, "step": 27600 }, { "epoch": 61.21430551705018, "eval_loss": 6.390707969665527, "eval_runtime": 175.8739, "eval_samples_per_second": 56.859, "eval_steps_per_second": 7.107, "step": 27600 }, { "epoch": 61.436096479068475, "grad_norm": 0.36759933829307556, "learning_rate": 3.240802675585284e-06, "loss": 6.3768, "step": 27700 }, { "epoch": 61.436096479068475, "eval_loss": 6.390637397766113, "eval_runtime": 173.3683, "eval_samples_per_second": 57.681, "eval_steps_per_second": 7.21, "step": 27700 }, { "epoch": 61.65788744108678, "grad_norm": 0.4922894537448883, "learning_rate": 3.230769230769231e-06, "loss": 6.3758, "step": 27800 }, { "epoch": 61.65788744108678, "eval_loss": 6.386129379272461, "eval_runtime": 175.8295, "eval_samples_per_second": 56.873, "eval_steps_per_second": 7.109, "step": 27800 }, { "epoch": 61.879678403105075, "grad_norm": 0.5007067918777466, "learning_rate": 3.2207357859531772e-06, "loss": 6.3755, "step": 27900 }, { "epoch": 61.879678403105075, "eval_loss": 6.389693737030029, "eval_runtime": 173.4229, "eval_samples_per_second": 57.663, "eval_steps_per_second": 7.208, "step": 27900 }, { "epoch": 62.10146936512337, "grad_norm": 0.5208317041397095, "learning_rate": 3.2107023411371236e-06, "loss": 6.3766, "step": 28000 }, { "epoch": 62.10146936512337, "eval_loss": 6.387614727020264, "eval_runtime": 175.7473, "eval_samples_per_second": 56.9, "eval_steps_per_second": 7.112, "step": 28000 }, { "epoch": 62.32326032714167, "grad_norm": 0.5632686614990234, "learning_rate": 3.2006688963210703e-06, "loss": 6.3759, "step": 28100 }, { "epoch": 62.32326032714167, "eval_loss": 6.392298221588135, "eval_runtime": 173.3859, "eval_samples_per_second": 57.675, "eval_steps_per_second": 7.209, "step": 28100 }, { "epoch": 62.545051289159964, "grad_norm": 0.44811296463012695, "learning_rate": 3.1906354515050167e-06, "loss": 6.376, "step": 28200 }, { "epoch": 62.545051289159964, "eval_loss": 6.388302326202393, "eval_runtime": 175.8812, "eval_samples_per_second": 56.857, "eval_steps_per_second": 7.107, "step": 28200 }, { "epoch": 62.76684225117826, "grad_norm": 0.434894323348999, "learning_rate": 3.180602006688963e-06, "loss": 6.3754, "step": 28300 }, { "epoch": 62.76684225117826, "eval_loss": 6.388329982757568, "eval_runtime": 173.1819, "eval_samples_per_second": 57.743, "eval_steps_per_second": 7.218, "step": 28300 }, { "epoch": 62.988633213196564, "grad_norm": 0.4996633231639862, "learning_rate": 3.1705685618729098e-06, "loss": 6.3753, "step": 28400 }, { "epoch": 62.988633213196564, "eval_loss": 6.386618614196777, "eval_runtime": 175.5366, "eval_samples_per_second": 56.968, "eval_steps_per_second": 7.121, "step": 28400 }, { "epoch": 63.21042417521486, "grad_norm": 0.4766680598258972, "learning_rate": 3.160535117056856e-06, "loss": 6.3757, "step": 28500 }, { "epoch": 63.21042417521486, "eval_loss": 6.388480186462402, "eval_runtime": 175.3311, "eval_samples_per_second": 57.035, "eval_steps_per_second": 7.129, "step": 28500 }, { "epoch": 63.43221513723316, "grad_norm": 0.28831642866134644, "learning_rate": 3.1505016722408024e-06, "loss": 6.3764, "step": 28600 }, { "epoch": 63.43221513723316, "eval_loss": 6.3880767822265625, "eval_runtime": 175.3784, "eval_samples_per_second": 57.02, "eval_steps_per_second": 7.127, "step": 28600 }, { "epoch": 63.654006099251454, "grad_norm": 0.2838084399700165, "learning_rate": 3.140468227424749e-06, "loss": 6.3755, "step": 28700 }, { "epoch": 63.654006099251454, "eval_loss": 6.386078357696533, "eval_runtime": 172.9388, "eval_samples_per_second": 57.824, "eval_steps_per_second": 7.228, "step": 28700 }, { "epoch": 63.87579706126975, "grad_norm": 0.47868525981903076, "learning_rate": 3.1304347826086955e-06, "loss": 6.377, "step": 28800 }, { "epoch": 63.87579706126975, "eval_loss": 6.387932777404785, "eval_runtime": 175.4569, "eval_samples_per_second": 56.994, "eval_steps_per_second": 7.124, "step": 28800 }, { "epoch": 64.09758802328805, "grad_norm": 0.5446937680244446, "learning_rate": 3.1204013377926423e-06, "loss": 6.3753, "step": 28900 }, { "epoch": 64.09758802328805, "eval_loss": 6.388584136962891, "eval_runtime": 172.8884, "eval_samples_per_second": 57.841, "eval_steps_per_second": 7.23, "step": 28900 }, { "epoch": 64.31937898530634, "grad_norm": 0.41702982783317566, "learning_rate": 3.1103678929765886e-06, "loss": 6.3761, "step": 29000 }, { "epoch": 64.31937898530634, "eval_loss": 6.3896894454956055, "eval_runtime": 172.9657, "eval_samples_per_second": 57.815, "eval_steps_per_second": 7.227, "step": 29000 }, { "epoch": 64.54116994732465, "grad_norm": 0.39311668276786804, "learning_rate": 3.100334448160535e-06, "loss": 6.3753, "step": 29100 }, { "epoch": 64.54116994732465, "eval_loss": 6.3889970779418945, "eval_runtime": 175.6814, "eval_samples_per_second": 56.921, "eval_steps_per_second": 7.115, "step": 29100 }, { "epoch": 64.76296090934295, "grad_norm": 0.31582164764404297, "learning_rate": 3.0903010033444818e-06, "loss": 6.3763, "step": 29200 }, { "epoch": 64.76296090934295, "eval_loss": 6.388535976409912, "eval_runtime": 173.1769, "eval_samples_per_second": 57.744, "eval_steps_per_second": 7.218, "step": 29200 }, { "epoch": 64.98475187136124, "grad_norm": 0.4400019347667694, "learning_rate": 3.080267558528428e-06, "loss": 6.3752, "step": 29300 }, { "epoch": 64.98475187136124, "eval_loss": 6.38809061050415, "eval_runtime": 175.7068, "eval_samples_per_second": 56.913, "eval_steps_per_second": 7.114, "step": 29300 }, { "epoch": 65.20654283337954, "grad_norm": 0.3871637284755707, "learning_rate": 3.0702341137123744e-06, "loss": 6.3761, "step": 29400 }, { "epoch": 65.20654283337954, "eval_loss": 6.3887200355529785, "eval_runtime": 175.6633, "eval_samples_per_second": 56.927, "eval_steps_per_second": 7.116, "step": 29400 }, { "epoch": 65.42833379539783, "grad_norm": 0.3527097702026367, "learning_rate": 3.060200668896321e-06, "loss": 6.375, "step": 29500 }, { "epoch": 65.42833379539783, "eval_loss": 6.385637283325195, "eval_runtime": 175.6827, "eval_samples_per_second": 56.921, "eval_steps_per_second": 7.115, "step": 29500 }, { "epoch": 65.65012475741614, "grad_norm": 0.3956551253795624, "learning_rate": 3.0501672240802675e-06, "loss": 6.3763, "step": 29600 }, { "epoch": 65.65012475741614, "eval_loss": 6.388696670532227, "eval_runtime": 175.6777, "eval_samples_per_second": 56.922, "eval_steps_per_second": 7.115, "step": 29600 }, { "epoch": 65.87191571943443, "grad_norm": 0.317006379365921, "learning_rate": 3.0401337792642143e-06, "loss": 6.3747, "step": 29700 }, { "epoch": 65.87191571943443, "eval_loss": 6.386444568634033, "eval_runtime": 175.0294, "eval_samples_per_second": 57.133, "eval_steps_per_second": 7.142, "step": 29700 }, { "epoch": 66.09370668145273, "grad_norm": 0.29853495955467224, "learning_rate": 3.0301003344481606e-06, "loss": 6.3742, "step": 29800 }, { "epoch": 66.09370668145273, "eval_loss": 6.38703727722168, "eval_runtime": 173.1862, "eval_samples_per_second": 57.741, "eval_steps_per_second": 7.218, "step": 29800 }, { "epoch": 66.31549764347103, "grad_norm": 0.3481820225715637, "learning_rate": 3.020066889632107e-06, "loss": 6.3756, "step": 29900 }, { "epoch": 66.31549764347103, "eval_loss": 6.385500907897949, "eval_runtime": 175.6985, "eval_samples_per_second": 56.916, "eval_steps_per_second": 7.114, "step": 29900 }, { "epoch": 66.53728860548932, "grad_norm": 0.3467808961868286, "learning_rate": 3.0100334448160537e-06, "loss": 6.3755, "step": 30000 }, { "epoch": 66.53728860548932, "eval_loss": 6.389315605163574, "eval_runtime": 173.203, "eval_samples_per_second": 57.736, "eval_steps_per_second": 7.217, "step": 30000 }, { "epoch": 66.75907956750763, "grad_norm": 0.3288291096687317, "learning_rate": 3e-06, "loss": 6.3762, "step": 30100 }, { "epoch": 66.75907956750763, "eval_loss": 6.389954090118408, "eval_runtime": 175.5948, "eval_samples_per_second": 56.949, "eval_steps_per_second": 7.119, "step": 30100 }, { "epoch": 66.98087052952592, "grad_norm": 0.3450663387775421, "learning_rate": 2.9899665551839464e-06, "loss": 6.3749, "step": 30200 }, { "epoch": 66.98087052952592, "eval_loss": 6.388577938079834, "eval_runtime": 173.1084, "eval_samples_per_second": 57.767, "eval_steps_per_second": 7.221, "step": 30200 }, { "epoch": 67.20266149154422, "grad_norm": 0.4391154646873474, "learning_rate": 2.979933110367893e-06, "loss": 6.3757, "step": 30300 }, { "epoch": 67.20266149154422, "eval_loss": 6.3895344734191895, "eval_runtime": 175.4784, "eval_samples_per_second": 56.987, "eval_steps_per_second": 7.123, "step": 30300 }, { "epoch": 67.42445245356252, "grad_norm": 0.4594007730484009, "learning_rate": 2.9698996655518395e-06, "loss": 6.3742, "step": 30400 }, { "epoch": 67.42445245356252, "eval_loss": 6.387800216674805, "eval_runtime": 175.4386, "eval_samples_per_second": 57.0, "eval_steps_per_second": 7.125, "step": 30400 }, { "epoch": 67.64624341558081, "grad_norm": 0.2892398238182068, "learning_rate": 2.959866220735786e-06, "loss": 6.3758, "step": 30500 }, { "epoch": 67.64624341558081, "eval_loss": 6.3860883712768555, "eval_runtime": 175.4706, "eval_samples_per_second": 56.99, "eval_steps_per_second": 7.124, "step": 30500 }, { "epoch": 67.86803437759912, "grad_norm": 0.5031465888023376, "learning_rate": 2.9498327759197326e-06, "loss": 6.3738, "step": 30600 }, { "epoch": 67.86803437759912, "eval_loss": 6.38906192779541, "eval_runtime": 175.4554, "eval_samples_per_second": 56.995, "eval_steps_per_second": 7.124, "step": 30600 }, { "epoch": 68.0898253396174, "grad_norm": 0.2999316453933716, "learning_rate": 2.939799331103679e-06, "loss": 6.3732, "step": 30700 }, { "epoch": 68.0898253396174, "eval_loss": 6.387207984924316, "eval_runtime": 172.9284, "eval_samples_per_second": 57.827, "eval_steps_per_second": 7.228, "step": 30700 }, { "epoch": 68.31161630163571, "grad_norm": 0.3920566737651825, "learning_rate": 2.9297658862876257e-06, "loss": 6.3746, "step": 30800 }, { "epoch": 68.31161630163571, "eval_loss": 6.388418197631836, "eval_runtime": 175.4686, "eval_samples_per_second": 56.99, "eval_steps_per_second": 7.124, "step": 30800 }, { "epoch": 68.53340726365401, "grad_norm": 0.3810490369796753, "learning_rate": 2.919732441471572e-06, "loss": 6.3736, "step": 30900 }, { "epoch": 68.53340726365401, "eval_loss": 6.382778167724609, "eval_runtime": 172.9448, "eval_samples_per_second": 57.822, "eval_steps_per_second": 7.228, "step": 30900 }, { "epoch": 68.7551982256723, "grad_norm": 0.282163143157959, "learning_rate": 2.9096989966555184e-06, "loss": 6.3764, "step": 31000 }, { "epoch": 68.7551982256723, "eval_loss": 6.3898420333862305, "eval_runtime": 175.8822, "eval_samples_per_second": 56.856, "eval_steps_per_second": 7.107, "step": 31000 }, { "epoch": 68.9769891876906, "grad_norm": 0.5345416069030762, "learning_rate": 2.899665551839465e-06, "loss": 6.3744, "step": 31100 }, { "epoch": 68.9769891876906, "eval_loss": 6.389834880828857, "eval_runtime": 173.048, "eval_samples_per_second": 57.787, "eval_steps_per_second": 7.223, "step": 31100 }, { "epoch": 69.1987801497089, "grad_norm": 0.2955686151981354, "learning_rate": 2.8896321070234115e-06, "loss": 6.3752, "step": 31200 }, { "epoch": 69.1987801497089, "eval_loss": 6.385989189147949, "eval_runtime": 175.4356, "eval_samples_per_second": 57.001, "eval_steps_per_second": 7.125, "step": 31200 }, { "epoch": 69.4205711117272, "grad_norm": 0.2998807430267334, "learning_rate": 2.879598662207358e-06, "loss": 6.3744, "step": 31300 }, { "epoch": 69.4205711117272, "eval_loss": 6.3874688148498535, "eval_runtime": 175.8432, "eval_samples_per_second": 56.869, "eval_steps_per_second": 7.109, "step": 31300 }, { "epoch": 69.64236207374549, "grad_norm": 0.5946409702301025, "learning_rate": 2.8695652173913046e-06, "loss": 6.3742, "step": 31400 }, { "epoch": 69.64236207374549, "eval_loss": 6.386292934417725, "eval_runtime": 175.7657, "eval_samples_per_second": 56.894, "eval_steps_per_second": 7.112, "step": 31400 }, { "epoch": 69.86415303576379, "grad_norm": 0.4089396595954895, "learning_rate": 2.859531772575251e-06, "loss": 6.3741, "step": 31500 }, { "epoch": 69.86415303576379, "eval_loss": 6.386563301086426, "eval_runtime": 175.832, "eval_samples_per_second": 56.872, "eval_steps_per_second": 7.109, "step": 31500 }, { "epoch": 70.0859439977821, "grad_norm": 0.4220736622810364, "learning_rate": 2.8494983277591977e-06, "loss": 6.3761, "step": 31600 }, { "epoch": 70.0859439977821, "eval_loss": 6.386425495147705, "eval_runtime": 175.4574, "eval_samples_per_second": 56.994, "eval_steps_per_second": 7.124, "step": 31600 }, { "epoch": 70.30773495980038, "grad_norm": 0.5009733438491821, "learning_rate": 2.839464882943144e-06, "loss": 6.3746, "step": 31700 }, { "epoch": 70.30773495980038, "eval_loss": 6.386416435241699, "eval_runtime": 175.5124, "eval_samples_per_second": 56.976, "eval_steps_per_second": 7.122, "step": 31700 }, { "epoch": 70.52952592181869, "grad_norm": 0.41243863105773926, "learning_rate": 2.8294314381270904e-06, "loss": 6.3738, "step": 31800 }, { "epoch": 70.52952592181869, "eval_loss": 6.388505935668945, "eval_runtime": 175.5511, "eval_samples_per_second": 56.963, "eval_steps_per_second": 7.12, "step": 31800 }, { "epoch": 70.75131688383698, "grad_norm": 0.3510850667953491, "learning_rate": 2.819397993311037e-06, "loss": 6.3754, "step": 31900 }, { "epoch": 70.75131688383698, "eval_loss": 6.388024806976318, "eval_runtime": 175.6891, "eval_samples_per_second": 56.919, "eval_steps_per_second": 7.115, "step": 31900 }, { "epoch": 70.97310784585528, "grad_norm": 0.2912569046020508, "learning_rate": 2.8093645484949835e-06, "loss": 6.374, "step": 32000 }, { "epoch": 70.97310784585528, "eval_loss": 6.385600566864014, "eval_runtime": 175.9407, "eval_samples_per_second": 56.837, "eval_steps_per_second": 7.105, "step": 32000 }, { "epoch": 71.19489880787359, "grad_norm": 0.3566642105579376, "learning_rate": 2.79933110367893e-06, "loss": 6.3728, "step": 32100 }, { "epoch": 71.19489880787359, "eval_loss": 6.384610652923584, "eval_runtime": 175.7319, "eval_samples_per_second": 56.905, "eval_steps_per_second": 7.113, "step": 32100 }, { "epoch": 71.41668976989187, "grad_norm": 0.36077818274497986, "learning_rate": 2.7892976588628766e-06, "loss": 6.3742, "step": 32200 }, { "epoch": 71.41668976989187, "eval_loss": 6.389194488525391, "eval_runtime": 173.108, "eval_samples_per_second": 57.767, "eval_steps_per_second": 7.221, "step": 32200 }, { "epoch": 71.63848073191018, "grad_norm": 0.4366162121295929, "learning_rate": 2.779264214046823e-06, "loss": 6.373, "step": 32300 }, { "epoch": 71.63848073191018, "eval_loss": 6.388595104217529, "eval_runtime": 175.5624, "eval_samples_per_second": 56.96, "eval_steps_per_second": 7.12, "step": 32300 }, { "epoch": 71.86027169392847, "grad_norm": 0.3485216498374939, "learning_rate": 2.7692307692307693e-06, "loss": 6.3744, "step": 32400 }, { "epoch": 71.86027169392847, "eval_loss": 6.38759708404541, "eval_runtime": 173.3825, "eval_samples_per_second": 57.676, "eval_steps_per_second": 7.209, "step": 32400 }, { "epoch": 72.08206265594677, "grad_norm": 0.41392314434051514, "learning_rate": 2.759197324414716e-06, "loss": 6.3733, "step": 32500 }, { "epoch": 72.08206265594677, "eval_loss": 6.388287544250488, "eval_runtime": 175.8186, "eval_samples_per_second": 56.877, "eval_steps_per_second": 7.11, "step": 32500 }, { "epoch": 72.30385361796507, "grad_norm": 0.38669446110725403, "learning_rate": 2.749163879598662e-06, "loss": 6.3736, "step": 32600 }, { "epoch": 72.30385361796507, "eval_loss": 6.387938499450684, "eval_runtime": 167.9516, "eval_samples_per_second": 59.541, "eval_steps_per_second": 7.443, "step": 32600 }, { "epoch": 72.52564457998336, "grad_norm": 0.42049235105514526, "learning_rate": 2.7391304347826087e-06, "loss": 6.3744, "step": 32700 }, { "epoch": 72.52564457998336, "eval_loss": 6.387884140014648, "eval_runtime": 175.7946, "eval_samples_per_second": 56.885, "eval_steps_per_second": 7.111, "step": 32700 }, { "epoch": 72.74743554200167, "grad_norm": 0.45259612798690796, "learning_rate": 2.729096989966555e-06, "loss": 6.3733, "step": 32800 }, { "epoch": 72.74743554200167, "eval_loss": 6.383664608001709, "eval_runtime": 175.4633, "eval_samples_per_second": 56.992, "eval_steps_per_second": 7.124, "step": 32800 }, { "epoch": 72.96922650401996, "grad_norm": 0.35638928413391113, "learning_rate": 2.7190635451505014e-06, "loss": 6.3752, "step": 32900 }, { "epoch": 72.96922650401996, "eval_loss": 6.385019302368164, "eval_runtime": 175.4207, "eval_samples_per_second": 57.006, "eval_steps_per_second": 7.126, "step": 32900 }, { "epoch": 73.19101746603826, "grad_norm": 0.4410247206687927, "learning_rate": 2.709030100334448e-06, "loss": 6.3739, "step": 33000 }, { "epoch": 73.19101746603826, "eval_loss": 6.385441303253174, "eval_runtime": 175.4138, "eval_samples_per_second": 57.008, "eval_steps_per_second": 7.126, "step": 33000 }, { "epoch": 73.41280842805655, "grad_norm": 0.2410985231399536, "learning_rate": 2.6989966555183945e-06, "loss": 6.3728, "step": 33100 }, { "epoch": 73.41280842805655, "eval_loss": 6.38595724105835, "eval_runtime": 175.8764, "eval_samples_per_second": 56.858, "eval_steps_per_second": 7.107, "step": 33100 }, { "epoch": 73.63459939007485, "grad_norm": 0.43327927589416504, "learning_rate": 2.6889632107023413e-06, "loss": 6.3742, "step": 33200 }, { "epoch": 73.63459939007485, "eval_loss": 6.387829780578613, "eval_runtime": 175.8542, "eval_samples_per_second": 56.865, "eval_steps_per_second": 7.108, "step": 33200 }, { "epoch": 73.85639035209316, "grad_norm": 0.2946775555610657, "learning_rate": 2.6789297658862876e-06, "loss": 6.3751, "step": 33300 }, { "epoch": 73.85639035209316, "eval_loss": 6.385344505310059, "eval_runtime": 173.3421, "eval_samples_per_second": 57.689, "eval_steps_per_second": 7.211, "step": 33300 }, { "epoch": 74.07818131411145, "grad_norm": 0.33265405893325806, "learning_rate": 2.668896321070234e-06, "loss": 6.3737, "step": 33400 }, { "epoch": 74.07818131411145, "eval_loss": 6.38824987411499, "eval_runtime": 173.3017, "eval_samples_per_second": 57.703, "eval_steps_per_second": 7.213, "step": 33400 }, { "epoch": 74.29997227612975, "grad_norm": 0.40044334530830383, "learning_rate": 2.6588628762541807e-06, "loss": 6.3752, "step": 33500 }, { "epoch": 74.29997227612975, "eval_loss": 6.385106086730957, "eval_runtime": 175.777, "eval_samples_per_second": 56.89, "eval_steps_per_second": 7.111, "step": 33500 }, { "epoch": 74.52176323814804, "grad_norm": 0.3776157796382904, "learning_rate": 2.648829431438127e-06, "loss": 6.3739, "step": 33600 }, { "epoch": 74.52176323814804, "eval_loss": 6.387485980987549, "eval_runtime": 174.2601, "eval_samples_per_second": 57.385, "eval_steps_per_second": 7.173, "step": 33600 }, { "epoch": 74.74355420016634, "grad_norm": 0.33734750747680664, "learning_rate": 2.6387959866220734e-06, "loss": 6.3739, "step": 33700 }, { "epoch": 74.74355420016634, "eval_loss": 6.383073806762695, "eval_runtime": 175.8418, "eval_samples_per_second": 56.869, "eval_steps_per_second": 7.109, "step": 33700 }, { "epoch": 74.96534516218465, "grad_norm": 0.2771698534488678, "learning_rate": 2.62876254180602e-06, "loss": 6.3729, "step": 33800 }, { "epoch": 74.96534516218465, "eval_loss": 6.388527870178223, "eval_runtime": 174.0412, "eval_samples_per_second": 57.458, "eval_steps_per_second": 7.182, "step": 33800 }, { "epoch": 75.18713612420294, "grad_norm": 0.3911442458629608, "learning_rate": 2.6187290969899665e-06, "loss": 6.374, "step": 33900 }, { "epoch": 75.18713612420294, "eval_loss": 6.386963367462158, "eval_runtime": 172.9704, "eval_samples_per_second": 57.813, "eval_steps_per_second": 7.227, "step": 33900 }, { "epoch": 75.40892708622124, "grad_norm": 0.3304766118526459, "learning_rate": 2.6086956521739132e-06, "loss": 6.3746, "step": 34000 }, { "epoch": 75.40892708622124, "eval_loss": 6.386199951171875, "eval_runtime": 175.3788, "eval_samples_per_second": 57.019, "eval_steps_per_second": 7.127, "step": 34000 }, { "epoch": 75.63071804823953, "grad_norm": 0.4422440230846405, "learning_rate": 2.5986622073578596e-06, "loss": 6.3737, "step": 34100 }, { "epoch": 75.63071804823953, "eval_loss": 6.384350776672363, "eval_runtime": 173.0054, "eval_samples_per_second": 57.802, "eval_steps_per_second": 7.225, "step": 34100 }, { "epoch": 75.85250901025783, "grad_norm": 0.28921636939048767, "learning_rate": 2.588628762541806e-06, "loss": 6.3739, "step": 34200 }, { "epoch": 75.85250901025783, "eval_loss": 6.387299537658691, "eval_runtime": 175.5823, "eval_samples_per_second": 56.953, "eval_steps_per_second": 7.119, "step": 34200 }, { "epoch": 76.07429997227614, "grad_norm": 0.3911747336387634, "learning_rate": 2.5785953177257527e-06, "loss": 6.3734, "step": 34300 }, { "epoch": 76.07429997227614, "eval_loss": 6.389584541320801, "eval_runtime": 172.7277, "eval_samples_per_second": 57.895, "eval_steps_per_second": 7.237, "step": 34300 }, { "epoch": 76.29609093429443, "grad_norm": 0.3622056245803833, "learning_rate": 2.568561872909699e-06, "loss": 6.3739, "step": 34400 }, { "epoch": 76.29609093429443, "eval_loss": 6.386180400848389, "eval_runtime": 175.4099, "eval_samples_per_second": 57.009, "eval_steps_per_second": 7.126, "step": 34400 }, { "epoch": 76.51788189631273, "grad_norm": 0.24905167520046234, "learning_rate": 2.5585284280936454e-06, "loss": 6.3746, "step": 34500 }, { "epoch": 76.51788189631273, "eval_loss": 6.383036136627197, "eval_runtime": 172.8585, "eval_samples_per_second": 57.851, "eval_steps_per_second": 7.231, "step": 34500 }, { "epoch": 76.73967285833102, "grad_norm": 0.3207278549671173, "learning_rate": 2.548494983277592e-06, "loss": 6.3749, "step": 34600 }, { "epoch": 76.73967285833102, "eval_loss": 6.3871564865112305, "eval_runtime": 175.7463, "eval_samples_per_second": 56.9, "eval_steps_per_second": 7.113, "step": 34600 }, { "epoch": 76.96146382034932, "grad_norm": 0.3537052273750305, "learning_rate": 2.5384615384615385e-06, "loss": 6.3719, "step": 34700 }, { "epoch": 76.96146382034932, "eval_loss": 6.384720325469971, "eval_runtime": 172.9583, "eval_samples_per_second": 57.817, "eval_steps_per_second": 7.227, "step": 34700 }, { "epoch": 77.18325478236761, "grad_norm": 0.4220789074897766, "learning_rate": 2.528428093645485e-06, "loss": 6.3736, "step": 34800 }, { "epoch": 77.18325478236761, "eval_loss": 6.384481906890869, "eval_runtime": 175.4683, "eval_samples_per_second": 56.99, "eval_steps_per_second": 7.124, "step": 34800 }, { "epoch": 77.40504574438592, "grad_norm": 0.3726615011692047, "learning_rate": 2.5183946488294316e-06, "loss": 6.3726, "step": 34900 }, { "epoch": 77.40504574438592, "eval_loss": 6.383063793182373, "eval_runtime": 175.4526, "eval_samples_per_second": 56.995, "eval_steps_per_second": 7.124, "step": 34900 }, { "epoch": 77.62683670640422, "grad_norm": 0.3583526909351349, "learning_rate": 2.508361204013378e-06, "loss": 6.3742, "step": 35000 }, { "epoch": 77.62683670640422, "eval_loss": 6.383593559265137, "eval_runtime": 175.5123, "eval_samples_per_second": 56.976, "eval_steps_per_second": 7.122, "step": 35000 }, { "epoch": 77.84862766842251, "grad_norm": 0.31663283705711365, "learning_rate": 2.4983277591973247e-06, "loss": 6.3746, "step": 35100 }, { "epoch": 77.84862766842251, "eval_loss": 6.385804653167725, "eval_runtime": 175.8899, "eval_samples_per_second": 56.854, "eval_steps_per_second": 7.107, "step": 35100 }, { "epoch": 78.07041863044081, "grad_norm": 0.3281422555446625, "learning_rate": 2.488294314381271e-06, "loss": 6.374, "step": 35200 }, { "epoch": 78.07041863044081, "eval_loss": 6.382884979248047, "eval_runtime": 176.0174, "eval_samples_per_second": 56.813, "eval_steps_per_second": 7.102, "step": 35200 }, { "epoch": 78.2922095924591, "grad_norm": 0.35885676741600037, "learning_rate": 2.4782608695652173e-06, "loss": 6.3737, "step": 35300 }, { "epoch": 78.2922095924591, "eval_loss": 6.38320255279541, "eval_runtime": 175.5121, "eval_samples_per_second": 56.976, "eval_steps_per_second": 7.122, "step": 35300 }, { "epoch": 78.5140005544774, "grad_norm": 0.40301480889320374, "learning_rate": 2.468227424749164e-06, "loss": 6.3742, "step": 35400 }, { "epoch": 78.5140005544774, "eval_loss": 6.386338233947754, "eval_runtime": 175.6402, "eval_samples_per_second": 56.935, "eval_steps_per_second": 7.117, "step": 35400 }, { "epoch": 78.73579151649571, "grad_norm": 0.3202325701713562, "learning_rate": 2.4581939799331104e-06, "loss": 6.3736, "step": 35500 }, { "epoch": 78.73579151649571, "eval_loss": 6.385340690612793, "eval_runtime": 173.2176, "eval_samples_per_second": 57.731, "eval_steps_per_second": 7.216, "step": 35500 }, { "epoch": 78.957582478514, "grad_norm": 0.370046466588974, "learning_rate": 2.4481605351170568e-06, "loss": 6.3733, "step": 35600 }, { "epoch": 78.957582478514, "eval_loss": 6.3839592933654785, "eval_runtime": 173.1936, "eval_samples_per_second": 57.739, "eval_steps_per_second": 7.217, "step": 35600 }, { "epoch": 79.1793734405323, "grad_norm": 0.3682570457458496, "learning_rate": 2.4381270903010035e-06, "loss": 6.373, "step": 35700 }, { "epoch": 79.1793734405323, "eval_loss": 6.384267807006836, "eval_runtime": 175.7512, "eval_samples_per_second": 56.899, "eval_steps_per_second": 7.112, "step": 35700 }, { "epoch": 79.40116440255059, "grad_norm": 0.42555299401283264, "learning_rate": 2.42809364548495e-06, "loss": 6.3724, "step": 35800 }, { "epoch": 79.40116440255059, "eval_loss": 6.386261940002441, "eval_runtime": 173.3473, "eval_samples_per_second": 57.688, "eval_steps_per_second": 7.211, "step": 35800 }, { "epoch": 79.6229553645689, "grad_norm": 0.4109131693840027, "learning_rate": 2.4180602006688962e-06, "loss": 6.3738, "step": 35900 }, { "epoch": 79.6229553645689, "eval_loss": 6.385996341705322, "eval_runtime": 175.868, "eval_samples_per_second": 56.861, "eval_steps_per_second": 7.108, "step": 35900 }, { "epoch": 79.84474632658718, "grad_norm": 0.4770185351371765, "learning_rate": 2.408026755852843e-06, "loss": 6.373, "step": 36000 }, { "epoch": 79.84474632658718, "eval_loss": 6.385003566741943, "eval_runtime": 175.9258, "eval_samples_per_second": 56.842, "eval_steps_per_second": 7.105, "step": 36000 }, { "epoch": 80.06653728860549, "grad_norm": 0.31983354687690735, "learning_rate": 2.3979933110367893e-06, "loss": 6.3721, "step": 36100 }, { "epoch": 80.06653728860549, "eval_loss": 6.384030818939209, "eval_runtime": 175.9559, "eval_samples_per_second": 56.832, "eval_steps_per_second": 7.104, "step": 36100 }, { "epoch": 80.28832825062379, "grad_norm": 0.42961299419403076, "learning_rate": 2.387959866220736e-06, "loss": 6.3712, "step": 36200 }, { "epoch": 80.28832825062379, "eval_loss": 6.385640621185303, "eval_runtime": 173.4173, "eval_samples_per_second": 57.664, "eval_steps_per_second": 7.208, "step": 36200 }, { "epoch": 80.51011921264208, "grad_norm": 0.31057417392730713, "learning_rate": 2.3779264214046824e-06, "loss": 6.3731, "step": 36300 }, { "epoch": 80.51011921264208, "eval_loss": 6.384836196899414, "eval_runtime": 175.866, "eval_samples_per_second": 56.861, "eval_steps_per_second": 7.108, "step": 36300 }, { "epoch": 80.73191017466038, "grad_norm": 0.2894494831562042, "learning_rate": 2.3678929765886288e-06, "loss": 6.3741, "step": 36400 }, { "epoch": 80.73191017466038, "eval_loss": 6.385368824005127, "eval_runtime": 175.9096, "eval_samples_per_second": 56.847, "eval_steps_per_second": 7.106, "step": 36400 }, { "epoch": 80.95370113667867, "grad_norm": 0.4780093729496002, "learning_rate": 2.3578595317725755e-06, "loss": 6.3749, "step": 36500 }, { "epoch": 80.95370113667867, "eval_loss": 6.384347438812256, "eval_runtime": 175.8521, "eval_samples_per_second": 56.866, "eval_steps_per_second": 7.108, "step": 36500 }, { "epoch": 81.17549209869698, "grad_norm": 0.31205832958221436, "learning_rate": 2.347826086956522e-06, "loss": 6.3743, "step": 36600 }, { "epoch": 81.17549209869698, "eval_loss": 6.385135173797607, "eval_runtime": 175.8923, "eval_samples_per_second": 56.853, "eval_steps_per_second": 7.107, "step": 36600 }, { "epoch": 81.39728306071528, "grad_norm": 0.3318498134613037, "learning_rate": 2.337792642140468e-06, "loss": 6.3735, "step": 36700 }, { "epoch": 81.39728306071528, "eval_loss": 6.3830389976501465, "eval_runtime": 174.576, "eval_samples_per_second": 57.282, "eval_steps_per_second": 7.16, "step": 36700 }, { "epoch": 81.61907402273357, "grad_norm": 0.35717305541038513, "learning_rate": 2.327759197324415e-06, "loss": 6.3726, "step": 36800 }, { "epoch": 81.61907402273357, "eval_loss": 6.384567737579346, "eval_runtime": 173.0226, "eval_samples_per_second": 57.796, "eval_steps_per_second": 7.224, "step": 36800 }, { "epoch": 81.84086498475187, "grad_norm": 0.36196058988571167, "learning_rate": 2.3177257525083613e-06, "loss": 6.3734, "step": 36900 }, { "epoch": 81.84086498475187, "eval_loss": 6.385857582092285, "eval_runtime": 175.4192, "eval_samples_per_second": 57.006, "eval_steps_per_second": 7.126, "step": 36900 }, { "epoch": 82.06265594677016, "grad_norm": 0.34454473853111267, "learning_rate": 2.307692307692308e-06, "loss": 6.3732, "step": 37000 }, { "epoch": 82.06265594677016, "eval_loss": 6.384513854980469, "eval_runtime": 175.4568, "eval_samples_per_second": 56.994, "eval_steps_per_second": 7.124, "step": 37000 }, { "epoch": 82.28444690878847, "grad_norm": 0.3330673575401306, "learning_rate": 2.2976588628762544e-06, "loss": 6.3717, "step": 37100 }, { "epoch": 82.28444690878847, "eval_loss": 6.383497714996338, "eval_runtime": 173.2512, "eval_samples_per_second": 57.72, "eval_steps_per_second": 7.215, "step": 37100 }, { "epoch": 82.50623787080677, "grad_norm": 0.40681159496307373, "learning_rate": 2.2876254180602008e-06, "loss": 6.3728, "step": 37200 }, { "epoch": 82.50623787080677, "eval_loss": 6.38515567779541, "eval_runtime": 175.8178, "eval_samples_per_second": 56.877, "eval_steps_per_second": 7.11, "step": 37200 }, { "epoch": 82.72802883282506, "grad_norm": 0.3258204162120819, "learning_rate": 2.2775919732441475e-06, "loss": 6.3743, "step": 37300 }, { "epoch": 82.72802883282506, "eval_loss": 6.38502311706543, "eval_runtime": 173.1187, "eval_samples_per_second": 57.764, "eval_steps_per_second": 7.22, "step": 37300 }, { "epoch": 82.94981979484336, "grad_norm": 0.37041613459587097, "learning_rate": 2.267558528428094e-06, "loss": 6.3728, "step": 37400 }, { "epoch": 82.94981979484336, "eval_loss": 6.3821611404418945, "eval_runtime": 175.5559, "eval_samples_per_second": 56.962, "eval_steps_per_second": 7.12, "step": 37400 }, { "epoch": 83.17161075686165, "grad_norm": 0.33911818265914917, "learning_rate": 2.25752508361204e-06, "loss": 6.3738, "step": 37500 }, { "epoch": 83.17161075686165, "eval_loss": 6.386144638061523, "eval_runtime": 173.2969, "eval_samples_per_second": 57.704, "eval_steps_per_second": 7.213, "step": 37500 }, { "epoch": 83.39340171887996, "grad_norm": 0.48508045077323914, "learning_rate": 2.2474916387959865e-06, "loss": 6.3728, "step": 37600 }, { "epoch": 83.39340171887996, "eval_loss": 6.383870601654053, "eval_runtime": 175.8417, "eval_samples_per_second": 56.869, "eval_steps_per_second": 7.109, "step": 37600 }, { "epoch": 83.61519268089825, "grad_norm": 0.3488113284111023, "learning_rate": 2.237458193979933e-06, "loss": 6.3726, "step": 37700 }, { "epoch": 83.61519268089825, "eval_loss": 6.385016441345215, "eval_runtime": 175.8197, "eval_samples_per_second": 56.876, "eval_steps_per_second": 7.11, "step": 37700 }, { "epoch": 83.83698364291655, "grad_norm": 0.3524182438850403, "learning_rate": 2.2274247491638796e-06, "loss": 6.3725, "step": 37800 }, { "epoch": 83.83698364291655, "eval_loss": 6.384798526763916, "eval_runtime": 175.4988, "eval_samples_per_second": 56.98, "eval_steps_per_second": 7.123, "step": 37800 }, { "epoch": 84.05877460493485, "grad_norm": 0.28423815965652466, "learning_rate": 2.217391304347826e-06, "loss": 6.374, "step": 37900 }, { "epoch": 84.05877460493485, "eval_loss": 6.387665748596191, "eval_runtime": 172.948, "eval_samples_per_second": 57.821, "eval_steps_per_second": 7.228, "step": 37900 }, { "epoch": 84.28056556695314, "grad_norm": 0.32828596234321594, "learning_rate": 2.2073578595317723e-06, "loss": 6.3724, "step": 38000 }, { "epoch": 84.28056556695314, "eval_loss": 6.383293628692627, "eval_runtime": 175.4508, "eval_samples_per_second": 56.996, "eval_steps_per_second": 7.125, "step": 38000 }, { "epoch": 84.50235652897145, "grad_norm": 0.33721184730529785, "learning_rate": 2.197324414715719e-06, "loss": 6.373, "step": 38100 }, { "epoch": 84.50235652897145, "eval_loss": 6.385343551635742, "eval_runtime": 175.531, "eval_samples_per_second": 56.97, "eval_steps_per_second": 7.121, "step": 38100 }, { "epoch": 84.72414749098974, "grad_norm": 0.2766687273979187, "learning_rate": 2.1872909698996654e-06, "loss": 6.3728, "step": 38200 }, { "epoch": 84.72414749098974, "eval_loss": 6.38714599609375, "eval_runtime": 175.5001, "eval_samples_per_second": 56.98, "eval_steps_per_second": 7.123, "step": 38200 }, { "epoch": 84.94593845300804, "grad_norm": 0.26238977909088135, "learning_rate": 2.177257525083612e-06, "loss": 6.3733, "step": 38300 }, { "epoch": 84.94593845300804, "eval_loss": 6.385676383972168, "eval_runtime": 175.6778, "eval_samples_per_second": 56.922, "eval_steps_per_second": 7.115, "step": 38300 }, { "epoch": 85.16772941502634, "grad_norm": 0.2862393260002136, "learning_rate": 2.1672240802675585e-06, "loss": 6.3729, "step": 38400 }, { "epoch": 85.16772941502634, "eval_loss": 6.384363174438477, "eval_runtime": 175.3945, "eval_samples_per_second": 57.014, "eval_steps_per_second": 7.127, "step": 38400 }, { "epoch": 85.38952037704463, "grad_norm": 0.34560856223106384, "learning_rate": 2.157190635451505e-06, "loss": 6.3732, "step": 38500 }, { "epoch": 85.38952037704463, "eval_loss": 6.383378982543945, "eval_runtime": 172.9454, "eval_samples_per_second": 57.822, "eval_steps_per_second": 7.228, "step": 38500 }, { "epoch": 85.61131133906294, "grad_norm": 0.31079375743865967, "learning_rate": 2.1471571906354516e-06, "loss": 6.373, "step": 38600 }, { "epoch": 85.61131133906294, "eval_loss": 6.383601665496826, "eval_runtime": 175.4201, "eval_samples_per_second": 57.006, "eval_steps_per_second": 7.126, "step": 38600 }, { "epoch": 85.83310230108123, "grad_norm": 0.3083253800868988, "learning_rate": 2.137123745819398e-06, "loss": 6.3731, "step": 38700 }, { "epoch": 85.83310230108123, "eval_loss": 6.383668899536133, "eval_runtime": 175.9754, "eval_samples_per_second": 56.826, "eval_steps_per_second": 7.103, "step": 38700 }, { "epoch": 86.05489326309953, "grad_norm": 0.344168096780777, "learning_rate": 2.1270903010033443e-06, "loss": 6.3731, "step": 38800 }, { "epoch": 86.05489326309953, "eval_loss": 6.382165431976318, "eval_runtime": 173.0611, "eval_samples_per_second": 57.783, "eval_steps_per_second": 7.223, "step": 38800 }, { "epoch": 86.27668422511783, "grad_norm": 0.42378509044647217, "learning_rate": 2.117056856187291e-06, "loss": 6.3735, "step": 38900 }, { "epoch": 86.27668422511783, "eval_loss": 6.386937618255615, "eval_runtime": 175.7527, "eval_samples_per_second": 56.898, "eval_steps_per_second": 7.112, "step": 38900 }, { "epoch": 86.49847518713612, "grad_norm": 0.4086206555366516, "learning_rate": 2.1070234113712374e-06, "loss": 6.372, "step": 39000 }, { "epoch": 86.49847518713612, "eval_loss": 6.385149955749512, "eval_runtime": 172.9793, "eval_samples_per_second": 57.81, "eval_steps_per_second": 7.226, "step": 39000 }, { "epoch": 86.72026614915443, "grad_norm": 0.3867028057575226, "learning_rate": 2.0969899665551837e-06, "loss": 6.371, "step": 39100 }, { "epoch": 86.72026614915443, "eval_loss": 6.385136604309082, "eval_runtime": 175.5185, "eval_samples_per_second": 56.974, "eval_steps_per_second": 7.122, "step": 39100 }, { "epoch": 86.94205711117272, "grad_norm": 0.34638744592666626, "learning_rate": 2.0869565217391305e-06, "loss": 6.3723, "step": 39200 }, { "epoch": 86.94205711117272, "eval_loss": 6.382205486297607, "eval_runtime": 172.979, "eval_samples_per_second": 57.81, "eval_steps_per_second": 7.226, "step": 39200 }, { "epoch": 87.16384807319102, "grad_norm": 0.45395034551620483, "learning_rate": 2.076923076923077e-06, "loss": 6.374, "step": 39300 }, { "epoch": 87.16384807319102, "eval_loss": 6.383747100830078, "eval_runtime": 175.4954, "eval_samples_per_second": 56.982, "eval_steps_per_second": 7.123, "step": 39300 }, { "epoch": 87.38563903520931, "grad_norm": 0.2925475537776947, "learning_rate": 2.0668896321070236e-06, "loss": 6.3746, "step": 39400 }, { "epoch": 87.38563903520931, "eval_loss": 6.3860931396484375, "eval_runtime": 172.9827, "eval_samples_per_second": 57.809, "eval_steps_per_second": 7.226, "step": 39400 }, { "epoch": 87.60742999722761, "grad_norm": 0.25185534358024597, "learning_rate": 2.05685618729097e-06, "loss": 6.3721, "step": 39500 }, { "epoch": 87.60742999722761, "eval_loss": 6.383828163146973, "eval_runtime": 175.4682, "eval_samples_per_second": 56.99, "eval_steps_per_second": 7.124, "step": 39500 }, { "epoch": 87.82922095924592, "grad_norm": 0.35766276717185974, "learning_rate": 2.0468227424749163e-06, "loss": 6.3713, "step": 39600 }, { "epoch": 87.82922095924592, "eval_loss": 6.383662700653076, "eval_runtime": 173.3378, "eval_samples_per_second": 57.691, "eval_steps_per_second": 7.211, "step": 39600 }, { "epoch": 88.0510119212642, "grad_norm": 0.31199392676353455, "learning_rate": 2.036789297658863e-06, "loss": 6.3717, "step": 39700 }, { "epoch": 88.0510119212642, "eval_loss": 6.383730411529541, "eval_runtime": 175.0814, "eval_samples_per_second": 57.116, "eval_steps_per_second": 7.14, "step": 39700 }, { "epoch": 88.27280288328251, "grad_norm": 0.3334641754627228, "learning_rate": 2.0267558528428094e-06, "loss": 6.372, "step": 39800 }, { "epoch": 88.27280288328251, "eval_loss": 6.381414890289307, "eval_runtime": 172.9715, "eval_samples_per_second": 57.813, "eval_steps_per_second": 7.227, "step": 39800 }, { "epoch": 88.4945938453008, "grad_norm": 0.5019832849502563, "learning_rate": 2.0167224080267557e-06, "loss": 6.3721, "step": 39900 }, { "epoch": 88.4945938453008, "eval_loss": 6.383211612701416, "eval_runtime": 175.5157, "eval_samples_per_second": 56.975, "eval_steps_per_second": 7.122, "step": 39900 }, { "epoch": 88.7163848073191, "grad_norm": 0.4383368194103241, "learning_rate": 2.0066889632107025e-06, "loss": 6.3731, "step": 40000 }, { "epoch": 88.7163848073191, "eval_loss": 6.385327339172363, "eval_runtime": 175.457, "eval_samples_per_second": 56.994, "eval_steps_per_second": 7.124, "step": 40000 }, { "epoch": 88.9381757693374, "grad_norm": 0.27147725224494934, "learning_rate": 1.996655518394649e-06, "loss": 6.3741, "step": 40100 }, { "epoch": 88.9381757693374, "eval_loss": 6.383349418640137, "eval_runtime": 173.4084, "eval_samples_per_second": 57.667, "eval_steps_per_second": 7.208, "step": 40100 }, { "epoch": 89.1599667313557, "grad_norm": 0.2689467966556549, "learning_rate": 1.986622073578595e-06, "loss": 6.3719, "step": 40200 }, { "epoch": 89.1599667313557, "eval_loss": 6.38576078414917, "eval_runtime": 173.3868, "eval_samples_per_second": 57.675, "eval_steps_per_second": 7.209, "step": 40200 }, { "epoch": 89.381757693374, "grad_norm": 0.3858400881290436, "learning_rate": 1.976588628762542e-06, "loss": 6.3722, "step": 40300 }, { "epoch": 89.381757693374, "eval_loss": 6.38473653793335, "eval_runtime": 174.5973, "eval_samples_per_second": 57.275, "eval_steps_per_second": 7.159, "step": 40300 }, { "epoch": 89.60354865539229, "grad_norm": 0.372864693403244, "learning_rate": 1.9665551839464883e-06, "loss": 6.3727, "step": 40400 }, { "epoch": 89.60354865539229, "eval_loss": 6.384860992431641, "eval_runtime": 175.5793, "eval_samples_per_second": 56.954, "eval_steps_per_second": 7.119, "step": 40400 }, { "epoch": 89.82533961741059, "grad_norm": 0.31050923466682434, "learning_rate": 1.956521739130435e-06, "loss": 6.3721, "step": 40500 }, { "epoch": 89.82533961741059, "eval_loss": 6.3831257820129395, "eval_runtime": 173.5084, "eval_samples_per_second": 57.634, "eval_steps_per_second": 7.204, "step": 40500 }, { "epoch": 90.0471305794289, "grad_norm": 0.31580400466918945, "learning_rate": 1.9464882943143814e-06, "loss": 6.3716, "step": 40600 }, { "epoch": 90.0471305794289, "eval_loss": 6.382096767425537, "eval_runtime": 175.7748, "eval_samples_per_second": 56.891, "eval_steps_per_second": 7.111, "step": 40600 }, { "epoch": 90.26892154144718, "grad_norm": 0.30445969104766846, "learning_rate": 1.9364548494983277e-06, "loss": 6.3738, "step": 40700 }, { "epoch": 90.26892154144718, "eval_loss": 6.383363246917725, "eval_runtime": 175.8814, "eval_samples_per_second": 56.856, "eval_steps_per_second": 7.107, "step": 40700 }, { "epoch": 90.49071250346549, "grad_norm": 0.3509177565574646, "learning_rate": 1.9264214046822745e-06, "loss": 6.3711, "step": 40800 }, { "epoch": 90.49071250346549, "eval_loss": 6.3791728019714355, "eval_runtime": 175.2022, "eval_samples_per_second": 57.077, "eval_steps_per_second": 7.135, "step": 40800 }, { "epoch": 90.71250346548378, "grad_norm": 0.2431792914867401, "learning_rate": 1.916387959866221e-06, "loss": 6.3717, "step": 40900 }, { "epoch": 90.71250346548378, "eval_loss": 6.383620262145996, "eval_runtime": 173.3604, "eval_samples_per_second": 57.683, "eval_steps_per_second": 7.21, "step": 40900 }, { "epoch": 90.93429442750208, "grad_norm": 0.3652373254299164, "learning_rate": 1.9063545150501674e-06, "loss": 6.3702, "step": 41000 }, { "epoch": 90.93429442750208, "eval_loss": 6.384062767028809, "eval_runtime": 175.9398, "eval_samples_per_second": 56.838, "eval_steps_per_second": 7.105, "step": 41000 }, { "epoch": 91.15608538952037, "grad_norm": 0.3120420575141907, "learning_rate": 1.896321070234114e-06, "loss": 6.3734, "step": 41100 }, { "epoch": 91.15608538952037, "eval_loss": 6.383402347564697, "eval_runtime": 173.0565, "eval_samples_per_second": 57.785, "eval_steps_per_second": 7.223, "step": 41100 }, { "epoch": 91.37787635153867, "grad_norm": 0.36098653078079224, "learning_rate": 1.8862876254180603e-06, "loss": 6.3731, "step": 41200 }, { "epoch": 91.37787635153867, "eval_loss": 6.384464263916016, "eval_runtime": 175.6772, "eval_samples_per_second": 56.923, "eval_steps_per_second": 7.115, "step": 41200 }, { "epoch": 91.59966731355698, "grad_norm": 0.2494172751903534, "learning_rate": 1.8762541806020068e-06, "loss": 6.3727, "step": 41300 }, { "epoch": 91.59966731355698, "eval_loss": 6.384238243103027, "eval_runtime": 175.6493, "eval_samples_per_second": 56.932, "eval_steps_per_second": 7.116, "step": 41300 }, { "epoch": 91.82145827557527, "grad_norm": 0.2649492025375366, "learning_rate": 1.8662207357859534e-06, "loss": 6.3715, "step": 41400 }, { "epoch": 91.82145827557527, "eval_loss": 6.386543273925781, "eval_runtime": 173.0007, "eval_samples_per_second": 57.803, "eval_steps_per_second": 7.225, "step": 41400 }, { "epoch": 92.04324923759357, "grad_norm": 0.31116828322410583, "learning_rate": 1.8561872909699e-06, "loss": 6.3714, "step": 41500 }, { "epoch": 92.04324923759357, "eval_loss": 6.384570121765137, "eval_runtime": 172.9737, "eval_samples_per_second": 57.812, "eval_steps_per_second": 7.227, "step": 41500 }, { "epoch": 92.26504019961186, "grad_norm": 0.39690667390823364, "learning_rate": 1.8461538461538462e-06, "loss": 6.3722, "step": 41600 }, { "epoch": 92.26504019961186, "eval_loss": 6.384208679199219, "eval_runtime": 175.5344, "eval_samples_per_second": 56.969, "eval_steps_per_second": 7.121, "step": 41600 }, { "epoch": 92.48683116163016, "grad_norm": 0.31385165452957153, "learning_rate": 1.8361204013377928e-06, "loss": 6.3727, "step": 41700 }, { "epoch": 92.48683116163016, "eval_loss": 6.382976055145264, "eval_runtime": 175.571, "eval_samples_per_second": 56.957, "eval_steps_per_second": 7.12, "step": 41700 }, { "epoch": 92.70862212364847, "grad_norm": 0.2589961886405945, "learning_rate": 1.8260869565217394e-06, "loss": 6.373, "step": 41800 }, { "epoch": 92.70862212364847, "eval_loss": 6.384578704833984, "eval_runtime": 172.8987, "eval_samples_per_second": 57.837, "eval_steps_per_second": 7.23, "step": 41800 }, { "epoch": 92.93041308566676, "grad_norm": 0.3754993677139282, "learning_rate": 1.8160535117056857e-06, "loss": 6.3716, "step": 41900 }, { "epoch": 92.93041308566676, "eval_loss": 6.387712478637695, "eval_runtime": 173.081, "eval_samples_per_second": 57.776, "eval_steps_per_second": 7.222, "step": 41900 }, { "epoch": 93.15220404768506, "grad_norm": 0.34123027324676514, "learning_rate": 1.8060200668896322e-06, "loss": 6.3719, "step": 42000 }, { "epoch": 93.15220404768506, "eval_loss": 6.387158393859863, "eval_runtime": 173.0202, "eval_samples_per_second": 57.797, "eval_steps_per_second": 7.225, "step": 42000 }, { "epoch": 93.37399500970335, "grad_norm": 0.28870150446891785, "learning_rate": 1.7959866220735788e-06, "loss": 6.3717, "step": 42100 }, { "epoch": 93.37399500970335, "eval_loss": 6.384382247924805, "eval_runtime": 175.9359, "eval_samples_per_second": 56.839, "eval_steps_per_second": 7.105, "step": 42100 }, { "epoch": 93.59578597172165, "grad_norm": 0.33736997842788696, "learning_rate": 1.7859531772575253e-06, "loss": 6.3731, "step": 42200 }, { "epoch": 93.59578597172165, "eval_loss": 6.384626865386963, "eval_runtime": 172.9425, "eval_samples_per_second": 57.823, "eval_steps_per_second": 7.228, "step": 42200 }, { "epoch": 93.81757693373996, "grad_norm": 0.30697163939476013, "learning_rate": 1.7759197324414717e-06, "loss": 6.3731, "step": 42300 }, { "epoch": 93.81757693373996, "eval_loss": 6.384149074554443, "eval_runtime": 175.533, "eval_samples_per_second": 56.969, "eval_steps_per_second": 7.121, "step": 42300 }, { "epoch": 94.03936789575825, "grad_norm": 0.31292060017585754, "learning_rate": 1.7658862876254182e-06, "loss": 6.372, "step": 42400 }, { "epoch": 94.03936789575825, "eval_loss": 6.38083553314209, "eval_runtime": 173.0184, "eval_samples_per_second": 57.797, "eval_steps_per_second": 7.225, "step": 42400 }, { "epoch": 94.26115885777655, "grad_norm": 0.3728470504283905, "learning_rate": 1.7558528428093648e-06, "loss": 6.3713, "step": 42500 }, { "epoch": 94.26115885777655, "eval_loss": 6.381670951843262, "eval_runtime": 175.3914, "eval_samples_per_second": 57.015, "eval_steps_per_second": 7.127, "step": 42500 }, { "epoch": 94.48294981979484, "grad_norm": 0.44780856370925903, "learning_rate": 1.745819397993311e-06, "loss": 6.3718, "step": 42600 }, { "epoch": 94.48294981979484, "eval_loss": 6.385097503662109, "eval_runtime": 175.3778, "eval_samples_per_second": 57.02, "eval_steps_per_second": 7.127, "step": 42600 }, { "epoch": 94.70474078181314, "grad_norm": 0.29420205950737, "learning_rate": 1.7357859531772575e-06, "loss": 6.3709, "step": 42700 }, { "epoch": 94.70474078181314, "eval_loss": 6.382612705230713, "eval_runtime": 173.3858, "eval_samples_per_second": 57.675, "eval_steps_per_second": 7.209, "step": 42700 }, { "epoch": 94.92653174383143, "grad_norm": 0.43360549211502075, "learning_rate": 1.7257525083612038e-06, "loss": 6.3708, "step": 42800 }, { "epoch": 94.92653174383143, "eval_loss": 6.382971286773682, "eval_runtime": 172.9207, "eval_samples_per_second": 57.83, "eval_steps_per_second": 7.229, "step": 42800 }, { "epoch": 95.14832270584974, "grad_norm": 0.29865312576293945, "learning_rate": 1.7157190635451504e-06, "loss": 6.372, "step": 42900 }, { "epoch": 95.14832270584974, "eval_loss": 6.3829755783081055, "eval_runtime": 175.5167, "eval_samples_per_second": 56.975, "eval_steps_per_second": 7.122, "step": 42900 }, { "epoch": 95.37011366786804, "grad_norm": 0.32399508357048035, "learning_rate": 1.705685618729097e-06, "loss": 6.3712, "step": 43000 }, { "epoch": 95.37011366786804, "eval_loss": 6.381554126739502, "eval_runtime": 172.9843, "eval_samples_per_second": 57.809, "eval_steps_per_second": 7.226, "step": 43000 }, { "epoch": 95.59190462988633, "grad_norm": 0.2875135540962219, "learning_rate": 1.6956521739130435e-06, "loss": 6.3709, "step": 43100 }, { "epoch": 95.59190462988633, "eval_loss": 6.381914138793945, "eval_runtime": 175.5546, "eval_samples_per_second": 56.962, "eval_steps_per_second": 7.12, "step": 43100 }, { "epoch": 95.81369559190463, "grad_norm": 0.4401540756225586, "learning_rate": 1.6856187290969898e-06, "loss": 6.3723, "step": 43200 }, { "epoch": 95.81369559190463, "eval_loss": 6.383592128753662, "eval_runtime": 175.584, "eval_samples_per_second": 56.953, "eval_steps_per_second": 7.119, "step": 43200 }, { "epoch": 96.03548655392292, "grad_norm": 0.2576783001422882, "learning_rate": 1.6755852842809363e-06, "loss": 6.3722, "step": 43300 }, { "epoch": 96.03548655392292, "eval_loss": 6.383729457855225, "eval_runtime": 175.4696, "eval_samples_per_second": 56.99, "eval_steps_per_second": 7.124, "step": 43300 }, { "epoch": 96.25727751594123, "grad_norm": 0.3146987855434418, "learning_rate": 1.665551839464883e-06, "loss": 6.3716, "step": 43400 }, { "epoch": 96.25727751594123, "eval_loss": 6.380384922027588, "eval_runtime": 175.0534, "eval_samples_per_second": 57.125, "eval_steps_per_second": 7.141, "step": 43400 }, { "epoch": 96.47906847795953, "grad_norm": 0.3195679485797882, "learning_rate": 1.6555183946488294e-06, "loss": 6.3714, "step": 43500 }, { "epoch": 96.47906847795953, "eval_loss": 6.382904529571533, "eval_runtime": 175.5685, "eval_samples_per_second": 56.958, "eval_steps_per_second": 7.12, "step": 43500 }, { "epoch": 96.70085943997782, "grad_norm": 0.2415214329957962, "learning_rate": 1.6454849498327758e-06, "loss": 6.3711, "step": 43600 }, { "epoch": 96.70085943997782, "eval_loss": 6.380964279174805, "eval_runtime": 173.0267, "eval_samples_per_second": 57.795, "eval_steps_per_second": 7.224, "step": 43600 }, { "epoch": 96.92265040199612, "grad_norm": 0.40489473938941956, "learning_rate": 1.6354515050167223e-06, "loss": 6.3726, "step": 43700 }, { "epoch": 96.92265040199612, "eval_loss": 6.381808757781982, "eval_runtime": 173.1061, "eval_samples_per_second": 57.768, "eval_steps_per_second": 7.221, "step": 43700 }, { "epoch": 97.14444136401441, "grad_norm": 0.30804529786109924, "learning_rate": 1.6254180602006689e-06, "loss": 6.372, "step": 43800 }, { "epoch": 97.14444136401441, "eval_loss": 6.384749889373779, "eval_runtime": 175.6167, "eval_samples_per_second": 56.942, "eval_steps_per_second": 7.118, "step": 43800 }, { "epoch": 97.36623232603272, "grad_norm": 0.31247368454933167, "learning_rate": 1.6153846153846154e-06, "loss": 6.3738, "step": 43900 }, { "epoch": 97.36623232603272, "eval_loss": 6.383345127105713, "eval_runtime": 172.9449, "eval_samples_per_second": 57.822, "eval_steps_per_second": 7.228, "step": 43900 }, { "epoch": 97.588023288051, "grad_norm": 0.3146020174026489, "learning_rate": 1.6053511705685618e-06, "loss": 6.3736, "step": 44000 }, { "epoch": 97.588023288051, "eval_loss": 6.38405179977417, "eval_runtime": 175.4959, "eval_samples_per_second": 56.981, "eval_steps_per_second": 7.123, "step": 44000 }, { "epoch": 97.80981425006931, "grad_norm": 0.30886611342430115, "learning_rate": 1.5953177257525083e-06, "loss": 6.3706, "step": 44100 }, { "epoch": 97.80981425006931, "eval_loss": 6.381131172180176, "eval_runtime": 172.9957, "eval_samples_per_second": 57.805, "eval_steps_per_second": 7.226, "step": 44100 }, { "epoch": 98.03160521208761, "grad_norm": 0.3250170648097992, "learning_rate": 1.5852842809364549e-06, "loss": 6.3711, "step": 44200 }, { "epoch": 98.03160521208761, "eval_loss": 6.382991313934326, "eval_runtime": 175.9006, "eval_samples_per_second": 56.85, "eval_steps_per_second": 7.106, "step": 44200 }, { "epoch": 98.2533961741059, "grad_norm": 0.2637650966644287, "learning_rate": 1.5752508361204012e-06, "loss": 6.3721, "step": 44300 }, { "epoch": 98.2533961741059, "eval_loss": 6.385432243347168, "eval_runtime": 175.8265, "eval_samples_per_second": 56.874, "eval_steps_per_second": 7.109, "step": 44300 }, { "epoch": 98.4751871361242, "grad_norm": 0.3357675075531006, "learning_rate": 1.5652173913043478e-06, "loss": 6.371, "step": 44400 }, { "epoch": 98.4751871361242, "eval_loss": 6.385194301605225, "eval_runtime": 175.8373, "eval_samples_per_second": 56.871, "eval_steps_per_second": 7.109, "step": 44400 }, { "epoch": 98.6969780981425, "grad_norm": 0.3793193995952606, "learning_rate": 1.5551839464882943e-06, "loss": 6.3717, "step": 44500 }, { "epoch": 98.6969780981425, "eval_loss": 6.382778167724609, "eval_runtime": 173.4199, "eval_samples_per_second": 57.664, "eval_steps_per_second": 7.208, "step": 44500 }, { "epoch": 98.9187690601608, "grad_norm": 0.3075515329837799, "learning_rate": 1.5451505016722409e-06, "loss": 6.3705, "step": 44600 }, { "epoch": 98.9187690601608, "eval_loss": 6.384821891784668, "eval_runtime": 175.4722, "eval_samples_per_second": 56.989, "eval_steps_per_second": 7.124, "step": 44600 }, { "epoch": 99.1405600221791, "grad_norm": 0.27654966711997986, "learning_rate": 1.5351170568561872e-06, "loss": 6.3725, "step": 44700 }, { "epoch": 99.1405600221791, "eval_loss": 6.378158092498779, "eval_runtime": 173.0439, "eval_samples_per_second": 57.789, "eval_steps_per_second": 7.224, "step": 44700 }, { "epoch": 99.36235098419739, "grad_norm": 0.25358349084854126, "learning_rate": 1.5250836120401338e-06, "loss": 6.3718, "step": 44800 }, { "epoch": 99.36235098419739, "eval_loss": 6.381252288818359, "eval_runtime": 175.5178, "eval_samples_per_second": 56.974, "eval_steps_per_second": 7.122, "step": 44800 }, { "epoch": 99.5841419462157, "grad_norm": 0.27983585000038147, "learning_rate": 1.5150501672240803e-06, "loss": 6.3709, "step": 44900 }, { "epoch": 99.5841419462157, "eval_loss": 6.383197784423828, "eval_runtime": 175.4823, "eval_samples_per_second": 56.986, "eval_steps_per_second": 7.123, "step": 44900 }, { "epoch": 99.80593290823398, "grad_norm": 0.35121074318885803, "learning_rate": 1.5050167224080269e-06, "loss": 6.3726, "step": 45000 }, { "epoch": 99.80593290823398, "eval_loss": 6.385370254516602, "eval_runtime": 175.4757, "eval_samples_per_second": 56.988, "eval_steps_per_second": 7.123, "step": 45000 }, { "epoch": 100.02772387025229, "grad_norm": 0.22111310064792633, "learning_rate": 1.4949832775919732e-06, "loss": 6.3716, "step": 45100 }, { "epoch": 100.02772387025229, "eval_loss": 6.38284158706665, "eval_runtime": 175.8886, "eval_samples_per_second": 56.854, "eval_steps_per_second": 7.107, "step": 45100 }, { "epoch": 100.24951483227059, "grad_norm": 0.22795332968235016, "learning_rate": 1.4849498327759198e-06, "loss": 6.3721, "step": 45200 }, { "epoch": 100.24951483227059, "eval_loss": 6.378814697265625, "eval_runtime": 173.36, "eval_samples_per_second": 57.683, "eval_steps_per_second": 7.21, "step": 45200 }, { "epoch": 100.47130579428888, "grad_norm": 0.3906308710575104, "learning_rate": 1.4749163879598663e-06, "loss": 6.3711, "step": 45300 }, { "epoch": 100.47130579428888, "eval_loss": 6.380859375, "eval_runtime": 175.65, "eval_samples_per_second": 56.931, "eval_steps_per_second": 7.116, "step": 45300 }, { "epoch": 100.69309675630718, "grad_norm": 0.35361433029174805, "learning_rate": 1.4648829431438129e-06, "loss": 6.3689, "step": 45400 }, { "epoch": 100.69309675630718, "eval_loss": 6.386940956115723, "eval_runtime": 173.0816, "eval_samples_per_second": 57.776, "eval_steps_per_second": 7.222, "step": 45400 }, { "epoch": 100.91488771832547, "grad_norm": 0.3520587682723999, "learning_rate": 1.4548494983277592e-06, "loss": 6.371, "step": 45500 }, { "epoch": 100.91488771832547, "eval_loss": 6.384310245513916, "eval_runtime": 175.5343, "eval_samples_per_second": 56.969, "eval_steps_per_second": 7.121, "step": 45500 }, { "epoch": 101.13667868034378, "grad_norm": 0.37038084864616394, "learning_rate": 1.4448160535117058e-06, "loss": 6.3712, "step": 45600 }, { "epoch": 101.13667868034378, "eval_loss": 6.381255626678467, "eval_runtime": 172.9314, "eval_samples_per_second": 57.826, "eval_steps_per_second": 7.228, "step": 45600 }, { "epoch": 101.35846964236207, "grad_norm": 0.2583162188529968, "learning_rate": 1.4347826086956523e-06, "loss": 6.3693, "step": 45700 }, { "epoch": 101.35846964236207, "eval_loss": 6.385676383972168, "eval_runtime": 175.4492, "eval_samples_per_second": 56.997, "eval_steps_per_second": 7.125, "step": 45700 }, { "epoch": 101.58026060438037, "grad_norm": 0.37049952149391174, "learning_rate": 1.4247491638795989e-06, "loss": 6.3715, "step": 45800 }, { "epoch": 101.58026060438037, "eval_loss": 6.383345603942871, "eval_runtime": 172.9908, "eval_samples_per_second": 57.807, "eval_steps_per_second": 7.226, "step": 45800 }, { "epoch": 101.80205156639867, "grad_norm": 0.3586992919445038, "learning_rate": 1.4147157190635452e-06, "loss": 6.3709, "step": 45900 }, { "epoch": 101.80205156639867, "eval_loss": 6.383970260620117, "eval_runtime": 175.5127, "eval_samples_per_second": 56.976, "eval_steps_per_second": 7.122, "step": 45900 }, { "epoch": 102.02384252841696, "grad_norm": 0.274954229593277, "learning_rate": 1.4046822742474917e-06, "loss": 6.3721, "step": 46000 }, { "epoch": 102.02384252841696, "eval_loss": 6.379533767700195, "eval_runtime": 175.5086, "eval_samples_per_second": 56.977, "eval_steps_per_second": 7.122, "step": 46000 }, { "epoch": 102.24563349043527, "grad_norm": 0.2859888970851898, "learning_rate": 1.3946488294314383e-06, "loss": 6.3704, "step": 46100 }, { "epoch": 102.24563349043527, "eval_loss": 6.3819146156311035, "eval_runtime": 175.6284, "eval_samples_per_second": 56.938, "eval_steps_per_second": 7.117, "step": 46100 }, { "epoch": 102.46742445245356, "grad_norm": 0.27162763476371765, "learning_rate": 1.3846153846153846e-06, "loss": 6.3718, "step": 46200 }, { "epoch": 102.46742445245356, "eval_loss": 6.383949279785156, "eval_runtime": 173.0341, "eval_samples_per_second": 57.792, "eval_steps_per_second": 7.224, "step": 46200 }, { "epoch": 102.68921541447186, "grad_norm": 0.24669644236564636, "learning_rate": 1.374581939799331e-06, "loss": 6.3706, "step": 46300 }, { "epoch": 102.68921541447186, "eval_loss": 6.384088516235352, "eval_runtime": 175.8327, "eval_samples_per_second": 56.872, "eval_steps_per_second": 7.109, "step": 46300 }, { "epoch": 102.91100637649016, "grad_norm": 0.32821038365364075, "learning_rate": 1.3645484949832775e-06, "loss": 6.3716, "step": 46400 }, { "epoch": 102.91100637649016, "eval_loss": 6.383686065673828, "eval_runtime": 173.1011, "eval_samples_per_second": 57.77, "eval_steps_per_second": 7.221, "step": 46400 }, { "epoch": 103.13279733850845, "grad_norm": 0.23931552469730377, "learning_rate": 1.354515050167224e-06, "loss": 6.3706, "step": 46500 }, { "epoch": 103.13279733850845, "eval_loss": 6.379798412322998, "eval_runtime": 175.5988, "eval_samples_per_second": 56.948, "eval_steps_per_second": 7.118, "step": 46500 }, { "epoch": 103.35458830052676, "grad_norm": 0.2975938022136688, "learning_rate": 1.3444816053511706e-06, "loss": 6.3713, "step": 46600 }, { "epoch": 103.35458830052676, "eval_loss": 6.3860554695129395, "eval_runtime": 175.5887, "eval_samples_per_second": 56.951, "eval_steps_per_second": 7.119, "step": 46600 }, { "epoch": 103.57637926254505, "grad_norm": 0.2592810392379761, "learning_rate": 1.334448160535117e-06, "loss": 6.3717, "step": 46700 }, { "epoch": 103.57637926254505, "eval_loss": 6.3828301429748535, "eval_runtime": 175.6957, "eval_samples_per_second": 56.917, "eval_steps_per_second": 7.115, "step": 46700 }, { "epoch": 103.79817022456335, "grad_norm": 0.2834523320198059, "learning_rate": 1.3244147157190635e-06, "loss": 6.3713, "step": 46800 }, { "epoch": 103.79817022456335, "eval_loss": 6.386697769165039, "eval_runtime": 172.9159, "eval_samples_per_second": 57.832, "eval_steps_per_second": 7.229, "step": 46800 }, { "epoch": 104.01996118658165, "grad_norm": 0.2672658860683441, "learning_rate": 1.31438127090301e-06, "loss": 6.3721, "step": 46900 }, { "epoch": 104.01996118658165, "eval_loss": 6.381076812744141, "eval_runtime": 173.0101, "eval_samples_per_second": 57.8, "eval_steps_per_second": 7.225, "step": 46900 }, { "epoch": 104.24175214859994, "grad_norm": 0.29608866572380066, "learning_rate": 1.3043478260869566e-06, "loss": 6.3722, "step": 47000 }, { "epoch": 104.24175214859994, "eval_loss": 6.383474826812744, "eval_runtime": 175.9295, "eval_samples_per_second": 56.841, "eval_steps_per_second": 7.105, "step": 47000 }, { "epoch": 104.46354311061825, "grad_norm": 0.31595227122306824, "learning_rate": 1.294314381270903e-06, "loss": 6.3715, "step": 47100 }, { "epoch": 104.46354311061825, "eval_loss": 6.382750988006592, "eval_runtime": 173.1316, "eval_samples_per_second": 57.76, "eval_steps_per_second": 7.22, "step": 47100 }, { "epoch": 104.68533407263654, "grad_norm": 0.2782845199108124, "learning_rate": 1.2842809364548495e-06, "loss": 6.3715, "step": 47200 }, { "epoch": 104.68533407263654, "eval_loss": 6.381110191345215, "eval_runtime": 175.6479, "eval_samples_per_second": 56.932, "eval_steps_per_second": 7.117, "step": 47200 }, { "epoch": 104.90712503465484, "grad_norm": 0.32985934615135193, "learning_rate": 1.274247491638796e-06, "loss": 6.3707, "step": 47300 }, { "epoch": 104.90712503465484, "eval_loss": 6.380244731903076, "eval_runtime": 173.0618, "eval_samples_per_second": 57.783, "eval_steps_per_second": 7.223, "step": 47300 }, { "epoch": 105.12891599667313, "grad_norm": 0.27673158049583435, "learning_rate": 1.2642140468227424e-06, "loss": 6.371, "step": 47400 }, { "epoch": 105.12891599667313, "eval_loss": 6.382138252258301, "eval_runtime": 175.4509, "eval_samples_per_second": 56.996, "eval_steps_per_second": 7.125, "step": 47400 }, { "epoch": 105.35070695869143, "grad_norm": 0.2984777092933655, "learning_rate": 1.254180602006689e-06, "loss": 6.3719, "step": 47500 }, { "epoch": 105.35070695869143, "eval_loss": 6.382594585418701, "eval_runtime": 173.0122, "eval_samples_per_second": 57.799, "eval_steps_per_second": 7.225, "step": 47500 }, { "epoch": 105.57249792070974, "grad_norm": 0.29209384322166443, "learning_rate": 1.2441471571906355e-06, "loss": 6.3715, "step": 47600 }, { "epoch": 105.57249792070974, "eval_loss": 6.38098669052124, "eval_runtime": 175.7524, "eval_samples_per_second": 56.898, "eval_steps_per_second": 7.112, "step": 47600 }, { "epoch": 105.79428888272803, "grad_norm": 0.35189709067344666, "learning_rate": 1.234113712374582e-06, "loss": 6.3701, "step": 47700 }, { "epoch": 105.79428888272803, "eval_loss": 6.384945392608643, "eval_runtime": 175.5438, "eval_samples_per_second": 56.966, "eval_steps_per_second": 7.121, "step": 47700 }, { "epoch": 106.01607984474633, "grad_norm": 0.37181735038757324, "learning_rate": 1.2240802675585284e-06, "loss": 6.3703, "step": 47800 }, { "epoch": 106.01607984474633, "eval_loss": 6.378709316253662, "eval_runtime": 175.523, "eval_samples_per_second": 56.973, "eval_steps_per_second": 7.122, "step": 47800 }, { "epoch": 106.23787080676462, "grad_norm": 0.2793137729167938, "learning_rate": 1.214046822742475e-06, "loss": 6.3706, "step": 47900 }, { "epoch": 106.23787080676462, "eval_loss": 6.380676746368408, "eval_runtime": 173.0355, "eval_samples_per_second": 57.792, "eval_steps_per_second": 7.224, "step": 47900 }, { "epoch": 106.45966176878292, "grad_norm": 0.2996074855327606, "learning_rate": 1.2040133779264215e-06, "loss": 6.3714, "step": 48000 }, { "epoch": 106.45966176878292, "eval_loss": 6.382739067077637, "eval_runtime": 175.5807, "eval_samples_per_second": 56.954, "eval_steps_per_second": 7.119, "step": 48000 }, { "epoch": 106.68145273080123, "grad_norm": 0.32835853099823, "learning_rate": 1.193979933110368e-06, "loss": 6.3717, "step": 48100 }, { "epoch": 106.68145273080123, "eval_loss": 6.382002353668213, "eval_runtime": 173.3264, "eval_samples_per_second": 57.695, "eval_steps_per_second": 7.212, "step": 48100 }, { "epoch": 106.90324369281952, "grad_norm": 0.31071096658706665, "learning_rate": 1.1839464882943144e-06, "loss": 6.3715, "step": 48200 }, { "epoch": 106.90324369281952, "eval_loss": 6.385354042053223, "eval_runtime": 175.863, "eval_samples_per_second": 56.862, "eval_steps_per_second": 7.108, "step": 48200 }, { "epoch": 107.12503465483782, "grad_norm": 0.32424595952033997, "learning_rate": 1.173913043478261e-06, "loss": 6.3713, "step": 48300 }, { "epoch": 107.12503465483782, "eval_loss": 6.381778240203857, "eval_runtime": 176.0254, "eval_samples_per_second": 56.81, "eval_steps_per_second": 7.101, "step": 48300 }, { "epoch": 107.34682561685611, "grad_norm": 0.25034162402153015, "learning_rate": 1.1638795986622075e-06, "loss": 6.3714, "step": 48400 }, { "epoch": 107.34682561685611, "eval_loss": 6.383028507232666, "eval_runtime": 175.899, "eval_samples_per_second": 56.851, "eval_steps_per_second": 7.106, "step": 48400 }, { "epoch": 107.56861657887441, "grad_norm": 0.2586011290550232, "learning_rate": 1.153846153846154e-06, "loss": 6.3722, "step": 48500 }, { "epoch": 107.56861657887441, "eval_loss": 6.382985591888428, "eval_runtime": 175.861, "eval_samples_per_second": 56.863, "eval_steps_per_second": 7.108, "step": 48500 }, { "epoch": 107.79040754089272, "grad_norm": 0.28121402859687805, "learning_rate": 1.1438127090301004e-06, "loss": 6.3715, "step": 48600 }, { "epoch": 107.79040754089272, "eval_loss": 6.381731986999512, "eval_runtime": 173.3663, "eval_samples_per_second": 57.681, "eval_steps_per_second": 7.21, "step": 48600 }, { "epoch": 108.012198502911, "grad_norm": 0.27013683319091797, "learning_rate": 1.133779264214047e-06, "loss": 6.3689, "step": 48700 }, { "epoch": 108.012198502911, "eval_loss": 6.381706237792969, "eval_runtime": 175.9392, "eval_samples_per_second": 56.838, "eval_steps_per_second": 7.105, "step": 48700 }, { "epoch": 108.23398946492931, "grad_norm": 0.345570832490921, "learning_rate": 1.1237458193979933e-06, "loss": 6.3706, "step": 48800 }, { "epoch": 108.23398946492931, "eval_loss": 6.384325981140137, "eval_runtime": 173.2557, "eval_samples_per_second": 57.718, "eval_steps_per_second": 7.215, "step": 48800 }, { "epoch": 108.4557804269476, "grad_norm": 0.26037341356277466, "learning_rate": 1.1137123745819398e-06, "loss": 6.3728, "step": 48900 }, { "epoch": 108.4557804269476, "eval_loss": 6.383279323577881, "eval_runtime": 175.662, "eval_samples_per_second": 56.927, "eval_steps_per_second": 7.116, "step": 48900 }, { "epoch": 108.6775713889659, "grad_norm": 0.25174733996391296, "learning_rate": 1.1036789297658862e-06, "loss": 6.3711, "step": 49000 }, { "epoch": 108.6775713889659, "eval_loss": 6.384110927581787, "eval_runtime": 173.04, "eval_samples_per_second": 57.79, "eval_steps_per_second": 7.224, "step": 49000 }, { "epoch": 108.89936235098419, "grad_norm": 0.22819426655769348, "learning_rate": 1.0936454849498327e-06, "loss": 6.3725, "step": 49100 }, { "epoch": 108.89936235098419, "eval_loss": 6.383809566497803, "eval_runtime": 175.8514, "eval_samples_per_second": 56.866, "eval_steps_per_second": 7.108, "step": 49100 }, { "epoch": 109.1211533130025, "grad_norm": 0.3142814636230469, "learning_rate": 1.0836120401337793e-06, "loss": 6.3707, "step": 49200 }, { "epoch": 109.1211533130025, "eval_loss": 6.38060998916626, "eval_runtime": 172.941, "eval_samples_per_second": 57.823, "eval_steps_per_second": 7.228, "step": 49200 }, { "epoch": 109.3429442750208, "grad_norm": 0.30092594027519226, "learning_rate": 1.0735785953177258e-06, "loss": 6.3699, "step": 49300 }, { "epoch": 109.3429442750208, "eval_loss": 6.385983943939209, "eval_runtime": 175.5269, "eval_samples_per_second": 56.971, "eval_steps_per_second": 7.121, "step": 49300 }, { "epoch": 109.56473523703909, "grad_norm": 0.31821510195732117, "learning_rate": 1.0635451505016722e-06, "loss": 6.3694, "step": 49400 }, { "epoch": 109.56473523703909, "eval_loss": 6.383793830871582, "eval_runtime": 175.5382, "eval_samples_per_second": 56.968, "eval_steps_per_second": 7.121, "step": 49400 }, { "epoch": 109.78652619905739, "grad_norm": 0.3837875425815582, "learning_rate": 1.0535117056856187e-06, "loss": 6.3701, "step": 49500 }, { "epoch": 109.78652619905739, "eval_loss": 6.380537509918213, "eval_runtime": 175.935, "eval_samples_per_second": 56.839, "eval_steps_per_second": 7.105, "step": 49500 }, { "epoch": 110.00831716107568, "grad_norm": 0.23530994355678558, "learning_rate": 1.0434782608695653e-06, "loss": 6.3705, "step": 49600 }, { "epoch": 110.00831716107568, "eval_loss": 6.386258602142334, "eval_runtime": 175.9707, "eval_samples_per_second": 56.828, "eval_steps_per_second": 7.103, "step": 49600 }, { "epoch": 110.23010812309398, "grad_norm": 0.26103320717811584, "learning_rate": 1.0334448160535118e-06, "loss": 6.3707, "step": 49700 }, { "epoch": 110.23010812309398, "eval_loss": 6.383273124694824, "eval_runtime": 173.4608, "eval_samples_per_second": 57.65, "eval_steps_per_second": 7.206, "step": 49700 }, { "epoch": 110.45189908511229, "grad_norm": 0.2887881398200989, "learning_rate": 1.0234113712374581e-06, "loss": 6.3721, "step": 49800 }, { "epoch": 110.45189908511229, "eval_loss": 6.384125709533691, "eval_runtime": 172.9625, "eval_samples_per_second": 57.816, "eval_steps_per_second": 7.227, "step": 49800 }, { "epoch": 110.67369004713058, "grad_norm": 0.31840309500694275, "learning_rate": 1.0133779264214047e-06, "loss": 6.3717, "step": 49900 }, { "epoch": 110.67369004713058, "eval_loss": 6.381842136383057, "eval_runtime": 175.4749, "eval_samples_per_second": 56.988, "eval_steps_per_second": 7.124, "step": 49900 }, { "epoch": 110.89548100914888, "grad_norm": 0.21653781831264496, "learning_rate": 1.0033444816053512e-06, "loss": 6.3707, "step": 50000 }, { "epoch": 110.89548100914888, "eval_loss": 6.381892204284668, "eval_runtime": 175.5709, "eval_samples_per_second": 56.957, "eval_steps_per_second": 7.12, "step": 50000 }, { "epoch": 111.11727197116717, "grad_norm": 0.3267481327056885, "learning_rate": 9.933110367892976e-07, "loss": 6.3708, "step": 50100 }, { "epoch": 111.11727197116717, "eval_loss": 6.3821611404418945, "eval_runtime": 172.9472, "eval_samples_per_second": 57.821, "eval_steps_per_second": 7.228, "step": 50100 }, { "epoch": 111.33906293318547, "grad_norm": 0.27063196897506714, "learning_rate": 9.832775919732441e-07, "loss": 6.3717, "step": 50200 }, { "epoch": 111.33906293318547, "eval_loss": 6.380985736846924, "eval_runtime": 175.4969, "eval_samples_per_second": 56.981, "eval_steps_per_second": 7.123, "step": 50200 }, { "epoch": 111.56085389520376, "grad_norm": 0.3249282240867615, "learning_rate": 9.732441471571907e-07, "loss": 6.3712, "step": 50300 }, { "epoch": 111.56085389520376, "eval_loss": 6.380914688110352, "eval_runtime": 172.9605, "eval_samples_per_second": 57.817, "eval_steps_per_second": 7.227, "step": 50300 }, { "epoch": 111.78264485722207, "grad_norm": 0.23895922303199768, "learning_rate": 9.632107023411372e-07, "loss": 6.3703, "step": 50400 }, { "epoch": 111.78264485722207, "eval_loss": 6.382885932922363, "eval_runtime": 175.5673, "eval_samples_per_second": 56.958, "eval_steps_per_second": 7.12, "step": 50400 }, { "epoch": 112.00443581924037, "grad_norm": 0.35966283082962036, "learning_rate": 9.531772575250837e-07, "loss": 6.37, "step": 50500 }, { "epoch": 112.00443581924037, "eval_loss": 6.383852481842041, "eval_runtime": 173.4529, "eval_samples_per_second": 57.653, "eval_steps_per_second": 7.207, "step": 50500 }, { "epoch": 112.22622678125866, "grad_norm": 0.3259362578392029, "learning_rate": 9.431438127090301e-07, "loss": 6.371, "step": 50600 }, { "epoch": 112.22622678125866, "eval_loss": 6.385578155517578, "eval_runtime": 175.6993, "eval_samples_per_second": 56.915, "eval_steps_per_second": 7.114, "step": 50600 }, { "epoch": 112.44801774327696, "grad_norm": 0.26321855187416077, "learning_rate": 9.331103678929767e-07, "loss": 6.3701, "step": 50700 }, { "epoch": 112.44801774327696, "eval_loss": 6.380197048187256, "eval_runtime": 175.6967, "eval_samples_per_second": 56.916, "eval_steps_per_second": 7.115, "step": 50700 }, { "epoch": 112.66980870529525, "grad_norm": 0.25881582498550415, "learning_rate": 9.230769230769231e-07, "loss": 6.3701, "step": 50800 }, { "epoch": 112.66980870529525, "eval_loss": 6.379401683807373, "eval_runtime": 175.5648, "eval_samples_per_second": 56.959, "eval_steps_per_second": 7.12, "step": 50800 }, { "epoch": 112.89159966731356, "grad_norm": 0.23602035641670227, "learning_rate": 9.130434782608697e-07, "loss": 6.3697, "step": 50900 }, { "epoch": 112.89159966731356, "eval_loss": 6.380613803863525, "eval_runtime": 172.9114, "eval_samples_per_second": 57.833, "eval_steps_per_second": 7.229, "step": 50900 }, { "epoch": 113.11339062933186, "grad_norm": 0.3607383072376251, "learning_rate": 9.030100334448161e-07, "loss": 6.3717, "step": 51000 }, { "epoch": 113.11339062933186, "eval_loss": 6.3821024894714355, "eval_runtime": 175.5159, "eval_samples_per_second": 56.975, "eval_steps_per_second": 7.122, "step": 51000 }, { "epoch": 113.33518159135015, "grad_norm": 0.313915878534317, "learning_rate": 8.929765886287627e-07, "loss": 6.3703, "step": 51100 }, { "epoch": 113.33518159135015, "eval_loss": 6.381007194519043, "eval_runtime": 175.5953, "eval_samples_per_second": 56.949, "eval_steps_per_second": 7.119, "step": 51100 }, { "epoch": 113.55697255336845, "grad_norm": 0.26152804493904114, "learning_rate": 8.829431438127091e-07, "loss": 6.3712, "step": 51200 }, { "epoch": 113.55697255336845, "eval_loss": 6.381545543670654, "eval_runtime": 175.5198, "eval_samples_per_second": 56.974, "eval_steps_per_second": 7.122, "step": 51200 }, { "epoch": 113.77876351538674, "grad_norm": 0.31778955459594727, "learning_rate": 8.729096989966555e-07, "loss": 6.3718, "step": 51300 }, { "epoch": 113.77876351538674, "eval_loss": 6.384615421295166, "eval_runtime": 175.9251, "eval_samples_per_second": 56.842, "eval_steps_per_second": 7.105, "step": 51300 }, { "epoch": 114.00055447740505, "grad_norm": 0.2694382965564728, "learning_rate": 8.628762541806019e-07, "loss": 6.371, "step": 51400 }, { "epoch": 114.00055447740505, "eval_loss": 6.383395671844482, "eval_runtime": 175.9708, "eval_samples_per_second": 56.828, "eval_steps_per_second": 7.103, "step": 51400 }, { "epoch": 114.22234543942335, "grad_norm": 0.29690447449684143, "learning_rate": 8.528428093645485e-07, "loss": 6.37, "step": 51500 }, { "epoch": 114.22234543942335, "eval_loss": 6.382811546325684, "eval_runtime": 173.537, "eval_samples_per_second": 57.625, "eval_steps_per_second": 7.203, "step": 51500 }, { "epoch": 114.44413640144164, "grad_norm": 0.39484673738479614, "learning_rate": 8.428093645484949e-07, "loss": 6.3711, "step": 51600 }, { "epoch": 114.44413640144164, "eval_loss": 6.382282257080078, "eval_runtime": 175.9709, "eval_samples_per_second": 56.828, "eval_steps_per_second": 7.103, "step": 51600 }, { "epoch": 114.66592736345994, "grad_norm": 0.2630254626274109, "learning_rate": 8.327759197324414e-07, "loss": 6.3707, "step": 51700 }, { "epoch": 114.66592736345994, "eval_loss": 6.382809162139893, "eval_runtime": 176.0003, "eval_samples_per_second": 56.818, "eval_steps_per_second": 7.102, "step": 51700 }, { "epoch": 114.88771832547823, "grad_norm": 0.3054973781108856, "learning_rate": 8.227424749163879e-07, "loss": 6.3708, "step": 51800 }, { "epoch": 114.88771832547823, "eval_loss": 6.3818230628967285, "eval_runtime": 173.4807, "eval_samples_per_second": 57.643, "eval_steps_per_second": 7.205, "step": 51800 }, { "epoch": 115.10950928749654, "grad_norm": 0.24989312887191772, "learning_rate": 8.127090301003344e-07, "loss": 6.3697, "step": 51900 }, { "epoch": 115.10950928749654, "eval_loss": 6.3821187019348145, "eval_runtime": 175.9917, "eval_samples_per_second": 56.821, "eval_steps_per_second": 7.103, "step": 51900 }, { "epoch": 115.33130024951483, "grad_norm": 0.3176492750644684, "learning_rate": 8.026755852842809e-07, "loss": 6.3716, "step": 52000 }, { "epoch": 115.33130024951483, "eval_loss": 6.3822808265686035, "eval_runtime": 173.4423, "eval_samples_per_second": 57.656, "eval_steps_per_second": 7.207, "step": 52000 }, { "epoch": 115.55309121153313, "grad_norm": 0.2542394995689392, "learning_rate": 7.926421404682274e-07, "loss": 6.3712, "step": 52100 }, { "epoch": 115.55309121153313, "eval_loss": 6.380392074584961, "eval_runtime": 175.9555, "eval_samples_per_second": 56.833, "eval_steps_per_second": 7.104, "step": 52100 }, { "epoch": 115.77488217355143, "grad_norm": 0.2998870313167572, "learning_rate": 7.826086956521739e-07, "loss": 6.3699, "step": 52200 }, { "epoch": 115.77488217355143, "eval_loss": 6.381204605102539, "eval_runtime": 173.1037, "eval_samples_per_second": 57.769, "eval_steps_per_second": 7.221, "step": 52200 }, { "epoch": 115.99667313556972, "grad_norm": 0.2524458169937134, "learning_rate": 7.725752508361204e-07, "loss": 6.3704, "step": 52300 }, { "epoch": 115.99667313556972, "eval_loss": 6.383292198181152, "eval_runtime": 175.5333, "eval_samples_per_second": 56.969, "eval_steps_per_second": 7.121, "step": 52300 }, { "epoch": 116.21846409758803, "grad_norm": 0.2731904983520508, "learning_rate": 7.625418060200669e-07, "loss": 6.3715, "step": 52400 }, { "epoch": 116.21846409758803, "eval_loss": 6.380125999450684, "eval_runtime": 173.003, "eval_samples_per_second": 57.802, "eval_steps_per_second": 7.225, "step": 52400 }, { "epoch": 116.44025505960632, "grad_norm": 0.3370875120162964, "learning_rate": 7.525083612040134e-07, "loss": 6.3702, "step": 52500 }, { "epoch": 116.44025505960632, "eval_loss": 6.383055686950684, "eval_runtime": 175.6351, "eval_samples_per_second": 56.936, "eval_steps_per_second": 7.117, "step": 52500 }, { "epoch": 116.66204602162462, "grad_norm": 0.2853044867515564, "learning_rate": 7.424749163879599e-07, "loss": 6.3706, "step": 52600 }, { "epoch": 116.66204602162462, "eval_loss": 6.381393909454346, "eval_runtime": 175.6586, "eval_samples_per_second": 56.929, "eval_steps_per_second": 7.116, "step": 52600 }, { "epoch": 116.88383698364292, "grad_norm": 0.3378102481365204, "learning_rate": 7.324414715719064e-07, "loss": 6.3701, "step": 52700 }, { "epoch": 116.88383698364292, "eval_loss": 6.380878448486328, "eval_runtime": 175.5156, "eval_samples_per_second": 56.975, "eval_steps_per_second": 7.122, "step": 52700 }, { "epoch": 117.10562794566121, "grad_norm": 0.27575579285621643, "learning_rate": 7.224080267558529e-07, "loss": 6.3698, "step": 52800 }, { "epoch": 117.10562794566121, "eval_loss": 6.381886959075928, "eval_runtime": 175.558, "eval_samples_per_second": 56.961, "eval_steps_per_second": 7.12, "step": 52800 }, { "epoch": 117.32741890767952, "grad_norm": 0.22469982504844666, "learning_rate": 7.123745819397994e-07, "loss": 6.3689, "step": 52900 }, { "epoch": 117.32741890767952, "eval_loss": 6.378075122833252, "eval_runtime": 176.0795, "eval_samples_per_second": 56.793, "eval_steps_per_second": 7.099, "step": 52900 }, { "epoch": 117.5492098696978, "grad_norm": 0.26414427161216736, "learning_rate": 7.023411371237459e-07, "loss": 6.3715, "step": 53000 }, { "epoch": 117.5492098696978, "eval_loss": 6.38188362121582, "eval_runtime": 176.003, "eval_samples_per_second": 56.817, "eval_steps_per_second": 7.102, "step": 53000 }, { "epoch": 117.77100083171611, "grad_norm": 0.2348640114068985, "learning_rate": 6.923076923076923e-07, "loss": 6.3699, "step": 53100 }, { "epoch": 117.77100083171611, "eval_loss": 6.382396697998047, "eval_runtime": 175.8429, "eval_samples_per_second": 56.869, "eval_steps_per_second": 7.109, "step": 53100 }, { "epoch": 117.99279179373441, "grad_norm": 0.36397331953048706, "learning_rate": 6.822742474916388e-07, "loss": 6.3703, "step": 53200 }, { "epoch": 117.99279179373441, "eval_loss": 6.384123802185059, "eval_runtime": 174.4907, "eval_samples_per_second": 57.31, "eval_steps_per_second": 7.164, "step": 53200 }, { "epoch": 118.2145827557527, "grad_norm": 0.25135567784309387, "learning_rate": 6.722408026755853e-07, "loss": 6.3701, "step": 53300 }, { "epoch": 118.2145827557527, "eval_loss": 6.3801045417785645, "eval_runtime": 173.0674, "eval_samples_per_second": 57.781, "eval_steps_per_second": 7.223, "step": 53300 }, { "epoch": 118.436373717771, "grad_norm": 0.30894702672958374, "learning_rate": 6.622073578595318e-07, "loss": 6.3702, "step": 53400 }, { "epoch": 118.436373717771, "eval_loss": 6.379894733428955, "eval_runtime": 175.516, "eval_samples_per_second": 56.975, "eval_steps_per_second": 7.122, "step": 53400 }, { "epoch": 118.6581646797893, "grad_norm": 0.26461485028266907, "learning_rate": 6.521739130434783e-07, "loss": 6.3713, "step": 53500 }, { "epoch": 118.6581646797893, "eval_loss": 6.3835272789001465, "eval_runtime": 173.4513, "eval_samples_per_second": 57.653, "eval_steps_per_second": 7.207, "step": 53500 }, { "epoch": 118.8799556418076, "grad_norm": 0.24245497584342957, "learning_rate": 6.421404682274248e-07, "loss": 6.3705, "step": 53600 }, { "epoch": 118.8799556418076, "eval_loss": 6.381874084472656, "eval_runtime": 176.0158, "eval_samples_per_second": 56.813, "eval_steps_per_second": 7.102, "step": 53600 }, { "epoch": 119.10174660382589, "grad_norm": 0.23844820261001587, "learning_rate": 6.321070234113712e-07, "loss": 6.3698, "step": 53700 }, { "epoch": 119.10174660382589, "eval_loss": 6.381486415863037, "eval_runtime": 173.4108, "eval_samples_per_second": 57.667, "eval_steps_per_second": 7.208, "step": 53700 }, { "epoch": 119.32353756584419, "grad_norm": 0.2418413609266281, "learning_rate": 6.220735785953178e-07, "loss": 6.3712, "step": 53800 }, { "epoch": 119.32353756584419, "eval_loss": 6.382267951965332, "eval_runtime": 175.9952, "eval_samples_per_second": 56.82, "eval_steps_per_second": 7.102, "step": 53800 }, { "epoch": 119.5453285278625, "grad_norm": 0.22692246735095978, "learning_rate": 6.120401337792642e-07, "loss": 6.371, "step": 53900 }, { "epoch": 119.5453285278625, "eval_loss": 6.383540630340576, "eval_runtime": 173.1726, "eval_samples_per_second": 57.746, "eval_steps_per_second": 7.218, "step": 53900 }, { "epoch": 119.76711948988078, "grad_norm": 0.29117047786712646, "learning_rate": 6.020066889632107e-07, "loss": 6.3713, "step": 54000 }, { "epoch": 119.76711948988078, "eval_loss": 6.382152557373047, "eval_runtime": 175.7557, "eval_samples_per_second": 56.897, "eval_steps_per_second": 7.112, "step": 54000 }, { "epoch": 119.98891045189909, "grad_norm": 0.21682819724082947, "learning_rate": 5.919732441471572e-07, "loss": 6.3702, "step": 54100 }, { "epoch": 119.98891045189909, "eval_loss": 6.380878925323486, "eval_runtime": 173.0921, "eval_samples_per_second": 57.773, "eval_steps_per_second": 7.222, "step": 54100 }, { "epoch": 120.21070141391738, "grad_norm": 0.31245148181915283, "learning_rate": 5.819397993311037e-07, "loss": 6.3694, "step": 54200 }, { "epoch": 120.21070141391738, "eval_loss": 6.383978843688965, "eval_runtime": 175.5232, "eval_samples_per_second": 56.973, "eval_steps_per_second": 7.122, "step": 54200 }, { "epoch": 120.43249237593568, "grad_norm": 0.22876819968223572, "learning_rate": 5.719063545150502e-07, "loss": 6.3706, "step": 54300 }, { "epoch": 120.43249237593568, "eval_loss": 6.382028102874756, "eval_runtime": 173.1291, "eval_samples_per_second": 57.76, "eval_steps_per_second": 7.22, "step": 54300 }, { "epoch": 120.65428333795398, "grad_norm": 0.25953638553619385, "learning_rate": 5.618729096989966e-07, "loss": 6.3707, "step": 54400 }, { "epoch": 120.65428333795398, "eval_loss": 6.381461143493652, "eval_runtime": 175.57, "eval_samples_per_second": 56.957, "eval_steps_per_second": 7.12, "step": 54400 }, { "epoch": 120.87607429997227, "grad_norm": 0.1654128134250641, "learning_rate": 5.518394648829431e-07, "loss": 6.3707, "step": 54500 }, { "epoch": 120.87607429997227, "eval_loss": 6.3789753913879395, "eval_runtime": 175.5387, "eval_samples_per_second": 56.967, "eval_steps_per_second": 7.121, "step": 54500 }, { "epoch": 121.09786526199058, "grad_norm": 0.29274898767471313, "learning_rate": 5.418060200668896e-07, "loss": 6.3703, "step": 54600 }, { "epoch": 121.09786526199058, "eval_loss": 6.380027770996094, "eval_runtime": 175.4995, "eval_samples_per_second": 56.98, "eval_steps_per_second": 7.123, "step": 54600 }, { "epoch": 121.31965622400887, "grad_norm": 0.2235456258058548, "learning_rate": 5.317725752508361e-07, "loss": 6.373, "step": 54700 }, { "epoch": 121.31965622400887, "eval_loss": 6.380786418914795, "eval_runtime": 175.5186, "eval_samples_per_second": 56.974, "eval_steps_per_second": 7.122, "step": 54700 }, { "epoch": 121.54144718602717, "grad_norm": 0.30965185165405273, "learning_rate": 5.217391304347826e-07, "loss": 6.3714, "step": 54800 }, { "epoch": 121.54144718602717, "eval_loss": 6.382297039031982, "eval_runtime": 175.5968, "eval_samples_per_second": 56.949, "eval_steps_per_second": 7.119, "step": 54800 }, { "epoch": 121.76323814804547, "grad_norm": 0.28793787956237793, "learning_rate": 5.117056856187291e-07, "loss": 6.3707, "step": 54900 }, { "epoch": 121.76323814804547, "eval_loss": 6.377398490905762, "eval_runtime": 173.0039, "eval_samples_per_second": 57.802, "eval_steps_per_second": 7.225, "step": 54900 }, { "epoch": 121.98502911006376, "grad_norm": 0.3277120590209961, "learning_rate": 5.016722408026756e-07, "loss": 6.3688, "step": 55000 }, { "epoch": 121.98502911006376, "eval_loss": 6.383605480194092, "eval_runtime": 173.0503, "eval_samples_per_second": 57.787, "eval_steps_per_second": 7.223, "step": 55000 }, { "epoch": 122.20682007208207, "grad_norm": 0.2291731834411621, "learning_rate": 4.916387959866221e-07, "loss": 6.3702, "step": 55100 }, { "epoch": 122.20682007208207, "eval_loss": 6.385202407836914, "eval_runtime": 175.7369, "eval_samples_per_second": 56.903, "eval_steps_per_second": 7.113, "step": 55100 }, { "epoch": 122.42861103410036, "grad_norm": 0.23682117462158203, "learning_rate": 4.816053511705686e-07, "loss": 6.3711, "step": 55200 }, { "epoch": 122.42861103410036, "eval_loss": 6.386002063751221, "eval_runtime": 173.0919, "eval_samples_per_second": 57.773, "eval_steps_per_second": 7.222, "step": 55200 }, { "epoch": 122.65040199611866, "grad_norm": 0.21502740681171417, "learning_rate": 4.7157190635451506e-07, "loss": 6.37, "step": 55300 }, { "epoch": 122.65040199611866, "eval_loss": 6.38268518447876, "eval_runtime": 175.5194, "eval_samples_per_second": 56.974, "eval_steps_per_second": 7.122, "step": 55300 }, { "epoch": 122.87219295813695, "grad_norm": 0.2415875792503357, "learning_rate": 4.6153846153846156e-07, "loss": 6.37, "step": 55400 }, { "epoch": 122.87219295813695, "eval_loss": 6.379030704498291, "eval_runtime": 173.0644, "eval_samples_per_second": 57.782, "eval_steps_per_second": 7.223, "step": 55400 }, { "epoch": 123.09398392015525, "grad_norm": 0.278998464345932, "learning_rate": 4.5150501672240806e-07, "loss": 6.3709, "step": 55500 }, { "epoch": 123.09398392015525, "eval_loss": 6.381860256195068, "eval_runtime": 175.5462, "eval_samples_per_second": 56.965, "eval_steps_per_second": 7.121, "step": 55500 }, { "epoch": 123.31577488217356, "grad_norm": 0.27015259861946106, "learning_rate": 4.4147157190635456e-07, "loss": 6.37, "step": 55600 }, { "epoch": 123.31577488217356, "eval_loss": 6.380960464477539, "eval_runtime": 175.9714, "eval_samples_per_second": 56.827, "eval_steps_per_second": 7.103, "step": 55600 }, { "epoch": 123.53756584419185, "grad_norm": 0.23815931379795074, "learning_rate": 4.3143812709030095e-07, "loss": 6.37, "step": 55700 }, { "epoch": 123.53756584419185, "eval_loss": 6.384081840515137, "eval_runtime": 173.1242, "eval_samples_per_second": 57.762, "eval_steps_per_second": 7.22, "step": 55700 }, { "epoch": 123.75935680621015, "grad_norm": 0.24355483055114746, "learning_rate": 4.2140468227424745e-07, "loss": 6.3694, "step": 55800 }, { "epoch": 123.75935680621015, "eval_loss": 6.378664016723633, "eval_runtime": 173.0043, "eval_samples_per_second": 57.802, "eval_steps_per_second": 7.225, "step": 55800 }, { "epoch": 123.98114776822844, "grad_norm": 0.21320495009422302, "learning_rate": 4.1137123745819395e-07, "loss": 6.3693, "step": 55900 }, { "epoch": 123.98114776822844, "eval_loss": 6.382479190826416, "eval_runtime": 175.5916, "eval_samples_per_second": 56.95, "eval_steps_per_second": 7.119, "step": 55900 }, { "epoch": 124.20293873024674, "grad_norm": 0.2245740443468094, "learning_rate": 4.0133779264214045e-07, "loss": 6.3702, "step": 56000 }, { "epoch": 124.20293873024674, "eval_loss": 6.385231971740723, "eval_runtime": 175.6666, "eval_samples_per_second": 56.926, "eval_steps_per_second": 7.116, "step": 56000 }, { "epoch": 124.42472969226505, "grad_norm": 0.282416969537735, "learning_rate": 3.9130434782608694e-07, "loss": 6.3709, "step": 56100 }, { "epoch": 124.42472969226505, "eval_loss": 6.380115032196045, "eval_runtime": 175.7632, "eval_samples_per_second": 56.895, "eval_steps_per_second": 7.112, "step": 56100 }, { "epoch": 124.64652065428334, "grad_norm": 0.19661109149456024, "learning_rate": 3.8127090301003344e-07, "loss": 6.3712, "step": 56200 }, { "epoch": 124.64652065428334, "eval_loss": 6.3793158531188965, "eval_runtime": 175.59, "eval_samples_per_second": 56.951, "eval_steps_per_second": 7.119, "step": 56200 }, { "epoch": 124.86831161630164, "grad_norm": 0.18216532468795776, "learning_rate": 3.7123745819397994e-07, "loss": 6.3703, "step": 56300 }, { "epoch": 124.86831161630164, "eval_loss": 6.381213188171387, "eval_runtime": 175.6076, "eval_samples_per_second": 56.945, "eval_steps_per_second": 7.118, "step": 56300 }, { "epoch": 125.09010257831993, "grad_norm": 0.3018471598625183, "learning_rate": 3.6120401337792644e-07, "loss": 6.3706, "step": 56400 }, { "epoch": 125.09010257831993, "eval_loss": 6.3784942626953125, "eval_runtime": 175.6917, "eval_samples_per_second": 56.918, "eval_steps_per_second": 7.115, "step": 56400 }, { "epoch": 125.31189354033823, "grad_norm": 0.21381452679634094, "learning_rate": 3.5117056856187294e-07, "loss": 6.3722, "step": 56500 }, { "epoch": 125.31189354033823, "eval_loss": 6.381383419036865, "eval_runtime": 173.1305, "eval_samples_per_second": 57.76, "eval_steps_per_second": 7.22, "step": 56500 }, { "epoch": 125.53368450235654, "grad_norm": 0.23340944945812225, "learning_rate": 3.411371237458194e-07, "loss": 6.3698, "step": 56600 }, { "epoch": 125.53368450235654, "eval_loss": 6.380908012390137, "eval_runtime": 175.6729, "eval_samples_per_second": 56.924, "eval_steps_per_second": 7.115, "step": 56600 }, { "epoch": 125.75547546437483, "grad_norm": 0.22507449984550476, "learning_rate": 3.311036789297659e-07, "loss": 6.3711, "step": 56700 }, { "epoch": 125.75547546437483, "eval_loss": 6.37741756439209, "eval_runtime": 172.9897, "eval_samples_per_second": 57.807, "eval_steps_per_second": 7.226, "step": 56700 }, { "epoch": 125.97726642639313, "grad_norm": 0.21832765638828278, "learning_rate": 3.210702341137124e-07, "loss": 6.3716, "step": 56800 }, { "epoch": 125.97726642639313, "eval_loss": 6.381014823913574, "eval_runtime": 175.5155, "eval_samples_per_second": 56.975, "eval_steps_per_second": 7.122, "step": 56800 }, { "epoch": 126.19905738841142, "grad_norm": 0.27440136671066284, "learning_rate": 3.110367892976589e-07, "loss": 6.3728, "step": 56900 }, { "epoch": 126.19905738841142, "eval_loss": 6.3819074630737305, "eval_runtime": 172.9421, "eval_samples_per_second": 57.823, "eval_steps_per_second": 7.228, "step": 56900 }, { "epoch": 126.42084835042972, "grad_norm": 0.24798136949539185, "learning_rate": 3.010033444816054e-07, "loss": 6.3702, "step": 57000 }, { "epoch": 126.42084835042972, "eval_loss": 6.379570484161377, "eval_runtime": 176.0012, "eval_samples_per_second": 56.818, "eval_steps_per_second": 7.102, "step": 57000 }, { "epoch": 126.64263931244801, "grad_norm": 0.196645587682724, "learning_rate": 2.9096989966555187e-07, "loss": 6.3702, "step": 57100 }, { "epoch": 126.64263931244801, "eval_loss": 6.3817267417907715, "eval_runtime": 173.0992, "eval_samples_per_second": 57.77, "eval_steps_per_second": 7.221, "step": 57100 }, { "epoch": 126.86443027446632, "grad_norm": 0.21966499090194702, "learning_rate": 2.809364548494983e-07, "loss": 6.3689, "step": 57200 }, { "epoch": 126.86443027446632, "eval_loss": 6.383100986480713, "eval_runtime": 175.7334, "eval_samples_per_second": 56.904, "eval_steps_per_second": 7.113, "step": 57200 }, { "epoch": 127.08622123648462, "grad_norm": 0.19457194209098816, "learning_rate": 2.709030100334448e-07, "loss": 6.371, "step": 57300 }, { "epoch": 127.08622123648462, "eval_loss": 6.381374835968018, "eval_runtime": 175.566, "eval_samples_per_second": 56.959, "eval_steps_per_second": 7.12, "step": 57300 }, { "epoch": 127.30801219850291, "grad_norm": 0.22573208808898926, "learning_rate": 2.608695652173913e-07, "loss": 6.3725, "step": 57400 }, { "epoch": 127.30801219850291, "eval_loss": 6.380834579467773, "eval_runtime": 175.5891, "eval_samples_per_second": 56.951, "eval_steps_per_second": 7.119, "step": 57400 }, { "epoch": 127.52980316052121, "grad_norm": 0.2630537748336792, "learning_rate": 2.508361204013378e-07, "loss": 6.3689, "step": 57500 }, { "epoch": 127.52980316052121, "eval_loss": 6.380504131317139, "eval_runtime": 175.5167, "eval_samples_per_second": 56.975, "eval_steps_per_second": 7.122, "step": 57500 }, { "epoch": 127.7515941225395, "grad_norm": 0.2693498134613037, "learning_rate": 2.408026755852843e-07, "loss": 6.3711, "step": 57600 }, { "epoch": 127.7515941225395, "eval_loss": 6.379264831542969, "eval_runtime": 175.6659, "eval_samples_per_second": 56.926, "eval_steps_per_second": 7.116, "step": 57600 }, { "epoch": 127.9733850845578, "grad_norm": 0.21430125832557678, "learning_rate": 2.3076923076923078e-07, "loss": 6.3701, "step": 57700 }, { "epoch": 127.9733850845578, "eval_loss": 6.383444309234619, "eval_runtime": 175.8385, "eval_samples_per_second": 56.87, "eval_steps_per_second": 7.109, "step": 57700 }, { "epoch": 128.1951760465761, "grad_norm": 0.23632164299488068, "learning_rate": 2.2073578595317728e-07, "loss": 6.37, "step": 57800 }, { "epoch": 128.1951760465761, "eval_loss": 6.381924152374268, "eval_runtime": 175.9161, "eval_samples_per_second": 56.845, "eval_steps_per_second": 7.106, "step": 57800 }, { "epoch": 128.4169670085944, "grad_norm": 0.20027929544448853, "learning_rate": 2.1070234113712372e-07, "loss": 6.3689, "step": 57900 }, { "epoch": 128.4169670085944, "eval_loss": 6.380605697631836, "eval_runtime": 175.7408, "eval_samples_per_second": 56.902, "eval_steps_per_second": 7.113, "step": 57900 }, { "epoch": 128.6387579706127, "grad_norm": 0.24598795175552368, "learning_rate": 2.0066889632107022e-07, "loss": 6.3703, "step": 58000 }, { "epoch": 128.6387579706127, "eval_loss": 6.380997180938721, "eval_runtime": 175.8242, "eval_samples_per_second": 56.875, "eval_steps_per_second": 7.109, "step": 58000 }, { "epoch": 128.860548932631, "grad_norm": 0.22210384905338287, "learning_rate": 1.9063545150501672e-07, "loss": 6.3713, "step": 58100 }, { "epoch": 128.860548932631, "eval_loss": 6.379730701446533, "eval_runtime": 175.6297, "eval_samples_per_second": 56.938, "eval_steps_per_second": 7.117, "step": 58100 }, { "epoch": 129.0823398946493, "grad_norm": 0.21533408761024475, "learning_rate": 1.8060200668896322e-07, "loss": 6.369, "step": 58200 }, { "epoch": 129.0823398946493, "eval_loss": 6.379825592041016, "eval_runtime": 173.1155, "eval_samples_per_second": 57.765, "eval_steps_per_second": 7.221, "step": 58200 }, { "epoch": 129.3041308566676, "grad_norm": 0.24441500008106232, "learning_rate": 1.705685618729097e-07, "loss": 6.3712, "step": 58300 }, { "epoch": 129.3041308566676, "eval_loss": 6.380709171295166, "eval_runtime": 175.6262, "eval_samples_per_second": 56.939, "eval_steps_per_second": 7.117, "step": 58300 }, { "epoch": 129.5259218186859, "grad_norm": 0.174821138381958, "learning_rate": 1.605351170568562e-07, "loss": 6.3694, "step": 58400 }, { "epoch": 129.5259218186859, "eval_loss": 6.3804545402526855, "eval_runtime": 174.2415, "eval_samples_per_second": 57.392, "eval_steps_per_second": 7.174, "step": 58400 }, { "epoch": 129.74771278070418, "grad_norm": 0.24464456737041473, "learning_rate": 1.505016722408027e-07, "loss": 6.3713, "step": 58500 }, { "epoch": 129.74771278070418, "eval_loss": 6.379507541656494, "eval_runtime": 175.5413, "eval_samples_per_second": 56.967, "eval_steps_per_second": 7.121, "step": 58500 }, { "epoch": 129.96950374272248, "grad_norm": 0.1928214728832245, "learning_rate": 1.4046822742474916e-07, "loss": 6.37, "step": 58600 }, { "epoch": 129.96950374272248, "eval_loss": 6.384742736816406, "eval_runtime": 173.0335, "eval_samples_per_second": 57.792, "eval_steps_per_second": 7.224, "step": 58600 }, { "epoch": 130.19129470474078, "grad_norm": 0.2976389229297638, "learning_rate": 1.3043478260869566e-07, "loss": 6.3717, "step": 58700 }, { "epoch": 130.19129470474078, "eval_loss": 6.3786187171936035, "eval_runtime": 175.6076, "eval_samples_per_second": 56.945, "eval_steps_per_second": 7.118, "step": 58700 }, { "epoch": 130.4130856667591, "grad_norm": 0.25023147463798523, "learning_rate": 1.2040133779264215e-07, "loss": 6.3685, "step": 58800 }, { "epoch": 130.4130856667591, "eval_loss": 6.383387565612793, "eval_runtime": 175.5163, "eval_samples_per_second": 56.975, "eval_steps_per_second": 7.122, "step": 58800 }, { "epoch": 130.6348766287774, "grad_norm": 0.21737854182720184, "learning_rate": 1.1036789297658864e-07, "loss": 6.3712, "step": 58900 }, { "epoch": 130.6348766287774, "eval_loss": 6.379786491394043, "eval_runtime": 175.7874, "eval_samples_per_second": 56.887, "eval_steps_per_second": 7.111, "step": 58900 }, { "epoch": 130.85666759079567, "grad_norm": 0.18008896708488464, "learning_rate": 1.0033444816053511e-07, "loss": 6.3701, "step": 59000 }, { "epoch": 130.85666759079567, "eval_loss": 6.378762722015381, "eval_runtime": 175.3457, "eval_samples_per_second": 57.03, "eval_steps_per_second": 7.129, "step": 59000 }, { "epoch": 131.07845855281397, "grad_norm": 0.26529356837272644, "learning_rate": 9.030100334448161e-08, "loss": 6.3706, "step": 59100 }, { "epoch": 131.07845855281397, "eval_loss": 6.384096622467041, "eval_runtime": 173.4291, "eval_samples_per_second": 57.66, "eval_steps_per_second": 7.208, "step": 59100 }, { "epoch": 131.30024951483227, "grad_norm": 0.2854064106941223, "learning_rate": 8.02675585284281e-08, "loss": 6.3699, "step": 59200 }, { "epoch": 131.30024951483227, "eval_loss": 6.38028621673584, "eval_runtime": 175.6366, "eval_samples_per_second": 56.936, "eval_steps_per_second": 7.117, "step": 59200 }, { "epoch": 131.52204047685058, "grad_norm": 0.2294512242078781, "learning_rate": 7.023411371237458e-08, "loss": 6.3711, "step": 59300 }, { "epoch": 131.52204047685058, "eval_loss": 6.384092330932617, "eval_runtime": 175.5659, "eval_samples_per_second": 56.959, "eval_steps_per_second": 7.12, "step": 59300 }, { "epoch": 131.74383143886885, "grad_norm": 0.23442944884300232, "learning_rate": 6.020066889632108e-08, "loss": 6.3704, "step": 59400 }, { "epoch": 131.74383143886885, "eval_loss": 6.382981300354004, "eval_runtime": 175.5589, "eval_samples_per_second": 56.961, "eval_steps_per_second": 7.12, "step": 59400 }, { "epoch": 131.96562240088716, "grad_norm": 0.216475710272789, "learning_rate": 5.0167224080267556e-08, "loss": 6.3708, "step": 59500 }, { "epoch": 131.96562240088716, "eval_loss": 6.381463050842285, "eval_runtime": 175.6519, "eval_samples_per_second": 56.931, "eval_steps_per_second": 7.116, "step": 59500 }, { "epoch": 132.18741336290546, "grad_norm": 0.2338051199913025, "learning_rate": 4.013377926421405e-08, "loss": 6.3693, "step": 59600 }, { "epoch": 132.18741336290546, "eval_loss": 6.379833698272705, "eval_runtime": 175.5243, "eval_samples_per_second": 56.972, "eval_steps_per_second": 7.122, "step": 59600 }, { "epoch": 132.40920432492376, "grad_norm": 0.20408721268177032, "learning_rate": 3.010033444816054e-08, "loss": 6.3683, "step": 59700 }, { "epoch": 132.40920432492376, "eval_loss": 6.38368034362793, "eval_runtime": 175.3937, "eval_samples_per_second": 57.015, "eval_steps_per_second": 7.127, "step": 59700 }, { "epoch": 132.63099528694207, "grad_norm": 0.24998629093170166, "learning_rate": 2.0066889632107024e-08, "loss": 6.3697, "step": 59800 }, { "epoch": 132.63099528694207, "eval_loss": 6.381494522094727, "eval_runtime": 176.0167, "eval_samples_per_second": 56.813, "eval_steps_per_second": 7.102, "step": 59800 }, { "epoch": 132.85278624896034, "grad_norm": 0.2178734391927719, "learning_rate": 1.0033444816053512e-08, "loss": 6.371, "step": 59900 }, { "epoch": 132.85278624896034, "eval_loss": 6.382035732269287, "eval_runtime": 173.4585, "eval_samples_per_second": 57.651, "eval_steps_per_second": 7.206, "step": 59900 }, { "epoch": 133.07457721097865, "grad_norm": 0.24738912284374237, "learning_rate": 0.0, "loss": 6.3696, "step": 60000 }, { "epoch": 133.07457721097865, "eval_loss": 6.382532596588135, "eval_runtime": 175.9883, "eval_samples_per_second": 56.822, "eval_steps_per_second": 7.103, "step": 60000 } ], "logging_steps": 100, "max_steps": 60000, "num_input_tokens_seen": 0, "num_train_epochs": 134, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 7.157205700133659e+19, "train_batch_size": 8, "trial_name": null, "trial_params": null }