{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.659152330103487,
"eval_steps": 500,
"global_step": 2500,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.001318304660206974,
"grad_norm": 4.59375,
"learning_rate": 0.0002,
"loss": 1.9624,
"step": 5
},
{
"epoch": 0.002636609320413948,
"grad_norm": 1.7421875,
"learning_rate": 0.00019986805647183008,
"loss": 0.6513,
"step": 10
},
{
"epoch": 0.003954913980620921,
"grad_norm": 1.84375,
"learning_rate": 0.00019973611294366012,
"loss": 0.1146,
"step": 15
},
{
"epoch": 0.005273218640827896,
"grad_norm": 1.3203125,
"learning_rate": 0.0001996041694154902,
"loss": 0.0529,
"step": 20
},
{
"epoch": 0.006591523301034869,
"grad_norm": 0.40234375,
"learning_rate": 0.00019947222588732023,
"loss": 0.1214,
"step": 25
},
{
"epoch": 0.007909827961241843,
"grad_norm": 1.5390625,
"learning_rate": 0.0001993402823591503,
"loss": 0.0919,
"step": 30
},
{
"epoch": 0.009228132621448816,
"grad_norm": 0.06201171875,
"learning_rate": 0.00019920833883098034,
"loss": 0.09,
"step": 35
},
{
"epoch": 0.010546437281655791,
"grad_norm": 1.53125,
"learning_rate": 0.0001990763953028104,
"loss": 0.1945,
"step": 40
},
{
"epoch": 0.011864741941862765,
"grad_norm": 0.2890625,
"learning_rate": 0.00019894445177464048,
"loss": 0.1259,
"step": 45
},
{
"epoch": 0.013183046602069738,
"grad_norm": 0.609375,
"learning_rate": 0.00019881250824647052,
"loss": 0.027,
"step": 50
},
{
"epoch": 0.014501351262276712,
"grad_norm": 0.369140625,
"learning_rate": 0.00019868056471830057,
"loss": 0.1068,
"step": 55
},
{
"epoch": 0.015819655922483685,
"grad_norm": 0.34765625,
"learning_rate": 0.00019854862119013064,
"loss": 0.0542,
"step": 60
},
{
"epoch": 0.01713796058269066,
"grad_norm": 0.055419921875,
"learning_rate": 0.00019841667766196068,
"loss": 0.0901,
"step": 65
},
{
"epoch": 0.018456265242897632,
"grad_norm": 0.0247802734375,
"learning_rate": 0.00019828473413379075,
"loss": 0.0091,
"step": 70
},
{
"epoch": 0.019774569903104607,
"grad_norm": 0.0079345703125,
"learning_rate": 0.0001981527906056208,
"loss": 0.0744,
"step": 75
},
{
"epoch": 0.021092874563311582,
"grad_norm": 0.65234375,
"learning_rate": 0.00019802084707745086,
"loss": 0.1108,
"step": 80
},
{
"epoch": 0.022411179223518554,
"grad_norm": 0.50390625,
"learning_rate": 0.0001978889035492809,
"loss": 0.0446,
"step": 85
},
{
"epoch": 0.02372948388372553,
"grad_norm": 0.1787109375,
"learning_rate": 0.00019775696002111097,
"loss": 0.0982,
"step": 90
},
{
"epoch": 0.0250477885439325,
"grad_norm": 0.490234375,
"learning_rate": 0.00019762501649294104,
"loss": 0.1035,
"step": 95
},
{
"epoch": 0.026366093204139476,
"grad_norm": 0.12158203125,
"learning_rate": 0.00019749307296477108,
"loss": 0.0401,
"step": 100
},
{
"epoch": 0.02768439786434645,
"grad_norm": 0.16015625,
"learning_rate": 0.00019736112943660115,
"loss": 0.0309,
"step": 105
},
{
"epoch": 0.029002702524553423,
"grad_norm": 1.359375,
"learning_rate": 0.0001972291859084312,
"loss": 0.1032,
"step": 110
},
{
"epoch": 0.0303210071847604,
"grad_norm": 0.52734375,
"learning_rate": 0.00019709724238026126,
"loss": 0.0811,
"step": 115
},
{
"epoch": 0.03163931184496737,
"grad_norm": 0.177734375,
"learning_rate": 0.00019696529885209133,
"loss": 0.0258,
"step": 120
},
{
"epoch": 0.03295761650517435,
"grad_norm": 0.234375,
"learning_rate": 0.00019683335532392137,
"loss": 0.0437,
"step": 125
},
{
"epoch": 0.03427592116538132,
"grad_norm": 1.3046875,
"learning_rate": 0.00019670141179575144,
"loss": 0.0967,
"step": 130
},
{
"epoch": 0.03559422582558829,
"grad_norm": 0.2734375,
"learning_rate": 0.00019656946826758148,
"loss": 0.0132,
"step": 135
},
{
"epoch": 0.036912530485795264,
"grad_norm": 0.66015625,
"learning_rate": 0.00019643752473941155,
"loss": 0.0396,
"step": 140
},
{
"epoch": 0.03823083514600224,
"grad_norm": 1.0546875,
"learning_rate": 0.0001963055812112416,
"loss": 0.0449,
"step": 145
},
{
"epoch": 0.039549139806209214,
"grad_norm": 0.2021484375,
"learning_rate": 0.00019617363768307166,
"loss": 0.1196,
"step": 150
},
{
"epoch": 0.040867444466416186,
"grad_norm": 0.5859375,
"learning_rate": 0.0001960416941549017,
"loss": 0.0588,
"step": 155
},
{
"epoch": 0.042185749126623165,
"grad_norm": 0.06005859375,
"learning_rate": 0.00019590975062673175,
"loss": 0.0234,
"step": 160
},
{
"epoch": 0.04350405378683014,
"grad_norm": 0.4921875,
"learning_rate": 0.00019577780709856182,
"loss": 0.0916,
"step": 165
},
{
"epoch": 0.04482235844703711,
"grad_norm": 0.84375,
"learning_rate": 0.0001956458635703919,
"loss": 0.0271,
"step": 170
},
{
"epoch": 0.04614066310724409,
"grad_norm": 0.8828125,
"learning_rate": 0.00019551392004222193,
"loss": 0.0175,
"step": 175
},
{
"epoch": 0.04745896776745106,
"grad_norm": 0.0152587890625,
"learning_rate": 0.000195381976514052,
"loss": 0.0356,
"step": 180
},
{
"epoch": 0.04877727242765803,
"grad_norm": 0.09326171875,
"learning_rate": 0.00019525003298588204,
"loss": 0.0057,
"step": 185
},
{
"epoch": 0.050095577087865,
"grad_norm": 0.24609375,
"learning_rate": 0.0001951180894577121,
"loss": 0.0082,
"step": 190
},
{
"epoch": 0.05141388174807198,
"grad_norm": 0.05029296875,
"learning_rate": 0.00019498614592954215,
"loss": 0.0178,
"step": 195
},
{
"epoch": 0.05273218640827895,
"grad_norm": 0.0390625,
"learning_rate": 0.00019485420240137222,
"loss": 0.0789,
"step": 200
},
{
"epoch": 0.054050491068485924,
"grad_norm": 0.5625,
"learning_rate": 0.0001947222588732023,
"loss": 0.0645,
"step": 205
},
{
"epoch": 0.0553687957286929,
"grad_norm": 0.53515625,
"learning_rate": 0.00019459031534503233,
"loss": 0.116,
"step": 210
},
{
"epoch": 0.056687100388899875,
"grad_norm": 0.55078125,
"learning_rate": 0.0001944583718168624,
"loss": 0.0516,
"step": 215
},
{
"epoch": 0.058005405049106847,
"grad_norm": 0.314453125,
"learning_rate": 0.00019432642828869244,
"loss": 0.1019,
"step": 220
},
{
"epoch": 0.059323709709313825,
"grad_norm": 0.1123046875,
"learning_rate": 0.0001941944847605225,
"loss": 0.0529,
"step": 225
},
{
"epoch": 0.0606420143695208,
"grad_norm": 0.4921875,
"learning_rate": 0.00019406254123235256,
"loss": 0.0368,
"step": 230
},
{
"epoch": 0.06196031902972777,
"grad_norm": 0.054443359375,
"learning_rate": 0.00019393059770418262,
"loss": 0.037,
"step": 235
},
{
"epoch": 0.06327862368993474,
"grad_norm": 0.008544921875,
"learning_rate": 0.0001937986541760127,
"loss": 0.0324,
"step": 240
},
{
"epoch": 0.06459692835014172,
"grad_norm": 1.5,
"learning_rate": 0.00019366671064784274,
"loss": 0.0334,
"step": 245
},
{
"epoch": 0.0659152330103487,
"grad_norm": 0.2109375,
"learning_rate": 0.0001935347671196728,
"loss": 0.0671,
"step": 250
},
{
"epoch": 0.06723353767055566,
"grad_norm": 2.0625,
"learning_rate": 0.00019340282359150285,
"loss": 0.1559,
"step": 255
},
{
"epoch": 0.06855184233076264,
"grad_norm": 0.7734375,
"learning_rate": 0.0001932708800633329,
"loss": 0.0198,
"step": 260
},
{
"epoch": 0.06987014699096962,
"grad_norm": 0.42578125,
"learning_rate": 0.00019313893653516296,
"loss": 0.0151,
"step": 265
},
{
"epoch": 0.07118845165117658,
"grad_norm": 0.1884765625,
"learning_rate": 0.000193006993006993,
"loss": 0.0269,
"step": 270
},
{
"epoch": 0.07250675631138356,
"grad_norm": 1.546875,
"learning_rate": 0.00019287504947882307,
"loss": 0.0565,
"step": 275
},
{
"epoch": 0.07382506097159053,
"grad_norm": 0.5078125,
"learning_rate": 0.0001927431059506531,
"loss": 0.0942,
"step": 280
},
{
"epoch": 0.0751433656317975,
"grad_norm": 0.392578125,
"learning_rate": 0.00019261116242248318,
"loss": 0.0061,
"step": 285
},
{
"epoch": 0.07646167029200449,
"grad_norm": 1.9140625,
"learning_rate": 0.00019247921889431325,
"loss": 0.0497,
"step": 290
},
{
"epoch": 0.07777997495221145,
"grad_norm": 0.08837890625,
"learning_rate": 0.0001923472753661433,
"loss": 0.0573,
"step": 295
},
{
"epoch": 0.07909827961241843,
"grad_norm": 1.046875,
"learning_rate": 0.00019221533183797336,
"loss": 0.0528,
"step": 300
},
{
"epoch": 0.08041658427262541,
"grad_norm": 0.2275390625,
"learning_rate": 0.0001920833883098034,
"loss": 0.0506,
"step": 305
},
{
"epoch": 0.08173488893283237,
"grad_norm": 0.08203125,
"learning_rate": 0.00019195144478163347,
"loss": 0.0307,
"step": 310
},
{
"epoch": 0.08305319359303935,
"grad_norm": 0.111328125,
"learning_rate": 0.00019181950125346354,
"loss": 0.0365,
"step": 315
},
{
"epoch": 0.08437149825324633,
"grad_norm": 1.2890625,
"learning_rate": 0.00019168755772529358,
"loss": 0.0447,
"step": 320
},
{
"epoch": 0.0856898029134533,
"grad_norm": 0.6015625,
"learning_rate": 0.00019155561419712365,
"loss": 0.0605,
"step": 325
},
{
"epoch": 0.08700810757366027,
"grad_norm": 0.71875,
"learning_rate": 0.0001914236706689537,
"loss": 0.0846,
"step": 330
},
{
"epoch": 0.08832641223386725,
"grad_norm": 0.1494140625,
"learning_rate": 0.00019129172714078376,
"loss": 0.0713,
"step": 335
},
{
"epoch": 0.08964471689407422,
"grad_norm": 0.1669921875,
"learning_rate": 0.0001911597836126138,
"loss": 0.0826,
"step": 340
},
{
"epoch": 0.0909630215542812,
"grad_norm": 2.203125,
"learning_rate": 0.00019102784008444388,
"loss": 0.0441,
"step": 345
},
{
"epoch": 0.09228132621448817,
"grad_norm": 1.21875,
"learning_rate": 0.00019089589655627395,
"loss": 0.1378,
"step": 350
},
{
"epoch": 0.09359963087469514,
"grad_norm": 3.0625,
"learning_rate": 0.00019076395302810396,
"loss": 0.1552,
"step": 355
},
{
"epoch": 0.09491793553490212,
"grad_norm": 0.232421875,
"learning_rate": 0.00019063200949993403,
"loss": 0.0458,
"step": 360
},
{
"epoch": 0.0962362401951091,
"grad_norm": 0.71875,
"learning_rate": 0.0001905000659717641,
"loss": 0.0312,
"step": 365
},
{
"epoch": 0.09755454485531606,
"grad_norm": 0.0218505859375,
"learning_rate": 0.00019036812244359414,
"loss": 0.0247,
"step": 370
},
{
"epoch": 0.09887284951552304,
"grad_norm": 0.064453125,
"learning_rate": 0.0001902361789154242,
"loss": 0.054,
"step": 375
},
{
"epoch": 0.10019115417573,
"grad_norm": 0.021240234375,
"learning_rate": 0.00019010423538725425,
"loss": 0.0023,
"step": 380
},
{
"epoch": 0.10150945883593698,
"grad_norm": 0.0361328125,
"learning_rate": 0.00018997229185908432,
"loss": 0.0884,
"step": 385
},
{
"epoch": 0.10282776349614396,
"grad_norm": 1.703125,
"learning_rate": 0.00018984034833091436,
"loss": 0.0506,
"step": 390
},
{
"epoch": 0.10414606815635093,
"grad_norm": 0.08837890625,
"learning_rate": 0.00018970840480274443,
"loss": 0.1123,
"step": 395
},
{
"epoch": 0.1054643728165579,
"grad_norm": 0.6953125,
"learning_rate": 0.0001895764612745745,
"loss": 0.0597,
"step": 400
},
{
"epoch": 0.10678267747676488,
"grad_norm": 0.18359375,
"learning_rate": 0.00018944451774640454,
"loss": 0.0138,
"step": 405
},
{
"epoch": 0.10810098213697185,
"grad_norm": 0.0272216796875,
"learning_rate": 0.0001893125742182346,
"loss": 0.0249,
"step": 410
},
{
"epoch": 0.10941928679717883,
"grad_norm": 0.00970458984375,
"learning_rate": 0.00018918063069006466,
"loss": 0.0084,
"step": 415
},
{
"epoch": 0.1107375914573858,
"grad_norm": 0.54296875,
"learning_rate": 0.00018904868716189472,
"loss": 0.0541,
"step": 420
},
{
"epoch": 0.11205589611759277,
"grad_norm": 0.74609375,
"learning_rate": 0.00018891674363372477,
"loss": 0.007,
"step": 425
},
{
"epoch": 0.11337420077779975,
"grad_norm": 0.0211181640625,
"learning_rate": 0.00018878480010555484,
"loss": 0.0875,
"step": 430
},
{
"epoch": 0.11469250543800673,
"grad_norm": 0.9296875,
"learning_rate": 0.0001886528565773849,
"loss": 0.1207,
"step": 435
},
{
"epoch": 0.11601081009821369,
"grad_norm": 1.2734375,
"learning_rate": 0.00018852091304921495,
"loss": 0.1143,
"step": 440
},
{
"epoch": 0.11732911475842067,
"grad_norm": 0.6484375,
"learning_rate": 0.00018838896952104502,
"loss": 0.0393,
"step": 445
},
{
"epoch": 0.11864741941862765,
"grad_norm": 0.1552734375,
"learning_rate": 0.00018825702599287506,
"loss": 0.02,
"step": 450
},
{
"epoch": 0.11996572407883462,
"grad_norm": 0.486328125,
"learning_rate": 0.0001881250824647051,
"loss": 0.0891,
"step": 455
},
{
"epoch": 0.1212840287390416,
"grad_norm": 1.0,
"learning_rate": 0.00018799313893653517,
"loss": 0.0469,
"step": 460
},
{
"epoch": 0.12260233339924857,
"grad_norm": 0.2099609375,
"learning_rate": 0.0001878611954083652,
"loss": 0.019,
"step": 465
},
{
"epoch": 0.12392063805945554,
"grad_norm": 0.03857421875,
"learning_rate": 0.00018772925188019528,
"loss": 0.007,
"step": 470
},
{
"epoch": 0.12523894271966252,
"grad_norm": 0.0257568359375,
"learning_rate": 0.00018759730835202532,
"loss": 0.0039,
"step": 475
},
{
"epoch": 0.12655724737986948,
"grad_norm": 0.014404296875,
"learning_rate": 0.0001874653648238554,
"loss": 0.0043,
"step": 480
},
{
"epoch": 0.12787555204007647,
"grad_norm": 0.51953125,
"learning_rate": 0.00018733342129568546,
"loss": 0.1326,
"step": 485
},
{
"epoch": 0.12919385670028344,
"grad_norm": 0.99609375,
"learning_rate": 0.0001872014777675155,
"loss": 0.0369,
"step": 490
},
{
"epoch": 0.1305121613604904,
"grad_norm": 0.2734375,
"learning_rate": 0.00018706953423934557,
"loss": 0.0395,
"step": 495
},
{
"epoch": 0.1318304660206974,
"grad_norm": 0.083984375,
"learning_rate": 0.00018693759071117561,
"loss": 0.0284,
"step": 500
},
{
"epoch": 0.1318304660206974,
"eval_loss": 0.04542969539761543,
"eval_model_preparation_time": 0.0076,
"eval_runtime": 457.5293,
"eval_samples_per_second": 7.37,
"eval_steps_per_second": 3.685,
"step": 500
},
{
"epoch": 0.13314877068090436,
"grad_norm": 0.0291748046875,
"learning_rate": 0.00018680564718300568,
"loss": 0.0533,
"step": 505
},
{
"epoch": 0.13446707534111133,
"grad_norm": 0.71484375,
"learning_rate": 0.00018667370365483575,
"loss": 0.0183,
"step": 510
},
{
"epoch": 0.13578538000131832,
"grad_norm": 0.018798828125,
"learning_rate": 0.0001865417601266658,
"loss": 0.0473,
"step": 515
},
{
"epoch": 0.13710368466152528,
"grad_norm": 0.388671875,
"learning_rate": 0.00018640981659849586,
"loss": 0.0562,
"step": 520
},
{
"epoch": 0.13842198932173225,
"grad_norm": 0.77734375,
"learning_rate": 0.0001862778730703259,
"loss": 0.0755,
"step": 525
},
{
"epoch": 0.13974029398193924,
"grad_norm": 2.8125,
"learning_rate": 0.00018614592954215598,
"loss": 0.0422,
"step": 530
},
{
"epoch": 0.1410585986421462,
"grad_norm": 0.48828125,
"learning_rate": 0.00018601398601398602,
"loss": 0.0882,
"step": 535
},
{
"epoch": 0.14237690330235317,
"grad_norm": 0.16015625,
"learning_rate": 0.0001858820424858161,
"loss": 0.0131,
"step": 540
},
{
"epoch": 0.14369520796256013,
"grad_norm": 0.31640625,
"learning_rate": 0.00018575009895764616,
"loss": 0.03,
"step": 545
},
{
"epoch": 0.14501351262276713,
"grad_norm": 0.0120849609375,
"learning_rate": 0.0001856181554294762,
"loss": 0.0425,
"step": 550
},
{
"epoch": 0.1463318172829741,
"grad_norm": 0.390625,
"learning_rate": 0.00018548621190130624,
"loss": 0.011,
"step": 555
},
{
"epoch": 0.14765012194318106,
"grad_norm": 1.9609375,
"learning_rate": 0.0001853542683731363,
"loss": 0.0807,
"step": 560
},
{
"epoch": 0.14896842660338805,
"grad_norm": 0.609375,
"learning_rate": 0.00018522232484496635,
"loss": 0.0278,
"step": 565
},
{
"epoch": 0.150286731263595,
"grad_norm": 0.087890625,
"learning_rate": 0.00018509038131679642,
"loss": 0.0484,
"step": 570
},
{
"epoch": 0.15160503592380198,
"grad_norm": 0.5078125,
"learning_rate": 0.00018495843778862646,
"loss": 0.1277,
"step": 575
},
{
"epoch": 0.15292334058400897,
"grad_norm": 0.8125,
"learning_rate": 0.00018482649426045653,
"loss": 0.058,
"step": 580
},
{
"epoch": 0.15424164524421594,
"grad_norm": 0.22265625,
"learning_rate": 0.00018469455073228657,
"loss": 0.0259,
"step": 585
},
{
"epoch": 0.1555599499044229,
"grad_norm": 1.8984375,
"learning_rate": 0.00018456260720411664,
"loss": 0.113,
"step": 590
},
{
"epoch": 0.1568782545646299,
"grad_norm": 0.12451171875,
"learning_rate": 0.0001844306636759467,
"loss": 0.0312,
"step": 595
},
{
"epoch": 0.15819655922483686,
"grad_norm": 0.0322265625,
"learning_rate": 0.00018429872014777676,
"loss": 0.0476,
"step": 600
},
{
"epoch": 0.15951486388504382,
"grad_norm": 0.0281982421875,
"learning_rate": 0.00018416677661960682,
"loss": 0.0232,
"step": 605
},
{
"epoch": 0.16083316854525082,
"grad_norm": 0.57421875,
"learning_rate": 0.00018403483309143687,
"loss": 0.1287,
"step": 610
},
{
"epoch": 0.16215147320545778,
"grad_norm": 0.765625,
"learning_rate": 0.00018390288956326694,
"loss": 0.0991,
"step": 615
},
{
"epoch": 0.16346977786566474,
"grad_norm": 0.3125,
"learning_rate": 0.00018377094603509698,
"loss": 0.0247,
"step": 620
},
{
"epoch": 0.16478808252587174,
"grad_norm": 0.37890625,
"learning_rate": 0.00018363900250692705,
"loss": 0.0632,
"step": 625
},
{
"epoch": 0.1661063871860787,
"grad_norm": 0.1494140625,
"learning_rate": 0.00018350705897875712,
"loss": 0.0314,
"step": 630
},
{
"epoch": 0.16742469184628567,
"grad_norm": 0.0673828125,
"learning_rate": 0.00018337511545058716,
"loss": 0.0425,
"step": 635
},
{
"epoch": 0.16874299650649266,
"grad_norm": 0.396484375,
"learning_rate": 0.00018324317192241723,
"loss": 0.0613,
"step": 640
},
{
"epoch": 0.17006130116669962,
"grad_norm": 0.057373046875,
"learning_rate": 0.00018311122839424727,
"loss": 0.0569,
"step": 645
},
{
"epoch": 0.1713796058269066,
"grad_norm": 0.001373291015625,
"learning_rate": 0.00018297928486607734,
"loss": 0.007,
"step": 650
},
{
"epoch": 0.17269791048711358,
"grad_norm": 1.0859375,
"learning_rate": 0.00018284734133790738,
"loss": 0.0189,
"step": 655
},
{
"epoch": 0.17401621514732055,
"grad_norm": 0.6015625,
"learning_rate": 0.00018271539780973742,
"loss": 0.0601,
"step": 660
},
{
"epoch": 0.1753345198075275,
"grad_norm": 0.25390625,
"learning_rate": 0.0001825834542815675,
"loss": 0.0211,
"step": 665
},
{
"epoch": 0.1766528244677345,
"grad_norm": 2.6875,
"learning_rate": 0.00018245151075339753,
"loss": 0.0713,
"step": 670
},
{
"epoch": 0.17797112912794147,
"grad_norm": 1.1875,
"learning_rate": 0.0001823195672252276,
"loss": 0.0522,
"step": 675
},
{
"epoch": 0.17928943378814843,
"grad_norm": 0.025146484375,
"learning_rate": 0.00018218762369705767,
"loss": 0.0242,
"step": 680
},
{
"epoch": 0.18060773844835543,
"grad_norm": 0.048095703125,
"learning_rate": 0.00018205568016888772,
"loss": 0.0129,
"step": 685
},
{
"epoch": 0.1819260431085624,
"grad_norm": 0.04541015625,
"learning_rate": 0.00018192373664071778,
"loss": 0.0142,
"step": 690
},
{
"epoch": 0.18324434776876936,
"grad_norm": 0.00830078125,
"learning_rate": 0.00018179179311254783,
"loss": 0.0121,
"step": 695
},
{
"epoch": 0.18456265242897635,
"grad_norm": 0.53125,
"learning_rate": 0.0001816598495843779,
"loss": 0.0163,
"step": 700
},
{
"epoch": 0.1858809570891833,
"grad_norm": 0.185546875,
"learning_rate": 0.00018152790605620796,
"loss": 0.0203,
"step": 705
},
{
"epoch": 0.18719926174939028,
"grad_norm": 1.2578125,
"learning_rate": 0.000181395962528038,
"loss": 0.1548,
"step": 710
},
{
"epoch": 0.18851756640959727,
"grad_norm": 0.0247802734375,
"learning_rate": 0.00018126401899986808,
"loss": 0.0543,
"step": 715
},
{
"epoch": 0.18983587106980424,
"grad_norm": 0.07568359375,
"learning_rate": 0.00018113207547169812,
"loss": 0.0346,
"step": 720
},
{
"epoch": 0.1911541757300112,
"grad_norm": 0.1318359375,
"learning_rate": 0.0001810001319435282,
"loss": 0.03,
"step": 725
},
{
"epoch": 0.1924724803902182,
"grad_norm": 0.1455078125,
"learning_rate": 0.00018086818841535823,
"loss": 0.0796,
"step": 730
},
{
"epoch": 0.19379078505042516,
"grad_norm": 0.09814453125,
"learning_rate": 0.0001807362448871883,
"loss": 0.0662,
"step": 735
},
{
"epoch": 0.19510908971063212,
"grad_norm": 0.91015625,
"learning_rate": 0.00018060430135901837,
"loss": 0.0675,
"step": 740
},
{
"epoch": 0.19642739437083911,
"grad_norm": 0.10693359375,
"learning_rate": 0.0001804723578308484,
"loss": 0.0377,
"step": 745
},
{
"epoch": 0.19774569903104608,
"grad_norm": 0.95703125,
"learning_rate": 0.00018034041430267848,
"loss": 0.0174,
"step": 750
},
{
"epoch": 0.19906400369125304,
"grad_norm": 1.7890625,
"learning_rate": 0.00018020847077450852,
"loss": 0.0278,
"step": 755
},
{
"epoch": 0.20038230835146,
"grad_norm": 0.8515625,
"learning_rate": 0.00018007652724633856,
"loss": 0.0113,
"step": 760
},
{
"epoch": 0.201700613011667,
"grad_norm": 0.016845703125,
"learning_rate": 0.00017994458371816863,
"loss": 0.0589,
"step": 765
},
{
"epoch": 0.20301891767187397,
"grad_norm": 0.01043701171875,
"learning_rate": 0.00017981264018999867,
"loss": 0.0203,
"step": 770
},
{
"epoch": 0.20433722233208093,
"grad_norm": 0.0242919921875,
"learning_rate": 0.00017968069666182874,
"loss": 0.0494,
"step": 775
},
{
"epoch": 0.20565552699228792,
"grad_norm": 0.56640625,
"learning_rate": 0.00017954875313365879,
"loss": 0.0394,
"step": 780
},
{
"epoch": 0.2069738316524949,
"grad_norm": 0.06591796875,
"learning_rate": 0.00017941680960548886,
"loss": 0.0848,
"step": 785
},
{
"epoch": 0.20829213631270185,
"grad_norm": 0.40234375,
"learning_rate": 0.00017928486607731892,
"loss": 0.0464,
"step": 790
},
{
"epoch": 0.20961044097290885,
"grad_norm": 0.06298828125,
"learning_rate": 0.00017915292254914897,
"loss": 0.0222,
"step": 795
},
{
"epoch": 0.2109287456331158,
"grad_norm": 0.5390625,
"learning_rate": 0.00017902097902097904,
"loss": 0.0434,
"step": 800
},
{
"epoch": 0.21224705029332278,
"grad_norm": 1.390625,
"learning_rate": 0.00017888903549280908,
"loss": 0.0222,
"step": 805
},
{
"epoch": 0.21356535495352977,
"grad_norm": 0.0272216796875,
"learning_rate": 0.00017875709196463915,
"loss": 0.0099,
"step": 810
},
{
"epoch": 0.21488365961373673,
"grad_norm": 0.10009765625,
"learning_rate": 0.0001786251484364692,
"loss": 0.0086,
"step": 815
},
{
"epoch": 0.2162019642739437,
"grad_norm": 0.06396484375,
"learning_rate": 0.00017849320490829926,
"loss": 0.0715,
"step": 820
},
{
"epoch": 0.2175202689341507,
"grad_norm": 0.365234375,
"learning_rate": 0.00017836126138012933,
"loss": 0.0642,
"step": 825
},
{
"epoch": 0.21883857359435765,
"grad_norm": 0.01519775390625,
"learning_rate": 0.00017822931785195937,
"loss": 0.0111,
"step": 830
},
{
"epoch": 0.22015687825456462,
"grad_norm": 1.1640625,
"learning_rate": 0.00017809737432378944,
"loss": 0.0518,
"step": 835
},
{
"epoch": 0.2214751829147716,
"grad_norm": 0.00921630859375,
"learning_rate": 0.00017796543079561948,
"loss": 0.0384,
"step": 840
},
{
"epoch": 0.22279348757497858,
"grad_norm": 0.33984375,
"learning_rate": 0.00017783348726744955,
"loss": 0.0204,
"step": 845
},
{
"epoch": 0.22411179223518554,
"grad_norm": 0.294921875,
"learning_rate": 0.00017770154373927962,
"loss": 0.0075,
"step": 850
},
{
"epoch": 0.22543009689539253,
"grad_norm": 0.033203125,
"learning_rate": 0.00017756960021110963,
"loss": 0.0895,
"step": 855
},
{
"epoch": 0.2267484015555995,
"grad_norm": 0.08056640625,
"learning_rate": 0.0001774376566829397,
"loss": 0.1039,
"step": 860
},
{
"epoch": 0.22806670621580646,
"grad_norm": 0.55078125,
"learning_rate": 0.00017730571315476975,
"loss": 0.0125,
"step": 865
},
{
"epoch": 0.22938501087601346,
"grad_norm": 0.5859375,
"learning_rate": 0.00017717376962659982,
"loss": 0.0381,
"step": 870
},
{
"epoch": 0.23070331553622042,
"grad_norm": 0.029052734375,
"learning_rate": 0.00017704182609842988,
"loss": 0.0434,
"step": 875
},
{
"epoch": 0.23202162019642739,
"grad_norm": 0.43359375,
"learning_rate": 0.00017690988257025993,
"loss": 0.0799,
"step": 880
},
{
"epoch": 0.23333992485663438,
"grad_norm": 0.04150390625,
"learning_rate": 0.00017677793904209,
"loss": 0.0692,
"step": 885
},
{
"epoch": 0.23465822951684134,
"grad_norm": 0.435546875,
"learning_rate": 0.00017664599551392004,
"loss": 0.0544,
"step": 890
},
{
"epoch": 0.2359765341770483,
"grad_norm": 1.171875,
"learning_rate": 0.0001765140519857501,
"loss": 0.0619,
"step": 895
},
{
"epoch": 0.2372948388372553,
"grad_norm": 0.01263427734375,
"learning_rate": 0.00017638210845758018,
"loss": 0.0418,
"step": 900
},
{
"epoch": 0.23861314349746227,
"grad_norm": 0.017578125,
"learning_rate": 0.00017625016492941022,
"loss": 0.0195,
"step": 905
},
{
"epoch": 0.23993144815766923,
"grad_norm": 0.6171875,
"learning_rate": 0.0001761182214012403,
"loss": 0.067,
"step": 910
},
{
"epoch": 0.24124975281787622,
"grad_norm": 0.59765625,
"learning_rate": 0.00017598627787307033,
"loss": 0.049,
"step": 915
},
{
"epoch": 0.2425680574780832,
"grad_norm": 1.2421875,
"learning_rate": 0.0001758543343449004,
"loss": 0.0539,
"step": 920
},
{
"epoch": 0.24388636213829015,
"grad_norm": 0.10302734375,
"learning_rate": 0.00017572239081673044,
"loss": 0.0725,
"step": 925
},
{
"epoch": 0.24520466679849715,
"grad_norm": 0.330078125,
"learning_rate": 0.0001755904472885605,
"loss": 0.064,
"step": 930
},
{
"epoch": 0.2465229714587041,
"grad_norm": 0.220703125,
"learning_rate": 0.00017545850376039058,
"loss": 0.0271,
"step": 935
},
{
"epoch": 0.24784127611891107,
"grad_norm": 0.01470947265625,
"learning_rate": 0.00017532656023222062,
"loss": 0.0247,
"step": 940
},
{
"epoch": 0.24915958077911807,
"grad_norm": 0.013427734375,
"learning_rate": 0.0001751946167040507,
"loss": 0.017,
"step": 945
},
{
"epoch": 0.25047788543932503,
"grad_norm": 0.58984375,
"learning_rate": 0.00017506267317588073,
"loss": 0.0254,
"step": 950
},
{
"epoch": 0.251796190099532,
"grad_norm": 0.412109375,
"learning_rate": 0.00017493072964771078,
"loss": 0.0186,
"step": 955
},
{
"epoch": 0.25311449475973896,
"grad_norm": 0.66796875,
"learning_rate": 0.00017479878611954084,
"loss": 0.0617,
"step": 960
},
{
"epoch": 0.25443279941994595,
"grad_norm": 0.322265625,
"learning_rate": 0.00017466684259137089,
"loss": 0.0173,
"step": 965
},
{
"epoch": 0.25575110408015295,
"grad_norm": 0.83203125,
"learning_rate": 0.00017453489906320096,
"loss": 0.0512,
"step": 970
},
{
"epoch": 0.2570694087403599,
"grad_norm": 0.08447265625,
"learning_rate": 0.000174402955535031,
"loss": 0.0361,
"step": 975
},
{
"epoch": 0.2583877134005669,
"grad_norm": 0.423828125,
"learning_rate": 0.00017427101200686107,
"loss": 0.0175,
"step": 980
},
{
"epoch": 0.25970601806077387,
"grad_norm": 0.77734375,
"learning_rate": 0.00017413906847869114,
"loss": 0.0139,
"step": 985
},
{
"epoch": 0.2610243227209808,
"grad_norm": 0.515625,
"learning_rate": 0.00017400712495052118,
"loss": 0.0948,
"step": 990
},
{
"epoch": 0.2623426273811878,
"grad_norm": 1.421875,
"learning_rate": 0.00017387518142235125,
"loss": 0.0406,
"step": 995
},
{
"epoch": 0.2636609320413948,
"grad_norm": 0.058837890625,
"learning_rate": 0.0001737432378941813,
"loss": 0.1011,
"step": 1000
},
{
"epoch": 0.2636609320413948,
"eval_loss": 0.045552924275398254,
"eval_model_preparation_time": 0.0076,
"eval_runtime": 457.6113,
"eval_samples_per_second": 7.369,
"eval_steps_per_second": 3.684,
"step": 1000
},
{
"epoch": 0.26497923670160173,
"grad_norm": 0.380859375,
"learning_rate": 0.00017361129436601136,
"loss": 0.0711,
"step": 1005
},
{
"epoch": 0.2662975413618087,
"grad_norm": 0.0208740234375,
"learning_rate": 0.00017347935083784143,
"loss": 0.0218,
"step": 1010
},
{
"epoch": 0.2676158460220157,
"grad_norm": 0.04345703125,
"learning_rate": 0.00017334740730967147,
"loss": 0.0301,
"step": 1015
},
{
"epoch": 0.26893415068222265,
"grad_norm": 0.2734375,
"learning_rate": 0.00017321546378150154,
"loss": 0.0721,
"step": 1020
},
{
"epoch": 0.27025245534242964,
"grad_norm": 0.25390625,
"learning_rate": 0.00017308352025333158,
"loss": 0.0363,
"step": 1025
},
{
"epoch": 0.27157076000263664,
"grad_norm": 0.04345703125,
"learning_rate": 0.00017295157672516165,
"loss": 0.0313,
"step": 1030
},
{
"epoch": 0.2728890646628436,
"grad_norm": 0.0211181640625,
"learning_rate": 0.0001728196331969917,
"loss": 0.0385,
"step": 1035
},
{
"epoch": 0.27420736932305056,
"grad_norm": 0.00787353515625,
"learning_rate": 0.00017268768966882176,
"loss": 0.0405,
"step": 1040
},
{
"epoch": 0.27552567398325756,
"grad_norm": 0.484375,
"learning_rate": 0.00017255574614065183,
"loss": 0.0616,
"step": 1045
},
{
"epoch": 0.2768439786434645,
"grad_norm": 0.0908203125,
"learning_rate": 0.00017242380261248185,
"loss": 0.0057,
"step": 1050
},
{
"epoch": 0.2781622833036715,
"grad_norm": 0.1904296875,
"learning_rate": 0.00017229185908431192,
"loss": 0.0417,
"step": 1055
},
{
"epoch": 0.2794805879638785,
"grad_norm": 0.30078125,
"learning_rate": 0.00017215991555614196,
"loss": 0.0346,
"step": 1060
},
{
"epoch": 0.2807988926240854,
"grad_norm": 0.016357421875,
"learning_rate": 0.00017202797202797203,
"loss": 0.0295,
"step": 1065
},
{
"epoch": 0.2821171972842924,
"grad_norm": 0.490234375,
"learning_rate": 0.0001718960284998021,
"loss": 0.0448,
"step": 1070
},
{
"epoch": 0.28343550194449935,
"grad_norm": 0.004241943359375,
"learning_rate": 0.00017176408497163214,
"loss": 0.0051,
"step": 1075
},
{
"epoch": 0.28475380660470634,
"grad_norm": 0.01904296875,
"learning_rate": 0.0001716321414434622,
"loss": 0.0894,
"step": 1080
},
{
"epoch": 0.28607211126491333,
"grad_norm": 0.83984375,
"learning_rate": 0.00017150019791529225,
"loss": 0.0288,
"step": 1085
},
{
"epoch": 0.28739041592512027,
"grad_norm": 0.2021484375,
"learning_rate": 0.00017136825438712232,
"loss": 0.0222,
"step": 1090
},
{
"epoch": 0.28870872058532726,
"grad_norm": 0.322265625,
"learning_rate": 0.0001712363108589524,
"loss": 0.0444,
"step": 1095
},
{
"epoch": 0.29002702524553425,
"grad_norm": 0.408203125,
"learning_rate": 0.00017110436733078243,
"loss": 0.0828,
"step": 1100
},
{
"epoch": 0.2913453299057412,
"grad_norm": 0.04052734375,
"learning_rate": 0.0001709724238026125,
"loss": 0.0725,
"step": 1105
},
{
"epoch": 0.2926636345659482,
"grad_norm": 0.2578125,
"learning_rate": 0.00017084048027444254,
"loss": 0.0204,
"step": 1110
},
{
"epoch": 0.2939819392261552,
"grad_norm": 0.67578125,
"learning_rate": 0.0001707085367462726,
"loss": 0.0503,
"step": 1115
},
{
"epoch": 0.2953002438863621,
"grad_norm": 0.0059814453125,
"learning_rate": 0.00017057659321810265,
"loss": 0.0144,
"step": 1120
},
{
"epoch": 0.2966185485465691,
"grad_norm": 0.0269775390625,
"learning_rate": 0.00017044464968993272,
"loss": 0.0044,
"step": 1125
},
{
"epoch": 0.2979368532067761,
"grad_norm": 0.1396484375,
"learning_rate": 0.0001703127061617628,
"loss": 0.013,
"step": 1130
},
{
"epoch": 0.29925515786698303,
"grad_norm": 0.287109375,
"learning_rate": 0.00017018076263359283,
"loss": 0.0245,
"step": 1135
},
{
"epoch": 0.30057346252719,
"grad_norm": 0.26171875,
"learning_rate": 0.0001700488191054229,
"loss": 0.0247,
"step": 1140
},
{
"epoch": 0.301891767187397,
"grad_norm": 0.40625,
"learning_rate": 0.00016991687557725294,
"loss": 0.0402,
"step": 1145
},
{
"epoch": 0.30321007184760396,
"grad_norm": 1.2578125,
"learning_rate": 0.000169784932049083,
"loss": 0.0071,
"step": 1150
},
{
"epoch": 0.30452837650781095,
"grad_norm": 0.330078125,
"learning_rate": 0.00016965298852091306,
"loss": 0.0177,
"step": 1155
},
{
"epoch": 0.30584668116801794,
"grad_norm": 0.07275390625,
"learning_rate": 0.0001695210449927431,
"loss": 0.0029,
"step": 1160
},
{
"epoch": 0.3071649858282249,
"grad_norm": 0.455078125,
"learning_rate": 0.00016938910146457317,
"loss": 0.0262,
"step": 1165
},
{
"epoch": 0.30848329048843187,
"grad_norm": 0.002655029296875,
"learning_rate": 0.0001692571579364032,
"loss": 0.0346,
"step": 1170
},
{
"epoch": 0.30980159514863886,
"grad_norm": 0.1748046875,
"learning_rate": 0.00016912521440823328,
"loss": 0.0494,
"step": 1175
},
{
"epoch": 0.3111198998088458,
"grad_norm": 1.4609375,
"learning_rate": 0.00016899327088006335,
"loss": 0.0603,
"step": 1180
},
{
"epoch": 0.3124382044690528,
"grad_norm": 0.1572265625,
"learning_rate": 0.0001688613273518934,
"loss": 0.0366,
"step": 1185
},
{
"epoch": 0.3137565091292598,
"grad_norm": 0.01422119140625,
"learning_rate": 0.00016872938382372346,
"loss": 0.0678,
"step": 1190
},
{
"epoch": 0.3150748137894667,
"grad_norm": 0.2412109375,
"learning_rate": 0.0001685974402955535,
"loss": 0.0359,
"step": 1195
},
{
"epoch": 0.3163931184496737,
"grad_norm": 0.275390625,
"learning_rate": 0.00016846549676738357,
"loss": 0.1099,
"step": 1200
},
{
"epoch": 0.3177114231098807,
"grad_norm": 0.212890625,
"learning_rate": 0.00016833355323921364,
"loss": 0.0343,
"step": 1205
},
{
"epoch": 0.31902972777008765,
"grad_norm": 0.0302734375,
"learning_rate": 0.00016820160971104368,
"loss": 0.0138,
"step": 1210
},
{
"epoch": 0.32034803243029464,
"grad_norm": 0.016845703125,
"learning_rate": 0.00016806966618287375,
"loss": 0.0202,
"step": 1215
},
{
"epoch": 0.32166633709050163,
"grad_norm": 0.1474609375,
"learning_rate": 0.0001679377226547038,
"loss": 0.0442,
"step": 1220
},
{
"epoch": 0.32298464175070857,
"grad_norm": 0.049072265625,
"learning_rate": 0.00016780577912653386,
"loss": 0.0375,
"step": 1225
},
{
"epoch": 0.32430294641091556,
"grad_norm": 0.1337890625,
"learning_rate": 0.0001676738355983639,
"loss": 0.01,
"step": 1230
},
{
"epoch": 0.32562125107112255,
"grad_norm": 0.02197265625,
"learning_rate": 0.00016754189207019397,
"loss": 0.0139,
"step": 1235
},
{
"epoch": 0.3269395557313295,
"grad_norm": 0.09228515625,
"learning_rate": 0.00016740994854202404,
"loss": 0.014,
"step": 1240
},
{
"epoch": 0.3282578603915365,
"grad_norm": 0.47265625,
"learning_rate": 0.00016727800501385408,
"loss": 0.1546,
"step": 1245
},
{
"epoch": 0.3295761650517435,
"grad_norm": 0.02294921875,
"learning_rate": 0.00016714606148568413,
"loss": 0.0803,
"step": 1250
},
{
"epoch": 0.3308944697119504,
"grad_norm": 0.185546875,
"learning_rate": 0.00016701411795751417,
"loss": 0.0376,
"step": 1255
},
{
"epoch": 0.3322127743721574,
"grad_norm": 0.1123046875,
"learning_rate": 0.00016688217442934424,
"loss": 0.0375,
"step": 1260
},
{
"epoch": 0.3335310790323644,
"grad_norm": 1.03125,
"learning_rate": 0.0001667502309011743,
"loss": 0.0442,
"step": 1265
},
{
"epoch": 0.33484938369257133,
"grad_norm": 0.0172119140625,
"learning_rate": 0.00016661828737300435,
"loss": 0.0261,
"step": 1270
},
{
"epoch": 0.3361676883527783,
"grad_norm": 0.42578125,
"learning_rate": 0.00016648634384483442,
"loss": 0.0553,
"step": 1275
},
{
"epoch": 0.3374859930129853,
"grad_norm": 0.1328125,
"learning_rate": 0.00016635440031666446,
"loss": 0.0065,
"step": 1280
},
{
"epoch": 0.33880429767319226,
"grad_norm": 0.263671875,
"learning_rate": 0.00016622245678849453,
"loss": 0.0527,
"step": 1285
},
{
"epoch": 0.34012260233339925,
"grad_norm": 0.314453125,
"learning_rate": 0.0001660905132603246,
"loss": 0.0297,
"step": 1290
},
{
"epoch": 0.34144090699360624,
"grad_norm": 0.04345703125,
"learning_rate": 0.00016595856973215464,
"loss": 0.0477,
"step": 1295
},
{
"epoch": 0.3427592116538132,
"grad_norm": 0.08154296875,
"learning_rate": 0.0001658266262039847,
"loss": 0.0298,
"step": 1300
},
{
"epoch": 0.34407751631402017,
"grad_norm": 0.08935546875,
"learning_rate": 0.00016569468267581475,
"loss": 0.0481,
"step": 1305
},
{
"epoch": 0.34539582097422716,
"grad_norm": 0.06640625,
"learning_rate": 0.00016556273914764482,
"loss": 0.0153,
"step": 1310
},
{
"epoch": 0.3467141256344341,
"grad_norm": 0.00592041015625,
"learning_rate": 0.00016543079561947486,
"loss": 0.0111,
"step": 1315
},
{
"epoch": 0.3480324302946411,
"grad_norm": 0.2236328125,
"learning_rate": 0.00016529885209130493,
"loss": 0.0309,
"step": 1320
},
{
"epoch": 0.3493507349548481,
"grad_norm": 0.0198974609375,
"learning_rate": 0.000165166908563135,
"loss": 0.0579,
"step": 1325
},
{
"epoch": 0.350669039615055,
"grad_norm": 0.10107421875,
"learning_rate": 0.00016503496503496504,
"loss": 0.0055,
"step": 1330
},
{
"epoch": 0.351987344275262,
"grad_norm": 0.71875,
"learning_rate": 0.00016490302150679511,
"loss": 0.0299,
"step": 1335
},
{
"epoch": 0.353305648935469,
"grad_norm": 0.01348876953125,
"learning_rate": 0.00016477107797862516,
"loss": 0.0943,
"step": 1340
},
{
"epoch": 0.35462395359567594,
"grad_norm": 0.3046875,
"learning_rate": 0.00016463913445045523,
"loss": 0.0216,
"step": 1345
},
{
"epoch": 0.35594225825588294,
"grad_norm": 0.02392578125,
"learning_rate": 0.00016450719092228527,
"loss": 0.0265,
"step": 1350
},
{
"epoch": 0.35726056291608993,
"grad_norm": 0.453125,
"learning_rate": 0.0001643752473941153,
"loss": 0.0539,
"step": 1355
},
{
"epoch": 0.35857886757629687,
"grad_norm": 0.00823974609375,
"learning_rate": 0.00016424330386594538,
"loss": 0.0139,
"step": 1360
},
{
"epoch": 0.35989717223650386,
"grad_norm": 0.55859375,
"learning_rate": 0.00016411136033777542,
"loss": 0.0428,
"step": 1365
},
{
"epoch": 0.36121547689671085,
"grad_norm": 0.052734375,
"learning_rate": 0.0001639794168096055,
"loss": 0.0346,
"step": 1370
},
{
"epoch": 0.3625337815569178,
"grad_norm": 0.12158203125,
"learning_rate": 0.00016384747328143556,
"loss": 0.0095,
"step": 1375
},
{
"epoch": 0.3638520862171248,
"grad_norm": 0.0240478515625,
"learning_rate": 0.0001637155297532656,
"loss": 0.0224,
"step": 1380
},
{
"epoch": 0.3651703908773318,
"grad_norm": 0.01318359375,
"learning_rate": 0.00016358358622509567,
"loss": 0.0316,
"step": 1385
},
{
"epoch": 0.3664886955375387,
"grad_norm": 0.011962890625,
"learning_rate": 0.0001634516426969257,
"loss": 0.0051,
"step": 1390
},
{
"epoch": 0.3678070001977457,
"grad_norm": 0.00396728515625,
"learning_rate": 0.00016331969916875578,
"loss": 0.038,
"step": 1395
},
{
"epoch": 0.3691253048579527,
"grad_norm": 0.375,
"learning_rate": 0.00016318775564058585,
"loss": 0.029,
"step": 1400
},
{
"epoch": 0.37044360951815963,
"grad_norm": 0.265625,
"learning_rate": 0.0001630558121124159,
"loss": 0.0072,
"step": 1405
},
{
"epoch": 0.3717619141783666,
"grad_norm": 0.00127410888671875,
"learning_rate": 0.00016292386858424596,
"loss": 0.0381,
"step": 1410
},
{
"epoch": 0.3730802188385736,
"grad_norm": 1.15625,
"learning_rate": 0.000162791925056076,
"loss": 0.0573,
"step": 1415
},
{
"epoch": 0.37439852349878056,
"grad_norm": 0.0244140625,
"learning_rate": 0.00016265998152790607,
"loss": 0.051,
"step": 1420
},
{
"epoch": 0.37571682815898755,
"grad_norm": 0.0015106201171875,
"learning_rate": 0.00016252803799973612,
"loss": 0.0239,
"step": 1425
},
{
"epoch": 0.37703513281919454,
"grad_norm": 0.26953125,
"learning_rate": 0.00016239609447156618,
"loss": 0.0165,
"step": 1430
},
{
"epoch": 0.3783534374794015,
"grad_norm": 0.006134033203125,
"learning_rate": 0.00016226415094339625,
"loss": 0.0071,
"step": 1435
},
{
"epoch": 0.37967174213960847,
"grad_norm": 2.828125,
"learning_rate": 0.0001621322074152263,
"loss": 0.0272,
"step": 1440
},
{
"epoch": 0.38099004679981546,
"grad_norm": 0.349609375,
"learning_rate": 0.00016200026388705637,
"loss": 0.0647,
"step": 1445
},
{
"epoch": 0.3823083514600224,
"grad_norm": 0.09326171875,
"learning_rate": 0.00016186832035888638,
"loss": 0.0262,
"step": 1450
},
{
"epoch": 0.3836266561202294,
"grad_norm": 0.041015625,
"learning_rate": 0.00016173637683071645,
"loss": 0.0576,
"step": 1455
},
{
"epoch": 0.3849449607804364,
"grad_norm": 0.033935546875,
"learning_rate": 0.00016160443330254652,
"loss": 0.0142,
"step": 1460
},
{
"epoch": 0.3862632654406433,
"grad_norm": 0.09130859375,
"learning_rate": 0.00016147248977437656,
"loss": 0.0348,
"step": 1465
},
{
"epoch": 0.3875815701008503,
"grad_norm": 2.390625,
"learning_rate": 0.00016134054624620663,
"loss": 0.0672,
"step": 1470
},
{
"epoch": 0.3888998747610573,
"grad_norm": 0.439453125,
"learning_rate": 0.00016120860271803667,
"loss": 0.0121,
"step": 1475
},
{
"epoch": 0.39021817942126424,
"grad_norm": 0.1298828125,
"learning_rate": 0.00016107665918986674,
"loss": 0.0114,
"step": 1480
},
{
"epoch": 0.39153648408147124,
"grad_norm": 0.85546875,
"learning_rate": 0.0001609447156616968,
"loss": 0.0968,
"step": 1485
},
{
"epoch": 0.39285478874167823,
"grad_norm": 0.703125,
"learning_rate": 0.00016081277213352685,
"loss": 0.0349,
"step": 1490
},
{
"epoch": 0.39417309340188517,
"grad_norm": 0.021728515625,
"learning_rate": 0.00016068082860535692,
"loss": 0.0106,
"step": 1495
},
{
"epoch": 0.39549139806209216,
"grad_norm": 0.7265625,
"learning_rate": 0.00016054888507718696,
"loss": 0.0225,
"step": 1500
},
{
"epoch": 0.39549139806209216,
"eval_loss": 0.03515048325061798,
"eval_model_preparation_time": 0.0076,
"eval_runtime": 457.3497,
"eval_samples_per_second": 7.373,
"eval_steps_per_second": 3.686,
"step": 1500
},
{
"epoch": 0.3968097027222991,
"grad_norm": 0.016519820317626,
"learning_rate": 0.00016041694154901703,
"loss": 0.0202,
"step": 1505
},
{
"epoch": 0.3981280073825061,
"grad_norm": 0.8505942225456238,
"learning_rate": 0.00016028499802084708,
"loss": 0.0541,
"step": 1510
},
{
"epoch": 0.3994463120427131,
"grad_norm": 0.04163295030593872,
"learning_rate": 0.00016015305449267714,
"loss": 0.0037,
"step": 1515
},
{
"epoch": 0.40076461670292,
"grad_norm": 0.011332935653626919,
"learning_rate": 0.00016002111096450721,
"loss": 0.0459,
"step": 1520
},
{
"epoch": 0.402082921363127,
"grad_norm": 0.9360129833221436,
"learning_rate": 0.00015988916743633726,
"loss": 0.013,
"step": 1525
},
{
"epoch": 0.403401226023334,
"grad_norm": 0.11991436779499054,
"learning_rate": 0.00015975722390816733,
"loss": 0.0079,
"step": 1530
},
{
"epoch": 0.40471953068354094,
"grad_norm": 0.36911076307296753,
"learning_rate": 0.00015962528037999737,
"loss": 0.0638,
"step": 1535
},
{
"epoch": 0.40603783534374793,
"grad_norm": 0.020278634503483772,
"learning_rate": 0.00015949333685182744,
"loss": 0.0217,
"step": 1540
},
{
"epoch": 0.4073561400039549,
"grad_norm": 0.14263059198856354,
"learning_rate": 0.0001593613933236575,
"loss": 0.0495,
"step": 1545
},
{
"epoch": 0.40867444466416186,
"grad_norm": 0.09494803845882416,
"learning_rate": 0.00015922944979548752,
"loss": 0.0248,
"step": 1550
},
{
"epoch": 0.40999274932436885,
"grad_norm": 0.23064319789409637,
"learning_rate": 0.0001590975062673176,
"loss": 0.0285,
"step": 1555
},
{
"epoch": 0.41131105398457585,
"grad_norm": 0.32220256328582764,
"learning_rate": 0.00015896556273914763,
"loss": 0.0537,
"step": 1560
},
{
"epoch": 0.4126293586447828,
"grad_norm": 0.41208815574645996,
"learning_rate": 0.0001588336192109777,
"loss": 0.0453,
"step": 1565
},
{
"epoch": 0.4139476633049898,
"grad_norm": 0.03775424137711525,
"learning_rate": 0.00015870167568280777,
"loss": 0.0134,
"step": 1570
},
{
"epoch": 0.41526596796519677,
"grad_norm": 0.6526333093643188,
"learning_rate": 0.0001585697321546378,
"loss": 0.0329,
"step": 1575
},
{
"epoch": 0.4165842726254037,
"grad_norm": 1.001305103302002,
"learning_rate": 0.00015843778862646788,
"loss": 0.0912,
"step": 1580
},
{
"epoch": 0.4179025772856107,
"grad_norm": 0.4055219888687134,
"learning_rate": 0.00015830584509829792,
"loss": 0.0519,
"step": 1585
},
{
"epoch": 0.4192208819458177,
"grad_norm": 0.035015616565942764,
"learning_rate": 0.000158173901570128,
"loss": 0.0191,
"step": 1590
},
{
"epoch": 0.42053918660602463,
"grad_norm": 0.09326844662427902,
"learning_rate": 0.00015804195804195806,
"loss": 0.0106,
"step": 1595
},
{
"epoch": 0.4218574912662316,
"grad_norm": 0.06223440542817116,
"learning_rate": 0.0001579100145137881,
"loss": 0.0113,
"step": 1600
},
{
"epoch": 0.4231757959264386,
"grad_norm": 0.0625135526061058,
"learning_rate": 0.00015777807098561817,
"loss": 0.0191,
"step": 1605
},
{
"epoch": 0.42449410058664555,
"grad_norm": 0.2645983099937439,
"learning_rate": 0.00015764612745744822,
"loss": 0.0829,
"step": 1610
},
{
"epoch": 0.42581240524685254,
"grad_norm": 0.009632415138185024,
"learning_rate": 0.00015751418392927829,
"loss": 0.0542,
"step": 1615
},
{
"epoch": 0.42713070990705954,
"grad_norm": 0.01979319378733635,
"learning_rate": 0.00015738224040110833,
"loss": 0.0517,
"step": 1620
},
{
"epoch": 0.4284490145672665,
"grad_norm": 0.3065454065799713,
"learning_rate": 0.0001572502968729384,
"loss": 0.0738,
"step": 1625
},
{
"epoch": 0.42976731922747347,
"grad_norm": 0.09581473469734192,
"learning_rate": 0.00015711835334476847,
"loss": 0.0571,
"step": 1630
},
{
"epoch": 0.43108562388768046,
"grad_norm": 0.23746591806411743,
"learning_rate": 0.0001569864098165985,
"loss": 0.0128,
"step": 1635
},
{
"epoch": 0.4324039285478874,
"grad_norm": 0.936278760433197,
"learning_rate": 0.00015685446628842858,
"loss": 0.0665,
"step": 1640
},
{
"epoch": 0.4337222332080944,
"grad_norm": 0.18487441539764404,
"learning_rate": 0.00015672252276025862,
"loss": 0.0527,
"step": 1645
},
{
"epoch": 0.4350405378683014,
"grad_norm": 0.6980624794960022,
"learning_rate": 0.00015659057923208866,
"loss": 0.0613,
"step": 1650
},
{
"epoch": 0.4363588425285083,
"grad_norm": 0.4696301221847534,
"learning_rate": 0.00015645863570391873,
"loss": 0.0569,
"step": 1655
},
{
"epoch": 0.4376771471887153,
"grad_norm": 0.15083105862140656,
"learning_rate": 0.00015632669217574877,
"loss": 0.0394,
"step": 1660
},
{
"epoch": 0.4389954518489223,
"grad_norm": 0.44701239466667175,
"learning_rate": 0.00015619474864757884,
"loss": 0.0494,
"step": 1665
},
{
"epoch": 0.44031375650912924,
"grad_norm": 0.07418403029441833,
"learning_rate": 0.00015606280511940888,
"loss": 0.0291,
"step": 1670
},
{
"epoch": 0.44163206116933623,
"grad_norm": 0.02311861515045166,
"learning_rate": 0.00015593086159123895,
"loss": 0.0304,
"step": 1675
},
{
"epoch": 0.4429503658295432,
"grad_norm": 0.4416038990020752,
"learning_rate": 0.00015579891806306902,
"loss": 0.0176,
"step": 1680
},
{
"epoch": 0.44426867048975016,
"grad_norm": 0.5124915242195129,
"learning_rate": 0.00015566697453489906,
"loss": 0.0454,
"step": 1685
},
{
"epoch": 0.44558697514995715,
"grad_norm": 0.3159286081790924,
"learning_rate": 0.00015553503100672913,
"loss": 0.047,
"step": 1690
},
{
"epoch": 0.44690527981016415,
"grad_norm": 0.032126396894454956,
"learning_rate": 0.00015540308747855918,
"loss": 0.0151,
"step": 1695
},
{
"epoch": 0.4482235844703711,
"grad_norm": 0.04663548618555069,
"learning_rate": 0.00015527114395038924,
"loss": 0.0375,
"step": 1700
},
{
"epoch": 0.4495418891305781,
"grad_norm": 0.013753900304436684,
"learning_rate": 0.0001551392004222193,
"loss": 0.0485,
"step": 1705
},
{
"epoch": 0.45086019379078507,
"grad_norm": 1.9952393770217896,
"learning_rate": 0.00015500725689404936,
"loss": 0.0625,
"step": 1710
},
{
"epoch": 0.452178498450992,
"grad_norm": 0.014283270575106144,
"learning_rate": 0.00015487531336587943,
"loss": 0.0037,
"step": 1715
},
{
"epoch": 0.453496803111199,
"grad_norm": 0.3897913098335266,
"learning_rate": 0.00015474336983770947,
"loss": 0.0304,
"step": 1720
},
{
"epoch": 0.454815107771406,
"grad_norm": 0.3730885684490204,
"learning_rate": 0.00015461142630953954,
"loss": 0.0115,
"step": 1725
},
{
"epoch": 0.45613341243161293,
"grad_norm": 0.035858724266290665,
"learning_rate": 0.00015447948278136958,
"loss": 0.0021,
"step": 1730
},
{
"epoch": 0.4574517170918199,
"grad_norm": 0.20589517056941986,
"learning_rate": 0.00015434753925319965,
"loss": 0.0132,
"step": 1735
},
{
"epoch": 0.4587700217520269,
"grad_norm": 0.004939342383295298,
"learning_rate": 0.00015421559572502972,
"loss": 0.0471,
"step": 1740
},
{
"epoch": 0.46008832641223385,
"grad_norm": 0.03493283689022064,
"learning_rate": 0.00015408365219685976,
"loss": 0.0062,
"step": 1745
},
{
"epoch": 0.46140663107244084,
"grad_norm": 0.045927103608846664,
"learning_rate": 0.0001539517086686898,
"loss": 0.0283,
"step": 1750
},
{
"epoch": 0.46272493573264784,
"grad_norm": 0.012629454955458641,
"learning_rate": 0.00015381976514051984,
"loss": 0.0133,
"step": 1755
},
{
"epoch": 0.46404324039285477,
"grad_norm": 0.8001697659492493,
"learning_rate": 0.0001536878216123499,
"loss": 0.0224,
"step": 1760
},
{
"epoch": 0.46536154505306176,
"grad_norm": 0.002036362886428833,
"learning_rate": 0.00015355587808417998,
"loss": 0.0066,
"step": 1765
},
{
"epoch": 0.46667984971326876,
"grad_norm": 1.0261330604553223,
"learning_rate": 0.00015342393455601002,
"loss": 0.191,
"step": 1770
},
{
"epoch": 0.4679981543734757,
"grad_norm": 0.3033429682254791,
"learning_rate": 0.0001532919910278401,
"loss": 0.0222,
"step": 1775
},
{
"epoch": 0.4693164590336827,
"grad_norm": 0.36911338567733765,
"learning_rate": 0.00015316004749967014,
"loss": 0.0363,
"step": 1780
},
{
"epoch": 0.4706347636938897,
"grad_norm": 0.0406811460852623,
"learning_rate": 0.0001530281039715002,
"loss": 0.0283,
"step": 1785
},
{
"epoch": 0.4719530683540966,
"grad_norm": 0.23334211111068726,
"learning_rate": 0.00015289616044333027,
"loss": 0.0274,
"step": 1790
},
{
"epoch": 0.4732713730143036,
"grad_norm": 0.013081169687211514,
"learning_rate": 0.00015276421691516032,
"loss": 0.0221,
"step": 1795
},
{
"epoch": 0.4745896776745106,
"grad_norm": 0.2480790615081787,
"learning_rate": 0.00015263227338699039,
"loss": 0.019,
"step": 1800
},
{
"epoch": 0.47590798233471754,
"grad_norm": 0.0373196005821228,
"learning_rate": 0.00015250032985882043,
"loss": 0.0292,
"step": 1805
},
{
"epoch": 0.47722628699492453,
"grad_norm": 0.004609994124621153,
"learning_rate": 0.0001523683863306505,
"loss": 0.0918,
"step": 1810
},
{
"epoch": 0.4785445916551315,
"grad_norm": 0.02370987832546234,
"learning_rate": 0.00015223644280248054,
"loss": 0.0462,
"step": 1815
},
{
"epoch": 0.47986289631533846,
"grad_norm": 0.05842221528291702,
"learning_rate": 0.0001521044992743106,
"loss": 0.0595,
"step": 1820
},
{
"epoch": 0.48118120097554545,
"grad_norm": 0.009685276076197624,
"learning_rate": 0.00015197255574614068,
"loss": 0.0074,
"step": 1825
},
{
"epoch": 0.48249950563575245,
"grad_norm": 0.8933250308036804,
"learning_rate": 0.00015184061221797072,
"loss": 0.0757,
"step": 1830
},
{
"epoch": 0.4838178102959594,
"grad_norm": 0.07075401395559311,
"learning_rate": 0.0001517086686898008,
"loss": 0.0226,
"step": 1835
},
{
"epoch": 0.4851361149561664,
"grad_norm": 0.732706606388092,
"learning_rate": 0.00015157672516163083,
"loss": 0.0161,
"step": 1840
},
{
"epoch": 0.48645441961637337,
"grad_norm": 1.1897023916244507,
"learning_rate": 0.0001514447816334609,
"loss": 0.0265,
"step": 1845
},
{
"epoch": 0.4877727242765803,
"grad_norm": 0.052572328597307205,
"learning_rate": 0.00015131283810529094,
"loss": 0.0094,
"step": 1850
},
{
"epoch": 0.4890910289367873,
"grad_norm": 0.08263898640871048,
"learning_rate": 0.00015118089457712098,
"loss": 0.0631,
"step": 1855
},
{
"epoch": 0.4904093335969943,
"grad_norm": 0.03225664421916008,
"learning_rate": 0.00015104895104895105,
"loss": 0.023,
"step": 1860
},
{
"epoch": 0.4917276382572012,
"grad_norm": 0.007935039699077606,
"learning_rate": 0.0001509170075207811,
"loss": 0.0039,
"step": 1865
},
{
"epoch": 0.4930459429174082,
"grad_norm": 0.00830796267837286,
"learning_rate": 0.00015078506399261116,
"loss": 0.007,
"step": 1870
},
{
"epoch": 0.4943642475776152,
"grad_norm": 0.08042234182357788,
"learning_rate": 0.00015065312046444123,
"loss": 0.0366,
"step": 1875
},
{
"epoch": 0.49568255223782215,
"grad_norm": 0.009092851541936398,
"learning_rate": 0.00015052117693627128,
"loss": 0.0107,
"step": 1880
},
{
"epoch": 0.49700085689802914,
"grad_norm": 0.2674141824245453,
"learning_rate": 0.00015038923340810135,
"loss": 0.0076,
"step": 1885
},
{
"epoch": 0.49831916155823613,
"grad_norm": 0.07694366574287415,
"learning_rate": 0.0001502572898799314,
"loss": 0.0252,
"step": 1890
},
{
"epoch": 0.49963746621844307,
"grad_norm": 0.5699467062950134,
"learning_rate": 0.00015012534635176146,
"loss": 0.0487,
"step": 1895
},
{
"epoch": 0.5009557708786501,
"grad_norm": 0.18800878524780273,
"learning_rate": 0.0001499934028235915,
"loss": 0.0183,
"step": 1900
},
{
"epoch": 0.5022740755388571,
"grad_norm": 0.019469989463686943,
"learning_rate": 0.00014986145929542157,
"loss": 0.0268,
"step": 1905
},
{
"epoch": 0.503592380199064,
"grad_norm": 0.01890506222844124,
"learning_rate": 0.00014972951576725164,
"loss": 0.0449,
"step": 1910
},
{
"epoch": 0.5049106848592709,
"grad_norm": 0.0006314461352303624,
"learning_rate": 0.00014959757223908168,
"loss": 0.0056,
"step": 1915
},
{
"epoch": 0.5062289895194779,
"grad_norm": 0.32654041051864624,
"learning_rate": 0.00014946562871091175,
"loss": 0.0256,
"step": 1920
},
{
"epoch": 0.5075472941796849,
"grad_norm": 0.7803483605384827,
"learning_rate": 0.0001493336851827418,
"loss": 0.0374,
"step": 1925
},
{
"epoch": 0.5088655988398919,
"grad_norm": 0.028441445901989937,
"learning_rate": 0.00014920174165457186,
"loss": 0.0161,
"step": 1930
},
{
"epoch": 0.5101839035000989,
"grad_norm": 0.028379200026392937,
"learning_rate": 0.00014906979812640193,
"loss": 0.0151,
"step": 1935
},
{
"epoch": 0.5115022081603059,
"grad_norm": 0.021159596741199493,
"learning_rate": 0.00014893785459823197,
"loss": 0.0303,
"step": 1940
},
{
"epoch": 0.5128205128205128,
"grad_norm": 0.24903325736522675,
"learning_rate": 0.000148805911070062,
"loss": 0.0076,
"step": 1945
},
{
"epoch": 0.5141388174807198,
"grad_norm": 0.007065301761031151,
"learning_rate": 0.00014867396754189206,
"loss": 0.022,
"step": 1950
},
{
"epoch": 0.5154571221409268,
"grad_norm": 0.004032329190522432,
"learning_rate": 0.00014854202401372212,
"loss": 0.0083,
"step": 1955
},
{
"epoch": 0.5167754268011338,
"grad_norm": 0.3045775592327118,
"learning_rate": 0.0001484100804855522,
"loss": 0.0113,
"step": 1960
},
{
"epoch": 0.5180937314613407,
"grad_norm": 0.36974939703941345,
"learning_rate": 0.00014827813695738224,
"loss": 0.0267,
"step": 1965
},
{
"epoch": 0.5194120361215477,
"grad_norm": 0.009729950688779354,
"learning_rate": 0.0001481461934292123,
"loss": 0.027,
"step": 1970
},
{
"epoch": 0.5207303407817546,
"grad_norm": 0.0013097926275804639,
"learning_rate": 0.00014801424990104235,
"loss": 0.003,
"step": 1975
},
{
"epoch": 0.5220486454419616,
"grad_norm": 0.0706263929605484,
"learning_rate": 0.00014788230637287242,
"loss": 0.0193,
"step": 1980
},
{
"epoch": 0.5233669501021686,
"grad_norm": 1.435702919960022,
"learning_rate": 0.00014775036284470249,
"loss": 0.0647,
"step": 1985
},
{
"epoch": 0.5246852547623756,
"grad_norm": 0.00661757867783308,
"learning_rate": 0.00014761841931653253,
"loss": 0.0373,
"step": 1990
},
{
"epoch": 0.5260035594225826,
"grad_norm": 0.12014541029930115,
"learning_rate": 0.0001474864757883626,
"loss": 0.0178,
"step": 1995
},
{
"epoch": 0.5273218640827896,
"grad_norm": 1.0549248456954956,
"learning_rate": 0.00014735453226019264,
"loss": 0.0191,
"step": 2000
},
{
"epoch": 0.5273218640827896,
"eval_loss": 0.037292081862688065,
"eval_runtime": 454.3033,
"eval_samples_per_second": 7.422,
"eval_steps_per_second": 3.711,
"step": 2000
},
{
"epoch": 0.5286401687429965,
"grad_norm": 0.47634151577949524,
"learning_rate": 0.0001472225887320227,
"loss": 0.0404,
"step": 2005
},
{
"epoch": 0.5299584734032035,
"grad_norm": 0.006752463988959789,
"learning_rate": 0.00014709064520385275,
"loss": 0.034,
"step": 2010
},
{
"epoch": 0.5312767780634104,
"grad_norm": 0.20780125260353088,
"learning_rate": 0.00014695870167568282,
"loss": 0.0421,
"step": 2015
},
{
"epoch": 0.5325950827236174,
"grad_norm": 0.010941066779196262,
"learning_rate": 0.0001468267581475129,
"loss": 0.0086,
"step": 2020
},
{
"epoch": 0.5339133873838244,
"grad_norm": 0.3439581096172333,
"learning_rate": 0.00014669481461934293,
"loss": 0.0187,
"step": 2025
},
{
"epoch": 0.5352316920440314,
"grad_norm": 0.14961636066436768,
"learning_rate": 0.000146562871091173,
"loss": 0.0504,
"step": 2030
},
{
"epoch": 0.5365499967042383,
"grad_norm": 0.0044641937129199505,
"learning_rate": 0.00014643092756300304,
"loss": 0.0134,
"step": 2035
},
{
"epoch": 0.5378683013644453,
"grad_norm": 0.14088386297225952,
"learning_rate": 0.0001462989840348331,
"loss": 0.0096,
"step": 2040
},
{
"epoch": 0.5391866060246523,
"grad_norm": 0.48116979002952576,
"learning_rate": 0.00014616704050666315,
"loss": 0.0124,
"step": 2045
},
{
"epoch": 0.5405049106848593,
"grad_norm": 0.3688766360282898,
"learning_rate": 0.0001460350969784932,
"loss": 0.0226,
"step": 2050
},
{
"epoch": 0.5418232153450663,
"grad_norm": 0.002938181860372424,
"learning_rate": 0.00014590315345032326,
"loss": 0.0267,
"step": 2055
},
{
"epoch": 0.5431415200052733,
"grad_norm": 0.3335214853286743,
"learning_rate": 0.0001457712099221533,
"loss": 0.0367,
"step": 2060
},
{
"epoch": 0.5444598246654802,
"grad_norm": 0.004644686821848154,
"learning_rate": 0.00014563926639398338,
"loss": 0.0121,
"step": 2065
},
{
"epoch": 0.5457781293256871,
"grad_norm": 0.19505545496940613,
"learning_rate": 0.00014550732286581345,
"loss": 0.0591,
"step": 2070
},
{
"epoch": 0.5470964339858941,
"grad_norm": 0.018028756603598595,
"learning_rate": 0.0001453753793376435,
"loss": 0.0131,
"step": 2075
},
{
"epoch": 0.5484147386461011,
"grad_norm": 0.045639291405677795,
"learning_rate": 0.00014524343580947356,
"loss": 0.0443,
"step": 2080
},
{
"epoch": 0.5497330433063081,
"grad_norm": 0.727981686592102,
"learning_rate": 0.0001451114922813036,
"loss": 0.0205,
"step": 2085
},
{
"epoch": 0.5510513479665151,
"grad_norm": 0.03766491636633873,
"learning_rate": 0.00014497954875313367,
"loss": 0.0067,
"step": 2090
},
{
"epoch": 0.552369652626722,
"grad_norm": 0.1911504715681076,
"learning_rate": 0.0001448476052249637,
"loss": 0.0397,
"step": 2095
},
{
"epoch": 0.553687957286929,
"grad_norm": 0.08238353580236435,
"learning_rate": 0.00014471566169679378,
"loss": 0.0513,
"step": 2100
},
{
"epoch": 0.555006261947136,
"grad_norm": 0.06317206472158432,
"learning_rate": 0.00014458371816862385,
"loss": 0.0178,
"step": 2105
},
{
"epoch": 0.556324566607343,
"grad_norm": 0.0652734637260437,
"learning_rate": 0.0001444517746404539,
"loss": 0.0184,
"step": 2110
},
{
"epoch": 0.55764287126755,
"grad_norm": 0.05471858009696007,
"learning_rate": 0.00014431983111228396,
"loss": 0.0089,
"step": 2115
},
{
"epoch": 0.558961175927757,
"grad_norm": 0.005062670446932316,
"learning_rate": 0.000144187887584114,
"loss": 0.0052,
"step": 2120
},
{
"epoch": 0.5602794805879638,
"grad_norm": 0.06337414681911469,
"learning_rate": 0.00014405594405594407,
"loss": 0.053,
"step": 2125
},
{
"epoch": 0.5615977852481708,
"grad_norm": 0.33745357394218445,
"learning_rate": 0.00014392400052777414,
"loss": 0.0166,
"step": 2130
},
{
"epoch": 0.5629160899083778,
"grad_norm": 0.7382741570472717,
"learning_rate": 0.00014379205699960418,
"loss": 0.0191,
"step": 2135
},
{
"epoch": 0.5642343945685848,
"grad_norm": 0.007551972754299641,
"learning_rate": 0.00014366011347143425,
"loss": 0.0022,
"step": 2140
},
{
"epoch": 0.5655526992287918,
"grad_norm": 0.6260896921157837,
"learning_rate": 0.00014352816994326427,
"loss": 0.0095,
"step": 2145
},
{
"epoch": 0.5668710038889987,
"grad_norm": 0.11619322001934052,
"learning_rate": 0.00014339622641509434,
"loss": 0.015,
"step": 2150
},
{
"epoch": 0.5681893085492057,
"grad_norm": 1.1440670490264893,
"learning_rate": 0.0001432642828869244,
"loss": 0.1343,
"step": 2155
},
{
"epoch": 0.5695076132094127,
"grad_norm": 1.1793878078460693,
"learning_rate": 0.00014313233935875445,
"loss": 0.0968,
"step": 2160
},
{
"epoch": 0.5708259178696197,
"grad_norm": 0.6865736842155457,
"learning_rate": 0.00014300039583058452,
"loss": 0.0195,
"step": 2165
},
{
"epoch": 0.5721442225298267,
"grad_norm": 0.140816792845726,
"learning_rate": 0.00014286845230241456,
"loss": 0.0761,
"step": 2170
},
{
"epoch": 0.5734625271900337,
"grad_norm": 0.04071786254644394,
"learning_rate": 0.00014273650877424463,
"loss": 0.0193,
"step": 2175
},
{
"epoch": 0.5747808318502405,
"grad_norm": 0.044617727398872375,
"learning_rate": 0.0001426045652460747,
"loss": 0.0112,
"step": 2180
},
{
"epoch": 0.5760991365104475,
"grad_norm": 0.11001799255609512,
"learning_rate": 0.00014247262171790474,
"loss": 0.0039,
"step": 2185
},
{
"epoch": 0.5774174411706545,
"grad_norm": 0.0036315324250608683,
"learning_rate": 0.0001423406781897348,
"loss": 0.0038,
"step": 2190
},
{
"epoch": 0.5787357458308615,
"grad_norm": 0.9866570830345154,
"learning_rate": 0.00014220873466156485,
"loss": 0.025,
"step": 2195
},
{
"epoch": 0.5800540504910685,
"grad_norm": 0.023570384830236435,
"learning_rate": 0.00014207679113339492,
"loss": 0.0468,
"step": 2200
},
{
"epoch": 0.5813723551512755,
"grad_norm": 0.20010559260845184,
"learning_rate": 0.00014194484760522496,
"loss": 0.0198,
"step": 2205
},
{
"epoch": 0.5826906598114824,
"grad_norm": 0.06153270602226257,
"learning_rate": 0.00014181290407705503,
"loss": 0.0764,
"step": 2210
},
{
"epoch": 0.5840089644716894,
"grad_norm": 0.033162448555231094,
"learning_rate": 0.0001416809605488851,
"loss": 0.028,
"step": 2215
},
{
"epoch": 0.5853272691318964,
"grad_norm": 0.428382933139801,
"learning_rate": 0.00014154901702071514,
"loss": 0.0652,
"step": 2220
},
{
"epoch": 0.5866455737921034,
"grad_norm": 0.25004762411117554,
"learning_rate": 0.0001414170734925452,
"loss": 0.0411,
"step": 2225
},
{
"epoch": 0.5879638784523104,
"grad_norm": 0.22649863362312317,
"learning_rate": 0.00014128512996437525,
"loss": 0.0517,
"step": 2230
},
{
"epoch": 0.5892821831125173,
"grad_norm": 0.035932112485170364,
"learning_rate": 0.00014115318643620532,
"loss": 0.015,
"step": 2235
},
{
"epoch": 0.5906004877727242,
"grad_norm": 0.3800172507762909,
"learning_rate": 0.00014102124290803536,
"loss": 0.0324,
"step": 2240
},
{
"epoch": 0.5919187924329312,
"grad_norm": 0.6974118947982788,
"learning_rate": 0.0001408892993798654,
"loss": 0.0216,
"step": 2245
},
{
"epoch": 0.5932370970931382,
"grad_norm": 0.15472032129764557,
"learning_rate": 0.00014075735585169548,
"loss": 0.0164,
"step": 2250
},
{
"epoch": 0.5945554017533452,
"grad_norm": 0.015000814571976662,
"learning_rate": 0.00014062541232352552,
"loss": 0.0395,
"step": 2255
},
{
"epoch": 0.5958737064135522,
"grad_norm": 0.052086081355810165,
"learning_rate": 0.0001404934687953556,
"loss": 0.0032,
"step": 2260
},
{
"epoch": 0.5971920110737592,
"grad_norm": 0.004600350745022297,
"learning_rate": 0.00014036152526718566,
"loss": 0.0056,
"step": 2265
},
{
"epoch": 0.5985103157339661,
"grad_norm": 0.4940958321094513,
"learning_rate": 0.0001402295817390157,
"loss": 0.0206,
"step": 2270
},
{
"epoch": 0.5998286203941731,
"grad_norm": 0.09658394008874893,
"learning_rate": 0.00014009763821084577,
"loss": 0.0052,
"step": 2275
},
{
"epoch": 0.60114692505438,
"grad_norm": 0.00020539117394946516,
"learning_rate": 0.0001399656946826758,
"loss": 0.087,
"step": 2280
},
{
"epoch": 0.602465229714587,
"grad_norm": 0.1871018409729004,
"learning_rate": 0.00013983375115450588,
"loss": 0.0812,
"step": 2285
},
{
"epoch": 0.603783534374794,
"grad_norm": 0.02583954855799675,
"learning_rate": 0.00013970180762633592,
"loss": 0.0232,
"step": 2290
},
{
"epoch": 0.605101839035001,
"grad_norm": 1.2103784084320068,
"learning_rate": 0.000139569864098166,
"loss": 0.0151,
"step": 2295
},
{
"epoch": 0.6064201436952079,
"grad_norm": 0.023514943197369576,
"learning_rate": 0.00013943792056999606,
"loss": 0.0193,
"step": 2300
},
{
"epoch": 0.6077384483554149,
"grad_norm": 0.0076395305804908276,
"learning_rate": 0.0001393059770418261,
"loss": 0.0379,
"step": 2305
},
{
"epoch": 0.6090567530156219,
"grad_norm": 0.12412039190530777,
"learning_rate": 0.00013917403351365617,
"loss": 0.0095,
"step": 2310
},
{
"epoch": 0.6103750576758289,
"grad_norm": 0.021904783323407173,
"learning_rate": 0.0001390420899854862,
"loss": 0.0166,
"step": 2315
},
{
"epoch": 0.6116933623360359,
"grad_norm": 0.004012851510196924,
"learning_rate": 0.00013891014645731628,
"loss": 0.0103,
"step": 2320
},
{
"epoch": 0.6130116669962429,
"grad_norm": 0.007267913781106472,
"learning_rate": 0.00013877820292914635,
"loss": 0.0708,
"step": 2325
},
{
"epoch": 0.6143299716564498,
"grad_norm": 0.10363642126321793,
"learning_rate": 0.0001386462594009764,
"loss": 0.0473,
"step": 2330
},
{
"epoch": 0.6156482763166568,
"grad_norm": 0.04899830371141434,
"learning_rate": 0.00013851431587280646,
"loss": 0.0283,
"step": 2335
},
{
"epoch": 0.6169665809768637,
"grad_norm": 0.39460498094558716,
"learning_rate": 0.0001383823723446365,
"loss": 0.0597,
"step": 2340
},
{
"epoch": 0.6182848856370707,
"grad_norm": 0.04092290997505188,
"learning_rate": 0.00013825042881646655,
"loss": 0.0167,
"step": 2345
},
{
"epoch": 0.6196031902972777,
"grad_norm": 0.2781132161617279,
"learning_rate": 0.00013811848528829662,
"loss": 0.0097,
"step": 2350
},
{
"epoch": 0.6209214949574847,
"grad_norm": 0.041443537920713425,
"learning_rate": 0.00013798654176012666,
"loss": 0.0226,
"step": 2355
},
{
"epoch": 0.6222397996176916,
"grad_norm": 0.1242462694644928,
"learning_rate": 0.00013785459823195673,
"loss": 0.0055,
"step": 2360
},
{
"epoch": 0.6235581042778986,
"grad_norm": 0.4440467357635498,
"learning_rate": 0.00013772265470378677,
"loss": 0.049,
"step": 2365
},
{
"epoch": 0.6248764089381056,
"grad_norm": 0.014354427345097065,
"learning_rate": 0.00013759071117561684,
"loss": 0.0327,
"step": 2370
},
{
"epoch": 0.6261947135983126,
"grad_norm": 0.011539973318576813,
"learning_rate": 0.0001374587676474469,
"loss": 0.0222,
"step": 2375
},
{
"epoch": 0.6275130182585196,
"grad_norm": 0.23539051413536072,
"learning_rate": 0.00013732682411927695,
"loss": 0.0816,
"step": 2380
},
{
"epoch": 0.6288313229187266,
"grad_norm": 0.26793941855430603,
"learning_rate": 0.00013719488059110702,
"loss": 0.0325,
"step": 2385
},
{
"epoch": 0.6301496275789334,
"grad_norm": 0.01662217453122139,
"learning_rate": 0.00013706293706293706,
"loss": 0.0221,
"step": 2390
},
{
"epoch": 0.6314679322391404,
"grad_norm": 0.30669671297073364,
"learning_rate": 0.00013693099353476713,
"loss": 0.026,
"step": 2395
},
{
"epoch": 0.6327862368993474,
"grad_norm": 0.03350894898176193,
"learning_rate": 0.00013679905000659717,
"loss": 0.0072,
"step": 2400
},
{
"epoch": 0.6341045415595544,
"grad_norm": 0.014983875676989555,
"learning_rate": 0.00013666710647842724,
"loss": 0.049,
"step": 2405
},
{
"epoch": 0.6354228462197614,
"grad_norm": 1.8989384174346924,
"learning_rate": 0.0001365351629502573,
"loss": 0.0335,
"step": 2410
},
{
"epoch": 0.6367411508799684,
"grad_norm": 0.030135562643408775,
"learning_rate": 0.00013640321942208735,
"loss": 0.0051,
"step": 2415
},
{
"epoch": 0.6380594555401753,
"grad_norm": 0.02079075388610363,
"learning_rate": 0.00013627127589391742,
"loss": 0.0138,
"step": 2420
},
{
"epoch": 0.6393777602003823,
"grad_norm": 0.06065403297543526,
"learning_rate": 0.00013613933236574746,
"loss": 0.0357,
"step": 2425
},
{
"epoch": 0.6406960648605893,
"grad_norm": 0.2980937659740448,
"learning_rate": 0.00013600738883757753,
"loss": 0.0138,
"step": 2430
},
{
"epoch": 0.6420143695207963,
"grad_norm": 0.4820438623428345,
"learning_rate": 0.00013587544530940758,
"loss": 0.01,
"step": 2435
},
{
"epoch": 0.6433326741810033,
"grad_norm": 0.005618259310722351,
"learning_rate": 0.00013574350178123765,
"loss": 0.0052,
"step": 2440
},
{
"epoch": 0.6446509788412103,
"grad_norm": 0.7173821926116943,
"learning_rate": 0.0001356115582530677,
"loss": 0.0133,
"step": 2445
},
{
"epoch": 0.6459692835014171,
"grad_norm": 0.0053142281249165535,
"learning_rate": 0.00013547961472489773,
"loss": 0.0045,
"step": 2450
},
{
"epoch": 0.6472875881616241,
"grad_norm": 0.06118829548358917,
"learning_rate": 0.0001353476711967278,
"loss": 0.056,
"step": 2455
},
{
"epoch": 0.6486058928218311,
"grad_norm": 3.5878078937530518,
"learning_rate": 0.00013521572766855787,
"loss": 0.0232,
"step": 2460
},
{
"epoch": 0.6499241974820381,
"grad_norm": 0.004911276511847973,
"learning_rate": 0.0001350837841403879,
"loss": 0.0074,
"step": 2465
},
{
"epoch": 0.6512425021422451,
"grad_norm": 0.0028026222717016935,
"learning_rate": 0.00013495184061221798,
"loss": 0.0782,
"step": 2470
},
{
"epoch": 0.6525608068024521,
"grad_norm": 0.7317615747451782,
"learning_rate": 0.00013481989708404802,
"loss": 0.0222,
"step": 2475
},
{
"epoch": 0.653879111462659,
"grad_norm": 0.01835751160979271,
"learning_rate": 0.0001346879535558781,
"loss": 0.0661,
"step": 2480
},
{
"epoch": 0.655197416122866,
"grad_norm": 0.03598962351679802,
"learning_rate": 0.00013455601002770813,
"loss": 0.0395,
"step": 2485
},
{
"epoch": 0.656515720783073,
"grad_norm": 0.013886351138353348,
"learning_rate": 0.0001344240664995382,
"loss": 0.0156,
"step": 2490
},
{
"epoch": 0.65783402544328,
"grad_norm": 5.741530895233154,
"learning_rate": 0.00013429212297136827,
"loss": 0.0317,
"step": 2495
},
{
"epoch": 0.659152330103487,
"grad_norm": 0.20793496072292328,
"learning_rate": 0.0001341601794431983,
"loss": 0.0072,
"step": 2500
},
{
"epoch": 0.659152330103487,
"eval_loss": 0.0300898440182209,
"eval_runtime": 453.0554,
"eval_samples_per_second": 7.443,
"eval_steps_per_second": 3.721,
"step": 2500
}
],
"logging_steps": 5,
"max_steps": 7584,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 2.0176108255414272e+17,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}