{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 5871, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.005109862033725089, "grad_norm": 57.87140382113745, "learning_rate": 5e-06, "loss": 2.3817, "step": 10 }, { "epoch": 0.010219724067450179, "grad_norm": 22.753974610258698, "learning_rate": 5e-06, "loss": 1.7816, "step": 20 }, { "epoch": 0.015329586101175269, "grad_norm": 10.200028852824934, "learning_rate": 5e-06, "loss": 1.4328, "step": 30 }, { "epoch": 0.020439448134900357, "grad_norm": 10.263199576103078, "learning_rate": 5e-06, "loss": 1.2383, "step": 40 }, { "epoch": 0.025549310168625446, "grad_norm": 4.8615267890156915, "learning_rate": 5e-06, "loss": 1.131, "step": 50 }, { "epoch": 0.030659172202350538, "grad_norm": 4.118120498213385, "learning_rate": 5e-06, "loss": 1.0356, "step": 60 }, { "epoch": 0.03576903423607562, "grad_norm": 2.8275806453598338, "learning_rate": 5e-06, "loss": 1.0006, "step": 70 }, { "epoch": 0.040878896269800714, "grad_norm": 2.315792934153761, "learning_rate": 5e-06, "loss": 0.9655, "step": 80 }, { "epoch": 0.045988758303525806, "grad_norm": 2.165672978783245, "learning_rate": 5e-06, "loss": 0.9467, "step": 90 }, { "epoch": 0.05109862033725089, "grad_norm": 1.5488035180810096, "learning_rate": 5e-06, "loss": 0.9281, "step": 100 }, { "epoch": 0.05620848237097598, "grad_norm": 1.7243341087325224, "learning_rate": 5e-06, "loss": 0.9128, "step": 110 }, { "epoch": 0.061318344404701075, "grad_norm": 1.5359895854005425, "learning_rate": 5e-06, "loss": 0.9031, "step": 120 }, { "epoch": 0.06642820643842616, "grad_norm": 1.435850065371256, "learning_rate": 5e-06, "loss": 0.8922, "step": 130 }, { "epoch": 0.07153806847215124, "grad_norm": 1.2790521202391505, "learning_rate": 5e-06, "loss": 0.8765, "step": 140 }, { "epoch": 0.07664793050587634, "grad_norm": 1.5917071745489466, "learning_rate": 5e-06, "loss": 0.883, "step": 150 }, { "epoch": 0.08175779253960143, "grad_norm": 1.4686990635112733, "learning_rate": 5e-06, "loss": 0.8874, "step": 160 }, { "epoch": 0.08686765457332651, "grad_norm": 1.2956583323307207, "learning_rate": 5e-06, "loss": 0.8411, "step": 170 }, { "epoch": 0.09197751660705161, "grad_norm": 1.2357587318309058, "learning_rate": 5e-06, "loss": 0.8726, "step": 180 }, { "epoch": 0.0970873786407767, "grad_norm": 1.1075519305139339, "learning_rate": 5e-06, "loss": 0.8548, "step": 190 }, { "epoch": 0.10219724067450178, "grad_norm": 1.086727474202395, "learning_rate": 5e-06, "loss": 0.8493, "step": 200 }, { "epoch": 0.10730710270822688, "grad_norm": 1.0270014162441539, "learning_rate": 5e-06, "loss": 0.8545, "step": 210 }, { "epoch": 0.11241696474195197, "grad_norm": 0.8505362329385927, "learning_rate": 5e-06, "loss": 0.8388, "step": 220 }, { "epoch": 0.11752682677567705, "grad_norm": 1.109098072362057, "learning_rate": 5e-06, "loss": 0.8384, "step": 230 }, { "epoch": 0.12263668880940215, "grad_norm": 1.2809391093206264, "learning_rate": 5e-06, "loss": 0.8471, "step": 240 }, { "epoch": 0.12774655084312725, "grad_norm": 0.9245822751345097, "learning_rate": 5e-06, "loss": 0.836, "step": 250 }, { "epoch": 0.13285641287685232, "grad_norm": 0.8873874884959316, "learning_rate": 5e-06, "loss": 0.8283, "step": 260 }, { "epoch": 0.13796627491057742, "grad_norm": 0.6839627248028421, "learning_rate": 5e-06, "loss": 0.8191, "step": 270 }, { "epoch": 0.1430761369443025, "grad_norm": 0.8006134542844254, "learning_rate": 5e-06, "loss": 0.82, "step": 280 }, { "epoch": 0.1481859989780276, "grad_norm": 0.7249813175028869, "learning_rate": 5e-06, "loss": 0.8254, "step": 290 }, { "epoch": 0.1532958610117527, "grad_norm": 0.611471064933341, "learning_rate": 5e-06, "loss": 0.8148, "step": 300 }, { "epoch": 0.15840572304547776, "grad_norm": 0.8403753613185879, "learning_rate": 5e-06, "loss": 0.8091, "step": 310 }, { "epoch": 0.16351558507920286, "grad_norm": 0.5250234838001376, "learning_rate": 5e-06, "loss": 0.8328, "step": 320 }, { "epoch": 0.16862544711292796, "grad_norm": 0.5411140081911021, "learning_rate": 5e-06, "loss": 0.8171, "step": 330 }, { "epoch": 0.17373530914665303, "grad_norm": 0.7002721154510084, "learning_rate": 5e-06, "loss": 0.8073, "step": 340 }, { "epoch": 0.17884517118037813, "grad_norm": 0.7366019714677299, "learning_rate": 5e-06, "loss": 0.8139, "step": 350 }, { "epoch": 0.18395503321410323, "grad_norm": 0.4960107622038265, "learning_rate": 5e-06, "loss": 0.8195, "step": 360 }, { "epoch": 0.1890648952478283, "grad_norm": 0.590921742782821, "learning_rate": 5e-06, "loss": 0.8131, "step": 370 }, { "epoch": 0.1941747572815534, "grad_norm": 0.43046345355299165, "learning_rate": 5e-06, "loss": 0.8166, "step": 380 }, { "epoch": 0.1992846193152785, "grad_norm": 0.37156340060616266, "learning_rate": 5e-06, "loss": 0.8082, "step": 390 }, { "epoch": 0.20439448134900357, "grad_norm": 0.5029827349762587, "learning_rate": 5e-06, "loss": 0.8187, "step": 400 }, { "epoch": 0.20950434338272866, "grad_norm": 0.39268177137347837, "learning_rate": 5e-06, "loss": 0.8132, "step": 410 }, { "epoch": 0.21461420541645376, "grad_norm": 0.38787210509179787, "learning_rate": 5e-06, "loss": 0.8153, "step": 420 }, { "epoch": 0.21972406745017883, "grad_norm": 0.5073537687615494, "learning_rate": 5e-06, "loss": 0.8032, "step": 430 }, { "epoch": 0.22483392948390393, "grad_norm": 0.36232141966438186, "learning_rate": 5e-06, "loss": 0.786, "step": 440 }, { "epoch": 0.22994379151762903, "grad_norm": 0.3362374048828632, "learning_rate": 5e-06, "loss": 0.8019, "step": 450 }, { "epoch": 0.2350536535513541, "grad_norm": 0.3661481336901712, "learning_rate": 5e-06, "loss": 0.8012, "step": 460 }, { "epoch": 0.2401635155850792, "grad_norm": 0.3186478374299407, "learning_rate": 5e-06, "loss": 0.8088, "step": 470 }, { "epoch": 0.2452733776188043, "grad_norm": 0.3742412176358808, "learning_rate": 5e-06, "loss": 0.795, "step": 480 }, { "epoch": 0.2503832396525294, "grad_norm": 0.3687134051282934, "learning_rate": 5e-06, "loss": 0.7899, "step": 490 }, { "epoch": 0.2554931016862545, "grad_norm": 0.2844684124400157, "learning_rate": 5e-06, "loss": 0.7992, "step": 500 }, { "epoch": 0.26060296371997954, "grad_norm": 0.3445705328013273, "learning_rate": 5e-06, "loss": 0.7908, "step": 510 }, { "epoch": 0.26571282575370464, "grad_norm": 0.3100104564359116, "learning_rate": 5e-06, "loss": 0.7874, "step": 520 }, { "epoch": 0.27082268778742974, "grad_norm": 0.32495110543783357, "learning_rate": 5e-06, "loss": 0.7982, "step": 530 }, { "epoch": 0.27593254982115484, "grad_norm": 0.27997656185948167, "learning_rate": 5e-06, "loss": 0.7954, "step": 540 }, { "epoch": 0.28104241185487994, "grad_norm": 0.4055280072220847, "learning_rate": 5e-06, "loss": 0.7895, "step": 550 }, { "epoch": 0.286152273888605, "grad_norm": 0.2751499734062912, "learning_rate": 5e-06, "loss": 0.7931, "step": 560 }, { "epoch": 0.2912621359223301, "grad_norm": 0.3232277546201772, "learning_rate": 5e-06, "loss": 0.7972, "step": 570 }, { "epoch": 0.2963719979560552, "grad_norm": 0.2915040535303487, "learning_rate": 5e-06, "loss": 0.7912, "step": 580 }, { "epoch": 0.3014818599897803, "grad_norm": 0.2637267252402153, "learning_rate": 5e-06, "loss": 0.7903, "step": 590 }, { "epoch": 0.3065917220235054, "grad_norm": 0.329139944193737, "learning_rate": 5e-06, "loss": 0.7906, "step": 600 }, { "epoch": 0.3117015840572305, "grad_norm": 0.24558162809212852, "learning_rate": 5e-06, "loss": 0.7947, "step": 610 }, { "epoch": 0.3168114460909555, "grad_norm": 0.27224578177967845, "learning_rate": 5e-06, "loss": 0.7998, "step": 620 }, { "epoch": 0.3219213081246806, "grad_norm": 0.2379116871443806, "learning_rate": 5e-06, "loss": 0.7702, "step": 630 }, { "epoch": 0.3270311701584057, "grad_norm": 0.2609800304013704, "learning_rate": 5e-06, "loss": 0.7799, "step": 640 }, { "epoch": 0.3321410321921308, "grad_norm": 0.23810454322283608, "learning_rate": 5e-06, "loss": 0.7897, "step": 650 }, { "epoch": 0.3372508942258559, "grad_norm": 0.252487995051141, "learning_rate": 5e-06, "loss": 0.7726, "step": 660 }, { "epoch": 0.342360756259581, "grad_norm": 0.24925719360242624, "learning_rate": 5e-06, "loss": 0.7978, "step": 670 }, { "epoch": 0.34747061829330605, "grad_norm": 0.2656149332274455, "learning_rate": 5e-06, "loss": 0.7915, "step": 680 }, { "epoch": 0.35258048032703115, "grad_norm": 0.2661464494899381, "learning_rate": 5e-06, "loss": 0.78, "step": 690 }, { "epoch": 0.35769034236075625, "grad_norm": 0.20145837945729328, "learning_rate": 5e-06, "loss": 0.7734, "step": 700 }, { "epoch": 0.36280020439448135, "grad_norm": 0.26148666494272177, "learning_rate": 5e-06, "loss": 0.7792, "step": 710 }, { "epoch": 0.36791006642820645, "grad_norm": 0.24346683962909085, "learning_rate": 5e-06, "loss": 0.7816, "step": 720 }, { "epoch": 0.37301992846193155, "grad_norm": 0.22278218589963927, "learning_rate": 5e-06, "loss": 0.7827, "step": 730 }, { "epoch": 0.3781297904956566, "grad_norm": 0.2303051334399858, "learning_rate": 5e-06, "loss": 0.7707, "step": 740 }, { "epoch": 0.3832396525293817, "grad_norm": 0.2543406962739334, "learning_rate": 5e-06, "loss": 0.7721, "step": 750 }, { "epoch": 0.3883495145631068, "grad_norm": 0.22568609222745584, "learning_rate": 5e-06, "loss": 0.7836, "step": 760 }, { "epoch": 0.3934593765968319, "grad_norm": 0.2294845661519681, "learning_rate": 5e-06, "loss": 0.7872, "step": 770 }, { "epoch": 0.398569238630557, "grad_norm": 0.2197759747155408, "learning_rate": 5e-06, "loss": 0.7681, "step": 780 }, { "epoch": 0.4036791006642821, "grad_norm": 0.22484907906751414, "learning_rate": 5e-06, "loss": 0.7797, "step": 790 }, { "epoch": 0.40878896269800713, "grad_norm": 0.21381439198625019, "learning_rate": 5e-06, "loss": 0.7891, "step": 800 }, { "epoch": 0.41389882473173223, "grad_norm": 0.21726381466920616, "learning_rate": 5e-06, "loss": 0.7779, "step": 810 }, { "epoch": 0.4190086867654573, "grad_norm": 0.21218649602843365, "learning_rate": 5e-06, "loss": 0.7756, "step": 820 }, { "epoch": 0.4241185487991824, "grad_norm": 0.2321703903701098, "learning_rate": 5e-06, "loss": 0.778, "step": 830 }, { "epoch": 0.4292284108329075, "grad_norm": 0.21376686462742833, "learning_rate": 5e-06, "loss": 0.7754, "step": 840 }, { "epoch": 0.4343382728666326, "grad_norm": 0.20773121342846562, "learning_rate": 5e-06, "loss": 0.7729, "step": 850 }, { "epoch": 0.43944813490035767, "grad_norm": 0.23075209429146118, "learning_rate": 5e-06, "loss": 0.7743, "step": 860 }, { "epoch": 0.44455799693408277, "grad_norm": 0.21889813942522324, "learning_rate": 5e-06, "loss": 0.7872, "step": 870 }, { "epoch": 0.44966785896780787, "grad_norm": 0.23158111213452945, "learning_rate": 5e-06, "loss": 0.7729, "step": 880 }, { "epoch": 0.45477772100153296, "grad_norm": 0.23759359366214816, "learning_rate": 5e-06, "loss": 0.7781, "step": 890 }, { "epoch": 0.45988758303525806, "grad_norm": 0.20737512188117463, "learning_rate": 5e-06, "loss": 0.7733, "step": 900 }, { "epoch": 0.46499744506898316, "grad_norm": 0.21978896076440746, "learning_rate": 5e-06, "loss": 0.7724, "step": 910 }, { "epoch": 0.4701073071027082, "grad_norm": 0.21326048741734183, "learning_rate": 5e-06, "loss": 0.7779, "step": 920 }, { "epoch": 0.4752171691364333, "grad_norm": 0.2127733995734116, "learning_rate": 5e-06, "loss": 0.7862, "step": 930 }, { "epoch": 0.4803270311701584, "grad_norm": 0.20545254783011138, "learning_rate": 5e-06, "loss": 0.7741, "step": 940 }, { "epoch": 0.4854368932038835, "grad_norm": 0.22401938020775877, "learning_rate": 5e-06, "loss": 0.7879, "step": 950 }, { "epoch": 0.4905467552376086, "grad_norm": 0.20313108726519555, "learning_rate": 5e-06, "loss": 0.7795, "step": 960 }, { "epoch": 0.4956566172713337, "grad_norm": 0.20676236489485517, "learning_rate": 5e-06, "loss": 0.7658, "step": 970 }, { "epoch": 0.5007664793050588, "grad_norm": 0.1997349100496237, "learning_rate": 5e-06, "loss": 0.7665, "step": 980 }, { "epoch": 0.5058763413387839, "grad_norm": 0.18582330695996696, "learning_rate": 5e-06, "loss": 0.7678, "step": 990 }, { "epoch": 0.510986203372509, "grad_norm": 0.20480578385489007, "learning_rate": 5e-06, "loss": 0.7916, "step": 1000 }, { "epoch": 0.516096065406234, "grad_norm": 0.2068595244192038, "learning_rate": 5e-06, "loss": 0.7728, "step": 1010 }, { "epoch": 0.5212059274399591, "grad_norm": 0.19202510728167327, "learning_rate": 5e-06, "loss": 0.7875, "step": 1020 }, { "epoch": 0.5263157894736842, "grad_norm": 0.20142193391262955, "learning_rate": 5e-06, "loss": 0.7691, "step": 1030 }, { "epoch": 0.5314256515074093, "grad_norm": 0.19153357968487428, "learning_rate": 5e-06, "loss": 0.7717, "step": 1040 }, { "epoch": 0.5365355135411344, "grad_norm": 0.21070221575444148, "learning_rate": 5e-06, "loss": 0.771, "step": 1050 }, { "epoch": 0.5416453755748595, "grad_norm": 0.19388651883167224, "learning_rate": 5e-06, "loss": 0.7674, "step": 1060 }, { "epoch": 0.5467552376085846, "grad_norm": 0.19235966239305358, "learning_rate": 5e-06, "loss": 0.7639, "step": 1070 }, { "epoch": 0.5518650996423097, "grad_norm": 0.2020128053771939, "learning_rate": 5e-06, "loss": 0.7838, "step": 1080 }, { "epoch": 0.5569749616760348, "grad_norm": 0.19898211373916952, "learning_rate": 5e-06, "loss": 0.7734, "step": 1090 }, { "epoch": 0.5620848237097599, "grad_norm": 0.19556108367687597, "learning_rate": 5e-06, "loss": 0.7668, "step": 1100 }, { "epoch": 0.567194685743485, "grad_norm": 0.19409586285487196, "learning_rate": 5e-06, "loss": 0.7778, "step": 1110 }, { "epoch": 0.57230454777721, "grad_norm": 0.20681598650606026, "learning_rate": 5e-06, "loss": 0.779, "step": 1120 }, { "epoch": 0.5774144098109351, "grad_norm": 0.2017546177375358, "learning_rate": 5e-06, "loss": 0.7573, "step": 1130 }, { "epoch": 0.5825242718446602, "grad_norm": 0.1913755514728684, "learning_rate": 5e-06, "loss": 0.7625, "step": 1140 }, { "epoch": 0.5876341338783853, "grad_norm": 0.18303523769215124, "learning_rate": 5e-06, "loss": 0.7752, "step": 1150 }, { "epoch": 0.5927439959121104, "grad_norm": 0.19646699079126273, "learning_rate": 5e-06, "loss": 0.7734, "step": 1160 }, { "epoch": 0.5978538579458355, "grad_norm": 0.18589450588787854, "learning_rate": 5e-06, "loss": 0.7631, "step": 1170 }, { "epoch": 0.6029637199795606, "grad_norm": 0.19141416094028266, "learning_rate": 5e-06, "loss": 0.7721, "step": 1180 }, { "epoch": 0.6080735820132857, "grad_norm": 0.2010700371236207, "learning_rate": 5e-06, "loss": 0.7723, "step": 1190 }, { "epoch": 0.6131834440470108, "grad_norm": 0.209693144160535, "learning_rate": 5e-06, "loss": 0.7709, "step": 1200 }, { "epoch": 0.6182933060807358, "grad_norm": 0.19006264249604685, "learning_rate": 5e-06, "loss": 0.7731, "step": 1210 }, { "epoch": 0.623403168114461, "grad_norm": 0.19625031587081004, "learning_rate": 5e-06, "loss": 0.7683, "step": 1220 }, { "epoch": 0.628513030148186, "grad_norm": 0.19078272960681916, "learning_rate": 5e-06, "loss": 0.773, "step": 1230 }, { "epoch": 0.633622892181911, "grad_norm": 0.2007688157803181, "learning_rate": 5e-06, "loss": 0.7533, "step": 1240 }, { "epoch": 0.6387327542156361, "grad_norm": 0.20004504142951482, "learning_rate": 5e-06, "loss": 0.7721, "step": 1250 }, { "epoch": 0.6438426162493612, "grad_norm": 0.18940428161938747, "learning_rate": 5e-06, "loss": 0.7525, "step": 1260 }, { "epoch": 0.6489524782830863, "grad_norm": 0.19192245014604287, "learning_rate": 5e-06, "loss": 0.7743, "step": 1270 }, { "epoch": 0.6540623403168114, "grad_norm": 0.18803291404020508, "learning_rate": 5e-06, "loss": 0.7781, "step": 1280 }, { "epoch": 0.6591722023505365, "grad_norm": 0.19034763073417169, "learning_rate": 5e-06, "loss": 0.7609, "step": 1290 }, { "epoch": 0.6642820643842616, "grad_norm": 0.2000260745898994, "learning_rate": 5e-06, "loss": 0.7493, "step": 1300 }, { "epoch": 0.6693919264179867, "grad_norm": 0.19721212735320226, "learning_rate": 5e-06, "loss": 0.759, "step": 1310 }, { "epoch": 0.6745017884517118, "grad_norm": 0.1840594216152482, "learning_rate": 5e-06, "loss": 0.7554, "step": 1320 }, { "epoch": 0.6796116504854369, "grad_norm": 0.19294533218500587, "learning_rate": 5e-06, "loss": 0.7601, "step": 1330 }, { "epoch": 0.684721512519162, "grad_norm": 0.2003437533474394, "learning_rate": 5e-06, "loss": 0.7665, "step": 1340 }, { "epoch": 0.6898313745528871, "grad_norm": 0.19522682692250634, "learning_rate": 5e-06, "loss": 0.7682, "step": 1350 }, { "epoch": 0.6949412365866121, "grad_norm": 0.19114649796975278, "learning_rate": 5e-06, "loss": 0.7675, "step": 1360 }, { "epoch": 0.7000510986203372, "grad_norm": 0.18795490185304542, "learning_rate": 5e-06, "loss": 0.7514, "step": 1370 }, { "epoch": 0.7051609606540623, "grad_norm": 0.19982770311498202, "learning_rate": 5e-06, "loss": 0.7587, "step": 1380 }, { "epoch": 0.7102708226877874, "grad_norm": 0.2094868748722276, "learning_rate": 5e-06, "loss": 0.7424, "step": 1390 }, { "epoch": 0.7153806847215125, "grad_norm": 0.18141939898186682, "learning_rate": 5e-06, "loss": 0.7668, "step": 1400 }, { "epoch": 0.7204905467552376, "grad_norm": 0.184317543793234, "learning_rate": 5e-06, "loss": 0.7659, "step": 1410 }, { "epoch": 0.7256004087889627, "grad_norm": 0.19482359157799217, "learning_rate": 5e-06, "loss": 0.766, "step": 1420 }, { "epoch": 0.7307102708226878, "grad_norm": 0.20609416559465576, "learning_rate": 5e-06, "loss": 0.7591, "step": 1430 }, { "epoch": 0.7358201328564129, "grad_norm": 0.1733817918744796, "learning_rate": 5e-06, "loss": 0.7657, "step": 1440 }, { "epoch": 0.740929994890138, "grad_norm": 0.20231819059208814, "learning_rate": 5e-06, "loss": 0.7693, "step": 1450 }, { "epoch": 0.7460398569238631, "grad_norm": 0.19384075901742115, "learning_rate": 5e-06, "loss": 0.7677, "step": 1460 }, { "epoch": 0.7511497189575882, "grad_norm": 0.20242534213073207, "learning_rate": 5e-06, "loss": 0.7658, "step": 1470 }, { "epoch": 0.7562595809913132, "grad_norm": 0.18992152096280124, "learning_rate": 5e-06, "loss": 0.7604, "step": 1480 }, { "epoch": 0.7613694430250383, "grad_norm": 0.20300312286644698, "learning_rate": 5e-06, "loss": 0.7654, "step": 1490 }, { "epoch": 0.7664793050587634, "grad_norm": 0.2110214114358105, "learning_rate": 5e-06, "loss": 0.7579, "step": 1500 }, { "epoch": 0.7715891670924885, "grad_norm": 0.18507470375993035, "learning_rate": 5e-06, "loss": 0.7535, "step": 1510 }, { "epoch": 0.7766990291262136, "grad_norm": 0.20773697020515, "learning_rate": 5e-06, "loss": 0.7681, "step": 1520 }, { "epoch": 0.7818088911599387, "grad_norm": 0.1846942481775099, "learning_rate": 5e-06, "loss": 0.7589, "step": 1530 }, { "epoch": 0.7869187531936638, "grad_norm": 0.19873266556747132, "learning_rate": 5e-06, "loss": 0.7701, "step": 1540 }, { "epoch": 0.7920286152273889, "grad_norm": 0.1889722343751879, "learning_rate": 5e-06, "loss": 0.7637, "step": 1550 }, { "epoch": 0.797138477261114, "grad_norm": 0.2160555515101488, "learning_rate": 5e-06, "loss": 0.7459, "step": 1560 }, { "epoch": 0.8022483392948391, "grad_norm": 0.18958425843094595, "learning_rate": 5e-06, "loss": 0.7499, "step": 1570 }, { "epoch": 0.8073582013285642, "grad_norm": 0.18917128647246217, "learning_rate": 5e-06, "loss": 0.7572, "step": 1580 }, { "epoch": 0.8124680633622893, "grad_norm": 0.17914225162735836, "learning_rate": 5e-06, "loss": 0.7506, "step": 1590 }, { "epoch": 0.8175779253960143, "grad_norm": 0.19189511614416635, "learning_rate": 5e-06, "loss": 0.7613, "step": 1600 }, { "epoch": 0.8226877874297394, "grad_norm": 0.19059344363394998, "learning_rate": 5e-06, "loss": 0.7604, "step": 1610 }, { "epoch": 0.8277976494634645, "grad_norm": 0.20852250489781288, "learning_rate": 5e-06, "loss": 0.7737, "step": 1620 }, { "epoch": 0.8329075114971896, "grad_norm": 0.1877964215413997, "learning_rate": 5e-06, "loss": 0.7725, "step": 1630 }, { "epoch": 0.8380173735309147, "grad_norm": 0.18419774730049385, "learning_rate": 5e-06, "loss": 0.757, "step": 1640 }, { "epoch": 0.8431272355646398, "grad_norm": 0.1926784804428757, "learning_rate": 5e-06, "loss": 0.7515, "step": 1650 }, { "epoch": 0.8482370975983649, "grad_norm": 0.19627214391258455, "learning_rate": 5e-06, "loss": 0.7718, "step": 1660 }, { "epoch": 0.85334695963209, "grad_norm": 0.1944045450874202, "learning_rate": 5e-06, "loss": 0.7449, "step": 1670 }, { "epoch": 0.858456821665815, "grad_norm": 0.18125259142482736, "learning_rate": 5e-06, "loss": 0.7604, "step": 1680 }, { "epoch": 0.8635666836995401, "grad_norm": 0.17817637475202877, "learning_rate": 5e-06, "loss": 0.7641, "step": 1690 }, { "epoch": 0.8686765457332652, "grad_norm": 0.20471658407314103, "learning_rate": 5e-06, "loss": 0.7734, "step": 1700 }, { "epoch": 0.8737864077669902, "grad_norm": 0.1878161208418682, "learning_rate": 5e-06, "loss": 0.7696, "step": 1710 }, { "epoch": 0.8788962698007153, "grad_norm": 0.18334210713527221, "learning_rate": 5e-06, "loss": 0.7536, "step": 1720 }, { "epoch": 0.8840061318344404, "grad_norm": 0.18076699199021762, "learning_rate": 5e-06, "loss": 0.7522, "step": 1730 }, { "epoch": 0.8891159938681655, "grad_norm": 0.19181861440876702, "learning_rate": 5e-06, "loss": 0.7625, "step": 1740 }, { "epoch": 0.8942258559018906, "grad_norm": 0.17965104525055625, "learning_rate": 5e-06, "loss": 0.754, "step": 1750 }, { "epoch": 0.8993357179356157, "grad_norm": 0.19942739585455946, "learning_rate": 5e-06, "loss": 0.7663, "step": 1760 }, { "epoch": 0.9044455799693408, "grad_norm": 0.19441177912329213, "learning_rate": 5e-06, "loss": 0.7643, "step": 1770 }, { "epoch": 0.9095554420030659, "grad_norm": 0.1989153914067934, "learning_rate": 5e-06, "loss": 0.7538, "step": 1780 }, { "epoch": 0.914665304036791, "grad_norm": 0.18779250602110079, "learning_rate": 5e-06, "loss": 0.7582, "step": 1790 }, { "epoch": 0.9197751660705161, "grad_norm": 0.18880847142963628, "learning_rate": 5e-06, "loss": 0.7834, "step": 1800 }, { "epoch": 0.9248850281042412, "grad_norm": 0.19100671149679851, "learning_rate": 5e-06, "loss": 0.7633, "step": 1810 }, { "epoch": 0.9299948901379663, "grad_norm": 0.19156019315031683, "learning_rate": 5e-06, "loss": 0.7644, "step": 1820 }, { "epoch": 0.9351047521716913, "grad_norm": 0.18435442552610007, "learning_rate": 5e-06, "loss": 0.7597, "step": 1830 }, { "epoch": 0.9402146142054164, "grad_norm": 0.2051724242213117, "learning_rate": 5e-06, "loss": 0.7504, "step": 1840 }, { "epoch": 0.9453244762391415, "grad_norm": 0.18536587248191086, "learning_rate": 5e-06, "loss": 0.7396, "step": 1850 }, { "epoch": 0.9504343382728666, "grad_norm": 0.17780024213447235, "learning_rate": 5e-06, "loss": 0.7506, "step": 1860 }, { "epoch": 0.9555442003065917, "grad_norm": 0.18708967218779626, "learning_rate": 5e-06, "loss": 0.7455, "step": 1870 }, { "epoch": 0.9606540623403168, "grad_norm": 0.17747103600840475, "learning_rate": 5e-06, "loss": 0.7444, "step": 1880 }, { "epoch": 0.9657639243740419, "grad_norm": 0.19063629309146018, "learning_rate": 5e-06, "loss": 0.7533, "step": 1890 }, { "epoch": 0.970873786407767, "grad_norm": 0.19471871824403422, "learning_rate": 5e-06, "loss": 0.7539, "step": 1900 }, { "epoch": 0.9759836484414921, "grad_norm": 0.18403984668742995, "learning_rate": 5e-06, "loss": 0.766, "step": 1910 }, { "epoch": 0.9810935104752172, "grad_norm": 0.19270062603489418, "learning_rate": 5e-06, "loss": 0.7661, "step": 1920 }, { "epoch": 0.9862033725089423, "grad_norm": 0.19463685028697894, "learning_rate": 5e-06, "loss": 0.7591, "step": 1930 }, { "epoch": 0.9913132345426674, "grad_norm": 0.18870267371498323, "learning_rate": 5e-06, "loss": 0.7619, "step": 1940 }, { "epoch": 0.9964230965763924, "grad_norm": 0.19561509144751293, "learning_rate": 5e-06, "loss": 0.7727, "step": 1950 }, { "epoch": 1.0015329586101176, "grad_norm": 0.1865614045173204, "learning_rate": 5e-06, "loss": 0.753, "step": 1960 }, { "epoch": 1.0066428206438427, "grad_norm": 0.17671337095527262, "learning_rate": 5e-06, "loss": 0.7557, "step": 1970 }, { "epoch": 1.0117526826775678, "grad_norm": 0.19010483409505236, "learning_rate": 5e-06, "loss": 0.7446, "step": 1980 }, { "epoch": 1.016862544711293, "grad_norm": 0.17413564167981435, "learning_rate": 5e-06, "loss": 0.7389, "step": 1990 }, { "epoch": 1.021972406745018, "grad_norm": 0.18070567046481728, "learning_rate": 5e-06, "loss": 0.7451, "step": 2000 }, { "epoch": 1.0270822687787429, "grad_norm": 0.2047148011452083, "learning_rate": 5e-06, "loss": 0.7404, "step": 2010 }, { "epoch": 1.032192130812468, "grad_norm": 0.19369702939207809, "learning_rate": 5e-06, "loss": 0.7432, "step": 2020 }, { "epoch": 1.037301992846193, "grad_norm": 0.19067195646001925, "learning_rate": 5e-06, "loss": 0.7396, "step": 2030 }, { "epoch": 1.0424118548799182, "grad_norm": 0.20517054385871566, "learning_rate": 5e-06, "loss": 0.7312, "step": 2040 }, { "epoch": 1.0475217169136433, "grad_norm": 0.19412259781936536, "learning_rate": 5e-06, "loss": 0.7434, "step": 2050 }, { "epoch": 1.0526315789473684, "grad_norm": 0.1873188104877111, "learning_rate": 5e-06, "loss": 0.7411, "step": 2060 }, { "epoch": 1.0577414409810935, "grad_norm": 0.1897369700854769, "learning_rate": 5e-06, "loss": 0.7345, "step": 2070 }, { "epoch": 1.0628513030148186, "grad_norm": 0.1853976118509793, "learning_rate": 5e-06, "loss": 0.7549, "step": 2080 }, { "epoch": 1.0679611650485437, "grad_norm": 0.17933568978007214, "learning_rate": 5e-06, "loss": 0.7531, "step": 2090 }, { "epoch": 1.0730710270822688, "grad_norm": 0.20312467005419368, "learning_rate": 5e-06, "loss": 0.7498, "step": 2100 }, { "epoch": 1.0781808891159939, "grad_norm": 0.18428135458855252, "learning_rate": 5e-06, "loss": 0.7404, "step": 2110 }, { "epoch": 1.083290751149719, "grad_norm": 0.1973755972738785, "learning_rate": 5e-06, "loss": 0.7329, "step": 2120 }, { "epoch": 1.088400613183444, "grad_norm": 0.1917623224859124, "learning_rate": 5e-06, "loss": 0.7555, "step": 2130 }, { "epoch": 1.0935104752171692, "grad_norm": 0.18406588696688597, "learning_rate": 5e-06, "loss": 0.7433, "step": 2140 }, { "epoch": 1.0986203372508943, "grad_norm": 0.17921040061727433, "learning_rate": 5e-06, "loss": 0.7305, "step": 2150 }, { "epoch": 1.1037301992846194, "grad_norm": 0.18963146030246644, "learning_rate": 5e-06, "loss": 0.7397, "step": 2160 }, { "epoch": 1.1088400613183444, "grad_norm": 0.18712686418913257, "learning_rate": 5e-06, "loss": 0.7409, "step": 2170 }, { "epoch": 1.1139499233520695, "grad_norm": 0.19771359798461643, "learning_rate": 5e-06, "loss": 0.7427, "step": 2180 }, { "epoch": 1.1190597853857946, "grad_norm": 0.1840406013875161, "learning_rate": 5e-06, "loss": 0.7611, "step": 2190 }, { "epoch": 1.1241696474195197, "grad_norm": 0.18827676786628125, "learning_rate": 5e-06, "loss": 0.7364, "step": 2200 }, { "epoch": 1.1292795094532448, "grad_norm": 0.191153055094231, "learning_rate": 5e-06, "loss": 0.7539, "step": 2210 }, { "epoch": 1.13438937148697, "grad_norm": 0.1822666967889064, "learning_rate": 5e-06, "loss": 0.7414, "step": 2220 }, { "epoch": 1.139499233520695, "grad_norm": 0.19699245088881503, "learning_rate": 5e-06, "loss": 0.738, "step": 2230 }, { "epoch": 1.14460909555442, "grad_norm": 0.1898722587971926, "learning_rate": 5e-06, "loss": 0.7461, "step": 2240 }, { "epoch": 1.1497189575881452, "grad_norm": 0.18927249674651098, "learning_rate": 5e-06, "loss": 0.7429, "step": 2250 }, { "epoch": 1.1548288196218701, "grad_norm": 0.18896514284627008, "learning_rate": 5e-06, "loss": 0.7549, "step": 2260 }, { "epoch": 1.1599386816555952, "grad_norm": 0.18739970646008372, "learning_rate": 5e-06, "loss": 0.7499, "step": 2270 }, { "epoch": 1.1650485436893203, "grad_norm": 0.1910100009129843, "learning_rate": 5e-06, "loss": 0.747, "step": 2280 }, { "epoch": 1.1701584057230454, "grad_norm": 0.20198153170551428, "learning_rate": 5e-06, "loss": 0.7386, "step": 2290 }, { "epoch": 1.1752682677567705, "grad_norm": 0.18720641288978465, "learning_rate": 5e-06, "loss": 0.7578, "step": 2300 }, { "epoch": 1.1803781297904956, "grad_norm": 0.18961987449195758, "learning_rate": 5e-06, "loss": 0.7529, "step": 2310 }, { "epoch": 1.1854879918242207, "grad_norm": 0.17712198248177036, "learning_rate": 5e-06, "loss": 0.7476, "step": 2320 }, { "epoch": 1.1905978538579458, "grad_norm": 0.18490722732969878, "learning_rate": 5e-06, "loss": 0.7437, "step": 2330 }, { "epoch": 1.195707715891671, "grad_norm": 0.18822524406653396, "learning_rate": 5e-06, "loss": 0.7469, "step": 2340 }, { "epoch": 1.200817577925396, "grad_norm": 0.17979619340691932, "learning_rate": 5e-06, "loss": 0.7476, "step": 2350 }, { "epoch": 1.205927439959121, "grad_norm": 0.19302201862857232, "learning_rate": 5e-06, "loss": 0.7519, "step": 2360 }, { "epoch": 1.2110373019928462, "grad_norm": 0.17331596795239365, "learning_rate": 5e-06, "loss": 0.7421, "step": 2370 }, { "epoch": 1.2161471640265713, "grad_norm": 0.1900974772777454, "learning_rate": 5e-06, "loss": 0.7491, "step": 2380 }, { "epoch": 1.2212570260602964, "grad_norm": 0.18235079887869074, "learning_rate": 5e-06, "loss": 0.7398, "step": 2390 }, { "epoch": 1.2263668880940215, "grad_norm": 0.1990024061618005, "learning_rate": 5e-06, "loss": 0.7367, "step": 2400 }, { "epoch": 1.2314767501277466, "grad_norm": 0.19774880108315787, "learning_rate": 5e-06, "loss": 0.7509, "step": 2410 }, { "epoch": 1.2365866121614717, "grad_norm": 0.18038613594979708, "learning_rate": 5e-06, "loss": 0.7397, "step": 2420 }, { "epoch": 1.2416964741951968, "grad_norm": 0.19148490320343095, "learning_rate": 5e-06, "loss": 0.738, "step": 2430 }, { "epoch": 1.246806336228922, "grad_norm": 0.1764579818726389, "learning_rate": 5e-06, "loss": 0.7344, "step": 2440 }, { "epoch": 1.2519161982626468, "grad_norm": 0.19292171667184566, "learning_rate": 5e-06, "loss": 0.7386, "step": 2450 }, { "epoch": 1.257026060296372, "grad_norm": 0.1830054013064937, "learning_rate": 5e-06, "loss": 0.7428, "step": 2460 }, { "epoch": 1.262135922330097, "grad_norm": 0.1771188524320852, "learning_rate": 5e-06, "loss": 0.7349, "step": 2470 }, { "epoch": 1.2672457843638223, "grad_norm": 0.19308200340380383, "learning_rate": 5e-06, "loss": 0.7428, "step": 2480 }, { "epoch": 1.2723556463975472, "grad_norm": 0.17723395601019812, "learning_rate": 5e-06, "loss": 0.7271, "step": 2490 }, { "epoch": 1.2774655084312725, "grad_norm": 0.21808963432632347, "learning_rate": 5e-06, "loss": 0.7551, "step": 2500 }, { "epoch": 1.2825753704649974, "grad_norm": 0.1944324454402299, "learning_rate": 5e-06, "loss": 0.7388, "step": 2510 }, { "epoch": 1.2876852324987225, "grad_norm": 0.1737167892046253, "learning_rate": 5e-06, "loss": 0.7383, "step": 2520 }, { "epoch": 1.2927950945324476, "grad_norm": 0.19316243219818216, "learning_rate": 5e-06, "loss": 0.7447, "step": 2530 }, { "epoch": 1.2979049565661727, "grad_norm": 0.18271045253382115, "learning_rate": 5e-06, "loss": 0.7439, "step": 2540 }, { "epoch": 1.3030148185998978, "grad_norm": 0.19235060247622612, "learning_rate": 5e-06, "loss": 0.7362, "step": 2550 }, { "epoch": 1.3081246806336229, "grad_norm": 0.17975422534798727, "learning_rate": 5e-06, "loss": 0.7355, "step": 2560 }, { "epoch": 1.313234542667348, "grad_norm": 0.19133431284185412, "learning_rate": 5e-06, "loss": 0.7474, "step": 2570 }, { "epoch": 1.318344404701073, "grad_norm": 0.18525107476229646, "learning_rate": 5e-06, "loss": 0.7362, "step": 2580 }, { "epoch": 1.3234542667347982, "grad_norm": 0.17940598778920888, "learning_rate": 5e-06, "loss": 0.7456, "step": 2590 }, { "epoch": 1.3285641287685233, "grad_norm": 0.18377019808836909, "learning_rate": 5e-06, "loss": 0.7371, "step": 2600 }, { "epoch": 1.3336739908022484, "grad_norm": 0.19380006248186266, "learning_rate": 5e-06, "loss": 0.7382, "step": 2610 }, { "epoch": 1.3387838528359735, "grad_norm": 0.19130341477919996, "learning_rate": 5e-06, "loss": 0.7249, "step": 2620 }, { "epoch": 1.3438937148696986, "grad_norm": 0.19399937386692442, "learning_rate": 5e-06, "loss": 0.7461, "step": 2630 }, { "epoch": 1.3490035769034237, "grad_norm": 0.19026606995853784, "learning_rate": 5e-06, "loss": 0.744, "step": 2640 }, { "epoch": 1.3541134389371488, "grad_norm": 0.17865393066414276, "learning_rate": 5e-06, "loss": 0.7562, "step": 2650 }, { "epoch": 1.3592233009708738, "grad_norm": 0.18268806699965215, "learning_rate": 5e-06, "loss": 0.7369, "step": 2660 }, { "epoch": 1.364333163004599, "grad_norm": 0.1859894512893362, "learning_rate": 5e-06, "loss": 0.7303, "step": 2670 }, { "epoch": 1.369443025038324, "grad_norm": 0.1858579729895718, "learning_rate": 5e-06, "loss": 0.7338, "step": 2680 }, { "epoch": 1.3745528870720491, "grad_norm": 0.18844334675064925, "learning_rate": 5e-06, "loss": 0.7399, "step": 2690 }, { "epoch": 1.379662749105774, "grad_norm": 0.19488780104365555, "learning_rate": 5e-06, "loss": 0.7297, "step": 2700 }, { "epoch": 1.3847726111394993, "grad_norm": 0.2012609184785339, "learning_rate": 5e-06, "loss": 0.7438, "step": 2710 }, { "epoch": 1.3898824731732242, "grad_norm": 0.19240664121181153, "learning_rate": 5e-06, "loss": 0.724, "step": 2720 }, { "epoch": 1.3949923352069495, "grad_norm": 0.1989866337354731, "learning_rate": 5e-06, "loss": 0.7387, "step": 2730 }, { "epoch": 1.4001021972406744, "grad_norm": 0.19173593318229185, "learning_rate": 5e-06, "loss": 0.7389, "step": 2740 }, { "epoch": 1.4052120592743995, "grad_norm": 0.18490218706957628, "learning_rate": 5e-06, "loss": 0.7354, "step": 2750 }, { "epoch": 1.4103219213081246, "grad_norm": 0.19596927441115194, "learning_rate": 5e-06, "loss": 0.7294, "step": 2760 }, { "epoch": 1.4154317833418497, "grad_norm": 0.1884149575008095, "learning_rate": 5e-06, "loss": 0.7535, "step": 2770 }, { "epoch": 1.4205416453755748, "grad_norm": 0.19011351454021505, "learning_rate": 5e-06, "loss": 0.7311, "step": 2780 }, { "epoch": 1.4256515074093, "grad_norm": 0.18330086878582655, "learning_rate": 5e-06, "loss": 0.7238, "step": 2790 }, { "epoch": 1.430761369443025, "grad_norm": 0.18245127655957494, "learning_rate": 5e-06, "loss": 0.7297, "step": 2800 }, { "epoch": 1.43587123147675, "grad_norm": 0.1899683440493854, "learning_rate": 5e-06, "loss": 0.7388, "step": 2810 }, { "epoch": 1.4409810935104752, "grad_norm": 0.17955923753560576, "learning_rate": 5e-06, "loss": 0.7382, "step": 2820 }, { "epoch": 1.4460909555442003, "grad_norm": 0.19252118964036657, "learning_rate": 5e-06, "loss": 0.7428, "step": 2830 }, { "epoch": 1.4512008175779254, "grad_norm": 0.1993095814580447, "learning_rate": 5e-06, "loss": 0.7453, "step": 2840 }, { "epoch": 1.4563106796116505, "grad_norm": 0.19453340379561043, "learning_rate": 5e-06, "loss": 0.7316, "step": 2850 }, { "epoch": 1.4614205416453756, "grad_norm": 0.18991487065634022, "learning_rate": 5e-06, "loss": 0.7386, "step": 2860 }, { "epoch": 1.4665304036791007, "grad_norm": 0.18973483396294438, "learning_rate": 5e-06, "loss": 0.7466, "step": 2870 }, { "epoch": 1.4716402657128258, "grad_norm": 0.1919078737205217, "learning_rate": 5e-06, "loss": 0.7495, "step": 2880 }, { "epoch": 1.476750127746551, "grad_norm": 0.1839372662925669, "learning_rate": 5e-06, "loss": 0.7389, "step": 2890 }, { "epoch": 1.481859989780276, "grad_norm": 0.19249510537236922, "learning_rate": 5e-06, "loss": 0.749, "step": 2900 }, { "epoch": 1.486969851814001, "grad_norm": 0.179246890906481, "learning_rate": 5e-06, "loss": 0.7282, "step": 2910 }, { "epoch": 1.4920797138477262, "grad_norm": 0.19514232006253573, "learning_rate": 5e-06, "loss": 0.7446, "step": 2920 }, { "epoch": 1.497189575881451, "grad_norm": 0.18596165643891152, "learning_rate": 5e-06, "loss": 0.7334, "step": 2930 }, { "epoch": 1.5022994379151764, "grad_norm": 0.18786595484474303, "learning_rate": 5e-06, "loss": 0.7279, "step": 2940 }, { "epoch": 1.5074092999489013, "grad_norm": 0.1971738538736254, "learning_rate": 5e-06, "loss": 0.7461, "step": 2950 }, { "epoch": 1.5125191619826266, "grad_norm": 0.18866040282412957, "learning_rate": 5e-06, "loss": 0.7474, "step": 2960 }, { "epoch": 1.5176290240163515, "grad_norm": 0.19040908132898832, "learning_rate": 5e-06, "loss": 0.7445, "step": 2970 }, { "epoch": 1.5227388860500768, "grad_norm": 0.1929810400978154, "learning_rate": 5e-06, "loss": 0.7362, "step": 2980 }, { "epoch": 1.5278487480838017, "grad_norm": 0.19442375737357984, "learning_rate": 5e-06, "loss": 0.7305, "step": 2990 }, { "epoch": 1.532958610117527, "grad_norm": 0.18546826858387847, "learning_rate": 5e-06, "loss": 0.7311, "step": 3000 }, { "epoch": 1.5380684721512519, "grad_norm": 0.18542784404927515, "learning_rate": 5e-06, "loss": 0.7447, "step": 3010 }, { "epoch": 1.543178334184977, "grad_norm": 0.2020846723209545, "learning_rate": 5e-06, "loss": 0.7113, "step": 3020 }, { "epoch": 1.548288196218702, "grad_norm": 0.19026884893782828, "learning_rate": 5e-06, "loss": 0.7157, "step": 3030 }, { "epoch": 1.5533980582524272, "grad_norm": 0.18111421662938304, "learning_rate": 5e-06, "loss": 0.7323, "step": 3040 }, { "epoch": 1.5585079202861523, "grad_norm": 0.19367385202342016, "learning_rate": 5e-06, "loss": 0.7305, "step": 3050 }, { "epoch": 1.5636177823198774, "grad_norm": 0.18590394121821466, "learning_rate": 5e-06, "loss": 0.7341, "step": 3060 }, { "epoch": 1.5687276443536025, "grad_norm": 0.18488441186992707, "learning_rate": 5e-06, "loss": 0.7482, "step": 3070 }, { "epoch": 1.5738375063873276, "grad_norm": 0.18226306867076514, "learning_rate": 5e-06, "loss": 0.7334, "step": 3080 }, { "epoch": 1.5789473684210527, "grad_norm": 0.20053856155426641, "learning_rate": 5e-06, "loss": 0.7414, "step": 3090 }, { "epoch": 1.5840572304547778, "grad_norm": 0.19672564131420983, "learning_rate": 5e-06, "loss": 0.7508, "step": 3100 }, { "epoch": 1.5891670924885029, "grad_norm": 0.1790851772225089, "learning_rate": 5e-06, "loss": 0.7471, "step": 3110 }, { "epoch": 1.594276954522228, "grad_norm": 0.1900047612676954, "learning_rate": 5e-06, "loss": 0.7421, "step": 3120 }, { "epoch": 1.599386816555953, "grad_norm": 0.19746465955340986, "learning_rate": 5e-06, "loss": 0.7471, "step": 3130 }, { "epoch": 1.604496678589678, "grad_norm": 0.186549540683221, "learning_rate": 5e-06, "loss": 0.7275, "step": 3140 }, { "epoch": 1.6096065406234032, "grad_norm": 0.1876261054598287, "learning_rate": 5e-06, "loss": 0.7359, "step": 3150 }, { "epoch": 1.6147164026571281, "grad_norm": 0.19082325492370317, "learning_rate": 5e-06, "loss": 0.7268, "step": 3160 }, { "epoch": 1.6198262646908534, "grad_norm": 0.2016402119888201, "learning_rate": 5e-06, "loss": 0.7377, "step": 3170 }, { "epoch": 1.6249361267245783, "grad_norm": 0.1888126317070555, "learning_rate": 5e-06, "loss": 0.7265, "step": 3180 }, { "epoch": 1.6300459887583036, "grad_norm": 0.17743730583327474, "learning_rate": 5e-06, "loss": 0.7203, "step": 3190 }, { "epoch": 1.6351558507920285, "grad_norm": 0.1826162903853255, "learning_rate": 5e-06, "loss": 0.7215, "step": 3200 }, { "epoch": 1.6402657128257538, "grad_norm": 0.19419266754404552, "learning_rate": 5e-06, "loss": 0.7376, "step": 3210 }, { "epoch": 1.6453755748594787, "grad_norm": 0.1956453565355767, "learning_rate": 5e-06, "loss": 0.7316, "step": 3220 }, { "epoch": 1.650485436893204, "grad_norm": 0.19765143129125318, "learning_rate": 5e-06, "loss": 0.7374, "step": 3230 }, { "epoch": 1.655595298926929, "grad_norm": 0.19127982430051427, "learning_rate": 5e-06, "loss": 0.7405, "step": 3240 }, { "epoch": 1.660705160960654, "grad_norm": 0.1847472801458583, "learning_rate": 5e-06, "loss": 0.724, "step": 3250 }, { "epoch": 1.6658150229943791, "grad_norm": 0.18698307703261788, "learning_rate": 5e-06, "loss": 0.7374, "step": 3260 }, { "epoch": 1.6709248850281042, "grad_norm": 0.17533798523255767, "learning_rate": 5e-06, "loss": 0.7202, "step": 3270 }, { "epoch": 1.6760347470618293, "grad_norm": 0.1806351825859557, "learning_rate": 5e-06, "loss": 0.7452, "step": 3280 }, { "epoch": 1.6811446090955544, "grad_norm": 0.1767976805961292, "learning_rate": 5e-06, "loss": 0.7338, "step": 3290 }, { "epoch": 1.6862544711292795, "grad_norm": 0.19498984484111873, "learning_rate": 5e-06, "loss": 0.7403, "step": 3300 }, { "epoch": 1.6913643331630046, "grad_norm": 0.17701307669892918, "learning_rate": 5e-06, "loss": 0.7299, "step": 3310 }, { "epoch": 1.6964741951967297, "grad_norm": 0.19220216566407472, "learning_rate": 5e-06, "loss": 0.7314, "step": 3320 }, { "epoch": 1.7015840572304548, "grad_norm": 0.1829279730231264, "learning_rate": 5e-06, "loss": 0.743, "step": 3330 }, { "epoch": 1.70669391926418, "grad_norm": 0.19526766653061225, "learning_rate": 5e-06, "loss": 0.7222, "step": 3340 }, { "epoch": 1.711803781297905, "grad_norm": 0.19455609962672274, "learning_rate": 5e-06, "loss": 0.7253, "step": 3350 }, { "epoch": 1.71691364333163, "grad_norm": 0.21002379536162816, "learning_rate": 5e-06, "loss": 0.7429, "step": 3360 }, { "epoch": 1.722023505365355, "grad_norm": 0.1990882316461353, "learning_rate": 5e-06, "loss": 0.7443, "step": 3370 }, { "epoch": 1.7271333673990803, "grad_norm": 0.17934672167038826, "learning_rate": 5e-06, "loss": 0.7497, "step": 3380 }, { "epoch": 1.7322432294328052, "grad_norm": 0.19501165745940965, "learning_rate": 5e-06, "loss": 0.7425, "step": 3390 }, { "epoch": 1.7373530914665305, "grad_norm": 0.19248650606756543, "learning_rate": 5e-06, "loss": 0.7297, "step": 3400 }, { "epoch": 1.7424629535002554, "grad_norm": 0.17721599710417338, "learning_rate": 5e-06, "loss": 0.7251, "step": 3410 }, { "epoch": 1.7475728155339807, "grad_norm": 0.18509365156353424, "learning_rate": 5e-06, "loss": 0.7221, "step": 3420 }, { "epoch": 1.7526826775677056, "grad_norm": 0.18289284691122754, "learning_rate": 5e-06, "loss": 0.7327, "step": 3430 }, { "epoch": 1.757792539601431, "grad_norm": 0.18756279151165123, "learning_rate": 5e-06, "loss": 0.732, "step": 3440 }, { "epoch": 1.7629024016351558, "grad_norm": 0.17439303769229625, "learning_rate": 5e-06, "loss": 0.7344, "step": 3450 }, { "epoch": 1.768012263668881, "grad_norm": 0.17783376482824478, "learning_rate": 5e-06, "loss": 0.7323, "step": 3460 }, { "epoch": 1.773122125702606, "grad_norm": 0.19448194078586717, "learning_rate": 5e-06, "loss": 0.7292, "step": 3470 }, { "epoch": 1.778231987736331, "grad_norm": 0.18000237860427712, "learning_rate": 5e-06, "loss": 0.7219, "step": 3480 }, { "epoch": 1.7833418497700562, "grad_norm": 0.18519882940772772, "learning_rate": 5e-06, "loss": 0.7354, "step": 3490 }, { "epoch": 1.7884517118037813, "grad_norm": 0.19301292549147336, "learning_rate": 5e-06, "loss": 0.7487, "step": 3500 }, { "epoch": 1.7935615738375064, "grad_norm": 0.17758380897102066, "learning_rate": 5e-06, "loss": 0.7373, "step": 3510 }, { "epoch": 1.7986714358712315, "grad_norm": 0.1794720757802905, "learning_rate": 5e-06, "loss": 0.7226, "step": 3520 }, { "epoch": 1.8037812979049566, "grad_norm": 0.18100374933694008, "learning_rate": 5e-06, "loss": 0.7256, "step": 3530 }, { "epoch": 1.8088911599386817, "grad_norm": 0.1954603145633284, "learning_rate": 5e-06, "loss": 0.7283, "step": 3540 }, { "epoch": 1.8140010219724068, "grad_norm": 0.19558607958635285, "learning_rate": 5e-06, "loss": 0.7384, "step": 3550 }, { "epoch": 1.8191108840061319, "grad_norm": 0.1772107537935853, "learning_rate": 5e-06, "loss": 0.7382, "step": 3560 }, { "epoch": 1.824220746039857, "grad_norm": 0.17916901000763397, "learning_rate": 5e-06, "loss": 0.7367, "step": 3570 }, { "epoch": 1.829330608073582, "grad_norm": 0.19083482072843658, "learning_rate": 5e-06, "loss": 0.7247, "step": 3580 }, { "epoch": 1.8344404701073072, "grad_norm": 0.1770449813805881, "learning_rate": 5e-06, "loss": 0.7187, "step": 3590 }, { "epoch": 1.839550332141032, "grad_norm": 0.18790158384523442, "learning_rate": 5e-06, "loss": 0.7403, "step": 3600 }, { "epoch": 1.8446601941747574, "grad_norm": 0.17892362311216914, "learning_rate": 5e-06, "loss": 0.7368, "step": 3610 }, { "epoch": 1.8497700562084822, "grad_norm": 0.1839659251785667, "learning_rate": 5e-06, "loss": 0.7255, "step": 3620 }, { "epoch": 1.8548799182422075, "grad_norm": 0.19138496555502849, "learning_rate": 5e-06, "loss": 0.7453, "step": 3630 }, { "epoch": 1.8599897802759324, "grad_norm": 0.18135734354491537, "learning_rate": 5e-06, "loss": 0.7166, "step": 3640 }, { "epoch": 1.8650996423096577, "grad_norm": 0.1859082044449026, "learning_rate": 5e-06, "loss": 0.7285, "step": 3650 }, { "epoch": 1.8702095043433826, "grad_norm": 0.1913280855307758, "learning_rate": 5e-06, "loss": 0.7279, "step": 3660 }, { "epoch": 1.875319366377108, "grad_norm": 0.19148047998384163, "learning_rate": 5e-06, "loss": 0.7381, "step": 3670 }, { "epoch": 1.8804292284108328, "grad_norm": 0.190776629149848, "learning_rate": 5e-06, "loss": 0.7347, "step": 3680 }, { "epoch": 1.8855390904445581, "grad_norm": 0.18748890464701637, "learning_rate": 5e-06, "loss": 0.7214, "step": 3690 }, { "epoch": 1.890648952478283, "grad_norm": 0.19691029617370956, "learning_rate": 5e-06, "loss": 0.7396, "step": 3700 }, { "epoch": 1.8957588145120083, "grad_norm": 0.17143385370457725, "learning_rate": 5e-06, "loss": 0.7305, "step": 3710 }, { "epoch": 1.9008686765457332, "grad_norm": 0.19115494111352774, "learning_rate": 5e-06, "loss": 0.7252, "step": 3720 }, { "epoch": 1.9059785385794583, "grad_norm": 0.18589155104150923, "learning_rate": 5e-06, "loss": 0.728, "step": 3730 }, { "epoch": 1.9110884006131834, "grad_norm": 0.1870078279938856, "learning_rate": 5e-06, "loss": 0.7259, "step": 3740 }, { "epoch": 1.9161982626469085, "grad_norm": 0.17909224396912188, "learning_rate": 5e-06, "loss": 0.7345, "step": 3750 }, { "epoch": 1.9213081246806336, "grad_norm": 0.19885632401705697, "learning_rate": 5e-06, "loss": 0.7289, "step": 3760 }, { "epoch": 1.9264179867143587, "grad_norm": 0.18778552722329356, "learning_rate": 5e-06, "loss": 0.725, "step": 3770 }, { "epoch": 1.9315278487480838, "grad_norm": 0.2019790249495369, "learning_rate": 5e-06, "loss": 0.7398, "step": 3780 }, { "epoch": 1.936637710781809, "grad_norm": 0.19338701904495897, "learning_rate": 5e-06, "loss": 0.7265, "step": 3790 }, { "epoch": 1.941747572815534, "grad_norm": 0.18703296264974872, "learning_rate": 5e-06, "loss": 0.7332, "step": 3800 }, { "epoch": 1.946857434849259, "grad_norm": 0.1700175440342506, "learning_rate": 5e-06, "loss": 0.7205, "step": 3810 }, { "epoch": 1.9519672968829842, "grad_norm": 0.18636496202992153, "learning_rate": 5e-06, "loss": 0.7154, "step": 3820 }, { "epoch": 1.9570771589167093, "grad_norm": 0.1826391337083993, "learning_rate": 5e-06, "loss": 0.7356, "step": 3830 }, { "epoch": 1.9621870209504344, "grad_norm": 0.17766191115765154, "learning_rate": 5e-06, "loss": 0.7369, "step": 3840 }, { "epoch": 1.9672968829841593, "grad_norm": 0.18034528150782342, "learning_rate": 5e-06, "loss": 0.7288, "step": 3850 }, { "epoch": 1.9724067450178846, "grad_norm": 0.18541894497456152, "learning_rate": 5e-06, "loss": 0.7296, "step": 3860 }, { "epoch": 1.9775166070516095, "grad_norm": 0.19539634425789987, "learning_rate": 5e-06, "loss": 0.7374, "step": 3870 }, { "epoch": 1.9826264690853348, "grad_norm": 0.1928837424204438, "learning_rate": 5e-06, "loss": 0.732, "step": 3880 }, { "epoch": 1.9877363311190597, "grad_norm": 0.18813671735265705, "learning_rate": 5e-06, "loss": 0.7285, "step": 3890 }, { "epoch": 1.992846193152785, "grad_norm": 0.19024591983517306, "learning_rate": 5e-06, "loss": 0.7386, "step": 3900 }, { "epoch": 1.9979560551865099, "grad_norm": 0.1791330764130833, "learning_rate": 5e-06, "loss": 0.7324, "step": 3910 }, { "epoch": 2.003065917220235, "grad_norm": 0.18457472513280188, "learning_rate": 5e-06, "loss": 0.7303, "step": 3920 }, { "epoch": 2.00817577925396, "grad_norm": 0.18684788166920566, "learning_rate": 5e-06, "loss": 0.735, "step": 3930 }, { "epoch": 2.0132856412876854, "grad_norm": 0.18385152341485855, "learning_rate": 5e-06, "loss": 0.7204, "step": 3940 }, { "epoch": 2.0183955033214103, "grad_norm": 0.1893087134675762, "learning_rate": 5e-06, "loss": 0.7271, "step": 3950 }, { "epoch": 2.0235053653551356, "grad_norm": 0.19001500071497598, "learning_rate": 5e-06, "loss": 0.7286, "step": 3960 }, { "epoch": 2.0286152273888605, "grad_norm": 0.18883227887144083, "learning_rate": 5e-06, "loss": 0.7157, "step": 3970 }, { "epoch": 2.033725089422586, "grad_norm": 0.17689498199700174, "learning_rate": 5e-06, "loss": 0.7063, "step": 3980 }, { "epoch": 2.0388349514563107, "grad_norm": 0.19270059169949594, "learning_rate": 5e-06, "loss": 0.7295, "step": 3990 }, { "epoch": 2.043944813490036, "grad_norm": 0.19597358123850073, "learning_rate": 5e-06, "loss": 0.7268, "step": 4000 }, { "epoch": 2.049054675523761, "grad_norm": 0.19711968415004932, "learning_rate": 5e-06, "loss": 0.7229, "step": 4010 }, { "epoch": 2.0541645375574857, "grad_norm": 0.18776496261358783, "learning_rate": 5e-06, "loss": 0.7158, "step": 4020 }, { "epoch": 2.059274399591211, "grad_norm": 0.18818737294591004, "learning_rate": 5e-06, "loss": 0.7194, "step": 4030 }, { "epoch": 2.064384261624936, "grad_norm": 0.19845798280803176, "learning_rate": 5e-06, "loss": 0.7265, "step": 4040 }, { "epoch": 2.0694941236586613, "grad_norm": 0.18713228958457867, "learning_rate": 5e-06, "loss": 0.7095, "step": 4050 }, { "epoch": 2.074603985692386, "grad_norm": 0.17787870833728897, "learning_rate": 5e-06, "loss": 0.7149, "step": 4060 }, { "epoch": 2.0797138477261115, "grad_norm": 0.19472810880013228, "learning_rate": 5e-06, "loss": 0.7178, "step": 4070 }, { "epoch": 2.0848237097598363, "grad_norm": 0.19429450371850024, "learning_rate": 5e-06, "loss": 0.7118, "step": 4080 }, { "epoch": 2.0899335717935617, "grad_norm": 0.1941609760118733, "learning_rate": 5e-06, "loss": 0.7265, "step": 4090 }, { "epoch": 2.0950434338272865, "grad_norm": 0.19290976310635458, "learning_rate": 5e-06, "loss": 0.7085, "step": 4100 }, { "epoch": 2.100153295861012, "grad_norm": 0.1765241017205207, "learning_rate": 5e-06, "loss": 0.7301, "step": 4110 }, { "epoch": 2.1052631578947367, "grad_norm": 0.17846133756954982, "learning_rate": 5e-06, "loss": 0.7105, "step": 4120 }, { "epoch": 2.110373019928462, "grad_norm": 0.1990655813201847, "learning_rate": 5e-06, "loss": 0.7197, "step": 4130 }, { "epoch": 2.115482881962187, "grad_norm": 0.18227727070573727, "learning_rate": 5e-06, "loss": 0.7225, "step": 4140 }, { "epoch": 2.1205927439959122, "grad_norm": 0.1950861579503913, "learning_rate": 5e-06, "loss": 0.7151, "step": 4150 }, { "epoch": 2.125702606029637, "grad_norm": 0.1852273630173904, "learning_rate": 5e-06, "loss": 0.7349, "step": 4160 }, { "epoch": 2.1308124680633624, "grad_norm": 0.19463032921601597, "learning_rate": 5e-06, "loss": 0.7209, "step": 4170 }, { "epoch": 2.1359223300970873, "grad_norm": 0.20049767813158514, "learning_rate": 5e-06, "loss": 0.7272, "step": 4180 }, { "epoch": 2.1410321921308126, "grad_norm": 0.1964989410548727, "learning_rate": 5e-06, "loss": 0.7205, "step": 4190 }, { "epoch": 2.1461420541645375, "grad_norm": 0.18656787454813695, "learning_rate": 5e-06, "loss": 0.73, "step": 4200 }, { "epoch": 2.151251916198263, "grad_norm": 0.1869811709706087, "learning_rate": 5e-06, "loss": 0.7211, "step": 4210 }, { "epoch": 2.1563617782319877, "grad_norm": 0.1822452717215839, "learning_rate": 5e-06, "loss": 0.7149, "step": 4220 }, { "epoch": 2.161471640265713, "grad_norm": 0.18992052776084958, "learning_rate": 5e-06, "loss": 0.7343, "step": 4230 }, { "epoch": 2.166581502299438, "grad_norm": 0.18438525645647527, "learning_rate": 5e-06, "loss": 0.7094, "step": 4240 }, { "epoch": 2.171691364333163, "grad_norm": 0.19529306952253045, "learning_rate": 5e-06, "loss": 0.7309, "step": 4250 }, { "epoch": 2.176801226366888, "grad_norm": 0.19540047885297857, "learning_rate": 5e-06, "loss": 0.7128, "step": 4260 }, { "epoch": 2.181911088400613, "grad_norm": 0.18411641667621176, "learning_rate": 5e-06, "loss": 0.7225, "step": 4270 }, { "epoch": 2.1870209504343383, "grad_norm": 0.19493918166759708, "learning_rate": 5e-06, "loss": 0.723, "step": 4280 }, { "epoch": 2.192130812468063, "grad_norm": 0.1843201705851165, "learning_rate": 5e-06, "loss": 0.7133, "step": 4290 }, { "epoch": 2.1972406745017885, "grad_norm": 0.1945434906423164, "learning_rate": 5e-06, "loss": 0.7196, "step": 4300 }, { "epoch": 2.2023505365355134, "grad_norm": 0.19008929161664118, "learning_rate": 5e-06, "loss": 0.7224, "step": 4310 }, { "epoch": 2.2074603985692387, "grad_norm": 0.18391558057092405, "learning_rate": 5e-06, "loss": 0.723, "step": 4320 }, { "epoch": 2.2125702606029636, "grad_norm": 0.19348061164149957, "learning_rate": 5e-06, "loss": 0.7331, "step": 4330 }, { "epoch": 2.217680122636689, "grad_norm": 0.18607054173383442, "learning_rate": 5e-06, "loss": 0.7244, "step": 4340 }, { "epoch": 2.2227899846704138, "grad_norm": 0.18618780601208676, "learning_rate": 5e-06, "loss": 0.7272, "step": 4350 }, { "epoch": 2.227899846704139, "grad_norm": 0.17823884880532717, "learning_rate": 5e-06, "loss": 0.7091, "step": 4360 }, { "epoch": 2.233009708737864, "grad_norm": 0.18561422862650115, "learning_rate": 5e-06, "loss": 0.7205, "step": 4370 }, { "epoch": 2.2381195707715893, "grad_norm": 0.17953482905796772, "learning_rate": 5e-06, "loss": 0.729, "step": 4380 }, { "epoch": 2.243229432805314, "grad_norm": 0.18191363924178688, "learning_rate": 5e-06, "loss": 0.7111, "step": 4390 }, { "epoch": 2.2483392948390395, "grad_norm": 0.180435799688224, "learning_rate": 5e-06, "loss": 0.7107, "step": 4400 }, { "epoch": 2.2534491568727644, "grad_norm": 0.18960270587770217, "learning_rate": 5e-06, "loss": 0.725, "step": 4410 }, { "epoch": 2.2585590189064897, "grad_norm": 0.17769968679729267, "learning_rate": 5e-06, "loss": 0.7143, "step": 4420 }, { "epoch": 2.2636688809402146, "grad_norm": 0.19163278997385685, "learning_rate": 5e-06, "loss": 0.7182, "step": 4430 }, { "epoch": 2.26877874297394, "grad_norm": 0.1940788858796441, "learning_rate": 5e-06, "loss": 0.7366, "step": 4440 }, { "epoch": 2.2738886050076648, "grad_norm": 0.1776800674678045, "learning_rate": 5e-06, "loss": 0.7069, "step": 4450 }, { "epoch": 2.27899846704139, "grad_norm": 0.17843289642160187, "learning_rate": 5e-06, "loss": 0.724, "step": 4460 }, { "epoch": 2.284108329075115, "grad_norm": 0.18057741833938729, "learning_rate": 5e-06, "loss": 0.7137, "step": 4470 }, { "epoch": 2.28921819110884, "grad_norm": 0.1866133357180047, "learning_rate": 5e-06, "loss": 0.7166, "step": 4480 }, { "epoch": 2.294328053142565, "grad_norm": 0.18193348825294622, "learning_rate": 5e-06, "loss": 0.7244, "step": 4490 }, { "epoch": 2.2994379151762905, "grad_norm": 0.1929190761683958, "learning_rate": 5e-06, "loss": 0.7277, "step": 4500 }, { "epoch": 2.3045477772100154, "grad_norm": 0.19392177452359835, "learning_rate": 5e-06, "loss": 0.7198, "step": 4510 }, { "epoch": 2.3096576392437402, "grad_norm": 0.18143577291270357, "learning_rate": 5e-06, "loss": 0.728, "step": 4520 }, { "epoch": 2.3147675012774656, "grad_norm": 0.19443872804506757, "learning_rate": 5e-06, "loss": 0.7272, "step": 4530 }, { "epoch": 2.3198773633111904, "grad_norm": 0.1851328945489432, "learning_rate": 5e-06, "loss": 0.7234, "step": 4540 }, { "epoch": 2.3249872253449158, "grad_norm": 0.19038110214278162, "learning_rate": 5e-06, "loss": 0.713, "step": 4550 }, { "epoch": 2.3300970873786406, "grad_norm": 0.1874741106018047, "learning_rate": 5e-06, "loss": 0.7161, "step": 4560 }, { "epoch": 2.335206949412366, "grad_norm": 0.19581977792981697, "learning_rate": 5e-06, "loss": 0.7162, "step": 4570 }, { "epoch": 2.340316811446091, "grad_norm": 0.17541497625191085, "learning_rate": 5e-06, "loss": 0.723, "step": 4580 }, { "epoch": 2.345426673479816, "grad_norm": 0.20023862697490177, "learning_rate": 5e-06, "loss": 0.7185, "step": 4590 }, { "epoch": 2.350536535513541, "grad_norm": 0.1959974413991938, "learning_rate": 5e-06, "loss": 0.7088, "step": 4600 }, { "epoch": 2.3556463975472663, "grad_norm": 0.18473631827351808, "learning_rate": 5e-06, "loss": 0.719, "step": 4610 }, { "epoch": 2.3607562595809912, "grad_norm": 0.19338189496040809, "learning_rate": 5e-06, "loss": 0.7239, "step": 4620 }, { "epoch": 2.3658661216147165, "grad_norm": 0.19066355345819264, "learning_rate": 5e-06, "loss": 0.7266, "step": 4630 }, { "epoch": 2.3709759836484414, "grad_norm": 0.1883524242314222, "learning_rate": 5e-06, "loss": 0.7186, "step": 4640 }, { "epoch": 2.3760858456821667, "grad_norm": 0.18502939250064523, "learning_rate": 5e-06, "loss": 0.7199, "step": 4650 }, { "epoch": 2.3811957077158916, "grad_norm": 0.1792178984338457, "learning_rate": 5e-06, "loss": 0.714, "step": 4660 }, { "epoch": 2.386305569749617, "grad_norm": 0.18226944439325007, "learning_rate": 5e-06, "loss": 0.7041, "step": 4670 }, { "epoch": 2.391415431783342, "grad_norm": 0.18015473190902764, "learning_rate": 5e-06, "loss": 0.716, "step": 4680 }, { "epoch": 2.396525293817067, "grad_norm": 0.18115429600775665, "learning_rate": 5e-06, "loss": 0.7086, "step": 4690 }, { "epoch": 2.401635155850792, "grad_norm": 0.2043473041616948, "learning_rate": 5e-06, "loss": 0.7165, "step": 4700 }, { "epoch": 2.406745017884517, "grad_norm": 0.20268566073488395, "learning_rate": 5e-06, "loss": 0.7153, "step": 4710 }, { "epoch": 2.411854879918242, "grad_norm": 0.19488489934465453, "learning_rate": 5e-06, "loss": 0.7261, "step": 4720 }, { "epoch": 2.4169647419519675, "grad_norm": 0.1869426176451986, "learning_rate": 5e-06, "loss": 0.6977, "step": 4730 }, { "epoch": 2.4220746039856924, "grad_norm": 0.19817580694708986, "learning_rate": 5e-06, "loss": 0.71, "step": 4740 }, { "epoch": 2.4271844660194173, "grad_norm": 0.2099570721696579, "learning_rate": 5e-06, "loss": 0.7204, "step": 4750 }, { "epoch": 2.4322943280531426, "grad_norm": 0.1964657272176851, "learning_rate": 5e-06, "loss": 0.7104, "step": 4760 }, { "epoch": 2.4374041900868675, "grad_norm": 0.1886905609835038, "learning_rate": 5e-06, "loss": 0.7085, "step": 4770 }, { "epoch": 2.442514052120593, "grad_norm": 0.18220683381465547, "learning_rate": 5e-06, "loss": 0.722, "step": 4780 }, { "epoch": 2.4476239141543177, "grad_norm": 0.18487998134947345, "learning_rate": 5e-06, "loss": 0.7112, "step": 4790 }, { "epoch": 2.452733776188043, "grad_norm": 0.16830917755794478, "learning_rate": 5e-06, "loss": 0.7157, "step": 4800 }, { "epoch": 2.457843638221768, "grad_norm": 0.18354244943334422, "learning_rate": 5e-06, "loss": 0.7152, "step": 4810 }, { "epoch": 2.462953500255493, "grad_norm": 0.18986840496033355, "learning_rate": 5e-06, "loss": 0.7161, "step": 4820 }, { "epoch": 2.468063362289218, "grad_norm": 0.1826318712937385, "learning_rate": 5e-06, "loss": 0.7295, "step": 4830 }, { "epoch": 2.4731732243229434, "grad_norm": 0.18527008923688568, "learning_rate": 5e-06, "loss": 0.7115, "step": 4840 }, { "epoch": 2.4782830863566683, "grad_norm": 0.18224411387144135, "learning_rate": 5e-06, "loss": 0.7173, "step": 4850 }, { "epoch": 2.4833929483903936, "grad_norm": 0.17811035407666295, "learning_rate": 5e-06, "loss": 0.7095, "step": 4860 }, { "epoch": 2.4885028104241185, "grad_norm": 0.18261169308001168, "learning_rate": 5e-06, "loss": 0.7162, "step": 4870 }, { "epoch": 2.493612672457844, "grad_norm": 0.2002887514635378, "learning_rate": 5e-06, "loss": 0.7208, "step": 4880 }, { "epoch": 2.4987225344915687, "grad_norm": 0.1869680657646229, "learning_rate": 5e-06, "loss": 0.7081, "step": 4890 }, { "epoch": 2.5038323965252935, "grad_norm": 0.18675570886820952, "learning_rate": 5e-06, "loss": 0.7117, "step": 4900 }, { "epoch": 2.508942258559019, "grad_norm": 0.19944201285579877, "learning_rate": 5e-06, "loss": 0.7198, "step": 4910 }, { "epoch": 2.514052120592744, "grad_norm": 0.18956975903685294, "learning_rate": 5e-06, "loss": 0.7179, "step": 4920 }, { "epoch": 2.519161982626469, "grad_norm": 0.1942120087148544, "learning_rate": 5e-06, "loss": 0.7216, "step": 4930 }, { "epoch": 2.524271844660194, "grad_norm": 0.18623164151592378, "learning_rate": 5e-06, "loss": 0.6997, "step": 4940 }, { "epoch": 2.5293817066939193, "grad_norm": 0.19081196748322382, "learning_rate": 5e-06, "loss": 0.7091, "step": 4950 }, { "epoch": 2.5344915687276446, "grad_norm": 0.191889152108264, "learning_rate": 5e-06, "loss": 0.7122, "step": 4960 }, { "epoch": 2.5396014307613695, "grad_norm": 0.19018109192168367, "learning_rate": 5e-06, "loss": 0.7195, "step": 4970 }, { "epoch": 2.5447112927950943, "grad_norm": 0.18029975730571046, "learning_rate": 5e-06, "loss": 0.7067, "step": 4980 }, { "epoch": 2.5498211548288197, "grad_norm": 0.19081905131548996, "learning_rate": 5e-06, "loss": 0.7184, "step": 4990 }, { "epoch": 2.554931016862545, "grad_norm": 0.18184068561404432, "learning_rate": 5e-06, "loss": 0.711, "step": 5000 }, { "epoch": 2.56004087889627, "grad_norm": 0.18141265501857734, "learning_rate": 5e-06, "loss": 0.7298, "step": 5010 }, { "epoch": 2.5651507409299947, "grad_norm": 0.19059042586600472, "learning_rate": 5e-06, "loss": 0.7114, "step": 5020 }, { "epoch": 2.57026060296372, "grad_norm": 0.18752747662093455, "learning_rate": 5e-06, "loss": 0.7139, "step": 5030 }, { "epoch": 2.575370464997445, "grad_norm": 0.18531393933159526, "learning_rate": 5e-06, "loss": 0.7152, "step": 5040 }, { "epoch": 2.5804803270311703, "grad_norm": 0.2020545252464496, "learning_rate": 5e-06, "loss": 0.724, "step": 5050 }, { "epoch": 2.585590189064895, "grad_norm": 0.1817661308445167, "learning_rate": 5e-06, "loss": 0.7099, "step": 5060 }, { "epoch": 2.5907000510986204, "grad_norm": 0.1874710204305083, "learning_rate": 5e-06, "loss": 0.727, "step": 5070 }, { "epoch": 2.5958099131323453, "grad_norm": 0.19697140144619885, "learning_rate": 5e-06, "loss": 0.7184, "step": 5080 }, { "epoch": 2.6009197751660706, "grad_norm": 0.1889733854232041, "learning_rate": 5e-06, "loss": 0.7169, "step": 5090 }, { "epoch": 2.6060296371997955, "grad_norm": 0.20000474912796498, "learning_rate": 5e-06, "loss": 0.7089, "step": 5100 }, { "epoch": 2.611139499233521, "grad_norm": 0.18020813610110156, "learning_rate": 5e-06, "loss": 0.7264, "step": 5110 }, { "epoch": 2.6162493612672457, "grad_norm": 0.17876620180082428, "learning_rate": 5e-06, "loss": 0.7007, "step": 5120 }, { "epoch": 2.6213592233009706, "grad_norm": 0.186937820916383, "learning_rate": 5e-06, "loss": 0.7219, "step": 5130 }, { "epoch": 2.626469085334696, "grad_norm": 0.19293145357331443, "learning_rate": 5e-06, "loss": 0.7116, "step": 5140 }, { "epoch": 2.6315789473684212, "grad_norm": 0.18779972078705487, "learning_rate": 5e-06, "loss": 0.7147, "step": 5150 }, { "epoch": 2.636688809402146, "grad_norm": 0.2004320087337195, "learning_rate": 5e-06, "loss": 0.7046, "step": 5160 }, { "epoch": 2.641798671435871, "grad_norm": 0.18155864298582336, "learning_rate": 5e-06, "loss": 0.7024, "step": 5170 }, { "epoch": 2.6469085334695963, "grad_norm": 0.18766505517066065, "learning_rate": 5e-06, "loss": 0.7092, "step": 5180 }, { "epoch": 2.6520183955033216, "grad_norm": 0.18241808648535346, "learning_rate": 5e-06, "loss": 0.7231, "step": 5190 }, { "epoch": 2.6571282575370465, "grad_norm": 0.1956290063678602, "learning_rate": 5e-06, "loss": 0.7071, "step": 5200 }, { "epoch": 2.6622381195707714, "grad_norm": 0.19687281837098963, "learning_rate": 5e-06, "loss": 0.7237, "step": 5210 }, { "epoch": 2.6673479816044967, "grad_norm": 0.18253071868952309, "learning_rate": 5e-06, "loss": 0.7085, "step": 5220 }, { "epoch": 2.672457843638222, "grad_norm": 0.18783632566318903, "learning_rate": 5e-06, "loss": 0.717, "step": 5230 }, { "epoch": 2.677567705671947, "grad_norm": 0.1856712541693123, "learning_rate": 5e-06, "loss": 0.7121, "step": 5240 }, { "epoch": 2.682677567705672, "grad_norm": 0.17913630024093868, "learning_rate": 5e-06, "loss": 0.7141, "step": 5250 }, { "epoch": 2.687787429739397, "grad_norm": 0.1814607328558563, "learning_rate": 5e-06, "loss": 0.7143, "step": 5260 }, { "epoch": 2.692897291773122, "grad_norm": 0.20516860190923697, "learning_rate": 5e-06, "loss": 0.7153, "step": 5270 }, { "epoch": 2.6980071538068473, "grad_norm": 0.18691014833866346, "learning_rate": 5e-06, "loss": 0.7021, "step": 5280 }, { "epoch": 2.703117015840572, "grad_norm": 0.19243627032023453, "learning_rate": 5e-06, "loss": 0.7198, "step": 5290 }, { "epoch": 2.7082268778742975, "grad_norm": 0.1782810408332171, "learning_rate": 5e-06, "loss": 0.7144, "step": 5300 }, { "epoch": 2.7133367399080224, "grad_norm": 0.18246834780709842, "learning_rate": 5e-06, "loss": 0.7073, "step": 5310 }, { "epoch": 2.7184466019417477, "grad_norm": 0.19188458416675994, "learning_rate": 5e-06, "loss": 0.7033, "step": 5320 }, { "epoch": 2.7235564639754726, "grad_norm": 0.19431224425803312, "learning_rate": 5e-06, "loss": 0.7121, "step": 5330 }, { "epoch": 2.728666326009198, "grad_norm": 0.19472680477435397, "learning_rate": 5e-06, "loss": 0.7204, "step": 5340 }, { "epoch": 2.7337761880429228, "grad_norm": 0.17416333272267362, "learning_rate": 5e-06, "loss": 0.722, "step": 5350 }, { "epoch": 2.738886050076648, "grad_norm": 0.17997245175531543, "learning_rate": 5e-06, "loss": 0.7209, "step": 5360 }, { "epoch": 2.743995912110373, "grad_norm": 0.1818254672253798, "learning_rate": 5e-06, "loss": 0.7064, "step": 5370 }, { "epoch": 2.7491057741440983, "grad_norm": 0.19143052614657435, "learning_rate": 5e-06, "loss": 0.7251, "step": 5380 }, { "epoch": 2.754215636177823, "grad_norm": 0.1849449249230007, "learning_rate": 5e-06, "loss": 0.7134, "step": 5390 }, { "epoch": 2.759325498211548, "grad_norm": 0.18960800644635575, "learning_rate": 5e-06, "loss": 0.7132, "step": 5400 }, { "epoch": 2.7644353602452734, "grad_norm": 0.18450711551330742, "learning_rate": 5e-06, "loss": 0.7218, "step": 5410 }, { "epoch": 2.7695452222789987, "grad_norm": 0.1775798100347251, "learning_rate": 5e-06, "loss": 0.7246, "step": 5420 }, { "epoch": 2.7746550843127236, "grad_norm": 0.19385869953809406, "learning_rate": 5e-06, "loss": 0.7154, "step": 5430 }, { "epoch": 2.7797649463464484, "grad_norm": 0.18947315882216123, "learning_rate": 5e-06, "loss": 0.7146, "step": 5440 }, { "epoch": 2.7848748083801738, "grad_norm": 0.19874490028355687, "learning_rate": 5e-06, "loss": 0.7098, "step": 5450 }, { "epoch": 2.789984670413899, "grad_norm": 0.19216650910265412, "learning_rate": 5e-06, "loss": 0.7157, "step": 5460 }, { "epoch": 2.795094532447624, "grad_norm": 0.1884532290700664, "learning_rate": 5e-06, "loss": 0.73, "step": 5470 }, { "epoch": 2.800204394481349, "grad_norm": 0.178944069456414, "learning_rate": 5e-06, "loss": 0.7023, "step": 5480 }, { "epoch": 2.805314256515074, "grad_norm": 0.19897724638804584, "learning_rate": 5e-06, "loss": 0.7266, "step": 5490 }, { "epoch": 2.810424118548799, "grad_norm": 0.19676207860709408, "learning_rate": 5e-06, "loss": 0.7042, "step": 5500 }, { "epoch": 2.8155339805825244, "grad_norm": 0.17311366117436, "learning_rate": 5e-06, "loss": 0.7059, "step": 5510 }, { "epoch": 2.8206438426162492, "grad_norm": 0.1862348663291006, "learning_rate": 5e-06, "loss": 0.7012, "step": 5520 }, { "epoch": 2.8257537046499746, "grad_norm": 0.19532852701482903, "learning_rate": 5e-06, "loss": 0.7105, "step": 5530 }, { "epoch": 2.8308635666836994, "grad_norm": 0.18410412264007187, "learning_rate": 5e-06, "loss": 0.7163, "step": 5540 }, { "epoch": 2.8359734287174247, "grad_norm": 0.21025902958863738, "learning_rate": 5e-06, "loss": 0.7359, "step": 5550 }, { "epoch": 2.8410832907511496, "grad_norm": 0.17818540055082727, "learning_rate": 5e-06, "loss": 0.7111, "step": 5560 }, { "epoch": 2.846193152784875, "grad_norm": 0.1842948913149892, "learning_rate": 5e-06, "loss": 0.7167, "step": 5570 }, { "epoch": 2.8513030148186, "grad_norm": 0.18686037650976978, "learning_rate": 5e-06, "loss": 0.7138, "step": 5580 }, { "epoch": 2.856412876852325, "grad_norm": 0.203403946822995, "learning_rate": 5e-06, "loss": 0.6996, "step": 5590 }, { "epoch": 2.86152273888605, "grad_norm": 0.19304193259535427, "learning_rate": 5e-06, "loss": 0.7184, "step": 5600 }, { "epoch": 2.8666326009197753, "grad_norm": 0.17636221017029402, "learning_rate": 5e-06, "loss": 0.6989, "step": 5610 }, { "epoch": 2.8717424629535, "grad_norm": 0.2037376472703345, "learning_rate": 5e-06, "loss": 0.7179, "step": 5620 }, { "epoch": 2.876852324987225, "grad_norm": 0.19200332931870842, "learning_rate": 5e-06, "loss": 0.7139, "step": 5630 }, { "epoch": 2.8819621870209504, "grad_norm": 0.18491595431761487, "learning_rate": 5e-06, "loss": 0.7128, "step": 5640 }, { "epoch": 2.8870720490546757, "grad_norm": 0.1838785582117868, "learning_rate": 5e-06, "loss": 0.6955, "step": 5650 }, { "epoch": 2.8921819110884006, "grad_norm": 0.19428058651152774, "learning_rate": 5e-06, "loss": 0.7102, "step": 5660 }, { "epoch": 2.8972917731221255, "grad_norm": 0.17936626194327465, "learning_rate": 5e-06, "loss": 0.7078, "step": 5670 }, { "epoch": 2.902401635155851, "grad_norm": 0.17631468596491473, "learning_rate": 5e-06, "loss": 0.7162, "step": 5680 }, { "epoch": 2.907511497189576, "grad_norm": 0.18415745158446212, "learning_rate": 5e-06, "loss": 0.7233, "step": 5690 }, { "epoch": 2.912621359223301, "grad_norm": 0.17279113129182289, "learning_rate": 5e-06, "loss": 0.6992, "step": 5700 }, { "epoch": 2.917731221257026, "grad_norm": 0.19005862366918605, "learning_rate": 5e-06, "loss": 0.7015, "step": 5710 }, { "epoch": 2.922841083290751, "grad_norm": 0.1865410089900959, "learning_rate": 5e-06, "loss": 0.7043, "step": 5720 }, { "epoch": 2.927950945324476, "grad_norm": 0.19166084388796084, "learning_rate": 5e-06, "loss": 0.7212, "step": 5730 }, { "epoch": 2.9330608073582014, "grad_norm": 0.19183886093470404, "learning_rate": 5e-06, "loss": 0.7117, "step": 5740 }, { "epoch": 2.9381706693919263, "grad_norm": 0.18542747500934698, "learning_rate": 5e-06, "loss": 0.7031, "step": 5750 }, { "epoch": 2.9432805314256516, "grad_norm": 0.19800100325472195, "learning_rate": 5e-06, "loss": 0.7003, "step": 5760 }, { "epoch": 2.9483903934593765, "grad_norm": 0.19068782203027565, "learning_rate": 5e-06, "loss": 0.7157, "step": 5770 }, { "epoch": 2.953500255493102, "grad_norm": 0.1893011994562663, "learning_rate": 5e-06, "loss": 0.7049, "step": 5780 }, { "epoch": 2.9586101175268267, "grad_norm": 0.17894632421886825, "learning_rate": 5e-06, "loss": 0.721, "step": 5790 }, { "epoch": 2.963719979560552, "grad_norm": 0.18749644947518349, "learning_rate": 5e-06, "loss": 0.7245, "step": 5800 }, { "epoch": 2.968829841594277, "grad_norm": 0.18920042895955116, "learning_rate": 5e-06, "loss": 0.7086, "step": 5810 }, { "epoch": 2.973939703628002, "grad_norm": 0.18597523756708192, "learning_rate": 5e-06, "loss": 0.7143, "step": 5820 }, { "epoch": 2.979049565661727, "grad_norm": 0.18741696462849414, "learning_rate": 5e-06, "loss": 0.7195, "step": 5830 }, { "epoch": 2.9841594276954524, "grad_norm": 0.18985455877786367, "learning_rate": 5e-06, "loss": 0.7091, "step": 5840 }, { "epoch": 2.9892692897291773, "grad_norm": 0.18003618910403507, "learning_rate": 5e-06, "loss": 0.7111, "step": 5850 }, { "epoch": 2.994379151762902, "grad_norm": 0.19511416723954572, "learning_rate": 5e-06, "loss": 0.7105, "step": 5860 }, { "epoch": 2.9994890137966275, "grad_norm": 0.18652462173874393, "learning_rate": 5e-06, "loss": 0.7294, "step": 5870 }, { "epoch": 3.0, "step": 5871, "total_flos": 2506240622592000.0, "train_loss": 0.754048796695819, "train_runtime": 27383.3094, "train_samples_per_second": 109.726, "train_steps_per_second": 0.214 } ], "logging_steps": 10, "max_steps": 5871, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2506240622592000.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }