InfiMed-SFT-3B / trainer_state.json
Zeyu077's picture
Upload 15 files
c323206 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.8972542072630647,
"eval_steps": 2000,
"global_step": 22000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00354295837023915,
"grad_norm": 10.283039659237906,
"learning_rate": 3.5423308537017364e-08,
"loss": 1.5344,
"step": 20
},
{
"epoch": 0.0070859167404783,
"grad_norm": 10.29248009558218,
"learning_rate": 7.084661707403473e-08,
"loss": 1.4742,
"step": 40
},
{
"epoch": 0.010628875110717449,
"grad_norm": 9.936150778118938,
"learning_rate": 1.0626992561105209e-07,
"loss": 1.4303,
"step": 60
},
{
"epoch": 0.0141718334809566,
"grad_norm": 10.764517644587348,
"learning_rate": 1.4169323414806946e-07,
"loss": 1.41,
"step": 80
},
{
"epoch": 0.01771479185119575,
"grad_norm": 7.66807491548178,
"learning_rate": 1.7711654268508678e-07,
"loss": 1.3917,
"step": 100
},
{
"epoch": 0.021257750221434897,
"grad_norm": 4.50717632446115,
"learning_rate": 2.1253985122210417e-07,
"loss": 1.4334,
"step": 120
},
{
"epoch": 0.024800708591674048,
"grad_norm": 5.252802352230719,
"learning_rate": 2.479631597591215e-07,
"loss": 1.3535,
"step": 140
},
{
"epoch": 0.0283436669619132,
"grad_norm": 6.854461866498012,
"learning_rate": 2.833864682961389e-07,
"loss": 1.327,
"step": 160
},
{
"epoch": 0.03188662533215235,
"grad_norm": 5.151839746898103,
"learning_rate": 3.188097768331563e-07,
"loss": 1.3742,
"step": 180
},
{
"epoch": 0.0354295837023915,
"grad_norm": 5.045147162247824,
"learning_rate": 3.5423308537017355e-07,
"loss": 1.3332,
"step": 200
},
{
"epoch": 0.03897254207263064,
"grad_norm": 4.683603653905373,
"learning_rate": 3.89656393907191e-07,
"loss": 1.2738,
"step": 220
},
{
"epoch": 0.042515500442869794,
"grad_norm": 3.3123783143845853,
"learning_rate": 4.2507970244420835e-07,
"loss": 1.3093,
"step": 240
},
{
"epoch": 0.046058458813108945,
"grad_norm": 7.022433229508388,
"learning_rate": 4.605030109812257e-07,
"loss": 1.3152,
"step": 260
},
{
"epoch": 0.049601417183348095,
"grad_norm": 4.657117511243846,
"learning_rate": 4.95926319518243e-07,
"loss": 1.2072,
"step": 280
},
{
"epoch": 0.053144375553587246,
"grad_norm": 3.8212763356692094,
"learning_rate": 5.313496280552604e-07,
"loss": 1.2571,
"step": 300
},
{
"epoch": 0.0566873339238264,
"grad_norm": 3.510893973188609,
"learning_rate": 5.667729365922778e-07,
"loss": 1.2607,
"step": 320
},
{
"epoch": 0.06023029229406555,
"grad_norm": 5.0086319180776195,
"learning_rate": 6.021962451292952e-07,
"loss": 1.197,
"step": 340
},
{
"epoch": 0.0637732506643047,
"grad_norm": 3.431634184610715,
"learning_rate": 6.376195536663126e-07,
"loss": 1.2377,
"step": 360
},
{
"epoch": 0.06731620903454384,
"grad_norm": 6.75935217841033,
"learning_rate": 6.730428622033298e-07,
"loss": 1.1773,
"step": 380
},
{
"epoch": 0.070859167404783,
"grad_norm": 4.209102695925099,
"learning_rate": 7.084661707403471e-07,
"loss": 1.219,
"step": 400
},
{
"epoch": 0.07440212577502214,
"grad_norm": 3.49834326636571,
"learning_rate": 7.438894792773646e-07,
"loss": 1.1681,
"step": 420
},
{
"epoch": 0.07794508414526129,
"grad_norm": 6.9169037458817515,
"learning_rate": 7.79312787814382e-07,
"loss": 1.159,
"step": 440
},
{
"epoch": 0.08148804251550044,
"grad_norm": 4.117418143738441,
"learning_rate": 8.147360963513992e-07,
"loss": 1.1361,
"step": 460
},
{
"epoch": 0.08503100088573959,
"grad_norm": 3.878686980846809,
"learning_rate": 8.501594048884167e-07,
"loss": 1.1408,
"step": 480
},
{
"epoch": 0.08857395925597875,
"grad_norm": 2.4629212636233513,
"learning_rate": 8.85582713425434e-07,
"loss": 1.1273,
"step": 500
},
{
"epoch": 0.09211691762621789,
"grad_norm": 3.5335784704034263,
"learning_rate": 9.210060219624514e-07,
"loss": 1.1072,
"step": 520
},
{
"epoch": 0.09565987599645705,
"grad_norm": 2.993397755869922,
"learning_rate": 9.564293304994688e-07,
"loss": 1.1374,
"step": 540
},
{
"epoch": 0.09920283436669619,
"grad_norm": 3.3645219648004074,
"learning_rate": 9.91852639036486e-07,
"loss": 1.0846,
"step": 560
},
{
"epoch": 0.10274579273693533,
"grad_norm": 3.5598287380775657,
"learning_rate": 1.0272759475735035e-06,
"loss": 1.1543,
"step": 580
},
{
"epoch": 0.10628875110717449,
"grad_norm": 5.756341944728728,
"learning_rate": 1.0626992561105207e-06,
"loss": 1.1218,
"step": 600
},
{
"epoch": 0.10983170947741364,
"grad_norm": 4.622099244697303,
"learning_rate": 1.098122564647538e-06,
"loss": 1.1217,
"step": 620
},
{
"epoch": 0.1133746678476528,
"grad_norm": 4.036105893234525,
"learning_rate": 1.1335458731845557e-06,
"loss": 1.1389,
"step": 640
},
{
"epoch": 0.11691762621789194,
"grad_norm": 4.152917559749059,
"learning_rate": 1.1689691817215728e-06,
"loss": 1.1605,
"step": 660
},
{
"epoch": 0.1204605845881311,
"grad_norm": 2.353928725309983,
"learning_rate": 1.2043924902585904e-06,
"loss": 1.1468,
"step": 680
},
{
"epoch": 0.12400354295837024,
"grad_norm": 3.628879073452012,
"learning_rate": 1.2398157987956076e-06,
"loss": 1.1481,
"step": 700
},
{
"epoch": 0.1275465013286094,
"grad_norm": 4.198170873976199,
"learning_rate": 1.2752391073326251e-06,
"loss": 1.1093,
"step": 720
},
{
"epoch": 0.13108945969884853,
"grad_norm": 3.269113557624318,
"learning_rate": 1.3106624158696423e-06,
"loss": 1.102,
"step": 740
},
{
"epoch": 0.13463241806908768,
"grad_norm": 4.528204224506456,
"learning_rate": 1.3460857244066597e-06,
"loss": 1.0894,
"step": 760
},
{
"epoch": 0.13817537643932684,
"grad_norm": 3.27137530368028,
"learning_rate": 1.381509032943677e-06,
"loss": 1.0747,
"step": 780
},
{
"epoch": 0.141718334809566,
"grad_norm": 7.233836600777667,
"learning_rate": 1.4169323414806942e-06,
"loss": 1.1337,
"step": 800
},
{
"epoch": 0.14526129317980513,
"grad_norm": 2.8025481056888815,
"learning_rate": 1.4523556500177118e-06,
"loss": 1.0662,
"step": 820
},
{
"epoch": 0.1488042515500443,
"grad_norm": 3.388696427420553,
"learning_rate": 1.4877789585547292e-06,
"loss": 1.0438,
"step": 840
},
{
"epoch": 0.15234720992028344,
"grad_norm": 4.710208067024261,
"learning_rate": 1.5232022670917465e-06,
"loss": 1.1523,
"step": 860
},
{
"epoch": 0.15589016829052257,
"grad_norm": 3.564554055568693,
"learning_rate": 1.558625575628764e-06,
"loss": 1.1362,
"step": 880
},
{
"epoch": 0.15943312666076173,
"grad_norm": 4.195782034527705,
"learning_rate": 1.594048884165781e-06,
"loss": 1.1197,
"step": 900
},
{
"epoch": 0.1629760850310009,
"grad_norm": 5.136529290856518,
"learning_rate": 1.6294721927027984e-06,
"loss": 1.0747,
"step": 920
},
{
"epoch": 0.16651904340124005,
"grad_norm": 3.0425557174200875,
"learning_rate": 1.664895501239816e-06,
"loss": 1.0078,
"step": 940
},
{
"epoch": 0.17006200177147918,
"grad_norm": 4.70753429709887,
"learning_rate": 1.7003188097768334e-06,
"loss": 1.0692,
"step": 960
},
{
"epoch": 0.17360496014171833,
"grad_norm": 2.352046892407463,
"learning_rate": 1.7357421183138505e-06,
"loss": 1.1146,
"step": 980
},
{
"epoch": 0.1771479185119575,
"grad_norm": 4.251351248901323,
"learning_rate": 1.771165426850868e-06,
"loss": 1.1281,
"step": 1000
},
{
"epoch": 0.18069087688219662,
"grad_norm": 3.1244602580086203,
"learning_rate": 1.8065887353878853e-06,
"loss": 1.0573,
"step": 1020
},
{
"epoch": 0.18423383525243578,
"grad_norm": 3.7998767293490014,
"learning_rate": 1.8420120439249029e-06,
"loss": 1.0572,
"step": 1040
},
{
"epoch": 0.18777679362267494,
"grad_norm": 4.226400750966905,
"learning_rate": 1.8774353524619202e-06,
"loss": 1.0907,
"step": 1060
},
{
"epoch": 0.1913197519929141,
"grad_norm": 8.680035093004253,
"learning_rate": 1.9128586609989376e-06,
"loss": 1.1316,
"step": 1080
},
{
"epoch": 0.19486271036315322,
"grad_norm": 5.6932973340030175,
"learning_rate": 1.9482819695359548e-06,
"loss": 1.0741,
"step": 1100
},
{
"epoch": 0.19840566873339238,
"grad_norm": 4.008840499431025,
"learning_rate": 1.983705278072972e-06,
"loss": 1.0599,
"step": 1120
},
{
"epoch": 0.20194862710363154,
"grad_norm": 4.7089915409873555,
"learning_rate": 2.0191285866099895e-06,
"loss": 1.1045,
"step": 1140
},
{
"epoch": 0.20549158547387067,
"grad_norm": 3.8841603187249665,
"learning_rate": 2.054551895147007e-06,
"loss": 1.1013,
"step": 1160
},
{
"epoch": 0.20903454384410983,
"grad_norm": 2.5908607447164256,
"learning_rate": 2.0899752036840243e-06,
"loss": 1.0688,
"step": 1180
},
{
"epoch": 0.21257750221434898,
"grad_norm": 4.285794532561674,
"learning_rate": 2.1253985122210414e-06,
"loss": 1.044,
"step": 1200
},
{
"epoch": 0.21612046058458814,
"grad_norm": 5.061151481127176,
"learning_rate": 2.160821820758059e-06,
"loss": 1.0987,
"step": 1220
},
{
"epoch": 0.21966341895482727,
"grad_norm": 2.4272321583338945,
"learning_rate": 2.196245129295076e-06,
"loss": 1.0697,
"step": 1240
},
{
"epoch": 0.22320637732506643,
"grad_norm": 3.516050346917228,
"learning_rate": 2.2316684378320937e-06,
"loss": 1.0373,
"step": 1260
},
{
"epoch": 0.2267493356953056,
"grad_norm": 5.174609559420662,
"learning_rate": 2.2670917463691113e-06,
"loss": 1.0212,
"step": 1280
},
{
"epoch": 0.23029229406554472,
"grad_norm": 5.096030335997553,
"learning_rate": 2.3025150549061285e-06,
"loss": 1.1042,
"step": 1300
},
{
"epoch": 0.23383525243578387,
"grad_norm": 3.21646646241324,
"learning_rate": 2.3379383634431456e-06,
"loss": 1.074,
"step": 1320
},
{
"epoch": 0.23737821080602303,
"grad_norm": 4.102524624460841,
"learning_rate": 2.3733616719801632e-06,
"loss": 1.1245,
"step": 1340
},
{
"epoch": 0.2409211691762622,
"grad_norm": 3.537479639297508,
"learning_rate": 2.408784980517181e-06,
"loss": 1.1179,
"step": 1360
},
{
"epoch": 0.24446412754650132,
"grad_norm": 3.886486819810854,
"learning_rate": 2.444208289054198e-06,
"loss": 1.0651,
"step": 1380
},
{
"epoch": 0.24800708591674048,
"grad_norm": 4.034282862676682,
"learning_rate": 2.479631597591215e-06,
"loss": 1.02,
"step": 1400
},
{
"epoch": 0.25155004428697963,
"grad_norm": 3.0945919872830663,
"learning_rate": 2.5150549061282327e-06,
"loss": 1.0985,
"step": 1420
},
{
"epoch": 0.2550930026572188,
"grad_norm": 3.634569400423284,
"learning_rate": 2.5504782146652503e-06,
"loss": 1.0805,
"step": 1440
},
{
"epoch": 0.25863596102745795,
"grad_norm": 3.920774204411743,
"learning_rate": 2.585901523202267e-06,
"loss": 1.0352,
"step": 1460
},
{
"epoch": 0.26217891939769705,
"grad_norm": 3.650545041007239,
"learning_rate": 2.6213248317392846e-06,
"loss": 1.0575,
"step": 1480
},
{
"epoch": 0.2657218777679362,
"grad_norm": 3.8026274989044793,
"learning_rate": 2.6567481402763018e-06,
"loss": 1.0806,
"step": 1500
},
{
"epoch": 0.26926483613817537,
"grad_norm": 4.1381541786166895,
"learning_rate": 2.6921714488133194e-06,
"loss": 1.0501,
"step": 1520
},
{
"epoch": 0.2728077945084145,
"grad_norm": 5.519250816332529,
"learning_rate": 2.727594757350337e-06,
"loss": 1.0583,
"step": 1540
},
{
"epoch": 0.2763507528786537,
"grad_norm": 4.595209023098072,
"learning_rate": 2.763018065887354e-06,
"loss": 1.0662,
"step": 1560
},
{
"epoch": 0.27989371124889284,
"grad_norm": 3.540003351520752,
"learning_rate": 2.7984413744243717e-06,
"loss": 1.0286,
"step": 1580
},
{
"epoch": 0.283436669619132,
"grad_norm": 3.4373968392712184,
"learning_rate": 2.8338646829613884e-06,
"loss": 1.0434,
"step": 1600
},
{
"epoch": 0.2869796279893711,
"grad_norm": 17.453104932319967,
"learning_rate": 2.869287991498406e-06,
"loss": 1.0223,
"step": 1620
},
{
"epoch": 0.29052258635961026,
"grad_norm": 2.6061492522441863,
"learning_rate": 2.9047113000354236e-06,
"loss": 1.0935,
"step": 1640
},
{
"epoch": 0.2940655447298494,
"grad_norm": 4.139323586910726,
"learning_rate": 2.9401346085724407e-06,
"loss": 1.0754,
"step": 1660
},
{
"epoch": 0.2976085031000886,
"grad_norm": 5.300424892826558,
"learning_rate": 2.9755579171094583e-06,
"loss": 1.0681,
"step": 1680
},
{
"epoch": 0.30115146147032773,
"grad_norm": 3.649885398648624,
"learning_rate": 3.0109812256464755e-06,
"loss": 1.0922,
"step": 1700
},
{
"epoch": 0.3046944198405669,
"grad_norm": 4.140426538668616,
"learning_rate": 3.046404534183493e-06,
"loss": 1.0644,
"step": 1720
},
{
"epoch": 0.30823737821080605,
"grad_norm": 2.175231115055194,
"learning_rate": 3.0818278427205106e-06,
"loss": 1.0494,
"step": 1740
},
{
"epoch": 0.31178033658104515,
"grad_norm": 3.028695259816387,
"learning_rate": 3.117251151257528e-06,
"loss": 1.0932,
"step": 1760
},
{
"epoch": 0.3153232949512843,
"grad_norm": 2.7850683084236394,
"learning_rate": 3.1526744597945454e-06,
"loss": 1.0287,
"step": 1780
},
{
"epoch": 0.31886625332152346,
"grad_norm": 2.933892885639913,
"learning_rate": 3.188097768331562e-06,
"loss": 1.0669,
"step": 1800
},
{
"epoch": 0.3224092116917626,
"grad_norm": 4.948366022661806,
"learning_rate": 3.2235210768685797e-06,
"loss": 1.0984,
"step": 1820
},
{
"epoch": 0.3259521700620018,
"grad_norm": 3.086993856127569,
"learning_rate": 3.258944385405597e-06,
"loss": 1.0421,
"step": 1840
},
{
"epoch": 0.32949512843224094,
"grad_norm": 4.135810740344135,
"learning_rate": 3.2943676939426144e-06,
"loss": 0.9316,
"step": 1860
},
{
"epoch": 0.3330380868024801,
"grad_norm": 2.7787248572400673,
"learning_rate": 3.329791002479632e-06,
"loss": 1.0354,
"step": 1880
},
{
"epoch": 0.3365810451727192,
"grad_norm": 6.6200330325040815,
"learning_rate": 3.365214311016649e-06,
"loss": 1.0825,
"step": 1900
},
{
"epoch": 0.34012400354295835,
"grad_norm": 3.9618002923502607,
"learning_rate": 3.4006376195536668e-06,
"loss": 1.0643,
"step": 1920
},
{
"epoch": 0.3436669619131975,
"grad_norm": 2.9478604282057987,
"learning_rate": 3.4360609280906835e-06,
"loss": 1.0942,
"step": 1940
},
{
"epoch": 0.34720992028343667,
"grad_norm": 3.1696939381732596,
"learning_rate": 3.471484236627701e-06,
"loss": 1.0848,
"step": 1960
},
{
"epoch": 0.3507528786536758,
"grad_norm": 2.610545027614052,
"learning_rate": 3.5069075451647187e-06,
"loss": 1.0424,
"step": 1980
},
{
"epoch": 0.354295837023915,
"grad_norm": 2.8653023342432844,
"learning_rate": 3.542330853701736e-06,
"loss": 1.0376,
"step": 2000
},
{
"epoch": 0.354295837023915,
"eval_loss": 0.9136635661125183,
"eval_runtime": 366.8623,
"eval_samples_per_second": 25.914,
"eval_steps_per_second": 3.241,
"step": 2000
},
{
"epoch": 0.35783879539415414,
"grad_norm": 5.501631748880912,
"learning_rate": 3.5777541622387534e-06,
"loss": 1.0794,
"step": 2020
},
{
"epoch": 0.36138175376439324,
"grad_norm": 3.9781584018724216,
"learning_rate": 3.6131774707757706e-06,
"loss": 1.0918,
"step": 2040
},
{
"epoch": 0.3649247121346324,
"grad_norm": 5.9653615606161035,
"learning_rate": 3.648600779312788e-06,
"loss": 1.0281,
"step": 2060
},
{
"epoch": 0.36846767050487156,
"grad_norm": 4.911079902501515,
"learning_rate": 3.6840240878498057e-06,
"loss": 1.0565,
"step": 2080
},
{
"epoch": 0.3720106288751107,
"grad_norm": 6.677202780526525,
"learning_rate": 3.719447396386823e-06,
"loss": 1.0622,
"step": 2100
},
{
"epoch": 0.3755535872453499,
"grad_norm": 2.9957559478511513,
"learning_rate": 3.7548707049238405e-06,
"loss": 1.014,
"step": 2120
},
{
"epoch": 0.37909654561558903,
"grad_norm": 6.136487010459827,
"learning_rate": 3.7902940134608572e-06,
"loss": 1.0463,
"step": 2140
},
{
"epoch": 0.3826395039858282,
"grad_norm": 2.6989289543608987,
"learning_rate": 3.825717321997875e-06,
"loss": 1.0334,
"step": 2160
},
{
"epoch": 0.3861824623560673,
"grad_norm": 2.8559280148544715,
"learning_rate": 3.861140630534892e-06,
"loss": 0.9961,
"step": 2180
},
{
"epoch": 0.38972542072630645,
"grad_norm": 3.9195236355689618,
"learning_rate": 3.8965639390719095e-06,
"loss": 1.0501,
"step": 2200
},
{
"epoch": 0.3932683790965456,
"grad_norm": 4.745715075717865,
"learning_rate": 3.9319872476089276e-06,
"loss": 1.0532,
"step": 2220
},
{
"epoch": 0.39681133746678476,
"grad_norm": 3.1678711880303365,
"learning_rate": 3.967410556145944e-06,
"loss": 1.0464,
"step": 2240
},
{
"epoch": 0.4003542958370239,
"grad_norm": 4.318491084289353,
"learning_rate": 4.002833864682962e-06,
"loss": 1.01,
"step": 2260
},
{
"epoch": 0.4038972542072631,
"grad_norm": 3.877214772420464,
"learning_rate": 4.038257173219979e-06,
"loss": 1.04,
"step": 2280
},
{
"epoch": 0.40744021257750224,
"grad_norm": 4.408726611386237,
"learning_rate": 4.073680481756996e-06,
"loss": 0.9864,
"step": 2300
},
{
"epoch": 0.41098317094774134,
"grad_norm": 3.1476639776400264,
"learning_rate": 4.109103790294014e-06,
"loss": 1.0471,
"step": 2320
},
{
"epoch": 0.4145261293179805,
"grad_norm": 4.057188755394368,
"learning_rate": 4.144527098831031e-06,
"loss": 1.0187,
"step": 2340
},
{
"epoch": 0.41806908768821965,
"grad_norm": 3.7443003760493547,
"learning_rate": 4.1799504073680485e-06,
"loss": 1.0207,
"step": 2360
},
{
"epoch": 0.4216120460584588,
"grad_norm": 3.4133153204439375,
"learning_rate": 4.215373715905066e-06,
"loss": 1.0412,
"step": 2380
},
{
"epoch": 0.42515500442869797,
"grad_norm": 5.271529700638458,
"learning_rate": 4.250797024442083e-06,
"loss": 1.0446,
"step": 2400
},
{
"epoch": 0.4286979627989371,
"grad_norm": 3.690737939017104,
"learning_rate": 4.286220332979101e-06,
"loss": 1.0893,
"step": 2420
},
{
"epoch": 0.4322409211691763,
"grad_norm": 4.6971388539053445,
"learning_rate": 4.321643641516118e-06,
"loss": 1.0552,
"step": 2440
},
{
"epoch": 0.4357838795394154,
"grad_norm": 3.833304687965468,
"learning_rate": 4.357066950053135e-06,
"loss": 1.018,
"step": 2460
},
{
"epoch": 0.43932683790965454,
"grad_norm": 3.876707930916304,
"learning_rate": 4.392490258590152e-06,
"loss": 1.0593,
"step": 2480
},
{
"epoch": 0.4428697962798937,
"grad_norm": 4.485093155652708,
"learning_rate": 4.42791356712717e-06,
"loss": 1.0315,
"step": 2500
},
{
"epoch": 0.44641275465013286,
"grad_norm": 3.240309715459973,
"learning_rate": 4.4633368756641875e-06,
"loss": 1.0228,
"step": 2520
},
{
"epoch": 0.449955713020372,
"grad_norm": 2.8577216948048085,
"learning_rate": 4.498760184201205e-06,
"loss": 0.9622,
"step": 2540
},
{
"epoch": 0.4534986713906112,
"grad_norm": 2.3204510234528004,
"learning_rate": 4.534183492738223e-06,
"loss": 1.0417,
"step": 2560
},
{
"epoch": 0.45704162976085033,
"grad_norm": 4.8495156054088655,
"learning_rate": 4.569606801275239e-06,
"loss": 1.0108,
"step": 2580
},
{
"epoch": 0.46058458813108943,
"grad_norm": 5.060714565551563,
"learning_rate": 4.605030109812257e-06,
"loss": 1.0303,
"step": 2600
},
{
"epoch": 0.4641275465013286,
"grad_norm": 6.095607076544207,
"learning_rate": 4.640453418349274e-06,
"loss": 1.0116,
"step": 2620
},
{
"epoch": 0.46767050487156775,
"grad_norm": 2.9232128503389183,
"learning_rate": 4.675876726886291e-06,
"loss": 0.983,
"step": 2640
},
{
"epoch": 0.4712134632418069,
"grad_norm": 2.36685887518906,
"learning_rate": 4.711300035423309e-06,
"loss": 1.0277,
"step": 2660
},
{
"epoch": 0.47475642161204606,
"grad_norm": 4.128356071985117,
"learning_rate": 4.7467233439603264e-06,
"loss": 1.0411,
"step": 2680
},
{
"epoch": 0.4782993799822852,
"grad_norm": 2.7297669503368804,
"learning_rate": 4.782146652497344e-06,
"loss": 1.0553,
"step": 2700
},
{
"epoch": 0.4818423383525244,
"grad_norm": 2.7607226135533103,
"learning_rate": 4.817569961034362e-06,
"loss": 1.0547,
"step": 2720
},
{
"epoch": 0.4853852967227635,
"grad_norm": 4.187021213318743,
"learning_rate": 4.852993269571378e-06,
"loss": 1.0437,
"step": 2740
},
{
"epoch": 0.48892825509300264,
"grad_norm": 3.7584391728566695,
"learning_rate": 4.888416578108396e-06,
"loss": 1.035,
"step": 2760
},
{
"epoch": 0.4924712134632418,
"grad_norm": 4.830912639451228,
"learning_rate": 4.923839886645413e-06,
"loss": 1.0675,
"step": 2780
},
{
"epoch": 0.49601417183348095,
"grad_norm": 4.685752123568836,
"learning_rate": 4.95926319518243e-06,
"loss": 1.0493,
"step": 2800
},
{
"epoch": 0.4995571302037201,
"grad_norm": 3.375680932358239,
"learning_rate": 4.994686503719448e-06,
"loss": 1.0686,
"step": 2820
},
{
"epoch": 0.5031000885739593,
"grad_norm": 2.4729024999298534,
"learning_rate": 4.999994474499561e-06,
"loss": 1.0283,
"step": 2840
},
{
"epoch": 0.5066430469441984,
"grad_norm": 4.50380809454446,
"learning_rate": 4.999973825606614e-06,
"loss": 1.0188,
"step": 2860
},
{
"epoch": 0.5101860053144376,
"grad_norm": 3.418425163716614,
"learning_rate": 4.999937881373025e-06,
"loss": 1.0617,
"step": 2880
},
{
"epoch": 0.5137289636846767,
"grad_norm": 3.6088000942292164,
"learning_rate": 4.999886642018707e-06,
"loss": 1.0723,
"step": 2900
},
{
"epoch": 0.5172719220549159,
"grad_norm": 6.244089618662361,
"learning_rate": 4.999820107857154e-06,
"loss": 1.0662,
"step": 2920
},
{
"epoch": 0.520814880425155,
"grad_norm": 2.609964349424898,
"learning_rate": 4.999738279295433e-06,
"loss": 1.0324,
"step": 2940
},
{
"epoch": 0.5243578387953941,
"grad_norm": 2.679897624387073,
"learning_rate": 4.9996411568341896e-06,
"loss": 1.0207,
"step": 2960
},
{
"epoch": 0.5279007971656333,
"grad_norm": 3.27573058708962,
"learning_rate": 4.999528741067638e-06,
"loss": 1.0939,
"step": 2980
},
{
"epoch": 0.5314437555358724,
"grad_norm": 3.3784052333905756,
"learning_rate": 4.99940103268356e-06,
"loss": 1.0193,
"step": 3000
},
{
"epoch": 0.5349867139061116,
"grad_norm": 4.924298481099745,
"learning_rate": 4.999258032463301e-06,
"loss": 1.1053,
"step": 3020
},
{
"epoch": 0.5385296722763507,
"grad_norm": 4.1853626057858895,
"learning_rate": 4.999099741281766e-06,
"loss": 1.0337,
"step": 3040
},
{
"epoch": 0.54207263064659,
"grad_norm": 3.5502444781095104,
"learning_rate": 4.998926160107411e-06,
"loss": 1.0786,
"step": 3060
},
{
"epoch": 0.545615589016829,
"grad_norm": 3.335300103709776,
"learning_rate": 4.998737290002241e-06,
"loss": 1.0507,
"step": 3080
},
{
"epoch": 0.5491585473870682,
"grad_norm": 3.114815124169259,
"learning_rate": 4.9985331321218e-06,
"loss": 1.0352,
"step": 3100
},
{
"epoch": 0.5527015057573074,
"grad_norm": 4.900399906908037,
"learning_rate": 4.998313687715169e-06,
"loss": 1.0244,
"step": 3120
},
{
"epoch": 0.5562444641275465,
"grad_norm": 2.853630801128127,
"learning_rate": 4.9980789581249515e-06,
"loss": 1.0552,
"step": 3140
},
{
"epoch": 0.5597874224977857,
"grad_norm": 3.824968946809653,
"learning_rate": 4.9978289447872695e-06,
"loss": 1.0109,
"step": 3160
},
{
"epoch": 0.5633303808680248,
"grad_norm": 3.6496584120718314,
"learning_rate": 4.997563649231755e-06,
"loss": 1.0097,
"step": 3180
},
{
"epoch": 0.566873339238264,
"grad_norm": 4.089881200621581,
"learning_rate": 4.997283073081541e-06,
"loss": 1.0687,
"step": 3200
},
{
"epoch": 0.5704162976085031,
"grad_norm": 4.25767273401204,
"learning_rate": 4.996987218053247e-06,
"loss": 1.0032,
"step": 3220
},
{
"epoch": 0.5739592559787422,
"grad_norm": 4.054242038282677,
"learning_rate": 4.996676085956973e-06,
"loss": 1.0109,
"step": 3240
},
{
"epoch": 0.5775022143489814,
"grad_norm": 2.2212311640306934,
"learning_rate": 4.996349678696288e-06,
"loss": 0.9873,
"step": 3260
},
{
"epoch": 0.5810451727192205,
"grad_norm": 2.910691796089737,
"learning_rate": 4.996007998268219e-06,
"loss": 1.0389,
"step": 3280
},
{
"epoch": 0.5845881310894597,
"grad_norm": 1.804064223013201,
"learning_rate": 4.995651046763232e-06,
"loss": 1.0065,
"step": 3300
},
{
"epoch": 0.5881310894596988,
"grad_norm": 1.787168345455913,
"learning_rate": 4.99527882636523e-06,
"loss": 0.9964,
"step": 3320
},
{
"epoch": 0.591674047829938,
"grad_norm": 2.6825449022104584,
"learning_rate": 4.99489133935153e-06,
"loss": 1.0113,
"step": 3340
},
{
"epoch": 0.5952170062001771,
"grad_norm": 3.3430457632929986,
"learning_rate": 4.9944885880928576e-06,
"loss": 1.0159,
"step": 3360
},
{
"epoch": 0.5987599645704162,
"grad_norm": 3.4188233454866777,
"learning_rate": 4.994070575053324e-06,
"loss": 1.0332,
"step": 3380
},
{
"epoch": 0.6023029229406555,
"grad_norm": 5.035300065424226,
"learning_rate": 4.993637302790417e-06,
"loss": 1.0072,
"step": 3400
},
{
"epoch": 0.6058458813108946,
"grad_norm": 2.4629577630265067,
"learning_rate": 4.9931887739549845e-06,
"loss": 1.0246,
"step": 3420
},
{
"epoch": 0.6093888396811338,
"grad_norm": 3.2222908571387605,
"learning_rate": 4.9927249912912135e-06,
"loss": 1.1202,
"step": 3440
},
{
"epoch": 0.6129317980513729,
"grad_norm": 3.329571486141443,
"learning_rate": 4.99224595763662e-06,
"loss": 1.0052,
"step": 3460
},
{
"epoch": 0.6164747564216121,
"grad_norm": 3.2262886392708023,
"learning_rate": 4.991751675922029e-06,
"loss": 1.0563,
"step": 3480
},
{
"epoch": 0.6200177147918512,
"grad_norm": 2.430384347609413,
"learning_rate": 4.991242149171554e-06,
"loss": 1.0084,
"step": 3500
},
{
"epoch": 0.6235606731620903,
"grad_norm": 2.2313346912151437,
"learning_rate": 4.990717380502581e-06,
"loss": 1.1098,
"step": 3520
},
{
"epoch": 0.6271036315323295,
"grad_norm": 3.269110632691307,
"learning_rate": 4.990177373125752e-06,
"loss": 0.9681,
"step": 3540
},
{
"epoch": 0.6306465899025686,
"grad_norm": 2.8574079693862213,
"learning_rate": 4.989622130344939e-06,
"loss": 1.0192,
"step": 3560
},
{
"epoch": 0.6341895482728078,
"grad_norm": 3.663783109216899,
"learning_rate": 4.989051655557228e-06,
"loss": 0.9997,
"step": 3580
},
{
"epoch": 0.6377325066430469,
"grad_norm": 3.3537695839211916,
"learning_rate": 4.9884659522528985e-06,
"loss": 0.9669,
"step": 3600
},
{
"epoch": 0.6412754650132861,
"grad_norm": 2.809218376275673,
"learning_rate": 4.987865024015401e-06,
"loss": 1.0155,
"step": 3620
},
{
"epoch": 0.6448184233835252,
"grad_norm": 5.035643249923235,
"learning_rate": 4.9872488745213356e-06,
"loss": 1.0125,
"step": 3640
},
{
"epoch": 0.6483613817537643,
"grad_norm": 3.6981075116685798,
"learning_rate": 4.986617507540426e-06,
"loss": 0.9861,
"step": 3660
},
{
"epoch": 0.6519043401240036,
"grad_norm": 2.5959218064380725,
"learning_rate": 4.985970926935504e-06,
"loss": 1.0936,
"step": 3680
},
{
"epoch": 0.6554472984942427,
"grad_norm": 2.72099024631383,
"learning_rate": 4.985309136662478e-06,
"loss": 1.0458,
"step": 3700
},
{
"epoch": 0.6589902568644819,
"grad_norm": 2.3741120569873937,
"learning_rate": 4.984632140770314e-06,
"loss": 0.9733,
"step": 3720
},
{
"epoch": 0.662533215234721,
"grad_norm": 2.9843690685487316,
"learning_rate": 4.983939943401009e-06,
"loss": 0.9865,
"step": 3740
},
{
"epoch": 0.6660761736049602,
"grad_norm": 3.753859508812206,
"learning_rate": 4.9832325487895625e-06,
"loss": 1.0373,
"step": 3760
},
{
"epoch": 0.6696191319751993,
"grad_norm": 2.36323312181139,
"learning_rate": 4.98250996126396e-06,
"loss": 1.0007,
"step": 3780
},
{
"epoch": 0.6731620903454384,
"grad_norm": 2.4128489083499916,
"learning_rate": 4.981772185245135e-06,
"loss": 1.0155,
"step": 3800
},
{
"epoch": 0.6767050487156776,
"grad_norm": 4.471122710526832,
"learning_rate": 4.98101922524695e-06,
"loss": 1.0625,
"step": 3820
},
{
"epoch": 0.6802480070859167,
"grad_norm": 3.9450022308941035,
"learning_rate": 4.980251085876163e-06,
"loss": 1.0608,
"step": 3840
},
{
"epoch": 0.6837909654561559,
"grad_norm": 3.750505881130023,
"learning_rate": 4.979467771832407e-06,
"loss": 1.0401,
"step": 3860
},
{
"epoch": 0.687333923826395,
"grad_norm": 3.0818679668346918,
"learning_rate": 4.978669287908152e-06,
"loss": 0.9782,
"step": 3880
},
{
"epoch": 0.6908768821966342,
"grad_norm": 3.585328199676442,
"learning_rate": 4.9778556389886836e-06,
"loss": 1.0293,
"step": 3900
},
{
"epoch": 0.6944198405668733,
"grad_norm": 3.352267909342498,
"learning_rate": 4.97702683005207e-06,
"loss": 1.0443,
"step": 3920
},
{
"epoch": 0.6979627989371124,
"grad_norm": 2.740609338046614,
"learning_rate": 4.976182866169128e-06,
"loss": 0.983,
"step": 3940
},
{
"epoch": 0.7015057573073517,
"grad_norm": 3.815853619798988,
"learning_rate": 4.9753237525033995e-06,
"loss": 1.0241,
"step": 3960
},
{
"epoch": 0.7050487156775908,
"grad_norm": 2.056928416192008,
"learning_rate": 4.974449494311113e-06,
"loss": 0.935,
"step": 3980
},
{
"epoch": 0.70859167404783,
"grad_norm": 4.501534397772864,
"learning_rate": 4.973560096941157e-06,
"loss": 1.0417,
"step": 4000
},
{
"epoch": 0.70859167404783,
"eval_loss": 0.8831750154495239,
"eval_runtime": 377.54,
"eval_samples_per_second": 25.181,
"eval_steps_per_second": 3.149,
"step": 4000
},
{
"epoch": 0.7121346324180691,
"grad_norm": 4.163953326960491,
"learning_rate": 4.97265556583504e-06,
"loss": 0.9787,
"step": 4020
},
{
"epoch": 0.7156775907883083,
"grad_norm": 2.1612827561442143,
"learning_rate": 4.971735906526867e-06,
"loss": 1.0187,
"step": 4040
},
{
"epoch": 0.7192205491585474,
"grad_norm": 4.132524372144837,
"learning_rate": 4.9708011246432954e-06,
"loss": 1.0447,
"step": 4060
},
{
"epoch": 0.7227635075287865,
"grad_norm": 4.902391594790166,
"learning_rate": 4.969851225903511e-06,
"loss": 1.0849,
"step": 4080
},
{
"epoch": 0.7263064658990257,
"grad_norm": 4.040812745701054,
"learning_rate": 4.968886216119181e-06,
"loss": 0.9977,
"step": 4100
},
{
"epoch": 0.7298494242692648,
"grad_norm": 4.403191939391695,
"learning_rate": 4.967906101194432e-06,
"loss": 1.0151,
"step": 4120
},
{
"epoch": 0.733392382639504,
"grad_norm": 4.888547155083341,
"learning_rate": 4.9669108871258005e-06,
"loss": 1.0488,
"step": 4140
},
{
"epoch": 0.7369353410097431,
"grad_norm": 2.5511865400666247,
"learning_rate": 4.965900580002208e-06,
"loss": 0.9839,
"step": 4160
},
{
"epoch": 0.7404782993799823,
"grad_norm": 3.7363422093828267,
"learning_rate": 4.9648751860049146e-06,
"loss": 0.9671,
"step": 4180
},
{
"epoch": 0.7440212577502214,
"grad_norm": 4.5882225405684105,
"learning_rate": 4.963834711407487e-06,
"loss": 1.0153,
"step": 4200
},
{
"epoch": 0.7475642161204605,
"grad_norm": 3.3728967510985175,
"learning_rate": 4.962779162575757e-06,
"loss": 0.9866,
"step": 4220
},
{
"epoch": 0.7511071744906997,
"grad_norm": 3.3582439513903695,
"learning_rate": 4.961708545967782e-06,
"loss": 1.0012,
"step": 4240
},
{
"epoch": 0.7546501328609388,
"grad_norm": 4.229044845753943,
"learning_rate": 4.960622868133811e-06,
"loss": 1.0264,
"step": 4260
},
{
"epoch": 0.7581930912311781,
"grad_norm": 3.604122128833424,
"learning_rate": 4.959522135716238e-06,
"loss": 1.0334,
"step": 4280
},
{
"epoch": 0.7617360496014172,
"grad_norm": 4.330367291558989,
"learning_rate": 4.958406355449564e-06,
"loss": 1.0528,
"step": 4300
},
{
"epoch": 0.7652790079716564,
"grad_norm": 3.96229330754017,
"learning_rate": 4.957275534160356e-06,
"loss": 1.0142,
"step": 4320
},
{
"epoch": 0.7688219663418955,
"grad_norm": 1.880123067964198,
"learning_rate": 4.956129678767206e-06,
"loss": 0.9585,
"step": 4340
},
{
"epoch": 0.7723649247121346,
"grad_norm": 2.9357741019622026,
"learning_rate": 4.954968796280685e-06,
"loss": 1.0118,
"step": 4360
},
{
"epoch": 0.7759078830823738,
"grad_norm": 5.337878559961154,
"learning_rate": 4.953792893803308e-06,
"loss": 0.96,
"step": 4380
},
{
"epoch": 0.7794508414526129,
"grad_norm": 2.7737448076628146,
"learning_rate": 4.952601978529479e-06,
"loss": 1.0095,
"step": 4400
},
{
"epoch": 0.7829937998228521,
"grad_norm": 2.81198860896816,
"learning_rate": 4.951396057745457e-06,
"loss": 1.0025,
"step": 4420
},
{
"epoch": 0.7865367581930912,
"grad_norm": 3.2768364907663843,
"learning_rate": 4.950175138829306e-06,
"loss": 1.0062,
"step": 4440
},
{
"epoch": 0.7900797165633304,
"grad_norm": 4.503697644314717,
"learning_rate": 4.948939229250855e-06,
"loss": 0.9866,
"step": 4460
},
{
"epoch": 0.7936226749335695,
"grad_norm": 3.709697666688961,
"learning_rate": 4.947688336571644e-06,
"loss": 1.0234,
"step": 4480
},
{
"epoch": 0.7971656333038086,
"grad_norm": 2.9177293521142804,
"learning_rate": 4.946422468444886e-06,
"loss": 0.9501,
"step": 4500
},
{
"epoch": 0.8007085916740478,
"grad_norm": 3.3045345091796197,
"learning_rate": 4.945141632615416e-06,
"loss": 1.0335,
"step": 4520
},
{
"epoch": 0.804251550044287,
"grad_norm": 4.0701401436058795,
"learning_rate": 4.943845836919642e-06,
"loss": 1.0438,
"step": 4540
},
{
"epoch": 0.8077945084145262,
"grad_norm": 3.9759562983887213,
"learning_rate": 4.942535089285505e-06,
"loss": 1.0283,
"step": 4560
},
{
"epoch": 0.8113374667847653,
"grad_norm": 5.075198666819325,
"learning_rate": 4.9412093977324196e-06,
"loss": 1.0087,
"step": 4580
},
{
"epoch": 0.8148804251550045,
"grad_norm": 4.64113304477617,
"learning_rate": 4.9398687703712324e-06,
"loss": 1.0335,
"step": 4600
},
{
"epoch": 0.8184233835252436,
"grad_norm": 3.6121906044220835,
"learning_rate": 4.938513215404171e-06,
"loss": 1.036,
"step": 4620
},
{
"epoch": 0.8219663418954827,
"grad_norm": 2.8641112530538297,
"learning_rate": 4.9371427411247905e-06,
"loss": 0.9476,
"step": 4640
},
{
"epoch": 0.8255093002657219,
"grad_norm": 2.2471640500283194,
"learning_rate": 4.935757355917929e-06,
"loss": 1.0,
"step": 4660
},
{
"epoch": 0.829052258635961,
"grad_norm": 2.6668789188777553,
"learning_rate": 4.93435706825965e-06,
"loss": 1.042,
"step": 4680
},
{
"epoch": 0.8325952170062002,
"grad_norm": 3.382540393461139,
"learning_rate": 4.932941886717193e-06,
"loss": 0.9925,
"step": 4700
},
{
"epoch": 0.8361381753764393,
"grad_norm": 2.9612515882054224,
"learning_rate": 4.931511819948924e-06,
"loss": 1.0038,
"step": 4720
},
{
"epoch": 0.8396811337466785,
"grad_norm": 5.805032752593361,
"learning_rate": 4.930066876704276e-06,
"loss": 1.0752,
"step": 4740
},
{
"epoch": 0.8432240921169176,
"grad_norm": 2.906268834013272,
"learning_rate": 4.9286070658237025e-06,
"loss": 0.9574,
"step": 4760
},
{
"epoch": 0.8467670504871567,
"grad_norm": 3.908291806399892,
"learning_rate": 4.9271323962386185e-06,
"loss": 0.9355,
"step": 4780
},
{
"epoch": 0.8503100088573959,
"grad_norm": 2.862223610977262,
"learning_rate": 4.925642876971347e-06,
"loss": 0.9913,
"step": 4800
},
{
"epoch": 0.853852967227635,
"grad_norm": 3.885251950370837,
"learning_rate": 4.924138517135068e-06,
"loss": 0.9437,
"step": 4820
},
{
"epoch": 0.8573959255978743,
"grad_norm": 2.687403470850269,
"learning_rate": 4.922619325933753e-06,
"loss": 1.0183,
"step": 4840
},
{
"epoch": 0.8609388839681134,
"grad_norm": 5.1744645832504945,
"learning_rate": 4.921085312662119e-06,
"loss": 0.9639,
"step": 4860
},
{
"epoch": 0.8644818423383526,
"grad_norm": 4.73053239403457,
"learning_rate": 4.919536486705569e-06,
"loss": 1.0124,
"step": 4880
},
{
"epoch": 0.8680248007085917,
"grad_norm": 4.4563037107783865,
"learning_rate": 4.917972857540126e-06,
"loss": 0.99,
"step": 4900
},
{
"epoch": 0.8715677590788308,
"grad_norm": 4.614255774835929,
"learning_rate": 4.916394434732391e-06,
"loss": 1.0037,
"step": 4920
},
{
"epoch": 0.87511071744907,
"grad_norm": 3.533058939111727,
"learning_rate": 4.914801227939467e-06,
"loss": 1.0177,
"step": 4940
},
{
"epoch": 0.8786536758193091,
"grad_norm": 2.6727964096166583,
"learning_rate": 4.913193246908916e-06,
"loss": 0.9957,
"step": 4960
},
{
"epoch": 0.8821966341895483,
"grad_norm": 6.368757949715121,
"learning_rate": 4.911570501478686e-06,
"loss": 1.0324,
"step": 4980
},
{
"epoch": 0.8857395925597874,
"grad_norm": 3.238302734032586,
"learning_rate": 4.909933001577057e-06,
"loss": 0.9778,
"step": 5000
},
{
"epoch": 0.8892825509300266,
"grad_norm": 2.9414571649379173,
"learning_rate": 4.908280757222585e-06,
"loss": 1.0183,
"step": 5020
},
{
"epoch": 0.8928255093002657,
"grad_norm": 3.8675143162693417,
"learning_rate": 4.906613778524029e-06,
"loss": 1.0417,
"step": 5040
},
{
"epoch": 0.8963684676705048,
"grad_norm": 4.062896818204324,
"learning_rate": 4.9049320756803e-06,
"loss": 0.9951,
"step": 5060
},
{
"epoch": 0.899911426040744,
"grad_norm": 3.965608738547987,
"learning_rate": 4.9032356589803935e-06,
"loss": 1.0096,
"step": 5080
},
{
"epoch": 0.9034543844109831,
"grad_norm": 2.4470182814478845,
"learning_rate": 4.901524538803325e-06,
"loss": 0.9706,
"step": 5100
},
{
"epoch": 0.9069973427812223,
"grad_norm": 3.3652865356433788,
"learning_rate": 4.899798725618071e-06,
"loss": 1.0189,
"step": 5120
},
{
"epoch": 0.9105403011514615,
"grad_norm": 2.1015419863160316,
"learning_rate": 4.898058229983502e-06,
"loss": 0.9427,
"step": 5140
},
{
"epoch": 0.9140832595217007,
"grad_norm": 2.494263988797181,
"learning_rate": 4.896303062548321e-06,
"loss": 0.9542,
"step": 5160
},
{
"epoch": 0.9176262178919398,
"grad_norm": 5.0132756287008355,
"learning_rate": 4.894533234050992e-06,
"loss": 1.0177,
"step": 5180
},
{
"epoch": 0.9211691762621789,
"grad_norm": 3.848341234829432,
"learning_rate": 4.892748755319679e-06,
"loss": 0.9785,
"step": 5200
},
{
"epoch": 0.9247121346324181,
"grad_norm": 3.9783991127336824,
"learning_rate": 4.890949637272184e-06,
"loss": 0.9964,
"step": 5220
},
{
"epoch": 0.9282550930026572,
"grad_norm": 3.7994392920413333,
"learning_rate": 4.8891358909158695e-06,
"loss": 1.0164,
"step": 5240
},
{
"epoch": 0.9317980513728964,
"grad_norm": 3.646774315012477,
"learning_rate": 4.887307527347598e-06,
"loss": 1.008,
"step": 5260
},
{
"epoch": 0.9353410097431355,
"grad_norm": 6.203065849865684,
"learning_rate": 4.885464557753666e-06,
"loss": 1.0426,
"step": 5280
},
{
"epoch": 0.9388839681133747,
"grad_norm": 2.9353445219470444,
"learning_rate": 4.88360699340973e-06,
"loss": 1.0052,
"step": 5300
},
{
"epoch": 0.9424269264836138,
"grad_norm": 3.9291014774722033,
"learning_rate": 4.88173484568074e-06,
"loss": 0.9606,
"step": 5320
},
{
"epoch": 0.9459698848538529,
"grad_norm": 3.9955198747460634,
"learning_rate": 4.8798481260208715e-06,
"loss": 0.9862,
"step": 5340
},
{
"epoch": 0.9495128432240921,
"grad_norm": 4.103474267744412,
"learning_rate": 4.877946845973453e-06,
"loss": 1.008,
"step": 5360
},
{
"epoch": 0.9530558015943312,
"grad_norm": 3.817035998173409,
"learning_rate": 4.876031017170898e-06,
"loss": 0.9696,
"step": 5380
},
{
"epoch": 0.9565987599645704,
"grad_norm": 5.198464280181255,
"learning_rate": 4.874100651334629e-06,
"loss": 1.0248,
"step": 5400
},
{
"epoch": 0.9601417183348095,
"grad_norm": 3.289343037136709,
"learning_rate": 4.872155760275012e-06,
"loss": 0.9793,
"step": 5420
},
{
"epoch": 0.9636846767050488,
"grad_norm": 2.2435930827644746,
"learning_rate": 4.87019635589128e-06,
"loss": 1.0101,
"step": 5440
},
{
"epoch": 0.9672276350752879,
"grad_norm": 4.310791558682954,
"learning_rate": 4.86822245017146e-06,
"loss": 0.995,
"step": 5460
},
{
"epoch": 0.970770593445527,
"grad_norm": 3.352727892906536,
"learning_rate": 4.866234055192306e-06,
"loss": 0.9751,
"step": 5480
},
{
"epoch": 0.9743135518157662,
"grad_norm": 5.068267192871768,
"learning_rate": 4.864231183119212e-06,
"loss": 0.9629,
"step": 5500
},
{
"epoch": 0.9778565101860053,
"grad_norm": 3.5363020465890704,
"learning_rate": 4.862213846206155e-06,
"loss": 0.9977,
"step": 5520
},
{
"epoch": 0.9813994685562445,
"grad_norm": 3.3894885205413834,
"learning_rate": 4.860182056795604e-06,
"loss": 0.9575,
"step": 5540
},
{
"epoch": 0.9849424269264836,
"grad_norm": 2.6287369097602578,
"learning_rate": 4.8581358273184545e-06,
"loss": 0.989,
"step": 5560
},
{
"epoch": 0.9884853852967228,
"grad_norm": 2.9015866832665185,
"learning_rate": 4.856075170293948e-06,
"loss": 1.0018,
"step": 5580
},
{
"epoch": 0.9920283436669619,
"grad_norm": 3.0658535183131423,
"learning_rate": 4.854000098329596e-06,
"loss": 1.0078,
"step": 5600
},
{
"epoch": 0.995571302037201,
"grad_norm": 2.5404639262387025,
"learning_rate": 4.851910624121106e-06,
"loss": 0.9407,
"step": 5620
},
{
"epoch": 0.9991142604074402,
"grad_norm": 3.158604742158805,
"learning_rate": 4.849806760452299e-06,
"loss": 0.98,
"step": 5640
},
{
"epoch": 1.0026572187776794,
"grad_norm": 3.3616076027225965,
"learning_rate": 4.8476885201950345e-06,
"loss": 0.9476,
"step": 5660
},
{
"epoch": 1.0062001771479185,
"grad_norm": 3.690305513366549,
"learning_rate": 4.84555591630913e-06,
"loss": 0.9102,
"step": 5680
},
{
"epoch": 1.0097431355181576,
"grad_norm": 5.359946307013767,
"learning_rate": 4.843408961842285e-06,
"loss": 0.9232,
"step": 5700
},
{
"epoch": 1.0132860938883967,
"grad_norm": 3.569793560947344,
"learning_rate": 4.841247669929995e-06,
"loss": 0.935,
"step": 5720
},
{
"epoch": 1.016829052258636,
"grad_norm": 3.874750269772003,
"learning_rate": 4.839072053795479e-06,
"loss": 0.9331,
"step": 5740
},
{
"epoch": 1.0203720106288752,
"grad_norm": 2.7385873822832023,
"learning_rate": 4.83688212674959e-06,
"loss": 0.9371,
"step": 5760
},
{
"epoch": 1.0239149689991143,
"grad_norm": 3.665069410011044,
"learning_rate": 4.834677902190742e-06,
"loss": 0.9085,
"step": 5780
},
{
"epoch": 1.0274579273693534,
"grad_norm": 2.3598498586364416,
"learning_rate": 4.832459393604822e-06,
"loss": 0.8526,
"step": 5800
},
{
"epoch": 1.0310008857395925,
"grad_norm": 2.7411380905564613,
"learning_rate": 4.830226614565109e-06,
"loss": 0.9451,
"step": 5820
},
{
"epoch": 1.0345438441098318,
"grad_norm": 4.338981698988937,
"learning_rate": 4.8279795787321935e-06,
"loss": 0.9065,
"step": 5840
},
{
"epoch": 1.038086802480071,
"grad_norm": 5.731424678260782,
"learning_rate": 4.8257182998538895e-06,
"loss": 0.8988,
"step": 5860
},
{
"epoch": 1.04162976085031,
"grad_norm": 4.552868648680658,
"learning_rate": 4.823442791765157e-06,
"loss": 0.9059,
"step": 5880
},
{
"epoch": 1.045172719220549,
"grad_norm": 2.50768692217334,
"learning_rate": 4.821153068388007e-06,
"loss": 0.9601,
"step": 5900
},
{
"epoch": 1.0487156775907882,
"grad_norm": 5.095351842225188,
"learning_rate": 4.818849143731428e-06,
"loss": 0.9152,
"step": 5920
},
{
"epoch": 1.0522586359610275,
"grad_norm": 2.8481200255546493,
"learning_rate": 4.816531031891292e-06,
"loss": 0.8828,
"step": 5940
},
{
"epoch": 1.0558015943312666,
"grad_norm": 4.464320906945187,
"learning_rate": 4.814198747050271e-06,
"loss": 0.9552,
"step": 5960
},
{
"epoch": 1.0593445527015057,
"grad_norm": 5.546584102940785,
"learning_rate": 4.811852303477751e-06,
"loss": 0.8654,
"step": 5980
},
{
"epoch": 1.0628875110717448,
"grad_norm": 4.466057137043603,
"learning_rate": 4.809491715529744e-06,
"loss": 0.8941,
"step": 6000
},
{
"epoch": 1.0628875110717448,
"eval_loss": 0.8596345782279968,
"eval_runtime": 368.3497,
"eval_samples_per_second": 25.81,
"eval_steps_per_second": 3.228,
"step": 6000
},
{
"epoch": 1.066430469441984,
"grad_norm": 4.240898370302153,
"learning_rate": 4.8071169976488e-06,
"loss": 0.9238,
"step": 6020
},
{
"epoch": 1.0699734278122233,
"grad_norm": 3.2580988514752343,
"learning_rate": 4.804728164363918e-06,
"loss": 0.9158,
"step": 6040
},
{
"epoch": 1.0735163861824624,
"grad_norm": 3.870662909162037,
"learning_rate": 4.80232523029046e-06,
"loss": 0.9688,
"step": 6060
},
{
"epoch": 1.0770593445527015,
"grad_norm": 1.7822539984397539,
"learning_rate": 4.799908210130058e-06,
"loss": 0.9053,
"step": 6080
},
{
"epoch": 1.0806023029229406,
"grad_norm": 3.7732124960041276,
"learning_rate": 4.797477118670524e-06,
"loss": 0.9815,
"step": 6100
},
{
"epoch": 1.08414526129318,
"grad_norm": 6.115099007176549,
"learning_rate": 4.7950319707857615e-06,
"loss": 0.9064,
"step": 6120
},
{
"epoch": 1.087688219663419,
"grad_norm": 2.577014162447891,
"learning_rate": 4.792572781435678e-06,
"loss": 0.8382,
"step": 6140
},
{
"epoch": 1.091231178033658,
"grad_norm": 3.220759744400382,
"learning_rate": 4.790099565666086e-06,
"loss": 0.8572,
"step": 6160
},
{
"epoch": 1.0947741364038972,
"grad_norm": 2.7603504810469097,
"learning_rate": 4.787612338608614e-06,
"loss": 0.9017,
"step": 6180
},
{
"epoch": 1.0983170947741363,
"grad_norm": 3.4076917515040686,
"learning_rate": 4.785111115480615e-06,
"loss": 0.9043,
"step": 6200
},
{
"epoch": 1.1018600531443756,
"grad_norm": 2.714069163764211,
"learning_rate": 4.782595911585074e-06,
"loss": 0.9445,
"step": 6220
},
{
"epoch": 1.1054030115146147,
"grad_norm": 2.579769298355691,
"learning_rate": 4.780066742310512e-06,
"loss": 0.8789,
"step": 6240
},
{
"epoch": 1.1089459698848538,
"grad_norm": 4.326106933810614,
"learning_rate": 4.777523623130894e-06,
"loss": 0.9087,
"step": 6260
},
{
"epoch": 1.112488928255093,
"grad_norm": 4.203656763039969,
"learning_rate": 4.774966569605531e-06,
"loss": 0.9168,
"step": 6280
},
{
"epoch": 1.1160318866253323,
"grad_norm": 4.154066900716089,
"learning_rate": 4.772395597378991e-06,
"loss": 0.8687,
"step": 6300
},
{
"epoch": 1.1195748449955714,
"grad_norm": 2.08540747712982,
"learning_rate": 4.769810722180994e-06,
"loss": 0.871,
"step": 6320
},
{
"epoch": 1.1231178033658105,
"grad_norm": 3.1465129817119677,
"learning_rate": 4.767211959826326e-06,
"loss": 0.9231,
"step": 6340
},
{
"epoch": 1.1266607617360496,
"grad_norm": 2.3015425077734233,
"learning_rate": 4.764599326214736e-06,
"loss": 0.91,
"step": 6360
},
{
"epoch": 1.1302037201062887,
"grad_norm": 3.2081119900478083,
"learning_rate": 4.761972837330839e-06,
"loss": 0.9247,
"step": 6380
},
{
"epoch": 1.133746678476528,
"grad_norm": 3.093019376379044,
"learning_rate": 4.7593325092440204e-06,
"loss": 0.8783,
"step": 6400
},
{
"epoch": 1.137289636846767,
"grad_norm": 4.476714117074904,
"learning_rate": 4.756678358108337e-06,
"loss": 0.9356,
"step": 6420
},
{
"epoch": 1.1408325952170062,
"grad_norm": 4.415252399299131,
"learning_rate": 4.754010400162416e-06,
"loss": 0.8873,
"step": 6440
},
{
"epoch": 1.1443755535872453,
"grad_norm": 4.618410771551369,
"learning_rate": 4.7513286517293585e-06,
"loss": 0.9271,
"step": 6460
},
{
"epoch": 1.1479185119574846,
"grad_norm": 3.1205309158472465,
"learning_rate": 4.74863312921664e-06,
"loss": 0.8835,
"step": 6480
},
{
"epoch": 1.1514614703277237,
"grad_norm": 2.4282160366985255,
"learning_rate": 4.7459238491160056e-06,
"loss": 0.9308,
"step": 6500
},
{
"epoch": 1.1550044286979628,
"grad_norm": 2.865738470619868,
"learning_rate": 4.743200828003374e-06,
"loss": 0.9414,
"step": 6520
},
{
"epoch": 1.158547387068202,
"grad_norm": 3.6374100587906835,
"learning_rate": 4.740464082538735e-06,
"loss": 0.9106,
"step": 6540
},
{
"epoch": 1.162090345438441,
"grad_norm": 3.0695217920809053,
"learning_rate": 4.737713629466045e-06,
"loss": 0.8616,
"step": 6560
},
{
"epoch": 1.1656333038086801,
"grad_norm": 3.9353520892249363,
"learning_rate": 4.734949485613126e-06,
"loss": 0.8914,
"step": 6580
},
{
"epoch": 1.1691762621789195,
"grad_norm": 3.484414702314974,
"learning_rate": 4.732171667891564e-06,
"loss": 0.92,
"step": 6600
},
{
"epoch": 1.1727192205491586,
"grad_norm": 3.504870240653996,
"learning_rate": 4.729380193296605e-06,
"loss": 0.9396,
"step": 6620
},
{
"epoch": 1.1762621789193977,
"grad_norm": 5.929401699342908,
"learning_rate": 4.726575078907049e-06,
"loss": 0.9188,
"step": 6640
},
{
"epoch": 1.1798051372896368,
"grad_norm": 6.554517511673939,
"learning_rate": 4.723756341885148e-06,
"loss": 0.9534,
"step": 6660
},
{
"epoch": 1.183348095659876,
"grad_norm": 2.434654285685298,
"learning_rate": 4.7209239994765e-06,
"loss": 0.8497,
"step": 6680
},
{
"epoch": 1.1868910540301152,
"grad_norm": 2.9006017781540483,
"learning_rate": 4.718078069009944e-06,
"loss": 0.9326,
"step": 6700
},
{
"epoch": 1.1904340124003543,
"grad_norm": 6.939455220904752,
"learning_rate": 4.71521856789745e-06,
"loss": 0.9234,
"step": 6720
},
{
"epoch": 1.1939769707705934,
"grad_norm": 3.3273297930889814,
"learning_rate": 4.712345513634021e-06,
"loss": 0.9146,
"step": 6740
},
{
"epoch": 1.1975199291408325,
"grad_norm": 3.0714433546937774,
"learning_rate": 4.709458923797579e-06,
"loss": 0.9112,
"step": 6760
},
{
"epoch": 1.2010628875110718,
"grad_norm": 3.686574518500066,
"learning_rate": 4.7065588160488565e-06,
"loss": 0.9353,
"step": 6780
},
{
"epoch": 1.204605845881311,
"grad_norm": 3.1307294434556256,
"learning_rate": 4.703645208131294e-06,
"loss": 0.8906,
"step": 6800
},
{
"epoch": 1.20814880425155,
"grad_norm": 3.9540023879464616,
"learning_rate": 4.70071811787093e-06,
"loss": 0.9389,
"step": 6820
},
{
"epoch": 1.2116917626217891,
"grad_norm": 4.99757735756388,
"learning_rate": 4.697777563176288e-06,
"loss": 0.8728,
"step": 6840
},
{
"epoch": 1.2152347209920284,
"grad_norm": 1.8831614270222023,
"learning_rate": 4.694823562038271e-06,
"loss": 0.8971,
"step": 6860
},
{
"epoch": 1.2187776793622676,
"grad_norm": 3.3841095933963747,
"learning_rate": 4.69185613253005e-06,
"loss": 0.9404,
"step": 6880
},
{
"epoch": 1.2223206377325067,
"grad_norm": 2.5639299660820543,
"learning_rate": 4.688875292806952e-06,
"loss": 0.8651,
"step": 6900
},
{
"epoch": 1.2258635961027458,
"grad_norm": 2.5209787475843024,
"learning_rate": 4.685881061106352e-06,
"loss": 0.8783,
"step": 6920
},
{
"epoch": 1.2294065544729849,
"grad_norm": 4.174003458453298,
"learning_rate": 4.68287345574756e-06,
"loss": 0.939,
"step": 6940
},
{
"epoch": 1.2329495128432242,
"grad_norm": 3.7213954731195944,
"learning_rate": 4.679852495131708e-06,
"loss": 0.9698,
"step": 6960
},
{
"epoch": 1.2364924712134633,
"grad_norm": 3.1050229374536826,
"learning_rate": 4.676818197741637e-06,
"loss": 0.901,
"step": 6980
},
{
"epoch": 1.2400354295837024,
"grad_norm": 4.537242489543826,
"learning_rate": 4.673770582141788e-06,
"loss": 0.8826,
"step": 7000
},
{
"epoch": 1.2435783879539415,
"grad_norm": 3.157545512432448,
"learning_rate": 4.670709666978081e-06,
"loss": 0.9426,
"step": 7020
},
{
"epoch": 1.2471213463241808,
"grad_norm": 3.6885040878766975,
"learning_rate": 4.667635470977811e-06,
"loss": 0.9253,
"step": 7040
},
{
"epoch": 1.25066430469442,
"grad_norm": 3.8470192299130495,
"learning_rate": 4.664548012949523e-06,
"loss": 0.9516,
"step": 7060
},
{
"epoch": 1.254207263064659,
"grad_norm": 2.7610537115436897,
"learning_rate": 4.661447311782905e-06,
"loss": 0.9632,
"step": 7080
},
{
"epoch": 1.2577502214348981,
"grad_norm": 2.9076857370419726,
"learning_rate": 4.658333386448668e-06,
"loss": 0.8516,
"step": 7100
},
{
"epoch": 1.2612931798051372,
"grad_norm": 2.8712653113719178,
"learning_rate": 4.655206255998429e-06,
"loss": 0.8681,
"step": 7120
},
{
"epoch": 1.2648361381753763,
"grad_norm": 3.5281659347458443,
"learning_rate": 4.652065939564601e-06,
"loss": 0.8612,
"step": 7140
},
{
"epoch": 1.2683790965456156,
"grad_norm": 3.0200930161561836,
"learning_rate": 4.648912456360266e-06,
"loss": 0.9232,
"step": 7160
},
{
"epoch": 1.2719220549158547,
"grad_norm": 2.9141762965804068,
"learning_rate": 4.645745825679069e-06,
"loss": 0.8704,
"step": 7180
},
{
"epoch": 1.2754650132860939,
"grad_norm": 3.466517729779936,
"learning_rate": 4.642566066895089e-06,
"loss": 0.9167,
"step": 7200
},
{
"epoch": 1.2790079716563332,
"grad_norm": 4.016886275498443,
"learning_rate": 4.639373199462728e-06,
"loss": 0.8753,
"step": 7220
},
{
"epoch": 1.2825509300265723,
"grad_norm": 2.6960913937478064,
"learning_rate": 4.636167242916588e-06,
"loss": 0.9387,
"step": 7240
},
{
"epoch": 1.2860938883968114,
"grad_norm": 4.156865933729297,
"learning_rate": 4.6329482168713535e-06,
"loss": 0.8807,
"step": 7260
},
{
"epoch": 1.2896368467670505,
"grad_norm": 2.3052353180349194,
"learning_rate": 4.62971614102167e-06,
"loss": 0.9344,
"step": 7280
},
{
"epoch": 1.2931798051372896,
"grad_norm": 4.351236836672874,
"learning_rate": 4.626471035142027e-06,
"loss": 0.9368,
"step": 7300
},
{
"epoch": 1.2967227635075287,
"grad_norm": 2.96988427417053,
"learning_rate": 4.62321291908663e-06,
"loss": 0.9225,
"step": 7320
},
{
"epoch": 1.300265721877768,
"grad_norm": 2.803880488767538,
"learning_rate": 4.619941812789287e-06,
"loss": 0.9065,
"step": 7340
},
{
"epoch": 1.3038086802480071,
"grad_norm": 2.2501268712600115,
"learning_rate": 4.616657736263282e-06,
"loss": 0.9095,
"step": 7360
},
{
"epoch": 1.3073516386182462,
"grad_norm": 3.530457146463456,
"learning_rate": 4.613360709601251e-06,
"loss": 0.8956,
"step": 7380
},
{
"epoch": 1.3108945969884853,
"grad_norm": 2.9920752839459848,
"learning_rate": 4.6100507529750656e-06,
"loss": 0.8907,
"step": 7400
},
{
"epoch": 1.3144375553587246,
"grad_norm": 2.3035469610025614,
"learning_rate": 4.6067278866357025e-06,
"loss": 0.9135,
"step": 7420
},
{
"epoch": 1.3179805137289637,
"grad_norm": 3.742157836005142,
"learning_rate": 4.603392130913123e-06,
"loss": 0.9146,
"step": 7440
},
{
"epoch": 1.3215234720992028,
"grad_norm": 2.228186526375547,
"learning_rate": 4.600043506216151e-06,
"loss": 0.9103,
"step": 7460
},
{
"epoch": 1.325066430469442,
"grad_norm": 2.4528087238581504,
"learning_rate": 4.5966820330323405e-06,
"loss": 0.9298,
"step": 7480
},
{
"epoch": 1.328609388839681,
"grad_norm": 2.8414827920653125,
"learning_rate": 4.59330773192786e-06,
"loss": 0.8779,
"step": 7500
},
{
"epoch": 1.3321523472099202,
"grad_norm": 3.492728703129015,
"learning_rate": 4.5899206235473585e-06,
"loss": 0.9399,
"step": 7520
},
{
"epoch": 1.3356953055801595,
"grad_norm": 4.648463092813606,
"learning_rate": 4.586520728613842e-06,
"loss": 0.9026,
"step": 7540
},
{
"epoch": 1.3392382639503986,
"grad_norm": 3.9103385940119297,
"learning_rate": 4.583108067928552e-06,
"loss": 0.8996,
"step": 7560
},
{
"epoch": 1.3427812223206377,
"grad_norm": 4.1525594833780035,
"learning_rate": 4.579682662370829e-06,
"loss": 0.911,
"step": 7580
},
{
"epoch": 1.346324180690877,
"grad_norm": 4.692836743880663,
"learning_rate": 4.576244532897988e-06,
"loss": 0.8638,
"step": 7600
},
{
"epoch": 1.349867139061116,
"grad_norm": 2.805532914309198,
"learning_rate": 4.572793700545197e-06,
"loss": 0.9105,
"step": 7620
},
{
"epoch": 1.3534100974313552,
"grad_norm": 3.179829503013336,
"learning_rate": 4.569330186425339e-06,
"loss": 0.9251,
"step": 7640
},
{
"epoch": 1.3569530558015943,
"grad_norm": 2.862631874344196,
"learning_rate": 4.565854011728885e-06,
"loss": 0.9681,
"step": 7660
},
{
"epoch": 1.3604960141718334,
"grad_norm": 3.7867164985506876,
"learning_rate": 4.562365197723771e-06,
"loss": 0.9298,
"step": 7680
},
{
"epoch": 1.3640389725420725,
"grad_norm": 4.25563448626548,
"learning_rate": 4.558863765755257e-06,
"loss": 0.8872,
"step": 7700
},
{
"epoch": 1.3675819309123118,
"grad_norm": 2.742439626228035,
"learning_rate": 4.555349737245808e-06,
"loss": 0.8776,
"step": 7720
},
{
"epoch": 1.371124889282551,
"grad_norm": 3.4373516712170504,
"learning_rate": 4.5518231336949526e-06,
"loss": 0.8886,
"step": 7740
},
{
"epoch": 1.37466784765279,
"grad_norm": 3.098577500060214,
"learning_rate": 4.548283976679158e-06,
"loss": 0.8762,
"step": 7760
},
{
"epoch": 1.3782108060230294,
"grad_norm": 4.088083429018507,
"learning_rate": 4.5447322878516965e-06,
"loss": 0.8655,
"step": 7780
},
{
"epoch": 1.3817537643932685,
"grad_norm": 4.5726701868460475,
"learning_rate": 4.541168088942511e-06,
"loss": 0.9061,
"step": 7800
},
{
"epoch": 1.3852967227635076,
"grad_norm": 3.093081976098804,
"learning_rate": 4.537591401758084e-06,
"loss": 0.934,
"step": 7820
},
{
"epoch": 1.3888396811337467,
"grad_norm": 5.171297134853851,
"learning_rate": 4.5340022481813055e-06,
"loss": 0.9712,
"step": 7840
},
{
"epoch": 1.3923826395039858,
"grad_norm": 2.889572353723012,
"learning_rate": 4.530400650171335e-06,
"loss": 0.8755,
"step": 7860
},
{
"epoch": 1.3959255978742249,
"grad_norm": 5.753273400624572,
"learning_rate": 4.526786629763471e-06,
"loss": 0.8735,
"step": 7880
},
{
"epoch": 1.3994685562444642,
"grad_norm": 5.098809443054654,
"learning_rate": 4.523160209069014e-06,
"loss": 0.8922,
"step": 7900
},
{
"epoch": 1.4030115146147033,
"grad_norm": 2.357594211768667,
"learning_rate": 4.5195214102751324e-06,
"loss": 0.9088,
"step": 7920
},
{
"epoch": 1.4065544729849424,
"grad_norm": 3.9367579628501184,
"learning_rate": 4.515870255644727e-06,
"loss": 0.9186,
"step": 7940
},
{
"epoch": 1.4100974313551815,
"grad_norm": 5.456286963862879,
"learning_rate": 4.512206767516291e-06,
"loss": 0.9111,
"step": 7960
},
{
"epoch": 1.4136403897254208,
"grad_norm": 3.5921469100201944,
"learning_rate": 4.508530968303781e-06,
"loss": 0.9028,
"step": 7980
},
{
"epoch": 1.41718334809566,
"grad_norm": 2.9603120534047056,
"learning_rate": 4.504842880496472e-06,
"loss": 0.8972,
"step": 8000
},
{
"epoch": 1.41718334809566,
"eval_loss": 0.837660551071167,
"eval_runtime": 368.7294,
"eval_samples_per_second": 25.783,
"eval_steps_per_second": 3.225,
"step": 8000
},
{
"epoch": 1.420726306465899,
"grad_norm": 4.883873708709509,
"learning_rate": 4.5011425266588225e-06,
"loss": 0.9461,
"step": 8020
},
{
"epoch": 1.4242692648361381,
"grad_norm": 4.135582187777777,
"learning_rate": 4.497429929430341e-06,
"loss": 0.9508,
"step": 8040
},
{
"epoch": 1.4278122232063772,
"grad_norm": 4.734069578966871,
"learning_rate": 4.493705111525439e-06,
"loss": 0.9336,
"step": 8060
},
{
"epoch": 1.4313551815766163,
"grad_norm": 3.218508259496232,
"learning_rate": 4.4899680957333e-06,
"loss": 0.8421,
"step": 8080
},
{
"epoch": 1.4348981399468557,
"grad_norm": 2.1197851091950355,
"learning_rate": 4.486218904917735e-06,
"loss": 0.8656,
"step": 8100
},
{
"epoch": 1.4384410983170948,
"grad_norm": 3.0696326095251694,
"learning_rate": 4.482457562017043e-06,
"loss": 0.8596,
"step": 8120
},
{
"epoch": 1.4419840566873339,
"grad_norm": 3.0008100293231617,
"learning_rate": 4.478684090043875e-06,
"loss": 0.92,
"step": 8140
},
{
"epoch": 1.4455270150575732,
"grad_norm": 2.9367769313434207,
"learning_rate": 4.474898512085088e-06,
"loss": 0.8598,
"step": 8160
},
{
"epoch": 1.4490699734278123,
"grad_norm": 2.3972324015727473,
"learning_rate": 4.471100851301605e-06,
"loss": 0.8952,
"step": 8180
},
{
"epoch": 1.4526129317980514,
"grad_norm": 2.2321286554170476,
"learning_rate": 4.467291130928277e-06,
"loss": 0.9081,
"step": 8200
},
{
"epoch": 1.4561558901682905,
"grad_norm": 2.5187305501003725,
"learning_rate": 4.463469374273737e-06,
"loss": 0.9273,
"step": 8220
},
{
"epoch": 1.4596988485385296,
"grad_norm": 5.185336322080168,
"learning_rate": 4.459635604720255e-06,
"loss": 0.8962,
"step": 8240
},
{
"epoch": 1.4632418069087687,
"grad_norm": 2.6043475650440895,
"learning_rate": 4.4557898457236025e-06,
"loss": 0.9125,
"step": 8260
},
{
"epoch": 1.466784765279008,
"grad_norm": 3.0322932631838233,
"learning_rate": 4.4519321208129044e-06,
"loss": 0.8977,
"step": 8280
},
{
"epoch": 1.4703277236492471,
"grad_norm": 4.742695243075337,
"learning_rate": 4.448062453590493e-06,
"loss": 0.9128,
"step": 8300
},
{
"epoch": 1.4738706820194862,
"grad_norm": 2.709718326834174,
"learning_rate": 4.444180867731769e-06,
"loss": 0.8838,
"step": 8320
},
{
"epoch": 1.4774136403897256,
"grad_norm": 4.119470722998265,
"learning_rate": 4.44028738698505e-06,
"loss": 0.8819,
"step": 8340
},
{
"epoch": 1.4809565987599647,
"grad_norm": 3.1816783974486382,
"learning_rate": 4.436382035171432e-06,
"loss": 0.8797,
"step": 8360
},
{
"epoch": 1.4844995571302038,
"grad_norm": 3.1914690524711844,
"learning_rate": 4.4324648361846424e-06,
"loss": 0.8278,
"step": 8380
},
{
"epoch": 1.4880425155004429,
"grad_norm": 4.009571063093233,
"learning_rate": 4.428535813990885e-06,
"loss": 0.9445,
"step": 8400
},
{
"epoch": 1.491585473870682,
"grad_norm": 2.9833892404793736,
"learning_rate": 4.424594992628708e-06,
"loss": 0.8951,
"step": 8420
},
{
"epoch": 1.495128432240921,
"grad_norm": 2.478036010569002,
"learning_rate": 4.420642396208844e-06,
"loss": 0.8963,
"step": 8440
},
{
"epoch": 1.4986713906111604,
"grad_norm": 3.24442393322022,
"learning_rate": 4.416678048914069e-06,
"loss": 0.8875,
"step": 8460
},
{
"epoch": 1.5022143489813995,
"grad_norm": 5.0690401321371334,
"learning_rate": 4.412701974999057e-06,
"loss": 0.9041,
"step": 8480
},
{
"epoch": 1.5057573073516386,
"grad_norm": 3.5032838009682172,
"learning_rate": 4.4087141987902215e-06,
"loss": 0.9024,
"step": 8500
},
{
"epoch": 1.509300265721878,
"grad_norm": 3.4593082915022384,
"learning_rate": 4.404714744685578e-06,
"loss": 0.9299,
"step": 8520
},
{
"epoch": 1.512843224092117,
"grad_norm": 2.906522716091139,
"learning_rate": 4.4007036371545865e-06,
"loss": 0.8399,
"step": 8540
},
{
"epoch": 1.5163861824623561,
"grad_norm": 2.556244604059082,
"learning_rate": 4.396680900738007e-06,
"loss": 0.8959,
"step": 8560
},
{
"epoch": 1.5199291408325952,
"grad_norm": 2.872414224448131,
"learning_rate": 4.392646560047746e-06,
"loss": 0.8837,
"step": 8580
},
{
"epoch": 1.5234720992028343,
"grad_norm": 4.371819876343486,
"learning_rate": 4.388600639766711e-06,
"loss": 0.9246,
"step": 8600
},
{
"epoch": 1.5270150575730734,
"grad_norm": 3.5523806587587514,
"learning_rate": 4.384543164648649e-06,
"loss": 0.931,
"step": 8620
},
{
"epoch": 1.5305580159433125,
"grad_norm": 2.929477900211409,
"learning_rate": 4.380474159518007e-06,
"loss": 0.8985,
"step": 8640
},
{
"epoch": 1.5341009743135519,
"grad_norm": 4.211828377945081,
"learning_rate": 4.3763936492697735e-06,
"loss": 0.8785,
"step": 8660
},
{
"epoch": 1.537643932683791,
"grad_norm": 2.4701458590021956,
"learning_rate": 4.372301658869327e-06,
"loss": 0.9385,
"step": 8680
},
{
"epoch": 1.54118689105403,
"grad_norm": 3.142058476836683,
"learning_rate": 4.368198213352286e-06,
"loss": 0.902,
"step": 8700
},
{
"epoch": 1.5447298494242694,
"grad_norm": 2.771138222528823,
"learning_rate": 4.3640833378243505e-06,
"loss": 0.8804,
"step": 8720
},
{
"epoch": 1.5482728077945085,
"grad_norm": 3.9280654601321006,
"learning_rate": 4.3599570574611545e-06,
"loss": 0.8938,
"step": 8740
},
{
"epoch": 1.5518157661647476,
"grad_norm": 2.2418472839675463,
"learning_rate": 4.355819397508106e-06,
"loss": 0.8968,
"step": 8760
},
{
"epoch": 1.5553587245349867,
"grad_norm": 3.399601556100489,
"learning_rate": 4.35167038328024e-06,
"loss": 0.8546,
"step": 8780
},
{
"epoch": 1.5589016829052258,
"grad_norm": 3.455386669336896,
"learning_rate": 4.3475100401620555e-06,
"loss": 0.8987,
"step": 8800
},
{
"epoch": 1.562444641275465,
"grad_norm": 4.528613569077828,
"learning_rate": 4.3433383936073635e-06,
"loss": 0.9096,
"step": 8820
},
{
"epoch": 1.565987599645704,
"grad_norm": 2.885188625705028,
"learning_rate": 4.3391554691391345e-06,
"loss": 0.8747,
"step": 8840
},
{
"epoch": 1.5695305580159433,
"grad_norm": 4.53397939933421,
"learning_rate": 4.334961292349339e-06,
"loss": 0.9238,
"step": 8860
},
{
"epoch": 1.5730735163861824,
"grad_norm": 4.595528706412981,
"learning_rate": 4.33075588889879e-06,
"loss": 0.9103,
"step": 8880
},
{
"epoch": 1.5766164747564217,
"grad_norm": 4.301564262537775,
"learning_rate": 4.326539284516989e-06,
"loss": 0.8638,
"step": 8900
},
{
"epoch": 1.5801594331266609,
"grad_norm": 2.0941698417183883,
"learning_rate": 4.322311505001964e-06,
"loss": 0.9186,
"step": 8920
},
{
"epoch": 1.5837023914969,
"grad_norm": 2.2302494709941416,
"learning_rate": 4.318072576220119e-06,
"loss": 0.9041,
"step": 8940
},
{
"epoch": 1.587245349867139,
"grad_norm": 3.6126726071073914,
"learning_rate": 4.31382252410607e-06,
"loss": 0.9306,
"step": 8960
},
{
"epoch": 1.5907883082373782,
"grad_norm": 3.4874125803596265,
"learning_rate": 4.309561374662486e-06,
"loss": 0.9067,
"step": 8980
},
{
"epoch": 1.5943312666076173,
"grad_norm": 3.2939583547971076,
"learning_rate": 4.3052891539599315e-06,
"loss": 0.9511,
"step": 9000
},
{
"epoch": 1.5978742249778564,
"grad_norm": 4.614562529533774,
"learning_rate": 4.301005888136711e-06,
"loss": 0.8829,
"step": 9020
},
{
"epoch": 1.6014171833480957,
"grad_norm": 3.801219562406557,
"learning_rate": 4.2967116033987015e-06,
"loss": 0.912,
"step": 9040
},
{
"epoch": 1.6049601417183348,
"grad_norm": 3.753568798707887,
"learning_rate": 4.292406326019198e-06,
"loss": 0.8699,
"step": 9060
},
{
"epoch": 1.6085031000885741,
"grad_norm": 4.73422889733671,
"learning_rate": 4.288090082338749e-06,
"loss": 0.8836,
"step": 9080
},
{
"epoch": 1.6120460584588132,
"grad_norm": 2.9548744217680873,
"learning_rate": 4.283762898764998e-06,
"loss": 0.8952,
"step": 9100
},
{
"epoch": 1.6155890168290523,
"grad_norm": 3.206506941283786,
"learning_rate": 4.2794248017725226e-06,
"loss": 0.8603,
"step": 9120
},
{
"epoch": 1.6191319751992914,
"grad_norm": 3.530944230336076,
"learning_rate": 4.275075817902667e-06,
"loss": 0.9217,
"step": 9140
},
{
"epoch": 1.6226749335695305,
"grad_norm": 3.203969300098245,
"learning_rate": 4.270715973763387e-06,
"loss": 0.8971,
"step": 9160
},
{
"epoch": 1.6262178919397696,
"grad_norm": 3.0239142090145052,
"learning_rate": 4.2663452960290805e-06,
"loss": 0.9334,
"step": 9180
},
{
"epoch": 1.6297608503100087,
"grad_norm": 2.5777090574009,
"learning_rate": 4.261963811440432e-06,
"loss": 0.8392,
"step": 9200
},
{
"epoch": 1.633303808680248,
"grad_norm": 4.058353343726418,
"learning_rate": 4.25757154680424e-06,
"loss": 0.8933,
"step": 9220
},
{
"epoch": 1.6368467670504872,
"grad_norm": 3.507645687067127,
"learning_rate": 4.253168528993261e-06,
"loss": 0.8899,
"step": 9240
},
{
"epoch": 1.6403897254207263,
"grad_norm": 3.513884615072957,
"learning_rate": 4.248754784946038e-06,
"loss": 0.9113,
"step": 9260
},
{
"epoch": 1.6439326837909656,
"grad_norm": 2.773723856571234,
"learning_rate": 4.244330341666743e-06,
"loss": 0.9056,
"step": 9280
},
{
"epoch": 1.6474756421612047,
"grad_norm": 3.4180543280648243,
"learning_rate": 4.239895226225005e-06,
"loss": 0.8966,
"step": 9300
},
{
"epoch": 1.6510186005314438,
"grad_norm": 3.918770035240026,
"learning_rate": 4.2354494657557485e-06,
"loss": 0.8769,
"step": 9320
},
{
"epoch": 1.6545615589016829,
"grad_norm": 3.372636086492444,
"learning_rate": 4.230993087459028e-06,
"loss": 0.8915,
"step": 9340
},
{
"epoch": 1.658104517271922,
"grad_norm": 3.510054489740414,
"learning_rate": 4.226526118599858e-06,
"loss": 0.9184,
"step": 9360
},
{
"epoch": 1.661647475642161,
"grad_norm": 2.0157440996114526,
"learning_rate": 4.222048586508048e-06,
"loss": 0.9172,
"step": 9380
},
{
"epoch": 1.6651904340124002,
"grad_norm": 3.6068994268498074,
"learning_rate": 4.2175605185780375e-06,
"loss": 0.8873,
"step": 9400
},
{
"epoch": 1.6687333923826395,
"grad_norm": 4.420449955455039,
"learning_rate": 4.213061942268724e-06,
"loss": 0.8436,
"step": 9420
},
{
"epoch": 1.6722763507528786,
"grad_norm": 3.1825503459026168,
"learning_rate": 4.208552885103299e-06,
"loss": 0.8543,
"step": 9440
},
{
"epoch": 1.675819309123118,
"grad_norm": 2.8992183977213535,
"learning_rate": 4.204033374669077e-06,
"loss": 0.8824,
"step": 9460
},
{
"epoch": 1.679362267493357,
"grad_norm": 2.799570684359126,
"learning_rate": 4.19950343861733e-06,
"loss": 0.8671,
"step": 9480
},
{
"epoch": 1.6829052258635961,
"grad_norm": 3.641582622193844,
"learning_rate": 4.194963104663112e-06,
"loss": 0.8628,
"step": 9500
},
{
"epoch": 1.6864481842338352,
"grad_norm": 4.398919078323408,
"learning_rate": 4.1904124005850954e-06,
"loss": 0.9005,
"step": 9520
},
{
"epoch": 1.6899911426040743,
"grad_norm": 2.9512298802973143,
"learning_rate": 4.185851354225401e-06,
"loss": 0.9078,
"step": 9540
},
{
"epoch": 1.6935341009743134,
"grad_norm": 3.8044700286481246,
"learning_rate": 4.181279993489423e-06,
"loss": 0.9168,
"step": 9560
},
{
"epoch": 1.6970770593445526,
"grad_norm": 3.308352478584529,
"learning_rate": 4.176698346345663e-06,
"loss": 0.8434,
"step": 9580
},
{
"epoch": 1.7006200177147919,
"grad_norm": 3.904869182178536,
"learning_rate": 4.1721064408255555e-06,
"loss": 0.9005,
"step": 9600
},
{
"epoch": 1.704162976085031,
"grad_norm": 2.6424522860538615,
"learning_rate": 4.167504305023298e-06,
"loss": 0.9278,
"step": 9620
},
{
"epoch": 1.7077059344552703,
"grad_norm": 5.57177952217328,
"learning_rate": 4.162891967095679e-06,
"loss": 0.8677,
"step": 9640
},
{
"epoch": 1.7112488928255094,
"grad_norm": 2.7234322141678686,
"learning_rate": 4.158269455261906e-06,
"loss": 0.8629,
"step": 9660
},
{
"epoch": 1.7147918511957485,
"grad_norm": 4.837352939716468,
"learning_rate": 4.1536367978034335e-06,
"loss": 0.9231,
"step": 9680
},
{
"epoch": 1.7183348095659876,
"grad_norm": 2.5650760898970653,
"learning_rate": 4.148994023063787e-06,
"loss": 0.91,
"step": 9700
},
{
"epoch": 1.7218777679362267,
"grad_norm": 2.1623432794551216,
"learning_rate": 4.1443411594483915e-06,
"loss": 0.8876,
"step": 9720
},
{
"epoch": 1.7254207263064658,
"grad_norm": 5.619389806185287,
"learning_rate": 4.139678235424399e-06,
"loss": 0.8599,
"step": 9740
},
{
"epoch": 1.728963684676705,
"grad_norm": 4.021341243552535,
"learning_rate": 4.135005279520514e-06,
"loss": 0.9074,
"step": 9760
},
{
"epoch": 1.7325066430469442,
"grad_norm": 3.5635632300979707,
"learning_rate": 4.130322320326816e-06,
"loss": 0.8933,
"step": 9780
},
{
"epoch": 1.7360496014171833,
"grad_norm": 2.540277205876661,
"learning_rate": 4.125629386494587e-06,
"loss": 0.9291,
"step": 9800
},
{
"epoch": 1.7395925597874224,
"grad_norm": 3.7695170204780957,
"learning_rate": 4.120926506736137e-06,
"loss": 0.903,
"step": 9820
},
{
"epoch": 1.7431355181576618,
"grad_norm": 1.914697967954277,
"learning_rate": 4.116213709824625e-06,
"loss": 0.8321,
"step": 9840
},
{
"epoch": 1.7466784765279009,
"grad_norm": 3.4586226366917194,
"learning_rate": 4.111491024593889e-06,
"loss": 0.8858,
"step": 9860
},
{
"epoch": 1.75022143489814,
"grad_norm": 2.2832262533351195,
"learning_rate": 4.10675847993826e-06,
"loss": 0.9102,
"step": 9880
},
{
"epoch": 1.753764393268379,
"grad_norm": 4.02421843778651,
"learning_rate": 4.102016104812396e-06,
"loss": 0.8392,
"step": 9900
},
{
"epoch": 1.7573073516386182,
"grad_norm": 3.890461917932318,
"learning_rate": 4.0972639282311e-06,
"loss": 0.8785,
"step": 9920
},
{
"epoch": 1.7608503100088573,
"grad_norm": 2.297623079323031,
"learning_rate": 4.092501979269137e-06,
"loss": 0.8855,
"step": 9940
},
{
"epoch": 1.7643932683790964,
"grad_norm": 3.6677681482613216,
"learning_rate": 4.087730287061065e-06,
"loss": 0.8625,
"step": 9960
},
{
"epoch": 1.7679362267493357,
"grad_norm": 3.9358016909534395,
"learning_rate": 4.082948880801054e-06,
"loss": 0.833,
"step": 9980
},
{
"epoch": 1.7714791851195748,
"grad_norm": 4.267043354085718,
"learning_rate": 4.078157789742706e-06,
"loss": 0.9039,
"step": 10000
},
{
"epoch": 1.7714791851195748,
"eval_loss": 0.8149307370185852,
"eval_runtime": 369.1142,
"eval_samples_per_second": 25.756,
"eval_steps_per_second": 3.221,
"step": 10000
},
{
"epoch": 1.7750221434898141,
"grad_norm": 3.7339229818305384,
"learning_rate": 4.073357043198874e-06,
"loss": 0.8925,
"step": 10020
},
{
"epoch": 1.7785651018600532,
"grad_norm": 3.428649696183049,
"learning_rate": 4.068546670541487e-06,
"loss": 0.8256,
"step": 10040
},
{
"epoch": 1.7821080602302923,
"grad_norm": 2.561924691977688,
"learning_rate": 4.06372670120137e-06,
"loss": 0.8685,
"step": 10060
},
{
"epoch": 1.7856510186005314,
"grad_norm": 2.2436966577943855,
"learning_rate": 4.05889716466806e-06,
"loss": 0.8773,
"step": 10080
},
{
"epoch": 1.7891939769707705,
"grad_norm": 2.5739823412815395,
"learning_rate": 4.054058090489628e-06,
"loss": 0.9268,
"step": 10100
},
{
"epoch": 1.7927369353410096,
"grad_norm": 3.639815355685089,
"learning_rate": 4.049209508272501e-06,
"loss": 0.901,
"step": 10120
},
{
"epoch": 1.7962798937112487,
"grad_norm": 4.803567565137283,
"learning_rate": 4.044351447681276e-06,
"loss": 0.8509,
"step": 10140
},
{
"epoch": 1.799822852081488,
"grad_norm": 4.961777896889485,
"learning_rate": 4.0394839384385395e-06,
"loss": 0.9093,
"step": 10160
},
{
"epoch": 1.8033658104517272,
"grad_norm": 3.0658650580570823,
"learning_rate": 4.034607010324689e-06,
"loss": 0.8937,
"step": 10180
},
{
"epoch": 1.8069087688219665,
"grad_norm": 3.236704486524868,
"learning_rate": 4.029720693177747e-06,
"loss": 0.8738,
"step": 10200
},
{
"epoch": 1.8104517271922056,
"grad_norm": 4.564091129067981,
"learning_rate": 4.024825016893182e-06,
"loss": 0.8737,
"step": 10220
},
{
"epoch": 1.8139946855624447,
"grad_norm": 2.347358666581137,
"learning_rate": 4.01992001142372e-06,
"loss": 0.8632,
"step": 10240
},
{
"epoch": 1.8175376439326838,
"grad_norm": 3.5395322977603927,
"learning_rate": 4.015005706779169e-06,
"loss": 0.8579,
"step": 10260
},
{
"epoch": 1.821080602302923,
"grad_norm": 2.3411115987360827,
"learning_rate": 4.010082133026229e-06,
"loss": 0.9116,
"step": 10280
},
{
"epoch": 1.824623560673162,
"grad_norm": 5.420278546986004,
"learning_rate": 4.005149320288308e-06,
"loss": 0.9216,
"step": 10300
},
{
"epoch": 1.828166519043401,
"grad_norm": 3.177246662363069,
"learning_rate": 4.000207298745347e-06,
"loss": 0.8348,
"step": 10320
},
{
"epoch": 1.8317094774136404,
"grad_norm": 3.260983474589408,
"learning_rate": 3.995256098633618e-06,
"loss": 0.8853,
"step": 10340
},
{
"epoch": 1.8352524357838795,
"grad_norm": 4.262465852736409,
"learning_rate": 3.9902957502455605e-06,
"loss": 0.8776,
"step": 10360
},
{
"epoch": 1.8387953941541186,
"grad_norm": 4.227525664714203,
"learning_rate": 3.985326283929577e-06,
"loss": 0.8614,
"step": 10380
},
{
"epoch": 1.842338352524358,
"grad_norm": 3.1605952131114625,
"learning_rate": 3.9803477300898574e-06,
"loss": 0.863,
"step": 10400
},
{
"epoch": 1.845881310894597,
"grad_norm": 2.751402878519728,
"learning_rate": 3.975360119186192e-06,
"loss": 0.8683,
"step": 10420
},
{
"epoch": 1.8494242692648362,
"grad_norm": 3.1105411540099714,
"learning_rate": 3.970363481733784e-06,
"loss": 0.9019,
"step": 10440
},
{
"epoch": 1.8529672276350753,
"grad_norm": 2.090165548223681,
"learning_rate": 3.965357848303061e-06,
"loss": 0.9317,
"step": 10460
},
{
"epoch": 1.8565101860053144,
"grad_norm": 2.551059143441233,
"learning_rate": 3.960343249519493e-06,
"loss": 0.8711,
"step": 10480
},
{
"epoch": 1.8600531443755535,
"grad_norm": 4.2607971452767766,
"learning_rate": 3.955319716063397e-06,
"loss": 0.8526,
"step": 10500
},
{
"epoch": 1.8635961027457926,
"grad_norm": 1.8942460590897414,
"learning_rate": 3.950287278669759e-06,
"loss": 0.8988,
"step": 10520
},
{
"epoch": 1.867139061116032,
"grad_norm": 3.6086912268779483,
"learning_rate": 3.945245968128039e-06,
"loss": 0.828,
"step": 10540
},
{
"epoch": 1.870682019486271,
"grad_norm": 2.2171636599062294,
"learning_rate": 3.940195815281984e-06,
"loss": 0.8195,
"step": 10560
},
{
"epoch": 1.8742249778565103,
"grad_norm": 3.3159052435056333,
"learning_rate": 3.935136851029441e-06,
"loss": 0.9019,
"step": 10580
},
{
"epoch": 1.8777679362267494,
"grad_norm": 2.4712113724302998,
"learning_rate": 3.930069106322167e-06,
"loss": 0.867,
"step": 10600
},
{
"epoch": 1.8813108945969885,
"grad_norm": 3.5773386774255473,
"learning_rate": 3.924992612165638e-06,
"loss": 0.9161,
"step": 10620
},
{
"epoch": 1.8848538529672276,
"grad_norm": 3.63341154286338,
"learning_rate": 3.919907399618864e-06,
"loss": 0.9039,
"step": 10640
},
{
"epoch": 1.8883968113374667,
"grad_norm": 2.5738022632421806,
"learning_rate": 3.914813499794193e-06,
"loss": 0.9,
"step": 10660
},
{
"epoch": 1.8919397697077058,
"grad_norm": 3.282128941108443,
"learning_rate": 3.909710943857125e-06,
"loss": 0.8783,
"step": 10680
},
{
"epoch": 1.895482728077945,
"grad_norm": 3.079231075434731,
"learning_rate": 3.904599763026117e-06,
"loss": 0.8829,
"step": 10700
},
{
"epoch": 1.8990256864481843,
"grad_norm": 4.0065620278734055,
"learning_rate": 3.899479988572401e-06,
"loss": 0.9157,
"step": 10720
},
{
"epoch": 1.9025686448184234,
"grad_norm": 4.857718153080153,
"learning_rate": 3.89435165181978e-06,
"loss": 0.8971,
"step": 10740
},
{
"epoch": 1.9061116031886627,
"grad_norm": 5.451024881061867,
"learning_rate": 3.8892147841444465e-06,
"loss": 0.9133,
"step": 10760
},
{
"epoch": 1.9096545615589018,
"grad_norm": 2.9783216918153688,
"learning_rate": 3.884069416974785e-06,
"loss": 0.8671,
"step": 10780
},
{
"epoch": 1.9131975199291409,
"grad_norm": 4.130442740763265,
"learning_rate": 3.878915581791184e-06,
"loss": 0.8812,
"step": 10800
},
{
"epoch": 1.91674047829938,
"grad_norm": 3.087103014469501,
"learning_rate": 3.873753310125838e-06,
"loss": 0.8596,
"step": 10820
},
{
"epoch": 1.920283436669619,
"grad_norm": 3.4980061190534832,
"learning_rate": 3.868582633562561e-06,
"loss": 0.9347,
"step": 10840
},
{
"epoch": 1.9238263950398582,
"grad_norm": 2.7736175156145353,
"learning_rate": 3.863403583736586e-06,
"loss": 0.8216,
"step": 10860
},
{
"epoch": 1.9273693534100973,
"grad_norm": 3.2203528253737184,
"learning_rate": 3.858216192334377e-06,
"loss": 0.9563,
"step": 10880
},
{
"epoch": 1.9309123117803366,
"grad_norm": 3.423660636662842,
"learning_rate": 3.853020491093436e-06,
"loss": 0.9045,
"step": 10900
},
{
"epoch": 1.9344552701505757,
"grad_norm": 4.576810058530241,
"learning_rate": 3.847816511802104e-06,
"loss": 0.8914,
"step": 10920
},
{
"epoch": 1.9379982285208148,
"grad_norm": 4.986380910931554,
"learning_rate": 3.842604286299366e-06,
"loss": 0.9222,
"step": 10940
},
{
"epoch": 1.9415411868910541,
"grad_norm": 5.049582396983484,
"learning_rate": 3.837383846474663e-06,
"loss": 0.8764,
"step": 10960
},
{
"epoch": 1.9450841452612933,
"grad_norm": 3.943843485476312,
"learning_rate": 3.832155224267693e-06,
"loss": 0.8614,
"step": 10980
},
{
"epoch": 1.9486271036315324,
"grad_norm": 4.818067048868772,
"learning_rate": 3.8269184516682114e-06,
"loss": 0.8844,
"step": 11000
},
{
"epoch": 1.9521700620017715,
"grad_norm": 2.6435129006124356,
"learning_rate": 3.821673560715844e-06,
"loss": 0.8859,
"step": 11020
},
{
"epoch": 1.9557130203720106,
"grad_norm": 4.218938965276267,
"learning_rate": 3.816420583499883e-06,
"loss": 0.8694,
"step": 11040
},
{
"epoch": 1.9592559787422497,
"grad_norm": 3.1248706808657993,
"learning_rate": 3.811159552159097e-06,
"loss": 0.8484,
"step": 11060
},
{
"epoch": 1.9627989371124888,
"grad_norm": 3.806523511219131,
"learning_rate": 3.8058904988815274e-06,
"loss": 0.8471,
"step": 11080
},
{
"epoch": 1.966341895482728,
"grad_norm": 3.51824367536095,
"learning_rate": 3.800613455904299e-06,
"loss": 0.9077,
"step": 11100
},
{
"epoch": 1.9698848538529672,
"grad_norm": 3.3138759907728677,
"learning_rate": 3.795328455513418e-06,
"loss": 0.8493,
"step": 11120
},
{
"epoch": 1.9734278122232065,
"grad_norm": 2.581058466584183,
"learning_rate": 3.7900355300435744e-06,
"loss": 0.8834,
"step": 11140
},
{
"epoch": 1.9769707705934456,
"grad_norm": 2.712205874433446,
"learning_rate": 3.7847347118779464e-06,
"loss": 0.8902,
"step": 11160
},
{
"epoch": 1.9805137289636847,
"grad_norm": 1.9869529582535677,
"learning_rate": 3.7794260334480026e-06,
"loss": 0.849,
"step": 11180
},
{
"epoch": 1.9840566873339238,
"grad_norm": 5.413750266589672,
"learning_rate": 3.7741095272333008e-06,
"loss": 0.8644,
"step": 11200
},
{
"epoch": 1.987599645704163,
"grad_norm": 2.941600930177304,
"learning_rate": 3.76878522576129e-06,
"loss": 0.8478,
"step": 11220
},
{
"epoch": 1.991142604074402,
"grad_norm": 4.120948714739335,
"learning_rate": 3.7634531616071137e-06,
"loss": 0.9119,
"step": 11240
},
{
"epoch": 1.9946855624446411,
"grad_norm": 3.1166160914538636,
"learning_rate": 3.758113367393409e-06,
"loss": 0.8953,
"step": 11260
},
{
"epoch": 1.9982285208148804,
"grad_norm": 3.148186400991171,
"learning_rate": 3.7527658757901046e-06,
"loss": 0.8957,
"step": 11280
},
{
"epoch": 2.0017714791851198,
"grad_norm": 4.237349805163588,
"learning_rate": 3.7474107195142273e-06,
"loss": 0.8242,
"step": 11300
},
{
"epoch": 2.005314437555359,
"grad_norm": 3.964803901086038,
"learning_rate": 3.7420479313296964e-06,
"loss": 0.733,
"step": 11320
},
{
"epoch": 2.008857395925598,
"grad_norm": 2.851459294376265,
"learning_rate": 3.7366775440471213e-06,
"loss": 0.8224,
"step": 11340
},
{
"epoch": 2.012400354295837,
"grad_norm": 3.6339236231049847,
"learning_rate": 3.7312995905236105e-06,
"loss": 0.8078,
"step": 11360
},
{
"epoch": 2.015943312666076,
"grad_norm": 2.394977571241479,
"learning_rate": 3.725914103662559e-06,
"loss": 0.7777,
"step": 11380
},
{
"epoch": 2.0194862710363153,
"grad_norm": 3.5149972337594475,
"learning_rate": 3.7205211164134547e-06,
"loss": 0.7742,
"step": 11400
},
{
"epoch": 2.0230292294065544,
"grad_norm": 4.113453512038928,
"learning_rate": 3.7151206617716734e-06,
"loss": 0.7539,
"step": 11420
},
{
"epoch": 2.0265721877767935,
"grad_norm": 3.2750961089816353,
"learning_rate": 3.709712772778279e-06,
"loss": 0.7788,
"step": 11440
},
{
"epoch": 2.0301151461470326,
"grad_norm": 4.035570508399602,
"learning_rate": 3.70429748251982e-06,
"loss": 0.7829,
"step": 11460
},
{
"epoch": 2.033658104517272,
"grad_norm": 4.436303471963281,
"learning_rate": 3.698874824128126e-06,
"loss": 0.7311,
"step": 11480
},
{
"epoch": 2.0372010628875112,
"grad_norm": 5.818707534945472,
"learning_rate": 3.693444830780107e-06,
"loss": 0.773,
"step": 11500
},
{
"epoch": 2.0407440212577503,
"grad_norm": 3.540550130014629,
"learning_rate": 3.6880075356975515e-06,
"loss": 0.7814,
"step": 11520
},
{
"epoch": 2.0442869796279894,
"grad_norm": 3.6346205822234943,
"learning_rate": 3.6825629721469188e-06,
"loss": 0.8135,
"step": 11540
},
{
"epoch": 2.0478299379982285,
"grad_norm": 2.4682825796443475,
"learning_rate": 3.6771111734391397e-06,
"loss": 0.745,
"step": 11560
},
{
"epoch": 2.0513728963684676,
"grad_norm": 2.7295024854988195,
"learning_rate": 3.6716521729294104e-06,
"loss": 0.7792,
"step": 11580
},
{
"epoch": 2.0549158547387067,
"grad_norm": 3.3999116860452525,
"learning_rate": 3.66618600401699e-06,
"loss": 0.7841,
"step": 11600
},
{
"epoch": 2.058458813108946,
"grad_norm": 1.979950381057992,
"learning_rate": 3.660712700144995e-06,
"loss": 0.7577,
"step": 11620
},
{
"epoch": 2.062001771479185,
"grad_norm": 2.961231731307014,
"learning_rate": 3.655232294800194e-06,
"loss": 0.8112,
"step": 11640
},
{
"epoch": 2.065544729849424,
"grad_norm": 2.9168214431871546,
"learning_rate": 3.6497448215128054e-06,
"loss": 0.7407,
"step": 11660
},
{
"epoch": 2.0690876882196636,
"grad_norm": 3.3537918642406814,
"learning_rate": 3.6442503138562902e-06,
"loss": 0.7432,
"step": 11680
},
{
"epoch": 2.0726306465899027,
"grad_norm": 2.885605458154861,
"learning_rate": 3.638748805447146e-06,
"loss": 0.7657,
"step": 11700
},
{
"epoch": 2.076173604960142,
"grad_norm": 4.064439601513717,
"learning_rate": 3.6332403299447046e-06,
"loss": 0.7374,
"step": 11720
},
{
"epoch": 2.079716563330381,
"grad_norm": 4.214372884804176,
"learning_rate": 3.6277249210509208e-06,
"loss": 0.758,
"step": 11740
},
{
"epoch": 2.08325952170062,
"grad_norm": 3.8146529692577937,
"learning_rate": 3.6222026125101717e-06,
"loss": 0.7635,
"step": 11760
},
{
"epoch": 2.086802480070859,
"grad_norm": 4.249159301555663,
"learning_rate": 3.6166734381090483e-06,
"loss": 0.7928,
"step": 11780
},
{
"epoch": 2.090345438441098,
"grad_norm": 5.7403075464380375,
"learning_rate": 3.611137431676146e-06,
"loss": 0.7451,
"step": 11800
},
{
"epoch": 2.0938883968113373,
"grad_norm": 3.6236953170629667,
"learning_rate": 3.605594627081861e-06,
"loss": 0.7332,
"step": 11820
},
{
"epoch": 2.0974313551815764,
"grad_norm": 3.240921081992807,
"learning_rate": 3.6000450582381823e-06,
"loss": 0.75,
"step": 11840
},
{
"epoch": 2.100974313551816,
"grad_norm": 4.877179694529326,
"learning_rate": 3.5944887590984846e-06,
"loss": 0.7824,
"step": 11860
},
{
"epoch": 2.104517271922055,
"grad_norm": 2.2176442012470576,
"learning_rate": 3.5889257636573183e-06,
"loss": 0.7936,
"step": 11880
},
{
"epoch": 2.108060230292294,
"grad_norm": 4.221294517545139,
"learning_rate": 3.583356105950203e-06,
"loss": 0.7548,
"step": 11900
},
{
"epoch": 2.1116031886625333,
"grad_norm": 3.8503143338096786,
"learning_rate": 3.5777798200534214e-06,
"loss": 0.7967,
"step": 11920
},
{
"epoch": 2.1151461470327724,
"grad_norm": 3.1910071043427326,
"learning_rate": 3.5721969400838073e-06,
"loss": 0.7786,
"step": 11940
},
{
"epoch": 2.1186891054030115,
"grad_norm": 2.860111918382593,
"learning_rate": 3.5666075001985386e-06,
"loss": 0.7517,
"step": 11960
},
{
"epoch": 2.1222320637732506,
"grad_norm": 2.0905030982179134,
"learning_rate": 3.561011534594928e-06,
"loss": 0.7558,
"step": 11980
},
{
"epoch": 2.1257750221434897,
"grad_norm": 5.060645708453129,
"learning_rate": 3.555409077510215e-06,
"loss": 0.7414,
"step": 12000
},
{
"epoch": 2.1257750221434897,
"eval_loss": 0.8194052577018738,
"eval_runtime": 368.7991,
"eval_samples_per_second": 25.778,
"eval_steps_per_second": 3.224,
"step": 12000
},
{
"epoch": 2.129317980513729,
"grad_norm": 3.6709097863394655,
"learning_rate": 3.549800163221353e-06,
"loss": 0.7369,
"step": 12020
},
{
"epoch": 2.132860938883968,
"grad_norm": 4.6404940480464125,
"learning_rate": 3.5441848260448035e-06,
"loss": 0.7919,
"step": 12040
},
{
"epoch": 2.1364038972542074,
"grad_norm": 3.4473691964125353,
"learning_rate": 3.5385631003363245e-06,
"loss": 0.7841,
"step": 12060
},
{
"epoch": 2.1399468556244465,
"grad_norm": 4.36997053485404,
"learning_rate": 3.532935020490761e-06,
"loss": 0.7681,
"step": 12080
},
{
"epoch": 2.1434898139946856,
"grad_norm": 3.642775603252494,
"learning_rate": 3.5273006209418297e-06,
"loss": 0.7377,
"step": 12100
},
{
"epoch": 2.1470327723649247,
"grad_norm": 2.9088288240530806,
"learning_rate": 3.5216599361619193e-06,
"loss": 0.7356,
"step": 12120
},
{
"epoch": 2.150575730735164,
"grad_norm": 4.94624315541774,
"learning_rate": 3.5160130006618665e-06,
"loss": 0.7688,
"step": 12140
},
{
"epoch": 2.154118689105403,
"grad_norm": 3.431747470285621,
"learning_rate": 3.5103598489907553e-06,
"loss": 0.7322,
"step": 12160
},
{
"epoch": 2.157661647475642,
"grad_norm": 3.5652670345700876,
"learning_rate": 3.5047005157357e-06,
"loss": 0.7752,
"step": 12180
},
{
"epoch": 2.161204605845881,
"grad_norm": 4.315860117945172,
"learning_rate": 3.4990350355216347e-06,
"loss": 0.7443,
"step": 12200
},
{
"epoch": 2.1647475642161202,
"grad_norm": 3.963300128138939,
"learning_rate": 3.493363443011102e-06,
"loss": 0.7595,
"step": 12220
},
{
"epoch": 2.16829052258636,
"grad_norm": 3.3278580876056623,
"learning_rate": 3.487685772904041e-06,
"loss": 0.7573,
"step": 12240
},
{
"epoch": 2.171833480956599,
"grad_norm": 3.6502088387557516,
"learning_rate": 3.4820020599375755e-06,
"loss": 0.7675,
"step": 12260
},
{
"epoch": 2.175376439326838,
"grad_norm": 2.110435980731087,
"learning_rate": 3.476312338885799e-06,
"loss": 0.7659,
"step": 12280
},
{
"epoch": 2.178919397697077,
"grad_norm": 3.132585579346833,
"learning_rate": 3.4706166445595657e-06,
"loss": 0.7691,
"step": 12300
},
{
"epoch": 2.182462356067316,
"grad_norm": 5.677900838287172,
"learning_rate": 3.4649150118062737e-06,
"loss": 0.7543,
"step": 12320
},
{
"epoch": 2.1860053144375553,
"grad_norm": 3.5849859798668047,
"learning_rate": 3.4592074755096533e-06,
"loss": 0.7485,
"step": 12340
},
{
"epoch": 2.1895482728077944,
"grad_norm": 2.897975653877466,
"learning_rate": 3.453494070589556e-06,
"loss": 0.741,
"step": 12360
},
{
"epoch": 2.1930912311780335,
"grad_norm": 2.300880578954949,
"learning_rate": 3.4477748320017386e-06,
"loss": 0.7245,
"step": 12380
},
{
"epoch": 2.1966341895482726,
"grad_norm": 3.4934403676076213,
"learning_rate": 3.442049794737647e-06,
"loss": 0.7645,
"step": 12400
},
{
"epoch": 2.200177147918512,
"grad_norm": 3.0309935521375695,
"learning_rate": 3.436318993824206e-06,
"loss": 0.7822,
"step": 12420
},
{
"epoch": 2.2037201062887513,
"grad_norm": 5.162246999416868,
"learning_rate": 3.430582464323603e-06,
"loss": 0.7638,
"step": 12440
},
{
"epoch": 2.2072630646589904,
"grad_norm": 4.3206905241817175,
"learning_rate": 3.4248402413330766e-06,
"loss": 0.7872,
"step": 12460
},
{
"epoch": 2.2108060230292295,
"grad_norm": 4.236342233174995,
"learning_rate": 3.419092359984695e-06,
"loss": 0.7546,
"step": 12480
},
{
"epoch": 2.2143489813994686,
"grad_norm": 2.8151366817841756,
"learning_rate": 3.41333885544515e-06,
"loss": 0.7635,
"step": 12500
},
{
"epoch": 2.2178919397697077,
"grad_norm": 3.2839030456741978,
"learning_rate": 3.4075797629155336e-06,
"loss": 0.7588,
"step": 12520
},
{
"epoch": 2.2214348981399468,
"grad_norm": 3.0947148397280997,
"learning_rate": 3.4018151176311267e-06,
"loss": 0.7277,
"step": 12540
},
{
"epoch": 2.224977856510186,
"grad_norm": 3.5428621372363063,
"learning_rate": 3.396044954861185e-06,
"loss": 0.7679,
"step": 12560
},
{
"epoch": 2.228520814880425,
"grad_norm": 2.02419126865859,
"learning_rate": 3.39026930990872e-06,
"loss": 0.7446,
"step": 12580
},
{
"epoch": 2.2320637732506645,
"grad_norm": 4.0306736526937765,
"learning_rate": 3.384488218110285e-06,
"loss": 0.7599,
"step": 12600
},
{
"epoch": 2.2356067316209036,
"grad_norm": 3.3079541839461606,
"learning_rate": 3.378701714835756e-06,
"loss": 0.7325,
"step": 12620
},
{
"epoch": 2.2391496899911427,
"grad_norm": 3.6500019106828754,
"learning_rate": 3.3729098354881207e-06,
"loss": 0.7834,
"step": 12640
},
{
"epoch": 2.242692648361382,
"grad_norm": 2.9776776199055073,
"learning_rate": 3.367112615503256e-06,
"loss": 0.7479,
"step": 12660
},
{
"epoch": 2.246235606731621,
"grad_norm": 2.8136251511132,
"learning_rate": 3.3613100903497165e-06,
"loss": 0.7972,
"step": 12680
},
{
"epoch": 2.24977856510186,
"grad_norm": 2.6979166100675633,
"learning_rate": 3.355502295528512e-06,
"loss": 0.785,
"step": 12700
},
{
"epoch": 2.253321523472099,
"grad_norm": 2.555788234192105,
"learning_rate": 3.349689266572896e-06,
"loss": 0.7337,
"step": 12720
},
{
"epoch": 2.2568644818423382,
"grad_norm": 4.085833264183358,
"learning_rate": 3.3438710390481423e-06,
"loss": 0.7795,
"step": 12740
},
{
"epoch": 2.2604074402125773,
"grad_norm": 4.190825689674867,
"learning_rate": 3.338047648551333e-06,
"loss": 0.7946,
"step": 12760
},
{
"epoch": 2.263950398582817,
"grad_norm": 3.1997981806226496,
"learning_rate": 3.3322191307111386e-06,
"loss": 0.7573,
"step": 12780
},
{
"epoch": 2.267493356953056,
"grad_norm": 3.0496961823219237,
"learning_rate": 3.326385521187598e-06,
"loss": 0.7191,
"step": 12800
},
{
"epoch": 2.271036315323295,
"grad_norm": 2.3669933953301996,
"learning_rate": 3.320546855671903e-06,
"loss": 0.7787,
"step": 12820
},
{
"epoch": 2.274579273693534,
"grad_norm": 4.128804064988176,
"learning_rate": 3.3147031698861783e-06,
"loss": 0.8122,
"step": 12840
},
{
"epoch": 2.2781222320637733,
"grad_norm": 3.6541280879376288,
"learning_rate": 3.308854499583265e-06,
"loss": 0.8089,
"step": 12860
},
{
"epoch": 2.2816651904340124,
"grad_norm": 3.9405511693030513,
"learning_rate": 3.3030008805464987e-06,
"loss": 0.7806,
"step": 12880
},
{
"epoch": 2.2852081488042515,
"grad_norm": 2.7534555896912183,
"learning_rate": 3.297142348589493e-06,
"loss": 0.7826,
"step": 12900
},
{
"epoch": 2.2887511071744906,
"grad_norm": 3.3797355039239956,
"learning_rate": 3.2912789395559226e-06,
"loss": 0.8049,
"step": 12920
},
{
"epoch": 2.2922940655447297,
"grad_norm": 3.9407118834982677,
"learning_rate": 3.285410689319295e-06,
"loss": 0.7897,
"step": 12940
},
{
"epoch": 2.2958370239149692,
"grad_norm": 2.4540627878279713,
"learning_rate": 3.2795376337827416e-06,
"loss": 0.7869,
"step": 12960
},
{
"epoch": 2.299379982285208,
"grad_norm": 5.328758496807459,
"learning_rate": 3.273659808878794e-06,
"loss": 0.7567,
"step": 12980
},
{
"epoch": 2.3029229406554474,
"grad_norm": 2.939528537204539,
"learning_rate": 3.2677772505691614e-06,
"loss": 0.7337,
"step": 13000
},
{
"epoch": 2.3064658990256866,
"grad_norm": 2.1870838009976206,
"learning_rate": 3.2618899948445143e-06,
"loss": 0.8036,
"step": 13020
},
{
"epoch": 2.3100088573959257,
"grad_norm": 2.3749989783363037,
"learning_rate": 3.255998077724261e-06,
"loss": 0.7477,
"step": 13040
},
{
"epoch": 2.3135518157661648,
"grad_norm": 4.691180192385719,
"learning_rate": 3.250101535256333e-06,
"loss": 0.7982,
"step": 13060
},
{
"epoch": 2.317094774136404,
"grad_norm": 3.160712448911439,
"learning_rate": 3.2442004035169566e-06,
"loss": 0.7429,
"step": 13080
},
{
"epoch": 2.320637732506643,
"grad_norm": 3.397209885685089,
"learning_rate": 3.2382947186104385e-06,
"loss": 0.7749,
"step": 13100
},
{
"epoch": 2.324180690876882,
"grad_norm": 3.15455456395674,
"learning_rate": 3.232384516668943e-06,
"loss": 0.7693,
"step": 13120
},
{
"epoch": 2.327723649247121,
"grad_norm": 5.565654365935215,
"learning_rate": 3.2264698338522664e-06,
"loss": 0.772,
"step": 13140
},
{
"epoch": 2.3312666076173603,
"grad_norm": 4.698072365225265,
"learning_rate": 3.2205507063476255e-06,
"loss": 0.7808,
"step": 13160
},
{
"epoch": 2.3348095659876,
"grad_norm": 3.59501906125545,
"learning_rate": 3.2146271703694277e-06,
"loss": 0.7505,
"step": 13180
},
{
"epoch": 2.338352524357839,
"grad_norm": 3.679313411486785,
"learning_rate": 3.208699262159052e-06,
"loss": 0.7336,
"step": 13200
},
{
"epoch": 2.341895482728078,
"grad_norm": 2.8791170352318276,
"learning_rate": 3.2027670179846294e-06,
"loss": 0.7307,
"step": 13220
},
{
"epoch": 2.345438441098317,
"grad_norm": 4.059760879698832,
"learning_rate": 3.196830474140816e-06,
"loss": 0.753,
"step": 13240
},
{
"epoch": 2.348981399468556,
"grad_norm": 2.996057141115257,
"learning_rate": 3.190889666948579e-06,
"loss": 0.7399,
"step": 13260
},
{
"epoch": 2.3525243578387953,
"grad_norm": 4.234482920426088,
"learning_rate": 3.184944632754964e-06,
"loss": 0.7904,
"step": 13280
},
{
"epoch": 2.3560673162090344,
"grad_norm": 2.3964341925650463,
"learning_rate": 3.1789954079328835e-06,
"loss": 0.7534,
"step": 13300
},
{
"epoch": 2.3596102745792735,
"grad_norm": 2.9159272754142918,
"learning_rate": 3.1730420288808862e-06,
"loss": 0.7834,
"step": 13320
},
{
"epoch": 2.3631532329495126,
"grad_norm": 3.781684460395673,
"learning_rate": 3.1670845320229355e-06,
"loss": 0.771,
"step": 13340
},
{
"epoch": 2.366696191319752,
"grad_norm": 2.826416108672929,
"learning_rate": 3.161122953808192e-06,
"loss": 0.7354,
"step": 13360
},
{
"epoch": 2.3702391496899913,
"grad_norm": 5.104935252190013,
"learning_rate": 3.1551573307107867e-06,
"loss": 0.7503,
"step": 13380
},
{
"epoch": 2.3737821080602304,
"grad_norm": 4.124050748848566,
"learning_rate": 3.149187699229595e-06,
"loss": 0.7191,
"step": 13400
},
{
"epoch": 2.3773250664304695,
"grad_norm": 4.433917574735756,
"learning_rate": 3.1432140958880186e-06,
"loss": 0.7036,
"step": 13420
},
{
"epoch": 2.3808680248007086,
"grad_norm": 6.0333723611681975,
"learning_rate": 3.1372365572337592e-06,
"loss": 0.6947,
"step": 13440
},
{
"epoch": 2.3844109831709477,
"grad_norm": 3.496479107809089,
"learning_rate": 3.1312551198385964e-06,
"loss": 0.8186,
"step": 13460
},
{
"epoch": 2.387953941541187,
"grad_norm": 2.9985043132815608,
"learning_rate": 3.1252698202981613e-06,
"loss": 0.762,
"step": 13480
},
{
"epoch": 2.391496899911426,
"grad_norm": 3.5778965231230733,
"learning_rate": 3.1192806952317155e-06,
"loss": 0.7475,
"step": 13500
},
{
"epoch": 2.395039858281665,
"grad_norm": 2.4853017301906046,
"learning_rate": 3.113287781281927e-06,
"loss": 0.7673,
"step": 13520
},
{
"epoch": 2.3985828166519045,
"grad_norm": 4.385979644321999,
"learning_rate": 3.107291115114643e-06,
"loss": 0.7664,
"step": 13540
},
{
"epoch": 2.4021257750221436,
"grad_norm": 3.519870816810653,
"learning_rate": 3.1012907334186676e-06,
"loss": 0.7708,
"step": 13560
},
{
"epoch": 2.4056687333923827,
"grad_norm": 3.6501676511740913,
"learning_rate": 3.09528667290554e-06,
"loss": 0.7354,
"step": 13580
},
{
"epoch": 2.409211691762622,
"grad_norm": 5.282285428849241,
"learning_rate": 3.0892789703093025e-06,
"loss": 0.7679,
"step": 13600
},
{
"epoch": 2.412754650132861,
"grad_norm": 4.970772234231322,
"learning_rate": 3.0832676623862847e-06,
"loss": 0.7753,
"step": 13620
},
{
"epoch": 2.4162976085031,
"grad_norm": 5.300930589688842,
"learning_rate": 3.0772527859148726e-06,
"loss": 0.7309,
"step": 13640
},
{
"epoch": 2.419840566873339,
"grad_norm": 3.195715973824964,
"learning_rate": 3.0712343776952845e-06,
"loss": 0.8118,
"step": 13660
},
{
"epoch": 2.4233835252435783,
"grad_norm": 3.8485686008854025,
"learning_rate": 3.0652124745493483e-06,
"loss": 0.7677,
"step": 13680
},
{
"epoch": 2.4269264836138174,
"grad_norm": 3.6307249326313844,
"learning_rate": 3.0591871133202733e-06,
"loss": 0.7562,
"step": 13700
},
{
"epoch": 2.430469441984057,
"grad_norm": 3.316064861185543,
"learning_rate": 3.0531583308724267e-06,
"loss": 0.7626,
"step": 13720
},
{
"epoch": 2.434012400354296,
"grad_norm": 6.634549317490503,
"learning_rate": 3.0471261640911065e-06,
"loss": 0.758,
"step": 13740
},
{
"epoch": 2.437555358724535,
"grad_norm": 3.175037432084709,
"learning_rate": 3.0410906498823176e-06,
"loss": 0.747,
"step": 13760
},
{
"epoch": 2.441098317094774,
"grad_norm": 4.104765159107976,
"learning_rate": 3.0350518251725466e-06,
"loss": 0.7529,
"step": 13780
},
{
"epoch": 2.4446412754650133,
"grad_norm": 4.108216250616023,
"learning_rate": 3.02900972690853e-06,
"loss": 0.7329,
"step": 13800
},
{
"epoch": 2.4481842338352524,
"grad_norm": 5.179386294221434,
"learning_rate": 3.0229643920570368e-06,
"loss": 0.7756,
"step": 13820
},
{
"epoch": 2.4517271922054915,
"grad_norm": 3.532213415891956,
"learning_rate": 3.0169158576046364e-06,
"loss": 0.7857,
"step": 13840
},
{
"epoch": 2.4552701505757306,
"grad_norm": 3.1668720104134898,
"learning_rate": 3.0108641605574746e-06,
"loss": 0.7689,
"step": 13860
},
{
"epoch": 2.4588131089459697,
"grad_norm": 5.323518128860221,
"learning_rate": 3.0048093379410455e-06,
"loss": 0.7193,
"step": 13880
},
{
"epoch": 2.4623560673162093,
"grad_norm": 2.8074163772217346,
"learning_rate": 2.998751426799967e-06,
"loss": 0.7663,
"step": 13900
},
{
"epoch": 2.4658990256864484,
"grad_norm": 3.3433870569143402,
"learning_rate": 2.9926904641977524e-06,
"loss": 0.7351,
"step": 13920
},
{
"epoch": 2.4694419840566875,
"grad_norm": 4.317693104328955,
"learning_rate": 2.986626487216586e-06,
"loss": 0.7303,
"step": 13940
},
{
"epoch": 2.4729849424269266,
"grad_norm": 2.9507964917970093,
"learning_rate": 2.9805595329570926e-06,
"loss": 0.7355,
"step": 13960
},
{
"epoch": 2.4765279007971657,
"grad_norm": 5.5399797509230035,
"learning_rate": 2.974489638538115e-06,
"loss": 0.7673,
"step": 13980
},
{
"epoch": 2.4800708591674048,
"grad_norm": 6.341760350289788,
"learning_rate": 2.9684168410964815e-06,
"loss": 0.7332,
"step": 14000
},
{
"epoch": 2.4800708591674048,
"eval_loss": 0.8049691915512085,
"eval_runtime": 366.4293,
"eval_samples_per_second": 25.945,
"eval_steps_per_second": 3.245,
"step": 14000
},
{
"epoch": 2.483613817537644,
"grad_norm": 3.362490165226843,
"learning_rate": 2.9623411777867845e-06,
"loss": 0.8132,
"step": 14020
},
{
"epoch": 2.487156775907883,
"grad_norm": 3.067701488951309,
"learning_rate": 2.9562626857811486e-06,
"loss": 0.73,
"step": 14040
},
{
"epoch": 2.490699734278122,
"grad_norm": 5.481445388434678,
"learning_rate": 2.950181402269007e-06,
"loss": 0.7727,
"step": 14060
},
{
"epoch": 2.4942426926483616,
"grad_norm": 3.704158248514089,
"learning_rate": 2.944097364456867e-06,
"loss": 0.7594,
"step": 14080
},
{
"epoch": 2.4977856510186003,
"grad_norm": 3.703332272719033,
"learning_rate": 2.9380106095680943e-06,
"loss": 0.7816,
"step": 14100
},
{
"epoch": 2.50132860938884,
"grad_norm": 5.075701987314385,
"learning_rate": 2.931921174842672e-06,
"loss": 0.8127,
"step": 14120
},
{
"epoch": 2.504871567759079,
"grad_norm": 3.3757570715718126,
"learning_rate": 2.925829097536983e-06,
"loss": 0.7594,
"step": 14140
},
{
"epoch": 2.508414526129318,
"grad_norm": 5.003414946307021,
"learning_rate": 2.9197344149235762e-06,
"loss": 0.802,
"step": 14160
},
{
"epoch": 2.511957484499557,
"grad_norm": 3.883907931049023,
"learning_rate": 2.9136371642909406e-06,
"loss": 0.7292,
"step": 14180
},
{
"epoch": 2.5155004428697962,
"grad_norm": 4.7644188631188555,
"learning_rate": 2.9075373829432766e-06,
"loss": 0.7899,
"step": 14200
},
{
"epoch": 2.5190434012400353,
"grad_norm": 3.1005288937544795,
"learning_rate": 2.901435108200269e-06,
"loss": 0.7501,
"step": 14220
},
{
"epoch": 2.5225863596102744,
"grad_norm": 4.202314740845049,
"learning_rate": 2.8953303773968566e-06,
"loss": 0.733,
"step": 14240
},
{
"epoch": 2.526129317980514,
"grad_norm": 3.0865315047496256,
"learning_rate": 2.889223227883006e-06,
"loss": 0.7218,
"step": 14260
},
{
"epoch": 2.5296722763507526,
"grad_norm": 3.687380874866973,
"learning_rate": 2.8831136970234798e-06,
"loss": 0.7539,
"step": 14280
},
{
"epoch": 2.533215234720992,
"grad_norm": 4.787195673053088,
"learning_rate": 2.8770018221976126e-06,
"loss": 0.7733,
"step": 14300
},
{
"epoch": 2.5367581930912313,
"grad_norm": 3.131210531061707,
"learning_rate": 2.8708876407990794e-06,
"loss": 0.8023,
"step": 14320
},
{
"epoch": 2.5403011514614704,
"grad_norm": 6.575229105312035,
"learning_rate": 2.8647711902356653e-06,
"loss": 0.7857,
"step": 14340
},
{
"epoch": 2.5438441098317095,
"grad_norm": 3.132660461857889,
"learning_rate": 2.858652507929042e-06,
"loss": 0.6994,
"step": 14360
},
{
"epoch": 2.5473870682019486,
"grad_norm": 4.422023062404442,
"learning_rate": 2.852531631314531e-06,
"loss": 0.7629,
"step": 14380
},
{
"epoch": 2.5509300265721877,
"grad_norm": 6.144792250633855,
"learning_rate": 2.846408597840884e-06,
"loss": 0.8015,
"step": 14400
},
{
"epoch": 2.554472984942427,
"grad_norm": 4.254153197416055,
"learning_rate": 2.8402834449700444e-06,
"loss": 0.8166,
"step": 14420
},
{
"epoch": 2.5580159433126664,
"grad_norm": 4.276352369102927,
"learning_rate": 2.8341562101769258e-06,
"loss": 0.7488,
"step": 14440
},
{
"epoch": 2.561558901682905,
"grad_norm": 2.7388450962012008,
"learning_rate": 2.8280269309491783e-06,
"loss": 0.731,
"step": 14460
},
{
"epoch": 2.5651018600531446,
"grad_norm": 3.155033747973735,
"learning_rate": 2.821895644786958e-06,
"loss": 0.7601,
"step": 14480
},
{
"epoch": 2.5686448184233837,
"grad_norm": 3.8509609959663242,
"learning_rate": 2.815762389202703e-06,
"loss": 0.7773,
"step": 14500
},
{
"epoch": 2.5721877767936228,
"grad_norm": 2.7079281092995617,
"learning_rate": 2.8096272017208996e-06,
"loss": 0.6832,
"step": 14520
},
{
"epoch": 2.575730735163862,
"grad_norm": 4.486116320592051,
"learning_rate": 2.8034901198778537e-06,
"loss": 0.7792,
"step": 14540
},
{
"epoch": 2.579273693534101,
"grad_norm": 3.703073425582054,
"learning_rate": 2.7973511812214614e-06,
"loss": 0.7297,
"step": 14560
},
{
"epoch": 2.58281665190434,
"grad_norm": 4.327931475336945,
"learning_rate": 2.79121042331098e-06,
"loss": 0.7332,
"step": 14580
},
{
"epoch": 2.586359610274579,
"grad_norm": 4.730270311129735,
"learning_rate": 2.7850678837167943e-06,
"loss": 0.7537,
"step": 14600
},
{
"epoch": 2.5899025686448183,
"grad_norm": 4.061197200463917,
"learning_rate": 2.778923600020193e-06,
"loss": 0.7364,
"step": 14620
},
{
"epoch": 2.5934455270150574,
"grad_norm": 5.875022772190702,
"learning_rate": 2.7727776098131355e-06,
"loss": 0.763,
"step": 14640
},
{
"epoch": 2.596988485385297,
"grad_norm": 4.2525211564332155,
"learning_rate": 2.76662995069802e-06,
"loss": 0.7106,
"step": 14660
},
{
"epoch": 2.600531443755536,
"grad_norm": 2.812095781009687,
"learning_rate": 2.760480660287457e-06,
"loss": 0.7964,
"step": 14680
},
{
"epoch": 2.604074402125775,
"grad_norm": 2.982535796988278,
"learning_rate": 2.7543297762040367e-06,
"loss": 0.7471,
"step": 14700
},
{
"epoch": 2.6076173604960142,
"grad_norm": 3.3609650163002285,
"learning_rate": 2.748177336080099e-06,
"loss": 0.7404,
"step": 14720
},
{
"epoch": 2.6111603188662533,
"grad_norm": 3.3659754899747183,
"learning_rate": 2.7420233775575062e-06,
"loss": 0.7088,
"step": 14740
},
{
"epoch": 2.6147032772364924,
"grad_norm": 4.8930238508897865,
"learning_rate": 2.73586793828741e-06,
"loss": 0.8097,
"step": 14760
},
{
"epoch": 2.6182462356067315,
"grad_norm": 3.7139806486443523,
"learning_rate": 2.7297110559300196e-06,
"loss": 0.726,
"step": 14780
},
{
"epoch": 2.6217891939769706,
"grad_norm": 5.786625818280847,
"learning_rate": 2.7235527681543745e-06,
"loss": 0.7663,
"step": 14800
},
{
"epoch": 2.6253321523472097,
"grad_norm": 2.4715288407192917,
"learning_rate": 2.717393112638113e-06,
"loss": 0.8067,
"step": 14820
},
{
"epoch": 2.6288751107174493,
"grad_norm": 2.48294124433425,
"learning_rate": 2.7112321270672427e-06,
"loss": 0.7436,
"step": 14840
},
{
"epoch": 2.6324180690876884,
"grad_norm": 4.533346896730641,
"learning_rate": 2.705069849135905e-06,
"loss": 0.7542,
"step": 14860
},
{
"epoch": 2.6359610274579275,
"grad_norm": 2.595943549639139,
"learning_rate": 2.698906316546154e-06,
"loss": 0.7206,
"step": 14880
},
{
"epoch": 2.6395039858281666,
"grad_norm": 4.011814531987077,
"learning_rate": 2.6927415670077133e-06,
"loss": 0.7981,
"step": 14900
},
{
"epoch": 2.6430469441984057,
"grad_norm": 3.060330539530146,
"learning_rate": 2.6865756382377577e-06,
"loss": 0.7805,
"step": 14920
},
{
"epoch": 2.646589902568645,
"grad_norm": 2.9995394478553803,
"learning_rate": 2.6804085679606735e-06,
"loss": 0.7601,
"step": 14940
},
{
"epoch": 2.650132860938884,
"grad_norm": 5.280778921498777,
"learning_rate": 2.674240393907832e-06,
"loss": 0.7646,
"step": 14960
},
{
"epoch": 2.653675819309123,
"grad_norm": 3.798446112260494,
"learning_rate": 2.6680711538173595e-06,
"loss": 0.7871,
"step": 14980
},
{
"epoch": 2.657218777679362,
"grad_norm": 5.01912287250632,
"learning_rate": 2.661900885433899e-06,
"loss": 0.745,
"step": 15000
},
{
"epoch": 2.6607617360496016,
"grad_norm": 2.960773034701044,
"learning_rate": 2.6557296265083917e-06,
"loss": 0.7822,
"step": 15020
},
{
"epoch": 2.6643046944198403,
"grad_norm": 3.6183436876712287,
"learning_rate": 2.649557414797834e-06,
"loss": 0.7811,
"step": 15040
},
{
"epoch": 2.66784765279008,
"grad_norm": 2.9204062861952242,
"learning_rate": 2.6433842880650552e-06,
"loss": 0.7684,
"step": 15060
},
{
"epoch": 2.671390611160319,
"grad_norm": 3.8802496727785587,
"learning_rate": 2.63721028407848e-06,
"loss": 0.6913,
"step": 15080
},
{
"epoch": 2.674933569530558,
"grad_norm": 2.5788899500499554,
"learning_rate": 2.6310354406119022e-06,
"loss": 0.7309,
"step": 15100
},
{
"epoch": 2.678476527900797,
"grad_norm": 4.277286678336026,
"learning_rate": 2.6248597954442493e-06,
"loss": 0.7644,
"step": 15120
},
{
"epoch": 2.6820194862710363,
"grad_norm": 3.188282097160713,
"learning_rate": 2.6186833863593576e-06,
"loss": 0.7619,
"step": 15140
},
{
"epoch": 2.6855624446412754,
"grad_norm": 2.200272309526565,
"learning_rate": 2.6125062511457344e-06,
"loss": 0.7518,
"step": 15160
},
{
"epoch": 2.6891054030115145,
"grad_norm": 4.286800017508891,
"learning_rate": 2.6063284275963296e-06,
"loss": 0.7551,
"step": 15180
},
{
"epoch": 2.692648361381754,
"grad_norm": 3.9692599529255337,
"learning_rate": 2.6001499535083067e-06,
"loss": 0.7885,
"step": 15200
},
{
"epoch": 2.6961913197519927,
"grad_norm": 3.3386892906925203,
"learning_rate": 2.593970866682806e-06,
"loss": 0.7603,
"step": 15220
},
{
"epoch": 2.699734278122232,
"grad_norm": 3.5522070573455298,
"learning_rate": 2.5877912049247206e-06,
"loss": 0.7833,
"step": 15240
},
{
"epoch": 2.7032772364924713,
"grad_norm": 3.6763130005746327,
"learning_rate": 2.5816110060424566e-06,
"loss": 0.7451,
"step": 15260
},
{
"epoch": 2.7068201948627104,
"grad_norm": 4.065296942712161,
"learning_rate": 2.57543030784771e-06,
"loss": 0.7856,
"step": 15280
},
{
"epoch": 2.7103631532329495,
"grad_norm": 3.925534297841757,
"learning_rate": 2.5692491481552314e-06,
"loss": 0.7869,
"step": 15300
},
{
"epoch": 2.7139061116031886,
"grad_norm": 5.0603926851389245,
"learning_rate": 2.5630675647825913e-06,
"loss": 0.7616,
"step": 15320
},
{
"epoch": 2.7174490699734277,
"grad_norm": 4.055554853144999,
"learning_rate": 2.5568855955499573e-06,
"loss": 0.7882,
"step": 15340
},
{
"epoch": 2.720992028343667,
"grad_norm": 3.478661550213012,
"learning_rate": 2.5507032782798553e-06,
"loss": 0.7852,
"step": 15360
},
{
"epoch": 2.7245349867139064,
"grad_norm": 2.171548931614822,
"learning_rate": 2.5445206507969395e-06,
"loss": 0.77,
"step": 15380
},
{
"epoch": 2.728077945084145,
"grad_norm": 3.0583258036393604,
"learning_rate": 2.5383377509277648e-06,
"loss": 0.7404,
"step": 15400
},
{
"epoch": 2.7316209034543846,
"grad_norm": 3.1484952352447273,
"learning_rate": 2.5321546165005497e-06,
"loss": 0.7266,
"step": 15420
},
{
"epoch": 2.7351638618246237,
"grad_norm": 2.942970058363834,
"learning_rate": 2.5259712853449503e-06,
"loss": 0.7527,
"step": 15440
},
{
"epoch": 2.738706820194863,
"grad_norm": 2.645281388123703,
"learning_rate": 2.5197877952918243e-06,
"loss": 0.7662,
"step": 15460
},
{
"epoch": 2.742249778565102,
"grad_norm": 3.4746684651389548,
"learning_rate": 2.5136041841730026e-06,
"loss": 0.7628,
"step": 15480
},
{
"epoch": 2.745792736935341,
"grad_norm": 3.7116192950002795,
"learning_rate": 2.5074204898210587e-06,
"loss": 0.7428,
"step": 15500
},
{
"epoch": 2.74933569530558,
"grad_norm": 5.909705442043078,
"learning_rate": 2.50123675006907e-06,
"loss": 0.7538,
"step": 15520
},
{
"epoch": 2.752878653675819,
"grad_norm": 3.8189355746524045,
"learning_rate": 2.4950530027503963e-06,
"loss": 0.7647,
"step": 15540
},
{
"epoch": 2.7564216120460587,
"grad_norm": 3.5256750418834515,
"learning_rate": 2.4888692856984446e-06,
"loss": 0.7332,
"step": 15560
},
{
"epoch": 2.7599645704162974,
"grad_norm": 5.2988278310924715,
"learning_rate": 2.482685636746432e-06,
"loss": 0.7446,
"step": 15580
},
{
"epoch": 2.763507528786537,
"grad_norm": 2.9280069670539146,
"learning_rate": 2.4765020937271615e-06,
"loss": 0.7999,
"step": 15600
},
{
"epoch": 2.767050487156776,
"grad_norm": 3.3339855601700243,
"learning_rate": 2.4703186944727885e-06,
"loss": 0.7421,
"step": 15620
},
{
"epoch": 2.770593445527015,
"grad_norm": 3.1483076854706993,
"learning_rate": 2.464135476814589e-06,
"loss": 0.7523,
"step": 15640
},
{
"epoch": 2.7741364038972542,
"grad_norm": 3.1964072857533687,
"learning_rate": 2.4579524785827254e-06,
"loss": 0.7793,
"step": 15660
},
{
"epoch": 2.7776793622674933,
"grad_norm": 5.327494932320985,
"learning_rate": 2.451769737606021e-06,
"loss": 0.7604,
"step": 15680
},
{
"epoch": 2.7812223206377324,
"grad_norm": 2.0488240092746413,
"learning_rate": 2.4455872917117233e-06,
"loss": 0.7198,
"step": 15700
},
{
"epoch": 2.7847652790079716,
"grad_norm": 3.6254982549079506,
"learning_rate": 2.439405178725274e-06,
"loss": 0.7811,
"step": 15720
},
{
"epoch": 2.7883082373782107,
"grad_norm": 3.813833819224438,
"learning_rate": 2.4332234364700793e-06,
"loss": 0.7857,
"step": 15740
},
{
"epoch": 2.7918511957484498,
"grad_norm": 2.332438247153177,
"learning_rate": 2.427042102767278e-06,
"loss": 0.7741,
"step": 15760
},
{
"epoch": 2.7953941541186893,
"grad_norm": 5.637052434142593,
"learning_rate": 2.4208612154355054e-06,
"loss": 0.7873,
"step": 15780
},
{
"epoch": 2.7989371124889284,
"grad_norm": 3.61371927386971,
"learning_rate": 2.4146808122906685e-06,
"loss": 0.7667,
"step": 15800
},
{
"epoch": 2.8024800708591675,
"grad_norm": 3.081712372431115,
"learning_rate": 2.408500931145713e-06,
"loss": 0.7637,
"step": 15820
},
{
"epoch": 2.8060230292294066,
"grad_norm": 2.8637593806841064,
"learning_rate": 2.4023216098103892e-06,
"loss": 0.7406,
"step": 15840
},
{
"epoch": 2.8095659875996457,
"grad_norm": 4.855719945200848,
"learning_rate": 2.396142886091023e-06,
"loss": 0.762,
"step": 15860
},
{
"epoch": 2.813108945969885,
"grad_norm": 2.3565232263934592,
"learning_rate": 2.389964797790283e-06,
"loss": 0.7539,
"step": 15880
},
{
"epoch": 2.816651904340124,
"grad_norm": 3.4605355857001325,
"learning_rate": 2.383787382706953e-06,
"loss": 0.7435,
"step": 15900
},
{
"epoch": 2.820194862710363,
"grad_norm": 2.2057409877337992,
"learning_rate": 2.377610678635693e-06,
"loss": 0.7737,
"step": 15920
},
{
"epoch": 2.823737821080602,
"grad_norm": 2.136429659045698,
"learning_rate": 2.371434723366818e-06,
"loss": 0.7759,
"step": 15940
},
{
"epoch": 2.8272807794508417,
"grad_norm": 2.9028483092829815,
"learning_rate": 2.3652595546860595e-06,
"loss": 0.7826,
"step": 15960
},
{
"epoch": 2.8308237378210808,
"grad_norm": 3.7595789211806383,
"learning_rate": 2.359085210374335e-06,
"loss": 0.7565,
"step": 15980
},
{
"epoch": 2.83436669619132,
"grad_norm": 3.402275688537832,
"learning_rate": 2.3529117282075207e-06,
"loss": 0.7222,
"step": 16000
},
{
"epoch": 2.83436669619132,
"eval_loss": 0.7888814210891724,
"eval_runtime": 368.71,
"eval_samples_per_second": 25.784,
"eval_steps_per_second": 3.225,
"step": 16000
},
{
"epoch": 2.837909654561559,
"grad_norm": 4.2698147796391455,
"learning_rate": 2.3467391459562163e-06,
"loss": 0.772,
"step": 16020
},
{
"epoch": 2.841452612931798,
"grad_norm": 5.560388355211716,
"learning_rate": 2.340567501385518e-06,
"loss": 0.7719,
"step": 16040
},
{
"epoch": 2.844995571302037,
"grad_norm": 2.294411499429463,
"learning_rate": 2.3343968322547816e-06,
"loss": 0.7737,
"step": 16060
},
{
"epoch": 2.8485385296722763,
"grad_norm": 1.995090359134686,
"learning_rate": 2.3282271763173984e-06,
"loss": 0.7808,
"step": 16080
},
{
"epoch": 2.8520814880425154,
"grad_norm": 2.8221763040945422,
"learning_rate": 2.322058571320559e-06,
"loss": 0.7943,
"step": 16100
},
{
"epoch": 2.8556244464127545,
"grad_norm": 2.8411701267751264,
"learning_rate": 2.315891055005024e-06,
"loss": 0.7458,
"step": 16120
},
{
"epoch": 2.859167404782994,
"grad_norm": 3.841302334883737,
"learning_rate": 2.3097246651048937e-06,
"loss": 0.77,
"step": 16140
},
{
"epoch": 2.8627103631532327,
"grad_norm": 3.890480722348098,
"learning_rate": 2.3035594393473777e-06,
"loss": 0.7384,
"step": 16160
},
{
"epoch": 2.8662533215234722,
"grad_norm": 5.679565106603388,
"learning_rate": 2.297395415452562e-06,
"loss": 0.803,
"step": 16180
},
{
"epoch": 2.8697962798937113,
"grad_norm": 3.337771681609959,
"learning_rate": 2.2912326311331774e-06,
"loss": 0.7028,
"step": 16200
},
{
"epoch": 2.8733392382639504,
"grad_norm": 3.9468562372490443,
"learning_rate": 2.285071124094375e-06,
"loss": 0.778,
"step": 16220
},
{
"epoch": 2.8768821966341895,
"grad_norm": 3.7924647740545203,
"learning_rate": 2.2789109320334885e-06,
"loss": 0.7559,
"step": 16240
},
{
"epoch": 2.8804251550044286,
"grad_norm": 3.3047453094089505,
"learning_rate": 2.2727520926398067e-06,
"loss": 0.7563,
"step": 16260
},
{
"epoch": 2.8839681133746677,
"grad_norm": 4.691213784478345,
"learning_rate": 2.2665946435943425e-06,
"loss": 0.7708,
"step": 16280
},
{
"epoch": 2.887511071744907,
"grad_norm": 5.050030297267503,
"learning_rate": 2.2604386225696035e-06,
"loss": 0.7855,
"step": 16300
},
{
"epoch": 2.8910540301151464,
"grad_norm": 3.297233505976855,
"learning_rate": 2.254284067229359e-06,
"loss": 0.7273,
"step": 16320
},
{
"epoch": 2.894596988485385,
"grad_norm": 3.0043017323996106,
"learning_rate": 2.24813101522841e-06,
"loss": 0.7538,
"step": 16340
},
{
"epoch": 2.8981399468556246,
"grad_norm": 3.7341413882543857,
"learning_rate": 2.2419795042123644e-06,
"loss": 0.7414,
"step": 16360
},
{
"epoch": 2.9016829052258637,
"grad_norm": 4.878090641375587,
"learning_rate": 2.2358295718173966e-06,
"loss": 0.7679,
"step": 16380
},
{
"epoch": 2.905225863596103,
"grad_norm": 4.429314718401584,
"learning_rate": 2.2296812556700245e-06,
"loss": 0.7517,
"step": 16400
},
{
"epoch": 2.908768821966342,
"grad_norm": 2.9862439233870943,
"learning_rate": 2.2235345933868785e-06,
"loss": 0.7818,
"step": 16420
},
{
"epoch": 2.912311780336581,
"grad_norm": 6.698492865099133,
"learning_rate": 2.2173896225744704e-06,
"loss": 0.7695,
"step": 16440
},
{
"epoch": 2.91585473870682,
"grad_norm": 4.153101032253308,
"learning_rate": 2.2112463808289613e-06,
"loss": 0.7296,
"step": 16460
},
{
"epoch": 2.919397697077059,
"grad_norm": 2.779471876140217,
"learning_rate": 2.2051049057359354e-06,
"loss": 0.7283,
"step": 16480
},
{
"epoch": 2.9229406554472988,
"grad_norm": 3.2836392616484984,
"learning_rate": 2.1989652348701683e-06,
"loss": 0.7383,
"step": 16500
},
{
"epoch": 2.9264836138175374,
"grad_norm": 1.895818806571468,
"learning_rate": 2.192827405795395e-06,
"loss": 0.7345,
"step": 16520
},
{
"epoch": 2.930026572187777,
"grad_norm": 5.6943248214395545,
"learning_rate": 2.1866914560640832e-06,
"loss": 0.7717,
"step": 16540
},
{
"epoch": 2.933569530558016,
"grad_norm": 4.513933666928205,
"learning_rate": 2.1805574232172044e-06,
"loss": 0.7773,
"step": 16560
},
{
"epoch": 2.937112488928255,
"grad_norm": 4.620092058556062,
"learning_rate": 2.1744253447839988e-06,
"loss": 0.7592,
"step": 16580
},
{
"epoch": 2.9406554472984943,
"grad_norm": 3.858292964944665,
"learning_rate": 2.16829525828175e-06,
"loss": 0.7854,
"step": 16600
},
{
"epoch": 2.9441984056687334,
"grad_norm": 3.8882587756941174,
"learning_rate": 2.1621672012155552e-06,
"loss": 0.7434,
"step": 16620
},
{
"epoch": 2.9477413640389725,
"grad_norm": 4.499656195479335,
"learning_rate": 2.1560412110780967e-06,
"loss": 0.7695,
"step": 16640
},
{
"epoch": 2.9512843224092116,
"grad_norm": 3.7965733733040494,
"learning_rate": 2.149917325349408e-06,
"loss": 0.7197,
"step": 16660
},
{
"epoch": 2.954827280779451,
"grad_norm": 3.860376338376774,
"learning_rate": 2.143795581496648e-06,
"loss": 0.7403,
"step": 16680
},
{
"epoch": 2.9583702391496898,
"grad_norm": 3.323974522469437,
"learning_rate": 2.1376760169738746e-06,
"loss": 0.7497,
"step": 16700
},
{
"epoch": 2.9619131975199293,
"grad_norm": 2.3281130863968382,
"learning_rate": 2.131558669221806e-06,
"loss": 0.7319,
"step": 16720
},
{
"epoch": 2.9654561558901684,
"grad_norm": 3.4381149445643517,
"learning_rate": 2.125443575667603e-06,
"loss": 0.7817,
"step": 16740
},
{
"epoch": 2.9689991142604075,
"grad_norm": 3.9685288684047815,
"learning_rate": 2.1193307737246336e-06,
"loss": 0.7764,
"step": 16760
},
{
"epoch": 2.9725420726306466,
"grad_norm": 3.396199970398508,
"learning_rate": 2.113220300792243e-06,
"loss": 0.7661,
"step": 16780
},
{
"epoch": 2.9760850310008857,
"grad_norm": 3.6420647109134943,
"learning_rate": 2.10711219425553e-06,
"loss": 0.7035,
"step": 16800
},
{
"epoch": 2.979627989371125,
"grad_norm": 4.626273852138959,
"learning_rate": 2.101006491485112e-06,
"loss": 0.753,
"step": 16820
},
{
"epoch": 2.983170947741364,
"grad_norm": 2.4460574774339654,
"learning_rate": 2.0949032298369035e-06,
"loss": 0.7692,
"step": 16840
},
{
"epoch": 2.986713906111603,
"grad_norm": 3.41069245958657,
"learning_rate": 2.0888024466518804e-06,
"loss": 0.6976,
"step": 16860
},
{
"epoch": 2.990256864481842,
"grad_norm": 3.403936749564734,
"learning_rate": 2.082704179255857e-06,
"loss": 0.7946,
"step": 16880
},
{
"epoch": 2.9937998228520817,
"grad_norm": 4.194199735821774,
"learning_rate": 2.076608464959255e-06,
"loss": 0.7235,
"step": 16900
},
{
"epoch": 2.997342781222321,
"grad_norm": 4.995370798020937,
"learning_rate": 2.0705153410568753e-06,
"loss": 0.7518,
"step": 16920
},
{
"epoch": 3.00088573959256,
"grad_norm": 4.498354371200463,
"learning_rate": 2.0644248448276698e-06,
"loss": 0.6865,
"step": 16940
},
{
"epoch": 3.004428697962799,
"grad_norm": 5.9848761265551484,
"learning_rate": 2.0583370135345157e-06,
"loss": 0.6598,
"step": 16960
},
{
"epoch": 3.007971656333038,
"grad_norm": 3.1703728963133844,
"learning_rate": 2.0522518844239834e-06,
"loss": 0.634,
"step": 16980
},
{
"epoch": 3.011514614703277,
"grad_norm": 4.221579231662142,
"learning_rate": 2.0461694947261127e-06,
"loss": 0.6631,
"step": 17000
},
{
"epoch": 3.0150575730735163,
"grad_norm": 4.154018707851057,
"learning_rate": 2.0400898816541807e-06,
"loss": 0.6633,
"step": 17020
},
{
"epoch": 3.0186005314437554,
"grad_norm": 4.631953700832906,
"learning_rate": 2.034013082404479e-06,
"loss": 0.6674,
"step": 17040
},
{
"epoch": 3.0221434898139945,
"grad_norm": 3.085154700215037,
"learning_rate": 2.0279391341560823e-06,
"loss": 0.6241,
"step": 17060
},
{
"epoch": 3.025686448184234,
"grad_norm": 4.415029344873564,
"learning_rate": 2.0218680740706227e-06,
"loss": 0.6436,
"step": 17080
},
{
"epoch": 3.029229406554473,
"grad_norm": 4.050012850016261,
"learning_rate": 2.0157999392920626e-06,
"loss": 0.6809,
"step": 17100
},
{
"epoch": 3.0327723649247122,
"grad_norm": 3.474053983443441,
"learning_rate": 2.009734766946465e-06,
"loss": 0.6748,
"step": 17120
},
{
"epoch": 3.0363153232949514,
"grad_norm": 5.502084090817595,
"learning_rate": 2.0036725941417695e-06,
"loss": 0.7077,
"step": 17140
},
{
"epoch": 3.0398582816651905,
"grad_norm": 4.7328265250493375,
"learning_rate": 1.997613457967565e-06,
"loss": 0.6685,
"step": 17160
},
{
"epoch": 3.0434012400354296,
"grad_norm": 4.990841825704372,
"learning_rate": 1.991557395494858e-06,
"loss": 0.6576,
"step": 17180
},
{
"epoch": 3.0469441984056687,
"grad_norm": 4.446445857081803,
"learning_rate": 1.9855044437758542e-06,
"loss": 0.6291,
"step": 17200
},
{
"epoch": 3.0504871567759078,
"grad_norm": 4.010559033356023,
"learning_rate": 1.9794546398437233e-06,
"loss": 0.6821,
"step": 17220
},
{
"epoch": 3.054030115146147,
"grad_norm": 5.668917044427614,
"learning_rate": 1.973408020712378e-06,
"loss": 0.6501,
"step": 17240
},
{
"epoch": 3.057573073516386,
"grad_norm": 4.507165538436801,
"learning_rate": 1.967364623376245e-06,
"loss": 0.6634,
"step": 17260
},
{
"epoch": 3.0611160318866255,
"grad_norm": 6.594268797496839,
"learning_rate": 1.9613244848100393e-06,
"loss": 0.6777,
"step": 17280
},
{
"epoch": 3.0646589902568646,
"grad_norm": 5.86179333081565,
"learning_rate": 1.9552876419685404e-06,
"loss": 0.6966,
"step": 17300
},
{
"epoch": 3.0682019486271037,
"grad_norm": 6.897830395606888,
"learning_rate": 1.94925413178636e-06,
"loss": 0.6358,
"step": 17320
},
{
"epoch": 3.071744906997343,
"grad_norm": 3.105114998536212,
"learning_rate": 1.9432239911777234e-06,
"loss": 0.6144,
"step": 17340
},
{
"epoch": 3.075287865367582,
"grad_norm": 4.226605746771805,
"learning_rate": 1.9371972570362386e-06,
"loss": 0.6445,
"step": 17360
},
{
"epoch": 3.078830823737821,
"grad_norm": 3.2064699379152946,
"learning_rate": 1.9311739662346714e-06,
"loss": 0.6295,
"step": 17380
},
{
"epoch": 3.08237378210806,
"grad_norm": 4.509110054894344,
"learning_rate": 1.925154155624723e-06,
"loss": 0.6584,
"step": 17400
},
{
"epoch": 3.0859167404782992,
"grad_norm": 2.7613419445656877,
"learning_rate": 1.9191378620367992e-06,
"loss": 0.6872,
"step": 17420
},
{
"epoch": 3.0894596988485383,
"grad_norm": 5.249702772830893,
"learning_rate": 1.91312512227979e-06,
"loss": 0.659,
"step": 17440
},
{
"epoch": 3.093002657218778,
"grad_norm": 4.594127042773178,
"learning_rate": 1.907115973140841e-06,
"loss": 0.6445,
"step": 17460
},
{
"epoch": 3.096545615589017,
"grad_norm": 3.5475423306782,
"learning_rate": 1.9011104513851306e-06,
"loss": 0.6446,
"step": 17480
},
{
"epoch": 3.100088573959256,
"grad_norm": 6.443218004353607,
"learning_rate": 1.8951085937556447e-06,
"loss": 0.6642,
"step": 17500
},
{
"epoch": 3.103631532329495,
"grad_norm": 4.817902509140939,
"learning_rate": 1.889110436972949e-06,
"loss": 0.6675,
"step": 17520
},
{
"epoch": 3.1071744906997343,
"grad_norm": 3.6588947567775576,
"learning_rate": 1.8831160177349694e-06,
"loss": 0.6011,
"step": 17540
},
{
"epoch": 3.1107174490699734,
"grad_norm": 3.28321156533759,
"learning_rate": 1.8771253727167639e-06,
"loss": 0.6553,
"step": 17560
},
{
"epoch": 3.1142604074402125,
"grad_norm": 5.587131615275665,
"learning_rate": 1.8711385385702973e-06,
"loss": 0.6896,
"step": 17580
},
{
"epoch": 3.1178033658104516,
"grad_norm": 4.969644996204083,
"learning_rate": 1.8651555519242215e-06,
"loss": 0.648,
"step": 17600
},
{
"epoch": 3.1213463241806907,
"grad_norm": 4.564430542899775,
"learning_rate": 1.8591764493836468e-06,
"loss": 0.6673,
"step": 17620
},
{
"epoch": 3.1248892825509302,
"grad_norm": 4.821775727343219,
"learning_rate": 1.8532012675299198e-06,
"loss": 0.6368,
"step": 17640
},
{
"epoch": 3.1284322409211693,
"grad_norm": 3.550472754727026,
"learning_rate": 1.8472300429203998e-06,
"loss": 0.6763,
"step": 17660
},
{
"epoch": 3.1319751992914084,
"grad_norm": 5.387384363073119,
"learning_rate": 1.8412628120882359e-06,
"loss": 0.6228,
"step": 17680
},
{
"epoch": 3.1355181576616475,
"grad_norm": 2.0363557538364527,
"learning_rate": 1.8352996115421417e-06,
"loss": 0.6165,
"step": 17700
},
{
"epoch": 3.1390611160318866,
"grad_norm": 3.302139015386827,
"learning_rate": 1.829340477766172e-06,
"loss": 0.6668,
"step": 17720
},
{
"epoch": 3.1426040744021257,
"grad_norm": 4.799357203523113,
"learning_rate": 1.8233854472195014e-06,
"loss": 0.6657,
"step": 17740
},
{
"epoch": 3.146147032772365,
"grad_norm": 4.447535055541927,
"learning_rate": 1.8174345563361992e-06,
"loss": 0.6814,
"step": 17760
},
{
"epoch": 3.149689991142604,
"grad_norm": 2.19457736885214,
"learning_rate": 1.8114878415250082e-06,
"loss": 0.6682,
"step": 17780
},
{
"epoch": 3.153232949512843,
"grad_norm": 3.0879621968638755,
"learning_rate": 1.8055453391691209e-06,
"loss": 0.6,
"step": 17800
},
{
"epoch": 3.156775907883082,
"grad_norm": 3.9283834956111705,
"learning_rate": 1.7996070856259568e-06,
"loss": 0.6664,
"step": 17820
},
{
"epoch": 3.1603188662533217,
"grad_norm": 5.062334002651309,
"learning_rate": 1.7936731172269414e-06,
"loss": 0.6691,
"step": 17840
},
{
"epoch": 3.163861824623561,
"grad_norm": 3.592859470647672,
"learning_rate": 1.7877434702772807e-06,
"loss": 0.6632,
"step": 17860
},
{
"epoch": 3.1674047829938,
"grad_norm": 3.5217893259822306,
"learning_rate": 1.7818181810557428e-06,
"loss": 0.6588,
"step": 17880
},
{
"epoch": 3.170947741364039,
"grad_norm": 5.792763255978902,
"learning_rate": 1.7758972858144351e-06,
"loss": 0.6843,
"step": 17900
},
{
"epoch": 3.174490699734278,
"grad_norm": 5.619010064865972,
"learning_rate": 1.7699808207785796e-06,
"loss": 0.6304,
"step": 17920
},
{
"epoch": 3.178033658104517,
"grad_norm": 3.233388216079928,
"learning_rate": 1.7640688221462955e-06,
"loss": 0.6481,
"step": 17940
},
{
"epoch": 3.1815766164747563,
"grad_norm": 7.189211258809608,
"learning_rate": 1.7581613260883733e-06,
"loss": 0.6516,
"step": 17960
},
{
"epoch": 3.1851195748449954,
"grad_norm": 4.794500314138224,
"learning_rate": 1.7522583687480587e-06,
"loss": 0.6276,
"step": 17980
},
{
"epoch": 3.1886625332152345,
"grad_norm": 5.414264886735773,
"learning_rate": 1.7463599862408265e-06,
"loss": 0.6461,
"step": 18000
},
{
"epoch": 3.1886625332152345,
"eval_loss": 0.814194917678833,
"eval_runtime": 367.8278,
"eval_samples_per_second": 25.846,
"eval_steps_per_second": 3.232,
"step": 18000
},
{
"epoch": 3.192205491585474,
"grad_norm": 2.678076603574593,
"learning_rate": 1.7404662146541622e-06,
"loss": 0.6586,
"step": 18020
},
{
"epoch": 3.195748449955713,
"grad_norm": 5.435434057895623,
"learning_rate": 1.7345770900473424e-06,
"loss": 0.6378,
"step": 18040
},
{
"epoch": 3.1992914083259523,
"grad_norm": 4.182207392231193,
"learning_rate": 1.7286926484512088e-06,
"loss": 0.6429,
"step": 18060
},
{
"epoch": 3.2028343666961914,
"grad_norm": 6.163291864345331,
"learning_rate": 1.722812925867955e-06,
"loss": 0.6215,
"step": 18080
},
{
"epoch": 3.2063773250664305,
"grad_norm": 4.221207066086723,
"learning_rate": 1.7169379582709018e-06,
"loss": 0.6734,
"step": 18100
},
{
"epoch": 3.2099202834366696,
"grad_norm": 3.6319759767031345,
"learning_rate": 1.711067781604277e-06,
"loss": 0.6688,
"step": 18120
},
{
"epoch": 3.2134632418069087,
"grad_norm": 2.4361609804138986,
"learning_rate": 1.7052024317829986e-06,
"loss": 0.6779,
"step": 18140
},
{
"epoch": 3.217006200177148,
"grad_norm": 3.2229485582444646,
"learning_rate": 1.69934194469245e-06,
"loss": 0.5963,
"step": 18160
},
{
"epoch": 3.220549158547387,
"grad_norm": 4.04542335006295,
"learning_rate": 1.6934863561882664e-06,
"loss": 0.6149,
"step": 18180
},
{
"epoch": 3.2240921169176264,
"grad_norm": 4.224812708749921,
"learning_rate": 1.687635702096111e-06,
"loss": 0.6544,
"step": 18200
},
{
"epoch": 3.2276350752878655,
"grad_norm": 2.5768170370991133,
"learning_rate": 1.681790018211457e-06,
"loss": 0.6455,
"step": 18220
},
{
"epoch": 3.2311780336581046,
"grad_norm": 8.446037248628688,
"learning_rate": 1.6759493402993713e-06,
"loss": 0.6399,
"step": 18240
},
{
"epoch": 3.2347209920283437,
"grad_norm": 4.421664554190382,
"learning_rate": 1.6701137040942884e-06,
"loss": 0.6605,
"step": 18260
},
{
"epoch": 3.238263950398583,
"grad_norm": 3.867362303030101,
"learning_rate": 1.664283145299801e-06,
"loss": 0.6197,
"step": 18280
},
{
"epoch": 3.241806908768822,
"grad_norm": 2.9062273272206975,
"learning_rate": 1.658457699588436e-06,
"loss": 0.6415,
"step": 18300
},
{
"epoch": 3.245349867139061,
"grad_norm": 5.355980117276988,
"learning_rate": 1.6526374026014366e-06,
"loss": 0.6154,
"step": 18320
},
{
"epoch": 3.2488928255093,
"grad_norm": 3.447853280839097,
"learning_rate": 1.6468222899485464e-06,
"loss": 0.6004,
"step": 18340
},
{
"epoch": 3.2524357838795392,
"grad_norm": 3.7093160550377813,
"learning_rate": 1.6410123972077884e-06,
"loss": 0.6604,
"step": 18360
},
{
"epoch": 3.255978742249779,
"grad_norm": 5.03373607670554,
"learning_rate": 1.6352077599252508e-06,
"loss": 0.6942,
"step": 18380
},
{
"epoch": 3.259521700620018,
"grad_norm": 2.7267954903558462,
"learning_rate": 1.6294084136148677e-06,
"loss": 0.6245,
"step": 18400
},
{
"epoch": 3.263064658990257,
"grad_norm": 3.422200189756036,
"learning_rate": 1.6236143937582006e-06,
"loss": 0.6454,
"step": 18420
},
{
"epoch": 3.266607617360496,
"grad_norm": 3.004333600805996,
"learning_rate": 1.6178257358042238e-06,
"loss": 0.6308,
"step": 18440
},
{
"epoch": 3.270150575730735,
"grad_norm": 3.205706761360872,
"learning_rate": 1.6120424751691078e-06,
"loss": 0.7113,
"step": 18460
},
{
"epoch": 3.2736935341009743,
"grad_norm": 4.169462011619208,
"learning_rate": 1.6062646472359967e-06,
"loss": 0.6739,
"step": 18480
},
{
"epoch": 3.2772364924712134,
"grad_norm": 5.474090442872105,
"learning_rate": 1.6004922873548014e-06,
"loss": 0.6459,
"step": 18500
},
{
"epoch": 3.2807794508414525,
"grad_norm": 4.123370814980775,
"learning_rate": 1.594725430841975e-06,
"loss": 0.6329,
"step": 18520
},
{
"epoch": 3.2843224092116916,
"grad_norm": 3.386294588511836,
"learning_rate": 1.5889641129803013e-06,
"loss": 0.6978,
"step": 18540
},
{
"epoch": 3.287865367581931,
"grad_norm": 4.479997130104378,
"learning_rate": 1.5832083690186763e-06,
"loss": 0.6942,
"step": 18560
},
{
"epoch": 3.2914083259521703,
"grad_norm": 3.403229229215997,
"learning_rate": 1.5774582341718952e-06,
"loss": 0.6561,
"step": 18580
},
{
"epoch": 3.2949512843224094,
"grad_norm": 3.3840120900240045,
"learning_rate": 1.571713743620435e-06,
"loss": 0.6464,
"step": 18600
},
{
"epoch": 3.2984942426926485,
"grad_norm": 2.166301971846268,
"learning_rate": 1.5659749325102391e-06,
"loss": 0.6633,
"step": 18620
},
{
"epoch": 3.3020372010628876,
"grad_norm": 4.571502120958036,
"learning_rate": 1.5602418359525029e-06,
"loss": 0.6449,
"step": 18640
},
{
"epoch": 3.3055801594331267,
"grad_norm": 6.850347513041369,
"learning_rate": 1.5545144890234618e-06,
"loss": 0.6375,
"step": 18660
},
{
"epoch": 3.3091231178033658,
"grad_norm": 5.268732271957646,
"learning_rate": 1.5487929267641688e-06,
"loss": 0.6387,
"step": 18680
},
{
"epoch": 3.312666076173605,
"grad_norm": 6.854691620973651,
"learning_rate": 1.5430771841802894e-06,
"loss": 0.6792,
"step": 18700
},
{
"epoch": 3.316209034543844,
"grad_norm": 5.772199960663563,
"learning_rate": 1.537367296241881e-06,
"loss": 0.5957,
"step": 18720
},
{
"epoch": 3.319751992914083,
"grad_norm": 4.543759004099246,
"learning_rate": 1.531663297883183e-06,
"loss": 0.6704,
"step": 18740
},
{
"epoch": 3.323294951284322,
"grad_norm": 4.253141544006728,
"learning_rate": 1.525965224002398e-06,
"loss": 0.6591,
"step": 18760
},
{
"epoch": 3.3268379096545617,
"grad_norm": 4.030607569941474,
"learning_rate": 1.5202731094614848e-06,
"loss": 0.6153,
"step": 18780
},
{
"epoch": 3.330380868024801,
"grad_norm": 4.541306419220621,
"learning_rate": 1.5145869890859404e-06,
"loss": 0.6801,
"step": 18800
},
{
"epoch": 3.33392382639504,
"grad_norm": 4.860276758268095,
"learning_rate": 1.5089068976645876e-06,
"loss": 0.6129,
"step": 18820
},
{
"epoch": 3.337466784765279,
"grad_norm": 3.7898282834137875,
"learning_rate": 1.503232869949364e-06,
"loss": 0.647,
"step": 18840
},
{
"epoch": 3.341009743135518,
"grad_norm": 4.475193399989839,
"learning_rate": 1.4975649406551081e-06,
"loss": 0.6015,
"step": 18860
},
{
"epoch": 3.3445527015057572,
"grad_norm": 3.824693434860729,
"learning_rate": 1.4919031444593458e-06,
"loss": 0.6672,
"step": 18880
},
{
"epoch": 3.3480956598759963,
"grad_norm": 3.5036219590743447,
"learning_rate": 1.4862475160020806e-06,
"loss": 0.6771,
"step": 18900
},
{
"epoch": 3.3516386182462354,
"grad_norm": 4.1272200518774325,
"learning_rate": 1.48059808988558e-06,
"loss": 0.6757,
"step": 18920
},
{
"epoch": 3.3551815766164745,
"grad_norm": 3.5389364787677957,
"learning_rate": 1.4749549006741655e-06,
"loss": 0.7042,
"step": 18940
},
{
"epoch": 3.358724534986714,
"grad_norm": 2.5975704276819,
"learning_rate": 1.4693179828939985e-06,
"loss": 0.6987,
"step": 18960
},
{
"epoch": 3.362267493356953,
"grad_norm": 3.8727633599446794,
"learning_rate": 1.463687371032871e-06,
"loss": 0.6685,
"step": 18980
},
{
"epoch": 3.3658104517271923,
"grad_norm": 3.803175388434909,
"learning_rate": 1.4580630995399949e-06,
"loss": 0.6214,
"step": 19000
},
{
"epoch": 3.3693534100974314,
"grad_norm": 3.1226214687691445,
"learning_rate": 1.4524452028257884e-06,
"loss": 0.6516,
"step": 19020
},
{
"epoch": 3.3728963684676705,
"grad_norm": 3.9984001662113986,
"learning_rate": 1.4468337152616712e-06,
"loss": 0.6686,
"step": 19040
},
{
"epoch": 3.3764393268379096,
"grad_norm": 5.2905284840587985,
"learning_rate": 1.4412286711798473e-06,
"loss": 0.643,
"step": 19060
},
{
"epoch": 3.3799822852081487,
"grad_norm": 3.996754961897811,
"learning_rate": 1.4356301048730987e-06,
"loss": 0.6707,
"step": 19080
},
{
"epoch": 3.383525243578388,
"grad_norm": 2.7287259969894757,
"learning_rate": 1.4300380505945754e-06,
"loss": 0.647,
"step": 19100
},
{
"epoch": 3.387068201948627,
"grad_norm": 4.627474263327022,
"learning_rate": 1.4244525425575862e-06,
"loss": 0.6579,
"step": 19120
},
{
"epoch": 3.3906111603188664,
"grad_norm": 6.3578848860264054,
"learning_rate": 1.418873614935387e-06,
"loss": 0.6214,
"step": 19140
},
{
"epoch": 3.3941541186891055,
"grad_norm": 2.5409208272433292,
"learning_rate": 1.4133013018609762e-06,
"loss": 0.6916,
"step": 19160
},
{
"epoch": 3.3976970770593447,
"grad_norm": 5.73185853916136,
"learning_rate": 1.4077356374268808e-06,
"loss": 0.639,
"step": 19180
},
{
"epoch": 3.4012400354295838,
"grad_norm": 5.547561124073059,
"learning_rate": 1.4021766556849492e-06,
"loss": 0.6472,
"step": 19200
},
{
"epoch": 3.404782993799823,
"grad_norm": 6.154343627624317,
"learning_rate": 1.3966243906461477e-06,
"loss": 0.632,
"step": 19220
},
{
"epoch": 3.408325952170062,
"grad_norm": 3.352883710099438,
"learning_rate": 1.3910788762803448e-06,
"loss": 0.6399,
"step": 19240
},
{
"epoch": 3.411868910540301,
"grad_norm": 5.151354471677131,
"learning_rate": 1.3855401465161072e-06,
"loss": 0.6439,
"step": 19260
},
{
"epoch": 3.41541186891054,
"grad_norm": 3.533259346429167,
"learning_rate": 1.3800082352404964e-06,
"loss": 0.7011,
"step": 19280
},
{
"epoch": 3.4189548272807793,
"grad_norm": 2.422586203772127,
"learning_rate": 1.3744831762988492e-06,
"loss": 0.6802,
"step": 19300
},
{
"epoch": 3.422497785651019,
"grad_norm": 2.0987507209228773,
"learning_rate": 1.368965003494586e-06,
"loss": 0.653,
"step": 19320
},
{
"epoch": 3.426040744021258,
"grad_norm": 5.860157027237845,
"learning_rate": 1.3634537505889927e-06,
"loss": 0.6517,
"step": 19340
},
{
"epoch": 3.429583702391497,
"grad_norm": 4.150793117319128,
"learning_rate": 1.3579494513010178e-06,
"loss": 0.6702,
"step": 19360
},
{
"epoch": 3.433126660761736,
"grad_norm": 2.376892166127197,
"learning_rate": 1.352452139307068e-06,
"loss": 0.6578,
"step": 19380
},
{
"epoch": 3.436669619131975,
"grad_norm": 3.5677324284202974,
"learning_rate": 1.3469618482407993e-06,
"loss": 0.6466,
"step": 19400
},
{
"epoch": 3.4402125775022143,
"grad_norm": 4.23134121034947,
"learning_rate": 1.3414786116929102e-06,
"loss": 0.6529,
"step": 19420
},
{
"epoch": 3.4437555358724534,
"grad_norm": 5.745078327509347,
"learning_rate": 1.3360024632109431e-06,
"loss": 0.6484,
"step": 19440
},
{
"epoch": 3.4472984942426925,
"grad_norm": 3.6442103656861864,
"learning_rate": 1.3305334362990697e-06,
"loss": 0.6669,
"step": 19460
},
{
"epoch": 3.4508414526129316,
"grad_norm": 36.732566773014995,
"learning_rate": 1.3250715644178926e-06,
"loss": 0.6526,
"step": 19480
},
{
"epoch": 3.454384410983171,
"grad_norm": 6.375922451901222,
"learning_rate": 1.3196168809842384e-06,
"loss": 0.6773,
"step": 19500
},
{
"epoch": 3.4579273693534103,
"grad_norm": 4.596315770556884,
"learning_rate": 1.314169419370952e-06,
"loss": 0.6634,
"step": 19520
},
{
"epoch": 3.4614703277236494,
"grad_norm": 5.605691895671033,
"learning_rate": 1.3087292129066947e-06,
"loss": 0.6925,
"step": 19540
},
{
"epoch": 3.4650132860938885,
"grad_norm": 4.310812747641876,
"learning_rate": 1.3032962948757406e-06,
"loss": 0.6323,
"step": 19560
},
{
"epoch": 3.4685562444641276,
"grad_norm": 2.539953889588533,
"learning_rate": 1.2978706985177702e-06,
"loss": 0.6603,
"step": 19580
},
{
"epoch": 3.4720992028343667,
"grad_norm": 4.700922466636149,
"learning_rate": 1.2924524570276676e-06,
"loss": 0.6387,
"step": 19600
},
{
"epoch": 3.475642161204606,
"grad_norm": 3.386892338712282,
"learning_rate": 1.2870416035553213e-06,
"loss": 0.665,
"step": 19620
},
{
"epoch": 3.479185119574845,
"grad_norm": 4.047488784454614,
"learning_rate": 1.2816381712054157e-06,
"loss": 0.6442,
"step": 19640
},
{
"epoch": 3.482728077945084,
"grad_norm": 4.456956526186442,
"learning_rate": 1.2762421930372318e-06,
"loss": 0.637,
"step": 19660
},
{
"epoch": 3.4862710363153235,
"grad_norm": 4.141098829330102,
"learning_rate": 1.2708537020644465e-06,
"loss": 0.6384,
"step": 19680
},
{
"epoch": 3.4898139946855626,
"grad_norm": 3.179545115166026,
"learning_rate": 1.265472731254926e-06,
"loss": 0.6259,
"step": 19700
},
{
"epoch": 3.4933569530558017,
"grad_norm": 3.085506510833184,
"learning_rate": 1.2600993135305278e-06,
"loss": 0.6297,
"step": 19720
},
{
"epoch": 3.496899911426041,
"grad_norm": 4.667399574209881,
"learning_rate": 1.254733481766898e-06,
"loss": 0.6576,
"step": 19740
},
{
"epoch": 3.50044286979628,
"grad_norm": 4.264096999147888,
"learning_rate": 1.2493752687932687e-06,
"loss": 0.6778,
"step": 19760
},
{
"epoch": 3.503985828166519,
"grad_norm": 2.0289989732438936,
"learning_rate": 1.2440247073922627e-06,
"loss": 0.6264,
"step": 19780
},
{
"epoch": 3.507528786536758,
"grad_norm": 2.316066403465445,
"learning_rate": 1.2386818302996847e-06,
"loss": 0.6594,
"step": 19800
},
{
"epoch": 3.5110717449069972,
"grad_norm": 4.626840288617283,
"learning_rate": 1.233346670204327e-06,
"loss": 0.691,
"step": 19820
},
{
"epoch": 3.5146147032772364,
"grad_norm": 3.3340384188287193,
"learning_rate": 1.228019259747769e-06,
"loss": 0.6249,
"step": 19840
},
{
"epoch": 3.518157661647476,
"grad_norm": 3.501104395294738,
"learning_rate": 1.2226996315241743e-06,
"loss": 0.6646,
"step": 19860
},
{
"epoch": 3.5217006200177146,
"grad_norm": 1.7915799562011805,
"learning_rate": 1.217387818080093e-06,
"loss": 0.6616,
"step": 19880
},
{
"epoch": 3.525243578387954,
"grad_norm": 4.085420259424753,
"learning_rate": 1.2120838519142664e-06,
"loss": 0.6475,
"step": 19900
},
{
"epoch": 3.528786536758193,
"grad_norm": 2.889218336443782,
"learning_rate": 1.2067877654774195e-06,
"loss": 0.6577,
"step": 19920
},
{
"epoch": 3.5323294951284323,
"grad_norm": 3.4338184164603773,
"learning_rate": 1.20149959117207e-06,
"loss": 0.6706,
"step": 19940
},
{
"epoch": 3.5358724534986714,
"grad_norm": 5.515577025317211,
"learning_rate": 1.196219361352329e-06,
"loss": 0.646,
"step": 19960
},
{
"epoch": 3.5394154118689105,
"grad_norm": 5.178906055641794,
"learning_rate": 1.1909471083236999e-06,
"loss": 0.6457,
"step": 19980
},
{
"epoch": 3.5429583702391496,
"grad_norm": 2.503351819448841,
"learning_rate": 1.1856828643428813e-06,
"loss": 0.644,
"step": 20000
},
{
"epoch": 3.5429583702391496,
"eval_loss": 0.8062734603881836,
"eval_runtime": 374.2695,
"eval_samples_per_second": 25.401,
"eval_steps_per_second": 3.177,
"step": 20000
},
{
"epoch": 3.5465013286093887,
"grad_norm": 4.448169389650755,
"learning_rate": 1.1804266616175747e-06,
"loss": 0.6384,
"step": 20020
},
{
"epoch": 3.5500442869796283,
"grad_norm": 7.501920429960917,
"learning_rate": 1.17517853230628e-06,
"loss": 0.6745,
"step": 20040
},
{
"epoch": 3.553587245349867,
"grad_norm": 4.264187361101836,
"learning_rate": 1.169938508518103e-06,
"loss": 0.6495,
"step": 20060
},
{
"epoch": 3.5571302037201065,
"grad_norm": 4.121288134877933,
"learning_rate": 1.1647066223125606e-06,
"loss": 0.6297,
"step": 20080
},
{
"epoch": 3.5606731620903456,
"grad_norm": 4.279937646373024,
"learning_rate": 1.1594829056993794e-06,
"loss": 0.6421,
"step": 20100
},
{
"epoch": 3.5642161204605847,
"grad_norm": 2.387088275176141,
"learning_rate": 1.1542673906383045e-06,
"loss": 0.6768,
"step": 20120
},
{
"epoch": 3.5677590788308238,
"grad_norm": 4.372760494789765,
"learning_rate": 1.1490601090389014e-06,
"loss": 0.6512,
"step": 20140
},
{
"epoch": 3.571302037201063,
"grad_norm": 3.3326241965778425,
"learning_rate": 1.1438610927603614e-06,
"loss": 0.6615,
"step": 20160
},
{
"epoch": 3.574844995571302,
"grad_norm": 3.04200662462561,
"learning_rate": 1.1386703736113092e-06,
"loss": 0.6343,
"step": 20180
},
{
"epoch": 3.578387953941541,
"grad_norm": 3.4142428638907254,
"learning_rate": 1.1334879833496033e-06,
"loss": 0.6929,
"step": 20200
},
{
"epoch": 3.58193091231178,
"grad_norm": 2.3882204523624484,
"learning_rate": 1.1283139536821446e-06,
"loss": 0.6017,
"step": 20220
},
{
"epoch": 3.5854738706820193,
"grad_norm": 2.733962119410601,
"learning_rate": 1.1231483162646851e-06,
"loss": 0.6503,
"step": 20240
},
{
"epoch": 3.589016829052259,
"grad_norm": 3.7061102390832383,
"learning_rate": 1.1179911027016277e-06,
"loss": 0.6049,
"step": 20260
},
{
"epoch": 3.592559787422498,
"grad_norm": 4.725193440295439,
"learning_rate": 1.1128423445458378e-06,
"loss": 0.6488,
"step": 20280
},
{
"epoch": 3.596102745792737,
"grad_norm": 6.502036568180023,
"learning_rate": 1.1077020732984508e-06,
"loss": 0.635,
"step": 20300
},
{
"epoch": 3.599645704162976,
"grad_norm": 3.9545813460025676,
"learning_rate": 1.1025703204086758e-06,
"loss": 0.679,
"step": 20320
},
{
"epoch": 3.6031886625332152,
"grad_norm": 3.6704467908903013,
"learning_rate": 1.097447117273602e-06,
"loss": 0.6222,
"step": 20340
},
{
"epoch": 3.6067316209034543,
"grad_norm": 3.6721020929579655,
"learning_rate": 1.0923324952380158e-06,
"loss": 0.6313,
"step": 20360
},
{
"epoch": 3.6102745792736934,
"grad_norm": 2.6332813999535247,
"learning_rate": 1.0872264855941974e-06,
"loss": 0.6067,
"step": 20380
},
{
"epoch": 3.6138175376439325,
"grad_norm": 6.199873637972493,
"learning_rate": 1.0821291195817368e-06,
"loss": 0.6525,
"step": 20400
},
{
"epoch": 3.6173604960141716,
"grad_norm": 5.810381931293109,
"learning_rate": 1.077040428387341e-06,
"loss": 0.6836,
"step": 20420
},
{
"epoch": 3.620903454384411,
"grad_norm": 3.782224795545236,
"learning_rate": 1.0719604431446424e-06,
"loss": 0.6494,
"step": 20440
},
{
"epoch": 3.6244464127546503,
"grad_norm": 3.813897952858666,
"learning_rate": 1.0668891949340066e-06,
"loss": 0.6666,
"step": 20460
},
{
"epoch": 3.6279893711248894,
"grad_norm": 3.689851683344868,
"learning_rate": 1.061826714782348e-06,
"loss": 0.665,
"step": 20480
},
{
"epoch": 3.6315323294951285,
"grad_norm": 2.9395205953490975,
"learning_rate": 1.0567730336629332e-06,
"loss": 0.6364,
"step": 20500
},
{
"epoch": 3.6350752878653676,
"grad_norm": 3.6265579695009444,
"learning_rate": 1.0517281824951958e-06,
"loss": 0.6308,
"step": 20520
},
{
"epoch": 3.6386182462356067,
"grad_norm": 3.827147896558186,
"learning_rate": 1.0466921921445455e-06,
"loss": 0.6372,
"step": 20540
},
{
"epoch": 3.642161204605846,
"grad_norm": 4.1574203155116445,
"learning_rate": 1.0416650934221797e-06,
"loss": 0.6439,
"step": 20560
},
{
"epoch": 3.645704162976085,
"grad_norm": 4.4390839250431515,
"learning_rate": 1.0366469170848966e-06,
"loss": 0.6009,
"step": 20580
},
{
"epoch": 3.649247121346324,
"grad_norm": 5.0129876566121165,
"learning_rate": 1.0316376938349037e-06,
"loss": 0.692,
"step": 20600
},
{
"epoch": 3.6527900797165636,
"grad_norm": 3.704661473447835,
"learning_rate": 1.0266374543196312e-06,
"loss": 0.6231,
"step": 20620
},
{
"epoch": 3.656333038086802,
"grad_norm": 3.8011769756171954,
"learning_rate": 1.021646229131548e-06,
"loss": 0.6669,
"step": 20640
},
{
"epoch": 3.6598759964570418,
"grad_norm": 4.1871605406787875,
"learning_rate": 1.0166640488079682e-06,
"loss": 0.6749,
"step": 20660
},
{
"epoch": 3.663418954827281,
"grad_norm": 4.393069433206878,
"learning_rate": 1.0116909438308689e-06,
"loss": 0.6444,
"step": 20680
},
{
"epoch": 3.66696191319752,
"grad_norm": 3.8353595768451725,
"learning_rate": 1.006726944626704e-06,
"loss": 0.6717,
"step": 20700
},
{
"epoch": 3.670504871567759,
"grad_norm": 5.904031353317382,
"learning_rate": 1.0017720815662137e-06,
"loss": 0.634,
"step": 20720
},
{
"epoch": 3.674047829937998,
"grad_norm": 4.827433370442803,
"learning_rate": 9.968263849642434e-07,
"loss": 0.6189,
"step": 20740
},
{
"epoch": 3.6775907883082373,
"grad_norm": 5.58653647794514,
"learning_rate": 9.91889885079555e-07,
"loss": 0.6557,
"step": 20760
},
{
"epoch": 3.6811337466784764,
"grad_norm": 6.026946025657904,
"learning_rate": 9.869626121146442e-07,
"loss": 0.6468,
"step": 20780
},
{
"epoch": 3.684676705048716,
"grad_norm": 2.8168530647204384,
"learning_rate": 9.820445962155526e-07,
"loss": 0.6782,
"step": 20800
},
{
"epoch": 3.6882196634189546,
"grad_norm": 5.029755822261837,
"learning_rate": 9.771358674716886e-07,
"loss": 0.6575,
"step": 20820
},
{
"epoch": 3.691762621789194,
"grad_norm": 3.0952751296440826,
"learning_rate": 9.722364559156373e-07,
"loss": 0.6524,
"step": 20840
},
{
"epoch": 3.6953055801594332,
"grad_norm": 3.5997851014244033,
"learning_rate": 9.673463915229786e-07,
"loss": 0.672,
"step": 20860
},
{
"epoch": 3.6988485385296723,
"grad_norm": 2.1873288789656913,
"learning_rate": 9.62465704212108e-07,
"loss": 0.7021,
"step": 20880
},
{
"epoch": 3.7023914968999114,
"grad_norm": 3.8549619707969636,
"learning_rate": 9.575944238440473e-07,
"loss": 0.6788,
"step": 20900
},
{
"epoch": 3.7059344552701505,
"grad_norm": 2.517253170655585,
"learning_rate": 9.527325802222651e-07,
"loss": 0.6652,
"step": 20920
},
{
"epoch": 3.7094774136403896,
"grad_norm": 4.344342864101945,
"learning_rate": 9.478802030924964e-07,
"loss": 0.66,
"step": 20940
},
{
"epoch": 3.7130203720106287,
"grad_norm": 5.501462412298871,
"learning_rate": 9.430373221425534e-07,
"loss": 0.6083,
"step": 20960
},
{
"epoch": 3.7165633303808683,
"grad_norm": 5.05343578594501,
"learning_rate": 9.382039670021548e-07,
"loss": 0.6454,
"step": 20980
},
{
"epoch": 3.720106288751107,
"grad_norm": 3.501826208404435,
"learning_rate": 9.333801672427339e-07,
"loss": 0.6739,
"step": 21000
},
{
"epoch": 3.7236492471213465,
"grad_norm": 2.7034068335281836,
"learning_rate": 9.285659523772636e-07,
"loss": 0.6527,
"step": 21020
},
{
"epoch": 3.7271922054915856,
"grad_norm": 5.720702183074802,
"learning_rate": 9.237613518600763e-07,
"loss": 0.6369,
"step": 21040
},
{
"epoch": 3.7307351638618247,
"grad_norm": 3.1813749076707207,
"learning_rate": 9.189663950866795e-07,
"loss": 0.6318,
"step": 21060
},
{
"epoch": 3.734278122232064,
"grad_norm": 6.860403932855227,
"learning_rate": 9.141811113935786e-07,
"loss": 0.6501,
"step": 21080
},
{
"epoch": 3.737821080602303,
"grad_norm": 5.995799501979168,
"learning_rate": 9.094055300580992e-07,
"loss": 0.686,
"step": 21100
},
{
"epoch": 3.741364038972542,
"grad_norm": 3.980336679764352,
"learning_rate": 9.046396802982041e-07,
"loss": 0.6047,
"step": 21120
},
{
"epoch": 3.744906997342781,
"grad_norm": 6.358653317261383,
"learning_rate": 8.998835912723162e-07,
"loss": 0.6597,
"step": 21140
},
{
"epoch": 3.7484499557130206,
"grad_norm": 5.811283042903301,
"learning_rate": 8.951372920791412e-07,
"loss": 0.6643,
"step": 21160
},
{
"epoch": 3.7519929140832593,
"grad_norm": 3.162264075637326,
"learning_rate": 8.904008117574886e-07,
"loss": 0.6319,
"step": 21180
},
{
"epoch": 3.755535872453499,
"grad_norm": 4.2075636016893805,
"learning_rate": 8.856741792860923e-07,
"loss": 0.6902,
"step": 21200
},
{
"epoch": 3.759078830823738,
"grad_norm": 4.321253377690551,
"learning_rate": 8.80957423583439e-07,
"loss": 0.7127,
"step": 21220
},
{
"epoch": 3.762621789193977,
"grad_norm": 4.271237705673551,
"learning_rate": 8.762505735075833e-07,
"loss": 0.6617,
"step": 21240
},
{
"epoch": 3.766164747564216,
"grad_norm": 3.0868138114437866,
"learning_rate": 8.715536578559763e-07,
"loss": 0.6178,
"step": 21260
},
{
"epoch": 3.7697077059344553,
"grad_norm": 5.628060531285577,
"learning_rate": 8.668667053652907e-07,
"loss": 0.6439,
"step": 21280
},
{
"epoch": 3.7732506643046944,
"grad_norm": 2.4998507467015245,
"learning_rate": 8.621897447112395e-07,
"loss": 0.6257,
"step": 21300
},
{
"epoch": 3.7767936226749335,
"grad_norm": 3.5267634187350874,
"learning_rate": 8.575228045084044e-07,
"loss": 0.6537,
"step": 21320
},
{
"epoch": 3.7803365810451726,
"grad_norm": 5.53390702998575,
"learning_rate": 8.528659133100616e-07,
"loss": 0.6343,
"step": 21340
},
{
"epoch": 3.7838795394154117,
"grad_norm": 5.193506072777215,
"learning_rate": 8.482190996080042e-07,
"loss": 0.6457,
"step": 21360
},
{
"epoch": 3.787422497785651,
"grad_norm": 3.306608753288234,
"learning_rate": 8.435823918323682e-07,
"loss": 0.674,
"step": 21380
},
{
"epoch": 3.7909654561558903,
"grad_norm": 5.818341238525181,
"learning_rate": 8.389558183514615e-07,
"loss": 0.6551,
"step": 21400
},
{
"epoch": 3.7945084145261294,
"grad_norm": 1.869734323352245,
"learning_rate": 8.34339407471586e-07,
"loss": 0.6328,
"step": 21420
},
{
"epoch": 3.7980513728963685,
"grad_norm": 3.061312982527864,
"learning_rate": 8.297331874368702e-07,
"loss": 0.6127,
"step": 21440
},
{
"epoch": 3.8015943312666076,
"grad_norm": 2.810183936591826,
"learning_rate": 8.2513718642909e-07,
"loss": 0.6226,
"step": 21460
},
{
"epoch": 3.8051372896368467,
"grad_norm": 3.6433306189517127,
"learning_rate": 8.205514325674993e-07,
"loss": 0.6773,
"step": 21480
},
{
"epoch": 3.808680248007086,
"grad_norm": 6.17382443818236,
"learning_rate": 8.159759539086603e-07,
"loss": 0.6604,
"step": 21500
},
{
"epoch": 3.812223206377325,
"grad_norm": 3.3939319572629563,
"learning_rate": 8.114107784462677e-07,
"loss": 0.6187,
"step": 21520
},
{
"epoch": 3.815766164747564,
"grad_norm": 5.234765372841271,
"learning_rate": 8.068559341109791e-07,
"loss": 0.6466,
"step": 21540
},
{
"epoch": 3.8193091231178036,
"grad_norm": 3.1266699022361357,
"learning_rate": 8.023114487702446e-07,
"loss": 0.6708,
"step": 21560
},
{
"epoch": 3.8228520814880427,
"grad_norm": 4.632817513631642,
"learning_rate": 7.977773502281355e-07,
"loss": 0.6564,
"step": 21580
},
{
"epoch": 3.8263950398582818,
"grad_norm": 5.332789338782664,
"learning_rate": 7.932536662251747e-07,
"loss": 0.6521,
"step": 21600
},
{
"epoch": 3.829937998228521,
"grad_norm": 4.360391036257879,
"learning_rate": 7.887404244381683e-07,
"loss": 0.6484,
"step": 21620
},
{
"epoch": 3.83348095659876,
"grad_norm": 3.444603203101798,
"learning_rate": 7.84237652480033e-07,
"loss": 0.6651,
"step": 21640
},
{
"epoch": 3.837023914968999,
"grad_norm": 4.102682609294379,
"learning_rate": 7.797453778996284e-07,
"loss": 0.6597,
"step": 21660
},
{
"epoch": 3.840566873339238,
"grad_norm": 3.7772046806599917,
"learning_rate": 7.752636281815923e-07,
"loss": 0.669,
"step": 21680
},
{
"epoch": 3.8441098317094773,
"grad_norm": 4.059355924000635,
"learning_rate": 7.707924307461664e-07,
"loss": 0.6333,
"step": 21700
},
{
"epoch": 3.8476527900797164,
"grad_norm": 5.671547408240144,
"learning_rate": 7.663318129490313e-07,
"loss": 0.6299,
"step": 21720
},
{
"epoch": 3.851195748449956,
"grad_norm": 3.8675019069565595,
"learning_rate": 7.61881802081142e-07,
"loss": 0.6915,
"step": 21740
},
{
"epoch": 3.8547387068201946,
"grad_norm": 4.07995418510124,
"learning_rate": 7.57442425368555e-07,
"loss": 0.6051,
"step": 21760
},
{
"epoch": 3.858281665190434,
"grad_norm": 4.316690207012028,
"learning_rate": 7.53013709972267e-07,
"loss": 0.6172,
"step": 21780
},
{
"epoch": 3.8618246235606732,
"grad_norm": 2.973078463900836,
"learning_rate": 7.485956829880455e-07,
"loss": 0.6679,
"step": 21800
},
{
"epoch": 3.8653675819309123,
"grad_norm": 5.5663765632065525,
"learning_rate": 7.441883714462641e-07,
"loss": 0.6259,
"step": 21820
},
{
"epoch": 3.8689105403011514,
"grad_norm": 4.08577312555125,
"learning_rate": 7.397918023117389e-07,
"loss": 0.6318,
"step": 21840
},
{
"epoch": 3.8724534986713905,
"grad_norm": 4.203797767658477,
"learning_rate": 7.354060024835599e-07,
"loss": 0.6391,
"step": 21860
},
{
"epoch": 3.8759964570416297,
"grad_norm": 2.811964097058608,
"learning_rate": 7.310309987949294e-07,
"loss": 0.6946,
"step": 21880
},
{
"epoch": 3.8795394154118688,
"grad_norm": 3.853792141294366,
"learning_rate": 7.266668180129946e-07,
"loss": 0.6468,
"step": 21900
},
{
"epoch": 3.8830823737821083,
"grad_norm": 5.804907803894153,
"learning_rate": 7.223134868386903e-07,
"loss": 0.6124,
"step": 21920
},
{
"epoch": 3.886625332152347,
"grad_norm": 5.778097135701537,
"learning_rate": 7.179710319065672e-07,
"loss": 0.7053,
"step": 21940
},
{
"epoch": 3.8901682905225865,
"grad_norm": 3.6866863372568712,
"learning_rate": 7.136394797846338e-07,
"loss": 0.6541,
"step": 21960
},
{
"epoch": 3.8937112488928256,
"grad_norm": 3.29536698985515,
"learning_rate": 7.093188569741962e-07,
"loss": 0.6287,
"step": 21980
},
{
"epoch": 3.8972542072630647,
"grad_norm": 4.9327123681203275,
"learning_rate": 7.050091899096869e-07,
"loss": 0.6666,
"step": 22000
},
{
"epoch": 3.8972542072630647,
"eval_loss": 0.8004346489906311,
"eval_runtime": 378.4036,
"eval_samples_per_second": 25.124,
"eval_steps_per_second": 3.142,
"step": 22000
}
],
"logging_steps": 20,
"max_steps": 28225,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 2000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 2905457032298496.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}