{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 51.01892063492063,
"eval_steps": 500,
"global_step": 15700,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0015873015873015873,
"grad_norm": 9.738715171813965,
"learning_rate": 1.5862944162436548e-07,
"loss": 5.3155,
"step": 25
},
{
"epoch": 0.0031746031746031746,
"grad_norm": 32.81618881225586,
"learning_rate": 3.1725888324873095e-07,
"loss": 5.0547,
"step": 50
},
{
"epoch": 0.004761904761904762,
"grad_norm": 32.67608642578125,
"learning_rate": 4.7588832487309643e-07,
"loss": 5.1135,
"step": 75
},
{
"epoch": 0.006349206349206349,
"grad_norm": 10.283950805664062,
"learning_rate": 6.345177664974619e-07,
"loss": 4.9131,
"step": 100
},
{
"epoch": 0.007936507936507936,
"grad_norm": 21.210186004638672,
"learning_rate": 7.931472081218275e-07,
"loss": 4.9171,
"step": 125
},
{
"epoch": 0.009523809523809525,
"grad_norm": 48.682411193847656,
"learning_rate": 9.517766497461929e-07,
"loss": 4.993,
"step": 150
},
{
"epoch": 0.011111111111111112,
"grad_norm": 12.522156715393066,
"learning_rate": 1.1104060913705584e-06,
"loss": 4.966,
"step": 175
},
{
"epoch": 0.012698412698412698,
"grad_norm": 32.584293365478516,
"learning_rate": 1.2690355329949238e-06,
"loss": 4.7858,
"step": 200
},
{
"epoch": 0.014285714285714285,
"grad_norm": 22.508533477783203,
"learning_rate": 1.4276649746192894e-06,
"loss": 4.713,
"step": 225
},
{
"epoch": 0.015873015873015872,
"grad_norm": 10.486129760742188,
"learning_rate": 1.586294416243655e-06,
"loss": 4.7809,
"step": 250
},
{
"epoch": 0.01746031746031746,
"grad_norm": 10.104169845581055,
"learning_rate": 1.7449238578680206e-06,
"loss": 4.8488,
"step": 275
},
{
"epoch": 0.01904761904761905,
"grad_norm": 20.557931900024414,
"learning_rate": 1.9035532994923857e-06,
"loss": 4.5276,
"step": 300
},
{
"epoch": 1.0014603174603174,
"grad_norm": 14.145257949829102,
"learning_rate": 2.0621827411167515e-06,
"loss": 4.641,
"step": 325
},
{
"epoch": 1.003047619047619,
"grad_norm": 11.256460189819336,
"learning_rate": 2.220812182741117e-06,
"loss": 4.4173,
"step": 350
},
{
"epoch": 1.0046349206349205,
"grad_norm": 13.009560585021973,
"learning_rate": 2.3794416243654827e-06,
"loss": 4.2029,
"step": 375
},
{
"epoch": 1.0062222222222221,
"grad_norm": 10.433947563171387,
"learning_rate": 2.5380710659898476e-06,
"loss": 4.0364,
"step": 400
},
{
"epoch": 1.0078095238095237,
"grad_norm": 7.473550319671631,
"learning_rate": 2.6967005076142134e-06,
"loss": 3.8754,
"step": 425
},
{
"epoch": 1.0093968253968253,
"grad_norm": 12.957052230834961,
"learning_rate": 2.855329949238579e-06,
"loss": 4.0458,
"step": 450
},
{
"epoch": 1.0109841269841269,
"grad_norm": 9.960160255432129,
"learning_rate": 3.0139593908629446e-06,
"loss": 3.7578,
"step": 475
},
{
"epoch": 1.0125714285714287,
"grad_norm": 12.287276268005371,
"learning_rate": 3.17258883248731e-06,
"loss": 3.6334,
"step": 500
},
{
"epoch": 1.0141587301587303,
"grad_norm": 12.79332160949707,
"learning_rate": 3.3312182741116753e-06,
"loss": 3.4685,
"step": 525
},
{
"epoch": 1.0157460317460318,
"grad_norm": 8.90628719329834,
"learning_rate": 3.489847715736041e-06,
"loss": 3.3762,
"step": 550
},
{
"epoch": 1.0173333333333334,
"grad_norm": 12.635107040405273,
"learning_rate": 3.6484771573604065e-06,
"loss": 3.305,
"step": 575
},
{
"epoch": 1.018920634920635,
"grad_norm": 15.49641227722168,
"learning_rate": 3.8071065989847715e-06,
"loss": 2.9876,
"step": 600
},
{
"epoch": 2.001333333333333,
"grad_norm": 10.022441864013672,
"learning_rate": 3.965736040609137e-06,
"loss": 3.0157,
"step": 625
},
{
"epoch": 2.0029206349206348,
"grad_norm": 9.137686729431152,
"learning_rate": 4.124365482233503e-06,
"loss": 2.8705,
"step": 650
},
{
"epoch": 2.0045079365079363,
"grad_norm": 10.643122673034668,
"learning_rate": 4.282994923857868e-06,
"loss": 2.5582,
"step": 675
},
{
"epoch": 2.006095238095238,
"grad_norm": 8.754136085510254,
"learning_rate": 4.441624365482234e-06,
"loss": 2.419,
"step": 700
},
{
"epoch": 2.0076825396825395,
"grad_norm": 9.287137031555176,
"learning_rate": 4.6002538071066e-06,
"loss": 2.2705,
"step": 725
},
{
"epoch": 2.009269841269841,
"grad_norm": 10.788775444030762,
"learning_rate": 4.758883248730965e-06,
"loss": 2.2656,
"step": 750
},
{
"epoch": 2.0108571428571427,
"grad_norm": 9.800201416015625,
"learning_rate": 4.91751269035533e-06,
"loss": 1.9858,
"step": 775
},
{
"epoch": 2.0124444444444443,
"grad_norm": 10.444952011108398,
"learning_rate": 4.995989840930358e-06,
"loss": 1.8272,
"step": 800
},
{
"epoch": 2.014031746031746,
"grad_norm": 11.801770210266113,
"learning_rate": 4.987635342868601e-06,
"loss": 1.7213,
"step": 825
},
{
"epoch": 2.0156190476190474,
"grad_norm": 8.304800033569336,
"learning_rate": 4.979280844806844e-06,
"loss": 1.5774,
"step": 850
},
{
"epoch": 2.017206349206349,
"grad_norm": 10.936843872070312,
"learning_rate": 4.970926346745088e-06,
"loss": 1.5306,
"step": 875
},
{
"epoch": 2.0187936507936506,
"grad_norm": 12.08633804321289,
"learning_rate": 4.962571848683331e-06,
"loss": 1.4236,
"step": 900
},
{
"epoch": 3.001206349206349,
"grad_norm": 12.72133731842041,
"learning_rate": 4.954217350621575e-06,
"loss": 1.3777,
"step": 925
},
{
"epoch": 3.002793650793651,
"grad_norm": 7.71216344833374,
"learning_rate": 4.945862852559818e-06,
"loss": 1.3662,
"step": 950
},
{
"epoch": 3.0043809523809526,
"grad_norm": 4.924420356750488,
"learning_rate": 4.937508354498062e-06,
"loss": 1.1704,
"step": 975
},
{
"epoch": 3.005968253968254,
"grad_norm": 9.430765151977539,
"learning_rate": 4.9291538564363055e-06,
"loss": 1.1824,
"step": 1000
},
{
"epoch": 3.0075555555555558,
"grad_norm": 8.154838562011719,
"learning_rate": 4.9207993583745495e-06,
"loss": 1.1026,
"step": 1025
},
{
"epoch": 3.0091428571428573,
"grad_norm": 8.593589782714844,
"learning_rate": 4.912444860312793e-06,
"loss": 1.1226,
"step": 1050
},
{
"epoch": 3.010730158730159,
"grad_norm": 7.987609386444092,
"learning_rate": 4.904090362251037e-06,
"loss": 1.0897,
"step": 1075
},
{
"epoch": 3.0123174603174605,
"grad_norm": 6.220165252685547,
"learning_rate": 4.89573586418928e-06,
"loss": 1.0358,
"step": 1100
},
{
"epoch": 3.013904761904762,
"grad_norm": 5.584622383117676,
"learning_rate": 4.887381366127523e-06,
"loss": 1.0586,
"step": 1125
},
{
"epoch": 3.0154920634920637,
"grad_norm": 6.9964141845703125,
"learning_rate": 4.879026868065767e-06,
"loss": 1.0425,
"step": 1150
},
{
"epoch": 3.0170793650793652,
"grad_norm": 6.9891839027404785,
"learning_rate": 4.87067237000401e-06,
"loss": 1.0801,
"step": 1175
},
{
"epoch": 3.018666666666667,
"grad_norm": 5.334001541137695,
"learning_rate": 4.862317871942254e-06,
"loss": 1.071,
"step": 1200
},
{
"epoch": 4.001079365079365,
"grad_norm": 6.936366081237793,
"learning_rate": 4.853963373880497e-06,
"loss": 1.0287,
"step": 1225
},
{
"epoch": 4.002666666666666,
"grad_norm": 5.803761959075928,
"learning_rate": 4.845608875818741e-06,
"loss": 1.1338,
"step": 1250
},
{
"epoch": 4.004253968253968,
"grad_norm": 6.901465892791748,
"learning_rate": 4.837254377756984e-06,
"loss": 0.9522,
"step": 1275
},
{
"epoch": 4.0058412698412695,
"grad_norm": 7.466715335845947,
"learning_rate": 4.828899879695228e-06,
"loss": 0.9676,
"step": 1300
},
{
"epoch": 4.007428571428571,
"grad_norm": 5.247936248779297,
"learning_rate": 4.820545381633472e-06,
"loss": 0.9219,
"step": 1325
},
{
"epoch": 4.009015873015873,
"grad_norm": 9.886089324951172,
"learning_rate": 4.812190883571715e-06,
"loss": 0.9054,
"step": 1350
},
{
"epoch": 4.010603174603174,
"grad_norm": 6.104865550994873,
"learning_rate": 4.803836385509959e-06,
"loss": 0.9142,
"step": 1375
},
{
"epoch": 4.012190476190476,
"grad_norm": 7.953219413757324,
"learning_rate": 4.7954818874482025e-06,
"loss": 0.899,
"step": 1400
},
{
"epoch": 4.0137777777777774,
"grad_norm": 6.037745475769043,
"learning_rate": 4.7871273893864465e-06,
"loss": 0.9195,
"step": 1425
},
{
"epoch": 4.015365079365079,
"grad_norm": 5.599011421203613,
"learning_rate": 4.77877289132469e-06,
"loss": 0.9049,
"step": 1450
},
{
"epoch": 4.016952380952381,
"grad_norm": 9.971386909484863,
"learning_rate": 4.770418393262934e-06,
"loss": 0.9406,
"step": 1475
},
{
"epoch": 4.018539682539682,
"grad_norm": 5.549421310424805,
"learning_rate": 4.762063895201177e-06,
"loss": 0.9306,
"step": 1500
},
{
"epoch": 5.000952380952381,
"grad_norm": 8.038061141967773,
"learning_rate": 4.753709397139421e-06,
"loss": 0.9159,
"step": 1525
},
{
"epoch": 5.002539682539682,
"grad_norm": 4.798451900482178,
"learning_rate": 4.745354899077664e-06,
"loss": 0.9853,
"step": 1550
},
{
"epoch": 5.004126984126984,
"grad_norm": 5.497593402862549,
"learning_rate": 4.737000401015907e-06,
"loss": 0.8753,
"step": 1575
},
{
"epoch": 5.005714285714285,
"grad_norm": 8.052043914794922,
"learning_rate": 4.728645902954151e-06,
"loss": 0.8879,
"step": 1600
},
{
"epoch": 5.007301587301587,
"grad_norm": 10.569701194763184,
"learning_rate": 4.720291404892394e-06,
"loss": 0.8254,
"step": 1625
},
{
"epoch": 5.0088888888888885,
"grad_norm": 7.724920749664307,
"learning_rate": 4.711936906830638e-06,
"loss": 0.7917,
"step": 1650
},
{
"epoch": 5.01047619047619,
"grad_norm": 6.411539554595947,
"learning_rate": 4.703582408768881e-06,
"loss": 0.8296,
"step": 1675
},
{
"epoch": 5.012063492063492,
"grad_norm": 4.88054084777832,
"learning_rate": 4.695227910707125e-06,
"loss": 0.8315,
"step": 1700
},
{
"epoch": 5.013650793650793,
"grad_norm": 8.332351684570312,
"learning_rate": 4.6868734126453685e-06,
"loss": 0.8323,
"step": 1725
},
{
"epoch": 5.015238095238095,
"grad_norm": 5.097577095031738,
"learning_rate": 4.6785189145836124e-06,
"loss": 0.8122,
"step": 1750
},
{
"epoch": 5.016825396825396,
"grad_norm": 9.592188835144043,
"learning_rate": 4.670164416521856e-06,
"loss": 0.8712,
"step": 1775
},
{
"epoch": 5.018412698412698,
"grad_norm": 7.307371616363525,
"learning_rate": 4.6618099184601e-06,
"loss": 0.8648,
"step": 1800
},
{
"epoch": 6.000825396825396,
"grad_norm": 7.067652702331543,
"learning_rate": 4.653455420398343e-06,
"loss": 0.7985,
"step": 1825
},
{
"epoch": 6.002412698412698,
"grad_norm": 6.129504203796387,
"learning_rate": 4.645100922336586e-06,
"loss": 0.9276,
"step": 1850
},
{
"epoch": 6.004,
"grad_norm": 5.0192742347717285,
"learning_rate": 4.63674642427483e-06,
"loss": 0.8118,
"step": 1875
},
{
"epoch": 6.005587301587302,
"grad_norm": 9.476181030273438,
"learning_rate": 4.628391926213073e-06,
"loss": 0.8063,
"step": 1900
},
{
"epoch": 6.007174603174604,
"grad_norm": 4.5081987380981445,
"learning_rate": 4.620037428151317e-06,
"loss": 0.7605,
"step": 1925
},
{
"epoch": 6.008761904761905,
"grad_norm": 7.077738285064697,
"learning_rate": 4.61168293008956e-06,
"loss": 0.7435,
"step": 1950
},
{
"epoch": 6.010349206349207,
"grad_norm": 5.30238151550293,
"learning_rate": 4.603328432027804e-06,
"loss": 0.7603,
"step": 1975
},
{
"epoch": 6.011936507936508,
"grad_norm": 6.851855754852295,
"learning_rate": 4.594973933966047e-06,
"loss": 0.7606,
"step": 2000
},
{
"epoch": 6.01352380952381,
"grad_norm": 7.457930564880371,
"learning_rate": 4.586619435904291e-06,
"loss": 0.7822,
"step": 2025
},
{
"epoch": 6.0151111111111115,
"grad_norm": 7.502450466156006,
"learning_rate": 4.578264937842534e-06,
"loss": 0.7487,
"step": 2050
},
{
"epoch": 6.016698412698413,
"grad_norm": 6.5313544273376465,
"learning_rate": 4.569910439780778e-06,
"loss": 0.7878,
"step": 2075
},
{
"epoch": 6.018285714285715,
"grad_norm": 7.514427661895752,
"learning_rate": 4.5615559417190215e-06,
"loss": 0.812,
"step": 2100
},
{
"epoch": 7.000698412698413,
"grad_norm": 5.095201015472412,
"learning_rate": 4.5532014436572655e-06,
"loss": 0.7322,
"step": 2125
},
{
"epoch": 7.002285714285715,
"grad_norm": 5.384045600891113,
"learning_rate": 4.5448469455955095e-06,
"loss": 0.8814,
"step": 2150
},
{
"epoch": 7.003873015873016,
"grad_norm": 4.990402698516846,
"learning_rate": 4.536492447533753e-06,
"loss": 0.7483,
"step": 2175
},
{
"epoch": 7.005460317460318,
"grad_norm": 6.337180137634277,
"learning_rate": 4.528137949471997e-06,
"loss": 0.742,
"step": 2200
},
{
"epoch": 7.007047619047619,
"grad_norm": 4.511834621429443,
"learning_rate": 4.51978345141024e-06,
"loss": 0.7482,
"step": 2225
},
{
"epoch": 7.008634920634921,
"grad_norm": 10.161249160766602,
"learning_rate": 4.511428953348484e-06,
"loss": 0.6561,
"step": 2250
},
{
"epoch": 7.010222222222223,
"grad_norm": 13.714274406433105,
"learning_rate": 4.503074455286727e-06,
"loss": 0.7306,
"step": 2275
},
{
"epoch": 7.011809523809524,
"grad_norm": 7.517003536224365,
"learning_rate": 4.494719957224971e-06,
"loss": 0.6942,
"step": 2300
},
{
"epoch": 7.013396825396826,
"grad_norm": 5.552587509155273,
"learning_rate": 4.486365459163214e-06,
"loss": 0.7466,
"step": 2325
},
{
"epoch": 7.014984126984127,
"grad_norm": 6.234260559082031,
"learning_rate": 4.478010961101457e-06,
"loss": 0.6913,
"step": 2350
},
{
"epoch": 7.016571428571429,
"grad_norm": 5.961171627044678,
"learning_rate": 4.469656463039701e-06,
"loss": 0.7444,
"step": 2375
},
{
"epoch": 7.0181587301587305,
"grad_norm": 9.553787231445312,
"learning_rate": 4.461301964977944e-06,
"loss": 0.7764,
"step": 2400
},
{
"epoch": 8.000571428571428,
"grad_norm": 6.736727237701416,
"learning_rate": 4.452947466916188e-06,
"loss": 0.6902,
"step": 2425
},
{
"epoch": 8.00215873015873,
"grad_norm": 6.498841285705566,
"learning_rate": 4.4445929688544314e-06,
"loss": 0.8205,
"step": 2450
},
{
"epoch": 8.003746031746031,
"grad_norm": 3.605954647064209,
"learning_rate": 4.4362384707926754e-06,
"loss": 0.7139,
"step": 2475
},
{
"epoch": 8.005333333333333,
"grad_norm": 4.930444240570068,
"learning_rate": 4.427883972730919e-06,
"loss": 0.7082,
"step": 2500
},
{
"epoch": 8.006920634920634,
"grad_norm": 4.654155731201172,
"learning_rate": 4.4195294746691626e-06,
"loss": 0.734,
"step": 2525
},
{
"epoch": 8.008507936507936,
"grad_norm": 4.657145977020264,
"learning_rate": 4.411174976607406e-06,
"loss": 0.6076,
"step": 2550
},
{
"epoch": 8.010095238095237,
"grad_norm": 5.5561418533325195,
"learning_rate": 4.402820478545649e-06,
"loss": 0.6882,
"step": 2575
},
{
"epoch": 8.011682539682539,
"grad_norm": 4.575371742248535,
"learning_rate": 4.394465980483893e-06,
"loss": 0.6572,
"step": 2600
},
{
"epoch": 8.01326984126984,
"grad_norm": 13.730400085449219,
"learning_rate": 4.386111482422136e-06,
"loss": 0.7209,
"step": 2625
},
{
"epoch": 8.014857142857142,
"grad_norm": 4.949292182922363,
"learning_rate": 4.37775698436038e-06,
"loss": 0.6433,
"step": 2650
},
{
"epoch": 8.016444444444444,
"grad_norm": 9.161581039428711,
"learning_rate": 4.369402486298623e-06,
"loss": 0.7117,
"step": 2675
},
{
"epoch": 8.018031746031745,
"grad_norm": 6.098261833190918,
"learning_rate": 4.361047988236867e-06,
"loss": 0.7515,
"step": 2700
},
{
"epoch": 9.000444444444444,
"grad_norm": 5.116198539733887,
"learning_rate": 4.35269349017511e-06,
"loss": 0.6484,
"step": 2725
},
{
"epoch": 9.002031746031745,
"grad_norm": 10.22689437866211,
"learning_rate": 4.344338992113354e-06,
"loss": 0.7807,
"step": 2750
},
{
"epoch": 9.003619047619047,
"grad_norm": 12.227742195129395,
"learning_rate": 4.335984494051597e-06,
"loss": 0.6763,
"step": 2775
},
{
"epoch": 9.005206349206349,
"grad_norm": 9.396242141723633,
"learning_rate": 4.327629995989841e-06,
"loss": 0.6722,
"step": 2800
},
{
"epoch": 9.00679365079365,
"grad_norm": 5.258615016937256,
"learning_rate": 4.3192754979280845e-06,
"loss": 0.7176,
"step": 2825
},
{
"epoch": 9.008380952380952,
"grad_norm": 5.422908306121826,
"learning_rate": 4.3109209998663285e-06,
"loss": 0.5881,
"step": 2850
},
{
"epoch": 9.009968253968253,
"grad_norm": 7.922283172607422,
"learning_rate": 4.302566501804572e-06,
"loss": 0.6492,
"step": 2875
},
{
"epoch": 9.011555555555555,
"grad_norm": 8.866413116455078,
"learning_rate": 4.294212003742816e-06,
"loss": 0.6277,
"step": 2900
},
{
"epoch": 9.013142857142856,
"grad_norm": 9.206356048583984,
"learning_rate": 4.285857505681059e-06,
"loss": 0.6644,
"step": 2925
},
{
"epoch": 9.014730158730158,
"grad_norm": 5.49714994430542,
"learning_rate": 4.277503007619303e-06,
"loss": 0.6481,
"step": 2950
},
{
"epoch": 9.01631746031746,
"grad_norm": 6.799171447753906,
"learning_rate": 4.269148509557547e-06,
"loss": 0.67,
"step": 2975
},
{
"epoch": 9.017904761904761,
"grad_norm": 7.201016902923584,
"learning_rate": 4.26079401149579e-06,
"loss": 0.7103,
"step": 3000
},
{
"epoch": 10.00031746031746,
"grad_norm": 6.763806343078613,
"learning_rate": 4.252439513434034e-06,
"loss": 0.6487,
"step": 3025
},
{
"epoch": 10.001904761904761,
"grad_norm": 9.263688087463379,
"learning_rate": 4.244085015372277e-06,
"loss": 0.7082,
"step": 3050
},
{
"epoch": 10.003492063492063,
"grad_norm": 11.184915542602539,
"learning_rate": 4.23573051731052e-06,
"loss": 0.6759,
"step": 3075
},
{
"epoch": 10.005079365079364,
"grad_norm": 4.252372741699219,
"learning_rate": 4.227376019248764e-06,
"loss": 0.66,
"step": 3100
},
{
"epoch": 10.006666666666666,
"grad_norm": 11.342703819274902,
"learning_rate": 4.219021521187007e-06,
"loss": 0.6747,
"step": 3125
},
{
"epoch": 10.008253968253968,
"grad_norm": 5.704590797424316,
"learning_rate": 4.210667023125251e-06,
"loss": 0.5862,
"step": 3150
},
{
"epoch": 10.009841269841269,
"grad_norm": 5.683150291442871,
"learning_rate": 4.2023125250634944e-06,
"loss": 0.6129,
"step": 3175
},
{
"epoch": 10.01142857142857,
"grad_norm": 4.855335235595703,
"learning_rate": 4.193958027001738e-06,
"loss": 0.604,
"step": 3200
},
{
"epoch": 10.013015873015872,
"grad_norm": 7.8647613525390625,
"learning_rate": 4.1856035289399816e-06,
"loss": 0.6367,
"step": 3225
},
{
"epoch": 10.014603174603174,
"grad_norm": 6.591641902923584,
"learning_rate": 4.1772490308782256e-06,
"loss": 0.6371,
"step": 3250
},
{
"epoch": 10.016190476190475,
"grad_norm": 7.047679424285889,
"learning_rate": 4.168894532816469e-06,
"loss": 0.6433,
"step": 3275
},
{
"epoch": 10.017777777777777,
"grad_norm": 7.674762725830078,
"learning_rate": 4.160540034754712e-06,
"loss": 0.6696,
"step": 3300
},
{
"epoch": 11.000190476190475,
"grad_norm": 7.130011081695557,
"learning_rate": 4.152185536692956e-06,
"loss": 0.6527,
"step": 3325
},
{
"epoch": 11.001777777777777,
"grad_norm": 7.317767143249512,
"learning_rate": 4.143831038631199e-06,
"loss": 0.6586,
"step": 3350
},
{
"epoch": 11.003365079365079,
"grad_norm": 2.615405321121216,
"learning_rate": 4.135476540569443e-06,
"loss": 0.6771,
"step": 3375
},
{
"epoch": 11.00495238095238,
"grad_norm": 4.891953468322754,
"learning_rate": 4.127122042507686e-06,
"loss": 0.64,
"step": 3400
},
{
"epoch": 11.006539682539682,
"grad_norm": 6.664401531219482,
"learning_rate": 4.11876754444593e-06,
"loss": 0.6411,
"step": 3425
},
{
"epoch": 11.008126984126983,
"grad_norm": 6.38748836517334,
"learning_rate": 4.110413046384173e-06,
"loss": 0.5616,
"step": 3450
},
{
"epoch": 11.009714285714285,
"grad_norm": 6.957112789154053,
"learning_rate": 4.102058548322417e-06,
"loss": 0.5956,
"step": 3475
},
{
"epoch": 11.011301587301586,
"grad_norm": 4.554030895233154,
"learning_rate": 4.09370405026066e-06,
"loss": 0.5998,
"step": 3500
},
{
"epoch": 11.012888888888888,
"grad_norm": 5.208797454833984,
"learning_rate": 4.085349552198904e-06,
"loss": 0.616,
"step": 3525
},
{
"epoch": 11.01447619047619,
"grad_norm": 6.402866840362549,
"learning_rate": 4.0769950541371475e-06,
"loss": 0.5958,
"step": 3550
},
{
"epoch": 11.016063492063491,
"grad_norm": 5.4900946617126465,
"learning_rate": 4.0686405560753915e-06,
"loss": 0.6173,
"step": 3575
},
{
"epoch": 11.017650793650793,
"grad_norm": 9.454499244689941,
"learning_rate": 4.060286058013635e-06,
"loss": 0.6567,
"step": 3600
},
{
"epoch": 12.000063492063491,
"grad_norm": 9.544127464294434,
"learning_rate": 4.051931559951879e-06,
"loss": 0.6164,
"step": 3625
},
{
"epoch": 12.001650793650793,
"grad_norm": 5.444927215576172,
"learning_rate": 4.043577061890122e-06,
"loss": 0.6563,
"step": 3650
},
{
"epoch": 12.003238095238094,
"grad_norm": 3.2568845748901367,
"learning_rate": 4.035222563828366e-06,
"loss": 0.6631,
"step": 3675
},
{
"epoch": 12.004825396825396,
"grad_norm": 11.233345031738281,
"learning_rate": 4.026868065766609e-06,
"loss": 0.6186,
"step": 3700
},
{
"epoch": 12.006412698412698,
"grad_norm": 5.101284027099609,
"learning_rate": 4.018513567704853e-06,
"loss": 0.6307,
"step": 3725
},
{
"epoch": 12.008,
"grad_norm": 7.161935329437256,
"learning_rate": 4.010159069643096e-06,
"loss": 0.5363,
"step": 3750
},
{
"epoch": 12.0095873015873,
"grad_norm": 8.10970401763916,
"learning_rate": 4.00180457158134e-06,
"loss": 0.5729,
"step": 3775
},
{
"epoch": 12.011174603174604,
"grad_norm": 6.0759077072143555,
"learning_rate": 3.993450073519583e-06,
"loss": 0.5785,
"step": 3800
},
{
"epoch": 12.012761904761906,
"grad_norm": 11.944267272949219,
"learning_rate": 3.985095575457827e-06,
"loss": 0.5812,
"step": 3825
},
{
"epoch": 12.014349206349207,
"grad_norm": 6.059834957122803,
"learning_rate": 3.97674107739607e-06,
"loss": 0.5958,
"step": 3850
},
{
"epoch": 12.015936507936509,
"grad_norm": 5.849289417266846,
"learning_rate": 3.968386579334314e-06,
"loss": 0.6121,
"step": 3875
},
{
"epoch": 12.01752380952381,
"grad_norm": 5.543300628662109,
"learning_rate": 3.960032081272557e-06,
"loss": 0.6259,
"step": 3900
},
{
"epoch": 12.019111111111112,
"grad_norm": 5.864200592041016,
"learning_rate": 3.951677583210801e-06,
"loss": 0.597,
"step": 3925
},
{
"epoch": 13.00152380952381,
"grad_norm": 9.676383972167969,
"learning_rate": 3.9433230851490445e-06,
"loss": 0.6586,
"step": 3950
},
{
"epoch": 13.003111111111112,
"grad_norm": 11.184745788574219,
"learning_rate": 3.9349685870872885e-06,
"loss": 0.644,
"step": 3975
},
{
"epoch": 13.004698412698414,
"grad_norm": 6.4422502517700195,
"learning_rate": 3.926614089025532e-06,
"loss": 0.5813,
"step": 4000
},
{
"epoch": 13.006285714285715,
"grad_norm": 5.095337390899658,
"learning_rate": 3.918259590963775e-06,
"loss": 0.6238,
"step": 4025
},
{
"epoch": 13.007873015873017,
"grad_norm": 7.0134663581848145,
"learning_rate": 3.909905092902019e-06,
"loss": 0.5323,
"step": 4050
},
{
"epoch": 13.009460317460318,
"grad_norm": 11.49138069152832,
"learning_rate": 3.901550594840262e-06,
"loss": 0.5527,
"step": 4075
},
{
"epoch": 13.01104761904762,
"grad_norm": 5.017846584320068,
"learning_rate": 3.893196096778506e-06,
"loss": 0.5561,
"step": 4100
},
{
"epoch": 13.012634920634921,
"grad_norm": 6.452044486999512,
"learning_rate": 3.884841598716749e-06,
"loss": 0.5796,
"step": 4125
},
{
"epoch": 13.014222222222223,
"grad_norm": 4.59455680847168,
"learning_rate": 3.876487100654993e-06,
"loss": 0.5686,
"step": 4150
},
{
"epoch": 13.015809523809525,
"grad_norm": 6.177151203155518,
"learning_rate": 3.868132602593236e-06,
"loss": 0.5974,
"step": 4175
},
{
"epoch": 13.017396825396826,
"grad_norm": 8.819549560546875,
"learning_rate": 3.85977810453148e-06,
"loss": 0.6186,
"step": 4200
},
{
"epoch": 13.018984126984128,
"grad_norm": 4.838363170623779,
"learning_rate": 3.851423606469723e-06,
"loss": 0.5783,
"step": 4225
},
{
"epoch": 14.001396825396826,
"grad_norm": 6.992326736450195,
"learning_rate": 3.843069108407967e-06,
"loss": 0.6319,
"step": 4250
},
{
"epoch": 14.002984126984128,
"grad_norm": 6.273831367492676,
"learning_rate": 3.8347146103462105e-06,
"loss": 0.6446,
"step": 4275
},
{
"epoch": 14.00457142857143,
"grad_norm": 5.901345252990723,
"learning_rate": 3.826360112284454e-06,
"loss": 0.5465,
"step": 4300
},
{
"epoch": 14.006158730158731,
"grad_norm": 9.980649948120117,
"learning_rate": 3.818005614222698e-06,
"loss": 0.6371,
"step": 4325
},
{
"epoch": 14.007746031746033,
"grad_norm": 3.479801654815674,
"learning_rate": 3.809651116160941e-06,
"loss": 0.5216,
"step": 4350
},
{
"epoch": 14.009333333333334,
"grad_norm": 7.416255950927734,
"learning_rate": 3.801296618099185e-06,
"loss": 0.5298,
"step": 4375
},
{
"epoch": 14.010920634920636,
"grad_norm": 6.53993034362793,
"learning_rate": 3.7929421200374283e-06,
"loss": 0.5488,
"step": 4400
},
{
"epoch": 14.012507936507937,
"grad_norm": 5.593329906463623,
"learning_rate": 3.7845876219756723e-06,
"loss": 0.5506,
"step": 4425
},
{
"epoch": 14.014095238095239,
"grad_norm": 5.567030906677246,
"learning_rate": 3.7762331239139154e-06,
"loss": 0.5616,
"step": 4450
},
{
"epoch": 14.01568253968254,
"grad_norm": 11.033809661865234,
"learning_rate": 3.7678786258521594e-06,
"loss": 0.5849,
"step": 4475
},
{
"epoch": 14.017269841269842,
"grad_norm": 5.325310707092285,
"learning_rate": 3.7595241277904026e-06,
"loss": 0.5911,
"step": 4500
},
{
"epoch": 14.018857142857144,
"grad_norm": 10.827190399169922,
"learning_rate": 3.751169629728646e-06,
"loss": 0.5941,
"step": 4525
},
{
"epoch": 15.001269841269842,
"grad_norm": 6.941823482513428,
"learning_rate": 3.7428151316668897e-06,
"loss": 0.6097,
"step": 4550
},
{
"epoch": 15.002857142857144,
"grad_norm": 11.185466766357422,
"learning_rate": 3.7344606336051333e-06,
"loss": 0.6295,
"step": 4575
},
{
"epoch": 15.004444444444445,
"grad_norm": 7.75522518157959,
"learning_rate": 3.726106135543377e-06,
"loss": 0.5308,
"step": 4600
},
{
"epoch": 15.006031746031747,
"grad_norm": 5.286986827850342,
"learning_rate": 3.7177516374816204e-06,
"loss": 0.6022,
"step": 4625
},
{
"epoch": 15.007619047619048,
"grad_norm": 6.149432182312012,
"learning_rate": 3.709397139419864e-06,
"loss": 0.5457,
"step": 4650
},
{
"epoch": 15.00920634920635,
"grad_norm": 7.458939552307129,
"learning_rate": 3.7010426413581075e-06,
"loss": 0.5073,
"step": 4675
},
{
"epoch": 15.010793650793651,
"grad_norm": 5.971858024597168,
"learning_rate": 3.692688143296351e-06,
"loss": 0.5418,
"step": 4700
},
{
"epoch": 15.012380952380953,
"grad_norm": 5.619646072387695,
"learning_rate": 3.6843336452345947e-06,
"loss": 0.5423,
"step": 4725
},
{
"epoch": 15.013968253968255,
"grad_norm": 6.382497787475586,
"learning_rate": 3.675979147172838e-06,
"loss": 0.5446,
"step": 4750
},
{
"epoch": 15.015555555555556,
"grad_norm": 6.406772136688232,
"learning_rate": 3.667624649111082e-06,
"loss": 0.5734,
"step": 4775
},
{
"epoch": 15.017142857142858,
"grad_norm": 7.28453254699707,
"learning_rate": 3.659270151049325e-06,
"loss": 0.5786,
"step": 4800
},
{
"epoch": 15.01873015873016,
"grad_norm": 9.923730850219727,
"learning_rate": 3.650915652987569e-06,
"loss": 0.5791,
"step": 4825
},
{
"epoch": 16.001142857142856,
"grad_norm": 8.251771926879883,
"learning_rate": 3.642561154925812e-06,
"loss": 0.5688,
"step": 4850
},
{
"epoch": 16.00273015873016,
"grad_norm": 7.045546054840088,
"learning_rate": 3.634206656864056e-06,
"loss": 0.6337,
"step": 4875
},
{
"epoch": 16.00431746031746,
"grad_norm": 8.073708534240723,
"learning_rate": 3.625852158802299e-06,
"loss": 0.5459,
"step": 4900
},
{
"epoch": 16.005904761904763,
"grad_norm": 4.836098670959473,
"learning_rate": 3.617497660740543e-06,
"loss": 0.5852,
"step": 4925
},
{
"epoch": 16.007492063492062,
"grad_norm": 6.777514457702637,
"learning_rate": 3.6091431626787863e-06,
"loss": 0.5408,
"step": 4950
},
{
"epoch": 16.009079365079366,
"grad_norm": 4.61210298538208,
"learning_rate": 3.6007886646170303e-06,
"loss": 0.4816,
"step": 4975
},
{
"epoch": 16.010666666666665,
"grad_norm": 6.172779083251953,
"learning_rate": 3.5924341665552735e-06,
"loss": 0.528,
"step": 5000
},
{
"epoch": 16.01225396825397,
"grad_norm": 5.35014009475708,
"learning_rate": 3.584079668493517e-06,
"loss": 0.5443,
"step": 5025
},
{
"epoch": 16.01384126984127,
"grad_norm": 7.521047115325928,
"learning_rate": 3.575725170431761e-06,
"loss": 0.5411,
"step": 5050
},
{
"epoch": 16.015428571428572,
"grad_norm": 3.7705650329589844,
"learning_rate": 3.567370672370004e-06,
"loss": 0.555,
"step": 5075
},
{
"epoch": 16.017015873015872,
"grad_norm": 4.953466892242432,
"learning_rate": 3.559016174308248e-06,
"loss": 0.5691,
"step": 5100
},
{
"epoch": 16.018603174603175,
"grad_norm": 9.339215278625488,
"learning_rate": 3.5506616762464913e-06,
"loss": 0.5754,
"step": 5125
},
{
"epoch": 17.001015873015874,
"grad_norm": 6.013854503631592,
"learning_rate": 3.5423071781847353e-06,
"loss": 0.5407,
"step": 5150
},
{
"epoch": 17.002603174603173,
"grad_norm": 6.144811630249023,
"learning_rate": 3.5339526801229784e-06,
"loss": 0.6193,
"step": 5175
},
{
"epoch": 17.004190476190477,
"grad_norm": 4.639403820037842,
"learning_rate": 3.5255981820612224e-06,
"loss": 0.5446,
"step": 5200
},
{
"epoch": 17.005777777777777,
"grad_norm": 3.8471908569335938,
"learning_rate": 3.5172436839994656e-06,
"loss": 0.5891,
"step": 5225
},
{
"epoch": 17.00736507936508,
"grad_norm": 4.270881175994873,
"learning_rate": 3.5088891859377087e-06,
"loss": 0.5323,
"step": 5250
},
{
"epoch": 17.00895238095238,
"grad_norm": 6.322847366333008,
"learning_rate": 3.5005346878759527e-06,
"loss": 0.4589,
"step": 5275
},
{
"epoch": 17.010539682539683,
"grad_norm": 18.737030029296875,
"learning_rate": 3.492180189814196e-06,
"loss": 0.5348,
"step": 5300
},
{
"epoch": 17.012126984126983,
"grad_norm": 3.4430785179138184,
"learning_rate": 3.48382569175244e-06,
"loss": 0.5397,
"step": 5325
},
{
"epoch": 17.013714285714286,
"grad_norm": 6.301079273223877,
"learning_rate": 3.4754711936906834e-06,
"loss": 0.5254,
"step": 5350
},
{
"epoch": 17.015301587301586,
"grad_norm": 10.843756675720215,
"learning_rate": 3.467116695628927e-06,
"loss": 0.5535,
"step": 5375
},
{
"epoch": 17.01688888888889,
"grad_norm": 9.099514961242676,
"learning_rate": 3.4587621975671705e-06,
"loss": 0.5458,
"step": 5400
},
{
"epoch": 17.01847619047619,
"grad_norm": 5.591258525848389,
"learning_rate": 3.450407699505414e-06,
"loss": 0.5715,
"step": 5425
},
{
"epoch": 18.000888888888888,
"grad_norm": 7.137011528015137,
"learning_rate": 3.4420532014436577e-06,
"loss": 0.5219,
"step": 5450
},
{
"epoch": 18.00247619047619,
"grad_norm": 9.867881774902344,
"learning_rate": 3.4336987033819012e-06,
"loss": 0.6136,
"step": 5475
},
{
"epoch": 18.00406349206349,
"grad_norm": 11.723578453063965,
"learning_rate": 3.4253442053201448e-06,
"loss": 0.5442,
"step": 5500
},
{
"epoch": 18.005650793650794,
"grad_norm": 5.498622417449951,
"learning_rate": 3.416989707258388e-06,
"loss": 0.5691,
"step": 5525
},
{
"epoch": 18.007238095238094,
"grad_norm": 5.055809020996094,
"learning_rate": 3.408635209196632e-06,
"loss": 0.5105,
"step": 5550
},
{
"epoch": 18.008825396825397,
"grad_norm": 3.1195857524871826,
"learning_rate": 3.400280711134875e-06,
"loss": 0.4747,
"step": 5575
},
{
"epoch": 18.010412698412697,
"grad_norm": 11.140824317932129,
"learning_rate": 3.391926213073119e-06,
"loss": 0.5209,
"step": 5600
},
{
"epoch": 18.012,
"grad_norm": 6.157780647277832,
"learning_rate": 3.383571715011362e-06,
"loss": 0.534,
"step": 5625
},
{
"epoch": 18.0135873015873,
"grad_norm": 4.913928031921387,
"learning_rate": 3.375217216949606e-06,
"loss": 0.5126,
"step": 5650
},
{
"epoch": 18.015174603174604,
"grad_norm": 7.3261895179748535,
"learning_rate": 3.3668627188878493e-06,
"loss": 0.5411,
"step": 5675
},
{
"epoch": 18.016761904761903,
"grad_norm": 4.676502227783203,
"learning_rate": 3.3585082208260933e-06,
"loss": 0.5322,
"step": 5700
},
{
"epoch": 18.018349206349207,
"grad_norm": 8.648612976074219,
"learning_rate": 3.3501537227643365e-06,
"loss": 0.5704,
"step": 5725
},
{
"epoch": 19.000761904761905,
"grad_norm": 10.17095947265625,
"learning_rate": 3.34179922470258e-06,
"loss": 0.5118,
"step": 5750
},
{
"epoch": 19.002349206349205,
"grad_norm": 4.843137264251709,
"learning_rate": 3.3334447266408236e-06,
"loss": 0.6025,
"step": 5775
},
{
"epoch": 19.00393650793651,
"grad_norm": 8.146428108215332,
"learning_rate": 3.325090228579067e-06,
"loss": 0.5498,
"step": 5800
},
{
"epoch": 19.005523809523808,
"grad_norm": 9.00712776184082,
"learning_rate": 3.3167357305173107e-06,
"loss": 0.5468,
"step": 5825
},
{
"epoch": 19.00711111111111,
"grad_norm": 4.855189800262451,
"learning_rate": 3.3083812324555543e-06,
"loss": 0.5228,
"step": 5850
},
{
"epoch": 19.00869841269841,
"grad_norm": 4.557061672210693,
"learning_rate": 3.300026734393798e-06,
"loss": 0.4595,
"step": 5875
},
{
"epoch": 19.010285714285715,
"grad_norm": 8.567035675048828,
"learning_rate": 3.2916722363320414e-06,
"loss": 0.519,
"step": 5900
},
{
"epoch": 19.011873015873014,
"grad_norm": 26.754615783691406,
"learning_rate": 3.2833177382702854e-06,
"loss": 0.5212,
"step": 5925
},
{
"epoch": 19.013460317460318,
"grad_norm": 4.910426139831543,
"learning_rate": 3.2749632402085285e-06,
"loss": 0.5025,
"step": 5950
},
{
"epoch": 19.015047619047618,
"grad_norm": 11.170868873596191,
"learning_rate": 3.2666087421467717e-06,
"loss": 0.5109,
"step": 5975
},
{
"epoch": 19.01663492063492,
"grad_norm": 8.7157564163208,
"learning_rate": 3.2582542440850157e-06,
"loss": 0.5374,
"step": 6000
},
{
"epoch": 19.01822222222222,
"grad_norm": 5.3223700523376465,
"learning_rate": 3.249899746023259e-06,
"loss": 0.5608,
"step": 6025
},
{
"epoch": 20.00063492063492,
"grad_norm": 15.868850708007812,
"learning_rate": 3.241545247961503e-06,
"loss": 0.5028,
"step": 6050
},
{
"epoch": 20.002222222222223,
"grad_norm": 7.932621955871582,
"learning_rate": 3.233190749899746e-06,
"loss": 0.6152,
"step": 6075
},
{
"epoch": 20.003809523809522,
"grad_norm": 10.044479370117188,
"learning_rate": 3.22483625183799e-06,
"loss": 0.5187,
"step": 6100
},
{
"epoch": 20.005396825396826,
"grad_norm": 7.519008159637451,
"learning_rate": 3.216481753776233e-06,
"loss": 0.5333,
"step": 6125
},
{
"epoch": 20.006984126984126,
"grad_norm": 4.2018866539001465,
"learning_rate": 3.208127255714477e-06,
"loss": 0.546,
"step": 6150
},
{
"epoch": 20.00857142857143,
"grad_norm": 6.87973690032959,
"learning_rate": 3.1997727576527206e-06,
"loss": 0.4432,
"step": 6175
},
{
"epoch": 20.01015873015873,
"grad_norm": 6.215482711791992,
"learning_rate": 3.191418259590964e-06,
"loss": 0.5015,
"step": 6200
},
{
"epoch": 20.011746031746032,
"grad_norm": 5.1478753089904785,
"learning_rate": 3.1830637615292078e-06,
"loss": 0.5034,
"step": 6225
},
{
"epoch": 20.013333333333332,
"grad_norm": 3.017598867416382,
"learning_rate": 3.174709263467451e-06,
"loss": 0.5259,
"step": 6250
},
{
"epoch": 20.014920634920635,
"grad_norm": 9.40729808807373,
"learning_rate": 3.166354765405695e-06,
"loss": 0.4903,
"step": 6275
},
{
"epoch": 20.016507936507935,
"grad_norm": 10.465718269348145,
"learning_rate": 3.158000267343938e-06,
"loss": 0.5392,
"step": 6300
},
{
"epoch": 20.01809523809524,
"grad_norm": 5.032984733581543,
"learning_rate": 3.149645769282182e-06,
"loss": 0.5571,
"step": 6325
},
{
"epoch": 21.000507936507937,
"grad_norm": 9.124772071838379,
"learning_rate": 3.141291271220425e-06,
"loss": 0.4855,
"step": 6350
},
{
"epoch": 21.002095238095237,
"grad_norm": 5.090267181396484,
"learning_rate": 3.132936773158669e-06,
"loss": 0.5912,
"step": 6375
},
{
"epoch": 21.00368253968254,
"grad_norm": 7.209656238555908,
"learning_rate": 3.1245822750969123e-06,
"loss": 0.5315,
"step": 6400
},
{
"epoch": 21.00526984126984,
"grad_norm": 10.766797065734863,
"learning_rate": 3.1162277770351563e-06,
"loss": 0.5364,
"step": 6425
},
{
"epoch": 21.006857142857143,
"grad_norm": 9.655423164367676,
"learning_rate": 3.1078732789733994e-06,
"loss": 0.5469,
"step": 6450
},
{
"epoch": 21.008444444444443,
"grad_norm": 9.706192970275879,
"learning_rate": 3.099518780911643e-06,
"loss": 0.4377,
"step": 6475
},
{
"epoch": 21.010031746031746,
"grad_norm": 5.6594343185424805,
"learning_rate": 3.0911642828498866e-06,
"loss": 0.4867,
"step": 6500
},
{
"epoch": 21.011619047619046,
"grad_norm": 9.920619010925293,
"learning_rate": 3.08280978478813e-06,
"loss": 0.497,
"step": 6525
},
{
"epoch": 21.01320634920635,
"grad_norm": 4.976430892944336,
"learning_rate": 3.0744552867263737e-06,
"loss": 0.4949,
"step": 6550
},
{
"epoch": 21.01479365079365,
"grad_norm": 7.563751697540283,
"learning_rate": 3.0661007886646173e-06,
"loss": 0.5136,
"step": 6575
},
{
"epoch": 21.016380952380953,
"grad_norm": 5.668909072875977,
"learning_rate": 3.057746290602861e-06,
"loss": 0.5097,
"step": 6600
},
{
"epoch": 21.017968253968252,
"grad_norm": 6.428191661834717,
"learning_rate": 3.0493917925411044e-06,
"loss": 0.5594,
"step": 6625
},
{
"epoch": 22.00038095238095,
"grad_norm": 6.726556301116943,
"learning_rate": 3.041037294479348e-06,
"loss": 0.4874,
"step": 6650
},
{
"epoch": 22.001968253968254,
"grad_norm": 14.118648529052734,
"learning_rate": 3.0326827964175915e-06,
"loss": 0.5642,
"step": 6675
},
{
"epoch": 22.003555555555554,
"grad_norm": 11.719437599182129,
"learning_rate": 3.024328298355835e-06,
"loss": 0.5228,
"step": 6700
},
{
"epoch": 22.005142857142857,
"grad_norm": 7.100255489349365,
"learning_rate": 3.0159738002940787e-06,
"loss": 0.5289,
"step": 6725
},
{
"epoch": 22.006730158730157,
"grad_norm": 15.27164363861084,
"learning_rate": 3.007619302232322e-06,
"loss": 0.5544,
"step": 6750
},
{
"epoch": 22.00831746031746,
"grad_norm": 6.844352722167969,
"learning_rate": 2.999264804170566e-06,
"loss": 0.4507,
"step": 6775
},
{
"epoch": 22.00990476190476,
"grad_norm": 5.465493202209473,
"learning_rate": 2.990910306108809e-06,
"loss": 0.4717,
"step": 6800
},
{
"epoch": 22.011492063492064,
"grad_norm": 6.300055503845215,
"learning_rate": 2.982555808047053e-06,
"loss": 0.4833,
"step": 6825
},
{
"epoch": 22.013079365079363,
"grad_norm": 3.9687702655792236,
"learning_rate": 2.974201309985296e-06,
"loss": 0.4925,
"step": 6850
},
{
"epoch": 22.014666666666667,
"grad_norm": 5.950267791748047,
"learning_rate": 2.96584681192354e-06,
"loss": 0.5093,
"step": 6875
},
{
"epoch": 22.016253968253967,
"grad_norm": 7.3085618019104,
"learning_rate": 2.957492313861783e-06,
"loss": 0.5095,
"step": 6900
},
{
"epoch": 22.01784126984127,
"grad_norm": 10.438004493713379,
"learning_rate": 2.949137815800027e-06,
"loss": 0.5509,
"step": 6925
},
{
"epoch": 23.00025396825397,
"grad_norm": 5.851992130279541,
"learning_rate": 2.9407833177382703e-06,
"loss": 0.4989,
"step": 6950
},
{
"epoch": 23.001841269841268,
"grad_norm": 5.808182716369629,
"learning_rate": 2.932428819676514e-06,
"loss": 0.531,
"step": 6975
},
{
"epoch": 23.00342857142857,
"grad_norm": 4.814669132232666,
"learning_rate": 2.924074321614758e-06,
"loss": 0.5413,
"step": 7000
},
{
"epoch": 23.00501587301587,
"grad_norm": 7.406203269958496,
"learning_rate": 2.915719823553001e-06,
"loss": 0.5223,
"step": 7025
},
{
"epoch": 23.006603174603175,
"grad_norm": 4.7713942527771,
"learning_rate": 2.907365325491245e-06,
"loss": 0.5251,
"step": 7050
},
{
"epoch": 23.008190476190475,
"grad_norm": 4.403865814208984,
"learning_rate": 2.899010827429488e-06,
"loss": 0.46,
"step": 7075
},
{
"epoch": 23.009777777777778,
"grad_norm": 5.674661636352539,
"learning_rate": 2.890656329367732e-06,
"loss": 0.4705,
"step": 7100
},
{
"epoch": 23.011365079365078,
"grad_norm": 7.83860445022583,
"learning_rate": 2.8823018313059753e-06,
"loss": 0.4906,
"step": 7125
},
{
"epoch": 23.01295238095238,
"grad_norm": 3.9756040573120117,
"learning_rate": 2.8739473332442193e-06,
"loss": 0.4931,
"step": 7150
},
{
"epoch": 23.01453968253968,
"grad_norm": 4.530709743499756,
"learning_rate": 2.8655928351824624e-06,
"loss": 0.4957,
"step": 7175
},
{
"epoch": 23.016126984126984,
"grad_norm": 7.570037364959717,
"learning_rate": 2.8572383371207056e-06,
"loss": 0.5039,
"step": 7200
},
{
"epoch": 23.017714285714284,
"grad_norm": 6.422541618347168,
"learning_rate": 2.8488838390589496e-06,
"loss": 0.5312,
"step": 7225
},
{
"epoch": 24.000126984126982,
"grad_norm": 7.004579544067383,
"learning_rate": 2.8405293409971927e-06,
"loss": 0.4954,
"step": 7250
},
{
"epoch": 24.001714285714286,
"grad_norm": 11.172274589538574,
"learning_rate": 2.8321748429354367e-06,
"loss": 0.5304,
"step": 7275
},
{
"epoch": 24.003301587301586,
"grad_norm": 6.250117301940918,
"learning_rate": 2.8238203448736803e-06,
"loss": 0.5508,
"step": 7300
},
{
"epoch": 24.00488888888889,
"grad_norm": 5.038013935089111,
"learning_rate": 2.815465846811924e-06,
"loss": 0.5128,
"step": 7325
},
{
"epoch": 24.00647619047619,
"grad_norm": 3.5625054836273193,
"learning_rate": 2.8071113487501674e-06,
"loss": 0.5219,
"step": 7350
},
{
"epoch": 24.008063492063492,
"grad_norm": 4.982530117034912,
"learning_rate": 2.798756850688411e-06,
"loss": 0.4495,
"step": 7375
},
{
"epoch": 24.009650793650792,
"grad_norm": 17.86178207397461,
"learning_rate": 2.7904023526266545e-06,
"loss": 0.4743,
"step": 7400
},
{
"epoch": 24.011238095238095,
"grad_norm": 6.184370517730713,
"learning_rate": 2.782047854564898e-06,
"loss": 0.4768,
"step": 7425
},
{
"epoch": 24.012825396825395,
"grad_norm": 11.036730766296387,
"learning_rate": 2.7736933565031416e-06,
"loss": 0.493,
"step": 7450
},
{
"epoch": 24.0144126984127,
"grad_norm": 7.9786577224731445,
"learning_rate": 2.765338858441385e-06,
"loss": 0.4758,
"step": 7475
},
{
"epoch": 24.016,
"grad_norm": 5.915741443634033,
"learning_rate": 2.7569843603796288e-06,
"loss": 0.5088,
"step": 7500
},
{
"epoch": 24.0175873015873,
"grad_norm": 10.715784072875977,
"learning_rate": 2.748629862317872e-06,
"loss": 0.5091,
"step": 7525
},
{
"epoch": 24.0191746031746,
"grad_norm": 6.6324381828308105,
"learning_rate": 2.740275364256116e-06,
"loss": 0.4805,
"step": 7550
},
{
"epoch": 25.001587301587303,
"grad_norm": 4.505024433135986,
"learning_rate": 2.731920866194359e-06,
"loss": 0.55,
"step": 7575
},
{
"epoch": 25.003174603174603,
"grad_norm": 6.079422950744629,
"learning_rate": 2.723566368132603e-06,
"loss": 0.5442,
"step": 7600
},
{
"epoch": 25.004761904761907,
"grad_norm": 5.729933738708496,
"learning_rate": 2.715211870070846e-06,
"loss": 0.4952,
"step": 7625
},
{
"epoch": 25.006349206349206,
"grad_norm": 12.97045612335205,
"learning_rate": 2.70685737200909e-06,
"loss": 0.5312,
"step": 7650
},
{
"epoch": 25.00793650793651,
"grad_norm": 6.688389301300049,
"learning_rate": 2.6985028739473333e-06,
"loss": 0.4426,
"step": 7675
},
{
"epoch": 25.00952380952381,
"grad_norm": 5.51877498626709,
"learning_rate": 2.690148375885577e-06,
"loss": 0.4597,
"step": 7700
},
{
"epoch": 25.011111111111113,
"grad_norm": 6.6266374588012695,
"learning_rate": 2.6817938778238204e-06,
"loss": 0.4767,
"step": 7725
},
{
"epoch": 25.012698412698413,
"grad_norm": 4.988967418670654,
"learning_rate": 2.673439379762064e-06,
"loss": 0.4721,
"step": 7750
},
{
"epoch": 25.014285714285716,
"grad_norm": 5.249930381774902,
"learning_rate": 2.6650848817003076e-06,
"loss": 0.481,
"step": 7775
},
{
"epoch": 25.015873015873016,
"grad_norm": 8.894637107849121,
"learning_rate": 2.656730383638551e-06,
"loss": 0.5092,
"step": 7800
},
{
"epoch": 25.01746031746032,
"grad_norm": 6.139794826507568,
"learning_rate": 2.648375885576795e-06,
"loss": 0.5146,
"step": 7825
},
{
"epoch": 25.01904761904762,
"grad_norm": 4.836121082305908,
"learning_rate": 2.6400213875150383e-06,
"loss": 0.4853,
"step": 7850
},
{
"epoch": 26.001460317460317,
"grad_norm": 4.840237140655518,
"learning_rate": 2.6316668894532823e-06,
"loss": 0.5325,
"step": 7875
},
{
"epoch": 26.00304761904762,
"grad_norm": 6.270430088043213,
"learning_rate": 2.6233123913915254e-06,
"loss": 0.5517,
"step": 7900
},
{
"epoch": 26.00463492063492,
"grad_norm": 6.732022285461426,
"learning_rate": 2.6149578933297694e-06,
"loss": 0.4771,
"step": 7925
},
{
"epoch": 26.006222222222224,
"grad_norm": 4.249831199645996,
"learning_rate": 2.6066033952680125e-06,
"loss": 0.5281,
"step": 7950
},
{
"epoch": 26.007809523809524,
"grad_norm": 9.650166511535645,
"learning_rate": 2.5982488972062557e-06,
"loss": 0.4461,
"step": 7975
},
{
"epoch": 26.009396825396827,
"grad_norm": 5.7691216468811035,
"learning_rate": 2.5898943991444997e-06,
"loss": 0.4487,
"step": 8000
},
{
"epoch": 26.010984126984127,
"grad_norm": 5.991948127746582,
"learning_rate": 2.581539901082743e-06,
"loss": 0.4715,
"step": 8025
},
{
"epoch": 26.01257142857143,
"grad_norm": 11.065790176391602,
"learning_rate": 2.573185403020987e-06,
"loss": 0.4742,
"step": 8050
},
{
"epoch": 26.01415873015873,
"grad_norm": 11.387042045593262,
"learning_rate": 2.56483090495923e-06,
"loss": 0.4793,
"step": 8075
},
{
"epoch": 26.015746031746033,
"grad_norm": 7.323668479919434,
"learning_rate": 2.556476406897474e-06,
"loss": 0.4933,
"step": 8100
},
{
"epoch": 26.017333333333333,
"grad_norm": 10.183083534240723,
"learning_rate": 2.5481219088357175e-06,
"loss": 0.5176,
"step": 8125
},
{
"epoch": 26.018920634920637,
"grad_norm": 3.41259503364563,
"learning_rate": 2.539767410773961e-06,
"loss": 0.4789,
"step": 8150
},
{
"epoch": 27.001333333333335,
"grad_norm": 8.132092475891113,
"learning_rate": 2.5314129127122046e-06,
"loss": 0.5315,
"step": 8175
},
{
"epoch": 27.002920634920635,
"grad_norm": 6.488096237182617,
"learning_rate": 2.5230584146504478e-06,
"loss": 0.5424,
"step": 8200
},
{
"epoch": 27.004507936507938,
"grad_norm": 7.1543803215026855,
"learning_rate": 2.5147039165886918e-06,
"loss": 0.4628,
"step": 8225
},
{
"epoch": 27.006095238095238,
"grad_norm": 6.017189025878906,
"learning_rate": 2.506349418526935e-06,
"loss": 0.5284,
"step": 8250
},
{
"epoch": 27.00768253968254,
"grad_norm": 5.09862756729126,
"learning_rate": 2.497994920465179e-06,
"loss": 0.4613,
"step": 8275
},
{
"epoch": 27.00926984126984,
"grad_norm": 6.283570766448975,
"learning_rate": 2.489640422403422e-06,
"loss": 0.4374,
"step": 8300
},
{
"epoch": 27.010857142857144,
"grad_norm": 5.45609712600708,
"learning_rate": 2.4812859243416656e-06,
"loss": 0.4615,
"step": 8325
},
{
"epoch": 27.012444444444444,
"grad_norm": 9.621217727661133,
"learning_rate": 2.472931426279909e-06,
"loss": 0.4665,
"step": 8350
},
{
"epoch": 27.014031746031748,
"grad_norm": 10.336989402770996,
"learning_rate": 2.4645769282181527e-06,
"loss": 0.4712,
"step": 8375
},
{
"epoch": 27.015619047619047,
"grad_norm": 8.53022289276123,
"learning_rate": 2.4562224301563963e-06,
"loss": 0.4869,
"step": 8400
},
{
"epoch": 27.01720634920635,
"grad_norm": 6.758102893829346,
"learning_rate": 2.44786793209464e-06,
"loss": 0.5007,
"step": 8425
},
{
"epoch": 27.01879365079365,
"grad_norm": 6.737295627593994,
"learning_rate": 2.4395134340328834e-06,
"loss": 0.5,
"step": 8450
},
{
"epoch": 28.00120634920635,
"grad_norm": 8.9446439743042,
"learning_rate": 2.431158935971127e-06,
"loss": 0.5098,
"step": 8475
},
{
"epoch": 28.002793650793652,
"grad_norm": 4.6513190269470215,
"learning_rate": 2.4228044379093706e-06,
"loss": 0.5249,
"step": 8500
},
{
"epoch": 28.004380952380952,
"grad_norm": 7.930838108062744,
"learning_rate": 2.414449939847614e-06,
"loss": 0.4823,
"step": 8525
},
{
"epoch": 28.005968253968256,
"grad_norm": 5.986405372619629,
"learning_rate": 2.4060954417858577e-06,
"loss": 0.5295,
"step": 8550
},
{
"epoch": 28.007555555555555,
"grad_norm": 6.348638534545898,
"learning_rate": 2.3977409437241013e-06,
"loss": 0.4608,
"step": 8575
},
{
"epoch": 28.00914285714286,
"grad_norm": 5.640425205230713,
"learning_rate": 2.389386445662345e-06,
"loss": 0.4183,
"step": 8600
},
{
"epoch": 28.01073015873016,
"grad_norm": 7.974732875823975,
"learning_rate": 2.3810319476005884e-06,
"loss": 0.4716,
"step": 8625
},
{
"epoch": 28.012317460317462,
"grad_norm": 4.698752403259277,
"learning_rate": 2.372677449538832e-06,
"loss": 0.4637,
"step": 8650
},
{
"epoch": 28.01390476190476,
"grad_norm": 4.2253828048706055,
"learning_rate": 2.3643229514770755e-06,
"loss": 0.4589,
"step": 8675
},
{
"epoch": 28.015492063492065,
"grad_norm": 7.007496356964111,
"learning_rate": 2.355968453415319e-06,
"loss": 0.4978,
"step": 8700
},
{
"epoch": 28.017079365079365,
"grad_norm": 4.830111026763916,
"learning_rate": 2.3476139553535627e-06,
"loss": 0.4873,
"step": 8725
},
{
"epoch": 28.018666666666668,
"grad_norm": 3.9254467487335205,
"learning_rate": 2.3392594572918062e-06,
"loss": 0.4994,
"step": 8750
},
{
"epoch": 29.001079365079367,
"grad_norm": 6.090777397155762,
"learning_rate": 2.33090495923005e-06,
"loss": 0.4761,
"step": 8775
},
{
"epoch": 29.002666666666666,
"grad_norm": 5.358640670776367,
"learning_rate": 2.322550461168293e-06,
"loss": 0.5431,
"step": 8800
},
{
"epoch": 29.00425396825397,
"grad_norm": 9.447075843811035,
"learning_rate": 2.3141959631065365e-06,
"loss": 0.4746,
"step": 8825
},
{
"epoch": 29.00584126984127,
"grad_norm": 5.390321731567383,
"learning_rate": 2.30584146504478e-06,
"loss": 0.5236,
"step": 8850
},
{
"epoch": 29.007428571428573,
"grad_norm": 4.194957256317139,
"learning_rate": 2.2974869669830236e-06,
"loss": 0.4683,
"step": 8875
},
{
"epoch": 29.009015873015873,
"grad_norm": 10.377429008483887,
"learning_rate": 2.289132468921267e-06,
"loss": 0.4122,
"step": 8900
},
{
"epoch": 29.010603174603176,
"grad_norm": 4.972590923309326,
"learning_rate": 2.2807779708595108e-06,
"loss": 0.4557,
"step": 8925
},
{
"epoch": 29.012190476190476,
"grad_norm": 4.772759437561035,
"learning_rate": 2.2724234727977548e-06,
"loss": 0.4732,
"step": 8950
},
{
"epoch": 29.01377777777778,
"grad_norm": 19.494970321655273,
"learning_rate": 2.2640689747359983e-06,
"loss": 0.4653,
"step": 8975
},
{
"epoch": 29.01536507936508,
"grad_norm": 6.1877593994140625,
"learning_rate": 2.255714476674242e-06,
"loss": 0.4901,
"step": 9000
},
{
"epoch": 29.016952380952382,
"grad_norm": 5.228841781616211,
"learning_rate": 2.2473599786124854e-06,
"loss": 0.4841,
"step": 9025
},
{
"epoch": 29.018539682539682,
"grad_norm": 5.32314395904541,
"learning_rate": 2.2390054805507286e-06,
"loss": 0.4902,
"step": 9050
},
{
"epoch": 30.00095238095238,
"grad_norm": 6.952610015869141,
"learning_rate": 2.230650982488972e-06,
"loss": 0.4803,
"step": 9075
},
{
"epoch": 30.002539682539684,
"grad_norm": 4.230266571044922,
"learning_rate": 2.2222964844272157e-06,
"loss": 0.5235,
"step": 9100
},
{
"epoch": 30.004126984126984,
"grad_norm": 7.016523361206055,
"learning_rate": 2.2139419863654593e-06,
"loss": 0.4873,
"step": 9125
},
{
"epoch": 30.005714285714287,
"grad_norm": 10.13500690460205,
"learning_rate": 2.205587488303703e-06,
"loss": 0.5262,
"step": 9150
},
{
"epoch": 30.007301587301587,
"grad_norm": 7.627212047576904,
"learning_rate": 2.1972329902419464e-06,
"loss": 0.4619,
"step": 9175
},
{
"epoch": 30.00888888888889,
"grad_norm": 7.077376365661621,
"learning_rate": 2.18887849218019e-06,
"loss": 0.4011,
"step": 9200
},
{
"epoch": 30.01047619047619,
"grad_norm": 7.501957416534424,
"learning_rate": 2.1805239941184336e-06,
"loss": 0.455,
"step": 9225
},
{
"epoch": 30.012063492063493,
"grad_norm": 6.617973327636719,
"learning_rate": 2.172169496056677e-06,
"loss": 0.4814,
"step": 9250
},
{
"epoch": 30.013650793650793,
"grad_norm": 3.885499954223633,
"learning_rate": 2.1638149979949207e-06,
"loss": 0.4523,
"step": 9275
},
{
"epoch": 30.015238095238097,
"grad_norm": 5.5597615242004395,
"learning_rate": 2.1554604999331642e-06,
"loss": 0.4768,
"step": 9300
},
{
"epoch": 30.016825396825396,
"grad_norm": 9.792261123657227,
"learning_rate": 2.147106001871408e-06,
"loss": 0.49,
"step": 9325
},
{
"epoch": 30.0184126984127,
"grad_norm": 5.992704391479492,
"learning_rate": 2.1387515038096514e-06,
"loss": 0.498,
"step": 9350
},
{
"epoch": 31.000825396825398,
"grad_norm": 8.464439392089844,
"learning_rate": 2.130397005747895e-06,
"loss": 0.4518,
"step": 9375
},
{
"epoch": 31.002412698412698,
"grad_norm": 3.486860990524292,
"learning_rate": 2.1220425076861385e-06,
"loss": 0.5318,
"step": 9400
},
{
"epoch": 31.004,
"grad_norm": 4.426388740539551,
"learning_rate": 2.113688009624382e-06,
"loss": 0.4917,
"step": 9425
},
{
"epoch": 31.0055873015873,
"grad_norm": 8.08337116241455,
"learning_rate": 2.1053335115626256e-06,
"loss": 0.5093,
"step": 9450
},
{
"epoch": 31.007174603174604,
"grad_norm": 3.963824987411499,
"learning_rate": 2.096979013500869e-06,
"loss": 0.4515,
"step": 9475
},
{
"epoch": 31.008761904761904,
"grad_norm": 7.304539203643799,
"learning_rate": 2.0886245154391128e-06,
"loss": 0.4196,
"step": 9500
},
{
"epoch": 31.010349206349208,
"grad_norm": 4.731977939605713,
"learning_rate": 2.080270017377356e-06,
"loss": 0.4529,
"step": 9525
},
{
"epoch": 31.011936507936507,
"grad_norm": 8.285253524780273,
"learning_rate": 2.0719155193155995e-06,
"loss": 0.4653,
"step": 9550
},
{
"epoch": 31.01352380952381,
"grad_norm": 8.305194854736328,
"learning_rate": 2.063561021253843e-06,
"loss": 0.4624,
"step": 9575
},
{
"epoch": 31.01511111111111,
"grad_norm": 13.913382530212402,
"learning_rate": 2.0552065231920866e-06,
"loss": 0.4676,
"step": 9600
},
{
"epoch": 31.016698412698414,
"grad_norm": 6.448155403137207,
"learning_rate": 2.04685202513033e-06,
"loss": 0.476,
"step": 9625
},
{
"epoch": 31.018285714285714,
"grad_norm": 7.706886291503906,
"learning_rate": 2.0384975270685737e-06,
"loss": 0.4967,
"step": 9650
},
{
"epoch": 32.00069841269841,
"grad_norm": 4.588306427001953,
"learning_rate": 2.0301430290068173e-06,
"loss": 0.4358,
"step": 9675
},
{
"epoch": 32.00228571428571,
"grad_norm": 4.243907451629639,
"learning_rate": 2.021788530945061e-06,
"loss": 0.5546,
"step": 9700
},
{
"epoch": 32.00387301587302,
"grad_norm": 6.786617755889893,
"learning_rate": 2.0134340328833044e-06,
"loss": 0.4775,
"step": 9725
},
{
"epoch": 32.00546031746032,
"grad_norm": 9.617806434631348,
"learning_rate": 2.005079534821548e-06,
"loss": 0.4857,
"step": 9750
},
{
"epoch": 32.00704761904762,
"grad_norm": 4.088709354400635,
"learning_rate": 1.9967250367597916e-06,
"loss": 0.4872,
"step": 9775
},
{
"epoch": 32.00863492063492,
"grad_norm": 7.801070690155029,
"learning_rate": 1.988370538698035e-06,
"loss": 0.3978,
"step": 9800
},
{
"epoch": 32.010222222222225,
"grad_norm": 12.151320457458496,
"learning_rate": 1.9800160406362787e-06,
"loss": 0.4567,
"step": 9825
},
{
"epoch": 32.011809523809525,
"grad_norm": 6.325204372406006,
"learning_rate": 1.9716615425745223e-06,
"loss": 0.4568,
"step": 9850
},
{
"epoch": 32.013396825396825,
"grad_norm": 4.849487781524658,
"learning_rate": 1.963307044512766e-06,
"loss": 0.4612,
"step": 9875
},
{
"epoch": 32.014984126984125,
"grad_norm": 6.0611090660095215,
"learning_rate": 1.9549525464510094e-06,
"loss": 0.4492,
"step": 9900
},
{
"epoch": 32.01657142857143,
"grad_norm": 8.705154418945312,
"learning_rate": 1.946598048389253e-06,
"loss": 0.4823,
"step": 9925
},
{
"epoch": 32.01815873015873,
"grad_norm": 6.838711261749268,
"learning_rate": 1.9382435503274965e-06,
"loss": 0.4981,
"step": 9950
},
{
"epoch": 33.000571428571426,
"grad_norm": 6.763596057891846,
"learning_rate": 1.92988905226574e-06,
"loss": 0.4369,
"step": 9975
},
{
"epoch": 33.00215873015873,
"grad_norm": 5.268820285797119,
"learning_rate": 1.9215345542039837e-06,
"loss": 0.5427,
"step": 10000
},
{
"epoch": 33.00374603174603,
"grad_norm": 3.5916361808776855,
"learning_rate": 1.913180056142227e-06,
"loss": 0.4798,
"step": 10025
},
{
"epoch": 33.00533333333333,
"grad_norm": 4.288261890411377,
"learning_rate": 1.9048255580804706e-06,
"loss": 0.488,
"step": 10050
},
{
"epoch": 33.00692063492063,
"grad_norm": 5.668625354766846,
"learning_rate": 1.8964710600187142e-06,
"loss": 0.5008,
"step": 10075
},
{
"epoch": 33.00850793650794,
"grad_norm": 4.973710536956787,
"learning_rate": 1.8881165619569577e-06,
"loss": 0.391,
"step": 10100
},
{
"epoch": 33.01009523809524,
"grad_norm": 7.073776721954346,
"learning_rate": 1.8797620638952013e-06,
"loss": 0.444,
"step": 10125
},
{
"epoch": 33.01168253968254,
"grad_norm": 4.297868251800537,
"learning_rate": 1.8714075658334449e-06,
"loss": 0.4516,
"step": 10150
},
{
"epoch": 33.01326984126984,
"grad_norm": 25.770071029663086,
"learning_rate": 1.8630530677716884e-06,
"loss": 0.4746,
"step": 10175
},
{
"epoch": 33.014857142857146,
"grad_norm": 3.6950442790985107,
"learning_rate": 1.854698569709932e-06,
"loss": 0.4346,
"step": 10200
},
{
"epoch": 33.016444444444446,
"grad_norm": 8.81914234161377,
"learning_rate": 1.8463440716481756e-06,
"loss": 0.4794,
"step": 10225
},
{
"epoch": 33.018031746031745,
"grad_norm": 6.376156806945801,
"learning_rate": 1.837989573586419e-06,
"loss": 0.5085,
"step": 10250
},
{
"epoch": 34.00044444444445,
"grad_norm": 4.302385330200195,
"learning_rate": 1.8296350755246625e-06,
"loss": 0.4276,
"step": 10275
},
{
"epoch": 34.00203174603175,
"grad_norm": 8.226256370544434,
"learning_rate": 1.821280577462906e-06,
"loss": 0.5405,
"step": 10300
},
{
"epoch": 34.00361904761905,
"grad_norm": 12.093836784362793,
"learning_rate": 1.8129260794011496e-06,
"loss": 0.4661,
"step": 10325
},
{
"epoch": 34.00520634920635,
"grad_norm": 11.840346336364746,
"learning_rate": 1.8045715813393932e-06,
"loss": 0.4779,
"step": 10350
},
{
"epoch": 34.006793650793654,
"grad_norm": 5.186977386474609,
"learning_rate": 1.7962170832776367e-06,
"loss": 0.5152,
"step": 10375
},
{
"epoch": 34.00838095238095,
"grad_norm": 4.393645286560059,
"learning_rate": 1.7878625852158805e-06,
"loss": 0.3996,
"step": 10400
},
{
"epoch": 34.00996825396825,
"grad_norm": 11.108858108520508,
"learning_rate": 1.779508087154124e-06,
"loss": 0.4351,
"step": 10425
},
{
"epoch": 34.01155555555555,
"grad_norm": 6.400074005126953,
"learning_rate": 1.7711535890923676e-06,
"loss": 0.4461,
"step": 10450
},
{
"epoch": 34.01314285714286,
"grad_norm": 11.155898094177246,
"learning_rate": 1.7627990910306112e-06,
"loss": 0.4503,
"step": 10475
},
{
"epoch": 34.01473015873016,
"grad_norm": 5.40648889541626,
"learning_rate": 1.7544445929688544e-06,
"loss": 0.4552,
"step": 10500
},
{
"epoch": 34.01631746031746,
"grad_norm": 4.550622463226318,
"learning_rate": 1.746090094907098e-06,
"loss": 0.4701,
"step": 10525
},
{
"epoch": 34.01790476190476,
"grad_norm": 7.3100433349609375,
"learning_rate": 1.7377355968453417e-06,
"loss": 0.4936,
"step": 10550
},
{
"epoch": 35.00031746031746,
"grad_norm": 6.450623035430908,
"learning_rate": 1.7293810987835853e-06,
"loss": 0.449,
"step": 10575
},
{
"epoch": 35.00190476190476,
"grad_norm": 9.00794792175293,
"learning_rate": 1.7210266007218288e-06,
"loss": 0.5011,
"step": 10600
},
{
"epoch": 35.00349206349206,
"grad_norm": 10.713994026184082,
"learning_rate": 1.7126721026600724e-06,
"loss": 0.4829,
"step": 10625
},
{
"epoch": 35.00507936507937,
"grad_norm": 4.488622188568115,
"learning_rate": 1.704317604598316e-06,
"loss": 0.4795,
"step": 10650
},
{
"epoch": 35.00666666666667,
"grad_norm": 16.104774475097656,
"learning_rate": 1.6959631065365595e-06,
"loss": 0.5035,
"step": 10675
},
{
"epoch": 35.00825396825397,
"grad_norm": 4.884101390838623,
"learning_rate": 1.687608608474803e-06,
"loss": 0.4168,
"step": 10700
},
{
"epoch": 35.00984126984127,
"grad_norm": 5.478789806365967,
"learning_rate": 1.6792541104130467e-06,
"loss": 0.4258,
"step": 10725
},
{
"epoch": 35.011428571428574,
"grad_norm": 6.428930282592773,
"learning_rate": 1.67089961235129e-06,
"loss": 0.4392,
"step": 10750
},
{
"epoch": 35.013015873015874,
"grad_norm": 2.7530977725982666,
"learning_rate": 1.6625451142895336e-06,
"loss": 0.451,
"step": 10775
},
{
"epoch": 35.014603174603174,
"grad_norm": 5.9829912185668945,
"learning_rate": 1.6541906162277771e-06,
"loss": 0.4586,
"step": 10800
},
{
"epoch": 35.016190476190474,
"grad_norm": 6.039813995361328,
"learning_rate": 1.6458361181660207e-06,
"loss": 0.4667,
"step": 10825
},
{
"epoch": 35.01777777777778,
"grad_norm": 6.336811065673828,
"learning_rate": 1.6374816201042643e-06,
"loss": 0.482,
"step": 10850
},
{
"epoch": 36.000190476190475,
"grad_norm": 6.172911643981934,
"learning_rate": 1.6291271220425078e-06,
"loss": 0.4651,
"step": 10875
},
{
"epoch": 36.001777777777775,
"grad_norm": 4.215289115905762,
"learning_rate": 1.6207726239807514e-06,
"loss": 0.4776,
"step": 10900
},
{
"epoch": 36.00336507936508,
"grad_norm": 2.862426519393921,
"learning_rate": 1.612418125918995e-06,
"loss": 0.5059,
"step": 10925
},
{
"epoch": 36.00495238095238,
"grad_norm": 4.817645072937012,
"learning_rate": 1.6040636278572385e-06,
"loss": 0.4777,
"step": 10950
},
{
"epoch": 36.00653968253968,
"grad_norm": 8.073090553283691,
"learning_rate": 1.595709129795482e-06,
"loss": 0.4837,
"step": 10975
},
{
"epoch": 36.00812698412698,
"grad_norm": 6.108732223510742,
"learning_rate": 1.5873546317337255e-06,
"loss": 0.4149,
"step": 11000
},
{
"epoch": 36.00971428571429,
"grad_norm": 4.37070369720459,
"learning_rate": 1.579000133671969e-06,
"loss": 0.4323,
"step": 11025
},
{
"epoch": 36.01130158730159,
"grad_norm": 3.8069772720336914,
"learning_rate": 1.5706456356102126e-06,
"loss": 0.4488,
"step": 11050
},
{
"epoch": 36.01288888888889,
"grad_norm": 4.619093894958496,
"learning_rate": 1.5622911375484562e-06,
"loss": 0.4551,
"step": 11075
},
{
"epoch": 36.01447619047619,
"grad_norm": 5.150590419769287,
"learning_rate": 1.5539366394866997e-06,
"loss": 0.4401,
"step": 11100
},
{
"epoch": 36.016063492063495,
"grad_norm": 6.264833450317383,
"learning_rate": 1.5455821414249433e-06,
"loss": 0.4642,
"step": 11125
},
{
"epoch": 36.017650793650795,
"grad_norm": 7.851002216339111,
"learning_rate": 1.5372276433631869e-06,
"loss": 0.4791,
"step": 11150
},
{
"epoch": 37.00006349206349,
"grad_norm": 7.941956996917725,
"learning_rate": 1.5288731453014304e-06,
"loss": 0.4488,
"step": 11175
},
{
"epoch": 37.001650793650796,
"grad_norm": 6.259990692138672,
"learning_rate": 1.520518647239674e-06,
"loss": 0.4967,
"step": 11200
},
{
"epoch": 37.003238095238096,
"grad_norm": 2.6891424655914307,
"learning_rate": 1.5121641491779175e-06,
"loss": 0.501,
"step": 11225
},
{
"epoch": 37.004825396825396,
"grad_norm": 7.098905563354492,
"learning_rate": 1.503809651116161e-06,
"loss": 0.4755,
"step": 11250
},
{
"epoch": 37.006412698412696,
"grad_norm": 5.689685344696045,
"learning_rate": 1.4954551530544045e-06,
"loss": 0.4868,
"step": 11275
},
{
"epoch": 37.008,
"grad_norm": 6.782131195068359,
"learning_rate": 1.487100654992648e-06,
"loss": 0.4057,
"step": 11300
},
{
"epoch": 37.0095873015873,
"grad_norm": 7.18269157409668,
"learning_rate": 1.4787461569308916e-06,
"loss": 0.4281,
"step": 11325
},
{
"epoch": 37.0111746031746,
"grad_norm": 6.619096755981445,
"learning_rate": 1.4703916588691352e-06,
"loss": 0.4398,
"step": 11350
},
{
"epoch": 37.0127619047619,
"grad_norm": 4.206869602203369,
"learning_rate": 1.462037160807379e-06,
"loss": 0.4394,
"step": 11375
},
{
"epoch": 37.01434920634921,
"grad_norm": 7.179015636444092,
"learning_rate": 1.4536826627456225e-06,
"loss": 0.4515,
"step": 11400
},
{
"epoch": 37.01593650793651,
"grad_norm": 5.137106895446777,
"learning_rate": 1.445328164683866e-06,
"loss": 0.4704,
"step": 11425
},
{
"epoch": 37.01752380952381,
"grad_norm": 4.871583461761475,
"learning_rate": 1.4369736666221096e-06,
"loss": 0.4704,
"step": 11450
},
{
"epoch": 37.01911111111111,
"grad_norm": 3.7863287925720215,
"learning_rate": 1.4286191685603528e-06,
"loss": 0.4518,
"step": 11475
},
{
"epoch": 38.00152380952381,
"grad_norm": 11.65233039855957,
"learning_rate": 1.4202646704985963e-06,
"loss": 0.5093,
"step": 11500
},
{
"epoch": 38.00311111111111,
"grad_norm": 9.356032371520996,
"learning_rate": 1.4119101724368401e-06,
"loss": 0.4997,
"step": 11525
},
{
"epoch": 38.00469841269841,
"grad_norm": 5.056600093841553,
"learning_rate": 1.4035556743750837e-06,
"loss": 0.4455,
"step": 11550
},
{
"epoch": 38.00628571428572,
"grad_norm": 4.728214740753174,
"learning_rate": 1.3952011763133273e-06,
"loss": 0.4935,
"step": 11575
},
{
"epoch": 38.00787301587302,
"grad_norm": 6.161364555358887,
"learning_rate": 1.3868466782515708e-06,
"loss": 0.4153,
"step": 11600
},
{
"epoch": 38.00946031746032,
"grad_norm": 10.345149993896484,
"learning_rate": 1.3784921801898144e-06,
"loss": 0.4206,
"step": 11625
},
{
"epoch": 38.011047619047616,
"grad_norm": 3.4351565837860107,
"learning_rate": 1.370137682128058e-06,
"loss": 0.434,
"step": 11650
},
{
"epoch": 38.01263492063492,
"grad_norm": 6.009977340698242,
"learning_rate": 1.3617831840663015e-06,
"loss": 0.448,
"step": 11675
},
{
"epoch": 38.01422222222222,
"grad_norm": 5.062911510467529,
"learning_rate": 1.353428686004545e-06,
"loss": 0.4399,
"step": 11700
},
{
"epoch": 38.01580952380952,
"grad_norm": 5.664973735809326,
"learning_rate": 1.3450741879427884e-06,
"loss": 0.4674,
"step": 11725
},
{
"epoch": 38.01739682539682,
"grad_norm": 7.776061534881592,
"learning_rate": 1.336719689881032e-06,
"loss": 0.481,
"step": 11750
},
{
"epoch": 38.01898412698413,
"grad_norm": 4.977148532867432,
"learning_rate": 1.3283651918192756e-06,
"loss": 0.446,
"step": 11775
},
{
"epoch": 39.001396825396824,
"grad_norm": 5.45735502243042,
"learning_rate": 1.3200106937575191e-06,
"loss": 0.4955,
"step": 11800
},
{
"epoch": 39.002984126984124,
"grad_norm": 5.3033881187438965,
"learning_rate": 1.3116561956957627e-06,
"loss": 0.5065,
"step": 11825
},
{
"epoch": 39.00457142857143,
"grad_norm": 5.209838390350342,
"learning_rate": 1.3033016976340063e-06,
"loss": 0.4283,
"step": 11850
},
{
"epoch": 39.00615873015873,
"grad_norm": 9.413320541381836,
"learning_rate": 1.2949471995722498e-06,
"loss": 0.5134,
"step": 11875
},
{
"epoch": 39.00774603174603,
"grad_norm": 3.614576816558838,
"learning_rate": 1.2865927015104934e-06,
"loss": 0.4132,
"step": 11900
},
{
"epoch": 39.00933333333333,
"grad_norm": 5.418633937835693,
"learning_rate": 1.278238203448737e-06,
"loss": 0.411,
"step": 11925
},
{
"epoch": 39.01092063492064,
"grad_norm": 7.182598114013672,
"learning_rate": 1.2698837053869805e-06,
"loss": 0.4367,
"step": 11950
},
{
"epoch": 39.01250793650794,
"grad_norm": 5.821928024291992,
"learning_rate": 1.2615292073252239e-06,
"loss": 0.4313,
"step": 11975
},
{
"epoch": 39.01409523809524,
"grad_norm": 5.386777877807617,
"learning_rate": 1.2531747092634675e-06,
"loss": 0.4423,
"step": 12000
},
{
"epoch": 39.01568253968254,
"grad_norm": 7.081798553466797,
"learning_rate": 1.244820211201711e-06,
"loss": 0.4637,
"step": 12025
},
{
"epoch": 39.017269841269844,
"grad_norm": 6.970532417297363,
"learning_rate": 1.2364657131399546e-06,
"loss": 0.4641,
"step": 12050
},
{
"epoch": 39.018857142857144,
"grad_norm": 10.821274757385254,
"learning_rate": 1.2281112150781982e-06,
"loss": 0.4712,
"step": 12075
},
{
"epoch": 40.00126984126984,
"grad_norm": 7.221808910369873,
"learning_rate": 1.2197567170164417e-06,
"loss": 0.4856,
"step": 12100
},
{
"epoch": 40.002857142857145,
"grad_norm": 15.445313453674316,
"learning_rate": 1.2114022189546853e-06,
"loss": 0.5051,
"step": 12125
},
{
"epoch": 40.004444444444445,
"grad_norm": 5.132606029510498,
"learning_rate": 1.2030477208929288e-06,
"loss": 0.4254,
"step": 12150
},
{
"epoch": 40.006031746031745,
"grad_norm": 3.0163042545318604,
"learning_rate": 1.1946932228311724e-06,
"loss": 0.4991,
"step": 12175
},
{
"epoch": 40.007619047619045,
"grad_norm": 4.396322250366211,
"learning_rate": 1.186338724769416e-06,
"loss": 0.4394,
"step": 12200
},
{
"epoch": 40.00920634920635,
"grad_norm": 9.984151840209961,
"learning_rate": 1.1779842267076595e-06,
"loss": 0.4004,
"step": 12225
},
{
"epoch": 40.01079365079365,
"grad_norm": 5.9347825050354,
"learning_rate": 1.1696297286459031e-06,
"loss": 0.4373,
"step": 12250
},
{
"epoch": 40.01238095238095,
"grad_norm": 5.0575852394104,
"learning_rate": 1.1612752305841465e-06,
"loss": 0.4321,
"step": 12275
},
{
"epoch": 40.01396825396825,
"grad_norm": 6.324869155883789,
"learning_rate": 1.15292073252239e-06,
"loss": 0.4353,
"step": 12300
},
{
"epoch": 40.01555555555556,
"grad_norm": 6.5207414627075195,
"learning_rate": 1.1445662344606336e-06,
"loss": 0.4675,
"step": 12325
},
{
"epoch": 40.01714285714286,
"grad_norm": 6.220884799957275,
"learning_rate": 1.1362117363988774e-06,
"loss": 0.4611,
"step": 12350
},
{
"epoch": 40.01873015873016,
"grad_norm": 10.964550018310547,
"learning_rate": 1.127857238337121e-06,
"loss": 0.4673,
"step": 12375
},
{
"epoch": 41.00114285714286,
"grad_norm": 6.545460224151611,
"learning_rate": 1.1195027402753643e-06,
"loss": 0.4593,
"step": 12400
},
{
"epoch": 41.00273015873016,
"grad_norm": 6.692239761352539,
"learning_rate": 1.1111482422136079e-06,
"loss": 0.512,
"step": 12425
},
{
"epoch": 41.00431746031746,
"grad_norm": 7.213928699493408,
"learning_rate": 1.1027937441518514e-06,
"loss": 0.4477,
"step": 12450
},
{
"epoch": 41.00590476190476,
"grad_norm": 4.5662431716918945,
"learning_rate": 1.094439246090095e-06,
"loss": 0.4916,
"step": 12475
},
{
"epoch": 41.007492063492066,
"grad_norm": 4.408071041107178,
"learning_rate": 1.0860847480283386e-06,
"loss": 0.4456,
"step": 12500
},
{
"epoch": 41.009079365079366,
"grad_norm": 5.65850830078125,
"learning_rate": 1.0777302499665821e-06,
"loss": 0.3866,
"step": 12525
},
{
"epoch": 41.010666666666665,
"grad_norm": 5.419500827789307,
"learning_rate": 1.0693757519048257e-06,
"loss": 0.4345,
"step": 12550
},
{
"epoch": 41.012253968253965,
"grad_norm": 4.399853229522705,
"learning_rate": 1.0610212538430693e-06,
"loss": 0.4444,
"step": 12575
},
{
"epoch": 41.01384126984127,
"grad_norm": 6.50507116317749,
"learning_rate": 1.0526667557813128e-06,
"loss": 0.4385,
"step": 12600
},
{
"epoch": 41.01542857142857,
"grad_norm": 2.587691068649292,
"learning_rate": 1.0443122577195564e-06,
"loss": 0.4558,
"step": 12625
},
{
"epoch": 41.01701587301587,
"grad_norm": 4.828845977783203,
"learning_rate": 1.0359577596577997e-06,
"loss": 0.4655,
"step": 12650
},
{
"epoch": 41.01860317460317,
"grad_norm": 9.805785179138184,
"learning_rate": 1.0276032615960433e-06,
"loss": 0.4688,
"step": 12675
},
{
"epoch": 42.001015873015874,
"grad_norm": 4.370798587799072,
"learning_rate": 1.0192487635342869e-06,
"loss": 0.4457,
"step": 12700
},
{
"epoch": 42.00260317460317,
"grad_norm": 5.954914093017578,
"learning_rate": 1.0108942654725304e-06,
"loss": 0.5047,
"step": 12725
},
{
"epoch": 42.00419047619047,
"grad_norm": 4.680336952209473,
"learning_rate": 1.002539767410774e-06,
"loss": 0.4536,
"step": 12750
},
{
"epoch": 42.00577777777778,
"grad_norm": 3.612610101699829,
"learning_rate": 9.941852693490176e-07,
"loss": 0.5006,
"step": 12775
},
{
"epoch": 42.00736507936508,
"grad_norm": 4.1838178634643555,
"learning_rate": 9.858307712872611e-07,
"loss": 0.4459,
"step": 12800
},
{
"epoch": 42.00895238095238,
"grad_norm": 5.238033771514893,
"learning_rate": 9.774762732255047e-07,
"loss": 0.3692,
"step": 12825
},
{
"epoch": 42.01053968253968,
"grad_norm": 5.992905139923096,
"learning_rate": 9.691217751637483e-07,
"loss": 0.4471,
"step": 12850
},
{
"epoch": 42.012126984126986,
"grad_norm": 3.9099578857421875,
"learning_rate": 9.607672771019918e-07,
"loss": 0.4505,
"step": 12875
},
{
"epoch": 42.013714285714286,
"grad_norm": 5.2203497886657715,
"learning_rate": 9.524127790402353e-07,
"loss": 0.4285,
"step": 12900
},
{
"epoch": 42.015301587301586,
"grad_norm": 10.484596252441406,
"learning_rate": 9.440582809784789e-07,
"loss": 0.4644,
"step": 12925
},
{
"epoch": 42.016888888888886,
"grad_norm": 9.001386642456055,
"learning_rate": 9.357037829167224e-07,
"loss": 0.4485,
"step": 12950
},
{
"epoch": 42.01847619047619,
"grad_norm": 5.222927570343018,
"learning_rate": 9.27349284854966e-07,
"loss": 0.4722,
"step": 12975
},
{
"epoch": 43.00088888888889,
"grad_norm": 5.298580169677734,
"learning_rate": 9.189947867932095e-07,
"loss": 0.4334,
"step": 13000
},
{
"epoch": 43.00247619047619,
"grad_norm": 7.794508457183838,
"learning_rate": 9.10640288731453e-07,
"loss": 0.5104,
"step": 13025
},
{
"epoch": 43.004063492063494,
"grad_norm": 7.346789360046387,
"learning_rate": 9.022857906696966e-07,
"loss": 0.4567,
"step": 13050
},
{
"epoch": 43.005650793650794,
"grad_norm": 7.656489372253418,
"learning_rate": 8.939312926079403e-07,
"loss": 0.4923,
"step": 13075
},
{
"epoch": 43.007238095238094,
"grad_norm": 4.487580299377441,
"learning_rate": 8.855767945461838e-07,
"loss": 0.4286,
"step": 13100
},
{
"epoch": 43.008825396825394,
"grad_norm": 3.4565789699554443,
"learning_rate": 8.772222964844272e-07,
"loss": 0.3914,
"step": 13125
},
{
"epoch": 43.0104126984127,
"grad_norm": 6.925444602966309,
"learning_rate": 8.688677984226708e-07,
"loss": 0.4391,
"step": 13150
},
{
"epoch": 43.012,
"grad_norm": 5.899662494659424,
"learning_rate": 8.605133003609144e-07,
"loss": 0.4529,
"step": 13175
},
{
"epoch": 43.0135873015873,
"grad_norm": 4.932972431182861,
"learning_rate": 8.52158802299158e-07,
"loss": 0.4289,
"step": 13200
},
{
"epoch": 43.0151746031746,
"grad_norm": 6.803351402282715,
"learning_rate": 8.438043042374015e-07,
"loss": 0.4562,
"step": 13225
},
{
"epoch": 43.01676190476191,
"grad_norm": 4.4438066482543945,
"learning_rate": 8.35449806175645e-07,
"loss": 0.4426,
"step": 13250
},
{
"epoch": 43.01834920634921,
"grad_norm": 9.328689575195312,
"learning_rate": 8.270953081138886e-07,
"loss": 0.4814,
"step": 13275
},
{
"epoch": 44.0007619047619,
"grad_norm": 10.716085433959961,
"learning_rate": 8.187408100521321e-07,
"loss": 0.4272,
"step": 13300
},
{
"epoch": 44.00234920634921,
"grad_norm": 5.453390598297119,
"learning_rate": 8.103863119903757e-07,
"loss": 0.5105,
"step": 13325
},
{
"epoch": 44.00393650793651,
"grad_norm": 8.206409454345703,
"learning_rate": 8.020318139286193e-07,
"loss": 0.4678,
"step": 13350
},
{
"epoch": 44.00552380952381,
"grad_norm": 9.60810661315918,
"learning_rate": 7.936773158668627e-07,
"loss": 0.4715,
"step": 13375
},
{
"epoch": 44.00711111111111,
"grad_norm": 4.23331880569458,
"learning_rate": 7.853228178051063e-07,
"loss": 0.4476,
"step": 13400
},
{
"epoch": 44.008698412698415,
"grad_norm": 4.390725135803223,
"learning_rate": 7.769683197433499e-07,
"loss": 0.3836,
"step": 13425
},
{
"epoch": 44.010285714285715,
"grad_norm": 8.254178047180176,
"learning_rate": 7.686138216815934e-07,
"loss": 0.441,
"step": 13450
},
{
"epoch": 44.011873015873014,
"grad_norm": 9.834754943847656,
"learning_rate": 7.60259323619837e-07,
"loss": 0.4474,
"step": 13475
},
{
"epoch": 44.013460317460314,
"grad_norm": 5.189121723175049,
"learning_rate": 7.519048255580805e-07,
"loss": 0.4259,
"step": 13500
},
{
"epoch": 44.01504761904762,
"grad_norm": 8.684473037719727,
"learning_rate": 7.43550327496324e-07,
"loss": 0.4342,
"step": 13525
},
{
"epoch": 44.01663492063492,
"grad_norm": 9.289813041687012,
"learning_rate": 7.351958294345676e-07,
"loss": 0.4599,
"step": 13550
},
{
"epoch": 44.01822222222222,
"grad_norm": 4.835540294647217,
"learning_rate": 7.268413313728113e-07,
"loss": 0.4765,
"step": 13575
},
{
"epoch": 45.00063492063492,
"grad_norm": 15.820073127746582,
"learning_rate": 7.184868333110548e-07,
"loss": 0.429,
"step": 13600
},
{
"epoch": 45.00222222222222,
"grad_norm": 7.332337856292725,
"learning_rate": 7.101323352492982e-07,
"loss": 0.5291,
"step": 13625
},
{
"epoch": 45.00380952380952,
"grad_norm": 9.775607109069824,
"learning_rate": 7.017778371875418e-07,
"loss": 0.4441,
"step": 13650
},
{
"epoch": 45.00539682539682,
"grad_norm": 6.92523717880249,
"learning_rate": 6.934233391257854e-07,
"loss": 0.4656,
"step": 13675
},
{
"epoch": 45.00698412698413,
"grad_norm": 4.137689590454102,
"learning_rate": 6.85068841064029e-07,
"loss": 0.4757,
"step": 13700
},
{
"epoch": 45.00857142857143,
"grad_norm": 6.4185638427734375,
"learning_rate": 6.767143430022725e-07,
"loss": 0.3757,
"step": 13725
},
{
"epoch": 45.01015873015873,
"grad_norm": 6.079351425170898,
"learning_rate": 6.68359844940516e-07,
"loss": 0.431,
"step": 13750
},
{
"epoch": 45.01174603174603,
"grad_norm": 4.2088165283203125,
"learning_rate": 6.600053468787596e-07,
"loss": 0.4354,
"step": 13775
},
{
"epoch": 45.013333333333335,
"grad_norm": 3.3986575603485107,
"learning_rate": 6.516508488170031e-07,
"loss": 0.4486,
"step": 13800
},
{
"epoch": 45.014920634920635,
"grad_norm": 5.290012359619141,
"learning_rate": 6.432963507552467e-07,
"loss": 0.4227,
"step": 13825
},
{
"epoch": 45.016507936507935,
"grad_norm": 8.828485488891602,
"learning_rate": 6.349418526934903e-07,
"loss": 0.4662,
"step": 13850
},
{
"epoch": 45.018095238095235,
"grad_norm": 4.196980953216553,
"learning_rate": 6.265873546317337e-07,
"loss": 0.4792,
"step": 13875
},
{
"epoch": 46.00050793650794,
"grad_norm": 8.14965534210205,
"learning_rate": 6.182328565699773e-07,
"loss": 0.4178,
"step": 13900
},
{
"epoch": 46.00209523809524,
"grad_norm": 4.221451282501221,
"learning_rate": 6.098783585082209e-07,
"loss": 0.5127,
"step": 13925
},
{
"epoch": 46.003682539682536,
"grad_norm": 7.106605052947998,
"learning_rate": 6.015238604464644e-07,
"loss": 0.4599,
"step": 13950
},
{
"epoch": 46.00526984126984,
"grad_norm": 11.949932098388672,
"learning_rate": 5.93169362384708e-07,
"loss": 0.4757,
"step": 13975
},
{
"epoch": 46.00685714285714,
"grad_norm": 7.468497276306152,
"learning_rate": 5.848148643229516e-07,
"loss": 0.4786,
"step": 14000
},
{
"epoch": 46.00844444444444,
"grad_norm": 9.329829216003418,
"learning_rate": 5.76460366261195e-07,
"loss": 0.3776,
"step": 14025
},
{
"epoch": 46.01003174603174,
"grad_norm": 5.68237829208374,
"learning_rate": 5.681058681994387e-07,
"loss": 0.419,
"step": 14050
},
{
"epoch": 46.01161904761905,
"grad_norm": 9.558659553527832,
"learning_rate": 5.597513701376821e-07,
"loss": 0.4381,
"step": 14075
},
{
"epoch": 46.01320634920635,
"grad_norm": 4.898390769958496,
"learning_rate": 5.513968720759257e-07,
"loss": 0.4285,
"step": 14100
},
{
"epoch": 46.01479365079365,
"grad_norm": 6.284289360046387,
"learning_rate": 5.430423740141693e-07,
"loss": 0.4426,
"step": 14125
},
{
"epoch": 46.01638095238095,
"grad_norm": 7.268927574157715,
"learning_rate": 5.346878759524128e-07,
"loss": 0.4454,
"step": 14150
},
{
"epoch": 46.017968253968256,
"grad_norm": 8.74441146850586,
"learning_rate": 5.263333778906564e-07,
"loss": 0.4862,
"step": 14175
},
{
"epoch": 47.00038095238095,
"grad_norm": 5.302139759063721,
"learning_rate": 5.179788798288999e-07,
"loss": 0.4241,
"step": 14200
},
{
"epoch": 47.00196825396825,
"grad_norm": 9.133415222167969,
"learning_rate": 5.096243817671434e-07,
"loss": 0.4938,
"step": 14225
},
{
"epoch": 47.00355555555556,
"grad_norm": 11.927820205688477,
"learning_rate": 5.01269883705387e-07,
"loss": 0.4569,
"step": 14250
},
{
"epoch": 47.00514285714286,
"grad_norm": 6.953431606292725,
"learning_rate": 4.929153856436306e-07,
"loss": 0.4669,
"step": 14275
},
{
"epoch": 47.00673015873016,
"grad_norm": 15.06204891204834,
"learning_rate": 4.845608875818741e-07,
"loss": 0.4956,
"step": 14300
},
{
"epoch": 47.00831746031746,
"grad_norm": 6.762052059173584,
"learning_rate": 4.7620638952011765e-07,
"loss": 0.3959,
"step": 14325
},
{
"epoch": 47.009904761904764,
"grad_norm": 5.515415668487549,
"learning_rate": 4.678518914583612e-07,
"loss": 0.4077,
"step": 14350
},
{
"epoch": 47.011492063492064,
"grad_norm": 5.323752403259277,
"learning_rate": 4.594973933966047e-07,
"loss": 0.4282,
"step": 14375
},
{
"epoch": 47.01307936507936,
"grad_norm": 3.4081521034240723,
"learning_rate": 4.511428953348483e-07,
"loss": 0.4318,
"step": 14400
},
{
"epoch": 47.01466666666666,
"grad_norm": 10.325825691223145,
"learning_rate": 4.427883972730919e-07,
"loss": 0.443,
"step": 14425
},
{
"epoch": 47.01625396825397,
"grad_norm": 7.097803592681885,
"learning_rate": 4.344338992113354e-07,
"loss": 0.4469,
"step": 14450
},
{
"epoch": 47.01784126984127,
"grad_norm": 8.34315013885498,
"learning_rate": 4.26079401149579e-07,
"loss": 0.48,
"step": 14475
},
{
"epoch": 48.000253968253965,
"grad_norm": 5.317016124725342,
"learning_rate": 4.177249030878225e-07,
"loss": 0.4391,
"step": 14500
},
{
"epoch": 48.00184126984127,
"grad_norm": 6.396937370300293,
"learning_rate": 4.0937040502606607e-07,
"loss": 0.4688,
"step": 14525
},
{
"epoch": 48.00342857142857,
"grad_norm": 3.8121814727783203,
"learning_rate": 4.0101590696430963e-07,
"loss": 0.4771,
"step": 14550
},
{
"epoch": 48.00501587301587,
"grad_norm": 7.068954944610596,
"learning_rate": 3.9266140890255315e-07,
"loss": 0.4657,
"step": 14575
},
{
"epoch": 48.00660317460317,
"grad_norm": 3.8409268856048584,
"learning_rate": 3.843069108407967e-07,
"loss": 0.4651,
"step": 14600
},
{
"epoch": 48.00819047619048,
"grad_norm": 4.6952714920043945,
"learning_rate": 3.759524127790402e-07,
"loss": 0.4065,
"step": 14625
},
{
"epoch": 48.00977777777778,
"grad_norm": 5.573005199432373,
"learning_rate": 3.675979147172838e-07,
"loss": 0.411,
"step": 14650
},
{
"epoch": 48.01136507936508,
"grad_norm": 7.740847587585449,
"learning_rate": 3.592434166555274e-07,
"loss": 0.4348,
"step": 14675
},
{
"epoch": 48.01295238095238,
"grad_norm": 3.9830234050750732,
"learning_rate": 3.508889185937709e-07,
"loss": 0.437,
"step": 14700
},
{
"epoch": 48.014539682539684,
"grad_norm": 4.826086521148682,
"learning_rate": 3.425344205320145e-07,
"loss": 0.4347,
"step": 14725
},
{
"epoch": 48.016126984126984,
"grad_norm": 7.815319538116455,
"learning_rate": 3.34179922470258e-07,
"loss": 0.4485,
"step": 14750
},
{
"epoch": 48.017714285714284,
"grad_norm": 7.226869583129883,
"learning_rate": 3.2582542440850157e-07,
"loss": 0.4703,
"step": 14775
},
{
"epoch": 49.000126984126986,
"grad_norm": 6.489394187927246,
"learning_rate": 3.1747092634674513e-07,
"loss": 0.4377,
"step": 14800
},
{
"epoch": 49.001714285714286,
"grad_norm": 7.8261399269104,
"learning_rate": 3.0911642828498865e-07,
"loss": 0.4772,
"step": 14825
},
{
"epoch": 49.003301587301586,
"grad_norm": 5.6710004806518555,
"learning_rate": 3.007619302232322e-07,
"loss": 0.4895,
"step": 14850
},
{
"epoch": 49.004888888888885,
"grad_norm": 5.4189252853393555,
"learning_rate": 2.924074321614758e-07,
"loss": 0.4575,
"step": 14875
},
{
"epoch": 49.00647619047619,
"grad_norm": 3.988452434539795,
"learning_rate": 2.8405293409971934e-07,
"loss": 0.4701,
"step": 14900
},
{
"epoch": 49.00806349206349,
"grad_norm": 6.010385990142822,
"learning_rate": 2.7569843603796286e-07,
"loss": 0.4005,
"step": 14925
},
{
"epoch": 49.00965079365079,
"grad_norm": 12.048097610473633,
"learning_rate": 2.673439379762064e-07,
"loss": 0.4209,
"step": 14950
},
{
"epoch": 49.01123809523809,
"grad_norm": 5.6577839851379395,
"learning_rate": 2.5898943991444994e-07,
"loss": 0.4263,
"step": 14975
},
{
"epoch": 49.0128253968254,
"grad_norm": 9.367551803588867,
"learning_rate": 2.506349418526935e-07,
"loss": 0.4412,
"step": 15000
},
{
"epoch": 49.0144126984127,
"grad_norm": 6.728636741638184,
"learning_rate": 2.4228044379093707e-07,
"loss": 0.4234,
"step": 15025
},
{
"epoch": 49.016,
"grad_norm": 6.2957234382629395,
"learning_rate": 2.339259457291806e-07,
"loss": 0.4565,
"step": 15050
},
{
"epoch": 49.0175873015873,
"grad_norm": 10.448657989501953,
"learning_rate": 2.2557144766742415e-07,
"loss": 0.4544,
"step": 15075
},
{
"epoch": 49.019174603174605,
"grad_norm": 6.694546699523926,
"learning_rate": 2.172169496056677e-07,
"loss": 0.4277,
"step": 15100
},
{
"epoch": 50.0015873015873,
"grad_norm": 5.7494096755981445,
"learning_rate": 2.0886245154391125e-07,
"loss": 0.4951,
"step": 15125
},
{
"epoch": 50.00317460317461,
"grad_norm": 6.14343786239624,
"learning_rate": 2.0050795348215482e-07,
"loss": 0.4884,
"step": 15150
},
{
"epoch": 50.00476190476191,
"grad_norm": 5.558969974517822,
"learning_rate": 1.9215345542039836e-07,
"loss": 0.4453,
"step": 15175
},
{
"epoch": 50.006349206349206,
"grad_norm": 12.143143653869629,
"learning_rate": 1.837989573586419e-07,
"loss": 0.4809,
"step": 15200
},
{
"epoch": 50.007936507936506,
"grad_norm": 6.324706554412842,
"learning_rate": 1.7544445929688546e-07,
"loss": 0.4006,
"step": 15225
},
{
"epoch": 50.00952380952381,
"grad_norm": 5.624573230743408,
"learning_rate": 1.67089961235129e-07,
"loss": 0.4116,
"step": 15250
},
{
"epoch": 50.01111111111111,
"grad_norm": 6.506253242492676,
"learning_rate": 1.5873546317337257e-07,
"loss": 0.4286,
"step": 15275
},
{
"epoch": 50.01269841269841,
"grad_norm": 6.208716869354248,
"learning_rate": 1.503809651116161e-07,
"loss": 0.4255,
"step": 15300
},
{
"epoch": 50.01428571428571,
"grad_norm": 3.370025634765625,
"learning_rate": 1.4202646704985967e-07,
"loss": 0.4314,
"step": 15325
},
{
"epoch": 50.01587301587302,
"grad_norm": 10.119969367980957,
"learning_rate": 1.336719689881032e-07,
"loss": 0.4615,
"step": 15350
},
{
"epoch": 50.01746031746032,
"grad_norm": 5.545236110687256,
"learning_rate": 1.2531747092634675e-07,
"loss": 0.4646,
"step": 15375
},
{
"epoch": 50.01904761904762,
"grad_norm": 4.946952819824219,
"learning_rate": 1.169629728645903e-07,
"loss": 0.4344,
"step": 15400
},
{
"epoch": 51.00146031746032,
"grad_norm": 5.004914283752441,
"learning_rate": 1.0860847480283386e-07,
"loss": 0.4824,
"step": 15425
},
{
"epoch": 51.00304761904762,
"grad_norm": 6.189335823059082,
"learning_rate": 1.0025397674107741e-07,
"loss": 0.4991,
"step": 15450
},
{
"epoch": 51.00463492063492,
"grad_norm": 7.598541259765625,
"learning_rate": 9.189947867932095e-08,
"loss": 0.4302,
"step": 15475
},
{
"epoch": 51.00622222222222,
"grad_norm": 5.034823417663574,
"learning_rate": 8.35449806175645e-08,
"loss": 0.4842,
"step": 15500
},
{
"epoch": 51.00780952380953,
"grad_norm": 10.85693359375,
"learning_rate": 7.519048255580805e-08,
"loss": 0.4062,
"step": 15525
},
{
"epoch": 51.00939682539683,
"grad_norm": 5.432575702667236,
"learning_rate": 6.68359844940516e-08,
"loss": 0.4042,
"step": 15550
},
{
"epoch": 51.01098412698413,
"grad_norm": 6.148897171020508,
"learning_rate": 5.848148643229515e-08,
"loss": 0.4301,
"step": 15575
},
{
"epoch": 51.01257142857143,
"grad_norm": 11.638060569763184,
"learning_rate": 5.0126988370538704e-08,
"loss": 0.4318,
"step": 15600
},
{
"epoch": 51.014158730158734,
"grad_norm": 9.534034729003906,
"learning_rate": 4.177249030878225e-08,
"loss": 0.4328,
"step": 15625
},
{
"epoch": 51.01574603174603,
"grad_norm": 7.127634048461914,
"learning_rate": 3.34179922470258e-08,
"loss": 0.4515,
"step": 15650
},
{
"epoch": 51.01733333333333,
"grad_norm": 11.5350923538208,
"learning_rate": 2.5063494185269352e-08,
"loss": 0.472,
"step": 15675
},
{
"epoch": 51.01892063492063,
"grad_norm": 2.837473154067993,
"learning_rate": 1.67089961235129e-08,
"loss": 0.4333,
"step": 15700
}
],
"logging_steps": 25,
"max_steps": 15750,
"num_input_tokens_seen": 0,
"num_train_epochs": 9223372036854775807,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 3.19361437974528e+18,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}