Convertalk / checkpoint-31526 /trainer_state.json
joshualxndrs's picture
Upload folder using huggingface_hub
858a148 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.0,
"eval_steps": 500,
"global_step": 31526,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.003171985028230667,
"grad_norm": 6.952805519104004,
"learning_rate": 4.9949248239548315e-05,
"loss": 7.5095,
"step": 50
},
{
"epoch": 0.006343970056461334,
"grad_norm": 0.7631180882453918,
"learning_rate": 4.9897439150753875e-05,
"loss": 0.581,
"step": 100
},
{
"epoch": 0.009515955084692,
"grad_norm": 0.6402416229248047,
"learning_rate": 4.98445727336167e-05,
"loss": 0.4056,
"step": 150
},
{
"epoch": 0.012687940112922668,
"grad_norm": 0.2075737714767456,
"learning_rate": 4.979170631647952e-05,
"loss": 0.3261,
"step": 200
},
{
"epoch": 0.015859925141153332,
"grad_norm": 0.3269546329975128,
"learning_rate": 4.973883989934234e-05,
"loss": 0.2932,
"step": 250
},
{
"epoch": 0.019031910169384,
"grad_norm": 0.42410802841186523,
"learning_rate": 4.968597348220517e-05,
"loss": 0.3021,
"step": 300
},
{
"epoch": 0.02220389519761467,
"grad_norm": 0.26918208599090576,
"learning_rate": 4.963310706506799e-05,
"loss": 0.3534,
"step": 350
},
{
"epoch": 0.025375880225845335,
"grad_norm": 0.36581048369407654,
"learning_rate": 4.958024064793081e-05,
"loss": 0.3136,
"step": 400
},
{
"epoch": 0.028547865254076002,
"grad_norm": 0.3738636076450348,
"learning_rate": 4.9527374230793634e-05,
"loss": 0.2982,
"step": 450
},
{
"epoch": 0.031719850282306665,
"grad_norm": 0.3204258680343628,
"learning_rate": 4.9474507813656455e-05,
"loss": 0.3183,
"step": 500
},
{
"epoch": 0.034891835310537335,
"grad_norm": 0.24417226016521454,
"learning_rate": 4.942164139651928e-05,
"loss": 0.2906,
"step": 550
},
{
"epoch": 0.038063820338768,
"grad_norm": 0.3362484574317932,
"learning_rate": 4.93687749793821e-05,
"loss": 0.3457,
"step": 600
},
{
"epoch": 0.04123580536699867,
"grad_norm": 0.4605356454849243,
"learning_rate": 4.931590856224492e-05,
"loss": 0.3072,
"step": 650
},
{
"epoch": 0.04440779039522934,
"grad_norm": 0.5630594491958618,
"learning_rate": 4.926304214510775e-05,
"loss": 0.3201,
"step": 700
},
{
"epoch": 0.04757977542346,
"grad_norm": 0.5788861513137817,
"learning_rate": 4.9210175727970564e-05,
"loss": 0.2825,
"step": 750
},
{
"epoch": 0.05075176045169067,
"grad_norm": 0.26267775893211365,
"learning_rate": 4.9157309310833386e-05,
"loss": 0.3336,
"step": 800
},
{
"epoch": 0.053923745479921334,
"grad_norm": 0.34053167700767517,
"learning_rate": 4.9104442893696214e-05,
"loss": 0.314,
"step": 850
},
{
"epoch": 0.057095730508152004,
"grad_norm": 0.3658903241157532,
"learning_rate": 4.9051576476559036e-05,
"loss": 0.335,
"step": 900
},
{
"epoch": 0.06026771553638267,
"grad_norm": 0.5043643712997437,
"learning_rate": 4.899871005942185e-05,
"loss": 0.2956,
"step": 950
},
{
"epoch": 0.06343970056461333,
"grad_norm": 0.4024595320224762,
"learning_rate": 4.894584364228468e-05,
"loss": 0.3452,
"step": 1000
},
{
"epoch": 0.066611685592844,
"grad_norm": 0.19914183020591736,
"learning_rate": 4.88929772251475e-05,
"loss": 0.2788,
"step": 1050
},
{
"epoch": 0.06978367062107467,
"grad_norm": 0.3954038619995117,
"learning_rate": 4.8840110808010316e-05,
"loss": 0.3452,
"step": 1100
},
{
"epoch": 0.07295565564930534,
"grad_norm": 0.2236422300338745,
"learning_rate": 4.8787244390873145e-05,
"loss": 0.292,
"step": 1150
},
{
"epoch": 0.076127640677536,
"grad_norm": 0.3291652202606201,
"learning_rate": 4.873437797373597e-05,
"loss": 0.2955,
"step": 1200
},
{
"epoch": 0.07929962570576667,
"grad_norm": 0.46171942353248596,
"learning_rate": 4.868151155659879e-05,
"loss": 0.3074,
"step": 1250
},
{
"epoch": 0.08247161073399734,
"grad_norm": 0.22759978473186493,
"learning_rate": 4.862864513946161e-05,
"loss": 0.3017,
"step": 1300
},
{
"epoch": 0.085643595762228,
"grad_norm": 0.23650604486465454,
"learning_rate": 4.857577872232443e-05,
"loss": 0.3176,
"step": 1350
},
{
"epoch": 0.08881558079045868,
"grad_norm": 0.5810359120368958,
"learning_rate": 4.8522912305187254e-05,
"loss": 0.3382,
"step": 1400
},
{
"epoch": 0.09198756581868933,
"grad_norm": 0.33472833037376404,
"learning_rate": 4.8470045888050075e-05,
"loss": 0.3031,
"step": 1450
},
{
"epoch": 0.09515955084692,
"grad_norm": 0.38241392374038696,
"learning_rate": 4.84171794709129e-05,
"loss": 0.2811,
"step": 1500
},
{
"epoch": 0.09833153587515067,
"grad_norm": 0.29483163356781006,
"learning_rate": 4.8364313053775726e-05,
"loss": 0.2733,
"step": 1550
},
{
"epoch": 0.10150352090338134,
"grad_norm": 0.7687553763389587,
"learning_rate": 4.831144663663855e-05,
"loss": 0.3021,
"step": 1600
},
{
"epoch": 0.104675505931612,
"grad_norm": 0.23088738322257996,
"learning_rate": 4.825858021950136e-05,
"loss": 0.2768,
"step": 1650
},
{
"epoch": 0.10784749095984267,
"grad_norm": 0.4174398183822632,
"learning_rate": 4.820571380236419e-05,
"loss": 0.2826,
"step": 1700
},
{
"epoch": 0.11101947598807334,
"grad_norm": 0.8025326132774353,
"learning_rate": 4.815284738522701e-05,
"loss": 0.342,
"step": 1750
},
{
"epoch": 0.11419146101630401,
"grad_norm": 0.22708263993263245,
"learning_rate": 4.809998096808983e-05,
"loss": 0.2597,
"step": 1800
},
{
"epoch": 0.11736344604453466,
"grad_norm": 0.7445570826530457,
"learning_rate": 4.8047114550952656e-05,
"loss": 0.3205,
"step": 1850
},
{
"epoch": 0.12053543107276533,
"grad_norm": 0.3659399151802063,
"learning_rate": 4.799424813381548e-05,
"loss": 0.3078,
"step": 1900
},
{
"epoch": 0.123707416100996,
"grad_norm": 0.21260209381580353,
"learning_rate": 4.79413817166783e-05,
"loss": 0.2728,
"step": 1950
},
{
"epoch": 0.12687940112922666,
"grad_norm": 0.44993868470191956,
"learning_rate": 4.788851529954112e-05,
"loss": 0.3012,
"step": 2000
},
{
"epoch": 0.13005138615745734,
"grad_norm": 0.47664666175842285,
"learning_rate": 4.783564888240394e-05,
"loss": 0.3808,
"step": 2050
},
{
"epoch": 0.133223371185688,
"grad_norm": 0.2831490933895111,
"learning_rate": 4.7782782465266765e-05,
"loss": 0.2902,
"step": 2100
},
{
"epoch": 0.13639535621391868,
"grad_norm": 0.22913455963134766,
"learning_rate": 4.772991604812959e-05,
"loss": 0.3305,
"step": 2150
},
{
"epoch": 0.13956734124214934,
"grad_norm": 0.3116960823535919,
"learning_rate": 4.767704963099241e-05,
"loss": 0.259,
"step": 2200
},
{
"epoch": 0.14273932627038,
"grad_norm": 0.2142101675271988,
"learning_rate": 4.762418321385523e-05,
"loss": 0.2946,
"step": 2250
},
{
"epoch": 0.14591131129861068,
"grad_norm": 0.31184491515159607,
"learning_rate": 4.757131679671806e-05,
"loss": 0.3192,
"step": 2300
},
{
"epoch": 0.14908329632684134,
"grad_norm": 0.34447625279426575,
"learning_rate": 4.7518450379580874e-05,
"loss": 0.258,
"step": 2350
},
{
"epoch": 0.152255281355072,
"grad_norm": 0.2478509545326233,
"learning_rate": 4.74655839624437e-05,
"loss": 0.2644,
"step": 2400
},
{
"epoch": 0.15542726638330268,
"grad_norm": 0.4509633183479309,
"learning_rate": 4.7412717545306524e-05,
"loss": 0.3223,
"step": 2450
},
{
"epoch": 0.15859925141153333,
"grad_norm": 0.5341806411743164,
"learning_rate": 4.735985112816934e-05,
"loss": 0.3158,
"step": 2500
},
{
"epoch": 0.16177123643976402,
"grad_norm": 0.3863554298877716,
"learning_rate": 4.730698471103217e-05,
"loss": 0.3017,
"step": 2550
},
{
"epoch": 0.16494322146799467,
"grad_norm": 0.5030498504638672,
"learning_rate": 4.725411829389499e-05,
"loss": 0.3043,
"step": 2600
},
{
"epoch": 0.16811520649622533,
"grad_norm": 0.18916811048984528,
"learning_rate": 4.720125187675781e-05,
"loss": 0.2842,
"step": 2650
},
{
"epoch": 0.171287191524456,
"grad_norm": 0.36252668499946594,
"learning_rate": 4.714838545962063e-05,
"loss": 0.3019,
"step": 2700
},
{
"epoch": 0.17445917655268667,
"grad_norm": 0.47955018281936646,
"learning_rate": 4.7095519042483454e-05,
"loss": 0.2992,
"step": 2750
},
{
"epoch": 0.17763116158091735,
"grad_norm": 0.20012222230434418,
"learning_rate": 4.7042652625346276e-05,
"loss": 0.2525,
"step": 2800
},
{
"epoch": 0.180803146609148,
"grad_norm": 0.30269569158554077,
"learning_rate": 4.6989786208209105e-05,
"loss": 0.2789,
"step": 2850
},
{
"epoch": 0.18397513163737866,
"grad_norm": 0.20235884189605713,
"learning_rate": 4.693691979107192e-05,
"loss": 0.2819,
"step": 2900
},
{
"epoch": 0.18714711666560935,
"grad_norm": 0.21970972418785095,
"learning_rate": 4.688405337393474e-05,
"loss": 0.2649,
"step": 2950
},
{
"epoch": 0.19031910169384,
"grad_norm": 0.3686061501502991,
"learning_rate": 4.683118695679757e-05,
"loss": 0.2724,
"step": 3000
},
{
"epoch": 0.19349108672207066,
"grad_norm": 0.5213696360588074,
"learning_rate": 4.6778320539660385e-05,
"loss": 0.3447,
"step": 3050
},
{
"epoch": 0.19666307175030134,
"grad_norm": 0.385406494140625,
"learning_rate": 4.6725454122523213e-05,
"loss": 0.3139,
"step": 3100
},
{
"epoch": 0.199835056778532,
"grad_norm": 0.578931450843811,
"learning_rate": 4.6672587705386035e-05,
"loss": 0.3217,
"step": 3150
},
{
"epoch": 0.20300704180676268,
"grad_norm": 0.39177459478378296,
"learning_rate": 4.661972128824886e-05,
"loss": 0.2639,
"step": 3200
},
{
"epoch": 0.20617902683499334,
"grad_norm": 0.4169202148914337,
"learning_rate": 4.656685487111168e-05,
"loss": 0.3087,
"step": 3250
},
{
"epoch": 0.209351011863224,
"grad_norm": 0.4254414141178131,
"learning_rate": 4.65139884539745e-05,
"loss": 0.2634,
"step": 3300
},
{
"epoch": 0.21252299689145468,
"grad_norm": 0.4798215627670288,
"learning_rate": 4.646112203683732e-05,
"loss": 0.3065,
"step": 3350
},
{
"epoch": 0.21569498191968534,
"grad_norm": 0.5004227161407471,
"learning_rate": 4.6408255619700144e-05,
"loss": 0.2469,
"step": 3400
},
{
"epoch": 0.21886696694791602,
"grad_norm": 0.3792094886302948,
"learning_rate": 4.6355389202562966e-05,
"loss": 0.3019,
"step": 3450
},
{
"epoch": 0.22203895197614668,
"grad_norm": 0.701235294342041,
"learning_rate": 4.630252278542579e-05,
"loss": 0.369,
"step": 3500
},
{
"epoch": 0.22521093700437733,
"grad_norm": 0.3253133296966553,
"learning_rate": 4.6249656368288616e-05,
"loss": 0.277,
"step": 3550
},
{
"epoch": 0.22838292203260802,
"grad_norm": 0.49311327934265137,
"learning_rate": 4.619678995115143e-05,
"loss": 0.2913,
"step": 3600
},
{
"epoch": 0.23155490706083867,
"grad_norm": 0.18701878190040588,
"learning_rate": 4.614392353401425e-05,
"loss": 0.3095,
"step": 3650
},
{
"epoch": 0.23472689208906933,
"grad_norm": 0.6811497807502747,
"learning_rate": 4.609105711687708e-05,
"loss": 0.2754,
"step": 3700
},
{
"epoch": 0.2378988771173,
"grad_norm": 0.16985131800174713,
"learning_rate": 4.6038190699739896e-05,
"loss": 0.3032,
"step": 3750
},
{
"epoch": 0.24107086214553067,
"grad_norm": 0.33355677127838135,
"learning_rate": 4.598532428260272e-05,
"loss": 0.31,
"step": 3800
},
{
"epoch": 0.24424284717376135,
"grad_norm": 0.3561393916606903,
"learning_rate": 4.5932457865465546e-05,
"loss": 0.3527,
"step": 3850
},
{
"epoch": 0.247414832201992,
"grad_norm": 0.5302127599716187,
"learning_rate": 4.587959144832837e-05,
"loss": 0.2963,
"step": 3900
},
{
"epoch": 0.2505868172302227,
"grad_norm": 0.26475265622138977,
"learning_rate": 4.582672503119119e-05,
"loss": 0.2859,
"step": 3950
},
{
"epoch": 0.2537588022584533,
"grad_norm": 0.42244717478752136,
"learning_rate": 4.577385861405401e-05,
"loss": 0.2915,
"step": 4000
},
{
"epoch": 0.256930787286684,
"grad_norm": 0.20778138935565948,
"learning_rate": 4.5720992196916833e-05,
"loss": 0.2775,
"step": 4050
},
{
"epoch": 0.2601027723149147,
"grad_norm": 0.21787141263484955,
"learning_rate": 4.5668125779779655e-05,
"loss": 0.2971,
"step": 4100
},
{
"epoch": 0.2632747573431453,
"grad_norm": 0.19819681346416473,
"learning_rate": 4.561525936264248e-05,
"loss": 0.3213,
"step": 4150
},
{
"epoch": 0.266446742371376,
"grad_norm": 0.23251983523368835,
"learning_rate": 4.55623929455053e-05,
"loss": 0.2691,
"step": 4200
},
{
"epoch": 0.2696187273996067,
"grad_norm": 0.26158884167671204,
"learning_rate": 4.550952652836812e-05,
"loss": 0.3758,
"step": 4250
},
{
"epoch": 0.27279071242783737,
"grad_norm": 0.18944093585014343,
"learning_rate": 4.545666011123094e-05,
"loss": 0.2989,
"step": 4300
},
{
"epoch": 0.275962697456068,
"grad_norm": 0.2313028872013092,
"learning_rate": 4.5403793694093764e-05,
"loss": 0.3013,
"step": 4350
},
{
"epoch": 0.2791346824842987,
"grad_norm": 0.2733168601989746,
"learning_rate": 4.535092727695659e-05,
"loss": 0.3073,
"step": 4400
},
{
"epoch": 0.28230666751252936,
"grad_norm": 0.5805867314338684,
"learning_rate": 4.529806085981941e-05,
"loss": 0.2595,
"step": 4450
},
{
"epoch": 0.28547865254076,
"grad_norm": 0.38282257318496704,
"learning_rate": 4.524519444268223e-05,
"loss": 0.2867,
"step": 4500
},
{
"epoch": 0.2886506375689907,
"grad_norm": 0.2380545735359192,
"learning_rate": 4.519232802554506e-05,
"loss": 0.2848,
"step": 4550
},
{
"epoch": 0.29182262259722136,
"grad_norm": 0.36206936836242676,
"learning_rate": 4.513946160840788e-05,
"loss": 0.3017,
"step": 4600
},
{
"epoch": 0.294994607625452,
"grad_norm": 0.2402886301279068,
"learning_rate": 4.5086595191270694e-05,
"loss": 0.2957,
"step": 4650
},
{
"epoch": 0.29816659265368267,
"grad_norm": 0.29904839396476746,
"learning_rate": 4.503372877413352e-05,
"loss": 0.3743,
"step": 4700
},
{
"epoch": 0.30133857768191336,
"grad_norm": 0.43882688879966736,
"learning_rate": 4.4980862356996345e-05,
"loss": 0.3218,
"step": 4750
},
{
"epoch": 0.304510562710144,
"grad_norm": 0.3032098412513733,
"learning_rate": 4.4927995939859166e-05,
"loss": 0.3373,
"step": 4800
},
{
"epoch": 0.30768254773837467,
"grad_norm": 0.5561183094978333,
"learning_rate": 4.487512952272199e-05,
"loss": 0.3756,
"step": 4850
},
{
"epoch": 0.31085453276660535,
"grad_norm": 0.2683407962322235,
"learning_rate": 4.482226310558481e-05,
"loss": 0.3041,
"step": 4900
},
{
"epoch": 0.31402651779483604,
"grad_norm": 0.44373977184295654,
"learning_rate": 4.476939668844763e-05,
"loss": 0.3054,
"step": 4950
},
{
"epoch": 0.31719850282306666,
"grad_norm": 0.23822714388370514,
"learning_rate": 4.4716530271310453e-05,
"loss": 0.2684,
"step": 5000
},
{
"epoch": 0.32037048785129735,
"grad_norm": 0.37610000371932983,
"learning_rate": 4.4663663854173275e-05,
"loss": 0.3111,
"step": 5050
},
{
"epoch": 0.32354247287952803,
"grad_norm": 0.2191406935453415,
"learning_rate": 4.4610797437036104e-05,
"loss": 0.3164,
"step": 5100
},
{
"epoch": 0.32671445790775866,
"grad_norm": 0.31773659586906433,
"learning_rate": 4.4557931019898925e-05,
"loss": 0.2996,
"step": 5150
},
{
"epoch": 0.32988644293598934,
"grad_norm": 0.3252709209918976,
"learning_rate": 4.450506460276174e-05,
"loss": 0.3224,
"step": 5200
},
{
"epoch": 0.33305842796422,
"grad_norm": 0.34400445222854614,
"learning_rate": 4.445219818562457e-05,
"loss": 0.2975,
"step": 5250
},
{
"epoch": 0.33623041299245066,
"grad_norm": 0.16536681354045868,
"learning_rate": 4.439933176848739e-05,
"loss": 0.2726,
"step": 5300
},
{
"epoch": 0.33940239802068134,
"grad_norm": 0.23107750713825226,
"learning_rate": 4.4346465351350206e-05,
"loss": 0.2814,
"step": 5350
},
{
"epoch": 0.342574383048912,
"grad_norm": 0.22565191984176636,
"learning_rate": 4.4293598934213034e-05,
"loss": 0.2877,
"step": 5400
},
{
"epoch": 0.34574636807714265,
"grad_norm": 0.25360986590385437,
"learning_rate": 4.4240732517075856e-05,
"loss": 0.3075,
"step": 5450
},
{
"epoch": 0.34891835310537334,
"grad_norm": 0.42394259572029114,
"learning_rate": 4.418786609993868e-05,
"loss": 0.2732,
"step": 5500
},
{
"epoch": 0.352090338133604,
"grad_norm": 0.5393642783164978,
"learning_rate": 4.41349996828015e-05,
"loss": 0.342,
"step": 5550
},
{
"epoch": 0.3552623231618347,
"grad_norm": 0.4016542136669159,
"learning_rate": 4.408213326566432e-05,
"loss": 0.2756,
"step": 5600
},
{
"epoch": 0.35843430819006533,
"grad_norm": 0.2234315276145935,
"learning_rate": 4.402926684852714e-05,
"loss": 0.3487,
"step": 5650
},
{
"epoch": 0.361606293218296,
"grad_norm": 0.2084522843360901,
"learning_rate": 4.3976400431389965e-05,
"loss": 0.357,
"step": 5700
},
{
"epoch": 0.3647782782465267,
"grad_norm": 0.2758818566799164,
"learning_rate": 4.3923534014252786e-05,
"loss": 0.3605,
"step": 5750
},
{
"epoch": 0.3679502632747573,
"grad_norm": 0.20652857422828674,
"learning_rate": 4.387066759711561e-05,
"loss": 0.3164,
"step": 5800
},
{
"epoch": 0.371122248302988,
"grad_norm": 0.3151554763317108,
"learning_rate": 4.381780117997844e-05,
"loss": 0.3447,
"step": 5850
},
{
"epoch": 0.3742942333312187,
"grad_norm": 0.3141522705554962,
"learning_rate": 4.376493476284125e-05,
"loss": 0.2973,
"step": 5900
},
{
"epoch": 0.3774662183594493,
"grad_norm": 0.47755780816078186,
"learning_rate": 4.371206834570408e-05,
"loss": 0.3411,
"step": 5950
},
{
"epoch": 0.38063820338768,
"grad_norm": 0.2301286906003952,
"learning_rate": 4.36592019285669e-05,
"loss": 0.2651,
"step": 6000
},
{
"epoch": 0.3838101884159107,
"grad_norm": 0.2510074973106384,
"learning_rate": 4.360633551142972e-05,
"loss": 0.2988,
"step": 6050
},
{
"epoch": 0.3869821734441413,
"grad_norm": 0.26201292872428894,
"learning_rate": 4.3553469094292545e-05,
"loss": 0.262,
"step": 6100
},
{
"epoch": 0.390154158472372,
"grad_norm": 0.1688852608203888,
"learning_rate": 4.350060267715537e-05,
"loss": 0.3006,
"step": 6150
},
{
"epoch": 0.3933261435006027,
"grad_norm": 0.475284218788147,
"learning_rate": 4.344773626001819e-05,
"loss": 0.3003,
"step": 6200
},
{
"epoch": 0.39649812852883337,
"grad_norm": 0.4884473383426666,
"learning_rate": 4.339486984288101e-05,
"loss": 0.2861,
"step": 6250
},
{
"epoch": 0.399670113557064,
"grad_norm": 0.2931898534297943,
"learning_rate": 4.334200342574383e-05,
"loss": 0.2841,
"step": 6300
},
{
"epoch": 0.4028420985852947,
"grad_norm": 0.26861268281936646,
"learning_rate": 4.3289137008606654e-05,
"loss": 0.3191,
"step": 6350
},
{
"epoch": 0.40601408361352537,
"grad_norm": 0.4085983335971832,
"learning_rate": 4.3236270591469476e-05,
"loss": 0.3429,
"step": 6400
},
{
"epoch": 0.409186068641756,
"grad_norm": 0.5681502819061279,
"learning_rate": 4.31834041743323e-05,
"loss": 0.2732,
"step": 6450
},
{
"epoch": 0.4123580536699867,
"grad_norm": 0.17655836045742035,
"learning_rate": 4.313053775719512e-05,
"loss": 0.2403,
"step": 6500
},
{
"epoch": 0.41553003869821736,
"grad_norm": 0.25269463658332825,
"learning_rate": 4.307767134005795e-05,
"loss": 0.3231,
"step": 6550
},
{
"epoch": 0.418702023726448,
"grad_norm": 0.26235464215278625,
"learning_rate": 4.302480492292076e-05,
"loss": 0.2952,
"step": 6600
},
{
"epoch": 0.4218740087546787,
"grad_norm": 0.3551720380783081,
"learning_rate": 4.2971938505783585e-05,
"loss": 0.2934,
"step": 6650
},
{
"epoch": 0.42504599378290936,
"grad_norm": 0.19850347936153412,
"learning_rate": 4.291907208864641e-05,
"loss": 0.2669,
"step": 6700
},
{
"epoch": 0.42821797881114,
"grad_norm": 0.6945760250091553,
"learning_rate": 4.2866205671509235e-05,
"loss": 0.3636,
"step": 6750
},
{
"epoch": 0.43138996383937067,
"grad_norm": 0.1777346432209015,
"learning_rate": 4.281333925437206e-05,
"loss": 0.3069,
"step": 6800
},
{
"epoch": 0.43456194886760136,
"grad_norm": 0.4449566900730133,
"learning_rate": 4.276047283723488e-05,
"loss": 0.3688,
"step": 6850
},
{
"epoch": 0.43773393389583204,
"grad_norm": 0.2210356742143631,
"learning_rate": 4.27076064200977e-05,
"loss": 0.2659,
"step": 6900
},
{
"epoch": 0.44090591892406267,
"grad_norm": 0.265536367893219,
"learning_rate": 4.265474000296052e-05,
"loss": 0.3184,
"step": 6950
},
{
"epoch": 0.44407790395229335,
"grad_norm": 0.2273561656475067,
"learning_rate": 4.2601873585823344e-05,
"loss": 0.2729,
"step": 7000
},
{
"epoch": 0.44724988898052404,
"grad_norm": 0.2259570211172104,
"learning_rate": 4.2549007168686165e-05,
"loss": 0.2893,
"step": 7050
},
{
"epoch": 0.45042187400875466,
"grad_norm": 0.2315446436405182,
"learning_rate": 4.2496140751548994e-05,
"loss": 0.295,
"step": 7100
},
{
"epoch": 0.45359385903698535,
"grad_norm": 0.26081445813179016,
"learning_rate": 4.244327433441181e-05,
"loss": 0.3424,
"step": 7150
},
{
"epoch": 0.45676584406521603,
"grad_norm": 0.33118560910224915,
"learning_rate": 4.239040791727463e-05,
"loss": 0.3039,
"step": 7200
},
{
"epoch": 0.45993782909344666,
"grad_norm": 0.34620553255081177,
"learning_rate": 4.233754150013746e-05,
"loss": 0.313,
"step": 7250
},
{
"epoch": 0.46310981412167734,
"grad_norm": 0.2923032343387604,
"learning_rate": 4.2284675083000274e-05,
"loss": 0.285,
"step": 7300
},
{
"epoch": 0.466281799149908,
"grad_norm": 0.27615100145339966,
"learning_rate": 4.2231808665863096e-05,
"loss": 0.3385,
"step": 7350
},
{
"epoch": 0.46945378417813866,
"grad_norm": 0.3606735169887543,
"learning_rate": 4.2178942248725924e-05,
"loss": 0.3031,
"step": 7400
},
{
"epoch": 0.47262576920636934,
"grad_norm": 0.2961825132369995,
"learning_rate": 4.2126075831588746e-05,
"loss": 0.3671,
"step": 7450
},
{
"epoch": 0.4757977542346,
"grad_norm": 0.1403179168701172,
"learning_rate": 4.207320941445156e-05,
"loss": 0.2606,
"step": 7500
},
{
"epoch": 0.4789697392628307,
"grad_norm": 0.2120542675256729,
"learning_rate": 4.202034299731439e-05,
"loss": 0.2723,
"step": 7550
},
{
"epoch": 0.48214172429106134,
"grad_norm": 0.42080938816070557,
"learning_rate": 4.196747658017721e-05,
"loss": 0.3048,
"step": 7600
},
{
"epoch": 0.485313709319292,
"grad_norm": 0.2501380443572998,
"learning_rate": 4.191461016304003e-05,
"loss": 0.3641,
"step": 7650
},
{
"epoch": 0.4884856943475227,
"grad_norm": 0.2869213819503784,
"learning_rate": 4.1861743745902855e-05,
"loss": 0.3156,
"step": 7700
},
{
"epoch": 0.49165767937575333,
"grad_norm": 0.5819279551506042,
"learning_rate": 4.180887732876568e-05,
"loss": 0.3228,
"step": 7750
},
{
"epoch": 0.494829664403984,
"grad_norm": 0.3455282151699066,
"learning_rate": 4.17560109116285e-05,
"loss": 0.2957,
"step": 7800
},
{
"epoch": 0.4980016494322147,
"grad_norm": 0.14816895127296448,
"learning_rate": 4.170314449449132e-05,
"loss": 0.3039,
"step": 7850
},
{
"epoch": 0.5011736344604454,
"grad_norm": 0.5370512008666992,
"learning_rate": 4.165027807735414e-05,
"loss": 0.3572,
"step": 7900
},
{
"epoch": 0.5043456194886761,
"grad_norm": 0.3175135850906372,
"learning_rate": 4.159741166021697e-05,
"loss": 0.2527,
"step": 7950
},
{
"epoch": 0.5075176045169066,
"grad_norm": 0.24236038327217102,
"learning_rate": 4.1544545243079785e-05,
"loss": 0.3078,
"step": 8000
},
{
"epoch": 0.5106895895451373,
"grad_norm": 0.38427793979644775,
"learning_rate": 4.149167882594261e-05,
"loss": 0.313,
"step": 8050
},
{
"epoch": 0.513861574573368,
"grad_norm": 0.3454573154449463,
"learning_rate": 4.1438812408805436e-05,
"loss": 0.312,
"step": 8100
},
{
"epoch": 0.5170335596015987,
"grad_norm": 0.23383672535419464,
"learning_rate": 4.138594599166826e-05,
"loss": 0.3082,
"step": 8150
},
{
"epoch": 0.5202055446298294,
"grad_norm": 0.25619155168533325,
"learning_rate": 4.133307957453107e-05,
"loss": 0.315,
"step": 8200
},
{
"epoch": 0.5233775296580601,
"grad_norm": 0.17233146727085114,
"learning_rate": 4.12802131573939e-05,
"loss": 0.2746,
"step": 8250
},
{
"epoch": 0.5265495146862906,
"grad_norm": 0.305403470993042,
"learning_rate": 4.122734674025672e-05,
"loss": 0.3822,
"step": 8300
},
{
"epoch": 0.5297214997145213,
"grad_norm": 0.1978190392255783,
"learning_rate": 4.117448032311954e-05,
"loss": 0.3654,
"step": 8350
},
{
"epoch": 0.532893484742752,
"grad_norm": 0.1691414713859558,
"learning_rate": 4.1121613905982366e-05,
"loss": 0.2663,
"step": 8400
},
{
"epoch": 0.5360654697709827,
"grad_norm": 0.30035603046417236,
"learning_rate": 4.106874748884519e-05,
"loss": 0.3726,
"step": 8450
},
{
"epoch": 0.5392374547992134,
"grad_norm": 0.2909483015537262,
"learning_rate": 4.101588107170801e-05,
"loss": 0.2711,
"step": 8500
},
{
"epoch": 0.542409439827444,
"grad_norm": 0.1674415022134781,
"learning_rate": 4.096301465457083e-05,
"loss": 0.2736,
"step": 8550
},
{
"epoch": 0.5455814248556747,
"grad_norm": 0.33168473839759827,
"learning_rate": 4.091014823743365e-05,
"loss": 0.3428,
"step": 8600
},
{
"epoch": 0.5487534098839053,
"grad_norm": 0.389967143535614,
"learning_rate": 4.0857281820296475e-05,
"loss": 0.2841,
"step": 8650
},
{
"epoch": 0.551925394912136,
"grad_norm": 0.2906075716018677,
"learning_rate": 4.0804415403159303e-05,
"loss": 0.2447,
"step": 8700
},
{
"epoch": 0.5550973799403667,
"grad_norm": 0.5243480205535889,
"learning_rate": 4.075154898602212e-05,
"loss": 0.2743,
"step": 8750
},
{
"epoch": 0.5582693649685974,
"grad_norm": 0.3285157084465027,
"learning_rate": 4.069868256888495e-05,
"loss": 0.2498,
"step": 8800
},
{
"epoch": 0.561441349996828,
"grad_norm": 0.7118728756904602,
"learning_rate": 4.064581615174777e-05,
"loss": 0.2666,
"step": 8850
},
{
"epoch": 0.5646133350250587,
"grad_norm": 0.25792092084884644,
"learning_rate": 4.0592949734610584e-05,
"loss": 0.2903,
"step": 8900
},
{
"epoch": 0.5677853200532893,
"grad_norm": 0.6102173924446106,
"learning_rate": 4.054008331747341e-05,
"loss": 0.2847,
"step": 8950
},
{
"epoch": 0.57095730508152,
"grad_norm": 0.16453662514686584,
"learning_rate": 4.0487216900336234e-05,
"loss": 0.2895,
"step": 9000
},
{
"epoch": 0.5741292901097507,
"grad_norm": 0.4610302448272705,
"learning_rate": 4.0434350483199056e-05,
"loss": 0.3077,
"step": 9050
},
{
"epoch": 0.5773012751379814,
"grad_norm": 0.2923647165298462,
"learning_rate": 4.038148406606188e-05,
"loss": 0.2588,
"step": 9100
},
{
"epoch": 0.580473260166212,
"grad_norm": 0.25572457909584045,
"learning_rate": 4.03286176489247e-05,
"loss": 0.2902,
"step": 9150
},
{
"epoch": 0.5836452451944427,
"grad_norm": 0.28391504287719727,
"learning_rate": 4.027575123178752e-05,
"loss": 0.2972,
"step": 9200
},
{
"epoch": 0.5868172302226734,
"grad_norm": 0.23603619635105133,
"learning_rate": 4.022288481465034e-05,
"loss": 0.295,
"step": 9250
},
{
"epoch": 0.589989215250904,
"grad_norm": 0.23293623328208923,
"learning_rate": 4.0170018397513164e-05,
"loss": 0.2754,
"step": 9300
},
{
"epoch": 0.5931612002791347,
"grad_norm": 0.38438886404037476,
"learning_rate": 4.0117151980375986e-05,
"loss": 0.3086,
"step": 9350
},
{
"epoch": 0.5963331853073653,
"grad_norm": 0.2958177626132965,
"learning_rate": 4.0064285563238815e-05,
"loss": 0.2789,
"step": 9400
},
{
"epoch": 0.599505170335596,
"grad_norm": 0.19555646181106567,
"learning_rate": 4.001141914610163e-05,
"loss": 0.2437,
"step": 9450
},
{
"epoch": 0.6026771553638267,
"grad_norm": 0.24617354571819305,
"learning_rate": 3.995855272896445e-05,
"loss": 0.2823,
"step": 9500
},
{
"epoch": 0.6058491403920574,
"grad_norm": 0.2656566798686981,
"learning_rate": 3.990568631182728e-05,
"loss": 0.2593,
"step": 9550
},
{
"epoch": 0.609021125420288,
"grad_norm": 0.17703703045845032,
"learning_rate": 3.9852819894690095e-05,
"loss": 0.2781,
"step": 9600
},
{
"epoch": 0.6121931104485187,
"grad_norm": 0.28496983647346497,
"learning_rate": 3.9799953477552923e-05,
"loss": 0.2852,
"step": 9650
},
{
"epoch": 0.6153650954767493,
"grad_norm": 0.16364213824272156,
"learning_rate": 3.9747087060415745e-05,
"loss": 0.3279,
"step": 9700
},
{
"epoch": 0.61853708050498,
"grad_norm": 0.5835040211677551,
"learning_rate": 3.969422064327857e-05,
"loss": 0.3108,
"step": 9750
},
{
"epoch": 0.6217090655332107,
"grad_norm": 0.2625332474708557,
"learning_rate": 3.964135422614139e-05,
"loss": 0.2944,
"step": 9800
},
{
"epoch": 0.6248810505614414,
"grad_norm": 0.18694092333316803,
"learning_rate": 3.958848780900421e-05,
"loss": 0.2827,
"step": 9850
},
{
"epoch": 0.6280530355896721,
"grad_norm": 0.26124364137649536,
"learning_rate": 3.953562139186703e-05,
"loss": 0.2805,
"step": 9900
},
{
"epoch": 0.6312250206179026,
"grad_norm": 0.2587612271308899,
"learning_rate": 3.9482754974729854e-05,
"loss": 0.3242,
"step": 9950
},
{
"epoch": 0.6343970056461333,
"grad_norm": 0.2706884443759918,
"learning_rate": 3.9429888557592676e-05,
"loss": 0.268,
"step": 10000
},
{
"epoch": 0.637568990674364,
"grad_norm": 0.23814411461353302,
"learning_rate": 3.93770221404555e-05,
"loss": 0.3485,
"step": 10050
},
{
"epoch": 0.6407409757025947,
"grad_norm": 0.5621163249015808,
"learning_rate": 3.9324155723318326e-05,
"loss": 0.3154,
"step": 10100
},
{
"epoch": 0.6439129607308254,
"grad_norm": 0.42355868220329285,
"learning_rate": 3.927128930618114e-05,
"loss": 0.2868,
"step": 10150
},
{
"epoch": 0.6470849457590561,
"grad_norm": 0.2288525253534317,
"learning_rate": 3.921842288904396e-05,
"loss": 0.2522,
"step": 10200
},
{
"epoch": 0.6502569307872866,
"grad_norm": 0.22445383667945862,
"learning_rate": 3.916555647190679e-05,
"loss": 0.3396,
"step": 10250
},
{
"epoch": 0.6534289158155173,
"grad_norm": 0.6962974667549133,
"learning_rate": 3.9112690054769606e-05,
"loss": 0.3361,
"step": 10300
},
{
"epoch": 0.656600900843748,
"grad_norm": 0.28549084067344666,
"learning_rate": 3.905982363763243e-05,
"loss": 0.2817,
"step": 10350
},
{
"epoch": 0.6597728858719787,
"grad_norm": 0.23823893070220947,
"learning_rate": 3.9006957220495256e-05,
"loss": 0.2634,
"step": 10400
},
{
"epoch": 0.6629448709002094,
"grad_norm": 0.2657338082790375,
"learning_rate": 3.895409080335808e-05,
"loss": 0.2855,
"step": 10450
},
{
"epoch": 0.66611685592844,
"grad_norm": 0.20627088844776154,
"learning_rate": 3.89012243862209e-05,
"loss": 0.2857,
"step": 10500
},
{
"epoch": 0.6692888409566707,
"grad_norm": 0.3207322061061859,
"learning_rate": 3.884941529742647e-05,
"loss": 0.317,
"step": 10550
},
{
"epoch": 0.6724608259849013,
"grad_norm": 0.28210651874542236,
"learning_rate": 3.879654888028928e-05,
"loss": 0.3595,
"step": 10600
},
{
"epoch": 0.675632811013132,
"grad_norm": 0.1988239288330078,
"learning_rate": 3.874368246315211e-05,
"loss": 0.267,
"step": 10650
},
{
"epoch": 0.6788047960413627,
"grad_norm": 0.7708704471588135,
"learning_rate": 3.869081604601493e-05,
"loss": 0.3021,
"step": 10700
},
{
"epoch": 0.6819767810695934,
"grad_norm": 0.2626688778400421,
"learning_rate": 3.8637949628877754e-05,
"loss": 0.2574,
"step": 10750
},
{
"epoch": 0.685148766097824,
"grad_norm": 0.20237919688224792,
"learning_rate": 3.8585083211740576e-05,
"loss": 0.2658,
"step": 10800
},
{
"epoch": 0.6883207511260547,
"grad_norm": 0.2662367820739746,
"learning_rate": 3.85322167946034e-05,
"loss": 0.3054,
"step": 10850
},
{
"epoch": 0.6914927361542853,
"grad_norm": 0.3348242938518524,
"learning_rate": 3.847935037746622e-05,
"loss": 0.3265,
"step": 10900
},
{
"epoch": 0.694664721182516,
"grad_norm": 0.44233012199401855,
"learning_rate": 3.842648396032904e-05,
"loss": 0.2712,
"step": 10950
},
{
"epoch": 0.6978367062107467,
"grad_norm": 0.33227744698524475,
"learning_rate": 3.837361754319186e-05,
"loss": 0.2904,
"step": 11000
},
{
"epoch": 0.7010086912389774,
"grad_norm": 0.4147779047489166,
"learning_rate": 3.832075112605469e-05,
"loss": 0.3009,
"step": 11050
},
{
"epoch": 0.704180676267208,
"grad_norm": 0.39537376165390015,
"learning_rate": 3.8267884708917506e-05,
"loss": 0.3238,
"step": 11100
},
{
"epoch": 0.7073526612954387,
"grad_norm": 0.18787072598934174,
"learning_rate": 3.821501829178033e-05,
"loss": 0.2494,
"step": 11150
},
{
"epoch": 0.7105246463236694,
"grad_norm": 0.20560680329799652,
"learning_rate": 3.8162151874643156e-05,
"loss": 0.2715,
"step": 11200
},
{
"epoch": 0.7136966313519,
"grad_norm": 0.29710862040519714,
"learning_rate": 3.810928545750598e-05,
"loss": 0.3487,
"step": 11250
},
{
"epoch": 0.7168686163801307,
"grad_norm": 0.30526480078697205,
"learning_rate": 3.805641904036879e-05,
"loss": 0.2885,
"step": 11300
},
{
"epoch": 0.7200406014083613,
"grad_norm": 0.2582074701786041,
"learning_rate": 3.800355262323162e-05,
"loss": 0.336,
"step": 11350
},
{
"epoch": 0.723212586436592,
"grad_norm": 0.3673989176750183,
"learning_rate": 3.795068620609444e-05,
"loss": 0.4197,
"step": 11400
},
{
"epoch": 0.7263845714648227,
"grad_norm": 0.19386839866638184,
"learning_rate": 3.7897819788957265e-05,
"loss": 0.3385,
"step": 11450
},
{
"epoch": 0.7295565564930534,
"grad_norm": 0.21169255673885345,
"learning_rate": 3.784495337182009e-05,
"loss": 0.305,
"step": 11500
},
{
"epoch": 0.732728541521284,
"grad_norm": 0.541127622127533,
"learning_rate": 3.779208695468291e-05,
"loss": 0.2942,
"step": 11550
},
{
"epoch": 0.7359005265495147,
"grad_norm": 0.4768331050872803,
"learning_rate": 3.773922053754573e-05,
"loss": 0.3122,
"step": 11600
},
{
"epoch": 0.7390725115777453,
"grad_norm": 0.39062756299972534,
"learning_rate": 3.768635412040855e-05,
"loss": 0.2905,
"step": 11650
},
{
"epoch": 0.742244496605976,
"grad_norm": 0.2553999722003937,
"learning_rate": 3.7633487703271374e-05,
"loss": 0.2705,
"step": 11700
},
{
"epoch": 0.7454164816342067,
"grad_norm": 0.3118399381637573,
"learning_rate": 3.7580621286134196e-05,
"loss": 0.263,
"step": 11750
},
{
"epoch": 0.7485884666624374,
"grad_norm": 0.1847628504037857,
"learning_rate": 3.7527754868997024e-05,
"loss": 0.2748,
"step": 11800
},
{
"epoch": 0.7517604516906681,
"grad_norm": 0.20181454718112946,
"learning_rate": 3.747488845185984e-05,
"loss": 0.2801,
"step": 11850
},
{
"epoch": 0.7549324367188986,
"grad_norm": 0.4498727321624756,
"learning_rate": 3.742202203472267e-05,
"loss": 0.3182,
"step": 11900
},
{
"epoch": 0.7581044217471293,
"grad_norm": 0.37652915716171265,
"learning_rate": 3.736915561758549e-05,
"loss": 0.2782,
"step": 11950
},
{
"epoch": 0.76127640677536,
"grad_norm": 0.2723052203655243,
"learning_rate": 3.7316289200448304e-05,
"loss": 0.3187,
"step": 12000
},
{
"epoch": 0.7644483918035907,
"grad_norm": 0.16256879270076752,
"learning_rate": 3.726342278331113e-05,
"loss": 0.2866,
"step": 12050
},
{
"epoch": 0.7676203768318214,
"grad_norm": 0.25574353337287903,
"learning_rate": 3.7210556366173955e-05,
"loss": 0.2848,
"step": 12100
},
{
"epoch": 0.7707923618600521,
"grad_norm": 0.18727587163448334,
"learning_rate": 3.7157689949036776e-05,
"loss": 0.3105,
"step": 12150
},
{
"epoch": 0.7739643468882826,
"grad_norm": 0.25161731243133545,
"learning_rate": 3.71048235318996e-05,
"loss": 0.2911,
"step": 12200
},
{
"epoch": 0.7771363319165133,
"grad_norm": 0.43750718235969543,
"learning_rate": 3.705195711476242e-05,
"loss": 0.2981,
"step": 12250
},
{
"epoch": 0.780308316944744,
"grad_norm": 0.27956822514533997,
"learning_rate": 3.699909069762524e-05,
"loss": 0.3155,
"step": 12300
},
{
"epoch": 0.7834803019729747,
"grad_norm": 0.1870819479227066,
"learning_rate": 3.694622428048806e-05,
"loss": 0.2683,
"step": 12350
},
{
"epoch": 0.7866522870012054,
"grad_norm": 0.44053515791893005,
"learning_rate": 3.6893357863350885e-05,
"loss": 0.3244,
"step": 12400
},
{
"epoch": 0.7898242720294361,
"grad_norm": 0.4747866988182068,
"learning_rate": 3.684049144621371e-05,
"loss": 0.2675,
"step": 12450
},
{
"epoch": 0.7929962570576667,
"grad_norm": 0.2212987244129181,
"learning_rate": 3.6787625029076535e-05,
"loss": 0.3245,
"step": 12500
},
{
"epoch": 0.7961682420858973,
"grad_norm": 0.16488397121429443,
"learning_rate": 3.673475861193935e-05,
"loss": 0.2579,
"step": 12550
},
{
"epoch": 0.799340227114128,
"grad_norm": 0.15184266865253448,
"learning_rate": 3.668189219480217e-05,
"loss": 0.3139,
"step": 12600
},
{
"epoch": 0.8025122121423587,
"grad_norm": 0.179268479347229,
"learning_rate": 3.6629025777665e-05,
"loss": 0.2941,
"step": 12650
},
{
"epoch": 0.8056841971705894,
"grad_norm": 0.24617774784564972,
"learning_rate": 3.6576159360527816e-05,
"loss": 0.2934,
"step": 12700
},
{
"epoch": 0.80885618219882,
"grad_norm": 0.3757403790950775,
"learning_rate": 3.6523292943390644e-05,
"loss": 0.2917,
"step": 12750
},
{
"epoch": 0.8120281672270507,
"grad_norm": 0.360689640045166,
"learning_rate": 3.6471483854596204e-05,
"loss": 0.278,
"step": 12800
},
{
"epoch": 0.8152001522552813,
"grad_norm": 0.38574671745300293,
"learning_rate": 3.641861743745903e-05,
"loss": 0.2706,
"step": 12850
},
{
"epoch": 0.818372137283512,
"grad_norm": 0.45887231826782227,
"learning_rate": 3.6365751020321855e-05,
"loss": 0.3111,
"step": 12900
},
{
"epoch": 0.8215441223117427,
"grad_norm": 0.22820314764976501,
"learning_rate": 3.6312884603184676e-05,
"loss": 0.3343,
"step": 12950
},
{
"epoch": 0.8247161073399734,
"grad_norm": 0.18171218037605286,
"learning_rate": 3.62600181860475e-05,
"loss": 0.2827,
"step": 13000
},
{
"epoch": 0.827888092368204,
"grad_norm": 0.2788825035095215,
"learning_rate": 3.620715176891032e-05,
"loss": 0.331,
"step": 13050
},
{
"epoch": 0.8310600773964347,
"grad_norm": 0.31379690766334534,
"learning_rate": 3.615428535177314e-05,
"loss": 0.3031,
"step": 13100
},
{
"epoch": 0.8342320624246654,
"grad_norm": 0.3020433187484741,
"learning_rate": 3.610141893463596e-05,
"loss": 0.27,
"step": 13150
},
{
"epoch": 0.837404047452896,
"grad_norm": 0.2358977198600769,
"learning_rate": 3.6048552517498785e-05,
"loss": 0.3495,
"step": 13200
},
{
"epoch": 0.8405760324811267,
"grad_norm": 0.2896983325481415,
"learning_rate": 3.599568610036161e-05,
"loss": 0.2766,
"step": 13250
},
{
"epoch": 0.8437480175093574,
"grad_norm": 0.30271226167678833,
"learning_rate": 3.5942819683224435e-05,
"loss": 0.3418,
"step": 13300
},
{
"epoch": 0.846920002537588,
"grad_norm": 0.22971239686012268,
"learning_rate": 3.588995326608725e-05,
"loss": 0.2905,
"step": 13350
},
{
"epoch": 0.8500919875658187,
"grad_norm": 0.22787493467330933,
"learning_rate": 3.583708684895007e-05,
"loss": 0.2906,
"step": 13400
},
{
"epoch": 0.8532639725940494,
"grad_norm": 0.3081256151199341,
"learning_rate": 3.57842204318129e-05,
"loss": 0.3264,
"step": 13450
},
{
"epoch": 0.85643595762228,
"grad_norm": 0.46066999435424805,
"learning_rate": 3.5731354014675716e-05,
"loss": 0.2963,
"step": 13500
},
{
"epoch": 0.8596079426505107,
"grad_norm": 0.467032253742218,
"learning_rate": 3.567848759753854e-05,
"loss": 0.3505,
"step": 13550
},
{
"epoch": 0.8627799276787413,
"grad_norm": 0.35964497923851013,
"learning_rate": 3.5625621180401366e-05,
"loss": 0.2843,
"step": 13600
},
{
"epoch": 0.865951912706972,
"grad_norm": 0.3182917833328247,
"learning_rate": 3.557275476326419e-05,
"loss": 0.3199,
"step": 13650
},
{
"epoch": 0.8691238977352027,
"grad_norm": 0.547640323638916,
"learning_rate": 3.551988834612701e-05,
"loss": 0.2675,
"step": 13700
},
{
"epoch": 0.8722958827634334,
"grad_norm": 0.5345727801322937,
"learning_rate": 3.546702192898983e-05,
"loss": 0.3218,
"step": 13750
},
{
"epoch": 0.8754678677916641,
"grad_norm": 0.23531897366046906,
"learning_rate": 3.541415551185265e-05,
"loss": 0.3132,
"step": 13800
},
{
"epoch": 0.8786398528198947,
"grad_norm": 0.38224852085113525,
"learning_rate": 3.5361289094715475e-05,
"loss": 0.3107,
"step": 13850
},
{
"epoch": 0.8818118378481253,
"grad_norm": 0.21280410885810852,
"learning_rate": 3.5308422677578296e-05,
"loss": 0.3213,
"step": 13900
},
{
"epoch": 0.884983822876356,
"grad_norm": 0.23802965879440308,
"learning_rate": 3.525555626044112e-05,
"loss": 0.3249,
"step": 13950
},
{
"epoch": 0.8881558079045867,
"grad_norm": 0.23534643650054932,
"learning_rate": 3.5202689843303947e-05,
"loss": 0.2846,
"step": 14000
},
{
"epoch": 0.8913277929328174,
"grad_norm": 0.2755154073238373,
"learning_rate": 3.514982342616676e-05,
"loss": 0.2596,
"step": 14050
},
{
"epoch": 0.8944997779610481,
"grad_norm": 0.20472615957260132,
"learning_rate": 3.509695700902958e-05,
"loss": 0.2655,
"step": 14100
},
{
"epoch": 0.8976717629892786,
"grad_norm": 0.280692994594574,
"learning_rate": 3.504409059189241e-05,
"loss": 0.3129,
"step": 14150
},
{
"epoch": 0.9008437480175093,
"grad_norm": 0.382570743560791,
"learning_rate": 3.4991224174755234e-05,
"loss": 0.3144,
"step": 14200
},
{
"epoch": 0.90401573304574,
"grad_norm": 0.2799607813358307,
"learning_rate": 3.493835775761805e-05,
"loss": 0.3225,
"step": 14250
},
{
"epoch": 0.9071877180739707,
"grad_norm": 0.2509687840938568,
"learning_rate": 3.488549134048088e-05,
"loss": 0.2808,
"step": 14300
},
{
"epoch": 0.9103597031022014,
"grad_norm": 0.6318449378013611,
"learning_rate": 3.48326249233437e-05,
"loss": 0.3053,
"step": 14350
},
{
"epoch": 0.9135316881304321,
"grad_norm": 0.16883951425552368,
"learning_rate": 3.4779758506206514e-05,
"loss": 0.2634,
"step": 14400
},
{
"epoch": 0.9167036731586627,
"grad_norm": 0.1657867729663849,
"learning_rate": 3.472689208906934e-05,
"loss": 0.3301,
"step": 14450
},
{
"epoch": 0.9198756581868933,
"grad_norm": 0.17061150074005127,
"learning_rate": 3.4674025671932164e-05,
"loss": 0.2835,
"step": 14500
},
{
"epoch": 0.923047643215124,
"grad_norm": 0.4344567656517029,
"learning_rate": 3.4621159254794986e-05,
"loss": 0.312,
"step": 14550
},
{
"epoch": 0.9262196282433547,
"grad_norm": 0.2930458188056946,
"learning_rate": 3.456829283765781e-05,
"loss": 0.2989,
"step": 14600
},
{
"epoch": 0.9293916132715854,
"grad_norm": 0.2887861728668213,
"learning_rate": 3.451542642052063e-05,
"loss": 0.3375,
"step": 14650
},
{
"epoch": 0.932563598299816,
"grad_norm": 0.22968149185180664,
"learning_rate": 3.446256000338345e-05,
"loss": 0.3065,
"step": 14700
},
{
"epoch": 0.9357355833280467,
"grad_norm": 0.2681732773780823,
"learning_rate": 3.440969358624627e-05,
"loss": 0.3132,
"step": 14750
},
{
"epoch": 0.9389075683562773,
"grad_norm": 0.3073856234550476,
"learning_rate": 3.4356827169109095e-05,
"loss": 0.3223,
"step": 14800
},
{
"epoch": 0.942079553384508,
"grad_norm": 0.18574346601963043,
"learning_rate": 3.430396075197192e-05,
"loss": 0.3083,
"step": 14850
},
{
"epoch": 0.9452515384127387,
"grad_norm": 0.44194427132606506,
"learning_rate": 3.4251094334834745e-05,
"loss": 0.2727,
"step": 14900
},
{
"epoch": 0.9484235234409694,
"grad_norm": 0.29627904295921326,
"learning_rate": 3.419822791769756e-05,
"loss": 0.3319,
"step": 14950
},
{
"epoch": 0.9515955084692,
"grad_norm": 0.3156539499759674,
"learning_rate": 3.414536150056039e-05,
"loss": 0.2866,
"step": 15000
},
{
"epoch": 0.9547674934974307,
"grad_norm": 0.18663552403450012,
"learning_rate": 3.409249508342321e-05,
"loss": 0.2708,
"step": 15050
},
{
"epoch": 0.9579394785256614,
"grad_norm": 0.29560723900794983,
"learning_rate": 3.4039628666286025e-05,
"loss": 0.2739,
"step": 15100
},
{
"epoch": 0.961111463553892,
"grad_norm": 0.3896738290786743,
"learning_rate": 3.3986762249148854e-05,
"loss": 0.3098,
"step": 15150
},
{
"epoch": 0.9642834485821227,
"grad_norm": 0.23948702216148376,
"learning_rate": 3.3933895832011675e-05,
"loss": 0.2292,
"step": 15200
},
{
"epoch": 0.9674554336103534,
"grad_norm": 0.36551278829574585,
"learning_rate": 3.38810294148745e-05,
"loss": 0.303,
"step": 15250
},
{
"epoch": 0.970627418638584,
"grad_norm": 0.16233482956886292,
"learning_rate": 3.382816299773732e-05,
"loss": 0.3182,
"step": 15300
},
{
"epoch": 0.9737994036668147,
"grad_norm": 0.29786011576652527,
"learning_rate": 3.377529658060014e-05,
"loss": 0.2966,
"step": 15350
},
{
"epoch": 0.9769713886950454,
"grad_norm": 0.18349693715572357,
"learning_rate": 3.372243016346296e-05,
"loss": 0.3126,
"step": 15400
},
{
"epoch": 0.980143373723276,
"grad_norm": 0.13039042055606842,
"learning_rate": 3.3669563746325784e-05,
"loss": 0.2645,
"step": 15450
},
{
"epoch": 0.9833153587515067,
"grad_norm": 0.19823278486728668,
"learning_rate": 3.3616697329188606e-05,
"loss": 0.2843,
"step": 15500
},
{
"epoch": 0.9864873437797373,
"grad_norm": 0.2074085921049118,
"learning_rate": 3.356383091205143e-05,
"loss": 0.2859,
"step": 15550
},
{
"epoch": 0.989659328807968,
"grad_norm": 0.6243526935577393,
"learning_rate": 3.3510964494914256e-05,
"loss": 0.3081,
"step": 15600
},
{
"epoch": 0.9928313138361987,
"grad_norm": 0.19868969917297363,
"learning_rate": 3.345809807777707e-05,
"loss": 0.2767,
"step": 15650
},
{
"epoch": 0.9960032988644294,
"grad_norm": 0.4235476553440094,
"learning_rate": 3.34052316606399e-05,
"loss": 0.2831,
"step": 15700
},
{
"epoch": 0.9991752838926601,
"grad_norm": 0.29368528723716736,
"learning_rate": 3.335236524350272e-05,
"loss": 0.3094,
"step": 15750
},
{
"epoch": 1.0,
"eval_loss": 0.3151220977306366,
"eval_runtime": 48.1084,
"eval_samples_per_second": 34.194,
"eval_steps_per_second": 17.107,
"step": 15763
},
{
"epoch": 1.0023472689208908,
"grad_norm": 0.34481731057167053,
"learning_rate": 3.329949882636554e-05,
"loss": 0.2416,
"step": 15800
},
{
"epoch": 1.0055192539491213,
"grad_norm": 0.34973275661468506,
"learning_rate": 3.3246632409228365e-05,
"loss": 0.2596,
"step": 15850
},
{
"epoch": 1.0086912389773521,
"grad_norm": 0.3936697840690613,
"learning_rate": 3.3193765992091187e-05,
"loss": 0.3223,
"step": 15900
},
{
"epoch": 1.0118632240055827,
"grad_norm": 0.2801978588104248,
"learning_rate": 3.314089957495401e-05,
"loss": 0.2896,
"step": 15950
},
{
"epoch": 1.0150352090338133,
"grad_norm": 0.1787472814321518,
"learning_rate": 3.308803315781683e-05,
"loss": 0.3416,
"step": 16000
},
{
"epoch": 1.018207194062044,
"grad_norm": 0.13754060864448547,
"learning_rate": 3.303516674067965e-05,
"loss": 0.282,
"step": 16050
},
{
"epoch": 1.0213791790902746,
"grad_norm": 0.24444366991519928,
"learning_rate": 3.2982300323542474e-05,
"loss": 0.2829,
"step": 16100
},
{
"epoch": 1.0245511641185054,
"grad_norm": 0.46902337670326233,
"learning_rate": 3.29294339064053e-05,
"loss": 0.3322,
"step": 16150
},
{
"epoch": 1.027723149146736,
"grad_norm": 0.17102986574172974,
"learning_rate": 3.287656748926812e-05,
"loss": 0.2946,
"step": 16200
},
{
"epoch": 1.0308951341749666,
"grad_norm": 0.5195295214653015,
"learning_rate": 3.282370107213094e-05,
"loss": 0.2921,
"step": 16250
},
{
"epoch": 1.0340671192031974,
"grad_norm": 0.23874568939208984,
"learning_rate": 3.277083465499377e-05,
"loss": 0.2764,
"step": 16300
},
{
"epoch": 1.037239104231428,
"grad_norm": 0.2507326304912567,
"learning_rate": 3.271796823785658e-05,
"loss": 0.2752,
"step": 16350
},
{
"epoch": 1.0404110892596588,
"grad_norm": 0.386338472366333,
"learning_rate": 3.2665101820719404e-05,
"loss": 0.3205,
"step": 16400
},
{
"epoch": 1.0435830742878893,
"grad_norm": 0.2907971441745758,
"learning_rate": 3.261223540358223e-05,
"loss": 0.2699,
"step": 16450
},
{
"epoch": 1.0467550593161201,
"grad_norm": 0.3498822748661041,
"learning_rate": 3.2559368986445054e-05,
"loss": 0.3075,
"step": 16500
},
{
"epoch": 1.0499270443443507,
"grad_norm": 0.1719454526901245,
"learning_rate": 3.2506502569307876e-05,
"loss": 0.2604,
"step": 16550
},
{
"epoch": 1.0530990293725813,
"grad_norm": 0.2626461684703827,
"learning_rate": 3.24536361521707e-05,
"loss": 0.308,
"step": 16600
},
{
"epoch": 1.056271014400812,
"grad_norm": 0.26986241340637207,
"learning_rate": 3.240076973503352e-05,
"loss": 0.2816,
"step": 16650
},
{
"epoch": 1.0594429994290426,
"grad_norm": 0.2187446653842926,
"learning_rate": 3.234790331789634e-05,
"loss": 0.2749,
"step": 16700
},
{
"epoch": 1.0626149844572734,
"grad_norm": 0.2777579128742218,
"learning_rate": 3.229503690075916e-05,
"loss": 0.3258,
"step": 16750
},
{
"epoch": 1.065786969485504,
"grad_norm": 0.29376596212387085,
"learning_rate": 3.2242170483621985e-05,
"loss": 0.3239,
"step": 16800
},
{
"epoch": 1.0689589545137348,
"grad_norm": 0.25620236992836,
"learning_rate": 3.218930406648481e-05,
"loss": 0.2619,
"step": 16850
},
{
"epoch": 1.0721309395419654,
"grad_norm": 0.30874136090278625,
"learning_rate": 3.213643764934763e-05,
"loss": 0.2822,
"step": 16900
},
{
"epoch": 1.075302924570196,
"grad_norm": 0.3903138041496277,
"learning_rate": 3.208357123221045e-05,
"loss": 0.2874,
"step": 16950
},
{
"epoch": 1.0784749095984267,
"grad_norm": 0.22503992915153503,
"learning_rate": 3.203070481507328e-05,
"loss": 0.2311,
"step": 17000
},
{
"epoch": 1.0816468946266573,
"grad_norm": 0.17416654527187347,
"learning_rate": 3.1977838397936093e-05,
"loss": 0.2739,
"step": 17050
},
{
"epoch": 1.084818879654888,
"grad_norm": 0.2830020487308502,
"learning_rate": 3.1924971980798915e-05,
"loss": 0.3134,
"step": 17100
},
{
"epoch": 1.0879908646831187,
"grad_norm": 0.3325769305229187,
"learning_rate": 3.1872105563661744e-05,
"loss": 0.2577,
"step": 17150
},
{
"epoch": 1.0911628497113495,
"grad_norm": 0.2352118343114853,
"learning_rate": 3.1819239146524566e-05,
"loss": 0.2406,
"step": 17200
},
{
"epoch": 1.09433483473958,
"grad_norm": 0.27199751138687134,
"learning_rate": 3.176637272938738e-05,
"loss": 0.3044,
"step": 17250
},
{
"epoch": 1.0975068197678106,
"grad_norm": 0.28306007385253906,
"learning_rate": 3.171350631225021e-05,
"loss": 0.2881,
"step": 17300
},
{
"epoch": 1.1006788047960414,
"grad_norm": 0.22405964136123657,
"learning_rate": 3.166063989511303e-05,
"loss": 0.2771,
"step": 17350
},
{
"epoch": 1.103850789824272,
"grad_norm": 0.5038449764251709,
"learning_rate": 3.160777347797585e-05,
"loss": 0.2575,
"step": 17400
},
{
"epoch": 1.1070227748525028,
"grad_norm": 0.23774085938930511,
"learning_rate": 3.155596438918142e-05,
"loss": 0.3377,
"step": 17450
},
{
"epoch": 1.1101947598807334,
"grad_norm": 0.3367967903614044,
"learning_rate": 3.1503097972044234e-05,
"loss": 0.2936,
"step": 17500
},
{
"epoch": 1.113366744908964,
"grad_norm": 0.7372679710388184,
"learning_rate": 3.145023155490706e-05,
"loss": 0.285,
"step": 17550
},
{
"epoch": 1.1165387299371947,
"grad_norm": 0.23422600328922272,
"learning_rate": 3.1397365137769885e-05,
"loss": 0.2332,
"step": 17600
},
{
"epoch": 1.1197107149654253,
"grad_norm": 0.2483871728181839,
"learning_rate": 3.1344498720632706e-05,
"loss": 0.3047,
"step": 17650
},
{
"epoch": 1.122882699993656,
"grad_norm": 0.3678695261478424,
"learning_rate": 3.129163230349553e-05,
"loss": 0.2943,
"step": 17700
},
{
"epoch": 1.1260546850218867,
"grad_norm": 0.3198718726634979,
"learning_rate": 3.123876588635835e-05,
"loss": 0.2797,
"step": 17750
},
{
"epoch": 1.1292266700501175,
"grad_norm": 0.17824482917785645,
"learning_rate": 3.118589946922117e-05,
"loss": 0.2806,
"step": 17800
},
{
"epoch": 1.132398655078348,
"grad_norm": 0.20436514914035797,
"learning_rate": 3.1133033052083993e-05,
"loss": 0.3234,
"step": 17850
},
{
"epoch": 1.1355706401065788,
"grad_norm": 0.28306111693382263,
"learning_rate": 3.1080166634946815e-05,
"loss": 0.2955,
"step": 17900
},
{
"epoch": 1.1387426251348094,
"grad_norm": 0.2912297546863556,
"learning_rate": 3.1027300217809644e-05,
"loss": 0.2773,
"step": 17950
},
{
"epoch": 1.14191461016304,
"grad_norm": 0.41615915298461914,
"learning_rate": 3.0974433800672465e-05,
"loss": 0.2689,
"step": 18000
},
{
"epoch": 1.1450865951912708,
"grad_norm": 0.2598041594028473,
"learning_rate": 3.092156738353528e-05,
"loss": 0.269,
"step": 18050
},
{
"epoch": 1.1482585802195013,
"grad_norm": 0.19208338856697083,
"learning_rate": 3.086870096639811e-05,
"loss": 0.3644,
"step": 18100
},
{
"epoch": 1.1514305652477321,
"grad_norm": 0.36915165185928345,
"learning_rate": 3.081583454926093e-05,
"loss": 0.3754,
"step": 18150
},
{
"epoch": 1.1546025502759627,
"grad_norm": 0.2906833589076996,
"learning_rate": 3.076296813212375e-05,
"loss": 0.3004,
"step": 18200
},
{
"epoch": 1.1577745353041933,
"grad_norm": 0.27490586042404175,
"learning_rate": 3.0710101714986574e-05,
"loss": 0.2476,
"step": 18250
},
{
"epoch": 1.160946520332424,
"grad_norm": 0.2721092998981476,
"learning_rate": 3.0657235297849396e-05,
"loss": 0.2994,
"step": 18300
},
{
"epoch": 1.1641185053606546,
"grad_norm": 0.5216304063796997,
"learning_rate": 3.060436888071222e-05,
"loss": 0.2848,
"step": 18350
},
{
"epoch": 1.1672904903888854,
"grad_norm": 0.2627362012863159,
"learning_rate": 3.055150246357504e-05,
"loss": 0.2537,
"step": 18400
},
{
"epoch": 1.170462475417116,
"grad_norm": 0.7663968205451965,
"learning_rate": 3.049863604643786e-05,
"loss": 0.2894,
"step": 18450
},
{
"epoch": 1.1736344604453466,
"grad_norm": 0.2766590714454651,
"learning_rate": 3.0445769629300686e-05,
"loss": 0.3203,
"step": 18500
},
{
"epoch": 1.1768064454735774,
"grad_norm": 0.37423521280288696,
"learning_rate": 3.0392903212163508e-05,
"loss": 0.2978,
"step": 18550
},
{
"epoch": 1.179978430501808,
"grad_norm": 0.3937060534954071,
"learning_rate": 3.0340036795026326e-05,
"loss": 0.3268,
"step": 18600
},
{
"epoch": 1.1831504155300387,
"grad_norm": 0.3597530722618103,
"learning_rate": 3.028717037788915e-05,
"loss": 0.2737,
"step": 18650
},
{
"epoch": 1.1863224005582693,
"grad_norm": 0.3743630349636078,
"learning_rate": 3.0234303960751977e-05,
"loss": 0.302,
"step": 18700
},
{
"epoch": 1.1894943855865001,
"grad_norm": 0.2796330749988556,
"learning_rate": 3.0181437543614795e-05,
"loss": 0.2937,
"step": 18750
},
{
"epoch": 1.1926663706147307,
"grad_norm": 0.2742915749549866,
"learning_rate": 3.0128571126477617e-05,
"loss": 0.3305,
"step": 18800
},
{
"epoch": 1.1958383556429615,
"grad_norm": 0.29744336009025574,
"learning_rate": 3.0075704709340442e-05,
"loss": 0.2896,
"step": 18850
},
{
"epoch": 1.199010340671192,
"grad_norm": 0.2520214319229126,
"learning_rate": 3.0022838292203264e-05,
"loss": 0.2732,
"step": 18900
},
{
"epoch": 1.2021823256994226,
"grad_norm": 0.2396412491798401,
"learning_rate": 2.9969971875066082e-05,
"loss": 0.3194,
"step": 18950
},
{
"epoch": 1.2053543107276534,
"grad_norm": 0.42488738894462585,
"learning_rate": 2.9917105457928907e-05,
"loss": 0.2837,
"step": 19000
},
{
"epoch": 1.208526295755884,
"grad_norm": 0.21764253079891205,
"learning_rate": 2.986423904079173e-05,
"loss": 0.2955,
"step": 19050
},
{
"epoch": 1.2116982807841148,
"grad_norm": 0.4629133641719818,
"learning_rate": 2.9811372623654547e-05,
"loss": 0.2466,
"step": 19100
},
{
"epoch": 1.2148702658123454,
"grad_norm": 0.2966591417789459,
"learning_rate": 2.9758506206517372e-05,
"loss": 0.3033,
"step": 19150
},
{
"epoch": 1.218042250840576,
"grad_norm": 0.42917561531066895,
"learning_rate": 2.9705639789380198e-05,
"loss": 0.3201,
"step": 19200
},
{
"epoch": 1.2212142358688067,
"grad_norm": 0.24894292652606964,
"learning_rate": 2.965277337224302e-05,
"loss": 0.2464,
"step": 19250
},
{
"epoch": 1.2243862208970373,
"grad_norm": 0.40641218423843384,
"learning_rate": 2.9599906955105838e-05,
"loss": 0.2771,
"step": 19300
},
{
"epoch": 1.227558205925268,
"grad_norm": 0.36876288056373596,
"learning_rate": 2.9547040537968663e-05,
"loss": 0.2744,
"step": 19350
},
{
"epoch": 1.2307301909534987,
"grad_norm": 0.6306925415992737,
"learning_rate": 2.9494174120831485e-05,
"loss": 0.2859,
"step": 19400
},
{
"epoch": 1.2339021759817292,
"grad_norm": 0.17763349413871765,
"learning_rate": 2.9441307703694303e-05,
"loss": 0.3406,
"step": 19450
},
{
"epoch": 1.23707416100996,
"grad_norm": 0.3222569525241852,
"learning_rate": 2.9388441286557128e-05,
"loss": 0.286,
"step": 19500
},
{
"epoch": 1.2402461460381906,
"grad_norm": 0.22196908295154572,
"learning_rate": 2.9335574869419953e-05,
"loss": 0.2585,
"step": 19550
},
{
"epoch": 1.2434181310664214,
"grad_norm": 0.29623332619667053,
"learning_rate": 2.9282708452282775e-05,
"loss": 0.292,
"step": 19600
},
{
"epoch": 1.246590116094652,
"grad_norm": 0.2855692207813263,
"learning_rate": 2.9229842035145593e-05,
"loss": 0.2804,
"step": 19650
},
{
"epoch": 1.2497621011228828,
"grad_norm": 0.3012256920337677,
"learning_rate": 2.917697561800842e-05,
"loss": 0.2678,
"step": 19700
},
{
"epoch": 1.2529340861511133,
"grad_norm": 0.43054285645484924,
"learning_rate": 2.912410920087124e-05,
"loss": 0.3117,
"step": 19750
},
{
"epoch": 1.2561060711793441,
"grad_norm": 0.2894386351108551,
"learning_rate": 2.907124278373406e-05,
"loss": 0.2846,
"step": 19800
},
{
"epoch": 1.2592780562075747,
"grad_norm": 0.7129951119422913,
"learning_rate": 2.9018376366596884e-05,
"loss": 0.3188,
"step": 19850
},
{
"epoch": 1.2624500412358053,
"grad_norm": 0.2086195945739746,
"learning_rate": 2.8965509949459705e-05,
"loss": 0.267,
"step": 19900
},
{
"epoch": 1.265622026264036,
"grad_norm": 0.15617908537387848,
"learning_rate": 2.891264353232253e-05,
"loss": 0.2682,
"step": 19950
},
{
"epoch": 1.2687940112922667,
"grad_norm": 0.22346539795398712,
"learning_rate": 2.885977711518535e-05,
"loss": 0.3291,
"step": 20000
},
{
"epoch": 1.2719659963204974,
"grad_norm": 0.20272932946681976,
"learning_rate": 2.8806910698048174e-05,
"loss": 0.2848,
"step": 20050
},
{
"epoch": 1.275137981348728,
"grad_norm": 0.5262783169746399,
"learning_rate": 2.8754044280910996e-05,
"loss": 0.3224,
"step": 20100
},
{
"epoch": 1.2783099663769586,
"grad_norm": 0.28421077132225037,
"learning_rate": 2.870117786377382e-05,
"loss": 0.2628,
"step": 20150
},
{
"epoch": 1.2814819514051894,
"grad_norm": 0.29966121912002563,
"learning_rate": 2.864831144663664e-05,
"loss": 0.3174,
"step": 20200
},
{
"epoch": 1.28465393643342,
"grad_norm": 0.41417014598846436,
"learning_rate": 2.859544502949946e-05,
"loss": 0.2971,
"step": 20250
},
{
"epoch": 1.2878259214616508,
"grad_norm": 0.2396809309720993,
"learning_rate": 2.8542578612362286e-05,
"loss": 0.2744,
"step": 20300
},
{
"epoch": 1.2909979064898813,
"grad_norm": 0.29226428270339966,
"learning_rate": 2.8489712195225105e-05,
"loss": 0.3256,
"step": 20350
},
{
"epoch": 1.294169891518112,
"grad_norm": 0.27770760655403137,
"learning_rate": 2.843684577808793e-05,
"loss": 0.3271,
"step": 20400
},
{
"epoch": 1.2973418765463427,
"grad_norm": 0.34558218717575073,
"learning_rate": 2.838397936095075e-05,
"loss": 0.2438,
"step": 20450
},
{
"epoch": 1.3005138615745735,
"grad_norm": 0.1685953289270401,
"learning_rate": 2.8331112943813577e-05,
"loss": 0.2701,
"step": 20500
},
{
"epoch": 1.303685846602804,
"grad_norm": 0.2469525784254074,
"learning_rate": 2.8278246526676395e-05,
"loss": 0.2622,
"step": 20550
},
{
"epoch": 1.3068578316310346,
"grad_norm": 0.17005576193332672,
"learning_rate": 2.8225380109539217e-05,
"loss": 0.2992,
"step": 20600
},
{
"epoch": 1.3100298166592654,
"grad_norm": 0.30128028988838196,
"learning_rate": 2.8172513692402042e-05,
"loss": 0.3352,
"step": 20650
},
{
"epoch": 1.313201801687496,
"grad_norm": 0.6207164525985718,
"learning_rate": 2.811964727526486e-05,
"loss": 0.2747,
"step": 20700
},
{
"epoch": 1.3163737867157268,
"grad_norm": 0.21769997477531433,
"learning_rate": 2.8066780858127685e-05,
"loss": 0.2948,
"step": 20750
},
{
"epoch": 1.3195457717439574,
"grad_norm": 0.4176817238330841,
"learning_rate": 2.801497176933325e-05,
"loss": 0.2682,
"step": 20800
},
{
"epoch": 1.322717756772188,
"grad_norm": 0.2459891140460968,
"learning_rate": 2.796210535219607e-05,
"loss": 0.2669,
"step": 20850
},
{
"epoch": 1.3258897418004187,
"grad_norm": 0.3456882834434509,
"learning_rate": 2.7909238935058896e-05,
"loss": 0.3307,
"step": 20900
},
{
"epoch": 1.3290617268286493,
"grad_norm": 0.2911352813243866,
"learning_rate": 2.785637251792172e-05,
"loss": 0.2304,
"step": 20950
},
{
"epoch": 1.33223371185688,
"grad_norm": 0.620798647403717,
"learning_rate": 2.780350610078454e-05,
"loss": 0.3607,
"step": 21000
},
{
"epoch": 1.3354056968851107,
"grad_norm": 0.22824439406394958,
"learning_rate": 2.775063968364736e-05,
"loss": 0.3092,
"step": 21050
},
{
"epoch": 1.3385776819133413,
"grad_norm": 0.22648726403713226,
"learning_rate": 2.7697773266510186e-05,
"loss": 0.3063,
"step": 21100
},
{
"epoch": 1.341749666941572,
"grad_norm": 0.698406457901001,
"learning_rate": 2.7644906849373004e-05,
"loss": 0.2693,
"step": 21150
},
{
"epoch": 1.3449216519698026,
"grad_norm": 0.351519912481308,
"learning_rate": 2.7592040432235826e-05,
"loss": 0.2796,
"step": 21200
},
{
"epoch": 1.3480936369980334,
"grad_norm": 0.346582293510437,
"learning_rate": 2.753917401509865e-05,
"loss": 0.3109,
"step": 21250
},
{
"epoch": 1.351265622026264,
"grad_norm": 0.22138628363609314,
"learning_rate": 2.7486307597961477e-05,
"loss": 0.2739,
"step": 21300
},
{
"epoch": 1.3544376070544948,
"grad_norm": 0.24700744450092316,
"learning_rate": 2.743344118082429e-05,
"loss": 0.2661,
"step": 21350
},
{
"epoch": 1.3576095920827254,
"grad_norm": 0.5413603186607361,
"learning_rate": 2.7380574763687117e-05,
"loss": 0.2772,
"step": 21400
},
{
"epoch": 1.3607815771109562,
"grad_norm": 0.36633920669555664,
"learning_rate": 2.7327708346549942e-05,
"loss": 0.2918,
"step": 21450
},
{
"epoch": 1.3639535621391867,
"grad_norm": 0.4693305194377899,
"learning_rate": 2.727484192941276e-05,
"loss": 0.3215,
"step": 21500
},
{
"epoch": 1.3671255471674173,
"grad_norm": 0.5809823870658875,
"learning_rate": 2.7221975512275582e-05,
"loss": 0.2846,
"step": 21550
},
{
"epoch": 1.370297532195648,
"grad_norm": 0.5905492305755615,
"learning_rate": 2.7169109095138407e-05,
"loss": 0.3345,
"step": 21600
},
{
"epoch": 1.3734695172238787,
"grad_norm": 0.29343274235725403,
"learning_rate": 2.711624267800123e-05,
"loss": 0.2596,
"step": 21650
},
{
"epoch": 1.3766415022521095,
"grad_norm": 0.4964137375354767,
"learning_rate": 2.7063376260864047e-05,
"loss": 0.3203,
"step": 21700
},
{
"epoch": 1.37981348728034,
"grad_norm": 0.43368765711784363,
"learning_rate": 2.7010509843726872e-05,
"loss": 0.2402,
"step": 21750
},
{
"epoch": 1.3829854723085706,
"grad_norm": 0.6017479300498962,
"learning_rate": 2.6957643426589697e-05,
"loss": 0.3013,
"step": 21800
},
{
"epoch": 1.3861574573368014,
"grad_norm": 0.29252320528030396,
"learning_rate": 2.6904777009452516e-05,
"loss": 0.3873,
"step": 21850
},
{
"epoch": 1.389329442365032,
"grad_norm": 0.309181809425354,
"learning_rate": 2.6851910592315337e-05,
"loss": 0.2836,
"step": 21900
},
{
"epoch": 1.3925014273932628,
"grad_norm": 0.43791621923446655,
"learning_rate": 2.6799044175178163e-05,
"loss": 0.3525,
"step": 21950
},
{
"epoch": 1.3956734124214933,
"grad_norm": 0.2919745147228241,
"learning_rate": 2.6746177758040984e-05,
"loss": 0.2974,
"step": 22000
},
{
"epoch": 1.398845397449724,
"grad_norm": 0.3903926908969879,
"learning_rate": 2.6693311340903803e-05,
"loss": 0.2992,
"step": 22050
},
{
"epoch": 1.4020173824779547,
"grad_norm": 0.3180385231971741,
"learning_rate": 2.6640444923766628e-05,
"loss": 0.2865,
"step": 22100
},
{
"epoch": 1.4051893675061855,
"grad_norm": 0.36164039373397827,
"learning_rate": 2.6587578506629453e-05,
"loss": 0.2636,
"step": 22150
},
{
"epoch": 1.408361352534416,
"grad_norm": 0.6932289004325867,
"learning_rate": 2.6534712089492268e-05,
"loss": 0.2933,
"step": 22200
},
{
"epoch": 1.4115333375626467,
"grad_norm": 0.21415837109088898,
"learning_rate": 2.6481845672355093e-05,
"loss": 0.2722,
"step": 22250
},
{
"epoch": 1.4147053225908774,
"grad_norm": 0.27378618717193604,
"learning_rate": 2.6428979255217918e-05,
"loss": 0.3441,
"step": 22300
},
{
"epoch": 1.417877307619108,
"grad_norm": 0.2832282483577728,
"learning_rate": 2.637611283808074e-05,
"loss": 0.2735,
"step": 22350
},
{
"epoch": 1.4210492926473388,
"grad_norm": 0.3578130006790161,
"learning_rate": 2.632324642094356e-05,
"loss": 0.283,
"step": 22400
},
{
"epoch": 1.4242212776755694,
"grad_norm": 0.49478858709335327,
"learning_rate": 2.6270380003806383e-05,
"loss": 0.3103,
"step": 22450
},
{
"epoch": 1.4273932627038,
"grad_norm": 0.4549751579761505,
"learning_rate": 2.6217513586669205e-05,
"loss": 0.2965,
"step": 22500
},
{
"epoch": 1.4305652477320308,
"grad_norm": 0.24857546389102936,
"learning_rate": 2.6164647169532024e-05,
"loss": 0.3054,
"step": 22550
},
{
"epoch": 1.4337372327602613,
"grad_norm": 0.35119330883026123,
"learning_rate": 2.611178075239485e-05,
"loss": 0.2559,
"step": 22600
},
{
"epoch": 1.4369092177884921,
"grad_norm": 0.41354435682296753,
"learning_rate": 2.6058914335257674e-05,
"loss": 0.2945,
"step": 22650
},
{
"epoch": 1.4400812028167227,
"grad_norm": 0.22190292179584503,
"learning_rate": 2.6006047918120496e-05,
"loss": 0.2905,
"step": 22700
},
{
"epoch": 1.4432531878449533,
"grad_norm": 0.1551959365606308,
"learning_rate": 2.5953181500983314e-05,
"loss": 0.2435,
"step": 22750
},
{
"epoch": 1.446425172873184,
"grad_norm": 0.7655497193336487,
"learning_rate": 2.590031508384614e-05,
"loss": 0.3264,
"step": 22800
},
{
"epoch": 1.4495971579014146,
"grad_norm": 0.5172088742256165,
"learning_rate": 2.584744866670896e-05,
"loss": 0.2764,
"step": 22850
},
{
"epoch": 1.4527691429296454,
"grad_norm": 0.24481894075870514,
"learning_rate": 2.5794582249571786e-05,
"loss": 0.3019,
"step": 22900
},
{
"epoch": 1.455941127957876,
"grad_norm": 0.5899595618247986,
"learning_rate": 2.5741715832434604e-05,
"loss": 0.3086,
"step": 22950
},
{
"epoch": 1.4591131129861066,
"grad_norm": 0.28628554940223694,
"learning_rate": 2.568884941529743e-05,
"loss": 0.2766,
"step": 23000
},
{
"epoch": 1.4622850980143374,
"grad_norm": 0.3063284158706665,
"learning_rate": 2.563598299816025e-05,
"loss": 0.2854,
"step": 23050
},
{
"epoch": 1.4654570830425682,
"grad_norm": 0.23942221701145172,
"learning_rate": 2.558311658102307e-05,
"loss": 0.2577,
"step": 23100
},
{
"epoch": 1.4686290680707987,
"grad_norm": 0.17220012843608856,
"learning_rate": 2.5530250163885895e-05,
"loss": 0.2941,
"step": 23150
},
{
"epoch": 1.4718010530990293,
"grad_norm": 0.6850319504737854,
"learning_rate": 2.5477383746748716e-05,
"loss": 0.274,
"step": 23200
},
{
"epoch": 1.47497303812726,
"grad_norm": 0.34509897232055664,
"learning_rate": 2.542451732961154e-05,
"loss": 0.3397,
"step": 23250
},
{
"epoch": 1.4781450231554907,
"grad_norm": 0.5132359266281128,
"learning_rate": 2.537165091247436e-05,
"loss": 0.274,
"step": 23300
},
{
"epoch": 1.4813170081837215,
"grad_norm": 0.34320104122161865,
"learning_rate": 2.5318784495337182e-05,
"loss": 0.2837,
"step": 23350
},
{
"epoch": 1.484488993211952,
"grad_norm": 0.3969442844390869,
"learning_rate": 2.5265918078200007e-05,
"loss": 0.3049,
"step": 23400
},
{
"epoch": 1.4876609782401826,
"grad_norm": 0.25766682624816895,
"learning_rate": 2.5213051661062825e-05,
"loss": 0.3315,
"step": 23450
},
{
"epoch": 1.4908329632684134,
"grad_norm": 0.1973307579755783,
"learning_rate": 2.516018524392565e-05,
"loss": 0.3203,
"step": 23500
},
{
"epoch": 1.494004948296644,
"grad_norm": 0.5415976643562317,
"learning_rate": 2.5107318826788472e-05,
"loss": 0.3044,
"step": 23550
},
{
"epoch": 1.4971769333248748,
"grad_norm": 0.3590141236782074,
"learning_rate": 2.5054452409651297e-05,
"loss": 0.2836,
"step": 23600
},
{
"epoch": 1.5003489183531054,
"grad_norm": 0.3506677448749542,
"learning_rate": 2.5001585992514116e-05,
"loss": 0.262,
"step": 23650
},
{
"epoch": 1.503520903381336,
"grad_norm": 0.26054617762565613,
"learning_rate": 2.4948719575376937e-05,
"loss": 0.2966,
"step": 23700
},
{
"epoch": 1.5066928884095667,
"grad_norm": 0.18887007236480713,
"learning_rate": 2.489585315823976e-05,
"loss": 0.2243,
"step": 23750
},
{
"epoch": 1.5098648734377975,
"grad_norm": 0.4273685812950134,
"learning_rate": 2.4842986741102584e-05,
"loss": 0.2944,
"step": 23800
},
{
"epoch": 1.513036858466028,
"grad_norm": 0.27183377742767334,
"learning_rate": 2.4790120323965406e-05,
"loss": 0.2445,
"step": 23850
},
{
"epoch": 1.5162088434942587,
"grad_norm": 0.28723788261413574,
"learning_rate": 2.4737253906828228e-05,
"loss": 0.2884,
"step": 23900
},
{
"epoch": 1.5193808285224892,
"grad_norm": 0.3014012575149536,
"learning_rate": 2.468544481803379e-05,
"loss": 0.2676,
"step": 23950
},
{
"epoch": 1.52255281355072,
"grad_norm": 0.4594823122024536,
"learning_rate": 2.4632578400896616e-05,
"loss": 0.2874,
"step": 24000
},
{
"epoch": 1.5257247985789508,
"grad_norm": 0.17278143763542175,
"learning_rate": 2.4579711983759438e-05,
"loss": 0.2959,
"step": 24050
},
{
"epoch": 1.5288967836071814,
"grad_norm": 0.2626342177391052,
"learning_rate": 2.452684556662226e-05,
"loss": 0.3108,
"step": 24100
},
{
"epoch": 1.532068768635412,
"grad_norm": 0.42795270681381226,
"learning_rate": 2.447397914948508e-05,
"loss": 0.3011,
"step": 24150
},
{
"epoch": 1.5352407536636425,
"grad_norm": 0.331232488155365,
"learning_rate": 2.4421112732347903e-05,
"loss": 0.3191,
"step": 24200
},
{
"epoch": 1.5384127386918733,
"grad_norm": 0.5219537019729614,
"learning_rate": 2.436824631521073e-05,
"loss": 0.3303,
"step": 24250
},
{
"epoch": 1.5415847237201041,
"grad_norm": 0.2795136272907257,
"learning_rate": 2.4315379898073547e-05,
"loss": 0.2567,
"step": 24300
},
{
"epoch": 1.5447567087483347,
"grad_norm": 0.20256808400154114,
"learning_rate": 2.4262513480936372e-05,
"loss": 0.3279,
"step": 24350
},
{
"epoch": 1.5479286937765653,
"grad_norm": 0.6803138852119446,
"learning_rate": 2.4209647063799194e-05,
"loss": 0.2965,
"step": 24400
},
{
"epoch": 1.551100678804796,
"grad_norm": 0.3239715099334717,
"learning_rate": 2.4156780646662016e-05,
"loss": 0.2702,
"step": 24450
},
{
"epoch": 1.5542726638330269,
"grad_norm": 0.26858869194984436,
"learning_rate": 2.4103914229524837e-05,
"loss": 0.2552,
"step": 24500
},
{
"epoch": 1.5574446488612574,
"grad_norm": 0.4535232186317444,
"learning_rate": 2.405104781238766e-05,
"loss": 0.3275,
"step": 24550
},
{
"epoch": 1.560616633889488,
"grad_norm": 0.4622326195240021,
"learning_rate": 2.3998181395250484e-05,
"loss": 0.2784,
"step": 24600
},
{
"epoch": 1.5637886189177186,
"grad_norm": 0.26528090238571167,
"learning_rate": 2.3945314978113303e-05,
"loss": 0.3231,
"step": 24650
},
{
"epoch": 1.5669606039459494,
"grad_norm": 0.20564743876457214,
"learning_rate": 2.3892448560976128e-05,
"loss": 0.2765,
"step": 24700
},
{
"epoch": 1.5701325889741802,
"grad_norm": 0.45677173137664795,
"learning_rate": 2.383958214383895e-05,
"loss": 0.3207,
"step": 24750
},
{
"epoch": 1.5733045740024107,
"grad_norm": 0.3483443558216095,
"learning_rate": 2.378671572670177e-05,
"loss": 0.2698,
"step": 24800
},
{
"epoch": 1.5764765590306413,
"grad_norm": 0.44952449202537537,
"learning_rate": 2.3733849309564593e-05,
"loss": 0.2678,
"step": 24850
},
{
"epoch": 1.579648544058872,
"grad_norm": 0.32127121090888977,
"learning_rate": 2.3680982892427418e-05,
"loss": 0.2802,
"step": 24900
},
{
"epoch": 1.5828205290871027,
"grad_norm": 0.44746747612953186,
"learning_rate": 2.3628116475290236e-05,
"loss": 0.24,
"step": 24950
},
{
"epoch": 1.5859925141153335,
"grad_norm": 0.8728600740432739,
"learning_rate": 2.3575250058153058e-05,
"loss": 0.2903,
"step": 25000
},
{
"epoch": 1.589164499143564,
"grad_norm": 0.4112453758716583,
"learning_rate": 2.3522383641015883e-05,
"loss": 0.2781,
"step": 25050
},
{
"epoch": 1.5923364841717946,
"grad_norm": 0.5081580877304077,
"learning_rate": 2.3469517223878705e-05,
"loss": 0.2689,
"step": 25100
},
{
"epoch": 1.5955084692000254,
"grad_norm": 0.42527180910110474,
"learning_rate": 2.3416650806741527e-05,
"loss": 0.2657,
"step": 25150
},
{
"epoch": 1.598680454228256,
"grad_norm": 0.30899807810783386,
"learning_rate": 2.336378438960435e-05,
"loss": 0.2723,
"step": 25200
},
{
"epoch": 1.6018524392564868,
"grad_norm": 0.276732861995697,
"learning_rate": 2.3310917972467174e-05,
"loss": 0.2771,
"step": 25250
},
{
"epoch": 1.6050244242847174,
"grad_norm": 0.34849727153778076,
"learning_rate": 2.3258051555329992e-05,
"loss": 0.2287,
"step": 25300
},
{
"epoch": 1.608196409312948,
"grad_norm": 0.30580082535743713,
"learning_rate": 2.3205185138192814e-05,
"loss": 0.2579,
"step": 25350
},
{
"epoch": 1.6113683943411787,
"grad_norm": 0.21243813633918762,
"learning_rate": 2.315231872105564e-05,
"loss": 0.2729,
"step": 25400
},
{
"epoch": 1.6145403793694095,
"grad_norm": 0.3976793885231018,
"learning_rate": 2.309945230391846e-05,
"loss": 0.3249,
"step": 25450
},
{
"epoch": 1.61771236439764,
"grad_norm": 0.3687296211719513,
"learning_rate": 2.3046585886781282e-05,
"loss": 0.2978,
"step": 25500
},
{
"epoch": 1.6208843494258707,
"grad_norm": 0.6395165324211121,
"learning_rate": 2.2993719469644104e-05,
"loss": 0.2604,
"step": 25550
},
{
"epoch": 1.6240563344541012,
"grad_norm": 0.31490978598594666,
"learning_rate": 2.294085305250693e-05,
"loss": 0.3259,
"step": 25600
},
{
"epoch": 1.627228319482332,
"grad_norm": 0.3745858669281006,
"learning_rate": 2.2887986635369748e-05,
"loss": 0.2989,
"step": 25650
},
{
"epoch": 1.6304003045105628,
"grad_norm": 0.2982928156852722,
"learning_rate": 2.2835120218232573e-05,
"loss": 0.2532,
"step": 25700
},
{
"epoch": 1.6335722895387934,
"grad_norm": 0.28931793570518494,
"learning_rate": 2.2782253801095395e-05,
"loss": 0.2652,
"step": 25750
},
{
"epoch": 1.636744274567024,
"grad_norm": 0.27391621470451355,
"learning_rate": 2.2729387383958213e-05,
"loss": 0.2633,
"step": 25800
},
{
"epoch": 1.6399162595952546,
"grad_norm": 0.32411888241767883,
"learning_rate": 2.2676520966821038e-05,
"loss": 0.258,
"step": 25850
},
{
"epoch": 1.6430882446234854,
"grad_norm": 0.12455958873033524,
"learning_rate": 2.262365454968386e-05,
"loss": 0.2943,
"step": 25900
},
{
"epoch": 1.6462602296517161,
"grad_norm": 0.4207943081855774,
"learning_rate": 2.257078813254668e-05,
"loss": 0.3091,
"step": 25950
},
{
"epoch": 1.6494322146799467,
"grad_norm": 0.23534472286701202,
"learning_rate": 2.2517921715409503e-05,
"loss": 0.2906,
"step": 26000
},
{
"epoch": 1.6526041997081773,
"grad_norm": 0.42088547348976135,
"learning_rate": 2.246505529827233e-05,
"loss": 0.3157,
"step": 26050
},
{
"epoch": 1.655776184736408,
"grad_norm": 0.18945495784282684,
"learning_rate": 2.241218888113515e-05,
"loss": 0.2658,
"step": 26100
},
{
"epoch": 1.6589481697646387,
"grad_norm": 0.19314059615135193,
"learning_rate": 2.235932246399797e-05,
"loss": 0.303,
"step": 26150
},
{
"epoch": 1.6621201547928695,
"grad_norm": 0.6903896927833557,
"learning_rate": 2.2307513375203535e-05,
"loss": 0.315,
"step": 26200
},
{
"epoch": 1.6652921398211,
"grad_norm": 0.4477178752422333,
"learning_rate": 2.2254646958066357e-05,
"loss": 0.2688,
"step": 26250
},
{
"epoch": 1.6684641248493306,
"grad_norm": 0.3560877740383148,
"learning_rate": 2.2201780540929182e-05,
"loss": 0.2728,
"step": 26300
},
{
"epoch": 1.6716361098775614,
"grad_norm": 0.29471373558044434,
"learning_rate": 2.2148914123792004e-05,
"loss": 0.2232,
"step": 26350
},
{
"epoch": 1.6748080949057922,
"grad_norm": 0.3013167381286621,
"learning_rate": 2.2096047706654826e-05,
"loss": 0.3367,
"step": 26400
},
{
"epoch": 1.6779800799340228,
"grad_norm": 0.30516213178634644,
"learning_rate": 2.2043181289517648e-05,
"loss": 0.2794,
"step": 26450
},
{
"epoch": 1.6811520649622533,
"grad_norm": 0.30091190338134766,
"learning_rate": 2.1990314872380473e-05,
"loss": 0.2783,
"step": 26500
},
{
"epoch": 1.684324049990484,
"grad_norm": 0.5126471519470215,
"learning_rate": 2.193744845524329e-05,
"loss": 0.3011,
"step": 26550
},
{
"epoch": 1.6874960350187147,
"grad_norm": 0.3148995041847229,
"learning_rate": 2.1884582038106113e-05,
"loss": 0.2673,
"step": 26600
},
{
"epoch": 1.6906680200469455,
"grad_norm": 0.5630244016647339,
"learning_rate": 2.1831715620968938e-05,
"loss": 0.3006,
"step": 26650
},
{
"epoch": 1.693840005075176,
"grad_norm": 0.23608249425888062,
"learning_rate": 2.177884920383176e-05,
"loss": 0.2932,
"step": 26700
},
{
"epoch": 1.6970119901034066,
"grad_norm": 0.5582406520843506,
"learning_rate": 2.172598278669458e-05,
"loss": 0.2894,
"step": 26750
},
{
"epoch": 1.7001839751316372,
"grad_norm": 0.6372901797294617,
"learning_rate": 2.1673116369557403e-05,
"loss": 0.2811,
"step": 26800
},
{
"epoch": 1.703355960159868,
"grad_norm": 0.45520493388175964,
"learning_rate": 2.162024995242023e-05,
"loss": 0.2169,
"step": 26850
},
{
"epoch": 1.7065279451880988,
"grad_norm": 0.3398955762386322,
"learning_rate": 2.1567383535283047e-05,
"loss": 0.2982,
"step": 26900
},
{
"epoch": 1.7096999302163294,
"grad_norm": 0.8106810450553894,
"learning_rate": 2.151451711814587e-05,
"loss": 0.3525,
"step": 26950
},
{
"epoch": 1.71287191524456,
"grad_norm": 0.2512779235839844,
"learning_rate": 2.1461650701008694e-05,
"loss": 0.2731,
"step": 27000
},
{
"epoch": 1.7160439002727907,
"grad_norm": 0.4722442924976349,
"learning_rate": 2.1408784283871512e-05,
"loss": 0.2725,
"step": 27050
},
{
"epoch": 1.7192158853010215,
"grad_norm": 0.27375251054763794,
"learning_rate": 2.1355917866734337e-05,
"loss": 0.3077,
"step": 27100
},
{
"epoch": 1.7223878703292521,
"grad_norm": 0.9621772170066833,
"learning_rate": 2.130305144959716e-05,
"loss": 0.3281,
"step": 27150
},
{
"epoch": 1.7255598553574827,
"grad_norm": 0.285846084356308,
"learning_rate": 2.125018503245998e-05,
"loss": 0.301,
"step": 27200
},
{
"epoch": 1.7287318403857133,
"grad_norm": 0.40346983075141907,
"learning_rate": 2.1197318615322802e-05,
"loss": 0.2653,
"step": 27250
},
{
"epoch": 1.731903825413944,
"grad_norm": 0.4507441818714142,
"learning_rate": 2.1144452198185627e-05,
"loss": 0.2625,
"step": 27300
},
{
"epoch": 1.7350758104421748,
"grad_norm": 0.4275396168231964,
"learning_rate": 2.109158578104845e-05,
"loss": 0.2581,
"step": 27350
},
{
"epoch": 1.7382477954704054,
"grad_norm": 0.38401782512664795,
"learning_rate": 2.1038719363911268e-05,
"loss": 0.2486,
"step": 27400
},
{
"epoch": 1.741419780498636,
"grad_norm": 0.22168871760368347,
"learning_rate": 2.0985852946774093e-05,
"loss": 0.2556,
"step": 27450
},
{
"epoch": 1.7445917655268666,
"grad_norm": 0.3157811462879181,
"learning_rate": 2.0932986529636914e-05,
"loss": 0.3277,
"step": 27500
},
{
"epoch": 1.7477637505550974,
"grad_norm": 0.37447062134742737,
"learning_rate": 2.0880120112499736e-05,
"loss": 0.3922,
"step": 27550
},
{
"epoch": 1.7509357355833282,
"grad_norm": 0.2262888103723526,
"learning_rate": 2.0827253695362558e-05,
"loss": 0.2663,
"step": 27600
},
{
"epoch": 1.7541077206115587,
"grad_norm": 0.2502616047859192,
"learning_rate": 2.0774387278225383e-05,
"loss": 0.2922,
"step": 27650
},
{
"epoch": 1.7572797056397893,
"grad_norm": 0.5477867126464844,
"learning_rate": 2.0721520861088205e-05,
"loss": 0.3556,
"step": 27700
},
{
"epoch": 1.76045169066802,
"grad_norm": 0.31725648045539856,
"learning_rate": 2.0668654443951023e-05,
"loss": 0.2973,
"step": 27750
},
{
"epoch": 1.7636236756962507,
"grad_norm": 0.5742406249046326,
"learning_rate": 2.061578802681385e-05,
"loss": 0.2641,
"step": 27800
},
{
"epoch": 1.7667956607244815,
"grad_norm": 0.17584888637065887,
"learning_rate": 2.056292160967667e-05,
"loss": 0.2907,
"step": 27850
},
{
"epoch": 1.769967645752712,
"grad_norm": 0.18802449107170105,
"learning_rate": 2.0510055192539492e-05,
"loss": 0.3145,
"step": 27900
},
{
"epoch": 1.7731396307809426,
"grad_norm": 0.1602659672498703,
"learning_rate": 2.0457188775402314e-05,
"loss": 0.3164,
"step": 27950
},
{
"epoch": 1.7763116158091734,
"grad_norm": 0.19660255312919617,
"learning_rate": 2.040432235826514e-05,
"loss": 0.3053,
"step": 28000
},
{
"epoch": 1.7794836008374042,
"grad_norm": 0.20148231089115143,
"learning_rate": 2.0351455941127957e-05,
"loss": 0.2577,
"step": 28050
},
{
"epoch": 1.7826555858656348,
"grad_norm": 0.68003910779953,
"learning_rate": 2.029858952399078e-05,
"loss": 0.2914,
"step": 28100
},
{
"epoch": 1.7858275708938653,
"grad_norm": 0.2885502278804779,
"learning_rate": 2.0245723106853604e-05,
"loss": 0.2864,
"step": 28150
},
{
"epoch": 1.788999555922096,
"grad_norm": 0.29255005717277527,
"learning_rate": 2.0192856689716426e-05,
"loss": 0.2482,
"step": 28200
},
{
"epoch": 1.7921715409503267,
"grad_norm": 0.24247625470161438,
"learning_rate": 2.0139990272579247e-05,
"loss": 0.2796,
"step": 28250
},
{
"epoch": 1.7953435259785575,
"grad_norm": 0.5008931159973145,
"learning_rate": 2.008712385544207e-05,
"loss": 0.3548,
"step": 28300
},
{
"epoch": 1.798515511006788,
"grad_norm": 0.43906369805336,
"learning_rate": 2.0034257438304894e-05,
"loss": 0.2499,
"step": 28350
},
{
"epoch": 1.8016874960350187,
"grad_norm": 0.37931495904922485,
"learning_rate": 1.9981391021167713e-05,
"loss": 0.2815,
"step": 28400
},
{
"epoch": 1.8048594810632492,
"grad_norm": 0.6427097320556641,
"learning_rate": 1.992958193237328e-05,
"loss": 0.3206,
"step": 28450
},
{
"epoch": 1.80803146609148,
"grad_norm": 0.2767207622528076,
"learning_rate": 1.98767155152361e-05,
"loss": 0.2855,
"step": 28500
},
{
"epoch": 1.8112034511197108,
"grad_norm": 0.3229456841945648,
"learning_rate": 1.9823849098098923e-05,
"loss": 0.2656,
"step": 28550
},
{
"epoch": 1.8143754361479414,
"grad_norm": 0.22634799778461456,
"learning_rate": 1.9770982680961748e-05,
"loss": 0.3229,
"step": 28600
},
{
"epoch": 1.817547421176172,
"grad_norm": 0.35182350873947144,
"learning_rate": 1.9718116263824567e-05,
"loss": 0.2949,
"step": 28650
},
{
"epoch": 1.8207194062044028,
"grad_norm": 0.325244277715683,
"learning_rate": 1.9665249846687392e-05,
"loss": 0.33,
"step": 28700
},
{
"epoch": 1.8238913912326333,
"grad_norm": 0.5972079634666443,
"learning_rate": 1.9612383429550214e-05,
"loss": 0.2836,
"step": 28750
},
{
"epoch": 1.8270633762608641,
"grad_norm": 0.3747629225254059,
"learning_rate": 1.9559517012413035e-05,
"loss": 0.2406,
"step": 28800
},
{
"epoch": 1.8302353612890947,
"grad_norm": 0.8294934630393982,
"learning_rate": 1.9506650595275857e-05,
"loss": 0.2768,
"step": 28850
},
{
"epoch": 1.8334073463173253,
"grad_norm": 0.8470065593719482,
"learning_rate": 1.9453784178138682e-05,
"loss": 0.3305,
"step": 28900
},
{
"epoch": 1.836579331345556,
"grad_norm": 0.23448146879673004,
"learning_rate": 1.9400917761001504e-05,
"loss": 0.2622,
"step": 28950
},
{
"epoch": 1.8397513163737869,
"grad_norm": 0.4023194909095764,
"learning_rate": 1.9348051343864322e-05,
"loss": 0.2734,
"step": 29000
},
{
"epoch": 1.8429233014020174,
"grad_norm": 0.24675685167312622,
"learning_rate": 1.9295184926727147e-05,
"loss": 0.3257,
"step": 29050
},
{
"epoch": 1.846095286430248,
"grad_norm": 0.2382373958826065,
"learning_rate": 1.924231850958997e-05,
"loss": 0.2646,
"step": 29100
},
{
"epoch": 1.8492672714584786,
"grad_norm": 0.3856213688850403,
"learning_rate": 1.918945209245279e-05,
"loss": 0.3609,
"step": 29150
},
{
"epoch": 1.8524392564867094,
"grad_norm": 0.46289342641830444,
"learning_rate": 1.9136585675315613e-05,
"loss": 0.302,
"step": 29200
},
{
"epoch": 1.8556112415149402,
"grad_norm": 0.2916334569454193,
"learning_rate": 1.9083719258178438e-05,
"loss": 0.3081,
"step": 29250
},
{
"epoch": 1.8587832265431707,
"grad_norm": 0.30459076166152954,
"learning_rate": 1.903085284104126e-05,
"loss": 0.3154,
"step": 29300
},
{
"epoch": 1.8619552115714013,
"grad_norm": 0.21553024649620056,
"learning_rate": 1.8977986423904078e-05,
"loss": 0.2745,
"step": 29350
},
{
"epoch": 1.8651271965996319,
"grad_norm": 0.56458580493927,
"learning_rate": 1.8925120006766903e-05,
"loss": 0.2684,
"step": 29400
},
{
"epoch": 1.8682991816278627,
"grad_norm": 0.2930966913700104,
"learning_rate": 1.8872253589629725e-05,
"loss": 0.2746,
"step": 29450
},
{
"epoch": 1.8714711666560935,
"grad_norm": 0.24118304252624512,
"learning_rate": 1.8819387172492547e-05,
"loss": 0.311,
"step": 29500
},
{
"epoch": 1.874643151684324,
"grad_norm": 0.5832043886184692,
"learning_rate": 1.8766520755355368e-05,
"loss": 0.2716,
"step": 29550
},
{
"epoch": 1.8778151367125546,
"grad_norm": 0.4469052255153656,
"learning_rate": 1.8713654338218193e-05,
"loss": 0.276,
"step": 29600
},
{
"epoch": 1.8809871217407854,
"grad_norm": 0.24563166499137878,
"learning_rate": 1.8660787921081012e-05,
"loss": 0.2437,
"step": 29650
},
{
"epoch": 1.8841591067690162,
"grad_norm": 0.25196200609207153,
"learning_rate": 1.8607921503943833e-05,
"loss": 0.2682,
"step": 29700
},
{
"epoch": 1.8873310917972468,
"grad_norm": 0.37745949625968933,
"learning_rate": 1.855505508680666e-05,
"loss": 0.2565,
"step": 29750
},
{
"epoch": 1.8905030768254774,
"grad_norm": 0.48578011989593506,
"learning_rate": 1.850218866966948e-05,
"loss": 0.3559,
"step": 29800
},
{
"epoch": 1.893675061853708,
"grad_norm": 0.7121931910514832,
"learning_rate": 1.8449322252532302e-05,
"loss": 0.3278,
"step": 29850
},
{
"epoch": 1.8968470468819387,
"grad_norm": 0.3295869827270508,
"learning_rate": 1.8396455835395124e-05,
"loss": 0.2745,
"step": 29900
},
{
"epoch": 1.9000190319101695,
"grad_norm": 0.3305869698524475,
"learning_rate": 1.834358941825795e-05,
"loss": 0.2895,
"step": 29950
},
{
"epoch": 1.9031910169384,
"grad_norm": 0.16952826082706451,
"learning_rate": 1.8290723001120767e-05,
"loss": 0.3058,
"step": 30000
},
{
"epoch": 1.9063630019666307,
"grad_norm": 0.5475337505340576,
"learning_rate": 1.8237856583983593e-05,
"loss": 0.261,
"step": 30050
},
{
"epoch": 1.9095349869948612,
"grad_norm": 0.29783618450164795,
"learning_rate": 1.8184990166846414e-05,
"loss": 0.2663,
"step": 30100
},
{
"epoch": 1.912706972023092,
"grad_norm": 0.22428205609321594,
"learning_rate": 1.8132123749709236e-05,
"loss": 0.2741,
"step": 30150
},
{
"epoch": 1.9158789570513228,
"grad_norm": 0.29052209854125977,
"learning_rate": 1.8079257332572058e-05,
"loss": 0.2971,
"step": 30200
},
{
"epoch": 1.9190509420795534,
"grad_norm": 0.49918898940086365,
"learning_rate": 1.802639091543488e-05,
"loss": 0.3309,
"step": 30250
},
{
"epoch": 1.922222927107784,
"grad_norm": 0.38725921511650085,
"learning_rate": 1.7973524498297705e-05,
"loss": 0.3191,
"step": 30300
},
{
"epoch": 1.9253949121360148,
"grad_norm": 0.5741166472434998,
"learning_rate": 1.7920658081160523e-05,
"loss": 0.3136,
"step": 30350
},
{
"epoch": 1.9285668971642453,
"grad_norm": 0.3201332986354828,
"learning_rate": 1.7867791664023348e-05,
"loss": 0.3066,
"step": 30400
},
{
"epoch": 1.9317388821924761,
"grad_norm": 0.5069090723991394,
"learning_rate": 1.781492524688617e-05,
"loss": 0.2804,
"step": 30450
},
{
"epoch": 1.9349108672207067,
"grad_norm": 0.18295565247535706,
"learning_rate": 1.7762058829748988e-05,
"loss": 0.2942,
"step": 30500
},
{
"epoch": 1.9380828522489373,
"grad_norm": 0.26717889308929443,
"learning_rate": 1.7709192412611813e-05,
"loss": 0.2938,
"step": 30550
},
{
"epoch": 1.941254837277168,
"grad_norm": 0.2297358214855194,
"learning_rate": 1.7656325995474635e-05,
"loss": 0.2656,
"step": 30600
},
{
"epoch": 1.9444268223053989,
"grad_norm": 0.46368107199668884,
"learning_rate": 1.7603459578337457e-05,
"loss": 0.3266,
"step": 30650
},
{
"epoch": 1.9475988073336294,
"grad_norm": 0.4937555491924286,
"learning_rate": 1.7551650489543024e-05,
"loss": 0.2992,
"step": 30700
},
{
"epoch": 1.95077079236186,
"grad_norm": 0.7138214111328125,
"learning_rate": 1.7498784072405846e-05,
"loss": 0.3092,
"step": 30750
},
{
"epoch": 1.9539427773900906,
"grad_norm": 0.26548969745635986,
"learning_rate": 1.7445917655268667e-05,
"loss": 0.2582,
"step": 30800
},
{
"epoch": 1.9571147624183214,
"grad_norm": 0.29331421852111816,
"learning_rate": 1.7393051238131492e-05,
"loss": 0.2892,
"step": 30850
},
{
"epoch": 1.9602867474465522,
"grad_norm": 0.29395994544029236,
"learning_rate": 1.734018482099431e-05,
"loss": 0.2705,
"step": 30900
},
{
"epoch": 1.9634587324747828,
"grad_norm": 0.25719061493873596,
"learning_rate": 1.7287318403857133e-05,
"loss": 0.2811,
"step": 30950
},
{
"epoch": 1.9666307175030133,
"grad_norm": 0.5885137915611267,
"learning_rate": 1.7234451986719958e-05,
"loss": 0.2834,
"step": 31000
},
{
"epoch": 1.969802702531244,
"grad_norm": 0.43175041675567627,
"learning_rate": 1.718158556958278e-05,
"loss": 0.2683,
"step": 31050
},
{
"epoch": 1.9729746875594747,
"grad_norm": 0.25399765372276306,
"learning_rate": 1.71287191524456e-05,
"loss": 0.2932,
"step": 31100
},
{
"epoch": 1.9761466725877055,
"grad_norm": 0.3162282407283783,
"learning_rate": 1.7075852735308423e-05,
"loss": 0.2753,
"step": 31150
},
{
"epoch": 1.979318657615936,
"grad_norm": 0.5122566223144531,
"learning_rate": 1.7022986318171248e-05,
"loss": 0.271,
"step": 31200
},
{
"epoch": 1.9824906426441666,
"grad_norm": 0.22120784223079681,
"learning_rate": 1.6970119901034066e-05,
"loss": 0.3167,
"step": 31250
},
{
"epoch": 1.9856626276723974,
"grad_norm": 0.7437451481819153,
"learning_rate": 1.6917253483896888e-05,
"loss": 0.3324,
"step": 31300
},
{
"epoch": 1.988834612700628,
"grad_norm": 0.43868857622146606,
"learning_rate": 1.6864387066759713e-05,
"loss": 0.3211,
"step": 31350
},
{
"epoch": 1.9920065977288588,
"grad_norm": 0.5011254549026489,
"learning_rate": 1.6811520649622535e-05,
"loss": 0.3138,
"step": 31400
},
{
"epoch": 1.9951785827570894,
"grad_norm": 0.476948618888855,
"learning_rate": 1.6758654232485357e-05,
"loss": 0.3014,
"step": 31450
},
{
"epoch": 1.99835056778532,
"grad_norm": 0.258881151676178,
"learning_rate": 1.670578781534818e-05,
"loss": 0.2907,
"step": 31500
},
{
"epoch": 2.0,
"eval_loss": 0.31407585740089417,
"eval_runtime": 47.9633,
"eval_samples_per_second": 34.297,
"eval_steps_per_second": 17.159,
"step": 31526
}
],
"logging_steps": 50,
"max_steps": 47289,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 6.289941398028288e+17,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}