lora-chemdfm-modifying-qed-full / trainer_state.json
Alan123's picture
Upload folder using huggingface_hub
2f83fa2 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9999964067682114,
"eval_steps": 3000,
"global_step": 69575,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0007186463577205975,
"grad_norm": 0.33678868412971497,
"learning_rate": 9.992957240388071e-05,
"loss": 0.3609,
"step": 50
},
{
"epoch": 0.001437292715441195,
"grad_norm": 0.3722863495349884,
"learning_rate": 9.985770750988143e-05,
"loss": 0.2822,
"step": 100
},
{
"epoch": 0.0021559390731617925,
"grad_norm": 0.3654504120349884,
"learning_rate": 9.978584261588215e-05,
"loss": 0.2715,
"step": 150
},
{
"epoch": 0.00287458543088239,
"grad_norm": 0.3333148956298828,
"learning_rate": 9.971397772188287e-05,
"loss": 0.273,
"step": 200
},
{
"epoch": 0.0035932317886029873,
"grad_norm": 0.28888237476348877,
"learning_rate": 9.964211282788358e-05,
"loss": 0.2777,
"step": 250
},
{
"epoch": 0.004311878146323585,
"grad_norm": 0.4001215696334839,
"learning_rate": 9.95702479338843e-05,
"loss": 0.2622,
"step": 300
},
{
"epoch": 0.005030524504044182,
"grad_norm": 0.38211333751678467,
"learning_rate": 9.949838303988502e-05,
"loss": 0.27,
"step": 350
},
{
"epoch": 0.00574917086176478,
"grad_norm": 0.3811050057411194,
"learning_rate": 9.942651814588574e-05,
"loss": 0.2599,
"step": 400
},
{
"epoch": 0.006467817219485378,
"grad_norm": 0.41944026947021484,
"learning_rate": 9.935465325188645e-05,
"loss": 0.2539,
"step": 450
},
{
"epoch": 0.0071864635772059746,
"grad_norm": 0.4201948046684265,
"learning_rate": 9.928278835788718e-05,
"loss": 0.2501,
"step": 500
},
{
"epoch": 0.007905109934926572,
"grad_norm": 0.34886258840560913,
"learning_rate": 9.92109234638879e-05,
"loss": 0.2578,
"step": 550
},
{
"epoch": 0.00862375629264717,
"grad_norm": 0.369731605052948,
"learning_rate": 9.91390585698886e-05,
"loss": 0.2558,
"step": 600
},
{
"epoch": 0.009342402650367768,
"grad_norm": 0.2964828610420227,
"learning_rate": 9.906719367588934e-05,
"loss": 0.2445,
"step": 650
},
{
"epoch": 0.010061049008088364,
"grad_norm": 0.27640748023986816,
"learning_rate": 9.899532878189006e-05,
"loss": 0.2477,
"step": 700
},
{
"epoch": 0.010779695365808962,
"grad_norm": 0.4146655797958374,
"learning_rate": 9.892346388789076e-05,
"loss": 0.2427,
"step": 750
},
{
"epoch": 0.01149834172352956,
"grad_norm": 0.24966135621070862,
"learning_rate": 9.88515989938915e-05,
"loss": 0.243,
"step": 800
},
{
"epoch": 0.012216988081250157,
"grad_norm": 0.39003002643585205,
"learning_rate": 9.877973409989221e-05,
"loss": 0.2514,
"step": 850
},
{
"epoch": 0.012935634438970755,
"grad_norm": 0.3037317991256714,
"learning_rate": 9.870786920589292e-05,
"loss": 0.2444,
"step": 900
},
{
"epoch": 0.013654280796691351,
"grad_norm": 0.4129560589790344,
"learning_rate": 9.863600431189365e-05,
"loss": 0.2488,
"step": 950
},
{
"epoch": 0.014372927154411949,
"grad_norm": 0.3825244903564453,
"learning_rate": 9.856413941789436e-05,
"loss": 0.244,
"step": 1000
},
{
"epoch": 0.015091573512132547,
"grad_norm": 0.30103355646133423,
"learning_rate": 9.849227452389508e-05,
"loss": 0.2409,
"step": 1050
},
{
"epoch": 0.015810219869853145,
"grad_norm": 0.3930550515651703,
"learning_rate": 9.842040962989581e-05,
"loss": 0.2282,
"step": 1100
},
{
"epoch": 0.01652886622757374,
"grad_norm": 0.31128013134002686,
"learning_rate": 9.834854473589651e-05,
"loss": 0.239,
"step": 1150
},
{
"epoch": 0.01724751258529434,
"grad_norm": 0.3896910846233368,
"learning_rate": 9.827667984189723e-05,
"loss": 0.2354,
"step": 1200
},
{
"epoch": 0.017966158943014936,
"grad_norm": 0.35376253724098206,
"learning_rate": 9.820481494789797e-05,
"loss": 0.2368,
"step": 1250
},
{
"epoch": 0.018684805300735536,
"grad_norm": 0.31663399934768677,
"learning_rate": 9.813295005389867e-05,
"loss": 0.2358,
"step": 1300
},
{
"epoch": 0.019403451658456132,
"grad_norm": 0.29948070645332336,
"learning_rate": 9.806108515989939e-05,
"loss": 0.2331,
"step": 1350
},
{
"epoch": 0.020122098016176728,
"grad_norm": 0.37420302629470825,
"learning_rate": 9.798922026590011e-05,
"loss": 0.2323,
"step": 1400
},
{
"epoch": 0.020840744373897328,
"grad_norm": 0.38153383135795593,
"learning_rate": 9.791735537190083e-05,
"loss": 0.2348,
"step": 1450
},
{
"epoch": 0.021559390731617924,
"grad_norm": 0.30417364835739136,
"learning_rate": 9.784549047790155e-05,
"loss": 0.242,
"step": 1500
},
{
"epoch": 0.022278037089338523,
"grad_norm": 0.31648948788642883,
"learning_rate": 9.777362558390227e-05,
"loss": 0.2322,
"step": 1550
},
{
"epoch": 0.02299668344705912,
"grad_norm": 0.28670674562454224,
"learning_rate": 9.770176068990299e-05,
"loss": 0.2264,
"step": 1600
},
{
"epoch": 0.023715329804779715,
"grad_norm": 0.3444724977016449,
"learning_rate": 9.76298957959037e-05,
"loss": 0.2286,
"step": 1650
},
{
"epoch": 0.024433976162500315,
"grad_norm": 0.3509560227394104,
"learning_rate": 9.755803090190442e-05,
"loss": 0.2296,
"step": 1700
},
{
"epoch": 0.02515262252022091,
"grad_norm": 0.34082117676734924,
"learning_rate": 9.748616600790514e-05,
"loss": 0.2286,
"step": 1750
},
{
"epoch": 0.02587126887794151,
"grad_norm": 0.39878740906715393,
"learning_rate": 9.741430111390586e-05,
"loss": 0.2284,
"step": 1800
},
{
"epoch": 0.026589915235662107,
"grad_norm": 0.3792494833469391,
"learning_rate": 9.734243621990658e-05,
"loss": 0.2232,
"step": 1850
},
{
"epoch": 0.027308561593382703,
"grad_norm": 0.26393282413482666,
"learning_rate": 9.72705713259073e-05,
"loss": 0.22,
"step": 1900
},
{
"epoch": 0.028027207951103302,
"grad_norm": 0.2707872986793518,
"learning_rate": 9.719870643190802e-05,
"loss": 0.2195,
"step": 1950
},
{
"epoch": 0.028745854308823898,
"grad_norm": 0.2854771018028259,
"learning_rate": 9.712684153790874e-05,
"loss": 0.2227,
"step": 2000
},
{
"epoch": 0.029464500666544498,
"grad_norm": 0.29345157742500305,
"learning_rate": 9.705497664390946e-05,
"loss": 0.2247,
"step": 2050
},
{
"epoch": 0.030183147024265094,
"grad_norm": 0.29724326729774475,
"learning_rate": 9.698311174991018e-05,
"loss": 0.2239,
"step": 2100
},
{
"epoch": 0.030901793381985693,
"grad_norm": 0.31227824091911316,
"learning_rate": 9.69112468559109e-05,
"loss": 0.2265,
"step": 2150
},
{
"epoch": 0.03162043973970629,
"grad_norm": 0.31163766980171204,
"learning_rate": 9.683938196191162e-05,
"loss": 0.2266,
"step": 2200
},
{
"epoch": 0.03233908609742689,
"grad_norm": 0.2654283344745636,
"learning_rate": 9.676751706791232e-05,
"loss": 0.2233,
"step": 2250
},
{
"epoch": 0.03305773245514748,
"grad_norm": 0.27485188841819763,
"learning_rate": 9.669565217391304e-05,
"loss": 0.2252,
"step": 2300
},
{
"epoch": 0.03377637881286808,
"grad_norm": 0.28361964225769043,
"learning_rate": 9.662378727991377e-05,
"loss": 0.2272,
"step": 2350
},
{
"epoch": 0.03449502517058868,
"grad_norm": 0.45227479934692383,
"learning_rate": 9.655192238591448e-05,
"loss": 0.2219,
"step": 2400
},
{
"epoch": 0.03521367152830927,
"grad_norm": 0.26741498708724976,
"learning_rate": 9.64800574919152e-05,
"loss": 0.2244,
"step": 2450
},
{
"epoch": 0.03593231788602987,
"grad_norm": 0.38418859243392944,
"learning_rate": 9.640819259791593e-05,
"loss": 0.2213,
"step": 2500
},
{
"epoch": 0.03665096424375047,
"grad_norm": 0.28470662236213684,
"learning_rate": 9.633632770391664e-05,
"loss": 0.2265,
"step": 2550
},
{
"epoch": 0.03736961060147107,
"grad_norm": 0.3410508632659912,
"learning_rate": 9.626446280991736e-05,
"loss": 0.2172,
"step": 2600
},
{
"epoch": 0.038088256959191664,
"grad_norm": 0.30783000588417053,
"learning_rate": 9.619259791591809e-05,
"loss": 0.2198,
"step": 2650
},
{
"epoch": 0.038806903316912264,
"grad_norm": 0.2574290335178375,
"learning_rate": 9.61207330219188e-05,
"loss": 0.2149,
"step": 2700
},
{
"epoch": 0.039525549674632864,
"grad_norm": 0.2963494062423706,
"learning_rate": 9.604886812791951e-05,
"loss": 0.2193,
"step": 2750
},
{
"epoch": 0.040244196032353456,
"grad_norm": 0.33474215865135193,
"learning_rate": 9.597700323392023e-05,
"loss": 0.2202,
"step": 2800
},
{
"epoch": 0.040962842390074056,
"grad_norm": 0.3282819390296936,
"learning_rate": 9.590513833992095e-05,
"loss": 0.2249,
"step": 2850
},
{
"epoch": 0.041681488747794655,
"grad_norm": 0.4898041784763336,
"learning_rate": 9.583327344592167e-05,
"loss": 0.2199,
"step": 2900
},
{
"epoch": 0.042400135105515255,
"grad_norm": 0.2768670618534088,
"learning_rate": 9.576140855192239e-05,
"loss": 0.217,
"step": 2950
},
{
"epoch": 0.04311878146323585,
"grad_norm": 0.3368474543094635,
"learning_rate": 9.568954365792311e-05,
"loss": 0.2171,
"step": 3000
},
{
"epoch": 0.04311878146323585,
"eval_loss": 0.21704652905464172,
"eval_runtime": 2340.123,
"eval_samples_per_second": 25.037,
"eval_steps_per_second": 3.13,
"step": 3000
},
{
"epoch": 0.04383742782095645,
"grad_norm": 0.4556143283843994,
"learning_rate": 9.561767876392383e-05,
"loss": 0.2196,
"step": 3050
},
{
"epoch": 0.044556074178677046,
"grad_norm": 0.3851192891597748,
"learning_rate": 9.554581386992455e-05,
"loss": 0.2154,
"step": 3100
},
{
"epoch": 0.04527472053639764,
"grad_norm": 0.33037668466567993,
"learning_rate": 9.547394897592527e-05,
"loss": 0.2146,
"step": 3150
},
{
"epoch": 0.04599336689411824,
"grad_norm": 0.5030372142791748,
"learning_rate": 9.540208408192598e-05,
"loss": 0.2163,
"step": 3200
},
{
"epoch": 0.04671201325183884,
"grad_norm": 0.2891259789466858,
"learning_rate": 9.53302191879267e-05,
"loss": 0.2102,
"step": 3250
},
{
"epoch": 0.04743065960955943,
"grad_norm": 0.3370216190814972,
"learning_rate": 9.525835429392742e-05,
"loss": 0.226,
"step": 3300
},
{
"epoch": 0.04814930596728003,
"grad_norm": 0.3481488525867462,
"learning_rate": 9.518648939992814e-05,
"loss": 0.2102,
"step": 3350
},
{
"epoch": 0.04886795232500063,
"grad_norm": 0.23386803269386292,
"learning_rate": 9.511462450592886e-05,
"loss": 0.2094,
"step": 3400
},
{
"epoch": 0.04958659868272123,
"grad_norm": 0.3331839144229889,
"learning_rate": 9.504275961192958e-05,
"loss": 0.2133,
"step": 3450
},
{
"epoch": 0.05030524504044182,
"grad_norm": 0.29888877272605896,
"learning_rate": 9.49708947179303e-05,
"loss": 0.2085,
"step": 3500
},
{
"epoch": 0.05102389139816242,
"grad_norm": 0.284204363822937,
"learning_rate": 9.489902982393102e-05,
"loss": 0.2084,
"step": 3550
},
{
"epoch": 0.05174253775588302,
"grad_norm": 0.3944644033908844,
"learning_rate": 9.482716492993174e-05,
"loss": 0.2084,
"step": 3600
},
{
"epoch": 0.052461184113603614,
"grad_norm": 0.3057551085948944,
"learning_rate": 9.475530003593244e-05,
"loss": 0.2092,
"step": 3650
},
{
"epoch": 0.05317983047132421,
"grad_norm": 0.30618202686309814,
"learning_rate": 9.468343514193318e-05,
"loss": 0.206,
"step": 3700
},
{
"epoch": 0.05389847682904481,
"grad_norm": 0.2912580370903015,
"learning_rate": 9.46115702479339e-05,
"loss": 0.2097,
"step": 3750
},
{
"epoch": 0.054617123186765405,
"grad_norm": 0.29069676995277405,
"learning_rate": 9.45397053539346e-05,
"loss": 0.2172,
"step": 3800
},
{
"epoch": 0.055335769544486005,
"grad_norm": 0.35476160049438477,
"learning_rate": 9.446784045993533e-05,
"loss": 0.2142,
"step": 3850
},
{
"epoch": 0.056054415902206604,
"grad_norm": 0.38042518496513367,
"learning_rate": 9.439597556593605e-05,
"loss": 0.2093,
"step": 3900
},
{
"epoch": 0.056773062259927204,
"grad_norm": 0.33690425753593445,
"learning_rate": 9.432411067193676e-05,
"loss": 0.2088,
"step": 3950
},
{
"epoch": 0.057491708617647796,
"grad_norm": 0.329703152179718,
"learning_rate": 9.425224577793749e-05,
"loss": 0.2112,
"step": 4000
},
{
"epoch": 0.058210354975368396,
"grad_norm": 0.35252898931503296,
"learning_rate": 9.41803808839382e-05,
"loss": 0.2093,
"step": 4050
},
{
"epoch": 0.058929001333088996,
"grad_norm": 0.36909961700439453,
"learning_rate": 9.410851598993891e-05,
"loss": 0.2077,
"step": 4100
},
{
"epoch": 0.05964764769080959,
"grad_norm": 0.3794455826282501,
"learning_rate": 9.403665109593965e-05,
"loss": 0.2063,
"step": 4150
},
{
"epoch": 0.06036629404853019,
"grad_norm": 0.31799831986427307,
"learning_rate": 9.396478620194035e-05,
"loss": 0.2079,
"step": 4200
},
{
"epoch": 0.06108494040625079,
"grad_norm": 0.2503960132598877,
"learning_rate": 9.389292130794107e-05,
"loss": 0.2069,
"step": 4250
},
{
"epoch": 0.06180358676397139,
"grad_norm": 0.32965216040611267,
"learning_rate": 9.38210564139418e-05,
"loss": 0.21,
"step": 4300
},
{
"epoch": 0.06252223312169199,
"grad_norm": 0.2706376314163208,
"learning_rate": 9.374919151994251e-05,
"loss": 0.204,
"step": 4350
},
{
"epoch": 0.06324087947941258,
"grad_norm": 0.3276236951351166,
"learning_rate": 9.367732662594323e-05,
"loss": 0.2069,
"step": 4400
},
{
"epoch": 0.06395952583713317,
"grad_norm": 0.3109481930732727,
"learning_rate": 9.360546173194396e-05,
"loss": 0.2147,
"step": 4450
},
{
"epoch": 0.06467817219485378,
"grad_norm": 0.3856702446937561,
"learning_rate": 9.353359683794467e-05,
"loss": 0.21,
"step": 4500
},
{
"epoch": 0.06539681855257437,
"grad_norm": 0.32052862644195557,
"learning_rate": 9.346173194394539e-05,
"loss": 0.2112,
"step": 4550
},
{
"epoch": 0.06611546491029496,
"grad_norm": 0.3940321207046509,
"learning_rate": 9.33898670499461e-05,
"loss": 0.2063,
"step": 4600
},
{
"epoch": 0.06683411126801557,
"grad_norm": 0.290620893239975,
"learning_rate": 9.331800215594682e-05,
"loss": 0.211,
"step": 4650
},
{
"epoch": 0.06755275762573616,
"grad_norm": 0.2583450973033905,
"learning_rate": 9.324613726194754e-05,
"loss": 0.2043,
"step": 4700
},
{
"epoch": 0.06827140398345675,
"grad_norm": 0.5912581086158752,
"learning_rate": 9.317427236794826e-05,
"loss": 0.2077,
"step": 4750
},
{
"epoch": 0.06899005034117736,
"grad_norm": 0.4294145107269287,
"learning_rate": 9.310240747394898e-05,
"loss": 0.2103,
"step": 4800
},
{
"epoch": 0.06970869669889795,
"grad_norm": 0.2989557385444641,
"learning_rate": 9.30305425799497e-05,
"loss": 0.2006,
"step": 4850
},
{
"epoch": 0.07042734305661855,
"grad_norm": 0.3520444333553314,
"learning_rate": 9.29586776859504e-05,
"loss": 0.2054,
"step": 4900
},
{
"epoch": 0.07114598941433915,
"grad_norm": 0.3374841511249542,
"learning_rate": 9.288681279195114e-05,
"loss": 0.2058,
"step": 4950
},
{
"epoch": 0.07186463577205975,
"grad_norm": 0.5366376042366028,
"learning_rate": 9.281494789795186e-05,
"loss": 0.2063,
"step": 5000
},
{
"epoch": 0.07258328212978035,
"grad_norm": 0.3531434237957001,
"learning_rate": 9.274308300395256e-05,
"loss": 0.1993,
"step": 5050
},
{
"epoch": 0.07330192848750094,
"grad_norm": 0.406619668006897,
"learning_rate": 9.26712181099533e-05,
"loss": 0.2,
"step": 5100
},
{
"epoch": 0.07402057484522154,
"grad_norm": 0.5909317135810852,
"learning_rate": 9.259935321595402e-05,
"loss": 0.2023,
"step": 5150
},
{
"epoch": 0.07473922120294214,
"grad_norm": 0.3451712727546692,
"learning_rate": 9.252748832195472e-05,
"loss": 0.2073,
"step": 5200
},
{
"epoch": 0.07545786756066274,
"grad_norm": 0.26411768794059753,
"learning_rate": 9.245562342795545e-05,
"loss": 0.2077,
"step": 5250
},
{
"epoch": 0.07617651391838333,
"grad_norm": 0.36952096223831177,
"learning_rate": 9.238375853395616e-05,
"loss": 0.1997,
"step": 5300
},
{
"epoch": 0.07689516027610394,
"grad_norm": 0.5570283532142639,
"learning_rate": 9.231189363995688e-05,
"loss": 0.2082,
"step": 5350
},
{
"epoch": 0.07761380663382453,
"grad_norm": 0.2923007607460022,
"learning_rate": 9.224002874595761e-05,
"loss": 0.2009,
"step": 5400
},
{
"epoch": 0.07833245299154512,
"grad_norm": 0.36968836188316345,
"learning_rate": 9.216816385195832e-05,
"loss": 0.2073,
"step": 5450
},
{
"epoch": 0.07905109934926573,
"grad_norm": 0.34387850761413574,
"learning_rate": 9.209629895795904e-05,
"loss": 0.2056,
"step": 5500
},
{
"epoch": 0.07976974570698632,
"grad_norm": 0.36204349994659424,
"learning_rate": 9.202443406395977e-05,
"loss": 0.2009,
"step": 5550
},
{
"epoch": 0.08048839206470691,
"grad_norm": 0.517398476600647,
"learning_rate": 9.195256916996047e-05,
"loss": 0.2025,
"step": 5600
},
{
"epoch": 0.08120703842242752,
"grad_norm": 0.3113616704940796,
"learning_rate": 9.18807042759612e-05,
"loss": 0.1983,
"step": 5650
},
{
"epoch": 0.08192568478014811,
"grad_norm": 0.32107213139533997,
"learning_rate": 9.180883938196193e-05,
"loss": 0.2044,
"step": 5700
},
{
"epoch": 0.0826443311378687,
"grad_norm": 0.45938199758529663,
"learning_rate": 9.173697448796263e-05,
"loss": 0.201,
"step": 5750
},
{
"epoch": 0.08336297749558931,
"grad_norm": 0.39211779832839966,
"learning_rate": 9.166510959396335e-05,
"loss": 0.2122,
"step": 5800
},
{
"epoch": 0.0840816238533099,
"grad_norm": 0.49038565158843994,
"learning_rate": 9.159324469996407e-05,
"loss": 0.197,
"step": 5850
},
{
"epoch": 0.08480027021103051,
"grad_norm": 0.3352994918823242,
"learning_rate": 9.152137980596479e-05,
"loss": 0.2036,
"step": 5900
},
{
"epoch": 0.0855189165687511,
"grad_norm": 0.4314139485359192,
"learning_rate": 9.144951491196551e-05,
"loss": 0.194,
"step": 5950
},
{
"epoch": 0.0862375629264717,
"grad_norm": 0.2832717001438141,
"learning_rate": 9.137765001796623e-05,
"loss": 0.1988,
"step": 6000
},
{
"epoch": 0.0862375629264717,
"eval_loss": 0.20054864883422852,
"eval_runtime": 2378.5965,
"eval_samples_per_second": 24.632,
"eval_steps_per_second": 3.079,
"step": 6000
},
{
"epoch": 0.0869562092841923,
"grad_norm": 0.37466466426849365,
"learning_rate": 9.130578512396695e-05,
"loss": 0.2006,
"step": 6050
},
{
"epoch": 0.0876748556419129,
"grad_norm": 0.3644261658191681,
"learning_rate": 9.123392022996767e-05,
"loss": 0.2013,
"step": 6100
},
{
"epoch": 0.08839350199963349,
"grad_norm": 0.3186008632183075,
"learning_rate": 9.116205533596838e-05,
"loss": 0.1953,
"step": 6150
},
{
"epoch": 0.08911214835735409,
"grad_norm": 0.51948481798172,
"learning_rate": 9.10901904419691e-05,
"loss": 0.196,
"step": 6200
},
{
"epoch": 0.08983079471507469,
"grad_norm": 0.3450036346912384,
"learning_rate": 9.101832554796982e-05,
"loss": 0.1954,
"step": 6250
},
{
"epoch": 0.09054944107279528,
"grad_norm": 0.3176559805870056,
"learning_rate": 9.094646065397054e-05,
"loss": 0.1995,
"step": 6300
},
{
"epoch": 0.09126808743051588,
"grad_norm": 0.39915382862091064,
"learning_rate": 9.087459575997126e-05,
"loss": 0.2022,
"step": 6350
},
{
"epoch": 0.09198673378823648,
"grad_norm": 0.3537696301937103,
"learning_rate": 9.080273086597198e-05,
"loss": 0.1983,
"step": 6400
},
{
"epoch": 0.09270538014595707,
"grad_norm": 0.35816970467567444,
"learning_rate": 9.07308659719727e-05,
"loss": 0.2011,
"step": 6450
},
{
"epoch": 0.09342402650367768,
"grad_norm": 0.27828651666641235,
"learning_rate": 9.065900107797342e-05,
"loss": 0.196,
"step": 6500
},
{
"epoch": 0.09414267286139827,
"grad_norm": 0.294121116399765,
"learning_rate": 9.058713618397414e-05,
"loss": 0.1979,
"step": 6550
},
{
"epoch": 0.09486131921911886,
"grad_norm": 0.3522706925868988,
"learning_rate": 9.051527128997486e-05,
"loss": 0.1986,
"step": 6600
},
{
"epoch": 0.09557996557683947,
"grad_norm": 0.2693149745464325,
"learning_rate": 9.044340639597558e-05,
"loss": 0.2033,
"step": 6650
},
{
"epoch": 0.09629861193456006,
"grad_norm": 0.3474609851837158,
"learning_rate": 9.037154150197628e-05,
"loss": 0.1967,
"step": 6700
},
{
"epoch": 0.09701725829228067,
"grad_norm": 0.4667980372905731,
"learning_rate": 9.029967660797701e-05,
"loss": 0.1965,
"step": 6750
},
{
"epoch": 0.09773590465000126,
"grad_norm": 0.3009561002254486,
"learning_rate": 9.022781171397773e-05,
"loss": 0.1897,
"step": 6800
},
{
"epoch": 0.09845455100772185,
"grad_norm": 0.3703483045101166,
"learning_rate": 9.015594681997844e-05,
"loss": 0.2068,
"step": 6850
},
{
"epoch": 0.09917319736544246,
"grad_norm": 0.3561961054801941,
"learning_rate": 9.008408192597916e-05,
"loss": 0.1902,
"step": 6900
},
{
"epoch": 0.09989184372316305,
"grad_norm": 0.381755530834198,
"learning_rate": 9.001221703197989e-05,
"loss": 0.1939,
"step": 6950
},
{
"epoch": 0.10061049008088364,
"grad_norm": 0.298585444688797,
"learning_rate": 8.99403521379806e-05,
"loss": 0.1948,
"step": 7000
},
{
"epoch": 0.10132913643860425,
"grad_norm": 0.24976646900177002,
"learning_rate": 8.986848724398131e-05,
"loss": 0.192,
"step": 7050
},
{
"epoch": 0.10204778279632484,
"grad_norm": 0.3998195230960846,
"learning_rate": 8.979662234998203e-05,
"loss": 0.1979,
"step": 7100
},
{
"epoch": 0.10276642915404544,
"grad_norm": 0.2840961515903473,
"learning_rate": 8.972475745598275e-05,
"loss": 0.1976,
"step": 7150
},
{
"epoch": 0.10348507551176604,
"grad_norm": 0.39742761850357056,
"learning_rate": 8.965289256198347e-05,
"loss": 0.1904,
"step": 7200
},
{
"epoch": 0.10420372186948663,
"grad_norm": 0.2676410377025604,
"learning_rate": 8.958102766798419e-05,
"loss": 0.1974,
"step": 7250
},
{
"epoch": 0.10492236822720723,
"grad_norm": 0.32223057746887207,
"learning_rate": 8.950916277398491e-05,
"loss": 0.1942,
"step": 7300
},
{
"epoch": 0.10564101458492783,
"grad_norm": 0.35473334789276123,
"learning_rate": 8.943729787998563e-05,
"loss": 0.1976,
"step": 7350
},
{
"epoch": 0.10635966094264843,
"grad_norm": 0.2410697489976883,
"learning_rate": 8.936543298598635e-05,
"loss": 0.1979,
"step": 7400
},
{
"epoch": 0.10707830730036902,
"grad_norm": 0.2326819747686386,
"learning_rate": 8.929356809198707e-05,
"loss": 0.193,
"step": 7450
},
{
"epoch": 0.10779695365808963,
"grad_norm": 0.39694592356681824,
"learning_rate": 8.922170319798779e-05,
"loss": 0.1952,
"step": 7500
},
{
"epoch": 0.10851560001581022,
"grad_norm": 0.2801777720451355,
"learning_rate": 8.91498383039885e-05,
"loss": 0.1913,
"step": 7550
},
{
"epoch": 0.10923424637353081,
"grad_norm": 0.27637767791748047,
"learning_rate": 8.907797340998922e-05,
"loss": 0.1894,
"step": 7600
},
{
"epoch": 0.10995289273125142,
"grad_norm": 0.47819074988365173,
"learning_rate": 8.900610851598994e-05,
"loss": 0.19,
"step": 7650
},
{
"epoch": 0.11067153908897201,
"grad_norm": 0.37279731035232544,
"learning_rate": 8.893424362199066e-05,
"loss": 0.193,
"step": 7700
},
{
"epoch": 0.11139018544669262,
"grad_norm": 0.3399754464626312,
"learning_rate": 8.886237872799138e-05,
"loss": 0.1901,
"step": 7750
},
{
"epoch": 0.11210883180441321,
"grad_norm": 0.2655271887779236,
"learning_rate": 8.87905138339921e-05,
"loss": 0.1902,
"step": 7800
},
{
"epoch": 0.1128274781621338,
"grad_norm": 0.2509619891643524,
"learning_rate": 8.871864893999282e-05,
"loss": 0.1929,
"step": 7850
},
{
"epoch": 0.11354612451985441,
"grad_norm": 0.37877124547958374,
"learning_rate": 8.864678404599354e-05,
"loss": 0.1935,
"step": 7900
},
{
"epoch": 0.114264770877575,
"grad_norm": 0.3020787835121155,
"learning_rate": 8.857491915199424e-05,
"loss": 0.1943,
"step": 7950
},
{
"epoch": 0.11498341723529559,
"grad_norm": 0.37178000807762146,
"learning_rate": 8.850305425799498e-05,
"loss": 0.1874,
"step": 8000
},
{
"epoch": 0.1157020635930162,
"grad_norm": 0.29959267377853394,
"learning_rate": 8.84311893639957e-05,
"loss": 0.1915,
"step": 8050
},
{
"epoch": 0.11642070995073679,
"grad_norm": 0.3364570736885071,
"learning_rate": 8.83607617678764e-05,
"loss": 0.1929,
"step": 8100
},
{
"epoch": 0.11713935630845738,
"grad_norm": 0.4932682514190674,
"learning_rate": 8.828889687387712e-05,
"loss": 0.1903,
"step": 8150
},
{
"epoch": 0.11785800266617799,
"grad_norm": 0.32474425435066223,
"learning_rate": 8.821703197987782e-05,
"loss": 0.1884,
"step": 8200
},
{
"epoch": 0.11857664902389858,
"grad_norm": 0.2824951410293579,
"learning_rate": 8.814516708587856e-05,
"loss": 0.1847,
"step": 8250
},
{
"epoch": 0.11929529538161918,
"grad_norm": 0.4078387916088104,
"learning_rate": 8.807330219187928e-05,
"loss": 0.1909,
"step": 8300
},
{
"epoch": 0.12001394173933978,
"grad_norm": 0.42612573504447937,
"learning_rate": 8.800143729787998e-05,
"loss": 0.191,
"step": 8350
},
{
"epoch": 0.12073258809706038,
"grad_norm": 0.3414117991924286,
"learning_rate": 8.792957240388071e-05,
"loss": 0.1929,
"step": 8400
},
{
"epoch": 0.12145123445478097,
"grad_norm": 0.2816385328769684,
"learning_rate": 8.785770750988142e-05,
"loss": 0.1958,
"step": 8450
},
{
"epoch": 0.12216988081250157,
"grad_norm": 0.29827359318733215,
"learning_rate": 8.778584261588214e-05,
"loss": 0.191,
"step": 8500
},
{
"epoch": 0.12288852717022217,
"grad_norm": 0.27113088965415955,
"learning_rate": 8.771397772188287e-05,
"loss": 0.1908,
"step": 8550
},
{
"epoch": 0.12360717352794277,
"grad_norm": 0.3588126003742218,
"learning_rate": 8.764211282788358e-05,
"loss": 0.1951,
"step": 8600
},
{
"epoch": 0.12432581988566337,
"grad_norm": 0.2723435163497925,
"learning_rate": 8.75702479338843e-05,
"loss": 0.1916,
"step": 8650
},
{
"epoch": 0.12504446624338397,
"grad_norm": 0.37845754623413086,
"learning_rate": 8.749838303988503e-05,
"loss": 0.1967,
"step": 8700
},
{
"epoch": 0.12576311260110457,
"grad_norm": 0.36663511395454407,
"learning_rate": 8.742651814588573e-05,
"loss": 0.183,
"step": 8750
},
{
"epoch": 0.12648175895882516,
"grad_norm": 0.34882092475891113,
"learning_rate": 8.735465325188645e-05,
"loss": 0.1895,
"step": 8800
},
{
"epoch": 0.12720040531654575,
"grad_norm": 0.3167667090892792,
"learning_rate": 8.728278835788719e-05,
"loss": 0.1933,
"step": 8850
},
{
"epoch": 0.12791905167426634,
"grad_norm": 0.4581696093082428,
"learning_rate": 8.721092346388789e-05,
"loss": 0.1909,
"step": 8900
},
{
"epoch": 0.12863769803198694,
"grad_norm": 0.3376115560531616,
"learning_rate": 8.713905856988861e-05,
"loss": 0.189,
"step": 8950
},
{
"epoch": 0.12935634438970756,
"grad_norm": 0.4183780550956726,
"learning_rate": 8.706719367588933e-05,
"loss": 0.1933,
"step": 9000
},
{
"epoch": 0.12935634438970756,
"eval_loss": 0.18834719061851501,
"eval_runtime": 2348.9887,
"eval_samples_per_second": 24.943,
"eval_steps_per_second": 3.118,
"step": 9000
},
{
"epoch": 0.13007499074742815,
"grad_norm": 0.36423614621162415,
"learning_rate": 8.699532878189005e-05,
"loss": 0.1926,
"step": 9050
},
{
"epoch": 0.13079363710514874,
"grad_norm": 0.47257256507873535,
"learning_rate": 8.692346388789077e-05,
"loss": 0.1982,
"step": 9100
},
{
"epoch": 0.13151228346286933,
"grad_norm": 0.2503010034561157,
"learning_rate": 8.685159899389149e-05,
"loss": 0.1874,
"step": 9150
},
{
"epoch": 0.13223092982058993,
"grad_norm": 0.2888598144054413,
"learning_rate": 8.67797340998922e-05,
"loss": 0.1927,
"step": 9200
},
{
"epoch": 0.13294957617831055,
"grad_norm": 0.6369989514350891,
"learning_rate": 8.670786920589293e-05,
"loss": 0.1843,
"step": 9250
},
{
"epoch": 0.13366822253603114,
"grad_norm": 0.33099669218063354,
"learning_rate": 8.663600431189364e-05,
"loss": 0.1872,
"step": 9300
},
{
"epoch": 0.13438686889375173,
"grad_norm": 0.3384752869606018,
"learning_rate": 8.656413941789436e-05,
"loss": 0.187,
"step": 9350
},
{
"epoch": 0.13510551525147232,
"grad_norm": 0.3799266219139099,
"learning_rate": 8.649227452389508e-05,
"loss": 0.1866,
"step": 9400
},
{
"epoch": 0.13582416160919292,
"grad_norm": 0.26931485533714294,
"learning_rate": 8.64204096298958e-05,
"loss": 0.1815,
"step": 9450
},
{
"epoch": 0.1365428079669135,
"grad_norm": 0.4372073709964752,
"learning_rate": 8.634854473589652e-05,
"loss": 0.1907,
"step": 9500
},
{
"epoch": 0.13726145432463413,
"grad_norm": 0.34800344705581665,
"learning_rate": 8.627667984189724e-05,
"loss": 0.1832,
"step": 9550
},
{
"epoch": 0.13798010068235472,
"grad_norm": 0.3955633044242859,
"learning_rate": 8.620481494789796e-05,
"loss": 0.1884,
"step": 9600
},
{
"epoch": 0.13869874704007532,
"grad_norm": 0.37053999304771423,
"learning_rate": 8.613295005389868e-05,
"loss": 0.1845,
"step": 9650
},
{
"epoch": 0.1394173933977959,
"grad_norm": 0.25118228793144226,
"learning_rate": 8.606108515989938e-05,
"loss": 0.1785,
"step": 9700
},
{
"epoch": 0.1401360397555165,
"grad_norm": 0.3882904052734375,
"learning_rate": 8.598922026590012e-05,
"loss": 0.1851,
"step": 9750
},
{
"epoch": 0.1408546861132371,
"grad_norm": 0.2123679518699646,
"learning_rate": 8.591735537190084e-05,
"loss": 0.1886,
"step": 9800
},
{
"epoch": 0.1415733324709577,
"grad_norm": 0.28561949729919434,
"learning_rate": 8.584549047790154e-05,
"loss": 0.1766,
"step": 9850
},
{
"epoch": 0.1422919788286783,
"grad_norm": 0.2941133379936218,
"learning_rate": 8.577362558390227e-05,
"loss": 0.1889,
"step": 9900
},
{
"epoch": 0.1430106251863989,
"grad_norm": 0.3725387454032898,
"learning_rate": 8.570176068990299e-05,
"loss": 0.1887,
"step": 9950
},
{
"epoch": 0.1437292715441195,
"grad_norm": 0.34473615884780884,
"learning_rate": 8.56298957959037e-05,
"loss": 0.1876,
"step": 10000
},
{
"epoch": 0.14444791790184008,
"grad_norm": 0.3055415749549866,
"learning_rate": 8.555803090190443e-05,
"loss": 0.1844,
"step": 10050
},
{
"epoch": 0.1451665642595607,
"grad_norm": 0.308893084526062,
"learning_rate": 8.548616600790515e-05,
"loss": 0.1819,
"step": 10100
},
{
"epoch": 0.1458852106172813,
"grad_norm": 0.36378395557403564,
"learning_rate": 8.541430111390586e-05,
"loss": 0.1824,
"step": 10150
},
{
"epoch": 0.1466038569750019,
"grad_norm": 0.43480271100997925,
"learning_rate": 8.534243621990659e-05,
"loss": 0.1843,
"step": 10200
},
{
"epoch": 0.14732250333272248,
"grad_norm": 0.3115452527999878,
"learning_rate": 8.527200862378728e-05,
"loss": 0.187,
"step": 10250
},
{
"epoch": 0.14804114969044307,
"grad_norm": 0.4598091244697571,
"learning_rate": 8.520014372978801e-05,
"loss": 0.188,
"step": 10300
},
{
"epoch": 0.14875979604816367,
"grad_norm": 0.3190801739692688,
"learning_rate": 8.512827883578872e-05,
"loss": 0.1888,
"step": 10350
},
{
"epoch": 0.1494784424058843,
"grad_norm": 0.33876556158065796,
"learning_rate": 8.505785123966943e-05,
"loss": 0.1945,
"step": 10400
},
{
"epoch": 0.15019708876360488,
"grad_norm": 0.49205857515335083,
"learning_rate": 8.498598634567014e-05,
"loss": 0.1862,
"step": 10450
},
{
"epoch": 0.15091573512132547,
"grad_norm": 0.2966972887516022,
"learning_rate": 8.491412145167086e-05,
"loss": 0.1822,
"step": 10500
},
{
"epoch": 0.15163438147904607,
"grad_norm": 0.33791208267211914,
"learning_rate": 8.484225655767159e-05,
"loss": 0.1854,
"step": 10550
},
{
"epoch": 0.15235302783676666,
"grad_norm": 0.34383195638656616,
"learning_rate": 8.47703916636723e-05,
"loss": 0.1809,
"step": 10600
},
{
"epoch": 0.15307167419448725,
"grad_norm": 0.3117673397064209,
"learning_rate": 8.469852676967301e-05,
"loss": 0.1782,
"step": 10650
},
{
"epoch": 0.15379032055220787,
"grad_norm": 0.38107767701148987,
"learning_rate": 8.462666187567373e-05,
"loss": 0.1776,
"step": 10700
},
{
"epoch": 0.15450896690992846,
"grad_norm": 0.30095821619033813,
"learning_rate": 8.455479698167445e-05,
"loss": 0.1802,
"step": 10750
},
{
"epoch": 0.15522761326764906,
"grad_norm": 0.3196362257003784,
"learning_rate": 8.448293208767517e-05,
"loss": 0.1823,
"step": 10800
},
{
"epoch": 0.15594625962536965,
"grad_norm": 0.4483869671821594,
"learning_rate": 8.441106719367589e-05,
"loss": 0.1847,
"step": 10850
},
{
"epoch": 0.15666490598309024,
"grad_norm": 0.32411614060401917,
"learning_rate": 8.433920229967661e-05,
"loss": 0.1851,
"step": 10900
},
{
"epoch": 0.15738355234081086,
"grad_norm": 0.48899659514427185,
"learning_rate": 8.426733740567733e-05,
"loss": 0.1864,
"step": 10950
},
{
"epoch": 0.15810219869853145,
"grad_norm": 0.38691896200180054,
"learning_rate": 8.419547251167805e-05,
"loss": 0.1869,
"step": 11000
},
{
"epoch": 0.15882084505625205,
"grad_norm": 0.34995898604393005,
"learning_rate": 8.412360761767877e-05,
"loss": 0.1796,
"step": 11050
},
{
"epoch": 0.15953949141397264,
"grad_norm": 0.32180970907211304,
"learning_rate": 8.405174272367949e-05,
"loss": 0.1857,
"step": 11100
},
{
"epoch": 0.16025813777169323,
"grad_norm": 0.286548912525177,
"learning_rate": 8.39798778296802e-05,
"loss": 0.1817,
"step": 11150
},
{
"epoch": 0.16097678412941382,
"grad_norm": 0.26015704870224,
"learning_rate": 8.390801293568093e-05,
"loss": 0.1815,
"step": 11200
},
{
"epoch": 0.16169543048713444,
"grad_norm": 0.33400681614875793,
"learning_rate": 8.383614804168164e-05,
"loss": 0.1842,
"step": 11250
},
{
"epoch": 0.16241407684485504,
"grad_norm": 0.3002636134624481,
"learning_rate": 8.376428314768236e-05,
"loss": 0.1868,
"step": 11300
},
{
"epoch": 0.16313272320257563,
"grad_norm": 0.43470489978790283,
"learning_rate": 8.369241825368308e-05,
"loss": 0.1797,
"step": 11350
},
{
"epoch": 0.16385136956029622,
"grad_norm": 0.26626238226890564,
"learning_rate": 8.36205533596838e-05,
"loss": 0.1812,
"step": 11400
},
{
"epoch": 0.16457001591801682,
"grad_norm": 0.4098244905471802,
"learning_rate": 8.354868846568452e-05,
"loss": 0.1817,
"step": 11450
},
{
"epoch": 0.1652886622757374,
"grad_norm": 0.39458343386650085,
"learning_rate": 8.347682357168524e-05,
"loss": 0.1825,
"step": 11500
},
{
"epoch": 0.16600730863345803,
"grad_norm": 0.3881920874118805,
"learning_rate": 8.340495867768595e-05,
"loss": 0.1808,
"step": 11550
},
{
"epoch": 0.16672595499117862,
"grad_norm": 0.4013212323188782,
"learning_rate": 8.333309378368668e-05,
"loss": 0.1812,
"step": 11600
},
{
"epoch": 0.1674446013488992,
"grad_norm": 0.5738546848297119,
"learning_rate": 8.32612288896874e-05,
"loss": 0.1787,
"step": 11650
},
{
"epoch": 0.1681632477066198,
"grad_norm": 0.3394758701324463,
"learning_rate": 8.31893639956881e-05,
"loss": 0.1824,
"step": 11700
},
{
"epoch": 0.1688818940643404,
"grad_norm": 0.3730837106704712,
"learning_rate": 8.311749910168884e-05,
"loss": 0.1915,
"step": 11750
},
{
"epoch": 0.16960054042206102,
"grad_norm": 0.47480309009552,
"learning_rate": 8.304563420768955e-05,
"loss": 0.1838,
"step": 11800
},
{
"epoch": 0.1703191867797816,
"grad_norm": 0.37515339255332947,
"learning_rate": 8.297376931369026e-05,
"loss": 0.1813,
"step": 11850
},
{
"epoch": 0.1710378331375022,
"grad_norm": 0.39568060636520386,
"learning_rate": 8.290190441969099e-05,
"loss": 0.1792,
"step": 11900
},
{
"epoch": 0.1717564794952228,
"grad_norm": 0.2854001224040985,
"learning_rate": 8.283003952569171e-05,
"loss": 0.1809,
"step": 11950
},
{
"epoch": 0.1724751258529434,
"grad_norm": 0.2876518964767456,
"learning_rate": 8.275817463169242e-05,
"loss": 0.1817,
"step": 12000
},
{
"epoch": 0.1724751258529434,
"eval_loss": 0.1806262880563736,
"eval_runtime": 2341.5668,
"eval_samples_per_second": 25.022,
"eval_steps_per_second": 3.128,
"step": 12000
},
{
"epoch": 0.17319377221066398,
"grad_norm": 0.32629990577697754,
"learning_rate": 8.268630973769315e-05,
"loss": 0.1814,
"step": 12050
},
{
"epoch": 0.1739124185683846,
"grad_norm": 0.3833070695400238,
"learning_rate": 8.261444484369386e-05,
"loss": 0.1794,
"step": 12100
},
{
"epoch": 0.1746310649261052,
"grad_norm": 0.3579089343547821,
"learning_rate": 8.254257994969457e-05,
"loss": 0.177,
"step": 12150
},
{
"epoch": 0.1753497112838258,
"grad_norm": 0.3816784918308258,
"learning_rate": 8.247071505569531e-05,
"loss": 0.1809,
"step": 12200
},
{
"epoch": 0.17606835764154638,
"grad_norm": 0.23603789508342743,
"learning_rate": 8.239885016169601e-05,
"loss": 0.1809,
"step": 12250
},
{
"epoch": 0.17678700399926697,
"grad_norm": 0.39853760600090027,
"learning_rate": 8.232698526769673e-05,
"loss": 0.1836,
"step": 12300
},
{
"epoch": 0.17750565035698757,
"grad_norm": 0.4424062669277191,
"learning_rate": 8.225512037369745e-05,
"loss": 0.1781,
"step": 12350
},
{
"epoch": 0.17822429671470819,
"grad_norm": 0.36165034770965576,
"learning_rate": 8.218325547969817e-05,
"loss": 0.1828,
"step": 12400
},
{
"epoch": 0.17894294307242878,
"grad_norm": 0.3120635449886322,
"learning_rate": 8.211139058569889e-05,
"loss": 0.178,
"step": 12450
},
{
"epoch": 0.17966158943014937,
"grad_norm": 0.37852615118026733,
"learning_rate": 8.203952569169961e-05,
"loss": 0.185,
"step": 12500
},
{
"epoch": 0.18038023578786996,
"grad_norm": 0.38624006509780884,
"learning_rate": 8.196766079770033e-05,
"loss": 0.1806,
"step": 12550
},
{
"epoch": 0.18109888214559056,
"grad_norm": 0.26871535181999207,
"learning_rate": 8.189579590370105e-05,
"loss": 0.181,
"step": 12600
},
{
"epoch": 0.18181752850331118,
"grad_norm": 0.3296714425086975,
"learning_rate": 8.182393100970177e-05,
"loss": 0.1803,
"step": 12650
},
{
"epoch": 0.18253617486103177,
"grad_norm": 0.2623511254787445,
"learning_rate": 8.175206611570248e-05,
"loss": 0.178,
"step": 12700
},
{
"epoch": 0.18325482121875236,
"grad_norm": 0.35815101861953735,
"learning_rate": 8.16802012217032e-05,
"loss": 0.1766,
"step": 12750
},
{
"epoch": 0.18397346757647295,
"grad_norm": 0.3817836046218872,
"learning_rate": 8.160833632770391e-05,
"loss": 0.1845,
"step": 12800
},
{
"epoch": 0.18469211393419355,
"grad_norm": 0.2875792682170868,
"learning_rate": 8.153647143370464e-05,
"loss": 0.1781,
"step": 12850
},
{
"epoch": 0.18541076029191414,
"grad_norm": 0.3971846103668213,
"learning_rate": 8.146460653970536e-05,
"loss": 0.1766,
"step": 12900
},
{
"epoch": 0.18612940664963476,
"grad_norm": 0.39499327540397644,
"learning_rate": 8.139274164570607e-05,
"loss": 0.1778,
"step": 12950
},
{
"epoch": 0.18684805300735535,
"grad_norm": 0.45246168971061707,
"learning_rate": 8.13208767517068e-05,
"loss": 0.1798,
"step": 13000
},
{
"epoch": 0.18756669936507595,
"grad_norm": 0.2600598931312561,
"learning_rate": 8.124901185770752e-05,
"loss": 0.1777,
"step": 13050
},
{
"epoch": 0.18828534572279654,
"grad_norm": 0.29498159885406494,
"learning_rate": 8.117714696370822e-05,
"loss": 0.1834,
"step": 13100
},
{
"epoch": 0.18900399208051713,
"grad_norm": 0.3159259855747223,
"learning_rate": 8.110528206970896e-05,
"loss": 0.1769,
"step": 13150
},
{
"epoch": 0.18972263843823772,
"grad_norm": 0.3468966484069824,
"learning_rate": 8.103341717570968e-05,
"loss": 0.1812,
"step": 13200
},
{
"epoch": 0.19044128479595834,
"grad_norm": 0.2982788681983948,
"learning_rate": 8.096155228171038e-05,
"loss": 0.1793,
"step": 13250
},
{
"epoch": 0.19115993115367894,
"grad_norm": 0.40844494104385376,
"learning_rate": 8.088968738771111e-05,
"loss": 0.1765,
"step": 13300
},
{
"epoch": 0.19187857751139953,
"grad_norm": 0.35525286197662354,
"learning_rate": 8.081782249371182e-05,
"loss": 0.1739,
"step": 13350
},
{
"epoch": 0.19259722386912012,
"grad_norm": 0.42295753955841064,
"learning_rate": 8.074595759971254e-05,
"loss": 0.178,
"step": 13400
},
{
"epoch": 0.1933158702268407,
"grad_norm": 0.28371748328208923,
"learning_rate": 8.067409270571327e-05,
"loss": 0.1768,
"step": 13450
},
{
"epoch": 0.19403451658456133,
"grad_norm": 0.36987873911857605,
"learning_rate": 8.060222781171398e-05,
"loss": 0.1795,
"step": 13500
},
{
"epoch": 0.19475316294228193,
"grad_norm": 0.3212873339653015,
"learning_rate": 8.05303629177147e-05,
"loss": 0.1751,
"step": 13550
},
{
"epoch": 0.19547180930000252,
"grad_norm": 0.3947288393974304,
"learning_rate": 8.045849802371543e-05,
"loss": 0.1727,
"step": 13600
},
{
"epoch": 0.1961904556577231,
"grad_norm": 0.28673598170280457,
"learning_rate": 8.038663312971613e-05,
"loss": 0.1863,
"step": 13650
},
{
"epoch": 0.1969091020154437,
"grad_norm": 0.28890731930732727,
"learning_rate": 8.031476823571685e-05,
"loss": 0.1777,
"step": 13700
},
{
"epoch": 0.1976277483731643,
"grad_norm": 0.36219891905784607,
"learning_rate": 8.024290334171759e-05,
"loss": 0.1745,
"step": 13750
},
{
"epoch": 0.19834639473088492,
"grad_norm": 0.37695997953414917,
"learning_rate": 8.017103844771829e-05,
"loss": 0.1807,
"step": 13800
},
{
"epoch": 0.1990650410886055,
"grad_norm": 0.3192157447338104,
"learning_rate": 8.009917355371901e-05,
"loss": 0.173,
"step": 13850
},
{
"epoch": 0.1997836874463261,
"grad_norm": 0.4570382833480835,
"learning_rate": 8.002730865971973e-05,
"loss": 0.1723,
"step": 13900
},
{
"epoch": 0.2005023338040467,
"grad_norm": 0.3775467276573181,
"learning_rate": 7.995544376572045e-05,
"loss": 0.1814,
"step": 13950
},
{
"epoch": 0.2012209801617673,
"grad_norm": 0.32216453552246094,
"learning_rate": 7.988357887172117e-05,
"loss": 0.1788,
"step": 14000
},
{
"epoch": 0.20193962651948788,
"grad_norm": 0.30940186977386475,
"learning_rate": 7.981171397772189e-05,
"loss": 0.1767,
"step": 14050
},
{
"epoch": 0.2026582728772085,
"grad_norm": 0.5685888528823853,
"learning_rate": 7.97398490837226e-05,
"loss": 0.1824,
"step": 14100
},
{
"epoch": 0.2033769192349291,
"grad_norm": 0.3270226716995239,
"learning_rate": 7.966798418972333e-05,
"loss": 0.1816,
"step": 14150
},
{
"epoch": 0.20409556559264969,
"grad_norm": 0.2310134619474411,
"learning_rate": 7.959611929572404e-05,
"loss": 0.1812,
"step": 14200
},
{
"epoch": 0.20481421195037028,
"grad_norm": 0.33357006311416626,
"learning_rate": 7.952425440172476e-05,
"loss": 0.1783,
"step": 14250
},
{
"epoch": 0.20553285830809087,
"grad_norm": 0.26054659485816956,
"learning_rate": 7.945238950772548e-05,
"loss": 0.1752,
"step": 14300
},
{
"epoch": 0.20625150466581146,
"grad_norm": 0.2842780649662018,
"learning_rate": 7.93805246137262e-05,
"loss": 0.1738,
"step": 14350
},
{
"epoch": 0.20697015102353208,
"grad_norm": 0.4128149747848511,
"learning_rate": 7.930865971972692e-05,
"loss": 0.1705,
"step": 14400
},
{
"epoch": 0.20768879738125268,
"grad_norm": 0.36563193798065186,
"learning_rate": 7.923823212360762e-05,
"loss": 0.1683,
"step": 14450
},
{
"epoch": 0.20840744373897327,
"grad_norm": 0.32785964012145996,
"learning_rate": 7.916636722960834e-05,
"loss": 0.1705,
"step": 14500
},
{
"epoch": 0.20912609009669386,
"grad_norm": 0.3670963943004608,
"learning_rate": 7.909450233560906e-05,
"loss": 0.1786,
"step": 14550
},
{
"epoch": 0.20984473645441445,
"grad_norm": 0.33668437600135803,
"learning_rate": 7.902263744160978e-05,
"loss": 0.1715,
"step": 14600
},
{
"epoch": 0.21056338281213507,
"grad_norm": 0.29565784335136414,
"learning_rate": 7.89507725476105e-05,
"loss": 0.1726,
"step": 14650
},
{
"epoch": 0.21128202916985567,
"grad_norm": 0.42704764008522034,
"learning_rate": 7.88789076536112e-05,
"loss": 0.1814,
"step": 14700
},
{
"epoch": 0.21200067552757626,
"grad_norm": 0.49560704827308655,
"learning_rate": 7.880704275961194e-05,
"loss": 0.1763,
"step": 14750
},
{
"epoch": 0.21271932188529685,
"grad_norm": 0.39118367433547974,
"learning_rate": 7.873517786561266e-05,
"loss": 0.1784,
"step": 14800
},
{
"epoch": 0.21343796824301745,
"grad_norm": 0.30129724740982056,
"learning_rate": 7.866331297161336e-05,
"loss": 0.1768,
"step": 14850
},
{
"epoch": 0.21415661460073804,
"grad_norm": 0.3319786787033081,
"learning_rate": 7.85914480776141e-05,
"loss": 0.1786,
"step": 14900
},
{
"epoch": 0.21487526095845866,
"grad_norm": 0.3003113865852356,
"learning_rate": 7.851958318361481e-05,
"loss": 0.1791,
"step": 14950
},
{
"epoch": 0.21559390731617925,
"grad_norm": 0.34315207600593567,
"learning_rate": 7.844771828961552e-05,
"loss": 0.173,
"step": 15000
},
{
"epoch": 0.21559390731617925,
"eval_loss": 0.17433789372444153,
"eval_runtime": 2337.8325,
"eval_samples_per_second": 25.062,
"eval_steps_per_second": 3.133,
"step": 15000
},
{
"epoch": 0.21631255367389984,
"grad_norm": 0.4181901812553406,
"learning_rate": 7.837585339561625e-05,
"loss": 0.1783,
"step": 15050
},
{
"epoch": 0.21703120003162044,
"grad_norm": 0.3226313889026642,
"learning_rate": 7.830398850161696e-05,
"loss": 0.1703,
"step": 15100
},
{
"epoch": 0.21774984638934103,
"grad_norm": 0.2730088233947754,
"learning_rate": 7.823212360761768e-05,
"loss": 0.1757,
"step": 15150
},
{
"epoch": 0.21846849274706162,
"grad_norm": 0.39994746446609497,
"learning_rate": 7.816025871361841e-05,
"loss": 0.1755,
"step": 15200
},
{
"epoch": 0.21918713910478224,
"grad_norm": 0.3416014015674591,
"learning_rate": 7.808839381961912e-05,
"loss": 0.178,
"step": 15250
},
{
"epoch": 0.21990578546250283,
"grad_norm": 0.3324638605117798,
"learning_rate": 7.801652892561983e-05,
"loss": 0.174,
"step": 15300
},
{
"epoch": 0.22062443182022343,
"grad_norm": 0.4316116273403168,
"learning_rate": 7.794466403162057e-05,
"loss": 0.1701,
"step": 15350
},
{
"epoch": 0.22134307817794402,
"grad_norm": 0.3134278655052185,
"learning_rate": 7.787279913762127e-05,
"loss": 0.1761,
"step": 15400
},
{
"epoch": 0.2220617245356646,
"grad_norm": 0.31829679012298584,
"learning_rate": 7.780093424362199e-05,
"loss": 0.1744,
"step": 15450
},
{
"epoch": 0.22278037089338523,
"grad_norm": 0.7381129264831543,
"learning_rate": 7.772906934962272e-05,
"loss": 0.1722,
"step": 15500
},
{
"epoch": 0.22349901725110582,
"grad_norm": 0.28715264797210693,
"learning_rate": 7.765720445562343e-05,
"loss": 0.1757,
"step": 15550
},
{
"epoch": 0.22421766360882642,
"grad_norm": 0.307982474565506,
"learning_rate": 7.758533956162415e-05,
"loss": 0.169,
"step": 15600
},
{
"epoch": 0.224936309966547,
"grad_norm": 0.45478326082229614,
"learning_rate": 7.751347466762487e-05,
"loss": 0.1792,
"step": 15650
},
{
"epoch": 0.2256549563242676,
"grad_norm": 0.45371562242507935,
"learning_rate": 7.744160977362559e-05,
"loss": 0.18,
"step": 15700
},
{
"epoch": 0.2263736026819882,
"grad_norm": 0.3575720489025116,
"learning_rate": 7.73697448796263e-05,
"loss": 0.1764,
"step": 15750
},
{
"epoch": 0.22709224903970882,
"grad_norm": 0.41387391090393066,
"learning_rate": 7.729787998562703e-05,
"loss": 0.1697,
"step": 15800
},
{
"epoch": 0.2278108953974294,
"grad_norm": 0.4434982240200043,
"learning_rate": 7.722601509162774e-05,
"loss": 0.1742,
"step": 15850
},
{
"epoch": 0.22852954175515,
"grad_norm": 0.44597572088241577,
"learning_rate": 7.715415019762846e-05,
"loss": 0.1745,
"step": 15900
},
{
"epoch": 0.2292481881128706,
"grad_norm": 0.3628135323524475,
"learning_rate": 7.708228530362917e-05,
"loss": 0.1703,
"step": 15950
},
{
"epoch": 0.22996683447059119,
"grad_norm": 0.28047481179237366,
"learning_rate": 7.70104204096299e-05,
"loss": 0.1743,
"step": 16000
},
{
"epoch": 0.23068548082831178,
"grad_norm": 0.3026706278324127,
"learning_rate": 7.693855551563062e-05,
"loss": 0.1694,
"step": 16050
},
{
"epoch": 0.2314041271860324,
"grad_norm": 0.3518725037574768,
"learning_rate": 7.686669062163133e-05,
"loss": 0.1751,
"step": 16100
},
{
"epoch": 0.232122773543753,
"grad_norm": 0.41190633177757263,
"learning_rate": 7.679482572763206e-05,
"loss": 0.1744,
"step": 16150
},
{
"epoch": 0.23284141990147358,
"grad_norm": 0.39950138330459595,
"learning_rate": 7.672296083363278e-05,
"loss": 0.1714,
"step": 16200
},
{
"epoch": 0.23356006625919418,
"grad_norm": 0.3585628867149353,
"learning_rate": 7.665109593963348e-05,
"loss": 0.1693,
"step": 16250
},
{
"epoch": 0.23427871261691477,
"grad_norm": 0.4004826843738556,
"learning_rate": 7.657923104563422e-05,
"loss": 0.1755,
"step": 16300
},
{
"epoch": 0.2349973589746354,
"grad_norm": 0.360116183757782,
"learning_rate": 7.650736615163494e-05,
"loss": 0.1673,
"step": 16350
},
{
"epoch": 0.23571600533235598,
"grad_norm": 0.344833642244339,
"learning_rate": 7.643550125763564e-05,
"loss": 0.1746,
"step": 16400
},
{
"epoch": 0.23643465169007657,
"grad_norm": 0.32852986454963684,
"learning_rate": 7.636363636363637e-05,
"loss": 0.1748,
"step": 16450
},
{
"epoch": 0.23715329804779717,
"grad_norm": 0.3062026798725128,
"learning_rate": 7.629177146963708e-05,
"loss": 0.1755,
"step": 16500
},
{
"epoch": 0.23787194440551776,
"grad_norm": 0.4373960793018341,
"learning_rate": 7.62199065756378e-05,
"loss": 0.1759,
"step": 16550
},
{
"epoch": 0.23859059076323835,
"grad_norm": 0.36062178015708923,
"learning_rate": 7.614804168163853e-05,
"loss": 0.1693,
"step": 16600
},
{
"epoch": 0.23930923712095897,
"grad_norm": 0.514687716960907,
"learning_rate": 7.607905138339922e-05,
"loss": 0.1721,
"step": 16650
},
{
"epoch": 0.24002788347867957,
"grad_norm": 0.2862984538078308,
"learning_rate": 7.600718648939992e-05,
"loss": 0.1748,
"step": 16700
},
{
"epoch": 0.24074652983640016,
"grad_norm": 0.4001787304878235,
"learning_rate": 7.593532159540066e-05,
"loss": 0.1663,
"step": 16750
},
{
"epoch": 0.24146517619412075,
"grad_norm": 0.4565765857696533,
"learning_rate": 7.586345670140138e-05,
"loss": 0.1766,
"step": 16800
},
{
"epoch": 0.24218382255184134,
"grad_norm": 0.2525484561920166,
"learning_rate": 7.579159180740208e-05,
"loss": 0.172,
"step": 16850
},
{
"epoch": 0.24290246890956194,
"grad_norm": 0.5037934184074402,
"learning_rate": 7.571972691340281e-05,
"loss": 0.1735,
"step": 16900
},
{
"epoch": 0.24362111526728256,
"grad_norm": 0.33634111285209656,
"learning_rate": 7.564786201940352e-05,
"loss": 0.177,
"step": 16950
},
{
"epoch": 0.24433976162500315,
"grad_norm": 0.31144094467163086,
"learning_rate": 7.557599712540424e-05,
"loss": 0.1718,
"step": 17000
},
{
"epoch": 0.24505840798272374,
"grad_norm": 0.2749430239200592,
"learning_rate": 7.550413223140497e-05,
"loss": 0.1739,
"step": 17050
},
{
"epoch": 0.24577705434044433,
"grad_norm": 0.4118638336658478,
"learning_rate": 7.543226733740568e-05,
"loss": 0.173,
"step": 17100
},
{
"epoch": 0.24649570069816493,
"grad_norm": 0.2785220146179199,
"learning_rate": 7.53604024434064e-05,
"loss": 0.1749,
"step": 17150
},
{
"epoch": 0.24721434705588555,
"grad_norm": 0.3517521321773529,
"learning_rate": 7.528853754940713e-05,
"loss": 0.1734,
"step": 17200
},
{
"epoch": 0.24793299341360614,
"grad_norm": 0.42283421754837036,
"learning_rate": 7.521667265540783e-05,
"loss": 0.1717,
"step": 17250
},
{
"epoch": 0.24865163977132673,
"grad_norm": 0.3272489607334137,
"learning_rate": 7.514480776140855e-05,
"loss": 0.1745,
"step": 17300
},
{
"epoch": 0.24937028612904732,
"grad_norm": 0.3763234317302704,
"learning_rate": 7.507294286740929e-05,
"loss": 0.1716,
"step": 17350
},
{
"epoch": 0.25008893248676795,
"grad_norm": 0.3796086013317108,
"learning_rate": 7.500107797340999e-05,
"loss": 0.1639,
"step": 17400
},
{
"epoch": 0.2508075788444885,
"grad_norm": 0.3655567169189453,
"learning_rate": 7.492921307941071e-05,
"loss": 0.1716,
"step": 17450
},
{
"epoch": 0.25152622520220913,
"grad_norm": 0.323050320148468,
"learning_rate": 7.485734818541143e-05,
"loss": 0.1704,
"step": 17500
},
{
"epoch": 0.2522448715599297,
"grad_norm": 0.3040378987789154,
"learning_rate": 7.478548329141215e-05,
"loss": 0.1655,
"step": 17550
},
{
"epoch": 0.2529635179176503,
"grad_norm": 0.371359646320343,
"learning_rate": 7.471361839741287e-05,
"loss": 0.1729,
"step": 17600
},
{
"epoch": 0.25368216427537094,
"grad_norm": 0.3760315775871277,
"learning_rate": 7.464175350341359e-05,
"loss": 0.1671,
"step": 17650
},
{
"epoch": 0.2544008106330915,
"grad_norm": 0.325546532869339,
"learning_rate": 7.45698886094143e-05,
"loss": 0.168,
"step": 17700
},
{
"epoch": 0.2551194569908121,
"grad_norm": 0.3608609735965729,
"learning_rate": 7.449802371541503e-05,
"loss": 0.1694,
"step": 17750
},
{
"epoch": 0.2558381033485327,
"grad_norm": 0.49022817611694336,
"learning_rate": 7.442615882141573e-05,
"loss": 0.1692,
"step": 17800
},
{
"epoch": 0.2565567497062533,
"grad_norm": 0.3275602459907532,
"learning_rate": 7.435429392741646e-05,
"loss": 0.1709,
"step": 17850
},
{
"epoch": 0.25727539606397387,
"grad_norm": 0.3520912230014801,
"learning_rate": 7.428242903341718e-05,
"loss": 0.1657,
"step": 17900
},
{
"epoch": 0.2579940424216945,
"grad_norm": 0.4564454257488251,
"learning_rate": 7.421056413941789e-05,
"loss": 0.1724,
"step": 17950
},
{
"epoch": 0.2587126887794151,
"grad_norm": 0.4266446530818939,
"learning_rate": 7.413869924541862e-05,
"loss": 0.1747,
"step": 18000
},
{
"epoch": 0.2587126887794151,
"eval_loss": 0.16929900646209717,
"eval_runtime": 2331.677,
"eval_samples_per_second": 25.128,
"eval_steps_per_second": 3.141,
"step": 18000
},
{
"epoch": 0.2594313351371357,
"grad_norm": 0.32367444038391113,
"learning_rate": 7.406683435141934e-05,
"loss": 0.1693,
"step": 18050
},
{
"epoch": 0.2601499814948563,
"grad_norm": 0.3971792161464691,
"learning_rate": 7.399496945742005e-05,
"loss": 0.1691,
"step": 18100
},
{
"epoch": 0.26086862785257686,
"grad_norm": 0.37220925092697144,
"learning_rate": 7.392310456342078e-05,
"loss": 0.1671,
"step": 18150
},
{
"epoch": 0.2615872742102975,
"grad_norm": 0.3948183059692383,
"learning_rate": 7.385123966942148e-05,
"loss": 0.1696,
"step": 18200
},
{
"epoch": 0.2623059205680181,
"grad_norm": 0.3619794249534607,
"learning_rate": 7.37808120733022e-05,
"loss": 0.1686,
"step": 18250
},
{
"epoch": 0.26302456692573867,
"grad_norm": 0.3705451190471649,
"learning_rate": 7.37089471793029e-05,
"loss": 0.1731,
"step": 18300
},
{
"epoch": 0.2637432132834593,
"grad_norm": 0.609754204750061,
"learning_rate": 7.363708228530364e-05,
"loss": 0.1683,
"step": 18350
},
{
"epoch": 0.26446185964117985,
"grad_norm": 0.42656949162483215,
"learning_rate": 7.356521739130436e-05,
"loss": 0.1684,
"step": 18400
},
{
"epoch": 0.2651805059989005,
"grad_norm": 0.34473884105682373,
"learning_rate": 7.349335249730506e-05,
"loss": 0.1701,
"step": 18450
},
{
"epoch": 0.2658991523566211,
"grad_norm": 0.37473055720329285,
"learning_rate": 7.34214876033058e-05,
"loss": 0.1681,
"step": 18500
},
{
"epoch": 0.26661779871434166,
"grad_norm": 0.3349181115627289,
"learning_rate": 7.334962270930651e-05,
"loss": 0.1732,
"step": 18550
},
{
"epoch": 0.2673364450720623,
"grad_norm": 0.3274621069431305,
"learning_rate": 7.327775781530722e-05,
"loss": 0.168,
"step": 18600
},
{
"epoch": 0.26805509142978284,
"grad_norm": 0.2889033257961273,
"learning_rate": 7.320589292130794e-05,
"loss": 0.1651,
"step": 18650
},
{
"epoch": 0.26877373778750346,
"grad_norm": 0.4824863374233246,
"learning_rate": 7.313402802730866e-05,
"loss": 0.1716,
"step": 18700
},
{
"epoch": 0.26949238414522403,
"grad_norm": 0.4139736294746399,
"learning_rate": 7.306216313330938e-05,
"loss": 0.1713,
"step": 18750
},
{
"epoch": 0.27021103050294465,
"grad_norm": 0.3236710727214813,
"learning_rate": 7.29902982393101e-05,
"loss": 0.1668,
"step": 18800
},
{
"epoch": 0.27092967686066527,
"grad_norm": 0.3383229970932007,
"learning_rate": 7.291843334531082e-05,
"loss": 0.1679,
"step": 18850
},
{
"epoch": 0.27164832321838583,
"grad_norm": 0.30610159039497375,
"learning_rate": 7.284656845131154e-05,
"loss": 0.1683,
"step": 18900
},
{
"epoch": 0.27236696957610645,
"grad_norm": 0.3125470280647278,
"learning_rate": 7.277470355731225e-05,
"loss": 0.1716,
"step": 18950
},
{
"epoch": 0.273085615933827,
"grad_norm": 0.4086760878562927,
"learning_rate": 7.270283866331297e-05,
"loss": 0.1664,
"step": 19000
},
{
"epoch": 0.27380426229154764,
"grad_norm": 0.3585663139820099,
"learning_rate": 7.263097376931369e-05,
"loss": 0.1647,
"step": 19050
},
{
"epoch": 0.27452290864926826,
"grad_norm": 0.3493901491165161,
"learning_rate": 7.255910887531441e-05,
"loss": 0.1671,
"step": 19100
},
{
"epoch": 0.2752415550069888,
"grad_norm": 0.37844619154930115,
"learning_rate": 7.248724398131513e-05,
"loss": 0.1654,
"step": 19150
},
{
"epoch": 0.27596020136470945,
"grad_norm": 0.3647208511829376,
"learning_rate": 7.241537908731585e-05,
"loss": 0.1672,
"step": 19200
},
{
"epoch": 0.27667884772243,
"grad_norm": 0.3517756462097168,
"learning_rate": 7.234351419331657e-05,
"loss": 0.1696,
"step": 19250
},
{
"epoch": 0.27739749408015063,
"grad_norm": 0.3229799270629883,
"learning_rate": 7.227164929931729e-05,
"loss": 0.166,
"step": 19300
},
{
"epoch": 0.27811614043787125,
"grad_norm": 0.2893189489841461,
"learning_rate": 7.219978440531801e-05,
"loss": 0.1642,
"step": 19350
},
{
"epoch": 0.2788347867955918,
"grad_norm": 0.3653119206428528,
"learning_rate": 7.212791951131873e-05,
"loss": 0.1638,
"step": 19400
},
{
"epoch": 0.27955343315331244,
"grad_norm": 0.3465476632118225,
"learning_rate": 7.205605461731945e-05,
"loss": 0.1724,
"step": 19450
},
{
"epoch": 0.280272079511033,
"grad_norm": 0.3858092129230499,
"learning_rate": 7.198418972332016e-05,
"loss": 0.1654,
"step": 19500
},
{
"epoch": 0.2809907258687536,
"grad_norm": 0.45877212285995483,
"learning_rate": 7.191232482932087e-05,
"loss": 0.1655,
"step": 19550
},
{
"epoch": 0.2817093722264742,
"grad_norm": 0.31164032220840454,
"learning_rate": 7.18404599353216e-05,
"loss": 0.1697,
"step": 19600
},
{
"epoch": 0.2824280185841948,
"grad_norm": 0.35051417350769043,
"learning_rate": 7.176859504132232e-05,
"loss": 0.1608,
"step": 19650
},
{
"epoch": 0.2831466649419154,
"grad_norm": 0.4664309620857239,
"learning_rate": 7.169673014732303e-05,
"loss": 0.1629,
"step": 19700
},
{
"epoch": 0.283865311299636,
"grad_norm": 0.29969966411590576,
"learning_rate": 7.162486525332376e-05,
"loss": 0.1711,
"step": 19750
},
{
"epoch": 0.2845839576573566,
"grad_norm": 0.3115592300891876,
"learning_rate": 7.155300035932448e-05,
"loss": 0.1585,
"step": 19800
},
{
"epoch": 0.2853026040150772,
"grad_norm": 0.35464227199554443,
"learning_rate": 7.148113546532518e-05,
"loss": 0.1615,
"step": 19850
},
{
"epoch": 0.2860212503727978,
"grad_norm": 0.30538317561149597,
"learning_rate": 7.140927057132592e-05,
"loss": 0.1658,
"step": 19900
},
{
"epoch": 0.2867398967305184,
"grad_norm": 0.4378170073032379,
"learning_rate": 7.133740567732664e-05,
"loss": 0.1666,
"step": 19950
},
{
"epoch": 0.287458543088239,
"grad_norm": 0.44222235679626465,
"learning_rate": 7.126554078332734e-05,
"loss": 0.1665,
"step": 20000
},
{
"epoch": 0.2881771894459596,
"grad_norm": 0.3468632400035858,
"learning_rate": 7.119367588932807e-05,
"loss": 0.1641,
"step": 20050
},
{
"epoch": 0.28889583580368017,
"grad_norm": 0.2810463607311249,
"learning_rate": 7.112181099532878e-05,
"loss": 0.1681,
"step": 20100
},
{
"epoch": 0.2896144821614008,
"grad_norm": 0.30284568667411804,
"learning_rate": 7.10499461013295e-05,
"loss": 0.1682,
"step": 20150
},
{
"epoch": 0.2903331285191214,
"grad_norm": 0.5109066367149353,
"learning_rate": 7.097808120733023e-05,
"loss": 0.1635,
"step": 20200
},
{
"epoch": 0.291051774876842,
"grad_norm": 0.32616057991981506,
"learning_rate": 7.090621631333094e-05,
"loss": 0.1596,
"step": 20250
},
{
"epoch": 0.2917704212345626,
"grad_norm": 0.31997406482696533,
"learning_rate": 7.083435141933166e-05,
"loss": 0.1652,
"step": 20300
},
{
"epoch": 0.29248906759228316,
"grad_norm": 0.3968636989593506,
"learning_rate": 7.076248652533239e-05,
"loss": 0.1657,
"step": 20350
},
{
"epoch": 0.2932077139500038,
"grad_norm": 0.3720139265060425,
"learning_rate": 7.06906216313331e-05,
"loss": 0.1649,
"step": 20400
},
{
"epoch": 0.29392636030772434,
"grad_norm": 0.3689219653606415,
"learning_rate": 7.061875673733381e-05,
"loss": 0.1654,
"step": 20450
},
{
"epoch": 0.29464500666544496,
"grad_norm": 0.4002225995063782,
"learning_rate": 7.054689184333453e-05,
"loss": 0.1603,
"step": 20500
},
{
"epoch": 0.2953636530231656,
"grad_norm": 0.30403926968574524,
"learning_rate": 7.047502694933525e-05,
"loss": 0.1637,
"step": 20550
},
{
"epoch": 0.29608229938088615,
"grad_norm": 0.42684683203697205,
"learning_rate": 7.040316205533597e-05,
"loss": 0.1665,
"step": 20600
},
{
"epoch": 0.29680094573860677,
"grad_norm": 0.3254065215587616,
"learning_rate": 7.033129716133669e-05,
"loss": 0.1653,
"step": 20650
},
{
"epoch": 0.29751959209632733,
"grad_norm": 0.2438507229089737,
"learning_rate": 7.025943226733741e-05,
"loss": 0.1694,
"step": 20700
},
{
"epoch": 0.29823823845404795,
"grad_norm": 0.49997007846832275,
"learning_rate": 7.018756737333813e-05,
"loss": 0.1679,
"step": 20750
},
{
"epoch": 0.2989568848117686,
"grad_norm": 0.417477548122406,
"learning_rate": 7.011570247933885e-05,
"loss": 0.1648,
"step": 20800
},
{
"epoch": 0.29967553116948914,
"grad_norm": 0.33265259861946106,
"learning_rate": 7.004383758533957e-05,
"loss": 0.1648,
"step": 20850
},
{
"epoch": 0.30039417752720976,
"grad_norm": 0.37442734837532043,
"learning_rate": 6.997197269134029e-05,
"loss": 0.1648,
"step": 20900
},
{
"epoch": 0.3011128238849303,
"grad_norm": 0.4252796173095703,
"learning_rate": 6.990010779734099e-05,
"loss": 0.1684,
"step": 20950
},
{
"epoch": 0.30183147024265095,
"grad_norm": 0.5813310742378235,
"learning_rate": 6.982824290334172e-05,
"loss": 0.1629,
"step": 21000
},
{
"epoch": 0.30183147024265095,
"eval_loss": 0.16490380465984344,
"eval_runtime": 2333.5649,
"eval_samples_per_second": 25.108,
"eval_steps_per_second": 3.139,
"step": 21000
},
{
"epoch": 0.30255011660037157,
"grad_norm": 0.3204265534877777,
"learning_rate": 6.975637800934244e-05,
"loss": 0.1652,
"step": 21050
},
{
"epoch": 0.30326876295809213,
"grad_norm": 0.4411029815673828,
"learning_rate": 6.968451311534315e-05,
"loss": 0.1665,
"step": 21100
},
{
"epoch": 0.30398740931581275,
"grad_norm": 0.2782106399536133,
"learning_rate": 6.961264822134388e-05,
"loss": 0.1642,
"step": 21150
},
{
"epoch": 0.3047060556735333,
"grad_norm": 0.3360287845134735,
"learning_rate": 6.95407833273446e-05,
"loss": 0.1662,
"step": 21200
},
{
"epoch": 0.30542470203125394,
"grad_norm": 0.3113015294075012,
"learning_rate": 6.94689184333453e-05,
"loss": 0.1665,
"step": 21250
},
{
"epoch": 0.3061433483889745,
"grad_norm": 0.3733580708503723,
"learning_rate": 6.939705353934604e-05,
"loss": 0.1613,
"step": 21300
},
{
"epoch": 0.3068619947466951,
"grad_norm": 0.3798975348472595,
"learning_rate": 6.932518864534674e-05,
"loss": 0.1599,
"step": 21350
},
{
"epoch": 0.30758064110441574,
"grad_norm": 0.4411696493625641,
"learning_rate": 6.925332375134746e-05,
"loss": 0.1613,
"step": 21400
},
{
"epoch": 0.3082992874621363,
"grad_norm": 0.29650819301605225,
"learning_rate": 6.91814588573482e-05,
"loss": 0.1619,
"step": 21450
},
{
"epoch": 0.3090179338198569,
"grad_norm": 0.32305335998535156,
"learning_rate": 6.91095939633489e-05,
"loss": 0.1578,
"step": 21500
},
{
"epoch": 0.3097365801775775,
"grad_norm": 0.3363495469093323,
"learning_rate": 6.903772906934962e-05,
"loss": 0.162,
"step": 21550
},
{
"epoch": 0.3104552265352981,
"grad_norm": 0.39038559794425964,
"learning_rate": 6.896586417535035e-05,
"loss": 0.1665,
"step": 21600
},
{
"epoch": 0.31117387289301873,
"grad_norm": 0.3347565233707428,
"learning_rate": 6.889399928135106e-05,
"loss": 0.1646,
"step": 21650
},
{
"epoch": 0.3118925192507393,
"grad_norm": 0.41805052757263184,
"learning_rate": 6.882213438735178e-05,
"loss": 0.1581,
"step": 21700
},
{
"epoch": 0.3126111656084599,
"grad_norm": 0.31128305196762085,
"learning_rate": 6.875026949335251e-05,
"loss": 0.1608,
"step": 21750
},
{
"epoch": 0.3133298119661805,
"grad_norm": 0.3111174702644348,
"learning_rate": 6.867840459935322e-05,
"loss": 0.1646,
"step": 21800
},
{
"epoch": 0.3140484583239011,
"grad_norm": 0.3375270366668701,
"learning_rate": 6.860653970535394e-05,
"loss": 0.1635,
"step": 21850
},
{
"epoch": 0.3147671046816217,
"grad_norm": 0.2751725912094116,
"learning_rate": 6.853467481135465e-05,
"loss": 0.1612,
"step": 21900
},
{
"epoch": 0.3154857510393423,
"grad_norm": 0.31462764739990234,
"learning_rate": 6.846280991735537e-05,
"loss": 0.1686,
"step": 21950
},
{
"epoch": 0.3162043973970629,
"grad_norm": 0.3871542811393738,
"learning_rate": 6.839094502335609e-05,
"loss": 0.1576,
"step": 22000
},
{
"epoch": 0.3169230437547835,
"grad_norm": 0.36108696460723877,
"learning_rate": 6.831908012935681e-05,
"loss": 0.1604,
"step": 22050
},
{
"epoch": 0.3176416901125041,
"grad_norm": 0.48631274700164795,
"learning_rate": 6.824721523535753e-05,
"loss": 0.1607,
"step": 22100
},
{
"epoch": 0.31836033647022466,
"grad_norm": 0.4918723404407501,
"learning_rate": 6.817535034135825e-05,
"loss": 0.1581,
"step": 22150
},
{
"epoch": 0.3190789828279453,
"grad_norm": 0.40690186619758606,
"learning_rate": 6.810348544735897e-05,
"loss": 0.1651,
"step": 22200
},
{
"epoch": 0.3197976291856659,
"grad_norm": 0.38321653008461,
"learning_rate": 6.803162055335969e-05,
"loss": 0.1617,
"step": 22250
},
{
"epoch": 0.32051627554338646,
"grad_norm": 0.38967370986938477,
"learning_rate": 6.795975565936041e-05,
"loss": 0.1655,
"step": 22300
},
{
"epoch": 0.3212349219011071,
"grad_norm": 0.37661367654800415,
"learning_rate": 6.788932806324111e-05,
"loss": 0.1634,
"step": 22350
},
{
"epoch": 0.32195356825882765,
"grad_norm": 0.2743198871612549,
"learning_rate": 6.781746316924183e-05,
"loss": 0.1672,
"step": 22400
},
{
"epoch": 0.32267221461654827,
"grad_norm": 0.37295830249786377,
"learning_rate": 6.774559827524255e-05,
"loss": 0.1564,
"step": 22450
},
{
"epoch": 0.3233908609742689,
"grad_norm": 0.32078495621681213,
"learning_rate": 6.767373338124327e-05,
"loss": 0.1609,
"step": 22500
},
{
"epoch": 0.32410950733198945,
"grad_norm": 0.3385615944862366,
"learning_rate": 6.760186848724399e-05,
"loss": 0.1648,
"step": 22550
},
{
"epoch": 0.3248281536897101,
"grad_norm": 0.3343018889427185,
"learning_rate": 6.75300035932447e-05,
"loss": 0.1618,
"step": 22600
},
{
"epoch": 0.32554680004743064,
"grad_norm": 0.43768858909606934,
"learning_rate": 6.745813869924542e-05,
"loss": 0.1633,
"step": 22650
},
{
"epoch": 0.32626544640515126,
"grad_norm": 0.26847851276397705,
"learning_rate": 6.738627380524613e-05,
"loss": 0.164,
"step": 22700
},
{
"epoch": 0.3269840927628719,
"grad_norm": 0.4442514181137085,
"learning_rate": 6.731440891124686e-05,
"loss": 0.1582,
"step": 22750
},
{
"epoch": 0.32770273912059245,
"grad_norm": 0.40202733874320984,
"learning_rate": 6.724254401724758e-05,
"loss": 0.1628,
"step": 22800
},
{
"epoch": 0.32842138547831307,
"grad_norm": 0.3353193402290344,
"learning_rate": 6.717067912324829e-05,
"loss": 0.1669,
"step": 22850
},
{
"epoch": 0.32914003183603363,
"grad_norm": 0.3123689591884613,
"learning_rate": 6.709881422924902e-05,
"loss": 0.1625,
"step": 22900
},
{
"epoch": 0.32985867819375425,
"grad_norm": 0.38110092282295227,
"learning_rate": 6.702694933524974e-05,
"loss": 0.1627,
"step": 22950
},
{
"epoch": 0.3305773245514748,
"grad_norm": 0.46331876516342163,
"learning_rate": 6.695508444125044e-05,
"loss": 0.162,
"step": 23000
},
{
"epoch": 0.33129597090919544,
"grad_norm": 0.272176593542099,
"learning_rate": 6.688321954725118e-05,
"loss": 0.1574,
"step": 23050
},
{
"epoch": 0.33201461726691606,
"grad_norm": 0.43498551845550537,
"learning_rate": 6.681135465325188e-05,
"loss": 0.1633,
"step": 23100
},
{
"epoch": 0.3327332636246366,
"grad_norm": 0.2999245822429657,
"learning_rate": 6.67394897592526e-05,
"loss": 0.1622,
"step": 23150
},
{
"epoch": 0.33345190998235724,
"grad_norm": 0.31831660866737366,
"learning_rate": 6.666762486525333e-05,
"loss": 0.1589,
"step": 23200
},
{
"epoch": 0.3341705563400778,
"grad_norm": 0.33583468198776245,
"learning_rate": 6.659575997125404e-05,
"loss": 0.1615,
"step": 23250
},
{
"epoch": 0.3348892026977984,
"grad_norm": 0.4801647365093231,
"learning_rate": 6.652389507725476e-05,
"loss": 0.1617,
"step": 23300
},
{
"epoch": 0.33560784905551905,
"grad_norm": 0.3660373389720917,
"learning_rate": 6.645203018325549e-05,
"loss": 0.1608,
"step": 23350
},
{
"epoch": 0.3363264954132396,
"grad_norm": 0.3422200679779053,
"learning_rate": 6.63801652892562e-05,
"loss": 0.1698,
"step": 23400
},
{
"epoch": 0.33704514177096023,
"grad_norm": 0.3418559730052948,
"learning_rate": 6.630830039525692e-05,
"loss": 0.1634,
"step": 23450
},
{
"epoch": 0.3377637881286808,
"grad_norm": 0.2738020420074463,
"learning_rate": 6.623643550125765e-05,
"loss": 0.1593,
"step": 23500
},
{
"epoch": 0.3384824344864014,
"grad_norm": 0.28123342990875244,
"learning_rate": 6.616457060725836e-05,
"loss": 0.1605,
"step": 23550
},
{
"epoch": 0.33920108084412204,
"grad_norm": 0.2701767683029175,
"learning_rate": 6.609270571325907e-05,
"loss": 0.162,
"step": 23600
},
{
"epoch": 0.3399197272018426,
"grad_norm": 0.3176400065422058,
"learning_rate": 6.60208408192598e-05,
"loss": 0.1603,
"step": 23650
},
{
"epoch": 0.3406383735595632,
"grad_norm": 0.359195739030838,
"learning_rate": 6.594897592526051e-05,
"loss": 0.1624,
"step": 23700
},
{
"epoch": 0.3413570199172838,
"grad_norm": 0.4724487364292145,
"learning_rate": 6.587711103126123e-05,
"loss": 0.1587,
"step": 23750
},
{
"epoch": 0.3420756662750044,
"grad_norm": 0.3473828434944153,
"learning_rate": 6.580524613726195e-05,
"loss": 0.1618,
"step": 23800
},
{
"epoch": 0.342794312632725,
"grad_norm": 0.25780320167541504,
"learning_rate": 6.573338124326267e-05,
"loss": 0.1617,
"step": 23850
},
{
"epoch": 0.3435129589904456,
"grad_norm": 0.43437114357948303,
"learning_rate": 6.566151634926339e-05,
"loss": 0.1653,
"step": 23900
},
{
"epoch": 0.3442316053481662,
"grad_norm": 0.3810344934463501,
"learning_rate": 6.558965145526411e-05,
"loss": 0.1609,
"step": 23950
},
{
"epoch": 0.3449502517058868,
"grad_norm": 0.32231196761131287,
"learning_rate": 6.551778656126483e-05,
"loss": 0.1661,
"step": 24000
},
{
"epoch": 0.3449502517058868,
"eval_loss": 0.16020701825618744,
"eval_runtime": 2356.1463,
"eval_samples_per_second": 24.867,
"eval_steps_per_second": 3.108,
"step": 24000
},
{
"epoch": 0.3456688980636074,
"grad_norm": 0.3379141688346863,
"learning_rate": 6.544592166726555e-05,
"loss": 0.162,
"step": 24050
},
{
"epoch": 0.34638754442132796,
"grad_norm": 0.36986809968948364,
"learning_rate": 6.537405677326627e-05,
"loss": 0.1561,
"step": 24100
},
{
"epoch": 0.3471061907790486,
"grad_norm": 0.5383297801017761,
"learning_rate": 6.530219187926698e-05,
"loss": 0.1589,
"step": 24150
},
{
"epoch": 0.3478248371367692,
"grad_norm": 0.46481168270111084,
"learning_rate": 6.52303269852677e-05,
"loss": 0.1628,
"step": 24200
},
{
"epoch": 0.34854348349448977,
"grad_norm": 0.4319482743740082,
"learning_rate": 6.515846209126842e-05,
"loss": 0.1617,
"step": 24250
},
{
"epoch": 0.3492621298522104,
"grad_norm": 0.6843165159225464,
"learning_rate": 6.508659719726914e-05,
"loss": 0.1592,
"step": 24300
},
{
"epoch": 0.34998077620993095,
"grad_norm": 0.5816138982772827,
"learning_rate": 6.501473230326986e-05,
"loss": 0.1654,
"step": 24350
},
{
"epoch": 0.3506994225676516,
"grad_norm": 0.42936670780181885,
"learning_rate": 6.494286740927057e-05,
"loss": 0.1641,
"step": 24400
},
{
"epoch": 0.3514180689253722,
"grad_norm": 0.480822890996933,
"learning_rate": 6.48710025152713e-05,
"loss": 0.1569,
"step": 24450
},
{
"epoch": 0.35213671528309276,
"grad_norm": 0.39662879705429077,
"learning_rate": 6.4799137621272e-05,
"loss": 0.1595,
"step": 24500
},
{
"epoch": 0.3528553616408134,
"grad_norm": 0.31965863704681396,
"learning_rate": 6.472727272727272e-05,
"loss": 0.1616,
"step": 24550
},
{
"epoch": 0.35357400799853395,
"grad_norm": 0.3041664958000183,
"learning_rate": 6.465540783327346e-05,
"loss": 0.1576,
"step": 24600
},
{
"epoch": 0.35429265435625457,
"grad_norm": 0.32472431659698486,
"learning_rate": 6.458354293927416e-05,
"loss": 0.1577,
"step": 24650
},
{
"epoch": 0.35501130071397513,
"grad_norm": 0.6908242106437683,
"learning_rate": 6.451167804527488e-05,
"loss": 0.1614,
"step": 24700
},
{
"epoch": 0.35572994707169575,
"grad_norm": 0.31418710947036743,
"learning_rate": 6.443981315127561e-05,
"loss": 0.1586,
"step": 24750
},
{
"epoch": 0.35644859342941637,
"grad_norm": 0.4417416453361511,
"learning_rate": 6.436794825727632e-05,
"loss": 0.1563,
"step": 24800
},
{
"epoch": 0.35716723978713694,
"grad_norm": 0.35909807682037354,
"learning_rate": 6.429608336327704e-05,
"loss": 0.1576,
"step": 24850
},
{
"epoch": 0.35788588614485756,
"grad_norm": 0.6350358128547668,
"learning_rate": 6.422421846927776e-05,
"loss": 0.1644,
"step": 24900
},
{
"epoch": 0.3586045325025781,
"grad_norm": 0.368534117937088,
"learning_rate": 6.415235357527848e-05,
"loss": 0.1619,
"step": 24950
},
{
"epoch": 0.35932317886029874,
"grad_norm": 0.3108366131782532,
"learning_rate": 6.40804886812792e-05,
"loss": 0.1635,
"step": 25000
},
{
"epoch": 0.36004182521801936,
"grad_norm": 0.31624144315719604,
"learning_rate": 6.400862378727991e-05,
"loss": 0.1561,
"step": 25050
},
{
"epoch": 0.3607604715757399,
"grad_norm": 0.5008798837661743,
"learning_rate": 6.393675889328063e-05,
"loss": 0.1589,
"step": 25100
},
{
"epoch": 0.36147911793346055,
"grad_norm": 0.4550321400165558,
"learning_rate": 6.386489399928135e-05,
"loss": 0.1612,
"step": 25150
},
{
"epoch": 0.3621977642911811,
"grad_norm": 0.3721817433834076,
"learning_rate": 6.379302910528207e-05,
"loss": 0.1625,
"step": 25200
},
{
"epoch": 0.36291641064890173,
"grad_norm": 0.3496086001396179,
"learning_rate": 6.372116421128279e-05,
"loss": 0.1582,
"step": 25250
},
{
"epoch": 0.36363505700662235,
"grad_norm": 0.4079247713088989,
"learning_rate": 6.364929931728351e-05,
"loss": 0.1636,
"step": 25300
},
{
"epoch": 0.3643537033643429,
"grad_norm": 0.42480820417404175,
"learning_rate": 6.357743442328423e-05,
"loss": 0.1602,
"step": 25350
},
{
"epoch": 0.36507234972206354,
"grad_norm": 0.46133843064308167,
"learning_rate": 6.350556952928495e-05,
"loss": 0.1621,
"step": 25400
},
{
"epoch": 0.3657909960797841,
"grad_norm": 0.43702232837677,
"learning_rate": 6.343370463528567e-05,
"loss": 0.1581,
"step": 25450
},
{
"epoch": 0.3665096424375047,
"grad_norm": 0.30601567029953003,
"learning_rate": 6.336183974128639e-05,
"loss": 0.1582,
"step": 25500
},
{
"epoch": 0.3672282887952253,
"grad_norm": 0.33580270409584045,
"learning_rate": 6.32899748472871e-05,
"loss": 0.1644,
"step": 25550
},
{
"epoch": 0.3679469351529459,
"grad_norm": 0.3324548006057739,
"learning_rate": 6.321810995328782e-05,
"loss": 0.1618,
"step": 25600
},
{
"epoch": 0.36866558151066653,
"grad_norm": 0.3337075114250183,
"learning_rate": 6.314624505928854e-05,
"loss": 0.1571,
"step": 25650
},
{
"epoch": 0.3693842278683871,
"grad_norm": 0.30700790882110596,
"learning_rate": 6.307438016528926e-05,
"loss": 0.158,
"step": 25700
},
{
"epoch": 0.3701028742261077,
"grad_norm": 0.4422316551208496,
"learning_rate": 6.300251527128997e-05,
"loss": 0.1548,
"step": 25750
},
{
"epoch": 0.3708215205838283,
"grad_norm": 0.42883041501045227,
"learning_rate": 6.29306503772907e-05,
"loss": 0.1569,
"step": 25800
},
{
"epoch": 0.3715401669415489,
"grad_norm": 0.34244126081466675,
"learning_rate": 6.285878548329142e-05,
"loss": 0.1597,
"step": 25850
},
{
"epoch": 0.3722588132992695,
"grad_norm": 0.32595932483673096,
"learning_rate": 6.278692058929213e-05,
"loss": 0.1548,
"step": 25900
},
{
"epoch": 0.3729774596569901,
"grad_norm": 0.34954744577407837,
"learning_rate": 6.271505569529286e-05,
"loss": 0.1587,
"step": 25950
},
{
"epoch": 0.3736961060147107,
"grad_norm": 0.35941386222839355,
"learning_rate": 6.264319080129358e-05,
"loss": 0.1587,
"step": 26000
},
{
"epoch": 0.37441475237243127,
"grad_norm": 0.44000008702278137,
"learning_rate": 6.257132590729428e-05,
"loss": 0.1569,
"step": 26050
},
{
"epoch": 0.3751333987301519,
"grad_norm": 0.28137120604515076,
"learning_rate": 6.249946101329502e-05,
"loss": 0.1557,
"step": 26100
},
{
"epoch": 0.3758520450878725,
"grad_norm": 0.40533679723739624,
"learning_rate": 6.242759611929573e-05,
"loss": 0.1575,
"step": 26150
},
{
"epoch": 0.3765706914455931,
"grad_norm": 0.40215086936950684,
"learning_rate": 6.235573122529644e-05,
"loss": 0.1585,
"step": 26200
},
{
"epoch": 0.3772893378033137,
"grad_norm": 0.3472813069820404,
"learning_rate": 6.228386633129717e-05,
"loss": 0.1609,
"step": 26250
},
{
"epoch": 0.37800798416103426,
"grad_norm": 0.32452771067619324,
"learning_rate": 6.221200143729788e-05,
"loss": 0.1634,
"step": 26300
},
{
"epoch": 0.3787266305187549,
"grad_norm": 0.37654146552085876,
"learning_rate": 6.21415738411786e-05,
"loss": 0.1604,
"step": 26350
},
{
"epoch": 0.37944527687647545,
"grad_norm": 0.46723291277885437,
"learning_rate": 6.20697089471793e-05,
"loss": 0.157,
"step": 26400
},
{
"epoch": 0.38016392323419607,
"grad_norm": 0.42815640568733215,
"learning_rate": 6.199784405318002e-05,
"loss": 0.1615,
"step": 26450
},
{
"epoch": 0.3808825695919167,
"grad_norm": 0.4379606246948242,
"learning_rate": 6.192597915918075e-05,
"loss": 0.1638,
"step": 26500
},
{
"epoch": 0.38160121594963725,
"grad_norm": 0.5562979578971863,
"learning_rate": 6.185411426518146e-05,
"loss": 0.1608,
"step": 26550
},
{
"epoch": 0.38231986230735787,
"grad_norm": 0.33051741123199463,
"learning_rate": 6.178224937118218e-05,
"loss": 0.1548,
"step": 26600
},
{
"epoch": 0.38303850866507844,
"grad_norm": 0.2941145598888397,
"learning_rate": 6.171038447718291e-05,
"loss": 0.1551,
"step": 26650
},
{
"epoch": 0.38375715502279906,
"grad_norm": 0.4036601781845093,
"learning_rate": 6.163851958318362e-05,
"loss": 0.1592,
"step": 26700
},
{
"epoch": 0.3844758013805197,
"grad_norm": 0.4525456726551056,
"learning_rate": 6.156665468918433e-05,
"loss": 0.1553,
"step": 26750
},
{
"epoch": 0.38519444773824024,
"grad_norm": 0.4872748553752899,
"learning_rate": 6.149478979518505e-05,
"loss": 0.1582,
"step": 26800
},
{
"epoch": 0.38591309409596086,
"grad_norm": 0.451123982667923,
"learning_rate": 6.142292490118577e-05,
"loss": 0.1578,
"step": 26850
},
{
"epoch": 0.3866317404536814,
"grad_norm": 0.3543640077114105,
"learning_rate": 6.135106000718649e-05,
"loss": 0.1588,
"step": 26900
},
{
"epoch": 0.38735038681140205,
"grad_norm": 0.38010174036026,
"learning_rate": 6.127919511318721e-05,
"loss": 0.1575,
"step": 26950
},
{
"epoch": 0.38806903316912267,
"grad_norm": 0.39789631962776184,
"learning_rate": 6.120733021918793e-05,
"loss": 0.1578,
"step": 27000
},
{
"epoch": 0.38806903316912267,
"eval_loss": 0.15559689700603485,
"eval_runtime": 2336.5126,
"eval_samples_per_second": 25.076,
"eval_steps_per_second": 3.135,
"step": 27000
},
{
"epoch": 0.38878767952684323,
"grad_norm": 0.3860076367855072,
"learning_rate": 6.113546532518865e-05,
"loss": 0.1538,
"step": 27050
},
{
"epoch": 0.38950632588456385,
"grad_norm": 0.27461615204811096,
"learning_rate": 6.106360043118937e-05,
"loss": 0.1517,
"step": 27100
},
{
"epoch": 0.3902249722422844,
"grad_norm": 0.46942609548568726,
"learning_rate": 6.099173553719009e-05,
"loss": 0.1556,
"step": 27150
},
{
"epoch": 0.39094361860000504,
"grad_norm": 0.3067554831504822,
"learning_rate": 6.09198706431908e-05,
"loss": 0.1582,
"step": 27200
},
{
"epoch": 0.3916622649577256,
"grad_norm": 0.3194333016872406,
"learning_rate": 6.0848005749191526e-05,
"loss": 0.1576,
"step": 27250
},
{
"epoch": 0.3923809113154462,
"grad_norm": 0.5251829028129578,
"learning_rate": 6.0776140855192245e-05,
"loss": 0.1543,
"step": 27300
},
{
"epoch": 0.39309955767316684,
"grad_norm": 0.4211423099040985,
"learning_rate": 6.070427596119296e-05,
"loss": 0.1629,
"step": 27350
},
{
"epoch": 0.3938182040308874,
"grad_norm": 0.4318183660507202,
"learning_rate": 6.063241106719368e-05,
"loss": 0.1557,
"step": 27400
},
{
"epoch": 0.39453685038860803,
"grad_norm": 0.5136430263519287,
"learning_rate": 6.05605461731944e-05,
"loss": 0.157,
"step": 27450
},
{
"epoch": 0.3952554967463286,
"grad_norm": 0.35453012585639954,
"learning_rate": 6.0488681279195114e-05,
"loss": 0.1535,
"step": 27500
},
{
"epoch": 0.3959741431040492,
"grad_norm": 0.44323351979255676,
"learning_rate": 6.041681638519584e-05,
"loss": 0.1595,
"step": 27550
},
{
"epoch": 0.39669278946176983,
"grad_norm": 0.29345712065696716,
"learning_rate": 6.034495149119655e-05,
"loss": 0.1557,
"step": 27600
},
{
"epoch": 0.3974114358194904,
"grad_norm": 0.2903861701488495,
"learning_rate": 6.027308659719727e-05,
"loss": 0.1608,
"step": 27650
},
{
"epoch": 0.398130082177211,
"grad_norm": 0.30161532759666443,
"learning_rate": 6.0201221703197984e-05,
"loss": 0.1586,
"step": 27700
},
{
"epoch": 0.3988487285349316,
"grad_norm": 0.3740021288394928,
"learning_rate": 6.012935680919871e-05,
"loss": 0.154,
"step": 27750
},
{
"epoch": 0.3995673748926522,
"grad_norm": 0.4624473750591278,
"learning_rate": 6.005749191519943e-05,
"loss": 0.1544,
"step": 27800
},
{
"epoch": 0.4002860212503728,
"grad_norm": 0.36845239996910095,
"learning_rate": 5.998562702120014e-05,
"loss": 0.1556,
"step": 27850
},
{
"epoch": 0.4010046676080934,
"grad_norm": 0.5025461912155151,
"learning_rate": 5.991376212720087e-05,
"loss": 0.1531,
"step": 27900
},
{
"epoch": 0.401723313965814,
"grad_norm": 0.6607873439788818,
"learning_rate": 5.984189723320158e-05,
"loss": 0.1573,
"step": 27950
},
{
"epoch": 0.4024419603235346,
"grad_norm": 0.5143309831619263,
"learning_rate": 5.97700323392023e-05,
"loss": 0.1534,
"step": 28000
},
{
"epoch": 0.4031606066812552,
"grad_norm": 0.3975028991699219,
"learning_rate": 5.9698167445203024e-05,
"loss": 0.1568,
"step": 28050
},
{
"epoch": 0.40387925303897576,
"grad_norm": 0.3697468936443329,
"learning_rate": 5.962630255120374e-05,
"loss": 0.1476,
"step": 28100
},
{
"epoch": 0.4045978993966964,
"grad_norm": 0.8748229146003723,
"learning_rate": 5.9554437657204456e-05,
"loss": 0.1586,
"step": 28150
},
{
"epoch": 0.405316545754417,
"grad_norm": 0.43097078800201416,
"learning_rate": 5.948257276320518e-05,
"loss": 0.1576,
"step": 28200
},
{
"epoch": 0.40603519211213757,
"grad_norm": 0.3834463059902191,
"learning_rate": 5.9410707869205894e-05,
"loss": 0.1534,
"step": 28250
},
{
"epoch": 0.4067538384698582,
"grad_norm": 0.42111456394195557,
"learning_rate": 5.933884297520661e-05,
"loss": 0.1562,
"step": 28300
},
{
"epoch": 0.40747248482757875,
"grad_norm": 0.27892324328422546,
"learning_rate": 5.926697808120734e-05,
"loss": 0.1528,
"step": 28350
},
{
"epoch": 0.40819113118529937,
"grad_norm": 0.36843934655189514,
"learning_rate": 5.919655048508804e-05,
"loss": 0.1545,
"step": 28400
},
{
"epoch": 0.40890977754302,
"grad_norm": 0.4540693163871765,
"learning_rate": 5.9124685591088755e-05,
"loss": 0.1545,
"step": 28450
},
{
"epoch": 0.40962842390074056,
"grad_norm": 0.5054967403411865,
"learning_rate": 5.9052820697089474e-05,
"loss": 0.1549,
"step": 28500
},
{
"epoch": 0.4103470702584612,
"grad_norm": 0.27066469192504883,
"learning_rate": 5.89809558030902e-05,
"loss": 0.1517,
"step": 28550
},
{
"epoch": 0.41106571661618174,
"grad_norm": 0.3877571225166321,
"learning_rate": 5.890909090909091e-05,
"loss": 0.1524,
"step": 28600
},
{
"epoch": 0.41178436297390236,
"grad_norm": 0.385812371969223,
"learning_rate": 5.883722601509163e-05,
"loss": 0.1538,
"step": 28650
},
{
"epoch": 0.4125030093316229,
"grad_norm": 0.3364986777305603,
"learning_rate": 5.876536112109234e-05,
"loss": 0.1555,
"step": 28700
},
{
"epoch": 0.41322165568934355,
"grad_norm": 0.46070176362991333,
"learning_rate": 5.869349622709307e-05,
"loss": 0.1506,
"step": 28750
},
{
"epoch": 0.41394030204706417,
"grad_norm": 0.4020332396030426,
"learning_rate": 5.862163133309379e-05,
"loss": 0.1509,
"step": 28800
},
{
"epoch": 0.41465894840478473,
"grad_norm": 0.47747594118118286,
"learning_rate": 5.85497664390945e-05,
"loss": 0.1526,
"step": 28850
},
{
"epoch": 0.41537759476250535,
"grad_norm": 0.45608797669410706,
"learning_rate": 5.8477901545095226e-05,
"loss": 0.1543,
"step": 28900
},
{
"epoch": 0.4160962411202259,
"grad_norm": 0.2811453640460968,
"learning_rate": 5.840603665109594e-05,
"loss": 0.157,
"step": 28950
},
{
"epoch": 0.41681488747794654,
"grad_norm": 0.32047563791275024,
"learning_rate": 5.833417175709666e-05,
"loss": 0.1539,
"step": 29000
},
{
"epoch": 0.41753353383566716,
"grad_norm": 0.482090026140213,
"learning_rate": 5.8262306863097384e-05,
"loss": 0.1604,
"step": 29050
},
{
"epoch": 0.4182521801933877,
"grad_norm": 0.30156487226486206,
"learning_rate": 5.819187926697809e-05,
"loss": 0.155,
"step": 29100
},
{
"epoch": 0.41897082655110834,
"grad_norm": 0.36799871921539307,
"learning_rate": 5.8120014372978806e-05,
"loss": 0.1542,
"step": 29150
},
{
"epoch": 0.4196894729088289,
"grad_norm": 0.44371145963668823,
"learning_rate": 5.804814947897952e-05,
"loss": 0.1523,
"step": 29200
},
{
"epoch": 0.42040811926654953,
"grad_norm": 0.45741862058639526,
"learning_rate": 5.7976284584980244e-05,
"loss": 0.1526,
"step": 29250
},
{
"epoch": 0.42112676562427015,
"grad_norm": 0.33403101563453674,
"learning_rate": 5.790441969098096e-05,
"loss": 0.1534,
"step": 29300
},
{
"epoch": 0.4218454119819907,
"grad_norm": 0.31555646657943726,
"learning_rate": 5.7832554796981676e-05,
"loss": 0.1525,
"step": 29350
},
{
"epoch": 0.42256405833971133,
"grad_norm": 0.33370161056518555,
"learning_rate": 5.77606899029824e-05,
"loss": 0.159,
"step": 29400
},
{
"epoch": 0.4232827046974319,
"grad_norm": 0.41524410247802734,
"learning_rate": 5.7688825008983114e-05,
"loss": 0.1538,
"step": 29450
},
{
"epoch": 0.4240013510551525,
"grad_norm": 0.3497067391872406,
"learning_rate": 5.761696011498383e-05,
"loss": 0.1498,
"step": 29500
},
{
"epoch": 0.4247199974128731,
"grad_norm": 0.37955665588378906,
"learning_rate": 5.754509522098456e-05,
"loss": 0.1492,
"step": 29550
},
{
"epoch": 0.4254386437705937,
"grad_norm": 0.30920735001564026,
"learning_rate": 5.747323032698527e-05,
"loss": 0.1589,
"step": 29600
},
{
"epoch": 0.4261572901283143,
"grad_norm": 0.3532569110393524,
"learning_rate": 5.740136543298599e-05,
"loss": 0.1536,
"step": 29650
},
{
"epoch": 0.4268759364860349,
"grad_norm": 0.33438754081726074,
"learning_rate": 5.73295005389867e-05,
"loss": 0.1527,
"step": 29700
},
{
"epoch": 0.4275945828437555,
"grad_norm": 0.376600980758667,
"learning_rate": 5.725763564498743e-05,
"loss": 0.1521,
"step": 29750
},
{
"epoch": 0.4283132292014761,
"grad_norm": 0.48702993988990784,
"learning_rate": 5.718577075098814e-05,
"loss": 0.1545,
"step": 29800
},
{
"epoch": 0.4290318755591967,
"grad_norm": 0.2475581169128418,
"learning_rate": 5.711390585698886e-05,
"loss": 0.1544,
"step": 29850
},
{
"epoch": 0.4297505219169173,
"grad_norm": 0.37837105989456177,
"learning_rate": 5.7042040962989586e-05,
"loss": 0.1541,
"step": 29900
},
{
"epoch": 0.4304691682746379,
"grad_norm": 0.47116902470588684,
"learning_rate": 5.69701760689903e-05,
"loss": 0.1502,
"step": 29950
},
{
"epoch": 0.4311878146323585,
"grad_norm": 0.29833170771598816,
"learning_rate": 5.689831117499102e-05,
"loss": 0.1537,
"step": 30000
},
{
"epoch": 0.4311878146323585,
"eval_loss": 0.15278169512748718,
"eval_runtime": 2339.1715,
"eval_samples_per_second": 25.047,
"eval_steps_per_second": 3.131,
"step": 30000
},
{
"epoch": 0.43190646099007907,
"grad_norm": 0.3521724045276642,
"learning_rate": 5.682644628099174e-05,
"loss": 0.1482,
"step": 30050
},
{
"epoch": 0.4326251073477997,
"grad_norm": 0.2992171049118042,
"learning_rate": 5.6754581386992455e-05,
"loss": 0.1515,
"step": 30100
},
{
"epoch": 0.4333437537055203,
"grad_norm": 0.2566670775413513,
"learning_rate": 5.668271649299317e-05,
"loss": 0.155,
"step": 30150
},
{
"epoch": 0.43406240006324087,
"grad_norm": 0.42938894033432007,
"learning_rate": 5.66108515989939e-05,
"loss": 0.1514,
"step": 30200
},
{
"epoch": 0.4347810464209615,
"grad_norm": 0.3730209469795227,
"learning_rate": 5.653898670499461e-05,
"loss": 0.152,
"step": 30250
},
{
"epoch": 0.43549969277868206,
"grad_norm": 0.269544780254364,
"learning_rate": 5.6467121810995325e-05,
"loss": 0.1555,
"step": 30300
},
{
"epoch": 0.4362183391364027,
"grad_norm": 0.331620991230011,
"learning_rate": 5.639525691699605e-05,
"loss": 0.1558,
"step": 30350
},
{
"epoch": 0.43693698549412324,
"grad_norm": 0.2786395847797394,
"learning_rate": 5.632339202299677e-05,
"loss": 0.151,
"step": 30400
},
{
"epoch": 0.43765563185184386,
"grad_norm": 0.3796517550945282,
"learning_rate": 5.625152712899748e-05,
"loss": 0.1513,
"step": 30450
},
{
"epoch": 0.4383742782095645,
"grad_norm": 0.48707351088523865,
"learning_rate": 5.617966223499821e-05,
"loss": 0.1513,
"step": 30500
},
{
"epoch": 0.43909292456728505,
"grad_norm": 0.391174852848053,
"learning_rate": 5.610779734099893e-05,
"loss": 0.1536,
"step": 30550
},
{
"epoch": 0.43981157092500567,
"grad_norm": 0.4588553309440613,
"learning_rate": 5.603593244699964e-05,
"loss": 0.1506,
"step": 30600
},
{
"epoch": 0.44053021728272623,
"grad_norm": 0.6206002235412598,
"learning_rate": 5.5964067553000365e-05,
"loss": 0.1563,
"step": 30650
},
{
"epoch": 0.44124886364044685,
"grad_norm": 0.4559854567050934,
"learning_rate": 5.589220265900108e-05,
"loss": 0.1465,
"step": 30700
},
{
"epoch": 0.4419675099981675,
"grad_norm": 0.3629855811595917,
"learning_rate": 5.58203377650018e-05,
"loss": 0.1523,
"step": 30750
},
{
"epoch": 0.44268615635588804,
"grad_norm": 0.43928229808807373,
"learning_rate": 5.574847287100252e-05,
"loss": 0.1535,
"step": 30800
},
{
"epoch": 0.44340480271360866,
"grad_norm": 0.5060630440711975,
"learning_rate": 5.5676607977003235e-05,
"loss": 0.1504,
"step": 30850
},
{
"epoch": 0.4441234490713292,
"grad_norm": 0.7647538781166077,
"learning_rate": 5.5604743083003954e-05,
"loss": 0.1535,
"step": 30900
},
{
"epoch": 0.44484209542904984,
"grad_norm": 0.4331282377243042,
"learning_rate": 5.553287818900468e-05,
"loss": 0.1534,
"step": 30950
},
{
"epoch": 0.44556074178677046,
"grad_norm": 0.34580302238464355,
"learning_rate": 5.546101329500539e-05,
"loss": 0.1502,
"step": 31000
},
{
"epoch": 0.44627938814449103,
"grad_norm": 0.36288365721702576,
"learning_rate": 5.5389148401006105e-05,
"loss": 0.1549,
"step": 31050
},
{
"epoch": 0.44699803450221165,
"grad_norm": 0.319622278213501,
"learning_rate": 5.531728350700684e-05,
"loss": 0.1529,
"step": 31100
},
{
"epoch": 0.4477166808599322,
"grad_norm": 0.3031866252422333,
"learning_rate": 5.524541861300755e-05,
"loss": 0.152,
"step": 31150
},
{
"epoch": 0.44843532721765283,
"grad_norm": 0.6403375864028931,
"learning_rate": 5.517355371900826e-05,
"loss": 0.1491,
"step": 31200
},
{
"epoch": 0.4491539735753734,
"grad_norm": 0.3669109642505646,
"learning_rate": 5.510168882500899e-05,
"loss": 0.1505,
"step": 31250
},
{
"epoch": 0.449872619933094,
"grad_norm": 0.3309599459171295,
"learning_rate": 5.502982393100971e-05,
"loss": 0.1496,
"step": 31300
},
{
"epoch": 0.45059126629081464,
"grad_norm": 0.40921223163604736,
"learning_rate": 5.495795903701042e-05,
"loss": 0.1513,
"step": 31350
},
{
"epoch": 0.4513099126485352,
"grad_norm": 0.34538429975509644,
"learning_rate": 5.4886094143011145e-05,
"loss": 0.1477,
"step": 31400
},
{
"epoch": 0.4520285590062558,
"grad_norm": 0.3140852451324463,
"learning_rate": 5.4814229249011864e-05,
"loss": 0.1501,
"step": 31450
},
{
"epoch": 0.4527472053639764,
"grad_norm": 0.34375426173210144,
"learning_rate": 5.474236435501258e-05,
"loss": 0.1486,
"step": 31500
},
{
"epoch": 0.453465851721697,
"grad_norm": 0.2683236002922058,
"learning_rate": 5.46704994610133e-05,
"loss": 0.1521,
"step": 31550
},
{
"epoch": 0.45418449807941763,
"grad_norm": 0.2604835331439972,
"learning_rate": 5.4598634567014015e-05,
"loss": 0.1524,
"step": 31600
},
{
"epoch": 0.4549031444371382,
"grad_norm": 0.3806206285953522,
"learning_rate": 5.4526769673014734e-05,
"loss": 0.1537,
"step": 31650
},
{
"epoch": 0.4556217907948588,
"grad_norm": 0.34805959463119507,
"learning_rate": 5.445490477901546e-05,
"loss": 0.1507,
"step": 31700
},
{
"epoch": 0.4563404371525794,
"grad_norm": 0.36547720432281494,
"learning_rate": 5.438303988501617e-05,
"loss": 0.1544,
"step": 31750
},
{
"epoch": 0.4570590835103,
"grad_norm": 0.428213506937027,
"learning_rate": 5.431117499101689e-05,
"loss": 0.1522,
"step": 31800
},
{
"epoch": 0.4577777298680206,
"grad_norm": 0.42825666069984436,
"learning_rate": 5.423931009701762e-05,
"loss": 0.149,
"step": 31850
},
{
"epoch": 0.4584963762257412,
"grad_norm": 0.45687320828437805,
"learning_rate": 5.416744520301833e-05,
"loss": 0.1492,
"step": 31900
},
{
"epoch": 0.4592150225834618,
"grad_norm": 0.313772976398468,
"learning_rate": 5.409558030901904e-05,
"loss": 0.1475,
"step": 31950
},
{
"epoch": 0.45993366894118237,
"grad_norm": 0.43313875794410706,
"learning_rate": 5.402371541501976e-05,
"loss": 0.1533,
"step": 32000
},
{
"epoch": 0.460652315298903,
"grad_norm": 0.3474908173084259,
"learning_rate": 5.395185052102049e-05,
"loss": 0.1527,
"step": 32050
},
{
"epoch": 0.46137096165662356,
"grad_norm": 0.38463565707206726,
"learning_rate": 5.38799856270212e-05,
"loss": 0.1522,
"step": 32100
},
{
"epoch": 0.4620896080143442,
"grad_norm": 0.27655136585235596,
"learning_rate": 5.380812073302192e-05,
"loss": 0.1491,
"step": 32150
},
{
"epoch": 0.4628082543720648,
"grad_norm": 0.3056930899620056,
"learning_rate": 5.3736255839022644e-05,
"loss": 0.1439,
"step": 32200
},
{
"epoch": 0.46352690072978536,
"grad_norm": 0.4399576187133789,
"learning_rate": 5.3664390945023356e-05,
"loss": 0.1514,
"step": 32250
},
{
"epoch": 0.464245547087506,
"grad_norm": 0.32986631989479065,
"learning_rate": 5.3592526051024076e-05,
"loss": 0.1508,
"step": 32300
},
{
"epoch": 0.46496419344522655,
"grad_norm": 0.290811687707901,
"learning_rate": 5.35206611570248e-05,
"loss": 0.153,
"step": 32350
},
{
"epoch": 0.46568283980294717,
"grad_norm": 0.6335808634757996,
"learning_rate": 5.3448796263025514e-05,
"loss": 0.151,
"step": 32400
},
{
"epoch": 0.4664014861606678,
"grad_norm": 0.2713414132595062,
"learning_rate": 5.3376931369026226e-05,
"loss": 0.1501,
"step": 32450
},
{
"epoch": 0.46712013251838835,
"grad_norm": 0.5587482452392578,
"learning_rate": 5.330506647502695e-05,
"loss": 0.1495,
"step": 32500
},
{
"epoch": 0.467838778876109,
"grad_norm": 0.36849066615104675,
"learning_rate": 5.323320158102767e-05,
"loss": 0.1489,
"step": 32550
},
{
"epoch": 0.46855742523382954,
"grad_norm": 0.2901330590248108,
"learning_rate": 5.3161336687028383e-05,
"loss": 0.1488,
"step": 32600
},
{
"epoch": 0.46927607159155016,
"grad_norm": 0.28899720311164856,
"learning_rate": 5.308947179302911e-05,
"loss": 0.15,
"step": 32650
},
{
"epoch": 0.4699947179492708,
"grad_norm": 0.5915825963020325,
"learning_rate": 5.301760689902983e-05,
"loss": 0.1513,
"step": 32700
},
{
"epoch": 0.47071336430699134,
"grad_norm": 0.3386688828468323,
"learning_rate": 5.294574200503054e-05,
"loss": 0.1484,
"step": 32750
},
{
"epoch": 0.47143201066471196,
"grad_norm": 0.3731253445148468,
"learning_rate": 5.2873877111031267e-05,
"loss": 0.1532,
"step": 32800
},
{
"epoch": 0.47215065702243253,
"grad_norm": 0.31815454363822937,
"learning_rate": 5.280201221703198e-05,
"loss": 0.148,
"step": 32850
},
{
"epoch": 0.47286930338015315,
"grad_norm": 0.30257225036621094,
"learning_rate": 5.27301473230327e-05,
"loss": 0.1509,
"step": 32900
},
{
"epoch": 0.4735879497378737,
"grad_norm": 0.3791181743144989,
"learning_rate": 5.2658282429033424e-05,
"loss": 0.1484,
"step": 32950
},
{
"epoch": 0.47430659609559433,
"grad_norm": 0.39685317873954773,
"learning_rate": 5.2586417535034136e-05,
"loss": 0.1489,
"step": 33000
},
{
"epoch": 0.47430659609559433,
"eval_loss": 0.14942093193531036,
"eval_runtime": 2342.5781,
"eval_samples_per_second": 25.011,
"eval_steps_per_second": 3.126,
"step": 33000
},
{
"epoch": 0.47502524245331496,
"grad_norm": 0.5523746013641357,
"learning_rate": 5.2514552641034855e-05,
"loss": 0.147,
"step": 33050
},
{
"epoch": 0.4757438888110355,
"grad_norm": 0.36720308661460876,
"learning_rate": 5.244268774703558e-05,
"loss": 0.1517,
"step": 33100
},
{
"epoch": 0.47646253516875614,
"grad_norm": 0.45619773864746094,
"learning_rate": 5.237226015091628e-05,
"loss": 0.1517,
"step": 33150
},
{
"epoch": 0.4771811815264767,
"grad_norm": 0.31895503401756287,
"learning_rate": 5.2300395256917003e-05,
"loss": 0.1487,
"step": 33200
},
{
"epoch": 0.4778998278841973,
"grad_norm": 0.37323054671287537,
"learning_rate": 5.2228530362917716e-05,
"loss": 0.1479,
"step": 33250
},
{
"epoch": 0.47861847424191795,
"grad_norm": 0.4316665232181549,
"learning_rate": 5.215666546891843e-05,
"loss": 0.1465,
"step": 33300
},
{
"epoch": 0.4793371205996385,
"grad_norm": 0.4115291237831116,
"learning_rate": 5.2084800574919154e-05,
"loss": 0.1506,
"step": 33350
},
{
"epoch": 0.48005576695735913,
"grad_norm": 0.343179851770401,
"learning_rate": 5.201293568091987e-05,
"loss": 0.1473,
"step": 33400
},
{
"epoch": 0.4807744133150797,
"grad_norm": 0.3537336587905884,
"learning_rate": 5.1941070786920585e-05,
"loss": 0.149,
"step": 33450
},
{
"epoch": 0.4814930596728003,
"grad_norm": 0.34489813446998596,
"learning_rate": 5.186920589292131e-05,
"loss": 0.148,
"step": 33500
},
{
"epoch": 0.48221170603052094,
"grad_norm": 0.526101291179657,
"learning_rate": 5.179734099892203e-05,
"loss": 0.153,
"step": 33550
},
{
"epoch": 0.4829303523882415,
"grad_norm": 0.3146929144859314,
"learning_rate": 5.172547610492274e-05,
"loss": 0.1484,
"step": 33600
},
{
"epoch": 0.4836489987459621,
"grad_norm": 0.4303942918777466,
"learning_rate": 5.165361121092347e-05,
"loss": 0.1465,
"step": 33650
},
{
"epoch": 0.4843676451036827,
"grad_norm": 0.4743385910987854,
"learning_rate": 5.158174631692419e-05,
"loss": 0.1505,
"step": 33700
},
{
"epoch": 0.4850862914614033,
"grad_norm": 0.4490325152873993,
"learning_rate": 5.15098814229249e-05,
"loss": 0.1468,
"step": 33750
},
{
"epoch": 0.48580493781912387,
"grad_norm": 0.3729170858860016,
"learning_rate": 5.1438016528925626e-05,
"loss": 0.1495,
"step": 33800
},
{
"epoch": 0.4865235841768445,
"grad_norm": 0.2781628668308258,
"learning_rate": 5.136615163492634e-05,
"loss": 0.1479,
"step": 33850
},
{
"epoch": 0.4872422305345651,
"grad_norm": 0.30333954095840454,
"learning_rate": 5.129428674092706e-05,
"loss": 0.1433,
"step": 33900
},
{
"epoch": 0.4879608768922857,
"grad_norm": 0.30281704664230347,
"learning_rate": 5.122242184692778e-05,
"loss": 0.1454,
"step": 33950
},
{
"epoch": 0.4886795232500063,
"grad_norm": 0.25618281960487366,
"learning_rate": 5.1150556952928496e-05,
"loss": 0.151,
"step": 34000
},
{
"epoch": 0.48939816960772686,
"grad_norm": 0.34702569246292114,
"learning_rate": 5.1078692058929215e-05,
"loss": 0.1481,
"step": 34050
},
{
"epoch": 0.4901168159654475,
"grad_norm": 0.3548518121242523,
"learning_rate": 5.100682716492994e-05,
"loss": 0.1474,
"step": 34100
},
{
"epoch": 0.4908354623231681,
"grad_norm": 0.36157211661338806,
"learning_rate": 5.093496227093065e-05,
"loss": 0.1503,
"step": 34150
},
{
"epoch": 0.49155410868088867,
"grad_norm": 0.3797595500946045,
"learning_rate": 5.0863097376931365e-05,
"loss": 0.1432,
"step": 34200
},
{
"epoch": 0.4922727550386093,
"grad_norm": 0.33411136269569397,
"learning_rate": 5.079123248293209e-05,
"loss": 0.1497,
"step": 34250
},
{
"epoch": 0.49299140139632985,
"grad_norm": 0.5178517699241638,
"learning_rate": 5.071936758893281e-05,
"loss": 0.1526,
"step": 34300
},
{
"epoch": 0.4937100477540505,
"grad_norm": 0.4469529986381531,
"learning_rate": 5.064750269493352e-05,
"loss": 0.1478,
"step": 34350
},
{
"epoch": 0.4944286941117711,
"grad_norm": 0.3018459975719452,
"learning_rate": 5.057563780093425e-05,
"loss": 0.1444,
"step": 34400
},
{
"epoch": 0.49514734046949166,
"grad_norm": 0.38588500022888184,
"learning_rate": 5.050377290693497e-05,
"loss": 0.1492,
"step": 34450
},
{
"epoch": 0.4958659868272123,
"grad_norm": 0.38262608647346497,
"learning_rate": 5.043190801293568e-05,
"loss": 0.1454,
"step": 34500
},
{
"epoch": 0.49658463318493284,
"grad_norm": 0.32095280289649963,
"learning_rate": 5.0360043118936406e-05,
"loss": 0.1479,
"step": 34550
},
{
"epoch": 0.49730327954265346,
"grad_norm": 0.3346179127693176,
"learning_rate": 5.0288178224937125e-05,
"loss": 0.1511,
"step": 34600
},
{
"epoch": 0.49802192590037403,
"grad_norm": 0.5138089060783386,
"learning_rate": 5.021631333093784e-05,
"loss": 0.1511,
"step": 34650
},
{
"epoch": 0.49874057225809465,
"grad_norm": 0.664122462272644,
"learning_rate": 5.014444843693856e-05,
"loss": 0.149,
"step": 34700
},
{
"epoch": 0.49945921861581527,
"grad_norm": 0.43910887837409973,
"learning_rate": 5.0072583542939275e-05,
"loss": 0.149,
"step": 34750
},
{
"epoch": 0.5001778649735359,
"grad_norm": 0.4011009633541107,
"learning_rate": 5.0000718648939994e-05,
"loss": 0.1481,
"step": 34800
},
{
"epoch": 0.5008965113312565,
"grad_norm": 0.2885836660861969,
"learning_rate": 4.9928853754940713e-05,
"loss": 0.15,
"step": 34850
},
{
"epoch": 0.501615157688977,
"grad_norm": 0.5149396061897278,
"learning_rate": 4.985698886094143e-05,
"loss": 0.1506,
"step": 34900
},
{
"epoch": 0.5023338040466976,
"grad_norm": 0.3503366708755493,
"learning_rate": 4.978512396694215e-05,
"loss": 0.1457,
"step": 34950
},
{
"epoch": 0.5030524504044183,
"grad_norm": 0.4478318691253662,
"learning_rate": 4.971325907294287e-05,
"loss": 0.1468,
"step": 35000
},
{
"epoch": 0.5037710967621388,
"grad_norm": 0.4077318608760834,
"learning_rate": 4.964139417894359e-05,
"loss": 0.1421,
"step": 35050
},
{
"epoch": 0.5044897431198594,
"grad_norm": 0.4490613341331482,
"learning_rate": 4.95695292849443e-05,
"loss": 0.1471,
"step": 35100
},
{
"epoch": 0.5052083894775801,
"grad_norm": 0.4023280441761017,
"learning_rate": 4.949766439094503e-05,
"loss": 0.1467,
"step": 35150
},
{
"epoch": 0.5059270358353006,
"grad_norm": 0.426633358001709,
"learning_rate": 4.942579949694575e-05,
"loss": 0.1457,
"step": 35200
},
{
"epoch": 0.5066456821930212,
"grad_norm": 0.45067334175109863,
"learning_rate": 4.935537190082645e-05,
"loss": 0.1445,
"step": 35250
},
{
"epoch": 0.5073643285507419,
"grad_norm": 0.39662429690361023,
"learning_rate": 4.928350700682717e-05,
"loss": 0.145,
"step": 35300
},
{
"epoch": 0.5080829749084624,
"grad_norm": 0.3792784512042999,
"learning_rate": 4.921164211282789e-05,
"loss": 0.1501,
"step": 35350
},
{
"epoch": 0.508801621266183,
"grad_norm": 0.5802880525588989,
"learning_rate": 4.91397772188286e-05,
"loss": 0.1479,
"step": 35400
},
{
"epoch": 0.5095202676239036,
"grad_norm": 0.27767547965049744,
"learning_rate": 4.906791232482933e-05,
"loss": 0.1466,
"step": 35450
},
{
"epoch": 0.5102389139816242,
"grad_norm": 0.32960420846939087,
"learning_rate": 4.8996047430830046e-05,
"loss": 0.1441,
"step": 35500
},
{
"epoch": 0.5109575603393448,
"grad_norm": 0.3106556832790375,
"learning_rate": 4.892418253683076e-05,
"loss": 0.1495,
"step": 35550
},
{
"epoch": 0.5116762066970654,
"grad_norm": 0.37083184719085693,
"learning_rate": 4.885231764283148e-05,
"loss": 0.1483,
"step": 35600
},
{
"epoch": 0.512394853054786,
"grad_norm": 0.3685917854309082,
"learning_rate": 4.87804527488322e-05,
"loss": 0.1476,
"step": 35650
},
{
"epoch": 0.5131134994125066,
"grad_norm": 0.4368564784526825,
"learning_rate": 4.8708587854832915e-05,
"loss": 0.1474,
"step": 35700
},
{
"epoch": 0.5138321457702272,
"grad_norm": 0.3055019676685333,
"learning_rate": 4.8636722960833635e-05,
"loss": 0.1421,
"step": 35750
},
{
"epoch": 0.5145507921279477,
"grad_norm": 0.4695027470588684,
"learning_rate": 4.8564858066834354e-05,
"loss": 0.1493,
"step": 35800
},
{
"epoch": 0.5152694384856684,
"grad_norm": 0.35940343141555786,
"learning_rate": 4.849299317283507e-05,
"loss": 0.1489,
"step": 35850
},
{
"epoch": 0.515988084843389,
"grad_norm": 0.37435460090637207,
"learning_rate": 4.842112827883579e-05,
"loss": 0.1475,
"step": 35900
},
{
"epoch": 0.5167067312011095,
"grad_norm": 0.35947975516319275,
"learning_rate": 4.8349263384836504e-05,
"loss": 0.1467,
"step": 35950
},
{
"epoch": 0.5174253775588302,
"grad_norm": 0.3311944007873535,
"learning_rate": 4.827739849083723e-05,
"loss": 0.1465,
"step": 36000
},
{
"epoch": 0.5174253775588302,
"eval_loss": 0.14617860317230225,
"eval_runtime": 2341.814,
"eval_samples_per_second": 25.019,
"eval_steps_per_second": 3.127,
"step": 36000
},
{
"epoch": 0.5181440239165508,
"grad_norm": 0.3348549008369446,
"learning_rate": 4.820553359683795e-05,
"loss": 0.1499,
"step": 36050
},
{
"epoch": 0.5188626702742714,
"grad_norm": 0.3845024108886719,
"learning_rate": 4.813366870283866e-05,
"loss": 0.1479,
"step": 36100
},
{
"epoch": 0.519581316631992,
"grad_norm": 0.3920484483242035,
"learning_rate": 4.806324110671937e-05,
"loss": 0.143,
"step": 36150
},
{
"epoch": 0.5202999629897126,
"grad_norm": 0.3186359405517578,
"learning_rate": 4.799137621272009e-05,
"loss": 0.1464,
"step": 36200
},
{
"epoch": 0.5210186093474332,
"grad_norm": 0.3029921054840088,
"learning_rate": 4.79195113187208e-05,
"loss": 0.1479,
"step": 36250
},
{
"epoch": 0.5217372557051537,
"grad_norm": 0.37382078170776367,
"learning_rate": 4.784764642472153e-05,
"loss": 0.1424,
"step": 36300
},
{
"epoch": 0.5224559020628744,
"grad_norm": 0.3550768792629242,
"learning_rate": 4.777578153072225e-05,
"loss": 0.142,
"step": 36350
},
{
"epoch": 0.523174548420595,
"grad_norm": 0.33143946528434753,
"learning_rate": 4.770391663672296e-05,
"loss": 0.1453,
"step": 36400
},
{
"epoch": 0.5238931947783155,
"grad_norm": 0.40416479110717773,
"learning_rate": 4.763205174272368e-05,
"loss": 0.1435,
"step": 36450
},
{
"epoch": 0.5246118411360362,
"grad_norm": 0.3630949854850769,
"learning_rate": 4.7560186848724405e-05,
"loss": 0.1464,
"step": 36500
},
{
"epoch": 0.5253304874937568,
"grad_norm": 0.3661783039569855,
"learning_rate": 4.748832195472512e-05,
"loss": 0.1428,
"step": 36550
},
{
"epoch": 0.5260491338514773,
"grad_norm": 0.38454943895339966,
"learning_rate": 4.7416457060725837e-05,
"loss": 0.1437,
"step": 36600
},
{
"epoch": 0.5267677802091979,
"grad_norm": 0.4658641517162323,
"learning_rate": 4.734459216672656e-05,
"loss": 0.1445,
"step": 36650
},
{
"epoch": 0.5274864265669186,
"grad_norm": 0.3436949551105499,
"learning_rate": 4.7272727272727275e-05,
"loss": 0.1446,
"step": 36700
},
{
"epoch": 0.5282050729246391,
"grad_norm": 0.34002795815467834,
"learning_rate": 4.7200862378727994e-05,
"loss": 0.1441,
"step": 36750
},
{
"epoch": 0.5289237192823597,
"grad_norm": 0.40276002883911133,
"learning_rate": 4.712899748472871e-05,
"loss": 0.1479,
"step": 36800
},
{
"epoch": 0.5296423656400804,
"grad_norm": 0.5200607776641846,
"learning_rate": 4.705713259072943e-05,
"loss": 0.1443,
"step": 36850
},
{
"epoch": 0.530361011997801,
"grad_norm": 0.46847838163375854,
"learning_rate": 4.698526769673015e-05,
"loss": 0.1416,
"step": 36900
},
{
"epoch": 0.5310796583555215,
"grad_norm": 0.4312261641025543,
"learning_rate": 4.6913402802730863e-05,
"loss": 0.1483,
"step": 36950
},
{
"epoch": 0.5317983047132422,
"grad_norm": 0.3275109827518463,
"learning_rate": 4.684153790873159e-05,
"loss": 0.1467,
"step": 37000
},
{
"epoch": 0.5325169510709628,
"grad_norm": 0.4498741030693054,
"learning_rate": 4.676967301473231e-05,
"loss": 0.146,
"step": 37050
},
{
"epoch": 0.5332355974286833,
"grad_norm": 0.3524952530860901,
"learning_rate": 4.669780812073302e-05,
"loss": 0.1437,
"step": 37100
},
{
"epoch": 0.5339542437864039,
"grad_norm": 0.4064757823944092,
"learning_rate": 4.662594322673374e-05,
"loss": 0.1462,
"step": 37150
},
{
"epoch": 0.5346728901441246,
"grad_norm": 0.3992777168750763,
"learning_rate": 4.6554078332734466e-05,
"loss": 0.1451,
"step": 37200
},
{
"epoch": 0.5353915365018451,
"grad_norm": 0.34734034538269043,
"learning_rate": 4.648221343873518e-05,
"loss": 0.1478,
"step": 37250
},
{
"epoch": 0.5361101828595657,
"grad_norm": 0.29704445600509644,
"learning_rate": 4.64103485447359e-05,
"loss": 0.1421,
"step": 37300
},
{
"epoch": 0.5368288292172864,
"grad_norm": 0.3567189574241638,
"learning_rate": 4.6338483650736616e-05,
"loss": 0.1462,
"step": 37350
},
{
"epoch": 0.5375474755750069,
"grad_norm": 0.49130842089653015,
"learning_rate": 4.6266618756737335e-05,
"loss": 0.146,
"step": 37400
},
{
"epoch": 0.5382661219327275,
"grad_norm": 0.44548994302749634,
"learning_rate": 4.6194753862738055e-05,
"loss": 0.1409,
"step": 37450
},
{
"epoch": 0.5389847682904481,
"grad_norm": 0.426877498626709,
"learning_rate": 4.6122888968738774e-05,
"loss": 0.1421,
"step": 37500
},
{
"epoch": 0.5397034146481687,
"grad_norm": 0.4113141596317291,
"learning_rate": 4.605102407473949e-05,
"loss": 0.1478,
"step": 37550
},
{
"epoch": 0.5404220610058893,
"grad_norm": 0.4359269440174103,
"learning_rate": 4.597915918074021e-05,
"loss": 0.1393,
"step": 37600
},
{
"epoch": 0.5411407073636099,
"grad_norm": 0.5452132821083069,
"learning_rate": 4.590729428674093e-05,
"loss": 0.1489,
"step": 37650
},
{
"epoch": 0.5418593537213305,
"grad_norm": 0.41318729519844055,
"learning_rate": 4.583542939274165e-05,
"loss": 0.1413,
"step": 37700
},
{
"epoch": 0.5425780000790511,
"grad_norm": 0.35488757491111755,
"learning_rate": 4.576356449874237e-05,
"loss": 0.1415,
"step": 37750
},
{
"epoch": 0.5432966464367717,
"grad_norm": 0.34059152007102966,
"learning_rate": 4.569169960474309e-05,
"loss": 0.1456,
"step": 37800
},
{
"epoch": 0.5440152927944923,
"grad_norm": 0.35003551840782166,
"learning_rate": 4.56198347107438e-05,
"loss": 0.1491,
"step": 37850
},
{
"epoch": 0.5447339391522129,
"grad_norm": 0.25030508637428284,
"learning_rate": 4.5547969816744526e-05,
"loss": 0.1429,
"step": 37900
},
{
"epoch": 0.5454525855099335,
"grad_norm": 0.3320457339286804,
"learning_rate": 4.547610492274524e-05,
"loss": 0.1458,
"step": 37950
},
{
"epoch": 0.546171231867654,
"grad_norm": 0.29865363240242004,
"learning_rate": 4.540424002874596e-05,
"loss": 0.1432,
"step": 38000
},
{
"epoch": 0.5468898782253747,
"grad_norm": 0.29522374272346497,
"learning_rate": 4.533237513474668e-05,
"loss": 0.1429,
"step": 38050
},
{
"epoch": 0.5476085245830953,
"grad_norm": 0.29843688011169434,
"learning_rate": 4.5260510240747396e-05,
"loss": 0.1457,
"step": 38100
},
{
"epoch": 0.5483271709408158,
"grad_norm": 0.42439109086990356,
"learning_rate": 4.5188645346748115e-05,
"loss": 0.1434,
"step": 38150
},
{
"epoch": 0.5490458172985365,
"grad_norm": 0.4063067138195038,
"learning_rate": 4.5116780452748834e-05,
"loss": 0.1432,
"step": 38200
},
{
"epoch": 0.5497644636562571,
"grad_norm": 0.39855289459228516,
"learning_rate": 4.5044915558749553e-05,
"loss": 0.1461,
"step": 38250
},
{
"epoch": 0.5504831100139777,
"grad_norm": 0.3334510624408722,
"learning_rate": 4.497305066475027e-05,
"loss": 0.1463,
"step": 38300
},
{
"epoch": 0.5512017563716982,
"grad_norm": 0.4082428812980652,
"learning_rate": 4.490118577075099e-05,
"loss": 0.1466,
"step": 38350
},
{
"epoch": 0.5519204027294189,
"grad_norm": 0.32873401045799255,
"learning_rate": 4.4829320876751704e-05,
"loss": 0.1445,
"step": 38400
},
{
"epoch": 0.5526390490871395,
"grad_norm": 0.31031402945518494,
"learning_rate": 4.475745598275243e-05,
"loss": 0.1433,
"step": 38450
},
{
"epoch": 0.55335769544486,
"grad_norm": 0.45551490783691406,
"learning_rate": 4.468559108875315e-05,
"loss": 0.1398,
"step": 38500
},
{
"epoch": 0.5540763418025807,
"grad_norm": 0.3071674406528473,
"learning_rate": 4.461372619475386e-05,
"loss": 0.1447,
"step": 38550
},
{
"epoch": 0.5547949881603013,
"grad_norm": 0.4662153422832489,
"learning_rate": 4.454186130075459e-05,
"loss": 0.1416,
"step": 38600
},
{
"epoch": 0.5555136345180218,
"grad_norm": 0.42972332239151,
"learning_rate": 4.4469996406755306e-05,
"loss": 0.1448,
"step": 38650
},
{
"epoch": 0.5562322808757425,
"grad_norm": 0.5573573112487793,
"learning_rate": 4.439813151275602e-05,
"loss": 0.1449,
"step": 38700
},
{
"epoch": 0.5569509272334631,
"grad_norm": 0.460609495639801,
"learning_rate": 4.432626661875674e-05,
"loss": 0.1413,
"step": 38750
},
{
"epoch": 0.5576695735911836,
"grad_norm": 0.410047322511673,
"learning_rate": 4.4254401724757464e-05,
"loss": 0.1417,
"step": 38800
},
{
"epoch": 0.5583882199489042,
"grad_norm": 0.31715336441993713,
"learning_rate": 4.4182536830758176e-05,
"loss": 0.1396,
"step": 38850
},
{
"epoch": 0.5591068663066249,
"grad_norm": 0.2956278324127197,
"learning_rate": 4.4110671936758895e-05,
"loss": 0.1428,
"step": 38900
},
{
"epoch": 0.5598255126643454,
"grad_norm": 0.48904120922088623,
"learning_rate": 4.4038807042759614e-05,
"loss": 0.1451,
"step": 38950
},
{
"epoch": 0.560544159022066,
"grad_norm": 0.4974726140499115,
"learning_rate": 4.396694214876033e-05,
"loss": 0.141,
"step": 39000
},
{
"epoch": 0.560544159022066,
"eval_loss": 0.14253196120262146,
"eval_runtime": 2331.8967,
"eval_samples_per_second": 25.125,
"eval_steps_per_second": 3.141,
"step": 39000
},
{
"epoch": 0.5612628053797867,
"grad_norm": 0.386654794216156,
"learning_rate": 4.389507725476105e-05,
"loss": 0.1405,
"step": 39050
},
{
"epoch": 0.5619814517375072,
"grad_norm": 0.5421323776245117,
"learning_rate": 4.3823212360761765e-05,
"loss": 0.1433,
"step": 39100
},
{
"epoch": 0.5627000980952278,
"grad_norm": 0.3224097788333893,
"learning_rate": 4.375134746676249e-05,
"loss": 0.1406,
"step": 39150
},
{
"epoch": 0.5634187444529484,
"grad_norm": 0.29007741808891296,
"learning_rate": 4.367948257276321e-05,
"loss": 0.1444,
"step": 39200
},
{
"epoch": 0.564137390810669,
"grad_norm": 0.3608921468257904,
"learning_rate": 4.360761767876392e-05,
"loss": 0.1391,
"step": 39250
},
{
"epoch": 0.5648560371683896,
"grad_norm": 0.37967294454574585,
"learning_rate": 4.353575278476464e-05,
"loss": 0.1382,
"step": 39300
},
{
"epoch": 0.5655746835261102,
"grad_norm": 0.3170875310897827,
"learning_rate": 4.346388789076537e-05,
"loss": 0.1406,
"step": 39350
},
{
"epoch": 0.5662933298838309,
"grad_norm": 0.2862553596496582,
"learning_rate": 4.339202299676608e-05,
"loss": 0.1452,
"step": 39400
},
{
"epoch": 0.5670119762415514,
"grad_norm": 0.37836143374443054,
"learning_rate": 4.33201581027668e-05,
"loss": 0.1476,
"step": 39450
},
{
"epoch": 0.567730622599272,
"grad_norm": 0.4403735399246216,
"learning_rate": 4.324829320876752e-05,
"loss": 0.1419,
"step": 39500
},
{
"epoch": 0.5684492689569927,
"grad_norm": 0.2952041029930115,
"learning_rate": 4.3176428314768237e-05,
"loss": 0.1399,
"step": 39550
},
{
"epoch": 0.5691679153147132,
"grad_norm": 0.3520998954772949,
"learning_rate": 4.3104563420768956e-05,
"loss": 0.1414,
"step": 39600
},
{
"epoch": 0.5698865616724338,
"grad_norm": 0.667454719543457,
"learning_rate": 4.3032698526769675e-05,
"loss": 0.1414,
"step": 39650
},
{
"epoch": 0.5706052080301544,
"grad_norm": 0.37446698546409607,
"learning_rate": 4.2960833632770394e-05,
"loss": 0.1436,
"step": 39700
},
{
"epoch": 0.571323854387875,
"grad_norm": 0.46900761127471924,
"learning_rate": 4.288896873877111e-05,
"loss": 0.1414,
"step": 39750
},
{
"epoch": 0.5720425007455956,
"grad_norm": 0.3684402406215668,
"learning_rate": 4.281710384477183e-05,
"loss": 0.1447,
"step": 39800
},
{
"epoch": 0.5727611471033162,
"grad_norm": 0.3427809774875641,
"learning_rate": 4.274523895077255e-05,
"loss": 0.1414,
"step": 39850
},
{
"epoch": 0.5734797934610368,
"grad_norm": 0.3241630494594574,
"learning_rate": 4.267337405677327e-05,
"loss": 0.1397,
"step": 39900
},
{
"epoch": 0.5741984398187574,
"grad_norm": 0.3302249610424042,
"learning_rate": 4.260150916277399e-05,
"loss": 0.1412,
"step": 39950
},
{
"epoch": 0.574917086176478,
"grad_norm": 0.5053568482398987,
"learning_rate": 4.25296442687747e-05,
"loss": 0.1432,
"step": 40000
},
{
"epoch": 0.5756357325341985,
"grad_norm": 0.42272675037384033,
"learning_rate": 4.245777937477543e-05,
"loss": 0.146,
"step": 40050
},
{
"epoch": 0.5763543788919192,
"grad_norm": 0.3637758493423462,
"learning_rate": 4.238591448077615e-05,
"loss": 0.1418,
"step": 40100
},
{
"epoch": 0.5770730252496398,
"grad_norm": 0.4027983248233795,
"learning_rate": 4.231404958677686e-05,
"loss": 0.1414,
"step": 40150
},
{
"epoch": 0.5777916716073603,
"grad_norm": 0.2862512171268463,
"learning_rate": 4.224362199065757e-05,
"loss": 0.1416,
"step": 40200
},
{
"epoch": 0.578510317965081,
"grad_norm": 0.49394193291664124,
"learning_rate": 4.217175709665828e-05,
"loss": 0.1415,
"step": 40250
},
{
"epoch": 0.5792289643228016,
"grad_norm": 0.3553609549999237,
"learning_rate": 4.2099892202659e-05,
"loss": 0.1432,
"step": 40300
},
{
"epoch": 0.5799476106805221,
"grad_norm": 0.5765389204025269,
"learning_rate": 4.2028027308659726e-05,
"loss": 0.1437,
"step": 40350
},
{
"epoch": 0.5806662570382428,
"grad_norm": 0.376592755317688,
"learning_rate": 4.195616241466044e-05,
"loss": 0.1406,
"step": 40400
},
{
"epoch": 0.5813849033959634,
"grad_norm": 0.30451223254203796,
"learning_rate": 4.188429752066116e-05,
"loss": 0.1447,
"step": 40450
},
{
"epoch": 0.582103549753684,
"grad_norm": 0.40569472312927246,
"learning_rate": 4.181243262666188e-05,
"loss": 0.1475,
"step": 40500
},
{
"epoch": 0.5828221961114045,
"grad_norm": 0.5457615256309509,
"learning_rate": 4.1740567732662596e-05,
"loss": 0.1444,
"step": 40550
},
{
"epoch": 0.5835408424691252,
"grad_norm": 0.30570656061172485,
"learning_rate": 4.1668702838663315e-05,
"loss": 0.138,
"step": 40600
},
{
"epoch": 0.5842594888268458,
"grad_norm": 0.3235516846179962,
"learning_rate": 4.1596837944664034e-05,
"loss": 0.1429,
"step": 40650
},
{
"epoch": 0.5849781351845663,
"grad_norm": 0.3158193826675415,
"learning_rate": 4.152497305066475e-05,
"loss": 0.1411,
"step": 40700
},
{
"epoch": 0.585696781542287,
"grad_norm": 0.33914652466773987,
"learning_rate": 4.145310815666547e-05,
"loss": 0.1396,
"step": 40750
},
{
"epoch": 0.5864154279000076,
"grad_norm": 0.38597750663757324,
"learning_rate": 4.138124326266619e-05,
"loss": 0.1393,
"step": 40800
},
{
"epoch": 0.5871340742577281,
"grad_norm": 0.5863543152809143,
"learning_rate": 4.1309378368666904e-05,
"loss": 0.1418,
"step": 40850
},
{
"epoch": 0.5878527206154487,
"grad_norm": 0.44041261076927185,
"learning_rate": 4.123751347466763e-05,
"loss": 0.1407,
"step": 40900
},
{
"epoch": 0.5885713669731694,
"grad_norm": 0.41702163219451904,
"learning_rate": 4.116564858066835e-05,
"loss": 0.1406,
"step": 40950
},
{
"epoch": 0.5892900133308899,
"grad_norm": 0.304452121257782,
"learning_rate": 4.109378368666906e-05,
"loss": 0.1457,
"step": 41000
},
{
"epoch": 0.5900086596886105,
"grad_norm": 0.3917398452758789,
"learning_rate": 4.102191879266978e-05,
"loss": 0.1429,
"step": 41050
},
{
"epoch": 0.5907273060463312,
"grad_norm": 0.34760087728500366,
"learning_rate": 4.0950053898670506e-05,
"loss": 0.1384,
"step": 41100
},
{
"epoch": 0.5914459524040517,
"grad_norm": 0.36742857098579407,
"learning_rate": 4.087818900467122e-05,
"loss": 0.1386,
"step": 41150
},
{
"epoch": 0.5921645987617723,
"grad_norm": 0.31853848695755005,
"learning_rate": 4.080632411067194e-05,
"loss": 0.1363,
"step": 41200
},
{
"epoch": 0.592883245119493,
"grad_norm": 0.5090049505233765,
"learning_rate": 4.0734459216672656e-05,
"loss": 0.1372,
"step": 41250
},
{
"epoch": 0.5936018914772135,
"grad_norm": 0.38126641511917114,
"learning_rate": 4.0662594322673376e-05,
"loss": 0.1389,
"step": 41300
},
{
"epoch": 0.5943205378349341,
"grad_norm": 0.36152946949005127,
"learning_rate": 4.0590729428674095e-05,
"loss": 0.141,
"step": 41350
},
{
"epoch": 0.5950391841926547,
"grad_norm": 0.29812654852867126,
"learning_rate": 4.0518864534674814e-05,
"loss": 0.1395,
"step": 41400
},
{
"epoch": 0.5957578305503753,
"grad_norm": 0.38680991530418396,
"learning_rate": 4.044699964067553e-05,
"loss": 0.1411,
"step": 41450
},
{
"epoch": 0.5964764769080959,
"grad_norm": 0.4588952958583832,
"learning_rate": 4.037513474667625e-05,
"loss": 0.141,
"step": 41500
},
{
"epoch": 0.5971951232658165,
"grad_norm": 0.34070587158203125,
"learning_rate": 4.0303269852676964e-05,
"loss": 0.1402,
"step": 41550
},
{
"epoch": 0.5979137696235372,
"grad_norm": 0.4279496371746063,
"learning_rate": 4.023140495867769e-05,
"loss": 0.1407,
"step": 41600
},
{
"epoch": 0.5986324159812577,
"grad_norm": 0.3747555911540985,
"learning_rate": 4.015954006467841e-05,
"loss": 0.1388,
"step": 41650
},
{
"epoch": 0.5993510623389783,
"grad_norm": 0.46703338623046875,
"learning_rate": 4.008767517067912e-05,
"loss": 0.1444,
"step": 41700
},
{
"epoch": 0.6000697086966988,
"grad_norm": 0.2871130704879761,
"learning_rate": 4.001581027667984e-05,
"loss": 0.1397,
"step": 41750
},
{
"epoch": 0.6007883550544195,
"grad_norm": 0.5842506289482117,
"learning_rate": 3.994394538268057e-05,
"loss": 0.1393,
"step": 41800
},
{
"epoch": 0.6015070014121401,
"grad_norm": 0.5202719569206238,
"learning_rate": 3.987208048868128e-05,
"loss": 0.1407,
"step": 41850
},
{
"epoch": 0.6022256477698607,
"grad_norm": 0.39580488204956055,
"learning_rate": 3.9800215594682e-05,
"loss": 0.1425,
"step": 41900
},
{
"epoch": 0.6029442941275813,
"grad_norm": 0.35014522075653076,
"learning_rate": 3.972835070068272e-05,
"loss": 0.1419,
"step": 41950
},
{
"epoch": 0.6036629404853019,
"grad_norm": 0.4798388183116913,
"learning_rate": 3.9656485806683436e-05,
"loss": 0.1377,
"step": 42000
},
{
"epoch": 0.6036629404853019,
"eval_loss": 0.1396327018737793,
"eval_runtime": 2335.2661,
"eval_samples_per_second": 25.089,
"eval_steps_per_second": 3.136,
"step": 42000
},
{
"epoch": 0.6043815868430225,
"grad_norm": 0.5069937109947205,
"learning_rate": 3.9584620912684155e-05,
"loss": 0.1381,
"step": 42050
},
{
"epoch": 0.6051002332007431,
"grad_norm": 0.423450767993927,
"learning_rate": 3.9512756018684874e-05,
"loss": 0.1401,
"step": 42100
},
{
"epoch": 0.6058188795584637,
"grad_norm": 0.3367474675178528,
"learning_rate": 3.9440891124685594e-05,
"loss": 0.1389,
"step": 42150
},
{
"epoch": 0.6065375259161843,
"grad_norm": 0.33142003417015076,
"learning_rate": 3.936902623068631e-05,
"loss": 0.1403,
"step": 42200
},
{
"epoch": 0.6072561722739048,
"grad_norm": 0.4653840959072113,
"learning_rate": 3.929716133668703e-05,
"loss": 0.141,
"step": 42250
},
{
"epoch": 0.6079748186316255,
"grad_norm": 0.48399651050567627,
"learning_rate": 3.922529644268775e-05,
"loss": 0.1346,
"step": 42300
},
{
"epoch": 0.6086934649893461,
"grad_norm": 0.39922240376472473,
"learning_rate": 3.9154868846568454e-05,
"loss": 0.1397,
"step": 42350
},
{
"epoch": 0.6094121113470666,
"grad_norm": 0.370822012424469,
"learning_rate": 3.9083003952569166e-05,
"loss": 0.1442,
"step": 42400
},
{
"epoch": 0.6101307577047873,
"grad_norm": 0.33353352546691895,
"learning_rate": 3.901113905856989e-05,
"loss": 0.1382,
"step": 42450
},
{
"epoch": 0.6108494040625079,
"grad_norm": 0.34602826833724976,
"learning_rate": 3.893927416457061e-05,
"loss": 0.1376,
"step": 42500
},
{
"epoch": 0.6115680504202284,
"grad_norm": 0.40028107166290283,
"learning_rate": 3.8867409270571324e-05,
"loss": 0.137,
"step": 42550
},
{
"epoch": 0.612286696777949,
"grad_norm": 0.37151163816452026,
"learning_rate": 3.879554437657205e-05,
"loss": 0.1386,
"step": 42600
},
{
"epoch": 0.6130053431356697,
"grad_norm": 0.33360013365745544,
"learning_rate": 3.872367948257277e-05,
"loss": 0.1396,
"step": 42650
},
{
"epoch": 0.6137239894933902,
"grad_norm": 0.3027796447277069,
"learning_rate": 3.865181458857348e-05,
"loss": 0.1396,
"step": 42700
},
{
"epoch": 0.6144426358511108,
"grad_norm": 0.5223707556724548,
"learning_rate": 3.85799496945742e-05,
"loss": 0.1408,
"step": 42750
},
{
"epoch": 0.6151612822088315,
"grad_norm": 0.34969398379325867,
"learning_rate": 3.8508084800574926e-05,
"loss": 0.1403,
"step": 42800
},
{
"epoch": 0.615879928566552,
"grad_norm": 0.38828930258750916,
"learning_rate": 3.843621990657564e-05,
"loss": 0.1351,
"step": 42850
},
{
"epoch": 0.6165985749242726,
"grad_norm": 0.43073561787605286,
"learning_rate": 3.836435501257636e-05,
"loss": 0.1389,
"step": 42900
},
{
"epoch": 0.6173172212819933,
"grad_norm": 0.3976250886917114,
"learning_rate": 3.8292490118577076e-05,
"loss": 0.1389,
"step": 42950
},
{
"epoch": 0.6180358676397139,
"grad_norm": 0.37994539737701416,
"learning_rate": 3.8220625224577796e-05,
"loss": 0.1396,
"step": 43000
},
{
"epoch": 0.6187545139974344,
"grad_norm": 0.44032207131385803,
"learning_rate": 3.8148760330578515e-05,
"loss": 0.1407,
"step": 43050
},
{
"epoch": 0.619473160355155,
"grad_norm": 0.5004963278770447,
"learning_rate": 3.8076895436579234e-05,
"loss": 0.1444,
"step": 43100
},
{
"epoch": 0.6201918067128757,
"grad_norm": 0.43479058146476746,
"learning_rate": 3.800503054257995e-05,
"loss": 0.1394,
"step": 43150
},
{
"epoch": 0.6209104530705962,
"grad_norm": 0.3528944253921509,
"learning_rate": 3.793316564858067e-05,
"loss": 0.1366,
"step": 43200
},
{
"epoch": 0.6216290994283168,
"grad_norm": 0.3285259008407593,
"learning_rate": 3.786130075458139e-05,
"loss": 0.1386,
"step": 43250
},
{
"epoch": 0.6223477457860375,
"grad_norm": 0.38898158073425293,
"learning_rate": 3.77894358605821e-05,
"loss": 0.1418,
"step": 43300
},
{
"epoch": 0.623066392143758,
"grad_norm": 0.32555800676345825,
"learning_rate": 3.771757096658283e-05,
"loss": 0.1368,
"step": 43350
},
{
"epoch": 0.6237850385014786,
"grad_norm": 0.46589428186416626,
"learning_rate": 3.764570607258354e-05,
"loss": 0.1358,
"step": 43400
},
{
"epoch": 0.6245036848591992,
"grad_norm": 0.4179432988166809,
"learning_rate": 3.757384117858426e-05,
"loss": 0.1352,
"step": 43450
},
{
"epoch": 0.6252223312169198,
"grad_norm": 0.32889196276664734,
"learning_rate": 3.750197628458498e-05,
"loss": 0.1378,
"step": 43500
},
{
"epoch": 0.6259409775746404,
"grad_norm": 0.5279363393783569,
"learning_rate": 3.74301113905857e-05,
"loss": 0.1368,
"step": 43550
},
{
"epoch": 0.626659623932361,
"grad_norm": 0.36462467908859253,
"learning_rate": 3.735824649658642e-05,
"loss": 0.1381,
"step": 43600
},
{
"epoch": 0.6273782702900816,
"grad_norm": 0.333006888628006,
"learning_rate": 3.728638160258714e-05,
"loss": 0.137,
"step": 43650
},
{
"epoch": 0.6280969166478022,
"grad_norm": 0.3400765359401703,
"learning_rate": 3.7214516708587856e-05,
"loss": 0.139,
"step": 43700
},
{
"epoch": 0.6288155630055228,
"grad_norm": 0.4929637908935547,
"learning_rate": 3.7142651814588575e-05,
"loss": 0.1375,
"step": 43750
},
{
"epoch": 0.6295342093632434,
"grad_norm": 0.3783525824546814,
"learning_rate": 3.7070786920589294e-05,
"loss": 0.1383,
"step": 43800
},
{
"epoch": 0.630252855720964,
"grad_norm": 0.3588646650314331,
"learning_rate": 3.6998922026590014e-05,
"loss": 0.1383,
"step": 43850
},
{
"epoch": 0.6309715020786846,
"grad_norm": 0.2712932825088501,
"learning_rate": 3.692705713259073e-05,
"loss": 0.1346,
"step": 43900
},
{
"epoch": 0.6316901484364051,
"grad_norm": 0.3558711111545563,
"learning_rate": 3.685519223859145e-05,
"loss": 0.141,
"step": 43950
},
{
"epoch": 0.6324087947941258,
"grad_norm": 0.3488093912601471,
"learning_rate": 3.6783327344592164e-05,
"loss": 0.1391,
"step": 44000
},
{
"epoch": 0.6331274411518464,
"grad_norm": 0.409156858921051,
"learning_rate": 3.671146245059289e-05,
"loss": 0.137,
"step": 44050
},
{
"epoch": 0.633846087509567,
"grad_norm": 0.3519936800003052,
"learning_rate": 3.663959755659361e-05,
"loss": 0.1353,
"step": 44100
},
{
"epoch": 0.6345647338672876,
"grad_norm": 0.35567376017570496,
"learning_rate": 3.656773266259432e-05,
"loss": 0.1394,
"step": 44150
},
{
"epoch": 0.6352833802250082,
"grad_norm": 0.26964500546455383,
"learning_rate": 3.649586776859504e-05,
"loss": 0.1335,
"step": 44200
},
{
"epoch": 0.6360020265827288,
"grad_norm": 0.3359646797180176,
"learning_rate": 3.6424002874595766e-05,
"loss": 0.1422,
"step": 44250
},
{
"epoch": 0.6367206729404493,
"grad_norm": 0.31784096360206604,
"learning_rate": 3.635213798059648e-05,
"loss": 0.1379,
"step": 44300
},
{
"epoch": 0.63743931929817,
"grad_norm": 0.3648991286754608,
"learning_rate": 3.628171038447719e-05,
"loss": 0.1342,
"step": 44350
},
{
"epoch": 0.6381579656558906,
"grad_norm": 0.413467675447464,
"learning_rate": 3.62098454904779e-05,
"loss": 0.1381,
"step": 44400
},
{
"epoch": 0.6388766120136111,
"grad_norm": 0.5211649537086487,
"learning_rate": 3.613798059647862e-05,
"loss": 0.1408,
"step": 44450
},
{
"epoch": 0.6395952583713318,
"grad_norm": 0.358275830745697,
"learning_rate": 3.606611570247934e-05,
"loss": 0.1352,
"step": 44500
},
{
"epoch": 0.6403139047290524,
"grad_norm": 0.38519778847694397,
"learning_rate": 3.599425080848006e-05,
"loss": 0.1439,
"step": 44550
},
{
"epoch": 0.6410325510867729,
"grad_norm": 0.3506108820438385,
"learning_rate": 3.592238591448078e-05,
"loss": 0.1408,
"step": 44600
},
{
"epoch": 0.6417511974444936,
"grad_norm": 0.32862502336502075,
"learning_rate": 3.5850521020481496e-05,
"loss": 0.1375,
"step": 44650
},
{
"epoch": 0.6424698438022142,
"grad_norm": 0.34333035349845886,
"learning_rate": 3.5778656126482215e-05,
"loss": 0.1333,
"step": 44700
},
{
"epoch": 0.6431884901599347,
"grad_norm": 0.33827170729637146,
"learning_rate": 3.5706791232482935e-05,
"loss": 0.137,
"step": 44750
},
{
"epoch": 0.6439071365176553,
"grad_norm": 0.5813308954238892,
"learning_rate": 3.5634926338483654e-05,
"loss": 0.1375,
"step": 44800
},
{
"epoch": 0.644625782875376,
"grad_norm": 0.4128280580043793,
"learning_rate": 3.5563061444484366e-05,
"loss": 0.1377,
"step": 44850
},
{
"epoch": 0.6453444292330965,
"grad_norm": 0.33010971546173096,
"learning_rate": 3.549119655048509e-05,
"loss": 0.138,
"step": 44900
},
{
"epoch": 0.6460630755908171,
"grad_norm": 0.33612126111984253,
"learning_rate": 3.541933165648581e-05,
"loss": 0.1385,
"step": 44950
},
{
"epoch": 0.6467817219485378,
"grad_norm": 0.5942758321762085,
"learning_rate": 3.534746676248652e-05,
"loss": 0.1343,
"step": 45000
},
{
"epoch": 0.6467817219485378,
"eval_loss": 0.13751016557216644,
"eval_runtime": 2334.0416,
"eval_samples_per_second": 25.102,
"eval_steps_per_second": 3.138,
"step": 45000
},
{
"epoch": 0.6475003683062583,
"grad_norm": 0.49251455068588257,
"learning_rate": 3.527560186848724e-05,
"loss": 0.1368,
"step": 45050
},
{
"epoch": 0.6482190146639789,
"grad_norm": 0.306601345539093,
"learning_rate": 3.520373697448797e-05,
"loss": 0.1411,
"step": 45100
},
{
"epoch": 0.6489376610216995,
"grad_norm": 0.45201998949050903,
"learning_rate": 3.513187208048868e-05,
"loss": 0.1339,
"step": 45150
},
{
"epoch": 0.6496563073794202,
"grad_norm": 0.3716999590396881,
"learning_rate": 3.50600071864894e-05,
"loss": 0.1317,
"step": 45200
},
{
"epoch": 0.6503749537371407,
"grad_norm": 0.3443935811519623,
"learning_rate": 3.4988142292490126e-05,
"loss": 0.141,
"step": 45250
},
{
"epoch": 0.6510936000948613,
"grad_norm": 0.4641805589199066,
"learning_rate": 3.491627739849084e-05,
"loss": 0.1341,
"step": 45300
},
{
"epoch": 0.651812246452582,
"grad_norm": 0.5612492561340332,
"learning_rate": 3.484441250449156e-05,
"loss": 0.1433,
"step": 45350
},
{
"epoch": 0.6525308928103025,
"grad_norm": 0.36323145031929016,
"learning_rate": 3.4772547610492276e-05,
"loss": 0.1383,
"step": 45400
},
{
"epoch": 0.6532495391680231,
"grad_norm": 0.40366289019584656,
"learning_rate": 3.4700682716492995e-05,
"loss": 0.1363,
"step": 45450
},
{
"epoch": 0.6539681855257438,
"grad_norm": 0.318117618560791,
"learning_rate": 3.4628817822493714e-05,
"loss": 0.1347,
"step": 45500
},
{
"epoch": 0.6546868318834643,
"grad_norm": 0.43462902307510376,
"learning_rate": 3.4556952928494433e-05,
"loss": 0.1319,
"step": 45550
},
{
"epoch": 0.6554054782411849,
"grad_norm": 0.4332530200481415,
"learning_rate": 3.448508803449515e-05,
"loss": 0.1404,
"step": 45600
},
{
"epoch": 0.6561241245989055,
"grad_norm": 0.506473183631897,
"learning_rate": 3.441322314049587e-05,
"loss": 0.1366,
"step": 45650
},
{
"epoch": 0.6568427709566261,
"grad_norm": 0.3314126431941986,
"learning_rate": 3.4341358246496584e-05,
"loss": 0.1362,
"step": 45700
},
{
"epoch": 0.6575614173143467,
"grad_norm": 0.32727667689323425,
"learning_rate": 3.42694933524973e-05,
"loss": 0.1342,
"step": 45750
},
{
"epoch": 0.6582800636720673,
"grad_norm": 0.39728379249572754,
"learning_rate": 3.419762845849803e-05,
"loss": 0.135,
"step": 45800
},
{
"epoch": 0.6589987100297879,
"grad_norm": 0.4866426885128021,
"learning_rate": 3.412576356449874e-05,
"loss": 0.1346,
"step": 45850
},
{
"epoch": 0.6597173563875085,
"grad_norm": 0.47908952832221985,
"learning_rate": 3.405533596837945e-05,
"loss": 0.1378,
"step": 45900
},
{
"epoch": 0.6604360027452291,
"grad_norm": 0.490915983915329,
"learning_rate": 3.398347107438017e-05,
"loss": 0.1354,
"step": 45950
},
{
"epoch": 0.6611546491029496,
"grad_norm": 0.32982251048088074,
"learning_rate": 3.391160618038088e-05,
"loss": 0.1329,
"step": 46000
},
{
"epoch": 0.6618732954606703,
"grad_norm": 0.2903765141963959,
"learning_rate": 3.38397412863816e-05,
"loss": 0.1352,
"step": 46050
},
{
"epoch": 0.6625919418183909,
"grad_norm": 0.45338502526283264,
"learning_rate": 3.376787639238233e-05,
"loss": 0.1393,
"step": 46100
},
{
"epoch": 0.6633105881761114,
"grad_norm": 0.5581700801849365,
"learning_rate": 3.369601149838304e-05,
"loss": 0.1392,
"step": 46150
},
{
"epoch": 0.6640292345338321,
"grad_norm": 0.3929944932460785,
"learning_rate": 3.362414660438376e-05,
"loss": 0.135,
"step": 46200
},
{
"epoch": 0.6647478808915527,
"grad_norm": 0.3831471800804138,
"learning_rate": 3.355228171038448e-05,
"loss": 0.1374,
"step": 46250
},
{
"epoch": 0.6654665272492732,
"grad_norm": 0.319807231426239,
"learning_rate": 3.34804168163852e-05,
"loss": 0.1327,
"step": 46300
},
{
"epoch": 0.6661851736069939,
"grad_norm": 0.33727338910102844,
"learning_rate": 3.3408551922385916e-05,
"loss": 0.1338,
"step": 46350
},
{
"epoch": 0.6669038199647145,
"grad_norm": 0.37149056792259216,
"learning_rate": 3.3336687028386635e-05,
"loss": 0.1407,
"step": 46400
},
{
"epoch": 0.667622466322435,
"grad_norm": 0.29155993461608887,
"learning_rate": 3.3264822134387355e-05,
"loss": 0.1391,
"step": 46450
},
{
"epoch": 0.6683411126801556,
"grad_norm": 0.32874971628189087,
"learning_rate": 3.3192957240388074e-05,
"loss": 0.137,
"step": 46500
},
{
"epoch": 0.6690597590378763,
"grad_norm": 0.4324096143245697,
"learning_rate": 3.312109234638879e-05,
"loss": 0.1357,
"step": 46550
},
{
"epoch": 0.6697784053955969,
"grad_norm": 0.3491309881210327,
"learning_rate": 3.304922745238951e-05,
"loss": 0.1371,
"step": 46600
},
{
"epoch": 0.6704970517533174,
"grad_norm": 0.3900867998600006,
"learning_rate": 3.297736255839023e-05,
"loss": 0.1374,
"step": 46650
},
{
"epoch": 0.6712156981110381,
"grad_norm": 0.41907286643981934,
"learning_rate": 3.290549766439094e-05,
"loss": 0.1342,
"step": 46700
},
{
"epoch": 0.6719343444687587,
"grad_norm": 0.3794206380844116,
"learning_rate": 3.283363277039166e-05,
"loss": 0.1372,
"step": 46750
},
{
"epoch": 0.6726529908264792,
"grad_norm": 0.3382868468761444,
"learning_rate": 3.276176787639239e-05,
"loss": 0.1349,
"step": 46800
},
{
"epoch": 0.6733716371841998,
"grad_norm": 0.3015630543231964,
"learning_rate": 3.26899029823931e-05,
"loss": 0.1379,
"step": 46850
},
{
"epoch": 0.6740902835419205,
"grad_norm": 0.40960338711738586,
"learning_rate": 3.261803808839382e-05,
"loss": 0.1324,
"step": 46900
},
{
"epoch": 0.674808929899641,
"grad_norm": 0.30875468254089355,
"learning_rate": 3.254617319439454e-05,
"loss": 0.1323,
"step": 46950
},
{
"epoch": 0.6755275762573616,
"grad_norm": 0.35867223143577576,
"learning_rate": 3.247430830039526e-05,
"loss": 0.1367,
"step": 47000
},
{
"epoch": 0.6762462226150823,
"grad_norm": 0.3085405230522156,
"learning_rate": 3.240244340639598e-05,
"loss": 0.1402,
"step": 47050
},
{
"epoch": 0.6769648689728028,
"grad_norm": 0.30113691091537476,
"learning_rate": 3.2330578512396696e-05,
"loss": 0.1358,
"step": 47100
},
{
"epoch": 0.6776835153305234,
"grad_norm": 0.41874217987060547,
"learning_rate": 3.2258713618397415e-05,
"loss": 0.1359,
"step": 47150
},
{
"epoch": 0.6784021616882441,
"grad_norm": 0.3495503067970276,
"learning_rate": 3.2186848724398134e-05,
"loss": 0.14,
"step": 47200
},
{
"epoch": 0.6791208080459646,
"grad_norm": 0.45888441801071167,
"learning_rate": 3.2114983830398853e-05,
"loss": 0.1319,
"step": 47250
},
{
"epoch": 0.6798394544036852,
"grad_norm": 0.2843805253505707,
"learning_rate": 3.2043118936399566e-05,
"loss": 0.1338,
"step": 47300
},
{
"epoch": 0.6805581007614058,
"grad_norm": 0.5123314261436462,
"learning_rate": 3.197125404240029e-05,
"loss": 0.1408,
"step": 47350
},
{
"epoch": 0.6812767471191264,
"grad_norm": 0.38351091742515564,
"learning_rate": 3.189938914840101e-05,
"loss": 0.1369,
"step": 47400
},
{
"epoch": 0.681995393476847,
"grad_norm": 0.34910881519317627,
"learning_rate": 3.182752425440172e-05,
"loss": 0.1342,
"step": 47450
},
{
"epoch": 0.6827140398345676,
"grad_norm": 0.3906104564666748,
"learning_rate": 3.175565936040244e-05,
"loss": 0.1363,
"step": 47500
},
{
"epoch": 0.6834326861922883,
"grad_norm": 0.3617385923862457,
"learning_rate": 3.168379446640317e-05,
"loss": 0.1323,
"step": 47550
},
{
"epoch": 0.6841513325500088,
"grad_norm": 0.6945951581001282,
"learning_rate": 3.161192957240388e-05,
"loss": 0.1328,
"step": 47600
},
{
"epoch": 0.6848699789077294,
"grad_norm": 0.4061312973499298,
"learning_rate": 3.15400646784046e-05,
"loss": 0.1336,
"step": 47650
},
{
"epoch": 0.68558862526545,
"grad_norm": 0.3243468999862671,
"learning_rate": 3.1468199784405325e-05,
"loss": 0.1353,
"step": 47700
},
{
"epoch": 0.6863072716231706,
"grad_norm": 0.354533851146698,
"learning_rate": 3.139633489040604e-05,
"loss": 0.1324,
"step": 47750
},
{
"epoch": 0.6870259179808912,
"grad_norm": 0.34843167662620544,
"learning_rate": 3.132446999640676e-05,
"loss": 0.133,
"step": 47800
},
{
"epoch": 0.6877445643386118,
"grad_norm": 0.3735269606113434,
"learning_rate": 3.1252605102407476e-05,
"loss": 0.1333,
"step": 47850
},
{
"epoch": 0.6884632106963324,
"grad_norm": 0.5805733799934387,
"learning_rate": 3.1180740208408195e-05,
"loss": 0.1363,
"step": 47900
},
{
"epoch": 0.689181857054053,
"grad_norm": 0.4405001103878021,
"learning_rate": 3.1108875314408914e-05,
"loss": 0.1365,
"step": 47950
},
{
"epoch": 0.6899005034117736,
"grad_norm": 0.3531704545021057,
"learning_rate": 3.1037010420409626e-05,
"loss": 0.1315,
"step": 48000
},
{
"epoch": 0.6899005034117736,
"eval_loss": 0.13462956249713898,
"eval_runtime": 2343.1875,
"eval_samples_per_second": 25.004,
"eval_steps_per_second": 3.126,
"step": 48000
},
{
"epoch": 0.6906191497694942,
"grad_norm": 0.32714229822158813,
"learning_rate": 3.096514552641035e-05,
"loss": 0.1333,
"step": 48050
},
{
"epoch": 0.6913377961272148,
"grad_norm": 0.3133352994918823,
"learning_rate": 3.089328063241107e-05,
"loss": 0.135,
"step": 48100
},
{
"epoch": 0.6920564424849354,
"grad_norm": 0.3752550482749939,
"learning_rate": 3.0821415738411784e-05,
"loss": 0.1381,
"step": 48150
},
{
"epoch": 0.6927750888426559,
"grad_norm": 0.49100804328918457,
"learning_rate": 3.07495508444125e-05,
"loss": 0.135,
"step": 48200
},
{
"epoch": 0.6934937352003766,
"grad_norm": 0.36574167013168335,
"learning_rate": 3.067768595041323e-05,
"loss": 0.1308,
"step": 48250
},
{
"epoch": 0.6942123815580972,
"grad_norm": 0.3478115200996399,
"learning_rate": 3.0607258354293925e-05,
"loss": 0.1322,
"step": 48300
},
{
"epoch": 0.6949310279158177,
"grad_norm": 0.743575394153595,
"learning_rate": 3.053539346029465e-05,
"loss": 0.1356,
"step": 48350
},
{
"epoch": 0.6956496742735384,
"grad_norm": 0.3269284665584564,
"learning_rate": 3.046352856629537e-05,
"loss": 0.1327,
"step": 48400
},
{
"epoch": 0.696368320631259,
"grad_norm": 0.43469032645225525,
"learning_rate": 3.0391663672296082e-05,
"loss": 0.1341,
"step": 48450
},
{
"epoch": 0.6970869669889795,
"grad_norm": 0.46831926703453064,
"learning_rate": 3.0319798778296805e-05,
"loss": 0.1326,
"step": 48500
},
{
"epoch": 0.6978056133467001,
"grad_norm": 0.313024640083313,
"learning_rate": 3.0247933884297524e-05,
"loss": 0.1339,
"step": 48550
},
{
"epoch": 0.6985242597044208,
"grad_norm": 0.3206130564212799,
"learning_rate": 3.017606899029824e-05,
"loss": 0.1315,
"step": 48600
},
{
"epoch": 0.6992429060621413,
"grad_norm": 0.29317253828048706,
"learning_rate": 3.010420409629896e-05,
"loss": 0.1316,
"step": 48650
},
{
"epoch": 0.6999615524198619,
"grad_norm": 0.36203575134277344,
"learning_rate": 3.003233920229968e-05,
"loss": 0.1369,
"step": 48700
},
{
"epoch": 0.7006801987775826,
"grad_norm": 0.4818989932537079,
"learning_rate": 2.9960474308300397e-05,
"loss": 0.1312,
"step": 48750
},
{
"epoch": 0.7013988451353032,
"grad_norm": 0.4188140332698822,
"learning_rate": 2.9888609414301116e-05,
"loss": 0.1295,
"step": 48800
},
{
"epoch": 0.7021174914930237,
"grad_norm": 0.42703622579574585,
"learning_rate": 2.9816744520301832e-05,
"loss": 0.1357,
"step": 48850
},
{
"epoch": 0.7028361378507444,
"grad_norm": 0.3431522250175476,
"learning_rate": 2.974487962630255e-05,
"loss": 0.1343,
"step": 48900
},
{
"epoch": 0.703554784208465,
"grad_norm": 0.3017210364341736,
"learning_rate": 2.9673014732303273e-05,
"loss": 0.1357,
"step": 48950
},
{
"epoch": 0.7042734305661855,
"grad_norm": 0.32396504282951355,
"learning_rate": 2.960114983830399e-05,
"loss": 0.1348,
"step": 49000
},
{
"epoch": 0.7049920769239061,
"grad_norm": 0.4112311899662018,
"learning_rate": 2.9529284944304708e-05,
"loss": 0.1333,
"step": 49050
},
{
"epoch": 0.7057107232816268,
"grad_norm": 0.3641326427459717,
"learning_rate": 2.9457420050305427e-05,
"loss": 0.1377,
"step": 49100
},
{
"epoch": 0.7064293696393473,
"grad_norm": 0.41020339727401733,
"learning_rate": 2.9385555156306143e-05,
"loss": 0.1326,
"step": 49150
},
{
"epoch": 0.7071480159970679,
"grad_norm": 0.5133574604988098,
"learning_rate": 2.9313690262306865e-05,
"loss": 0.1404,
"step": 49200
},
{
"epoch": 0.7078666623547886,
"grad_norm": 0.4345245063304901,
"learning_rate": 2.9241825368307585e-05,
"loss": 0.1355,
"step": 49250
},
{
"epoch": 0.7085853087125091,
"grad_norm": 0.3085751235485077,
"learning_rate": 2.91699604743083e-05,
"loss": 0.1361,
"step": 49300
},
{
"epoch": 0.7093039550702297,
"grad_norm": 0.4417734444141388,
"learning_rate": 2.909809558030902e-05,
"loss": 0.1353,
"step": 49350
},
{
"epoch": 0.7100226014279503,
"grad_norm": 0.38431400060653687,
"learning_rate": 2.9026230686309742e-05,
"loss": 0.1371,
"step": 49400
},
{
"epoch": 0.7107412477856709,
"grad_norm": 0.420731782913208,
"learning_rate": 2.8954365792310458e-05,
"loss": 0.1292,
"step": 49450
},
{
"epoch": 0.7114598941433915,
"grad_norm": 0.3676437437534332,
"learning_rate": 2.8882500898311177e-05,
"loss": 0.1319,
"step": 49500
},
{
"epoch": 0.7121785405011121,
"grad_norm": 0.34518060088157654,
"learning_rate": 2.8810636004311896e-05,
"loss": 0.134,
"step": 49550
},
{
"epoch": 0.7128971868588327,
"grad_norm": 0.4851680099964142,
"learning_rate": 2.873877111031261e-05,
"loss": 0.129,
"step": 49600
},
{
"epoch": 0.7136158332165533,
"grad_norm": 0.29022541642189026,
"learning_rate": 2.8666906216313334e-05,
"loss": 0.1315,
"step": 49650
},
{
"epoch": 0.7143344795742739,
"grad_norm": 0.5281957387924194,
"learning_rate": 2.8595041322314053e-05,
"loss": 0.1353,
"step": 49700
},
{
"epoch": 0.7150531259319945,
"grad_norm": 0.36344966292381287,
"learning_rate": 2.852317642831477e-05,
"loss": 0.1378,
"step": 49750
},
{
"epoch": 0.7157717722897151,
"grad_norm": 0.42327845096588135,
"learning_rate": 2.8451311534315488e-05,
"loss": 0.1287,
"step": 49800
},
{
"epoch": 0.7164904186474357,
"grad_norm": 0.38851046562194824,
"learning_rate": 2.837944664031621e-05,
"loss": 0.1342,
"step": 49850
},
{
"epoch": 0.7172090650051562,
"grad_norm": 0.3646990656852722,
"learning_rate": 2.8307581746316926e-05,
"loss": 0.1334,
"step": 49900
},
{
"epoch": 0.7179277113628769,
"grad_norm": 0.3507033586502075,
"learning_rate": 2.8235716852317645e-05,
"loss": 0.1327,
"step": 49950
},
{
"epoch": 0.7186463577205975,
"grad_norm": 0.3572186827659607,
"learning_rate": 2.816385195831836e-05,
"loss": 0.1325,
"step": 50000
},
{
"epoch": 0.719365004078318,
"grad_norm": 0.43677380681037903,
"learning_rate": 2.809198706431908e-05,
"loss": 0.1346,
"step": 50050
},
{
"epoch": 0.7200836504360387,
"grad_norm": 0.3421187400817871,
"learning_rate": 2.8020122170319803e-05,
"loss": 0.1316,
"step": 50100
},
{
"epoch": 0.7208022967937593,
"grad_norm": 0.3370194435119629,
"learning_rate": 2.7948257276320515e-05,
"loss": 0.1318,
"step": 50150
},
{
"epoch": 0.7215209431514799,
"grad_norm": 0.36595064401626587,
"learning_rate": 2.7876392382321237e-05,
"loss": 0.1341,
"step": 50200
},
{
"epoch": 0.7222395895092004,
"grad_norm": 0.3835677206516266,
"learning_rate": 2.7804527488321956e-05,
"loss": 0.1293,
"step": 50250
},
{
"epoch": 0.7229582358669211,
"grad_norm": 0.40462374687194824,
"learning_rate": 2.7732662594322672e-05,
"loss": 0.1356,
"step": 50300
},
{
"epoch": 0.7236768822246417,
"grad_norm": 0.35873696208000183,
"learning_rate": 2.7660797700323395e-05,
"loss": 0.1347,
"step": 50350
},
{
"epoch": 0.7243955285823622,
"grad_norm": 0.40664613246917725,
"learning_rate": 2.7588932806324114e-05,
"loss": 0.1327,
"step": 50400
},
{
"epoch": 0.7251141749400829,
"grad_norm": 0.3808116912841797,
"learning_rate": 2.751706791232483e-05,
"loss": 0.1316,
"step": 50450
},
{
"epoch": 0.7258328212978035,
"grad_norm": 0.43004295229911804,
"learning_rate": 2.744520301832555e-05,
"loss": 0.1312,
"step": 50500
},
{
"epoch": 0.726551467655524,
"grad_norm": 0.27748802304267883,
"learning_rate": 2.737333812432627e-05,
"loss": 0.1347,
"step": 50550
},
{
"epoch": 0.7272701140132447,
"grad_norm": 0.2862594425678253,
"learning_rate": 2.7301473230326983e-05,
"loss": 0.133,
"step": 50600
},
{
"epoch": 0.7279887603709653,
"grad_norm": 0.4833882749080658,
"learning_rate": 2.7229608336327706e-05,
"loss": 0.1309,
"step": 50650
},
{
"epoch": 0.7287074067286858,
"grad_norm": 0.47044578194618225,
"learning_rate": 2.7157743442328425e-05,
"loss": 0.134,
"step": 50700
},
{
"epoch": 0.7294260530864064,
"grad_norm": 0.44328999519348145,
"learning_rate": 2.708587854832914e-05,
"loss": 0.1345,
"step": 50750
},
{
"epoch": 0.7301446994441271,
"grad_norm": 0.39716413617134094,
"learning_rate": 2.7014013654329863e-05,
"loss": 0.131,
"step": 50800
},
{
"epoch": 0.7308633458018476,
"grad_norm": 0.48879918456077576,
"learning_rate": 2.6942148760330582e-05,
"loss": 0.1362,
"step": 50850
},
{
"epoch": 0.7315819921595682,
"grad_norm": 0.3789602518081665,
"learning_rate": 2.6870283866331298e-05,
"loss": 0.1335,
"step": 50900
},
{
"epoch": 0.7323006385172889,
"grad_norm": 0.5360676050186157,
"learning_rate": 2.6798418972332017e-05,
"loss": 0.1368,
"step": 50950
},
{
"epoch": 0.7330192848750094,
"grad_norm": 0.3693239986896515,
"learning_rate": 2.672655407833274e-05,
"loss": 0.1324,
"step": 51000
},
{
"epoch": 0.7330192848750094,
"eval_loss": 0.13193804025650024,
"eval_runtime": 2341.8076,
"eval_samples_per_second": 25.019,
"eval_steps_per_second": 3.127,
"step": 51000
},
{
"epoch": 0.73373793123273,
"grad_norm": 0.4406863749027252,
"learning_rate": 2.6654689184333452e-05,
"loss": 0.1337,
"step": 51050
},
{
"epoch": 0.7344565775904506,
"grad_norm": 0.3431764245033264,
"learning_rate": 2.6582824290334174e-05,
"loss": 0.1288,
"step": 51100
},
{
"epoch": 0.7351752239481713,
"grad_norm": 0.4725956320762634,
"learning_rate": 2.651095939633489e-05,
"loss": 0.1324,
"step": 51150
},
{
"epoch": 0.7358938703058918,
"grad_norm": 0.2942321300506592,
"learning_rate": 2.643909450233561e-05,
"loss": 0.1328,
"step": 51200
},
{
"epoch": 0.7366125166636124,
"grad_norm": 0.3699278235435486,
"learning_rate": 2.6367229608336332e-05,
"loss": 0.1339,
"step": 51250
},
{
"epoch": 0.7373311630213331,
"grad_norm": 0.37523624300956726,
"learning_rate": 2.6295364714337044e-05,
"loss": 0.1296,
"step": 51300
},
{
"epoch": 0.7380498093790536,
"grad_norm": 0.31678199768066406,
"learning_rate": 2.6223499820337767e-05,
"loss": 0.1302,
"step": 51350
},
{
"epoch": 0.7387684557367742,
"grad_norm": 0.321972131729126,
"learning_rate": 2.6151634926338486e-05,
"loss": 0.1322,
"step": 51400
},
{
"epoch": 0.7394871020944949,
"grad_norm": 0.28065794706344604,
"learning_rate": 2.60797700323392e-05,
"loss": 0.1328,
"step": 51450
},
{
"epoch": 0.7402057484522154,
"grad_norm": 0.2923668324947357,
"learning_rate": 2.600790513833992e-05,
"loss": 0.1269,
"step": 51500
},
{
"epoch": 0.740924394809936,
"grad_norm": 0.3297905921936035,
"learning_rate": 2.5936040244340643e-05,
"loss": 0.1314,
"step": 51550
},
{
"epoch": 0.7416430411676566,
"grad_norm": 0.3125530779361725,
"learning_rate": 2.586417535034136e-05,
"loss": 0.1345,
"step": 51600
},
{
"epoch": 0.7423616875253772,
"grad_norm": 0.29957976937294006,
"learning_rate": 2.5792310456342078e-05,
"loss": 0.1339,
"step": 51650
},
{
"epoch": 0.7430803338830978,
"grad_norm": 0.45887959003448486,
"learning_rate": 2.57204455623428e-05,
"loss": 0.1294,
"step": 51700
},
{
"epoch": 0.7437989802408184,
"grad_norm": 0.44700437784194946,
"learning_rate": 2.5648580668343513e-05,
"loss": 0.1291,
"step": 51750
},
{
"epoch": 0.744517626598539,
"grad_norm": 0.30511099100112915,
"learning_rate": 2.5576715774344235e-05,
"loss": 0.1313,
"step": 51800
},
{
"epoch": 0.7452362729562596,
"grad_norm": 0.4534262418746948,
"learning_rate": 2.5504850880344954e-05,
"loss": 0.1343,
"step": 51850
},
{
"epoch": 0.7459549193139802,
"grad_norm": 0.4143444299697876,
"learning_rate": 2.543298598634567e-05,
"loss": 0.1305,
"step": 51900
},
{
"epoch": 0.7466735656717007,
"grad_norm": 0.4388984739780426,
"learning_rate": 2.536112109234639e-05,
"loss": 0.1307,
"step": 51950
},
{
"epoch": 0.7473922120294214,
"grad_norm": 0.33407118916511536,
"learning_rate": 2.528925619834711e-05,
"loss": 0.1287,
"step": 52000
},
{
"epoch": 0.748110858387142,
"grad_norm": 0.3362686336040497,
"learning_rate": 2.5217391304347827e-05,
"loss": 0.1305,
"step": 52050
},
{
"epoch": 0.7488295047448625,
"grad_norm": 0.3452841639518738,
"learning_rate": 2.5145526410348546e-05,
"loss": 0.1282,
"step": 52100
},
{
"epoch": 0.7495481511025832,
"grad_norm": 0.7336341142654419,
"learning_rate": 2.507366151634927e-05,
"loss": 0.134,
"step": 52150
},
{
"epoch": 0.7502667974603038,
"grad_norm": 0.40963059663772583,
"learning_rate": 2.500179662234998e-05,
"loss": 0.1259,
"step": 52200
},
{
"epoch": 0.7509854438180243,
"grad_norm": 0.40320196747779846,
"learning_rate": 2.4929931728350704e-05,
"loss": 0.1288,
"step": 52250
},
{
"epoch": 0.751704090175745,
"grad_norm": 0.391169011592865,
"learning_rate": 2.4859504132231407e-05,
"loss": 0.1292,
"step": 52300
},
{
"epoch": 0.7524227365334656,
"grad_norm": 0.456756591796875,
"learning_rate": 2.4787639238232126e-05,
"loss": 0.1287,
"step": 52350
},
{
"epoch": 0.7531413828911862,
"grad_norm": 0.3610314428806305,
"learning_rate": 2.471577434423284e-05,
"loss": 0.1287,
"step": 52400
},
{
"epoch": 0.7538600292489067,
"grad_norm": 0.3390330672264099,
"learning_rate": 2.4643909450233564e-05,
"loss": 0.1303,
"step": 52450
},
{
"epoch": 0.7545786756066274,
"grad_norm": 0.38022834062576294,
"learning_rate": 2.457204455623428e-05,
"loss": 0.1342,
"step": 52500
},
{
"epoch": 0.755297321964348,
"grad_norm": 0.37033766508102417,
"learning_rate": 2.4500179662235e-05,
"loss": 0.1296,
"step": 52550
},
{
"epoch": 0.7560159683220685,
"grad_norm": 0.39085620641708374,
"learning_rate": 2.4429752066115705e-05,
"loss": 0.1334,
"step": 52600
},
{
"epoch": 0.7567346146797892,
"grad_norm": 0.24048922955989838,
"learning_rate": 2.435788717211642e-05,
"loss": 0.1262,
"step": 52650
},
{
"epoch": 0.7574532610375098,
"grad_norm": 0.4394150376319885,
"learning_rate": 2.428602227811714e-05,
"loss": 0.1296,
"step": 52700
},
{
"epoch": 0.7581719073952303,
"grad_norm": 0.5719261765480042,
"learning_rate": 2.421415738411786e-05,
"loss": 0.1285,
"step": 52750
},
{
"epoch": 0.7588905537529509,
"grad_norm": 0.3501519560813904,
"learning_rate": 2.414229249011858e-05,
"loss": 0.1297,
"step": 52800
},
{
"epoch": 0.7596092001106716,
"grad_norm": 0.36480751633644104,
"learning_rate": 2.4070427596119298e-05,
"loss": 0.1298,
"step": 52850
},
{
"epoch": 0.7603278464683921,
"grad_norm": 0.48834893107414246,
"learning_rate": 2.3998562702120013e-05,
"loss": 0.1271,
"step": 52900
},
{
"epoch": 0.7610464928261127,
"grad_norm": 0.44482356309890747,
"learning_rate": 2.3926697808120736e-05,
"loss": 0.1306,
"step": 52950
},
{
"epoch": 0.7617651391838334,
"grad_norm": 0.5450196862220764,
"learning_rate": 2.385483291412145e-05,
"loss": 0.1271,
"step": 53000
},
{
"epoch": 0.7624837855415539,
"grad_norm": 0.3324733078479767,
"learning_rate": 2.378296802012217e-05,
"loss": 0.1329,
"step": 53050
},
{
"epoch": 0.7632024318992745,
"grad_norm": 0.34339848160743713,
"learning_rate": 2.371110312612289e-05,
"loss": 0.1321,
"step": 53100
},
{
"epoch": 0.7639210782569952,
"grad_norm": 0.3504130244255066,
"learning_rate": 2.363923823212361e-05,
"loss": 0.1281,
"step": 53150
},
{
"epoch": 0.7646397246147157,
"grad_norm": 0.46733903884887695,
"learning_rate": 2.3567373338124328e-05,
"loss": 0.1306,
"step": 53200
},
{
"epoch": 0.7653583709724363,
"grad_norm": 0.4953770041465759,
"learning_rate": 2.3495508444125047e-05,
"loss": 0.1308,
"step": 53250
},
{
"epoch": 0.7660770173301569,
"grad_norm": 0.3816761076450348,
"learning_rate": 2.3423643550125766e-05,
"loss": 0.1308,
"step": 53300
},
{
"epoch": 0.7667956636878775,
"grad_norm": 0.3412993848323822,
"learning_rate": 2.3351778656126482e-05,
"loss": 0.1272,
"step": 53350
},
{
"epoch": 0.7675143100455981,
"grad_norm": 0.5620062947273254,
"learning_rate": 2.32799137621272e-05,
"loss": 0.1308,
"step": 53400
},
{
"epoch": 0.7682329564033187,
"grad_norm": 0.44010090827941895,
"learning_rate": 2.320804886812792e-05,
"loss": 0.1271,
"step": 53450
},
{
"epoch": 0.7689516027610394,
"grad_norm": 0.3432207703590393,
"learning_rate": 2.313618397412864e-05,
"loss": 0.1291,
"step": 53500
},
{
"epoch": 0.7696702491187599,
"grad_norm": 0.45159292221069336,
"learning_rate": 2.3064319080129358e-05,
"loss": 0.1264,
"step": 53550
},
{
"epoch": 0.7703888954764805,
"grad_norm": 0.3891454339027405,
"learning_rate": 2.2992454186130077e-05,
"loss": 0.1311,
"step": 53600
},
{
"epoch": 0.771107541834201,
"grad_norm": 0.5852764844894409,
"learning_rate": 2.2920589292130796e-05,
"loss": 0.1307,
"step": 53650
},
{
"epoch": 0.7718261881919217,
"grad_norm": 0.48390915989875793,
"learning_rate": 2.2848724398131512e-05,
"loss": 0.1292,
"step": 53700
},
{
"epoch": 0.7725448345496423,
"grad_norm": 0.27066367864608765,
"learning_rate": 2.2776859504132235e-05,
"loss": 0.1316,
"step": 53750
},
{
"epoch": 0.7732634809073629,
"grad_norm": 0.32667478919029236,
"learning_rate": 2.270499461013295e-05,
"loss": 0.1261,
"step": 53800
},
{
"epoch": 0.7739821272650835,
"grad_norm": 0.35106027126312256,
"learning_rate": 2.263312971613367e-05,
"loss": 0.131,
"step": 53850
},
{
"epoch": 0.7747007736228041,
"grad_norm": 0.41025617718696594,
"learning_rate": 2.256126482213439e-05,
"loss": 0.1301,
"step": 53900
},
{
"epoch": 0.7754194199805247,
"grad_norm": 0.47719448804855347,
"learning_rate": 2.2489399928135108e-05,
"loss": 0.1255,
"step": 53950
},
{
"epoch": 0.7761380663382453,
"grad_norm": 0.38124901056289673,
"learning_rate": 2.2417535034135827e-05,
"loss": 0.1309,
"step": 54000
},
{
"epoch": 0.7761380663382453,
"eval_loss": 0.12959778308868408,
"eval_runtime": 2346.9836,
"eval_samples_per_second": 24.964,
"eval_steps_per_second": 3.121,
"step": 54000
},
{
"epoch": 0.7768567126959659,
"grad_norm": 0.48373425006866455,
"learning_rate": 2.2345670140136542e-05,
"loss": 0.1365,
"step": 54050
},
{
"epoch": 0.7775753590536865,
"grad_norm": 0.6832888722419739,
"learning_rate": 2.2273805246137265e-05,
"loss": 0.1315,
"step": 54100
},
{
"epoch": 0.778294005411407,
"grad_norm": 0.36773887276649475,
"learning_rate": 2.220194035213798e-05,
"loss": 0.13,
"step": 54150
},
{
"epoch": 0.7790126517691277,
"grad_norm": 0.4312587082386017,
"learning_rate": 2.21300754581387e-05,
"loss": 0.1291,
"step": 54200
},
{
"epoch": 0.7797312981268483,
"grad_norm": 0.3125370740890503,
"learning_rate": 2.205821056413942e-05,
"loss": 0.1307,
"step": 54250
},
{
"epoch": 0.7804499444845688,
"grad_norm": 0.617438554763794,
"learning_rate": 2.1986345670140138e-05,
"loss": 0.1343,
"step": 54300
},
{
"epoch": 0.7811685908422895,
"grad_norm": 0.45716118812561035,
"learning_rate": 2.1914480776140857e-05,
"loss": 0.129,
"step": 54350
},
{
"epoch": 0.7818872372000101,
"grad_norm": 0.3317459523677826,
"learning_rate": 2.1842615882141576e-05,
"loss": 0.1255,
"step": 54400
},
{
"epoch": 0.7826058835577306,
"grad_norm": 0.3295186161994934,
"learning_rate": 2.1770750988142295e-05,
"loss": 0.1254,
"step": 54450
},
{
"epoch": 0.7833245299154512,
"grad_norm": 0.32061442732810974,
"learning_rate": 2.169888609414301e-05,
"loss": 0.1276,
"step": 54500
},
{
"epoch": 0.7840431762731719,
"grad_norm": 0.5443814396858215,
"learning_rate": 2.162702120014373e-05,
"loss": 0.1318,
"step": 54550
},
{
"epoch": 0.7847618226308924,
"grad_norm": 0.36078059673309326,
"learning_rate": 2.155515630614445e-05,
"loss": 0.1261,
"step": 54600
},
{
"epoch": 0.785480468988613,
"grad_norm": 0.5464069247245789,
"learning_rate": 2.1483291412145168e-05,
"loss": 0.1287,
"step": 54650
},
{
"epoch": 0.7861991153463337,
"grad_norm": 0.39234134554862976,
"learning_rate": 2.1411426518145884e-05,
"loss": 0.1294,
"step": 54700
},
{
"epoch": 0.7869177617040543,
"grad_norm": 0.3379664123058319,
"learning_rate": 2.1339561624146606e-05,
"loss": 0.1315,
"step": 54750
},
{
"epoch": 0.7876364080617748,
"grad_norm": 0.5115400552749634,
"learning_rate": 2.1267696730147326e-05,
"loss": 0.1263,
"step": 54800
},
{
"epoch": 0.7883550544194955,
"grad_norm": 0.42865628004074097,
"learning_rate": 2.119583183614804e-05,
"loss": 0.1278,
"step": 54850
},
{
"epoch": 0.7890737007772161,
"grad_norm": 0.36056503653526306,
"learning_rate": 2.1123966942148764e-05,
"loss": 0.1284,
"step": 54900
},
{
"epoch": 0.7897923471349366,
"grad_norm": 0.3155864179134369,
"learning_rate": 2.105210204814948e-05,
"loss": 0.1293,
"step": 54950
},
{
"epoch": 0.7905109934926572,
"grad_norm": 0.37790244817733765,
"learning_rate": 2.09802371541502e-05,
"loss": 0.1267,
"step": 55000
},
{
"epoch": 0.7912296398503779,
"grad_norm": 0.3175414502620697,
"learning_rate": 2.0908372260150918e-05,
"loss": 0.1293,
"step": 55050
},
{
"epoch": 0.7919482862080984,
"grad_norm": 0.47029367089271545,
"learning_rate": 2.0836507366151637e-05,
"loss": 0.1288,
"step": 55100
},
{
"epoch": 0.792666932565819,
"grad_norm": 0.3179719150066376,
"learning_rate": 2.0764642472152353e-05,
"loss": 0.1253,
"step": 55150
},
{
"epoch": 0.7933855789235397,
"grad_norm": 0.6366003155708313,
"learning_rate": 2.069277757815307e-05,
"loss": 0.1286,
"step": 55200
},
{
"epoch": 0.7941042252812602,
"grad_norm": 0.3257125914096832,
"learning_rate": 2.0620912684153794e-05,
"loss": 0.1305,
"step": 55250
},
{
"epoch": 0.7948228716389808,
"grad_norm": 0.47698974609375,
"learning_rate": 2.054904779015451e-05,
"loss": 0.1287,
"step": 55300
},
{
"epoch": 0.7955415179967014,
"grad_norm": 0.701610267162323,
"learning_rate": 2.047718289615523e-05,
"loss": 0.1272,
"step": 55350
},
{
"epoch": 0.796260164354422,
"grad_norm": 0.388808935880661,
"learning_rate": 2.0405318002155948e-05,
"loss": 0.1255,
"step": 55400
},
{
"epoch": 0.7969788107121426,
"grad_norm": 0.5635932683944702,
"learning_rate": 2.0333453108156667e-05,
"loss": 0.1279,
"step": 55450
},
{
"epoch": 0.7976974570698632,
"grad_norm": 0.3552047908306122,
"learning_rate": 2.0261588214157383e-05,
"loss": 0.1244,
"step": 55500
},
{
"epoch": 0.7984161034275838,
"grad_norm": 0.38414454460144043,
"learning_rate": 2.0189723320158105e-05,
"loss": 0.127,
"step": 55550
},
{
"epoch": 0.7991347497853044,
"grad_norm": 0.4083007872104645,
"learning_rate": 2.011785842615882e-05,
"loss": 0.131,
"step": 55600
},
{
"epoch": 0.799853396143025,
"grad_norm": 0.3586987853050232,
"learning_rate": 2.004599353215954e-05,
"loss": 0.1294,
"step": 55650
},
{
"epoch": 0.8005720425007457,
"grad_norm": 0.3205280303955078,
"learning_rate": 1.9974128638160263e-05,
"loss": 0.1295,
"step": 55700
},
{
"epoch": 0.8012906888584662,
"grad_norm": 0.2761281132698059,
"learning_rate": 1.990226374416098e-05,
"loss": 0.1288,
"step": 55750
},
{
"epoch": 0.8020093352161868,
"grad_norm": 0.34950581192970276,
"learning_rate": 1.9830398850161698e-05,
"loss": 0.1297,
"step": 55800
},
{
"epoch": 0.8027279815739073,
"grad_norm": 0.35706281661987305,
"learning_rate": 1.9758533956162413e-05,
"loss": 0.1264,
"step": 55850
},
{
"epoch": 0.803446627931628,
"grad_norm": 0.40524542331695557,
"learning_rate": 1.9686669062163136e-05,
"loss": 0.1282,
"step": 55900
},
{
"epoch": 0.8041652742893486,
"grad_norm": 0.37444543838500977,
"learning_rate": 1.961480416816385e-05,
"loss": 0.125,
"step": 55950
},
{
"epoch": 0.8048839206470692,
"grad_norm": 0.46441900730133057,
"learning_rate": 1.954293927416457e-05,
"loss": 0.123,
"step": 56000
},
{
"epoch": 0.8056025670047898,
"grad_norm": 0.396161288022995,
"learning_rate": 1.947107438016529e-05,
"loss": 0.124,
"step": 56050
},
{
"epoch": 0.8063212133625104,
"grad_norm": 0.32282915711402893,
"learning_rate": 1.939920948616601e-05,
"loss": 0.1316,
"step": 56100
},
{
"epoch": 0.807039859720231,
"grad_norm": 0.6208024621009827,
"learning_rate": 1.9327344592166728e-05,
"loss": 0.1321,
"step": 56150
},
{
"epoch": 0.8077585060779515,
"grad_norm": 0.5710030198097229,
"learning_rate": 1.9255479698167447e-05,
"loss": 0.1252,
"step": 56200
},
{
"epoch": 0.8084771524356722,
"grad_norm": 0.49897143244743347,
"learning_rate": 1.9183614804168166e-05,
"loss": 0.1275,
"step": 56250
},
{
"epoch": 0.8091957987933928,
"grad_norm": 0.331696480512619,
"learning_rate": 1.9111749910168882e-05,
"loss": 0.1303,
"step": 56300
},
{
"epoch": 0.8099144451511133,
"grad_norm": 0.37506794929504395,
"learning_rate": 1.90398850161696e-05,
"loss": 0.1286,
"step": 56350
},
{
"epoch": 0.810633091508834,
"grad_norm": 0.3573820888996124,
"learning_rate": 1.896802012217032e-05,
"loss": 0.1345,
"step": 56400
},
{
"epoch": 0.8113517378665546,
"grad_norm": 0.3456536531448364,
"learning_rate": 1.889615522817104e-05,
"loss": 0.1246,
"step": 56450
},
{
"epoch": 0.8120703842242751,
"grad_norm": 0.30496513843536377,
"learning_rate": 1.8824290334171758e-05,
"loss": 0.1265,
"step": 56500
},
{
"epoch": 0.8127890305819957,
"grad_norm": 0.3960976302623749,
"learning_rate": 1.8752425440172477e-05,
"loss": 0.1256,
"step": 56550
},
{
"epoch": 0.8135076769397164,
"grad_norm": 0.6446784138679504,
"learning_rate": 1.8680560546173196e-05,
"loss": 0.1268,
"step": 56600
},
{
"epoch": 0.8142263232974369,
"grad_norm": 0.4691488742828369,
"learning_rate": 1.8608695652173912e-05,
"loss": 0.1313,
"step": 56650
},
{
"epoch": 0.8149449696551575,
"grad_norm": 0.34596702456474304,
"learning_rate": 1.853826805605462e-05,
"loss": 0.1246,
"step": 56700
},
{
"epoch": 0.8156636160128782,
"grad_norm": 0.2931258976459503,
"learning_rate": 1.8466403162055338e-05,
"loss": 0.1307,
"step": 56750
},
{
"epoch": 0.8163822623705987,
"grad_norm": 0.3972085416316986,
"learning_rate": 1.8394538268056057e-05,
"loss": 0.1247,
"step": 56800
},
{
"epoch": 0.8171009087283193,
"grad_norm": 0.3118221163749695,
"learning_rate": 1.8322673374056772e-05,
"loss": 0.1279,
"step": 56850
},
{
"epoch": 0.81781955508604,
"grad_norm": 0.3568058907985687,
"learning_rate": 1.8250808480057495e-05,
"loss": 0.1291,
"step": 56900
},
{
"epoch": 0.8185382014437605,
"grad_norm": 0.361229807138443,
"learning_rate": 1.817894358605821e-05,
"loss": 0.128,
"step": 56950
},
{
"epoch": 0.8192568478014811,
"grad_norm": 0.4731394648551941,
"learning_rate": 1.810707869205893e-05,
"loss": 0.1245,
"step": 57000
},
{
"epoch": 0.8192568478014811,
"eval_loss": 0.12679025530815125,
"eval_runtime": 2353.3114,
"eval_samples_per_second": 24.897,
"eval_steps_per_second": 3.112,
"step": 57000
},
{
"epoch": 0.8199754941592017,
"grad_norm": 0.34787485003471375,
"learning_rate": 1.803521379805965e-05,
"loss": 0.1297,
"step": 57050
},
{
"epoch": 0.8206941405169224,
"grad_norm": 0.3218761384487152,
"learning_rate": 1.7963348904060368e-05,
"loss": 0.1286,
"step": 57100
},
{
"epoch": 0.8214127868746429,
"grad_norm": 0.41955044865608215,
"learning_rate": 1.7891484010061084e-05,
"loss": 0.124,
"step": 57150
},
{
"epoch": 0.8221314332323635,
"grad_norm": 0.44354313611984253,
"learning_rate": 1.7819619116061806e-05,
"loss": 0.1268,
"step": 57200
},
{
"epoch": 0.8228500795900842,
"grad_norm": 0.4659808576107025,
"learning_rate": 1.7747754222062525e-05,
"loss": 0.1279,
"step": 57250
},
{
"epoch": 0.8235687259478047,
"grad_norm": 0.30388763546943665,
"learning_rate": 1.767588932806324e-05,
"loss": 0.1269,
"step": 57300
},
{
"epoch": 0.8242873723055253,
"grad_norm": 0.3265688121318817,
"learning_rate": 1.760402443406396e-05,
"loss": 0.1287,
"step": 57350
},
{
"epoch": 0.8250060186632459,
"grad_norm": 0.42907214164733887,
"learning_rate": 1.753215954006468e-05,
"loss": 0.1314,
"step": 57400
},
{
"epoch": 0.8257246650209665,
"grad_norm": 0.33852580189704895,
"learning_rate": 1.74602946460654e-05,
"loss": 0.1303,
"step": 57450
},
{
"epoch": 0.8264433113786871,
"grad_norm": 0.42768487334251404,
"learning_rate": 1.7388429752066114e-05,
"loss": 0.129,
"step": 57500
},
{
"epoch": 0.8271619577364077,
"grad_norm": 0.401076078414917,
"learning_rate": 1.7316564858066837e-05,
"loss": 0.1281,
"step": 57550
},
{
"epoch": 0.8278806040941283,
"grad_norm": 0.4327072501182556,
"learning_rate": 1.7244699964067552e-05,
"loss": 0.1233,
"step": 57600
},
{
"epoch": 0.8285992504518489,
"grad_norm": 0.3264212906360626,
"learning_rate": 1.717283507006827e-05,
"loss": 0.1274,
"step": 57650
},
{
"epoch": 0.8293178968095695,
"grad_norm": 0.3338124752044678,
"learning_rate": 1.7100970176068994e-05,
"loss": 0.1235,
"step": 57700
},
{
"epoch": 0.8300365431672901,
"grad_norm": 0.5374757647514343,
"learning_rate": 1.702910528206971e-05,
"loss": 0.1308,
"step": 57750
},
{
"epoch": 0.8307551895250107,
"grad_norm": 0.34573695063591003,
"learning_rate": 1.695724038807043e-05,
"loss": 0.1237,
"step": 57800
},
{
"epoch": 0.8314738358827313,
"grad_norm": 0.3776203691959381,
"learning_rate": 1.6885375494071148e-05,
"loss": 0.1258,
"step": 57850
},
{
"epoch": 0.8321924822404518,
"grad_norm": 0.4206998348236084,
"learning_rate": 1.6813510600071867e-05,
"loss": 0.1248,
"step": 57900
},
{
"epoch": 0.8329111285981725,
"grad_norm": 0.39265042543411255,
"learning_rate": 1.6741645706072583e-05,
"loss": 0.1287,
"step": 57950
},
{
"epoch": 0.8336297749558931,
"grad_norm": 0.4329000413417816,
"learning_rate": 1.667121810995329e-05,
"loss": 0.129,
"step": 58000
},
{
"epoch": 0.8343484213136136,
"grad_norm": 0.38255923986434937,
"learning_rate": 1.6599353215954008e-05,
"loss": 0.1251,
"step": 58050
},
{
"epoch": 0.8350670676713343,
"grad_norm": 0.4766669273376465,
"learning_rate": 1.6527488321954727e-05,
"loss": 0.1268,
"step": 58100
},
{
"epoch": 0.8357857140290549,
"grad_norm": 0.39188286662101746,
"learning_rate": 1.6455623427955443e-05,
"loss": 0.1249,
"step": 58150
},
{
"epoch": 0.8365043603867754,
"grad_norm": 0.3658916652202606,
"learning_rate": 1.6383758533956165e-05,
"loss": 0.1194,
"step": 58200
},
{
"epoch": 0.837223006744496,
"grad_norm": 0.3856219947338104,
"learning_rate": 1.631189363995688e-05,
"loss": 0.1324,
"step": 58250
},
{
"epoch": 0.8379416531022167,
"grad_norm": 0.3330176770687103,
"learning_rate": 1.62400287459576e-05,
"loss": 0.1275,
"step": 58300
},
{
"epoch": 0.8386602994599373,
"grad_norm": 0.2891123294830322,
"learning_rate": 1.616816385195832e-05,
"loss": 0.1254,
"step": 58350
},
{
"epoch": 0.8393789458176578,
"grad_norm": 0.41428953409194946,
"learning_rate": 1.609629895795904e-05,
"loss": 0.128,
"step": 58400
},
{
"epoch": 0.8400975921753785,
"grad_norm": 0.3331279754638672,
"learning_rate": 1.6024434063959758e-05,
"loss": 0.124,
"step": 58450
},
{
"epoch": 0.8408162385330991,
"grad_norm": 0.4965340495109558,
"learning_rate": 1.5952569169960473e-05,
"loss": 0.1224,
"step": 58500
},
{
"epoch": 0.8415348848908196,
"grad_norm": 0.3635750710964203,
"learning_rate": 1.5880704275961196e-05,
"loss": 0.1241,
"step": 58550
},
{
"epoch": 0.8422535312485403,
"grad_norm": 0.294466495513916,
"learning_rate": 1.580883938196191e-05,
"loss": 0.1277,
"step": 58600
},
{
"epoch": 0.8429721776062609,
"grad_norm": 0.3636401891708374,
"learning_rate": 1.573697448796263e-05,
"loss": 0.1278,
"step": 58650
},
{
"epoch": 0.8436908239639814,
"grad_norm": 0.2929422855377197,
"learning_rate": 1.566510959396335e-05,
"loss": 0.125,
"step": 58700
},
{
"epoch": 0.844409470321702,
"grad_norm": 0.4337705969810486,
"learning_rate": 1.559324469996407e-05,
"loss": 0.1232,
"step": 58750
},
{
"epoch": 0.8451281166794227,
"grad_norm": 0.39441823959350586,
"learning_rate": 1.5521379805964788e-05,
"loss": 0.1279,
"step": 58800
},
{
"epoch": 0.8458467630371432,
"grad_norm": 0.37334030866622925,
"learning_rate": 1.5449514911965507e-05,
"loss": 0.127,
"step": 58850
},
{
"epoch": 0.8465654093948638,
"grad_norm": 0.4512324333190918,
"learning_rate": 1.5377650017966226e-05,
"loss": 0.1247,
"step": 58900
},
{
"epoch": 0.8472840557525845,
"grad_norm": 0.39491838216781616,
"learning_rate": 1.5305785123966942e-05,
"loss": 0.129,
"step": 58950
},
{
"epoch": 0.848002702110305,
"grad_norm": 0.3801107704639435,
"learning_rate": 1.5233920229967661e-05,
"loss": 0.1254,
"step": 59000
},
{
"epoch": 0.8487213484680256,
"grad_norm": 0.33343058824539185,
"learning_rate": 1.516205533596838e-05,
"loss": 0.1259,
"step": 59050
},
{
"epoch": 0.8494399948257462,
"grad_norm": 0.3642762303352356,
"learning_rate": 1.50901904419691e-05,
"loss": 0.1265,
"step": 59100
},
{
"epoch": 0.8501586411834668,
"grad_norm": 0.3644247353076935,
"learning_rate": 1.5018325547969817e-05,
"loss": 0.1227,
"step": 59150
},
{
"epoch": 0.8508772875411874,
"grad_norm": 0.33389464020729065,
"learning_rate": 1.4946460653970537e-05,
"loss": 0.1217,
"step": 59200
},
{
"epoch": 0.851595933898908,
"grad_norm": 0.35677823424339294,
"learning_rate": 1.4874595759971255e-05,
"loss": 0.1273,
"step": 59250
},
{
"epoch": 0.8523145802566287,
"grad_norm": 0.32460057735443115,
"learning_rate": 1.4802730865971972e-05,
"loss": 0.1223,
"step": 59300
},
{
"epoch": 0.8530332266143492,
"grad_norm": 0.4664474427700043,
"learning_rate": 1.4730865971972693e-05,
"loss": 0.1258,
"step": 59350
},
{
"epoch": 0.8537518729720698,
"grad_norm": 0.43569889664649963,
"learning_rate": 1.465900107797341e-05,
"loss": 0.1225,
"step": 59400
},
{
"epoch": 0.8544705193297905,
"grad_norm": 0.41789641976356506,
"learning_rate": 1.458713618397413e-05,
"loss": 0.1226,
"step": 59450
},
{
"epoch": 0.855189165687511,
"grad_norm": 0.4204149842262268,
"learning_rate": 1.4515271289974847e-05,
"loss": 0.126,
"step": 59500
},
{
"epoch": 0.8559078120452316,
"grad_norm": 0.3769352436065674,
"learning_rate": 1.4443406395975568e-05,
"loss": 0.1222,
"step": 59550
},
{
"epoch": 0.8566264584029522,
"grad_norm": 0.3953361213207245,
"learning_rate": 1.4371541501976285e-05,
"loss": 0.1305,
"step": 59600
},
{
"epoch": 0.8573451047606728,
"grad_norm": 0.31381121277809143,
"learning_rate": 1.4299676607977003e-05,
"loss": 0.1255,
"step": 59650
},
{
"epoch": 0.8580637511183934,
"grad_norm": 0.35245615243911743,
"learning_rate": 1.4227811713977723e-05,
"loss": 0.1253,
"step": 59700
},
{
"epoch": 0.858782397476114,
"grad_norm": 0.30990514159202576,
"learning_rate": 1.415594681997844e-05,
"loss": 0.1231,
"step": 59750
},
{
"epoch": 0.8595010438338346,
"grad_norm": 0.3628983795642853,
"learning_rate": 1.4084081925979158e-05,
"loss": 0.1252,
"step": 59800
},
{
"epoch": 0.8602196901915552,
"grad_norm": 0.4560927152633667,
"learning_rate": 1.4012217031979879e-05,
"loss": 0.1232,
"step": 59850
},
{
"epoch": 0.8609383365492758,
"grad_norm": 0.3632454574108124,
"learning_rate": 1.3940352137980598e-05,
"loss": 0.1236,
"step": 59900
},
{
"epoch": 0.8616569829069963,
"grad_norm": 0.4410002529621124,
"learning_rate": 1.3868487243981315e-05,
"loss": 0.1218,
"step": 59950
},
{
"epoch": 0.862375629264717,
"grad_norm": 0.48087888956069946,
"learning_rate": 1.3796622349982036e-05,
"loss": 0.1252,
"step": 60000
},
{
"epoch": 0.862375629264717,
"eval_loss": 0.12454573065042496,
"eval_runtime": 2352.7822,
"eval_samples_per_second": 24.902,
"eval_steps_per_second": 3.113,
"step": 60000
},
{
"epoch": 0.8630942756224376,
"grad_norm": 0.3360944986343384,
"learning_rate": 1.3724757455982754e-05,
"loss": 0.1235,
"step": 60050
},
{
"epoch": 0.8638129219801581,
"grad_norm": 0.33876070380210876,
"learning_rate": 1.3652892561983471e-05,
"loss": 0.1245,
"step": 60100
},
{
"epoch": 0.8645315683378788,
"grad_norm": 0.4165988862514496,
"learning_rate": 1.3581027667984189e-05,
"loss": 0.1274,
"step": 60150
},
{
"epoch": 0.8652502146955994,
"grad_norm": 0.35403144359588623,
"learning_rate": 1.350916277398491e-05,
"loss": 0.1247,
"step": 60200
},
{
"epoch": 0.8659688610533199,
"grad_norm": 0.3999514579772949,
"learning_rate": 1.3438735177865614e-05,
"loss": 0.1261,
"step": 60250
},
{
"epoch": 0.8666875074110406,
"grad_norm": 0.33906814455986023,
"learning_rate": 1.3366870283866331e-05,
"loss": 0.1245,
"step": 60300
},
{
"epoch": 0.8674061537687612,
"grad_norm": 0.3145388662815094,
"learning_rate": 1.3295005389867052e-05,
"loss": 0.123,
"step": 60350
},
{
"epoch": 0.8681248001264817,
"grad_norm": 0.5379143953323364,
"learning_rate": 1.322314049586777e-05,
"loss": 0.1266,
"step": 60400
},
{
"epoch": 0.8688434464842023,
"grad_norm": 0.49181005358695984,
"learning_rate": 1.3151275601868487e-05,
"loss": 0.1233,
"step": 60450
},
{
"epoch": 0.869562092841923,
"grad_norm": 0.4837574064731598,
"learning_rate": 1.3079410707869205e-05,
"loss": 0.1291,
"step": 60500
},
{
"epoch": 0.8702807391996435,
"grad_norm": 0.37543419003486633,
"learning_rate": 1.3007545813869925e-05,
"loss": 0.1236,
"step": 60550
},
{
"epoch": 0.8709993855573641,
"grad_norm": 0.3877285122871399,
"learning_rate": 1.2935680919870643e-05,
"loss": 0.1229,
"step": 60600
},
{
"epoch": 0.8717180319150848,
"grad_norm": 0.34340938925743103,
"learning_rate": 1.2863816025871362e-05,
"loss": 0.1241,
"step": 60650
},
{
"epoch": 0.8724366782728054,
"grad_norm": 0.33106306195259094,
"learning_rate": 1.2791951131872083e-05,
"loss": 0.126,
"step": 60700
},
{
"epoch": 0.8731553246305259,
"grad_norm": 0.3516967296600342,
"learning_rate": 1.27200862378728e-05,
"loss": 0.126,
"step": 60750
},
{
"epoch": 0.8738739709882465,
"grad_norm": 0.3838764429092407,
"learning_rate": 1.2648221343873517e-05,
"loss": 0.1236,
"step": 60800
},
{
"epoch": 0.8745926173459672,
"grad_norm": 0.5866835117340088,
"learning_rate": 1.2576356449874238e-05,
"loss": 0.1243,
"step": 60850
},
{
"epoch": 0.8753112637036877,
"grad_norm": 0.42481040954589844,
"learning_rate": 1.2504491555874956e-05,
"loss": 0.1258,
"step": 60900
},
{
"epoch": 0.8760299100614083,
"grad_norm": 0.33105340600013733,
"learning_rate": 1.2432626661875673e-05,
"loss": 0.1242,
"step": 60950
},
{
"epoch": 0.876748556419129,
"grad_norm": 0.38015687465667725,
"learning_rate": 1.2360761767876392e-05,
"loss": 0.1246,
"step": 61000
},
{
"epoch": 0.8774672027768495,
"grad_norm": 0.35006070137023926,
"learning_rate": 1.2288896873877111e-05,
"loss": 0.1229,
"step": 61050
},
{
"epoch": 0.8781858491345701,
"grad_norm": 0.4309733808040619,
"learning_rate": 1.221703197987783e-05,
"loss": 0.1205,
"step": 61100
},
{
"epoch": 0.8789044954922908,
"grad_norm": 0.3616236448287964,
"learning_rate": 1.214516708587855e-05,
"loss": 0.1245,
"step": 61150
},
{
"epoch": 0.8796231418500113,
"grad_norm": 0.41629916429519653,
"learning_rate": 1.2073302191879267e-05,
"loss": 0.1216,
"step": 61200
},
{
"epoch": 0.8803417882077319,
"grad_norm": 0.283905565738678,
"learning_rate": 1.2001437297879986e-05,
"loss": 0.1221,
"step": 61250
},
{
"epoch": 0.8810604345654525,
"grad_norm": 0.439532995223999,
"learning_rate": 1.1929572403880705e-05,
"loss": 0.1294,
"step": 61300
},
{
"epoch": 0.8817790809231731,
"grad_norm": 0.41762885451316833,
"learning_rate": 1.1857707509881423e-05,
"loss": 0.125,
"step": 61350
},
{
"epoch": 0.8824977272808937,
"grad_norm": 0.361398845911026,
"learning_rate": 1.1785842615882142e-05,
"loss": 0.1196,
"step": 61400
},
{
"epoch": 0.8832163736386143,
"grad_norm": 0.4029219150543213,
"learning_rate": 1.171397772188286e-05,
"loss": 0.1225,
"step": 61450
},
{
"epoch": 0.883935019996335,
"grad_norm": 0.29444122314453125,
"learning_rate": 1.164211282788358e-05,
"loss": 0.125,
"step": 61500
},
{
"epoch": 0.8846536663540555,
"grad_norm": 0.3278166353702545,
"learning_rate": 1.1570247933884299e-05,
"loss": 0.1237,
"step": 61550
},
{
"epoch": 0.8853723127117761,
"grad_norm": 0.41596293449401855,
"learning_rate": 1.1498383039885016e-05,
"loss": 0.1239,
"step": 61600
},
{
"epoch": 0.8860909590694966,
"grad_norm": 0.3153913617134094,
"learning_rate": 1.1426518145885735e-05,
"loss": 0.1244,
"step": 61650
},
{
"epoch": 0.8868096054272173,
"grad_norm": 0.37591880559921265,
"learning_rate": 1.1354653251886455e-05,
"loss": 0.1205,
"step": 61700
},
{
"epoch": 0.8875282517849379,
"grad_norm": 0.28158771991729736,
"learning_rate": 1.1282788357887172e-05,
"loss": 0.1223,
"step": 61750
},
{
"epoch": 0.8882468981426584,
"grad_norm": 0.33741140365600586,
"learning_rate": 1.1210923463887891e-05,
"loss": 0.1309,
"step": 61800
},
{
"epoch": 0.8889655445003791,
"grad_norm": 0.4564758539199829,
"learning_rate": 1.113905856988861e-05,
"loss": 0.1243,
"step": 61850
},
{
"epoch": 0.8896841908580997,
"grad_norm": 0.6229817867279053,
"learning_rate": 1.106719367588933e-05,
"loss": 0.1264,
"step": 61900
},
{
"epoch": 0.8904028372158203,
"grad_norm": 0.35576000809669495,
"learning_rate": 1.0995328781890048e-05,
"loss": 0.1251,
"step": 61950
},
{
"epoch": 0.8911214835735409,
"grad_norm": 0.4052492380142212,
"learning_rate": 1.0923463887890766e-05,
"loss": 0.1219,
"step": 62000
},
{
"epoch": 0.8918401299312615,
"grad_norm": 0.33254197239875793,
"learning_rate": 1.0851598993891485e-05,
"loss": 0.1237,
"step": 62050
},
{
"epoch": 0.8925587762889821,
"grad_norm": 0.3527114987373352,
"learning_rate": 1.0779734099892202e-05,
"loss": 0.1215,
"step": 62100
},
{
"epoch": 0.8932774226467026,
"grad_norm": 0.36184269189834595,
"learning_rate": 1.0707869205892921e-05,
"loss": 0.1244,
"step": 62150
},
{
"epoch": 0.8939960690044233,
"grad_norm": 0.3354300558567047,
"learning_rate": 1.063600431189364e-05,
"loss": 0.12,
"step": 62200
},
{
"epoch": 0.8947147153621439,
"grad_norm": 0.38519734144210815,
"learning_rate": 1.0564139417894358e-05,
"loss": 0.1249,
"step": 62250
},
{
"epoch": 0.8954333617198644,
"grad_norm": 0.3049958348274231,
"learning_rate": 1.0492274523895079e-05,
"loss": 0.126,
"step": 62300
},
{
"epoch": 0.8961520080775851,
"grad_norm": 0.3645496070384979,
"learning_rate": 1.0420409629895798e-05,
"loss": 0.1267,
"step": 62350
},
{
"epoch": 0.8968706544353057,
"grad_norm": 0.48931238055229187,
"learning_rate": 1.0348544735896515e-05,
"loss": 0.1216,
"step": 62400
},
{
"epoch": 0.8975893007930262,
"grad_norm": 0.3747062385082245,
"learning_rate": 1.0276679841897234e-05,
"loss": 0.1231,
"step": 62450
},
{
"epoch": 0.8983079471507468,
"grad_norm": 0.3831491470336914,
"learning_rate": 1.0204814947897952e-05,
"loss": 0.1234,
"step": 62500
},
{
"epoch": 0.8990265935084675,
"grad_norm": 0.6291891932487488,
"learning_rate": 1.013295005389867e-05,
"loss": 0.1233,
"step": 62550
},
{
"epoch": 0.899745239866188,
"grad_norm": 0.30106669664382935,
"learning_rate": 1.006108515989939e-05,
"loss": 0.1221,
"step": 62600
},
{
"epoch": 0.9004638862239086,
"grad_norm": 0.3288785219192505,
"learning_rate": 9.989220265900107e-06,
"loss": 0.1273,
"step": 62650
},
{
"epoch": 0.9011825325816293,
"grad_norm": 0.33860695362091064,
"learning_rate": 9.917355371900826e-06,
"loss": 0.1217,
"step": 62700
},
{
"epoch": 0.9019011789393498,
"grad_norm": 0.3534477651119232,
"learning_rate": 9.845490477901546e-06,
"loss": 0.1217,
"step": 62750
},
{
"epoch": 0.9026198252970704,
"grad_norm": 0.4250911474227905,
"learning_rate": 9.773625583902265e-06,
"loss": 0.1236,
"step": 62800
},
{
"epoch": 0.9033384716547911,
"grad_norm": 0.3646557033061981,
"learning_rate": 9.703197987782968e-06,
"loss": 0.1273,
"step": 62850
},
{
"epoch": 0.9040571180125117,
"grad_norm": 0.3592480421066284,
"learning_rate": 9.631333093783687e-06,
"loss": 0.1233,
"step": 62900
},
{
"epoch": 0.9047757643702322,
"grad_norm": 0.4674370288848877,
"learning_rate": 9.559468199784406e-06,
"loss": 0.1194,
"step": 62950
},
{
"epoch": 0.9054944107279528,
"grad_norm": 0.6390477418899536,
"learning_rate": 9.487603305785123e-06,
"loss": 0.1193,
"step": 63000
},
{
"epoch": 0.9054944107279528,
"eval_loss": 0.12265664339065552,
"eval_runtime": 2338.7731,
"eval_samples_per_second": 25.052,
"eval_steps_per_second": 3.132,
"step": 63000
},
{
"epoch": 0.9062130570856735,
"grad_norm": 0.3408135175704956,
"learning_rate": 9.415738411785842e-06,
"loss": 0.1212,
"step": 63050
},
{
"epoch": 0.906931703443394,
"grad_norm": 0.3757847249507904,
"learning_rate": 9.343873517786562e-06,
"loss": 0.1223,
"step": 63100
},
{
"epoch": 0.9076503498011146,
"grad_norm": 0.4292818307876587,
"learning_rate": 9.27200862378728e-06,
"loss": 0.1239,
"step": 63150
},
{
"epoch": 0.9083689961588353,
"grad_norm": 0.3223564922809601,
"learning_rate": 9.200143729788e-06,
"loss": 0.1252,
"step": 63200
},
{
"epoch": 0.9090876425165558,
"grad_norm": 0.34768980741500854,
"learning_rate": 9.128278835788717e-06,
"loss": 0.125,
"step": 63250
},
{
"epoch": 0.9098062888742764,
"grad_norm": 0.38182663917541504,
"learning_rate": 9.056413941789436e-06,
"loss": 0.1222,
"step": 63300
},
{
"epoch": 0.910524935231997,
"grad_norm": 0.30806124210357666,
"learning_rate": 8.984549047790155e-06,
"loss": 0.1225,
"step": 63350
},
{
"epoch": 0.9112435815897176,
"grad_norm": 0.39036205410957336,
"learning_rate": 8.912684153790873e-06,
"loss": 0.124,
"step": 63400
},
{
"epoch": 0.9119622279474382,
"grad_norm": 0.31721031665802,
"learning_rate": 8.840819259791592e-06,
"loss": 0.1215,
"step": 63450
},
{
"epoch": 0.9126808743051588,
"grad_norm": 0.36282646656036377,
"learning_rate": 8.768954365792311e-06,
"loss": 0.1245,
"step": 63500
},
{
"epoch": 0.9133995206628794,
"grad_norm": 0.3416596055030823,
"learning_rate": 8.69708947179303e-06,
"loss": 0.1208,
"step": 63550
},
{
"epoch": 0.9141181670206,
"grad_norm": 0.46525707840919495,
"learning_rate": 8.62522457779375e-06,
"loss": 0.1254,
"step": 63600
},
{
"epoch": 0.9148368133783206,
"grad_norm": 0.49614182114601135,
"learning_rate": 8.553359683794467e-06,
"loss": 0.1224,
"step": 63650
},
{
"epoch": 0.9155554597360412,
"grad_norm": 0.36073940992355347,
"learning_rate": 8.481494789795186e-06,
"loss": 0.123,
"step": 63700
},
{
"epoch": 0.9162741060937618,
"grad_norm": 0.42497745156288147,
"learning_rate": 8.409629895795903e-06,
"loss": 0.1259,
"step": 63750
},
{
"epoch": 0.9169927524514824,
"grad_norm": 0.34898316860198975,
"learning_rate": 8.337765001796622e-06,
"loss": 0.1265,
"step": 63800
},
{
"epoch": 0.9177113988092029,
"grad_norm": 0.35650330781936646,
"learning_rate": 8.265900107797341e-06,
"loss": 0.1182,
"step": 63850
},
{
"epoch": 0.9184300451669236,
"grad_norm": 0.29349714517593384,
"learning_rate": 8.19403521379806e-06,
"loss": 0.1195,
"step": 63900
},
{
"epoch": 0.9191486915246442,
"grad_norm": 0.34547990560531616,
"learning_rate": 8.12217031979878e-06,
"loss": 0.1205,
"step": 63950
},
{
"epoch": 0.9198673378823647,
"grad_norm": 0.43164411187171936,
"learning_rate": 8.050305425799497e-06,
"loss": 0.1225,
"step": 64000
},
{
"epoch": 0.9205859842400854,
"grad_norm": 0.4483722746372223,
"learning_rate": 7.978440531800216e-06,
"loss": 0.1213,
"step": 64050
},
{
"epoch": 0.921304630597806,
"grad_norm": 0.42297491431236267,
"learning_rate": 7.906575637800935e-06,
"loss": 0.1239,
"step": 64100
},
{
"epoch": 0.9220232769555265,
"grad_norm": 0.3763730227947235,
"learning_rate": 7.834710743801653e-06,
"loss": 0.1219,
"step": 64150
},
{
"epoch": 0.9227419233132471,
"grad_norm": 0.39699843525886536,
"learning_rate": 7.762845849802372e-06,
"loss": 0.1187,
"step": 64200
},
{
"epoch": 0.9234605696709678,
"grad_norm": 0.3688933253288269,
"learning_rate": 7.69098095580309e-06,
"loss": 0.1191,
"step": 64250
},
{
"epoch": 0.9241792160286884,
"grad_norm": 0.411871075630188,
"learning_rate": 7.619116061803809e-06,
"loss": 0.1248,
"step": 64300
},
{
"epoch": 0.9248978623864089,
"grad_norm": 0.5669124722480774,
"learning_rate": 7.547251167804528e-06,
"loss": 0.1201,
"step": 64350
},
{
"epoch": 0.9256165087441296,
"grad_norm": 0.3789122700691223,
"learning_rate": 7.475386273805246e-06,
"loss": 0.1327,
"step": 64400
},
{
"epoch": 0.9263351551018502,
"grad_norm": 0.382330060005188,
"learning_rate": 7.4035213798059655e-06,
"loss": 0.1217,
"step": 64450
},
{
"epoch": 0.9270538014595707,
"grad_norm": 0.4880569279193878,
"learning_rate": 7.331656485806685e-06,
"loss": 0.1222,
"step": 64500
},
{
"epoch": 0.9277724478172914,
"grad_norm": 0.5543988943099976,
"learning_rate": 7.259791591807402e-06,
"loss": 0.1218,
"step": 64550
},
{
"epoch": 0.928491094175012,
"grad_norm": 0.36518344283103943,
"learning_rate": 7.187926697808121e-06,
"loss": 0.1217,
"step": 64600
},
{
"epoch": 0.9292097405327325,
"grad_norm": 0.39271408319473267,
"learning_rate": 7.116061803808839e-06,
"loss": 0.1214,
"step": 64650
},
{
"epoch": 0.9299283868904531,
"grad_norm": 0.4329274594783783,
"learning_rate": 7.0441969098095585e-06,
"loss": 0.1231,
"step": 64700
},
{
"epoch": 0.9306470332481738,
"grad_norm": 0.6806078553199768,
"learning_rate": 6.9723320158102776e-06,
"loss": 0.1214,
"step": 64750
},
{
"epoch": 0.9313656796058943,
"grad_norm": 0.4004870355129242,
"learning_rate": 6.900467121810995e-06,
"loss": 0.1263,
"step": 64800
},
{
"epoch": 0.9320843259636149,
"grad_norm": 0.48993903398513794,
"learning_rate": 6.828602227811715e-06,
"loss": 0.1223,
"step": 64850
},
{
"epoch": 0.9328029723213356,
"grad_norm": 0.5226307511329651,
"learning_rate": 6.756737333812432e-06,
"loss": 0.1221,
"step": 64900
},
{
"epoch": 0.9335216186790561,
"grad_norm": 0.28276339173316956,
"learning_rate": 6.6848724398131514e-06,
"loss": 0.1236,
"step": 64950
},
{
"epoch": 0.9342402650367767,
"grad_norm": 0.37379029393196106,
"learning_rate": 6.6130075458138705e-06,
"loss": 0.1204,
"step": 65000
},
{
"epoch": 0.9349589113944973,
"grad_norm": 0.3810805678367615,
"learning_rate": 6.541142651814589e-06,
"loss": 0.1179,
"step": 65050
},
{
"epoch": 0.935677557752218,
"grad_norm": 0.3675697147846222,
"learning_rate": 6.469277757815308e-06,
"loss": 0.1218,
"step": 65100
},
{
"epoch": 0.9363962041099385,
"grad_norm": 0.41229313611984253,
"learning_rate": 6.397412863816025e-06,
"loss": 0.1244,
"step": 65150
},
{
"epoch": 0.9371148504676591,
"grad_norm": 0.4059778153896332,
"learning_rate": 6.325547969816744e-06,
"loss": 0.1204,
"step": 65200
},
{
"epoch": 0.9378334968253798,
"grad_norm": 0.38567858934402466,
"learning_rate": 6.2536830758174635e-06,
"loss": 0.1173,
"step": 65250
},
{
"epoch": 0.9385521431831003,
"grad_norm": 0.36514681577682495,
"learning_rate": 6.181818181818183e-06,
"loss": 0.1188,
"step": 65300
},
{
"epoch": 0.9392707895408209,
"grad_norm": 0.37921783328056335,
"learning_rate": 6.109953287818901e-06,
"loss": 0.1234,
"step": 65350
},
{
"epoch": 0.9399894358985416,
"grad_norm": 0.41887423396110535,
"learning_rate": 6.038088393819619e-06,
"loss": 0.1169,
"step": 65400
},
{
"epoch": 0.9407080822562621,
"grad_norm": 0.41464152932167053,
"learning_rate": 5.966223499820338e-06,
"loss": 0.1208,
"step": 65450
},
{
"epoch": 0.9414267286139827,
"grad_norm": 0.3517071604728699,
"learning_rate": 5.894358605821057e-06,
"loss": 0.1222,
"step": 65500
},
{
"epoch": 0.9421453749717033,
"grad_norm": 0.39751720428466797,
"learning_rate": 5.822493711821776e-06,
"loss": 0.1201,
"step": 65550
},
{
"epoch": 0.9428640213294239,
"grad_norm": 0.42730578780174255,
"learning_rate": 5.750628817822494e-06,
"loss": 0.1197,
"step": 65600
},
{
"epoch": 0.9435826676871445,
"grad_norm": 0.4353543519973755,
"learning_rate": 5.678763923823212e-06,
"loss": 0.1231,
"step": 65650
},
{
"epoch": 0.9443013140448651,
"grad_norm": 0.4269670248031616,
"learning_rate": 5.606899029823931e-06,
"loss": 0.1229,
"step": 65700
},
{
"epoch": 0.9450199604025857,
"grad_norm": 0.4464121460914612,
"learning_rate": 5.53503413582465e-06,
"loss": 0.118,
"step": 65750
},
{
"epoch": 0.9457386067603063,
"grad_norm": 0.44756245613098145,
"learning_rate": 5.4631692418253686e-06,
"loss": 0.1225,
"step": 65800
},
{
"epoch": 0.9464572531180269,
"grad_norm": 0.3775683641433716,
"learning_rate": 5.391304347826087e-06,
"loss": 0.1232,
"step": 65850
},
{
"epoch": 0.9471758994757474,
"grad_norm": 0.5156663060188293,
"learning_rate": 5.319439453826806e-06,
"loss": 0.1183,
"step": 65900
},
{
"epoch": 0.9478945458334681,
"grad_norm": 0.44453561305999756,
"learning_rate": 5.247574559827524e-06,
"loss": 0.1243,
"step": 65950
},
{
"epoch": 0.9486131921911887,
"grad_norm": 0.4175598621368408,
"learning_rate": 5.175709665828243e-06,
"loss": 0.124,
"step": 66000
},
{
"epoch": 0.9486131921911887,
"eval_loss": 0.12094888836145401,
"eval_runtime": 2341.6257,
"eval_samples_per_second": 25.021,
"eval_steps_per_second": 3.128,
"step": 66000
},
{
"epoch": 0.9493318385489092,
"grad_norm": 0.37046581506729126,
"learning_rate": 5.1038447718289615e-06,
"loss": 0.1221,
"step": 66050
},
{
"epoch": 0.9500504849066299,
"grad_norm": 0.45041966438293457,
"learning_rate": 5.031979877829681e-06,
"loss": 0.1188,
"step": 66100
},
{
"epoch": 0.9507691312643505,
"grad_norm": 0.3930753469467163,
"learning_rate": 4.961552281710385e-06,
"loss": 0.1238,
"step": 66150
},
{
"epoch": 0.951487777622071,
"grad_norm": 0.3932070732116699,
"learning_rate": 4.889687387711103e-06,
"loss": 0.1223,
"step": 66200
},
{
"epoch": 0.9522064239797917,
"grad_norm": 0.40663453936576843,
"learning_rate": 4.817822493711822e-06,
"loss": 0.1189,
"step": 66250
},
{
"epoch": 0.9529250703375123,
"grad_norm": 0.5449784994125366,
"learning_rate": 4.745957599712541e-06,
"loss": 0.1217,
"step": 66300
},
{
"epoch": 0.9536437166952328,
"grad_norm": 0.4170607924461365,
"learning_rate": 4.674092705713259e-06,
"loss": 0.1203,
"step": 66350
},
{
"epoch": 0.9543623630529534,
"grad_norm": 0.4866325557231903,
"learning_rate": 4.6022278117139776e-06,
"loss": 0.1254,
"step": 66400
},
{
"epoch": 0.9550810094106741,
"grad_norm": 0.3833375573158264,
"learning_rate": 4.530362917714697e-06,
"loss": 0.1207,
"step": 66450
},
{
"epoch": 0.9557996557683947,
"grad_norm": 0.40516358613967896,
"learning_rate": 4.458498023715415e-06,
"loss": 0.1232,
"step": 66500
},
{
"epoch": 0.9565183021261152,
"grad_norm": 0.29924067854881287,
"learning_rate": 4.386633129716134e-06,
"loss": 0.1199,
"step": 66550
},
{
"epoch": 0.9572369484838359,
"grad_norm": 0.4255986213684082,
"learning_rate": 4.314768235716852e-06,
"loss": 0.1247,
"step": 66600
},
{
"epoch": 0.9579555948415565,
"grad_norm": 0.582648515701294,
"learning_rate": 4.242903341717571e-06,
"loss": 0.1179,
"step": 66650
},
{
"epoch": 0.958674241199277,
"grad_norm": 0.3907829821109772,
"learning_rate": 4.17103844771829e-06,
"loss": 0.1187,
"step": 66700
},
{
"epoch": 0.9593928875569976,
"grad_norm": 0.39846646785736084,
"learning_rate": 4.099173553719009e-06,
"loss": 0.1197,
"step": 66750
},
{
"epoch": 0.9601115339147183,
"grad_norm": 0.35272663831710815,
"learning_rate": 4.027308659719727e-06,
"loss": 0.1204,
"step": 66800
},
{
"epoch": 0.9608301802724388,
"grad_norm": 0.4485180974006653,
"learning_rate": 3.955443765720446e-06,
"loss": 0.1216,
"step": 66850
},
{
"epoch": 0.9615488266301594,
"grad_norm": 0.5025599002838135,
"learning_rate": 3.883578871721164e-06,
"loss": 0.1154,
"step": 66900
},
{
"epoch": 0.9622674729878801,
"grad_norm": 0.49099233746528625,
"learning_rate": 3.8117139777218826e-06,
"loss": 0.1175,
"step": 66950
},
{
"epoch": 0.9629861193456006,
"grad_norm": 0.33758753538131714,
"learning_rate": 3.7398490837226017e-06,
"loss": 0.1197,
"step": 67000
},
{
"epoch": 0.9637047657033212,
"grad_norm": 0.4802404046058655,
"learning_rate": 3.6679841897233204e-06,
"loss": 0.1185,
"step": 67050
},
{
"epoch": 0.9644234120610419,
"grad_norm": 0.3601958751678467,
"learning_rate": 3.596119295724039e-06,
"loss": 0.1195,
"step": 67100
},
{
"epoch": 0.9651420584187624,
"grad_norm": 0.3577285408973694,
"learning_rate": 3.5242544017247573e-06,
"loss": 0.1201,
"step": 67150
},
{
"epoch": 0.965860704776483,
"grad_norm": 0.4318629503250122,
"learning_rate": 3.4523895077254764e-06,
"loss": 0.1167,
"step": 67200
},
{
"epoch": 0.9665793511342036,
"grad_norm": 0.48125141859054565,
"learning_rate": 3.380524613726195e-06,
"loss": 0.1214,
"step": 67250
},
{
"epoch": 0.9672979974919242,
"grad_norm": 0.3523324728012085,
"learning_rate": 3.3086597197269134e-06,
"loss": 0.1225,
"step": 67300
},
{
"epoch": 0.9680166438496448,
"grad_norm": 0.4431188404560089,
"learning_rate": 3.236794825727632e-06,
"loss": 0.1196,
"step": 67350
},
{
"epoch": 0.9687352902073654,
"grad_norm": 0.42814207077026367,
"learning_rate": 3.1649299317283507e-06,
"loss": 0.1197,
"step": 67400
},
{
"epoch": 0.969453936565086,
"grad_norm": 0.37214395403862,
"learning_rate": 3.0930650377290694e-06,
"loss": 0.1218,
"step": 67450
},
{
"epoch": 0.9701725829228066,
"grad_norm": 0.45836591720581055,
"learning_rate": 3.021200143729788e-06,
"loss": 0.1253,
"step": 67500
},
{
"epoch": 0.9708912292805272,
"grad_norm": 0.3980534076690674,
"learning_rate": 2.9493352497305068e-06,
"loss": 0.1177,
"step": 67550
},
{
"epoch": 0.9716098756382477,
"grad_norm": 0.4024925231933594,
"learning_rate": 2.8774703557312255e-06,
"loss": 0.1191,
"step": 67600
},
{
"epoch": 0.9723285219959684,
"grad_norm": 0.3470667600631714,
"learning_rate": 2.805605461731944e-06,
"loss": 0.116,
"step": 67650
},
{
"epoch": 0.973047168353689,
"grad_norm": 0.3723811209201813,
"learning_rate": 2.733740567732663e-06,
"loss": 0.1214,
"step": 67700
},
{
"epoch": 0.9737658147114095,
"grad_norm": 0.3014863431453705,
"learning_rate": 2.6618756737333815e-06,
"loss": 0.1212,
"step": 67750
},
{
"epoch": 0.9744844610691302,
"grad_norm": 0.48357853293418884,
"learning_rate": 2.5900107797340997e-06,
"loss": 0.1244,
"step": 67800
},
{
"epoch": 0.9752031074268508,
"grad_norm": 0.5432282090187073,
"learning_rate": 2.518145885734819e-06,
"loss": 0.121,
"step": 67850
},
{
"epoch": 0.9759217537845714,
"grad_norm": 0.3833717703819275,
"learning_rate": 2.446280991735537e-06,
"loss": 0.118,
"step": 67900
},
{
"epoch": 0.976640400142292,
"grad_norm": 0.4205469787120819,
"learning_rate": 2.374416097736256e-06,
"loss": 0.1228,
"step": 67950
},
{
"epoch": 0.9773590465000126,
"grad_norm": 0.45980021357536316,
"learning_rate": 2.3025512037369745e-06,
"loss": 0.121,
"step": 68000
},
{
"epoch": 0.9780776928577332,
"grad_norm": 0.3673114478588104,
"learning_rate": 2.2306863097376936e-06,
"loss": 0.1205,
"step": 68050
},
{
"epoch": 0.9787963392154537,
"grad_norm": 0.42491433024406433,
"learning_rate": 2.158821415738412e-06,
"loss": 0.1198,
"step": 68100
},
{
"epoch": 0.9795149855731744,
"grad_norm": 0.2906801998615265,
"learning_rate": 2.0869565217391305e-06,
"loss": 0.122,
"step": 68150
},
{
"epoch": 0.980233631930895,
"grad_norm": 0.45080652832984924,
"learning_rate": 2.015091627739849e-06,
"loss": 0.1167,
"step": 68200
},
{
"epoch": 0.9809522782886155,
"grad_norm": 0.3137567937374115,
"learning_rate": 1.943226733740568e-06,
"loss": 0.1206,
"step": 68250
},
{
"epoch": 0.9816709246463362,
"grad_norm": 0.38510262966156006,
"learning_rate": 1.8713618397412865e-06,
"loss": 0.1197,
"step": 68300
},
{
"epoch": 0.9823895710040568,
"grad_norm": 0.32521912455558777,
"learning_rate": 1.799496945742005e-06,
"loss": 0.1213,
"step": 68350
},
{
"epoch": 0.9831082173617773,
"grad_norm": 0.4238761067390442,
"learning_rate": 1.727632051742724e-06,
"loss": 0.1216,
"step": 68400
},
{
"epoch": 0.9838268637194979,
"grad_norm": 0.39427056908607483,
"learning_rate": 1.6557671577434424e-06,
"loss": 0.122,
"step": 68450
},
{
"epoch": 0.9845455100772186,
"grad_norm": 0.343313604593277,
"learning_rate": 1.5839022637441613e-06,
"loss": 0.12,
"step": 68500
},
{
"epoch": 0.9852641564349391,
"grad_norm": 0.3430338203907013,
"learning_rate": 1.5120373697448797e-06,
"loss": 0.1252,
"step": 68550
},
{
"epoch": 0.9859828027926597,
"grad_norm": 0.3622065484523773,
"learning_rate": 1.4401724757455984e-06,
"loss": 0.1252,
"step": 68600
},
{
"epoch": 0.9867014491503804,
"grad_norm": 0.42893192172050476,
"learning_rate": 1.368307581746317e-06,
"loss": 0.1233,
"step": 68650
},
{
"epoch": 0.987420095508101,
"grad_norm": 0.3050183951854706,
"learning_rate": 1.2964426877470358e-06,
"loss": 0.1172,
"step": 68700
},
{
"epoch": 0.9881387418658215,
"grad_norm": 0.5666402578353882,
"learning_rate": 1.2245777937477545e-06,
"loss": 0.1238,
"step": 68750
},
{
"epoch": 0.9888573882235422,
"grad_norm": 0.35554978251457214,
"learning_rate": 1.152712899748473e-06,
"loss": 0.1194,
"step": 68800
},
{
"epoch": 0.9895760345812628,
"grad_norm": 0.4939674139022827,
"learning_rate": 1.0808480057491916e-06,
"loss": 0.1187,
"step": 68850
},
{
"epoch": 0.9902946809389833,
"grad_norm": 0.3537197709083557,
"learning_rate": 1.00898311174991e-06,
"loss": 0.119,
"step": 68900
},
{
"epoch": 0.9910133272967039,
"grad_norm": 0.5450920462608337,
"learning_rate": 9.371182177506289e-07,
"loss": 0.1203,
"step": 68950
},
{
"epoch": 0.9917319736544246,
"grad_norm": 0.33558523654937744,
"learning_rate": 8.652533237513475e-07,
"loss": 0.1225,
"step": 69000
},
{
"epoch": 0.9917319736544246,
"eval_loss": 0.11988582462072372,
"eval_runtime": 2351.3637,
"eval_samples_per_second": 24.917,
"eval_steps_per_second": 3.115,
"step": 69000
},
{
"epoch": 0.9924506200121451,
"grad_norm": 0.4475248456001282,
"learning_rate": 7.933884297520662e-07,
"loss": 0.119,
"step": 69050
},
{
"epoch": 0.9931692663698657,
"grad_norm": 0.34947699308395386,
"learning_rate": 7.215235357527848e-07,
"loss": 0.1205,
"step": 69100
},
{
"epoch": 0.9938879127275864,
"grad_norm": 0.4064067304134369,
"learning_rate": 6.496586417535035e-07,
"loss": 0.1215,
"step": 69150
},
{
"epoch": 0.9946065590853069,
"grad_norm": 0.5461844205856323,
"learning_rate": 5.77793747754222e-07,
"loss": 0.1214,
"step": 69200
},
{
"epoch": 0.9953252054430275,
"grad_norm": 0.4855654835700989,
"learning_rate": 5.059288537549407e-07,
"loss": 0.122,
"step": 69250
},
{
"epoch": 0.9960438518007481,
"grad_norm": 0.40837883949279785,
"learning_rate": 4.340639597556594e-07,
"loss": 0.1176,
"step": 69300
},
{
"epoch": 0.9967624981584687,
"grad_norm": 0.41178905963897705,
"learning_rate": 3.62199065756378e-07,
"loss": 0.1223,
"step": 69350
},
{
"epoch": 0.9974811445161893,
"grad_norm": 0.32353463768959045,
"learning_rate": 2.9033417175709665e-07,
"loss": 0.1182,
"step": 69400
},
{
"epoch": 0.9981997908739099,
"grad_norm": 0.40918976068496704,
"learning_rate": 2.1846927775781533e-07,
"loss": 0.1186,
"step": 69450
},
{
"epoch": 0.9989184372316305,
"grad_norm": 0.3727043569087982,
"learning_rate": 1.4660438375853396e-07,
"loss": 0.1196,
"step": 69500
},
{
"epoch": 0.9996370835893511,
"grad_norm": 0.30194342136383057,
"learning_rate": 7.473948975925261e-08,
"loss": 0.1217,
"step": 69550
}
],
"logging_steps": 50,
"max_steps": 69575,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 3000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.0846345485833994e+19,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}