{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9912379178471235,
"eval_steps": 50000,
"global_step": 430000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0004610408920219179,
"grad_norm": 6.876699447631836,
"learning_rate": 4.997694795539891e-05,
"loss": 6.5362,
"step": 200
},
{
"epoch": 0.0009220817840438358,
"grad_norm": 5.820261478424072,
"learning_rate": 4.995389591079781e-05,
"loss": 2.9133,
"step": 400
},
{
"epoch": 0.0013831226760657536,
"grad_norm": 5.954094886779785,
"learning_rate": 4.993084386619671e-05,
"loss": 2.3126,
"step": 600
},
{
"epoch": 0.0018441635680876715,
"grad_norm": 8.361907005310059,
"learning_rate": 4.990779182159562e-05,
"loss": 2.1419,
"step": 800
},
{
"epoch": 0.0023052044601095893,
"grad_norm": 3.7237906455993652,
"learning_rate": 4.9884739776994524e-05,
"loss": 2.0302,
"step": 1000
},
{
"epoch": 0.002766245352131507,
"grad_norm": 2.4539265632629395,
"learning_rate": 4.986168773239342e-05,
"loss": 1.9613,
"step": 1200
},
{
"epoch": 0.003227286244153425,
"grad_norm": 4.33223819732666,
"learning_rate": 4.983863568779233e-05,
"loss": 1.9683,
"step": 1400
},
{
"epoch": 0.003688327136175343,
"grad_norm": 5.55894136428833,
"learning_rate": 4.9815583643191234e-05,
"loss": 1.8365,
"step": 1600
},
{
"epoch": 0.004149368028197261,
"grad_norm": 2.5861189365386963,
"learning_rate": 4.979253159859014e-05,
"loss": 1.7759,
"step": 1800
},
{
"epoch": 0.0046104089202191785,
"grad_norm": 3.7939908504486084,
"learning_rate": 4.9769479553989046e-05,
"loss": 1.7706,
"step": 2000
},
{
"epoch": 0.005071449812241097,
"grad_norm": 6.777012825012207,
"learning_rate": 4.974642750938795e-05,
"loss": 1.8386,
"step": 2200
},
{
"epoch": 0.005532490704263014,
"grad_norm": 3.8171310424804688,
"learning_rate": 4.972337546478685e-05,
"loss": 1.7511,
"step": 2400
},
{
"epoch": 0.005993531596284933,
"grad_norm": 3.042823076248169,
"learning_rate": 4.9700323420185756e-05,
"loss": 1.7009,
"step": 2600
},
{
"epoch": 0.00645457248830685,
"grad_norm": 3.2283356189727783,
"learning_rate": 4.967727137558466e-05,
"loss": 1.6847,
"step": 2800
},
{
"epoch": 0.006915613380328769,
"grad_norm": 2.336369037628174,
"learning_rate": 4.965421933098357e-05,
"loss": 1.6153,
"step": 3000
},
{
"epoch": 0.007376654272350686,
"grad_norm": 7.53791618347168,
"learning_rate": 4.9631167286382466e-05,
"loss": 1.645,
"step": 3200
},
{
"epoch": 0.007837695164372604,
"grad_norm": 3.086069345474243,
"learning_rate": 4.960811524178137e-05,
"loss": 1.5648,
"step": 3400
},
{
"epoch": 0.008298736056394522,
"grad_norm": 4.105820178985596,
"learning_rate": 4.958506319718028e-05,
"loss": 1.5533,
"step": 3600
},
{
"epoch": 0.00875977694841644,
"grad_norm": 3.5181972980499268,
"learning_rate": 4.9562011152579176e-05,
"loss": 1.5467,
"step": 3800
},
{
"epoch": 0.009220817840438357,
"grad_norm": 3.0619754791259766,
"learning_rate": 4.953895910797808e-05,
"loss": 1.5904,
"step": 4000
},
{
"epoch": 0.009681858732460276,
"grad_norm": 3.566425085067749,
"learning_rate": 4.951590706337699e-05,
"loss": 1.4991,
"step": 4200
},
{
"epoch": 0.010142899624482194,
"grad_norm": 2.424494743347168,
"learning_rate": 4.949285501877589e-05,
"loss": 1.4342,
"step": 4400
},
{
"epoch": 0.010603940516504111,
"grad_norm": 2.841980218887329,
"learning_rate": 4.946980297417479e-05,
"loss": 1.4694,
"step": 4600
},
{
"epoch": 0.011064981408526029,
"grad_norm": 3.2722415924072266,
"learning_rate": 4.94467509295737e-05,
"loss": 1.4593,
"step": 4800
},
{
"epoch": 0.011526022300547946,
"grad_norm": 3.9442007541656494,
"learning_rate": 4.9423698884972604e-05,
"loss": 1.4766,
"step": 5000
},
{
"epoch": 0.011987063192569865,
"grad_norm": 3.1083531379699707,
"learning_rate": 4.940064684037151e-05,
"loss": 1.3964,
"step": 5200
},
{
"epoch": 0.012448104084591783,
"grad_norm": 2.1749391555786133,
"learning_rate": 4.9377594795770415e-05,
"loss": 1.4046,
"step": 5400
},
{
"epoch": 0.0129091449766137,
"grad_norm": 3.8060245513916016,
"learning_rate": 4.935454275116932e-05,
"loss": 1.4299,
"step": 5600
},
{
"epoch": 0.013370185868635618,
"grad_norm": 3.412968397140503,
"learning_rate": 4.9331490706568226e-05,
"loss": 1.4236,
"step": 5800
},
{
"epoch": 0.013831226760657537,
"grad_norm": 3.04774808883667,
"learning_rate": 4.9308438661967125e-05,
"loss": 1.44,
"step": 6000
},
{
"epoch": 0.014292267652679455,
"grad_norm": 4.18248176574707,
"learning_rate": 4.928538661736603e-05,
"loss": 1.3892,
"step": 6200
},
{
"epoch": 0.014753308544701372,
"grad_norm": 4.440049648284912,
"learning_rate": 4.9262334572764937e-05,
"loss": 1.3493,
"step": 6400
},
{
"epoch": 0.01521434943672329,
"grad_norm": 2.509575128555298,
"learning_rate": 4.9239282528163835e-05,
"loss": 1.409,
"step": 6600
},
{
"epoch": 0.01567539032874521,
"grad_norm": 3.4608070850372314,
"learning_rate": 4.921623048356274e-05,
"loss": 1.4013,
"step": 6800
},
{
"epoch": 0.016136431220767126,
"grad_norm": 6.1122331619262695,
"learning_rate": 4.919317843896165e-05,
"loss": 1.4161,
"step": 7000
},
{
"epoch": 0.016597472112789044,
"grad_norm": 2.507805824279785,
"learning_rate": 4.917012639436055e-05,
"loss": 1.3504,
"step": 7200
},
{
"epoch": 0.01705851300481096,
"grad_norm": 4.3215012550354,
"learning_rate": 4.914707434975945e-05,
"loss": 1.3324,
"step": 7400
},
{
"epoch": 0.01751955389683288,
"grad_norm": 2.7966041564941406,
"learning_rate": 4.912402230515836e-05,
"loss": 1.3049,
"step": 7600
},
{
"epoch": 0.017980594788854797,
"grad_norm": 2.308271884918213,
"learning_rate": 4.910097026055726e-05,
"loss": 1.4026,
"step": 7800
},
{
"epoch": 0.018441635680876714,
"grad_norm": 2.970160484313965,
"learning_rate": 4.907791821595616e-05,
"loss": 1.363,
"step": 8000
},
{
"epoch": 0.01890267657289863,
"grad_norm": 2.760039806365967,
"learning_rate": 4.905486617135507e-05,
"loss": 1.3202,
"step": 8200
},
{
"epoch": 0.019363717464920552,
"grad_norm": 3.558941125869751,
"learning_rate": 4.903181412675397e-05,
"loss": 1.3337,
"step": 8400
},
{
"epoch": 0.01982475835694247,
"grad_norm": 2.800959587097168,
"learning_rate": 4.900876208215288e-05,
"loss": 1.2881,
"step": 8600
},
{
"epoch": 0.020285799248964387,
"grad_norm": 3.2539665699005127,
"learning_rate": 4.8985710037551784e-05,
"loss": 1.2243,
"step": 8800
},
{
"epoch": 0.020746840140986305,
"grad_norm": 4.043671131134033,
"learning_rate": 4.896265799295069e-05,
"loss": 1.2051,
"step": 9000
},
{
"epoch": 0.021207881033008222,
"grad_norm": 1.6535764932632446,
"learning_rate": 4.8939605948349596e-05,
"loss": 1.1973,
"step": 9200
},
{
"epoch": 0.02166892192503014,
"grad_norm": 2.395977020263672,
"learning_rate": 4.8916553903748495e-05,
"loss": 1.2781,
"step": 9400
},
{
"epoch": 0.022129962817052058,
"grad_norm": 2.445537567138672,
"learning_rate": 4.88935018591474e-05,
"loss": 1.2479,
"step": 9600
},
{
"epoch": 0.022591003709073975,
"grad_norm": 2.0848629474639893,
"learning_rate": 4.8870449814546306e-05,
"loss": 1.2139,
"step": 9800
},
{
"epoch": 0.023052044601095893,
"grad_norm": 2.457559585571289,
"learning_rate": 4.8847397769945205e-05,
"loss": 1.1932,
"step": 10000
},
{
"epoch": 0.023513085493117813,
"grad_norm": 3.07852840423584,
"learning_rate": 4.882434572534411e-05,
"loss": 1.179,
"step": 10200
},
{
"epoch": 0.02397412638513973,
"grad_norm": 2.2961785793304443,
"learning_rate": 4.8801293680743016e-05,
"loss": 1.1815,
"step": 10400
},
{
"epoch": 0.02443516727716165,
"grad_norm": 1.7498642206192017,
"learning_rate": 4.877824163614192e-05,
"loss": 1.1654,
"step": 10600
},
{
"epoch": 0.024896208169183566,
"grad_norm": 1.7616724967956543,
"learning_rate": 4.875518959154082e-05,
"loss": 1.2226,
"step": 10800
},
{
"epoch": 0.025357249061205483,
"grad_norm": 1.761996865272522,
"learning_rate": 4.8732137546939726e-05,
"loss": 1.1674,
"step": 11000
},
{
"epoch": 0.0258182899532274,
"grad_norm": 2.531437635421753,
"learning_rate": 4.870908550233863e-05,
"loss": 1.1966,
"step": 11200
},
{
"epoch": 0.02627933084524932,
"grad_norm": 2.4334516525268555,
"learning_rate": 4.868603345773753e-05,
"loss": 1.1132,
"step": 11400
},
{
"epoch": 0.026740371737271236,
"grad_norm": 3.8797712326049805,
"learning_rate": 4.866298141313644e-05,
"loss": 1.1435,
"step": 11600
},
{
"epoch": 0.027201412629293154,
"grad_norm": 3.8280975818634033,
"learning_rate": 4.863992936853534e-05,
"loss": 1.1939,
"step": 11800
},
{
"epoch": 0.027662453521315074,
"grad_norm": 4.0427703857421875,
"learning_rate": 4.861687732393425e-05,
"loss": 1.0749,
"step": 12000
},
{
"epoch": 0.028123494413336992,
"grad_norm": 2.366419553756714,
"learning_rate": 4.8593825279333154e-05,
"loss": 1.1491,
"step": 12200
},
{
"epoch": 0.02858453530535891,
"grad_norm": 2.7581288814544678,
"learning_rate": 4.857077323473206e-05,
"loss": 1.1654,
"step": 12400
},
{
"epoch": 0.029045576197380827,
"grad_norm": 1.7824950218200684,
"learning_rate": 4.8547721190130965e-05,
"loss": 1.0653,
"step": 12600
},
{
"epoch": 0.029506617089402744,
"grad_norm": 3.288841485977173,
"learning_rate": 4.8524669145529864e-05,
"loss": 1.107,
"step": 12800
},
{
"epoch": 0.029967657981424662,
"grad_norm": 2.365614652633667,
"learning_rate": 4.850161710092877e-05,
"loss": 1.1595,
"step": 13000
},
{
"epoch": 0.03042869887344658,
"grad_norm": 3.3963623046875,
"learning_rate": 4.8478565056327675e-05,
"loss": 1.1216,
"step": 13200
},
{
"epoch": 0.030889739765468497,
"grad_norm": 2.7090468406677246,
"learning_rate": 4.845551301172658e-05,
"loss": 1.1002,
"step": 13400
},
{
"epoch": 0.03135078065749042,
"grad_norm": 2.3977348804473877,
"learning_rate": 4.843246096712548e-05,
"loss": 1.0468,
"step": 13600
},
{
"epoch": 0.031811821549512335,
"grad_norm": 1.6876981258392334,
"learning_rate": 4.8409408922524386e-05,
"loss": 1.0675,
"step": 13800
},
{
"epoch": 0.03227286244153425,
"grad_norm": 2.1002566814422607,
"learning_rate": 4.838635687792329e-05,
"loss": 1.0378,
"step": 14000
},
{
"epoch": 0.03273390333355617,
"grad_norm": 1.5463937520980835,
"learning_rate": 4.836330483332219e-05,
"loss": 1.0406,
"step": 14200
},
{
"epoch": 0.03319494422557809,
"grad_norm": 2.2645134925842285,
"learning_rate": 4.8340252788721096e-05,
"loss": 1.118,
"step": 14400
},
{
"epoch": 0.033655985117600005,
"grad_norm": 4.435282230377197,
"learning_rate": 4.831720074412e-05,
"loss": 1.0542,
"step": 14600
},
{
"epoch": 0.03411702600962192,
"grad_norm": 2.544870615005493,
"learning_rate": 4.829414869951891e-05,
"loss": 1.0016,
"step": 14800
},
{
"epoch": 0.03457806690164384,
"grad_norm": 1.6866127252578735,
"learning_rate": 4.8271096654917806e-05,
"loss": 1.0184,
"step": 15000
},
{
"epoch": 0.03503910779366576,
"grad_norm": 2.1351208686828613,
"learning_rate": 4.824804461031671e-05,
"loss": 1.0695,
"step": 15200
},
{
"epoch": 0.035500148685687676,
"grad_norm": 2.4183170795440674,
"learning_rate": 4.8224992565715624e-05,
"loss": 1.0519,
"step": 15400
},
{
"epoch": 0.03596118957770959,
"grad_norm": 3.3186428546905518,
"learning_rate": 4.820194052111452e-05,
"loss": 1.0147,
"step": 15600
},
{
"epoch": 0.03642223046973151,
"grad_norm": 2.0496957302093506,
"learning_rate": 4.817888847651343e-05,
"loss": 1.054,
"step": 15800
},
{
"epoch": 0.03688327136175343,
"grad_norm": 2.232973575592041,
"learning_rate": 4.8155836431912334e-05,
"loss": 1.0129,
"step": 16000
},
{
"epoch": 0.037344312253775346,
"grad_norm": 2.1059677600860596,
"learning_rate": 4.813278438731123e-05,
"loss": 1.0563,
"step": 16200
},
{
"epoch": 0.03780535314579726,
"grad_norm": 3.6700918674468994,
"learning_rate": 4.810973234271014e-05,
"loss": 1.0314,
"step": 16400
},
{
"epoch": 0.03826639403781919,
"grad_norm": 2.078857421875,
"learning_rate": 4.8086680298109045e-05,
"loss": 1.0505,
"step": 16600
},
{
"epoch": 0.038727434929841105,
"grad_norm": 3.156902551651001,
"learning_rate": 4.806362825350795e-05,
"loss": 0.9496,
"step": 16800
},
{
"epoch": 0.03918847582186302,
"grad_norm": 5.415482044219971,
"learning_rate": 4.804057620890685e-05,
"loss": 1.0288,
"step": 17000
},
{
"epoch": 0.03964951671388494,
"grad_norm": 1.94370698928833,
"learning_rate": 4.8017524164305755e-05,
"loss": 1.0689,
"step": 17200
},
{
"epoch": 0.04011055760590686,
"grad_norm": 5.076870441436768,
"learning_rate": 4.799447211970466e-05,
"loss": 0.9809,
"step": 17400
},
{
"epoch": 0.040571598497928775,
"grad_norm": 1.5371116399765015,
"learning_rate": 4.797142007510356e-05,
"loss": 0.9868,
"step": 17600
},
{
"epoch": 0.04103263938995069,
"grad_norm": 3.8730454444885254,
"learning_rate": 4.7948368030502465e-05,
"loss": 0.9093,
"step": 17800
},
{
"epoch": 0.04149368028197261,
"grad_norm": 2.304157257080078,
"learning_rate": 4.792531598590137e-05,
"loss": 0.9993,
"step": 18000
},
{
"epoch": 0.04195472117399453,
"grad_norm": 3.116572856903076,
"learning_rate": 4.7902263941300277e-05,
"loss": 1.0456,
"step": 18200
},
{
"epoch": 0.042415762066016445,
"grad_norm": 2.1732380390167236,
"learning_rate": 4.7879211896699175e-05,
"loss": 0.9607,
"step": 18400
},
{
"epoch": 0.04287680295803836,
"grad_norm": 3.363409996032715,
"learning_rate": 4.785615985209808e-05,
"loss": 1.0467,
"step": 18600
},
{
"epoch": 0.04333784385006028,
"grad_norm": 3.746406078338623,
"learning_rate": 4.7833107807496994e-05,
"loss": 0.9335,
"step": 18800
},
{
"epoch": 0.0437988847420822,
"grad_norm": 1.5440335273742676,
"learning_rate": 4.781005576289589e-05,
"loss": 0.9262,
"step": 19000
},
{
"epoch": 0.044259925634104115,
"grad_norm": 3.108581066131592,
"learning_rate": 4.77870037182948e-05,
"loss": 0.9564,
"step": 19200
},
{
"epoch": 0.04472096652612603,
"grad_norm": 2.0899717807769775,
"learning_rate": 4.7763951673693704e-05,
"loss": 0.9544,
"step": 19400
},
{
"epoch": 0.04518200741814795,
"grad_norm": 2.250314474105835,
"learning_rate": 4.77408996290926e-05,
"loss": 0.9592,
"step": 19600
},
{
"epoch": 0.04564304831016987,
"grad_norm": 1.656875729560852,
"learning_rate": 4.771784758449151e-05,
"loss": 0.9054,
"step": 19800
},
{
"epoch": 0.046104089202191785,
"grad_norm": 2.7267401218414307,
"learning_rate": 4.7694795539890414e-05,
"loss": 0.932,
"step": 20000
},
{
"epoch": 0.04656513009421371,
"grad_norm": 1.7133885622024536,
"learning_rate": 4.767174349528932e-05,
"loss": 0.9014,
"step": 20200
},
{
"epoch": 0.04702617098623563,
"grad_norm": 1.699610710144043,
"learning_rate": 4.764869145068822e-05,
"loss": 0.9469,
"step": 20400
},
{
"epoch": 0.047487211878257544,
"grad_norm": 2.0547351837158203,
"learning_rate": 4.7625639406087124e-05,
"loss": 0.911,
"step": 20600
},
{
"epoch": 0.04794825277027946,
"grad_norm": 2.4188601970672607,
"learning_rate": 4.760258736148603e-05,
"loss": 0.9195,
"step": 20800
},
{
"epoch": 0.04840929366230138,
"grad_norm": 1.6855781078338623,
"learning_rate": 4.757953531688493e-05,
"loss": 0.9212,
"step": 21000
},
{
"epoch": 0.0488703345543233,
"grad_norm": 3.0659608840942383,
"learning_rate": 4.7556483272283835e-05,
"loss": 0.9408,
"step": 21200
},
{
"epoch": 0.049331375446345214,
"grad_norm": 1.8149137496948242,
"learning_rate": 4.753343122768274e-05,
"loss": 0.8916,
"step": 21400
},
{
"epoch": 0.04979241633836713,
"grad_norm": 3.0508041381835938,
"learning_rate": 4.7510379183081646e-05,
"loss": 0.9197,
"step": 21600
},
{
"epoch": 0.05025345723038905,
"grad_norm": 1.7217645645141602,
"learning_rate": 4.7487327138480545e-05,
"loss": 0.977,
"step": 21800
},
{
"epoch": 0.05071449812241097,
"grad_norm": 2.8696203231811523,
"learning_rate": 4.746427509387945e-05,
"loss": 0.8998,
"step": 22000
},
{
"epoch": 0.051175539014432884,
"grad_norm": 2.4317526817321777,
"learning_rate": 4.744122304927836e-05,
"loss": 0.895,
"step": 22200
},
{
"epoch": 0.0516365799064548,
"grad_norm": 1.4486078023910522,
"learning_rate": 4.741817100467726e-05,
"loss": 0.8794,
"step": 22400
},
{
"epoch": 0.05209762079847672,
"grad_norm": 2.6138267517089844,
"learning_rate": 4.739511896007617e-05,
"loss": 0.906,
"step": 22600
},
{
"epoch": 0.05255866169049864,
"grad_norm": 1.7025116682052612,
"learning_rate": 4.737206691547507e-05,
"loss": 0.9726,
"step": 22800
},
{
"epoch": 0.053019702582520555,
"grad_norm": 1.7836490869522095,
"learning_rate": 4.734901487087398e-05,
"loss": 0.8891,
"step": 23000
},
{
"epoch": 0.05348074347454247,
"grad_norm": 2.3155412673950195,
"learning_rate": 4.732596282627288e-05,
"loss": 0.8737,
"step": 23200
},
{
"epoch": 0.05394178436656439,
"grad_norm": 3.3721256256103516,
"learning_rate": 4.7302910781671783e-05,
"loss": 0.9199,
"step": 23400
},
{
"epoch": 0.05440282525858631,
"grad_norm": 1.807015061378479,
"learning_rate": 4.727985873707069e-05,
"loss": 0.9182,
"step": 23600
},
{
"epoch": 0.05486386615060823,
"grad_norm": 2.1808011531829834,
"learning_rate": 4.725680669246959e-05,
"loss": 0.9268,
"step": 23800
},
{
"epoch": 0.05532490704263015,
"grad_norm": 3.614689350128174,
"learning_rate": 4.7233754647868494e-05,
"loss": 0.8777,
"step": 24000
},
{
"epoch": 0.055785947934652066,
"grad_norm": 1.881955623626709,
"learning_rate": 4.72107026032674e-05,
"loss": 0.8564,
"step": 24200
},
{
"epoch": 0.056246988826673984,
"grad_norm": 2.8941149711608887,
"learning_rate": 4.7187650558666305e-05,
"loss": 0.8296,
"step": 24400
},
{
"epoch": 0.0567080297186959,
"grad_norm": 1.4242929220199585,
"learning_rate": 4.7164598514065204e-05,
"loss": 0.7617,
"step": 24600
},
{
"epoch": 0.05716907061071782,
"grad_norm": 4.67744255065918,
"learning_rate": 4.714154646946411e-05,
"loss": 0.9068,
"step": 24800
},
{
"epoch": 0.057630111502739736,
"grad_norm": 2.4979476928710938,
"learning_rate": 4.7118494424863015e-05,
"loss": 0.8382,
"step": 25000
},
{
"epoch": 0.058091152394761654,
"grad_norm": 2.030360460281372,
"learning_rate": 4.7095442380261914e-05,
"loss": 0.8524,
"step": 25200
},
{
"epoch": 0.05855219328678357,
"grad_norm": 2.282217502593994,
"learning_rate": 4.707239033566083e-05,
"loss": 0.8889,
"step": 25400
},
{
"epoch": 0.05901323417880549,
"grad_norm": 1.7318954467773438,
"learning_rate": 4.704933829105973e-05,
"loss": 0.9187,
"step": 25600
},
{
"epoch": 0.059474275070827406,
"grad_norm": 1.8734960556030273,
"learning_rate": 4.702628624645863e-05,
"loss": 0.7524,
"step": 25800
},
{
"epoch": 0.059935315962849324,
"grad_norm": 2.359909772872925,
"learning_rate": 4.700323420185754e-05,
"loss": 0.8388,
"step": 26000
},
{
"epoch": 0.06039635685487124,
"grad_norm": 1.9811803102493286,
"learning_rate": 4.698018215725644e-05,
"loss": 0.8994,
"step": 26200
},
{
"epoch": 0.06085739774689316,
"grad_norm": 1.4820138216018677,
"learning_rate": 4.695713011265535e-05,
"loss": 0.8098,
"step": 26400
},
{
"epoch": 0.061318438638915077,
"grad_norm": 3.8355236053466797,
"learning_rate": 4.693407806805425e-05,
"loss": 0.8365,
"step": 26600
},
{
"epoch": 0.061779479530936994,
"grad_norm": 1.9260506629943848,
"learning_rate": 4.691102602345315e-05,
"loss": 0.8508,
"step": 26800
},
{
"epoch": 0.06224052042295891,
"grad_norm": 1.6395090818405151,
"learning_rate": 4.688797397885206e-05,
"loss": 0.8537,
"step": 27000
},
{
"epoch": 0.06270156131498084,
"grad_norm": 0.8920634984970093,
"learning_rate": 4.686492193425096e-05,
"loss": 0.857,
"step": 27200
},
{
"epoch": 0.06316260220700275,
"grad_norm": 2.063812494277954,
"learning_rate": 4.684186988964986e-05,
"loss": 0.835,
"step": 27400
},
{
"epoch": 0.06362364309902467,
"grad_norm": 1.6132125854492188,
"learning_rate": 4.681881784504877e-05,
"loss": 0.8629,
"step": 27600
},
{
"epoch": 0.06408468399104658,
"grad_norm": 1.4121313095092773,
"learning_rate": 4.6795765800447674e-05,
"loss": 0.8026,
"step": 27800
},
{
"epoch": 0.0645457248830685,
"grad_norm": 1.3277547359466553,
"learning_rate": 4.677271375584657e-05,
"loss": 0.8315,
"step": 28000
},
{
"epoch": 0.06500676577509042,
"grad_norm": 1.51455819606781,
"learning_rate": 4.674966171124548e-05,
"loss": 0.8664,
"step": 28200
},
{
"epoch": 0.06546780666711234,
"grad_norm": 16.388320922851562,
"learning_rate": 4.6726609666644385e-05,
"loss": 0.8476,
"step": 28400
},
{
"epoch": 0.06592884755913425,
"grad_norm": 2.9211268424987793,
"learning_rate": 4.6703557622043284e-05,
"loss": 0.9153,
"step": 28600
},
{
"epoch": 0.06638988845115618,
"grad_norm": 1.7601099014282227,
"learning_rate": 4.6680505577442196e-05,
"loss": 0.8675,
"step": 28800
},
{
"epoch": 0.06685092934317809,
"grad_norm": 3.691970109939575,
"learning_rate": 4.66574535328411e-05,
"loss": 0.7636,
"step": 29000
},
{
"epoch": 0.06731197023520001,
"grad_norm": 0.8975255489349365,
"learning_rate": 4.663440148824e-05,
"loss": 0.7958,
"step": 29200
},
{
"epoch": 0.06777301112722194,
"grad_norm": 1.8996187448501587,
"learning_rate": 4.6611349443638906e-05,
"loss": 0.7784,
"step": 29400
},
{
"epoch": 0.06823405201924385,
"grad_norm": 2.2210590839385986,
"learning_rate": 4.658829739903781e-05,
"loss": 0.8002,
"step": 29600
},
{
"epoch": 0.06869509291126577,
"grad_norm": 2.341740369796753,
"learning_rate": 4.656524535443672e-05,
"loss": 0.778,
"step": 29800
},
{
"epoch": 0.06915613380328768,
"grad_norm": 2.567145347595215,
"learning_rate": 4.6542193309835617e-05,
"loss": 0.8061,
"step": 30000
},
{
"epoch": 0.0696171746953096,
"grad_norm": 0.8251648545265198,
"learning_rate": 4.651914126523452e-05,
"loss": 0.8163,
"step": 30200
},
{
"epoch": 0.07007821558733152,
"grad_norm": 3.3493523597717285,
"learning_rate": 4.649608922063343e-05,
"loss": 0.7326,
"step": 30400
},
{
"epoch": 0.07053925647935344,
"grad_norm": 3.1266913414001465,
"learning_rate": 4.6473037176032334e-05,
"loss": 0.7925,
"step": 30600
},
{
"epoch": 0.07100029737137535,
"grad_norm": 1.5895702838897705,
"learning_rate": 4.644998513143123e-05,
"loss": 0.7932,
"step": 30800
},
{
"epoch": 0.07146133826339728,
"grad_norm": 1.4103891849517822,
"learning_rate": 4.642693308683014e-05,
"loss": 0.7583,
"step": 31000
},
{
"epoch": 0.07192237915541919,
"grad_norm": 1.4762630462646484,
"learning_rate": 4.6403881042229044e-05,
"loss": 0.7364,
"step": 31200
},
{
"epoch": 0.07238342004744111,
"grad_norm": 1.4868961572647095,
"learning_rate": 4.638082899762794e-05,
"loss": 0.7874,
"step": 31400
},
{
"epoch": 0.07284446093946302,
"grad_norm": 1.9157131910324097,
"learning_rate": 4.635777695302685e-05,
"loss": 0.7669,
"step": 31600
},
{
"epoch": 0.07330550183148495,
"grad_norm": 1.249605417251587,
"learning_rate": 4.6334724908425754e-05,
"loss": 0.7711,
"step": 31800
},
{
"epoch": 0.07376654272350686,
"grad_norm": 2.004805326461792,
"learning_rate": 4.631167286382466e-05,
"loss": 0.7518,
"step": 32000
},
{
"epoch": 0.07422758361552878,
"grad_norm": 1.682356595993042,
"learning_rate": 4.6288620819223565e-05,
"loss": 0.7534,
"step": 32200
},
{
"epoch": 0.07468862450755069,
"grad_norm": 2.679586887359619,
"learning_rate": 4.626556877462247e-05,
"loss": 0.8238,
"step": 32400
},
{
"epoch": 0.07514966539957262,
"grad_norm": 1.364603042602539,
"learning_rate": 4.624251673002138e-05,
"loss": 0.8159,
"step": 32600
},
{
"epoch": 0.07561070629159453,
"grad_norm": 2.2733583450317383,
"learning_rate": 4.6219464685420276e-05,
"loss": 0.7822,
"step": 32800
},
{
"epoch": 0.07607174718361645,
"grad_norm": 2.5104455947875977,
"learning_rate": 4.619641264081918e-05,
"loss": 0.7664,
"step": 33000
},
{
"epoch": 0.07653278807563837,
"grad_norm": 1.4707565307617188,
"learning_rate": 4.617336059621809e-05,
"loss": 0.7817,
"step": 33200
},
{
"epoch": 0.07699382896766029,
"grad_norm": 1.9409255981445312,
"learning_rate": 4.6150308551616986e-05,
"loss": 0.8272,
"step": 33400
},
{
"epoch": 0.07745486985968221,
"grad_norm": 1.9460760354995728,
"learning_rate": 4.612725650701589e-05,
"loss": 0.815,
"step": 33600
},
{
"epoch": 0.07791591075170412,
"grad_norm": 2.3821299076080322,
"learning_rate": 4.61042044624148e-05,
"loss": 0.7747,
"step": 33800
},
{
"epoch": 0.07837695164372604,
"grad_norm": 1.8464001417160034,
"learning_rate": 4.60811524178137e-05,
"loss": 0.7529,
"step": 34000
},
{
"epoch": 0.07883799253574796,
"grad_norm": 2.189345121383667,
"learning_rate": 4.60581003732126e-05,
"loss": 0.7485,
"step": 34200
},
{
"epoch": 0.07929903342776988,
"grad_norm": 1.4213758707046509,
"learning_rate": 4.603504832861151e-05,
"loss": 0.7748,
"step": 34400
},
{
"epoch": 0.07976007431979179,
"grad_norm": 1.6908587217330933,
"learning_rate": 4.601199628401041e-05,
"loss": 0.7975,
"step": 34600
},
{
"epoch": 0.08022111521181371,
"grad_norm": 1.0378413200378418,
"learning_rate": 4.598894423940931e-05,
"loss": 0.7697,
"step": 34800
},
{
"epoch": 0.08068215610383563,
"grad_norm": 1.9026026725769043,
"learning_rate": 4.596589219480822e-05,
"loss": 0.7898,
"step": 35000
},
{
"epoch": 0.08114319699585755,
"grad_norm": 3.3741543292999268,
"learning_rate": 4.5942840150207123e-05,
"loss": 0.7273,
"step": 35200
},
{
"epoch": 0.08160423788787946,
"grad_norm": 1.1691900491714478,
"learning_rate": 4.591978810560603e-05,
"loss": 0.8134,
"step": 35400
},
{
"epoch": 0.08206527877990138,
"grad_norm": 1.40901780128479,
"learning_rate": 4.5896736061004935e-05,
"loss": 0.7824,
"step": 35600
},
{
"epoch": 0.0825263196719233,
"grad_norm": 2.224029064178467,
"learning_rate": 4.587368401640384e-05,
"loss": 0.7151,
"step": 35800
},
{
"epoch": 0.08298736056394522,
"grad_norm": 2.2175581455230713,
"learning_rate": 4.5850631971802746e-05,
"loss": 0.7764,
"step": 36000
},
{
"epoch": 0.08344840145596713,
"grad_norm": 1.4262895584106445,
"learning_rate": 4.5827579927201645e-05,
"loss": 0.8068,
"step": 36200
},
{
"epoch": 0.08390944234798905,
"grad_norm": 1.3810303211212158,
"learning_rate": 4.580452788260055e-05,
"loss": 0.7415,
"step": 36400
},
{
"epoch": 0.08437048324001098,
"grad_norm": 1.2411589622497559,
"learning_rate": 4.5781475837999456e-05,
"loss": 0.7406,
"step": 36600
},
{
"epoch": 0.08483152413203289,
"grad_norm": 1.8816428184509277,
"learning_rate": 4.5758423793398355e-05,
"loss": 0.7843,
"step": 36800
},
{
"epoch": 0.08529256502405481,
"grad_norm": 0.910955548286438,
"learning_rate": 4.573537174879726e-05,
"loss": 0.7791,
"step": 37000
},
{
"epoch": 0.08575360591607673,
"grad_norm": 2.3837499618530273,
"learning_rate": 4.571231970419617e-05,
"loss": 0.7272,
"step": 37200
},
{
"epoch": 0.08621464680809865,
"grad_norm": 1.2091758251190186,
"learning_rate": 4.568926765959507e-05,
"loss": 0.7586,
"step": 37400
},
{
"epoch": 0.08667568770012056,
"grad_norm": 2.031092643737793,
"learning_rate": 4.566621561499397e-05,
"loss": 0.7467,
"step": 37600
},
{
"epoch": 0.08713672859214248,
"grad_norm": 1.8834586143493652,
"learning_rate": 4.564316357039288e-05,
"loss": 0.7743,
"step": 37800
},
{
"epoch": 0.0875977694841644,
"grad_norm": 4.032400131225586,
"learning_rate": 4.562011152579178e-05,
"loss": 0.7581,
"step": 38000
},
{
"epoch": 0.08805881037618632,
"grad_norm": 1.3809504508972168,
"learning_rate": 4.559705948119069e-05,
"loss": 0.7752,
"step": 38200
},
{
"epoch": 0.08851985126820823,
"grad_norm": 2.5716593265533447,
"learning_rate": 4.557400743658959e-05,
"loss": 0.7599,
"step": 38400
},
{
"epoch": 0.08898089216023015,
"grad_norm": 1.1471354961395264,
"learning_rate": 4.555095539198849e-05,
"loss": 0.7802,
"step": 38600
},
{
"epoch": 0.08944193305225207,
"grad_norm": 1.757161259651184,
"learning_rate": 4.55279033473874e-05,
"loss": 0.7891,
"step": 38800
},
{
"epoch": 0.08990297394427399,
"grad_norm": 1.920569896697998,
"learning_rate": 4.5504851302786304e-05,
"loss": 0.833,
"step": 39000
},
{
"epoch": 0.0903640148362959,
"grad_norm": 1.7894421815872192,
"learning_rate": 4.548179925818521e-05,
"loss": 0.7353,
"step": 39200
},
{
"epoch": 0.09082505572831782,
"grad_norm": 1.6656538248062134,
"learning_rate": 4.5458747213584115e-05,
"loss": 0.7091,
"step": 39400
},
{
"epoch": 0.09128609662033974,
"grad_norm": 2.3881382942199707,
"learning_rate": 4.5435695168983014e-05,
"loss": 0.7609,
"step": 39600
},
{
"epoch": 0.09174713751236166,
"grad_norm": 2.9305579662323,
"learning_rate": 4.541264312438192e-05,
"loss": 0.785,
"step": 39800
},
{
"epoch": 0.09220817840438357,
"grad_norm": 1.734604835510254,
"learning_rate": 4.5389591079780826e-05,
"loss": 0.8049,
"step": 40000
},
{
"epoch": 0.0926692192964055,
"grad_norm": 2.1614363193511963,
"learning_rate": 4.536653903517973e-05,
"loss": 0.7618,
"step": 40200
},
{
"epoch": 0.09313026018842742,
"grad_norm": 1.1229090690612793,
"learning_rate": 4.534348699057863e-05,
"loss": 0.7517,
"step": 40400
},
{
"epoch": 0.09359130108044933,
"grad_norm": 2.0106265544891357,
"learning_rate": 4.5320434945977536e-05,
"loss": 0.7922,
"step": 40600
},
{
"epoch": 0.09405234197247125,
"grad_norm": 2.5871689319610596,
"learning_rate": 4.529738290137644e-05,
"loss": 0.6778,
"step": 40800
},
{
"epoch": 0.09451338286449316,
"grad_norm": 1.3384044170379639,
"learning_rate": 4.527433085677534e-05,
"loss": 0.8056,
"step": 41000
},
{
"epoch": 0.09497442375651509,
"grad_norm": 2.1800479888916016,
"learning_rate": 4.5251278812174246e-05,
"loss": 0.771,
"step": 41200
},
{
"epoch": 0.095435464648537,
"grad_norm": 2.0507094860076904,
"learning_rate": 4.522822676757315e-05,
"loss": 0.758,
"step": 41400
},
{
"epoch": 0.09589650554055892,
"grad_norm": 0.8887900710105896,
"learning_rate": 4.520517472297206e-05,
"loss": 0.7563,
"step": 41600
},
{
"epoch": 0.09635754643258083,
"grad_norm": 2.479279041290283,
"learning_rate": 4.5182122678370956e-05,
"loss": 0.7284,
"step": 41800
},
{
"epoch": 0.09681858732460276,
"grad_norm": 1.2137857675552368,
"learning_rate": 4.515907063376986e-05,
"loss": 0.7773,
"step": 42000
},
{
"epoch": 0.09727962821662467,
"grad_norm": 1.0529214143753052,
"learning_rate": 4.513601858916877e-05,
"loss": 0.7663,
"step": 42200
},
{
"epoch": 0.0977406691086466,
"grad_norm": 1.7939465045928955,
"learning_rate": 4.5112966544567674e-05,
"loss": 0.7695,
"step": 42400
},
{
"epoch": 0.0982017100006685,
"grad_norm": 0.4527842104434967,
"learning_rate": 4.508991449996658e-05,
"loss": 0.7037,
"step": 42600
},
{
"epoch": 0.09866275089269043,
"grad_norm": 1.5540140867233276,
"learning_rate": 4.5066862455365485e-05,
"loss": 0.7478,
"step": 42800
},
{
"epoch": 0.09912379178471234,
"grad_norm": 1.9301183223724365,
"learning_rate": 4.5043810410764384e-05,
"loss": 0.7098,
"step": 43000
},
{
"epoch": 0.09958483267673426,
"grad_norm": 2.3165171146392822,
"learning_rate": 4.502075836616329e-05,
"loss": 0.7192,
"step": 43200
},
{
"epoch": 0.10004587356875617,
"grad_norm": 2.4089784622192383,
"learning_rate": 4.4997706321562195e-05,
"loss": 0.7016,
"step": 43400
},
{
"epoch": 0.1005069144607781,
"grad_norm": 1.5298134088516235,
"learning_rate": 4.49746542769611e-05,
"loss": 0.7295,
"step": 43600
},
{
"epoch": 0.10096795535280002,
"grad_norm": 1.7216567993164062,
"learning_rate": 4.495160223236e-05,
"loss": 0.7603,
"step": 43800
},
{
"epoch": 0.10142899624482193,
"grad_norm": 2.678551435470581,
"learning_rate": 4.4928550187758905e-05,
"loss": 0.7225,
"step": 44000
},
{
"epoch": 0.10189003713684386,
"grad_norm": 2.051182985305786,
"learning_rate": 4.490549814315781e-05,
"loss": 0.7398,
"step": 44200
},
{
"epoch": 0.10235107802886577,
"grad_norm": 1.0527026653289795,
"learning_rate": 4.488244609855671e-05,
"loss": 0.7041,
"step": 44400
},
{
"epoch": 0.1028121189208877,
"grad_norm": 2.363438367843628,
"learning_rate": 4.4859394053955616e-05,
"loss": 0.7273,
"step": 44600
},
{
"epoch": 0.1032731598129096,
"grad_norm": 3.6583263874053955,
"learning_rate": 4.483634200935452e-05,
"loss": 0.7321,
"step": 44800
},
{
"epoch": 0.10373420070493153,
"grad_norm": 1.391920804977417,
"learning_rate": 4.481328996475343e-05,
"loss": 0.7498,
"step": 45000
},
{
"epoch": 0.10419524159695344,
"grad_norm": 1.3391286134719849,
"learning_rate": 4.4790237920152326e-05,
"loss": 0.7436,
"step": 45200
},
{
"epoch": 0.10465628248897536,
"grad_norm": 1.6960753202438354,
"learning_rate": 4.476718587555123e-05,
"loss": 0.6681,
"step": 45400
},
{
"epoch": 0.10511732338099727,
"grad_norm": 1.6384496688842773,
"learning_rate": 4.474413383095014e-05,
"loss": 0.6966,
"step": 45600
},
{
"epoch": 0.1055783642730192,
"grad_norm": 2.391704559326172,
"learning_rate": 4.472108178634904e-05,
"loss": 0.7039,
"step": 45800
},
{
"epoch": 0.10603940516504111,
"grad_norm": 1.6314672231674194,
"learning_rate": 4.469802974174795e-05,
"loss": 0.715,
"step": 46000
},
{
"epoch": 0.10650044605706303,
"grad_norm": 0.872035026550293,
"learning_rate": 4.4674977697146854e-05,
"loss": 0.7375,
"step": 46200
},
{
"epoch": 0.10696148694908494,
"grad_norm": 2.016697645187378,
"learning_rate": 4.465192565254575e-05,
"loss": 0.7388,
"step": 46400
},
{
"epoch": 0.10742252784110687,
"grad_norm": 2.294455051422119,
"learning_rate": 4.462887360794466e-05,
"loss": 0.7218,
"step": 46600
},
{
"epoch": 0.10788356873312878,
"grad_norm": 1.2068428993225098,
"learning_rate": 4.4605821563343564e-05,
"loss": 0.6365,
"step": 46800
},
{
"epoch": 0.1083446096251507,
"grad_norm": 2.1000618934631348,
"learning_rate": 4.458276951874247e-05,
"loss": 0.6978,
"step": 47000
},
{
"epoch": 0.10880565051717261,
"grad_norm": 2.496563673019409,
"learning_rate": 4.455971747414137e-05,
"loss": 0.6807,
"step": 47200
},
{
"epoch": 0.10926669140919454,
"grad_norm": 1.9439219236373901,
"learning_rate": 4.4536665429540275e-05,
"loss": 0.7186,
"step": 47400
},
{
"epoch": 0.10972773230121646,
"grad_norm": 1.817345142364502,
"learning_rate": 4.451361338493918e-05,
"loss": 0.7519,
"step": 47600
},
{
"epoch": 0.11018877319323837,
"grad_norm": 2.6443488597869873,
"learning_rate": 4.4490561340338086e-05,
"loss": 0.672,
"step": 47800
},
{
"epoch": 0.1106498140852603,
"grad_norm": 7.7301483154296875,
"learning_rate": 4.4467509295736985e-05,
"loss": 0.7019,
"step": 48000
},
{
"epoch": 0.11111085497728221,
"grad_norm": 2.1185405254364014,
"learning_rate": 4.444445725113589e-05,
"loss": 0.7819,
"step": 48200
},
{
"epoch": 0.11157189586930413,
"grad_norm": 1.3251652717590332,
"learning_rate": 4.4421405206534796e-05,
"loss": 0.688,
"step": 48400
},
{
"epoch": 0.11203293676132604,
"grad_norm": 2.554704427719116,
"learning_rate": 4.4398353161933695e-05,
"loss": 0.7729,
"step": 48600
},
{
"epoch": 0.11249397765334797,
"grad_norm": 1.0944995880126953,
"learning_rate": 4.43753011173326e-05,
"loss": 0.7296,
"step": 48800
},
{
"epoch": 0.11295501854536988,
"grad_norm": 0.5829809904098511,
"learning_rate": 4.4352249072731507e-05,
"loss": 0.6906,
"step": 49000
},
{
"epoch": 0.1134160594373918,
"grad_norm": 1.3186956644058228,
"learning_rate": 4.432919702813041e-05,
"loss": 0.6849,
"step": 49200
},
{
"epoch": 0.11387710032941371,
"grad_norm": 2.7295708656311035,
"learning_rate": 4.430614498352932e-05,
"loss": 0.7398,
"step": 49400
},
{
"epoch": 0.11433814122143564,
"grad_norm": 0.8470388054847717,
"learning_rate": 4.4283092938928224e-05,
"loss": 0.7197,
"step": 49600
},
{
"epoch": 0.11479918211345755,
"grad_norm": 2.0679562091827393,
"learning_rate": 4.426004089432713e-05,
"loss": 0.7102,
"step": 49800
},
{
"epoch": 0.11526022300547947,
"grad_norm": 1.7280285358428955,
"learning_rate": 4.423698884972603e-05,
"loss": 0.6808,
"step": 50000
},
{
"epoch": 0.11526022300547947,
"eval_loss": 0.6911378502845764,
"eval_runtime": 143.422,
"eval_samples_per_second": 30.553,
"eval_steps_per_second": 30.553,
"step": 50000
},
{
"epoch": 0.11572126389750138,
"grad_norm": 1.910679578781128,
"learning_rate": 4.4213936805124934e-05,
"loss": 0.6451,
"step": 50200
},
{
"epoch": 0.11618230478952331,
"grad_norm": 1.45720636844635,
"learning_rate": 4.419088476052384e-05,
"loss": 0.6488,
"step": 50400
},
{
"epoch": 0.11664334568154522,
"grad_norm": 2.245499610900879,
"learning_rate": 4.416783271592274e-05,
"loss": 0.6719,
"step": 50600
},
{
"epoch": 0.11710438657356714,
"grad_norm": 1.8845460414886475,
"learning_rate": 4.4144780671321644e-05,
"loss": 0.6931,
"step": 50800
},
{
"epoch": 0.11756542746558905,
"grad_norm": 0.9793957471847534,
"learning_rate": 4.412172862672055e-05,
"loss": 0.6606,
"step": 51000
},
{
"epoch": 0.11802646835761098,
"grad_norm": 0.7978737950325012,
"learning_rate": 4.4098676582119455e-05,
"loss": 0.7226,
"step": 51200
},
{
"epoch": 0.1184875092496329,
"grad_norm": 1.30372953414917,
"learning_rate": 4.4075624537518354e-05,
"loss": 0.6551,
"step": 51400
},
{
"epoch": 0.11894855014165481,
"grad_norm": 2.127319812774658,
"learning_rate": 4.405257249291726e-05,
"loss": 0.703,
"step": 51600
},
{
"epoch": 0.11940959103367674,
"grad_norm": 2.518284797668457,
"learning_rate": 4.4029520448316166e-05,
"loss": 0.6229,
"step": 51800
},
{
"epoch": 0.11987063192569865,
"grad_norm": 1.752998948097229,
"learning_rate": 4.4006468403715065e-05,
"loss": 0.7245,
"step": 52000
},
{
"epoch": 0.12033167281772057,
"grad_norm": 1.0647391080856323,
"learning_rate": 4.398341635911397e-05,
"loss": 0.6879,
"step": 52200
},
{
"epoch": 0.12079271370974248,
"grad_norm": 2.2331488132476807,
"learning_rate": 4.3960364314512876e-05,
"loss": 0.675,
"step": 52400
},
{
"epoch": 0.12125375460176441,
"grad_norm": 2.0386297702789307,
"learning_rate": 4.393731226991178e-05,
"loss": 0.6941,
"step": 52600
},
{
"epoch": 0.12171479549378632,
"grad_norm": 1.6465948820114136,
"learning_rate": 4.391426022531069e-05,
"loss": 0.6883,
"step": 52800
},
{
"epoch": 0.12217583638580824,
"grad_norm": 0.915367066860199,
"learning_rate": 4.389120818070959e-05,
"loss": 0.7423,
"step": 53000
},
{
"epoch": 0.12263687727783015,
"grad_norm": 1.3777244091033936,
"learning_rate": 4.38681561361085e-05,
"loss": 0.7046,
"step": 53200
},
{
"epoch": 0.12309791816985208,
"grad_norm": 1.9694982767105103,
"learning_rate": 4.38451040915074e-05,
"loss": 0.7019,
"step": 53400
},
{
"epoch": 0.12355895906187399,
"grad_norm": 2.005706310272217,
"learning_rate": 4.38220520469063e-05,
"loss": 0.633,
"step": 53600
},
{
"epoch": 0.12401999995389591,
"grad_norm": 1.4841361045837402,
"learning_rate": 4.379900000230521e-05,
"loss": 0.6973,
"step": 53800
},
{
"epoch": 0.12448104084591782,
"grad_norm": 1.7717888355255127,
"learning_rate": 4.377594795770411e-05,
"loss": 0.6861,
"step": 54000
},
{
"epoch": 0.12494208173793975,
"grad_norm": 2.585420608520508,
"learning_rate": 4.3752895913103013e-05,
"loss": 0.7221,
"step": 54200
},
{
"epoch": 0.12540312262996167,
"grad_norm": 1.8941155672073364,
"learning_rate": 4.372984386850192e-05,
"loss": 0.7162,
"step": 54400
},
{
"epoch": 0.12586416352198357,
"grad_norm": 1.920271396636963,
"learning_rate": 4.3706791823900825e-05,
"loss": 0.6739,
"step": 54600
},
{
"epoch": 0.1263252044140055,
"grad_norm": 1.4717075824737549,
"learning_rate": 4.3683739779299724e-05,
"loss": 0.6691,
"step": 54800
},
{
"epoch": 0.12678624530602742,
"grad_norm": 1.4651011228561401,
"learning_rate": 4.366068773469863e-05,
"loss": 0.7049,
"step": 55000
},
{
"epoch": 0.12724728619804934,
"grad_norm": 1.613660216331482,
"learning_rate": 4.3637635690097535e-05,
"loss": 0.7058,
"step": 55200
},
{
"epoch": 0.12770832709007127,
"grad_norm": 1.3848841190338135,
"learning_rate": 4.361458364549644e-05,
"loss": 0.6994,
"step": 55400
},
{
"epoch": 0.12816936798209316,
"grad_norm": 3.159140110015869,
"learning_rate": 4.359153160089534e-05,
"loss": 0.6746,
"step": 55600
},
{
"epoch": 0.1286304088741151,
"grad_norm": 1.353094458580017,
"learning_rate": 4.3568479556294245e-05,
"loss": 0.641,
"step": 55800
},
{
"epoch": 0.129091449766137,
"grad_norm": 1.5936461687088013,
"learning_rate": 4.354542751169315e-05,
"loss": 0.7054,
"step": 56000
},
{
"epoch": 0.12955249065815894,
"grad_norm": 1.0393725633621216,
"learning_rate": 4.352237546709206e-05,
"loss": 0.636,
"step": 56200
},
{
"epoch": 0.13001353155018083,
"grad_norm": 1.6490427255630493,
"learning_rate": 4.349932342249096e-05,
"loss": 0.6936,
"step": 56400
},
{
"epoch": 0.13047457244220276,
"grad_norm": 1.1870497465133667,
"learning_rate": 4.347627137788987e-05,
"loss": 0.7119,
"step": 56600
},
{
"epoch": 0.13093561333422468,
"grad_norm": 2.3116602897644043,
"learning_rate": 4.345321933328877e-05,
"loss": 0.6961,
"step": 56800
},
{
"epoch": 0.1313966542262466,
"grad_norm": 1.674390435218811,
"learning_rate": 4.343016728868767e-05,
"loss": 0.7203,
"step": 57000
},
{
"epoch": 0.1318576951182685,
"grad_norm": 1.583085536956787,
"learning_rate": 4.340711524408658e-05,
"loss": 0.6594,
"step": 57200
},
{
"epoch": 0.13231873601029043,
"grad_norm": 1.5700510740280151,
"learning_rate": 4.3384063199485484e-05,
"loss": 0.6794,
"step": 57400
},
{
"epoch": 0.13277977690231235,
"grad_norm": 2.0833590030670166,
"learning_rate": 4.336101115488438e-05,
"loss": 0.6751,
"step": 57600
},
{
"epoch": 0.13324081779433428,
"grad_norm": 6.332681655883789,
"learning_rate": 4.333795911028329e-05,
"loss": 0.6574,
"step": 57800
},
{
"epoch": 0.13370185868635617,
"grad_norm": 1.0451648235321045,
"learning_rate": 4.3314907065682194e-05,
"loss": 0.6514,
"step": 58000
},
{
"epoch": 0.1341628995783781,
"grad_norm": 2.710758924484253,
"learning_rate": 4.329185502108109e-05,
"loss": 0.6839,
"step": 58200
},
{
"epoch": 0.13462394047040002,
"grad_norm": 1.8599129915237427,
"learning_rate": 4.326880297648e-05,
"loss": 0.6423,
"step": 58400
},
{
"epoch": 0.13508498136242195,
"grad_norm": 2.223250389099121,
"learning_rate": 4.3245750931878904e-05,
"loss": 0.6981,
"step": 58600
},
{
"epoch": 0.13554602225444387,
"grad_norm": 1.308874249458313,
"learning_rate": 4.322269888727781e-05,
"loss": 0.6425,
"step": 58800
},
{
"epoch": 0.13600706314646577,
"grad_norm": 1.2840343713760376,
"learning_rate": 4.319964684267671e-05,
"loss": 0.6832,
"step": 59000
},
{
"epoch": 0.1364681040384877,
"grad_norm": 1.2683848142623901,
"learning_rate": 4.3176594798075615e-05,
"loss": 0.6748,
"step": 59200
},
{
"epoch": 0.13692914493050962,
"grad_norm": 1.666727900505066,
"learning_rate": 4.315354275347453e-05,
"loss": 0.6459,
"step": 59400
},
{
"epoch": 0.13739018582253154,
"grad_norm": 1.8931647539138794,
"learning_rate": 4.3130490708873426e-05,
"loss": 0.6688,
"step": 59600
},
{
"epoch": 0.13785122671455344,
"grad_norm": 1.728664755821228,
"learning_rate": 4.310743866427233e-05,
"loss": 0.6489,
"step": 59800
},
{
"epoch": 0.13831226760657536,
"grad_norm": 1.461280345916748,
"learning_rate": 4.308438661967124e-05,
"loss": 0.6415,
"step": 60000
},
{
"epoch": 0.1387733084985973,
"grad_norm": 0.6125675439834595,
"learning_rate": 4.3061334575070136e-05,
"loss": 0.6321,
"step": 60200
},
{
"epoch": 0.1392343493906192,
"grad_norm": 1.7109229564666748,
"learning_rate": 4.303828253046904e-05,
"loss": 0.6506,
"step": 60400
},
{
"epoch": 0.1396953902826411,
"grad_norm": 1.3291008472442627,
"learning_rate": 4.301523048586795e-05,
"loss": 0.6312,
"step": 60600
},
{
"epoch": 0.14015643117466303,
"grad_norm": 1.697153091430664,
"learning_rate": 4.299217844126685e-05,
"loss": 0.701,
"step": 60800
},
{
"epoch": 0.14061747206668496,
"grad_norm": 0.8234291672706604,
"learning_rate": 4.296912639666575e-05,
"loss": 0.6556,
"step": 61000
},
{
"epoch": 0.14107851295870688,
"grad_norm": 1.3336366415023804,
"learning_rate": 4.294607435206466e-05,
"loss": 0.6853,
"step": 61200
},
{
"epoch": 0.14153955385072878,
"grad_norm": 1.8199868202209473,
"learning_rate": 4.2923022307463564e-05,
"loss": 0.6498,
"step": 61400
},
{
"epoch": 0.1420005947427507,
"grad_norm": 2.1182050704956055,
"learning_rate": 4.289997026286246e-05,
"loss": 0.6555,
"step": 61600
},
{
"epoch": 0.14246163563477263,
"grad_norm": 1.9714126586914062,
"learning_rate": 4.287691821826137e-05,
"loss": 0.7304,
"step": 61800
},
{
"epoch": 0.14292267652679455,
"grad_norm": 1.536047339439392,
"learning_rate": 4.2853866173660274e-05,
"loss": 0.5836,
"step": 62000
},
{
"epoch": 0.14338371741881648,
"grad_norm": 1.4263625144958496,
"learning_rate": 4.283081412905918e-05,
"loss": 0.6165,
"step": 62200
},
{
"epoch": 0.14384475831083837,
"grad_norm": 0.6614183783531189,
"learning_rate": 4.280776208445808e-05,
"loss": 0.7117,
"step": 62400
},
{
"epoch": 0.1443057992028603,
"grad_norm": 1.4404590129852295,
"learning_rate": 4.2784710039856984e-05,
"loss": 0.6583,
"step": 62600
},
{
"epoch": 0.14476684009488222,
"grad_norm": 3.333214044570923,
"learning_rate": 4.2761657995255897e-05,
"loss": 0.5992,
"step": 62800
},
{
"epoch": 0.14522788098690415,
"grad_norm": 1.3741906881332397,
"learning_rate": 4.2738605950654795e-05,
"loss": 0.6238,
"step": 63000
},
{
"epoch": 0.14568892187892604,
"grad_norm": 2.261046886444092,
"learning_rate": 4.27155539060537e-05,
"loss": 0.6908,
"step": 63200
},
{
"epoch": 0.14614996277094797,
"grad_norm": 2.2750587463378906,
"learning_rate": 4.269250186145261e-05,
"loss": 0.6479,
"step": 63400
},
{
"epoch": 0.1466110036629699,
"grad_norm": 2.38415265083313,
"learning_rate": 4.2669449816851506e-05,
"loss": 0.6621,
"step": 63600
},
{
"epoch": 0.14707204455499182,
"grad_norm": 4.09643030166626,
"learning_rate": 4.264639777225041e-05,
"loss": 0.6689,
"step": 63800
},
{
"epoch": 0.1475330854470137,
"grad_norm": 1.5877435207366943,
"learning_rate": 4.262334572764932e-05,
"loss": 0.6664,
"step": 64000
},
{
"epoch": 0.14799412633903564,
"grad_norm": 1.692415475845337,
"learning_rate": 4.260029368304822e-05,
"loss": 0.6646,
"step": 64200
},
{
"epoch": 0.14845516723105756,
"grad_norm": 1.6003667116165161,
"learning_rate": 4.257724163844712e-05,
"loss": 0.6305,
"step": 64400
},
{
"epoch": 0.14891620812307949,
"grad_norm": 1.2886855602264404,
"learning_rate": 4.255418959384603e-05,
"loss": 0.6017,
"step": 64600
},
{
"epoch": 0.14937724901510138,
"grad_norm": 0.7296251654624939,
"learning_rate": 4.253113754924493e-05,
"loss": 0.6852,
"step": 64800
},
{
"epoch": 0.1498382899071233,
"grad_norm": 1.687552809715271,
"learning_rate": 4.250808550464384e-05,
"loss": 0.6716,
"step": 65000
},
{
"epoch": 0.15029933079914523,
"grad_norm": 1.0152884721755981,
"learning_rate": 4.248503346004274e-05,
"loss": 0.6823,
"step": 65200
},
{
"epoch": 0.15076037169116716,
"grad_norm": 2.022918939590454,
"learning_rate": 4.246198141544164e-05,
"loss": 0.6713,
"step": 65400
},
{
"epoch": 0.15122141258318905,
"grad_norm": 0.733291745185852,
"learning_rate": 4.243892937084055e-05,
"loss": 0.6375,
"step": 65600
},
{
"epoch": 0.15168245347521098,
"grad_norm": 2.1983726024627686,
"learning_rate": 4.241587732623945e-05,
"loss": 0.6861,
"step": 65800
},
{
"epoch": 0.1521434943672329,
"grad_norm": 3.5877902507781982,
"learning_rate": 4.2392825281638353e-05,
"loss": 0.6393,
"step": 66000
},
{
"epoch": 0.15260453525925483,
"grad_norm": 1.1176559925079346,
"learning_rate": 4.2369773237037266e-05,
"loss": 0.6933,
"step": 66200
},
{
"epoch": 0.15306557615127675,
"grad_norm": 1.4344258308410645,
"learning_rate": 4.2346721192436165e-05,
"loss": 0.6471,
"step": 66400
},
{
"epoch": 0.15352661704329865,
"grad_norm": 1.4673750400543213,
"learning_rate": 4.232366914783507e-05,
"loss": 0.6657,
"step": 66600
},
{
"epoch": 0.15398765793532057,
"grad_norm": 1.2807679176330566,
"learning_rate": 4.2300617103233976e-05,
"loss": 0.6353,
"step": 66800
},
{
"epoch": 0.1544486988273425,
"grad_norm": 1.1444551944732666,
"learning_rate": 4.227756505863288e-05,
"loss": 0.6789,
"step": 67000
},
{
"epoch": 0.15490973971936442,
"grad_norm": 2.322291374206543,
"learning_rate": 4.225451301403178e-05,
"loss": 0.6841,
"step": 67200
},
{
"epoch": 0.15537078061138632,
"grad_norm": 1.6149322986602783,
"learning_rate": 4.2231460969430686e-05,
"loss": 0.6654,
"step": 67400
},
{
"epoch": 0.15583182150340824,
"grad_norm": 1.7921006679534912,
"learning_rate": 4.220840892482959e-05,
"loss": 0.6688,
"step": 67600
},
{
"epoch": 0.15629286239543017,
"grad_norm": 1.522269606590271,
"learning_rate": 4.218535688022849e-05,
"loss": 0.6815,
"step": 67800
},
{
"epoch": 0.1567539032874521,
"grad_norm": 1.6208064556121826,
"learning_rate": 4.21623048356274e-05,
"loss": 0.6331,
"step": 68000
},
{
"epoch": 0.157214944179474,
"grad_norm": 1.7673718929290771,
"learning_rate": 4.21392527910263e-05,
"loss": 0.5858,
"step": 68200
},
{
"epoch": 0.1576759850714959,
"grad_norm": 1.3930482864379883,
"learning_rate": 4.211620074642521e-05,
"loss": 0.6221,
"step": 68400
},
{
"epoch": 0.15813702596351784,
"grad_norm": 1.0463271141052246,
"learning_rate": 4.209314870182411e-05,
"loss": 0.596,
"step": 68600
},
{
"epoch": 0.15859806685553976,
"grad_norm": 1.5553432703018188,
"learning_rate": 4.207009665722301e-05,
"loss": 0.6048,
"step": 68800
},
{
"epoch": 0.15905910774756166,
"grad_norm": 1.9478529691696167,
"learning_rate": 4.204704461262192e-05,
"loss": 0.6838,
"step": 69000
},
{
"epoch": 0.15952014863958358,
"grad_norm": 1.5347201824188232,
"learning_rate": 4.202399256802082e-05,
"loss": 0.6536,
"step": 69200
},
{
"epoch": 0.1599811895316055,
"grad_norm": 1.2360255718231201,
"learning_rate": 4.200094052341972e-05,
"loss": 0.662,
"step": 69400
},
{
"epoch": 0.16044223042362743,
"grad_norm": 1.09177827835083,
"learning_rate": 4.1977888478818635e-05,
"loss": 0.6767,
"step": 69600
},
{
"epoch": 0.16090327131564935,
"grad_norm": 1.0002694129943848,
"learning_rate": 4.1954836434217534e-05,
"loss": 0.6057,
"step": 69800
},
{
"epoch": 0.16136431220767125,
"grad_norm": 1.2823467254638672,
"learning_rate": 4.193178438961644e-05,
"loss": 0.6153,
"step": 70000
},
{
"epoch": 0.16182535309969318,
"grad_norm": 0.9123159646987915,
"learning_rate": 4.1908732345015346e-05,
"loss": 0.6432,
"step": 70200
},
{
"epoch": 0.1622863939917151,
"grad_norm": 2.3576698303222656,
"learning_rate": 4.188568030041425e-05,
"loss": 0.6284,
"step": 70400
},
{
"epoch": 0.16274743488373702,
"grad_norm": 0.9124912023544312,
"learning_rate": 4.186262825581315e-05,
"loss": 0.6879,
"step": 70600
},
{
"epoch": 0.16320847577575892,
"grad_norm": 1.3194003105163574,
"learning_rate": 4.1839576211212056e-05,
"loss": 0.6337,
"step": 70800
},
{
"epoch": 0.16366951666778085,
"grad_norm": 1.6139734983444214,
"learning_rate": 4.181652416661096e-05,
"loss": 0.6522,
"step": 71000
},
{
"epoch": 0.16413055755980277,
"grad_norm": 9.392971992492676,
"learning_rate": 4.179347212200986e-05,
"loss": 0.6708,
"step": 71200
},
{
"epoch": 0.1645915984518247,
"grad_norm": 1.462740421295166,
"learning_rate": 4.1770420077408766e-05,
"loss": 0.598,
"step": 71400
},
{
"epoch": 0.1650526393438466,
"grad_norm": 1.7748998403549194,
"learning_rate": 4.174736803280767e-05,
"loss": 0.644,
"step": 71600
},
{
"epoch": 0.16551368023586852,
"grad_norm": 1.202195644378662,
"learning_rate": 4.172431598820658e-05,
"loss": 0.6229,
"step": 71800
},
{
"epoch": 0.16597472112789044,
"grad_norm": 1.877752423286438,
"learning_rate": 4.1701263943605476e-05,
"loss": 0.5753,
"step": 72000
},
{
"epoch": 0.16643576201991236,
"grad_norm": 3.8123841285705566,
"learning_rate": 4.167821189900438e-05,
"loss": 0.6965,
"step": 72200
},
{
"epoch": 0.16689680291193426,
"grad_norm": 2.4701273441314697,
"learning_rate": 4.165515985440329e-05,
"loss": 0.6491,
"step": 72400
},
{
"epoch": 0.16735784380395619,
"grad_norm": 1.3478227853775024,
"learning_rate": 4.163210780980219e-05,
"loss": 0.657,
"step": 72600
},
{
"epoch": 0.1678188846959781,
"grad_norm": 1.1858279705047607,
"learning_rate": 4.16090557652011e-05,
"loss": 0.6297,
"step": 72800
},
{
"epoch": 0.16827992558800003,
"grad_norm": 1.016969919204712,
"learning_rate": 4.1586003720600005e-05,
"loss": 0.5969,
"step": 73000
},
{
"epoch": 0.16874096648002196,
"grad_norm": 1.7557319402694702,
"learning_rate": 4.1562951675998904e-05,
"loss": 0.6602,
"step": 73200
},
{
"epoch": 0.16920200737204386,
"grad_norm": 1.2610116004943848,
"learning_rate": 4.153989963139781e-05,
"loss": 0.5832,
"step": 73400
},
{
"epoch": 0.16966304826406578,
"grad_norm": 1.012919545173645,
"learning_rate": 4.1516847586796715e-05,
"loss": 0.6437,
"step": 73600
},
{
"epoch": 0.1701240891560877,
"grad_norm": 3.5607211589813232,
"learning_rate": 4.149379554219562e-05,
"loss": 0.6131,
"step": 73800
},
{
"epoch": 0.17058513004810963,
"grad_norm": 1.3184549808502197,
"learning_rate": 4.147074349759452e-05,
"loss": 0.5669,
"step": 74000
},
{
"epoch": 0.17104617094013153,
"grad_norm": 2.453568458557129,
"learning_rate": 4.1447691452993425e-05,
"loss": 0.609,
"step": 74200
},
{
"epoch": 0.17150721183215345,
"grad_norm": 0.942398726940155,
"learning_rate": 4.142463940839233e-05,
"loss": 0.6451,
"step": 74400
},
{
"epoch": 0.17196825272417537,
"grad_norm": 2.131546974182129,
"learning_rate": 4.1401587363791237e-05,
"loss": 0.6167,
"step": 74600
},
{
"epoch": 0.1724292936161973,
"grad_norm": 1.0977692604064941,
"learning_rate": 4.1378535319190135e-05,
"loss": 0.6038,
"step": 74800
},
{
"epoch": 0.1728903345082192,
"grad_norm": 1.6585220098495483,
"learning_rate": 4.135548327458904e-05,
"loss": 0.6221,
"step": 75000
},
{
"epoch": 0.17335137540024112,
"grad_norm": 1.4961862564086914,
"learning_rate": 4.133243122998795e-05,
"loss": 0.6083,
"step": 75200
},
{
"epoch": 0.17381241629226304,
"grad_norm": 1.8815230131149292,
"learning_rate": 4.1309379185386846e-05,
"loss": 0.6484,
"step": 75400
},
{
"epoch": 0.17427345718428497,
"grad_norm": 1.2106714248657227,
"learning_rate": 4.128632714078575e-05,
"loss": 0.6745,
"step": 75600
},
{
"epoch": 0.17473449807630687,
"grad_norm": 15.076075553894043,
"learning_rate": 4.126327509618466e-05,
"loss": 0.5759,
"step": 75800
},
{
"epoch": 0.1751955389683288,
"grad_norm": 1.6629307270050049,
"learning_rate": 4.124022305158356e-05,
"loss": 0.6511,
"step": 76000
},
{
"epoch": 0.17565657986035071,
"grad_norm": 0.919217586517334,
"learning_rate": 4.121717100698247e-05,
"loss": 0.6124,
"step": 76200
},
{
"epoch": 0.17611762075237264,
"grad_norm": 0.9907572269439697,
"learning_rate": 4.1194118962381374e-05,
"loss": 0.6668,
"step": 76400
},
{
"epoch": 0.17657866164439456,
"grad_norm": 1.0881201028823853,
"learning_rate": 4.117106691778028e-05,
"loss": 0.6564,
"step": 76600
},
{
"epoch": 0.17703970253641646,
"grad_norm": 1.2789230346679688,
"learning_rate": 4.114801487317918e-05,
"loss": 0.6228,
"step": 76800
},
{
"epoch": 0.17750074342843838,
"grad_norm": 2.680896759033203,
"learning_rate": 4.1124962828578084e-05,
"loss": 0.6754,
"step": 77000
},
{
"epoch": 0.1779617843204603,
"grad_norm": 1.4832789897918701,
"learning_rate": 4.110191078397699e-05,
"loss": 0.6153,
"step": 77200
},
{
"epoch": 0.17842282521248223,
"grad_norm": 1.8197680711746216,
"learning_rate": 4.107885873937589e-05,
"loss": 0.6174,
"step": 77400
},
{
"epoch": 0.17888386610450413,
"grad_norm": 1.8292102813720703,
"learning_rate": 4.1055806694774795e-05,
"loss": 0.63,
"step": 77600
},
{
"epoch": 0.17934490699652605,
"grad_norm": 1.0683658123016357,
"learning_rate": 4.10327546501737e-05,
"loss": 0.6622,
"step": 77800
},
{
"epoch": 0.17980594788854798,
"grad_norm": 1.9662219285964966,
"learning_rate": 4.1009702605572606e-05,
"loss": 0.6231,
"step": 78000
},
{
"epoch": 0.1802669887805699,
"grad_norm": 1.7541677951812744,
"learning_rate": 4.0986650560971505e-05,
"loss": 0.6551,
"step": 78200
},
{
"epoch": 0.1807280296725918,
"grad_norm": 1.8776569366455078,
"learning_rate": 4.096359851637041e-05,
"loss": 0.6121,
"step": 78400
},
{
"epoch": 0.18118907056461372,
"grad_norm": 1.9241667985916138,
"learning_rate": 4.0940546471769316e-05,
"loss": 0.6205,
"step": 78600
},
{
"epoch": 0.18165011145663565,
"grad_norm": 1.7925617694854736,
"learning_rate": 4.0917494427168215e-05,
"loss": 0.6353,
"step": 78800
},
{
"epoch": 0.18211115234865757,
"grad_norm": 0.9358586072921753,
"learning_rate": 4.089444238256712e-05,
"loss": 0.6129,
"step": 79000
},
{
"epoch": 0.18257219324067947,
"grad_norm": 1.744363784790039,
"learning_rate": 4.0871390337966026e-05,
"loss": 0.5996,
"step": 79200
},
{
"epoch": 0.1830332341327014,
"grad_norm": 1.6181316375732422,
"learning_rate": 4.084833829336493e-05,
"loss": 0.6316,
"step": 79400
},
{
"epoch": 0.18349427502472332,
"grad_norm": 0.8998286128044128,
"learning_rate": 4.082528624876384e-05,
"loss": 0.6386,
"step": 79600
},
{
"epoch": 0.18395531591674524,
"grad_norm": 1.9069503545761108,
"learning_rate": 4.0802234204162743e-05,
"loss": 0.6345,
"step": 79800
},
{
"epoch": 0.18441635680876714,
"grad_norm": 1.7913002967834473,
"learning_rate": 4.077918215956165e-05,
"loss": 0.5903,
"step": 80000
},
{
"epoch": 0.18487739770078906,
"grad_norm": 2.31486177444458,
"learning_rate": 4.075613011496055e-05,
"loss": 0.6663,
"step": 80200
},
{
"epoch": 0.185338438592811,
"grad_norm": 1.4911130666732788,
"learning_rate": 4.0733078070359454e-05,
"loss": 0.6346,
"step": 80400
},
{
"epoch": 0.1857994794848329,
"grad_norm": 0.8119006752967834,
"learning_rate": 4.071002602575836e-05,
"loss": 0.5683,
"step": 80600
},
{
"epoch": 0.18626052037685484,
"grad_norm": 1.8645226955413818,
"learning_rate": 4.068697398115726e-05,
"loss": 0.5985,
"step": 80800
},
{
"epoch": 0.18672156126887673,
"grad_norm": 0.8933721780776978,
"learning_rate": 4.0663921936556164e-05,
"loss": 0.6082,
"step": 81000
},
{
"epoch": 0.18718260216089866,
"grad_norm": 0.9477849006652832,
"learning_rate": 4.064086989195507e-05,
"loss": 0.5934,
"step": 81200
},
{
"epoch": 0.18764364305292058,
"grad_norm": 2.2654476165771484,
"learning_rate": 4.0617817847353975e-05,
"loss": 0.6266,
"step": 81400
},
{
"epoch": 0.1881046839449425,
"grad_norm": 1.381350040435791,
"learning_rate": 4.0594765802752874e-05,
"loss": 0.6231,
"step": 81600
},
{
"epoch": 0.1885657248369644,
"grad_norm": 1.9982389211654663,
"learning_rate": 4.057171375815178e-05,
"loss": 0.6029,
"step": 81800
},
{
"epoch": 0.18902676572898633,
"grad_norm": 1.583160400390625,
"learning_rate": 4.0548661713550686e-05,
"loss": 0.6152,
"step": 82000
},
{
"epoch": 0.18948780662100825,
"grad_norm": 0.8362854719161987,
"learning_rate": 4.052560966894959e-05,
"loss": 0.6231,
"step": 82200
},
{
"epoch": 0.18994884751303018,
"grad_norm": 2.0223453044891357,
"learning_rate": 4.050255762434849e-05,
"loss": 0.6013,
"step": 82400
},
{
"epoch": 0.19040988840505207,
"grad_norm": 1.9948159456253052,
"learning_rate": 4.0479505579747396e-05,
"loss": 0.6374,
"step": 82600
},
{
"epoch": 0.190870929297074,
"grad_norm": 1.763412594795227,
"learning_rate": 4.04564535351463e-05,
"loss": 0.6696,
"step": 82800
},
{
"epoch": 0.19133197018909592,
"grad_norm": 1.4458279609680176,
"learning_rate": 4.043340149054521e-05,
"loss": 0.6253,
"step": 83000
},
{
"epoch": 0.19179301108111785,
"grad_norm": 1.9040172100067139,
"learning_rate": 4.041034944594411e-05,
"loss": 0.6292,
"step": 83200
},
{
"epoch": 0.19225405197313974,
"grad_norm": 0.5876076817512512,
"learning_rate": 4.038729740134302e-05,
"loss": 0.5721,
"step": 83400
},
{
"epoch": 0.19271509286516167,
"grad_norm": 1.4014763832092285,
"learning_rate": 4.036424535674192e-05,
"loss": 0.6496,
"step": 83600
},
{
"epoch": 0.1931761337571836,
"grad_norm": 1.3236879110336304,
"learning_rate": 4.034119331214082e-05,
"loss": 0.6824,
"step": 83800
},
{
"epoch": 0.19363717464920552,
"grad_norm": 1.3417832851409912,
"learning_rate": 4.031814126753973e-05,
"loss": 0.6155,
"step": 84000
},
{
"epoch": 0.19409821554122744,
"grad_norm": 1.254905104637146,
"learning_rate": 4.0295089222938634e-05,
"loss": 0.6194,
"step": 84200
},
{
"epoch": 0.19455925643324934,
"grad_norm": 1.0880146026611328,
"learning_rate": 4.027203717833753e-05,
"loss": 0.566,
"step": 84400
},
{
"epoch": 0.19502029732527126,
"grad_norm": 0.5658175945281982,
"learning_rate": 4.024898513373644e-05,
"loss": 0.6118,
"step": 84600
},
{
"epoch": 0.1954813382172932,
"grad_norm": 1.4203405380249023,
"learning_rate": 4.0225933089135345e-05,
"loss": 0.6458,
"step": 84800
},
{
"epoch": 0.1959423791093151,
"grad_norm": 1.4831221103668213,
"learning_rate": 4.0202881044534244e-05,
"loss": 0.6129,
"step": 85000
},
{
"epoch": 0.196403420001337,
"grad_norm": 2.332782506942749,
"learning_rate": 4.017982899993315e-05,
"loss": 0.6036,
"step": 85200
},
{
"epoch": 0.19686446089335893,
"grad_norm": 1.699129343032837,
"learning_rate": 4.0156776955332055e-05,
"loss": 0.6667,
"step": 85400
},
{
"epoch": 0.19732550178538086,
"grad_norm": 2.4848811626434326,
"learning_rate": 4.013372491073096e-05,
"loss": 0.6281,
"step": 85600
},
{
"epoch": 0.19778654267740278,
"grad_norm": 1.896471381187439,
"learning_rate": 4.011067286612986e-05,
"loss": 0.6028,
"step": 85800
},
{
"epoch": 0.19824758356942468,
"grad_norm": 1.61887526512146,
"learning_rate": 4.0087620821528765e-05,
"loss": 0.6086,
"step": 86000
},
{
"epoch": 0.1987086244614466,
"grad_norm": 1.0907816886901855,
"learning_rate": 4.006456877692767e-05,
"loss": 0.6499,
"step": 86200
},
{
"epoch": 0.19916966535346853,
"grad_norm": 1.1306065320968628,
"learning_rate": 4.0041516732326576e-05,
"loss": 0.6152,
"step": 86400
},
{
"epoch": 0.19963070624549045,
"grad_norm": 4.158120155334473,
"learning_rate": 4.001846468772548e-05,
"loss": 0.6039,
"step": 86600
},
{
"epoch": 0.20009174713751235,
"grad_norm": 1.0758455991744995,
"learning_rate": 3.999541264312439e-05,
"loss": 0.5966,
"step": 86800
},
{
"epoch": 0.20055278802953427,
"grad_norm": 1.0376372337341309,
"learning_rate": 3.997236059852329e-05,
"loss": 0.589,
"step": 87000
},
{
"epoch": 0.2010138289215562,
"grad_norm": 1.2652366161346436,
"learning_rate": 3.994930855392219e-05,
"loss": 0.6588,
"step": 87200
},
{
"epoch": 0.20147486981357812,
"grad_norm": 1.8211579322814941,
"learning_rate": 3.99262565093211e-05,
"loss": 0.6191,
"step": 87400
},
{
"epoch": 0.20193591070560005,
"grad_norm": 4.478600025177002,
"learning_rate": 3.9903204464720004e-05,
"loss": 0.5878,
"step": 87600
},
{
"epoch": 0.20239695159762194,
"grad_norm": 1.4553157091140747,
"learning_rate": 3.98801524201189e-05,
"loss": 0.6204,
"step": 87800
},
{
"epoch": 0.20285799248964387,
"grad_norm": 1.3515084981918335,
"learning_rate": 3.985710037551781e-05,
"loss": 0.5467,
"step": 88000
},
{
"epoch": 0.2033190333816658,
"grad_norm": 1.0609192848205566,
"learning_rate": 3.9834048330916714e-05,
"loss": 0.5393,
"step": 88200
},
{
"epoch": 0.20378007427368772,
"grad_norm": 2.3497846126556396,
"learning_rate": 3.981099628631561e-05,
"loss": 0.6261,
"step": 88400
},
{
"epoch": 0.2042411151657096,
"grad_norm": 1.129948616027832,
"learning_rate": 3.978794424171452e-05,
"loss": 0.6367,
"step": 88600
},
{
"epoch": 0.20470215605773154,
"grad_norm": 1.0302705764770508,
"learning_rate": 3.9764892197113424e-05,
"loss": 0.59,
"step": 88800
},
{
"epoch": 0.20516319694975346,
"grad_norm": 1.1066232919692993,
"learning_rate": 3.974184015251233e-05,
"loss": 0.6325,
"step": 89000
},
{
"epoch": 0.2056242378417754,
"grad_norm": 2.078610897064209,
"learning_rate": 3.971878810791123e-05,
"loss": 0.6465,
"step": 89200
},
{
"epoch": 0.20608527873379728,
"grad_norm": 1.8704718351364136,
"learning_rate": 3.9695736063310134e-05,
"loss": 0.6202,
"step": 89400
},
{
"epoch": 0.2065463196258192,
"grad_norm": 0.496405690908432,
"learning_rate": 3.967268401870904e-05,
"loss": 0.6073,
"step": 89600
},
{
"epoch": 0.20700736051784113,
"grad_norm": 1.9287617206573486,
"learning_rate": 3.9649631974107946e-05,
"loss": 0.5779,
"step": 89800
},
{
"epoch": 0.20746840140986306,
"grad_norm": 1.867727279663086,
"learning_rate": 3.962657992950685e-05,
"loss": 0.5736,
"step": 90000
},
{
"epoch": 0.20792944230188495,
"grad_norm": 0.9726611971855164,
"learning_rate": 3.960352788490576e-05,
"loss": 0.6051,
"step": 90200
},
{
"epoch": 0.20839048319390688,
"grad_norm": 1.8991550207138062,
"learning_rate": 3.9580475840304656e-05,
"loss": 0.6306,
"step": 90400
},
{
"epoch": 0.2088515240859288,
"grad_norm": 1.3989739418029785,
"learning_rate": 3.955742379570356e-05,
"loss": 0.6165,
"step": 90600
},
{
"epoch": 0.20931256497795073,
"grad_norm": 2.0542263984680176,
"learning_rate": 3.953437175110247e-05,
"loss": 0.606,
"step": 90800
},
{
"epoch": 0.20977360586997262,
"grad_norm": 1.3546398878097534,
"learning_rate": 3.951131970650137e-05,
"loss": 0.6513,
"step": 91000
},
{
"epoch": 0.21023464676199455,
"grad_norm": 2.3966128826141357,
"learning_rate": 3.948826766190027e-05,
"loss": 0.6147,
"step": 91200
},
{
"epoch": 0.21069568765401647,
"grad_norm": 1.8540971279144287,
"learning_rate": 3.946521561729918e-05,
"loss": 0.6128,
"step": 91400
},
{
"epoch": 0.2111567285460384,
"grad_norm": 0.6874774694442749,
"learning_rate": 3.944216357269808e-05,
"loss": 0.603,
"step": 91600
},
{
"epoch": 0.21161776943806032,
"grad_norm": 3.1788859367370605,
"learning_rate": 3.941911152809699e-05,
"loss": 0.6173,
"step": 91800
},
{
"epoch": 0.21207881033008222,
"grad_norm": 1.5572599172592163,
"learning_rate": 3.939605948349589e-05,
"loss": 0.603,
"step": 92000
},
{
"epoch": 0.21253985122210414,
"grad_norm": 1.5014060735702515,
"learning_rate": 3.9373007438894794e-05,
"loss": 0.5746,
"step": 92200
},
{
"epoch": 0.21300089211412607,
"grad_norm": 2.458667516708374,
"learning_rate": 3.93499553942937e-05,
"loss": 0.6277,
"step": 92400
},
{
"epoch": 0.213461933006148,
"grad_norm": 2.5523571968078613,
"learning_rate": 3.93269033496926e-05,
"loss": 0.5994,
"step": 92600
},
{
"epoch": 0.2139229738981699,
"grad_norm": 1.136783480644226,
"learning_rate": 3.9303851305091504e-05,
"loss": 0.6284,
"step": 92800
},
{
"epoch": 0.2143840147901918,
"grad_norm": 1.2271496057510376,
"learning_rate": 3.928079926049041e-05,
"loss": 0.5876,
"step": 93000
},
{
"epoch": 0.21484505568221374,
"grad_norm": 0.6214015483856201,
"learning_rate": 3.9257747215889315e-05,
"loss": 0.6294,
"step": 93200
},
{
"epoch": 0.21530609657423566,
"grad_norm": 1.4034799337387085,
"learning_rate": 3.923469517128822e-05,
"loss": 0.6242,
"step": 93400
},
{
"epoch": 0.21576713746625756,
"grad_norm": 1.160979151725769,
"learning_rate": 3.9211643126687127e-05,
"loss": 0.6082,
"step": 93600
},
{
"epoch": 0.21622817835827948,
"grad_norm": 1.3025540113449097,
"learning_rate": 3.918859108208603e-05,
"loss": 0.5844,
"step": 93800
},
{
"epoch": 0.2166892192503014,
"grad_norm": 2.7265303134918213,
"learning_rate": 3.916553903748493e-05,
"loss": 0.5367,
"step": 94000
},
{
"epoch": 0.21715026014232333,
"grad_norm": 2.3376145362854004,
"learning_rate": 3.914248699288384e-05,
"loss": 0.605,
"step": 94200
},
{
"epoch": 0.21761130103434523,
"grad_norm": 0.6863404512405396,
"learning_rate": 3.911943494828274e-05,
"loss": 0.6038,
"step": 94400
},
{
"epoch": 0.21807234192636715,
"grad_norm": 2.042480230331421,
"learning_rate": 3.909638290368164e-05,
"loss": 0.5875,
"step": 94600
},
{
"epoch": 0.21853338281838908,
"grad_norm": 1.5179613828659058,
"learning_rate": 3.907333085908055e-05,
"loss": 0.6374,
"step": 94800
},
{
"epoch": 0.218994423710411,
"grad_norm": 1.8562968969345093,
"learning_rate": 3.905027881447945e-05,
"loss": 0.6243,
"step": 95000
},
{
"epoch": 0.21945546460243293,
"grad_norm": 1.0300766229629517,
"learning_rate": 3.902722676987836e-05,
"loss": 0.6338,
"step": 95200
},
{
"epoch": 0.21991650549445482,
"grad_norm": 3.0744545459747314,
"learning_rate": 3.900417472527726e-05,
"loss": 0.6158,
"step": 95400
},
{
"epoch": 0.22037754638647675,
"grad_norm": 3.355592727661133,
"learning_rate": 3.898112268067616e-05,
"loss": 0.628,
"step": 95600
},
{
"epoch": 0.22083858727849867,
"grad_norm": 1.0590027570724487,
"learning_rate": 3.895807063607507e-05,
"loss": 0.6363,
"step": 95800
},
{
"epoch": 0.2212996281705206,
"grad_norm": 1.37596595287323,
"learning_rate": 3.893501859147397e-05,
"loss": 0.6107,
"step": 96000
},
{
"epoch": 0.2217606690625425,
"grad_norm": 1.392102599143982,
"learning_rate": 3.891196654687287e-05,
"loss": 0.6182,
"step": 96200
},
{
"epoch": 0.22222170995456442,
"grad_norm": 1.0778827667236328,
"learning_rate": 3.888891450227178e-05,
"loss": 0.6225,
"step": 96400
},
{
"epoch": 0.22268275084658634,
"grad_norm": 0.8405503034591675,
"learning_rate": 3.8865862457670685e-05,
"loss": 0.5607,
"step": 96600
},
{
"epoch": 0.22314379173860827,
"grad_norm": 1.857490062713623,
"learning_rate": 3.884281041306959e-05,
"loss": 0.5927,
"step": 96800
},
{
"epoch": 0.22360483263063016,
"grad_norm": 1.9052844047546387,
"learning_rate": 3.8819758368468496e-05,
"loss": 0.631,
"step": 97000
},
{
"epoch": 0.2240658735226521,
"grad_norm": 0.8537679314613342,
"learning_rate": 3.87967063238674e-05,
"loss": 0.6252,
"step": 97200
},
{
"epoch": 0.224526914414674,
"grad_norm": 1.5780411958694458,
"learning_rate": 3.87736542792663e-05,
"loss": 0.6445,
"step": 97400
},
{
"epoch": 0.22498795530669594,
"grad_norm": 1.55938720703125,
"learning_rate": 3.8750602234665206e-05,
"loss": 0.592,
"step": 97600
},
{
"epoch": 0.22544899619871783,
"grad_norm": 2.4053783416748047,
"learning_rate": 3.872755019006411e-05,
"loss": 0.5912,
"step": 97800
},
{
"epoch": 0.22591003709073976,
"grad_norm": 1.1745800971984863,
"learning_rate": 3.870449814546301e-05,
"loss": 0.6163,
"step": 98000
},
{
"epoch": 0.22637107798276168,
"grad_norm": 1.0355582237243652,
"learning_rate": 3.8681446100861916e-05,
"loss": 0.6557,
"step": 98200
},
{
"epoch": 0.2268321188747836,
"grad_norm": 1.5494755506515503,
"learning_rate": 3.865839405626082e-05,
"loss": 0.5803,
"step": 98400
},
{
"epoch": 0.22729315976680553,
"grad_norm": 0.9093578457832336,
"learning_rate": 3.863534201165973e-05,
"loss": 0.5485,
"step": 98600
},
{
"epoch": 0.22775420065882743,
"grad_norm": 3.997178077697754,
"learning_rate": 3.861228996705863e-05,
"loss": 0.608,
"step": 98800
},
{
"epoch": 0.22821524155084935,
"grad_norm": 0.7264981269836426,
"learning_rate": 3.858923792245753e-05,
"loss": 0.5569,
"step": 99000
},
{
"epoch": 0.22867628244287128,
"grad_norm": 1.214425802230835,
"learning_rate": 3.856618587785644e-05,
"loss": 0.5799,
"step": 99200
},
{
"epoch": 0.2291373233348932,
"grad_norm": 1.1324894428253174,
"learning_rate": 3.8543133833255344e-05,
"loss": 0.5854,
"step": 99400
},
{
"epoch": 0.2295983642269151,
"grad_norm": 1.1045070886611938,
"learning_rate": 3.852008178865424e-05,
"loss": 0.6338,
"step": 99600
},
{
"epoch": 0.23005940511893702,
"grad_norm": 1.4003263711929321,
"learning_rate": 3.849702974405315e-05,
"loss": 0.6131,
"step": 99800
},
{
"epoch": 0.23052044601095895,
"grad_norm": 1.9223850965499878,
"learning_rate": 3.8473977699452054e-05,
"loss": 0.583,
"step": 100000
},
{
"epoch": 0.23052044601095895,
"eval_loss": 0.5901287198066711,
"eval_runtime": 144.11,
"eval_samples_per_second": 30.407,
"eval_steps_per_second": 30.407,
"step": 100000
},
{
"epoch": 0.23098148690298087,
"grad_norm": 3.727125883102417,
"learning_rate": 3.845092565485096e-05,
"loss": 0.5922,
"step": 100200
},
{
"epoch": 0.23144252779500277,
"grad_norm": 2.583871364593506,
"learning_rate": 3.8427873610249865e-05,
"loss": 0.5656,
"step": 100400
},
{
"epoch": 0.2319035686870247,
"grad_norm": 1.4674535989761353,
"learning_rate": 3.840482156564877e-05,
"loss": 0.6487,
"step": 100600
},
{
"epoch": 0.23236460957904662,
"grad_norm": 1.2001768350601196,
"learning_rate": 3.838176952104767e-05,
"loss": 0.5979,
"step": 100800
},
{
"epoch": 0.23282565047106854,
"grad_norm": 1.036700963973999,
"learning_rate": 3.8358717476446576e-05,
"loss": 0.5853,
"step": 101000
},
{
"epoch": 0.23328669136309044,
"grad_norm": 1.6959054470062256,
"learning_rate": 3.833566543184548e-05,
"loss": 0.6108,
"step": 101200
},
{
"epoch": 0.23374773225511236,
"grad_norm": 1.153205156326294,
"learning_rate": 3.831261338724439e-05,
"loss": 0.5994,
"step": 101400
},
{
"epoch": 0.23420877314713429,
"grad_norm": 1.5132783651351929,
"learning_rate": 3.8289561342643286e-05,
"loss": 0.5739,
"step": 101600
},
{
"epoch": 0.2346698140391562,
"grad_norm": 1.745678424835205,
"learning_rate": 3.826650929804219e-05,
"loss": 0.6051,
"step": 101800
},
{
"epoch": 0.2351308549311781,
"grad_norm": 1.6017553806304932,
"learning_rate": 3.82434572534411e-05,
"loss": 0.6234,
"step": 102000
},
{
"epoch": 0.23559189582320003,
"grad_norm": 1.4784915447235107,
"learning_rate": 3.8220405208839996e-05,
"loss": 0.61,
"step": 102200
},
{
"epoch": 0.23605293671522196,
"grad_norm": 1.5724163055419922,
"learning_rate": 3.81973531642389e-05,
"loss": 0.5704,
"step": 102400
},
{
"epoch": 0.23651397760724388,
"grad_norm": 1.936811923980713,
"learning_rate": 3.817430111963781e-05,
"loss": 0.6272,
"step": 102600
},
{
"epoch": 0.2369750184992658,
"grad_norm": 0.96824711561203,
"learning_rate": 3.815124907503671e-05,
"loss": 0.6139,
"step": 102800
},
{
"epoch": 0.2374360593912877,
"grad_norm": 1.1771214008331299,
"learning_rate": 3.812819703043561e-05,
"loss": 0.5996,
"step": 103000
},
{
"epoch": 0.23789710028330963,
"grad_norm": 1.3290009498596191,
"learning_rate": 3.810514498583452e-05,
"loss": 0.5637,
"step": 103200
},
{
"epoch": 0.23835814117533155,
"grad_norm": 1.389938473701477,
"learning_rate": 3.808209294123343e-05,
"loss": 0.5753,
"step": 103400
},
{
"epoch": 0.23881918206735347,
"grad_norm": 1.5995765924453735,
"learning_rate": 3.805904089663233e-05,
"loss": 0.5625,
"step": 103600
},
{
"epoch": 0.23928022295937537,
"grad_norm": 1.64626145362854,
"learning_rate": 3.8035988852031235e-05,
"loss": 0.6059,
"step": 103800
},
{
"epoch": 0.2397412638513973,
"grad_norm": 1.7561503648757935,
"learning_rate": 3.801293680743014e-05,
"loss": 0.5819,
"step": 104000
},
{
"epoch": 0.24020230474341922,
"grad_norm": 1.4345256090164185,
"learning_rate": 3.798988476282904e-05,
"loss": 0.6131,
"step": 104200
},
{
"epoch": 0.24066334563544114,
"grad_norm": 1.1421653032302856,
"learning_rate": 3.7966832718227945e-05,
"loss": 0.5468,
"step": 104400
},
{
"epoch": 0.24112438652746304,
"grad_norm": 1.356677532196045,
"learning_rate": 3.794378067362685e-05,
"loss": 0.5659,
"step": 104600
},
{
"epoch": 0.24158542741948497,
"grad_norm": 1.065327763557434,
"learning_rate": 3.7920728629025756e-05,
"loss": 0.5518,
"step": 104800
},
{
"epoch": 0.2420464683115069,
"grad_norm": 2.1725375652313232,
"learning_rate": 3.7897676584424655e-05,
"loss": 0.6386,
"step": 105000
},
{
"epoch": 0.24250750920352881,
"grad_norm": 1.0061650276184082,
"learning_rate": 3.787462453982356e-05,
"loss": 0.5936,
"step": 105200
},
{
"epoch": 0.2429685500955507,
"grad_norm": 1.8890901803970337,
"learning_rate": 3.7851572495222467e-05,
"loss": 0.5985,
"step": 105400
},
{
"epoch": 0.24342959098757264,
"grad_norm": 0.9927252531051636,
"learning_rate": 3.7828520450621365e-05,
"loss": 0.6082,
"step": 105600
},
{
"epoch": 0.24389063187959456,
"grad_norm": 1.791656494140625,
"learning_rate": 3.780546840602027e-05,
"loss": 0.5913,
"step": 105800
},
{
"epoch": 0.24435167277161648,
"grad_norm": 0.8485866785049438,
"learning_rate": 3.778241636141918e-05,
"loss": 0.5868,
"step": 106000
},
{
"epoch": 0.2448127136636384,
"grad_norm": 2.2644290924072266,
"learning_rate": 3.775936431681808e-05,
"loss": 0.5296,
"step": 106200
},
{
"epoch": 0.2452737545556603,
"grad_norm": 1.4203904867172241,
"learning_rate": 3.773631227221698e-05,
"loss": 0.542,
"step": 106400
},
{
"epoch": 0.24573479544768223,
"grad_norm": 1.9407037496566772,
"learning_rate": 3.771326022761589e-05,
"loss": 0.5666,
"step": 106600
},
{
"epoch": 0.24619583633970415,
"grad_norm": 0.9351466298103333,
"learning_rate": 3.76902081830148e-05,
"loss": 0.6103,
"step": 106800
},
{
"epoch": 0.24665687723172608,
"grad_norm": 0.9978102445602417,
"learning_rate": 3.76671561384137e-05,
"loss": 0.5896,
"step": 107000
},
{
"epoch": 0.24711791812374798,
"grad_norm": 1.419097900390625,
"learning_rate": 3.7644104093812604e-05,
"loss": 0.5511,
"step": 107200
},
{
"epoch": 0.2475789590157699,
"grad_norm": 0.8121142387390137,
"learning_rate": 3.762105204921151e-05,
"loss": 0.567,
"step": 107400
},
{
"epoch": 0.24803999990779182,
"grad_norm": 1.2004528045654297,
"learning_rate": 3.759800000461041e-05,
"loss": 0.5494,
"step": 107600
},
{
"epoch": 0.24850104079981375,
"grad_norm": 1.426767349243164,
"learning_rate": 3.7574947960009314e-05,
"loss": 0.5833,
"step": 107800
},
{
"epoch": 0.24896208169183565,
"grad_norm": 2.5049235820770264,
"learning_rate": 3.755189591540822e-05,
"loss": 0.6164,
"step": 108000
},
{
"epoch": 0.24942312258385757,
"grad_norm": 2.0731942653656006,
"learning_rate": 3.7528843870807126e-05,
"loss": 0.5709,
"step": 108200
},
{
"epoch": 0.2498841634758795,
"grad_norm": 1.43949556350708,
"learning_rate": 3.7505791826206025e-05,
"loss": 0.6137,
"step": 108400
},
{
"epoch": 0.2503452043679014,
"grad_norm": 1.452414870262146,
"learning_rate": 3.748273978160493e-05,
"loss": 0.5779,
"step": 108600
},
{
"epoch": 0.25080624525992334,
"grad_norm": 2.6152195930480957,
"learning_rate": 3.7459687737003836e-05,
"loss": 0.5681,
"step": 108800
},
{
"epoch": 0.25126728615194527,
"grad_norm": 1.348482370376587,
"learning_rate": 3.743663569240274e-05,
"loss": 0.6671,
"step": 109000
},
{
"epoch": 0.25172832704396714,
"grad_norm": 0.8128360509872437,
"learning_rate": 3.741358364780164e-05,
"loss": 0.5648,
"step": 109200
},
{
"epoch": 0.25218936793598906,
"grad_norm": 0.83039790391922,
"learning_rate": 3.7390531603200546e-05,
"loss": 0.6204,
"step": 109400
},
{
"epoch": 0.252650408828011,
"grad_norm": 1.9912052154541016,
"learning_rate": 3.736747955859945e-05,
"loss": 0.5364,
"step": 109600
},
{
"epoch": 0.2531114497200329,
"grad_norm": 1.4351979494094849,
"learning_rate": 3.734442751399835e-05,
"loss": 0.6486,
"step": 109800
},
{
"epoch": 0.25357249061205483,
"grad_norm": 1.6197021007537842,
"learning_rate": 3.7321375469397256e-05,
"loss": 0.5651,
"step": 110000
},
{
"epoch": 0.25403353150407676,
"grad_norm": 2.011810541152954,
"learning_rate": 3.729832342479617e-05,
"loss": 0.6064,
"step": 110200
},
{
"epoch": 0.2544945723960987,
"grad_norm": 1.3699722290039062,
"learning_rate": 3.727527138019507e-05,
"loss": 0.536,
"step": 110400
},
{
"epoch": 0.2549556132881206,
"grad_norm": 2.089066743850708,
"learning_rate": 3.7252219335593973e-05,
"loss": 0.6077,
"step": 110600
},
{
"epoch": 0.25541665418014253,
"grad_norm": 0.4626462459564209,
"learning_rate": 3.722916729099288e-05,
"loss": 0.5726,
"step": 110800
},
{
"epoch": 0.2558776950721644,
"grad_norm": 1.4077805280685425,
"learning_rate": 3.7206115246391785e-05,
"loss": 0.6218,
"step": 111000
},
{
"epoch": 0.2563387359641863,
"grad_norm": 2.0903522968292236,
"learning_rate": 3.7183063201790684e-05,
"loss": 0.5829,
"step": 111200
},
{
"epoch": 0.25679977685620825,
"grad_norm": 1.4433337450027466,
"learning_rate": 3.716001115718959e-05,
"loss": 0.6412,
"step": 111400
},
{
"epoch": 0.2572608177482302,
"grad_norm": 2.1463751792907715,
"learning_rate": 3.7136959112588495e-05,
"loss": 0.6183,
"step": 111600
},
{
"epoch": 0.2577218586402521,
"grad_norm": 0.8230465054512024,
"learning_rate": 3.7113907067987394e-05,
"loss": 0.5919,
"step": 111800
},
{
"epoch": 0.258182899532274,
"grad_norm": 1.8142331838607788,
"learning_rate": 3.70908550233863e-05,
"loss": 0.5895,
"step": 112000
},
{
"epoch": 0.25864394042429595,
"grad_norm": 1.7713125944137573,
"learning_rate": 3.7067802978785205e-05,
"loss": 0.591,
"step": 112200
},
{
"epoch": 0.2591049813163179,
"grad_norm": 1.0239676237106323,
"learning_rate": 3.704475093418411e-05,
"loss": 0.6209,
"step": 112400
},
{
"epoch": 0.25956602220833974,
"grad_norm": 2.02620267868042,
"learning_rate": 3.702169888958301e-05,
"loss": 0.5581,
"step": 112600
},
{
"epoch": 0.26002706310036167,
"grad_norm": 1.8414267301559448,
"learning_rate": 3.6998646844981916e-05,
"loss": 0.6137,
"step": 112800
},
{
"epoch": 0.2604881039923836,
"grad_norm": 1.4095929861068726,
"learning_rate": 3.697559480038082e-05,
"loss": 0.6136,
"step": 113000
},
{
"epoch": 0.2609491448844055,
"grad_norm": 1.6548664569854736,
"learning_rate": 3.695254275577972e-05,
"loss": 0.5464,
"step": 113200
},
{
"epoch": 0.26141018577642744,
"grad_norm": 1.0387002229690552,
"learning_rate": 3.6929490711178626e-05,
"loss": 0.6102,
"step": 113400
},
{
"epoch": 0.26187122666844936,
"grad_norm": 0.6978960633277893,
"learning_rate": 3.690643866657754e-05,
"loss": 0.5755,
"step": 113600
},
{
"epoch": 0.2623322675604713,
"grad_norm": 1.7503503561019897,
"learning_rate": 3.688338662197644e-05,
"loss": 0.5449,
"step": 113800
},
{
"epoch": 0.2627933084524932,
"grad_norm": 0.6255602836608887,
"learning_rate": 3.686033457737534e-05,
"loss": 0.5577,
"step": 114000
},
{
"epoch": 0.26325434934451514,
"grad_norm": 1.001632571220398,
"learning_rate": 3.683728253277425e-05,
"loss": 0.6007,
"step": 114200
},
{
"epoch": 0.263715390236537,
"grad_norm": 1.6783490180969238,
"learning_rate": 3.6814230488173154e-05,
"loss": 0.5887,
"step": 114400
},
{
"epoch": 0.26417643112855893,
"grad_norm": 0.6255197525024414,
"learning_rate": 3.679117844357205e-05,
"loss": 0.5561,
"step": 114600
},
{
"epoch": 0.26463747202058086,
"grad_norm": 2.288745880126953,
"learning_rate": 3.676812639897096e-05,
"loss": 0.5486,
"step": 114800
},
{
"epoch": 0.2650985129126028,
"grad_norm": 1.1330058574676514,
"learning_rate": 3.6745074354369864e-05,
"loss": 0.5508,
"step": 115000
},
{
"epoch": 0.2655595538046247,
"grad_norm": 1.2491919994354248,
"learning_rate": 3.672202230976876e-05,
"loss": 0.6188,
"step": 115200
},
{
"epoch": 0.26602059469664663,
"grad_norm": 1.020461916923523,
"learning_rate": 3.669897026516767e-05,
"loss": 0.6308,
"step": 115400
},
{
"epoch": 0.26648163558866855,
"grad_norm": 1.3160836696624756,
"learning_rate": 3.6675918220566575e-05,
"loss": 0.6101,
"step": 115600
},
{
"epoch": 0.2669426764806905,
"grad_norm": 1.1758986711502075,
"learning_rate": 3.665286617596548e-05,
"loss": 0.5964,
"step": 115800
},
{
"epoch": 0.26740371737271235,
"grad_norm": 0.9118921756744385,
"learning_rate": 3.662981413136438e-05,
"loss": 0.5713,
"step": 116000
},
{
"epoch": 0.26786475826473427,
"grad_norm": 1.9953539371490479,
"learning_rate": 3.6606762086763285e-05,
"loss": 0.5761,
"step": 116200
},
{
"epoch": 0.2683257991567562,
"grad_norm": 1.5514432191848755,
"learning_rate": 3.658371004216219e-05,
"loss": 0.5848,
"step": 116400
},
{
"epoch": 0.2687868400487781,
"grad_norm": 0.9288082122802734,
"learning_rate": 3.6560657997561096e-05,
"loss": 0.5664,
"step": 116600
},
{
"epoch": 0.26924788094080004,
"grad_norm": 1.547339677810669,
"learning_rate": 3.6537605952959995e-05,
"loss": 0.5863,
"step": 116800
},
{
"epoch": 0.26970892183282197,
"grad_norm": 1.671633005142212,
"learning_rate": 3.651455390835891e-05,
"loss": 0.5271,
"step": 117000
},
{
"epoch": 0.2701699627248439,
"grad_norm": 0.9012247920036316,
"learning_rate": 3.6491501863757807e-05,
"loss": 0.5724,
"step": 117200
},
{
"epoch": 0.2706310036168658,
"grad_norm": 2.2852792739868164,
"learning_rate": 3.646844981915671e-05,
"loss": 0.5644,
"step": 117400
},
{
"epoch": 0.27109204450888774,
"grad_norm": 1.312666893005371,
"learning_rate": 3.644539777455562e-05,
"loss": 0.612,
"step": 117600
},
{
"epoch": 0.2715530854009096,
"grad_norm": 0.9513750672340393,
"learning_rate": 3.6422345729954524e-05,
"loss": 0.5791,
"step": 117800
},
{
"epoch": 0.27201412629293154,
"grad_norm": 1.9773327112197876,
"learning_rate": 3.639929368535342e-05,
"loss": 0.5628,
"step": 118000
},
{
"epoch": 0.27247516718495346,
"grad_norm": 1.666195273399353,
"learning_rate": 3.637624164075233e-05,
"loss": 0.5722,
"step": 118200
},
{
"epoch": 0.2729362080769754,
"grad_norm": 1.6101315021514893,
"learning_rate": 3.6353189596151234e-05,
"loss": 0.6474,
"step": 118400
},
{
"epoch": 0.2733972489689973,
"grad_norm": 0.8097496628761292,
"learning_rate": 3.633013755155014e-05,
"loss": 0.5353,
"step": 118600
},
{
"epoch": 0.27385828986101923,
"grad_norm": 1.7693250179290771,
"learning_rate": 3.630708550694904e-05,
"loss": 0.6161,
"step": 118800
},
{
"epoch": 0.27431933075304116,
"grad_norm": 1.4188885688781738,
"learning_rate": 3.6284033462347944e-05,
"loss": 0.6031,
"step": 119000
},
{
"epoch": 0.2747803716450631,
"grad_norm": 0.7906126379966736,
"learning_rate": 3.626098141774685e-05,
"loss": 0.5421,
"step": 119200
},
{
"epoch": 0.27524141253708495,
"grad_norm": 1.3080761432647705,
"learning_rate": 3.623792937314575e-05,
"loss": 0.565,
"step": 119400
},
{
"epoch": 0.2757024534291069,
"grad_norm": 1.3079235553741455,
"learning_rate": 3.6214877328544654e-05,
"loss": 0.5828,
"step": 119600
},
{
"epoch": 0.2761634943211288,
"grad_norm": 1.9901784658432007,
"learning_rate": 3.619182528394356e-05,
"loss": 0.5621,
"step": 119800
},
{
"epoch": 0.2766245352131507,
"grad_norm": 0.5003865957260132,
"learning_rate": 3.6168773239342466e-05,
"loss": 0.5374,
"step": 120000
},
{
"epoch": 0.27708557610517265,
"grad_norm": 1.5458438396453857,
"learning_rate": 3.614572119474137e-05,
"loss": 0.5449,
"step": 120200
},
{
"epoch": 0.2775466169971946,
"grad_norm": 1.4383118152618408,
"learning_rate": 3.612266915014028e-05,
"loss": 0.6142,
"step": 120400
},
{
"epoch": 0.2780076578892165,
"grad_norm": 1.1855522394180298,
"learning_rate": 3.609961710553918e-05,
"loss": 0.564,
"step": 120600
},
{
"epoch": 0.2784686987812384,
"grad_norm": 0.840207040309906,
"learning_rate": 3.607656506093808e-05,
"loss": 0.5621,
"step": 120800
},
{
"epoch": 0.27892973967326035,
"grad_norm": 1.0996273756027222,
"learning_rate": 3.605351301633699e-05,
"loss": 0.5671,
"step": 121000
},
{
"epoch": 0.2793907805652822,
"grad_norm": 1.7531362771987915,
"learning_rate": 3.603046097173589e-05,
"loss": 0.6016,
"step": 121200
},
{
"epoch": 0.27985182145730414,
"grad_norm": 0.8433918952941895,
"learning_rate": 3.600740892713479e-05,
"loss": 0.632,
"step": 121400
},
{
"epoch": 0.28031286234932606,
"grad_norm": 0.8943939208984375,
"learning_rate": 3.59843568825337e-05,
"loss": 0.4969,
"step": 121600
},
{
"epoch": 0.280773903241348,
"grad_norm": 0.8883448839187622,
"learning_rate": 3.59613048379326e-05,
"loss": 0.5624,
"step": 121800
},
{
"epoch": 0.2812349441333699,
"grad_norm": 1.5441436767578125,
"learning_rate": 3.593825279333151e-05,
"loss": 0.5934,
"step": 122000
},
{
"epoch": 0.28169598502539184,
"grad_norm": 1.6779813766479492,
"learning_rate": 3.591520074873041e-05,
"loss": 0.5975,
"step": 122200
},
{
"epoch": 0.28215702591741376,
"grad_norm": 1.3484402894973755,
"learning_rate": 3.5892148704129313e-05,
"loss": 0.6151,
"step": 122400
},
{
"epoch": 0.2826180668094357,
"grad_norm": 0.881047785282135,
"learning_rate": 3.586909665952822e-05,
"loss": 0.5377,
"step": 122600
},
{
"epoch": 0.28307910770145756,
"grad_norm": 2.1730856895446777,
"learning_rate": 3.584604461492712e-05,
"loss": 0.5002,
"step": 122800
},
{
"epoch": 0.2835401485934795,
"grad_norm": 1.7546623945236206,
"learning_rate": 3.5822992570326024e-05,
"loss": 0.5439,
"step": 123000
},
{
"epoch": 0.2840011894855014,
"grad_norm": 1.6560966968536377,
"learning_rate": 3.579994052572493e-05,
"loss": 0.5278,
"step": 123200
},
{
"epoch": 0.28446223037752333,
"grad_norm": 1.4443609714508057,
"learning_rate": 3.5776888481123835e-05,
"loss": 0.631,
"step": 123400
},
{
"epoch": 0.28492327126954525,
"grad_norm": 1.6837761402130127,
"learning_rate": 3.575383643652274e-05,
"loss": 0.5833,
"step": 123600
},
{
"epoch": 0.2853843121615672,
"grad_norm": 1.0554946660995483,
"learning_rate": 3.5730784391921646e-05,
"loss": 0.5635,
"step": 123800
},
{
"epoch": 0.2858453530535891,
"grad_norm": 1.2719945907592773,
"learning_rate": 3.570773234732055e-05,
"loss": 0.5692,
"step": 124000
},
{
"epoch": 0.286306393945611,
"grad_norm": 0.48329654335975647,
"learning_rate": 3.568468030271945e-05,
"loss": 0.5724,
"step": 124200
},
{
"epoch": 0.28676743483763295,
"grad_norm": 1.2862858772277832,
"learning_rate": 3.566162825811836e-05,
"loss": 0.5593,
"step": 124400
},
{
"epoch": 0.2872284757296548,
"grad_norm": 2.067934513092041,
"learning_rate": 3.563857621351726e-05,
"loss": 0.5513,
"step": 124600
},
{
"epoch": 0.28768951662167674,
"grad_norm": 1.8785241842269897,
"learning_rate": 3.561552416891616e-05,
"loss": 0.5874,
"step": 124800
},
{
"epoch": 0.28815055751369867,
"grad_norm": 3.0009591579437256,
"learning_rate": 3.559247212431507e-05,
"loss": 0.5906,
"step": 125000
},
{
"epoch": 0.2886115984057206,
"grad_norm": 1.5266379117965698,
"learning_rate": 3.556942007971397e-05,
"loss": 0.6025,
"step": 125200
},
{
"epoch": 0.2890726392977425,
"grad_norm": 1.0007365942001343,
"learning_rate": 3.554636803511288e-05,
"loss": 0.5562,
"step": 125400
},
{
"epoch": 0.28953368018976444,
"grad_norm": 2.2831757068634033,
"learning_rate": 3.552331599051178e-05,
"loss": 0.6007,
"step": 125600
},
{
"epoch": 0.28999472108178637,
"grad_norm": 1.6605206727981567,
"learning_rate": 3.550026394591068e-05,
"loss": 0.5505,
"step": 125800
},
{
"epoch": 0.2904557619738083,
"grad_norm": 1.3791511058807373,
"learning_rate": 3.547721190130959e-05,
"loss": 0.6039,
"step": 126000
},
{
"epoch": 0.29091680286583016,
"grad_norm": 1.0427671670913696,
"learning_rate": 3.5454159856708494e-05,
"loss": 0.5216,
"step": 126200
},
{
"epoch": 0.2913778437578521,
"grad_norm": 1.1405614614486694,
"learning_rate": 3.543110781210739e-05,
"loss": 0.5689,
"step": 126400
},
{
"epoch": 0.291838884649874,
"grad_norm": 2.266157388687134,
"learning_rate": 3.54080557675063e-05,
"loss": 0.5273,
"step": 126600
},
{
"epoch": 0.29229992554189593,
"grad_norm": 1.7301876544952393,
"learning_rate": 3.5385003722905204e-05,
"loss": 0.5355,
"step": 126800
},
{
"epoch": 0.29276096643391786,
"grad_norm": 0.9307401180267334,
"learning_rate": 3.536195167830411e-05,
"loss": 0.5431,
"step": 127000
},
{
"epoch": 0.2932220073259398,
"grad_norm": 1.8494658470153809,
"learning_rate": 3.5338899633703016e-05,
"loss": 0.5768,
"step": 127200
},
{
"epoch": 0.2936830482179617,
"grad_norm": 1.0275499820709229,
"learning_rate": 3.531584758910192e-05,
"loss": 0.5996,
"step": 127400
},
{
"epoch": 0.29414408910998363,
"grad_norm": 0.5210323333740234,
"learning_rate": 3.529279554450082e-05,
"loss": 0.5473,
"step": 127600
},
{
"epoch": 0.29460513000200556,
"grad_norm": 1.827402114868164,
"learning_rate": 3.5269743499899726e-05,
"loss": 0.5728,
"step": 127800
},
{
"epoch": 0.2950661708940274,
"grad_norm": 2.054245948791504,
"learning_rate": 3.524669145529863e-05,
"loss": 0.6179,
"step": 128000
},
{
"epoch": 0.29552721178604935,
"grad_norm": 1.6693862676620483,
"learning_rate": 3.522363941069754e-05,
"loss": 0.5453,
"step": 128200
},
{
"epoch": 0.2959882526780713,
"grad_norm": 23.072887420654297,
"learning_rate": 3.5200587366096436e-05,
"loss": 0.5791,
"step": 128400
},
{
"epoch": 0.2964492935700932,
"grad_norm": 1.1938518285751343,
"learning_rate": 3.517753532149534e-05,
"loss": 0.5507,
"step": 128600
},
{
"epoch": 0.2969103344621151,
"grad_norm": 2.9994335174560547,
"learning_rate": 3.515448327689425e-05,
"loss": 0.5737,
"step": 128800
},
{
"epoch": 0.29737137535413705,
"grad_norm": 2.0268101692199707,
"learning_rate": 3.5131431232293146e-05,
"loss": 0.5941,
"step": 129000
},
{
"epoch": 0.29783241624615897,
"grad_norm": 1.4600251913070679,
"learning_rate": 3.510837918769205e-05,
"loss": 0.5456,
"step": 129200
},
{
"epoch": 0.2982934571381809,
"grad_norm": 0.5370715260505676,
"learning_rate": 3.508532714309096e-05,
"loss": 0.5618,
"step": 129400
},
{
"epoch": 0.29875449803020276,
"grad_norm": 1.65589439868927,
"learning_rate": 3.5062275098489864e-05,
"loss": 0.5189,
"step": 129600
},
{
"epoch": 0.2992155389222247,
"grad_norm": 1.9053618907928467,
"learning_rate": 3.503922305388876e-05,
"loss": 0.5698,
"step": 129800
},
{
"epoch": 0.2996765798142466,
"grad_norm": 0.9981529116630554,
"learning_rate": 3.501617100928767e-05,
"loss": 0.5622,
"step": 130000
},
{
"epoch": 0.30013762070626854,
"grad_norm": 1.5136228799819946,
"learning_rate": 3.4993118964686574e-05,
"loss": 0.5812,
"step": 130200
},
{
"epoch": 0.30059866159829046,
"grad_norm": 1.9930968284606934,
"learning_rate": 3.497006692008548e-05,
"loss": 0.5754,
"step": 130400
},
{
"epoch": 0.3010597024903124,
"grad_norm": 1.6242766380310059,
"learning_rate": 3.4947014875484385e-05,
"loss": 0.6422,
"step": 130600
},
{
"epoch": 0.3015207433823343,
"grad_norm": 1.142068862915039,
"learning_rate": 3.492396283088329e-05,
"loss": 0.5647,
"step": 130800
},
{
"epoch": 0.30198178427435624,
"grad_norm": 0.8593564629554749,
"learning_rate": 3.490091078628219e-05,
"loss": 0.5709,
"step": 131000
},
{
"epoch": 0.3024428251663781,
"grad_norm": 1.0364127159118652,
"learning_rate": 3.4877858741681095e-05,
"loss": 0.6261,
"step": 131200
},
{
"epoch": 0.30290386605840003,
"grad_norm": 0.7950695157051086,
"learning_rate": 3.485480669708e-05,
"loss": 0.5276,
"step": 131400
},
{
"epoch": 0.30336490695042195,
"grad_norm": 0.7673638463020325,
"learning_rate": 3.483175465247891e-05,
"loss": 0.5289,
"step": 131600
},
{
"epoch": 0.3038259478424439,
"grad_norm": 0.7830930948257446,
"learning_rate": 3.4808702607877806e-05,
"loss": 0.512,
"step": 131800
},
{
"epoch": 0.3042869887344658,
"grad_norm": 2.0144901275634766,
"learning_rate": 3.478565056327671e-05,
"loss": 0.5974,
"step": 132000
},
{
"epoch": 0.3047480296264877,
"grad_norm": 1.531823754310608,
"learning_rate": 3.476259851867562e-05,
"loss": 0.5889,
"step": 132200
},
{
"epoch": 0.30520907051850965,
"grad_norm": 1.1989134550094604,
"learning_rate": 3.4739546474074516e-05,
"loss": 0.5664,
"step": 132400
},
{
"epoch": 0.3056701114105316,
"grad_norm": 1.5596988201141357,
"learning_rate": 3.471649442947342e-05,
"loss": 0.5465,
"step": 132600
},
{
"epoch": 0.3061311523025535,
"grad_norm": 1.2339794635772705,
"learning_rate": 3.469344238487233e-05,
"loss": 0.5387,
"step": 132800
},
{
"epoch": 0.30659219319457537,
"grad_norm": 0.7480385303497314,
"learning_rate": 3.467039034027123e-05,
"loss": 0.5744,
"step": 133000
},
{
"epoch": 0.3070532340865973,
"grad_norm": 1.1106038093566895,
"learning_rate": 3.464733829567013e-05,
"loss": 0.5523,
"step": 133200
},
{
"epoch": 0.3075142749786192,
"grad_norm": 1.145395040512085,
"learning_rate": 3.462428625106904e-05,
"loss": 0.5758,
"step": 133400
},
{
"epoch": 0.30797531587064114,
"grad_norm": 1.4697068929672241,
"learning_rate": 3.460123420646794e-05,
"loss": 0.5938,
"step": 133600
},
{
"epoch": 0.30843635676266307,
"grad_norm": 1.8657139539718628,
"learning_rate": 3.457818216186685e-05,
"loss": 0.612,
"step": 133800
},
{
"epoch": 0.308897397654685,
"grad_norm": 1.3529716730117798,
"learning_rate": 3.4555130117265754e-05,
"loss": 0.6109,
"step": 134000
},
{
"epoch": 0.3093584385467069,
"grad_norm": 1.7217750549316406,
"learning_rate": 3.453207807266466e-05,
"loss": 0.5585,
"step": 134200
},
{
"epoch": 0.30981947943872884,
"grad_norm": 2.0881683826446533,
"learning_rate": 3.450902602806356e-05,
"loss": 0.5603,
"step": 134400
},
{
"epoch": 0.3102805203307507,
"grad_norm": 1.4093154668807983,
"learning_rate": 3.4485973983462465e-05,
"loss": 0.6025,
"step": 134600
},
{
"epoch": 0.31074156122277263,
"grad_norm": 1.2909964323043823,
"learning_rate": 3.446292193886137e-05,
"loss": 0.6318,
"step": 134800
},
{
"epoch": 0.31120260211479456,
"grad_norm": 1.9000458717346191,
"learning_rate": 3.4439869894260276e-05,
"loss": 0.5565,
"step": 135000
},
{
"epoch": 0.3116636430068165,
"grad_norm": 1.2994461059570312,
"learning_rate": 3.4416817849659175e-05,
"loss": 0.5426,
"step": 135200
},
{
"epoch": 0.3121246838988384,
"grad_norm": 0.6507192850112915,
"learning_rate": 3.439376580505808e-05,
"loss": 0.5631,
"step": 135400
},
{
"epoch": 0.31258572479086033,
"grad_norm": 1.4689639806747437,
"learning_rate": 3.4370713760456986e-05,
"loss": 0.6069,
"step": 135600
},
{
"epoch": 0.31304676568288226,
"grad_norm": 0.9149547219276428,
"learning_rate": 3.434766171585589e-05,
"loss": 0.5872,
"step": 135800
},
{
"epoch": 0.3135078065749042,
"grad_norm": 1.8406304121017456,
"learning_rate": 3.432460967125479e-05,
"loss": 0.5729,
"step": 136000
},
{
"epoch": 0.3139688474669261,
"grad_norm": 1.9627593755722046,
"learning_rate": 3.4301557626653697e-05,
"loss": 0.5771,
"step": 136200
},
{
"epoch": 0.314429888358948,
"grad_norm": 0.7546736001968384,
"learning_rate": 3.42785055820526e-05,
"loss": 0.4629,
"step": 136400
},
{
"epoch": 0.3148909292509699,
"grad_norm": 1.3984806537628174,
"learning_rate": 3.42554535374515e-05,
"loss": 0.5377,
"step": 136600
},
{
"epoch": 0.3153519701429918,
"grad_norm": 1.5485873222351074,
"learning_rate": 3.423240149285041e-05,
"loss": 0.5739,
"step": 136800
},
{
"epoch": 0.31581301103501375,
"grad_norm": 1.7093192338943481,
"learning_rate": 3.420934944824931e-05,
"loss": 0.5751,
"step": 137000
},
{
"epoch": 0.31627405192703567,
"grad_norm": 1.5941184759140015,
"learning_rate": 3.418629740364822e-05,
"loss": 0.555,
"step": 137200
},
{
"epoch": 0.3167350928190576,
"grad_norm": 1.0753742456436157,
"learning_rate": 3.4163245359047124e-05,
"loss": 0.5638,
"step": 137400
},
{
"epoch": 0.3171961337110795,
"grad_norm": 1.171726107597351,
"learning_rate": 3.414019331444603e-05,
"loss": 0.5748,
"step": 137600
},
{
"epoch": 0.31765717460310144,
"grad_norm": 1.5128881931304932,
"learning_rate": 3.4117141269844935e-05,
"loss": 0.5728,
"step": 137800
},
{
"epoch": 0.3181182154951233,
"grad_norm": 2.131058692932129,
"learning_rate": 3.4094089225243834e-05,
"loss": 0.5536,
"step": 138000
},
{
"epoch": 0.31857925638714524,
"grad_norm": 1.5034462213516235,
"learning_rate": 3.407103718064274e-05,
"loss": 0.5505,
"step": 138200
},
{
"epoch": 0.31904029727916716,
"grad_norm": 1.4908447265625,
"learning_rate": 3.4047985136041645e-05,
"loss": 0.5813,
"step": 138400
},
{
"epoch": 0.3195013381711891,
"grad_norm": 1.6707509756088257,
"learning_rate": 3.4024933091440544e-05,
"loss": 0.5984,
"step": 138600
},
{
"epoch": 0.319962379063211,
"grad_norm": 1.7882601022720337,
"learning_rate": 3.400188104683945e-05,
"loss": 0.5801,
"step": 138800
},
{
"epoch": 0.32042341995523294,
"grad_norm": 2.314807176589966,
"learning_rate": 3.3978829002238356e-05,
"loss": 0.5608,
"step": 139000
},
{
"epoch": 0.32088446084725486,
"grad_norm": 0.6125404834747314,
"learning_rate": 3.395577695763726e-05,
"loss": 0.5732,
"step": 139200
},
{
"epoch": 0.3213455017392768,
"grad_norm": 1.9929119348526,
"learning_rate": 3.393272491303616e-05,
"loss": 0.5998,
"step": 139400
},
{
"epoch": 0.3218065426312987,
"grad_norm": 1.571915626525879,
"learning_rate": 3.3909672868435066e-05,
"loss": 0.5613,
"step": 139600
},
{
"epoch": 0.3222675835233206,
"grad_norm": 1.3218785524368286,
"learning_rate": 3.388662082383397e-05,
"loss": 0.5558,
"step": 139800
},
{
"epoch": 0.3227286244153425,
"grad_norm": 1.0370618104934692,
"learning_rate": 3.386356877923287e-05,
"loss": 0.5212,
"step": 140000
},
{
"epoch": 0.3231896653073644,
"grad_norm": 1.202951431274414,
"learning_rate": 3.3840516734631776e-05,
"loss": 0.5084,
"step": 140200
},
{
"epoch": 0.32365070619938635,
"grad_norm": 1.7719680070877075,
"learning_rate": 3.381746469003068e-05,
"loss": 0.5619,
"step": 140400
},
{
"epoch": 0.3241117470914083,
"grad_norm": 1.611811876296997,
"learning_rate": 3.379441264542959e-05,
"loss": 0.5645,
"step": 140600
},
{
"epoch": 0.3245727879834302,
"grad_norm": 1.4955034255981445,
"learning_rate": 3.377136060082849e-05,
"loss": 0.5335,
"step": 140800
},
{
"epoch": 0.3250338288754521,
"grad_norm": 1.1228415966033936,
"learning_rate": 3.37483085562274e-05,
"loss": 0.538,
"step": 141000
},
{
"epoch": 0.32549486976747405,
"grad_norm": 0.8524361848831177,
"learning_rate": 3.3725256511626305e-05,
"loss": 0.5565,
"step": 141200
},
{
"epoch": 0.3259559106594959,
"grad_norm": 0.7709594368934631,
"learning_rate": 3.3702204467025203e-05,
"loss": 0.5728,
"step": 141400
},
{
"epoch": 0.32641695155151784,
"grad_norm": 0.9017342329025269,
"learning_rate": 3.367915242242411e-05,
"loss": 0.574,
"step": 141600
},
{
"epoch": 0.32687799244353977,
"grad_norm": 1.6135542392730713,
"learning_rate": 3.3656100377823015e-05,
"loss": 0.5467,
"step": 141800
},
{
"epoch": 0.3273390333355617,
"grad_norm": 1.0958969593048096,
"learning_rate": 3.3633048333221914e-05,
"loss": 0.5548,
"step": 142000
},
{
"epoch": 0.3278000742275836,
"grad_norm": 0.8333266973495483,
"learning_rate": 3.360999628862082e-05,
"loss": 0.6149,
"step": 142200
},
{
"epoch": 0.32826111511960554,
"grad_norm": 1.3214168548583984,
"learning_rate": 3.3586944244019725e-05,
"loss": 0.5691,
"step": 142400
},
{
"epoch": 0.32872215601162746,
"grad_norm": 1.9546606540679932,
"learning_rate": 3.356389219941863e-05,
"loss": 0.5188,
"step": 142600
},
{
"epoch": 0.3291831969036494,
"grad_norm": 2.063167095184326,
"learning_rate": 3.354084015481753e-05,
"loss": 0.5576,
"step": 142800
},
{
"epoch": 0.3296442377956713,
"grad_norm": 1.5281319618225098,
"learning_rate": 3.3517788110216435e-05,
"loss": 0.6239,
"step": 143000
},
{
"epoch": 0.3301052786876932,
"grad_norm": 0.9940102696418762,
"learning_rate": 3.349473606561534e-05,
"loss": 0.5521,
"step": 143200
},
{
"epoch": 0.3305663195797151,
"grad_norm": 0.5748217105865479,
"learning_rate": 3.347168402101425e-05,
"loss": 0.536,
"step": 143400
},
{
"epoch": 0.33102736047173703,
"grad_norm": 1.7020162343978882,
"learning_rate": 3.3448631976413146e-05,
"loss": 0.573,
"step": 143600
},
{
"epoch": 0.33148840136375896,
"grad_norm": 1.1483004093170166,
"learning_rate": 3.342557993181205e-05,
"loss": 0.5677,
"step": 143800
},
{
"epoch": 0.3319494422557809,
"grad_norm": 0.9976577162742615,
"learning_rate": 3.3402527887210964e-05,
"loss": 0.5171,
"step": 144000
},
{
"epoch": 0.3324104831478028,
"grad_norm": 1.9477131366729736,
"learning_rate": 3.337947584260986e-05,
"loss": 0.5206,
"step": 144200
},
{
"epoch": 0.33287152403982473,
"grad_norm": 2.5591280460357666,
"learning_rate": 3.335642379800877e-05,
"loss": 0.5785,
"step": 144400
},
{
"epoch": 0.33333256493184665,
"grad_norm": 0.9699960947036743,
"learning_rate": 3.3333371753407674e-05,
"loss": 0.5573,
"step": 144600
},
{
"epoch": 0.3337936058238685,
"grad_norm": 1.0641608238220215,
"learning_rate": 3.331031970880657e-05,
"loss": 0.5807,
"step": 144800
},
{
"epoch": 0.33425464671589045,
"grad_norm": 1.6940183639526367,
"learning_rate": 3.328726766420548e-05,
"loss": 0.5861,
"step": 145000
},
{
"epoch": 0.33471568760791237,
"grad_norm": 1.1107732057571411,
"learning_rate": 3.3264215619604384e-05,
"loss": 0.5613,
"step": 145200
},
{
"epoch": 0.3351767284999343,
"grad_norm": 1.3826497793197632,
"learning_rate": 3.324116357500329e-05,
"loss": 0.5364,
"step": 145400
},
{
"epoch": 0.3356377693919562,
"grad_norm": 2.2688817977905273,
"learning_rate": 3.321811153040219e-05,
"loss": 0.5485,
"step": 145600
},
{
"epoch": 0.33609881028397814,
"grad_norm": 1.0029947757720947,
"learning_rate": 3.3195059485801094e-05,
"loss": 0.5915,
"step": 145800
},
{
"epoch": 0.33655985117600007,
"grad_norm": 1.0812941789627075,
"learning_rate": 3.31720074412e-05,
"loss": 0.5652,
"step": 146000
},
{
"epoch": 0.337020892068022,
"grad_norm": 1.1072156429290771,
"learning_rate": 3.31489553965989e-05,
"loss": 0.5462,
"step": 146200
},
{
"epoch": 0.3374819329600439,
"grad_norm": 0.6877702474594116,
"learning_rate": 3.3125903351997805e-05,
"loss": 0.5372,
"step": 146400
},
{
"epoch": 0.3379429738520658,
"grad_norm": 1.1875689029693604,
"learning_rate": 3.310285130739671e-05,
"loss": 0.5579,
"step": 146600
},
{
"epoch": 0.3384040147440877,
"grad_norm": 1.9786611795425415,
"learning_rate": 3.3079799262795616e-05,
"loss": 0.5614,
"step": 146800
},
{
"epoch": 0.33886505563610964,
"grad_norm": 0.24953074753284454,
"learning_rate": 3.3056747218194515e-05,
"loss": 0.5648,
"step": 147000
},
{
"epoch": 0.33932609652813156,
"grad_norm": 2.5248162746429443,
"learning_rate": 3.303369517359342e-05,
"loss": 0.4776,
"step": 147200
},
{
"epoch": 0.3397871374201535,
"grad_norm": 0.7923634052276611,
"learning_rate": 3.301064312899233e-05,
"loss": 0.5524,
"step": 147400
},
{
"epoch": 0.3402481783121754,
"grad_norm": 1.1320934295654297,
"learning_rate": 3.298759108439123e-05,
"loss": 0.5924,
"step": 147600
},
{
"epoch": 0.34070921920419733,
"grad_norm": 0.9425584673881531,
"learning_rate": 3.296453903979014e-05,
"loss": 0.5637,
"step": 147800
},
{
"epoch": 0.34117026009621926,
"grad_norm": 1.1642394065856934,
"learning_rate": 3.294148699518904e-05,
"loss": 0.555,
"step": 148000
},
{
"epoch": 0.3416313009882411,
"grad_norm": 1.479867935180664,
"learning_rate": 3.291843495058794e-05,
"loss": 0.5555,
"step": 148200
},
{
"epoch": 0.34209234188026305,
"grad_norm": 1.6537656784057617,
"learning_rate": 3.289538290598685e-05,
"loss": 0.5266,
"step": 148400
},
{
"epoch": 0.342553382772285,
"grad_norm": 0.8928322196006775,
"learning_rate": 3.2872330861385754e-05,
"loss": 0.5169,
"step": 148600
},
{
"epoch": 0.3430144236643069,
"grad_norm": 0.6630598902702332,
"learning_rate": 3.284927881678466e-05,
"loss": 0.5868,
"step": 148800
},
{
"epoch": 0.3434754645563288,
"grad_norm": 1.361573338508606,
"learning_rate": 3.282622677218356e-05,
"loss": 0.542,
"step": 149000
},
{
"epoch": 0.34393650544835075,
"grad_norm": 1.668082356452942,
"learning_rate": 3.2803174727582464e-05,
"loss": 0.5735,
"step": 149200
},
{
"epoch": 0.3443975463403727,
"grad_norm": 2.2211737632751465,
"learning_rate": 3.278012268298137e-05,
"loss": 0.5747,
"step": 149400
},
{
"epoch": 0.3448585872323946,
"grad_norm": 0.685369610786438,
"learning_rate": 3.275707063838027e-05,
"loss": 0.5401,
"step": 149600
},
{
"epoch": 0.3453196281244165,
"grad_norm": 1.617565631866455,
"learning_rate": 3.2734018593779174e-05,
"loss": 0.5635,
"step": 149800
},
{
"epoch": 0.3457806690164384,
"grad_norm": 1.5583852529525757,
"learning_rate": 3.271096654917808e-05,
"loss": 0.542,
"step": 150000
},
{
"epoch": 0.3457806690164384,
"eval_loss": 0.5474369525909424,
"eval_runtime": 144.1295,
"eval_samples_per_second": 30.403,
"eval_steps_per_second": 30.403,
"step": 150000
},
{
"epoch": 0.3462417099084603,
"grad_norm": 1.612930178642273,
"learning_rate": 3.2687914504576985e-05,
"loss": 0.5401,
"step": 150200
},
{
"epoch": 0.34670275080048224,
"grad_norm": 1.3440135717391968,
"learning_rate": 3.2664862459975884e-05,
"loss": 0.557,
"step": 150400
},
{
"epoch": 0.34716379169250416,
"grad_norm": 1.8030917644500732,
"learning_rate": 3.264181041537479e-05,
"loss": 0.528,
"step": 150600
},
{
"epoch": 0.3476248325845261,
"grad_norm": 1.355789303779602,
"learning_rate": 3.26187583707737e-05,
"loss": 0.5977,
"step": 150800
},
{
"epoch": 0.348085873476548,
"grad_norm": 1.8958524465560913,
"learning_rate": 3.25957063261726e-05,
"loss": 0.5509,
"step": 151000
},
{
"epoch": 0.34854691436856994,
"grad_norm": 1.62078857421875,
"learning_rate": 3.257265428157151e-05,
"loss": 0.5261,
"step": 151200
},
{
"epoch": 0.34900795526059186,
"grad_norm": 1.1603842973709106,
"learning_rate": 3.254960223697041e-05,
"loss": 0.5319,
"step": 151400
},
{
"epoch": 0.34946899615261373,
"grad_norm": 1.1251477003097534,
"learning_rate": 3.252655019236932e-05,
"loss": 0.5416,
"step": 151600
},
{
"epoch": 0.34993003704463566,
"grad_norm": 1.0224628448486328,
"learning_rate": 3.250349814776822e-05,
"loss": 0.5649,
"step": 151800
},
{
"epoch": 0.3503910779366576,
"grad_norm": 1.211235523223877,
"learning_rate": 3.248044610316712e-05,
"loss": 0.596,
"step": 152000
},
{
"epoch": 0.3508521188286795,
"grad_norm": 0.8075993061065674,
"learning_rate": 3.245739405856603e-05,
"loss": 0.5491,
"step": 152200
},
{
"epoch": 0.35131315972070143,
"grad_norm": 1.6871740818023682,
"learning_rate": 3.243434201396493e-05,
"loss": 0.5996,
"step": 152400
},
{
"epoch": 0.35177420061272335,
"grad_norm": 1.8563005924224854,
"learning_rate": 3.241128996936383e-05,
"loss": 0.5544,
"step": 152600
},
{
"epoch": 0.3522352415047453,
"grad_norm": 1.102376103401184,
"learning_rate": 3.238823792476274e-05,
"loss": 0.5294,
"step": 152800
},
{
"epoch": 0.3526962823967672,
"grad_norm": 1.3146488666534424,
"learning_rate": 3.2365185880161645e-05,
"loss": 0.55,
"step": 153000
},
{
"epoch": 0.3531573232887891,
"grad_norm": 1.509630799293518,
"learning_rate": 3.2342133835560543e-05,
"loss": 0.5853,
"step": 153200
},
{
"epoch": 0.353618364180811,
"grad_norm": 1.378322958946228,
"learning_rate": 3.231908179095945e-05,
"loss": 0.5718,
"step": 153400
},
{
"epoch": 0.3540794050728329,
"grad_norm": 1.8150678873062134,
"learning_rate": 3.2296029746358355e-05,
"loss": 0.5234,
"step": 153600
},
{
"epoch": 0.35454044596485484,
"grad_norm": 1.5151995420455933,
"learning_rate": 3.2272977701757254e-05,
"loss": 0.55,
"step": 153800
},
{
"epoch": 0.35500148685687677,
"grad_norm": 1.823546290397644,
"learning_rate": 3.224992565715616e-05,
"loss": 0.5458,
"step": 154000
},
{
"epoch": 0.3554625277488987,
"grad_norm": 1.5419812202453613,
"learning_rate": 3.222687361255507e-05,
"loss": 0.567,
"step": 154200
},
{
"epoch": 0.3559235686409206,
"grad_norm": 0.9206061959266663,
"learning_rate": 3.220382156795397e-05,
"loss": 0.5666,
"step": 154400
},
{
"epoch": 0.35638460953294254,
"grad_norm": 1.9426078796386719,
"learning_rate": 3.2180769523352876e-05,
"loss": 0.5598,
"step": 154600
},
{
"epoch": 0.35684565042496447,
"grad_norm": 2.45462965965271,
"learning_rate": 3.215771747875178e-05,
"loss": 0.5728,
"step": 154800
},
{
"epoch": 0.35730669131698634,
"grad_norm": 1.4566892385482788,
"learning_rate": 3.213466543415069e-05,
"loss": 0.5465,
"step": 155000
},
{
"epoch": 0.35776773220900826,
"grad_norm": 1.2060158252716064,
"learning_rate": 3.211161338954959e-05,
"loss": 0.5656,
"step": 155200
},
{
"epoch": 0.3582287731010302,
"grad_norm": 2.714728832244873,
"learning_rate": 3.208856134494849e-05,
"loss": 0.5431,
"step": 155400
},
{
"epoch": 0.3586898139930521,
"grad_norm": 1.1903655529022217,
"learning_rate": 3.20655093003474e-05,
"loss": 0.543,
"step": 155600
},
{
"epoch": 0.35915085488507403,
"grad_norm": 1.3290653228759766,
"learning_rate": 3.20424572557463e-05,
"loss": 0.5193,
"step": 155800
},
{
"epoch": 0.35961189577709596,
"grad_norm": 1.43769371509552,
"learning_rate": 3.20194052111452e-05,
"loss": 0.5177,
"step": 156000
},
{
"epoch": 0.3600729366691179,
"grad_norm": 1.404023289680481,
"learning_rate": 3.199635316654411e-05,
"loss": 0.5425,
"step": 156200
},
{
"epoch": 0.3605339775611398,
"grad_norm": 1.71915602684021,
"learning_rate": 3.1973301121943014e-05,
"loss": 0.5128,
"step": 156400
},
{
"epoch": 0.3609950184531617,
"grad_norm": 0.7645987272262573,
"learning_rate": 3.195024907734191e-05,
"loss": 0.5194,
"step": 156600
},
{
"epoch": 0.3614560593451836,
"grad_norm": 0.7512270212173462,
"learning_rate": 3.192719703274082e-05,
"loss": 0.5535,
"step": 156800
},
{
"epoch": 0.3619171002372055,
"grad_norm": 1.369632601737976,
"learning_rate": 3.1904144988139724e-05,
"loss": 0.5799,
"step": 157000
},
{
"epoch": 0.36237814112922745,
"grad_norm": 1.033872127532959,
"learning_rate": 3.188109294353862e-05,
"loss": 0.4932,
"step": 157200
},
{
"epoch": 0.3628391820212494,
"grad_norm": 1.6982067823410034,
"learning_rate": 3.185804089893753e-05,
"loss": 0.5428,
"step": 157400
},
{
"epoch": 0.3633002229132713,
"grad_norm": 1.2654556035995483,
"learning_rate": 3.183498885433644e-05,
"loss": 0.5261,
"step": 157600
},
{
"epoch": 0.3637612638052932,
"grad_norm": 0.6754932403564453,
"learning_rate": 3.181193680973534e-05,
"loss": 0.5388,
"step": 157800
},
{
"epoch": 0.36422230469731515,
"grad_norm": 1.5985398292541504,
"learning_rate": 3.1788884765134246e-05,
"loss": 0.553,
"step": 158000
},
{
"epoch": 0.36468334558933707,
"grad_norm": 0.4007735848426819,
"learning_rate": 3.176583272053315e-05,
"loss": 0.5233,
"step": 158200
},
{
"epoch": 0.36514438648135894,
"grad_norm": 1.1381844282150269,
"learning_rate": 3.174278067593206e-05,
"loss": 0.5748,
"step": 158400
},
{
"epoch": 0.36560542737338086,
"grad_norm": 0.9528195858001709,
"learning_rate": 3.1719728631330956e-05,
"loss": 0.5558,
"step": 158600
},
{
"epoch": 0.3660664682654028,
"grad_norm": 0.8936863541603088,
"learning_rate": 3.169667658672986e-05,
"loss": 0.5473,
"step": 158800
},
{
"epoch": 0.3665275091574247,
"grad_norm": 1.4663864374160767,
"learning_rate": 3.167362454212877e-05,
"loss": 0.5891,
"step": 159000
},
{
"epoch": 0.36698855004944664,
"grad_norm": 1.6440341472625732,
"learning_rate": 3.1650572497527666e-05,
"loss": 0.5361,
"step": 159200
},
{
"epoch": 0.36744959094146856,
"grad_norm": 0.7922578454017639,
"learning_rate": 3.162752045292657e-05,
"loss": 0.5754,
"step": 159400
},
{
"epoch": 0.3679106318334905,
"grad_norm": 2.1551461219787598,
"learning_rate": 3.160446840832548e-05,
"loss": 0.512,
"step": 159600
},
{
"epoch": 0.3683716727255124,
"grad_norm": 0.9643208980560303,
"learning_rate": 3.158141636372438e-05,
"loss": 0.5467,
"step": 159800
},
{
"epoch": 0.3688327136175343,
"grad_norm": 2.1086177825927734,
"learning_rate": 3.155836431912328e-05,
"loss": 0.5213,
"step": 160000
},
{
"epoch": 0.3692937545095562,
"grad_norm": 1.441178321838379,
"learning_rate": 3.153531227452219e-05,
"loss": 0.6028,
"step": 160200
},
{
"epoch": 0.36975479540157813,
"grad_norm": 1.4054416418075562,
"learning_rate": 3.1512260229921094e-05,
"loss": 0.4865,
"step": 160400
},
{
"epoch": 0.37021583629360005,
"grad_norm": 1.6927324533462524,
"learning_rate": 3.148920818532e-05,
"loss": 0.626,
"step": 160600
},
{
"epoch": 0.370676877185622,
"grad_norm": 0.4474141299724579,
"learning_rate": 3.14661561407189e-05,
"loss": 0.5385,
"step": 160800
},
{
"epoch": 0.3711379180776439,
"grad_norm": 1.3374356031417847,
"learning_rate": 3.144310409611781e-05,
"loss": 0.5159,
"step": 161000
},
{
"epoch": 0.3715989589696658,
"grad_norm": 0.9584740996360779,
"learning_rate": 3.1420052051516716e-05,
"loss": 0.5547,
"step": 161200
},
{
"epoch": 0.37205999986168775,
"grad_norm": 0.8642265200614929,
"learning_rate": 3.1397000006915615e-05,
"loss": 0.5651,
"step": 161400
},
{
"epoch": 0.3725210407537097,
"grad_norm": 1.4360606670379639,
"learning_rate": 3.137394796231452e-05,
"loss": 0.535,
"step": 161600
},
{
"epoch": 0.37298208164573154,
"grad_norm": 1.210317611694336,
"learning_rate": 3.1350895917713427e-05,
"loss": 0.5291,
"step": 161800
},
{
"epoch": 0.37344312253775347,
"grad_norm": 0.818991482257843,
"learning_rate": 3.1327843873112325e-05,
"loss": 0.5441,
"step": 162000
},
{
"epoch": 0.3739041634297754,
"grad_norm": 1.7334657907485962,
"learning_rate": 3.130479182851123e-05,
"loss": 0.547,
"step": 162200
},
{
"epoch": 0.3743652043217973,
"grad_norm": 1.3756144046783447,
"learning_rate": 3.128173978391014e-05,
"loss": 0.5386,
"step": 162400
},
{
"epoch": 0.37482624521381924,
"grad_norm": 1.6707614660263062,
"learning_rate": 3.125868773930904e-05,
"loss": 0.5332,
"step": 162600
},
{
"epoch": 0.37528728610584117,
"grad_norm": 1.2302086353302002,
"learning_rate": 3.123563569470794e-05,
"loss": 0.5376,
"step": 162800
},
{
"epoch": 0.3757483269978631,
"grad_norm": 1.47279953956604,
"learning_rate": 3.121258365010685e-05,
"loss": 0.5065,
"step": 163000
},
{
"epoch": 0.376209367889885,
"grad_norm": 1.31904935836792,
"learning_rate": 3.118953160550575e-05,
"loss": 0.5673,
"step": 163200
},
{
"epoch": 0.3766704087819069,
"grad_norm": 0.5999027490615845,
"learning_rate": 3.116647956090465e-05,
"loss": 0.5637,
"step": 163400
},
{
"epoch": 0.3771314496739288,
"grad_norm": 0.6730818152427673,
"learning_rate": 3.114342751630356e-05,
"loss": 0.5457,
"step": 163600
},
{
"epoch": 0.37759249056595073,
"grad_norm": 1.5005543231964111,
"learning_rate": 3.112037547170246e-05,
"loss": 0.54,
"step": 163800
},
{
"epoch": 0.37805353145797266,
"grad_norm": 0.8119702339172363,
"learning_rate": 3.109732342710137e-05,
"loss": 0.539,
"step": 164000
},
{
"epoch": 0.3785145723499946,
"grad_norm": 0.7515968680381775,
"learning_rate": 3.107427138250027e-05,
"loss": 0.5466,
"step": 164200
},
{
"epoch": 0.3789756132420165,
"grad_norm": 1.7886674404144287,
"learning_rate": 3.105121933789918e-05,
"loss": 0.5196,
"step": 164400
},
{
"epoch": 0.37943665413403843,
"grad_norm": 1.1930861473083496,
"learning_rate": 3.1028167293298086e-05,
"loss": 0.5678,
"step": 164600
},
{
"epoch": 0.37989769502606036,
"grad_norm": 1.8339203596115112,
"learning_rate": 3.1005115248696985e-05,
"loss": 0.5559,
"step": 164800
},
{
"epoch": 0.3803587359180823,
"grad_norm": 1.1968586444854736,
"learning_rate": 3.098206320409589e-05,
"loss": 0.5661,
"step": 165000
},
{
"epoch": 0.38081977681010415,
"grad_norm": 1.7871519327163696,
"learning_rate": 3.0959011159494796e-05,
"loss": 0.5931,
"step": 165200
},
{
"epoch": 0.3812808177021261,
"grad_norm": 0.8988884091377258,
"learning_rate": 3.0935959114893695e-05,
"loss": 0.4913,
"step": 165400
},
{
"epoch": 0.381741858594148,
"grad_norm": 0.36570337414741516,
"learning_rate": 3.09129070702926e-05,
"loss": 0.5088,
"step": 165600
},
{
"epoch": 0.3822028994861699,
"grad_norm": 1.5454649925231934,
"learning_rate": 3.0889855025691506e-05,
"loss": 0.5556,
"step": 165800
},
{
"epoch": 0.38266394037819185,
"grad_norm": 4.354947090148926,
"learning_rate": 3.086680298109041e-05,
"loss": 0.543,
"step": 166000
},
{
"epoch": 0.38312498127021377,
"grad_norm": 1.1687140464782715,
"learning_rate": 3.084375093648931e-05,
"loss": 0.5557,
"step": 166200
},
{
"epoch": 0.3835860221622357,
"grad_norm": 0.9749841690063477,
"learning_rate": 3.0820698891888216e-05,
"loss": 0.5267,
"step": 166400
},
{
"epoch": 0.3840470630542576,
"grad_norm": 1.900041103363037,
"learning_rate": 3.079764684728712e-05,
"loss": 0.5163,
"step": 166600
},
{
"epoch": 0.3845081039462795,
"grad_norm": 1.2895805835723877,
"learning_rate": 3.077459480268602e-05,
"loss": 0.5756,
"step": 166800
},
{
"epoch": 0.3849691448383014,
"grad_norm": 1.4463883638381958,
"learning_rate": 3.075154275808493e-05,
"loss": 0.5656,
"step": 167000
},
{
"epoch": 0.38543018573032334,
"grad_norm": 0.9612560272216797,
"learning_rate": 3.072849071348383e-05,
"loss": 0.5103,
"step": 167200
},
{
"epoch": 0.38589122662234526,
"grad_norm": 1.8480556011199951,
"learning_rate": 3.070543866888274e-05,
"loss": 0.5257,
"step": 167400
},
{
"epoch": 0.3863522675143672,
"grad_norm": 1.0281248092651367,
"learning_rate": 3.0682386624281644e-05,
"loss": 0.5381,
"step": 167600
},
{
"epoch": 0.3868133084063891,
"grad_norm": 1.657851219177246,
"learning_rate": 3.065933457968055e-05,
"loss": 0.5224,
"step": 167800
},
{
"epoch": 0.38727434929841104,
"grad_norm": 0.9592533707618713,
"learning_rate": 3.0636282535079455e-05,
"loss": 0.527,
"step": 168000
},
{
"epoch": 0.38773539019043296,
"grad_norm": 2.421381950378418,
"learning_rate": 3.0613230490478354e-05,
"loss": 0.5972,
"step": 168200
},
{
"epoch": 0.3881964310824549,
"grad_norm": 0.9807179570198059,
"learning_rate": 3.059017844587726e-05,
"loss": 0.6076,
"step": 168400
},
{
"epoch": 0.38865747197447675,
"grad_norm": 1.1217988729476929,
"learning_rate": 3.0567126401276165e-05,
"loss": 0.5442,
"step": 168600
},
{
"epoch": 0.3891185128664987,
"grad_norm": 0.9705345630645752,
"learning_rate": 3.054407435667507e-05,
"loss": 0.5831,
"step": 168800
},
{
"epoch": 0.3895795537585206,
"grad_norm": 0.9477503895759583,
"learning_rate": 3.052102231207397e-05,
"loss": 0.5955,
"step": 169000
},
{
"epoch": 0.3900405946505425,
"grad_norm": 0.7813563346862793,
"learning_rate": 3.0497970267472876e-05,
"loss": 0.5686,
"step": 169200
},
{
"epoch": 0.39050163554256445,
"grad_norm": 1.0669126510620117,
"learning_rate": 3.0474918222871778e-05,
"loss": 0.5756,
"step": 169400
},
{
"epoch": 0.3909626764345864,
"grad_norm": 1.3676906824111938,
"learning_rate": 3.0451866178270683e-05,
"loss": 0.4965,
"step": 169600
},
{
"epoch": 0.3914237173266083,
"grad_norm": 1.404822587966919,
"learning_rate": 3.0428814133669586e-05,
"loss": 0.5471,
"step": 169800
},
{
"epoch": 0.3918847582186302,
"grad_norm": 0.7466553449630737,
"learning_rate": 3.040576208906849e-05,
"loss": 0.556,
"step": 170000
},
{
"epoch": 0.3923457991106521,
"grad_norm": 1.3484429121017456,
"learning_rate": 3.0382710044467394e-05,
"loss": 0.5521,
"step": 170200
},
{
"epoch": 0.392806840002674,
"grad_norm": 3.4249660968780518,
"learning_rate": 3.03596579998663e-05,
"loss": 0.5787,
"step": 170400
},
{
"epoch": 0.39326788089469594,
"grad_norm": 0.8153938055038452,
"learning_rate": 3.03366059552652e-05,
"loss": 0.5223,
"step": 170600
},
{
"epoch": 0.39372892178671787,
"grad_norm": 2.557283401489258,
"learning_rate": 3.0313553910664104e-05,
"loss": 0.5833,
"step": 170800
},
{
"epoch": 0.3941899626787398,
"grad_norm": 1.367695927619934,
"learning_rate": 3.0290501866063013e-05,
"loss": 0.5317,
"step": 171000
},
{
"epoch": 0.3946510035707617,
"grad_norm": 1.190898060798645,
"learning_rate": 3.026744982146192e-05,
"loss": 0.5361,
"step": 171200
},
{
"epoch": 0.39511204446278364,
"grad_norm": 1.7618181705474854,
"learning_rate": 3.024439777686082e-05,
"loss": 0.6089,
"step": 171400
},
{
"epoch": 0.39557308535480556,
"grad_norm": 1.191237211227417,
"learning_rate": 3.0221345732259727e-05,
"loss": 0.5271,
"step": 171600
},
{
"epoch": 0.3960341262468275,
"grad_norm": 1.8360000848770142,
"learning_rate": 3.019829368765863e-05,
"loss": 0.5879,
"step": 171800
},
{
"epoch": 0.39649516713884936,
"grad_norm": 1.363987684249878,
"learning_rate": 3.0175241643057535e-05,
"loss": 0.5211,
"step": 172000
},
{
"epoch": 0.3969562080308713,
"grad_norm": 0.9211211800575256,
"learning_rate": 3.0152189598456437e-05,
"loss": 0.5419,
"step": 172200
},
{
"epoch": 0.3974172489228932,
"grad_norm": 1.8756023645401,
"learning_rate": 3.0129137553855343e-05,
"loss": 0.5281,
"step": 172400
},
{
"epoch": 0.39787828981491513,
"grad_norm": 0.9270503520965576,
"learning_rate": 3.0106085509254245e-05,
"loss": 0.5506,
"step": 172600
},
{
"epoch": 0.39833933070693706,
"grad_norm": 1.689388394355774,
"learning_rate": 3.0083033464653147e-05,
"loss": 0.4929,
"step": 172800
},
{
"epoch": 0.398800371598959,
"grad_norm": 1.1315703392028809,
"learning_rate": 3.0059981420052053e-05,
"loss": 0.5469,
"step": 173000
},
{
"epoch": 0.3992614124909809,
"grad_norm": 1.1053519248962402,
"learning_rate": 3.0036929375450955e-05,
"loss": 0.5001,
"step": 173200
},
{
"epoch": 0.39972245338300283,
"grad_norm": 1.1651402711868286,
"learning_rate": 3.001387733084986e-05,
"loss": 0.5255,
"step": 173400
},
{
"epoch": 0.4001834942750247,
"grad_norm": 1.540276288986206,
"learning_rate": 2.9990825286248763e-05,
"loss": 0.5644,
"step": 173600
},
{
"epoch": 0.4006445351670466,
"grad_norm": 0.8608019948005676,
"learning_rate": 2.996777324164767e-05,
"loss": 0.5312,
"step": 173800
},
{
"epoch": 0.40110557605906855,
"grad_norm": 0.959018886089325,
"learning_rate": 2.994472119704657e-05,
"loss": 0.5322,
"step": 174000
},
{
"epoch": 0.40156661695109047,
"grad_norm": 2.531625986099243,
"learning_rate": 2.9921669152445477e-05,
"loss": 0.5521,
"step": 174200
},
{
"epoch": 0.4020276578431124,
"grad_norm": 1.8716404438018799,
"learning_rate": 2.9898617107844386e-05,
"loss": 0.4931,
"step": 174400
},
{
"epoch": 0.4024886987351343,
"grad_norm": 1.4556031227111816,
"learning_rate": 2.9875565063243288e-05,
"loss": 0.5879,
"step": 174600
},
{
"epoch": 0.40294973962715624,
"grad_norm": 1.2687571048736572,
"learning_rate": 2.985251301864219e-05,
"loss": 0.5636,
"step": 174800
},
{
"epoch": 0.40341078051917817,
"grad_norm": 1.354716420173645,
"learning_rate": 2.9829460974041096e-05,
"loss": 0.5851,
"step": 175000
},
{
"epoch": 0.4038718214112001,
"grad_norm": 0.4532039761543274,
"learning_rate": 2.980640892944e-05,
"loss": 0.5726,
"step": 175200
},
{
"epoch": 0.40433286230322196,
"grad_norm": 1.2430226802825928,
"learning_rate": 2.9783356884838904e-05,
"loss": 0.5263,
"step": 175400
},
{
"epoch": 0.4047939031952439,
"grad_norm": 1.0308810472488403,
"learning_rate": 2.9760304840237806e-05,
"loss": 0.5634,
"step": 175600
},
{
"epoch": 0.4052549440872658,
"grad_norm": 1.0540807247161865,
"learning_rate": 2.9737252795636712e-05,
"loss": 0.546,
"step": 175800
},
{
"epoch": 0.40571598497928774,
"grad_norm": 1.632247805595398,
"learning_rate": 2.9714200751035614e-05,
"loss": 0.5265,
"step": 176000
},
{
"epoch": 0.40617702587130966,
"grad_norm": 1.5189135074615479,
"learning_rate": 2.969114870643452e-05,
"loss": 0.5582,
"step": 176200
},
{
"epoch": 0.4066380667633316,
"grad_norm": 1.3175644874572754,
"learning_rate": 2.9668096661833422e-05,
"loss": 0.555,
"step": 176400
},
{
"epoch": 0.4070991076553535,
"grad_norm": 1.3439033031463623,
"learning_rate": 2.9645044617232325e-05,
"loss": 0.5526,
"step": 176600
},
{
"epoch": 0.40756014854737543,
"grad_norm": 0.6501840949058533,
"learning_rate": 2.962199257263123e-05,
"loss": 0.4856,
"step": 176800
},
{
"epoch": 0.4080211894393973,
"grad_norm": 2.5215022563934326,
"learning_rate": 2.9598940528030132e-05,
"loss": 0.5419,
"step": 177000
},
{
"epoch": 0.4084822303314192,
"grad_norm": 1.9052616357803345,
"learning_rate": 2.9575888483429038e-05,
"loss": 0.5189,
"step": 177200
},
{
"epoch": 0.40894327122344115,
"grad_norm": 1.2403985261917114,
"learning_rate": 2.955283643882794e-05,
"loss": 0.6047,
"step": 177400
},
{
"epoch": 0.4094043121154631,
"grad_norm": 1.517579436302185,
"learning_rate": 2.9529784394226846e-05,
"loss": 0.5691,
"step": 177600
},
{
"epoch": 0.409865353007485,
"grad_norm": 2.5231924057006836,
"learning_rate": 2.9506732349625755e-05,
"loss": 0.5686,
"step": 177800
},
{
"epoch": 0.4103263938995069,
"grad_norm": 0.6522693634033203,
"learning_rate": 2.9483680305024657e-05,
"loss": 0.5318,
"step": 178000
},
{
"epoch": 0.41078743479152885,
"grad_norm": 0.9372640252113342,
"learning_rate": 2.9460628260423563e-05,
"loss": 0.5535,
"step": 178200
},
{
"epoch": 0.4112484756835508,
"grad_norm": 1.2775940895080566,
"learning_rate": 2.9437576215822465e-05,
"loss": 0.5885,
"step": 178400
},
{
"epoch": 0.41170951657557264,
"grad_norm": 1.6325544118881226,
"learning_rate": 2.9414524171221368e-05,
"loss": 0.5622,
"step": 178600
},
{
"epoch": 0.41217055746759457,
"grad_norm": 1.4288066625595093,
"learning_rate": 2.9391472126620273e-05,
"loss": 0.4999,
"step": 178800
},
{
"epoch": 0.4126315983596165,
"grad_norm": 2.633436918258667,
"learning_rate": 2.9368420082019176e-05,
"loss": 0.5428,
"step": 179000
},
{
"epoch": 0.4130926392516384,
"grad_norm": 1.5107150077819824,
"learning_rate": 2.934536803741808e-05,
"loss": 0.5327,
"step": 179200
},
{
"epoch": 0.41355368014366034,
"grad_norm": 1.3021948337554932,
"learning_rate": 2.9322315992816984e-05,
"loss": 0.5725,
"step": 179400
},
{
"epoch": 0.41401472103568226,
"grad_norm": 1.0030542612075806,
"learning_rate": 2.929926394821589e-05,
"loss": 0.521,
"step": 179600
},
{
"epoch": 0.4144757619277042,
"grad_norm": 1.4533718824386597,
"learning_rate": 2.927621190361479e-05,
"loss": 0.537,
"step": 179800
},
{
"epoch": 0.4149368028197261,
"grad_norm": 0.5830268263816833,
"learning_rate": 2.9253159859013697e-05,
"loss": 0.6027,
"step": 180000
},
{
"epoch": 0.41539784371174804,
"grad_norm": 2.173309087753296,
"learning_rate": 2.92301078144126e-05,
"loss": 0.5337,
"step": 180200
},
{
"epoch": 0.4158588846037699,
"grad_norm": 1.0939158201217651,
"learning_rate": 2.9207055769811502e-05,
"loss": 0.5293,
"step": 180400
},
{
"epoch": 0.41631992549579183,
"grad_norm": 1.6121618747711182,
"learning_rate": 2.9184003725210408e-05,
"loss": 0.546,
"step": 180600
},
{
"epoch": 0.41678096638781376,
"grad_norm": 0.8111677169799805,
"learning_rate": 2.916095168060931e-05,
"loss": 0.5222,
"step": 180800
},
{
"epoch": 0.4172420072798357,
"grad_norm": 0.7552040219306946,
"learning_rate": 2.9137899636008215e-05,
"loss": 0.5809,
"step": 181000
},
{
"epoch": 0.4177030481718576,
"grad_norm": 1.146061897277832,
"learning_rate": 2.9114847591407125e-05,
"loss": 0.5609,
"step": 181200
},
{
"epoch": 0.41816408906387953,
"grad_norm": 0.885413646697998,
"learning_rate": 2.9091795546806027e-05,
"loss": 0.5252,
"step": 181400
},
{
"epoch": 0.41862512995590145,
"grad_norm": 1.3384150266647339,
"learning_rate": 2.9068743502204933e-05,
"loss": 0.5202,
"step": 181600
},
{
"epoch": 0.4190861708479234,
"grad_norm": 0.9868043065071106,
"learning_rate": 2.9045691457603835e-05,
"loss": 0.5393,
"step": 181800
},
{
"epoch": 0.41954721173994525,
"grad_norm": 1.3893357515335083,
"learning_rate": 2.902263941300274e-05,
"loss": 0.5337,
"step": 182000
},
{
"epoch": 0.42000825263196717,
"grad_norm": 1.7168641090393066,
"learning_rate": 2.8999587368401643e-05,
"loss": 0.5119,
"step": 182200
},
{
"epoch": 0.4204692935239891,
"grad_norm": 0.6522820591926575,
"learning_rate": 2.8976535323800545e-05,
"loss": 0.5551,
"step": 182400
},
{
"epoch": 0.420930334416011,
"grad_norm": 1.6360949277877808,
"learning_rate": 2.895348327919945e-05,
"loss": 0.5413,
"step": 182600
},
{
"epoch": 0.42139137530803294,
"grad_norm": 2.0071022510528564,
"learning_rate": 2.8930431234598353e-05,
"loss": 0.556,
"step": 182800
},
{
"epoch": 0.42185241620005487,
"grad_norm": 1.155096173286438,
"learning_rate": 2.890737918999726e-05,
"loss": 0.4924,
"step": 183000
},
{
"epoch": 0.4223134570920768,
"grad_norm": 0.7732855677604675,
"learning_rate": 2.888432714539616e-05,
"loss": 0.5849,
"step": 183200
},
{
"epoch": 0.4227744979840987,
"grad_norm": 1.4793187379837036,
"learning_rate": 2.8861275100795067e-05,
"loss": 0.5426,
"step": 183400
},
{
"epoch": 0.42323553887612064,
"grad_norm": 1.6665247678756714,
"learning_rate": 2.883822305619397e-05,
"loss": 0.5926,
"step": 183600
},
{
"epoch": 0.4236965797681425,
"grad_norm": 1.4480516910552979,
"learning_rate": 2.8815171011592875e-05,
"loss": 0.5335,
"step": 183800
},
{
"epoch": 0.42415762066016444,
"grad_norm": 0.944604754447937,
"learning_rate": 2.8792118966991777e-05,
"loss": 0.516,
"step": 184000
},
{
"epoch": 0.42461866155218636,
"grad_norm": 1.405192255973816,
"learning_rate": 2.876906692239068e-05,
"loss": 0.5339,
"step": 184200
},
{
"epoch": 0.4250797024442083,
"grad_norm": 1.1222949028015137,
"learning_rate": 2.8746014877789585e-05,
"loss": 0.5023,
"step": 184400
},
{
"epoch": 0.4255407433362302,
"grad_norm": 1.2079672813415527,
"learning_rate": 2.8722962833188494e-05,
"loss": 0.4828,
"step": 184600
},
{
"epoch": 0.42600178422825213,
"grad_norm": 3.4156157970428467,
"learning_rate": 2.8699910788587396e-05,
"loss": 0.4995,
"step": 184800
},
{
"epoch": 0.42646282512027406,
"grad_norm": 1.3917217254638672,
"learning_rate": 2.8676858743986302e-05,
"loss": 0.5099,
"step": 185000
},
{
"epoch": 0.426923866012296,
"grad_norm": 1.514889121055603,
"learning_rate": 2.8653806699385204e-05,
"loss": 0.5377,
"step": 185200
},
{
"epoch": 0.42738490690431785,
"grad_norm": 1.0316505432128906,
"learning_rate": 2.863075465478411e-05,
"loss": 0.5223,
"step": 185400
},
{
"epoch": 0.4278459477963398,
"grad_norm": 2.2684624195098877,
"learning_rate": 2.8607702610183012e-05,
"loss": 0.5482,
"step": 185600
},
{
"epoch": 0.4283069886883617,
"grad_norm": 0.6258700489997864,
"learning_rate": 2.8584650565581918e-05,
"loss": 0.5643,
"step": 185800
},
{
"epoch": 0.4287680295803836,
"grad_norm": 0.6727305054664612,
"learning_rate": 2.856159852098082e-05,
"loss": 0.5405,
"step": 186000
},
{
"epoch": 0.42922907047240555,
"grad_norm": 0.6856648921966553,
"learning_rate": 2.8538546476379722e-05,
"loss": 0.5571,
"step": 186200
},
{
"epoch": 0.4296901113644275,
"grad_norm": 1.6323261260986328,
"learning_rate": 2.8515494431778628e-05,
"loss": 0.5369,
"step": 186400
},
{
"epoch": 0.4301511522564494,
"grad_norm": 1.5054471492767334,
"learning_rate": 2.849244238717753e-05,
"loss": 0.5402,
"step": 186600
},
{
"epoch": 0.4306121931484713,
"grad_norm": 1.21519136428833,
"learning_rate": 2.8469390342576436e-05,
"loss": 0.4947,
"step": 186800
},
{
"epoch": 0.43107323404049325,
"grad_norm": 1.126180648803711,
"learning_rate": 2.8446338297975338e-05,
"loss": 0.4861,
"step": 187000
},
{
"epoch": 0.4315342749325151,
"grad_norm": 1.4017746448516846,
"learning_rate": 2.8423286253374244e-05,
"loss": 0.4995,
"step": 187200
},
{
"epoch": 0.43199531582453704,
"grad_norm": 1.8414978981018066,
"learning_rate": 2.8400234208773146e-05,
"loss": 0.5247,
"step": 187400
},
{
"epoch": 0.43245635671655897,
"grad_norm": 0.9502488374710083,
"learning_rate": 2.8377182164172052e-05,
"loss": 0.5712,
"step": 187600
},
{
"epoch": 0.4329173976085809,
"grad_norm": 1.3080493211746216,
"learning_rate": 2.8354130119570954e-05,
"loss": 0.5895,
"step": 187800
},
{
"epoch": 0.4333784385006028,
"grad_norm": 1.122564673423767,
"learning_rate": 2.8331078074969863e-05,
"loss": 0.5105,
"step": 188000
},
{
"epoch": 0.43383947939262474,
"grad_norm": 3.3100082874298096,
"learning_rate": 2.8308026030368766e-05,
"loss": 0.5266,
"step": 188200
},
{
"epoch": 0.43430052028464666,
"grad_norm": 2.0265512466430664,
"learning_rate": 2.828497398576767e-05,
"loss": 0.5605,
"step": 188400
},
{
"epoch": 0.4347615611766686,
"grad_norm": 1.7905211448669434,
"learning_rate": 2.8261921941166574e-05,
"loss": 0.5581,
"step": 188600
},
{
"epoch": 0.43522260206869046,
"grad_norm": 1.0183840990066528,
"learning_rate": 2.823886989656548e-05,
"loss": 0.5051,
"step": 188800
},
{
"epoch": 0.4356836429607124,
"grad_norm": 1.128341794013977,
"learning_rate": 2.821581785196438e-05,
"loss": 0.5511,
"step": 189000
},
{
"epoch": 0.4361446838527343,
"grad_norm": 0.9863077998161316,
"learning_rate": 2.8192765807363287e-05,
"loss": 0.5541,
"step": 189200
},
{
"epoch": 0.43660572474475623,
"grad_norm": 2.1484644412994385,
"learning_rate": 2.816971376276219e-05,
"loss": 0.5729,
"step": 189400
},
{
"epoch": 0.43706676563677815,
"grad_norm": 0.716901421546936,
"learning_rate": 2.8146661718161095e-05,
"loss": 0.5085,
"step": 189600
},
{
"epoch": 0.4375278065288001,
"grad_norm": 1.7285312414169312,
"learning_rate": 2.8123609673559997e-05,
"loss": 0.49,
"step": 189800
},
{
"epoch": 0.437988847420822,
"grad_norm": 1.697322130203247,
"learning_rate": 2.81005576289589e-05,
"loss": 0.5524,
"step": 190000
},
{
"epoch": 0.4384498883128439,
"grad_norm": 0.9568549394607544,
"learning_rate": 2.8077505584357805e-05,
"loss": 0.5403,
"step": 190200
},
{
"epoch": 0.43891092920486585,
"grad_norm": 2.225656747817993,
"learning_rate": 2.8054453539756708e-05,
"loss": 0.5146,
"step": 190400
},
{
"epoch": 0.4393719700968877,
"grad_norm": 1.7832934856414795,
"learning_rate": 2.8031401495155613e-05,
"loss": 0.5734,
"step": 190600
},
{
"epoch": 0.43983301098890965,
"grad_norm": 1.1611802577972412,
"learning_rate": 2.8008349450554516e-05,
"loss": 0.5316,
"step": 190800
},
{
"epoch": 0.44029405188093157,
"grad_norm": 0.3716856837272644,
"learning_rate": 2.798529740595342e-05,
"loss": 0.5683,
"step": 191000
},
{
"epoch": 0.4407550927729535,
"grad_norm": 0.911855161190033,
"learning_rate": 2.7962245361352324e-05,
"loss": 0.5488,
"step": 191200
},
{
"epoch": 0.4412161336649754,
"grad_norm": 4.299455165863037,
"learning_rate": 2.7939193316751233e-05,
"loss": 0.5083,
"step": 191400
},
{
"epoch": 0.44167717455699734,
"grad_norm": 0.8923743367195129,
"learning_rate": 2.791614127215014e-05,
"loss": 0.5514,
"step": 191600
},
{
"epoch": 0.44213821544901927,
"grad_norm": 2.5912487506866455,
"learning_rate": 2.789308922754904e-05,
"loss": 0.5585,
"step": 191800
},
{
"epoch": 0.4425992563410412,
"grad_norm": 1.8387411832809448,
"learning_rate": 2.7870037182947943e-05,
"loss": 0.5628,
"step": 192000
},
{
"epoch": 0.44306029723306306,
"grad_norm": 1.2115058898925781,
"learning_rate": 2.784698513834685e-05,
"loss": 0.5178,
"step": 192200
},
{
"epoch": 0.443521338125085,
"grad_norm": 1.1574034690856934,
"learning_rate": 2.782393309374575e-05,
"loss": 0.4819,
"step": 192400
},
{
"epoch": 0.4439823790171069,
"grad_norm": 0.6429279446601868,
"learning_rate": 2.7800881049144657e-05,
"loss": 0.5674,
"step": 192600
},
{
"epoch": 0.44444341990912883,
"grad_norm": 1.5901168584823608,
"learning_rate": 2.777782900454356e-05,
"loss": 0.5352,
"step": 192800
},
{
"epoch": 0.44490446080115076,
"grad_norm": 0.7381865978240967,
"learning_rate": 2.7754776959942465e-05,
"loss": 0.5223,
"step": 193000
},
{
"epoch": 0.4453655016931727,
"grad_norm": 0.6729177236557007,
"learning_rate": 2.7731724915341367e-05,
"loss": 0.5568,
"step": 193200
},
{
"epoch": 0.4458265425851946,
"grad_norm": 1.1146801710128784,
"learning_rate": 2.7708672870740272e-05,
"loss": 0.5336,
"step": 193400
},
{
"epoch": 0.44628758347721653,
"grad_norm": 0.9231970906257629,
"learning_rate": 2.7685620826139175e-05,
"loss": 0.5331,
"step": 193600
},
{
"epoch": 0.44674862436923846,
"grad_norm": 0.9126871228218079,
"learning_rate": 2.7662568781538077e-05,
"loss": 0.5018,
"step": 193800
},
{
"epoch": 0.4472096652612603,
"grad_norm": 1.343369483947754,
"learning_rate": 2.7639516736936983e-05,
"loss": 0.5321,
"step": 194000
},
{
"epoch": 0.44767070615328225,
"grad_norm": 1.209140419960022,
"learning_rate": 2.7616464692335885e-05,
"loss": 0.5341,
"step": 194200
},
{
"epoch": 0.4481317470453042,
"grad_norm": 2.7046828269958496,
"learning_rate": 2.759341264773479e-05,
"loss": 0.5259,
"step": 194400
},
{
"epoch": 0.4485927879373261,
"grad_norm": 1.0318337678909302,
"learning_rate": 2.7570360603133693e-05,
"loss": 0.5131,
"step": 194600
},
{
"epoch": 0.449053828829348,
"grad_norm": 2.206500291824341,
"learning_rate": 2.7547308558532602e-05,
"loss": 0.4956,
"step": 194800
},
{
"epoch": 0.44951486972136995,
"grad_norm": 1.1853792667388916,
"learning_rate": 2.7524256513931508e-05,
"loss": 0.4903,
"step": 195000
},
{
"epoch": 0.44997591061339187,
"grad_norm": 2.2172162532806396,
"learning_rate": 2.750120446933041e-05,
"loss": 0.5276,
"step": 195200
},
{
"epoch": 0.4504369515054138,
"grad_norm": 0.8798406720161438,
"learning_rate": 2.7478152424729316e-05,
"loss": 0.526,
"step": 195400
},
{
"epoch": 0.45089799239743567,
"grad_norm": 1.5308436155319214,
"learning_rate": 2.7455100380128218e-05,
"loss": 0.5206,
"step": 195600
},
{
"epoch": 0.4513590332894576,
"grad_norm": 0.7613127827644348,
"learning_rate": 2.743204833552712e-05,
"loss": 0.4945,
"step": 195800
},
{
"epoch": 0.4518200741814795,
"grad_norm": 1.1208069324493408,
"learning_rate": 2.7408996290926026e-05,
"loss": 0.4972,
"step": 196000
},
{
"epoch": 0.45228111507350144,
"grad_norm": 1.172491431236267,
"learning_rate": 2.7385944246324928e-05,
"loss": 0.519,
"step": 196200
},
{
"epoch": 0.45274215596552336,
"grad_norm": 1.6736866235733032,
"learning_rate": 2.7362892201723834e-05,
"loss": 0.5425,
"step": 196400
},
{
"epoch": 0.4532031968575453,
"grad_norm": 1.6905968189239502,
"learning_rate": 2.7339840157122736e-05,
"loss": 0.5561,
"step": 196600
},
{
"epoch": 0.4536642377495672,
"grad_norm": 1.852290153503418,
"learning_rate": 2.7316788112521642e-05,
"loss": 0.4633,
"step": 196800
},
{
"epoch": 0.45412527864158914,
"grad_norm": 1.671228289604187,
"learning_rate": 2.7293736067920544e-05,
"loss": 0.5361,
"step": 197000
},
{
"epoch": 0.45458631953361106,
"grad_norm": 4.358177185058594,
"learning_rate": 2.727068402331945e-05,
"loss": 0.5422,
"step": 197200
},
{
"epoch": 0.45504736042563293,
"grad_norm": 1.261697769165039,
"learning_rate": 2.7247631978718352e-05,
"loss": 0.5468,
"step": 197400
},
{
"epoch": 0.45550840131765485,
"grad_norm": 1.6779541969299316,
"learning_rate": 2.7224579934117254e-05,
"loss": 0.5578,
"step": 197600
},
{
"epoch": 0.4559694422096768,
"grad_norm": 1.5837364196777344,
"learning_rate": 2.720152788951616e-05,
"loss": 0.5796,
"step": 197800
},
{
"epoch": 0.4564304831016987,
"grad_norm": 2.479245662689209,
"learning_rate": 2.7178475844915062e-05,
"loss": 0.5441,
"step": 198000
},
{
"epoch": 0.4568915239937206,
"grad_norm": 2.000091552734375,
"learning_rate": 2.715542380031397e-05,
"loss": 0.5661,
"step": 198200
},
{
"epoch": 0.45735256488574255,
"grad_norm": 1.4363523721694946,
"learning_rate": 2.7132371755712877e-05,
"loss": 0.5565,
"step": 198400
},
{
"epoch": 0.4578136057777645,
"grad_norm": 1.766074776649475,
"learning_rate": 2.710931971111178e-05,
"loss": 0.5825,
"step": 198600
},
{
"epoch": 0.4582746466697864,
"grad_norm": 0.5402831435203552,
"learning_rate": 2.7086267666510685e-05,
"loss": 0.5039,
"step": 198800
},
{
"epoch": 0.45873568756180827,
"grad_norm": 1.0958600044250488,
"learning_rate": 2.7063215621909587e-05,
"loss": 0.5534,
"step": 199000
},
{
"epoch": 0.4591967284538302,
"grad_norm": 1.6260972023010254,
"learning_rate": 2.7040163577308493e-05,
"loss": 0.5222,
"step": 199200
},
{
"epoch": 0.4596577693458521,
"grad_norm": 1.382095217704773,
"learning_rate": 2.7017111532707395e-05,
"loss": 0.5278,
"step": 199400
},
{
"epoch": 0.46011881023787404,
"grad_norm": 1.0845330953598022,
"learning_rate": 2.6994059488106298e-05,
"loss": 0.5143,
"step": 199600
},
{
"epoch": 0.46057985112989597,
"grad_norm": 1.2804137468338013,
"learning_rate": 2.6971007443505203e-05,
"loss": 0.511,
"step": 199800
},
{
"epoch": 0.4610408920219179,
"grad_norm": 3.7605793476104736,
"learning_rate": 2.6947955398904106e-05,
"loss": 0.531,
"step": 200000
},
{
"epoch": 0.4610408920219179,
"eval_loss": 0.5235968232154846,
"eval_runtime": 144.1603,
"eval_samples_per_second": 30.397,
"eval_steps_per_second": 30.397,
"step": 200000
},
{
"epoch": 0.4615019329139398,
"grad_norm": 1.2853552103042603,
"learning_rate": 2.692490335430301e-05,
"loss": 0.52,
"step": 200200
},
{
"epoch": 0.46196297380596174,
"grad_norm": 0.8464341759681702,
"learning_rate": 2.6901851309701914e-05,
"loss": 0.5059,
"step": 200400
},
{
"epoch": 0.46242401469798367,
"grad_norm": 1.0232640504837036,
"learning_rate": 2.687879926510082e-05,
"loss": 0.6008,
"step": 200600
},
{
"epoch": 0.46288505559000553,
"grad_norm": 1.2209442853927612,
"learning_rate": 2.685574722049972e-05,
"loss": 0.5058,
"step": 200800
},
{
"epoch": 0.46334609648202746,
"grad_norm": 0.827387809753418,
"learning_rate": 2.6832695175898627e-05,
"loss": 0.5022,
"step": 201000
},
{
"epoch": 0.4638071373740494,
"grad_norm": 0.663145899772644,
"learning_rate": 2.680964313129753e-05,
"loss": 0.5287,
"step": 201200
},
{
"epoch": 0.4642681782660713,
"grad_norm": 1.2869213819503784,
"learning_rate": 2.6786591086696432e-05,
"loss": 0.588,
"step": 201400
},
{
"epoch": 0.46472921915809323,
"grad_norm": 0.9213125705718994,
"learning_rate": 2.676353904209534e-05,
"loss": 0.5375,
"step": 201600
},
{
"epoch": 0.46519026005011516,
"grad_norm": 0.9459083080291748,
"learning_rate": 2.6740486997494246e-05,
"loss": 0.539,
"step": 201800
},
{
"epoch": 0.4656513009421371,
"grad_norm": 0.9873161315917969,
"learning_rate": 2.671743495289315e-05,
"loss": 0.5549,
"step": 202000
},
{
"epoch": 0.466112341834159,
"grad_norm": 1.8117451667785645,
"learning_rate": 2.6694382908292054e-05,
"loss": 0.5255,
"step": 202200
},
{
"epoch": 0.4665733827261809,
"grad_norm": 1.219114899635315,
"learning_rate": 2.6671330863690957e-05,
"loss": 0.4845,
"step": 202400
},
{
"epoch": 0.4670344236182028,
"grad_norm": 2.0464797019958496,
"learning_rate": 2.6648278819089862e-05,
"loss": 0.5696,
"step": 202600
},
{
"epoch": 0.4674954645102247,
"grad_norm": 2.183873176574707,
"learning_rate": 2.6625226774488765e-05,
"loss": 0.5078,
"step": 202800
},
{
"epoch": 0.46795650540224665,
"grad_norm": 0.8037805557250977,
"learning_rate": 2.660217472988767e-05,
"loss": 0.5538,
"step": 203000
},
{
"epoch": 0.46841754629426857,
"grad_norm": 0.2990266978740692,
"learning_rate": 2.6579122685286573e-05,
"loss": 0.5458,
"step": 203200
},
{
"epoch": 0.4688785871862905,
"grad_norm": 1.854121446609497,
"learning_rate": 2.6556070640685475e-05,
"loss": 0.5138,
"step": 203400
},
{
"epoch": 0.4693396280783124,
"grad_norm": 2.2942981719970703,
"learning_rate": 2.653301859608438e-05,
"loss": 0.5268,
"step": 203600
},
{
"epoch": 0.46980066897033435,
"grad_norm": 1.3234660625457764,
"learning_rate": 2.6509966551483283e-05,
"loss": 0.4838,
"step": 203800
},
{
"epoch": 0.4702617098623562,
"grad_norm": 2.0463480949401855,
"learning_rate": 2.648691450688219e-05,
"loss": 0.5101,
"step": 204000
},
{
"epoch": 0.47072275075437814,
"grad_norm": 2.1210684776306152,
"learning_rate": 2.646386246228109e-05,
"loss": 0.5376,
"step": 204200
},
{
"epoch": 0.47118379164640006,
"grad_norm": 1.7364137172698975,
"learning_rate": 2.6440810417679997e-05,
"loss": 0.5649,
"step": 204400
},
{
"epoch": 0.471644832538422,
"grad_norm": 0.9832141399383545,
"learning_rate": 2.64177583730789e-05,
"loss": 0.5415,
"step": 204600
},
{
"epoch": 0.4721058734304439,
"grad_norm": 2.0210485458374023,
"learning_rate": 2.6394706328477804e-05,
"loss": 0.5323,
"step": 204800
},
{
"epoch": 0.47256691432246584,
"grad_norm": 1.7423853874206543,
"learning_rate": 2.6371654283876714e-05,
"loss": 0.5177,
"step": 205000
},
{
"epoch": 0.47302795521448776,
"grad_norm": 0.6872438788414001,
"learning_rate": 2.6348602239275616e-05,
"loss": 0.5507,
"step": 205200
},
{
"epoch": 0.4734889961065097,
"grad_norm": 1.3187884092330933,
"learning_rate": 2.6325550194674518e-05,
"loss": 0.5919,
"step": 205400
},
{
"epoch": 0.4739500369985316,
"grad_norm": 0.8862842321395874,
"learning_rate": 2.6302498150073424e-05,
"loss": 0.4935,
"step": 205600
},
{
"epoch": 0.4744110778905535,
"grad_norm": 1.1730307340621948,
"learning_rate": 2.6279446105472326e-05,
"loss": 0.5093,
"step": 205800
},
{
"epoch": 0.4748721187825754,
"grad_norm": 1.160568118095398,
"learning_rate": 2.6256394060871232e-05,
"loss": 0.5479,
"step": 206000
},
{
"epoch": 0.4753331596745973,
"grad_norm": 1.4531235694885254,
"learning_rate": 2.6233342016270134e-05,
"loss": 0.5399,
"step": 206200
},
{
"epoch": 0.47579420056661925,
"grad_norm": 2.6737730503082275,
"learning_rate": 2.621028997166904e-05,
"loss": 0.5246,
"step": 206400
},
{
"epoch": 0.4762552414586412,
"grad_norm": 1.8411715030670166,
"learning_rate": 2.6187237927067942e-05,
"loss": 0.548,
"step": 206600
},
{
"epoch": 0.4767162823506631,
"grad_norm": 1.6035988330841064,
"learning_rate": 2.6164185882466848e-05,
"loss": 0.4635,
"step": 206800
},
{
"epoch": 0.477177323242685,
"grad_norm": 0.9196053743362427,
"learning_rate": 2.614113383786575e-05,
"loss": 0.4865,
"step": 207000
},
{
"epoch": 0.47763836413470695,
"grad_norm": 1.3672767877578735,
"learning_rate": 2.6118081793264652e-05,
"loss": 0.5275,
"step": 207200
},
{
"epoch": 0.4780994050267288,
"grad_norm": 1.2003188133239746,
"learning_rate": 2.6095029748663558e-05,
"loss": 0.5181,
"step": 207400
},
{
"epoch": 0.47856044591875074,
"grad_norm": 0.8703144788742065,
"learning_rate": 2.607197770406246e-05,
"loss": 0.4987,
"step": 207600
},
{
"epoch": 0.47902148681077267,
"grad_norm": 1.2609108686447144,
"learning_rate": 2.6048925659461366e-05,
"loss": 0.5032,
"step": 207800
},
{
"epoch": 0.4794825277027946,
"grad_norm": 1.2695225477218628,
"learning_rate": 2.6025873614860268e-05,
"loss": 0.5221,
"step": 208000
},
{
"epoch": 0.4799435685948165,
"grad_norm": 1.1836507320404053,
"learning_rate": 2.6002821570259174e-05,
"loss": 0.5443,
"step": 208200
},
{
"epoch": 0.48040460948683844,
"grad_norm": 1.0860618352890015,
"learning_rate": 2.5979769525658083e-05,
"loss": 0.5391,
"step": 208400
},
{
"epoch": 0.48086565037886037,
"grad_norm": 2.6720314025878906,
"learning_rate": 2.5956717481056985e-05,
"loss": 0.5293,
"step": 208600
},
{
"epoch": 0.4813266912708823,
"grad_norm": 2.128580093383789,
"learning_rate": 2.593366543645589e-05,
"loss": 0.5426,
"step": 208800
},
{
"epoch": 0.4817877321629042,
"grad_norm": 1.0625451803207397,
"learning_rate": 2.5910613391854793e-05,
"loss": 0.5703,
"step": 209000
},
{
"epoch": 0.4822487730549261,
"grad_norm": 1.0436484813690186,
"learning_rate": 2.5887561347253695e-05,
"loss": 0.5036,
"step": 209200
},
{
"epoch": 0.482709813946948,
"grad_norm": 1.5313512086868286,
"learning_rate": 2.58645093026526e-05,
"loss": 0.4912,
"step": 209400
},
{
"epoch": 0.48317085483896993,
"grad_norm": 2.2933545112609863,
"learning_rate": 2.5841457258051503e-05,
"loss": 0.5143,
"step": 209600
},
{
"epoch": 0.48363189573099186,
"grad_norm": 0.9948174357414246,
"learning_rate": 2.581840521345041e-05,
"loss": 0.4997,
"step": 209800
},
{
"epoch": 0.4840929366230138,
"grad_norm": 0.6930698752403259,
"learning_rate": 2.579535316884931e-05,
"loss": 0.5701,
"step": 210000
},
{
"epoch": 0.4845539775150357,
"grad_norm": 2.551692247390747,
"learning_rate": 2.5772301124248217e-05,
"loss": 0.5026,
"step": 210200
},
{
"epoch": 0.48501501840705763,
"grad_norm": 0.6203674674034119,
"learning_rate": 2.574924907964712e-05,
"loss": 0.5457,
"step": 210400
},
{
"epoch": 0.48547605929907955,
"grad_norm": 0.8173620104789734,
"learning_rate": 2.5726197035046025e-05,
"loss": 0.5061,
"step": 210600
},
{
"epoch": 0.4859371001911014,
"grad_norm": 1.0083948373794556,
"learning_rate": 2.5703144990444927e-05,
"loss": 0.4877,
"step": 210800
},
{
"epoch": 0.48639814108312335,
"grad_norm": 0.48525819182395935,
"learning_rate": 2.568009294584383e-05,
"loss": 0.5158,
"step": 211000
},
{
"epoch": 0.48685918197514527,
"grad_norm": 2.381948709487915,
"learning_rate": 2.5657040901242735e-05,
"loss": 0.5087,
"step": 211200
},
{
"epoch": 0.4873202228671672,
"grad_norm": 1.283881425857544,
"learning_rate": 2.5633988856641638e-05,
"loss": 0.5529,
"step": 211400
},
{
"epoch": 0.4877812637591891,
"grad_norm": 1.0474011898040771,
"learning_rate": 2.5610936812040543e-05,
"loss": 0.4997,
"step": 211600
},
{
"epoch": 0.48824230465121105,
"grad_norm": 1.509234070777893,
"learning_rate": 2.5587884767439452e-05,
"loss": 0.5289,
"step": 211800
},
{
"epoch": 0.48870334554323297,
"grad_norm": 0.736985445022583,
"learning_rate": 2.5564832722838355e-05,
"loss": 0.56,
"step": 212000
},
{
"epoch": 0.4891643864352549,
"grad_norm": 0.5530835390090942,
"learning_rate": 2.554178067823726e-05,
"loss": 0.5385,
"step": 212200
},
{
"epoch": 0.4896254273272768,
"grad_norm": 1.0076507329940796,
"learning_rate": 2.5518728633636163e-05,
"loss": 0.5014,
"step": 212400
},
{
"epoch": 0.4900864682192987,
"grad_norm": 0.7996362447738647,
"learning_rate": 2.5495676589035068e-05,
"loss": 0.5417,
"step": 212600
},
{
"epoch": 0.4905475091113206,
"grad_norm": 1.1056005954742432,
"learning_rate": 2.547262454443397e-05,
"loss": 0.4684,
"step": 212800
},
{
"epoch": 0.49100855000334254,
"grad_norm": 1.4682406187057495,
"learning_rate": 2.5449572499832873e-05,
"loss": 0.5222,
"step": 213000
},
{
"epoch": 0.49146959089536446,
"grad_norm": 2.054387331008911,
"learning_rate": 2.542652045523178e-05,
"loss": 0.5487,
"step": 213200
},
{
"epoch": 0.4919306317873864,
"grad_norm": 1.1834423542022705,
"learning_rate": 2.540346841063068e-05,
"loss": 0.523,
"step": 213400
},
{
"epoch": 0.4923916726794083,
"grad_norm": 1.6938774585723877,
"learning_rate": 2.5380416366029586e-05,
"loss": 0.5807,
"step": 213600
},
{
"epoch": 0.49285271357143023,
"grad_norm": 1.825681209564209,
"learning_rate": 2.535736432142849e-05,
"loss": 0.5444,
"step": 213800
},
{
"epoch": 0.49331375446345216,
"grad_norm": 1.6016223430633545,
"learning_rate": 2.5334312276827394e-05,
"loss": 0.5095,
"step": 214000
},
{
"epoch": 0.493774795355474,
"grad_norm": 0.7464369535446167,
"learning_rate": 2.5311260232226297e-05,
"loss": 0.5111,
"step": 214200
},
{
"epoch": 0.49423583624749595,
"grad_norm": 1.6987085342407227,
"learning_rate": 2.5288208187625202e-05,
"loss": 0.4878,
"step": 214400
},
{
"epoch": 0.4946968771395179,
"grad_norm": 1.2027496099472046,
"learning_rate": 2.5265156143024105e-05,
"loss": 0.5734,
"step": 214600
},
{
"epoch": 0.4951579180315398,
"grad_norm": 1.1822620630264282,
"learning_rate": 2.5242104098423007e-05,
"loss": 0.5592,
"step": 214800
},
{
"epoch": 0.4956189589235617,
"grad_norm": 1.0884791612625122,
"learning_rate": 2.521905205382192e-05,
"loss": 0.5228,
"step": 215000
},
{
"epoch": 0.49607999981558365,
"grad_norm": 3.0900111198425293,
"learning_rate": 2.519600000922082e-05,
"loss": 0.5193,
"step": 215200
},
{
"epoch": 0.4965410407076056,
"grad_norm": 0.8263806104660034,
"learning_rate": 2.5172947964619724e-05,
"loss": 0.5436,
"step": 215400
},
{
"epoch": 0.4970020815996275,
"grad_norm": 0.9320021271705627,
"learning_rate": 2.514989592001863e-05,
"loss": 0.5525,
"step": 215600
},
{
"epoch": 0.4974631224916494,
"grad_norm": 1.8418340682983398,
"learning_rate": 2.5126843875417532e-05,
"loss": 0.5159,
"step": 215800
},
{
"epoch": 0.4979241633836713,
"grad_norm": 1.0613411664962769,
"learning_rate": 2.5103791830816438e-05,
"loss": 0.5222,
"step": 216000
},
{
"epoch": 0.4983852042756932,
"grad_norm": 0.9613930583000183,
"learning_rate": 2.508073978621534e-05,
"loss": 0.5506,
"step": 216200
},
{
"epoch": 0.49884624516771514,
"grad_norm": 1.2147666215896606,
"learning_rate": 2.5057687741614246e-05,
"loss": 0.5332,
"step": 216400
},
{
"epoch": 0.49930728605973707,
"grad_norm": 0.8295925259590149,
"learning_rate": 2.5034635697013148e-05,
"loss": 0.5083,
"step": 216600
},
{
"epoch": 0.499768326951759,
"grad_norm": 1.5370151996612549,
"learning_rate": 2.501158365241205e-05,
"loss": 0.5137,
"step": 216800
},
{
"epoch": 0.5002293678437809,
"grad_norm": 1.137407898902893,
"learning_rate": 2.4988531607810956e-05,
"loss": 0.5289,
"step": 217000
},
{
"epoch": 0.5006904087358028,
"grad_norm": 1.1642227172851562,
"learning_rate": 2.4965479563209858e-05,
"loss": 0.5223,
"step": 217200
},
{
"epoch": 0.5011514496278248,
"grad_norm": 1.7283347845077515,
"learning_rate": 2.4942427518608764e-05,
"loss": 0.5269,
"step": 217400
},
{
"epoch": 0.5016124905198467,
"grad_norm": 1.0114668607711792,
"learning_rate": 2.491937547400767e-05,
"loss": 0.5464,
"step": 217600
},
{
"epoch": 0.5020735314118686,
"grad_norm": 2.422441244125366,
"learning_rate": 2.4896323429406572e-05,
"loss": 0.5441,
"step": 217800
},
{
"epoch": 0.5025345723038905,
"grad_norm": 0.6557809710502625,
"learning_rate": 2.4873271384805477e-05,
"loss": 0.4985,
"step": 218000
},
{
"epoch": 0.5029956131959125,
"grad_norm": 1.6513997316360474,
"learning_rate": 2.485021934020438e-05,
"loss": 0.5022,
"step": 218200
},
{
"epoch": 0.5034566540879343,
"grad_norm": 0.7555482387542725,
"learning_rate": 2.4827167295603285e-05,
"loss": 0.5285,
"step": 218400
},
{
"epoch": 0.5039176949799562,
"grad_norm": 0.9121997356414795,
"learning_rate": 2.4804115251002188e-05,
"loss": 0.5312,
"step": 218600
},
{
"epoch": 0.5043787358719781,
"grad_norm": 0.36491402983665466,
"learning_rate": 2.4781063206401093e-05,
"loss": 0.5309,
"step": 218800
},
{
"epoch": 0.504839776764,
"grad_norm": 2.048449993133545,
"learning_rate": 2.4758011161799996e-05,
"loss": 0.5274,
"step": 219000
},
{
"epoch": 0.505300817656022,
"grad_norm": 2.769894599914551,
"learning_rate": 2.47349591171989e-05,
"loss": 0.5035,
"step": 219200
},
{
"epoch": 0.5057618585480439,
"grad_norm": 1.8023812770843506,
"learning_rate": 2.4711907072597807e-05,
"loss": 0.5071,
"step": 219400
},
{
"epoch": 0.5062228994400658,
"grad_norm": 0.6726931929588318,
"learning_rate": 2.468885502799671e-05,
"loss": 0.5223,
"step": 219600
},
{
"epoch": 0.5066839403320877,
"grad_norm": 9.744784355163574,
"learning_rate": 2.4665802983395615e-05,
"loss": 0.4931,
"step": 219800
},
{
"epoch": 0.5071449812241097,
"grad_norm": 1.1189628839492798,
"learning_rate": 2.4642750938794517e-05,
"loss": 0.5205,
"step": 220000
},
{
"epoch": 0.5076060221161316,
"grad_norm": 1.6368327140808105,
"learning_rate": 2.4619698894193423e-05,
"loss": 0.5169,
"step": 220200
},
{
"epoch": 0.5080670630081535,
"grad_norm": 1.834841012954712,
"learning_rate": 2.4596646849592325e-05,
"loss": 0.4931,
"step": 220400
},
{
"epoch": 0.5085281039001754,
"grad_norm": 1.0901039838790894,
"learning_rate": 2.4573594804991227e-05,
"loss": 0.5193,
"step": 220600
},
{
"epoch": 0.5089891447921974,
"grad_norm": 0.9557801485061646,
"learning_rate": 2.4550542760390133e-05,
"loss": 0.5249,
"step": 220800
},
{
"epoch": 0.5094501856842193,
"grad_norm": 1.0982486009597778,
"learning_rate": 2.452749071578904e-05,
"loss": 0.4845,
"step": 221000
},
{
"epoch": 0.5099112265762412,
"grad_norm": 1.3123830556869507,
"learning_rate": 2.4504438671187944e-05,
"loss": 0.4842,
"step": 221200
},
{
"epoch": 0.5103722674682631,
"grad_norm": 1.05722975730896,
"learning_rate": 2.4481386626586847e-05,
"loss": 0.5196,
"step": 221400
},
{
"epoch": 0.5108333083602851,
"grad_norm": 1.5994271039962769,
"learning_rate": 2.445833458198575e-05,
"loss": 0.4932,
"step": 221600
},
{
"epoch": 0.5112943492523069,
"grad_norm": 0.3710331916809082,
"learning_rate": 2.4435282537384655e-05,
"loss": 0.4854,
"step": 221800
},
{
"epoch": 0.5117553901443288,
"grad_norm": 1.2854666709899902,
"learning_rate": 2.4412230492783557e-05,
"loss": 0.5092,
"step": 222000
},
{
"epoch": 0.5122164310363507,
"grad_norm": 1.364815354347229,
"learning_rate": 2.4389178448182463e-05,
"loss": 0.4975,
"step": 222200
},
{
"epoch": 0.5126774719283727,
"grad_norm": 1.2252674102783203,
"learning_rate": 2.4366126403581365e-05,
"loss": 0.5075,
"step": 222400
},
{
"epoch": 0.5131385128203946,
"grad_norm": 0.9235671758651733,
"learning_rate": 2.434307435898027e-05,
"loss": 0.5051,
"step": 222600
},
{
"epoch": 0.5135995537124165,
"grad_norm": 1.0827833414077759,
"learning_rate": 2.4320022314379176e-05,
"loss": 0.498,
"step": 222800
},
{
"epoch": 0.5140605946044384,
"grad_norm": 1.4872461557388306,
"learning_rate": 2.429697026977808e-05,
"loss": 0.5253,
"step": 223000
},
{
"epoch": 0.5145216354964604,
"grad_norm": 0.5086209177970886,
"learning_rate": 2.4273918225176984e-05,
"loss": 0.4979,
"step": 223200
},
{
"epoch": 0.5149826763884823,
"grad_norm": 1.0882658958435059,
"learning_rate": 2.4250866180575887e-05,
"loss": 0.5244,
"step": 223400
},
{
"epoch": 0.5154437172805042,
"grad_norm": 1.3784066438674927,
"learning_rate": 2.4227814135974792e-05,
"loss": 0.5057,
"step": 223600
},
{
"epoch": 0.5159047581725261,
"grad_norm": 1.245423674583435,
"learning_rate": 2.4204762091373695e-05,
"loss": 0.5005,
"step": 223800
},
{
"epoch": 0.516365799064548,
"grad_norm": 2.1874382495880127,
"learning_rate": 2.41817100467726e-05,
"loss": 0.5206,
"step": 224000
},
{
"epoch": 0.51682683995657,
"grad_norm": 1.1349289417266846,
"learning_rate": 2.4158658002171503e-05,
"loss": 0.5547,
"step": 224200
},
{
"epoch": 0.5172878808485919,
"grad_norm": 0.9220569729804993,
"learning_rate": 2.4135605957570408e-05,
"loss": 0.5421,
"step": 224400
},
{
"epoch": 0.5177489217406138,
"grad_norm": 0.7660688757896423,
"learning_rate": 2.4112553912969314e-05,
"loss": 0.4737,
"step": 224600
},
{
"epoch": 0.5182099626326357,
"grad_norm": 1.1073906421661377,
"learning_rate": 2.4089501868368216e-05,
"loss": 0.5424,
"step": 224800
},
{
"epoch": 0.5186710035246577,
"grad_norm": 0.5724996328353882,
"learning_rate": 2.4066449823767122e-05,
"loss": 0.5261,
"step": 225000
},
{
"epoch": 0.5191320444166795,
"grad_norm": 0.3339095413684845,
"learning_rate": 2.4043397779166024e-05,
"loss": 0.5172,
"step": 225200
},
{
"epoch": 0.5195930853087014,
"grad_norm": 1.5384175777435303,
"learning_rate": 2.4020345734564926e-05,
"loss": 0.498,
"step": 225400
},
{
"epoch": 0.5200541262007233,
"grad_norm": 1.137721061706543,
"learning_rate": 2.3997293689963832e-05,
"loss": 0.5326,
"step": 225600
},
{
"epoch": 0.5205151670927453,
"grad_norm": 0.3401934504508972,
"learning_rate": 2.3974241645362734e-05,
"loss": 0.5264,
"step": 225800
},
{
"epoch": 0.5209762079847672,
"grad_norm": 0.9476338624954224,
"learning_rate": 2.3951189600761643e-05,
"loss": 0.4663,
"step": 226000
},
{
"epoch": 0.5214372488767891,
"grad_norm": 1.3103936910629272,
"learning_rate": 2.3928137556160546e-05,
"loss": 0.5328,
"step": 226200
},
{
"epoch": 0.521898289768811,
"grad_norm": 1.7903141975402832,
"learning_rate": 2.3905085511559448e-05,
"loss": 0.4953,
"step": 226400
},
{
"epoch": 0.522359330660833,
"grad_norm": 0.7507403492927551,
"learning_rate": 2.3882033466958354e-05,
"loss": 0.5132,
"step": 226600
},
{
"epoch": 0.5228203715528549,
"grad_norm": 1.1141492128372192,
"learning_rate": 2.3858981422357256e-05,
"loss": 0.4964,
"step": 226800
},
{
"epoch": 0.5232814124448768,
"grad_norm": 0.9881762862205505,
"learning_rate": 2.383592937775616e-05,
"loss": 0.5187,
"step": 227000
},
{
"epoch": 0.5237424533368987,
"grad_norm": 2.4193100929260254,
"learning_rate": 2.3812877333155064e-05,
"loss": 0.5324,
"step": 227200
},
{
"epoch": 0.5242034942289207,
"grad_norm": 0.5690718293190002,
"learning_rate": 2.378982528855397e-05,
"loss": 0.4951,
"step": 227400
},
{
"epoch": 0.5246645351209426,
"grad_norm": 1.6624326705932617,
"learning_rate": 2.3766773243952872e-05,
"loss": 0.5211,
"step": 227600
},
{
"epoch": 0.5251255760129645,
"grad_norm": 0.916460394859314,
"learning_rate": 2.3743721199351778e-05,
"loss": 0.5439,
"step": 227800
},
{
"epoch": 0.5255866169049864,
"grad_norm": 1.8242855072021484,
"learning_rate": 2.3720669154750683e-05,
"loss": 0.5436,
"step": 228000
},
{
"epoch": 0.5260476577970084,
"grad_norm": 1.3293455839157104,
"learning_rate": 2.3697617110149586e-05,
"loss": 0.5383,
"step": 228200
},
{
"epoch": 0.5265086986890303,
"grad_norm": 1.328596830368042,
"learning_rate": 2.367456506554849e-05,
"loss": 0.5401,
"step": 228400
},
{
"epoch": 0.5269697395810521,
"grad_norm": 0.9804822206497192,
"learning_rate": 2.3651513020947393e-05,
"loss": 0.5252,
"step": 228600
},
{
"epoch": 0.527430780473074,
"grad_norm": 1.9417587518692017,
"learning_rate": 2.36284609763463e-05,
"loss": 0.5389,
"step": 228800
},
{
"epoch": 0.5278918213650959,
"grad_norm": 1.445884346961975,
"learning_rate": 2.36054089317452e-05,
"loss": 0.5014,
"step": 229000
},
{
"epoch": 0.5283528622571179,
"grad_norm": 1.5352164506912231,
"learning_rate": 2.3582356887144104e-05,
"loss": 0.4702,
"step": 229200
},
{
"epoch": 0.5288139031491398,
"grad_norm": 0.47279122471809387,
"learning_rate": 2.3559304842543013e-05,
"loss": 0.5097,
"step": 229400
},
{
"epoch": 0.5292749440411617,
"grad_norm": 0.591940701007843,
"learning_rate": 2.3536252797941915e-05,
"loss": 0.4762,
"step": 229600
},
{
"epoch": 0.5297359849331836,
"grad_norm": 1.6824707984924316,
"learning_rate": 2.351320075334082e-05,
"loss": 0.4868,
"step": 229800
},
{
"epoch": 0.5301970258252056,
"grad_norm": 0.9410609602928162,
"learning_rate": 2.3490148708739723e-05,
"loss": 0.5622,
"step": 230000
},
{
"epoch": 0.5306580667172275,
"grad_norm": 1.2229105234146118,
"learning_rate": 2.3467096664138625e-05,
"loss": 0.5073,
"step": 230200
},
{
"epoch": 0.5311191076092494,
"grad_norm": 0.7156030535697937,
"learning_rate": 2.344404461953753e-05,
"loss": 0.4934,
"step": 230400
},
{
"epoch": 0.5315801485012713,
"grad_norm": 1.401571273803711,
"learning_rate": 2.3420992574936433e-05,
"loss": 0.4973,
"step": 230600
},
{
"epoch": 0.5320411893932933,
"grad_norm": 0.503180205821991,
"learning_rate": 2.339794053033534e-05,
"loss": 0.4983,
"step": 230800
},
{
"epoch": 0.5325022302853152,
"grad_norm": 1.6790913343429565,
"learning_rate": 2.337488848573424e-05,
"loss": 0.4945,
"step": 231000
},
{
"epoch": 0.5329632711773371,
"grad_norm": 1.007137417793274,
"learning_rate": 2.3351836441133147e-05,
"loss": 0.4822,
"step": 231200
},
{
"epoch": 0.533424312069359,
"grad_norm": 2.378171920776367,
"learning_rate": 2.3328784396532053e-05,
"loss": 0.5775,
"step": 231400
},
{
"epoch": 0.533885352961381,
"grad_norm": 1.203321099281311,
"learning_rate": 2.3305732351930955e-05,
"loss": 0.4724,
"step": 231600
},
{
"epoch": 0.5343463938534029,
"grad_norm": 1.0625741481781006,
"learning_rate": 2.328268030732986e-05,
"loss": 0.4916,
"step": 231800
},
{
"epoch": 0.5348074347454247,
"grad_norm": 1.0948866605758667,
"learning_rate": 2.3259628262728763e-05,
"loss": 0.5066,
"step": 232000
},
{
"epoch": 0.5352684756374466,
"grad_norm": 1.4360226392745972,
"learning_rate": 2.323657621812767e-05,
"loss": 0.4836,
"step": 232200
},
{
"epoch": 0.5357295165294685,
"grad_norm": 1.1512943506240845,
"learning_rate": 2.321352417352657e-05,
"loss": 0.5677,
"step": 232400
},
{
"epoch": 0.5361905574214905,
"grad_norm": 1.0096590518951416,
"learning_rate": 2.3190472128925477e-05,
"loss": 0.5734,
"step": 232600
},
{
"epoch": 0.5366515983135124,
"grad_norm": 1.4425885677337646,
"learning_rate": 2.3167420084324382e-05,
"loss": 0.4956,
"step": 232800
},
{
"epoch": 0.5371126392055343,
"grad_norm": 0.5548868775367737,
"learning_rate": 2.3144368039723284e-05,
"loss": 0.4904,
"step": 233000
},
{
"epoch": 0.5375736800975562,
"grad_norm": 1.1134722232818604,
"learning_rate": 2.312131599512219e-05,
"loss": 0.5376,
"step": 233200
},
{
"epoch": 0.5380347209895782,
"grad_norm": 0.9351561069488525,
"learning_rate": 2.3098263950521092e-05,
"loss": 0.5593,
"step": 233400
},
{
"epoch": 0.5384957618816001,
"grad_norm": 1.064975380897522,
"learning_rate": 2.3075211905919998e-05,
"loss": 0.5187,
"step": 233600
},
{
"epoch": 0.538956802773622,
"grad_norm": 1.065260648727417,
"learning_rate": 2.30521598613189e-05,
"loss": 0.5143,
"step": 233800
},
{
"epoch": 0.5394178436656439,
"grad_norm": 1.2114022970199585,
"learning_rate": 2.3029107816717803e-05,
"loss": 0.5338,
"step": 234000
},
{
"epoch": 0.5398788845576659,
"grad_norm": 0.8252068758010864,
"learning_rate": 2.300605577211671e-05,
"loss": 0.5506,
"step": 234200
},
{
"epoch": 0.5403399254496878,
"grad_norm": 1.3504903316497803,
"learning_rate": 2.298300372751561e-05,
"loss": 0.4864,
"step": 234400
},
{
"epoch": 0.5408009663417097,
"grad_norm": 1.2112751007080078,
"learning_rate": 2.295995168291452e-05,
"loss": 0.4996,
"step": 234600
},
{
"epoch": 0.5412620072337316,
"grad_norm": 0.6069416999816895,
"learning_rate": 2.2936899638313422e-05,
"loss": 0.5307,
"step": 234800
},
{
"epoch": 0.5417230481257536,
"grad_norm": 1.572514533996582,
"learning_rate": 2.2913847593712324e-05,
"loss": 0.5292,
"step": 235000
},
{
"epoch": 0.5421840890177755,
"grad_norm": 1.0099878311157227,
"learning_rate": 2.289079554911123e-05,
"loss": 0.5688,
"step": 235200
},
{
"epoch": 0.5426451299097973,
"grad_norm": 0.9012830853462219,
"learning_rate": 2.2867743504510132e-05,
"loss": 0.5366,
"step": 235400
},
{
"epoch": 0.5431061708018192,
"grad_norm": 1.135108232498169,
"learning_rate": 2.2844691459909038e-05,
"loss": 0.4941,
"step": 235600
},
{
"epoch": 0.5435672116938411,
"grad_norm": 0.9751501083374023,
"learning_rate": 2.282163941530794e-05,
"loss": 0.5217,
"step": 235800
},
{
"epoch": 0.5440282525858631,
"grad_norm": 1.2317419052124023,
"learning_rate": 2.2798587370706846e-05,
"loss": 0.5562,
"step": 236000
},
{
"epoch": 0.544489293477885,
"grad_norm": 1.3884457349777222,
"learning_rate": 2.277553532610575e-05,
"loss": 0.4626,
"step": 236200
},
{
"epoch": 0.5449503343699069,
"grad_norm": 0.9288251996040344,
"learning_rate": 2.2752483281504654e-05,
"loss": 0.5039,
"step": 236400
},
{
"epoch": 0.5454113752619288,
"grad_norm": 0.3665759563446045,
"learning_rate": 2.272943123690356e-05,
"loss": 0.5163,
"step": 236600
},
{
"epoch": 0.5458724161539508,
"grad_norm": 2.027440309524536,
"learning_rate": 2.2706379192302462e-05,
"loss": 0.5599,
"step": 236800
},
{
"epoch": 0.5463334570459727,
"grad_norm": 1.916327953338623,
"learning_rate": 2.2683327147701367e-05,
"loss": 0.482,
"step": 237000
},
{
"epoch": 0.5467944979379946,
"grad_norm": 1.4914941787719727,
"learning_rate": 2.266027510310027e-05,
"loss": 0.5569,
"step": 237200
},
{
"epoch": 0.5472555388300165,
"grad_norm": 1.7089998722076416,
"learning_rate": 2.2637223058499175e-05,
"loss": 0.4639,
"step": 237400
},
{
"epoch": 0.5477165797220385,
"grad_norm": 4.126305103302002,
"learning_rate": 2.2614171013898078e-05,
"loss": 0.519,
"step": 237600
},
{
"epoch": 0.5481776206140604,
"grad_norm": 1.5551437139511108,
"learning_rate": 2.259111896929698e-05,
"loss": 0.5204,
"step": 237800
},
{
"epoch": 0.5486386615060823,
"grad_norm": 0.7548621296882629,
"learning_rate": 2.256806692469589e-05,
"loss": 0.5807,
"step": 238000
},
{
"epoch": 0.5490997023981042,
"grad_norm": 0.2803627550601959,
"learning_rate": 2.254501488009479e-05,
"loss": 0.4846,
"step": 238200
},
{
"epoch": 0.5495607432901262,
"grad_norm": 0.9677246809005737,
"learning_rate": 2.2521962835493697e-05,
"loss": 0.4721,
"step": 238400
},
{
"epoch": 0.5500217841821481,
"grad_norm": 1.637499451637268,
"learning_rate": 2.24989107908926e-05,
"loss": 0.5269,
"step": 238600
},
{
"epoch": 0.5504828250741699,
"grad_norm": 2.227924346923828,
"learning_rate": 2.24758587462915e-05,
"loss": 0.5198,
"step": 238800
},
{
"epoch": 0.5509438659661918,
"grad_norm": 0.7341607213020325,
"learning_rate": 2.2452806701690407e-05,
"loss": 0.4917,
"step": 239000
},
{
"epoch": 0.5514049068582138,
"grad_norm": 0.4585340917110443,
"learning_rate": 2.242975465708931e-05,
"loss": 0.5,
"step": 239200
},
{
"epoch": 0.5518659477502357,
"grad_norm": 1.405619502067566,
"learning_rate": 2.2406702612488215e-05,
"loss": 0.5141,
"step": 239400
},
{
"epoch": 0.5523269886422576,
"grad_norm": 1.2896803617477417,
"learning_rate": 2.238365056788712e-05,
"loss": 0.4999,
"step": 239600
},
{
"epoch": 0.5527880295342795,
"grad_norm": 2.165039300918579,
"learning_rate": 2.2360598523286023e-05,
"loss": 0.5722,
"step": 239800
},
{
"epoch": 0.5532490704263014,
"grad_norm": 1.3514726161956787,
"learning_rate": 2.233754647868493e-05,
"loss": 0.5017,
"step": 240000
},
{
"epoch": 0.5537101113183234,
"grad_norm": 0.8125177621841431,
"learning_rate": 2.231449443408383e-05,
"loss": 0.5618,
"step": 240200
},
{
"epoch": 0.5541711522103453,
"grad_norm": 0.4262295961380005,
"learning_rate": 2.2291442389482737e-05,
"loss": 0.4865,
"step": 240400
},
{
"epoch": 0.5546321931023672,
"grad_norm": 2.328521966934204,
"learning_rate": 2.226839034488164e-05,
"loss": 0.5051,
"step": 240600
},
{
"epoch": 0.5550932339943891,
"grad_norm": 1.1261919736862183,
"learning_rate": 2.2245338300280545e-05,
"loss": 0.5119,
"step": 240800
},
{
"epoch": 0.5555542748864111,
"grad_norm": 1.1566516160964966,
"learning_rate": 2.2222286255679447e-05,
"loss": 0.5197,
"step": 241000
},
{
"epoch": 0.556015315778433,
"grad_norm": 1.7515827417373657,
"learning_rate": 2.2199234211078353e-05,
"loss": 0.5552,
"step": 241200
},
{
"epoch": 0.5564763566704549,
"grad_norm": 1.8269792795181274,
"learning_rate": 2.217618216647726e-05,
"loss": 0.4796,
"step": 241400
},
{
"epoch": 0.5569373975624768,
"grad_norm": 0.7790307402610779,
"learning_rate": 2.215313012187616e-05,
"loss": 0.5293,
"step": 241600
},
{
"epoch": 0.5573984384544988,
"grad_norm": 0.49990883469581604,
"learning_rate": 2.2130078077275066e-05,
"loss": 0.4879,
"step": 241800
},
{
"epoch": 0.5578594793465207,
"grad_norm": 1.0329365730285645,
"learning_rate": 2.210702603267397e-05,
"loss": 0.521,
"step": 242000
},
{
"epoch": 0.5583205202385425,
"grad_norm": 1.125595211982727,
"learning_rate": 2.2083973988072874e-05,
"loss": 0.4795,
"step": 242200
},
{
"epoch": 0.5587815611305644,
"grad_norm": 1.1356284618377686,
"learning_rate": 2.2060921943471777e-05,
"loss": 0.4882,
"step": 242400
},
{
"epoch": 0.5592426020225864,
"grad_norm": 0.7517489194869995,
"learning_rate": 2.203786989887068e-05,
"loss": 0.5531,
"step": 242600
},
{
"epoch": 0.5597036429146083,
"grad_norm": 1.4066451787948608,
"learning_rate": 2.2014817854269585e-05,
"loss": 0.5133,
"step": 242800
},
{
"epoch": 0.5601646838066302,
"grad_norm": 0.7683632373809814,
"learning_rate": 2.199176580966849e-05,
"loss": 0.5379,
"step": 243000
},
{
"epoch": 0.5606257246986521,
"grad_norm": 0.3758114278316498,
"learning_rate": 2.1968713765067396e-05,
"loss": 0.4681,
"step": 243200
},
{
"epoch": 0.561086765590674,
"grad_norm": 1.2410677671432495,
"learning_rate": 2.1945661720466298e-05,
"loss": 0.5409,
"step": 243400
},
{
"epoch": 0.561547806482696,
"grad_norm": 1.4236176013946533,
"learning_rate": 2.19226096758652e-05,
"loss": 0.4861,
"step": 243600
},
{
"epoch": 0.5620088473747179,
"grad_norm": 0.9534035325050354,
"learning_rate": 2.1899557631264106e-05,
"loss": 0.5307,
"step": 243800
},
{
"epoch": 0.5624698882667398,
"grad_norm": 0.711057186126709,
"learning_rate": 2.187650558666301e-05,
"loss": 0.4825,
"step": 244000
},
{
"epoch": 0.5629309291587618,
"grad_norm": 2.3626081943511963,
"learning_rate": 2.1853453542061914e-05,
"loss": 0.5344,
"step": 244200
},
{
"epoch": 0.5633919700507837,
"grad_norm": 0.23439358174800873,
"learning_rate": 2.1830401497460816e-05,
"loss": 0.5146,
"step": 244400
},
{
"epoch": 0.5638530109428056,
"grad_norm": 2.047996997833252,
"learning_rate": 2.1807349452859722e-05,
"loss": 0.4826,
"step": 244600
},
{
"epoch": 0.5643140518348275,
"grad_norm": 1.1761419773101807,
"learning_rate": 2.1784297408258628e-05,
"loss": 0.5096,
"step": 244800
},
{
"epoch": 0.5647750927268494,
"grad_norm": 1.0271129608154297,
"learning_rate": 2.176124536365753e-05,
"loss": 0.5182,
"step": 245000
},
{
"epoch": 0.5652361336188714,
"grad_norm": 1.1691869497299194,
"learning_rate": 2.1738193319056436e-05,
"loss": 0.4849,
"step": 245200
},
{
"epoch": 0.5656971745108933,
"grad_norm": 0.9857134222984314,
"learning_rate": 2.1715141274455338e-05,
"loss": 0.4795,
"step": 245400
},
{
"epoch": 0.5661582154029151,
"grad_norm": 0.6204602122306824,
"learning_rate": 2.1692089229854244e-05,
"loss": 0.5282,
"step": 245600
},
{
"epoch": 0.566619256294937,
"grad_norm": 2.198983669281006,
"learning_rate": 2.1669037185253146e-05,
"loss": 0.534,
"step": 245800
},
{
"epoch": 0.567080297186959,
"grad_norm": 0.9738652110099792,
"learning_rate": 2.1645985140652052e-05,
"loss": 0.5499,
"step": 246000
},
{
"epoch": 0.5675413380789809,
"grad_norm": 0.801446795463562,
"learning_rate": 2.1622933096050954e-05,
"loss": 0.5452,
"step": 246200
},
{
"epoch": 0.5680023789710028,
"grad_norm": 1.2199312448501587,
"learning_rate": 2.159988105144986e-05,
"loss": 0.5296,
"step": 246400
},
{
"epoch": 0.5684634198630247,
"grad_norm": 1.333871603012085,
"learning_rate": 2.1576829006848765e-05,
"loss": 0.5443,
"step": 246600
},
{
"epoch": 0.5689244607550467,
"grad_norm": 1.0577268600463867,
"learning_rate": 2.1553776962247668e-05,
"loss": 0.5119,
"step": 246800
},
{
"epoch": 0.5693855016470686,
"grad_norm": 1.1730480194091797,
"learning_rate": 2.1530724917646573e-05,
"loss": 0.5124,
"step": 247000
},
{
"epoch": 0.5698465425390905,
"grad_norm": 1.0999897718429565,
"learning_rate": 2.1507672873045476e-05,
"loss": 0.5589,
"step": 247200
},
{
"epoch": 0.5703075834311124,
"grad_norm": 1.2525196075439453,
"learning_rate": 2.1484620828444378e-05,
"loss": 0.5254,
"step": 247400
},
{
"epoch": 0.5707686243231344,
"grad_norm": 1.3364574909210205,
"learning_rate": 2.1461568783843284e-05,
"loss": 0.496,
"step": 247600
},
{
"epoch": 0.5712296652151563,
"grad_norm": 0.8777609467506409,
"learning_rate": 2.1438516739242186e-05,
"loss": 0.5314,
"step": 247800
},
{
"epoch": 0.5716907061071782,
"grad_norm": 0.9641389846801758,
"learning_rate": 2.141546469464109e-05,
"loss": 0.4883,
"step": 248000
},
{
"epoch": 0.5721517469992001,
"grad_norm": 0.8974488973617554,
"learning_rate": 2.1392412650039997e-05,
"loss": 0.4994,
"step": 248200
},
{
"epoch": 0.572612787891222,
"grad_norm": 1.1016892194747925,
"learning_rate": 2.13693606054389e-05,
"loss": 0.5206,
"step": 248400
},
{
"epoch": 0.573073828783244,
"grad_norm": 1.8941538333892822,
"learning_rate": 2.1346308560837805e-05,
"loss": 0.5283,
"step": 248600
},
{
"epoch": 0.5735348696752659,
"grad_norm": 0.882707417011261,
"learning_rate": 2.1323256516236707e-05,
"loss": 0.523,
"step": 248800
},
{
"epoch": 0.5739959105672877,
"grad_norm": 1.1047805547714233,
"learning_rate": 2.1300204471635613e-05,
"loss": 0.5199,
"step": 249000
},
{
"epoch": 0.5744569514593096,
"grad_norm": 0.9764407873153687,
"learning_rate": 2.1277152427034515e-05,
"loss": 0.4902,
"step": 249200
},
{
"epoch": 0.5749179923513316,
"grad_norm": 0.9825992584228516,
"learning_rate": 2.125410038243342e-05,
"loss": 0.5253,
"step": 249400
},
{
"epoch": 0.5753790332433535,
"grad_norm": 0.5447947978973389,
"learning_rate": 2.1231048337832323e-05,
"loss": 0.5162,
"step": 249600
},
{
"epoch": 0.5758400741353754,
"grad_norm": 1.0377503633499146,
"learning_rate": 2.120799629323123e-05,
"loss": 0.5193,
"step": 249800
},
{
"epoch": 0.5763011150273973,
"grad_norm": 0.5433443188667297,
"learning_rate": 2.1184944248630135e-05,
"loss": 0.5163,
"step": 250000
},
{
"epoch": 0.5763011150273973,
"eval_loss": 0.5065879821777344,
"eval_runtime": 144.2776,
"eval_samples_per_second": 30.372,
"eval_steps_per_second": 30.372,
"step": 250000
},
{
"epoch": 0.5767621559194193,
"grad_norm": 1.6914293766021729,
"learning_rate": 2.1161892204029037e-05,
"loss": 0.5304,
"step": 250200
},
{
"epoch": 0.5772231968114412,
"grad_norm": 1.1830875873565674,
"learning_rate": 2.1138840159427943e-05,
"loss": 0.518,
"step": 250400
},
{
"epoch": 0.5776842377034631,
"grad_norm": 1.4796136617660522,
"learning_rate": 2.1115788114826845e-05,
"loss": 0.525,
"step": 250600
},
{
"epoch": 0.578145278595485,
"grad_norm": 1.81144118309021,
"learning_rate": 2.109273607022575e-05,
"loss": 0.536,
"step": 250800
},
{
"epoch": 0.578606319487507,
"grad_norm": 1.3345705270767212,
"learning_rate": 2.1069684025624653e-05,
"loss": 0.4776,
"step": 251000
},
{
"epoch": 0.5790673603795289,
"grad_norm": 1.4617594480514526,
"learning_rate": 2.1046631981023555e-05,
"loss": 0.5112,
"step": 251200
},
{
"epoch": 0.5795284012715508,
"grad_norm": 1.4168286323547363,
"learning_rate": 2.1023579936422464e-05,
"loss": 0.5247,
"step": 251400
},
{
"epoch": 0.5799894421635727,
"grad_norm": 0.9052757024765015,
"learning_rate": 2.1000527891821367e-05,
"loss": 0.5189,
"step": 251600
},
{
"epoch": 0.5804504830555947,
"grad_norm": 1.7687321901321411,
"learning_rate": 2.0977475847220272e-05,
"loss": 0.4998,
"step": 251800
},
{
"epoch": 0.5809115239476166,
"grad_norm": 1.1558544635772705,
"learning_rate": 2.0954423802619175e-05,
"loss": 0.5648,
"step": 252000
},
{
"epoch": 0.5813725648396385,
"grad_norm": 1.4480737447738647,
"learning_rate": 2.0931371758018077e-05,
"loss": 0.5221,
"step": 252200
},
{
"epoch": 0.5818336057316603,
"grad_norm": 1.6768193244934082,
"learning_rate": 2.0908319713416982e-05,
"loss": 0.4758,
"step": 252400
},
{
"epoch": 0.5822946466236822,
"grad_norm": 1.9604754447937012,
"learning_rate": 2.0885267668815885e-05,
"loss": 0.5225,
"step": 252600
},
{
"epoch": 0.5827556875157042,
"grad_norm": 1.8727524280548096,
"learning_rate": 2.086221562421479e-05,
"loss": 0.5262,
"step": 252800
},
{
"epoch": 0.5832167284077261,
"grad_norm": 1.510044813156128,
"learning_rate": 2.0839163579613693e-05,
"loss": 0.5664,
"step": 253000
},
{
"epoch": 0.583677769299748,
"grad_norm": 1.9544621706008911,
"learning_rate": 2.0816111535012602e-05,
"loss": 0.5053,
"step": 253200
},
{
"epoch": 0.5841388101917699,
"grad_norm": 0.9827083349227905,
"learning_rate": 2.0793059490411504e-05,
"loss": 0.5479,
"step": 253400
},
{
"epoch": 0.5845998510837919,
"grad_norm": 2.2708816528320312,
"learning_rate": 2.0770007445810406e-05,
"loss": 0.5025,
"step": 253600
},
{
"epoch": 0.5850608919758138,
"grad_norm": 2.2587356567382812,
"learning_rate": 2.0746955401209312e-05,
"loss": 0.4923,
"step": 253800
},
{
"epoch": 0.5855219328678357,
"grad_norm": 1.3918339014053345,
"learning_rate": 2.0723903356608214e-05,
"loss": 0.4738,
"step": 254000
},
{
"epoch": 0.5859829737598576,
"grad_norm": 1.7613333463668823,
"learning_rate": 2.070085131200712e-05,
"loss": 0.4592,
"step": 254200
},
{
"epoch": 0.5864440146518796,
"grad_norm": 2.323390007019043,
"learning_rate": 2.0677799267406022e-05,
"loss": 0.4962,
"step": 254400
},
{
"epoch": 0.5869050555439015,
"grad_norm": 1.5669095516204834,
"learning_rate": 2.0654747222804928e-05,
"loss": 0.5616,
"step": 254600
},
{
"epoch": 0.5873660964359234,
"grad_norm": 1.5922577381134033,
"learning_rate": 2.0631695178203834e-05,
"loss": 0.494,
"step": 254800
},
{
"epoch": 0.5878271373279453,
"grad_norm": 1.2841917276382446,
"learning_rate": 2.0608643133602736e-05,
"loss": 0.4663,
"step": 255000
},
{
"epoch": 0.5882881782199673,
"grad_norm": 0.8427960872650146,
"learning_rate": 2.058559108900164e-05,
"loss": 0.5203,
"step": 255200
},
{
"epoch": 0.5887492191119892,
"grad_norm": 1.1014477014541626,
"learning_rate": 2.0562539044400544e-05,
"loss": 0.4983,
"step": 255400
},
{
"epoch": 0.5892102600040111,
"grad_norm": 0.7464996576309204,
"learning_rate": 2.053948699979945e-05,
"loss": 0.512,
"step": 255600
},
{
"epoch": 0.5896713008960329,
"grad_norm": 1.1050175428390503,
"learning_rate": 2.0516434955198352e-05,
"loss": 0.5039,
"step": 255800
},
{
"epoch": 0.5901323417880548,
"grad_norm": 1.4962995052337646,
"learning_rate": 2.0493382910597254e-05,
"loss": 0.4859,
"step": 256000
},
{
"epoch": 0.5905933826800768,
"grad_norm": 1.086658239364624,
"learning_rate": 2.047033086599616e-05,
"loss": 0.512,
"step": 256200
},
{
"epoch": 0.5910544235720987,
"grad_norm": 1.5740742683410645,
"learning_rate": 2.0447278821395062e-05,
"loss": 0.5017,
"step": 256400
},
{
"epoch": 0.5915154644641206,
"grad_norm": 1.2784602642059326,
"learning_rate": 2.042422677679397e-05,
"loss": 0.5347,
"step": 256600
},
{
"epoch": 0.5919765053561425,
"grad_norm": 1.1897175312042236,
"learning_rate": 2.0401174732192873e-05,
"loss": 0.5442,
"step": 256800
},
{
"epoch": 0.5924375462481645,
"grad_norm": 1.5644766092300415,
"learning_rate": 2.0378122687591776e-05,
"loss": 0.4957,
"step": 257000
},
{
"epoch": 0.5928985871401864,
"grad_norm": 1.350401520729065,
"learning_rate": 2.035507064299068e-05,
"loss": 0.4763,
"step": 257200
},
{
"epoch": 0.5933596280322083,
"grad_norm": 1.8206768035888672,
"learning_rate": 2.0332018598389584e-05,
"loss": 0.4756,
"step": 257400
},
{
"epoch": 0.5938206689242302,
"grad_norm": 1.9066009521484375,
"learning_rate": 2.030896655378849e-05,
"loss": 0.4968,
"step": 257600
},
{
"epoch": 0.5942817098162522,
"grad_norm": 0.9539717435836792,
"learning_rate": 2.028591450918739e-05,
"loss": 0.5454,
"step": 257800
},
{
"epoch": 0.5947427507082741,
"grad_norm": 1.8135906457901,
"learning_rate": 2.0262862464586297e-05,
"loss": 0.4961,
"step": 258000
},
{
"epoch": 0.595203791600296,
"grad_norm": 1.2675491571426392,
"learning_rate": 2.0239810419985203e-05,
"loss": 0.4997,
"step": 258200
},
{
"epoch": 0.5956648324923179,
"grad_norm": 0.6522994041442871,
"learning_rate": 2.0216758375384105e-05,
"loss": 0.5243,
"step": 258400
},
{
"epoch": 0.5961258733843399,
"grad_norm": 0.3235660791397095,
"learning_rate": 2.019370633078301e-05,
"loss": 0.4942,
"step": 258600
},
{
"epoch": 0.5965869142763618,
"grad_norm": 1.0544391870498657,
"learning_rate": 2.0170654286181913e-05,
"loss": 0.5452,
"step": 258800
},
{
"epoch": 0.5970479551683837,
"grad_norm": 2.637691020965576,
"learning_rate": 2.014760224158082e-05,
"loss": 0.5338,
"step": 259000
},
{
"epoch": 0.5975089960604055,
"grad_norm": 0.2857421934604645,
"learning_rate": 2.012455019697972e-05,
"loss": 0.5621,
"step": 259200
},
{
"epoch": 0.5979700369524275,
"grad_norm": 0.93863445520401,
"learning_rate": 2.0101498152378627e-05,
"loss": 0.5391,
"step": 259400
},
{
"epoch": 0.5984310778444494,
"grad_norm": 0.6566616892814636,
"learning_rate": 2.007844610777753e-05,
"loss": 0.5247,
"step": 259600
},
{
"epoch": 0.5988921187364713,
"grad_norm": 1.3079489469528198,
"learning_rate": 2.005539406317643e-05,
"loss": 0.5031,
"step": 259800
},
{
"epoch": 0.5993531596284932,
"grad_norm": 0.5705758333206177,
"learning_rate": 2.003234201857534e-05,
"loss": 0.5046,
"step": 260000
},
{
"epoch": 0.5998142005205152,
"grad_norm": 1.439122200012207,
"learning_rate": 2.0009289973974243e-05,
"loss": 0.4972,
"step": 260200
},
{
"epoch": 0.6002752414125371,
"grad_norm": 0.7958211302757263,
"learning_rate": 1.998623792937315e-05,
"loss": 0.5172,
"step": 260400
},
{
"epoch": 0.600736282304559,
"grad_norm": 1.4362818002700806,
"learning_rate": 1.996318588477205e-05,
"loss": 0.5031,
"step": 260600
},
{
"epoch": 0.6011973231965809,
"grad_norm": 1.128711462020874,
"learning_rate": 1.9940133840170953e-05,
"loss": 0.5035,
"step": 260800
},
{
"epoch": 0.6016583640886028,
"grad_norm": 0.9221576452255249,
"learning_rate": 1.991708179556986e-05,
"loss": 0.5039,
"step": 261000
},
{
"epoch": 0.6021194049806248,
"grad_norm": 1.0171575546264648,
"learning_rate": 1.989402975096876e-05,
"loss": 0.5009,
"step": 261200
},
{
"epoch": 0.6025804458726467,
"grad_norm": 1.2728921175003052,
"learning_rate": 1.9870977706367667e-05,
"loss": 0.56,
"step": 261400
},
{
"epoch": 0.6030414867646686,
"grad_norm": 0.6258471012115479,
"learning_rate": 1.9847925661766572e-05,
"loss": 0.5025,
"step": 261600
},
{
"epoch": 0.6035025276566905,
"grad_norm": 1.2376896142959595,
"learning_rate": 1.9824873617165478e-05,
"loss": 0.5488,
"step": 261800
},
{
"epoch": 0.6039635685487125,
"grad_norm": 1.5317405462265015,
"learning_rate": 1.980182157256438e-05,
"loss": 0.5412,
"step": 262000
},
{
"epoch": 0.6044246094407344,
"grad_norm": 2.4922080039978027,
"learning_rate": 1.9778769527963283e-05,
"loss": 0.5095,
"step": 262200
},
{
"epoch": 0.6048856503327562,
"grad_norm": 0.9650156497955322,
"learning_rate": 1.975571748336219e-05,
"loss": 0.5217,
"step": 262400
},
{
"epoch": 0.6053466912247781,
"grad_norm": 1.3613967895507812,
"learning_rate": 1.973266543876109e-05,
"loss": 0.5102,
"step": 262600
},
{
"epoch": 0.6058077321168001,
"grad_norm": 1.0593500137329102,
"learning_rate": 1.9709613394159996e-05,
"loss": 0.5723,
"step": 262800
},
{
"epoch": 0.606268773008822,
"grad_norm": 1.8354504108428955,
"learning_rate": 1.96865613495589e-05,
"loss": 0.5125,
"step": 263000
},
{
"epoch": 0.6067298139008439,
"grad_norm": 2.131420373916626,
"learning_rate": 1.9663509304957804e-05,
"loss": 0.5214,
"step": 263200
},
{
"epoch": 0.6071908547928658,
"grad_norm": 1.4709240198135376,
"learning_rate": 1.964045726035671e-05,
"loss": 0.4658,
"step": 263400
},
{
"epoch": 0.6076518956848878,
"grad_norm": 1.3069663047790527,
"learning_rate": 1.9617405215755612e-05,
"loss": 0.5497,
"step": 263600
},
{
"epoch": 0.6081129365769097,
"grad_norm": 0.6274604797363281,
"learning_rate": 1.9594353171154518e-05,
"loss": 0.5266,
"step": 263800
},
{
"epoch": 0.6085739774689316,
"grad_norm": 0.9188045263290405,
"learning_rate": 1.957130112655342e-05,
"loss": 0.5668,
"step": 264000
},
{
"epoch": 0.6090350183609535,
"grad_norm": 0.5703033804893494,
"learning_rate": 1.9548249081952326e-05,
"loss": 0.4844,
"step": 264200
},
{
"epoch": 0.6094960592529755,
"grad_norm": 2.1700258255004883,
"learning_rate": 1.9525197037351228e-05,
"loss": 0.5282,
"step": 264400
},
{
"epoch": 0.6099571001449974,
"grad_norm": 1.3549532890319824,
"learning_rate": 1.950214499275013e-05,
"loss": 0.4734,
"step": 264600
},
{
"epoch": 0.6104181410370193,
"grad_norm": 1.7705378532409668,
"learning_rate": 1.9479092948149036e-05,
"loss": 0.496,
"step": 264800
},
{
"epoch": 0.6108791819290412,
"grad_norm": 0.578196108341217,
"learning_rate": 1.9456040903547942e-05,
"loss": 0.5078,
"step": 265000
},
{
"epoch": 0.6113402228210632,
"grad_norm": 1.1500052213668823,
"learning_rate": 1.9432988858946847e-05,
"loss": 0.5196,
"step": 265200
},
{
"epoch": 0.6118012637130851,
"grad_norm": 1.3695541620254517,
"learning_rate": 1.940993681434575e-05,
"loss": 0.4915,
"step": 265400
},
{
"epoch": 0.612262304605107,
"grad_norm": 0.8905289173126221,
"learning_rate": 1.9386884769744655e-05,
"loss": 0.4662,
"step": 265600
},
{
"epoch": 0.6127233454971288,
"grad_norm": 2.054939031600952,
"learning_rate": 1.9363832725143558e-05,
"loss": 0.4665,
"step": 265800
},
{
"epoch": 0.6131843863891507,
"grad_norm": 1.421302080154419,
"learning_rate": 1.934078068054246e-05,
"loss": 0.5074,
"step": 266000
},
{
"epoch": 0.6136454272811727,
"grad_norm": 1.0554801225662231,
"learning_rate": 1.9317728635941366e-05,
"loss": 0.5406,
"step": 266200
},
{
"epoch": 0.6141064681731946,
"grad_norm": 1.5464704036712646,
"learning_rate": 1.9294676591340268e-05,
"loss": 0.5273,
"step": 266400
},
{
"epoch": 0.6145675090652165,
"grad_norm": 2.142878293991089,
"learning_rate": 1.9271624546739174e-05,
"loss": 0.5035,
"step": 266600
},
{
"epoch": 0.6150285499572384,
"grad_norm": 2.7854163646698,
"learning_rate": 1.924857250213808e-05,
"loss": 0.4915,
"step": 266800
},
{
"epoch": 0.6154895908492604,
"grad_norm": 1.0420928001403809,
"learning_rate": 1.922552045753698e-05,
"loss": 0.5025,
"step": 267000
},
{
"epoch": 0.6159506317412823,
"grad_norm": 1.2104905843734741,
"learning_rate": 1.9202468412935887e-05,
"loss": 0.5118,
"step": 267200
},
{
"epoch": 0.6164116726333042,
"grad_norm": 1.4268879890441895,
"learning_rate": 1.917941636833479e-05,
"loss": 0.5147,
"step": 267400
},
{
"epoch": 0.6168727135253261,
"grad_norm": 1.690464973449707,
"learning_rate": 1.9156364323733695e-05,
"loss": 0.4835,
"step": 267600
},
{
"epoch": 0.6173337544173481,
"grad_norm": 1.919801115989685,
"learning_rate": 1.9133312279132598e-05,
"loss": 0.5243,
"step": 267800
},
{
"epoch": 0.61779479530937,
"grad_norm": 0.6003401875495911,
"learning_rate": 1.9110260234531503e-05,
"loss": 0.4886,
"step": 268000
},
{
"epoch": 0.6182558362013919,
"grad_norm": 1.350727915763855,
"learning_rate": 1.9087208189930405e-05,
"loss": 0.5038,
"step": 268200
},
{
"epoch": 0.6187168770934138,
"grad_norm": 0.8154557347297668,
"learning_rate": 1.906415614532931e-05,
"loss": 0.5153,
"step": 268400
},
{
"epoch": 0.6191779179854358,
"grad_norm": 0.5474942326545715,
"learning_rate": 1.9041104100728217e-05,
"loss": 0.5569,
"step": 268600
},
{
"epoch": 0.6196389588774577,
"grad_norm": 0.8887852430343628,
"learning_rate": 1.901805205612712e-05,
"loss": 0.5289,
"step": 268800
},
{
"epoch": 0.6200999997694796,
"grad_norm": 0.9565109014511108,
"learning_rate": 1.8995000011526025e-05,
"loss": 0.5446,
"step": 269000
},
{
"epoch": 0.6205610406615014,
"grad_norm": 1.2200897932052612,
"learning_rate": 1.8971947966924927e-05,
"loss": 0.4689,
"step": 269200
},
{
"epoch": 0.6210220815535233,
"grad_norm": 0.5202858448028564,
"learning_rate": 1.894889592232383e-05,
"loss": 0.5117,
"step": 269400
},
{
"epoch": 0.6214831224455453,
"grad_norm": 1.1108614206314087,
"learning_rate": 1.8925843877722735e-05,
"loss": 0.5495,
"step": 269600
},
{
"epoch": 0.6219441633375672,
"grad_norm": 0.7820692658424377,
"learning_rate": 1.8902791833121637e-05,
"loss": 0.5489,
"step": 269800
},
{
"epoch": 0.6224052042295891,
"grad_norm": 0.5939005613327026,
"learning_rate": 1.8879739788520543e-05,
"loss": 0.5139,
"step": 270000
},
{
"epoch": 0.622866245121611,
"grad_norm": 0.809594452381134,
"learning_rate": 1.885668774391945e-05,
"loss": 0.5195,
"step": 270200
},
{
"epoch": 0.623327286013633,
"grad_norm": 1.638484001159668,
"learning_rate": 1.8833635699318354e-05,
"loss": 0.487,
"step": 270400
},
{
"epoch": 0.6237883269056549,
"grad_norm": 1.4749358892440796,
"learning_rate": 1.8810583654717257e-05,
"loss": 0.5058,
"step": 270600
},
{
"epoch": 0.6242493677976768,
"grad_norm": 0.8880025744438171,
"learning_rate": 1.878753161011616e-05,
"loss": 0.513,
"step": 270800
},
{
"epoch": 0.6247104086896987,
"grad_norm": 0.9958152174949646,
"learning_rate": 1.8764479565515065e-05,
"loss": 0.5261,
"step": 271000
},
{
"epoch": 0.6251714495817207,
"grad_norm": 1.6274564266204834,
"learning_rate": 1.8741427520913967e-05,
"loss": 0.5416,
"step": 271200
},
{
"epoch": 0.6256324904737426,
"grad_norm": 1.5362344980239868,
"learning_rate": 1.8718375476312873e-05,
"loss": 0.513,
"step": 271400
},
{
"epoch": 0.6260935313657645,
"grad_norm": 0.9581994414329529,
"learning_rate": 1.8695323431711775e-05,
"loss": 0.505,
"step": 271600
},
{
"epoch": 0.6265545722577864,
"grad_norm": 1.3298275470733643,
"learning_rate": 1.867227138711068e-05,
"loss": 0.4829,
"step": 271800
},
{
"epoch": 0.6270156131498084,
"grad_norm": 1.5617239475250244,
"learning_rate": 1.8649219342509586e-05,
"loss": 0.5669,
"step": 272000
},
{
"epoch": 0.6274766540418303,
"grad_norm": 1.6053404808044434,
"learning_rate": 1.862616729790849e-05,
"loss": 0.5203,
"step": 272200
},
{
"epoch": 0.6279376949338522,
"grad_norm": 1.7851396799087524,
"learning_rate": 1.8603115253307394e-05,
"loss": 0.512,
"step": 272400
},
{
"epoch": 0.628398735825874,
"grad_norm": 1.3142194747924805,
"learning_rate": 1.8580063208706296e-05,
"loss": 0.5205,
"step": 272600
},
{
"epoch": 0.628859776717896,
"grad_norm": 1.7642301321029663,
"learning_rate": 1.8557011164105202e-05,
"loss": 0.5199,
"step": 272800
},
{
"epoch": 0.6293208176099179,
"grad_norm": 1.0019512176513672,
"learning_rate": 1.8533959119504104e-05,
"loss": 0.5,
"step": 273000
},
{
"epoch": 0.6297818585019398,
"grad_norm": 1.3982213735580444,
"learning_rate": 1.8510907074903007e-05,
"loss": 0.4773,
"step": 273200
},
{
"epoch": 0.6302428993939617,
"grad_norm": 0.6312654614448547,
"learning_rate": 1.8487855030301912e-05,
"loss": 0.4853,
"step": 273400
},
{
"epoch": 0.6307039402859836,
"grad_norm": 1.554456353187561,
"learning_rate": 1.8464802985700818e-05,
"loss": 0.5074,
"step": 273600
},
{
"epoch": 0.6311649811780056,
"grad_norm": 1.26462984085083,
"learning_rate": 1.8441750941099724e-05,
"loss": 0.482,
"step": 273800
},
{
"epoch": 0.6316260220700275,
"grad_norm": 1.3933197259902954,
"learning_rate": 1.8418698896498626e-05,
"loss": 0.5149,
"step": 274000
},
{
"epoch": 0.6320870629620494,
"grad_norm": 1.4466843605041504,
"learning_rate": 1.839564685189753e-05,
"loss": 0.5609,
"step": 274200
},
{
"epoch": 0.6325481038540713,
"grad_norm": 0.9413987398147583,
"learning_rate": 1.8372594807296434e-05,
"loss": 0.5348,
"step": 274400
},
{
"epoch": 0.6330091447460933,
"grad_norm": 2.5217905044555664,
"learning_rate": 1.8349542762695336e-05,
"loss": 0.483,
"step": 274600
},
{
"epoch": 0.6334701856381152,
"grad_norm": 1.803232192993164,
"learning_rate": 1.8326490718094242e-05,
"loss": 0.5096,
"step": 274800
},
{
"epoch": 0.6339312265301371,
"grad_norm": 1.1358133554458618,
"learning_rate": 1.8303438673493144e-05,
"loss": 0.5148,
"step": 275000
},
{
"epoch": 0.634392267422159,
"grad_norm": 1.4829622507095337,
"learning_rate": 1.8280386628892053e-05,
"loss": 0.5048,
"step": 275200
},
{
"epoch": 0.634853308314181,
"grad_norm": 1.8766462802886963,
"learning_rate": 1.8257334584290956e-05,
"loss": 0.501,
"step": 275400
},
{
"epoch": 0.6353143492062029,
"grad_norm": 1.7556136846542358,
"learning_rate": 1.8234282539689858e-05,
"loss": 0.4652,
"step": 275600
},
{
"epoch": 0.6357753900982248,
"grad_norm": 1.6334820985794067,
"learning_rate": 1.8211230495088764e-05,
"loss": 0.5093,
"step": 275800
},
{
"epoch": 0.6362364309902466,
"grad_norm": 0.6144605875015259,
"learning_rate": 1.8188178450487666e-05,
"loss": 0.4899,
"step": 276000
},
{
"epoch": 0.6366974718822686,
"grad_norm": 0.49530643224716187,
"learning_rate": 1.816512640588657e-05,
"loss": 0.5037,
"step": 276200
},
{
"epoch": 0.6371585127742905,
"grad_norm": 0.8908922672271729,
"learning_rate": 1.8142074361285474e-05,
"loss": 0.4913,
"step": 276400
},
{
"epoch": 0.6376195536663124,
"grad_norm": 0.7277461290359497,
"learning_rate": 1.811902231668438e-05,
"loss": 0.509,
"step": 276600
},
{
"epoch": 0.6380805945583343,
"grad_norm": 1.4402283430099487,
"learning_rate": 1.8095970272083285e-05,
"loss": 0.5063,
"step": 276800
},
{
"epoch": 0.6385416354503562,
"grad_norm": 1.40396249294281,
"learning_rate": 1.8072918227482187e-05,
"loss": 0.5368,
"step": 277000
},
{
"epoch": 0.6390026763423782,
"grad_norm": 1.9143671989440918,
"learning_rate": 1.8049866182881093e-05,
"loss": 0.5159,
"step": 277200
},
{
"epoch": 0.6394637172344001,
"grad_norm": 1.0167429447174072,
"learning_rate": 1.8026814138279995e-05,
"loss": 0.4895,
"step": 277400
},
{
"epoch": 0.639924758126422,
"grad_norm": 1.2387683391571045,
"learning_rate": 1.80037620936789e-05,
"loss": 0.4982,
"step": 277600
},
{
"epoch": 0.640385799018444,
"grad_norm": 1.7970925569534302,
"learning_rate": 1.7980710049077803e-05,
"loss": 0.4749,
"step": 277800
},
{
"epoch": 0.6408468399104659,
"grad_norm": 1.2486504316329956,
"learning_rate": 1.7957658004476706e-05,
"loss": 0.5113,
"step": 278000
},
{
"epoch": 0.6413078808024878,
"grad_norm": 0.9315382838249207,
"learning_rate": 1.793460595987561e-05,
"loss": 0.5022,
"step": 278200
},
{
"epoch": 0.6417689216945097,
"grad_norm": 1.3397549390792847,
"learning_rate": 1.7911553915274514e-05,
"loss": 0.5411,
"step": 278400
},
{
"epoch": 0.6422299625865316,
"grad_norm": 1.5810282230377197,
"learning_rate": 1.7888501870673423e-05,
"loss": 0.5015,
"step": 278600
},
{
"epoch": 0.6426910034785536,
"grad_norm": 0.9700754284858704,
"learning_rate": 1.7865449826072325e-05,
"loss": 0.5014,
"step": 278800
},
{
"epoch": 0.6431520443705755,
"grad_norm": 1.5773003101348877,
"learning_rate": 1.784239778147123e-05,
"loss": 0.5001,
"step": 279000
},
{
"epoch": 0.6436130852625974,
"grad_norm": 1.5198345184326172,
"learning_rate": 1.7819345736870133e-05,
"loss": 0.4951,
"step": 279200
},
{
"epoch": 0.6440741261546192,
"grad_norm": 0.9884507060050964,
"learning_rate": 1.7796293692269035e-05,
"loss": 0.5342,
"step": 279400
},
{
"epoch": 0.6445351670466412,
"grad_norm": 0.6419351696968079,
"learning_rate": 1.777324164766794e-05,
"loss": 0.4904,
"step": 279600
},
{
"epoch": 0.6449962079386631,
"grad_norm": 1.171769618988037,
"learning_rate": 1.7750189603066843e-05,
"loss": 0.5071,
"step": 279800
},
{
"epoch": 0.645457248830685,
"grad_norm": 1.362993836402893,
"learning_rate": 1.772713755846575e-05,
"loss": 0.5205,
"step": 280000
},
{
"epoch": 0.6459182897227069,
"grad_norm": 1.8605279922485352,
"learning_rate": 1.7704085513864655e-05,
"loss": 0.5206,
"step": 280200
},
{
"epoch": 0.6463793306147289,
"grad_norm": 1.154487133026123,
"learning_rate": 1.7681033469263557e-05,
"loss": 0.5846,
"step": 280400
},
{
"epoch": 0.6468403715067508,
"grad_norm": 1.5201776027679443,
"learning_rate": 1.7657981424662462e-05,
"loss": 0.4864,
"step": 280600
},
{
"epoch": 0.6473014123987727,
"grad_norm": 1.0261558294296265,
"learning_rate": 1.7634929380061365e-05,
"loss": 0.4937,
"step": 280800
},
{
"epoch": 0.6477624532907946,
"grad_norm": 0.9769271612167358,
"learning_rate": 1.761187733546027e-05,
"loss": 0.4865,
"step": 281000
},
{
"epoch": 0.6482234941828166,
"grad_norm": 1.5987550020217896,
"learning_rate": 1.7588825290859173e-05,
"loss": 0.5231,
"step": 281200
},
{
"epoch": 0.6486845350748385,
"grad_norm": 1.0639326572418213,
"learning_rate": 1.756577324625808e-05,
"loss": 0.5197,
"step": 281400
},
{
"epoch": 0.6491455759668604,
"grad_norm": 2.6763956546783447,
"learning_rate": 1.754272120165698e-05,
"loss": 0.5459,
"step": 281600
},
{
"epoch": 0.6496066168588823,
"grad_norm": 0.49132779240608215,
"learning_rate": 1.7519669157055883e-05,
"loss": 0.518,
"step": 281800
},
{
"epoch": 0.6500676577509042,
"grad_norm": 1.8411035537719727,
"learning_rate": 1.7496617112454792e-05,
"loss": 0.4867,
"step": 282000
},
{
"epoch": 0.6505286986429262,
"grad_norm": 0.6566082835197449,
"learning_rate": 1.7473565067853694e-05,
"loss": 0.5135,
"step": 282200
},
{
"epoch": 0.6509897395349481,
"grad_norm": 1.3667335510253906,
"learning_rate": 1.74505130232526e-05,
"loss": 0.5306,
"step": 282400
},
{
"epoch": 0.65145078042697,
"grad_norm": 1.3689517974853516,
"learning_rate": 1.7427460978651502e-05,
"loss": 0.4903,
"step": 282600
},
{
"epoch": 0.6519118213189918,
"grad_norm": 1.0682365894317627,
"learning_rate": 1.7404408934050408e-05,
"loss": 0.5197,
"step": 282800
},
{
"epoch": 0.6523728622110138,
"grad_norm": 0.9434696435928345,
"learning_rate": 1.738135688944931e-05,
"loss": 0.5309,
"step": 283000
},
{
"epoch": 0.6528339031030357,
"grad_norm": 1.378448724746704,
"learning_rate": 1.7358304844848213e-05,
"loss": 0.4943,
"step": 283200
},
{
"epoch": 0.6532949439950576,
"grad_norm": 1.0012249946594238,
"learning_rate": 1.7335252800247118e-05,
"loss": 0.5066,
"step": 283400
},
{
"epoch": 0.6537559848870795,
"grad_norm": 2.5924713611602783,
"learning_rate": 1.7312200755646024e-05,
"loss": 0.4943,
"step": 283600
},
{
"epoch": 0.6542170257791015,
"grad_norm": 1.0362581014633179,
"learning_rate": 1.728914871104493e-05,
"loss": 0.5225,
"step": 283800
},
{
"epoch": 0.6546780666711234,
"grad_norm": 2.9695885181427,
"learning_rate": 1.7266096666443832e-05,
"loss": 0.516,
"step": 284000
},
{
"epoch": 0.6551391075631453,
"grad_norm": 1.1434212923049927,
"learning_rate": 1.7243044621842734e-05,
"loss": 0.51,
"step": 284200
},
{
"epoch": 0.6556001484551672,
"grad_norm": 0.8968667387962341,
"learning_rate": 1.721999257724164e-05,
"loss": 0.4848,
"step": 284400
},
{
"epoch": 0.6560611893471892,
"grad_norm": 6.086385726928711,
"learning_rate": 1.7196940532640542e-05,
"loss": 0.5094,
"step": 284600
},
{
"epoch": 0.6565222302392111,
"grad_norm": 1.7994771003723145,
"learning_rate": 1.7173888488039448e-05,
"loss": 0.5308,
"step": 284800
},
{
"epoch": 0.656983271131233,
"grad_norm": 1.471977949142456,
"learning_rate": 1.715083644343835e-05,
"loss": 0.4866,
"step": 285000
},
{
"epoch": 0.6574443120232549,
"grad_norm": 0.9150500893592834,
"learning_rate": 1.7127784398837256e-05,
"loss": 0.4983,
"step": 285200
},
{
"epoch": 0.6579053529152769,
"grad_norm": 1.0636359453201294,
"learning_rate": 1.710473235423616e-05,
"loss": 0.5032,
"step": 285400
},
{
"epoch": 0.6583663938072988,
"grad_norm": 1.005440354347229,
"learning_rate": 1.7081680309635064e-05,
"loss": 0.5163,
"step": 285600
},
{
"epoch": 0.6588274346993207,
"grad_norm": 0.7577878832817078,
"learning_rate": 1.705862826503397e-05,
"loss": 0.4763,
"step": 285800
},
{
"epoch": 0.6592884755913426,
"grad_norm": 1.632212519645691,
"learning_rate": 1.703557622043287e-05,
"loss": 0.477,
"step": 286000
},
{
"epoch": 0.6597495164833644,
"grad_norm": 0.42119720578193665,
"learning_rate": 1.7012524175831777e-05,
"loss": 0.5244,
"step": 286200
},
{
"epoch": 0.6602105573753864,
"grad_norm": 1.7082394361495972,
"learning_rate": 1.698947213123068e-05,
"loss": 0.4961,
"step": 286400
},
{
"epoch": 0.6606715982674083,
"grad_norm": 1.360280990600586,
"learning_rate": 1.6966420086629582e-05,
"loss": 0.5161,
"step": 286600
},
{
"epoch": 0.6611326391594302,
"grad_norm": 1.266839623451233,
"learning_rate": 1.6943368042028488e-05,
"loss": 0.4477,
"step": 286800
},
{
"epoch": 0.6615936800514521,
"grad_norm": 0.5453054308891296,
"learning_rate": 1.6920315997427393e-05,
"loss": 0.4772,
"step": 287000
},
{
"epoch": 0.6620547209434741,
"grad_norm": 1.4255741834640503,
"learning_rate": 1.68972639528263e-05,
"loss": 0.4471,
"step": 287200
},
{
"epoch": 0.662515761835496,
"grad_norm": 2.048753261566162,
"learning_rate": 1.68742119082252e-05,
"loss": 0.4945,
"step": 287400
},
{
"epoch": 0.6629768027275179,
"grad_norm": 1.00551176071167,
"learning_rate": 1.6851159863624107e-05,
"loss": 0.5258,
"step": 287600
},
{
"epoch": 0.6634378436195398,
"grad_norm": 1.403394103050232,
"learning_rate": 1.682810781902301e-05,
"loss": 0.515,
"step": 287800
},
{
"epoch": 0.6638988845115618,
"grad_norm": 1.374613881111145,
"learning_rate": 1.680505577442191e-05,
"loss": 0.504,
"step": 288000
},
{
"epoch": 0.6643599254035837,
"grad_norm": 0.9842983484268188,
"learning_rate": 1.6782003729820817e-05,
"loss": 0.536,
"step": 288200
},
{
"epoch": 0.6648209662956056,
"grad_norm": 1.1047396659851074,
"learning_rate": 1.675895168521972e-05,
"loss": 0.501,
"step": 288400
},
{
"epoch": 0.6652820071876275,
"grad_norm": 0.7167093753814697,
"learning_rate": 1.6735899640618625e-05,
"loss": 0.5139,
"step": 288600
},
{
"epoch": 0.6657430480796495,
"grad_norm": 2.0152106285095215,
"learning_rate": 1.671284759601753e-05,
"loss": 0.4873,
"step": 288800
},
{
"epoch": 0.6662040889716714,
"grad_norm": 2.2245209217071533,
"learning_rate": 1.6689795551416433e-05,
"loss": 0.5077,
"step": 289000
},
{
"epoch": 0.6666651298636933,
"grad_norm": 1.8077071905136108,
"learning_rate": 1.666674350681534e-05,
"loss": 0.5168,
"step": 289200
},
{
"epoch": 0.6671261707557152,
"grad_norm": 2.8042407035827637,
"learning_rate": 1.664369146221424e-05,
"loss": 0.5174,
"step": 289400
},
{
"epoch": 0.667587211647737,
"grad_norm": 0.7965187430381775,
"learning_rate": 1.6620639417613147e-05,
"loss": 0.4988,
"step": 289600
},
{
"epoch": 0.668048252539759,
"grad_norm": 0.6338868141174316,
"learning_rate": 1.659758737301205e-05,
"loss": 0.5782,
"step": 289800
},
{
"epoch": 0.6685092934317809,
"grad_norm": 1.7595531940460205,
"learning_rate": 1.6574535328410955e-05,
"loss": 0.4831,
"step": 290000
},
{
"epoch": 0.6689703343238028,
"grad_norm": 1.2702540159225464,
"learning_rate": 1.6551483283809857e-05,
"loss": 0.4689,
"step": 290200
},
{
"epoch": 0.6694313752158247,
"grad_norm": 0.9792807102203369,
"learning_rate": 1.6528431239208763e-05,
"loss": 0.5161,
"step": 290400
},
{
"epoch": 0.6698924161078467,
"grad_norm": 1.6363322734832764,
"learning_rate": 1.650537919460767e-05,
"loss": 0.5315,
"step": 290600
},
{
"epoch": 0.6703534569998686,
"grad_norm": 1.1259363889694214,
"learning_rate": 1.648232715000657e-05,
"loss": 0.5286,
"step": 290800
},
{
"epoch": 0.6708144978918905,
"grad_norm": 1.2707172632217407,
"learning_rate": 1.6459275105405476e-05,
"loss": 0.4925,
"step": 291000
},
{
"epoch": 0.6712755387839124,
"grad_norm": 1.0751131772994995,
"learning_rate": 1.643622306080438e-05,
"loss": 0.4835,
"step": 291200
},
{
"epoch": 0.6717365796759344,
"grad_norm": 0.9899608492851257,
"learning_rate": 1.6413171016203284e-05,
"loss": 0.4812,
"step": 291400
},
{
"epoch": 0.6721976205679563,
"grad_norm": 3.855407238006592,
"learning_rate": 1.6390118971602187e-05,
"loss": 0.5086,
"step": 291600
},
{
"epoch": 0.6726586614599782,
"grad_norm": 1.1831018924713135,
"learning_rate": 1.636706692700109e-05,
"loss": 0.5044,
"step": 291800
},
{
"epoch": 0.6731197023520001,
"grad_norm": 0.9542708396911621,
"learning_rate": 1.6344014882399994e-05,
"loss": 0.5374,
"step": 292000
},
{
"epoch": 0.6735807432440221,
"grad_norm": 1.1548891067504883,
"learning_rate": 1.63209628377989e-05,
"loss": 0.537,
"step": 292200
},
{
"epoch": 0.674041784136044,
"grad_norm": 0.7885655760765076,
"learning_rate": 1.6297910793197806e-05,
"loss": 0.4424,
"step": 292400
},
{
"epoch": 0.6745028250280659,
"grad_norm": 0.3185381293296814,
"learning_rate": 1.6274858748596708e-05,
"loss": 0.4631,
"step": 292600
},
{
"epoch": 0.6749638659200878,
"grad_norm": 1.5828882455825806,
"learning_rate": 1.625180670399561e-05,
"loss": 0.4709,
"step": 292800
},
{
"epoch": 0.6754249068121096,
"grad_norm": 1.0387425422668457,
"learning_rate": 1.6228754659394516e-05,
"loss": 0.5046,
"step": 293000
},
{
"epoch": 0.6758859477041316,
"grad_norm": 0.9464387893676758,
"learning_rate": 1.620570261479342e-05,
"loss": 0.4864,
"step": 293200
},
{
"epoch": 0.6763469885961535,
"grad_norm": 2.105416774749756,
"learning_rate": 1.6182650570192324e-05,
"loss": 0.4753,
"step": 293400
},
{
"epoch": 0.6768080294881754,
"grad_norm": 19.655559539794922,
"learning_rate": 1.6159598525591226e-05,
"loss": 0.5156,
"step": 293600
},
{
"epoch": 0.6772690703801973,
"grad_norm": 0.9485812187194824,
"learning_rate": 1.6136546480990132e-05,
"loss": 0.4566,
"step": 293800
},
{
"epoch": 0.6777301112722193,
"grad_norm": 2.1423091888427734,
"learning_rate": 1.6113494436389038e-05,
"loss": 0.4994,
"step": 294000
},
{
"epoch": 0.6781911521642412,
"grad_norm": 1.1267365217208862,
"learning_rate": 1.609044239178794e-05,
"loss": 0.4647,
"step": 294200
},
{
"epoch": 0.6786521930562631,
"grad_norm": 1.5974739789962769,
"learning_rate": 1.6067390347186846e-05,
"loss": 0.4851,
"step": 294400
},
{
"epoch": 0.679113233948285,
"grad_norm": 1.6099416017532349,
"learning_rate": 1.6044338302585748e-05,
"loss": 0.5076,
"step": 294600
},
{
"epoch": 0.679574274840307,
"grad_norm": 2.5845448970794678,
"learning_rate": 1.6021286257984654e-05,
"loss": 0.4898,
"step": 294800
},
{
"epoch": 0.6800353157323289,
"grad_norm": 2.4938390254974365,
"learning_rate": 1.5998234213383556e-05,
"loss": 0.5057,
"step": 295000
},
{
"epoch": 0.6804963566243508,
"grad_norm": 1.8456722497940063,
"learning_rate": 1.5975182168782458e-05,
"loss": 0.5114,
"step": 295200
},
{
"epoch": 0.6809573975163727,
"grad_norm": 1.0706640481948853,
"learning_rate": 1.5952130124181364e-05,
"loss": 0.5209,
"step": 295400
},
{
"epoch": 0.6814184384083947,
"grad_norm": 3.961984872817993,
"learning_rate": 1.592907807958027e-05,
"loss": 0.4766,
"step": 295600
},
{
"epoch": 0.6818794793004166,
"grad_norm": 1.8537254333496094,
"learning_rate": 1.5906026034979175e-05,
"loss": 0.5056,
"step": 295800
},
{
"epoch": 0.6823405201924385,
"grad_norm": 1.2177605628967285,
"learning_rate": 1.5882973990378077e-05,
"loss": 0.4694,
"step": 296000
},
{
"epoch": 0.6828015610844604,
"grad_norm": 4.802238464355469,
"learning_rate": 1.5859921945776983e-05,
"loss": 0.4912,
"step": 296200
},
{
"epoch": 0.6832626019764823,
"grad_norm": 1.457472801208496,
"learning_rate": 1.5836869901175885e-05,
"loss": 0.5113,
"step": 296400
},
{
"epoch": 0.6837236428685042,
"grad_norm": 1.4785571098327637,
"learning_rate": 1.5813817856574788e-05,
"loss": 0.5273,
"step": 296600
},
{
"epoch": 0.6841846837605261,
"grad_norm": 1.4524779319763184,
"learning_rate": 1.5790765811973693e-05,
"loss": 0.5218,
"step": 296800
},
{
"epoch": 0.684645724652548,
"grad_norm": 0.7074722051620483,
"learning_rate": 1.5767713767372596e-05,
"loss": 0.4772,
"step": 297000
},
{
"epoch": 0.68510676554457,
"grad_norm": 2.3584671020507812,
"learning_rate": 1.5744661722771505e-05,
"loss": 0.4854,
"step": 297200
},
{
"epoch": 0.6855678064365919,
"grad_norm": 0.7205916047096252,
"learning_rate": 1.5721609678170407e-05,
"loss": 0.5049,
"step": 297400
},
{
"epoch": 0.6860288473286138,
"grad_norm": 1.152288794517517,
"learning_rate": 1.569855763356931e-05,
"loss": 0.4726,
"step": 297600
},
{
"epoch": 0.6864898882206357,
"grad_norm": 1.2458863258361816,
"learning_rate": 1.5675505588968215e-05,
"loss": 0.5022,
"step": 297800
},
{
"epoch": 0.6869509291126576,
"grad_norm": 0.4532303214073181,
"learning_rate": 1.5652453544367117e-05,
"loss": 0.4986,
"step": 298000
},
{
"epoch": 0.6874119700046796,
"grad_norm": 1.452418327331543,
"learning_rate": 1.5629401499766023e-05,
"loss": 0.529,
"step": 298200
},
{
"epoch": 0.6878730108967015,
"grad_norm": 0.909852921962738,
"learning_rate": 1.5606349455164925e-05,
"loss": 0.4958,
"step": 298400
},
{
"epoch": 0.6883340517887234,
"grad_norm": 1.39362370967865,
"learning_rate": 1.558329741056383e-05,
"loss": 0.5138,
"step": 298600
},
{
"epoch": 0.6887950926807453,
"grad_norm": 1.186716914176941,
"learning_rate": 1.5560245365962737e-05,
"loss": 0.489,
"step": 298800
},
{
"epoch": 0.6892561335727673,
"grad_norm": 1.4374350309371948,
"learning_rate": 1.553719332136164e-05,
"loss": 0.476,
"step": 299000
},
{
"epoch": 0.6897171744647892,
"grad_norm": 1.2326973676681519,
"learning_rate": 1.5514141276760545e-05,
"loss": 0.5138,
"step": 299200
},
{
"epoch": 0.6901782153568111,
"grad_norm": 2.208893299102783,
"learning_rate": 1.5491089232159447e-05,
"loss": 0.5194,
"step": 299400
},
{
"epoch": 0.690639256248833,
"grad_norm": 2.6161091327667236,
"learning_rate": 1.5468037187558353e-05,
"loss": 0.5107,
"step": 299600
},
{
"epoch": 0.6911002971408549,
"grad_norm": 0.7406659126281738,
"learning_rate": 1.5444985142957255e-05,
"loss": 0.5295,
"step": 299800
},
{
"epoch": 0.6915613380328768,
"grad_norm": 0.9079631567001343,
"learning_rate": 1.542193309835616e-05,
"loss": 0.4774,
"step": 300000
},
{
"epoch": 0.6915613380328768,
"eval_loss": 0.49455514550209045,
"eval_runtime": 144.4178,
"eval_samples_per_second": 30.343,
"eval_steps_per_second": 30.343,
"step": 300000
},
{
"epoch": 0.6920223789248987,
"grad_norm": 1.3833597898483276,
"learning_rate": 1.5398881053755063e-05,
"loss": 0.5354,
"step": 300200
},
{
"epoch": 0.6924834198169206,
"grad_norm": 0.6728918552398682,
"learning_rate": 1.5375829009153965e-05,
"loss": 0.4536,
"step": 300400
},
{
"epoch": 0.6929444607089426,
"grad_norm": 1.655994176864624,
"learning_rate": 1.5352776964552874e-05,
"loss": 0.4603,
"step": 300600
},
{
"epoch": 0.6934055016009645,
"grad_norm": 1.8707417249679565,
"learning_rate": 1.5329724919951776e-05,
"loss": 0.5031,
"step": 300800
},
{
"epoch": 0.6938665424929864,
"grad_norm": 1.189855694770813,
"learning_rate": 1.5306672875350682e-05,
"loss": 0.4406,
"step": 301000
},
{
"epoch": 0.6943275833850083,
"grad_norm": 0.5549800395965576,
"learning_rate": 1.5283620830749584e-05,
"loss": 0.4955,
"step": 301200
},
{
"epoch": 0.6947886242770303,
"grad_norm": 1.3587613105773926,
"learning_rate": 1.5260568786148487e-05,
"loss": 0.4695,
"step": 301400
},
{
"epoch": 0.6952496651690522,
"grad_norm": 1.1256383657455444,
"learning_rate": 1.5237516741547392e-05,
"loss": 0.4928,
"step": 301600
},
{
"epoch": 0.6957107060610741,
"grad_norm": 1.0597585439682007,
"learning_rate": 1.5214464696946296e-05,
"loss": 0.5788,
"step": 301800
},
{
"epoch": 0.696171746953096,
"grad_norm": 1.196616768836975,
"learning_rate": 1.5191412652345199e-05,
"loss": 0.4771,
"step": 302000
},
{
"epoch": 0.696632787845118,
"grad_norm": 0.942761242389679,
"learning_rate": 1.5168360607744106e-05,
"loss": 0.5354,
"step": 302200
},
{
"epoch": 0.6970938287371399,
"grad_norm": 1.2657501697540283,
"learning_rate": 1.514530856314301e-05,
"loss": 0.4893,
"step": 302400
},
{
"epoch": 0.6975548696291618,
"grad_norm": 2.3571038246154785,
"learning_rate": 1.5122256518541914e-05,
"loss": 0.5137,
"step": 302600
},
{
"epoch": 0.6980159105211837,
"grad_norm": 0.39919519424438477,
"learning_rate": 1.5099204473940818e-05,
"loss": 0.4944,
"step": 302800
},
{
"epoch": 0.6984769514132056,
"grad_norm": 0.5027835965156555,
"learning_rate": 1.507615242933972e-05,
"loss": 0.5393,
"step": 303000
},
{
"epoch": 0.6989379923052275,
"grad_norm": 1.1620961427688599,
"learning_rate": 1.5053100384738624e-05,
"loss": 0.4845,
"step": 303200
},
{
"epoch": 0.6993990331972494,
"grad_norm": 1.5563163757324219,
"learning_rate": 1.5030048340137528e-05,
"loss": 0.5067,
"step": 303400
},
{
"epoch": 0.6998600740892713,
"grad_norm": 0.9374263882637024,
"learning_rate": 1.5006996295536432e-05,
"loss": 0.4745,
"step": 303600
},
{
"epoch": 0.7003211149812932,
"grad_norm": 1.7934794425964355,
"learning_rate": 1.4983944250935336e-05,
"loss": 0.469,
"step": 303800
},
{
"epoch": 0.7007821558733152,
"grad_norm": 1.6941883563995361,
"learning_rate": 1.4960892206334244e-05,
"loss": 0.4998,
"step": 304000
},
{
"epoch": 0.7012431967653371,
"grad_norm": 1.3214648962020874,
"learning_rate": 1.4937840161733146e-05,
"loss": 0.4831,
"step": 304200
},
{
"epoch": 0.701704237657359,
"grad_norm": 1.517357587814331,
"learning_rate": 1.491478811713205e-05,
"loss": 0.4715,
"step": 304400
},
{
"epoch": 0.7021652785493809,
"grad_norm": 0.819487988948822,
"learning_rate": 1.4891736072530954e-05,
"loss": 0.4914,
"step": 304600
},
{
"epoch": 0.7026263194414029,
"grad_norm": 1.0428346395492554,
"learning_rate": 1.4868684027929858e-05,
"loss": 0.5116,
"step": 304800
},
{
"epoch": 0.7030873603334248,
"grad_norm": 1.9063506126403809,
"learning_rate": 1.4845631983328762e-05,
"loss": 0.4993,
"step": 305000
},
{
"epoch": 0.7035484012254467,
"grad_norm": 2.997563600540161,
"learning_rate": 1.4822579938727666e-05,
"loss": 0.4698,
"step": 305200
},
{
"epoch": 0.7040094421174686,
"grad_norm": 1.612297534942627,
"learning_rate": 1.479952789412657e-05,
"loss": 0.5322,
"step": 305400
},
{
"epoch": 0.7044704830094906,
"grad_norm": 1.348860740661621,
"learning_rate": 1.4776475849525475e-05,
"loss": 0.5132,
"step": 305600
},
{
"epoch": 0.7049315239015125,
"grad_norm": 0.9498617649078369,
"learning_rate": 1.475342380492438e-05,
"loss": 0.5163,
"step": 305800
},
{
"epoch": 0.7053925647935344,
"grad_norm": 1.5654537677764893,
"learning_rate": 1.4730371760323283e-05,
"loss": 0.5524,
"step": 306000
},
{
"epoch": 0.7058536056855563,
"grad_norm": 1.3119844198226929,
"learning_rate": 1.4707319715722187e-05,
"loss": 0.5214,
"step": 306200
},
{
"epoch": 0.7063146465775783,
"grad_norm": 0.8046100735664368,
"learning_rate": 1.4684267671121091e-05,
"loss": 0.4921,
"step": 306400
},
{
"epoch": 0.7067756874696001,
"grad_norm": 0.5308769941329956,
"learning_rate": 1.4661215626519995e-05,
"loss": 0.4677,
"step": 306600
},
{
"epoch": 0.707236728361622,
"grad_norm": 1.8907235860824585,
"learning_rate": 1.4638163581918898e-05,
"loss": 0.5625,
"step": 306800
},
{
"epoch": 0.7076977692536439,
"grad_norm": 1.138887882232666,
"learning_rate": 1.4615111537317802e-05,
"loss": 0.4624,
"step": 307000
},
{
"epoch": 0.7081588101456658,
"grad_norm": 0.6800757646560669,
"learning_rate": 1.4592059492716706e-05,
"loss": 0.5375,
"step": 307200
},
{
"epoch": 0.7086198510376878,
"grad_norm": 1.3743557929992676,
"learning_rate": 1.4569007448115613e-05,
"loss": 0.548,
"step": 307400
},
{
"epoch": 0.7090808919297097,
"grad_norm": 1.4539231061935425,
"learning_rate": 1.4545955403514517e-05,
"loss": 0.5169,
"step": 307600
},
{
"epoch": 0.7095419328217316,
"grad_norm": 0.6173273324966431,
"learning_rate": 1.4522903358913421e-05,
"loss": 0.4933,
"step": 307800
},
{
"epoch": 0.7100029737137535,
"grad_norm": 1.401665210723877,
"learning_rate": 1.4499851314312323e-05,
"loss": 0.5009,
"step": 308000
},
{
"epoch": 0.7104640146057755,
"grad_norm": 1.782645344734192,
"learning_rate": 1.4476799269711227e-05,
"loss": 0.5133,
"step": 308200
},
{
"epoch": 0.7109250554977974,
"grad_norm": 1.1517479419708252,
"learning_rate": 1.4453747225110131e-05,
"loss": 0.4714,
"step": 308400
},
{
"epoch": 0.7113860963898193,
"grad_norm": 0.3535856604576111,
"learning_rate": 1.4430695180509035e-05,
"loss": 0.4667,
"step": 308600
},
{
"epoch": 0.7118471372818412,
"grad_norm": 1.6771602630615234,
"learning_rate": 1.4407643135907939e-05,
"loss": 0.4971,
"step": 308800
},
{
"epoch": 0.7123081781738632,
"grad_norm": 1.895080804824829,
"learning_rate": 1.4384591091306845e-05,
"loss": 0.4917,
"step": 309000
},
{
"epoch": 0.7127692190658851,
"grad_norm": 1.5443464517593384,
"learning_rate": 1.4361539046705749e-05,
"loss": 0.4998,
"step": 309200
},
{
"epoch": 0.713230259957907,
"grad_norm": 0.635612428188324,
"learning_rate": 1.4338487002104653e-05,
"loss": 0.5347,
"step": 309400
},
{
"epoch": 0.7136913008499289,
"grad_norm": 1.680080771446228,
"learning_rate": 1.4315434957503557e-05,
"loss": 0.5551,
"step": 309600
},
{
"epoch": 0.7141523417419507,
"grad_norm": 0.8438254594802856,
"learning_rate": 1.429238291290246e-05,
"loss": 0.5284,
"step": 309800
},
{
"epoch": 0.7146133826339727,
"grad_norm": 1.1309008598327637,
"learning_rate": 1.4269330868301365e-05,
"loss": 0.5249,
"step": 310000
},
{
"epoch": 0.7150744235259946,
"grad_norm": 0.8668766021728516,
"learning_rate": 1.4246278823700269e-05,
"loss": 0.4738,
"step": 310200
},
{
"epoch": 0.7155354644180165,
"grad_norm": 0.8339349627494812,
"learning_rate": 1.4223226779099173e-05,
"loss": 0.4969,
"step": 310400
},
{
"epoch": 0.7159965053100384,
"grad_norm": 1.1966744661331177,
"learning_rate": 1.4200174734498075e-05,
"loss": 0.5029,
"step": 310600
},
{
"epoch": 0.7164575462020604,
"grad_norm": 1.6723459959030151,
"learning_rate": 1.4177122689896982e-05,
"loss": 0.538,
"step": 310800
},
{
"epoch": 0.7169185870940823,
"grad_norm": 0.6843717694282532,
"learning_rate": 1.4154070645295886e-05,
"loss": 0.4896,
"step": 311000
},
{
"epoch": 0.7173796279861042,
"grad_norm": 2.2339181900024414,
"learning_rate": 1.413101860069479e-05,
"loss": 0.5173,
"step": 311200
},
{
"epoch": 0.7178406688781261,
"grad_norm": 1.8708288669586182,
"learning_rate": 1.4107966556093694e-05,
"loss": 0.4853,
"step": 311400
},
{
"epoch": 0.7183017097701481,
"grad_norm": 0.8902921080589294,
"learning_rate": 1.4084914511492597e-05,
"loss": 0.4688,
"step": 311600
},
{
"epoch": 0.71876275066217,
"grad_norm": 0.9172972440719604,
"learning_rate": 1.40618624668915e-05,
"loss": 0.4588,
"step": 311800
},
{
"epoch": 0.7192237915541919,
"grad_norm": 1.278566837310791,
"learning_rate": 1.4038810422290404e-05,
"loss": 0.5001,
"step": 312000
},
{
"epoch": 0.7196848324462138,
"grad_norm": 0.6410205364227295,
"learning_rate": 1.4015758377689308e-05,
"loss": 0.4772,
"step": 312200
},
{
"epoch": 0.7201458733382358,
"grad_norm": 1.300574541091919,
"learning_rate": 1.3992706333088216e-05,
"loss": 0.4829,
"step": 312400
},
{
"epoch": 0.7206069142302577,
"grad_norm": 1.1145926713943481,
"learning_rate": 1.396965428848712e-05,
"loss": 0.5403,
"step": 312600
},
{
"epoch": 0.7210679551222796,
"grad_norm": 2.115949869155884,
"learning_rate": 1.3946602243886022e-05,
"loss": 0.5284,
"step": 312800
},
{
"epoch": 0.7215289960143015,
"grad_norm": 1.5189509391784668,
"learning_rate": 1.3923550199284926e-05,
"loss": 0.4795,
"step": 313000
},
{
"epoch": 0.7219900369063234,
"grad_norm": 0.7120934724807739,
"learning_rate": 1.390049815468383e-05,
"loss": 0.4977,
"step": 313200
},
{
"epoch": 0.7224510777983453,
"grad_norm": 1.7092379331588745,
"learning_rate": 1.3877446110082734e-05,
"loss": 0.448,
"step": 313400
},
{
"epoch": 0.7229121186903672,
"grad_norm": 1.4430723190307617,
"learning_rate": 1.3854394065481638e-05,
"loss": 0.4991,
"step": 313600
},
{
"epoch": 0.7233731595823891,
"grad_norm": 0.8764591217041016,
"learning_rate": 1.3831342020880542e-05,
"loss": 0.5104,
"step": 313800
},
{
"epoch": 0.723834200474411,
"grad_norm": 1.5279911756515503,
"learning_rate": 1.3808289976279446e-05,
"loss": 0.4888,
"step": 314000
},
{
"epoch": 0.724295241366433,
"grad_norm": 1.9160465002059937,
"learning_rate": 1.3785237931678352e-05,
"loss": 0.5148,
"step": 314200
},
{
"epoch": 0.7247562822584549,
"grad_norm": 0.8003278374671936,
"learning_rate": 1.3762185887077256e-05,
"loss": 0.5243,
"step": 314400
},
{
"epoch": 0.7252173231504768,
"grad_norm": 1.049712061882019,
"learning_rate": 1.373913384247616e-05,
"loss": 0.4999,
"step": 314600
},
{
"epoch": 0.7256783640424987,
"grad_norm": 1.2144337892532349,
"learning_rate": 1.3716081797875064e-05,
"loss": 0.5561,
"step": 314800
},
{
"epoch": 0.7261394049345207,
"grad_norm": 2.1154098510742188,
"learning_rate": 1.3693029753273968e-05,
"loss": 0.4614,
"step": 315000
},
{
"epoch": 0.7266004458265426,
"grad_norm": 0.5475128889083862,
"learning_rate": 1.3669977708672872e-05,
"loss": 0.479,
"step": 315200
},
{
"epoch": 0.7270614867185645,
"grad_norm": 1.0177366733551025,
"learning_rate": 1.3646925664071774e-05,
"loss": 0.5073,
"step": 315400
},
{
"epoch": 0.7275225276105864,
"grad_norm": 3.217353105545044,
"learning_rate": 1.3623873619470678e-05,
"loss": 0.5399,
"step": 315600
},
{
"epoch": 0.7279835685026084,
"grad_norm": 2.1022963523864746,
"learning_rate": 1.3600821574869585e-05,
"loss": 0.5137,
"step": 315800
},
{
"epoch": 0.7284446093946303,
"grad_norm": 0.4113731384277344,
"learning_rate": 1.357776953026849e-05,
"loss": 0.4935,
"step": 316000
},
{
"epoch": 0.7289056502866522,
"grad_norm": 0.6860734224319458,
"learning_rate": 1.3554717485667393e-05,
"loss": 0.5092,
"step": 316200
},
{
"epoch": 0.7293666911786741,
"grad_norm": 1.0901679992675781,
"learning_rate": 1.3531665441066297e-05,
"loss": 0.5062,
"step": 316400
},
{
"epoch": 0.729827732070696,
"grad_norm": 1.102059006690979,
"learning_rate": 1.35086133964652e-05,
"loss": 0.5143,
"step": 316600
},
{
"epoch": 0.7302887729627179,
"grad_norm": 1.0236157178878784,
"learning_rate": 1.3485561351864103e-05,
"loss": 0.5476,
"step": 316800
},
{
"epoch": 0.7307498138547398,
"grad_norm": 1.4766557216644287,
"learning_rate": 1.3462509307263007e-05,
"loss": 0.4543,
"step": 317000
},
{
"epoch": 0.7312108547467617,
"grad_norm": 1.7664604187011719,
"learning_rate": 1.3439457262661911e-05,
"loss": 0.5531,
"step": 317200
},
{
"epoch": 0.7316718956387837,
"grad_norm": 1.5094674825668335,
"learning_rate": 1.3416405218060815e-05,
"loss": 0.4851,
"step": 317400
},
{
"epoch": 0.7321329365308056,
"grad_norm": 0.6211707592010498,
"learning_rate": 1.3393353173459721e-05,
"loss": 0.4945,
"step": 317600
},
{
"epoch": 0.7325939774228275,
"grad_norm": 0.9305445551872253,
"learning_rate": 1.3370301128858625e-05,
"loss": 0.5324,
"step": 317800
},
{
"epoch": 0.7330550183148494,
"grad_norm": 1.2025363445281982,
"learning_rate": 1.3347249084257529e-05,
"loss": 0.5128,
"step": 318000
},
{
"epoch": 0.7335160592068714,
"grad_norm": 1.1147645711898804,
"learning_rate": 1.3324197039656433e-05,
"loss": 0.4722,
"step": 318200
},
{
"epoch": 0.7339771000988933,
"grad_norm": 1.073165774345398,
"learning_rate": 1.3301144995055337e-05,
"loss": 0.5153,
"step": 318400
},
{
"epoch": 0.7344381409909152,
"grad_norm": 1.6959824562072754,
"learning_rate": 1.3278092950454241e-05,
"loss": 0.4795,
"step": 318600
},
{
"epoch": 0.7348991818829371,
"grad_norm": 0.850702702999115,
"learning_rate": 1.3255040905853145e-05,
"loss": 0.4875,
"step": 318800
},
{
"epoch": 0.735360222774959,
"grad_norm": 1.5950241088867188,
"learning_rate": 1.3231988861252049e-05,
"loss": 0.4988,
"step": 319000
},
{
"epoch": 0.735821263666981,
"grad_norm": 1.4513007402420044,
"learning_rate": 1.3208936816650955e-05,
"loss": 0.4947,
"step": 319200
},
{
"epoch": 0.7362823045590029,
"grad_norm": 2.507760524749756,
"learning_rate": 1.3185884772049859e-05,
"loss": 0.5527,
"step": 319400
},
{
"epoch": 0.7367433454510248,
"grad_norm": 0.49451202154159546,
"learning_rate": 1.3162832727448763e-05,
"loss": 0.5249,
"step": 319600
},
{
"epoch": 0.7372043863430467,
"grad_norm": 1.2579914331436157,
"learning_rate": 1.3139780682847667e-05,
"loss": 0.4911,
"step": 319800
},
{
"epoch": 0.7376654272350686,
"grad_norm": 0.30338361859321594,
"learning_rate": 1.311672863824657e-05,
"loss": 0.524,
"step": 320000
},
{
"epoch": 0.7381264681270905,
"grad_norm": 3.077241897583008,
"learning_rate": 1.3093676593645473e-05,
"loss": 0.5091,
"step": 320200
},
{
"epoch": 0.7385875090191124,
"grad_norm": 1.3362106084823608,
"learning_rate": 1.3070624549044377e-05,
"loss": 0.4827,
"step": 320400
},
{
"epoch": 0.7390485499111343,
"grad_norm": 1.2579853534698486,
"learning_rate": 1.304757250444328e-05,
"loss": 0.4945,
"step": 320600
},
{
"epoch": 0.7395095908031563,
"grad_norm": 1.0365217924118042,
"learning_rate": 1.3024520459842185e-05,
"loss": 0.5256,
"step": 320800
},
{
"epoch": 0.7399706316951782,
"grad_norm": 0.9613335132598877,
"learning_rate": 1.3001468415241092e-05,
"loss": 0.4811,
"step": 321000
},
{
"epoch": 0.7404316725872001,
"grad_norm": 1.111335039138794,
"learning_rate": 1.2978416370639996e-05,
"loss": 0.5011,
"step": 321200
},
{
"epoch": 0.740892713479222,
"grad_norm": 1.1504440307617188,
"learning_rate": 1.2955364326038898e-05,
"loss": 0.4916,
"step": 321400
},
{
"epoch": 0.741353754371244,
"grad_norm": 0.9241997599601746,
"learning_rate": 1.2932312281437802e-05,
"loss": 0.4507,
"step": 321600
},
{
"epoch": 0.7418147952632659,
"grad_norm": 1.1424815654754639,
"learning_rate": 1.2909260236836706e-05,
"loss": 0.5188,
"step": 321800
},
{
"epoch": 0.7422758361552878,
"grad_norm": 0.8069947957992554,
"learning_rate": 1.288620819223561e-05,
"loss": 0.4967,
"step": 322000
},
{
"epoch": 0.7427368770473097,
"grad_norm": 1.4160171747207642,
"learning_rate": 1.2863156147634514e-05,
"loss": 0.514,
"step": 322200
},
{
"epoch": 0.7431979179393317,
"grad_norm": 1.1542912721633911,
"learning_rate": 1.2840104103033418e-05,
"loss": 0.4799,
"step": 322400
},
{
"epoch": 0.7436589588313536,
"grad_norm": 1.112442970275879,
"learning_rate": 1.2817052058432324e-05,
"loss": 0.4787,
"step": 322600
},
{
"epoch": 0.7441199997233755,
"grad_norm": 1.970729112625122,
"learning_rate": 1.2794000013831228e-05,
"loss": 0.4734,
"step": 322800
},
{
"epoch": 0.7445810406153974,
"grad_norm": 0.7014828324317932,
"learning_rate": 1.2770947969230132e-05,
"loss": 0.5364,
"step": 323000
},
{
"epoch": 0.7450420815074194,
"grad_norm": 0.852289080619812,
"learning_rate": 1.2747895924629036e-05,
"loss": 0.5169,
"step": 323200
},
{
"epoch": 0.7455031223994412,
"grad_norm": 1.6365413665771484,
"learning_rate": 1.272484388002794e-05,
"loss": 0.4716,
"step": 323400
},
{
"epoch": 0.7459641632914631,
"grad_norm": 1.1326274871826172,
"learning_rate": 1.2701791835426844e-05,
"loss": 0.483,
"step": 323600
},
{
"epoch": 0.746425204183485,
"grad_norm": 1.7985695600509644,
"learning_rate": 1.2678739790825748e-05,
"loss": 0.5214,
"step": 323800
},
{
"epoch": 0.7468862450755069,
"grad_norm": 1.3214313983917236,
"learning_rate": 1.265568774622465e-05,
"loss": 0.5369,
"step": 324000
},
{
"epoch": 0.7473472859675289,
"grad_norm": 1.8575730323791504,
"learning_rate": 1.2632635701623557e-05,
"loss": 0.5292,
"step": 324200
},
{
"epoch": 0.7478083268595508,
"grad_norm": 0.62919682264328,
"learning_rate": 1.2609583657022461e-05,
"loss": 0.4887,
"step": 324400
},
{
"epoch": 0.7482693677515727,
"grad_norm": 2.681436777114868,
"learning_rate": 1.2586531612421365e-05,
"loss": 0.5284,
"step": 324600
},
{
"epoch": 0.7487304086435946,
"grad_norm": 1.6911917924880981,
"learning_rate": 1.256347956782027e-05,
"loss": 0.504,
"step": 324800
},
{
"epoch": 0.7491914495356166,
"grad_norm": 1.236039638519287,
"learning_rate": 1.2540427523219173e-05,
"loss": 0.5036,
"step": 325000
},
{
"epoch": 0.7496524904276385,
"grad_norm": 1.1618597507476807,
"learning_rate": 1.2517375478618076e-05,
"loss": 0.5154,
"step": 325200
},
{
"epoch": 0.7501135313196604,
"grad_norm": 1.5990595817565918,
"learning_rate": 1.249432343401698e-05,
"loss": 0.4939,
"step": 325400
},
{
"epoch": 0.7505745722116823,
"grad_norm": 1.3306795358657837,
"learning_rate": 1.2471271389415885e-05,
"loss": 0.5226,
"step": 325600
},
{
"epoch": 0.7510356131037043,
"grad_norm": 9.81534481048584,
"learning_rate": 1.244821934481479e-05,
"loss": 0.4952,
"step": 325800
},
{
"epoch": 0.7514966539957262,
"grad_norm": 1.0444341897964478,
"learning_rate": 1.2425167300213693e-05,
"loss": 0.4981,
"step": 326000
},
{
"epoch": 0.7519576948877481,
"grad_norm": 0.957382321357727,
"learning_rate": 1.2402115255612597e-05,
"loss": 0.4855,
"step": 326200
},
{
"epoch": 0.75241873577977,
"grad_norm": 1.7747009992599487,
"learning_rate": 1.2379063211011501e-05,
"loss": 0.4847,
"step": 326400
},
{
"epoch": 0.752879776671792,
"grad_norm": 0.8051755428314209,
"learning_rate": 1.2356011166410405e-05,
"loss": 0.4675,
"step": 326600
},
{
"epoch": 0.7533408175638138,
"grad_norm": 0.8562848567962646,
"learning_rate": 1.233295912180931e-05,
"loss": 0.5252,
"step": 326800
},
{
"epoch": 0.7538018584558357,
"grad_norm": 0.8655639886856079,
"learning_rate": 1.2309907077208213e-05,
"loss": 0.4904,
"step": 327000
},
{
"epoch": 0.7542628993478576,
"grad_norm": 2.3433034420013428,
"learning_rate": 1.2286855032607119e-05,
"loss": 0.4885,
"step": 327200
},
{
"epoch": 0.7547239402398795,
"grad_norm": 1.1155329942703247,
"learning_rate": 1.2263802988006023e-05,
"loss": 0.4917,
"step": 327400
},
{
"epoch": 0.7551849811319015,
"grad_norm": 1.4027127027511597,
"learning_rate": 1.2240750943404925e-05,
"loss": 0.4837,
"step": 327600
},
{
"epoch": 0.7556460220239234,
"grad_norm": 1.8373444080352783,
"learning_rate": 1.2217698898803829e-05,
"loss": 0.4972,
"step": 327800
},
{
"epoch": 0.7561070629159453,
"grad_norm": 1.7816424369812012,
"learning_rate": 1.2194646854202735e-05,
"loss": 0.526,
"step": 328000
},
{
"epoch": 0.7565681038079672,
"grad_norm": 1.9828554391860962,
"learning_rate": 1.2171594809601639e-05,
"loss": 0.4813,
"step": 328200
},
{
"epoch": 0.7570291446999892,
"grad_norm": 2.528639078140259,
"learning_rate": 1.2148542765000543e-05,
"loss": 0.4961,
"step": 328400
},
{
"epoch": 0.7574901855920111,
"grad_norm": 0.7348084449768066,
"learning_rate": 1.2125490720399447e-05,
"loss": 0.4763,
"step": 328600
},
{
"epoch": 0.757951226484033,
"grad_norm": 0.5879639983177185,
"learning_rate": 1.2102438675798349e-05,
"loss": 0.472,
"step": 328800
},
{
"epoch": 0.7584122673760549,
"grad_norm": 0.9352529048919678,
"learning_rate": 1.2079386631197255e-05,
"loss": 0.4944,
"step": 329000
},
{
"epoch": 0.7588733082680769,
"grad_norm": 1.5848828554153442,
"learning_rate": 1.2056334586596159e-05,
"loss": 0.5116,
"step": 329200
},
{
"epoch": 0.7593343491600988,
"grad_norm": 0.44051986932754517,
"learning_rate": 1.2033282541995063e-05,
"loss": 0.5375,
"step": 329400
},
{
"epoch": 0.7597953900521207,
"grad_norm": 2.127389907836914,
"learning_rate": 1.2010230497393967e-05,
"loss": 0.4606,
"step": 329600
},
{
"epoch": 0.7602564309441426,
"grad_norm": 1.7485988140106201,
"learning_rate": 1.1987178452792872e-05,
"loss": 0.4817,
"step": 329800
},
{
"epoch": 0.7607174718361646,
"grad_norm": 1.1227333545684814,
"learning_rate": 1.1964126408191775e-05,
"loss": 0.5069,
"step": 330000
},
{
"epoch": 0.7611785127281864,
"grad_norm": 0.8382754325866699,
"learning_rate": 1.1941074363590679e-05,
"loss": 0.5328,
"step": 330200
},
{
"epoch": 0.7616395536202083,
"grad_norm": 0.9372780323028564,
"learning_rate": 1.1918022318989583e-05,
"loss": 0.4781,
"step": 330400
},
{
"epoch": 0.7621005945122302,
"grad_norm": 1.3626426458358765,
"learning_rate": 1.1894970274388488e-05,
"loss": 0.4831,
"step": 330600
},
{
"epoch": 0.7625616354042521,
"grad_norm": 0.8523277044296265,
"learning_rate": 1.1871918229787392e-05,
"loss": 0.5254,
"step": 330800
},
{
"epoch": 0.7630226762962741,
"grad_norm": 1.5201365947723389,
"learning_rate": 1.1848866185186296e-05,
"loss": 0.5154,
"step": 331000
},
{
"epoch": 0.763483717188296,
"grad_norm": 0.46071958541870117,
"learning_rate": 1.18258141405852e-05,
"loss": 0.4999,
"step": 331200
},
{
"epoch": 0.7639447580803179,
"grad_norm": 1.4432693719863892,
"learning_rate": 1.1802762095984104e-05,
"loss": 0.4895,
"step": 331400
},
{
"epoch": 0.7644057989723398,
"grad_norm": 3.8710200786590576,
"learning_rate": 1.1779710051383008e-05,
"loss": 0.5162,
"step": 331600
},
{
"epoch": 0.7648668398643618,
"grad_norm": 1.2128450870513916,
"learning_rate": 1.1756658006781912e-05,
"loss": 0.482,
"step": 331800
},
{
"epoch": 0.7653278807563837,
"grad_norm": 1.517349123954773,
"learning_rate": 1.1733605962180816e-05,
"loss": 0.5071,
"step": 332000
},
{
"epoch": 0.7657889216484056,
"grad_norm": 1.6065720319747925,
"learning_rate": 1.171055391757972e-05,
"loss": 0.5092,
"step": 332200
},
{
"epoch": 0.7662499625404275,
"grad_norm": 2.150094747543335,
"learning_rate": 1.1687501872978624e-05,
"loss": 0.4952,
"step": 332400
},
{
"epoch": 0.7667110034324495,
"grad_norm": 0.7310593724250793,
"learning_rate": 1.1664449828377528e-05,
"loss": 0.4762,
"step": 332600
},
{
"epoch": 0.7671720443244714,
"grad_norm": 1.276360034942627,
"learning_rate": 1.1641397783776432e-05,
"loss": 0.481,
"step": 332800
},
{
"epoch": 0.7676330852164933,
"grad_norm": 0.42438310384750366,
"learning_rate": 1.1618345739175336e-05,
"loss": 0.4871,
"step": 333000
},
{
"epoch": 0.7680941261085152,
"grad_norm": 1.0823901891708374,
"learning_rate": 1.1595293694574242e-05,
"loss": 0.4841,
"step": 333200
},
{
"epoch": 0.7685551670005372,
"grad_norm": 1.3709418773651123,
"learning_rate": 1.1572241649973146e-05,
"loss": 0.4975,
"step": 333400
},
{
"epoch": 0.769016207892559,
"grad_norm": 1.654448390007019,
"learning_rate": 1.154918960537205e-05,
"loss": 0.4477,
"step": 333600
},
{
"epoch": 0.7694772487845809,
"grad_norm": 0.4724847078323364,
"learning_rate": 1.1526137560770952e-05,
"loss": 0.4991,
"step": 333800
},
{
"epoch": 0.7699382896766028,
"grad_norm": 1.3029577732086182,
"learning_rate": 1.1503085516169858e-05,
"loss": 0.5075,
"step": 334000
},
{
"epoch": 0.7703993305686248,
"grad_norm": 1.2783386707305908,
"learning_rate": 1.1480033471568762e-05,
"loss": 0.5014,
"step": 334200
},
{
"epoch": 0.7708603714606467,
"grad_norm": 1.8879179954528809,
"learning_rate": 1.1456981426967666e-05,
"loss": 0.4937,
"step": 334400
},
{
"epoch": 0.7713214123526686,
"grad_norm": 1.2683477401733398,
"learning_rate": 1.143392938236657e-05,
"loss": 0.4751,
"step": 334600
},
{
"epoch": 0.7717824532446905,
"grad_norm": 2.740619421005249,
"learning_rate": 1.1410877337765474e-05,
"loss": 0.5027,
"step": 334800
},
{
"epoch": 0.7722434941367124,
"grad_norm": 1.6804182529449463,
"learning_rate": 1.1387825293164378e-05,
"loss": 0.4677,
"step": 335000
},
{
"epoch": 0.7727045350287344,
"grad_norm": 2.2255728244781494,
"learning_rate": 1.1364773248563282e-05,
"loss": 0.4803,
"step": 335200
},
{
"epoch": 0.7731655759207563,
"grad_norm": 1.0658611059188843,
"learning_rate": 1.1341721203962186e-05,
"loss": 0.4537,
"step": 335400
},
{
"epoch": 0.7736266168127782,
"grad_norm": 1.3411928415298462,
"learning_rate": 1.131866915936109e-05,
"loss": 0.4775,
"step": 335600
},
{
"epoch": 0.7740876577048001,
"grad_norm": 1.467576265335083,
"learning_rate": 1.1295617114759995e-05,
"loss": 0.501,
"step": 335800
},
{
"epoch": 0.7745486985968221,
"grad_norm": 1.2459622621536255,
"learning_rate": 1.1272565070158899e-05,
"loss": 0.5128,
"step": 336000
},
{
"epoch": 0.775009739488844,
"grad_norm": 1.0791770219802856,
"learning_rate": 1.1249513025557801e-05,
"loss": 0.4476,
"step": 336200
},
{
"epoch": 0.7754707803808659,
"grad_norm": 1.271998643875122,
"learning_rate": 1.1226460980956705e-05,
"loss": 0.4701,
"step": 336400
},
{
"epoch": 0.7759318212728878,
"grad_norm": 1.7874670028686523,
"learning_rate": 1.1203408936355611e-05,
"loss": 0.5229,
"step": 336600
},
{
"epoch": 0.7763928621649098,
"grad_norm": 0.7723343968391418,
"learning_rate": 1.1180356891754515e-05,
"loss": 0.3966,
"step": 336800
},
{
"epoch": 0.7768539030569316,
"grad_norm": 1.4732195138931274,
"learning_rate": 1.1157304847153419e-05,
"loss": 0.4943,
"step": 337000
},
{
"epoch": 0.7773149439489535,
"grad_norm": 1.1352183818817139,
"learning_rate": 1.1134252802552323e-05,
"loss": 0.5189,
"step": 337200
},
{
"epoch": 0.7777759848409754,
"grad_norm": 1.1527478694915771,
"learning_rate": 1.1111200757951227e-05,
"loss": 0.5249,
"step": 337400
},
{
"epoch": 0.7782370257329974,
"grad_norm": 0.9301843643188477,
"learning_rate": 1.1088148713350131e-05,
"loss": 0.4648,
"step": 337600
},
{
"epoch": 0.7786980666250193,
"grad_norm": 1.1807146072387695,
"learning_rate": 1.1065096668749035e-05,
"loss": 0.4759,
"step": 337800
},
{
"epoch": 0.7791591075170412,
"grad_norm": 1.4340068101882935,
"learning_rate": 1.1042044624147939e-05,
"loss": 0.4719,
"step": 338000
},
{
"epoch": 0.7796201484090631,
"grad_norm": 1.1477597951889038,
"learning_rate": 1.1018992579546845e-05,
"loss": 0.5048,
"step": 338200
},
{
"epoch": 0.780081189301085,
"grad_norm": 1.487963318824768,
"learning_rate": 1.0995940534945749e-05,
"loss": 0.5077,
"step": 338400
},
{
"epoch": 0.780542230193107,
"grad_norm": 3.070131301879883,
"learning_rate": 1.0972888490344651e-05,
"loss": 0.4992,
"step": 338600
},
{
"epoch": 0.7810032710851289,
"grad_norm": 0.9652560949325562,
"learning_rate": 1.0949836445743555e-05,
"loss": 0.5147,
"step": 338800
},
{
"epoch": 0.7814643119771508,
"grad_norm": 1.0315585136413574,
"learning_rate": 1.092678440114246e-05,
"loss": 0.4721,
"step": 339000
},
{
"epoch": 0.7819253528691728,
"grad_norm": 1.015569806098938,
"learning_rate": 1.0903732356541365e-05,
"loss": 0.4365,
"step": 339200
},
{
"epoch": 0.7823863937611947,
"grad_norm": 0.49842461943626404,
"learning_rate": 1.0880680311940269e-05,
"loss": 0.4841,
"step": 339400
},
{
"epoch": 0.7828474346532166,
"grad_norm": 0.7842098474502563,
"learning_rate": 1.0857628267339173e-05,
"loss": 0.4589,
"step": 339600
},
{
"epoch": 0.7833084755452385,
"grad_norm": 1.2681951522827148,
"learning_rate": 1.0834576222738076e-05,
"loss": 0.4821,
"step": 339800
},
{
"epoch": 0.7837695164372604,
"grad_norm": 1.8472216129302979,
"learning_rate": 1.081152417813698e-05,
"loss": 0.4841,
"step": 340000
},
{
"epoch": 0.7842305573292824,
"grad_norm": 1.1875754594802856,
"learning_rate": 1.0788472133535884e-05,
"loss": 0.4509,
"step": 340200
},
{
"epoch": 0.7846915982213042,
"grad_norm": 1.493262529373169,
"learning_rate": 1.0765420088934788e-05,
"loss": 0.4853,
"step": 340400
},
{
"epoch": 0.7851526391133261,
"grad_norm": 1.0441592931747437,
"learning_rate": 1.0742368044333692e-05,
"loss": 0.5009,
"step": 340600
},
{
"epoch": 0.785613680005348,
"grad_norm": 1.7319620847702026,
"learning_rate": 1.0719315999732598e-05,
"loss": 0.5304,
"step": 340800
},
{
"epoch": 0.78607472089737,
"grad_norm": 1.3646876811981201,
"learning_rate": 1.06962639551315e-05,
"loss": 0.4885,
"step": 341000
},
{
"epoch": 0.7865357617893919,
"grad_norm": 1.5010404586791992,
"learning_rate": 1.0673211910530404e-05,
"loss": 0.4912,
"step": 341200
},
{
"epoch": 0.7869968026814138,
"grad_norm": 0.8283145427703857,
"learning_rate": 1.0650159865929308e-05,
"loss": 0.4941,
"step": 341400
},
{
"epoch": 0.7874578435734357,
"grad_norm": 0.6535471677780151,
"learning_rate": 1.0627107821328214e-05,
"loss": 0.522,
"step": 341600
},
{
"epoch": 0.7879188844654577,
"grad_norm": 1.1741523742675781,
"learning_rate": 1.0604055776727118e-05,
"loss": 0.5234,
"step": 341800
},
{
"epoch": 0.7883799253574796,
"grad_norm": 1.3052113056182861,
"learning_rate": 1.0581003732126022e-05,
"loss": 0.495,
"step": 342000
},
{
"epoch": 0.7888409662495015,
"grad_norm": 1.795502781867981,
"learning_rate": 1.0557951687524926e-05,
"loss": 0.4678,
"step": 342200
},
{
"epoch": 0.7893020071415234,
"grad_norm": 0.9580342769622803,
"learning_rate": 1.053489964292383e-05,
"loss": 0.5116,
"step": 342400
},
{
"epoch": 0.7897630480335454,
"grad_norm": 1.020665168762207,
"learning_rate": 1.0511847598322734e-05,
"loss": 0.4891,
"step": 342600
},
{
"epoch": 0.7902240889255673,
"grad_norm": 0.8749563694000244,
"learning_rate": 1.0488795553721638e-05,
"loss": 0.4898,
"step": 342800
},
{
"epoch": 0.7906851298175892,
"grad_norm": 0.8884357810020447,
"learning_rate": 1.0465743509120542e-05,
"loss": 0.4513,
"step": 343000
},
{
"epoch": 0.7911461707096111,
"grad_norm": 0.8629872798919678,
"learning_rate": 1.0442691464519446e-05,
"loss": 0.4825,
"step": 343200
},
{
"epoch": 0.791607211601633,
"grad_norm": 1.346708059310913,
"learning_rate": 1.041963941991835e-05,
"loss": 0.5254,
"step": 343400
},
{
"epoch": 0.792068252493655,
"grad_norm": 0.5898563265800476,
"learning_rate": 1.0396587375317254e-05,
"loss": 0.4761,
"step": 343600
},
{
"epoch": 0.7925292933856768,
"grad_norm": 0.49635791778564453,
"learning_rate": 1.0373535330716158e-05,
"loss": 0.4639,
"step": 343800
},
{
"epoch": 0.7929903342776987,
"grad_norm": 0.534585177898407,
"learning_rate": 1.0350483286115062e-05,
"loss": 0.5002,
"step": 344000
},
{
"epoch": 0.7934513751697206,
"grad_norm": 1.0430246591567993,
"learning_rate": 1.0327431241513967e-05,
"loss": 0.4492,
"step": 344200
},
{
"epoch": 0.7939124160617426,
"grad_norm": 0.9281976819038391,
"learning_rate": 1.0304379196912871e-05,
"loss": 0.4478,
"step": 344400
},
{
"epoch": 0.7943734569537645,
"grad_norm": 1.5951513051986694,
"learning_rate": 1.0281327152311775e-05,
"loss": 0.4651,
"step": 344600
},
{
"epoch": 0.7948344978457864,
"grad_norm": 1.9117207527160645,
"learning_rate": 1.0258275107710678e-05,
"loss": 0.4564,
"step": 344800
},
{
"epoch": 0.7952955387378083,
"grad_norm": 1.1856075525283813,
"learning_rate": 1.0235223063109583e-05,
"loss": 0.5218,
"step": 345000
},
{
"epoch": 0.7957565796298303,
"grad_norm": 1.4824328422546387,
"learning_rate": 1.0212171018508487e-05,
"loss": 0.5053,
"step": 345200
},
{
"epoch": 0.7962176205218522,
"grad_norm": 1.768130898475647,
"learning_rate": 1.0189118973907391e-05,
"loss": 0.5017,
"step": 345400
},
{
"epoch": 0.7966786614138741,
"grad_norm": 1.2414652109146118,
"learning_rate": 1.0166066929306295e-05,
"loss": 0.51,
"step": 345600
},
{
"epoch": 0.797139702305896,
"grad_norm": 2.830430507659912,
"learning_rate": 1.01430148847052e-05,
"loss": 0.4847,
"step": 345800
},
{
"epoch": 0.797600743197918,
"grad_norm": 1.8276104927062988,
"learning_rate": 1.0119962840104103e-05,
"loss": 0.5412,
"step": 346000
},
{
"epoch": 0.7980617840899399,
"grad_norm": 1.8435417413711548,
"learning_rate": 1.0096910795503007e-05,
"loss": 0.4832,
"step": 346200
},
{
"epoch": 0.7985228249819618,
"grad_norm": 1.2370027303695679,
"learning_rate": 1.0073858750901911e-05,
"loss": 0.437,
"step": 346400
},
{
"epoch": 0.7989838658739837,
"grad_norm": 0.6917985677719116,
"learning_rate": 1.0050806706300815e-05,
"loss": 0.5141,
"step": 346600
},
{
"epoch": 0.7994449067660057,
"grad_norm": 2.1598243713378906,
"learning_rate": 1.0027754661699721e-05,
"loss": 0.4865,
"step": 346800
},
{
"epoch": 0.7999059476580276,
"grad_norm": 1.8002493381500244,
"learning_rate": 1.0004702617098625e-05,
"loss": 0.4876,
"step": 347000
},
{
"epoch": 0.8003669885500494,
"grad_norm": 1.486546277999878,
"learning_rate": 9.981650572497527e-06,
"loss": 0.5157,
"step": 347200
},
{
"epoch": 0.8008280294420713,
"grad_norm": 1.7758817672729492,
"learning_rate": 9.958598527896431e-06,
"loss": 0.5249,
"step": 347400
},
{
"epoch": 0.8012890703340932,
"grad_norm": 0.8744950294494629,
"learning_rate": 9.935546483295337e-06,
"loss": 0.4575,
"step": 347600
},
{
"epoch": 0.8017501112261152,
"grad_norm": 1.4803967475891113,
"learning_rate": 9.91249443869424e-06,
"loss": 0.5104,
"step": 347800
},
{
"epoch": 0.8022111521181371,
"grad_norm": 2.251115560531616,
"learning_rate": 9.889442394093145e-06,
"loss": 0.471,
"step": 348000
},
{
"epoch": 0.802672193010159,
"grad_norm": 1.8598825931549072,
"learning_rate": 9.866390349492049e-06,
"loss": 0.5484,
"step": 348200
},
{
"epoch": 0.8031332339021809,
"grad_norm": 1.993989109992981,
"learning_rate": 9.843338304890953e-06,
"loss": 0.5437,
"step": 348400
},
{
"epoch": 0.8035942747942029,
"grad_norm": 1.425431251525879,
"learning_rate": 9.820286260289857e-06,
"loss": 0.4386,
"step": 348600
},
{
"epoch": 0.8040553156862248,
"grad_norm": 0.4540669620037079,
"learning_rate": 9.79723421568876e-06,
"loss": 0.4557,
"step": 348800
},
{
"epoch": 0.8045163565782467,
"grad_norm": 1.800315022468567,
"learning_rate": 9.774182171087665e-06,
"loss": 0.4771,
"step": 349000
},
{
"epoch": 0.8049773974702686,
"grad_norm": 0.8877231478691101,
"learning_rate": 9.75113012648657e-06,
"loss": 0.4811,
"step": 349200
},
{
"epoch": 0.8054384383622906,
"grad_norm": 1.3885689973831177,
"learning_rate": 9.728078081885474e-06,
"loss": 0.5492,
"step": 349400
},
{
"epoch": 0.8058994792543125,
"grad_norm": 1.6329267024993896,
"learning_rate": 9.705026037284377e-06,
"loss": 0.4933,
"step": 349600
},
{
"epoch": 0.8063605201463344,
"grad_norm": 1.2911161184310913,
"learning_rate": 9.68197399268328e-06,
"loss": 0.4724,
"step": 349800
},
{
"epoch": 0.8068215610383563,
"grad_norm": 1.7925668954849243,
"learning_rate": 9.658921948082185e-06,
"loss": 0.5562,
"step": 350000
},
{
"epoch": 0.8068215610383563,
"eval_loss": 0.48525139689445496,
"eval_runtime": 144.215,
"eval_samples_per_second": 30.385,
"eval_steps_per_second": 30.385,
"step": 350000
},
{
"epoch": 0.8072826019303783,
"grad_norm": 1.9523992538452148,
"learning_rate": 9.63586990348109e-06,
"loss": 0.4891,
"step": 350200
},
{
"epoch": 0.8077436428224002,
"grad_norm": 0.8594640493392944,
"learning_rate": 9.612817858879994e-06,
"loss": 0.5044,
"step": 350400
},
{
"epoch": 0.808204683714422,
"grad_norm": 0.9530147314071655,
"learning_rate": 9.589765814278898e-06,
"loss": 0.4518,
"step": 350600
},
{
"epoch": 0.8086657246064439,
"grad_norm": 1.8223358392715454,
"learning_rate": 9.566713769677802e-06,
"loss": 0.4809,
"step": 350800
},
{
"epoch": 0.8091267654984658,
"grad_norm": 4.091012477874756,
"learning_rate": 9.543661725076706e-06,
"loss": 0.4465,
"step": 351000
},
{
"epoch": 0.8095878063904878,
"grad_norm": 1.6293407678604126,
"learning_rate": 9.52060968047561e-06,
"loss": 0.4734,
"step": 351200
},
{
"epoch": 0.8100488472825097,
"grad_norm": 1.2203644514083862,
"learning_rate": 9.497557635874514e-06,
"loss": 0.5044,
"step": 351400
},
{
"epoch": 0.8105098881745316,
"grad_norm": 1.3531818389892578,
"learning_rate": 9.474505591273418e-06,
"loss": 0.4731,
"step": 351600
},
{
"epoch": 0.8109709290665535,
"grad_norm": 2.762836217880249,
"learning_rate": 9.451453546672324e-06,
"loss": 0.5298,
"step": 351800
},
{
"epoch": 0.8114319699585755,
"grad_norm": 1.708924651145935,
"learning_rate": 9.428401502071226e-06,
"loss": 0.5214,
"step": 352000
},
{
"epoch": 0.8118930108505974,
"grad_norm": 1.0070140361785889,
"learning_rate": 9.40534945747013e-06,
"loss": 0.55,
"step": 352200
},
{
"epoch": 0.8123540517426193,
"grad_norm": 1.6505459547042847,
"learning_rate": 9.382297412869034e-06,
"loss": 0.5069,
"step": 352400
},
{
"epoch": 0.8128150926346412,
"grad_norm": 1.5503573417663574,
"learning_rate": 9.35924536826794e-06,
"loss": 0.4478,
"step": 352600
},
{
"epoch": 0.8132761335266632,
"grad_norm": 1.1401780843734741,
"learning_rate": 9.336193323666844e-06,
"loss": 0.5148,
"step": 352800
},
{
"epoch": 0.8137371744186851,
"grad_norm": 1.4352729320526123,
"learning_rate": 9.313141279065748e-06,
"loss": 0.5326,
"step": 353000
},
{
"epoch": 0.814198215310707,
"grad_norm": 0.6954234838485718,
"learning_rate": 9.290089234464652e-06,
"loss": 0.5324,
"step": 353200
},
{
"epoch": 0.8146592562027289,
"grad_norm": 1.4972223043441772,
"learning_rate": 9.267037189863556e-06,
"loss": 0.4461,
"step": 353400
},
{
"epoch": 0.8151202970947509,
"grad_norm": 1.3123633861541748,
"learning_rate": 9.24398514526246e-06,
"loss": 0.4987,
"step": 353600
},
{
"epoch": 0.8155813379867728,
"grad_norm": 0.849063515663147,
"learning_rate": 9.220933100661364e-06,
"loss": 0.5218,
"step": 353800
},
{
"epoch": 0.8160423788787946,
"grad_norm": 0.5541665554046631,
"learning_rate": 9.197881056060268e-06,
"loss": 0.5017,
"step": 354000
},
{
"epoch": 0.8165034197708165,
"grad_norm": 2.253199577331543,
"learning_rate": 9.174829011459172e-06,
"loss": 0.5043,
"step": 354200
},
{
"epoch": 0.8169644606628385,
"grad_norm": 0.6903029680252075,
"learning_rate": 9.151776966858076e-06,
"loss": 0.494,
"step": 354400
},
{
"epoch": 0.8174255015548604,
"grad_norm": 0.6121809482574463,
"learning_rate": 9.12872492225698e-06,
"loss": 0.4946,
"step": 354600
},
{
"epoch": 0.8178865424468823,
"grad_norm": 1.0890499353408813,
"learning_rate": 9.105672877655884e-06,
"loss": 0.4798,
"step": 354800
},
{
"epoch": 0.8183475833389042,
"grad_norm": 1.7235876321792603,
"learning_rate": 9.082620833054788e-06,
"loss": 0.4504,
"step": 355000
},
{
"epoch": 0.8188086242309262,
"grad_norm": 0.649757444858551,
"learning_rate": 9.059568788453693e-06,
"loss": 0.5059,
"step": 355200
},
{
"epoch": 0.8192696651229481,
"grad_norm": 1.162328839302063,
"learning_rate": 9.036516743852597e-06,
"loss": 0.5443,
"step": 355400
},
{
"epoch": 0.81973070601497,
"grad_norm": 0.9448625445365906,
"learning_rate": 9.013464699251501e-06,
"loss": 0.4844,
"step": 355600
},
{
"epoch": 0.8201917469069919,
"grad_norm": 1.1881784200668335,
"learning_rate": 8.990412654650403e-06,
"loss": 0.5193,
"step": 355800
},
{
"epoch": 0.8206527877990138,
"grad_norm": 1.7445374727249146,
"learning_rate": 8.967360610049309e-06,
"loss": 0.4849,
"step": 356000
},
{
"epoch": 0.8211138286910358,
"grad_norm": 1.524045705795288,
"learning_rate": 8.944308565448213e-06,
"loss": 0.4695,
"step": 356200
},
{
"epoch": 0.8215748695830577,
"grad_norm": 1.7928262948989868,
"learning_rate": 8.921256520847117e-06,
"loss": 0.5043,
"step": 356400
},
{
"epoch": 0.8220359104750796,
"grad_norm": 1.1687183380126953,
"learning_rate": 8.898204476246021e-06,
"loss": 0.5196,
"step": 356600
},
{
"epoch": 0.8224969513671015,
"grad_norm": 0.9082534909248352,
"learning_rate": 8.875152431644925e-06,
"loss": 0.4634,
"step": 356800
},
{
"epoch": 0.8229579922591235,
"grad_norm": 1.261551022529602,
"learning_rate": 8.852100387043829e-06,
"loss": 0.4848,
"step": 357000
},
{
"epoch": 0.8234190331511453,
"grad_norm": 0.9897369146347046,
"learning_rate": 8.829048342442733e-06,
"loss": 0.4554,
"step": 357200
},
{
"epoch": 0.8238800740431672,
"grad_norm": 0.7321066856384277,
"learning_rate": 8.805996297841637e-06,
"loss": 0.4909,
"step": 357400
},
{
"epoch": 0.8243411149351891,
"grad_norm": 1.8298851251602173,
"learning_rate": 8.782944253240541e-06,
"loss": 0.4725,
"step": 357600
},
{
"epoch": 0.8248021558272111,
"grad_norm": 1.113755702972412,
"learning_rate": 8.759892208639447e-06,
"loss": 0.4988,
"step": 357800
},
{
"epoch": 0.825263196719233,
"grad_norm": 0.5906481742858887,
"learning_rate": 8.73684016403835e-06,
"loss": 0.4976,
"step": 358000
},
{
"epoch": 0.8257242376112549,
"grad_norm": 1.478716254234314,
"learning_rate": 8.713788119437253e-06,
"loss": 0.4813,
"step": 358200
},
{
"epoch": 0.8261852785032768,
"grad_norm": 1.8848345279693604,
"learning_rate": 8.690736074836157e-06,
"loss": 0.5385,
"step": 358400
},
{
"epoch": 0.8266463193952988,
"grad_norm": 2.71705961227417,
"learning_rate": 8.667684030235063e-06,
"loss": 0.4924,
"step": 358600
},
{
"epoch": 0.8271073602873207,
"grad_norm": 1.3063760995864868,
"learning_rate": 8.644631985633967e-06,
"loss": 0.5204,
"step": 358800
},
{
"epoch": 0.8275684011793426,
"grad_norm": 1.4281903505325317,
"learning_rate": 8.62157994103287e-06,
"loss": 0.4972,
"step": 359000
},
{
"epoch": 0.8280294420713645,
"grad_norm": 1.393025517463684,
"learning_rate": 8.598527896431775e-06,
"loss": 0.489,
"step": 359200
},
{
"epoch": 0.8284904829633865,
"grad_norm": 0.7618604302406311,
"learning_rate": 8.575475851830678e-06,
"loss": 0.4807,
"step": 359400
},
{
"epoch": 0.8289515238554084,
"grad_norm": 0.7368053197860718,
"learning_rate": 8.552423807229582e-06,
"loss": 0.4818,
"step": 359600
},
{
"epoch": 0.8294125647474303,
"grad_norm": 1.3130792379379272,
"learning_rate": 8.529371762628486e-06,
"loss": 0.4886,
"step": 359800
},
{
"epoch": 0.8298736056394522,
"grad_norm": 1.5593905448913574,
"learning_rate": 8.50631971802739e-06,
"loss": 0.5196,
"step": 360000
},
{
"epoch": 0.8303346465314742,
"grad_norm": 0.9520807266235352,
"learning_rate": 8.483267673426296e-06,
"loss": 0.4526,
"step": 360200
},
{
"epoch": 0.8307956874234961,
"grad_norm": 1.134156346321106,
"learning_rate": 8.4602156288252e-06,
"loss": 0.5142,
"step": 360400
},
{
"epoch": 0.8312567283155179,
"grad_norm": 0.47593235969543457,
"learning_rate": 8.437163584224102e-06,
"loss": 0.5145,
"step": 360600
},
{
"epoch": 0.8317177692075398,
"grad_norm": 1.2350735664367676,
"learning_rate": 8.414111539623006e-06,
"loss": 0.5239,
"step": 360800
},
{
"epoch": 0.8321788100995617,
"grad_norm": 1.0222281217575073,
"learning_rate": 8.39105949502191e-06,
"loss": 0.4508,
"step": 361000
},
{
"epoch": 0.8326398509915837,
"grad_norm": 1.2607372999191284,
"learning_rate": 8.368007450420816e-06,
"loss": 0.5243,
"step": 361200
},
{
"epoch": 0.8331008918836056,
"grad_norm": 1.2229344844818115,
"learning_rate": 8.34495540581972e-06,
"loss": 0.5201,
"step": 361400
},
{
"epoch": 0.8335619327756275,
"grad_norm": 1.4129853248596191,
"learning_rate": 8.321903361218624e-06,
"loss": 0.4407,
"step": 361600
},
{
"epoch": 0.8340229736676494,
"grad_norm": 0.8093553185462952,
"learning_rate": 8.298851316617528e-06,
"loss": 0.4439,
"step": 361800
},
{
"epoch": 0.8344840145596714,
"grad_norm": 0.9249831438064575,
"learning_rate": 8.275799272016432e-06,
"loss": 0.5359,
"step": 362000
},
{
"epoch": 0.8349450554516933,
"grad_norm": 1.773339867591858,
"learning_rate": 8.252747227415336e-06,
"loss": 0.4875,
"step": 362200
},
{
"epoch": 0.8354060963437152,
"grad_norm": 1.0773868560791016,
"learning_rate": 8.22969518281424e-06,
"loss": 0.4725,
"step": 362400
},
{
"epoch": 0.8358671372357371,
"grad_norm": 1.0181094408035278,
"learning_rate": 8.206643138213144e-06,
"loss": 0.4273,
"step": 362600
},
{
"epoch": 0.8363281781277591,
"grad_norm": 1.118444800376892,
"learning_rate": 8.18359109361205e-06,
"loss": 0.4468,
"step": 362800
},
{
"epoch": 0.836789219019781,
"grad_norm": 1.1972088813781738,
"learning_rate": 8.160539049010952e-06,
"loss": 0.4841,
"step": 363000
},
{
"epoch": 0.8372502599118029,
"grad_norm": 1.2389174699783325,
"learning_rate": 8.137487004409856e-06,
"loss": 0.4965,
"step": 363200
},
{
"epoch": 0.8377113008038248,
"grad_norm": 1.1917423009872437,
"learning_rate": 8.11443495980876e-06,
"loss": 0.4591,
"step": 363400
},
{
"epoch": 0.8381723416958468,
"grad_norm": 1.3053388595581055,
"learning_rate": 8.091382915207665e-06,
"loss": 0.473,
"step": 363600
},
{
"epoch": 0.8386333825878687,
"grad_norm": 1.1159336566925049,
"learning_rate": 8.06833087060657e-06,
"loss": 0.5112,
"step": 363800
},
{
"epoch": 0.8390944234798905,
"grad_norm": 1.8432027101516724,
"learning_rate": 8.045278826005473e-06,
"loss": 0.4769,
"step": 364000
},
{
"epoch": 0.8395554643719124,
"grad_norm": 1.2790404558181763,
"learning_rate": 8.022226781404377e-06,
"loss": 0.4743,
"step": 364200
},
{
"epoch": 0.8400165052639343,
"grad_norm": 1.2240092754364014,
"learning_rate": 7.999174736803281e-06,
"loss": 0.5003,
"step": 364400
},
{
"epoch": 0.8404775461559563,
"grad_norm": 1.5568150281906128,
"learning_rate": 7.976122692202185e-06,
"loss": 0.5212,
"step": 364600
},
{
"epoch": 0.8409385870479782,
"grad_norm": 1.0242736339569092,
"learning_rate": 7.95307064760109e-06,
"loss": 0.5117,
"step": 364800
},
{
"epoch": 0.8413996279400001,
"grad_norm": 1.5472807884216309,
"learning_rate": 7.930018602999993e-06,
"loss": 0.5041,
"step": 365000
},
{
"epoch": 0.841860668832022,
"grad_norm": 1.990938663482666,
"learning_rate": 7.906966558398897e-06,
"loss": 0.4807,
"step": 365200
},
{
"epoch": 0.842321709724044,
"grad_norm": 1.9151630401611328,
"learning_rate": 7.883914513797801e-06,
"loss": 0.4795,
"step": 365400
},
{
"epoch": 0.8427827506160659,
"grad_norm": 1.0808899402618408,
"learning_rate": 7.860862469196705e-06,
"loss": 0.513,
"step": 365600
},
{
"epoch": 0.8432437915080878,
"grad_norm": 0.6713162660598755,
"learning_rate": 7.83781042459561e-06,
"loss": 0.4933,
"step": 365800
},
{
"epoch": 0.8437048324001097,
"grad_norm": 1.5635173320770264,
"learning_rate": 7.814758379994513e-06,
"loss": 0.4533,
"step": 366000
},
{
"epoch": 0.8441658732921317,
"grad_norm": 0.9642801880836487,
"learning_rate": 7.791706335393419e-06,
"loss": 0.5032,
"step": 366200
},
{
"epoch": 0.8446269141841536,
"grad_norm": 2.7265806198120117,
"learning_rate": 7.768654290792323e-06,
"loss": 0.4832,
"step": 366400
},
{
"epoch": 0.8450879550761755,
"grad_norm": 1.7652499675750732,
"learning_rate": 7.745602246191227e-06,
"loss": 0.5327,
"step": 366600
},
{
"epoch": 0.8455489959681974,
"grad_norm": 1.3054319620132446,
"learning_rate": 7.72255020159013e-06,
"loss": 0.4696,
"step": 366800
},
{
"epoch": 0.8460100368602194,
"grad_norm": 1.4413760900497437,
"learning_rate": 7.699498156989035e-06,
"loss": 0.5115,
"step": 367000
},
{
"epoch": 0.8464710777522413,
"grad_norm": 1.7205134630203247,
"learning_rate": 7.676446112387939e-06,
"loss": 0.5295,
"step": 367200
},
{
"epoch": 0.8469321186442631,
"grad_norm": 0.7298296689987183,
"learning_rate": 7.653394067786843e-06,
"loss": 0.4637,
"step": 367400
},
{
"epoch": 0.847393159536285,
"grad_norm": 1.1865860223770142,
"learning_rate": 7.630342023185747e-06,
"loss": 0.4559,
"step": 367600
},
{
"epoch": 0.847854200428307,
"grad_norm": 4.188174247741699,
"learning_rate": 7.607289978584652e-06,
"loss": 0.4763,
"step": 367800
},
{
"epoch": 0.8483152413203289,
"grad_norm": 1.0749932527542114,
"learning_rate": 7.584237933983556e-06,
"loss": 0.4536,
"step": 368000
},
{
"epoch": 0.8487762822123508,
"grad_norm": 2.213075637817383,
"learning_rate": 7.561185889382459e-06,
"loss": 0.5016,
"step": 368200
},
{
"epoch": 0.8492373231043727,
"grad_norm": 2.0269930362701416,
"learning_rate": 7.538133844781363e-06,
"loss": 0.4591,
"step": 368400
},
{
"epoch": 0.8496983639963946,
"grad_norm": 1.625063180923462,
"learning_rate": 7.515081800180267e-06,
"loss": 0.4562,
"step": 368600
},
{
"epoch": 0.8501594048884166,
"grad_norm": 1.1130571365356445,
"learning_rate": 7.4920297555791715e-06,
"loss": 0.4971,
"step": 368800
},
{
"epoch": 0.8506204457804385,
"grad_norm": 1.0837411880493164,
"learning_rate": 7.4689777109780755e-06,
"loss": 0.52,
"step": 369000
},
{
"epoch": 0.8510814866724604,
"grad_norm": 1.0088603496551514,
"learning_rate": 7.4459256663769795e-06,
"loss": 0.4737,
"step": 369200
},
{
"epoch": 0.8515425275644823,
"grad_norm": 1.067406177520752,
"learning_rate": 7.4228736217758835e-06,
"loss": 0.5613,
"step": 369400
},
{
"epoch": 0.8520035684565043,
"grad_norm": 1.7936733961105347,
"learning_rate": 7.399821577174788e-06,
"loss": 0.5067,
"step": 369600
},
{
"epoch": 0.8524646093485262,
"grad_norm": 0.9910215139389038,
"learning_rate": 7.376769532573692e-06,
"loss": 0.473,
"step": 369800
},
{
"epoch": 0.8529256502405481,
"grad_norm": 1.1868542432785034,
"learning_rate": 7.353717487972596e-06,
"loss": 0.4962,
"step": 370000
},
{
"epoch": 0.85338669113257,
"grad_norm": 0.6035569906234741,
"learning_rate": 7.330665443371499e-06,
"loss": 0.5243,
"step": 370200
},
{
"epoch": 0.853847732024592,
"grad_norm": 1.3407708406448364,
"learning_rate": 7.307613398770405e-06,
"loss": 0.445,
"step": 370400
},
{
"epoch": 0.8543087729166139,
"grad_norm": 2.3808753490448,
"learning_rate": 7.284561354169308e-06,
"loss": 0.4929,
"step": 370600
},
{
"epoch": 0.8547698138086357,
"grad_norm": 1.6823943853378296,
"learning_rate": 7.261509309568212e-06,
"loss": 0.5004,
"step": 370800
},
{
"epoch": 0.8552308547006576,
"grad_norm": 0.6995494365692139,
"learning_rate": 7.238457264967116e-06,
"loss": 0.5143,
"step": 371000
},
{
"epoch": 0.8556918955926796,
"grad_norm": 0.914682924747467,
"learning_rate": 7.215405220366021e-06,
"loss": 0.4528,
"step": 371200
},
{
"epoch": 0.8561529364847015,
"grad_norm": 0.5527245402336121,
"learning_rate": 7.192353175764925e-06,
"loss": 0.5004,
"step": 371400
},
{
"epoch": 0.8566139773767234,
"grad_norm": 1.3169046640396118,
"learning_rate": 7.169301131163829e-06,
"loss": 0.4956,
"step": 371600
},
{
"epoch": 0.8570750182687453,
"grad_norm": 1.4355896711349487,
"learning_rate": 7.146249086562733e-06,
"loss": 0.4683,
"step": 371800
},
{
"epoch": 0.8575360591607673,
"grad_norm": 1.7638542652130127,
"learning_rate": 7.123197041961636e-06,
"loss": 0.4969,
"step": 372000
},
{
"epoch": 0.8579971000527892,
"grad_norm": 0.9192449450492859,
"learning_rate": 7.100144997360542e-06,
"loss": 0.5414,
"step": 372200
},
{
"epoch": 0.8584581409448111,
"grad_norm": 0.7934924960136414,
"learning_rate": 7.077092952759446e-06,
"loss": 0.4668,
"step": 372400
},
{
"epoch": 0.858919181836833,
"grad_norm": 1.7283356189727783,
"learning_rate": 7.054040908158349e-06,
"loss": 0.4944,
"step": 372600
},
{
"epoch": 0.859380222728855,
"grad_norm": 0.7687679529190063,
"learning_rate": 7.030988863557253e-06,
"loss": 0.51,
"step": 372800
},
{
"epoch": 0.8598412636208769,
"grad_norm": 1.0831148624420166,
"learning_rate": 7.0079368189561585e-06,
"loss": 0.531,
"step": 373000
},
{
"epoch": 0.8603023045128988,
"grad_norm": 1.0071626901626587,
"learning_rate": 6.984884774355062e-06,
"loss": 0.4789,
"step": 373200
},
{
"epoch": 0.8607633454049207,
"grad_norm": 0.7966915369033813,
"learning_rate": 6.961832729753966e-06,
"loss": 0.4438,
"step": 373400
},
{
"epoch": 0.8612243862969426,
"grad_norm": 0.544999897480011,
"learning_rate": 6.93878068515287e-06,
"loss": 0.5262,
"step": 373600
},
{
"epoch": 0.8616854271889646,
"grad_norm": 1.592140555381775,
"learning_rate": 6.9157286405517745e-06,
"loss": 0.5089,
"step": 373800
},
{
"epoch": 0.8621464680809865,
"grad_norm": 1.578158974647522,
"learning_rate": 6.8926765959506784e-06,
"loss": 0.5181,
"step": 374000
},
{
"epoch": 0.8626075089730083,
"grad_norm": 1.4605205059051514,
"learning_rate": 6.869624551349582e-06,
"loss": 0.5081,
"step": 374200
},
{
"epoch": 0.8630685498650302,
"grad_norm": 2.263418436050415,
"learning_rate": 6.8465725067484856e-06,
"loss": 0.467,
"step": 374400
},
{
"epoch": 0.8635295907570522,
"grad_norm": 1.5185531377792358,
"learning_rate": 6.823520462147391e-06,
"loss": 0.5348,
"step": 374600
},
{
"epoch": 0.8639906316490741,
"grad_norm": 1.1345553398132324,
"learning_rate": 6.800468417546295e-06,
"loss": 0.4811,
"step": 374800
},
{
"epoch": 0.864451672541096,
"grad_norm": 1.926391363143921,
"learning_rate": 6.777416372945198e-06,
"loss": 0.5368,
"step": 375000
},
{
"epoch": 0.8649127134331179,
"grad_norm": 0.6592217087745667,
"learning_rate": 6.754364328344102e-06,
"loss": 0.4902,
"step": 375200
},
{
"epoch": 0.8653737543251399,
"grad_norm": 1.7800625562667847,
"learning_rate": 6.731312283743008e-06,
"loss": 0.4957,
"step": 375400
},
{
"epoch": 0.8658347952171618,
"grad_norm": 0.7634375095367432,
"learning_rate": 6.708260239141911e-06,
"loss": 0.4961,
"step": 375600
},
{
"epoch": 0.8662958361091837,
"grad_norm": 1.417075514793396,
"learning_rate": 6.685208194540815e-06,
"loss": 0.4946,
"step": 375800
},
{
"epoch": 0.8667568770012056,
"grad_norm": 1.4515326023101807,
"learning_rate": 6.662156149939719e-06,
"loss": 0.4915,
"step": 376000
},
{
"epoch": 0.8672179178932276,
"grad_norm": 0.6862966418266296,
"learning_rate": 6.639104105338623e-06,
"loss": 0.4808,
"step": 376200
},
{
"epoch": 0.8676789587852495,
"grad_norm": 1.4989879131317139,
"learning_rate": 6.616052060737528e-06,
"loss": 0.4978,
"step": 376400
},
{
"epoch": 0.8681399996772714,
"grad_norm": 1.7666966915130615,
"learning_rate": 6.593000016136432e-06,
"loss": 0.4961,
"step": 376600
},
{
"epoch": 0.8686010405692933,
"grad_norm": 1.286030888557434,
"learning_rate": 6.569947971535335e-06,
"loss": 0.4874,
"step": 376800
},
{
"epoch": 0.8690620814613153,
"grad_norm": 1.1866004467010498,
"learning_rate": 6.546895926934239e-06,
"loss": 0.488,
"step": 377000
},
{
"epoch": 0.8695231223533372,
"grad_norm": 1.996006965637207,
"learning_rate": 6.523843882333145e-06,
"loss": 0.4987,
"step": 377200
},
{
"epoch": 0.8699841632453591,
"grad_norm": 3.1626696586608887,
"learning_rate": 6.500791837732048e-06,
"loss": 0.4398,
"step": 377400
},
{
"epoch": 0.8704452041373809,
"grad_norm": 2.171281337738037,
"learning_rate": 6.477739793130952e-06,
"loss": 0.4559,
"step": 377600
},
{
"epoch": 0.8709062450294028,
"grad_norm": 1.2836635112762451,
"learning_rate": 6.454687748529856e-06,
"loss": 0.4576,
"step": 377800
},
{
"epoch": 0.8713672859214248,
"grad_norm": 4.639097213745117,
"learning_rate": 6.431635703928761e-06,
"loss": 0.4969,
"step": 378000
},
{
"epoch": 0.8718283268134467,
"grad_norm": 1.5262006521224976,
"learning_rate": 6.408583659327665e-06,
"loss": 0.4538,
"step": 378200
},
{
"epoch": 0.8722893677054686,
"grad_norm": 2.327629566192627,
"learning_rate": 6.3855316147265686e-06,
"loss": 0.4854,
"step": 378400
},
{
"epoch": 0.8727504085974905,
"grad_norm": 2.366154909133911,
"learning_rate": 6.3624795701254725e-06,
"loss": 0.5611,
"step": 378600
},
{
"epoch": 0.8732114494895125,
"grad_norm": 1.4881547689437866,
"learning_rate": 6.339427525524377e-06,
"loss": 0.5075,
"step": 378800
},
{
"epoch": 0.8736724903815344,
"grad_norm": 1.2280333042144775,
"learning_rate": 6.316375480923281e-06,
"loss": 0.5045,
"step": 379000
},
{
"epoch": 0.8741335312735563,
"grad_norm": 4.236263751983643,
"learning_rate": 6.2933234363221845e-06,
"loss": 0.4608,
"step": 379200
},
{
"epoch": 0.8745945721655782,
"grad_norm": 1.3050642013549805,
"learning_rate": 6.2702713917210885e-06,
"loss": 0.4287,
"step": 379400
},
{
"epoch": 0.8750556130576002,
"grad_norm": 2.5533287525177,
"learning_rate": 6.247219347119993e-06,
"loss": 0.5086,
"step": 379600
},
{
"epoch": 0.8755166539496221,
"grad_norm": 1.513671875,
"learning_rate": 6.224167302518897e-06,
"loss": 0.4632,
"step": 379800
},
{
"epoch": 0.875977694841644,
"grad_norm": 1.573878288269043,
"learning_rate": 6.201115257917801e-06,
"loss": 0.4616,
"step": 380000
},
{
"epoch": 0.8764387357336659,
"grad_norm": 1.5972181558609009,
"learning_rate": 6.178063213316705e-06,
"loss": 0.5206,
"step": 380200
},
{
"epoch": 0.8768997766256879,
"grad_norm": 1.171190857887268,
"learning_rate": 6.15501116871561e-06,
"loss": 0.4504,
"step": 380400
},
{
"epoch": 0.8773608175177098,
"grad_norm": 2.334261178970337,
"learning_rate": 6.131959124114513e-06,
"loss": 0.4412,
"step": 380600
},
{
"epoch": 0.8778218584097317,
"grad_norm": 1.540120005607605,
"learning_rate": 6.108907079513418e-06,
"loss": 0.4997,
"step": 380800
},
{
"epoch": 0.8782828993017535,
"grad_norm": 1.3362219333648682,
"learning_rate": 6.085855034912322e-06,
"loss": 0.4924,
"step": 381000
},
{
"epoch": 0.8787439401937754,
"grad_norm": 1.4779139757156372,
"learning_rate": 6.062802990311226e-06,
"loss": 0.5206,
"step": 381200
},
{
"epoch": 0.8792049810857974,
"grad_norm": 2.28874135017395,
"learning_rate": 6.03975094571013e-06,
"loss": 0.4775,
"step": 381400
},
{
"epoch": 0.8796660219778193,
"grad_norm": 0.9095715880393982,
"learning_rate": 6.016698901109035e-06,
"loss": 0.452,
"step": 381600
},
{
"epoch": 0.8801270628698412,
"grad_norm": 2.00390887260437,
"learning_rate": 5.993646856507938e-06,
"loss": 0.5132,
"step": 381800
},
{
"epoch": 0.8805881037618631,
"grad_norm": 1.8259698152542114,
"learning_rate": 5.970594811906843e-06,
"loss": 0.4957,
"step": 382000
},
{
"epoch": 0.8810491446538851,
"grad_norm": 1.8643205165863037,
"learning_rate": 5.947542767305747e-06,
"loss": 0.4866,
"step": 382200
},
{
"epoch": 0.881510185545907,
"grad_norm": 1.181175708770752,
"learning_rate": 5.924490722704651e-06,
"loss": 0.4836,
"step": 382400
},
{
"epoch": 0.8819712264379289,
"grad_norm": 0.8782649040222168,
"learning_rate": 5.901438678103555e-06,
"loss": 0.4407,
"step": 382600
},
{
"epoch": 0.8824322673299508,
"grad_norm": 0.6341625452041626,
"learning_rate": 5.878386633502459e-06,
"loss": 0.4339,
"step": 382800
},
{
"epoch": 0.8828933082219728,
"grad_norm": 1.5220824480056763,
"learning_rate": 5.855334588901363e-06,
"loss": 0.4594,
"step": 383000
},
{
"epoch": 0.8833543491139947,
"grad_norm": 1.0653526782989502,
"learning_rate": 5.832282544300267e-06,
"loss": 0.4825,
"step": 383200
},
{
"epoch": 0.8838153900060166,
"grad_norm": 2.4460973739624023,
"learning_rate": 5.8092304996991715e-06,
"loss": 0.4756,
"step": 383400
},
{
"epoch": 0.8842764308980385,
"grad_norm": 2.1946046352386475,
"learning_rate": 5.786178455098075e-06,
"loss": 0.4698,
"step": 383600
},
{
"epoch": 0.8847374717900605,
"grad_norm": 0.8791565895080566,
"learning_rate": 5.7631264104969794e-06,
"loss": 0.4769,
"step": 383800
},
{
"epoch": 0.8851985126820824,
"grad_norm": 1.2844878435134888,
"learning_rate": 5.7400743658958834e-06,
"loss": 0.5021,
"step": 384000
},
{
"epoch": 0.8856595535741043,
"grad_norm": 1.0738441944122314,
"learning_rate": 5.717022321294787e-06,
"loss": 0.4685,
"step": 384200
},
{
"epoch": 0.8861205944661261,
"grad_norm": 1.2508662939071655,
"learning_rate": 5.693970276693691e-06,
"loss": 0.4716,
"step": 384400
},
{
"epoch": 0.886581635358148,
"grad_norm": 0.8982871174812317,
"learning_rate": 5.670918232092596e-06,
"loss": 0.509,
"step": 384600
},
{
"epoch": 0.88704267625017,
"grad_norm": 0.6702489852905273,
"learning_rate": 5.647866187491499e-06,
"loss": 0.5054,
"step": 384800
},
{
"epoch": 0.8875037171421919,
"grad_norm": 0.8494447469711304,
"learning_rate": 5.624814142890404e-06,
"loss": 0.4878,
"step": 385000
},
{
"epoch": 0.8879647580342138,
"grad_norm": 0.8795982599258423,
"learning_rate": 5.601762098289308e-06,
"loss": 0.4457,
"step": 385200
},
{
"epoch": 0.8884257989262357,
"grad_norm": 1.1568052768707275,
"learning_rate": 5.578710053688212e-06,
"loss": 0.4967,
"step": 385400
},
{
"epoch": 0.8888868398182577,
"grad_norm": 0.8400896191596985,
"learning_rate": 5.555658009087116e-06,
"loss": 0.4741,
"step": 385600
},
{
"epoch": 0.8893478807102796,
"grad_norm": 2.020911693572998,
"learning_rate": 5.532605964486021e-06,
"loss": 0.4592,
"step": 385800
},
{
"epoch": 0.8898089216023015,
"grad_norm": 1.735339641571045,
"learning_rate": 5.509553919884924e-06,
"loss": 0.4947,
"step": 386000
},
{
"epoch": 0.8902699624943234,
"grad_norm": 1.34779953956604,
"learning_rate": 5.486501875283829e-06,
"loss": 0.5204,
"step": 386200
},
{
"epoch": 0.8907310033863454,
"grad_norm": 1.5430375337600708,
"learning_rate": 5.463449830682733e-06,
"loss": 0.4742,
"step": 386400
},
{
"epoch": 0.8911920442783673,
"grad_norm": 1.2541803121566772,
"learning_rate": 5.440397786081636e-06,
"loss": 0.4613,
"step": 386600
},
{
"epoch": 0.8916530851703892,
"grad_norm": 1.007149577140808,
"learning_rate": 5.417345741480541e-06,
"loss": 0.4892,
"step": 386800
},
{
"epoch": 0.8921141260624111,
"grad_norm": 0.808237612247467,
"learning_rate": 5.394293696879445e-06,
"loss": 0.5208,
"step": 387000
},
{
"epoch": 0.8925751669544331,
"grad_norm": 0.8351776599884033,
"learning_rate": 5.371241652278349e-06,
"loss": 0.4799,
"step": 387200
},
{
"epoch": 0.893036207846455,
"grad_norm": 1.6771140098571777,
"learning_rate": 5.348189607677253e-06,
"loss": 0.4752,
"step": 387400
},
{
"epoch": 0.8934972487384769,
"grad_norm": 0.966846227645874,
"learning_rate": 5.325137563076158e-06,
"loss": 0.4927,
"step": 387600
},
{
"epoch": 0.8939582896304987,
"grad_norm": 1.701539397239685,
"learning_rate": 5.302085518475061e-06,
"loss": 0.524,
"step": 387800
},
{
"epoch": 0.8944193305225207,
"grad_norm": 0.4714783728122711,
"learning_rate": 5.279033473873966e-06,
"loss": 0.499,
"step": 388000
},
{
"epoch": 0.8948803714145426,
"grad_norm": 1.1191890239715576,
"learning_rate": 5.2559814292728696e-06,
"loss": 0.5277,
"step": 388200
},
{
"epoch": 0.8953414123065645,
"grad_norm": 1.3981695175170898,
"learning_rate": 5.2329293846717736e-06,
"loss": 0.4648,
"step": 388400
},
{
"epoch": 0.8958024531985864,
"grad_norm": 1.0515044927597046,
"learning_rate": 5.2098773400706775e-06,
"loss": 0.4638,
"step": 388600
},
{
"epoch": 0.8962634940906083,
"grad_norm": 0.9398881196975708,
"learning_rate": 5.186825295469582e-06,
"loss": 0.4993,
"step": 388800
},
{
"epoch": 0.8967245349826303,
"grad_norm": 0.9516793489456177,
"learning_rate": 5.163773250868486e-06,
"loss": 0.502,
"step": 389000
},
{
"epoch": 0.8971855758746522,
"grad_norm": 2.8746252059936523,
"learning_rate": 5.14072120626739e-06,
"loss": 0.4688,
"step": 389200
},
{
"epoch": 0.8976466167666741,
"grad_norm": 0.9749366641044617,
"learning_rate": 5.117669161666294e-06,
"loss": 0.4976,
"step": 389400
},
{
"epoch": 0.898107657658696,
"grad_norm": 1.4214197397232056,
"learning_rate": 5.094617117065198e-06,
"loss": 0.5034,
"step": 389600
},
{
"epoch": 0.898568698550718,
"grad_norm": 1.5979713201522827,
"learning_rate": 5.071565072464102e-06,
"loss": 0.4687,
"step": 389800
},
{
"epoch": 0.8990297394427399,
"grad_norm": 1.1002912521362305,
"learning_rate": 5.048513027863007e-06,
"loss": 0.4935,
"step": 390000
},
{
"epoch": 0.8994907803347618,
"grad_norm": 1.4925017356872559,
"learning_rate": 5.025460983261911e-06,
"loss": 0.4793,
"step": 390200
},
{
"epoch": 0.8999518212267837,
"grad_norm": 1.721877932548523,
"learning_rate": 5.002408938660814e-06,
"loss": 0.4216,
"step": 390400
},
{
"epoch": 0.9004128621188057,
"grad_norm": 1.0198794603347778,
"learning_rate": 4.979356894059719e-06,
"loss": 0.5106,
"step": 390600
},
{
"epoch": 0.9008739030108276,
"grad_norm": 1.2488328218460083,
"learning_rate": 4.956304849458623e-06,
"loss": 0.4419,
"step": 390800
},
{
"epoch": 0.9013349439028495,
"grad_norm": 1.1686707735061646,
"learning_rate": 4.933252804857527e-06,
"loss": 0.5004,
"step": 391000
},
{
"epoch": 0.9017959847948713,
"grad_norm": 2.1322028636932373,
"learning_rate": 4.910200760256431e-06,
"loss": 0.449,
"step": 391200
},
{
"epoch": 0.9022570256868933,
"grad_norm": 1.7106928825378418,
"learning_rate": 4.887148715655336e-06,
"loss": 0.4574,
"step": 391400
},
{
"epoch": 0.9027180665789152,
"grad_norm": 0.9263075590133667,
"learning_rate": 4.864096671054239e-06,
"loss": 0.4774,
"step": 391600
},
{
"epoch": 0.9031791074709371,
"grad_norm": 1.4855661392211914,
"learning_rate": 4.841044626453144e-06,
"loss": 0.4855,
"step": 391800
},
{
"epoch": 0.903640148362959,
"grad_norm": 1.2408193349838257,
"learning_rate": 4.817992581852048e-06,
"loss": 0.5039,
"step": 392000
},
{
"epoch": 0.904101189254981,
"grad_norm": 0.6845735311508179,
"learning_rate": 4.794940537250952e-06,
"loss": 0.5082,
"step": 392200
},
{
"epoch": 0.9045622301470029,
"grad_norm": 1.4098901748657227,
"learning_rate": 4.771888492649856e-06,
"loss": 0.5194,
"step": 392400
},
{
"epoch": 0.9050232710390248,
"grad_norm": 1.3616442680358887,
"learning_rate": 4.7488364480487605e-06,
"loss": 0.5032,
"step": 392600
},
{
"epoch": 0.9054843119310467,
"grad_norm": 1.0427989959716797,
"learning_rate": 4.725784403447664e-06,
"loss": 0.4969,
"step": 392800
},
{
"epoch": 0.9059453528230687,
"grad_norm": 1.2512778043746948,
"learning_rate": 4.7027323588465685e-06,
"loss": 0.4729,
"step": 393000
},
{
"epoch": 0.9064063937150906,
"grad_norm": 1.1229169368743896,
"learning_rate": 4.6796803142454725e-06,
"loss": 0.5122,
"step": 393200
},
{
"epoch": 0.9068674346071125,
"grad_norm": 1.5654805898666382,
"learning_rate": 4.6566282696443765e-06,
"loss": 0.4615,
"step": 393400
},
{
"epoch": 0.9073284754991344,
"grad_norm": 0.5866159796714783,
"learning_rate": 4.6335762250432805e-06,
"loss": 0.4916,
"step": 393600
},
{
"epoch": 0.9077895163911563,
"grad_norm": 1.5592825412750244,
"learning_rate": 4.6105241804421844e-06,
"loss": 0.4996,
"step": 393800
},
{
"epoch": 0.9082505572831783,
"grad_norm": 0.9208193421363831,
"learning_rate": 4.587472135841088e-06,
"loss": 0.4957,
"step": 394000
},
{
"epoch": 0.9087115981752002,
"grad_norm": 1.1779547929763794,
"learning_rate": 4.564420091239992e-06,
"loss": 0.4633,
"step": 394200
},
{
"epoch": 0.9091726390672221,
"grad_norm": 1.6727235317230225,
"learning_rate": 4.541368046638897e-06,
"loss": 0.4862,
"step": 394400
},
{
"epoch": 0.9096336799592439,
"grad_norm": 1.5873490571975708,
"learning_rate": 4.5183160020378e-06,
"loss": 0.4517,
"step": 394600
},
{
"epoch": 0.9100947208512659,
"grad_norm": 1.310510277748108,
"learning_rate": 4.495263957436705e-06,
"loss": 0.5331,
"step": 394800
},
{
"epoch": 0.9105557617432878,
"grad_norm": 0.5663114190101624,
"learning_rate": 4.472211912835609e-06,
"loss": 0.4823,
"step": 395000
},
{
"epoch": 0.9110168026353097,
"grad_norm": 1.231022596359253,
"learning_rate": 4.449159868234513e-06,
"loss": 0.4842,
"step": 395200
},
{
"epoch": 0.9114778435273316,
"grad_norm": 1.241389274597168,
"learning_rate": 4.426107823633417e-06,
"loss": 0.4604,
"step": 395400
},
{
"epoch": 0.9119388844193536,
"grad_norm": 0.6958038210868835,
"learning_rate": 4.403055779032322e-06,
"loss": 0.4894,
"step": 395600
},
{
"epoch": 0.9123999253113755,
"grad_norm": 0.7018533945083618,
"learning_rate": 4.380003734431225e-06,
"loss": 0.4328,
"step": 395800
},
{
"epoch": 0.9128609662033974,
"grad_norm": 1.3242965936660767,
"learning_rate": 4.35695168983013e-06,
"loss": 0.4759,
"step": 396000
},
{
"epoch": 0.9133220070954193,
"grad_norm": 1.1554487943649292,
"learning_rate": 4.333899645229034e-06,
"loss": 0.5244,
"step": 396200
},
{
"epoch": 0.9137830479874413,
"grad_norm": 1.8800641298294067,
"learning_rate": 4.310847600627938e-06,
"loss": 0.4965,
"step": 396400
},
{
"epoch": 0.9142440888794632,
"grad_norm": 4.930298328399658,
"learning_rate": 4.287795556026842e-06,
"loss": 0.4721,
"step": 396600
},
{
"epoch": 0.9147051297714851,
"grad_norm": 1.5765228271484375,
"learning_rate": 4.264743511425747e-06,
"loss": 0.5089,
"step": 396800
},
{
"epoch": 0.915166170663507,
"grad_norm": 0.436431348323822,
"learning_rate": 4.24169146682465e-06,
"loss": 0.4522,
"step": 397000
},
{
"epoch": 0.915627211555529,
"grad_norm": 1.2564866542816162,
"learning_rate": 4.218639422223555e-06,
"loss": 0.4901,
"step": 397200
},
{
"epoch": 0.9160882524475509,
"grad_norm": 1.2301688194274902,
"learning_rate": 4.195587377622459e-06,
"loss": 0.4858,
"step": 397400
},
{
"epoch": 0.9165492933395728,
"grad_norm": 3.086254358291626,
"learning_rate": 4.172535333021363e-06,
"loss": 0.4907,
"step": 397600
},
{
"epoch": 0.9170103342315947,
"grad_norm": 1.3928074836730957,
"learning_rate": 4.149483288420267e-06,
"loss": 0.5467,
"step": 397800
},
{
"epoch": 0.9174713751236165,
"grad_norm": 0.784092366695404,
"learning_rate": 4.126431243819171e-06,
"loss": 0.4641,
"step": 398000
},
{
"epoch": 0.9179324160156385,
"grad_norm": 0.6679478287696838,
"learning_rate": 4.1033791992180746e-06,
"loss": 0.5061,
"step": 398200
},
{
"epoch": 0.9183934569076604,
"grad_norm": 1.8378559350967407,
"learning_rate": 4.0803271546169785e-06,
"loss": 0.4525,
"step": 398400
},
{
"epoch": 0.9188544977996823,
"grad_norm": 1.1620184183120728,
"learning_rate": 4.057275110015883e-06,
"loss": 0.561,
"step": 398600
},
{
"epoch": 0.9193155386917042,
"grad_norm": 1.5776022672653198,
"learning_rate": 4.034223065414787e-06,
"loss": 0.4991,
"step": 398800
},
{
"epoch": 0.9197765795837262,
"grad_norm": 1.4153554439544678,
"learning_rate": 4.011171020813691e-06,
"loss": 0.5109,
"step": 399000
},
{
"epoch": 0.9202376204757481,
"grad_norm": 1.5178178548812866,
"learning_rate": 3.988118976212595e-06,
"loss": 0.4836,
"step": 399200
},
{
"epoch": 0.92069866136777,
"grad_norm": 0.7913076281547546,
"learning_rate": 3.965066931611499e-06,
"loss": 0.4699,
"step": 399400
},
{
"epoch": 0.9211597022597919,
"grad_norm": 0.7987996339797974,
"learning_rate": 3.942014887010403e-06,
"loss": 0.5013,
"step": 399600
},
{
"epoch": 0.9216207431518139,
"grad_norm": 0.8386745452880859,
"learning_rate": 3.918962842409308e-06,
"loss": 0.4956,
"step": 399800
},
{
"epoch": 0.9220817840438358,
"grad_norm": 0.8059350252151489,
"learning_rate": 3.895910797808212e-06,
"loss": 0.502,
"step": 400000
},
{
"epoch": 0.9220817840438358,
"eval_loss": 0.4786904454231262,
"eval_runtime": 144.2001,
"eval_samples_per_second": 30.388,
"eval_steps_per_second": 30.388,
"step": 400000
},
{
"epoch": 0.9225428249358577,
"grad_norm": 2.3382468223571777,
"learning_rate": 3.872858753207116e-06,
"loss": 0.5051,
"step": 400200
},
{
"epoch": 0.9230038658278796,
"grad_norm": 0.6655104756355286,
"learning_rate": 3.84980670860602e-06,
"loss": 0.4295,
"step": 400400
},
{
"epoch": 0.9234649067199016,
"grad_norm": 2.225646734237671,
"learning_rate": 3.826754664004924e-06,
"loss": 0.4922,
"step": 400600
},
{
"epoch": 0.9239259476119235,
"grad_norm": 1.4331623315811157,
"learning_rate": 3.803702619403828e-06,
"loss": 0.485,
"step": 400800
},
{
"epoch": 0.9243869885039454,
"grad_norm": 1.335250973701477,
"learning_rate": 3.780650574802732e-06,
"loss": 0.5001,
"step": 401000
},
{
"epoch": 0.9248480293959673,
"grad_norm": 1.107587218284607,
"learning_rate": 3.7575985302016364e-06,
"loss": 0.4745,
"step": 401200
},
{
"epoch": 0.9253090702879891,
"grad_norm": 1.2675089836120605,
"learning_rate": 3.7345464856005404e-06,
"loss": 0.4863,
"step": 401400
},
{
"epoch": 0.9257701111800111,
"grad_norm": 1.018123984336853,
"learning_rate": 3.7114944409994448e-06,
"loss": 0.4784,
"step": 401600
},
{
"epoch": 0.926231152072033,
"grad_norm": 0.6232244968414307,
"learning_rate": 3.6884423963983483e-06,
"loss": 0.4996,
"step": 401800
},
{
"epoch": 0.9266921929640549,
"grad_norm": 1.347090482711792,
"learning_rate": 3.6653903517972527e-06,
"loss": 0.4704,
"step": 402000
},
{
"epoch": 0.9271532338560768,
"grad_norm": 0.9146246314048767,
"learning_rate": 3.6423383071961567e-06,
"loss": 0.4572,
"step": 402200
},
{
"epoch": 0.9276142747480988,
"grad_norm": 1.380365014076233,
"learning_rate": 3.619286262595061e-06,
"loss": 0.5064,
"step": 402400
},
{
"epoch": 0.9280753156401207,
"grad_norm": 1.536133050918579,
"learning_rate": 3.596234217993965e-06,
"loss": 0.481,
"step": 402600
},
{
"epoch": 0.9285363565321426,
"grad_norm": 1.290397047996521,
"learning_rate": 3.5731821733928695e-06,
"loss": 0.4891,
"step": 402800
},
{
"epoch": 0.9289973974241645,
"grad_norm": 2.4600837230682373,
"learning_rate": 3.550130128791773e-06,
"loss": 0.4528,
"step": 403000
},
{
"epoch": 0.9294584383161865,
"grad_norm": 1.7255617380142212,
"learning_rate": 3.5270780841906775e-06,
"loss": 0.4647,
"step": 403200
},
{
"epoch": 0.9299194792082084,
"grad_norm": 1.0559278726577759,
"learning_rate": 3.5040260395895815e-06,
"loss": 0.5023,
"step": 403400
},
{
"epoch": 0.9303805201002303,
"grad_norm": 0.7714131474494934,
"learning_rate": 3.480973994988486e-06,
"loss": 0.449,
"step": 403600
},
{
"epoch": 0.9308415609922522,
"grad_norm": 1.1090224981307983,
"learning_rate": 3.45792195038739e-06,
"loss": 0.5151,
"step": 403800
},
{
"epoch": 0.9313026018842742,
"grad_norm": 1.1689685583114624,
"learning_rate": 3.4348699057862943e-06,
"loss": 0.4831,
"step": 404000
},
{
"epoch": 0.9317636427762961,
"grad_norm": 1.7004835605621338,
"learning_rate": 3.411817861185198e-06,
"loss": 0.4517,
"step": 404200
},
{
"epoch": 0.932224683668318,
"grad_norm": 1.8636317253112793,
"learning_rate": 3.3887658165841022e-06,
"loss": 0.4642,
"step": 404400
},
{
"epoch": 0.9326857245603398,
"grad_norm": 1.8215795755386353,
"learning_rate": 3.365713771983006e-06,
"loss": 0.4997,
"step": 404600
},
{
"epoch": 0.9331467654523617,
"grad_norm": 1.2667629718780518,
"learning_rate": 3.34266172738191e-06,
"loss": 0.475,
"step": 404800
},
{
"epoch": 0.9336078063443837,
"grad_norm": 1.1865830421447754,
"learning_rate": 3.3196096827808146e-06,
"loss": 0.495,
"step": 405000
},
{
"epoch": 0.9340688472364056,
"grad_norm": 0.7197660803794861,
"learning_rate": 3.296557638179718e-06,
"loss": 0.4726,
"step": 405200
},
{
"epoch": 0.9345298881284275,
"grad_norm": 0.5987845659255981,
"learning_rate": 3.2735055935786225e-06,
"loss": 0.4799,
"step": 405400
},
{
"epoch": 0.9349909290204494,
"grad_norm": 3.0414366722106934,
"learning_rate": 3.2504535489775265e-06,
"loss": 0.5096,
"step": 405600
},
{
"epoch": 0.9354519699124714,
"grad_norm": 1.372909426689148,
"learning_rate": 3.227401504376431e-06,
"loss": 0.4626,
"step": 405800
},
{
"epoch": 0.9359130108044933,
"grad_norm": 1.5821083784103394,
"learning_rate": 3.204349459775335e-06,
"loss": 0.4574,
"step": 406000
},
{
"epoch": 0.9363740516965152,
"grad_norm": 0.5546638369560242,
"learning_rate": 3.1812974151742393e-06,
"loss": 0.5013,
"step": 406200
},
{
"epoch": 0.9368350925885371,
"grad_norm": 1.4737298488616943,
"learning_rate": 3.158245370573143e-06,
"loss": 0.473,
"step": 406400
},
{
"epoch": 0.9372961334805591,
"grad_norm": 1.4075927734375,
"learning_rate": 3.1351933259720473e-06,
"loss": 0.4748,
"step": 406600
},
{
"epoch": 0.937757174372581,
"grad_norm": 1.2418146133422852,
"learning_rate": 3.1121412813709513e-06,
"loss": 0.5203,
"step": 406800
},
{
"epoch": 0.9382182152646029,
"grad_norm": 2.0183310508728027,
"learning_rate": 3.0890892367698552e-06,
"loss": 0.5137,
"step": 407000
},
{
"epoch": 0.9386792561566248,
"grad_norm": 0.8458141684532166,
"learning_rate": 3.0660371921687596e-06,
"loss": 0.4607,
"step": 407200
},
{
"epoch": 0.9391402970486468,
"grad_norm": 1.4068762063980103,
"learning_rate": 3.0429851475676636e-06,
"loss": 0.461,
"step": 407400
},
{
"epoch": 0.9396013379406687,
"grad_norm": 4.581197261810303,
"learning_rate": 3.0199331029665676e-06,
"loss": 0.4692,
"step": 407600
},
{
"epoch": 0.9400623788326906,
"grad_norm": 2.506011724472046,
"learning_rate": 2.996881058365472e-06,
"loss": 0.4364,
"step": 407800
},
{
"epoch": 0.9405234197247124,
"grad_norm": 1.3012163639068604,
"learning_rate": 2.973829013764376e-06,
"loss": 0.466,
"step": 408000
},
{
"epoch": 0.9409844606167344,
"grad_norm": 0.9710767269134521,
"learning_rate": 2.95077696916328e-06,
"loss": 0.4706,
"step": 408200
},
{
"epoch": 0.9414455015087563,
"grad_norm": 1.5749614238739014,
"learning_rate": 2.9277249245621844e-06,
"loss": 0.4507,
"step": 408400
},
{
"epoch": 0.9419065424007782,
"grad_norm": 1.1157305240631104,
"learning_rate": 2.9046728799610884e-06,
"loss": 0.4798,
"step": 408600
},
{
"epoch": 0.9423675832928001,
"grad_norm": 1.0349030494689941,
"learning_rate": 2.8816208353599923e-06,
"loss": 0.4385,
"step": 408800
},
{
"epoch": 0.942828624184822,
"grad_norm": 0.7431963682174683,
"learning_rate": 2.8585687907588967e-06,
"loss": 0.4558,
"step": 409000
},
{
"epoch": 0.943289665076844,
"grad_norm": 1.7582494020462036,
"learning_rate": 2.8355167461578007e-06,
"loss": 0.4805,
"step": 409200
},
{
"epoch": 0.9437507059688659,
"grad_norm": 1.688696026802063,
"learning_rate": 2.8124647015567047e-06,
"loss": 0.462,
"step": 409400
},
{
"epoch": 0.9442117468608878,
"grad_norm": 2.057497262954712,
"learning_rate": 2.789412656955609e-06,
"loss": 0.444,
"step": 409600
},
{
"epoch": 0.9446727877529097,
"grad_norm": 1.7381998300552368,
"learning_rate": 2.766360612354513e-06,
"loss": 0.4716,
"step": 409800
},
{
"epoch": 0.9451338286449317,
"grad_norm": 1.4783737659454346,
"learning_rate": 2.743308567753417e-06,
"loss": 0.507,
"step": 410000
},
{
"epoch": 0.9455948695369536,
"grad_norm": 2.4599671363830566,
"learning_rate": 2.720256523152321e-06,
"loss": 0.4724,
"step": 410200
},
{
"epoch": 0.9460559104289755,
"grad_norm": 1.6075626611709595,
"learning_rate": 2.697204478551225e-06,
"loss": 0.486,
"step": 410400
},
{
"epoch": 0.9465169513209974,
"grad_norm": 2.0719876289367676,
"learning_rate": 2.674152433950129e-06,
"loss": 0.4248,
"step": 410600
},
{
"epoch": 0.9469779922130194,
"grad_norm": 2.098074197769165,
"learning_rate": 2.6511003893490334e-06,
"loss": 0.4426,
"step": 410800
},
{
"epoch": 0.9474390331050413,
"grad_norm": 1.1049730777740479,
"learning_rate": 2.6280483447479374e-06,
"loss": 0.4627,
"step": 411000
},
{
"epoch": 0.9479000739970632,
"grad_norm": 0.8188923001289368,
"learning_rate": 2.6049963001468414e-06,
"loss": 0.4888,
"step": 411200
},
{
"epoch": 0.948361114889085,
"grad_norm": 0.9490247368812561,
"learning_rate": 2.581944255545746e-06,
"loss": 0.4942,
"step": 411400
},
{
"epoch": 0.948822155781107,
"grad_norm": 1.2527036666870117,
"learning_rate": 2.5588922109446498e-06,
"loss": 0.4867,
"step": 411600
},
{
"epoch": 0.9492831966731289,
"grad_norm": 0.47197577357292175,
"learning_rate": 2.5358401663435538e-06,
"loss": 0.464,
"step": 411800
},
{
"epoch": 0.9497442375651508,
"grad_norm": 1.5637418031692505,
"learning_rate": 2.512788121742458e-06,
"loss": 0.4771,
"step": 412000
},
{
"epoch": 0.9502052784571727,
"grad_norm": 0.9019871950149536,
"learning_rate": 2.489736077141362e-06,
"loss": 0.4814,
"step": 412200
},
{
"epoch": 0.9506663193491947,
"grad_norm": 0.7962387800216675,
"learning_rate": 2.466684032540266e-06,
"loss": 0.488,
"step": 412400
},
{
"epoch": 0.9511273602412166,
"grad_norm": 0.810796320438385,
"learning_rate": 2.4436319879391705e-06,
"loss": 0.4098,
"step": 412600
},
{
"epoch": 0.9515884011332385,
"grad_norm": 1.6829875707626343,
"learning_rate": 2.4205799433380745e-06,
"loss": 0.4659,
"step": 412800
},
{
"epoch": 0.9520494420252604,
"grad_norm": 0.9560777544975281,
"learning_rate": 2.3975278987369785e-06,
"loss": 0.4853,
"step": 413000
},
{
"epoch": 0.9525104829172824,
"grad_norm": 2.32140851020813,
"learning_rate": 2.374475854135883e-06,
"loss": 0.5007,
"step": 413200
},
{
"epoch": 0.9529715238093043,
"grad_norm": 1.2261013984680176,
"learning_rate": 2.351423809534787e-06,
"loss": 0.5207,
"step": 413400
},
{
"epoch": 0.9534325647013262,
"grad_norm": 1.996286153793335,
"learning_rate": 2.3283717649336913e-06,
"loss": 0.5164,
"step": 413600
},
{
"epoch": 0.9538936055933481,
"grad_norm": 1.2934073209762573,
"learning_rate": 2.305319720332595e-06,
"loss": 0.4809,
"step": 413800
},
{
"epoch": 0.95435464648537,
"grad_norm": 0.7042099833488464,
"learning_rate": 2.282267675731499e-06,
"loss": 0.4893,
"step": 414000
},
{
"epoch": 0.954815687377392,
"grad_norm": 1.0539119243621826,
"learning_rate": 2.2592156311304032e-06,
"loss": 0.5039,
"step": 414200
},
{
"epoch": 0.9552767282694139,
"grad_norm": 1.2834453582763672,
"learning_rate": 2.236163586529307e-06,
"loss": 0.5054,
"step": 414400
},
{
"epoch": 0.9557377691614358,
"grad_norm": 0.6034151911735535,
"learning_rate": 2.213111541928211e-06,
"loss": 0.4729,
"step": 414600
},
{
"epoch": 0.9561988100534576,
"grad_norm": 2.897521734237671,
"learning_rate": 2.1900594973271156e-06,
"loss": 0.4549,
"step": 414800
},
{
"epoch": 0.9566598509454796,
"grad_norm": 1.0604009628295898,
"learning_rate": 2.1670074527260196e-06,
"loss": 0.4573,
"step": 415000
},
{
"epoch": 0.9571208918375015,
"grad_norm": 0.8515986800193787,
"learning_rate": 2.1439554081249235e-06,
"loss": 0.4521,
"step": 415200
},
{
"epoch": 0.9575819327295234,
"grad_norm": 1.5794425010681152,
"learning_rate": 2.120903363523828e-06,
"loss": 0.5358,
"step": 415400
},
{
"epoch": 0.9580429736215453,
"grad_norm": 1.2372163534164429,
"learning_rate": 2.097851318922732e-06,
"loss": 0.4345,
"step": 415600
},
{
"epoch": 0.9585040145135673,
"grad_norm": 2.6105234622955322,
"learning_rate": 2.074799274321636e-06,
"loss": 0.5035,
"step": 415800
},
{
"epoch": 0.9589650554055892,
"grad_norm": 0.8453428149223328,
"learning_rate": 2.0517472297205403e-06,
"loss": 0.4723,
"step": 416000
},
{
"epoch": 0.9594260962976111,
"grad_norm": 1.2745046615600586,
"learning_rate": 2.0286951851194443e-06,
"loss": 0.4754,
"step": 416200
},
{
"epoch": 0.959887137189633,
"grad_norm": 1.6135262250900269,
"learning_rate": 2.0056431405183483e-06,
"loss": 0.4801,
"step": 416400
},
{
"epoch": 0.960348178081655,
"grad_norm": 1.6727254390716553,
"learning_rate": 1.9825910959172527e-06,
"loss": 0.4818,
"step": 416600
},
{
"epoch": 0.9608092189736769,
"grad_norm": 1.026893973350525,
"learning_rate": 1.9595390513161567e-06,
"loss": 0.4383,
"step": 416800
},
{
"epoch": 0.9612702598656988,
"grad_norm": 1.3765745162963867,
"learning_rate": 1.9364870067150607e-06,
"loss": 0.4625,
"step": 417000
},
{
"epoch": 0.9617313007577207,
"grad_norm": 1.6205723285675049,
"learning_rate": 1.913434962113965e-06,
"loss": 0.4705,
"step": 417200
},
{
"epoch": 0.9621923416497427,
"grad_norm": 1.5419261455535889,
"learning_rate": 1.8903829175128688e-06,
"loss": 0.5056,
"step": 417400
},
{
"epoch": 0.9626533825417646,
"grad_norm": 0.6733845472335815,
"learning_rate": 1.8673308729117728e-06,
"loss": 0.4988,
"step": 417600
},
{
"epoch": 0.9631144234337865,
"grad_norm": 1.0940847396850586,
"learning_rate": 1.844278828310677e-06,
"loss": 0.4654,
"step": 417800
},
{
"epoch": 0.9635754643258084,
"grad_norm": 1.1737462282180786,
"learning_rate": 1.8212267837095812e-06,
"loss": 0.4574,
"step": 418000
},
{
"epoch": 0.9640365052178302,
"grad_norm": 1.6984807252883911,
"learning_rate": 1.7981747391084852e-06,
"loss": 0.4525,
"step": 418200
},
{
"epoch": 0.9644975461098522,
"grad_norm": 1.8462785482406616,
"learning_rate": 1.7751226945073894e-06,
"loss": 0.4889,
"step": 418400
},
{
"epoch": 0.9649585870018741,
"grad_norm": 1.7319543361663818,
"learning_rate": 1.7520706499062936e-06,
"loss": 0.4821,
"step": 418600
},
{
"epoch": 0.965419627893896,
"grad_norm": 2.190124034881592,
"learning_rate": 1.7290186053051975e-06,
"loss": 0.5076,
"step": 418800
},
{
"epoch": 0.9658806687859179,
"grad_norm": 1.911737322807312,
"learning_rate": 1.7059665607041017e-06,
"loss": 0.4905,
"step": 419000
},
{
"epoch": 0.9663417096779399,
"grad_norm": 3.9163506031036377,
"learning_rate": 1.682914516103006e-06,
"loss": 0.469,
"step": 419200
},
{
"epoch": 0.9668027505699618,
"grad_norm": 1.2527137994766235,
"learning_rate": 1.65986247150191e-06,
"loss": 0.4247,
"step": 419400
},
{
"epoch": 0.9672637914619837,
"grad_norm": 0.49080777168273926,
"learning_rate": 1.636810426900814e-06,
"loss": 0.4783,
"step": 419600
},
{
"epoch": 0.9677248323540056,
"grad_norm": 1.7566986083984375,
"learning_rate": 1.6137583822997183e-06,
"loss": 0.5296,
"step": 419800
},
{
"epoch": 0.9681858732460276,
"grad_norm": 0.9041785597801208,
"learning_rate": 1.5907063376986223e-06,
"loss": 0.4821,
"step": 420000
},
{
"epoch": 0.9686469141380495,
"grad_norm": 1.486576795578003,
"learning_rate": 1.5676542930975265e-06,
"loss": 0.4653,
"step": 420200
},
{
"epoch": 0.9691079550300714,
"grad_norm": 1.5304393768310547,
"learning_rate": 1.5446022484964304e-06,
"loss": 0.4657,
"step": 420400
},
{
"epoch": 0.9695689959220933,
"grad_norm": 4.940136432647705,
"learning_rate": 1.5215502038953344e-06,
"loss": 0.4656,
"step": 420600
},
{
"epoch": 0.9700300368141153,
"grad_norm": 2.155877113342285,
"learning_rate": 1.4984981592942386e-06,
"loss": 0.4687,
"step": 420800
},
{
"epoch": 0.9704910777061372,
"grad_norm": 0.5753369927406311,
"learning_rate": 1.4754461146931428e-06,
"loss": 0.4809,
"step": 421000
},
{
"epoch": 0.9709521185981591,
"grad_norm": 1.4241207838058472,
"learning_rate": 1.4523940700920468e-06,
"loss": 0.4721,
"step": 421200
},
{
"epoch": 0.971413159490181,
"grad_norm": 0.4579220414161682,
"learning_rate": 1.429342025490951e-06,
"loss": 0.4865,
"step": 421400
},
{
"epoch": 0.9718742003822028,
"grad_norm": 2.118295669555664,
"learning_rate": 1.4062899808898552e-06,
"loss": 0.4461,
"step": 421600
},
{
"epoch": 0.9723352412742248,
"grad_norm": 2.08658766746521,
"learning_rate": 1.3832379362887594e-06,
"loss": 0.4564,
"step": 421800
},
{
"epoch": 0.9727962821662467,
"grad_norm": 1.8553085327148438,
"learning_rate": 1.3601858916876631e-06,
"loss": 0.5111,
"step": 422000
},
{
"epoch": 0.9732573230582686,
"grad_norm": 1.5697154998779297,
"learning_rate": 1.3371338470865673e-06,
"loss": 0.4776,
"step": 422200
},
{
"epoch": 0.9737183639502905,
"grad_norm": 0.5918155312538147,
"learning_rate": 1.3140818024854715e-06,
"loss": 0.4928,
"step": 422400
},
{
"epoch": 0.9741794048423125,
"grad_norm": 0.9090703725814819,
"learning_rate": 1.2910297578843755e-06,
"loss": 0.493,
"step": 422600
},
{
"epoch": 0.9746404457343344,
"grad_norm": 2.200510025024414,
"learning_rate": 1.2679777132832797e-06,
"loss": 0.4584,
"step": 422800
},
{
"epoch": 0.9751014866263563,
"grad_norm": 1.3335816860198975,
"learning_rate": 1.244925668682184e-06,
"loss": 0.4461,
"step": 423000
},
{
"epoch": 0.9755625275183782,
"grad_norm": 1.2546000480651855,
"learning_rate": 1.2218736240810879e-06,
"loss": 0.4431,
"step": 423200
},
{
"epoch": 0.9760235684104002,
"grad_norm": 1.394166350364685,
"learning_rate": 1.198821579479992e-06,
"loss": 0.452,
"step": 423400
},
{
"epoch": 0.9764846093024221,
"grad_norm": 1.7498624324798584,
"learning_rate": 1.1757695348788963e-06,
"loss": 0.5626,
"step": 423600
},
{
"epoch": 0.976945650194444,
"grad_norm": 1.2629833221435547,
"learning_rate": 1.1527174902778002e-06,
"loss": 0.4461,
"step": 423800
},
{
"epoch": 0.9774066910864659,
"grad_norm": 1.0957165956497192,
"learning_rate": 1.1296654456767042e-06,
"loss": 0.483,
"step": 424000
},
{
"epoch": 0.9778677319784879,
"grad_norm": 1.3717105388641357,
"learning_rate": 1.1066134010756084e-06,
"loss": 0.4599,
"step": 424200
},
{
"epoch": 0.9783287728705098,
"grad_norm": 1.0456079244613647,
"learning_rate": 1.0835613564745126e-06,
"loss": 0.4624,
"step": 424400
},
{
"epoch": 0.9787898137625317,
"grad_norm": 0.7499749660491943,
"learning_rate": 1.0605093118734166e-06,
"loss": 0.4567,
"step": 424600
},
{
"epoch": 0.9792508546545536,
"grad_norm": 1.2902302742004395,
"learning_rate": 1.0374572672723208e-06,
"loss": 0.4881,
"step": 424800
},
{
"epoch": 0.9797118955465755,
"grad_norm": 1.2328616380691528,
"learning_rate": 1.014405222671225e-06,
"loss": 0.4676,
"step": 425000
},
{
"epoch": 0.9801729364385974,
"grad_norm": 1.0173146724700928,
"learning_rate": 9.91353178070129e-07,
"loss": 0.4841,
"step": 425200
},
{
"epoch": 0.9806339773306193,
"grad_norm": 0.421296089887619,
"learning_rate": 9.683011334690332e-07,
"loss": 0.4964,
"step": 425400
},
{
"epoch": 0.9810950182226412,
"grad_norm": 0.7365984916687012,
"learning_rate": 9.452490888679371e-07,
"loss": 0.4944,
"step": 425600
},
{
"epoch": 0.9815560591146631,
"grad_norm": 1.2316726446151733,
"learning_rate": 9.221970442668412e-07,
"loss": 0.4656,
"step": 425800
},
{
"epoch": 0.9820171000066851,
"grad_norm": 0.8625339269638062,
"learning_rate": 8.991449996657453e-07,
"loss": 0.4667,
"step": 426000
},
{
"epoch": 0.982478140898707,
"grad_norm": 1.1301565170288086,
"learning_rate": 8.760929550646495e-07,
"loss": 0.4699,
"step": 426200
},
{
"epoch": 0.9829391817907289,
"grad_norm": 0.8868315815925598,
"learning_rate": 8.530409104635536e-07,
"loss": 0.5316,
"step": 426400
},
{
"epoch": 0.9834002226827508,
"grad_norm": 2.410291910171509,
"learning_rate": 8.299888658624577e-07,
"loss": 0.4651,
"step": 426600
},
{
"epoch": 0.9838612635747728,
"grad_norm": 3.955040693283081,
"learning_rate": 8.069368212613619e-07,
"loss": 0.4732,
"step": 426800
},
{
"epoch": 0.9843223044667947,
"grad_norm": 1.6138865947723389,
"learning_rate": 7.83884776660266e-07,
"loss": 0.5057,
"step": 427000
},
{
"epoch": 0.9847833453588166,
"grad_norm": 1.141384482383728,
"learning_rate": 7.6083273205917e-07,
"loss": 0.49,
"step": 427200
},
{
"epoch": 0.9852443862508385,
"grad_norm": 0.964368999004364,
"learning_rate": 7.377806874580741e-07,
"loss": 0.4702,
"step": 427400
},
{
"epoch": 0.9857054271428605,
"grad_norm": 1.7662829160690308,
"learning_rate": 7.147286428569782e-07,
"loss": 0.477,
"step": 427600
},
{
"epoch": 0.9861664680348824,
"grad_norm": 1.14377760887146,
"learning_rate": 6.916765982558824e-07,
"loss": 0.4613,
"step": 427800
},
{
"epoch": 0.9866275089269043,
"grad_norm": 1.1552037000656128,
"learning_rate": 6.686245536547864e-07,
"loss": 0.4659,
"step": 428000
},
{
"epoch": 0.9870885498189262,
"grad_norm": 1.6723991632461548,
"learning_rate": 6.455725090536906e-07,
"loss": 0.4614,
"step": 428200
},
{
"epoch": 0.987549590710948,
"grad_norm": 1.3214787244796753,
"learning_rate": 6.225204644525947e-07,
"loss": 0.4599,
"step": 428400
},
{
"epoch": 0.98801063160297,
"grad_norm": 0.9534615874290466,
"learning_rate": 5.994684198514988e-07,
"loss": 0.5336,
"step": 428600
},
{
"epoch": 0.9884716724949919,
"grad_norm": 1.9757567644119263,
"learning_rate": 5.764163752504028e-07,
"loss": 0.502,
"step": 428800
},
{
"epoch": 0.9889327133870138,
"grad_norm": 1.4372884035110474,
"learning_rate": 5.533643306493069e-07,
"loss": 0.4847,
"step": 429000
},
{
"epoch": 0.9893937542790358,
"grad_norm": 1.3356891870498657,
"learning_rate": 5.30312286048211e-07,
"loss": 0.5075,
"step": 429200
},
{
"epoch": 0.9898547951710577,
"grad_norm": 0.3389435410499573,
"learning_rate": 5.072602414471152e-07,
"loss": 0.4614,
"step": 429400
},
{
"epoch": 0.9903158360630796,
"grad_norm": 4.016057968139648,
"learning_rate": 4.842081968460193e-07,
"loss": 0.4765,
"step": 429600
},
{
"epoch": 0.9907768769551015,
"grad_norm": 1.7579454183578491,
"learning_rate": 4.6115615224492333e-07,
"loss": 0.4588,
"step": 429800
},
{
"epoch": 0.9912379178471235,
"grad_norm": 1.2797824144363403,
"learning_rate": 4.381041076438275e-07,
"loss": 0.511,
"step": 430000
}
],
"logging_steps": 200,
"max_steps": 433801,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 10000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 4.972456968192e+16,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}