{
"best_global_step": 59376,
"best_metric": 0.44603702425956726,
"best_model_checkpoint": "./my_model1/checkpoint-59376",
"epoch": 2.0,
"eval_steps": 500,
"global_step": 59376,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0033683643222850983,
"grad_norm": 6.297862529754639,
"learning_rate": 1.1115477460281817e-06,
"loss": 4.6374,
"step": 100
},
{
"epoch": 0.006736728644570197,
"grad_norm": 2.914098024368286,
"learning_rate": 2.234323247066749e-06,
"loss": 4.3734,
"step": 200
},
{
"epoch": 0.010105092966855295,
"grad_norm": 2.6752781867980957,
"learning_rate": 3.357098748105317e-06,
"loss": 4.1151,
"step": 300
},
{
"epoch": 0.013473457289140393,
"grad_norm": 2.557371139526367,
"learning_rate": 4.479874249143884e-06,
"loss": 3.8823,
"step": 400
},
{
"epoch": 0.016841821611425493,
"grad_norm": 2.3797903060913086,
"learning_rate": 5.602649750182451e-06,
"loss": 3.6958,
"step": 500
},
{
"epoch": 0.02021018593371059,
"grad_norm": 2.318178653717041,
"learning_rate": 6.725425251221018e-06,
"loss": 3.5736,
"step": 600
},
{
"epoch": 0.02357855025599569,
"grad_norm": 2.263061046600342,
"learning_rate": 7.848200752259587e-06,
"loss": 3.4535,
"step": 700
},
{
"epoch": 0.026946914578280787,
"grad_norm": 2.245070219039917,
"learning_rate": 8.970976253298154e-06,
"loss": 3.3254,
"step": 800
},
{
"epoch": 0.030315278900565887,
"grad_norm": 2.1786885261535645,
"learning_rate": 1.009375175433672e-05,
"loss": 3.1968,
"step": 900
},
{
"epoch": 0.03368364322285099,
"grad_norm": 2.1120216846466064,
"learning_rate": 1.1216527255375288e-05,
"loss": 3.0749,
"step": 1000
},
{
"epoch": 0.03705200754513608,
"grad_norm": 2.225299835205078,
"learning_rate": 1.2339302756413855e-05,
"loss": 2.956,
"step": 1100
},
{
"epoch": 0.04042037186742118,
"grad_norm": 1.9265443086624146,
"learning_rate": 1.3462078257452423e-05,
"loss": 2.8324,
"step": 1200
},
{
"epoch": 0.04378873618970628,
"grad_norm": 4.846482276916504,
"learning_rate": 1.458485375849099e-05,
"loss": 2.7146,
"step": 1300
},
{
"epoch": 0.04715710051199138,
"grad_norm": 8.298853874206543,
"learning_rate": 1.5707629259529558e-05,
"loss": 2.6336,
"step": 1400
},
{
"epoch": 0.05052546483427647,
"grad_norm": 12.867733001708984,
"learning_rate": 1.6830404760568124e-05,
"loss": 2.5894,
"step": 1500
},
{
"epoch": 0.05389382915656157,
"grad_norm": 17.92266082763672,
"learning_rate": 1.7953180261606693e-05,
"loss": 2.5615,
"step": 1600
},
{
"epoch": 0.057262193478846674,
"grad_norm": 13.567904472351074,
"learning_rate": 1.907595576264526e-05,
"loss": 2.5376,
"step": 1700
},
{
"epoch": 0.060630557801131774,
"grad_norm": 18.349245071411133,
"learning_rate": 2.0198731263683825e-05,
"loss": 2.5115,
"step": 1800
},
{
"epoch": 0.06399892212341687,
"grad_norm": 18.910877227783203,
"learning_rate": 2.1321506764722397e-05,
"loss": 2.4836,
"step": 1900
},
{
"epoch": 0.06736728644570197,
"grad_norm": 5.438470363616943,
"learning_rate": 2.2444282265760963e-05,
"loss": 2.448,
"step": 2000
},
{
"epoch": 0.07073565076798706,
"grad_norm": 1.8990598917007446,
"learning_rate": 2.356705776679953e-05,
"loss": 2.3836,
"step": 2100
},
{
"epoch": 0.07410401509027216,
"grad_norm": 1.7939313650131226,
"learning_rate": 2.46898332678381e-05,
"loss": 2.2869,
"step": 2200
},
{
"epoch": 0.07747237941255726,
"grad_norm": 2.6316609382629395,
"learning_rate": 2.581260876887666e-05,
"loss": 2.1664,
"step": 2300
},
{
"epoch": 0.08084074373484236,
"grad_norm": 3.9971001148223877,
"learning_rate": 2.693538426991523e-05,
"loss": 2.0635,
"step": 2400
},
{
"epoch": 0.08420910805712746,
"grad_norm": 2.845649242401123,
"learning_rate": 2.8058159770953803e-05,
"loss": 2.0033,
"step": 2500
},
{
"epoch": 0.08757747237941256,
"grad_norm": 11.22779655456543,
"learning_rate": 2.9180935271992365e-05,
"loss": 1.944,
"step": 2600
},
{
"epoch": 0.09094583670169766,
"grad_norm": 8.039031982421875,
"learning_rate": 3.0303710773030935e-05,
"loss": 1.8935,
"step": 2700
},
{
"epoch": 0.09431420102398276,
"grad_norm": 19.868438720703125,
"learning_rate": 3.14264862740695e-05,
"loss": 1.8509,
"step": 2800
},
{
"epoch": 0.09768256534626785,
"grad_norm": 19.26648712158203,
"learning_rate": 3.254926177510807e-05,
"loss": 1.808,
"step": 2900
},
{
"epoch": 0.10105092966855295,
"grad_norm": 10.993364334106445,
"learning_rate": 3.367203727614663e-05,
"loss": 1.7658,
"step": 3000
},
{
"epoch": 0.10441929399083805,
"grad_norm": 12.577337265014648,
"learning_rate": 3.47948127771852e-05,
"loss": 1.7268,
"step": 3100
},
{
"epoch": 0.10778765831312315,
"grad_norm": 15.279227256774902,
"learning_rate": 3.591758827822377e-05,
"loss": 1.6961,
"step": 3200
},
{
"epoch": 0.11115602263540825,
"grad_norm": 15.154927253723145,
"learning_rate": 3.704036377926234e-05,
"loss": 1.6632,
"step": 3300
},
{
"epoch": 0.11452438695769335,
"grad_norm": 5.024831295013428,
"learning_rate": 3.816313928030091e-05,
"loss": 1.627,
"step": 3400
},
{
"epoch": 0.11789275127997845,
"grad_norm": 7.439777851104736,
"learning_rate": 3.928591478133947e-05,
"loss": 1.5909,
"step": 3500
},
{
"epoch": 0.12126111560226355,
"grad_norm": 7.653560638427734,
"learning_rate": 4.040869028237804e-05,
"loss": 1.5621,
"step": 3600
},
{
"epoch": 0.12462947992454863,
"grad_norm": 7.883094310760498,
"learning_rate": 4.1531465783416603e-05,
"loss": 1.5307,
"step": 3700
},
{
"epoch": 0.12799784424683375,
"grad_norm": 3.2945971488952637,
"learning_rate": 4.265424128445518e-05,
"loss": 1.5016,
"step": 3800
},
{
"epoch": 0.13136620856911885,
"grad_norm": 5.135283946990967,
"learning_rate": 4.377701678549374e-05,
"loss": 1.4741,
"step": 3900
},
{
"epoch": 0.13473457289140395,
"grad_norm": 8.129427909851074,
"learning_rate": 4.489979228653231e-05,
"loss": 1.4423,
"step": 4000
},
{
"epoch": 0.13810293721368902,
"grad_norm": 8.010125160217285,
"learning_rate": 4.6022567787570874e-05,
"loss": 1.4146,
"step": 4100
},
{
"epoch": 0.14147130153597412,
"grad_norm": 3.1212265491485596,
"learning_rate": 4.714534328860944e-05,
"loss": 1.3919,
"step": 4200
},
{
"epoch": 0.14483966585825922,
"grad_norm": 3.6468098163604736,
"learning_rate": 4.826811878964801e-05,
"loss": 1.373,
"step": 4300
},
{
"epoch": 0.14820803018054432,
"grad_norm": 4.597881317138672,
"learning_rate": 4.939089429068658e-05,
"loss": 1.352,
"step": 4400
},
{
"epoch": 0.15157639450282942,
"grad_norm": 4.9619622230529785,
"learning_rate": 5.051366979172515e-05,
"loss": 1.3299,
"step": 4500
},
{
"epoch": 0.15494475882511452,
"grad_norm": 4.055070877075195,
"learning_rate": 5.163644529276371e-05,
"loss": 1.312,
"step": 4600
},
{
"epoch": 0.15831312314739962,
"grad_norm": 4.076910018920898,
"learning_rate": 5.2759220793802276e-05,
"loss": 1.2963,
"step": 4700
},
{
"epoch": 0.16168148746968472,
"grad_norm": 2.7936923503875732,
"learning_rate": 5.388199629484085e-05,
"loss": 1.2782,
"step": 4800
},
{
"epoch": 0.16504985179196982,
"grad_norm": 3.8645057678222656,
"learning_rate": 5.5004771795879414e-05,
"loss": 1.2598,
"step": 4900
},
{
"epoch": 0.16841821611425492,
"grad_norm": 3.8098433017730713,
"learning_rate": 5.6127547296917983e-05,
"loss": 1.2426,
"step": 5000
},
{
"epoch": 0.17178658043654002,
"grad_norm": 3.690554618835449,
"learning_rate": 5.7250322797956546e-05,
"loss": 1.2257,
"step": 5100
},
{
"epoch": 0.17515494475882512,
"grad_norm": 3.7821402549743652,
"learning_rate": 5.837309829899512e-05,
"loss": 1.2138,
"step": 5200
},
{
"epoch": 0.17852330908111022,
"grad_norm": 4.070770263671875,
"learning_rate": 5.9495873800033684e-05,
"loss": 1.2005,
"step": 5300
},
{
"epoch": 0.18189167340339532,
"grad_norm": 5.843082904815674,
"learning_rate": 6.061864930107225e-05,
"loss": 1.1795,
"step": 5400
},
{
"epoch": 0.18526003772568042,
"grad_norm": 4.773739337921143,
"learning_rate": 6.174142480211082e-05,
"loss": 1.1665,
"step": 5500
},
{
"epoch": 0.18862840204796552,
"grad_norm": 3.8879311084747314,
"learning_rate": 6.286420030314939e-05,
"loss": 1.1529,
"step": 5600
},
{
"epoch": 0.1919967663702506,
"grad_norm": 4.927277088165283,
"learning_rate": 6.398697580418795e-05,
"loss": 1.1397,
"step": 5700
},
{
"epoch": 0.1953651306925357,
"grad_norm": 3.640209913253784,
"learning_rate": 6.510975130522652e-05,
"loss": 1.1199,
"step": 5800
},
{
"epoch": 0.1987334950148208,
"grad_norm": 5.0505595207214355,
"learning_rate": 6.62325268062651e-05,
"loss": 1.1073,
"step": 5900
},
{
"epoch": 0.2021018593371059,
"grad_norm": 3.703660011291504,
"learning_rate": 6.735530230730366e-05,
"loss": 1.0966,
"step": 6000
},
{
"epoch": 0.205470223659391,
"grad_norm": 3.3192944526672363,
"learning_rate": 6.847807780834223e-05,
"loss": 1.0823,
"step": 6100
},
{
"epoch": 0.2088385879816761,
"grad_norm": 4.713069915771484,
"learning_rate": 6.96008533093808e-05,
"loss": 1.0718,
"step": 6200
},
{
"epoch": 0.2122069523039612,
"grad_norm": 4.135160446166992,
"learning_rate": 7.072362881041936e-05,
"loss": 1.057,
"step": 6300
},
{
"epoch": 0.2155753166262463,
"grad_norm": 4.193116664886475,
"learning_rate": 7.184640431145793e-05,
"loss": 1.0509,
"step": 6400
},
{
"epoch": 0.2189436809485314,
"grad_norm": 4.028440475463867,
"learning_rate": 7.296917981249649e-05,
"loss": 1.0365,
"step": 6500
},
{
"epoch": 0.2223120452708165,
"grad_norm": 4.614249229431152,
"learning_rate": 7.409195531353507e-05,
"loss": 1.0293,
"step": 6600
},
{
"epoch": 0.2256804095931016,
"grad_norm": 4.366164684295654,
"learning_rate": 7.521473081457363e-05,
"loss": 1.0198,
"step": 6700
},
{
"epoch": 0.2290487739153867,
"grad_norm": 5.207546710968018,
"learning_rate": 7.63375063156122e-05,
"loss": 1.0059,
"step": 6800
},
{
"epoch": 0.2324171382376718,
"grad_norm": 3.651235342025757,
"learning_rate": 7.746028181665077e-05,
"loss": 1.0009,
"step": 6900
},
{
"epoch": 0.2357855025599569,
"grad_norm": 4.040618896484375,
"learning_rate": 7.858305731768933e-05,
"loss": 0.9907,
"step": 7000
},
{
"epoch": 0.239153866882242,
"grad_norm": 3.792742967605591,
"learning_rate": 7.97058328187279e-05,
"loss": 0.9847,
"step": 7100
},
{
"epoch": 0.2425222312045271,
"grad_norm": 4.362412929534912,
"learning_rate": 8.082860831976646e-05,
"loss": 0.9738,
"step": 7200
},
{
"epoch": 0.24589059552681217,
"grad_norm": 4.572664737701416,
"learning_rate": 8.195138382080504e-05,
"loss": 0.9678,
"step": 7300
},
{
"epoch": 0.24925895984909727,
"grad_norm": 4.474113464355469,
"learning_rate": 8.30741593218436e-05,
"loss": 0.957,
"step": 7400
},
{
"epoch": 0.2526273241713824,
"grad_norm": 4.847846984863281,
"learning_rate": 8.419693482288217e-05,
"loss": 0.9492,
"step": 7500
},
{
"epoch": 0.2559956884936675,
"grad_norm": 4.326010227203369,
"learning_rate": 8.531971032392074e-05,
"loss": 0.9444,
"step": 7600
},
{
"epoch": 0.2593640528159526,
"grad_norm": 4.634029388427734,
"learning_rate": 8.64424858249593e-05,
"loss": 0.9337,
"step": 7700
},
{
"epoch": 0.2627324171382377,
"grad_norm": 3.841517925262451,
"learning_rate": 8.756526132599788e-05,
"loss": 0.9282,
"step": 7800
},
{
"epoch": 0.2661007814605228,
"grad_norm": 4.89427375793457,
"learning_rate": 8.868803682703643e-05,
"loss": 0.9164,
"step": 7900
},
{
"epoch": 0.2694691457828079,
"grad_norm": 4.296108245849609,
"learning_rate": 8.9810812328075e-05,
"loss": 0.9146,
"step": 8000
},
{
"epoch": 0.27283751010509294,
"grad_norm": 4.8395586013793945,
"learning_rate": 9.093358782911357e-05,
"loss": 0.903,
"step": 8100
},
{
"epoch": 0.27620587442737804,
"grad_norm": 4.250405788421631,
"learning_rate": 9.205636333015214e-05,
"loss": 0.9013,
"step": 8200
},
{
"epoch": 0.27957423874966314,
"grad_norm": 3.9244723320007324,
"learning_rate": 9.317913883119071e-05,
"loss": 0.8968,
"step": 8300
},
{
"epoch": 0.28294260307194824,
"grad_norm": 4.492284774780273,
"learning_rate": 9.430191433222928e-05,
"loss": 0.8924,
"step": 8400
},
{
"epoch": 0.28631096739423334,
"grad_norm": 4.632638454437256,
"learning_rate": 9.542468983326785e-05,
"loss": 0.8822,
"step": 8500
},
{
"epoch": 0.28967933171651844,
"grad_norm": 3.6097586154937744,
"learning_rate": 9.65474653343064e-05,
"loss": 0.8774,
"step": 8600
},
{
"epoch": 0.29304769603880354,
"grad_norm": 3.6722657680511475,
"learning_rate": 9.767024083534497e-05,
"loss": 0.8697,
"step": 8700
},
{
"epoch": 0.29641606036108864,
"grad_norm": 4.693965911865234,
"learning_rate": 9.879301633638355e-05,
"loss": 0.8583,
"step": 8800
},
{
"epoch": 0.29978442468337374,
"grad_norm": 3.5417885780334473,
"learning_rate": 9.991579183742211e-05,
"loss": 0.8498,
"step": 8900
},
{
"epoch": 0.30315278900565884,
"grad_norm": 5.091881275177002,
"learning_rate": 0.00010103856733846069,
"loss": 0.8396,
"step": 9000
},
{
"epoch": 0.30652115332794394,
"grad_norm": 4.218757152557373,
"learning_rate": 0.00010216134283949925,
"loss": 0.8314,
"step": 9100
},
{
"epoch": 0.30988951765022904,
"grad_norm": 3.600708246231079,
"learning_rate": 0.00010328411834053782,
"loss": 0.8249,
"step": 9200
},
{
"epoch": 0.31325788197251414,
"grad_norm": 3.8332407474517822,
"learning_rate": 0.00010440689384157639,
"loss": 0.8187,
"step": 9300
},
{
"epoch": 0.31662624629479924,
"grad_norm": 3.1585068702697754,
"learning_rate": 0.00010552966934261494,
"loss": 0.8087,
"step": 9400
},
{
"epoch": 0.31999461061708434,
"grad_norm": 3.4112815856933594,
"learning_rate": 0.00010665244484365351,
"loss": 0.8015,
"step": 9500
},
{
"epoch": 0.32336297493936944,
"grad_norm": 4.372965335845947,
"learning_rate": 0.00010777522034469207,
"loss": 0.7892,
"step": 9600
},
{
"epoch": 0.32673133926165454,
"grad_norm": 3.7581305503845215,
"learning_rate": 0.00010889799584573066,
"loss": 0.7814,
"step": 9700
},
{
"epoch": 0.33009970358393964,
"grad_norm": 4.480976581573486,
"learning_rate": 0.00011002077134676922,
"loss": 0.7625,
"step": 9800
},
{
"epoch": 0.33346806790622474,
"grad_norm": 3.4865591526031494,
"learning_rate": 0.00011114354684780779,
"loss": 0.7524,
"step": 9900
},
{
"epoch": 0.33683643222850984,
"grad_norm": 3.5094540119171143,
"learning_rate": 0.00011226632234884636,
"loss": 0.7421,
"step": 10000
},
{
"epoch": 0.34020479655079494,
"grad_norm": 3.0365946292877197,
"learning_rate": 0.00011338909784988491,
"loss": 0.7354,
"step": 10100
},
{
"epoch": 0.34357316087308004,
"grad_norm": 3.5247597694396973,
"learning_rate": 0.00011451187335092348,
"loss": 0.7224,
"step": 10200
},
{
"epoch": 0.34694152519536514,
"grad_norm": 3.1095457077026367,
"learning_rate": 0.00011563464885196205,
"loss": 0.7195,
"step": 10300
},
{
"epoch": 0.35030988951765024,
"grad_norm": 3.8091487884521484,
"learning_rate": 0.00011675742435300064,
"loss": 0.713,
"step": 10400
},
{
"epoch": 0.35367825383993534,
"grad_norm": 2.9617044925689697,
"learning_rate": 0.00011788019985403919,
"loss": 0.7067,
"step": 10500
},
{
"epoch": 0.35704661816222044,
"grad_norm": 4.0781331062316895,
"learning_rate": 0.00011900297535507776,
"loss": 0.7022,
"step": 10600
},
{
"epoch": 0.36041498248450554,
"grad_norm": 2.9260106086730957,
"learning_rate": 0.00012012575085611633,
"loss": 0.6967,
"step": 10700
},
{
"epoch": 0.36378334680679064,
"grad_norm": 3.00919508934021,
"learning_rate": 0.00012124852635715489,
"loss": 0.6934,
"step": 10800
},
{
"epoch": 0.36715171112907574,
"grad_norm": 2.74841046333313,
"learning_rate": 0.00012237130185819344,
"loss": 0.6874,
"step": 10900
},
{
"epoch": 0.37052007545136084,
"grad_norm": 2.3908281326293945,
"learning_rate": 0.000123494077359232,
"loss": 0.6843,
"step": 11000
},
{
"epoch": 0.37388843977364594,
"grad_norm": 2.5212063789367676,
"learning_rate": 0.0001246168528602706,
"loss": 0.681,
"step": 11100
},
{
"epoch": 0.37725680409593104,
"grad_norm": 2.342548370361328,
"learning_rate": 0.00012573962836130918,
"loss": 0.6755,
"step": 11200
},
{
"epoch": 0.3806251684182161,
"grad_norm": 2.2817301750183105,
"learning_rate": 0.00012686240386234775,
"loss": 0.6762,
"step": 11300
},
{
"epoch": 0.3839935327405012,
"grad_norm": 2.4880239963531494,
"learning_rate": 0.0001279851793633863,
"loss": 0.6696,
"step": 11400
},
{
"epoch": 0.3873618970627863,
"grad_norm": 2.2513132095336914,
"learning_rate": 0.00012910795486442486,
"loss": 0.6698,
"step": 11500
},
{
"epoch": 0.3907302613850714,
"grad_norm": 2.4084956645965576,
"learning_rate": 0.00013023073036546343,
"loss": 0.6669,
"step": 11600
},
{
"epoch": 0.3940986257073565,
"grad_norm": 2.5854873657226562,
"learning_rate": 0.000131353505866502,
"loss": 0.6629,
"step": 11700
},
{
"epoch": 0.3974669900296416,
"grad_norm": 2.377323627471924,
"learning_rate": 0.00013247628136754056,
"loss": 0.6607,
"step": 11800
},
{
"epoch": 0.4008353543519267,
"grad_norm": 2.0934255123138428,
"learning_rate": 0.00013359905686857913,
"loss": 0.6557,
"step": 11900
},
{
"epoch": 0.4042037186742118,
"grad_norm": 2.2876408100128174,
"learning_rate": 0.0001347218323696177,
"loss": 0.6537,
"step": 12000
},
{
"epoch": 0.4075720829964969,
"grad_norm": 2.856818199157715,
"learning_rate": 0.00013584460787065627,
"loss": 0.6534,
"step": 12100
},
{
"epoch": 0.410940447318782,
"grad_norm": 2.3577589988708496,
"learning_rate": 0.00013696738337169484,
"loss": 0.6468,
"step": 12200
},
{
"epoch": 0.4143088116410671,
"grad_norm": 2.1369576454162598,
"learning_rate": 0.0001380901588727334,
"loss": 0.6466,
"step": 12300
},
{
"epoch": 0.4176771759633522,
"grad_norm": 2.0527994632720947,
"learning_rate": 0.00013921293437377195,
"loss": 0.6423,
"step": 12400
},
{
"epoch": 0.4210455402856373,
"grad_norm": 2.1849894523620605,
"learning_rate": 0.00014033570987481052,
"loss": 0.6408,
"step": 12500
},
{
"epoch": 0.4244139046079224,
"grad_norm": 2.403149127960205,
"learning_rate": 0.00014145848537584912,
"loss": 0.6401,
"step": 12600
},
{
"epoch": 0.4277822689302075,
"grad_norm": 1.983995795249939,
"learning_rate": 0.0001425812608768877,
"loss": 0.6387,
"step": 12700
},
{
"epoch": 0.4311506332524926,
"grad_norm": 2.141962766647339,
"learning_rate": 0.00014370403637792623,
"loss": 0.635,
"step": 12800
},
{
"epoch": 0.4345189975747777,
"grad_norm": 1.9785326719284058,
"learning_rate": 0.0001448268118789648,
"loss": 0.6314,
"step": 12900
},
{
"epoch": 0.4378873618970628,
"grad_norm": 2.0606772899627686,
"learning_rate": 0.00014594958738000337,
"loss": 0.6285,
"step": 13000
},
{
"epoch": 0.4412557262193479,
"grad_norm": 1.88225519657135,
"learning_rate": 0.00014707236288104194,
"loss": 0.6296,
"step": 13100
},
{
"epoch": 0.444624090541633,
"grad_norm": 2.204674005508423,
"learning_rate": 0.0001481951383820805,
"loss": 0.628,
"step": 13200
},
{
"epoch": 0.4479924548639181,
"grad_norm": 1.8650182485580444,
"learning_rate": 0.00014931791388311908,
"loss": 0.6264,
"step": 13300
},
{
"epoch": 0.4513608191862032,
"grad_norm": 1.7972240447998047,
"learning_rate": 0.00015044068938415765,
"loss": 0.6211,
"step": 13400
},
{
"epoch": 0.4547291835084883,
"grad_norm": 1.8085206747055054,
"learning_rate": 0.00015156346488519621,
"loss": 0.6223,
"step": 13500
},
{
"epoch": 0.4580975478307734,
"grad_norm": 1.877871036529541,
"learning_rate": 0.00015268624038623478,
"loss": 0.624,
"step": 13600
},
{
"epoch": 0.4614659121530585,
"grad_norm": 2.295692205429077,
"learning_rate": 0.00015380901588727335,
"loss": 0.6198,
"step": 13700
},
{
"epoch": 0.4648342764753436,
"grad_norm": 2.4655864238739014,
"learning_rate": 0.0001549317913883119,
"loss": 0.6171,
"step": 13800
},
{
"epoch": 0.4682026407976287,
"grad_norm": 1.9931831359863281,
"learning_rate": 0.00015605456688935046,
"loss": 0.6146,
"step": 13900
},
{
"epoch": 0.4715710051199138,
"grad_norm": 1.7389591932296753,
"learning_rate": 0.00015717734239038906,
"loss": 0.6141,
"step": 14000
},
{
"epoch": 0.4749393694421989,
"grad_norm": 2.0048677921295166,
"learning_rate": 0.00015830011789142763,
"loss": 0.613,
"step": 14100
},
{
"epoch": 0.478307733764484,
"grad_norm": 2.0038020610809326,
"learning_rate": 0.0001594228933924662,
"loss": 0.6116,
"step": 14200
},
{
"epoch": 0.4816760980867691,
"grad_norm": 1.8391730785369873,
"learning_rate": 0.00016054566889350474,
"loss": 0.6093,
"step": 14300
},
{
"epoch": 0.4850444624090542,
"grad_norm": 1.769494652748108,
"learning_rate": 0.0001616684443945433,
"loss": 0.6081,
"step": 14400
},
{
"epoch": 0.4884128267313393,
"grad_norm": 1.9740633964538574,
"learning_rate": 0.00016279121989558188,
"loss": 0.6069,
"step": 14500
},
{
"epoch": 0.49178119105362433,
"grad_norm": 2.1322596073150635,
"learning_rate": 0.00016391399539662045,
"loss": 0.6067,
"step": 14600
},
{
"epoch": 0.49514955537590943,
"grad_norm": 1.6382005214691162,
"learning_rate": 0.00016503677089765902,
"loss": 0.604,
"step": 14700
},
{
"epoch": 0.49851791969819453,
"grad_norm": 1.49541175365448,
"learning_rate": 0.0001661595463986976,
"loss": 0.6027,
"step": 14800
},
{
"epoch": 0.5018862840204796,
"grad_norm": 1.5882339477539062,
"learning_rate": 0.00016728232189973616,
"loss": 0.6014,
"step": 14900
},
{
"epoch": 0.5052546483427648,
"grad_norm": 1.491133213043213,
"learning_rate": 0.00016840509740077473,
"loss": 0.5983,
"step": 15000
},
{
"epoch": 0.5086230126650498,
"grad_norm": 1.7467178106307983,
"learning_rate": 0.0001695278729018133,
"loss": 0.5996,
"step": 15100
},
{
"epoch": 0.511991376987335,
"grad_norm": 1.5445200204849243,
"learning_rate": 0.00017065064840285186,
"loss": 0.5937,
"step": 15200
},
{
"epoch": 0.51535974130962,
"grad_norm": 1.613213300704956,
"learning_rate": 0.0001717734239038904,
"loss": 0.5924,
"step": 15300
},
{
"epoch": 0.5187281056319052,
"grad_norm": 1.67715585231781,
"learning_rate": 0.00017289619940492898,
"loss": 0.594,
"step": 15400
},
{
"epoch": 0.5220964699541902,
"grad_norm": 1.7080377340316772,
"learning_rate": 0.00017401897490596757,
"loss": 0.5935,
"step": 15500
},
{
"epoch": 0.5254648342764754,
"grad_norm": 1.7722272872924805,
"learning_rate": 0.00017514175040700614,
"loss": 0.5914,
"step": 15600
},
{
"epoch": 0.5288331985987604,
"grad_norm": 1.7470366954803467,
"learning_rate": 0.00017626452590804468,
"loss": 0.5883,
"step": 15700
},
{
"epoch": 0.5322015629210456,
"grad_norm": 1.974663496017456,
"learning_rate": 0.00017738730140908325,
"loss": 0.5908,
"step": 15800
},
{
"epoch": 0.5355699272433306,
"grad_norm": 1.4482321739196777,
"learning_rate": 0.00017851007691012182,
"loss": 0.5885,
"step": 15900
},
{
"epoch": 0.5389382915656158,
"grad_norm": 1.750618815422058,
"learning_rate": 0.0001796328524111604,
"loss": 0.5855,
"step": 16000
},
{
"epoch": 0.5423066558879008,
"grad_norm": 1.3821526765823364,
"learning_rate": 0.00018075562791219896,
"loss": 0.5884,
"step": 16100
},
{
"epoch": 0.5456750202101859,
"grad_norm": 1.4892586469650269,
"learning_rate": 0.00018187840341323753,
"loss": 0.5838,
"step": 16200
},
{
"epoch": 0.549043384532471,
"grad_norm": 1.5591208934783936,
"learning_rate": 0.0001830011789142761,
"loss": 0.5834,
"step": 16300
},
{
"epoch": 0.5524117488547561,
"grad_norm": 1.326253056526184,
"learning_rate": 0.00018412395441531467,
"loss": 0.5828,
"step": 16400
},
{
"epoch": 0.5557801131770412,
"grad_norm": 1.5288639068603516,
"learning_rate": 0.00018524672991635324,
"loss": 0.5793,
"step": 16500
},
{
"epoch": 0.5591484774993263,
"grad_norm": 1.4673304557800293,
"learning_rate": 0.0001863695054173918,
"loss": 0.5791,
"step": 16600
},
{
"epoch": 0.5625168418216114,
"grad_norm": 1.6291229724884033,
"learning_rate": 0.00018749228091843035,
"loss": 0.5792,
"step": 16700
},
{
"epoch": 0.5658852061438965,
"grad_norm": 1.3908525705337524,
"learning_rate": 0.00018861505641946892,
"loss": 0.5795,
"step": 16800
},
{
"epoch": 0.5692535704661816,
"grad_norm": 1.4598628282546997,
"learning_rate": 0.00018973783192050752,
"loss": 0.576,
"step": 16900
},
{
"epoch": 0.5726219347884667,
"grad_norm": 1.2881489992141724,
"learning_rate": 0.00019086060742154608,
"loss": 0.575,
"step": 17000
},
{
"epoch": 0.5759902991107518,
"grad_norm": 1.2719937562942505,
"learning_rate": 0.00019198338292258465,
"loss": 0.5747,
"step": 17100
},
{
"epoch": 0.5793586634330369,
"grad_norm": 1.2574406862258911,
"learning_rate": 0.0001931061584236232,
"loss": 0.573,
"step": 17200
},
{
"epoch": 0.582727027755322,
"grad_norm": 1.457133173942566,
"learning_rate": 0.00019422893392466177,
"loss": 0.5738,
"step": 17300
},
{
"epoch": 0.5860953920776071,
"grad_norm": 1.2623742818832397,
"learning_rate": 0.00019535170942570033,
"loss": 0.571,
"step": 17400
},
{
"epoch": 0.5894637563998922,
"grad_norm": 1.4135565757751465,
"learning_rate": 0.0001964744849267389,
"loss": 0.5706,
"step": 17500
},
{
"epoch": 0.5928321207221773,
"grad_norm": 1.502484917640686,
"learning_rate": 0.00019759726042777747,
"loss": 0.5713,
"step": 17600
},
{
"epoch": 0.5962004850444624,
"grad_norm": 1.3130122423171997,
"learning_rate": 0.00019872003592881604,
"loss": 0.5683,
"step": 17700
},
{
"epoch": 0.5995688493667475,
"grad_norm": 1.2580504417419434,
"learning_rate": 0.0001998428114298546,
"loss": 0.5696,
"step": 17800
},
{
"epoch": 0.6029372136890326,
"grad_norm": 1.204026460647583,
"learning_rate": 0.00019975859987929996,
"loss": 0.5664,
"step": 17900
},
{
"epoch": 0.6063055780113177,
"grad_norm": 1.3051841259002686,
"learning_rate": 0.00019947790206453243,
"loss": 0.5666,
"step": 18000
},
{
"epoch": 0.6096739423336028,
"grad_norm": 1.1939951181411743,
"learning_rate": 0.00019919720424976494,
"loss": 0.5634,
"step": 18100
},
{
"epoch": 0.6130423066558879,
"grad_norm": 1.25477135181427,
"learning_rate": 0.00019891650643499742,
"loss": 0.5628,
"step": 18200
},
{
"epoch": 0.616410670978173,
"grad_norm": 1.1275781393051147,
"learning_rate": 0.0001986358086202299,
"loss": 0.5624,
"step": 18300
},
{
"epoch": 0.6197790353004581,
"grad_norm": 1.1167781352996826,
"learning_rate": 0.00019835511080546237,
"loss": 0.5617,
"step": 18400
},
{
"epoch": 0.6231473996227432,
"grad_norm": 1.193454623222351,
"learning_rate": 0.00019807441299069488,
"loss": 0.5605,
"step": 18500
},
{
"epoch": 0.6265157639450283,
"grad_norm": 1.1406720876693726,
"learning_rate": 0.00019779371517592739,
"loss": 0.5587,
"step": 18600
},
{
"epoch": 0.6298841282673134,
"grad_norm": 1.2136386632919312,
"learning_rate": 0.00019751301736115986,
"loss": 0.5573,
"step": 18700
},
{
"epoch": 0.6332524925895985,
"grad_norm": 1.216199278831482,
"learning_rate": 0.00019723231954639234,
"loss": 0.5563,
"step": 18800
},
{
"epoch": 0.6366208569118836,
"grad_norm": 1.2443403005599976,
"learning_rate": 0.00019695162173162482,
"loss": 0.5519,
"step": 18900
},
{
"epoch": 0.6399892212341687,
"grad_norm": 1.1415669918060303,
"learning_rate": 0.0001966709239168573,
"loss": 0.5551,
"step": 19000
},
{
"epoch": 0.6433575855564538,
"grad_norm": 1.2228775024414062,
"learning_rate": 0.0001963902261020898,
"loss": 0.5547,
"step": 19100
},
{
"epoch": 0.6467259498787389,
"grad_norm": 1.1878366470336914,
"learning_rate": 0.0001961095282873223,
"loss": 0.5537,
"step": 19200
},
{
"epoch": 0.650094314201024,
"grad_norm": 1.1277652978897095,
"learning_rate": 0.0001958288304725548,
"loss": 0.5521,
"step": 19300
},
{
"epoch": 0.6534626785233091,
"grad_norm": 1.2011772394180298,
"learning_rate": 0.00019554813265778727,
"loss": 0.5519,
"step": 19400
},
{
"epoch": 0.6568310428455941,
"grad_norm": 1.1792044639587402,
"learning_rate": 0.00019526743484301975,
"loss": 0.5493,
"step": 19500
},
{
"epoch": 0.6601994071678793,
"grad_norm": 1.1553574800491333,
"learning_rate": 0.00019498673702825225,
"loss": 0.5464,
"step": 19600
},
{
"epoch": 0.6635677714901643,
"grad_norm": 1.1871212720870972,
"learning_rate": 0.00019470603921348473,
"loss": 0.5489,
"step": 19700
},
{
"epoch": 0.6669361358124495,
"grad_norm": 1.0879842042922974,
"learning_rate": 0.0001944253413987172,
"loss": 0.5476,
"step": 19800
},
{
"epoch": 0.6703045001347345,
"grad_norm": 1.3135937452316284,
"learning_rate": 0.0001941446435839497,
"loss": 0.5482,
"step": 19900
},
{
"epoch": 0.6736728644570197,
"grad_norm": 1.0638514757156372,
"learning_rate": 0.0001938639457691822,
"loss": 0.546,
"step": 20000
},
{
"epoch": 0.6770412287793047,
"grad_norm": 1.139218807220459,
"learning_rate": 0.0001935832479544147,
"loss": 0.5434,
"step": 20100
},
{
"epoch": 0.6804095931015899,
"grad_norm": 1.0563747882843018,
"learning_rate": 0.00019330255013964718,
"loss": 0.5462,
"step": 20200
},
{
"epoch": 0.6837779574238749,
"grad_norm": 1.0997061729431152,
"learning_rate": 0.00019302185232487965,
"loss": 0.5401,
"step": 20300
},
{
"epoch": 0.6871463217461601,
"grad_norm": 1.0555341243743896,
"learning_rate": 0.00019274115451011213,
"loss": 0.5413,
"step": 20400
},
{
"epoch": 0.6905146860684451,
"grad_norm": 1.1296801567077637,
"learning_rate": 0.00019246045669534464,
"loss": 0.5394,
"step": 20500
},
{
"epoch": 0.6938830503907303,
"grad_norm": 1.1637988090515137,
"learning_rate": 0.00019217975888057714,
"loss": 0.5405,
"step": 20600
},
{
"epoch": 0.6972514147130153,
"grad_norm": 1.1942201852798462,
"learning_rate": 0.00019189906106580962,
"loss": 0.5401,
"step": 20700
},
{
"epoch": 0.7006197790353005,
"grad_norm": 1.104561686515808,
"learning_rate": 0.0001916183632510421,
"loss": 0.5385,
"step": 20800
},
{
"epoch": 0.7039881433575855,
"grad_norm": 1.0518121719360352,
"learning_rate": 0.00019133766543627458,
"loss": 0.5394,
"step": 20900
},
{
"epoch": 0.7073565076798707,
"grad_norm": 1.0300666093826294,
"learning_rate": 0.00019105696762150706,
"loss": 0.5361,
"step": 21000
},
{
"epoch": 0.7107248720021557,
"grad_norm": 0.9076865315437317,
"learning_rate": 0.00019077626980673956,
"loss": 0.5384,
"step": 21100
},
{
"epoch": 0.7140932363244409,
"grad_norm": 1.170762062072754,
"learning_rate": 0.00019049557199197204,
"loss": 0.5356,
"step": 21200
},
{
"epoch": 0.7174616006467259,
"grad_norm": 1.102295160293579,
"learning_rate": 0.00019021487417720455,
"loss": 0.5359,
"step": 21300
},
{
"epoch": 0.7208299649690111,
"grad_norm": 1.102849006652832,
"learning_rate": 0.00018993417636243703,
"loss": 0.535,
"step": 21400
},
{
"epoch": 0.7241983292912961,
"grad_norm": 0.9895302653312683,
"learning_rate": 0.0001896534785476695,
"loss": 0.533,
"step": 21500
},
{
"epoch": 0.7275666936135813,
"grad_norm": 1.0017067193984985,
"learning_rate": 0.000189372780732902,
"loss": 0.5328,
"step": 21600
},
{
"epoch": 0.7309350579358663,
"grad_norm": 1.068293809890747,
"learning_rate": 0.0001890920829181345,
"loss": 0.5355,
"step": 21700
},
{
"epoch": 0.7343034222581515,
"grad_norm": 1.092910647392273,
"learning_rate": 0.00018881138510336697,
"loss": 0.5322,
"step": 21800
},
{
"epoch": 0.7376717865804365,
"grad_norm": 1.0329002141952515,
"learning_rate": 0.00018853068728859947,
"loss": 0.5308,
"step": 21900
},
{
"epoch": 0.7410401509027217,
"grad_norm": 1.1431453227996826,
"learning_rate": 0.00018824998947383195,
"loss": 0.5312,
"step": 22000
},
{
"epoch": 0.7444085152250067,
"grad_norm": 0.9961342811584473,
"learning_rate": 0.00018796929165906446,
"loss": 0.5316,
"step": 22100
},
{
"epoch": 0.7477768795472919,
"grad_norm": 0.9267546534538269,
"learning_rate": 0.00018768859384429693,
"loss": 0.5308,
"step": 22200
},
{
"epoch": 0.7511452438695769,
"grad_norm": 1.0788689851760864,
"learning_rate": 0.0001874078960295294,
"loss": 0.5297,
"step": 22300
},
{
"epoch": 0.7545136081918621,
"grad_norm": 1.0680807828903198,
"learning_rate": 0.0001871271982147619,
"loss": 0.5283,
"step": 22400
},
{
"epoch": 0.7578819725141471,
"grad_norm": 1.122947096824646,
"learning_rate": 0.0001868465003999944,
"loss": 0.5268,
"step": 22500
},
{
"epoch": 0.7612503368364322,
"grad_norm": 1.0286208391189575,
"learning_rate": 0.0001865658025852269,
"loss": 0.5264,
"step": 22600
},
{
"epoch": 0.7646187011587173,
"grad_norm": 1.0122915506362915,
"learning_rate": 0.00018628510477045938,
"loss": 0.5261,
"step": 22700
},
{
"epoch": 0.7679870654810024,
"grad_norm": 1.0254476070404053,
"learning_rate": 0.00018600440695569186,
"loss": 0.5253,
"step": 22800
},
{
"epoch": 0.7713554298032875,
"grad_norm": 0.9192175269126892,
"learning_rate": 0.00018572370914092434,
"loss": 0.5235,
"step": 22900
},
{
"epoch": 0.7747237941255726,
"grad_norm": 1.0937845706939697,
"learning_rate": 0.00018544301132615684,
"loss": 0.5243,
"step": 23000
},
{
"epoch": 0.7780921584478577,
"grad_norm": 1.0288293361663818,
"learning_rate": 0.00018516231351138932,
"loss": 0.5221,
"step": 23100
},
{
"epoch": 0.7814605227701428,
"grad_norm": 1.0520168542861938,
"learning_rate": 0.0001848816156966218,
"loss": 0.5237,
"step": 23200
},
{
"epoch": 0.7848288870924279,
"grad_norm": 0.9760498404502869,
"learning_rate": 0.0001846009178818543,
"loss": 0.5245,
"step": 23300
},
{
"epoch": 0.788197251414713,
"grad_norm": 1.0123729705810547,
"learning_rate": 0.00018432022006708678,
"loss": 0.5238,
"step": 23400
},
{
"epoch": 0.7915656157369981,
"grad_norm": 0.9239659905433655,
"learning_rate": 0.0001840395222523193,
"loss": 0.5228,
"step": 23500
},
{
"epoch": 0.7949339800592832,
"grad_norm": 0.964204728603363,
"learning_rate": 0.00018375882443755177,
"loss": 0.5202,
"step": 23600
},
{
"epoch": 0.7983023443815683,
"grad_norm": 1.024375081062317,
"learning_rate": 0.00018347812662278425,
"loss": 0.5214,
"step": 23700
},
{
"epoch": 0.8016707087038534,
"grad_norm": 0.9285891652107239,
"learning_rate": 0.00018319742880801672,
"loss": 0.5216,
"step": 23800
},
{
"epoch": 0.8050390730261385,
"grad_norm": 0.9374035000801086,
"learning_rate": 0.00018291673099324923,
"loss": 0.5199,
"step": 23900
},
{
"epoch": 0.8084074373484236,
"grad_norm": 0.9423925280570984,
"learning_rate": 0.00018263603317848174,
"loss": 0.5182,
"step": 24000
},
{
"epoch": 0.8117758016707087,
"grad_norm": 0.9198417663574219,
"learning_rate": 0.00018235533536371421,
"loss": 0.5195,
"step": 24100
},
{
"epoch": 0.8151441659929938,
"grad_norm": 0.8950690627098083,
"learning_rate": 0.0001820746375489467,
"loss": 0.5174,
"step": 24200
},
{
"epoch": 0.8185125303152789,
"grad_norm": 0.9775617718696594,
"learning_rate": 0.00018179393973417917,
"loss": 0.5163,
"step": 24300
},
{
"epoch": 0.821880894637564,
"grad_norm": 0.961654543876648,
"learning_rate": 0.00018151324191941165,
"loss": 0.5145,
"step": 24400
},
{
"epoch": 0.8252492589598491,
"grad_norm": 0.884971559047699,
"learning_rate": 0.00018123254410464415,
"loss": 0.5159,
"step": 24500
},
{
"epoch": 0.8286176232821342,
"grad_norm": 0.9463781118392944,
"learning_rate": 0.00018095184628987666,
"loss": 0.5147,
"step": 24600
},
{
"epoch": 0.8319859876044193,
"grad_norm": 0.9335620999336243,
"learning_rate": 0.00018067114847510914,
"loss": 0.5148,
"step": 24700
},
{
"epoch": 0.8353543519267044,
"grad_norm": 1.0065468549728394,
"learning_rate": 0.00018039045066034162,
"loss": 0.5145,
"step": 24800
},
{
"epoch": 0.8387227162489895,
"grad_norm": 0.9249733686447144,
"learning_rate": 0.0001801097528455741,
"loss": 0.5144,
"step": 24900
},
{
"epoch": 0.8420910805712746,
"grad_norm": 0.9696065783500671,
"learning_rate": 0.0001798290550308066,
"loss": 0.5146,
"step": 25000
},
{
"epoch": 0.8454594448935597,
"grad_norm": 0.9490009546279907,
"learning_rate": 0.00017954835721603908,
"loss": 0.5128,
"step": 25100
},
{
"epoch": 0.8488278092158448,
"grad_norm": 0.9294765591621399,
"learning_rate": 0.00017926765940127156,
"loss": 0.5128,
"step": 25200
},
{
"epoch": 0.8521961735381299,
"grad_norm": 0.9910796284675598,
"learning_rate": 0.00017898696158650406,
"loss": 0.5118,
"step": 25300
},
{
"epoch": 0.855564537860415,
"grad_norm": 0.9949105381965637,
"learning_rate": 0.00017870626377173654,
"loss": 0.511,
"step": 25400
},
{
"epoch": 0.8589329021827001,
"grad_norm": 0.9345620274543762,
"learning_rate": 0.00017842556595696905,
"loss": 0.5119,
"step": 25500
},
{
"epoch": 0.8623012665049852,
"grad_norm": 0.9553151726722717,
"learning_rate": 0.00017814486814220153,
"loss": 0.5103,
"step": 25600
},
{
"epoch": 0.8656696308272703,
"grad_norm": 0.878685474395752,
"learning_rate": 0.000177864170327434,
"loss": 0.5112,
"step": 25700
},
{
"epoch": 0.8690379951495554,
"grad_norm": 0.9728811979293823,
"learning_rate": 0.00017758347251266648,
"loss": 0.5088,
"step": 25800
},
{
"epoch": 0.8724063594718404,
"grad_norm": 0.9711565375328064,
"learning_rate": 0.000177302774697899,
"loss": 0.5087,
"step": 25900
},
{
"epoch": 0.8757747237941256,
"grad_norm": 0.9093062281608582,
"learning_rate": 0.0001770220768831315,
"loss": 0.5086,
"step": 26000
},
{
"epoch": 0.8791430881164106,
"grad_norm": 0.9751853942871094,
"learning_rate": 0.00017674137906836397,
"loss": 0.5106,
"step": 26100
},
{
"epoch": 0.8825114524386958,
"grad_norm": 0.9044291377067566,
"learning_rate": 0.00017646068125359645,
"loss": 0.5077,
"step": 26200
},
{
"epoch": 0.8858798167609808,
"grad_norm": 0.9224226474761963,
"learning_rate": 0.00017617998343882893,
"loss": 0.5075,
"step": 26300
},
{
"epoch": 0.889248181083266,
"grad_norm": 0.9981474876403809,
"learning_rate": 0.0001758992856240614,
"loss": 0.5048,
"step": 26400
},
{
"epoch": 0.892616545405551,
"grad_norm": 0.8626927733421326,
"learning_rate": 0.0001756185878092939,
"loss": 0.506,
"step": 26500
},
{
"epoch": 0.8959849097278362,
"grad_norm": 0.8800698518753052,
"learning_rate": 0.00017533788999452642,
"loss": 0.5067,
"step": 26600
},
{
"epoch": 0.8993532740501212,
"grad_norm": 0.8937718272209167,
"learning_rate": 0.0001750571921797589,
"loss": 0.5059,
"step": 26700
},
{
"epoch": 0.9027216383724064,
"grad_norm": 0.8680539727210999,
"learning_rate": 0.00017477649436499138,
"loss": 0.5074,
"step": 26800
},
{
"epoch": 0.9060900026946914,
"grad_norm": 0.8701693415641785,
"learning_rate": 0.00017449579655022385,
"loss": 0.5048,
"step": 26900
},
{
"epoch": 0.9094583670169766,
"grad_norm": 0.937451958656311,
"learning_rate": 0.00017421509873545636,
"loss": 0.5036,
"step": 27000
},
{
"epoch": 0.9128267313392616,
"grad_norm": 0.845152735710144,
"learning_rate": 0.00017393440092068884,
"loss": 0.5015,
"step": 27100
},
{
"epoch": 0.9161950956615468,
"grad_norm": 0.8485780358314514,
"learning_rate": 0.00017365370310592132,
"loss": 0.5021,
"step": 27200
},
{
"epoch": 0.9195634599838318,
"grad_norm": 0.8812822699546814,
"learning_rate": 0.00017337300529115382,
"loss": 0.5028,
"step": 27300
},
{
"epoch": 0.922931824306117,
"grad_norm": 0.9817461371421814,
"learning_rate": 0.0001730923074763863,
"loss": 0.5032,
"step": 27400
},
{
"epoch": 0.926300188628402,
"grad_norm": 0.8648643493652344,
"learning_rate": 0.0001728116096616188,
"loss": 0.5023,
"step": 27500
},
{
"epoch": 0.9296685529506872,
"grad_norm": 0.8859161734580994,
"learning_rate": 0.00017253091184685128,
"loss": 0.5022,
"step": 27600
},
{
"epoch": 0.9330369172729722,
"grad_norm": 0.8662147521972656,
"learning_rate": 0.00017225021403208376,
"loss": 0.5,
"step": 27700
},
{
"epoch": 0.9364052815952574,
"grad_norm": 0.9094113111495972,
"learning_rate": 0.00017196951621731624,
"loss": 0.5018,
"step": 27800
},
{
"epoch": 0.9397736459175424,
"grad_norm": 0.924689531326294,
"learning_rate": 0.00017168881840254875,
"loss": 0.5008,
"step": 27900
},
{
"epoch": 0.9431420102398276,
"grad_norm": 0.8770294785499573,
"learning_rate": 0.00017140812058778125,
"loss": 0.5023,
"step": 28000
},
{
"epoch": 0.9465103745621126,
"grad_norm": 0.8615702390670776,
"learning_rate": 0.00017112742277301373,
"loss": 0.4988,
"step": 28100
},
{
"epoch": 0.9498787388843978,
"grad_norm": 0.9163374304771423,
"learning_rate": 0.0001708467249582462,
"loss": 0.5004,
"step": 28200
},
{
"epoch": 0.9532471032066828,
"grad_norm": 0.8876280784606934,
"learning_rate": 0.0001705660271434787,
"loss": 0.5005,
"step": 28300
},
{
"epoch": 0.956615467528968,
"grad_norm": 0.9345399737358093,
"learning_rate": 0.00017028532932871117,
"loss": 0.498,
"step": 28400
},
{
"epoch": 0.959983831851253,
"grad_norm": 0.8554583191871643,
"learning_rate": 0.00017000463151394367,
"loss": 0.4995,
"step": 28500
},
{
"epoch": 0.9633521961735382,
"grad_norm": 0.910744845867157,
"learning_rate": 0.00016972393369917615,
"loss": 0.499,
"step": 28600
},
{
"epoch": 0.9667205604958232,
"grad_norm": 0.9200494289398193,
"learning_rate": 0.00016944323588440866,
"loss": 0.497,
"step": 28700
},
{
"epoch": 0.9700889248181084,
"grad_norm": 0.821864902973175,
"learning_rate": 0.00016916253806964113,
"loss": 0.4976,
"step": 28800
},
{
"epoch": 0.9734572891403934,
"grad_norm": 0.8839085698127747,
"learning_rate": 0.0001688818402548736,
"loss": 0.4981,
"step": 28900
},
{
"epoch": 0.9768256534626786,
"grad_norm": 0.8938930630683899,
"learning_rate": 0.00016860114244010612,
"loss": 0.4982,
"step": 29000
},
{
"epoch": 0.9801940177849636,
"grad_norm": 0.8309621810913086,
"learning_rate": 0.0001683204446253386,
"loss": 0.4971,
"step": 29100
},
{
"epoch": 0.9835623821072487,
"grad_norm": 0.8898798227310181,
"learning_rate": 0.00016803974681057107,
"loss": 0.4981,
"step": 29200
},
{
"epoch": 0.9869307464295338,
"grad_norm": 0.9762869477272034,
"learning_rate": 0.00016775904899580358,
"loss": 0.4968,
"step": 29300
},
{
"epoch": 0.9902991107518189,
"grad_norm": 0.8826524615287781,
"learning_rate": 0.00016747835118103606,
"loss": 0.4983,
"step": 29400
},
{
"epoch": 0.993667475074104,
"grad_norm": 0.8983336687088013,
"learning_rate": 0.00016719765336626856,
"loss": 0.4964,
"step": 29500
},
{
"epoch": 0.9970358393963891,
"grad_norm": 0.8700274229049683,
"learning_rate": 0.00016691695555150104,
"loss": 0.496,
"step": 29600
},
{
"epoch": 1.0,
"eval_loss": 0.49384912848472595,
"eval_runtime": 9.0835,
"eval_samples_per_second": 550.447,
"eval_steps_per_second": 8.697,
"step": 29688
},
{
"epoch": 1.0004042037186742,
"grad_norm": 0.9031352996826172,
"learning_rate": 0.00016663625773673352,
"loss": 0.4948,
"step": 29700
},
{
"epoch": 1.0037725680409593,
"grad_norm": 0.8552715182304382,
"learning_rate": 0.000166355559921966,
"loss": 0.4954,
"step": 29800
},
{
"epoch": 1.0071409323632443,
"grad_norm": 0.8794796466827393,
"learning_rate": 0.0001660748621071985,
"loss": 0.4944,
"step": 29900
},
{
"epoch": 1.0105092966855296,
"grad_norm": 0.876146137714386,
"learning_rate": 0.000165794164292431,
"loss": 0.4954,
"step": 30000
},
{
"epoch": 1.0138776610078146,
"grad_norm": 0.8548246026039124,
"learning_rate": 0.0001655134664776635,
"loss": 0.4957,
"step": 30100
},
{
"epoch": 1.0172460253300997,
"grad_norm": 0.8883000016212463,
"learning_rate": 0.00016523276866289597,
"loss": 0.4939,
"step": 30200
},
{
"epoch": 1.0206143896523847,
"grad_norm": 0.8102014064788818,
"learning_rate": 0.00016495207084812845,
"loss": 0.491,
"step": 30300
},
{
"epoch": 1.02398275397467,
"grad_norm": 0.9280298948287964,
"learning_rate": 0.00016467137303336095,
"loss": 0.4939,
"step": 30400
},
{
"epoch": 1.027351118296955,
"grad_norm": 0.9322350025177002,
"learning_rate": 0.00016439067521859343,
"loss": 0.4923,
"step": 30500
},
{
"epoch": 1.03071948261924,
"grad_norm": 0.8731549978256226,
"learning_rate": 0.0001641099774038259,
"loss": 0.4929,
"step": 30600
},
{
"epoch": 1.0340878469415251,
"grad_norm": 0.8500041365623474,
"learning_rate": 0.00016382927958905841,
"loss": 0.492,
"step": 30700
},
{
"epoch": 1.0374562112638104,
"grad_norm": 0.8375087976455688,
"learning_rate": 0.0001635485817742909,
"loss": 0.4917,
"step": 30800
},
{
"epoch": 1.0408245755860954,
"grad_norm": 0.8288936018943787,
"learning_rate": 0.0001632678839595234,
"loss": 0.4928,
"step": 30900
},
{
"epoch": 1.0441929399083805,
"grad_norm": 0.8341562151908875,
"learning_rate": 0.00016298718614475588,
"loss": 0.4889,
"step": 31000
},
{
"epoch": 1.0475613042306655,
"grad_norm": 0.8432872891426086,
"learning_rate": 0.00016270648832998835,
"loss": 0.4915,
"step": 31100
},
{
"epoch": 1.0509296685529508,
"grad_norm": 0.8462439775466919,
"learning_rate": 0.00016242579051522083,
"loss": 0.4883,
"step": 31200
},
{
"epoch": 1.0542980328752358,
"grad_norm": 0.8429282903671265,
"learning_rate": 0.00016214509270045334,
"loss": 0.4895,
"step": 31300
},
{
"epoch": 1.0576663971975209,
"grad_norm": 0.8985344767570496,
"learning_rate": 0.00016186439488568584,
"loss": 0.4906,
"step": 31400
},
{
"epoch": 1.061034761519806,
"grad_norm": 0.9159397482872009,
"learning_rate": 0.00016158369707091832,
"loss": 0.4891,
"step": 31500
},
{
"epoch": 1.0644031258420912,
"grad_norm": 0.8448222279548645,
"learning_rate": 0.0001613029992561508,
"loss": 0.4891,
"step": 31600
},
{
"epoch": 1.0677714901643762,
"grad_norm": 0.8303894400596619,
"learning_rate": 0.00016102230144138328,
"loss": 0.4902,
"step": 31700
},
{
"epoch": 1.0711398544866613,
"grad_norm": 0.8498880863189697,
"learning_rate": 0.00016074160362661576,
"loss": 0.4871,
"step": 31800
},
{
"epoch": 1.0745082188089463,
"grad_norm": 0.7907134294509888,
"learning_rate": 0.00016046090581184826,
"loss": 0.4885,
"step": 31900
},
{
"epoch": 1.0778765831312316,
"grad_norm": 0.9202895164489746,
"learning_rate": 0.00016018020799708077,
"loss": 0.4888,
"step": 32000
},
{
"epoch": 1.0812449474535166,
"grad_norm": 0.8670128583908081,
"learning_rate": 0.00015989951018231325,
"loss": 0.4859,
"step": 32100
},
{
"epoch": 1.0846133117758017,
"grad_norm": 0.8007021546363831,
"learning_rate": 0.00015961881236754573,
"loss": 0.4885,
"step": 32200
},
{
"epoch": 1.0879816760980867,
"grad_norm": 0.9113264083862305,
"learning_rate": 0.0001593381145527782,
"loss": 0.4876,
"step": 32300
},
{
"epoch": 1.0913500404203718,
"grad_norm": 0.8807794451713562,
"learning_rate": 0.0001590574167380107,
"loss": 0.489,
"step": 32400
},
{
"epoch": 1.094718404742657,
"grad_norm": 0.8606187105178833,
"learning_rate": 0.0001587767189232432,
"loss": 0.489,
"step": 32500
},
{
"epoch": 1.098086769064942,
"grad_norm": 0.8390567898750305,
"learning_rate": 0.00015849602110847567,
"loss": 0.4874,
"step": 32600
},
{
"epoch": 1.1014551333872271,
"grad_norm": 0.8143624663352966,
"learning_rate": 0.00015821532329370817,
"loss": 0.4883,
"step": 32700
},
{
"epoch": 1.1048234977095122,
"grad_norm": 0.9023911356925964,
"learning_rate": 0.00015793462547894065,
"loss": 0.4884,
"step": 32800
},
{
"epoch": 1.1081918620317974,
"grad_norm": 0.9291363954544067,
"learning_rate": 0.00015765392766417316,
"loss": 0.4869,
"step": 32900
},
{
"epoch": 1.1115602263540825,
"grad_norm": 0.834904134273529,
"learning_rate": 0.00015737322984940563,
"loss": 0.4863,
"step": 33000
},
{
"epoch": 1.1149285906763675,
"grad_norm": 0.8896390795707703,
"learning_rate": 0.0001570925320346381,
"loss": 0.485,
"step": 33100
},
{
"epoch": 1.1182969549986526,
"grad_norm": 0.8215962648391724,
"learning_rate": 0.0001568118342198706,
"loss": 0.4867,
"step": 33200
},
{
"epoch": 1.1216653193209378,
"grad_norm": 0.8174338936805725,
"learning_rate": 0.0001565311364051031,
"loss": 0.4865,
"step": 33300
},
{
"epoch": 1.1250336836432229,
"grad_norm": 0.8599314093589783,
"learning_rate": 0.0001562504385903356,
"loss": 0.4832,
"step": 33400
},
{
"epoch": 1.128402047965508,
"grad_norm": 0.7674278020858765,
"learning_rate": 0.00015596974077556808,
"loss": 0.4846,
"step": 33500
},
{
"epoch": 1.131770412287793,
"grad_norm": 0.8474441170692444,
"learning_rate": 0.00015568904296080056,
"loss": 0.4848,
"step": 33600
},
{
"epoch": 1.1351387766100782,
"grad_norm": 0.8045397996902466,
"learning_rate": 0.00015540834514603304,
"loss": 0.483,
"step": 33700
},
{
"epoch": 1.1385071409323633,
"grad_norm": 0.8756964802742004,
"learning_rate": 0.00015512764733126552,
"loss": 0.4836,
"step": 33800
},
{
"epoch": 1.1418755052546483,
"grad_norm": 0.8357768654823303,
"learning_rate": 0.00015484694951649802,
"loss": 0.4852,
"step": 33900
},
{
"epoch": 1.1452438695769334,
"grad_norm": 0.9370204210281372,
"learning_rate": 0.0001545662517017305,
"loss": 0.4839,
"step": 34000
},
{
"epoch": 1.1486122338992186,
"grad_norm": 0.8853762149810791,
"learning_rate": 0.000154285553886963,
"loss": 0.4844,
"step": 34100
},
{
"epoch": 1.1519805982215037,
"grad_norm": 0.7827624678611755,
"learning_rate": 0.00015400485607219548,
"loss": 0.4832,
"step": 34200
},
{
"epoch": 1.1553489625437887,
"grad_norm": 0.8665288090705872,
"learning_rate": 0.00015372415825742796,
"loss": 0.4838,
"step": 34300
},
{
"epoch": 1.1587173268660738,
"grad_norm": 0.8360339403152466,
"learning_rate": 0.00015344346044266047,
"loss": 0.4821,
"step": 34400
},
{
"epoch": 1.162085691188359,
"grad_norm": 0.8605954051017761,
"learning_rate": 0.00015316276262789295,
"loss": 0.4825,
"step": 34500
},
{
"epoch": 1.165454055510644,
"grad_norm": 0.857475221157074,
"learning_rate": 0.00015288206481312542,
"loss": 0.4827,
"step": 34600
},
{
"epoch": 1.1688224198329291,
"grad_norm": 0.8108141422271729,
"learning_rate": 0.00015260136699835793,
"loss": 0.4803,
"step": 34700
},
{
"epoch": 1.1721907841552142,
"grad_norm": 0.8359714150428772,
"learning_rate": 0.0001523206691835904,
"loss": 0.4809,
"step": 34800
},
{
"epoch": 1.1755591484774994,
"grad_norm": 0.8128540515899658,
"learning_rate": 0.00015203997136882291,
"loss": 0.4823,
"step": 34900
},
{
"epoch": 1.1789275127997845,
"grad_norm": 0.8871669769287109,
"learning_rate": 0.0001517592735540554,
"loss": 0.4806,
"step": 35000
},
{
"epoch": 1.1822958771220695,
"grad_norm": 0.8477233052253723,
"learning_rate": 0.00015147857573928787,
"loss": 0.481,
"step": 35100
},
{
"epoch": 1.1856642414443546,
"grad_norm": 0.7827205061912537,
"learning_rate": 0.00015119787792452035,
"loss": 0.4792,
"step": 35200
},
{
"epoch": 1.1890326057666396,
"grad_norm": 0.8286157250404358,
"learning_rate": 0.00015091718010975286,
"loss": 0.4782,
"step": 35300
},
{
"epoch": 1.1924009700889249,
"grad_norm": 0.76893150806427,
"learning_rate": 0.00015063648229498536,
"loss": 0.4805,
"step": 35400
},
{
"epoch": 1.19576933441121,
"grad_norm": 0.8076749444007874,
"learning_rate": 0.00015035578448021784,
"loss": 0.4813,
"step": 35500
},
{
"epoch": 1.199137698733495,
"grad_norm": 0.8551127910614014,
"learning_rate": 0.00015007508666545032,
"loss": 0.4797,
"step": 35600
},
{
"epoch": 1.2025060630557802,
"grad_norm": 0.9260111451148987,
"learning_rate": 0.0001497943888506828,
"loss": 0.4801,
"step": 35700
},
{
"epoch": 1.2058744273780653,
"grad_norm": 0.9091964960098267,
"learning_rate": 0.00014951369103591527,
"loss": 0.4782,
"step": 35800
},
{
"epoch": 1.2092427917003503,
"grad_norm": 0.8588406443595886,
"learning_rate": 0.00014923299322114778,
"loss": 0.4806,
"step": 35900
},
{
"epoch": 1.2126111560226354,
"grad_norm": 0.8295513391494751,
"learning_rate": 0.00014895229540638026,
"loss": 0.479,
"step": 36000
},
{
"epoch": 1.2159795203449204,
"grad_norm": 0.8360409736633301,
"learning_rate": 0.00014867159759161276,
"loss": 0.4793,
"step": 36100
},
{
"epoch": 1.2193478846672057,
"grad_norm": 0.8704560995101929,
"learning_rate": 0.00014839089977684524,
"loss": 0.4788,
"step": 36200
},
{
"epoch": 1.2227162489894907,
"grad_norm": 0.8278842568397522,
"learning_rate": 0.00014811020196207772,
"loss": 0.4796,
"step": 36300
},
{
"epoch": 1.2260846133117758,
"grad_norm": 0.8524438142776489,
"learning_rate": 0.00014782950414731023,
"loss": 0.4784,
"step": 36400
},
{
"epoch": 1.2294529776340608,
"grad_norm": 0.7825035452842712,
"learning_rate": 0.0001475488063325427,
"loss": 0.4783,
"step": 36500
},
{
"epoch": 1.232821341956346,
"grad_norm": 0.8001949787139893,
"learning_rate": 0.00014726810851777518,
"loss": 0.4789,
"step": 36600
},
{
"epoch": 1.2361897062786311,
"grad_norm": 0.7923149466514587,
"learning_rate": 0.0001469874107030077,
"loss": 0.4788,
"step": 36700
},
{
"epoch": 1.2395580706009162,
"grad_norm": 0.8405751585960388,
"learning_rate": 0.0001467067128882402,
"loss": 0.4773,
"step": 36800
},
{
"epoch": 1.2429264349232012,
"grad_norm": 0.8324115872383118,
"learning_rate": 0.00014642601507347267,
"loss": 0.4782,
"step": 36900
},
{
"epoch": 1.2462947992454865,
"grad_norm": 0.8548023700714111,
"learning_rate": 0.00014614531725870515,
"loss": 0.4798,
"step": 37000
},
{
"epoch": 1.2496631635677715,
"grad_norm": 0.8439319729804993,
"learning_rate": 0.00014586461944393763,
"loss": 0.4757,
"step": 37100
},
{
"epoch": 1.2530315278900566,
"grad_norm": 0.7825635075569153,
"learning_rate": 0.0001455839216291701,
"loss": 0.4783,
"step": 37200
},
{
"epoch": 1.2563998922123416,
"grad_norm": 0.8164156675338745,
"learning_rate": 0.0001453032238144026,
"loss": 0.4779,
"step": 37300
},
{
"epoch": 1.2597682565346267,
"grad_norm": 0.8076338768005371,
"learning_rate": 0.00014502252599963512,
"loss": 0.4773,
"step": 37400
},
{
"epoch": 1.263136620856912,
"grad_norm": 0.8112064003944397,
"learning_rate": 0.0001447418281848676,
"loss": 0.4754,
"step": 37500
},
{
"epoch": 1.266504985179197,
"grad_norm": 0.7940359711647034,
"learning_rate": 0.00014446113037010008,
"loss": 0.4745,
"step": 37600
},
{
"epoch": 1.269873349501482,
"grad_norm": 0.8495946526527405,
"learning_rate": 0.00014418043255533255,
"loss": 0.4758,
"step": 37700
},
{
"epoch": 1.2732417138237673,
"grad_norm": 0.8374922275543213,
"learning_rate": 0.00014389973474056506,
"loss": 0.4771,
"step": 37800
},
{
"epoch": 1.2766100781460523,
"grad_norm": 0.8647417426109314,
"learning_rate": 0.00014361903692579754,
"loss": 0.4771,
"step": 37900
},
{
"epoch": 1.2799784424683374,
"grad_norm": 0.8156632781028748,
"learning_rate": 0.00014333833911103002,
"loss": 0.4747,
"step": 38000
},
{
"epoch": 1.2833468067906224,
"grad_norm": 0.7802369594573975,
"learning_rate": 0.00014305764129626252,
"loss": 0.4741,
"step": 38100
},
{
"epoch": 1.2867151711129075,
"grad_norm": 0.7542524337768555,
"learning_rate": 0.000142776943481495,
"loss": 0.4761,
"step": 38200
},
{
"epoch": 1.2900835354351927,
"grad_norm": 0.8326511383056641,
"learning_rate": 0.0001424962456667275,
"loss": 0.4734,
"step": 38300
},
{
"epoch": 1.2934518997574778,
"grad_norm": 0.7556424736976624,
"learning_rate": 0.00014221554785195998,
"loss": 0.4757,
"step": 38400
},
{
"epoch": 1.2968202640797628,
"grad_norm": 0.8151201605796814,
"learning_rate": 0.00014193485003719246,
"loss": 0.4743,
"step": 38500
},
{
"epoch": 1.300188628402048,
"grad_norm": 0.8914119601249695,
"learning_rate": 0.00014165415222242494,
"loss": 0.4769,
"step": 38600
},
{
"epoch": 1.3035569927243331,
"grad_norm": 0.8541133999824524,
"learning_rate": 0.00014137345440765745,
"loss": 0.4744,
"step": 38700
},
{
"epoch": 1.3069253570466182,
"grad_norm": 0.8853744864463806,
"learning_rate": 0.00014109275659288995,
"loss": 0.474,
"step": 38800
},
{
"epoch": 1.3102937213689032,
"grad_norm": 0.8547524809837341,
"learning_rate": 0.00014081205877812243,
"loss": 0.474,
"step": 38900
},
{
"epoch": 1.3136620856911883,
"grad_norm": 0.7881298661231995,
"learning_rate": 0.0001405313609633549,
"loss": 0.4727,
"step": 39000
},
{
"epoch": 1.3170304500134735,
"grad_norm": 0.7588589191436768,
"learning_rate": 0.0001402506631485874,
"loss": 0.473,
"step": 39100
},
{
"epoch": 1.3203988143357586,
"grad_norm": 0.7980801463127136,
"learning_rate": 0.00013996996533381987,
"loss": 0.4727,
"step": 39200
},
{
"epoch": 1.3237671786580436,
"grad_norm": 0.8034206628799438,
"learning_rate": 0.00013968926751905237,
"loss": 0.4737,
"step": 39300
},
{
"epoch": 1.3271355429803289,
"grad_norm": 0.7804720401763916,
"learning_rate": 0.00013940856970428485,
"loss": 0.4754,
"step": 39400
},
{
"epoch": 1.330503907302614,
"grad_norm": 0.8541818261146545,
"learning_rate": 0.00013912787188951736,
"loss": 0.4733,
"step": 39500
},
{
"epoch": 1.333872271624899,
"grad_norm": 0.8339990377426147,
"learning_rate": 0.00013884717407474983,
"loss": 0.4721,
"step": 39600
},
{
"epoch": 1.337240635947184,
"grad_norm": 0.8007979393005371,
"learning_rate": 0.0001385664762599823,
"loss": 0.4745,
"step": 39700
},
{
"epoch": 1.340609000269469,
"grad_norm": 0.848199188709259,
"learning_rate": 0.00013828577844521482,
"loss": 0.4725,
"step": 39800
},
{
"epoch": 1.3439773645917543,
"grad_norm": 0.9129810333251953,
"learning_rate": 0.0001380050806304473,
"loss": 0.4716,
"step": 39900
},
{
"epoch": 1.3473457289140394,
"grad_norm": 0.869888186454773,
"learning_rate": 0.00013772438281567978,
"loss": 0.4744,
"step": 40000
},
{
"epoch": 1.3507140932363244,
"grad_norm": 0.8916295170783997,
"learning_rate": 0.00013744368500091228,
"loss": 0.4712,
"step": 40100
},
{
"epoch": 1.3540824575586097,
"grad_norm": 0.8144074082374573,
"learning_rate": 0.00013716298718614476,
"loss": 0.4734,
"step": 40200
},
{
"epoch": 1.3574508218808947,
"grad_norm": 0.7844826579093933,
"learning_rate": 0.00013688228937137726,
"loss": 0.473,
"step": 40300
},
{
"epoch": 1.3608191862031798,
"grad_norm": 0.8559306859970093,
"learning_rate": 0.00013660159155660974,
"loss": 0.4708,
"step": 40400
},
{
"epoch": 1.3641875505254648,
"grad_norm": 0.7995209693908691,
"learning_rate": 0.00013632089374184222,
"loss": 0.472,
"step": 40500
},
{
"epoch": 1.3675559148477499,
"grad_norm": 0.845758855342865,
"learning_rate": 0.0001360401959270747,
"loss": 0.4714,
"step": 40600
},
{
"epoch": 1.3709242791700351,
"grad_norm": 0.8122411370277405,
"learning_rate": 0.0001357594981123072,
"loss": 0.4715,
"step": 40700
},
{
"epoch": 1.3742926434923202,
"grad_norm": 0.7860530614852905,
"learning_rate": 0.0001354788002975397,
"loss": 0.4718,
"step": 40800
},
{
"epoch": 1.3776610078146052,
"grad_norm": 0.7795781493186951,
"learning_rate": 0.0001351981024827722,
"loss": 0.4696,
"step": 40900
},
{
"epoch": 1.3810293721368903,
"grad_norm": 0.7595000267028809,
"learning_rate": 0.00013491740466800467,
"loss": 0.4703,
"step": 41000
},
{
"epoch": 1.3843977364591753,
"grad_norm": 0.8687454462051392,
"learning_rate": 0.00013463670685323715,
"loss": 0.4698,
"step": 41100
},
{
"epoch": 1.3877661007814606,
"grad_norm": 0.8719391226768494,
"learning_rate": 0.00013435600903846962,
"loss": 0.4689,
"step": 41200
},
{
"epoch": 1.3911344651037456,
"grad_norm": 0.8451808094978333,
"learning_rate": 0.00013407531122370213,
"loss": 0.4681,
"step": 41300
},
{
"epoch": 1.3945028294260307,
"grad_norm": 0.8027797341346741,
"learning_rate": 0.0001337946134089346,
"loss": 0.4717,
"step": 41400
},
{
"epoch": 1.397871193748316,
"grad_norm": 0.7488086819648743,
"learning_rate": 0.00013351391559416711,
"loss": 0.4694,
"step": 41500
},
{
"epoch": 1.401239558070601,
"grad_norm": 0.8326307535171509,
"learning_rate": 0.0001332332177793996,
"loss": 0.4693,
"step": 41600
},
{
"epoch": 1.404607922392886,
"grad_norm": 0.8087652325630188,
"learning_rate": 0.00013295251996463207,
"loss": 0.4684,
"step": 41700
},
{
"epoch": 1.407976286715171,
"grad_norm": 0.7918603420257568,
"learning_rate": 0.00013267182214986458,
"loss": 0.47,
"step": 41800
},
{
"epoch": 1.411344651037456,
"grad_norm": 0.8231304883956909,
"learning_rate": 0.00013239112433509705,
"loss": 0.4694,
"step": 41900
},
{
"epoch": 1.4147130153597414,
"grad_norm": 0.7812530994415283,
"learning_rate": 0.00013211042652032953,
"loss": 0.4695,
"step": 42000
},
{
"epoch": 1.4180813796820264,
"grad_norm": 0.854972779750824,
"learning_rate": 0.00013182972870556204,
"loss": 0.47,
"step": 42100
},
{
"epoch": 1.4214497440043115,
"grad_norm": 0.8728025555610657,
"learning_rate": 0.00013154903089079452,
"loss": 0.468,
"step": 42200
},
{
"epoch": 1.4248181083265967,
"grad_norm": 0.8394129276275635,
"learning_rate": 0.00013126833307602702,
"loss": 0.4698,
"step": 42300
},
{
"epoch": 1.4281864726488818,
"grad_norm": 0.7810468673706055,
"learning_rate": 0.0001309876352612595,
"loss": 0.4694,
"step": 42400
},
{
"epoch": 1.4315548369711668,
"grad_norm": 0.8251649737358093,
"learning_rate": 0.00013070693744649198,
"loss": 0.4651,
"step": 42500
},
{
"epoch": 1.4349232012934519,
"grad_norm": 0.8438547253608704,
"learning_rate": 0.00013042623963172446,
"loss": 0.4685,
"step": 42600
},
{
"epoch": 1.438291565615737,
"grad_norm": 0.7687946557998657,
"learning_rate": 0.00013014554181695696,
"loss": 0.4657,
"step": 42700
},
{
"epoch": 1.4416599299380222,
"grad_norm": 0.7573995590209961,
"learning_rate": 0.00012986484400218947,
"loss": 0.4667,
"step": 42800
},
{
"epoch": 1.4450282942603072,
"grad_norm": 0.8200283646583557,
"learning_rate": 0.00012958414618742195,
"loss": 0.4666,
"step": 42900
},
{
"epoch": 1.4483966585825923,
"grad_norm": 0.8411341905593872,
"learning_rate": 0.00012930344837265443,
"loss": 0.4679,
"step": 43000
},
{
"epoch": 1.4517650229048775,
"grad_norm": 0.8489885330200195,
"learning_rate": 0.0001290227505578869,
"loss": 0.4679,
"step": 43100
},
{
"epoch": 1.4551333872271626,
"grad_norm": 0.8161250352859497,
"learning_rate": 0.00012874205274311938,
"loss": 0.4688,
"step": 43200
},
{
"epoch": 1.4585017515494476,
"grad_norm": 0.7844269871711731,
"learning_rate": 0.0001284613549283519,
"loss": 0.4666,
"step": 43300
},
{
"epoch": 1.4618701158717327,
"grad_norm": 0.7773265838623047,
"learning_rate": 0.00012818065711358437,
"loss": 0.4687,
"step": 43400
},
{
"epoch": 1.4652384801940177,
"grad_norm": 0.8081590533256531,
"learning_rate": 0.00012789995929881687,
"loss": 0.4643,
"step": 43500
},
{
"epoch": 1.468606844516303,
"grad_norm": 0.7888718843460083,
"learning_rate": 0.00012761926148404935,
"loss": 0.4682,
"step": 43600
},
{
"epoch": 1.471975208838588,
"grad_norm": 0.7907763719558716,
"learning_rate": 0.00012733856366928186,
"loss": 0.4653,
"step": 43700
},
{
"epoch": 1.475343573160873,
"grad_norm": 0.7945205569267273,
"learning_rate": 0.00012705786585451433,
"loss": 0.4648,
"step": 43800
},
{
"epoch": 1.478711937483158,
"grad_norm": 0.8834030032157898,
"learning_rate": 0.0001267771680397468,
"loss": 0.4664,
"step": 43900
},
{
"epoch": 1.4820803018054431,
"grad_norm": 0.7815008759498596,
"learning_rate": 0.0001264964702249793,
"loss": 0.4662,
"step": 44000
},
{
"epoch": 1.4854486661277284,
"grad_norm": 0.8282730579376221,
"learning_rate": 0.0001262157724102118,
"loss": 0.4652,
"step": 44100
},
{
"epoch": 1.4888170304500135,
"grad_norm": 0.7864588499069214,
"learning_rate": 0.0001259350745954443,
"loss": 0.4651,
"step": 44200
},
{
"epoch": 1.4921853947722985,
"grad_norm": 0.7972845435142517,
"learning_rate": 0.00012565437678067678,
"loss": 0.4656,
"step": 44300
},
{
"epoch": 1.4955537590945838,
"grad_norm": 0.8192013502120972,
"learning_rate": 0.00012537367896590926,
"loss": 0.4665,
"step": 44400
},
{
"epoch": 1.4989221234168688,
"grad_norm": 0.8526120185852051,
"learning_rate": 0.00012509298115114174,
"loss": 0.4654,
"step": 44500
},
{
"epoch": 1.5022904877391539,
"grad_norm": 0.8241577744483948,
"learning_rate": 0.00012481228333637422,
"loss": 0.4648,
"step": 44600
},
{
"epoch": 1.5056588520614391,
"grad_norm": 0.8311729431152344,
"learning_rate": 0.00012453158552160672,
"loss": 0.4662,
"step": 44700
},
{
"epoch": 1.509027216383724,
"grad_norm": 0.7880195379257202,
"learning_rate": 0.00012425088770683923,
"loss": 0.4637,
"step": 44800
},
{
"epoch": 1.5123955807060092,
"grad_norm": 0.7668688893318176,
"learning_rate": 0.0001239701898920717,
"loss": 0.465,
"step": 44900
},
{
"epoch": 1.5157639450282943,
"grad_norm": 0.8149063587188721,
"learning_rate": 0.00012368949207730418,
"loss": 0.4634,
"step": 45000
},
{
"epoch": 1.5191323093505793,
"grad_norm": 0.7656127214431763,
"learning_rate": 0.00012340879426253666,
"loss": 0.4635,
"step": 45100
},
{
"epoch": 1.5225006736728646,
"grad_norm": 0.8114592432975769,
"learning_rate": 0.00012312809644776917,
"loss": 0.4635,
"step": 45200
},
{
"epoch": 1.5258690379951494,
"grad_norm": 0.8734049797058105,
"learning_rate": 0.00012284739863300165,
"loss": 0.4629,
"step": 45300
},
{
"epoch": 1.5292374023174347,
"grad_norm": 0.806281328201294,
"learning_rate": 0.00012256670081823413,
"loss": 0.4644,
"step": 45400
},
{
"epoch": 1.5326057666397197,
"grad_norm": 0.8073423504829407,
"learning_rate": 0.00012228600300346663,
"loss": 0.4626,
"step": 45500
},
{
"epoch": 1.5359741309620047,
"grad_norm": 0.8023707270622253,
"learning_rate": 0.00012200530518869911,
"loss": 0.4637,
"step": 45600
},
{
"epoch": 1.53934249528429,
"grad_norm": 0.821060299873352,
"learning_rate": 0.00012172460737393161,
"loss": 0.4624,
"step": 45700
},
{
"epoch": 1.542710859606575,
"grad_norm": 0.7743229866027832,
"learning_rate": 0.00012144390955916409,
"loss": 0.4631,
"step": 45800
},
{
"epoch": 1.54607922392886,
"grad_norm": 0.8501706719398499,
"learning_rate": 0.00012116321174439657,
"loss": 0.4646,
"step": 45900
},
{
"epoch": 1.5494475882511454,
"grad_norm": 0.798643946647644,
"learning_rate": 0.00012088251392962906,
"loss": 0.4618,
"step": 46000
},
{
"epoch": 1.5528159525734302,
"grad_norm": 0.771360456943512,
"learning_rate": 0.00012060181611486154,
"loss": 0.464,
"step": 46100
},
{
"epoch": 1.5561843168957155,
"grad_norm": 0.7841131687164307,
"learning_rate": 0.00012032111830009405,
"loss": 0.4618,
"step": 46200
},
{
"epoch": 1.5595526812180005,
"grad_norm": 0.7240998148918152,
"learning_rate": 0.00012004042048532653,
"loss": 0.4648,
"step": 46300
},
{
"epoch": 1.5629210455402855,
"grad_norm": 0.8445931673049927,
"learning_rate": 0.00011975972267055902,
"loss": 0.4606,
"step": 46400
},
{
"epoch": 1.5662894098625708,
"grad_norm": 0.8375403881072998,
"learning_rate": 0.0001194790248557915,
"loss": 0.4633,
"step": 46500
},
{
"epoch": 1.5696577741848559,
"grad_norm": 0.7885960340499878,
"learning_rate": 0.00011919832704102399,
"loss": 0.4632,
"step": 46600
},
{
"epoch": 1.573026138507141,
"grad_norm": 0.8243712186813354,
"learning_rate": 0.0001189176292262565,
"loss": 0.4618,
"step": 46700
},
{
"epoch": 1.5763945028294262,
"grad_norm": 0.8182551860809326,
"learning_rate": 0.00011863693141148897,
"loss": 0.4607,
"step": 46800
},
{
"epoch": 1.579762867151711,
"grad_norm": 0.7784871459007263,
"learning_rate": 0.00011835623359672145,
"loss": 0.4628,
"step": 46900
},
{
"epoch": 1.5831312314739963,
"grad_norm": 0.8082338571548462,
"learning_rate": 0.00011807553578195394,
"loss": 0.4621,
"step": 47000
},
{
"epoch": 1.5864995957962813,
"grad_norm": 0.8203257322311401,
"learning_rate": 0.00011779483796718642,
"loss": 0.461,
"step": 47100
},
{
"epoch": 1.5898679601185663,
"grad_norm": 0.7920771837234497,
"learning_rate": 0.00011751414015241893,
"loss": 0.4611,
"step": 47200
},
{
"epoch": 1.5932363244408516,
"grad_norm": 0.8124784827232361,
"learning_rate": 0.0001172334423376514,
"loss": 0.4598,
"step": 47300
},
{
"epoch": 1.5966046887631367,
"grad_norm": 0.8094605803489685,
"learning_rate": 0.0001169527445228839,
"loss": 0.4605,
"step": 47400
},
{
"epoch": 1.5999730530854217,
"grad_norm": 0.7639499306678772,
"learning_rate": 0.00011667204670811638,
"loss": 0.46,
"step": 47500
},
{
"epoch": 1.603341417407707,
"grad_norm": 0.8600967526435852,
"learning_rate": 0.00011639134889334887,
"loss": 0.4623,
"step": 47600
},
{
"epoch": 1.6067097817299918,
"grad_norm": 0.7747792601585388,
"learning_rate": 0.00011611065107858137,
"loss": 0.463,
"step": 47700
},
{
"epoch": 1.610078146052277,
"grad_norm": 0.8040998578071594,
"learning_rate": 0.00011582995326381385,
"loss": 0.459,
"step": 47800
},
{
"epoch": 1.613446510374562,
"grad_norm": 0.7648651003837585,
"learning_rate": 0.00011554925544904633,
"loss": 0.4618,
"step": 47900
},
{
"epoch": 1.6168148746968471,
"grad_norm": 0.789125382900238,
"learning_rate": 0.00011526855763427882,
"loss": 0.4599,
"step": 48000
},
{
"epoch": 1.6201832390191324,
"grad_norm": 0.8133670687675476,
"learning_rate": 0.0001149878598195113,
"loss": 0.4594,
"step": 48100
},
{
"epoch": 1.6235516033414175,
"grad_norm": 0.7992141842842102,
"learning_rate": 0.0001147071620047438,
"loss": 0.4602,
"step": 48200
},
{
"epoch": 1.6269199676637025,
"grad_norm": 0.780681312084198,
"learning_rate": 0.00011442646418997628,
"loss": 0.4587,
"step": 48300
},
{
"epoch": 1.6302883319859878,
"grad_norm": 0.7979656457901001,
"learning_rate": 0.00011414576637520878,
"loss": 0.4587,
"step": 48400
},
{
"epoch": 1.6336566963082726,
"grad_norm": 0.8527476787567139,
"learning_rate": 0.00011386506856044125,
"loss": 0.4586,
"step": 48500
},
{
"epoch": 1.6370250606305579,
"grad_norm": 0.8187114000320435,
"learning_rate": 0.00011358437074567375,
"loss": 0.4611,
"step": 48600
},
{
"epoch": 1.640393424952843,
"grad_norm": 0.7977433204650879,
"learning_rate": 0.00011330367293090625,
"loss": 0.4581,
"step": 48700
},
{
"epoch": 1.643761789275128,
"grad_norm": 0.8355839252471924,
"learning_rate": 0.00011302297511613873,
"loss": 0.46,
"step": 48800
},
{
"epoch": 1.6471301535974132,
"grad_norm": 0.7887241840362549,
"learning_rate": 0.00011274227730137121,
"loss": 0.4595,
"step": 48900
},
{
"epoch": 1.650498517919698,
"grad_norm": 0.8219642639160156,
"learning_rate": 0.0001124615794866037,
"loss": 0.4605,
"step": 49000
},
{
"epoch": 1.6538668822419833,
"grad_norm": 0.797517716884613,
"learning_rate": 0.00011218088167183618,
"loss": 0.46,
"step": 49100
},
{
"epoch": 1.6572352465642683,
"grad_norm": 0.81880784034729,
"learning_rate": 0.00011190018385706868,
"loss": 0.4602,
"step": 49200
},
{
"epoch": 1.6606036108865534,
"grad_norm": 0.8267971277236938,
"learning_rate": 0.00011161948604230116,
"loss": 0.4584,
"step": 49300
},
{
"epoch": 1.6639719752088387,
"grad_norm": 0.8257302045822144,
"learning_rate": 0.00011133878822753366,
"loss": 0.4602,
"step": 49400
},
{
"epoch": 1.6673403395311237,
"grad_norm": 0.7903374433517456,
"learning_rate": 0.00011105809041276613,
"loss": 0.4558,
"step": 49500
},
{
"epoch": 1.6707087038534087,
"grad_norm": 0.7741321921348572,
"learning_rate": 0.00011077739259799863,
"loss": 0.4596,
"step": 49600
},
{
"epoch": 1.674077068175694,
"grad_norm": 0.771134078502655,
"learning_rate": 0.00011049669478323113,
"loss": 0.4568,
"step": 49700
},
{
"epoch": 1.6774454324979788,
"grad_norm": 0.7859461307525635,
"learning_rate": 0.00011021599696846361,
"loss": 0.4577,
"step": 49800
},
{
"epoch": 1.680813796820264,
"grad_norm": 0.7759444117546082,
"learning_rate": 0.00010993529915369609,
"loss": 0.457,
"step": 49900
},
{
"epoch": 1.6841821611425492,
"grad_norm": 0.8348528742790222,
"learning_rate": 0.00010965460133892858,
"loss": 0.4569,
"step": 50000
},
{
"epoch": 1.6875505254648342,
"grad_norm": 0.8011546730995178,
"learning_rate": 0.00010937390352416106,
"loss": 0.4585,
"step": 50100
},
{
"epoch": 1.6909188897871195,
"grad_norm": 0.790429413318634,
"learning_rate": 0.00010909320570939356,
"loss": 0.4582,
"step": 50200
},
{
"epoch": 1.6942872541094045,
"grad_norm": 0.8371046781539917,
"learning_rate": 0.00010881250789462604,
"loss": 0.4591,
"step": 50300
},
{
"epoch": 1.6976556184316896,
"grad_norm": 0.7836015820503235,
"learning_rate": 0.00010853181007985853,
"loss": 0.4581,
"step": 50400
},
{
"epoch": 1.7010239827539748,
"grad_norm": 0.846708357334137,
"learning_rate": 0.00010825111226509101,
"loss": 0.4569,
"step": 50500
},
{
"epoch": 1.7043923470762596,
"grad_norm": 0.797223687171936,
"learning_rate": 0.00010797041445032352,
"loss": 0.4569,
"step": 50600
},
{
"epoch": 1.707760711398545,
"grad_norm": 0.8466051816940308,
"learning_rate": 0.00010768971663555601,
"loss": 0.4567,
"step": 50700
},
{
"epoch": 1.71112907572083,
"grad_norm": 0.7285684943199158,
"learning_rate": 0.00010740901882078849,
"loss": 0.456,
"step": 50800
},
{
"epoch": 1.714497440043115,
"grad_norm": 0.8624778985977173,
"learning_rate": 0.00010712832100602097,
"loss": 0.4588,
"step": 50900
},
{
"epoch": 1.7178658043654003,
"grad_norm": 0.7958481311798096,
"learning_rate": 0.00010684762319125346,
"loss": 0.4566,
"step": 51000
},
{
"epoch": 1.7212341686876853,
"grad_norm": 0.7974202036857605,
"learning_rate": 0.00010656692537648596,
"loss": 0.4566,
"step": 51100
},
{
"epoch": 1.7246025330099704,
"grad_norm": 0.8782477378845215,
"learning_rate": 0.00010628622756171844,
"loss": 0.4577,
"step": 51200
},
{
"epoch": 1.7279708973322556,
"grad_norm": 0.8142967820167542,
"learning_rate": 0.00010600552974695092,
"loss": 0.4543,
"step": 51300
},
{
"epoch": 1.7313392616545404,
"grad_norm": 0.7704757452011108,
"learning_rate": 0.00010572483193218341,
"loss": 0.4565,
"step": 51400
},
{
"epoch": 1.7347076259768257,
"grad_norm": 0.8298918604850769,
"learning_rate": 0.00010544413411741589,
"loss": 0.4564,
"step": 51500
},
{
"epoch": 1.7380759902991108,
"grad_norm": 0.7840197682380676,
"learning_rate": 0.0001051634363026484,
"loss": 0.457,
"step": 51600
},
{
"epoch": 1.7414443546213958,
"grad_norm": 0.8080000281333923,
"learning_rate": 0.00010488273848788088,
"loss": 0.4563,
"step": 51700
},
{
"epoch": 1.744812718943681,
"grad_norm": 0.8133041262626648,
"learning_rate": 0.00010460204067311337,
"loss": 0.4529,
"step": 51800
},
{
"epoch": 1.7481810832659659,
"grad_norm": 0.8792639374732971,
"learning_rate": 0.00010432134285834585,
"loss": 0.4536,
"step": 51900
},
{
"epoch": 1.7515494475882512,
"grad_norm": 0.8580865263938904,
"learning_rate": 0.00010404064504357834,
"loss": 0.4542,
"step": 52000
},
{
"epoch": 1.7549178119105362,
"grad_norm": 0.7759612798690796,
"learning_rate": 0.00010375994722881084,
"loss": 0.4557,
"step": 52100
},
{
"epoch": 1.7582861762328212,
"grad_norm": 0.748423159122467,
"learning_rate": 0.00010347924941404332,
"loss": 0.454,
"step": 52200
},
{
"epoch": 1.7616545405551065,
"grad_norm": 0.7873731851577759,
"learning_rate": 0.0001031985515992758,
"loss": 0.4543,
"step": 52300
},
{
"epoch": 1.7650229048773916,
"grad_norm": 0.7736590504646301,
"learning_rate": 0.00010291785378450829,
"loss": 0.4556,
"step": 52400
},
{
"epoch": 1.7683912691996766,
"grad_norm": 0.7629456520080566,
"learning_rate": 0.00010263715596974077,
"loss": 0.4545,
"step": 52500
},
{
"epoch": 1.7717596335219619,
"grad_norm": 0.8270254731178284,
"learning_rate": 0.00010235645815497328,
"loss": 0.4546,
"step": 52600
},
{
"epoch": 1.7751279978442467,
"grad_norm": 0.7610684633255005,
"learning_rate": 0.00010207576034020576,
"loss": 0.4527,
"step": 52700
},
{
"epoch": 1.778496362166532,
"grad_norm": 0.8228756785392761,
"learning_rate": 0.00010179506252543825,
"loss": 0.4568,
"step": 52800
},
{
"epoch": 1.781864726488817,
"grad_norm": 0.8317448496818542,
"learning_rate": 0.00010151436471067073,
"loss": 0.4543,
"step": 52900
},
{
"epoch": 1.785233090811102,
"grad_norm": 0.7914367318153381,
"learning_rate": 0.00010123366689590322,
"loss": 0.4529,
"step": 53000
},
{
"epoch": 1.7886014551333873,
"grad_norm": 0.8205628395080566,
"learning_rate": 0.00010095296908113572,
"loss": 0.4537,
"step": 53100
},
{
"epoch": 1.7919698194556724,
"grad_norm": 0.794956386089325,
"learning_rate": 0.0001006722712663682,
"loss": 0.455,
"step": 53200
},
{
"epoch": 1.7953381837779574,
"grad_norm": 0.8285955786705017,
"learning_rate": 0.00010039157345160068,
"loss": 0.4535,
"step": 53300
},
{
"epoch": 1.7987065481002427,
"grad_norm": 0.8204521536827087,
"learning_rate": 0.00010011087563683317,
"loss": 0.4561,
"step": 53400
},
{
"epoch": 1.8020749124225275,
"grad_norm": 0.8407822251319885,
"learning_rate": 9.983017782206566e-05,
"loss": 0.4563,
"step": 53500
},
{
"epoch": 1.8054432767448128,
"grad_norm": 0.8138654828071594,
"learning_rate": 9.954948000729814e-05,
"loss": 0.4547,
"step": 53600
},
{
"epoch": 1.8088116410670978,
"grad_norm": 0.8514792323112488,
"learning_rate": 9.926878219253063e-05,
"loss": 0.453,
"step": 53700
},
{
"epoch": 1.8121800053893828,
"grad_norm": 0.836942195892334,
"learning_rate": 9.898808437776313e-05,
"loss": 0.4554,
"step": 53800
},
{
"epoch": 1.815548369711668,
"grad_norm": 0.8424620628356934,
"learning_rate": 9.87073865629956e-05,
"loss": 0.4541,
"step": 53900
},
{
"epoch": 1.8189167340339532,
"grad_norm": 0.7823119163513184,
"learning_rate": 9.84266887482281e-05,
"loss": 0.4539,
"step": 54000
},
{
"epoch": 1.8222850983562382,
"grad_norm": 0.8232121467590332,
"learning_rate": 9.814599093346059e-05,
"loss": 0.4518,
"step": 54100
},
{
"epoch": 1.8256534626785235,
"grad_norm": 0.7991457581520081,
"learning_rate": 9.786529311869308e-05,
"loss": 0.4516,
"step": 54200
},
{
"epoch": 1.8290218270008083,
"grad_norm": 0.7749050855636597,
"learning_rate": 9.758459530392556e-05,
"loss": 0.4528,
"step": 54300
},
{
"epoch": 1.8323901913230936,
"grad_norm": 0.7452788949012756,
"learning_rate": 9.730389748915805e-05,
"loss": 0.4555,
"step": 54400
},
{
"epoch": 1.8357585556453786,
"grad_norm": 0.816150963306427,
"learning_rate": 9.702319967439054e-05,
"loss": 0.4514,
"step": 54500
},
{
"epoch": 1.8391269199676636,
"grad_norm": 0.785351037979126,
"learning_rate": 9.674250185962302e-05,
"loss": 0.4517,
"step": 54600
},
{
"epoch": 1.842495284289949,
"grad_norm": 0.828187108039856,
"learning_rate": 9.646180404485551e-05,
"loss": 0.4533,
"step": 54700
},
{
"epoch": 1.845863648612234,
"grad_norm": 0.7950621247291565,
"learning_rate": 9.6181106230088e-05,
"loss": 0.4523,
"step": 54800
},
{
"epoch": 1.849232012934519,
"grad_norm": 0.7881097197532654,
"learning_rate": 9.590040841532048e-05,
"loss": 0.4516,
"step": 54900
},
{
"epoch": 1.852600377256804,
"grad_norm": 0.7643069624900818,
"learning_rate": 9.561971060055298e-05,
"loss": 0.453,
"step": 55000
},
{
"epoch": 1.855968741579089,
"grad_norm": 0.7741556167602539,
"learning_rate": 9.533901278578547e-05,
"loss": 0.4528,
"step": 55100
},
{
"epoch": 1.8593371059013744,
"grad_norm": 0.8289052844047546,
"learning_rate": 9.505831497101796e-05,
"loss": 0.452,
"step": 55200
},
{
"epoch": 1.8627054702236594,
"grad_norm": 0.7747401595115662,
"learning_rate": 9.477761715625044e-05,
"loss": 0.453,
"step": 55300
},
{
"epoch": 1.8660738345459444,
"grad_norm": 0.837910532951355,
"learning_rate": 9.449691934148293e-05,
"loss": 0.4532,
"step": 55400
},
{
"epoch": 1.8694421988682297,
"grad_norm": 0.7754988670349121,
"learning_rate": 9.421622152671542e-05,
"loss": 0.4497,
"step": 55500
},
{
"epoch": 1.8728105631905145,
"grad_norm": 0.8681314587593079,
"learning_rate": 9.39355237119479e-05,
"loss": 0.451,
"step": 55600
},
{
"epoch": 1.8761789275127998,
"grad_norm": 0.8410942554473877,
"learning_rate": 9.365482589718039e-05,
"loss": 0.451,
"step": 55700
},
{
"epoch": 1.8795472918350848,
"grad_norm": 0.8542850613594055,
"learning_rate": 9.337412808241288e-05,
"loss": 0.4524,
"step": 55800
},
{
"epoch": 1.8829156561573699,
"grad_norm": 0.806122899055481,
"learning_rate": 9.309343026764538e-05,
"loss": 0.4535,
"step": 55900
},
{
"epoch": 1.8862840204796552,
"grad_norm": 0.8103610277175903,
"learning_rate": 9.281273245287786e-05,
"loss": 0.4514,
"step": 56000
},
{
"epoch": 1.8896523848019402,
"grad_norm": 0.7982548475265503,
"learning_rate": 9.253203463811035e-05,
"loss": 0.4504,
"step": 56100
},
{
"epoch": 1.8930207491242252,
"grad_norm": 0.8081793189048767,
"learning_rate": 9.225133682334284e-05,
"loss": 0.4522,
"step": 56200
},
{
"epoch": 1.8963891134465105,
"grad_norm": 0.8284481763839722,
"learning_rate": 9.197063900857532e-05,
"loss": 0.4501,
"step": 56300
},
{
"epoch": 1.8997574777687953,
"grad_norm": 0.7722172737121582,
"learning_rate": 9.168994119380781e-05,
"loss": 0.4528,
"step": 56400
},
{
"epoch": 1.9031258420910806,
"grad_norm": 0.8065896034240723,
"learning_rate": 9.14092433790403e-05,
"loss": 0.4527,
"step": 56500
},
{
"epoch": 1.9064942064133656,
"grad_norm": 0.8267763257026672,
"learning_rate": 9.112854556427278e-05,
"loss": 0.4547,
"step": 56600
},
{
"epoch": 1.9098625707356507,
"grad_norm": 0.803359866142273,
"learning_rate": 9.084784774950527e-05,
"loss": 0.4506,
"step": 56700
},
{
"epoch": 1.913230935057936,
"grad_norm": 0.7984471321105957,
"learning_rate": 9.056714993473776e-05,
"loss": 0.4498,
"step": 56800
},
{
"epoch": 1.916599299380221,
"grad_norm": 0.8118926286697388,
"learning_rate": 9.028645211997026e-05,
"loss": 0.4511,
"step": 56900
},
{
"epoch": 1.919967663702506,
"grad_norm": 0.7954909205436707,
"learning_rate": 9.000575430520273e-05,
"loss": 0.45,
"step": 57000
},
{
"epoch": 1.9233360280247913,
"grad_norm": 0.7925546765327454,
"learning_rate": 8.972505649043523e-05,
"loss": 0.4503,
"step": 57100
},
{
"epoch": 1.9267043923470761,
"grad_norm": 0.7257952690124512,
"learning_rate": 8.944435867566772e-05,
"loss": 0.4501,
"step": 57200
},
{
"epoch": 1.9300727566693614,
"grad_norm": 0.7644702196121216,
"learning_rate": 8.91636608609002e-05,
"loss": 0.4502,
"step": 57300
},
{
"epoch": 1.9334411209916464,
"grad_norm": 0.8492504358291626,
"learning_rate": 8.888296304613269e-05,
"loss": 0.451,
"step": 57400
},
{
"epoch": 1.9368094853139315,
"grad_norm": 0.7511376142501831,
"learning_rate": 8.860226523136518e-05,
"loss": 0.4511,
"step": 57500
},
{
"epoch": 1.9401778496362168,
"grad_norm": 0.8549360036849976,
"learning_rate": 8.832156741659766e-05,
"loss": 0.4504,
"step": 57600
},
{
"epoch": 1.9435462139585018,
"grad_norm": 0.7821473479270935,
"learning_rate": 8.804086960183015e-05,
"loss": 0.4508,
"step": 57700
},
{
"epoch": 1.9469145782807868,
"grad_norm": 0.8527407050132751,
"learning_rate": 8.776017178706264e-05,
"loss": 0.4514,
"step": 57800
},
{
"epoch": 1.9502829426030721,
"grad_norm": 0.8409647941589355,
"learning_rate": 8.747947397229514e-05,
"loss": 0.4498,
"step": 57900
},
{
"epoch": 1.953651306925357,
"grad_norm": 0.8430731296539307,
"learning_rate": 8.719877615752761e-05,
"loss": 0.4498,
"step": 58000
},
{
"epoch": 1.9570196712476422,
"grad_norm": 0.8346706032752991,
"learning_rate": 8.69180783427601e-05,
"loss": 0.4475,
"step": 58100
},
{
"epoch": 1.9603880355699272,
"grad_norm": 0.7488289475440979,
"learning_rate": 8.66373805279926e-05,
"loss": 0.4488,
"step": 58200
},
{
"epoch": 1.9637563998922123,
"grad_norm": 0.836130678653717,
"learning_rate": 8.635668271322508e-05,
"loss": 0.451,
"step": 58300
},
{
"epoch": 1.9671247642144976,
"grad_norm": 0.7900556921958923,
"learning_rate": 8.607598489845757e-05,
"loss": 0.4463,
"step": 58400
},
{
"epoch": 1.9704931285367824,
"grad_norm": 0.8496758341789246,
"learning_rate": 8.579528708369006e-05,
"loss": 0.4475,
"step": 58500
},
{
"epoch": 1.9738614928590676,
"grad_norm": 0.8665506839752197,
"learning_rate": 8.551458926892254e-05,
"loss": 0.4474,
"step": 58600
},
{
"epoch": 1.9772298571813527,
"grad_norm": 0.8058724999427795,
"learning_rate": 8.523389145415503e-05,
"loss": 0.4496,
"step": 58700
},
{
"epoch": 1.9805982215036377,
"grad_norm": 0.8007978796958923,
"learning_rate": 8.495319363938752e-05,
"loss": 0.4494,
"step": 58800
},
{
"epoch": 1.983966585825923,
"grad_norm": 0.7893068790435791,
"learning_rate": 8.467249582462001e-05,
"loss": 0.4477,
"step": 58900
},
{
"epoch": 1.987334950148208,
"grad_norm": 0.8267046213150024,
"learning_rate": 8.439179800985249e-05,
"loss": 0.4479,
"step": 59000
},
{
"epoch": 1.990703314470493,
"grad_norm": 0.8301923274993896,
"learning_rate": 8.411110019508498e-05,
"loss": 0.4486,
"step": 59100
},
{
"epoch": 1.9940716787927784,
"grad_norm": 0.7466899156570435,
"learning_rate": 8.383040238031748e-05,
"loss": 0.4481,
"step": 59200
},
{
"epoch": 1.9974400431150632,
"grad_norm": 0.8137242794036865,
"learning_rate": 8.354970456554995e-05,
"loss": 0.4501,
"step": 59300
},
{
"epoch": 2.0,
"eval_loss": 0.44603702425956726,
"eval_runtime": 7.7293,
"eval_samples_per_second": 646.889,
"eval_steps_per_second": 10.221,
"step": 59376
}
],
"logging_steps": 100,
"max_steps": 89064,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 2413913702400000.0,
"train_batch_size": 32,
"trial_name": null,
"trial_params": null
}