{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 5.9997600095996155,
"eval_steps": 500,
"global_step": 150000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 3.999840006399744e-05,
"grad_norm": 1.4777933359146118,
"learning_rate": 0.001,
"loss": 10.8399,
"step": 1
},
{
"epoch": 0.003999840006399744,
"grad_norm": 0.22688856720924377,
"learning_rate": 0.001,
"loss": 7.558,
"step": 100
},
{
"epoch": 0.007999680012799487,
"grad_norm": 0.24372541904449463,
"learning_rate": 0.001,
"loss": 6.3052,
"step": 200
},
{
"epoch": 0.011999520019199232,
"grad_norm": 0.5153403282165527,
"learning_rate": 0.001,
"loss": 5.9045,
"step": 300
},
{
"epoch": 0.015999360025598975,
"grad_norm": 0.5177266597747803,
"learning_rate": 0.001,
"loss": 5.6063,
"step": 400
},
{
"epoch": 0.01999920003199872,
"grad_norm": 0.6018334627151489,
"learning_rate": 0.001,
"loss": 5.4024,
"step": 500
},
{
"epoch": 0.023999040038398464,
"grad_norm": 0.4220522940158844,
"learning_rate": 0.001,
"loss": 5.2604,
"step": 600
},
{
"epoch": 0.027998880044798207,
"grad_norm": 0.4826813042163849,
"learning_rate": 0.001,
"loss": 5.149,
"step": 700
},
{
"epoch": 0.03199872005119795,
"grad_norm": 0.5880510807037354,
"learning_rate": 0.001,
"loss": 5.0663,
"step": 800
},
{
"epoch": 0.035998560057597696,
"grad_norm": 0.4875524640083313,
"learning_rate": 0.001,
"loss": 5.0008,
"step": 900
},
{
"epoch": 0.03999840006399744,
"grad_norm": 0.526023805141449,
"learning_rate": 0.001,
"loss": 4.9403,
"step": 1000
},
{
"epoch": 0.04399824007039718,
"grad_norm": 0.4925878942012787,
"learning_rate": 0.001,
"loss": 4.8988,
"step": 1100
},
{
"epoch": 0.04799808007679693,
"grad_norm": 0.5051125884056091,
"learning_rate": 0.001,
"loss": 4.8552,
"step": 1200
},
{
"epoch": 0.051997920083196675,
"grad_norm": 0.6583888530731201,
"learning_rate": 0.001,
"loss": 4.8247,
"step": 1300
},
{
"epoch": 0.055997760089596414,
"grad_norm": 0.4777211546897888,
"learning_rate": 0.001,
"loss": 4.7899,
"step": 1400
},
{
"epoch": 0.05999760009599616,
"grad_norm": 0.5130186080932617,
"learning_rate": 0.001,
"loss": 4.7612,
"step": 1500
},
{
"epoch": 0.0639974401023959,
"grad_norm": 0.5315548777580261,
"learning_rate": 0.001,
"loss": 4.7361,
"step": 1600
},
{
"epoch": 0.06799728010879565,
"grad_norm": 0.47726863622665405,
"learning_rate": 0.001,
"loss": 4.7013,
"step": 1700
},
{
"epoch": 0.07199712011519539,
"grad_norm": 0.5642004013061523,
"learning_rate": 0.001,
"loss": 4.6645,
"step": 1800
},
{
"epoch": 0.07599696012159514,
"grad_norm": 0.545949399471283,
"learning_rate": 0.001,
"loss": 4.6219,
"step": 1900
},
{
"epoch": 0.07999680012799489,
"grad_norm": 0.5206509232521057,
"learning_rate": 0.001,
"loss": 4.5969,
"step": 2000
},
{
"epoch": 0.08399664013439462,
"grad_norm": 0.44842126965522766,
"learning_rate": 0.001,
"loss": 4.5708,
"step": 2100
},
{
"epoch": 0.08799648014079436,
"grad_norm": 0.6365007162094116,
"learning_rate": 0.001,
"loss": 4.5523,
"step": 2200
},
{
"epoch": 0.09199632014719411,
"grad_norm": 0.5729643702507019,
"learning_rate": 0.001,
"loss": 4.5305,
"step": 2300
},
{
"epoch": 0.09599616015359386,
"grad_norm": 0.5888395309448242,
"learning_rate": 0.001,
"loss": 4.5189,
"step": 2400
},
{
"epoch": 0.0999960001599936,
"grad_norm": 0.6099081039428711,
"learning_rate": 0.001,
"loss": 4.5038,
"step": 2500
},
{
"epoch": 0.10399584016639335,
"grad_norm": 0.5354722738265991,
"learning_rate": 0.001,
"loss": 4.487,
"step": 2600
},
{
"epoch": 0.10799568017279308,
"grad_norm": 0.6929482221603394,
"learning_rate": 0.001,
"loss": 4.4784,
"step": 2700
},
{
"epoch": 0.11199552017919283,
"grad_norm": 0.5511060357093811,
"learning_rate": 0.001,
"loss": 4.4682,
"step": 2800
},
{
"epoch": 0.11599536018559257,
"grad_norm": 0.5773873329162598,
"learning_rate": 0.001,
"loss": 4.4589,
"step": 2900
},
{
"epoch": 0.11999520019199232,
"grad_norm": 0.6061297655105591,
"learning_rate": 0.001,
"loss": 4.452,
"step": 3000
},
{
"epoch": 0.12399504019839207,
"grad_norm": 0.6082037687301636,
"learning_rate": 0.001,
"loss": 4.443,
"step": 3100
},
{
"epoch": 0.1279948802047918,
"grad_norm": 0.5562213063240051,
"learning_rate": 0.001,
"loss": 4.4342,
"step": 3200
},
{
"epoch": 0.13199472021119155,
"grad_norm": 0.5706282258033752,
"learning_rate": 0.001,
"loss": 4.4281,
"step": 3300
},
{
"epoch": 0.1359945602175913,
"grad_norm": 0.6546366810798645,
"learning_rate": 0.001,
"loss": 4.4192,
"step": 3400
},
{
"epoch": 0.13999440022399104,
"grad_norm": 0.5441614389419556,
"learning_rate": 0.001,
"loss": 4.4157,
"step": 3500
},
{
"epoch": 0.14399424023039079,
"grad_norm": 0.547122061252594,
"learning_rate": 0.001,
"loss": 4.4079,
"step": 3600
},
{
"epoch": 0.14799408023679053,
"grad_norm": 0.520165741443634,
"learning_rate": 0.001,
"loss": 4.4047,
"step": 3700
},
{
"epoch": 0.15199392024319028,
"grad_norm": 0.529140293598175,
"learning_rate": 0.001,
"loss": 4.3997,
"step": 3800
},
{
"epoch": 0.15599376024959002,
"grad_norm": 0.5902653336524963,
"learning_rate": 0.001,
"loss": 4.3921,
"step": 3900
},
{
"epoch": 0.15999360025598977,
"grad_norm": 0.6136724948883057,
"learning_rate": 0.001,
"loss": 4.3877,
"step": 4000
},
{
"epoch": 0.16399344026238952,
"grad_norm": 0.5538173913955688,
"learning_rate": 0.001,
"loss": 4.383,
"step": 4100
},
{
"epoch": 0.16799328026878924,
"grad_norm": 0.5516422986984253,
"learning_rate": 0.001,
"loss": 4.3815,
"step": 4200
},
{
"epoch": 0.17199312027518898,
"grad_norm": 0.535236656665802,
"learning_rate": 0.001,
"loss": 4.375,
"step": 4300
},
{
"epoch": 0.17599296028158873,
"grad_norm": 0.5214977860450745,
"learning_rate": 0.001,
"loss": 4.3716,
"step": 4400
},
{
"epoch": 0.17999280028798847,
"grad_norm": 0.473036527633667,
"learning_rate": 0.001,
"loss": 4.3679,
"step": 4500
},
{
"epoch": 0.18399264029438822,
"grad_norm": 0.5115819573402405,
"learning_rate": 0.001,
"loss": 4.3661,
"step": 4600
},
{
"epoch": 0.18799248030078797,
"grad_norm": 0.5074037313461304,
"learning_rate": 0.001,
"loss": 4.3606,
"step": 4700
},
{
"epoch": 0.1919923203071877,
"grad_norm": 0.4992203116416931,
"learning_rate": 0.001,
"loss": 4.3579,
"step": 4800
},
{
"epoch": 0.19599216031358746,
"grad_norm": 0.5755491256713867,
"learning_rate": 0.001,
"loss": 4.3501,
"step": 4900
},
{
"epoch": 0.1999920003199872,
"grad_norm": 0.5061823725700378,
"learning_rate": 0.001,
"loss": 4.3486,
"step": 5000
},
{
"epoch": 0.20399184032638695,
"grad_norm": 0.5323928594589233,
"learning_rate": 0.001,
"loss": 4.3477,
"step": 5100
},
{
"epoch": 0.2079916803327867,
"grad_norm": 0.5301046967506409,
"learning_rate": 0.001,
"loss": 4.3411,
"step": 5200
},
{
"epoch": 0.21199152033918645,
"grad_norm": 0.563123345375061,
"learning_rate": 0.001,
"loss": 4.3405,
"step": 5300
},
{
"epoch": 0.21599136034558616,
"grad_norm": 0.4736695885658264,
"learning_rate": 0.001,
"loss": 4.3391,
"step": 5400
},
{
"epoch": 0.2199912003519859,
"grad_norm": 0.46568775177001953,
"learning_rate": 0.001,
"loss": 4.3372,
"step": 5500
},
{
"epoch": 0.22399104035838566,
"grad_norm": 0.591935396194458,
"learning_rate": 0.001,
"loss": 4.3318,
"step": 5600
},
{
"epoch": 0.2279908803647854,
"grad_norm": 0.4887066185474396,
"learning_rate": 0.001,
"loss": 4.3318,
"step": 5700
},
{
"epoch": 0.23199072037118515,
"grad_norm": 0.4820877015590668,
"learning_rate": 0.001,
"loss": 4.3253,
"step": 5800
},
{
"epoch": 0.2359905603775849,
"grad_norm": 0.48746803402900696,
"learning_rate": 0.001,
"loss": 4.3242,
"step": 5900
},
{
"epoch": 0.23999040038398464,
"grad_norm": 0.5339901447296143,
"learning_rate": 0.001,
"loss": 4.3226,
"step": 6000
},
{
"epoch": 0.2439902403903844,
"grad_norm": 0.5406463146209717,
"learning_rate": 0.001,
"loss": 4.3197,
"step": 6100
},
{
"epoch": 0.24799008039678413,
"grad_norm": 0.5854783654212952,
"learning_rate": 0.001,
"loss": 4.318,
"step": 6200
},
{
"epoch": 0.25198992040318385,
"grad_norm": 0.47678840160369873,
"learning_rate": 0.001,
"loss": 4.3187,
"step": 6300
},
{
"epoch": 0.2559897604095836,
"grad_norm": 0.5523233413696289,
"learning_rate": 0.001,
"loss": 4.3117,
"step": 6400
},
{
"epoch": 0.25998960041598335,
"grad_norm": 0.46079719066619873,
"learning_rate": 0.001,
"loss": 4.3165,
"step": 6500
},
{
"epoch": 0.2639894404223831,
"grad_norm": 0.4956238269805908,
"learning_rate": 0.001,
"loss": 4.3116,
"step": 6600
},
{
"epoch": 0.26798928042878284,
"grad_norm": 0.532508373260498,
"learning_rate": 0.001,
"loss": 4.307,
"step": 6700
},
{
"epoch": 0.2719891204351826,
"grad_norm": 0.5244960188865662,
"learning_rate": 0.001,
"loss": 4.309,
"step": 6800
},
{
"epoch": 0.27598896044158233,
"grad_norm": 0.48627936840057373,
"learning_rate": 0.001,
"loss": 4.3025,
"step": 6900
},
{
"epoch": 0.2799888004479821,
"grad_norm": 0.5393197536468506,
"learning_rate": 0.001,
"loss": 4.3062,
"step": 7000
},
{
"epoch": 0.2839886404543818,
"grad_norm": 0.5384635925292969,
"learning_rate": 0.001,
"loss": 4.2993,
"step": 7100
},
{
"epoch": 0.28798848046078157,
"grad_norm": 0.5455852150917053,
"learning_rate": 0.001,
"loss": 4.299,
"step": 7200
},
{
"epoch": 0.2919883204671813,
"grad_norm": 0.45626768469810486,
"learning_rate": 0.001,
"loss": 4.2983,
"step": 7300
},
{
"epoch": 0.29598816047358106,
"grad_norm": 0.5111705660820007,
"learning_rate": 0.001,
"loss": 4.2992,
"step": 7400
},
{
"epoch": 0.2999880004799808,
"grad_norm": 0.5607153177261353,
"learning_rate": 0.001,
"loss": 4.291,
"step": 7500
},
{
"epoch": 0.30398784048638056,
"grad_norm": 0.5863308906555176,
"learning_rate": 0.001,
"loss": 4.2928,
"step": 7600
},
{
"epoch": 0.3079876804927803,
"grad_norm": 0.48837366700172424,
"learning_rate": 0.001,
"loss": 4.2951,
"step": 7700
},
{
"epoch": 0.31198752049918005,
"grad_norm": 0.5300312042236328,
"learning_rate": 0.001,
"loss": 4.2891,
"step": 7800
},
{
"epoch": 0.3159873605055798,
"grad_norm": 0.4869995415210724,
"learning_rate": 0.001,
"loss": 4.2868,
"step": 7900
},
{
"epoch": 0.31998720051197954,
"grad_norm": 0.5378390550613403,
"learning_rate": 0.001,
"loss": 4.2863,
"step": 8000
},
{
"epoch": 0.3239870405183793,
"grad_norm": 0.49154022336006165,
"learning_rate": 0.001,
"loss": 4.2902,
"step": 8100
},
{
"epoch": 0.32798688052477903,
"grad_norm": 0.5984882712364197,
"learning_rate": 0.001,
"loss": 4.2851,
"step": 8200
},
{
"epoch": 0.3319867205311788,
"grad_norm": 0.5132819414138794,
"learning_rate": 0.001,
"loss": 4.2854,
"step": 8300
},
{
"epoch": 0.33598656053757847,
"grad_norm": 0.5187487006187439,
"learning_rate": 0.001,
"loss": 4.2853,
"step": 8400
},
{
"epoch": 0.3399864005439782,
"grad_norm": 0.5310469269752502,
"learning_rate": 0.001,
"loss": 4.2785,
"step": 8500
},
{
"epoch": 0.34398624055037796,
"grad_norm": 0.5427576899528503,
"learning_rate": 0.001,
"loss": 4.2799,
"step": 8600
},
{
"epoch": 0.3479860805567777,
"grad_norm": 0.566967248916626,
"learning_rate": 0.001,
"loss": 4.2822,
"step": 8700
},
{
"epoch": 0.35198592056317746,
"grad_norm": 0.48460662364959717,
"learning_rate": 0.001,
"loss": 4.2772,
"step": 8800
},
{
"epoch": 0.3559857605695772,
"grad_norm": 0.5119643807411194,
"learning_rate": 0.001,
"loss": 4.2782,
"step": 8900
},
{
"epoch": 0.35998560057597695,
"grad_norm": 0.5386670827865601,
"learning_rate": 0.001,
"loss": 4.2749,
"step": 9000
},
{
"epoch": 0.3639854405823767,
"grad_norm": 0.5337742567062378,
"learning_rate": 0.001,
"loss": 4.2745,
"step": 9100
},
{
"epoch": 0.36798528058877644,
"grad_norm": 0.522514283657074,
"learning_rate": 0.001,
"loss": 4.2717,
"step": 9200
},
{
"epoch": 0.3719851205951762,
"grad_norm": 0.549670934677124,
"learning_rate": 0.001,
"loss": 4.272,
"step": 9300
},
{
"epoch": 0.37598496060157593,
"grad_norm": 0.6063629388809204,
"learning_rate": 0.001,
"loss": 4.2684,
"step": 9400
},
{
"epoch": 0.3799848006079757,
"grad_norm": 0.5420516133308411,
"learning_rate": 0.001,
"loss": 4.2674,
"step": 9500
},
{
"epoch": 0.3839846406143754,
"grad_norm": 0.48435646295547485,
"learning_rate": 0.001,
"loss": 4.2675,
"step": 9600
},
{
"epoch": 0.3879844806207752,
"grad_norm": 0.5914377570152283,
"learning_rate": 0.001,
"loss": 4.2737,
"step": 9700
},
{
"epoch": 0.3919843206271749,
"grad_norm": 0.4351874589920044,
"learning_rate": 0.001,
"loss": 4.2684,
"step": 9800
},
{
"epoch": 0.39598416063357467,
"grad_norm": 0.5514108538627625,
"learning_rate": 0.001,
"loss": 4.2671,
"step": 9900
},
{
"epoch": 0.3999840006399744,
"grad_norm": 0.5687771439552307,
"learning_rate": 0.001,
"loss": 4.2626,
"step": 10000
},
{
"epoch": 0.40398384064637416,
"grad_norm": 0.5736687779426575,
"learning_rate": 0.001,
"loss": 4.2673,
"step": 10100
},
{
"epoch": 0.4079836806527739,
"grad_norm": 0.5822706818580627,
"learning_rate": 0.001,
"loss": 4.264,
"step": 10200
},
{
"epoch": 0.41198352065917365,
"grad_norm": 0.5612554550170898,
"learning_rate": 0.001,
"loss": 4.2609,
"step": 10300
},
{
"epoch": 0.4159833606655734,
"grad_norm": 0.5104981660842896,
"learning_rate": 0.001,
"loss": 4.2625,
"step": 10400
},
{
"epoch": 0.41998320067197314,
"grad_norm": 0.5948505997657776,
"learning_rate": 0.001,
"loss": 4.2566,
"step": 10500
},
{
"epoch": 0.4239830406783729,
"grad_norm": 0.5640226006507874,
"learning_rate": 0.001,
"loss": 4.2602,
"step": 10600
},
{
"epoch": 0.4279828806847726,
"grad_norm": 0.6263200640678406,
"learning_rate": 0.001,
"loss": 4.2603,
"step": 10700
},
{
"epoch": 0.43198272069117233,
"grad_norm": 0.48344025015830994,
"learning_rate": 0.001,
"loss": 4.2596,
"step": 10800
},
{
"epoch": 0.4359825606975721,
"grad_norm": 0.49527639150619507,
"learning_rate": 0.001,
"loss": 4.2579,
"step": 10900
},
{
"epoch": 0.4399824007039718,
"grad_norm": 0.4601668417453766,
"learning_rate": 0.001,
"loss": 4.2532,
"step": 11000
},
{
"epoch": 0.44398224071037157,
"grad_norm": 0.4835492968559265,
"learning_rate": 0.001,
"loss": 4.2565,
"step": 11100
},
{
"epoch": 0.4479820807167713,
"grad_norm": 0.4633197784423828,
"learning_rate": 0.001,
"loss": 4.2531,
"step": 11200
},
{
"epoch": 0.45198192072317106,
"grad_norm": 0.5395948886871338,
"learning_rate": 0.001,
"loss": 4.2558,
"step": 11300
},
{
"epoch": 0.4559817607295708,
"grad_norm": 0.5230295658111572,
"learning_rate": 0.001,
"loss": 4.2497,
"step": 11400
},
{
"epoch": 0.45998160073597055,
"grad_norm": 0.47804856300354004,
"learning_rate": 0.001,
"loss": 4.2561,
"step": 11500
},
{
"epoch": 0.4639814407423703,
"grad_norm": 0.4512189030647278,
"learning_rate": 0.001,
"loss": 4.2479,
"step": 11600
},
{
"epoch": 0.46798128074877005,
"grad_norm": 0.7052320837974548,
"learning_rate": 0.001,
"loss": 4.2509,
"step": 11700
},
{
"epoch": 0.4719811207551698,
"grad_norm": 0.4637924134731293,
"learning_rate": 0.001,
"loss": 4.2525,
"step": 11800
},
{
"epoch": 0.47598096076156954,
"grad_norm": 0.442754864692688,
"learning_rate": 0.001,
"loss": 4.2499,
"step": 11900
},
{
"epoch": 0.4799808007679693,
"grad_norm": 0.48194420337677,
"learning_rate": 0.001,
"loss": 4.2496,
"step": 12000
},
{
"epoch": 0.48398064077436903,
"grad_norm": 0.5276590585708618,
"learning_rate": 0.001,
"loss": 4.2488,
"step": 12100
},
{
"epoch": 0.4879804807807688,
"grad_norm": 0.4882962107658386,
"learning_rate": 0.001,
"loss": 4.2438,
"step": 12200
},
{
"epoch": 0.4919803207871685,
"grad_norm": 0.47169601917266846,
"learning_rate": 0.001,
"loss": 4.2453,
"step": 12300
},
{
"epoch": 0.49598016079356827,
"grad_norm": 0.48581433296203613,
"learning_rate": 0.001,
"loss": 4.2462,
"step": 12400
},
{
"epoch": 0.499980000799968,
"grad_norm": 0.4135693609714508,
"learning_rate": 0.001,
"loss": 4.2468,
"step": 12500
},
{
"epoch": 0.5039798408063677,
"grad_norm": 0.5517194271087646,
"learning_rate": 0.001,
"loss": 4.2432,
"step": 12600
},
{
"epoch": 0.5079796808127675,
"grad_norm": 0.4932815134525299,
"learning_rate": 0.001,
"loss": 4.247,
"step": 12700
},
{
"epoch": 0.5119795208191672,
"grad_norm": 0.4821571111679077,
"learning_rate": 0.001,
"loss": 4.2412,
"step": 12800
},
{
"epoch": 0.515979360825567,
"grad_norm": 0.49198025465011597,
"learning_rate": 0.001,
"loss": 4.2415,
"step": 12900
},
{
"epoch": 0.5199792008319667,
"grad_norm": 0.47645890712738037,
"learning_rate": 0.001,
"loss": 4.238,
"step": 13000
},
{
"epoch": 0.5239790408383664,
"grad_norm": 0.4690765142440796,
"learning_rate": 0.001,
"loss": 4.2408,
"step": 13100
},
{
"epoch": 0.5279788808447662,
"grad_norm": 0.4654984474182129,
"learning_rate": 0.001,
"loss": 4.2396,
"step": 13200
},
{
"epoch": 0.5319787208511659,
"grad_norm": 0.542238712310791,
"learning_rate": 0.001,
"loss": 4.2381,
"step": 13300
},
{
"epoch": 0.5359785608575657,
"grad_norm": 0.4381965100765228,
"learning_rate": 0.001,
"loss": 4.2369,
"step": 13400
},
{
"epoch": 0.5399784008639654,
"grad_norm": 0.429868221282959,
"learning_rate": 0.001,
"loss": 4.2429,
"step": 13500
},
{
"epoch": 0.5439782408703652,
"grad_norm": 0.4983363151550293,
"learning_rate": 0.001,
"loss": 4.2428,
"step": 13600
},
{
"epoch": 0.5479780808767649,
"grad_norm": 0.4784950017929077,
"learning_rate": 0.001,
"loss": 4.2388,
"step": 13700
},
{
"epoch": 0.5519779208831647,
"grad_norm": 0.5002242922782898,
"learning_rate": 0.001,
"loss": 4.2359,
"step": 13800
},
{
"epoch": 0.5559777608895644,
"grad_norm": 0.51786869764328,
"learning_rate": 0.001,
"loss": 4.2385,
"step": 13900
},
{
"epoch": 0.5599776008959642,
"grad_norm": 0.4682203233242035,
"learning_rate": 0.001,
"loss": 4.2385,
"step": 14000
},
{
"epoch": 0.5639774409023639,
"grad_norm": 0.4645497500896454,
"learning_rate": 0.001,
"loss": 4.2353,
"step": 14100
},
{
"epoch": 0.5679772809087636,
"grad_norm": 0.5080273151397705,
"learning_rate": 0.001,
"loss": 4.2344,
"step": 14200
},
{
"epoch": 0.5719771209151634,
"grad_norm": 0.4649428129196167,
"learning_rate": 0.001,
"loss": 4.2378,
"step": 14300
},
{
"epoch": 0.5759769609215631,
"grad_norm": 0.47332581877708435,
"learning_rate": 0.001,
"loss": 4.2347,
"step": 14400
},
{
"epoch": 0.5799768009279629,
"grad_norm": 0.48201680183410645,
"learning_rate": 0.001,
"loss": 4.2354,
"step": 14500
},
{
"epoch": 0.5839766409343626,
"grad_norm": 0.6143534779548645,
"learning_rate": 0.001,
"loss": 4.2376,
"step": 14600
},
{
"epoch": 0.5879764809407624,
"grad_norm": 0.5474959015846252,
"learning_rate": 0.001,
"loss": 4.2318,
"step": 14700
},
{
"epoch": 0.5919763209471621,
"grad_norm": 0.4949159026145935,
"learning_rate": 0.001,
"loss": 4.2316,
"step": 14800
},
{
"epoch": 0.5959761609535619,
"grad_norm": 0.5090238451957703,
"learning_rate": 0.001,
"loss": 4.235,
"step": 14900
},
{
"epoch": 0.5999760009599616,
"grad_norm": 0.5531513094902039,
"learning_rate": 0.001,
"loss": 4.2344,
"step": 15000
},
{
"epoch": 0.6039758409663614,
"grad_norm": 0.48956552147865295,
"learning_rate": 0.001,
"loss": 4.2324,
"step": 15100
},
{
"epoch": 0.6079756809727611,
"grad_norm": 0.4767073690891266,
"learning_rate": 0.001,
"loss": 4.2344,
"step": 15200
},
{
"epoch": 0.6119755209791609,
"grad_norm": 0.4603271782398224,
"learning_rate": 0.001,
"loss": 4.2335,
"step": 15300
},
{
"epoch": 0.6159753609855606,
"grad_norm": 0.46433115005493164,
"learning_rate": 0.001,
"loss": 4.2292,
"step": 15400
},
{
"epoch": 0.6199752009919604,
"grad_norm": 0.493105411529541,
"learning_rate": 0.001,
"loss": 4.2317,
"step": 15500
},
{
"epoch": 0.6239750409983601,
"grad_norm": 0.47192350029945374,
"learning_rate": 0.001,
"loss": 4.2328,
"step": 15600
},
{
"epoch": 0.6279748810047598,
"grad_norm": 0.4708082675933838,
"learning_rate": 0.001,
"loss": 4.231,
"step": 15700
},
{
"epoch": 0.6319747210111596,
"grad_norm": 0.45078226923942566,
"learning_rate": 0.001,
"loss": 4.2271,
"step": 15800
},
{
"epoch": 0.6359745610175593,
"grad_norm": 0.4881497919559479,
"learning_rate": 0.001,
"loss": 4.2284,
"step": 15900
},
{
"epoch": 0.6399744010239591,
"grad_norm": 0.5195273160934448,
"learning_rate": 0.001,
"loss": 4.234,
"step": 16000
},
{
"epoch": 0.6439742410303588,
"grad_norm": 0.4363176226615906,
"learning_rate": 0.001,
"loss": 4.2278,
"step": 16100
},
{
"epoch": 0.6479740810367586,
"grad_norm": 0.5371832251548767,
"learning_rate": 0.001,
"loss": 4.2242,
"step": 16200
},
{
"epoch": 0.6519739210431583,
"grad_norm": 0.47699272632598877,
"learning_rate": 0.001,
"loss": 4.2268,
"step": 16300
},
{
"epoch": 0.6559737610495581,
"grad_norm": 0.5306685566902161,
"learning_rate": 0.001,
"loss": 4.2284,
"step": 16400
},
{
"epoch": 0.6599736010559578,
"grad_norm": 0.41332292556762695,
"learning_rate": 0.001,
"loss": 4.2287,
"step": 16500
},
{
"epoch": 0.6639734410623576,
"grad_norm": 0.4745205044746399,
"learning_rate": 0.001,
"loss": 4.2259,
"step": 16600
},
{
"epoch": 0.6679732810687572,
"grad_norm": 0.4349898397922516,
"learning_rate": 0.001,
"loss": 4.2239,
"step": 16700
},
{
"epoch": 0.6719731210751569,
"grad_norm": 0.4942924380302429,
"learning_rate": 0.001,
"loss": 4.2223,
"step": 16800
},
{
"epoch": 0.6759729610815567,
"grad_norm": 0.5019668340682983,
"learning_rate": 0.001,
"loss": 4.2233,
"step": 16900
},
{
"epoch": 0.6799728010879564,
"grad_norm": 0.41412749886512756,
"learning_rate": 0.001,
"loss": 4.2237,
"step": 17000
},
{
"epoch": 0.6839726410943562,
"grad_norm": 0.47330179810523987,
"learning_rate": 0.001,
"loss": 4.2241,
"step": 17100
},
{
"epoch": 0.6879724811007559,
"grad_norm": 0.45668381452560425,
"learning_rate": 0.001,
"loss": 4.222,
"step": 17200
},
{
"epoch": 0.6919723211071557,
"grad_norm": 0.437050461769104,
"learning_rate": 0.001,
"loss": 4.2226,
"step": 17300
},
{
"epoch": 0.6959721611135554,
"grad_norm": 0.4829418659210205,
"learning_rate": 0.001,
"loss": 4.2252,
"step": 17400
},
{
"epoch": 0.6999720011199552,
"grad_norm": 0.47243532538414,
"learning_rate": 0.001,
"loss": 4.2167,
"step": 17500
},
{
"epoch": 0.7039718411263549,
"grad_norm": 0.4851880669593811,
"learning_rate": 0.001,
"loss": 4.2203,
"step": 17600
},
{
"epoch": 0.7079716811327547,
"grad_norm": 0.5821521282196045,
"learning_rate": 0.001,
"loss": 4.2222,
"step": 17700
},
{
"epoch": 0.7119715211391544,
"grad_norm": 0.4213780462741852,
"learning_rate": 0.001,
"loss": 4.2224,
"step": 17800
},
{
"epoch": 0.7159713611455542,
"grad_norm": 0.4478498101234436,
"learning_rate": 0.001,
"loss": 4.2176,
"step": 17900
},
{
"epoch": 0.7199712011519539,
"grad_norm": 0.41999441385269165,
"learning_rate": 0.001,
"loss": 4.2183,
"step": 18000
},
{
"epoch": 0.7239710411583536,
"grad_norm": 0.4531497657299042,
"learning_rate": 0.001,
"loss": 4.2201,
"step": 18100
},
{
"epoch": 0.7279708811647534,
"grad_norm": 0.5436483025550842,
"learning_rate": 0.001,
"loss": 4.2181,
"step": 18200
},
{
"epoch": 0.7319707211711531,
"grad_norm": 0.5103757381439209,
"learning_rate": 0.001,
"loss": 4.2195,
"step": 18300
},
{
"epoch": 0.7359705611775529,
"grad_norm": 0.5203491449356079,
"learning_rate": 0.001,
"loss": 4.2197,
"step": 18400
},
{
"epoch": 0.7399704011839526,
"grad_norm": 0.5450323224067688,
"learning_rate": 0.001,
"loss": 4.2175,
"step": 18500
},
{
"epoch": 0.7439702411903524,
"grad_norm": 0.4617632031440735,
"learning_rate": 0.001,
"loss": 4.2136,
"step": 18600
},
{
"epoch": 0.7479700811967521,
"grad_norm": 0.477172315120697,
"learning_rate": 0.001,
"loss": 4.2166,
"step": 18700
},
{
"epoch": 0.7519699212031519,
"grad_norm": 0.538207471370697,
"learning_rate": 0.001,
"loss": 4.2189,
"step": 18800
},
{
"epoch": 0.7559697612095516,
"grad_norm": 0.39729467034339905,
"learning_rate": 0.001,
"loss": 4.2162,
"step": 18900
},
{
"epoch": 0.7599696012159514,
"grad_norm": 0.4556116759777069,
"learning_rate": 0.001,
"loss": 4.2177,
"step": 19000
},
{
"epoch": 0.7639694412223511,
"grad_norm": 0.48764076828956604,
"learning_rate": 0.001,
"loss": 4.2178,
"step": 19100
},
{
"epoch": 0.7679692812287509,
"grad_norm": 0.5256556272506714,
"learning_rate": 0.001,
"loss": 4.2147,
"step": 19200
},
{
"epoch": 0.7719691212351506,
"grad_norm": 0.48659247159957886,
"learning_rate": 0.001,
"loss": 4.2167,
"step": 19300
},
{
"epoch": 0.7759689612415503,
"grad_norm": 0.4753814935684204,
"learning_rate": 0.001,
"loss": 4.2143,
"step": 19400
},
{
"epoch": 0.7799688012479501,
"grad_norm": 0.47923025488853455,
"learning_rate": 0.001,
"loss": 4.2154,
"step": 19500
},
{
"epoch": 0.7839686412543498,
"grad_norm": 0.5025440454483032,
"learning_rate": 0.001,
"loss": 4.2147,
"step": 19600
},
{
"epoch": 0.7879684812607496,
"grad_norm": 0.5111387968063354,
"learning_rate": 0.001,
"loss": 4.2162,
"step": 19700
},
{
"epoch": 0.7919683212671493,
"grad_norm": 0.5092292428016663,
"learning_rate": 0.001,
"loss": 4.2147,
"step": 19800
},
{
"epoch": 0.7959681612735491,
"grad_norm": 0.4506489634513855,
"learning_rate": 0.001,
"loss": 4.2158,
"step": 19900
},
{
"epoch": 0.7999680012799488,
"grad_norm": 0.43973225355148315,
"learning_rate": 0.001,
"loss": 4.2122,
"step": 20000
},
{
"epoch": 0.8039678412863486,
"grad_norm": 0.46984151005744934,
"learning_rate": 0.001,
"loss": 4.2125,
"step": 20100
},
{
"epoch": 0.8079676812927483,
"grad_norm": 0.4673251509666443,
"learning_rate": 0.001,
"loss": 4.2122,
"step": 20200
},
{
"epoch": 0.8119675212991481,
"grad_norm": 0.5964290499687195,
"learning_rate": 0.001,
"loss": 4.2097,
"step": 20300
},
{
"epoch": 0.8159673613055478,
"grad_norm": 0.4411192536354065,
"learning_rate": 0.001,
"loss": 4.208,
"step": 20400
},
{
"epoch": 0.8199672013119476,
"grad_norm": 0.4942370355129242,
"learning_rate": 0.001,
"loss": 4.2119,
"step": 20500
},
{
"epoch": 0.8239670413183473,
"grad_norm": 0.4818095862865448,
"learning_rate": 0.001,
"loss": 4.2103,
"step": 20600
},
{
"epoch": 0.827966881324747,
"grad_norm": 0.43105003237724304,
"learning_rate": 0.001,
"loss": 4.2108,
"step": 20700
},
{
"epoch": 0.8319667213311468,
"grad_norm": 0.447951078414917,
"learning_rate": 0.001,
"loss": 4.2137,
"step": 20800
},
{
"epoch": 0.8359665613375465,
"grad_norm": 0.4796685576438904,
"learning_rate": 0.001,
"loss": 4.212,
"step": 20900
},
{
"epoch": 0.8399664013439463,
"grad_norm": 0.443522572517395,
"learning_rate": 0.001,
"loss": 4.2076,
"step": 21000
},
{
"epoch": 0.843966241350346,
"grad_norm": 0.4776618778705597,
"learning_rate": 0.001,
"loss": 4.2128,
"step": 21100
},
{
"epoch": 0.8479660813567458,
"grad_norm": 0.45614078640937805,
"learning_rate": 0.001,
"loss": 4.2061,
"step": 21200
},
{
"epoch": 0.8519659213631455,
"grad_norm": 0.574044942855835,
"learning_rate": 0.001,
"loss": 4.2088,
"step": 21300
},
{
"epoch": 0.8559657613695452,
"grad_norm": 0.6252190470695496,
"learning_rate": 0.001,
"loss": 4.2092,
"step": 21400
},
{
"epoch": 0.8599656013759449,
"grad_norm": 0.4226459860801697,
"learning_rate": 0.001,
"loss": 4.2075,
"step": 21500
},
{
"epoch": 0.8639654413823447,
"grad_norm": 0.45728734135627747,
"learning_rate": 0.001,
"loss": 4.2086,
"step": 21600
},
{
"epoch": 0.8679652813887444,
"grad_norm": 0.486545592546463,
"learning_rate": 0.001,
"loss": 4.2108,
"step": 21700
},
{
"epoch": 0.8719651213951441,
"grad_norm": 0.45706841349601746,
"learning_rate": 0.001,
"loss": 4.2101,
"step": 21800
},
{
"epoch": 0.8759649614015439,
"grad_norm": 0.48993363976478577,
"learning_rate": 0.001,
"loss": 4.2092,
"step": 21900
},
{
"epoch": 0.8799648014079436,
"grad_norm": 0.42304420471191406,
"learning_rate": 0.001,
"loss": 4.213,
"step": 22000
},
{
"epoch": 0.8839646414143434,
"grad_norm": 0.4448954463005066,
"learning_rate": 0.001,
"loss": 4.2066,
"step": 22100
},
{
"epoch": 0.8879644814207431,
"grad_norm": 0.4916422367095947,
"learning_rate": 0.001,
"loss": 4.2028,
"step": 22200
},
{
"epoch": 0.8919643214271429,
"grad_norm": 0.44371068477630615,
"learning_rate": 0.001,
"loss": 4.2084,
"step": 22300
},
{
"epoch": 0.8959641614335426,
"grad_norm": 0.42282700538635254,
"learning_rate": 0.001,
"loss": 4.204,
"step": 22400
},
{
"epoch": 0.8999640014399424,
"grad_norm": 0.40497201681137085,
"learning_rate": 0.001,
"loss": 4.2057,
"step": 22500
},
{
"epoch": 0.9039638414463421,
"grad_norm": 0.4718570113182068,
"learning_rate": 0.001,
"loss": 4.2096,
"step": 22600
},
{
"epoch": 0.9079636814527419,
"grad_norm": 0.4425433576107025,
"learning_rate": 0.001,
"loss": 4.2071,
"step": 22700
},
{
"epoch": 0.9119635214591416,
"grad_norm": 0.5133687853813171,
"learning_rate": 0.001,
"loss": 4.2074,
"step": 22800
},
{
"epoch": 0.9159633614655414,
"grad_norm": 0.5017001032829285,
"learning_rate": 0.001,
"loss": 4.2065,
"step": 22900
},
{
"epoch": 0.9199632014719411,
"grad_norm": 0.47720518708229065,
"learning_rate": 0.001,
"loss": 4.2068,
"step": 23000
},
{
"epoch": 0.9239630414783409,
"grad_norm": 0.5085333585739136,
"learning_rate": 0.001,
"loss": 4.2069,
"step": 23100
},
{
"epoch": 0.9279628814847406,
"grad_norm": 0.451284259557724,
"learning_rate": 0.001,
"loss": 4.2052,
"step": 23200
},
{
"epoch": 0.9319627214911403,
"grad_norm": 0.4576238691806793,
"learning_rate": 0.001,
"loss": 4.205,
"step": 23300
},
{
"epoch": 0.9359625614975401,
"grad_norm": 0.45982882380485535,
"learning_rate": 0.001,
"loss": 4.2013,
"step": 23400
},
{
"epoch": 0.9399624015039398,
"grad_norm": 0.5430781841278076,
"learning_rate": 0.001,
"loss": 4.2077,
"step": 23500
},
{
"epoch": 0.9439622415103396,
"grad_norm": 0.49019157886505127,
"learning_rate": 0.001,
"loss": 4.2052,
"step": 23600
},
{
"epoch": 0.9479620815167393,
"grad_norm": 0.4381950795650482,
"learning_rate": 0.001,
"loss": 4.2023,
"step": 23700
},
{
"epoch": 0.9519619215231391,
"grad_norm": 0.5325062870979309,
"learning_rate": 0.001,
"loss": 4.2044,
"step": 23800
},
{
"epoch": 0.9559617615295388,
"grad_norm": 0.4855589270591736,
"learning_rate": 0.001,
"loss": 4.205,
"step": 23900
},
{
"epoch": 0.9599616015359386,
"grad_norm": 0.4132635295391083,
"learning_rate": 0.001,
"loss": 4.2018,
"step": 24000
},
{
"epoch": 0.9639614415423383,
"grad_norm": 0.4958603084087372,
"learning_rate": 0.001,
"loss": 4.2027,
"step": 24100
},
{
"epoch": 0.9679612815487381,
"grad_norm": 0.44566038250923157,
"learning_rate": 0.001,
"loss": 4.2043,
"step": 24200
},
{
"epoch": 0.9719611215551378,
"grad_norm": 0.4078667163848877,
"learning_rate": 0.001,
"loss": 4.2044,
"step": 24300
},
{
"epoch": 0.9759609615615376,
"grad_norm": 0.48166027665138245,
"learning_rate": 0.001,
"loss": 4.2022,
"step": 24400
},
{
"epoch": 0.9799608015679373,
"grad_norm": 0.472896933555603,
"learning_rate": 0.001,
"loss": 4.2078,
"step": 24500
},
{
"epoch": 0.983960641574337,
"grad_norm": 0.4770311117172241,
"learning_rate": 0.001,
"loss": 4.1982,
"step": 24600
},
{
"epoch": 0.9879604815807368,
"grad_norm": 0.4926893413066864,
"learning_rate": 0.001,
"loss": 4.2014,
"step": 24700
},
{
"epoch": 0.9919603215871365,
"grad_norm": 0.4080910086631775,
"learning_rate": 0.001,
"loss": 4.1947,
"step": 24800
},
{
"epoch": 0.9959601615935363,
"grad_norm": 0.5428063273429871,
"learning_rate": 0.001,
"loss": 4.1999,
"step": 24900
},
{
"epoch": 0.999960001599936,
"grad_norm": 0.4776434600353241,
"learning_rate": 0.001,
"loss": 4.2004,
"step": 25000
},
{
"epoch": 1.0039598416063358,
"grad_norm": 0.5323604941368103,
"learning_rate": 0.001,
"loss": 4.1961,
"step": 25100
},
{
"epoch": 1.0079596816127354,
"grad_norm": 0.5068919062614441,
"learning_rate": 0.001,
"loss": 4.1951,
"step": 25200
},
{
"epoch": 1.0119595216191353,
"grad_norm": 0.439967542886734,
"learning_rate": 0.001,
"loss": 4.2012,
"step": 25300
},
{
"epoch": 1.015959361625535,
"grad_norm": 0.5373870730400085,
"learning_rate": 0.001,
"loss": 4.1967,
"step": 25400
},
{
"epoch": 1.0199592016319348,
"grad_norm": 0.45972001552581787,
"learning_rate": 0.001,
"loss": 4.198,
"step": 25500
},
{
"epoch": 1.0239590416383344,
"grad_norm": 0.45675376057624817,
"learning_rate": 0.001,
"loss": 4.1974,
"step": 25600
},
{
"epoch": 1.0279588816447343,
"grad_norm": 0.5330101847648621,
"learning_rate": 0.001,
"loss": 4.1964,
"step": 25700
},
{
"epoch": 1.031958721651134,
"grad_norm": 0.557739794254303,
"learning_rate": 0.001,
"loss": 4.196,
"step": 25800
},
{
"epoch": 1.0359585616575337,
"grad_norm": 0.4591217339038849,
"learning_rate": 0.001,
"loss": 4.1974,
"step": 25900
},
{
"epoch": 1.0399584016639334,
"grad_norm": 0.43261614441871643,
"learning_rate": 0.001,
"loss": 4.1981,
"step": 26000
},
{
"epoch": 1.0439582416703332,
"grad_norm": 0.4880464971065521,
"learning_rate": 0.001,
"loss": 4.1994,
"step": 26100
},
{
"epoch": 1.0479580816767329,
"grad_norm": 0.48199212551116943,
"learning_rate": 0.001,
"loss": 4.1936,
"step": 26200
},
{
"epoch": 1.0519579216831327,
"grad_norm": 0.5580593943595886,
"learning_rate": 0.001,
"loss": 4.1972,
"step": 26300
},
{
"epoch": 1.0559577616895324,
"grad_norm": 0.44014519453048706,
"learning_rate": 0.001,
"loss": 4.1955,
"step": 26400
},
{
"epoch": 1.0599576016959322,
"grad_norm": 0.5002579092979431,
"learning_rate": 0.001,
"loss": 4.2022,
"step": 26500
},
{
"epoch": 1.0639574417023319,
"grad_norm": 0.4530857503414154,
"learning_rate": 0.001,
"loss": 4.1945,
"step": 26600
},
{
"epoch": 1.0679572817087317,
"grad_norm": 0.4876604676246643,
"learning_rate": 0.001,
"loss": 4.1971,
"step": 26700
},
{
"epoch": 1.0719571217151314,
"grad_norm": 0.4442366063594818,
"learning_rate": 0.001,
"loss": 4.1941,
"step": 26800
},
{
"epoch": 1.0759569617215312,
"grad_norm": 0.42312711477279663,
"learning_rate": 0.001,
"loss": 4.1976,
"step": 26900
},
{
"epoch": 1.0799568017279308,
"grad_norm": 0.49312129616737366,
"learning_rate": 0.001,
"loss": 4.1946,
"step": 27000
},
{
"epoch": 1.0839566417343307,
"grad_norm": 0.4688827693462372,
"learning_rate": 0.001,
"loss": 4.1944,
"step": 27100
},
{
"epoch": 1.0879564817407303,
"grad_norm": 0.48417580127716064,
"learning_rate": 0.001,
"loss": 4.1975,
"step": 27200
},
{
"epoch": 1.0919563217471302,
"grad_norm": 0.4930320382118225,
"learning_rate": 0.001,
"loss": 4.1957,
"step": 27300
},
{
"epoch": 1.0959561617535298,
"grad_norm": 0.5079306364059448,
"learning_rate": 0.001,
"loss": 4.1978,
"step": 27400
},
{
"epoch": 1.0999560017599297,
"grad_norm": 0.5758777856826782,
"learning_rate": 0.001,
"loss": 4.1953,
"step": 27500
},
{
"epoch": 1.1039558417663293,
"grad_norm": 0.49672508239746094,
"learning_rate": 0.001,
"loss": 4.1972,
"step": 27600
},
{
"epoch": 1.1079556817727292,
"grad_norm": 0.4356079399585724,
"learning_rate": 0.001,
"loss": 4.1941,
"step": 27700
},
{
"epoch": 1.1119555217791288,
"grad_norm": 0.44307178258895874,
"learning_rate": 0.001,
"loss": 4.1954,
"step": 27800
},
{
"epoch": 1.1159553617855287,
"grad_norm": 0.5404129028320312,
"learning_rate": 0.001,
"loss": 4.1914,
"step": 27900
},
{
"epoch": 1.1199552017919283,
"grad_norm": 0.47977906465530396,
"learning_rate": 0.001,
"loss": 4.1924,
"step": 28000
},
{
"epoch": 1.1239550417983282,
"grad_norm": 0.4677433371543884,
"learning_rate": 0.001,
"loss": 4.1941,
"step": 28100
},
{
"epoch": 1.1279548818047278,
"grad_norm": 0.6071330904960632,
"learning_rate": 0.001,
"loss": 4.1906,
"step": 28200
},
{
"epoch": 1.1319547218111277,
"grad_norm": 0.48553600907325745,
"learning_rate": 0.001,
"loss": 4.1961,
"step": 28300
},
{
"epoch": 1.1359545618175273,
"grad_norm": 0.4587904214859009,
"learning_rate": 0.001,
"loss": 4.1948,
"step": 28400
},
{
"epoch": 1.139954401823927,
"grad_norm": 0.4619959890842438,
"learning_rate": 0.001,
"loss": 4.1905,
"step": 28500
},
{
"epoch": 1.1439542418303268,
"grad_norm": 0.5305209755897522,
"learning_rate": 0.001,
"loss": 4.1918,
"step": 28600
},
{
"epoch": 1.1479540818367266,
"grad_norm": 0.46056920289993286,
"learning_rate": 0.001,
"loss": 4.1918,
"step": 28700
},
{
"epoch": 1.1519539218431263,
"grad_norm": 0.48591580986976624,
"learning_rate": 0.001,
"loss": 4.1965,
"step": 28800
},
{
"epoch": 1.155953761849526,
"grad_norm": 0.5184019804000854,
"learning_rate": 0.001,
"loss": 4.1925,
"step": 28900
},
{
"epoch": 1.1599536018559258,
"grad_norm": 0.44365832209587097,
"learning_rate": 0.001,
"loss": 4.1933,
"step": 29000
},
{
"epoch": 1.1639534418623254,
"grad_norm": 0.5565987825393677,
"learning_rate": 0.001,
"loss": 4.187,
"step": 29100
},
{
"epoch": 1.1679532818687253,
"grad_norm": 0.4826023280620575,
"learning_rate": 0.001,
"loss": 4.1955,
"step": 29200
},
{
"epoch": 1.171953121875125,
"grad_norm": 0.5205375552177429,
"learning_rate": 0.001,
"loss": 4.1923,
"step": 29300
},
{
"epoch": 1.1759529618815248,
"grad_norm": 0.5183901190757751,
"learning_rate": 0.001,
"loss": 4.1975,
"step": 29400
},
{
"epoch": 1.1799528018879244,
"grad_norm": 0.49648305773735046,
"learning_rate": 0.001,
"loss": 4.193,
"step": 29500
},
{
"epoch": 1.1839526418943243,
"grad_norm": 0.4555068612098694,
"learning_rate": 0.001,
"loss": 4.1908,
"step": 29600
},
{
"epoch": 1.1879524819007239,
"grad_norm": 0.48755526542663574,
"learning_rate": 0.001,
"loss": 4.1925,
"step": 29700
},
{
"epoch": 1.1919523219071237,
"grad_norm": 0.4887760281562805,
"learning_rate": 0.001,
"loss": 4.1881,
"step": 29800
},
{
"epoch": 1.1959521619135234,
"grad_norm": 0.5118262767791748,
"learning_rate": 0.001,
"loss": 4.1877,
"step": 29900
},
{
"epoch": 1.1999520019199232,
"grad_norm": 0.45270290970802307,
"learning_rate": 0.001,
"loss": 4.1933,
"step": 30000
},
{
"epoch": 1.2039518419263229,
"grad_norm": 0.5188767910003662,
"learning_rate": 0.001,
"loss": 4.1932,
"step": 30100
},
{
"epoch": 1.2079516819327227,
"grad_norm": 0.53879714012146,
"learning_rate": 0.001,
"loss": 4.1869,
"step": 30200
},
{
"epoch": 1.2119515219391224,
"grad_norm": 0.5128753185272217,
"learning_rate": 0.001,
"loss": 4.1901,
"step": 30300
},
{
"epoch": 1.2159513619455222,
"grad_norm": 0.3823694586753845,
"learning_rate": 0.001,
"loss": 4.1905,
"step": 30400
},
{
"epoch": 1.2199512019519219,
"grad_norm": 0.4704561233520508,
"learning_rate": 0.001,
"loss": 4.1856,
"step": 30500
},
{
"epoch": 1.2239510419583217,
"grad_norm": 0.4269457459449768,
"learning_rate": 0.001,
"loss": 4.1918,
"step": 30600
},
{
"epoch": 1.2279508819647214,
"grad_norm": 0.44246116280555725,
"learning_rate": 0.001,
"loss": 4.1915,
"step": 30700
},
{
"epoch": 1.2319507219711212,
"grad_norm": 0.45588257908821106,
"learning_rate": 0.001,
"loss": 4.1887,
"step": 30800
},
{
"epoch": 1.2359505619775208,
"grad_norm": 0.5354055166244507,
"learning_rate": 0.001,
"loss": 4.1916,
"step": 30900
},
{
"epoch": 1.2399504019839207,
"grad_norm": 0.48199784755706787,
"learning_rate": 0.001,
"loss": 4.1908,
"step": 31000
},
{
"epoch": 1.2439502419903203,
"grad_norm": 0.48949673771858215,
"learning_rate": 0.001,
"loss": 4.1891,
"step": 31100
},
{
"epoch": 1.2479500819967202,
"grad_norm": 0.49601200222969055,
"learning_rate": 0.001,
"loss": 4.1873,
"step": 31200
},
{
"epoch": 1.2519499220031198,
"grad_norm": 0.4721723198890686,
"learning_rate": 0.001,
"loss": 4.1879,
"step": 31300
},
{
"epoch": 1.2559497620095197,
"grad_norm": 0.44374367594718933,
"learning_rate": 0.001,
"loss": 4.1917,
"step": 31400
},
{
"epoch": 1.2599496020159193,
"grad_norm": 0.48409733176231384,
"learning_rate": 0.001,
"loss": 4.1839,
"step": 31500
},
{
"epoch": 1.2639494420223192,
"grad_norm": 0.4843854010105133,
"learning_rate": 0.001,
"loss": 4.1898,
"step": 31600
},
{
"epoch": 1.2679492820287188,
"grad_norm": 0.45039990544319153,
"learning_rate": 0.001,
"loss": 4.1891,
"step": 31700
},
{
"epoch": 1.2719491220351187,
"grad_norm": 0.3904966413974762,
"learning_rate": 0.001,
"loss": 4.1872,
"step": 31800
},
{
"epoch": 1.2759489620415183,
"grad_norm": 0.4539620876312256,
"learning_rate": 0.001,
"loss": 4.1836,
"step": 31900
},
{
"epoch": 1.2799488020479182,
"grad_norm": 0.46595314145088196,
"learning_rate": 0.001,
"loss": 4.19,
"step": 32000
},
{
"epoch": 1.2839486420543178,
"grad_norm": 0.4878152012825012,
"learning_rate": 0.001,
"loss": 4.1848,
"step": 32100
},
{
"epoch": 1.2879484820607177,
"grad_norm": 0.5768002271652222,
"learning_rate": 0.001,
"loss": 4.1912,
"step": 32200
},
{
"epoch": 1.2919483220671173,
"grad_norm": 0.43661263585090637,
"learning_rate": 0.001,
"loss": 4.1894,
"step": 32300
},
{
"epoch": 1.295948162073517,
"grad_norm": 0.4612700939178467,
"learning_rate": 0.001,
"loss": 4.1884,
"step": 32400
},
{
"epoch": 1.2999480020799168,
"grad_norm": 0.4451783001422882,
"learning_rate": 0.001,
"loss": 4.1886,
"step": 32500
},
{
"epoch": 1.3039478420863166,
"grad_norm": 0.5825871825218201,
"learning_rate": 0.001,
"loss": 4.191,
"step": 32600
},
{
"epoch": 1.3079476820927163,
"grad_norm": 0.5060557126998901,
"learning_rate": 0.001,
"loss": 4.1858,
"step": 32700
},
{
"epoch": 1.311947522099116,
"grad_norm": 0.49111461639404297,
"learning_rate": 0.001,
"loss": 4.1861,
"step": 32800
},
{
"epoch": 1.3159473621055158,
"grad_norm": 0.511899471282959,
"learning_rate": 0.001,
"loss": 4.1901,
"step": 32900
},
{
"epoch": 1.3199472021119156,
"grad_norm": 0.5053913593292236,
"learning_rate": 0.001,
"loss": 4.1885,
"step": 33000
},
{
"epoch": 1.3239470421183153,
"grad_norm": 0.55963534116745,
"learning_rate": 0.001,
"loss": 4.1868,
"step": 33100
},
{
"epoch": 1.327946882124715,
"grad_norm": 0.5135225653648376,
"learning_rate": 0.001,
"loss": 4.19,
"step": 33200
},
{
"epoch": 1.3319467221311148,
"grad_norm": 0.5401255488395691,
"learning_rate": 0.001,
"loss": 4.1892,
"step": 33300
},
{
"epoch": 1.3359465621375146,
"grad_norm": 0.5370189547538757,
"learning_rate": 0.001,
"loss": 4.1887,
"step": 33400
},
{
"epoch": 1.3399464021439143,
"grad_norm": 0.452307790517807,
"learning_rate": 0.001,
"loss": 4.1837,
"step": 33500
},
{
"epoch": 1.3439462421503139,
"grad_norm": 0.4923325777053833,
"learning_rate": 0.001,
"loss": 4.1876,
"step": 33600
},
{
"epoch": 1.3479460821567137,
"grad_norm": 0.4178541600704193,
"learning_rate": 0.001,
"loss": 4.1859,
"step": 33700
},
{
"epoch": 1.3519459221631136,
"grad_norm": 0.43804001808166504,
"learning_rate": 0.001,
"loss": 4.1859,
"step": 33800
},
{
"epoch": 1.3559457621695132,
"grad_norm": 0.4893229901790619,
"learning_rate": 0.001,
"loss": 4.185,
"step": 33900
},
{
"epoch": 1.3599456021759129,
"grad_norm": 0.43529701232910156,
"learning_rate": 0.001,
"loss": 4.1807,
"step": 34000
},
{
"epoch": 1.3639454421823127,
"grad_norm": 0.4353291094303131,
"learning_rate": 0.001,
"loss": 4.182,
"step": 34100
},
{
"epoch": 1.3679452821887126,
"grad_norm": 0.4755658507347107,
"learning_rate": 0.001,
"loss": 4.1849,
"step": 34200
},
{
"epoch": 1.3719451221951122,
"grad_norm": 0.5512502193450928,
"learning_rate": 0.001,
"loss": 4.1857,
"step": 34300
},
{
"epoch": 1.3759449622015119,
"grad_norm": 0.4462525546550751,
"learning_rate": 0.001,
"loss": 4.184,
"step": 34400
},
{
"epoch": 1.3799448022079117,
"grad_norm": 0.5683126449584961,
"learning_rate": 0.001,
"loss": 4.1849,
"step": 34500
},
{
"epoch": 1.3839446422143113,
"grad_norm": 0.4847952723503113,
"learning_rate": 0.001,
"loss": 4.1809,
"step": 34600
},
{
"epoch": 1.3879444822207112,
"grad_norm": 0.5147800445556641,
"learning_rate": 0.001,
"loss": 4.182,
"step": 34700
},
{
"epoch": 1.3919443222271108,
"grad_norm": 0.49664029479026794,
"learning_rate": 0.001,
"loss": 4.1826,
"step": 34800
},
{
"epoch": 1.3959441622335107,
"grad_norm": 0.4566904902458191,
"learning_rate": 0.001,
"loss": 4.1855,
"step": 34900
},
{
"epoch": 1.3999440022399103,
"grad_norm": 0.4743303954601288,
"learning_rate": 0.001,
"loss": 4.1854,
"step": 35000
},
{
"epoch": 1.4039438422463102,
"grad_norm": 0.478000670671463,
"learning_rate": 0.001,
"loss": 4.187,
"step": 35100
},
{
"epoch": 1.4079436822527098,
"grad_norm": 0.43782198429107666,
"learning_rate": 0.001,
"loss": 4.1808,
"step": 35200
},
{
"epoch": 1.4119435222591097,
"grad_norm": 0.46672046184539795,
"learning_rate": 0.001,
"loss": 4.1814,
"step": 35300
},
{
"epoch": 1.4159433622655093,
"grad_norm": 0.48321112990379333,
"learning_rate": 0.001,
"loss": 4.1814,
"step": 35400
},
{
"epoch": 1.4199432022719092,
"grad_norm": 0.48716631531715393,
"learning_rate": 0.001,
"loss": 4.1831,
"step": 35500
},
{
"epoch": 1.4239430422783088,
"grad_norm": 0.46520668268203735,
"learning_rate": 0.001,
"loss": 4.1823,
"step": 35600
},
{
"epoch": 1.4279428822847087,
"grad_norm": 0.4660239815711975,
"learning_rate": 0.001,
"loss": 4.1845,
"step": 35700
},
{
"epoch": 1.4319427222911083,
"grad_norm": 0.5418950319290161,
"learning_rate": 0.001,
"loss": 4.1828,
"step": 35800
},
{
"epoch": 1.4359425622975082,
"grad_norm": 0.45179441571235657,
"learning_rate": 0.001,
"loss": 4.1821,
"step": 35900
},
{
"epoch": 1.4399424023039078,
"grad_norm": 0.5119227766990662,
"learning_rate": 0.001,
"loss": 4.183,
"step": 36000
},
{
"epoch": 1.4439422423103077,
"grad_norm": 0.4730793237686157,
"learning_rate": 0.001,
"loss": 4.1813,
"step": 36100
},
{
"epoch": 1.4479420823167073,
"grad_norm": 0.4840889275074005,
"learning_rate": 0.001,
"loss": 4.182,
"step": 36200
},
{
"epoch": 1.4519419223231071,
"grad_norm": 0.4688670039176941,
"learning_rate": 0.001,
"loss": 4.1828,
"step": 36300
},
{
"epoch": 1.4559417623295068,
"grad_norm": 0.4670471251010895,
"learning_rate": 0.001,
"loss": 4.1784,
"step": 36400
},
{
"epoch": 1.4599416023359066,
"grad_norm": 0.4444526731967926,
"learning_rate": 0.001,
"loss": 4.1793,
"step": 36500
},
{
"epoch": 1.4639414423423063,
"grad_norm": 0.43045392632484436,
"learning_rate": 0.001,
"loss": 4.1814,
"step": 36600
},
{
"epoch": 1.4679412823487061,
"grad_norm": 0.491200715303421,
"learning_rate": 0.001,
"loss": 4.1848,
"step": 36700
},
{
"epoch": 1.4719411223551058,
"grad_norm": 0.4605400264263153,
"learning_rate": 0.001,
"loss": 4.1831,
"step": 36800
},
{
"epoch": 1.4759409623615056,
"grad_norm": 0.47693344950675964,
"learning_rate": 0.001,
"loss": 4.1797,
"step": 36900
},
{
"epoch": 1.4799408023679053,
"grad_norm": 0.5456331968307495,
"learning_rate": 0.001,
"loss": 4.1773,
"step": 37000
},
{
"epoch": 1.483940642374305,
"grad_norm": 0.42828118801116943,
"learning_rate": 0.001,
"loss": 4.1794,
"step": 37100
},
{
"epoch": 1.4879404823807048,
"grad_norm": 0.47602272033691406,
"learning_rate": 0.001,
"loss": 4.1821,
"step": 37200
},
{
"epoch": 1.4919403223871046,
"grad_norm": 0.5995456576347351,
"learning_rate": 0.001,
"loss": 4.1799,
"step": 37300
},
{
"epoch": 1.4959401623935042,
"grad_norm": 0.5107753276824951,
"learning_rate": 0.001,
"loss": 4.1838,
"step": 37400
},
{
"epoch": 1.4999400023999039,
"grad_norm": 0.5625353455543518,
"learning_rate": 0.001,
"loss": 4.1823,
"step": 37500
},
{
"epoch": 1.5039398424063037,
"grad_norm": 0.4833304286003113,
"learning_rate": 0.001,
"loss": 4.1825,
"step": 37600
},
{
"epoch": 1.5079396824127036,
"grad_norm": 0.4333184063434601,
"learning_rate": 0.001,
"loss": 4.1797,
"step": 37700
},
{
"epoch": 1.5119395224191032,
"grad_norm": 0.45237982273101807,
"learning_rate": 0.001,
"loss": 4.1804,
"step": 37800
},
{
"epoch": 1.5159393624255029,
"grad_norm": 0.5843102335929871,
"learning_rate": 0.001,
"loss": 4.1771,
"step": 37900
},
{
"epoch": 1.5199392024319027,
"grad_norm": 0.5091027021408081,
"learning_rate": 0.001,
"loss": 4.1825,
"step": 38000
},
{
"epoch": 1.5239390424383026,
"grad_norm": 0.4457857310771942,
"learning_rate": 0.001,
"loss": 4.1784,
"step": 38100
},
{
"epoch": 1.5279388824447022,
"grad_norm": 0.48936015367507935,
"learning_rate": 0.001,
"loss": 4.1798,
"step": 38200
},
{
"epoch": 1.5319387224511019,
"grad_norm": 0.5162155032157898,
"learning_rate": 0.001,
"loss": 4.1824,
"step": 38300
},
{
"epoch": 1.5359385624575017,
"grad_norm": 0.4464411735534668,
"learning_rate": 0.001,
"loss": 4.1808,
"step": 38400
},
{
"epoch": 1.5399384024639016,
"grad_norm": 0.47520169615745544,
"learning_rate": 0.001,
"loss": 4.1844,
"step": 38500
},
{
"epoch": 1.5439382424703012,
"grad_norm": 0.5208662152290344,
"learning_rate": 0.001,
"loss": 4.1771,
"step": 38600
},
{
"epoch": 1.5479380824767008,
"grad_norm": 0.4846671223640442,
"learning_rate": 0.001,
"loss": 4.1782,
"step": 38700
},
{
"epoch": 1.5519379224831007,
"grad_norm": 0.5209333300590515,
"learning_rate": 0.001,
"loss": 4.1786,
"step": 38800
},
{
"epoch": 1.5559377624895006,
"grad_norm": 0.4502977430820465,
"learning_rate": 0.001,
"loss": 4.1803,
"step": 38900
},
{
"epoch": 1.5599376024959002,
"grad_norm": 0.4156093895435333,
"learning_rate": 0.001,
"loss": 4.1785,
"step": 39000
},
{
"epoch": 1.5639374425022998,
"grad_norm": 0.49340036511421204,
"learning_rate": 0.001,
"loss": 4.1802,
"step": 39100
},
{
"epoch": 1.5679372825086997,
"grad_norm": 0.45686131715774536,
"learning_rate": 0.001,
"loss": 4.1772,
"step": 39200
},
{
"epoch": 1.5719371225150995,
"grad_norm": 0.564764142036438,
"learning_rate": 0.001,
"loss": 4.1759,
"step": 39300
},
{
"epoch": 1.5759369625214992,
"grad_norm": 0.5391719341278076,
"learning_rate": 0.001,
"loss": 4.1808,
"step": 39400
},
{
"epoch": 1.5799368025278988,
"grad_norm": 0.5221198797225952,
"learning_rate": 0.001,
"loss": 4.1823,
"step": 39500
},
{
"epoch": 1.5839366425342987,
"grad_norm": 0.4251661002635956,
"learning_rate": 0.001,
"loss": 4.1771,
"step": 39600
},
{
"epoch": 1.5879364825406985,
"grad_norm": 0.382951021194458,
"learning_rate": 0.001,
"loss": 4.1776,
"step": 39700
},
{
"epoch": 1.5919363225470982,
"grad_norm": 0.40156203508377075,
"learning_rate": 0.001,
"loss": 4.1793,
"step": 39800
},
{
"epoch": 1.5959361625534978,
"grad_norm": 0.4980160593986511,
"learning_rate": 0.001,
"loss": 4.179,
"step": 39900
},
{
"epoch": 1.5999360025598977,
"grad_norm": 0.5159147381782532,
"learning_rate": 0.001,
"loss": 4.179,
"step": 40000
},
{
"epoch": 1.6039358425662975,
"grad_norm": 0.4275522828102112,
"learning_rate": 0.001,
"loss": 4.1768,
"step": 40100
},
{
"epoch": 1.6079356825726971,
"grad_norm": 0.4483228027820587,
"learning_rate": 0.001,
"loss": 4.1762,
"step": 40200
},
{
"epoch": 1.6119355225790968,
"grad_norm": 0.5833166241645813,
"learning_rate": 0.001,
"loss": 4.178,
"step": 40300
},
{
"epoch": 1.6159353625854966,
"grad_norm": 0.4804055988788605,
"learning_rate": 0.001,
"loss": 4.1796,
"step": 40400
},
{
"epoch": 1.6199352025918963,
"grad_norm": 0.46036186814308167,
"learning_rate": 0.001,
"loss": 4.1773,
"step": 40500
},
{
"epoch": 1.623935042598296,
"grad_norm": 0.43077051639556885,
"learning_rate": 0.001,
"loss": 4.1768,
"step": 40600
},
{
"epoch": 1.6279348826046958,
"grad_norm": 0.5465964674949646,
"learning_rate": 0.001,
"loss": 4.1765,
"step": 40700
},
{
"epoch": 1.6319347226110956,
"grad_norm": 0.469560444355011,
"learning_rate": 0.001,
"loss": 4.178,
"step": 40800
},
{
"epoch": 1.6359345626174953,
"grad_norm": 0.48708251118659973,
"learning_rate": 0.001,
"loss": 4.1791,
"step": 40900
},
{
"epoch": 1.639934402623895,
"grad_norm": 0.43754613399505615,
"learning_rate": 0.001,
"loss": 4.1763,
"step": 41000
},
{
"epoch": 1.6439342426302948,
"grad_norm": 0.392625629901886,
"learning_rate": 0.001,
"loss": 4.1813,
"step": 41100
},
{
"epoch": 1.6479340826366946,
"grad_norm": 0.46056312322616577,
"learning_rate": 0.001,
"loss": 4.1798,
"step": 41200
},
{
"epoch": 1.6519339226430942,
"grad_norm": 0.4411376118659973,
"learning_rate": 0.001,
"loss": 4.1727,
"step": 41300
},
{
"epoch": 1.6559337626494939,
"grad_norm": 0.5168668031692505,
"learning_rate": 0.001,
"loss": 4.1765,
"step": 41400
},
{
"epoch": 1.6599336026558937,
"grad_norm": 0.4493384063243866,
"learning_rate": 0.001,
"loss": 4.1726,
"step": 41500
},
{
"epoch": 1.6639334426622936,
"grad_norm": 0.502347469329834,
"learning_rate": 0.001,
"loss": 4.1793,
"step": 41600
},
{
"epoch": 1.6679332826686932,
"grad_norm": 0.4458249509334564,
"learning_rate": 0.001,
"loss": 4.1787,
"step": 41700
},
{
"epoch": 1.6719331226750929,
"grad_norm": 0.4660811126232147,
"learning_rate": 0.001,
"loss": 4.1791,
"step": 41800
},
{
"epoch": 1.6759329626814927,
"grad_norm": 0.5625722408294678,
"learning_rate": 0.001,
"loss": 4.1754,
"step": 41900
},
{
"epoch": 1.6799328026878926,
"grad_norm": 0.47896459698677063,
"learning_rate": 0.001,
"loss": 4.1733,
"step": 42000
},
{
"epoch": 1.6839326426942922,
"grad_norm": 0.42776668071746826,
"learning_rate": 0.001,
"loss": 4.1725,
"step": 42100
},
{
"epoch": 1.6879324827006918,
"grad_norm": 0.47714999318122864,
"learning_rate": 0.001,
"loss": 4.1746,
"step": 42200
},
{
"epoch": 1.6919323227070917,
"grad_norm": 0.5495074987411499,
"learning_rate": 0.001,
"loss": 4.1771,
"step": 42300
},
{
"epoch": 1.6959321627134916,
"grad_norm": 0.48492980003356934,
"learning_rate": 0.001,
"loss": 4.1737,
"step": 42400
},
{
"epoch": 1.6999320027198912,
"grad_norm": 0.45363664627075195,
"learning_rate": 0.001,
"loss": 4.1751,
"step": 42500
},
{
"epoch": 1.7039318427262908,
"grad_norm": 0.4112115800380707,
"learning_rate": 0.001,
"loss": 4.1751,
"step": 42600
},
{
"epoch": 1.7079316827326907,
"grad_norm": 0.4674376845359802,
"learning_rate": 0.001,
"loss": 4.1755,
"step": 42700
},
{
"epoch": 1.7119315227390905,
"grad_norm": 0.4602874219417572,
"learning_rate": 0.001,
"loss": 4.1748,
"step": 42800
},
{
"epoch": 1.7159313627454902,
"grad_norm": 0.46376627683639526,
"learning_rate": 0.001,
"loss": 4.1768,
"step": 42900
},
{
"epoch": 1.7199312027518898,
"grad_norm": 0.7872702479362488,
"learning_rate": 0.001,
"loss": 4.1786,
"step": 43000
},
{
"epoch": 1.7239310427582897,
"grad_norm": 0.4959052801132202,
"learning_rate": 0.001,
"loss": 4.1758,
"step": 43100
},
{
"epoch": 1.7279308827646895,
"grad_norm": 0.47499415278434753,
"learning_rate": 0.001,
"loss": 4.1766,
"step": 43200
},
{
"epoch": 1.7319307227710892,
"grad_norm": 0.37570834159851074,
"learning_rate": 0.001,
"loss": 4.1761,
"step": 43300
},
{
"epoch": 1.7359305627774888,
"grad_norm": 0.5071618556976318,
"learning_rate": 0.001,
"loss": 4.1759,
"step": 43400
},
{
"epoch": 1.7399304027838887,
"grad_norm": 0.4444867670536041,
"learning_rate": 0.001,
"loss": 4.1724,
"step": 43500
},
{
"epoch": 1.7439302427902885,
"grad_norm": 0.4530576467514038,
"learning_rate": 0.001,
"loss": 4.1734,
"step": 43600
},
{
"epoch": 1.7479300827966882,
"grad_norm": 0.39011409878730774,
"learning_rate": 0.001,
"loss": 4.1773,
"step": 43700
},
{
"epoch": 1.7519299228030878,
"grad_norm": 0.4495677351951599,
"learning_rate": 0.001,
"loss": 4.175,
"step": 43800
},
{
"epoch": 1.7559297628094876,
"grad_norm": 0.5421786308288574,
"learning_rate": 0.001,
"loss": 4.1754,
"step": 43900
},
{
"epoch": 1.7599296028158875,
"grad_norm": 0.4947051405906677,
"learning_rate": 0.001,
"loss": 4.1744,
"step": 44000
},
{
"epoch": 1.7639294428222871,
"grad_norm": 0.42439621686935425,
"learning_rate": 0.001,
"loss": 4.175,
"step": 44100
},
{
"epoch": 1.7679292828286868,
"grad_norm": 0.4526050090789795,
"learning_rate": 0.001,
"loss": 4.172,
"step": 44200
},
{
"epoch": 1.7719291228350866,
"grad_norm": 0.4238271117210388,
"learning_rate": 0.001,
"loss": 4.1733,
"step": 44300
},
{
"epoch": 1.7759289628414865,
"grad_norm": 0.4912482500076294,
"learning_rate": 0.001,
"loss": 4.1764,
"step": 44400
},
{
"epoch": 1.7799288028478861,
"grad_norm": 0.4627314805984497,
"learning_rate": 0.001,
"loss": 4.1745,
"step": 44500
},
{
"epoch": 1.7839286428542858,
"grad_norm": 0.4460492432117462,
"learning_rate": 0.001,
"loss": 4.1772,
"step": 44600
},
{
"epoch": 1.7879284828606856,
"grad_norm": 0.46068111062049866,
"learning_rate": 0.001,
"loss": 4.175,
"step": 44700
},
{
"epoch": 1.7919283228670855,
"grad_norm": 0.5168552994728088,
"learning_rate": 0.001,
"loss": 4.1764,
"step": 44800
},
{
"epoch": 1.7959281628734851,
"grad_norm": 0.5711122155189514,
"learning_rate": 0.001,
"loss": 4.1745,
"step": 44900
},
{
"epoch": 1.7999280028798847,
"grad_norm": 0.48340123891830444,
"learning_rate": 0.001,
"loss": 4.1734,
"step": 45000
},
{
"epoch": 1.8039278428862846,
"grad_norm": 0.45124703645706177,
"learning_rate": 0.001,
"loss": 4.1692,
"step": 45100
},
{
"epoch": 1.8079276828926842,
"grad_norm": 0.4612937271595001,
"learning_rate": 0.001,
"loss": 4.176,
"step": 45200
},
{
"epoch": 1.8119275228990839,
"grad_norm": 0.45633766055107117,
"learning_rate": 0.001,
"loss": 4.1734,
"step": 45300
},
{
"epoch": 1.8159273629054837,
"grad_norm": 0.44668149948120117,
"learning_rate": 0.001,
"loss": 4.1735,
"step": 45400
},
{
"epoch": 1.8199272029118836,
"grad_norm": 0.47320279479026794,
"learning_rate": 0.001,
"loss": 4.1727,
"step": 45500
},
{
"epoch": 1.8239270429182832,
"grad_norm": 0.4252322018146515,
"learning_rate": 0.001,
"loss": 4.1743,
"step": 45600
},
{
"epoch": 1.8279268829246829,
"grad_norm": 0.4853968620300293,
"learning_rate": 0.001,
"loss": 4.1732,
"step": 45700
},
{
"epoch": 1.8319267229310827,
"grad_norm": 0.5151093006134033,
"learning_rate": 0.001,
"loss": 4.1741,
"step": 45800
},
{
"epoch": 1.8359265629374826,
"grad_norm": 0.473300039768219,
"learning_rate": 0.001,
"loss": 4.1745,
"step": 45900
},
{
"epoch": 1.8399264029438822,
"grad_norm": 0.48538273572921753,
"learning_rate": 0.001,
"loss": 4.1747,
"step": 46000
},
{
"epoch": 1.8439262429502818,
"grad_norm": 0.42796286940574646,
"learning_rate": 0.001,
"loss": 4.175,
"step": 46100
},
{
"epoch": 1.8479260829566817,
"grad_norm": 0.44311732053756714,
"learning_rate": 0.001,
"loss": 4.1724,
"step": 46200
},
{
"epoch": 1.8519259229630816,
"grad_norm": 0.45130372047424316,
"learning_rate": 0.001,
"loss": 4.1757,
"step": 46300
},
{
"epoch": 1.8559257629694812,
"grad_norm": 0.4500294327735901,
"learning_rate": 0.001,
"loss": 4.1762,
"step": 46400
},
{
"epoch": 1.8599256029758808,
"grad_norm": 0.47864317893981934,
"learning_rate": 0.001,
"loss": 4.1725,
"step": 46500
},
{
"epoch": 1.8639254429822807,
"grad_norm": 0.5632477402687073,
"learning_rate": 0.001,
"loss": 4.1747,
"step": 46600
},
{
"epoch": 1.8679252829886805,
"grad_norm": 0.48071813583374023,
"learning_rate": 0.001,
"loss": 4.1705,
"step": 46700
},
{
"epoch": 1.8719251229950802,
"grad_norm": 0.453741192817688,
"learning_rate": 0.001,
"loss": 4.1727,
"step": 46800
},
{
"epoch": 1.8759249630014798,
"grad_norm": 0.45912396907806396,
"learning_rate": 0.001,
"loss": 4.1748,
"step": 46900
},
{
"epoch": 1.8799248030078797,
"grad_norm": 0.48008185625076294,
"learning_rate": 0.001,
"loss": 4.1734,
"step": 47000
},
{
"epoch": 1.8839246430142795,
"grad_norm": 0.4684300422668457,
"learning_rate": 0.001,
"loss": 4.1688,
"step": 47100
},
{
"epoch": 1.8879244830206792,
"grad_norm": 0.49745339155197144,
"learning_rate": 0.001,
"loss": 4.1712,
"step": 47200
},
{
"epoch": 1.8919243230270788,
"grad_norm": 0.4778960049152374,
"learning_rate": 0.001,
"loss": 4.1693,
"step": 47300
},
{
"epoch": 1.8959241630334787,
"grad_norm": 0.46429726481437683,
"learning_rate": 0.001,
"loss": 4.1676,
"step": 47400
},
{
"epoch": 1.8999240030398785,
"grad_norm": 0.46908000111579895,
"learning_rate": 0.001,
"loss": 4.1711,
"step": 47500
},
{
"epoch": 1.9039238430462782,
"grad_norm": 0.4794583320617676,
"learning_rate": 0.001,
"loss": 4.1689,
"step": 47600
},
{
"epoch": 1.9079236830526778,
"grad_norm": 0.5767402648925781,
"learning_rate": 0.001,
"loss": 4.1719,
"step": 47700
},
{
"epoch": 1.9119235230590776,
"grad_norm": 0.45899704098701477,
"learning_rate": 0.001,
"loss": 4.1729,
"step": 47800
},
{
"epoch": 1.9159233630654775,
"grad_norm": 0.47999170422554016,
"learning_rate": 0.001,
"loss": 4.1722,
"step": 47900
},
{
"epoch": 1.9199232030718771,
"grad_norm": 0.4326845109462738,
"learning_rate": 0.001,
"loss": 4.1701,
"step": 48000
},
{
"epoch": 1.9239230430782768,
"grad_norm": 0.563529372215271,
"learning_rate": 0.001,
"loss": 4.1712,
"step": 48100
},
{
"epoch": 1.9279228830846766,
"grad_norm": 0.4267251491546631,
"learning_rate": 0.001,
"loss": 4.1711,
"step": 48200
},
{
"epoch": 1.9319227230910765,
"grad_norm": 0.4583933651447296,
"learning_rate": 0.001,
"loss": 4.1728,
"step": 48300
},
{
"epoch": 1.9359225630974761,
"grad_norm": 0.41948413848876953,
"learning_rate": 0.001,
"loss": 4.1728,
"step": 48400
},
{
"epoch": 1.9399224031038758,
"grad_norm": 0.4663727879524231,
"learning_rate": 0.001,
"loss": 4.1758,
"step": 48500
},
{
"epoch": 1.9439222431102756,
"grad_norm": 0.49384939670562744,
"learning_rate": 0.001,
"loss": 4.1728,
"step": 48600
},
{
"epoch": 1.9479220831166755,
"grad_norm": 0.4137873351573944,
"learning_rate": 0.001,
"loss": 4.1673,
"step": 48700
},
{
"epoch": 1.951921923123075,
"grad_norm": 0.4351732134819031,
"learning_rate": 0.001,
"loss": 4.1698,
"step": 48800
},
{
"epoch": 1.9559217631294747,
"grad_norm": 0.4443551301956177,
"learning_rate": 0.001,
"loss": 4.1694,
"step": 48900
},
{
"epoch": 1.9599216031358746,
"grad_norm": 0.4084385931491852,
"learning_rate": 0.001,
"loss": 4.1711,
"step": 49000
},
{
"epoch": 1.9639214431422745,
"grad_norm": 0.4777480661869049,
"learning_rate": 0.001,
"loss": 4.1707,
"step": 49100
},
{
"epoch": 1.967921283148674,
"grad_norm": 0.5114396214485168,
"learning_rate": 0.001,
"loss": 4.1681,
"step": 49200
},
{
"epoch": 1.9719211231550737,
"grad_norm": 0.4695410132408142,
"learning_rate": 0.001,
"loss": 4.1705,
"step": 49300
},
{
"epoch": 1.9759209631614736,
"grad_norm": 0.4276166558265686,
"learning_rate": 0.001,
"loss": 4.1699,
"step": 49400
},
{
"epoch": 1.9799208031678734,
"grad_norm": 0.4987983703613281,
"learning_rate": 0.001,
"loss": 4.1706,
"step": 49500
},
{
"epoch": 1.983920643174273,
"grad_norm": 0.4121693968772888,
"learning_rate": 0.001,
"loss": 4.1679,
"step": 49600
},
{
"epoch": 1.9879204831806727,
"grad_norm": 0.47886762022972107,
"learning_rate": 0.001,
"loss": 4.1718,
"step": 49700
},
{
"epoch": 1.9919203231870726,
"grad_norm": 0.4255962073802948,
"learning_rate": 0.001,
"loss": 4.1671,
"step": 49800
},
{
"epoch": 1.9959201631934722,
"grad_norm": 0.5012271404266357,
"learning_rate": 0.001,
"loss": 4.1682,
"step": 49900
},
{
"epoch": 1.9999200031998718,
"grad_norm": 0.44093242287635803,
"learning_rate": 0.001,
"loss": 4.1726,
"step": 50000
},
{
"epoch": 2.0039198432062717,
"grad_norm": 0.49300047755241394,
"learning_rate": 0.001,
"loss": 4.1668,
"step": 50100
},
{
"epoch": 2.0079196832126716,
"grad_norm": 0.44728681445121765,
"learning_rate": 0.001,
"loss": 4.168,
"step": 50200
},
{
"epoch": 2.0119195232190714,
"grad_norm": 0.5188434720039368,
"learning_rate": 0.001,
"loss": 4.1672,
"step": 50300
},
{
"epoch": 2.015919363225471,
"grad_norm": 0.517851710319519,
"learning_rate": 0.001,
"loss": 4.1668,
"step": 50400
},
{
"epoch": 2.0199192032318707,
"grad_norm": 0.47993385791778564,
"learning_rate": 0.001,
"loss": 4.1704,
"step": 50500
},
{
"epoch": 2.0239190432382705,
"grad_norm": 0.4726385772228241,
"learning_rate": 0.001,
"loss": 4.1688,
"step": 50600
},
{
"epoch": 2.0279188832446704,
"grad_norm": 0.5576769709587097,
"learning_rate": 0.001,
"loss": 4.1687,
"step": 50700
},
{
"epoch": 2.03191872325107,
"grad_norm": 0.5270803570747375,
"learning_rate": 0.001,
"loss": 4.1684,
"step": 50800
},
{
"epoch": 2.0359185632574697,
"grad_norm": 0.45349547266960144,
"learning_rate": 0.001,
"loss": 4.1687,
"step": 50900
},
{
"epoch": 2.0399184032638695,
"grad_norm": 0.5263473987579346,
"learning_rate": 0.001,
"loss": 4.1702,
"step": 51000
},
{
"epoch": 2.0439182432702694,
"grad_norm": 0.494325190782547,
"learning_rate": 0.001,
"loss": 4.1665,
"step": 51100
},
{
"epoch": 2.047918083276669,
"grad_norm": 0.5202022790908813,
"learning_rate": 0.001,
"loss": 4.165,
"step": 51200
},
{
"epoch": 2.0519179232830687,
"grad_norm": 0.4353752136230469,
"learning_rate": 0.001,
"loss": 4.1684,
"step": 51300
},
{
"epoch": 2.0559177632894685,
"grad_norm": 0.46369001269340515,
"learning_rate": 0.001,
"loss": 4.1652,
"step": 51400
},
{
"epoch": 2.0599176032958684,
"grad_norm": 0.4611663222312927,
"learning_rate": 0.001,
"loss": 4.1686,
"step": 51500
},
{
"epoch": 2.063917443302268,
"grad_norm": 0.44690844416618347,
"learning_rate": 0.001,
"loss": 4.1709,
"step": 51600
},
{
"epoch": 2.0679172833086676,
"grad_norm": 0.4432712495326996,
"learning_rate": 0.001,
"loss": 4.1686,
"step": 51700
},
{
"epoch": 2.0719171233150675,
"grad_norm": 0.46799278259277344,
"learning_rate": 0.001,
"loss": 4.169,
"step": 51800
},
{
"epoch": 2.0759169633214674,
"grad_norm": 0.488779217004776,
"learning_rate": 0.001,
"loss": 4.1661,
"step": 51900
},
{
"epoch": 2.0799168033278668,
"grad_norm": 0.44497257471084595,
"learning_rate": 0.001,
"loss": 4.1641,
"step": 52000
},
{
"epoch": 2.0839166433342666,
"grad_norm": 0.42999890446662903,
"learning_rate": 0.001,
"loss": 4.169,
"step": 52100
},
{
"epoch": 2.0879164833406665,
"grad_norm": 0.4540679454803467,
"learning_rate": 0.001,
"loss": 4.1607,
"step": 52200
},
{
"epoch": 2.091916323347066,
"grad_norm": 0.43836355209350586,
"learning_rate": 0.001,
"loss": 4.1697,
"step": 52300
},
{
"epoch": 2.0959161633534658,
"grad_norm": 0.46834954619407654,
"learning_rate": 0.001,
"loss": 4.1675,
"step": 52400
},
{
"epoch": 2.0999160033598656,
"grad_norm": 0.39395639300346375,
"learning_rate": 0.001,
"loss": 4.167,
"step": 52500
},
{
"epoch": 2.1039158433662655,
"grad_norm": 0.47284603118896484,
"learning_rate": 0.001,
"loss": 4.1701,
"step": 52600
},
{
"epoch": 2.107915683372665,
"grad_norm": 0.5229921936988831,
"learning_rate": 0.001,
"loss": 4.1692,
"step": 52700
},
{
"epoch": 2.1119155233790647,
"grad_norm": 0.4998793303966522,
"learning_rate": 0.001,
"loss": 4.1692,
"step": 52800
},
{
"epoch": 2.1159153633854646,
"grad_norm": 0.5066671967506409,
"learning_rate": 0.001,
"loss": 4.1652,
"step": 52900
},
{
"epoch": 2.1199152033918645,
"grad_norm": 0.4590517580509186,
"learning_rate": 0.001,
"loss": 4.1668,
"step": 53000
},
{
"epoch": 2.123915043398264,
"grad_norm": 0.49296894669532776,
"learning_rate": 0.001,
"loss": 4.1678,
"step": 53100
},
{
"epoch": 2.1279148834046637,
"grad_norm": 0.43287187814712524,
"learning_rate": 0.001,
"loss": 4.1635,
"step": 53200
},
{
"epoch": 2.1319147234110636,
"grad_norm": 0.5368506908416748,
"learning_rate": 0.001,
"loss": 4.168,
"step": 53300
},
{
"epoch": 2.1359145634174634,
"grad_norm": 0.47554171085357666,
"learning_rate": 0.001,
"loss": 4.1681,
"step": 53400
},
{
"epoch": 2.139914403423863,
"grad_norm": 0.47026315331459045,
"learning_rate": 0.001,
"loss": 4.1687,
"step": 53500
},
{
"epoch": 2.1439142434302627,
"grad_norm": 0.4864146411418915,
"learning_rate": 0.001,
"loss": 4.1646,
"step": 53600
},
{
"epoch": 2.1479140834366626,
"grad_norm": 0.45245715975761414,
"learning_rate": 0.001,
"loss": 4.164,
"step": 53700
},
{
"epoch": 2.1519139234430624,
"grad_norm": 0.5358317494392395,
"learning_rate": 0.001,
"loss": 4.1656,
"step": 53800
},
{
"epoch": 2.155913763449462,
"grad_norm": 0.47510644793510437,
"learning_rate": 0.001,
"loss": 4.169,
"step": 53900
},
{
"epoch": 2.1599136034558617,
"grad_norm": 0.518865168094635,
"learning_rate": 0.001,
"loss": 4.1712,
"step": 54000
},
{
"epoch": 2.1639134434622616,
"grad_norm": 0.49107488989830017,
"learning_rate": 0.001,
"loss": 4.1664,
"step": 54100
},
{
"epoch": 2.1679132834686614,
"grad_norm": 0.4293051064014435,
"learning_rate": 0.001,
"loss": 4.1623,
"step": 54200
},
{
"epoch": 2.171913123475061,
"grad_norm": 0.48307546973228455,
"learning_rate": 0.001,
"loss": 4.1671,
"step": 54300
},
{
"epoch": 2.1759129634814607,
"grad_norm": 0.49982860684394836,
"learning_rate": 0.001,
"loss": 4.1673,
"step": 54400
},
{
"epoch": 2.1799128034878605,
"grad_norm": 0.4217018187046051,
"learning_rate": 0.001,
"loss": 4.1636,
"step": 54500
},
{
"epoch": 2.1839126434942604,
"grad_norm": 0.4675614833831787,
"learning_rate": 0.001,
"loss": 4.1679,
"step": 54600
},
{
"epoch": 2.18791248350066,
"grad_norm": 0.46770352125167847,
"learning_rate": 0.001,
"loss": 4.1633,
"step": 54700
},
{
"epoch": 2.1919123235070597,
"grad_norm": 0.46287262439727783,
"learning_rate": 0.001,
"loss": 4.1665,
"step": 54800
},
{
"epoch": 2.1959121635134595,
"grad_norm": 0.42776986956596375,
"learning_rate": 0.001,
"loss": 4.1664,
"step": 54900
},
{
"epoch": 2.1999120035198594,
"grad_norm": 0.4742175042629242,
"learning_rate": 0.001,
"loss": 4.1655,
"step": 55000
},
{
"epoch": 2.203911843526259,
"grad_norm": 0.4570881128311157,
"learning_rate": 0.001,
"loss": 4.1659,
"step": 55100
},
{
"epoch": 2.2079116835326587,
"grad_norm": 0.4609364867210388,
"learning_rate": 0.001,
"loss": 4.1669,
"step": 55200
},
{
"epoch": 2.2119115235390585,
"grad_norm": 0.5724889039993286,
"learning_rate": 0.001,
"loss": 4.1644,
"step": 55300
},
{
"epoch": 2.2159113635454584,
"grad_norm": 0.4681205451488495,
"learning_rate": 0.001,
"loss": 4.1614,
"step": 55400
},
{
"epoch": 2.219911203551858,
"grad_norm": 0.5061549544334412,
"learning_rate": 0.001,
"loss": 4.168,
"step": 55500
},
{
"epoch": 2.2239110435582576,
"grad_norm": 0.4458412826061249,
"learning_rate": 0.001,
"loss": 4.1629,
"step": 55600
},
{
"epoch": 2.2279108835646575,
"grad_norm": 0.4831654131412506,
"learning_rate": 0.001,
"loss": 4.1668,
"step": 55700
},
{
"epoch": 2.2319107235710574,
"grad_norm": 0.5010032653808594,
"learning_rate": 0.001,
"loss": 4.1656,
"step": 55800
},
{
"epoch": 2.2359105635774568,
"grad_norm": 0.4242647886276245,
"learning_rate": 0.001,
"loss": 4.1653,
"step": 55900
},
{
"epoch": 2.2399104035838566,
"grad_norm": 0.40968021750450134,
"learning_rate": 0.001,
"loss": 4.168,
"step": 56000
},
{
"epoch": 2.2439102435902565,
"grad_norm": 0.4865590035915375,
"learning_rate": 0.001,
"loss": 4.1675,
"step": 56100
},
{
"epoch": 2.2479100835966563,
"grad_norm": 0.4834771156311035,
"learning_rate": 0.001,
"loss": 4.1637,
"step": 56200
},
{
"epoch": 2.2519099236030558,
"grad_norm": 0.41941970586776733,
"learning_rate": 0.001,
"loss": 4.1631,
"step": 56300
},
{
"epoch": 2.2559097636094556,
"grad_norm": 0.48071053624153137,
"learning_rate": 0.001,
"loss": 4.1675,
"step": 56400
},
{
"epoch": 2.2599096036158555,
"grad_norm": 0.4841105043888092,
"learning_rate": 0.001,
"loss": 4.165,
"step": 56500
},
{
"epoch": 2.2639094436222553,
"grad_norm": 0.44922900199890137,
"learning_rate": 0.001,
"loss": 4.1621,
"step": 56600
},
{
"epoch": 2.2679092836286547,
"grad_norm": 0.4089633524417877,
"learning_rate": 0.001,
"loss": 4.1707,
"step": 56700
},
{
"epoch": 2.2719091236350546,
"grad_norm": 0.5155735015869141,
"learning_rate": 0.001,
"loss": 4.1675,
"step": 56800
},
{
"epoch": 2.2759089636414545,
"grad_norm": 0.44511187076568604,
"learning_rate": 0.001,
"loss": 4.1626,
"step": 56900
},
{
"epoch": 2.279908803647854,
"grad_norm": 0.45319080352783203,
"learning_rate": 0.001,
"loss": 4.1639,
"step": 57000
},
{
"epoch": 2.2839086436542537,
"grad_norm": 0.4893025755882263,
"learning_rate": 0.001,
"loss": 4.1625,
"step": 57100
},
{
"epoch": 2.2879084836606536,
"grad_norm": 0.4628910720348358,
"learning_rate": 0.001,
"loss": 4.1635,
"step": 57200
},
{
"epoch": 2.2919083236670534,
"grad_norm": 0.5497888326644897,
"learning_rate": 0.001,
"loss": 4.1627,
"step": 57300
},
{
"epoch": 2.2959081636734533,
"grad_norm": 0.45991674065589905,
"learning_rate": 0.001,
"loss": 4.1666,
"step": 57400
},
{
"epoch": 2.2999080036798527,
"grad_norm": 0.5311243534088135,
"learning_rate": 0.001,
"loss": 4.1602,
"step": 57500
},
{
"epoch": 2.3039078436862526,
"grad_norm": 0.4266692101955414,
"learning_rate": 0.001,
"loss": 4.1605,
"step": 57600
},
{
"epoch": 2.3079076836926524,
"grad_norm": 0.5123735070228577,
"learning_rate": 0.001,
"loss": 4.164,
"step": 57700
},
{
"epoch": 2.311907523699052,
"grad_norm": 0.5435347557067871,
"learning_rate": 0.001,
"loss": 4.163,
"step": 57800
},
{
"epoch": 2.3159073637054517,
"grad_norm": 0.5728914737701416,
"learning_rate": 0.001,
"loss": 4.1661,
"step": 57900
},
{
"epoch": 2.3199072037118516,
"grad_norm": 0.5435721278190613,
"learning_rate": 0.001,
"loss": 4.1632,
"step": 58000
},
{
"epoch": 2.3239070437182514,
"grad_norm": 0.5009971261024475,
"learning_rate": 0.001,
"loss": 4.163,
"step": 58100
},
{
"epoch": 2.327906883724651,
"grad_norm": 0.47658857703208923,
"learning_rate": 0.001,
"loss": 4.1621,
"step": 58200
},
{
"epoch": 2.3319067237310507,
"grad_norm": 0.5185097455978394,
"learning_rate": 0.001,
"loss": 4.1616,
"step": 58300
},
{
"epoch": 2.3359065637374505,
"grad_norm": 0.43645840883255005,
"learning_rate": 0.001,
"loss": 4.1667,
"step": 58400
},
{
"epoch": 2.3399064037438504,
"grad_norm": 0.4473995566368103,
"learning_rate": 0.001,
"loss": 4.1658,
"step": 58500
},
{
"epoch": 2.34390624375025,
"grad_norm": 0.4278011918067932,
"learning_rate": 0.001,
"loss": 4.1678,
"step": 58600
},
{
"epoch": 2.3479060837566497,
"grad_norm": 0.47076526284217834,
"learning_rate": 0.001,
"loss": 4.1665,
"step": 58700
},
{
"epoch": 2.3519059237630495,
"grad_norm": 0.5503517985343933,
"learning_rate": 0.001,
"loss": 4.1628,
"step": 58800
},
{
"epoch": 2.3559057637694494,
"grad_norm": 0.41893520951271057,
"learning_rate": 0.001,
"loss": 4.1602,
"step": 58900
},
{
"epoch": 2.359905603775849,
"grad_norm": 0.4245523512363434,
"learning_rate": 0.001,
"loss": 4.1616,
"step": 59000
},
{
"epoch": 2.3639054437822486,
"grad_norm": 0.4149760603904724,
"learning_rate": 0.001,
"loss": 4.1676,
"step": 59100
},
{
"epoch": 2.3679052837886485,
"grad_norm": 0.5713924169540405,
"learning_rate": 0.001,
"loss": 4.1636,
"step": 59200
},
{
"epoch": 2.3719051237950484,
"grad_norm": 0.4798339009284973,
"learning_rate": 0.001,
"loss": 4.162,
"step": 59300
},
{
"epoch": 2.3759049638014478,
"grad_norm": 0.42810848355293274,
"learning_rate": 0.001,
"loss": 4.1651,
"step": 59400
},
{
"epoch": 2.3799048038078476,
"grad_norm": 0.5690004229545593,
"learning_rate": 0.001,
"loss": 4.1633,
"step": 59500
},
{
"epoch": 2.3839046438142475,
"grad_norm": 0.48632410168647766,
"learning_rate": 0.001,
"loss": 4.1668,
"step": 59600
},
{
"epoch": 2.3879044838206474,
"grad_norm": 0.4375806152820587,
"learning_rate": 0.001,
"loss": 4.1638,
"step": 59700
},
{
"epoch": 2.3919043238270468,
"grad_norm": 0.44997647404670715,
"learning_rate": 0.001,
"loss": 4.1614,
"step": 59800
},
{
"epoch": 2.3959041638334466,
"grad_norm": 0.5309412479400635,
"learning_rate": 0.001,
"loss": 4.1612,
"step": 59900
},
{
"epoch": 2.3999040038398465,
"grad_norm": 0.560085654258728,
"learning_rate": 0.001,
"loss": 4.1633,
"step": 60000
},
{
"epoch": 2.4039038438462463,
"grad_norm": 0.4551568925380707,
"learning_rate": 0.001,
"loss": 4.1618,
"step": 60100
},
{
"epoch": 2.4079036838526457,
"grad_norm": 0.4853755533695221,
"learning_rate": 0.001,
"loss": 4.164,
"step": 60200
},
{
"epoch": 2.4119035238590456,
"grad_norm": 0.47059595584869385,
"learning_rate": 0.001,
"loss": 4.1631,
"step": 60300
},
{
"epoch": 2.4159033638654455,
"grad_norm": 0.5161297917366028,
"learning_rate": 0.001,
"loss": 4.1636,
"step": 60400
},
{
"epoch": 2.4199032038718453,
"grad_norm": 0.4607383608818054,
"learning_rate": 0.001,
"loss": 4.1641,
"step": 60500
},
{
"epoch": 2.4239030438782447,
"grad_norm": 0.4741229712963104,
"learning_rate": 0.001,
"loss": 4.1621,
"step": 60600
},
{
"epoch": 2.4279028838846446,
"grad_norm": 0.4276678264141083,
"learning_rate": 0.001,
"loss": 4.1654,
"step": 60700
},
{
"epoch": 2.4319027238910444,
"grad_norm": 0.45867425203323364,
"learning_rate": 0.001,
"loss": 4.1639,
"step": 60800
},
{
"epoch": 2.435902563897444,
"grad_norm": 0.5171924233436584,
"learning_rate": 0.001,
"loss": 4.1617,
"step": 60900
},
{
"epoch": 2.4399024039038437,
"grad_norm": 0.4670430123806,
"learning_rate": 0.001,
"loss": 4.1625,
"step": 61000
},
{
"epoch": 2.4439022439102436,
"grad_norm": 0.4531850516796112,
"learning_rate": 0.001,
"loss": 4.1638,
"step": 61100
},
{
"epoch": 2.4479020839166434,
"grad_norm": 0.5091714262962341,
"learning_rate": 0.001,
"loss": 4.1636,
"step": 61200
},
{
"epoch": 2.4519019239230433,
"grad_norm": 0.48586076498031616,
"learning_rate": 0.001,
"loss": 4.1586,
"step": 61300
},
{
"epoch": 2.4559017639294427,
"grad_norm": 0.44302985072135925,
"learning_rate": 0.001,
"loss": 4.1601,
"step": 61400
},
{
"epoch": 2.4599016039358426,
"grad_norm": 0.4628585875034332,
"learning_rate": 0.001,
"loss": 4.163,
"step": 61500
},
{
"epoch": 2.4639014439422424,
"grad_norm": 0.5455500483512878,
"learning_rate": 0.001,
"loss": 4.1634,
"step": 61600
},
{
"epoch": 2.467901283948642,
"grad_norm": 0.5075648427009583,
"learning_rate": 0.001,
"loss": 4.1617,
"step": 61700
},
{
"epoch": 2.4719011239550417,
"grad_norm": 0.44180813431739807,
"learning_rate": 0.001,
"loss": 4.1665,
"step": 61800
},
{
"epoch": 2.4759009639614415,
"grad_norm": 0.5144279599189758,
"learning_rate": 0.001,
"loss": 4.1606,
"step": 61900
},
{
"epoch": 2.4799008039678414,
"grad_norm": 0.5430071353912354,
"learning_rate": 0.001,
"loss": 4.1612,
"step": 62000
},
{
"epoch": 2.4839006439742413,
"grad_norm": 0.4144330322742462,
"learning_rate": 0.001,
"loss": 4.1609,
"step": 62100
},
{
"epoch": 2.4879004839806407,
"grad_norm": 0.5358167290687561,
"learning_rate": 0.001,
"loss": 4.1565,
"step": 62200
},
{
"epoch": 2.4919003239870405,
"grad_norm": 0.5273513793945312,
"learning_rate": 0.001,
"loss": 4.1623,
"step": 62300
},
{
"epoch": 2.4959001639934404,
"grad_norm": 0.4575463533401489,
"learning_rate": 0.001,
"loss": 4.1635,
"step": 62400
},
{
"epoch": 2.49990000399984,
"grad_norm": 0.47879844903945923,
"learning_rate": 0.001,
"loss": 4.1629,
"step": 62500
},
{
"epoch": 2.5038998440062397,
"grad_norm": 0.40959274768829346,
"learning_rate": 0.001,
"loss": 4.1623,
"step": 62600
},
{
"epoch": 2.5078996840126395,
"grad_norm": 0.5272637009620667,
"learning_rate": 0.001,
"loss": 4.1621,
"step": 62700
},
{
"epoch": 2.5118995240190394,
"grad_norm": 0.40802329778671265,
"learning_rate": 0.001,
"loss": 4.162,
"step": 62800
},
{
"epoch": 2.5158993640254392,
"grad_norm": 0.4832558035850525,
"learning_rate": 0.001,
"loss": 4.1644,
"step": 62900
},
{
"epoch": 2.5198992040318386,
"grad_norm": 0.4066709280014038,
"learning_rate": 0.001,
"loss": 4.1644,
"step": 63000
},
{
"epoch": 2.5238990440382385,
"grad_norm": 0.42834344506263733,
"learning_rate": 0.001,
"loss": 4.1621,
"step": 63100
},
{
"epoch": 2.5278988840446384,
"grad_norm": 0.5093958377838135,
"learning_rate": 0.001,
"loss": 4.161,
"step": 63200
},
{
"epoch": 2.5318987240510378,
"grad_norm": 0.5456981658935547,
"learning_rate": 0.001,
"loss": 4.164,
"step": 63300
},
{
"epoch": 2.5358985640574376,
"grad_norm": 0.47444722056388855,
"learning_rate": 0.001,
"loss": 4.1582,
"step": 63400
},
{
"epoch": 2.5398984040638375,
"grad_norm": 0.49098628759384155,
"learning_rate": 0.001,
"loss": 4.1614,
"step": 63500
},
{
"epoch": 2.5438982440702373,
"grad_norm": 0.5193818807601929,
"learning_rate": 0.001,
"loss": 4.1603,
"step": 63600
},
{
"epoch": 2.547898084076637,
"grad_norm": 0.4925576150417328,
"learning_rate": 0.001,
"loss": 4.1603,
"step": 63700
},
{
"epoch": 2.5518979240830366,
"grad_norm": 0.4549446403980255,
"learning_rate": 0.001,
"loss": 4.1554,
"step": 63800
},
{
"epoch": 2.5558977640894365,
"grad_norm": 0.5603616237640381,
"learning_rate": 0.001,
"loss": 4.1607,
"step": 63900
},
{
"epoch": 2.5598976040958363,
"grad_norm": 0.429360955953598,
"learning_rate": 0.001,
"loss": 4.1628,
"step": 64000
},
{
"epoch": 2.5638974441022357,
"grad_norm": 0.4182247519493103,
"learning_rate": 0.001,
"loss": 4.164,
"step": 64100
},
{
"epoch": 2.5678972841086356,
"grad_norm": 0.5610747337341309,
"learning_rate": 0.001,
"loss": 4.1641,
"step": 64200
},
{
"epoch": 2.5718971241150355,
"grad_norm": 0.39815935492515564,
"learning_rate": 0.001,
"loss": 4.158,
"step": 64300
},
{
"epoch": 2.5758969641214353,
"grad_norm": 0.4976713955402374,
"learning_rate": 0.001,
"loss": 4.1603,
"step": 64400
},
{
"epoch": 2.5798968041278347,
"grad_norm": 0.46783262491226196,
"learning_rate": 0.001,
"loss": 4.1656,
"step": 64500
},
{
"epoch": 2.5838966441342346,
"grad_norm": 0.41783684492111206,
"learning_rate": 0.001,
"loss": 4.161,
"step": 64600
},
{
"epoch": 2.5878964841406344,
"grad_norm": 0.4668773412704468,
"learning_rate": 0.001,
"loss": 4.1637,
"step": 64700
},
{
"epoch": 2.591896324147034,
"grad_norm": 0.4740975797176361,
"learning_rate": 0.001,
"loss": 4.1598,
"step": 64800
},
{
"epoch": 2.5958961641534337,
"grad_norm": 0.4560339152812958,
"learning_rate": 0.001,
"loss": 4.1584,
"step": 64900
},
{
"epoch": 2.5998960041598336,
"grad_norm": 0.4181240200996399,
"learning_rate": 0.001,
"loss": 4.1586,
"step": 65000
},
{
"epoch": 2.6038958441662334,
"grad_norm": 0.5197315812110901,
"learning_rate": 0.001,
"loss": 4.1594,
"step": 65100
},
{
"epoch": 2.6078956841726333,
"grad_norm": 0.45271509885787964,
"learning_rate": 0.001,
"loss": 4.1605,
"step": 65200
},
{
"epoch": 2.6118955241790327,
"grad_norm": 0.521769642829895,
"learning_rate": 0.001,
"loss": 4.1631,
"step": 65300
},
{
"epoch": 2.6158953641854326,
"grad_norm": 0.4969339370727539,
"learning_rate": 0.001,
"loss": 4.1601,
"step": 65400
},
{
"epoch": 2.6198952041918324,
"grad_norm": 0.41261184215545654,
"learning_rate": 0.001,
"loss": 4.1627,
"step": 65500
},
{
"epoch": 2.623895044198232,
"grad_norm": 0.4068695306777954,
"learning_rate": 0.001,
"loss": 4.1583,
"step": 65600
},
{
"epoch": 2.6278948842046317,
"grad_norm": 0.5288635492324829,
"learning_rate": 0.001,
"loss": 4.1586,
"step": 65700
},
{
"epoch": 2.6318947242110315,
"grad_norm": 0.5345166921615601,
"learning_rate": 0.001,
"loss": 4.1637,
"step": 65800
},
{
"epoch": 2.6358945642174314,
"grad_norm": 0.5781984329223633,
"learning_rate": 0.001,
"loss": 4.1595,
"step": 65900
},
{
"epoch": 2.6398944042238313,
"grad_norm": 0.4580060541629791,
"learning_rate": 0.001,
"loss": 4.1658,
"step": 66000
},
{
"epoch": 2.6438942442302307,
"grad_norm": 0.4711572825908661,
"learning_rate": 0.001,
"loss": 4.1555,
"step": 66100
},
{
"epoch": 2.6478940842366305,
"grad_norm": 0.4615152180194855,
"learning_rate": 0.001,
"loss": 4.1619,
"step": 66200
},
{
"epoch": 2.6518939242430304,
"grad_norm": 0.45307356119155884,
"learning_rate": 0.001,
"loss": 4.1565,
"step": 66300
},
{
"epoch": 2.65589376424943,
"grad_norm": 0.4266311526298523,
"learning_rate": 0.001,
"loss": 4.1625,
"step": 66400
},
{
"epoch": 2.6598936042558297,
"grad_norm": 0.43120697140693665,
"learning_rate": 0.001,
"loss": 4.1637,
"step": 66500
},
{
"epoch": 2.6638934442622295,
"grad_norm": 0.49627289175987244,
"learning_rate": 0.001,
"loss": 4.1599,
"step": 66600
},
{
"epoch": 2.6678932842686294,
"grad_norm": 0.4489138126373291,
"learning_rate": 0.001,
"loss": 4.1557,
"step": 66700
},
{
"epoch": 2.6718931242750292,
"grad_norm": 0.5802924633026123,
"learning_rate": 0.001,
"loss": 4.161,
"step": 66800
},
{
"epoch": 2.6758929642814286,
"grad_norm": 0.4587540924549103,
"learning_rate": 0.001,
"loss": 4.1605,
"step": 66900
},
{
"epoch": 2.6798928042878285,
"grad_norm": 0.5906481742858887,
"learning_rate": 0.001,
"loss": 4.1578,
"step": 67000
},
{
"epoch": 2.6838926442942284,
"grad_norm": 0.4712335169315338,
"learning_rate": 0.001,
"loss": 4.1596,
"step": 67100
},
{
"epoch": 2.6878924843006278,
"grad_norm": 0.4989967942237854,
"learning_rate": 0.001,
"loss": 4.1572,
"step": 67200
},
{
"epoch": 2.6918923243070276,
"grad_norm": 0.49959269165992737,
"learning_rate": 0.001,
"loss": 4.1574,
"step": 67300
},
{
"epoch": 2.6958921643134275,
"grad_norm": 0.4110835790634155,
"learning_rate": 0.001,
"loss": 4.1605,
"step": 67400
},
{
"epoch": 2.6998920043198273,
"grad_norm": 0.45519450306892395,
"learning_rate": 0.001,
"loss": 4.1545,
"step": 67500
},
{
"epoch": 2.703891844326227,
"grad_norm": 0.41701555252075195,
"learning_rate": 0.001,
"loss": 4.1591,
"step": 67600
},
{
"epoch": 2.7078916843326266,
"grad_norm": 0.46233931183815,
"learning_rate": 0.001,
"loss": 4.1578,
"step": 67700
},
{
"epoch": 2.7118915243390265,
"grad_norm": 0.4422828257083893,
"learning_rate": 0.001,
"loss": 4.1584,
"step": 67800
},
{
"epoch": 2.7158913643454263,
"grad_norm": 0.4062967598438263,
"learning_rate": 0.001,
"loss": 4.163,
"step": 67900
},
{
"epoch": 2.7198912043518257,
"grad_norm": 0.45407694578170776,
"learning_rate": 0.001,
"loss": 4.1592,
"step": 68000
},
{
"epoch": 2.7238910443582256,
"grad_norm": 0.4106515347957611,
"learning_rate": 0.001,
"loss": 4.1577,
"step": 68100
},
{
"epoch": 2.7278908843646255,
"grad_norm": 0.4356382191181183,
"learning_rate": 0.001,
"loss": 4.1603,
"step": 68200
},
{
"epoch": 2.7318907243710253,
"grad_norm": 0.5237522721290588,
"learning_rate": 0.001,
"loss": 4.1598,
"step": 68300
},
{
"epoch": 2.735890564377425,
"grad_norm": 0.48187971115112305,
"learning_rate": 0.001,
"loss": 4.159,
"step": 68400
},
{
"epoch": 2.7398904043838246,
"grad_norm": 0.4024539887905121,
"learning_rate": 0.001,
"loss": 4.1535,
"step": 68500
},
{
"epoch": 2.7438902443902244,
"grad_norm": 0.49014198780059814,
"learning_rate": 0.001,
"loss": 4.1555,
"step": 68600
},
{
"epoch": 2.7478900843966243,
"grad_norm": 0.4648239016532898,
"learning_rate": 0.001,
"loss": 4.1573,
"step": 68700
},
{
"epoch": 2.7518899244030237,
"grad_norm": 0.53783118724823,
"learning_rate": 0.001,
"loss": 4.1578,
"step": 68800
},
{
"epoch": 2.7558897644094236,
"grad_norm": 0.4142454266548157,
"learning_rate": 0.001,
"loss": 4.1567,
"step": 68900
},
{
"epoch": 2.7598896044158234,
"grad_norm": 0.5513470768928528,
"learning_rate": 0.001,
"loss": 4.1607,
"step": 69000
},
{
"epoch": 2.7638894444222233,
"grad_norm": 0.45765164494514465,
"learning_rate": 0.001,
"loss": 4.1615,
"step": 69100
},
{
"epoch": 2.7678892844286227,
"grad_norm": 0.5637156367301941,
"learning_rate": 0.001,
"loss": 4.1588,
"step": 69200
},
{
"epoch": 2.7718891244350226,
"grad_norm": 0.41710424423217773,
"learning_rate": 0.001,
"loss": 4.1586,
"step": 69300
},
{
"epoch": 2.7758889644414224,
"grad_norm": 0.5280339121818542,
"learning_rate": 0.001,
"loss": 4.1615,
"step": 69400
},
{
"epoch": 2.779888804447822,
"grad_norm": 0.6178783178329468,
"learning_rate": 0.001,
"loss": 4.1582,
"step": 69500
},
{
"epoch": 2.7838886444542217,
"grad_norm": 0.4407796859741211,
"learning_rate": 0.001,
"loss": 4.1616,
"step": 69600
},
{
"epoch": 2.7878884844606215,
"grad_norm": 0.4791260361671448,
"learning_rate": 0.001,
"loss": 4.1584,
"step": 69700
},
{
"epoch": 2.7918883244670214,
"grad_norm": 0.7081926465034485,
"learning_rate": 0.001,
"loss": 4.1551,
"step": 69800
},
{
"epoch": 2.7958881644734213,
"grad_norm": 0.46901920437812805,
"learning_rate": 0.001,
"loss": 4.159,
"step": 69900
},
{
"epoch": 2.7998880044798207,
"grad_norm": 0.5519617795944214,
"learning_rate": 0.001,
"loss": 4.1603,
"step": 70000
},
{
"epoch": 2.8038878444862205,
"grad_norm": 0.5594943165779114,
"learning_rate": 0.001,
"loss": 4.1582,
"step": 70100
},
{
"epoch": 2.8078876844926204,
"grad_norm": 0.4514610171318054,
"learning_rate": 0.001,
"loss": 4.1565,
"step": 70200
},
{
"epoch": 2.81188752449902,
"grad_norm": 0.5486029982566833,
"learning_rate": 0.001,
"loss": 4.1568,
"step": 70300
},
{
"epoch": 2.8158873645054197,
"grad_norm": 0.4768097698688507,
"learning_rate": 0.001,
"loss": 4.1618,
"step": 70400
},
{
"epoch": 2.8198872045118195,
"grad_norm": 0.49742165207862854,
"learning_rate": 0.001,
"loss": 4.162,
"step": 70500
},
{
"epoch": 2.8238870445182194,
"grad_norm": 0.49774202704429626,
"learning_rate": 0.001,
"loss": 4.1565,
"step": 70600
},
{
"epoch": 2.8278868845246192,
"grad_norm": 0.5217127799987793,
"learning_rate": 0.001,
"loss": 4.1585,
"step": 70700
},
{
"epoch": 2.8318867245310186,
"grad_norm": 0.44911012053489685,
"learning_rate": 0.001,
"loss": 4.1573,
"step": 70800
},
{
"epoch": 2.8358865645374185,
"grad_norm": 0.47019949555397034,
"learning_rate": 0.001,
"loss": 4.1572,
"step": 70900
},
{
"epoch": 2.8398864045438184,
"grad_norm": 0.47618111968040466,
"learning_rate": 0.001,
"loss": 4.1596,
"step": 71000
},
{
"epoch": 2.8438862445502178,
"grad_norm": 0.5387282967567444,
"learning_rate": 0.001,
"loss": 4.1552,
"step": 71100
},
{
"epoch": 2.8478860845566176,
"grad_norm": 0.5334316492080688,
"learning_rate": 0.001,
"loss": 4.154,
"step": 71200
},
{
"epoch": 2.8518859245630175,
"grad_norm": 0.4238860607147217,
"learning_rate": 0.001,
"loss": 4.1582,
"step": 71300
},
{
"epoch": 2.8558857645694173,
"grad_norm": 0.42178449034690857,
"learning_rate": 0.001,
"loss": 4.1582,
"step": 71400
},
{
"epoch": 2.859885604575817,
"grad_norm": 0.5171424746513367,
"learning_rate": 0.001,
"loss": 4.1578,
"step": 71500
},
{
"epoch": 2.8638854445822166,
"grad_norm": 0.47590920329093933,
"learning_rate": 0.001,
"loss": 4.1577,
"step": 71600
},
{
"epoch": 2.8678852845886165,
"grad_norm": 0.4355865716934204,
"learning_rate": 0.001,
"loss": 4.1601,
"step": 71700
},
{
"epoch": 2.8718851245950163,
"grad_norm": 0.45468640327453613,
"learning_rate": 0.001,
"loss": 4.1588,
"step": 71800
},
{
"epoch": 2.8758849646014157,
"grad_norm": 0.4616718292236328,
"learning_rate": 0.001,
"loss": 4.1584,
"step": 71900
},
{
"epoch": 2.8798848046078156,
"grad_norm": 0.4420863389968872,
"learning_rate": 0.001,
"loss": 4.1566,
"step": 72000
},
{
"epoch": 2.8838846446142155,
"grad_norm": 0.6559464335441589,
"learning_rate": 0.001,
"loss": 4.1554,
"step": 72100
},
{
"epoch": 2.8878844846206153,
"grad_norm": 0.41194289922714233,
"learning_rate": 0.001,
"loss": 4.1572,
"step": 72200
},
{
"epoch": 2.891884324627015,
"grad_norm": 0.4040292501449585,
"learning_rate": 0.001,
"loss": 4.1588,
"step": 72300
},
{
"epoch": 2.8958841646334146,
"grad_norm": 0.45008403062820435,
"learning_rate": 0.001,
"loss": 4.1596,
"step": 72400
},
{
"epoch": 2.8998840046398144,
"grad_norm": 0.46512535214424133,
"learning_rate": 0.001,
"loss": 4.1518,
"step": 72500
},
{
"epoch": 2.9038838446462143,
"grad_norm": 0.4376848638057709,
"learning_rate": 0.001,
"loss": 4.1563,
"step": 72600
},
{
"epoch": 2.9078836846526137,
"grad_norm": 0.38827189803123474,
"learning_rate": 0.001,
"loss": 4.1556,
"step": 72700
},
{
"epoch": 2.9118835246590136,
"grad_norm": 0.494261234998703,
"learning_rate": 0.001,
"loss": 4.1587,
"step": 72800
},
{
"epoch": 2.9158833646654134,
"grad_norm": 0.4950977861881256,
"learning_rate": 0.001,
"loss": 4.1596,
"step": 72900
},
{
"epoch": 2.9198832046718133,
"grad_norm": 0.5697283744812012,
"learning_rate": 0.001,
"loss": 4.1547,
"step": 73000
},
{
"epoch": 2.923883044678213,
"grad_norm": 0.4631339907646179,
"learning_rate": 0.001,
"loss": 4.1594,
"step": 73100
},
{
"epoch": 2.9278828846846126,
"grad_norm": 0.4409984052181244,
"learning_rate": 0.001,
"loss": 4.1562,
"step": 73200
},
{
"epoch": 2.9318827246910124,
"grad_norm": 0.49438488483428955,
"learning_rate": 0.001,
"loss": 4.1568,
"step": 73300
},
{
"epoch": 2.9358825646974123,
"grad_norm": 0.45631879568099976,
"learning_rate": 0.001,
"loss": 4.1576,
"step": 73400
},
{
"epoch": 2.9398824047038117,
"grad_norm": 0.5139431357383728,
"learning_rate": 0.001,
"loss": 4.1583,
"step": 73500
},
{
"epoch": 2.9438822447102115,
"grad_norm": 0.5125510096549988,
"learning_rate": 0.001,
"loss": 4.1563,
"step": 73600
},
{
"epoch": 2.9478820847166114,
"grad_norm": 0.44619888067245483,
"learning_rate": 0.001,
"loss": 4.1548,
"step": 73700
},
{
"epoch": 2.9518819247230113,
"grad_norm": 0.4973961114883423,
"learning_rate": 0.001,
"loss": 4.1563,
"step": 73800
},
{
"epoch": 2.9558817647294107,
"grad_norm": 0.4607144892215729,
"learning_rate": 0.001,
"loss": 4.1569,
"step": 73900
},
{
"epoch": 2.9598816047358105,
"grad_norm": 0.5176932215690613,
"learning_rate": 0.001,
"loss": 4.1556,
"step": 74000
},
{
"epoch": 2.9638814447422104,
"grad_norm": 0.4734891653060913,
"learning_rate": 0.001,
"loss": 4.155,
"step": 74100
},
{
"epoch": 2.96788128474861,
"grad_norm": 0.5034810900688171,
"learning_rate": 0.001,
"loss": 4.1572,
"step": 74200
},
{
"epoch": 2.9718811247550097,
"grad_norm": 0.4262826144695282,
"learning_rate": 0.001,
"loss": 4.1591,
"step": 74300
},
{
"epoch": 2.9758809647614095,
"grad_norm": 0.46682965755462646,
"learning_rate": 0.001,
"loss": 4.1586,
"step": 74400
},
{
"epoch": 2.9798808047678094,
"grad_norm": 0.5210826992988586,
"learning_rate": 0.001,
"loss": 4.1586,
"step": 74500
},
{
"epoch": 2.9838806447742092,
"grad_norm": 0.42274290323257446,
"learning_rate": 0.001,
"loss": 4.1551,
"step": 74600
},
{
"epoch": 2.9878804847806086,
"grad_norm": 0.48334646224975586,
"learning_rate": 0.001,
"loss": 4.1543,
"step": 74700
},
{
"epoch": 2.9918803247870085,
"grad_norm": 0.4629717469215393,
"learning_rate": 0.001,
"loss": 4.1555,
"step": 74800
},
{
"epoch": 2.9958801647934084,
"grad_norm": 0.40987247228622437,
"learning_rate": 0.001,
"loss": 4.1594,
"step": 74900
},
{
"epoch": 2.9998800047998078,
"grad_norm": 0.48736295104026794,
"learning_rate": 0.001,
"loss": 4.1586,
"step": 75000
},
{
"epoch": 3.0038798448062076,
"grad_norm": 0.5455026626586914,
"learning_rate": 0.001,
"loss": 4.1533,
"step": 75100
},
{
"epoch": 3.0078796848126075,
"grad_norm": 0.4381161630153656,
"learning_rate": 0.001,
"loss": 4.1543,
"step": 75200
},
{
"epoch": 3.0118795248190073,
"grad_norm": 0.48587241768836975,
"learning_rate": 0.001,
"loss": 4.1544,
"step": 75300
},
{
"epoch": 3.015879364825407,
"grad_norm": 0.47117552161216736,
"learning_rate": 0.001,
"loss": 4.1539,
"step": 75400
},
{
"epoch": 3.0198792048318066,
"grad_norm": 0.4904659688472748,
"learning_rate": 0.001,
"loss": 4.1543,
"step": 75500
},
{
"epoch": 3.0238790448382065,
"grad_norm": 0.5004945397377014,
"learning_rate": 0.001,
"loss": 4.1548,
"step": 75600
},
{
"epoch": 3.0278788848446063,
"grad_norm": 0.42175960540771484,
"learning_rate": 0.001,
"loss": 4.1515,
"step": 75700
},
{
"epoch": 3.0318787248510057,
"grad_norm": 0.5447273850440979,
"learning_rate": 0.001,
"loss": 4.1531,
"step": 75800
},
{
"epoch": 3.0358785648574056,
"grad_norm": 0.4705660939216614,
"learning_rate": 0.001,
"loss": 4.1498,
"step": 75900
},
{
"epoch": 3.0398784048638055,
"grad_norm": 0.5477752685546875,
"learning_rate": 0.001,
"loss": 4.1533,
"step": 76000
},
{
"epoch": 3.0438782448702053,
"grad_norm": 0.47758665680885315,
"learning_rate": 0.001,
"loss": 4.1538,
"step": 76100
},
{
"epoch": 3.0478780848766047,
"grad_norm": 0.39745718240737915,
"learning_rate": 0.001,
"loss": 4.1558,
"step": 76200
},
{
"epoch": 3.0518779248830046,
"grad_norm": 0.4436202049255371,
"learning_rate": 0.001,
"loss": 4.1532,
"step": 76300
},
{
"epoch": 3.0558777648894044,
"grad_norm": 0.5916080474853516,
"learning_rate": 0.001,
"loss": 4.154,
"step": 76400
},
{
"epoch": 3.0598776048958043,
"grad_norm": 0.5111138224601746,
"learning_rate": 0.001,
"loss": 4.15,
"step": 76500
},
{
"epoch": 3.0638774449022037,
"grad_norm": 0.48636212944984436,
"learning_rate": 0.001,
"loss": 4.1509,
"step": 76600
},
{
"epoch": 3.0678772849086036,
"grad_norm": 0.4602707326412201,
"learning_rate": 0.001,
"loss": 4.1529,
"step": 76700
},
{
"epoch": 3.0718771249150034,
"grad_norm": 0.4185924828052521,
"learning_rate": 0.001,
"loss": 4.1583,
"step": 76800
},
{
"epoch": 3.0758769649214033,
"grad_norm": 0.4830791652202606,
"learning_rate": 0.001,
"loss": 4.1498,
"step": 76900
},
{
"epoch": 3.0798768049278027,
"grad_norm": 0.4777405858039856,
"learning_rate": 0.001,
"loss": 4.1546,
"step": 77000
},
{
"epoch": 3.0838766449342025,
"grad_norm": 0.4124826192855835,
"learning_rate": 0.001,
"loss": 4.1537,
"step": 77100
},
{
"epoch": 3.0878764849406024,
"grad_norm": 0.387603759765625,
"learning_rate": 0.001,
"loss": 4.1574,
"step": 77200
},
{
"epoch": 3.0918763249470023,
"grad_norm": 0.43888458609580994,
"learning_rate": 0.001,
"loss": 4.1528,
"step": 77300
},
{
"epoch": 3.0958761649534017,
"grad_norm": 0.5398756265640259,
"learning_rate": 0.001,
"loss": 4.156,
"step": 77400
},
{
"epoch": 3.0998760049598015,
"grad_norm": 0.4512723982334137,
"learning_rate": 0.001,
"loss": 4.15,
"step": 77500
},
{
"epoch": 3.1038758449662014,
"grad_norm": 0.4444531500339508,
"learning_rate": 0.001,
"loss": 4.1544,
"step": 77600
},
{
"epoch": 3.1078756849726012,
"grad_norm": 0.5301286578178406,
"learning_rate": 0.001,
"loss": 4.153,
"step": 77700
},
{
"epoch": 3.1118755249790007,
"grad_norm": 0.45263248682022095,
"learning_rate": 0.001,
"loss": 4.157,
"step": 77800
},
{
"epoch": 3.1158753649854005,
"grad_norm": 0.46157121658325195,
"learning_rate": 0.001,
"loss": 4.1527,
"step": 77900
},
{
"epoch": 3.1198752049918004,
"grad_norm": 0.43224167823791504,
"learning_rate": 0.001,
"loss": 4.1547,
"step": 78000
},
{
"epoch": 3.1238750449982002,
"grad_norm": 0.42079129815101624,
"learning_rate": 0.001,
"loss": 4.1553,
"step": 78100
},
{
"epoch": 3.1278748850045996,
"grad_norm": 0.4730684161186218,
"learning_rate": 0.001,
"loss": 4.1531,
"step": 78200
},
{
"epoch": 3.1318747250109995,
"grad_norm": 0.48231276869773865,
"learning_rate": 0.001,
"loss": 4.1579,
"step": 78300
},
{
"epoch": 3.1358745650173994,
"grad_norm": 0.4426518380641937,
"learning_rate": 0.001,
"loss": 4.1538,
"step": 78400
},
{
"epoch": 3.139874405023799,
"grad_norm": 0.5078949928283691,
"learning_rate": 0.001,
"loss": 4.1521,
"step": 78500
},
{
"epoch": 3.1438742450301986,
"grad_norm": 0.4642763137817383,
"learning_rate": 0.001,
"loss": 4.1543,
"step": 78600
},
{
"epoch": 3.1478740850365985,
"grad_norm": 0.43856772780418396,
"learning_rate": 0.001,
"loss": 4.1555,
"step": 78700
},
{
"epoch": 3.1518739250429983,
"grad_norm": 0.49219101667404175,
"learning_rate": 0.001,
"loss": 4.1534,
"step": 78800
},
{
"epoch": 3.155873765049398,
"grad_norm": 0.4498043954372406,
"learning_rate": 0.001,
"loss": 4.153,
"step": 78900
},
{
"epoch": 3.1598736050557976,
"grad_norm": 0.48328474164009094,
"learning_rate": 0.001,
"loss": 4.1548,
"step": 79000
},
{
"epoch": 3.1638734450621975,
"grad_norm": 0.510409951210022,
"learning_rate": 0.001,
"loss": 4.1513,
"step": 79100
},
{
"epoch": 3.1678732850685973,
"grad_norm": 0.5685029625892639,
"learning_rate": 0.001,
"loss": 4.1528,
"step": 79200
},
{
"epoch": 3.171873125074997,
"grad_norm": 0.5003494620323181,
"learning_rate": 0.001,
"loss": 4.153,
"step": 79300
},
{
"epoch": 3.1758729650813966,
"grad_norm": 0.42618006467819214,
"learning_rate": 0.001,
"loss": 4.1522,
"step": 79400
},
{
"epoch": 3.1798728050877965,
"grad_norm": 0.5276270508766174,
"learning_rate": 0.001,
"loss": 4.1562,
"step": 79500
},
{
"epoch": 3.1838726450941963,
"grad_norm": 0.4651762545108795,
"learning_rate": 0.001,
"loss": 4.1545,
"step": 79600
},
{
"epoch": 3.187872485100596,
"grad_norm": 0.5323458313941956,
"learning_rate": 0.001,
"loss": 4.1568,
"step": 79700
},
{
"epoch": 3.1918723251069956,
"grad_norm": 0.48029133677482605,
"learning_rate": 0.001,
"loss": 4.1571,
"step": 79800
},
{
"epoch": 3.1958721651133954,
"grad_norm": 0.40680381655693054,
"learning_rate": 0.001,
"loss": 4.1521,
"step": 79900
},
{
"epoch": 3.1998720051197953,
"grad_norm": 0.5401845574378967,
"learning_rate": 0.001,
"loss": 4.1551,
"step": 80000
},
{
"epoch": 3.203871845126195,
"grad_norm": 0.4607747197151184,
"learning_rate": 0.001,
"loss": 4.1503,
"step": 80100
},
{
"epoch": 3.2078716851325946,
"grad_norm": 0.4039115011692047,
"learning_rate": 0.001,
"loss": 4.1514,
"step": 80200
},
{
"epoch": 3.2118715251389944,
"grad_norm": 0.47579532861709595,
"learning_rate": 0.001,
"loss": 4.1514,
"step": 80300
},
{
"epoch": 3.2158713651453943,
"grad_norm": 0.43037813901901245,
"learning_rate": 0.001,
"loss": 4.1555,
"step": 80400
},
{
"epoch": 3.2198712051517937,
"grad_norm": 0.44526252150535583,
"learning_rate": 0.001,
"loss": 4.1538,
"step": 80500
},
{
"epoch": 3.2238710451581936,
"grad_norm": 0.43144863843917847,
"learning_rate": 0.001,
"loss": 4.1521,
"step": 80600
},
{
"epoch": 3.2278708851645934,
"grad_norm": 0.5244666934013367,
"learning_rate": 0.001,
"loss": 4.1526,
"step": 80700
},
{
"epoch": 3.2318707251709933,
"grad_norm": 0.569317638874054,
"learning_rate": 0.001,
"loss": 4.155,
"step": 80800
},
{
"epoch": 3.235870565177393,
"grad_norm": 0.48165997862815857,
"learning_rate": 0.001,
"loss": 4.1549,
"step": 80900
},
{
"epoch": 3.2398704051837925,
"grad_norm": 0.3605559170246124,
"learning_rate": 0.001,
"loss": 4.1521,
"step": 81000
},
{
"epoch": 3.2438702451901924,
"grad_norm": 0.4918169379234314,
"learning_rate": 0.001,
"loss": 4.1509,
"step": 81100
},
{
"epoch": 3.2478700851965923,
"grad_norm": 0.4783216714859009,
"learning_rate": 0.001,
"loss": 4.1556,
"step": 81200
},
{
"epoch": 3.2518699252029917,
"grad_norm": 0.44440028071403503,
"learning_rate": 0.001,
"loss": 4.1521,
"step": 81300
},
{
"epoch": 3.2558697652093915,
"grad_norm": 0.5073365569114685,
"learning_rate": 0.001,
"loss": 4.1559,
"step": 81400
},
{
"epoch": 3.2598696052157914,
"grad_norm": 0.47869718074798584,
"learning_rate": 0.001,
"loss": 4.1554,
"step": 81500
},
{
"epoch": 3.2638694452221912,
"grad_norm": 0.4230363667011261,
"learning_rate": 0.001,
"loss": 4.1493,
"step": 81600
},
{
"epoch": 3.267869285228591,
"grad_norm": 0.4750482738018036,
"learning_rate": 0.001,
"loss": 4.1528,
"step": 81700
},
{
"epoch": 3.2718691252349905,
"grad_norm": 0.5645243525505066,
"learning_rate": 0.001,
"loss": 4.1526,
"step": 81800
},
{
"epoch": 3.2758689652413904,
"grad_norm": 0.41942843794822693,
"learning_rate": 0.001,
"loss": 4.1512,
"step": 81900
},
{
"epoch": 3.2798688052477902,
"grad_norm": 0.4255695044994354,
"learning_rate": 0.001,
"loss": 4.1518,
"step": 82000
},
{
"epoch": 3.2838686452541896,
"grad_norm": 0.4215909242630005,
"learning_rate": 0.001,
"loss": 4.1568,
"step": 82100
},
{
"epoch": 3.2878684852605895,
"grad_norm": 0.4049839973449707,
"learning_rate": 0.001,
"loss": 4.1534,
"step": 82200
},
{
"epoch": 3.2918683252669894,
"grad_norm": 0.514345109462738,
"learning_rate": 0.001,
"loss": 4.1492,
"step": 82300
},
{
"epoch": 3.295868165273389,
"grad_norm": 0.43098345398902893,
"learning_rate": 0.001,
"loss": 4.1522,
"step": 82400
},
{
"epoch": 3.2998680052797886,
"grad_norm": 0.4352331757545471,
"learning_rate": 0.001,
"loss": 4.1524,
"step": 82500
},
{
"epoch": 3.3038678452861885,
"grad_norm": 0.4635871946811676,
"learning_rate": 0.001,
"loss": 4.1526,
"step": 82600
},
{
"epoch": 3.3078676852925883,
"grad_norm": 0.41384679079055786,
"learning_rate": 0.001,
"loss": 4.1548,
"step": 82700
},
{
"epoch": 3.311867525298988,
"grad_norm": 0.39796626567840576,
"learning_rate": 0.001,
"loss": 4.1544,
"step": 82800
},
{
"epoch": 3.3158673653053876,
"grad_norm": 0.40175408124923706,
"learning_rate": 0.001,
"loss": 4.155,
"step": 82900
},
{
"epoch": 3.3198672053117875,
"grad_norm": 0.4152776598930359,
"learning_rate": 0.001,
"loss": 4.1555,
"step": 83000
},
{
"epoch": 3.3238670453181873,
"grad_norm": 0.5190226435661316,
"learning_rate": 0.001,
"loss": 4.1571,
"step": 83100
},
{
"epoch": 3.327866885324587,
"grad_norm": 0.43292152881622314,
"learning_rate": 0.001,
"loss": 4.1502,
"step": 83200
},
{
"epoch": 3.3318667253309866,
"grad_norm": 0.4904835522174835,
"learning_rate": 0.001,
"loss": 4.1515,
"step": 83300
},
{
"epoch": 3.3358665653373865,
"grad_norm": 0.5600055456161499,
"learning_rate": 0.001,
"loss": 4.1497,
"step": 83400
},
{
"epoch": 3.3398664053437863,
"grad_norm": 0.5315993428230286,
"learning_rate": 0.001,
"loss": 4.1564,
"step": 83500
},
{
"epoch": 3.343866245350186,
"grad_norm": 0.4802263081073761,
"learning_rate": 0.001,
"loss": 4.1557,
"step": 83600
},
{
"epoch": 3.3478660853565856,
"grad_norm": 0.3861168324947357,
"learning_rate": 0.001,
"loss": 4.1547,
"step": 83700
},
{
"epoch": 3.3518659253629854,
"grad_norm": 0.5466539263725281,
"learning_rate": 0.001,
"loss": 4.1539,
"step": 83800
},
{
"epoch": 3.3558657653693853,
"grad_norm": 0.506841242313385,
"learning_rate": 0.001,
"loss": 4.1531,
"step": 83900
},
{
"epoch": 3.359865605375785,
"grad_norm": 0.5451818704605103,
"learning_rate": 0.001,
"loss": 4.1548,
"step": 84000
},
{
"epoch": 3.3638654453821846,
"grad_norm": 0.48338380455970764,
"learning_rate": 0.001,
"loss": 4.1526,
"step": 84100
},
{
"epoch": 3.3678652853885844,
"grad_norm": 0.5230739116668701,
"learning_rate": 0.001,
"loss": 4.1523,
"step": 84200
},
{
"epoch": 3.3718651253949843,
"grad_norm": 0.433020681142807,
"learning_rate": 0.001,
"loss": 4.1514,
"step": 84300
},
{
"epoch": 3.3758649654013837,
"grad_norm": 0.45081406831741333,
"learning_rate": 0.001,
"loss": 4.1533,
"step": 84400
},
{
"epoch": 3.3798648054077836,
"grad_norm": 0.5491710305213928,
"learning_rate": 0.001,
"loss": 4.1544,
"step": 84500
},
{
"epoch": 3.3838646454141834,
"grad_norm": 0.46437957882881165,
"learning_rate": 0.001,
"loss": 4.1538,
"step": 84600
},
{
"epoch": 3.3878644854205833,
"grad_norm": 0.5560771822929382,
"learning_rate": 0.001,
"loss": 4.1527,
"step": 84700
},
{
"epoch": 3.391864325426983,
"grad_norm": 0.5027199983596802,
"learning_rate": 0.001,
"loss": 4.1487,
"step": 84800
},
{
"epoch": 3.3958641654333825,
"grad_norm": 0.42762041091918945,
"learning_rate": 0.001,
"loss": 4.1525,
"step": 84900
},
{
"epoch": 3.3998640054397824,
"grad_norm": 0.5043109655380249,
"learning_rate": 0.001,
"loss": 4.152,
"step": 85000
},
{
"epoch": 3.4038638454461823,
"grad_norm": 0.4508378207683563,
"learning_rate": 0.001,
"loss": 4.1499,
"step": 85100
},
{
"epoch": 3.4078636854525817,
"grad_norm": 0.3978097140789032,
"learning_rate": 0.001,
"loss": 4.152,
"step": 85200
},
{
"epoch": 3.4118635254589815,
"grad_norm": 0.5391075015068054,
"learning_rate": 0.001,
"loss": 4.1558,
"step": 85300
},
{
"epoch": 3.4158633654653814,
"grad_norm": 0.5179737210273743,
"learning_rate": 0.001,
"loss": 4.1554,
"step": 85400
},
{
"epoch": 3.4198632054717812,
"grad_norm": 0.451274037361145,
"learning_rate": 0.001,
"loss": 4.1521,
"step": 85500
},
{
"epoch": 3.423863045478181,
"grad_norm": 0.46372953057289124,
"learning_rate": 0.001,
"loss": 4.1499,
"step": 85600
},
{
"epoch": 3.4278628854845805,
"grad_norm": 0.44644874334335327,
"learning_rate": 0.001,
"loss": 4.1542,
"step": 85700
},
{
"epoch": 3.4318627254909804,
"grad_norm": 0.45447978377342224,
"learning_rate": 0.001,
"loss": 4.1551,
"step": 85800
},
{
"epoch": 3.4358625654973802,
"grad_norm": 0.47864270210266113,
"learning_rate": 0.001,
"loss": 4.1539,
"step": 85900
},
{
"epoch": 3.4398624055037796,
"grad_norm": 0.48053839802742004,
"learning_rate": 0.001,
"loss": 4.1525,
"step": 86000
},
{
"epoch": 3.4438622455101795,
"grad_norm": 0.42985475063323975,
"learning_rate": 0.001,
"loss": 4.1518,
"step": 86100
},
{
"epoch": 3.4478620855165794,
"grad_norm": 0.4910486042499542,
"learning_rate": 0.001,
"loss": 4.1515,
"step": 86200
},
{
"epoch": 3.451861925522979,
"grad_norm": 0.4845552146434784,
"learning_rate": 0.001,
"loss": 4.1533,
"step": 86300
},
{
"epoch": 3.455861765529379,
"grad_norm": 0.42696115374565125,
"learning_rate": 0.001,
"loss": 4.1563,
"step": 86400
},
{
"epoch": 3.4598616055357785,
"grad_norm": 0.45997726917266846,
"learning_rate": 0.001,
"loss": 4.1504,
"step": 86500
},
{
"epoch": 3.4638614455421783,
"grad_norm": 0.47192636132240295,
"learning_rate": 0.001,
"loss": 4.1512,
"step": 86600
},
{
"epoch": 3.467861285548578,
"grad_norm": 0.4781351387500763,
"learning_rate": 0.001,
"loss": 4.1568,
"step": 86700
},
{
"epoch": 3.4718611255549776,
"grad_norm": 0.47357264161109924,
"learning_rate": 0.001,
"loss": 4.1512,
"step": 86800
},
{
"epoch": 3.4758609655613775,
"grad_norm": 0.500704288482666,
"learning_rate": 0.001,
"loss": 4.1511,
"step": 86900
},
{
"epoch": 3.4798608055677773,
"grad_norm": 0.5305373668670654,
"learning_rate": 0.001,
"loss": 4.1556,
"step": 87000
},
{
"epoch": 3.483860645574177,
"grad_norm": 0.5612720251083374,
"learning_rate": 0.001,
"loss": 4.1538,
"step": 87100
},
{
"epoch": 3.4878604855805766,
"grad_norm": 0.47861745953559875,
"learning_rate": 0.001,
"loss": 4.1543,
"step": 87200
},
{
"epoch": 3.4918603255869765,
"grad_norm": 0.4624346196651459,
"learning_rate": 0.001,
"loss": 4.1479,
"step": 87300
},
{
"epoch": 3.4958601655933763,
"grad_norm": 0.4761544167995453,
"learning_rate": 0.001,
"loss": 4.1535,
"step": 87400
},
{
"epoch": 3.499860005599776,
"grad_norm": 0.4098283648490906,
"learning_rate": 0.001,
"loss": 4.1534,
"step": 87500
},
{
"epoch": 3.5038598456061756,
"grad_norm": 0.4998922646045685,
"learning_rate": 0.001,
"loss": 4.1471,
"step": 87600
},
{
"epoch": 3.5078596856125754,
"grad_norm": 0.4544636607170105,
"learning_rate": 0.001,
"loss": 4.1505,
"step": 87700
},
{
"epoch": 3.5118595256189753,
"grad_norm": 0.4588119387626648,
"learning_rate": 0.001,
"loss": 4.1544,
"step": 87800
},
{
"epoch": 3.515859365625375,
"grad_norm": 0.5002636313438416,
"learning_rate": 0.001,
"loss": 4.1502,
"step": 87900
},
{
"epoch": 3.5198592056317746,
"grad_norm": 0.4344749450683594,
"learning_rate": 0.001,
"loss": 4.1506,
"step": 88000
},
{
"epoch": 3.5238590456381744,
"grad_norm": 0.5412445664405823,
"learning_rate": 0.001,
"loss": 4.1506,
"step": 88100
},
{
"epoch": 3.5278588856445743,
"grad_norm": 0.45813220739364624,
"learning_rate": 0.001,
"loss": 4.1501,
"step": 88200
},
{
"epoch": 3.5318587256509737,
"grad_norm": 0.43678992986679077,
"learning_rate": 0.001,
"loss": 4.1528,
"step": 88300
},
{
"epoch": 3.5358585656573736,
"grad_norm": 0.5613416433334351,
"learning_rate": 0.001,
"loss": 4.1513,
"step": 88400
},
{
"epoch": 3.5398584056637734,
"grad_norm": 0.6545833945274353,
"learning_rate": 0.001,
"loss": 4.1531,
"step": 88500
},
{
"epoch": 3.5438582456701733,
"grad_norm": 0.4009111821651459,
"learning_rate": 0.001,
"loss": 4.1476,
"step": 88600
},
{
"epoch": 3.547858085676573,
"grad_norm": 0.5212500095367432,
"learning_rate": 0.001,
"loss": 4.1516,
"step": 88700
},
{
"epoch": 3.5518579256829725,
"grad_norm": 0.4273383915424347,
"learning_rate": 0.001,
"loss": 4.1499,
"step": 88800
},
{
"epoch": 3.5558577656893724,
"grad_norm": 0.4782946705818176,
"learning_rate": 0.001,
"loss": 4.1545,
"step": 88900
},
{
"epoch": 3.5598576056957723,
"grad_norm": 0.41619250178337097,
"learning_rate": 0.001,
"loss": 4.1534,
"step": 89000
},
{
"epoch": 3.5638574457021717,
"grad_norm": 0.4649808704853058,
"learning_rate": 0.001,
"loss": 4.154,
"step": 89100
},
{
"epoch": 3.5678572857085715,
"grad_norm": 0.44941842555999756,
"learning_rate": 0.001,
"loss": 4.1506,
"step": 89200
},
{
"epoch": 3.5718571257149714,
"grad_norm": 0.5763667821884155,
"learning_rate": 0.001,
"loss": 4.1527,
"step": 89300
},
{
"epoch": 3.5758569657213712,
"grad_norm": 0.5648437142372131,
"learning_rate": 0.001,
"loss": 4.1563,
"step": 89400
},
{
"epoch": 3.579856805727771,
"grad_norm": 0.5101251006126404,
"learning_rate": 0.001,
"loss": 4.1547,
"step": 89500
},
{
"epoch": 3.5838566457341705,
"grad_norm": 0.434222936630249,
"learning_rate": 0.001,
"loss": 4.1537,
"step": 89600
},
{
"epoch": 3.5878564857405704,
"grad_norm": 0.4032537341117859,
"learning_rate": 0.001,
"loss": 4.1495,
"step": 89700
},
{
"epoch": 3.5918563257469702,
"grad_norm": 0.4581545889377594,
"learning_rate": 0.001,
"loss": 4.1474,
"step": 89800
},
{
"epoch": 3.5958561657533696,
"grad_norm": 0.6091015338897705,
"learning_rate": 0.001,
"loss": 4.1492,
"step": 89900
},
{
"epoch": 3.5998560057597695,
"grad_norm": 0.5093620419502258,
"learning_rate": 0.001,
"loss": 4.1496,
"step": 90000
},
{
"epoch": 3.6038558457661694,
"grad_norm": 0.4329790771007538,
"learning_rate": 0.001,
"loss": 4.1494,
"step": 90100
},
{
"epoch": 3.607855685772569,
"grad_norm": 0.5041528344154358,
"learning_rate": 0.001,
"loss": 4.154,
"step": 90200
},
{
"epoch": 3.611855525778969,
"grad_norm": 0.3949008285999298,
"learning_rate": 0.001,
"loss": 4.1527,
"step": 90300
},
{
"epoch": 3.6158553657853685,
"grad_norm": 0.40398308634757996,
"learning_rate": 0.001,
"loss": 4.1507,
"step": 90400
},
{
"epoch": 3.6198552057917683,
"grad_norm": 0.4658049941062927,
"learning_rate": 0.001,
"loss": 4.1488,
"step": 90500
},
{
"epoch": 3.623855045798168,
"grad_norm": 0.4312227666378021,
"learning_rate": 0.001,
"loss": 4.1503,
"step": 90600
},
{
"epoch": 3.6278548858045676,
"grad_norm": 0.39520397782325745,
"learning_rate": 0.001,
"loss": 4.1486,
"step": 90700
},
{
"epoch": 3.6318547258109675,
"grad_norm": 0.4321967363357544,
"learning_rate": 0.001,
"loss": 4.1485,
"step": 90800
},
{
"epoch": 3.6358545658173673,
"grad_norm": 0.5055027604103088,
"learning_rate": 0.001,
"loss": 4.1507,
"step": 90900
},
{
"epoch": 3.639854405823767,
"grad_norm": 0.44665881991386414,
"learning_rate": 0.001,
"loss": 4.149,
"step": 91000
},
{
"epoch": 3.643854245830167,
"grad_norm": 0.48920923471450806,
"learning_rate": 0.001,
"loss": 4.1483,
"step": 91100
},
{
"epoch": 3.6478540858365665,
"grad_norm": 0.5461563467979431,
"learning_rate": 0.001,
"loss": 4.1524,
"step": 91200
},
{
"epoch": 3.6518539258429663,
"grad_norm": 0.47327253222465515,
"learning_rate": 0.001,
"loss": 4.1502,
"step": 91300
},
{
"epoch": 3.655853765849366,
"grad_norm": 0.478876531124115,
"learning_rate": 0.001,
"loss": 4.1512,
"step": 91400
},
{
"epoch": 3.6598536058557656,
"grad_norm": 0.42543497681617737,
"learning_rate": 0.001,
"loss": 4.1532,
"step": 91500
},
{
"epoch": 3.6638534458621654,
"grad_norm": 0.4441344738006592,
"learning_rate": 0.001,
"loss": 4.1484,
"step": 91600
},
{
"epoch": 3.6678532858685653,
"grad_norm": 0.4607570469379425,
"learning_rate": 0.001,
"loss": 4.1479,
"step": 91700
},
{
"epoch": 3.671853125874965,
"grad_norm": 0.45845574140548706,
"learning_rate": 0.001,
"loss": 4.1518,
"step": 91800
},
{
"epoch": 3.675852965881365,
"grad_norm": 0.5007107853889465,
"learning_rate": 0.001,
"loss": 4.1481,
"step": 91900
},
{
"epoch": 3.6798528058877644,
"grad_norm": 0.5496152639389038,
"learning_rate": 0.001,
"loss": 4.1514,
"step": 92000
},
{
"epoch": 3.6838526458941643,
"grad_norm": 0.49697238206863403,
"learning_rate": 0.001,
"loss": 4.1462,
"step": 92100
},
{
"epoch": 3.687852485900564,
"grad_norm": 0.46701979637145996,
"learning_rate": 0.001,
"loss": 4.149,
"step": 92200
},
{
"epoch": 3.6918523259069636,
"grad_norm": 0.4277583360671997,
"learning_rate": 0.001,
"loss": 4.1531,
"step": 92300
},
{
"epoch": 3.6958521659133634,
"grad_norm": 0.4613393545150757,
"learning_rate": 0.001,
"loss": 4.1477,
"step": 92400
},
{
"epoch": 3.6998520059197633,
"grad_norm": 0.5120405554771423,
"learning_rate": 0.001,
"loss": 4.1487,
"step": 92500
},
{
"epoch": 3.703851845926163,
"grad_norm": 0.5001824498176575,
"learning_rate": 0.001,
"loss": 4.1518,
"step": 92600
},
{
"epoch": 3.7078516859325625,
"grad_norm": 0.4255179762840271,
"learning_rate": 0.001,
"loss": 4.1511,
"step": 92700
},
{
"epoch": 3.7118515259389624,
"grad_norm": 0.48289310932159424,
"learning_rate": 0.001,
"loss": 4.1509,
"step": 92800
},
{
"epoch": 3.7158513659453623,
"grad_norm": 0.536251962184906,
"learning_rate": 0.001,
"loss": 4.1479,
"step": 92900
},
{
"epoch": 3.7198512059517617,
"grad_norm": 0.5439473390579224,
"learning_rate": 0.001,
"loss": 4.1511,
"step": 93000
},
{
"epoch": 3.7238510459581615,
"grad_norm": 0.4157579839229584,
"learning_rate": 0.001,
"loss": 4.1502,
"step": 93100
},
{
"epoch": 3.7278508859645614,
"grad_norm": 0.4220696985721588,
"learning_rate": 0.001,
"loss": 4.1461,
"step": 93200
},
{
"epoch": 3.7318507259709612,
"grad_norm": 0.4461278021335602,
"learning_rate": 0.001,
"loss": 4.1525,
"step": 93300
},
{
"epoch": 3.735850565977361,
"grad_norm": 0.5958127379417419,
"learning_rate": 0.001,
"loss": 4.1529,
"step": 93400
},
{
"epoch": 3.7398504059837605,
"grad_norm": 0.3829163908958435,
"learning_rate": 0.001,
"loss": 4.1517,
"step": 93500
},
{
"epoch": 3.7438502459901604,
"grad_norm": 0.43108895421028137,
"learning_rate": 0.001,
"loss": 4.1471,
"step": 93600
},
{
"epoch": 3.74785008599656,
"grad_norm": 0.5303171873092651,
"learning_rate": 0.001,
"loss": 4.1497,
"step": 93700
},
{
"epoch": 3.7518499260029596,
"grad_norm": 0.6245208382606506,
"learning_rate": 0.001,
"loss": 4.1461,
"step": 93800
},
{
"epoch": 3.7558497660093595,
"grad_norm": 0.4686441421508789,
"learning_rate": 0.001,
"loss": 4.1511,
"step": 93900
},
{
"epoch": 3.7598496060157593,
"grad_norm": 0.5424903035163879,
"learning_rate": 0.001,
"loss": 4.1473,
"step": 94000
},
{
"epoch": 3.763849446022159,
"grad_norm": 0.42156532406806946,
"learning_rate": 0.001,
"loss": 4.1499,
"step": 94100
},
{
"epoch": 3.767849286028559,
"grad_norm": 0.4944685399532318,
"learning_rate": 0.001,
"loss": 4.1486,
"step": 94200
},
{
"epoch": 3.7718491260349585,
"grad_norm": 0.46695804595947266,
"learning_rate": 0.001,
"loss": 4.1495,
"step": 94300
},
{
"epoch": 3.7758489660413583,
"grad_norm": 0.4274919033050537,
"learning_rate": 0.001,
"loss": 4.149,
"step": 94400
},
{
"epoch": 3.779848806047758,
"grad_norm": 0.5031160116195679,
"learning_rate": 0.001,
"loss": 4.1464,
"step": 94500
},
{
"epoch": 3.7838486460541576,
"grad_norm": 0.4584692716598511,
"learning_rate": 0.001,
"loss": 4.1483,
"step": 94600
},
{
"epoch": 3.7878484860605575,
"grad_norm": 0.4695260524749756,
"learning_rate": 0.001,
"loss": 4.1496,
"step": 94700
},
{
"epoch": 3.7918483260669573,
"grad_norm": 0.4564335346221924,
"learning_rate": 0.001,
"loss": 4.1472,
"step": 94800
},
{
"epoch": 3.795848166073357,
"grad_norm": 0.47409653663635254,
"learning_rate": 0.001,
"loss": 4.15,
"step": 94900
},
{
"epoch": 3.799848006079757,
"grad_norm": 0.4272071123123169,
"learning_rate": 0.001,
"loss": 4.1465,
"step": 95000
},
{
"epoch": 3.8038478460861564,
"grad_norm": 0.5578600764274597,
"learning_rate": 0.001,
"loss": 4.1486,
"step": 95100
},
{
"epoch": 3.8078476860925563,
"grad_norm": 0.503226637840271,
"learning_rate": 0.001,
"loss": 4.1463,
"step": 95200
},
{
"epoch": 3.811847526098956,
"grad_norm": 0.4407929480075836,
"learning_rate": 0.001,
"loss": 4.1503,
"step": 95300
},
{
"epoch": 3.8158473661053556,
"grad_norm": 0.3911983370780945,
"learning_rate": 0.001,
"loss": 4.1478,
"step": 95400
},
{
"epoch": 3.8198472061117554,
"grad_norm": 0.4753795564174652,
"learning_rate": 0.001,
"loss": 4.1465,
"step": 95500
},
{
"epoch": 3.8238470461181553,
"grad_norm": 0.5648890733718872,
"learning_rate": 0.001,
"loss": 4.1505,
"step": 95600
},
{
"epoch": 3.827846886124555,
"grad_norm": 0.5674782991409302,
"learning_rate": 0.001,
"loss": 4.15,
"step": 95700
},
{
"epoch": 3.831846726130955,
"grad_norm": 0.45150429010391235,
"learning_rate": 0.001,
"loss": 4.1479,
"step": 95800
},
{
"epoch": 3.8358465661373544,
"grad_norm": 0.44328397512435913,
"learning_rate": 0.001,
"loss": 4.1486,
"step": 95900
},
{
"epoch": 3.8398464061437543,
"grad_norm": 0.5726007223129272,
"learning_rate": 0.001,
"loss": 4.1518,
"step": 96000
},
{
"epoch": 3.843846246150154,
"grad_norm": 0.4201109707355499,
"learning_rate": 0.001,
"loss": 4.1511,
"step": 96100
},
{
"epoch": 3.8478460861565535,
"grad_norm": 0.4402865469455719,
"learning_rate": 0.001,
"loss": 4.1442,
"step": 96200
},
{
"epoch": 3.8518459261629534,
"grad_norm": 0.41505661606788635,
"learning_rate": 0.001,
"loss": 4.1523,
"step": 96300
},
{
"epoch": 3.8558457661693533,
"grad_norm": 0.4434868097305298,
"learning_rate": 0.001,
"loss": 4.1508,
"step": 96400
},
{
"epoch": 3.859845606175753,
"grad_norm": 0.44477754831314087,
"learning_rate": 0.001,
"loss": 4.1495,
"step": 96500
},
{
"epoch": 3.863845446182153,
"grad_norm": 0.44696667790412903,
"learning_rate": 0.001,
"loss": 4.1491,
"step": 96600
},
{
"epoch": 3.8678452861885524,
"grad_norm": 0.4718262553215027,
"learning_rate": 0.001,
"loss": 4.1483,
"step": 96700
},
{
"epoch": 3.8718451261949522,
"grad_norm": 0.48947885632514954,
"learning_rate": 0.001,
"loss": 4.1486,
"step": 96800
},
{
"epoch": 3.875844966201352,
"grad_norm": 0.42789411544799805,
"learning_rate": 0.001,
"loss": 4.1496,
"step": 96900
},
{
"epoch": 3.8798448062077515,
"grad_norm": 0.46309271454811096,
"learning_rate": 0.001,
"loss": 4.1467,
"step": 97000
},
{
"epoch": 3.8838446462141514,
"grad_norm": 0.5170295834541321,
"learning_rate": 0.001,
"loss": 4.1456,
"step": 97100
},
{
"epoch": 3.8878444862205512,
"grad_norm": 0.4399054944515228,
"learning_rate": 0.001,
"loss": 4.1451,
"step": 97200
},
{
"epoch": 3.891844326226951,
"grad_norm": 0.5585961937904358,
"learning_rate": 0.001,
"loss": 4.1487,
"step": 97300
},
{
"epoch": 3.8958441662333505,
"grad_norm": 0.4246786832809448,
"learning_rate": 0.001,
"loss": 4.1491,
"step": 97400
},
{
"epoch": 3.8998440062397504,
"grad_norm": 0.44548454880714417,
"learning_rate": 0.001,
"loss": 4.1479,
"step": 97500
},
{
"epoch": 3.90384384624615,
"grad_norm": 0.43676796555519104,
"learning_rate": 0.001,
"loss": 4.1451,
"step": 97600
},
{
"epoch": 3.9078436862525496,
"grad_norm": 0.494795024394989,
"learning_rate": 0.001,
"loss": 4.1469,
"step": 97700
},
{
"epoch": 3.9118435262589495,
"grad_norm": 0.5050995349884033,
"learning_rate": 0.001,
"loss": 4.1478,
"step": 97800
},
{
"epoch": 3.9158433662653493,
"grad_norm": 0.38782063126564026,
"learning_rate": 0.001,
"loss": 4.1497,
"step": 97900
},
{
"epoch": 3.919843206271749,
"grad_norm": 0.4937835931777954,
"learning_rate": 0.001,
"loss": 4.1456,
"step": 98000
},
{
"epoch": 3.923843046278149,
"grad_norm": 0.45296600461006165,
"learning_rate": 0.001,
"loss": 4.1495,
"step": 98100
},
{
"epoch": 3.9278428862845485,
"grad_norm": 0.40895891189575195,
"learning_rate": 0.001,
"loss": 4.1471,
"step": 98200
},
{
"epoch": 3.9318427262909483,
"grad_norm": 0.4339890480041504,
"learning_rate": 0.001,
"loss": 4.1465,
"step": 98300
},
{
"epoch": 3.935842566297348,
"grad_norm": 0.395710825920105,
"learning_rate": 0.001,
"loss": 4.1454,
"step": 98400
},
{
"epoch": 3.9398424063037476,
"grad_norm": 0.43159592151641846,
"learning_rate": 0.001,
"loss": 4.1436,
"step": 98500
},
{
"epoch": 3.9438422463101475,
"grad_norm": 0.4928899109363556,
"learning_rate": 0.001,
"loss": 4.1465,
"step": 98600
},
{
"epoch": 3.9478420863165473,
"grad_norm": 0.5097815990447998,
"learning_rate": 0.001,
"loss": 4.1464,
"step": 98700
},
{
"epoch": 3.951841926322947,
"grad_norm": 0.4376477003097534,
"learning_rate": 0.001,
"loss": 4.1485,
"step": 98800
},
{
"epoch": 3.955841766329347,
"grad_norm": 0.5436988472938538,
"learning_rate": 0.001,
"loss": 4.1502,
"step": 98900
},
{
"epoch": 3.9598416063357464,
"grad_norm": 0.47442567348480225,
"learning_rate": 0.001,
"loss": 4.1452,
"step": 99000
},
{
"epoch": 3.9638414463421463,
"grad_norm": 0.512935221195221,
"learning_rate": 0.001,
"loss": 4.1537,
"step": 99100
},
{
"epoch": 3.967841286348546,
"grad_norm": 0.44137364625930786,
"learning_rate": 0.001,
"loss": 4.148,
"step": 99200
},
{
"epoch": 3.9718411263549456,
"grad_norm": 0.4600997269153595,
"learning_rate": 0.001,
"loss": 4.1469,
"step": 99300
},
{
"epoch": 3.9758409663613454,
"grad_norm": 0.44996100664138794,
"learning_rate": 0.001,
"loss": 4.1487,
"step": 99400
},
{
"epoch": 3.9798408063677453,
"grad_norm": 0.42435500025749207,
"learning_rate": 0.001,
"loss": 4.148,
"step": 99500
},
{
"epoch": 3.983840646374145,
"grad_norm": 0.47722697257995605,
"learning_rate": 0.001,
"loss": 4.1451,
"step": 99600
},
{
"epoch": 3.987840486380545,
"grad_norm": 0.6269773244857788,
"learning_rate": 0.001,
"loss": 4.1473,
"step": 99700
},
{
"epoch": 3.9918403263869444,
"grad_norm": 0.4844716787338257,
"learning_rate": 0.001,
"loss": 4.146,
"step": 99800
},
{
"epoch": 3.9958401663933443,
"grad_norm": 0.43544551730155945,
"learning_rate": 0.001,
"loss": 4.1479,
"step": 99900
},
{
"epoch": 3.999840006399744,
"grad_norm": 0.5435088872909546,
"learning_rate": 0.001,
"loss": 4.1454,
"step": 100000
},
{
"epoch": 4.0038398464061435,
"grad_norm": 0.39157047867774963,
"learning_rate": 0.001,
"loss": 4.1453,
"step": 100100
},
{
"epoch": 4.007839686412543,
"grad_norm": 0.42717739939689636,
"learning_rate": 0.001,
"loss": 4.1492,
"step": 100200
},
{
"epoch": 4.011839526418943,
"grad_norm": 0.5479187965393066,
"learning_rate": 0.001,
"loss": 4.1474,
"step": 100300
},
{
"epoch": 4.015839366425343,
"grad_norm": 0.39487773180007935,
"learning_rate": 0.001,
"loss": 4.148,
"step": 100400
},
{
"epoch": 4.019839206431743,
"grad_norm": 0.49917787313461304,
"learning_rate": 0.001,
"loss": 4.1477,
"step": 100500
},
{
"epoch": 4.023839046438143,
"grad_norm": 0.5411247611045837,
"learning_rate": 0.001,
"loss": 4.1453,
"step": 100600
},
{
"epoch": 4.027838886444542,
"grad_norm": 0.4550989866256714,
"learning_rate": 0.001,
"loss": 4.1474,
"step": 100700
},
{
"epoch": 4.031838726450942,
"grad_norm": 0.44234633445739746,
"learning_rate": 0.001,
"loss": 4.1453,
"step": 100800
},
{
"epoch": 4.0358385664573415,
"grad_norm": 0.42147624492645264,
"learning_rate": 0.001,
"loss": 4.1424,
"step": 100900
},
{
"epoch": 4.039838406463741,
"grad_norm": 0.43127307295799255,
"learning_rate": 0.001,
"loss": 4.1434,
"step": 101000
},
{
"epoch": 4.043838246470141,
"grad_norm": 0.5709433555603027,
"learning_rate": 0.001,
"loss": 4.1443,
"step": 101100
},
{
"epoch": 4.047838086476541,
"grad_norm": 0.5325762033462524,
"learning_rate": 0.001,
"loss": 4.1414,
"step": 101200
},
{
"epoch": 4.051837926482941,
"grad_norm": 0.5265848636627197,
"learning_rate": 0.001,
"loss": 4.1492,
"step": 101300
},
{
"epoch": 4.055837766489341,
"grad_norm": 0.4727579653263092,
"learning_rate": 0.001,
"loss": 4.1522,
"step": 101400
},
{
"epoch": 4.05983760649574,
"grad_norm": 0.4549713432788849,
"learning_rate": 0.001,
"loss": 4.1404,
"step": 101500
},
{
"epoch": 4.06383744650214,
"grad_norm": 0.4658414125442505,
"learning_rate": 0.001,
"loss": 4.1482,
"step": 101600
},
{
"epoch": 4.0678372865085395,
"grad_norm": 0.4108069837093353,
"learning_rate": 0.001,
"loss": 4.1426,
"step": 101700
},
{
"epoch": 4.071837126514939,
"grad_norm": 0.5544761419296265,
"learning_rate": 0.001,
"loss": 4.1447,
"step": 101800
},
{
"epoch": 4.075836966521339,
"grad_norm": 0.3932327628135681,
"learning_rate": 0.001,
"loss": 4.1436,
"step": 101900
},
{
"epoch": 4.079836806527739,
"grad_norm": 0.5421287417411804,
"learning_rate": 0.001,
"loss": 4.1473,
"step": 102000
},
{
"epoch": 4.083836646534139,
"grad_norm": 0.4574621915817261,
"learning_rate": 0.001,
"loss": 4.1413,
"step": 102100
},
{
"epoch": 4.087836486540539,
"grad_norm": 0.47430068254470825,
"learning_rate": 0.001,
"loss": 4.1469,
"step": 102200
},
{
"epoch": 4.091836326546938,
"grad_norm": 0.4744085967540741,
"learning_rate": 0.001,
"loss": 4.1456,
"step": 102300
},
{
"epoch": 4.095836166553338,
"grad_norm": 0.48127326369285583,
"learning_rate": 0.001,
"loss": 4.1443,
"step": 102400
},
{
"epoch": 4.0998360065597375,
"grad_norm": 0.4618822932243347,
"learning_rate": 0.001,
"loss": 4.1474,
"step": 102500
},
{
"epoch": 4.103835846566137,
"grad_norm": 0.43074139952659607,
"learning_rate": 0.001,
"loss": 4.1454,
"step": 102600
},
{
"epoch": 4.107835686572537,
"grad_norm": 0.47091934084892273,
"learning_rate": 0.001,
"loss": 4.1447,
"step": 102700
},
{
"epoch": 4.111835526578937,
"grad_norm": 0.3798442780971527,
"learning_rate": 0.001,
"loss": 4.1447,
"step": 102800
},
{
"epoch": 4.115835366585337,
"grad_norm": 0.4601074755191803,
"learning_rate": 0.001,
"loss": 4.1504,
"step": 102900
},
{
"epoch": 4.119835206591737,
"grad_norm": 0.43777865171432495,
"learning_rate": 0.001,
"loss": 4.1473,
"step": 103000
},
{
"epoch": 4.123835046598136,
"grad_norm": 0.45747852325439453,
"learning_rate": 0.001,
"loss": 4.1429,
"step": 103100
},
{
"epoch": 4.127834886604536,
"grad_norm": 0.5466395020484924,
"learning_rate": 0.001,
"loss": 4.1458,
"step": 103200
},
{
"epoch": 4.131834726610935,
"grad_norm": 0.5395704507827759,
"learning_rate": 0.001,
"loss": 4.1456,
"step": 103300
},
{
"epoch": 4.135834566617335,
"grad_norm": 0.4724808931350708,
"learning_rate": 0.001,
"loss": 4.1468,
"step": 103400
},
{
"epoch": 4.139834406623735,
"grad_norm": 0.47559893131256104,
"learning_rate": 0.001,
"loss": 4.1452,
"step": 103500
},
{
"epoch": 4.143834246630135,
"grad_norm": 0.4290676712989807,
"learning_rate": 0.001,
"loss": 4.1469,
"step": 103600
},
{
"epoch": 4.147834086636535,
"grad_norm": 0.39940178394317627,
"learning_rate": 0.001,
"loss": 4.147,
"step": 103700
},
{
"epoch": 4.151833926642935,
"grad_norm": 0.45378994941711426,
"learning_rate": 0.001,
"loss": 4.1441,
"step": 103800
},
{
"epoch": 4.155833766649334,
"grad_norm": 0.46410059928894043,
"learning_rate": 0.001,
"loss": 4.1481,
"step": 103900
},
{
"epoch": 4.1598336066557335,
"grad_norm": 0.45726585388183594,
"learning_rate": 0.001,
"loss": 4.1477,
"step": 104000
},
{
"epoch": 4.163833446662133,
"grad_norm": 0.42764076590538025,
"learning_rate": 0.001,
"loss": 4.1468,
"step": 104100
},
{
"epoch": 4.167833286668533,
"grad_norm": 0.47908028960227966,
"learning_rate": 0.001,
"loss": 4.1426,
"step": 104200
},
{
"epoch": 4.171833126674933,
"grad_norm": 0.5179200172424316,
"learning_rate": 0.001,
"loss": 4.1402,
"step": 104300
},
{
"epoch": 4.175832966681333,
"grad_norm": 0.46754130721092224,
"learning_rate": 0.001,
"loss": 4.146,
"step": 104400
},
{
"epoch": 4.179832806687733,
"grad_norm": 0.45480966567993164,
"learning_rate": 0.001,
"loss": 4.1457,
"step": 104500
},
{
"epoch": 4.183832646694132,
"grad_norm": 0.42622312903404236,
"learning_rate": 0.001,
"loss": 4.1406,
"step": 104600
},
{
"epoch": 4.187832486700532,
"grad_norm": 0.37732553482055664,
"learning_rate": 0.001,
"loss": 4.1425,
"step": 104700
},
{
"epoch": 4.1918323267069315,
"grad_norm": 0.5029783844947815,
"learning_rate": 0.001,
"loss": 4.1434,
"step": 104800
},
{
"epoch": 4.195832166713331,
"grad_norm": 0.4873427152633667,
"learning_rate": 0.001,
"loss": 4.1432,
"step": 104900
},
{
"epoch": 4.199832006719731,
"grad_norm": 0.4739370048046112,
"learning_rate": 0.001,
"loss": 4.1463,
"step": 105000
},
{
"epoch": 4.203831846726131,
"grad_norm": 0.4919280409812927,
"learning_rate": 0.001,
"loss": 4.145,
"step": 105100
},
{
"epoch": 4.207831686732531,
"grad_norm": 0.44847992062568665,
"learning_rate": 0.001,
"loss": 4.1447,
"step": 105200
},
{
"epoch": 4.211831526738931,
"grad_norm": 0.5035785436630249,
"learning_rate": 0.001,
"loss": 4.1418,
"step": 105300
},
{
"epoch": 4.21583136674533,
"grad_norm": 0.5161508321762085,
"learning_rate": 0.001,
"loss": 4.1436,
"step": 105400
},
{
"epoch": 4.21983120675173,
"grad_norm": 0.44143009185791016,
"learning_rate": 0.001,
"loss": 4.1476,
"step": 105500
},
{
"epoch": 4.2238310467581295,
"grad_norm": 0.38801082968711853,
"learning_rate": 0.001,
"loss": 4.1484,
"step": 105600
},
{
"epoch": 4.227830886764529,
"grad_norm": 0.46244215965270996,
"learning_rate": 0.001,
"loss": 4.1457,
"step": 105700
},
{
"epoch": 4.231830726770929,
"grad_norm": 0.4244415760040283,
"learning_rate": 0.001,
"loss": 4.1458,
"step": 105800
},
{
"epoch": 4.235830566777329,
"grad_norm": 0.4295971095561981,
"learning_rate": 0.001,
"loss": 4.1489,
"step": 105900
},
{
"epoch": 4.239830406783729,
"grad_norm": 0.4482729136943817,
"learning_rate": 0.001,
"loss": 4.1448,
"step": 106000
},
{
"epoch": 4.243830246790129,
"grad_norm": 0.4174524247646332,
"learning_rate": 0.001,
"loss": 4.1432,
"step": 106100
},
{
"epoch": 4.247830086796528,
"grad_norm": 0.37934377789497375,
"learning_rate": 0.001,
"loss": 4.1404,
"step": 106200
},
{
"epoch": 4.251829926802928,
"grad_norm": 0.439449280500412,
"learning_rate": 0.001,
"loss": 4.1437,
"step": 106300
},
{
"epoch": 4.2558297668093275,
"grad_norm": 0.42520901560783386,
"learning_rate": 0.001,
"loss": 4.1464,
"step": 106400
},
{
"epoch": 4.259829606815727,
"grad_norm": 0.4702022671699524,
"learning_rate": 0.001,
"loss": 4.1434,
"step": 106500
},
{
"epoch": 4.263829446822127,
"grad_norm": 0.4666096568107605,
"learning_rate": 0.001,
"loss": 4.1453,
"step": 106600
},
{
"epoch": 4.267829286828527,
"grad_norm": 0.47754520177841187,
"learning_rate": 0.001,
"loss": 4.1441,
"step": 106700
},
{
"epoch": 4.271829126834927,
"grad_norm": 0.438281387090683,
"learning_rate": 0.001,
"loss": 4.1477,
"step": 106800
},
{
"epoch": 4.275828966841327,
"grad_norm": 0.4417981505393982,
"learning_rate": 0.001,
"loss": 4.1446,
"step": 106900
},
{
"epoch": 4.279828806847726,
"grad_norm": 0.4685574173927307,
"learning_rate": 0.001,
"loss": 4.1458,
"step": 107000
},
{
"epoch": 4.283828646854126,
"grad_norm": 0.53219074010849,
"learning_rate": 0.001,
"loss": 4.146,
"step": 107100
},
{
"epoch": 4.287828486860525,
"grad_norm": 0.453630268573761,
"learning_rate": 0.001,
"loss": 4.1466,
"step": 107200
},
{
"epoch": 4.291828326866925,
"grad_norm": 0.40888792276382446,
"learning_rate": 0.001,
"loss": 4.1471,
"step": 107300
},
{
"epoch": 4.295828166873325,
"grad_norm": 0.48033514618873596,
"learning_rate": 0.001,
"loss": 4.146,
"step": 107400
},
{
"epoch": 4.299828006879725,
"grad_norm": 0.4497097134590149,
"learning_rate": 0.001,
"loss": 4.1485,
"step": 107500
},
{
"epoch": 4.303827846886125,
"grad_norm": 0.4512811601161957,
"learning_rate": 0.001,
"loss": 4.1461,
"step": 107600
},
{
"epoch": 4.307827686892525,
"grad_norm": 0.40005770325660706,
"learning_rate": 0.001,
"loss": 4.1433,
"step": 107700
},
{
"epoch": 4.311827526898924,
"grad_norm": 0.44940298795700073,
"learning_rate": 0.001,
"loss": 4.1433,
"step": 107800
},
{
"epoch": 4.3158273669053235,
"grad_norm": 0.4794534742832184,
"learning_rate": 0.001,
"loss": 4.1471,
"step": 107900
},
{
"epoch": 4.319827206911723,
"grad_norm": 0.5258973836898804,
"learning_rate": 0.001,
"loss": 4.1424,
"step": 108000
},
{
"epoch": 4.323827046918123,
"grad_norm": 0.4339228868484497,
"learning_rate": 0.001,
"loss": 4.1466,
"step": 108100
},
{
"epoch": 4.327826886924523,
"grad_norm": 0.41444161534309387,
"learning_rate": 0.001,
"loss": 4.1418,
"step": 108200
},
{
"epoch": 4.331826726930923,
"grad_norm": 0.47487524151802063,
"learning_rate": 0.001,
"loss": 4.1484,
"step": 108300
},
{
"epoch": 4.335826566937323,
"grad_norm": 0.39907756447792053,
"learning_rate": 0.001,
"loss": 4.1487,
"step": 108400
},
{
"epoch": 4.339826406943722,
"grad_norm": 0.5254673957824707,
"learning_rate": 0.001,
"loss": 4.1447,
"step": 108500
},
{
"epoch": 4.343826246950122,
"grad_norm": 0.45602646470069885,
"learning_rate": 0.001,
"loss": 4.145,
"step": 108600
},
{
"epoch": 4.3478260869565215,
"grad_norm": 0.40609806776046753,
"learning_rate": 0.001,
"loss": 4.1415,
"step": 108700
},
{
"epoch": 4.351825926962921,
"grad_norm": 0.5290670394897461,
"learning_rate": 0.001,
"loss": 4.1444,
"step": 108800
},
{
"epoch": 4.355825766969321,
"grad_norm": 0.4068310558795929,
"learning_rate": 0.001,
"loss": 4.1416,
"step": 108900
},
{
"epoch": 4.359825606975721,
"grad_norm": 0.44302281737327576,
"learning_rate": 0.001,
"loss": 4.1464,
"step": 109000
},
{
"epoch": 4.363825446982121,
"grad_norm": 0.46425190567970276,
"learning_rate": 0.001,
"loss": 4.1456,
"step": 109100
},
{
"epoch": 4.367825286988521,
"grad_norm": 0.4178661108016968,
"learning_rate": 0.001,
"loss": 4.1456,
"step": 109200
},
{
"epoch": 4.371825126994921,
"grad_norm": 0.5556158423423767,
"learning_rate": 0.001,
"loss": 4.1431,
"step": 109300
},
{
"epoch": 4.37582496700132,
"grad_norm": 0.4908580780029297,
"learning_rate": 0.001,
"loss": 4.1446,
"step": 109400
},
{
"epoch": 4.3798248070077195,
"grad_norm": 0.4489957392215729,
"learning_rate": 0.001,
"loss": 4.1442,
"step": 109500
},
{
"epoch": 4.383824647014119,
"grad_norm": 0.5880224108695984,
"learning_rate": 0.001,
"loss": 4.1451,
"step": 109600
},
{
"epoch": 4.387824487020519,
"grad_norm": 0.5525696873664856,
"learning_rate": 0.001,
"loss": 4.1462,
"step": 109700
},
{
"epoch": 4.391824327026919,
"grad_norm": 0.5361529588699341,
"learning_rate": 0.001,
"loss": 4.1425,
"step": 109800
},
{
"epoch": 4.395824167033319,
"grad_norm": 0.48454704880714417,
"learning_rate": 0.001,
"loss": 4.1427,
"step": 109900
},
{
"epoch": 4.399824007039719,
"grad_norm": 0.6087040305137634,
"learning_rate": 0.001,
"loss": 4.1437,
"step": 110000
},
{
"epoch": 4.403823847046118,
"grad_norm": 0.4859618544578552,
"learning_rate": 0.001,
"loss": 4.1431,
"step": 110100
},
{
"epoch": 4.407823687052518,
"grad_norm": 0.4525204598903656,
"learning_rate": 0.001,
"loss": 4.1456,
"step": 110200
},
{
"epoch": 4.4118235270589174,
"grad_norm": 0.4616955816745758,
"learning_rate": 0.001,
"loss": 4.1424,
"step": 110300
},
{
"epoch": 4.415823367065317,
"grad_norm": 0.5667575597763062,
"learning_rate": 0.001,
"loss": 4.145,
"step": 110400
},
{
"epoch": 4.419823207071717,
"grad_norm": 0.527301013469696,
"learning_rate": 0.001,
"loss": 4.1432,
"step": 110500
},
{
"epoch": 4.423823047078117,
"grad_norm": 0.5262458324432373,
"learning_rate": 0.001,
"loss": 4.1409,
"step": 110600
},
{
"epoch": 4.427822887084517,
"grad_norm": 0.4203338325023651,
"learning_rate": 0.001,
"loss": 4.1443,
"step": 110700
},
{
"epoch": 4.431822727090917,
"grad_norm": 0.463851660490036,
"learning_rate": 0.001,
"loss": 4.1458,
"step": 110800
},
{
"epoch": 4.435822567097316,
"grad_norm": 0.49283987283706665,
"learning_rate": 0.001,
"loss": 4.1445,
"step": 110900
},
{
"epoch": 4.439822407103716,
"grad_norm": 0.4866863787174225,
"learning_rate": 0.001,
"loss": 4.1411,
"step": 111000
},
{
"epoch": 4.443822247110115,
"grad_norm": 0.5105231404304504,
"learning_rate": 0.001,
"loss": 4.1414,
"step": 111100
},
{
"epoch": 4.447822087116515,
"grad_norm": 0.4239439070224762,
"learning_rate": 0.001,
"loss": 4.1396,
"step": 111200
},
{
"epoch": 4.451821927122915,
"grad_norm": 0.4156837463378906,
"learning_rate": 0.001,
"loss": 4.1444,
"step": 111300
},
{
"epoch": 4.455821767129315,
"grad_norm": 0.49761706590652466,
"learning_rate": 0.001,
"loss": 4.1413,
"step": 111400
},
{
"epoch": 4.459821607135715,
"grad_norm": 0.4880112409591675,
"learning_rate": 0.001,
"loss": 4.1455,
"step": 111500
},
{
"epoch": 4.463821447142115,
"grad_norm": 0.38512125611305237,
"learning_rate": 0.001,
"loss": 4.143,
"step": 111600
},
{
"epoch": 4.467821287148514,
"grad_norm": 0.5444674491882324,
"learning_rate": 0.001,
"loss": 4.1451,
"step": 111700
},
{
"epoch": 4.4718211271549135,
"grad_norm": 0.4214431643486023,
"learning_rate": 0.001,
"loss": 4.144,
"step": 111800
},
{
"epoch": 4.475820967161313,
"grad_norm": 0.4738007187843323,
"learning_rate": 0.001,
"loss": 4.142,
"step": 111900
},
{
"epoch": 4.479820807167713,
"grad_norm": 0.48944899439811707,
"learning_rate": 0.001,
"loss": 4.1418,
"step": 112000
},
{
"epoch": 4.483820647174113,
"grad_norm": 0.4785895347595215,
"learning_rate": 0.001,
"loss": 4.1434,
"step": 112100
},
{
"epoch": 4.487820487180513,
"grad_norm": 0.4528314173221588,
"learning_rate": 0.001,
"loss": 4.1382,
"step": 112200
},
{
"epoch": 4.491820327186913,
"grad_norm": 0.5328041315078735,
"learning_rate": 0.001,
"loss": 4.1479,
"step": 112300
},
{
"epoch": 4.495820167193313,
"grad_norm": 0.49370276927948,
"learning_rate": 0.001,
"loss": 4.142,
"step": 112400
},
{
"epoch": 4.499820007199712,
"grad_norm": 0.4953836500644684,
"learning_rate": 0.001,
"loss": 4.1453,
"step": 112500
},
{
"epoch": 4.5038198472061115,
"grad_norm": 0.4475695788860321,
"learning_rate": 0.001,
"loss": 4.1436,
"step": 112600
},
{
"epoch": 4.507819687212511,
"grad_norm": 0.4099849760532379,
"learning_rate": 0.001,
"loss": 4.1432,
"step": 112700
},
{
"epoch": 4.511819527218911,
"grad_norm": 0.45879650115966797,
"learning_rate": 0.001,
"loss": 4.1446,
"step": 112800
},
{
"epoch": 4.515819367225311,
"grad_norm": 0.4368346929550171,
"learning_rate": 0.001,
"loss": 4.1445,
"step": 112900
},
{
"epoch": 4.519819207231711,
"grad_norm": 0.4217066466808319,
"learning_rate": 0.001,
"loss": 4.1414,
"step": 113000
},
{
"epoch": 4.523819047238111,
"grad_norm": 0.42964696884155273,
"learning_rate": 0.001,
"loss": 4.1418,
"step": 113100
},
{
"epoch": 4.527818887244511,
"grad_norm": 0.38772520422935486,
"learning_rate": 0.001,
"loss": 4.1426,
"step": 113200
},
{
"epoch": 4.53181872725091,
"grad_norm": 0.43408331274986267,
"learning_rate": 0.001,
"loss": 4.1463,
"step": 113300
},
{
"epoch": 4.5358185672573095,
"grad_norm": 0.49354737997055054,
"learning_rate": 0.001,
"loss": 4.1416,
"step": 113400
},
{
"epoch": 4.539818407263709,
"grad_norm": 0.43434685468673706,
"learning_rate": 0.001,
"loss": 4.146,
"step": 113500
},
{
"epoch": 4.543818247270109,
"grad_norm": 0.49511098861694336,
"learning_rate": 0.001,
"loss": 4.145,
"step": 113600
},
{
"epoch": 4.547818087276509,
"grad_norm": 0.46711239218711853,
"learning_rate": 0.001,
"loss": 4.1406,
"step": 113700
},
{
"epoch": 4.551817927282909,
"grad_norm": 0.6184647083282471,
"learning_rate": 0.001,
"loss": 4.1421,
"step": 113800
},
{
"epoch": 4.555817767289309,
"grad_norm": 0.587983250617981,
"learning_rate": 0.001,
"loss": 4.1425,
"step": 113900
},
{
"epoch": 4.559817607295708,
"grad_norm": 0.42902278900146484,
"learning_rate": 0.001,
"loss": 4.1431,
"step": 114000
},
{
"epoch": 4.563817447302108,
"grad_norm": 0.5256754159927368,
"learning_rate": 0.001,
"loss": 4.1401,
"step": 114100
},
{
"epoch": 4.5678172873085074,
"grad_norm": 0.5793132781982422,
"learning_rate": 0.001,
"loss": 4.1413,
"step": 114200
},
{
"epoch": 4.571817127314907,
"grad_norm": 0.47969871759414673,
"learning_rate": 0.001,
"loss": 4.1444,
"step": 114300
},
{
"epoch": 4.575816967321307,
"grad_norm": 0.4756941795349121,
"learning_rate": 0.001,
"loss": 4.1446,
"step": 114400
},
{
"epoch": 4.579816807327707,
"grad_norm": 0.5472639799118042,
"learning_rate": 0.001,
"loss": 4.1393,
"step": 114500
},
{
"epoch": 4.583816647334107,
"grad_norm": 0.5101819634437561,
"learning_rate": 0.001,
"loss": 4.1423,
"step": 114600
},
{
"epoch": 4.587816487340507,
"grad_norm": 0.47236403822898865,
"learning_rate": 0.001,
"loss": 4.1432,
"step": 114700
},
{
"epoch": 4.591816327346907,
"grad_norm": 0.5631404519081116,
"learning_rate": 0.001,
"loss": 4.1446,
"step": 114800
},
{
"epoch": 4.595816167353306,
"grad_norm": 0.4453705847263336,
"learning_rate": 0.001,
"loss": 4.1453,
"step": 114900
},
{
"epoch": 4.599816007359705,
"grad_norm": 0.49028831720352173,
"learning_rate": 0.001,
"loss": 4.1403,
"step": 115000
},
{
"epoch": 4.603815847366105,
"grad_norm": 0.4380851686000824,
"learning_rate": 0.001,
"loss": 4.1448,
"step": 115100
},
{
"epoch": 4.607815687372505,
"grad_norm": 0.48736339807510376,
"learning_rate": 0.001,
"loss": 4.1445,
"step": 115200
},
{
"epoch": 4.611815527378905,
"grad_norm": 0.43725523352622986,
"learning_rate": 0.001,
"loss": 4.1419,
"step": 115300
},
{
"epoch": 4.615815367385305,
"grad_norm": 0.5325472354888916,
"learning_rate": 0.001,
"loss": 4.1444,
"step": 115400
},
{
"epoch": 4.619815207391705,
"grad_norm": 0.4731554388999939,
"learning_rate": 0.001,
"loss": 4.1435,
"step": 115500
},
{
"epoch": 4.623815047398104,
"grad_norm": 0.4858817160129547,
"learning_rate": 0.001,
"loss": 4.1461,
"step": 115600
},
{
"epoch": 4.6278148874045035,
"grad_norm": 0.4575202763080597,
"learning_rate": 0.001,
"loss": 4.1439,
"step": 115700
},
{
"epoch": 4.631814727410903,
"grad_norm": 0.44957438111305237,
"learning_rate": 0.001,
"loss": 4.1416,
"step": 115800
},
{
"epoch": 4.635814567417303,
"grad_norm": 0.37792956829071045,
"learning_rate": 0.001,
"loss": 4.1451,
"step": 115900
},
{
"epoch": 4.639814407423703,
"grad_norm": 0.4588908553123474,
"learning_rate": 0.001,
"loss": 4.1442,
"step": 116000
},
{
"epoch": 4.643814247430103,
"grad_norm": 0.4842056334018707,
"learning_rate": 0.001,
"loss": 4.1445,
"step": 116100
},
{
"epoch": 4.647814087436503,
"grad_norm": 0.44424474239349365,
"learning_rate": 0.001,
"loss": 4.1418,
"step": 116200
},
{
"epoch": 4.651813927442902,
"grad_norm": 0.5730035901069641,
"learning_rate": 0.001,
"loss": 4.1472,
"step": 116300
},
{
"epoch": 4.655813767449302,
"grad_norm": 0.5033777356147766,
"learning_rate": 0.001,
"loss": 4.1479,
"step": 116400
},
{
"epoch": 4.6598136074557015,
"grad_norm": 0.49093732237815857,
"learning_rate": 0.001,
"loss": 4.1445,
"step": 116500
},
{
"epoch": 4.663813447462101,
"grad_norm": 0.4203215539455414,
"learning_rate": 0.001,
"loss": 4.1416,
"step": 116600
},
{
"epoch": 4.667813287468501,
"grad_norm": 0.5095353722572327,
"learning_rate": 0.001,
"loss": 4.1418,
"step": 116700
},
{
"epoch": 4.671813127474901,
"grad_norm": 0.47804033756256104,
"learning_rate": 0.001,
"loss": 4.1421,
"step": 116800
},
{
"epoch": 4.675812967481301,
"grad_norm": 0.4211972653865814,
"learning_rate": 0.001,
"loss": 4.1403,
"step": 116900
},
{
"epoch": 4.679812807487701,
"grad_norm": 0.6364486813545227,
"learning_rate": 0.001,
"loss": 4.1456,
"step": 117000
},
{
"epoch": 4.683812647494101,
"grad_norm": 0.525810718536377,
"learning_rate": 0.001,
"loss": 4.1436,
"step": 117100
},
{
"epoch": 4.6878124875005,
"grad_norm": 0.4373362064361572,
"learning_rate": 0.001,
"loss": 4.1399,
"step": 117200
},
{
"epoch": 4.6918123275068995,
"grad_norm": 0.4389038383960724,
"learning_rate": 0.001,
"loss": 4.1418,
"step": 117300
},
{
"epoch": 4.695812167513299,
"grad_norm": 0.4124441146850586,
"learning_rate": 0.001,
"loss": 4.1432,
"step": 117400
},
{
"epoch": 4.699812007519699,
"grad_norm": 0.4817601442337036,
"learning_rate": 0.001,
"loss": 4.1447,
"step": 117500
},
{
"epoch": 4.703811847526099,
"grad_norm": 0.4811069071292877,
"learning_rate": 0.001,
"loss": 4.1391,
"step": 117600
},
{
"epoch": 4.707811687532499,
"grad_norm": 0.40754398703575134,
"learning_rate": 0.001,
"loss": 4.1386,
"step": 117700
},
{
"epoch": 4.711811527538899,
"grad_norm": 0.468555212020874,
"learning_rate": 0.001,
"loss": 4.1386,
"step": 117800
},
{
"epoch": 4.715811367545298,
"grad_norm": 0.4321994185447693,
"learning_rate": 0.001,
"loss": 4.1404,
"step": 117900
},
{
"epoch": 4.719811207551698,
"grad_norm": 0.46261945366859436,
"learning_rate": 0.001,
"loss": 4.1427,
"step": 118000
},
{
"epoch": 4.723811047558097,
"grad_norm": 0.4801866412162781,
"learning_rate": 0.001,
"loss": 4.1424,
"step": 118100
},
{
"epoch": 4.727810887564497,
"grad_norm": 0.4368051588535309,
"learning_rate": 0.001,
"loss": 4.145,
"step": 118200
},
{
"epoch": 4.731810727570897,
"grad_norm": 0.45344650745391846,
"learning_rate": 0.001,
"loss": 4.1446,
"step": 118300
},
{
"epoch": 4.735810567577297,
"grad_norm": 0.4357813000679016,
"learning_rate": 0.001,
"loss": 4.1395,
"step": 118400
},
{
"epoch": 4.739810407583697,
"grad_norm": 0.37463030219078064,
"learning_rate": 0.001,
"loss": 4.1424,
"step": 118500
},
{
"epoch": 4.743810247590097,
"grad_norm": 0.4514647126197815,
"learning_rate": 0.001,
"loss": 4.1448,
"step": 118600
},
{
"epoch": 4.747810087596497,
"grad_norm": 0.47029080986976624,
"learning_rate": 0.001,
"loss": 4.1437,
"step": 118700
},
{
"epoch": 4.7518099276028956,
"grad_norm": 0.39589250087738037,
"learning_rate": 0.001,
"loss": 4.1467,
"step": 118800
},
{
"epoch": 4.755809767609295,
"grad_norm": 0.4465102553367615,
"learning_rate": 0.001,
"loss": 4.1438,
"step": 118900
},
{
"epoch": 4.759809607615695,
"grad_norm": 0.4341897964477539,
"learning_rate": 0.001,
"loss": 4.1392,
"step": 119000
},
{
"epoch": 4.763809447622095,
"grad_norm": 0.45617157220840454,
"learning_rate": 0.001,
"loss": 4.1406,
"step": 119100
},
{
"epoch": 4.767809287628495,
"grad_norm": 0.4648855924606323,
"learning_rate": 0.001,
"loss": 4.1428,
"step": 119200
},
{
"epoch": 4.771809127634895,
"grad_norm": 0.4467076361179352,
"learning_rate": 0.001,
"loss": 4.1423,
"step": 119300
},
{
"epoch": 4.775808967641295,
"grad_norm": 0.45838209986686707,
"learning_rate": 0.001,
"loss": 4.1429,
"step": 119400
},
{
"epoch": 4.779808807647694,
"grad_norm": 0.4301731288433075,
"learning_rate": 0.001,
"loss": 4.1427,
"step": 119500
},
{
"epoch": 4.7838086476540935,
"grad_norm": 0.4407281279563904,
"learning_rate": 0.001,
"loss": 4.147,
"step": 119600
},
{
"epoch": 4.787808487660493,
"grad_norm": 0.49695926904678345,
"learning_rate": 0.001,
"loss": 4.1426,
"step": 119700
},
{
"epoch": 4.791808327666893,
"grad_norm": 0.43553370237350464,
"learning_rate": 0.001,
"loss": 4.1449,
"step": 119800
},
{
"epoch": 4.795808167673293,
"grad_norm": 0.4836173355579376,
"learning_rate": 0.001,
"loss": 4.1454,
"step": 119900
},
{
"epoch": 4.799808007679693,
"grad_norm": 0.4971003234386444,
"learning_rate": 0.001,
"loss": 4.1433,
"step": 120000
},
{
"epoch": 4.803807847686093,
"grad_norm": 0.47055745124816895,
"learning_rate": 0.001,
"loss": 4.1423,
"step": 120100
},
{
"epoch": 4.807807687692493,
"grad_norm": 0.39940592646598816,
"learning_rate": 0.001,
"loss": 4.1407,
"step": 120200
},
{
"epoch": 4.8118075276988925,
"grad_norm": 0.4526260495185852,
"learning_rate": 0.001,
"loss": 4.1399,
"step": 120300
},
{
"epoch": 4.8158073677052915,
"grad_norm": 0.5053595304489136,
"learning_rate": 0.001,
"loss": 4.1401,
"step": 120400
},
{
"epoch": 4.819807207711691,
"grad_norm": 0.4834200441837311,
"learning_rate": 0.001,
"loss": 4.1419,
"step": 120500
},
{
"epoch": 4.823807047718091,
"grad_norm": 0.5198436379432678,
"learning_rate": 0.001,
"loss": 4.1421,
"step": 120600
},
{
"epoch": 4.827806887724491,
"grad_norm": 0.46774643659591675,
"learning_rate": 0.001,
"loss": 4.1447,
"step": 120700
},
{
"epoch": 4.831806727730891,
"grad_norm": 0.4808708429336548,
"learning_rate": 0.001,
"loss": 4.1391,
"step": 120800
},
{
"epoch": 4.835806567737291,
"grad_norm": 0.46363064646720886,
"learning_rate": 0.001,
"loss": 4.1426,
"step": 120900
},
{
"epoch": 4.839806407743691,
"grad_norm": 0.4087159335613251,
"learning_rate": 0.001,
"loss": 4.1421,
"step": 121000
},
{
"epoch": 4.84380624775009,
"grad_norm": 0.47745776176452637,
"learning_rate": 0.001,
"loss": 4.1419,
"step": 121100
},
{
"epoch": 4.8478060877564895,
"grad_norm": 0.4563154876232147,
"learning_rate": 0.001,
"loss": 4.1403,
"step": 121200
},
{
"epoch": 4.851805927762889,
"grad_norm": 0.43224015831947327,
"learning_rate": 0.001,
"loss": 4.1436,
"step": 121300
},
{
"epoch": 4.855805767769289,
"grad_norm": 0.4743672013282776,
"learning_rate": 0.001,
"loss": 4.1379,
"step": 121400
},
{
"epoch": 4.859805607775689,
"grad_norm": 0.44347378611564636,
"learning_rate": 0.001,
"loss": 4.1418,
"step": 121500
},
{
"epoch": 4.863805447782089,
"grad_norm": 0.45574894547462463,
"learning_rate": 0.001,
"loss": 4.1465,
"step": 121600
},
{
"epoch": 4.867805287788489,
"grad_norm": 0.46385806798934937,
"learning_rate": 0.001,
"loss": 4.1386,
"step": 121700
},
{
"epoch": 4.871805127794888,
"grad_norm": 0.42676347494125366,
"learning_rate": 0.001,
"loss": 4.1395,
"step": 121800
},
{
"epoch": 4.875804967801288,
"grad_norm": 0.5199492573738098,
"learning_rate": 0.001,
"loss": 4.1381,
"step": 121900
},
{
"epoch": 4.879804807807687,
"grad_norm": 0.4669468104839325,
"learning_rate": 0.001,
"loss": 4.1418,
"step": 122000
},
{
"epoch": 4.883804647814087,
"grad_norm": 0.5818643569946289,
"learning_rate": 0.001,
"loss": 4.1418,
"step": 122100
},
{
"epoch": 4.887804487820487,
"grad_norm": 0.5275595784187317,
"learning_rate": 0.001,
"loss": 4.1439,
"step": 122200
},
{
"epoch": 4.891804327826887,
"grad_norm": 0.4505153000354767,
"learning_rate": 0.001,
"loss": 4.1436,
"step": 122300
},
{
"epoch": 4.895804167833287,
"grad_norm": 0.4731706380844116,
"learning_rate": 0.001,
"loss": 4.1446,
"step": 122400
},
{
"epoch": 4.899804007839687,
"grad_norm": 0.5030878186225891,
"learning_rate": 0.001,
"loss": 4.1374,
"step": 122500
},
{
"epoch": 4.903803847846087,
"grad_norm": 0.4733423888683319,
"learning_rate": 0.001,
"loss": 4.1406,
"step": 122600
},
{
"epoch": 4.9078036878524856,
"grad_norm": 0.5757489204406738,
"learning_rate": 0.001,
"loss": 4.1407,
"step": 122700
},
{
"epoch": 4.911803527858885,
"grad_norm": 0.5101374983787537,
"learning_rate": 0.001,
"loss": 4.1386,
"step": 122800
},
{
"epoch": 4.915803367865285,
"grad_norm": 0.467282235622406,
"learning_rate": 0.001,
"loss": 4.1456,
"step": 122900
},
{
"epoch": 4.919803207871685,
"grad_norm": 0.4633881449699402,
"learning_rate": 0.001,
"loss": 4.1387,
"step": 123000
},
{
"epoch": 4.923803047878085,
"grad_norm": 0.36808955669403076,
"learning_rate": 0.001,
"loss": 4.143,
"step": 123100
},
{
"epoch": 4.927802887884485,
"grad_norm": 0.4221063554286957,
"learning_rate": 0.001,
"loss": 4.1463,
"step": 123200
},
{
"epoch": 4.931802727890885,
"grad_norm": 0.4532271921634674,
"learning_rate": 0.001,
"loss": 4.1423,
"step": 123300
},
{
"epoch": 4.935802567897284,
"grad_norm": 0.4424666166305542,
"learning_rate": 0.001,
"loss": 4.141,
"step": 123400
},
{
"epoch": 4.9398024079036835,
"grad_norm": 0.47042563557624817,
"learning_rate": 0.001,
"loss": 4.1386,
"step": 123500
},
{
"epoch": 4.943802247910083,
"grad_norm": 0.486246258020401,
"learning_rate": 0.001,
"loss": 4.1393,
"step": 123600
},
{
"epoch": 4.947802087916483,
"grad_norm": 0.4523676633834839,
"learning_rate": 0.001,
"loss": 4.1371,
"step": 123700
},
{
"epoch": 4.951801927922883,
"grad_norm": 0.5111677646636963,
"learning_rate": 0.001,
"loss": 4.1386,
"step": 123800
},
{
"epoch": 4.955801767929283,
"grad_norm": 0.47272786498069763,
"learning_rate": 0.001,
"loss": 4.138,
"step": 123900
},
{
"epoch": 4.959801607935683,
"grad_norm": 0.46790051460266113,
"learning_rate": 0.001,
"loss": 4.14,
"step": 124000
},
{
"epoch": 4.963801447942083,
"grad_norm": 0.4354240894317627,
"learning_rate": 0.001,
"loss": 4.1445,
"step": 124100
},
{
"epoch": 4.9678012879484825,
"grad_norm": 0.42303115129470825,
"learning_rate": 0.001,
"loss": 4.1405,
"step": 124200
},
{
"epoch": 4.9718011279548815,
"grad_norm": 0.44789764285087585,
"learning_rate": 0.001,
"loss": 4.1383,
"step": 124300
},
{
"epoch": 4.975800967961281,
"grad_norm": 0.46547091007232666,
"learning_rate": 0.001,
"loss": 4.1386,
"step": 124400
},
{
"epoch": 4.979800807967681,
"grad_norm": 0.5278778076171875,
"learning_rate": 0.001,
"loss": 4.1391,
"step": 124500
},
{
"epoch": 4.983800647974081,
"grad_norm": 0.503690779209137,
"learning_rate": 0.001,
"loss": 4.1416,
"step": 124600
},
{
"epoch": 4.987800487980481,
"grad_norm": 0.44487857818603516,
"learning_rate": 0.001,
"loss": 4.1377,
"step": 124700
},
{
"epoch": 4.991800327986881,
"grad_norm": 0.5172649621963501,
"learning_rate": 0.001,
"loss": 4.137,
"step": 124800
},
{
"epoch": 4.995800167993281,
"grad_norm": 0.39961203932762146,
"learning_rate": 0.001,
"loss": 4.1397,
"step": 124900
},
{
"epoch": 4.99980000799968,
"grad_norm": 0.41589173674583435,
"learning_rate": 0.001,
"loss": 4.1411,
"step": 125000
},
{
"epoch": 5.0037998480060795,
"grad_norm": 0.39197656512260437,
"learning_rate": 0.001,
"loss": 4.1376,
"step": 125100
},
{
"epoch": 5.007799688012479,
"grad_norm": 0.4566977322101593,
"learning_rate": 0.001,
"loss": 4.1384,
"step": 125200
},
{
"epoch": 5.011799528018879,
"grad_norm": 0.54954594373703,
"learning_rate": 0.001,
"loss": 4.1395,
"step": 125300
},
{
"epoch": 5.015799368025279,
"grad_norm": 0.4543614983558655,
"learning_rate": 0.001,
"loss": 4.1394,
"step": 125400
},
{
"epoch": 5.019799208031679,
"grad_norm": 0.5545991063117981,
"learning_rate": 0.001,
"loss": 4.1405,
"step": 125500
},
{
"epoch": 5.023799048038079,
"grad_norm": 0.4615522027015686,
"learning_rate": 0.001,
"loss": 4.139,
"step": 125600
},
{
"epoch": 5.027798888044479,
"grad_norm": 0.4874088168144226,
"learning_rate": 0.001,
"loss": 4.139,
"step": 125700
},
{
"epoch": 5.031798728050878,
"grad_norm": 0.48862114548683167,
"learning_rate": 0.001,
"loss": 4.1366,
"step": 125800
},
{
"epoch": 5.035798568057277,
"grad_norm": 0.5121699571609497,
"learning_rate": 0.001,
"loss": 4.1384,
"step": 125900
},
{
"epoch": 5.039798408063677,
"grad_norm": 0.4240550398826599,
"learning_rate": 0.001,
"loss": 4.1401,
"step": 126000
},
{
"epoch": 5.043798248070077,
"grad_norm": 0.4307209551334381,
"learning_rate": 0.001,
"loss": 4.1408,
"step": 126100
},
{
"epoch": 5.047798088076477,
"grad_norm": 0.5086374878883362,
"learning_rate": 0.001,
"loss": 4.1376,
"step": 126200
},
{
"epoch": 5.051797928082877,
"grad_norm": 0.5460554361343384,
"learning_rate": 0.001,
"loss": 4.1401,
"step": 126300
},
{
"epoch": 5.055797768089277,
"grad_norm": 0.4712692201137543,
"learning_rate": 0.001,
"loss": 4.1368,
"step": 126400
},
{
"epoch": 5.059797608095677,
"grad_norm": 0.4204212725162506,
"learning_rate": 0.001,
"loss": 4.1383,
"step": 126500
},
{
"epoch": 5.0637974481020755,
"grad_norm": 0.4033453166484833,
"learning_rate": 0.001,
"loss": 4.1389,
"step": 126600
},
{
"epoch": 5.067797288108475,
"grad_norm": 0.4524138569831848,
"learning_rate": 0.001,
"loss": 4.1416,
"step": 126700
},
{
"epoch": 5.071797128114875,
"grad_norm": 0.5840047597885132,
"learning_rate": 0.001,
"loss": 4.1408,
"step": 126800
},
{
"epoch": 5.075796968121275,
"grad_norm": 0.41969379782676697,
"learning_rate": 0.001,
"loss": 4.1396,
"step": 126900
},
{
"epoch": 5.079796808127675,
"grad_norm": 0.56402987241745,
"learning_rate": 0.001,
"loss": 4.1362,
"step": 127000
},
{
"epoch": 5.083796648134075,
"grad_norm": 0.3998129069805145,
"learning_rate": 0.001,
"loss": 4.1392,
"step": 127100
},
{
"epoch": 5.087796488140475,
"grad_norm": 0.443665474653244,
"learning_rate": 0.001,
"loss": 4.1392,
"step": 127200
},
{
"epoch": 5.0917963281468746,
"grad_norm": 0.4244501292705536,
"learning_rate": 0.001,
"loss": 4.1396,
"step": 127300
},
{
"epoch": 5.0957961681532735,
"grad_norm": 0.5381417274475098,
"learning_rate": 0.001,
"loss": 4.1352,
"step": 127400
},
{
"epoch": 5.099796008159673,
"grad_norm": 0.4484384059906006,
"learning_rate": 0.001,
"loss": 4.1416,
"step": 127500
},
{
"epoch": 5.103795848166073,
"grad_norm": 0.4542734920978546,
"learning_rate": 0.001,
"loss": 4.1358,
"step": 127600
},
{
"epoch": 5.107795688172473,
"grad_norm": 0.5095553398132324,
"learning_rate": 0.001,
"loss": 4.1385,
"step": 127700
},
{
"epoch": 5.111795528178873,
"grad_norm": 0.4590986669063568,
"learning_rate": 0.001,
"loss": 4.1381,
"step": 127800
},
{
"epoch": 5.115795368185273,
"grad_norm": 0.38399380445480347,
"learning_rate": 0.001,
"loss": 4.1367,
"step": 127900
},
{
"epoch": 5.119795208191673,
"grad_norm": 0.3857038915157318,
"learning_rate": 0.001,
"loss": 4.1431,
"step": 128000
},
{
"epoch": 5.1237950481980725,
"grad_norm": 0.49884679913520813,
"learning_rate": 0.001,
"loss": 4.138,
"step": 128100
},
{
"epoch": 5.1277948882044715,
"grad_norm": 0.5274735689163208,
"learning_rate": 0.001,
"loss": 4.1409,
"step": 128200
},
{
"epoch": 5.131794728210871,
"grad_norm": 0.4781200587749481,
"learning_rate": 0.001,
"loss": 4.1354,
"step": 128300
},
{
"epoch": 5.135794568217271,
"grad_norm": 0.37105411291122437,
"learning_rate": 0.001,
"loss": 4.1379,
"step": 128400
},
{
"epoch": 5.139794408223671,
"grad_norm": 0.4230349063873291,
"learning_rate": 0.001,
"loss": 4.1411,
"step": 128500
},
{
"epoch": 5.143794248230071,
"grad_norm": 0.3767766058444977,
"learning_rate": 0.001,
"loss": 4.1417,
"step": 128600
},
{
"epoch": 5.147794088236471,
"grad_norm": 0.4571896195411682,
"learning_rate": 0.001,
"loss": 4.1464,
"step": 128700
},
{
"epoch": 5.151793928242871,
"grad_norm": 0.4790409803390503,
"learning_rate": 0.001,
"loss": 4.1372,
"step": 128800
},
{
"epoch": 5.15579376824927,
"grad_norm": 0.492097407579422,
"learning_rate": 0.001,
"loss": 4.1411,
"step": 128900
},
{
"epoch": 5.1597936082556695,
"grad_norm": 0.4251415729522705,
"learning_rate": 0.001,
"loss": 4.1376,
"step": 129000
},
{
"epoch": 5.163793448262069,
"grad_norm": 0.4291365146636963,
"learning_rate": 0.001,
"loss": 4.1416,
"step": 129100
},
{
"epoch": 5.167793288268469,
"grad_norm": 0.4996872544288635,
"learning_rate": 0.001,
"loss": 4.1397,
"step": 129200
},
{
"epoch": 5.171793128274869,
"grad_norm": 0.43896353244781494,
"learning_rate": 0.001,
"loss": 4.1392,
"step": 129300
},
{
"epoch": 5.175792968281269,
"grad_norm": 0.4306037127971649,
"learning_rate": 0.001,
"loss": 4.1374,
"step": 129400
},
{
"epoch": 5.179792808287669,
"grad_norm": 0.45990708470344543,
"learning_rate": 0.001,
"loss": 4.1381,
"step": 129500
},
{
"epoch": 5.183792648294069,
"grad_norm": 0.40390530228614807,
"learning_rate": 0.001,
"loss": 4.1375,
"step": 129600
},
{
"epoch": 5.187792488300468,
"grad_norm": 0.48158109188079834,
"learning_rate": 0.001,
"loss": 4.1377,
"step": 129700
},
{
"epoch": 5.191792328306867,
"grad_norm": 0.4878210425376892,
"learning_rate": 0.001,
"loss": 4.1376,
"step": 129800
},
{
"epoch": 5.195792168313267,
"grad_norm": 0.4516715407371521,
"learning_rate": 0.001,
"loss": 4.1418,
"step": 129900
},
{
"epoch": 5.199792008319667,
"grad_norm": 0.41719183325767517,
"learning_rate": 0.001,
"loss": 4.142,
"step": 130000
},
{
"epoch": 5.203791848326067,
"grad_norm": 0.4469333291053772,
"learning_rate": 0.001,
"loss": 4.14,
"step": 130100
},
{
"epoch": 5.207791688332467,
"grad_norm": 0.45951905846595764,
"learning_rate": 0.001,
"loss": 4.1382,
"step": 130200
},
{
"epoch": 5.211791528338867,
"grad_norm": 0.3892884850502014,
"learning_rate": 0.001,
"loss": 4.1367,
"step": 130300
},
{
"epoch": 5.215791368345267,
"grad_norm": 0.49934664368629456,
"learning_rate": 0.001,
"loss": 4.14,
"step": 130400
},
{
"epoch": 5.2197912083516655,
"grad_norm": 0.3898753523826599,
"learning_rate": 0.001,
"loss": 4.1423,
"step": 130500
},
{
"epoch": 5.223791048358065,
"grad_norm": 0.4911145269870758,
"learning_rate": 0.001,
"loss": 4.1422,
"step": 130600
},
{
"epoch": 5.227790888364465,
"grad_norm": 0.40083619952201843,
"learning_rate": 0.001,
"loss": 4.1362,
"step": 130700
},
{
"epoch": 5.231790728370865,
"grad_norm": 0.45886579155921936,
"learning_rate": 0.001,
"loss": 4.1344,
"step": 130800
},
{
"epoch": 5.235790568377265,
"grad_norm": 0.6365372538566589,
"learning_rate": 0.001,
"loss": 4.1392,
"step": 130900
},
{
"epoch": 5.239790408383665,
"grad_norm": 0.38311493396759033,
"learning_rate": 0.001,
"loss": 4.1384,
"step": 131000
},
{
"epoch": 5.243790248390065,
"grad_norm": 0.5525908470153809,
"learning_rate": 0.001,
"loss": 4.1404,
"step": 131100
},
{
"epoch": 5.2477900883964645,
"grad_norm": 0.41158297657966614,
"learning_rate": 0.001,
"loss": 4.1374,
"step": 131200
},
{
"epoch": 5.2517899284028635,
"grad_norm": 0.45629459619522095,
"learning_rate": 0.001,
"loss": 4.1405,
"step": 131300
},
{
"epoch": 5.255789768409263,
"grad_norm": 0.4971814751625061,
"learning_rate": 0.001,
"loss": 4.1393,
"step": 131400
},
{
"epoch": 5.259789608415663,
"grad_norm": 0.39209917187690735,
"learning_rate": 0.001,
"loss": 4.1418,
"step": 131500
},
{
"epoch": 5.263789448422063,
"grad_norm": 0.5428614020347595,
"learning_rate": 0.001,
"loss": 4.135,
"step": 131600
},
{
"epoch": 5.267789288428463,
"grad_norm": 0.3522048890590668,
"learning_rate": 0.001,
"loss": 4.1423,
"step": 131700
},
{
"epoch": 5.271789128434863,
"grad_norm": 0.40293794870376587,
"learning_rate": 0.001,
"loss": 4.137,
"step": 131800
},
{
"epoch": 5.275788968441263,
"grad_norm": 0.45241355895996094,
"learning_rate": 0.001,
"loss": 4.1394,
"step": 131900
},
{
"epoch": 5.2797888084476625,
"grad_norm": 0.44267144799232483,
"learning_rate": 0.001,
"loss": 4.1408,
"step": 132000
},
{
"epoch": 5.2837886484540615,
"grad_norm": 0.46244797110557556,
"learning_rate": 0.001,
"loss": 4.1341,
"step": 132100
},
{
"epoch": 5.287788488460461,
"grad_norm": 0.40171074867248535,
"learning_rate": 0.001,
"loss": 4.1397,
"step": 132200
},
{
"epoch": 5.291788328466861,
"grad_norm": 0.4446714222431183,
"learning_rate": 0.001,
"loss": 4.1385,
"step": 132300
},
{
"epoch": 5.295788168473261,
"grad_norm": 0.5090124011039734,
"learning_rate": 0.001,
"loss": 4.1379,
"step": 132400
},
{
"epoch": 5.299788008479661,
"grad_norm": 0.45481523871421814,
"learning_rate": 0.001,
"loss": 4.1374,
"step": 132500
},
{
"epoch": 5.303787848486061,
"grad_norm": 0.5546725988388062,
"learning_rate": 0.001,
"loss": 4.1383,
"step": 132600
},
{
"epoch": 5.307787688492461,
"grad_norm": 0.46896442770957947,
"learning_rate": 0.001,
"loss": 4.1392,
"step": 132700
},
{
"epoch": 5.31178752849886,
"grad_norm": 0.39645591378211975,
"learning_rate": 0.001,
"loss": 4.1387,
"step": 132800
},
{
"epoch": 5.3157873685052595,
"grad_norm": 0.5071548819541931,
"learning_rate": 0.001,
"loss": 4.1376,
"step": 132900
},
{
"epoch": 5.319787208511659,
"grad_norm": 0.4763946533203125,
"learning_rate": 0.001,
"loss": 4.1405,
"step": 133000
},
{
"epoch": 5.323787048518059,
"grad_norm": 0.4352118968963623,
"learning_rate": 0.001,
"loss": 4.1351,
"step": 133100
},
{
"epoch": 5.327786888524459,
"grad_norm": 0.4797314405441284,
"learning_rate": 0.001,
"loss": 4.136,
"step": 133200
},
{
"epoch": 5.331786728530859,
"grad_norm": 0.4417945444583893,
"learning_rate": 0.001,
"loss": 4.1386,
"step": 133300
},
{
"epoch": 5.335786568537259,
"grad_norm": 0.4554136395454407,
"learning_rate": 0.001,
"loss": 4.137,
"step": 133400
},
{
"epoch": 5.339786408543659,
"grad_norm": 0.40435078740119934,
"learning_rate": 0.001,
"loss": 4.1408,
"step": 133500
},
{
"epoch": 5.3437862485500585,
"grad_norm": 0.43502071499824524,
"learning_rate": 0.001,
"loss": 4.1412,
"step": 133600
},
{
"epoch": 5.347786088556457,
"grad_norm": 0.5281967520713806,
"learning_rate": 0.001,
"loss": 4.1373,
"step": 133700
},
{
"epoch": 5.351785928562857,
"grad_norm": 0.5273252129554749,
"learning_rate": 0.001,
"loss": 4.1373,
"step": 133800
},
{
"epoch": 5.355785768569257,
"grad_norm": 0.4398253262042999,
"learning_rate": 0.001,
"loss": 4.1369,
"step": 133900
},
{
"epoch": 5.359785608575657,
"grad_norm": 0.4958343505859375,
"learning_rate": 0.001,
"loss": 4.1413,
"step": 134000
},
{
"epoch": 5.363785448582057,
"grad_norm": 0.4647107720375061,
"learning_rate": 0.001,
"loss": 4.138,
"step": 134100
},
{
"epoch": 5.367785288588457,
"grad_norm": 0.4397704005241394,
"learning_rate": 0.001,
"loss": 4.1364,
"step": 134200
},
{
"epoch": 5.371785128594857,
"grad_norm": 0.4742043912410736,
"learning_rate": 0.001,
"loss": 4.138,
"step": 134300
},
{
"epoch": 5.3757849686012555,
"grad_norm": 0.4011693000793457,
"learning_rate": 0.001,
"loss": 4.1386,
"step": 134400
},
{
"epoch": 5.379784808607655,
"grad_norm": 0.3930029273033142,
"learning_rate": 0.001,
"loss": 4.141,
"step": 134500
},
{
"epoch": 5.383784648614055,
"grad_norm": 0.39063316583633423,
"learning_rate": 0.001,
"loss": 4.1398,
"step": 134600
},
{
"epoch": 5.387784488620455,
"grad_norm": 0.44939857721328735,
"learning_rate": 0.001,
"loss": 4.1366,
"step": 134700
},
{
"epoch": 5.391784328626855,
"grad_norm": 0.5439363121986389,
"learning_rate": 0.001,
"loss": 4.1369,
"step": 134800
},
{
"epoch": 5.395784168633255,
"grad_norm": 0.42577844858169556,
"learning_rate": 0.001,
"loss": 4.1402,
"step": 134900
},
{
"epoch": 5.399784008639655,
"grad_norm": 0.4027504622936249,
"learning_rate": 0.001,
"loss": 4.1366,
"step": 135000
},
{
"epoch": 5.4037838486460545,
"grad_norm": 0.6525794863700867,
"learning_rate": 0.001,
"loss": 4.1389,
"step": 135100
},
{
"epoch": 5.4077836886524535,
"grad_norm": 0.4911954700946808,
"learning_rate": 0.001,
"loss": 4.1385,
"step": 135200
},
{
"epoch": 5.411783528658853,
"grad_norm": 0.4476899802684784,
"learning_rate": 0.001,
"loss": 4.139,
"step": 135300
},
{
"epoch": 5.415783368665253,
"grad_norm": 0.4557499885559082,
"learning_rate": 0.001,
"loss": 4.1364,
"step": 135400
},
{
"epoch": 5.419783208671653,
"grad_norm": 0.39908871054649353,
"learning_rate": 0.001,
"loss": 4.1368,
"step": 135500
},
{
"epoch": 5.423783048678053,
"grad_norm": 0.4525020122528076,
"learning_rate": 0.001,
"loss": 4.1385,
"step": 135600
},
{
"epoch": 5.427782888684453,
"grad_norm": 0.45615649223327637,
"learning_rate": 0.001,
"loss": 4.1399,
"step": 135700
},
{
"epoch": 5.431782728690853,
"grad_norm": 0.4389837086200714,
"learning_rate": 0.001,
"loss": 4.1374,
"step": 135800
},
{
"epoch": 5.4357825686972525,
"grad_norm": 0.5461357831954956,
"learning_rate": 0.001,
"loss": 4.1372,
"step": 135900
},
{
"epoch": 5.4397824087036515,
"grad_norm": 0.4126543402671814,
"learning_rate": 0.001,
"loss": 4.1364,
"step": 136000
},
{
"epoch": 5.443782248710051,
"grad_norm": 0.39160358905792236,
"learning_rate": 0.001,
"loss": 4.1412,
"step": 136100
},
{
"epoch": 5.447782088716451,
"grad_norm": 0.4216913878917694,
"learning_rate": 0.001,
"loss": 4.1367,
"step": 136200
},
{
"epoch": 5.451781928722851,
"grad_norm": 0.4482346177101135,
"learning_rate": 0.001,
"loss": 4.1366,
"step": 136300
},
{
"epoch": 5.455781768729251,
"grad_norm": 0.5682035684585571,
"learning_rate": 0.001,
"loss": 4.1387,
"step": 136400
},
{
"epoch": 5.459781608735651,
"grad_norm": 0.5753220319747925,
"learning_rate": 0.001,
"loss": 4.1376,
"step": 136500
},
{
"epoch": 5.463781448742051,
"grad_norm": 0.49236294627189636,
"learning_rate": 0.001,
"loss": 4.1374,
"step": 136600
},
{
"epoch": 5.46778128874845,
"grad_norm": 0.4295791685581207,
"learning_rate": 0.001,
"loss": 4.1382,
"step": 136700
},
{
"epoch": 5.4717811287548495,
"grad_norm": 0.4442785978317261,
"learning_rate": 0.001,
"loss": 4.1383,
"step": 136800
},
{
"epoch": 5.475780968761249,
"grad_norm": 0.5207620859146118,
"learning_rate": 0.001,
"loss": 4.1382,
"step": 136900
},
{
"epoch": 5.479780808767649,
"grad_norm": 0.45256859064102173,
"learning_rate": 0.001,
"loss": 4.1385,
"step": 137000
},
{
"epoch": 5.483780648774049,
"grad_norm": 0.4170093536376953,
"learning_rate": 0.001,
"loss": 4.14,
"step": 137100
},
{
"epoch": 5.487780488780449,
"grad_norm": 0.47792255878448486,
"learning_rate": 0.001,
"loss": 4.1347,
"step": 137200
},
{
"epoch": 5.491780328786849,
"grad_norm": 0.4334956705570221,
"learning_rate": 0.001,
"loss": 4.1368,
"step": 137300
},
{
"epoch": 5.495780168793249,
"grad_norm": 0.47183749079704285,
"learning_rate": 0.001,
"loss": 4.1389,
"step": 137400
},
{
"epoch": 5.4997800087996485,
"grad_norm": 0.492654412984848,
"learning_rate": 0.001,
"loss": 4.135,
"step": 137500
},
{
"epoch": 5.503779848806047,
"grad_norm": 0.474648654460907,
"learning_rate": 0.001,
"loss": 4.1391,
"step": 137600
},
{
"epoch": 5.507779688812447,
"grad_norm": 0.40896373987197876,
"learning_rate": 0.001,
"loss": 4.1379,
"step": 137700
},
{
"epoch": 5.511779528818847,
"grad_norm": 0.45079365372657776,
"learning_rate": 0.001,
"loss": 4.1399,
"step": 137800
},
{
"epoch": 5.515779368825247,
"grad_norm": 0.5783036351203918,
"learning_rate": 0.001,
"loss": 4.1359,
"step": 137900
},
{
"epoch": 5.519779208831647,
"grad_norm": 0.4422449469566345,
"learning_rate": 0.001,
"loss": 4.1375,
"step": 138000
},
{
"epoch": 5.523779048838047,
"grad_norm": 0.5112189054489136,
"learning_rate": 0.001,
"loss": 4.1374,
"step": 138100
},
{
"epoch": 5.527778888844447,
"grad_norm": 0.40671586990356445,
"learning_rate": 0.001,
"loss": 4.1391,
"step": 138200
},
{
"epoch": 5.5317787288508455,
"grad_norm": 0.5037602186203003,
"learning_rate": 0.001,
"loss": 4.1383,
"step": 138300
},
{
"epoch": 5.535778568857245,
"grad_norm": 0.46466997265815735,
"learning_rate": 0.001,
"loss": 4.1379,
"step": 138400
},
{
"epoch": 5.539778408863645,
"grad_norm": 0.4853058159351349,
"learning_rate": 0.001,
"loss": 4.135,
"step": 138500
},
{
"epoch": 5.543778248870045,
"grad_norm": 0.3657609820365906,
"learning_rate": 0.001,
"loss": 4.1356,
"step": 138600
},
{
"epoch": 5.547778088876445,
"grad_norm": 0.49444639682769775,
"learning_rate": 0.001,
"loss": 4.1401,
"step": 138700
},
{
"epoch": 5.551777928882845,
"grad_norm": 0.4573862850666046,
"learning_rate": 0.001,
"loss": 4.1392,
"step": 138800
},
{
"epoch": 5.555777768889245,
"grad_norm": 0.5398617386817932,
"learning_rate": 0.001,
"loss": 4.1349,
"step": 138900
},
{
"epoch": 5.5597776088956445,
"grad_norm": 0.44698962569236755,
"learning_rate": 0.001,
"loss": 4.1378,
"step": 139000
},
{
"epoch": 5.563777448902044,
"grad_norm": 0.37685704231262207,
"learning_rate": 0.001,
"loss": 4.1367,
"step": 139100
},
{
"epoch": 5.567777288908443,
"grad_norm": 0.40856295824050903,
"learning_rate": 0.001,
"loss": 4.1417,
"step": 139200
},
{
"epoch": 5.571777128914843,
"grad_norm": 0.36752602458000183,
"learning_rate": 0.001,
"loss": 4.1339,
"step": 139300
},
{
"epoch": 5.575776968921243,
"grad_norm": 0.4708743095397949,
"learning_rate": 0.001,
"loss": 4.137,
"step": 139400
},
{
"epoch": 5.579776808927643,
"grad_norm": 0.4223979413509369,
"learning_rate": 0.001,
"loss": 4.135,
"step": 139500
},
{
"epoch": 5.583776648934043,
"grad_norm": 0.4208683371543884,
"learning_rate": 0.001,
"loss": 4.1346,
"step": 139600
},
{
"epoch": 5.587776488940443,
"grad_norm": 0.47049957513809204,
"learning_rate": 0.001,
"loss": 4.1367,
"step": 139700
},
{
"epoch": 5.5917763289468425,
"grad_norm": 0.44872990250587463,
"learning_rate": 0.001,
"loss": 4.139,
"step": 139800
},
{
"epoch": 5.5957761689532415,
"grad_norm": 0.43615269660949707,
"learning_rate": 0.001,
"loss": 4.1379,
"step": 139900
},
{
"epoch": 5.599776008959641,
"grad_norm": 0.5177183151245117,
"learning_rate": 0.001,
"loss": 4.1402,
"step": 140000
},
{
"epoch": 5.603775848966041,
"grad_norm": 0.4234861731529236,
"learning_rate": 0.001,
"loss": 4.1423,
"step": 140100
},
{
"epoch": 5.607775688972441,
"grad_norm": 0.39408451318740845,
"learning_rate": 0.001,
"loss": 4.1374,
"step": 140200
},
{
"epoch": 5.611775528978841,
"grad_norm": 0.5079990029335022,
"learning_rate": 0.001,
"loss": 4.1363,
"step": 140300
},
{
"epoch": 5.615775368985241,
"grad_norm": 0.48556408286094666,
"learning_rate": 0.001,
"loss": 4.138,
"step": 140400
},
{
"epoch": 5.619775208991641,
"grad_norm": 0.4212859570980072,
"learning_rate": 0.001,
"loss": 4.1349,
"step": 140500
},
{
"epoch": 5.62377504899804,
"grad_norm": 0.3887998163700104,
"learning_rate": 0.001,
"loss": 4.1354,
"step": 140600
},
{
"epoch": 5.6277748890044395,
"grad_norm": 0.41680628061294556,
"learning_rate": 0.001,
"loss": 4.1354,
"step": 140700
},
{
"epoch": 5.631774729010839,
"grad_norm": 0.4846498370170593,
"learning_rate": 0.001,
"loss": 4.1359,
"step": 140800
},
{
"epoch": 5.635774569017239,
"grad_norm": 0.45596760511398315,
"learning_rate": 0.001,
"loss": 4.1375,
"step": 140900
},
{
"epoch": 5.639774409023639,
"grad_norm": 0.484160840511322,
"learning_rate": 0.001,
"loss": 4.1346,
"step": 141000
},
{
"epoch": 5.643774249030039,
"grad_norm": 0.4429890811443329,
"learning_rate": 0.001,
"loss": 4.1369,
"step": 141100
},
{
"epoch": 5.647774089036439,
"grad_norm": 0.436334490776062,
"learning_rate": 0.001,
"loss": 4.1365,
"step": 141200
},
{
"epoch": 5.651773929042839,
"grad_norm": 0.5436973571777344,
"learning_rate": 0.001,
"loss": 4.1372,
"step": 141300
},
{
"epoch": 5.6557737690492385,
"grad_norm": 0.46049728989601135,
"learning_rate": 0.001,
"loss": 4.1339,
"step": 141400
},
{
"epoch": 5.659773609055637,
"grad_norm": 0.45588213205337524,
"learning_rate": 0.001,
"loss": 4.1348,
"step": 141500
},
{
"epoch": 5.663773449062037,
"grad_norm": 0.4084899425506592,
"learning_rate": 0.001,
"loss": 4.137,
"step": 141600
},
{
"epoch": 5.667773289068437,
"grad_norm": 0.5410123467445374,
"learning_rate": 0.001,
"loss": 4.1376,
"step": 141700
},
{
"epoch": 5.671773129074837,
"grad_norm": 0.4420919120311737,
"learning_rate": 0.001,
"loss": 4.1376,
"step": 141800
},
{
"epoch": 5.675772969081237,
"grad_norm": 0.48726823925971985,
"learning_rate": 0.001,
"loss": 4.1366,
"step": 141900
},
{
"epoch": 5.679772809087637,
"grad_norm": 0.425656259059906,
"learning_rate": 0.001,
"loss": 4.1385,
"step": 142000
},
{
"epoch": 5.683772649094037,
"grad_norm": 0.481614887714386,
"learning_rate": 0.001,
"loss": 4.1342,
"step": 142100
},
{
"epoch": 5.6877724891004355,
"grad_norm": 0.41768065094947815,
"learning_rate": 0.001,
"loss": 4.1374,
"step": 142200
},
{
"epoch": 5.691772329106835,
"grad_norm": 0.42194467782974243,
"learning_rate": 0.001,
"loss": 4.1339,
"step": 142300
},
{
"epoch": 5.695772169113235,
"grad_norm": 0.49403807520866394,
"learning_rate": 0.001,
"loss": 4.1373,
"step": 142400
},
{
"epoch": 5.699772009119635,
"grad_norm": 0.496571809053421,
"learning_rate": 0.001,
"loss": 4.1376,
"step": 142500
},
{
"epoch": 5.703771849126035,
"grad_norm": 0.43960630893707275,
"learning_rate": 0.001,
"loss": 4.132,
"step": 142600
},
{
"epoch": 5.707771689132435,
"grad_norm": 0.43595102429389954,
"learning_rate": 0.001,
"loss": 4.1372,
"step": 142700
},
{
"epoch": 5.711771529138835,
"grad_norm": 0.421332448720932,
"learning_rate": 0.001,
"loss": 4.139,
"step": 142800
},
{
"epoch": 5.7157713691452345,
"grad_norm": 0.4697113037109375,
"learning_rate": 0.001,
"loss": 4.1363,
"step": 142900
},
{
"epoch": 5.719771209151634,
"grad_norm": 0.4212019443511963,
"learning_rate": 0.001,
"loss": 4.1385,
"step": 143000
},
{
"epoch": 5.723771049158033,
"grad_norm": 0.5039213299751282,
"learning_rate": 0.001,
"loss": 4.1376,
"step": 143100
},
{
"epoch": 5.727770889164433,
"grad_norm": 0.4202103614807129,
"learning_rate": 0.001,
"loss": 4.139,
"step": 143200
},
{
"epoch": 5.731770729170833,
"grad_norm": 0.48808401823043823,
"learning_rate": 0.001,
"loss": 4.1343,
"step": 143300
},
{
"epoch": 5.735770569177233,
"grad_norm": 0.484749972820282,
"learning_rate": 0.001,
"loss": 4.1377,
"step": 143400
},
{
"epoch": 5.739770409183633,
"grad_norm": 0.37245190143585205,
"learning_rate": 0.001,
"loss": 4.1386,
"step": 143500
},
{
"epoch": 5.743770249190033,
"grad_norm": 0.49025431275367737,
"learning_rate": 0.001,
"loss": 4.1392,
"step": 143600
},
{
"epoch": 5.7477700891964325,
"grad_norm": 0.4268828332424164,
"learning_rate": 0.001,
"loss": 4.1373,
"step": 143700
},
{
"epoch": 5.7517699292028315,
"grad_norm": 0.4585922360420227,
"learning_rate": 0.001,
"loss": 4.137,
"step": 143800
},
{
"epoch": 5.755769769209231,
"grad_norm": 0.47988182306289673,
"learning_rate": 0.001,
"loss": 4.1351,
"step": 143900
},
{
"epoch": 5.759769609215631,
"grad_norm": 0.45885664224624634,
"learning_rate": 0.001,
"loss": 4.1384,
"step": 144000
},
{
"epoch": 5.763769449222031,
"grad_norm": 0.474288672208786,
"learning_rate": 0.001,
"loss": 4.139,
"step": 144100
},
{
"epoch": 5.767769289228431,
"grad_norm": 0.4917161464691162,
"learning_rate": 0.001,
"loss": 4.1369,
"step": 144200
},
{
"epoch": 5.771769129234831,
"grad_norm": 0.46606698632240295,
"learning_rate": 0.001,
"loss": 4.1322,
"step": 144300
},
{
"epoch": 5.775768969241231,
"grad_norm": 0.49236711859703064,
"learning_rate": 0.001,
"loss": 4.1371,
"step": 144400
},
{
"epoch": 5.7797688092476305,
"grad_norm": 0.48581868410110474,
"learning_rate": 0.001,
"loss": 4.1363,
"step": 144500
},
{
"epoch": 5.78376864925403,
"grad_norm": 0.44188404083251953,
"learning_rate": 0.001,
"loss": 4.1348,
"step": 144600
},
{
"epoch": 5.787768489260429,
"grad_norm": 0.5125553011894226,
"learning_rate": 0.001,
"loss": 4.1374,
"step": 144700
},
{
"epoch": 5.791768329266829,
"grad_norm": 0.3982478380203247,
"learning_rate": 0.001,
"loss": 4.1374,
"step": 144800
},
{
"epoch": 5.795768169273229,
"grad_norm": 0.4386448860168457,
"learning_rate": 0.001,
"loss": 4.1338,
"step": 144900
},
{
"epoch": 5.799768009279629,
"grad_norm": 0.4385557174682617,
"learning_rate": 0.001,
"loss": 4.1355,
"step": 145000
},
{
"epoch": 5.803767849286029,
"grad_norm": 0.4551478624343872,
"learning_rate": 0.001,
"loss": 4.134,
"step": 145100
},
{
"epoch": 5.807767689292429,
"grad_norm": 0.4078340530395508,
"learning_rate": 0.001,
"loss": 4.1342,
"step": 145200
},
{
"epoch": 5.8117675292988284,
"grad_norm": 0.4332394003868103,
"learning_rate": 0.001,
"loss": 4.1354,
"step": 145300
},
{
"epoch": 5.815767369305227,
"grad_norm": 0.3897719979286194,
"learning_rate": 0.001,
"loss": 4.1364,
"step": 145400
},
{
"epoch": 5.819767209311627,
"grad_norm": 0.45064935088157654,
"learning_rate": 0.001,
"loss": 4.1339,
"step": 145500
},
{
"epoch": 5.823767049318027,
"grad_norm": 0.4647873044013977,
"learning_rate": 0.001,
"loss": 4.1335,
"step": 145600
},
{
"epoch": 5.827766889324427,
"grad_norm": 0.4528816342353821,
"learning_rate": 0.001,
"loss": 4.1351,
"step": 145700
},
{
"epoch": 5.831766729330827,
"grad_norm": 0.38677456974983215,
"learning_rate": 0.001,
"loss": 4.1388,
"step": 145800
},
{
"epoch": 5.835766569337227,
"grad_norm": 0.4616670608520508,
"learning_rate": 0.001,
"loss": 4.1353,
"step": 145900
},
{
"epoch": 5.839766409343627,
"grad_norm": 0.4020819067955017,
"learning_rate": 0.001,
"loss": 4.1376,
"step": 146000
},
{
"epoch": 5.8437662493500255,
"grad_norm": 0.4845848083496094,
"learning_rate": 0.001,
"loss": 4.1409,
"step": 146100
},
{
"epoch": 5.847766089356425,
"grad_norm": 0.40645313262939453,
"learning_rate": 0.001,
"loss": 4.1372,
"step": 146200
},
{
"epoch": 5.851765929362825,
"grad_norm": 0.37546342611312866,
"learning_rate": 0.001,
"loss": 4.1342,
"step": 146300
},
{
"epoch": 5.855765769369225,
"grad_norm": 0.406170517206192,
"learning_rate": 0.001,
"loss": 4.1333,
"step": 146400
},
{
"epoch": 5.859765609375625,
"grad_norm": 0.5377382636070251,
"learning_rate": 0.001,
"loss": 4.14,
"step": 146500
},
{
"epoch": 5.863765449382025,
"grad_norm": 0.45157358050346375,
"learning_rate": 0.001,
"loss": 4.1341,
"step": 146600
},
{
"epoch": 5.867765289388425,
"grad_norm": 0.6039636731147766,
"learning_rate": 0.001,
"loss": 4.1389,
"step": 146700
},
{
"epoch": 5.8717651293948245,
"grad_norm": 0.4240739643573761,
"learning_rate": 0.001,
"loss": 4.1356,
"step": 146800
},
{
"epoch": 5.875764969401224,
"grad_norm": 0.42058026790618896,
"learning_rate": 0.001,
"loss": 4.1364,
"step": 146900
},
{
"epoch": 5.879764809407623,
"grad_norm": 0.454563170671463,
"learning_rate": 0.001,
"loss": 4.138,
"step": 147000
},
{
"epoch": 5.883764649414023,
"grad_norm": 0.5056443214416504,
"learning_rate": 0.001,
"loss": 4.1388,
"step": 147100
},
{
"epoch": 5.887764489420423,
"grad_norm": 0.44132763147354126,
"learning_rate": 0.001,
"loss": 4.1325,
"step": 147200
},
{
"epoch": 5.891764329426823,
"grad_norm": 0.4522813856601715,
"learning_rate": 0.001,
"loss": 4.137,
"step": 147300
},
{
"epoch": 5.895764169433223,
"grad_norm": 0.36617109179496765,
"learning_rate": 0.001,
"loss": 4.1326,
"step": 147400
},
{
"epoch": 5.899764009439623,
"grad_norm": 0.4096498191356659,
"learning_rate": 0.001,
"loss": 4.1364,
"step": 147500
},
{
"epoch": 5.9037638494460225,
"grad_norm": 0.3995516300201416,
"learning_rate": 0.001,
"loss": 4.1374,
"step": 147600
},
{
"epoch": 5.9077636894524215,
"grad_norm": 0.5837684869766235,
"learning_rate": 0.001,
"loss": 4.1337,
"step": 147700
},
{
"epoch": 5.911763529458821,
"grad_norm": 0.4246392548084259,
"learning_rate": 0.001,
"loss": 4.1349,
"step": 147800
},
{
"epoch": 5.915763369465221,
"grad_norm": 0.480863094329834,
"learning_rate": 0.001,
"loss": 4.1365,
"step": 147900
},
{
"epoch": 5.919763209471621,
"grad_norm": 0.3852327764034271,
"learning_rate": 0.001,
"loss": 4.1358,
"step": 148000
},
{
"epoch": 5.923763049478021,
"grad_norm": 0.4895519018173218,
"learning_rate": 0.001,
"loss": 4.1393,
"step": 148100
},
{
"epoch": 5.927762889484421,
"grad_norm": 0.517063319683075,
"learning_rate": 0.001,
"loss": 4.137,
"step": 148200
},
{
"epoch": 5.931762729490821,
"grad_norm": 0.47970083355903625,
"learning_rate": 0.001,
"loss": 4.1403,
"step": 148300
},
{
"epoch": 5.9357625694972205,
"grad_norm": 0.4487200081348419,
"learning_rate": 0.001,
"loss": 4.135,
"step": 148400
},
{
"epoch": 5.93976240950362,
"grad_norm": 0.46553564071655273,
"learning_rate": 0.001,
"loss": 4.1392,
"step": 148500
},
{
"epoch": 5.943762249510019,
"grad_norm": 0.39696386456489563,
"learning_rate": 0.001,
"loss": 4.1334,
"step": 148600
},
{
"epoch": 5.947762089516419,
"grad_norm": 0.3962916433811188,
"learning_rate": 0.001,
"loss": 4.1388,
"step": 148700
},
{
"epoch": 5.951761929522819,
"grad_norm": 0.5088990926742554,
"learning_rate": 0.001,
"loss": 4.1363,
"step": 148800
},
{
"epoch": 5.955761769529219,
"grad_norm": 0.5045955777168274,
"learning_rate": 0.001,
"loss": 4.1367,
"step": 148900
},
{
"epoch": 5.959761609535619,
"grad_norm": 0.4137150049209595,
"learning_rate": 0.001,
"loss": 4.1347,
"step": 149000
},
{
"epoch": 5.963761449542019,
"grad_norm": 0.4232303202152252,
"learning_rate": 0.001,
"loss": 4.1397,
"step": 149100
},
{
"epoch": 5.9677612895484184,
"grad_norm": 0.4458197057247162,
"learning_rate": 0.001,
"loss": 4.1358,
"step": 149200
},
{
"epoch": 5.971761129554817,
"grad_norm": 0.4045810103416443,
"learning_rate": 0.001,
"loss": 4.1314,
"step": 149300
},
{
"epoch": 5.975760969561217,
"grad_norm": 0.45485568046569824,
"learning_rate": 0.001,
"loss": 4.1348,
"step": 149400
},
{
"epoch": 5.979760809567617,
"grad_norm": 0.4166460335254669,
"learning_rate": 0.001,
"loss": 4.1394,
"step": 149500
},
{
"epoch": 5.983760649574017,
"grad_norm": 0.40538185834884644,
"learning_rate": 0.001,
"loss": 4.136,
"step": 149600
},
{
"epoch": 5.987760489580417,
"grad_norm": 0.4489404857158661,
"learning_rate": 0.001,
"loss": 4.1382,
"step": 149700
},
{
"epoch": 5.991760329586817,
"grad_norm": 0.47682425379753113,
"learning_rate": 0.001,
"loss": 4.134,
"step": 149800
},
{
"epoch": 5.995760169593217,
"grad_norm": 0.5068487524986267,
"learning_rate": 0.001,
"loss": 4.1346,
"step": 149900
},
{
"epoch": 5.9997600095996155,
"grad_norm": 0.4409950077533722,
"learning_rate": 0.001,
"loss": 4.1359,
"step": 150000
}
],
"logging_steps": 100,
"max_steps": 150000,
"num_input_tokens_seen": 0,
"num_train_epochs": 6,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 3.59971946496e+18,
"train_batch_size": 64,
"trial_name": null,
"trial_params": null
}