{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 99.5475113122172,
"eval_steps": 20000,
"global_step": 308000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.03232062055591468,
"grad_norm": 9.332721710205078,
"learning_rate": 9.900000000000002e-06,
"loss": 4.2426,
"step": 100
},
{
"epoch": 0.06464124111182935,
"grad_norm": 8.882152557373047,
"learning_rate": 1.9900000000000003e-05,
"loss": 4.0765,
"step": 200
},
{
"epoch": 0.09696186166774402,
"grad_norm": 15.724532127380371,
"learning_rate": 2.9900000000000002e-05,
"loss": 4.0268,
"step": 300
},
{
"epoch": 0.1292824822236587,
"grad_norm": 7.888890266418457,
"learning_rate": 3.99e-05,
"loss": 3.9902,
"step": 400
},
{
"epoch": 0.16160310277957338,
"grad_norm": 10.950591087341309,
"learning_rate": 4.99e-05,
"loss": 3.9292,
"step": 500
},
{
"epoch": 0.19392372333548805,
"grad_norm": 16.338972091674805,
"learning_rate": 5.9900000000000006e-05,
"loss": 3.8515,
"step": 600
},
{
"epoch": 0.22624434389140272,
"grad_norm": 5.990420818328857,
"learning_rate": 6.99e-05,
"loss": 3.8284,
"step": 700
},
{
"epoch": 0.2585649644473174,
"grad_norm": 9.774703025817871,
"learning_rate": 7.99e-05,
"loss": 3.7876,
"step": 800
},
{
"epoch": 0.2908855850032321,
"grad_norm": 7.742414951324463,
"learning_rate": 8.989999999999999e-05,
"loss": 3.7472,
"step": 900
},
{
"epoch": 0.32320620555914675,
"grad_norm": 4.4023284912109375,
"learning_rate": 9.99e-05,
"loss": 3.7163,
"step": 1000
},
{
"epoch": 0.3555268261150614,
"grad_norm": 4.680882453918457,
"learning_rate": 0.0001099,
"loss": 3.6804,
"step": 1100
},
{
"epoch": 0.3878474466709761,
"grad_norm": 4.001336574554443,
"learning_rate": 0.00011990000000000001,
"loss": 3.6462,
"step": 1200
},
{
"epoch": 0.42016806722689076,
"grad_norm": 5.962468147277832,
"learning_rate": 0.00012989999999999999,
"loss": 3.6611,
"step": 1300
},
{
"epoch": 0.45248868778280543,
"grad_norm": 3.200063467025757,
"learning_rate": 0.0001399,
"loss": 3.6355,
"step": 1400
},
{
"epoch": 0.4848093083387201,
"grad_norm": 4.732622146606445,
"learning_rate": 0.0001499,
"loss": 3.6127,
"step": 1500
},
{
"epoch": 0.5171299288946348,
"grad_norm": 3.1570827960968018,
"learning_rate": 0.00015989999999999998,
"loss": 3.6142,
"step": 1600
},
{
"epoch": 0.5494505494505495,
"grad_norm": 4.7870073318481445,
"learning_rate": 0.0001699,
"loss": 3.5934,
"step": 1700
},
{
"epoch": 0.5817711700064642,
"grad_norm": 3.6958024501800537,
"learning_rate": 0.0001799,
"loss": 3.5961,
"step": 1800
},
{
"epoch": 0.6140917905623788,
"grad_norm": 3.7356350421905518,
"learning_rate": 0.0001899,
"loss": 3.6097,
"step": 1900
},
{
"epoch": 0.6464124111182935,
"grad_norm": 3.8976237773895264,
"learning_rate": 0.0001999,
"loss": 3.5555,
"step": 2000
},
{
"epoch": 0.6787330316742082,
"grad_norm": 3.5063016414642334,
"learning_rate": 0.0002099,
"loss": 3.5528,
"step": 2100
},
{
"epoch": 0.7110536522301228,
"grad_norm": 3.2186264991760254,
"learning_rate": 0.0002199,
"loss": 3.5596,
"step": 2200
},
{
"epoch": 0.7433742727860375,
"grad_norm": 3.2118473052978516,
"learning_rate": 0.0002299,
"loss": 3.5283,
"step": 2300
},
{
"epoch": 0.7756948933419522,
"grad_norm": 3.3671634197235107,
"learning_rate": 0.0002399,
"loss": 3.5257,
"step": 2400
},
{
"epoch": 0.8080155138978669,
"grad_norm": 4.391268730163574,
"learning_rate": 0.0002499,
"loss": 3.5231,
"step": 2500
},
{
"epoch": 0.8403361344537815,
"grad_norm": 2.7105960845947266,
"learning_rate": 0.00025990000000000003,
"loss": 3.522,
"step": 2600
},
{
"epoch": 0.8726567550096962,
"grad_norm": 3.373960494995117,
"learning_rate": 0.0002699,
"loss": 3.5354,
"step": 2700
},
{
"epoch": 0.9049773755656109,
"grad_norm": 2.757404088973999,
"learning_rate": 0.0002799,
"loss": 3.5267,
"step": 2800
},
{
"epoch": 0.9372979961215255,
"grad_norm": 4.519193649291992,
"learning_rate": 0.0002899,
"loss": 3.501,
"step": 2900
},
{
"epoch": 0.9696186166774402,
"grad_norm": 4.307316780090332,
"learning_rate": 0.0002999,
"loss": 3.4861,
"step": 3000
},
{
"epoch": 1.0019392372333549,
"grad_norm": 3.6178064346313477,
"learning_rate": 0.0003099,
"loss": 3.4992,
"step": 3100
},
{
"epoch": 1.0342598577892697,
"grad_norm": 1.7168558835983276,
"learning_rate": 0.0003199,
"loss": 3.4419,
"step": 3200
},
{
"epoch": 1.0665804783451842,
"grad_norm": 1.5993854999542236,
"learning_rate": 0.00032990000000000005,
"loss": 3.4434,
"step": 3300
},
{
"epoch": 1.098901098901099,
"grad_norm": 1.2065600156784058,
"learning_rate": 0.00033989999999999997,
"loss": 3.4292,
"step": 3400
},
{
"epoch": 1.1312217194570136,
"grad_norm": 1.1300657987594604,
"learning_rate": 0.0003499,
"loss": 3.4418,
"step": 3500
},
{
"epoch": 1.1635423400129283,
"grad_norm": 2.4604320526123047,
"learning_rate": 0.0003599,
"loss": 3.4454,
"step": 3600
},
{
"epoch": 1.195862960568843,
"grad_norm": 5.035538196563721,
"learning_rate": 0.0003699,
"loss": 3.4292,
"step": 3700
},
{
"epoch": 1.2281835811247577,
"grad_norm": 1.4227688312530518,
"learning_rate": 0.0003799,
"loss": 3.4257,
"step": 3800
},
{
"epoch": 1.2605042016806722,
"grad_norm": 1.5160913467407227,
"learning_rate": 0.00038990000000000004,
"loss": 3.4242,
"step": 3900
},
{
"epoch": 1.292824822236587,
"grad_norm": 1.1818920373916626,
"learning_rate": 0.00039989999999999996,
"loss": 3.4004,
"step": 4000
},
{
"epoch": 1.3251454427925016,
"grad_norm": 1.411624550819397,
"learning_rate": 0.0004099,
"loss": 3.4166,
"step": 4100
},
{
"epoch": 1.3574660633484164,
"grad_norm": 1.5991268157958984,
"learning_rate": 0.0004199,
"loss": 3.4149,
"step": 4200
},
{
"epoch": 1.389786683904331,
"grad_norm": 1.228127360343933,
"learning_rate": 0.0004299,
"loss": 3.4229,
"step": 4300
},
{
"epoch": 1.4221073044602457,
"grad_norm": 7.947666645050049,
"learning_rate": 0.0004399,
"loss": 3.4058,
"step": 4400
},
{
"epoch": 1.4544279250161603,
"grad_norm": 1.4560375213623047,
"learning_rate": 0.00044990000000000004,
"loss": 3.3876,
"step": 4500
},
{
"epoch": 1.486748545572075,
"grad_norm": 1.2084722518920898,
"learning_rate": 0.0004599,
"loss": 3.3963,
"step": 4600
},
{
"epoch": 1.5190691661279896,
"grad_norm": 1.5284830331802368,
"learning_rate": 0.0004699,
"loss": 3.4088,
"step": 4700
},
{
"epoch": 1.5513897866839044,
"grad_norm": 1.1983979940414429,
"learning_rate": 0.0004799,
"loss": 3.4061,
"step": 4800
},
{
"epoch": 1.5837104072398192,
"grad_norm": 1.314408540725708,
"learning_rate": 0.0004899,
"loss": 3.3987,
"step": 4900
},
{
"epoch": 1.6160310277957337,
"grad_norm": 1.5176293849945068,
"learning_rate": 0.0004999000000000001,
"loss": 3.366,
"step": 5000
},
{
"epoch": 1.6483516483516483,
"grad_norm": 1.500085711479187,
"learning_rate": 0.0005099,
"loss": 3.3984,
"step": 5100
},
{
"epoch": 1.680672268907563,
"grad_norm": 1.016550898551941,
"learning_rate": 0.0005199,
"loss": 3.3626,
"step": 5200
},
{
"epoch": 1.7129928894634778,
"grad_norm": 13.165894508361816,
"learning_rate": 0.0005299,
"loss": 3.3609,
"step": 5300
},
{
"epoch": 1.7453135100193924,
"grad_norm": 1.186579942703247,
"learning_rate": 0.0005399000000000001,
"loss": 3.3672,
"step": 5400
},
{
"epoch": 1.777634130575307,
"grad_norm": 1.2896537780761719,
"learning_rate": 0.0005499000000000001,
"loss": 3.3825,
"step": 5500
},
{
"epoch": 1.8099547511312217,
"grad_norm": 1.2675527334213257,
"learning_rate": 0.0005599,
"loss": 3.3698,
"step": 5600
},
{
"epoch": 1.8422753716871365,
"grad_norm": 1.1798584461212158,
"learning_rate": 0.0005698999999999999,
"loss": 3.3624,
"step": 5700
},
{
"epoch": 1.874595992243051,
"grad_norm": 0.8817252516746521,
"learning_rate": 0.0005799,
"loss": 3.3503,
"step": 5800
},
{
"epoch": 1.9069166127989656,
"grad_norm": 1.2770187854766846,
"learning_rate": 0.0005899,
"loss": 3.3655,
"step": 5900
},
{
"epoch": 1.9392372333548804,
"grad_norm": 1.062826156616211,
"learning_rate": 0.0005999,
"loss": 3.3595,
"step": 6000
},
{
"epoch": 1.9715578539107952,
"grad_norm": 0.97618567943573,
"learning_rate": 0.0006099,
"loss": 3.365,
"step": 6100
},
{
"epoch": 2.0038784744667097,
"grad_norm": 0.8138112425804138,
"learning_rate": 0.0006199,
"loss": 3.3674,
"step": 6200
},
{
"epoch": 2.0361990950226243,
"grad_norm": 0.8098726272583008,
"learning_rate": 0.0006299000000000001,
"loss": 3.255,
"step": 6300
},
{
"epoch": 2.0685197155785393,
"grad_norm": 1.1858711242675781,
"learning_rate": 0.0006399,
"loss": 3.2813,
"step": 6400
},
{
"epoch": 2.100840336134454,
"grad_norm": 0.9740011692047119,
"learning_rate": 0.0006499,
"loss": 3.2799,
"step": 6500
},
{
"epoch": 2.1331609566903684,
"grad_norm": 1.2355788946151733,
"learning_rate": 0.0006599,
"loss": 3.2678,
"step": 6600
},
{
"epoch": 2.165481577246283,
"grad_norm": 5.133415222167969,
"learning_rate": 0.0006699000000000001,
"loss": 3.2945,
"step": 6700
},
{
"epoch": 2.197802197802198,
"grad_norm": 1.0777193307876587,
"learning_rate": 0.0006799,
"loss": 3.3022,
"step": 6800
},
{
"epoch": 2.2301228183581125,
"grad_norm": 0.9968545436859131,
"learning_rate": 0.0006899,
"loss": 3.2852,
"step": 6900
},
{
"epoch": 2.262443438914027,
"grad_norm": 1.0664645433425903,
"learning_rate": 0.0006998999999999999,
"loss": 3.2945,
"step": 7000
},
{
"epoch": 2.2947640594699417,
"grad_norm": 0.9292928576469421,
"learning_rate": 0.0007099,
"loss": 3.2809,
"step": 7100
},
{
"epoch": 2.3270846800258567,
"grad_norm": 0.9592123627662659,
"learning_rate": 0.0007199,
"loss": 3.2944,
"step": 7200
},
{
"epoch": 2.3594053005817712,
"grad_norm": 1.028623342514038,
"learning_rate": 0.0007299,
"loss": 3.2946,
"step": 7300
},
{
"epoch": 2.391725921137686,
"grad_norm": 4.865314483642578,
"learning_rate": 0.0007399,
"loss": 3.296,
"step": 7400
},
{
"epoch": 2.4240465416936003,
"grad_norm": 1.256349802017212,
"learning_rate": 0.0007499000000000001,
"loss": 3.2976,
"step": 7500
},
{
"epoch": 2.4563671622495153,
"grad_norm": 1.0199131965637207,
"learning_rate": 0.0007599,
"loss": 3.2802,
"step": 7600
},
{
"epoch": 2.48868778280543,
"grad_norm": 1.946007251739502,
"learning_rate": 0.0007699,
"loss": 3.2834,
"step": 7700
},
{
"epoch": 2.5210084033613445,
"grad_norm": 0.9734399914741516,
"learning_rate": 0.0007799,
"loss": 3.2793,
"step": 7800
},
{
"epoch": 2.553329023917259,
"grad_norm": 0.9436636567115784,
"learning_rate": 0.0007899000000000001,
"loss": 3.29,
"step": 7900
},
{
"epoch": 2.585649644473174,
"grad_norm": 0.9262025952339172,
"learning_rate": 0.0007999000000000001,
"loss": 3.2756,
"step": 8000
},
{
"epoch": 2.6179702650290886,
"grad_norm": 1.195101022720337,
"learning_rate": 0.0008099,
"loss": 3.2799,
"step": 8100
},
{
"epoch": 2.650290885585003,
"grad_norm": 0.9717804193496704,
"learning_rate": 0.0008198999999999999,
"loss": 3.2714,
"step": 8200
},
{
"epoch": 2.682611506140918,
"grad_norm": 1.1211719512939453,
"learning_rate": 0.0008299,
"loss": 3.287,
"step": 8300
},
{
"epoch": 2.7149321266968327,
"grad_norm": 1.057012915611267,
"learning_rate": 0.0008399,
"loss": 3.2752,
"step": 8400
},
{
"epoch": 2.7472527472527473,
"grad_norm": 1.0968471765518188,
"learning_rate": 0.0008499,
"loss": 3.2719,
"step": 8500
},
{
"epoch": 2.779573367808662,
"grad_norm": 1.0198901891708374,
"learning_rate": 0.0008599,
"loss": 3.2522,
"step": 8600
},
{
"epoch": 2.8118939883645764,
"grad_norm": 1.330259919166565,
"learning_rate": 0.0008699000000000001,
"loss": 3.2589,
"step": 8700
},
{
"epoch": 2.8442146089204914,
"grad_norm": 0.8509685397148132,
"learning_rate": 0.0008799000000000001,
"loss": 3.2736,
"step": 8800
},
{
"epoch": 2.876535229476406,
"grad_norm": 1.174782633781433,
"learning_rate": 0.0008899,
"loss": 3.2597,
"step": 8900
},
{
"epoch": 2.9088558500323205,
"grad_norm": 1.1556833982467651,
"learning_rate": 0.0008999,
"loss": 3.2822,
"step": 9000
},
{
"epoch": 2.9411764705882355,
"grad_norm": 1.1285648345947266,
"learning_rate": 0.0009099,
"loss": 3.2897,
"step": 9100
},
{
"epoch": 2.97349709114415,
"grad_norm": 0.9292157292366028,
"learning_rate": 0.0009199000000000001,
"loss": 3.2389,
"step": 9200
},
{
"epoch": 3.0058177117000646,
"grad_norm": 0.9960983991622925,
"learning_rate": 0.0009299,
"loss": 3.2608,
"step": 9300
},
{
"epoch": 3.038138332255979,
"grad_norm": 1.0418298244476318,
"learning_rate": 0.0009399,
"loss": 3.1552,
"step": 9400
},
{
"epoch": 3.070458952811894,
"grad_norm": 0.8377931714057922,
"learning_rate": 0.0009498999999999999,
"loss": 3.1847,
"step": 9500
},
{
"epoch": 3.1027795733678087,
"grad_norm": 1.1886683702468872,
"learning_rate": 0.0009599,
"loss": 3.1789,
"step": 9600
},
{
"epoch": 3.1351001939237233,
"grad_norm": 0.9381577968597412,
"learning_rate": 0.0009699,
"loss": 3.166,
"step": 9700
},
{
"epoch": 3.167420814479638,
"grad_norm": 0.9787984490394592,
"learning_rate": 0.0009799,
"loss": 3.1831,
"step": 9800
},
{
"epoch": 3.199741435035553,
"grad_norm": 2.247471332550049,
"learning_rate": 0.0009899,
"loss": 3.2012,
"step": 9900
},
{
"epoch": 3.2320620555914674,
"grad_norm": 0.855204701423645,
"learning_rate": 0.0009999,
"loss": 3.1842,
"step": 10000
},
{
"epoch": 3.264382676147382,
"grad_norm": 1.1390490531921387,
"learning_rate": 0.001,
"loss": 3.1923,
"step": 10100
},
{
"epoch": 3.2967032967032965,
"grad_norm": 1.1109646558761597,
"learning_rate": 0.001,
"loss": 3.192,
"step": 10200
},
{
"epoch": 3.3290239172592115,
"grad_norm": 1.2080135345458984,
"learning_rate": 0.001,
"loss": 3.1771,
"step": 10300
},
{
"epoch": 3.361344537815126,
"grad_norm": 1.106696367263794,
"learning_rate": 0.001,
"loss": 3.1865,
"step": 10400
},
{
"epoch": 3.3936651583710407,
"grad_norm": 1.0035741329193115,
"learning_rate": 0.001,
"loss": 3.1756,
"step": 10500
},
{
"epoch": 3.425985778926955,
"grad_norm": 1.0501046180725098,
"learning_rate": 0.001,
"loss": 3.1716,
"step": 10600
},
{
"epoch": 3.45830639948287,
"grad_norm": 0.8912081122398376,
"learning_rate": 0.001,
"loss": 3.1657,
"step": 10700
},
{
"epoch": 3.490627020038785,
"grad_norm": 1.302748680114746,
"learning_rate": 0.001,
"loss": 3.1769,
"step": 10800
},
{
"epoch": 3.5229476405946993,
"grad_norm": 0.816489577293396,
"learning_rate": 0.001,
"loss": 3.201,
"step": 10900
},
{
"epoch": 3.555268261150614,
"grad_norm": 1.2402598857879639,
"learning_rate": 0.001,
"loss": 3.1804,
"step": 11000
},
{
"epoch": 3.587588881706529,
"grad_norm": 1.3531599044799805,
"learning_rate": 0.001,
"loss": 3.1921,
"step": 11100
},
{
"epoch": 3.6199095022624435,
"grad_norm": 2.2036519050598145,
"learning_rate": 0.001,
"loss": 3.1774,
"step": 11200
},
{
"epoch": 3.652230122818358,
"grad_norm": 1.7961952686309814,
"learning_rate": 0.001,
"loss": 3.1717,
"step": 11300
},
{
"epoch": 3.684550743374273,
"grad_norm": 1.176538348197937,
"learning_rate": 0.001,
"loss": 3.171,
"step": 11400
},
{
"epoch": 3.7168713639301876,
"grad_norm": 1.1532666683197021,
"learning_rate": 0.001,
"loss": 3.1913,
"step": 11500
},
{
"epoch": 3.749191984486102,
"grad_norm": 0.9709277749061584,
"learning_rate": 0.001,
"loss": 3.1926,
"step": 11600
},
{
"epoch": 3.7815126050420167,
"grad_norm": 1.0044294595718384,
"learning_rate": 0.001,
"loss": 3.19,
"step": 11700
},
{
"epoch": 3.8138332255979313,
"grad_norm": 0.8759526610374451,
"learning_rate": 0.001,
"loss": 3.1719,
"step": 11800
},
{
"epoch": 3.8461538461538463,
"grad_norm": 0.9571463465690613,
"learning_rate": 0.001,
"loss": 3.1798,
"step": 11900
},
{
"epoch": 3.878474466709761,
"grad_norm": 1.0519996881484985,
"learning_rate": 0.001,
"loss": 3.1891,
"step": 12000
},
{
"epoch": 3.9107950872656754,
"grad_norm": 1.0874314308166504,
"learning_rate": 0.001,
"loss": 3.177,
"step": 12100
},
{
"epoch": 3.9431157078215904,
"grad_norm": 1.134121298789978,
"learning_rate": 0.001,
"loss": 3.1724,
"step": 12200
},
{
"epoch": 3.975436328377505,
"grad_norm": 1.093509554862976,
"learning_rate": 0.001,
"loss": 3.1751,
"step": 12300
},
{
"epoch": 4.0077569489334195,
"grad_norm": 0.838979184627533,
"learning_rate": 0.001,
"loss": 3.1506,
"step": 12400
},
{
"epoch": 4.040077569489334,
"grad_norm": 0.9417329430580139,
"learning_rate": 0.001,
"loss": 3.0295,
"step": 12500
},
{
"epoch": 4.072398190045249,
"grad_norm": 0.941433310508728,
"learning_rate": 0.001,
"loss": 3.0514,
"step": 12600
},
{
"epoch": 4.104718810601163,
"grad_norm": 1.1772059202194214,
"learning_rate": 0.001,
"loss": 3.0639,
"step": 12700
},
{
"epoch": 4.137039431157079,
"grad_norm": 0.9115270376205444,
"learning_rate": 0.001,
"loss": 3.0563,
"step": 12800
},
{
"epoch": 4.169360051712993,
"grad_norm": 1.0796817541122437,
"learning_rate": 0.001,
"loss": 3.07,
"step": 12900
},
{
"epoch": 4.201680672268908,
"grad_norm": 0.9457636475563049,
"learning_rate": 0.001,
"loss": 3.0671,
"step": 13000
},
{
"epoch": 4.234001292824822,
"grad_norm": 1.6689468622207642,
"learning_rate": 0.001,
"loss": 3.0806,
"step": 13100
},
{
"epoch": 4.266321913380737,
"grad_norm": 0.8102026581764221,
"learning_rate": 0.001,
"loss": 3.0564,
"step": 13200
},
{
"epoch": 4.298642533936651,
"grad_norm": 0.8251219391822815,
"learning_rate": 0.001,
"loss": 3.0567,
"step": 13300
},
{
"epoch": 4.330963154492566,
"grad_norm": 0.921691358089447,
"learning_rate": 0.001,
"loss": 3.0732,
"step": 13400
},
{
"epoch": 4.3632837750484805,
"grad_norm": 1.216403841972351,
"learning_rate": 0.001,
"loss": 3.0796,
"step": 13500
},
{
"epoch": 4.395604395604396,
"grad_norm": 1.2740323543548584,
"learning_rate": 0.001,
"loss": 3.0667,
"step": 13600
},
{
"epoch": 4.4279250161603105,
"grad_norm": 1.0430225133895874,
"learning_rate": 0.001,
"loss": 3.0521,
"step": 13700
},
{
"epoch": 4.460245636716225,
"grad_norm": 0.8932771682739258,
"learning_rate": 0.001,
"loss": 3.0704,
"step": 13800
},
{
"epoch": 4.49256625727214,
"grad_norm": 1.011426568031311,
"learning_rate": 0.001,
"loss": 3.0857,
"step": 13900
},
{
"epoch": 4.524886877828054,
"grad_norm": 1.2008609771728516,
"learning_rate": 0.001,
"loss": 3.0533,
"step": 14000
},
{
"epoch": 4.557207498383969,
"grad_norm": 0.847837507724762,
"learning_rate": 0.001,
"loss": 3.0694,
"step": 14100
},
{
"epoch": 4.589528118939883,
"grad_norm": 0.951865017414093,
"learning_rate": 0.001,
"loss": 3.084,
"step": 14200
},
{
"epoch": 4.621848739495798,
"grad_norm": 0.972174882888794,
"learning_rate": 0.001,
"loss": 3.0808,
"step": 14300
},
{
"epoch": 4.654169360051713,
"grad_norm": 1.0798579454421997,
"learning_rate": 0.001,
"loss": 3.0889,
"step": 14400
},
{
"epoch": 4.686489980607628,
"grad_norm": 0.7689244747161865,
"learning_rate": 0.001,
"loss": 3.0891,
"step": 14500
},
{
"epoch": 4.7188106011635425,
"grad_norm": 1.4271835088729858,
"learning_rate": 0.001,
"loss": 3.0655,
"step": 14600
},
{
"epoch": 4.751131221719457,
"grad_norm": 1.0069650411605835,
"learning_rate": 0.001,
"loss": 3.0706,
"step": 14700
},
{
"epoch": 4.783451842275372,
"grad_norm": 0.9084206223487854,
"learning_rate": 0.001,
"loss": 3.057,
"step": 14800
},
{
"epoch": 4.815772462831286,
"grad_norm": 1.3670860528945923,
"learning_rate": 0.001,
"loss": 3.0592,
"step": 14900
},
{
"epoch": 4.848093083387201,
"grad_norm": 0.9387325048446655,
"learning_rate": 0.001,
"loss": 3.0789,
"step": 15000
},
{
"epoch": 4.880413703943116,
"grad_norm": 0.8084505200386047,
"learning_rate": 0.001,
"loss": 3.0631,
"step": 15100
},
{
"epoch": 4.912734324499031,
"grad_norm": 1.0331807136535645,
"learning_rate": 0.001,
"loss": 3.0723,
"step": 15200
},
{
"epoch": 4.945054945054945,
"grad_norm": 0.9408292770385742,
"learning_rate": 0.001,
"loss": 3.0616,
"step": 15300
},
{
"epoch": 4.97737556561086,
"grad_norm": 0.9665517807006836,
"learning_rate": 0.001,
"loss": 3.0801,
"step": 15400
},
{
"epoch": 5.009696186166774,
"grad_norm": 1.1656768321990967,
"learning_rate": 0.001,
"loss": 3.0375,
"step": 15500
},
{
"epoch": 5.042016806722689,
"grad_norm": 0.9300348162651062,
"learning_rate": 0.001,
"loss": 2.9091,
"step": 15600
},
{
"epoch": 5.0743374272786035,
"grad_norm": 1.0597182512283325,
"learning_rate": 0.001,
"loss": 2.9504,
"step": 15700
},
{
"epoch": 5.106658047834518,
"grad_norm": 1.2280610799789429,
"learning_rate": 0.001,
"loss": 2.9492,
"step": 15800
},
{
"epoch": 5.1389786683904335,
"grad_norm": 1.0233289003372192,
"learning_rate": 0.001,
"loss": 2.9551,
"step": 15900
},
{
"epoch": 5.171299288946348,
"grad_norm": 0.941676676273346,
"learning_rate": 0.001,
"loss": 2.9642,
"step": 16000
},
{
"epoch": 5.203619909502263,
"grad_norm": 0.7993482351303101,
"learning_rate": 0.001,
"loss": 2.9547,
"step": 16100
},
{
"epoch": 5.235940530058177,
"grad_norm": 0.9896591305732727,
"learning_rate": 0.001,
"loss": 2.9548,
"step": 16200
},
{
"epoch": 5.268261150614092,
"grad_norm": 0.9227080345153809,
"learning_rate": 0.001,
"loss": 2.9812,
"step": 16300
},
{
"epoch": 5.300581771170006,
"grad_norm": 1.2044575214385986,
"learning_rate": 0.001,
"loss": 2.9787,
"step": 16400
},
{
"epoch": 5.332902391725921,
"grad_norm": 1.003462791442871,
"learning_rate": 0.001,
"loss": 2.9652,
"step": 16500
},
{
"epoch": 5.365223012281835,
"grad_norm": 1.0406891107559204,
"learning_rate": 0.001,
"loss": 2.963,
"step": 16600
},
{
"epoch": 5.397543632837751,
"grad_norm": 1.0937559604644775,
"learning_rate": 0.001,
"loss": 2.9639,
"step": 16700
},
{
"epoch": 5.429864253393665,
"grad_norm": 0.8680944442749023,
"learning_rate": 0.001,
"loss": 2.9819,
"step": 16800
},
{
"epoch": 5.46218487394958,
"grad_norm": 0.917489230632782,
"learning_rate": 0.001,
"loss": 2.9663,
"step": 16900
},
{
"epoch": 5.4945054945054945,
"grad_norm": 0.8815052509307861,
"learning_rate": 0.001,
"loss": 2.9881,
"step": 17000
},
{
"epoch": 5.526826115061409,
"grad_norm": 0.9785053133964539,
"learning_rate": 0.001,
"loss": 2.9701,
"step": 17100
},
{
"epoch": 5.559146735617324,
"grad_norm": 1.2235257625579834,
"learning_rate": 0.001,
"loss": 2.9775,
"step": 17200
},
{
"epoch": 5.591467356173238,
"grad_norm": 0.8558531403541565,
"learning_rate": 0.001,
"loss": 2.9492,
"step": 17300
},
{
"epoch": 5.623787976729153,
"grad_norm": 1.730175495147705,
"learning_rate": 0.001,
"loss": 2.9725,
"step": 17400
},
{
"epoch": 5.656108597285068,
"grad_norm": 0.9976469278335571,
"learning_rate": 0.001,
"loss": 2.9795,
"step": 17500
},
{
"epoch": 5.688429217840983,
"grad_norm": 1.102630853652954,
"learning_rate": 0.001,
"loss": 3.0011,
"step": 17600
},
{
"epoch": 5.720749838396897,
"grad_norm": 0.8655111789703369,
"learning_rate": 0.001,
"loss": 2.982,
"step": 17700
},
{
"epoch": 5.753070458952812,
"grad_norm": 0.9005181193351746,
"learning_rate": 0.001,
"loss": 2.9795,
"step": 17800
},
{
"epoch": 5.785391079508726,
"grad_norm": 0.91997891664505,
"learning_rate": 0.001,
"loss": 2.9668,
"step": 17900
},
{
"epoch": 5.817711700064641,
"grad_norm": 0.9092044234275818,
"learning_rate": 0.001,
"loss": 2.9854,
"step": 18000
},
{
"epoch": 5.850032320620556,
"grad_norm": 1.1681147813796997,
"learning_rate": 0.001,
"loss": 2.9888,
"step": 18100
},
{
"epoch": 5.882352941176471,
"grad_norm": 0.88965904712677,
"learning_rate": 0.001,
"loss": 2.99,
"step": 18200
},
{
"epoch": 5.914673561732386,
"grad_norm": 0.915117084980011,
"learning_rate": 0.001,
"loss": 3.0072,
"step": 18300
},
{
"epoch": 5.9469941822883,
"grad_norm": 0.9954575300216675,
"learning_rate": 0.001,
"loss": 2.9764,
"step": 18400
},
{
"epoch": 5.979314802844215,
"grad_norm": 1.087384581565857,
"learning_rate": 0.001,
"loss": 2.9776,
"step": 18500
},
{
"epoch": 6.011635423400129,
"grad_norm": 0.9920886158943176,
"learning_rate": 0.001,
"loss": 2.9278,
"step": 18600
},
{
"epoch": 6.043956043956044,
"grad_norm": 1.1691513061523438,
"learning_rate": 0.001,
"loss": 2.842,
"step": 18700
},
{
"epoch": 6.076276664511958,
"grad_norm": 1.0943280458450317,
"learning_rate": 0.001,
"loss": 2.8453,
"step": 18800
},
{
"epoch": 6.108597285067873,
"grad_norm": 1.4848939180374146,
"learning_rate": 0.001,
"loss": 2.8522,
"step": 18900
},
{
"epoch": 6.140917905623788,
"grad_norm": 0.8807019591331482,
"learning_rate": 0.001,
"loss": 2.8703,
"step": 19000
},
{
"epoch": 6.173238526179703,
"grad_norm": 1.2009692192077637,
"learning_rate": 0.001,
"loss": 2.8789,
"step": 19100
},
{
"epoch": 6.2055591467356175,
"grad_norm": 1.0970031023025513,
"learning_rate": 0.001,
"loss": 2.8751,
"step": 19200
},
{
"epoch": 6.237879767291532,
"grad_norm": 1.1768124103546143,
"learning_rate": 0.001,
"loss": 2.886,
"step": 19300
},
{
"epoch": 6.270200387847447,
"grad_norm": 0.8989688754081726,
"learning_rate": 0.001,
"loss": 2.8944,
"step": 19400
},
{
"epoch": 6.302521008403361,
"grad_norm": 1.2694783210754395,
"learning_rate": 0.001,
"loss": 2.8668,
"step": 19500
},
{
"epoch": 6.334841628959276,
"grad_norm": 0.9729022979736328,
"learning_rate": 0.001,
"loss": 2.8705,
"step": 19600
},
{
"epoch": 6.36716224951519,
"grad_norm": 1.0138781070709229,
"learning_rate": 0.001,
"loss": 2.8859,
"step": 19700
},
{
"epoch": 6.399482870071106,
"grad_norm": 0.9176075458526611,
"learning_rate": 0.001,
"loss": 2.9032,
"step": 19800
},
{
"epoch": 6.43180349062702,
"grad_norm": 1.1158503293991089,
"learning_rate": 0.001,
"loss": 2.8774,
"step": 19900
},
{
"epoch": 6.464124111182935,
"grad_norm": 0.9626113176345825,
"learning_rate": 0.001,
"loss": 2.8744,
"step": 20000
},
{
"epoch": 6.496444731738849,
"grad_norm": 0.9146256446838379,
"learning_rate": 0.001,
"loss": 2.8945,
"step": 20100
},
{
"epoch": 6.528765352294764,
"grad_norm": 0.9654421806335449,
"learning_rate": 0.001,
"loss": 2.8934,
"step": 20200
},
{
"epoch": 6.5610859728506785,
"grad_norm": 0.9559252262115479,
"learning_rate": 0.001,
"loss": 2.8954,
"step": 20300
},
{
"epoch": 6.593406593406593,
"grad_norm": 0.9600493907928467,
"learning_rate": 0.001,
"loss": 2.8964,
"step": 20400
},
{
"epoch": 6.625727213962508,
"grad_norm": 1.152198314666748,
"learning_rate": 0.001,
"loss": 2.8953,
"step": 20500
},
{
"epoch": 6.658047834518423,
"grad_norm": 0.7821874618530273,
"learning_rate": 0.001,
"loss": 2.8899,
"step": 20600
},
{
"epoch": 6.690368455074338,
"grad_norm": 1.2071696519851685,
"learning_rate": 0.001,
"loss": 2.8832,
"step": 20700
},
{
"epoch": 6.722689075630252,
"grad_norm": 0.9243321418762207,
"learning_rate": 0.001,
"loss": 2.9011,
"step": 20800
},
{
"epoch": 6.755009696186167,
"grad_norm": 0.925390899181366,
"learning_rate": 0.001,
"loss": 2.9143,
"step": 20900
},
{
"epoch": 6.787330316742081,
"grad_norm": 0.9191309213638306,
"learning_rate": 0.001,
"loss": 2.9058,
"step": 21000
},
{
"epoch": 6.819650937297996,
"grad_norm": 0.8833218812942505,
"learning_rate": 0.001,
"loss": 2.8933,
"step": 21100
},
{
"epoch": 6.85197155785391,
"grad_norm": 0.9937705397605896,
"learning_rate": 0.001,
"loss": 2.9203,
"step": 21200
},
{
"epoch": 6.884292178409826,
"grad_norm": 1.2760004997253418,
"learning_rate": 0.001,
"loss": 2.9042,
"step": 21300
},
{
"epoch": 6.91661279896574,
"grad_norm": 1.0547213554382324,
"learning_rate": 0.001,
"loss": 2.9187,
"step": 21400
},
{
"epoch": 6.948933419521655,
"grad_norm": 0.957062840461731,
"learning_rate": 0.001,
"loss": 2.903,
"step": 21500
},
{
"epoch": 6.98125404007757,
"grad_norm": 1.1016453504562378,
"learning_rate": 0.001,
"loss": 2.8896,
"step": 21600
},
{
"epoch": 7.013574660633484,
"grad_norm": 1.3905339241027832,
"learning_rate": 0.001,
"loss": 2.8379,
"step": 21700
},
{
"epoch": 7.045895281189399,
"grad_norm": 1.1392508745193481,
"learning_rate": 0.001,
"loss": 2.7692,
"step": 21800
},
{
"epoch": 7.078215901745313,
"grad_norm": 0.9623212814331055,
"learning_rate": 0.001,
"loss": 2.76,
"step": 21900
},
{
"epoch": 7.110536522301228,
"grad_norm": 1.0762792825698853,
"learning_rate": 0.001,
"loss": 2.7784,
"step": 22000
},
{
"epoch": 7.142857142857143,
"grad_norm": 1.1317200660705566,
"learning_rate": 0.001,
"loss": 2.7783,
"step": 22100
},
{
"epoch": 7.175177763413058,
"grad_norm": 1.379044532775879,
"learning_rate": 0.001,
"loss": 2.7868,
"step": 22200
},
{
"epoch": 7.207498383968972,
"grad_norm": 1.036180019378662,
"learning_rate": 0.001,
"loss": 2.8014,
"step": 22300
},
{
"epoch": 7.239819004524887,
"grad_norm": 1.326994776725769,
"learning_rate": 0.001,
"loss": 2.8077,
"step": 22400
},
{
"epoch": 7.2721396250808015,
"grad_norm": 1.378857135772705,
"learning_rate": 0.001,
"loss": 2.8186,
"step": 22500
},
{
"epoch": 7.304460245636716,
"grad_norm": 1.1402287483215332,
"learning_rate": 0.001,
"loss": 2.8114,
"step": 22600
},
{
"epoch": 7.336780866192631,
"grad_norm": 1.236741304397583,
"learning_rate": 0.001,
"loss": 2.798,
"step": 22700
},
{
"epoch": 7.369101486748546,
"grad_norm": 0.9529298543930054,
"learning_rate": 0.001,
"loss": 2.8223,
"step": 22800
},
{
"epoch": 7.401422107304461,
"grad_norm": 1.272033929824829,
"learning_rate": 0.001,
"loss": 2.8241,
"step": 22900
},
{
"epoch": 7.433742727860375,
"grad_norm": 0.9919891953468323,
"learning_rate": 0.001,
"loss": 2.8106,
"step": 23000
},
{
"epoch": 7.46606334841629,
"grad_norm": 0.9951006770133972,
"learning_rate": 0.001,
"loss": 2.8164,
"step": 23100
},
{
"epoch": 7.498383968972204,
"grad_norm": 1.260886549949646,
"learning_rate": 0.001,
"loss": 2.8238,
"step": 23200
},
{
"epoch": 7.530704589528119,
"grad_norm": 1.0894906520843506,
"learning_rate": 0.001,
"loss": 2.818,
"step": 23300
},
{
"epoch": 7.563025210084033,
"grad_norm": 1.1154838800430298,
"learning_rate": 0.001,
"loss": 2.8312,
"step": 23400
},
{
"epoch": 7.595345830639948,
"grad_norm": 1.0110588073730469,
"learning_rate": 0.001,
"loss": 2.812,
"step": 23500
},
{
"epoch": 7.6276664511958625,
"grad_norm": 0.9715908765792847,
"learning_rate": 0.001,
"loss": 2.819,
"step": 23600
},
{
"epoch": 7.659987071751778,
"grad_norm": 1.0196453332901,
"learning_rate": 0.001,
"loss": 2.8195,
"step": 23700
},
{
"epoch": 7.6923076923076925,
"grad_norm": 1.3575221300125122,
"learning_rate": 0.001,
"loss": 2.8115,
"step": 23800
},
{
"epoch": 7.724628312863607,
"grad_norm": 1.3838183879852295,
"learning_rate": 0.001,
"loss": 2.8206,
"step": 23900
},
{
"epoch": 7.756948933419522,
"grad_norm": 1.2254596948623657,
"learning_rate": 0.001,
"loss": 2.8281,
"step": 24000
},
{
"epoch": 7.789269553975436,
"grad_norm": 1.2324926853179932,
"learning_rate": 0.001,
"loss": 2.8426,
"step": 24100
},
{
"epoch": 7.821590174531351,
"grad_norm": 1.2173677682876587,
"learning_rate": 0.001,
"loss": 2.8355,
"step": 24200
},
{
"epoch": 7.853910795087265,
"grad_norm": 0.9907102584838867,
"learning_rate": 0.001,
"loss": 2.8081,
"step": 24300
},
{
"epoch": 7.886231415643181,
"grad_norm": 1.141242265701294,
"learning_rate": 0.001,
"loss": 2.8234,
"step": 24400
},
{
"epoch": 7.918552036199095,
"grad_norm": 1.286460518836975,
"learning_rate": 0.001,
"loss": 2.8324,
"step": 24500
},
{
"epoch": 7.95087265675501,
"grad_norm": 0.9198762774467468,
"learning_rate": 0.001,
"loss": 2.8671,
"step": 24600
},
{
"epoch": 7.983193277310924,
"grad_norm": 0.9404067397117615,
"learning_rate": 0.001,
"loss": 2.8488,
"step": 24700
},
{
"epoch": 8.015513897866839,
"grad_norm": 1.2536792755126953,
"learning_rate": 0.001,
"loss": 2.7556,
"step": 24800
},
{
"epoch": 8.047834518422754,
"grad_norm": 1.4987441301345825,
"learning_rate": 0.001,
"loss": 2.7025,
"step": 24900
},
{
"epoch": 8.080155138978668,
"grad_norm": 1.1885977983474731,
"learning_rate": 0.001,
"loss": 2.7104,
"step": 25000
},
{
"epoch": 8.112475759534583,
"grad_norm": 1.5676301717758179,
"learning_rate": 0.001,
"loss": 2.7184,
"step": 25100
},
{
"epoch": 8.144796380090497,
"grad_norm": 1.3227053880691528,
"learning_rate": 0.001,
"loss": 2.7353,
"step": 25200
},
{
"epoch": 8.177117000646412,
"grad_norm": 1.1693710088729858,
"learning_rate": 0.001,
"loss": 2.7109,
"step": 25300
},
{
"epoch": 8.209437621202326,
"grad_norm": 1.5500166416168213,
"learning_rate": 0.001,
"loss": 2.7279,
"step": 25400
},
{
"epoch": 8.241758241758241,
"grad_norm": 1.6105555295944214,
"learning_rate": 0.001,
"loss": 2.7238,
"step": 25500
},
{
"epoch": 8.274078862314157,
"grad_norm": 1.3008909225463867,
"learning_rate": 0.001,
"loss": 2.7138,
"step": 25600
},
{
"epoch": 8.306399482870072,
"grad_norm": 1.1506481170654297,
"learning_rate": 0.001,
"loss": 2.7202,
"step": 25700
},
{
"epoch": 8.338720103425986,
"grad_norm": 1.583932876586914,
"learning_rate": 0.001,
"loss": 2.7328,
"step": 25800
},
{
"epoch": 8.371040723981901,
"grad_norm": 1.3606271743774414,
"learning_rate": 0.001,
"loss": 2.7364,
"step": 25900
},
{
"epoch": 8.403361344537815,
"grad_norm": 1.6803429126739502,
"learning_rate": 0.001,
"loss": 2.7109,
"step": 26000
},
{
"epoch": 8.43568196509373,
"grad_norm": 1.6417889595031738,
"learning_rate": 0.001,
"loss": 2.7318,
"step": 26100
},
{
"epoch": 8.468002585649645,
"grad_norm": 1.2529147863388062,
"learning_rate": 0.001,
"loss": 2.7559,
"step": 26200
},
{
"epoch": 8.50032320620556,
"grad_norm": 1.352419137954712,
"learning_rate": 0.001,
"loss": 2.7578,
"step": 26300
},
{
"epoch": 8.532643826761474,
"grad_norm": 1.5327883958816528,
"learning_rate": 0.001,
"loss": 2.7497,
"step": 26400
},
{
"epoch": 8.564964447317388,
"grad_norm": 1.2606614828109741,
"learning_rate": 0.001,
"loss": 2.7531,
"step": 26500
},
{
"epoch": 8.597285067873303,
"grad_norm": 1.340108871459961,
"learning_rate": 0.001,
"loss": 2.76,
"step": 26600
},
{
"epoch": 8.629605688429217,
"grad_norm": 1.622501015663147,
"learning_rate": 0.001,
"loss": 2.7671,
"step": 26700
},
{
"epoch": 8.661926308985132,
"grad_norm": 2.9881439208984375,
"learning_rate": 0.001,
"loss": 2.7507,
"step": 26800
},
{
"epoch": 8.694246929541046,
"grad_norm": 1.4684876203536987,
"learning_rate": 0.001,
"loss": 2.7754,
"step": 26900
},
{
"epoch": 8.726567550096961,
"grad_norm": 1.279051423072815,
"learning_rate": 0.001,
"loss": 2.7737,
"step": 27000
},
{
"epoch": 8.758888170652877,
"grad_norm": 1.2881108522415161,
"learning_rate": 0.001,
"loss": 2.7566,
"step": 27100
},
{
"epoch": 8.791208791208792,
"grad_norm": 1.4721473455429077,
"learning_rate": 0.001,
"loss": 2.753,
"step": 27200
},
{
"epoch": 8.823529411764707,
"grad_norm": 1.335915207862854,
"learning_rate": 0.001,
"loss": 2.7699,
"step": 27300
},
{
"epoch": 8.855850032320621,
"grad_norm": 1.3305962085723877,
"learning_rate": 0.001,
"loss": 2.7792,
"step": 27400
},
{
"epoch": 8.888170652876536,
"grad_norm": 1.196067452430725,
"learning_rate": 0.001,
"loss": 2.7727,
"step": 27500
},
{
"epoch": 8.92049127343245,
"grad_norm": 1.2162010669708252,
"learning_rate": 0.001,
"loss": 2.751,
"step": 27600
},
{
"epoch": 8.952811893988365,
"grad_norm": 1.5516213178634644,
"learning_rate": 0.001,
"loss": 2.7937,
"step": 27700
},
{
"epoch": 8.98513251454428,
"grad_norm": 1.2385696172714233,
"learning_rate": 0.001,
"loss": 2.7732,
"step": 27800
},
{
"epoch": 9.017453135100194,
"grad_norm": 1.991585373878479,
"learning_rate": 0.001,
"loss": 2.6816,
"step": 27900
},
{
"epoch": 9.049773755656108,
"grad_norm": 1.8597898483276367,
"learning_rate": 0.001,
"loss": 2.6321,
"step": 28000
},
{
"epoch": 9.082094376212023,
"grad_norm": 1.6625946760177612,
"learning_rate": 0.001,
"loss": 2.6467,
"step": 28100
},
{
"epoch": 9.114414996767938,
"grad_norm": 1.5089329481124878,
"learning_rate": 0.001,
"loss": 2.6456,
"step": 28200
},
{
"epoch": 9.146735617323852,
"grad_norm": 1.6687277555465698,
"learning_rate": 0.001,
"loss": 2.652,
"step": 28300
},
{
"epoch": 9.179056237879767,
"grad_norm": 1.1334947347640991,
"learning_rate": 0.001,
"loss": 2.6543,
"step": 28400
},
{
"epoch": 9.211376858435681,
"grad_norm": 1.7903298139572144,
"learning_rate": 0.001,
"loss": 2.6396,
"step": 28500
},
{
"epoch": 9.243697478991596,
"grad_norm": 1.8491761684417725,
"learning_rate": 0.001,
"loss": 2.6443,
"step": 28600
},
{
"epoch": 9.276018099547512,
"grad_norm": 1.4492714405059814,
"learning_rate": 0.001,
"loss": 2.6509,
"step": 28700
},
{
"epoch": 9.308338720103427,
"grad_norm": 1.8237452507019043,
"learning_rate": 0.001,
"loss": 2.6867,
"step": 28800
},
{
"epoch": 9.340659340659341,
"grad_norm": 1.7002811431884766,
"learning_rate": 0.001,
"loss": 2.6911,
"step": 28900
},
{
"epoch": 9.372979961215256,
"grad_norm": 1.24556303024292,
"learning_rate": 0.001,
"loss": 2.6842,
"step": 29000
},
{
"epoch": 9.40530058177117,
"grad_norm": 1.3534449338912964,
"learning_rate": 0.001,
"loss": 2.6882,
"step": 29100
},
{
"epoch": 9.437621202327085,
"grad_norm": 1.465098261833191,
"learning_rate": 0.001,
"loss": 2.6845,
"step": 29200
},
{
"epoch": 9.469941822883,
"grad_norm": 1.1282223463058472,
"learning_rate": 0.001,
"loss": 2.6926,
"step": 29300
},
{
"epoch": 9.502262443438914,
"grad_norm": 1.5469937324523926,
"learning_rate": 0.001,
"loss": 2.6861,
"step": 29400
},
{
"epoch": 9.534583063994829,
"grad_norm": 1.313873052597046,
"learning_rate": 0.001,
"loss": 2.6821,
"step": 29500
},
{
"epoch": 9.566903684550743,
"grad_norm": 1.0639135837554932,
"learning_rate": 0.001,
"loss": 2.6661,
"step": 29600
},
{
"epoch": 9.599224305106658,
"grad_norm": 1.1810053586959839,
"learning_rate": 0.001,
"loss": 2.6854,
"step": 29700
},
{
"epoch": 9.631544925662572,
"grad_norm": 1.4090721607208252,
"learning_rate": 0.001,
"loss": 2.7232,
"step": 29800
},
{
"epoch": 9.663865546218487,
"grad_norm": 1.278445839881897,
"learning_rate": 0.001,
"loss": 2.6675,
"step": 29900
},
{
"epoch": 9.696186166774401,
"grad_norm": 1.2034200429916382,
"learning_rate": 0.001,
"loss": 2.6747,
"step": 30000
},
{
"epoch": 9.728506787330316,
"grad_norm": 1.2123016119003296,
"learning_rate": 0.001,
"loss": 2.7185,
"step": 30100
},
{
"epoch": 9.760827407886232,
"grad_norm": 1.6041324138641357,
"learning_rate": 0.001,
"loss": 2.7031,
"step": 30200
},
{
"epoch": 9.793148028442147,
"grad_norm": 1.3464832305908203,
"learning_rate": 0.001,
"loss": 2.7295,
"step": 30300
},
{
"epoch": 9.825468648998061,
"grad_norm": 1.8239651918411255,
"learning_rate": 0.001,
"loss": 2.6939,
"step": 30400
},
{
"epoch": 9.857789269553976,
"grad_norm": 1.3928236961364746,
"learning_rate": 0.001,
"loss": 2.702,
"step": 30500
},
{
"epoch": 9.89010989010989,
"grad_norm": 1.341913104057312,
"learning_rate": 0.001,
"loss": 2.699,
"step": 30600
},
{
"epoch": 9.922430510665805,
"grad_norm": 1.3967502117156982,
"learning_rate": 0.001,
"loss": 2.7218,
"step": 30700
},
{
"epoch": 9.95475113122172,
"grad_norm": 1.2057521343231201,
"learning_rate": 0.001,
"loss": 2.7233,
"step": 30800
},
{
"epoch": 9.987071751777634,
"grad_norm": 1.4760347604751587,
"learning_rate": 0.001,
"loss": 2.7322,
"step": 30900
},
{
"epoch": 10.019392372333549,
"grad_norm": 0.9195663332939148,
"learning_rate": 0.001,
"loss": 2.642,
"step": 31000
},
{
"epoch": 10.051712992889463,
"grad_norm": 1.7018245458602905,
"learning_rate": 0.001,
"loss": 2.5902,
"step": 31100
},
{
"epoch": 10.084033613445378,
"grad_norm": 1.2946157455444336,
"learning_rate": 0.001,
"loss": 2.5836,
"step": 31200
},
{
"epoch": 10.116354234001292,
"grad_norm": 1.2677333354949951,
"learning_rate": 0.001,
"loss": 2.5861,
"step": 31300
},
{
"epoch": 10.148674854557207,
"grad_norm": 0.9341103434562683,
"learning_rate": 0.001,
"loss": 2.5815,
"step": 31400
},
{
"epoch": 10.180995475113122,
"grad_norm": 1.197549819946289,
"learning_rate": 0.001,
"loss": 2.5988,
"step": 31500
},
{
"epoch": 10.213316095669036,
"grad_norm": 0.9701215028762817,
"learning_rate": 0.001,
"loss": 2.6033,
"step": 31600
},
{
"epoch": 10.24563671622495,
"grad_norm": 1.440954327583313,
"learning_rate": 0.001,
"loss": 2.6095,
"step": 31700
},
{
"epoch": 10.277957336780867,
"grad_norm": 1.2938240766525269,
"learning_rate": 0.001,
"loss": 2.6024,
"step": 31800
},
{
"epoch": 10.310277957336782,
"grad_norm": 1.4884780645370483,
"learning_rate": 0.001,
"loss": 2.6229,
"step": 31900
},
{
"epoch": 10.342598577892696,
"grad_norm": 0.9927781820297241,
"learning_rate": 0.001,
"loss": 2.5953,
"step": 32000
},
{
"epoch": 10.37491919844861,
"grad_norm": 0.957020103931427,
"learning_rate": 0.001,
"loss": 2.6005,
"step": 32100
},
{
"epoch": 10.407239819004525,
"grad_norm": 1.0292260646820068,
"learning_rate": 0.001,
"loss": 2.6422,
"step": 32200
},
{
"epoch": 10.43956043956044,
"grad_norm": 1.205029010772705,
"learning_rate": 0.001,
"loss": 2.6276,
"step": 32300
},
{
"epoch": 10.471881060116354,
"grad_norm": 1.0172486305236816,
"learning_rate": 0.001,
"loss": 2.6254,
"step": 32400
},
{
"epoch": 10.504201680672269,
"grad_norm": 0.9256879687309265,
"learning_rate": 0.001,
"loss": 2.6103,
"step": 32500
},
{
"epoch": 10.536522301228183,
"grad_norm": 1.0289719104766846,
"learning_rate": 0.001,
"loss": 2.6329,
"step": 32600
},
{
"epoch": 10.568842921784098,
"grad_norm": 1.1163206100463867,
"learning_rate": 0.001,
"loss": 2.6488,
"step": 32700
},
{
"epoch": 10.601163542340013,
"grad_norm": 1.0654981136322021,
"learning_rate": 0.001,
"loss": 2.6346,
"step": 32800
},
{
"epoch": 10.633484162895927,
"grad_norm": 0.7902207374572754,
"learning_rate": 0.001,
"loss": 2.6188,
"step": 32900
},
{
"epoch": 10.665804783451842,
"grad_norm": 1.250271201133728,
"learning_rate": 0.001,
"loss": 2.6563,
"step": 33000
},
{
"epoch": 10.698125404007756,
"grad_norm": 0.969681978225708,
"learning_rate": 0.001,
"loss": 2.6384,
"step": 33100
},
{
"epoch": 10.73044602456367,
"grad_norm": 1.1124166250228882,
"learning_rate": 0.001,
"loss": 2.6356,
"step": 33200
},
{
"epoch": 10.762766645119587,
"grad_norm": 0.918755292892456,
"learning_rate": 0.001,
"loss": 2.6393,
"step": 33300
},
{
"epoch": 10.795087265675502,
"grad_norm": 0.9233816862106323,
"learning_rate": 0.001,
"loss": 2.6507,
"step": 33400
},
{
"epoch": 10.827407886231416,
"grad_norm": 1.036242127418518,
"learning_rate": 0.001,
"loss": 2.6541,
"step": 33500
},
{
"epoch": 10.85972850678733,
"grad_norm": 1.00826895236969,
"learning_rate": 0.001,
"loss": 2.6496,
"step": 33600
},
{
"epoch": 10.892049127343245,
"grad_norm": 1.0117528438568115,
"learning_rate": 0.001,
"loss": 2.6466,
"step": 33700
},
{
"epoch": 10.92436974789916,
"grad_norm": 0.9768591523170471,
"learning_rate": 0.001,
"loss": 2.6448,
"step": 33800
},
{
"epoch": 10.956690368455074,
"grad_norm": 1.2766749858856201,
"learning_rate": 0.001,
"loss": 2.647,
"step": 33900
},
{
"epoch": 10.989010989010989,
"grad_norm": 1.2299177646636963,
"learning_rate": 0.001,
"loss": 2.6419,
"step": 34000
},
{
"epoch": 11.021331609566904,
"grad_norm": 1.1423105001449585,
"learning_rate": 0.001,
"loss": 2.5704,
"step": 34100
},
{
"epoch": 11.053652230122818,
"grad_norm": 0.8941395282745361,
"learning_rate": 0.001,
"loss": 2.4834,
"step": 34200
},
{
"epoch": 11.085972850678733,
"grad_norm": 1.1033960580825806,
"learning_rate": 0.001,
"loss": 2.513,
"step": 34300
},
{
"epoch": 11.118293471234647,
"grad_norm": 1.0653290748596191,
"learning_rate": 0.001,
"loss": 2.541,
"step": 34400
},
{
"epoch": 11.150614091790562,
"grad_norm": 1.4494647979736328,
"learning_rate": 0.001,
"loss": 2.5199,
"step": 34500
},
{
"epoch": 11.182934712346476,
"grad_norm": 1.2029805183410645,
"learning_rate": 0.001,
"loss": 2.5371,
"step": 34600
},
{
"epoch": 11.215255332902391,
"grad_norm": 0.9592697024345398,
"learning_rate": 0.001,
"loss": 2.5513,
"step": 34700
},
{
"epoch": 11.247575953458306,
"grad_norm": 0.9625367522239685,
"learning_rate": 0.001,
"loss": 2.5316,
"step": 34800
},
{
"epoch": 11.279896574014222,
"grad_norm": 1.119964361190796,
"learning_rate": 0.001,
"loss": 2.5531,
"step": 34900
},
{
"epoch": 11.312217194570136,
"grad_norm": 0.9373201131820679,
"learning_rate": 0.001,
"loss": 2.5427,
"step": 35000
},
{
"epoch": 11.344537815126051,
"grad_norm": 0.9922090172767639,
"learning_rate": 0.001,
"loss": 2.5593,
"step": 35100
},
{
"epoch": 11.376858435681966,
"grad_norm": 0.949802577495575,
"learning_rate": 0.001,
"loss": 2.5581,
"step": 35200
},
{
"epoch": 11.40917905623788,
"grad_norm": 1.0595334768295288,
"learning_rate": 0.001,
"loss": 2.572,
"step": 35300
},
{
"epoch": 11.441499676793795,
"grad_norm": 0.883158802986145,
"learning_rate": 0.001,
"loss": 2.584,
"step": 35400
},
{
"epoch": 11.47382029734971,
"grad_norm": 0.983586311340332,
"learning_rate": 0.001,
"loss": 2.5704,
"step": 35500
},
{
"epoch": 11.506140917905624,
"grad_norm": 0.814781129360199,
"learning_rate": 0.001,
"loss": 2.5732,
"step": 35600
},
{
"epoch": 11.538461538461538,
"grad_norm": 1.2671406269073486,
"learning_rate": 0.001,
"loss": 2.5872,
"step": 35700
},
{
"epoch": 11.570782159017453,
"grad_norm": 1.1636486053466797,
"learning_rate": 0.001,
"loss": 2.5662,
"step": 35800
},
{
"epoch": 11.603102779573367,
"grad_norm": 0.9227073192596436,
"learning_rate": 0.001,
"loss": 2.5826,
"step": 35900
},
{
"epoch": 11.635423400129282,
"grad_norm": 1.394180178642273,
"learning_rate": 0.001,
"loss": 2.5915,
"step": 36000
},
{
"epoch": 11.667744020685197,
"grad_norm": 1.2034887075424194,
"learning_rate": 0.001,
"loss": 2.5843,
"step": 36100
},
{
"epoch": 11.700064641241111,
"grad_norm": 1.2949236631393433,
"learning_rate": 0.001,
"loss": 2.617,
"step": 36200
},
{
"epoch": 11.732385261797026,
"grad_norm": 0.9753849506378174,
"learning_rate": 0.001,
"loss": 2.5988,
"step": 36300
},
{
"epoch": 11.764705882352942,
"grad_norm": 0.8794882893562317,
"learning_rate": 0.001,
"loss": 2.602,
"step": 36400
},
{
"epoch": 11.797026502908857,
"grad_norm": 0.9771369099617004,
"learning_rate": 0.001,
"loss": 2.5793,
"step": 36500
},
{
"epoch": 11.829347123464771,
"grad_norm": 1.5032073259353638,
"learning_rate": 0.001,
"loss": 2.5841,
"step": 36600
},
{
"epoch": 11.861667744020686,
"grad_norm": 1.11965012550354,
"learning_rate": 0.001,
"loss": 2.5914,
"step": 36700
},
{
"epoch": 11.8939883645766,
"grad_norm": 1.1560014486312866,
"learning_rate": 0.001,
"loss": 2.5853,
"step": 36800
},
{
"epoch": 11.926308985132515,
"grad_norm": 1.01851224899292,
"learning_rate": 0.001,
"loss": 2.6131,
"step": 36900
},
{
"epoch": 11.95862960568843,
"grad_norm": 1.0631927251815796,
"learning_rate": 0.001,
"loss": 2.6137,
"step": 37000
},
{
"epoch": 11.990950226244344,
"grad_norm": 1.0793895721435547,
"learning_rate": 0.001,
"loss": 2.5941,
"step": 37100
},
{
"epoch": 12.023270846800258,
"grad_norm": 1.1296499967575073,
"learning_rate": 0.001,
"loss": 2.5119,
"step": 37200
},
{
"epoch": 12.055591467356173,
"grad_norm": 1.054184913635254,
"learning_rate": 0.001,
"loss": 2.4729,
"step": 37300
},
{
"epoch": 12.087912087912088,
"grad_norm": 21.742403030395508,
"learning_rate": 0.001,
"loss": 2.4631,
"step": 37400
},
{
"epoch": 12.120232708468002,
"grad_norm": 1.0219130516052246,
"learning_rate": 0.001,
"loss": 2.4652,
"step": 37500
},
{
"epoch": 12.152553329023917,
"grad_norm": 1.1618340015411377,
"learning_rate": 0.001,
"loss": 2.4674,
"step": 37600
},
{
"epoch": 12.184873949579831,
"grad_norm": 1.190769910812378,
"learning_rate": 0.001,
"loss": 2.4804,
"step": 37700
},
{
"epoch": 12.217194570135746,
"grad_norm": 1.1756348609924316,
"learning_rate": 0.001,
"loss": 2.5027,
"step": 37800
},
{
"epoch": 12.24951519069166,
"grad_norm": 0.9078492522239685,
"learning_rate": 0.001,
"loss": 2.4863,
"step": 37900
},
{
"epoch": 12.281835811247577,
"grad_norm": 0.9652780294418335,
"learning_rate": 0.001,
"loss": 2.4849,
"step": 38000
},
{
"epoch": 12.314156431803491,
"grad_norm": 1.3750672340393066,
"learning_rate": 0.001,
"loss": 2.4803,
"step": 38100
},
{
"epoch": 12.346477052359406,
"grad_norm": 1.0233724117279053,
"learning_rate": 0.001,
"loss": 2.5161,
"step": 38200
},
{
"epoch": 12.37879767291532,
"grad_norm": 1.130647897720337,
"learning_rate": 0.001,
"loss": 2.5181,
"step": 38300
},
{
"epoch": 12.411118293471235,
"grad_norm": 1.1900297403335571,
"learning_rate": 0.001,
"loss": 2.5251,
"step": 38400
},
{
"epoch": 12.44343891402715,
"grad_norm": 0.9599136710166931,
"learning_rate": 0.001,
"loss": 2.5308,
"step": 38500
},
{
"epoch": 12.475759534583064,
"grad_norm": 1.0950437784194946,
"learning_rate": 0.001,
"loss": 2.528,
"step": 38600
},
{
"epoch": 12.508080155138979,
"grad_norm": 2.040606737136841,
"learning_rate": 0.001,
"loss": 2.5159,
"step": 38700
},
{
"epoch": 12.540400775694893,
"grad_norm": 0.9455929398536682,
"learning_rate": 0.001,
"loss": 2.5413,
"step": 38800
},
{
"epoch": 12.572721396250808,
"grad_norm": 1.0487362146377563,
"learning_rate": 0.001,
"loss": 2.5331,
"step": 38900
},
{
"epoch": 12.605042016806722,
"grad_norm": 1.202513337135315,
"learning_rate": 0.001,
"loss": 2.5273,
"step": 39000
},
{
"epoch": 12.637362637362637,
"grad_norm": 0.8983702063560486,
"learning_rate": 0.001,
"loss": 2.5342,
"step": 39100
},
{
"epoch": 12.669683257918551,
"grad_norm": 1.1663144826889038,
"learning_rate": 0.001,
"loss": 2.5271,
"step": 39200
},
{
"epoch": 12.702003878474466,
"grad_norm": 1.0637140274047852,
"learning_rate": 0.001,
"loss": 2.5429,
"step": 39300
},
{
"epoch": 12.73432449903038,
"grad_norm": 0.9071537852287292,
"learning_rate": 0.001,
"loss": 2.5431,
"step": 39400
},
{
"epoch": 12.766645119586297,
"grad_norm": 1.0884722471237183,
"learning_rate": 0.001,
"loss": 2.5415,
"step": 39500
},
{
"epoch": 12.798965740142211,
"grad_norm": 1.1432896852493286,
"learning_rate": 0.001,
"loss": 2.5633,
"step": 39600
},
{
"epoch": 12.831286360698126,
"grad_norm": 1.1623923778533936,
"learning_rate": 0.001,
"loss": 2.5508,
"step": 39700
},
{
"epoch": 12.86360698125404,
"grad_norm": 0.9450523257255554,
"learning_rate": 0.001,
"loss": 2.5332,
"step": 39800
},
{
"epoch": 12.895927601809955,
"grad_norm": 1.2209385633468628,
"learning_rate": 0.001,
"loss": 2.538,
"step": 39900
},
{
"epoch": 12.92824822236587,
"grad_norm": 0.8747568726539612,
"learning_rate": 0.001,
"loss": 2.5379,
"step": 40000
},
{
"epoch": 12.960568842921784,
"grad_norm": 0.8547672629356384,
"learning_rate": 0.001,
"loss": 2.531,
"step": 40100
},
{
"epoch": 12.992889463477699,
"grad_norm": 1.1148180961608887,
"learning_rate": 0.001,
"loss": 2.5493,
"step": 40200
},
{
"epoch": 13.025210084033613,
"grad_norm": 1.0299571752548218,
"learning_rate": 0.001,
"loss": 2.455,
"step": 40300
},
{
"epoch": 13.057530704589528,
"grad_norm": 1.5488170385360718,
"learning_rate": 0.001,
"loss": 2.4235,
"step": 40400
},
{
"epoch": 13.089851325145442,
"grad_norm": 0.9480970501899719,
"learning_rate": 0.001,
"loss": 2.4149,
"step": 40500
},
{
"epoch": 13.122171945701357,
"grad_norm": 0.9796513319015503,
"learning_rate": 0.001,
"loss": 2.4012,
"step": 40600
},
{
"epoch": 13.154492566257272,
"grad_norm": 1.253645658493042,
"learning_rate": 0.001,
"loss": 2.4345,
"step": 40700
},
{
"epoch": 13.186813186813186,
"grad_norm": 0.9671187996864319,
"learning_rate": 0.001,
"loss": 2.421,
"step": 40800
},
{
"epoch": 13.2191338073691,
"grad_norm": 1.2620867490768433,
"learning_rate": 0.001,
"loss": 2.4489,
"step": 40900
},
{
"epoch": 13.251454427925015,
"grad_norm": 1.3267464637756348,
"learning_rate": 0.001,
"loss": 2.4219,
"step": 41000
},
{
"epoch": 13.283775048480932,
"grad_norm": 0.949113130569458,
"learning_rate": 0.001,
"loss": 2.4586,
"step": 41100
},
{
"epoch": 13.316095669036846,
"grad_norm": 0.9057651162147522,
"learning_rate": 0.001,
"loss": 2.4504,
"step": 41200
},
{
"epoch": 13.34841628959276,
"grad_norm": 0.9519304633140564,
"learning_rate": 0.001,
"loss": 2.4565,
"step": 41300
},
{
"epoch": 13.380736910148675,
"grad_norm": 0.8539422154426575,
"learning_rate": 0.001,
"loss": 2.4649,
"step": 41400
},
{
"epoch": 13.41305753070459,
"grad_norm": 0.9074021577835083,
"learning_rate": 0.001,
"loss": 2.4422,
"step": 41500
},
{
"epoch": 13.445378151260504,
"grad_norm": 0.9951125383377075,
"learning_rate": 0.001,
"loss": 2.474,
"step": 41600
},
{
"epoch": 13.477698771816419,
"grad_norm": 0.884623646736145,
"learning_rate": 0.001,
"loss": 2.4678,
"step": 41700
},
{
"epoch": 13.510019392372334,
"grad_norm": 1.0569515228271484,
"learning_rate": 0.001,
"loss": 2.4836,
"step": 41800
},
{
"epoch": 13.542340012928248,
"grad_norm": 1.202636957168579,
"learning_rate": 0.001,
"loss": 2.4781,
"step": 41900
},
{
"epoch": 13.574660633484163,
"grad_norm": 0.9308040738105774,
"learning_rate": 0.001,
"loss": 2.4682,
"step": 42000
},
{
"epoch": 13.606981254040077,
"grad_norm": 1.0900559425354004,
"learning_rate": 0.001,
"loss": 2.504,
"step": 42100
},
{
"epoch": 13.639301874595992,
"grad_norm": 1.135162591934204,
"learning_rate": 0.001,
"loss": 2.4863,
"step": 42200
},
{
"epoch": 13.671622495151906,
"grad_norm": 1.2860257625579834,
"learning_rate": 0.001,
"loss": 2.4946,
"step": 42300
},
{
"epoch": 13.70394311570782,
"grad_norm": 1.5642868280410767,
"learning_rate": 0.001,
"loss": 2.4893,
"step": 42400
},
{
"epoch": 13.736263736263737,
"grad_norm": 1.355553388595581,
"learning_rate": 0.001,
"loss": 2.4914,
"step": 42500
},
{
"epoch": 13.768584356819652,
"grad_norm": 1.0944535732269287,
"learning_rate": 0.001,
"loss": 2.4979,
"step": 42600
},
{
"epoch": 13.800904977375566,
"grad_norm": 1.1084762811660767,
"learning_rate": 0.001,
"loss": 2.4889,
"step": 42700
},
{
"epoch": 13.83322559793148,
"grad_norm": 1.0795499086380005,
"learning_rate": 0.001,
"loss": 2.5077,
"step": 42800
},
{
"epoch": 13.865546218487395,
"grad_norm": 1.1181540489196777,
"learning_rate": 0.001,
"loss": 2.5115,
"step": 42900
},
{
"epoch": 13.89786683904331,
"grad_norm": 0.8543340563774109,
"learning_rate": 0.001,
"loss": 2.4785,
"step": 43000
},
{
"epoch": 13.930187459599225,
"grad_norm": 0.9921061396598816,
"learning_rate": 0.001,
"loss": 2.5062,
"step": 43100
},
{
"epoch": 13.96250808015514,
"grad_norm": 1.1608710289001465,
"learning_rate": 0.001,
"loss": 2.5112,
"step": 43200
},
{
"epoch": 13.994828700711054,
"grad_norm": 0.9125173687934875,
"learning_rate": 0.001,
"loss": 2.4806,
"step": 43300
},
{
"epoch": 14.027149321266968,
"grad_norm": 1.0954087972640991,
"learning_rate": 0.001,
"loss": 2.3845,
"step": 43400
},
{
"epoch": 14.059469941822883,
"grad_norm": 1.1624791622161865,
"learning_rate": 0.001,
"loss": 2.3829,
"step": 43500
},
{
"epoch": 14.091790562378797,
"grad_norm": 1.3029100894927979,
"learning_rate": 0.001,
"loss": 2.3835,
"step": 43600
},
{
"epoch": 14.124111182934712,
"grad_norm": 1.024627923965454,
"learning_rate": 0.001,
"loss": 2.3614,
"step": 43700
},
{
"epoch": 14.156431803490626,
"grad_norm": 1.298632264137268,
"learning_rate": 0.001,
"loss": 2.3911,
"step": 43800
},
{
"epoch": 14.188752424046541,
"grad_norm": 1.313515543937683,
"learning_rate": 0.001,
"loss": 2.4011,
"step": 43900
},
{
"epoch": 14.221073044602456,
"grad_norm": 1.3333510160446167,
"learning_rate": 0.001,
"loss": 2.3881,
"step": 44000
},
{
"epoch": 14.25339366515837,
"grad_norm": 1.04513680934906,
"learning_rate": 0.001,
"loss": 2.3704,
"step": 44100
},
{
"epoch": 14.285714285714286,
"grad_norm": 1.4951848983764648,
"learning_rate": 0.001,
"loss": 2.386,
"step": 44200
},
{
"epoch": 14.318034906270201,
"grad_norm": 1.0465465784072876,
"learning_rate": 0.001,
"loss": 2.3972,
"step": 44300
},
{
"epoch": 14.350355526826116,
"grad_norm": 1.1519221067428589,
"learning_rate": 0.001,
"loss": 2.4056,
"step": 44400
},
{
"epoch": 14.38267614738203,
"grad_norm": 1.315697431564331,
"learning_rate": 0.001,
"loss": 2.4214,
"step": 44500
},
{
"epoch": 14.414996767937945,
"grad_norm": 1.0481849908828735,
"learning_rate": 0.001,
"loss": 2.4211,
"step": 44600
},
{
"epoch": 14.44731738849386,
"grad_norm": 1.1554055213928223,
"learning_rate": 0.001,
"loss": 2.4134,
"step": 44700
},
{
"epoch": 14.479638009049774,
"grad_norm": 1.0574384927749634,
"learning_rate": 0.001,
"loss": 2.4197,
"step": 44800
},
{
"epoch": 14.511958629605688,
"grad_norm": 1.1386839151382446,
"learning_rate": 0.001,
"loss": 2.414,
"step": 44900
},
{
"epoch": 14.544279250161603,
"grad_norm": 1.327596664428711,
"learning_rate": 0.001,
"loss": 2.4182,
"step": 45000
},
{
"epoch": 14.576599870717518,
"grad_norm": 0.963005542755127,
"learning_rate": 0.001,
"loss": 2.4251,
"step": 45100
},
{
"epoch": 14.608920491273432,
"grad_norm": 1.0058512687683105,
"learning_rate": 0.001,
"loss": 2.4233,
"step": 45200
},
{
"epoch": 14.641241111829347,
"grad_norm": 1.0360257625579834,
"learning_rate": 0.001,
"loss": 2.4353,
"step": 45300
},
{
"epoch": 14.673561732385261,
"grad_norm": 1.2501556873321533,
"learning_rate": 0.001,
"loss": 2.4511,
"step": 45400
},
{
"epoch": 14.705882352941176,
"grad_norm": 1.212724208831787,
"learning_rate": 0.001,
"loss": 2.4493,
"step": 45500
},
{
"epoch": 14.738202973497092,
"grad_norm": 1.4460214376449585,
"learning_rate": 0.001,
"loss": 2.4591,
"step": 45600
},
{
"epoch": 14.770523594053007,
"grad_norm": 1.2264606952667236,
"learning_rate": 0.001,
"loss": 2.431,
"step": 45700
},
{
"epoch": 14.802844214608921,
"grad_norm": 0.9162919521331787,
"learning_rate": 0.001,
"loss": 2.4497,
"step": 45800
},
{
"epoch": 14.835164835164836,
"grad_norm": 1.2006787061691284,
"learning_rate": 0.001,
"loss": 2.4343,
"step": 45900
},
{
"epoch": 14.86748545572075,
"grad_norm": 1.3091291189193726,
"learning_rate": 0.001,
"loss": 2.4543,
"step": 46000
},
{
"epoch": 14.899806076276665,
"grad_norm": 1.2788023948669434,
"learning_rate": 0.001,
"loss": 2.4441,
"step": 46100
},
{
"epoch": 14.93212669683258,
"grad_norm": 1.4079340696334839,
"learning_rate": 0.001,
"loss": 2.467,
"step": 46200
},
{
"epoch": 14.964447317388494,
"grad_norm": 1.0543346405029297,
"learning_rate": 0.001,
"loss": 2.4757,
"step": 46300
},
{
"epoch": 14.996767937944409,
"grad_norm": 1.0508509874343872,
"learning_rate": 0.001,
"loss": 2.448,
"step": 46400
},
{
"epoch": 15.029088558500323,
"grad_norm": 1.4893016815185547,
"learning_rate": 0.001,
"loss": 2.3238,
"step": 46500
},
{
"epoch": 15.061409179056238,
"grad_norm": 1.837750792503357,
"learning_rate": 0.001,
"loss": 2.3131,
"step": 46600
},
{
"epoch": 15.093729799612152,
"grad_norm": 1.2078869342803955,
"learning_rate": 0.001,
"loss": 2.3283,
"step": 46700
},
{
"epoch": 15.126050420168067,
"grad_norm": 1.6027250289916992,
"learning_rate": 0.001,
"loss": 2.3327,
"step": 46800
},
{
"epoch": 15.158371040723981,
"grad_norm": 1.5826632976531982,
"learning_rate": 0.001,
"loss": 2.3439,
"step": 46900
},
{
"epoch": 15.190691661279896,
"grad_norm": 1.4874987602233887,
"learning_rate": 0.001,
"loss": 2.3284,
"step": 47000
},
{
"epoch": 15.22301228183581,
"grad_norm": 1.3203476667404175,
"learning_rate": 0.001,
"loss": 2.3384,
"step": 47100
},
{
"epoch": 15.255332902391725,
"grad_norm": 1.3712375164031982,
"learning_rate": 0.001,
"loss": 2.3232,
"step": 47200
},
{
"epoch": 15.287653522947641,
"grad_norm": 1.7889151573181152,
"learning_rate": 0.001,
"loss": 2.3565,
"step": 47300
},
{
"epoch": 15.319974143503556,
"grad_norm": 1.271273136138916,
"learning_rate": 0.001,
"loss": 2.3609,
"step": 47400
},
{
"epoch": 15.35229476405947,
"grad_norm": 1.26175856590271,
"learning_rate": 0.001,
"loss": 2.3648,
"step": 47500
},
{
"epoch": 15.384615384615385,
"grad_norm": 1.1784569025039673,
"learning_rate": 0.001,
"loss": 2.3543,
"step": 47600
},
{
"epoch": 15.4169360051713,
"grad_norm": 1.213889241218567,
"learning_rate": 0.001,
"loss": 2.3653,
"step": 47700
},
{
"epoch": 15.449256625727214,
"grad_norm": 1.2286897897720337,
"learning_rate": 0.001,
"loss": 2.3669,
"step": 47800
},
{
"epoch": 15.481577246283129,
"grad_norm": 1.0805023908615112,
"learning_rate": 0.001,
"loss": 2.3646,
"step": 47900
},
{
"epoch": 15.513897866839043,
"grad_norm": 1.3555302619934082,
"learning_rate": 0.001,
"loss": 2.4106,
"step": 48000
},
{
"epoch": 15.546218487394958,
"grad_norm": 1.2589572668075562,
"learning_rate": 0.001,
"loss": 2.3889,
"step": 48100
},
{
"epoch": 15.578539107950872,
"grad_norm": 1.3926182985305786,
"learning_rate": 0.001,
"loss": 2.3845,
"step": 48200
},
{
"epoch": 15.610859728506787,
"grad_norm": 1.1518105268478394,
"learning_rate": 0.001,
"loss": 2.3971,
"step": 48300
},
{
"epoch": 15.643180349062701,
"grad_norm": 3.0709455013275146,
"learning_rate": 0.001,
"loss": 2.3771,
"step": 48400
},
{
"epoch": 15.675500969618616,
"grad_norm": 1.155657172203064,
"learning_rate": 0.001,
"loss": 2.3964,
"step": 48500
},
{
"epoch": 15.70782159017453,
"grad_norm": 1.1553372144699097,
"learning_rate": 0.001,
"loss": 2.3915,
"step": 48600
},
{
"epoch": 15.740142210730447,
"grad_norm": 1.320940613746643,
"learning_rate": 0.001,
"loss": 2.4046,
"step": 48700
},
{
"epoch": 15.772462831286362,
"grad_norm": 1.5498952865600586,
"learning_rate": 0.001,
"loss": 2.3993,
"step": 48800
},
{
"epoch": 15.804783451842276,
"grad_norm": 1.2019668817520142,
"learning_rate": 0.001,
"loss": 2.3913,
"step": 48900
},
{
"epoch": 15.83710407239819,
"grad_norm": 1.5592061281204224,
"learning_rate": 0.001,
"loss": 2.3928,
"step": 49000
},
{
"epoch": 15.869424692954105,
"grad_norm": 1.2563676834106445,
"learning_rate": 0.001,
"loss": 2.4179,
"step": 49100
},
{
"epoch": 15.90174531351002,
"grad_norm": 1.7604995965957642,
"learning_rate": 0.001,
"loss": 2.4154,
"step": 49200
},
{
"epoch": 15.934065934065934,
"grad_norm": 2.444636583328247,
"learning_rate": 0.001,
"loss": 2.4203,
"step": 49300
},
{
"epoch": 15.966386554621849,
"grad_norm": 1.1105613708496094,
"learning_rate": 0.001,
"loss": 2.4068,
"step": 49400
},
{
"epoch": 15.998707175177763,
"grad_norm": 1.4374933242797852,
"learning_rate": 0.001,
"loss": 2.4238,
"step": 49500
},
{
"epoch": 16.031027795733678,
"grad_norm": 1.5967682600021362,
"learning_rate": 0.001,
"loss": 2.265,
"step": 49600
},
{
"epoch": 16.063348416289593,
"grad_norm": 1.267298698425293,
"learning_rate": 0.001,
"loss": 2.2788,
"step": 49700
},
{
"epoch": 16.095669036845507,
"grad_norm": 1.7938473224639893,
"learning_rate": 0.001,
"loss": 2.2835,
"step": 49800
},
{
"epoch": 16.12798965740142,
"grad_norm": 1.4659450054168701,
"learning_rate": 0.001,
"loss": 2.2731,
"step": 49900
},
{
"epoch": 16.160310277957336,
"grad_norm": 1.5971636772155762,
"learning_rate": 0.001,
"loss": 2.3012,
"step": 50000
},
{
"epoch": 16.19263089851325,
"grad_norm": 1.6608117818832397,
"learning_rate": 0.001,
"loss": 2.309,
"step": 50100
},
{
"epoch": 16.224951519069165,
"grad_norm": 1.403343915939331,
"learning_rate": 0.001,
"loss": 2.3204,
"step": 50200
},
{
"epoch": 16.25727213962508,
"grad_norm": 1.548507809638977,
"learning_rate": 0.001,
"loss": 2.307,
"step": 50300
},
{
"epoch": 16.289592760180994,
"grad_norm": 1.3704252243041992,
"learning_rate": 0.001,
"loss": 2.3057,
"step": 50400
},
{
"epoch": 16.32191338073691,
"grad_norm": 1.283632755279541,
"learning_rate": 0.001,
"loss": 2.3067,
"step": 50500
},
{
"epoch": 16.354234001292824,
"grad_norm": 1.857095718383789,
"learning_rate": 0.001,
"loss": 2.3188,
"step": 50600
},
{
"epoch": 16.386554621848738,
"grad_norm": 1.4724833965301514,
"learning_rate": 0.001,
"loss": 2.3249,
"step": 50700
},
{
"epoch": 16.418875242404653,
"grad_norm": 1.4150060415267944,
"learning_rate": 0.001,
"loss": 2.3166,
"step": 50800
},
{
"epoch": 16.451195862960567,
"grad_norm": 1.3506375551223755,
"learning_rate": 0.001,
"loss": 2.332,
"step": 50900
},
{
"epoch": 16.483516483516482,
"grad_norm": 1.4680278301239014,
"learning_rate": 0.001,
"loss": 2.3305,
"step": 51000
},
{
"epoch": 16.5158371040724,
"grad_norm": 1.3896517753601074,
"learning_rate": 0.001,
"loss": 2.3371,
"step": 51100
},
{
"epoch": 16.548157724628314,
"grad_norm": 1.4641127586364746,
"learning_rate": 0.001,
"loss": 2.3332,
"step": 51200
},
{
"epoch": 16.58047834518423,
"grad_norm": 1.54449462890625,
"learning_rate": 0.001,
"loss": 2.3543,
"step": 51300
},
{
"epoch": 16.612798965740144,
"grad_norm": 1.260672688484192,
"learning_rate": 0.001,
"loss": 2.3588,
"step": 51400
},
{
"epoch": 16.645119586296058,
"grad_norm": 1.3852020502090454,
"learning_rate": 0.001,
"loss": 2.3545,
"step": 51500
},
{
"epoch": 16.677440206851973,
"grad_norm": 1.5494886636734009,
"learning_rate": 0.001,
"loss": 2.358,
"step": 51600
},
{
"epoch": 16.709760827407887,
"grad_norm": 1.8374618291854858,
"learning_rate": 0.001,
"loss": 2.3726,
"step": 51700
},
{
"epoch": 16.742081447963802,
"grad_norm": 1.3113868236541748,
"learning_rate": 0.001,
"loss": 2.3415,
"step": 51800
},
{
"epoch": 16.774402068519716,
"grad_norm": 1.4161752462387085,
"learning_rate": 0.001,
"loss": 2.3594,
"step": 51900
},
{
"epoch": 16.80672268907563,
"grad_norm": 1.4049732685089111,
"learning_rate": 0.001,
"loss": 2.3403,
"step": 52000
},
{
"epoch": 16.839043309631545,
"grad_norm": 1.5107018947601318,
"learning_rate": 0.001,
"loss": 2.3809,
"step": 52100
},
{
"epoch": 16.87136393018746,
"grad_norm": 1.6911265850067139,
"learning_rate": 0.001,
"loss": 2.3562,
"step": 52200
},
{
"epoch": 16.903684550743375,
"grad_norm": 1.6009875535964966,
"learning_rate": 0.001,
"loss": 2.3589,
"step": 52300
},
{
"epoch": 16.93600517129929,
"grad_norm": 1.415225863456726,
"learning_rate": 0.001,
"loss": 2.3723,
"step": 52400
},
{
"epoch": 16.968325791855204,
"grad_norm": 1.5780458450317383,
"learning_rate": 0.001,
"loss": 2.3631,
"step": 52500
},
{
"epoch": 17.00064641241112,
"grad_norm": 1.3046797513961792,
"learning_rate": 0.001,
"loss": 2.3591,
"step": 52600
},
{
"epoch": 17.032967032967033,
"grad_norm": 1.631547212600708,
"learning_rate": 0.001,
"loss": 2.213,
"step": 52700
},
{
"epoch": 17.065287653522947,
"grad_norm": 1.5670453310012817,
"learning_rate": 0.001,
"loss": 2.231,
"step": 52800
},
{
"epoch": 17.097608274078862,
"grad_norm": 1.5162924528121948,
"learning_rate": 0.001,
"loss": 2.2282,
"step": 52900
},
{
"epoch": 17.129928894634777,
"grad_norm": 1.8685030937194824,
"learning_rate": 0.001,
"loss": 2.2464,
"step": 53000
},
{
"epoch": 17.16224951519069,
"grad_norm": 1.8752682209014893,
"learning_rate": 0.001,
"loss": 2.2316,
"step": 53100
},
{
"epoch": 17.194570135746606,
"grad_norm": 1.5304337739944458,
"learning_rate": 0.001,
"loss": 2.2437,
"step": 53200
},
{
"epoch": 17.22689075630252,
"grad_norm": 1.8339931964874268,
"learning_rate": 0.001,
"loss": 2.2524,
"step": 53300
},
{
"epoch": 17.259211376858435,
"grad_norm": 1.6601121425628662,
"learning_rate": 0.001,
"loss": 2.2821,
"step": 53400
},
{
"epoch": 17.29153199741435,
"grad_norm": 1.037027359008789,
"learning_rate": 0.001,
"loss": 2.2599,
"step": 53500
},
{
"epoch": 17.323852617970264,
"grad_norm": 1.4101696014404297,
"learning_rate": 0.001,
"loss": 2.2894,
"step": 53600
},
{
"epoch": 17.35617323852618,
"grad_norm": 1.715714931488037,
"learning_rate": 0.001,
"loss": 2.2673,
"step": 53700
},
{
"epoch": 17.388493859082093,
"grad_norm": 1.6918067932128906,
"learning_rate": 0.001,
"loss": 2.277,
"step": 53800
},
{
"epoch": 17.420814479638008,
"grad_norm": 1.513771653175354,
"learning_rate": 0.001,
"loss": 2.2822,
"step": 53900
},
{
"epoch": 17.453135100193922,
"grad_norm": 2.0623667240142822,
"learning_rate": 0.001,
"loss": 2.3088,
"step": 54000
},
{
"epoch": 17.485455720749837,
"grad_norm": 1.43783700466156,
"learning_rate": 0.001,
"loss": 2.2943,
"step": 54100
},
{
"epoch": 17.517776341305755,
"grad_norm": 1.387234091758728,
"learning_rate": 0.001,
"loss": 2.3021,
"step": 54200
},
{
"epoch": 17.55009696186167,
"grad_norm": 1.8661473989486694,
"learning_rate": 0.001,
"loss": 2.2701,
"step": 54300
},
{
"epoch": 17.582417582417584,
"grad_norm": 1.76520836353302,
"learning_rate": 0.001,
"loss": 2.2823,
"step": 54400
},
{
"epoch": 17.6147382029735,
"grad_norm": 1.5826014280319214,
"learning_rate": 0.001,
"loss": 2.3244,
"step": 54500
},
{
"epoch": 17.647058823529413,
"grad_norm": 1.3721729516983032,
"learning_rate": 0.001,
"loss": 2.318,
"step": 54600
},
{
"epoch": 17.679379444085328,
"grad_norm": 1.4153558015823364,
"learning_rate": 0.001,
"loss": 2.3211,
"step": 54700
},
{
"epoch": 17.711700064641242,
"grad_norm": 1.6873489618301392,
"learning_rate": 0.001,
"loss": 2.3211,
"step": 54800
},
{
"epoch": 17.744020685197157,
"grad_norm": 1.48008131980896,
"learning_rate": 0.001,
"loss": 2.3298,
"step": 54900
},
{
"epoch": 17.77634130575307,
"grad_norm": 1.2169060707092285,
"learning_rate": 0.001,
"loss": 2.3117,
"step": 55000
},
{
"epoch": 17.808661926308986,
"grad_norm": 2.0541675090789795,
"learning_rate": 0.001,
"loss": 2.3168,
"step": 55100
},
{
"epoch": 17.8409825468649,
"grad_norm": 1.6494852304458618,
"learning_rate": 0.001,
"loss": 2.3136,
"step": 55200
},
{
"epoch": 17.873303167420815,
"grad_norm": 1.9559639692306519,
"learning_rate": 0.001,
"loss": 2.3385,
"step": 55300
},
{
"epoch": 17.90562378797673,
"grad_norm": 1.883894443511963,
"learning_rate": 0.001,
"loss": 2.3241,
"step": 55400
},
{
"epoch": 17.937944408532644,
"grad_norm": 1.4204341173171997,
"learning_rate": 0.001,
"loss": 2.3306,
"step": 55500
},
{
"epoch": 17.97026502908856,
"grad_norm": 1.837131142616272,
"learning_rate": 0.001,
"loss": 2.3515,
"step": 55600
},
{
"epoch": 18.002585649644473,
"grad_norm": 1.2758315801620483,
"learning_rate": 0.001,
"loss": 2.3336,
"step": 55700
},
{
"epoch": 18.034906270200388,
"grad_norm": 1.0778571367263794,
"learning_rate": 0.001,
"loss": 2.1599,
"step": 55800
},
{
"epoch": 18.067226890756302,
"grad_norm": 1.2033774852752686,
"learning_rate": 0.001,
"loss": 2.1879,
"step": 55900
},
{
"epoch": 18.099547511312217,
"grad_norm": 1.5203527212142944,
"learning_rate": 0.001,
"loss": 2.1859,
"step": 56000
},
{
"epoch": 18.13186813186813,
"grad_norm": 1.2778196334838867,
"learning_rate": 0.001,
"loss": 2.2118,
"step": 56100
},
{
"epoch": 18.164188752424046,
"grad_norm": 1.490444302558899,
"learning_rate": 0.001,
"loss": 2.215,
"step": 56200
},
{
"epoch": 18.19650937297996,
"grad_norm": 1.25520658493042,
"learning_rate": 0.001,
"loss": 2.2096,
"step": 56300
},
{
"epoch": 18.228829993535875,
"grad_norm": 1.3420361280441284,
"learning_rate": 0.001,
"loss": 2.2346,
"step": 56400
},
{
"epoch": 18.26115061409179,
"grad_norm": 1.4662959575653076,
"learning_rate": 0.001,
"loss": 2.2047,
"step": 56500
},
{
"epoch": 18.293471234647704,
"grad_norm": 1.3517006635665894,
"learning_rate": 0.001,
"loss": 2.2302,
"step": 56600
},
{
"epoch": 18.32579185520362,
"grad_norm": 1.6744149923324585,
"learning_rate": 0.001,
"loss": 2.2548,
"step": 56700
},
{
"epoch": 18.358112475759533,
"grad_norm": 1.6994774341583252,
"learning_rate": 0.001,
"loss": 2.2184,
"step": 56800
},
{
"epoch": 18.390433096315448,
"grad_norm": 1.2075378894805908,
"learning_rate": 0.001,
"loss": 2.2467,
"step": 56900
},
{
"epoch": 18.422753716871362,
"grad_norm": 1.0433144569396973,
"learning_rate": 0.001,
"loss": 2.2499,
"step": 57000
},
{
"epoch": 18.455074337427277,
"grad_norm": 1.2884716987609863,
"learning_rate": 0.001,
"loss": 2.2475,
"step": 57100
},
{
"epoch": 18.48739495798319,
"grad_norm": 1.8086559772491455,
"learning_rate": 0.001,
"loss": 2.2572,
"step": 57200
},
{
"epoch": 18.51971557853911,
"grad_norm": 1.1635278463363647,
"learning_rate": 0.001,
"loss": 2.2554,
"step": 57300
},
{
"epoch": 18.552036199095024,
"grad_norm": 1.3635642528533936,
"learning_rate": 0.001,
"loss": 2.2633,
"step": 57400
},
{
"epoch": 18.58435681965094,
"grad_norm": 1.2767882347106934,
"learning_rate": 0.001,
"loss": 2.2519,
"step": 57500
},
{
"epoch": 18.616677440206853,
"grad_norm": 1.571807861328125,
"learning_rate": 0.001,
"loss": 2.2582,
"step": 57600
},
{
"epoch": 18.648998060762768,
"grad_norm": 1.5809171199798584,
"learning_rate": 0.001,
"loss": 2.2612,
"step": 57700
},
{
"epoch": 18.681318681318682,
"grad_norm": 1.2579069137573242,
"learning_rate": 0.001,
"loss": 2.2713,
"step": 57800
},
{
"epoch": 18.713639301874597,
"grad_norm": 1.2632404565811157,
"learning_rate": 0.001,
"loss": 2.2876,
"step": 57900
},
{
"epoch": 18.74595992243051,
"grad_norm": 1.0768790245056152,
"learning_rate": 0.001,
"loss": 2.2794,
"step": 58000
},
{
"epoch": 18.778280542986426,
"grad_norm": 1.4682295322418213,
"learning_rate": 0.001,
"loss": 2.2766,
"step": 58100
},
{
"epoch": 18.81060116354234,
"grad_norm": 1.269097089767456,
"learning_rate": 0.001,
"loss": 2.2587,
"step": 58200
},
{
"epoch": 18.842921784098255,
"grad_norm": 1.7296055555343628,
"learning_rate": 0.001,
"loss": 2.2853,
"step": 58300
},
{
"epoch": 18.87524240465417,
"grad_norm": 1.5035419464111328,
"learning_rate": 0.001,
"loss": 2.2967,
"step": 58400
},
{
"epoch": 18.907563025210084,
"grad_norm": 1.2617650032043457,
"learning_rate": 0.001,
"loss": 2.3184,
"step": 58500
},
{
"epoch": 18.939883645766,
"grad_norm": 1.4061576128005981,
"learning_rate": 0.001,
"loss": 2.2902,
"step": 58600
},
{
"epoch": 18.972204266321913,
"grad_norm": 1.2522116899490356,
"learning_rate": 0.001,
"loss": 2.2897,
"step": 58700
},
{
"epoch": 19.004524886877828,
"grad_norm": 1.2318428754806519,
"learning_rate": 0.001,
"loss": 2.295,
"step": 58800
},
{
"epoch": 19.036845507433743,
"grad_norm": 1.2215492725372314,
"learning_rate": 0.001,
"loss": 2.1301,
"step": 58900
},
{
"epoch": 19.069166127989657,
"grad_norm": 1.204942226409912,
"learning_rate": 0.001,
"loss": 2.1383,
"step": 59000
},
{
"epoch": 19.10148674854557,
"grad_norm": 1.343122124671936,
"learning_rate": 0.001,
"loss": 2.1669,
"step": 59100
},
{
"epoch": 19.133807369101486,
"grad_norm": 1.4247043132781982,
"learning_rate": 0.001,
"loss": 2.17,
"step": 59200
},
{
"epoch": 19.1661279896574,
"grad_norm": 1.212086796760559,
"learning_rate": 0.001,
"loss": 2.1771,
"step": 59300
},
{
"epoch": 19.198448610213315,
"grad_norm": 0.9887686371803284,
"learning_rate": 0.001,
"loss": 2.1871,
"step": 59400
},
{
"epoch": 19.23076923076923,
"grad_norm": 0.9896878600120544,
"learning_rate": 0.001,
"loss": 2.1768,
"step": 59500
},
{
"epoch": 19.263089851325145,
"grad_norm": 1.0798989534378052,
"learning_rate": 0.001,
"loss": 2.1963,
"step": 59600
},
{
"epoch": 19.29541047188106,
"grad_norm": 1.0032464265823364,
"learning_rate": 0.001,
"loss": 2.1917,
"step": 59700
},
{
"epoch": 19.327731092436974,
"grad_norm": 1.21811044216156,
"learning_rate": 0.001,
"loss": 2.204,
"step": 59800
},
{
"epoch": 19.360051712992888,
"grad_norm": 1.1439648866653442,
"learning_rate": 0.001,
"loss": 2.2006,
"step": 59900
},
{
"epoch": 19.392372333548803,
"grad_norm": 1.0855740308761597,
"learning_rate": 0.001,
"loss": 2.2165,
"step": 60000
},
{
"epoch": 19.424692954104717,
"grad_norm": 1.388441562652588,
"learning_rate": 0.001,
"loss": 2.2109,
"step": 60100
},
{
"epoch": 19.457013574660632,
"grad_norm": 1.4667842388153076,
"learning_rate": 0.001,
"loss": 2.1972,
"step": 60200
},
{
"epoch": 19.489334195216546,
"grad_norm": 1.7039697170257568,
"learning_rate": 0.001,
"loss": 2.2221,
"step": 60300
},
{
"epoch": 19.521654815772465,
"grad_norm": 1.1940791606903076,
"learning_rate": 0.001,
"loss": 2.2167,
"step": 60400
},
{
"epoch": 19.55397543632838,
"grad_norm": 1.150011420249939,
"learning_rate": 0.001,
"loss": 2.2246,
"step": 60500
},
{
"epoch": 19.586296056884294,
"grad_norm": 1.097654104232788,
"learning_rate": 0.001,
"loss": 2.2212,
"step": 60600
},
{
"epoch": 19.618616677440208,
"grad_norm": 1.1845519542694092,
"learning_rate": 0.001,
"loss": 2.2251,
"step": 60700
},
{
"epoch": 19.650937297996123,
"grad_norm": 1.1336361169815063,
"learning_rate": 0.001,
"loss": 2.234,
"step": 60800
},
{
"epoch": 19.683257918552037,
"grad_norm": 1.1724891662597656,
"learning_rate": 0.001,
"loss": 2.2089,
"step": 60900
},
{
"epoch": 19.715578539107952,
"grad_norm": 0.9693626165390015,
"learning_rate": 0.001,
"loss": 2.2348,
"step": 61000
},
{
"epoch": 19.747899159663866,
"grad_norm": 1.1252988576889038,
"learning_rate": 0.001,
"loss": 2.2443,
"step": 61100
},
{
"epoch": 19.78021978021978,
"grad_norm": 0.9875534772872925,
"learning_rate": 0.001,
"loss": 2.2378,
"step": 61200
},
{
"epoch": 19.812540400775696,
"grad_norm": 1.3839106559753418,
"learning_rate": 0.001,
"loss": 2.2329,
"step": 61300
},
{
"epoch": 19.84486102133161,
"grad_norm": 1.5243983268737793,
"learning_rate": 0.001,
"loss": 2.252,
"step": 61400
},
{
"epoch": 19.877181641887525,
"grad_norm": 1.1300511360168457,
"learning_rate": 0.001,
"loss": 2.2568,
"step": 61500
},
{
"epoch": 19.90950226244344,
"grad_norm": 1.2548259496688843,
"learning_rate": 0.001,
"loss": 2.2585,
"step": 61600
},
{
"epoch": 19.941822882999354,
"grad_norm": 1.2727535963058472,
"learning_rate": 0.001,
"loss": 2.236,
"step": 61700
},
{
"epoch": 19.97414350355527,
"grad_norm": 1.0166510343551636,
"learning_rate": 0.001,
"loss": 2.2477,
"step": 61800
},
{
"epoch": 20.006464124111183,
"grad_norm": 1.0059797763824463,
"learning_rate": 0.001,
"loss": 2.2228,
"step": 61900
},
{
"epoch": 20.038784744667097,
"grad_norm": 1.5406187772750854,
"learning_rate": 0.001,
"loss": 2.1074,
"step": 62000
},
{
"epoch": 20.071105365223012,
"grad_norm": 1.2194257974624634,
"learning_rate": 0.001,
"loss": 2.128,
"step": 62100
},
{
"epoch": 20.103425985778927,
"grad_norm": 1.000863790512085,
"learning_rate": 0.001,
"loss": 2.1116,
"step": 62200
},
{
"epoch": 20.13574660633484,
"grad_norm": 1.1983182430267334,
"learning_rate": 0.001,
"loss": 2.1301,
"step": 62300
},
{
"epoch": 20.168067226890756,
"grad_norm": 1.2805134057998657,
"learning_rate": 0.001,
"loss": 2.1357,
"step": 62400
},
{
"epoch": 20.20038784744667,
"grad_norm": 1.5315334796905518,
"learning_rate": 0.001,
"loss": 2.127,
"step": 62500
},
{
"epoch": 20.232708468002585,
"grad_norm": 1.239235520362854,
"learning_rate": 0.001,
"loss": 2.1294,
"step": 62600
},
{
"epoch": 20.2650290885585,
"grad_norm": 1.214128851890564,
"learning_rate": 0.001,
"loss": 2.1297,
"step": 62700
},
{
"epoch": 20.297349709114414,
"grad_norm": 1.1846396923065186,
"learning_rate": 0.001,
"loss": 2.1661,
"step": 62800
},
{
"epoch": 20.32967032967033,
"grad_norm": 1.3728803396224976,
"learning_rate": 0.001,
"loss": 2.1509,
"step": 62900
},
{
"epoch": 20.361990950226243,
"grad_norm": 1.369428038597107,
"learning_rate": 0.001,
"loss": 2.1814,
"step": 63000
},
{
"epoch": 20.394311570782158,
"grad_norm": 1.49150812625885,
"learning_rate": 0.001,
"loss": 2.156,
"step": 63100
},
{
"epoch": 20.426632191338072,
"grad_norm": 1.056602954864502,
"learning_rate": 0.001,
"loss": 2.1691,
"step": 63200
},
{
"epoch": 20.458952811893987,
"grad_norm": 1.071666955947876,
"learning_rate": 0.001,
"loss": 2.1601,
"step": 63300
},
{
"epoch": 20.4912734324499,
"grad_norm": 1.1927623748779297,
"learning_rate": 0.001,
"loss": 2.1771,
"step": 63400
},
{
"epoch": 20.52359405300582,
"grad_norm": 1.1696590185165405,
"learning_rate": 0.001,
"loss": 2.1663,
"step": 63500
},
{
"epoch": 20.555914673561734,
"grad_norm": 1.454006314277649,
"learning_rate": 0.001,
"loss": 2.1836,
"step": 63600
},
{
"epoch": 20.58823529411765,
"grad_norm": 1.0862654447555542,
"learning_rate": 0.001,
"loss": 2.1822,
"step": 63700
},
{
"epoch": 20.620555914673563,
"grad_norm": 1.1355229616165161,
"learning_rate": 0.001,
"loss": 2.1819,
"step": 63800
},
{
"epoch": 20.652876535229478,
"grad_norm": 1.1009161472320557,
"learning_rate": 0.001,
"loss": 2.1741,
"step": 63900
},
{
"epoch": 20.685197155785392,
"grad_norm": 1.2176330089569092,
"learning_rate": 0.001,
"loss": 2.1882,
"step": 64000
},
{
"epoch": 20.717517776341307,
"grad_norm": 1.3134511709213257,
"learning_rate": 0.001,
"loss": 2.1978,
"step": 64100
},
{
"epoch": 20.74983839689722,
"grad_norm": 1.0069459676742554,
"learning_rate": 0.001,
"loss": 2.2084,
"step": 64200
},
{
"epoch": 20.782159017453136,
"grad_norm": 1.4432331323623657,
"learning_rate": 0.001,
"loss": 2.1979,
"step": 64300
},
{
"epoch": 20.81447963800905,
"grad_norm": 1.021673321723938,
"learning_rate": 0.001,
"loss": 2.2206,
"step": 64400
},
{
"epoch": 20.846800258564965,
"grad_norm": 1.5848689079284668,
"learning_rate": 0.001,
"loss": 2.2038,
"step": 64500
},
{
"epoch": 20.87912087912088,
"grad_norm": 1.2562905550003052,
"learning_rate": 0.001,
"loss": 2.2304,
"step": 64600
},
{
"epoch": 20.911441499676794,
"grad_norm": 1.084649920463562,
"learning_rate": 0.001,
"loss": 2.2199,
"step": 64700
},
{
"epoch": 20.94376212023271,
"grad_norm": 1.2564764022827148,
"learning_rate": 0.001,
"loss": 2.2264,
"step": 64800
},
{
"epoch": 20.976082740788623,
"grad_norm": 1.4754396677017212,
"learning_rate": 0.001,
"loss": 2.2299,
"step": 64900
},
{
"epoch": 21.008403361344538,
"grad_norm": 1.1850255727767944,
"learning_rate": 0.001,
"loss": 2.1639,
"step": 65000
},
{
"epoch": 21.040723981900452,
"grad_norm": 1.1101585626602173,
"learning_rate": 0.001,
"loss": 2.0785,
"step": 65100
},
{
"epoch": 21.073044602456367,
"grad_norm": 1.2771697044372559,
"learning_rate": 0.001,
"loss": 2.084,
"step": 65200
},
{
"epoch": 21.10536522301228,
"grad_norm": 1.1103767156600952,
"learning_rate": 0.001,
"loss": 2.0889,
"step": 65300
},
{
"epoch": 21.137685843568196,
"grad_norm": 1.2200546264648438,
"learning_rate": 0.001,
"loss": 2.0993,
"step": 65400
},
{
"epoch": 21.17000646412411,
"grad_norm": 1.3447659015655518,
"learning_rate": 0.001,
"loss": 2.0916,
"step": 65500
},
{
"epoch": 21.202327084680025,
"grad_norm": 2.29350209236145,
"learning_rate": 0.001,
"loss": 2.1049,
"step": 65600
},
{
"epoch": 21.23464770523594,
"grad_norm": 1.195257306098938,
"learning_rate": 0.001,
"loss": 2.1069,
"step": 65700
},
{
"epoch": 21.266968325791854,
"grad_norm": 1.0652481317520142,
"learning_rate": 0.001,
"loss": 2.1048,
"step": 65800
},
{
"epoch": 21.29928894634777,
"grad_norm": 1.1504040956497192,
"learning_rate": 0.001,
"loss": 2.1239,
"step": 65900
},
{
"epoch": 21.331609566903683,
"grad_norm": 1.2053735256195068,
"learning_rate": 0.001,
"loss": 2.1326,
"step": 66000
},
{
"epoch": 21.363930187459598,
"grad_norm": 39.07048034667969,
"learning_rate": 0.001,
"loss": 2.112,
"step": 66100
},
{
"epoch": 21.396250808015512,
"grad_norm": 1.1385326385498047,
"learning_rate": 0.001,
"loss": 2.1137,
"step": 66200
},
{
"epoch": 21.428571428571427,
"grad_norm": 1.2207857370376587,
"learning_rate": 0.001,
"loss": 2.125,
"step": 66300
},
{
"epoch": 21.46089204912734,
"grad_norm": 1.2614213228225708,
"learning_rate": 0.001,
"loss": 2.1434,
"step": 66400
},
{
"epoch": 21.49321266968326,
"grad_norm": 1.3514631986618042,
"learning_rate": 0.001,
"loss": 2.1106,
"step": 66500
},
{
"epoch": 21.525533290239174,
"grad_norm": 1.471451759338379,
"learning_rate": 0.001,
"loss": 2.1038,
"step": 66600
},
{
"epoch": 21.55785391079509,
"grad_norm": 1.3486419916152954,
"learning_rate": 0.001,
"loss": 2.1398,
"step": 66700
},
{
"epoch": 21.590174531351003,
"grad_norm": 3.350062847137451,
"learning_rate": 0.001,
"loss": 2.1478,
"step": 66800
},
{
"epoch": 21.622495151906918,
"grad_norm": 1.3389320373535156,
"learning_rate": 0.001,
"loss": 2.1472,
"step": 66900
},
{
"epoch": 21.654815772462833,
"grad_norm": 1.0626788139343262,
"learning_rate": 0.001,
"loss": 2.1595,
"step": 67000
},
{
"epoch": 21.687136393018747,
"grad_norm": 1.2367748022079468,
"learning_rate": 0.001,
"loss": 2.1655,
"step": 67100
},
{
"epoch": 21.71945701357466,
"grad_norm": 1.1122276782989502,
"learning_rate": 0.001,
"loss": 2.179,
"step": 67200
},
{
"epoch": 21.751777634130576,
"grad_norm": 1.179870367050171,
"learning_rate": 0.001,
"loss": 2.1743,
"step": 67300
},
{
"epoch": 21.78409825468649,
"grad_norm": 1.1807243824005127,
"learning_rate": 0.001,
"loss": 2.1917,
"step": 67400
},
{
"epoch": 21.816418875242405,
"grad_norm": 1.0804619789123535,
"learning_rate": 0.001,
"loss": 2.192,
"step": 67500
},
{
"epoch": 21.84873949579832,
"grad_norm": 1.6039589643478394,
"learning_rate": 0.001,
"loss": 2.1808,
"step": 67600
},
{
"epoch": 21.881060116354234,
"grad_norm": 1.2812756299972534,
"learning_rate": 0.001,
"loss": 2.1729,
"step": 67700
},
{
"epoch": 21.91338073691015,
"grad_norm": 1.1737068891525269,
"learning_rate": 0.001,
"loss": 2.1592,
"step": 67800
},
{
"epoch": 21.945701357466064,
"grad_norm": 1.1612744331359863,
"learning_rate": 0.001,
"loss": 2.1783,
"step": 67900
},
{
"epoch": 21.978021978021978,
"grad_norm": 1.238431692123413,
"learning_rate": 0.001,
"loss": 2.1802,
"step": 68000
},
{
"epoch": 22.010342598577893,
"grad_norm": 1.3498260974884033,
"learning_rate": 0.001,
"loss": 2.1187,
"step": 68100
},
{
"epoch": 22.042663219133807,
"grad_norm": 1.307900309562683,
"learning_rate": 0.001,
"loss": 2.0347,
"step": 68200
},
{
"epoch": 22.07498383968972,
"grad_norm": 1.265341877937317,
"learning_rate": 0.001,
"loss": 2.0262,
"step": 68300
},
{
"epoch": 22.107304460245636,
"grad_norm": 1.0917607545852661,
"learning_rate": 0.001,
"loss": 2.0515,
"step": 68400
},
{
"epoch": 22.13962508080155,
"grad_norm": 1.6194117069244385,
"learning_rate": 0.001,
"loss": 2.0563,
"step": 68500
},
{
"epoch": 22.171945701357465,
"grad_norm": 1.8932991027832031,
"learning_rate": 0.001,
"loss": 2.0652,
"step": 68600
},
{
"epoch": 22.20426632191338,
"grad_norm": 1.2356934547424316,
"learning_rate": 0.001,
"loss": 2.0741,
"step": 68700
},
{
"epoch": 22.236586942469295,
"grad_norm": 1.2971307039260864,
"learning_rate": 0.001,
"loss": 2.0682,
"step": 68800
},
{
"epoch": 22.26890756302521,
"grad_norm": 1.5780755281448364,
"learning_rate": 0.001,
"loss": 2.0567,
"step": 68900
},
{
"epoch": 22.301228183581124,
"grad_norm": 1.664420247077942,
"learning_rate": 0.001,
"loss": 2.0788,
"step": 69000
},
{
"epoch": 22.33354880413704,
"grad_norm": 1.3689608573913574,
"learning_rate": 0.001,
"loss": 2.0612,
"step": 69100
},
{
"epoch": 22.365869424692953,
"grad_norm": 1.2644816637039185,
"learning_rate": 0.001,
"loss": 2.1076,
"step": 69200
},
{
"epoch": 22.398190045248867,
"grad_norm": 1.6748441457748413,
"learning_rate": 0.001,
"loss": 2.1104,
"step": 69300
},
{
"epoch": 22.430510665804782,
"grad_norm": 1.0100698471069336,
"learning_rate": 0.001,
"loss": 2.1049,
"step": 69400
},
{
"epoch": 22.462831286360696,
"grad_norm": 1.4298042058944702,
"learning_rate": 0.001,
"loss": 2.1022,
"step": 69500
},
{
"epoch": 22.49515190691661,
"grad_norm": 1.8333765268325806,
"learning_rate": 0.001,
"loss": 2.12,
"step": 69600
},
{
"epoch": 22.52747252747253,
"grad_norm": 1.4487437009811401,
"learning_rate": 0.001,
"loss": 2.1007,
"step": 69700
},
{
"epoch": 22.559793148028444,
"grad_norm": 1.6013681888580322,
"learning_rate": 0.001,
"loss": 2.1079,
"step": 69800
},
{
"epoch": 22.59211376858436,
"grad_norm": 2.3832428455352783,
"learning_rate": 0.001,
"loss": 2.1165,
"step": 69900
},
{
"epoch": 22.624434389140273,
"grad_norm": 1.600501298904419,
"learning_rate": 0.001,
"loss": 2.1214,
"step": 70000
},
{
"epoch": 22.656755009696187,
"grad_norm": 1.5591310262680054,
"learning_rate": 0.001,
"loss": 2.115,
"step": 70100
},
{
"epoch": 22.689075630252102,
"grad_norm": 1.2109787464141846,
"learning_rate": 0.001,
"loss": 2.1442,
"step": 70200
},
{
"epoch": 22.721396250808017,
"grad_norm": 1.465110421180725,
"learning_rate": 0.001,
"loss": 2.1413,
"step": 70300
},
{
"epoch": 22.75371687136393,
"grad_norm": 1.2152010202407837,
"learning_rate": 0.001,
"loss": 2.1242,
"step": 70400
},
{
"epoch": 22.786037491919846,
"grad_norm": 1.4363352060317993,
"learning_rate": 0.001,
"loss": 2.1379,
"step": 70500
},
{
"epoch": 22.81835811247576,
"grad_norm": 1.399573564529419,
"learning_rate": 0.001,
"loss": 2.138,
"step": 70600
},
{
"epoch": 22.850678733031675,
"grad_norm": 1.378006100654602,
"learning_rate": 0.001,
"loss": 2.1285,
"step": 70700
},
{
"epoch": 22.88299935358759,
"grad_norm": 1.274100422859192,
"learning_rate": 0.001,
"loss": 2.1528,
"step": 70800
},
{
"epoch": 22.915319974143504,
"grad_norm": 1.2786856889724731,
"learning_rate": 0.001,
"loss": 2.1371,
"step": 70900
},
{
"epoch": 22.94764059469942,
"grad_norm": 1.3367137908935547,
"learning_rate": 0.001,
"loss": 2.1356,
"step": 71000
},
{
"epoch": 22.979961215255333,
"grad_norm": 1.1747994422912598,
"learning_rate": 0.001,
"loss": 2.1513,
"step": 71100
},
{
"epoch": 23.012281835811248,
"grad_norm": 1.472936987876892,
"learning_rate": 0.001,
"loss": 2.0644,
"step": 71200
},
{
"epoch": 23.044602456367162,
"grad_norm": 1.5637643337249756,
"learning_rate": 0.001,
"loss": 2.0074,
"step": 71300
},
{
"epoch": 23.076923076923077,
"grad_norm": 1.5909124612808228,
"learning_rate": 0.001,
"loss": 1.99,
"step": 71400
},
{
"epoch": 23.10924369747899,
"grad_norm": 1.779449462890625,
"learning_rate": 0.001,
"loss": 2.0129,
"step": 71500
},
{
"epoch": 23.141564318034906,
"grad_norm": 1.4273805618286133,
"learning_rate": 0.001,
"loss": 2.0336,
"step": 71600
},
{
"epoch": 23.17388493859082,
"grad_norm": 1.4807401895523071,
"learning_rate": 0.001,
"loss": 2.0258,
"step": 71700
},
{
"epoch": 23.206205559146735,
"grad_norm": 1.5846195220947266,
"learning_rate": 0.001,
"loss": 2.0398,
"step": 71800
},
{
"epoch": 23.23852617970265,
"grad_norm": 1.2522149085998535,
"learning_rate": 0.001,
"loss": 2.0358,
"step": 71900
},
{
"epoch": 23.270846800258564,
"grad_norm": 1.8011384010314941,
"learning_rate": 0.001,
"loss": 2.0368,
"step": 72000
},
{
"epoch": 23.30316742081448,
"grad_norm": 1.339313268661499,
"learning_rate": 0.001,
"loss": 2.0504,
"step": 72100
},
{
"epoch": 23.335488041370393,
"grad_norm": 1.317734956741333,
"learning_rate": 0.001,
"loss": 2.0429,
"step": 72200
},
{
"epoch": 23.367808661926308,
"grad_norm": 1.351259708404541,
"learning_rate": 0.001,
"loss": 2.0514,
"step": 72300
},
{
"epoch": 23.400129282482222,
"grad_norm": 1.3582738637924194,
"learning_rate": 0.001,
"loss": 2.0577,
"step": 72400
},
{
"epoch": 23.432449903038137,
"grad_norm": 1.813624382019043,
"learning_rate": 0.001,
"loss": 2.0445,
"step": 72500
},
{
"epoch": 23.46477052359405,
"grad_norm": 1.531417727470398,
"learning_rate": 0.001,
"loss": 2.0614,
"step": 72600
},
{
"epoch": 23.49709114414997,
"grad_norm": 1.5919642448425293,
"learning_rate": 0.001,
"loss": 2.0691,
"step": 72700
},
{
"epoch": 23.529411764705884,
"grad_norm": 1.5767107009887695,
"learning_rate": 0.001,
"loss": 2.0789,
"step": 72800
},
{
"epoch": 23.5617323852618,
"grad_norm": 1.5957386493682861,
"learning_rate": 0.001,
"loss": 2.067,
"step": 72900
},
{
"epoch": 23.594053005817713,
"grad_norm": 1.8179656267166138,
"learning_rate": 0.001,
"loss": 2.0862,
"step": 73000
},
{
"epoch": 23.626373626373628,
"grad_norm": 1.5586670637130737,
"learning_rate": 0.001,
"loss": 2.0848,
"step": 73100
},
{
"epoch": 23.658694246929542,
"grad_norm": 1.4760098457336426,
"learning_rate": 0.001,
"loss": 2.106,
"step": 73200
},
{
"epoch": 23.691014867485457,
"grad_norm": 1.4071135520935059,
"learning_rate": 0.001,
"loss": 2.0928,
"step": 73300
},
{
"epoch": 23.72333548804137,
"grad_norm": 1.3541771173477173,
"learning_rate": 0.001,
"loss": 2.1112,
"step": 73400
},
{
"epoch": 23.755656108597286,
"grad_norm": 1.6055703163146973,
"learning_rate": 0.001,
"loss": 2.1058,
"step": 73500
},
{
"epoch": 23.7879767291532,
"grad_norm": 1.7289507389068604,
"learning_rate": 0.001,
"loss": 2.1219,
"step": 73600
},
{
"epoch": 23.820297349709115,
"grad_norm": 1.8029732704162598,
"learning_rate": 0.001,
"loss": 2.083,
"step": 73700
},
{
"epoch": 23.85261797026503,
"grad_norm": 1.8605992794036865,
"learning_rate": 0.001,
"loss": 2.103,
"step": 73800
},
{
"epoch": 23.884938590820944,
"grad_norm": 1.1460589170455933,
"learning_rate": 0.001,
"loss": 2.0971,
"step": 73900
},
{
"epoch": 23.91725921137686,
"grad_norm": 1.4249467849731445,
"learning_rate": 0.001,
"loss": 2.1331,
"step": 74000
},
{
"epoch": 23.949579831932773,
"grad_norm": 1.4224003553390503,
"learning_rate": 0.001,
"loss": 2.1221,
"step": 74100
},
{
"epoch": 23.981900452488688,
"grad_norm": 1.3229635953903198,
"learning_rate": 0.001,
"loss": 2.1182,
"step": 74200
},
{
"epoch": 24.014221073044602,
"grad_norm": 1.3930984735488892,
"learning_rate": 0.001,
"loss": 2.0503,
"step": 74300
},
{
"epoch": 24.046541693600517,
"grad_norm": 1.4770911931991577,
"learning_rate": 0.001,
"loss": 1.9697,
"step": 74400
},
{
"epoch": 24.07886231415643,
"grad_norm": 1.6999855041503906,
"learning_rate": 0.001,
"loss": 1.9748,
"step": 74500
},
{
"epoch": 24.111182934712346,
"grad_norm": 1.63511061668396,
"learning_rate": 0.001,
"loss": 1.9672,
"step": 74600
},
{
"epoch": 24.14350355526826,
"grad_norm": 1.4599164724349976,
"learning_rate": 0.001,
"loss": 1.9993,
"step": 74700
},
{
"epoch": 24.175824175824175,
"grad_norm": 1.9511414766311646,
"learning_rate": 0.001,
"loss": 2.0237,
"step": 74800
},
{
"epoch": 24.20814479638009,
"grad_norm": 1.46794593334198,
"learning_rate": 0.001,
"loss": 1.985,
"step": 74900
},
{
"epoch": 24.240465416936004,
"grad_norm": 1.2807059288024902,
"learning_rate": 0.001,
"loss": 2.0034,
"step": 75000
},
{
"epoch": 24.27278603749192,
"grad_norm": 1.7410281896591187,
"learning_rate": 0.001,
"loss": 2.024,
"step": 75100
},
{
"epoch": 24.305106658047833,
"grad_norm": 1.439113736152649,
"learning_rate": 0.001,
"loss": 2.0233,
"step": 75200
},
{
"epoch": 24.337427278603748,
"grad_norm": 1.419325351715088,
"learning_rate": 0.001,
"loss": 2.0295,
"step": 75300
},
{
"epoch": 24.369747899159663,
"grad_norm": 1.564228892326355,
"learning_rate": 0.001,
"loss": 2.0153,
"step": 75400
},
{
"epoch": 24.402068519715577,
"grad_norm": 1.38504159450531,
"learning_rate": 0.001,
"loss": 2.0205,
"step": 75500
},
{
"epoch": 24.43438914027149,
"grad_norm": 1.453363060951233,
"learning_rate": 0.001,
"loss": 2.0271,
"step": 75600
},
{
"epoch": 24.466709760827406,
"grad_norm": 1.6083861589431763,
"learning_rate": 0.001,
"loss": 2.014,
"step": 75700
},
{
"epoch": 24.49903038138332,
"grad_norm": 1.7092853784561157,
"learning_rate": 0.001,
"loss": 2.0424,
"step": 75800
},
{
"epoch": 24.53135100193924,
"grad_norm": 1.4002851247787476,
"learning_rate": 0.001,
"loss": 2.0249,
"step": 75900
},
{
"epoch": 24.563671622495153,
"grad_norm": 1.5491043329238892,
"learning_rate": 0.001,
"loss": 2.0572,
"step": 76000
},
{
"epoch": 24.595992243051068,
"grad_norm": 1.3030824661254883,
"learning_rate": 0.001,
"loss": 2.0439,
"step": 76100
},
{
"epoch": 24.628312863606983,
"grad_norm": 1.2916637659072876,
"learning_rate": 0.001,
"loss": 2.0471,
"step": 76200
},
{
"epoch": 24.660633484162897,
"grad_norm": 1.4240041971206665,
"learning_rate": 0.001,
"loss": 2.0634,
"step": 76300
},
{
"epoch": 24.69295410471881,
"grad_norm": 1.753969669342041,
"learning_rate": 0.001,
"loss": 2.0657,
"step": 76400
},
{
"epoch": 24.725274725274726,
"grad_norm": 1.2906912565231323,
"learning_rate": 0.001,
"loss": 2.0597,
"step": 76500
},
{
"epoch": 24.75759534583064,
"grad_norm": 1.5350031852722168,
"learning_rate": 0.001,
"loss": 2.0839,
"step": 76600
},
{
"epoch": 24.789915966386555,
"grad_norm": 1.9816159009933472,
"learning_rate": 0.001,
"loss": 2.057,
"step": 76700
},
{
"epoch": 24.82223658694247,
"grad_norm": 1.9255553483963013,
"learning_rate": 0.001,
"loss": 2.0699,
"step": 76800
},
{
"epoch": 24.854557207498384,
"grad_norm": 1.4984052181243896,
"learning_rate": 0.001,
"loss": 2.0713,
"step": 76900
},
{
"epoch": 24.8868778280543,
"grad_norm": 4.322579860687256,
"learning_rate": 0.001,
"loss": 2.1062,
"step": 77000
},
{
"epoch": 24.919198448610214,
"grad_norm": 1.597294569015503,
"learning_rate": 0.001,
"loss": 2.0879,
"step": 77100
},
{
"epoch": 24.951519069166128,
"grad_norm": 1.4483535289764404,
"learning_rate": 0.001,
"loss": 2.0873,
"step": 77200
},
{
"epoch": 24.983839689722043,
"grad_norm": 1.4084569215774536,
"learning_rate": 0.001,
"loss": 2.0913,
"step": 77300
},
{
"epoch": 25.016160310277957,
"grad_norm": 1.9849538803100586,
"learning_rate": 0.001,
"loss": 1.9734,
"step": 77400
},
{
"epoch": 25.048480930833872,
"grad_norm": 2.1516408920288086,
"learning_rate": 0.001,
"loss": 1.9801,
"step": 77500
},
{
"epoch": 25.080801551389786,
"grad_norm": 2.2661306858062744,
"learning_rate": 0.001,
"loss": 1.937,
"step": 77600
},
{
"epoch": 25.1131221719457,
"grad_norm": 2.510815382003784,
"learning_rate": 0.001,
"loss": 1.9473,
"step": 77700
},
{
"epoch": 25.145442792501616,
"grad_norm": 2.2111470699310303,
"learning_rate": 0.001,
"loss": 1.96,
"step": 77800
},
{
"epoch": 25.17776341305753,
"grad_norm": 2.111010789871216,
"learning_rate": 0.001,
"loss": 1.9912,
"step": 77900
},
{
"epoch": 25.210084033613445,
"grad_norm": 2.753941297531128,
"learning_rate": 0.001,
"loss": 1.9919,
"step": 78000
},
{
"epoch": 25.24240465416936,
"grad_norm": 2.347527027130127,
"learning_rate": 0.001,
"loss": 1.9843,
"step": 78100
},
{
"epoch": 25.274725274725274,
"grad_norm": 2.777312755584717,
"learning_rate": 0.001,
"loss": 1.9692,
"step": 78200
},
{
"epoch": 25.30704589528119,
"grad_norm": 1.7811833620071411,
"learning_rate": 0.001,
"loss": 2.0007,
"step": 78300
},
{
"epoch": 25.339366515837103,
"grad_norm": 2.1932690143585205,
"learning_rate": 0.001,
"loss": 2.0012,
"step": 78400
},
{
"epoch": 25.371687136393017,
"grad_norm": 2.195629596710205,
"learning_rate": 0.001,
"loss": 1.9921,
"step": 78500
},
{
"epoch": 25.404007756948932,
"grad_norm": 2.693999767303467,
"learning_rate": 0.001,
"loss": 2.0042,
"step": 78600
},
{
"epoch": 25.436328377504847,
"grad_norm": 3.0932207107543945,
"learning_rate": 0.001,
"loss": 1.9884,
"step": 78700
},
{
"epoch": 25.46864899806076,
"grad_norm": 2.486372232437134,
"learning_rate": 0.001,
"loss": 2.0082,
"step": 78800
},
{
"epoch": 25.50096961861668,
"grad_norm": 2.309953212738037,
"learning_rate": 0.001,
"loss": 1.9836,
"step": 78900
},
{
"epoch": 25.533290239172594,
"grad_norm": 2.481170892715454,
"learning_rate": 0.001,
"loss": 2.0401,
"step": 79000
},
{
"epoch": 25.56561085972851,
"grad_norm": 6.894639492034912,
"learning_rate": 0.001,
"loss": 2.0187,
"step": 79100
},
{
"epoch": 25.597931480284423,
"grad_norm": 2.154688596725464,
"learning_rate": 0.001,
"loss": 2.0334,
"step": 79200
},
{
"epoch": 25.630252100840337,
"grad_norm": 2.5269687175750732,
"learning_rate": 0.001,
"loss": 2.0062,
"step": 79300
},
{
"epoch": 25.662572721396252,
"grad_norm": 2.0479533672332764,
"learning_rate": 0.001,
"loss": 2.0332,
"step": 79400
},
{
"epoch": 25.694893341952167,
"grad_norm": 2.8446123600006104,
"learning_rate": 0.001,
"loss": 2.0332,
"step": 79500
},
{
"epoch": 25.72721396250808,
"grad_norm": 1.9865922927856445,
"learning_rate": 0.001,
"loss": 2.0408,
"step": 79600
},
{
"epoch": 25.759534583063996,
"grad_norm": 2.4870991706848145,
"learning_rate": 0.001,
"loss": 2.0251,
"step": 79700
},
{
"epoch": 25.79185520361991,
"grad_norm": 2.0632777214050293,
"learning_rate": 0.001,
"loss": 2.0453,
"step": 79800
},
{
"epoch": 25.824175824175825,
"grad_norm": 2.7556283473968506,
"learning_rate": 0.001,
"loss": 2.0285,
"step": 79900
},
{
"epoch": 25.85649644473174,
"grad_norm": 2.0263702869415283,
"learning_rate": 0.001,
"loss": 2.0623,
"step": 80000
},
{
"epoch": 25.888817065287654,
"grad_norm": 3.15863299369812,
"learning_rate": 0.001,
"loss": 2.0461,
"step": 80100
},
{
"epoch": 25.92113768584357,
"grad_norm": 2.3202335834503174,
"learning_rate": 0.001,
"loss": 2.0489,
"step": 80200
},
{
"epoch": 25.953458306399483,
"grad_norm": 2.275404691696167,
"learning_rate": 0.001,
"loss": 2.0559,
"step": 80300
},
{
"epoch": 25.985778926955398,
"grad_norm": 1.9701374769210815,
"learning_rate": 0.001,
"loss": 2.0521,
"step": 80400
},
{
"epoch": 26.018099547511312,
"grad_norm": 2.0686566829681396,
"learning_rate": 0.001,
"loss": 1.982,
"step": 80500
},
{
"epoch": 26.050420168067227,
"grad_norm": 1.7882318496704102,
"learning_rate": 0.001,
"loss": 1.9092,
"step": 80600
},
{
"epoch": 26.08274078862314,
"grad_norm": 1.3887428045272827,
"learning_rate": 0.001,
"loss": 1.9209,
"step": 80700
},
{
"epoch": 26.115061409179056,
"grad_norm": 1.775455117225647,
"learning_rate": 0.001,
"loss": 1.935,
"step": 80800
},
{
"epoch": 26.14738202973497,
"grad_norm": 1.9676622152328491,
"learning_rate": 0.001,
"loss": 1.9301,
"step": 80900
},
{
"epoch": 26.179702650290885,
"grad_norm": 1.7484667301177979,
"learning_rate": 0.001,
"loss": 1.9365,
"step": 81000
},
{
"epoch": 26.2120232708468,
"grad_norm": 1.333925485610962,
"learning_rate": 0.001,
"loss": 1.9396,
"step": 81100
},
{
"epoch": 26.244343891402714,
"grad_norm": 1.3024888038635254,
"learning_rate": 0.001,
"loss": 1.9488,
"step": 81200
},
{
"epoch": 26.27666451195863,
"grad_norm": 1.3314354419708252,
"learning_rate": 0.001,
"loss": 1.9625,
"step": 81300
},
{
"epoch": 26.308985132514543,
"grad_norm": 2.3453688621520996,
"learning_rate": 0.001,
"loss": 1.9588,
"step": 81400
},
{
"epoch": 26.341305753070458,
"grad_norm": 1.459130048751831,
"learning_rate": 0.001,
"loss": 1.9716,
"step": 81500
},
{
"epoch": 26.373626373626372,
"grad_norm": 1.3043862581253052,
"learning_rate": 0.001,
"loss": 1.9662,
"step": 81600
},
{
"epoch": 26.405946994182287,
"grad_norm": 1.2890630960464478,
"learning_rate": 0.001,
"loss": 1.9729,
"step": 81700
},
{
"epoch": 26.4382676147382,
"grad_norm": 1.2323054075241089,
"learning_rate": 0.001,
"loss": 1.9562,
"step": 81800
},
{
"epoch": 26.470588235294116,
"grad_norm": 1.9107179641723633,
"learning_rate": 0.001,
"loss": 1.9665,
"step": 81900
},
{
"epoch": 26.50290885585003,
"grad_norm": 1.5742831230163574,
"learning_rate": 0.001,
"loss": 1.9809,
"step": 82000
},
{
"epoch": 26.53522947640595,
"grad_norm": 1.3814857006072998,
"learning_rate": 0.001,
"loss": 1.975,
"step": 82100
},
{
"epoch": 26.567550096961863,
"grad_norm": 1.3042103052139282,
"learning_rate": 0.001,
"loss": 1.9745,
"step": 82200
},
{
"epoch": 26.599870717517778,
"grad_norm": 1.6151447296142578,
"learning_rate": 0.001,
"loss": 1.9872,
"step": 82300
},
{
"epoch": 26.632191338073692,
"grad_norm": 1.6068259477615356,
"learning_rate": 0.001,
"loss": 1.9933,
"step": 82400
},
{
"epoch": 26.664511958629607,
"grad_norm": 1.3208508491516113,
"learning_rate": 0.001,
"loss": 2.0022,
"step": 82500
},
{
"epoch": 26.69683257918552,
"grad_norm": 1.5930817127227783,
"learning_rate": 0.001,
"loss": 1.9939,
"step": 82600
},
{
"epoch": 26.729153199741436,
"grad_norm": 1.5000683069229126,
"learning_rate": 0.001,
"loss": 2.0076,
"step": 82700
},
{
"epoch": 26.76147382029735,
"grad_norm": 1.692630410194397,
"learning_rate": 0.001,
"loss": 2.002,
"step": 82800
},
{
"epoch": 26.793794440853265,
"grad_norm": 2.1297543048858643,
"learning_rate": 0.001,
"loss": 2.024,
"step": 82900
},
{
"epoch": 26.82611506140918,
"grad_norm": 1.2182215452194214,
"learning_rate": 0.001,
"loss": 2.0362,
"step": 83000
},
{
"epoch": 26.858435681965094,
"grad_norm": 1.3465772867202759,
"learning_rate": 0.001,
"loss": 2.0525,
"step": 83100
},
{
"epoch": 26.89075630252101,
"grad_norm": 1.9355134963989258,
"learning_rate": 0.001,
"loss": 2.0377,
"step": 83200
},
{
"epoch": 26.923076923076923,
"grad_norm": 1.3993531465530396,
"learning_rate": 0.001,
"loss": 2.0204,
"step": 83300
},
{
"epoch": 26.955397543632838,
"grad_norm": 1.3632115125656128,
"learning_rate": 0.001,
"loss": 2.0247,
"step": 83400
},
{
"epoch": 26.987718164188752,
"grad_norm": 1.70760977268219,
"learning_rate": 0.001,
"loss": 2.0275,
"step": 83500
},
{
"epoch": 27.020038784744667,
"grad_norm": 1.203029751777649,
"learning_rate": 0.001,
"loss": 1.951,
"step": 83600
},
{
"epoch": 27.05235940530058,
"grad_norm": 1.2251530885696411,
"learning_rate": 0.001,
"loss": 1.8761,
"step": 83700
},
{
"epoch": 27.084680025856496,
"grad_norm": 1.2010034322738647,
"learning_rate": 0.001,
"loss": 1.8784,
"step": 83800
},
{
"epoch": 27.11700064641241,
"grad_norm": 1.628166913986206,
"learning_rate": 0.001,
"loss": 1.9073,
"step": 83900
},
{
"epoch": 27.149321266968325,
"grad_norm": 1.467311143875122,
"learning_rate": 0.001,
"loss": 1.9048,
"step": 84000
},
{
"epoch": 27.18164188752424,
"grad_norm": 1.3212134838104248,
"learning_rate": 0.001,
"loss": 1.9009,
"step": 84100
},
{
"epoch": 27.213962508080154,
"grad_norm": 1.3070082664489746,
"learning_rate": 0.001,
"loss": 1.9234,
"step": 84200
},
{
"epoch": 27.24628312863607,
"grad_norm": 1.4710814952850342,
"learning_rate": 0.001,
"loss": 1.897,
"step": 84300
},
{
"epoch": 27.278603749191983,
"grad_norm": 1.5833498239517212,
"learning_rate": 0.001,
"loss": 1.9441,
"step": 84400
},
{
"epoch": 27.310924369747898,
"grad_norm": 1.346295952796936,
"learning_rate": 0.001,
"loss": 1.9243,
"step": 84500
},
{
"epoch": 27.343244990303813,
"grad_norm": 1.4825193881988525,
"learning_rate": 0.001,
"loss": 1.948,
"step": 84600
},
{
"epoch": 27.375565610859727,
"grad_norm": 1.489837884902954,
"learning_rate": 0.001,
"loss": 1.933,
"step": 84700
},
{
"epoch": 27.40788623141564,
"grad_norm": 1.3613611459732056,
"learning_rate": 0.001,
"loss": 1.9417,
"step": 84800
},
{
"epoch": 27.440206851971556,
"grad_norm": 1.3851776123046875,
"learning_rate": 0.001,
"loss": 1.9535,
"step": 84900
},
{
"epoch": 27.47252747252747,
"grad_norm": 1.228777289390564,
"learning_rate": 0.001,
"loss": 1.9639,
"step": 85000
},
{
"epoch": 27.50484809308339,
"grad_norm": 1.362752914428711,
"learning_rate": 0.001,
"loss": 1.9513,
"step": 85100
},
{
"epoch": 27.537168713639304,
"grad_norm": 1.3782377243041992,
"learning_rate": 0.001,
"loss": 1.9598,
"step": 85200
},
{
"epoch": 27.569489334195218,
"grad_norm": 1.31719970703125,
"learning_rate": 0.001,
"loss": 1.961,
"step": 85300
},
{
"epoch": 27.601809954751133,
"grad_norm": 1.4611492156982422,
"learning_rate": 0.001,
"loss": 1.9581,
"step": 85400
},
{
"epoch": 27.634130575307047,
"grad_norm": 1.6541672945022583,
"learning_rate": 0.001,
"loss": 1.9674,
"step": 85500
},
{
"epoch": 27.66645119586296,
"grad_norm": 1.4168950319290161,
"learning_rate": 0.001,
"loss": 1.9783,
"step": 85600
},
{
"epoch": 27.698771816418876,
"grad_norm": 1.565339207649231,
"learning_rate": 0.001,
"loss": 1.9772,
"step": 85700
},
{
"epoch": 27.73109243697479,
"grad_norm": 1.4636658430099487,
"learning_rate": 0.001,
"loss": 1.9662,
"step": 85800
},
{
"epoch": 27.763413057530705,
"grad_norm": 1.1653151512145996,
"learning_rate": 0.001,
"loss": 1.9753,
"step": 85900
},
{
"epoch": 27.79573367808662,
"grad_norm": 1.3461968898773193,
"learning_rate": 0.001,
"loss": 1.9591,
"step": 86000
},
{
"epoch": 27.828054298642535,
"grad_norm": 1.2734227180480957,
"learning_rate": 0.001,
"loss": 1.975,
"step": 86100
},
{
"epoch": 27.86037491919845,
"grad_norm": 1.3196417093276978,
"learning_rate": 0.001,
"loss": 1.9847,
"step": 86200
},
{
"epoch": 27.892695539754364,
"grad_norm": 1.2176880836486816,
"learning_rate": 0.001,
"loss": 1.9904,
"step": 86300
},
{
"epoch": 27.92501616031028,
"grad_norm": 1.1071490049362183,
"learning_rate": 0.001,
"loss": 1.9906,
"step": 86400
},
{
"epoch": 27.957336780866193,
"grad_norm": 1.5895119905471802,
"learning_rate": 0.001,
"loss": 2.0066,
"step": 86500
},
{
"epoch": 27.989657401422107,
"grad_norm": 1.3792082071304321,
"learning_rate": 0.001,
"loss": 1.9941,
"step": 86600
},
{
"epoch": 28.021978021978022,
"grad_norm": 1.2426801919937134,
"learning_rate": 0.001,
"loss": 1.9242,
"step": 86700
},
{
"epoch": 28.054298642533936,
"grad_norm": 1.7085320949554443,
"learning_rate": 0.001,
"loss": 1.8433,
"step": 86800
},
{
"epoch": 28.08661926308985,
"grad_norm": 1.3165888786315918,
"learning_rate": 0.001,
"loss": 1.8714,
"step": 86900
},
{
"epoch": 28.118939883645766,
"grad_norm": 1.4661237001419067,
"learning_rate": 0.001,
"loss": 1.8749,
"step": 87000
},
{
"epoch": 28.15126050420168,
"grad_norm": 1.6082890033721924,
"learning_rate": 0.001,
"loss": 1.8612,
"step": 87100
},
{
"epoch": 28.183581124757595,
"grad_norm": 1.3356757164001465,
"learning_rate": 0.001,
"loss": 1.8897,
"step": 87200
},
{
"epoch": 28.21590174531351,
"grad_norm": 1.557093620300293,
"learning_rate": 0.001,
"loss": 1.8871,
"step": 87300
},
{
"epoch": 28.248222365869424,
"grad_norm": 1.7916589975357056,
"learning_rate": 0.001,
"loss": 1.8883,
"step": 87400
},
{
"epoch": 28.28054298642534,
"grad_norm": 1.2493481636047363,
"learning_rate": 0.001,
"loss": 1.9002,
"step": 87500
},
{
"epoch": 28.312863606981253,
"grad_norm": 1.3864846229553223,
"learning_rate": 0.001,
"loss": 1.9133,
"step": 87600
},
{
"epoch": 28.345184227537167,
"grad_norm": 1.6483922004699707,
"learning_rate": 0.001,
"loss": 1.8825,
"step": 87700
},
{
"epoch": 28.377504848093082,
"grad_norm": 1.1912819147109985,
"learning_rate": 0.001,
"loss": 1.91,
"step": 87800
},
{
"epoch": 28.409825468648997,
"grad_norm": 1.1268420219421387,
"learning_rate": 0.001,
"loss": 1.9175,
"step": 87900
},
{
"epoch": 28.44214608920491,
"grad_norm": 1.364435076713562,
"learning_rate": 0.001,
"loss": 1.9084,
"step": 88000
},
{
"epoch": 28.474466709760826,
"grad_norm": 1.3538644313812256,
"learning_rate": 0.001,
"loss": 1.8976,
"step": 88100
},
{
"epoch": 28.50678733031674,
"grad_norm": 1.308135986328125,
"learning_rate": 0.001,
"loss": 1.9203,
"step": 88200
},
{
"epoch": 28.53910795087266,
"grad_norm": 1.5055309534072876,
"learning_rate": 0.001,
"loss": 1.9177,
"step": 88300
},
{
"epoch": 28.571428571428573,
"grad_norm": 1.4888206720352173,
"learning_rate": 0.001,
"loss": 1.9213,
"step": 88400
},
{
"epoch": 28.603749191984488,
"grad_norm": 1.112297773361206,
"learning_rate": 0.001,
"loss": 1.939,
"step": 88500
},
{
"epoch": 28.636069812540402,
"grad_norm": 1.3420555591583252,
"learning_rate": 0.001,
"loss": 1.9181,
"step": 88600
},
{
"epoch": 28.668390433096317,
"grad_norm": 1.143880009651184,
"learning_rate": 0.001,
"loss": 1.9568,
"step": 88700
},
{
"epoch": 28.70071105365223,
"grad_norm": 1.6893914937973022,
"learning_rate": 0.001,
"loss": 1.9262,
"step": 88800
},
{
"epoch": 28.733031674208146,
"grad_norm": 1.6080857515335083,
"learning_rate": 0.001,
"loss": 1.929,
"step": 88900
},
{
"epoch": 28.76535229476406,
"grad_norm": 1.342633605003357,
"learning_rate": 0.001,
"loss": 1.9665,
"step": 89000
},
{
"epoch": 28.797672915319975,
"grad_norm": 1.5504629611968994,
"learning_rate": 0.001,
"loss": 1.9502,
"step": 89100
},
{
"epoch": 28.82999353587589,
"grad_norm": 1.4633890390396118,
"learning_rate": 0.001,
"loss": 1.9644,
"step": 89200
},
{
"epoch": 28.862314156431804,
"grad_norm": 1.3921465873718262,
"learning_rate": 0.001,
"loss": 1.9698,
"step": 89300
},
{
"epoch": 28.89463477698772,
"grad_norm": 1.5433467626571655,
"learning_rate": 0.001,
"loss": 1.9694,
"step": 89400
},
{
"epoch": 28.926955397543633,
"grad_norm": 1.159615397453308,
"learning_rate": 0.001,
"loss": 1.968,
"step": 89500
},
{
"epoch": 28.959276018099548,
"grad_norm": 1.3793103694915771,
"learning_rate": 0.001,
"loss": 1.9649,
"step": 89600
},
{
"epoch": 28.991596638655462,
"grad_norm": 2.6745166778564453,
"learning_rate": 0.001,
"loss": 1.9738,
"step": 89700
},
{
"epoch": 29.023917259211377,
"grad_norm": 1.4499655961990356,
"learning_rate": 0.001,
"loss": 1.8535,
"step": 89800
},
{
"epoch": 29.05623787976729,
"grad_norm": 1.5100798606872559,
"learning_rate": 0.001,
"loss": 1.8261,
"step": 89900
},
{
"epoch": 29.088558500323206,
"grad_norm": 1.518397569656372,
"learning_rate": 0.001,
"loss": 1.8219,
"step": 90000
},
{
"epoch": 29.12087912087912,
"grad_norm": 1.3293129205703735,
"learning_rate": 0.001,
"loss": 1.8459,
"step": 90100
},
{
"epoch": 29.153199741435035,
"grad_norm": 1.8555089235305786,
"learning_rate": 0.001,
"loss": 1.8459,
"step": 90200
},
{
"epoch": 29.18552036199095,
"grad_norm": 1.4512817859649658,
"learning_rate": 0.001,
"loss": 1.8584,
"step": 90300
},
{
"epoch": 29.217840982546864,
"grad_norm": 1.735163688659668,
"learning_rate": 0.001,
"loss": 1.8434,
"step": 90400
},
{
"epoch": 29.25016160310278,
"grad_norm": 1.3833225965499878,
"learning_rate": 0.001,
"loss": 1.884,
"step": 90500
},
{
"epoch": 29.282482223658693,
"grad_norm": 1.7664813995361328,
"learning_rate": 0.001,
"loss": 1.88,
"step": 90600
},
{
"epoch": 29.314802844214608,
"grad_norm": 1.440193772315979,
"learning_rate": 0.001,
"loss": 1.8567,
"step": 90700
},
{
"epoch": 29.347123464770522,
"grad_norm": 1.239136815071106,
"learning_rate": 0.001,
"loss": 1.8746,
"step": 90800
},
{
"epoch": 29.379444085326437,
"grad_norm": 1.3825310468673706,
"learning_rate": 0.001,
"loss": 1.8684,
"step": 90900
},
{
"epoch": 29.41176470588235,
"grad_norm": 1.4752728939056396,
"learning_rate": 0.001,
"loss": 1.8587,
"step": 91000
},
{
"epoch": 29.444085326438266,
"grad_norm": 1.495429515838623,
"learning_rate": 0.001,
"loss": 1.8849,
"step": 91100
},
{
"epoch": 29.47640594699418,
"grad_norm": 1.4592987298965454,
"learning_rate": 0.001,
"loss": 1.8892,
"step": 91200
},
{
"epoch": 29.5087265675501,
"grad_norm": 1.321603536605835,
"learning_rate": 0.001,
"loss": 1.9044,
"step": 91300
},
{
"epoch": 29.541047188106013,
"grad_norm": 1.2586690187454224,
"learning_rate": 0.001,
"loss": 1.9141,
"step": 91400
},
{
"epoch": 29.573367808661928,
"grad_norm": 1.3730324506759644,
"learning_rate": 0.001,
"loss": 1.8934,
"step": 91500
},
{
"epoch": 29.605688429217842,
"grad_norm": 1.6763105392456055,
"learning_rate": 0.001,
"loss": 1.905,
"step": 91600
},
{
"epoch": 29.638009049773757,
"grad_norm": 1.6392866373062134,
"learning_rate": 0.001,
"loss": 1.9155,
"step": 91700
},
{
"epoch": 29.67032967032967,
"grad_norm": 1.2820043563842773,
"learning_rate": 0.001,
"loss": 1.9181,
"step": 91800
},
{
"epoch": 29.702650290885586,
"grad_norm": 1.6354836225509644,
"learning_rate": 0.001,
"loss": 1.9207,
"step": 91900
},
{
"epoch": 29.7349709114415,
"grad_norm": 1.3978163003921509,
"learning_rate": 0.001,
"loss": 1.9039,
"step": 92000
},
{
"epoch": 29.767291531997415,
"grad_norm": 1.3554919958114624,
"learning_rate": 0.001,
"loss": 1.9283,
"step": 92100
},
{
"epoch": 29.79961215255333,
"grad_norm": 1.4941645860671997,
"learning_rate": 0.001,
"loss": 1.9183,
"step": 92200
},
{
"epoch": 29.831932773109244,
"grad_norm": 1.5266228914260864,
"learning_rate": 0.001,
"loss": 1.9186,
"step": 92300
},
{
"epoch": 29.86425339366516,
"grad_norm": 1.4845457077026367,
"learning_rate": 0.001,
"loss": 1.9399,
"step": 92400
},
{
"epoch": 29.896574014221073,
"grad_norm": 1.1934114694595337,
"learning_rate": 0.001,
"loss": 1.9314,
"step": 92500
},
{
"epoch": 29.928894634776988,
"grad_norm": 1.3535517454147339,
"learning_rate": 0.001,
"loss": 1.9399,
"step": 92600
},
{
"epoch": 29.961215255332903,
"grad_norm": 1.230025291442871,
"learning_rate": 0.001,
"loss": 1.9475,
"step": 92700
},
{
"epoch": 29.993535875888817,
"grad_norm": 1.4865684509277344,
"learning_rate": 0.001,
"loss": 1.9565,
"step": 92800
},
{
"epoch": 30.02585649644473,
"grad_norm": 1.4620200395584106,
"learning_rate": 0.001,
"loss": 1.828,
"step": 92900
},
{
"epoch": 30.058177117000646,
"grad_norm": 3.006803512573242,
"learning_rate": 0.001,
"loss": 1.7962,
"step": 93000
},
{
"epoch": 30.09049773755656,
"grad_norm": 1.6281250715255737,
"learning_rate": 0.001,
"loss": 1.8051,
"step": 93100
},
{
"epoch": 30.122818358112475,
"grad_norm": 1.3403794765472412,
"learning_rate": 0.001,
"loss": 1.8096,
"step": 93200
},
{
"epoch": 30.15513897866839,
"grad_norm": 1.3069578409194946,
"learning_rate": 0.001,
"loss": 1.8239,
"step": 93300
},
{
"epoch": 30.187459599224304,
"grad_norm": 1.467483401298523,
"learning_rate": 0.001,
"loss": 1.8257,
"step": 93400
},
{
"epoch": 30.21978021978022,
"grad_norm": 3.415764570236206,
"learning_rate": 0.001,
"loss": 1.8274,
"step": 93500
},
{
"epoch": 30.252100840336134,
"grad_norm": 2.0394747257232666,
"learning_rate": 0.001,
"loss": 1.8256,
"step": 93600
},
{
"epoch": 30.284421460892048,
"grad_norm": 1.498351812362671,
"learning_rate": 0.001,
"loss": 1.855,
"step": 93700
},
{
"epoch": 30.316742081447963,
"grad_norm": 1.360203742980957,
"learning_rate": 0.001,
"loss": 1.8346,
"step": 93800
},
{
"epoch": 30.349062702003877,
"grad_norm": 1.4011281728744507,
"learning_rate": 0.001,
"loss": 1.8497,
"step": 93900
},
{
"epoch": 30.381383322559792,
"grad_norm": 1.6812119483947754,
"learning_rate": 0.001,
"loss": 1.8551,
"step": 94000
},
{
"epoch": 30.413703943115706,
"grad_norm": 1.4505479335784912,
"learning_rate": 0.001,
"loss": 1.8439,
"step": 94100
},
{
"epoch": 30.44602456367162,
"grad_norm": 1.6102886199951172,
"learning_rate": 0.001,
"loss": 1.8579,
"step": 94200
},
{
"epoch": 30.478345184227535,
"grad_norm": 1.5858819484710693,
"learning_rate": 0.001,
"loss": 1.8701,
"step": 94300
},
{
"epoch": 30.51066580478345,
"grad_norm": 2.008108139038086,
"learning_rate": 0.001,
"loss": 1.8719,
"step": 94400
},
{
"epoch": 30.542986425339368,
"grad_norm": 1.247879147529602,
"learning_rate": 0.001,
"loss": 1.8679,
"step": 94500
},
{
"epoch": 30.575307045895283,
"grad_norm": 1.4479765892028809,
"learning_rate": 0.001,
"loss": 1.8821,
"step": 94600
},
{
"epoch": 30.607627666451197,
"grad_norm": 1.325579047203064,
"learning_rate": 0.001,
"loss": 1.8679,
"step": 94700
},
{
"epoch": 30.639948287007112,
"grad_norm": 1.6809526681900024,
"learning_rate": 0.001,
"loss": 1.8784,
"step": 94800
},
{
"epoch": 30.672268907563026,
"grad_norm": 1.5918498039245605,
"learning_rate": 0.001,
"loss": 1.8975,
"step": 94900
},
{
"epoch": 30.70458952811894,
"grad_norm": 1.6501222848892212,
"learning_rate": 0.001,
"loss": 1.8868,
"step": 95000
},
{
"epoch": 30.736910148674855,
"grad_norm": 2.2188880443573,
"learning_rate": 0.001,
"loss": 1.8769,
"step": 95100
},
{
"epoch": 30.76923076923077,
"grad_norm": 1.1673107147216797,
"learning_rate": 0.001,
"loss": 1.9076,
"step": 95200
},
{
"epoch": 30.801551389786685,
"grad_norm": 1.7042407989501953,
"learning_rate": 0.001,
"loss": 1.8965,
"step": 95300
},
{
"epoch": 30.8338720103426,
"grad_norm": 1.224590539932251,
"learning_rate": 0.001,
"loss": 1.8954,
"step": 95400
},
{
"epoch": 30.866192630898514,
"grad_norm": 1.4763602018356323,
"learning_rate": 0.001,
"loss": 1.9105,
"step": 95500
},
{
"epoch": 30.89851325145443,
"grad_norm": 1.6836724281311035,
"learning_rate": 0.001,
"loss": 1.8827,
"step": 95600
},
{
"epoch": 30.930833872010343,
"grad_norm": 1.3475334644317627,
"learning_rate": 0.001,
"loss": 1.9173,
"step": 95700
},
{
"epoch": 30.963154492566257,
"grad_norm": 1.347589373588562,
"learning_rate": 0.001,
"loss": 1.889,
"step": 95800
},
{
"epoch": 30.995475113122172,
"grad_norm": 1.473758339881897,
"learning_rate": 0.001,
"loss": 1.8965,
"step": 95900
},
{
"epoch": 31.027795733678087,
"grad_norm": 1.728955864906311,
"learning_rate": 0.001,
"loss": 1.8083,
"step": 96000
},
{
"epoch": 31.060116354234,
"grad_norm": 1.5232839584350586,
"learning_rate": 0.001,
"loss": 1.7555,
"step": 96100
},
{
"epoch": 31.092436974789916,
"grad_norm": 1.8657755851745605,
"learning_rate": 0.001,
"loss": 1.7685,
"step": 96200
},
{
"epoch": 31.12475759534583,
"grad_norm": 1.4750165939331055,
"learning_rate": 0.001,
"loss": 1.7735,
"step": 96300
},
{
"epoch": 31.157078215901745,
"grad_norm": 1.5432738065719604,
"learning_rate": 0.001,
"loss": 1.7882,
"step": 96400
},
{
"epoch": 31.18939883645766,
"grad_norm": 1.422799825668335,
"learning_rate": 0.001,
"loss": 1.7952,
"step": 96500
},
{
"epoch": 31.221719457013574,
"grad_norm": 1.6849409341812134,
"learning_rate": 0.001,
"loss": 1.784,
"step": 96600
},
{
"epoch": 31.25404007756949,
"grad_norm": 1.4621268510818481,
"learning_rate": 0.001,
"loss": 1.788,
"step": 96700
},
{
"epoch": 31.286360698125403,
"grad_norm": 1.3027772903442383,
"learning_rate": 0.001,
"loss": 1.8025,
"step": 96800
},
{
"epoch": 31.318681318681318,
"grad_norm": 1.5838264226913452,
"learning_rate": 0.001,
"loss": 1.8274,
"step": 96900
},
{
"epoch": 31.351001939237232,
"grad_norm": 1.368515968322754,
"learning_rate": 0.001,
"loss": 1.8323,
"step": 97000
},
{
"epoch": 31.383322559793147,
"grad_norm": 1.6732114553451538,
"learning_rate": 0.001,
"loss": 1.8276,
"step": 97100
},
{
"epoch": 31.41564318034906,
"grad_norm": 1.4694806337356567,
"learning_rate": 0.001,
"loss": 1.8157,
"step": 97200
},
{
"epoch": 31.447963800904976,
"grad_norm": 2.3622124195098877,
"learning_rate": 0.001,
"loss": 1.8327,
"step": 97300
},
{
"epoch": 31.48028442146089,
"grad_norm": 1.6618635654449463,
"learning_rate": 0.001,
"loss": 1.8298,
"step": 97400
},
{
"epoch": 31.51260504201681,
"grad_norm": 1.743264079093933,
"learning_rate": 0.001,
"loss": 1.8448,
"step": 97500
},
{
"epoch": 31.544925662572723,
"grad_norm": 1.2837010622024536,
"learning_rate": 0.001,
"loss": 1.8467,
"step": 97600
},
{
"epoch": 31.577246283128638,
"grad_norm": 1.7191213369369507,
"learning_rate": 0.001,
"loss": 1.8323,
"step": 97700
},
{
"epoch": 31.609566903684552,
"grad_norm": 2.304013729095459,
"learning_rate": 0.001,
"loss": 1.8629,
"step": 97800
},
{
"epoch": 31.641887524240467,
"grad_norm": 1.6232678890228271,
"learning_rate": 0.001,
"loss": 1.8508,
"step": 97900
},
{
"epoch": 31.67420814479638,
"grad_norm": 2.126199722290039,
"learning_rate": 0.001,
"loss": 1.858,
"step": 98000
},
{
"epoch": 31.706528765352296,
"grad_norm": 1.7926188707351685,
"learning_rate": 0.001,
"loss": 1.8507,
"step": 98100
},
{
"epoch": 31.73884938590821,
"grad_norm": 1.4954445362091064,
"learning_rate": 0.001,
"loss": 1.8566,
"step": 98200
},
{
"epoch": 31.771170006464125,
"grad_norm": 1.5035561323165894,
"learning_rate": 0.001,
"loss": 1.8696,
"step": 98300
},
{
"epoch": 31.80349062702004,
"grad_norm": 1.323290467262268,
"learning_rate": 0.001,
"loss": 1.8866,
"step": 98400
},
{
"epoch": 31.835811247575954,
"grad_norm": 1.6078685522079468,
"learning_rate": 0.001,
"loss": 1.872,
"step": 98500
},
{
"epoch": 31.86813186813187,
"grad_norm": 1.5674740076065063,
"learning_rate": 0.001,
"loss": 1.8907,
"step": 98600
},
{
"epoch": 31.900452488687783,
"grad_norm": 1.6643004417419434,
"learning_rate": 0.001,
"loss": 1.8766,
"step": 98700
},
{
"epoch": 31.932773109243698,
"grad_norm": 1.5275764465332031,
"learning_rate": 0.001,
"loss": 1.8751,
"step": 98800
},
{
"epoch": 31.965093729799612,
"grad_norm": 1.471692681312561,
"learning_rate": 0.001,
"loss": 1.8686,
"step": 98900
},
{
"epoch": 31.997414350355527,
"grad_norm": 1.378650188446045,
"learning_rate": 0.001,
"loss": 1.8815,
"step": 99000
},
{
"epoch": 32.02973497091144,
"grad_norm": 1.5829144716262817,
"learning_rate": 0.001,
"loss": 1.7734,
"step": 99100
},
{
"epoch": 32.062055591467356,
"grad_norm": 1.8891956806182861,
"learning_rate": 0.001,
"loss": 1.7384,
"step": 99200
},
{
"epoch": 32.09437621202327,
"grad_norm": 1.706789493560791,
"learning_rate": 0.001,
"loss": 1.7708,
"step": 99300
},
{
"epoch": 32.126696832579185,
"grad_norm": 1.6363348960876465,
"learning_rate": 0.001,
"loss": 1.751,
"step": 99400
},
{
"epoch": 32.1590174531351,
"grad_norm": 1.6457102298736572,
"learning_rate": 0.001,
"loss": 1.765,
"step": 99500
},
{
"epoch": 32.191338073691014,
"grad_norm": 1.5266406536102295,
"learning_rate": 0.001,
"loss": 1.7812,
"step": 99600
},
{
"epoch": 32.22365869424693,
"grad_norm": 1.9714754819869995,
"learning_rate": 0.001,
"loss": 1.7729,
"step": 99700
},
{
"epoch": 32.25597931480284,
"grad_norm": 1.6170477867126465,
"learning_rate": 0.001,
"loss": 1.7729,
"step": 99800
},
{
"epoch": 32.28829993535876,
"grad_norm": 1.3753221035003662,
"learning_rate": 0.001,
"loss": 1.7805,
"step": 99900
},
{
"epoch": 32.32062055591467,
"grad_norm": 2.1753334999084473,
"learning_rate": 0.001,
"loss": 1.7737,
"step": 100000
},
{
"epoch": 32.35294117647059,
"grad_norm": 1.8860663175582886,
"learning_rate": 0.001,
"loss": 1.7906,
"step": 100100
},
{
"epoch": 32.3852617970265,
"grad_norm": 2.0654399394989014,
"learning_rate": 0.001,
"loss": 1.7885,
"step": 100200
},
{
"epoch": 32.417582417582416,
"grad_norm": 1.7325553894042969,
"learning_rate": 0.001,
"loss": 1.8063,
"step": 100300
},
{
"epoch": 32.44990303813833,
"grad_norm": 1.588712215423584,
"learning_rate": 0.001,
"loss": 1.8004,
"step": 100400
},
{
"epoch": 32.482223658694245,
"grad_norm": 1.6810243129730225,
"learning_rate": 0.001,
"loss": 1.8152,
"step": 100500
},
{
"epoch": 32.51454427925016,
"grad_norm": 1.6769487857818604,
"learning_rate": 0.001,
"loss": 1.8212,
"step": 100600
},
{
"epoch": 32.546864899806074,
"grad_norm": 1.9445384740829468,
"learning_rate": 0.001,
"loss": 1.7917,
"step": 100700
},
{
"epoch": 32.57918552036199,
"grad_norm": 1.6605241298675537,
"learning_rate": 0.001,
"loss": 1.8362,
"step": 100800
},
{
"epoch": 32.6115061409179,
"grad_norm": 2.058520793914795,
"learning_rate": 0.001,
"loss": 1.8341,
"step": 100900
},
{
"epoch": 32.64382676147382,
"grad_norm": 1.3105531930923462,
"learning_rate": 0.001,
"loss": 1.8431,
"step": 101000
},
{
"epoch": 32.67614738202973,
"grad_norm": 1.626055359840393,
"learning_rate": 0.001,
"loss": 1.824,
"step": 101100
},
{
"epoch": 32.70846800258565,
"grad_norm": 1.5629281997680664,
"learning_rate": 0.001,
"loss": 1.8388,
"step": 101200
},
{
"epoch": 32.74078862314156,
"grad_norm": 1.8073807954788208,
"learning_rate": 0.001,
"loss": 1.8373,
"step": 101300
},
{
"epoch": 32.773109243697476,
"grad_norm": 2.0889344215393066,
"learning_rate": 0.001,
"loss": 1.8304,
"step": 101400
},
{
"epoch": 32.80542986425339,
"grad_norm": 1.771985650062561,
"learning_rate": 0.001,
"loss": 1.8436,
"step": 101500
},
{
"epoch": 32.837750484809305,
"grad_norm": 1.631714940071106,
"learning_rate": 0.001,
"loss": 1.8502,
"step": 101600
},
{
"epoch": 32.87007110536522,
"grad_norm": 2.003098487854004,
"learning_rate": 0.001,
"loss": 1.8608,
"step": 101700
},
{
"epoch": 32.902391725921134,
"grad_norm": 1.7163927555084229,
"learning_rate": 0.001,
"loss": 1.8409,
"step": 101800
},
{
"epoch": 32.93471234647705,
"grad_norm": 1.5179773569107056,
"learning_rate": 0.001,
"loss": 1.836,
"step": 101900
},
{
"epoch": 32.967032967032964,
"grad_norm": 1.826611042022705,
"learning_rate": 0.001,
"loss": 1.8544,
"step": 102000
},
{
"epoch": 32.999353587588885,
"grad_norm": 2.0512094497680664,
"learning_rate": 0.001,
"loss": 1.8587,
"step": 102100
},
{
"epoch": 33.0316742081448,
"grad_norm": 2.0743322372436523,
"learning_rate": 0.001,
"loss": 1.6994,
"step": 102200
},
{
"epoch": 33.063994828700714,
"grad_norm": 2.1216299533843994,
"learning_rate": 0.001,
"loss": 1.7167,
"step": 102300
},
{
"epoch": 33.09631544925663,
"grad_norm": 2.0061864852905273,
"learning_rate": 0.001,
"loss": 1.7269,
"step": 102400
},
{
"epoch": 33.12863606981254,
"grad_norm": 1.8516168594360352,
"learning_rate": 0.001,
"loss": 1.7471,
"step": 102500
},
{
"epoch": 33.16095669036846,
"grad_norm": 2.0900888442993164,
"learning_rate": 0.001,
"loss": 1.7444,
"step": 102600
},
{
"epoch": 33.19327731092437,
"grad_norm": 1.9788792133331299,
"learning_rate": 0.001,
"loss": 1.7557,
"step": 102700
},
{
"epoch": 33.22559793148029,
"grad_norm": 2.034575939178467,
"learning_rate": 0.001,
"loss": 1.7563,
"step": 102800
},
{
"epoch": 33.2579185520362,
"grad_norm": 2.135305881500244,
"learning_rate": 0.001,
"loss": 1.7338,
"step": 102900
},
{
"epoch": 33.290239172592116,
"grad_norm": 1.8343353271484375,
"learning_rate": 0.001,
"loss": 1.7662,
"step": 103000
},
{
"epoch": 33.32255979314803,
"grad_norm": 2.277712106704712,
"learning_rate": 0.001,
"loss": 1.7439,
"step": 103100
},
{
"epoch": 33.354880413703945,
"grad_norm": 1.983909010887146,
"learning_rate": 0.001,
"loss": 1.7626,
"step": 103200
},
{
"epoch": 33.38720103425986,
"grad_norm": 1.6377862691879272,
"learning_rate": 0.001,
"loss": 1.7789,
"step": 103300
},
{
"epoch": 33.419521654815775,
"grad_norm": 1.8458659648895264,
"learning_rate": 0.001,
"loss": 1.7728,
"step": 103400
},
{
"epoch": 33.45184227537169,
"grad_norm": 1.9960947036743164,
"learning_rate": 0.001,
"loss": 1.7659,
"step": 103500
},
{
"epoch": 33.484162895927604,
"grad_norm": 2.0840041637420654,
"learning_rate": 0.001,
"loss": 1.763,
"step": 103600
},
{
"epoch": 33.51648351648352,
"grad_norm": 2.215972661972046,
"learning_rate": 0.001,
"loss": 1.7933,
"step": 103700
},
{
"epoch": 33.54880413703943,
"grad_norm": 1.8777416944503784,
"learning_rate": 0.001,
"loss": 1.7918,
"step": 103800
},
{
"epoch": 33.58112475759535,
"grad_norm": 1.7286779880523682,
"learning_rate": 0.001,
"loss": 1.792,
"step": 103900
},
{
"epoch": 33.61344537815126,
"grad_norm": 2.0685818195343018,
"learning_rate": 0.001,
"loss": 1.7941,
"step": 104000
},
{
"epoch": 33.645765998707176,
"grad_norm": 1.928328037261963,
"learning_rate": 0.001,
"loss": 1.8152,
"step": 104100
},
{
"epoch": 33.67808661926309,
"grad_norm": 1.7642704248428345,
"learning_rate": 0.001,
"loss": 1.8128,
"step": 104200
},
{
"epoch": 33.710407239819006,
"grad_norm": 2.2432053089141846,
"learning_rate": 0.001,
"loss": 1.8034,
"step": 104300
},
{
"epoch": 33.74272786037492,
"grad_norm": 2.012679100036621,
"learning_rate": 0.001,
"loss": 1.8132,
"step": 104400
},
{
"epoch": 33.775048480930835,
"grad_norm": 2.067655086517334,
"learning_rate": 0.001,
"loss": 1.8267,
"step": 104500
},
{
"epoch": 33.80736910148675,
"grad_norm": 1.9166897535324097,
"learning_rate": 0.001,
"loss": 1.8191,
"step": 104600
},
{
"epoch": 33.839689722042664,
"grad_norm": 1.8261538743972778,
"learning_rate": 0.001,
"loss": 1.8396,
"step": 104700
},
{
"epoch": 33.87201034259858,
"grad_norm": 1.6121453046798706,
"learning_rate": 0.001,
"loss": 1.8192,
"step": 104800
},
{
"epoch": 33.90433096315449,
"grad_norm": 1.7790288925170898,
"learning_rate": 0.001,
"loss": 1.8441,
"step": 104900
},
{
"epoch": 33.93665158371041,
"grad_norm": 2.033315658569336,
"learning_rate": 0.001,
"loss": 1.8407,
"step": 105000
},
{
"epoch": 33.96897220426632,
"grad_norm": 1.9757274389266968,
"learning_rate": 0.001,
"loss": 1.8312,
"step": 105100
},
{
"epoch": 34.00129282482224,
"grad_norm": 1.8282935619354248,
"learning_rate": 0.001,
"loss": 1.824,
"step": 105200
},
{
"epoch": 34.03361344537815,
"grad_norm": 1.3721950054168701,
"learning_rate": 0.001,
"loss": 1.6864,
"step": 105300
},
{
"epoch": 34.065934065934066,
"grad_norm": 2.061631679534912,
"learning_rate": 0.001,
"loss": 1.6903,
"step": 105400
},
{
"epoch": 34.09825468648998,
"grad_norm": 2.0354113578796387,
"learning_rate": 0.001,
"loss": 1.708,
"step": 105500
},
{
"epoch": 34.130575307045895,
"grad_norm": 1.6836439371109009,
"learning_rate": 0.001,
"loss": 1.7179,
"step": 105600
},
{
"epoch": 34.16289592760181,
"grad_norm": 2.024641513824463,
"learning_rate": 0.001,
"loss": 1.7056,
"step": 105700
},
{
"epoch": 34.195216548157724,
"grad_norm": 1.5095860958099365,
"learning_rate": 0.001,
"loss": 1.705,
"step": 105800
},
{
"epoch": 34.22753716871364,
"grad_norm": 2.405456781387329,
"learning_rate": 0.001,
"loss": 1.7241,
"step": 105900
},
{
"epoch": 34.25985778926955,
"grad_norm": 1.871866226196289,
"learning_rate": 0.001,
"loss": 1.7392,
"step": 106000
},
{
"epoch": 34.29217840982547,
"grad_norm": 2.0286736488342285,
"learning_rate": 0.001,
"loss": 1.7312,
"step": 106100
},
{
"epoch": 34.32449903038138,
"grad_norm": 1.6875593662261963,
"learning_rate": 0.001,
"loss": 1.747,
"step": 106200
},
{
"epoch": 34.3568196509373,
"grad_norm": 1.6652581691741943,
"learning_rate": 0.001,
"loss": 1.7525,
"step": 106300
},
{
"epoch": 34.38914027149321,
"grad_norm": 1.8134232759475708,
"learning_rate": 0.001,
"loss": 1.7527,
"step": 106400
},
{
"epoch": 34.421460892049126,
"grad_norm": 1.7698140144348145,
"learning_rate": 0.001,
"loss": 1.762,
"step": 106500
},
{
"epoch": 34.45378151260504,
"grad_norm": 1.5082416534423828,
"learning_rate": 0.001,
"loss": 1.7687,
"step": 106600
},
{
"epoch": 34.486102133160955,
"grad_norm": 1.5613726377487183,
"learning_rate": 0.001,
"loss": 1.7655,
"step": 106700
},
{
"epoch": 34.51842275371687,
"grad_norm": 1.7272530794143677,
"learning_rate": 0.001,
"loss": 1.7483,
"step": 106800
},
{
"epoch": 34.550743374272784,
"grad_norm": 1.644972801208496,
"learning_rate": 0.001,
"loss": 1.7832,
"step": 106900
},
{
"epoch": 34.5830639948287,
"grad_norm": 1.644237756729126,
"learning_rate": 0.001,
"loss": 1.7609,
"step": 107000
},
{
"epoch": 34.61538461538461,
"grad_norm": 1.6201183795928955,
"learning_rate": 0.001,
"loss": 1.772,
"step": 107100
},
{
"epoch": 34.64770523594053,
"grad_norm": 1.9227070808410645,
"learning_rate": 0.001,
"loss": 1.7954,
"step": 107200
},
{
"epoch": 34.68002585649644,
"grad_norm": 1.4974156618118286,
"learning_rate": 0.001,
"loss": 1.7881,
"step": 107300
},
{
"epoch": 34.71234647705236,
"grad_norm": 1.9709665775299072,
"learning_rate": 0.001,
"loss": 1.7943,
"step": 107400
},
{
"epoch": 34.74466709760827,
"grad_norm": 1.6651779413223267,
"learning_rate": 0.001,
"loss": 1.803,
"step": 107500
},
{
"epoch": 34.776987718164186,
"grad_norm": 1.9187260866165161,
"learning_rate": 0.001,
"loss": 1.7989,
"step": 107600
},
{
"epoch": 34.8093083387201,
"grad_norm": 1.8573428392410278,
"learning_rate": 0.001,
"loss": 1.8145,
"step": 107700
},
{
"epoch": 34.841628959276015,
"grad_norm": 1.4682703018188477,
"learning_rate": 0.001,
"loss": 1.8096,
"step": 107800
},
{
"epoch": 34.87394957983193,
"grad_norm": 2.2076940536499023,
"learning_rate": 0.001,
"loss": 1.8052,
"step": 107900
},
{
"epoch": 34.906270200387844,
"grad_norm": 1.7180118560791016,
"learning_rate": 0.001,
"loss": 1.8129,
"step": 108000
},
{
"epoch": 34.93859082094376,
"grad_norm": 1.665969729423523,
"learning_rate": 0.001,
"loss": 1.8067,
"step": 108100
},
{
"epoch": 34.97091144149967,
"grad_norm": 5.9310622215271,
"learning_rate": 0.001,
"loss": 1.8212,
"step": 108200
},
{
"epoch": 35.003232062055595,
"grad_norm": 1.5211377143859863,
"learning_rate": 0.001,
"loss": 1.8044,
"step": 108300
},
{
"epoch": 35.03555268261151,
"grad_norm": 1.2844172716140747,
"learning_rate": 0.001,
"loss": 1.6721,
"step": 108400
},
{
"epoch": 35.067873303167424,
"grad_norm": 1.5357627868652344,
"learning_rate": 0.001,
"loss": 1.6904,
"step": 108500
},
{
"epoch": 35.10019392372334,
"grad_norm": 1.5204764604568481,
"learning_rate": 0.001,
"loss": 1.6687,
"step": 108600
},
{
"epoch": 35.13251454427925,
"grad_norm": 2.441347599029541,
"learning_rate": 0.001,
"loss": 1.675,
"step": 108700
},
{
"epoch": 35.16483516483517,
"grad_norm": 1.9407317638397217,
"learning_rate": 0.001,
"loss": 1.7092,
"step": 108800
},
{
"epoch": 35.19715578539108,
"grad_norm": 1.4935519695281982,
"learning_rate": 0.001,
"loss": 1.7088,
"step": 108900
},
{
"epoch": 35.229476405947,
"grad_norm": 1.2903261184692383,
"learning_rate": 0.001,
"loss": 1.6767,
"step": 109000
},
{
"epoch": 35.26179702650291,
"grad_norm": 2.764295816421509,
"learning_rate": 0.001,
"loss": 1.7007,
"step": 109100
},
{
"epoch": 35.294117647058826,
"grad_norm": 1.714455008506775,
"learning_rate": 0.001,
"loss": 1.7017,
"step": 109200
},
{
"epoch": 35.32643826761474,
"grad_norm": 1.885419487953186,
"learning_rate": 0.001,
"loss": 1.7402,
"step": 109300
},
{
"epoch": 35.358758888170655,
"grad_norm": 1.5595377683639526,
"learning_rate": 0.001,
"loss": 1.7365,
"step": 109400
},
{
"epoch": 35.39107950872657,
"grad_norm": 1.3263633251190186,
"learning_rate": 0.001,
"loss": 1.7513,
"step": 109500
},
{
"epoch": 35.423400129282484,
"grad_norm": 1.623655080795288,
"learning_rate": 0.001,
"loss": 1.7408,
"step": 109600
},
{
"epoch": 35.4557207498384,
"grad_norm": 1.5252445936203003,
"learning_rate": 0.001,
"loss": 1.7426,
"step": 109700
},
{
"epoch": 35.48804137039431,
"grad_norm": 1.5424871444702148,
"learning_rate": 0.001,
"loss": 1.7424,
"step": 109800
},
{
"epoch": 35.52036199095023,
"grad_norm": 1.604275107383728,
"learning_rate": 0.001,
"loss": 1.7404,
"step": 109900
},
{
"epoch": 35.55268261150614,
"grad_norm": 1.3040121793746948,
"learning_rate": 0.001,
"loss": 1.7377,
"step": 110000
},
{
"epoch": 35.58500323206206,
"grad_norm": 1.4881088733673096,
"learning_rate": 0.001,
"loss": 1.7472,
"step": 110100
},
{
"epoch": 35.61732385261797,
"grad_norm": 1.2945785522460938,
"learning_rate": 0.001,
"loss": 1.7656,
"step": 110200
},
{
"epoch": 35.649644473173886,
"grad_norm": 1.5668212175369263,
"learning_rate": 0.001,
"loss": 1.7522,
"step": 110300
},
{
"epoch": 35.6819650937298,
"grad_norm": 1.6249139308929443,
"learning_rate": 0.001,
"loss": 1.7661,
"step": 110400
},
{
"epoch": 35.714285714285715,
"grad_norm": 1.5623595714569092,
"learning_rate": 0.001,
"loss": 1.7587,
"step": 110500
},
{
"epoch": 35.74660633484163,
"grad_norm": 1.6370724439620972,
"learning_rate": 0.001,
"loss": 1.7873,
"step": 110600
},
{
"epoch": 35.778926955397544,
"grad_norm": 1.387434959411621,
"learning_rate": 0.001,
"loss": 1.7579,
"step": 110700
},
{
"epoch": 35.81124757595346,
"grad_norm": 1.4653477668762207,
"learning_rate": 0.001,
"loss": 1.7723,
"step": 110800
},
{
"epoch": 35.84356819650937,
"grad_norm": 1.6415297985076904,
"learning_rate": 0.001,
"loss": 1.7719,
"step": 110900
},
{
"epoch": 35.87588881706529,
"grad_norm": 1.447180151939392,
"learning_rate": 0.001,
"loss": 1.7818,
"step": 111000
},
{
"epoch": 35.9082094376212,
"grad_norm": 1.4582403898239136,
"learning_rate": 0.001,
"loss": 1.7835,
"step": 111100
},
{
"epoch": 35.94053005817712,
"grad_norm": 1.6619702577590942,
"learning_rate": 0.001,
"loss": 1.7904,
"step": 111200
},
{
"epoch": 35.97285067873303,
"grad_norm": 1.6435052156448364,
"learning_rate": 0.001,
"loss": 1.7815,
"step": 111300
},
{
"epoch": 36.005171299288946,
"grad_norm": 1.2920403480529785,
"learning_rate": 0.001,
"loss": 1.7803,
"step": 111400
},
{
"epoch": 36.03749191984486,
"grad_norm": 1.1500316858291626,
"learning_rate": 0.001,
"loss": 1.6365,
"step": 111500
},
{
"epoch": 36.069812540400775,
"grad_norm": 1.4132637977600098,
"learning_rate": 0.001,
"loss": 1.6589,
"step": 111600
},
{
"epoch": 36.10213316095669,
"grad_norm": 1.4161231517791748,
"learning_rate": 0.001,
"loss": 1.6784,
"step": 111700
},
{
"epoch": 36.134453781512605,
"grad_norm": 1.4937621355056763,
"learning_rate": 0.001,
"loss": 1.6499,
"step": 111800
},
{
"epoch": 36.16677440206852,
"grad_norm": 1.2990803718566895,
"learning_rate": 0.001,
"loss": 1.6766,
"step": 111900
},
{
"epoch": 36.199095022624434,
"grad_norm": 1.636744737625122,
"learning_rate": 0.001,
"loss": 1.6621,
"step": 112000
},
{
"epoch": 36.23141564318035,
"grad_norm": 1.783856987953186,
"learning_rate": 0.001,
"loss": 1.6684,
"step": 112100
},
{
"epoch": 36.26373626373626,
"grad_norm": 1.3172340393066406,
"learning_rate": 0.001,
"loss": 1.6902,
"step": 112200
},
{
"epoch": 36.29605688429218,
"grad_norm": 1.2710336446762085,
"learning_rate": 0.001,
"loss": 1.6858,
"step": 112300
},
{
"epoch": 36.32837750484809,
"grad_norm": 2.070700168609619,
"learning_rate": 0.001,
"loss": 1.6842,
"step": 112400
},
{
"epoch": 36.36069812540401,
"grad_norm": 1.738664984703064,
"learning_rate": 0.001,
"loss": 1.7021,
"step": 112500
},
{
"epoch": 36.39301874595992,
"grad_norm": 1.4381351470947266,
"learning_rate": 0.001,
"loss": 1.7221,
"step": 112600
},
{
"epoch": 36.425339366515836,
"grad_norm": 1.3279963731765747,
"learning_rate": 0.001,
"loss": 1.7187,
"step": 112700
},
{
"epoch": 36.45765998707175,
"grad_norm": 1.7753669023513794,
"learning_rate": 0.001,
"loss": 1.7147,
"step": 112800
},
{
"epoch": 36.489980607627665,
"grad_norm": 2.194094657897949,
"learning_rate": 0.001,
"loss": 1.7264,
"step": 112900
},
{
"epoch": 36.52230122818358,
"grad_norm": 1.360891342163086,
"learning_rate": 0.001,
"loss": 1.7423,
"step": 113000
},
{
"epoch": 36.554621848739494,
"grad_norm": 1.8281059265136719,
"learning_rate": 0.001,
"loss": 1.7389,
"step": 113100
},
{
"epoch": 36.58694246929541,
"grad_norm": 7.225008964538574,
"learning_rate": 0.001,
"loss": 1.7317,
"step": 113200
},
{
"epoch": 36.61926308985132,
"grad_norm": 1.3751835823059082,
"learning_rate": 0.001,
"loss": 1.7178,
"step": 113300
},
{
"epoch": 36.65158371040724,
"grad_norm": 1.3303834199905396,
"learning_rate": 0.001,
"loss": 1.7376,
"step": 113400
},
{
"epoch": 36.68390433096315,
"grad_norm": 1.3303159475326538,
"learning_rate": 0.001,
"loss": 1.7494,
"step": 113500
},
{
"epoch": 36.71622495151907,
"grad_norm": 1.3926934003829956,
"learning_rate": 0.001,
"loss": 1.7545,
"step": 113600
},
{
"epoch": 36.74854557207498,
"grad_norm": 1.3599934577941895,
"learning_rate": 0.001,
"loss": 1.7438,
"step": 113700
},
{
"epoch": 36.780866192630896,
"grad_norm": 17.112966537475586,
"learning_rate": 0.001,
"loss": 1.7566,
"step": 113800
},
{
"epoch": 36.81318681318681,
"grad_norm": 1.1081054210662842,
"learning_rate": 0.001,
"loss": 1.7545,
"step": 113900
},
{
"epoch": 36.845507433742725,
"grad_norm": 1.49489426612854,
"learning_rate": 0.001,
"loss": 1.7553,
"step": 114000
},
{
"epoch": 36.87782805429864,
"grad_norm": 1.4801154136657715,
"learning_rate": 0.001,
"loss": 1.739,
"step": 114100
},
{
"epoch": 36.910148674854554,
"grad_norm": 1.4519503116607666,
"learning_rate": 0.001,
"loss": 1.7696,
"step": 114200
},
{
"epoch": 36.94246929541047,
"grad_norm": 1.424436092376709,
"learning_rate": 0.001,
"loss": 1.7624,
"step": 114300
},
{
"epoch": 36.97478991596638,
"grad_norm": 1.5529747009277344,
"learning_rate": 0.001,
"loss": 1.7481,
"step": 114400
},
{
"epoch": 37.007110536522305,
"grad_norm": 1.3554753065109253,
"learning_rate": 0.001,
"loss": 1.7531,
"step": 114500
},
{
"epoch": 37.03943115707822,
"grad_norm": 1.5860496759414673,
"learning_rate": 0.001,
"loss": 1.6318,
"step": 114600
},
{
"epoch": 37.071751777634134,
"grad_norm": 2.0900566577911377,
"learning_rate": 0.001,
"loss": 1.623,
"step": 114700
},
{
"epoch": 37.10407239819005,
"grad_norm": 1.452841877937317,
"learning_rate": 0.001,
"loss": 1.6301,
"step": 114800
},
{
"epoch": 37.13639301874596,
"grad_norm": 2.630765914916992,
"learning_rate": 0.001,
"loss": 1.6386,
"step": 114900
},
{
"epoch": 37.16871363930188,
"grad_norm": 1.6086002588272095,
"learning_rate": 0.001,
"loss": 1.6525,
"step": 115000
},
{
"epoch": 37.20103425985779,
"grad_norm": 1.5856311321258545,
"learning_rate": 0.001,
"loss": 1.6452,
"step": 115100
},
{
"epoch": 37.23335488041371,
"grad_norm": 1.3888436555862427,
"learning_rate": 0.001,
"loss": 1.6605,
"step": 115200
},
{
"epoch": 37.26567550096962,
"grad_norm": 1.6696892976760864,
"learning_rate": 0.001,
"loss": 1.6635,
"step": 115300
},
{
"epoch": 37.297996121525536,
"grad_norm": 1.3410964012145996,
"learning_rate": 0.001,
"loss": 1.6879,
"step": 115400
},
{
"epoch": 37.33031674208145,
"grad_norm": 1.9728986024856567,
"learning_rate": 0.001,
"loss": 1.6818,
"step": 115500
},
{
"epoch": 37.362637362637365,
"grad_norm": 1.2899627685546875,
"learning_rate": 0.001,
"loss": 1.6637,
"step": 115600
},
{
"epoch": 37.39495798319328,
"grad_norm": 1.4178438186645508,
"learning_rate": 0.001,
"loss": 1.6829,
"step": 115700
},
{
"epoch": 37.427278603749194,
"grad_norm": 1.2930041551589966,
"learning_rate": 0.001,
"loss": 1.6989,
"step": 115800
},
{
"epoch": 37.45959922430511,
"grad_norm": 1.5607823133468628,
"learning_rate": 0.001,
"loss": 1.6994,
"step": 115900
},
{
"epoch": 37.49191984486102,
"grad_norm": 1.3192517757415771,
"learning_rate": 0.001,
"loss": 1.7011,
"step": 116000
},
{
"epoch": 37.52424046541694,
"grad_norm": 1.385068655014038,
"learning_rate": 0.001,
"loss": 1.715,
"step": 116100
},
{
"epoch": 37.55656108597285,
"grad_norm": 1.4092013835906982,
"learning_rate": 0.001,
"loss": 1.7085,
"step": 116200
},
{
"epoch": 37.58888170652877,
"grad_norm": 1.5187952518463135,
"learning_rate": 0.001,
"loss": 1.6988,
"step": 116300
},
{
"epoch": 37.62120232708468,
"grad_norm": 1.5603824853897095,
"learning_rate": 0.001,
"loss": 1.6981,
"step": 116400
},
{
"epoch": 37.653522947640596,
"grad_norm": 1.3722445964813232,
"learning_rate": 0.001,
"loss": 1.7052,
"step": 116500
},
{
"epoch": 37.68584356819651,
"grad_norm": 1.2925854921340942,
"learning_rate": 0.001,
"loss": 1.7185,
"step": 116600
},
{
"epoch": 37.718164188752425,
"grad_norm": 1.8050371408462524,
"learning_rate": 0.001,
"loss": 1.7142,
"step": 116700
},
{
"epoch": 37.75048480930834,
"grad_norm": 1.6540824174880981,
"learning_rate": 0.001,
"loss": 1.7186,
"step": 116800
},
{
"epoch": 37.782805429864254,
"grad_norm": 1.7223883867263794,
"learning_rate": 0.001,
"loss": 1.7336,
"step": 116900
},
{
"epoch": 37.81512605042017,
"grad_norm": 1.4943795204162598,
"learning_rate": 0.001,
"loss": 1.7443,
"step": 117000
},
{
"epoch": 37.84744667097608,
"grad_norm": 1.3059314489364624,
"learning_rate": 0.001,
"loss": 1.7356,
"step": 117100
},
{
"epoch": 37.879767291532,
"grad_norm": 1.4005022048950195,
"learning_rate": 0.001,
"loss": 1.7175,
"step": 117200
},
{
"epoch": 37.91208791208791,
"grad_norm": 1.3799285888671875,
"learning_rate": 0.001,
"loss": 1.7465,
"step": 117300
},
{
"epoch": 37.94440853264383,
"grad_norm": 1.4844553470611572,
"learning_rate": 0.001,
"loss": 1.7428,
"step": 117400
},
{
"epoch": 37.97672915319974,
"grad_norm": 1.2771897315979004,
"learning_rate": 0.001,
"loss": 1.746,
"step": 117500
},
{
"epoch": 38.009049773755656,
"grad_norm": 1.4916632175445557,
"learning_rate": 0.001,
"loss": 1.7074,
"step": 117600
},
{
"epoch": 38.04137039431157,
"grad_norm": 1.445297360420227,
"learning_rate": 0.001,
"loss": 1.5948,
"step": 117700
},
{
"epoch": 38.073691014867485,
"grad_norm": 1.513931155204773,
"learning_rate": 0.001,
"loss": 1.61,
"step": 117800
},
{
"epoch": 38.1060116354234,
"grad_norm": 1.5125148296356201,
"learning_rate": 0.001,
"loss": 1.6239,
"step": 117900
},
{
"epoch": 38.138332255979314,
"grad_norm": 1.384259819984436,
"learning_rate": 0.001,
"loss": 1.6232,
"step": 118000
},
{
"epoch": 38.17065287653523,
"grad_norm": 1.5894758701324463,
"learning_rate": 0.001,
"loss": 1.6157,
"step": 118100
},
{
"epoch": 38.20297349709114,
"grad_norm": 1.6156755685806274,
"learning_rate": 0.001,
"loss": 1.638,
"step": 118200
},
{
"epoch": 38.23529411764706,
"grad_norm": 1.5429646968841553,
"learning_rate": 0.001,
"loss": 1.6424,
"step": 118300
},
{
"epoch": 38.26761473820297,
"grad_norm": 2.129305839538574,
"learning_rate": 0.001,
"loss": 1.6344,
"step": 118400
},
{
"epoch": 38.29993535875889,
"grad_norm": 1.6252347230911255,
"learning_rate": 0.001,
"loss": 1.6621,
"step": 118500
},
{
"epoch": 38.3322559793148,
"grad_norm": 1.6291354894638062,
"learning_rate": 0.001,
"loss": 1.6543,
"step": 118600
},
{
"epoch": 38.364576599870716,
"grad_norm": 1.9039782285690308,
"learning_rate": 0.001,
"loss": 1.6766,
"step": 118700
},
{
"epoch": 38.39689722042663,
"grad_norm": 1.4002729654312134,
"learning_rate": 0.001,
"loss": 1.6732,
"step": 118800
},
{
"epoch": 38.429217840982545,
"grad_norm": 1.6930391788482666,
"learning_rate": 0.001,
"loss": 1.6552,
"step": 118900
},
{
"epoch": 38.46153846153846,
"grad_norm": 1.3052587509155273,
"learning_rate": 0.001,
"loss": 1.6699,
"step": 119000
},
{
"epoch": 38.493859082094374,
"grad_norm": 1.9328466653823853,
"learning_rate": 0.001,
"loss": 1.6617,
"step": 119100
},
{
"epoch": 38.52617970265029,
"grad_norm": 1.5333701372146606,
"learning_rate": 0.001,
"loss": 1.6919,
"step": 119200
},
{
"epoch": 38.558500323206204,
"grad_norm": 1.3492674827575684,
"learning_rate": 0.001,
"loss": 1.6642,
"step": 119300
},
{
"epoch": 38.59082094376212,
"grad_norm": 1.2537891864776611,
"learning_rate": 0.001,
"loss": 1.6928,
"step": 119400
},
{
"epoch": 38.62314156431803,
"grad_norm": 1.5430333614349365,
"learning_rate": 0.001,
"loss": 1.6925,
"step": 119500
},
{
"epoch": 38.65546218487395,
"grad_norm": 1.3039238452911377,
"learning_rate": 0.001,
"loss": 1.6964,
"step": 119600
},
{
"epoch": 38.68778280542986,
"grad_norm": 1.9416965246200562,
"learning_rate": 0.001,
"loss": 1.6892,
"step": 119700
},
{
"epoch": 38.720103425985776,
"grad_norm": 1.5908054113388062,
"learning_rate": 0.001,
"loss": 1.6994,
"step": 119800
},
{
"epoch": 38.75242404654169,
"grad_norm": 1.4828917980194092,
"learning_rate": 0.001,
"loss": 1.678,
"step": 119900
},
{
"epoch": 38.784744667097605,
"grad_norm": 1.236195683479309,
"learning_rate": 0.001,
"loss": 1.7161,
"step": 120000
},
{
"epoch": 38.81706528765352,
"grad_norm": 1.6175165176391602,
"learning_rate": 0.001,
"loss": 1.7076,
"step": 120100
},
{
"epoch": 38.849385908209435,
"grad_norm": 1.4187142848968506,
"learning_rate": 0.001,
"loss": 1.7213,
"step": 120200
},
{
"epoch": 38.88170652876535,
"grad_norm": 1.615424633026123,
"learning_rate": 0.001,
"loss": 1.6994,
"step": 120300
},
{
"epoch": 38.914027149321264,
"grad_norm": 1.3184314966201782,
"learning_rate": 0.001,
"loss": 1.7017,
"step": 120400
},
{
"epoch": 38.94634776987718,
"grad_norm": 2.0182273387908936,
"learning_rate": 0.001,
"loss": 1.7112,
"step": 120500
},
{
"epoch": 38.97866839043309,
"grad_norm": 1.53033447265625,
"learning_rate": 0.001,
"loss": 1.7221,
"step": 120600
},
{
"epoch": 39.010989010989015,
"grad_norm": 1.5735342502593994,
"learning_rate": 0.001,
"loss": 1.6814,
"step": 120700
},
{
"epoch": 39.04330963154493,
"grad_norm": 1.396195411682129,
"learning_rate": 0.001,
"loss": 1.5669,
"step": 120800
},
{
"epoch": 39.075630252100844,
"grad_norm": 1.510571002960205,
"learning_rate": 0.001,
"loss": 1.5803,
"step": 120900
},
{
"epoch": 39.10795087265676,
"grad_norm": 1.533634066581726,
"learning_rate": 0.001,
"loss": 1.5828,
"step": 121000
},
{
"epoch": 39.14027149321267,
"grad_norm": 1.4806658029556274,
"learning_rate": 0.001,
"loss": 1.5861,
"step": 121100
},
{
"epoch": 39.17259211376859,
"grad_norm": 1.7448620796203613,
"learning_rate": 0.001,
"loss": 1.6208,
"step": 121200
},
{
"epoch": 39.2049127343245,
"grad_norm": 1.800340175628662,
"learning_rate": 0.001,
"loss": 1.6287,
"step": 121300
},
{
"epoch": 39.237233354880416,
"grad_norm": 1.4154140949249268,
"learning_rate": 0.001,
"loss": 1.6252,
"step": 121400
},
{
"epoch": 39.26955397543633,
"grad_norm": 1.729437232017517,
"learning_rate": 0.001,
"loss": 1.6358,
"step": 121500
},
{
"epoch": 39.301874595992246,
"grad_norm": 1.3439644575119019,
"learning_rate": 0.001,
"loss": 1.6278,
"step": 121600
},
{
"epoch": 39.33419521654816,
"grad_norm": 2.249131441116333,
"learning_rate": 0.001,
"loss": 1.6345,
"step": 121700
},
{
"epoch": 39.366515837104075,
"grad_norm": 1.8861782550811768,
"learning_rate": 0.001,
"loss": 1.6142,
"step": 121800
},
{
"epoch": 39.39883645765999,
"grad_norm": 1.7923866510391235,
"learning_rate": 0.001,
"loss": 1.6408,
"step": 121900
},
{
"epoch": 39.431157078215904,
"grad_norm": 1.8089905977249146,
"learning_rate": 0.001,
"loss": 1.625,
"step": 122000
},
{
"epoch": 39.46347769877182,
"grad_norm": 1.6108099222183228,
"learning_rate": 0.001,
"loss": 1.6471,
"step": 122100
},
{
"epoch": 39.49579831932773,
"grad_norm": 1.8288142681121826,
"learning_rate": 0.001,
"loss": 1.6492,
"step": 122200
},
{
"epoch": 39.52811893988365,
"grad_norm": 1.6648753881454468,
"learning_rate": 0.001,
"loss": 1.6743,
"step": 122300
},
{
"epoch": 39.56043956043956,
"grad_norm": 1.447311282157898,
"learning_rate": 0.001,
"loss": 1.6649,
"step": 122400
},
{
"epoch": 39.59276018099548,
"grad_norm": 1.5929150581359863,
"learning_rate": 0.001,
"loss": 1.6768,
"step": 122500
},
{
"epoch": 39.62508080155139,
"grad_norm": 1.5306599140167236,
"learning_rate": 0.001,
"loss": 1.6823,
"step": 122600
},
{
"epoch": 39.657401422107306,
"grad_norm": 2.1185336112976074,
"learning_rate": 0.001,
"loss": 1.647,
"step": 122700
},
{
"epoch": 39.68972204266322,
"grad_norm": 1.5616239309310913,
"learning_rate": 0.001,
"loss": 1.6811,
"step": 122800
},
{
"epoch": 39.722042663219135,
"grad_norm": 1.610378623008728,
"learning_rate": 0.001,
"loss": 1.682,
"step": 122900
},
{
"epoch": 39.75436328377505,
"grad_norm": 1.679341197013855,
"learning_rate": 0.001,
"loss": 1.6773,
"step": 123000
},
{
"epoch": 39.786683904330964,
"grad_norm": 1.6334840059280396,
"learning_rate": 0.001,
"loss": 1.6717,
"step": 123100
},
{
"epoch": 39.81900452488688,
"grad_norm": 1.4154548645019531,
"learning_rate": 0.001,
"loss": 1.6878,
"step": 123200
},
{
"epoch": 39.85132514544279,
"grad_norm": 1.5215219259262085,
"learning_rate": 0.001,
"loss": 1.6776,
"step": 123300
},
{
"epoch": 39.88364576599871,
"grad_norm": 1.4763894081115723,
"learning_rate": 0.001,
"loss": 1.6989,
"step": 123400
},
{
"epoch": 39.91596638655462,
"grad_norm": 1.5995749235153198,
"learning_rate": 0.001,
"loss": 1.6769,
"step": 123500
},
{
"epoch": 39.94828700711054,
"grad_norm": 1.726880431175232,
"learning_rate": 0.001,
"loss": 1.7047,
"step": 123600
},
{
"epoch": 39.98060762766645,
"grad_norm": 2.136265754699707,
"learning_rate": 0.001,
"loss": 1.7003,
"step": 123700
},
{
"epoch": 40.012928248222366,
"grad_norm": 1.5406869649887085,
"learning_rate": 0.001,
"loss": 1.6333,
"step": 123800
},
{
"epoch": 40.04524886877828,
"grad_norm": 1.669394850730896,
"learning_rate": 0.001,
"loss": 1.5618,
"step": 123900
},
{
"epoch": 40.077569489334195,
"grad_norm": 1.9656615257263184,
"learning_rate": 0.001,
"loss": 1.5684,
"step": 124000
},
{
"epoch": 40.10989010989011,
"grad_norm": 1.5017229318618774,
"learning_rate": 0.001,
"loss": 1.5721,
"step": 124100
},
{
"epoch": 40.142210730446024,
"grad_norm": 1.9601807594299316,
"learning_rate": 0.001,
"loss": 1.5763,
"step": 124200
},
{
"epoch": 40.17453135100194,
"grad_norm": 1.748874545097351,
"learning_rate": 0.001,
"loss": 1.5934,
"step": 124300
},
{
"epoch": 40.20685197155785,
"grad_norm": 1.5721076726913452,
"learning_rate": 0.001,
"loss": 1.5867,
"step": 124400
},
{
"epoch": 40.23917259211377,
"grad_norm": 2.065415620803833,
"learning_rate": 0.001,
"loss": 1.5856,
"step": 124500
},
{
"epoch": 40.27149321266968,
"grad_norm": 1.926713466644287,
"learning_rate": 0.001,
"loss": 1.6055,
"step": 124600
},
{
"epoch": 40.3038138332256,
"grad_norm": 1.827108383178711,
"learning_rate": 0.001,
"loss": 1.5936,
"step": 124700
},
{
"epoch": 40.33613445378151,
"grad_norm": 2.123699426651001,
"learning_rate": 0.001,
"loss": 1.6068,
"step": 124800
},
{
"epoch": 40.368455074337426,
"grad_norm": 2.119037628173828,
"learning_rate": 0.001,
"loss": 1.6113,
"step": 124900
},
{
"epoch": 40.40077569489334,
"grad_norm": 1.7764804363250732,
"learning_rate": 0.001,
"loss": 1.6156,
"step": 125000
},
{
"epoch": 40.433096315449255,
"grad_norm": 1.7340418100357056,
"learning_rate": 0.001,
"loss": 1.6309,
"step": 125100
},
{
"epoch": 40.46541693600517,
"grad_norm": 2.1750295162200928,
"learning_rate": 0.001,
"loss": 1.6165,
"step": 125200
},
{
"epoch": 40.497737556561084,
"grad_norm": 1.8627874851226807,
"learning_rate": 0.001,
"loss": 1.6342,
"step": 125300
},
{
"epoch": 40.530058177117,
"grad_norm": 2.254770517349243,
"learning_rate": 0.001,
"loss": 1.64,
"step": 125400
},
{
"epoch": 40.56237879767291,
"grad_norm": 1.959664225578308,
"learning_rate": 0.001,
"loss": 1.6451,
"step": 125500
},
{
"epoch": 40.59469941822883,
"grad_norm": 1.7636991739273071,
"learning_rate": 0.001,
"loss": 1.6386,
"step": 125600
},
{
"epoch": 40.62702003878474,
"grad_norm": 1.5677493810653687,
"learning_rate": 0.001,
"loss": 1.6454,
"step": 125700
},
{
"epoch": 40.65934065934066,
"grad_norm": 2.089933156967163,
"learning_rate": 0.001,
"loss": 1.6412,
"step": 125800
},
{
"epoch": 40.69166127989657,
"grad_norm": 1.8187321424484253,
"learning_rate": 0.001,
"loss": 1.6429,
"step": 125900
},
{
"epoch": 40.723981900452486,
"grad_norm": 1.580985426902771,
"learning_rate": 0.001,
"loss": 1.6749,
"step": 126000
},
{
"epoch": 40.7563025210084,
"grad_norm": 1.7752652168273926,
"learning_rate": 0.001,
"loss": 1.6616,
"step": 126100
},
{
"epoch": 40.788623141564315,
"grad_norm": 2.0827362537384033,
"learning_rate": 0.001,
"loss": 1.6555,
"step": 126200
},
{
"epoch": 40.82094376212023,
"grad_norm": 1.8399465084075928,
"learning_rate": 0.001,
"loss": 1.6663,
"step": 126300
},
{
"epoch": 40.853264382676144,
"grad_norm": 1.5653818845748901,
"learning_rate": 0.001,
"loss": 1.6752,
"step": 126400
},
{
"epoch": 40.88558500323206,
"grad_norm": 2.2567121982574463,
"learning_rate": 0.001,
"loss": 1.6658,
"step": 126500
},
{
"epoch": 40.91790562378797,
"grad_norm": 1.8558109998703003,
"learning_rate": 0.001,
"loss": 1.6766,
"step": 126600
},
{
"epoch": 40.95022624434389,
"grad_norm": 1.7659657001495361,
"learning_rate": 0.001,
"loss": 1.6761,
"step": 126700
},
{
"epoch": 40.9825468648998,
"grad_norm": 1.8757858276367188,
"learning_rate": 0.001,
"loss": 1.6769,
"step": 126800
},
{
"epoch": 41.014867485455724,
"grad_norm": 2.512212038040161,
"learning_rate": 0.001,
"loss": 1.6044,
"step": 126900
},
{
"epoch": 41.04718810601164,
"grad_norm": 2.0472733974456787,
"learning_rate": 0.001,
"loss": 1.541,
"step": 127000
},
{
"epoch": 41.07950872656755,
"grad_norm": 1.9753613471984863,
"learning_rate": 0.001,
"loss": 1.5501,
"step": 127100
},
{
"epoch": 41.11182934712347,
"grad_norm": 1.7416068315505981,
"learning_rate": 0.001,
"loss": 1.5391,
"step": 127200
},
{
"epoch": 41.14414996767938,
"grad_norm": 2.020958662033081,
"learning_rate": 0.001,
"loss": 1.55,
"step": 127300
},
{
"epoch": 41.1764705882353,
"grad_norm": 1.6599868535995483,
"learning_rate": 0.001,
"loss": 1.5525,
"step": 127400
},
{
"epoch": 41.20879120879121,
"grad_norm": 2.1170713901519775,
"learning_rate": 0.001,
"loss": 1.5644,
"step": 127500
},
{
"epoch": 41.241111829347126,
"grad_norm": 2.277888298034668,
"learning_rate": 0.001,
"loss": 1.5721,
"step": 127600
},
{
"epoch": 41.27343244990304,
"grad_norm": 2.1243207454681396,
"learning_rate": 0.001,
"loss": 1.585,
"step": 127700
},
{
"epoch": 41.305753070458955,
"grad_norm": 1.9793413877487183,
"learning_rate": 0.001,
"loss": 1.5874,
"step": 127800
},
{
"epoch": 41.33807369101487,
"grad_norm": 2.173837661743164,
"learning_rate": 0.001,
"loss": 1.6025,
"step": 127900
},
{
"epoch": 41.370394311570784,
"grad_norm": 1.8958253860473633,
"learning_rate": 0.001,
"loss": 1.5792,
"step": 128000
},
{
"epoch": 41.4027149321267,
"grad_norm": 1.8988944292068481,
"learning_rate": 0.001,
"loss": 1.5951,
"step": 128100
},
{
"epoch": 41.43503555268261,
"grad_norm": 2.107556104660034,
"learning_rate": 0.001,
"loss": 1.6001,
"step": 128200
},
{
"epoch": 41.46735617323853,
"grad_norm": 1.8033684492111206,
"learning_rate": 0.001,
"loss": 1.5962,
"step": 128300
},
{
"epoch": 41.49967679379444,
"grad_norm": 2.004406213760376,
"learning_rate": 0.001,
"loss": 1.6003,
"step": 128400
},
{
"epoch": 41.53199741435036,
"grad_norm": 1.7807986736297607,
"learning_rate": 0.001,
"loss": 1.6142,
"step": 128500
},
{
"epoch": 41.56431803490627,
"grad_norm": 2.0727765560150146,
"learning_rate": 0.001,
"loss": 1.6103,
"step": 128600
},
{
"epoch": 41.596638655462186,
"grad_norm": 1.7568210363388062,
"learning_rate": 0.001,
"loss": 1.6313,
"step": 128700
},
{
"epoch": 41.6289592760181,
"grad_norm": 1.965510606765747,
"learning_rate": 0.001,
"loss": 1.6206,
"step": 128800
},
{
"epoch": 41.661279896574015,
"grad_norm": 1.6988037824630737,
"learning_rate": 0.001,
"loss": 1.6457,
"step": 128900
},
{
"epoch": 41.69360051712993,
"grad_norm": 2.4268720149993896,
"learning_rate": 0.001,
"loss": 1.6157,
"step": 129000
},
{
"epoch": 41.725921137685845,
"grad_norm": 2.0334534645080566,
"learning_rate": 0.001,
"loss": 1.6322,
"step": 129100
},
{
"epoch": 41.75824175824176,
"grad_norm": 2.085456371307373,
"learning_rate": 0.001,
"loss": 1.6448,
"step": 129200
},
{
"epoch": 41.790562378797674,
"grad_norm": 2.056472063064575,
"learning_rate": 0.001,
"loss": 1.6533,
"step": 129300
},
{
"epoch": 41.82288299935359,
"grad_norm": 2.280672073364258,
"learning_rate": 0.001,
"loss": 1.6466,
"step": 129400
},
{
"epoch": 41.8552036199095,
"grad_norm": 1.9739775657653809,
"learning_rate": 0.001,
"loss": 1.647,
"step": 129500
},
{
"epoch": 41.88752424046542,
"grad_norm": 2.5622267723083496,
"learning_rate": 0.001,
"loss": 1.6521,
"step": 129600
},
{
"epoch": 41.91984486102133,
"grad_norm": 1.893709421157837,
"learning_rate": 0.001,
"loss": 1.6407,
"step": 129700
},
{
"epoch": 41.95216548157725,
"grad_norm": 1.7965614795684814,
"learning_rate": 0.001,
"loss": 1.6568,
"step": 129800
},
{
"epoch": 41.98448610213316,
"grad_norm": 2.208484411239624,
"learning_rate": 0.001,
"loss": 1.6759,
"step": 129900
},
{
"epoch": 42.016806722689076,
"grad_norm": 2.078091859817505,
"learning_rate": 0.001,
"loss": 1.5652,
"step": 130000
},
{
"epoch": 42.04912734324499,
"grad_norm": 1.9855774641036987,
"learning_rate": 0.001,
"loss": 1.5017,
"step": 130100
},
{
"epoch": 42.081447963800905,
"grad_norm": 2.0214784145355225,
"learning_rate": 0.001,
"loss": 1.545,
"step": 130200
},
{
"epoch": 42.11376858435682,
"grad_norm": 2.2106988430023193,
"learning_rate": 0.001,
"loss": 1.5264,
"step": 130300
},
{
"epoch": 42.146089204912734,
"grad_norm": 2.208866834640503,
"learning_rate": 0.001,
"loss": 1.5276,
"step": 130400
},
{
"epoch": 42.17840982546865,
"grad_norm": 2.6902999877929688,
"learning_rate": 0.001,
"loss": 1.5549,
"step": 130500
},
{
"epoch": 42.21073044602456,
"grad_norm": 2.041752576828003,
"learning_rate": 0.001,
"loss": 1.5361,
"step": 130600
},
{
"epoch": 42.24305106658048,
"grad_norm": 1.764216423034668,
"learning_rate": 0.001,
"loss": 1.5404,
"step": 130700
},
{
"epoch": 42.27537168713639,
"grad_norm": 2.0946340560913086,
"learning_rate": 0.001,
"loss": 1.5454,
"step": 130800
},
{
"epoch": 42.30769230769231,
"grad_norm": 2.1806182861328125,
"learning_rate": 0.001,
"loss": 1.5488,
"step": 130900
},
{
"epoch": 42.34001292824822,
"grad_norm": 1.8357605934143066,
"learning_rate": 0.001,
"loss": 1.572,
"step": 131000
},
{
"epoch": 42.372333548804136,
"grad_norm": 2.311324119567871,
"learning_rate": 0.001,
"loss": 1.5771,
"step": 131100
},
{
"epoch": 42.40465416936005,
"grad_norm": 2.406909465789795,
"learning_rate": 0.001,
"loss": 1.588,
"step": 131200
},
{
"epoch": 42.436974789915965,
"grad_norm": 2.6602001190185547,
"learning_rate": 0.001,
"loss": 1.5705,
"step": 131300
},
{
"epoch": 42.46929541047188,
"grad_norm": 2.230372428894043,
"learning_rate": 0.001,
"loss": 1.5834,
"step": 131400
},
{
"epoch": 42.501616031027794,
"grad_norm": 2.017826557159424,
"learning_rate": 0.001,
"loss": 1.5873,
"step": 131500
},
{
"epoch": 42.53393665158371,
"grad_norm": 2.3046560287475586,
"learning_rate": 0.001,
"loss": 1.5883,
"step": 131600
},
{
"epoch": 42.56625727213962,
"grad_norm": 2.286146402359009,
"learning_rate": 0.001,
"loss": 1.6031,
"step": 131700
},
{
"epoch": 42.59857789269554,
"grad_norm": 2.2403616905212402,
"learning_rate": 0.001,
"loss": 1.6109,
"step": 131800
},
{
"epoch": 42.63089851325145,
"grad_norm": 2.069786787033081,
"learning_rate": 0.001,
"loss": 1.6011,
"step": 131900
},
{
"epoch": 42.66321913380737,
"grad_norm": 2.289213180541992,
"learning_rate": 0.001,
"loss": 1.6092,
"step": 132000
},
{
"epoch": 42.69553975436328,
"grad_norm": 2.493170738220215,
"learning_rate": 0.001,
"loss": 1.608,
"step": 132100
},
{
"epoch": 42.727860374919196,
"grad_norm": 2.157357931137085,
"learning_rate": 0.001,
"loss": 1.6024,
"step": 132200
},
{
"epoch": 42.76018099547511,
"grad_norm": 1.8655034303665161,
"learning_rate": 0.001,
"loss": 1.6296,
"step": 132300
},
{
"epoch": 42.792501616031025,
"grad_norm": 1.8031189441680908,
"learning_rate": 0.001,
"loss": 1.623,
"step": 132400
},
{
"epoch": 42.82482223658694,
"grad_norm": 1.9016975164413452,
"learning_rate": 0.001,
"loss": 1.6318,
"step": 132500
},
{
"epoch": 42.857142857142854,
"grad_norm": 2.0630972385406494,
"learning_rate": 0.001,
"loss": 1.6342,
"step": 132600
},
{
"epoch": 42.88946347769877,
"grad_norm": 2.427926778793335,
"learning_rate": 0.001,
"loss": 1.6399,
"step": 132700
},
{
"epoch": 42.92178409825468,
"grad_norm": 2.0053133964538574,
"learning_rate": 0.001,
"loss": 1.6561,
"step": 132800
},
{
"epoch": 42.9541047188106,
"grad_norm": 2.1016571521759033,
"learning_rate": 0.001,
"loss": 1.6391,
"step": 132900
},
{
"epoch": 42.98642533936652,
"grad_norm": 2.22855281829834,
"learning_rate": 0.001,
"loss": 1.6486,
"step": 133000
},
{
"epoch": 43.018745959922434,
"grad_norm": 1.6028803586959839,
"learning_rate": 0.001,
"loss": 1.5826,
"step": 133100
},
{
"epoch": 43.05106658047835,
"grad_norm": 2.0032293796539307,
"learning_rate": 0.001,
"loss": 1.4941,
"step": 133200
},
{
"epoch": 43.08338720103426,
"grad_norm": 1.550441861152649,
"learning_rate": 0.001,
"loss": 1.527,
"step": 133300
},
{
"epoch": 43.11570782159018,
"grad_norm": 1.429332971572876,
"learning_rate": 0.001,
"loss": 1.5175,
"step": 133400
},
{
"epoch": 43.14802844214609,
"grad_norm": 1.6169044971466064,
"learning_rate": 0.001,
"loss": 1.5182,
"step": 133500
},
{
"epoch": 43.18034906270201,
"grad_norm": 1.5943845510482788,
"learning_rate": 0.001,
"loss": 1.5155,
"step": 133600
},
{
"epoch": 43.21266968325792,
"grad_norm": 5.721950531005859,
"learning_rate": 0.001,
"loss": 1.5268,
"step": 133700
},
{
"epoch": 43.244990303813836,
"grad_norm": 1.472016453742981,
"learning_rate": 0.001,
"loss": 1.5302,
"step": 133800
},
{
"epoch": 43.27731092436975,
"grad_norm": 1.616565227508545,
"learning_rate": 0.001,
"loss": 1.5384,
"step": 133900
},
{
"epoch": 43.309631544925665,
"grad_norm": 1.4653263092041016,
"learning_rate": 0.001,
"loss": 1.5375,
"step": 134000
},
{
"epoch": 43.34195216548158,
"grad_norm": 1.5831880569458008,
"learning_rate": 0.001,
"loss": 1.5442,
"step": 134100
},
{
"epoch": 43.374272786037494,
"grad_norm": 2.1112189292907715,
"learning_rate": 0.001,
"loss": 1.5827,
"step": 134200
},
{
"epoch": 43.40659340659341,
"grad_norm": 1.6573975086212158,
"learning_rate": 0.001,
"loss": 1.5598,
"step": 134300
},
{
"epoch": 43.43891402714932,
"grad_norm": 1.8012890815734863,
"learning_rate": 0.001,
"loss": 1.5378,
"step": 134400
},
{
"epoch": 43.47123464770524,
"grad_norm": 1.6912004947662354,
"learning_rate": 0.001,
"loss": 1.5774,
"step": 134500
},
{
"epoch": 43.50355526826115,
"grad_norm": 1.7754735946655273,
"learning_rate": 0.001,
"loss": 1.5764,
"step": 134600
},
{
"epoch": 43.53587588881707,
"grad_norm": 2.1237239837646484,
"learning_rate": 0.001,
"loss": 1.576,
"step": 134700
},
{
"epoch": 43.56819650937298,
"grad_norm": 1.9989526271820068,
"learning_rate": 0.001,
"loss": 1.5701,
"step": 134800
},
{
"epoch": 43.600517129928896,
"grad_norm": 1.6959154605865479,
"learning_rate": 0.001,
"loss": 1.5828,
"step": 134900
},
{
"epoch": 43.63283775048481,
"grad_norm": 1.4177024364471436,
"learning_rate": 0.001,
"loss": 1.5929,
"step": 135000
},
{
"epoch": 43.665158371040725,
"grad_norm": 1.2994517087936401,
"learning_rate": 0.001,
"loss": 1.5947,
"step": 135100
},
{
"epoch": 43.69747899159664,
"grad_norm": 1.7819082736968994,
"learning_rate": 0.001,
"loss": 1.5897,
"step": 135200
},
{
"epoch": 43.729799612152554,
"grad_norm": 1.472542405128479,
"learning_rate": 0.001,
"loss": 1.6063,
"step": 135300
},
{
"epoch": 43.76212023270847,
"grad_norm": 1.7978127002716064,
"learning_rate": 0.001,
"loss": 1.5995,
"step": 135400
},
{
"epoch": 43.79444085326438,
"grad_norm": 1.496368169784546,
"learning_rate": 0.001,
"loss": 1.6055,
"step": 135500
},
{
"epoch": 43.8267614738203,
"grad_norm": 2.0534775257110596,
"learning_rate": 0.001,
"loss": 1.624,
"step": 135600
},
{
"epoch": 43.85908209437621,
"grad_norm": 1.7040880918502808,
"learning_rate": 0.001,
"loss": 1.6129,
"step": 135700
},
{
"epoch": 43.89140271493213,
"grad_norm": 1.5424871444702148,
"learning_rate": 0.001,
"loss": 1.6121,
"step": 135800
},
{
"epoch": 43.92372333548804,
"grad_norm": 1.6452441215515137,
"learning_rate": 0.001,
"loss": 1.6367,
"step": 135900
},
{
"epoch": 43.956043956043956,
"grad_norm": 1.9150134325027466,
"learning_rate": 0.001,
"loss": 1.6305,
"step": 136000
},
{
"epoch": 43.98836457659987,
"grad_norm": 1.769917607307434,
"learning_rate": 0.001,
"loss": 1.6333,
"step": 136100
},
{
"epoch": 44.020685197155785,
"grad_norm": 1.7169297933578491,
"learning_rate": 0.001,
"loss": 1.5414,
"step": 136200
},
{
"epoch": 44.0530058177117,
"grad_norm": 1.613829255104065,
"learning_rate": 0.001,
"loss": 1.494,
"step": 136300
},
{
"epoch": 44.085326438267614,
"grad_norm": 1.3439263105392456,
"learning_rate": 0.001,
"loss": 1.488,
"step": 136400
},
{
"epoch": 44.11764705882353,
"grad_norm": 1.3409062623977661,
"learning_rate": 0.001,
"loss": 1.5025,
"step": 136500
},
{
"epoch": 44.14996767937944,
"grad_norm": 1.8013584613800049,
"learning_rate": 0.001,
"loss": 1.4958,
"step": 136600
},
{
"epoch": 44.18228829993536,
"grad_norm": 1.5000190734863281,
"learning_rate": 0.001,
"loss": 1.5028,
"step": 136700
},
{
"epoch": 44.21460892049127,
"grad_norm": 1.672287940979004,
"learning_rate": 0.001,
"loss": 1.5285,
"step": 136800
},
{
"epoch": 44.24692954104719,
"grad_norm": 1.3260489702224731,
"learning_rate": 0.001,
"loss": 1.5245,
"step": 136900
},
{
"epoch": 44.2792501616031,
"grad_norm": 1.5546982288360596,
"learning_rate": 0.001,
"loss": 1.5327,
"step": 137000
},
{
"epoch": 44.311570782159016,
"grad_norm": 1.8142194747924805,
"learning_rate": 0.001,
"loss": 1.5384,
"step": 137100
},
{
"epoch": 44.34389140271493,
"grad_norm": 1.5028103590011597,
"learning_rate": 0.001,
"loss": 1.5345,
"step": 137200
},
{
"epoch": 44.376212023270845,
"grad_norm": 1.577204704284668,
"learning_rate": 0.001,
"loss": 1.5437,
"step": 137300
},
{
"epoch": 44.40853264382676,
"grad_norm": 1.8334197998046875,
"learning_rate": 0.001,
"loss": 1.5337,
"step": 137400
},
{
"epoch": 44.440853264382675,
"grad_norm": 1.5087043046951294,
"learning_rate": 0.001,
"loss": 1.5548,
"step": 137500
},
{
"epoch": 44.47317388493859,
"grad_norm": 1.5817244052886963,
"learning_rate": 0.001,
"loss": 1.5527,
"step": 137600
},
{
"epoch": 44.505494505494504,
"grad_norm": 1.6286990642547607,
"learning_rate": 0.001,
"loss": 1.5375,
"step": 137700
},
{
"epoch": 44.53781512605042,
"grad_norm": 1.6903146505355835,
"learning_rate": 0.001,
"loss": 1.555,
"step": 137800
},
{
"epoch": 44.57013574660633,
"grad_norm": 1.5768530368804932,
"learning_rate": 0.001,
"loss": 1.5537,
"step": 137900
},
{
"epoch": 44.60245636716225,
"grad_norm": 1.6440843343734741,
"learning_rate": 0.001,
"loss": 1.5767,
"step": 138000
},
{
"epoch": 44.63477698771816,
"grad_norm": 1.4411373138427734,
"learning_rate": 0.001,
"loss": 1.5582,
"step": 138100
},
{
"epoch": 44.66709760827408,
"grad_norm": 6.931090354919434,
"learning_rate": 0.001,
"loss": 1.5674,
"step": 138200
},
{
"epoch": 44.69941822882999,
"grad_norm": 1.7121952772140503,
"learning_rate": 0.001,
"loss": 1.5849,
"step": 138300
},
{
"epoch": 44.731738849385906,
"grad_norm": 1.3487331867218018,
"learning_rate": 0.001,
"loss": 1.581,
"step": 138400
},
{
"epoch": 44.76405946994182,
"grad_norm": 1.4686524868011475,
"learning_rate": 0.001,
"loss": 1.5855,
"step": 138500
},
{
"epoch": 44.796380090497735,
"grad_norm": 1.3534255027770996,
"learning_rate": 0.001,
"loss": 1.5878,
"step": 138600
},
{
"epoch": 44.82870071105365,
"grad_norm": 1.4645819664001465,
"learning_rate": 0.001,
"loss": 1.5831,
"step": 138700
},
{
"epoch": 44.861021331609564,
"grad_norm": 1.9700833559036255,
"learning_rate": 0.001,
"loss": 1.5986,
"step": 138800
},
{
"epoch": 44.89334195216548,
"grad_norm": 1.4428085088729858,
"learning_rate": 0.001,
"loss": 1.5975,
"step": 138900
},
{
"epoch": 44.92566257272139,
"grad_norm": 1.724913239479065,
"learning_rate": 0.001,
"loss": 1.601,
"step": 139000
},
{
"epoch": 44.95798319327731,
"grad_norm": 1.7658933401107788,
"learning_rate": 0.001,
"loss": 1.5877,
"step": 139100
},
{
"epoch": 44.99030381383322,
"grad_norm": 1.6144723892211914,
"learning_rate": 0.001,
"loss": 1.566,
"step": 139200
},
{
"epoch": 45.022624434389144,
"grad_norm": 1.3865910768508911,
"learning_rate": 0.001,
"loss": 1.5231,
"step": 139300
},
{
"epoch": 45.05494505494506,
"grad_norm": 1.631665825843811,
"learning_rate": 0.001,
"loss": 1.4477,
"step": 139400
},
{
"epoch": 45.08726567550097,
"grad_norm": 1.7368639707565308,
"learning_rate": 0.001,
"loss": 1.4724,
"step": 139500
},
{
"epoch": 45.11958629605689,
"grad_norm": 1.483976125717163,
"learning_rate": 0.001,
"loss": 1.4668,
"step": 139600
},
{
"epoch": 45.1519069166128,
"grad_norm": 1.597456932067871,
"learning_rate": 0.001,
"loss": 1.484,
"step": 139700
},
{
"epoch": 45.18422753716872,
"grad_norm": 1.684712529182434,
"learning_rate": 0.001,
"loss": 1.4857,
"step": 139800
},
{
"epoch": 45.21654815772463,
"grad_norm": 1.7697254419326782,
"learning_rate": 0.001,
"loss": 1.4854,
"step": 139900
},
{
"epoch": 45.248868778280546,
"grad_norm": 1.6394789218902588,
"learning_rate": 0.001,
"loss": 1.494,
"step": 140000
},
{
"epoch": 45.28118939883646,
"grad_norm": 1.8333781957626343,
"learning_rate": 0.001,
"loss": 1.4974,
"step": 140100
},
{
"epoch": 45.313510019392375,
"grad_norm": 1.4193600416183472,
"learning_rate": 0.001,
"loss": 1.5108,
"step": 140200
},
{
"epoch": 45.34583063994829,
"grad_norm": 1.4354008436203003,
"learning_rate": 0.001,
"loss": 1.4979,
"step": 140300
},
{
"epoch": 45.378151260504204,
"grad_norm": 1.3426471948623657,
"learning_rate": 0.001,
"loss": 1.5283,
"step": 140400
},
{
"epoch": 45.41047188106012,
"grad_norm": 1.817825198173523,
"learning_rate": 0.001,
"loss": 1.5295,
"step": 140500
},
{
"epoch": 45.44279250161603,
"grad_norm": 1.8174062967300415,
"learning_rate": 0.001,
"loss": 1.5311,
"step": 140600
},
{
"epoch": 45.47511312217195,
"grad_norm": 1.4800944328308105,
"learning_rate": 0.001,
"loss": 1.5262,
"step": 140700
},
{
"epoch": 45.50743374272786,
"grad_norm": 1.8175491094589233,
"learning_rate": 0.001,
"loss": 1.5133,
"step": 140800
},
{
"epoch": 45.53975436328378,
"grad_norm": 1.32296884059906,
"learning_rate": 0.001,
"loss": 1.5473,
"step": 140900
},
{
"epoch": 45.57207498383969,
"grad_norm": 1.3446507453918457,
"learning_rate": 0.001,
"loss": 1.5304,
"step": 141000
},
{
"epoch": 45.604395604395606,
"grad_norm": 1.742011547088623,
"learning_rate": 0.001,
"loss": 1.5464,
"step": 141100
},
{
"epoch": 45.63671622495152,
"grad_norm": 1.6908282041549683,
"learning_rate": 0.001,
"loss": 1.5599,
"step": 141200
},
{
"epoch": 45.669036845507435,
"grad_norm": 1.6784456968307495,
"learning_rate": 0.001,
"loss": 1.5543,
"step": 141300
},
{
"epoch": 45.70135746606335,
"grad_norm": 1.5689213275909424,
"learning_rate": 0.001,
"loss": 1.555,
"step": 141400
},
{
"epoch": 45.733678086619264,
"grad_norm": 1.5936824083328247,
"learning_rate": 0.001,
"loss": 1.5674,
"step": 141500
},
{
"epoch": 45.76599870717518,
"grad_norm": 1.7053321599960327,
"learning_rate": 0.001,
"loss": 1.5705,
"step": 141600
},
{
"epoch": 45.79831932773109,
"grad_norm": 2.1247267723083496,
"learning_rate": 0.001,
"loss": 1.5815,
"step": 141700
},
{
"epoch": 45.83063994828701,
"grad_norm": 1.4352737665176392,
"learning_rate": 0.001,
"loss": 1.573,
"step": 141800
},
{
"epoch": 45.86296056884292,
"grad_norm": 1.7753167152404785,
"learning_rate": 0.001,
"loss": 1.5797,
"step": 141900
},
{
"epoch": 45.89528118939884,
"grad_norm": 1.3698890209197998,
"learning_rate": 0.001,
"loss": 1.5852,
"step": 142000
},
{
"epoch": 45.92760180995475,
"grad_norm": 1.4331963062286377,
"learning_rate": 0.001,
"loss": 1.5806,
"step": 142100
},
{
"epoch": 45.959922430510666,
"grad_norm": 1.4742848873138428,
"learning_rate": 0.001,
"loss": 1.5736,
"step": 142200
},
{
"epoch": 45.99224305106658,
"grad_norm": 1.542399287223816,
"learning_rate": 0.001,
"loss": 1.6072,
"step": 142300
},
{
"epoch": 46.024563671622495,
"grad_norm": 1.7629003524780273,
"learning_rate": 0.001,
"loss": 1.4912,
"step": 142400
},
{
"epoch": 46.05688429217841,
"grad_norm": 1.528564691543579,
"learning_rate": 0.001,
"loss": 1.4412,
"step": 142500
},
{
"epoch": 46.089204912734324,
"grad_norm": 1.6338632106781006,
"learning_rate": 0.001,
"loss": 1.4508,
"step": 142600
},
{
"epoch": 46.12152553329024,
"grad_norm": 2.0615508556365967,
"learning_rate": 0.001,
"loss": 1.4567,
"step": 142700
},
{
"epoch": 46.15384615384615,
"grad_norm": 1.3304144144058228,
"learning_rate": 0.001,
"loss": 1.4729,
"step": 142800
},
{
"epoch": 46.18616677440207,
"grad_norm": 2.0170974731445312,
"learning_rate": 0.001,
"loss": 1.4745,
"step": 142900
},
{
"epoch": 46.21848739495798,
"grad_norm": 1.8077945709228516,
"learning_rate": 0.001,
"loss": 1.4868,
"step": 143000
},
{
"epoch": 46.2508080155139,
"grad_norm": 1.6119381189346313,
"learning_rate": 0.001,
"loss": 1.4822,
"step": 143100
},
{
"epoch": 46.28312863606981,
"grad_norm": 1.403429388999939,
"learning_rate": 0.001,
"loss": 1.4943,
"step": 143200
},
{
"epoch": 46.315449256625726,
"grad_norm": 1.3802709579467773,
"learning_rate": 0.001,
"loss": 1.485,
"step": 143300
},
{
"epoch": 46.34776987718164,
"grad_norm": 1.4897013902664185,
"learning_rate": 0.001,
"loss": 1.4872,
"step": 143400
},
{
"epoch": 46.380090497737555,
"grad_norm": 1.8272331953048706,
"learning_rate": 0.001,
"loss": 1.4959,
"step": 143500
},
{
"epoch": 46.41241111829347,
"grad_norm": 1.6205357313156128,
"learning_rate": 0.001,
"loss": 1.4995,
"step": 143600
},
{
"epoch": 46.444731738849384,
"grad_norm": 1.613732099533081,
"learning_rate": 0.001,
"loss": 1.5116,
"step": 143700
},
{
"epoch": 46.4770523594053,
"grad_norm": 1.474404215812683,
"learning_rate": 0.001,
"loss": 1.5179,
"step": 143800
},
{
"epoch": 46.50937297996121,
"grad_norm": 1.4578990936279297,
"learning_rate": 0.001,
"loss": 1.5057,
"step": 143900
},
{
"epoch": 46.54169360051713,
"grad_norm": 1.424071192741394,
"learning_rate": 0.001,
"loss": 1.5122,
"step": 144000
},
{
"epoch": 46.57401422107304,
"grad_norm": 1.4065488576889038,
"learning_rate": 0.001,
"loss": 1.5131,
"step": 144100
},
{
"epoch": 46.60633484162896,
"grad_norm": 1.8913878202438354,
"learning_rate": 0.001,
"loss": 1.5399,
"step": 144200
},
{
"epoch": 46.63865546218487,
"grad_norm": 1.6994715929031372,
"learning_rate": 0.001,
"loss": 1.5329,
"step": 144300
},
{
"epoch": 46.670976082740786,
"grad_norm": 2.1696887016296387,
"learning_rate": 0.001,
"loss": 1.5482,
"step": 144400
},
{
"epoch": 46.7032967032967,
"grad_norm": 1.8181955814361572,
"learning_rate": 0.001,
"loss": 1.5486,
"step": 144500
},
{
"epoch": 46.735617323852615,
"grad_norm": 1.3913260698318481,
"learning_rate": 0.001,
"loss": 1.5384,
"step": 144600
},
{
"epoch": 46.76793794440853,
"grad_norm": 1.6030380725860596,
"learning_rate": 0.001,
"loss": 1.5396,
"step": 144700
},
{
"epoch": 46.800258564964444,
"grad_norm": 2.035700559616089,
"learning_rate": 0.001,
"loss": 1.5648,
"step": 144800
},
{
"epoch": 46.83257918552036,
"grad_norm": 1.7987518310546875,
"learning_rate": 0.001,
"loss": 1.5471,
"step": 144900
},
{
"epoch": 46.864899806076274,
"grad_norm": 1.4945619106292725,
"learning_rate": 0.001,
"loss": 1.5659,
"step": 145000
},
{
"epoch": 46.89722042663219,
"grad_norm": 1.7767857313156128,
"learning_rate": 0.001,
"loss": 1.5632,
"step": 145100
},
{
"epoch": 46.9295410471881,
"grad_norm": 1.6083945035934448,
"learning_rate": 0.001,
"loss": 1.5382,
"step": 145200
},
{
"epoch": 46.96186166774402,
"grad_norm": 1.7108503580093384,
"learning_rate": 0.001,
"loss": 1.57,
"step": 145300
},
{
"epoch": 46.99418228829994,
"grad_norm": 1.4575233459472656,
"learning_rate": 0.001,
"loss": 1.566,
"step": 145400
},
{
"epoch": 47.02650290885585,
"grad_norm": 1.838165283203125,
"learning_rate": 0.001,
"loss": 1.4575,
"step": 145500
},
{
"epoch": 47.05882352941177,
"grad_norm": 1.6801042556762695,
"learning_rate": 0.001,
"loss": 1.4235,
"step": 145600
},
{
"epoch": 47.09114414996768,
"grad_norm": 1.862371802330017,
"learning_rate": 0.001,
"loss": 1.434,
"step": 145700
},
{
"epoch": 47.1234647705236,
"grad_norm": 2.157015800476074,
"learning_rate": 0.001,
"loss": 1.4335,
"step": 145800
},
{
"epoch": 47.15578539107951,
"grad_norm": 1.6546690464019775,
"learning_rate": 0.001,
"loss": 1.4435,
"step": 145900
},
{
"epoch": 47.188106011635426,
"grad_norm": 1.6954889297485352,
"learning_rate": 0.001,
"loss": 1.4419,
"step": 146000
},
{
"epoch": 47.22042663219134,
"grad_norm": 1.6528239250183105,
"learning_rate": 0.001,
"loss": 1.468,
"step": 146100
},
{
"epoch": 47.252747252747255,
"grad_norm": 1.9358820915222168,
"learning_rate": 0.001,
"loss": 1.4415,
"step": 146200
},
{
"epoch": 47.28506787330317,
"grad_norm": 1.525715708732605,
"learning_rate": 0.001,
"loss": 1.483,
"step": 146300
},
{
"epoch": 47.317388493859085,
"grad_norm": 1.8445520401000977,
"learning_rate": 0.001,
"loss": 1.4701,
"step": 146400
},
{
"epoch": 47.349709114415,
"grad_norm": 1.9748725891113281,
"learning_rate": 0.001,
"loss": 1.4815,
"step": 146500
},
{
"epoch": 47.382029734970914,
"grad_norm": 1.626009464263916,
"learning_rate": 0.001,
"loss": 1.4907,
"step": 146600
},
{
"epoch": 47.41435035552683,
"grad_norm": 1.9791555404663086,
"learning_rate": 0.001,
"loss": 1.4922,
"step": 146700
},
{
"epoch": 47.44667097608274,
"grad_norm": 2.1587910652160645,
"learning_rate": 0.001,
"loss": 1.4788,
"step": 146800
},
{
"epoch": 47.47899159663866,
"grad_norm": 1.6786390542984009,
"learning_rate": 0.001,
"loss": 1.483,
"step": 146900
},
{
"epoch": 47.51131221719457,
"grad_norm": 1.9314137697219849,
"learning_rate": 0.001,
"loss": 1.5109,
"step": 147000
},
{
"epoch": 47.543632837750486,
"grad_norm": 1.7206977605819702,
"learning_rate": 0.001,
"loss": 1.5194,
"step": 147100
},
{
"epoch": 47.5759534583064,
"grad_norm": 1.6161469221115112,
"learning_rate": 0.001,
"loss": 1.5116,
"step": 147200
},
{
"epoch": 47.608274078862316,
"grad_norm": 2.3246424198150635,
"learning_rate": 0.001,
"loss": 1.5192,
"step": 147300
},
{
"epoch": 47.64059469941823,
"grad_norm": 1.691521167755127,
"learning_rate": 0.001,
"loss": 1.5237,
"step": 147400
},
{
"epoch": 47.672915319974145,
"grad_norm": 2.0103893280029297,
"learning_rate": 0.001,
"loss": 1.5214,
"step": 147500
},
{
"epoch": 47.70523594053006,
"grad_norm": 1.7042127847671509,
"learning_rate": 0.001,
"loss": 1.5186,
"step": 147600
},
{
"epoch": 47.737556561085974,
"grad_norm": 1.4514890909194946,
"learning_rate": 0.001,
"loss": 1.5237,
"step": 147700
},
{
"epoch": 47.76987718164189,
"grad_norm": 1.584375023841858,
"learning_rate": 0.001,
"loss": 1.5369,
"step": 147800
},
{
"epoch": 47.8021978021978,
"grad_norm": 1.8270056247711182,
"learning_rate": 0.001,
"loss": 1.5443,
"step": 147900
},
{
"epoch": 47.83451842275372,
"grad_norm": 1.6497583389282227,
"learning_rate": 0.001,
"loss": 1.5389,
"step": 148000
},
{
"epoch": 47.86683904330963,
"grad_norm": 1.663865089416504,
"learning_rate": 0.001,
"loss": 1.5383,
"step": 148100
},
{
"epoch": 47.89915966386555,
"grad_norm": 1.5827676057815552,
"learning_rate": 0.001,
"loss": 1.5506,
"step": 148200
},
{
"epoch": 47.93148028442146,
"grad_norm": 1.687949776649475,
"learning_rate": 0.001,
"loss": 1.5282,
"step": 148300
},
{
"epoch": 47.963800904977376,
"grad_norm": 1.9395780563354492,
"learning_rate": 0.001,
"loss": 1.5506,
"step": 148400
},
{
"epoch": 47.99612152553329,
"grad_norm": 2.5985894203186035,
"learning_rate": 0.001,
"loss": 1.5313,
"step": 148500
},
{
"epoch": 48.028442146089205,
"grad_norm": 1.8825600147247314,
"learning_rate": 0.001,
"loss": 1.4213,
"step": 148600
},
{
"epoch": 48.06076276664512,
"grad_norm": 1.617006540298462,
"learning_rate": 0.001,
"loss": 1.4215,
"step": 148700
},
{
"epoch": 48.093083387201034,
"grad_norm": 1.5653728246688843,
"learning_rate": 0.001,
"loss": 1.4204,
"step": 148800
},
{
"epoch": 48.12540400775695,
"grad_norm": 1.995313286781311,
"learning_rate": 0.001,
"loss": 1.4367,
"step": 148900
},
{
"epoch": 48.15772462831286,
"grad_norm": 1.708569049835205,
"learning_rate": 0.001,
"loss": 1.4358,
"step": 149000
},
{
"epoch": 48.19004524886878,
"grad_norm": 1.6492958068847656,
"learning_rate": 0.001,
"loss": 1.4422,
"step": 149100
},
{
"epoch": 48.22236586942469,
"grad_norm": 1.8807870149612427,
"learning_rate": 0.001,
"loss": 1.4331,
"step": 149200
},
{
"epoch": 48.25468648998061,
"grad_norm": 1.88623046875,
"learning_rate": 0.001,
"loss": 1.4425,
"step": 149300
},
{
"epoch": 48.28700711053652,
"grad_norm": 2.2832539081573486,
"learning_rate": 0.001,
"loss": 1.4621,
"step": 149400
},
{
"epoch": 48.319327731092436,
"grad_norm": 1.6884719133377075,
"learning_rate": 0.001,
"loss": 1.4365,
"step": 149500
},
{
"epoch": 48.35164835164835,
"grad_norm": 1.904253602027893,
"learning_rate": 0.001,
"loss": 1.4675,
"step": 149600
},
{
"epoch": 48.383968972204265,
"grad_norm": 1.6859662532806396,
"learning_rate": 0.001,
"loss": 1.4725,
"step": 149700
},
{
"epoch": 48.41628959276018,
"grad_norm": 2.050351858139038,
"learning_rate": 0.001,
"loss": 1.4567,
"step": 149800
},
{
"epoch": 48.448610213316094,
"grad_norm": 1.3949456214904785,
"learning_rate": 0.001,
"loss": 1.489,
"step": 149900
},
{
"epoch": 48.48093083387201,
"grad_norm": 1.9798871278762817,
"learning_rate": 0.001,
"loss": 1.4699,
"step": 150000
},
{
"epoch": 48.51325145442792,
"grad_norm": 1.7036690711975098,
"learning_rate": 0.001,
"loss": 1.4783,
"step": 150100
},
{
"epoch": 48.54557207498384,
"grad_norm": 2.0233912467956543,
"learning_rate": 0.001,
"loss": 1.4945,
"step": 150200
},
{
"epoch": 48.57789269553975,
"grad_norm": 1.6232671737670898,
"learning_rate": 0.001,
"loss": 1.4934,
"step": 150300
},
{
"epoch": 48.61021331609567,
"grad_norm": 1.8260281085968018,
"learning_rate": 0.001,
"loss": 1.5012,
"step": 150400
},
{
"epoch": 48.64253393665158,
"grad_norm": 2.079585552215576,
"learning_rate": 0.001,
"loss": 1.5057,
"step": 150500
},
{
"epoch": 48.674854557207496,
"grad_norm": 1.9923733472824097,
"learning_rate": 0.001,
"loss": 1.4999,
"step": 150600
},
{
"epoch": 48.70717517776341,
"grad_norm": 2.004462480545044,
"learning_rate": 0.001,
"loss": 1.5087,
"step": 150700
},
{
"epoch": 48.739495798319325,
"grad_norm": 1.9867020845413208,
"learning_rate": 0.001,
"loss": 1.5161,
"step": 150800
},
{
"epoch": 48.77181641887524,
"grad_norm": 1.81278657913208,
"learning_rate": 0.001,
"loss": 1.5151,
"step": 150900
},
{
"epoch": 48.804137039431154,
"grad_norm": 1.93266761302948,
"learning_rate": 0.001,
"loss": 1.5136,
"step": 151000
},
{
"epoch": 48.83645765998707,
"grad_norm": 2.0835654735565186,
"learning_rate": 0.001,
"loss": 1.5254,
"step": 151100
},
{
"epoch": 48.86877828054298,
"grad_norm": 1.9827555418014526,
"learning_rate": 0.001,
"loss": 1.5334,
"step": 151200
},
{
"epoch": 48.9010989010989,
"grad_norm": 1.724138855934143,
"learning_rate": 0.001,
"loss": 1.5281,
"step": 151300
},
{
"epoch": 48.93341952165481,
"grad_norm": 1.6550005674362183,
"learning_rate": 0.001,
"loss": 1.5372,
"step": 151400
},
{
"epoch": 48.96574014221073,
"grad_norm": 2.217447519302368,
"learning_rate": 0.001,
"loss": 1.5312,
"step": 151500
},
{
"epoch": 48.99806076276664,
"grad_norm": 2.1639389991760254,
"learning_rate": 0.001,
"loss": 1.5187,
"step": 151600
},
{
"epoch": 49.03038138332256,
"grad_norm": 1.84763503074646,
"learning_rate": 0.001,
"loss": 1.3963,
"step": 151700
},
{
"epoch": 49.06270200387848,
"grad_norm": 1.755767583847046,
"learning_rate": 0.001,
"loss": 1.4093,
"step": 151800
},
{
"epoch": 49.09502262443439,
"grad_norm": 2.258392333984375,
"learning_rate": 0.001,
"loss": 1.4131,
"step": 151900
},
{
"epoch": 49.12734324499031,
"grad_norm": 2.0753369331359863,
"learning_rate": 0.001,
"loss": 1.4137,
"step": 152000
},
{
"epoch": 49.15966386554622,
"grad_norm": 1.6378613710403442,
"learning_rate": 0.001,
"loss": 1.424,
"step": 152100
},
{
"epoch": 49.191984486102136,
"grad_norm": 7.034304618835449,
"learning_rate": 0.001,
"loss": 1.4069,
"step": 152200
},
{
"epoch": 49.22430510665805,
"grad_norm": 2.0201122760772705,
"learning_rate": 0.001,
"loss": 1.426,
"step": 152300
},
{
"epoch": 49.256625727213965,
"grad_norm": 1.9883705377578735,
"learning_rate": 0.001,
"loss": 1.4445,
"step": 152400
},
{
"epoch": 49.28894634776988,
"grad_norm": 2.0591297149658203,
"learning_rate": 0.001,
"loss": 1.43,
"step": 152500
},
{
"epoch": 49.321266968325794,
"grad_norm": 1.975419521331787,
"learning_rate": 0.001,
"loss": 1.4497,
"step": 152600
},
{
"epoch": 49.35358758888171,
"grad_norm": 2.1251235008239746,
"learning_rate": 0.001,
"loss": 1.465,
"step": 152700
},
{
"epoch": 49.38590820943762,
"grad_norm": 1.797031283378601,
"learning_rate": 0.001,
"loss": 1.441,
"step": 152800
},
{
"epoch": 49.41822882999354,
"grad_norm": 2.097074270248413,
"learning_rate": 0.001,
"loss": 1.4631,
"step": 152900
},
{
"epoch": 49.45054945054945,
"grad_norm": 1.9780620336532593,
"learning_rate": 0.001,
"loss": 1.4636,
"step": 153000
},
{
"epoch": 49.48287007110537,
"grad_norm": 1.8536884784698486,
"learning_rate": 0.001,
"loss": 1.4603,
"step": 153100
},
{
"epoch": 49.51519069166128,
"grad_norm": 2.2152280807495117,
"learning_rate": 0.001,
"loss": 1.4746,
"step": 153200
},
{
"epoch": 49.547511312217196,
"grad_norm": 2.028168201446533,
"learning_rate": 0.001,
"loss": 1.4809,
"step": 153300
},
{
"epoch": 49.57983193277311,
"grad_norm": 2.026210308074951,
"learning_rate": 0.001,
"loss": 1.4934,
"step": 153400
},
{
"epoch": 49.612152553329025,
"grad_norm": 1.8274730443954468,
"learning_rate": 0.001,
"loss": 1.4745,
"step": 153500
},
{
"epoch": 49.64447317388494,
"grad_norm": 1.7476829290390015,
"learning_rate": 0.001,
"loss": 1.4877,
"step": 153600
},
{
"epoch": 49.676793794440854,
"grad_norm": 1.8586665391921997,
"learning_rate": 0.001,
"loss": 1.4779,
"step": 153700
},
{
"epoch": 49.70911441499677,
"grad_norm": 2.142073154449463,
"learning_rate": 0.001,
"loss": 1.4904,
"step": 153800
},
{
"epoch": 49.74143503555268,
"grad_norm": 2.4163622856140137,
"learning_rate": 0.001,
"loss": 1.4913,
"step": 153900
},
{
"epoch": 49.7737556561086,
"grad_norm": 1.5314381122589111,
"learning_rate": 0.001,
"loss": 1.4888,
"step": 154000
},
{
"epoch": 49.80607627666451,
"grad_norm": 2.1272311210632324,
"learning_rate": 0.001,
"loss": 1.4829,
"step": 154100
},
{
"epoch": 49.83839689722043,
"grad_norm": 1.8081731796264648,
"learning_rate": 0.001,
"loss": 1.4969,
"step": 154200
},
{
"epoch": 49.87071751777634,
"grad_norm": 2.0165419578552246,
"learning_rate": 0.001,
"loss": 1.5123,
"step": 154300
},
{
"epoch": 49.903038138332256,
"grad_norm": 1.895053744316101,
"learning_rate": 0.001,
"loss": 1.5118,
"step": 154400
},
{
"epoch": 49.93535875888817,
"grad_norm": 1.836590051651001,
"learning_rate": 0.001,
"loss": 1.5071,
"step": 154500
},
{
"epoch": 49.967679379444085,
"grad_norm": 2.259945869445801,
"learning_rate": 0.001,
"loss": 1.5239,
"step": 154600
},
{
"epoch": 50.0,
"grad_norm": 2.8617236614227295,
"learning_rate": 0.001,
"loss": 1.4828,
"step": 154700
},
{
"epoch": 50.032320620555915,
"grad_norm": 2.7500596046447754,
"learning_rate": 0.001,
"loss": 1.3622,
"step": 154800
},
{
"epoch": 50.06464124111183,
"grad_norm": 2.843418598175049,
"learning_rate": 0.001,
"loss": 1.389,
"step": 154900
},
{
"epoch": 50.096961861667744,
"grad_norm": 1.911349892616272,
"learning_rate": 0.001,
"loss": 1.4045,
"step": 155000
},
{
"epoch": 50.12928248222366,
"grad_norm": 2.789196491241455,
"learning_rate": 0.001,
"loss": 1.4044,
"step": 155100
},
{
"epoch": 50.16160310277957,
"grad_norm": 2.3924665451049805,
"learning_rate": 0.001,
"loss": 1.4078,
"step": 155200
},
{
"epoch": 50.19392372333549,
"grad_norm": 2.2527916431427,
"learning_rate": 0.001,
"loss": 1.4179,
"step": 155300
},
{
"epoch": 50.2262443438914,
"grad_norm": 2.869682788848877,
"learning_rate": 0.001,
"loss": 1.4157,
"step": 155400
},
{
"epoch": 50.25856496444732,
"grad_norm": 3.076284646987915,
"learning_rate": 0.001,
"loss": 1.4194,
"step": 155500
},
{
"epoch": 50.29088558500323,
"grad_norm": 2.405968427658081,
"learning_rate": 0.001,
"loss": 1.4324,
"step": 155600
},
{
"epoch": 50.323206205559146,
"grad_norm": 2.4868760108947754,
"learning_rate": 0.001,
"loss": 1.4284,
"step": 155700
},
{
"epoch": 50.35552682611506,
"grad_norm": 2.351515769958496,
"learning_rate": 0.001,
"loss": 1.4396,
"step": 155800
},
{
"epoch": 50.387847446670975,
"grad_norm": 2.545591354370117,
"learning_rate": 0.001,
"loss": 1.4433,
"step": 155900
},
{
"epoch": 50.42016806722689,
"grad_norm": 2.52632212638855,
"learning_rate": 0.001,
"loss": 1.4548,
"step": 156000
},
{
"epoch": 50.452488687782804,
"grad_norm": 2.8453922271728516,
"learning_rate": 0.001,
"loss": 1.4365,
"step": 156100
},
{
"epoch": 50.48480930833872,
"grad_norm": 2.562619209289551,
"learning_rate": 0.001,
"loss": 1.4401,
"step": 156200
},
{
"epoch": 50.51712992889463,
"grad_norm": 2.5165834426879883,
"learning_rate": 0.001,
"loss": 1.4542,
"step": 156300
},
{
"epoch": 50.54945054945055,
"grad_norm": 2.4320249557495117,
"learning_rate": 0.001,
"loss": 1.4499,
"step": 156400
},
{
"epoch": 50.58177117000646,
"grad_norm": 2.7280805110931396,
"learning_rate": 0.001,
"loss": 1.4534,
"step": 156500
},
{
"epoch": 50.61409179056238,
"grad_norm": 2.658902645111084,
"learning_rate": 0.001,
"loss": 1.4678,
"step": 156600
},
{
"epoch": 50.64641241111829,
"grad_norm": 2.3166139125823975,
"learning_rate": 0.001,
"loss": 1.4596,
"step": 156700
},
{
"epoch": 50.678733031674206,
"grad_norm": 3.5382041931152344,
"learning_rate": 0.001,
"loss": 1.4593,
"step": 156800
},
{
"epoch": 50.71105365223012,
"grad_norm": 3.2525131702423096,
"learning_rate": 0.001,
"loss": 1.462,
"step": 156900
},
{
"epoch": 50.743374272786035,
"grad_norm": 2.6875829696655273,
"learning_rate": 0.001,
"loss": 1.5004,
"step": 157000
},
{
"epoch": 50.77569489334195,
"grad_norm": 2.6666595935821533,
"learning_rate": 0.001,
"loss": 1.473,
"step": 157100
},
{
"epoch": 50.808015513897864,
"grad_norm": 2.2740638256073,
"learning_rate": 0.001,
"loss": 1.4764,
"step": 157200
},
{
"epoch": 50.84033613445378,
"grad_norm": 2.457540273666382,
"learning_rate": 0.001,
"loss": 1.4869,
"step": 157300
},
{
"epoch": 50.87265675500969,
"grad_norm": 7.748457431793213,
"learning_rate": 0.001,
"loss": 1.4969,
"step": 157400
},
{
"epoch": 50.90497737556561,
"grad_norm": 2.187288284301758,
"learning_rate": 0.001,
"loss": 1.4913,
"step": 157500
},
{
"epoch": 50.93729799612152,
"grad_norm": 7.933531761169434,
"learning_rate": 0.001,
"loss": 1.5105,
"step": 157600
},
{
"epoch": 50.96961861667744,
"grad_norm": 2.370905637741089,
"learning_rate": 0.001,
"loss": 1.4957,
"step": 157700
},
{
"epoch": 51.00193923723336,
"grad_norm": 1.6068298816680908,
"learning_rate": 0.001,
"loss": 1.5156,
"step": 157800
},
{
"epoch": 51.03425985778927,
"grad_norm": 1.8159526586532593,
"learning_rate": 0.001,
"loss": 1.358,
"step": 157900
},
{
"epoch": 51.06658047834519,
"grad_norm": 1.58469820022583,
"learning_rate": 0.001,
"loss": 1.3614,
"step": 158000
},
{
"epoch": 51.0989010989011,
"grad_norm": 1.645398736000061,
"learning_rate": 0.001,
"loss": 1.3755,
"step": 158100
},
{
"epoch": 51.13122171945702,
"grad_norm": 2.198871612548828,
"learning_rate": 0.001,
"loss": 1.3954,
"step": 158200
},
{
"epoch": 51.16354234001293,
"grad_norm": 1.8951984643936157,
"learning_rate": 0.001,
"loss": 1.3857,
"step": 158300
},
{
"epoch": 51.195862960568846,
"grad_norm": 1.9388495683670044,
"learning_rate": 0.001,
"loss": 1.3751,
"step": 158400
},
{
"epoch": 51.22818358112476,
"grad_norm": 1.894666075706482,
"learning_rate": 0.001,
"loss": 1.414,
"step": 158500
},
{
"epoch": 51.260504201680675,
"grad_norm": 2.324024200439453,
"learning_rate": 0.001,
"loss": 1.4045,
"step": 158600
},
{
"epoch": 51.29282482223659,
"grad_norm": 2.4311294555664062,
"learning_rate": 0.001,
"loss": 1.4152,
"step": 158700
},
{
"epoch": 51.325145442792504,
"grad_norm": 1.6439367532730103,
"learning_rate": 0.001,
"loss": 1.4066,
"step": 158800
},
{
"epoch": 51.35746606334842,
"grad_norm": 1.458225131034851,
"learning_rate": 0.001,
"loss": 1.4157,
"step": 158900
},
{
"epoch": 51.38978668390433,
"grad_norm": 1.7206692695617676,
"learning_rate": 0.001,
"loss": 1.4376,
"step": 159000
},
{
"epoch": 51.42210730446025,
"grad_norm": 2.1803271770477295,
"learning_rate": 0.001,
"loss": 1.4167,
"step": 159100
},
{
"epoch": 51.45442792501616,
"grad_norm": 1.9700300693511963,
"learning_rate": 0.001,
"loss": 1.4532,
"step": 159200
},
{
"epoch": 51.48674854557208,
"grad_norm": 1.7611888647079468,
"learning_rate": 0.001,
"loss": 1.4344,
"step": 159300
},
{
"epoch": 51.51906916612799,
"grad_norm": 1.614646553993225,
"learning_rate": 0.001,
"loss": 1.4172,
"step": 159400
},
{
"epoch": 51.551389786683906,
"grad_norm": 1.9561604261398315,
"learning_rate": 0.001,
"loss": 1.4543,
"step": 159500
},
{
"epoch": 51.58371040723982,
"grad_norm": 2.4098892211914062,
"learning_rate": 0.001,
"loss": 1.4481,
"step": 159600
},
{
"epoch": 51.616031027795735,
"grad_norm": 1.6354035139083862,
"learning_rate": 0.001,
"loss": 1.4402,
"step": 159700
},
{
"epoch": 51.64835164835165,
"grad_norm": 1.5231209993362427,
"learning_rate": 0.001,
"loss": 1.4591,
"step": 159800
},
{
"epoch": 51.680672268907564,
"grad_norm": 1.4801390171051025,
"learning_rate": 0.001,
"loss": 1.4647,
"step": 159900
},
{
"epoch": 51.71299288946348,
"grad_norm": 1.897993803024292,
"learning_rate": 0.001,
"loss": 1.4731,
"step": 160000
},
{
"epoch": 51.74531351001939,
"grad_norm": 1.9183740615844727,
"learning_rate": 0.001,
"loss": 1.4652,
"step": 160100
},
{
"epoch": 51.77763413057531,
"grad_norm": 1.6743416786193848,
"learning_rate": 0.001,
"loss": 1.468,
"step": 160200
},
{
"epoch": 51.80995475113122,
"grad_norm": 1.5103219747543335,
"learning_rate": 0.001,
"loss": 1.4753,
"step": 160300
},
{
"epoch": 51.84227537168714,
"grad_norm": 1.8089489936828613,
"learning_rate": 0.001,
"loss": 1.4642,
"step": 160400
},
{
"epoch": 51.87459599224305,
"grad_norm": 1.889352560043335,
"learning_rate": 0.001,
"loss": 1.4753,
"step": 160500
},
{
"epoch": 51.906916612798966,
"grad_norm": 1.892152190208435,
"learning_rate": 0.001,
"loss": 1.4855,
"step": 160600
},
{
"epoch": 51.93923723335488,
"grad_norm": 2.1310675144195557,
"learning_rate": 0.001,
"loss": 1.4741,
"step": 160700
},
{
"epoch": 51.971557853910795,
"grad_norm": 1.6801360845565796,
"learning_rate": 0.001,
"loss": 1.4823,
"step": 160800
},
{
"epoch": 52.00387847446671,
"grad_norm": 1.5871704816818237,
"learning_rate": 0.001,
"loss": 1.4805,
"step": 160900
},
{
"epoch": 52.036199095022624,
"grad_norm": 1.765568494796753,
"learning_rate": 0.001,
"loss": 1.3575,
"step": 161000
},
{
"epoch": 52.06851971557854,
"grad_norm": 1.6732524633407593,
"learning_rate": 0.001,
"loss": 1.3625,
"step": 161100
},
{
"epoch": 52.10084033613445,
"grad_norm": 2.0522379875183105,
"learning_rate": 0.001,
"loss": 1.3777,
"step": 161200
},
{
"epoch": 52.13316095669037,
"grad_norm": 1.4788262844085693,
"learning_rate": 0.001,
"loss": 1.3684,
"step": 161300
},
{
"epoch": 52.16548157724628,
"grad_norm": 1.94754159450531,
"learning_rate": 0.001,
"loss": 1.364,
"step": 161400
},
{
"epoch": 52.1978021978022,
"grad_norm": 1.6368705034255981,
"learning_rate": 0.001,
"loss": 1.3813,
"step": 161500
},
{
"epoch": 52.23012281835811,
"grad_norm": 1.5317388772964478,
"learning_rate": 0.001,
"loss": 1.392,
"step": 161600
},
{
"epoch": 52.262443438914026,
"grad_norm": 1.593577265739441,
"learning_rate": 0.001,
"loss": 1.3962,
"step": 161700
},
{
"epoch": 52.29476405946994,
"grad_norm": 1.7241520881652832,
"learning_rate": 0.001,
"loss": 1.3977,
"step": 161800
},
{
"epoch": 52.327084680025855,
"grad_norm": 1.9638442993164062,
"learning_rate": 0.001,
"loss": 1.3956,
"step": 161900
},
{
"epoch": 52.35940530058177,
"grad_norm": 1.7215887308120728,
"learning_rate": 0.001,
"loss": 1.4052,
"step": 162000
},
{
"epoch": 52.391725921137684,
"grad_norm": 1.7366405725479126,
"learning_rate": 0.001,
"loss": 1.4169,
"step": 162100
},
{
"epoch": 52.4240465416936,
"grad_norm": 2.66810941696167,
"learning_rate": 0.001,
"loss": 1.4099,
"step": 162200
},
{
"epoch": 52.456367162249514,
"grad_norm": 1.9656466245651245,
"learning_rate": 0.001,
"loss": 1.4154,
"step": 162300
},
{
"epoch": 52.48868778280543,
"grad_norm": 1.7060424089431763,
"learning_rate": 0.001,
"loss": 1.4269,
"step": 162400
},
{
"epoch": 52.52100840336134,
"grad_norm": 1.6961543560028076,
"learning_rate": 0.001,
"loss": 1.4199,
"step": 162500
},
{
"epoch": 52.55332902391726,
"grad_norm": 1.670259952545166,
"learning_rate": 0.001,
"loss": 1.4225,
"step": 162600
},
{
"epoch": 52.58564964447317,
"grad_norm": 1.4814908504486084,
"learning_rate": 0.001,
"loss": 1.4246,
"step": 162700
},
{
"epoch": 52.617970265029086,
"grad_norm": 1.7170677185058594,
"learning_rate": 0.001,
"loss": 1.4303,
"step": 162800
},
{
"epoch": 52.650290885585,
"grad_norm": 1.9022291898727417,
"learning_rate": 0.001,
"loss": 1.4298,
"step": 162900
},
{
"epoch": 52.682611506140915,
"grad_norm": 1.8609496355056763,
"learning_rate": 0.001,
"loss": 1.4399,
"step": 163000
},
{
"epoch": 52.71493212669683,
"grad_norm": 1.6735694408416748,
"learning_rate": 0.001,
"loss": 1.4488,
"step": 163100
},
{
"epoch": 52.747252747252745,
"grad_norm": 1.5999531745910645,
"learning_rate": 0.001,
"loss": 1.4326,
"step": 163200
},
{
"epoch": 52.77957336780866,
"grad_norm": 1.8553581237792969,
"learning_rate": 0.001,
"loss": 1.4322,
"step": 163300
},
{
"epoch": 52.811893988364574,
"grad_norm": 1.971063256263733,
"learning_rate": 0.001,
"loss": 1.4361,
"step": 163400
},
{
"epoch": 52.84421460892049,
"grad_norm": 1.682065725326538,
"learning_rate": 0.001,
"loss": 1.4516,
"step": 163500
},
{
"epoch": 52.8765352294764,
"grad_norm": 1.498920202255249,
"learning_rate": 0.001,
"loss": 1.4783,
"step": 163600
},
{
"epoch": 52.90885585003232,
"grad_norm": 2.0326061248779297,
"learning_rate": 0.001,
"loss": 1.4666,
"step": 163700
},
{
"epoch": 52.94117647058823,
"grad_norm": 1.505751609802246,
"learning_rate": 0.001,
"loss": 1.4689,
"step": 163800
},
{
"epoch": 52.97349709114415,
"grad_norm": 1.652345895767212,
"learning_rate": 0.001,
"loss": 1.4698,
"step": 163900
},
{
"epoch": 53.00581771170007,
"grad_norm": 1.6074447631835938,
"learning_rate": 0.001,
"loss": 1.4543,
"step": 164000
},
{
"epoch": 53.03813833225598,
"grad_norm": 1.6570724248886108,
"learning_rate": 0.001,
"loss": 1.3409,
"step": 164100
},
{
"epoch": 53.0704589528119,
"grad_norm": 1.9990196228027344,
"learning_rate": 0.001,
"loss": 1.3491,
"step": 164200
},
{
"epoch": 53.10277957336781,
"grad_norm": 1.4702783823013306,
"learning_rate": 0.001,
"loss": 1.3252,
"step": 164300
},
{
"epoch": 53.135100193923726,
"grad_norm": 2.1267101764678955,
"learning_rate": 0.001,
"loss": 1.3418,
"step": 164400
},
{
"epoch": 53.16742081447964,
"grad_norm": 1.68787682056427,
"learning_rate": 0.001,
"loss": 1.3588,
"step": 164500
},
{
"epoch": 53.199741435035556,
"grad_norm": 1.6803096532821655,
"learning_rate": 0.001,
"loss": 1.3685,
"step": 164600
},
{
"epoch": 53.23206205559147,
"grad_norm": 1.3000693321228027,
"learning_rate": 0.001,
"loss": 1.3675,
"step": 164700
},
{
"epoch": 53.264382676147385,
"grad_norm": 1.6280704736709595,
"learning_rate": 0.001,
"loss": 1.3659,
"step": 164800
},
{
"epoch": 53.2967032967033,
"grad_norm": 1.7576541900634766,
"learning_rate": 0.001,
"loss": 1.383,
"step": 164900
},
{
"epoch": 53.329023917259214,
"grad_norm": 1.8576921224594116,
"learning_rate": 0.001,
"loss": 1.373,
"step": 165000
},
{
"epoch": 53.36134453781513,
"grad_norm": 1.5620722770690918,
"learning_rate": 0.001,
"loss": 1.3778,
"step": 165100
},
{
"epoch": 53.39366515837104,
"grad_norm": 1.6440199613571167,
"learning_rate": 0.001,
"loss": 1.413,
"step": 165200
},
{
"epoch": 53.42598577892696,
"grad_norm": 2.071763038635254,
"learning_rate": 0.001,
"loss": 1.3942,
"step": 165300
},
{
"epoch": 53.45830639948287,
"grad_norm": 1.619279384613037,
"learning_rate": 0.001,
"loss": 1.4029,
"step": 165400
},
{
"epoch": 53.49062702003879,
"grad_norm": 1.8446378707885742,
"learning_rate": 0.001,
"loss": 1.398,
"step": 165500
},
{
"epoch": 53.5229476405947,
"grad_norm": 6.851656913757324,
"learning_rate": 0.001,
"loss": 1.406,
"step": 165600
},
{
"epoch": 53.555268261150616,
"grad_norm": 1.4749475717544556,
"learning_rate": 0.001,
"loss": 1.4049,
"step": 165700
},
{
"epoch": 53.58758888170653,
"grad_norm": 1.752159595489502,
"learning_rate": 0.001,
"loss": 1.3914,
"step": 165800
},
{
"epoch": 53.619909502262445,
"grad_norm": 1.5363788604736328,
"learning_rate": 0.001,
"loss": 1.4229,
"step": 165900
},
{
"epoch": 53.65223012281836,
"grad_norm": 1.6976779699325562,
"learning_rate": 0.001,
"loss": 1.4316,
"step": 166000
},
{
"epoch": 53.684550743374274,
"grad_norm": 1.7210675477981567,
"learning_rate": 0.001,
"loss": 1.4234,
"step": 166100
},
{
"epoch": 53.71687136393019,
"grad_norm": 1.6245074272155762,
"learning_rate": 0.001,
"loss": 1.4233,
"step": 166200
},
{
"epoch": 53.7491919844861,
"grad_norm": 1.5693413019180298,
"learning_rate": 0.001,
"loss": 1.4393,
"step": 166300
},
{
"epoch": 53.78151260504202,
"grad_norm": 1.9812067747116089,
"learning_rate": 0.001,
"loss": 1.4486,
"step": 166400
},
{
"epoch": 53.81383322559793,
"grad_norm": 1.4747971296310425,
"learning_rate": 0.001,
"loss": 1.4399,
"step": 166500
},
{
"epoch": 53.84615384615385,
"grad_norm": 1.851163625717163,
"learning_rate": 0.001,
"loss": 1.4454,
"step": 166600
},
{
"epoch": 53.87847446670976,
"grad_norm": 2.2305819988250732,
"learning_rate": 0.001,
"loss": 1.4636,
"step": 166700
},
{
"epoch": 53.910795087265676,
"grad_norm": 1.541678547859192,
"learning_rate": 0.001,
"loss": 1.4269,
"step": 166800
},
{
"epoch": 53.94311570782159,
"grad_norm": 1.715518593788147,
"learning_rate": 0.001,
"loss": 1.4537,
"step": 166900
},
{
"epoch": 53.975436328377505,
"grad_norm": 1.4480615854263306,
"learning_rate": 0.001,
"loss": 1.4655,
"step": 167000
},
{
"epoch": 54.00775694893342,
"grad_norm": 1.692636251449585,
"learning_rate": 0.001,
"loss": 1.4344,
"step": 167100
},
{
"epoch": 54.040077569489334,
"grad_norm": 1.7430903911590576,
"learning_rate": 0.001,
"loss": 1.3191,
"step": 167200
},
{
"epoch": 54.07239819004525,
"grad_norm": 1.308061957359314,
"learning_rate": 0.001,
"loss": 1.3301,
"step": 167300
},
{
"epoch": 54.10471881060116,
"grad_norm": 1.7454525232315063,
"learning_rate": 0.001,
"loss": 1.3403,
"step": 167400
},
{
"epoch": 54.13703943115708,
"grad_norm": 1.5327422618865967,
"learning_rate": 0.001,
"loss": 1.3231,
"step": 167500
},
{
"epoch": 54.16936005171299,
"grad_norm": 1.5026171207427979,
"learning_rate": 0.001,
"loss": 1.3433,
"step": 167600
},
{
"epoch": 54.20168067226891,
"grad_norm": 1.3935140371322632,
"learning_rate": 0.001,
"loss": 1.3591,
"step": 167700
},
{
"epoch": 54.23400129282482,
"grad_norm": 1.617540717124939,
"learning_rate": 0.001,
"loss": 1.3346,
"step": 167800
},
{
"epoch": 54.266321913380736,
"grad_norm": 2.285799741744995,
"learning_rate": 0.001,
"loss": 1.3603,
"step": 167900
},
{
"epoch": 54.29864253393665,
"grad_norm": 1.4892338514328003,
"learning_rate": 0.001,
"loss": 1.3933,
"step": 168000
},
{
"epoch": 54.330963154492565,
"grad_norm": 1.4886034727096558,
"learning_rate": 0.001,
"loss": 1.3704,
"step": 168100
},
{
"epoch": 54.36328377504848,
"grad_norm": 1.4371678829193115,
"learning_rate": 0.001,
"loss": 1.3819,
"step": 168200
},
{
"epoch": 54.395604395604394,
"grad_norm": 2.0230281352996826,
"learning_rate": 0.001,
"loss": 1.3826,
"step": 168300
},
{
"epoch": 54.42792501616031,
"grad_norm": 1.5713683366775513,
"learning_rate": 0.001,
"loss": 1.3671,
"step": 168400
},
{
"epoch": 54.46024563671622,
"grad_norm": 1.58280348777771,
"learning_rate": 0.001,
"loss": 1.3767,
"step": 168500
},
{
"epoch": 54.49256625727214,
"grad_norm": 1.4634507894515991,
"learning_rate": 0.001,
"loss": 1.3916,
"step": 168600
},
{
"epoch": 54.52488687782805,
"grad_norm": 1.4438565969467163,
"learning_rate": 0.001,
"loss": 1.4001,
"step": 168700
},
{
"epoch": 54.55720749838397,
"grad_norm": 1.6048426628112793,
"learning_rate": 0.001,
"loss": 1.3831,
"step": 168800
},
{
"epoch": 54.58952811893988,
"grad_norm": 1.7846475839614868,
"learning_rate": 0.001,
"loss": 1.3927,
"step": 168900
},
{
"epoch": 54.621848739495796,
"grad_norm": 1.8997995853424072,
"learning_rate": 0.001,
"loss": 1.4055,
"step": 169000
},
{
"epoch": 54.65416936005171,
"grad_norm": 1.6765133142471313,
"learning_rate": 0.001,
"loss": 1.4073,
"step": 169100
},
{
"epoch": 54.686489980607625,
"grad_norm": 1.3951281309127808,
"learning_rate": 0.001,
"loss": 1.4121,
"step": 169200
},
{
"epoch": 54.71881060116354,
"grad_norm": 1.5928879976272583,
"learning_rate": 0.001,
"loss": 1.425,
"step": 169300
},
{
"epoch": 54.751131221719454,
"grad_norm": 1.5925168991088867,
"learning_rate": 0.001,
"loss": 1.4258,
"step": 169400
},
{
"epoch": 54.78345184227537,
"grad_norm": 1.8010461330413818,
"learning_rate": 0.001,
"loss": 1.4312,
"step": 169500
},
{
"epoch": 54.81577246283128,
"grad_norm": 2.023576259613037,
"learning_rate": 0.001,
"loss": 1.4175,
"step": 169600
},
{
"epoch": 54.8480930833872,
"grad_norm": 1.5003081560134888,
"learning_rate": 0.001,
"loss": 1.445,
"step": 169700
},
{
"epoch": 54.88041370394311,
"grad_norm": 1.3284757137298584,
"learning_rate": 0.001,
"loss": 1.4262,
"step": 169800
},
{
"epoch": 54.91273432449903,
"grad_norm": 1.90644371509552,
"learning_rate": 0.001,
"loss": 1.42,
"step": 169900
},
{
"epoch": 54.94505494505494,
"grad_norm": 2.487614393234253,
"learning_rate": 0.001,
"loss": 1.4205,
"step": 170000
},
{
"epoch": 54.977375565610856,
"grad_norm": 2.273512601852417,
"learning_rate": 0.001,
"loss": 1.4401,
"step": 170100
},
{
"epoch": 55.00969618616678,
"grad_norm": 1.6120694875717163,
"learning_rate": 0.001,
"loss": 1.3946,
"step": 170200
},
{
"epoch": 55.04201680672269,
"grad_norm": 1.6807186603546143,
"learning_rate": 0.001,
"loss": 1.3055,
"step": 170300
},
{
"epoch": 55.07433742727861,
"grad_norm": 1.9705253839492798,
"learning_rate": 0.001,
"loss": 1.318,
"step": 170400
},
{
"epoch": 55.10665804783452,
"grad_norm": 1.604475498199463,
"learning_rate": 0.001,
"loss": 1.3274,
"step": 170500
},
{
"epoch": 55.138978668390436,
"grad_norm": 6.541220664978027,
"learning_rate": 0.001,
"loss": 1.332,
"step": 170600
},
{
"epoch": 55.17129928894635,
"grad_norm": 2.1754775047302246,
"learning_rate": 0.001,
"loss": 1.3366,
"step": 170700
},
{
"epoch": 55.203619909502265,
"grad_norm": 1.566157341003418,
"learning_rate": 0.001,
"loss": 1.3424,
"step": 170800
},
{
"epoch": 55.23594053005818,
"grad_norm": 1.5534968376159668,
"learning_rate": 0.001,
"loss": 1.3364,
"step": 170900
},
{
"epoch": 55.268261150614094,
"grad_norm": 2.1709282398223877,
"learning_rate": 0.001,
"loss": 1.3536,
"step": 171000
},
{
"epoch": 55.30058177117001,
"grad_norm": 1.6569381952285767,
"learning_rate": 0.001,
"loss": 1.3321,
"step": 171100
},
{
"epoch": 55.33290239172592,
"grad_norm": 1.758289098739624,
"learning_rate": 0.001,
"loss": 1.3522,
"step": 171200
},
{
"epoch": 55.36522301228184,
"grad_norm": 1.7878657579421997,
"learning_rate": 0.001,
"loss": 1.3564,
"step": 171300
},
{
"epoch": 55.39754363283775,
"grad_norm": 1.8341219425201416,
"learning_rate": 0.001,
"loss": 1.3519,
"step": 171400
},
{
"epoch": 55.42986425339367,
"grad_norm": 1.522491216659546,
"learning_rate": 0.001,
"loss": 1.3586,
"step": 171500
},
{
"epoch": 55.46218487394958,
"grad_norm": 2.06044864654541,
"learning_rate": 0.001,
"loss": 1.3746,
"step": 171600
},
{
"epoch": 55.494505494505496,
"grad_norm": 1.9041239023208618,
"learning_rate": 0.001,
"loss": 1.3811,
"step": 171700
},
{
"epoch": 55.52682611506141,
"grad_norm": 1.7315250635147095,
"learning_rate": 0.001,
"loss": 1.3812,
"step": 171800
},
{
"epoch": 55.559146735617325,
"grad_norm": 1.8228223323822021,
"learning_rate": 0.001,
"loss": 1.3803,
"step": 171900
},
{
"epoch": 55.59146735617324,
"grad_norm": 1.6775150299072266,
"learning_rate": 0.001,
"loss": 1.395,
"step": 172000
},
{
"epoch": 55.623787976729155,
"grad_norm": 1.7854257822036743,
"learning_rate": 0.001,
"loss": 1.3913,
"step": 172100
},
{
"epoch": 55.65610859728507,
"grad_norm": 1.8913499116897583,
"learning_rate": 0.001,
"loss": 1.3857,
"step": 172200
},
{
"epoch": 55.688429217840984,
"grad_norm": 1.7007423639297485,
"learning_rate": 0.001,
"loss": 1.411,
"step": 172300
},
{
"epoch": 55.7207498383969,
"grad_norm": 1.6376309394836426,
"learning_rate": 0.001,
"loss": 1.3868,
"step": 172400
},
{
"epoch": 55.75307045895281,
"grad_norm": 1.9880905151367188,
"learning_rate": 0.001,
"loss": 1.4165,
"step": 172500
},
{
"epoch": 55.78539107950873,
"grad_norm": 1.8876042366027832,
"learning_rate": 0.001,
"loss": 1.3944,
"step": 172600
},
{
"epoch": 55.81771170006464,
"grad_norm": 1.67818284034729,
"learning_rate": 0.001,
"loss": 1.4349,
"step": 172700
},
{
"epoch": 55.85003232062056,
"grad_norm": 1.440558671951294,
"learning_rate": 0.001,
"loss": 1.4127,
"step": 172800
},
{
"epoch": 55.88235294117647,
"grad_norm": 1.8764231204986572,
"learning_rate": 0.001,
"loss": 1.4169,
"step": 172900
},
{
"epoch": 55.914673561732386,
"grad_norm": 2.102301597595215,
"learning_rate": 0.001,
"loss": 1.4181,
"step": 173000
},
{
"epoch": 55.9469941822883,
"grad_norm": 1.709457278251648,
"learning_rate": 0.001,
"loss": 1.4222,
"step": 173100
},
{
"epoch": 55.979314802844215,
"grad_norm": 1.5915135145187378,
"learning_rate": 0.001,
"loss": 1.4292,
"step": 173200
},
{
"epoch": 56.01163542340013,
"grad_norm": 2.3316972255706787,
"learning_rate": 0.001,
"loss": 1.378,
"step": 173300
},
{
"epoch": 56.043956043956044,
"grad_norm": 2.285443067550659,
"learning_rate": 0.001,
"loss": 1.2946,
"step": 173400
},
{
"epoch": 56.07627666451196,
"grad_norm": 1.96236252784729,
"learning_rate": 0.001,
"loss": 1.302,
"step": 173500
},
{
"epoch": 56.10859728506787,
"grad_norm": 2.4459619522094727,
"learning_rate": 0.001,
"loss": 1.304,
"step": 173600
},
{
"epoch": 56.14091790562379,
"grad_norm": 2.265803813934326,
"learning_rate": 0.001,
"loss": 1.3218,
"step": 173700
},
{
"epoch": 56.1732385261797,
"grad_norm": 1.7974573373794556,
"learning_rate": 0.001,
"loss": 1.3045,
"step": 173800
},
{
"epoch": 56.20555914673562,
"grad_norm": 1.8836841583251953,
"learning_rate": 0.001,
"loss": 1.3102,
"step": 173900
},
{
"epoch": 56.23787976729153,
"grad_norm": 1.8796806335449219,
"learning_rate": 0.001,
"loss": 1.3573,
"step": 174000
},
{
"epoch": 56.270200387847446,
"grad_norm": 1.8522303104400635,
"learning_rate": 0.001,
"loss": 1.3435,
"step": 174100
},
{
"epoch": 56.30252100840336,
"grad_norm": 1.5742342472076416,
"learning_rate": 0.001,
"loss": 1.3479,
"step": 174200
},
{
"epoch": 56.334841628959275,
"grad_norm": 1.655555248260498,
"learning_rate": 0.001,
"loss": 1.3261,
"step": 174300
},
{
"epoch": 56.36716224951519,
"grad_norm": 1.8719924688339233,
"learning_rate": 0.001,
"loss": 1.3298,
"step": 174400
},
{
"epoch": 56.399482870071104,
"grad_norm": 1.8895421028137207,
"learning_rate": 0.001,
"loss": 1.3586,
"step": 174500
},
{
"epoch": 56.43180349062702,
"grad_norm": 1.98106849193573,
"learning_rate": 0.001,
"loss": 1.3609,
"step": 174600
},
{
"epoch": 56.46412411118293,
"grad_norm": 1.809706687927246,
"learning_rate": 0.001,
"loss": 1.3688,
"step": 174700
},
{
"epoch": 56.49644473173885,
"grad_norm": 1.962716817855835,
"learning_rate": 0.001,
"loss": 1.3798,
"step": 174800
},
{
"epoch": 56.52876535229476,
"grad_norm": 2.347630500793457,
"learning_rate": 0.001,
"loss": 1.3793,
"step": 174900
},
{
"epoch": 56.56108597285068,
"grad_norm": 1.7523319721221924,
"learning_rate": 0.001,
"loss": 1.3641,
"step": 175000
},
{
"epoch": 56.59340659340659,
"grad_norm": 2.0193288326263428,
"learning_rate": 0.001,
"loss": 1.3682,
"step": 175100
},
{
"epoch": 56.625727213962506,
"grad_norm": 1.7714121341705322,
"learning_rate": 0.001,
"loss": 1.3583,
"step": 175200
},
{
"epoch": 56.65804783451842,
"grad_norm": 1.6798348426818848,
"learning_rate": 0.001,
"loss": 1.371,
"step": 175300
},
{
"epoch": 56.690368455074335,
"grad_norm": 1.8950109481811523,
"learning_rate": 0.001,
"loss": 1.3803,
"step": 175400
},
{
"epoch": 56.72268907563025,
"grad_norm": 2.079096555709839,
"learning_rate": 0.001,
"loss": 1.383,
"step": 175500
},
{
"epoch": 56.755009696186164,
"grad_norm": 1.8894920349121094,
"learning_rate": 0.001,
"loss": 1.3906,
"step": 175600
},
{
"epoch": 56.78733031674208,
"grad_norm": 1.8664498329162598,
"learning_rate": 0.001,
"loss": 1.3867,
"step": 175700
},
{
"epoch": 56.81965093729799,
"grad_norm": 1.9517700672149658,
"learning_rate": 0.001,
"loss": 1.4043,
"step": 175800
},
{
"epoch": 56.85197155785391,
"grad_norm": 1.8154116868972778,
"learning_rate": 0.001,
"loss": 1.4082,
"step": 175900
},
{
"epoch": 56.88429217840982,
"grad_norm": 1.7526671886444092,
"learning_rate": 0.001,
"loss": 1.4217,
"step": 176000
},
{
"epoch": 56.91661279896574,
"grad_norm": 1.6760896444320679,
"learning_rate": 0.001,
"loss": 1.411,
"step": 176100
},
{
"epoch": 56.94893341952165,
"grad_norm": 1.8976603746414185,
"learning_rate": 0.001,
"loss": 1.4064,
"step": 176200
},
{
"epoch": 56.981254040077566,
"grad_norm": 1.5558828115463257,
"learning_rate": 0.001,
"loss": 1.4082,
"step": 176300
},
{
"epoch": 57.01357466063349,
"grad_norm": 1.8569399118423462,
"learning_rate": 0.001,
"loss": 1.3491,
"step": 176400
},
{
"epoch": 57.0458952811894,
"grad_norm": 2.1090731620788574,
"learning_rate": 0.001,
"loss": 1.2905,
"step": 176500
},
{
"epoch": 57.07821590174532,
"grad_norm": 1.87632417678833,
"learning_rate": 0.001,
"loss": 1.2937,
"step": 176600
},
{
"epoch": 57.11053652230123,
"grad_norm": 2.033785820007324,
"learning_rate": 0.001,
"loss": 1.2854,
"step": 176700
},
{
"epoch": 57.142857142857146,
"grad_norm": 3.1014299392700195,
"learning_rate": 0.001,
"loss": 1.3014,
"step": 176800
},
{
"epoch": 57.17517776341306,
"grad_norm": 1.5487799644470215,
"learning_rate": 0.001,
"loss": 1.2949,
"step": 176900
},
{
"epoch": 57.207498383968975,
"grad_norm": 1.8203353881835938,
"learning_rate": 0.001,
"loss": 1.3163,
"step": 177000
},
{
"epoch": 57.23981900452489,
"grad_norm": 2.1298274993896484,
"learning_rate": 0.001,
"loss": 1.3148,
"step": 177100
},
{
"epoch": 57.272139625080804,
"grad_norm": 2.3572335243225098,
"learning_rate": 0.001,
"loss": 1.3131,
"step": 177200
},
{
"epoch": 57.30446024563672,
"grad_norm": 1.9870686531066895,
"learning_rate": 0.001,
"loss": 1.3295,
"step": 177300
},
{
"epoch": 57.33678086619263,
"grad_norm": 1.9002041816711426,
"learning_rate": 0.001,
"loss": 1.3113,
"step": 177400
},
{
"epoch": 57.36910148674855,
"grad_norm": 2.0429224967956543,
"learning_rate": 0.001,
"loss": 1.3369,
"step": 177500
},
{
"epoch": 57.40142210730446,
"grad_norm": 2.353167772293091,
"learning_rate": 0.001,
"loss": 1.3369,
"step": 177600
},
{
"epoch": 57.43374272786038,
"grad_norm": 1.6245406866073608,
"learning_rate": 0.001,
"loss": 1.3467,
"step": 177700
},
{
"epoch": 57.46606334841629,
"grad_norm": 1.77859628200531,
"learning_rate": 0.001,
"loss": 1.3399,
"step": 177800
},
{
"epoch": 57.498383968972206,
"grad_norm": 1.8216441869735718,
"learning_rate": 0.001,
"loss": 1.3448,
"step": 177900
},
{
"epoch": 57.53070458952812,
"grad_norm": 1.8462305068969727,
"learning_rate": 0.001,
"loss": 1.3505,
"step": 178000
},
{
"epoch": 57.563025210084035,
"grad_norm": 2.081976890563965,
"learning_rate": 0.001,
"loss": 1.3565,
"step": 178100
},
{
"epoch": 57.59534583063995,
"grad_norm": 1.996206521987915,
"learning_rate": 0.001,
"loss": 1.3618,
"step": 178200
},
{
"epoch": 57.627666451195864,
"grad_norm": 2.0252761840820312,
"learning_rate": 0.001,
"loss": 1.3762,
"step": 178300
},
{
"epoch": 57.65998707175178,
"grad_norm": 1.6856194734573364,
"learning_rate": 0.001,
"loss": 1.3561,
"step": 178400
},
{
"epoch": 57.69230769230769,
"grad_norm": 1.955251693725586,
"learning_rate": 0.001,
"loss": 1.3857,
"step": 178500
},
{
"epoch": 57.72462831286361,
"grad_norm": 1.9201545715332031,
"learning_rate": 0.001,
"loss": 1.3897,
"step": 178600
},
{
"epoch": 57.75694893341952,
"grad_norm": 2.1210641860961914,
"learning_rate": 0.001,
"loss": 1.3676,
"step": 178700
},
{
"epoch": 57.78926955397544,
"grad_norm": 1.891424536705017,
"learning_rate": 0.001,
"loss": 1.3912,
"step": 178800
},
{
"epoch": 57.82159017453135,
"grad_norm": 1.859992504119873,
"learning_rate": 0.001,
"loss": 1.3864,
"step": 178900
},
{
"epoch": 57.853910795087266,
"grad_norm": 2.3439786434173584,
"learning_rate": 0.001,
"loss": 1.3738,
"step": 179000
},
{
"epoch": 57.88623141564318,
"grad_norm": 1.835242748260498,
"learning_rate": 0.001,
"loss": 1.3923,
"step": 179100
},
{
"epoch": 57.918552036199095,
"grad_norm": 2.018841028213501,
"learning_rate": 0.001,
"loss": 1.3933,
"step": 179200
},
{
"epoch": 57.95087265675501,
"grad_norm": 2.056886672973633,
"learning_rate": 0.001,
"loss": 1.3979,
"step": 179300
},
{
"epoch": 57.983193277310924,
"grad_norm": 2.031996965408325,
"learning_rate": 0.001,
"loss": 1.3989,
"step": 179400
},
{
"epoch": 58.01551389786684,
"grad_norm": 1.9441972970962524,
"learning_rate": 0.001,
"loss": 1.3036,
"step": 179500
},
{
"epoch": 58.04783451842275,
"grad_norm": 2.2586889266967773,
"learning_rate": 0.001,
"loss": 1.2679,
"step": 179600
},
{
"epoch": 58.08015513897867,
"grad_norm": 2.025006055831909,
"learning_rate": 0.001,
"loss": 1.2601,
"step": 179700
},
{
"epoch": 58.11247575953458,
"grad_norm": 4.384105682373047,
"learning_rate": 0.001,
"loss": 1.2748,
"step": 179800
},
{
"epoch": 58.1447963800905,
"grad_norm": 1.9416863918304443,
"learning_rate": 0.001,
"loss": 1.2984,
"step": 179900
},
{
"epoch": 58.17711700064641,
"grad_norm": 2.0471954345703125,
"learning_rate": 0.001,
"loss": 1.2953,
"step": 180000
},
{
"epoch": 58.209437621202326,
"grad_norm": 1.9570256471633911,
"learning_rate": 0.001,
"loss": 1.2878,
"step": 180100
},
{
"epoch": 58.24175824175824,
"grad_norm": 2.0297162532806396,
"learning_rate": 0.001,
"loss": 1.3079,
"step": 180200
},
{
"epoch": 58.274078862314155,
"grad_norm": 1.9571456909179688,
"learning_rate": 0.001,
"loss": 1.3165,
"step": 180300
},
{
"epoch": 58.30639948287007,
"grad_norm": 3.118157148361206,
"learning_rate": 0.001,
"loss": 1.3078,
"step": 180400
},
{
"epoch": 58.338720103425985,
"grad_norm": 2.2558462619781494,
"learning_rate": 0.001,
"loss": 1.3159,
"step": 180500
},
{
"epoch": 58.3710407239819,
"grad_norm": 2.5575222969055176,
"learning_rate": 0.001,
"loss": 1.3179,
"step": 180600
},
{
"epoch": 58.403361344537814,
"grad_norm": 2.0723485946655273,
"learning_rate": 0.001,
"loss": 1.3242,
"step": 180700
},
{
"epoch": 58.43568196509373,
"grad_norm": 2.4277594089508057,
"learning_rate": 0.001,
"loss": 1.3392,
"step": 180800
},
{
"epoch": 58.46800258564964,
"grad_norm": 2.787843942642212,
"learning_rate": 0.001,
"loss": 1.3318,
"step": 180900
},
{
"epoch": 58.50032320620556,
"grad_norm": 2.410322666168213,
"learning_rate": 0.001,
"loss": 1.3433,
"step": 181000
},
{
"epoch": 58.53264382676147,
"grad_norm": 2.142733335494995,
"learning_rate": 0.001,
"loss": 1.3394,
"step": 181100
},
{
"epoch": 58.56496444731739,
"grad_norm": 2.8335678577423096,
"learning_rate": 0.001,
"loss": 1.3411,
"step": 181200
},
{
"epoch": 58.5972850678733,
"grad_norm": 2.7325358390808105,
"learning_rate": 0.001,
"loss": 1.3377,
"step": 181300
},
{
"epoch": 58.629605688429216,
"grad_norm": 2.1823666095733643,
"learning_rate": 0.001,
"loss": 1.3473,
"step": 181400
},
{
"epoch": 58.66192630898513,
"grad_norm": 2.302861213684082,
"learning_rate": 0.001,
"loss": 1.36,
"step": 181500
},
{
"epoch": 58.694246929541045,
"grad_norm": 2.170161485671997,
"learning_rate": 0.001,
"loss": 1.3743,
"step": 181600
},
{
"epoch": 58.72656755009696,
"grad_norm": 2.141266345977783,
"learning_rate": 0.001,
"loss": 1.3702,
"step": 181700
},
{
"epoch": 58.758888170652874,
"grad_norm": 2.1460530757904053,
"learning_rate": 0.001,
"loss": 1.3566,
"step": 181800
},
{
"epoch": 58.79120879120879,
"grad_norm": 2.7301716804504395,
"learning_rate": 0.001,
"loss": 1.3815,
"step": 181900
},
{
"epoch": 58.8235294117647,
"grad_norm": 2.333367109298706,
"learning_rate": 0.001,
"loss": 1.372,
"step": 182000
},
{
"epoch": 58.85585003232062,
"grad_norm": 2.421165943145752,
"learning_rate": 0.001,
"loss": 1.3698,
"step": 182100
},
{
"epoch": 58.88817065287653,
"grad_norm": 2.190744638442993,
"learning_rate": 0.001,
"loss": 1.3695,
"step": 182200
},
{
"epoch": 58.92049127343245,
"grad_norm": 2.4283175468444824,
"learning_rate": 0.001,
"loss": 1.3902,
"step": 182300
},
{
"epoch": 58.95281189398836,
"grad_norm": 2.749220132827759,
"learning_rate": 0.001,
"loss": 1.3813,
"step": 182400
},
{
"epoch": 58.985132514544276,
"grad_norm": 1.7633317708969116,
"learning_rate": 0.001,
"loss": 1.3811,
"step": 182500
},
{
"epoch": 59.0174531351002,
"grad_norm": 2.3028452396392822,
"learning_rate": 0.001,
"loss": 1.3115,
"step": 182600
},
{
"epoch": 59.04977375565611,
"grad_norm": 1.91004478931427,
"learning_rate": 0.001,
"loss": 1.2523,
"step": 182700
},
{
"epoch": 59.08209437621203,
"grad_norm": 1.9824846982955933,
"learning_rate": 0.001,
"loss": 1.2691,
"step": 182800
},
{
"epoch": 59.11441499676794,
"grad_norm": 1.9162917137145996,
"learning_rate": 0.001,
"loss": 1.2654,
"step": 182900
},
{
"epoch": 59.146735617323856,
"grad_norm": 2.174314022064209,
"learning_rate": 0.001,
"loss": 1.2729,
"step": 183000
},
{
"epoch": 59.17905623787977,
"grad_norm": 1.950962781906128,
"learning_rate": 0.001,
"loss": 1.2842,
"step": 183100
},
{
"epoch": 59.211376858435685,
"grad_norm": 2.099749803543091,
"learning_rate": 0.001,
"loss": 1.2827,
"step": 183200
},
{
"epoch": 59.2436974789916,
"grad_norm": 1.7778706550598145,
"learning_rate": 0.001,
"loss": 1.2878,
"step": 183300
},
{
"epoch": 59.276018099547514,
"grad_norm": 2.519252300262451,
"learning_rate": 0.001,
"loss": 1.2948,
"step": 183400
},
{
"epoch": 59.30833872010343,
"grad_norm": 2.304509162902832,
"learning_rate": 0.001,
"loss": 1.2958,
"step": 183500
},
{
"epoch": 59.34065934065934,
"grad_norm": 2.029158353805542,
"learning_rate": 0.001,
"loss": 1.3077,
"step": 183600
},
{
"epoch": 59.37297996121526,
"grad_norm": 2.7044732570648193,
"learning_rate": 0.001,
"loss": 1.3067,
"step": 183700
},
{
"epoch": 59.40530058177117,
"grad_norm": 2.5257177352905273,
"learning_rate": 0.001,
"loss": 1.3115,
"step": 183800
},
{
"epoch": 59.43762120232709,
"grad_norm": 2.422498941421509,
"learning_rate": 0.001,
"loss": 1.3333,
"step": 183900
},
{
"epoch": 59.469941822883,
"grad_norm": 2.1336445808410645,
"learning_rate": 0.001,
"loss": 1.3216,
"step": 184000
},
{
"epoch": 59.502262443438916,
"grad_norm": 1.7418371438980103,
"learning_rate": 0.001,
"loss": 1.3296,
"step": 184100
},
{
"epoch": 59.53458306399483,
"grad_norm": 1.9461792707443237,
"learning_rate": 0.001,
"loss": 1.3301,
"step": 184200
},
{
"epoch": 59.566903684550745,
"grad_norm": 1.9766813516616821,
"learning_rate": 0.001,
"loss": 1.3293,
"step": 184300
},
{
"epoch": 59.59922430510666,
"grad_norm": 1.7881988286972046,
"learning_rate": 0.001,
"loss": 1.3434,
"step": 184400
},
{
"epoch": 59.631544925662574,
"grad_norm": 2.1620521545410156,
"learning_rate": 0.001,
"loss": 1.3423,
"step": 184500
},
{
"epoch": 59.66386554621849,
"grad_norm": 1.9429798126220703,
"learning_rate": 0.001,
"loss": 1.3373,
"step": 184600
},
{
"epoch": 59.6961861667744,
"grad_norm": 1.764739990234375,
"learning_rate": 0.001,
"loss": 1.3384,
"step": 184700
},
{
"epoch": 59.72850678733032,
"grad_norm": 1.7464152574539185,
"learning_rate": 0.001,
"loss": 1.3531,
"step": 184800
},
{
"epoch": 59.76082740788623,
"grad_norm": 2.2472636699676514,
"learning_rate": 0.001,
"loss": 1.359,
"step": 184900
},
{
"epoch": 59.79314802844215,
"grad_norm": 1.8305083513259888,
"learning_rate": 0.001,
"loss": 1.353,
"step": 185000
},
{
"epoch": 59.82546864899806,
"grad_norm": 1.6903504133224487,
"learning_rate": 0.001,
"loss": 1.3556,
"step": 185100
},
{
"epoch": 59.857789269553976,
"grad_norm": 2.093635082244873,
"learning_rate": 0.001,
"loss": 1.3483,
"step": 185200
},
{
"epoch": 59.89010989010989,
"grad_norm": 2.056464910507202,
"learning_rate": 0.001,
"loss": 1.3502,
"step": 185300
},
{
"epoch": 59.922430510665805,
"grad_norm": 1.7775726318359375,
"learning_rate": 0.001,
"loss": 1.3602,
"step": 185400
},
{
"epoch": 59.95475113122172,
"grad_norm": 1.6163804531097412,
"learning_rate": 0.001,
"loss": 1.3618,
"step": 185500
},
{
"epoch": 59.987071751777634,
"grad_norm": 1.7184257507324219,
"learning_rate": 0.001,
"loss": 1.3704,
"step": 185600
},
{
"epoch": 60.01939237233355,
"grad_norm": 1.7940794229507446,
"learning_rate": 0.001,
"loss": 1.2951,
"step": 185700
},
{
"epoch": 60.05171299288946,
"grad_norm": 1.667309284210205,
"learning_rate": 0.001,
"loss": 1.248,
"step": 185800
},
{
"epoch": 60.08403361344538,
"grad_norm": 1.5899906158447266,
"learning_rate": 0.001,
"loss": 1.2604,
"step": 185900
},
{
"epoch": 60.11635423400129,
"grad_norm": 1.9902898073196411,
"learning_rate": 0.001,
"loss": 1.2485,
"step": 186000
},
{
"epoch": 60.14867485455721,
"grad_norm": 1.522161841392517,
"learning_rate": 0.001,
"loss": 1.2438,
"step": 186100
},
{
"epoch": 60.18099547511312,
"grad_norm": 1.5227113962173462,
"learning_rate": 0.001,
"loss": 1.2651,
"step": 186200
},
{
"epoch": 60.213316095669036,
"grad_norm": 1.6835823059082031,
"learning_rate": 0.001,
"loss": 1.2652,
"step": 186300
},
{
"epoch": 60.24563671622495,
"grad_norm": 1.817858338356018,
"learning_rate": 0.001,
"loss": 1.2638,
"step": 186400
},
{
"epoch": 60.277957336780865,
"grad_norm": 1.9655687808990479,
"learning_rate": 0.001,
"loss": 1.2967,
"step": 186500
},
{
"epoch": 60.31027795733678,
"grad_norm": 2.3373231887817383,
"learning_rate": 0.001,
"loss": 1.2971,
"step": 186600
},
{
"epoch": 60.342598577892694,
"grad_norm": 2.2942612171173096,
"learning_rate": 0.001,
"loss": 1.2899,
"step": 186700
},
{
"epoch": 60.37491919844861,
"grad_norm": 2.092560291290283,
"learning_rate": 0.001,
"loss": 1.2844,
"step": 186800
},
{
"epoch": 60.40723981900452,
"grad_norm": 1.5219141244888306,
"learning_rate": 0.001,
"loss": 1.2982,
"step": 186900
},
{
"epoch": 60.43956043956044,
"grad_norm": 1.906146764755249,
"learning_rate": 0.001,
"loss": 1.2941,
"step": 187000
},
{
"epoch": 60.47188106011635,
"grad_norm": 1.6990609169006348,
"learning_rate": 0.001,
"loss": 1.2908,
"step": 187100
},
{
"epoch": 60.50420168067227,
"grad_norm": 1.9348620176315308,
"learning_rate": 0.001,
"loss": 1.3104,
"step": 187200
},
{
"epoch": 60.53652230122818,
"grad_norm": 1.878623366355896,
"learning_rate": 0.001,
"loss": 1.3012,
"step": 187300
},
{
"epoch": 60.568842921784096,
"grad_norm": 1.4890978336334229,
"learning_rate": 0.001,
"loss": 1.342,
"step": 187400
},
{
"epoch": 60.60116354234001,
"grad_norm": 3.4084646701812744,
"learning_rate": 0.001,
"loss": 1.3242,
"step": 187500
},
{
"epoch": 60.633484162895925,
"grad_norm": 1.784811019897461,
"learning_rate": 0.001,
"loss": 1.3028,
"step": 187600
},
{
"epoch": 60.66580478345184,
"grad_norm": 1.9564650058746338,
"learning_rate": 0.001,
"loss": 1.3375,
"step": 187700
},
{
"epoch": 60.698125404007754,
"grad_norm": 1.9819107055664062,
"learning_rate": 0.001,
"loss": 1.3309,
"step": 187800
},
{
"epoch": 60.73044602456367,
"grad_norm": 1.938475489616394,
"learning_rate": 0.001,
"loss": 1.3285,
"step": 187900
},
{
"epoch": 60.762766645119584,
"grad_norm": 1.7497060298919678,
"learning_rate": 0.001,
"loss": 1.3384,
"step": 188000
},
{
"epoch": 60.7950872656755,
"grad_norm": 1.5079044103622437,
"learning_rate": 0.001,
"loss": 1.342,
"step": 188100
},
{
"epoch": 60.82740788623141,
"grad_norm": 1.7045460939407349,
"learning_rate": 0.001,
"loss": 1.3398,
"step": 188200
},
{
"epoch": 60.85972850678733,
"grad_norm": 1.809365153312683,
"learning_rate": 0.001,
"loss": 1.3565,
"step": 188300
},
{
"epoch": 60.89204912734324,
"grad_norm": 1.9220020771026611,
"learning_rate": 0.001,
"loss": 1.3592,
"step": 188400
},
{
"epoch": 60.924369747899156,
"grad_norm": 1.7121220827102661,
"learning_rate": 0.001,
"loss": 1.3586,
"step": 188500
},
{
"epoch": 60.95669036845507,
"grad_norm": 1.5285897254943848,
"learning_rate": 0.001,
"loss": 1.3651,
"step": 188600
},
{
"epoch": 60.98901098901099,
"grad_norm": 1.6838990449905396,
"learning_rate": 0.001,
"loss": 1.3529,
"step": 188700
},
{
"epoch": 61.02133160956691,
"grad_norm": 1.7248560190200806,
"learning_rate": 0.001,
"loss": 1.2819,
"step": 188800
},
{
"epoch": 61.05365223012282,
"grad_norm": 1.699639916419983,
"learning_rate": 0.001,
"loss": 1.2323,
"step": 188900
},
{
"epoch": 61.085972850678736,
"grad_norm": 1.7705409526824951,
"learning_rate": 0.001,
"loss": 1.2365,
"step": 189000
},
{
"epoch": 61.11829347123465,
"grad_norm": 1.709913969039917,
"learning_rate": 0.001,
"loss": 1.257,
"step": 189100
},
{
"epoch": 61.150614091790565,
"grad_norm": 1.7027641534805298,
"learning_rate": 0.001,
"loss": 1.2317,
"step": 189200
},
{
"epoch": 61.18293471234648,
"grad_norm": 1.5179117918014526,
"learning_rate": 0.001,
"loss": 1.2539,
"step": 189300
},
{
"epoch": 61.215255332902395,
"grad_norm": 3.146793842315674,
"learning_rate": 0.001,
"loss": 1.2494,
"step": 189400
},
{
"epoch": 61.24757595345831,
"grad_norm": 1.4820927381515503,
"learning_rate": 0.001,
"loss": 1.26,
"step": 189500
},
{
"epoch": 61.279896574014224,
"grad_norm": 2.3077597618103027,
"learning_rate": 0.001,
"loss": 1.2502,
"step": 189600
},
{
"epoch": 61.31221719457014,
"grad_norm": 1.8483630418777466,
"learning_rate": 0.001,
"loss": 1.2922,
"step": 189700
},
{
"epoch": 61.34453781512605,
"grad_norm": 1.5808696746826172,
"learning_rate": 0.001,
"loss": 1.2773,
"step": 189800
},
{
"epoch": 61.37685843568197,
"grad_norm": 1.8577014207839966,
"learning_rate": 0.001,
"loss": 1.2712,
"step": 189900
},
{
"epoch": 61.40917905623788,
"grad_norm": 1.837640643119812,
"learning_rate": 0.001,
"loss": 1.2924,
"step": 190000
},
{
"epoch": 61.441499676793796,
"grad_norm": 1.4489262104034424,
"learning_rate": 0.001,
"loss": 1.2856,
"step": 190100
},
{
"epoch": 61.47382029734971,
"grad_norm": 1.756155014038086,
"learning_rate": 0.001,
"loss": 1.2945,
"step": 190200
},
{
"epoch": 61.506140917905626,
"grad_norm": 1.7968300580978394,
"learning_rate": 0.001,
"loss": 1.3083,
"step": 190300
},
{
"epoch": 61.53846153846154,
"grad_norm": 1.3242460489273071,
"learning_rate": 0.001,
"loss": 1.3004,
"step": 190400
},
{
"epoch": 61.570782159017455,
"grad_norm": 1.5521363019943237,
"learning_rate": 0.001,
"loss": 1.3086,
"step": 190500
},
{
"epoch": 61.60310277957337,
"grad_norm": 1.6019891500473022,
"learning_rate": 0.001,
"loss": 1.3037,
"step": 190600
},
{
"epoch": 61.635423400129284,
"grad_norm": 2.068342685699463,
"learning_rate": 0.001,
"loss": 1.3043,
"step": 190700
},
{
"epoch": 61.6677440206852,
"grad_norm": 1.7331029176712036,
"learning_rate": 0.001,
"loss": 1.3196,
"step": 190800
},
{
"epoch": 61.70006464124111,
"grad_norm": 2.1270153522491455,
"learning_rate": 0.001,
"loss": 1.3156,
"step": 190900
},
{
"epoch": 61.73238526179703,
"grad_norm": 1.7159593105316162,
"learning_rate": 0.001,
"loss": 1.3161,
"step": 191000
},
{
"epoch": 61.76470588235294,
"grad_norm": 1.6671262979507446,
"learning_rate": 0.001,
"loss": 1.3241,
"step": 191100
},
{
"epoch": 61.79702650290886,
"grad_norm": 1.7520173788070679,
"learning_rate": 0.001,
"loss": 1.3196,
"step": 191200
},
{
"epoch": 61.82934712346477,
"grad_norm": 1.407358169555664,
"learning_rate": 0.001,
"loss": 1.3465,
"step": 191300
},
{
"epoch": 61.861667744020686,
"grad_norm": 1.7164902687072754,
"learning_rate": 0.001,
"loss": 1.3407,
"step": 191400
},
{
"epoch": 61.8939883645766,
"grad_norm": 1.714900016784668,
"learning_rate": 0.001,
"loss": 1.3496,
"step": 191500
},
{
"epoch": 61.926308985132515,
"grad_norm": 2.160857677459717,
"learning_rate": 0.001,
"loss": 1.3361,
"step": 191600
},
{
"epoch": 61.95862960568843,
"grad_norm": 1.8158776760101318,
"learning_rate": 0.001,
"loss": 1.3433,
"step": 191700
},
{
"epoch": 61.990950226244344,
"grad_norm": 1.847709059715271,
"learning_rate": 0.001,
"loss": 1.3598,
"step": 191800
},
{
"epoch": 62.02327084680026,
"grad_norm": 1.7720732688903809,
"learning_rate": 0.001,
"loss": 1.2744,
"step": 191900
},
{
"epoch": 62.05559146735617,
"grad_norm": 1.5437538623809814,
"learning_rate": 0.001,
"loss": 1.2139,
"step": 192000
},
{
"epoch": 62.08791208791209,
"grad_norm": 1.6394867897033691,
"learning_rate": 0.001,
"loss": 1.2301,
"step": 192100
},
{
"epoch": 62.120232708468,
"grad_norm": 1.5179033279418945,
"learning_rate": 0.001,
"loss": 1.2352,
"step": 192200
},
{
"epoch": 62.15255332902392,
"grad_norm": 2.0513007640838623,
"learning_rate": 0.001,
"loss": 1.2292,
"step": 192300
},
{
"epoch": 62.18487394957983,
"grad_norm": 1.76735258102417,
"learning_rate": 0.001,
"loss": 1.2531,
"step": 192400
},
{
"epoch": 62.217194570135746,
"grad_norm": 1.9428764581680298,
"learning_rate": 0.001,
"loss": 1.2507,
"step": 192500
},
{
"epoch": 62.24951519069166,
"grad_norm": 1.3896160125732422,
"learning_rate": 0.001,
"loss": 1.2556,
"step": 192600
},
{
"epoch": 62.281835811247575,
"grad_norm": 1.634589433670044,
"learning_rate": 0.001,
"loss": 1.2598,
"step": 192700
},
{
"epoch": 62.31415643180349,
"grad_norm": 1.906327486038208,
"learning_rate": 0.001,
"loss": 1.2587,
"step": 192800
},
{
"epoch": 62.346477052359404,
"grad_norm": 1.7237509489059448,
"learning_rate": 0.001,
"loss": 1.2544,
"step": 192900
},
{
"epoch": 62.37879767291532,
"grad_norm": 1.5861009359359741,
"learning_rate": 0.001,
"loss": 1.2693,
"step": 193000
},
{
"epoch": 62.41111829347123,
"grad_norm": 1.6999458074569702,
"learning_rate": 0.001,
"loss": 1.2815,
"step": 193100
},
{
"epoch": 62.44343891402715,
"grad_norm": 1.9093917608261108,
"learning_rate": 0.001,
"loss": 1.268,
"step": 193200
},
{
"epoch": 62.47575953458306,
"grad_norm": 1.64690363407135,
"learning_rate": 0.001,
"loss": 1.2734,
"step": 193300
},
{
"epoch": 62.50808015513898,
"grad_norm": 1.8700169324874878,
"learning_rate": 0.001,
"loss": 1.2684,
"step": 193400
},
{
"epoch": 62.54040077569489,
"grad_norm": 1.862876534461975,
"learning_rate": 0.001,
"loss": 1.3038,
"step": 193500
},
{
"epoch": 62.572721396250806,
"grad_norm": 1.5933880805969238,
"learning_rate": 0.001,
"loss": 1.3032,
"step": 193600
},
{
"epoch": 62.60504201680672,
"grad_norm": 1.569177269935608,
"learning_rate": 0.001,
"loss": 1.2934,
"step": 193700
},
{
"epoch": 62.637362637362635,
"grad_norm": 1.7045800685882568,
"learning_rate": 0.001,
"loss": 1.3054,
"step": 193800
},
{
"epoch": 62.66968325791855,
"grad_norm": 1.502102017402649,
"learning_rate": 0.001,
"loss": 1.2973,
"step": 193900
},
{
"epoch": 62.702003878474464,
"grad_norm": 1.3741238117218018,
"learning_rate": 0.001,
"loss": 1.3095,
"step": 194000
},
{
"epoch": 62.73432449903038,
"grad_norm": 1.508573293685913,
"learning_rate": 0.001,
"loss": 1.3103,
"step": 194100
},
{
"epoch": 62.76664511958629,
"grad_norm": 1.6713041067123413,
"learning_rate": 0.001,
"loss": 1.3066,
"step": 194200
},
{
"epoch": 62.79896574014221,
"grad_norm": 2.0911645889282227,
"learning_rate": 0.001,
"loss": 1.3418,
"step": 194300
},
{
"epoch": 62.83128636069812,
"grad_norm": 1.6197428703308105,
"learning_rate": 0.001,
"loss": 1.3179,
"step": 194400
},
{
"epoch": 62.86360698125404,
"grad_norm": 1.4563323259353638,
"learning_rate": 0.001,
"loss": 1.3078,
"step": 194500
},
{
"epoch": 62.89592760180995,
"grad_norm": 1.447568655014038,
"learning_rate": 0.001,
"loss": 1.3128,
"step": 194600
},
{
"epoch": 62.928248222365866,
"grad_norm": 1.577903151512146,
"learning_rate": 0.001,
"loss": 1.3518,
"step": 194700
},
{
"epoch": 62.96056884292178,
"grad_norm": 1.6062977313995361,
"learning_rate": 0.001,
"loss": 1.3364,
"step": 194800
},
{
"epoch": 62.992889463477695,
"grad_norm": 1.6443058252334595,
"learning_rate": 0.001,
"loss": 1.3298,
"step": 194900
},
{
"epoch": 63.02521008403362,
"grad_norm": 1.7947728633880615,
"learning_rate": 0.001,
"loss": 1.2272,
"step": 195000
},
{
"epoch": 63.05753070458953,
"grad_norm": 1.7414778470993042,
"learning_rate": 0.001,
"loss": 1.2157,
"step": 195100
},
{
"epoch": 63.089851325145446,
"grad_norm": 1.7546195983886719,
"learning_rate": 0.001,
"loss": 1.2209,
"step": 195200
},
{
"epoch": 63.12217194570136,
"grad_norm": 1.7215334177017212,
"learning_rate": 0.001,
"loss": 1.2283,
"step": 195300
},
{
"epoch": 63.154492566257275,
"grad_norm": 1.6781865358352661,
"learning_rate": 0.001,
"loss": 1.2115,
"step": 195400
},
{
"epoch": 63.18681318681319,
"grad_norm": 2.100222110748291,
"learning_rate": 0.001,
"loss": 1.2348,
"step": 195500
},
{
"epoch": 63.219133807369104,
"grad_norm": 1.6543059349060059,
"learning_rate": 0.001,
"loss": 1.2364,
"step": 195600
},
{
"epoch": 63.25145442792502,
"grad_norm": 1.8639600276947021,
"learning_rate": 0.001,
"loss": 1.2418,
"step": 195700
},
{
"epoch": 63.28377504848093,
"grad_norm": 1.4900529384613037,
"learning_rate": 0.001,
"loss": 1.2321,
"step": 195800
},
{
"epoch": 63.31609566903685,
"grad_norm": 1.7212891578674316,
"learning_rate": 0.001,
"loss": 1.2556,
"step": 195900
},
{
"epoch": 63.34841628959276,
"grad_norm": 1.827954649925232,
"learning_rate": 0.001,
"loss": 1.2661,
"step": 196000
},
{
"epoch": 63.38073691014868,
"grad_norm": 2.1511712074279785,
"learning_rate": 0.001,
"loss": 1.2529,
"step": 196100
},
{
"epoch": 63.41305753070459,
"grad_norm": 1.4950047731399536,
"learning_rate": 0.001,
"loss": 1.2808,
"step": 196200
},
{
"epoch": 63.445378151260506,
"grad_norm": 1.7102982997894287,
"learning_rate": 0.001,
"loss": 1.2631,
"step": 196300
},
{
"epoch": 63.47769877181642,
"grad_norm": 1.6856714487075806,
"learning_rate": 0.001,
"loss": 1.2602,
"step": 196400
},
{
"epoch": 63.510019392372335,
"grad_norm": 1.7770227193832397,
"learning_rate": 0.001,
"loss": 1.2767,
"step": 196500
},
{
"epoch": 63.54234001292825,
"grad_norm": 2.618396520614624,
"learning_rate": 0.001,
"loss": 1.2671,
"step": 196600
},
{
"epoch": 63.574660633484164,
"grad_norm": 2.238501787185669,
"learning_rate": 0.001,
"loss": 1.2707,
"step": 196700
},
{
"epoch": 63.60698125404008,
"grad_norm": 1.4879484176635742,
"learning_rate": 0.001,
"loss": 1.2657,
"step": 196800
},
{
"epoch": 63.63930187459599,
"grad_norm": 1.6655293703079224,
"learning_rate": 0.001,
"loss": 1.2964,
"step": 196900
},
{
"epoch": 63.67162249515191,
"grad_norm": 1.5491960048675537,
"learning_rate": 0.001,
"loss": 1.2974,
"step": 197000
},
{
"epoch": 63.70394311570782,
"grad_norm": 2.067685842514038,
"learning_rate": 0.001,
"loss": 1.2997,
"step": 197100
},
{
"epoch": 63.73626373626374,
"grad_norm": 1.9535744190216064,
"learning_rate": 0.001,
"loss": 1.298,
"step": 197200
},
{
"epoch": 63.76858435681965,
"grad_norm": 2.4942731857299805,
"learning_rate": 0.001,
"loss": 1.2957,
"step": 197300
},
{
"epoch": 63.800904977375566,
"grad_norm": 1.6687902212142944,
"learning_rate": 0.001,
"loss": 1.2998,
"step": 197400
},
{
"epoch": 63.83322559793148,
"grad_norm": 1.4590330123901367,
"learning_rate": 0.001,
"loss": 1.2974,
"step": 197500
},
{
"epoch": 63.865546218487395,
"grad_norm": 1.4519684314727783,
"learning_rate": 0.001,
"loss": 1.3264,
"step": 197600
},
{
"epoch": 63.89786683904331,
"grad_norm": 1.676956057548523,
"learning_rate": 0.001,
"loss": 1.3151,
"step": 197700
},
{
"epoch": 63.930187459599225,
"grad_norm": 1.7003228664398193,
"learning_rate": 0.001,
"loss": 1.314,
"step": 197800
},
{
"epoch": 63.96250808015514,
"grad_norm": 1.785687804222107,
"learning_rate": 0.001,
"loss": 1.3328,
"step": 197900
},
{
"epoch": 63.994828700711054,
"grad_norm": 1.8566793203353882,
"learning_rate": 0.001,
"loss": 1.3199,
"step": 198000
},
{
"epoch": 64.02714932126698,
"grad_norm": 2.3256516456604004,
"learning_rate": 0.001,
"loss": 1.2133,
"step": 198100
},
{
"epoch": 64.05946994182288,
"grad_norm": 1.8626536130905151,
"learning_rate": 0.001,
"loss": 1.1985,
"step": 198200
},
{
"epoch": 64.0917905623788,
"grad_norm": 1.5357648134231567,
"learning_rate": 0.001,
"loss": 1.2016,
"step": 198300
},
{
"epoch": 64.12411118293471,
"grad_norm": 2.2160654067993164,
"learning_rate": 0.001,
"loss": 1.2191,
"step": 198400
},
{
"epoch": 64.15643180349063,
"grad_norm": 1.763607144355774,
"learning_rate": 0.001,
"loss": 1.2095,
"step": 198500
},
{
"epoch": 64.18875242404654,
"grad_norm": 5.209096431732178,
"learning_rate": 0.001,
"loss": 1.2131,
"step": 198600
},
{
"epoch": 64.22107304460246,
"grad_norm": 1.854581356048584,
"learning_rate": 0.001,
"loss": 1.2368,
"step": 198700
},
{
"epoch": 64.25339366515837,
"grad_norm": 1.6673282384872437,
"learning_rate": 0.001,
"loss": 1.2243,
"step": 198800
},
{
"epoch": 64.28571428571429,
"grad_norm": 81.45235443115234,
"learning_rate": 0.001,
"loss": 1.2377,
"step": 198900
},
{
"epoch": 64.3180349062702,
"grad_norm": 2.5092577934265137,
"learning_rate": 0.001,
"loss": 1.2312,
"step": 199000
},
{
"epoch": 64.35035552682612,
"grad_norm": 1.8041945695877075,
"learning_rate": 0.001,
"loss": 1.2396,
"step": 199100
},
{
"epoch": 64.38267614738203,
"grad_norm": 1.6541348695755005,
"learning_rate": 0.001,
"loss": 1.2544,
"step": 199200
},
{
"epoch": 64.41499676793795,
"grad_norm": 2.0701494216918945,
"learning_rate": 0.001,
"loss": 1.2392,
"step": 199300
},
{
"epoch": 64.44731738849386,
"grad_norm": 1.6884028911590576,
"learning_rate": 0.001,
"loss": 1.2514,
"step": 199400
},
{
"epoch": 64.47963800904978,
"grad_norm": 1.775291085243225,
"learning_rate": 0.001,
"loss": 1.2639,
"step": 199500
},
{
"epoch": 64.51195862960569,
"grad_norm": 2.3357295989990234,
"learning_rate": 0.001,
"loss": 1.2685,
"step": 199600
},
{
"epoch": 64.54427925016161,
"grad_norm": 2.315829277038574,
"learning_rate": 0.001,
"loss": 1.2687,
"step": 199700
},
{
"epoch": 64.57659987071752,
"grad_norm": 1.7337130308151245,
"learning_rate": 0.001,
"loss": 1.2687,
"step": 199800
},
{
"epoch": 64.60892049127344,
"grad_norm": 1.704189658164978,
"learning_rate": 0.001,
"loss": 1.2801,
"step": 199900
},
{
"epoch": 64.64124111182934,
"grad_norm": 1.7531574964523315,
"learning_rate": 0.001,
"loss": 1.2692,
"step": 200000
},
{
"epoch": 64.67356173238527,
"grad_norm": 2.4136340618133545,
"learning_rate": 0.001,
"loss": 1.2724,
"step": 200100
},
{
"epoch": 64.70588235294117,
"grad_norm": 1.4422646760940552,
"learning_rate": 0.001,
"loss": 1.2875,
"step": 200200
},
{
"epoch": 64.7382029734971,
"grad_norm": 1.8993738889694214,
"learning_rate": 0.001,
"loss": 1.2896,
"step": 200300
},
{
"epoch": 64.770523594053,
"grad_norm": 1.7030922174453735,
"learning_rate": 0.001,
"loss": 1.2888,
"step": 200400
},
{
"epoch": 64.80284421460892,
"grad_norm": 1.6060155630111694,
"learning_rate": 0.001,
"loss": 1.3016,
"step": 200500
},
{
"epoch": 64.83516483516483,
"grad_norm": 4.930392265319824,
"learning_rate": 0.001,
"loss": 1.299,
"step": 200600
},
{
"epoch": 64.86748545572075,
"grad_norm": 1.7060980796813965,
"learning_rate": 0.001,
"loss": 1.3025,
"step": 200700
},
{
"epoch": 64.89980607627666,
"grad_norm": 1.754961609840393,
"learning_rate": 0.001,
"loss": 1.2893,
"step": 200800
},
{
"epoch": 64.93212669683258,
"grad_norm": 1.7960935831069946,
"learning_rate": 0.001,
"loss": 1.2953,
"step": 200900
},
{
"epoch": 64.96444731738849,
"grad_norm": 2.0267605781555176,
"learning_rate": 0.001,
"loss": 1.3143,
"step": 201000
},
{
"epoch": 64.99676793794441,
"grad_norm": 2.2149808406829834,
"learning_rate": 0.001,
"loss": 1.2921,
"step": 201100
},
{
"epoch": 65.02908855850032,
"grad_norm": 2.4537580013275146,
"learning_rate": 0.001,
"loss": 1.1977,
"step": 201200
},
{
"epoch": 65.06140917905624,
"grad_norm": 1.6021087169647217,
"learning_rate": 0.001,
"loss": 1.1736,
"step": 201300
},
{
"epoch": 65.09372979961215,
"grad_norm": 1.9216489791870117,
"learning_rate": 0.001,
"loss": 1.1835,
"step": 201400
},
{
"epoch": 65.12605042016807,
"grad_norm": 2.218745470046997,
"learning_rate": 0.001,
"loss": 1.2057,
"step": 201500
},
{
"epoch": 65.15837104072398,
"grad_norm": 1.8575019836425781,
"learning_rate": 0.001,
"loss": 1.2034,
"step": 201600
},
{
"epoch": 65.1906916612799,
"grad_norm": 2.2216298580169678,
"learning_rate": 0.001,
"loss": 1.2068,
"step": 201700
},
{
"epoch": 65.2230122818358,
"grad_norm": 1.9984310865402222,
"learning_rate": 0.001,
"loss": 1.1996,
"step": 201800
},
{
"epoch": 65.25533290239173,
"grad_norm": 1.926300048828125,
"learning_rate": 0.001,
"loss": 1.2163,
"step": 201900
},
{
"epoch": 65.28765352294764,
"grad_norm": 2.752197504043579,
"learning_rate": 0.001,
"loss": 1.2257,
"step": 202000
},
{
"epoch": 65.31997414350356,
"grad_norm": 1.9841582775115967,
"learning_rate": 0.001,
"loss": 1.2055,
"step": 202100
},
{
"epoch": 65.35229476405947,
"grad_norm": 2.0355589389801025,
"learning_rate": 0.001,
"loss": 1.2373,
"step": 202200
},
{
"epoch": 65.38461538461539,
"grad_norm": 1.867073893547058,
"learning_rate": 0.001,
"loss": 1.227,
"step": 202300
},
{
"epoch": 65.4169360051713,
"grad_norm": 1.8634486198425293,
"learning_rate": 0.001,
"loss": 1.2322,
"step": 202400
},
{
"epoch": 65.44925662572722,
"grad_norm": 1.804103136062622,
"learning_rate": 0.001,
"loss": 1.2494,
"step": 202500
},
{
"epoch": 65.48157724628312,
"grad_norm": 1.7396485805511475,
"learning_rate": 0.001,
"loss": 1.2445,
"step": 202600
},
{
"epoch": 65.51389786683905,
"grad_norm": 2.412324905395508,
"learning_rate": 0.001,
"loss": 1.2476,
"step": 202700
},
{
"epoch": 65.54621848739495,
"grad_norm": 1.8536796569824219,
"learning_rate": 0.001,
"loss": 1.259,
"step": 202800
},
{
"epoch": 65.57853910795087,
"grad_norm": 1.7683600187301636,
"learning_rate": 0.001,
"loss": 1.2411,
"step": 202900
},
{
"epoch": 65.61085972850678,
"grad_norm": 1.793859839439392,
"learning_rate": 0.001,
"loss": 1.2516,
"step": 203000
},
{
"epoch": 65.6431803490627,
"grad_norm": 1.6775269508361816,
"learning_rate": 0.001,
"loss": 1.2822,
"step": 203100
},
{
"epoch": 65.67550096961861,
"grad_norm": 1.8493618965148926,
"learning_rate": 0.001,
"loss": 1.2761,
"step": 203200
},
{
"epoch": 65.70782159017453,
"grad_norm": 1.6558525562286377,
"learning_rate": 0.001,
"loss": 1.2712,
"step": 203300
},
{
"epoch": 65.74014221073044,
"grad_norm": 2.0000391006469727,
"learning_rate": 0.001,
"loss": 1.2709,
"step": 203400
},
{
"epoch": 65.77246283128636,
"grad_norm": 2.3382959365844727,
"learning_rate": 0.001,
"loss": 1.2822,
"step": 203500
},
{
"epoch": 65.80478345184227,
"grad_norm": 1.79212486743927,
"learning_rate": 0.001,
"loss": 1.2702,
"step": 203600
},
{
"epoch": 65.83710407239819,
"grad_norm": 1.6742587089538574,
"learning_rate": 0.001,
"loss": 1.2835,
"step": 203700
},
{
"epoch": 65.8694246929541,
"grad_norm": 2.517657995223999,
"learning_rate": 0.001,
"loss": 1.2946,
"step": 203800
},
{
"epoch": 65.90174531351002,
"grad_norm": 1.6825872659683228,
"learning_rate": 0.001,
"loss": 1.2763,
"step": 203900
},
{
"epoch": 65.93406593406593,
"grad_norm": 2.0730693340301514,
"learning_rate": 0.001,
"loss": 1.3014,
"step": 204000
},
{
"epoch": 65.96638655462185,
"grad_norm": 2.2603330612182617,
"learning_rate": 0.001,
"loss": 1.3025,
"step": 204100
},
{
"epoch": 65.99870717517777,
"grad_norm": 2.1212308406829834,
"learning_rate": 0.001,
"loss": 1.2994,
"step": 204200
},
{
"epoch": 66.03102779573368,
"grad_norm": 2.014443874359131,
"learning_rate": 0.001,
"loss": 1.1643,
"step": 204300
},
{
"epoch": 66.0633484162896,
"grad_norm": 1.8272758722305298,
"learning_rate": 0.001,
"loss": 1.1669,
"step": 204400
},
{
"epoch": 66.0956690368455,
"grad_norm": 1.8921802043914795,
"learning_rate": 0.001,
"loss": 1.1608,
"step": 204500
},
{
"epoch": 66.12798965740143,
"grad_norm": 2.260369062423706,
"learning_rate": 0.001,
"loss": 1.2049,
"step": 204600
},
{
"epoch": 66.16031027795734,
"grad_norm": 2.3378474712371826,
"learning_rate": 0.001,
"loss": 1.175,
"step": 204700
},
{
"epoch": 66.19263089851326,
"grad_norm": 1.8673335313796997,
"learning_rate": 0.001,
"loss": 1.1971,
"step": 204800
},
{
"epoch": 66.22495151906917,
"grad_norm": 2.035820960998535,
"learning_rate": 0.001,
"loss": 1.2128,
"step": 204900
},
{
"epoch": 66.25727213962509,
"grad_norm": 1.6351029872894287,
"learning_rate": 0.001,
"loss": 1.2055,
"step": 205000
},
{
"epoch": 66.289592760181,
"grad_norm": 2.6478259563446045,
"learning_rate": 0.001,
"loss": 1.1864,
"step": 205100
},
{
"epoch": 66.32191338073692,
"grad_norm": 2.325328826904297,
"learning_rate": 0.001,
"loss": 1.2158,
"step": 205200
},
{
"epoch": 66.35423400129282,
"grad_norm": 1.886999487876892,
"learning_rate": 0.001,
"loss": 1.2038,
"step": 205300
},
{
"epoch": 66.38655462184875,
"grad_norm": 2.104828119277954,
"learning_rate": 0.001,
"loss": 1.2265,
"step": 205400
},
{
"epoch": 66.41887524240465,
"grad_norm": 1.7025809288024902,
"learning_rate": 0.001,
"loss": 1.2375,
"step": 205500
},
{
"epoch": 66.45119586296057,
"grad_norm": 2.26194429397583,
"learning_rate": 0.001,
"loss": 1.2465,
"step": 205600
},
{
"epoch": 66.48351648351648,
"grad_norm": 2.2679295539855957,
"learning_rate": 0.001,
"loss": 1.2504,
"step": 205700
},
{
"epoch": 66.5158371040724,
"grad_norm": 2.020747184753418,
"learning_rate": 0.001,
"loss": 1.2355,
"step": 205800
},
{
"epoch": 66.54815772462831,
"grad_norm": 2.4566147327423096,
"learning_rate": 0.001,
"loss": 1.2323,
"step": 205900
},
{
"epoch": 66.58047834518423,
"grad_norm": 1.9596573114395142,
"learning_rate": 0.001,
"loss": 1.2406,
"step": 206000
},
{
"epoch": 66.61279896574014,
"grad_norm": 2.363355875015259,
"learning_rate": 0.001,
"loss": 1.2559,
"step": 206100
},
{
"epoch": 66.64511958629606,
"grad_norm": 2.451207160949707,
"learning_rate": 0.001,
"loss": 1.2636,
"step": 206200
},
{
"epoch": 66.67744020685197,
"grad_norm": 1.8735805749893188,
"learning_rate": 0.001,
"loss": 1.2496,
"step": 206300
},
{
"epoch": 66.70976082740789,
"grad_norm": 2.4483344554901123,
"learning_rate": 0.001,
"loss": 1.2541,
"step": 206400
},
{
"epoch": 66.7420814479638,
"grad_norm": 2.172673463821411,
"learning_rate": 0.001,
"loss": 1.2746,
"step": 206500
},
{
"epoch": 66.77440206851972,
"grad_norm": 1.755159616470337,
"learning_rate": 0.001,
"loss": 1.2705,
"step": 206600
},
{
"epoch": 66.80672268907563,
"grad_norm": 2.2386600971221924,
"learning_rate": 0.001,
"loss": 1.2813,
"step": 206700
},
{
"epoch": 66.83904330963155,
"grad_norm": 2.0152530670166016,
"learning_rate": 0.001,
"loss": 1.2673,
"step": 206800
},
{
"epoch": 66.87136393018746,
"grad_norm": 2.021780014038086,
"learning_rate": 0.001,
"loss": 1.2761,
"step": 206900
},
{
"epoch": 66.90368455074338,
"grad_norm": 1.9706413745880127,
"learning_rate": 0.001,
"loss": 1.2577,
"step": 207000
},
{
"epoch": 66.93600517129929,
"grad_norm": 1.9766478538513184,
"learning_rate": 0.001,
"loss": 1.2831,
"step": 207100
},
{
"epoch": 66.96832579185521,
"grad_norm": 2.4492130279541016,
"learning_rate": 0.001,
"loss": 1.2767,
"step": 207200
},
{
"epoch": 67.00064641241111,
"grad_norm": 3.763784885406494,
"learning_rate": 0.001,
"loss": 1.2547,
"step": 207300
},
{
"epoch": 67.03296703296704,
"grad_norm": 1.9353259801864624,
"learning_rate": 0.001,
"loss": 1.1491,
"step": 207400
},
{
"epoch": 67.06528765352294,
"grad_norm": 2.1818177700042725,
"learning_rate": 0.001,
"loss": 1.1611,
"step": 207500
},
{
"epoch": 67.09760827407887,
"grad_norm": 2.305077314376831,
"learning_rate": 0.001,
"loss": 1.166,
"step": 207600
},
{
"epoch": 67.12992889463477,
"grad_norm": 3.8952369689941406,
"learning_rate": 0.001,
"loss": 1.1686,
"step": 207700
},
{
"epoch": 67.1622495151907,
"grad_norm": 2.74645733833313,
"learning_rate": 0.001,
"loss": 1.1822,
"step": 207800
},
{
"epoch": 67.1945701357466,
"grad_norm": 2.66611909866333,
"learning_rate": 0.001,
"loss": 1.1769,
"step": 207900
},
{
"epoch": 67.22689075630252,
"grad_norm": 4.485386848449707,
"learning_rate": 0.001,
"loss": 1.1983,
"step": 208000
},
{
"epoch": 67.25921137685843,
"grad_norm": 2.7096071243286133,
"learning_rate": 0.001,
"loss": 1.1863,
"step": 208100
},
{
"epoch": 67.29153199741435,
"grad_norm": 2.0292038917541504,
"learning_rate": 0.001,
"loss": 1.1882,
"step": 208200
},
{
"epoch": 67.32385261797026,
"grad_norm": 2.7626845836639404,
"learning_rate": 0.001,
"loss": 1.2086,
"step": 208300
},
{
"epoch": 67.35617323852618,
"grad_norm": 2.0532233715057373,
"learning_rate": 0.001,
"loss": 1.2166,
"step": 208400
},
{
"epoch": 67.38849385908209,
"grad_norm": 1.7796385288238525,
"learning_rate": 0.001,
"loss": 1.2025,
"step": 208500
},
{
"epoch": 67.42081447963801,
"grad_norm": 2.683412790298462,
"learning_rate": 0.001,
"loss": 1.2139,
"step": 208600
},
{
"epoch": 67.45313510019392,
"grad_norm": 2.9881527423858643,
"learning_rate": 0.001,
"loss": 1.2179,
"step": 208700
},
{
"epoch": 67.48545572074984,
"grad_norm": 2.409613609313965,
"learning_rate": 0.001,
"loss": 1.2216,
"step": 208800
},
{
"epoch": 67.51777634130575,
"grad_norm": 2.4920945167541504,
"learning_rate": 0.001,
"loss": 1.2171,
"step": 208900
},
{
"epoch": 67.55009696186167,
"grad_norm": 2.262345790863037,
"learning_rate": 0.001,
"loss": 1.2262,
"step": 209000
},
{
"epoch": 67.58241758241758,
"grad_norm": 2.0987603664398193,
"learning_rate": 0.001,
"loss": 1.243,
"step": 209100
},
{
"epoch": 67.6147382029735,
"grad_norm": 2.0718164443969727,
"learning_rate": 0.001,
"loss": 1.2397,
"step": 209200
},
{
"epoch": 67.6470588235294,
"grad_norm": 3.155087947845459,
"learning_rate": 0.001,
"loss": 1.2412,
"step": 209300
},
{
"epoch": 67.67937944408533,
"grad_norm": 2.312764883041382,
"learning_rate": 0.001,
"loss": 1.2456,
"step": 209400
},
{
"epoch": 67.71170006464124,
"grad_norm": 2.4429471492767334,
"learning_rate": 0.001,
"loss": 1.2403,
"step": 209500
},
{
"epoch": 67.74402068519716,
"grad_norm": 2.445016384124756,
"learning_rate": 0.001,
"loss": 1.2438,
"step": 209600
},
{
"epoch": 67.77634130575306,
"grad_norm": 2.587137460708618,
"learning_rate": 0.001,
"loss": 1.269,
"step": 209700
},
{
"epoch": 67.80866192630899,
"grad_norm": 3.0269641876220703,
"learning_rate": 0.001,
"loss": 1.2673,
"step": 209800
},
{
"epoch": 67.8409825468649,
"grad_norm": 3.1837799549102783,
"learning_rate": 0.001,
"loss": 1.2538,
"step": 209900
},
{
"epoch": 67.87330316742081,
"grad_norm": 3.0893819332122803,
"learning_rate": 0.001,
"loss": 1.2495,
"step": 210000
},
{
"epoch": 67.90562378797672,
"grad_norm": 2.351608991622925,
"learning_rate": 0.001,
"loss": 1.2596,
"step": 210100
},
{
"epoch": 67.93794440853264,
"grad_norm": 2.225374221801758,
"learning_rate": 0.001,
"loss": 1.2566,
"step": 210200
},
{
"epoch": 67.97026502908855,
"grad_norm": 2.7132184505462646,
"learning_rate": 0.001,
"loss": 1.2556,
"step": 210300
},
{
"epoch": 68.00258564964447,
"grad_norm": 1.5622951984405518,
"learning_rate": 0.001,
"loss": 1.2869,
"step": 210400
},
{
"epoch": 68.0349062702004,
"grad_norm": 2.333732843399048,
"learning_rate": 0.001,
"loss": 1.1528,
"step": 210500
},
{
"epoch": 68.0672268907563,
"grad_norm": 1.970744252204895,
"learning_rate": 0.001,
"loss": 1.1497,
"step": 210600
},
{
"epoch": 68.09954751131222,
"grad_norm": 2.23043155670166,
"learning_rate": 0.001,
"loss": 1.1657,
"step": 210700
},
{
"epoch": 68.13186813186813,
"grad_norm": 1.8723633289337158,
"learning_rate": 0.001,
"loss": 1.1477,
"step": 210800
},
{
"epoch": 68.16418875242405,
"grad_norm": 2.50980544090271,
"learning_rate": 0.001,
"loss": 1.1604,
"step": 210900
},
{
"epoch": 68.19650937297996,
"grad_norm": 2.1375374794006348,
"learning_rate": 0.001,
"loss": 1.1707,
"step": 211000
},
{
"epoch": 68.22882999353588,
"grad_norm": 1.930857539176941,
"learning_rate": 0.001,
"loss": 1.1758,
"step": 211100
},
{
"epoch": 68.26115061409179,
"grad_norm": 2.3334290981292725,
"learning_rate": 0.001,
"loss": 1.1827,
"step": 211200
},
{
"epoch": 68.29347123464771,
"grad_norm": 2.729995012283325,
"learning_rate": 0.001,
"loss": 1.1833,
"step": 211300
},
{
"epoch": 68.32579185520362,
"grad_norm": 3.255042552947998,
"learning_rate": 0.001,
"loss": 1.1921,
"step": 211400
},
{
"epoch": 68.35811247575954,
"grad_norm": 1.533887505531311,
"learning_rate": 0.001,
"loss": 1.1909,
"step": 211500
},
{
"epoch": 68.39043309631545,
"grad_norm": 1.8926416635513306,
"learning_rate": 0.001,
"loss": 1.2026,
"step": 211600
},
{
"epoch": 68.42275371687137,
"grad_norm": 2.0479161739349365,
"learning_rate": 0.001,
"loss": 1.2002,
"step": 211700
},
{
"epoch": 68.45507433742728,
"grad_norm": 1.674736499786377,
"learning_rate": 0.001,
"loss": 1.2203,
"step": 211800
},
{
"epoch": 68.4873949579832,
"grad_norm": 2.498920440673828,
"learning_rate": 0.001,
"loss": 1.2082,
"step": 211900
},
{
"epoch": 68.5197155785391,
"grad_norm": 2.435779571533203,
"learning_rate": 0.001,
"loss": 1.226,
"step": 212000
},
{
"epoch": 68.55203619909503,
"grad_norm": 1.8328531980514526,
"learning_rate": 0.001,
"loss": 1.2292,
"step": 212100
},
{
"epoch": 68.58435681965094,
"grad_norm": 2.3320796489715576,
"learning_rate": 0.001,
"loss": 1.2184,
"step": 212200
},
{
"epoch": 68.61667744020686,
"grad_norm": 1.9430440664291382,
"learning_rate": 0.001,
"loss": 1.2313,
"step": 212300
},
{
"epoch": 68.64899806076276,
"grad_norm": 2.1987974643707275,
"learning_rate": 0.001,
"loss": 1.2309,
"step": 212400
},
{
"epoch": 68.68131868131869,
"grad_norm": 1.669899344444275,
"learning_rate": 0.001,
"loss": 1.2107,
"step": 212500
},
{
"epoch": 68.7136393018746,
"grad_norm": 2.1277084350585938,
"learning_rate": 0.001,
"loss": 1.2357,
"step": 212600
},
{
"epoch": 68.74595992243052,
"grad_norm": 1.9171146154403687,
"learning_rate": 0.001,
"loss": 1.2358,
"step": 212700
},
{
"epoch": 68.77828054298642,
"grad_norm": 1.9358433485031128,
"learning_rate": 0.001,
"loss": 1.2294,
"step": 212800
},
{
"epoch": 68.81060116354234,
"grad_norm": 1.9799690246582031,
"learning_rate": 0.001,
"loss": 1.242,
"step": 212900
},
{
"epoch": 68.84292178409825,
"grad_norm": 1.7972420454025269,
"learning_rate": 0.001,
"loss": 1.2474,
"step": 213000
},
{
"epoch": 68.87524240465417,
"grad_norm": 1.9665274620056152,
"learning_rate": 0.001,
"loss": 1.2567,
"step": 213100
},
{
"epoch": 68.90756302521008,
"grad_norm": 2.131694793701172,
"learning_rate": 0.001,
"loss": 1.2475,
"step": 213200
},
{
"epoch": 68.939883645766,
"grad_norm": 1.7594505548477173,
"learning_rate": 0.001,
"loss": 1.2593,
"step": 213300
},
{
"epoch": 68.97220426632191,
"grad_norm": 5.345921993255615,
"learning_rate": 0.001,
"loss": 1.2423,
"step": 213400
},
{
"epoch": 69.00452488687783,
"grad_norm": 1.549072504043579,
"learning_rate": 0.001,
"loss": 1.2657,
"step": 213500
},
{
"epoch": 69.03684550743374,
"grad_norm": 1.8662244081497192,
"learning_rate": 0.001,
"loss": 1.1174,
"step": 213600
},
{
"epoch": 69.06916612798966,
"grad_norm": 1.5122578144073486,
"learning_rate": 0.001,
"loss": 1.1488,
"step": 213700
},
{
"epoch": 69.10148674854557,
"grad_norm": 2.3468809127807617,
"learning_rate": 0.001,
"loss": 1.1498,
"step": 213800
},
{
"epoch": 69.13380736910149,
"grad_norm": 1.8483396768569946,
"learning_rate": 0.001,
"loss": 1.1517,
"step": 213900
},
{
"epoch": 69.1661279896574,
"grad_norm": 1.6163078546524048,
"learning_rate": 0.001,
"loss": 1.1765,
"step": 214000
},
{
"epoch": 69.19844861021332,
"grad_norm": 1.5859019756317139,
"learning_rate": 0.001,
"loss": 1.1647,
"step": 214100
},
{
"epoch": 69.23076923076923,
"grad_norm": 1.5599806308746338,
"learning_rate": 0.001,
"loss": 1.1526,
"step": 214200
},
{
"epoch": 69.26308985132515,
"grad_norm": 2.253723382949829,
"learning_rate": 0.001,
"loss": 1.1671,
"step": 214300
},
{
"epoch": 69.29541047188106,
"grad_norm": 1.7934225797653198,
"learning_rate": 0.001,
"loss": 1.1669,
"step": 214400
},
{
"epoch": 69.32773109243698,
"grad_norm": 1.9382452964782715,
"learning_rate": 0.001,
"loss": 1.165,
"step": 214500
},
{
"epoch": 69.36005171299288,
"grad_norm": 2.22196626663208,
"learning_rate": 0.001,
"loss": 1.1777,
"step": 214600
},
{
"epoch": 69.3923723335488,
"grad_norm": 1.5601698160171509,
"learning_rate": 0.001,
"loss": 1.1841,
"step": 214700
},
{
"epoch": 69.42469295410471,
"grad_norm": 2.0012974739074707,
"learning_rate": 0.001,
"loss": 1.1786,
"step": 214800
},
{
"epoch": 69.45701357466064,
"grad_norm": 1.9047846794128418,
"learning_rate": 0.001,
"loss": 1.2062,
"step": 214900
},
{
"epoch": 69.48933419521654,
"grad_norm": 1.562074899673462,
"learning_rate": 0.001,
"loss": 1.2138,
"step": 215000
},
{
"epoch": 69.52165481577246,
"grad_norm": 1.7316213846206665,
"learning_rate": 0.001,
"loss": 1.2113,
"step": 215100
},
{
"epoch": 69.55397543632837,
"grad_norm": 1.642343282699585,
"learning_rate": 0.001,
"loss": 1.2102,
"step": 215200
},
{
"epoch": 69.5862960568843,
"grad_norm": 1.579736590385437,
"learning_rate": 0.001,
"loss": 1.2164,
"step": 215300
},
{
"epoch": 69.6186166774402,
"grad_norm": 1.445095181465149,
"learning_rate": 0.001,
"loss": 1.2076,
"step": 215400
},
{
"epoch": 69.65093729799612,
"grad_norm": 1.5851157903671265,
"learning_rate": 0.001,
"loss": 1.2212,
"step": 215500
},
{
"epoch": 69.68325791855203,
"grad_norm": 1.7522008419036865,
"learning_rate": 0.001,
"loss": 1.2242,
"step": 215600
},
{
"epoch": 69.71557853910795,
"grad_norm": 1.8869768381118774,
"learning_rate": 0.001,
"loss": 1.2357,
"step": 215700
},
{
"epoch": 69.74789915966386,
"grad_norm": 1.9056206941604614,
"learning_rate": 0.001,
"loss": 1.2266,
"step": 215800
},
{
"epoch": 69.78021978021978,
"grad_norm": 1.9747616052627563,
"learning_rate": 0.001,
"loss": 1.2128,
"step": 215900
},
{
"epoch": 69.81254040077569,
"grad_norm": 1.853060245513916,
"learning_rate": 0.001,
"loss": 1.2285,
"step": 216000
},
{
"epoch": 69.84486102133161,
"grad_norm": 1.8886538743972778,
"learning_rate": 0.001,
"loss": 1.2295,
"step": 216100
},
{
"epoch": 69.87718164188752,
"grad_norm": 2.0137014389038086,
"learning_rate": 0.001,
"loss": 1.2251,
"step": 216200
},
{
"epoch": 69.90950226244344,
"grad_norm": 1.9366350173950195,
"learning_rate": 0.001,
"loss": 1.2532,
"step": 216300
},
{
"epoch": 69.94182288299935,
"grad_norm": 2.439671277999878,
"learning_rate": 0.001,
"loss": 1.2342,
"step": 216400
},
{
"epoch": 69.97414350355527,
"grad_norm": 1.625091552734375,
"learning_rate": 0.001,
"loss": 1.2523,
"step": 216500
},
{
"epoch": 70.00646412411119,
"grad_norm": 1.5262469053268433,
"learning_rate": 0.001,
"loss": 1.2416,
"step": 216600
},
{
"epoch": 70.0387847446671,
"grad_norm": 1.6262037754058838,
"learning_rate": 0.001,
"loss": 1.1363,
"step": 216700
},
{
"epoch": 70.07110536522302,
"grad_norm": 1.6918765306472778,
"learning_rate": 0.001,
"loss": 1.1288,
"step": 216800
},
{
"epoch": 70.10342598577893,
"grad_norm": 1.7063210010528564,
"learning_rate": 0.001,
"loss": 1.1313,
"step": 216900
},
{
"epoch": 70.13574660633485,
"grad_norm": 2.047490358352661,
"learning_rate": 0.001,
"loss": 1.1365,
"step": 217000
},
{
"epoch": 70.16806722689076,
"grad_norm": 1.7240195274353027,
"learning_rate": 0.001,
"loss": 1.1556,
"step": 217100
},
{
"epoch": 70.20038784744668,
"grad_norm": 2.204714775085449,
"learning_rate": 0.001,
"loss": 1.1435,
"step": 217200
},
{
"epoch": 70.23270846800258,
"grad_norm": 1.780590295791626,
"learning_rate": 0.001,
"loss": 1.1467,
"step": 217300
},
{
"epoch": 70.2650290885585,
"grad_norm": 1.6181354522705078,
"learning_rate": 0.001,
"loss": 1.1562,
"step": 217400
},
{
"epoch": 70.29734970911441,
"grad_norm": 1.7674238681793213,
"learning_rate": 0.001,
"loss": 1.1726,
"step": 217500
},
{
"epoch": 70.32967032967034,
"grad_norm": 1.9929686784744263,
"learning_rate": 0.001,
"loss": 1.1684,
"step": 217600
},
{
"epoch": 70.36199095022624,
"grad_norm": 1.562532901763916,
"learning_rate": 0.001,
"loss": 1.1832,
"step": 217700
},
{
"epoch": 70.39431157078216,
"grad_norm": 1.6133959293365479,
"learning_rate": 0.001,
"loss": 1.174,
"step": 217800
},
{
"epoch": 70.42663219133807,
"grad_norm": 1.9063694477081299,
"learning_rate": 0.001,
"loss": 1.1721,
"step": 217900
},
{
"epoch": 70.458952811894,
"grad_norm": 1.6396887302398682,
"learning_rate": 0.001,
"loss": 1.1818,
"step": 218000
},
{
"epoch": 70.4912734324499,
"grad_norm": 5.434491157531738,
"learning_rate": 0.001,
"loss": 1.1929,
"step": 218100
},
{
"epoch": 70.52359405300582,
"grad_norm": 2.1285393238067627,
"learning_rate": 0.001,
"loss": 1.2068,
"step": 218200
},
{
"epoch": 70.55591467356173,
"grad_norm": 2.146338701248169,
"learning_rate": 0.001,
"loss": 1.1927,
"step": 218300
},
{
"epoch": 70.58823529411765,
"grad_norm": 1.5270682573318481,
"learning_rate": 0.001,
"loss": 1.196,
"step": 218400
},
{
"epoch": 70.62055591467356,
"grad_norm": 1.8791449069976807,
"learning_rate": 0.001,
"loss": 1.2008,
"step": 218500
},
{
"epoch": 70.65287653522948,
"grad_norm": 1.5155069828033447,
"learning_rate": 0.001,
"loss": 1.202,
"step": 218600
},
{
"epoch": 70.68519715578539,
"grad_norm": 1.610196828842163,
"learning_rate": 0.001,
"loss": 1.2087,
"step": 218700
},
{
"epoch": 70.71751777634131,
"grad_norm": 1.5798680782318115,
"learning_rate": 0.001,
"loss": 1.1986,
"step": 218800
},
{
"epoch": 70.74983839689722,
"grad_norm": 1.494688868522644,
"learning_rate": 0.001,
"loss": 1.2055,
"step": 218900
},
{
"epoch": 70.78215901745314,
"grad_norm": 1.8557690382003784,
"learning_rate": 0.001,
"loss": 1.2105,
"step": 219000
},
{
"epoch": 70.81447963800905,
"grad_norm": 1.6135377883911133,
"learning_rate": 0.001,
"loss": 1.2151,
"step": 219100
},
{
"epoch": 70.84680025856497,
"grad_norm": 1.7288098335266113,
"learning_rate": 0.001,
"loss": 1.2289,
"step": 219200
},
{
"epoch": 70.87912087912088,
"grad_norm": 1.838810920715332,
"learning_rate": 0.001,
"loss": 1.2268,
"step": 219300
},
{
"epoch": 70.9114414996768,
"grad_norm": 2.151858329772949,
"learning_rate": 0.001,
"loss": 1.2403,
"step": 219400
},
{
"epoch": 70.9437621202327,
"grad_norm": 1.791812539100647,
"learning_rate": 0.001,
"loss": 1.2395,
"step": 219500
},
{
"epoch": 70.97608274078863,
"grad_norm": 1.4549312591552734,
"learning_rate": 0.001,
"loss": 1.2375,
"step": 219600
},
{
"epoch": 71.00840336134453,
"grad_norm": 1.5893354415893555,
"learning_rate": 0.001,
"loss": 1.1986,
"step": 219700
},
{
"epoch": 71.04072398190046,
"grad_norm": 1.598484992980957,
"learning_rate": 0.001,
"loss": 1.1016,
"step": 219800
},
{
"epoch": 71.07304460245636,
"grad_norm": 1.74235200881958,
"learning_rate": 0.001,
"loss": 1.1297,
"step": 219900
},
{
"epoch": 71.10536522301229,
"grad_norm": 2.0175647735595703,
"learning_rate": 0.001,
"loss": 1.1254,
"step": 220000
},
{
"epoch": 71.13768584356819,
"grad_norm": 1.877590537071228,
"learning_rate": 0.001,
"loss": 1.1301,
"step": 220100
},
{
"epoch": 71.17000646412411,
"grad_norm": 1.6253721714019775,
"learning_rate": 0.001,
"loss": 1.127,
"step": 220200
},
{
"epoch": 71.20232708468002,
"grad_norm": 1.6335793733596802,
"learning_rate": 0.001,
"loss": 1.1491,
"step": 220300
},
{
"epoch": 71.23464770523594,
"grad_norm": 1.8717635869979858,
"learning_rate": 0.001,
"loss": 1.129,
"step": 220400
},
{
"epoch": 71.26696832579185,
"grad_norm": 1.9798500537872314,
"learning_rate": 0.001,
"loss": 1.1498,
"step": 220500
},
{
"epoch": 71.29928894634777,
"grad_norm": 1.6062580347061157,
"learning_rate": 0.001,
"loss": 1.149,
"step": 220600
},
{
"epoch": 71.33160956690368,
"grad_norm": 1.7187769412994385,
"learning_rate": 0.001,
"loss": 1.1589,
"step": 220700
},
{
"epoch": 71.3639301874596,
"grad_norm": 1.6825973987579346,
"learning_rate": 0.001,
"loss": 1.1705,
"step": 220800
},
{
"epoch": 71.39625080801551,
"grad_norm": 1.5847103595733643,
"learning_rate": 0.001,
"loss": 1.1657,
"step": 220900
},
{
"epoch": 71.42857142857143,
"grad_norm": 1.477260947227478,
"learning_rate": 0.001,
"loss": 1.1714,
"step": 221000
},
{
"epoch": 71.46089204912734,
"grad_norm": 2.0396647453308105,
"learning_rate": 0.001,
"loss": 1.1745,
"step": 221100
},
{
"epoch": 71.49321266968326,
"grad_norm": 1.5809693336486816,
"learning_rate": 0.001,
"loss": 1.169,
"step": 221200
},
{
"epoch": 71.52553329023917,
"grad_norm": 1.5389111042022705,
"learning_rate": 0.001,
"loss": 1.184,
"step": 221300
},
{
"epoch": 71.55785391079509,
"grad_norm": 1.5018457174301147,
"learning_rate": 0.001,
"loss": 1.1905,
"step": 221400
},
{
"epoch": 71.590174531351,
"grad_norm": 1.8730908632278442,
"learning_rate": 0.001,
"loss": 1.1808,
"step": 221500
},
{
"epoch": 71.62249515190692,
"grad_norm": 1.9678943157196045,
"learning_rate": 0.001,
"loss": 1.1696,
"step": 221600
},
{
"epoch": 71.65481577246283,
"grad_norm": 1.6994378566741943,
"learning_rate": 0.001,
"loss": 1.1976,
"step": 221700
},
{
"epoch": 71.68713639301875,
"grad_norm": 1.820391058921814,
"learning_rate": 0.001,
"loss": 1.2076,
"step": 221800
},
{
"epoch": 71.71945701357465,
"grad_norm": 1.6418462991714478,
"learning_rate": 0.001,
"loss": 1.1973,
"step": 221900
},
{
"epoch": 71.75177763413058,
"grad_norm": 1.805640459060669,
"learning_rate": 0.001,
"loss": 1.2126,
"step": 222000
},
{
"epoch": 71.78409825468648,
"grad_norm": 1.5427037477493286,
"learning_rate": 0.001,
"loss": 1.2058,
"step": 222100
},
{
"epoch": 71.8164188752424,
"grad_norm": 1.6487582921981812,
"learning_rate": 0.001,
"loss": 1.2155,
"step": 222200
},
{
"epoch": 71.84873949579831,
"grad_norm": 2.0085551738739014,
"learning_rate": 0.001,
"loss": 1.2167,
"step": 222300
},
{
"epoch": 71.88106011635423,
"grad_norm": 1.4483178853988647,
"learning_rate": 0.001,
"loss": 1.2155,
"step": 222400
},
{
"epoch": 71.91338073691014,
"grad_norm": 1.6381824016571045,
"learning_rate": 0.001,
"loss": 1.2192,
"step": 222500
},
{
"epoch": 71.94570135746606,
"grad_norm": 2.2828452587127686,
"learning_rate": 0.001,
"loss": 1.227,
"step": 222600
},
{
"epoch": 71.97802197802197,
"grad_norm": 2.1275863647460938,
"learning_rate": 0.001,
"loss": 1.2355,
"step": 222700
},
{
"epoch": 72.01034259857789,
"grad_norm": 1.8210406303405762,
"learning_rate": 0.001,
"loss": 1.1774,
"step": 222800
},
{
"epoch": 72.04266321913381,
"grad_norm": 1.9121840000152588,
"learning_rate": 0.001,
"loss": 1.102,
"step": 222900
},
{
"epoch": 72.07498383968972,
"grad_norm": 1.948397159576416,
"learning_rate": 0.001,
"loss": 1.1163,
"step": 223000
},
{
"epoch": 72.10730446024564,
"grad_norm": 2.113853693008423,
"learning_rate": 0.001,
"loss": 1.1221,
"step": 223100
},
{
"epoch": 72.13962508080155,
"grad_norm": 1.9064280986785889,
"learning_rate": 0.001,
"loss": 1.1297,
"step": 223200
},
{
"epoch": 72.17194570135747,
"grad_norm": 1.8943758010864258,
"learning_rate": 0.001,
"loss": 1.1214,
"step": 223300
},
{
"epoch": 72.20426632191338,
"grad_norm": 2.0406997203826904,
"learning_rate": 0.001,
"loss": 1.134,
"step": 223400
},
{
"epoch": 72.2365869424693,
"grad_norm": 1.837704062461853,
"learning_rate": 0.001,
"loss": 1.1316,
"step": 223500
},
{
"epoch": 72.26890756302521,
"grad_norm": 1.7503560781478882,
"learning_rate": 0.001,
"loss": 1.1263,
"step": 223600
},
{
"epoch": 72.30122818358113,
"grad_norm": 1.602188229560852,
"learning_rate": 0.001,
"loss": 1.1543,
"step": 223700
},
{
"epoch": 72.33354880413704,
"grad_norm": 1.8570600748062134,
"learning_rate": 0.001,
"loss": 1.159,
"step": 223800
},
{
"epoch": 72.36586942469296,
"grad_norm": 1.7517465353012085,
"learning_rate": 0.001,
"loss": 1.1539,
"step": 223900
},
{
"epoch": 72.39819004524887,
"grad_norm": 1.8670244216918945,
"learning_rate": 0.001,
"loss": 1.1495,
"step": 224000
},
{
"epoch": 72.43051066580479,
"grad_norm": 1.831101655960083,
"learning_rate": 0.001,
"loss": 1.1613,
"step": 224100
},
{
"epoch": 72.4628312863607,
"grad_norm": 1.6500862836837769,
"learning_rate": 0.001,
"loss": 1.1413,
"step": 224200
},
{
"epoch": 72.49515190691662,
"grad_norm": 1.9786334037780762,
"learning_rate": 0.001,
"loss": 1.1655,
"step": 224300
},
{
"epoch": 72.52747252747253,
"grad_norm": 1.7236011028289795,
"learning_rate": 0.001,
"loss": 1.1654,
"step": 224400
},
{
"epoch": 72.55979314802845,
"grad_norm": 2.018129587173462,
"learning_rate": 0.001,
"loss": 1.1609,
"step": 224500
},
{
"epoch": 72.59211376858435,
"grad_norm": 1.9249852895736694,
"learning_rate": 0.001,
"loss": 1.1889,
"step": 224600
},
{
"epoch": 72.62443438914028,
"grad_norm": 1.736116886138916,
"learning_rate": 0.001,
"loss": 1.1828,
"step": 224700
},
{
"epoch": 72.65675500969618,
"grad_norm": 1.6699976921081543,
"learning_rate": 0.001,
"loss": 1.173,
"step": 224800
},
{
"epoch": 72.6890756302521,
"grad_norm": 2.20131778717041,
"learning_rate": 0.001,
"loss": 1.1921,
"step": 224900
},
{
"epoch": 72.72139625080801,
"grad_norm": 1.9852443933486938,
"learning_rate": 0.001,
"loss": 1.1904,
"step": 225000
},
{
"epoch": 72.75371687136393,
"grad_norm": 1.9196783304214478,
"learning_rate": 0.001,
"loss": 1.1867,
"step": 225100
},
{
"epoch": 72.78603749191984,
"grad_norm": 1.7682510614395142,
"learning_rate": 0.001,
"loss": 1.193,
"step": 225200
},
{
"epoch": 72.81835811247576,
"grad_norm": 2.267354726791382,
"learning_rate": 0.001,
"loss": 1.1856,
"step": 225300
},
{
"epoch": 72.85067873303167,
"grad_norm": 1.7886394262313843,
"learning_rate": 0.001,
"loss": 1.2008,
"step": 225400
},
{
"epoch": 72.88299935358759,
"grad_norm": 1.6785390377044678,
"learning_rate": 0.001,
"loss": 1.2051,
"step": 225500
},
{
"epoch": 72.9153199741435,
"grad_norm": 1.8354374170303345,
"learning_rate": 0.001,
"loss": 1.1972,
"step": 225600
},
{
"epoch": 72.94764059469942,
"grad_norm": 1.6264238357543945,
"learning_rate": 0.001,
"loss": 1.2162,
"step": 225700
},
{
"epoch": 72.97996121525533,
"grad_norm": 2.1394410133361816,
"learning_rate": 0.001,
"loss": 1.223,
"step": 225800
},
{
"epoch": 73.01228183581125,
"grad_norm": 1.795905351638794,
"learning_rate": 0.001,
"loss": 1.1582,
"step": 225900
},
{
"epoch": 73.04460245636716,
"grad_norm": 2.1484296321868896,
"learning_rate": 0.001,
"loss": 1.0819,
"step": 226000
},
{
"epoch": 73.07692307692308,
"grad_norm": 1.72504460811615,
"learning_rate": 0.001,
"loss": 1.0955,
"step": 226100
},
{
"epoch": 73.10924369747899,
"grad_norm": 1.5780655145645142,
"learning_rate": 0.001,
"loss": 1.0913,
"step": 226200
},
{
"epoch": 73.14156431803491,
"grad_norm": 1.83950674533844,
"learning_rate": 0.001,
"loss": 1.1112,
"step": 226300
},
{
"epoch": 73.17388493859082,
"grad_norm": 2.016282320022583,
"learning_rate": 0.001,
"loss": 1.1083,
"step": 226400
},
{
"epoch": 73.20620555914674,
"grad_norm": 2.395186185836792,
"learning_rate": 0.001,
"loss": 1.111,
"step": 226500
},
{
"epoch": 73.23852617970265,
"grad_norm": 2.154127836227417,
"learning_rate": 0.001,
"loss": 1.1132,
"step": 226600
},
{
"epoch": 73.27084680025857,
"grad_norm": 1.6973828077316284,
"learning_rate": 0.001,
"loss": 1.1365,
"step": 226700
},
{
"epoch": 73.30316742081448,
"grad_norm": 1.861863613128662,
"learning_rate": 0.001,
"loss": 1.1306,
"step": 226800
},
{
"epoch": 73.3354880413704,
"grad_norm": 2.0509049892425537,
"learning_rate": 0.001,
"loss": 1.1463,
"step": 226900
},
{
"epoch": 73.3678086619263,
"grad_norm": 1.933553695678711,
"learning_rate": 0.001,
"loss": 1.161,
"step": 227000
},
{
"epoch": 73.40012928248223,
"grad_norm": 2.0474693775177,
"learning_rate": 0.001,
"loss": 1.1489,
"step": 227100
},
{
"epoch": 73.43244990303813,
"grad_norm": 2.182870388031006,
"learning_rate": 0.001,
"loss": 1.1481,
"step": 227200
},
{
"epoch": 73.46477052359405,
"grad_norm": 1.6766440868377686,
"learning_rate": 0.001,
"loss": 1.1566,
"step": 227300
},
{
"epoch": 73.49709114414996,
"grad_norm": 2.327120542526245,
"learning_rate": 0.001,
"loss": 1.1424,
"step": 227400
},
{
"epoch": 73.52941176470588,
"grad_norm": 1.8558249473571777,
"learning_rate": 0.001,
"loss": 1.1572,
"step": 227500
},
{
"epoch": 73.56173238526179,
"grad_norm": 2.0666706562042236,
"learning_rate": 0.001,
"loss": 1.1534,
"step": 227600
},
{
"epoch": 73.59405300581771,
"grad_norm": 2.563960313796997,
"learning_rate": 0.001,
"loss": 1.1588,
"step": 227700
},
{
"epoch": 73.62637362637362,
"grad_norm": 1.841058373451233,
"learning_rate": 0.001,
"loss": 1.174,
"step": 227800
},
{
"epoch": 73.65869424692954,
"grad_norm": 1.7862833738327026,
"learning_rate": 0.001,
"loss": 1.1698,
"step": 227900
},
{
"epoch": 73.69101486748545,
"grad_norm": 1.935362458229065,
"learning_rate": 0.001,
"loss": 1.1793,
"step": 228000
},
{
"epoch": 73.72333548804137,
"grad_norm": 1.609352707862854,
"learning_rate": 0.001,
"loss": 1.1831,
"step": 228100
},
{
"epoch": 73.75565610859728,
"grad_norm": 2.0712058544158936,
"learning_rate": 0.001,
"loss": 1.18,
"step": 228200
},
{
"epoch": 73.7879767291532,
"grad_norm": 1.8805397748947144,
"learning_rate": 0.001,
"loss": 1.1897,
"step": 228300
},
{
"epoch": 73.82029734970911,
"grad_norm": 1.8375552892684937,
"learning_rate": 0.001,
"loss": 1.1886,
"step": 228400
},
{
"epoch": 73.85261797026503,
"grad_norm": 2.0276403427124023,
"learning_rate": 0.001,
"loss": 1.1811,
"step": 228500
},
{
"epoch": 73.88493859082094,
"grad_norm": 2.1257636547088623,
"learning_rate": 0.001,
"loss": 1.1942,
"step": 228600
},
{
"epoch": 73.91725921137686,
"grad_norm": 1.6965306997299194,
"learning_rate": 0.001,
"loss": 1.2039,
"step": 228700
},
{
"epoch": 73.94957983193277,
"grad_norm": 2.1263046264648438,
"learning_rate": 0.001,
"loss": 1.2031,
"step": 228800
},
{
"epoch": 73.98190045248869,
"grad_norm": 1.8232370615005493,
"learning_rate": 0.001,
"loss": 1.2104,
"step": 228900
},
{
"epoch": 74.01422107304461,
"grad_norm": 2.033893346786499,
"learning_rate": 0.001,
"loss": 1.1334,
"step": 229000
},
{
"epoch": 74.04654169360052,
"grad_norm": 1.9348260164260864,
"learning_rate": 0.001,
"loss": 1.0812,
"step": 229100
},
{
"epoch": 74.07886231415644,
"grad_norm": 2.0578017234802246,
"learning_rate": 0.001,
"loss": 1.0994,
"step": 229200
},
{
"epoch": 74.11118293471235,
"grad_norm": 2.131514072418213,
"learning_rate": 0.001,
"loss": 1.1066,
"step": 229300
},
{
"epoch": 74.14350355526827,
"grad_norm": 2.09892201423645,
"learning_rate": 0.001,
"loss": 1.0978,
"step": 229400
},
{
"epoch": 74.17582417582418,
"grad_norm": 2.2361555099487305,
"learning_rate": 0.001,
"loss": 1.0948,
"step": 229500
},
{
"epoch": 74.2081447963801,
"grad_norm": 2.1429715156555176,
"learning_rate": 0.001,
"loss": 1.12,
"step": 229600
},
{
"epoch": 74.240465416936,
"grad_norm": 1.7811630964279175,
"learning_rate": 0.001,
"loss": 1.1277,
"step": 229700
},
{
"epoch": 74.27278603749193,
"grad_norm": 2.2443344593048096,
"learning_rate": 0.001,
"loss": 1.1082,
"step": 229800
},
{
"epoch": 74.30510665804783,
"grad_norm": 2.121556520462036,
"learning_rate": 0.001,
"loss": 1.1229,
"step": 229900
},
{
"epoch": 74.33742727860376,
"grad_norm": 2.5482897758483887,
"learning_rate": 0.001,
"loss": 1.1368,
"step": 230000
},
{
"epoch": 74.36974789915966,
"grad_norm": 1.9960894584655762,
"learning_rate": 0.001,
"loss": 1.1245,
"step": 230100
},
{
"epoch": 74.40206851971558,
"grad_norm": 2.089287757873535,
"learning_rate": 0.001,
"loss": 1.1405,
"step": 230200
},
{
"epoch": 74.43438914027149,
"grad_norm": 1.8604276180267334,
"learning_rate": 0.001,
"loss": 1.1538,
"step": 230300
},
{
"epoch": 74.46670976082741,
"grad_norm": 1.9729381799697876,
"learning_rate": 0.001,
"loss": 1.1571,
"step": 230400
},
{
"epoch": 74.49903038138332,
"grad_norm": 1.8836462497711182,
"learning_rate": 0.001,
"loss": 1.1498,
"step": 230500
},
{
"epoch": 74.53135100193924,
"grad_norm": 1.781795859336853,
"learning_rate": 0.001,
"loss": 1.1463,
"step": 230600
},
{
"epoch": 74.56367162249515,
"grad_norm": 2.167222023010254,
"learning_rate": 0.001,
"loss": 1.1387,
"step": 230700
},
{
"epoch": 74.59599224305107,
"grad_norm": 2.2089223861694336,
"learning_rate": 0.001,
"loss": 1.1499,
"step": 230800
},
{
"epoch": 74.62831286360698,
"grad_norm": 2.309671401977539,
"learning_rate": 0.001,
"loss": 1.1569,
"step": 230900
},
{
"epoch": 74.6606334841629,
"grad_norm": 1.9005663394927979,
"learning_rate": 0.001,
"loss": 1.1723,
"step": 231000
},
{
"epoch": 74.69295410471881,
"grad_norm": 1.8752561807632446,
"learning_rate": 0.001,
"loss": 1.1675,
"step": 231100
},
{
"epoch": 74.72527472527473,
"grad_norm": 2.7028305530548096,
"learning_rate": 0.001,
"loss": 1.1656,
"step": 231200
},
{
"epoch": 74.75759534583064,
"grad_norm": 2.19155216217041,
"learning_rate": 0.001,
"loss": 1.1671,
"step": 231300
},
{
"epoch": 74.78991596638656,
"grad_norm": 2.1153290271759033,
"learning_rate": 0.001,
"loss": 1.1774,
"step": 231400
},
{
"epoch": 74.82223658694247,
"grad_norm": 1.7875677347183228,
"learning_rate": 0.001,
"loss": 1.1752,
"step": 231500
},
{
"epoch": 74.85455720749839,
"grad_norm": 2.1596906185150146,
"learning_rate": 0.001,
"loss": 1.1678,
"step": 231600
},
{
"epoch": 74.8868778280543,
"grad_norm": 2.106273889541626,
"learning_rate": 0.001,
"loss": 1.1814,
"step": 231700
},
{
"epoch": 74.91919844861022,
"grad_norm": 2.4843180179595947,
"learning_rate": 0.001,
"loss": 1.1823,
"step": 231800
},
{
"epoch": 74.95151906916612,
"grad_norm": 2.45287823677063,
"learning_rate": 0.001,
"loss": 1.1806,
"step": 231900
},
{
"epoch": 74.98383968972205,
"grad_norm": 2.348428249359131,
"learning_rate": 0.001,
"loss": 1.1847,
"step": 232000
},
{
"epoch": 75.01616031027795,
"grad_norm": 2.1072657108306885,
"learning_rate": 0.001,
"loss": 1.1007,
"step": 232100
},
{
"epoch": 75.04848093083388,
"grad_norm": 2.227376699447632,
"learning_rate": 0.001,
"loss": 1.0656,
"step": 232200
},
{
"epoch": 75.08080155138978,
"grad_norm": 2.6325557231903076,
"learning_rate": 0.001,
"loss": 1.0806,
"step": 232300
},
{
"epoch": 75.1131221719457,
"grad_norm": 2.4202475547790527,
"learning_rate": 0.001,
"loss": 1.0943,
"step": 232400
},
{
"epoch": 75.14544279250161,
"grad_norm": 2.6263668537139893,
"learning_rate": 0.001,
"loss": 1.0991,
"step": 232500
},
{
"epoch": 75.17776341305753,
"grad_norm": 2.652233600616455,
"learning_rate": 0.001,
"loss": 1.0959,
"step": 232600
},
{
"epoch": 75.21008403361344,
"grad_norm": 2.31381893157959,
"learning_rate": 0.001,
"loss": 1.1064,
"step": 232700
},
{
"epoch": 75.24240465416936,
"grad_norm": 2.8877060413360596,
"learning_rate": 0.001,
"loss": 1.101,
"step": 232800
},
{
"epoch": 75.27472527472527,
"grad_norm": 3.5789542198181152,
"learning_rate": 0.001,
"loss": 1.1082,
"step": 232900
},
{
"epoch": 75.30704589528119,
"grad_norm": 2.622349500656128,
"learning_rate": 0.001,
"loss": 1.101,
"step": 233000
},
{
"epoch": 75.3393665158371,
"grad_norm": 2.9478626251220703,
"learning_rate": 0.001,
"loss": 1.1134,
"step": 233100
},
{
"epoch": 75.37168713639302,
"grad_norm": 2.258899688720703,
"learning_rate": 0.001,
"loss": 1.1272,
"step": 233200
},
{
"epoch": 75.40400775694893,
"grad_norm": 2.8616788387298584,
"learning_rate": 0.001,
"loss": 1.1265,
"step": 233300
},
{
"epoch": 75.43632837750485,
"grad_norm": 2.5713586807250977,
"learning_rate": 0.001,
"loss": 1.1222,
"step": 233400
},
{
"epoch": 75.46864899806076,
"grad_norm": 2.9132118225097656,
"learning_rate": 0.001,
"loss": 1.129,
"step": 233500
},
{
"epoch": 75.50096961861668,
"grad_norm": 2.412076473236084,
"learning_rate": 0.001,
"loss": 1.1272,
"step": 233600
},
{
"epoch": 75.53329023917259,
"grad_norm": 2.0582900047302246,
"learning_rate": 0.001,
"loss": 1.1382,
"step": 233700
},
{
"epoch": 75.56561085972851,
"grad_norm": 2.4409642219543457,
"learning_rate": 0.001,
"loss": 1.1436,
"step": 233800
},
{
"epoch": 75.59793148028442,
"grad_norm": 2.7730462551116943,
"learning_rate": 0.001,
"loss": 1.1466,
"step": 233900
},
{
"epoch": 75.63025210084034,
"grad_norm": 2.9127862453460693,
"learning_rate": 0.001,
"loss": 1.1634,
"step": 234000
},
{
"epoch": 75.66257272139624,
"grad_norm": 2.6338980197906494,
"learning_rate": 0.001,
"loss": 1.1515,
"step": 234100
},
{
"epoch": 75.69489334195217,
"grad_norm": 2.3682878017425537,
"learning_rate": 0.001,
"loss": 1.1716,
"step": 234200
},
{
"epoch": 75.72721396250807,
"grad_norm": 3.0970637798309326,
"learning_rate": 0.001,
"loss": 1.1681,
"step": 234300
},
{
"epoch": 75.759534583064,
"grad_norm": 3.246640682220459,
"learning_rate": 0.001,
"loss": 1.1656,
"step": 234400
},
{
"epoch": 75.7918552036199,
"grad_norm": 2.827653169631958,
"learning_rate": 0.001,
"loss": 1.1712,
"step": 234500
},
{
"epoch": 75.82417582417582,
"grad_norm": 2.37351393699646,
"learning_rate": 0.001,
"loss": 1.1767,
"step": 234600
},
{
"epoch": 75.85649644473173,
"grad_norm": 2.936460018157959,
"learning_rate": 0.001,
"loss": 1.1798,
"step": 234700
},
{
"epoch": 75.88881706528765,
"grad_norm": 2.810807228088379,
"learning_rate": 0.001,
"loss": 1.1756,
"step": 234800
},
{
"epoch": 75.92113768584356,
"grad_norm": 2.5156216621398926,
"learning_rate": 0.001,
"loss": 1.1682,
"step": 234900
},
{
"epoch": 75.95345830639948,
"grad_norm": 2.3264551162719727,
"learning_rate": 0.001,
"loss": 1.1573,
"step": 235000
},
{
"epoch": 75.98577892695539,
"grad_norm": 2.1307079792022705,
"learning_rate": 0.001,
"loss": 1.189,
"step": 235100
},
{
"epoch": 76.01809954751131,
"grad_norm": 1.8315860033035278,
"learning_rate": 0.001,
"loss": 1.1347,
"step": 235200
},
{
"epoch": 76.05042016806723,
"grad_norm": 1.5642365217208862,
"learning_rate": 0.001,
"loss": 1.062,
"step": 235300
},
{
"epoch": 76.08274078862314,
"grad_norm": 2.2328715324401855,
"learning_rate": 0.001,
"loss": 1.0708,
"step": 235400
},
{
"epoch": 76.11506140917906,
"grad_norm": 1.7951757907867432,
"learning_rate": 0.001,
"loss": 1.0796,
"step": 235500
},
{
"epoch": 76.14738202973497,
"grad_norm": 1.8427295684814453,
"learning_rate": 0.001,
"loss": 1.0759,
"step": 235600
},
{
"epoch": 76.17970265029089,
"grad_norm": 1.8815374374389648,
"learning_rate": 0.001,
"loss": 1.0794,
"step": 235700
},
{
"epoch": 76.2120232708468,
"grad_norm": 2.425978899002075,
"learning_rate": 0.001,
"loss": 1.1024,
"step": 235800
},
{
"epoch": 76.24434389140272,
"grad_norm": 2.3707029819488525,
"learning_rate": 0.001,
"loss": 1.0746,
"step": 235900
},
{
"epoch": 76.27666451195863,
"grad_norm": 2.2675940990448,
"learning_rate": 0.001,
"loss": 1.1096,
"step": 236000
},
{
"epoch": 76.30898513251455,
"grad_norm": 1.627684473991394,
"learning_rate": 0.001,
"loss": 1.1072,
"step": 236100
},
{
"epoch": 76.34130575307046,
"grad_norm": 1.8927665948867798,
"learning_rate": 0.001,
"loss": 1.1098,
"step": 236200
},
{
"epoch": 76.37362637362638,
"grad_norm": 1.8863893747329712,
"learning_rate": 0.001,
"loss": 1.0979,
"step": 236300
},
{
"epoch": 76.40594699418229,
"grad_norm": 2.0543971061706543,
"learning_rate": 0.001,
"loss": 1.1123,
"step": 236400
},
{
"epoch": 76.43826761473821,
"grad_norm": 2.1276564598083496,
"learning_rate": 0.001,
"loss": 1.1174,
"step": 236500
},
{
"epoch": 76.47058823529412,
"grad_norm": 2.0694077014923096,
"learning_rate": 0.001,
"loss": 1.1169,
"step": 236600
},
{
"epoch": 76.50290885585004,
"grad_norm": 1.7249040603637695,
"learning_rate": 0.001,
"loss": 1.1289,
"step": 236700
},
{
"epoch": 76.53522947640595,
"grad_norm": 2.4833602905273438,
"learning_rate": 0.001,
"loss": 1.1457,
"step": 236800
},
{
"epoch": 76.56755009696187,
"grad_norm": 1.8441822528839111,
"learning_rate": 0.001,
"loss": 1.122,
"step": 236900
},
{
"epoch": 76.59987071751777,
"grad_norm": 1.9829216003417969,
"learning_rate": 0.001,
"loss": 1.134,
"step": 237000
},
{
"epoch": 76.6321913380737,
"grad_norm": 2.1754186153411865,
"learning_rate": 0.001,
"loss": 1.1358,
"step": 237100
},
{
"epoch": 76.6645119586296,
"grad_norm": 1.6874982118606567,
"learning_rate": 0.001,
"loss": 1.1385,
"step": 237200
},
{
"epoch": 76.69683257918552,
"grad_norm": 1.662231683731079,
"learning_rate": 0.001,
"loss": 1.1278,
"step": 237300
},
{
"epoch": 76.72915319974143,
"grad_norm": 1.977017879486084,
"learning_rate": 0.001,
"loss": 1.1418,
"step": 237400
},
{
"epoch": 76.76147382029735,
"grad_norm": 1.5747963190078735,
"learning_rate": 0.001,
"loss": 1.1458,
"step": 237500
},
{
"epoch": 76.79379444085326,
"grad_norm": 2.3878796100616455,
"learning_rate": 0.001,
"loss": 1.1603,
"step": 237600
},
{
"epoch": 76.82611506140918,
"grad_norm": 2.2227580547332764,
"learning_rate": 0.001,
"loss": 1.174,
"step": 237700
},
{
"epoch": 76.85843568196509,
"grad_norm": 2.162853479385376,
"learning_rate": 0.001,
"loss": 1.1545,
"step": 237800
},
{
"epoch": 76.89075630252101,
"grad_norm": 1.6438781023025513,
"learning_rate": 0.001,
"loss": 1.1794,
"step": 237900
},
{
"epoch": 76.92307692307692,
"grad_norm": 1.6767332553863525,
"learning_rate": 0.001,
"loss": 1.1709,
"step": 238000
},
{
"epoch": 76.95539754363284,
"grad_norm": 1.6060292720794678,
"learning_rate": 0.001,
"loss": 1.1717,
"step": 238100
},
{
"epoch": 76.98771816418875,
"grad_norm": 1.7817925214767456,
"learning_rate": 0.001,
"loss": 1.1818,
"step": 238200
},
{
"epoch": 77.02003878474467,
"grad_norm": 1.6019141674041748,
"learning_rate": 0.001,
"loss": 1.1144,
"step": 238300
},
{
"epoch": 77.05235940530058,
"grad_norm": 1.992630124092102,
"learning_rate": 0.001,
"loss": 1.0535,
"step": 238400
},
{
"epoch": 77.0846800258565,
"grad_norm": 1.9400583505630493,
"learning_rate": 0.001,
"loss": 1.0624,
"step": 238500
},
{
"epoch": 77.11700064641241,
"grad_norm": 1.91287100315094,
"learning_rate": 0.001,
"loss": 1.0571,
"step": 238600
},
{
"epoch": 77.14932126696833,
"grad_norm": 1.7860596179962158,
"learning_rate": 0.001,
"loss": 1.0697,
"step": 238700
},
{
"epoch": 77.18164188752424,
"grad_norm": 1.6752883195877075,
"learning_rate": 0.001,
"loss": 1.0746,
"step": 238800
},
{
"epoch": 77.21396250808016,
"grad_norm": 1.7331321239471436,
"learning_rate": 0.001,
"loss": 1.0964,
"step": 238900
},
{
"epoch": 77.24628312863607,
"grad_norm": 1.943953275680542,
"learning_rate": 0.001,
"loss": 1.0953,
"step": 239000
},
{
"epoch": 77.27860374919199,
"grad_norm": 1.7045990228652954,
"learning_rate": 0.001,
"loss": 1.0782,
"step": 239100
},
{
"epoch": 77.3109243697479,
"grad_norm": 2.1768674850463867,
"learning_rate": 0.001,
"loss": 1.0918,
"step": 239200
},
{
"epoch": 77.34324499030382,
"grad_norm": 1.8054300546646118,
"learning_rate": 0.001,
"loss": 1.0982,
"step": 239300
},
{
"epoch": 77.37556561085972,
"grad_norm": 1.6111589670181274,
"learning_rate": 0.001,
"loss": 1.094,
"step": 239400
},
{
"epoch": 77.40788623141565,
"grad_norm": 1.7453973293304443,
"learning_rate": 0.001,
"loss": 1.1012,
"step": 239500
},
{
"epoch": 77.44020685197155,
"grad_norm": 2.423070192337036,
"learning_rate": 0.001,
"loss": 1.1149,
"step": 239600
},
{
"epoch": 77.47252747252747,
"grad_norm": 2.0236239433288574,
"learning_rate": 0.001,
"loss": 1.1102,
"step": 239700
},
{
"epoch": 77.50484809308338,
"grad_norm": 2.0543761253356934,
"learning_rate": 0.001,
"loss": 1.1263,
"step": 239800
},
{
"epoch": 77.5371687136393,
"grad_norm": 1.643644094467163,
"learning_rate": 0.001,
"loss": 1.1168,
"step": 239900
},
{
"epoch": 77.56948933419521,
"grad_norm": 1.8869572877883911,
"learning_rate": 0.001,
"loss": 1.125,
"step": 240000
},
{
"epoch": 77.60180995475113,
"grad_norm": 1.5091863870620728,
"learning_rate": 0.001,
"loss": 1.1353,
"step": 240100
},
{
"epoch": 77.63413057530704,
"grad_norm": 1.7741518020629883,
"learning_rate": 0.001,
"loss": 1.1292,
"step": 240200
},
{
"epoch": 77.66645119586296,
"grad_norm": 1.767402172088623,
"learning_rate": 0.001,
"loss": 1.1508,
"step": 240300
},
{
"epoch": 77.69877181641887,
"grad_norm": 1.9282313585281372,
"learning_rate": 0.001,
"loss": 1.1398,
"step": 240400
},
{
"epoch": 77.73109243697479,
"grad_norm": 1.8274956941604614,
"learning_rate": 0.001,
"loss": 1.1345,
"step": 240500
},
{
"epoch": 77.7634130575307,
"grad_norm": 1.7720264196395874,
"learning_rate": 0.001,
"loss": 1.1591,
"step": 240600
},
{
"epoch": 77.79573367808662,
"grad_norm": 1.7153974771499634,
"learning_rate": 0.001,
"loss": 1.1481,
"step": 240700
},
{
"epoch": 77.82805429864253,
"grad_norm": 2.328188180923462,
"learning_rate": 0.001,
"loss": 1.1558,
"step": 240800
},
{
"epoch": 77.86037491919845,
"grad_norm": 1.966943383216858,
"learning_rate": 0.001,
"loss": 1.1406,
"step": 240900
},
{
"epoch": 77.89269553975436,
"grad_norm": 1.7131937742233276,
"learning_rate": 0.001,
"loss": 1.1459,
"step": 241000
},
{
"epoch": 77.92501616031028,
"grad_norm": 1.452742576599121,
"learning_rate": 0.001,
"loss": 1.1633,
"step": 241100
},
{
"epoch": 77.95733678086619,
"grad_norm": 1.5621833801269531,
"learning_rate": 0.001,
"loss": 1.1689,
"step": 241200
},
{
"epoch": 77.98965740142211,
"grad_norm": 2.072866916656494,
"learning_rate": 0.001,
"loss": 1.1554,
"step": 241300
},
{
"epoch": 78.02197802197803,
"grad_norm": 1.7882293462753296,
"learning_rate": 0.001,
"loss": 1.0927,
"step": 241400
},
{
"epoch": 78.05429864253394,
"grad_norm": 1.8420997858047485,
"learning_rate": 0.001,
"loss": 1.051,
"step": 241500
},
{
"epoch": 78.08661926308986,
"grad_norm": 1.671029806137085,
"learning_rate": 0.001,
"loss": 1.0459,
"step": 241600
},
{
"epoch": 78.11893988364577,
"grad_norm": 2.1033594608306885,
"learning_rate": 0.001,
"loss": 1.0547,
"step": 241700
},
{
"epoch": 78.15126050420169,
"grad_norm": 1.9632328748703003,
"learning_rate": 0.001,
"loss": 1.0743,
"step": 241800
},
{
"epoch": 78.1835811247576,
"grad_norm": 1.8516623973846436,
"learning_rate": 0.001,
"loss": 1.0665,
"step": 241900
},
{
"epoch": 78.21590174531352,
"grad_norm": 1.782353162765503,
"learning_rate": 0.001,
"loss": 1.081,
"step": 242000
},
{
"epoch": 78.24822236586942,
"grad_norm": 2.159865140914917,
"learning_rate": 0.001,
"loss": 1.0792,
"step": 242100
},
{
"epoch": 78.28054298642535,
"grad_norm": 1.7599161863327026,
"learning_rate": 0.001,
"loss": 1.0962,
"step": 242200
},
{
"epoch": 78.31286360698125,
"grad_norm": 1.6156737804412842,
"learning_rate": 0.001,
"loss": 1.0881,
"step": 242300
},
{
"epoch": 78.34518422753717,
"grad_norm": 1.9483507871627808,
"learning_rate": 0.001,
"loss": 1.093,
"step": 242400
},
{
"epoch": 78.37750484809308,
"grad_norm": 1.5878406763076782,
"learning_rate": 0.001,
"loss": 1.0838,
"step": 242500
},
{
"epoch": 78.409825468649,
"grad_norm": 1.6766425371170044,
"learning_rate": 0.001,
"loss": 1.0998,
"step": 242600
},
{
"epoch": 78.44214608920491,
"grad_norm": 1.4642685651779175,
"learning_rate": 0.001,
"loss": 1.0929,
"step": 242700
},
{
"epoch": 78.47446670976083,
"grad_norm": 1.8012272119522095,
"learning_rate": 0.001,
"loss": 1.1047,
"step": 242800
},
{
"epoch": 78.50678733031674,
"grad_norm": 1.8630805015563965,
"learning_rate": 0.001,
"loss": 1.1029,
"step": 242900
},
{
"epoch": 78.53910795087266,
"grad_norm": 1.4753458499908447,
"learning_rate": 0.001,
"loss": 1.1177,
"step": 243000
},
{
"epoch": 78.57142857142857,
"grad_norm": 2.500214099884033,
"learning_rate": 0.001,
"loss": 1.109,
"step": 243100
},
{
"epoch": 78.60374919198449,
"grad_norm": 1.680084466934204,
"learning_rate": 0.001,
"loss": 1.1114,
"step": 243200
},
{
"epoch": 78.6360698125404,
"grad_norm": 1.6189630031585693,
"learning_rate": 0.001,
"loss": 1.124,
"step": 243300
},
{
"epoch": 78.66839043309632,
"grad_norm": 1.7440742254257202,
"learning_rate": 0.001,
"loss": 1.1218,
"step": 243400
},
{
"epoch": 78.70071105365223,
"grad_norm": 1.8845442533493042,
"learning_rate": 0.001,
"loss": 1.1245,
"step": 243500
},
{
"epoch": 78.73303167420815,
"grad_norm": 1.6810959577560425,
"learning_rate": 0.001,
"loss": 1.123,
"step": 243600
},
{
"epoch": 78.76535229476406,
"grad_norm": 2.0967421531677246,
"learning_rate": 0.001,
"loss": 1.1421,
"step": 243700
},
{
"epoch": 78.79767291531998,
"grad_norm": 1.5401570796966553,
"learning_rate": 0.001,
"loss": 1.1464,
"step": 243800
},
{
"epoch": 78.82999353587589,
"grad_norm": 1.8480286598205566,
"learning_rate": 0.001,
"loss": 1.1326,
"step": 243900
},
{
"epoch": 78.86231415643181,
"grad_norm": 1.814274787902832,
"learning_rate": 0.001,
"loss": 1.1383,
"step": 244000
},
{
"epoch": 78.89463477698771,
"grad_norm": 2.2483580112457275,
"learning_rate": 0.001,
"loss": 1.157,
"step": 244100
},
{
"epoch": 78.92695539754364,
"grad_norm": 1.8789129257202148,
"learning_rate": 0.001,
"loss": 1.1414,
"step": 244200
},
{
"epoch": 78.95927601809954,
"grad_norm": 1.6194325685501099,
"learning_rate": 0.001,
"loss": 1.1507,
"step": 244300
},
{
"epoch": 78.99159663865547,
"grad_norm": 1.7908927202224731,
"learning_rate": 0.001,
"loss": 1.1566,
"step": 244400
},
{
"epoch": 79.02391725921137,
"grad_norm": 2.1163265705108643,
"learning_rate": 0.001,
"loss": 1.0654,
"step": 244500
},
{
"epoch": 79.0562378797673,
"grad_norm": 1.7553027868270874,
"learning_rate": 0.001,
"loss": 1.0315,
"step": 244600
},
{
"epoch": 79.0885585003232,
"grad_norm": 1.6223750114440918,
"learning_rate": 0.001,
"loss": 1.0521,
"step": 244700
},
{
"epoch": 79.12087912087912,
"grad_norm": 1.7326061725616455,
"learning_rate": 0.001,
"loss": 1.0585,
"step": 244800
},
{
"epoch": 79.15319974143503,
"grad_norm": 2.0212154388427734,
"learning_rate": 0.001,
"loss": 1.0589,
"step": 244900
},
{
"epoch": 79.18552036199095,
"grad_norm": 1.9795660972595215,
"learning_rate": 0.001,
"loss": 1.0626,
"step": 245000
},
{
"epoch": 79.21784098254686,
"grad_norm": 2.4282681941986084,
"learning_rate": 0.001,
"loss": 1.0559,
"step": 245100
},
{
"epoch": 79.25016160310278,
"grad_norm": 1.6875724792480469,
"learning_rate": 0.001,
"loss": 1.083,
"step": 245200
},
{
"epoch": 79.28248222365869,
"grad_norm": 2.1071879863739014,
"learning_rate": 0.001,
"loss": 1.0644,
"step": 245300
},
{
"epoch": 79.31480284421461,
"grad_norm": 1.889708161354065,
"learning_rate": 0.001,
"loss": 1.0704,
"step": 245400
},
{
"epoch": 79.34712346477052,
"grad_norm": 1.9262644052505493,
"learning_rate": 0.001,
"loss": 1.0836,
"step": 245500
},
{
"epoch": 79.37944408532644,
"grad_norm": 1.769736886024475,
"learning_rate": 0.001,
"loss": 1.0883,
"step": 245600
},
{
"epoch": 79.41176470588235,
"grad_norm": 1.7323601245880127,
"learning_rate": 0.001,
"loss": 1.0888,
"step": 245700
},
{
"epoch": 79.44408532643827,
"grad_norm": 1.7969982624053955,
"learning_rate": 0.001,
"loss": 1.0957,
"step": 245800
},
{
"epoch": 79.47640594699418,
"grad_norm": 1.693835735321045,
"learning_rate": 0.001,
"loss": 1.0999,
"step": 245900
},
{
"epoch": 79.5087265675501,
"grad_norm": 1.5995159149169922,
"learning_rate": 0.001,
"loss": 1.0871,
"step": 246000
},
{
"epoch": 79.541047188106,
"grad_norm": 1.8406943082809448,
"learning_rate": 0.001,
"loss": 1.0955,
"step": 246100
},
{
"epoch": 79.57336780866193,
"grad_norm": 1.8488978147506714,
"learning_rate": 0.001,
"loss": 1.1048,
"step": 246200
},
{
"epoch": 79.60568842921784,
"grad_norm": 1.6870185136795044,
"learning_rate": 0.001,
"loss": 1.0973,
"step": 246300
},
{
"epoch": 79.63800904977376,
"grad_norm": 1.5949409008026123,
"learning_rate": 0.001,
"loss": 1.1263,
"step": 246400
},
{
"epoch": 79.67032967032966,
"grad_norm": 1.8820393085479736,
"learning_rate": 0.001,
"loss": 1.1124,
"step": 246500
},
{
"epoch": 79.70265029088559,
"grad_norm": 1.714228868484497,
"learning_rate": 0.001,
"loss": 1.1208,
"step": 246600
},
{
"epoch": 79.7349709114415,
"grad_norm": 1.7019908428192139,
"learning_rate": 0.001,
"loss": 1.1255,
"step": 246700
},
{
"epoch": 79.76729153199742,
"grad_norm": 1.826647162437439,
"learning_rate": 0.001,
"loss": 1.1164,
"step": 246800
},
{
"epoch": 79.79961215255332,
"grad_norm": 1.647612452507019,
"learning_rate": 0.001,
"loss": 1.1187,
"step": 246900
},
{
"epoch": 79.83193277310924,
"grad_norm": 1.8978357315063477,
"learning_rate": 0.001,
"loss": 1.1302,
"step": 247000
},
{
"epoch": 79.86425339366515,
"grad_norm": 1.7612745761871338,
"learning_rate": 0.001,
"loss": 1.1271,
"step": 247100
},
{
"epoch": 79.89657401422107,
"grad_norm": 1.9454165697097778,
"learning_rate": 0.001,
"loss": 1.146,
"step": 247200
},
{
"epoch": 79.92889463477698,
"grad_norm": 1.8270703554153442,
"learning_rate": 0.001,
"loss": 1.1378,
"step": 247300
},
{
"epoch": 79.9612152553329,
"grad_norm": 1.6154886484146118,
"learning_rate": 0.001,
"loss": 1.1621,
"step": 247400
},
{
"epoch": 79.99353587588882,
"grad_norm": 2.264509916305542,
"learning_rate": 0.001,
"loss": 1.1423,
"step": 247500
},
{
"epoch": 80.02585649644473,
"grad_norm": 2.3280816078186035,
"learning_rate": 0.001,
"loss": 1.0525,
"step": 247600
},
{
"epoch": 80.05817711700065,
"grad_norm": 1.731689453125,
"learning_rate": 0.001,
"loss": 1.0274,
"step": 247700
},
{
"epoch": 80.09049773755656,
"grad_norm": 2.1244609355926514,
"learning_rate": 0.001,
"loss": 1.031,
"step": 247800
},
{
"epoch": 80.12281835811248,
"grad_norm": 1.8890272378921509,
"learning_rate": 0.001,
"loss": 1.0379,
"step": 247900
},
{
"epoch": 80.15513897866839,
"grad_norm": 2.0371882915496826,
"learning_rate": 0.001,
"loss": 1.0401,
"step": 248000
},
{
"epoch": 80.18745959922431,
"grad_norm": 1.8187429904937744,
"learning_rate": 0.001,
"loss": 1.0508,
"step": 248100
},
{
"epoch": 80.21978021978022,
"grad_norm": 1.776618480682373,
"learning_rate": 0.001,
"loss": 1.0607,
"step": 248200
},
{
"epoch": 80.25210084033614,
"grad_norm": 1.967862844467163,
"learning_rate": 0.001,
"loss": 1.0589,
"step": 248300
},
{
"epoch": 80.28442146089205,
"grad_norm": 1.8256206512451172,
"learning_rate": 0.001,
"loss": 1.066,
"step": 248400
},
{
"epoch": 80.31674208144797,
"grad_norm": 2.1693742275238037,
"learning_rate": 0.001,
"loss": 1.0786,
"step": 248500
},
{
"epoch": 80.34906270200388,
"grad_norm": 2.159891128540039,
"learning_rate": 0.001,
"loss": 1.0698,
"step": 248600
},
{
"epoch": 80.3813833225598,
"grad_norm": 1.8442610502243042,
"learning_rate": 0.001,
"loss": 1.0768,
"step": 248700
},
{
"epoch": 80.4137039431157,
"grad_norm": 1.802513599395752,
"learning_rate": 0.001,
"loss": 1.0811,
"step": 248800
},
{
"epoch": 80.44602456367163,
"grad_norm": 1.897873878479004,
"learning_rate": 0.001,
"loss": 1.0844,
"step": 248900
},
{
"epoch": 80.47834518422754,
"grad_norm": 2.003659248352051,
"learning_rate": 0.001,
"loss": 1.0747,
"step": 249000
},
{
"epoch": 80.51066580478346,
"grad_norm": 1.620818853378296,
"learning_rate": 0.001,
"loss": 1.0853,
"step": 249100
},
{
"epoch": 80.54298642533936,
"grad_norm": 1.9488434791564941,
"learning_rate": 0.001,
"loss": 1.0955,
"step": 249200
},
{
"epoch": 80.57530704589529,
"grad_norm": 1.8911210298538208,
"learning_rate": 0.001,
"loss": 1.1133,
"step": 249300
},
{
"epoch": 80.6076276664512,
"grad_norm": 2.607034206390381,
"learning_rate": 0.001,
"loss": 1.0753,
"step": 249400
},
{
"epoch": 80.63994828700712,
"grad_norm": 1.5252186059951782,
"learning_rate": 0.001,
"loss": 1.0995,
"step": 249500
},
{
"epoch": 80.67226890756302,
"grad_norm": 1.8382611274719238,
"learning_rate": 0.001,
"loss": 1.1067,
"step": 249600
},
{
"epoch": 80.70458952811894,
"grad_norm": 1.9256694316864014,
"learning_rate": 0.001,
"loss": 1.113,
"step": 249700
},
{
"epoch": 80.73691014867485,
"grad_norm": 2.985775947570801,
"learning_rate": 0.001,
"loss": 1.1075,
"step": 249800
},
{
"epoch": 80.76923076923077,
"grad_norm": 1.662009835243225,
"learning_rate": 0.001,
"loss": 1.1271,
"step": 249900
},
{
"epoch": 80.80155138978668,
"grad_norm": 2.561980724334717,
"learning_rate": 0.001,
"loss": 1.1167,
"step": 250000
},
{
"epoch": 80.8338720103426,
"grad_norm": 1.9441149234771729,
"learning_rate": 0.001,
"loss": 1.1162,
"step": 250100
},
{
"epoch": 80.86619263089851,
"grad_norm": 2.120574474334717,
"learning_rate": 0.001,
"loss": 1.1231,
"step": 250200
},
{
"epoch": 80.89851325145443,
"grad_norm": 1.7940996885299683,
"learning_rate": 0.001,
"loss": 1.1185,
"step": 250300
},
{
"epoch": 80.93083387201034,
"grad_norm": 2.060868263244629,
"learning_rate": 0.001,
"loss": 1.1226,
"step": 250400
},
{
"epoch": 80.96315449256626,
"grad_norm": 1.9433925151824951,
"learning_rate": 0.001,
"loss": 1.1511,
"step": 250500
},
{
"epoch": 80.99547511312217,
"grad_norm": 2.249880075454712,
"learning_rate": 0.001,
"loss": 1.1191,
"step": 250600
},
{
"epoch": 81.02779573367809,
"grad_norm": 2.3257598876953125,
"learning_rate": 0.001,
"loss": 1.0245,
"step": 250700
},
{
"epoch": 81.060116354234,
"grad_norm": 2.1970770359039307,
"learning_rate": 0.001,
"loss": 1.02,
"step": 250800
},
{
"epoch": 81.09243697478992,
"grad_norm": 1.8370674848556519,
"learning_rate": 0.001,
"loss": 1.0309,
"step": 250900
},
{
"epoch": 81.12475759534583,
"grad_norm": 2.330951452255249,
"learning_rate": 0.001,
"loss": 1.0438,
"step": 251000
},
{
"epoch": 81.15707821590175,
"grad_norm": 1.8591450452804565,
"learning_rate": 0.001,
"loss": 1.0488,
"step": 251100
},
{
"epoch": 81.18939883645766,
"grad_norm": 2.0385501384735107,
"learning_rate": 0.001,
"loss": 1.0499,
"step": 251200
},
{
"epoch": 81.22171945701358,
"grad_norm": 1.6502009630203247,
"learning_rate": 0.001,
"loss": 1.0435,
"step": 251300
},
{
"epoch": 81.25404007756948,
"grad_norm": 2.523043394088745,
"learning_rate": 0.001,
"loss": 1.0426,
"step": 251400
},
{
"epoch": 81.2863606981254,
"grad_norm": 2.149442434310913,
"learning_rate": 0.001,
"loss": 1.0617,
"step": 251500
},
{
"epoch": 81.31868131868131,
"grad_norm": 1.8482860326766968,
"learning_rate": 0.001,
"loss": 1.0735,
"step": 251600
},
{
"epoch": 81.35100193923724,
"grad_norm": 2.3649866580963135,
"learning_rate": 0.001,
"loss": 1.0445,
"step": 251700
},
{
"epoch": 81.38332255979314,
"grad_norm": 1.7468416690826416,
"learning_rate": 0.001,
"loss": 1.0568,
"step": 251800
},
{
"epoch": 81.41564318034906,
"grad_norm": 2.0502419471740723,
"learning_rate": 0.001,
"loss": 1.0767,
"step": 251900
},
{
"epoch": 81.44796380090497,
"grad_norm": 1.6154065132141113,
"learning_rate": 0.001,
"loss": 1.0785,
"step": 252000
},
{
"epoch": 81.4802844214609,
"grad_norm": 1.6533535718917847,
"learning_rate": 0.001,
"loss": 1.0839,
"step": 252100
},
{
"epoch": 81.5126050420168,
"grad_norm": 1.7687009572982788,
"learning_rate": 0.001,
"loss": 1.0721,
"step": 252200
},
{
"epoch": 81.54492566257272,
"grad_norm": 1.9510821104049683,
"learning_rate": 0.001,
"loss": 1.0825,
"step": 252300
},
{
"epoch": 81.57724628312863,
"grad_norm": 2.1591577529907227,
"learning_rate": 0.001,
"loss": 1.0866,
"step": 252400
},
{
"epoch": 81.60956690368455,
"grad_norm": 2.1044604778289795,
"learning_rate": 0.001,
"loss": 1.1099,
"step": 252500
},
{
"epoch": 81.64188752424046,
"grad_norm": 2.2758374214172363,
"learning_rate": 0.001,
"loss": 1.0963,
"step": 252600
},
{
"epoch": 81.67420814479638,
"grad_norm": 2.09716534614563,
"learning_rate": 0.001,
"loss": 1.0998,
"step": 252700
},
{
"epoch": 81.70652876535229,
"grad_norm": 2.193350076675415,
"learning_rate": 0.001,
"loss": 1.0906,
"step": 252800
},
{
"epoch": 81.73884938590821,
"grad_norm": 1.868490219116211,
"learning_rate": 0.001,
"loss": 1.1172,
"step": 252900
},
{
"epoch": 81.77117000646412,
"grad_norm": 2.276905059814453,
"learning_rate": 0.001,
"loss": 1.1076,
"step": 253000
},
{
"epoch": 81.80349062702004,
"grad_norm": 1.9997442960739136,
"learning_rate": 0.001,
"loss": 1.1122,
"step": 253100
},
{
"epoch": 81.83581124757595,
"grad_norm": 2.066814422607422,
"learning_rate": 0.001,
"loss": 1.1113,
"step": 253200
},
{
"epoch": 81.86813186813187,
"grad_norm": 2.0630595684051514,
"learning_rate": 0.001,
"loss": 1.14,
"step": 253300
},
{
"epoch": 81.90045248868778,
"grad_norm": 1.7941197156906128,
"learning_rate": 0.001,
"loss": 1.1236,
"step": 253400
},
{
"epoch": 81.9327731092437,
"grad_norm": 1.8599138259887695,
"learning_rate": 0.001,
"loss": 1.1066,
"step": 253500
},
{
"epoch": 81.9650937297996,
"grad_norm": 2.007969379425049,
"learning_rate": 0.001,
"loss": 1.1161,
"step": 253600
},
{
"epoch": 81.99741435035553,
"grad_norm": 2.2284483909606934,
"learning_rate": 0.001,
"loss": 1.1156,
"step": 253700
},
{
"epoch": 82.02973497091145,
"grad_norm": 2.0131847858428955,
"learning_rate": 0.001,
"loss": 1.0191,
"step": 253800
},
{
"epoch": 82.06205559146736,
"grad_norm": 2.0108609199523926,
"learning_rate": 0.001,
"loss": 1.01,
"step": 253900
},
{
"epoch": 82.09437621202328,
"grad_norm": 2.120751142501831,
"learning_rate": 0.001,
"loss": 1.0098,
"step": 254000
},
{
"epoch": 82.12669683257919,
"grad_norm": 2.449652671813965,
"learning_rate": 0.001,
"loss": 1.021,
"step": 254100
},
{
"epoch": 82.1590174531351,
"grad_norm": 2.4260470867156982,
"learning_rate": 0.001,
"loss": 1.0347,
"step": 254200
},
{
"epoch": 82.19133807369101,
"grad_norm": 2.6759181022644043,
"learning_rate": 0.001,
"loss": 1.0325,
"step": 254300
},
{
"epoch": 82.22365869424694,
"grad_norm": 2.0493171215057373,
"learning_rate": 0.001,
"loss": 1.0422,
"step": 254400
},
{
"epoch": 82.25597931480284,
"grad_norm": 2.4781503677368164,
"learning_rate": 0.001,
"loss": 1.0344,
"step": 254500
},
{
"epoch": 82.28829993535876,
"grad_norm": 3.0941216945648193,
"learning_rate": 0.001,
"loss": 1.0478,
"step": 254600
},
{
"epoch": 82.32062055591467,
"grad_norm": 2.149822235107422,
"learning_rate": 0.001,
"loss": 1.0428,
"step": 254700
},
{
"epoch": 82.3529411764706,
"grad_norm": 1.9726589918136597,
"learning_rate": 0.001,
"loss": 1.0629,
"step": 254800
},
{
"epoch": 82.3852617970265,
"grad_norm": 2.1840827465057373,
"learning_rate": 0.001,
"loss": 1.0648,
"step": 254900
},
{
"epoch": 82.41758241758242,
"grad_norm": 2.0328707695007324,
"learning_rate": 0.001,
"loss": 1.0667,
"step": 255000
},
{
"epoch": 82.44990303813833,
"grad_norm": 2.3106980323791504,
"learning_rate": 0.001,
"loss": 1.0656,
"step": 255100
},
{
"epoch": 82.48222365869425,
"grad_norm": 2.14380145072937,
"learning_rate": 0.001,
"loss": 1.0788,
"step": 255200
},
{
"epoch": 82.51454427925016,
"grad_norm": 1.6182340383529663,
"learning_rate": 0.001,
"loss": 1.0802,
"step": 255300
},
{
"epoch": 82.54686489980608,
"grad_norm": 2.0344672203063965,
"learning_rate": 0.001,
"loss": 1.0871,
"step": 255400
},
{
"epoch": 82.57918552036199,
"grad_norm": 1.7792701721191406,
"learning_rate": 0.001,
"loss": 1.0804,
"step": 255500
},
{
"epoch": 82.61150614091791,
"grad_norm": 2.1773808002471924,
"learning_rate": 0.001,
"loss": 1.0792,
"step": 255600
},
{
"epoch": 82.64382676147382,
"grad_norm": 1.9904121160507202,
"learning_rate": 0.001,
"loss": 1.0807,
"step": 255700
},
{
"epoch": 82.67614738202974,
"grad_norm": 2.2778069972991943,
"learning_rate": 0.001,
"loss": 1.0802,
"step": 255800
},
{
"epoch": 82.70846800258565,
"grad_norm": 2.273298978805542,
"learning_rate": 0.001,
"loss": 1.1049,
"step": 255900
},
{
"epoch": 82.74078862314157,
"grad_norm": 2.3957090377807617,
"learning_rate": 0.001,
"loss": 1.0932,
"step": 256000
},
{
"epoch": 82.77310924369748,
"grad_norm": 1.931657314300537,
"learning_rate": 0.001,
"loss": 1.1028,
"step": 256100
},
{
"epoch": 82.8054298642534,
"grad_norm": 1.7905445098876953,
"learning_rate": 0.001,
"loss": 1.0865,
"step": 256200
},
{
"epoch": 82.8377504848093,
"grad_norm": 1.855185627937317,
"learning_rate": 0.001,
"loss": 1.0979,
"step": 256300
},
{
"epoch": 82.87007110536523,
"grad_norm": 1.7903704643249512,
"learning_rate": 0.001,
"loss": 1.0997,
"step": 256400
},
{
"epoch": 82.90239172592113,
"grad_norm": 2.024670124053955,
"learning_rate": 0.001,
"loss": 1.0971,
"step": 256500
},
{
"epoch": 82.93471234647706,
"grad_norm": 2.054471492767334,
"learning_rate": 0.001,
"loss": 1.1191,
"step": 256600
},
{
"epoch": 82.96703296703296,
"grad_norm": 2.021584987640381,
"learning_rate": 0.001,
"loss": 1.1027,
"step": 256700
},
{
"epoch": 82.99935358758889,
"grad_norm": 2.977588176727295,
"learning_rate": 0.001,
"loss": 1.0963,
"step": 256800
},
{
"epoch": 83.03167420814479,
"grad_norm": 2.254401922225952,
"learning_rate": 0.001,
"loss": 0.9857,
"step": 256900
},
{
"epoch": 83.06399482870071,
"grad_norm": 2.0758557319641113,
"learning_rate": 0.001,
"loss": 0.9981,
"step": 257000
},
{
"epoch": 83.09631544925662,
"grad_norm": 2.851766586303711,
"learning_rate": 0.001,
"loss": 1.0049,
"step": 257100
},
{
"epoch": 83.12863606981254,
"grad_norm": 1.9387279748916626,
"learning_rate": 0.001,
"loss": 1.0158,
"step": 257200
},
{
"epoch": 83.16095669036845,
"grad_norm": 2.2792084217071533,
"learning_rate": 0.001,
"loss": 1.0109,
"step": 257300
},
{
"epoch": 83.19327731092437,
"grad_norm": 2.8158106803894043,
"learning_rate": 0.001,
"loss": 1.0325,
"step": 257400
},
{
"epoch": 83.22559793148028,
"grad_norm": 1.8931865692138672,
"learning_rate": 0.001,
"loss": 1.0084,
"step": 257500
},
{
"epoch": 83.2579185520362,
"grad_norm": 2.0553183555603027,
"learning_rate": 0.001,
"loss": 1.0223,
"step": 257600
},
{
"epoch": 83.29023917259211,
"grad_norm": 2.032785415649414,
"learning_rate": 0.001,
"loss": 1.0368,
"step": 257700
},
{
"epoch": 83.32255979314803,
"grad_norm": 2.276414632797241,
"learning_rate": 0.001,
"loss": 1.0434,
"step": 257800
},
{
"epoch": 83.35488041370394,
"grad_norm": 2.2769389152526855,
"learning_rate": 0.001,
"loss": 1.0405,
"step": 257900
},
{
"epoch": 83.38720103425986,
"grad_norm": 2.457798719406128,
"learning_rate": 0.001,
"loss": 1.0536,
"step": 258000
},
{
"epoch": 83.41952165481577,
"grad_norm": 2.427795886993408,
"learning_rate": 0.001,
"loss": 1.0574,
"step": 258100
},
{
"epoch": 83.45184227537169,
"grad_norm": 2.3485844135284424,
"learning_rate": 0.001,
"loss": 1.0604,
"step": 258200
},
{
"epoch": 83.4841628959276,
"grad_norm": 2.122995615005493,
"learning_rate": 0.001,
"loss": 1.0593,
"step": 258300
},
{
"epoch": 83.51648351648352,
"grad_norm": 2.6342482566833496,
"learning_rate": 0.001,
"loss": 1.0711,
"step": 258400
},
{
"epoch": 83.54880413703943,
"grad_norm": 2.1874687671661377,
"learning_rate": 0.001,
"loss": 1.063,
"step": 258500
},
{
"epoch": 83.58112475759535,
"grad_norm": 2.3595123291015625,
"learning_rate": 0.001,
"loss": 1.0756,
"step": 258600
},
{
"epoch": 83.61344537815125,
"grad_norm": 2.185002326965332,
"learning_rate": 0.001,
"loss": 1.0728,
"step": 258700
},
{
"epoch": 83.64576599870718,
"grad_norm": 2.350257158279419,
"learning_rate": 0.001,
"loss": 1.0819,
"step": 258800
},
{
"epoch": 83.67808661926308,
"grad_norm": 2.661860227584839,
"learning_rate": 0.001,
"loss": 1.0873,
"step": 258900
},
{
"epoch": 83.710407239819,
"grad_norm": 2.071593999862671,
"learning_rate": 0.001,
"loss": 1.0969,
"step": 259000
},
{
"epoch": 83.74272786037491,
"grad_norm": 2.097931146621704,
"learning_rate": 0.001,
"loss": 1.0821,
"step": 259100
},
{
"epoch": 83.77504848093083,
"grad_norm": 2.636651039123535,
"learning_rate": 0.001,
"loss": 1.0815,
"step": 259200
},
{
"epoch": 83.80736910148674,
"grad_norm": 2.398634433746338,
"learning_rate": 0.001,
"loss": 1.0849,
"step": 259300
},
{
"epoch": 83.83968972204266,
"grad_norm": 1.94718599319458,
"learning_rate": 0.001,
"loss": 1.0987,
"step": 259400
},
{
"epoch": 83.87201034259857,
"grad_norm": 2.3222267627716064,
"learning_rate": 0.001,
"loss": 1.0896,
"step": 259500
},
{
"epoch": 83.9043309631545,
"grad_norm": 2.0322906970977783,
"learning_rate": 0.001,
"loss": 1.0968,
"step": 259600
},
{
"epoch": 83.9366515837104,
"grad_norm": 2.293200731277466,
"learning_rate": 0.001,
"loss": 1.1018,
"step": 259700
},
{
"epoch": 83.96897220426632,
"grad_norm": 2.212113380432129,
"learning_rate": 0.001,
"loss": 1.1003,
"step": 259800
},
{
"epoch": 84.00129282482224,
"grad_norm": 1.9160456657409668,
"learning_rate": 0.001,
"loss": 1.1094,
"step": 259900
},
{
"epoch": 84.03361344537815,
"grad_norm": 2.0068440437316895,
"learning_rate": 0.001,
"loss": 0.9994,
"step": 260000
},
{
"epoch": 84.06593406593407,
"grad_norm": 1.8730653524398804,
"learning_rate": 0.001,
"loss": 0.9916,
"step": 260100
},
{
"epoch": 84.09825468648998,
"grad_norm": 1.9432926177978516,
"learning_rate": 0.001,
"loss": 0.9884,
"step": 260200
},
{
"epoch": 84.1305753070459,
"grad_norm": 2.5531952381134033,
"learning_rate": 0.001,
"loss": 1.0135,
"step": 260300
},
{
"epoch": 84.16289592760181,
"grad_norm": 1.827590823173523,
"learning_rate": 0.001,
"loss": 1.0184,
"step": 260400
},
{
"epoch": 84.19521654815773,
"grad_norm": 1.953426718711853,
"learning_rate": 0.001,
"loss": 1.0346,
"step": 260500
},
{
"epoch": 84.22753716871364,
"grad_norm": 1.895742416381836,
"learning_rate": 0.001,
"loss": 1.0192,
"step": 260600
},
{
"epoch": 84.25985778926956,
"grad_norm": 2.4154608249664307,
"learning_rate": 0.001,
"loss": 1.0212,
"step": 260700
},
{
"epoch": 84.29217840982547,
"grad_norm": 1.9845346212387085,
"learning_rate": 0.001,
"loss": 1.0385,
"step": 260800
},
{
"epoch": 84.32449903038139,
"grad_norm": 1.9528859853744507,
"learning_rate": 0.001,
"loss": 1.038,
"step": 260900
},
{
"epoch": 84.3568196509373,
"grad_norm": 1.5986595153808594,
"learning_rate": 0.001,
"loss": 1.0472,
"step": 261000
},
{
"epoch": 84.38914027149322,
"grad_norm": 2.879544734954834,
"learning_rate": 0.001,
"loss": 1.0387,
"step": 261100
},
{
"epoch": 84.42146089204913,
"grad_norm": 2.07099986076355,
"learning_rate": 0.001,
"loss": 1.0485,
"step": 261200
},
{
"epoch": 84.45378151260505,
"grad_norm": 1.9697378873825073,
"learning_rate": 0.001,
"loss": 1.046,
"step": 261300
},
{
"epoch": 84.48610213316095,
"grad_norm": 2.495154619216919,
"learning_rate": 0.001,
"loss": 1.0427,
"step": 261400
},
{
"epoch": 84.51842275371688,
"grad_norm": 2.241727113723755,
"learning_rate": 0.001,
"loss": 1.0599,
"step": 261500
},
{
"epoch": 84.55074337427278,
"grad_norm": 2.0560214519500732,
"learning_rate": 0.001,
"loss": 1.0515,
"step": 261600
},
{
"epoch": 84.5830639948287,
"grad_norm": 2.1529815196990967,
"learning_rate": 0.001,
"loss": 1.0516,
"step": 261700
},
{
"epoch": 84.61538461538461,
"grad_norm": 2.4618332386016846,
"learning_rate": 0.001,
"loss": 1.0721,
"step": 261800
},
{
"epoch": 84.64770523594053,
"grad_norm": 1.9415512084960938,
"learning_rate": 0.001,
"loss": 1.0545,
"step": 261900
},
{
"epoch": 84.68002585649644,
"grad_norm": 40.67611312866211,
"learning_rate": 0.001,
"loss": 1.0695,
"step": 262000
},
{
"epoch": 84.71234647705236,
"grad_norm": 1.830438256263733,
"learning_rate": 0.001,
"loss": 1.0702,
"step": 262100
},
{
"epoch": 84.74466709760827,
"grad_norm": 2.0758326053619385,
"learning_rate": 0.001,
"loss": 1.0715,
"step": 262200
},
{
"epoch": 84.7769877181642,
"grad_norm": 1.9768143892288208,
"learning_rate": 0.001,
"loss": 1.0926,
"step": 262300
},
{
"epoch": 84.8093083387201,
"grad_norm": 2.4921510219573975,
"learning_rate": 0.001,
"loss": 1.0887,
"step": 262400
},
{
"epoch": 84.84162895927602,
"grad_norm": 1.8019990921020508,
"learning_rate": 0.001,
"loss": 1.1003,
"step": 262500
},
{
"epoch": 84.87394957983193,
"grad_norm": 1.7619433403015137,
"learning_rate": 0.001,
"loss": 1.0909,
"step": 262600
},
{
"epoch": 84.90627020038785,
"grad_norm": 2.410701274871826,
"learning_rate": 0.001,
"loss": 1.094,
"step": 262700
},
{
"epoch": 84.93859082094376,
"grad_norm": 2.021223306655884,
"learning_rate": 0.001,
"loss": 1.0909,
"step": 262800
},
{
"epoch": 84.97091144149968,
"grad_norm": 10.952349662780762,
"learning_rate": 0.001,
"loss": 1.1052,
"step": 262900
},
{
"epoch": 85.00323206205559,
"grad_norm": 1.784826636314392,
"learning_rate": 0.001,
"loss": 1.1001,
"step": 263000
},
{
"epoch": 85.03555268261151,
"grad_norm": 2.587707042694092,
"learning_rate": 0.001,
"loss": 0.9838,
"step": 263100
},
{
"epoch": 85.06787330316742,
"grad_norm": 2.302337169647217,
"learning_rate": 0.001,
"loss": 0.9897,
"step": 263200
},
{
"epoch": 85.10019392372334,
"grad_norm": 1.8791141510009766,
"learning_rate": 0.001,
"loss": 0.9905,
"step": 263300
},
{
"epoch": 85.13251454427925,
"grad_norm": 1.891874074935913,
"learning_rate": 0.001,
"loss": 0.9889,
"step": 263400
},
{
"epoch": 85.16483516483517,
"grad_norm": 2.0654654502868652,
"learning_rate": 0.001,
"loss": 0.9969,
"step": 263500
},
{
"epoch": 85.19715578539108,
"grad_norm": 2.999326229095459,
"learning_rate": 0.001,
"loss": 1.0026,
"step": 263600
},
{
"epoch": 85.229476405947,
"grad_norm": 2.2999064922332764,
"learning_rate": 0.001,
"loss": 1.0114,
"step": 263700
},
{
"epoch": 85.2617970265029,
"grad_norm": 2.1461431980133057,
"learning_rate": 0.001,
"loss": 1.0403,
"step": 263800
},
{
"epoch": 85.29411764705883,
"grad_norm": 1.715585708618164,
"learning_rate": 0.001,
"loss": 1.0147,
"step": 263900
},
{
"epoch": 85.32643826761473,
"grad_norm": 2.0307717323303223,
"learning_rate": 0.001,
"loss": 1.0311,
"step": 264000
},
{
"epoch": 85.35875888817066,
"grad_norm": 1.924255609512329,
"learning_rate": 0.001,
"loss": 1.0376,
"step": 264100
},
{
"epoch": 85.39107950872656,
"grad_norm": 3.5688138008117676,
"learning_rate": 0.001,
"loss": 1.0331,
"step": 264200
},
{
"epoch": 85.42340012928248,
"grad_norm": 1.627633810043335,
"learning_rate": 0.001,
"loss": 1.0413,
"step": 264300
},
{
"epoch": 85.45572074983839,
"grad_norm": 2.091357946395874,
"learning_rate": 0.001,
"loss": 1.0438,
"step": 264400
},
{
"epoch": 85.48804137039431,
"grad_norm": 1.7921959161758423,
"learning_rate": 0.001,
"loss": 1.0332,
"step": 264500
},
{
"epoch": 85.52036199095022,
"grad_norm": 1.9144126176834106,
"learning_rate": 0.001,
"loss": 1.0537,
"step": 264600
},
{
"epoch": 85.55268261150614,
"grad_norm": 1.995881199836731,
"learning_rate": 0.001,
"loss": 1.0525,
"step": 264700
},
{
"epoch": 85.58500323206205,
"grad_norm": 1.8649122714996338,
"learning_rate": 0.001,
"loss": 1.0506,
"step": 264800
},
{
"epoch": 85.61732385261797,
"grad_norm": 1.7572340965270996,
"learning_rate": 0.001,
"loss": 1.0468,
"step": 264900
},
{
"epoch": 85.64964447317388,
"grad_norm": 2.5630545616149902,
"learning_rate": 0.001,
"loss": 1.0482,
"step": 265000
},
{
"epoch": 85.6819650937298,
"grad_norm": 1.790952205657959,
"learning_rate": 0.001,
"loss": 1.0607,
"step": 265100
},
{
"epoch": 85.71428571428571,
"grad_norm": 1.7042549848556519,
"learning_rate": 0.001,
"loss": 1.0452,
"step": 265200
},
{
"epoch": 85.74660633484163,
"grad_norm": 2.4065299034118652,
"learning_rate": 0.001,
"loss": 1.0821,
"step": 265300
},
{
"epoch": 85.77892695539754,
"grad_norm": 1.7829989194869995,
"learning_rate": 0.001,
"loss": 1.0725,
"step": 265400
},
{
"epoch": 85.81124757595346,
"grad_norm": 2.1033124923706055,
"learning_rate": 0.001,
"loss": 1.0794,
"step": 265500
},
{
"epoch": 85.84356819650937,
"grad_norm": 1.9515258073806763,
"learning_rate": 0.001,
"loss": 1.0688,
"step": 265600
},
{
"epoch": 85.87588881706529,
"grad_norm": 1.796294927597046,
"learning_rate": 0.001,
"loss": 1.0845,
"step": 265700
},
{
"epoch": 85.9082094376212,
"grad_norm": 1.7326546907424927,
"learning_rate": 0.001,
"loss": 1.0781,
"step": 265800
},
{
"epoch": 85.94053005817712,
"grad_norm": 1.7627480030059814,
"learning_rate": 0.001,
"loss": 1.0979,
"step": 265900
},
{
"epoch": 85.97285067873302,
"grad_norm": 2.2406697273254395,
"learning_rate": 0.001,
"loss": 1.0872,
"step": 266000
},
{
"epoch": 86.00517129928895,
"grad_norm": 1.6462572813034058,
"learning_rate": 0.001,
"loss": 1.08,
"step": 266100
},
{
"epoch": 86.03749191984487,
"grad_norm": 1.6872225999832153,
"learning_rate": 0.001,
"loss": 0.9723,
"step": 266200
},
{
"epoch": 86.06981254040078,
"grad_norm": 2.2746024131774902,
"learning_rate": 0.001,
"loss": 0.9815,
"step": 266300
},
{
"epoch": 86.1021331609567,
"grad_norm": 1.6541063785552979,
"learning_rate": 0.001,
"loss": 0.9831,
"step": 266400
},
{
"epoch": 86.1344537815126,
"grad_norm": 1.9259800910949707,
"learning_rate": 0.001,
"loss": 0.9992,
"step": 266500
},
{
"epoch": 86.16677440206853,
"grad_norm": 1.8844987154006958,
"learning_rate": 0.001,
"loss": 0.9949,
"step": 266600
},
{
"epoch": 86.19909502262443,
"grad_norm": 1.5561342239379883,
"learning_rate": 0.001,
"loss": 0.9989,
"step": 266700
},
{
"epoch": 86.23141564318036,
"grad_norm": 2.4345591068267822,
"learning_rate": 0.001,
"loss": 0.9958,
"step": 266800
},
{
"epoch": 86.26373626373626,
"grad_norm": 1.8101435899734497,
"learning_rate": 0.001,
"loss": 1.0139,
"step": 266900
},
{
"epoch": 86.29605688429218,
"grad_norm": 2.0286176204681396,
"learning_rate": 0.001,
"loss": 1.0251,
"step": 267000
},
{
"epoch": 86.32837750484809,
"grad_norm": 1.9802911281585693,
"learning_rate": 0.001,
"loss": 1.0139,
"step": 267100
},
{
"epoch": 86.36069812540401,
"grad_norm": 1.6296907663345337,
"learning_rate": 0.001,
"loss": 1.0182,
"step": 267200
},
{
"epoch": 86.39301874595992,
"grad_norm": 2.130852460861206,
"learning_rate": 0.001,
"loss": 1.0272,
"step": 267300
},
{
"epoch": 86.42533936651584,
"grad_norm": 1.6675856113433838,
"learning_rate": 0.001,
"loss": 1.0309,
"step": 267400
},
{
"epoch": 86.45765998707175,
"grad_norm": 1.873880386352539,
"learning_rate": 0.001,
"loss": 1.02,
"step": 267500
},
{
"epoch": 86.48998060762767,
"grad_norm": 1.6466994285583496,
"learning_rate": 0.001,
"loss": 1.036,
"step": 267600
},
{
"epoch": 86.52230122818358,
"grad_norm": 2.3659939765930176,
"learning_rate": 0.001,
"loss": 1.0379,
"step": 267700
},
{
"epoch": 86.5546218487395,
"grad_norm": 1.846489667892456,
"learning_rate": 0.001,
"loss": 1.0386,
"step": 267800
},
{
"epoch": 86.58694246929541,
"grad_norm": 1.9823304414749146,
"learning_rate": 0.001,
"loss": 1.0549,
"step": 267900
},
{
"epoch": 86.61926308985133,
"grad_norm": 2.0953738689422607,
"learning_rate": 0.001,
"loss": 1.0404,
"step": 268000
},
{
"epoch": 86.65158371040724,
"grad_norm": 5.9522705078125,
"learning_rate": 0.001,
"loss": 1.0526,
"step": 268100
},
{
"epoch": 86.68390433096316,
"grad_norm": 1.8562268018722534,
"learning_rate": 0.001,
"loss": 1.0652,
"step": 268200
},
{
"epoch": 86.71622495151907,
"grad_norm": 1.8963710069656372,
"learning_rate": 0.001,
"loss": 1.0499,
"step": 268300
},
{
"epoch": 86.74854557207499,
"grad_norm": 1.8700145483016968,
"learning_rate": 0.001,
"loss": 1.0587,
"step": 268400
},
{
"epoch": 86.7808661926309,
"grad_norm": 1.7103973627090454,
"learning_rate": 0.001,
"loss": 1.0584,
"step": 268500
},
{
"epoch": 86.81318681318682,
"grad_norm": 1.864179253578186,
"learning_rate": 0.001,
"loss": 1.0672,
"step": 268600
},
{
"epoch": 86.84550743374272,
"grad_norm": 2.295189380645752,
"learning_rate": 0.001,
"loss": 1.0639,
"step": 268700
},
{
"epoch": 86.87782805429865,
"grad_norm": 1.8950257301330566,
"learning_rate": 0.001,
"loss": 1.0659,
"step": 268800
},
{
"epoch": 86.91014867485455,
"grad_norm": 2.016186237335205,
"learning_rate": 0.001,
"loss": 1.0682,
"step": 268900
},
{
"epoch": 86.94246929541048,
"grad_norm": 1.8546134233474731,
"learning_rate": 0.001,
"loss": 1.0724,
"step": 269000
},
{
"epoch": 86.97478991596638,
"grad_norm": 1.4972987174987793,
"learning_rate": 0.001,
"loss": 1.0884,
"step": 269100
},
{
"epoch": 87.0071105365223,
"grad_norm": 2.2357635498046875,
"learning_rate": 0.001,
"loss": 1.0689,
"step": 269200
},
{
"epoch": 87.03943115707821,
"grad_norm": 1.957870602607727,
"learning_rate": 0.001,
"loss": 0.9617,
"step": 269300
},
{
"epoch": 87.07175177763413,
"grad_norm": 1.9988821744918823,
"learning_rate": 0.001,
"loss": 0.9642,
"step": 269400
},
{
"epoch": 87.10407239819004,
"grad_norm": 1.902817964553833,
"learning_rate": 0.001,
"loss": 0.9724,
"step": 269500
},
{
"epoch": 87.13639301874596,
"grad_norm": 1.8384301662445068,
"learning_rate": 0.001,
"loss": 0.9825,
"step": 269600
},
{
"epoch": 87.16871363930187,
"grad_norm": 1.9688271284103394,
"learning_rate": 0.001,
"loss": 0.9702,
"step": 269700
},
{
"epoch": 87.20103425985779,
"grad_norm": 1.5213277339935303,
"learning_rate": 0.001,
"loss": 0.9891,
"step": 269800
},
{
"epoch": 87.2333548804137,
"grad_norm": 1.8877270221710205,
"learning_rate": 0.001,
"loss": 0.9974,
"step": 269900
},
{
"epoch": 87.26567550096962,
"grad_norm": 1.5288257598876953,
"learning_rate": 0.001,
"loss": 0.999,
"step": 270000
},
{
"epoch": 87.29799612152553,
"grad_norm": 1.6828035116195679,
"learning_rate": 0.001,
"loss": 1.0073,
"step": 270100
},
{
"epoch": 87.33031674208145,
"grad_norm": 1.9386004209518433,
"learning_rate": 0.001,
"loss": 1.0163,
"step": 270200
},
{
"epoch": 87.36263736263736,
"grad_norm": 1.8773216009140015,
"learning_rate": 0.001,
"loss": 1.0245,
"step": 270300
},
{
"epoch": 87.39495798319328,
"grad_norm": 1.8428542613983154,
"learning_rate": 0.001,
"loss": 1.0207,
"step": 270400
},
{
"epoch": 87.42727860374919,
"grad_norm": 2.0465738773345947,
"learning_rate": 0.001,
"loss": 1.0034,
"step": 270500
},
{
"epoch": 87.45959922430511,
"grad_norm": 2.624429225921631,
"learning_rate": 0.001,
"loss": 1.0296,
"step": 270600
},
{
"epoch": 87.49191984486102,
"grad_norm": 2.0403313636779785,
"learning_rate": 0.001,
"loss": 1.0304,
"step": 270700
},
{
"epoch": 87.52424046541694,
"grad_norm": 1.6845457553863525,
"learning_rate": 0.001,
"loss": 1.0235,
"step": 270800
},
{
"epoch": 87.55656108597285,
"grad_norm": 1.5582964420318604,
"learning_rate": 0.001,
"loss": 1.0235,
"step": 270900
},
{
"epoch": 87.58888170652877,
"grad_norm": 2.086839199066162,
"learning_rate": 0.001,
"loss": 1.0346,
"step": 271000
},
{
"epoch": 87.62120232708467,
"grad_norm": 1.4940403699874878,
"learning_rate": 0.001,
"loss": 1.0434,
"step": 271100
},
{
"epoch": 87.6535229476406,
"grad_norm": 1.7732195854187012,
"learning_rate": 0.001,
"loss": 1.0391,
"step": 271200
},
{
"epoch": 87.6858435681965,
"grad_norm": 1.9552558660507202,
"learning_rate": 0.001,
"loss": 1.0588,
"step": 271300
},
{
"epoch": 87.71816418875243,
"grad_norm": 1.6922935247421265,
"learning_rate": 0.001,
"loss": 1.0525,
"step": 271400
},
{
"epoch": 87.75048480930833,
"grad_norm": 1.606885552406311,
"learning_rate": 0.001,
"loss": 1.0609,
"step": 271500
},
{
"epoch": 87.78280542986425,
"grad_norm": 1.7126843929290771,
"learning_rate": 0.001,
"loss": 1.0544,
"step": 271600
},
{
"epoch": 87.81512605042016,
"grad_norm": 1.9816648960113525,
"learning_rate": 0.001,
"loss": 1.0636,
"step": 271700
},
{
"epoch": 87.84744667097608,
"grad_norm": 1.776976466178894,
"learning_rate": 0.001,
"loss": 1.0594,
"step": 271800
},
{
"epoch": 87.87976729153199,
"grad_norm": 2.1870553493499756,
"learning_rate": 0.001,
"loss": 1.0482,
"step": 271900
},
{
"epoch": 87.91208791208791,
"grad_norm": 1.7203819751739502,
"learning_rate": 0.001,
"loss": 1.0711,
"step": 272000
},
{
"epoch": 87.94440853264382,
"grad_norm": 1.6706738471984863,
"learning_rate": 0.001,
"loss": 1.0778,
"step": 272100
},
{
"epoch": 87.97672915319974,
"grad_norm": 2.048996925354004,
"learning_rate": 0.001,
"loss": 1.071,
"step": 272200
},
{
"epoch": 88.00904977375566,
"grad_norm": 1.791468858718872,
"learning_rate": 0.001,
"loss": 1.0356,
"step": 272300
},
{
"epoch": 88.04137039431157,
"grad_norm": 2.0778145790100098,
"learning_rate": 0.001,
"loss": 0.9626,
"step": 272400
},
{
"epoch": 88.07369101486749,
"grad_norm": 2.016916275024414,
"learning_rate": 0.001,
"loss": 0.9573,
"step": 272500
},
{
"epoch": 88.1060116354234,
"grad_norm": 1.8074690103530884,
"learning_rate": 0.001,
"loss": 0.9704,
"step": 272600
},
{
"epoch": 88.13833225597932,
"grad_norm": 1.7361093759536743,
"learning_rate": 0.001,
"loss": 0.9908,
"step": 272700
},
{
"epoch": 88.17065287653523,
"grad_norm": 1.7573654651641846,
"learning_rate": 0.001,
"loss": 0.9922,
"step": 272800
},
{
"epoch": 88.20297349709115,
"grad_norm": 1.6067391633987427,
"learning_rate": 0.001,
"loss": 0.9822,
"step": 272900
},
{
"epoch": 88.23529411764706,
"grad_norm": 1.9911582469940186,
"learning_rate": 0.001,
"loss": 0.9969,
"step": 273000
},
{
"epoch": 88.26761473820298,
"grad_norm": 1.9228503704071045,
"learning_rate": 0.001,
"loss": 0.9908,
"step": 273100
},
{
"epoch": 88.29993535875889,
"grad_norm": 1.954790711402893,
"learning_rate": 0.001,
"loss": 0.9979,
"step": 273200
},
{
"epoch": 88.33225597931481,
"grad_norm": 2.02158784866333,
"learning_rate": 0.001,
"loss": 1.0014,
"step": 273300
},
{
"epoch": 88.36457659987072,
"grad_norm": 1.9888592958450317,
"learning_rate": 0.001,
"loss": 0.9951,
"step": 273400
},
{
"epoch": 88.39689722042664,
"grad_norm": 1.7298367023468018,
"learning_rate": 0.001,
"loss": 1.0075,
"step": 273500
},
{
"epoch": 88.42921784098255,
"grad_norm": 1.7865296602249146,
"learning_rate": 0.001,
"loss": 1.0108,
"step": 273600
},
{
"epoch": 88.46153846153847,
"grad_norm": 2.6155149936676025,
"learning_rate": 0.001,
"loss": 1.0153,
"step": 273700
},
{
"epoch": 88.49385908209437,
"grad_norm": 1.765749216079712,
"learning_rate": 0.001,
"loss": 1.0148,
"step": 273800
},
{
"epoch": 88.5261797026503,
"grad_norm": 1.943948745727539,
"learning_rate": 0.001,
"loss": 1.017,
"step": 273900
},
{
"epoch": 88.5585003232062,
"grad_norm": 1.7162889242172241,
"learning_rate": 0.001,
"loss": 1.0314,
"step": 274000
},
{
"epoch": 88.59082094376213,
"grad_norm": 1.5523930788040161,
"learning_rate": 0.001,
"loss": 1.0302,
"step": 274100
},
{
"epoch": 88.62314156431803,
"grad_norm": 1.733382225036621,
"learning_rate": 0.001,
"loss": 1.0303,
"step": 274200
},
{
"epoch": 88.65546218487395,
"grad_norm": 2.412778615951538,
"learning_rate": 0.001,
"loss": 1.0385,
"step": 274300
},
{
"epoch": 88.68778280542986,
"grad_norm": 1.9336631298065186,
"learning_rate": 0.001,
"loss": 1.0359,
"step": 274400
},
{
"epoch": 88.72010342598578,
"grad_norm": 2.2607991695404053,
"learning_rate": 0.001,
"loss": 1.0412,
"step": 274500
},
{
"epoch": 88.75242404654169,
"grad_norm": 1.6918398141860962,
"learning_rate": 0.001,
"loss": 1.056,
"step": 274600
},
{
"epoch": 88.78474466709761,
"grad_norm": 1.6877381801605225,
"learning_rate": 0.001,
"loss": 1.0467,
"step": 274700
},
{
"epoch": 88.81706528765352,
"grad_norm": 1.8707000017166138,
"learning_rate": 0.001,
"loss": 1.0523,
"step": 274800
},
{
"epoch": 88.84938590820944,
"grad_norm": 1.7763044834136963,
"learning_rate": 0.001,
"loss": 1.044,
"step": 274900
},
{
"epoch": 88.88170652876535,
"grad_norm": 2.0772578716278076,
"learning_rate": 0.001,
"loss": 1.0556,
"step": 275000
},
{
"epoch": 88.91402714932127,
"grad_norm": 1.7194854021072388,
"learning_rate": 0.001,
"loss": 1.0596,
"step": 275100
},
{
"epoch": 88.94634776987718,
"grad_norm": 2.1079659461975098,
"learning_rate": 0.001,
"loss": 1.0678,
"step": 275200
},
{
"epoch": 88.9786683904331,
"grad_norm": 1.841643214225769,
"learning_rate": 0.001,
"loss": 1.0617,
"step": 275300
},
{
"epoch": 89.01098901098901,
"grad_norm": 1.7587463855743408,
"learning_rate": 0.001,
"loss": 1.0222,
"step": 275400
},
{
"epoch": 89.04330963154493,
"grad_norm": 1.8258470296859741,
"learning_rate": 0.001,
"loss": 0.9631,
"step": 275500
},
{
"epoch": 89.07563025210084,
"grad_norm": 1.7834150791168213,
"learning_rate": 0.001,
"loss": 0.9544,
"step": 275600
},
{
"epoch": 89.10795087265676,
"grad_norm": 1.8407515287399292,
"learning_rate": 0.001,
"loss": 0.9701,
"step": 275700
},
{
"epoch": 89.14027149321267,
"grad_norm": 1.9101598262786865,
"learning_rate": 0.001,
"loss": 0.9584,
"step": 275800
},
{
"epoch": 89.17259211376859,
"grad_norm": 1.7279962301254272,
"learning_rate": 0.001,
"loss": 0.983,
"step": 275900
},
{
"epoch": 89.2049127343245,
"grad_norm": 1.8472046852111816,
"learning_rate": 0.001,
"loss": 0.9722,
"step": 276000
},
{
"epoch": 89.23723335488042,
"grad_norm": 1.6440308094024658,
"learning_rate": 0.001,
"loss": 0.984,
"step": 276100
},
{
"epoch": 89.26955397543632,
"grad_norm": 1.48750901222229,
"learning_rate": 0.001,
"loss": 0.9765,
"step": 276200
},
{
"epoch": 89.30187459599225,
"grad_norm": 1.7771786451339722,
"learning_rate": 0.001,
"loss": 1.0027,
"step": 276300
},
{
"epoch": 89.33419521654815,
"grad_norm": 1.5432380437850952,
"learning_rate": 0.001,
"loss": 0.9903,
"step": 276400
},
{
"epoch": 89.36651583710407,
"grad_norm": 2.1023707389831543,
"learning_rate": 0.001,
"loss": 0.9946,
"step": 276500
},
{
"epoch": 89.39883645765998,
"grad_norm": 1.942596435546875,
"learning_rate": 0.001,
"loss": 1.0045,
"step": 276600
},
{
"epoch": 89.4311570782159,
"grad_norm": 2.248884916305542,
"learning_rate": 0.001,
"loss": 0.9991,
"step": 276700
},
{
"epoch": 89.46347769877181,
"grad_norm": 2.2643485069274902,
"learning_rate": 0.001,
"loss": 1.0091,
"step": 276800
},
{
"epoch": 89.49579831932773,
"grad_norm": 2.112210273742676,
"learning_rate": 0.001,
"loss": 1.0035,
"step": 276900
},
{
"epoch": 89.52811893988364,
"grad_norm": 2.1388866901397705,
"learning_rate": 0.001,
"loss": 1.0105,
"step": 277000
},
{
"epoch": 89.56043956043956,
"grad_norm": 1.910740613937378,
"learning_rate": 0.001,
"loss": 1.0142,
"step": 277100
},
{
"epoch": 89.59276018099547,
"grad_norm": 2.0235660076141357,
"learning_rate": 0.001,
"loss": 1.0254,
"step": 277200
},
{
"epoch": 89.62508080155139,
"grad_norm": 1.9163745641708374,
"learning_rate": 0.001,
"loss": 1.0277,
"step": 277300
},
{
"epoch": 89.6574014221073,
"grad_norm": 1.8493446111679077,
"learning_rate": 0.001,
"loss": 1.0239,
"step": 277400
},
{
"epoch": 89.68972204266322,
"grad_norm": 2.082435131072998,
"learning_rate": 0.001,
"loss": 1.0255,
"step": 277500
},
{
"epoch": 89.72204266321913,
"grad_norm": 1.9139171838760376,
"learning_rate": 0.001,
"loss": 1.0456,
"step": 277600
},
{
"epoch": 89.75436328377505,
"grad_norm": 1.9875322580337524,
"learning_rate": 0.001,
"loss": 1.034,
"step": 277700
},
{
"epoch": 89.78668390433096,
"grad_norm": 2.1025478839874268,
"learning_rate": 0.001,
"loss": 1.0329,
"step": 277800
},
{
"epoch": 89.81900452488688,
"grad_norm": 1.9872552156448364,
"learning_rate": 0.001,
"loss": 1.042,
"step": 277900
},
{
"epoch": 89.85132514544279,
"grad_norm": 2.346891164779663,
"learning_rate": 0.001,
"loss": 1.0439,
"step": 278000
},
{
"epoch": 89.88364576599871,
"grad_norm": 2.0617330074310303,
"learning_rate": 0.001,
"loss": 1.0526,
"step": 278100
},
{
"epoch": 89.91596638655462,
"grad_norm": 2.033355474472046,
"learning_rate": 0.001,
"loss": 1.053,
"step": 278200
},
{
"epoch": 89.94828700711054,
"grad_norm": 1.9766206741333008,
"learning_rate": 0.001,
"loss": 1.0571,
"step": 278300
},
{
"epoch": 89.98060762766644,
"grad_norm": 1.6863994598388672,
"learning_rate": 0.001,
"loss": 1.0536,
"step": 278400
},
{
"epoch": 90.01292824822237,
"grad_norm": 2.222926616668701,
"learning_rate": 0.001,
"loss": 0.9922,
"step": 278500
},
{
"epoch": 90.04524886877829,
"grad_norm": 2.0000972747802734,
"learning_rate": 0.001,
"loss": 0.9525,
"step": 278600
},
{
"epoch": 90.0775694893342,
"grad_norm": 2.640467643737793,
"learning_rate": 0.001,
"loss": 0.9637,
"step": 278700
},
{
"epoch": 90.10989010989012,
"grad_norm": 2.6437063217163086,
"learning_rate": 0.001,
"loss": 0.9518,
"step": 278800
},
{
"epoch": 90.14221073044602,
"grad_norm": 2.0729470252990723,
"learning_rate": 0.001,
"loss": 0.9644,
"step": 278900
},
{
"epoch": 90.17453135100195,
"grad_norm": 1.665229320526123,
"learning_rate": 0.001,
"loss": 0.9703,
"step": 279000
},
{
"epoch": 90.20685197155785,
"grad_norm": 2.4748964309692383,
"learning_rate": 0.001,
"loss": 0.9828,
"step": 279100
},
{
"epoch": 90.23917259211377,
"grad_norm": 2.3275153636932373,
"learning_rate": 0.001,
"loss": 0.9758,
"step": 279200
},
{
"epoch": 90.27149321266968,
"grad_norm": 2.267796516418457,
"learning_rate": 0.001,
"loss": 0.9762,
"step": 279300
},
{
"epoch": 90.3038138332256,
"grad_norm": 3.0510411262512207,
"learning_rate": 0.001,
"loss": 0.9634,
"step": 279400
},
{
"epoch": 90.33613445378151,
"grad_norm": 1.9009895324707031,
"learning_rate": 0.001,
"loss": 0.9854,
"step": 279500
},
{
"epoch": 90.36845507433743,
"grad_norm": 2.0102601051330566,
"learning_rate": 0.001,
"loss": 0.9867,
"step": 279600
},
{
"epoch": 90.40077569489334,
"grad_norm": 1.7540924549102783,
"learning_rate": 0.001,
"loss": 1.0011,
"step": 279700
},
{
"epoch": 90.43309631544926,
"grad_norm": 2.0557961463928223,
"learning_rate": 0.001,
"loss": 0.9923,
"step": 279800
},
{
"epoch": 90.46541693600517,
"grad_norm": 2.086256980895996,
"learning_rate": 0.001,
"loss": 0.9925,
"step": 279900
},
{
"epoch": 90.49773755656109,
"grad_norm": 2.0977132320404053,
"learning_rate": 0.001,
"loss": 1.0084,
"step": 280000
},
{
"epoch": 90.530058177117,
"grad_norm": 2.174241304397583,
"learning_rate": 0.001,
"loss": 1.0013,
"step": 280100
},
{
"epoch": 90.56237879767292,
"grad_norm": 1.7331504821777344,
"learning_rate": 0.001,
"loss": 1.0204,
"step": 280200
},
{
"epoch": 90.59469941822883,
"grad_norm": 1.912105679512024,
"learning_rate": 0.001,
"loss": 1.0077,
"step": 280300
},
{
"epoch": 90.62702003878475,
"grad_norm": 1.9463039636611938,
"learning_rate": 0.001,
"loss": 1.014,
"step": 280400
},
{
"epoch": 90.65934065934066,
"grad_norm": 1.9718258380889893,
"learning_rate": 0.001,
"loss": 1.0205,
"step": 280500
},
{
"epoch": 90.69166127989658,
"grad_norm": 2.5784502029418945,
"learning_rate": 0.001,
"loss": 1.0149,
"step": 280600
},
{
"epoch": 90.72398190045249,
"grad_norm": 1.9825204610824585,
"learning_rate": 0.001,
"loss": 1.025,
"step": 280700
},
{
"epoch": 90.75630252100841,
"grad_norm": 1.9849514961242676,
"learning_rate": 0.001,
"loss": 1.0323,
"step": 280800
},
{
"epoch": 90.78862314156432,
"grad_norm": 2.067756414413452,
"learning_rate": 0.001,
"loss": 1.0378,
"step": 280900
},
{
"epoch": 90.82094376212024,
"grad_norm": 3.0538594722747803,
"learning_rate": 0.001,
"loss": 1.0244,
"step": 281000
},
{
"epoch": 90.85326438267614,
"grad_norm": 2.0136399269104004,
"learning_rate": 0.001,
"loss": 1.0411,
"step": 281100
},
{
"epoch": 90.88558500323207,
"grad_norm": 1.8992334604263306,
"learning_rate": 0.001,
"loss": 1.0451,
"step": 281200
},
{
"epoch": 90.91790562378797,
"grad_norm": 2.309920310974121,
"learning_rate": 0.001,
"loss": 1.0304,
"step": 281300
},
{
"epoch": 90.9502262443439,
"grad_norm": 2.1204164028167725,
"learning_rate": 0.001,
"loss": 1.0425,
"step": 281400
},
{
"epoch": 90.9825468648998,
"grad_norm": 2.363699436187744,
"learning_rate": 0.001,
"loss": 1.0557,
"step": 281500
},
{
"epoch": 91.01486748545572,
"grad_norm": 2.024256706237793,
"learning_rate": 0.001,
"loss": 0.98,
"step": 281600
},
{
"epoch": 91.04718810601163,
"grad_norm": 1.8037205934524536,
"learning_rate": 0.001,
"loss": 0.9362,
"step": 281700
},
{
"epoch": 91.07950872656755,
"grad_norm": 2.3736801147460938,
"learning_rate": 0.001,
"loss": 0.944,
"step": 281800
},
{
"epoch": 91.11182934712346,
"grad_norm": 2.0845656394958496,
"learning_rate": 0.001,
"loss": 0.9526,
"step": 281900
},
{
"epoch": 91.14414996767938,
"grad_norm": 1.9515149593353271,
"learning_rate": 0.001,
"loss": 0.9533,
"step": 282000
},
{
"epoch": 91.17647058823529,
"grad_norm": 2.203016757965088,
"learning_rate": 0.001,
"loss": 0.9585,
"step": 282100
},
{
"epoch": 91.20879120879121,
"grad_norm": 2.299415349960327,
"learning_rate": 0.001,
"loss": 0.952,
"step": 282200
},
{
"epoch": 91.24111182934712,
"grad_norm": 1.9382708072662354,
"learning_rate": 0.001,
"loss": 0.9691,
"step": 282300
},
{
"epoch": 91.27343244990304,
"grad_norm": 2.493281602859497,
"learning_rate": 0.001,
"loss": 0.9665,
"step": 282400
},
{
"epoch": 91.30575307045895,
"grad_norm": 2.7134528160095215,
"learning_rate": 0.001,
"loss": 0.971,
"step": 282500
},
{
"epoch": 91.33807369101487,
"grad_norm": 2.434889078140259,
"learning_rate": 0.001,
"loss": 0.973,
"step": 282600
},
{
"epoch": 91.37039431157078,
"grad_norm": 1.6504502296447754,
"learning_rate": 0.001,
"loss": 0.9882,
"step": 282700
},
{
"epoch": 91.4027149321267,
"grad_norm": 2.516812324523926,
"learning_rate": 0.001,
"loss": 0.9917,
"step": 282800
},
{
"epoch": 91.4350355526826,
"grad_norm": 2.250521421432495,
"learning_rate": 0.001,
"loss": 0.9858,
"step": 282900
},
{
"epoch": 91.46735617323853,
"grad_norm": 2.721055746078491,
"learning_rate": 0.001,
"loss": 0.9903,
"step": 283000
},
{
"epoch": 91.49967679379444,
"grad_norm": 2.0062334537506104,
"learning_rate": 0.001,
"loss": 1.0004,
"step": 283100
},
{
"epoch": 91.53199741435036,
"grad_norm": 2.329655408859253,
"learning_rate": 0.001,
"loss": 0.9853,
"step": 283200
},
{
"epoch": 91.56431803490626,
"grad_norm": 1.8753358125686646,
"learning_rate": 0.001,
"loss": 1.0124,
"step": 283300
},
{
"epoch": 91.59663865546219,
"grad_norm": 2.209588050842285,
"learning_rate": 0.001,
"loss": 1.0197,
"step": 283400
},
{
"epoch": 91.6289592760181,
"grad_norm": 2.219310760498047,
"learning_rate": 0.001,
"loss": 1.0008,
"step": 283500
},
{
"epoch": 91.66127989657402,
"grad_norm": 2.1283211708068848,
"learning_rate": 0.001,
"loss": 1.0384,
"step": 283600
},
{
"epoch": 91.69360051712992,
"grad_norm": 1.7601873874664307,
"learning_rate": 0.001,
"loss": 1.0298,
"step": 283700
},
{
"epoch": 91.72592113768584,
"grad_norm": 1.8941537141799927,
"learning_rate": 0.001,
"loss": 1.0165,
"step": 283800
},
{
"epoch": 91.75824175824175,
"grad_norm": 2.771538734436035,
"learning_rate": 0.001,
"loss": 1.0324,
"step": 283900
},
{
"epoch": 91.79056237879767,
"grad_norm": 2.580246686935425,
"learning_rate": 0.001,
"loss": 1.0321,
"step": 284000
},
{
"epoch": 91.82288299935358,
"grad_norm": 2.1796884536743164,
"learning_rate": 0.001,
"loss": 1.0281,
"step": 284100
},
{
"epoch": 91.8552036199095,
"grad_norm": 2.224008798599243,
"learning_rate": 0.001,
"loss": 1.0264,
"step": 284200
},
{
"epoch": 91.88752424046541,
"grad_norm": 2.6273763179779053,
"learning_rate": 0.001,
"loss": 1.0378,
"step": 284300
},
{
"epoch": 91.91984486102133,
"grad_norm": 2.327000856399536,
"learning_rate": 0.001,
"loss": 1.0412,
"step": 284400
},
{
"epoch": 91.95216548157724,
"grad_norm": 2.1067605018615723,
"learning_rate": 0.001,
"loss": 1.0331,
"step": 284500
},
{
"epoch": 91.98448610213316,
"grad_norm": 2.0482475757598877,
"learning_rate": 0.001,
"loss": 1.0539,
"step": 284600
},
{
"epoch": 92.01680672268908,
"grad_norm": 2.45538067817688,
"learning_rate": 0.001,
"loss": 0.9696,
"step": 284700
},
{
"epoch": 92.04912734324499,
"grad_norm": 2.8478100299835205,
"learning_rate": 0.001,
"loss": 0.9344,
"step": 284800
},
{
"epoch": 92.08144796380091,
"grad_norm": 2.6036441326141357,
"learning_rate": 0.001,
"loss": 0.9354,
"step": 284900
},
{
"epoch": 92.11376858435682,
"grad_norm": 3.401431083679199,
"learning_rate": 0.001,
"loss": 0.9437,
"step": 285000
},
{
"epoch": 92.14608920491274,
"grad_norm": 2.8432137966156006,
"learning_rate": 0.001,
"loss": 0.9512,
"step": 285100
},
{
"epoch": 92.17840982546865,
"grad_norm": 2.483217477798462,
"learning_rate": 0.001,
"loss": 0.9671,
"step": 285200
},
{
"epoch": 92.21073044602457,
"grad_norm": 2.2827484607696533,
"learning_rate": 0.001,
"loss": 0.9642,
"step": 285300
},
{
"epoch": 92.24305106658048,
"grad_norm": 1.9338748455047607,
"learning_rate": 0.001,
"loss": 0.9574,
"step": 285400
},
{
"epoch": 92.2753716871364,
"grad_norm": 3.546093225479126,
"learning_rate": 0.001,
"loss": 0.9691,
"step": 285500
},
{
"epoch": 92.3076923076923,
"grad_norm": 2.77447247505188,
"learning_rate": 0.001,
"loss": 0.967,
"step": 285600
},
{
"epoch": 92.34001292824823,
"grad_norm": 2.7795073986053467,
"learning_rate": 0.001,
"loss": 0.9534,
"step": 285700
},
{
"epoch": 92.37233354880414,
"grad_norm": 3.57477068901062,
"learning_rate": 0.001,
"loss": 0.9676,
"step": 285800
},
{
"epoch": 92.40465416936006,
"grad_norm": 2.2975080013275146,
"learning_rate": 0.001,
"loss": 0.979,
"step": 285900
},
{
"epoch": 92.43697478991596,
"grad_norm": 2.965996026992798,
"learning_rate": 0.001,
"loss": 0.9877,
"step": 286000
},
{
"epoch": 92.46929541047189,
"grad_norm": 2.1376302242279053,
"learning_rate": 0.001,
"loss": 1.0011,
"step": 286100
},
{
"epoch": 92.5016160310278,
"grad_norm": 2.3904857635498047,
"learning_rate": 0.001,
"loss": 0.9948,
"step": 286200
},
{
"epoch": 92.53393665158372,
"grad_norm": 2.1248250007629395,
"learning_rate": 0.001,
"loss": 0.9912,
"step": 286300
},
{
"epoch": 92.56625727213962,
"grad_norm": 3.2705423831939697,
"learning_rate": 0.001,
"loss": 0.9846,
"step": 286400
},
{
"epoch": 92.59857789269554,
"grad_norm": 2.439633369445801,
"learning_rate": 0.001,
"loss": 0.9929,
"step": 286500
},
{
"epoch": 92.63089851325145,
"grad_norm": 2.881314277648926,
"learning_rate": 0.001,
"loss": 1.0093,
"step": 286600
},
{
"epoch": 92.66321913380737,
"grad_norm": 3.5969815254211426,
"learning_rate": 0.001,
"loss": 0.9981,
"step": 286700
},
{
"epoch": 92.69553975436328,
"grad_norm": 1.9576853513717651,
"learning_rate": 0.001,
"loss": 1.0196,
"step": 286800
},
{
"epoch": 92.7278603749192,
"grad_norm": 3.114851713180542,
"learning_rate": 0.001,
"loss": 1.0087,
"step": 286900
},
{
"epoch": 92.76018099547511,
"grad_norm": 2.5850727558135986,
"learning_rate": 0.001,
"loss": 1.012,
"step": 287000
},
{
"epoch": 92.79250161603103,
"grad_norm": 2.376007080078125,
"learning_rate": 0.001,
"loss": 1.0238,
"step": 287100
},
{
"epoch": 92.82482223658694,
"grad_norm": 2.491420269012451,
"learning_rate": 0.001,
"loss": 1.023,
"step": 287200
},
{
"epoch": 92.85714285714286,
"grad_norm": 2.0345892906188965,
"learning_rate": 0.001,
"loss": 1.0287,
"step": 287300
},
{
"epoch": 92.88946347769877,
"grad_norm": 4.1525702476501465,
"learning_rate": 0.001,
"loss": 1.037,
"step": 287400
},
{
"epoch": 92.92178409825469,
"grad_norm": 3.620197057723999,
"learning_rate": 0.001,
"loss": 1.0469,
"step": 287500
},
{
"epoch": 92.9541047188106,
"grad_norm": 3.1248631477355957,
"learning_rate": 0.001,
"loss": 1.0341,
"step": 287600
},
{
"epoch": 92.98642533936652,
"grad_norm": 2.6980485916137695,
"learning_rate": 0.001,
"loss": 1.0328,
"step": 287700
},
{
"epoch": 93.01874595992243,
"grad_norm": 2.4870355129241943,
"learning_rate": 0.001,
"loss": 0.9789,
"step": 287800
},
{
"epoch": 93.05106658047835,
"grad_norm": 2.198549270629883,
"learning_rate": 0.001,
"loss": 0.9173,
"step": 287900
},
{
"epoch": 93.08338720103426,
"grad_norm": 1.8441483974456787,
"learning_rate": 0.001,
"loss": 0.934,
"step": 288000
},
{
"epoch": 93.11570782159018,
"grad_norm": 2.0669450759887695,
"learning_rate": 0.001,
"loss": 0.9305,
"step": 288100
},
{
"epoch": 93.14802844214609,
"grad_norm": 2.3490703105926514,
"learning_rate": 0.001,
"loss": 0.9552,
"step": 288200
},
{
"epoch": 93.180349062702,
"grad_norm": 2.179919481277466,
"learning_rate": 0.001,
"loss": 0.9391,
"step": 288300
},
{
"epoch": 93.21266968325791,
"grad_norm": 1.8578362464904785,
"learning_rate": 0.001,
"loss": 0.942,
"step": 288400
},
{
"epoch": 93.24499030381384,
"grad_norm": 1.6484123468399048,
"learning_rate": 0.001,
"loss": 0.9545,
"step": 288500
},
{
"epoch": 93.27731092436974,
"grad_norm": 2.483506441116333,
"learning_rate": 0.001,
"loss": 0.9774,
"step": 288600
},
{
"epoch": 93.30963154492567,
"grad_norm": 1.8346855640411377,
"learning_rate": 0.001,
"loss": 0.9638,
"step": 288700
},
{
"epoch": 93.34195216548157,
"grad_norm": 1.6694176197052002,
"learning_rate": 0.001,
"loss": 0.9704,
"step": 288800
},
{
"epoch": 93.3742727860375,
"grad_norm": 2.2945752143859863,
"learning_rate": 0.001,
"loss": 0.9813,
"step": 288900
},
{
"epoch": 93.4065934065934,
"grad_norm": 2.2705020904541016,
"learning_rate": 0.001,
"loss": 0.9744,
"step": 289000
},
{
"epoch": 93.43891402714932,
"grad_norm": 2.5650382041931152,
"learning_rate": 0.001,
"loss": 0.9751,
"step": 289100
},
{
"epoch": 93.47123464770523,
"grad_norm": 1.7471798658370972,
"learning_rate": 0.001,
"loss": 0.9724,
"step": 289200
},
{
"epoch": 93.50355526826115,
"grad_norm": 2.5118908882141113,
"learning_rate": 0.001,
"loss": 0.9774,
"step": 289300
},
{
"epoch": 93.53587588881706,
"grad_norm": 1.7975685596466064,
"learning_rate": 0.001,
"loss": 0.9868,
"step": 289400
},
{
"epoch": 93.56819650937298,
"grad_norm": 2.0186667442321777,
"learning_rate": 0.001,
"loss": 0.9923,
"step": 289500
},
{
"epoch": 93.60051712992889,
"grad_norm": 3.608877420425415,
"learning_rate": 0.001,
"loss": 0.987,
"step": 289600
},
{
"epoch": 93.63283775048481,
"grad_norm": 2.746725559234619,
"learning_rate": 0.001,
"loss": 0.9888,
"step": 289700
},
{
"epoch": 93.66515837104072,
"grad_norm": 2.6709630489349365,
"learning_rate": 0.001,
"loss": 0.9949,
"step": 289800
},
{
"epoch": 93.69747899159664,
"grad_norm": 2.15086030960083,
"learning_rate": 0.001,
"loss": 0.9904,
"step": 289900
},
{
"epoch": 93.72979961215255,
"grad_norm": 2.7408602237701416,
"learning_rate": 0.001,
"loss": 1.0152,
"step": 290000
},
{
"epoch": 93.76212023270847,
"grad_norm": 2.362180233001709,
"learning_rate": 0.001,
"loss": 1.0091,
"step": 290100
},
{
"epoch": 93.79444085326438,
"grad_norm": 1.734055519104004,
"learning_rate": 0.001,
"loss": 1.0105,
"step": 290200
},
{
"epoch": 93.8267614738203,
"grad_norm": 1.879744052886963,
"learning_rate": 0.001,
"loss": 1.0108,
"step": 290300
},
{
"epoch": 93.8590820943762,
"grad_norm": 2.03096342086792,
"learning_rate": 0.001,
"loss": 1.0089,
"step": 290400
},
{
"epoch": 93.89140271493213,
"grad_norm": 1.8851549625396729,
"learning_rate": 0.001,
"loss": 1.0173,
"step": 290500
},
{
"epoch": 93.92372333548803,
"grad_norm": 2.0545690059661865,
"learning_rate": 0.001,
"loss": 1.0307,
"step": 290600
},
{
"epoch": 93.95604395604396,
"grad_norm": 2.3553688526153564,
"learning_rate": 0.001,
"loss": 1.023,
"step": 290700
},
{
"epoch": 93.98836457659988,
"grad_norm": 2.5887322425842285,
"learning_rate": 0.001,
"loss": 1.0195,
"step": 290800
},
{
"epoch": 94.02068519715579,
"grad_norm": 1.7967345714569092,
"learning_rate": 0.001,
"loss": 0.9646,
"step": 290900
},
{
"epoch": 94.0530058177117,
"grad_norm": 2.0765960216522217,
"learning_rate": 0.001,
"loss": 0.9075,
"step": 291000
},
{
"epoch": 94.08532643826761,
"grad_norm": 1.8205326795578003,
"learning_rate": 0.001,
"loss": 0.9234,
"step": 291100
},
{
"epoch": 94.11764705882354,
"grad_norm": 2.5845706462860107,
"learning_rate": 0.001,
"loss": 0.9408,
"step": 291200
},
{
"epoch": 94.14996767937944,
"grad_norm": 2.3111801147460938,
"learning_rate": 0.001,
"loss": 0.9202,
"step": 291300
},
{
"epoch": 94.18228829993537,
"grad_norm": 2.1115758419036865,
"learning_rate": 0.001,
"loss": 0.941,
"step": 291400
},
{
"epoch": 94.21460892049127,
"grad_norm": 1.778016448020935,
"learning_rate": 0.001,
"loss": 0.9442,
"step": 291500
},
{
"epoch": 94.2469295410472,
"grad_norm": 2.3348562717437744,
"learning_rate": 0.001,
"loss": 0.9538,
"step": 291600
},
{
"epoch": 94.2792501616031,
"grad_norm": 1.9034233093261719,
"learning_rate": 0.001,
"loss": 0.9438,
"step": 291700
},
{
"epoch": 94.31157078215902,
"grad_norm": 1.7321242094039917,
"learning_rate": 0.001,
"loss": 0.9677,
"step": 291800
},
{
"epoch": 94.34389140271493,
"grad_norm": 1.788539171218872,
"learning_rate": 0.001,
"loss": 0.9492,
"step": 291900
},
{
"epoch": 94.37621202327085,
"grad_norm": 1.9123979806900024,
"learning_rate": 0.001,
"loss": 0.9652,
"step": 292000
},
{
"epoch": 94.40853264382676,
"grad_norm": 2.391134262084961,
"learning_rate": 0.001,
"loss": 0.9594,
"step": 292100
},
{
"epoch": 94.44085326438268,
"grad_norm": 1.6947717666625977,
"learning_rate": 0.001,
"loss": 0.9634,
"step": 292200
},
{
"epoch": 94.47317388493859,
"grad_norm": 3.093050241470337,
"learning_rate": 0.001,
"loss": 0.9646,
"step": 292300
},
{
"epoch": 94.50549450549451,
"grad_norm": 2.0479140281677246,
"learning_rate": 0.001,
"loss": 0.9826,
"step": 292400
},
{
"epoch": 94.53781512605042,
"grad_norm": 2.06463623046875,
"learning_rate": 0.001,
"loss": 0.9834,
"step": 292500
},
{
"epoch": 94.57013574660634,
"grad_norm": 1.81849205493927,
"learning_rate": 0.001,
"loss": 0.976,
"step": 292600
},
{
"epoch": 94.60245636716225,
"grad_norm": 1.5885215997695923,
"learning_rate": 0.001,
"loss": 0.9767,
"step": 292700
},
{
"epoch": 94.63477698771817,
"grad_norm": 3.0277457237243652,
"learning_rate": 0.001,
"loss": 0.985,
"step": 292800
},
{
"epoch": 94.66709760827408,
"grad_norm": 1.899543046951294,
"learning_rate": 0.001,
"loss": 0.9824,
"step": 292900
},
{
"epoch": 94.69941822883,
"grad_norm": 2.1208620071411133,
"learning_rate": 0.001,
"loss": 1.0109,
"step": 293000
},
{
"epoch": 94.7317388493859,
"grad_norm": 2.451232671737671,
"learning_rate": 0.001,
"loss": 0.9952,
"step": 293100
},
{
"epoch": 94.76405946994183,
"grad_norm": 2.232257604598999,
"learning_rate": 0.001,
"loss": 1.0,
"step": 293200
},
{
"epoch": 94.79638009049773,
"grad_norm": 1.7241202592849731,
"learning_rate": 0.001,
"loss": 0.9966,
"step": 293300
},
{
"epoch": 94.82870071105366,
"grad_norm": 2.0741376876831055,
"learning_rate": 0.001,
"loss": 1.0213,
"step": 293400
},
{
"epoch": 94.86102133160956,
"grad_norm": 1.8772655725479126,
"learning_rate": 0.001,
"loss": 1.0044,
"step": 293500
},
{
"epoch": 94.89334195216549,
"grad_norm": 2.1163671016693115,
"learning_rate": 0.001,
"loss": 1.0106,
"step": 293600
},
{
"epoch": 94.9256625727214,
"grad_norm": 2.3701653480529785,
"learning_rate": 0.001,
"loss": 1.0054,
"step": 293700
},
{
"epoch": 94.95798319327731,
"grad_norm": 1.8388867378234863,
"learning_rate": 0.001,
"loss": 1.0137,
"step": 293800
},
{
"epoch": 94.99030381383322,
"grad_norm": 1.9023023843765259,
"learning_rate": 0.001,
"loss": 1.0238,
"step": 293900
},
{
"epoch": 95.02262443438914,
"grad_norm": 2.3602747917175293,
"learning_rate": 0.001,
"loss": 0.9515,
"step": 294000
},
{
"epoch": 95.05494505494505,
"grad_norm": 1.9043653011322021,
"learning_rate": 0.001,
"loss": 0.9271,
"step": 294100
},
{
"epoch": 95.08726567550097,
"grad_norm": 1.6567951440811157,
"learning_rate": 0.001,
"loss": 0.9284,
"step": 294200
},
{
"epoch": 95.11958629605688,
"grad_norm": 1.8202804327011108,
"learning_rate": 0.001,
"loss": 0.9266,
"step": 294300
},
{
"epoch": 95.1519069166128,
"grad_norm": 1.613312005996704,
"learning_rate": 0.001,
"loss": 0.9181,
"step": 294400
},
{
"epoch": 95.18422753716871,
"grad_norm": 2.1593878269195557,
"learning_rate": 0.001,
"loss": 0.932,
"step": 294500
},
{
"epoch": 95.21654815772463,
"grad_norm": 1.7870193719863892,
"learning_rate": 0.001,
"loss": 0.9439,
"step": 294600
},
{
"epoch": 95.24886877828054,
"grad_norm": 1.8779339790344238,
"learning_rate": 0.001,
"loss": 0.9384,
"step": 294700
},
{
"epoch": 95.28118939883646,
"grad_norm": 1.589490532875061,
"learning_rate": 0.001,
"loss": 0.9503,
"step": 294800
},
{
"epoch": 95.31351001939237,
"grad_norm": 2.23694109916687,
"learning_rate": 0.001,
"loss": 0.9386,
"step": 294900
},
{
"epoch": 95.34583063994829,
"grad_norm": 1.9749599695205688,
"learning_rate": 0.001,
"loss": 0.9478,
"step": 295000
},
{
"epoch": 95.3781512605042,
"grad_norm": 1.9359219074249268,
"learning_rate": 0.001,
"loss": 0.9549,
"step": 295100
},
{
"epoch": 95.41047188106012,
"grad_norm": 1.7539458274841309,
"learning_rate": 0.001,
"loss": 0.9532,
"step": 295200
},
{
"epoch": 95.44279250161603,
"grad_norm": 2.1427483558654785,
"learning_rate": 0.001,
"loss": 0.954,
"step": 295300
},
{
"epoch": 95.47511312217195,
"grad_norm": 1.6027040481567383,
"learning_rate": 0.001,
"loss": 0.9658,
"step": 295400
},
{
"epoch": 95.50743374272786,
"grad_norm": 1.9406092166900635,
"learning_rate": 0.001,
"loss": 0.9661,
"step": 295500
},
{
"epoch": 95.53975436328378,
"grad_norm": 1.6757909059524536,
"learning_rate": 0.001,
"loss": 0.9748,
"step": 295600
},
{
"epoch": 95.57207498383968,
"grad_norm": 1.8995640277862549,
"learning_rate": 0.001,
"loss": 0.9726,
"step": 295700
},
{
"epoch": 95.6043956043956,
"grad_norm": 1.8258506059646606,
"learning_rate": 0.001,
"loss": 0.9771,
"step": 295800
},
{
"epoch": 95.63671622495151,
"grad_norm": 2.0489180088043213,
"learning_rate": 0.001,
"loss": 0.9804,
"step": 295900
},
{
"epoch": 95.66903684550743,
"grad_norm": 1.8056089878082275,
"learning_rate": 0.001,
"loss": 0.9902,
"step": 296000
},
{
"epoch": 95.70135746606334,
"grad_norm": 1.7540534734725952,
"learning_rate": 0.001,
"loss": 0.9825,
"step": 296100
},
{
"epoch": 95.73367808661926,
"grad_norm": 1.9533356428146362,
"learning_rate": 0.001,
"loss": 0.9919,
"step": 296200
},
{
"epoch": 95.76599870717517,
"grad_norm": 2.378761053085327,
"learning_rate": 0.001,
"loss": 0.99,
"step": 296300
},
{
"epoch": 95.7983193277311,
"grad_norm": 1.724198818206787,
"learning_rate": 0.001,
"loss": 0.9953,
"step": 296400
},
{
"epoch": 95.830639948287,
"grad_norm": 2.2945659160614014,
"learning_rate": 0.001,
"loss": 0.9999,
"step": 296500
},
{
"epoch": 95.86296056884292,
"grad_norm": 1.710324764251709,
"learning_rate": 0.001,
"loss": 1.0072,
"step": 296600
},
{
"epoch": 95.89528118939883,
"grad_norm": 1.5836914777755737,
"learning_rate": 0.001,
"loss": 0.9861,
"step": 296700
},
{
"epoch": 95.92760180995475,
"grad_norm": 1.6677262783050537,
"learning_rate": 0.001,
"loss": 0.9965,
"step": 296800
},
{
"epoch": 95.95992243051066,
"grad_norm": 1.6849901676177979,
"learning_rate": 0.001,
"loss": 1.0093,
"step": 296900
},
{
"epoch": 95.99224305106658,
"grad_norm": 1.8210523128509521,
"learning_rate": 0.001,
"loss": 1.0217,
"step": 297000
},
{
"epoch": 96.0245636716225,
"grad_norm": 1.721255898475647,
"learning_rate": 0.001,
"loss": 0.9372,
"step": 297100
},
{
"epoch": 96.05688429217841,
"grad_norm": 1.9859453439712524,
"learning_rate": 0.001,
"loss": 0.9033,
"step": 297200
},
{
"epoch": 96.08920491273433,
"grad_norm": 1.655301809310913,
"learning_rate": 0.001,
"loss": 0.9167,
"step": 297300
},
{
"epoch": 96.12152553329024,
"grad_norm": 2.0336883068084717,
"learning_rate": 0.001,
"loss": 0.9247,
"step": 297400
},
{
"epoch": 96.15384615384616,
"grad_norm": 2.0027976036071777,
"learning_rate": 0.001,
"loss": 0.9255,
"step": 297500
},
{
"epoch": 96.18616677440207,
"grad_norm": 1.4069693088531494,
"learning_rate": 0.001,
"loss": 0.9324,
"step": 297600
},
{
"epoch": 96.21848739495799,
"grad_norm": 1.8238356113433838,
"learning_rate": 0.001,
"loss": 0.9208,
"step": 297700
},
{
"epoch": 96.2508080155139,
"grad_norm": 1.9688968658447266,
"learning_rate": 0.001,
"loss": 0.9407,
"step": 297800
},
{
"epoch": 96.28312863606982,
"grad_norm": 1.9155817031860352,
"learning_rate": 0.001,
"loss": 0.939,
"step": 297900
},
{
"epoch": 96.31544925662573,
"grad_norm": 1.7525216341018677,
"learning_rate": 0.001,
"loss": 0.9424,
"step": 298000
},
{
"epoch": 96.34776987718165,
"grad_norm": 2.573129415512085,
"learning_rate": 0.001,
"loss": 0.9396,
"step": 298100
},
{
"epoch": 96.38009049773756,
"grad_norm": 2.0800180435180664,
"learning_rate": 0.001,
"loss": 0.9402,
"step": 298200
},
{
"epoch": 96.41241111829348,
"grad_norm": 1.723068118095398,
"learning_rate": 0.001,
"loss": 0.9481,
"step": 298300
},
{
"epoch": 96.44473173884938,
"grad_norm": 1.7221217155456543,
"learning_rate": 0.001,
"loss": 0.9542,
"step": 298400
},
{
"epoch": 96.4770523594053,
"grad_norm": 1.8597713708877563,
"learning_rate": 0.001,
"loss": 0.9532,
"step": 298500
},
{
"epoch": 96.50937297996121,
"grad_norm": 1.7525596618652344,
"learning_rate": 0.001,
"loss": 0.9557,
"step": 298600
},
{
"epoch": 96.54169360051714,
"grad_norm": 2.1579041481018066,
"learning_rate": 0.001,
"loss": 0.9692,
"step": 298700
},
{
"epoch": 96.57401422107304,
"grad_norm": 1.7372926473617554,
"learning_rate": 0.001,
"loss": 0.9585,
"step": 298800
},
{
"epoch": 96.60633484162896,
"grad_norm": 1.6883844137191772,
"learning_rate": 0.001,
"loss": 0.9598,
"step": 298900
},
{
"epoch": 96.63865546218487,
"grad_norm": 2.018057346343994,
"learning_rate": 0.001,
"loss": 0.9729,
"step": 299000
},
{
"epoch": 96.6709760827408,
"grad_norm": 1.6414058208465576,
"learning_rate": 0.001,
"loss": 0.9783,
"step": 299100
},
{
"epoch": 96.7032967032967,
"grad_norm": 1.5033763647079468,
"learning_rate": 0.001,
"loss": 0.9791,
"step": 299200
},
{
"epoch": 96.73561732385262,
"grad_norm": 1.5835376977920532,
"learning_rate": 0.001,
"loss": 0.9804,
"step": 299300
},
{
"epoch": 96.76793794440853,
"grad_norm": 2.2092549800872803,
"learning_rate": 0.001,
"loss": 1.0009,
"step": 299400
},
{
"epoch": 96.80025856496445,
"grad_norm": 1.7708107233047485,
"learning_rate": 0.001,
"loss": 0.9765,
"step": 299500
},
{
"epoch": 96.83257918552036,
"grad_norm": 1.8122010231018066,
"learning_rate": 0.001,
"loss": 1.0012,
"step": 299600
},
{
"epoch": 96.86489980607628,
"grad_norm": 1.8053666353225708,
"learning_rate": 0.001,
"loss": 0.9871,
"step": 299700
},
{
"epoch": 96.89722042663219,
"grad_norm": 2.1326189041137695,
"learning_rate": 0.001,
"loss": 0.9737,
"step": 299800
},
{
"epoch": 96.92954104718811,
"grad_norm": 2.2978103160858154,
"learning_rate": 0.001,
"loss": 0.9991,
"step": 299900
},
{
"epoch": 96.96186166774402,
"grad_norm": 1.8257157802581787,
"learning_rate": 0.001,
"loss": 1.0042,
"step": 300000
},
{
"epoch": 96.99418228829994,
"grad_norm": 1.9147756099700928,
"learning_rate": 0.001,
"loss": 0.9982,
"step": 300100
},
{
"epoch": 97.02650290885585,
"grad_norm": 1.7436898946762085,
"learning_rate": 0.001,
"loss": 0.9217,
"step": 300200
},
{
"epoch": 97.05882352941177,
"grad_norm": 2.3132598400115967,
"learning_rate": 0.001,
"loss": 0.9009,
"step": 300300
},
{
"epoch": 97.09114414996768,
"grad_norm": 1.7439414262771606,
"learning_rate": 0.001,
"loss": 0.9058,
"step": 300400
},
{
"epoch": 97.1234647705236,
"grad_norm": 1.6833429336547852,
"learning_rate": 0.001,
"loss": 0.8999,
"step": 300500
},
{
"epoch": 97.1557853910795,
"grad_norm": 1.65819251537323,
"learning_rate": 0.001,
"loss": 0.9105,
"step": 300600
},
{
"epoch": 97.18810601163543,
"grad_norm": 1.7139781713485718,
"learning_rate": 0.001,
"loss": 0.9218,
"step": 300700
},
{
"epoch": 97.22042663219133,
"grad_norm": 2.181692361831665,
"learning_rate": 0.001,
"loss": 0.9336,
"step": 300800
},
{
"epoch": 97.25274725274726,
"grad_norm": 2.0716326236724854,
"learning_rate": 0.001,
"loss": 0.9164,
"step": 300900
},
{
"epoch": 97.28506787330316,
"grad_norm": 2.2653331756591797,
"learning_rate": 0.001,
"loss": 0.9367,
"step": 301000
},
{
"epoch": 97.31738849385908,
"grad_norm": 2.1563384532928467,
"learning_rate": 0.001,
"loss": 0.9216,
"step": 301100
},
{
"epoch": 97.34970911441499,
"grad_norm": 2.309046983718872,
"learning_rate": 0.001,
"loss": 0.9338,
"step": 301200
},
{
"epoch": 97.38202973497091,
"grad_norm": 2.024261236190796,
"learning_rate": 0.001,
"loss": 0.9365,
"step": 301300
},
{
"epoch": 97.41435035552682,
"grad_norm": 2.32181978225708,
"learning_rate": 0.001,
"loss": 0.9496,
"step": 301400
},
{
"epoch": 97.44667097608274,
"grad_norm": 1.8799872398376465,
"learning_rate": 0.001,
"loss": 0.9549,
"step": 301500
},
{
"epoch": 97.47899159663865,
"grad_norm": 2.4056990146636963,
"learning_rate": 0.001,
"loss": 0.954,
"step": 301600
},
{
"epoch": 97.51131221719457,
"grad_norm": 2.1647398471832275,
"learning_rate": 0.001,
"loss": 0.9651,
"step": 301700
},
{
"epoch": 97.54363283775048,
"grad_norm": 2.175403356552124,
"learning_rate": 0.001,
"loss": 0.9534,
"step": 301800
},
{
"epoch": 97.5759534583064,
"grad_norm": 1.9020843505859375,
"learning_rate": 0.001,
"loss": 0.9618,
"step": 301900
},
{
"epoch": 97.60827407886231,
"grad_norm": 2.246063232421875,
"learning_rate": 0.001,
"loss": 0.9539,
"step": 302000
},
{
"epoch": 97.64059469941823,
"grad_norm": 1.7729310989379883,
"learning_rate": 0.001,
"loss": 0.9634,
"step": 302100
},
{
"epoch": 97.67291531997414,
"grad_norm": 1.6846141815185547,
"learning_rate": 0.001,
"loss": 0.9614,
"step": 302200
},
{
"epoch": 97.70523594053006,
"grad_norm": 1.7244802713394165,
"learning_rate": 0.001,
"loss": 0.9859,
"step": 302300
},
{
"epoch": 97.73755656108597,
"grad_norm": 2.1463193893432617,
"learning_rate": 0.001,
"loss": 0.9803,
"step": 302400
},
{
"epoch": 97.76987718164189,
"grad_norm": 1.5155985355377197,
"learning_rate": 0.001,
"loss": 0.9796,
"step": 302500
},
{
"epoch": 97.8021978021978,
"grad_norm": 2.3664512634277344,
"learning_rate": 0.001,
"loss": 0.9756,
"step": 302600
},
{
"epoch": 97.83451842275372,
"grad_norm": 2.193302631378174,
"learning_rate": 0.001,
"loss": 0.9786,
"step": 302700
},
{
"epoch": 97.86683904330962,
"grad_norm": 1.757538080215454,
"learning_rate": 0.001,
"loss": 0.9773,
"step": 302800
},
{
"epoch": 97.89915966386555,
"grad_norm": 2.0512661933898926,
"learning_rate": 0.001,
"loss": 0.9928,
"step": 302900
},
{
"epoch": 97.93148028442145,
"grad_norm": 1.9938185214996338,
"learning_rate": 0.001,
"loss": 0.9976,
"step": 303000
},
{
"epoch": 97.96380090497738,
"grad_norm": 2.04455304145813,
"learning_rate": 0.001,
"loss": 0.9859,
"step": 303100
},
{
"epoch": 97.99612152553328,
"grad_norm": 2.0681285858154297,
"learning_rate": 0.001,
"loss": 0.994,
"step": 303200
},
{
"epoch": 98.0284421460892,
"grad_norm": 1.7298678159713745,
"learning_rate": 0.001,
"loss": 0.9041,
"step": 303300
},
{
"epoch": 98.06076276664513,
"grad_norm": 1.8354662656784058,
"learning_rate": 0.001,
"loss": 0.888,
"step": 303400
},
{
"epoch": 98.09308338720103,
"grad_norm": 1.9386900663375854,
"learning_rate": 0.001,
"loss": 0.8944,
"step": 303500
},
{
"epoch": 98.12540400775696,
"grad_norm": 2.130673408508301,
"learning_rate": 0.001,
"loss": 0.9052,
"step": 303600
},
{
"epoch": 98.15772462831286,
"grad_norm": 1.854596734046936,
"learning_rate": 0.001,
"loss": 0.9047,
"step": 303700
},
{
"epoch": 98.19004524886878,
"grad_norm": 1.988502860069275,
"learning_rate": 0.001,
"loss": 0.9114,
"step": 303800
},
{
"epoch": 98.22236586942469,
"grad_norm": 1.8438735008239746,
"learning_rate": 0.001,
"loss": 0.905,
"step": 303900
},
{
"epoch": 98.25468648998061,
"grad_norm": 1.9197919368743896,
"learning_rate": 0.001,
"loss": 0.9029,
"step": 304000
},
{
"epoch": 98.28700711053652,
"grad_norm": 1.8311904668807983,
"learning_rate": 0.001,
"loss": 0.9201,
"step": 304100
},
{
"epoch": 98.31932773109244,
"grad_norm": 2.5999064445495605,
"learning_rate": 0.001,
"loss": 0.93,
"step": 304200
},
{
"epoch": 98.35164835164835,
"grad_norm": 2.2026302814483643,
"learning_rate": 0.001,
"loss": 0.9327,
"step": 304300
},
{
"epoch": 98.38396897220427,
"grad_norm": 2.3874950408935547,
"learning_rate": 0.001,
"loss": 0.9415,
"step": 304400
},
{
"epoch": 98.41628959276018,
"grad_norm": 1.6854950189590454,
"learning_rate": 0.001,
"loss": 0.9401,
"step": 304500
},
{
"epoch": 98.4486102133161,
"grad_norm": 1.915907621383667,
"learning_rate": 0.001,
"loss": 0.9611,
"step": 304600
},
{
"epoch": 98.48093083387201,
"grad_norm": 2.3232321739196777,
"learning_rate": 0.001,
"loss": 0.9398,
"step": 304700
},
{
"epoch": 98.51325145442793,
"grad_norm": 1.8774093389511108,
"learning_rate": 0.001,
"loss": 0.9582,
"step": 304800
},
{
"epoch": 98.54557207498384,
"grad_norm": 2.2585456371307373,
"learning_rate": 0.001,
"loss": 0.9513,
"step": 304900
},
{
"epoch": 98.57789269553976,
"grad_norm": 1.8954609632492065,
"learning_rate": 0.001,
"loss": 0.9578,
"step": 305000
},
{
"epoch": 98.61021331609567,
"grad_norm": 2.1864609718322754,
"learning_rate": 0.001,
"loss": 0.9581,
"step": 305100
},
{
"epoch": 98.64253393665159,
"grad_norm": 2.2927186489105225,
"learning_rate": 0.001,
"loss": 0.956,
"step": 305200
},
{
"epoch": 98.6748545572075,
"grad_norm": 2.110893964767456,
"learning_rate": 0.001,
"loss": 0.9598,
"step": 305300
},
{
"epoch": 98.70717517776342,
"grad_norm": 1.7718578577041626,
"learning_rate": 0.001,
"loss": 0.95,
"step": 305400
},
{
"epoch": 98.73949579831933,
"grad_norm": 1.7689002752304077,
"learning_rate": 0.001,
"loss": 0.9672,
"step": 305500
},
{
"epoch": 98.77181641887525,
"grad_norm": 2.084601879119873,
"learning_rate": 0.001,
"loss": 0.9858,
"step": 305600
},
{
"epoch": 98.80413703943115,
"grad_norm": 1.817209243774414,
"learning_rate": 0.001,
"loss": 0.9709,
"step": 305700
},
{
"epoch": 98.83645765998708,
"grad_norm": 2.194476842880249,
"learning_rate": 0.001,
"loss": 0.9676,
"step": 305800
},
{
"epoch": 98.86877828054298,
"grad_norm": 2.089932680130005,
"learning_rate": 0.001,
"loss": 0.9804,
"step": 305900
},
{
"epoch": 98.9010989010989,
"grad_norm": 1.8631789684295654,
"learning_rate": 0.001,
"loss": 0.9771,
"step": 306000
},
{
"epoch": 98.93341952165481,
"grad_norm": 2.2455215454101562,
"learning_rate": 0.001,
"loss": 0.9932,
"step": 306100
},
{
"epoch": 98.96574014221073,
"grad_norm": 1.965256690979004,
"learning_rate": 0.001,
"loss": 0.9878,
"step": 306200
},
{
"epoch": 98.99806076276664,
"grad_norm": 2.0805888175964355,
"learning_rate": 0.001,
"loss": 0.9783,
"step": 306300
},
{
"epoch": 99.03038138332256,
"grad_norm": 1.8865318298339844,
"learning_rate": 0.001,
"loss": 0.8933,
"step": 306400
},
{
"epoch": 99.06270200387847,
"grad_norm": 1.988250970840454,
"learning_rate": 0.001,
"loss": 0.8875,
"step": 306500
},
{
"epoch": 99.09502262443439,
"grad_norm": 1.8929754495620728,
"learning_rate": 0.001,
"loss": 0.892,
"step": 306600
},
{
"epoch": 99.1273432449903,
"grad_norm": 2.4540634155273438,
"learning_rate": 0.001,
"loss": 0.886,
"step": 306700
},
{
"epoch": 99.15966386554622,
"grad_norm": 2.009747266769409,
"learning_rate": 0.001,
"loss": 0.904,
"step": 306800
},
{
"epoch": 99.19198448610213,
"grad_norm": 1.8114582300186157,
"learning_rate": 0.001,
"loss": 0.9166,
"step": 306900
},
{
"epoch": 99.22430510665805,
"grad_norm": 2.278742790222168,
"learning_rate": 0.001,
"loss": 0.9117,
"step": 307000
},
{
"epoch": 99.25662572721396,
"grad_norm": 2.1780436038970947,
"learning_rate": 0.001,
"loss": 0.9065,
"step": 307100
},
{
"epoch": 99.28894634776988,
"grad_norm": 2.099867343902588,
"learning_rate": 0.001,
"loss": 0.9221,
"step": 307200
},
{
"epoch": 99.32126696832579,
"grad_norm": 2.2217254638671875,
"learning_rate": 0.001,
"loss": 0.9354,
"step": 307300
},
{
"epoch": 99.35358758888171,
"grad_norm": 1.9928747415542603,
"learning_rate": 0.001,
"loss": 0.9202,
"step": 307400
},
{
"epoch": 99.38590820943762,
"grad_norm": 1.8501205444335938,
"learning_rate": 0.001,
"loss": 0.9434,
"step": 307500
},
{
"epoch": 99.41822882999354,
"grad_norm": 2.4966423511505127,
"learning_rate": 0.001,
"loss": 0.9282,
"step": 307600
},
{
"epoch": 99.45054945054945,
"grad_norm": 2.4920759201049805,
"learning_rate": 0.001,
"loss": 0.9306,
"step": 307700
},
{
"epoch": 99.48287007110537,
"grad_norm": 2.0279624462127686,
"learning_rate": 0.001,
"loss": 0.9554,
"step": 307800
},
{
"epoch": 99.51519069166127,
"grad_norm": 2.1771886348724365,
"learning_rate": 0.001,
"loss": 0.9452,
"step": 307900
},
{
"epoch": 99.5475113122172,
"grad_norm": 2.0133235454559326,
"learning_rate": 0.001,
"loss": 0.9396,
"step": 308000
}
],
"logging_steps": 100,
"max_steps": 309400,
"num_input_tokens_seen": 0,
"num_train_epochs": 100,
"save_steps": 2000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 3.247081673967452e+17,
"train_batch_size": 32,
"trial_name": null,
"trial_params": null
}