{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.5855562784645413,
"eval_steps": 500,
"global_step": 1200,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00048796356538711777,
"grad_norm": 0.4446243345737457,
"learning_rate": 0.0001,
"loss": 1.8998,
"step": 1
},
{
"epoch": 0.0009759271307742355,
"grad_norm": 0.443472683429718,
"learning_rate": 0.0001,
"loss": 2.146,
"step": 2
},
{
"epoch": 0.0014638906961613532,
"grad_norm": 0.246729776263237,
"learning_rate": 0.0001,
"loss": 1.8931,
"step": 3
},
{
"epoch": 0.001951854261548471,
"grad_norm": 0.3018186688423157,
"learning_rate": 0.0001,
"loss": 1.984,
"step": 4
},
{
"epoch": 0.002439817826935589,
"grad_norm": 0.2850761413574219,
"learning_rate": 0.0001,
"loss": 1.863,
"step": 5
},
{
"epoch": 0.0029277813923227064,
"grad_norm": 0.23705212771892548,
"learning_rate": 0.0001,
"loss": 1.8384,
"step": 6
},
{
"epoch": 0.0034157449577098243,
"grad_norm": 0.24392390251159668,
"learning_rate": 0.0001,
"loss": 1.8743,
"step": 7
},
{
"epoch": 0.003903708523096942,
"grad_norm": 0.24215014278888702,
"learning_rate": 0.0001,
"loss": 1.8048,
"step": 8
},
{
"epoch": 0.00439167208848406,
"grad_norm": 0.22235405445098877,
"learning_rate": 0.0001,
"loss": 1.8098,
"step": 9
},
{
"epoch": 0.004879635653871178,
"grad_norm": 0.1880388706922531,
"learning_rate": 0.0001,
"loss": 1.7519,
"step": 10
},
{
"epoch": 0.005367599219258295,
"grad_norm": 0.2197292149066925,
"learning_rate": 0.0001,
"loss": 1.905,
"step": 11
},
{
"epoch": 0.005855562784645413,
"grad_norm": 0.20583945512771606,
"learning_rate": 0.0001,
"loss": 1.8143,
"step": 12
},
{
"epoch": 0.006343526350032531,
"grad_norm": 0.20737111568450928,
"learning_rate": 0.0001,
"loss": 1.8505,
"step": 13
},
{
"epoch": 0.0068314899154196486,
"grad_norm": 0.19384053349494934,
"learning_rate": 0.0001,
"loss": 1.7528,
"step": 14
},
{
"epoch": 0.007319453480806766,
"grad_norm": 0.23753000795841217,
"learning_rate": 0.0001,
"loss": 1.7206,
"step": 15
},
{
"epoch": 0.007807417046193884,
"grad_norm": 0.1946115642786026,
"learning_rate": 0.0001,
"loss": 1.7562,
"step": 16
},
{
"epoch": 0.008295380611581003,
"grad_norm": 0.18985839188098907,
"learning_rate": 0.0001,
"loss": 1.6665,
"step": 17
},
{
"epoch": 0.00878334417696812,
"grad_norm": 0.20499983429908752,
"learning_rate": 0.0001,
"loss": 1.9491,
"step": 18
},
{
"epoch": 0.009271307742355238,
"grad_norm": 0.1874532699584961,
"learning_rate": 0.0001,
"loss": 1.7975,
"step": 19
},
{
"epoch": 0.009759271307742356,
"grad_norm": 0.18048429489135742,
"learning_rate": 0.0001,
"loss": 1.7799,
"step": 20
},
{
"epoch": 0.010247234873129472,
"grad_norm": 0.1777779906988144,
"learning_rate": 0.0001,
"loss": 1.7816,
"step": 21
},
{
"epoch": 0.01073519843851659,
"grad_norm": 0.17349651455879211,
"learning_rate": 0.0001,
"loss": 1.7431,
"step": 22
},
{
"epoch": 0.011223162003903709,
"grad_norm": 0.18479375541210175,
"learning_rate": 0.0001,
"loss": 1.903,
"step": 23
},
{
"epoch": 0.011711125569290826,
"grad_norm": 0.1918632984161377,
"learning_rate": 0.0001,
"loss": 1.7957,
"step": 24
},
{
"epoch": 0.012199089134677944,
"grad_norm": 0.18239013850688934,
"learning_rate": 0.0001,
"loss": 1.8039,
"step": 25
},
{
"epoch": 0.012687052700065062,
"grad_norm": 0.17392802238464355,
"learning_rate": 0.0001,
"loss": 1.7022,
"step": 26
},
{
"epoch": 0.013175016265452179,
"grad_norm": 0.1769259124994278,
"learning_rate": 0.0001,
"loss": 1.7131,
"step": 27
},
{
"epoch": 0.013662979830839297,
"grad_norm": 0.17371872067451477,
"learning_rate": 0.0001,
"loss": 1.7657,
"step": 28
},
{
"epoch": 0.014150943396226415,
"grad_norm": 0.19897091388702393,
"learning_rate": 0.0001,
"loss": 1.8791,
"step": 29
},
{
"epoch": 0.014638906961613532,
"grad_norm": 0.17471033334732056,
"learning_rate": 0.0001,
"loss": 1.8765,
"step": 30
},
{
"epoch": 0.01512687052700065,
"grad_norm": 0.17650161683559418,
"learning_rate": 0.0001,
"loss": 1.8181,
"step": 31
},
{
"epoch": 0.015614834092387769,
"grad_norm": 0.18008925020694733,
"learning_rate": 0.0001,
"loss": 1.8138,
"step": 32
},
{
"epoch": 0.016102797657774885,
"grad_norm": 0.18406356871128082,
"learning_rate": 0.0001,
"loss": 1.907,
"step": 33
},
{
"epoch": 0.016590761223162005,
"grad_norm": 0.18869489431381226,
"learning_rate": 0.0001,
"loss": 1.9043,
"step": 34
},
{
"epoch": 0.017078724788549122,
"grad_norm": 0.18416965007781982,
"learning_rate": 0.0001,
"loss": 1.7695,
"step": 35
},
{
"epoch": 0.01756668835393624,
"grad_norm": 0.18121257424354553,
"learning_rate": 0.0001,
"loss": 1.8342,
"step": 36
},
{
"epoch": 0.01805465191932336,
"grad_norm": 0.18426860868930817,
"learning_rate": 0.0001,
"loss": 1.818,
"step": 37
},
{
"epoch": 0.018542615484710475,
"grad_norm": 0.18800823390483856,
"learning_rate": 0.0001,
"loss": 1.8019,
"step": 38
},
{
"epoch": 0.01903057905009759,
"grad_norm": 0.18787121772766113,
"learning_rate": 0.0001,
"loss": 1.8052,
"step": 39
},
{
"epoch": 0.01951854261548471,
"grad_norm": 0.18341200053691864,
"learning_rate": 0.0001,
"loss": 1.7288,
"step": 40
},
{
"epoch": 0.020006506180871828,
"grad_norm": 0.18460282683372498,
"learning_rate": 0.0001,
"loss": 1.9984,
"step": 41
},
{
"epoch": 0.020494469746258945,
"grad_norm": 0.17212441563606262,
"learning_rate": 0.0001,
"loss": 1.7928,
"step": 42
},
{
"epoch": 0.020982433311646065,
"grad_norm": 0.18548350036144257,
"learning_rate": 0.0001,
"loss": 1.9719,
"step": 43
},
{
"epoch": 0.02147039687703318,
"grad_norm": 0.18035617470741272,
"learning_rate": 0.0001,
"loss": 1.9265,
"step": 44
},
{
"epoch": 0.021958360442420298,
"grad_norm": 0.16300201416015625,
"learning_rate": 0.0001,
"loss": 1.6821,
"step": 45
},
{
"epoch": 0.022446324007807418,
"grad_norm": 0.1797887086868286,
"learning_rate": 0.0001,
"loss": 1.8276,
"step": 46
},
{
"epoch": 0.022934287573194535,
"grad_norm": 0.18614032864570618,
"learning_rate": 0.0001,
"loss": 1.769,
"step": 47
},
{
"epoch": 0.02342225113858165,
"grad_norm": 0.18762686848640442,
"learning_rate": 0.0001,
"loss": 1.7716,
"step": 48
},
{
"epoch": 0.02391021470396877,
"grad_norm": 0.1779824048280716,
"learning_rate": 0.0001,
"loss": 1.7047,
"step": 49
},
{
"epoch": 0.024398178269355888,
"grad_norm": 0.1713806688785553,
"learning_rate": 0.0001,
"loss": 1.7085,
"step": 50
},
{
"epoch": 0.024886141834743004,
"grad_norm": 0.17888174951076508,
"learning_rate": 0.0001,
"loss": 1.8539,
"step": 51
},
{
"epoch": 0.025374105400130124,
"grad_norm": 0.18366138637065887,
"learning_rate": 0.0001,
"loss": 1.7948,
"step": 52
},
{
"epoch": 0.02586206896551724,
"grad_norm": 0.1684766262769699,
"learning_rate": 0.0001,
"loss": 1.7752,
"step": 53
},
{
"epoch": 0.026350032530904358,
"grad_norm": 0.18316026031970978,
"learning_rate": 0.0001,
"loss": 1.8153,
"step": 54
},
{
"epoch": 0.026837996096291478,
"grad_norm": 0.1712900847196579,
"learning_rate": 0.0001,
"loss": 1.8209,
"step": 55
},
{
"epoch": 0.027325959661678594,
"grad_norm": 0.17653001844882965,
"learning_rate": 0.0001,
"loss": 1.7142,
"step": 56
},
{
"epoch": 0.02781392322706571,
"grad_norm": 0.17115001380443573,
"learning_rate": 0.0001,
"loss": 1.7014,
"step": 57
},
{
"epoch": 0.02830188679245283,
"grad_norm": 0.19934123754501343,
"learning_rate": 0.0001,
"loss": 1.8184,
"step": 58
},
{
"epoch": 0.028789850357839947,
"grad_norm": 0.20567697286605835,
"learning_rate": 0.0001,
"loss": 1.9174,
"step": 59
},
{
"epoch": 0.029277813923227064,
"grad_norm": 0.17345917224884033,
"learning_rate": 0.0001,
"loss": 1.7448,
"step": 60
},
{
"epoch": 0.029765777488614184,
"grad_norm": 0.24353067576885223,
"learning_rate": 0.0001,
"loss": 1.7974,
"step": 61
},
{
"epoch": 0.0302537410540013,
"grad_norm": 0.18949398398399353,
"learning_rate": 0.0001,
"loss": 1.8231,
"step": 62
},
{
"epoch": 0.03074170461938842,
"grad_norm": 0.22029712796211243,
"learning_rate": 0.0001,
"loss": 1.8535,
"step": 63
},
{
"epoch": 0.031229668184775537,
"grad_norm": 0.16962048411369324,
"learning_rate": 0.0001,
"loss": 1.7686,
"step": 64
},
{
"epoch": 0.03171763175016266,
"grad_norm": 0.19039765000343323,
"learning_rate": 0.0001,
"loss": 1.8303,
"step": 65
},
{
"epoch": 0.03220559531554977,
"grad_norm": 0.20166978240013123,
"learning_rate": 0.0001,
"loss": 1.768,
"step": 66
},
{
"epoch": 0.03269355888093689,
"grad_norm": 0.173394113779068,
"learning_rate": 0.0001,
"loss": 1.8253,
"step": 67
},
{
"epoch": 0.03318152244632401,
"grad_norm": 0.19260728359222412,
"learning_rate": 0.0001,
"loss": 1.7589,
"step": 68
},
{
"epoch": 0.033669486011711124,
"grad_norm": 0.19539032876491547,
"learning_rate": 0.0001,
"loss": 1.749,
"step": 69
},
{
"epoch": 0.034157449577098244,
"grad_norm": 0.16770870983600616,
"learning_rate": 0.0001,
"loss": 1.7132,
"step": 70
},
{
"epoch": 0.034645413142485364,
"grad_norm": 0.19755178689956665,
"learning_rate": 0.0001,
"loss": 1.8323,
"step": 71
},
{
"epoch": 0.03513337670787248,
"grad_norm": 0.18038292229175568,
"learning_rate": 0.0001,
"loss": 1.7599,
"step": 72
},
{
"epoch": 0.0356213402732596,
"grad_norm": 0.17995433509349823,
"learning_rate": 0.0001,
"loss": 1.9183,
"step": 73
},
{
"epoch": 0.03610930383864672,
"grad_norm": 0.19222807884216309,
"learning_rate": 0.0001,
"loss": 1.8642,
"step": 74
},
{
"epoch": 0.03659726740403383,
"grad_norm": 0.16965682804584503,
"learning_rate": 0.0001,
"loss": 1.7271,
"step": 75
},
{
"epoch": 0.03708523096942095,
"grad_norm": 0.17662999033927917,
"learning_rate": 0.0001,
"loss": 1.8263,
"step": 76
},
{
"epoch": 0.03757319453480807,
"grad_norm": 0.1699201613664627,
"learning_rate": 0.0001,
"loss": 1.6818,
"step": 77
},
{
"epoch": 0.03806115810019518,
"grad_norm": 0.17309829592704773,
"learning_rate": 0.0001,
"loss": 1.7424,
"step": 78
},
{
"epoch": 0.0385491216655823,
"grad_norm": 0.18537020683288574,
"learning_rate": 0.0001,
"loss": 1.7986,
"step": 79
},
{
"epoch": 0.03903708523096942,
"grad_norm": 0.1709861010313034,
"learning_rate": 0.0001,
"loss": 1.6091,
"step": 80
},
{
"epoch": 0.039525048796356536,
"grad_norm": 0.17050296068191528,
"learning_rate": 0.0001,
"loss": 1.6904,
"step": 81
},
{
"epoch": 0.040013012361743656,
"grad_norm": 0.17640157043933868,
"learning_rate": 0.0001,
"loss": 1.7087,
"step": 82
},
{
"epoch": 0.040500975927130776,
"grad_norm": 0.1919400542974472,
"learning_rate": 0.0001,
"loss": 1.8223,
"step": 83
},
{
"epoch": 0.04098893949251789,
"grad_norm": 0.19427765905857086,
"learning_rate": 0.0001,
"loss": 1.7443,
"step": 84
},
{
"epoch": 0.04147690305790501,
"grad_norm": 0.19496281445026398,
"learning_rate": 0.0001,
"loss": 1.8336,
"step": 85
},
{
"epoch": 0.04196486662329213,
"grad_norm": 0.18101565539836884,
"learning_rate": 0.0001,
"loss": 1.8422,
"step": 86
},
{
"epoch": 0.04245283018867924,
"grad_norm": 0.19941496849060059,
"learning_rate": 0.0001,
"loss": 1.7168,
"step": 87
},
{
"epoch": 0.04294079375406636,
"grad_norm": 0.1963973492383957,
"learning_rate": 0.0001,
"loss": 1.7558,
"step": 88
},
{
"epoch": 0.04342875731945348,
"grad_norm": 0.17694450914859772,
"learning_rate": 0.0001,
"loss": 1.6953,
"step": 89
},
{
"epoch": 0.043916720884840596,
"grad_norm": 0.19362711906433105,
"learning_rate": 0.0001,
"loss": 1.8165,
"step": 90
},
{
"epoch": 0.044404684450227716,
"grad_norm": 0.1736024022102356,
"learning_rate": 0.0001,
"loss": 1.777,
"step": 91
},
{
"epoch": 0.044892648015614836,
"grad_norm": 0.17649488151073456,
"learning_rate": 0.0001,
"loss": 1.7507,
"step": 92
},
{
"epoch": 0.04538061158100195,
"grad_norm": 0.2002265304327011,
"learning_rate": 0.0001,
"loss": 1.8796,
"step": 93
},
{
"epoch": 0.04586857514638907,
"grad_norm": 0.1667991429567337,
"learning_rate": 0.0001,
"loss": 1.7051,
"step": 94
},
{
"epoch": 0.04635653871177619,
"grad_norm": 0.1868171989917755,
"learning_rate": 0.0001,
"loss": 1.747,
"step": 95
},
{
"epoch": 0.0468445022771633,
"grad_norm": 0.18312174081802368,
"learning_rate": 0.0001,
"loss": 1.7835,
"step": 96
},
{
"epoch": 0.04733246584255042,
"grad_norm": 0.1762659102678299,
"learning_rate": 0.0001,
"loss": 1.6517,
"step": 97
},
{
"epoch": 0.04782042940793754,
"grad_norm": 0.19766494631767273,
"learning_rate": 0.0001,
"loss": 1.826,
"step": 98
},
{
"epoch": 0.048308392973324656,
"grad_norm": 0.17331789433956146,
"learning_rate": 0.0001,
"loss": 1.7506,
"step": 99
},
{
"epoch": 0.048796356538711776,
"grad_norm": 0.16851170361042023,
"learning_rate": 0.0001,
"loss": 1.744,
"step": 100
},
{
"epoch": 0.049284320104098896,
"grad_norm": 0.17572622001171112,
"learning_rate": 0.0001,
"loss": 1.6986,
"step": 101
},
{
"epoch": 0.04977228366948601,
"grad_norm": 0.1850849688053131,
"learning_rate": 0.0001,
"loss": 1.7895,
"step": 102
},
{
"epoch": 0.05026024723487313,
"grad_norm": 0.18450362980365753,
"learning_rate": 0.0001,
"loss": 1.8234,
"step": 103
},
{
"epoch": 0.05074821080026025,
"grad_norm": 0.1832476705312729,
"learning_rate": 0.0001,
"loss": 1.7986,
"step": 104
},
{
"epoch": 0.05123617436564736,
"grad_norm": 0.1809314638376236,
"learning_rate": 0.0001,
"loss": 1.7923,
"step": 105
},
{
"epoch": 0.05172413793103448,
"grad_norm": 0.17974039912223816,
"learning_rate": 0.0001,
"loss": 1.7095,
"step": 106
},
{
"epoch": 0.0522121014964216,
"grad_norm": 0.16436076164245605,
"learning_rate": 0.0001,
"loss": 1.6873,
"step": 107
},
{
"epoch": 0.052700065061808715,
"grad_norm": 0.16344858705997467,
"learning_rate": 0.0001,
"loss": 1.6991,
"step": 108
},
{
"epoch": 0.053188028627195835,
"grad_norm": 0.17950277030467987,
"learning_rate": 0.0001,
"loss": 1.8591,
"step": 109
},
{
"epoch": 0.053675992192582955,
"grad_norm": 0.18337760865688324,
"learning_rate": 0.0001,
"loss": 1.784,
"step": 110
},
{
"epoch": 0.05416395575797007,
"grad_norm": 0.1895488053560257,
"learning_rate": 0.0001,
"loss": 1.7853,
"step": 111
},
{
"epoch": 0.05465191932335719,
"grad_norm": 0.17522425949573517,
"learning_rate": 0.0001,
"loss": 1.7127,
"step": 112
},
{
"epoch": 0.05513988288874431,
"grad_norm": 0.17943814396858215,
"learning_rate": 0.0001,
"loss": 1.755,
"step": 113
},
{
"epoch": 0.05562784645413142,
"grad_norm": 0.1815492808818817,
"learning_rate": 0.0001,
"loss": 1.7687,
"step": 114
},
{
"epoch": 0.05611581001951854,
"grad_norm": 0.16954658925533295,
"learning_rate": 0.0001,
"loss": 1.7562,
"step": 115
},
{
"epoch": 0.05660377358490566,
"grad_norm": 0.17870648205280304,
"learning_rate": 0.0001,
"loss": 1.841,
"step": 116
},
{
"epoch": 0.057091737150292775,
"grad_norm": 0.17044954001903534,
"learning_rate": 0.0001,
"loss": 1.7118,
"step": 117
},
{
"epoch": 0.057579700715679895,
"grad_norm": 0.17524173855781555,
"learning_rate": 0.0001,
"loss": 1.6045,
"step": 118
},
{
"epoch": 0.058067664281067015,
"grad_norm": 0.17537613213062286,
"learning_rate": 0.0001,
"loss": 1.8018,
"step": 119
},
{
"epoch": 0.05855562784645413,
"grad_norm": 0.17819495499134064,
"learning_rate": 0.0001,
"loss": 1.7723,
"step": 120
},
{
"epoch": 0.05904359141184125,
"grad_norm": 0.17807795107364655,
"learning_rate": 0.0001,
"loss": 1.8558,
"step": 121
},
{
"epoch": 0.05953155497722837,
"grad_norm": 0.1687198132276535,
"learning_rate": 0.0001,
"loss": 1.7673,
"step": 122
},
{
"epoch": 0.06001951854261549,
"grad_norm": 0.17069241404533386,
"learning_rate": 0.0001,
"loss": 1.7561,
"step": 123
},
{
"epoch": 0.0605074821080026,
"grad_norm": 0.1655956506729126,
"learning_rate": 0.0001,
"loss": 1.6607,
"step": 124
},
{
"epoch": 0.06099544567338972,
"grad_norm": 0.1846679002046585,
"learning_rate": 0.0001,
"loss": 1.8676,
"step": 125
},
{
"epoch": 0.06148340923877684,
"grad_norm": 0.17344145476818085,
"learning_rate": 0.0001,
"loss": 1.7427,
"step": 126
},
{
"epoch": 0.061971372804163954,
"grad_norm": 0.17264996469020844,
"learning_rate": 0.0001,
"loss": 1.7279,
"step": 127
},
{
"epoch": 0.062459336369551074,
"grad_norm": 0.18628281354904175,
"learning_rate": 0.0001,
"loss": 1.6708,
"step": 128
},
{
"epoch": 0.0629472999349382,
"grad_norm": 0.178174689412117,
"learning_rate": 0.0001,
"loss": 1.7931,
"step": 129
},
{
"epoch": 0.06343526350032531,
"grad_norm": 0.17690585553646088,
"learning_rate": 0.0001,
"loss": 1.7647,
"step": 130
},
{
"epoch": 0.06392322706571242,
"grad_norm": 0.18117444217205048,
"learning_rate": 0.0001,
"loss": 1.7376,
"step": 131
},
{
"epoch": 0.06441119063109954,
"grad_norm": 0.17523089051246643,
"learning_rate": 0.0001,
"loss": 1.8403,
"step": 132
},
{
"epoch": 0.06489915419648666,
"grad_norm": 0.16988244652748108,
"learning_rate": 0.0001,
"loss": 1.6958,
"step": 133
},
{
"epoch": 0.06538711776187378,
"grad_norm": 0.1890041083097458,
"learning_rate": 0.0001,
"loss": 1.7388,
"step": 134
},
{
"epoch": 0.0658750813272609,
"grad_norm": 0.1703094244003296,
"learning_rate": 0.0001,
"loss": 1.6424,
"step": 135
},
{
"epoch": 0.06636304489264802,
"grad_norm": 0.17852698266506195,
"learning_rate": 0.0001,
"loss": 1.7786,
"step": 136
},
{
"epoch": 0.06685100845803513,
"grad_norm": 0.17648550868034363,
"learning_rate": 0.0001,
"loss": 1.7172,
"step": 137
},
{
"epoch": 0.06733897202342225,
"grad_norm": 0.18284566700458527,
"learning_rate": 0.0001,
"loss": 1.7491,
"step": 138
},
{
"epoch": 0.06782693558880937,
"grad_norm": 0.1686737835407257,
"learning_rate": 0.0001,
"loss": 1.7218,
"step": 139
},
{
"epoch": 0.06831489915419649,
"grad_norm": 0.1741771250963211,
"learning_rate": 0.0001,
"loss": 1.7534,
"step": 140
},
{
"epoch": 0.06880286271958361,
"grad_norm": 0.1778876781463623,
"learning_rate": 0.0001,
"loss": 1.7388,
"step": 141
},
{
"epoch": 0.06929082628497073,
"grad_norm": 0.1860485076904297,
"learning_rate": 0.0001,
"loss": 1.8109,
"step": 142
},
{
"epoch": 0.06977878985035783,
"grad_norm": 0.17966079711914062,
"learning_rate": 0.0001,
"loss": 1.7171,
"step": 143
},
{
"epoch": 0.07026675341574495,
"grad_norm": 0.19341900944709778,
"learning_rate": 0.0001,
"loss": 1.7911,
"step": 144
},
{
"epoch": 0.07075471698113207,
"grad_norm": 0.1968701183795929,
"learning_rate": 0.0001,
"loss": 1.858,
"step": 145
},
{
"epoch": 0.0712426805465192,
"grad_norm": 0.17585061490535736,
"learning_rate": 0.0001,
"loss": 1.6731,
"step": 146
},
{
"epoch": 0.07173064411190631,
"grad_norm": 0.17294664680957794,
"learning_rate": 0.0001,
"loss": 1.7284,
"step": 147
},
{
"epoch": 0.07221860767729343,
"grad_norm": 0.18245872855186462,
"learning_rate": 0.0001,
"loss": 1.7595,
"step": 148
},
{
"epoch": 0.07270657124268054,
"grad_norm": 0.16850219666957855,
"learning_rate": 0.0001,
"loss": 1.73,
"step": 149
},
{
"epoch": 0.07319453480806766,
"grad_norm": 0.16891759634017944,
"learning_rate": 0.0001,
"loss": 1.7434,
"step": 150
},
{
"epoch": 0.07368249837345478,
"grad_norm": 0.17363204061985016,
"learning_rate": 0.0001,
"loss": 1.738,
"step": 151
},
{
"epoch": 0.0741704619388419,
"grad_norm": 0.16307075321674347,
"learning_rate": 0.0001,
"loss": 1.6285,
"step": 152
},
{
"epoch": 0.07465842550422902,
"grad_norm": 0.1735111027956009,
"learning_rate": 0.0001,
"loss": 1.5711,
"step": 153
},
{
"epoch": 0.07514638906961614,
"grad_norm": 0.18169796466827393,
"learning_rate": 0.0001,
"loss": 1.7395,
"step": 154
},
{
"epoch": 0.07563435263500325,
"grad_norm": 0.16926725208759308,
"learning_rate": 0.0001,
"loss": 1.7534,
"step": 155
},
{
"epoch": 0.07612231620039037,
"grad_norm": 0.19919319450855255,
"learning_rate": 0.0001,
"loss": 1.6975,
"step": 156
},
{
"epoch": 0.07661027976577749,
"grad_norm": 0.19146177172660828,
"learning_rate": 0.0001,
"loss": 1.8272,
"step": 157
},
{
"epoch": 0.0770982433311646,
"grad_norm": 0.19453231990337372,
"learning_rate": 0.0001,
"loss": 1.8229,
"step": 158
},
{
"epoch": 0.07758620689655173,
"grad_norm": 0.20597495138645172,
"learning_rate": 0.0001,
"loss": 1.8567,
"step": 159
},
{
"epoch": 0.07807417046193885,
"grad_norm": 0.18599432706832886,
"learning_rate": 0.0001,
"loss": 1.7587,
"step": 160
},
{
"epoch": 0.07856213402732595,
"grad_norm": 0.21232162415981293,
"learning_rate": 0.0001,
"loss": 1.7179,
"step": 161
},
{
"epoch": 0.07905009759271307,
"grad_norm": 0.1712743043899536,
"learning_rate": 0.0001,
"loss": 1.678,
"step": 162
},
{
"epoch": 0.07953806115810019,
"grad_norm": 0.18402481079101562,
"learning_rate": 0.0001,
"loss": 1.7731,
"step": 163
},
{
"epoch": 0.08002602472348731,
"grad_norm": 0.18908202648162842,
"learning_rate": 0.0001,
"loss": 1.841,
"step": 164
},
{
"epoch": 0.08051398828887443,
"grad_norm": 0.17370882630348206,
"learning_rate": 0.0001,
"loss": 1.6713,
"step": 165
},
{
"epoch": 0.08100195185426155,
"grad_norm": 0.1881919503211975,
"learning_rate": 0.0001,
"loss": 1.8285,
"step": 166
},
{
"epoch": 0.08148991541964867,
"grad_norm": 0.1770172417163849,
"learning_rate": 0.0001,
"loss": 1.7292,
"step": 167
},
{
"epoch": 0.08197787898503578,
"grad_norm": 0.1822032779455185,
"learning_rate": 0.0001,
"loss": 1.6977,
"step": 168
},
{
"epoch": 0.0824658425504229,
"grad_norm": 0.19020989537239075,
"learning_rate": 0.0001,
"loss": 1.6964,
"step": 169
},
{
"epoch": 0.08295380611581002,
"grad_norm": 0.17227591574192047,
"learning_rate": 0.0001,
"loss": 1.703,
"step": 170
},
{
"epoch": 0.08344176968119714,
"grad_norm": 0.19228717684745789,
"learning_rate": 0.0001,
"loss": 1.7247,
"step": 171
},
{
"epoch": 0.08392973324658426,
"grad_norm": 0.1909552961587906,
"learning_rate": 0.0001,
"loss": 1.7973,
"step": 172
},
{
"epoch": 0.08441769681197138,
"grad_norm": 0.18189294636249542,
"learning_rate": 0.0001,
"loss": 1.7579,
"step": 173
},
{
"epoch": 0.08490566037735849,
"grad_norm": 0.19137217104434967,
"learning_rate": 0.0001,
"loss": 1.7198,
"step": 174
},
{
"epoch": 0.0853936239427456,
"grad_norm": 0.18612581491470337,
"learning_rate": 0.0001,
"loss": 1.7585,
"step": 175
},
{
"epoch": 0.08588158750813273,
"grad_norm": 0.1759909838438034,
"learning_rate": 0.0001,
"loss": 1.6732,
"step": 176
},
{
"epoch": 0.08636955107351985,
"grad_norm": 0.18982531130313873,
"learning_rate": 0.0001,
"loss": 1.8301,
"step": 177
},
{
"epoch": 0.08685751463890697,
"grad_norm": 0.16662733256816864,
"learning_rate": 0.0001,
"loss": 1.6799,
"step": 178
},
{
"epoch": 0.08734547820429409,
"grad_norm": 0.17956425249576569,
"learning_rate": 0.0001,
"loss": 1.671,
"step": 179
},
{
"epoch": 0.08783344176968119,
"grad_norm": 0.18416181206703186,
"learning_rate": 0.0001,
"loss": 1.7922,
"step": 180
},
{
"epoch": 0.08832140533506831,
"grad_norm": 0.16633754968643188,
"learning_rate": 0.0001,
"loss": 1.7096,
"step": 181
},
{
"epoch": 0.08880936890045543,
"grad_norm": 0.19759412109851837,
"learning_rate": 0.0001,
"loss": 1.8402,
"step": 182
},
{
"epoch": 0.08929733246584255,
"grad_norm": 0.17006362974643707,
"learning_rate": 0.0001,
"loss": 1.6922,
"step": 183
},
{
"epoch": 0.08978529603122967,
"grad_norm": 0.16919896006584167,
"learning_rate": 0.0001,
"loss": 1.6657,
"step": 184
},
{
"epoch": 0.09027325959661679,
"grad_norm": 0.20307502150535583,
"learning_rate": 0.0001,
"loss": 1.8772,
"step": 185
},
{
"epoch": 0.0907612231620039,
"grad_norm": 0.17572732269763947,
"learning_rate": 0.0001,
"loss": 1.7666,
"step": 186
},
{
"epoch": 0.09124918672739102,
"grad_norm": 0.17327293753623962,
"learning_rate": 0.0001,
"loss": 1.8206,
"step": 187
},
{
"epoch": 0.09173715029277814,
"grad_norm": 0.18354281783103943,
"learning_rate": 0.0001,
"loss": 1.8013,
"step": 188
},
{
"epoch": 0.09222511385816526,
"grad_norm": 0.16821032762527466,
"learning_rate": 0.0001,
"loss": 1.6893,
"step": 189
},
{
"epoch": 0.09271307742355238,
"grad_norm": 0.17506404221057892,
"learning_rate": 0.0001,
"loss": 1.7657,
"step": 190
},
{
"epoch": 0.0932010409889395,
"grad_norm": 0.1758153885602951,
"learning_rate": 0.0001,
"loss": 1.7095,
"step": 191
},
{
"epoch": 0.0936890045543266,
"grad_norm": 0.18787072598934174,
"learning_rate": 0.0001,
"loss": 1.7312,
"step": 192
},
{
"epoch": 0.09417696811971372,
"grad_norm": 0.1803017109632492,
"learning_rate": 0.0001,
"loss": 1.7521,
"step": 193
},
{
"epoch": 0.09466493168510084,
"grad_norm": 0.18097610771656036,
"learning_rate": 0.0001,
"loss": 1.6861,
"step": 194
},
{
"epoch": 0.09515289525048796,
"grad_norm": 0.1760302186012268,
"learning_rate": 0.0001,
"loss": 1.6703,
"step": 195
},
{
"epoch": 0.09564085881587508,
"grad_norm": 0.17225316166877747,
"learning_rate": 0.0001,
"loss": 1.73,
"step": 196
},
{
"epoch": 0.0961288223812622,
"grad_norm": 0.1856345683336258,
"learning_rate": 0.0001,
"loss": 1.6828,
"step": 197
},
{
"epoch": 0.09661678594664931,
"grad_norm": 0.18595090508460999,
"learning_rate": 0.0001,
"loss": 1.7136,
"step": 198
},
{
"epoch": 0.09710474951203643,
"grad_norm": 0.1780211329460144,
"learning_rate": 0.0001,
"loss": 1.8146,
"step": 199
},
{
"epoch": 0.09759271307742355,
"grad_norm": 0.17781271040439606,
"learning_rate": 0.0001,
"loss": 1.6679,
"step": 200
},
{
"epoch": 0.09808067664281067,
"grad_norm": 0.17124401032924652,
"learning_rate": 0.0001,
"loss": 1.7077,
"step": 201
},
{
"epoch": 0.09856864020819779,
"grad_norm": 0.18443076312541962,
"learning_rate": 0.0001,
"loss": 1.8058,
"step": 202
},
{
"epoch": 0.09905660377358491,
"grad_norm": 0.1758834272623062,
"learning_rate": 0.0001,
"loss": 1.81,
"step": 203
},
{
"epoch": 0.09954456733897202,
"grad_norm": 0.17878177762031555,
"learning_rate": 0.0001,
"loss": 1.7515,
"step": 204
},
{
"epoch": 0.10003253090435914,
"grad_norm": 0.18028298020362854,
"learning_rate": 0.0001,
"loss": 1.7733,
"step": 205
},
{
"epoch": 0.10052049446974626,
"grad_norm": 0.17935384809970856,
"learning_rate": 0.0001,
"loss": 1.8011,
"step": 206
},
{
"epoch": 0.10100845803513338,
"grad_norm": 0.19665150344371796,
"learning_rate": 0.0001,
"loss": 1.7667,
"step": 207
},
{
"epoch": 0.1014964216005205,
"grad_norm": 0.16669659316539764,
"learning_rate": 0.0001,
"loss": 1.7046,
"step": 208
},
{
"epoch": 0.10198438516590762,
"grad_norm": 0.17783086001873016,
"learning_rate": 0.0001,
"loss": 1.6424,
"step": 209
},
{
"epoch": 0.10247234873129472,
"grad_norm": 0.1761302351951599,
"learning_rate": 0.0001,
"loss": 1.726,
"step": 210
},
{
"epoch": 0.10296031229668184,
"grad_norm": 0.17417997121810913,
"learning_rate": 0.0001,
"loss": 1.7181,
"step": 211
},
{
"epoch": 0.10344827586206896,
"grad_norm": 0.17537769675254822,
"learning_rate": 0.0001,
"loss": 1.6876,
"step": 212
},
{
"epoch": 0.10393623942745608,
"grad_norm": 0.16924896836280823,
"learning_rate": 0.0001,
"loss": 1.768,
"step": 213
},
{
"epoch": 0.1044242029928432,
"grad_norm": 0.20247921347618103,
"learning_rate": 0.0001,
"loss": 1.9159,
"step": 214
},
{
"epoch": 0.10491216655823032,
"grad_norm": 0.16506172716617584,
"learning_rate": 0.0001,
"loss": 1.6491,
"step": 215
},
{
"epoch": 0.10540013012361743,
"grad_norm": 0.17558075487613678,
"learning_rate": 0.0001,
"loss": 1.7169,
"step": 216
},
{
"epoch": 0.10588809368900455,
"grad_norm": 0.17124514281749725,
"learning_rate": 0.0001,
"loss": 1.6931,
"step": 217
},
{
"epoch": 0.10637605725439167,
"grad_norm": 0.16885621845722198,
"learning_rate": 0.0001,
"loss": 1.6946,
"step": 218
},
{
"epoch": 0.10686402081977879,
"grad_norm": 0.17787247896194458,
"learning_rate": 0.0001,
"loss": 1.7477,
"step": 219
},
{
"epoch": 0.10735198438516591,
"grad_norm": 0.17979493737220764,
"learning_rate": 0.0001,
"loss": 1.7215,
"step": 220
},
{
"epoch": 0.10783994795055303,
"grad_norm": 0.187989741563797,
"learning_rate": 0.0001,
"loss": 1.6946,
"step": 221
},
{
"epoch": 0.10832791151594014,
"grad_norm": 0.18497705459594727,
"learning_rate": 0.0001,
"loss": 1.7725,
"step": 222
},
{
"epoch": 0.10881587508132726,
"grad_norm": 0.1895315796136856,
"learning_rate": 0.0001,
"loss": 1.7455,
"step": 223
},
{
"epoch": 0.10930383864671438,
"grad_norm": 0.17897574603557587,
"learning_rate": 0.0001,
"loss": 1.7297,
"step": 224
},
{
"epoch": 0.1097918022121015,
"grad_norm": 0.18770314753055573,
"learning_rate": 0.0001,
"loss": 1.7948,
"step": 225
},
{
"epoch": 0.11027976577748862,
"grad_norm": 0.1812209188938141,
"learning_rate": 0.0001,
"loss": 1.8229,
"step": 226
},
{
"epoch": 0.11076772934287574,
"grad_norm": 0.17030760645866394,
"learning_rate": 0.0001,
"loss": 1.6029,
"step": 227
},
{
"epoch": 0.11125569290826284,
"grad_norm": 0.18503767251968384,
"learning_rate": 0.0001,
"loss": 1.644,
"step": 228
},
{
"epoch": 0.11174365647364996,
"grad_norm": 0.17443233728408813,
"learning_rate": 0.0001,
"loss": 1.7024,
"step": 229
},
{
"epoch": 0.11223162003903708,
"grad_norm": 0.1859743744134903,
"learning_rate": 0.0001,
"loss": 1.7859,
"step": 230
},
{
"epoch": 0.1127195836044242,
"grad_norm": 0.1692182421684265,
"learning_rate": 0.0001,
"loss": 1.6996,
"step": 231
},
{
"epoch": 0.11320754716981132,
"grad_norm": 0.16695043444633484,
"learning_rate": 0.0001,
"loss": 1.7185,
"step": 232
},
{
"epoch": 0.11369551073519844,
"grad_norm": 0.18184787034988403,
"learning_rate": 0.0001,
"loss": 1.712,
"step": 233
},
{
"epoch": 0.11418347430058555,
"grad_norm": 0.19107092916965485,
"learning_rate": 0.0001,
"loss": 1.8902,
"step": 234
},
{
"epoch": 0.11467143786597267,
"grad_norm": 0.1724960058927536,
"learning_rate": 0.0001,
"loss": 1.7464,
"step": 235
},
{
"epoch": 0.11515940143135979,
"grad_norm": 0.17673127353191376,
"learning_rate": 0.0001,
"loss": 1.785,
"step": 236
},
{
"epoch": 0.11564736499674691,
"grad_norm": 0.18474438786506653,
"learning_rate": 0.0001,
"loss": 1.8143,
"step": 237
},
{
"epoch": 0.11613532856213403,
"grad_norm": 0.17361678183078766,
"learning_rate": 0.0001,
"loss": 1.7558,
"step": 238
},
{
"epoch": 0.11662329212752115,
"grad_norm": 0.17701455950737,
"learning_rate": 0.0001,
"loss": 1.5568,
"step": 239
},
{
"epoch": 0.11711125569290826,
"grad_norm": 0.18372413516044617,
"learning_rate": 0.0001,
"loss": 1.7913,
"step": 240
},
{
"epoch": 0.11759921925829538,
"grad_norm": 0.17780154943466187,
"learning_rate": 0.0001,
"loss": 1.668,
"step": 241
},
{
"epoch": 0.1180871828236825,
"grad_norm": 0.17763271927833557,
"learning_rate": 0.0001,
"loss": 1.7006,
"step": 242
},
{
"epoch": 0.11857514638906962,
"grad_norm": 0.17323441803455353,
"learning_rate": 0.0001,
"loss": 1.5985,
"step": 243
},
{
"epoch": 0.11906310995445674,
"grad_norm": 0.1981297731399536,
"learning_rate": 0.0001,
"loss": 1.7938,
"step": 244
},
{
"epoch": 0.11955107351984386,
"grad_norm": 0.1856129914522171,
"learning_rate": 0.0001,
"loss": 1.7469,
"step": 245
},
{
"epoch": 0.12003903708523098,
"grad_norm": 0.17878711223602295,
"learning_rate": 0.0001,
"loss": 1.7156,
"step": 246
},
{
"epoch": 0.12052700065061808,
"grad_norm": 0.18860337138175964,
"learning_rate": 0.0001,
"loss": 1.6269,
"step": 247
},
{
"epoch": 0.1210149642160052,
"grad_norm": 0.17960023880004883,
"learning_rate": 0.0001,
"loss": 1.7484,
"step": 248
},
{
"epoch": 0.12150292778139232,
"grad_norm": 0.21390804648399353,
"learning_rate": 0.0001,
"loss": 1.7815,
"step": 249
},
{
"epoch": 0.12199089134677944,
"grad_norm": 0.18213345110416412,
"learning_rate": 0.0001,
"loss": 1.8368,
"step": 250
},
{
"epoch": 0.12247885491216656,
"grad_norm": 0.19667306542396545,
"learning_rate": 0.0001,
"loss": 1.7547,
"step": 251
},
{
"epoch": 0.12296681847755368,
"grad_norm": 0.18796378374099731,
"learning_rate": 0.0001,
"loss": 1.6831,
"step": 252
},
{
"epoch": 0.12345478204294079,
"grad_norm": 0.18432985246181488,
"learning_rate": 0.0001,
"loss": 1.8219,
"step": 253
},
{
"epoch": 0.12394274560832791,
"grad_norm": 0.19263121485710144,
"learning_rate": 0.0001,
"loss": 1.7033,
"step": 254
},
{
"epoch": 0.12443070917371503,
"grad_norm": 0.19383201003074646,
"learning_rate": 0.0001,
"loss": 1.723,
"step": 255
},
{
"epoch": 0.12491867273910215,
"grad_norm": 0.17456290125846863,
"learning_rate": 0.0001,
"loss": 1.7354,
"step": 256
},
{
"epoch": 0.12540663630448926,
"grad_norm": 0.2073334902524948,
"learning_rate": 0.0001,
"loss": 1.7359,
"step": 257
},
{
"epoch": 0.1258945998698764,
"grad_norm": 0.1819145232439041,
"learning_rate": 0.0001,
"loss": 1.661,
"step": 258
},
{
"epoch": 0.1263825634352635,
"grad_norm": 0.18823570013046265,
"learning_rate": 0.0001,
"loss": 1.7093,
"step": 259
},
{
"epoch": 0.12687052700065063,
"grad_norm": 0.2142113894224167,
"learning_rate": 0.0001,
"loss": 1.7367,
"step": 260
},
{
"epoch": 0.12735849056603774,
"grad_norm": 0.17133839428424835,
"learning_rate": 0.0001,
"loss": 1.7257,
"step": 261
},
{
"epoch": 0.12784645413142484,
"grad_norm": 0.20852066576480865,
"learning_rate": 0.0001,
"loss": 1.7453,
"step": 262
},
{
"epoch": 0.12833441769681198,
"grad_norm": 0.19172458350658417,
"learning_rate": 0.0001,
"loss": 1.817,
"step": 263
},
{
"epoch": 0.12882238126219908,
"grad_norm": 0.1805960088968277,
"learning_rate": 0.0001,
"loss": 1.7679,
"step": 264
},
{
"epoch": 0.12931034482758622,
"grad_norm": 0.2055218368768692,
"learning_rate": 0.0001,
"loss": 1.7874,
"step": 265
},
{
"epoch": 0.12979830839297332,
"grad_norm": 0.16831174492835999,
"learning_rate": 0.0001,
"loss": 1.6342,
"step": 266
},
{
"epoch": 0.13028627195836046,
"grad_norm": 0.17563872039318085,
"learning_rate": 0.0001,
"loss": 1.7768,
"step": 267
},
{
"epoch": 0.13077423552374756,
"grad_norm": 0.1891409158706665,
"learning_rate": 0.0001,
"loss": 1.7653,
"step": 268
},
{
"epoch": 0.13126219908913467,
"grad_norm": 0.2160748541355133,
"learning_rate": 0.0001,
"loss": 1.6957,
"step": 269
},
{
"epoch": 0.1317501626545218,
"grad_norm": 0.16802331805229187,
"learning_rate": 0.0001,
"loss": 1.6474,
"step": 270
},
{
"epoch": 0.1322381262199089,
"grad_norm": 0.21498991549015045,
"learning_rate": 0.0001,
"loss": 1.7201,
"step": 271
},
{
"epoch": 0.13272608978529604,
"grad_norm": 0.1941365897655487,
"learning_rate": 0.0001,
"loss": 1.7387,
"step": 272
},
{
"epoch": 0.13321405335068315,
"grad_norm": 0.19020740687847137,
"learning_rate": 0.0001,
"loss": 1.6985,
"step": 273
},
{
"epoch": 0.13370201691607025,
"grad_norm": 0.18627683818340302,
"learning_rate": 0.0001,
"loss": 1.7752,
"step": 274
},
{
"epoch": 0.1341899804814574,
"grad_norm": 0.1916990429162979,
"learning_rate": 0.0001,
"loss": 1.7438,
"step": 275
},
{
"epoch": 0.1346779440468445,
"grad_norm": 0.18649545311927795,
"learning_rate": 0.0001,
"loss": 1.663,
"step": 276
},
{
"epoch": 0.13516590761223163,
"grad_norm": 0.17986956238746643,
"learning_rate": 0.0001,
"loss": 1.7905,
"step": 277
},
{
"epoch": 0.13565387117761873,
"grad_norm": 0.18601469695568085,
"learning_rate": 0.0001,
"loss": 1.5608,
"step": 278
},
{
"epoch": 0.13614183474300587,
"grad_norm": 0.19612380862236023,
"learning_rate": 0.0001,
"loss": 1.7317,
"step": 279
},
{
"epoch": 0.13662979830839297,
"grad_norm": 0.17528840899467468,
"learning_rate": 0.0001,
"loss": 1.7114,
"step": 280
},
{
"epoch": 0.13711776187378008,
"grad_norm": 0.196456179022789,
"learning_rate": 0.0001,
"loss": 1.674,
"step": 281
},
{
"epoch": 0.13760572543916721,
"grad_norm": 0.18218737840652466,
"learning_rate": 0.0001,
"loss": 1.6971,
"step": 282
},
{
"epoch": 0.13809368900455432,
"grad_norm": 0.18146923184394836,
"learning_rate": 0.0001,
"loss": 1.7656,
"step": 283
},
{
"epoch": 0.13858165256994145,
"grad_norm": 0.17707045376300812,
"learning_rate": 0.0001,
"loss": 1.6322,
"step": 284
},
{
"epoch": 0.13906961613532856,
"grad_norm": 0.18990135192871094,
"learning_rate": 0.0001,
"loss": 1.7412,
"step": 285
},
{
"epoch": 0.13955757970071567,
"grad_norm": 0.17993967235088348,
"learning_rate": 0.0001,
"loss": 1.6734,
"step": 286
},
{
"epoch": 0.1400455432661028,
"grad_norm": 0.20445284247398376,
"learning_rate": 0.0001,
"loss": 1.9164,
"step": 287
},
{
"epoch": 0.1405335068314899,
"grad_norm": 0.18881991505622864,
"learning_rate": 0.0001,
"loss": 1.8395,
"step": 288
},
{
"epoch": 0.14102147039687704,
"grad_norm": 0.17268231511116028,
"learning_rate": 0.0001,
"loss": 1.6494,
"step": 289
},
{
"epoch": 0.14150943396226415,
"grad_norm": 0.17375007271766663,
"learning_rate": 0.0001,
"loss": 1.6968,
"step": 290
},
{
"epoch": 0.14199739752765128,
"grad_norm": 0.17844517529010773,
"learning_rate": 0.0001,
"loss": 1.8686,
"step": 291
},
{
"epoch": 0.1424853610930384,
"grad_norm": 0.18538935482501984,
"learning_rate": 0.0001,
"loss": 1.8035,
"step": 292
},
{
"epoch": 0.1429733246584255,
"grad_norm": 0.18314018845558167,
"learning_rate": 0.0001,
"loss": 1.8051,
"step": 293
},
{
"epoch": 0.14346128822381263,
"grad_norm": 0.18008261919021606,
"learning_rate": 0.0001,
"loss": 1.7992,
"step": 294
},
{
"epoch": 0.14394925178919973,
"grad_norm": 0.19243541359901428,
"learning_rate": 0.0001,
"loss": 1.7394,
"step": 295
},
{
"epoch": 0.14443721535458687,
"grad_norm": 0.18523713946342468,
"learning_rate": 0.0001,
"loss": 1.7845,
"step": 296
},
{
"epoch": 0.14492517891997397,
"grad_norm": 0.1781051605939865,
"learning_rate": 0.0001,
"loss": 1.6748,
"step": 297
},
{
"epoch": 0.14541314248536108,
"grad_norm": 0.18994836509227753,
"learning_rate": 0.0001,
"loss": 1.704,
"step": 298
},
{
"epoch": 0.1459011060507482,
"grad_norm": 0.17285694181919098,
"learning_rate": 0.0001,
"loss": 1.7832,
"step": 299
},
{
"epoch": 0.14638906961613532,
"grad_norm": 0.20339974761009216,
"learning_rate": 0.0001,
"loss": 1.7191,
"step": 300
},
{
"epoch": 0.14687703318152245,
"grad_norm": 0.17608943581581116,
"learning_rate": 0.0001,
"loss": 1.6315,
"step": 301
},
{
"epoch": 0.14736499674690956,
"grad_norm": 0.17653749883174896,
"learning_rate": 0.0001,
"loss": 1.6948,
"step": 302
},
{
"epoch": 0.1478529603122967,
"grad_norm": 0.1792931854724884,
"learning_rate": 0.0001,
"loss": 1.7027,
"step": 303
},
{
"epoch": 0.1483409238776838,
"grad_norm": 0.18247826397418976,
"learning_rate": 0.0001,
"loss": 1.7433,
"step": 304
},
{
"epoch": 0.1488288874430709,
"grad_norm": 0.1712041050195694,
"learning_rate": 0.0001,
"loss": 1.6548,
"step": 305
},
{
"epoch": 0.14931685100845804,
"grad_norm": 0.184691920876503,
"learning_rate": 0.0001,
"loss": 1.7226,
"step": 306
},
{
"epoch": 0.14980481457384515,
"grad_norm": 0.1834600865840912,
"learning_rate": 0.0001,
"loss": 1.7894,
"step": 307
},
{
"epoch": 0.15029277813923228,
"grad_norm": 0.1753443032503128,
"learning_rate": 0.0001,
"loss": 1.636,
"step": 308
},
{
"epoch": 0.1507807417046194,
"grad_norm": 0.16590848565101624,
"learning_rate": 0.0001,
"loss": 1.6802,
"step": 309
},
{
"epoch": 0.1512687052700065,
"grad_norm": 0.17210128903388977,
"learning_rate": 0.0001,
"loss": 1.758,
"step": 310
},
{
"epoch": 0.15175666883539363,
"grad_norm": 0.19016823172569275,
"learning_rate": 0.0001,
"loss": 1.8243,
"step": 311
},
{
"epoch": 0.15224463240078073,
"grad_norm": 0.1756354421377182,
"learning_rate": 0.0001,
"loss": 1.7666,
"step": 312
},
{
"epoch": 0.15273259596616787,
"grad_norm": 0.19266565144062042,
"learning_rate": 0.0001,
"loss": 1.7856,
"step": 313
},
{
"epoch": 0.15322055953155497,
"grad_norm": 0.17626765370368958,
"learning_rate": 0.0001,
"loss": 1.7453,
"step": 314
},
{
"epoch": 0.1537085230969421,
"grad_norm": 0.1796361356973648,
"learning_rate": 0.0001,
"loss": 1.8428,
"step": 315
},
{
"epoch": 0.1541964866623292,
"grad_norm": 0.1971481889486313,
"learning_rate": 0.0001,
"loss": 1.8298,
"step": 316
},
{
"epoch": 0.15468445022771632,
"grad_norm": 0.17479249835014343,
"learning_rate": 0.0001,
"loss": 1.7243,
"step": 317
},
{
"epoch": 0.15517241379310345,
"grad_norm": 0.18558745086193085,
"learning_rate": 0.0001,
"loss": 1.8265,
"step": 318
},
{
"epoch": 0.15566037735849056,
"grad_norm": 0.17821088433265686,
"learning_rate": 0.0001,
"loss": 1.6735,
"step": 319
},
{
"epoch": 0.1561483409238777,
"grad_norm": 0.17939302325248718,
"learning_rate": 0.0001,
"loss": 1.7158,
"step": 320
},
{
"epoch": 0.1566363044892648,
"grad_norm": 0.17538347840309143,
"learning_rate": 0.0001,
"loss": 1.7467,
"step": 321
},
{
"epoch": 0.1571242680546519,
"grad_norm": 0.1796545684337616,
"learning_rate": 0.0001,
"loss": 1.7148,
"step": 322
},
{
"epoch": 0.15761223162003904,
"grad_norm": 0.19828006625175476,
"learning_rate": 0.0001,
"loss": 1.8431,
"step": 323
},
{
"epoch": 0.15810019518542615,
"grad_norm": 0.17246133089065552,
"learning_rate": 0.0001,
"loss": 1.7291,
"step": 324
},
{
"epoch": 0.15858815875081328,
"grad_norm": 0.1835339218378067,
"learning_rate": 0.0001,
"loss": 1.7319,
"step": 325
},
{
"epoch": 0.15907612231620039,
"grad_norm": 0.18122561275959015,
"learning_rate": 0.0001,
"loss": 1.7263,
"step": 326
},
{
"epoch": 0.15956408588158752,
"grad_norm": 0.19297321140766144,
"learning_rate": 0.0001,
"loss": 1.8792,
"step": 327
},
{
"epoch": 0.16005204944697463,
"grad_norm": 0.1762656420469284,
"learning_rate": 0.0001,
"loss": 1.7495,
"step": 328
},
{
"epoch": 0.16054001301236173,
"grad_norm": 0.17146944999694824,
"learning_rate": 0.0001,
"loss": 1.7089,
"step": 329
},
{
"epoch": 0.16102797657774887,
"grad_norm": 0.17192597687244415,
"learning_rate": 0.0001,
"loss": 1.694,
"step": 330
},
{
"epoch": 0.16151594014313597,
"grad_norm": 0.17271386086940765,
"learning_rate": 0.0001,
"loss": 1.6223,
"step": 331
},
{
"epoch": 0.1620039037085231,
"grad_norm": 0.17589011788368225,
"learning_rate": 0.0001,
"loss": 1.7123,
"step": 332
},
{
"epoch": 0.1624918672739102,
"grad_norm": 0.17920418083667755,
"learning_rate": 0.0001,
"loss": 1.6938,
"step": 333
},
{
"epoch": 0.16297983083929735,
"grad_norm": 0.16645678877830505,
"learning_rate": 0.0001,
"loss": 1.6704,
"step": 334
},
{
"epoch": 0.16346779440468445,
"grad_norm": 0.1698988974094391,
"learning_rate": 0.0001,
"loss": 1.7562,
"step": 335
},
{
"epoch": 0.16395575797007156,
"grad_norm": 0.17255748808383942,
"learning_rate": 0.0001,
"loss": 1.7408,
"step": 336
},
{
"epoch": 0.1644437215354587,
"grad_norm": 0.16908328235149384,
"learning_rate": 0.0001,
"loss": 1.711,
"step": 337
},
{
"epoch": 0.1649316851008458,
"grad_norm": 0.17891424894332886,
"learning_rate": 0.0001,
"loss": 1.7199,
"step": 338
},
{
"epoch": 0.16541964866623293,
"grad_norm": 0.17500531673431396,
"learning_rate": 0.0001,
"loss": 1.8027,
"step": 339
},
{
"epoch": 0.16590761223162004,
"grad_norm": 0.1908222734928131,
"learning_rate": 0.0001,
"loss": 1.7267,
"step": 340
},
{
"epoch": 0.16639557579700714,
"grad_norm": 0.16457560658454895,
"learning_rate": 0.0001,
"loss": 1.6551,
"step": 341
},
{
"epoch": 0.16688353936239428,
"grad_norm": 0.17455148696899414,
"learning_rate": 0.0001,
"loss": 1.7536,
"step": 342
},
{
"epoch": 0.16737150292778138,
"grad_norm": 0.24865932762622833,
"learning_rate": 0.0001,
"loss": 1.7038,
"step": 343
},
{
"epoch": 0.16785946649316852,
"grad_norm": 0.16769102215766907,
"learning_rate": 0.0001,
"loss": 1.6666,
"step": 344
},
{
"epoch": 0.16834743005855562,
"grad_norm": 0.17845629155635834,
"learning_rate": 0.0001,
"loss": 1.7729,
"step": 345
},
{
"epoch": 0.16883539362394276,
"grad_norm": 0.18893101811408997,
"learning_rate": 0.0001,
"loss": 1.6953,
"step": 346
},
{
"epoch": 0.16932335718932987,
"grad_norm": 0.17489705979824066,
"learning_rate": 0.0001,
"loss": 1.6451,
"step": 347
},
{
"epoch": 0.16981132075471697,
"grad_norm": 0.1895252764225006,
"learning_rate": 0.0001,
"loss": 1.6664,
"step": 348
},
{
"epoch": 0.1702992843201041,
"grad_norm": 0.18796460330486298,
"learning_rate": 0.0001,
"loss": 1.8179,
"step": 349
},
{
"epoch": 0.1707872478854912,
"grad_norm": 0.18239444494247437,
"learning_rate": 0.0001,
"loss": 1.7895,
"step": 350
},
{
"epoch": 0.17127521145087835,
"grad_norm": 0.18578602373600006,
"learning_rate": 0.0001,
"loss": 1.7201,
"step": 351
},
{
"epoch": 0.17176317501626545,
"grad_norm": 0.17505811154842377,
"learning_rate": 0.0001,
"loss": 1.6738,
"step": 352
},
{
"epoch": 0.17225113858165256,
"grad_norm": 0.16880185902118683,
"learning_rate": 0.0001,
"loss": 1.7064,
"step": 353
},
{
"epoch": 0.1727391021470397,
"grad_norm": 0.1847655326128006,
"learning_rate": 0.0001,
"loss": 1.6227,
"step": 354
},
{
"epoch": 0.1732270657124268,
"grad_norm": 0.18033885955810547,
"learning_rate": 0.0001,
"loss": 1.7613,
"step": 355
},
{
"epoch": 0.17371502927781393,
"grad_norm": 0.2022799551486969,
"learning_rate": 0.0001,
"loss": 1.6975,
"step": 356
},
{
"epoch": 0.17420299284320104,
"grad_norm": 0.18487118184566498,
"learning_rate": 0.0001,
"loss": 1.6245,
"step": 357
},
{
"epoch": 0.17469095640858817,
"grad_norm": 0.18200282752513885,
"learning_rate": 0.0001,
"loss": 1.8013,
"step": 358
},
{
"epoch": 0.17517891997397528,
"grad_norm": 0.16840700805187225,
"learning_rate": 0.0001,
"loss": 1.6904,
"step": 359
},
{
"epoch": 0.17566688353936238,
"grad_norm": 0.17556121945381165,
"learning_rate": 0.0001,
"loss": 1.7331,
"step": 360
},
{
"epoch": 0.17615484710474952,
"grad_norm": 0.18641792237758636,
"learning_rate": 0.0001,
"loss": 1.8248,
"step": 361
},
{
"epoch": 0.17664281067013662,
"grad_norm": 0.16753801703453064,
"learning_rate": 0.0001,
"loss": 1.591,
"step": 362
},
{
"epoch": 0.17713077423552376,
"grad_norm": 0.16265541315078735,
"learning_rate": 0.0001,
"loss": 1.5814,
"step": 363
},
{
"epoch": 0.17761873780091086,
"grad_norm": 0.17881396412849426,
"learning_rate": 0.0001,
"loss": 1.8452,
"step": 364
},
{
"epoch": 0.17810670136629797,
"grad_norm": 0.18160590529441833,
"learning_rate": 0.0001,
"loss": 1.7977,
"step": 365
},
{
"epoch": 0.1785946649316851,
"grad_norm": 0.1778435856103897,
"learning_rate": 0.0001,
"loss": 1.7319,
"step": 366
},
{
"epoch": 0.1790826284970722,
"grad_norm": 0.17236903309822083,
"learning_rate": 0.0001,
"loss": 1.6572,
"step": 367
},
{
"epoch": 0.17957059206245934,
"grad_norm": 0.16980677843093872,
"learning_rate": 0.0001,
"loss": 1.6814,
"step": 368
},
{
"epoch": 0.18005855562784645,
"grad_norm": 0.17113539576530457,
"learning_rate": 0.0001,
"loss": 1.5835,
"step": 369
},
{
"epoch": 0.18054651919323358,
"grad_norm": 0.22926300764083862,
"learning_rate": 0.0001,
"loss": 1.7127,
"step": 370
},
{
"epoch": 0.1810344827586207,
"grad_norm": 0.1766396313905716,
"learning_rate": 0.0001,
"loss": 1.8002,
"step": 371
},
{
"epoch": 0.1815224463240078,
"grad_norm": 0.1911155730485916,
"learning_rate": 0.0001,
"loss": 1.7287,
"step": 372
},
{
"epoch": 0.18201040988939493,
"grad_norm": 0.1996450275182724,
"learning_rate": 0.0001,
"loss": 1.5601,
"step": 373
},
{
"epoch": 0.18249837345478204,
"grad_norm": 0.17531970143318176,
"learning_rate": 0.0001,
"loss": 1.674,
"step": 374
},
{
"epoch": 0.18298633702016917,
"grad_norm": 0.19017955660820007,
"learning_rate": 0.0001,
"loss": 1.8052,
"step": 375
},
{
"epoch": 0.18347430058555628,
"grad_norm": 0.195291206240654,
"learning_rate": 0.0001,
"loss": 1.6787,
"step": 376
},
{
"epoch": 0.18396226415094338,
"grad_norm": 0.18030132353305817,
"learning_rate": 0.0001,
"loss": 1.6931,
"step": 377
},
{
"epoch": 0.18445022771633052,
"grad_norm": 0.1725359857082367,
"learning_rate": 0.0001,
"loss": 1.5814,
"step": 378
},
{
"epoch": 0.18493819128171762,
"grad_norm": 0.18235339224338531,
"learning_rate": 0.0001,
"loss": 1.7759,
"step": 379
},
{
"epoch": 0.18542615484710476,
"grad_norm": 0.19052359461784363,
"learning_rate": 0.0001,
"loss": 1.7898,
"step": 380
},
{
"epoch": 0.18591411841249186,
"grad_norm": 0.1713322550058365,
"learning_rate": 0.0001,
"loss": 1.623,
"step": 381
},
{
"epoch": 0.186402081977879,
"grad_norm": 0.19699741899967194,
"learning_rate": 0.0001,
"loss": 1.7517,
"step": 382
},
{
"epoch": 0.1868900455432661,
"grad_norm": 0.17510955035686493,
"learning_rate": 0.0001,
"loss": 1.7045,
"step": 383
},
{
"epoch": 0.1873780091086532,
"grad_norm": 0.17883911728858948,
"learning_rate": 0.0001,
"loss": 1.6763,
"step": 384
},
{
"epoch": 0.18786597267404034,
"grad_norm": 0.18562713265419006,
"learning_rate": 0.0001,
"loss": 1.6603,
"step": 385
},
{
"epoch": 0.18835393623942745,
"grad_norm": 0.18200963735580444,
"learning_rate": 0.0001,
"loss": 1.7698,
"step": 386
},
{
"epoch": 0.18884189980481458,
"grad_norm": 0.192865788936615,
"learning_rate": 0.0001,
"loss": 1.8058,
"step": 387
},
{
"epoch": 0.1893298633702017,
"grad_norm": 0.17498141527175903,
"learning_rate": 0.0001,
"loss": 1.657,
"step": 388
},
{
"epoch": 0.1898178269355888,
"grad_norm": 0.17550218105316162,
"learning_rate": 0.0001,
"loss": 1.7638,
"step": 389
},
{
"epoch": 0.19030579050097593,
"grad_norm": 0.19263967871665955,
"learning_rate": 0.0001,
"loss": 1.7375,
"step": 390
},
{
"epoch": 0.19079375406636304,
"grad_norm": 0.1728338897228241,
"learning_rate": 0.0001,
"loss": 1.7467,
"step": 391
},
{
"epoch": 0.19128171763175017,
"grad_norm": 0.17929600179195404,
"learning_rate": 0.0001,
"loss": 1.6489,
"step": 392
},
{
"epoch": 0.19176968119713728,
"grad_norm": 0.18325988948345184,
"learning_rate": 0.0001,
"loss": 1.8676,
"step": 393
},
{
"epoch": 0.1922576447625244,
"grad_norm": 0.17365989089012146,
"learning_rate": 0.0001,
"loss": 1.6916,
"step": 394
},
{
"epoch": 0.19274560832791152,
"grad_norm": 0.17361170053482056,
"learning_rate": 0.0001,
"loss": 1.7118,
"step": 395
},
{
"epoch": 0.19323357189329862,
"grad_norm": 0.181492879986763,
"learning_rate": 0.0001,
"loss": 1.7197,
"step": 396
},
{
"epoch": 0.19372153545868576,
"grad_norm": 0.19113008677959442,
"learning_rate": 0.0001,
"loss": 1.788,
"step": 397
},
{
"epoch": 0.19420949902407286,
"grad_norm": 0.173355832695961,
"learning_rate": 0.0001,
"loss": 1.6866,
"step": 398
},
{
"epoch": 0.19469746258946,
"grad_norm": 0.1797139197587967,
"learning_rate": 0.0001,
"loss": 1.7505,
"step": 399
},
{
"epoch": 0.1951854261548471,
"grad_norm": 0.18337444961071014,
"learning_rate": 0.0001,
"loss": 1.7099,
"step": 400
},
{
"epoch": 0.1956733897202342,
"grad_norm": 0.17387695610523224,
"learning_rate": 0.0001,
"loss": 1.737,
"step": 401
},
{
"epoch": 0.19616135328562134,
"grad_norm": 0.1695685237646103,
"learning_rate": 0.0001,
"loss": 1.6916,
"step": 402
},
{
"epoch": 0.19664931685100845,
"grad_norm": 0.1874959021806717,
"learning_rate": 0.0001,
"loss": 1.6919,
"step": 403
},
{
"epoch": 0.19713728041639558,
"grad_norm": 0.17886492609977722,
"learning_rate": 0.0001,
"loss": 1.737,
"step": 404
},
{
"epoch": 0.1976252439817827,
"grad_norm": 0.19390465319156647,
"learning_rate": 0.0001,
"loss": 1.8003,
"step": 405
},
{
"epoch": 0.19811320754716982,
"grad_norm": 0.17292645573616028,
"learning_rate": 0.0001,
"loss": 1.6714,
"step": 406
},
{
"epoch": 0.19860117111255693,
"grad_norm": 0.16998599469661713,
"learning_rate": 0.0001,
"loss": 1.7242,
"step": 407
},
{
"epoch": 0.19908913467794404,
"grad_norm": 0.18668459355831146,
"learning_rate": 0.0001,
"loss": 1.7025,
"step": 408
},
{
"epoch": 0.19957709824333117,
"grad_norm": 0.16807502508163452,
"learning_rate": 0.0001,
"loss": 1.6738,
"step": 409
},
{
"epoch": 0.20006506180871828,
"grad_norm": 0.1849876344203949,
"learning_rate": 0.0001,
"loss": 1.8173,
"step": 410
},
{
"epoch": 0.2005530253741054,
"grad_norm": 0.18935902416706085,
"learning_rate": 0.0001,
"loss": 1.7108,
"step": 411
},
{
"epoch": 0.20104098893949252,
"grad_norm": 0.17630939185619354,
"learning_rate": 0.0001,
"loss": 1.7023,
"step": 412
},
{
"epoch": 0.20152895250487965,
"grad_norm": 0.19990061223506927,
"learning_rate": 0.0001,
"loss": 1.6862,
"step": 413
},
{
"epoch": 0.20201691607026676,
"grad_norm": 0.18538086116313934,
"learning_rate": 0.0001,
"loss": 1.796,
"step": 414
},
{
"epoch": 0.20250487963565386,
"grad_norm": 0.18812508881092072,
"learning_rate": 0.0001,
"loss": 1.7034,
"step": 415
},
{
"epoch": 0.202992843201041,
"grad_norm": 0.19069646298885345,
"learning_rate": 0.0001,
"loss": 1.7504,
"step": 416
},
{
"epoch": 0.2034808067664281,
"grad_norm": 0.17794154584407806,
"learning_rate": 0.0001,
"loss": 1.6469,
"step": 417
},
{
"epoch": 0.20396877033181524,
"grad_norm": 0.17641998827457428,
"learning_rate": 0.0001,
"loss": 1.7526,
"step": 418
},
{
"epoch": 0.20445673389720234,
"grad_norm": 0.19693951308727264,
"learning_rate": 0.0001,
"loss": 1.7007,
"step": 419
},
{
"epoch": 0.20494469746258945,
"grad_norm": 0.1921786069869995,
"learning_rate": 0.0001,
"loss": 1.7514,
"step": 420
},
{
"epoch": 0.20543266102797658,
"grad_norm": 0.1899469792842865,
"learning_rate": 0.0001,
"loss": 1.7508,
"step": 421
},
{
"epoch": 0.2059206245933637,
"grad_norm": 0.16994713246822357,
"learning_rate": 0.0001,
"loss": 1.6313,
"step": 422
},
{
"epoch": 0.20640858815875082,
"grad_norm": 0.20480570197105408,
"learning_rate": 0.0001,
"loss": 1.7714,
"step": 423
},
{
"epoch": 0.20689655172413793,
"grad_norm": 0.20870919525623322,
"learning_rate": 0.0001,
"loss": 1.7782,
"step": 424
},
{
"epoch": 0.20738451528952506,
"grad_norm": 0.18410471081733704,
"learning_rate": 0.0001,
"loss": 1.72,
"step": 425
},
{
"epoch": 0.20787247885491217,
"grad_norm": 0.23531974852085114,
"learning_rate": 0.0001,
"loss": 1.8923,
"step": 426
},
{
"epoch": 0.20836044242029927,
"grad_norm": 0.18552608788013458,
"learning_rate": 0.0001,
"loss": 1.7272,
"step": 427
},
{
"epoch": 0.2088484059856864,
"grad_norm": 0.2085346281528473,
"learning_rate": 0.0001,
"loss": 1.6953,
"step": 428
},
{
"epoch": 0.20933636955107351,
"grad_norm": 0.1959279626607895,
"learning_rate": 0.0001,
"loss": 1.6288,
"step": 429
},
{
"epoch": 0.20982433311646065,
"grad_norm": 0.17610879242420197,
"learning_rate": 0.0001,
"loss": 1.7151,
"step": 430
},
{
"epoch": 0.21031229668184775,
"grad_norm": 0.1928284466266632,
"learning_rate": 0.0001,
"loss": 1.687,
"step": 431
},
{
"epoch": 0.21080026024723486,
"grad_norm": 0.199452742934227,
"learning_rate": 0.0001,
"loss": 1.7704,
"step": 432
},
{
"epoch": 0.211288223812622,
"grad_norm": 0.18074338138103485,
"learning_rate": 0.0001,
"loss": 1.7899,
"step": 433
},
{
"epoch": 0.2117761873780091,
"grad_norm": 0.19121356308460236,
"learning_rate": 0.0001,
"loss": 1.694,
"step": 434
},
{
"epoch": 0.21226415094339623,
"grad_norm": 0.18307030200958252,
"learning_rate": 0.0001,
"loss": 1.6335,
"step": 435
},
{
"epoch": 0.21275211450878334,
"grad_norm": 0.18400311470031738,
"learning_rate": 0.0001,
"loss": 1.7526,
"step": 436
},
{
"epoch": 0.21324007807417047,
"grad_norm": 0.1944567859172821,
"learning_rate": 0.0001,
"loss": 1.7884,
"step": 437
},
{
"epoch": 0.21372804163955758,
"grad_norm": 0.18847782909870148,
"learning_rate": 0.0001,
"loss": 1.6859,
"step": 438
},
{
"epoch": 0.2142160052049447,
"grad_norm": 0.17663119733333588,
"learning_rate": 0.0001,
"loss": 1.615,
"step": 439
},
{
"epoch": 0.21470396877033182,
"grad_norm": 0.18704909086227417,
"learning_rate": 0.0001,
"loss": 1.7352,
"step": 440
},
{
"epoch": 0.21519193233571893,
"grad_norm": 0.19525641202926636,
"learning_rate": 0.0001,
"loss": 1.6241,
"step": 441
},
{
"epoch": 0.21567989590110606,
"grad_norm": 0.19030174612998962,
"learning_rate": 0.0001,
"loss": 1.7425,
"step": 442
},
{
"epoch": 0.21616785946649317,
"grad_norm": 0.18872150778770447,
"learning_rate": 0.0001,
"loss": 1.7177,
"step": 443
},
{
"epoch": 0.21665582303188027,
"grad_norm": 0.17374157905578613,
"learning_rate": 0.0001,
"loss": 1.7236,
"step": 444
},
{
"epoch": 0.2171437865972674,
"grad_norm": 0.18159011006355286,
"learning_rate": 0.0001,
"loss": 1.6885,
"step": 445
},
{
"epoch": 0.2176317501626545,
"grad_norm": 0.18726180493831635,
"learning_rate": 0.0001,
"loss": 1.8226,
"step": 446
},
{
"epoch": 0.21811971372804165,
"grad_norm": 0.193464457988739,
"learning_rate": 0.0001,
"loss": 1.7834,
"step": 447
},
{
"epoch": 0.21860767729342875,
"grad_norm": 0.19700440764427185,
"learning_rate": 0.0001,
"loss": 1.6766,
"step": 448
},
{
"epoch": 0.2190956408588159,
"grad_norm": 0.16808220744132996,
"learning_rate": 0.0001,
"loss": 1.6773,
"step": 449
},
{
"epoch": 0.219583604424203,
"grad_norm": 0.1885610967874527,
"learning_rate": 0.0001,
"loss": 1.7195,
"step": 450
},
{
"epoch": 0.2200715679895901,
"grad_norm": 0.17235183715820312,
"learning_rate": 0.0001,
"loss": 1.6651,
"step": 451
},
{
"epoch": 0.22055953155497723,
"grad_norm": 0.17667032778263092,
"learning_rate": 0.0001,
"loss": 1.647,
"step": 452
},
{
"epoch": 0.22104749512036434,
"grad_norm": 0.17659679055213928,
"learning_rate": 0.0001,
"loss": 1.8337,
"step": 453
},
{
"epoch": 0.22153545868575147,
"grad_norm": 0.17201969027519226,
"learning_rate": 0.0001,
"loss": 1.7385,
"step": 454
},
{
"epoch": 0.22202342225113858,
"grad_norm": 0.17937779426574707,
"learning_rate": 0.0001,
"loss": 1.7864,
"step": 455
},
{
"epoch": 0.2225113858165257,
"grad_norm": 0.1681385189294815,
"learning_rate": 0.0001,
"loss": 1.636,
"step": 456
},
{
"epoch": 0.22299934938191282,
"grad_norm": 0.17030152678489685,
"learning_rate": 0.0001,
"loss": 1.7613,
"step": 457
},
{
"epoch": 0.22348731294729993,
"grad_norm": 0.18430882692337036,
"learning_rate": 0.0001,
"loss": 1.7746,
"step": 458
},
{
"epoch": 0.22397527651268706,
"grad_norm": 0.17070208489894867,
"learning_rate": 0.0001,
"loss": 1.619,
"step": 459
},
{
"epoch": 0.22446324007807417,
"grad_norm": 0.1672583520412445,
"learning_rate": 0.0001,
"loss": 1.6935,
"step": 460
},
{
"epoch": 0.2249512036434613,
"grad_norm": 0.18070879578590393,
"learning_rate": 0.0001,
"loss": 1.7752,
"step": 461
},
{
"epoch": 0.2254391672088484,
"grad_norm": 0.17931310832500458,
"learning_rate": 0.0001,
"loss": 1.8331,
"step": 462
},
{
"epoch": 0.2259271307742355,
"grad_norm": 0.18687482178211212,
"learning_rate": 0.0001,
"loss": 1.7745,
"step": 463
},
{
"epoch": 0.22641509433962265,
"grad_norm": 0.18673428893089294,
"learning_rate": 0.0001,
"loss": 1.8001,
"step": 464
},
{
"epoch": 0.22690305790500975,
"grad_norm": 0.18758326768875122,
"learning_rate": 0.0001,
"loss": 1.8024,
"step": 465
},
{
"epoch": 0.2273910214703969,
"grad_norm": 0.17651711404323578,
"learning_rate": 0.0001,
"loss": 1.6348,
"step": 466
},
{
"epoch": 0.227878985035784,
"grad_norm": 0.17466424405574799,
"learning_rate": 0.0001,
"loss": 1.6529,
"step": 467
},
{
"epoch": 0.2283669486011711,
"grad_norm": 0.17049545049667358,
"learning_rate": 0.0001,
"loss": 1.6707,
"step": 468
},
{
"epoch": 0.22885491216655823,
"grad_norm": 0.19238895177841187,
"learning_rate": 0.0001,
"loss": 1.7262,
"step": 469
},
{
"epoch": 0.22934287573194534,
"grad_norm": 0.183549702167511,
"learning_rate": 0.0001,
"loss": 1.6949,
"step": 470
},
{
"epoch": 0.22983083929733247,
"grad_norm": 0.19222155213356018,
"learning_rate": 0.0001,
"loss": 1.7727,
"step": 471
},
{
"epoch": 0.23031880286271958,
"grad_norm": 0.18078762292861938,
"learning_rate": 0.0001,
"loss": 1.8166,
"step": 472
},
{
"epoch": 0.2308067664281067,
"grad_norm": 0.17769628763198853,
"learning_rate": 0.0001,
"loss": 1.7215,
"step": 473
},
{
"epoch": 0.23129472999349382,
"grad_norm": 0.1750006526708603,
"learning_rate": 0.0001,
"loss": 1.7311,
"step": 474
},
{
"epoch": 0.23178269355888093,
"grad_norm": 0.1803676038980484,
"learning_rate": 0.0001,
"loss": 1.7596,
"step": 475
},
{
"epoch": 0.23227065712426806,
"grad_norm": 0.18478356301784515,
"learning_rate": 0.0001,
"loss": 1.7262,
"step": 476
},
{
"epoch": 0.23275862068965517,
"grad_norm": 0.16509763896465302,
"learning_rate": 0.0001,
"loss": 1.623,
"step": 477
},
{
"epoch": 0.2332465842550423,
"grad_norm": 0.19317001104354858,
"learning_rate": 0.0001,
"loss": 1.6284,
"step": 478
},
{
"epoch": 0.2337345478204294,
"grad_norm": 0.18081186711788177,
"learning_rate": 0.0001,
"loss": 1.6959,
"step": 479
},
{
"epoch": 0.2342225113858165,
"grad_norm": 0.18306545913219452,
"learning_rate": 0.0001,
"loss": 1.7328,
"step": 480
},
{
"epoch": 0.23471047495120365,
"grad_norm": 0.18552261590957642,
"learning_rate": 0.0001,
"loss": 1.6847,
"step": 481
},
{
"epoch": 0.23519843851659075,
"grad_norm": 0.17930322885513306,
"learning_rate": 0.0001,
"loss": 1.7678,
"step": 482
},
{
"epoch": 0.23568640208197789,
"grad_norm": 0.17558367550373077,
"learning_rate": 0.0001,
"loss": 1.6756,
"step": 483
},
{
"epoch": 0.236174365647365,
"grad_norm": 0.18899041414260864,
"learning_rate": 0.0001,
"loss": 1.7778,
"step": 484
},
{
"epoch": 0.23666232921275213,
"grad_norm": 0.17528998851776123,
"learning_rate": 0.0001,
"loss": 1.6651,
"step": 485
},
{
"epoch": 0.23715029277813923,
"grad_norm": 0.16732053458690643,
"learning_rate": 0.0001,
"loss": 1.6796,
"step": 486
},
{
"epoch": 0.23763825634352634,
"grad_norm": 0.1849820613861084,
"learning_rate": 0.0001,
"loss": 1.737,
"step": 487
},
{
"epoch": 0.23812621990891347,
"grad_norm": 0.1789163500070572,
"learning_rate": 0.0001,
"loss": 1.6919,
"step": 488
},
{
"epoch": 0.23861418347430058,
"grad_norm": 0.1739804446697235,
"learning_rate": 0.0001,
"loss": 1.8225,
"step": 489
},
{
"epoch": 0.2391021470396877,
"grad_norm": 0.18246984481811523,
"learning_rate": 0.0001,
"loss": 1.734,
"step": 490
},
{
"epoch": 0.23959011060507482,
"grad_norm": 0.17464157938957214,
"learning_rate": 0.0001,
"loss": 1.7442,
"step": 491
},
{
"epoch": 0.24007807417046195,
"grad_norm": 0.19501306116580963,
"learning_rate": 0.0001,
"loss": 1.7521,
"step": 492
},
{
"epoch": 0.24056603773584906,
"grad_norm": 0.17958857119083405,
"learning_rate": 0.0001,
"loss": 1.8191,
"step": 493
},
{
"epoch": 0.24105400130123616,
"grad_norm": 0.18241986632347107,
"learning_rate": 0.0001,
"loss": 1.7709,
"step": 494
},
{
"epoch": 0.2415419648666233,
"grad_norm": 0.18529468774795532,
"learning_rate": 0.0001,
"loss": 1.6871,
"step": 495
},
{
"epoch": 0.2420299284320104,
"grad_norm": 0.18519562482833862,
"learning_rate": 0.0001,
"loss": 1.7605,
"step": 496
},
{
"epoch": 0.24251789199739754,
"grad_norm": 0.17868764698505402,
"learning_rate": 0.0001,
"loss": 1.725,
"step": 497
},
{
"epoch": 0.24300585556278465,
"grad_norm": 0.17040537297725677,
"learning_rate": 0.0001,
"loss": 1.6161,
"step": 498
},
{
"epoch": 0.24349381912817175,
"grad_norm": 0.1820056289434433,
"learning_rate": 0.0001,
"loss": 1.7249,
"step": 499
},
{
"epoch": 0.24398178269355889,
"grad_norm": 0.1877366453409195,
"learning_rate": 0.0001,
"loss": 1.6976,
"step": 500
},
{
"epoch": 0.244469746258946,
"grad_norm": 0.1717415153980255,
"learning_rate": 0.0001,
"loss": 1.6109,
"step": 501
},
{
"epoch": 0.24495770982433313,
"grad_norm": 0.17338915169239044,
"learning_rate": 0.0001,
"loss": 1.7433,
"step": 502
},
{
"epoch": 0.24544567338972023,
"grad_norm": 0.18489517271518707,
"learning_rate": 0.0001,
"loss": 1.7283,
"step": 503
},
{
"epoch": 0.24593363695510737,
"grad_norm": 0.17153921723365784,
"learning_rate": 0.0001,
"loss": 1.7261,
"step": 504
},
{
"epoch": 0.24642160052049447,
"grad_norm": 0.19024662673473358,
"learning_rate": 0.0001,
"loss": 1.8498,
"step": 505
},
{
"epoch": 0.24690956408588158,
"grad_norm": 0.1675989329814911,
"learning_rate": 0.0001,
"loss": 1.5903,
"step": 506
},
{
"epoch": 0.2473975276512687,
"grad_norm": 0.18422546982765198,
"learning_rate": 0.0001,
"loss": 1.7294,
"step": 507
},
{
"epoch": 0.24788549121665582,
"grad_norm": 0.17943088710308075,
"learning_rate": 0.0001,
"loss": 1.6842,
"step": 508
},
{
"epoch": 0.24837345478204295,
"grad_norm": 0.18048308789730072,
"learning_rate": 0.0001,
"loss": 1.677,
"step": 509
},
{
"epoch": 0.24886141834743006,
"grad_norm": 0.17185211181640625,
"learning_rate": 0.0001,
"loss": 1.6738,
"step": 510
},
{
"epoch": 0.24934938191281716,
"grad_norm": 0.1717991977930069,
"learning_rate": 0.0001,
"loss": 1.7077,
"step": 511
},
{
"epoch": 0.2498373454782043,
"grad_norm": 0.18661388754844666,
"learning_rate": 0.0001,
"loss": 1.8163,
"step": 512
},
{
"epoch": 0.2503253090435914,
"grad_norm": 0.19672876596450806,
"learning_rate": 0.0001,
"loss": 1.7733,
"step": 513
},
{
"epoch": 0.2508132726089785,
"grad_norm": 0.18052315711975098,
"learning_rate": 0.0001,
"loss": 1.7242,
"step": 514
},
{
"epoch": 0.25130123617436567,
"grad_norm": 0.17241713404655457,
"learning_rate": 0.0001,
"loss": 1.6513,
"step": 515
},
{
"epoch": 0.2517891997397528,
"grad_norm": 0.1861806958913803,
"learning_rate": 0.0001,
"loss": 1.7189,
"step": 516
},
{
"epoch": 0.2522771633051399,
"grad_norm": 0.17267678678035736,
"learning_rate": 0.0001,
"loss": 1.5993,
"step": 517
},
{
"epoch": 0.252765126870527,
"grad_norm": 0.16948658227920532,
"learning_rate": 0.0001,
"loss": 1.5733,
"step": 518
},
{
"epoch": 0.2532530904359141,
"grad_norm": 0.18075625598430634,
"learning_rate": 0.0001,
"loss": 1.7755,
"step": 519
},
{
"epoch": 0.25374105400130126,
"grad_norm": 0.17203836143016815,
"learning_rate": 0.0001,
"loss": 1.6755,
"step": 520
},
{
"epoch": 0.25422901756668836,
"grad_norm": 0.1631672978401184,
"learning_rate": 0.0001,
"loss": 1.5949,
"step": 521
},
{
"epoch": 0.25471698113207547,
"grad_norm": 0.1776244342327118,
"learning_rate": 0.0001,
"loss": 1.7231,
"step": 522
},
{
"epoch": 0.2552049446974626,
"grad_norm": 0.18010790646076202,
"learning_rate": 0.0001,
"loss": 1.7575,
"step": 523
},
{
"epoch": 0.2556929082628497,
"grad_norm": 0.16827166080474854,
"learning_rate": 0.0001,
"loss": 1.6907,
"step": 524
},
{
"epoch": 0.25618087182823684,
"grad_norm": 0.19028151035308838,
"learning_rate": 0.0001,
"loss": 1.6602,
"step": 525
},
{
"epoch": 0.25666883539362395,
"grad_norm": 0.17831748723983765,
"learning_rate": 0.0001,
"loss": 1.7746,
"step": 526
},
{
"epoch": 0.25715679895901106,
"grad_norm": 0.19768738746643066,
"learning_rate": 0.0001,
"loss": 1.7111,
"step": 527
},
{
"epoch": 0.25764476252439816,
"grad_norm": 0.1869453638792038,
"learning_rate": 0.0001,
"loss": 1.7493,
"step": 528
},
{
"epoch": 0.25813272608978527,
"grad_norm": 0.17493435740470886,
"learning_rate": 0.0001,
"loss": 1.6401,
"step": 529
},
{
"epoch": 0.25862068965517243,
"grad_norm": 0.1741894632577896,
"learning_rate": 0.0001,
"loss": 1.6737,
"step": 530
},
{
"epoch": 0.25910865322055954,
"grad_norm": 0.19671699404716492,
"learning_rate": 0.0001,
"loss": 1.7265,
"step": 531
},
{
"epoch": 0.25959661678594664,
"grad_norm": 0.1766589730978012,
"learning_rate": 0.0001,
"loss": 1.6655,
"step": 532
},
{
"epoch": 0.26008458035133375,
"grad_norm": 0.17494948208332062,
"learning_rate": 0.0001,
"loss": 1.6571,
"step": 533
},
{
"epoch": 0.2605725439167209,
"grad_norm": 0.20303772389888763,
"learning_rate": 0.0001,
"loss": 1.7987,
"step": 534
},
{
"epoch": 0.261060507482108,
"grad_norm": 0.18097007274627686,
"learning_rate": 0.0001,
"loss": 1.6341,
"step": 535
},
{
"epoch": 0.2615484710474951,
"grad_norm": 0.20877449214458466,
"learning_rate": 0.0001,
"loss": 1.7057,
"step": 536
},
{
"epoch": 0.26203643461288223,
"grad_norm": 0.19047099351882935,
"learning_rate": 0.0001,
"loss": 1.7048,
"step": 537
},
{
"epoch": 0.26252439817826934,
"grad_norm": 0.18251296877861023,
"learning_rate": 0.0001,
"loss": 1.6979,
"step": 538
},
{
"epoch": 0.2630123617436565,
"grad_norm": 0.18078570067882538,
"learning_rate": 0.0001,
"loss": 1.801,
"step": 539
},
{
"epoch": 0.2635003253090436,
"grad_norm": 0.18725551664829254,
"learning_rate": 0.0001,
"loss": 1.7638,
"step": 540
},
{
"epoch": 0.2639882888744307,
"grad_norm": 0.20769141614437103,
"learning_rate": 0.0001,
"loss": 1.8201,
"step": 541
},
{
"epoch": 0.2644762524398178,
"grad_norm": 0.16759508848190308,
"learning_rate": 0.0001,
"loss": 1.6739,
"step": 542
},
{
"epoch": 0.2649642160052049,
"grad_norm": 0.20297077298164368,
"learning_rate": 0.0001,
"loss": 1.8241,
"step": 543
},
{
"epoch": 0.2654521795705921,
"grad_norm": 0.17038699984550476,
"learning_rate": 0.0001,
"loss": 1.6566,
"step": 544
},
{
"epoch": 0.2659401431359792,
"grad_norm": 0.17414064705371857,
"learning_rate": 0.0001,
"loss": 1.5866,
"step": 545
},
{
"epoch": 0.2664281067013663,
"grad_norm": 0.1856188178062439,
"learning_rate": 0.0001,
"loss": 1.7166,
"step": 546
},
{
"epoch": 0.2669160702667534,
"grad_norm": 0.17565833032131195,
"learning_rate": 0.0001,
"loss": 1.7206,
"step": 547
},
{
"epoch": 0.2674040338321405,
"grad_norm": 0.18267709016799927,
"learning_rate": 0.0001,
"loss": 1.6728,
"step": 548
},
{
"epoch": 0.26789199739752767,
"grad_norm": 0.18981780111789703,
"learning_rate": 0.0001,
"loss": 1.7425,
"step": 549
},
{
"epoch": 0.2683799609629148,
"grad_norm": 0.18254795670509338,
"learning_rate": 0.0001,
"loss": 1.6948,
"step": 550
},
{
"epoch": 0.2688679245283019,
"grad_norm": 0.18846552073955536,
"learning_rate": 0.0001,
"loss": 1.6572,
"step": 551
},
{
"epoch": 0.269355888093689,
"grad_norm": 0.1776316910982132,
"learning_rate": 0.0001,
"loss": 1.618,
"step": 552
},
{
"epoch": 0.2698438516590761,
"grad_norm": 0.1822226643562317,
"learning_rate": 0.0001,
"loss": 1.8876,
"step": 553
},
{
"epoch": 0.27033181522446326,
"grad_norm": 0.1873788982629776,
"learning_rate": 0.0001,
"loss": 1.7301,
"step": 554
},
{
"epoch": 0.27081977878985036,
"grad_norm": 0.19234952330589294,
"learning_rate": 0.0001,
"loss": 1.7235,
"step": 555
},
{
"epoch": 0.27130774235523747,
"grad_norm": 0.17642012238502502,
"learning_rate": 0.0001,
"loss": 1.7258,
"step": 556
},
{
"epoch": 0.2717957059206246,
"grad_norm": 0.21255896985530853,
"learning_rate": 0.0001,
"loss": 1.6937,
"step": 557
},
{
"epoch": 0.27228366948601174,
"grad_norm": 0.2181590497493744,
"learning_rate": 0.0001,
"loss": 1.9076,
"step": 558
},
{
"epoch": 0.27277163305139884,
"grad_norm": 0.16595962643623352,
"learning_rate": 0.0001,
"loss": 1.5664,
"step": 559
},
{
"epoch": 0.27325959661678595,
"grad_norm": 0.1832776963710785,
"learning_rate": 0.0001,
"loss": 1.658,
"step": 560
},
{
"epoch": 0.27374756018217306,
"grad_norm": 0.18969666957855225,
"learning_rate": 0.0001,
"loss": 1.8031,
"step": 561
},
{
"epoch": 0.27423552374756016,
"grad_norm": 0.1813500076532364,
"learning_rate": 0.0001,
"loss": 1.7209,
"step": 562
},
{
"epoch": 0.2747234873129473,
"grad_norm": 0.18055056035518646,
"learning_rate": 0.0001,
"loss": 1.7658,
"step": 563
},
{
"epoch": 0.27521145087833443,
"grad_norm": 0.17362233996391296,
"learning_rate": 0.0001,
"loss": 1.7746,
"step": 564
},
{
"epoch": 0.27569941444372154,
"grad_norm": 0.19305916130542755,
"learning_rate": 0.0001,
"loss": 1.9062,
"step": 565
},
{
"epoch": 0.27618737800910864,
"grad_norm": 0.17458635568618774,
"learning_rate": 0.0001,
"loss": 1.6339,
"step": 566
},
{
"epoch": 0.27667534157449575,
"grad_norm": 0.18760624527931213,
"learning_rate": 0.0001,
"loss": 1.6433,
"step": 567
},
{
"epoch": 0.2771633051398829,
"grad_norm": 0.17057117819786072,
"learning_rate": 0.0001,
"loss": 1.6318,
"step": 568
},
{
"epoch": 0.27765126870527,
"grad_norm": 0.17930074036121368,
"learning_rate": 0.0001,
"loss": 1.7227,
"step": 569
},
{
"epoch": 0.2781392322706571,
"grad_norm": 0.17012158036231995,
"learning_rate": 0.0001,
"loss": 1.6309,
"step": 570
},
{
"epoch": 0.27862719583604423,
"grad_norm": 0.17562495172023773,
"learning_rate": 0.0001,
"loss": 1.6351,
"step": 571
},
{
"epoch": 0.27911515940143133,
"grad_norm": 0.18494853377342224,
"learning_rate": 0.0001,
"loss": 1.8355,
"step": 572
},
{
"epoch": 0.2796031229668185,
"grad_norm": 0.18261797726154327,
"learning_rate": 0.0001,
"loss": 1.6015,
"step": 573
},
{
"epoch": 0.2800910865322056,
"grad_norm": 0.18148979544639587,
"learning_rate": 0.0001,
"loss": 1.797,
"step": 574
},
{
"epoch": 0.2805790500975927,
"grad_norm": 0.16941653192043304,
"learning_rate": 0.0001,
"loss": 1.6382,
"step": 575
},
{
"epoch": 0.2810670136629798,
"grad_norm": 0.18611697852611542,
"learning_rate": 0.0001,
"loss": 1.6595,
"step": 576
},
{
"epoch": 0.281554977228367,
"grad_norm": 0.16945675015449524,
"learning_rate": 0.0001,
"loss": 1.6678,
"step": 577
},
{
"epoch": 0.2820429407937541,
"grad_norm": 0.17999336123466492,
"learning_rate": 0.0001,
"loss": 1.7161,
"step": 578
},
{
"epoch": 0.2825309043591412,
"grad_norm": 0.185410276055336,
"learning_rate": 0.0001,
"loss": 1.6731,
"step": 579
},
{
"epoch": 0.2830188679245283,
"grad_norm": 0.1757509708404541,
"learning_rate": 0.0001,
"loss": 1.7162,
"step": 580
},
{
"epoch": 0.2835068314899154,
"grad_norm": 0.1721939593553543,
"learning_rate": 0.0001,
"loss": 1.6374,
"step": 581
},
{
"epoch": 0.28399479505530256,
"grad_norm": 0.17961697280406952,
"learning_rate": 0.0001,
"loss": 1.5798,
"step": 582
},
{
"epoch": 0.28448275862068967,
"grad_norm": 0.18612822890281677,
"learning_rate": 0.0001,
"loss": 1.7694,
"step": 583
},
{
"epoch": 0.2849707221860768,
"grad_norm": 0.18089883029460907,
"learning_rate": 0.0001,
"loss": 1.7426,
"step": 584
},
{
"epoch": 0.2854586857514639,
"grad_norm": 0.19402338564395905,
"learning_rate": 0.0001,
"loss": 1.7604,
"step": 585
},
{
"epoch": 0.285946649316851,
"grad_norm": 0.18208986520767212,
"learning_rate": 0.0001,
"loss": 1.6998,
"step": 586
},
{
"epoch": 0.28643461288223815,
"grad_norm": 0.19270221889019012,
"learning_rate": 0.0001,
"loss": 1.6564,
"step": 587
},
{
"epoch": 0.28692257644762525,
"grad_norm": 0.17604075372219086,
"learning_rate": 0.0001,
"loss": 1.653,
"step": 588
},
{
"epoch": 0.28741054001301236,
"grad_norm": 0.17964652180671692,
"learning_rate": 0.0001,
"loss": 1.7613,
"step": 589
},
{
"epoch": 0.28789850357839947,
"grad_norm": 0.18317797780036926,
"learning_rate": 0.0001,
"loss": 1.6621,
"step": 590
},
{
"epoch": 0.2883864671437866,
"grad_norm": 0.18271799385547638,
"learning_rate": 0.0001,
"loss": 1.8067,
"step": 591
},
{
"epoch": 0.28887443070917374,
"grad_norm": 0.19613641500473022,
"learning_rate": 0.0001,
"loss": 1.8544,
"step": 592
},
{
"epoch": 0.28936239427456084,
"grad_norm": 0.19165842235088348,
"learning_rate": 0.0001,
"loss": 1.8834,
"step": 593
},
{
"epoch": 0.28985035783994795,
"grad_norm": 0.18238607048988342,
"learning_rate": 0.0001,
"loss": 1.7776,
"step": 594
},
{
"epoch": 0.29033832140533505,
"grad_norm": 0.16585291922092438,
"learning_rate": 0.0001,
"loss": 1.5959,
"step": 595
},
{
"epoch": 0.29082628497072216,
"grad_norm": 0.1774480640888214,
"learning_rate": 0.0001,
"loss": 1.6114,
"step": 596
},
{
"epoch": 0.2913142485361093,
"grad_norm": 0.17970281839370728,
"learning_rate": 0.0001,
"loss": 1.79,
"step": 597
},
{
"epoch": 0.2918022121014964,
"grad_norm": 0.18806995451450348,
"learning_rate": 0.0001,
"loss": 1.7842,
"step": 598
},
{
"epoch": 0.29229017566688353,
"grad_norm": 0.16845998167991638,
"learning_rate": 0.0001,
"loss": 1.6788,
"step": 599
},
{
"epoch": 0.29277813923227064,
"grad_norm": 0.18506960570812225,
"learning_rate": 0.0001,
"loss": 1.758,
"step": 600
},
{
"epoch": 0.2932661027976578,
"grad_norm": 0.1771155744791031,
"learning_rate": 0.0001,
"loss": 1.7259,
"step": 601
},
{
"epoch": 0.2937540663630449,
"grad_norm": 0.1760523021221161,
"learning_rate": 0.0001,
"loss": 1.7807,
"step": 602
},
{
"epoch": 0.294242029928432,
"grad_norm": 0.1765487641096115,
"learning_rate": 0.0001,
"loss": 1.5886,
"step": 603
},
{
"epoch": 0.2947299934938191,
"grad_norm": 0.17646710574626923,
"learning_rate": 0.0001,
"loss": 1.6508,
"step": 604
},
{
"epoch": 0.2952179570592062,
"grad_norm": 0.18383362889289856,
"learning_rate": 0.0001,
"loss": 1.7049,
"step": 605
},
{
"epoch": 0.2957059206245934,
"grad_norm": 0.18808609247207642,
"learning_rate": 0.0001,
"loss": 1.6948,
"step": 606
},
{
"epoch": 0.2961938841899805,
"grad_norm": 0.18178711831569672,
"learning_rate": 0.0001,
"loss": 1.7306,
"step": 607
},
{
"epoch": 0.2966818477553676,
"grad_norm": 0.18499815464019775,
"learning_rate": 0.0001,
"loss": 1.6072,
"step": 608
},
{
"epoch": 0.2971698113207547,
"grad_norm": 0.18511821329593658,
"learning_rate": 0.0001,
"loss": 1.6383,
"step": 609
},
{
"epoch": 0.2976577748861418,
"grad_norm": 0.17731331288814545,
"learning_rate": 0.0001,
"loss": 1.738,
"step": 610
},
{
"epoch": 0.298145738451529,
"grad_norm": 0.19273065030574799,
"learning_rate": 0.0001,
"loss": 1.6286,
"step": 611
},
{
"epoch": 0.2986337020169161,
"grad_norm": 0.1858029067516327,
"learning_rate": 0.0001,
"loss": 1.6565,
"step": 612
},
{
"epoch": 0.2991216655823032,
"grad_norm": 0.18791264295578003,
"learning_rate": 0.0001,
"loss": 1.6857,
"step": 613
},
{
"epoch": 0.2996096291476903,
"grad_norm": 0.19478711485862732,
"learning_rate": 0.0001,
"loss": 1.6655,
"step": 614
},
{
"epoch": 0.3000975927130774,
"grad_norm": 0.18538743257522583,
"learning_rate": 0.0001,
"loss": 1.701,
"step": 615
},
{
"epoch": 0.30058555627846456,
"grad_norm": 0.1899065524339676,
"learning_rate": 0.0001,
"loss": 1.7014,
"step": 616
},
{
"epoch": 0.30107351984385167,
"grad_norm": 0.19550780951976776,
"learning_rate": 0.0001,
"loss": 1.8021,
"step": 617
},
{
"epoch": 0.3015614834092388,
"grad_norm": 0.1695028841495514,
"learning_rate": 0.0001,
"loss": 1.6423,
"step": 618
},
{
"epoch": 0.3020494469746259,
"grad_norm": 0.18605121970176697,
"learning_rate": 0.0001,
"loss": 1.7441,
"step": 619
},
{
"epoch": 0.302537410540013,
"grad_norm": 0.20526890456676483,
"learning_rate": 0.0001,
"loss": 1.7878,
"step": 620
},
{
"epoch": 0.30302537410540015,
"grad_norm": 0.17033647000789642,
"learning_rate": 0.0001,
"loss": 1.688,
"step": 621
},
{
"epoch": 0.30351333767078725,
"grad_norm": 0.1756584197282791,
"learning_rate": 0.0001,
"loss": 1.6914,
"step": 622
},
{
"epoch": 0.30400130123617436,
"grad_norm": 0.18451380729675293,
"learning_rate": 0.0001,
"loss": 1.6135,
"step": 623
},
{
"epoch": 0.30448926480156147,
"grad_norm": 0.17828862369060516,
"learning_rate": 0.0001,
"loss": 1.677,
"step": 624
},
{
"epoch": 0.3049772283669486,
"grad_norm": 0.17056816816329956,
"learning_rate": 0.0001,
"loss": 1.647,
"step": 625
},
{
"epoch": 0.30546519193233573,
"grad_norm": 0.1786261945962906,
"learning_rate": 0.0001,
"loss": 1.7212,
"step": 626
},
{
"epoch": 0.30595315549772284,
"grad_norm": 0.1788036823272705,
"learning_rate": 0.0001,
"loss": 1.6646,
"step": 627
},
{
"epoch": 0.30644111906310995,
"grad_norm": 0.17864547669887543,
"learning_rate": 0.0001,
"loss": 1.7123,
"step": 628
},
{
"epoch": 0.30692908262849705,
"grad_norm": 0.19462743401527405,
"learning_rate": 0.0001,
"loss": 1.7975,
"step": 629
},
{
"epoch": 0.3074170461938842,
"grad_norm": 0.17800424993038177,
"learning_rate": 0.0001,
"loss": 1.5499,
"step": 630
},
{
"epoch": 0.3079050097592713,
"grad_norm": 0.1856238692998886,
"learning_rate": 0.0001,
"loss": 1.9104,
"step": 631
},
{
"epoch": 0.3083929733246584,
"grad_norm": 0.17673279345035553,
"learning_rate": 0.0001,
"loss": 1.6382,
"step": 632
},
{
"epoch": 0.30888093689004553,
"grad_norm": 0.18032853305339813,
"learning_rate": 0.0001,
"loss": 1.7374,
"step": 633
},
{
"epoch": 0.30936890045543264,
"grad_norm": 0.17968174815177917,
"learning_rate": 0.0001,
"loss": 1.662,
"step": 634
},
{
"epoch": 0.3098568640208198,
"grad_norm": 0.1789749562740326,
"learning_rate": 0.0001,
"loss": 1.6044,
"step": 635
},
{
"epoch": 0.3103448275862069,
"grad_norm": 0.175074502825737,
"learning_rate": 0.0001,
"loss": 1.7047,
"step": 636
},
{
"epoch": 0.310832791151594,
"grad_norm": 0.17318876087665558,
"learning_rate": 0.0001,
"loss": 1.6148,
"step": 637
},
{
"epoch": 0.3113207547169811,
"grad_norm": 0.20739412307739258,
"learning_rate": 0.0001,
"loss": 1.9162,
"step": 638
},
{
"epoch": 0.3118087182823682,
"grad_norm": 0.1787186861038208,
"learning_rate": 0.0001,
"loss": 1.6657,
"step": 639
},
{
"epoch": 0.3122966818477554,
"grad_norm": 0.1855590045452118,
"learning_rate": 0.0001,
"loss": 1.7058,
"step": 640
},
{
"epoch": 0.3127846454131425,
"grad_norm": 0.17939618229866028,
"learning_rate": 0.0001,
"loss": 1.7663,
"step": 641
},
{
"epoch": 0.3132726089785296,
"grad_norm": 0.17440925538539886,
"learning_rate": 0.0001,
"loss": 1.6337,
"step": 642
},
{
"epoch": 0.3137605725439167,
"grad_norm": 0.19695165753364563,
"learning_rate": 0.0001,
"loss": 1.6048,
"step": 643
},
{
"epoch": 0.3142485361093038,
"grad_norm": 0.16877804696559906,
"learning_rate": 0.0001,
"loss": 1.6677,
"step": 644
},
{
"epoch": 0.314736499674691,
"grad_norm": 0.1742711365222931,
"learning_rate": 0.0001,
"loss": 1.6459,
"step": 645
},
{
"epoch": 0.3152244632400781,
"grad_norm": 0.18073154985904694,
"learning_rate": 0.0001,
"loss": 1.7392,
"step": 646
},
{
"epoch": 0.3157124268054652,
"grad_norm": 0.1714729368686676,
"learning_rate": 0.0001,
"loss": 1.6981,
"step": 647
},
{
"epoch": 0.3162003903708523,
"grad_norm": 0.17316888272762299,
"learning_rate": 0.0001,
"loss": 1.6746,
"step": 648
},
{
"epoch": 0.31668835393623945,
"grad_norm": 0.1779533475637436,
"learning_rate": 0.0001,
"loss": 1.7709,
"step": 649
},
{
"epoch": 0.31717631750162656,
"grad_norm": 0.1709679216146469,
"learning_rate": 0.0001,
"loss": 1.5822,
"step": 650
},
{
"epoch": 0.31766428106701367,
"grad_norm": 0.17804761230945587,
"learning_rate": 0.0001,
"loss": 1.7638,
"step": 651
},
{
"epoch": 0.31815224463240077,
"grad_norm": 0.18509989976882935,
"learning_rate": 0.0001,
"loss": 1.8712,
"step": 652
},
{
"epoch": 0.3186402081977879,
"grad_norm": 0.1751030832529068,
"learning_rate": 0.0001,
"loss": 1.7032,
"step": 653
},
{
"epoch": 0.31912817176317504,
"grad_norm": 0.17232050001621246,
"learning_rate": 0.0001,
"loss": 1.6331,
"step": 654
},
{
"epoch": 0.31961613532856215,
"grad_norm": 0.17198053002357483,
"learning_rate": 0.0001,
"loss": 1.7067,
"step": 655
},
{
"epoch": 0.32010409889394925,
"grad_norm": 0.1797952950000763,
"learning_rate": 0.0001,
"loss": 1.687,
"step": 656
},
{
"epoch": 0.32059206245933636,
"grad_norm": 0.1817045360803604,
"learning_rate": 0.0001,
"loss": 1.7448,
"step": 657
},
{
"epoch": 0.32108002602472346,
"grad_norm": 0.1710105687379837,
"learning_rate": 0.0001,
"loss": 1.6186,
"step": 658
},
{
"epoch": 0.3215679895901106,
"grad_norm": 0.19661752879619598,
"learning_rate": 0.0001,
"loss": 1.7867,
"step": 659
},
{
"epoch": 0.32205595315549773,
"grad_norm": 0.1723627746105194,
"learning_rate": 0.0001,
"loss": 1.5887,
"step": 660
},
{
"epoch": 0.32254391672088484,
"grad_norm": 0.21364371478557587,
"learning_rate": 0.0001,
"loss": 1.8418,
"step": 661
},
{
"epoch": 0.32303188028627194,
"grad_norm": 0.17605622112751007,
"learning_rate": 0.0001,
"loss": 1.6892,
"step": 662
},
{
"epoch": 0.32351984385165905,
"grad_norm": 0.17851850390434265,
"learning_rate": 0.0001,
"loss": 1.7639,
"step": 663
},
{
"epoch": 0.3240078074170462,
"grad_norm": 0.1816173940896988,
"learning_rate": 0.0001,
"loss": 1.6567,
"step": 664
},
{
"epoch": 0.3244957709824333,
"grad_norm": 0.17529702186584473,
"learning_rate": 0.0001,
"loss": 1.6945,
"step": 665
},
{
"epoch": 0.3249837345478204,
"grad_norm": 0.16997535526752472,
"learning_rate": 0.0001,
"loss": 1.6833,
"step": 666
},
{
"epoch": 0.32547169811320753,
"grad_norm": 0.18423834443092346,
"learning_rate": 0.0001,
"loss": 1.7486,
"step": 667
},
{
"epoch": 0.3259596616785947,
"grad_norm": 0.18737761676311493,
"learning_rate": 0.0001,
"loss": 1.7561,
"step": 668
},
{
"epoch": 0.3264476252439818,
"grad_norm": 0.17731069028377533,
"learning_rate": 0.0001,
"loss": 1.5679,
"step": 669
},
{
"epoch": 0.3269355888093689,
"grad_norm": 0.197565495967865,
"learning_rate": 0.0001,
"loss": 1.7457,
"step": 670
},
{
"epoch": 0.327423552374756,
"grad_norm": 0.19319871068000793,
"learning_rate": 0.0001,
"loss": 1.8458,
"step": 671
},
{
"epoch": 0.3279115159401431,
"grad_norm": 0.18049995601177216,
"learning_rate": 0.0001,
"loss": 1.7076,
"step": 672
},
{
"epoch": 0.3283994795055303,
"grad_norm": 0.18907921016216278,
"learning_rate": 0.0001,
"loss": 1.7031,
"step": 673
},
{
"epoch": 0.3288874430709174,
"grad_norm": 0.18252240121364594,
"learning_rate": 0.0001,
"loss": 1.6304,
"step": 674
},
{
"epoch": 0.3293754066363045,
"grad_norm": 0.1798553168773651,
"learning_rate": 0.0001,
"loss": 1.6504,
"step": 675
},
{
"epoch": 0.3298633702016916,
"grad_norm": 0.1712959110736847,
"learning_rate": 0.0001,
"loss": 1.6827,
"step": 676
},
{
"epoch": 0.3303513337670787,
"grad_norm": 0.169499009847641,
"learning_rate": 0.0001,
"loss": 1.67,
"step": 677
},
{
"epoch": 0.33083929733246586,
"grad_norm": 0.17921562492847443,
"learning_rate": 0.0001,
"loss": 1.6913,
"step": 678
},
{
"epoch": 0.33132726089785297,
"grad_norm": 0.16730189323425293,
"learning_rate": 0.0001,
"loss": 1.6585,
"step": 679
},
{
"epoch": 0.3318152244632401,
"grad_norm": 0.1731245219707489,
"learning_rate": 0.0001,
"loss": 1.6891,
"step": 680
},
{
"epoch": 0.3323031880286272,
"grad_norm": 0.18989908695220947,
"learning_rate": 0.0001,
"loss": 1.8335,
"step": 681
},
{
"epoch": 0.3327911515940143,
"grad_norm": 0.17079797387123108,
"learning_rate": 0.0001,
"loss": 1.6074,
"step": 682
},
{
"epoch": 0.33327911515940145,
"grad_norm": 0.1855732947587967,
"learning_rate": 0.0001,
"loss": 1.8051,
"step": 683
},
{
"epoch": 0.33376707872478856,
"grad_norm": 0.19362801313400269,
"learning_rate": 0.0001,
"loss": 1.7934,
"step": 684
},
{
"epoch": 0.33425504229017566,
"grad_norm": 0.18407447636127472,
"learning_rate": 0.0001,
"loss": 1.7676,
"step": 685
},
{
"epoch": 0.33474300585556277,
"grad_norm": 0.17326807975769043,
"learning_rate": 0.0001,
"loss": 1.6867,
"step": 686
},
{
"epoch": 0.3352309694209499,
"grad_norm": 0.18629767000675201,
"learning_rate": 0.0001,
"loss": 1.7577,
"step": 687
},
{
"epoch": 0.33571893298633704,
"grad_norm": 0.19202108681201935,
"learning_rate": 0.0001,
"loss": 1.7742,
"step": 688
},
{
"epoch": 0.33620689655172414,
"grad_norm": 0.1923230141401291,
"learning_rate": 0.0001,
"loss": 1.7646,
"step": 689
},
{
"epoch": 0.33669486011711125,
"grad_norm": 0.1855097860097885,
"learning_rate": 0.0001,
"loss": 1.7189,
"step": 690
},
{
"epoch": 0.33718282368249836,
"grad_norm": 0.17661595344543457,
"learning_rate": 0.0001,
"loss": 1.6404,
"step": 691
},
{
"epoch": 0.3376707872478855,
"grad_norm": 0.19284093379974365,
"learning_rate": 0.0001,
"loss": 1.7621,
"step": 692
},
{
"epoch": 0.3381587508132726,
"grad_norm": 0.18006063997745514,
"learning_rate": 0.0001,
"loss": 1.6163,
"step": 693
},
{
"epoch": 0.33864671437865973,
"grad_norm": 0.1881456822156906,
"learning_rate": 0.0001,
"loss": 1.732,
"step": 694
},
{
"epoch": 0.33913467794404684,
"grad_norm": 0.17196986079216003,
"learning_rate": 0.0001,
"loss": 1.7099,
"step": 695
},
{
"epoch": 0.33962264150943394,
"grad_norm": 0.186056986451149,
"learning_rate": 0.0001,
"loss": 1.8247,
"step": 696
},
{
"epoch": 0.3401106050748211,
"grad_norm": 0.18548524379730225,
"learning_rate": 0.0001,
"loss": 1.7185,
"step": 697
},
{
"epoch": 0.3405985686402082,
"grad_norm": 0.182390958070755,
"learning_rate": 0.0001,
"loss": 1.8278,
"step": 698
},
{
"epoch": 0.3410865322055953,
"grad_norm": 0.18355803191661835,
"learning_rate": 0.0001,
"loss": 1.6432,
"step": 699
},
{
"epoch": 0.3415744957709824,
"grad_norm": 0.176362544298172,
"learning_rate": 0.0001,
"loss": 1.71,
"step": 700
},
{
"epoch": 0.34206245933636953,
"grad_norm": 0.1753791868686676,
"learning_rate": 0.0001,
"loss": 1.7079,
"step": 701
},
{
"epoch": 0.3425504229017567,
"grad_norm": 0.17833958566188812,
"learning_rate": 0.0001,
"loss": 1.6155,
"step": 702
},
{
"epoch": 0.3430383864671438,
"grad_norm": 0.18626241385936737,
"learning_rate": 0.0001,
"loss": 1.8164,
"step": 703
},
{
"epoch": 0.3435263500325309,
"grad_norm": 0.18040528893470764,
"learning_rate": 0.0001,
"loss": 1.7061,
"step": 704
},
{
"epoch": 0.344014313597918,
"grad_norm": 0.18248948454856873,
"learning_rate": 0.0001,
"loss": 1.7002,
"step": 705
},
{
"epoch": 0.3445022771633051,
"grad_norm": 0.18155597150325775,
"learning_rate": 0.0001,
"loss": 1.7623,
"step": 706
},
{
"epoch": 0.3449902407286923,
"grad_norm": 0.18167854845523834,
"learning_rate": 0.0001,
"loss": 1.7209,
"step": 707
},
{
"epoch": 0.3454782042940794,
"grad_norm": 0.18228544294834137,
"learning_rate": 0.0001,
"loss": 1.7166,
"step": 708
},
{
"epoch": 0.3459661678594665,
"grad_norm": 0.1872456818819046,
"learning_rate": 0.0001,
"loss": 1.8073,
"step": 709
},
{
"epoch": 0.3464541314248536,
"grad_norm": 0.17062440514564514,
"learning_rate": 0.0001,
"loss": 1.653,
"step": 710
},
{
"epoch": 0.3469420949902407,
"grad_norm": 0.17459101974964142,
"learning_rate": 0.0001,
"loss": 1.6982,
"step": 711
},
{
"epoch": 0.34743005855562786,
"grad_norm": 0.1724562644958496,
"learning_rate": 0.0001,
"loss": 1.7638,
"step": 712
},
{
"epoch": 0.34791802212101497,
"grad_norm": 0.16791169345378876,
"learning_rate": 0.0001,
"loss": 1.5451,
"step": 713
},
{
"epoch": 0.3484059856864021,
"grad_norm": 0.17250396311283112,
"learning_rate": 0.0001,
"loss": 1.6266,
"step": 714
},
{
"epoch": 0.3488939492517892,
"grad_norm": 0.17893101274967194,
"learning_rate": 0.0001,
"loss": 1.7786,
"step": 715
},
{
"epoch": 0.34938191281717634,
"grad_norm": 0.1739955097436905,
"learning_rate": 0.0001,
"loss": 1.6286,
"step": 716
},
{
"epoch": 0.34986987638256345,
"grad_norm": 0.183289036154747,
"learning_rate": 0.0001,
"loss": 1.7026,
"step": 717
},
{
"epoch": 0.35035783994795056,
"grad_norm": 0.1769326776266098,
"learning_rate": 0.0001,
"loss": 1.7008,
"step": 718
},
{
"epoch": 0.35084580351333766,
"grad_norm": 0.1857866495847702,
"learning_rate": 0.0001,
"loss": 1.6844,
"step": 719
},
{
"epoch": 0.35133376707872477,
"grad_norm": 0.18651182949543,
"learning_rate": 0.0001,
"loss": 1.7033,
"step": 720
},
{
"epoch": 0.35182173064411193,
"grad_norm": 0.18966244161128998,
"learning_rate": 0.0001,
"loss": 1.7673,
"step": 721
},
{
"epoch": 0.35230969420949904,
"grad_norm": 0.1810387372970581,
"learning_rate": 0.0001,
"loss": 1.7161,
"step": 722
},
{
"epoch": 0.35279765777488614,
"grad_norm": 0.17334793508052826,
"learning_rate": 0.0001,
"loss": 1.5957,
"step": 723
},
{
"epoch": 0.35328562134027325,
"grad_norm": 0.18044047057628632,
"learning_rate": 0.0001,
"loss": 1.6443,
"step": 724
},
{
"epoch": 0.35377358490566035,
"grad_norm": 0.18923179805278778,
"learning_rate": 0.0001,
"loss": 1.7244,
"step": 725
},
{
"epoch": 0.3542615484710475,
"grad_norm": 0.18003158271312714,
"learning_rate": 0.0001,
"loss": 1.7655,
"step": 726
},
{
"epoch": 0.3547495120364346,
"grad_norm": 0.18161289393901825,
"learning_rate": 0.0001,
"loss": 1.7199,
"step": 727
},
{
"epoch": 0.35523747560182173,
"grad_norm": 0.19969268143177032,
"learning_rate": 0.0001,
"loss": 1.7138,
"step": 728
},
{
"epoch": 0.35572543916720883,
"grad_norm": 0.1782398670911789,
"learning_rate": 0.0001,
"loss": 1.6231,
"step": 729
},
{
"epoch": 0.35621340273259594,
"grad_norm": 0.20619311928749084,
"learning_rate": 0.0001,
"loss": 1.7745,
"step": 730
},
{
"epoch": 0.3567013662979831,
"grad_norm": 0.1790829598903656,
"learning_rate": 0.0001,
"loss": 1.6251,
"step": 731
},
{
"epoch": 0.3571893298633702,
"grad_norm": 0.17978286743164062,
"learning_rate": 0.0001,
"loss": 1.6495,
"step": 732
},
{
"epoch": 0.3576772934287573,
"grad_norm": 0.20410868525505066,
"learning_rate": 0.0001,
"loss": 1.7264,
"step": 733
},
{
"epoch": 0.3581652569941444,
"grad_norm": 0.18116474151611328,
"learning_rate": 0.0001,
"loss": 1.7379,
"step": 734
},
{
"epoch": 0.3586532205595316,
"grad_norm": 0.20212259888648987,
"learning_rate": 0.0001,
"loss": 1.6964,
"step": 735
},
{
"epoch": 0.3591411841249187,
"grad_norm": 0.17794452607631683,
"learning_rate": 0.0001,
"loss": 1.6666,
"step": 736
},
{
"epoch": 0.3596291476903058,
"grad_norm": 0.17267604172229767,
"learning_rate": 0.0001,
"loss": 1.5783,
"step": 737
},
{
"epoch": 0.3601171112556929,
"grad_norm": 0.21285639703273773,
"learning_rate": 0.0001,
"loss": 1.7575,
"step": 738
},
{
"epoch": 0.36060507482108,
"grad_norm": 0.1822413057088852,
"learning_rate": 0.0001,
"loss": 1.7244,
"step": 739
},
{
"epoch": 0.36109303838646717,
"grad_norm": 0.1909700185060501,
"learning_rate": 0.0001,
"loss": 1.7614,
"step": 740
},
{
"epoch": 0.3615810019518543,
"grad_norm": 0.19396358728408813,
"learning_rate": 0.0001,
"loss": 1.701,
"step": 741
},
{
"epoch": 0.3620689655172414,
"grad_norm": 0.18860898911952972,
"learning_rate": 0.0001,
"loss": 1.7215,
"step": 742
},
{
"epoch": 0.3625569290826285,
"grad_norm": 0.1891864836215973,
"learning_rate": 0.0001,
"loss": 1.7127,
"step": 743
},
{
"epoch": 0.3630448926480156,
"grad_norm": 0.18963932991027832,
"learning_rate": 0.0001,
"loss": 1.6591,
"step": 744
},
{
"epoch": 0.36353285621340276,
"grad_norm": 0.17823189496994019,
"learning_rate": 0.0001,
"loss": 1.7356,
"step": 745
},
{
"epoch": 0.36402081977878986,
"grad_norm": 0.19020548462867737,
"learning_rate": 0.0001,
"loss": 1.7591,
"step": 746
},
{
"epoch": 0.36450878334417697,
"grad_norm": 0.1983988732099533,
"learning_rate": 0.0001,
"loss": 1.6688,
"step": 747
},
{
"epoch": 0.3649967469095641,
"grad_norm": 0.17455948889255524,
"learning_rate": 0.0001,
"loss": 1.6981,
"step": 748
},
{
"epoch": 0.3654847104749512,
"grad_norm": 0.19214113056659698,
"learning_rate": 0.0001,
"loss": 1.6858,
"step": 749
},
{
"epoch": 0.36597267404033834,
"grad_norm": 0.19815075397491455,
"learning_rate": 0.0001,
"loss": 1.7088,
"step": 750
},
{
"epoch": 0.36646063760572545,
"grad_norm": 0.18052172660827637,
"learning_rate": 0.0001,
"loss": 1.7046,
"step": 751
},
{
"epoch": 0.36694860117111255,
"grad_norm": 0.19308723509311676,
"learning_rate": 0.0001,
"loss": 1.7145,
"step": 752
},
{
"epoch": 0.36743656473649966,
"grad_norm": 0.20036271214485168,
"learning_rate": 0.0001,
"loss": 1.6666,
"step": 753
},
{
"epoch": 0.36792452830188677,
"grad_norm": 0.18619637191295624,
"learning_rate": 0.0001,
"loss": 1.7144,
"step": 754
},
{
"epoch": 0.36841249186727393,
"grad_norm": 0.19576376676559448,
"learning_rate": 0.0001,
"loss": 1.7653,
"step": 755
},
{
"epoch": 0.36890045543266103,
"grad_norm": 0.18974775075912476,
"learning_rate": 0.0001,
"loss": 1.7836,
"step": 756
},
{
"epoch": 0.36938841899804814,
"grad_norm": 0.17752085626125336,
"learning_rate": 0.0001,
"loss": 1.6496,
"step": 757
},
{
"epoch": 0.36987638256343525,
"grad_norm": 0.1844092309474945,
"learning_rate": 0.0001,
"loss": 1.6863,
"step": 758
},
{
"epoch": 0.3703643461288224,
"grad_norm": 0.18102730810642242,
"learning_rate": 0.0001,
"loss": 1.5805,
"step": 759
},
{
"epoch": 0.3708523096942095,
"grad_norm": 0.1773853898048401,
"learning_rate": 0.0001,
"loss": 1.7169,
"step": 760
},
{
"epoch": 0.3713402732595966,
"grad_norm": 0.17917506396770477,
"learning_rate": 0.0001,
"loss": 1.705,
"step": 761
},
{
"epoch": 0.3718282368249837,
"grad_norm": 0.1869056671857834,
"learning_rate": 0.0001,
"loss": 1.5653,
"step": 762
},
{
"epoch": 0.37231620039037083,
"grad_norm": 0.1744174063205719,
"learning_rate": 0.0001,
"loss": 1.7014,
"step": 763
},
{
"epoch": 0.372804163955758,
"grad_norm": 0.18072061240673065,
"learning_rate": 0.0001,
"loss": 1.6638,
"step": 764
},
{
"epoch": 0.3732921275211451,
"grad_norm": 0.17331485450267792,
"learning_rate": 0.0001,
"loss": 1.6642,
"step": 765
},
{
"epoch": 0.3737800910865322,
"grad_norm": 0.1780969500541687,
"learning_rate": 0.0001,
"loss": 1.6563,
"step": 766
},
{
"epoch": 0.3742680546519193,
"grad_norm": 0.1959829479455948,
"learning_rate": 0.0001,
"loss": 1.8421,
"step": 767
},
{
"epoch": 0.3747560182173064,
"grad_norm": 0.18532420694828033,
"learning_rate": 0.0001,
"loss": 1.7752,
"step": 768
},
{
"epoch": 0.3752439817826936,
"grad_norm": 0.1861323118209839,
"learning_rate": 0.0001,
"loss": 1.6672,
"step": 769
},
{
"epoch": 0.3757319453480807,
"grad_norm": 0.17399415373802185,
"learning_rate": 0.0001,
"loss": 1.506,
"step": 770
},
{
"epoch": 0.3762199089134678,
"grad_norm": 0.1861727237701416,
"learning_rate": 0.0001,
"loss": 1.7164,
"step": 771
},
{
"epoch": 0.3767078724788549,
"grad_norm": 0.17571841180324554,
"learning_rate": 0.0001,
"loss": 1.6256,
"step": 772
},
{
"epoch": 0.377195836044242,
"grad_norm": 0.1843421310186386,
"learning_rate": 0.0001,
"loss": 1.7273,
"step": 773
},
{
"epoch": 0.37768379960962917,
"grad_norm": 0.17336313426494598,
"learning_rate": 0.0001,
"loss": 1.628,
"step": 774
},
{
"epoch": 0.3781717631750163,
"grad_norm": 0.173604816198349,
"learning_rate": 0.0001,
"loss": 1.6492,
"step": 775
},
{
"epoch": 0.3786597267404034,
"grad_norm": 0.19042102992534637,
"learning_rate": 0.0001,
"loss": 1.7671,
"step": 776
},
{
"epoch": 0.3791476903057905,
"grad_norm": 0.19237715005874634,
"learning_rate": 0.0001,
"loss": 1.6948,
"step": 777
},
{
"epoch": 0.3796356538711776,
"grad_norm": 0.1934320628643036,
"learning_rate": 0.0001,
"loss": 1.7704,
"step": 778
},
{
"epoch": 0.38012361743656475,
"grad_norm": 0.18237414956092834,
"learning_rate": 0.0001,
"loss": 1.7163,
"step": 779
},
{
"epoch": 0.38061158100195186,
"grad_norm": 0.1750539243221283,
"learning_rate": 0.0001,
"loss": 1.675,
"step": 780
},
{
"epoch": 0.38109954456733897,
"grad_norm": 0.18425478041172028,
"learning_rate": 0.0001,
"loss": 1.803,
"step": 781
},
{
"epoch": 0.38158750813272607,
"grad_norm": 0.17386333644390106,
"learning_rate": 0.0001,
"loss": 1.5968,
"step": 782
},
{
"epoch": 0.38207547169811323,
"grad_norm": 0.1958070695400238,
"learning_rate": 0.0001,
"loss": 1.7117,
"step": 783
},
{
"epoch": 0.38256343526350034,
"grad_norm": 0.18313884735107422,
"learning_rate": 0.0001,
"loss": 1.7634,
"step": 784
},
{
"epoch": 0.38305139882888745,
"grad_norm": 0.1904529333114624,
"learning_rate": 0.0001,
"loss": 1.7944,
"step": 785
},
{
"epoch": 0.38353936239427455,
"grad_norm": 0.18762192130088806,
"learning_rate": 0.0001,
"loss": 1.6575,
"step": 786
},
{
"epoch": 0.38402732595966166,
"grad_norm": 0.1828492432832718,
"learning_rate": 0.0001,
"loss": 1.6451,
"step": 787
},
{
"epoch": 0.3845152895250488,
"grad_norm": 0.19027890264987946,
"learning_rate": 0.0001,
"loss": 1.7919,
"step": 788
},
{
"epoch": 0.3850032530904359,
"grad_norm": 0.17186413705348969,
"learning_rate": 0.0001,
"loss": 1.6794,
"step": 789
},
{
"epoch": 0.38549121665582303,
"grad_norm": 0.1878061145544052,
"learning_rate": 0.0001,
"loss": 1.6987,
"step": 790
},
{
"epoch": 0.38597918022121014,
"grad_norm": 0.18121576309204102,
"learning_rate": 0.0001,
"loss": 1.796,
"step": 791
},
{
"epoch": 0.38646714378659724,
"grad_norm": 0.19097453355789185,
"learning_rate": 0.0001,
"loss": 1.7155,
"step": 792
},
{
"epoch": 0.3869551073519844,
"grad_norm": 0.18126630783081055,
"learning_rate": 0.0001,
"loss": 1.7499,
"step": 793
},
{
"epoch": 0.3874430709173715,
"grad_norm": 0.1922173947095871,
"learning_rate": 0.0001,
"loss": 1.7447,
"step": 794
},
{
"epoch": 0.3879310344827586,
"grad_norm": 0.17474421858787537,
"learning_rate": 0.0001,
"loss": 1.6234,
"step": 795
},
{
"epoch": 0.3884189980481457,
"grad_norm": 0.19023337960243225,
"learning_rate": 0.0001,
"loss": 1.7285,
"step": 796
},
{
"epoch": 0.38890696161353283,
"grad_norm": 0.17856378853321075,
"learning_rate": 0.0001,
"loss": 1.598,
"step": 797
},
{
"epoch": 0.38939492517892,
"grad_norm": 0.17470918595790863,
"learning_rate": 0.0001,
"loss": 1.7021,
"step": 798
},
{
"epoch": 0.3898828887443071,
"grad_norm": 0.20127350091934204,
"learning_rate": 0.0001,
"loss": 1.6433,
"step": 799
},
{
"epoch": 0.3903708523096942,
"grad_norm": 0.17676322162151337,
"learning_rate": 0.0001,
"loss": 1.6967,
"step": 800
},
{
"epoch": 0.3908588158750813,
"grad_norm": 0.17519530653953552,
"learning_rate": 0.0001,
"loss": 1.7357,
"step": 801
},
{
"epoch": 0.3913467794404684,
"grad_norm": 0.19061584770679474,
"learning_rate": 0.0001,
"loss": 1.7182,
"step": 802
},
{
"epoch": 0.3918347430058556,
"grad_norm": 0.18246081471443176,
"learning_rate": 0.0001,
"loss": 1.7688,
"step": 803
},
{
"epoch": 0.3923227065712427,
"grad_norm": 0.20583999156951904,
"learning_rate": 0.0001,
"loss": 1.8205,
"step": 804
},
{
"epoch": 0.3928106701366298,
"grad_norm": 0.18392029404640198,
"learning_rate": 0.0001,
"loss": 1.7499,
"step": 805
},
{
"epoch": 0.3932986337020169,
"grad_norm": 0.18296070396900177,
"learning_rate": 0.0001,
"loss": 1.7422,
"step": 806
},
{
"epoch": 0.39378659726740406,
"grad_norm": 0.176628977060318,
"learning_rate": 0.0001,
"loss": 1.6818,
"step": 807
},
{
"epoch": 0.39427456083279117,
"grad_norm": 0.17783887684345245,
"learning_rate": 0.0001,
"loss": 1.6935,
"step": 808
},
{
"epoch": 0.39476252439817827,
"grad_norm": 0.18225261569023132,
"learning_rate": 0.0001,
"loss": 1.7117,
"step": 809
},
{
"epoch": 0.3952504879635654,
"grad_norm": 0.18413884937763214,
"learning_rate": 0.0001,
"loss": 1.6266,
"step": 810
},
{
"epoch": 0.3957384515289525,
"grad_norm": 0.18847863376140594,
"learning_rate": 0.0001,
"loss": 1.6942,
"step": 811
},
{
"epoch": 0.39622641509433965,
"grad_norm": 0.177464559674263,
"learning_rate": 0.0001,
"loss": 1.7731,
"step": 812
},
{
"epoch": 0.39671437865972675,
"grad_norm": 0.18517576158046722,
"learning_rate": 0.0001,
"loss": 1.709,
"step": 813
},
{
"epoch": 0.39720234222511386,
"grad_norm": 0.18677739799022675,
"learning_rate": 0.0001,
"loss": 1.709,
"step": 814
},
{
"epoch": 0.39769030579050096,
"grad_norm": 0.1786472350358963,
"learning_rate": 0.0001,
"loss": 1.6966,
"step": 815
},
{
"epoch": 0.39817826935588807,
"grad_norm": 0.18321356177330017,
"learning_rate": 0.0001,
"loss": 1.6611,
"step": 816
},
{
"epoch": 0.39866623292127523,
"grad_norm": 0.19883863627910614,
"learning_rate": 0.0001,
"loss": 1.7824,
"step": 817
},
{
"epoch": 0.39915419648666234,
"grad_norm": 0.18374767899513245,
"learning_rate": 0.0001,
"loss": 1.8102,
"step": 818
},
{
"epoch": 0.39964216005204944,
"grad_norm": 0.1768617182970047,
"learning_rate": 0.0001,
"loss": 1.6278,
"step": 819
},
{
"epoch": 0.40013012361743655,
"grad_norm": 0.17839239537715912,
"learning_rate": 0.0001,
"loss": 1.5887,
"step": 820
},
{
"epoch": 0.40061808718282366,
"grad_norm": 0.18420036137104034,
"learning_rate": 0.0001,
"loss": 1.7334,
"step": 821
},
{
"epoch": 0.4011060507482108,
"grad_norm": 0.18662692606449127,
"learning_rate": 0.0001,
"loss": 1.7035,
"step": 822
},
{
"epoch": 0.4015940143135979,
"grad_norm": 0.1809212863445282,
"learning_rate": 0.0001,
"loss": 1.6425,
"step": 823
},
{
"epoch": 0.40208197787898503,
"grad_norm": 0.18343691527843475,
"learning_rate": 0.0001,
"loss": 1.6915,
"step": 824
},
{
"epoch": 0.40256994144437214,
"grad_norm": 0.19546520709991455,
"learning_rate": 0.0001,
"loss": 1.5398,
"step": 825
},
{
"epoch": 0.4030579050097593,
"grad_norm": 0.18498557806015015,
"learning_rate": 0.0001,
"loss": 1.76,
"step": 826
},
{
"epoch": 0.4035458685751464,
"grad_norm": 0.1787293255329132,
"learning_rate": 0.0001,
"loss": 1.7072,
"step": 827
},
{
"epoch": 0.4040338321405335,
"grad_norm": 0.18626105785369873,
"learning_rate": 0.0001,
"loss": 1.6154,
"step": 828
},
{
"epoch": 0.4045217957059206,
"grad_norm": 0.18181754648685455,
"learning_rate": 0.0001,
"loss": 1.6343,
"step": 829
},
{
"epoch": 0.4050097592713077,
"grad_norm": 0.1738763153553009,
"learning_rate": 0.0001,
"loss": 1.6003,
"step": 830
},
{
"epoch": 0.4054977228366949,
"grad_norm": 0.19205868244171143,
"learning_rate": 0.0001,
"loss": 1.6516,
"step": 831
},
{
"epoch": 0.405985686402082,
"grad_norm": 0.17389516532421112,
"learning_rate": 0.0001,
"loss": 1.6675,
"step": 832
},
{
"epoch": 0.4064736499674691,
"grad_norm": 0.17901460826396942,
"learning_rate": 0.0001,
"loss": 1.7835,
"step": 833
},
{
"epoch": 0.4069616135328562,
"grad_norm": 0.16918572783470154,
"learning_rate": 0.0001,
"loss": 1.5688,
"step": 834
},
{
"epoch": 0.4074495770982433,
"grad_norm": 0.17327755689620972,
"learning_rate": 0.0001,
"loss": 1.612,
"step": 835
},
{
"epoch": 0.40793754066363047,
"grad_norm": 0.17260931432247162,
"learning_rate": 0.0001,
"loss": 1.5631,
"step": 836
},
{
"epoch": 0.4084255042290176,
"grad_norm": 0.18616695702075958,
"learning_rate": 0.0001,
"loss": 1.8026,
"step": 837
},
{
"epoch": 0.4089134677944047,
"grad_norm": 0.1833159476518631,
"learning_rate": 0.0001,
"loss": 1.7407,
"step": 838
},
{
"epoch": 0.4094014313597918,
"grad_norm": 0.17563556134700775,
"learning_rate": 0.0001,
"loss": 1.6497,
"step": 839
},
{
"epoch": 0.4098893949251789,
"grad_norm": 0.1728363335132599,
"learning_rate": 0.0001,
"loss": 1.7369,
"step": 840
},
{
"epoch": 0.41037735849056606,
"grad_norm": 0.16742554306983948,
"learning_rate": 0.0001,
"loss": 1.5323,
"step": 841
},
{
"epoch": 0.41086532205595316,
"grad_norm": 0.18149816989898682,
"learning_rate": 0.0001,
"loss": 1.6658,
"step": 842
},
{
"epoch": 0.41135328562134027,
"grad_norm": 0.1730806678533554,
"learning_rate": 0.0001,
"loss": 1.6736,
"step": 843
},
{
"epoch": 0.4118412491867274,
"grad_norm": 0.19350793957710266,
"learning_rate": 0.0001,
"loss": 1.7305,
"step": 844
},
{
"epoch": 0.4123292127521145,
"grad_norm": 0.17669609189033508,
"learning_rate": 0.0001,
"loss": 1.7208,
"step": 845
},
{
"epoch": 0.41281717631750164,
"grad_norm": 0.18896430730819702,
"learning_rate": 0.0001,
"loss": 1.7677,
"step": 846
},
{
"epoch": 0.41330513988288875,
"grad_norm": 0.18296490609645844,
"learning_rate": 0.0001,
"loss": 1.7551,
"step": 847
},
{
"epoch": 0.41379310344827586,
"grad_norm": 0.18311992287635803,
"learning_rate": 0.0001,
"loss": 1.6724,
"step": 848
},
{
"epoch": 0.41428106701366296,
"grad_norm": 0.1732887476682663,
"learning_rate": 0.0001,
"loss": 1.6779,
"step": 849
},
{
"epoch": 0.4147690305790501,
"grad_norm": 0.18442484736442566,
"learning_rate": 0.0001,
"loss": 1.6707,
"step": 850
},
{
"epoch": 0.41525699414443723,
"grad_norm": 0.18358947336673737,
"learning_rate": 0.0001,
"loss": 1.7059,
"step": 851
},
{
"epoch": 0.41574495770982434,
"grad_norm": 0.17849397659301758,
"learning_rate": 0.0001,
"loss": 1.6633,
"step": 852
},
{
"epoch": 0.41623292127521144,
"grad_norm": 0.17558790743350983,
"learning_rate": 0.0001,
"loss": 1.7351,
"step": 853
},
{
"epoch": 0.41672088484059855,
"grad_norm": 0.18554963171482086,
"learning_rate": 0.0001,
"loss": 1.722,
"step": 854
},
{
"epoch": 0.4172088484059857,
"grad_norm": 0.17529337108135223,
"learning_rate": 0.0001,
"loss": 1.7565,
"step": 855
},
{
"epoch": 0.4176968119713728,
"grad_norm": 0.1806408166885376,
"learning_rate": 0.0001,
"loss": 1.6164,
"step": 856
},
{
"epoch": 0.4181847755367599,
"grad_norm": 0.17640672624111176,
"learning_rate": 0.0001,
"loss": 1.6622,
"step": 857
},
{
"epoch": 0.41867273910214703,
"grad_norm": 0.18511973321437836,
"learning_rate": 0.0001,
"loss": 1.7708,
"step": 858
},
{
"epoch": 0.41916070266753414,
"grad_norm": 0.17402327060699463,
"learning_rate": 0.0001,
"loss": 1.5703,
"step": 859
},
{
"epoch": 0.4196486662329213,
"grad_norm": 0.1716722548007965,
"learning_rate": 0.0001,
"loss": 1.6326,
"step": 860
},
{
"epoch": 0.4201366297983084,
"grad_norm": 0.18517763912677765,
"learning_rate": 0.0001,
"loss": 1.638,
"step": 861
},
{
"epoch": 0.4206245933636955,
"grad_norm": 0.18149396777153015,
"learning_rate": 0.0001,
"loss": 1.772,
"step": 862
},
{
"epoch": 0.4211125569290826,
"grad_norm": 0.1842370480298996,
"learning_rate": 0.0001,
"loss": 1.7326,
"step": 863
},
{
"epoch": 0.4216005204944697,
"grad_norm": 0.1832754909992218,
"learning_rate": 0.0001,
"loss": 1.571,
"step": 864
},
{
"epoch": 0.4220884840598569,
"grad_norm": 0.18610063195228577,
"learning_rate": 0.0001,
"loss": 1.6853,
"step": 865
},
{
"epoch": 0.422576447625244,
"grad_norm": 0.18227741122245789,
"learning_rate": 0.0001,
"loss": 1.7299,
"step": 866
},
{
"epoch": 0.4230644111906311,
"grad_norm": 0.1710875779390335,
"learning_rate": 0.0001,
"loss": 1.6311,
"step": 867
},
{
"epoch": 0.4235523747560182,
"grad_norm": 0.1772422045469284,
"learning_rate": 0.0001,
"loss": 1.6997,
"step": 868
},
{
"epoch": 0.4240403383214053,
"grad_norm": 0.18706001341342926,
"learning_rate": 0.0001,
"loss": 1.7453,
"step": 869
},
{
"epoch": 0.42452830188679247,
"grad_norm": 0.18400168418884277,
"learning_rate": 0.0001,
"loss": 1.7748,
"step": 870
},
{
"epoch": 0.4250162654521796,
"grad_norm": 0.1813107579946518,
"learning_rate": 0.0001,
"loss": 1.6386,
"step": 871
},
{
"epoch": 0.4255042290175667,
"grad_norm": 0.18432138860225677,
"learning_rate": 0.0001,
"loss": 1.6548,
"step": 872
},
{
"epoch": 0.4259921925829538,
"grad_norm": 0.1701667755842209,
"learning_rate": 0.0001,
"loss": 1.7228,
"step": 873
},
{
"epoch": 0.42648015614834095,
"grad_norm": 0.17490911483764648,
"learning_rate": 0.0001,
"loss": 1.6574,
"step": 874
},
{
"epoch": 0.42696811971372806,
"grad_norm": 0.1863052397966385,
"learning_rate": 0.0001,
"loss": 1.6902,
"step": 875
},
{
"epoch": 0.42745608327911516,
"grad_norm": 0.17869678139686584,
"learning_rate": 0.0001,
"loss": 1.7961,
"step": 876
},
{
"epoch": 0.42794404684450227,
"grad_norm": 0.17393270134925842,
"learning_rate": 0.0001,
"loss": 1.6968,
"step": 877
},
{
"epoch": 0.4284320104098894,
"grad_norm": 0.1801164150238037,
"learning_rate": 0.0001,
"loss": 1.8419,
"step": 878
},
{
"epoch": 0.42891997397527654,
"grad_norm": 0.17271965742111206,
"learning_rate": 0.0001,
"loss": 1.6948,
"step": 879
},
{
"epoch": 0.42940793754066364,
"grad_norm": 0.18875744938850403,
"learning_rate": 0.0001,
"loss": 1.7529,
"step": 880
},
{
"epoch": 0.42989590110605075,
"grad_norm": 0.18350331485271454,
"learning_rate": 0.0001,
"loss": 1.7162,
"step": 881
},
{
"epoch": 0.43038386467143785,
"grad_norm": 0.18316605687141418,
"learning_rate": 0.0001,
"loss": 1.7071,
"step": 882
},
{
"epoch": 0.43087182823682496,
"grad_norm": 0.17159631848335266,
"learning_rate": 0.0001,
"loss": 1.5494,
"step": 883
},
{
"epoch": 0.4313597918022121,
"grad_norm": 0.1835523098707199,
"learning_rate": 0.0001,
"loss": 1.7773,
"step": 884
},
{
"epoch": 0.43184775536759923,
"grad_norm": 0.18305568397045135,
"learning_rate": 0.0001,
"loss": 1.6616,
"step": 885
},
{
"epoch": 0.43233571893298633,
"grad_norm": 0.18325333297252655,
"learning_rate": 0.0001,
"loss": 1.71,
"step": 886
},
{
"epoch": 0.43282368249837344,
"grad_norm": 0.16807565093040466,
"learning_rate": 0.0001,
"loss": 1.5946,
"step": 887
},
{
"epoch": 0.43331164606376055,
"grad_norm": 0.17560525238513947,
"learning_rate": 0.0001,
"loss": 1.573,
"step": 888
},
{
"epoch": 0.4337996096291477,
"grad_norm": 0.1823277622461319,
"learning_rate": 0.0001,
"loss": 1.616,
"step": 889
},
{
"epoch": 0.4342875731945348,
"grad_norm": 0.17946025729179382,
"learning_rate": 0.0001,
"loss": 1.5907,
"step": 890
},
{
"epoch": 0.4347755367599219,
"grad_norm": 0.18940189480781555,
"learning_rate": 0.0001,
"loss": 1.6697,
"step": 891
},
{
"epoch": 0.435263500325309,
"grad_norm": 0.17899388074874878,
"learning_rate": 0.0001,
"loss": 1.6849,
"step": 892
},
{
"epoch": 0.4357514638906962,
"grad_norm": 0.1885358840227127,
"learning_rate": 0.0001,
"loss": 1.6212,
"step": 893
},
{
"epoch": 0.4362394274560833,
"grad_norm": 0.1721390187740326,
"learning_rate": 0.0001,
"loss": 1.6514,
"step": 894
},
{
"epoch": 0.4367273910214704,
"grad_norm": 0.19019658863544464,
"learning_rate": 0.0001,
"loss": 1.7234,
"step": 895
},
{
"epoch": 0.4372153545868575,
"grad_norm": 0.17101971805095673,
"learning_rate": 0.0001,
"loss": 1.6003,
"step": 896
},
{
"epoch": 0.4377033181522446,
"grad_norm": 0.192877396941185,
"learning_rate": 0.0001,
"loss": 1.8151,
"step": 897
},
{
"epoch": 0.4381912817176318,
"grad_norm": 0.17775356769561768,
"learning_rate": 0.0001,
"loss": 1.5926,
"step": 898
},
{
"epoch": 0.4386792452830189,
"grad_norm": 0.19545124471187592,
"learning_rate": 0.0001,
"loss": 1.7123,
"step": 899
},
{
"epoch": 0.439167208848406,
"grad_norm": 0.17418169975280762,
"learning_rate": 0.0001,
"loss": 1.6774,
"step": 900
},
{
"epoch": 0.4396551724137931,
"grad_norm": 0.19206389784812927,
"learning_rate": 0.0001,
"loss": 1.7278,
"step": 901
},
{
"epoch": 0.4401431359791802,
"grad_norm": 0.18674510717391968,
"learning_rate": 0.0001,
"loss": 1.6049,
"step": 902
},
{
"epoch": 0.44063109954456736,
"grad_norm": 0.18307790160179138,
"learning_rate": 0.0001,
"loss": 1.6985,
"step": 903
},
{
"epoch": 0.44111906310995447,
"grad_norm": 0.1894843429327011,
"learning_rate": 0.0001,
"loss": 1.676,
"step": 904
},
{
"epoch": 0.4416070266753416,
"grad_norm": 0.17619220912456512,
"learning_rate": 0.0001,
"loss": 1.6807,
"step": 905
},
{
"epoch": 0.4420949902407287,
"grad_norm": 0.1805913895368576,
"learning_rate": 0.0001,
"loss": 1.6704,
"step": 906
},
{
"epoch": 0.4425829538061158,
"grad_norm": 0.17293816804885864,
"learning_rate": 0.0001,
"loss": 1.597,
"step": 907
},
{
"epoch": 0.44307091737150295,
"grad_norm": 0.17609193921089172,
"learning_rate": 0.0001,
"loss": 1.6562,
"step": 908
},
{
"epoch": 0.44355888093689005,
"grad_norm": 0.17432111501693726,
"learning_rate": 0.0001,
"loss": 1.594,
"step": 909
},
{
"epoch": 0.44404684450227716,
"grad_norm": 0.17889589071273804,
"learning_rate": 0.0001,
"loss": 1.8029,
"step": 910
},
{
"epoch": 0.44453480806766427,
"grad_norm": 0.17299845814704895,
"learning_rate": 0.0001,
"loss": 1.6116,
"step": 911
},
{
"epoch": 0.4450227716330514,
"grad_norm": 0.17839674651622772,
"learning_rate": 0.0001,
"loss": 1.7055,
"step": 912
},
{
"epoch": 0.44551073519843853,
"grad_norm": 0.1751437783241272,
"learning_rate": 0.0001,
"loss": 1.6218,
"step": 913
},
{
"epoch": 0.44599869876382564,
"grad_norm": 0.1901925653219223,
"learning_rate": 0.0001,
"loss": 1.6578,
"step": 914
},
{
"epoch": 0.44648666232921275,
"grad_norm": 0.17236626148223877,
"learning_rate": 0.0001,
"loss": 1.6951,
"step": 915
},
{
"epoch": 0.44697462589459985,
"grad_norm": 0.17387427389621735,
"learning_rate": 0.0001,
"loss": 1.5922,
"step": 916
},
{
"epoch": 0.447462589459987,
"grad_norm": 0.1684548258781433,
"learning_rate": 0.0001,
"loss": 1.5566,
"step": 917
},
{
"epoch": 0.4479505530253741,
"grad_norm": 0.18070632219314575,
"learning_rate": 0.0001,
"loss": 1.6904,
"step": 918
},
{
"epoch": 0.4484385165907612,
"grad_norm": 0.1905713975429535,
"learning_rate": 0.0001,
"loss": 1.8206,
"step": 919
},
{
"epoch": 0.44892648015614833,
"grad_norm": 0.1828422248363495,
"learning_rate": 0.0001,
"loss": 1.7974,
"step": 920
},
{
"epoch": 0.44941444372153544,
"grad_norm": 0.17595981061458588,
"learning_rate": 0.0001,
"loss": 1.7308,
"step": 921
},
{
"epoch": 0.4499024072869226,
"grad_norm": 0.18210361897945404,
"learning_rate": 0.0001,
"loss": 1.6915,
"step": 922
},
{
"epoch": 0.4503903708523097,
"grad_norm": 0.18826089799404144,
"learning_rate": 0.0001,
"loss": 1.7588,
"step": 923
},
{
"epoch": 0.4508783344176968,
"grad_norm": 0.17665328085422516,
"learning_rate": 0.0001,
"loss": 1.6797,
"step": 924
},
{
"epoch": 0.4513662979830839,
"grad_norm": 0.17838731408119202,
"learning_rate": 0.0001,
"loss": 1.6644,
"step": 925
},
{
"epoch": 0.451854261548471,
"grad_norm": 0.18045654892921448,
"learning_rate": 0.0001,
"loss": 1.689,
"step": 926
},
{
"epoch": 0.4523422251138582,
"grad_norm": 0.18226969242095947,
"learning_rate": 0.0001,
"loss": 1.8157,
"step": 927
},
{
"epoch": 0.4528301886792453,
"grad_norm": 0.17917855083942413,
"learning_rate": 0.0001,
"loss": 1.7772,
"step": 928
},
{
"epoch": 0.4533181522446324,
"grad_norm": 0.1778966784477234,
"learning_rate": 0.0001,
"loss": 1.6912,
"step": 929
},
{
"epoch": 0.4538061158100195,
"grad_norm": 0.18105091154575348,
"learning_rate": 0.0001,
"loss": 1.7072,
"step": 930
},
{
"epoch": 0.4542940793754066,
"grad_norm": 0.17502936720848083,
"learning_rate": 0.0001,
"loss": 1.6462,
"step": 931
},
{
"epoch": 0.4547820429407938,
"grad_norm": 0.1830134093761444,
"learning_rate": 0.0001,
"loss": 1.6876,
"step": 932
},
{
"epoch": 0.4552700065061809,
"grad_norm": 0.18607327342033386,
"learning_rate": 0.0001,
"loss": 1.7082,
"step": 933
},
{
"epoch": 0.455757970071568,
"grad_norm": 0.18888945877552032,
"learning_rate": 0.0001,
"loss": 1.7509,
"step": 934
},
{
"epoch": 0.4562459336369551,
"grad_norm": 0.1867811232805252,
"learning_rate": 0.0001,
"loss": 1.7233,
"step": 935
},
{
"epoch": 0.4567338972023422,
"grad_norm": 0.1898915022611618,
"learning_rate": 0.0001,
"loss": 1.6237,
"step": 936
},
{
"epoch": 0.45722186076772936,
"grad_norm": 0.1797095388174057,
"learning_rate": 0.0001,
"loss": 1.7404,
"step": 937
},
{
"epoch": 0.45770982433311647,
"grad_norm": 0.17534306645393372,
"learning_rate": 0.0001,
"loss": 1.6726,
"step": 938
},
{
"epoch": 0.4581977878985036,
"grad_norm": 0.19073282182216644,
"learning_rate": 0.0001,
"loss": 1.8081,
"step": 939
},
{
"epoch": 0.4586857514638907,
"grad_norm": 0.1878473460674286,
"learning_rate": 0.0001,
"loss": 1.6855,
"step": 940
},
{
"epoch": 0.45917371502927784,
"grad_norm": 0.18376657366752625,
"learning_rate": 0.0001,
"loss": 1.6833,
"step": 941
},
{
"epoch": 0.45966167859466495,
"grad_norm": 0.18948735296726227,
"learning_rate": 0.0001,
"loss": 1.7525,
"step": 942
},
{
"epoch": 0.46014964216005205,
"grad_norm": 0.18738175928592682,
"learning_rate": 0.0001,
"loss": 1.752,
"step": 943
},
{
"epoch": 0.46063760572543916,
"grad_norm": 0.1765458881855011,
"learning_rate": 0.0001,
"loss": 1.696,
"step": 944
},
{
"epoch": 0.46112556929082626,
"grad_norm": 0.18650664389133453,
"learning_rate": 0.0001,
"loss": 1.7409,
"step": 945
},
{
"epoch": 0.4616135328562134,
"grad_norm": 0.1759469360113144,
"learning_rate": 0.0001,
"loss": 1.6119,
"step": 946
},
{
"epoch": 0.46210149642160053,
"grad_norm": 0.18343883752822876,
"learning_rate": 0.0001,
"loss": 1.7091,
"step": 947
},
{
"epoch": 0.46258945998698764,
"grad_norm": 0.1964959353208542,
"learning_rate": 0.0001,
"loss": 1.7388,
"step": 948
},
{
"epoch": 0.46307742355237475,
"grad_norm": 0.18265226483345032,
"learning_rate": 0.0001,
"loss": 1.7036,
"step": 949
},
{
"epoch": 0.46356538711776185,
"grad_norm": 0.18132254481315613,
"learning_rate": 0.0001,
"loss": 1.688,
"step": 950
},
{
"epoch": 0.464053350683149,
"grad_norm": 0.18742497265338898,
"learning_rate": 0.0001,
"loss": 1.6212,
"step": 951
},
{
"epoch": 0.4645413142485361,
"grad_norm": 0.1776818335056305,
"learning_rate": 0.0001,
"loss": 1.5739,
"step": 952
},
{
"epoch": 0.4650292778139232,
"grad_norm": 0.193990558385849,
"learning_rate": 0.0001,
"loss": 1.6852,
"step": 953
},
{
"epoch": 0.46551724137931033,
"grad_norm": 0.1853352040052414,
"learning_rate": 0.0001,
"loss": 1.6057,
"step": 954
},
{
"epoch": 0.46600520494469744,
"grad_norm": 0.2000368982553482,
"learning_rate": 0.0001,
"loss": 1.7329,
"step": 955
},
{
"epoch": 0.4664931685100846,
"grad_norm": 0.20909981429576874,
"learning_rate": 0.0001,
"loss": 1.687,
"step": 956
},
{
"epoch": 0.4669811320754717,
"grad_norm": 0.21065653860569,
"learning_rate": 0.0001,
"loss": 1.7239,
"step": 957
},
{
"epoch": 0.4674690956408588,
"grad_norm": 0.1819789707660675,
"learning_rate": 0.0001,
"loss": 1.7258,
"step": 958
},
{
"epoch": 0.4679570592062459,
"grad_norm": 0.20444951951503754,
"learning_rate": 0.0001,
"loss": 1.679,
"step": 959
},
{
"epoch": 0.468445022771633,
"grad_norm": 0.19722609221935272,
"learning_rate": 0.0001,
"loss": 1.6114,
"step": 960
},
{
"epoch": 0.4689329863370202,
"grad_norm": 0.18290160596370697,
"learning_rate": 0.0001,
"loss": 1.7676,
"step": 961
},
{
"epoch": 0.4694209499024073,
"grad_norm": 0.20910906791687012,
"learning_rate": 0.0001,
"loss": 1.6688,
"step": 962
},
{
"epoch": 0.4699089134677944,
"grad_norm": 0.2053229659795761,
"learning_rate": 0.0001,
"loss": 1.7208,
"step": 963
},
{
"epoch": 0.4703968770331815,
"grad_norm": 0.18317236006259918,
"learning_rate": 0.0001,
"loss": 1.6808,
"step": 964
},
{
"epoch": 0.47088484059856867,
"grad_norm": 0.20331262052059174,
"learning_rate": 0.0001,
"loss": 1.7621,
"step": 965
},
{
"epoch": 0.47137280416395577,
"grad_norm": 0.194210484623909,
"learning_rate": 0.0001,
"loss": 1.7045,
"step": 966
},
{
"epoch": 0.4718607677293429,
"grad_norm": 0.18274177610874176,
"learning_rate": 0.0001,
"loss": 1.7462,
"step": 967
},
{
"epoch": 0.47234873129473,
"grad_norm": 0.211595356464386,
"learning_rate": 0.0001,
"loss": 1.7322,
"step": 968
},
{
"epoch": 0.4728366948601171,
"grad_norm": 0.1885220855474472,
"learning_rate": 0.0001,
"loss": 1.6825,
"step": 969
},
{
"epoch": 0.47332465842550425,
"grad_norm": 0.17875580489635468,
"learning_rate": 0.0001,
"loss": 1.6192,
"step": 970
},
{
"epoch": 0.47381262199089136,
"grad_norm": 0.1805390864610672,
"learning_rate": 0.0001,
"loss": 1.6668,
"step": 971
},
{
"epoch": 0.47430058555627846,
"grad_norm": 0.19222760200500488,
"learning_rate": 0.0001,
"loss": 1.7478,
"step": 972
},
{
"epoch": 0.47478854912166557,
"grad_norm": 0.18637999892234802,
"learning_rate": 0.0001,
"loss": 1.7773,
"step": 973
},
{
"epoch": 0.4752765126870527,
"grad_norm": 0.18341195583343506,
"learning_rate": 0.0001,
"loss": 1.7021,
"step": 974
},
{
"epoch": 0.47576447625243984,
"grad_norm": 0.17885076999664307,
"learning_rate": 0.0001,
"loss": 1.6424,
"step": 975
},
{
"epoch": 0.47625243981782694,
"grad_norm": 0.1952183097600937,
"learning_rate": 0.0001,
"loss": 1.9142,
"step": 976
},
{
"epoch": 0.47674040338321405,
"grad_norm": 0.18243496119976044,
"learning_rate": 0.0001,
"loss": 1.6983,
"step": 977
},
{
"epoch": 0.47722836694860116,
"grad_norm": 0.18224705755710602,
"learning_rate": 0.0001,
"loss": 1.5847,
"step": 978
},
{
"epoch": 0.47771633051398826,
"grad_norm": 0.25170522928237915,
"learning_rate": 0.0001,
"loss": 1.9113,
"step": 979
},
{
"epoch": 0.4782042940793754,
"grad_norm": 0.18615500628948212,
"learning_rate": 0.0001,
"loss": 1.7893,
"step": 980
},
{
"epoch": 0.47869225764476253,
"grad_norm": 0.18177960813045502,
"learning_rate": 0.0001,
"loss": 1.753,
"step": 981
},
{
"epoch": 0.47918022121014964,
"grad_norm": 0.17566373944282532,
"learning_rate": 0.0001,
"loss": 1.749,
"step": 982
},
{
"epoch": 0.47966818477553674,
"grad_norm": 0.18363641202449799,
"learning_rate": 0.0001,
"loss": 1.7202,
"step": 983
},
{
"epoch": 0.4801561483409239,
"grad_norm": 0.18019676208496094,
"learning_rate": 0.0001,
"loss": 1.7756,
"step": 984
},
{
"epoch": 0.480644111906311,
"grad_norm": 0.18838275969028473,
"learning_rate": 0.0001,
"loss": 1.6533,
"step": 985
},
{
"epoch": 0.4811320754716981,
"grad_norm": 0.17840002477169037,
"learning_rate": 0.0001,
"loss": 1.6495,
"step": 986
},
{
"epoch": 0.4816200390370852,
"grad_norm": 0.18629398941993713,
"learning_rate": 0.0001,
"loss": 1.746,
"step": 987
},
{
"epoch": 0.48210800260247233,
"grad_norm": 0.19068728387355804,
"learning_rate": 0.0001,
"loss": 1.7956,
"step": 988
},
{
"epoch": 0.4825959661678595,
"grad_norm": 0.17752403020858765,
"learning_rate": 0.0001,
"loss": 1.6085,
"step": 989
},
{
"epoch": 0.4830839297332466,
"grad_norm": 0.17869940400123596,
"learning_rate": 0.0001,
"loss": 1.687,
"step": 990
},
{
"epoch": 0.4835718932986337,
"grad_norm": 0.19462576508522034,
"learning_rate": 0.0001,
"loss": 1.766,
"step": 991
},
{
"epoch": 0.4840598568640208,
"grad_norm": 0.17635509371757507,
"learning_rate": 0.0001,
"loss": 1.6512,
"step": 992
},
{
"epoch": 0.4845478204294079,
"grad_norm": 0.18457075953483582,
"learning_rate": 0.0001,
"loss": 1.6829,
"step": 993
},
{
"epoch": 0.4850357839947951,
"grad_norm": 0.19008415937423706,
"learning_rate": 0.0001,
"loss": 1.8335,
"step": 994
},
{
"epoch": 0.4855237475601822,
"grad_norm": 0.1748104840517044,
"learning_rate": 0.0001,
"loss": 1.6822,
"step": 995
},
{
"epoch": 0.4860117111255693,
"grad_norm": 0.18871375918388367,
"learning_rate": 0.0001,
"loss": 1.7749,
"step": 996
},
{
"epoch": 0.4864996746909564,
"grad_norm": 0.19204716384410858,
"learning_rate": 0.0001,
"loss": 1.7027,
"step": 997
},
{
"epoch": 0.4869876382563435,
"grad_norm": 0.17363031208515167,
"learning_rate": 0.0001,
"loss": 1.6329,
"step": 998
},
{
"epoch": 0.48747560182173066,
"grad_norm": 0.18046556413173676,
"learning_rate": 0.0001,
"loss": 1.6251,
"step": 999
},
{
"epoch": 0.48796356538711777,
"grad_norm": 0.18280474841594696,
"learning_rate": 0.0001,
"loss": 1.7468,
"step": 1000
},
{
"epoch": 0.4884515289525049,
"grad_norm": 0.1856307089328766,
"learning_rate": 0.0001,
"loss": 1.8059,
"step": 1001
},
{
"epoch": 0.488939492517892,
"grad_norm": 0.18734587728977203,
"learning_rate": 0.0001,
"loss": 1.7482,
"step": 1002
},
{
"epoch": 0.4894274560832791,
"grad_norm": 0.18201518058776855,
"learning_rate": 0.0001,
"loss": 1.6618,
"step": 1003
},
{
"epoch": 0.48991541964866625,
"grad_norm": 0.18317224085330963,
"learning_rate": 0.0001,
"loss": 1.6556,
"step": 1004
},
{
"epoch": 0.49040338321405336,
"grad_norm": 0.18233336508274078,
"learning_rate": 0.0001,
"loss": 1.7073,
"step": 1005
},
{
"epoch": 0.49089134677944046,
"grad_norm": 0.19454477727413177,
"learning_rate": 0.0001,
"loss": 1.5993,
"step": 1006
},
{
"epoch": 0.49137931034482757,
"grad_norm": 0.1874353140592575,
"learning_rate": 0.0001,
"loss": 1.6976,
"step": 1007
},
{
"epoch": 0.49186727391021473,
"grad_norm": 0.18378609418869019,
"learning_rate": 0.0001,
"loss": 1.7292,
"step": 1008
},
{
"epoch": 0.49235523747560184,
"grad_norm": 0.18301472067832947,
"learning_rate": 0.0001,
"loss": 1.6702,
"step": 1009
},
{
"epoch": 0.49284320104098894,
"grad_norm": 0.18581345677375793,
"learning_rate": 0.0001,
"loss": 1.769,
"step": 1010
},
{
"epoch": 0.49333116460637605,
"grad_norm": 0.18604816496372223,
"learning_rate": 0.0001,
"loss": 1.7022,
"step": 1011
},
{
"epoch": 0.49381912817176316,
"grad_norm": 0.1670636236667633,
"learning_rate": 0.0001,
"loss": 1.6245,
"step": 1012
},
{
"epoch": 0.4943070917371503,
"grad_norm": 0.18545298278331757,
"learning_rate": 0.0001,
"loss": 1.777,
"step": 1013
},
{
"epoch": 0.4947950553025374,
"grad_norm": 0.18108947575092316,
"learning_rate": 0.0001,
"loss": 1.7066,
"step": 1014
},
{
"epoch": 0.49528301886792453,
"grad_norm": 0.18042118847370148,
"learning_rate": 0.0001,
"loss": 1.6393,
"step": 1015
},
{
"epoch": 0.49577098243331164,
"grad_norm": 0.19193610548973083,
"learning_rate": 0.0001,
"loss": 1.8438,
"step": 1016
},
{
"epoch": 0.49625894599869874,
"grad_norm": 0.18542861938476562,
"learning_rate": 0.0001,
"loss": 1.8076,
"step": 1017
},
{
"epoch": 0.4967469095640859,
"grad_norm": 0.17646706104278564,
"learning_rate": 0.0001,
"loss": 1.4699,
"step": 1018
},
{
"epoch": 0.497234873129473,
"grad_norm": 0.18862095475196838,
"learning_rate": 0.0001,
"loss": 1.7165,
"step": 1019
},
{
"epoch": 0.4977228366948601,
"grad_norm": 0.18618489801883698,
"learning_rate": 0.0001,
"loss": 1.7683,
"step": 1020
},
{
"epoch": 0.4982108002602472,
"grad_norm": 0.18750105798244476,
"learning_rate": 0.0001,
"loss": 1.6681,
"step": 1021
},
{
"epoch": 0.49869876382563433,
"grad_norm": 0.1942930370569229,
"learning_rate": 0.0001,
"loss": 1.6555,
"step": 1022
},
{
"epoch": 0.4991867273910215,
"grad_norm": 0.18165245652198792,
"learning_rate": 0.0001,
"loss": 1.7059,
"step": 1023
},
{
"epoch": 0.4996746909564086,
"grad_norm": 0.18349111080169678,
"learning_rate": 0.0001,
"loss": 1.7165,
"step": 1024
},
{
"epoch": 0.5001626545217958,
"grad_norm": 0.17459173500537872,
"learning_rate": 0.0001,
"loss": 1.6784,
"step": 1025
},
{
"epoch": 0.5006506180871828,
"grad_norm": 0.19236469268798828,
"learning_rate": 0.0001,
"loss": 1.6727,
"step": 1026
},
{
"epoch": 0.50113858165257,
"grad_norm": 0.18120145797729492,
"learning_rate": 0.0001,
"loss": 1.7109,
"step": 1027
},
{
"epoch": 0.501626545217957,
"grad_norm": 0.18319325149059296,
"learning_rate": 0.0001,
"loss": 1.6353,
"step": 1028
},
{
"epoch": 0.5021145087833442,
"grad_norm": 0.1807912439107895,
"learning_rate": 0.0001,
"loss": 1.6866,
"step": 1029
},
{
"epoch": 0.5026024723487313,
"grad_norm": 0.1748090237379074,
"learning_rate": 0.0001,
"loss": 1.6196,
"step": 1030
},
{
"epoch": 0.5030904359141184,
"grad_norm": 0.1822468489408493,
"learning_rate": 0.0001,
"loss": 1.7539,
"step": 1031
},
{
"epoch": 0.5035783994795056,
"grad_norm": 0.18360479176044464,
"learning_rate": 0.0001,
"loss": 1.6853,
"step": 1032
},
{
"epoch": 0.5040663630448926,
"grad_norm": 0.18836341798305511,
"learning_rate": 0.0001,
"loss": 1.6796,
"step": 1033
},
{
"epoch": 0.5045543266102798,
"grad_norm": 0.18044047057628632,
"learning_rate": 0.0001,
"loss": 1.6929,
"step": 1034
},
{
"epoch": 0.5050422901756669,
"grad_norm": 0.18836145102977753,
"learning_rate": 0.0001,
"loss": 1.8204,
"step": 1035
},
{
"epoch": 0.505530253741054,
"grad_norm": 0.1829444319009781,
"learning_rate": 0.0001,
"loss": 1.7364,
"step": 1036
},
{
"epoch": 0.5060182173064411,
"grad_norm": 0.1847165822982788,
"learning_rate": 0.0001,
"loss": 1.6792,
"step": 1037
},
{
"epoch": 0.5065061808718282,
"grad_norm": 0.17972713708877563,
"learning_rate": 0.0001,
"loss": 1.5694,
"step": 1038
},
{
"epoch": 0.5069941444372154,
"grad_norm": 0.1910099983215332,
"learning_rate": 0.0001,
"loss": 1.6189,
"step": 1039
},
{
"epoch": 0.5074821080026025,
"grad_norm": 0.18901146948337555,
"learning_rate": 0.0001,
"loss": 1.7515,
"step": 1040
},
{
"epoch": 0.5079700715679896,
"grad_norm": 0.18210864067077637,
"learning_rate": 0.0001,
"loss": 1.729,
"step": 1041
},
{
"epoch": 0.5084580351333767,
"grad_norm": 0.18417298793792725,
"learning_rate": 0.0001,
"loss": 1.7392,
"step": 1042
},
{
"epoch": 0.5089459986987638,
"grad_norm": 0.18548882007598877,
"learning_rate": 0.0001,
"loss": 1.7452,
"step": 1043
},
{
"epoch": 0.5094339622641509,
"grad_norm": 0.17644409835338593,
"learning_rate": 0.0001,
"loss": 1.5658,
"step": 1044
},
{
"epoch": 0.5099219258295381,
"grad_norm": 0.18809697031974792,
"learning_rate": 0.0001,
"loss": 1.6806,
"step": 1045
},
{
"epoch": 0.5104098893949252,
"grad_norm": 0.18309113383293152,
"learning_rate": 0.0001,
"loss": 1.7068,
"step": 1046
},
{
"epoch": 0.5108978529603123,
"grad_norm": 0.1873452365398407,
"learning_rate": 0.0001,
"loss": 1.7401,
"step": 1047
},
{
"epoch": 0.5113858165256994,
"grad_norm": 0.18118296563625336,
"learning_rate": 0.0001,
"loss": 1.6853,
"step": 1048
},
{
"epoch": 0.5118737800910865,
"grad_norm": 0.19551081955432892,
"learning_rate": 0.0001,
"loss": 1.6851,
"step": 1049
},
{
"epoch": 0.5123617436564737,
"grad_norm": 0.19051168859004974,
"learning_rate": 0.0001,
"loss": 1.7153,
"step": 1050
},
{
"epoch": 0.5128497072218607,
"grad_norm": 0.1723107546567917,
"learning_rate": 0.0001,
"loss": 1.6446,
"step": 1051
},
{
"epoch": 0.5133376707872479,
"grad_norm": 0.18448057770729065,
"learning_rate": 0.0001,
"loss": 1.6798,
"step": 1052
},
{
"epoch": 0.513825634352635,
"grad_norm": 0.1888912320137024,
"learning_rate": 0.0001,
"loss": 1.7696,
"step": 1053
},
{
"epoch": 0.5143135979180221,
"grad_norm": 0.19481922686100006,
"learning_rate": 0.0001,
"loss": 1.6657,
"step": 1054
},
{
"epoch": 0.5148015614834093,
"grad_norm": 0.17614057660102844,
"learning_rate": 0.0001,
"loss": 1.6758,
"step": 1055
},
{
"epoch": 0.5152895250487963,
"grad_norm": 0.1752062737941742,
"learning_rate": 0.0001,
"loss": 1.644,
"step": 1056
},
{
"epoch": 0.5157774886141835,
"grad_norm": 0.1882951855659485,
"learning_rate": 0.0001,
"loss": 1.6644,
"step": 1057
},
{
"epoch": 0.5162654521795705,
"grad_norm": 0.20255088806152344,
"learning_rate": 0.0001,
"loss": 1.7119,
"step": 1058
},
{
"epoch": 0.5167534157449577,
"grad_norm": 0.181501105427742,
"learning_rate": 0.0001,
"loss": 1.662,
"step": 1059
},
{
"epoch": 0.5172413793103449,
"grad_norm": 0.1865651160478592,
"learning_rate": 0.0001,
"loss": 1.7279,
"step": 1060
},
{
"epoch": 0.5177293428757319,
"grad_norm": 0.1911836862564087,
"learning_rate": 0.0001,
"loss": 1.6795,
"step": 1061
},
{
"epoch": 0.5182173064411191,
"grad_norm": 0.18534213304519653,
"learning_rate": 0.0001,
"loss": 1.7126,
"step": 1062
},
{
"epoch": 0.5187052700065062,
"grad_norm": 0.1829744428396225,
"learning_rate": 0.0001,
"loss": 1.6598,
"step": 1063
},
{
"epoch": 0.5191932335718933,
"grad_norm": 0.17899416387081146,
"learning_rate": 0.0001,
"loss": 1.6293,
"step": 1064
},
{
"epoch": 0.5196811971372804,
"grad_norm": 0.17233431339263916,
"learning_rate": 0.0001,
"loss": 1.6195,
"step": 1065
},
{
"epoch": 0.5201691607026675,
"grad_norm": 0.1891251802444458,
"learning_rate": 0.0001,
"loss": 1.72,
"step": 1066
},
{
"epoch": 0.5206571242680547,
"grad_norm": 0.19288107752799988,
"learning_rate": 0.0001,
"loss": 1.8331,
"step": 1067
},
{
"epoch": 0.5211450878334418,
"grad_norm": 0.18534426391124725,
"learning_rate": 0.0001,
"loss": 1.6229,
"step": 1068
},
{
"epoch": 0.5216330513988289,
"grad_norm": 0.19013041257858276,
"learning_rate": 0.0001,
"loss": 1.7331,
"step": 1069
},
{
"epoch": 0.522121014964216,
"grad_norm": 0.18765857815742493,
"learning_rate": 0.0001,
"loss": 1.6951,
"step": 1070
},
{
"epoch": 0.5226089785296031,
"grad_norm": 0.17150448262691498,
"learning_rate": 0.0001,
"loss": 1.6581,
"step": 1071
},
{
"epoch": 0.5230969420949902,
"grad_norm": 0.20504555106163025,
"learning_rate": 0.0001,
"loss": 1.7247,
"step": 1072
},
{
"epoch": 0.5235849056603774,
"grad_norm": 0.17816084623336792,
"learning_rate": 0.0001,
"loss": 1.5654,
"step": 1073
},
{
"epoch": 0.5240728692257645,
"grad_norm": 0.1842648684978485,
"learning_rate": 0.0001,
"loss": 1.601,
"step": 1074
},
{
"epoch": 0.5245608327911516,
"grad_norm": 0.18370290100574493,
"learning_rate": 0.0001,
"loss": 1.6369,
"step": 1075
},
{
"epoch": 0.5250487963565387,
"grad_norm": 0.18270552158355713,
"learning_rate": 0.0001,
"loss": 1.6541,
"step": 1076
},
{
"epoch": 0.5255367599219258,
"grad_norm": 0.1808508038520813,
"learning_rate": 0.0001,
"loss": 1.6598,
"step": 1077
},
{
"epoch": 0.526024723487313,
"grad_norm": 0.17794300615787506,
"learning_rate": 0.0001,
"loss": 1.7294,
"step": 1078
},
{
"epoch": 0.5265126870527,
"grad_norm": 0.18382461369037628,
"learning_rate": 0.0001,
"loss": 1.6901,
"step": 1079
},
{
"epoch": 0.5270006506180872,
"grad_norm": 0.1806422621011734,
"learning_rate": 0.0001,
"loss": 1.6193,
"step": 1080
},
{
"epoch": 0.5274886141834743,
"grad_norm": 0.18108539283275604,
"learning_rate": 0.0001,
"loss": 1.6911,
"step": 1081
},
{
"epoch": 0.5279765777488614,
"grad_norm": 0.18681305646896362,
"learning_rate": 0.0001,
"loss": 1.726,
"step": 1082
},
{
"epoch": 0.5284645413142486,
"grad_norm": 0.18909889459609985,
"learning_rate": 0.0001,
"loss": 1.6857,
"step": 1083
},
{
"epoch": 0.5289525048796356,
"grad_norm": 0.18421509861946106,
"learning_rate": 0.0001,
"loss": 1.6564,
"step": 1084
},
{
"epoch": 0.5294404684450228,
"grad_norm": 0.18811306357383728,
"learning_rate": 0.0001,
"loss": 1.7817,
"step": 1085
},
{
"epoch": 0.5299284320104098,
"grad_norm": 0.17478449642658234,
"learning_rate": 0.0001,
"loss": 1.681,
"step": 1086
},
{
"epoch": 0.530416395575797,
"grad_norm": 0.1789132058620453,
"learning_rate": 0.0001,
"loss": 1.6906,
"step": 1087
},
{
"epoch": 0.5309043591411842,
"grad_norm": 0.18358959257602692,
"learning_rate": 0.0001,
"loss": 1.6347,
"step": 1088
},
{
"epoch": 0.5313923227065712,
"grad_norm": 0.18565410375595093,
"learning_rate": 0.0001,
"loss": 1.7078,
"step": 1089
},
{
"epoch": 0.5318802862719584,
"grad_norm": 0.19210746884346008,
"learning_rate": 0.0001,
"loss": 1.6195,
"step": 1090
},
{
"epoch": 0.5323682498373454,
"grad_norm": 0.18205370008945465,
"learning_rate": 0.0001,
"loss": 1.6541,
"step": 1091
},
{
"epoch": 0.5328562134027326,
"grad_norm": 0.19181987643241882,
"learning_rate": 0.0001,
"loss": 1.8033,
"step": 1092
},
{
"epoch": 0.5333441769681198,
"grad_norm": 0.20362940430641174,
"learning_rate": 0.0001,
"loss": 1.7497,
"step": 1093
},
{
"epoch": 0.5338321405335068,
"grad_norm": 0.1858234405517578,
"learning_rate": 0.0001,
"loss": 1.6342,
"step": 1094
},
{
"epoch": 0.534320104098894,
"grad_norm": 0.19925346970558167,
"learning_rate": 0.0001,
"loss": 1.686,
"step": 1095
},
{
"epoch": 0.534808067664281,
"grad_norm": 0.19114282727241516,
"learning_rate": 0.0001,
"loss": 1.7186,
"step": 1096
},
{
"epoch": 0.5352960312296682,
"grad_norm": 0.1771971732378006,
"learning_rate": 0.0001,
"loss": 1.776,
"step": 1097
},
{
"epoch": 0.5357839947950553,
"grad_norm": 0.18942809104919434,
"learning_rate": 0.0001,
"loss": 1.7179,
"step": 1098
},
{
"epoch": 0.5362719583604424,
"grad_norm": 0.1868084967136383,
"learning_rate": 0.0001,
"loss": 1.6454,
"step": 1099
},
{
"epoch": 0.5367599219258296,
"grad_norm": 0.18689820170402527,
"learning_rate": 0.0001,
"loss": 1.6196,
"step": 1100
},
{
"epoch": 0.5372478854912166,
"grad_norm": 0.1820572018623352,
"learning_rate": 0.0001,
"loss": 1.6673,
"step": 1101
},
{
"epoch": 0.5377358490566038,
"grad_norm": 0.17870689928531647,
"learning_rate": 0.0001,
"loss": 1.5968,
"step": 1102
},
{
"epoch": 0.5382238126219909,
"grad_norm": 0.18118569254875183,
"learning_rate": 0.0001,
"loss": 1.7227,
"step": 1103
},
{
"epoch": 0.538711776187378,
"grad_norm": 0.1880924552679062,
"learning_rate": 0.0001,
"loss": 1.6108,
"step": 1104
},
{
"epoch": 0.5391997397527651,
"grad_norm": 0.18598206341266632,
"learning_rate": 0.0001,
"loss": 1.6542,
"step": 1105
},
{
"epoch": 0.5396877033181522,
"grad_norm": 0.1872934103012085,
"learning_rate": 0.0001,
"loss": 1.737,
"step": 1106
},
{
"epoch": 0.5401756668835394,
"grad_norm": 0.1890784651041031,
"learning_rate": 0.0001,
"loss": 1.6661,
"step": 1107
},
{
"epoch": 0.5406636304489265,
"grad_norm": 0.18039381504058838,
"learning_rate": 0.0001,
"loss": 1.6276,
"step": 1108
},
{
"epoch": 0.5411515940143136,
"grad_norm": 0.18550348281860352,
"learning_rate": 0.0001,
"loss": 1.6828,
"step": 1109
},
{
"epoch": 0.5416395575797007,
"grad_norm": 0.17449964582920074,
"learning_rate": 0.0001,
"loss": 1.5034,
"step": 1110
},
{
"epoch": 0.5421275211450879,
"grad_norm": 0.18202394247055054,
"learning_rate": 0.0001,
"loss": 1.6561,
"step": 1111
},
{
"epoch": 0.5426154847104749,
"grad_norm": 0.19365155696868896,
"learning_rate": 0.0001,
"loss": 1.5,
"step": 1112
},
{
"epoch": 0.5431034482758621,
"grad_norm": 0.17744717001914978,
"learning_rate": 0.0001,
"loss": 1.5921,
"step": 1113
},
{
"epoch": 0.5435914118412492,
"grad_norm": 0.17965885996818542,
"learning_rate": 0.0001,
"loss": 1.6819,
"step": 1114
},
{
"epoch": 0.5440793754066363,
"grad_norm": 0.17675574123859406,
"learning_rate": 0.0001,
"loss": 1.6471,
"step": 1115
},
{
"epoch": 0.5445673389720235,
"grad_norm": 0.17376431822776794,
"learning_rate": 0.0001,
"loss": 1.7007,
"step": 1116
},
{
"epoch": 0.5450553025374105,
"grad_norm": 0.18188650906085968,
"learning_rate": 0.0001,
"loss": 1.774,
"step": 1117
},
{
"epoch": 0.5455432661027977,
"grad_norm": 0.17877081036567688,
"learning_rate": 0.0001,
"loss": 1.6535,
"step": 1118
},
{
"epoch": 0.5460312296681847,
"grad_norm": 0.17933769524097443,
"learning_rate": 0.0001,
"loss": 1.7362,
"step": 1119
},
{
"epoch": 0.5465191932335719,
"grad_norm": 0.1805192083120346,
"learning_rate": 0.0001,
"loss": 1.7321,
"step": 1120
},
{
"epoch": 0.5470071567989591,
"grad_norm": 0.17312046885490417,
"learning_rate": 0.0001,
"loss": 1.6415,
"step": 1121
},
{
"epoch": 0.5474951203643461,
"grad_norm": 0.18119437992572784,
"learning_rate": 0.0001,
"loss": 1.7104,
"step": 1122
},
{
"epoch": 0.5479830839297333,
"grad_norm": 0.182356595993042,
"learning_rate": 0.0001,
"loss": 1.6866,
"step": 1123
},
{
"epoch": 0.5484710474951203,
"grad_norm": 0.1846156120300293,
"learning_rate": 0.0001,
"loss": 1.6612,
"step": 1124
},
{
"epoch": 0.5489590110605075,
"grad_norm": 0.17960377037525177,
"learning_rate": 0.0001,
"loss": 1.6848,
"step": 1125
},
{
"epoch": 0.5494469746258946,
"grad_norm": 0.17133495211601257,
"learning_rate": 0.0001,
"loss": 1.5885,
"step": 1126
},
{
"epoch": 0.5499349381912817,
"grad_norm": 0.18075834214687347,
"learning_rate": 0.0001,
"loss": 1.7428,
"step": 1127
},
{
"epoch": 0.5504229017566689,
"grad_norm": 0.18319405615329742,
"learning_rate": 0.0001,
"loss": 1.5856,
"step": 1128
},
{
"epoch": 0.5509108653220559,
"grad_norm": 0.17644239962100983,
"learning_rate": 0.0001,
"loss": 1.6198,
"step": 1129
},
{
"epoch": 0.5513988288874431,
"grad_norm": 0.18394580483436584,
"learning_rate": 0.0001,
"loss": 1.6435,
"step": 1130
},
{
"epoch": 0.5518867924528302,
"grad_norm": 0.1763201355934143,
"learning_rate": 0.0001,
"loss": 1.6975,
"step": 1131
},
{
"epoch": 0.5523747560182173,
"grad_norm": 0.16742850840091705,
"learning_rate": 0.0001,
"loss": 1.5377,
"step": 1132
},
{
"epoch": 0.5528627195836044,
"grad_norm": 0.1892685890197754,
"learning_rate": 0.0001,
"loss": 1.6111,
"step": 1133
},
{
"epoch": 0.5533506831489915,
"grad_norm": 0.18346691131591797,
"learning_rate": 0.0001,
"loss": 1.6844,
"step": 1134
},
{
"epoch": 0.5538386467143787,
"grad_norm": 0.1796543449163437,
"learning_rate": 0.0001,
"loss": 1.7746,
"step": 1135
},
{
"epoch": 0.5543266102797658,
"grad_norm": 0.18673722445964813,
"learning_rate": 0.0001,
"loss": 1.6464,
"step": 1136
},
{
"epoch": 0.5548145738451529,
"grad_norm": 0.17763900756835938,
"learning_rate": 0.0001,
"loss": 1.6237,
"step": 1137
},
{
"epoch": 0.55530253741054,
"grad_norm": 0.17686204612255096,
"learning_rate": 0.0001,
"loss": 1.5131,
"step": 1138
},
{
"epoch": 0.5557905009759271,
"grad_norm": 0.18360872566699982,
"learning_rate": 0.0001,
"loss": 1.6699,
"step": 1139
},
{
"epoch": 0.5562784645413142,
"grad_norm": 0.1827259063720703,
"learning_rate": 0.0001,
"loss": 1.746,
"step": 1140
},
{
"epoch": 0.5567664281067014,
"grad_norm": 0.17962484061717987,
"learning_rate": 0.0001,
"loss": 1.6284,
"step": 1141
},
{
"epoch": 0.5572543916720885,
"grad_norm": 0.18114878237247467,
"learning_rate": 0.0001,
"loss": 1.6737,
"step": 1142
},
{
"epoch": 0.5577423552374756,
"grad_norm": 0.18968282639980316,
"learning_rate": 0.0001,
"loss": 1.7798,
"step": 1143
},
{
"epoch": 0.5582303188028627,
"grad_norm": 0.18505877256393433,
"learning_rate": 0.0001,
"loss": 1.708,
"step": 1144
},
{
"epoch": 0.5587182823682498,
"grad_norm": 0.1776040643453598,
"learning_rate": 0.0001,
"loss": 1.7424,
"step": 1145
},
{
"epoch": 0.559206245933637,
"grad_norm": 0.17982693016529083,
"learning_rate": 0.0001,
"loss": 1.6197,
"step": 1146
},
{
"epoch": 0.559694209499024,
"grad_norm": 0.19187504053115845,
"learning_rate": 0.0001,
"loss": 1.7451,
"step": 1147
},
{
"epoch": 0.5601821730644112,
"grad_norm": 0.17975229024887085,
"learning_rate": 0.0001,
"loss": 1.6236,
"step": 1148
},
{
"epoch": 0.5606701366297983,
"grad_norm": 0.18996664881706238,
"learning_rate": 0.0001,
"loss": 1.7377,
"step": 1149
},
{
"epoch": 0.5611581001951854,
"grad_norm": 0.18252383172512054,
"learning_rate": 0.0001,
"loss": 1.628,
"step": 1150
},
{
"epoch": 0.5616460637605726,
"grad_norm": 0.18448345363140106,
"learning_rate": 0.0001,
"loss": 1.7109,
"step": 1151
},
{
"epoch": 0.5621340273259596,
"grad_norm": 0.17741243541240692,
"learning_rate": 0.0001,
"loss": 1.6088,
"step": 1152
},
{
"epoch": 0.5626219908913468,
"grad_norm": 0.19825778901576996,
"learning_rate": 0.0001,
"loss": 1.5972,
"step": 1153
},
{
"epoch": 0.563109954456734,
"grad_norm": 0.18595324456691742,
"learning_rate": 0.0001,
"loss": 1.672,
"step": 1154
},
{
"epoch": 0.563597918022121,
"grad_norm": 0.18176652491092682,
"learning_rate": 0.0001,
"loss": 1.6216,
"step": 1155
},
{
"epoch": 0.5640858815875082,
"grad_norm": 0.1950223743915558,
"learning_rate": 0.0001,
"loss": 1.705,
"step": 1156
},
{
"epoch": 0.5645738451528952,
"grad_norm": 0.1990990787744522,
"learning_rate": 0.0001,
"loss": 1.7031,
"step": 1157
},
{
"epoch": 0.5650618087182824,
"grad_norm": 0.1937246173620224,
"learning_rate": 0.0001,
"loss": 1.6838,
"step": 1158
},
{
"epoch": 0.5655497722836695,
"grad_norm": 0.1884077787399292,
"learning_rate": 0.0001,
"loss": 1.5994,
"step": 1159
},
{
"epoch": 0.5660377358490566,
"grad_norm": 0.19293847680091858,
"learning_rate": 0.0001,
"loss": 1.7657,
"step": 1160
},
{
"epoch": 0.5665256994144438,
"grad_norm": 0.18362392485141754,
"learning_rate": 0.0001,
"loss": 1.7443,
"step": 1161
},
{
"epoch": 0.5670136629798308,
"grad_norm": 0.17800559103488922,
"learning_rate": 0.0001,
"loss": 1.6433,
"step": 1162
},
{
"epoch": 0.567501626545218,
"grad_norm": 0.1774267852306366,
"learning_rate": 0.0001,
"loss": 1.6468,
"step": 1163
},
{
"epoch": 0.5679895901106051,
"grad_norm": 0.18834517896175385,
"learning_rate": 0.0001,
"loss": 1.7715,
"step": 1164
},
{
"epoch": 0.5684775536759922,
"grad_norm": 0.1841384768486023,
"learning_rate": 0.0001,
"loss": 1.7604,
"step": 1165
},
{
"epoch": 0.5689655172413793,
"grad_norm": 0.18285635113716125,
"learning_rate": 0.0001,
"loss": 1.6941,
"step": 1166
},
{
"epoch": 0.5694534808067664,
"grad_norm": 0.1796160191297531,
"learning_rate": 0.0001,
"loss": 1.6835,
"step": 1167
},
{
"epoch": 0.5699414443721535,
"grad_norm": 0.18359331786632538,
"learning_rate": 0.0001,
"loss": 1.6658,
"step": 1168
},
{
"epoch": 0.5704294079375407,
"grad_norm": 0.17833665013313293,
"learning_rate": 0.0001,
"loss": 1.6455,
"step": 1169
},
{
"epoch": 0.5709173715029278,
"grad_norm": 0.17929013073444366,
"learning_rate": 0.0001,
"loss": 1.5912,
"step": 1170
},
{
"epoch": 0.5714053350683149,
"grad_norm": 0.18901382386684418,
"learning_rate": 0.0001,
"loss": 1.7305,
"step": 1171
},
{
"epoch": 0.571893298633702,
"grad_norm": 0.18040084838867188,
"learning_rate": 0.0001,
"loss": 1.6239,
"step": 1172
},
{
"epoch": 0.5723812621990891,
"grad_norm": 0.1832232028245926,
"learning_rate": 0.0001,
"loss": 1.6594,
"step": 1173
},
{
"epoch": 0.5728692257644763,
"grad_norm": 0.1900448203086853,
"learning_rate": 0.0001,
"loss": 1.7176,
"step": 1174
},
{
"epoch": 0.5733571893298633,
"grad_norm": 0.1859886199235916,
"learning_rate": 0.0001,
"loss": 1.6823,
"step": 1175
},
{
"epoch": 0.5738451528952505,
"grad_norm": 0.1816965788602829,
"learning_rate": 0.0001,
"loss": 1.6936,
"step": 1176
},
{
"epoch": 0.5743331164606376,
"grad_norm": 0.1927751749753952,
"learning_rate": 0.0001,
"loss": 1.7069,
"step": 1177
},
{
"epoch": 0.5748210800260247,
"grad_norm": 0.20290379226207733,
"learning_rate": 0.0001,
"loss": 1.7987,
"step": 1178
},
{
"epoch": 0.5753090435914119,
"grad_norm": 0.1756032556295395,
"learning_rate": 0.0001,
"loss": 1.656,
"step": 1179
},
{
"epoch": 0.5757970071567989,
"grad_norm": 0.19676676392555237,
"learning_rate": 0.0001,
"loss": 1.8415,
"step": 1180
},
{
"epoch": 0.5762849707221861,
"grad_norm": 0.18112622201442719,
"learning_rate": 0.0001,
"loss": 1.6081,
"step": 1181
},
{
"epoch": 0.5767729342875731,
"grad_norm": 0.20109887421131134,
"learning_rate": 0.0001,
"loss": 1.7772,
"step": 1182
},
{
"epoch": 0.5772608978529603,
"grad_norm": 0.191656693816185,
"learning_rate": 0.0001,
"loss": 1.6869,
"step": 1183
},
{
"epoch": 0.5777488614183475,
"grad_norm": 0.17886236310005188,
"learning_rate": 0.0001,
"loss": 1.5931,
"step": 1184
},
{
"epoch": 0.5782368249837345,
"grad_norm": 0.18148286640644073,
"learning_rate": 0.0001,
"loss": 1.6056,
"step": 1185
},
{
"epoch": 0.5787247885491217,
"grad_norm": 0.20596817135810852,
"learning_rate": 0.0001,
"loss": 1.6129,
"step": 1186
},
{
"epoch": 0.5792127521145087,
"grad_norm": 0.17900511622428894,
"learning_rate": 0.0001,
"loss": 1.6487,
"step": 1187
},
{
"epoch": 0.5797007156798959,
"grad_norm": 0.1893642693758011,
"learning_rate": 0.0001,
"loss": 1.7566,
"step": 1188
},
{
"epoch": 0.5801886792452831,
"grad_norm": 0.19354504346847534,
"learning_rate": 0.0001,
"loss": 1.6665,
"step": 1189
},
{
"epoch": 0.5806766428106701,
"grad_norm": 0.18692192435264587,
"learning_rate": 0.0001,
"loss": 1.7069,
"step": 1190
},
{
"epoch": 0.5811646063760573,
"grad_norm": 0.204212948679924,
"learning_rate": 0.0001,
"loss": 1.7943,
"step": 1191
},
{
"epoch": 0.5816525699414443,
"grad_norm": 0.18666908144950867,
"learning_rate": 0.0001,
"loss": 1.7031,
"step": 1192
},
{
"epoch": 0.5821405335068315,
"grad_norm": 0.1859620362520218,
"learning_rate": 0.0001,
"loss": 1.7443,
"step": 1193
},
{
"epoch": 0.5826284970722186,
"grad_norm": 0.1774389147758484,
"learning_rate": 0.0001,
"loss": 1.6697,
"step": 1194
},
{
"epoch": 0.5831164606376057,
"grad_norm": 0.17645440995693207,
"learning_rate": 0.0001,
"loss": 1.7566,
"step": 1195
},
{
"epoch": 0.5836044242029929,
"grad_norm": 0.17927305400371552,
"learning_rate": 0.0001,
"loss": 1.5341,
"step": 1196
},
{
"epoch": 0.5840923877683799,
"grad_norm": 0.19179411232471466,
"learning_rate": 0.0001,
"loss": 1.706,
"step": 1197
},
{
"epoch": 0.5845803513337671,
"grad_norm": 0.18921273946762085,
"learning_rate": 0.0001,
"loss": 1.6651,
"step": 1198
},
{
"epoch": 0.5850683148991542,
"grad_norm": 0.20988748967647552,
"learning_rate": 0.0001,
"loss": 1.8307,
"step": 1199
},
{
"epoch": 0.5855562784645413,
"grad_norm": 0.1767909973859787,
"learning_rate": 0.0001,
"loss": 1.7116,
"step": 1200
}
],
"logging_steps": 1,
"max_steps": 2049,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 300,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 5.490022511384986e+18,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}