claudiomello's picture
Upload folder using huggingface_hub
3674cef verified
Invalid JSON: Unexpected token 'I', ..."ad_norm": Infinity, "... is not valid JSON
{
"best_metric": 0.244761124253273,
"best_model_checkpoint": "Classifier-Intent-snowflake/checkpoint-803",
"epoch": 1.0,
"eval_steps": 500,
"global_step": 803,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0012453300124533001,
"grad_norm": 14.392992973327637,
"learning_rate": 3.1133250311332504e-08,
"loss": 1.3872,
"step": 1
},
{
"epoch": 0.0024906600249066002,
"grad_norm": 16.613218307495117,
"learning_rate": 6.226650062266501e-08,
"loss": 1.4209,
"step": 2
},
{
"epoch": 0.0037359900373599006,
"grad_norm": 14.957581520080566,
"learning_rate": 9.339975093399752e-08,
"loss": 1.5269,
"step": 3
},
{
"epoch": 0.0049813200498132005,
"grad_norm": 14.315893173217773,
"learning_rate": 1.2453300124533001e-07,
"loss": 1.3745,
"step": 4
},
{
"epoch": 0.0062266500622665,
"grad_norm": 17.72991371154785,
"learning_rate": 1.556662515566625e-07,
"loss": 1.2588,
"step": 5
},
{
"epoch": 0.007471980074719801,
"grad_norm": 15.170116424560547,
"learning_rate": 1.8679950186799505e-07,
"loss": 1.4722,
"step": 6
},
{
"epoch": 0.008717310087173101,
"grad_norm": 14.7129487991333,
"learning_rate": 2.1793275217932754e-07,
"loss": 1.4404,
"step": 7
},
{
"epoch": 0.009962640099626401,
"grad_norm": 19.042442321777344,
"learning_rate": 2.4906600249066003e-07,
"loss": 1.5845,
"step": 8
},
{
"epoch": 0.0112079701120797,
"grad_norm": 14.830946922302246,
"learning_rate": 2.801992528019925e-07,
"loss": 1.3213,
"step": 9
},
{
"epoch": 0.012453300124533,
"grad_norm": 15.1524076461792,
"learning_rate": 3.11332503113325e-07,
"loss": 1.2402,
"step": 10
},
{
"epoch": 0.0136986301369863,
"grad_norm": 15.068155288696289,
"learning_rate": 3.4246575342465755e-07,
"loss": 1.3062,
"step": 11
},
{
"epoch": 0.014943960149439602,
"grad_norm": 17.31379508972168,
"learning_rate": 3.735990037359901e-07,
"loss": 1.6055,
"step": 12
},
{
"epoch": 0.0161892901618929,
"grad_norm": 15.690240859985352,
"learning_rate": 4.0473225404732254e-07,
"loss": 1.4761,
"step": 13
},
{
"epoch": 0.017434620174346202,
"grad_norm": 14.473444938659668,
"learning_rate": 4.358655043586551e-07,
"loss": 1.4365,
"step": 14
},
{
"epoch": 0.0186799501867995,
"grad_norm": 15.4556884765625,
"learning_rate": 4.669987546699875e-07,
"loss": 1.5645,
"step": 15
},
{
"epoch": 0.019925280199252802,
"grad_norm": 16.610450744628906,
"learning_rate": 4.981320049813201e-07,
"loss": 1.3652,
"step": 16
},
{
"epoch": 0.021170610211706103,
"grad_norm": Infinity,
"learning_rate": 4.981320049813201e-07,
"loss": 1.5137,
"step": 17
},
{
"epoch": 0.0224159402241594,
"grad_norm": 16.464548110961914,
"learning_rate": 5.292652552926527e-07,
"loss": 1.2983,
"step": 18
},
{
"epoch": 0.023661270236612703,
"grad_norm": 13.879263877868652,
"learning_rate": 5.60398505603985e-07,
"loss": 1.3018,
"step": 19
},
{
"epoch": 0.024906600249066,
"grad_norm": 18.191198348999023,
"learning_rate": 5.915317559153176e-07,
"loss": 1.5151,
"step": 20
},
{
"epoch": 0.026151930261519303,
"grad_norm": 14.711188316345215,
"learning_rate": 6.2266500622665e-07,
"loss": 1.4517,
"step": 21
},
{
"epoch": 0.0273972602739726,
"grad_norm": 20.39883804321289,
"learning_rate": 6.537982565379826e-07,
"loss": 1.5142,
"step": 22
},
{
"epoch": 0.028642590286425903,
"grad_norm": 17.874603271484375,
"learning_rate": 6.849315068493151e-07,
"loss": 1.4731,
"step": 23
},
{
"epoch": 0.029887920298879204,
"grad_norm": 15.248433113098145,
"learning_rate": 7.160647571606476e-07,
"loss": 1.4927,
"step": 24
},
{
"epoch": 0.031133250311332503,
"grad_norm": 14.43382453918457,
"learning_rate": 7.471980074719802e-07,
"loss": 1.2744,
"step": 25
},
{
"epoch": 0.0323785803237858,
"grad_norm": 20.193641662597656,
"learning_rate": 7.783312577833126e-07,
"loss": 1.5669,
"step": 26
},
{
"epoch": 0.033623910336239106,
"grad_norm": 16.741762161254883,
"learning_rate": 8.094645080946451e-07,
"loss": 1.5303,
"step": 27
},
{
"epoch": 0.034869240348692404,
"grad_norm": 15.6235933303833,
"learning_rate": 8.405977584059777e-07,
"loss": 1.3936,
"step": 28
},
{
"epoch": 0.0361145703611457,
"grad_norm": 14.727874755859375,
"learning_rate": 8.717310087173102e-07,
"loss": 1.4126,
"step": 29
},
{
"epoch": 0.037359900373599,
"grad_norm": 16.20413589477539,
"learning_rate": 9.028642590286426e-07,
"loss": 1.4624,
"step": 30
},
{
"epoch": 0.038605230386052306,
"grad_norm": 20.796939849853516,
"learning_rate": 9.33997509339975e-07,
"loss": 1.3433,
"step": 31
},
{
"epoch": 0.039850560398505604,
"grad_norm": 16.971792221069336,
"learning_rate": 9.651307596513077e-07,
"loss": 1.3628,
"step": 32
},
{
"epoch": 0.0410958904109589,
"grad_norm": 14.428796768188477,
"learning_rate": 9.962640099626401e-07,
"loss": 1.2837,
"step": 33
},
{
"epoch": 0.04234122042341221,
"grad_norm": 15.790252685546875,
"learning_rate": 1.0273972602739725e-06,
"loss": 1.4268,
"step": 34
},
{
"epoch": 0.043586550435865505,
"grad_norm": 16.02347183227539,
"learning_rate": 1.0585305105853053e-06,
"loss": 1.4766,
"step": 35
},
{
"epoch": 0.0448318804483188,
"grad_norm": 15.317863464355469,
"learning_rate": 1.0896637608966377e-06,
"loss": 1.3018,
"step": 36
},
{
"epoch": 0.0460772104607721,
"grad_norm": 22.28313636779785,
"learning_rate": 1.12079701120797e-06,
"loss": 1.4688,
"step": 37
},
{
"epoch": 0.047322540473225407,
"grad_norm": 15.996356964111328,
"learning_rate": 1.1519302615193027e-06,
"loss": 1.1543,
"step": 38
},
{
"epoch": 0.048567870485678705,
"grad_norm": 15.208770751953125,
"learning_rate": 1.1830635118306353e-06,
"loss": 1.4375,
"step": 39
},
{
"epoch": 0.049813200498132,
"grad_norm": 15.227863311767578,
"learning_rate": 1.2141967621419677e-06,
"loss": 1.4365,
"step": 40
},
{
"epoch": 0.05105853051058531,
"grad_norm": 14.673625946044922,
"learning_rate": 1.2453300124533e-06,
"loss": 1.2534,
"step": 41
},
{
"epoch": 0.052303860523038606,
"grad_norm": 17.28438949584961,
"learning_rate": 1.2764632627646329e-06,
"loss": 1.5381,
"step": 42
},
{
"epoch": 0.053549190535491904,
"grad_norm": 16.5577449798584,
"learning_rate": 1.3075965130759652e-06,
"loss": 1.5459,
"step": 43
},
{
"epoch": 0.0547945205479452,
"grad_norm": 18.29193687438965,
"learning_rate": 1.3387297633872976e-06,
"loss": 1.1919,
"step": 44
},
{
"epoch": 0.05603985056039851,
"grad_norm": 15.694727897644043,
"learning_rate": 1.3698630136986302e-06,
"loss": 1.4409,
"step": 45
},
{
"epoch": 0.057285180572851806,
"grad_norm": 14.10815715789795,
"learning_rate": 1.4009962640099628e-06,
"loss": 1.2461,
"step": 46
},
{
"epoch": 0.058530510585305104,
"grad_norm": 14.045819282531738,
"learning_rate": 1.4321295143212952e-06,
"loss": 1.4111,
"step": 47
},
{
"epoch": 0.05977584059775841,
"grad_norm": 19.675201416015625,
"learning_rate": 1.4632627646326276e-06,
"loss": 1.4072,
"step": 48
},
{
"epoch": 0.06102117061021171,
"grad_norm": 14.410515785217285,
"learning_rate": 1.4943960149439604e-06,
"loss": 1.23,
"step": 49
},
{
"epoch": 0.062266500622665005,
"grad_norm": 16.496902465820312,
"learning_rate": 1.5255292652552928e-06,
"loss": 1.3691,
"step": 50
},
{
"epoch": 0.06351183063511831,
"grad_norm": 14.99001407623291,
"learning_rate": 1.5566625155666252e-06,
"loss": 1.2393,
"step": 51
},
{
"epoch": 0.0647571606475716,
"grad_norm": 14.407447814941406,
"learning_rate": 1.5877957658779578e-06,
"loss": 1.3501,
"step": 52
},
{
"epoch": 0.0660024906600249,
"grad_norm": 15.634856224060059,
"learning_rate": 1.6189290161892901e-06,
"loss": 1.5059,
"step": 53
},
{
"epoch": 0.06724782067247821,
"grad_norm": 13.683075904846191,
"learning_rate": 1.6500622665006227e-06,
"loss": 1.2251,
"step": 54
},
{
"epoch": 0.0684931506849315,
"grad_norm": 15.530966758728027,
"learning_rate": 1.6811955168119553e-06,
"loss": 1.229,
"step": 55
},
{
"epoch": 0.06973848069738481,
"grad_norm": 14.17822265625,
"learning_rate": 1.7123287671232877e-06,
"loss": 1.2646,
"step": 56
},
{
"epoch": 0.07098381070983811,
"grad_norm": 14.06949234008789,
"learning_rate": 1.7434620174346203e-06,
"loss": 1.1851,
"step": 57
},
{
"epoch": 0.0722291407222914,
"grad_norm": 13.386149406433105,
"learning_rate": 1.774595267745953e-06,
"loss": 1.1406,
"step": 58
},
{
"epoch": 0.07347447073474471,
"grad_norm": 15.319520950317383,
"learning_rate": 1.8057285180572853e-06,
"loss": 1.2173,
"step": 59
},
{
"epoch": 0.074719800747198,
"grad_norm": 14.985965728759766,
"learning_rate": 1.8368617683686179e-06,
"loss": 1.3159,
"step": 60
},
{
"epoch": 0.0759651307596513,
"grad_norm": 17.426523208618164,
"learning_rate": 1.86799501867995e-06,
"loss": 1.0112,
"step": 61
},
{
"epoch": 0.07721046077210461,
"grad_norm": 15.114604949951172,
"learning_rate": 1.8991282689912827e-06,
"loss": 1.3403,
"step": 62
},
{
"epoch": 0.0784557907845579,
"grad_norm": 16.03323745727539,
"learning_rate": 1.9302615193026155e-06,
"loss": 1.2666,
"step": 63
},
{
"epoch": 0.07970112079701121,
"grad_norm": 13.463469505310059,
"learning_rate": 1.9613947696139476e-06,
"loss": 1.106,
"step": 64
},
{
"epoch": 0.08094645080946451,
"grad_norm": 15.67467212677002,
"learning_rate": 1.9925280199252802e-06,
"loss": 1.3604,
"step": 65
},
{
"epoch": 0.0821917808219178,
"grad_norm": 16.3656063079834,
"learning_rate": 2.023661270236613e-06,
"loss": 1.0149,
"step": 66
},
{
"epoch": 0.08343711083437111,
"grad_norm": 18.009429931640625,
"learning_rate": 2.054794520547945e-06,
"loss": 1.2056,
"step": 67
},
{
"epoch": 0.08468244084682441,
"grad_norm": 17.479284286499023,
"learning_rate": 2.085927770859278e-06,
"loss": 0.96,
"step": 68
},
{
"epoch": 0.0859277708592777,
"grad_norm": 18.173294067382812,
"learning_rate": 2.1170610211706106e-06,
"loss": 1.0894,
"step": 69
},
{
"epoch": 0.08717310087173101,
"grad_norm": 13.998863220214844,
"learning_rate": 2.148194271481943e-06,
"loss": 1.1992,
"step": 70
},
{
"epoch": 0.08841843088418432,
"grad_norm": 20.954397201538086,
"learning_rate": 2.1793275217932754e-06,
"loss": 1.2236,
"step": 71
},
{
"epoch": 0.0896637608966376,
"grad_norm": 15.964156150817871,
"learning_rate": 2.210460772104608e-06,
"loss": 1.4097,
"step": 72
},
{
"epoch": 0.09090909090909091,
"grad_norm": 15.810689926147461,
"learning_rate": 2.24159402241594e-06,
"loss": 0.8547,
"step": 73
},
{
"epoch": 0.0921544209215442,
"grad_norm": 17.040708541870117,
"learning_rate": 2.2727272727272728e-06,
"loss": 1.4102,
"step": 74
},
{
"epoch": 0.09339975093399751,
"grad_norm": 14.936725616455078,
"learning_rate": 2.3038605230386054e-06,
"loss": 1.249,
"step": 75
},
{
"epoch": 0.09464508094645081,
"grad_norm": 15.473489761352539,
"learning_rate": 2.334993773349938e-06,
"loss": 0.833,
"step": 76
},
{
"epoch": 0.0958904109589041,
"grad_norm": 20.1041259765625,
"learning_rate": 2.3661270236612705e-06,
"loss": 1.4458,
"step": 77
},
{
"epoch": 0.09713574097135741,
"grad_norm": 13.799981117248535,
"learning_rate": 2.3972602739726027e-06,
"loss": 1.0784,
"step": 78
},
{
"epoch": 0.09838107098381071,
"grad_norm": 17.304981231689453,
"learning_rate": 2.4283935242839353e-06,
"loss": 1.5112,
"step": 79
},
{
"epoch": 0.099626400996264,
"grad_norm": 13.382006645202637,
"learning_rate": 2.459526774595268e-06,
"loss": 1.063,
"step": 80
},
{
"epoch": 0.10087173100871731,
"grad_norm": 14.760406494140625,
"learning_rate": 2.4906600249066e-06,
"loss": 1.1277,
"step": 81
},
{
"epoch": 0.10211706102117062,
"grad_norm": 13.276914596557617,
"learning_rate": 2.5217932752179327e-06,
"loss": 0.9333,
"step": 82
},
{
"epoch": 0.10336239103362391,
"grad_norm": 21.620939254760742,
"learning_rate": 2.5529265255292657e-06,
"loss": 1.7554,
"step": 83
},
{
"epoch": 0.10460772104607721,
"grad_norm": 18.264217376708984,
"learning_rate": 2.584059775840598e-06,
"loss": 1.2744,
"step": 84
},
{
"epoch": 0.10585305105853052,
"grad_norm": 15.217682838439941,
"learning_rate": 2.6151930261519305e-06,
"loss": 1.2827,
"step": 85
},
{
"epoch": 0.10709838107098381,
"grad_norm": 18.51647186279297,
"learning_rate": 2.646326276463263e-06,
"loss": 1.5586,
"step": 86
},
{
"epoch": 0.10834371108343711,
"grad_norm": 15.398965835571289,
"learning_rate": 2.6774595267745952e-06,
"loss": 0.916,
"step": 87
},
{
"epoch": 0.1095890410958904,
"grad_norm": 14.449968338012695,
"learning_rate": 2.708592777085928e-06,
"loss": 0.6475,
"step": 88
},
{
"epoch": 0.11083437110834371,
"grad_norm": 15.214373588562012,
"learning_rate": 2.7397260273972604e-06,
"loss": 1.1885,
"step": 89
},
{
"epoch": 0.11207970112079702,
"grad_norm": 21.287311553955078,
"learning_rate": 2.770859277708593e-06,
"loss": 1.3501,
"step": 90
},
{
"epoch": 0.1133250311332503,
"grad_norm": 14.835405349731445,
"learning_rate": 2.8019925280199256e-06,
"loss": 0.9062,
"step": 91
},
{
"epoch": 0.11457036114570361,
"grad_norm": 16.75213050842285,
"learning_rate": 2.833125778331258e-06,
"loss": 1.1338,
"step": 92
},
{
"epoch": 0.11581569115815692,
"grad_norm": 14.93796157836914,
"learning_rate": 2.8642590286425904e-06,
"loss": 0.9265,
"step": 93
},
{
"epoch": 0.11706102117061021,
"grad_norm": 15.707828521728516,
"learning_rate": 2.895392278953923e-06,
"loss": 1.0312,
"step": 94
},
{
"epoch": 0.11830635118306351,
"grad_norm": 15.904691696166992,
"learning_rate": 2.926525529265255e-06,
"loss": 0.9608,
"step": 95
},
{
"epoch": 0.11955168119551682,
"grad_norm": 13.52252197265625,
"learning_rate": 2.9576587795765878e-06,
"loss": 0.6462,
"step": 96
},
{
"epoch": 0.12079701120797011,
"grad_norm": 15.788945198059082,
"learning_rate": 2.9887920298879208e-06,
"loss": 1.2263,
"step": 97
},
{
"epoch": 0.12204234122042341,
"grad_norm": 15.971314430236816,
"learning_rate": 3.019925280199253e-06,
"loss": 0.6865,
"step": 98
},
{
"epoch": 0.1232876712328767,
"grad_norm": 16.350345611572266,
"learning_rate": 3.0510585305105856e-06,
"loss": 0.9343,
"step": 99
},
{
"epoch": 0.12453300124533001,
"grad_norm": 23.604875564575195,
"learning_rate": 3.0821917808219177e-06,
"loss": 1.2271,
"step": 100
},
{
"epoch": 0.12577833125778332,
"grad_norm": 16.765127182006836,
"learning_rate": 3.1133250311332503e-06,
"loss": 0.9685,
"step": 101
},
{
"epoch": 0.12702366127023662,
"grad_norm": 19.068199157714844,
"learning_rate": 3.144458281444583e-06,
"loss": 1.4028,
"step": 102
},
{
"epoch": 0.12826899128268993,
"grad_norm": 23.64339828491211,
"learning_rate": 3.1755915317559155e-06,
"loss": 1.1528,
"step": 103
},
{
"epoch": 0.1295143212951432,
"grad_norm": 17.963857650756836,
"learning_rate": 3.206724782067248e-06,
"loss": 1.2183,
"step": 104
},
{
"epoch": 0.1307596513075965,
"grad_norm": 24.50640106201172,
"learning_rate": 3.2378580323785803e-06,
"loss": 1.1194,
"step": 105
},
{
"epoch": 0.1320049813200498,
"grad_norm": 13.496341705322266,
"learning_rate": 3.268991282689913e-06,
"loss": 0.8138,
"step": 106
},
{
"epoch": 0.13325031133250312,
"grad_norm": 13.470151901245117,
"learning_rate": 3.3001245330012455e-06,
"loss": 0.4418,
"step": 107
},
{
"epoch": 0.13449564134495642,
"grad_norm": 15.696036338806152,
"learning_rate": 3.331257783312578e-06,
"loss": 0.9106,
"step": 108
},
{
"epoch": 0.1357409713574097,
"grad_norm": 15.40795612335205,
"learning_rate": 3.3623910336239107e-06,
"loss": 0.8492,
"step": 109
},
{
"epoch": 0.136986301369863,
"grad_norm": 14.989590644836426,
"learning_rate": 3.393524283935243e-06,
"loss": 0.6815,
"step": 110
},
{
"epoch": 0.1382316313823163,
"grad_norm": 11.08140754699707,
"learning_rate": 3.4246575342465754e-06,
"loss": 0.3635,
"step": 111
},
{
"epoch": 0.13947696139476962,
"grad_norm": 13.492122650146484,
"learning_rate": 3.455790784557908e-06,
"loss": 0.4391,
"step": 112
},
{
"epoch": 0.14072229140722292,
"grad_norm": 24.947566986083984,
"learning_rate": 3.4869240348692406e-06,
"loss": 1.6245,
"step": 113
},
{
"epoch": 0.14196762141967623,
"grad_norm": 21.374814987182617,
"learning_rate": 3.5180572851805732e-06,
"loss": 1.0759,
"step": 114
},
{
"epoch": 0.1432129514321295,
"grad_norm": 12.628018379211426,
"learning_rate": 3.549190535491906e-06,
"loss": 0.3741,
"step": 115
},
{
"epoch": 0.1444582814445828,
"grad_norm": 28.174150466918945,
"learning_rate": 3.5803237858032376e-06,
"loss": 1.8252,
"step": 116
},
{
"epoch": 0.14570361145703611,
"grad_norm": 29.708969116210938,
"learning_rate": 3.6114570361145706e-06,
"loss": 1.6035,
"step": 117
},
{
"epoch": 0.14694894146948942,
"grad_norm": 14.904471397399902,
"learning_rate": 3.642590286425903e-06,
"loss": 0.693,
"step": 118
},
{
"epoch": 0.14819427148194272,
"grad_norm": 19.106191635131836,
"learning_rate": 3.6737235367372358e-06,
"loss": 0.7761,
"step": 119
},
{
"epoch": 0.149439601494396,
"grad_norm": 21.0386905670166,
"learning_rate": 3.7048567870485684e-06,
"loss": 1.1099,
"step": 120
},
{
"epoch": 0.1506849315068493,
"grad_norm": 11.261611938476562,
"learning_rate": 3.7359900373599e-06,
"loss": 0.3363,
"step": 121
},
{
"epoch": 0.1519302615193026,
"grad_norm": 21.45566749572754,
"learning_rate": 3.7671232876712327e-06,
"loss": 1.1392,
"step": 122
},
{
"epoch": 0.15317559153175592,
"grad_norm": 23.72317123413086,
"learning_rate": 3.7982565379825653e-06,
"loss": 1.2175,
"step": 123
},
{
"epoch": 0.15442092154420922,
"grad_norm": 9.110578536987305,
"learning_rate": 3.829389788293898e-06,
"loss": 0.2401,
"step": 124
},
{
"epoch": 0.15566625155666253,
"grad_norm": 10.689005851745605,
"learning_rate": 3.860523038605231e-06,
"loss": 0.2262,
"step": 125
},
{
"epoch": 0.1569115815691158,
"grad_norm": 18.003347396850586,
"learning_rate": 3.8916562889165635e-06,
"loss": 0.8304,
"step": 126
},
{
"epoch": 0.1581569115815691,
"grad_norm": 16.37116241455078,
"learning_rate": 3.922789539227895e-06,
"loss": 0.6732,
"step": 127
},
{
"epoch": 0.15940224159402241,
"grad_norm": 20.549619674682617,
"learning_rate": 3.953922789539228e-06,
"loss": 0.7898,
"step": 128
},
{
"epoch": 0.16064757160647572,
"grad_norm": 27.759565353393555,
"learning_rate": 3.9850560398505605e-06,
"loss": 1.6685,
"step": 129
},
{
"epoch": 0.16189290161892902,
"grad_norm": 10.014034271240234,
"learning_rate": 4.016189290161893e-06,
"loss": 0.2059,
"step": 130
},
{
"epoch": 0.16313823163138233,
"grad_norm": 18.375551223754883,
"learning_rate": 4.047322540473226e-06,
"loss": 0.5604,
"step": 131
},
{
"epoch": 0.1643835616438356,
"grad_norm": 23.120948791503906,
"learning_rate": 4.078455790784558e-06,
"loss": 1.2139,
"step": 132
},
{
"epoch": 0.1656288916562889,
"grad_norm": 20.939762115478516,
"learning_rate": 4.10958904109589e-06,
"loss": 0.8262,
"step": 133
},
{
"epoch": 0.16687422166874222,
"grad_norm": 39.98530578613281,
"learning_rate": 4.140722291407223e-06,
"loss": 1.2119,
"step": 134
},
{
"epoch": 0.16811955168119552,
"grad_norm": 16.684823989868164,
"learning_rate": 4.171855541718556e-06,
"loss": 0.7434,
"step": 135
},
{
"epoch": 0.16936488169364883,
"grad_norm": 8.765166282653809,
"learning_rate": 4.202988792029889e-06,
"loss": 0.1506,
"step": 136
},
{
"epoch": 0.1706102117061021,
"grad_norm": 20.599409103393555,
"learning_rate": 4.234122042341221e-06,
"loss": 0.8276,
"step": 137
},
{
"epoch": 0.1718555417185554,
"grad_norm": 27.572763442993164,
"learning_rate": 4.265255292652553e-06,
"loss": 1.0833,
"step": 138
},
{
"epoch": 0.17310087173100872,
"grad_norm": 18.92407989501953,
"learning_rate": 4.296388542963886e-06,
"loss": 0.4558,
"step": 139
},
{
"epoch": 0.17434620174346202,
"grad_norm": 17.19509506225586,
"learning_rate": 4.327521793275218e-06,
"loss": 0.2935,
"step": 140
},
{
"epoch": 0.17559153175591533,
"grad_norm": 24.49059295654297,
"learning_rate": 4.358655043586551e-06,
"loss": 0.7617,
"step": 141
},
{
"epoch": 0.17683686176836863,
"grad_norm": 10.664165496826172,
"learning_rate": 4.389788293897883e-06,
"loss": 0.2395,
"step": 142
},
{
"epoch": 0.1780821917808219,
"grad_norm": 25.44748878479004,
"learning_rate": 4.420921544209216e-06,
"loss": 0.9827,
"step": 143
},
{
"epoch": 0.1793275217932752,
"grad_norm": 15.069397926330566,
"learning_rate": 4.452054794520548e-06,
"loss": 0.631,
"step": 144
},
{
"epoch": 0.18057285180572852,
"grad_norm": 18.701967239379883,
"learning_rate": 4.48318804483188e-06,
"loss": 0.8523,
"step": 145
},
{
"epoch": 0.18181818181818182,
"grad_norm": 29.00722885131836,
"learning_rate": 4.514321295143213e-06,
"loss": 1.2954,
"step": 146
},
{
"epoch": 0.18306351183063513,
"grad_norm": 9.37511157989502,
"learning_rate": 4.5454545454545455e-06,
"loss": 0.2489,
"step": 147
},
{
"epoch": 0.1843088418430884,
"grad_norm": 6.786942005157471,
"learning_rate": 4.576587795765878e-06,
"loss": 0.1326,
"step": 148
},
{
"epoch": 0.1855541718555417,
"grad_norm": 28.655126571655273,
"learning_rate": 4.607721046077211e-06,
"loss": 0.9426,
"step": 149
},
{
"epoch": 0.18679950186799502,
"grad_norm": 6.270091533660889,
"learning_rate": 4.638854296388543e-06,
"loss": 0.203,
"step": 150
},
{
"epoch": 0.18804483188044832,
"grad_norm": 24.001052856445312,
"learning_rate": 4.669987546699876e-06,
"loss": 0.6611,
"step": 151
},
{
"epoch": 0.18929016189290163,
"grad_norm": 5.734297275543213,
"learning_rate": 4.7011207970112085e-06,
"loss": 0.1378,
"step": 152
},
{
"epoch": 0.19053549190535493,
"grad_norm": 10.421098709106445,
"learning_rate": 4.732254047322541e-06,
"loss": 0.1292,
"step": 153
},
{
"epoch": 0.1917808219178082,
"grad_norm": 6.499827861785889,
"learning_rate": 4.763387297633874e-06,
"loss": 0.1825,
"step": 154
},
{
"epoch": 0.1930261519302615,
"grad_norm": 7.8410563468933105,
"learning_rate": 4.7945205479452054e-06,
"loss": 0.2148,
"step": 155
},
{
"epoch": 0.19427148194271482,
"grad_norm": 21.975595474243164,
"learning_rate": 4.825653798256538e-06,
"loss": 0.3541,
"step": 156
},
{
"epoch": 0.19551681195516812,
"grad_norm": Infinity,
"learning_rate": 4.825653798256538e-06,
"loss": 0.611,
"step": 157
},
{
"epoch": 0.19676214196762143,
"grad_norm": 41.450469970703125,
"learning_rate": 4.856787048567871e-06,
"loss": 0.7124,
"step": 158
},
{
"epoch": 0.1980074719800747,
"grad_norm": 11.570192337036133,
"learning_rate": 4.887920298879203e-06,
"loss": 0.2204,
"step": 159
},
{
"epoch": 0.199252801992528,
"grad_norm": 9.37869930267334,
"learning_rate": 4.919053549190536e-06,
"loss": 0.2504,
"step": 160
},
{
"epoch": 0.20049813200498132,
"grad_norm": 10.956586837768555,
"learning_rate": 4.950186799501868e-06,
"loss": 0.2246,
"step": 161
},
{
"epoch": 0.20174346201743462,
"grad_norm": 6.231212139129639,
"learning_rate": 4.9813200498132e-06,
"loss": 0.1144,
"step": 162
},
{
"epoch": 0.20298879202988793,
"grad_norm": 7.454379558563232,
"learning_rate": 5.012453300124533e-06,
"loss": 0.1583,
"step": 163
},
{
"epoch": 0.20423412204234123,
"grad_norm": 4.702846050262451,
"learning_rate": 5.043586550435865e-06,
"loss": 0.0929,
"step": 164
},
{
"epoch": 0.2054794520547945,
"grad_norm": 35.559165954589844,
"learning_rate": 5.074719800747199e-06,
"loss": 0.4275,
"step": 165
},
{
"epoch": 0.20672478206724781,
"grad_norm": 2.42557430267334,
"learning_rate": 5.105853051058531e-06,
"loss": 0.0526,
"step": 166
},
{
"epoch": 0.20797011207970112,
"grad_norm": 1.8609647750854492,
"learning_rate": 5.136986301369863e-06,
"loss": 0.0334,
"step": 167
},
{
"epoch": 0.20921544209215442,
"grad_norm": 4.347940921783447,
"learning_rate": 5.168119551681196e-06,
"loss": 0.095,
"step": 168
},
{
"epoch": 0.21046077210460773,
"grad_norm": 7.721733093261719,
"learning_rate": 5.199252801992528e-06,
"loss": 0.1641,
"step": 169
},
{
"epoch": 0.21170610211706103,
"grad_norm": 42.037933349609375,
"learning_rate": 5.230386052303861e-06,
"loss": 0.4911,
"step": 170
},
{
"epoch": 0.2129514321295143,
"grad_norm": 15.133713722229004,
"learning_rate": 5.2615193026151935e-06,
"loss": 0.1069,
"step": 171
},
{
"epoch": 0.21419676214196762,
"grad_norm": 3.205000638961792,
"learning_rate": 5.292652552926526e-06,
"loss": 0.0497,
"step": 172
},
{
"epoch": 0.21544209215442092,
"grad_norm": 1.0115067958831787,
"learning_rate": 5.323785803237858e-06,
"loss": 0.0211,
"step": 173
},
{
"epoch": 0.21668742216687423,
"grad_norm": Infinity,
"learning_rate": 5.323785803237858e-06,
"loss": 0.6843,
"step": 174
},
{
"epoch": 0.21793275217932753,
"grad_norm": 2.7913990020751953,
"learning_rate": 5.3549190535491905e-06,
"loss": 0.03,
"step": 175
},
{
"epoch": 0.2191780821917808,
"grad_norm": 14.680956840515137,
"learning_rate": 5.386052303860523e-06,
"loss": 0.0976,
"step": 176
},
{
"epoch": 0.22042341220423411,
"grad_norm": 5.276736736297607,
"learning_rate": 5.417185554171856e-06,
"loss": 0.0715,
"step": 177
},
{
"epoch": 0.22166874221668742,
"grad_norm": 2.4684441089630127,
"learning_rate": 5.448318804483188e-06,
"loss": 0.0288,
"step": 178
},
{
"epoch": 0.22291407222914073,
"grad_norm": 1.0922425985336304,
"learning_rate": 5.479452054794521e-06,
"loss": 0.0211,
"step": 179
},
{
"epoch": 0.22415940224159403,
"grad_norm": 9.240842819213867,
"learning_rate": 5.5105853051058535e-06,
"loss": 0.0652,
"step": 180
},
{
"epoch": 0.22540473225404734,
"grad_norm": 38.4419059753418,
"learning_rate": 5.541718555417186e-06,
"loss": 0.685,
"step": 181
},
{
"epoch": 0.2266500622665006,
"grad_norm": 15.644163131713867,
"learning_rate": 5.572851805728519e-06,
"loss": 0.4103,
"step": 182
},
{
"epoch": 0.22789539227895392,
"grad_norm": 2.4954333305358887,
"learning_rate": 5.603985056039851e-06,
"loss": 0.0449,
"step": 183
},
{
"epoch": 0.22914072229140722,
"grad_norm": 18.7884521484375,
"learning_rate": 5.635118306351184e-06,
"loss": 0.3378,
"step": 184
},
{
"epoch": 0.23038605230386053,
"grad_norm": Infinity,
"learning_rate": 5.635118306351184e-06,
"loss": 0.8211,
"step": 185
},
{
"epoch": 0.23163138231631383,
"grad_norm": 2.243523359298706,
"learning_rate": 5.666251556662516e-06,
"loss": 0.0479,
"step": 186
},
{
"epoch": 0.2328767123287671,
"grad_norm": 3.3581135272979736,
"learning_rate": 5.697384806973848e-06,
"loss": 0.0505,
"step": 187
},
{
"epoch": 0.23412204234122042,
"grad_norm": 1.6243762969970703,
"learning_rate": 5.728518057285181e-06,
"loss": 0.0287,
"step": 188
},
{
"epoch": 0.23536737235367372,
"grad_norm": 55.31060791015625,
"learning_rate": 5.759651307596513e-06,
"loss": 0.2187,
"step": 189
},
{
"epoch": 0.23661270236612703,
"grad_norm": 0.3759680986404419,
"learning_rate": 5.790784557907846e-06,
"loss": 0.0085,
"step": 190
},
{
"epoch": 0.23785803237858033,
"grad_norm": 10.535552978515625,
"learning_rate": 5.821917808219178e-06,
"loss": 0.1855,
"step": 191
},
{
"epoch": 0.23910336239103364,
"grad_norm": 11.76515007019043,
"learning_rate": 5.85305105853051e-06,
"loss": 0.0808,
"step": 192
},
{
"epoch": 0.2403486924034869,
"grad_norm": 16.85251808166504,
"learning_rate": 5.884184308841843e-06,
"loss": 0.2412,
"step": 193
},
{
"epoch": 0.24159402241594022,
"grad_norm": 0.46440303325653076,
"learning_rate": 5.9153175591531755e-06,
"loss": 0.008,
"step": 194
},
{
"epoch": 0.24283935242839352,
"grad_norm": 0.7289634943008423,
"learning_rate": 5.946450809464509e-06,
"loss": 0.013,
"step": 195
},
{
"epoch": 0.24408468244084683,
"grad_norm": 11.138826370239258,
"learning_rate": 5.9775840597758416e-06,
"loss": 0.1779,
"step": 196
},
{
"epoch": 0.24533001245330013,
"grad_norm": 1.223634123802185,
"learning_rate": 6.008717310087173e-06,
"loss": 0.0177,
"step": 197
},
{
"epoch": 0.2465753424657534,
"grad_norm": 3.939805507659912,
"learning_rate": 6.039850560398506e-06,
"loss": 0.0818,
"step": 198
},
{
"epoch": 0.24782067247820672,
"grad_norm": 137.29930114746094,
"learning_rate": 6.0709838107098385e-06,
"loss": 3.1221,
"step": 199
},
{
"epoch": 0.24906600249066002,
"grad_norm": 3.8515782356262207,
"learning_rate": 6.102117061021171e-06,
"loss": 0.0835,
"step": 200
},
{
"epoch": 0.2503113325031133,
"grad_norm": 1.5677456855773926,
"learning_rate": 6.133250311332504e-06,
"loss": 0.0312,
"step": 201
},
{
"epoch": 0.25155666251556663,
"grad_norm": 1.6086269617080688,
"learning_rate": 6.1643835616438354e-06,
"loss": 0.0299,
"step": 202
},
{
"epoch": 0.25280199252801994,
"grad_norm": 0.9720219969749451,
"learning_rate": 6.195516811955168e-06,
"loss": 0.0152,
"step": 203
},
{
"epoch": 0.25404732254047324,
"grad_norm": 29.63043212890625,
"learning_rate": 6.226650062266501e-06,
"loss": 0.1063,
"step": 204
},
{
"epoch": 0.25529265255292655,
"grad_norm": 0.7106034159660339,
"learning_rate": 6.257783312577833e-06,
"loss": 0.0128,
"step": 205
},
{
"epoch": 0.25653798256537985,
"grad_norm": 0.7417896389961243,
"learning_rate": 6.288916562889166e-06,
"loss": 0.0138,
"step": 206
},
{
"epoch": 0.2577833125778331,
"grad_norm": 2.157313823699951,
"learning_rate": 6.3200498132004984e-06,
"loss": 0.0267,
"step": 207
},
{
"epoch": 0.2590286425902864,
"grad_norm": 0.8388156294822693,
"learning_rate": 6.351183063511831e-06,
"loss": 0.0125,
"step": 208
},
{
"epoch": 0.2602739726027397,
"grad_norm": 0.33427631855010986,
"learning_rate": 6.382316313823164e-06,
"loss": 0.0067,
"step": 209
},
{
"epoch": 0.261519302615193,
"grad_norm": 0.7715888023376465,
"learning_rate": 6.413449564134496e-06,
"loss": 0.0112,
"step": 210
},
{
"epoch": 0.2627646326276463,
"grad_norm": 0.23136259615421295,
"learning_rate": 6.444582814445828e-06,
"loss": 0.0052,
"step": 211
},
{
"epoch": 0.2640099626400996,
"grad_norm": 149.45394897460938,
"learning_rate": 6.4757160647571606e-06,
"loss": 0.3285,
"step": 212
},
{
"epoch": 0.26525529265255293,
"grad_norm": 2.4453482627868652,
"learning_rate": 6.506849315068493e-06,
"loss": 0.0472,
"step": 213
},
{
"epoch": 0.26650062266500624,
"grad_norm": 2.4057695865631104,
"learning_rate": 6.537982565379826e-06,
"loss": 0.033,
"step": 214
},
{
"epoch": 0.26774595267745954,
"grad_norm": 0.2910887598991394,
"learning_rate": 6.569115815691158e-06,
"loss": 0.0054,
"step": 215
},
{
"epoch": 0.26899128268991285,
"grad_norm": 0.9707146286964417,
"learning_rate": 6.600249066002491e-06,
"loss": 0.0173,
"step": 216
},
{
"epoch": 0.27023661270236615,
"grad_norm": 0.2008867859840393,
"learning_rate": 6.6313823163138235e-06,
"loss": 0.0038,
"step": 217
},
{
"epoch": 0.2714819427148194,
"grad_norm": 1.5367100238800049,
"learning_rate": 6.662515566625156e-06,
"loss": 0.0185,
"step": 218
},
{
"epoch": 0.2727272727272727,
"grad_norm": 0.5055931806564331,
"learning_rate": 6.693648816936489e-06,
"loss": 0.0066,
"step": 219
},
{
"epoch": 0.273972602739726,
"grad_norm": 0.4430530071258545,
"learning_rate": 6.724782067247821e-06,
"loss": 0.0062,
"step": 220
},
{
"epoch": 0.2752179327521793,
"grad_norm": 2.2975895404815674,
"learning_rate": 6.755915317559154e-06,
"loss": 0.01,
"step": 221
},
{
"epoch": 0.2764632627646326,
"grad_norm": 0.8265185952186584,
"learning_rate": 6.787048567870486e-06,
"loss": 0.0136,
"step": 222
},
{
"epoch": 0.2777085927770859,
"grad_norm": 168.16004943847656,
"learning_rate": 6.818181818181818e-06,
"loss": 2.9077,
"step": 223
},
{
"epoch": 0.27895392278953923,
"grad_norm": 0.7623637318611145,
"learning_rate": 6.849315068493151e-06,
"loss": 0.0124,
"step": 224
},
{
"epoch": 0.28019925280199254,
"grad_norm": 0.5590365529060364,
"learning_rate": 6.8804483188044835e-06,
"loss": 0.0115,
"step": 225
},
{
"epoch": 0.28144458281444584,
"grad_norm": 0.36643216013908386,
"learning_rate": 6.911581569115816e-06,
"loss": 0.005,
"step": 226
},
{
"epoch": 0.28268991282689915,
"grad_norm": 0.33054330945014954,
"learning_rate": 6.942714819427149e-06,
"loss": 0.0077,
"step": 227
},
{
"epoch": 0.28393524283935245,
"grad_norm": 0.34179171919822693,
"learning_rate": 6.973848069738481e-06,
"loss": 0.0077,
"step": 228
},
{
"epoch": 0.2851805728518057,
"grad_norm": 7.439018726348877,
"learning_rate": 7.004981320049814e-06,
"loss": 0.0183,
"step": 229
},
{
"epoch": 0.286425902864259,
"grad_norm": 0.4672091603279114,
"learning_rate": 7.0361145703611465e-06,
"loss": 0.0088,
"step": 230
},
{
"epoch": 0.2876712328767123,
"grad_norm": 43.73134994506836,
"learning_rate": 7.067247820672479e-06,
"loss": 0.0645,
"step": 231
},
{
"epoch": 0.2889165628891656,
"grad_norm": 0.5883788466453552,
"learning_rate": 7.098381070983812e-06,
"loss": 0.0077,
"step": 232
},
{
"epoch": 0.2901618929016189,
"grad_norm": 0.11801683157682419,
"learning_rate": 7.1295143212951425e-06,
"loss": 0.0025,
"step": 233
},
{
"epoch": 0.29140722291407223,
"grad_norm": 0.4613223671913147,
"learning_rate": 7.160647571606475e-06,
"loss": 0.0061,
"step": 234
},
{
"epoch": 0.29265255292652553,
"grad_norm": 0.46132174134254456,
"learning_rate": 7.191780821917809e-06,
"loss": 0.0054,
"step": 235
},
{
"epoch": 0.29389788293897884,
"grad_norm": Infinity,
"learning_rate": 7.191780821917809e-06,
"loss": 0.4395,
"step": 236
},
{
"epoch": 0.29514321295143214,
"grad_norm": 0.17022739350795746,
"learning_rate": 7.222914072229141e-06,
"loss": 0.0041,
"step": 237
},
{
"epoch": 0.29638854296388545,
"grad_norm": 0.10204841196537018,
"learning_rate": 7.254047322540474e-06,
"loss": 0.0025,
"step": 238
},
{
"epoch": 0.29763387297633875,
"grad_norm": 0.21153950691223145,
"learning_rate": 7.285180572851806e-06,
"loss": 0.0037,
"step": 239
},
{
"epoch": 0.298879202988792,
"grad_norm": 0.15493176877498627,
"learning_rate": 7.316313823163139e-06,
"loss": 0.003,
"step": 240
},
{
"epoch": 0.3001245330012453,
"grad_norm": 0.24285216629505157,
"learning_rate": 7.3474470734744716e-06,
"loss": 0.0049,
"step": 241
},
{
"epoch": 0.3013698630136986,
"grad_norm": 0.19606204330921173,
"learning_rate": 7.378580323785804e-06,
"loss": 0.0031,
"step": 242
},
{
"epoch": 0.3026151930261519,
"grad_norm": 2.727463483810425,
"learning_rate": 7.409713574097137e-06,
"loss": 0.0078,
"step": 243
},
{
"epoch": 0.3038605230386052,
"grad_norm": 0.1808951050043106,
"learning_rate": 7.440846824408469e-06,
"loss": 0.0039,
"step": 244
},
{
"epoch": 0.30510585305105853,
"grad_norm": 0.24642078578472137,
"learning_rate": 7.4719800747198e-06,
"loss": 0.0047,
"step": 245
},
{
"epoch": 0.30635118306351183,
"grad_norm": 0.10990118980407715,
"learning_rate": 7.503113325031133e-06,
"loss": 0.0021,
"step": 246
},
{
"epoch": 0.30759651307596514,
"grad_norm": 0.08530181646347046,
"learning_rate": 7.5342465753424655e-06,
"loss": 0.0022,
"step": 247
},
{
"epoch": 0.30884184308841844,
"grad_norm": 1.499770998954773,
"learning_rate": 7.565379825653798e-06,
"loss": 0.0047,
"step": 248
},
{
"epoch": 0.31008717310087175,
"grad_norm": 0.08772747963666916,
"learning_rate": 7.596513075965131e-06,
"loss": 0.0026,
"step": 249
},
{
"epoch": 0.31133250311332505,
"grad_norm": 0.38723257184028625,
"learning_rate": 7.627646326276463e-06,
"loss": 0.0045,
"step": 250
},
{
"epoch": 0.3125778331257783,
"grad_norm": 0.09018506854772568,
"learning_rate": 7.658779576587797e-06,
"loss": 0.002,
"step": 251
},
{
"epoch": 0.3138231631382316,
"grad_norm": 3.251638650894165,
"learning_rate": 7.689912826899128e-06,
"loss": 0.0073,
"step": 252
},
{
"epoch": 0.3150684931506849,
"grad_norm": 0.17742273211479187,
"learning_rate": 7.721046077210462e-06,
"loss": 0.0034,
"step": 253
},
{
"epoch": 0.3163138231631382,
"grad_norm": 4.7799201011657715,
"learning_rate": 7.752179327521794e-06,
"loss": 0.0149,
"step": 254
},
{
"epoch": 0.3175591531755915,
"grad_norm": 0.7822676301002502,
"learning_rate": 7.783312577833127e-06,
"loss": 0.0043,
"step": 255
},
{
"epoch": 0.31880448318804483,
"grad_norm": 0.07635273039340973,
"learning_rate": 7.814445828144457e-06,
"loss": 0.0019,
"step": 256
},
{
"epoch": 0.32004981320049813,
"grad_norm": 0.128676638007164,
"learning_rate": 7.84557907845579e-06,
"loss": 0.0031,
"step": 257
},
{
"epoch": 0.32129514321295144,
"grad_norm": 0.35170984268188477,
"learning_rate": 7.876712328767124e-06,
"loss": 0.0034,
"step": 258
},
{
"epoch": 0.32254047322540474,
"grad_norm": 0.17562495172023773,
"learning_rate": 7.907845579078456e-06,
"loss": 0.0036,
"step": 259
},
{
"epoch": 0.32378580323785805,
"grad_norm": 0.4719379246234894,
"learning_rate": 7.93897882938979e-06,
"loss": 0.0052,
"step": 260
},
{
"epoch": 0.32503113325031135,
"grad_norm": 1.012569546699524,
"learning_rate": 7.970112079701121e-06,
"loss": 0.0034,
"step": 261
},
{
"epoch": 0.32627646326276466,
"grad_norm": 0.6060551404953003,
"learning_rate": 8.001245330012454e-06,
"loss": 0.0033,
"step": 262
},
{
"epoch": 0.3275217932752179,
"grad_norm": 0.04582296311855316,
"learning_rate": 8.032378580323786e-06,
"loss": 0.0012,
"step": 263
},
{
"epoch": 0.3287671232876712,
"grad_norm": 0.109385184943676,
"learning_rate": 8.06351183063512e-06,
"loss": 0.0023,
"step": 264
},
{
"epoch": 0.3300124533001245,
"grad_norm": 0.056446850299835205,
"learning_rate": 8.094645080946451e-06,
"loss": 0.0013,
"step": 265
},
{
"epoch": 0.3312577833125778,
"grad_norm": 0.10354617983102798,
"learning_rate": 8.125778331257785e-06,
"loss": 0.002,
"step": 266
},
{
"epoch": 0.33250311332503113,
"grad_norm": 0.14216098189353943,
"learning_rate": 8.156911581569117e-06,
"loss": 0.0029,
"step": 267
},
{
"epoch": 0.33374844333748444,
"grad_norm": 0.07656246423721313,
"learning_rate": 8.188044831880448e-06,
"loss": 0.0018,
"step": 268
},
{
"epoch": 0.33499377334993774,
"grad_norm": 0.2349928468465805,
"learning_rate": 8.21917808219178e-06,
"loss": 0.0034,
"step": 269
},
{
"epoch": 0.33623910336239105,
"grad_norm": 0.1743057817220688,
"learning_rate": 8.250311332503113e-06,
"loss": 0.0041,
"step": 270
},
{
"epoch": 0.33748443337484435,
"grad_norm": 0.05078033730387688,
"learning_rate": 8.281444582814445e-06,
"loss": 0.0015,
"step": 271
},
{
"epoch": 0.33872976338729766,
"grad_norm": 0.12597429752349854,
"learning_rate": 8.312577833125779e-06,
"loss": 0.0032,
"step": 272
},
{
"epoch": 0.33997509339975096,
"grad_norm": 0.09458588808774948,
"learning_rate": 8.343711083437112e-06,
"loss": 0.002,
"step": 273
},
{
"epoch": 0.3412204234122042,
"grad_norm": 0.20183101296424866,
"learning_rate": 8.374844333748444e-06,
"loss": 0.0043,
"step": 274
},
{
"epoch": 0.3424657534246575,
"grad_norm": 0.16585314273834229,
"learning_rate": 8.405977584059777e-06,
"loss": 0.0026,
"step": 275
},
{
"epoch": 0.3437110834371108,
"grad_norm": 0.05950070172548294,
"learning_rate": 8.437110834371109e-06,
"loss": 0.0018,
"step": 276
},
{
"epoch": 0.3449564134495641,
"grad_norm": 0.062412526458501816,
"learning_rate": 8.468244084682442e-06,
"loss": 0.0017,
"step": 277
},
{
"epoch": 0.34620174346201743,
"grad_norm": 297.8834533691406,
"learning_rate": 8.499377334993774e-06,
"loss": 2.7641,
"step": 278
},
{
"epoch": 0.34744707347447074,
"grad_norm": 0.18788257241249084,
"learning_rate": 8.530510585305106e-06,
"loss": 0.0031,
"step": 279
},
{
"epoch": 0.34869240348692404,
"grad_norm": 0.05538473278284073,
"learning_rate": 8.561643835616438e-06,
"loss": 0.0014,
"step": 280
},
{
"epoch": 0.34993773349937735,
"grad_norm": 0.05929434299468994,
"learning_rate": 8.592777085927771e-06,
"loss": 0.0015,
"step": 281
},
{
"epoch": 0.35118306351183065,
"grad_norm": 0.15558889508247375,
"learning_rate": 8.623910336239103e-06,
"loss": 0.0032,
"step": 282
},
{
"epoch": 0.35242839352428396,
"grad_norm": 0.0714510902762413,
"learning_rate": 8.655043586550436e-06,
"loss": 0.002,
"step": 283
},
{
"epoch": 0.35367372353673726,
"grad_norm": 2.3466129302978516,
"learning_rate": 8.686176836861768e-06,
"loss": 0.0066,
"step": 284
},
{
"epoch": 0.3549190535491905,
"grad_norm": 17.250829696655273,
"learning_rate": 8.717310087173102e-06,
"loss": 0.0224,
"step": 285
},
{
"epoch": 0.3561643835616438,
"grad_norm": 0.03599457070231438,
"learning_rate": 8.748443337484433e-06,
"loss": 0.0011,
"step": 286
},
{
"epoch": 0.3574097135740971,
"grad_norm": 0.05941268801689148,
"learning_rate": 8.779576587795767e-06,
"loss": 0.0019,
"step": 287
},
{
"epoch": 0.3586550435865504,
"grad_norm": 1.2639917135238647,
"learning_rate": 8.810709838107099e-06,
"loss": 0.0044,
"step": 288
},
{
"epoch": 0.35990037359900373,
"grad_norm": 0.04103681072592735,
"learning_rate": 8.841843088418432e-06,
"loss": 0.001,
"step": 289
},
{
"epoch": 0.36114570361145704,
"grad_norm": 0.03893645480275154,
"learning_rate": 8.872976338729764e-06,
"loss": 0.001,
"step": 290
},
{
"epoch": 0.36239103362391034,
"grad_norm": 0.038509551435709,
"learning_rate": 8.904109589041095e-06,
"loss": 0.0009,
"step": 291
},
{
"epoch": 0.36363636363636365,
"grad_norm": 0.03188912197947502,
"learning_rate": 8.935242839352429e-06,
"loss": 0.001,
"step": 292
},
{
"epoch": 0.36488169364881695,
"grad_norm": 0.048545584082603455,
"learning_rate": 8.96637608966376e-06,
"loss": 0.0011,
"step": 293
},
{
"epoch": 0.36612702366127026,
"grad_norm": 0.0602889247238636,
"learning_rate": 8.997509339975094e-06,
"loss": 0.0015,
"step": 294
},
{
"epoch": 0.36737235367372356,
"grad_norm": 0.05375710129737854,
"learning_rate": 9.028642590286426e-06,
"loss": 0.0016,
"step": 295
},
{
"epoch": 0.3686176836861768,
"grad_norm": 0.043809376657009125,
"learning_rate": 9.05977584059776e-06,
"loss": 0.0012,
"step": 296
},
{
"epoch": 0.3698630136986301,
"grad_norm": 0.0780409425497055,
"learning_rate": 9.090909090909091e-06,
"loss": 0.0022,
"step": 297
},
{
"epoch": 0.3711083437110834,
"grad_norm": 0.06276142597198486,
"learning_rate": 9.122042341220424e-06,
"loss": 0.0017,
"step": 298
},
{
"epoch": 0.3723536737235367,
"grad_norm": 0.060071829706430435,
"learning_rate": 9.153175591531756e-06,
"loss": 0.0014,
"step": 299
},
{
"epoch": 0.37359900373599003,
"grad_norm": 0.032719388604164124,
"learning_rate": 9.18430884184309e-06,
"loss": 0.0007,
"step": 300
},
{
"epoch": 0.37484433374844334,
"grad_norm": 0.034909844398498535,
"learning_rate": 9.215442092154421e-06,
"loss": 0.001,
"step": 301
},
{
"epoch": 0.37608966376089664,
"grad_norm": 0.034523140639066696,
"learning_rate": 9.246575342465753e-06,
"loss": 0.0011,
"step": 302
},
{
"epoch": 0.37733499377334995,
"grad_norm": 0.05015862360596657,
"learning_rate": 9.277708592777087e-06,
"loss": 0.0013,
"step": 303
},
{
"epoch": 0.37858032378580325,
"grad_norm": 0.05602340027689934,
"learning_rate": 9.308841843088418e-06,
"loss": 0.0016,
"step": 304
},
{
"epoch": 0.37982565379825656,
"grad_norm": 0.04742440581321716,
"learning_rate": 9.339975093399752e-06,
"loss": 0.0014,
"step": 305
},
{
"epoch": 0.38107098381070986,
"grad_norm": 0.03035055100917816,
"learning_rate": 9.371108343711084e-06,
"loss": 0.0009,
"step": 306
},
{
"epoch": 0.3823163138231631,
"grad_norm": 241.25111389160156,
"learning_rate": 9.402241594022417e-06,
"loss": 0.1876,
"step": 307
},
{
"epoch": 0.3835616438356164,
"grad_norm": 0.03797473758459091,
"learning_rate": 9.433374844333749e-06,
"loss": 0.001,
"step": 308
},
{
"epoch": 0.3848069738480697,
"grad_norm": 0.03934524580836296,
"learning_rate": 9.464508094645082e-06,
"loss": 0.001,
"step": 309
},
{
"epoch": 0.386052303860523,
"grad_norm": 0.04892684891819954,
"learning_rate": 9.495641344956414e-06,
"loss": 0.0013,
"step": 310
},
{
"epoch": 0.38729763387297633,
"grad_norm": 0.06903809309005737,
"learning_rate": 9.526774595267747e-06,
"loss": 0.0018,
"step": 311
},
{
"epoch": 0.38854296388542964,
"grad_norm": 0.17654924094676971,
"learning_rate": 9.557907845579077e-06,
"loss": 0.0018,
"step": 312
},
{
"epoch": 0.38978829389788294,
"grad_norm": 0.047983210533857346,
"learning_rate": 9.589041095890411e-06,
"loss": 0.001,
"step": 313
},
{
"epoch": 0.39103362391033625,
"grad_norm": 0.0729343593120575,
"learning_rate": 9.620174346201744e-06,
"loss": 0.0018,
"step": 314
},
{
"epoch": 0.39227895392278955,
"grad_norm": 0.025607705116271973,
"learning_rate": 9.651307596513076e-06,
"loss": 0.0007,
"step": 315
},
{
"epoch": 0.39352428393524286,
"grad_norm": 0.0369686633348465,
"learning_rate": 9.68244084682441e-06,
"loss": 0.001,
"step": 316
},
{
"epoch": 0.39476961394769616,
"grad_norm": 0.03150925785303116,
"learning_rate": 9.713574097135741e-06,
"loss": 0.001,
"step": 317
},
{
"epoch": 0.3960149439601494,
"grad_norm": 537.4097900390625,
"learning_rate": 9.744707347447075e-06,
"loss": 0.9077,
"step": 318
},
{
"epoch": 0.3972602739726027,
"grad_norm": 0.036139559000730515,
"learning_rate": 9.775840597758406e-06,
"loss": 0.0011,
"step": 319
},
{
"epoch": 0.398505603985056,
"grad_norm": 0.10030055046081543,
"learning_rate": 9.80697384806974e-06,
"loss": 0.0019,
"step": 320
},
{
"epoch": 0.39975093399750933,
"grad_norm": 0.20713728666305542,
"learning_rate": 9.838107098381072e-06,
"loss": 0.0013,
"step": 321
},
{
"epoch": 0.40099626400996263,
"grad_norm": 0.21006031334400177,
"learning_rate": 9.869240348692405e-06,
"loss": 0.0021,
"step": 322
},
{
"epoch": 0.40224159402241594,
"grad_norm": 409.08544921875,
"learning_rate": 9.900373599003735e-06,
"loss": 1.8641,
"step": 323
},
{
"epoch": 0.40348692403486924,
"grad_norm": 0.04977629333734512,
"learning_rate": 9.931506849315069e-06,
"loss": 0.0012,
"step": 324
},
{
"epoch": 0.40473225404732255,
"grad_norm": 0.06899397075176239,
"learning_rate": 9.9626400996264e-06,
"loss": 0.0011,
"step": 325
},
{
"epoch": 0.40597758405977585,
"grad_norm": 0.3704112470149994,
"learning_rate": 9.993773349937734e-06,
"loss": 0.0014,
"step": 326
},
{
"epoch": 0.40722291407222916,
"grad_norm": 0.03436332195997238,
"learning_rate": 1.0024906600249066e-05,
"loss": 0.0011,
"step": 327
},
{
"epoch": 0.40846824408468246,
"grad_norm": 0.03816661238670349,
"learning_rate": 1.0056039850560399e-05,
"loss": 0.0009,
"step": 328
},
{
"epoch": 0.40971357409713577,
"grad_norm": 0.053675808012485504,
"learning_rate": 1.008717310087173e-05,
"loss": 0.0014,
"step": 329
},
{
"epoch": 0.410958904109589,
"grad_norm": 0.024651149287819862,
"learning_rate": 1.0118306351183064e-05,
"loss": 0.0007,
"step": 330
},
{
"epoch": 0.4122042341220423,
"grad_norm": 0.03284426033496857,
"learning_rate": 1.0149439601494398e-05,
"loss": 0.001,
"step": 331
},
{
"epoch": 0.41344956413449563,
"grad_norm": 0.03643254190683365,
"learning_rate": 1.018057285180573e-05,
"loss": 0.0011,
"step": 332
},
{
"epoch": 0.41469489414694893,
"grad_norm": 0.02989336848258972,
"learning_rate": 1.0211706102117063e-05,
"loss": 0.0008,
"step": 333
},
{
"epoch": 0.41594022415940224,
"grad_norm": 0.020424343645572662,
"learning_rate": 1.0242839352428395e-05,
"loss": 0.0007,
"step": 334
},
{
"epoch": 0.41718555417185554,
"grad_norm": 0.03185396268963814,
"learning_rate": 1.0273972602739726e-05,
"loss": 0.0009,
"step": 335
},
{
"epoch": 0.41843088418430885,
"grad_norm": 0.022784588858485222,
"learning_rate": 1.0305105853051058e-05,
"loss": 0.0006,
"step": 336
},
{
"epoch": 0.41967621419676215,
"grad_norm": 0.1662231832742691,
"learning_rate": 1.0336239103362392e-05,
"loss": 0.0018,
"step": 337
},
{
"epoch": 0.42092154420921546,
"grad_norm": 0.05111798271536827,
"learning_rate": 1.0367372353673723e-05,
"loss": 0.0014,
"step": 338
},
{
"epoch": 0.42216687422166876,
"grad_norm": 0.024023687466979027,
"learning_rate": 1.0398505603985057e-05,
"loss": 0.0007,
"step": 339
},
{
"epoch": 0.42341220423412207,
"grad_norm": 0.07146386057138443,
"learning_rate": 1.0429638854296388e-05,
"loss": 0.0019,
"step": 340
},
{
"epoch": 0.4246575342465753,
"grad_norm": 0.01847468502819538,
"learning_rate": 1.0460772104607722e-05,
"loss": 0.0006,
"step": 341
},
{
"epoch": 0.4259028642590286,
"grad_norm": 0.11909367889165878,
"learning_rate": 1.0491905354919054e-05,
"loss": 0.0009,
"step": 342
},
{
"epoch": 0.42714819427148193,
"grad_norm": 0.07260438799858093,
"learning_rate": 1.0523038605230387e-05,
"loss": 0.002,
"step": 343
},
{
"epoch": 0.42839352428393523,
"grad_norm": 113.6898193359375,
"learning_rate": 1.0554171855541719e-05,
"loss": 0.0637,
"step": 344
},
{
"epoch": 0.42963885429638854,
"grad_norm": 0.018576975911855698,
"learning_rate": 1.0585305105853052e-05,
"loss": 0.0006,
"step": 345
},
{
"epoch": 0.43088418430884184,
"grad_norm": 0.03654215857386589,
"learning_rate": 1.0616438356164384e-05,
"loss": 0.0007,
"step": 346
},
{
"epoch": 0.43212951432129515,
"grad_norm": 0.025475049391388893,
"learning_rate": 1.0647571606475716e-05,
"loss": 0.0007,
"step": 347
},
{
"epoch": 0.43337484433374845,
"grad_norm": 0.02617563307285309,
"learning_rate": 1.067870485678705e-05,
"loss": 0.0008,
"step": 348
},
{
"epoch": 0.43462017434620176,
"grad_norm": 0.07997260987758636,
"learning_rate": 1.0709838107098381e-05,
"loss": 0.0016,
"step": 349
},
{
"epoch": 0.43586550435865506,
"grad_norm": 0.020727328956127167,
"learning_rate": 1.0740971357409714e-05,
"loss": 0.0007,
"step": 350
},
{
"epoch": 0.43711083437110837,
"grad_norm": 0.02753385342657566,
"learning_rate": 1.0772104607721046e-05,
"loss": 0.0007,
"step": 351
},
{
"epoch": 0.4383561643835616,
"grad_norm": 0.04742880165576935,
"learning_rate": 1.080323785803238e-05,
"loss": 0.0009,
"step": 352
},
{
"epoch": 0.4396014943960149,
"grad_norm": 0.03920525684952736,
"learning_rate": 1.0834371108343711e-05,
"loss": 0.0011,
"step": 353
},
{
"epoch": 0.44084682440846823,
"grad_norm": 0.04735913872718811,
"learning_rate": 1.0865504358655045e-05,
"loss": 0.0012,
"step": 354
},
{
"epoch": 0.44209215442092153,
"grad_norm": 0.028404802083969116,
"learning_rate": 1.0896637608966377e-05,
"loss": 0.0009,
"step": 355
},
{
"epoch": 0.44333748443337484,
"grad_norm": 0.02533857710659504,
"learning_rate": 1.092777085927771e-05,
"loss": 0.0006,
"step": 356
},
{
"epoch": 0.44458281444582815,
"grad_norm": 0.04108303785324097,
"learning_rate": 1.0958904109589042e-05,
"loss": 0.0013,
"step": 357
},
{
"epoch": 0.44582814445828145,
"grad_norm": 0.03464365378022194,
"learning_rate": 1.0990037359900373e-05,
"loss": 0.0009,
"step": 358
},
{
"epoch": 0.44707347447073476,
"grad_norm": 0.030825745314359665,
"learning_rate": 1.1021170610211707e-05,
"loss": 0.0008,
"step": 359
},
{
"epoch": 0.44831880448318806,
"grad_norm": 0.04480734467506409,
"learning_rate": 1.1052303860523039e-05,
"loss": 0.0012,
"step": 360
},
{
"epoch": 0.44956413449564137,
"grad_norm": 0.02541348710656166,
"learning_rate": 1.1083437110834372e-05,
"loss": 0.0008,
"step": 361
},
{
"epoch": 0.45080946450809467,
"grad_norm": 0.02149001508951187,
"learning_rate": 1.1114570361145704e-05,
"loss": 0.0006,
"step": 362
},
{
"epoch": 0.4520547945205479,
"grad_norm": 0.05121343955397606,
"learning_rate": 1.1145703611457037e-05,
"loss": 0.0015,
"step": 363
},
{
"epoch": 0.4533001245330012,
"grad_norm": 0.022881271317601204,
"learning_rate": 1.1176836861768369e-05,
"loss": 0.0007,
"step": 364
},
{
"epoch": 0.45454545454545453,
"grad_norm": 0.029813582077622414,
"learning_rate": 1.1207970112079703e-05,
"loss": 0.0007,
"step": 365
},
{
"epoch": 0.45579078455790784,
"grad_norm": 0.0214352048933506,
"learning_rate": 1.1239103362391034e-05,
"loss": 0.0007,
"step": 366
},
{
"epoch": 0.45703611457036114,
"grad_norm": 0.04457417130470276,
"learning_rate": 1.1270236612702368e-05,
"loss": 0.0008,
"step": 367
},
{
"epoch": 0.45828144458281445,
"grad_norm": 0.019106173887848854,
"learning_rate": 1.1301369863013698e-05,
"loss": 0.0006,
"step": 368
},
{
"epoch": 0.45952677459526775,
"grad_norm": 0.022846408188343048,
"learning_rate": 1.1332503113325031e-05,
"loss": 0.0006,
"step": 369
},
{
"epoch": 0.46077210460772106,
"grad_norm": 0.018946994096040726,
"learning_rate": 1.1363636363636365e-05,
"loss": 0.0006,
"step": 370
},
{
"epoch": 0.46201743462017436,
"grad_norm": 0.021404925733804703,
"learning_rate": 1.1394769613947696e-05,
"loss": 0.0006,
"step": 371
},
{
"epoch": 0.46326276463262767,
"grad_norm": 0.01195521280169487,
"learning_rate": 1.142590286425903e-05,
"loss": 0.0004,
"step": 372
},
{
"epoch": 0.46450809464508097,
"grad_norm": 0.03864084184169769,
"learning_rate": 1.1457036114570362e-05,
"loss": 0.001,
"step": 373
},
{
"epoch": 0.4657534246575342,
"grad_norm": 0.058303095400333405,
"learning_rate": 1.1488169364881695e-05,
"loss": 0.0012,
"step": 374
},
{
"epoch": 0.4669987546699875,
"grad_norm": 0.013412773609161377,
"learning_rate": 1.1519302615193027e-05,
"loss": 0.0004,
"step": 375
},
{
"epoch": 0.46824408468244083,
"grad_norm": 0.02416684851050377,
"learning_rate": 1.155043586550436e-05,
"loss": 0.0007,
"step": 376
},
{
"epoch": 0.46948941469489414,
"grad_norm": 0.016587672755122185,
"learning_rate": 1.1581569115815692e-05,
"loss": 0.0005,
"step": 377
},
{
"epoch": 0.47073474470734744,
"grad_norm": 0.020129237323999405,
"learning_rate": 1.1612702366127025e-05,
"loss": 0.0006,
"step": 378
},
{
"epoch": 0.47198007471980075,
"grad_norm": 0.2290887087583542,
"learning_rate": 1.1643835616438355e-05,
"loss": 0.0013,
"step": 379
},
{
"epoch": 0.47322540473225405,
"grad_norm": 0.0186260174959898,
"learning_rate": 1.1674968866749689e-05,
"loss": 0.0006,
"step": 380
},
{
"epoch": 0.47447073474470736,
"grad_norm": 0.03915928676724434,
"learning_rate": 1.170610211706102e-05,
"loss": 0.0009,
"step": 381
},
{
"epoch": 0.47571606475716066,
"grad_norm": 0.024174867197871208,
"learning_rate": 1.1737235367372354e-05,
"loss": 0.0006,
"step": 382
},
{
"epoch": 0.47696139476961397,
"grad_norm": 0.06258780509233475,
"learning_rate": 1.1768368617683686e-05,
"loss": 0.0012,
"step": 383
},
{
"epoch": 0.47820672478206727,
"grad_norm": 0.0187270175665617,
"learning_rate": 1.179950186799502e-05,
"loss": 0.0006,
"step": 384
},
{
"epoch": 0.4794520547945205,
"grad_norm": 0.036254920065402985,
"learning_rate": 1.1830635118306351e-05,
"loss": 0.0011,
"step": 385
},
{
"epoch": 0.4806973848069738,
"grad_norm": 0.04100683704018593,
"learning_rate": 1.1861768368617684e-05,
"loss": 0.0008,
"step": 386
},
{
"epoch": 0.48194271481942713,
"grad_norm": 0.023180831223726273,
"learning_rate": 1.1892901618929018e-05,
"loss": 0.0007,
"step": 387
},
{
"epoch": 0.48318804483188044,
"grad_norm": 36.136348724365234,
"learning_rate": 1.192403486924035e-05,
"loss": 4.5358,
"step": 388
},
{
"epoch": 0.48443337484433374,
"grad_norm": 0.06236216425895691,
"learning_rate": 1.1955168119551683e-05,
"loss": 0.0013,
"step": 389
},
{
"epoch": 0.48567870485678705,
"grad_norm": 0.11113505810499191,
"learning_rate": 1.1986301369863013e-05,
"loss": 0.0014,
"step": 390
},
{
"epoch": 0.48692403486924035,
"grad_norm": 0.028809353709220886,
"learning_rate": 1.2017434620174347e-05,
"loss": 0.0006,
"step": 391
},
{
"epoch": 0.48816936488169366,
"grad_norm": 0.04308629035949707,
"learning_rate": 1.2048567870485678e-05,
"loss": 0.001,
"step": 392
},
{
"epoch": 0.48941469489414696,
"grad_norm": 0.03488301858305931,
"learning_rate": 1.2079701120797012e-05,
"loss": 0.001,
"step": 393
},
{
"epoch": 0.49066002490660027,
"grad_norm": 0.03795866668224335,
"learning_rate": 1.2110834371108344e-05,
"loss": 0.0009,
"step": 394
},
{
"epoch": 0.4919053549190536,
"grad_norm": 179.07867431640625,
"learning_rate": 1.2141967621419677e-05,
"loss": 0.306,
"step": 395
},
{
"epoch": 0.4931506849315068,
"grad_norm": 0.07366206496953964,
"learning_rate": 1.2173100871731009e-05,
"loss": 0.0016,
"step": 396
},
{
"epoch": 0.4943960149439601,
"grad_norm": 0.1270761936903,
"learning_rate": 1.2204234122042342e-05,
"loss": 0.0023,
"step": 397
},
{
"epoch": 0.49564134495641343,
"grad_norm": 0.1619614213705063,
"learning_rate": 1.2235367372353674e-05,
"loss": 0.0025,
"step": 398
},
{
"epoch": 0.49688667496886674,
"grad_norm": 0.027039946988224983,
"learning_rate": 1.2266500622665007e-05,
"loss": 0.0005,
"step": 399
},
{
"epoch": 0.49813200498132004,
"grad_norm": 0.012688295915722847,
"learning_rate": 1.2297633872976339e-05,
"loss": 0.0003,
"step": 400
},
{
"epoch": 0.49937733499377335,
"grad_norm": 0.04193650931119919,
"learning_rate": 1.2328767123287671e-05,
"loss": 0.001,
"step": 401
},
{
"epoch": 0.5006226650062267,
"grad_norm": 0.2457994669675827,
"learning_rate": 1.2359900373599004e-05,
"loss": 0.0033,
"step": 402
},
{
"epoch": 0.50186799501868,
"grad_norm": 0.07151038944721222,
"learning_rate": 1.2391033623910336e-05,
"loss": 0.0012,
"step": 403
},
{
"epoch": 0.5031133250311333,
"grad_norm": 0.03706686571240425,
"learning_rate": 1.242216687422167e-05,
"loss": 0.001,
"step": 404
},
{
"epoch": 0.5043586550435866,
"grad_norm": 0.03082493133842945,
"learning_rate": 1.2453300124533001e-05,
"loss": 0.0008,
"step": 405
},
{
"epoch": 0.5056039850560399,
"grad_norm": 0.02312391996383667,
"learning_rate": 1.2484433374844335e-05,
"loss": 0.0007,
"step": 406
},
{
"epoch": 0.5068493150684932,
"grad_norm": 43.44374084472656,
"learning_rate": 1.2515566625155666e-05,
"loss": 4.0239,
"step": 407
},
{
"epoch": 0.5080946450809465,
"grad_norm": 0.04549500346183777,
"learning_rate": 1.2546699875467e-05,
"loss": 0.0011,
"step": 408
},
{
"epoch": 0.5093399750933998,
"grad_norm": 0.44390103220939636,
"learning_rate": 1.2577833125778332e-05,
"loss": 0.0017,
"step": 409
},
{
"epoch": 0.5105853051058531,
"grad_norm": 0.017668342217803,
"learning_rate": 1.2608966376089665e-05,
"loss": 0.0004,
"step": 410
},
{
"epoch": 0.5118306351183064,
"grad_norm": 0.02797042578458786,
"learning_rate": 1.2640099626400997e-05,
"loss": 0.0005,
"step": 411
},
{
"epoch": 0.5130759651307597,
"grad_norm": 0.05557764694094658,
"learning_rate": 1.267123287671233e-05,
"loss": 0.0011,
"step": 412
},
{
"epoch": 0.5143212951432129,
"grad_norm": 0.028871331363916397,
"learning_rate": 1.2702366127023662e-05,
"loss": 0.0007,
"step": 413
},
{
"epoch": 0.5155666251556662,
"grad_norm": 0.04884202778339386,
"learning_rate": 1.2733499377334995e-05,
"loss": 0.001,
"step": 414
},
{
"epoch": 0.5168119551681195,
"grad_norm": 0.014481289312243462,
"learning_rate": 1.2764632627646327e-05,
"loss": 0.0004,
"step": 415
},
{
"epoch": 0.5180572851805728,
"grad_norm": 0.08000053465366364,
"learning_rate": 1.279576587795766e-05,
"loss": 0.0015,
"step": 416
},
{
"epoch": 0.5193026151930261,
"grad_norm": 0.036073487251996994,
"learning_rate": 1.2826899128268992e-05,
"loss": 0.0007,
"step": 417
},
{
"epoch": 0.5205479452054794,
"grad_norm": 0.08941499143838882,
"learning_rate": 1.2858032378580322e-05,
"loss": 0.0015,
"step": 418
},
{
"epoch": 0.5217932752179327,
"grad_norm": 0.06853260844945908,
"learning_rate": 1.2889165628891656e-05,
"loss": 0.0013,
"step": 419
},
{
"epoch": 0.523038605230386,
"grad_norm": 0.026791630312800407,
"learning_rate": 1.2920298879202988e-05,
"loss": 0.0007,
"step": 420
},
{
"epoch": 0.5242839352428393,
"grad_norm": 0.3121366500854492,
"learning_rate": 1.2951432129514321e-05,
"loss": 0.0039,
"step": 421
},
{
"epoch": 0.5255292652552926,
"grad_norm": 0.02174542099237442,
"learning_rate": 1.2982565379825653e-05,
"loss": 0.0006,
"step": 422
},
{
"epoch": 0.526774595267746,
"grad_norm": 0.053185317665338516,
"learning_rate": 1.3013698630136986e-05,
"loss": 0.0011,
"step": 423
},
{
"epoch": 0.5280199252801993,
"grad_norm": 0.033572856336832047,
"learning_rate": 1.3044831880448318e-05,
"loss": 0.0009,
"step": 424
},
{
"epoch": 0.5292652552926526,
"grad_norm": 0.0287881251424551,
"learning_rate": 1.3075965130759652e-05,
"loss": 0.0008,
"step": 425
},
{
"epoch": 0.5305105853051059,
"grad_norm": 0.029981469735503197,
"learning_rate": 1.3107098381070983e-05,
"loss": 0.0006,
"step": 426
},
{
"epoch": 0.5317559153175592,
"grad_norm": 0.028788315132260323,
"learning_rate": 1.3138231631382317e-05,
"loss": 0.0005,
"step": 427
},
{
"epoch": 0.5330012453300125,
"grad_norm": 0.021008843556046486,
"learning_rate": 1.316936488169365e-05,
"loss": 0.0005,
"step": 428
},
{
"epoch": 0.5342465753424658,
"grad_norm": 0.04118547961115837,
"learning_rate": 1.3200498132004982e-05,
"loss": 0.001,
"step": 429
},
{
"epoch": 0.5354919053549191,
"grad_norm": 0.012453455477952957,
"learning_rate": 1.3231631382316315e-05,
"loss": 0.0003,
"step": 430
},
{
"epoch": 0.5367372353673724,
"grad_norm": 0.06938812136650085,
"learning_rate": 1.3262764632627647e-05,
"loss": 0.0011,
"step": 431
},
{
"epoch": 0.5379825653798257,
"grad_norm": 0.017569739371538162,
"learning_rate": 1.329389788293898e-05,
"loss": 0.0005,
"step": 432
},
{
"epoch": 0.539227895392279,
"grad_norm": 0.026109851896762848,
"learning_rate": 1.3325031133250312e-05,
"loss": 0.0006,
"step": 433
},
{
"epoch": 0.5404732254047323,
"grad_norm": 0.015702908858656883,
"learning_rate": 1.3356164383561646e-05,
"loss": 0.0004,
"step": 434
},
{
"epoch": 0.5417185554171855,
"grad_norm": 0.025982121005654335,
"learning_rate": 1.3387297633872977e-05,
"loss": 0.0007,
"step": 435
},
{
"epoch": 0.5429638854296388,
"grad_norm": 0.06682372093200684,
"learning_rate": 1.3418430884184311e-05,
"loss": 0.0013,
"step": 436
},
{
"epoch": 0.5442092154420921,
"grad_norm": 0.016124481335282326,
"learning_rate": 1.3449564134495643e-05,
"loss": 0.0005,
"step": 437
},
{
"epoch": 0.5454545454545454,
"grad_norm": 0.018914785236120224,
"learning_rate": 1.3480697384806976e-05,
"loss": 0.0005,
"step": 438
},
{
"epoch": 0.5466998754669987,
"grad_norm": 0.01492242980748415,
"learning_rate": 1.3511830635118308e-05,
"loss": 0.0004,
"step": 439
},
{
"epoch": 0.547945205479452,
"grad_norm": 0.06164323166012764,
"learning_rate": 1.3542963885429638e-05,
"loss": 0.0011,
"step": 440
},
{
"epoch": 0.5491905354919053,
"grad_norm": 0.07254376262426376,
"learning_rate": 1.3574097135740971e-05,
"loss": 0.0015,
"step": 441
},
{
"epoch": 0.5504358655043586,
"grad_norm": 0.09924010187387466,
"learning_rate": 1.3605230386052303e-05,
"loss": 0.0019,
"step": 442
},
{
"epoch": 0.5516811955168119,
"grad_norm": 0.01098677609115839,
"learning_rate": 1.3636363636363637e-05,
"loss": 0.0003,
"step": 443
},
{
"epoch": 0.5529265255292652,
"grad_norm": 0.030665650963783264,
"learning_rate": 1.3667496886674968e-05,
"loss": 0.001,
"step": 444
},
{
"epoch": 0.5541718555417185,
"grad_norm": 0.04467572271823883,
"learning_rate": 1.3698630136986302e-05,
"loss": 0.001,
"step": 445
},
{
"epoch": 0.5554171855541719,
"grad_norm": 0.01499516423791647,
"learning_rate": 1.3729763387297633e-05,
"loss": 0.0004,
"step": 446
},
{
"epoch": 0.5566625155666252,
"grad_norm": 0.01595112681388855,
"learning_rate": 1.3760896637608967e-05,
"loss": 0.0005,
"step": 447
},
{
"epoch": 0.5579078455790785,
"grad_norm": 0.02192739024758339,
"learning_rate": 1.3792029887920299e-05,
"loss": 0.0006,
"step": 448
},
{
"epoch": 0.5591531755915318,
"grad_norm": 0.0317448228597641,
"learning_rate": 1.3823163138231632e-05,
"loss": 0.0006,
"step": 449
},
{
"epoch": 0.5603985056039851,
"grad_norm": 0.01051297876983881,
"learning_rate": 1.3854296388542964e-05,
"loss": 0.0003,
"step": 450
},
{
"epoch": 0.5616438356164384,
"grad_norm": 0.014249038882553577,
"learning_rate": 1.3885429638854297e-05,
"loss": 0.0004,
"step": 451
},
{
"epoch": 0.5628891656288917,
"grad_norm": 0.026663757860660553,
"learning_rate": 1.3916562889165629e-05,
"loss": 0.0007,
"step": 452
},
{
"epoch": 0.564134495641345,
"grad_norm": 0.018503081053495407,
"learning_rate": 1.3947696139476963e-05,
"loss": 0.0005,
"step": 453
},
{
"epoch": 0.5653798256537983,
"grad_norm": 0.013995744287967682,
"learning_rate": 1.3978829389788294e-05,
"loss": 0.0004,
"step": 454
},
{
"epoch": 0.5666251556662516,
"grad_norm": 0.06841859221458435,
"learning_rate": 1.4009962640099628e-05,
"loss": 0.0012,
"step": 455
},
{
"epoch": 0.5678704856787049,
"grad_norm": 0.052551478147506714,
"learning_rate": 1.404109589041096e-05,
"loss": 0.0009,
"step": 456
},
{
"epoch": 0.5691158156911582,
"grad_norm": 0.01047549955546856,
"learning_rate": 1.4072229140722293e-05,
"loss": 0.0004,
"step": 457
},
{
"epoch": 0.5703611457036114,
"grad_norm": 0.01352018117904663,
"learning_rate": 1.4103362391033625e-05,
"loss": 0.0004,
"step": 458
},
{
"epoch": 0.5716064757160647,
"grad_norm": 0.023181084543466568,
"learning_rate": 1.4134495641344958e-05,
"loss": 0.0006,
"step": 459
},
{
"epoch": 0.572851805728518,
"grad_norm": 0.01287688035517931,
"learning_rate": 1.4165628891656292e-05,
"loss": 0.0004,
"step": 460
},
{
"epoch": 0.5740971357409713,
"grad_norm": 0.013366766273975372,
"learning_rate": 1.4196762141967623e-05,
"loss": 0.0004,
"step": 461
},
{
"epoch": 0.5753424657534246,
"grad_norm": 0.01742659881711006,
"learning_rate": 1.4227895392278957e-05,
"loss": 0.0005,
"step": 462
},
{
"epoch": 0.5765877957658779,
"grad_norm": 0.018992751836776733,
"learning_rate": 1.4259028642590285e-05,
"loss": 0.0004,
"step": 463
},
{
"epoch": 0.5778331257783312,
"grad_norm": 0.013830466195940971,
"learning_rate": 1.4290161892901619e-05,
"loss": 0.0005,
"step": 464
},
{
"epoch": 0.5790784557907845,
"grad_norm": 0.2647791802883148,
"learning_rate": 1.432129514321295e-05,
"loss": 0.0015,
"step": 465
},
{
"epoch": 0.5803237858032378,
"grad_norm": 0.05277368426322937,
"learning_rate": 1.4352428393524284e-05,
"loss": 0.0014,
"step": 466
},
{
"epoch": 0.5815691158156912,
"grad_norm": 0.04205463454127312,
"learning_rate": 1.4383561643835617e-05,
"loss": 0.0011,
"step": 467
},
{
"epoch": 0.5828144458281445,
"grad_norm": 0.01518219243735075,
"learning_rate": 1.4414694894146949e-05,
"loss": 0.0004,
"step": 468
},
{
"epoch": 0.5840597758405978,
"grad_norm": 0.011395282112061977,
"learning_rate": 1.4445828144458282e-05,
"loss": 0.0004,
"step": 469
},
{
"epoch": 0.5853051058530511,
"grad_norm": 0.014821592718362808,
"learning_rate": 1.4476961394769614e-05,
"loss": 0.0005,
"step": 470
},
{
"epoch": 0.5865504358655044,
"grad_norm": 0.01130912359803915,
"learning_rate": 1.4508094645080948e-05,
"loss": 0.0004,
"step": 471
},
{
"epoch": 0.5877957658779577,
"grad_norm": 0.02256758324801922,
"learning_rate": 1.453922789539228e-05,
"loss": 0.0006,
"step": 472
},
{
"epoch": 0.589041095890411,
"grad_norm": 0.1458512842655182,
"learning_rate": 1.4570361145703613e-05,
"loss": 0.0014,
"step": 473
},
{
"epoch": 0.5902864259028643,
"grad_norm": 0.07600380480289459,
"learning_rate": 1.4601494396014945e-05,
"loss": 0.0016,
"step": 474
},
{
"epoch": 0.5915317559153176,
"grad_norm": 0.007826216518878937,
"learning_rate": 1.4632627646326278e-05,
"loss": 0.0002,
"step": 475
},
{
"epoch": 0.5927770859277709,
"grad_norm": 0.013695678673684597,
"learning_rate": 1.466376089663761e-05,
"loss": 0.0004,
"step": 476
},
{
"epoch": 0.5940224159402242,
"grad_norm": 0.034744229167699814,
"learning_rate": 1.4694894146948943e-05,
"loss": 0.0009,
"step": 477
},
{
"epoch": 0.5952677459526775,
"grad_norm": 0.015751633793115616,
"learning_rate": 1.4726027397260275e-05,
"loss": 0.0005,
"step": 478
},
{
"epoch": 0.5965130759651308,
"grad_norm": 0.01636291854083538,
"learning_rate": 1.4757160647571608e-05,
"loss": 0.0004,
"step": 479
},
{
"epoch": 0.597758405977584,
"grad_norm": 0.019713019952178,
"learning_rate": 1.478829389788294e-05,
"loss": 0.0006,
"step": 480
},
{
"epoch": 0.5990037359900373,
"grad_norm": 0.020456036552786827,
"learning_rate": 1.4819427148194274e-05,
"loss": 0.0005,
"step": 481
},
{
"epoch": 0.6002490660024906,
"grad_norm": 0.027187447994947433,
"learning_rate": 1.4850560398505605e-05,
"loss": 0.0006,
"step": 482
},
{
"epoch": 0.6014943960149439,
"grad_norm": 0.024321310222148895,
"learning_rate": 1.4881693648816939e-05,
"loss": 0.0007,
"step": 483
},
{
"epoch": 0.6027397260273972,
"grad_norm": 0.01486989390105009,
"learning_rate": 1.491282689912827e-05,
"loss": 0.0004,
"step": 484
},
{
"epoch": 0.6039850560398505,
"grad_norm": 0.022661667317152023,
"learning_rate": 1.49439601494396e-05,
"loss": 0.0007,
"step": 485
},
{
"epoch": 0.6052303860523038,
"grad_norm": 0.01003281120210886,
"learning_rate": 1.4975093399750934e-05,
"loss": 0.0003,
"step": 486
},
{
"epoch": 0.6064757160647571,
"grad_norm": 0.01938827708363533,
"learning_rate": 1.5006226650062266e-05,
"loss": 0.0005,
"step": 487
},
{
"epoch": 0.6077210460772104,
"grad_norm": 0.058401111513376236,
"learning_rate": 1.50373599003736e-05,
"loss": 0.0006,
"step": 488
},
{
"epoch": 0.6089663760896638,
"grad_norm": 0.008321065455675125,
"learning_rate": 1.5068493150684931e-05,
"loss": 0.0003,
"step": 489
},
{
"epoch": 0.6102117061021171,
"grad_norm": 0.01695171184837818,
"learning_rate": 1.5099626400996264e-05,
"loss": 0.0005,
"step": 490
},
{
"epoch": 0.6114570361145704,
"grad_norm": 0.008688063360750675,
"learning_rate": 1.5130759651307596e-05,
"loss": 0.0003,
"step": 491
},
{
"epoch": 0.6127023661270237,
"grad_norm": 0.009470910765230656,
"learning_rate": 1.516189290161893e-05,
"loss": 0.0003,
"step": 492
},
{
"epoch": 0.613947696139477,
"grad_norm": 0.010343602858483791,
"learning_rate": 1.5193026151930261e-05,
"loss": 0.0003,
"step": 493
},
{
"epoch": 0.6151930261519303,
"grad_norm": 0.031660452485084534,
"learning_rate": 1.5224159402241595e-05,
"loss": 0.0006,
"step": 494
},
{
"epoch": 0.6164383561643836,
"grad_norm": 0.02456934005022049,
"learning_rate": 1.5255292652552926e-05,
"loss": 0.0005,
"step": 495
},
{
"epoch": 0.6176836861768369,
"grad_norm": 0.022074950858950615,
"learning_rate": 1.5286425902864258e-05,
"loss": 0.0006,
"step": 496
},
{
"epoch": 0.6189290161892902,
"grad_norm": 0.013984983786940575,
"learning_rate": 1.5317559153175593e-05,
"loss": 0.0004,
"step": 497
},
{
"epoch": 0.6201743462017435,
"grad_norm": 0.02767989970743656,
"learning_rate": 1.5348692403486925e-05,
"loss": 0.0004,
"step": 498
},
{
"epoch": 0.6214196762141968,
"grad_norm": 0.011965448036789894,
"learning_rate": 1.5379825653798257e-05,
"loss": 0.0003,
"step": 499
},
{
"epoch": 0.6226650062266501,
"grad_norm": 0.018284225836396217,
"learning_rate": 1.541095890410959e-05,
"loss": 0.0005,
"step": 500
},
{
"epoch": 0.6239103362391034,
"grad_norm": 0.010995174758136272,
"learning_rate": 1.5442092154420924e-05,
"loss": 0.0002,
"step": 501
},
{
"epoch": 0.6251556662515566,
"grad_norm": 0.008704639971256256,
"learning_rate": 1.5473225404732256e-05,
"loss": 0.0002,
"step": 502
},
{
"epoch": 0.6264009962640099,
"grad_norm": 0.030416160821914673,
"learning_rate": 1.5504358655043587e-05,
"loss": 0.0007,
"step": 503
},
{
"epoch": 0.6276463262764632,
"grad_norm": 0.02834182232618332,
"learning_rate": 1.5535491905354922e-05,
"loss": 0.0007,
"step": 504
},
{
"epoch": 0.6288916562889165,
"grad_norm": 0.008636824786663055,
"learning_rate": 1.5566625155666254e-05,
"loss": 0.0003,
"step": 505
},
{
"epoch": 0.6301369863013698,
"grad_norm": 0.037112049758434296,
"learning_rate": 1.5597758405977586e-05,
"loss": 0.0009,
"step": 506
},
{
"epoch": 0.6313823163138231,
"grad_norm": 0.012123404070734978,
"learning_rate": 1.5628891656288914e-05,
"loss": 0.0003,
"step": 507
},
{
"epoch": 0.6326276463262764,
"grad_norm": 36.184539794921875,
"learning_rate": 1.566002490660025e-05,
"loss": 0.0304,
"step": 508
},
{
"epoch": 0.6338729763387297,
"grad_norm": 0.03620361536741257,
"learning_rate": 1.569115815691158e-05,
"loss": 0.0009,
"step": 509
},
{
"epoch": 0.635118306351183,
"grad_norm": 0.01849571242928505,
"learning_rate": 1.5722291407222913e-05,
"loss": 0.0005,
"step": 510
},
{
"epoch": 0.6363636363636364,
"grad_norm": 0.010837621986865997,
"learning_rate": 1.5753424657534248e-05,
"loss": 0.0003,
"step": 511
},
{
"epoch": 0.6376089663760897,
"grad_norm": 0.017697712406516075,
"learning_rate": 1.578455790784558e-05,
"loss": 0.0004,
"step": 512
},
{
"epoch": 0.638854296388543,
"grad_norm": 0.00896854791790247,
"learning_rate": 1.581569115815691e-05,
"loss": 0.0003,
"step": 513
},
{
"epoch": 0.6400996264009963,
"grad_norm": 0.009376812726259232,
"learning_rate": 1.5846824408468243e-05,
"loss": 0.0003,
"step": 514
},
{
"epoch": 0.6413449564134496,
"grad_norm": 0.03261823207139969,
"learning_rate": 1.587795765877958e-05,
"loss": 0.0006,
"step": 515
},
{
"epoch": 0.6425902864259029,
"grad_norm": 71.34445190429688,
"learning_rate": 1.590909090909091e-05,
"loss": 4.0159,
"step": 516
},
{
"epoch": 0.6438356164383562,
"grad_norm": 0.02780863456428051,
"learning_rate": 1.5940224159402242e-05,
"loss": 0.0006,
"step": 517
},
{
"epoch": 0.6450809464508095,
"grad_norm": 0.008818407543003559,
"learning_rate": 1.5971357409713574e-05,
"loss": 0.0003,
"step": 518
},
{
"epoch": 0.6463262764632628,
"grad_norm": 0.030920347198843956,
"learning_rate": 1.600249066002491e-05,
"loss": 0.0007,
"step": 519
},
{
"epoch": 0.6475716064757161,
"grad_norm": 0.018262671306729317,
"learning_rate": 1.603362391033624e-05,
"loss": 0.0005,
"step": 520
},
{
"epoch": 0.6488169364881694,
"grad_norm": 0.011576538905501366,
"learning_rate": 1.6064757160647572e-05,
"loss": 0.0004,
"step": 521
},
{
"epoch": 0.6500622665006227,
"grad_norm": 0.010801947675645351,
"learning_rate": 1.6095890410958904e-05,
"loss": 0.0003,
"step": 522
},
{
"epoch": 0.651307596513076,
"grad_norm": 0.013210455887019634,
"learning_rate": 1.612702366127024e-05,
"loss": 0.0005,
"step": 523
},
{
"epoch": 0.6525529265255293,
"grad_norm": 0.014238444156944752,
"learning_rate": 1.615815691158157e-05,
"loss": 0.0004,
"step": 524
},
{
"epoch": 0.6537982565379825,
"grad_norm": 0.007543179206550121,
"learning_rate": 1.6189290161892903e-05,
"loss": 0.0002,
"step": 525
},
{
"epoch": 0.6550435865504358,
"grad_norm": 0.007191088050603867,
"learning_rate": 1.6220423412204234e-05,
"loss": 0.0002,
"step": 526
},
{
"epoch": 0.6562889165628891,
"grad_norm": 0.011641144752502441,
"learning_rate": 1.625155666251557e-05,
"loss": 0.0003,
"step": 527
},
{
"epoch": 0.6575342465753424,
"grad_norm": 0.018345683813095093,
"learning_rate": 1.62826899128269e-05,
"loss": 0.0005,
"step": 528
},
{
"epoch": 0.6587795765877957,
"grad_norm": 0.3033308684825897,
"learning_rate": 1.6313823163138233e-05,
"loss": 0.0012,
"step": 529
},
{
"epoch": 0.660024906600249,
"grad_norm": 0.03083566203713417,
"learning_rate": 1.6344956413449565e-05,
"loss": 0.0007,
"step": 530
},
{
"epoch": 0.6612702366127023,
"grad_norm": 0.011249137111008167,
"learning_rate": 1.6376089663760897e-05,
"loss": 0.0003,
"step": 531
},
{
"epoch": 0.6625155666251556,
"grad_norm": 0.009096617810428143,
"learning_rate": 1.640722291407223e-05,
"loss": 0.0003,
"step": 532
},
{
"epoch": 0.663760896637609,
"grad_norm": 0.007661182899028063,
"learning_rate": 1.643835616438356e-05,
"loss": 0.0002,
"step": 533
},
{
"epoch": 0.6650062266500623,
"grad_norm": 0.03464965149760246,
"learning_rate": 1.6469489414694895e-05,
"loss": 0.0006,
"step": 534
},
{
"epoch": 0.6662515566625156,
"grad_norm": 0.017583874985575676,
"learning_rate": 1.6500622665006227e-05,
"loss": 0.0005,
"step": 535
},
{
"epoch": 0.6674968866749689,
"grad_norm": 0.012846691533923149,
"learning_rate": 1.653175591531756e-05,
"loss": 0.0003,
"step": 536
},
{
"epoch": 0.6687422166874222,
"grad_norm": 0.008167251013219357,
"learning_rate": 1.656288916562889e-05,
"loss": 0.0002,
"step": 537
},
{
"epoch": 0.6699875466998755,
"grad_norm": 0.09242931753396988,
"learning_rate": 1.6594022415940226e-05,
"loss": 0.0006,
"step": 538
},
{
"epoch": 0.6712328767123288,
"grad_norm": 0.007621095050126314,
"learning_rate": 1.6625155666251557e-05,
"loss": 0.0003,
"step": 539
},
{
"epoch": 0.6724782067247821,
"grad_norm": 364.0179138183594,
"learning_rate": 1.665628891656289e-05,
"loss": 2.4925,
"step": 540
},
{
"epoch": 0.6737235367372354,
"grad_norm": 0.029700903221964836,
"learning_rate": 1.6687422166874224e-05,
"loss": 0.0005,
"step": 541
},
{
"epoch": 0.6749688667496887,
"grad_norm": 0.009756062179803848,
"learning_rate": 1.6718555417185556e-05,
"loss": 0.0003,
"step": 542
},
{
"epoch": 0.676214196762142,
"grad_norm": 0.02434486895799637,
"learning_rate": 1.6749688667496888e-05,
"loss": 0.0007,
"step": 543
},
{
"epoch": 0.6774595267745953,
"grad_norm": 0.0061378516256809235,
"learning_rate": 1.678082191780822e-05,
"loss": 0.0002,
"step": 544
},
{
"epoch": 0.6787048567870486,
"grad_norm": 0.007974776439368725,
"learning_rate": 1.6811955168119555e-05,
"loss": 0.0002,
"step": 545
},
{
"epoch": 0.6799501867995019,
"grad_norm": 0.023721277713775635,
"learning_rate": 1.6843088418430886e-05,
"loss": 0.0007,
"step": 546
},
{
"epoch": 0.6811955168119551,
"grad_norm": 0.06722849607467651,
"learning_rate": 1.6874221668742218e-05,
"loss": 0.0014,
"step": 547
},
{
"epoch": 0.6824408468244084,
"grad_norm": 0.021218659356236458,
"learning_rate": 1.690535491905355e-05,
"loss": 0.0005,
"step": 548
},
{
"epoch": 0.6836861768368617,
"grad_norm": 0.007651370484381914,
"learning_rate": 1.6936488169364885e-05,
"loss": 0.0003,
"step": 549
},
{
"epoch": 0.684931506849315,
"grad_norm": 0.023434964939951897,
"learning_rate": 1.6967621419676217e-05,
"loss": 0.0004,
"step": 550
},
{
"epoch": 0.6861768368617683,
"grad_norm": 0.010944285430014133,
"learning_rate": 1.699875466998755e-05,
"loss": 0.0002,
"step": 551
},
{
"epoch": 0.6874221668742216,
"grad_norm": 0.007479478605091572,
"learning_rate": 1.702988792029888e-05,
"loss": 0.0002,
"step": 552
},
{
"epoch": 0.688667496886675,
"grad_norm": 0.016678282991051674,
"learning_rate": 1.7061021170610212e-05,
"loss": 0.0004,
"step": 553
},
{
"epoch": 0.6899128268991283,
"grad_norm": 0.008227194659411907,
"learning_rate": 1.7092154420921544e-05,
"loss": 0.0002,
"step": 554
},
{
"epoch": 0.6911581569115816,
"grad_norm": 0.016022512689232826,
"learning_rate": 1.7123287671232875e-05,
"loss": 0.0004,
"step": 555
},
{
"epoch": 0.6924034869240349,
"grad_norm": 0.01723802089691162,
"learning_rate": 1.715442092154421e-05,
"loss": 0.0004,
"step": 556
},
{
"epoch": 0.6936488169364882,
"grad_norm": 0.007776948623359203,
"learning_rate": 1.7185554171855542e-05,
"loss": 0.0002,
"step": 557
},
{
"epoch": 0.6948941469489415,
"grad_norm": 0.061478786170482635,
"learning_rate": 1.7216687422166874e-05,
"loss": 0.0004,
"step": 558
},
{
"epoch": 0.6961394769613948,
"grad_norm": 0.030175473541021347,
"learning_rate": 1.7247820672478206e-05,
"loss": 0.0005,
"step": 559
},
{
"epoch": 0.6973848069738481,
"grad_norm": 0.03586643561720848,
"learning_rate": 1.727895392278954e-05,
"loss": 0.0009,
"step": 560
},
{
"epoch": 0.6986301369863014,
"grad_norm": 0.01669226959347725,
"learning_rate": 1.7310087173100873e-05,
"loss": 0.0004,
"step": 561
},
{
"epoch": 0.6998754669987547,
"grad_norm": 0.013228816911578178,
"learning_rate": 1.7341220423412205e-05,
"loss": 0.0003,
"step": 562
},
{
"epoch": 0.701120797011208,
"grad_norm": 0.16547606885433197,
"learning_rate": 1.7372353673723536e-05,
"loss": 0.0014,
"step": 563
},
{
"epoch": 0.7023661270236613,
"grad_norm": 0.20769615471363068,
"learning_rate": 1.740348692403487e-05,
"loss": 0.0007,
"step": 564
},
{
"epoch": 0.7036114570361146,
"grad_norm": null,
"learning_rate": 1.740348692403487e-05,
"loss": 3.7559,
"step": 565
},
{
"epoch": 0.7048567870485679,
"grad_norm": 0.010459132492542267,
"learning_rate": 1.7434620174346203e-05,
"loss": 0.0003,
"step": 566
},
{
"epoch": 0.7061021170610212,
"grad_norm": 7.497586727142334,
"learning_rate": 1.7465753424657535e-05,
"loss": 0.008,
"step": 567
},
{
"epoch": 0.7073474470734745,
"grad_norm": 0.011709270067512989,
"learning_rate": 1.7496886674968867e-05,
"loss": 0.0003,
"step": 568
},
{
"epoch": 0.7085927770859277,
"grad_norm": 0.024786679074168205,
"learning_rate": 1.7528019925280202e-05,
"loss": 0.0005,
"step": 569
},
{
"epoch": 0.709838107098381,
"grad_norm": 0.007164615672081709,
"learning_rate": 1.7559153175591534e-05,
"loss": 0.0003,
"step": 570
},
{
"epoch": 0.7110834371108343,
"grad_norm": 0.006929496768862009,
"learning_rate": 1.7590286425902865e-05,
"loss": 0.0002,
"step": 571
},
{
"epoch": 0.7123287671232876,
"grad_norm": 0.01036135945469141,
"learning_rate": 1.7621419676214197e-05,
"loss": 0.0003,
"step": 572
},
{
"epoch": 0.7135740971357409,
"grad_norm": 0.01619466207921505,
"learning_rate": 1.7652552926525532e-05,
"loss": 0.0004,
"step": 573
},
{
"epoch": 0.7148194271481942,
"grad_norm": 0.007037854287773371,
"learning_rate": 1.7683686176836864e-05,
"loss": 0.0002,
"step": 574
},
{
"epoch": 0.7160647571606475,
"grad_norm": 0.015169711783528328,
"learning_rate": 1.7714819427148192e-05,
"loss": 0.0004,
"step": 575
},
{
"epoch": 0.7173100871731009,
"grad_norm": 0.014573472552001476,
"learning_rate": 1.7745952677459527e-05,
"loss": 0.0003,
"step": 576
},
{
"epoch": 0.7185554171855542,
"grad_norm": 0.012262790463864803,
"learning_rate": 1.777708592777086e-05,
"loss": 0.0003,
"step": 577
},
{
"epoch": 0.7198007471980075,
"grad_norm": 0.011037294752895832,
"learning_rate": 1.780821917808219e-05,
"loss": 0.0003,
"step": 578
},
{
"epoch": 0.7210460772104608,
"grad_norm": 0.012611133977770805,
"learning_rate": 1.7839352428393523e-05,
"loss": 0.0003,
"step": 579
},
{
"epoch": 0.7222914072229141,
"grad_norm": 0.13023485243320465,
"learning_rate": 1.7870485678704858e-05,
"loss": 0.0009,
"step": 580
},
{
"epoch": 0.7235367372353674,
"grad_norm": 0.006935072597116232,
"learning_rate": 1.790161892901619e-05,
"loss": 0.0002,
"step": 581
},
{
"epoch": 0.7247820672478207,
"grad_norm": 0.026650428771972656,
"learning_rate": 1.793275217932752e-05,
"loss": 0.0006,
"step": 582
},
{
"epoch": 0.726027397260274,
"grad_norm": 0.015044482424855232,
"learning_rate": 1.7963885429638856e-05,
"loss": 0.0004,
"step": 583
},
{
"epoch": 0.7272727272727273,
"grad_norm": 0.019932331517338753,
"learning_rate": 1.7995018679950188e-05,
"loss": 0.0005,
"step": 584
},
{
"epoch": 0.7285180572851806,
"grad_norm": 0.01698875240981579,
"learning_rate": 1.802615193026152e-05,
"loss": 0.0004,
"step": 585
},
{
"epoch": 0.7297633872976339,
"grad_norm": 0.4486841857433319,
"learning_rate": 1.805728518057285e-05,
"loss": 0.0005,
"step": 586
},
{
"epoch": 0.7310087173100872,
"grad_norm": 0.01894947700202465,
"learning_rate": 1.8088418430884187e-05,
"loss": 0.0006,
"step": 587
},
{
"epoch": 0.7322540473225405,
"grad_norm": 0.006948466412723064,
"learning_rate": 1.811955168119552e-05,
"loss": 0.0002,
"step": 588
},
{
"epoch": 0.7334993773349938,
"grad_norm": 15.503718376159668,
"learning_rate": 1.815068493150685e-05,
"loss": 0.0137,
"step": 589
},
{
"epoch": 0.7347447073474471,
"grad_norm": 0.021334033459424973,
"learning_rate": 1.8181818181818182e-05,
"loss": 0.0006,
"step": 590
},
{
"epoch": 0.7359900373599004,
"grad_norm": 0.02985548786818981,
"learning_rate": 1.8212951432129517e-05,
"loss": 0.0005,
"step": 591
},
{
"epoch": 0.7372353673723536,
"grad_norm": 0.007480076979845762,
"learning_rate": 1.824408468244085e-05,
"loss": 0.0002,
"step": 592
},
{
"epoch": 0.7384806973848069,
"grad_norm": 0.006202853284776211,
"learning_rate": 1.827521793275218e-05,
"loss": 0.0002,
"step": 593
},
{
"epoch": 0.7397260273972602,
"grad_norm": 0.020105713978409767,
"learning_rate": 1.8306351183063512e-05,
"loss": 0.0005,
"step": 594
},
{
"epoch": 0.7409713574097135,
"grad_norm": 0.01176950428634882,
"learning_rate": 1.8337484433374848e-05,
"loss": 0.0003,
"step": 595
},
{
"epoch": 0.7422166874221668,
"grad_norm": 0.02436145208775997,
"learning_rate": 1.836861768368618e-05,
"loss": 0.0005,
"step": 596
},
{
"epoch": 0.7434620174346201,
"grad_norm": 0.015877658501267433,
"learning_rate": 1.839975093399751e-05,
"loss": 0.0004,
"step": 597
},
{
"epoch": 0.7447073474470735,
"grad_norm": 0.0258621908724308,
"learning_rate": 1.8430884184308843e-05,
"loss": 0.0006,
"step": 598
},
{
"epoch": 0.7459526774595268,
"grad_norm": 0.0054780724458396435,
"learning_rate": 1.8462017434620175e-05,
"loss": 0.0002,
"step": 599
},
{
"epoch": 0.7471980074719801,
"grad_norm": 0.01809469237923622,
"learning_rate": 1.8493150684931506e-05,
"loss": 0.0004,
"step": 600
},
{
"epoch": 0.7484433374844334,
"grad_norm": 0.012986347079277039,
"learning_rate": 1.8524283935242838e-05,
"loss": 0.0003,
"step": 601
},
{
"epoch": 0.7496886674968867,
"grad_norm": 0.004867818206548691,
"learning_rate": 1.8555417185554173e-05,
"loss": 0.0001,
"step": 602
},
{
"epoch": 0.75093399750934,
"grad_norm": 0.005523454863578081,
"learning_rate": 1.8586550435865505e-05,
"loss": 0.0002,
"step": 603
},
{
"epoch": 0.7521793275217933,
"grad_norm": 0.009668633341789246,
"learning_rate": 1.8617683686176837e-05,
"loss": 0.0003,
"step": 604
},
{
"epoch": 0.7534246575342466,
"grad_norm": 0.0070527163334190845,
"learning_rate": 1.864881693648817e-05,
"loss": 0.0002,
"step": 605
},
{
"epoch": 0.7546699875466999,
"grad_norm": 0.006774348672479391,
"learning_rate": 1.8679950186799504e-05,
"loss": 0.0002,
"step": 606
},
{
"epoch": 0.7559153175591532,
"grad_norm": 0.007995886728167534,
"learning_rate": 1.8711083437110835e-05,
"loss": 0.0002,
"step": 607
},
{
"epoch": 0.7571606475716065,
"grad_norm": 30.348756790161133,
"learning_rate": 1.8742216687422167e-05,
"loss": 4.172,
"step": 608
},
{
"epoch": 0.7584059775840598,
"grad_norm": 0.01787879690527916,
"learning_rate": 1.87733499377335e-05,
"loss": 0.0004,
"step": 609
},
{
"epoch": 0.7596513075965131,
"grad_norm": 0.06024169921875,
"learning_rate": 1.8804483188044834e-05,
"loss": 0.0011,
"step": 610
},
{
"epoch": 0.7608966376089664,
"grad_norm": 0.06412393599748611,
"learning_rate": 1.8835616438356166e-05,
"loss": 0.0014,
"step": 611
},
{
"epoch": 0.7621419676214197,
"grad_norm": 0.01381937600672245,
"learning_rate": 1.8866749688667497e-05,
"loss": 0.0005,
"step": 612
},
{
"epoch": 0.763387297633873,
"grad_norm": 0.01991051435470581,
"learning_rate": 1.889788293897883e-05,
"loss": 0.0003,
"step": 613
},
{
"epoch": 0.7646326276463262,
"grad_norm": 0.14104107022285461,
"learning_rate": 1.8929016189290164e-05,
"loss": 0.0026,
"step": 614
},
{
"epoch": 0.7658779576587795,
"grad_norm": 0.0066263917833566666,
"learning_rate": 1.8960149439601496e-05,
"loss": 0.0002,
"step": 615
},
{
"epoch": 0.7671232876712328,
"grad_norm": 0.006442869547754526,
"learning_rate": 1.8991282689912828e-05,
"loss": 0.0002,
"step": 616
},
{
"epoch": 0.7683686176836861,
"grad_norm": 0.20366807281970978,
"learning_rate": 1.9022415940224163e-05,
"loss": 0.0028,
"step": 617
},
{
"epoch": 0.7696139476961394,
"grad_norm": 0.16002459824085236,
"learning_rate": 1.9053549190535495e-05,
"loss": 0.0023,
"step": 618
},
{
"epoch": 0.7708592777085927,
"grad_norm": 0.007126240525394678,
"learning_rate": 1.9084682440846827e-05,
"loss": 0.0002,
"step": 619
},
{
"epoch": 0.772104607721046,
"grad_norm": 0.22348296642303467,
"learning_rate": 1.9115815691158155e-05,
"loss": 0.0034,
"step": 620
},
{
"epoch": 0.7733499377334994,
"grad_norm": 0.01117734331637621,
"learning_rate": 1.914694894146949e-05,
"loss": 0.0003,
"step": 621
},
{
"epoch": 0.7745952677459527,
"grad_norm": 0.017832182347774506,
"learning_rate": 1.9178082191780822e-05,
"loss": 0.0004,
"step": 622
},
{
"epoch": 0.775840597758406,
"grad_norm": 0.10084803402423859,
"learning_rate": 1.9209215442092154e-05,
"loss": 0.002,
"step": 623
},
{
"epoch": 0.7770859277708593,
"grad_norm": 0.0404939204454422,
"learning_rate": 1.924034869240349e-05,
"loss": 0.0009,
"step": 624
},
{
"epoch": 0.7783312577833126,
"grad_norm": 0.006709231995046139,
"learning_rate": 1.927148194271482e-05,
"loss": 0.0002,
"step": 625
},
{
"epoch": 0.7795765877957659,
"grad_norm": 0.006246612407267094,
"learning_rate": 1.9302615193026152e-05,
"loss": 0.0002,
"step": 626
},
{
"epoch": 0.7808219178082192,
"grad_norm": 0.007551430258899927,
"learning_rate": 1.9333748443337484e-05,
"loss": 0.0002,
"step": 627
},
{
"epoch": 0.7820672478206725,
"grad_norm": 0.010194691829383373,
"learning_rate": 1.936488169364882e-05,
"loss": 0.0002,
"step": 628
},
{
"epoch": 0.7833125778331258,
"grad_norm": 0.007259845733642578,
"learning_rate": 1.939601494396015e-05,
"loss": 0.0002,
"step": 629
},
{
"epoch": 0.7845579078455791,
"grad_norm": 0.6343588829040527,
"learning_rate": 1.9427148194271483e-05,
"loss": 0.0014,
"step": 630
},
{
"epoch": 0.7858032378580324,
"grad_norm": 0.004895548801869154,
"learning_rate": 1.9458281444582814e-05,
"loss": 0.0001,
"step": 631
},
{
"epoch": 0.7870485678704857,
"grad_norm": 0.023873023688793182,
"learning_rate": 1.948941469489415e-05,
"loss": 0.0006,
"step": 632
},
{
"epoch": 0.788293897882939,
"grad_norm": 0.06282692402601242,
"learning_rate": 1.952054794520548e-05,
"loss": 0.0014,
"step": 633
},
{
"epoch": 0.7895392278953923,
"grad_norm": 0.01570272073149681,
"learning_rate": 1.9551681195516813e-05,
"loss": 0.0005,
"step": 634
},
{
"epoch": 0.7907845579078456,
"grad_norm": 0.004377361387014389,
"learning_rate": 1.9582814445828145e-05,
"loss": 0.0001,
"step": 635
},
{
"epoch": 0.7920298879202988,
"grad_norm": 0.005370027385652065,
"learning_rate": 1.961394769613948e-05,
"loss": 0.0001,
"step": 636
},
{
"epoch": 0.7932752179327521,
"grad_norm": 0.016998134553432465,
"learning_rate": 1.964508094645081e-05,
"loss": 0.0003,
"step": 637
},
{
"epoch": 0.7945205479452054,
"grad_norm": 0.02801138535141945,
"learning_rate": 1.9676214196762143e-05,
"loss": 0.0007,
"step": 638
},
{
"epoch": 0.7957658779576587,
"grad_norm": 0.007101301569491625,
"learning_rate": 1.9707347447073475e-05,
"loss": 0.0002,
"step": 639
},
{
"epoch": 0.797011207970112,
"grad_norm": 0.007805291563272476,
"learning_rate": 1.973848069738481e-05,
"loss": 0.0002,
"step": 640
},
{
"epoch": 0.7982565379825654,
"grad_norm": 0.01866893284022808,
"learning_rate": 1.9769613947696142e-05,
"loss": 0.0004,
"step": 641
},
{
"epoch": 0.7995018679950187,
"grad_norm": 0.008472064509987831,
"learning_rate": 1.980074719800747e-05,
"loss": 0.0002,
"step": 642
},
{
"epoch": 0.800747198007472,
"grad_norm": 0.011058184318244457,
"learning_rate": 1.9831880448318805e-05,
"loss": 0.0004,
"step": 643
},
{
"epoch": 0.8019925280199253,
"grad_norm": 0.01657005585730076,
"learning_rate": 1.9863013698630137e-05,
"loss": 0.0004,
"step": 644
},
{
"epoch": 0.8032378580323786,
"grad_norm": 0.007903863675892353,
"learning_rate": 1.989414694894147e-05,
"loss": 0.0002,
"step": 645
},
{
"epoch": 0.8044831880448319,
"grad_norm": 0.008648911491036415,
"learning_rate": 1.99252801992528e-05,
"loss": 0.0003,
"step": 646
},
{
"epoch": 0.8057285180572852,
"grad_norm": 0.005954551976174116,
"learning_rate": 1.9956413449564136e-05,
"loss": 0.0001,
"step": 647
},
{
"epoch": 0.8069738480697385,
"grad_norm": 0.012240339070558548,
"learning_rate": 1.9987546699875468e-05,
"loss": 0.0004,
"step": 648
},
{
"epoch": 0.8082191780821918,
"grad_norm": 0.012209310196340084,
"learning_rate": 2.00186799501868e-05,
"loss": 0.0004,
"step": 649
},
{
"epoch": 0.8094645080946451,
"grad_norm": 0.013876602053642273,
"learning_rate": 2.004981320049813e-05,
"loss": 0.0004,
"step": 650
},
{
"epoch": 0.8107098381070984,
"grad_norm": 0.006682861130684614,
"learning_rate": 2.0080946450809466e-05,
"loss": 0.0002,
"step": 651
},
{
"epoch": 0.8119551681195517,
"grad_norm": 0.01869480311870575,
"learning_rate": 2.0112079701120798e-05,
"loss": 0.0004,
"step": 652
},
{
"epoch": 0.813200498132005,
"grad_norm": 0.006386366207152605,
"learning_rate": 2.014321295143213e-05,
"loss": 0.0002,
"step": 653
},
{
"epoch": 0.8144458281444583,
"grad_norm": 0.031244048848748207,
"learning_rate": 2.017434620174346e-05,
"loss": 0.0007,
"step": 654
},
{
"epoch": 0.8156911581569116,
"grad_norm": 0.005839107092469931,
"learning_rate": 2.0205479452054797e-05,
"loss": 0.0002,
"step": 655
},
{
"epoch": 0.8169364881693649,
"grad_norm": 0.012466920539736748,
"learning_rate": 2.023661270236613e-05,
"loss": 0.0003,
"step": 656
},
{
"epoch": 0.8181818181818182,
"grad_norm": 0.011677310802042484,
"learning_rate": 2.026774595267746e-05,
"loss": 0.0003,
"step": 657
},
{
"epoch": 0.8194271481942715,
"grad_norm": 325.08514404296875,
"learning_rate": 2.0298879202988795e-05,
"loss": 0.185,
"step": 658
},
{
"epoch": 0.8206724782067247,
"grad_norm": 0.00978070218116045,
"learning_rate": 2.0330012453300127e-05,
"loss": 0.0003,
"step": 659
},
{
"epoch": 0.821917808219178,
"grad_norm": 0.009361130185425282,
"learning_rate": 2.036114570361146e-05,
"loss": 0.0003,
"step": 660
},
{
"epoch": 0.8231631382316313,
"grad_norm": 0.007570465561002493,
"learning_rate": 2.039227895392279e-05,
"loss": 0.0003,
"step": 661
},
{
"epoch": 0.8244084682440846,
"grad_norm": 0.00575603824108839,
"learning_rate": 2.0423412204234126e-05,
"loss": 0.0002,
"step": 662
},
{
"epoch": 0.825653798256538,
"grad_norm": 0.014008327387273312,
"learning_rate": 2.0454545454545457e-05,
"loss": 0.0004,
"step": 663
},
{
"epoch": 0.8268991282689913,
"grad_norm": 0.00547524681314826,
"learning_rate": 2.048567870485679e-05,
"loss": 0.0001,
"step": 664
},
{
"epoch": 0.8281444582814446,
"grad_norm": 0.026367267593741417,
"learning_rate": 2.051681195516812e-05,
"loss": 0.0005,
"step": 665
},
{
"epoch": 0.8293897882938979,
"grad_norm": 0.0041604661382734776,
"learning_rate": 2.0547945205479453e-05,
"loss": 0.0001,
"step": 666
},
{
"epoch": 0.8306351183063512,
"grad_norm": 0.01260537002235651,
"learning_rate": 2.0579078455790784e-05,
"loss": 0.0004,
"step": 667
},
{
"epoch": 0.8318804483188045,
"grad_norm": 0.005095213185995817,
"learning_rate": 2.0610211706102116e-05,
"loss": 0.0002,
"step": 668
},
{
"epoch": 0.8331257783312578,
"grad_norm": 0.004534134641289711,
"learning_rate": 2.064134495641345e-05,
"loss": 0.0001,
"step": 669
},
{
"epoch": 0.8343711083437111,
"grad_norm": 0.015001599676907063,
"learning_rate": 2.0672478206724783e-05,
"loss": 0.0004,
"step": 670
},
{
"epoch": 0.8356164383561644,
"grad_norm": 0.005808024201542139,
"learning_rate": 2.0703611457036115e-05,
"loss": 0.0002,
"step": 671
},
{
"epoch": 0.8368617683686177,
"grad_norm": 0.008496883325278759,
"learning_rate": 2.0734744707347447e-05,
"loss": 0.0003,
"step": 672
},
{
"epoch": 0.838107098381071,
"grad_norm": 410.8919677734375,
"learning_rate": 2.076587795765878e-05,
"loss": 1.7746,
"step": 673
},
{
"epoch": 0.8393524283935243,
"grad_norm": 0.15478110313415527,
"learning_rate": 2.0797011207970113e-05,
"loss": 0.0008,
"step": 674
},
{
"epoch": 0.8405977584059776,
"grad_norm": 0.017121130600571632,
"learning_rate": 2.0828144458281445e-05,
"loss": 0.0004,
"step": 675
},
{
"epoch": 0.8418430884184309,
"grad_norm": 0.01048367191106081,
"learning_rate": 2.0859277708592777e-05,
"loss": 0.0003,
"step": 676
},
{
"epoch": 0.8430884184308842,
"grad_norm": 0.013435076922178268,
"learning_rate": 2.0890410958904112e-05,
"loss": 0.0004,
"step": 677
},
{
"epoch": 0.8443337484433375,
"grad_norm": 0.0057032410986721516,
"learning_rate": 2.0921544209215444e-05,
"loss": 0.0002,
"step": 678
},
{
"epoch": 0.8455790784557908,
"grad_norm": 0.05629182606935501,
"learning_rate": 2.0952677459526776e-05,
"loss": 0.0005,
"step": 679
},
{
"epoch": 0.8468244084682441,
"grad_norm": 0.8133471608161926,
"learning_rate": 2.0983810709838107e-05,
"loss": 0.0012,
"step": 680
},
{
"epoch": 0.8480697384806973,
"grad_norm": 0.011576468124985695,
"learning_rate": 2.1014943960149442e-05,
"loss": 0.0003,
"step": 681
},
{
"epoch": 0.8493150684931506,
"grad_norm": 0.079744853079319,
"learning_rate": 2.1046077210460774e-05,
"loss": 0.0006,
"step": 682
},
{
"epoch": 0.8505603985056039,
"grad_norm": 0.019048074260354042,
"learning_rate": 2.1077210460772106e-05,
"loss": 0.0004,
"step": 683
},
{
"epoch": 0.8518057285180572,
"grad_norm": 0.004764070268720388,
"learning_rate": 2.1108343711083438e-05,
"loss": 0.0001,
"step": 684
},
{
"epoch": 0.8530510585305106,
"grad_norm": 0.022517533972859383,
"learning_rate": 2.1139476961394773e-05,
"loss": 0.0003,
"step": 685
},
{
"epoch": 0.8542963885429639,
"grad_norm": 0.17990639805793762,
"learning_rate": 2.1170610211706105e-05,
"loss": 0.0007,
"step": 686
},
{
"epoch": 0.8555417185554172,
"grad_norm": 0.0133855314925313,
"learning_rate": 2.1201743462017433e-05,
"loss": 0.0004,
"step": 687
},
{
"epoch": 0.8567870485678705,
"grad_norm": 0.01034181471914053,
"learning_rate": 2.1232876712328768e-05,
"loss": 0.0003,
"step": 688
},
{
"epoch": 0.8580323785803238,
"grad_norm": 0.09839920699596405,
"learning_rate": 2.12640099626401e-05,
"loss": 0.0007,
"step": 689
},
{
"epoch": 0.8592777085927771,
"grad_norm": 0.28286799788475037,
"learning_rate": 2.129514321295143e-05,
"loss": 0.0009,
"step": 690
},
{
"epoch": 0.8605230386052304,
"grad_norm": 0.004863832611590624,
"learning_rate": 2.1326276463262763e-05,
"loss": 0.0001,
"step": 691
},
{
"epoch": 0.8617683686176837,
"grad_norm": 0.007945407181978226,
"learning_rate": 2.13574097135741e-05,
"loss": 0.0002,
"step": 692
},
{
"epoch": 0.863013698630137,
"grad_norm": 0.17650844156742096,
"learning_rate": 2.138854296388543e-05,
"loss": 0.0006,
"step": 693
},
{
"epoch": 0.8642590286425903,
"grad_norm": 36.761592864990234,
"learning_rate": 2.1419676214196762e-05,
"loss": 4.8048,
"step": 694
},
{
"epoch": 0.8655043586550436,
"grad_norm": 43.7182731628418,
"learning_rate": 2.1450809464508094e-05,
"loss": 4.1331,
"step": 695
},
{
"epoch": 0.8667496886674969,
"grad_norm": 0.031437598168849945,
"learning_rate": 2.148194271481943e-05,
"loss": 0.0005,
"step": 696
},
{
"epoch": 0.8679950186799502,
"grad_norm": 0.17908449470996857,
"learning_rate": 2.151307596513076e-05,
"loss": 0.0018,
"step": 697
},
{
"epoch": 0.8692403486924035,
"grad_norm": 43.03351974487305,
"learning_rate": 2.1544209215442092e-05,
"loss": 0.9142,
"step": 698
},
{
"epoch": 0.8704856787048568,
"grad_norm": 0.07657460123300552,
"learning_rate": 2.1575342465753427e-05,
"loss": 0.0007,
"step": 699
},
{
"epoch": 0.8717310087173101,
"grad_norm": 43.546669006347656,
"learning_rate": 2.160647571606476e-05,
"loss": 1.2326,
"step": 700
},
{
"epoch": 0.8729763387297634,
"grad_norm": 0.15518978238105774,
"learning_rate": 2.163760896637609e-05,
"loss": 0.0013,
"step": 701
},
{
"epoch": 0.8742216687422167,
"grad_norm": 20.484352111816406,
"learning_rate": 2.1668742216687423e-05,
"loss": 0.4034,
"step": 702
},
{
"epoch": 0.8754669987546699,
"grad_norm": 8.134427070617676,
"learning_rate": 2.1699875466998758e-05,
"loss": 0.1308,
"step": 703
},
{
"epoch": 0.8767123287671232,
"grad_norm": 31.111207962036133,
"learning_rate": 2.173100871731009e-05,
"loss": 1.3048,
"step": 704
},
{
"epoch": 0.8779576587795765,
"grad_norm": 1.6822067499160767,
"learning_rate": 2.176214196762142e-05,
"loss": 0.0337,
"step": 705
},
{
"epoch": 0.8792029887920298,
"grad_norm": 0.016219645738601685,
"learning_rate": 2.1793275217932753e-05,
"loss": 0.0002,
"step": 706
},
{
"epoch": 0.8804483188044832,
"grad_norm": 0.9385362267494202,
"learning_rate": 2.1824408468244088e-05,
"loss": 0.0118,
"step": 707
},
{
"epoch": 0.8816936488169365,
"grad_norm": 59.062347412109375,
"learning_rate": 2.185554171855542e-05,
"loss": 1.5594,
"step": 708
},
{
"epoch": 0.8829389788293898,
"grad_norm": 0.8278292417526245,
"learning_rate": 2.188667496886675e-05,
"loss": 0.0164,
"step": 709
},
{
"epoch": 0.8841843088418431,
"grad_norm": 0.1193016767501831,
"learning_rate": 2.1917808219178083e-05,
"loss": 0.0026,
"step": 710
},
{
"epoch": 0.8854296388542964,
"grad_norm": 0.06685473769903183,
"learning_rate": 2.1948941469489415e-05,
"loss": 0.0007,
"step": 711
},
{
"epoch": 0.8866749688667497,
"grad_norm": 0.2482631653547287,
"learning_rate": 2.1980074719800747e-05,
"loss": 0.0044,
"step": 712
},
{
"epoch": 0.887920298879203,
"grad_norm": 0.09288740158081055,
"learning_rate": 2.201120797011208e-05,
"loss": 0.001,
"step": 713
},
{
"epoch": 0.8891656288916563,
"grad_norm": 0.07905003428459167,
"learning_rate": 2.2042341220423414e-05,
"loss": 0.001,
"step": 714
},
{
"epoch": 0.8904109589041096,
"grad_norm": 0.03586210682988167,
"learning_rate": 2.2073474470734746e-05,
"loss": 0.0007,
"step": 715
},
{
"epoch": 0.8916562889165629,
"grad_norm": 0.029501890763640404,
"learning_rate": 2.2104607721046077e-05,
"loss": 0.0005,
"step": 716
},
{
"epoch": 0.8929016189290162,
"grad_norm": 1.9498989582061768,
"learning_rate": 2.213574097135741e-05,
"loss": 0.0056,
"step": 717
},
{
"epoch": 0.8941469489414695,
"grad_norm": 0.011584372259676456,
"learning_rate": 2.2166874221668744e-05,
"loss": 0.0002,
"step": 718
},
{
"epoch": 0.8953922789539228,
"grad_norm": 0.052831344306468964,
"learning_rate": 2.2198007471980076e-05,
"loss": 0.0007,
"step": 719
},
{
"epoch": 0.8966376089663761,
"grad_norm": 152.57171630859375,
"learning_rate": 2.2229140722291408e-05,
"loss": 0.5103,
"step": 720
},
{
"epoch": 0.8978829389788294,
"grad_norm": 0.03796133026480675,
"learning_rate": 2.226027397260274e-05,
"loss": 0.0008,
"step": 721
},
{
"epoch": 0.8991282689912827,
"grad_norm": 9.698473930358887,
"learning_rate": 2.2291407222914075e-05,
"loss": 0.0168,
"step": 722
},
{
"epoch": 0.900373599003736,
"grad_norm": 0.014799389988183975,
"learning_rate": 2.2322540473225406e-05,
"loss": 0.0003,
"step": 723
},
{
"epoch": 0.9016189290161893,
"grad_norm": 0.015290978364646435,
"learning_rate": 2.2353673723536738e-05,
"loss": 0.0004,
"step": 724
},
{
"epoch": 0.9028642590286425,
"grad_norm": 0.0121547756716609,
"learning_rate": 2.238480697384807e-05,
"loss": 0.0004,
"step": 725
},
{
"epoch": 0.9041095890410958,
"grad_norm": 0.043171875178813934,
"learning_rate": 2.2415940224159405e-05,
"loss": 0.001,
"step": 726
},
{
"epoch": 0.9053549190535491,
"grad_norm": 0.02570340782403946,
"learning_rate": 2.2447073474470737e-05,
"loss": 0.0004,
"step": 727
},
{
"epoch": 0.9066002490660025,
"grad_norm": 0.4008868634700775,
"learning_rate": 2.247820672478207e-05,
"loss": 0.0015,
"step": 728
},
{
"epoch": 0.9078455790784558,
"grad_norm": 0.012521167285740376,
"learning_rate": 2.2509339975093404e-05,
"loss": 0.0003,
"step": 729
},
{
"epoch": 0.9090909090909091,
"grad_norm": 0.039595190435647964,
"learning_rate": 2.2540473225404735e-05,
"loss": 0.0008,
"step": 730
},
{
"epoch": 0.9103362391033624,
"grad_norm": 0.0371573381125927,
"learning_rate": 2.2571606475716064e-05,
"loss": 0.0007,
"step": 731
},
{
"epoch": 0.9115815691158157,
"grad_norm": 0.0111406734213233,
"learning_rate": 2.2602739726027396e-05,
"loss": 0.0003,
"step": 732
},
{
"epoch": 0.912826899128269,
"grad_norm": 34.578346252441406,
"learning_rate": 2.263387297633873e-05,
"loss": 4.4143,
"step": 733
},
{
"epoch": 0.9140722291407223,
"grad_norm": 0.006715845782309771,
"learning_rate": 2.2665006226650062e-05,
"loss": 0.0002,
"step": 734
},
{
"epoch": 0.9153175591531756,
"grad_norm": 0.014482389204204082,
"learning_rate": 2.2696139476961394e-05,
"loss": 0.0004,
"step": 735
},
{
"epoch": 0.9165628891656289,
"grad_norm": 0.0057504503056406975,
"learning_rate": 2.272727272727273e-05,
"loss": 0.0001,
"step": 736
},
{
"epoch": 0.9178082191780822,
"grad_norm": 0.04472869634628296,
"learning_rate": 2.275840597758406e-05,
"loss": 0.001,
"step": 737
},
{
"epoch": 0.9190535491905355,
"grad_norm": 0.05841754376888275,
"learning_rate": 2.2789539227895393e-05,
"loss": 0.001,
"step": 738
},
{
"epoch": 0.9202988792029888,
"grad_norm": 0.009739454835653305,
"learning_rate": 2.2820672478206725e-05,
"loss": 0.0002,
"step": 739
},
{
"epoch": 0.9215442092154421,
"grad_norm": 0.011922300793230534,
"learning_rate": 2.285180572851806e-05,
"loss": 0.0004,
"step": 740
},
{
"epoch": 0.9227895392278954,
"grad_norm": 0.05216851085424423,
"learning_rate": 2.288293897882939e-05,
"loss": 0.001,
"step": 741
},
{
"epoch": 0.9240348692403487,
"grad_norm": 0.007307402323931456,
"learning_rate": 2.2914072229140723e-05,
"loss": 0.0002,
"step": 742
},
{
"epoch": 0.925280199252802,
"grad_norm": 0.04301249235868454,
"learning_rate": 2.2945205479452055e-05,
"loss": 0.0005,
"step": 743
},
{
"epoch": 0.9265255292652553,
"grad_norm": 0.013793856836855412,
"learning_rate": 2.297633872976339e-05,
"loss": 0.0003,
"step": 744
},
{
"epoch": 0.9277708592777086,
"grad_norm": 0.1124817505478859,
"learning_rate": 2.3007471980074722e-05,
"loss": 0.0022,
"step": 745
},
{
"epoch": 0.9290161892901619,
"grad_norm": 0.005083655938506126,
"learning_rate": 2.3038605230386054e-05,
"loss": 0.0001,
"step": 746
},
{
"epoch": 0.9302615193026152,
"grad_norm": 0.005723627284169197,
"learning_rate": 2.3069738480697385e-05,
"loss": 0.0001,
"step": 747
},
{
"epoch": 0.9315068493150684,
"grad_norm": 0.08036380261182785,
"learning_rate": 2.310087173100872e-05,
"loss": 0.0014,
"step": 748
},
{
"epoch": 0.9327521793275217,
"grad_norm": 0.007362319156527519,
"learning_rate": 2.3132004981320052e-05,
"loss": 0.0002,
"step": 749
},
{
"epoch": 0.933997509339975,
"grad_norm": 1.5796531438827515,
"learning_rate": 2.3163138231631384e-05,
"loss": 0.0147,
"step": 750
},
{
"epoch": 0.9352428393524284,
"grad_norm": 0.038087982684373856,
"learning_rate": 2.3194271481942716e-05,
"loss": 0.0008,
"step": 751
},
{
"epoch": 0.9364881693648817,
"grad_norm": 0.005102880764752626,
"learning_rate": 2.322540473225405e-05,
"loss": 0.0001,
"step": 752
},
{
"epoch": 0.937733499377335,
"grad_norm": 306.6837158203125,
"learning_rate": 2.3256537982565383e-05,
"loss": 3.1504,
"step": 753
},
{
"epoch": 0.9389788293897883,
"grad_norm": 0.006043303292244673,
"learning_rate": 2.328767123287671e-05,
"loss": 0.0001,
"step": 754
},
{
"epoch": 0.9402241594022416,
"grad_norm": 0.027712326496839523,
"learning_rate": 2.3318804483188046e-05,
"loss": 0.0008,
"step": 755
},
{
"epoch": 0.9414694894146949,
"grad_norm": 0.015633290633559227,
"learning_rate": 2.3349937733499378e-05,
"loss": 0.0004,
"step": 756
},
{
"epoch": 0.9427148194271482,
"grad_norm": 0.007909745909273624,
"learning_rate": 2.338107098381071e-05,
"loss": 0.0002,
"step": 757
},
{
"epoch": 0.9439601494396015,
"grad_norm": 0.018452487885951996,
"learning_rate": 2.341220423412204e-05,
"loss": 0.0004,
"step": 758
},
{
"epoch": 0.9452054794520548,
"grad_norm": 0.010309605859220028,
"learning_rate": 2.3443337484433376e-05,
"loss": 0.0002,
"step": 759
},
{
"epoch": 0.9464508094645081,
"grad_norm": 0.005897897761315107,
"learning_rate": 2.3474470734744708e-05,
"loss": 0.0001,
"step": 760
},
{
"epoch": 0.9476961394769614,
"grad_norm": 0.024718550965189934,
"learning_rate": 2.350560398505604e-05,
"loss": 0.0007,
"step": 761
},
{
"epoch": 0.9489414694894147,
"grad_norm": 0.014151460491120815,
"learning_rate": 2.3536737235367372e-05,
"loss": 0.0004,
"step": 762
},
{
"epoch": 0.950186799501868,
"grad_norm": 0.05046864598989487,
"learning_rate": 2.3567870485678707e-05,
"loss": 0.0005,
"step": 763
},
{
"epoch": 0.9514321295143213,
"grad_norm": 0.05455144867300987,
"learning_rate": 2.359900373599004e-05,
"loss": 0.0006,
"step": 764
},
{
"epoch": 0.9526774595267746,
"grad_norm": 0.02435392327606678,
"learning_rate": 2.363013698630137e-05,
"loss": 0.0003,
"step": 765
},
{
"epoch": 0.9539227895392279,
"grad_norm": 0.025639377534389496,
"learning_rate": 2.3661270236612702e-05,
"loss": 0.0005,
"step": 766
},
{
"epoch": 0.9551681195516812,
"grad_norm": 0.015089256688952446,
"learning_rate": 2.3692403486924037e-05,
"loss": 0.0004,
"step": 767
},
{
"epoch": 0.9564134495641345,
"grad_norm": 0.032805927097797394,
"learning_rate": 2.372353673723537e-05,
"loss": 0.0006,
"step": 768
},
{
"epoch": 0.9576587795765878,
"grad_norm": 0.015525261871516705,
"learning_rate": 2.37546699875467e-05,
"loss": 0.0004,
"step": 769
},
{
"epoch": 0.958904109589041,
"grad_norm": 0.008337048813700676,
"learning_rate": 2.3785803237858036e-05,
"loss": 0.0002,
"step": 770
},
{
"epoch": 0.9601494396014943,
"grad_norm": 0.037120576947927475,
"learning_rate": 2.3816936488169368e-05,
"loss": 0.0004,
"step": 771
},
{
"epoch": 0.9613947696139477,
"grad_norm": 0.01175164058804512,
"learning_rate": 2.38480697384807e-05,
"loss": 0.0003,
"step": 772
},
{
"epoch": 0.962640099626401,
"grad_norm": 0.010447794571518898,
"learning_rate": 2.387920298879203e-05,
"loss": 0.0003,
"step": 773
},
{
"epoch": 0.9638854296388543,
"grad_norm": 0.010614910162985325,
"learning_rate": 2.3910336239103366e-05,
"loss": 0.0001,
"step": 774
},
{
"epoch": 0.9651307596513076,
"grad_norm": 0.07238447666168213,
"learning_rate": 2.3941469489414698e-05,
"loss": 0.0007,
"step": 775
},
{
"epoch": 0.9663760896637609,
"grad_norm": 0.03060179576277733,
"learning_rate": 2.3972602739726026e-05,
"loss": 0.0007,
"step": 776
},
{
"epoch": 0.9676214196762142,
"grad_norm": 0.08607795089483261,
"learning_rate": 2.400373599003736e-05,
"loss": 0.0004,
"step": 777
},
{
"epoch": 0.9688667496886675,
"grad_norm": 0.030211659148335457,
"learning_rate": 2.4034869240348693e-05,
"loss": 0.0003,
"step": 778
},
{
"epoch": 0.9701120797011208,
"grad_norm": 0.006784611847251654,
"learning_rate": 2.4066002490660025e-05,
"loss": 0.0002,
"step": 779
},
{
"epoch": 0.9713574097135741,
"grad_norm": 0.011817213147878647,
"learning_rate": 2.4097135740971357e-05,
"loss": 0.0003,
"step": 780
},
{
"epoch": 0.9726027397260274,
"grad_norm": 0.029583904892206192,
"learning_rate": 2.4128268991282692e-05,
"loss": 0.0004,
"step": 781
},
{
"epoch": 0.9738480697384807,
"grad_norm": 0.007558898068964481,
"learning_rate": 2.4159402241594024e-05,
"loss": 0.0003,
"step": 782
},
{
"epoch": 0.975093399750934,
"grad_norm": 481.3611755371094,
"learning_rate": 2.4190535491905355e-05,
"loss": 2.5255,
"step": 783
},
{
"epoch": 0.9763387297633873,
"grad_norm": 127.75431060791016,
"learning_rate": 2.4221668742216687e-05,
"loss": 0.0841,
"step": 784
},
{
"epoch": 0.9775840597758406,
"grad_norm": 0.01205628365278244,
"learning_rate": 2.4252801992528022e-05,
"loss": 0.0004,
"step": 785
},
{
"epoch": 0.9788293897882939,
"grad_norm": 411.4049377441406,
"learning_rate": 2.4283935242839354e-05,
"loss": 1.7384,
"step": 786
},
{
"epoch": 0.9800747198007472,
"grad_norm": 1.6122777462005615,
"learning_rate": 2.4315068493150686e-05,
"loss": 0.0018,
"step": 787
},
{
"epoch": 0.9813200498132005,
"grad_norm": 0.013621006160974503,
"learning_rate": 2.4346201743462018e-05,
"loss": 0.0004,
"step": 788
},
{
"epoch": 0.9825653798256538,
"grad_norm": 0.0152182187885046,
"learning_rate": 2.4377334993773353e-05,
"loss": 0.0003,
"step": 789
},
{
"epoch": 0.9838107098381071,
"grad_norm": 241.25070190429688,
"learning_rate": 2.4408468244084684e-05,
"loss": 0.1739,
"step": 790
},
{
"epoch": 0.9850560398505604,
"grad_norm": 0.009512806311249733,
"learning_rate": 2.4439601494396016e-05,
"loss": 0.0003,
"step": 791
},
{
"epoch": 0.9863013698630136,
"grad_norm": 12.394267082214355,
"learning_rate": 2.4470734744707348e-05,
"loss": 0.0218,
"step": 792
},
{
"epoch": 0.987546699875467,
"grad_norm": 0.008201587945222855,
"learning_rate": 2.4501867995018683e-05,
"loss": 0.0002,
"step": 793
},
{
"epoch": 0.9887920298879203,
"grad_norm": 0.049125440418720245,
"learning_rate": 2.4533001245330015e-05,
"loss": 0.0006,
"step": 794
},
{
"epoch": 0.9900373599003736,
"grad_norm": 0.0920347198843956,
"learning_rate": 2.4564134495641347e-05,
"loss": 0.001,
"step": 795
},
{
"epoch": 0.9912826899128269,
"grad_norm": 35.2567253112793,
"learning_rate": 2.4595267745952678e-05,
"loss": 0.0267,
"step": 796
},
{
"epoch": 0.9925280199252802,
"grad_norm": 0.01363935973495245,
"learning_rate": 2.4626400996264013e-05,
"loss": 0.0003,
"step": 797
},
{
"epoch": 0.9937733499377335,
"grad_norm": 0.009647058323025703,
"learning_rate": 2.4657534246575342e-05,
"loss": 0.0003,
"step": 798
},
{
"epoch": 0.9950186799501868,
"grad_norm": 0.005581174045801163,
"learning_rate": 2.4688667496886674e-05,
"loss": 0.0002,
"step": 799
},
{
"epoch": 0.9962640099626401,
"grad_norm": 0.006403461564332247,
"learning_rate": 2.471980074719801e-05,
"loss": 0.0002,
"step": 800
},
{
"epoch": 0.9975093399750934,
"grad_norm": 0.018721066415309906,
"learning_rate": 2.475093399750934e-05,
"loss": 0.0005,
"step": 801
},
{
"epoch": 0.9987546699875467,
"grad_norm": 0.0068865250796079636,
"learning_rate": 2.4782067247820672e-05,
"loss": 0.0002,
"step": 802
},
{
"epoch": 1.0,
"grad_norm": 148.17623901367188,
"learning_rate": 2.4813200498132004e-05,
"loss": 0.2457,
"step": 803
},
{
"epoch": 1.0,
"eval_accuracy": 0.9760765550239234,
"eval_f1_macro": 0.9768339768339769,
"eval_f1_micro": 0.9760765550239234,
"eval_f1_weighted": 0.9760457655194498,
"eval_loss": 0.244761124253273,
"eval_precision_macro": 0.978448275862069,
"eval_precision_micro": 0.9760765550239234,
"eval_precision_weighted": 0.978138920970137,
"eval_recall_macro": 0.9772727272727273,
"eval_recall_micro": 0.9760765550239234,
"eval_recall_weighted": 0.9760765550239234,
"eval_runtime": 29.9929,
"eval_samples_per_second": 6.968,
"eval_steps_per_second": 0.467,
"step": 803
}
],
"logging_steps": 1,
"max_steps": 16060,
"num_input_tokens_seen": 0,
"num_train_epochs": 20,
"save_steps": 500,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 5,
"early_stopping_threshold": 0.01
},
"attributes": {
"early_stopping_patience_counter": 0
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 2.394707013520589e+16,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}