{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.989937106918239,
"eval_steps": 50000,
"global_step": 594,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.010062893081761006,
"grad_norm": 1.6687748432159424,
"learning_rate": 6.666666666666667e-08,
"loss": 0.369,
"step": 2
},
{
"epoch": 0.02012578616352201,
"grad_norm": 1.5292283296585083,
"learning_rate": 1.3333333333333334e-07,
"loss": 0.3732,
"step": 4
},
{
"epoch": 0.03018867924528302,
"grad_norm": 1.5824713706970215,
"learning_rate": 2e-07,
"loss": -0.1619,
"step": 6
},
{
"epoch": 0.04025157232704402,
"grad_norm": 4.105996608734131,
"learning_rate": 2.6666666666666667e-07,
"loss": 0.2398,
"step": 8
},
{
"epoch": 0.050314465408805034,
"grad_norm": 1.4400302171707153,
"learning_rate": 3.333333333333333e-07,
"loss": -0.5621,
"step": 10
},
{
"epoch": 0.06037735849056604,
"grad_norm": 2.3048486709594727,
"learning_rate": 4e-07,
"loss": -0.6602,
"step": 12
},
{
"epoch": 0.07044025157232704,
"grad_norm": 2.4866607189178467,
"learning_rate": 4.6666666666666666e-07,
"loss": -1.24,
"step": 14
},
{
"epoch": 0.08050314465408805,
"grad_norm": 3.4124677181243896,
"learning_rate": 5.333333333333333e-07,
"loss": 0.462,
"step": 16
},
{
"epoch": 0.09056603773584905,
"grad_norm": 1.5936415195465088,
"learning_rate": 6e-07,
"loss": -0.0692,
"step": 18
},
{
"epoch": 0.10062893081761007,
"grad_norm": 1.9987062215805054,
"learning_rate": 6.666666666666666e-07,
"loss": 0.5051,
"step": 20
},
{
"epoch": 0.11069182389937107,
"grad_norm": 2.565603017807007,
"learning_rate": 7.333333333333332e-07,
"loss": -0.0248,
"step": 22
},
{
"epoch": 0.12075471698113208,
"grad_norm": 3.2282676696777344,
"learning_rate": 8e-07,
"loss": -0.6335,
"step": 24
},
{
"epoch": 0.13081761006289308,
"grad_norm": 1.868457555770874,
"learning_rate": 8.666666666666667e-07,
"loss": -0.8462,
"step": 26
},
{
"epoch": 0.14088050314465408,
"grad_norm": 2.7205371856689453,
"learning_rate": 9.333333333333333e-07,
"loss": 2.6132,
"step": 28
},
{
"epoch": 0.1509433962264151,
"grad_norm": 3.2904088497161865,
"learning_rate": 1e-06,
"loss": 0.4139,
"step": 30
},
{
"epoch": 0.1610062893081761,
"grad_norm": 1.7929654121398926,
"learning_rate": 1.0666666666666667e-06,
"loss": 1.9297,
"step": 32
},
{
"epoch": 0.1710691823899371,
"grad_norm": 2.788813591003418,
"learning_rate": 1.1333333333333332e-06,
"loss": -1.4279,
"step": 34
},
{
"epoch": 0.1811320754716981,
"grad_norm": 1.792971134185791,
"learning_rate": 1.2e-06,
"loss": 0.1433,
"step": 36
},
{
"epoch": 0.19119496855345913,
"grad_norm": 2.238489866256714,
"learning_rate": 1.2666666666666665e-06,
"loss": 0.3927,
"step": 38
},
{
"epoch": 0.20125786163522014,
"grad_norm": 2.905518054962158,
"learning_rate": 1.3333333333333332e-06,
"loss": 1.079,
"step": 40
},
{
"epoch": 0.21132075471698114,
"grad_norm": 1.6354607343673706,
"learning_rate": 1.4e-06,
"loss": 0.1258,
"step": 42
},
{
"epoch": 0.22138364779874214,
"grad_norm": 2.0974748134613037,
"learning_rate": 1.4666666666666665e-06,
"loss": 0.0546,
"step": 44
},
{
"epoch": 0.23144654088050315,
"grad_norm": 1.619780421257019,
"learning_rate": 1.5333333333333334e-06,
"loss": -1.0396,
"step": 46
},
{
"epoch": 0.24150943396226415,
"grad_norm": 1.9667820930480957,
"learning_rate": 1.6e-06,
"loss": -0.4011,
"step": 48
},
{
"epoch": 0.25157232704402516,
"grad_norm": 1.9112639427185059,
"learning_rate": 1.6666666666666667e-06,
"loss": 0.8607,
"step": 50
},
{
"epoch": 0.26163522012578616,
"grad_norm": 2.6148829460144043,
"learning_rate": 1.7333333333333334e-06,
"loss": 0.6988,
"step": 52
},
{
"epoch": 0.27169811320754716,
"grad_norm": 2.6693756580352783,
"learning_rate": 1.8e-06,
"loss": -1.0175,
"step": 54
},
{
"epoch": 0.28176100628930817,
"grad_norm": 2.0184097290039062,
"learning_rate": 1.8666666666666667e-06,
"loss": -0.1263,
"step": 56
},
{
"epoch": 0.2918238993710692,
"grad_norm": 1.4805622100830078,
"learning_rate": 1.933333333333333e-06,
"loss": -0.4554,
"step": 58
},
{
"epoch": 0.3018867924528302,
"grad_norm": 1.6097267866134644,
"learning_rate": 2e-06,
"loss": 0.5408,
"step": 60
},
{
"epoch": 0.3119496855345912,
"grad_norm": 1.720683217048645,
"learning_rate": 1.9999307783070657e-06,
"loss": 1.3892,
"step": 62
},
{
"epoch": 0.3220125786163522,
"grad_norm": 2.825670003890991,
"learning_rate": 1.999723122811548e-06,
"loss": 0.9162,
"step": 64
},
{
"epoch": 0.3320754716981132,
"grad_norm": 2.550844430923462,
"learning_rate": 1.9993770622619783e-06,
"loss": -0.1783,
"step": 66
},
{
"epoch": 0.3421383647798742,
"grad_norm": 2.4842543601989746,
"learning_rate": 1.998892644568149e-06,
"loss": -1.0679,
"step": 68
},
{
"epoch": 0.3522012578616352,
"grad_norm": 1.9450500011444092,
"learning_rate": 1.9982699367944866e-06,
"loss": 1.4075,
"step": 70
},
{
"epoch": 0.3622641509433962,
"grad_norm": 2.419877052307129,
"learning_rate": 1.9975090251507638e-06,
"loss": -0.5993,
"step": 72
},
{
"epoch": 0.3723270440251572,
"grad_norm": 1.7247552871704102,
"learning_rate": 1.9966100149801647e-06,
"loss": 1.2249,
"step": 74
},
{
"epoch": 0.38238993710691827,
"grad_norm": 2.8694651126861572,
"learning_rate": 1.995573030744701e-06,
"loss": 0.279,
"step": 76
},
{
"epoch": 0.39245283018867927,
"grad_norm": 3.444533586502075,
"learning_rate": 1.994398216007982e-06,
"loss": 2.4944,
"step": 78
},
{
"epoch": 0.4025157232704403,
"grad_norm": 1.145507574081421,
"learning_rate": 1.993085733415337e-06,
"loss": -0.0775,
"step": 80
},
{
"epoch": 0.4125786163522013,
"grad_norm": 2.018376111984253,
"learning_rate": 1.9916357646713006e-06,
"loss": -0.1244,
"step": 82
},
{
"epoch": 0.4226415094339623,
"grad_norm": 3.317014694213867,
"learning_rate": 1.9900485105144544e-06,
"loss": -0.5761,
"step": 84
},
{
"epoch": 0.4327044025157233,
"grad_norm": 1.426088809967041,
"learning_rate": 1.9883241906896385e-06,
"loss": 1.364,
"step": 86
},
{
"epoch": 0.4427672955974843,
"grad_norm": 2.031130790710449,
"learning_rate": 1.986463043917528e-06,
"loss": 0.9214,
"step": 88
},
{
"epoch": 0.4528301886792453,
"grad_norm": 2.133758068084717,
"learning_rate": 1.984465327861583e-06,
"loss": -1.4531,
"step": 90
},
{
"epoch": 0.4628930817610063,
"grad_norm": 2.5162205696105957,
"learning_rate": 1.9823313190923794e-06,
"loss": -0.7078,
"step": 92
},
{
"epoch": 0.4729559748427673,
"grad_norm": 1.5902796983718872,
"learning_rate": 1.980061313049315e-06,
"loss": -1.3553,
"step": 94
},
{
"epoch": 0.4830188679245283,
"grad_norm": 2.366024971008301,
"learning_rate": 1.9776556239997142e-06,
"loss": 0.4744,
"step": 96
},
{
"epoch": 0.4930817610062893,
"grad_norm": 2.211918354034424,
"learning_rate": 1.975114584995313e-06,
"loss": 0.532,
"step": 98
},
{
"epoch": 0.5031446540880503,
"grad_norm": 1.664931058883667,
"learning_rate": 1.972438547826156e-06,
"loss": -0.5974,
"step": 100
},
{
"epoch": 0.5132075471698113,
"grad_norm": 2.5771172046661377,
"learning_rate": 1.969627882971888e-06,
"loss": -0.4213,
"step": 102
},
{
"epoch": 0.5232704402515723,
"grad_norm": 3.083601236343384,
"learning_rate": 1.9666829795504693e-06,
"loss": -1.491,
"step": 104
},
{
"epoch": 0.5333333333333333,
"grad_norm": 3.069186210632324,
"learning_rate": 1.9636042452643e-06,
"loss": -0.6719,
"step": 106
},
{
"epoch": 0.5433962264150943,
"grad_norm": 1.642295479774475,
"learning_rate": 1.960392106343779e-06,
"loss": -0.8876,
"step": 108
},
{
"epoch": 0.5534591194968553,
"grad_norm": 2.7487986087799072,
"learning_rate": 1.9570470074882946e-06,
"loss": -0.8838,
"step": 110
},
{
"epoch": 0.5635220125786163,
"grad_norm": 4.342981338500977,
"learning_rate": 1.9535694118046583e-06,
"loss": 0.6486,
"step": 112
},
{
"epoch": 0.5735849056603773,
"grad_norm": 2.6165924072265625,
"learning_rate": 1.949959800742991e-06,
"loss": 0.901,
"step": 114
},
{
"epoch": 0.5836477987421383,
"grad_norm": 3.7529544830322266,
"learning_rate": 1.9462186740300695e-06,
"loss": -1.5828,
"step": 116
},
{
"epoch": 0.5937106918238994,
"grad_norm": 0.95662921667099,
"learning_rate": 1.942346549600144e-06,
"loss": -1.2115,
"step": 118
},
{
"epoch": 0.6037735849056604,
"grad_norm": 3.2608375549316406,
"learning_rate": 1.9383439635232293e-06,
"loss": 1.1846,
"step": 120
},
{
"epoch": 0.6138364779874214,
"grad_norm": 2.937685966491699,
"learning_rate": 1.9342114699308956e-06,
"loss": 0.5849,
"step": 122
},
{
"epoch": 0.6238993710691824,
"grad_norm": 3.030308485031128,
"learning_rate": 1.929949640939548e-06,
"loss": 1.0768,
"step": 124
},
{
"epoch": 0.6339622641509434,
"grad_norm": 1.6450515985488892,
"learning_rate": 1.925559066571221e-06,
"loss": -0.815,
"step": 126
},
{
"epoch": 0.6440251572327044,
"grad_norm": 4.359044075012207,
"learning_rate": 1.9210403546718966e-06,
"loss": 1.4768,
"step": 128
},
{
"epoch": 0.6540880503144654,
"grad_norm": 2.591158628463745,
"learning_rate": 1.91639413082735e-06,
"loss": 0.4688,
"step": 130
},
{
"epoch": 0.6641509433962264,
"grad_norm": 3.594324827194214,
"learning_rate": 1.9116210382765418e-06,
"loss": -0.4207,
"step": 132
},
{
"epoch": 0.6742138364779874,
"grad_norm": 4.136204242706299,
"learning_rate": 1.9067217378225652e-06,
"loss": -1.2546,
"step": 134
},
{
"epoch": 0.6842767295597484,
"grad_norm": 3.1914331912994385,
"learning_rate": 1.9016969077411645e-06,
"loss": -1.6023,
"step": 136
},
{
"epoch": 0.6943396226415094,
"grad_norm": 2.6611359119415283,
"learning_rate": 1.8965472436868284e-06,
"loss": 0.0919,
"step": 138
},
{
"epoch": 0.7044025157232704,
"grad_norm": 3.068580150604248,
"learning_rate": 1.8912734585964855e-06,
"loss": 0.3057,
"step": 140
},
{
"epoch": 0.7144654088050314,
"grad_norm": 7.307640552520752,
"learning_rate": 1.8858762825907997e-06,
"loss": 1.6571,
"step": 142
},
{
"epoch": 0.7245283018867924,
"grad_norm": 2.129241943359375,
"learning_rate": 1.8803564628730913e-06,
"loss": 0.5422,
"step": 144
},
{
"epoch": 0.7345911949685534,
"grad_norm": 3.773325204849243,
"learning_rate": 1.8747147636258916e-06,
"loss": 0.7144,
"step": 146
},
{
"epoch": 0.7446540880503144,
"grad_norm": 1.3420393466949463,
"learning_rate": 1.8689519659051466e-06,
"loss": -1.1075,
"step": 148
},
{
"epoch": 0.7547169811320755,
"grad_norm": 6.70538854598999,
"learning_rate": 1.8630688675320841e-06,
"loss": -1.9595,
"step": 150
},
{
"epoch": 0.7647798742138365,
"grad_norm": 4.187305927276611,
"learning_rate": 1.857066282982763e-06,
"loss": -0.5234,
"step": 152
},
{
"epoch": 0.7748427672955975,
"grad_norm": 2.975940465927124,
"learning_rate": 1.850945043275312e-06,
"loss": -0.3984,
"step": 154
},
{
"epoch": 0.7849056603773585,
"grad_norm": 2.44286847114563,
"learning_rate": 1.844705995854882e-06,
"loss": 1.109,
"step": 156
},
{
"epoch": 0.7949685534591195,
"grad_norm": 12.523564338684082,
"learning_rate": 1.8383500044763226e-06,
"loss": -2.0379,
"step": 158
},
{
"epoch": 0.8050314465408805,
"grad_norm": 4.5152716636657715,
"learning_rate": 1.8318779490846e-06,
"loss": -0.6498,
"step": 160
},
{
"epoch": 0.8150943396226416,
"grad_norm": 2.565892457962036,
"learning_rate": 1.8252907256929774e-06,
"loss": 0.039,
"step": 162
},
{
"epoch": 0.8251572327044026,
"grad_norm": 3.789813756942749,
"learning_rate": 1.8185892462589636e-06,
"loss": -0.0521,
"step": 164
},
{
"epoch": 0.8352201257861636,
"grad_norm": 4.709334373474121,
"learning_rate": 1.8117744385580623e-06,
"loss": -0.7899,
"step": 166
},
{
"epoch": 0.8452830188679246,
"grad_norm": 2.444716453552246,
"learning_rate": 1.8048472460553256e-06,
"loss": 0.2275,
"step": 168
},
{
"epoch": 0.8553459119496856,
"grad_norm": 2.314274549484253,
"learning_rate": 1.7978086277747379e-06,
"loss": -0.9168,
"step": 170
},
{
"epoch": 0.8654088050314466,
"grad_norm": 3.4260716438293457,
"learning_rate": 1.7906595581664461e-06,
"loss": -0.6274,
"step": 172
},
{
"epoch": 0.8754716981132076,
"grad_norm": 2.7144453525543213,
"learning_rate": 1.7834010269718524e-06,
"loss": -0.9649,
"step": 174
},
{
"epoch": 0.8855345911949686,
"grad_norm": 3.8050897121429443,
"learning_rate": 1.7760340390865917e-06,
"loss": -0.262,
"step": 176
},
{
"epoch": 0.8955974842767296,
"grad_norm": 2.8164639472961426,
"learning_rate": 1.7685596144214107e-06,
"loss": -1.1909,
"step": 178
},
{
"epoch": 0.9056603773584906,
"grad_norm": 4.633458614349365,
"learning_rate": 1.7609787877609676e-06,
"loss": 0.4428,
"step": 180
},
{
"epoch": 0.9157232704402516,
"grad_norm": 2.8389792442321777,
"learning_rate": 1.7532926086205726e-06,
"loss": -0.5821,
"step": 182
},
{
"epoch": 0.9257861635220126,
"grad_norm": 2.226238965988159,
"learning_rate": 1.7455021411008906e-06,
"loss": 0.3515,
"step": 184
},
{
"epoch": 0.9358490566037736,
"grad_norm": 2.591329336166382,
"learning_rate": 1.737608463740622e-06,
"loss": -0.306,
"step": 186
},
{
"epoch": 0.9459119496855346,
"grad_norm": 3.7576334476470947,
"learning_rate": 1.7296126693671882e-06,
"loss": 0.1704,
"step": 188
},
{
"epoch": 0.9559748427672956,
"grad_norm": 2.887920618057251,
"learning_rate": 1.7215158649454346e-06,
"loss": -0.2494,
"step": 190
},
{
"epoch": 0.9660377358490566,
"grad_norm": 4.349538326263428,
"learning_rate": 1.7133191714243802e-06,
"loss": 2.3405,
"step": 192
},
{
"epoch": 0.9761006289308176,
"grad_norm": 4.317368984222412,
"learning_rate": 1.7050237235820287e-06,
"loss": 0.4566,
"step": 194
},
{
"epoch": 0.9861635220125786,
"grad_norm": 5.087897300720215,
"learning_rate": 1.696630669868267e-06,
"loss": 0.1502,
"step": 196
},
{
"epoch": 0.9962264150943396,
"grad_norm": 4.70991325378418,
"learning_rate": 1.6881411722458687e-06,
"loss": -0.3574,
"step": 198
},
{
"epoch": 1.0069182389937108,
"grad_norm": 2.966017007827759,
"learning_rate": 1.6795564060296292e-06,
"loss": 0.9311,
"step": 200
},
{
"epoch": 1.0169811320754718,
"grad_norm": 2.300924777984619,
"learning_rate": 1.6708775597236505e-06,
"loss": 0.2717,
"step": 202
},
{
"epoch": 1.0270440251572328,
"grad_norm": 6.384905815124512,
"learning_rate": 1.6621058348568004e-06,
"loss": -0.0504,
"step": 204
},
{
"epoch": 1.0371069182389938,
"grad_norm": 4.002950668334961,
"learning_rate": 1.6532424458163691e-06,
"loss": -0.2334,
"step": 206
},
{
"epoch": 1.0471698113207548,
"grad_norm": 12.800736427307129,
"learning_rate": 1.6442886196799464e-06,
"loss": -1.2455,
"step": 208
},
{
"epoch": 1.0572327044025158,
"grad_norm": 5.464755535125732,
"learning_rate": 1.6352455960455384e-06,
"loss": 1.8264,
"step": 210
},
{
"epoch": 1.0672955974842768,
"grad_norm": 5.672085762023926,
"learning_rate": 1.6261146268599562e-06,
"loss": -1.0013,
"step": 212
},
{
"epoch": 1.0773584905660378,
"grad_norm": 4.908372402191162,
"learning_rate": 1.6168969762454894e-06,
"loss": -1.0382,
"step": 214
},
{
"epoch": 1.0874213836477988,
"grad_norm": 7.087652683258057,
"learning_rate": 1.607593920324899e-06,
"loss": -0.4295,
"step": 216
},
{
"epoch": 1.0974842767295598,
"grad_norm": 3.5187363624572754,
"learning_rate": 1.5982067470447458e-06,
"loss": -0.0398,
"step": 218
},
{
"epoch": 1.1075471698113208,
"grad_norm": 2.593596935272217,
"learning_rate": 1.5887367559970822e-06,
"loss": 0.7915,
"step": 220
},
{
"epoch": 1.1176100628930818,
"grad_norm": 6.099729061126709,
"learning_rate": 1.5791852582395332e-06,
"loss": -1.0834,
"step": 222
},
{
"epoch": 1.1276729559748428,
"grad_norm": 6.590648174285889,
"learning_rate": 1.5695535761137888e-06,
"loss": 0.9158,
"step": 224
},
{
"epoch": 1.1377358490566039,
"grad_norm": 5.639819145202637,
"learning_rate": 1.5598430430625333e-06,
"loss": -1.5288,
"step": 226
},
{
"epoch": 1.1477987421383649,
"grad_norm": 3.02219820022583,
"learning_rate": 1.550055003444841e-06,
"loss": -0.0297,
"step": 228
},
{
"epoch": 1.1578616352201259,
"grad_norm": 6.338824272155762,
"learning_rate": 1.5401908123500586e-06,
"loss": -0.7611,
"step": 230
},
{
"epoch": 1.1679245283018869,
"grad_norm": 3.917799949645996,
"learning_rate": 1.530251835410199e-06,
"loss": 0.4777,
"step": 232
},
{
"epoch": 1.1779874213836479,
"grad_norm": 6.309770584106445,
"learning_rate": 1.520239448610882e-06,
"loss": 1.729,
"step": 234
},
{
"epoch": 1.1880503144654089,
"grad_norm": 1.9973816871643066,
"learning_rate": 1.5101550381008375e-06,
"loss": -1.5997,
"step": 236
},
{
"epoch": 1.1981132075471699,
"grad_norm": 6.434890270233154,
"learning_rate": 1.5e-06,
"loss": -1.5788,
"step": 238
},
{
"epoch": 1.2081761006289309,
"grad_norm": 2.8913328647613525,
"learning_rate": 1.4897757402062284e-06,
"loss": 0.2666,
"step": 240
},
{
"epoch": 1.2182389937106919,
"grad_norm": 5.833925724029541,
"learning_rate": 1.4794836742006664e-06,
"loss": 0.969,
"step": 242
},
{
"epoch": 1.228301886792453,
"grad_norm": 3.047639846801758,
"learning_rate": 1.4691252268517794e-06,
"loss": -0.7864,
"step": 244
},
{
"epoch": 1.238364779874214,
"grad_norm": 11.185049057006836,
"learning_rate": 1.4587018322180904e-06,
"loss": -1.8447,
"step": 246
},
{
"epoch": 1.248427672955975,
"grad_norm": 3.9488909244537354,
"learning_rate": 1.4482149333496455e-06,
"loss": 1.3762,
"step": 248
},
{
"epoch": 1.258490566037736,
"grad_norm": 8.695211410522461,
"learning_rate": 1.4376659820882306e-06,
"loss": 2.1336,
"step": 250
},
{
"epoch": 1.268553459119497,
"grad_norm": 6.01567268371582,
"learning_rate": 1.427056438866376e-06,
"loss": -0.8317,
"step": 252
},
{
"epoch": 1.278616352201258,
"grad_norm": 4.584295272827148,
"learning_rate": 1.4163877725051677e-06,
"loss": 0.409,
"step": 254
},
{
"epoch": 1.288679245283019,
"grad_norm": 5.3349480628967285,
"learning_rate": 1.4056614600108995e-06,
"loss": 0.106,
"step": 256
},
{
"epoch": 1.29874213836478,
"grad_norm": 2.8550000190734863,
"learning_rate": 1.3948789863705913e-06,
"loss": 0.6895,
"step": 258
},
{
"epoch": 1.308805031446541,
"grad_norm": 6.208876132965088,
"learning_rate": 1.3840418443464013e-06,
"loss": -0.5366,
"step": 260
},
{
"epoch": 1.318867924528302,
"grad_norm": 4.392048358917236,
"learning_rate": 1.3731515342689651e-06,
"loss": 0.9175,
"step": 262
},
{
"epoch": 1.328930817610063,
"grad_norm": 5.677616596221924,
"learning_rate": 1.3622095638296825e-06,
"loss": -0.8256,
"step": 264
},
{
"epoch": 1.338993710691824,
"grad_norm": 3.6334376335144043,
"learning_rate": 1.3512174478719892e-06,
"loss": -1.949,
"step": 266
},
{
"epoch": 1.349056603773585,
"grad_norm": 4.466569423675537,
"learning_rate": 1.3401767081816368e-06,
"loss": 1.0635,
"step": 268
},
{
"epoch": 1.359119496855346,
"grad_norm": 6.331056594848633,
"learning_rate": 1.32908887327601e-06,
"loss": -0.801,
"step": 270
},
{
"epoch": 1.369182389937107,
"grad_norm": 5.03653621673584,
"learning_rate": 1.317955478192515e-06,
"loss": -0.2086,
"step": 272
},
{
"epoch": 1.379245283018868,
"grad_norm": 2.39367413520813,
"learning_rate": 1.3067780642760637e-06,
"loss": -1.0548,
"step": 274
},
{
"epoch": 1.389308176100629,
"grad_norm": 6.588123321533203,
"learning_rate": 1.295558178965684e-06,
"loss": 1.0341,
"step": 276
},
{
"epoch": 1.39937106918239,
"grad_norm": 3.3789021968841553,
"learning_rate": 1.284297375580287e-06,
"loss": -0.1079,
"step": 278
},
{
"epoch": 1.409433962264151,
"grad_norm": 4.275945663452148,
"learning_rate": 1.272997213103621e-06,
"loss": 1.3644,
"step": 280
},
{
"epoch": 1.419496855345912,
"grad_norm": 5.876030921936035,
"learning_rate": 1.2616592559684408e-06,
"loss": -1.5156,
"step": 282
},
{
"epoch": 1.429559748427673,
"grad_norm": 3.4462649822235107,
"learning_rate": 1.2502850738399199e-06,
"loss": 0.2908,
"step": 284
},
{
"epoch": 1.439622641509434,
"grad_norm": 3.7064943313598633,
"learning_rate": 1.2388762413983444e-06,
"loss": -1.058,
"step": 286
},
{
"epoch": 1.449685534591195,
"grad_norm": 4.951382637023926,
"learning_rate": 1.2274343381211066e-06,
"loss": 0.4712,
"step": 288
},
{
"epoch": 1.459748427672956,
"grad_norm": 4.248599052429199,
"learning_rate": 1.215960948064036e-06,
"loss": 0.1037,
"step": 290
},
{
"epoch": 1.469811320754717,
"grad_norm": 4.509840488433838,
"learning_rate": 1.2044576596421002e-06,
"loss": 0.6964,
"step": 292
},
{
"epoch": 1.479874213836478,
"grad_norm": 1.8829210996627808,
"learning_rate": 1.1929260654094969e-06,
"loss": -0.0571,
"step": 294
},
{
"epoch": 1.489937106918239,
"grad_norm": 6.426050662994385,
"learning_rate": 1.1813677618391757e-06,
"loss": 0.5038,
"step": 296
},
{
"epoch": 1.5,
"grad_norm": 3.1166653633117676,
"learning_rate": 1.1697843491018187e-06,
"loss": -1.3007,
"step": 298
},
{
"epoch": 1.510062893081761,
"grad_norm": 2.824904680252075,
"learning_rate": 1.1581774308443039e-06,
"loss": 0.6687,
"step": 300
},
{
"epoch": 1.520125786163522,
"grad_norm": 1.3138232231140137,
"learning_rate": 1.1465486139676953e-06,
"loss": 0.8043,
"step": 302
},
{
"epoch": 1.530188679245283,
"grad_norm": 3.3225157260894775,
"learning_rate": 1.1348995084047749e-06,
"loss": 0.5529,
"step": 304
},
{
"epoch": 1.540251572327044,
"grad_norm": 5.321311950683594,
"learning_rate": 1.1232317268971584e-06,
"loss": 0.1101,
"step": 306
},
{
"epoch": 1.550314465408805,
"grad_norm": 10.030771255493164,
"learning_rate": 1.1115468847720245e-06,
"loss": -0.9142,
"step": 308
},
{
"epoch": 1.560377358490566,
"grad_norm": 2.3845436573028564,
"learning_rate": 1.0998465997184796e-06,
"loss": 0.6053,
"step": 310
},
{
"epoch": 1.570440251572327,
"grad_norm": 3.853327512741089,
"learning_rate": 1.0881324915636018e-06,
"loss": 0.1398,
"step": 312
},
{
"epoch": 1.580503144654088,
"grad_norm": 2.7320926189422607,
"learning_rate": 1.076406182048187e-06,
"loss": -1.7586,
"step": 314
},
{
"epoch": 1.590566037735849,
"grad_norm": 2.23327374458313,
"learning_rate": 1.0646692946022285e-06,
"loss": -0.8936,
"step": 316
},
{
"epoch": 1.60062893081761,
"grad_norm": 6.662895679473877,
"learning_rate": 1.0529234541201631e-06,
"loss": 1.1678,
"step": 318
},
{
"epoch": 1.610691823899371,
"grad_norm": 2.96289324760437,
"learning_rate": 1.0411702867359178e-06,
"loss": -0.3086,
"step": 320
},
{
"epoch": 1.620754716981132,
"grad_norm": 2.9261276721954346,
"learning_rate": 1.0294114195977794e-06,
"loss": 0.7558,
"step": 322
},
{
"epoch": 1.630817610062893,
"grad_norm": 3.917189598083496,
"learning_rate": 1.0176484806431287e-06,
"loss": 0.1406,
"step": 324
},
{
"epoch": 1.640880503144654,
"grad_norm": 8.924764633178711,
"learning_rate": 1.0058830983730622e-06,
"loss": -3.2015,
"step": 326
},
{
"epoch": 1.650943396226415,
"grad_norm": 3.501892328262329,
"learning_rate": 9.94116901626938e-07,
"loss": -1.6323,
"step": 328
},
{
"epoch": 1.661006289308176,
"grad_norm": 2.972134828567505,
"learning_rate": 9.823515193568714e-07,
"loss": -1.4688,
"step": 330
},
{
"epoch": 1.671069182389937,
"grad_norm": 6.309866428375244,
"learning_rate": 9.705885804022205e-07,
"loss": 0.4812,
"step": 332
},
{
"epoch": 1.681132075471698,
"grad_norm": 4.435581207275391,
"learning_rate": 9.588297132640824e-07,
"loss": 0.0122,
"step": 334
},
{
"epoch": 1.691194968553459,
"grad_norm": 4.168426513671875,
"learning_rate": 9.470765458798368e-07,
"loss": -0.787,
"step": 336
},
{
"epoch": 1.70125786163522,
"grad_norm": 3.8862287998199463,
"learning_rate": 9.353307053977715e-07,
"loss": -0.3479,
"step": 338
},
{
"epoch": 1.711320754716981,
"grad_norm": 4.058013439178467,
"learning_rate": 9.23593817951813e-07,
"loss": 0.7891,
"step": 340
},
{
"epoch": 1.721383647798742,
"grad_norm": 9.581009864807129,
"learning_rate": 9.118675084363985e-07,
"loss": -0.5769,
"step": 342
},
{
"epoch": 1.731446540880503,
"grad_norm": 4.200214862823486,
"learning_rate": 9.001534002815207e-07,
"loss": -1.3016,
"step": 344
},
{
"epoch": 1.741509433962264,
"grad_norm": 2.9621429443359375,
"learning_rate": 8.884531152279755e-07,
"loss": -1.772,
"step": 346
},
{
"epoch": 1.751572327044025,
"grad_norm": 3.36149001121521,
"learning_rate": 8.767682731028414e-07,
"loss": -0.7338,
"step": 348
},
{
"epoch": 1.761635220125786,
"grad_norm": 3.888066053390503,
"learning_rate": 8.651004915952252e-07,
"loss": -0.5376,
"step": 350
},
{
"epoch": 1.771698113207547,
"grad_norm": 2.9135375022888184,
"learning_rate": 8.534513860323045e-07,
"loss": -0.2755,
"step": 352
},
{
"epoch": 1.7817610062893081,
"grad_norm": 2.2403316497802734,
"learning_rate": 8.41822569155696e-07,
"loss": -0.5882,
"step": 354
},
{
"epoch": 1.7918238993710691,
"grad_norm": 6.112231731414795,
"learning_rate": 8.302156508981815e-07,
"loss": 0.1197,
"step": 356
},
{
"epoch": 1.8018867924528301,
"grad_norm": 6.92394495010376,
"learning_rate": 8.18632238160824e-07,
"loss": 0.122,
"step": 358
},
{
"epoch": 1.8119496855345911,
"grad_norm": 8.573149681091309,
"learning_rate": 8.070739345905031e-07,
"loss": -1.2034,
"step": 360
},
{
"epoch": 1.8220125786163521,
"grad_norm": 3.436896562576294,
"learning_rate": 7.955423403578997e-07,
"loss": -0.336,
"step": 362
},
{
"epoch": 1.8320754716981131,
"grad_norm": 3.0969924926757812,
"learning_rate": 7.840390519359643e-07,
"loss": -0.6976,
"step": 364
},
{
"epoch": 1.8421383647798741,
"grad_norm": 3.821650266647339,
"learning_rate": 7.725656618788937e-07,
"loss": -1.231,
"step": 366
},
{
"epoch": 1.8522012578616351,
"grad_norm": 3.3464226722717285,
"learning_rate": 7.611237586016557e-07,
"loss": 0.8503,
"step": 368
},
{
"epoch": 1.8622641509433961,
"grad_norm": 3.881531238555908,
"learning_rate": 7.497149261600802e-07,
"loss": 0.3178,
"step": 370
},
{
"epoch": 1.8723270440251572,
"grad_norm": 1.9269695281982422,
"learning_rate": 7.383407440315595e-07,
"loss": -0.2027,
"step": 372
},
{
"epoch": 1.8823899371069182,
"grad_norm": 11.40230941772461,
"learning_rate": 7.27002786896379e-07,
"loss": -0.1666,
"step": 374
},
{
"epoch": 1.8924528301886792,
"grad_norm": 2.309051752090454,
"learning_rate": 7.157026244197131e-07,
"loss": -0.0113,
"step": 376
},
{
"epoch": 1.9025157232704402,
"grad_norm": 13.750130653381348,
"learning_rate": 7.044418210343159e-07,
"loss": -0.5592,
"step": 378
},
{
"epoch": 1.9125786163522012,
"grad_norm": 2.372840166091919,
"learning_rate": 6.932219357239361e-07,
"loss": -0.173,
"step": 380
},
{
"epoch": 1.9226415094339622,
"grad_norm": 11.330310821533203,
"learning_rate": 6.820445218074848e-07,
"loss": -1.36,
"step": 382
},
{
"epoch": 1.9327044025157232,
"grad_norm": 7.450850009918213,
"learning_rate": 6.7091112672399e-07,
"loss": -1.447,
"step": 384
},
{
"epoch": 1.9427672955974842,
"grad_norm": 12.863826751708984,
"learning_rate": 6.598232918183631e-07,
"loss": 1.0882,
"step": 386
},
{
"epoch": 1.9528301886792452,
"grad_norm": 5.197085380554199,
"learning_rate": 6.487825521280108e-07,
"loss": -0.2821,
"step": 388
},
{
"epoch": 1.9628930817610062,
"grad_norm": 2.8584909439086914,
"learning_rate": 6.377904361703177e-07,
"loss": 0.6447,
"step": 390
},
{
"epoch": 1.9729559748427672,
"grad_norm": 9.712791442871094,
"learning_rate": 6.26848465731035e-07,
"loss": 1.5534,
"step": 392
},
{
"epoch": 1.9830188679245282,
"grad_norm": 8.965962409973145,
"learning_rate": 6.159581556535987e-07,
"loss": 1.1777,
"step": 394
},
{
"epoch": 1.9930817610062892,
"grad_norm": 2.6333396434783936,
"learning_rate": 6.051210136294088e-07,
"loss": 0.6377,
"step": 396
},
{
"epoch": 2.0037735849056606,
"grad_norm": 4.632491588592529,
"learning_rate": 5.943385399891003e-07,
"loss": 0.7307,
"step": 398
},
{
"epoch": 2.0138364779874216,
"grad_norm": 4.375370979309082,
"learning_rate": 5.836122274948324e-07,
"loss": 1.2132,
"step": 400
},
{
"epoch": 2.0238993710691826,
"grad_norm": 3.335942268371582,
"learning_rate": 5.729435611336239e-07,
"loss": -0.5918,
"step": 402
},
{
"epoch": 2.0339622641509436,
"grad_norm": 6.7062506675720215,
"learning_rate": 5.623340179117694e-07,
"loss": -0.9562,
"step": 404
},
{
"epoch": 2.0440251572327046,
"grad_norm": 3.223489761352539,
"learning_rate": 5.517850666503546e-07,
"loss": 0.6964,
"step": 406
},
{
"epoch": 2.0540880503144656,
"grad_norm": 7.602553367614746,
"learning_rate": 5.412981677819093e-07,
"loss": -2.6532,
"step": 408
},
{
"epoch": 2.0641509433962266,
"grad_norm": 2.123918056488037,
"learning_rate": 5.308747731482206e-07,
"loss": -1.1065,
"step": 410
},
{
"epoch": 2.0742138364779876,
"grad_norm": 5.430229187011719,
"learning_rate": 5.20516325799334e-07,
"loss": -0.7525,
"step": 412
},
{
"epoch": 2.0842767295597486,
"grad_norm": 5.109172344207764,
"learning_rate": 5.102242597937717e-07,
"loss": -1.5795,
"step": 414
},
{
"epoch": 2.0943396226415096,
"grad_norm": 3.5902011394500732,
"learning_rate": 5.000000000000002e-07,
"loss": -0.4448,
"step": 416
},
{
"epoch": 2.1044025157232706,
"grad_norm": 3.8342630863189697,
"learning_rate": 4.89844961899163e-07,
"loss": -1.3424,
"step": 418
},
{
"epoch": 2.1144654088050316,
"grad_norm": 5.093093395233154,
"learning_rate": 4.797605513891178e-07,
"loss": 0.6365,
"step": 420
},
{
"epoch": 2.1245283018867926,
"grad_norm": 6.690524578094482,
"learning_rate": 4.6974816458980116e-07,
"loss": 0.0718,
"step": 422
},
{
"epoch": 2.1345911949685537,
"grad_norm": 3.328261375427246,
"learning_rate": 4.598091876499417e-07,
"loss": -1.2867,
"step": 424
},
{
"epoch": 2.1446540880503147,
"grad_norm": 2.5299105644226074,
"learning_rate": 4.499449965551586e-07,
"loss": -0.0399,
"step": 426
},
{
"epoch": 2.1547169811320757,
"grad_norm": 7.731986045837402,
"learning_rate": 4.401569569374668e-07,
"loss": 0.4734,
"step": 428
},
{
"epoch": 2.1647798742138367,
"grad_norm": 6.546573162078857,
"learning_rate": 4.3044642388621144e-07,
"loss": -0.9198,
"step": 430
},
{
"epoch": 2.1748427672955977,
"grad_norm": 5.20041561126709,
"learning_rate": 4.208147417604664e-07,
"loss": 0.1999,
"step": 432
},
{
"epoch": 2.1849056603773587,
"grad_norm": 7.04267692565918,
"learning_rate": 4.1126324400291756e-07,
"loss": -0.0014,
"step": 434
},
{
"epoch": 2.1949685534591197,
"grad_norm": 1.8967030048370361,
"learning_rate": 4.0179325295525426e-07,
"loss": -0.4547,
"step": 436
},
{
"epoch": 2.2050314465408807,
"grad_norm": 7.423833847045898,
"learning_rate": 3.924060796751012e-07,
"loss": 1.2133,
"step": 438
},
{
"epoch": 2.2150943396226417,
"grad_norm": 5.08156156539917,
"learning_rate": 3.83103023754511e-07,
"loss": -0.5562,
"step": 440
},
{
"epoch": 2.2251572327044027,
"grad_norm": 2.8167994022369385,
"learning_rate": 3.738853731400439e-07,
"loss": 0.1852,
"step": 442
},
{
"epoch": 2.2352201257861637,
"grad_norm": 3.1104578971862793,
"learning_rate": 3.6475440395446147e-07,
"loss": -0.9611,
"step": 444
},
{
"epoch": 2.2452830188679247,
"grad_norm": 2.3350167274475098,
"learning_rate": 3.5571138032005365e-07,
"loss": 0.3598,
"step": 446
},
{
"epoch": 2.2553459119496857,
"grad_norm": 3.4781851768493652,
"learning_rate": 3.4675755418363053e-07,
"loss": 0.1132,
"step": 448
},
{
"epoch": 2.2654088050314467,
"grad_norm": 5.0868706703186035,
"learning_rate": 3.378941651431996e-07,
"loss": 0.7901,
"step": 450
},
{
"epoch": 2.2754716981132077,
"grad_norm": 4.737022876739502,
"learning_rate": 3.291224402763495e-07,
"loss": -0.5819,
"step": 452
},
{
"epoch": 2.2855345911949687,
"grad_norm": 3.6209828853607178,
"learning_rate": 3.2044359397037046e-07,
"loss": -0.2148,
"step": 454
},
{
"epoch": 2.2955974842767297,
"grad_norm": 6.26187801361084,
"learning_rate": 3.118588277541312e-07,
"loss": -0.7123,
"step": 456
},
{
"epoch": 2.3056603773584907,
"grad_norm": 3.300475597381592,
"learning_rate": 3.0336933013173305e-07,
"loss": 0.3813,
"step": 458
},
{
"epoch": 2.3157232704402517,
"grad_norm": 4.379162311553955,
"learning_rate": 2.9497627641797106e-07,
"loss": -0.9063,
"step": 460
},
{
"epoch": 2.3257861635220127,
"grad_norm": 4.494270324707031,
"learning_rate": 2.8668082857562004e-07,
"loss": 0.7504,
"step": 462
},
{
"epoch": 2.3358490566037737,
"grad_norm": 4.654480457305908,
"learning_rate": 2.784841350545656e-07,
"loss": -0.4204,
"step": 464
},
{
"epoch": 2.3459119496855347,
"grad_norm": 3.090691089630127,
"learning_rate": 2.7038733063281173e-07,
"loss": 0.6562,
"step": 466
},
{
"epoch": 2.3559748427672957,
"grad_norm": 3.110882520675659,
"learning_rate": 2.623915362593778e-07,
"loss": -0.6948,
"step": 468
},
{
"epoch": 2.3660377358490567,
"grad_norm": 8.367574691772461,
"learning_rate": 2.5449785889910956e-07,
"loss": -1.445,
"step": 470
},
{
"epoch": 2.3761006289308177,
"grad_norm": 1.8932026624679565,
"learning_rate": 2.467073913794272e-07,
"loss": 0.3359,
"step": 472
},
{
"epoch": 2.3861635220125788,
"grad_norm": 4.765536785125732,
"learning_rate": 2.3902121223903226e-07,
"loss": -0.9514,
"step": 474
},
{
"epoch": 2.3962264150943398,
"grad_norm": 4.574184894561768,
"learning_rate": 2.3144038557858913e-07,
"loss": 0.6839,
"step": 476
},
{
"epoch": 2.4062893081761008,
"grad_norm": 6.006104469299316,
"learning_rate": 2.2396596091340803e-07,
"loss": 0.0796,
"step": 478
},
{
"epoch": 2.4163522012578618,
"grad_norm": 4.098776340484619,
"learning_rate": 2.1659897302814744e-07,
"loss": -0.9333,
"step": 480
},
{
"epoch": 2.4264150943396228,
"grad_norm": 4.418032646179199,
"learning_rate": 2.0934044183355383e-07,
"loss": -1.8508,
"step": 482
},
{
"epoch": 2.4364779874213838,
"grad_norm": 11.399324417114258,
"learning_rate": 2.0219137222526183e-07,
"loss": 1.1837,
"step": 484
},
{
"epoch": 2.4465408805031448,
"grad_norm": 5.924710273742676,
"learning_rate": 1.9515275394467446e-07,
"loss": -0.0577,
"step": 486
},
{
"epoch": 2.456603773584906,
"grad_norm": 7.316831111907959,
"learning_rate": 1.8822556144193756e-07,
"loss": 0.1237,
"step": 488
},
{
"epoch": 2.466666666666667,
"grad_norm": 6.5416765213012695,
"learning_rate": 1.8141075374103632e-07,
"loss": -1.9742,
"step": 490
},
{
"epoch": 2.476729559748428,
"grad_norm": 5.302765369415283,
"learning_rate": 1.7470927430702276e-07,
"loss": 1.6366,
"step": 492
},
{
"epoch": 2.486792452830189,
"grad_norm": 6.104937553405762,
"learning_rate": 1.6812205091539978e-07,
"loss": -0.9508,
"step": 494
},
{
"epoch": 2.49685534591195,
"grad_norm": 3.6209168434143066,
"learning_rate": 1.6164999552367765e-07,
"loss": -0.6157,
"step": 496
},
{
"epoch": 2.506918238993711,
"grad_norm": 11.832756996154785,
"learning_rate": 1.5529400414511805e-07,
"loss": -1.168,
"step": 498
},
{
"epoch": 2.516981132075472,
"grad_norm": 9.809549331665039,
"learning_rate": 1.4905495672468783e-07,
"loss": 0.3619,
"step": 500
},
{
"epoch": 2.527044025157233,
"grad_norm": 5.026820182800293,
"learning_rate": 1.42933717017237e-07,
"loss": -0.3516,
"step": 502
},
{
"epoch": 2.537106918238994,
"grad_norm": 4.968526363372803,
"learning_rate": 1.3693113246791588e-07,
"loss": -0.383,
"step": 504
},
{
"epoch": 2.547169811320755,
"grad_norm": 5.452160835266113,
"learning_rate": 1.3104803409485354e-07,
"loss": -0.3609,
"step": 506
},
{
"epoch": 2.557232704402516,
"grad_norm": 6.929769992828369,
"learning_rate": 1.2528523637410836e-07,
"loss": -0.109,
"step": 508
},
{
"epoch": 2.567295597484277,
"grad_norm": 5.186896800994873,
"learning_rate": 1.1964353712690888e-07,
"loss": 0.3748,
"step": 510
},
{
"epoch": 2.577358490566038,
"grad_norm": 2.7618138790130615,
"learning_rate": 1.1412371740920035e-07,
"loss": 0.6345,
"step": 512
},
{
"epoch": 2.587421383647799,
"grad_norm": 9.840655326843262,
"learning_rate": 1.0872654140351457e-07,
"loss": -0.4424,
"step": 514
},
{
"epoch": 2.59748427672956,
"grad_norm": 5.229491233825684,
"learning_rate": 1.0345275631317163e-07,
"loss": 0.1269,
"step": 516
},
{
"epoch": 2.607547169811321,
"grad_norm": 3.292207956314087,
"learning_rate": 9.830309225883559e-08,
"loss": -0.8045,
"step": 518
},
{
"epoch": 2.617610062893082,
"grad_norm": 2.8611297607421875,
"learning_rate": 9.327826217743451e-08,
"loss": 0.6012,
"step": 520
},
{
"epoch": 2.627672955974843,
"grad_norm": 6.323940277099609,
"learning_rate": 8.837896172345827e-08,
"loss": -0.5895,
"step": 522
},
{
"epoch": 2.637735849056604,
"grad_norm": 7.645895957946777,
"learning_rate": 8.360586917264977e-08,
"loss": 0.5182,
"step": 524
},
{
"epoch": 2.647798742138365,
"grad_norm": 6.323966979980469,
"learning_rate": 7.895964532810317e-08,
"loss": -0.3837,
"step": 526
},
{
"epoch": 2.657861635220126,
"grad_norm": 7.799415588378906,
"learning_rate": 7.444093342877899e-08,
"loss": -0.7239,
"step": 528
},
{
"epoch": 2.667924528301887,
"grad_norm": 6.719019889831543,
"learning_rate": 7.005035906045197e-08,
"loss": 0.2248,
"step": 530
},
{
"epoch": 2.677987421383648,
"grad_norm": 5.086057186126709,
"learning_rate": 6.578853006910402e-08,
"loss": 0.5775,
"step": 532
},
{
"epoch": 2.688050314465409,
"grad_norm": 3.6781728267669678,
"learning_rate": 6.165603647677054e-08,
"loss": 0.0562,
"step": 534
},
{
"epoch": 2.69811320754717,
"grad_norm": 9.493392944335938,
"learning_rate": 5.765345039985647e-08,
"loss": 0.205,
"step": 536
},
{
"epoch": 2.708176100628931,
"grad_norm": 4.998286247253418,
"learning_rate": 5.378132596993046e-08,
"loss": 0.9461,
"step": 538
},
{
"epoch": 2.718238993710692,
"grad_norm": 4.373546600341797,
"learning_rate": 5.0040199257009196e-08,
"loss": -0.7566,
"step": 540
},
{
"epoch": 2.728301886792453,
"grad_norm": 8.538968086242676,
"learning_rate": 4.6430588195341847e-08,
"loss": 0.9457,
"step": 542
},
{
"epoch": 2.738364779874214,
"grad_norm": 8.773660659790039,
"learning_rate": 4.295299251170537e-08,
"loss": -0.2537,
"step": 544
},
{
"epoch": 2.748427672955975,
"grad_norm": 5.2722978591918945,
"learning_rate": 3.9607893656220745e-08,
"loss": 0.8571,
"step": 546
},
{
"epoch": 2.758490566037736,
"grad_norm": 7.540788650512695,
"learning_rate": 3.639575473569989e-08,
"loss": -2.1415,
"step": 548
},
{
"epoch": 2.768553459119497,
"grad_norm": 3.7448925971984863,
"learning_rate": 3.331702044953066e-08,
"loss": -1.1784,
"step": 550
},
{
"epoch": 2.778616352201258,
"grad_norm": 3.103691577911377,
"learning_rate": 3.037211702811182e-08,
"loss": -0.3766,
"step": 552
},
{
"epoch": 2.788679245283019,
"grad_norm": 4.002925872802734,
"learning_rate": 2.75614521738442e-08,
"loss": -1.5215,
"step": 554
},
{
"epoch": 2.79874213836478,
"grad_norm": 6.615825176239014,
"learning_rate": 2.488541500468666e-08,
"loss": 0.4594,
"step": 556
},
{
"epoch": 2.808805031446541,
"grad_norm": 4.420342922210693,
"learning_rate": 2.2344376000285604e-08,
"loss": 0.0622,
"step": 558
},
{
"epoch": 2.818867924528302,
"grad_norm": 5.796300888061523,
"learning_rate": 1.9938686950684567e-08,
"loss": -0.9306,
"step": 560
},
{
"epoch": 2.828930817610063,
"grad_norm": 4.024370193481445,
"learning_rate": 1.766868090762075e-08,
"loss": -0.4119,
"step": 562
},
{
"epoch": 2.838993710691824,
"grad_norm": 9.87598705291748,
"learning_rate": 1.553467213841664e-08,
"loss": -0.1066,
"step": 564
},
{
"epoch": 2.849056603773585,
"grad_norm": 6.048956871032715,
"learning_rate": 1.3536956082472073e-08,
"loss": -0.7316,
"step": 566
},
{
"epoch": 2.859119496855346,
"grad_norm": 5.084702968597412,
"learning_rate": 1.1675809310361495e-08,
"loss": -1.3274,
"step": 568
},
{
"epoch": 2.869182389937107,
"grad_norm": 4.490642070770264,
"learning_rate": 9.951489485545694e-09,
"loss": 0.1211,
"step": 570
},
{
"epoch": 2.879245283018868,
"grad_norm": 9.895052909851074,
"learning_rate": 8.364235328699564e-09,
"loss": 1.1259,
"step": 572
},
{
"epoch": 2.889308176100629,
"grad_norm": 4.905172348022461,
"learning_rate": 6.914266584662987e-09,
"loss": -0.1241,
"step": 574
},
{
"epoch": 2.89937106918239,
"grad_norm": 3.0340776443481445,
"learning_rate": 5.60178399201805e-09,
"loss": -0.6671,
"step": 576
},
{
"epoch": 2.909433962264151,
"grad_norm": 3.124040126800537,
"learning_rate": 4.42696925529884e-09,
"loss": -1.8007,
"step": 578
},
{
"epoch": 2.919496855345912,
"grad_norm": 5.664621353149414,
"learning_rate": 3.3899850198353397e-09,
"loss": 0.1159,
"step": 580
},
{
"epoch": 2.929559748427673,
"grad_norm": 4.976583957672119,
"learning_rate": 2.4909748492362158e-09,
"loss": -1.2106,
"step": 582
},
{
"epoch": 2.939622641509434,
"grad_norm": 5.037308216094971,
"learning_rate": 1.730063205513277e-09,
"loss": 0.8336,
"step": 584
},
{
"epoch": 2.949685534591195,
"grad_norm": 4.580456733703613,
"learning_rate": 1.1073554318509203e-09,
"loss": 0.378,
"step": 586
},
{
"epoch": 2.959748427672956,
"grad_norm": 4.7945990562438965,
"learning_rate": 6.229377380218003e-10,
"loss": -0.0708,
"step": 588
},
{
"epoch": 2.969811320754717,
"grad_norm": 14.472588539123535,
"learning_rate": 2.7687718845148535e-10,
"loss": -0.0673,
"step": 590
},
{
"epoch": 2.979874213836478,
"grad_norm": 9.063091278076172,
"learning_rate": 6.92216929342182e-11,
"loss": -0.5115,
"step": 592
},
{
"epoch": 2.989937106918239,
"grad_norm": 5.872649192810059,
"learning_rate": 0.0,
"loss": 0.3585,
"step": 594
},
{
"epoch": 2.989937106918239,
"step": 594,
"total_flos": 5.151263974762742e+17,
"train_loss": -0.14183720302852718,
"train_runtime": 1424.6739,
"train_samples_per_second": 13.386,
"train_steps_per_second": 0.417
}
],
"logging_steps": 2,
"max_steps": 594,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": false,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 5.151263974762742e+17,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}