9b-114 / trainer_state.json
furproxy's picture
Upload folder using huggingface_hub
0aa55f9 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 2457,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.002442002442002442,
"grad_norm": 2.3893706798553467,
"learning_rate": 8.130081300813009e-08,
"loss": 1.9268020391464233,
"step": 2
},
{
"epoch": 0.004884004884004884,
"grad_norm": 2.7525486946105957,
"learning_rate": 2.439024390243903e-07,
"loss": 2.098186492919922,
"step": 4
},
{
"epoch": 0.007326007326007326,
"grad_norm": 11.947481155395508,
"learning_rate": 4.0650406504065046e-07,
"loss": 2.4138333797454834,
"step": 6
},
{
"epoch": 0.009768009768009768,
"grad_norm": 2.4891531467437744,
"learning_rate": 5.691056910569106e-07,
"loss": 1.953867793083191,
"step": 8
},
{
"epoch": 0.01221001221001221,
"grad_norm": 3.49385666847229,
"learning_rate": 7.317073170731707e-07,
"loss": 2.1249871253967285,
"step": 10
},
{
"epoch": 0.014652014652014652,
"grad_norm": 9.621060371398926,
"learning_rate": 8.94308943089431e-07,
"loss": 1.870603322982788,
"step": 12
},
{
"epoch": 0.017094017094017096,
"grad_norm": 1.2622815370559692,
"learning_rate": 1.0569105691056912e-06,
"loss": 1.646697998046875,
"step": 14
},
{
"epoch": 0.019536019536019536,
"grad_norm": 16.232858657836914,
"learning_rate": 1.2195121951219514e-06,
"loss": 1.6898235082626343,
"step": 16
},
{
"epoch": 0.02197802197802198,
"grad_norm": 10.059346199035645,
"learning_rate": 1.3821138211382116e-06,
"loss": 1.8439620733261108,
"step": 18
},
{
"epoch": 0.02442002442002442,
"grad_norm": 3.1513400077819824,
"learning_rate": 1.5447154471544717e-06,
"loss": 1.6882305145263672,
"step": 20
},
{
"epoch": 0.026862026862026864,
"grad_norm": 5.707210540771484,
"learning_rate": 1.707317073170732e-06,
"loss": 1.4086613655090332,
"step": 22
},
{
"epoch": 0.029304029304029304,
"grad_norm": 1.5729589462280273,
"learning_rate": 1.8699186991869919e-06,
"loss": 1.22359037399292,
"step": 24
},
{
"epoch": 0.031746031746031744,
"grad_norm": 1.6034835577011108,
"learning_rate": 2.0325203252032523e-06,
"loss": 1.6794222593307495,
"step": 26
},
{
"epoch": 0.03418803418803419,
"grad_norm": 4.907107353210449,
"learning_rate": 2.1951219512195125e-06,
"loss": 1.7425767183303833,
"step": 28
},
{
"epoch": 0.03663003663003663,
"grad_norm": 3.0787065029144287,
"learning_rate": 2.3577235772357727e-06,
"loss": 1.1433881521224976,
"step": 30
},
{
"epoch": 0.03907203907203907,
"grad_norm": 2.307734966278076,
"learning_rate": 2.5203252032520324e-06,
"loss": 1.0014692544937134,
"step": 32
},
{
"epoch": 0.04151404151404151,
"grad_norm": 2.102328062057495,
"learning_rate": 2.682926829268293e-06,
"loss": 1.558118224143982,
"step": 34
},
{
"epoch": 0.04395604395604396,
"grad_norm": 2.508723020553589,
"learning_rate": 2.845528455284553e-06,
"loss": 1.2752659320831299,
"step": 36
},
{
"epoch": 0.0463980463980464,
"grad_norm": 1.2697498798370361,
"learning_rate": 3.0081300813008134e-06,
"loss": 1.5238615274429321,
"step": 38
},
{
"epoch": 0.04884004884004884,
"grad_norm": 9.991412162780762,
"learning_rate": 3.1707317073170736e-06,
"loss": 1.3837251663208008,
"step": 40
},
{
"epoch": 0.05128205128205128,
"grad_norm": 1.6207857131958008,
"learning_rate": 3.3333333333333333e-06,
"loss": 1.4677042961120605,
"step": 42
},
{
"epoch": 0.05372405372405373,
"grad_norm": 1.4186246395111084,
"learning_rate": 3.495934959349594e-06,
"loss": 1.4204754829406738,
"step": 44
},
{
"epoch": 0.05616605616605617,
"grad_norm": 3.8226752281188965,
"learning_rate": 3.6585365853658537e-06,
"loss": 1.3837416172027588,
"step": 46
},
{
"epoch": 0.05860805860805861,
"grad_norm": 2.2121241092681885,
"learning_rate": 3.821138211382115e-06,
"loss": 1.0739210844039917,
"step": 48
},
{
"epoch": 0.06105006105006105,
"grad_norm": 2.6986029148101807,
"learning_rate": 3.983739837398374e-06,
"loss": 0.9827917814254761,
"step": 50
},
{
"epoch": 0.06349206349206349,
"grad_norm": 1.9821456670761108,
"learning_rate": 4.146341463414634e-06,
"loss": 1.3439877033233643,
"step": 52
},
{
"epoch": 0.06593406593406594,
"grad_norm": 20.799144744873047,
"learning_rate": 4.308943089430894e-06,
"loss": 1.0445363521575928,
"step": 54
},
{
"epoch": 0.06837606837606838,
"grad_norm": 2.018078088760376,
"learning_rate": 4.471544715447155e-06,
"loss": 1.3443750143051147,
"step": 56
},
{
"epoch": 0.07081807081807082,
"grad_norm": 1.7203161716461182,
"learning_rate": 4.634146341463416e-06,
"loss": 1.3475321531295776,
"step": 58
},
{
"epoch": 0.07326007326007326,
"grad_norm": 1.489026665687561,
"learning_rate": 4.796747967479675e-06,
"loss": 1.4037724733352661,
"step": 60
},
{
"epoch": 0.0757020757020757,
"grad_norm": 5.072752475738525,
"learning_rate": 4.959349593495935e-06,
"loss": 1.0292410850524902,
"step": 62
},
{
"epoch": 0.07814407814407814,
"grad_norm": 1.9865821599960327,
"learning_rate": 5.121951219512195e-06,
"loss": 1.6453887224197388,
"step": 64
},
{
"epoch": 0.08058608058608059,
"grad_norm": 8.150779724121094,
"learning_rate": 5.2845528455284555e-06,
"loss": 1.1829452514648438,
"step": 66
},
{
"epoch": 0.08302808302808302,
"grad_norm": 1.6878677606582642,
"learning_rate": 5.447154471544716e-06,
"loss": 1.1011936664581299,
"step": 68
},
{
"epoch": 0.08547008547008547,
"grad_norm": 0.6670809388160706,
"learning_rate": 5.609756097560977e-06,
"loss": 1.1616472005844116,
"step": 70
},
{
"epoch": 0.08791208791208792,
"grad_norm": 1.4617451429367065,
"learning_rate": 5.772357723577237e-06,
"loss": 1.3839240074157715,
"step": 72
},
{
"epoch": 0.09035409035409035,
"grad_norm": 1.9361579418182373,
"learning_rate": 5.934959349593496e-06,
"loss": 1.4121818542480469,
"step": 74
},
{
"epoch": 0.0927960927960928,
"grad_norm": 3.0737693309783936,
"learning_rate": 6.0975609756097564e-06,
"loss": 1.416529893875122,
"step": 76
},
{
"epoch": 0.09523809523809523,
"grad_norm": 1.4835634231567383,
"learning_rate": 6.260162601626017e-06,
"loss": 1.222318410873413,
"step": 78
},
{
"epoch": 0.09768009768009768,
"grad_norm": 3.82383394241333,
"learning_rate": 6.422764227642278e-06,
"loss": 1.1652414798736572,
"step": 80
},
{
"epoch": 0.10012210012210013,
"grad_norm": 13.640969276428223,
"learning_rate": 6.585365853658538e-06,
"loss": 1.0345308780670166,
"step": 82
},
{
"epoch": 0.10256410256410256,
"grad_norm": 9.584553718566895,
"learning_rate": 6.747967479674797e-06,
"loss": 1.0803658962249756,
"step": 84
},
{
"epoch": 0.10500610500610501,
"grad_norm": 1.267685055732727,
"learning_rate": 6.910569105691057e-06,
"loss": 1.249168038368225,
"step": 86
},
{
"epoch": 0.10744810744810745,
"grad_norm": 1.5764743089675903,
"learning_rate": 7.0731707317073175e-06,
"loss": 1.4062129259109497,
"step": 88
},
{
"epoch": 0.10989010989010989,
"grad_norm": 0.6901059746742249,
"learning_rate": 7.2357723577235786e-06,
"loss": 1.1516574621200562,
"step": 90
},
{
"epoch": 0.11233211233211234,
"grad_norm": 2.498300552368164,
"learning_rate": 7.398373983739838e-06,
"loss": 1.4968205690383911,
"step": 92
},
{
"epoch": 0.11477411477411477,
"grad_norm": 8.051702499389648,
"learning_rate": 7.560975609756098e-06,
"loss": 1.1234135627746582,
"step": 94
},
{
"epoch": 0.11721611721611722,
"grad_norm": 2.431464433670044,
"learning_rate": 7.723577235772358e-06,
"loss": 1.4027729034423828,
"step": 96
},
{
"epoch": 0.11965811965811966,
"grad_norm": 2.105727434158325,
"learning_rate": 7.886178861788618e-06,
"loss": 1.3487744331359863,
"step": 98
},
{
"epoch": 0.1221001221001221,
"grad_norm": 2.983010768890381,
"learning_rate": 8.048780487804879e-06,
"loss": 1.074942708015442,
"step": 100
},
{
"epoch": 0.12454212454212454,
"grad_norm": 3.559720039367676,
"learning_rate": 8.21138211382114e-06,
"loss": 1.0268707275390625,
"step": 102
},
{
"epoch": 0.12698412698412698,
"grad_norm": 1.4598705768585205,
"learning_rate": 8.373983739837399e-06,
"loss": 0.9993240833282471,
"step": 104
},
{
"epoch": 0.12942612942612944,
"grad_norm": 1.1988660097122192,
"learning_rate": 8.536585365853658e-06,
"loss": 0.9525761604309082,
"step": 106
},
{
"epoch": 0.13186813186813187,
"grad_norm": 27.485122680664062,
"learning_rate": 8.69918699186992e-06,
"loss": 1.3531205654144287,
"step": 108
},
{
"epoch": 0.1343101343101343,
"grad_norm": 2.1461856365203857,
"learning_rate": 8.86178861788618e-06,
"loss": 1.6010525226593018,
"step": 110
},
{
"epoch": 0.13675213675213677,
"grad_norm": 2.505549430847168,
"learning_rate": 9.02439024390244e-06,
"loss": 1.41323721408844,
"step": 112
},
{
"epoch": 0.1391941391941392,
"grad_norm": 1.478813648223877,
"learning_rate": 9.1869918699187e-06,
"loss": 1.1879425048828125,
"step": 114
},
{
"epoch": 0.14163614163614163,
"grad_norm": 1.3980270624160767,
"learning_rate": 9.34959349593496e-06,
"loss": 1.3165570497512817,
"step": 116
},
{
"epoch": 0.14407814407814407,
"grad_norm": 3.0620999336242676,
"learning_rate": 9.51219512195122e-06,
"loss": 1.2242571115493774,
"step": 118
},
{
"epoch": 0.14652014652014653,
"grad_norm": 1.2746002674102783,
"learning_rate": 9.67479674796748e-06,
"loss": 1.3259317874908447,
"step": 120
},
{
"epoch": 0.14896214896214896,
"grad_norm": 1.4318238496780396,
"learning_rate": 9.837398373983741e-06,
"loss": 1.3270224332809448,
"step": 122
},
{
"epoch": 0.1514041514041514,
"grad_norm": 2.3391611576080322,
"learning_rate": 1e-05,
"loss": 1.3085572719573975,
"step": 124
},
{
"epoch": 0.15384615384615385,
"grad_norm": 1.5901521444320679,
"learning_rate": 1.0162601626016262e-05,
"loss": 1.1765004396438599,
"step": 126
},
{
"epoch": 0.1562881562881563,
"grad_norm": 1.361893892288208,
"learning_rate": 1.0325203252032521e-05,
"loss": 1.4069057703018188,
"step": 128
},
{
"epoch": 0.15873015873015872,
"grad_norm": 2.779815673828125,
"learning_rate": 1.0487804878048782e-05,
"loss": 1.2615665197372437,
"step": 130
},
{
"epoch": 0.16117216117216118,
"grad_norm": 1.3855739831924438,
"learning_rate": 1.065040650406504e-05,
"loss": 1.0511776208877563,
"step": 132
},
{
"epoch": 0.16361416361416362,
"grad_norm": 1.6709128618240356,
"learning_rate": 1.0813008130081301e-05,
"loss": 1.1562919616699219,
"step": 134
},
{
"epoch": 0.16605616605616605,
"grad_norm": 2.7451095581054688,
"learning_rate": 1.0975609756097562e-05,
"loss": 1.1669853925704956,
"step": 136
},
{
"epoch": 0.1684981684981685,
"grad_norm": 1.3229765892028809,
"learning_rate": 1.1138211382113821e-05,
"loss": 1.132803201675415,
"step": 138
},
{
"epoch": 0.17094017094017094,
"grad_norm": 1.1329905986785889,
"learning_rate": 1.1300813008130082e-05,
"loss": 1.078572392463684,
"step": 140
},
{
"epoch": 0.17338217338217338,
"grad_norm": 1.5809731483459473,
"learning_rate": 1.1463414634146342e-05,
"loss": 1.3616305589675903,
"step": 142
},
{
"epoch": 0.17582417582417584,
"grad_norm": 1.0317999124526978,
"learning_rate": 1.1626016260162603e-05,
"loss": 1.1173185110092163,
"step": 144
},
{
"epoch": 0.17826617826617827,
"grad_norm": 2.4350783824920654,
"learning_rate": 1.1788617886178864e-05,
"loss": 0.9900561571121216,
"step": 146
},
{
"epoch": 0.1807081807081807,
"grad_norm": 1.8645095825195312,
"learning_rate": 1.1951219512195123e-05,
"loss": 0.9898566007614136,
"step": 148
},
{
"epoch": 0.18315018315018314,
"grad_norm": 2.060671091079712,
"learning_rate": 1.2113821138211384e-05,
"loss": 1.3455116748809814,
"step": 150
},
{
"epoch": 0.1855921855921856,
"grad_norm": 4.001134395599365,
"learning_rate": 1.2276422764227642e-05,
"loss": 1.4696322679519653,
"step": 152
},
{
"epoch": 0.18803418803418803,
"grad_norm": 1.4498295783996582,
"learning_rate": 1.2439024390243903e-05,
"loss": 1.343530297279358,
"step": 154
},
{
"epoch": 0.19047619047619047,
"grad_norm": 6.848788261413574,
"learning_rate": 1.2601626016260164e-05,
"loss": 1.6466281414031982,
"step": 156
},
{
"epoch": 0.19291819291819293,
"grad_norm": 3.0440895557403564,
"learning_rate": 1.2764227642276423e-05,
"loss": 1.1969726085662842,
"step": 158
},
{
"epoch": 0.19536019536019536,
"grad_norm": 3.5766472816467285,
"learning_rate": 1.2926829268292684e-05,
"loss": 0.961052656173706,
"step": 160
},
{
"epoch": 0.1978021978021978,
"grad_norm": 4.323490619659424,
"learning_rate": 1.3089430894308943e-05,
"loss": 1.4117612838745117,
"step": 162
},
{
"epoch": 0.20024420024420025,
"grad_norm": 1.282266616821289,
"learning_rate": 1.3252032520325204e-05,
"loss": 1.319150447845459,
"step": 164
},
{
"epoch": 0.2026862026862027,
"grad_norm": 1.7024965286254883,
"learning_rate": 1.3414634146341466e-05,
"loss": 1.3318078517913818,
"step": 166
},
{
"epoch": 0.20512820512820512,
"grad_norm": 4.461455821990967,
"learning_rate": 1.3577235772357725e-05,
"loss": 1.1935322284698486,
"step": 168
},
{
"epoch": 0.20757020757020758,
"grad_norm": 4.874426364898682,
"learning_rate": 1.3739837398373986e-05,
"loss": 0.9753493666648865,
"step": 170
},
{
"epoch": 0.21001221001221002,
"grad_norm": 1.221576452255249,
"learning_rate": 1.3902439024390244e-05,
"loss": 1.0886809825897217,
"step": 172
},
{
"epoch": 0.21245421245421245,
"grad_norm": 1.046645998954773,
"learning_rate": 1.4065040650406505e-05,
"loss": 1.3587074279785156,
"step": 174
},
{
"epoch": 0.2148962148962149,
"grad_norm": 1.0372843742370605,
"learning_rate": 1.4227642276422766e-05,
"loss": 1.2677640914916992,
"step": 176
},
{
"epoch": 0.21733821733821734,
"grad_norm": 3.766371250152588,
"learning_rate": 1.4390243902439025e-05,
"loss": 1.3696472644805908,
"step": 178
},
{
"epoch": 0.21978021978021978,
"grad_norm": 0.8646840453147888,
"learning_rate": 1.4552845528455286e-05,
"loss": 1.0324885845184326,
"step": 180
},
{
"epoch": 0.2222222222222222,
"grad_norm": 2.4293770790100098,
"learning_rate": 1.4715447154471545e-05,
"loss": 0.9206986427307129,
"step": 182
},
{
"epoch": 0.22466422466422467,
"grad_norm": 1.333274006843567,
"learning_rate": 1.4878048780487806e-05,
"loss": 1.4569449424743652,
"step": 184
},
{
"epoch": 0.2271062271062271,
"grad_norm": 10.24266529083252,
"learning_rate": 1.5040650406504067e-05,
"loss": 1.3317943811416626,
"step": 186
},
{
"epoch": 0.22954822954822954,
"grad_norm": 1.4686931371688843,
"learning_rate": 1.5203252032520327e-05,
"loss": 1.4440950155258179,
"step": 188
},
{
"epoch": 0.231990231990232,
"grad_norm": 3.4735898971557617,
"learning_rate": 1.5365853658536586e-05,
"loss": 1.4389160871505737,
"step": 190
},
{
"epoch": 0.23443223443223443,
"grad_norm": 1.2915067672729492,
"learning_rate": 1.5528455284552847e-05,
"loss": 1.383222222328186,
"step": 192
},
{
"epoch": 0.23687423687423687,
"grad_norm": 1.2586745023727417,
"learning_rate": 1.5691056910569108e-05,
"loss": 1.3772218227386475,
"step": 194
},
{
"epoch": 0.23931623931623933,
"grad_norm": 5.940347194671631,
"learning_rate": 1.585365853658537e-05,
"loss": 1.152698040008545,
"step": 196
},
{
"epoch": 0.24175824175824176,
"grad_norm": 1.017399787902832,
"learning_rate": 1.6016260162601627e-05,
"loss": 1.3445426225662231,
"step": 198
},
{
"epoch": 0.2442002442002442,
"grad_norm": 2.1003332138061523,
"learning_rate": 1.6178861788617888e-05,
"loss": 1.4353071451187134,
"step": 200
},
{
"epoch": 0.24664224664224665,
"grad_norm": 1.2850189208984375,
"learning_rate": 1.6341463414634145e-05,
"loss": 1.3451241254806519,
"step": 202
},
{
"epoch": 0.2490842490842491,
"grad_norm": 1.576464295387268,
"learning_rate": 1.6504065040650406e-05,
"loss": 1.0413107872009277,
"step": 204
},
{
"epoch": 0.2515262515262515,
"grad_norm": 2.5853071212768555,
"learning_rate": 1.6666666666666667e-05,
"loss": 1.319067120552063,
"step": 206
},
{
"epoch": 0.25396825396825395,
"grad_norm": 1.365488052368164,
"learning_rate": 1.682926829268293e-05,
"loss": 1.1813554763793945,
"step": 208
},
{
"epoch": 0.2564102564102564,
"grad_norm": 2.6422295570373535,
"learning_rate": 1.699186991869919e-05,
"loss": 0.9516135454177856,
"step": 210
},
{
"epoch": 0.2588522588522589,
"grad_norm": 3.197498321533203,
"learning_rate": 1.7154471544715447e-05,
"loss": 0.7080205678939819,
"step": 212
},
{
"epoch": 0.2612942612942613,
"grad_norm": 0.6656016111373901,
"learning_rate": 1.7317073170731708e-05,
"loss": 1.2077349424362183,
"step": 214
},
{
"epoch": 0.26373626373626374,
"grad_norm": 1.1304625272750854,
"learning_rate": 1.747967479674797e-05,
"loss": 1.340700626373291,
"step": 216
},
{
"epoch": 0.2661782661782662,
"grad_norm": 1.5265967845916748,
"learning_rate": 1.7642276422764227e-05,
"loss": 1.2717061042785645,
"step": 218
},
{
"epoch": 0.2686202686202686,
"grad_norm": 1.4525116682052612,
"learning_rate": 1.7804878048780488e-05,
"loss": 1.3473409414291382,
"step": 220
},
{
"epoch": 0.27106227106227104,
"grad_norm": 1.32387113571167,
"learning_rate": 1.796747967479675e-05,
"loss": 1.0149593353271484,
"step": 222
},
{
"epoch": 0.27350427350427353,
"grad_norm": 3.1132187843322754,
"learning_rate": 1.813008130081301e-05,
"loss": 1.0153287649154663,
"step": 224
},
{
"epoch": 0.27594627594627597,
"grad_norm": 1.0930202007293701,
"learning_rate": 1.829268292682927e-05,
"loss": 1.3861629962921143,
"step": 226
},
{
"epoch": 0.2783882783882784,
"grad_norm": 1.287597417831421,
"learning_rate": 1.845528455284553e-05,
"loss": 1.4393969774246216,
"step": 228
},
{
"epoch": 0.28083028083028083,
"grad_norm": 2.620121717453003,
"learning_rate": 1.861788617886179e-05,
"loss": 1.3899247646331787,
"step": 230
},
{
"epoch": 0.28327228327228327,
"grad_norm": 8.519104957580566,
"learning_rate": 1.878048780487805e-05,
"loss": 1.5776143074035645,
"step": 232
},
{
"epoch": 0.2857142857142857,
"grad_norm": 1.6813069581985474,
"learning_rate": 1.8943089430894312e-05,
"loss": 1.1832704544067383,
"step": 234
},
{
"epoch": 0.28815628815628813,
"grad_norm": 0.8178291320800781,
"learning_rate": 1.9105691056910573e-05,
"loss": 1.4064499139785767,
"step": 236
},
{
"epoch": 0.2905982905982906,
"grad_norm": 1.324826717376709,
"learning_rate": 1.926829268292683e-05,
"loss": 1.3489106893539429,
"step": 238
},
{
"epoch": 0.29304029304029305,
"grad_norm": 1.8617048263549805,
"learning_rate": 1.943089430894309e-05,
"loss": 1.2338833808898926,
"step": 240
},
{
"epoch": 0.2954822954822955,
"grad_norm": 1.1062006950378418,
"learning_rate": 1.959349593495935e-05,
"loss": 1.1017166376113892,
"step": 242
},
{
"epoch": 0.2979242979242979,
"grad_norm": 1.151795744895935,
"learning_rate": 1.975609756097561e-05,
"loss": 1.4224822521209717,
"step": 244
},
{
"epoch": 0.30036630036630035,
"grad_norm": 1.1693967580795288,
"learning_rate": 1.991869918699187e-05,
"loss": 0.9105122089385986,
"step": 246
},
{
"epoch": 0.3028083028083028,
"grad_norm": 2.165432929992676,
"learning_rate": 1.9999990914795638e-05,
"loss": 1.3537715673446655,
"step": 248
},
{
"epoch": 0.3052503052503053,
"grad_norm": 2.523041009902954,
"learning_rate": 1.9999918233270764e-05,
"loss": 1.1235604286193848,
"step": 250
},
{
"epoch": 0.3076923076923077,
"grad_norm": 4.936850547790527,
"learning_rate": 1.999977287080797e-05,
"loss": 1.2547414302825928,
"step": 252
},
{
"epoch": 0.31013431013431014,
"grad_norm": 1.219511866569519,
"learning_rate": 1.9999554828581173e-05,
"loss": 1.4373202323913574,
"step": 254
},
{
"epoch": 0.3125763125763126,
"grad_norm": 1.137669324874878,
"learning_rate": 1.9999264108351216e-05,
"loss": 1.3956284523010254,
"step": 256
},
{
"epoch": 0.315018315018315,
"grad_norm": 1.6814566850662231,
"learning_rate": 1.999890071246588e-05,
"loss": 1.4139020442962646,
"step": 258
},
{
"epoch": 0.31746031746031744,
"grad_norm": 1.1596673727035522,
"learning_rate": 1.9998464643859853e-05,
"loss": 1.3567984104156494,
"step": 260
},
{
"epoch": 0.3199023199023199,
"grad_norm": 2.8471524715423584,
"learning_rate": 1.999795590605471e-05,
"loss": 1.6041795015335083,
"step": 262
},
{
"epoch": 0.32234432234432236,
"grad_norm": 3.1703484058380127,
"learning_rate": 1.9997374503158877e-05,
"loss": 0.9505234956741333,
"step": 264
},
{
"epoch": 0.3247863247863248,
"grad_norm": 1.568231463432312,
"learning_rate": 1.9996720439867617e-05,
"loss": 1.1375908851623535,
"step": 266
},
{
"epoch": 0.32722832722832723,
"grad_norm": 1.1084926128387451,
"learning_rate": 1.9995993721462966e-05,
"loss": 1.5744917392730713,
"step": 268
},
{
"epoch": 0.32967032967032966,
"grad_norm": 2.8259096145629883,
"learning_rate": 1.9995194353813707e-05,
"loss": 1.1887890100479126,
"step": 270
},
{
"epoch": 0.3321123321123321,
"grad_norm": 1.1199963092803955,
"learning_rate": 1.999432234337532e-05,
"loss": 1.4438523054122925,
"step": 272
},
{
"epoch": 0.33455433455433453,
"grad_norm": 3.761988878250122,
"learning_rate": 1.999337769718993e-05,
"loss": 1.2220399379730225,
"step": 274
},
{
"epoch": 0.336996336996337,
"grad_norm": 1.841293454170227,
"learning_rate": 1.9992360422886246e-05,
"loss": 1.1481637954711914,
"step": 276
},
{
"epoch": 0.33943833943833945,
"grad_norm": 1.215539813041687,
"learning_rate": 1.9991270528679508e-05,
"loss": 1.5834959745407104,
"step": 278
},
{
"epoch": 0.3418803418803419,
"grad_norm": 0.9015586972236633,
"learning_rate": 1.9990108023371403e-05,
"loss": 1.4441936016082764,
"step": 280
},
{
"epoch": 0.3443223443223443,
"grad_norm": 1.5563743114471436,
"learning_rate": 1.9988872916350022e-05,
"loss": 1.376705288887024,
"step": 282
},
{
"epoch": 0.34676434676434675,
"grad_norm": 7.034574031829834,
"learning_rate": 1.9987565217589756e-05,
"loss": 1.4534231424331665,
"step": 284
},
{
"epoch": 0.3492063492063492,
"grad_norm": 0.9118156433105469,
"learning_rate": 1.9986184937651227e-05,
"loss": 1.2641198635101318,
"step": 286
},
{
"epoch": 0.3516483516483517,
"grad_norm": 3.323513984680176,
"learning_rate": 1.9984732087681215e-05,
"loss": 1.834381341934204,
"step": 288
},
{
"epoch": 0.3540903540903541,
"grad_norm": 3.959578037261963,
"learning_rate": 1.9983206679412542e-05,
"loss": 1.1039708852767944,
"step": 290
},
{
"epoch": 0.35653235653235654,
"grad_norm": 0.8664885759353638,
"learning_rate": 1.9981608725164002e-05,
"loss": 1.4267356395721436,
"step": 292
},
{
"epoch": 0.358974358974359,
"grad_norm": 1.372922658920288,
"learning_rate": 1.9979938237840247e-05,
"loss": 1.198704481124878,
"step": 294
},
{
"epoch": 0.3614163614163614,
"grad_norm": 2.058027982711792,
"learning_rate": 1.9978195230931686e-05,
"loss": 1.1538225412368774,
"step": 296
},
{
"epoch": 0.36385836385836384,
"grad_norm": 4.946676254272461,
"learning_rate": 1.997637971851438e-05,
"loss": 1.5473830699920654,
"step": 298
},
{
"epoch": 0.3663003663003663,
"grad_norm": 2.0882294178009033,
"learning_rate": 1.9974491715249917e-05,
"loss": 1.357876181602478,
"step": 300
},
{
"epoch": 0.36874236874236876,
"grad_norm": 3.573915958404541,
"learning_rate": 1.9972531236385314e-05,
"loss": 1.0178381204605103,
"step": 302
},
{
"epoch": 0.3711843711843712,
"grad_norm": 1.7873722314834595,
"learning_rate": 1.997049829775287e-05,
"loss": 1.327938199043274,
"step": 304
},
{
"epoch": 0.37362637362637363,
"grad_norm": 1.3761481046676636,
"learning_rate": 1.996839291577006e-05,
"loss": 1.4819612503051758,
"step": 306
},
{
"epoch": 0.37606837606837606,
"grad_norm": 1.9104338884353638,
"learning_rate": 1.996621510743938e-05,
"loss": 1.418102741241455,
"step": 308
},
{
"epoch": 0.3785103785103785,
"grad_norm": 1.1609731912612915,
"learning_rate": 1.9963964890348236e-05,
"loss": 1.4227708578109741,
"step": 310
},
{
"epoch": 0.38095238095238093,
"grad_norm": 3.0023646354675293,
"learning_rate": 1.9961642282668776e-05,
"loss": 1.1034045219421387,
"step": 312
},
{
"epoch": 0.3833943833943834,
"grad_norm": 4.011119842529297,
"learning_rate": 1.9959247303157763e-05,
"loss": 1.4926037788391113,
"step": 314
},
{
"epoch": 0.38583638583638585,
"grad_norm": 4.095101356506348,
"learning_rate": 1.995677997115641e-05,
"loss": 0.8862283229827881,
"step": 316
},
{
"epoch": 0.3882783882783883,
"grad_norm": 1.9095430374145508,
"learning_rate": 1.9954240306590235e-05,
"loss": 1.15045166015625,
"step": 318
},
{
"epoch": 0.3907203907203907,
"grad_norm": 1.4787174463272095,
"learning_rate": 1.9951628329968885e-05,
"loss": 1.4402953386306763,
"step": 320
},
{
"epoch": 0.39316239316239315,
"grad_norm": 2.3421995639801025,
"learning_rate": 1.9948944062385994e-05,
"loss": 1.456636667251587,
"step": 322
},
{
"epoch": 0.3956043956043956,
"grad_norm": 1.1577990055084229,
"learning_rate": 1.9946187525518986e-05,
"loss": 1.4146589040756226,
"step": 324
},
{
"epoch": 0.398046398046398,
"grad_norm": 1.954940676689148,
"learning_rate": 1.994335874162892e-05,
"loss": 1.3673632144927979,
"step": 326
},
{
"epoch": 0.4004884004884005,
"grad_norm": 1.90268874168396,
"learning_rate": 1.9940457733560293e-05,
"loss": 1.3601889610290527,
"step": 328
},
{
"epoch": 0.40293040293040294,
"grad_norm": 4.163765907287598,
"learning_rate": 1.993748452474088e-05,
"loss": 0.9897390007972717,
"step": 330
},
{
"epoch": 0.4053724053724054,
"grad_norm": 3.35142183303833,
"learning_rate": 1.9934439139181516e-05,
"loss": 0.6906993389129639,
"step": 332
},
{
"epoch": 0.4078144078144078,
"grad_norm": 1.405617356300354,
"learning_rate": 1.993132160147593e-05,
"loss": 1.1328214406967163,
"step": 334
},
{
"epoch": 0.41025641025641024,
"grad_norm": 1.5693705081939697,
"learning_rate": 1.9928131936800514e-05,
"loss": 1.4789706468582153,
"step": 336
},
{
"epoch": 0.4126984126984127,
"grad_norm": 1.2349439859390259,
"learning_rate": 1.9924870170914157e-05,
"loss": 1.0828137397766113,
"step": 338
},
{
"epoch": 0.41514041514041516,
"grad_norm": 5.380833148956299,
"learning_rate": 1.9921536330158007e-05,
"loss": 1.1599012613296509,
"step": 340
},
{
"epoch": 0.4175824175824176,
"grad_norm": 2.7839314937591553,
"learning_rate": 1.9918130441455273e-05,
"loss": 1.6682945489883423,
"step": 342
},
{
"epoch": 0.42002442002442003,
"grad_norm": 1.93392813205719,
"learning_rate": 1.9914652532311005e-05,
"loss": 0.9947870969772339,
"step": 344
},
{
"epoch": 0.42246642246642246,
"grad_norm": 1.5755698680877686,
"learning_rate": 1.991110263081186e-05,
"loss": 1.315640926361084,
"step": 346
},
{
"epoch": 0.4249084249084249,
"grad_norm": 2.16658878326416,
"learning_rate": 1.9907480765625906e-05,
"loss": 1.39967679977417,
"step": 348
},
{
"epoch": 0.42735042735042733,
"grad_norm": 2.711895704269409,
"learning_rate": 1.9903786966002352e-05,
"loss": 0.9204920530319214,
"step": 350
},
{
"epoch": 0.4297924297924298,
"grad_norm": 2.3947465419769287,
"learning_rate": 1.9900021261771348e-05,
"loss": 1.1823644638061523,
"step": 352
},
{
"epoch": 0.43223443223443225,
"grad_norm": 1.8362082242965698,
"learning_rate": 1.9896183683343706e-05,
"loss": 1.3596951961517334,
"step": 354
},
{
"epoch": 0.4346764346764347,
"grad_norm": 2.1142735481262207,
"learning_rate": 1.989227426171069e-05,
"loss": 1.03623628616333,
"step": 356
},
{
"epoch": 0.4371184371184371,
"grad_norm": 4.102887153625488,
"learning_rate": 1.9888293028443747e-05,
"loss": 1.240249514579773,
"step": 358
},
{
"epoch": 0.43956043956043955,
"grad_norm": 1.5868562459945679,
"learning_rate": 1.9884240015694248e-05,
"loss": 1.281577467918396,
"step": 360
},
{
"epoch": 0.442002442002442,
"grad_norm": 1.967654824256897,
"learning_rate": 1.988011525619325e-05,
"loss": 1.1424391269683838,
"step": 362
},
{
"epoch": 0.4444444444444444,
"grad_norm": 1.7741585969924927,
"learning_rate": 1.9875918783251207e-05,
"loss": 1.2371528148651123,
"step": 364
},
{
"epoch": 0.4468864468864469,
"grad_norm": 2.0856261253356934,
"learning_rate": 1.9871650630757716e-05,
"loss": 1.4550820589065552,
"step": 366
},
{
"epoch": 0.44932844932844934,
"grad_norm": 1.4822794198989868,
"learning_rate": 1.9867310833181234e-05,
"loss": 1.1890130043029785,
"step": 368
},
{
"epoch": 0.4517704517704518,
"grad_norm": 0.6963343620300293,
"learning_rate": 1.986289942556881e-05,
"loss": 1.2029908895492554,
"step": 370
},
{
"epoch": 0.4542124542124542,
"grad_norm": 1.5182689428329468,
"learning_rate": 1.9858416443545794e-05,
"loss": 1.3851736783981323,
"step": 372
},
{
"epoch": 0.45665445665445664,
"grad_norm": 1.8810380697250366,
"learning_rate": 1.9853861923315555e-05,
"loss": 1.0434424877166748,
"step": 374
},
{
"epoch": 0.4590964590964591,
"grad_norm": 1.6905688047409058,
"learning_rate": 1.984923590165918e-05,
"loss": 1.301484227180481,
"step": 376
},
{
"epoch": 0.46153846153846156,
"grad_norm": 2.4930872917175293,
"learning_rate": 1.9844538415935187e-05,
"loss": 1.0400949716567993,
"step": 378
},
{
"epoch": 0.463980463980464,
"grad_norm": 4.205483913421631,
"learning_rate": 1.983976950407922e-05,
"loss": 0.9666699767112732,
"step": 380
},
{
"epoch": 0.46642246642246643,
"grad_norm": 1.9438555240631104,
"learning_rate": 1.983492920460373e-05,
"loss": 1.3446414470672607,
"step": 382
},
{
"epoch": 0.46886446886446886,
"grad_norm": 1.6146860122680664,
"learning_rate": 1.983001755659769e-05,
"loss": 1.2357232570648193,
"step": 384
},
{
"epoch": 0.4713064713064713,
"grad_norm": 2.4254696369171143,
"learning_rate": 1.9825034599726263e-05,
"loss": 1.2619645595550537,
"step": 386
},
{
"epoch": 0.47374847374847373,
"grad_norm": 2.857746124267578,
"learning_rate": 1.9819980374230468e-05,
"loss": 1.6904096603393555,
"step": 388
},
{
"epoch": 0.47619047619047616,
"grad_norm": 1.4608720541000366,
"learning_rate": 1.981485492092689e-05,
"loss": 0.9965710639953613,
"step": 390
},
{
"epoch": 0.47863247863247865,
"grad_norm": 61.0188102722168,
"learning_rate": 1.9809658281207318e-05,
"loss": 0.9120445251464844,
"step": 392
},
{
"epoch": 0.4810744810744811,
"grad_norm": 2.121208429336548,
"learning_rate": 1.980439049703843e-05,
"loss": 1.0203512907028198,
"step": 394
},
{
"epoch": 0.4835164835164835,
"grad_norm": 1.042589783668518,
"learning_rate": 1.979905161096144e-05,
"loss": 1.3058192729949951,
"step": 396
},
{
"epoch": 0.48595848595848595,
"grad_norm": 1.507728934288025,
"learning_rate": 1.9793641666091773e-05,
"loss": 1.3444452285766602,
"step": 398
},
{
"epoch": 0.4884004884004884,
"grad_norm": 4.658176422119141,
"learning_rate": 1.9788160706118698e-05,
"loss": 0.6673938035964966,
"step": 400
},
{
"epoch": 0.4908424908424908,
"grad_norm": 1.1496187448501587,
"learning_rate": 1.978260877530499e-05,
"loss": 1.3050227165222168,
"step": 402
},
{
"epoch": 0.4932844932844933,
"grad_norm": 0.8402596712112427,
"learning_rate": 1.9776985918486552e-05,
"loss": 1.4215201139450073,
"step": 404
},
{
"epoch": 0.49572649572649574,
"grad_norm": 8.28558349609375,
"learning_rate": 1.9771292181072076e-05,
"loss": 0.8944355845451355,
"step": 406
},
{
"epoch": 0.4981684981684982,
"grad_norm": 2.735724449157715,
"learning_rate": 1.9765527609042676e-05,
"loss": 1.0254771709442139,
"step": 408
},
{
"epoch": 0.5006105006105006,
"grad_norm": 1.9720531702041626,
"learning_rate": 1.9759692248951482e-05,
"loss": 1.3571816682815552,
"step": 410
},
{
"epoch": 0.503052503052503,
"grad_norm": 14.514373779296875,
"learning_rate": 1.975378614792332e-05,
"loss": 0.6523332595825195,
"step": 412
},
{
"epoch": 0.5054945054945055,
"grad_norm": 1.5351808071136475,
"learning_rate": 1.9747809353654276e-05,
"loss": 1.3964738845825195,
"step": 414
},
{
"epoch": 0.5079365079365079,
"grad_norm": 1.1067290306091309,
"learning_rate": 1.974176191441135e-05,
"loss": 1.3599458932876587,
"step": 416
},
{
"epoch": 0.5103785103785103,
"grad_norm": 0.968450665473938,
"learning_rate": 1.973564387903204e-05,
"loss": 1.1259132623672485,
"step": 418
},
{
"epoch": 0.5128205128205128,
"grad_norm": 0.7555665373802185,
"learning_rate": 1.972945529692398e-05,
"loss": 1.3250101804733276,
"step": 420
},
{
"epoch": 0.5152625152625152,
"grad_norm": 1.294765830039978,
"learning_rate": 1.97231962180645e-05,
"loss": 1.3246148824691772,
"step": 422
},
{
"epoch": 0.5177045177045178,
"grad_norm": 3.749925374984741,
"learning_rate": 1.9716866693000248e-05,
"loss": 1.3295143842697144,
"step": 424
},
{
"epoch": 0.5201465201465202,
"grad_norm": 1.7079066038131714,
"learning_rate": 1.9710466772846784e-05,
"loss": 1.1310526132583618,
"step": 426
},
{
"epoch": 0.5225885225885226,
"grad_norm": 1.0455013513565063,
"learning_rate": 1.9703996509288153e-05,
"loss": 1.341339111328125,
"step": 428
},
{
"epoch": 0.525030525030525,
"grad_norm": 2.6277689933776855,
"learning_rate": 1.9697455954576478e-05,
"loss": 0.984380841255188,
"step": 430
},
{
"epoch": 0.5274725274725275,
"grad_norm": 3.6414973735809326,
"learning_rate": 1.9690845161531532e-05,
"loss": 0.6374328136444092,
"step": 432
},
{
"epoch": 0.5299145299145299,
"grad_norm": 0.9854040741920471,
"learning_rate": 1.968416418354032e-05,
"loss": 1.363136887550354,
"step": 434
},
{
"epoch": 0.5323565323565324,
"grad_norm": 1.02694833278656,
"learning_rate": 1.967741307455663e-05,
"loss": 1.3728197813034058,
"step": 436
},
{
"epoch": 0.5347985347985348,
"grad_norm": 1.2664965391159058,
"learning_rate": 1.967059188910062e-05,
"loss": 1.3319021463394165,
"step": 438
},
{
"epoch": 0.5372405372405372,
"grad_norm": 0.8867588043212891,
"learning_rate": 1.9663700682258367e-05,
"loss": 1.299553394317627,
"step": 440
},
{
"epoch": 0.5396825396825397,
"grad_norm": 13.338286399841309,
"learning_rate": 1.9656739509681413e-05,
"loss": 1.1493945121765137,
"step": 442
},
{
"epoch": 0.5421245421245421,
"grad_norm": 2.412151575088501,
"learning_rate": 1.9649708427586333e-05,
"loss": 1.0136598348617554,
"step": 444
},
{
"epoch": 0.5445665445665445,
"grad_norm": 2.4818806648254395,
"learning_rate": 1.964260749275427e-05,
"loss": 1.1629705429077148,
"step": 446
},
{
"epoch": 0.5470085470085471,
"grad_norm": 1.1341965198516846,
"learning_rate": 1.963543676253048e-05,
"loss": 1.1858645677566528,
"step": 448
},
{
"epoch": 0.5494505494505495,
"grad_norm": 1.6893372535705566,
"learning_rate": 1.962819629482386e-05,
"loss": 1.1235462427139282,
"step": 450
},
{
"epoch": 0.5518925518925519,
"grad_norm": 1.6189004182815552,
"learning_rate": 1.9620886148106498e-05,
"loss": 0.9178623557090759,
"step": 452
},
{
"epoch": 0.5543345543345544,
"grad_norm": 1.3195807933807373,
"learning_rate": 1.9613506381413194e-05,
"loss": 1.377665400505066,
"step": 454
},
{
"epoch": 0.5567765567765568,
"grad_norm": 1.4087958335876465,
"learning_rate": 1.960605705434097e-05,
"loss": 1.3081351518630981,
"step": 456
},
{
"epoch": 0.5592185592185592,
"grad_norm": 0.652862012386322,
"learning_rate": 1.95985382270486e-05,
"loss": 0.8939856290817261,
"step": 458
},
{
"epoch": 0.5616605616605617,
"grad_norm": 3.38787579536438,
"learning_rate": 1.9590949960256132e-05,
"loss": 1.266584873199463,
"step": 460
},
{
"epoch": 0.5641025641025641,
"grad_norm": 1.0980466604232788,
"learning_rate": 1.9583292315244383e-05,
"loss": 1.2569012641906738,
"step": 462
},
{
"epoch": 0.5665445665445665,
"grad_norm": 4.170780181884766,
"learning_rate": 1.9575565353854448e-05,
"loss": 0.641703724861145,
"step": 464
},
{
"epoch": 0.568986568986569,
"grad_norm": 1.1431292295455933,
"learning_rate": 1.9567769138487208e-05,
"loss": 1.567794680595398,
"step": 466
},
{
"epoch": 0.5714285714285714,
"grad_norm": 1.7932583093643188,
"learning_rate": 1.955990373210281e-05,
"loss": 1.3980201482772827,
"step": 468
},
{
"epoch": 0.5738705738705738,
"grad_norm": 2.625420570373535,
"learning_rate": 1.9551969198220188e-05,
"loss": 1.1457037925720215,
"step": 470
},
{
"epoch": 0.5763125763125763,
"grad_norm": 4.886669635772705,
"learning_rate": 1.954396560091652e-05,
"loss": 1.344892144203186,
"step": 472
},
{
"epoch": 0.5787545787545788,
"grad_norm": 2.250831127166748,
"learning_rate": 1.953589300482671e-05,
"loss": 0.9534360766410828,
"step": 474
},
{
"epoch": 0.5811965811965812,
"grad_norm": 2.8664050102233887,
"learning_rate": 1.9527751475142904e-05,
"loss": 1.0838558673858643,
"step": 476
},
{
"epoch": 0.5836385836385837,
"grad_norm": 0.9391406774520874,
"learning_rate": 1.951954107761391e-05,
"loss": 1.2320207357406616,
"step": 478
},
{
"epoch": 0.5860805860805861,
"grad_norm": 1.4157171249389648,
"learning_rate": 1.9511261878544715e-05,
"loss": 1.3821120262145996,
"step": 480
},
{
"epoch": 0.5885225885225885,
"grad_norm": 4.214658737182617,
"learning_rate": 1.950291394479592e-05,
"loss": 0.5741876363754272,
"step": 482
},
{
"epoch": 0.590964590964591,
"grad_norm": 2.0429494380950928,
"learning_rate": 1.9494497343783212e-05,
"loss": 1.1259833574295044,
"step": 484
},
{
"epoch": 0.5934065934065934,
"grad_norm": 0.9556084275245667,
"learning_rate": 1.9486012143476813e-05,
"loss": 1.1523076295852661,
"step": 486
},
{
"epoch": 0.5958485958485958,
"grad_norm": 5.83870792388916,
"learning_rate": 1.9477458412400934e-05,
"loss": 1.0496693849563599,
"step": 488
},
{
"epoch": 0.5982905982905983,
"grad_norm": 1.3661986589431763,
"learning_rate": 1.946883621963323e-05,
"loss": 1.1105148792266846,
"step": 490
},
{
"epoch": 0.6007326007326007,
"grad_norm": 1.4116313457489014,
"learning_rate": 1.946014563480422e-05,
"loss": 0.9300603866577148,
"step": 492
},
{
"epoch": 0.6031746031746031,
"grad_norm": 1.300858974456787,
"learning_rate": 1.9451386728096758e-05,
"loss": 1.0661330223083496,
"step": 494
},
{
"epoch": 0.6056166056166056,
"grad_norm": 1.8178846836090088,
"learning_rate": 1.9442559570245433e-05,
"loss": 1.304194450378418,
"step": 496
},
{
"epoch": 0.608058608058608,
"grad_norm": 1.6697763204574585,
"learning_rate": 1.9433664232536014e-05,
"loss": 0.6469916105270386,
"step": 498
},
{
"epoch": 0.6105006105006106,
"grad_norm": 1.206526279449463,
"learning_rate": 1.9424700786804877e-05,
"loss": 0.9863432049751282,
"step": 500
},
{
"epoch": 0.612942612942613,
"grad_norm": 1.7002737522125244,
"learning_rate": 1.9415669305438413e-05,
"loss": 1.2856956720352173,
"step": 502
},
{
"epoch": 0.6153846153846154,
"grad_norm": 1.4255826473236084,
"learning_rate": 1.9406569861372466e-05,
"loss": 1.3286441564559937,
"step": 504
},
{
"epoch": 0.6178266178266179,
"grad_norm": 1.0831611156463623,
"learning_rate": 1.9397402528091707e-05,
"loss": 1.3130193948745728,
"step": 506
},
{
"epoch": 0.6202686202686203,
"grad_norm": 1.9533292055130005,
"learning_rate": 1.9388167379629076e-05,
"loss": 1.380988597869873,
"step": 508
},
{
"epoch": 0.6227106227106227,
"grad_norm": 3.5476789474487305,
"learning_rate": 1.9378864490565172e-05,
"loss": 1.3338630199432373,
"step": 510
},
{
"epoch": 0.6251526251526252,
"grad_norm": 3.0227179527282715,
"learning_rate": 1.9369493936027642e-05,
"loss": 1.2690256834030151,
"step": 512
},
{
"epoch": 0.6275946275946276,
"grad_norm": 0.7818955779075623,
"learning_rate": 1.9360055791690584e-05,
"loss": 1.1770192384719849,
"step": 514
},
{
"epoch": 0.63003663003663,
"grad_norm": 0.7348341941833496,
"learning_rate": 1.935055013377393e-05,
"loss": 1.119304895401001,
"step": 516
},
{
"epoch": 0.6324786324786325,
"grad_norm": 1.465811848640442,
"learning_rate": 1.934097703904284e-05,
"loss": 1.34721040725708,
"step": 518
},
{
"epoch": 0.6349206349206349,
"grad_norm": 1.2145129442214966,
"learning_rate": 1.933133658480707e-05,
"loss": 0.9806722402572632,
"step": 520
},
{
"epoch": 0.6373626373626373,
"grad_norm": 2.869335174560547,
"learning_rate": 1.9321628848920358e-05,
"loss": 1.0333569049835205,
"step": 522
},
{
"epoch": 0.6398046398046398,
"grad_norm": 2.509185552597046,
"learning_rate": 1.9311853909779785e-05,
"loss": 1.087817907333374,
"step": 524
},
{
"epoch": 0.6422466422466423,
"grad_norm": 1.7746318578720093,
"learning_rate": 1.9302011846325156e-05,
"loss": 1.3438972234725952,
"step": 526
},
{
"epoch": 0.6446886446886447,
"grad_norm": 0.9185584783554077,
"learning_rate": 1.9292102738038347e-05,
"loss": 1.38664972782135,
"step": 528
},
{
"epoch": 0.6471306471306472,
"grad_norm": 1.1560609340667725,
"learning_rate": 1.9282126664942667e-05,
"loss": 1.1136956214904785,
"step": 530
},
{
"epoch": 0.6495726495726496,
"grad_norm": 1.5920125246047974,
"learning_rate": 1.927208370760223e-05,
"loss": 1.0266146659851074,
"step": 532
},
{
"epoch": 0.652014652014652,
"grad_norm": 2.174090623855591,
"learning_rate": 1.9261973947121273e-05,
"loss": 1.6666396856307983,
"step": 534
},
{
"epoch": 0.6544566544566545,
"grad_norm": 1.7790899276733398,
"learning_rate": 1.925179746514352e-05,
"loss": 0.9882057309150696,
"step": 536
},
{
"epoch": 0.6568986568986569,
"grad_norm": 1.3070317506790161,
"learning_rate": 1.9241554343851537e-05,
"loss": 1.368809461593628,
"step": 538
},
{
"epoch": 0.6593406593406593,
"grad_norm": 1.5976839065551758,
"learning_rate": 1.923124466596602e-05,
"loss": 1.3585935831069946,
"step": 540
},
{
"epoch": 0.6617826617826618,
"grad_norm": 1.836732268333435,
"learning_rate": 1.922086851474519e-05,
"loss": 1.0160579681396484,
"step": 542
},
{
"epoch": 0.6642246642246642,
"grad_norm": 4.108547687530518,
"learning_rate": 1.9210425973984074e-05,
"loss": 1.3244247436523438,
"step": 544
},
{
"epoch": 0.6666666666666666,
"grad_norm": 1.7101798057556152,
"learning_rate": 1.9199917128013836e-05,
"loss": 1.2471184730529785,
"step": 546
},
{
"epoch": 0.6691086691086691,
"grad_norm": 1.3308701515197754,
"learning_rate": 1.918934206170112e-05,
"loss": 1.3621915578842163,
"step": 548
},
{
"epoch": 0.6715506715506715,
"grad_norm": 1.1020407676696777,
"learning_rate": 1.917870086044734e-05,
"loss": 1.230018973350525,
"step": 550
},
{
"epoch": 0.673992673992674,
"grad_norm": 19.01947021484375,
"learning_rate": 1.9167993610187988e-05,
"loss": 1.0613629817962646,
"step": 552
},
{
"epoch": 0.6764346764346765,
"grad_norm": 1.0684137344360352,
"learning_rate": 1.915722039739197e-05,
"loss": 1.1644939184188843,
"step": 554
},
{
"epoch": 0.6788766788766789,
"grad_norm": 1.4123005867004395,
"learning_rate": 1.9146381309060874e-05,
"loss": 0.9099707007408142,
"step": 556
},
{
"epoch": 0.6813186813186813,
"grad_norm": 3.2105636596679688,
"learning_rate": 1.913547643272828e-05,
"loss": 1.228736400604248,
"step": 558
},
{
"epoch": 0.6837606837606838,
"grad_norm": 0.4815189242362976,
"learning_rate": 1.912450585645907e-05,
"loss": 1.3034601211547852,
"step": 560
},
{
"epoch": 0.6862026862026862,
"grad_norm": 2.001192569732666,
"learning_rate": 1.9113469668848675e-05,
"loss": 1.072668433189392,
"step": 562
},
{
"epoch": 0.6886446886446886,
"grad_norm": 1.3243483304977417,
"learning_rate": 1.9102367959022417e-05,
"loss": 1.3628251552581787,
"step": 564
},
{
"epoch": 0.6910866910866911,
"grad_norm": 1.6034096479415894,
"learning_rate": 1.909120081663473e-05,
"loss": 1.1910985708236694,
"step": 566
},
{
"epoch": 0.6935286935286935,
"grad_norm": 1.6782633066177368,
"learning_rate": 1.9079968331868487e-05,
"loss": 1.4165751934051514,
"step": 568
},
{
"epoch": 0.6959706959706959,
"grad_norm": 0.8705784678459167,
"learning_rate": 1.9068670595434228e-05,
"loss": 1.1330338716506958,
"step": 570
},
{
"epoch": 0.6984126984126984,
"grad_norm": 3.466735601425171,
"learning_rate": 1.9057307698569458e-05,
"loss": 1.0612688064575195,
"step": 572
},
{
"epoch": 0.7008547008547008,
"grad_norm": 2.736870765686035,
"learning_rate": 1.9045879733037907e-05,
"loss": 1.4824306964874268,
"step": 574
},
{
"epoch": 0.7032967032967034,
"grad_norm": 1.9692933559417725,
"learning_rate": 1.9034386791128766e-05,
"loss": 1.28273606300354,
"step": 576
},
{
"epoch": 0.7057387057387058,
"grad_norm": 0.8525418043136597,
"learning_rate": 1.9022828965655975e-05,
"loss": 1.2495508193969727,
"step": 578
},
{
"epoch": 0.7081807081807082,
"grad_norm": 0.8721325993537903,
"learning_rate": 1.9011206349957444e-05,
"loss": 1.2048630714416504,
"step": 580
},
{
"epoch": 0.7106227106227107,
"grad_norm": 1.3199268579483032,
"learning_rate": 1.899951903789431e-05,
"loss": 1.2845754623413086,
"step": 582
},
{
"epoch": 0.7130647130647131,
"grad_norm": 1.1963062286376953,
"learning_rate": 1.8987767123850197e-05,
"loss": 1.2032135725021362,
"step": 584
},
{
"epoch": 0.7155067155067155,
"grad_norm": 1.1792757511138916,
"learning_rate": 1.8975950702730425e-05,
"loss": 1.375983715057373,
"step": 586
},
{
"epoch": 0.717948717948718,
"grad_norm": 1.8274788856506348,
"learning_rate": 1.8964069869961254e-05,
"loss": 1.1112651824951172,
"step": 588
},
{
"epoch": 0.7203907203907204,
"grad_norm": 1.0463271141052246,
"learning_rate": 1.8952124721489115e-05,
"loss": 1.0283359289169312,
"step": 590
},
{
"epoch": 0.7228327228327228,
"grad_norm": 1.1223207712173462,
"learning_rate": 1.8940115353779847e-05,
"loss": 0.9025493860244751,
"step": 592
},
{
"epoch": 0.7252747252747253,
"grad_norm": 1.450899600982666,
"learning_rate": 1.8928041863817896e-05,
"loss": 1.2699706554412842,
"step": 594
},
{
"epoch": 0.7277167277167277,
"grad_norm": 2.5641753673553467,
"learning_rate": 1.891590434910554e-05,
"loss": 1.0194693803787231,
"step": 596
},
{
"epoch": 0.7301587301587301,
"grad_norm": 0.7553045153617859,
"learning_rate": 1.890370290766212e-05,
"loss": 1.160589337348938,
"step": 598
},
{
"epoch": 0.7326007326007326,
"grad_norm": 1.1860005855560303,
"learning_rate": 1.8891437638023212e-05,
"loss": 1.2648638486862183,
"step": 600
},
{
"epoch": 0.7350427350427351,
"grad_norm": 1.1435580253601074,
"learning_rate": 1.8879108639239864e-05,
"loss": 1.3810834884643555,
"step": 602
},
{
"epoch": 0.7374847374847375,
"grad_norm": 0.9142278432846069,
"learning_rate": 1.8866716010877774e-05,
"loss": 1.2209972143173218,
"step": 604
},
{
"epoch": 0.73992673992674,
"grad_norm": 3.2111129760742188,
"learning_rate": 1.885425985301651e-05,
"loss": 1.510741949081421,
"step": 606
},
{
"epoch": 0.7423687423687424,
"grad_norm": 1.5610990524291992,
"learning_rate": 1.884174026624868e-05,
"loss": 1.3180582523345947,
"step": 608
},
{
"epoch": 0.7448107448107448,
"grad_norm": 0.7449647188186646,
"learning_rate": 1.8829157351679116e-05,
"loss": 0.9663639664649963,
"step": 610
},
{
"epoch": 0.7472527472527473,
"grad_norm": 1.3256258964538574,
"learning_rate": 1.881651121092408e-05,
"loss": 1.2966718673706055,
"step": 612
},
{
"epoch": 0.7496947496947497,
"grad_norm": 1.0234135389328003,
"learning_rate": 1.880380194611044e-05,
"loss": 1.2717726230621338,
"step": 614
},
{
"epoch": 0.7521367521367521,
"grad_norm": 2.811690092086792,
"learning_rate": 1.8791029659874817e-05,
"loss": 1.0650262832641602,
"step": 616
},
{
"epoch": 0.7545787545787546,
"grad_norm": 1.469228744506836,
"learning_rate": 1.877819445536279e-05,
"loss": 1.6179522275924683,
"step": 618
},
{
"epoch": 0.757020757020757,
"grad_norm": 2.5131025314331055,
"learning_rate": 1.8765296436228043e-05,
"loss": 1.1963871717453003,
"step": 620
},
{
"epoch": 0.7594627594627594,
"grad_norm": 1.2842845916748047,
"learning_rate": 1.875233570663154e-05,
"loss": 0.9286983013153076,
"step": 622
},
{
"epoch": 0.7619047619047619,
"grad_norm": 1.0976072549819946,
"learning_rate": 1.8739312371240678e-05,
"loss": 1.2990517616271973,
"step": 624
},
{
"epoch": 0.7643467643467643,
"grad_norm": 1.3670490980148315,
"learning_rate": 1.8726226535228425e-05,
"loss": 1.352059006690979,
"step": 626
},
{
"epoch": 0.7667887667887668,
"grad_norm": 2.016474485397339,
"learning_rate": 1.871307830427251e-05,
"loss": 1.1491894721984863,
"step": 628
},
{
"epoch": 0.7692307692307693,
"grad_norm": 1.5183488130569458,
"learning_rate": 1.8699867784554537e-05,
"loss": 1.3350757360458374,
"step": 630
},
{
"epoch": 0.7716727716727717,
"grad_norm": 0.8359405398368835,
"learning_rate": 1.868659508275914e-05,
"loss": 1.0210474729537964,
"step": 632
},
{
"epoch": 0.7741147741147741,
"grad_norm": 1.0358965396881104,
"learning_rate": 1.867326030607311e-05,
"loss": 1.0034987926483154,
"step": 634
},
{
"epoch": 0.7765567765567766,
"grad_norm": 2.3178768157958984,
"learning_rate": 1.8659863562184552e-05,
"loss": 1.3230623006820679,
"step": 636
},
{
"epoch": 0.778998778998779,
"grad_norm": 1.5217390060424805,
"learning_rate": 1.8646404959281986e-05,
"loss": 1.3143547773361206,
"step": 638
},
{
"epoch": 0.7814407814407814,
"grad_norm": 1.7523036003112793,
"learning_rate": 1.8632884606053506e-05,
"loss": 0.9751634001731873,
"step": 640
},
{
"epoch": 0.7838827838827839,
"grad_norm": 2.0202057361602783,
"learning_rate": 1.861930261168587e-05,
"loss": 1.1349761486053467,
"step": 642
},
{
"epoch": 0.7863247863247863,
"grad_norm": 0.9345976710319519,
"learning_rate": 1.860565908586365e-05,
"loss": 1.2226810455322266,
"step": 644
},
{
"epoch": 0.7887667887667887,
"grad_norm": 1.210115909576416,
"learning_rate": 1.859195413876831e-05,
"loss": 1.0119144916534424,
"step": 646
},
{
"epoch": 0.7912087912087912,
"grad_norm": 1.0988825559616089,
"learning_rate": 1.857818788107734e-05,
"loss": 1.26012122631073,
"step": 648
},
{
"epoch": 0.7936507936507936,
"grad_norm": 4.83104944229126,
"learning_rate": 1.856436042396338e-05,
"loss": 0.5898873209953308,
"step": 650
},
{
"epoch": 0.796092796092796,
"grad_norm": 1.161339282989502,
"learning_rate": 1.8550471879093275e-05,
"loss": 0.8887655138969421,
"step": 652
},
{
"epoch": 0.7985347985347986,
"grad_norm": 1.4048727750778198,
"learning_rate": 1.8536522358627205e-05,
"loss": 1.2602205276489258,
"step": 654
},
{
"epoch": 0.800976800976801,
"grad_norm": 2.1626598834991455,
"learning_rate": 1.852251197521778e-05,
"loss": 1.2750191688537598,
"step": 656
},
{
"epoch": 0.8034188034188035,
"grad_norm": 2.365673065185547,
"learning_rate": 1.8508440842009113e-05,
"loss": 0.5839018225669861,
"step": 658
},
{
"epoch": 0.8058608058608059,
"grad_norm": 1.4860225915908813,
"learning_rate": 1.849430907263592e-05,
"loss": 1.297167181968689,
"step": 660
},
{
"epoch": 0.8083028083028083,
"grad_norm": 1.04447603225708,
"learning_rate": 1.8480116781222604e-05,
"loss": 1.2555423974990845,
"step": 662
},
{
"epoch": 0.8107448107448108,
"grad_norm": 0.8101674318313599,
"learning_rate": 1.846586408238232e-05,
"loss": 1.3545968532562256,
"step": 664
},
{
"epoch": 0.8131868131868132,
"grad_norm": 1.1193162202835083,
"learning_rate": 1.8451551091216064e-05,
"loss": 0.9384480118751526,
"step": 666
},
{
"epoch": 0.8156288156288156,
"grad_norm": 1.269223928451538,
"learning_rate": 1.8437177923311728e-05,
"loss": 1.0872721672058105,
"step": 668
},
{
"epoch": 0.818070818070818,
"grad_norm": 1.7073310613632202,
"learning_rate": 1.842274469474318e-05,
"loss": 1.4501525163650513,
"step": 670
},
{
"epoch": 0.8205128205128205,
"grad_norm": 1.2747077941894531,
"learning_rate": 1.8408251522069323e-05,
"loss": 1.296190857887268,
"step": 672
},
{
"epoch": 0.8229548229548229,
"grad_norm": 1.145330786705017,
"learning_rate": 1.8393698522333158e-05,
"loss": 1.076781153678894,
"step": 674
},
{
"epoch": 0.8253968253968254,
"grad_norm": 1.0505316257476807,
"learning_rate": 1.837908581306082e-05,
"loss": 0.963850200176239,
"step": 676
},
{
"epoch": 0.8278388278388278,
"grad_norm": 4.262927055358887,
"learning_rate": 1.8364413512260656e-05,
"loss": 1.2688353061676025,
"step": 678
},
{
"epoch": 0.8302808302808303,
"grad_norm": 2.2526209354400635,
"learning_rate": 1.8349681738422245e-05,
"loss": 1.3245513439178467,
"step": 680
},
{
"epoch": 0.8327228327228328,
"grad_norm": 1.7615208625793457,
"learning_rate": 1.8334890610515465e-05,
"loss": 1.2618424892425537,
"step": 682
},
{
"epoch": 0.8351648351648352,
"grad_norm": 2.4765729904174805,
"learning_rate": 1.8320040247989516e-05,
"loss": 0.9116923213005066,
"step": 684
},
{
"epoch": 0.8376068376068376,
"grad_norm": 2.0831899642944336,
"learning_rate": 1.8305130770771966e-05,
"loss": 1.4006067514419556,
"step": 686
},
{
"epoch": 0.8400488400488401,
"grad_norm": 3.837216854095459,
"learning_rate": 1.829016229926777e-05,
"loss": 1.3707760572433472,
"step": 688
},
{
"epoch": 0.8424908424908425,
"grad_norm": 12.806596755981445,
"learning_rate": 1.827513495435831e-05,
"loss": 1.0350643396377563,
"step": 690
},
{
"epoch": 0.8449328449328449,
"grad_norm": 1.426324486732483,
"learning_rate": 1.826004885740042e-05,
"loss": 1.3101565837860107,
"step": 692
},
{
"epoch": 0.8473748473748474,
"grad_norm": 0.7182126045227051,
"learning_rate": 1.8244904130225383e-05,
"loss": 1.1183477640151978,
"step": 694
},
{
"epoch": 0.8498168498168498,
"grad_norm": 1.0692784786224365,
"learning_rate": 1.8229700895137977e-05,
"loss": 1.2185040712356567,
"step": 696
},
{
"epoch": 0.8522588522588522,
"grad_norm": 1.405985951423645,
"learning_rate": 1.821443927491548e-05,
"loss": 1.0439921617507935,
"step": 698
},
{
"epoch": 0.8547008547008547,
"grad_norm": 0.9861589074134827,
"learning_rate": 1.819911939280665e-05,
"loss": 1.179707646369934,
"step": 700
},
{
"epoch": 0.8571428571428571,
"grad_norm": 1.3593485355377197,
"learning_rate": 1.8183741372530778e-05,
"loss": 1.1061705350875854,
"step": 702
},
{
"epoch": 0.8595848595848596,
"grad_norm": 2.342923402786255,
"learning_rate": 1.816830533827665e-05,
"loss": 1.0052831172943115,
"step": 704
},
{
"epoch": 0.8620268620268621,
"grad_norm": 1.4813743829727173,
"learning_rate": 1.815281141470155e-05,
"loss": 0.5395532250404358,
"step": 706
},
{
"epoch": 0.8644688644688645,
"grad_norm": 1.3919825553894043,
"learning_rate": 1.8137259726930283e-05,
"loss": 1.2419100999832153,
"step": 708
},
{
"epoch": 0.8669108669108669,
"grad_norm": 3.034050464630127,
"learning_rate": 1.8121650400554125e-05,
"loss": 0.9318399429321289,
"step": 710
},
{
"epoch": 0.8693528693528694,
"grad_norm": 4.048087120056152,
"learning_rate": 1.8105983561629827e-05,
"loss": 1.4534571170806885,
"step": 712
},
{
"epoch": 0.8717948717948718,
"grad_norm": 13.133171081542969,
"learning_rate": 1.8090259336678598e-05,
"loss": 1.6200733184814453,
"step": 714
},
{
"epoch": 0.8742368742368742,
"grad_norm": 1.3102926015853882,
"learning_rate": 1.8074477852685088e-05,
"loss": 1.4871742725372314,
"step": 716
},
{
"epoch": 0.8766788766788767,
"grad_norm": 0.9029149413108826,
"learning_rate": 1.805863923709635e-05,
"loss": 1.0001909732818604,
"step": 718
},
{
"epoch": 0.8791208791208791,
"grad_norm": 0.828899621963501,
"learning_rate": 1.8042743617820814e-05,
"loss": 1.2416490316390991,
"step": 720
},
{
"epoch": 0.8815628815628815,
"grad_norm": 2.1641383171081543,
"learning_rate": 1.8026791123227255e-05,
"loss": 0.8903718590736389,
"step": 722
},
{
"epoch": 0.884004884004884,
"grad_norm": 1.445026159286499,
"learning_rate": 1.8010781882143773e-05,
"loss": 1.285760521888733,
"step": 724
},
{
"epoch": 0.8864468864468864,
"grad_norm": 0.9921174645423889,
"learning_rate": 1.799471602385672e-05,
"loss": 1.2185858488082886,
"step": 726
},
{
"epoch": 0.8888888888888888,
"grad_norm": 1.5229535102844238,
"learning_rate": 1.797859367810968e-05,
"loss": 1.2078474760055542,
"step": 728
},
{
"epoch": 0.8913308913308914,
"grad_norm": 1.6648898124694824,
"learning_rate": 1.7962414975102416e-05,
"loss": 1.4831866025924683,
"step": 730
},
{
"epoch": 0.8937728937728938,
"grad_norm": 3.4526195526123047,
"learning_rate": 1.794618004548982e-05,
"loss": 1.2522797584533691,
"step": 732
},
{
"epoch": 0.8962148962148963,
"grad_norm": 1.0352317094802856,
"learning_rate": 1.7929889020380842e-05,
"loss": 1.0359210968017578,
"step": 734
},
{
"epoch": 0.8986568986568987,
"grad_norm": 0.8629250526428223,
"learning_rate": 1.791354203133746e-05,
"loss": 0.8198949098587036,
"step": 736
},
{
"epoch": 0.9010989010989011,
"grad_norm": 4.816531658172607,
"learning_rate": 1.7897139210373594e-05,
"loss": 0.9690486788749695,
"step": 738
},
{
"epoch": 0.9035409035409036,
"grad_norm": 2.7800450325012207,
"learning_rate": 1.7880680689954047e-05,
"loss": 1.0706011056900024,
"step": 740
},
{
"epoch": 0.905982905982906,
"grad_norm": 1.3503133058547974,
"learning_rate": 1.786416660299344e-05,
"loss": 0.9173503518104553,
"step": 742
},
{
"epoch": 0.9084249084249084,
"grad_norm": 0.9783918261528015,
"learning_rate": 1.7847597082855133e-05,
"loss": 0.9544399976730347,
"step": 744
},
{
"epoch": 0.9108669108669109,
"grad_norm": 1.6359418630599976,
"learning_rate": 1.7830972263350142e-05,
"loss": 1.2056411504745483,
"step": 746
},
{
"epoch": 0.9133089133089133,
"grad_norm": 1.5760291814804077,
"learning_rate": 1.7814292278736084e-05,
"loss": 0.9109166264533997,
"step": 748
},
{
"epoch": 0.9157509157509157,
"grad_norm": 1.4765530824661255,
"learning_rate": 1.7797557263716054e-05,
"loss": 1.401995301246643,
"step": 750
},
{
"epoch": 0.9181929181929182,
"grad_norm": 0.7756912708282471,
"learning_rate": 1.7780767353437573e-05,
"loss": 1.2727299928665161,
"step": 752
},
{
"epoch": 0.9206349206349206,
"grad_norm": 0.8636785745620728,
"learning_rate": 1.7763922683491476e-05,
"loss": 1.2869514226913452,
"step": 754
},
{
"epoch": 0.9230769230769231,
"grad_norm": 0.8454907536506653,
"learning_rate": 1.7747023389910815e-05,
"loss": 1.2656826972961426,
"step": 756
},
{
"epoch": 0.9255189255189256,
"grad_norm": 1.3287631273269653,
"learning_rate": 1.773006960916978e-05,
"loss": 1.3375307321548462,
"step": 758
},
{
"epoch": 0.927960927960928,
"grad_norm": 1.5437045097351074,
"learning_rate": 1.7713061478182582e-05,
"loss": 0.8308702111244202,
"step": 760
},
{
"epoch": 0.9304029304029304,
"grad_norm": 3.6134531497955322,
"learning_rate": 1.7695999134302348e-05,
"loss": 1.2227895259857178,
"step": 762
},
{
"epoch": 0.9328449328449329,
"grad_norm": 1.38361394405365,
"learning_rate": 1.767888271532001e-05,
"loss": 0.9452077150344849,
"step": 764
},
{
"epoch": 0.9352869352869353,
"grad_norm": 1.8651083707809448,
"learning_rate": 1.7661712359463202e-05,
"loss": 0.6139346957206726,
"step": 766
},
{
"epoch": 0.9377289377289377,
"grad_norm": 1.1716833114624023,
"learning_rate": 1.7644488205395136e-05,
"loss": 0.9175626039505005,
"step": 768
},
{
"epoch": 0.9401709401709402,
"grad_norm": 4.235447406768799,
"learning_rate": 1.7627210392213484e-05,
"loss": 0.7235321402549744,
"step": 770
},
{
"epoch": 0.9426129426129426,
"grad_norm": 0.8599190711975098,
"learning_rate": 1.7609879059449256e-05,
"loss": 1.1240880489349365,
"step": 772
},
{
"epoch": 0.945054945054945,
"grad_norm": 3.387906789779663,
"learning_rate": 1.7592494347065667e-05,
"loss": 1.3139581680297852,
"step": 774
},
{
"epoch": 0.9474969474969475,
"grad_norm": 1.6255816221237183,
"learning_rate": 1.7575056395457017e-05,
"loss": 1.2285006046295166,
"step": 776
},
{
"epoch": 0.9499389499389499,
"grad_norm": 2.0512325763702393,
"learning_rate": 1.7557565345447548e-05,
"loss": 0.9121115207672119,
"step": 778
},
{
"epoch": 0.9523809523809523,
"grad_norm": 1.003928542137146,
"learning_rate": 1.754002133829031e-05,
"loss": 1.1289280652999878,
"step": 780
},
{
"epoch": 0.9548229548229549,
"grad_norm": 6.144791126251221,
"learning_rate": 1.752242451566603e-05,
"loss": 1.1398252248764038,
"step": 782
},
{
"epoch": 0.9572649572649573,
"grad_norm": 0.8303928375244141,
"learning_rate": 1.7504775019681946e-05,
"loss": 1.263461709022522,
"step": 784
},
{
"epoch": 0.9597069597069597,
"grad_norm": 1.0771842002868652,
"learning_rate": 1.7487072992870683e-05,
"loss": 1.2938859462738037,
"step": 786
},
{
"epoch": 0.9621489621489622,
"grad_norm": 1.3151885271072388,
"learning_rate": 1.746931857818908e-05,
"loss": 1.3971589803695679,
"step": 788
},
{
"epoch": 0.9645909645909646,
"grad_norm": 2.2546122074127197,
"learning_rate": 1.7451511919017054e-05,
"loss": 1.341101884841919,
"step": 790
},
{
"epoch": 0.967032967032967,
"grad_norm": 0.76347416639328,
"learning_rate": 1.743365315915643e-05,
"loss": 1.0966370105743408,
"step": 792
},
{
"epoch": 0.9694749694749695,
"grad_norm": 1.2820730209350586,
"learning_rate": 1.7415742442829792e-05,
"loss": 1.3368990421295166,
"step": 794
},
{
"epoch": 0.9719169719169719,
"grad_norm": 0.7520409226417542,
"learning_rate": 1.7397779914679303e-05,
"loss": 1.2155550718307495,
"step": 796
},
{
"epoch": 0.9743589743589743,
"grad_norm": 0.652754545211792,
"learning_rate": 1.7379765719765542e-05,
"loss": 1.2150750160217285,
"step": 798
},
{
"epoch": 0.9768009768009768,
"grad_norm": 0.8119310736656189,
"learning_rate": 1.7361700003566348e-05,
"loss": 1.2871735095977783,
"step": 800
},
{
"epoch": 0.9792429792429792,
"grad_norm": 2.2065281867980957,
"learning_rate": 1.734358291197562e-05,
"loss": 0.9395040273666382,
"step": 802
},
{
"epoch": 0.9816849816849816,
"grad_norm": 0.936976432800293,
"learning_rate": 1.732541459130215e-05,
"loss": 1.1477895975112915,
"step": 804
},
{
"epoch": 0.9841269841269841,
"grad_norm": 1.3355202674865723,
"learning_rate": 1.730719518826846e-05,
"loss": 1.573718547821045,
"step": 806
},
{
"epoch": 0.9865689865689866,
"grad_norm": 4.425434112548828,
"learning_rate": 1.7288924850009576e-05,
"loss": 0.9391233325004578,
"step": 808
},
{
"epoch": 0.989010989010989,
"grad_norm": 0.7173460721969604,
"learning_rate": 1.7270603724071876e-05,
"loss": 1.364790916442871,
"step": 810
},
{
"epoch": 0.9914529914529915,
"grad_norm": 0.7534496784210205,
"learning_rate": 1.725223195841189e-05,
"loss": 1.2704541683197021,
"step": 812
},
{
"epoch": 0.9938949938949939,
"grad_norm": 1.4058549404144287,
"learning_rate": 1.7233809701395087e-05,
"loss": 1.35564386844635,
"step": 814
},
{
"epoch": 0.9963369963369964,
"grad_norm": 0.8958796858787537,
"learning_rate": 1.72153371017947e-05,
"loss": 1.233031153678894,
"step": 816
},
{
"epoch": 0.9987789987789988,
"grad_norm": 0.7508826851844788,
"learning_rate": 1.7196814308790516e-05,
"loss": 1.1463748216629028,
"step": 818
},
{
"epoch": 1.0012210012210012,
"grad_norm": 0.6122261881828308,
"learning_rate": 1.717824147196767e-05,
"loss": 1.007127285003662,
"step": 820
},
{
"epoch": 1.0036630036630036,
"grad_norm": 1.004014015197754,
"learning_rate": 1.7159618741315433e-05,
"loss": 1.0883307456970215,
"step": 822
},
{
"epoch": 1.006105006105006,
"grad_norm": 1.8373212814331055,
"learning_rate": 1.7140946267226006e-05,
"loss": 0.4619407653808594,
"step": 824
},
{
"epoch": 1.0085470085470085,
"grad_norm": 7.073435306549072,
"learning_rate": 1.712222420049331e-05,
"loss": 0.8937675356864929,
"step": 826
},
{
"epoch": 1.010989010989011,
"grad_norm": 3.16390061378479,
"learning_rate": 1.7103452692311756e-05,
"loss": 0.7834187150001526,
"step": 828
},
{
"epoch": 1.0134310134310134,
"grad_norm": 1.286433458328247,
"learning_rate": 1.708463189427504e-05,
"loss": 0.7017002105712891,
"step": 830
},
{
"epoch": 1.0158730158730158,
"grad_norm": 2.5467231273651123,
"learning_rate": 1.7065761958374905e-05,
"loss": 0.9201502203941345,
"step": 832
},
{
"epoch": 1.0183150183150182,
"grad_norm": 1.361122965812683,
"learning_rate": 1.7046843036999912e-05,
"loss": 0.9217178821563721,
"step": 834
},
{
"epoch": 1.0207570207570207,
"grad_norm": 2.6307156085968018,
"learning_rate": 1.7027875282934224e-05,
"loss": 1.00894033908844,
"step": 836
},
{
"epoch": 1.0231990231990231,
"grad_norm": 0.9444079995155334,
"learning_rate": 1.7008858849356363e-05,
"loss": 1.0666855573654175,
"step": 838
},
{
"epoch": 1.0256410256410255,
"grad_norm": 1.807748556137085,
"learning_rate": 1.6989793889837966e-05,
"loss": 0.7795441746711731,
"step": 840
},
{
"epoch": 1.028083028083028,
"grad_norm": 4.041755199432373,
"learning_rate": 1.6970680558342566e-05,
"loss": 0.7524101734161377,
"step": 842
},
{
"epoch": 1.0305250305250304,
"grad_norm": 0.885811448097229,
"learning_rate": 1.695151900922432e-05,
"loss": 0.9602640271186829,
"step": 844
},
{
"epoch": 1.032967032967033,
"grad_norm": 0.9917791485786438,
"learning_rate": 1.6932309397226792e-05,
"loss": 0.8459327816963196,
"step": 846
},
{
"epoch": 1.0354090354090355,
"grad_norm": 0.9382413029670715,
"learning_rate": 1.6913051877481676e-05,
"loss": 1.1561813354492188,
"step": 848
},
{
"epoch": 1.037851037851038,
"grad_norm": 1.5294519662857056,
"learning_rate": 1.6893746605507567e-05,
"loss": 0.7689896821975708,
"step": 850
},
{
"epoch": 1.0402930402930404,
"grad_norm": 1.7145957946777344,
"learning_rate": 1.6874393737208688e-05,
"loss": 0.5241991281509399,
"step": 852
},
{
"epoch": 1.0427350427350428,
"grad_norm": 0.781104326248169,
"learning_rate": 1.685499342887364e-05,
"loss": 1.0428876876831055,
"step": 854
},
{
"epoch": 1.0451770451770452,
"grad_norm": 1.5123246908187866,
"learning_rate": 1.6835545837174132e-05,
"loss": 0.668832004070282,
"step": 856
},
{
"epoch": 1.0476190476190477,
"grad_norm": 1.0035831928253174,
"learning_rate": 1.681605111916373e-05,
"loss": 1.2478870153427124,
"step": 858
},
{
"epoch": 1.05006105006105,
"grad_norm": 0.9146220684051514,
"learning_rate": 1.679650943227657e-05,
"loss": 0.8985828161239624,
"step": 860
},
{
"epoch": 1.0525030525030525,
"grad_norm": 1.358199119567871,
"learning_rate": 1.6776920934326103e-05,
"loss": 1.0257023572921753,
"step": 862
},
{
"epoch": 1.054945054945055,
"grad_norm": 1.0113524198532104,
"learning_rate": 1.675728578350381e-05,
"loss": 1.0212005376815796,
"step": 864
},
{
"epoch": 1.0573870573870574,
"grad_norm": 2.236260175704956,
"learning_rate": 1.673760413837793e-05,
"loss": 1.4508510828018188,
"step": 866
},
{
"epoch": 1.0598290598290598,
"grad_norm": 2.680145740509033,
"learning_rate": 1.6717876157892175e-05,
"loss": 0.5031489729881287,
"step": 868
},
{
"epoch": 1.0622710622710623,
"grad_norm": 1.7734426259994507,
"learning_rate": 1.6698102001364456e-05,
"loss": 0.9893677234649658,
"step": 870
},
{
"epoch": 1.0647130647130647,
"grad_norm": 1.0509651899337769,
"learning_rate": 1.6678281828485576e-05,
"loss": 0.897520124912262,
"step": 872
},
{
"epoch": 1.0671550671550671,
"grad_norm": 1.6916723251342773,
"learning_rate": 1.6658415799317966e-05,
"loss": 0.7381224036216736,
"step": 874
},
{
"epoch": 1.0695970695970696,
"grad_norm": 1.0783177614212036,
"learning_rate": 1.6638504074294375e-05,
"loss": 0.9826089143753052,
"step": 876
},
{
"epoch": 1.072039072039072,
"grad_norm": 0.9295514225959778,
"learning_rate": 1.6618546814216586e-05,
"loss": 1.0204219818115234,
"step": 878
},
{
"epoch": 1.0744810744810744,
"grad_norm": 2.3482747077941895,
"learning_rate": 1.65985441802541e-05,
"loss": 0.6614128947257996,
"step": 880
},
{
"epoch": 1.0769230769230769,
"grad_norm": 0.9849045276641846,
"learning_rate": 1.6578496333942848e-05,
"loss": 0.9977365732192993,
"step": 882
},
{
"epoch": 1.0793650793650793,
"grad_norm": 4.873172283172607,
"learning_rate": 1.655840343718389e-05,
"loss": 0.6593250036239624,
"step": 884
},
{
"epoch": 1.0818070818070817,
"grad_norm": 60.08795928955078,
"learning_rate": 1.6538265652242103e-05,
"loss": 0.7343877553939819,
"step": 886
},
{
"epoch": 1.0842490842490842,
"grad_norm": 1.1528880596160889,
"learning_rate": 1.6518083141744862e-05,
"loss": 1.0775821208953857,
"step": 888
},
{
"epoch": 1.0866910866910866,
"grad_norm": 0.5812370181083679,
"learning_rate": 1.649785606868073e-05,
"loss": 0.7265040874481201,
"step": 890
},
{
"epoch": 1.089133089133089,
"grad_norm": 0.9050455093383789,
"learning_rate": 1.647758459639816e-05,
"loss": 0.94173663854599,
"step": 892
},
{
"epoch": 1.0915750915750915,
"grad_norm": 1.2509444952011108,
"learning_rate": 1.6457268888604143e-05,
"loss": 1.1309514045715332,
"step": 894
},
{
"epoch": 1.0940170940170941,
"grad_norm": 1.1489883661270142,
"learning_rate": 1.643690910936292e-05,
"loss": 1.1048157215118408,
"step": 896
},
{
"epoch": 1.0964590964590966,
"grad_norm": 3.072650909423828,
"learning_rate": 1.6416505423094636e-05,
"loss": 0.8980664014816284,
"step": 898
},
{
"epoch": 1.098901098901099,
"grad_norm": 1.031434416770935,
"learning_rate": 1.639605799457401e-05,
"loss": 0.6644148826599121,
"step": 900
},
{
"epoch": 1.1013431013431014,
"grad_norm": 3.342662811279297,
"learning_rate": 1.6375566988929025e-05,
"loss": 0.6176282167434692,
"step": 902
},
{
"epoch": 1.1037851037851039,
"grad_norm": 0.8162381649017334,
"learning_rate": 1.6355032571639574e-05,
"loss": 0.5790269374847412,
"step": 904
},
{
"epoch": 1.1062271062271063,
"grad_norm": 1.7559690475463867,
"learning_rate": 1.6334454908536123e-05,
"loss": 0.8540843725204468,
"step": 906
},
{
"epoch": 1.1086691086691087,
"grad_norm": 2.1038284301757812,
"learning_rate": 1.631383416579839e-05,
"loss": 1.0307986736297607,
"step": 908
},
{
"epoch": 1.1111111111111112,
"grad_norm": 0.8097777366638184,
"learning_rate": 1.6293170509954e-05,
"loss": 0.7846847176551819,
"step": 910
},
{
"epoch": 1.1135531135531136,
"grad_norm": 0.7294727563858032,
"learning_rate": 1.6272464107877112e-05,
"loss": 1.0868881940841675,
"step": 912
},
{
"epoch": 1.115995115995116,
"grad_norm": 1.1073777675628662,
"learning_rate": 1.6251715126787114e-05,
"loss": 0.6077226400375366,
"step": 914
},
{
"epoch": 1.1184371184371185,
"grad_norm": 1.3670622110366821,
"learning_rate": 1.623092373424723e-05,
"loss": 0.7134993076324463,
"step": 916
},
{
"epoch": 1.120879120879121,
"grad_norm": 1.8728268146514893,
"learning_rate": 1.6210090098163206e-05,
"loss": 1.1230908632278442,
"step": 918
},
{
"epoch": 1.1233211233211233,
"grad_norm": 0.782214343547821,
"learning_rate": 1.618921438678192e-05,
"loss": 0.9432562589645386,
"step": 920
},
{
"epoch": 1.1257631257631258,
"grad_norm": 1.2588818073272705,
"learning_rate": 1.616829676869005e-05,
"loss": 0.8601541519165039,
"step": 922
},
{
"epoch": 1.1282051282051282,
"grad_norm": 1.1834020614624023,
"learning_rate": 1.61473374128127e-05,
"loss": 0.7565584778785706,
"step": 924
},
{
"epoch": 1.1306471306471306,
"grad_norm": 1.476582646369934,
"learning_rate": 1.612633648841203e-05,
"loss": 0.6475503444671631,
"step": 926
},
{
"epoch": 1.133089133089133,
"grad_norm": 1.7382149696350098,
"learning_rate": 1.61052941650859e-05,
"loss": 0.5194863677024841,
"step": 928
},
{
"epoch": 1.1355311355311355,
"grad_norm": 1.6398006677627563,
"learning_rate": 1.608421061276651e-05,
"loss": 0.8809158205986023,
"step": 930
},
{
"epoch": 1.137973137973138,
"grad_norm": 1.1977638006210327,
"learning_rate": 1.6063086001718986e-05,
"loss": 1.0729451179504395,
"step": 932
},
{
"epoch": 1.1404151404151404,
"grad_norm": 1.9817147254943848,
"learning_rate": 1.6041920502540058e-05,
"loss": 1.008049726486206,
"step": 934
},
{
"epoch": 1.1428571428571428,
"grad_norm": 1.1614291667938232,
"learning_rate": 1.6020714286156646e-05,
"loss": 0.8578592538833618,
"step": 936
},
{
"epoch": 1.1452991452991452,
"grad_norm": 0.9589775800704956,
"learning_rate": 1.59994675238245e-05,
"loss": 0.9546090960502625,
"step": 938
},
{
"epoch": 1.1477411477411477,
"grad_norm": 0.889543354511261,
"learning_rate": 1.5978180387126797e-05,
"loss": 1.0442495346069336,
"step": 940
},
{
"epoch": 1.15018315018315,
"grad_norm": 1.1197261810302734,
"learning_rate": 1.5956853047972776e-05,
"loss": 0.8928858637809753,
"step": 942
},
{
"epoch": 1.1526251526251525,
"grad_norm": 0.7546731233596802,
"learning_rate": 1.5935485678596328e-05,
"loss": 0.8579668998718262,
"step": 944
},
{
"epoch": 1.155067155067155,
"grad_norm": 1.2320284843444824,
"learning_rate": 1.5914078451554637e-05,
"loss": 0.683056652545929,
"step": 946
},
{
"epoch": 1.1575091575091574,
"grad_norm": 1.5659480094909668,
"learning_rate": 1.5892631539726754e-05,
"loss": 0.6238126754760742,
"step": 948
},
{
"epoch": 1.1599511599511598,
"grad_norm": 1.120065450668335,
"learning_rate": 1.5871145116312207e-05,
"loss": 0.9421287178993225,
"step": 950
},
{
"epoch": 1.1623931623931625,
"grad_norm": 0.9711224436759949,
"learning_rate": 1.5849619354829627e-05,
"loss": 0.9722180366516113,
"step": 952
},
{
"epoch": 1.164835164835165,
"grad_norm": 1.5788224935531616,
"learning_rate": 1.5828054429115317e-05,
"loss": 0.9436995983123779,
"step": 954
},
{
"epoch": 1.1672771672771673,
"grad_norm": 0.5967025756835938,
"learning_rate": 1.580645051332186e-05,
"loss": 0.8100671768188477,
"step": 956
},
{
"epoch": 1.1697191697191698,
"grad_norm": 0.7621123790740967,
"learning_rate": 1.5784807781916714e-05,
"loss": 0.7545087337493896,
"step": 958
},
{
"epoch": 1.1721611721611722,
"grad_norm": 1.0470103025436401,
"learning_rate": 1.5763126409680803e-05,
"loss": 1.0842094421386719,
"step": 960
},
{
"epoch": 1.1746031746031746,
"grad_norm": 1.1259769201278687,
"learning_rate": 1.5741406571707108e-05,
"loss": 0.7638933062553406,
"step": 962
},
{
"epoch": 1.177045177045177,
"grad_norm": 0.513518750667572,
"learning_rate": 1.571964844339924e-05,
"loss": 0.6498727798461914,
"step": 964
},
{
"epoch": 1.1794871794871795,
"grad_norm": 0.5694072246551514,
"learning_rate": 1.569785220047003e-05,
"loss": 0.983795702457428,
"step": 966
},
{
"epoch": 1.181929181929182,
"grad_norm": 0.9271643161773682,
"learning_rate": 1.5676018018940134e-05,
"loss": 1.1204752922058105,
"step": 968
},
{
"epoch": 1.1843711843711844,
"grad_norm": 1.4760109186172485,
"learning_rate": 1.5654146075136565e-05,
"loss": 0.7088498473167419,
"step": 970
},
{
"epoch": 1.1868131868131868,
"grad_norm": 1.260972023010254,
"learning_rate": 1.5632236545691308e-05,
"loss": 0.9644913077354431,
"step": 972
},
{
"epoch": 1.1892551892551892,
"grad_norm": 0.883178174495697,
"learning_rate": 1.561028960753988e-05,
"loss": 0.7552489638328552,
"step": 974
},
{
"epoch": 1.1916971916971917,
"grad_norm": 4.277756214141846,
"learning_rate": 1.5588305437919884e-05,
"loss": 0.6645691990852356,
"step": 976
},
{
"epoch": 1.1941391941391941,
"grad_norm": 1.147638201713562,
"learning_rate": 1.556628421436962e-05,
"loss": 0.8974350094795227,
"step": 978
},
{
"epoch": 1.1965811965811965,
"grad_norm": 2.6772568225860596,
"learning_rate": 1.554422611472661e-05,
"loss": 1.0676953792572021,
"step": 980
},
{
"epoch": 1.199023199023199,
"grad_norm": 1.5516761541366577,
"learning_rate": 1.552213131712617e-05,
"loss": 1.0465797185897827,
"step": 982
},
{
"epoch": 1.2014652014652014,
"grad_norm": 0.5330000519752502,
"learning_rate": 1.55e-05,
"loss": 1.1170203685760498,
"step": 984
},
{
"epoch": 1.2039072039072038,
"grad_norm": 1.7233712673187256,
"learning_rate": 1.5477832342074713e-05,
"loss": 0.7278258800506592,
"step": 986
},
{
"epoch": 1.2063492063492063,
"grad_norm": 4.363593101501465,
"learning_rate": 1.545562852237039e-05,
"loss": 0.7073162794113159,
"step": 988
},
{
"epoch": 1.2087912087912087,
"grad_norm": 1.1713706254959106,
"learning_rate": 1.5433388720199156e-05,
"loss": 0.891094982624054,
"step": 990
},
{
"epoch": 1.2112332112332111,
"grad_norm": 0.9442173838615417,
"learning_rate": 1.5411113115163722e-05,
"loss": 0.9304923415184021,
"step": 992
},
{
"epoch": 1.2136752136752136,
"grad_norm": 2.135201930999756,
"learning_rate": 1.538880188715593e-05,
"loss": 0.9996479749679565,
"step": 994
},
{
"epoch": 1.2161172161172162,
"grad_norm": 1.68083918094635,
"learning_rate": 1.5366455216355298e-05,
"loss": 0.8368605971336365,
"step": 996
},
{
"epoch": 1.2185592185592187,
"grad_norm": 0.7228335738182068,
"learning_rate": 1.534407328322758e-05,
"loss": 0.9793355464935303,
"step": 998
},
{
"epoch": 1.221001221001221,
"grad_norm": 3.5241169929504395,
"learning_rate": 1.5321656268523294e-05,
"loss": 0.6125832796096802,
"step": 1000
},
{
"epoch": 1.2234432234432235,
"grad_norm": 0.628485381603241,
"learning_rate": 1.5299204353276268e-05,
"loss": 0.7384300827980042,
"step": 1002
},
{
"epoch": 1.225885225885226,
"grad_norm": 0.8416216373443604,
"learning_rate": 1.5276717718802183e-05,
"loss": 0.9433239698410034,
"step": 1004
},
{
"epoch": 1.2283272283272284,
"grad_norm": 1.3178609609603882,
"learning_rate": 1.5254196546697088e-05,
"loss": 0.9707098603248596,
"step": 1006
},
{
"epoch": 1.2307692307692308,
"grad_norm": 1.0210210084915161,
"learning_rate": 1.523164101883597e-05,
"loss": 0.5824246406555176,
"step": 1008
},
{
"epoch": 1.2332112332112333,
"grad_norm": 0.7243679165840149,
"learning_rate": 1.5209051317371242e-05,
"loss": 1.0274351835250854,
"step": 1010
},
{
"epoch": 1.2356532356532357,
"grad_norm": 0.7745081782341003,
"learning_rate": 1.5186427624731313e-05,
"loss": 0.6757472157478333,
"step": 1012
},
{
"epoch": 1.2380952380952381,
"grad_norm": 0.5712753534317017,
"learning_rate": 1.5163770123619083e-05,
"loss": 1.041149616241455,
"step": 1014
},
{
"epoch": 1.2405372405372406,
"grad_norm": 1.5870078802108765,
"learning_rate": 1.5141078997010486e-05,
"loss": 0.886056125164032,
"step": 1016
},
{
"epoch": 1.242979242979243,
"grad_norm": 0.9383798837661743,
"learning_rate": 1.5118354428153008e-05,
"loss": 0.9722467660903931,
"step": 1018
},
{
"epoch": 1.2454212454212454,
"grad_norm": 3.283290147781372,
"learning_rate": 1.5095596600564197e-05,
"loss": 0.6366119980812073,
"step": 1020
},
{
"epoch": 1.2478632478632479,
"grad_norm": 3.411051034927368,
"learning_rate": 1.5072805698030197e-05,
"loss": 0.7901923656463623,
"step": 1022
},
{
"epoch": 1.2503052503052503,
"grad_norm": 1.0399166345596313,
"learning_rate": 1.504998190460426e-05,
"loss": 0.9346777200698853,
"step": 1024
},
{
"epoch": 1.2527472527472527,
"grad_norm": 0.6323780417442322,
"learning_rate": 1.5027125404605246e-05,
"loss": 0.8927645087242126,
"step": 1026
},
{
"epoch": 1.2551892551892552,
"grad_norm": 0.7854591608047485,
"learning_rate": 1.500423638261615e-05,
"loss": 0.8685034513473511,
"step": 1028
},
{
"epoch": 1.2576312576312576,
"grad_norm": 0.7747111320495605,
"learning_rate": 1.4981315023482605e-05,
"loss": 0.8063104152679443,
"step": 1030
},
{
"epoch": 1.26007326007326,
"grad_norm": 0.7940489649772644,
"learning_rate": 1.4958361512311394e-05,
"loss": 1.0881439447402954,
"step": 1032
},
{
"epoch": 1.2625152625152625,
"grad_norm": 3.6989586353302,
"learning_rate": 1.4935376034468944e-05,
"loss": 1.1380131244659424,
"step": 1034
},
{
"epoch": 1.264957264957265,
"grad_norm": 0.5151039361953735,
"learning_rate": 1.4912358775579841e-05,
"loss": 0.6871868968009949,
"step": 1036
},
{
"epoch": 1.2673992673992673,
"grad_norm": 1.3680596351623535,
"learning_rate": 1.4889309921525325e-05,
"loss": 0.6862649321556091,
"step": 1038
},
{
"epoch": 1.2698412698412698,
"grad_norm": 0.6552305221557617,
"learning_rate": 1.4866229658441793e-05,
"loss": 0.7429234385490417,
"step": 1040
},
{
"epoch": 1.2722832722832722,
"grad_norm": 0.5459038019180298,
"learning_rate": 1.4843118172719289e-05,
"loss": 0.9307520389556885,
"step": 1042
},
{
"epoch": 1.2747252747252746,
"grad_norm": 0.5527384281158447,
"learning_rate": 1.4819975650999998e-05,
"loss": 0.7104328274726868,
"step": 1044
},
{
"epoch": 1.277167277167277,
"grad_norm": 1.2261544466018677,
"learning_rate": 1.4796802280176762e-05,
"loss": 1.0070260763168335,
"step": 1046
},
{
"epoch": 1.2796092796092795,
"grad_norm": 1.9242292642593384,
"learning_rate": 1.4773598247391527e-05,
"loss": 0.690989077091217,
"step": 1048
},
{
"epoch": 1.282051282051282,
"grad_norm": 1.825949788093567,
"learning_rate": 1.4750363740033881e-05,
"loss": 0.42399048805236816,
"step": 1050
},
{
"epoch": 1.2844932844932844,
"grad_norm": 1.841841459274292,
"learning_rate": 1.4727098945739497e-05,
"loss": 1.0426183938980103,
"step": 1052
},
{
"epoch": 1.2869352869352868,
"grad_norm": 0.51153963804245,
"learning_rate": 1.470380405238865e-05,
"loss": 0.8385255336761475,
"step": 1054
},
{
"epoch": 1.2893772893772895,
"grad_norm": 2.656769275665283,
"learning_rate": 1.4680479248104678e-05,
"loss": 0.6596496105194092,
"step": 1056
},
{
"epoch": 1.291819291819292,
"grad_norm": 1.2762665748596191,
"learning_rate": 1.4657124721252476e-05,
"loss": 1.232382893562317,
"step": 1058
},
{
"epoch": 1.2942612942612943,
"grad_norm": 1.1065174341201782,
"learning_rate": 1.4633740660436974e-05,
"loss": 1.0262730121612549,
"step": 1060
},
{
"epoch": 1.2967032967032968,
"grad_norm": 3.235954999923706,
"learning_rate": 1.4610327254501607e-05,
"loss": 0.6136125326156616,
"step": 1062
},
{
"epoch": 1.2991452991452992,
"grad_norm": 0.5966620445251465,
"learning_rate": 1.4586884692526791e-05,
"loss": 0.8876266479492188,
"step": 1064
},
{
"epoch": 1.3015873015873016,
"grad_norm": 2.7788665294647217,
"learning_rate": 1.4563413163828397e-05,
"loss": 0.7026379108428955,
"step": 1066
},
{
"epoch": 1.304029304029304,
"grad_norm": 0.7998191714286804,
"learning_rate": 1.4539912857956234e-05,
"loss": 0.9727767705917358,
"step": 1068
},
{
"epoch": 1.3064713064713065,
"grad_norm": 1.385021686553955,
"learning_rate": 1.4516383964692495e-05,
"loss": 0.7625731825828552,
"step": 1070
},
{
"epoch": 1.308913308913309,
"grad_norm": 1.5408962965011597,
"learning_rate": 1.4492826674050248e-05,
"loss": 0.9061781167984009,
"step": 1072
},
{
"epoch": 1.3113553113553114,
"grad_norm": 6.768632888793945,
"learning_rate": 1.4469241176271884e-05,
"loss": 0.7514428496360779,
"step": 1074
},
{
"epoch": 1.3137973137973138,
"grad_norm": 0.7883042097091675,
"learning_rate": 1.4445627661827589e-05,
"loss": 0.6796785593032837,
"step": 1076
},
{
"epoch": 1.3162393162393162,
"grad_norm": 1.3659143447875977,
"learning_rate": 1.4421986321413801e-05,
"loss": 0.9605479836463928,
"step": 1078
},
{
"epoch": 1.3186813186813187,
"grad_norm": 1.356332778930664,
"learning_rate": 1.439831734595168e-05,
"loss": 0.8200567364692688,
"step": 1080
},
{
"epoch": 1.321123321123321,
"grad_norm": 1.2193089723587036,
"learning_rate": 1.4374620926585556e-05,
"loss": 0.881037175655365,
"step": 1082
},
{
"epoch": 1.3235653235653235,
"grad_norm": 0.5569941401481628,
"learning_rate": 1.4350897254681386e-05,
"loss": 0.8864683508872986,
"step": 1084
},
{
"epoch": 1.326007326007326,
"grad_norm": 1.2279424667358398,
"learning_rate": 1.4327146521825213e-05,
"loss": 1.0031923055648804,
"step": 1086
},
{
"epoch": 1.3284493284493284,
"grad_norm": 7.039901256561279,
"learning_rate": 1.4303368919821619e-05,
"loss": 1.0991631746292114,
"step": 1088
},
{
"epoch": 1.3308913308913308,
"grad_norm": 0.7994674444198608,
"learning_rate": 1.4279564640692172e-05,
"loss": 0.6553327441215515,
"step": 1090
},
{
"epoch": 1.3333333333333333,
"grad_norm": 5.8774871826171875,
"learning_rate": 1.4255733876673874e-05,
"loss": 0.7461038827896118,
"step": 1092
},
{
"epoch": 1.3357753357753357,
"grad_norm": 0.7029107213020325,
"learning_rate": 1.4231876820217623e-05,
"loss": 0.9785415530204773,
"step": 1094
},
{
"epoch": 1.3382173382173383,
"grad_norm": 3.4110426902770996,
"learning_rate": 1.4207993663986636e-05,
"loss": 0.47891128063201904,
"step": 1096
},
{
"epoch": 1.3406593406593408,
"grad_norm": 1.4747514724731445,
"learning_rate": 1.4184084600854906e-05,
"loss": 1.1681262254714966,
"step": 1098
},
{
"epoch": 1.3431013431013432,
"grad_norm": 1.336816668510437,
"learning_rate": 1.4160149823905654e-05,
"loss": 1.0751440525054932,
"step": 1100
},
{
"epoch": 1.3455433455433456,
"grad_norm": 0.80948805809021,
"learning_rate": 1.4136189526429749e-05,
"loss": 1.000352144241333,
"step": 1102
},
{
"epoch": 1.347985347985348,
"grad_norm": 2.687490701675415,
"learning_rate": 1.4112203901924153e-05,
"loss": 0.8417548537254333,
"step": 1104
},
{
"epoch": 1.3504273504273505,
"grad_norm": 0.8591554760932922,
"learning_rate": 1.4088193144090376e-05,
"loss": 0.9740299582481384,
"step": 1106
},
{
"epoch": 1.352869352869353,
"grad_norm": 3.9168152809143066,
"learning_rate": 1.406415744683289e-05,
"loss": 0.7925201058387756,
"step": 1108
},
{
"epoch": 1.3553113553113554,
"grad_norm": 0.8020510673522949,
"learning_rate": 1.4040097004257567e-05,
"loss": 1.042458415031433,
"step": 1110
},
{
"epoch": 1.3577533577533578,
"grad_norm": 1.342916488647461,
"learning_rate": 1.4016012010670125e-05,
"loss": 0.9074981808662415,
"step": 1112
},
{
"epoch": 1.3601953601953602,
"grad_norm": 1.7544145584106445,
"learning_rate": 1.3991902660574544e-05,
"loss": 0.8596875667572021,
"step": 1114
},
{
"epoch": 1.3626373626373627,
"grad_norm": 2.7417960166931152,
"learning_rate": 1.39677691486715e-05,
"loss": 0.5096735954284668,
"step": 1116
},
{
"epoch": 1.3650793650793651,
"grad_norm": 6.50905704498291,
"learning_rate": 1.3943611669856797e-05,
"loss": 0.8825461268424988,
"step": 1118
},
{
"epoch": 1.3675213675213675,
"grad_norm": 1.5938875675201416,
"learning_rate": 1.3919430419219787e-05,
"loss": 0.9512450695037842,
"step": 1120
},
{
"epoch": 1.36996336996337,
"grad_norm": 2.952125072479248,
"learning_rate": 1.389522559204179e-05,
"loss": 0.9308354258537292,
"step": 1122
},
{
"epoch": 1.3724053724053724,
"grad_norm": 0.7429002523422241,
"learning_rate": 1.387099738379454e-05,
"loss": 0.8262976408004761,
"step": 1124
},
{
"epoch": 1.3748473748473748,
"grad_norm": 2.061551809310913,
"learning_rate": 1.3846745990138581e-05,
"loss": 1.28501558303833,
"step": 1126
},
{
"epoch": 1.3772893772893773,
"grad_norm": 0.9269969463348389,
"learning_rate": 1.382247160692169e-05,
"loss": 0.9468799829483032,
"step": 1128
},
{
"epoch": 1.3797313797313797,
"grad_norm": 0.8824846744537354,
"learning_rate": 1.3798174430177314e-05,
"loss": 0.6640329360961914,
"step": 1130
},
{
"epoch": 1.3821733821733821,
"grad_norm": 0.633753776550293,
"learning_rate": 1.3773854656122962e-05,
"loss": 0.7266710996627808,
"step": 1132
},
{
"epoch": 1.3846153846153846,
"grad_norm": 5.053553581237793,
"learning_rate": 1.3749512481158649e-05,
"loss": 0.5124362707138062,
"step": 1134
},
{
"epoch": 1.387057387057387,
"grad_norm": 1.3869932889938354,
"learning_rate": 1.3725148101865275e-05,
"loss": 0.6932591199874878,
"step": 1136
},
{
"epoch": 1.3894993894993894,
"grad_norm": 0.8337790369987488,
"learning_rate": 1.3700761715003068e-05,
"loss": 1.0207314491271973,
"step": 1138
},
{
"epoch": 1.3919413919413919,
"grad_norm": 2.2834839820861816,
"learning_rate": 1.3676353517509981e-05,
"loss": 0.8703376650810242,
"step": 1140
},
{
"epoch": 1.3943833943833943,
"grad_norm": 1.934580683708191,
"learning_rate": 1.3651923706500105e-05,
"loss": 0.9365097284317017,
"step": 1142
},
{
"epoch": 1.3968253968253967,
"grad_norm": 2.526843786239624,
"learning_rate": 1.362747247926207e-05,
"loss": 0.7051898837089539,
"step": 1144
},
{
"epoch": 1.3992673992673992,
"grad_norm": 0.8698064684867859,
"learning_rate": 1.3603000033257465e-05,
"loss": 1.0435025691986084,
"step": 1146
},
{
"epoch": 1.4017094017094016,
"grad_norm": 2.076078176498413,
"learning_rate": 1.3578506566119236e-05,
"loss": 0.8728469610214233,
"step": 1148
},
{
"epoch": 1.404151404151404,
"grad_norm": 0.8785778880119324,
"learning_rate": 1.355399227565008e-05,
"loss": 0.7566535472869873,
"step": 1150
},
{
"epoch": 1.4065934065934065,
"grad_norm": 1.0821596384048462,
"learning_rate": 1.352945735982087e-05,
"loss": 0.7982299327850342,
"step": 1152
},
{
"epoch": 1.409035409035409,
"grad_norm": 1.226269006729126,
"learning_rate": 1.3504902016769039e-05,
"loss": 0.7825957536697388,
"step": 1154
},
{
"epoch": 1.4114774114774113,
"grad_norm": 1.9049503803253174,
"learning_rate": 1.348032644479698e-05,
"loss": 0.6891085505485535,
"step": 1156
},
{
"epoch": 1.4139194139194138,
"grad_norm": 1.1582715511322021,
"learning_rate": 1.3455730842370462e-05,
"loss": 0.8980281352996826,
"step": 1158
},
{
"epoch": 1.4163614163614164,
"grad_norm": 0.8849154114723206,
"learning_rate": 1.3431115408117002e-05,
"loss": 0.8913061618804932,
"step": 1160
},
{
"epoch": 1.4188034188034189,
"grad_norm": 1.0964971780776978,
"learning_rate": 1.3406480340824272e-05,
"loss": 0.7366968393325806,
"step": 1162
},
{
"epoch": 1.4212454212454213,
"grad_norm": 13.473047256469727,
"learning_rate": 1.3381825839438514e-05,
"loss": 0.6932869553565979,
"step": 1164
},
{
"epoch": 1.4236874236874237,
"grad_norm": 1.122653603553772,
"learning_rate": 1.3357152103062892e-05,
"loss": 1.1828283071517944,
"step": 1166
},
{
"epoch": 1.4261294261294262,
"grad_norm": 0.561507523059845,
"learning_rate": 1.3332459330955921e-05,
"loss": 0.966327428817749,
"step": 1168
},
{
"epoch": 1.4285714285714286,
"grad_norm": 2.9495770931243896,
"learning_rate": 1.3307747722529838e-05,
"loss": 0.8709004521369934,
"step": 1170
},
{
"epoch": 1.431013431013431,
"grad_norm": 0.6762902140617371,
"learning_rate": 1.3283017477348993e-05,
"loss": 0.9068043231964111,
"step": 1172
},
{
"epoch": 1.4334554334554335,
"grad_norm": 0.7292370796203613,
"learning_rate": 1.3258268795128258e-05,
"loss": 0.9378133416175842,
"step": 1174
},
{
"epoch": 1.435897435897436,
"grad_norm": 0.974267303943634,
"learning_rate": 1.3233501875731376e-05,
"loss": 1.0176819562911987,
"step": 1176
},
{
"epoch": 1.4383394383394383,
"grad_norm": 5.0265116691589355,
"learning_rate": 1.320871691916938e-05,
"loss": 0.7393254041671753,
"step": 1178
},
{
"epoch": 1.4407814407814408,
"grad_norm": 3.240424394607544,
"learning_rate": 1.3183914125598966e-05,
"loss": 0.8406731486320496,
"step": 1180
},
{
"epoch": 1.4432234432234432,
"grad_norm": 0.9493277668952942,
"learning_rate": 1.3159093695320881e-05,
"loss": 0.756401002407074,
"step": 1182
},
{
"epoch": 1.4456654456654456,
"grad_norm": 0.9762367010116577,
"learning_rate": 1.313425582877829e-05,
"loss": 1.055999755859375,
"step": 1184
},
{
"epoch": 1.448107448107448,
"grad_norm": 0.6565649509429932,
"learning_rate": 1.3109400726555179e-05,
"loss": 0.8509088754653931,
"step": 1186
},
{
"epoch": 1.4505494505494505,
"grad_norm": 2.6168346405029297,
"learning_rate": 1.3084528589374718e-05,
"loss": 0.7348777651786804,
"step": 1188
},
{
"epoch": 1.452991452991453,
"grad_norm": 1.5224627256393433,
"learning_rate": 1.305963961809765e-05,
"loss": 0.9267134666442871,
"step": 1190
},
{
"epoch": 1.4554334554334554,
"grad_norm": 0.7623134255409241,
"learning_rate": 1.3034734013720669e-05,
"loss": 0.8056920170783997,
"step": 1192
},
{
"epoch": 1.4578754578754578,
"grad_norm": 1.4244619607925415,
"learning_rate": 1.3009811977374784e-05,
"loss": 0.6724956631660461,
"step": 1194
},
{
"epoch": 1.4603174603174602,
"grad_norm": 0.7519621253013611,
"learning_rate": 1.2984873710323711e-05,
"loss": 0.6628673076629639,
"step": 1196
},
{
"epoch": 1.462759462759463,
"grad_norm": 0.7634888887405396,
"learning_rate": 1.2959919413962242e-05,
"loss": 0.8408687710762024,
"step": 1198
},
{
"epoch": 1.4652014652014653,
"grad_norm": 1.9624353647232056,
"learning_rate": 1.2934949289814611e-05,
"loss": 1.1985151767730713,
"step": 1200
},
{
"epoch": 1.4676434676434678,
"grad_norm": 1.5909016132354736,
"learning_rate": 1.290996353953288e-05,
"loss": 0.9667496681213379,
"step": 1202
},
{
"epoch": 1.4700854700854702,
"grad_norm": 0.8254397511482239,
"learning_rate": 1.2884962364895304e-05,
"loss": 0.9893684983253479,
"step": 1204
},
{
"epoch": 1.4725274725274726,
"grad_norm": 0.9778246879577637,
"learning_rate": 1.2859945967804687e-05,
"loss": 0.8230042457580566,
"step": 1206
},
{
"epoch": 1.474969474969475,
"grad_norm": 2.8977315425872803,
"learning_rate": 1.2834914550286789e-05,
"loss": 0.7464233040809631,
"step": 1208
},
{
"epoch": 1.4774114774114775,
"grad_norm": 16.703990936279297,
"learning_rate": 1.2809868314488647e-05,
"loss": 0.8318718671798706,
"step": 1210
},
{
"epoch": 1.47985347985348,
"grad_norm": 1.9694427251815796,
"learning_rate": 1.2784807462676983e-05,
"loss": 0.8906052708625793,
"step": 1212
},
{
"epoch": 1.4822954822954824,
"grad_norm": 0.8902061581611633,
"learning_rate": 1.2759732197236548e-05,
"loss": 0.9788769483566284,
"step": 1214
},
{
"epoch": 1.4847374847374848,
"grad_norm": 0.8015345335006714,
"learning_rate": 1.2734642720668494e-05,
"loss": 0.9402112364768982,
"step": 1216
},
{
"epoch": 1.4871794871794872,
"grad_norm": 2.7102816104888916,
"learning_rate": 1.2709539235588739e-05,
"loss": 0.27936387062072754,
"step": 1218
},
{
"epoch": 1.4896214896214897,
"grad_norm": 0.5606179237365723,
"learning_rate": 1.2684421944726323e-05,
"loss": 0.7066472768783569,
"step": 1220
},
{
"epoch": 1.492063492063492,
"grad_norm": 1.7472079992294312,
"learning_rate": 1.2659291050921798e-05,
"loss": 0.8000496029853821,
"step": 1222
},
{
"epoch": 1.4945054945054945,
"grad_norm": 3.1667306423187256,
"learning_rate": 1.263414675712554e-05,
"loss": 0.733214259147644,
"step": 1224
},
{
"epoch": 1.496947496947497,
"grad_norm": 1.6288788318634033,
"learning_rate": 1.2608989266396165e-05,
"loss": 0.8229939341545105,
"step": 1226
},
{
"epoch": 1.4993894993894994,
"grad_norm": 3.6219799518585205,
"learning_rate": 1.2583818781898855e-05,
"loss": 0.4456430971622467,
"step": 1228
},
{
"epoch": 1.5018315018315018,
"grad_norm": 1.921484351158142,
"learning_rate": 1.2558635506903717e-05,
"loss": 0.6831130981445312,
"step": 1230
},
{
"epoch": 1.5042735042735043,
"grad_norm": 0.4906938970088959,
"learning_rate": 1.253343964478417e-05,
"loss": 0.6764166951179504,
"step": 1232
},
{
"epoch": 1.5067155067155067,
"grad_norm": 1.23770272731781,
"learning_rate": 1.250823139901527e-05,
"loss": 0.9079239368438721,
"step": 1234
},
{
"epoch": 1.5091575091575091,
"grad_norm": 0.9974614977836609,
"learning_rate": 1.2483010973172077e-05,
"loss": 0.9452921748161316,
"step": 1236
},
{
"epoch": 1.5115995115995116,
"grad_norm": 0.9079129099845886,
"learning_rate": 1.2457778570928026e-05,
"loss": 0.8234338760375977,
"step": 1238
},
{
"epoch": 1.514041514041514,
"grad_norm": 0.9488117098808289,
"learning_rate": 1.2432534396053261e-05,
"loss": 0.8415461778640747,
"step": 1240
},
{
"epoch": 1.5164835164835164,
"grad_norm": 0.7722516059875488,
"learning_rate": 1.2407278652413001e-05,
"loss": 1.0288302898406982,
"step": 1242
},
{
"epoch": 1.5189255189255189,
"grad_norm": 3.5721123218536377,
"learning_rate": 1.2382011543965896e-05,
"loss": 0.7554802298545837,
"step": 1244
},
{
"epoch": 1.5213675213675213,
"grad_norm": 0.6691564917564392,
"learning_rate": 1.2356733274762367e-05,
"loss": 0.7608579397201538,
"step": 1246
},
{
"epoch": 1.5238095238095237,
"grad_norm": 0.9692053198814392,
"learning_rate": 1.2331444048942969e-05,
"loss": 0.8119852542877197,
"step": 1248
},
{
"epoch": 1.5262515262515262,
"grad_norm": 1.7576018571853638,
"learning_rate": 1.2306144070736747e-05,
"loss": 1.1432095766067505,
"step": 1250
},
{
"epoch": 1.5286935286935286,
"grad_norm": 2.8032066822052,
"learning_rate": 1.228083354445957e-05,
"loss": 0.7118352055549622,
"step": 1252
},
{
"epoch": 1.531135531135531,
"grad_norm": 1.4581559896469116,
"learning_rate": 1.2255512674512491e-05,
"loss": 0.9391320943832397,
"step": 1254
},
{
"epoch": 1.5335775335775335,
"grad_norm": 0.9859986305236816,
"learning_rate": 1.2230181665380101e-05,
"loss": 1.0426268577575684,
"step": 1256
},
{
"epoch": 1.536019536019536,
"grad_norm": 0.6827996373176575,
"learning_rate": 1.220484072162887e-05,
"loss": 0.35382741689682007,
"step": 1258
},
{
"epoch": 1.5384615384615383,
"grad_norm": 4.84762716293335,
"learning_rate": 1.2179490047905495e-05,
"loss": 0.6097034215927124,
"step": 1260
},
{
"epoch": 1.5409035409035408,
"grad_norm": 1.7744395732879639,
"learning_rate": 1.2154129848935258e-05,
"loss": 0.6083784103393555,
"step": 1262
},
{
"epoch": 1.5433455433455432,
"grad_norm": 2.7440474033355713,
"learning_rate": 1.2128760329520355e-05,
"loss": 0.7916078567504883,
"step": 1264
},
{
"epoch": 1.5457875457875456,
"grad_norm": 1.4891324043273926,
"learning_rate": 1.210338169453825e-05,
"loss": 0.8106079697608948,
"step": 1266
},
{
"epoch": 1.5482295482295483,
"grad_norm": 0.9212846159934998,
"learning_rate": 1.2077994148940033e-05,
"loss": 0.8362663984298706,
"step": 1268
},
{
"epoch": 1.5506715506715507,
"grad_norm": 1.9237959384918213,
"learning_rate": 1.2052597897748746e-05,
"loss": 0.4818616807460785,
"step": 1270
},
{
"epoch": 1.5531135531135531,
"grad_norm": 3.0629465579986572,
"learning_rate": 1.202719314605773e-05,
"loss": 1.0731854438781738,
"step": 1272
},
{
"epoch": 1.5555555555555556,
"grad_norm": 1.05351984500885,
"learning_rate": 1.2001780099028988e-05,
"loss": 0.943490207195282,
"step": 1274
},
{
"epoch": 1.557997557997558,
"grad_norm": 4.432197570800781,
"learning_rate": 1.1976358961891504e-05,
"loss": 1.3021904230117798,
"step": 1276
},
{
"epoch": 1.5604395604395604,
"grad_norm": 1.0480554103851318,
"learning_rate": 1.1950929939939596e-05,
"loss": 0.7510530948638916,
"step": 1278
},
{
"epoch": 1.5628815628815629,
"grad_norm": 1.6610548496246338,
"learning_rate": 1.192549323853126e-05,
"loss": 0.9113296270370483,
"step": 1280
},
{
"epoch": 1.5653235653235653,
"grad_norm": 4.827365875244141,
"learning_rate": 1.1900049063086508e-05,
"loss": 0.6182503700256348,
"step": 1282
},
{
"epoch": 1.5677655677655677,
"grad_norm": 0.8534301519393921,
"learning_rate": 1.1874597619085712e-05,
"loss": 0.9308310151100159,
"step": 1284
},
{
"epoch": 1.5702075702075702,
"grad_norm": 0.9158720970153809,
"learning_rate": 1.1849139112067937e-05,
"loss": 0.9331011772155762,
"step": 1286
},
{
"epoch": 1.5726495726495726,
"grad_norm": 12.01048755645752,
"learning_rate": 1.18236737476293e-05,
"loss": 0.490848183631897,
"step": 1288
},
{
"epoch": 1.575091575091575,
"grad_norm": 0.9270315170288086,
"learning_rate": 1.1798201731421286e-05,
"loss": 0.7262513637542725,
"step": 1290
},
{
"epoch": 1.5775335775335775,
"grad_norm": 1.8197249174118042,
"learning_rate": 1.1772723269149096e-05,
"loss": 0.43270692229270935,
"step": 1292
},
{
"epoch": 1.5799755799755801,
"grad_norm": 1.0564115047454834,
"learning_rate": 1.1747238566569993e-05,
"loss": 0.6380181908607483,
"step": 1294
},
{
"epoch": 1.5824175824175826,
"grad_norm": 0.937374472618103,
"learning_rate": 1.1721747829491639e-05,
"loss": 0.9579664468765259,
"step": 1296
},
{
"epoch": 1.584859584859585,
"grad_norm": 0.9189720749855042,
"learning_rate": 1.169625126377042e-05,
"loss": 1.1132162809371948,
"step": 1298
},
{
"epoch": 1.5873015873015874,
"grad_norm": 1.5094869136810303,
"learning_rate": 1.1670749075309798e-05,
"loss": 0.9595221877098083,
"step": 1300
},
{
"epoch": 1.5897435897435899,
"grad_norm": 3.5550084114074707,
"learning_rate": 1.164524147005864e-05,
"loss": 1.0293970108032227,
"step": 1302
},
{
"epoch": 1.5921855921855923,
"grad_norm": 8.070341110229492,
"learning_rate": 1.1619728654009561e-05,
"loss": 0.9469819664955139,
"step": 1304
},
{
"epoch": 1.5946275946275947,
"grad_norm": 2.206435203552246,
"learning_rate": 1.1594210833197252e-05,
"loss": 0.6112901568412781,
"step": 1306
},
{
"epoch": 1.5970695970695972,
"grad_norm": 0.7995406985282898,
"learning_rate": 1.156868821369683e-05,
"loss": 0.9325740337371826,
"step": 1308
},
{
"epoch": 1.5995115995115996,
"grad_norm": 1.177374243736267,
"learning_rate": 1.1543161001622154e-05,
"loss": 0.821311891078949,
"step": 1310
},
{
"epoch": 1.601953601953602,
"grad_norm": 1.0490672588348389,
"learning_rate": 1.1517629403124175e-05,
"loss": 0.8008186221122742,
"step": 1312
},
{
"epoch": 1.6043956043956045,
"grad_norm": 1.078908085823059,
"learning_rate": 1.1492093624389274e-05,
"loss": 0.9607588648796082,
"step": 1314
},
{
"epoch": 1.606837606837607,
"grad_norm": 0.9914792776107788,
"learning_rate": 1.1466553871637585e-05,
"loss": 1.0678871870040894,
"step": 1316
},
{
"epoch": 1.6092796092796093,
"grad_norm": 0.9516023993492126,
"learning_rate": 1.1441010351121332e-05,
"loss": 0.927726686000824,
"step": 1318
},
{
"epoch": 1.6117216117216118,
"grad_norm": 1.6526710987091064,
"learning_rate": 1.1415463269123172e-05,
"loss": 1.1496163606643677,
"step": 1320
},
{
"epoch": 1.6141636141636142,
"grad_norm": 0.8162203431129456,
"learning_rate": 1.1389912831954524e-05,
"loss": 0.849646270275116,
"step": 1322
},
{
"epoch": 1.6166056166056166,
"grad_norm": 0.7434989809989929,
"learning_rate": 1.1364359245953897e-05,
"loss": 1.0158569812774658,
"step": 1324
},
{
"epoch": 1.619047619047619,
"grad_norm": 2.0639302730560303,
"learning_rate": 1.1338802717485234e-05,
"loss": 0.6589023470878601,
"step": 1326
},
{
"epoch": 1.6214896214896215,
"grad_norm": 1.0379024744033813,
"learning_rate": 1.1313243452936235e-05,
"loss": 0.9295322895050049,
"step": 1328
},
{
"epoch": 1.623931623931624,
"grad_norm": 1.181497573852539,
"learning_rate": 1.1287681658716706e-05,
"loss": 1.0116742849349976,
"step": 1330
},
{
"epoch": 1.6263736263736264,
"grad_norm": 4.863892078399658,
"learning_rate": 1.1262117541256872e-05,
"loss": 0.8862733244895935,
"step": 1332
},
{
"epoch": 1.6288156288156288,
"grad_norm": 0.7002055644989014,
"learning_rate": 1.1236551307005722e-05,
"loss": 0.9096848368644714,
"step": 1334
},
{
"epoch": 1.6312576312576312,
"grad_norm": 0.6345333456993103,
"learning_rate": 1.1210983162429347e-05,
"loss": 0.5657076835632324,
"step": 1336
},
{
"epoch": 1.6336996336996337,
"grad_norm": 2.7891440391540527,
"learning_rate": 1.1185413314009254e-05,
"loss": 0.9815369248390198,
"step": 1338
},
{
"epoch": 1.636141636141636,
"grad_norm": 6.338902473449707,
"learning_rate": 1.1159841968240714e-05,
"loss": 0.5724242925643921,
"step": 1340
},
{
"epoch": 1.6385836385836385,
"grad_norm": 2.7349283695220947,
"learning_rate": 1.1134269331631096e-05,
"loss": 0.4281773269176483,
"step": 1342
},
{
"epoch": 1.641025641025641,
"grad_norm": 1.9846585988998413,
"learning_rate": 1.1108695610698187e-05,
"loss": 1.0027917623519897,
"step": 1344
},
{
"epoch": 1.6434676434676434,
"grad_norm": 0.6990553140640259,
"learning_rate": 1.1083121011968531e-05,
"loss": 0.9550279378890991,
"step": 1346
},
{
"epoch": 1.6459096459096458,
"grad_norm": 2.958153486251831,
"learning_rate": 1.1057545741975768e-05,
"loss": 0.6426241993904114,
"step": 1348
},
{
"epoch": 1.6483516483516483,
"grad_norm": 0.8284672498703003,
"learning_rate": 1.1031970007258947e-05,
"loss": 0.8278497457504272,
"step": 1350
},
{
"epoch": 1.6507936507936507,
"grad_norm": 0.7631545066833496,
"learning_rate": 1.1006394014360882e-05,
"loss": 0.9407053589820862,
"step": 1352
},
{
"epoch": 1.6532356532356531,
"grad_norm": 4.05110502243042,
"learning_rate": 1.0980817969826458e-05,
"loss": 0.9099552035331726,
"step": 1354
},
{
"epoch": 1.6556776556776556,
"grad_norm": 1.0000635385513306,
"learning_rate": 1.0955242080200994e-05,
"loss": 0.9383828639984131,
"step": 1356
},
{
"epoch": 1.658119658119658,
"grad_norm": 1.1321988105773926,
"learning_rate": 1.0929666552028545e-05,
"loss": 0.52699214220047,
"step": 1358
},
{
"epoch": 1.6605616605616604,
"grad_norm": 1.246857762336731,
"learning_rate": 1.0904091591850255e-05,
"loss": 0.6198506355285645,
"step": 1360
},
{
"epoch": 1.6630036630036629,
"grad_norm": 1.0080903768539429,
"learning_rate": 1.0878517406202674e-05,
"loss": 0.9911934733390808,
"step": 1362
},
{
"epoch": 1.6654456654456653,
"grad_norm": 0.8918383121490479,
"learning_rate": 1.0852944201616097e-05,
"loss": 1.0504215955734253,
"step": 1364
},
{
"epoch": 1.6678876678876677,
"grad_norm": 1.0392669439315796,
"learning_rate": 1.082737218461291e-05,
"loss": 1.0229471921920776,
"step": 1366
},
{
"epoch": 1.6703296703296702,
"grad_norm": 0.8570772409439087,
"learning_rate": 1.080180156170589e-05,
"loss": 1.049717903137207,
"step": 1368
},
{
"epoch": 1.6727716727716728,
"grad_norm": 0.9958022236824036,
"learning_rate": 1.0776232539396567e-05,
"loss": 1.006693720817566,
"step": 1370
},
{
"epoch": 1.6752136752136753,
"grad_norm": 0.882525622844696,
"learning_rate": 1.0750665324173542e-05,
"loss": 0.615381121635437,
"step": 1372
},
{
"epoch": 1.6776556776556777,
"grad_norm": 0.9473522305488586,
"learning_rate": 1.0725100122510819e-05,
"loss": 0.36105355620384216,
"step": 1374
},
{
"epoch": 1.6800976800976801,
"grad_norm": 3.743011236190796,
"learning_rate": 1.0699537140866146e-05,
"loss": 1.1695616245269775,
"step": 1376
},
{
"epoch": 1.6825396825396826,
"grad_norm": 0.823453962802887,
"learning_rate": 1.0673976585679341e-05,
"loss": 0.9196591377258301,
"step": 1378
},
{
"epoch": 1.684981684981685,
"grad_norm": 0.5954387187957764,
"learning_rate": 1.0648418663370628e-05,
"loss": 0.7695765495300293,
"step": 1380
},
{
"epoch": 1.6874236874236874,
"grad_norm": 2.546109437942505,
"learning_rate": 1.0622863580338967e-05,
"loss": 1.0195831060409546,
"step": 1382
},
{
"epoch": 1.6898656898656899,
"grad_norm": 0.7414639592170715,
"learning_rate": 1.0597311542960385e-05,
"loss": 0.8976457715034485,
"step": 1384
},
{
"epoch": 1.6923076923076923,
"grad_norm": 0.6246572732925415,
"learning_rate": 1.0571762757586321e-05,
"loss": 0.9752371907234192,
"step": 1386
},
{
"epoch": 1.6947496947496947,
"grad_norm": 0.8245002627372742,
"learning_rate": 1.0546217430541947e-05,
"loss": 0.9225857257843018,
"step": 1388
},
{
"epoch": 1.6971916971916972,
"grad_norm": 0.7589647769927979,
"learning_rate": 1.0520675768124507e-05,
"loss": 0.47266364097595215,
"step": 1390
},
{
"epoch": 1.6996336996336996,
"grad_norm": 0.8037369847297668,
"learning_rate": 1.0495137976601648e-05,
"loss": 0.8273367881774902,
"step": 1392
},
{
"epoch": 1.702075702075702,
"grad_norm": 0.9903712868690491,
"learning_rate": 1.0469604262209765e-05,
"loss": 0.7290286421775818,
"step": 1394
},
{
"epoch": 1.7045177045177047,
"grad_norm": 2.0067808628082275,
"learning_rate": 1.0444074831152317e-05,
"loss": 0.9373266100883484,
"step": 1396
},
{
"epoch": 1.7069597069597071,
"grad_norm": 20.187288284301758,
"learning_rate": 1.0418549889598175e-05,
"loss": 0.8240612149238586,
"step": 1398
},
{
"epoch": 1.7094017094017095,
"grad_norm": 4.022505283355713,
"learning_rate": 1.0393029643679962e-05,
"loss": 0.44202497601509094,
"step": 1400
},
{
"epoch": 1.711843711843712,
"grad_norm": 5.573869705200195,
"learning_rate": 1.0367514299492366e-05,
"loss": 0.9583691954612732,
"step": 1402
},
{
"epoch": 1.7142857142857144,
"grad_norm": 1.5996133089065552,
"learning_rate": 1.0342004063090503e-05,
"loss": 1.0398838520050049,
"step": 1404
},
{
"epoch": 1.7167277167277168,
"grad_norm": 2.385746717453003,
"learning_rate": 1.0316499140488232e-05,
"loss": 0.4760570824146271,
"step": 1406
},
{
"epoch": 1.7191697191697193,
"grad_norm": 0.8254954218864441,
"learning_rate": 1.0290999737656497e-05,
"loss": 0.907942533493042,
"step": 1408
},
{
"epoch": 1.7216117216117217,
"grad_norm": 8.329554557800293,
"learning_rate": 1.026550606052168e-05,
"loss": 0.6862547397613525,
"step": 1410
},
{
"epoch": 1.7240537240537241,
"grad_norm": 2.332361936569214,
"learning_rate": 1.0240018314963909e-05,
"loss": 0.8768781423568726,
"step": 1412
},
{
"epoch": 1.7264957264957266,
"grad_norm": 2.285680055618286,
"learning_rate": 1.0214536706815418e-05,
"loss": 0.986327588558197,
"step": 1414
},
{
"epoch": 1.728937728937729,
"grad_norm": 3.5364201068878174,
"learning_rate": 1.0189061441858873e-05,
"loss": 0.8355549573898315,
"step": 1416
},
{
"epoch": 1.7313797313797314,
"grad_norm": 0.8595628142356873,
"learning_rate": 1.0163592725825712e-05,
"loss": 0.8929445743560791,
"step": 1418
},
{
"epoch": 1.7338217338217339,
"grad_norm": 15.206433296203613,
"learning_rate": 1.0138130764394496e-05,
"loss": 0.7870601415634155,
"step": 1420
},
{
"epoch": 1.7362637362637363,
"grad_norm": 2.8101370334625244,
"learning_rate": 1.0112675763189224e-05,
"loss": 0.7534129023551941,
"step": 1422
},
{
"epoch": 1.7387057387057387,
"grad_norm": 1.858702540397644,
"learning_rate": 1.0087227927777696e-05,
"loss": 0.8370426893234253,
"step": 1424
},
{
"epoch": 1.7411477411477412,
"grad_norm": 2.0665295124053955,
"learning_rate": 1.006178746366984e-05,
"loss": 0.6909109354019165,
"step": 1426
},
{
"epoch": 1.7435897435897436,
"grad_norm": 0.9323246479034424,
"learning_rate": 1.0036354576316052e-05,
"loss": 1.014011263847351,
"step": 1428
},
{
"epoch": 1.746031746031746,
"grad_norm": 1.75360107421875,
"learning_rate": 1.0010929471105548e-05,
"loss": 1.2392351627349854,
"step": 1430
},
{
"epoch": 1.7484737484737485,
"grad_norm": 1.979491949081421,
"learning_rate": 9.98551235336469e-06,
"loss": 0.6340602040290833,
"step": 1432
},
{
"epoch": 1.750915750915751,
"grad_norm": 2.876166343688965,
"learning_rate": 9.960103428355337e-06,
"loss": 0.7525686621665955,
"step": 1434
},
{
"epoch": 1.7533577533577533,
"grad_norm": 1.366552710533142,
"learning_rate": 9.934702901273187e-06,
"loss": 0.6044411063194275,
"step": 1436
},
{
"epoch": 1.7557997557997558,
"grad_norm": 0.689400315284729,
"learning_rate": 9.90931097724612e-06,
"loss": 0.4377739727497101,
"step": 1438
},
{
"epoch": 1.7582417582417582,
"grad_norm": 0.8386373519897461,
"learning_rate": 9.883927861332538e-06,
"loss": 0.909875214099884,
"step": 1440
},
{
"epoch": 1.7606837606837606,
"grad_norm": 7.745026111602783,
"learning_rate": 9.85855375851971e-06,
"loss": 0.7949923872947693,
"step": 1442
},
{
"epoch": 1.763125763125763,
"grad_norm": 2.948460340499878,
"learning_rate": 9.833188873722122e-06,
"loss": 0.6595785021781921,
"step": 1444
},
{
"epoch": 1.7655677655677655,
"grad_norm": 0.7448163032531738,
"learning_rate": 9.80783341177981e-06,
"loss": 1.0280483961105347,
"step": 1446
},
{
"epoch": 1.768009768009768,
"grad_norm": 0.7969598770141602,
"learning_rate": 9.782487577456724e-06,
"loss": 1.0123943090438843,
"step": 1448
},
{
"epoch": 1.7704517704517704,
"grad_norm": 0.9583572149276733,
"learning_rate": 9.75715157543905e-06,
"loss": 0.8486643433570862,
"step": 1450
},
{
"epoch": 1.7728937728937728,
"grad_norm": 2.09142804145813,
"learning_rate": 9.731825610333587e-06,
"loss": 0.3455406129360199,
"step": 1452
},
{
"epoch": 1.7753357753357752,
"grad_norm": 0.9442964196205139,
"learning_rate": 9.706509886666067e-06,
"loss": 0.8303570747375488,
"step": 1454
},
{
"epoch": 1.7777777777777777,
"grad_norm": 1.240134358406067,
"learning_rate": 9.681204608879518e-06,
"loss": 0.5113586187362671,
"step": 1456
},
{
"epoch": 1.7802197802197801,
"grad_norm": 1.1532829999923706,
"learning_rate": 9.655909981332614e-06,
"loss": 0.8892757892608643,
"step": 1458
},
{
"epoch": 1.7826617826617825,
"grad_norm": 1.5256012678146362,
"learning_rate": 9.63062620829801e-06,
"loss": 0.8083629608154297,
"step": 1460
},
{
"epoch": 1.785103785103785,
"grad_norm": 1.8043534755706787,
"learning_rate": 9.605353493960717e-06,
"loss": 0.9189132452011108,
"step": 1462
},
{
"epoch": 1.7875457875457874,
"grad_norm": 0.841884434223175,
"learning_rate": 9.580092042416427e-06,
"loss": 0.6249831318855286,
"step": 1464
},
{
"epoch": 1.7899877899877898,
"grad_norm": 2.1716599464416504,
"learning_rate": 9.554842057669886e-06,
"loss": 0.6827890872955322,
"step": 1466
},
{
"epoch": 1.7924297924297923,
"grad_norm": 3.5236616134643555,
"learning_rate": 9.529603743633229e-06,
"loss": 0.7608170509338379,
"step": 1468
},
{
"epoch": 1.7948717948717947,
"grad_norm": 1.99154531955719,
"learning_rate": 9.504377304124346e-06,
"loss": 0.9152241945266724,
"step": 1470
},
{
"epoch": 1.7973137973137974,
"grad_norm": 0.8060831427574158,
"learning_rate": 9.47916294286523e-06,
"loss": 0.8515353202819824,
"step": 1472
},
{
"epoch": 1.7997557997557998,
"grad_norm": 5.8603363037109375,
"learning_rate": 9.453960863480333e-06,
"loss": 0.5703706741333008,
"step": 1474
},
{
"epoch": 1.8021978021978022,
"grad_norm": 7.417604446411133,
"learning_rate": 9.428771269494926e-06,
"loss": 0.7551999092102051,
"step": 1476
},
{
"epoch": 1.8046398046398047,
"grad_norm": 1.034999966621399,
"learning_rate": 9.403594364333444e-06,
"loss": 0.6955189108848572,
"step": 1478
},
{
"epoch": 1.807081807081807,
"grad_norm": 0.9549148678779602,
"learning_rate": 9.378430351317854e-06,
"loss": 0.42793938517570496,
"step": 1480
},
{
"epoch": 1.8095238095238095,
"grad_norm": 1.3916822671890259,
"learning_rate": 9.353279433666014e-06,
"loss": 0.6840672492980957,
"step": 1482
},
{
"epoch": 1.811965811965812,
"grad_norm": 0.854276716709137,
"learning_rate": 9.328141814490021e-06,
"loss": 0.893316924571991,
"step": 1484
},
{
"epoch": 1.8144078144078144,
"grad_norm": 1.491588830947876,
"learning_rate": 9.303017696794578e-06,
"loss": 0.872158944606781,
"step": 1486
},
{
"epoch": 1.8168498168498168,
"grad_norm": 1.8033097982406616,
"learning_rate": 9.277907283475358e-06,
"loss": 0.6238676905632019,
"step": 1488
},
{
"epoch": 1.8192918192918193,
"grad_norm": 0.8885567784309387,
"learning_rate": 9.252810777317351e-06,
"loss": 0.6716984510421753,
"step": 1490
},
{
"epoch": 1.8217338217338217,
"grad_norm": 1.0771310329437256,
"learning_rate": 9.227728380993253e-06,
"loss": 0.8512567281723022,
"step": 1492
},
{
"epoch": 1.8241758241758241,
"grad_norm": 1.4891635179519653,
"learning_rate": 9.202660297061798e-06,
"loss": 0.5891348123550415,
"step": 1494
},
{
"epoch": 1.8266178266178266,
"grad_norm": 1.5767910480499268,
"learning_rate": 9.177606727966142e-06,
"loss": 0.8717406392097473,
"step": 1496
},
{
"epoch": 1.8290598290598292,
"grad_norm": 0.8637403845787048,
"learning_rate": 9.15256787603222e-06,
"loss": 1.3341138362884521,
"step": 1498
},
{
"epoch": 1.8315018315018317,
"grad_norm": 1.3066986799240112,
"learning_rate": 9.127543943467128e-06,
"loss": 1.2278974056243896,
"step": 1500
},
{
"epoch": 1.833943833943834,
"grad_norm": 1.3648895025253296,
"learning_rate": 9.102535132357457e-06,
"loss": 0.6873140335083008,
"step": 1502
},
{
"epoch": 1.8363858363858365,
"grad_norm": 0.45770537853240967,
"learning_rate": 9.077541644667697e-06,
"loss": 0.7067763209342957,
"step": 1504
},
{
"epoch": 1.838827838827839,
"grad_norm": 2.4009127616882324,
"learning_rate": 9.052563682238587e-06,
"loss": 0.6803405284881592,
"step": 1506
},
{
"epoch": 1.8412698412698414,
"grad_norm": 1.205779790878296,
"learning_rate": 9.02760144678548e-06,
"loss": 0.6593731641769409,
"step": 1508
},
{
"epoch": 1.8437118437118438,
"grad_norm": 0.640776515007019,
"learning_rate": 9.00265513989673e-06,
"loss": 0.8603323101997375,
"step": 1510
},
{
"epoch": 1.8461538461538463,
"grad_norm": 1.0433986186981201,
"learning_rate": 8.977724963032056e-06,
"loss": 0.8412877917289734,
"step": 1512
},
{
"epoch": 1.8485958485958487,
"grad_norm": 1.245303750038147,
"learning_rate": 8.952811117520914e-06,
"loss": 1.0396430492401123,
"step": 1514
},
{
"epoch": 1.8510378510378511,
"grad_norm": 1.5737297534942627,
"learning_rate": 8.927913804560864e-06,
"loss": 0.6088389754295349,
"step": 1516
},
{
"epoch": 1.8534798534798536,
"grad_norm": 0.9162042140960693,
"learning_rate": 8.903033225215975e-06,
"loss": 1.1635559797286987,
"step": 1518
},
{
"epoch": 1.855921855921856,
"grad_norm": 1.7877050638198853,
"learning_rate": 8.878169580415154e-06,
"loss": 0.631327748298645,
"step": 1520
},
{
"epoch": 1.8583638583638584,
"grad_norm": 3.03653883934021,
"learning_rate": 8.85332307095057e-06,
"loss": 0.902554452419281,
"step": 1522
},
{
"epoch": 1.8608058608058609,
"grad_norm": 1.9247746467590332,
"learning_rate": 8.828493897475998e-06,
"loss": 0.8101663589477539,
"step": 1524
},
{
"epoch": 1.8632478632478633,
"grad_norm": 1.386506199836731,
"learning_rate": 8.803682260505216e-06,
"loss": 0.7383776903152466,
"step": 1526
},
{
"epoch": 1.8656898656898657,
"grad_norm": 1.1092829704284668,
"learning_rate": 8.778888360410385e-06,
"loss": 0.7297862768173218,
"step": 1528
},
{
"epoch": 1.8681318681318682,
"grad_norm": 0.7110038995742798,
"learning_rate": 8.754112397420426e-06,
"loss": 0.8971010446548462,
"step": 1530
},
{
"epoch": 1.8705738705738706,
"grad_norm": 1.9106638431549072,
"learning_rate": 8.729354571619404e-06,
"loss": 0.7592481374740601,
"step": 1532
},
{
"epoch": 1.873015873015873,
"grad_norm": 0.805887758731842,
"learning_rate": 8.704615082944914e-06,
"loss": 0.8079948425292969,
"step": 1534
},
{
"epoch": 1.8754578754578755,
"grad_norm": 0.6133478283882141,
"learning_rate": 8.679894131186462e-06,
"loss": 1.000016450881958,
"step": 1536
},
{
"epoch": 1.877899877899878,
"grad_norm": 0.6692440509796143,
"learning_rate": 8.655191915983859e-06,
"loss": 0.8313310742378235,
"step": 1538
},
{
"epoch": 1.8803418803418803,
"grad_norm": 0.9560274481773376,
"learning_rate": 8.630508636825602e-06,
"loss": 0.9431169033050537,
"step": 1540
},
{
"epoch": 1.8827838827838828,
"grad_norm": 1.700568675994873,
"learning_rate": 8.605844493047269e-06,
"loss": 0.9815627336502075,
"step": 1542
},
{
"epoch": 1.8852258852258852,
"grad_norm": 1.308621883392334,
"learning_rate": 8.581199683829899e-06,
"loss": 0.7461444735527039,
"step": 1544
},
{
"epoch": 1.8876678876678876,
"grad_norm": 1.2452470064163208,
"learning_rate": 8.556574408198399e-06,
"loss": 0.9441168904304504,
"step": 1546
},
{
"epoch": 1.89010989010989,
"grad_norm": 3.298710823059082,
"learning_rate": 8.531968865019919e-06,
"loss": 0.8527262210845947,
"step": 1548
},
{
"epoch": 1.8925518925518925,
"grad_norm": 0.8520393967628479,
"learning_rate": 8.507383253002264e-06,
"loss": 0.47991418838500977,
"step": 1550
},
{
"epoch": 1.894993894993895,
"grad_norm": 1.5283163785934448,
"learning_rate": 8.482817770692276e-06,
"loss": 0.8953297138214111,
"step": 1552
},
{
"epoch": 1.8974358974358974,
"grad_norm": 2.6013505458831787,
"learning_rate": 8.458272616474226e-06,
"loss": 0.598823070526123,
"step": 1554
},
{
"epoch": 1.8998778998778998,
"grad_norm": 6.25869083404541,
"learning_rate": 8.43374798856824e-06,
"loss": 1.0903539657592773,
"step": 1556
},
{
"epoch": 1.9023199023199022,
"grad_norm": 0.7708169221878052,
"learning_rate": 8.40924408502866e-06,
"loss": 0.6560428738594055,
"step": 1558
},
{
"epoch": 1.9047619047619047,
"grad_norm": 1.3442054986953735,
"learning_rate": 8.384761103742476e-06,
"loss": 0.553628146648407,
"step": 1560
},
{
"epoch": 1.907203907203907,
"grad_norm": 0.8295760750770569,
"learning_rate": 8.360299242427713e-06,
"loss": 0.8809893727302551,
"step": 1562
},
{
"epoch": 1.9096459096459095,
"grad_norm": 1.2123860120773315,
"learning_rate": 8.335858698631829e-06,
"loss": 0.7752953171730042,
"step": 1564
},
{
"epoch": 1.912087912087912,
"grad_norm": 1.137731909751892,
"learning_rate": 8.311439669730139e-06,
"loss": 0.937446653842926,
"step": 1566
},
{
"epoch": 1.9145299145299144,
"grad_norm": 1.4613070487976074,
"learning_rate": 8.287042352924206e-06,
"loss": 0.9597198963165283,
"step": 1568
},
{
"epoch": 1.9169719169719168,
"grad_norm": 7.560548305511475,
"learning_rate": 8.26266694524024e-06,
"loss": 0.6756553053855896,
"step": 1570
},
{
"epoch": 1.9194139194139193,
"grad_norm": 0.7736316919326782,
"learning_rate": 8.238313643527533e-06,
"loss": 0.8379277586936951,
"step": 1572
},
{
"epoch": 1.9218559218559217,
"grad_norm": 2.3948774337768555,
"learning_rate": 8.213982644456856e-06,
"loss": 0.7130874991416931,
"step": 1574
},
{
"epoch": 1.9242979242979243,
"grad_norm": 2.804558753967285,
"learning_rate": 8.189674144518864e-06,
"loss": 0.7871428728103638,
"step": 1576
},
{
"epoch": 1.9267399267399268,
"grad_norm": 3.343308925628662,
"learning_rate": 8.165388340022507e-06,
"loss": 0.7644234895706177,
"step": 1578
},
{
"epoch": 1.9291819291819292,
"grad_norm": 0.9689104557037354,
"learning_rate": 8.14112542709347e-06,
"loss": 0.9481227397918701,
"step": 1580
},
{
"epoch": 1.9316239316239316,
"grad_norm": 0.9340876936912537,
"learning_rate": 8.116885601672557e-06,
"loss": 0.2258923351764679,
"step": 1582
},
{
"epoch": 1.934065934065934,
"grad_norm": 1.9040846824645996,
"learning_rate": 8.09266905951413e-06,
"loss": 0.5065496563911438,
"step": 1584
},
{
"epoch": 1.9365079365079365,
"grad_norm": 2.174138069152832,
"learning_rate": 8.068475996184527e-06,
"loss": 0.5920478701591492,
"step": 1586
},
{
"epoch": 1.938949938949939,
"grad_norm": 0.8130704760551453,
"learning_rate": 8.044306607060466e-06,
"loss": 0.9720399379730225,
"step": 1588
},
{
"epoch": 1.9413919413919414,
"grad_norm": 0.833109974861145,
"learning_rate": 8.02016108732748e-06,
"loss": 1.0517313480377197,
"step": 1590
},
{
"epoch": 1.9438339438339438,
"grad_norm": 2.0496108531951904,
"learning_rate": 7.996039631978352e-06,
"loss": 1.0347234010696411,
"step": 1592
},
{
"epoch": 1.9462759462759462,
"grad_norm": 1.0047261714935303,
"learning_rate": 7.97194243581151e-06,
"loss": 0.6489905118942261,
"step": 1594
},
{
"epoch": 1.9487179487179487,
"grad_norm": 1.0025273561477661,
"learning_rate": 7.947869693429486e-06,
"loss": 0.568684458732605,
"step": 1596
},
{
"epoch": 1.9511599511599511,
"grad_norm": 1.1909536123275757,
"learning_rate": 7.923821599237322e-06,
"loss": 0.6664155125617981,
"step": 1598
},
{
"epoch": 1.9536019536019538,
"grad_norm": 1.6859694719314575,
"learning_rate": 7.899798347441005e-06,
"loss": 0.7015742063522339,
"step": 1600
},
{
"epoch": 1.9560439560439562,
"grad_norm": 0.6844836473464966,
"learning_rate": 7.87580013204591e-06,
"loss": 0.9169449210166931,
"step": 1602
},
{
"epoch": 1.9584859584859586,
"grad_norm": 2.2930445671081543,
"learning_rate": 7.85182714685522e-06,
"loss": 0.8345751762390137,
"step": 1604
},
{
"epoch": 1.960927960927961,
"grad_norm": 2.5689308643341064,
"learning_rate": 7.827879585468363e-06,
"loss": 1.1974244117736816,
"step": 1606
},
{
"epoch": 1.9633699633699635,
"grad_norm": 1.2992660999298096,
"learning_rate": 7.803957641279457e-06,
"loss": 1.1730899810791016,
"step": 1608
},
{
"epoch": 1.965811965811966,
"grad_norm": 1.0391148328781128,
"learning_rate": 7.780061507475738e-06,
"loss": 0.9335651397705078,
"step": 1610
},
{
"epoch": 1.9682539682539684,
"grad_norm": 3.6143672466278076,
"learning_rate": 7.756191377036004e-06,
"loss": 0.8546837568283081,
"step": 1612
},
{
"epoch": 1.9706959706959708,
"grad_norm": 0.9346309304237366,
"learning_rate": 7.732347442729062e-06,
"loss": 1.0305918455123901,
"step": 1614
},
{
"epoch": 1.9731379731379732,
"grad_norm": 0.9905077815055847,
"learning_rate": 7.708529897112158e-06,
"loss": 0.8775286674499512,
"step": 1616
},
{
"epoch": 1.9755799755799757,
"grad_norm": 0.6666707396507263,
"learning_rate": 7.684738932529441e-06,
"loss": 0.8464508056640625,
"step": 1618
},
{
"epoch": 1.978021978021978,
"grad_norm": 1.0916727781295776,
"learning_rate": 7.660974741110387e-06,
"loss": 1.035678505897522,
"step": 1620
},
{
"epoch": 1.9804639804639805,
"grad_norm": 0.7847446203231812,
"learning_rate": 7.637237514768265e-06,
"loss": 0.6054593324661255,
"step": 1622
},
{
"epoch": 1.982905982905983,
"grad_norm": 2.2946202754974365,
"learning_rate": 7.613527445198576e-06,
"loss": 0.45836907625198364,
"step": 1624
},
{
"epoch": 1.9853479853479854,
"grad_norm": 9.175978660583496,
"learning_rate": 7.5898447238775264e-06,
"loss": 0.7117047905921936,
"step": 1626
},
{
"epoch": 1.9877899877899878,
"grad_norm": 3.764439105987549,
"learning_rate": 7.566189542060445e-06,
"loss": 1.0821315050125122,
"step": 1628
},
{
"epoch": 1.9902319902319903,
"grad_norm": 0.9272487163543701,
"learning_rate": 7.5425620907802655e-06,
"loss": 1.1502904891967773,
"step": 1630
},
{
"epoch": 1.9926739926739927,
"grad_norm": 1.1519207954406738,
"learning_rate": 7.518962560845986e-06,
"loss": 0.8673257231712341,
"step": 1632
},
{
"epoch": 1.9951159951159951,
"grad_norm": 0.6419383883476257,
"learning_rate": 7.4953911428411085e-06,
"loss": 0.75059574842453,
"step": 1634
},
{
"epoch": 1.9975579975579976,
"grad_norm": 1.7326091527938843,
"learning_rate": 7.4718480271221125e-06,
"loss": 1.0258231163024902,
"step": 1636
},
{
"epoch": 2.0,
"grad_norm": 0.8297693133354187,
"learning_rate": 7.448333403816926e-06,
"loss": 0.9197133779525757,
"step": 1638
},
{
"epoch": 2.0024420024420024,
"grad_norm": 0.842572808265686,
"learning_rate": 7.424847462823361e-06,
"loss": 0.6060487627983093,
"step": 1640
},
{
"epoch": 2.004884004884005,
"grad_norm": 1.4340323209762573,
"learning_rate": 7.401390393807615e-06,
"loss": 0.47724178433418274,
"step": 1642
},
{
"epoch": 2.0073260073260073,
"grad_norm": 0.6351611018180847,
"learning_rate": 7.37796238620272e-06,
"loss": 0.5051848292350769,
"step": 1644
},
{
"epoch": 2.0097680097680097,
"grad_norm": 3.20005202293396,
"learning_rate": 7.3545636292070055e-06,
"loss": 0.438951700925827,
"step": 1646
},
{
"epoch": 2.012210012210012,
"grad_norm": 1.5867102146148682,
"learning_rate": 7.331194311782597e-06,
"loss": 0.528706431388855,
"step": 1648
},
{
"epoch": 2.0146520146520146,
"grad_norm": 2.449397325515747,
"learning_rate": 7.307854622653863e-06,
"loss": 0.3387841284275055,
"step": 1650
},
{
"epoch": 2.017094017094017,
"grad_norm": 5.5735626220703125,
"learning_rate": 7.284544750305902e-06,
"loss": 0.6135000586509705,
"step": 1652
},
{
"epoch": 2.0195360195360195,
"grad_norm": 2.001272439956665,
"learning_rate": 7.261264882983024e-06,
"loss": 0.4525635838508606,
"step": 1654
},
{
"epoch": 2.021978021978022,
"grad_norm": 1.0277931690216064,
"learning_rate": 7.238015208687226e-06,
"loss": 0.4565449655056,
"step": 1656
},
{
"epoch": 2.0244200244200243,
"grad_norm": 1.670928716659546,
"learning_rate": 7.214795915176671e-06,
"loss": 0.4369199872016907,
"step": 1658
},
{
"epoch": 2.0268620268620268,
"grad_norm": 1.4175351858139038,
"learning_rate": 7.191607189964181e-06,
"loss": 0.6220426559448242,
"step": 1660
},
{
"epoch": 2.029304029304029,
"grad_norm": 1.3668700456619263,
"learning_rate": 7.16844922031571e-06,
"loss": 0.557952880859375,
"step": 1662
},
{
"epoch": 2.0317460317460316,
"grad_norm": 0.9909934401512146,
"learning_rate": 7.145322193248838e-06,
"loss": 0.2245861142873764,
"step": 1664
},
{
"epoch": 2.034188034188034,
"grad_norm": 6.492028713226318,
"learning_rate": 7.122226295531267e-06,
"loss": 0.40176424384117126,
"step": 1666
},
{
"epoch": 2.0366300366300365,
"grad_norm": 0.9408150911331177,
"learning_rate": 7.099161713679308e-06,
"loss": 0.4665899872779846,
"step": 1668
},
{
"epoch": 2.039072039072039,
"grad_norm": 1.566773533821106,
"learning_rate": 7.07612863395636e-06,
"loss": 0.6036043763160706,
"step": 1670
},
{
"epoch": 2.0415140415140414,
"grad_norm": 1.2262314558029175,
"learning_rate": 7.053127242371434e-06,
"loss": 0.5682324171066284,
"step": 1672
},
{
"epoch": 2.043956043956044,
"grad_norm": 0.9549220204353333,
"learning_rate": 7.030157724677631e-06,
"loss": 0.5213257074356079,
"step": 1674
},
{
"epoch": 2.0463980463980462,
"grad_norm": 1.66300368309021,
"learning_rate": 7.0072202663706405e-06,
"loss": 0.3227638006210327,
"step": 1676
},
{
"epoch": 2.0488400488400487,
"grad_norm": 1.2017823457717896,
"learning_rate": 6.984315052687258e-06,
"loss": 0.5378082990646362,
"step": 1678
},
{
"epoch": 2.051282051282051,
"grad_norm": 0.8874703645706177,
"learning_rate": 6.96144226860388e-06,
"loss": 0.49545711278915405,
"step": 1680
},
{
"epoch": 2.0537240537240535,
"grad_norm": 1.3648614883422852,
"learning_rate": 6.938602098835e-06,
"loss": 0.3199822008609772,
"step": 1682
},
{
"epoch": 2.056166056166056,
"grad_norm": 2.5054514408111572,
"learning_rate": 6.915794727831743e-06,
"loss": 0.3839988112449646,
"step": 1684
},
{
"epoch": 2.0586080586080584,
"grad_norm": 2.381861925125122,
"learning_rate": 6.893020339780341e-06,
"loss": 0.3781861662864685,
"step": 1686
},
{
"epoch": 2.061050061050061,
"grad_norm": 2.2430403232574463,
"learning_rate": 6.870279118600679e-06,
"loss": 0.6202837824821472,
"step": 1688
},
{
"epoch": 2.0634920634920633,
"grad_norm": 2.3006107807159424,
"learning_rate": 6.847571247944791e-06,
"loss": 0.46027785539627075,
"step": 1690
},
{
"epoch": 2.065934065934066,
"grad_norm": 1.330511450767517,
"learning_rate": 6.8248969111953825e-06,
"loss": 0.31774628162384033,
"step": 1692
},
{
"epoch": 2.0683760683760686,
"grad_norm": 1.060591459274292,
"learning_rate": 6.80225629146434e-06,
"loss": 0.47486642003059387,
"step": 1694
},
{
"epoch": 2.070818070818071,
"grad_norm": 1.2816616296768188,
"learning_rate": 6.7796495715912694e-06,
"loss": 0.4364372789859772,
"step": 1696
},
{
"epoch": 2.0732600732600734,
"grad_norm": 1.004572868347168,
"learning_rate": 6.757076934142013e-06,
"loss": 0.4288478493690491,
"step": 1698
},
{
"epoch": 2.075702075702076,
"grad_norm": 1.2579833269119263,
"learning_rate": 6.734538561407158e-06,
"loss": 0.4020456075668335,
"step": 1700
},
{
"epoch": 2.0781440781440783,
"grad_norm": 1.9755547046661377,
"learning_rate": 6.712034635400593e-06,
"loss": 0.26895561814308167,
"step": 1702
},
{
"epoch": 2.0805860805860807,
"grad_norm": 2.1291699409484863,
"learning_rate": 6.689565337858019e-06,
"loss": 0.2938929796218872,
"step": 1704
},
{
"epoch": 2.083028083028083,
"grad_norm": 1.6085429191589355,
"learning_rate": 6.6671308502354844e-06,
"loss": 0.19200079143047333,
"step": 1706
},
{
"epoch": 2.0854700854700856,
"grad_norm": 3.190870761871338,
"learning_rate": 6.644731353707927e-06,
"loss": 0.5591083765029907,
"step": 1708
},
{
"epoch": 2.087912087912088,
"grad_norm": 1.8141244649887085,
"learning_rate": 6.622367029167702e-06,
"loss": 0.2770901918411255,
"step": 1710
},
{
"epoch": 2.0903540903540905,
"grad_norm": 4.159117221832275,
"learning_rate": 6.600038057223126e-06,
"loss": 0.394546240568161,
"step": 1712
},
{
"epoch": 2.092796092796093,
"grad_norm": 1.3365147113800049,
"learning_rate": 6.577744618197017e-06,
"loss": 0.4641517996788025,
"step": 1714
},
{
"epoch": 2.0952380952380953,
"grad_norm": 0.9762091636657715,
"learning_rate": 6.555486892125243e-06,
"loss": 0.32657861709594727,
"step": 1716
},
{
"epoch": 2.0976800976800978,
"grad_norm": 1.1228184700012207,
"learning_rate": 6.533265058755256e-06,
"loss": 0.6660332083702087,
"step": 1718
},
{
"epoch": 2.1001221001221,
"grad_norm": 0.9115656614303589,
"learning_rate": 6.5110792975446515e-06,
"loss": 0.48777180910110474,
"step": 1720
},
{
"epoch": 2.1025641025641026,
"grad_norm": 1.8341835737228394,
"learning_rate": 6.488929787659721e-06,
"loss": 0.6992468237876892,
"step": 1722
},
{
"epoch": 2.105006105006105,
"grad_norm": 1.1542752981185913,
"learning_rate": 6.466816707973991e-06,
"loss": 0.3529256284236908,
"step": 1724
},
{
"epoch": 2.1074481074481075,
"grad_norm": 19.553573608398438,
"learning_rate": 6.444740237066791e-06,
"loss": 0.45478177070617676,
"step": 1726
},
{
"epoch": 2.10989010989011,
"grad_norm": 0.6075100898742676,
"learning_rate": 6.422700553221817e-06,
"loss": 0.3780288100242615,
"step": 1728
},
{
"epoch": 2.1123321123321124,
"grad_norm": 0.8796222805976868,
"learning_rate": 6.400697834425662e-06,
"loss": 0.42669016122817993,
"step": 1730
},
{
"epoch": 2.114774114774115,
"grad_norm": 0.9508007764816284,
"learning_rate": 6.378732258366421e-06,
"loss": 0.34392303228378296,
"step": 1732
},
{
"epoch": 2.1172161172161172,
"grad_norm": 0.28383857011795044,
"learning_rate": 6.356804002432225e-06,
"loss": 0.1719311773777008,
"step": 1734
},
{
"epoch": 2.1196581196581197,
"grad_norm": 1.0620123147964478,
"learning_rate": 6.334913243709809e-06,
"loss": 0.5892414450645447,
"step": 1736
},
{
"epoch": 2.122100122100122,
"grad_norm": 1.1223015785217285,
"learning_rate": 6.313060158983104e-06,
"loss": 0.3725854456424713,
"step": 1738
},
{
"epoch": 2.1245421245421245,
"grad_norm": 0.83611661195755,
"learning_rate": 6.291244924731794e-06,
"loss": 0.4878256618976593,
"step": 1740
},
{
"epoch": 2.126984126984127,
"grad_norm": 1.6328321695327759,
"learning_rate": 6.26946771712988e-06,
"loss": 0.43116888403892517,
"step": 1742
},
{
"epoch": 2.1294261294261294,
"grad_norm": 1.3364393711090088,
"learning_rate": 6.247728712044283e-06,
"loss": 0.37520939111709595,
"step": 1744
},
{
"epoch": 2.131868131868132,
"grad_norm": 1.3389878273010254,
"learning_rate": 6.226028085033413e-06,
"loss": 0.5751076936721802,
"step": 1746
},
{
"epoch": 2.1343101343101343,
"grad_norm": 1.8287776708602905,
"learning_rate": 6.2043660113457325e-06,
"loss": 0.20154741406440735,
"step": 1748
},
{
"epoch": 2.1367521367521367,
"grad_norm": 1.4840490818023682,
"learning_rate": 6.182742665918373e-06,
"loss": 0.6898431777954102,
"step": 1750
},
{
"epoch": 2.139194139194139,
"grad_norm": 0.9770026803016663,
"learning_rate": 6.161158223375705e-06,
"loss": 0.3924607038497925,
"step": 1752
},
{
"epoch": 2.1416361416361416,
"grad_norm": 0.7722997069358826,
"learning_rate": 6.13961285802792e-06,
"loss": 0.43264567852020264,
"step": 1754
},
{
"epoch": 2.144078144078144,
"grad_norm": 0.9995938539505005,
"learning_rate": 6.118106743869641e-06,
"loss": 0.5022901296615601,
"step": 1756
},
{
"epoch": 2.1465201465201464,
"grad_norm": 0.5033841133117676,
"learning_rate": 6.096640054578511e-06,
"loss": 0.21431341767311096,
"step": 1758
},
{
"epoch": 2.148962148962149,
"grad_norm": 1.137976050376892,
"learning_rate": 6.075212963513776e-06,
"loss": 0.4715498685836792,
"step": 1760
},
{
"epoch": 2.1514041514041513,
"grad_norm": 0.9455146193504333,
"learning_rate": 6.053825643714912e-06,
"loss": 0.4320064187049866,
"step": 1762
},
{
"epoch": 2.1538461538461537,
"grad_norm": 2.8845789432525635,
"learning_rate": 6.032478267900206e-06,
"loss": 0.3226162791252136,
"step": 1764
},
{
"epoch": 2.156288156288156,
"grad_norm": 0.9458103179931641,
"learning_rate": 6.011171008465363e-06,
"loss": 0.2729605436325073,
"step": 1766
},
{
"epoch": 2.1587301587301586,
"grad_norm": 1.9725005626678467,
"learning_rate": 5.989904037482128e-06,
"loss": 0.3462582230567932,
"step": 1768
},
{
"epoch": 2.161172161172161,
"grad_norm": 2.0717337131500244,
"learning_rate": 5.968677526696882e-06,
"loss": 0.38312727212905884,
"step": 1770
},
{
"epoch": 2.1636141636141635,
"grad_norm": 0.8864312767982483,
"learning_rate": 5.947491647529267e-06,
"loss": 0.353424072265625,
"step": 1772
},
{
"epoch": 2.166056166056166,
"grad_norm": 1.0762509107589722,
"learning_rate": 5.9263465710707814e-06,
"loss": 0.5065031051635742,
"step": 1774
},
{
"epoch": 2.1684981684981683,
"grad_norm": 0.7869840264320374,
"learning_rate": 5.905242468083423e-06,
"loss": 0.5348921418190002,
"step": 1776
},
{
"epoch": 2.1709401709401708,
"grad_norm": 2.1878821849823,
"learning_rate": 5.884179508998299e-06,
"loss": 0.27236610651016235,
"step": 1778
},
{
"epoch": 2.173382173382173,
"grad_norm": 0.9579680562019348,
"learning_rate": 5.863157863914239e-06,
"loss": 0.43548962473869324,
"step": 1780
},
{
"epoch": 2.1758241758241756,
"grad_norm": 1.8547625541687012,
"learning_rate": 5.8421777025964446e-06,
"loss": 0.5892971754074097,
"step": 1782
},
{
"epoch": 2.178266178266178,
"grad_norm": 0.9620394706726074,
"learning_rate": 5.8212391944750965e-06,
"loss": 0.4943884313106537,
"step": 1784
},
{
"epoch": 2.1807081807081805,
"grad_norm": 2.7082159519195557,
"learning_rate": 5.8003425086440015e-06,
"loss": 0.5425156354904175,
"step": 1786
},
{
"epoch": 2.183150183150183,
"grad_norm": 4.512080669403076,
"learning_rate": 5.779487813859218e-06,
"loss": 0.3213900625705719,
"step": 1788
},
{
"epoch": 2.185592185592186,
"grad_norm": 0.9232001900672913,
"learning_rate": 5.758675278537692e-06,
"loss": 0.46233004331588745,
"step": 1790
},
{
"epoch": 2.1880341880341883,
"grad_norm": 3.6497743129730225,
"learning_rate": 5.737905070755907e-06,
"loss": 0.480983167886734,
"step": 1792
},
{
"epoch": 2.1904761904761907,
"grad_norm": 1.0851823091506958,
"learning_rate": 5.717177358248522e-06,
"loss": 0.2742152810096741,
"step": 1794
},
{
"epoch": 2.192918192918193,
"grad_norm": 2.418455123901367,
"learning_rate": 5.696492308407002e-06,
"loss": 0.3769078254699707,
"step": 1796
},
{
"epoch": 2.1953601953601956,
"grad_norm": 0.7429922223091125,
"learning_rate": 5.675850088278298e-06,
"loss": 0.40196555852890015,
"step": 1798
},
{
"epoch": 2.197802197802198,
"grad_norm": 1.3570210933685303,
"learning_rate": 5.655250864563469e-06,
"loss": 0.3571450412273407,
"step": 1800
},
{
"epoch": 2.2002442002442004,
"grad_norm": 1.8261560201644897,
"learning_rate": 5.63469480361635e-06,
"loss": 0.4585352838039398,
"step": 1802
},
{
"epoch": 2.202686202686203,
"grad_norm": 2.33353328704834,
"learning_rate": 5.614182071442201e-06,
"loss": 0.4414786100387573,
"step": 1804
},
{
"epoch": 2.2051282051282053,
"grad_norm": 1.7394614219665527,
"learning_rate": 5.59371283369637e-06,
"loss": 0.5657206177711487,
"step": 1806
},
{
"epoch": 2.2075702075702077,
"grad_norm": 1.2605091333389282,
"learning_rate": 5.573287255682967e-06,
"loss": 0.5330032706260681,
"step": 1808
},
{
"epoch": 2.21001221001221,
"grad_norm": 0.2691946029663086,
"learning_rate": 5.552905502353502e-06,
"loss": 0.2634370028972626,
"step": 1810
},
{
"epoch": 2.2124542124542126,
"grad_norm": 0.983033299446106,
"learning_rate": 5.532567738305576e-06,
"loss": 0.4326469302177429,
"step": 1812
},
{
"epoch": 2.214896214896215,
"grad_norm": 0.23342449963092804,
"learning_rate": 5.512274127781552e-06,
"loss": 0.1571735441684723,
"step": 1814
},
{
"epoch": 2.2173382173382175,
"grad_norm": 1.2843339443206787,
"learning_rate": 5.492024834667205e-06,
"loss": 0.5355442762374878,
"step": 1816
},
{
"epoch": 2.21978021978022,
"grad_norm": 0.949738621711731,
"learning_rate": 5.471820022490422e-06,
"loss": 0.38218754529953003,
"step": 1818
},
{
"epoch": 2.2222222222222223,
"grad_norm": 0.8940930962562561,
"learning_rate": 5.451659854419882e-06,
"loss": 0.49747079610824585,
"step": 1820
},
{
"epoch": 2.2246642246642248,
"grad_norm": 0.6108909249305725,
"learning_rate": 5.431544493263714e-06,
"loss": 0.2641042172908783,
"step": 1822
},
{
"epoch": 2.227106227106227,
"grad_norm": 0.776020884513855,
"learning_rate": 5.411474101468208e-06,
"loss": 0.39929312467575073,
"step": 1824
},
{
"epoch": 2.2295482295482296,
"grad_norm": 1.3689377307891846,
"learning_rate": 5.3914488411165e-06,
"loss": 0.2978437840938568,
"step": 1826
},
{
"epoch": 2.231990231990232,
"grad_norm": 2.88201904296875,
"learning_rate": 5.3714688739272396e-06,
"loss": 0.3673563599586487,
"step": 1828
},
{
"epoch": 2.2344322344322345,
"grad_norm": 2.748995065689087,
"learning_rate": 5.351534361253312e-06,
"loss": 0.29434409737586975,
"step": 1830
},
{
"epoch": 2.236874236874237,
"grad_norm": 1.0924896001815796,
"learning_rate": 5.331645464080526e-06,
"loss": 0.46827900409698486,
"step": 1832
},
{
"epoch": 2.2393162393162394,
"grad_norm": 0.8829333186149597,
"learning_rate": 5.311802343026302e-06,
"loss": 0.5047073364257812,
"step": 1834
},
{
"epoch": 2.241758241758242,
"grad_norm": 1.315529227256775,
"learning_rate": 5.292005158338394e-06,
"loss": 0.40334218740463257,
"step": 1836
},
{
"epoch": 2.244200244200244,
"grad_norm": 2.0851378440856934,
"learning_rate": 5.272254069893579e-06,
"loss": 0.5924956798553467,
"step": 1838
},
{
"epoch": 2.2466422466422467,
"grad_norm": 3.3136000633239746,
"learning_rate": 5.2525492371963785e-06,
"loss": 0.31219542026519775,
"step": 1840
},
{
"epoch": 2.249084249084249,
"grad_norm": 0.728590190410614,
"learning_rate": 5.232890819377765e-06,
"loss": 0.46928393840789795,
"step": 1842
},
{
"epoch": 2.2515262515262515,
"grad_norm": 0.7545236945152283,
"learning_rate": 5.213278975193874e-06,
"loss": 0.4485982060432434,
"step": 1844
},
{
"epoch": 2.253968253968254,
"grad_norm": 2.7309439182281494,
"learning_rate": 5.193713863024722e-06,
"loss": 0.3948480784893036,
"step": 1846
},
{
"epoch": 2.2564102564102564,
"grad_norm": 0.3682626187801361,
"learning_rate": 5.174195640872937e-06,
"loss": 0.3254821300506592,
"step": 1848
},
{
"epoch": 2.258852258852259,
"grad_norm": 1.932949423789978,
"learning_rate": 5.154724466362473e-06,
"loss": 0.43265148997306824,
"step": 1850
},
{
"epoch": 2.2612942612942613,
"grad_norm": 1.3246240615844727,
"learning_rate": 5.135300496737335e-06,
"loss": 0.5352158546447754,
"step": 1852
},
{
"epoch": 2.2637362637362637,
"grad_norm": 0.8921855688095093,
"learning_rate": 5.115923888860321e-06,
"loss": 0.6833795309066772,
"step": 1854
},
{
"epoch": 2.266178266178266,
"grad_norm": 1.2048108577728271,
"learning_rate": 5.096594799211748e-06,
"loss": 0.6043341755867004,
"step": 1856
},
{
"epoch": 2.2686202686202686,
"grad_norm": 1.5590717792510986,
"learning_rate": 5.0773133838881806e-06,
"loss": 0.6158211827278137,
"step": 1858
},
{
"epoch": 2.271062271062271,
"grad_norm": 0.9362733364105225,
"learning_rate": 5.058079798601184e-06,
"loss": 0.7204128503799438,
"step": 1860
},
{
"epoch": 2.2735042735042734,
"grad_norm": 1.0600636005401611,
"learning_rate": 5.0388941986760675e-06,
"loss": 0.32139068841934204,
"step": 1862
},
{
"epoch": 2.275946275946276,
"grad_norm": 0.8406434059143066,
"learning_rate": 5.019756739050606e-06,
"loss": 0.29253455996513367,
"step": 1864
},
{
"epoch": 2.2783882783882783,
"grad_norm": 2.3749077320098877,
"learning_rate": 5.000667574273821e-06,
"loss": 0.39995700120925903,
"step": 1866
},
{
"epoch": 2.2808302808302807,
"grad_norm": 0.8223360180854797,
"learning_rate": 4.981626858504718e-06,
"loss": 0.45448631048202515,
"step": 1868
},
{
"epoch": 2.283272283272283,
"grad_norm": 0.7664647698402405,
"learning_rate": 4.962634745511027e-06,
"loss": 0.42726626992225647,
"step": 1870
},
{
"epoch": 2.2857142857142856,
"grad_norm": 1.1275815963745117,
"learning_rate": 4.943691388667989e-06,
"loss": 0.4752141237258911,
"step": 1872
},
{
"epoch": 2.288156288156288,
"grad_norm": 2.4123940467834473,
"learning_rate": 4.924796940957099e-06,
"loss": 0.13898348808288574,
"step": 1874
},
{
"epoch": 2.2905982905982905,
"grad_norm": 1.461748480796814,
"learning_rate": 4.905951554964876e-06,
"loss": 0.6339101791381836,
"step": 1876
},
{
"epoch": 2.293040293040293,
"grad_norm": 2.0306098461151123,
"learning_rate": 4.887155382881625e-06,
"loss": 0.347889244556427,
"step": 1878
},
{
"epoch": 2.2954822954822953,
"grad_norm": 1.3482933044433594,
"learning_rate": 4.868408576500216e-06,
"loss": 0.340035080909729,
"step": 1880
},
{
"epoch": 2.2979242979242978,
"grad_norm": 4.910120010375977,
"learning_rate": 4.849711287214856e-06,
"loss": 0.5293861031532288,
"step": 1882
},
{
"epoch": 2.3003663003663,
"grad_norm": 1.0976754426956177,
"learning_rate": 4.8310636660198616e-06,
"loss": 0.31249868869781494,
"step": 1884
},
{
"epoch": 2.3028083028083026,
"grad_norm": 1.3118927478790283,
"learning_rate": 4.812465863508448e-06,
"loss": 0.5040943026542664,
"step": 1886
},
{
"epoch": 2.305250305250305,
"grad_norm": 0.9740425944328308,
"learning_rate": 4.7939180298715055e-06,
"loss": 0.42627787590026855,
"step": 1888
},
{
"epoch": 2.3076923076923075,
"grad_norm": 1.1387205123901367,
"learning_rate": 4.775420314896384e-06,
"loss": 0.44656771421432495,
"step": 1890
},
{
"epoch": 2.31013431013431,
"grad_norm": 2.269031047821045,
"learning_rate": 4.756972867965698e-06,
"loss": 0.5736830830574036,
"step": 1892
},
{
"epoch": 2.3125763125763124,
"grad_norm": 0.9688907265663147,
"learning_rate": 4.738575838056104e-06,
"loss": 0.4964962601661682,
"step": 1894
},
{
"epoch": 2.315018315018315,
"grad_norm": 1.7838249206542969,
"learning_rate": 4.7202293737371066e-06,
"loss": 0.4222361445426941,
"step": 1896
},
{
"epoch": 2.317460317460317,
"grad_norm": 1.0578351020812988,
"learning_rate": 4.7019336231698576e-06,
"loss": 0.5211227536201477,
"step": 1898
},
{
"epoch": 2.3199023199023197,
"grad_norm": 1.8706358671188354,
"learning_rate": 4.6836887341059525e-06,
"loss": 0.8980540633201599,
"step": 1900
},
{
"epoch": 2.3223443223443225,
"grad_norm": 1.151202917098999,
"learning_rate": 4.6654948538862475e-06,
"loss": 0.4475945234298706,
"step": 1902
},
{
"epoch": 2.324786324786325,
"grad_norm": 4.294190406799316,
"learning_rate": 4.647352129439665e-06,
"loss": 0.251365065574646,
"step": 1904
},
{
"epoch": 2.3272283272283274,
"grad_norm": 1.604580044746399,
"learning_rate": 4.629260707282009e-06,
"loss": 0.190834641456604,
"step": 1906
},
{
"epoch": 2.32967032967033,
"grad_norm": 1.1880110502243042,
"learning_rate": 4.6112207335147704e-06,
"loss": 0.2842097878456116,
"step": 1908
},
{
"epoch": 2.3321123321123323,
"grad_norm": 2.0477302074432373,
"learning_rate": 4.593232353823968e-06,
"loss": 0.23184801638126373,
"step": 1910
},
{
"epoch": 2.3345543345543347,
"grad_norm": 1.7173128128051758,
"learning_rate": 4.575295713478956e-06,
"loss": 0.40144017338752747,
"step": 1912
},
{
"epoch": 2.336996336996337,
"grad_norm": 0.9430311322212219,
"learning_rate": 4.557410957331249e-06,
"loss": 0.5639522075653076,
"step": 1914
},
{
"epoch": 2.3394383394383396,
"grad_norm": 3.2917191982269287,
"learning_rate": 4.539578229813372e-06,
"loss": 0.636457622051239,
"step": 1916
},
{
"epoch": 2.341880341880342,
"grad_norm": 1.405510663986206,
"learning_rate": 4.521797674937672e-06,
"loss": 0.26978304982185364,
"step": 1918
},
{
"epoch": 2.3443223443223444,
"grad_norm": 2.574928045272827,
"learning_rate": 4.5040694362951625e-06,
"loss": 0.3309711515903473,
"step": 1920
},
{
"epoch": 2.346764346764347,
"grad_norm": 1.7721152305603027,
"learning_rate": 4.486393657054369e-06,
"loss": 0.3379634618759155,
"step": 1922
},
{
"epoch": 2.3492063492063493,
"grad_norm": 0.34488657116889954,
"learning_rate": 4.468770479960171e-06,
"loss": 0.2894682288169861,
"step": 1924
},
{
"epoch": 2.3516483516483517,
"grad_norm": 1.060381531715393,
"learning_rate": 4.451200047332638e-06,
"loss": 0.44025763869285583,
"step": 1926
},
{
"epoch": 2.354090354090354,
"grad_norm": 1.5222772359848022,
"learning_rate": 4.433682501065897e-06,
"loss": 0.3474840223789215,
"step": 1928
},
{
"epoch": 2.3565323565323566,
"grad_norm": 2.951404094696045,
"learning_rate": 4.416217982626981e-06,
"loss": 0.3358984589576721,
"step": 1930
},
{
"epoch": 2.358974358974359,
"grad_norm": 1.0801118612289429,
"learning_rate": 4.398806633054675e-06,
"loss": 0.3395053446292877,
"step": 1932
},
{
"epoch": 2.3614163614163615,
"grad_norm": 2.127126693725586,
"learning_rate": 4.381448592958394e-06,
"loss": 0.5439938902854919,
"step": 1934
},
{
"epoch": 2.363858363858364,
"grad_norm": 1.0053937435150146,
"learning_rate": 4.36414400251704e-06,
"loss": 0.2674437463283539,
"step": 1936
},
{
"epoch": 2.3663003663003663,
"grad_norm": 0.9853598475456238,
"learning_rate": 4.346893001477861e-06,
"loss": 0.4141199290752411,
"step": 1938
},
{
"epoch": 2.3687423687423688,
"grad_norm": 8.180671691894531,
"learning_rate": 4.329695729155342e-06,
"loss": 0.5360310673713684,
"step": 1940
},
{
"epoch": 2.371184371184371,
"grad_norm": 0.22848689556121826,
"learning_rate": 4.3125523244300686e-06,
"loss": 0.25111788511276245,
"step": 1942
},
{
"epoch": 2.3736263736263736,
"grad_norm": 1.5355631113052368,
"learning_rate": 4.295462925747594e-06,
"loss": 0.3430798351764679,
"step": 1944
},
{
"epoch": 2.376068376068376,
"grad_norm": 1.6975699663162231,
"learning_rate": 4.278427671117344e-06,
"loss": 0.08609216660261154,
"step": 1946
},
{
"epoch": 2.3785103785103785,
"grad_norm": 1.575578212738037,
"learning_rate": 4.261446698111496e-06,
"loss": 0.194163978099823,
"step": 1948
},
{
"epoch": 2.380952380952381,
"grad_norm": 4.127973556518555,
"learning_rate": 4.24452014386385e-06,
"loss": 0.20009776949882507,
"step": 1950
},
{
"epoch": 2.3833943833943834,
"grad_norm": 0.7139300107955933,
"learning_rate": 4.22764814506874e-06,
"loss": 0.12069036066532135,
"step": 1952
},
{
"epoch": 2.385836385836386,
"grad_norm": 3.075773000717163,
"learning_rate": 4.210830837979932e-06,
"loss": 0.35760805010795593,
"step": 1954
},
{
"epoch": 2.3882783882783882,
"grad_norm": 1.492324948310852,
"learning_rate": 4.194068358409503e-06,
"loss": 0.48620444536209106,
"step": 1956
},
{
"epoch": 2.3907203907203907,
"grad_norm": 1.7053909301757812,
"learning_rate": 4.17736084172677e-06,
"loss": 0.20889446139335632,
"step": 1958
},
{
"epoch": 2.393162393162393,
"grad_norm": 1.3225889205932617,
"learning_rate": 4.160708422857178e-06,
"loss": 0.5993058085441589,
"step": 1960
},
{
"epoch": 2.3956043956043955,
"grad_norm": 1.3367353677749634,
"learning_rate": 4.144111236281214e-06,
"loss": 0.1960648149251938,
"step": 1962
},
{
"epoch": 2.398046398046398,
"grad_norm": 2.359844446182251,
"learning_rate": 4.127569416033332e-06,
"loss": 0.5698574185371399,
"step": 1964
},
{
"epoch": 2.4004884004884004,
"grad_norm": 1.1340882778167725,
"learning_rate": 4.111083095700858e-06,
"loss": 0.18890273571014404,
"step": 1966
},
{
"epoch": 2.402930402930403,
"grad_norm": 2.4454874992370605,
"learning_rate": 4.094652408422913e-06,
"loss": 0.3097396492958069,
"step": 1968
},
{
"epoch": 2.4053724053724053,
"grad_norm": 4.218069553375244,
"learning_rate": 4.078277486889341e-06,
"loss": 0.23327361047267914,
"step": 1970
},
{
"epoch": 2.4078144078144077,
"grad_norm": 3.866490364074707,
"learning_rate": 4.061958463339646e-06,
"loss": 0.06529633700847626,
"step": 1972
},
{
"epoch": 2.41025641025641,
"grad_norm": 0.4942020773887634,
"learning_rate": 4.045695469561899e-06,
"loss": 0.08752602338790894,
"step": 1974
},
{
"epoch": 2.4126984126984126,
"grad_norm": 3.321356773376465,
"learning_rate": 4.029488636891702e-06,
"loss": 0.3558381199836731,
"step": 1976
},
{
"epoch": 2.415140415140415,
"grad_norm": 3.152714729309082,
"learning_rate": 4.013338096211109e-06,
"loss": 0.3303931653499603,
"step": 1978
},
{
"epoch": 2.4175824175824174,
"grad_norm": 0.6018658876419067,
"learning_rate": 3.99724397794758e-06,
"loss": 0.22131627798080444,
"step": 1980
},
{
"epoch": 2.42002442002442,
"grad_norm": 1.3327726125717163,
"learning_rate": 3.981206412072914e-06,
"loss": 0.39478451013565063,
"step": 1982
},
{
"epoch": 2.4224664224664223,
"grad_norm": 1.705815076828003,
"learning_rate": 3.965225528102217e-06,
"loss": 0.3109724521636963,
"step": 1984
},
{
"epoch": 2.4249084249084247,
"grad_norm": 0.7618647217750549,
"learning_rate": 3.949301455092845e-06,
"loss": 0.5224888920783997,
"step": 1986
},
{
"epoch": 2.427350427350427,
"grad_norm": 1.2163892984390259,
"learning_rate": 3.933434321643356e-06,
"loss": 0.4845066964626312,
"step": 1988
},
{
"epoch": 2.42979242979243,
"grad_norm": 0.8843790292739868,
"learning_rate": 3.917624255892489e-06,
"loss": 0.5302805304527283,
"step": 1990
},
{
"epoch": 2.4322344322344325,
"grad_norm": 1.2315729856491089,
"learning_rate": 3.901871385518117e-06,
"loss": 0.42821258306503296,
"step": 1992
},
{
"epoch": 2.434676434676435,
"grad_norm": 0.9088804125785828,
"learning_rate": 3.886175837736214e-06,
"loss": 0.4940814673900604,
"step": 1994
},
{
"epoch": 2.4371184371184373,
"grad_norm": 1.1520100831985474,
"learning_rate": 3.870537739299836e-06,
"loss": 0.3047824501991272,
"step": 1996
},
{
"epoch": 2.4395604395604398,
"grad_norm": 0.7935906648635864,
"learning_rate": 3.854957216498099e-06,
"loss": 0.5371643900871277,
"step": 1998
},
{
"epoch": 2.442002442002442,
"grad_norm": 1.0501606464385986,
"learning_rate": 3.839434395155135e-06,
"loss": 0.24889859557151794,
"step": 2000
},
{
"epoch": 2.4444444444444446,
"grad_norm": 1.7994686365127563,
"learning_rate": 3.8239694006291194e-06,
"loss": 0.45958831906318665,
"step": 2002
},
{
"epoch": 2.446886446886447,
"grad_norm": 0.9377945065498352,
"learning_rate": 3.8085623578112136e-06,
"loss": 0.22220918536186218,
"step": 2004
},
{
"epoch": 2.4493284493284495,
"grad_norm": 1.056534767150879,
"learning_rate": 3.793213391124586e-06,
"loss": 0.29667913913726807,
"step": 2006
},
{
"epoch": 2.451770451770452,
"grad_norm": 1.055069088935852,
"learning_rate": 3.7779226245233937e-06,
"loss": 0.7430405616760254,
"step": 2008
},
{
"epoch": 2.4542124542124544,
"grad_norm": 1.062638282775879,
"learning_rate": 3.7626901814917927e-06,
"loss": 0.3536508083343506,
"step": 2010
},
{
"epoch": 2.456654456654457,
"grad_norm": 2.2568395137786865,
"learning_rate": 3.747516185042922e-06,
"loss": 0.2591190040111542,
"step": 2012
},
{
"epoch": 2.4590964590964592,
"grad_norm": 1.5303833484649658,
"learning_rate": 3.7324007577179283e-06,
"loss": 0.5008297562599182,
"step": 2014
},
{
"epoch": 2.4615384615384617,
"grad_norm": 0.9226781725883484,
"learning_rate": 3.7173440215849744e-06,
"loss": 0.4963090121746063,
"step": 2016
},
{
"epoch": 2.463980463980464,
"grad_norm": 0.9127579927444458,
"learning_rate": 3.7023460982382355e-06,
"loss": 0.5157759189605713,
"step": 2018
},
{
"epoch": 2.4664224664224665,
"grad_norm": 7.223013401031494,
"learning_rate": 3.687407108796942e-06,
"loss": 0.4686001241207123,
"step": 2020
},
{
"epoch": 2.468864468864469,
"grad_norm": 1.2899993658065796,
"learning_rate": 3.672527173904388e-06,
"loss": 0.25978168845176697,
"step": 2022
},
{
"epoch": 2.4713064713064714,
"grad_norm": 5.451155662536621,
"learning_rate": 3.6577064137269525e-06,
"loss": 0.3640308380126953,
"step": 2024
},
{
"epoch": 2.473748473748474,
"grad_norm": 10.173837661743164,
"learning_rate": 3.6429449479531416e-06,
"loss": 0.3720964193344116,
"step": 2026
},
{
"epoch": 2.4761904761904763,
"grad_norm": 0.20691752433776855,
"learning_rate": 3.6282428957926154e-06,
"loss": 0.2083432972431183,
"step": 2028
},
{
"epoch": 2.4786324786324787,
"grad_norm": 2.024094581604004,
"learning_rate": 3.613600375975221e-06,
"loss": 0.5114956498146057,
"step": 2030
},
{
"epoch": 2.481074481074481,
"grad_norm": 1.2281562089920044,
"learning_rate": 3.599017506750042e-06,
"loss": 0.47537893056869507,
"step": 2032
},
{
"epoch": 2.4835164835164836,
"grad_norm": 2.2216989994049072,
"learning_rate": 3.5844944058844393e-06,
"loss": 0.25453007221221924,
"step": 2034
},
{
"epoch": 2.485958485958486,
"grad_norm": 2.591078281402588,
"learning_rate": 3.570031190663098e-06,
"loss": 0.5005137920379639,
"step": 2036
},
{
"epoch": 2.4884004884004884,
"grad_norm": 0.48911339044570923,
"learning_rate": 3.5556279778870862e-06,
"loss": 0.5193389058113098,
"step": 2038
},
{
"epoch": 2.490842490842491,
"grad_norm": 27.082082748413086,
"learning_rate": 3.5412848838729075e-06,
"loss": 0.5654491782188416,
"step": 2040
},
{
"epoch": 2.4932844932844933,
"grad_norm": 1.6297937631607056,
"learning_rate": 3.5270020244515583e-06,
"loss": 0.5325220227241516,
"step": 2042
},
{
"epoch": 2.4957264957264957,
"grad_norm": 0.9335009455680847,
"learning_rate": 3.5127795149676014e-06,
"loss": 0.38437139987945557,
"step": 2044
},
{
"epoch": 2.498168498168498,
"grad_norm": 141.24978637695312,
"learning_rate": 3.49861747027823e-06,
"loss": 0.2638123035430908,
"step": 2046
},
{
"epoch": 2.5006105006105006,
"grad_norm": 1.3640321493148804,
"learning_rate": 3.484516004752334e-06,
"loss": 0.4149170219898224,
"step": 2048
},
{
"epoch": 2.503052503052503,
"grad_norm": 1.0066052675247192,
"learning_rate": 3.4704752322695877e-06,
"loss": 0.4781511425971985,
"step": 2050
},
{
"epoch": 2.5054945054945055,
"grad_norm": 1.2308069467544556,
"learning_rate": 3.456495266219525e-06,
"loss": 0.7653157711029053,
"step": 2052
},
{
"epoch": 2.507936507936508,
"grad_norm": 1.3373329639434814,
"learning_rate": 3.442576219500614e-06,
"loss": 0.36611488461494446,
"step": 2054
},
{
"epoch": 2.5103785103785103,
"grad_norm": 1.555979609489441,
"learning_rate": 3.428718204519369e-06,
"loss": 0.531693696975708,
"step": 2056
},
{
"epoch": 2.5128205128205128,
"grad_norm": 8.703025817871094,
"learning_rate": 3.4149213331894193e-06,
"loss": 0.18801343441009521,
"step": 2058
},
{
"epoch": 2.515262515262515,
"grad_norm": 1.2803109884262085,
"learning_rate": 3.4011857169306127e-06,
"loss": 0.16657070815563202,
"step": 2060
},
{
"epoch": 2.5177045177045176,
"grad_norm": 0.712373673915863,
"learning_rate": 3.3875114666681235e-06,
"loss": 0.2420540601015091,
"step": 2062
},
{
"epoch": 2.52014652014652,
"grad_norm": 1.780391812324524,
"learning_rate": 3.3738986928315474e-06,
"loss": 0.4269709587097168,
"step": 2064
},
{
"epoch": 2.5225885225885225,
"grad_norm": 1.2723828554153442,
"learning_rate": 3.360347505354011e-06,
"loss": 0.3732086420059204,
"step": 2066
},
{
"epoch": 2.525030525030525,
"grad_norm": 2.761953353881836,
"learning_rate": 3.3468580136712903e-06,
"loss": 0.5551900863647461,
"step": 2068
},
{
"epoch": 2.5274725274725274,
"grad_norm": 0.8927345275878906,
"learning_rate": 3.333430326720921e-06,
"loss": 0.5004504919052124,
"step": 2070
},
{
"epoch": 2.52991452991453,
"grad_norm": 0.67017662525177,
"learning_rate": 3.3200645529413165e-06,
"loss": 0.31844204664230347,
"step": 2072
},
{
"epoch": 2.5323565323565322,
"grad_norm": 1.6567728519439697,
"learning_rate": 3.3067608002709006e-06,
"loss": 0.592690646648407,
"step": 2074
},
{
"epoch": 2.5347985347985347,
"grad_norm": 1.0990091562271118,
"learning_rate": 3.2935191761472313e-06,
"loss": 0.509267270565033,
"step": 2076
},
{
"epoch": 2.537240537240537,
"grad_norm": 2.832087516784668,
"learning_rate": 3.280339787506127e-06,
"loss": 0.4890163540840149,
"step": 2078
},
{
"epoch": 2.5396825396825395,
"grad_norm": 3.6818792819976807,
"learning_rate": 3.2672227407808184e-06,
"loss": 0.35127052664756775,
"step": 2080
},
{
"epoch": 2.542124542124542,
"grad_norm": 0.9744904041290283,
"learning_rate": 3.2541681419010716e-06,
"loss": 0.4693216383457184,
"step": 2082
},
{
"epoch": 2.5445665445665444,
"grad_norm": 0.9872434735298157,
"learning_rate": 3.2411760962923434e-06,
"loss": 0.47572940587997437,
"step": 2084
},
{
"epoch": 2.547008547008547,
"grad_norm": 1.288815975189209,
"learning_rate": 3.228246708874926e-06,
"loss": 0.45491641759872437,
"step": 2086
},
{
"epoch": 2.5494505494505493,
"grad_norm": 1.0426764488220215,
"learning_rate": 3.2153800840631043e-06,
"loss": 0.6177046298980713,
"step": 2088
},
{
"epoch": 2.5518925518925517,
"grad_norm": 1.2259653806686401,
"learning_rate": 3.202576325764307e-06,
"loss": 0.45679447054862976,
"step": 2090
},
{
"epoch": 2.554334554334554,
"grad_norm": 2.0075936317443848,
"learning_rate": 3.1898355373782663e-06,
"loss": 0.3028113842010498,
"step": 2092
},
{
"epoch": 2.5567765567765566,
"grad_norm": 0.8422965407371521,
"learning_rate": 3.177157821796191e-06,
"loss": 0.2570323646068573,
"step": 2094
},
{
"epoch": 2.559218559218559,
"grad_norm": 0.8695139288902283,
"learning_rate": 3.1645432813999306e-06,
"loss": 0.3652976155281067,
"step": 2096
},
{
"epoch": 2.5616605616605614,
"grad_norm": 2.6163241863250732,
"learning_rate": 3.1519920180611436e-06,
"loss": 0.08200995624065399,
"step": 2098
},
{
"epoch": 2.564102564102564,
"grad_norm": 0.7538577914237976,
"learning_rate": 3.139504133140484e-06,
"loss": 0.26613810658454895,
"step": 2100
},
{
"epoch": 2.5665445665445663,
"grad_norm": 0.9928892254829407,
"learning_rate": 3.127079727486781e-06,
"loss": 0.39854198694229126,
"step": 2102
},
{
"epoch": 2.5689865689865687,
"grad_norm": 2.9046833515167236,
"learning_rate": 3.114718901436215e-06,
"loss": 0.35459813475608826,
"step": 2104
},
{
"epoch": 2.571428571428571,
"grad_norm": 0.8664820194244385,
"learning_rate": 3.1024217548115195e-06,
"loss": 0.3210771977901459,
"step": 2106
},
{
"epoch": 2.5738705738705736,
"grad_norm": 1.5520901679992676,
"learning_rate": 3.090188386921171e-06,
"loss": 0.24245740473270416,
"step": 2108
},
{
"epoch": 2.576312576312576,
"grad_norm": 1.7673155069351196,
"learning_rate": 3.078018896558582e-06,
"loss": 0.21324002742767334,
"step": 2110
},
{
"epoch": 2.578754578754579,
"grad_norm": 0.730332612991333,
"learning_rate": 3.0659133820013123e-06,
"loss": 0.469443142414093,
"step": 2112
},
{
"epoch": 2.5811965811965814,
"grad_norm": 1.5071324110031128,
"learning_rate": 3.0538719410102612e-06,
"loss": 0.16458410024642944,
"step": 2114
},
{
"epoch": 2.583638583638584,
"grad_norm": 1.1855233907699585,
"learning_rate": 3.0418946708288984e-06,
"loss": 0.3730916976928711,
"step": 2116
},
{
"epoch": 2.586080586080586,
"grad_norm": 1.2179559469223022,
"learning_rate": 3.029981668182458e-06,
"loss": 0.5398478507995605,
"step": 2118
},
{
"epoch": 2.5885225885225887,
"grad_norm": 1.1000230312347412,
"learning_rate": 3.0181330292771727e-06,
"loss": 0.25115227699279785,
"step": 2120
},
{
"epoch": 2.590964590964591,
"grad_norm": 1.437605857849121,
"learning_rate": 3.0063488497994864e-06,
"loss": 0.6454752087593079,
"step": 2122
},
{
"epoch": 2.5934065934065935,
"grad_norm": 0.7121138572692871,
"learning_rate": 2.994629224915288e-06,
"loss": 0.30809617042541504,
"step": 2124
},
{
"epoch": 2.595848595848596,
"grad_norm": 1.196258783340454,
"learning_rate": 2.9829742492691436e-06,
"loss": 0.1984136551618576,
"step": 2126
},
{
"epoch": 2.5982905982905984,
"grad_norm": 3.140024423599243,
"learning_rate": 2.971384016983522e-06,
"loss": 0.4299178123474121,
"step": 2128
},
{
"epoch": 2.600732600732601,
"grad_norm": 2.820770502090454,
"learning_rate": 2.959858621658047e-06,
"loss": 0.2969256043434143,
"step": 2130
},
{
"epoch": 2.6031746031746033,
"grad_norm": 3.3160879611968994,
"learning_rate": 2.94839815636874e-06,
"loss": 0.2652299702167511,
"step": 2132
},
{
"epoch": 2.6056166056166057,
"grad_norm": 0.7100194096565247,
"learning_rate": 2.9370027136672536e-06,
"loss": 0.34369128942489624,
"step": 2134
},
{
"epoch": 2.608058608058608,
"grad_norm": 3.5660557746887207,
"learning_rate": 2.925672385580145e-06,
"loss": 0.30307111144065857,
"step": 2136
},
{
"epoch": 2.6105006105006106,
"grad_norm": 0.9895382523536682,
"learning_rate": 2.9144072636081233e-06,
"loss": 0.2503519058227539,
"step": 2138
},
{
"epoch": 2.612942612942613,
"grad_norm": 0.7191367745399475,
"learning_rate": 2.9032074387253017e-06,
"loss": 0.25583434104919434,
"step": 2140
},
{
"epoch": 2.6153846153846154,
"grad_norm": 1.4276951551437378,
"learning_rate": 2.892073001378481e-06,
"loss": 0.3618330955505371,
"step": 2142
},
{
"epoch": 2.617826617826618,
"grad_norm": 2.0080482959747314,
"learning_rate": 2.881004041486406e-06,
"loss": 0.4887958765029907,
"step": 2144
},
{
"epoch": 2.6202686202686203,
"grad_norm": 0.8838030099868774,
"learning_rate": 2.8700006484390395e-06,
"loss": 0.46932682394981384,
"step": 2146
},
{
"epoch": 2.6227106227106227,
"grad_norm": 1.1301337480545044,
"learning_rate": 2.8590629110968503e-06,
"loss": 0.3209373652935028,
"step": 2148
},
{
"epoch": 2.625152625152625,
"grad_norm": 1.117184042930603,
"learning_rate": 2.8481909177900874e-06,
"loss": 0.468944787979126,
"step": 2150
},
{
"epoch": 2.6275946275946276,
"grad_norm": 1.6847853660583496,
"learning_rate": 2.837384756318063e-06,
"loss": 0.439802885055542,
"step": 2152
},
{
"epoch": 2.63003663003663,
"grad_norm": 1.6028481721878052,
"learning_rate": 2.826644513948456e-06,
"loss": 0.48533153533935547,
"step": 2154
},
{
"epoch": 2.6324786324786325,
"grad_norm": 2.249617576599121,
"learning_rate": 2.8159702774166e-06,
"loss": 0.5256586670875549,
"step": 2156
},
{
"epoch": 2.634920634920635,
"grad_norm": 1.6403663158416748,
"learning_rate": 2.8053621329247767e-06,
"loss": 0.5299547910690308,
"step": 2158
},
{
"epoch": 2.6373626373626373,
"grad_norm": 1.569277048110962,
"learning_rate": 2.7948201661415307e-06,
"loss": 0.2885707914829254,
"step": 2160
},
{
"epoch": 2.6398046398046398,
"grad_norm": 1.2910041809082031,
"learning_rate": 2.7843444622009746e-06,
"loss": 0.34332627058029175,
"step": 2162
},
{
"epoch": 2.642246642246642,
"grad_norm": 1.1258636713027954,
"learning_rate": 2.773935105702096e-06,
"loss": 0.3300524652004242,
"step": 2164
},
{
"epoch": 2.6446886446886446,
"grad_norm": 1.1584712266921997,
"learning_rate": 2.763592180708081e-06,
"loss": 0.4990626871585846,
"step": 2166
},
{
"epoch": 2.647130647130647,
"grad_norm": 0.8549714684486389,
"learning_rate": 2.7533157707456336e-06,
"loss": 0.42835402488708496,
"step": 2168
},
{
"epoch": 2.6495726495726495,
"grad_norm": 0.7408347129821777,
"learning_rate": 2.7431059588042945e-06,
"loss": 0.504192590713501,
"step": 2170
},
{
"epoch": 2.652014652014652,
"grad_norm": 1.2692267894744873,
"learning_rate": 2.7329628273357815e-06,
"loss": 0.5846405029296875,
"step": 2172
},
{
"epoch": 2.6544566544566544,
"grad_norm": 1.1758378744125366,
"learning_rate": 2.72288645825332e-06,
"loss": 0.4775027632713318,
"step": 2174
},
{
"epoch": 2.656898656898657,
"grad_norm": 1.020842432975769,
"learning_rate": 2.7128769329309744e-06,
"loss": 0.2678804397583008,
"step": 2176
},
{
"epoch": 2.659340659340659,
"grad_norm": 0.7583962082862854,
"learning_rate": 2.702934332203002e-06,
"loss": 0.4422096908092499,
"step": 2178
},
{
"epoch": 2.6617826617826617,
"grad_norm": 2.3237428665161133,
"learning_rate": 2.6930587363631932e-06,
"loss": 0.4233754575252533,
"step": 2180
},
{
"epoch": 2.664224664224664,
"grad_norm": 0.6809400916099548,
"learning_rate": 2.6832502251642223e-06,
"loss": 0.40418240427970886,
"step": 2182
},
{
"epoch": 2.6666666666666665,
"grad_norm": 0.7269507050514221,
"learning_rate": 2.6735088778170105e-06,
"loss": 0.2588379979133606,
"step": 2184
},
{
"epoch": 2.669108669108669,
"grad_norm": 1.2103101015090942,
"learning_rate": 2.66383477299008e-06,
"loss": 0.39823517203330994,
"step": 2186
},
{
"epoch": 2.6715506715506714,
"grad_norm": 0.9755131006240845,
"learning_rate": 2.6542279888089163e-06,
"loss": 0.3795110881328583,
"step": 2188
},
{
"epoch": 2.6739926739926743,
"grad_norm": 0.9968971610069275,
"learning_rate": 2.6446886028553476e-06,
"loss": 0.5400364995002747,
"step": 2190
},
{
"epoch": 2.6764346764346767,
"grad_norm": 2.260093927383423,
"learning_rate": 2.6352166921669076e-06,
"loss": 0.5039065480232239,
"step": 2192
},
{
"epoch": 2.678876678876679,
"grad_norm": 2.027021646499634,
"learning_rate": 2.625812333236222e-06,
"loss": 0.13939893245697021,
"step": 2194
},
{
"epoch": 2.6813186813186816,
"grad_norm": 0.9278559684753418,
"learning_rate": 2.61647560201038e-06,
"loss": 0.33114153146743774,
"step": 2196
},
{
"epoch": 2.683760683760684,
"grad_norm": 0.7097818851470947,
"learning_rate": 2.6072065738903335e-06,
"loss": 0.521342396736145,
"step": 2198
},
{
"epoch": 2.6862026862026864,
"grad_norm": 1.291375756263733,
"learning_rate": 2.5980053237302816e-06,
"loss": 0.4681139588356018,
"step": 2200
},
{
"epoch": 2.688644688644689,
"grad_norm": 4.184018611907959,
"learning_rate": 2.588871925837062e-06,
"loss": 0.28020548820495605,
"step": 2202
},
{
"epoch": 2.6910866910866913,
"grad_norm": 2.3299784660339355,
"learning_rate": 2.5798064539695604e-06,
"loss": 0.5311964750289917,
"step": 2204
},
{
"epoch": 2.6935286935286937,
"grad_norm": 2.1903867721557617,
"learning_rate": 2.5708089813381088e-06,
"loss": 0.12289441376924515,
"step": 2206
},
{
"epoch": 2.695970695970696,
"grad_norm": 2.261828899383545,
"learning_rate": 2.561879580603893e-06,
"loss": 0.47109082341194153,
"step": 2208
},
{
"epoch": 2.6984126984126986,
"grad_norm": 1.110669493675232,
"learning_rate": 2.5530183238783728e-06,
"loss": 0.3485221564769745,
"step": 2210
},
{
"epoch": 2.700854700854701,
"grad_norm": 1.4882543087005615,
"learning_rate": 2.5442252827226925e-06,
"loss": 0.5045080184936523,
"step": 2212
},
{
"epoch": 2.7032967032967035,
"grad_norm": 1.0571633577346802,
"learning_rate": 2.5355005281471046e-06,
"loss": 0.2372823804616928,
"step": 2214
},
{
"epoch": 2.705738705738706,
"grad_norm": 0.6153679490089417,
"learning_rate": 2.526844130610399e-06,
"loss": 0.2721218168735504,
"step": 2216
},
{
"epoch": 2.7081807081807083,
"grad_norm": 1.4203280210494995,
"learning_rate": 2.5182561600193317e-06,
"loss": 0.311516672372818,
"step": 2218
},
{
"epoch": 2.7106227106227108,
"grad_norm": 2.5656776428222656,
"learning_rate": 2.5097366857280636e-06,
"loss": 0.1073763519525528,
"step": 2220
},
{
"epoch": 2.713064713064713,
"grad_norm": 1.4745090007781982,
"learning_rate": 2.501285776537593e-06,
"loss": 0.358319491147995,
"step": 2222
},
{
"epoch": 2.7155067155067156,
"grad_norm": 1.3464287519454956,
"learning_rate": 2.4929035006952106e-06,
"loss": 0.21015426516532898,
"step": 2224
},
{
"epoch": 2.717948717948718,
"grad_norm": 1.07408607006073,
"learning_rate": 2.4845899258939362e-06,
"loss": 0.25736236572265625,
"step": 2226
},
{
"epoch": 2.7203907203907205,
"grad_norm": 2.0369420051574707,
"learning_rate": 2.4763451192719816e-06,
"loss": 0.2484760284423828,
"step": 2228
},
{
"epoch": 2.722832722832723,
"grad_norm": 1.062886357307434,
"learning_rate": 2.4681691474122064e-06,
"loss": 0.4695739150047302,
"step": 2230
},
{
"epoch": 2.7252747252747254,
"grad_norm": 3.0891904830932617,
"learning_rate": 2.4600620763415754e-06,
"loss": 0.2893969714641571,
"step": 2232
},
{
"epoch": 2.727716727716728,
"grad_norm": 0.8144769072532654,
"learning_rate": 2.4520239715306325e-06,
"loss": 0.5152880549430847,
"step": 2234
},
{
"epoch": 2.7301587301587302,
"grad_norm": 1.6376501321792603,
"learning_rate": 2.4440548978929678e-06,
"loss": 0.7832448482513428,
"step": 2236
},
{
"epoch": 2.7326007326007327,
"grad_norm": 0.9825433492660522,
"learning_rate": 2.4361549197846914e-06,
"loss": 0.376642107963562,
"step": 2238
},
{
"epoch": 2.735042735042735,
"grad_norm": 1.621090054512024,
"learning_rate": 2.42832410100392e-06,
"loss": 0.26889967918395996,
"step": 2240
},
{
"epoch": 2.7374847374847375,
"grad_norm": 0.8367129564285278,
"learning_rate": 2.420562504790256e-06,
"loss": 0.5269310474395752,
"step": 2242
},
{
"epoch": 2.73992673992674,
"grad_norm": 2.0027148723602295,
"learning_rate": 2.412870193824278e-06,
"loss": 0.2715807557106018,
"step": 2244
},
{
"epoch": 2.7423687423687424,
"grad_norm": 1.490946650505066,
"learning_rate": 2.4052472302270365e-06,
"loss": 0.2188037633895874,
"step": 2246
},
{
"epoch": 2.744810744810745,
"grad_norm": 1.6017478704452515,
"learning_rate": 2.3976936755595533e-06,
"loss": 0.4869040846824646,
"step": 2248
},
{
"epoch": 2.7472527472527473,
"grad_norm": 1.3607432842254639,
"learning_rate": 2.390209590822319e-06,
"loss": 0.40255841612815857,
"step": 2250
},
{
"epoch": 2.7496947496947497,
"grad_norm": 1.5456528663635254,
"learning_rate": 2.3827950364548034e-06,
"loss": 0.6289904117584229,
"step": 2252
},
{
"epoch": 2.752136752136752,
"grad_norm": 1.5753426551818848,
"learning_rate": 2.375450072334972e-06,
"loss": 0.5615298748016357,
"step": 2254
},
{
"epoch": 2.7545787545787546,
"grad_norm": 1.4261977672576904,
"learning_rate": 2.3681747577787924e-06,
"loss": 0.2363334745168686,
"step": 2256
},
{
"epoch": 2.757020757020757,
"grad_norm": 1.1819992065429688,
"learning_rate": 2.3609691515397628e-06,
"loss": 0.4858379364013672,
"step": 2258
},
{
"epoch": 2.7594627594627594,
"grad_norm": 1.9267686605453491,
"learning_rate": 2.3538333118084396e-06,
"loss": 0.5177884697914124,
"step": 2260
},
{
"epoch": 2.761904761904762,
"grad_norm": 1.1344858407974243,
"learning_rate": 2.3467672962119565e-06,
"loss": 0.5373342037200928,
"step": 2262
},
{
"epoch": 2.7643467643467643,
"grad_norm": 0.8637273907661438,
"learning_rate": 2.3397711618135725e-06,
"loss": 0.43640759587287903,
"step": 2264
},
{
"epoch": 2.7667887667887667,
"grad_norm": 1.145462155342102,
"learning_rate": 2.332844965112201e-06,
"loss": 0.3964022099971771,
"step": 2266
},
{
"epoch": 2.769230769230769,
"grad_norm": 0.5111590623855591,
"learning_rate": 2.3259887620419573e-06,
"loss": 0.3127731680870056,
"step": 2268
},
{
"epoch": 2.7716727716727716,
"grad_norm": 0.791425347328186,
"learning_rate": 2.3192026079717086e-06,
"loss": 0.2613333463668823,
"step": 2270
},
{
"epoch": 2.774114774114774,
"grad_norm": 0.23441043496131897,
"learning_rate": 2.3124865577046252e-06,
"loss": 0.07839272171258926,
"step": 2272
},
{
"epoch": 2.7765567765567765,
"grad_norm": 1.0026205778121948,
"learning_rate": 2.3058406654777355e-06,
"loss": 0.502284824848175,
"step": 2274
},
{
"epoch": 2.778998778998779,
"grad_norm": 0.9165741801261902,
"learning_rate": 2.299264984961492e-06,
"loss": 0.6292468905448914,
"step": 2276
},
{
"epoch": 2.7814407814407813,
"grad_norm": 1.3016325235366821,
"learning_rate": 2.2927595692593366e-06,
"loss": 0.3484017252922058,
"step": 2278
},
{
"epoch": 2.7838827838827838,
"grad_norm": 1.573944091796875,
"learning_rate": 2.286324470907269e-06,
"loss": 0.18759427964687347,
"step": 2280
},
{
"epoch": 2.786324786324786,
"grad_norm": 2.0719950199127197,
"learning_rate": 2.279959741873426e-06,
"loss": 0.419060617685318,
"step": 2282
},
{
"epoch": 2.7887667887667886,
"grad_norm": 1.6407965421676636,
"learning_rate": 2.2736654335576634e-06,
"loss": 0.4783077836036682,
"step": 2284
},
{
"epoch": 2.791208791208791,
"grad_norm": 1.0861320495605469,
"learning_rate": 2.267441596791132e-06,
"loss": 0.4703105390071869,
"step": 2286
},
{
"epoch": 2.7936507936507935,
"grad_norm": 0.9553175568580627,
"learning_rate": 2.2612882818358784e-06,
"loss": 0.41585975885391235,
"step": 2288
},
{
"epoch": 2.796092796092796,
"grad_norm": 9.468893051147461,
"learning_rate": 2.2552055383844327e-06,
"loss": 0.08420296758413315,
"step": 2290
},
{
"epoch": 2.7985347985347984,
"grad_norm": 2.4556336402893066,
"learning_rate": 2.2491934155594063e-06,
"loss": 0.35032370686531067,
"step": 2292
},
{
"epoch": 2.800976800976801,
"grad_norm": 1.1944650411605835,
"learning_rate": 2.243251961913099e-06,
"loss": 0.36088746786117554,
"step": 2294
},
{
"epoch": 2.8034188034188032,
"grad_norm": 0.9773551821708679,
"learning_rate": 2.2373812254271074e-06,
"loss": 0.42339953780174255,
"step": 2296
},
{
"epoch": 2.8058608058608057,
"grad_norm": 1.2944077253341675,
"learning_rate": 2.231581253511929e-06,
"loss": 0.1882065087556839,
"step": 2298
},
{
"epoch": 2.808302808302808,
"grad_norm": 1.328771948814392,
"learning_rate": 2.2258520930065902e-06,
"loss": 0.33834829926490784,
"step": 2300
},
{
"epoch": 2.8107448107448105,
"grad_norm": 1.5796797275543213,
"learning_rate": 2.2201937901782632e-06,
"loss": 0.5746235847473145,
"step": 2302
},
{
"epoch": 2.813186813186813,
"grad_norm": 0.1405964195728302,
"learning_rate": 2.2146063907218928e-06,
"loss": 0.2884528338909149,
"step": 2304
},
{
"epoch": 2.8156288156288154,
"grad_norm": 0.7891967296600342,
"learning_rate": 2.2090899397598235e-06,
"loss": 0.34547799825668335,
"step": 2306
},
{
"epoch": 2.818070818070818,
"grad_norm": 1.0902297496795654,
"learning_rate": 2.2036444818414424e-06,
"loss": 0.4068155288696289,
"step": 2308
},
{
"epoch": 2.8205128205128203,
"grad_norm": 1.061621904373169,
"learning_rate": 2.198270060942815e-06,
"loss": 0.4539620876312256,
"step": 2310
},
{
"epoch": 2.8229548229548227,
"grad_norm": 0.9649152755737305,
"learning_rate": 2.192966720466328e-06,
"loss": 0.22723491489887238,
"step": 2312
},
{
"epoch": 2.825396825396825,
"grad_norm": 10.881244659423828,
"learning_rate": 2.1877345032403458e-06,
"loss": 0.287578284740448,
"step": 2314
},
{
"epoch": 2.8278388278388276,
"grad_norm": 2.314340829849243,
"learning_rate": 2.182573451518859e-06,
"loss": 0.4537888169288635,
"step": 2316
},
{
"epoch": 2.8302808302808304,
"grad_norm": 1.7877088785171509,
"learning_rate": 2.1774836069811415e-06,
"loss": 0.3850943446159363,
"step": 2318
},
{
"epoch": 2.832722832722833,
"grad_norm": 0.8207268714904785,
"learning_rate": 2.1724650107314217e-06,
"loss": 0.22680553793907166,
"step": 2320
},
{
"epoch": 2.8351648351648353,
"grad_norm": 1.7450029850006104,
"learning_rate": 2.1675177032985435e-06,
"loss": 0.34959569573402405,
"step": 2322
},
{
"epoch": 2.8376068376068377,
"grad_norm": 0.28571420907974243,
"learning_rate": 2.1626417246356398e-06,
"loss": 0.08046525716781616,
"step": 2324
},
{
"epoch": 2.84004884004884,
"grad_norm": 1.3986101150512695,
"learning_rate": 2.1578371141198154e-06,
"loss": 0.3989933431148529,
"step": 2326
},
{
"epoch": 2.8424908424908426,
"grad_norm": 1.5185210704803467,
"learning_rate": 2.15310391055182e-06,
"loss": 0.27708202600479126,
"step": 2328
},
{
"epoch": 2.844932844932845,
"grad_norm": 2.8958606719970703,
"learning_rate": 2.1484421521557453e-06,
"loss": 0.24901802837848663,
"step": 2330
},
{
"epoch": 2.8473748473748475,
"grad_norm": 1.108059048652649,
"learning_rate": 2.143851876578706e-06,
"loss": 0.45619091391563416,
"step": 2332
},
{
"epoch": 2.84981684981685,
"grad_norm": 1.0437735319137573,
"learning_rate": 2.1393331208905436e-06,
"loss": 0.07932747900485992,
"step": 2334
},
{
"epoch": 2.8522588522588523,
"grad_norm": 1.237439513206482,
"learning_rate": 2.134885921583522e-06,
"loss": 0.5910269021987915,
"step": 2336
},
{
"epoch": 2.8547008547008548,
"grad_norm": 1.1078741550445557,
"learning_rate": 2.1305103145720383e-06,
"loss": 0.3153696656227112,
"step": 2338
},
{
"epoch": 2.857142857142857,
"grad_norm": 1.034421682357788,
"learning_rate": 2.1262063351923255e-06,
"loss": 0.47363409399986267,
"step": 2340
},
{
"epoch": 2.8595848595848596,
"grad_norm": 1.1710708141326904,
"learning_rate": 2.121974018202172e-06,
"loss": 0.48734188079833984,
"step": 2342
},
{
"epoch": 2.862026862026862,
"grad_norm": 2.568005084991455,
"learning_rate": 2.1178133977806413e-06,
"loss": 0.19048890471458435,
"step": 2344
},
{
"epoch": 2.8644688644688645,
"grad_norm": 1.4728940725326538,
"learning_rate": 2.113724507527794e-06,
"loss": 0.6129634976387024,
"step": 2346
},
{
"epoch": 2.866910866910867,
"grad_norm": 0.22239279747009277,
"learning_rate": 2.1097073804644163e-06,
"loss": 0.2763885259628296,
"step": 2348
},
{
"epoch": 2.8693528693528694,
"grad_norm": 0.6631549000740051,
"learning_rate": 2.105762049031753e-06,
"loss": 0.2500677704811096,
"step": 2350
},
{
"epoch": 2.871794871794872,
"grad_norm": 1.0234497785568237,
"learning_rate": 2.1018885450912487e-06,
"loss": 0.45614075660705566,
"step": 2352
},
{
"epoch": 2.8742368742368742,
"grad_norm": 1.8352830410003662,
"learning_rate": 2.098086899924288e-06,
"loss": 0.3945198953151703,
"step": 2354
},
{
"epoch": 2.8766788766788767,
"grad_norm": 0.8980585932731628,
"learning_rate": 2.0943571442319437e-06,
"loss": 0.49924108386039734,
"step": 2356
},
{
"epoch": 2.879120879120879,
"grad_norm": 25.131999969482422,
"learning_rate": 2.090699308134726e-06,
"loss": 0.4753328263759613,
"step": 2358
},
{
"epoch": 2.8815628815628815,
"grad_norm": 1.648654818534851,
"learning_rate": 2.0871134211723417e-06,
"loss": 0.23788021504878998,
"step": 2360
},
{
"epoch": 2.884004884004884,
"grad_norm": 1.9093987941741943,
"learning_rate": 2.0835995123034603e-06,
"loss": 0.32568857073783875,
"step": 2362
},
{
"epoch": 2.8864468864468864,
"grad_norm": 1.0956945419311523,
"learning_rate": 2.0801576099054696e-06,
"loss": 0.6228987574577332,
"step": 2364
},
{
"epoch": 2.888888888888889,
"grad_norm": 0.992882490158081,
"learning_rate": 2.0767877417742564e-06,
"loss": 0.39544668793678284,
"step": 2366
},
{
"epoch": 2.8913308913308913,
"grad_norm": 4.350165367126465,
"learning_rate": 2.0734899351239744e-06,
"loss": 0.3747745156288147,
"step": 2368
},
{
"epoch": 2.8937728937728937,
"grad_norm": 1.0189871788024902,
"learning_rate": 2.0702642165868326e-06,
"loss": 0.3083977997303009,
"step": 2370
},
{
"epoch": 2.896214896214896,
"grad_norm": 1.012895107269287,
"learning_rate": 2.0671106122128717e-06,
"loss": 0.388817697763443,
"step": 2372
},
{
"epoch": 2.8986568986568986,
"grad_norm": 0.2986360788345337,
"learning_rate": 2.064029147469759e-06,
"loss": 0.3050660490989685,
"step": 2374
},
{
"epoch": 2.901098901098901,
"grad_norm": 3.99959397315979,
"learning_rate": 2.0610198472425817e-06,
"loss": 0.42830216884613037,
"step": 2376
},
{
"epoch": 2.9035409035409034,
"grad_norm": 0.9284391403198242,
"learning_rate": 2.0580827358336447e-06,
"loss": 0.4124550223350525,
"step": 2378
},
{
"epoch": 2.905982905982906,
"grad_norm": 1.0101871490478516,
"learning_rate": 2.055217836962276e-06,
"loss": 0.34032320976257324,
"step": 2380
},
{
"epoch": 2.9084249084249083,
"grad_norm": 2.9604995250701904,
"learning_rate": 2.0524251737646367e-06,
"loss": 0.5842119455337524,
"step": 2382
},
{
"epoch": 2.9108669108669107,
"grad_norm": 1.806335687637329,
"learning_rate": 2.049704768793527e-06,
"loss": 0.308889776468277,
"step": 2384
},
{
"epoch": 2.913308913308913,
"grad_norm": 1.2805176973342896,
"learning_rate": 2.0470566440182126e-06,
"loss": 0.736882746219635,
"step": 2386
},
{
"epoch": 2.9157509157509156,
"grad_norm": 1.484055995941162,
"learning_rate": 2.0444808208242414e-06,
"loss": 0.3669341504573822,
"step": 2388
},
{
"epoch": 2.918192918192918,
"grad_norm": 2.3404009342193604,
"learning_rate": 2.041977320013275e-06,
"loss": 0.303989052772522,
"step": 2390
},
{
"epoch": 2.9206349206349205,
"grad_norm": 4.0918097496032715,
"learning_rate": 2.0395461618029175e-06,
"loss": 0.4449572265148163,
"step": 2392
},
{
"epoch": 2.9230769230769234,
"grad_norm": 1.0222722291946411,
"learning_rate": 2.0371873658265546e-06,
"loss": 0.31565719842910767,
"step": 2394
},
{
"epoch": 2.925518925518926,
"grad_norm": 1.7059550285339355,
"learning_rate": 2.0349009511331912e-06,
"loss": 0.24595557153224945,
"step": 2396
},
{
"epoch": 2.927960927960928,
"grad_norm": 3.1395483016967773,
"learning_rate": 2.032686936187305e-06,
"loss": 0.30839934945106506,
"step": 2398
},
{
"epoch": 2.9304029304029307,
"grad_norm": 2.6876676082611084,
"learning_rate": 2.0305453388686876e-06,
"loss": 0.32078707218170166,
"step": 2400
},
{
"epoch": 2.932844932844933,
"grad_norm": 1.2002466917037964,
"learning_rate": 2.0284761764723087e-06,
"loss": 0.27718839049339294,
"step": 2402
},
{
"epoch": 2.9352869352869355,
"grad_norm": 1.524316668510437,
"learning_rate": 2.026479465708171e-06,
"loss": 0.18042829632759094,
"step": 2404
},
{
"epoch": 2.937728937728938,
"grad_norm": 2.9133212566375732,
"learning_rate": 2.0245552227011777e-06,
"loss": 0.5652621984481812,
"step": 2406
},
{
"epoch": 2.9401709401709404,
"grad_norm": 1.6217875480651855,
"learning_rate": 2.022703462991003e-06,
"loss": 0.28077784180641174,
"step": 2408
},
{
"epoch": 2.942612942612943,
"grad_norm": 0.957901656627655,
"learning_rate": 2.0209242015319625e-06,
"loss": 0.312043696641922,
"step": 2410
},
{
"epoch": 2.9450549450549453,
"grad_norm": 0.7723997235298157,
"learning_rate": 2.0192174526928982e-06,
"loss": 0.42037639021873474,
"step": 2412
},
{
"epoch": 2.9474969474969477,
"grad_norm": 1.0776695013046265,
"learning_rate": 2.0175832302570575e-06,
"loss": 0.5173778533935547,
"step": 2414
},
{
"epoch": 2.94993894993895,
"grad_norm": 0.926655650138855,
"learning_rate": 2.016021547421984e-06,
"loss": 0.46436506509780884,
"step": 2416
},
{
"epoch": 2.9523809523809526,
"grad_norm": 1.5396034717559814,
"learning_rate": 2.0145324167994134e-06,
"loss": 0.24875374138355255,
"step": 2418
},
{
"epoch": 2.954822954822955,
"grad_norm": 1.1180499792099,
"learning_rate": 2.0131158504151655e-06,
"loss": 0.35978463292121887,
"step": 2420
},
{
"epoch": 2.9572649572649574,
"grad_norm": 0.9617190957069397,
"learning_rate": 2.0117718597090543e-06,
"loss": 0.3947286605834961,
"step": 2422
},
{
"epoch": 2.95970695970696,
"grad_norm": 1.0433496236801147,
"learning_rate": 2.010500455534788e-06,
"loss": 0.28263401985168457,
"step": 2424
},
{
"epoch": 2.9621489621489623,
"grad_norm": 1.070198893547058,
"learning_rate": 2.0093016481598885e-06,
"loss": 0.5800071954727173,
"step": 2426
},
{
"epoch": 2.9645909645909647,
"grad_norm": 3.0985279083251953,
"learning_rate": 2.0081754472656034e-06,
"loss": 0.1977805346250534,
"step": 2428
},
{
"epoch": 2.967032967032967,
"grad_norm": 1.0906306505203247,
"learning_rate": 2.0071218619468327e-06,
"loss": 0.3762721121311188,
"step": 2430
},
{
"epoch": 2.9694749694749696,
"grad_norm": 0.7913962602615356,
"learning_rate": 2.0061409007120475e-06,
"loss": 0.3768196403980255,
"step": 2432
},
{
"epoch": 2.971916971916972,
"grad_norm": 1.3056226968765259,
"learning_rate": 2.005232571483231e-06,
"loss": 0.46781641244888306,
"step": 2434
},
{
"epoch": 2.9743589743589745,
"grad_norm": 1.005242109298706,
"learning_rate": 2.0043968815958075e-06,
"loss": 0.25440388917922974,
"step": 2436
},
{
"epoch": 2.976800976800977,
"grad_norm": 3.3108999729156494,
"learning_rate": 2.003633837798584e-06,
"loss": 0.12983591854572296,
"step": 2438
},
{
"epoch": 2.9792429792429793,
"grad_norm": 1.743328332901001,
"learning_rate": 2.0029434462537e-06,
"loss": 0.43715769052505493,
"step": 2440
},
{
"epoch": 2.9816849816849818,
"grad_norm": 1.05440092086792,
"learning_rate": 2.002325712536572e-06,
"loss": 0.4317605495452881,
"step": 2442
},
{
"epoch": 2.984126984126984,
"grad_norm": 2.774752616882324,
"learning_rate": 2.001780641635854e-06,
"loss": 0.39571458101272583,
"step": 2444
},
{
"epoch": 2.9865689865689866,
"grad_norm": 1.0296354293823242,
"learning_rate": 2.001308237953393e-06,
"loss": 0.4417667090892792,
"step": 2446
},
{
"epoch": 2.989010989010989,
"grad_norm": 1.3123754262924194,
"learning_rate": 2.000908505304195e-06,
"loss": 0.5195387601852417,
"step": 2448
},
{
"epoch": 2.9914529914529915,
"grad_norm": 2.177339553833008,
"learning_rate": 2.0005814469163937e-06,
"loss": 0.19710102677345276,
"step": 2450
},
{
"epoch": 2.993894993894994,
"grad_norm": 1.543820858001709,
"learning_rate": 2.0003270654312266e-06,
"loss": 0.4630212187767029,
"step": 2452
},
{
"epoch": 2.9963369963369964,
"grad_norm": 1.5547709465026855,
"learning_rate": 2.000145362903009e-06,
"loss": 0.6292054057121277,
"step": 2454
},
{
"epoch": 2.998778998778999,
"grad_norm": 0.33734217286109924,
"learning_rate": 2.0000363407991222e-06,
"loss": 0.16045792400836945,
"step": 2456
},
{
"epoch": 3.0,
"step": 2457,
"total_flos": 2.578606960937009e+18,
"train_loss": 0.8228938954362648,
"train_runtime": 8271.5959,
"train_samples_per_second": 4.753,
"train_steps_per_second": 0.297
}
],
"logging_steps": 2,
"max_steps": 2457,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 99999,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": false,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 2.578606960937009e+18,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}