lfm-sft / trainer_state.json
Ba2han's picture
Final continued pretraining checkpoint
417dfbc verified
Raw
History Blame Contribute Delete
348 kB
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.1001598295151838,
"eval_steps": 500,
"global_step": 4130,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0005327650506126798,
"grad_norm": 16.25,
"learning_rate": 3.8647342995169085e-07,
"loss": 2.6138787269592285,
"step": 2
},
{
"epoch": 0.0010655301012253596,
"grad_norm": 16.875,
"learning_rate": 1.1594202898550726e-06,
"loss": 2.5900964736938477,
"step": 4
},
{
"epoch": 0.0015982951518380393,
"grad_norm": 15.375,
"learning_rate": 1.932367149758454e-06,
"loss": 2.5705907344818115,
"step": 6
},
{
"epoch": 0.002131060202450719,
"grad_norm": 13.4375,
"learning_rate": 2.7053140096618356e-06,
"loss": 2.5161614418029785,
"step": 8
},
{
"epoch": 0.002663825253063399,
"grad_norm": 11.9375,
"learning_rate": 3.4782608695652175e-06,
"loss": 2.573242425918579,
"step": 10
},
{
"epoch": 0.0031965903036760787,
"grad_norm": 10.6875,
"learning_rate": 4.251207729468599e-06,
"loss": 2.527428388595581,
"step": 12
},
{
"epoch": 0.0037293553542887587,
"grad_norm": 6.1875,
"learning_rate": 5.024154589371981e-06,
"loss": 2.3534696102142334,
"step": 14
},
{
"epoch": 0.004262120404901438,
"grad_norm": 5.4375,
"learning_rate": 5.797101449275363e-06,
"loss": 2.3852028846740723,
"step": 16
},
{
"epoch": 0.004794885455514118,
"grad_norm": 4.28125,
"learning_rate": 6.570048309178745e-06,
"loss": 2.350311040878296,
"step": 18
},
{
"epoch": 0.005327650506126798,
"grad_norm": 2.9375,
"learning_rate": 7.342995169082127e-06,
"loss": 2.2285757064819336,
"step": 20
},
{
"epoch": 0.005860415556739478,
"grad_norm": 2.546875,
"learning_rate": 8.115942028985508e-06,
"loss": 2.2054595947265625,
"step": 22
},
{
"epoch": 0.006393180607352157,
"grad_norm": 2.875,
"learning_rate": 8.888888888888888e-06,
"loss": 2.237429618835449,
"step": 24
},
{
"epoch": 0.006925945657964837,
"grad_norm": 2.015625,
"learning_rate": 9.66183574879227e-06,
"loss": 2.1398394107818604,
"step": 26
},
{
"epoch": 0.007458710708577517,
"grad_norm": 1.78125,
"learning_rate": 1.0434782608695653e-05,
"loss": 2.1224007606506348,
"step": 28
},
{
"epoch": 0.007991475759190196,
"grad_norm": 1.65625,
"learning_rate": 1.1207729468599035e-05,
"loss": 2.0460424423217773,
"step": 30
},
{
"epoch": 0.008524240809802876,
"grad_norm": 1.6015625,
"learning_rate": 1.1980676328502416e-05,
"loss": 2.047302484512329,
"step": 32
},
{
"epoch": 0.009057005860415556,
"grad_norm": 1.5078125,
"learning_rate": 1.2753623188405797e-05,
"loss": 2.0680582523345947,
"step": 34
},
{
"epoch": 0.009589770911028236,
"grad_norm": 1.4921875,
"learning_rate": 1.352657004830918e-05,
"loss": 1.9368038177490234,
"step": 36
},
{
"epoch": 0.010122535961640916,
"grad_norm": 1.4609375,
"learning_rate": 1.4299516908212561e-05,
"loss": 1.9544492959976196,
"step": 38
},
{
"epoch": 0.010655301012253596,
"grad_norm": 1.484375,
"learning_rate": 1.5072463768115944e-05,
"loss": 1.9077532291412354,
"step": 40
},
{
"epoch": 0.011188066062866276,
"grad_norm": 1.4765625,
"learning_rate": 1.5845410628019324e-05,
"loss": 1.8669679164886475,
"step": 42
},
{
"epoch": 0.011720831113478956,
"grad_norm": 1.40625,
"learning_rate": 1.6618357487922706e-05,
"loss": 1.860713243484497,
"step": 44
},
{
"epoch": 0.012253596164091636,
"grad_norm": 1.390625,
"learning_rate": 1.739130434782609e-05,
"loss": 1.8645515441894531,
"step": 46
},
{
"epoch": 0.012786361214704315,
"grad_norm": 1.4140625,
"learning_rate": 1.816425120772947e-05,
"loss": 1.8843761682510376,
"step": 48
},
{
"epoch": 0.013319126265316995,
"grad_norm": 1.34375,
"learning_rate": 1.893719806763285e-05,
"loss": 1.81703519821167,
"step": 50
},
{
"epoch": 0.013851891315929675,
"grad_norm": 1.390625,
"learning_rate": 1.9710144927536236e-05,
"loss": 1.791835069656372,
"step": 52
},
{
"epoch": 0.014384656366542355,
"grad_norm": 1.484375,
"learning_rate": 2.0483091787439618e-05,
"loss": 1.8730442523956299,
"step": 54
},
{
"epoch": 0.014917421417155035,
"grad_norm": 1.5,
"learning_rate": 2.1256038647342997e-05,
"loss": 1.7048213481903076,
"step": 56
},
{
"epoch": 0.015450186467767715,
"grad_norm": 1.375,
"learning_rate": 2.202898550724638e-05,
"loss": 1.7030128240585327,
"step": 58
},
{
"epoch": 0.015982951518380393,
"grad_norm": 1.453125,
"learning_rate": 2.280193236714976e-05,
"loss": 1.730646014213562,
"step": 60
},
{
"epoch": 0.016515716568993075,
"grad_norm": 1.4453125,
"learning_rate": 2.357487922705314e-05,
"loss": 1.6557202339172363,
"step": 62
},
{
"epoch": 0.017048481619605753,
"grad_norm": 1.4140625,
"learning_rate": 2.4347826086956526e-05,
"loss": 1.7334563732147217,
"step": 64
},
{
"epoch": 0.017581246670218435,
"grad_norm": 1.40625,
"learning_rate": 2.5120772946859905e-05,
"loss": 1.6493436098098755,
"step": 66
},
{
"epoch": 0.018114011720831113,
"grad_norm": 1.4296875,
"learning_rate": 2.5893719806763288e-05,
"loss": 1.6171550750732422,
"step": 68
},
{
"epoch": 0.018646776771443795,
"grad_norm": 1.40625,
"learning_rate": 2.6666666666666667e-05,
"loss": 1.6566861867904663,
"step": 70
},
{
"epoch": 0.019179541822056473,
"grad_norm": 1.4140625,
"learning_rate": 2.7439613526570052e-05,
"loss": 1.5906771421432495,
"step": 72
},
{
"epoch": 0.019712306872669155,
"grad_norm": 1.4296875,
"learning_rate": 2.8212560386473435e-05,
"loss": 1.568983554840088,
"step": 74
},
{
"epoch": 0.020245071923281833,
"grad_norm": 1.3125,
"learning_rate": 2.8985507246376814e-05,
"loss": 1.5367965698242188,
"step": 76
},
{
"epoch": 0.02077783697389451,
"grad_norm": 1.296875,
"learning_rate": 2.9758454106280196e-05,
"loss": 1.5819628238677979,
"step": 78
},
{
"epoch": 0.021310602024507193,
"grad_norm": 1.3515625,
"learning_rate": 3.053140096618358e-05,
"loss": 1.5393096208572388,
"step": 80
},
{
"epoch": 0.02184336707511987,
"grad_norm": 1.3046875,
"learning_rate": 3.130434782608696e-05,
"loss": 1.5357110500335693,
"step": 82
},
{
"epoch": 0.022376132125732553,
"grad_norm": 1.3125,
"learning_rate": 3.207729468599034e-05,
"loss": 1.5443792343139648,
"step": 84
},
{
"epoch": 0.02290889717634523,
"grad_norm": 1.34375,
"learning_rate": 3.2850241545893725e-05,
"loss": 1.5320842266082764,
"step": 86
},
{
"epoch": 0.023441662226957913,
"grad_norm": 1.265625,
"learning_rate": 3.36231884057971e-05,
"loss": 1.4929611682891846,
"step": 88
},
{
"epoch": 0.02397442727757059,
"grad_norm": 1.40625,
"learning_rate": 3.439613526570049e-05,
"loss": 1.4932339191436768,
"step": 90
},
{
"epoch": 0.024507192328183273,
"grad_norm": 1.3671875,
"learning_rate": 3.5169082125603865e-05,
"loss": 1.4885770082473755,
"step": 92
},
{
"epoch": 0.02503995737879595,
"grad_norm": 1.375,
"learning_rate": 3.594202898550725e-05,
"loss": 1.462911605834961,
"step": 94
},
{
"epoch": 0.02557272242940863,
"grad_norm": 1.3125,
"learning_rate": 3.671497584541063e-05,
"loss": 1.4575042724609375,
"step": 96
},
{
"epoch": 0.02610548748002131,
"grad_norm": 1.3125,
"learning_rate": 3.748792270531401e-05,
"loss": 1.4930751323699951,
"step": 98
},
{
"epoch": 0.02663825253063399,
"grad_norm": 1.3203125,
"learning_rate": 3.8260869565217395e-05,
"loss": 1.47833251953125,
"step": 100
},
{
"epoch": 0.02717101758124667,
"grad_norm": 1.3125,
"learning_rate": 3.903381642512078e-05,
"loss": 1.4232707023620605,
"step": 102
},
{
"epoch": 0.02770378263185935,
"grad_norm": 1.3359375,
"learning_rate": 3.980676328502416e-05,
"loss": 1.4310252666473389,
"step": 104
},
{
"epoch": 0.02823654768247203,
"grad_norm": 1.3359375,
"learning_rate": 4.057971014492754e-05,
"loss": 1.4417388439178467,
"step": 106
},
{
"epoch": 0.02876931273308471,
"grad_norm": 1.3515625,
"learning_rate": 4.135265700483092e-05,
"loss": 1.3514190912246704,
"step": 108
},
{
"epoch": 0.02930207778369739,
"grad_norm": 1.3359375,
"learning_rate": 4.2125603864734306e-05,
"loss": 1.4256916046142578,
"step": 110
},
{
"epoch": 0.02983484283431007,
"grad_norm": 1.3203125,
"learning_rate": 4.289855072463769e-05,
"loss": 1.3847278356552124,
"step": 112
},
{
"epoch": 0.030367607884922748,
"grad_norm": 1.375,
"learning_rate": 4.3671497584541064e-05,
"loss": 1.3873201608657837,
"step": 114
},
{
"epoch": 0.03090037293553543,
"grad_norm": 1.3046875,
"learning_rate": 4.444444444444445e-05,
"loss": 1.3949869871139526,
"step": 116
},
{
"epoch": 0.03143313798614811,
"grad_norm": 1.28125,
"learning_rate": 4.521739130434783e-05,
"loss": 1.3873813152313232,
"step": 118
},
{
"epoch": 0.031965903036760786,
"grad_norm": 1.296875,
"learning_rate": 4.599033816425121e-05,
"loss": 1.3549132347106934,
"step": 120
},
{
"epoch": 0.03249866808737347,
"grad_norm": 1.296875,
"learning_rate": 4.6763285024154594e-05,
"loss": 1.402418851852417,
"step": 122
},
{
"epoch": 0.03303143313798615,
"grad_norm": 1.3203125,
"learning_rate": 4.7536231884057976e-05,
"loss": 1.361701250076294,
"step": 124
},
{
"epoch": 0.03356419818859883,
"grad_norm": 1.3359375,
"learning_rate": 4.830917874396136e-05,
"loss": 1.3121955394744873,
"step": 126
},
{
"epoch": 0.034096963239211506,
"grad_norm": 1.453125,
"learning_rate": 4.9082125603864734e-05,
"loss": 1.342367172241211,
"step": 128
},
{
"epoch": 0.03462972828982419,
"grad_norm": 1.3671875,
"learning_rate": 4.9855072463768116e-05,
"loss": 1.3617416620254517,
"step": 130
},
{
"epoch": 0.03516249334043687,
"grad_norm": 1.390625,
"learning_rate": 5.0628019323671505e-05,
"loss": 1.3062248229980469,
"step": 132
},
{
"epoch": 0.03569525839104955,
"grad_norm": 1.265625,
"learning_rate": 5.140096618357488e-05,
"loss": 1.287408471107483,
"step": 134
},
{
"epoch": 0.036228023441662226,
"grad_norm": 1.328125,
"learning_rate": 5.217391304347826e-05,
"loss": 1.323954463005066,
"step": 136
},
{
"epoch": 0.036760788492274904,
"grad_norm": 1.359375,
"learning_rate": 5.294685990338165e-05,
"loss": 1.3211263418197632,
"step": 138
},
{
"epoch": 0.03729355354288759,
"grad_norm": 1.34375,
"learning_rate": 5.371980676328503e-05,
"loss": 1.276440143585205,
"step": 140
},
{
"epoch": 0.03782631859350027,
"grad_norm": 1.3359375,
"learning_rate": 5.449275362318841e-05,
"loss": 1.2463949918746948,
"step": 142
},
{
"epoch": 0.038359083644112946,
"grad_norm": 1.2734375,
"learning_rate": 5.5265700483091786e-05,
"loss": 1.2704516649246216,
"step": 144
},
{
"epoch": 0.038891848694725624,
"grad_norm": 1.34375,
"learning_rate": 5.6038647342995175e-05,
"loss": 1.266597867012024,
"step": 146
},
{
"epoch": 0.03942461374533831,
"grad_norm": 1.328125,
"learning_rate": 5.681159420289856e-05,
"loss": 1.3489875793457031,
"step": 148
},
{
"epoch": 0.03995737879595099,
"grad_norm": 1.296875,
"learning_rate": 5.758454106280193e-05,
"loss": 1.2545640468597412,
"step": 150
},
{
"epoch": 0.040490143846563666,
"grad_norm": 1.2890625,
"learning_rate": 5.835748792270532e-05,
"loss": 1.28780198097229,
"step": 152
},
{
"epoch": 0.041022908897176344,
"grad_norm": 1.421875,
"learning_rate": 5.91304347826087e-05,
"loss": 1.2295873165130615,
"step": 154
},
{
"epoch": 0.04155567394778902,
"grad_norm": 1.2421875,
"learning_rate": 5.990338164251208e-05,
"loss": 1.2586952447891235,
"step": 156
},
{
"epoch": 0.04208843899840171,
"grad_norm": 1.2265625,
"learning_rate": 6.067632850241547e-05,
"loss": 1.2376638650894165,
"step": 158
},
{
"epoch": 0.042621204049014386,
"grad_norm": 1.2265625,
"learning_rate": 6.144927536231884e-05,
"loss": 1.191737413406372,
"step": 160
},
{
"epoch": 0.043153969099627064,
"grad_norm": 1.2421875,
"learning_rate": 6.222222222222223e-05,
"loss": 1.207621455192566,
"step": 162
},
{
"epoch": 0.04368673415023974,
"grad_norm": 1.203125,
"learning_rate": 6.299516908212561e-05,
"loss": 1.2221109867095947,
"step": 164
},
{
"epoch": 0.04421949920085243,
"grad_norm": 1.34375,
"learning_rate": 6.376811594202898e-05,
"loss": 1.2498793601989746,
"step": 166
},
{
"epoch": 0.044752264251465106,
"grad_norm": 1.296875,
"learning_rate": 6.454106280193237e-05,
"loss": 1.2533750534057617,
"step": 168
},
{
"epoch": 0.045285029302077784,
"grad_norm": 1.203125,
"learning_rate": 6.531400966183575e-05,
"loss": 1.2369980812072754,
"step": 170
},
{
"epoch": 0.04581779435269046,
"grad_norm": 1.296875,
"learning_rate": 6.608695652173914e-05,
"loss": 1.199107050895691,
"step": 172
},
{
"epoch": 0.04635055940330314,
"grad_norm": 1.2578125,
"learning_rate": 6.685990338164253e-05,
"loss": 1.2124717235565186,
"step": 174
},
{
"epoch": 0.046883324453915826,
"grad_norm": 1.234375,
"learning_rate": 6.76328502415459e-05,
"loss": 1.2636237144470215,
"step": 176
},
{
"epoch": 0.047416089504528504,
"grad_norm": 1.2578125,
"learning_rate": 6.840579710144928e-05,
"loss": 1.2178090810775757,
"step": 178
},
{
"epoch": 0.04794885455514118,
"grad_norm": 1.1875,
"learning_rate": 6.917874396135265e-05,
"loss": 1.2136110067367554,
"step": 180
},
{
"epoch": 0.04848161960575386,
"grad_norm": 1.2109375,
"learning_rate": 6.995169082125604e-05,
"loss": 1.1956346035003662,
"step": 182
},
{
"epoch": 0.049014384656366546,
"grad_norm": 1.15625,
"learning_rate": 7.072463768115943e-05,
"loss": 1.1326745748519897,
"step": 184
},
{
"epoch": 0.049547149706979224,
"grad_norm": 1.15625,
"learning_rate": 7.149758454106281e-05,
"loss": 1.1829109191894531,
"step": 186
},
{
"epoch": 0.0500799147575919,
"grad_norm": 1.2109375,
"learning_rate": 7.22705314009662e-05,
"loss": 1.1837726831436157,
"step": 188
},
{
"epoch": 0.05061267980820458,
"grad_norm": 1.2421875,
"learning_rate": 7.304347826086957e-05,
"loss": 1.1446757316589355,
"step": 190
},
{
"epoch": 0.05114544485881726,
"grad_norm": 1.203125,
"learning_rate": 7.381642512077295e-05,
"loss": 1.1455689668655396,
"step": 192
},
{
"epoch": 0.051678209909429944,
"grad_norm": 1.1875,
"learning_rate": 7.458937198067634e-05,
"loss": 1.163525104522705,
"step": 194
},
{
"epoch": 0.05221097496004262,
"grad_norm": 1.234375,
"learning_rate": 7.536231884057971e-05,
"loss": 1.2155964374542236,
"step": 196
},
{
"epoch": 0.0527437400106553,
"grad_norm": 1.1953125,
"learning_rate": 7.61352657004831e-05,
"loss": 1.1462944746017456,
"step": 198
},
{
"epoch": 0.05327650506126798,
"grad_norm": 1.234375,
"learning_rate": 7.690821256038648e-05,
"loss": 1.1662917137145996,
"step": 200
},
{
"epoch": 0.053809270111880664,
"grad_norm": 1.2578125,
"learning_rate": 7.768115942028987e-05,
"loss": 1.2034168243408203,
"step": 202
},
{
"epoch": 0.05434203516249334,
"grad_norm": 1.1640625,
"learning_rate": 7.845410628019324e-05,
"loss": 1.0869637727737427,
"step": 204
},
{
"epoch": 0.05487480021310602,
"grad_norm": 1.3046875,
"learning_rate": 7.922705314009662e-05,
"loss": 1.16934072971344,
"step": 206
},
{
"epoch": 0.0554075652637187,
"grad_norm": 1.140625,
"learning_rate": 8e-05,
"loss": 1.2003157138824463,
"step": 208
},
{
"epoch": 0.05594033031433138,
"grad_norm": 1.171875,
"learning_rate": 8e-05,
"loss": 1.1935644149780273,
"step": 210
},
{
"epoch": 0.05647309536494406,
"grad_norm": 1.1953125,
"learning_rate": 8e-05,
"loss": 1.1314443349838257,
"step": 212
},
{
"epoch": 0.05700586041555674,
"grad_norm": 1.09375,
"learning_rate": 8e-05,
"loss": 1.179926872253418,
"step": 214
},
{
"epoch": 0.05753862546616942,
"grad_norm": 1.0859375,
"learning_rate": 8e-05,
"loss": 1.1462368965148926,
"step": 216
},
{
"epoch": 0.0580713905167821,
"grad_norm": 1.109375,
"learning_rate": 8e-05,
"loss": 1.1517932415008545,
"step": 218
},
{
"epoch": 0.05860415556739478,
"grad_norm": 1.171875,
"learning_rate": 8e-05,
"loss": 1.1624958515167236,
"step": 220
},
{
"epoch": 0.05913692061800746,
"grad_norm": 1.171875,
"learning_rate": 8e-05,
"loss": 1.118838906288147,
"step": 222
},
{
"epoch": 0.05966968566862014,
"grad_norm": 1.2109375,
"learning_rate": 8e-05,
"loss": 1.163698434829712,
"step": 224
},
{
"epoch": 0.06020245071923282,
"grad_norm": 1.234375,
"learning_rate": 8e-05,
"loss": 1.1283934116363525,
"step": 226
},
{
"epoch": 0.060735215769845495,
"grad_norm": 1.2265625,
"learning_rate": 8e-05,
"loss": 1.1495060920715332,
"step": 228
},
{
"epoch": 0.06126798082045818,
"grad_norm": 1.1015625,
"learning_rate": 8e-05,
"loss": 1.105478048324585,
"step": 230
},
{
"epoch": 0.06180074587107086,
"grad_norm": 1.109375,
"learning_rate": 8e-05,
"loss": 1.0947949886322021,
"step": 232
},
{
"epoch": 0.06233351092168354,
"grad_norm": 1.0546875,
"learning_rate": 8e-05,
"loss": 1.0812091827392578,
"step": 234
},
{
"epoch": 0.06286627597229622,
"grad_norm": 1.078125,
"learning_rate": 8e-05,
"loss": 1.1314194202423096,
"step": 236
},
{
"epoch": 0.0633990410229089,
"grad_norm": 1.109375,
"learning_rate": 8e-05,
"loss": 1.1186072826385498,
"step": 238
},
{
"epoch": 0.06393180607352157,
"grad_norm": 1.1484375,
"learning_rate": 8e-05,
"loss": 1.1118886470794678,
"step": 240
},
{
"epoch": 0.06446457112413426,
"grad_norm": 1.0546875,
"learning_rate": 8e-05,
"loss": 1.0832873582839966,
"step": 242
},
{
"epoch": 0.06499733617474694,
"grad_norm": 1.0625,
"learning_rate": 8e-05,
"loss": 1.1219228506088257,
"step": 244
},
{
"epoch": 0.06553010122535961,
"grad_norm": 1.1796875,
"learning_rate": 8e-05,
"loss": 1.1250672340393066,
"step": 246
},
{
"epoch": 0.0660628662759723,
"grad_norm": 1.171875,
"learning_rate": 8e-05,
"loss": 1.120064377784729,
"step": 248
},
{
"epoch": 0.06659563132658497,
"grad_norm": 1.125,
"learning_rate": 8e-05,
"loss": 1.0575106143951416,
"step": 250
},
{
"epoch": 0.06712839637719765,
"grad_norm": 1.0546875,
"learning_rate": 8e-05,
"loss": 1.0453040599822998,
"step": 252
},
{
"epoch": 0.06766116142781034,
"grad_norm": 1.0,
"learning_rate": 8e-05,
"loss": 1.0442687273025513,
"step": 254
},
{
"epoch": 0.06819392647842301,
"grad_norm": 1.0390625,
"learning_rate": 8e-05,
"loss": 1.158420205116272,
"step": 256
},
{
"epoch": 0.0687266915290357,
"grad_norm": 1.046875,
"learning_rate": 8e-05,
"loss": 1.140805959701538,
"step": 258
},
{
"epoch": 0.06925945657964838,
"grad_norm": 1.1328125,
"learning_rate": 8e-05,
"loss": 1.1012375354766846,
"step": 260
},
{
"epoch": 0.06979222163026105,
"grad_norm": 1.1015625,
"learning_rate": 8e-05,
"loss": 1.1367709636688232,
"step": 262
},
{
"epoch": 0.07032498668087374,
"grad_norm": 1.03125,
"learning_rate": 8e-05,
"loss": 1.0242253541946411,
"step": 264
},
{
"epoch": 0.07085775173148641,
"grad_norm": 1.1015625,
"learning_rate": 8e-05,
"loss": 1.0810563564300537,
"step": 266
},
{
"epoch": 0.0713905167820991,
"grad_norm": 1.0390625,
"learning_rate": 8e-05,
"loss": 1.0501123666763306,
"step": 268
},
{
"epoch": 0.07192328183271178,
"grad_norm": 1.09375,
"learning_rate": 8e-05,
"loss": 1.0658546686172485,
"step": 270
},
{
"epoch": 0.07245604688332445,
"grad_norm": 1.1015625,
"learning_rate": 8e-05,
"loss": 1.0879520177841187,
"step": 272
},
{
"epoch": 0.07298881193393714,
"grad_norm": 1.15625,
"learning_rate": 8e-05,
"loss": 1.1073130369186401,
"step": 274
},
{
"epoch": 0.07352157698454981,
"grad_norm": 1.015625,
"learning_rate": 8e-05,
"loss": 1.041626214981079,
"step": 276
},
{
"epoch": 0.0740543420351625,
"grad_norm": 0.98828125,
"learning_rate": 8e-05,
"loss": 1.061267375946045,
"step": 278
},
{
"epoch": 0.07458710708577518,
"grad_norm": 0.953125,
"learning_rate": 8e-05,
"loss": 1.0703907012939453,
"step": 280
},
{
"epoch": 0.07511987213638785,
"grad_norm": 1.0234375,
"learning_rate": 8e-05,
"loss": 1.088350772857666,
"step": 282
},
{
"epoch": 0.07565263718700053,
"grad_norm": 0.95703125,
"learning_rate": 8e-05,
"loss": 1.0447893142700195,
"step": 284
},
{
"epoch": 0.0761854022376132,
"grad_norm": 0.95703125,
"learning_rate": 8e-05,
"loss": 1.0158675909042358,
"step": 286
},
{
"epoch": 0.07671816728822589,
"grad_norm": 1.015625,
"learning_rate": 8e-05,
"loss": 1.0666112899780273,
"step": 288
},
{
"epoch": 0.07725093233883858,
"grad_norm": 1.03125,
"learning_rate": 8e-05,
"loss": 1.0512079000473022,
"step": 290
},
{
"epoch": 0.07778369738945125,
"grad_norm": 1.0078125,
"learning_rate": 8e-05,
"loss": 1.0278513431549072,
"step": 292
},
{
"epoch": 0.07831646244006393,
"grad_norm": 0.97265625,
"learning_rate": 8e-05,
"loss": 1.0547468662261963,
"step": 294
},
{
"epoch": 0.07884922749067662,
"grad_norm": 0.94921875,
"learning_rate": 8e-05,
"loss": 0.9834574460983276,
"step": 296
},
{
"epoch": 0.07938199254128929,
"grad_norm": 1.0234375,
"learning_rate": 8e-05,
"loss": 1.0703551769256592,
"step": 298
},
{
"epoch": 0.07991475759190197,
"grad_norm": 0.97265625,
"learning_rate": 8e-05,
"loss": 1.0130079984664917,
"step": 300
},
{
"epoch": 0.08044752264251465,
"grad_norm": 0.9921875,
"learning_rate": 8e-05,
"loss": 1.0876761674880981,
"step": 302
},
{
"epoch": 0.08098028769312733,
"grad_norm": 0.98828125,
"learning_rate": 8e-05,
"loss": 1.0426894426345825,
"step": 304
},
{
"epoch": 0.08151305274374002,
"grad_norm": 0.97265625,
"learning_rate": 8e-05,
"loss": 1.0565141439437866,
"step": 306
},
{
"epoch": 0.08204581779435269,
"grad_norm": 0.94921875,
"learning_rate": 8e-05,
"loss": 1.0379530191421509,
"step": 308
},
{
"epoch": 0.08257858284496537,
"grad_norm": 0.99609375,
"learning_rate": 8e-05,
"loss": 1.0548206567764282,
"step": 310
},
{
"epoch": 0.08311134789557804,
"grad_norm": 1.0234375,
"learning_rate": 8e-05,
"loss": 1.153540015220642,
"step": 312
},
{
"epoch": 0.08364411294619073,
"grad_norm": 1.0625,
"learning_rate": 8e-05,
"loss": 1.063852310180664,
"step": 314
},
{
"epoch": 0.08417687799680341,
"grad_norm": 0.92578125,
"learning_rate": 8e-05,
"loss": 1.0537505149841309,
"step": 316
},
{
"epoch": 0.08470964304741609,
"grad_norm": 0.9375,
"learning_rate": 8e-05,
"loss": 1.0552769899368286,
"step": 318
},
{
"epoch": 0.08524240809802877,
"grad_norm": 0.9375,
"learning_rate": 8e-05,
"loss": 1.0188905000686646,
"step": 320
},
{
"epoch": 0.08577517314864144,
"grad_norm": 0.91015625,
"learning_rate": 8e-05,
"loss": 1.0366284847259521,
"step": 322
},
{
"epoch": 0.08630793819925413,
"grad_norm": 0.9375,
"learning_rate": 8e-05,
"loss": 0.995755136013031,
"step": 324
},
{
"epoch": 0.08684070324986681,
"grad_norm": 0.94140625,
"learning_rate": 8e-05,
"loss": 1.0196154117584229,
"step": 326
},
{
"epoch": 0.08737346830047948,
"grad_norm": 0.9296875,
"learning_rate": 8e-05,
"loss": 0.9898626208305359,
"step": 328
},
{
"epoch": 0.08790623335109217,
"grad_norm": 0.859375,
"learning_rate": 8e-05,
"loss": 0.9742222428321838,
"step": 330
},
{
"epoch": 0.08843899840170485,
"grad_norm": 0.91015625,
"learning_rate": 8e-05,
"loss": 1.0032895803451538,
"step": 332
},
{
"epoch": 0.08897176345231753,
"grad_norm": 0.9140625,
"learning_rate": 8e-05,
"loss": 1.019012212753296,
"step": 334
},
{
"epoch": 0.08950452850293021,
"grad_norm": 0.9453125,
"learning_rate": 8e-05,
"loss": 1.0903642177581787,
"step": 336
},
{
"epoch": 0.09003729355354288,
"grad_norm": 0.91796875,
"learning_rate": 8e-05,
"loss": 1.02820885181427,
"step": 338
},
{
"epoch": 0.09057005860415557,
"grad_norm": 0.93359375,
"learning_rate": 8e-05,
"loss": 1.0357959270477295,
"step": 340
},
{
"epoch": 0.09110282365476825,
"grad_norm": 0.94140625,
"learning_rate": 8e-05,
"loss": 1.0563883781433105,
"step": 342
},
{
"epoch": 0.09163558870538092,
"grad_norm": 0.90625,
"learning_rate": 8e-05,
"loss": 0.9762277603149414,
"step": 344
},
{
"epoch": 0.09216835375599361,
"grad_norm": 0.9140625,
"learning_rate": 8e-05,
"loss": 0.9926705360412598,
"step": 346
},
{
"epoch": 0.09270111880660628,
"grad_norm": 0.92578125,
"learning_rate": 8e-05,
"loss": 1.028612494468689,
"step": 348
},
{
"epoch": 0.09323388385721897,
"grad_norm": 0.90234375,
"learning_rate": 8e-05,
"loss": 1.0039349794387817,
"step": 350
},
{
"epoch": 0.09376664890783165,
"grad_norm": 0.8515625,
"learning_rate": 8e-05,
"loss": 0.9718811511993408,
"step": 352
},
{
"epoch": 0.09429941395844432,
"grad_norm": 0.91015625,
"learning_rate": 8e-05,
"loss": 1.025659203529358,
"step": 354
},
{
"epoch": 0.09483217900905701,
"grad_norm": 0.890625,
"learning_rate": 8e-05,
"loss": 1.0149027109146118,
"step": 356
},
{
"epoch": 0.09536494405966968,
"grad_norm": 0.92578125,
"learning_rate": 8e-05,
"loss": 1.007585048675537,
"step": 358
},
{
"epoch": 0.09589770911028236,
"grad_norm": 0.8828125,
"learning_rate": 8e-05,
"loss": 0.9930945634841919,
"step": 360
},
{
"epoch": 0.09643047416089505,
"grad_norm": 0.8359375,
"learning_rate": 8e-05,
"loss": 0.9892737865447998,
"step": 362
},
{
"epoch": 0.09696323921150772,
"grad_norm": 0.87890625,
"learning_rate": 8e-05,
"loss": 1.0519317388534546,
"step": 364
},
{
"epoch": 0.0974960042621204,
"grad_norm": 0.9140625,
"learning_rate": 8e-05,
"loss": 1.0029200315475464,
"step": 366
},
{
"epoch": 0.09802876931273309,
"grad_norm": 0.8828125,
"learning_rate": 8e-05,
"loss": 0.9893202781677246,
"step": 368
},
{
"epoch": 0.09856153436334576,
"grad_norm": 0.84765625,
"learning_rate": 8e-05,
"loss": 0.9483737945556641,
"step": 370
},
{
"epoch": 0.09909429941395845,
"grad_norm": 1.125,
"learning_rate": 8e-05,
"loss": 1.0160411596298218,
"step": 372
},
{
"epoch": 0.09962706446457112,
"grad_norm": 0.88671875,
"learning_rate": 8e-05,
"loss": 0.9536669254302979,
"step": 374
},
{
"epoch": 0.1001598295151838,
"grad_norm": 0.8984375,
"learning_rate": 8e-05,
"loss": 1.008516788482666,
"step": 376
},
{
"epoch": 0.10069259456579649,
"grad_norm": 0.890625,
"learning_rate": 8e-05,
"loss": 0.9893296957015991,
"step": 378
},
{
"epoch": 0.10122535961640916,
"grad_norm": 0.875,
"learning_rate": 8e-05,
"loss": 1.0012911558151245,
"step": 380
},
{
"epoch": 0.10175812466702185,
"grad_norm": 0.84765625,
"learning_rate": 8e-05,
"loss": 0.9556794166564941,
"step": 382
},
{
"epoch": 0.10229088971763452,
"grad_norm": 0.875,
"learning_rate": 8e-05,
"loss": 0.9937628507614136,
"step": 384
},
{
"epoch": 0.1028236547682472,
"grad_norm": 0.86328125,
"learning_rate": 8e-05,
"loss": 1.0127745866775513,
"step": 386
},
{
"epoch": 0.10335641981885989,
"grad_norm": 0.84375,
"learning_rate": 8e-05,
"loss": 0.983036458492279,
"step": 388
},
{
"epoch": 0.10388918486947256,
"grad_norm": 0.921875,
"learning_rate": 8e-05,
"loss": 0.9836626648902893,
"step": 390
},
{
"epoch": 0.10442194992008524,
"grad_norm": 0.85546875,
"learning_rate": 8e-05,
"loss": 0.9373552203178406,
"step": 392
},
{
"epoch": 0.10495471497069792,
"grad_norm": 0.8515625,
"learning_rate": 8e-05,
"loss": 0.9097200036048889,
"step": 394
},
{
"epoch": 0.1054874800213106,
"grad_norm": 0.828125,
"learning_rate": 8e-05,
"loss": 0.94362872838974,
"step": 396
},
{
"epoch": 0.10602024507192329,
"grad_norm": 0.83203125,
"learning_rate": 8e-05,
"loss": 0.9865043759346008,
"step": 398
},
{
"epoch": 0.10655301012253596,
"grad_norm": 0.90625,
"learning_rate": 8e-05,
"loss": 0.9668699502944946,
"step": 400
},
{
"epoch": 0.10708577517314864,
"grad_norm": 0.8828125,
"learning_rate": 8e-05,
"loss": 0.9932379722595215,
"step": 402
},
{
"epoch": 0.10761854022376133,
"grad_norm": 0.86328125,
"learning_rate": 8e-05,
"loss": 0.9790946245193481,
"step": 404
},
{
"epoch": 0.108151305274374,
"grad_norm": 0.8515625,
"learning_rate": 8e-05,
"loss": 0.9935732483863831,
"step": 406
},
{
"epoch": 0.10868407032498668,
"grad_norm": 0.8515625,
"learning_rate": 8e-05,
"loss": 0.9672642946243286,
"step": 408
},
{
"epoch": 0.10921683537559936,
"grad_norm": 0.828125,
"learning_rate": 8e-05,
"loss": 0.9297552108764648,
"step": 410
},
{
"epoch": 0.10974960042621204,
"grad_norm": 0.8359375,
"learning_rate": 8e-05,
"loss": 0.9727266430854797,
"step": 412
},
{
"epoch": 0.11028236547682473,
"grad_norm": 0.8359375,
"learning_rate": 8e-05,
"loss": 0.9725493788719177,
"step": 414
},
{
"epoch": 0.1108151305274374,
"grad_norm": 0.86328125,
"learning_rate": 8e-05,
"loss": 0.969160258769989,
"step": 416
},
{
"epoch": 0.11134789557805008,
"grad_norm": 0.8359375,
"learning_rate": 8e-05,
"loss": 0.987875759601593,
"step": 418
},
{
"epoch": 0.11188066062866275,
"grad_norm": 0.83203125,
"learning_rate": 8e-05,
"loss": 0.939948320388794,
"step": 420
},
{
"epoch": 0.11241342567927544,
"grad_norm": 0.82421875,
"learning_rate": 8e-05,
"loss": 0.980888307094574,
"step": 422
},
{
"epoch": 0.11294619072988812,
"grad_norm": 0.82421875,
"learning_rate": 8e-05,
"loss": 0.9552009701728821,
"step": 424
},
{
"epoch": 0.1134789557805008,
"grad_norm": 0.84375,
"learning_rate": 8e-05,
"loss": 0.965975821018219,
"step": 426
},
{
"epoch": 0.11401172083111348,
"grad_norm": 0.83203125,
"learning_rate": 8e-05,
"loss": 0.9458581209182739,
"step": 428
},
{
"epoch": 0.11454448588172615,
"grad_norm": 0.81640625,
"learning_rate": 8e-05,
"loss": 0.9712637066841125,
"step": 430
},
{
"epoch": 0.11507725093233884,
"grad_norm": 0.79296875,
"learning_rate": 8e-05,
"loss": 0.957909345626831,
"step": 432
},
{
"epoch": 0.11561001598295152,
"grad_norm": 0.8359375,
"learning_rate": 8e-05,
"loss": 0.9364346861839294,
"step": 434
},
{
"epoch": 0.1161427810335642,
"grad_norm": 0.8046875,
"learning_rate": 8e-05,
"loss": 0.9434120655059814,
"step": 436
},
{
"epoch": 0.11667554608417688,
"grad_norm": 0.83203125,
"learning_rate": 8e-05,
"loss": 0.9406663775444031,
"step": 438
},
{
"epoch": 0.11720831113478956,
"grad_norm": 0.8125,
"learning_rate": 8e-05,
"loss": 0.9373142719268799,
"step": 440
},
{
"epoch": 0.11774107618540224,
"grad_norm": 0.8203125,
"learning_rate": 8e-05,
"loss": 0.935702919960022,
"step": 442
},
{
"epoch": 0.11827384123601492,
"grad_norm": 0.83203125,
"learning_rate": 8e-05,
"loss": 0.9485442638397217,
"step": 444
},
{
"epoch": 0.11880660628662759,
"grad_norm": 0.80859375,
"learning_rate": 8e-05,
"loss": 0.9245492219924927,
"step": 446
},
{
"epoch": 0.11933937133724028,
"grad_norm": 0.8203125,
"learning_rate": 8e-05,
"loss": 0.9705182909965515,
"step": 448
},
{
"epoch": 0.11987213638785296,
"grad_norm": 0.84765625,
"learning_rate": 8e-05,
"loss": 0.9660132527351379,
"step": 450
},
{
"epoch": 0.12040490143846563,
"grad_norm": 0.83203125,
"learning_rate": 8e-05,
"loss": 0.9636243581771851,
"step": 452
},
{
"epoch": 0.12093766648907832,
"grad_norm": 0.79296875,
"learning_rate": 8e-05,
"loss": 0.912445604801178,
"step": 454
},
{
"epoch": 0.12147043153969099,
"grad_norm": 0.8359375,
"learning_rate": 8e-05,
"loss": 0.9414998292922974,
"step": 456
},
{
"epoch": 0.12200319659030368,
"grad_norm": 0.828125,
"learning_rate": 8e-05,
"loss": 0.9331598281860352,
"step": 458
},
{
"epoch": 0.12253596164091636,
"grad_norm": 0.8125,
"learning_rate": 8e-05,
"loss": 0.9329249858856201,
"step": 460
},
{
"epoch": 0.12306872669152903,
"grad_norm": 0.82421875,
"learning_rate": 8e-05,
"loss": 0.9538522958755493,
"step": 462
},
{
"epoch": 0.12360149174214172,
"grad_norm": 0.8046875,
"learning_rate": 8e-05,
"loss": 0.9617863893508911,
"step": 464
},
{
"epoch": 0.1241342567927544,
"grad_norm": 0.7890625,
"learning_rate": 8e-05,
"loss": 0.9089372754096985,
"step": 466
},
{
"epoch": 0.12466702184336707,
"grad_norm": 0.8046875,
"learning_rate": 8e-05,
"loss": 0.9629489779472351,
"step": 468
},
{
"epoch": 0.12519978689397976,
"grad_norm": 0.78515625,
"learning_rate": 8e-05,
"loss": 0.9058244824409485,
"step": 470
},
{
"epoch": 0.12573255194459243,
"grad_norm": 0.76171875,
"learning_rate": 8e-05,
"loss": 0.9134584069252014,
"step": 472
},
{
"epoch": 0.1262653169952051,
"grad_norm": 0.80078125,
"learning_rate": 8e-05,
"loss": 0.9098286628723145,
"step": 474
},
{
"epoch": 0.1267980820458178,
"grad_norm": 0.79296875,
"learning_rate": 8e-05,
"loss": 0.9566776752471924,
"step": 476
},
{
"epoch": 0.12733084709643047,
"grad_norm": 0.8046875,
"learning_rate": 8e-05,
"loss": 0.9700292944908142,
"step": 478
},
{
"epoch": 0.12786361214704314,
"grad_norm": 0.80078125,
"learning_rate": 8e-05,
"loss": 0.9420150518417358,
"step": 480
},
{
"epoch": 0.12839637719765584,
"grad_norm": 0.78125,
"learning_rate": 8e-05,
"loss": 0.9000596404075623,
"step": 482
},
{
"epoch": 0.1289291422482685,
"grad_norm": 0.81640625,
"learning_rate": 8e-05,
"loss": 0.9281507730484009,
"step": 484
},
{
"epoch": 0.12946190729888118,
"grad_norm": 0.84375,
"learning_rate": 8e-05,
"loss": 0.9546276330947876,
"step": 486
},
{
"epoch": 0.12999467234949388,
"grad_norm": 0.796875,
"learning_rate": 8e-05,
"loss": 0.9615715742111206,
"step": 488
},
{
"epoch": 0.13052743740010656,
"grad_norm": 0.8125,
"learning_rate": 8e-05,
"loss": 0.8883857131004333,
"step": 490
},
{
"epoch": 0.13106020245071923,
"grad_norm": 0.796875,
"learning_rate": 8e-05,
"loss": 0.9202597737312317,
"step": 492
},
{
"epoch": 0.13159296750133193,
"grad_norm": 0.83984375,
"learning_rate": 8e-05,
"loss": 0.9927231669425964,
"step": 494
},
{
"epoch": 0.1321257325519446,
"grad_norm": 0.78515625,
"learning_rate": 8e-05,
"loss": 0.9516614675521851,
"step": 496
},
{
"epoch": 0.13265849760255727,
"grad_norm": 0.80859375,
"learning_rate": 8e-05,
"loss": 0.973112940788269,
"step": 498
},
{
"epoch": 0.13319126265316994,
"grad_norm": 0.796875,
"learning_rate": 8e-05,
"loss": 0.9146295189857483,
"step": 500
},
{
"epoch": 0.13372402770378264,
"grad_norm": 0.7734375,
"learning_rate": 8e-05,
"loss": 0.946043074131012,
"step": 502
},
{
"epoch": 0.1342567927543953,
"grad_norm": 0.796875,
"learning_rate": 8e-05,
"loss": 0.9582048654556274,
"step": 504
},
{
"epoch": 0.13478955780500798,
"grad_norm": 0.74609375,
"learning_rate": 8e-05,
"loss": 0.8958991169929504,
"step": 506
},
{
"epoch": 0.13532232285562068,
"grad_norm": 0.8046875,
"learning_rate": 8e-05,
"loss": 0.9187520742416382,
"step": 508
},
{
"epoch": 0.13585508790623335,
"grad_norm": 0.78125,
"learning_rate": 8e-05,
"loss": 0.9344346523284912,
"step": 510
},
{
"epoch": 0.13638785295684602,
"grad_norm": 0.7578125,
"learning_rate": 8e-05,
"loss": 0.8789697289466858,
"step": 512
},
{
"epoch": 0.13692061800745872,
"grad_norm": 0.79296875,
"learning_rate": 8e-05,
"loss": 0.9604220390319824,
"step": 514
},
{
"epoch": 0.1374533830580714,
"grad_norm": 0.8203125,
"learning_rate": 8e-05,
"loss": 0.9376251101493835,
"step": 516
},
{
"epoch": 0.13798614810868406,
"grad_norm": 0.8125,
"learning_rate": 8e-05,
"loss": 0.8872209787368774,
"step": 518
},
{
"epoch": 0.13851891315929676,
"grad_norm": 0.76953125,
"learning_rate": 8e-05,
"loss": 0.935639500617981,
"step": 520
},
{
"epoch": 0.13905167820990944,
"grad_norm": 0.8046875,
"learning_rate": 8e-05,
"loss": 0.9652010202407837,
"step": 522
},
{
"epoch": 0.1395844432605221,
"grad_norm": 0.78515625,
"learning_rate": 8e-05,
"loss": 0.9051821231842041,
"step": 524
},
{
"epoch": 0.14011720831113478,
"grad_norm": 0.796875,
"learning_rate": 8e-05,
"loss": 0.9186902046203613,
"step": 526
},
{
"epoch": 0.14064997336174748,
"grad_norm": 0.81640625,
"learning_rate": 8e-05,
"loss": 0.8455410599708557,
"step": 528
},
{
"epoch": 0.14118273841236015,
"grad_norm": 0.8203125,
"learning_rate": 8e-05,
"loss": 0.9377204775810242,
"step": 530
},
{
"epoch": 0.14171550346297282,
"grad_norm": 0.84375,
"learning_rate": 8e-05,
"loss": 0.9178054332733154,
"step": 532
},
{
"epoch": 0.14224826851358552,
"grad_norm": 0.8359375,
"learning_rate": 8e-05,
"loss": 0.8930121660232544,
"step": 534
},
{
"epoch": 0.1427810335641982,
"grad_norm": 0.79296875,
"learning_rate": 8e-05,
"loss": 0.9158589839935303,
"step": 536
},
{
"epoch": 0.14331379861481086,
"grad_norm": 0.77734375,
"learning_rate": 8e-05,
"loss": 0.925919234752655,
"step": 538
},
{
"epoch": 0.14384656366542356,
"grad_norm": 0.796875,
"learning_rate": 8e-05,
"loss": 0.9652289152145386,
"step": 540
},
{
"epoch": 0.14437932871603623,
"grad_norm": 0.85546875,
"learning_rate": 8e-05,
"loss": 0.9391449093818665,
"step": 542
},
{
"epoch": 0.1449120937666489,
"grad_norm": 0.84375,
"learning_rate": 8e-05,
"loss": 0.9376974701881409,
"step": 544
},
{
"epoch": 0.14544485881726157,
"grad_norm": 0.78515625,
"learning_rate": 8e-05,
"loss": 0.9146329164505005,
"step": 546
},
{
"epoch": 0.14597762386787427,
"grad_norm": 0.7734375,
"learning_rate": 8e-05,
"loss": 0.935990571975708,
"step": 548
},
{
"epoch": 0.14651038891848694,
"grad_norm": 0.83203125,
"learning_rate": 8e-05,
"loss": 0.9762868285179138,
"step": 550
},
{
"epoch": 0.14704315396909962,
"grad_norm": 0.78515625,
"learning_rate": 8e-05,
"loss": 0.9521989822387695,
"step": 552
},
{
"epoch": 0.14757591901971231,
"grad_norm": 0.76171875,
"learning_rate": 8e-05,
"loss": 0.9237980246543884,
"step": 554
},
{
"epoch": 0.148108684070325,
"grad_norm": 0.76953125,
"learning_rate": 8e-05,
"loss": 0.8922250270843506,
"step": 556
},
{
"epoch": 0.14864144912093766,
"grad_norm": 0.796875,
"learning_rate": 8e-05,
"loss": 0.9352413415908813,
"step": 558
},
{
"epoch": 0.14917421417155036,
"grad_norm": 0.83203125,
"learning_rate": 8e-05,
"loss": 0.9463628530502319,
"step": 560
},
{
"epoch": 0.14970697922216303,
"grad_norm": 0.81640625,
"learning_rate": 8e-05,
"loss": 0.8881433606147766,
"step": 562
},
{
"epoch": 0.1502397442727757,
"grad_norm": 0.81640625,
"learning_rate": 8e-05,
"loss": 0.9072080254554749,
"step": 564
},
{
"epoch": 0.1507725093233884,
"grad_norm": 0.7578125,
"learning_rate": 8e-05,
"loss": 0.900499701499939,
"step": 566
},
{
"epoch": 0.15130527437400107,
"grad_norm": 0.75,
"learning_rate": 8e-05,
"loss": 0.9123456478118896,
"step": 568
},
{
"epoch": 0.15183803942461374,
"grad_norm": 0.78515625,
"learning_rate": 8e-05,
"loss": 0.8801102638244629,
"step": 570
},
{
"epoch": 0.1523708044752264,
"grad_norm": 0.78125,
"learning_rate": 8e-05,
"loss": 0.8868352174758911,
"step": 572
},
{
"epoch": 0.1529035695258391,
"grad_norm": 0.75,
"learning_rate": 8e-05,
"loss": 0.8904232978820801,
"step": 574
},
{
"epoch": 0.15343633457645178,
"grad_norm": 0.7890625,
"learning_rate": 8e-05,
"loss": 0.9351356625556946,
"step": 576
},
{
"epoch": 0.15396909962706445,
"grad_norm": 0.8125,
"learning_rate": 8e-05,
"loss": 0.949609100818634,
"step": 578
},
{
"epoch": 0.15450186467767715,
"grad_norm": 0.8046875,
"learning_rate": 8e-05,
"loss": 0.9633817076683044,
"step": 580
},
{
"epoch": 0.15503462972828982,
"grad_norm": 0.765625,
"learning_rate": 8e-05,
"loss": 0.8968989253044128,
"step": 582
},
{
"epoch": 0.1555673947789025,
"grad_norm": 0.7734375,
"learning_rate": 8e-05,
"loss": 0.9304310083389282,
"step": 584
},
{
"epoch": 0.1561001598295152,
"grad_norm": 0.74609375,
"learning_rate": 8e-05,
"loss": 0.9069873690605164,
"step": 586
},
{
"epoch": 0.15663292488012787,
"grad_norm": 0.8125,
"learning_rate": 8e-05,
"loss": 0.9126654863357544,
"step": 588
},
{
"epoch": 0.15716568993074054,
"grad_norm": 0.7578125,
"learning_rate": 8e-05,
"loss": 0.8959242105484009,
"step": 590
},
{
"epoch": 0.15769845498135324,
"grad_norm": 0.78125,
"learning_rate": 8e-05,
"loss": 0.8930934071540833,
"step": 592
},
{
"epoch": 0.1582312200319659,
"grad_norm": 0.796875,
"learning_rate": 8e-05,
"loss": 0.9553401470184326,
"step": 594
},
{
"epoch": 0.15876398508257858,
"grad_norm": 0.77734375,
"learning_rate": 8e-05,
"loss": 0.9127396941184998,
"step": 596
},
{
"epoch": 0.15929675013319125,
"grad_norm": 0.7578125,
"learning_rate": 8e-05,
"loss": 0.8723542094230652,
"step": 598
},
{
"epoch": 0.15982951518380395,
"grad_norm": 0.78515625,
"learning_rate": 8e-05,
"loss": 0.9096969366073608,
"step": 600
},
{
"epoch": 0.16036228023441662,
"grad_norm": 0.7734375,
"learning_rate": 8e-05,
"loss": 0.8800921440124512,
"step": 602
},
{
"epoch": 0.1608950452850293,
"grad_norm": 0.80078125,
"learning_rate": 8e-05,
"loss": 0.9251409769058228,
"step": 604
},
{
"epoch": 0.161427810335642,
"grad_norm": 0.796875,
"learning_rate": 8e-05,
"loss": 0.8803302049636841,
"step": 606
},
{
"epoch": 0.16196057538625466,
"grad_norm": 0.7734375,
"learning_rate": 8e-05,
"loss": 0.8997635841369629,
"step": 608
},
{
"epoch": 0.16249334043686733,
"grad_norm": 0.78125,
"learning_rate": 8e-05,
"loss": 0.9180471301078796,
"step": 610
},
{
"epoch": 0.16302610548748003,
"grad_norm": 0.84375,
"learning_rate": 8e-05,
"loss": 0.9325738549232483,
"step": 612
},
{
"epoch": 0.1635588705380927,
"grad_norm": 0.7890625,
"learning_rate": 8e-05,
"loss": 0.8572644591331482,
"step": 614
},
{
"epoch": 0.16409163558870538,
"grad_norm": 0.7890625,
"learning_rate": 8e-05,
"loss": 0.9184677004814148,
"step": 616
},
{
"epoch": 0.16462440063931805,
"grad_norm": 0.80078125,
"learning_rate": 8e-05,
"loss": 0.951015830039978,
"step": 618
},
{
"epoch": 0.16515716568993075,
"grad_norm": 0.8046875,
"learning_rate": 8e-05,
"loss": 0.9111671447753906,
"step": 620
},
{
"epoch": 0.16568993074054342,
"grad_norm": 0.80078125,
"learning_rate": 8e-05,
"loss": 0.9108637571334839,
"step": 622
},
{
"epoch": 0.1662226957911561,
"grad_norm": 0.7734375,
"learning_rate": 8e-05,
"loss": 0.886138379573822,
"step": 624
},
{
"epoch": 0.1667554608417688,
"grad_norm": 0.765625,
"learning_rate": 8e-05,
"loss": 0.9077087640762329,
"step": 626
},
{
"epoch": 0.16728822589238146,
"grad_norm": 0.78515625,
"learning_rate": 8e-05,
"loss": 0.8817453384399414,
"step": 628
},
{
"epoch": 0.16782099094299413,
"grad_norm": 0.83984375,
"learning_rate": 8e-05,
"loss": 0.9480854272842407,
"step": 630
},
{
"epoch": 0.16835375599360683,
"grad_norm": 0.80078125,
"learning_rate": 8e-05,
"loss": 0.8651038408279419,
"step": 632
},
{
"epoch": 0.1688865210442195,
"grad_norm": 0.80078125,
"learning_rate": 8e-05,
"loss": 0.8813058733940125,
"step": 634
},
{
"epoch": 0.16941928609483217,
"grad_norm": 0.78515625,
"learning_rate": 8e-05,
"loss": 0.9414123296737671,
"step": 636
},
{
"epoch": 0.16995205114544487,
"grad_norm": 0.7734375,
"learning_rate": 8e-05,
"loss": 0.8974052667617798,
"step": 638
},
{
"epoch": 0.17048481619605754,
"grad_norm": 0.78515625,
"learning_rate": 8e-05,
"loss": 0.9421342015266418,
"step": 640
},
{
"epoch": 0.17101758124667021,
"grad_norm": 0.75,
"learning_rate": 8e-05,
"loss": 0.8592175841331482,
"step": 642
},
{
"epoch": 0.17155034629728289,
"grad_norm": 0.7578125,
"learning_rate": 8e-05,
"loss": 0.8520596027374268,
"step": 644
},
{
"epoch": 0.17208311134789558,
"grad_norm": 0.80859375,
"learning_rate": 8e-05,
"loss": 0.8983196020126343,
"step": 646
},
{
"epoch": 0.17261587639850826,
"grad_norm": 0.7578125,
"learning_rate": 8e-05,
"loss": 0.873570442199707,
"step": 648
},
{
"epoch": 0.17314864144912093,
"grad_norm": 0.75390625,
"learning_rate": 8e-05,
"loss": 0.9159509539604187,
"step": 650
},
{
"epoch": 0.17368140649973363,
"grad_norm": 0.78125,
"learning_rate": 8e-05,
"loss": 0.8728219270706177,
"step": 652
},
{
"epoch": 0.1742141715503463,
"grad_norm": 0.7734375,
"learning_rate": 8e-05,
"loss": 0.9193546175956726,
"step": 654
},
{
"epoch": 0.17474693660095897,
"grad_norm": 0.76953125,
"learning_rate": 8e-05,
"loss": 0.8523741364479065,
"step": 656
},
{
"epoch": 0.17527970165157167,
"grad_norm": 0.76953125,
"learning_rate": 8e-05,
"loss": 0.9125362038612366,
"step": 658
},
{
"epoch": 0.17581246670218434,
"grad_norm": 0.78125,
"learning_rate": 8e-05,
"loss": 0.9080352783203125,
"step": 660
},
{
"epoch": 0.176345231752797,
"grad_norm": 0.73046875,
"learning_rate": 8e-05,
"loss": 0.8742219805717468,
"step": 662
},
{
"epoch": 0.1768779968034097,
"grad_norm": 0.79296875,
"learning_rate": 8e-05,
"loss": 0.9006013870239258,
"step": 664
},
{
"epoch": 0.17741076185402238,
"grad_norm": 0.7734375,
"learning_rate": 8e-05,
"loss": 0.8835248947143555,
"step": 666
},
{
"epoch": 0.17794352690463505,
"grad_norm": 0.78125,
"learning_rate": 8e-05,
"loss": 0.8771501183509827,
"step": 668
},
{
"epoch": 0.17847629195524772,
"grad_norm": 0.78515625,
"learning_rate": 8e-05,
"loss": 0.9349658489227295,
"step": 670
},
{
"epoch": 0.17900905700586042,
"grad_norm": 0.73046875,
"learning_rate": 8e-05,
"loss": 0.903891921043396,
"step": 672
},
{
"epoch": 0.1795418220564731,
"grad_norm": 0.76953125,
"learning_rate": 8e-05,
"loss": 0.905728816986084,
"step": 674
},
{
"epoch": 0.18007458710708577,
"grad_norm": 0.75390625,
"learning_rate": 8e-05,
"loss": 0.8719464540481567,
"step": 676
},
{
"epoch": 0.18060735215769846,
"grad_norm": 0.7421875,
"learning_rate": 8e-05,
"loss": 0.9010811448097229,
"step": 678
},
{
"epoch": 0.18114011720831114,
"grad_norm": 0.75390625,
"learning_rate": 8e-05,
"loss": 0.8699101805686951,
"step": 680
},
{
"epoch": 0.1816728822589238,
"grad_norm": 0.7421875,
"learning_rate": 8e-05,
"loss": 0.8923678994178772,
"step": 682
},
{
"epoch": 0.1822056473095365,
"grad_norm": 0.76953125,
"learning_rate": 8e-05,
"loss": 0.8914515972137451,
"step": 684
},
{
"epoch": 0.18273841236014918,
"grad_norm": 0.796875,
"learning_rate": 8e-05,
"loss": 0.8682945966720581,
"step": 686
},
{
"epoch": 0.18327117741076185,
"grad_norm": 0.8203125,
"learning_rate": 8e-05,
"loss": 0.9265104532241821,
"step": 688
},
{
"epoch": 0.18380394246137455,
"grad_norm": 0.7421875,
"learning_rate": 8e-05,
"loss": 0.9007678031921387,
"step": 690
},
{
"epoch": 0.18433670751198722,
"grad_norm": 0.75,
"learning_rate": 8e-05,
"loss": 0.8766607642173767,
"step": 692
},
{
"epoch": 0.1848694725625999,
"grad_norm": 0.78125,
"learning_rate": 8e-05,
"loss": 0.8730136752128601,
"step": 694
},
{
"epoch": 0.18540223761321256,
"grad_norm": 0.78515625,
"learning_rate": 8e-05,
"loss": 0.9283718466758728,
"step": 696
},
{
"epoch": 0.18593500266382526,
"grad_norm": 0.76953125,
"learning_rate": 8e-05,
"loss": 0.8665472865104675,
"step": 698
},
{
"epoch": 0.18646776771443793,
"grad_norm": 0.7265625,
"learning_rate": 8e-05,
"loss": 0.8720895648002625,
"step": 700
},
{
"epoch": 0.1870005327650506,
"grad_norm": 0.7421875,
"learning_rate": 8e-05,
"loss": 0.8716840744018555,
"step": 702
},
{
"epoch": 0.1875332978156633,
"grad_norm": 0.80078125,
"learning_rate": 8e-05,
"loss": 0.893147349357605,
"step": 704
},
{
"epoch": 0.18806606286627597,
"grad_norm": 0.796875,
"learning_rate": 8e-05,
"loss": 0.8979114890098572,
"step": 706
},
{
"epoch": 0.18859882791688865,
"grad_norm": 0.73046875,
"learning_rate": 8e-05,
"loss": 0.8585438132286072,
"step": 708
},
{
"epoch": 0.18913159296750134,
"grad_norm": 0.7578125,
"learning_rate": 8e-05,
"loss": 0.9044113159179688,
"step": 710
},
{
"epoch": 0.18966435801811402,
"grad_norm": 0.84765625,
"learning_rate": 8e-05,
"loss": 0.920305073261261,
"step": 712
},
{
"epoch": 0.1901971230687267,
"grad_norm": 0.76171875,
"learning_rate": 8e-05,
"loss": 0.8698307275772095,
"step": 714
},
{
"epoch": 0.19072988811933936,
"grad_norm": 0.7734375,
"learning_rate": 8e-05,
"loss": 0.8913177847862244,
"step": 716
},
{
"epoch": 0.19126265316995206,
"grad_norm": 0.73046875,
"learning_rate": 8e-05,
"loss": 0.8886463642120361,
"step": 718
},
{
"epoch": 0.19179541822056473,
"grad_norm": 0.7578125,
"learning_rate": 8e-05,
"loss": 0.8651049733161926,
"step": 720
},
{
"epoch": 0.1923281832711774,
"grad_norm": 0.7421875,
"learning_rate": 8e-05,
"loss": 0.9027085900306702,
"step": 722
},
{
"epoch": 0.1928609483217901,
"grad_norm": 0.71484375,
"learning_rate": 8e-05,
"loss": 0.8272101283073425,
"step": 724
},
{
"epoch": 0.19339371337240277,
"grad_norm": 0.7421875,
"learning_rate": 8e-05,
"loss": 0.9042779207229614,
"step": 726
},
{
"epoch": 0.19392647842301544,
"grad_norm": 0.71484375,
"learning_rate": 8e-05,
"loss": 0.8599862456321716,
"step": 728
},
{
"epoch": 0.19445924347362814,
"grad_norm": 0.70703125,
"learning_rate": 8e-05,
"loss": 0.8881500959396362,
"step": 730
},
{
"epoch": 0.1949920085242408,
"grad_norm": 0.7421875,
"learning_rate": 8e-05,
"loss": 0.8275444507598877,
"step": 732
},
{
"epoch": 0.19552477357485348,
"grad_norm": 0.73046875,
"learning_rate": 8e-05,
"loss": 0.8692565560340881,
"step": 734
},
{
"epoch": 0.19605753862546618,
"grad_norm": 0.74609375,
"learning_rate": 8e-05,
"loss": 0.9346656799316406,
"step": 736
},
{
"epoch": 0.19659030367607885,
"grad_norm": 0.6953125,
"learning_rate": 8e-05,
"loss": 0.8541794419288635,
"step": 738
},
{
"epoch": 0.19712306872669152,
"grad_norm": 0.76171875,
"learning_rate": 8e-05,
"loss": 0.8708451986312866,
"step": 740
},
{
"epoch": 0.1976558337773042,
"grad_norm": 0.75,
"learning_rate": 8e-05,
"loss": 0.8439049124717712,
"step": 742
},
{
"epoch": 0.1981885988279169,
"grad_norm": 0.7890625,
"learning_rate": 8e-05,
"loss": 0.8625826835632324,
"step": 744
},
{
"epoch": 0.19872136387852957,
"grad_norm": 0.734375,
"learning_rate": 8e-05,
"loss": 0.9517384171485901,
"step": 746
},
{
"epoch": 0.19925412892914224,
"grad_norm": 0.73828125,
"learning_rate": 8e-05,
"loss": 0.9006468057632446,
"step": 748
},
{
"epoch": 0.19978689397975494,
"grad_norm": 0.75390625,
"learning_rate": 8e-05,
"loss": 0.9177417159080505,
"step": 750
},
{
"epoch": 0.2003196590303676,
"grad_norm": 0.7421875,
"learning_rate": 8e-05,
"loss": 0.9008771181106567,
"step": 752
},
{
"epoch": 0.20085242408098028,
"grad_norm": 0.7578125,
"learning_rate": 8e-05,
"loss": 0.8759271502494812,
"step": 754
},
{
"epoch": 0.20138518913159298,
"grad_norm": 0.75390625,
"learning_rate": 8e-05,
"loss": 0.8917984962463379,
"step": 756
},
{
"epoch": 0.20191795418220565,
"grad_norm": 0.7421875,
"learning_rate": 8e-05,
"loss": 0.8270426988601685,
"step": 758
},
{
"epoch": 0.20245071923281832,
"grad_norm": 0.71484375,
"learning_rate": 8e-05,
"loss": 0.8944893479347229,
"step": 760
},
{
"epoch": 0.20298348428343102,
"grad_norm": 0.72265625,
"learning_rate": 8e-05,
"loss": 0.8563660383224487,
"step": 762
},
{
"epoch": 0.2035162493340437,
"grad_norm": 0.765625,
"learning_rate": 8e-05,
"loss": 0.9090912938117981,
"step": 764
},
{
"epoch": 0.20404901438465636,
"grad_norm": 0.7265625,
"learning_rate": 8e-05,
"loss": 0.8511034250259399,
"step": 766
},
{
"epoch": 0.20458177943526903,
"grad_norm": 0.76953125,
"learning_rate": 8e-05,
"loss": 0.8830511569976807,
"step": 768
},
{
"epoch": 0.20511454448588173,
"grad_norm": 0.76953125,
"learning_rate": 8e-05,
"loss": 0.8611262440681458,
"step": 770
},
{
"epoch": 0.2056473095364944,
"grad_norm": 0.7578125,
"learning_rate": 8e-05,
"loss": 0.8874700665473938,
"step": 772
},
{
"epoch": 0.20618007458710708,
"grad_norm": 0.7265625,
"learning_rate": 8e-05,
"loss": 0.8730615377426147,
"step": 774
},
{
"epoch": 0.20671283963771978,
"grad_norm": 0.74609375,
"learning_rate": 8e-05,
"loss": 0.8800356388092041,
"step": 776
},
{
"epoch": 0.20724560468833245,
"grad_norm": 0.734375,
"learning_rate": 8e-05,
"loss": 0.8427203893661499,
"step": 778
},
{
"epoch": 0.20777836973894512,
"grad_norm": 0.72265625,
"learning_rate": 8e-05,
"loss": 0.8761508464813232,
"step": 780
},
{
"epoch": 0.20831113478955782,
"grad_norm": 0.71484375,
"learning_rate": 8e-05,
"loss": 0.8280084729194641,
"step": 782
},
{
"epoch": 0.2088438998401705,
"grad_norm": 0.74609375,
"learning_rate": 8e-05,
"loss": 0.8750775456428528,
"step": 784
},
{
"epoch": 0.20937666489078316,
"grad_norm": 0.73046875,
"learning_rate": 8e-05,
"loss": 0.8673681020736694,
"step": 786
},
{
"epoch": 0.20990942994139583,
"grad_norm": 0.734375,
"learning_rate": 8e-05,
"loss": 0.8897008299827576,
"step": 788
},
{
"epoch": 0.21044219499200853,
"grad_norm": 0.74609375,
"learning_rate": 8e-05,
"loss": 0.8720916509628296,
"step": 790
},
{
"epoch": 0.2109749600426212,
"grad_norm": 0.69140625,
"learning_rate": 8e-05,
"loss": 0.8478310108184814,
"step": 792
},
{
"epoch": 0.21150772509323387,
"grad_norm": 0.7265625,
"learning_rate": 8e-05,
"loss": 0.8465595841407776,
"step": 794
},
{
"epoch": 0.21204049014384657,
"grad_norm": 0.765625,
"learning_rate": 8e-05,
"loss": 0.8860556483268738,
"step": 796
},
{
"epoch": 0.21257325519445924,
"grad_norm": 0.7265625,
"learning_rate": 8e-05,
"loss": 0.8419367074966431,
"step": 798
},
{
"epoch": 0.21310602024507191,
"grad_norm": 0.7265625,
"learning_rate": 8e-05,
"loss": 0.8694333434104919,
"step": 800
},
{
"epoch": 0.2136387852956846,
"grad_norm": 0.71484375,
"learning_rate": 8e-05,
"loss": 0.8959736227989197,
"step": 802
},
{
"epoch": 0.21417155034629728,
"grad_norm": 0.7265625,
"learning_rate": 8e-05,
"loss": 0.8992412090301514,
"step": 804
},
{
"epoch": 0.21470431539690996,
"grad_norm": 0.734375,
"learning_rate": 8e-05,
"loss": 0.8671194314956665,
"step": 806
},
{
"epoch": 0.21523708044752266,
"grad_norm": 0.7109375,
"learning_rate": 8e-05,
"loss": 0.8547959327697754,
"step": 808
},
{
"epoch": 0.21576984549813533,
"grad_norm": 0.703125,
"learning_rate": 8e-05,
"loss": 0.8758715987205505,
"step": 810
},
{
"epoch": 0.216302610548748,
"grad_norm": 0.71875,
"learning_rate": 8e-05,
"loss": 0.8950058221817017,
"step": 812
},
{
"epoch": 0.21683537559936067,
"grad_norm": 0.74609375,
"learning_rate": 8e-05,
"loss": 0.8568795323371887,
"step": 814
},
{
"epoch": 0.21736814064997337,
"grad_norm": 0.734375,
"learning_rate": 8e-05,
"loss": 0.903777003288269,
"step": 816
},
{
"epoch": 0.21790090570058604,
"grad_norm": 0.73828125,
"learning_rate": 8e-05,
"loss": 0.8380757570266724,
"step": 818
},
{
"epoch": 0.2184336707511987,
"grad_norm": 0.72265625,
"learning_rate": 8e-05,
"loss": 0.851688802242279,
"step": 820
},
{
"epoch": 0.2189664358018114,
"grad_norm": 0.71875,
"learning_rate": 8e-05,
"loss": 0.8432849645614624,
"step": 822
},
{
"epoch": 0.21949920085242408,
"grad_norm": 0.765625,
"learning_rate": 8e-05,
"loss": 0.9238657355308533,
"step": 824
},
{
"epoch": 0.22003196590303675,
"grad_norm": 0.71875,
"learning_rate": 8e-05,
"loss": 0.8367424607276917,
"step": 826
},
{
"epoch": 0.22056473095364945,
"grad_norm": 0.7265625,
"learning_rate": 8e-05,
"loss": 0.8334730863571167,
"step": 828
},
{
"epoch": 0.22109749600426212,
"grad_norm": 0.703125,
"learning_rate": 8e-05,
"loss": 0.8405196666717529,
"step": 830
},
{
"epoch": 0.2216302610548748,
"grad_norm": 0.71484375,
"learning_rate": 8e-05,
"loss": 0.8685415983200073,
"step": 832
},
{
"epoch": 0.2221630261054875,
"grad_norm": 0.71875,
"learning_rate": 8e-05,
"loss": 0.8226058483123779,
"step": 834
},
{
"epoch": 0.22269579115610016,
"grad_norm": 0.734375,
"learning_rate": 8e-05,
"loss": 0.8686912059783936,
"step": 836
},
{
"epoch": 0.22322855620671284,
"grad_norm": 0.72265625,
"learning_rate": 8e-05,
"loss": 0.8767325282096863,
"step": 838
},
{
"epoch": 0.2237613212573255,
"grad_norm": 0.7265625,
"learning_rate": 8e-05,
"loss": 0.8393811583518982,
"step": 840
},
{
"epoch": 0.2242940863079382,
"grad_norm": 0.7265625,
"learning_rate": 8e-05,
"loss": 0.8124199509620667,
"step": 842
},
{
"epoch": 0.22482685135855088,
"grad_norm": 0.73046875,
"learning_rate": 8e-05,
"loss": 0.8643161654472351,
"step": 844
},
{
"epoch": 0.22535961640916355,
"grad_norm": 0.72265625,
"learning_rate": 8e-05,
"loss": 0.8218422532081604,
"step": 846
},
{
"epoch": 0.22589238145977625,
"grad_norm": 0.7578125,
"learning_rate": 8e-05,
"loss": 0.8444434404373169,
"step": 848
},
{
"epoch": 0.22642514651038892,
"grad_norm": 0.75,
"learning_rate": 8e-05,
"loss": 0.9049234390258789,
"step": 850
},
{
"epoch": 0.2269579115610016,
"grad_norm": 0.7265625,
"learning_rate": 8e-05,
"loss": 0.8443453311920166,
"step": 852
},
{
"epoch": 0.2274906766116143,
"grad_norm": 0.7578125,
"learning_rate": 8e-05,
"loss": 0.8720162510871887,
"step": 854
},
{
"epoch": 0.22802344166222696,
"grad_norm": 0.71875,
"learning_rate": 8e-05,
"loss": 0.8342218399047852,
"step": 856
},
{
"epoch": 0.22855620671283963,
"grad_norm": 0.734375,
"learning_rate": 8e-05,
"loss": 0.816758394241333,
"step": 858
},
{
"epoch": 0.2290889717634523,
"grad_norm": 0.70703125,
"learning_rate": 8e-05,
"loss": 0.834097683429718,
"step": 860
},
{
"epoch": 0.229621736814065,
"grad_norm": 0.71484375,
"learning_rate": 8e-05,
"loss": 0.8916500806808472,
"step": 862
},
{
"epoch": 0.23015450186467767,
"grad_norm": 0.71484375,
"learning_rate": 8e-05,
"loss": 0.8376821875572205,
"step": 864
},
{
"epoch": 0.23068726691529035,
"grad_norm": 0.734375,
"learning_rate": 8e-05,
"loss": 0.8754009008407593,
"step": 866
},
{
"epoch": 0.23122003196590304,
"grad_norm": 0.70703125,
"learning_rate": 8e-05,
"loss": 0.8596115708351135,
"step": 868
},
{
"epoch": 0.23175279701651572,
"grad_norm": 0.69921875,
"learning_rate": 8e-05,
"loss": 0.8355474472045898,
"step": 870
},
{
"epoch": 0.2322855620671284,
"grad_norm": 0.75390625,
"learning_rate": 8e-05,
"loss": 0.876761257648468,
"step": 872
},
{
"epoch": 0.2328183271177411,
"grad_norm": 0.73046875,
"learning_rate": 8e-05,
"loss": 0.8948037624359131,
"step": 874
},
{
"epoch": 0.23335109216835376,
"grad_norm": 0.7734375,
"learning_rate": 8e-05,
"loss": 0.8517124056816101,
"step": 876
},
{
"epoch": 0.23388385721896643,
"grad_norm": 0.72265625,
"learning_rate": 8e-05,
"loss": 0.8235978484153748,
"step": 878
},
{
"epoch": 0.23441662226957913,
"grad_norm": 0.72265625,
"learning_rate": 8e-05,
"loss": 0.8589498400688171,
"step": 880
},
{
"epoch": 0.2349493873201918,
"grad_norm": 0.71484375,
"learning_rate": 8e-05,
"loss": 0.8652699589729309,
"step": 882
},
{
"epoch": 0.23548215237080447,
"grad_norm": 0.72265625,
"learning_rate": 8e-05,
"loss": 0.8380342125892639,
"step": 884
},
{
"epoch": 0.23601491742141714,
"grad_norm": 0.79296875,
"learning_rate": 8e-05,
"loss": 0.8523828387260437,
"step": 886
},
{
"epoch": 0.23654768247202984,
"grad_norm": 0.7265625,
"learning_rate": 8e-05,
"loss": 0.9019739627838135,
"step": 888
},
{
"epoch": 0.2370804475226425,
"grad_norm": 0.71484375,
"learning_rate": 8e-05,
"loss": 0.8879658579826355,
"step": 890
},
{
"epoch": 0.23761321257325518,
"grad_norm": 0.71484375,
"learning_rate": 8e-05,
"loss": 0.8675874471664429,
"step": 892
},
{
"epoch": 0.23814597762386788,
"grad_norm": 0.73828125,
"learning_rate": 8e-05,
"loss": 0.8754541873931885,
"step": 894
},
{
"epoch": 0.23867874267448055,
"grad_norm": 0.71484375,
"learning_rate": 8e-05,
"loss": 0.8880608081817627,
"step": 896
},
{
"epoch": 0.23921150772509323,
"grad_norm": 0.75390625,
"learning_rate": 8e-05,
"loss": 0.8680922985076904,
"step": 898
},
{
"epoch": 0.23974427277570592,
"grad_norm": 0.7265625,
"learning_rate": 8e-05,
"loss": 0.8829432129859924,
"step": 900
},
{
"epoch": 0.2402770378263186,
"grad_norm": 0.7109375,
"learning_rate": 8e-05,
"loss": 0.8691644072532654,
"step": 902
},
{
"epoch": 0.24080980287693127,
"grad_norm": 0.73046875,
"learning_rate": 8e-05,
"loss": 0.8773077130317688,
"step": 904
},
{
"epoch": 0.24134256792754397,
"grad_norm": 0.70703125,
"learning_rate": 8e-05,
"loss": 0.8335128426551819,
"step": 906
},
{
"epoch": 0.24187533297815664,
"grad_norm": 0.703125,
"learning_rate": 8e-05,
"loss": 0.8632769584655762,
"step": 908
},
{
"epoch": 0.2424080980287693,
"grad_norm": 0.6796875,
"learning_rate": 8e-05,
"loss": 0.8212178945541382,
"step": 910
},
{
"epoch": 0.24294086307938198,
"grad_norm": 0.71875,
"learning_rate": 8e-05,
"loss": 0.8350695967674255,
"step": 912
},
{
"epoch": 0.24347362812999468,
"grad_norm": 0.7109375,
"learning_rate": 8e-05,
"loss": 0.8406141996383667,
"step": 914
},
{
"epoch": 0.24400639318060735,
"grad_norm": 0.7265625,
"learning_rate": 8e-05,
"loss": 0.8700868487358093,
"step": 916
},
{
"epoch": 0.24453915823122002,
"grad_norm": 0.71484375,
"learning_rate": 8e-05,
"loss": 0.8683998584747314,
"step": 918
},
{
"epoch": 0.24507192328183272,
"grad_norm": 0.76171875,
"learning_rate": 8e-05,
"loss": 0.8839133977890015,
"step": 920
},
{
"epoch": 0.2456046883324454,
"grad_norm": 0.69921875,
"learning_rate": 8e-05,
"loss": 0.8943023085594177,
"step": 922
},
{
"epoch": 0.24613745338305806,
"grad_norm": 0.70703125,
"learning_rate": 8e-05,
"loss": 0.8632996082305908,
"step": 924
},
{
"epoch": 0.24667021843367076,
"grad_norm": 0.69921875,
"learning_rate": 8e-05,
"loss": 0.833712637424469,
"step": 926
},
{
"epoch": 0.24720298348428343,
"grad_norm": 0.73046875,
"learning_rate": 8e-05,
"loss": 0.864801287651062,
"step": 928
},
{
"epoch": 0.2477357485348961,
"grad_norm": 0.765625,
"learning_rate": 8e-05,
"loss": 0.866331160068512,
"step": 930
},
{
"epoch": 0.2482685135855088,
"grad_norm": 0.671875,
"learning_rate": 8e-05,
"loss": 0.8194761872291565,
"step": 932
},
{
"epoch": 0.24880127863612148,
"grad_norm": 0.73828125,
"learning_rate": 8e-05,
"loss": 0.8057902455329895,
"step": 934
},
{
"epoch": 0.24933404368673415,
"grad_norm": 0.7109375,
"learning_rate": 8e-05,
"loss": 0.8366430997848511,
"step": 936
},
{
"epoch": 0.24986680873734682,
"grad_norm": 0.71875,
"learning_rate": 8e-05,
"loss": 0.8721463084220886,
"step": 938
},
{
"epoch": 0.2503995737879595,
"grad_norm": 0.69921875,
"learning_rate": 8e-05,
"loss": 0.867581844329834,
"step": 940
},
{
"epoch": 0.2509323388385722,
"grad_norm": 0.71875,
"learning_rate": 8e-05,
"loss": 0.8661463856697083,
"step": 942
},
{
"epoch": 0.25146510388918486,
"grad_norm": 0.6796875,
"learning_rate": 8e-05,
"loss": 0.8397172093391418,
"step": 944
},
{
"epoch": 0.25199786893979753,
"grad_norm": 0.734375,
"learning_rate": 8e-05,
"loss": 0.8674642443656921,
"step": 946
},
{
"epoch": 0.2525306339904102,
"grad_norm": 0.71484375,
"learning_rate": 8e-05,
"loss": 0.8149669766426086,
"step": 948
},
{
"epoch": 0.25306339904102293,
"grad_norm": 0.70703125,
"learning_rate": 8e-05,
"loss": 0.8306043148040771,
"step": 950
},
{
"epoch": 0.2535961640916356,
"grad_norm": 0.71484375,
"learning_rate": 8e-05,
"loss": 0.8840003609657288,
"step": 952
},
{
"epoch": 0.25412892914224827,
"grad_norm": 0.75390625,
"learning_rate": 8e-05,
"loss": 0.8428335189819336,
"step": 954
},
{
"epoch": 0.25466169419286094,
"grad_norm": 0.7109375,
"learning_rate": 8e-05,
"loss": 0.842282772064209,
"step": 956
},
{
"epoch": 0.2551944592434736,
"grad_norm": 0.71875,
"learning_rate": 8e-05,
"loss": 0.8525648713111877,
"step": 958
},
{
"epoch": 0.2557272242940863,
"grad_norm": 0.70703125,
"learning_rate": 8e-05,
"loss": 0.8552727699279785,
"step": 960
},
{
"epoch": 0.256259989344699,
"grad_norm": 0.73046875,
"learning_rate": 8e-05,
"loss": 0.8829293251037598,
"step": 962
},
{
"epoch": 0.2567927543953117,
"grad_norm": 0.7265625,
"learning_rate": 8e-05,
"loss": 0.8788723945617676,
"step": 964
},
{
"epoch": 0.25732551944592436,
"grad_norm": 0.7890625,
"learning_rate": 8e-05,
"loss": 0.8775660395622253,
"step": 966
},
{
"epoch": 0.257858284496537,
"grad_norm": 0.7265625,
"learning_rate": 8e-05,
"loss": 0.8680721521377563,
"step": 968
},
{
"epoch": 0.2583910495471497,
"grad_norm": 0.69921875,
"learning_rate": 8e-05,
"loss": 0.8522012829780579,
"step": 970
},
{
"epoch": 0.25892381459776237,
"grad_norm": 0.671875,
"learning_rate": 8e-05,
"loss": 0.8104643821716309,
"step": 972
},
{
"epoch": 0.25945657964837504,
"grad_norm": 0.68359375,
"learning_rate": 8e-05,
"loss": 0.8393515348434448,
"step": 974
},
{
"epoch": 0.25998934469898777,
"grad_norm": 0.71875,
"learning_rate": 8e-05,
"loss": 0.829097330570221,
"step": 976
},
{
"epoch": 0.26052210974960044,
"grad_norm": 0.671875,
"learning_rate": 8e-05,
"loss": 0.8042383193969727,
"step": 978
},
{
"epoch": 0.2610548748002131,
"grad_norm": 0.7109375,
"learning_rate": 8e-05,
"loss": 0.8093820214271545,
"step": 980
},
{
"epoch": 0.2615876398508258,
"grad_norm": 0.6953125,
"learning_rate": 8e-05,
"loss": 0.8751801252365112,
"step": 982
},
{
"epoch": 0.26212040490143845,
"grad_norm": 0.6875,
"learning_rate": 8e-05,
"loss": 0.7946058511734009,
"step": 984
},
{
"epoch": 0.2626531699520511,
"grad_norm": 0.69140625,
"learning_rate": 8e-05,
"loss": 0.817328929901123,
"step": 986
},
{
"epoch": 0.26318593500266385,
"grad_norm": 0.69140625,
"learning_rate": 8e-05,
"loss": 0.8307056427001953,
"step": 988
},
{
"epoch": 0.2637187000532765,
"grad_norm": 0.734375,
"learning_rate": 8e-05,
"loss": 0.837894082069397,
"step": 990
},
{
"epoch": 0.2642514651038892,
"grad_norm": 0.69921875,
"learning_rate": 8e-05,
"loss": 0.8467088341712952,
"step": 992
},
{
"epoch": 0.26478423015450187,
"grad_norm": 0.6875,
"learning_rate": 8e-05,
"loss": 0.8036914467811584,
"step": 994
},
{
"epoch": 0.26531699520511454,
"grad_norm": 0.69140625,
"learning_rate": 8e-05,
"loss": 0.819699764251709,
"step": 996
},
{
"epoch": 0.2658497602557272,
"grad_norm": 0.67578125,
"learning_rate": 8e-05,
"loss": 0.8019229769706726,
"step": 998
},
{
"epoch": 0.2663825253063399,
"grad_norm": 0.71875,
"learning_rate": 8e-05,
"loss": 0.8053907155990601,
"step": 1000
},
{
"epoch": 0.2669152903569526,
"grad_norm": 0.7109375,
"learning_rate": 8e-05,
"loss": 0.8073758482933044,
"step": 1002
},
{
"epoch": 0.2674480554075653,
"grad_norm": 0.73828125,
"learning_rate": 8e-05,
"loss": 0.828862190246582,
"step": 1004
},
{
"epoch": 0.26798082045817795,
"grad_norm": 0.7265625,
"learning_rate": 8e-05,
"loss": 0.8768335580825806,
"step": 1006
},
{
"epoch": 0.2685135855087906,
"grad_norm": 0.6953125,
"learning_rate": 8e-05,
"loss": 0.851819634437561,
"step": 1008
},
{
"epoch": 0.2690463505594033,
"grad_norm": 0.6953125,
"learning_rate": 8e-05,
"loss": 0.812772274017334,
"step": 1010
},
{
"epoch": 0.26957911561001596,
"grad_norm": 0.6953125,
"learning_rate": 8e-05,
"loss": 0.8672690987586975,
"step": 1012
},
{
"epoch": 0.2701118806606287,
"grad_norm": 0.70703125,
"learning_rate": 8e-05,
"loss": 0.8474425077438354,
"step": 1014
},
{
"epoch": 0.27064464571124136,
"grad_norm": 0.76171875,
"learning_rate": 8e-05,
"loss": 0.846489429473877,
"step": 1016
},
{
"epoch": 0.27117741076185403,
"grad_norm": 0.67578125,
"learning_rate": 8e-05,
"loss": 0.8402537107467651,
"step": 1018
},
{
"epoch": 0.2717101758124667,
"grad_norm": 0.72265625,
"learning_rate": 8e-05,
"loss": 0.873051643371582,
"step": 1020
},
{
"epoch": 0.2722429408630794,
"grad_norm": 0.69140625,
"learning_rate": 8e-05,
"loss": 0.8155555129051208,
"step": 1022
},
{
"epoch": 0.27277570591369205,
"grad_norm": 0.65625,
"learning_rate": 8e-05,
"loss": 0.7977849245071411,
"step": 1024
},
{
"epoch": 0.2733084709643047,
"grad_norm": 0.6796875,
"learning_rate": 8e-05,
"loss": 0.8137617111206055,
"step": 1026
},
{
"epoch": 0.27384123601491744,
"grad_norm": 0.6796875,
"learning_rate": 8e-05,
"loss": 0.848274827003479,
"step": 1028
},
{
"epoch": 0.2743740010655301,
"grad_norm": 0.7265625,
"learning_rate": 8e-05,
"loss": 0.8711082339286804,
"step": 1030
},
{
"epoch": 0.2749067661161428,
"grad_norm": 0.6875,
"learning_rate": 8e-05,
"loss": 0.8073972463607788,
"step": 1032
},
{
"epoch": 0.27543953116675546,
"grad_norm": 0.70703125,
"learning_rate": 8e-05,
"loss": 0.8326395750045776,
"step": 1034
},
{
"epoch": 0.27597229621736813,
"grad_norm": 0.68359375,
"learning_rate": 8e-05,
"loss": 0.855026125907898,
"step": 1036
},
{
"epoch": 0.2765050612679808,
"grad_norm": 0.671875,
"learning_rate": 8e-05,
"loss": 0.7754218578338623,
"step": 1038
},
{
"epoch": 0.2770378263185935,
"grad_norm": 0.703125,
"learning_rate": 8e-05,
"loss": 0.8070095181465149,
"step": 1040
},
{
"epoch": 0.2775705913692062,
"grad_norm": 0.72265625,
"learning_rate": 8e-05,
"loss": 0.8382344245910645,
"step": 1042
},
{
"epoch": 0.27810335641981887,
"grad_norm": 0.69921875,
"learning_rate": 8e-05,
"loss": 0.8222081661224365,
"step": 1044
},
{
"epoch": 0.27863612147043154,
"grad_norm": 0.734375,
"learning_rate": 8e-05,
"loss": 0.8200312256813049,
"step": 1046
},
{
"epoch": 0.2791688865210442,
"grad_norm": 0.71484375,
"learning_rate": 8e-05,
"loss": 0.8260998129844666,
"step": 1048
},
{
"epoch": 0.2797016515716569,
"grad_norm": 0.734375,
"learning_rate": 8e-05,
"loss": 0.814910888671875,
"step": 1050
},
{
"epoch": 0.28023441662226956,
"grad_norm": 0.73828125,
"learning_rate": 8e-05,
"loss": 0.8156729936599731,
"step": 1052
},
{
"epoch": 0.2807671816728823,
"grad_norm": 0.703125,
"learning_rate": 8e-05,
"loss": 0.8278362154960632,
"step": 1054
},
{
"epoch": 0.28129994672349495,
"grad_norm": 0.734375,
"learning_rate": 8e-05,
"loss": 0.8697912096977234,
"step": 1056
},
{
"epoch": 0.2818327117741076,
"grad_norm": 0.70703125,
"learning_rate": 8e-05,
"loss": 0.8401728868484497,
"step": 1058
},
{
"epoch": 0.2823654768247203,
"grad_norm": 0.6953125,
"learning_rate": 8e-05,
"loss": 0.8192448019981384,
"step": 1060
},
{
"epoch": 0.28289824187533297,
"grad_norm": 0.68359375,
"learning_rate": 8e-05,
"loss": 0.8254504203796387,
"step": 1062
},
{
"epoch": 0.28343100692594564,
"grad_norm": 0.7109375,
"learning_rate": 8e-05,
"loss": 0.8694897294044495,
"step": 1064
},
{
"epoch": 0.2839637719765583,
"grad_norm": 0.68359375,
"learning_rate": 8e-05,
"loss": 0.8187917470932007,
"step": 1066
},
{
"epoch": 0.28449653702717104,
"grad_norm": 0.703125,
"learning_rate": 8e-05,
"loss": 0.8559208512306213,
"step": 1068
},
{
"epoch": 0.2850293020777837,
"grad_norm": 0.69921875,
"learning_rate": 8e-05,
"loss": 0.844135582447052,
"step": 1070
},
{
"epoch": 0.2855620671283964,
"grad_norm": 0.6875,
"learning_rate": 8e-05,
"loss": 0.8289559483528137,
"step": 1072
},
{
"epoch": 0.28609483217900905,
"grad_norm": 0.6953125,
"learning_rate": 8e-05,
"loss": 0.845726490020752,
"step": 1074
},
{
"epoch": 0.2866275972296217,
"grad_norm": 0.703125,
"learning_rate": 8e-05,
"loss": 0.8291332125663757,
"step": 1076
},
{
"epoch": 0.2871603622802344,
"grad_norm": 0.69921875,
"learning_rate": 8e-05,
"loss": 0.8118506073951721,
"step": 1078
},
{
"epoch": 0.2876931273308471,
"grad_norm": 0.6953125,
"learning_rate": 8e-05,
"loss": 0.8339268565177917,
"step": 1080
},
{
"epoch": 0.2882258923814598,
"grad_norm": 0.68359375,
"learning_rate": 8e-05,
"loss": 0.8273485898971558,
"step": 1082
},
{
"epoch": 0.28875865743207246,
"grad_norm": 0.671875,
"learning_rate": 8e-05,
"loss": 0.8129878640174866,
"step": 1084
},
{
"epoch": 0.28929142248268513,
"grad_norm": 0.68359375,
"learning_rate": 8e-05,
"loss": 0.8126479983329773,
"step": 1086
},
{
"epoch": 0.2898241875332978,
"grad_norm": 0.6953125,
"learning_rate": 8e-05,
"loss": 0.8130111694335938,
"step": 1088
},
{
"epoch": 0.2903569525839105,
"grad_norm": 0.68359375,
"learning_rate": 8e-05,
"loss": 0.8049681186676025,
"step": 1090
},
{
"epoch": 0.29088971763452315,
"grad_norm": 0.69921875,
"learning_rate": 8e-05,
"loss": 0.8302510976791382,
"step": 1092
},
{
"epoch": 0.2914224826851359,
"grad_norm": 0.68359375,
"learning_rate": 8e-05,
"loss": 0.7791358828544617,
"step": 1094
},
{
"epoch": 0.29195524773574855,
"grad_norm": 0.6875,
"learning_rate": 8e-05,
"loss": 0.8203290700912476,
"step": 1096
},
{
"epoch": 0.2924880127863612,
"grad_norm": 0.7265625,
"learning_rate": 8e-05,
"loss": 0.8215634822845459,
"step": 1098
},
{
"epoch": 0.2930207778369739,
"grad_norm": 0.73046875,
"learning_rate": 8e-05,
"loss": 0.8419864177703857,
"step": 1100
},
{
"epoch": 0.29355354288758656,
"grad_norm": 0.70703125,
"learning_rate": 8e-05,
"loss": 0.8286892175674438,
"step": 1102
},
{
"epoch": 0.29408630793819923,
"grad_norm": 0.66796875,
"learning_rate": 8e-05,
"loss": 0.8043957948684692,
"step": 1104
},
{
"epoch": 0.29461907298881196,
"grad_norm": 0.734375,
"learning_rate": 8e-05,
"loss": 0.8927145004272461,
"step": 1106
},
{
"epoch": 0.29515183803942463,
"grad_norm": 0.6953125,
"learning_rate": 8e-05,
"loss": 0.8291870951652527,
"step": 1108
},
{
"epoch": 0.2956846030900373,
"grad_norm": 0.6953125,
"learning_rate": 8e-05,
"loss": 0.8313156366348267,
"step": 1110
},
{
"epoch": 0.29621736814065,
"grad_norm": 0.6953125,
"learning_rate": 8e-05,
"loss": 0.8501160144805908,
"step": 1112
},
{
"epoch": 0.29675013319126264,
"grad_norm": 0.66015625,
"learning_rate": 8e-05,
"loss": 0.7889755368232727,
"step": 1114
},
{
"epoch": 0.2972828982418753,
"grad_norm": 0.69140625,
"learning_rate": 8e-05,
"loss": 0.8166150450706482,
"step": 1116
},
{
"epoch": 0.297815663292488,
"grad_norm": 0.7109375,
"learning_rate": 8e-05,
"loss": 0.8412237167358398,
"step": 1118
},
{
"epoch": 0.2983484283431007,
"grad_norm": 0.703125,
"learning_rate": 8e-05,
"loss": 0.8289856910705566,
"step": 1120
},
{
"epoch": 0.2988811933937134,
"grad_norm": 0.6875,
"learning_rate": 8e-05,
"loss": 0.8027371168136597,
"step": 1122
},
{
"epoch": 0.29941395844432606,
"grad_norm": 0.69140625,
"learning_rate": 8e-05,
"loss": 0.8847401738166809,
"step": 1124
},
{
"epoch": 0.2999467234949387,
"grad_norm": 0.6875,
"learning_rate": 8e-05,
"loss": 0.8147903680801392,
"step": 1126
},
{
"epoch": 0.3004794885455514,
"grad_norm": 0.6875,
"learning_rate": 8e-05,
"loss": 0.8418508768081665,
"step": 1128
},
{
"epoch": 0.30101225359616407,
"grad_norm": 0.68359375,
"learning_rate": 8e-05,
"loss": 0.8261881470680237,
"step": 1130
},
{
"epoch": 0.3015450186467768,
"grad_norm": 0.67578125,
"learning_rate": 8e-05,
"loss": 0.8368499875068665,
"step": 1132
},
{
"epoch": 0.30207778369738947,
"grad_norm": 0.703125,
"learning_rate": 8e-05,
"loss": 0.7928882837295532,
"step": 1134
},
{
"epoch": 0.30261054874800214,
"grad_norm": 0.68359375,
"learning_rate": 8e-05,
"loss": 0.8569204211235046,
"step": 1136
},
{
"epoch": 0.3031433137986148,
"grad_norm": 0.6796875,
"learning_rate": 8e-05,
"loss": 0.789159893989563,
"step": 1138
},
{
"epoch": 0.3036760788492275,
"grad_norm": 0.6875,
"learning_rate": 8e-05,
"loss": 0.8144698143005371,
"step": 1140
},
{
"epoch": 0.30420884389984015,
"grad_norm": 0.6796875,
"learning_rate": 8e-05,
"loss": 0.8477827310562134,
"step": 1142
},
{
"epoch": 0.3047416089504528,
"grad_norm": 0.68359375,
"learning_rate": 8e-05,
"loss": 0.7713529467582703,
"step": 1144
},
{
"epoch": 0.30527437400106555,
"grad_norm": 0.67578125,
"learning_rate": 8e-05,
"loss": 0.8129821419715881,
"step": 1146
},
{
"epoch": 0.3058071390516782,
"grad_norm": 0.6875,
"learning_rate": 8e-05,
"loss": 0.8247785568237305,
"step": 1148
},
{
"epoch": 0.3063399041022909,
"grad_norm": 0.72265625,
"learning_rate": 8e-05,
"loss": 0.8340808153152466,
"step": 1150
},
{
"epoch": 0.30687266915290357,
"grad_norm": 0.671875,
"learning_rate": 8e-05,
"loss": 0.837786078453064,
"step": 1152
},
{
"epoch": 0.30740543420351624,
"grad_norm": 0.68359375,
"learning_rate": 8e-05,
"loss": 0.8202642202377319,
"step": 1154
},
{
"epoch": 0.3079381992541289,
"grad_norm": 0.68359375,
"learning_rate": 8e-05,
"loss": 0.7888924479484558,
"step": 1156
},
{
"epoch": 0.30847096430474163,
"grad_norm": 0.7109375,
"learning_rate": 8e-05,
"loss": 0.8010126352310181,
"step": 1158
},
{
"epoch": 0.3090037293553543,
"grad_norm": 0.6640625,
"learning_rate": 8e-05,
"loss": 0.8378207683563232,
"step": 1160
},
{
"epoch": 0.309536494405967,
"grad_norm": 0.671875,
"learning_rate": 8e-05,
"loss": 0.7908037900924683,
"step": 1162
},
{
"epoch": 0.31006925945657965,
"grad_norm": 0.6640625,
"learning_rate": 8e-05,
"loss": 0.793649435043335,
"step": 1164
},
{
"epoch": 0.3106020245071923,
"grad_norm": 0.703125,
"learning_rate": 8e-05,
"loss": 0.8675748109817505,
"step": 1166
},
{
"epoch": 0.311134789557805,
"grad_norm": 0.65234375,
"learning_rate": 8e-05,
"loss": 0.7880247831344604,
"step": 1168
},
{
"epoch": 0.31166755460841766,
"grad_norm": 0.6875,
"learning_rate": 8e-05,
"loss": 0.8333700299263,
"step": 1170
},
{
"epoch": 0.3122003196590304,
"grad_norm": 0.67578125,
"learning_rate": 8e-05,
"loss": 0.8213086724281311,
"step": 1172
},
{
"epoch": 0.31273308470964306,
"grad_norm": 0.70703125,
"learning_rate": 8e-05,
"loss": 0.8234198689460754,
"step": 1174
},
{
"epoch": 0.31326584976025573,
"grad_norm": 0.70703125,
"learning_rate": 8e-05,
"loss": 0.8723286390304565,
"step": 1176
},
{
"epoch": 0.3137986148108684,
"grad_norm": 0.69921875,
"learning_rate": 8e-05,
"loss": 0.8310482501983643,
"step": 1178
},
{
"epoch": 0.3143313798614811,
"grad_norm": 0.6875,
"learning_rate": 8e-05,
"loss": 0.8434503674507141,
"step": 1180
},
{
"epoch": 0.31486414491209375,
"grad_norm": 0.6796875,
"learning_rate": 8e-05,
"loss": 0.8098523020744324,
"step": 1182
},
{
"epoch": 0.3153969099627065,
"grad_norm": 0.7265625,
"learning_rate": 8e-05,
"loss": 0.8657939434051514,
"step": 1184
},
{
"epoch": 0.31592967501331914,
"grad_norm": 0.703125,
"learning_rate": 8e-05,
"loss": 0.8621107339859009,
"step": 1186
},
{
"epoch": 0.3164624400639318,
"grad_norm": 0.72265625,
"learning_rate": 8e-05,
"loss": 0.8273295760154724,
"step": 1188
},
{
"epoch": 0.3169952051145445,
"grad_norm": 0.66015625,
"learning_rate": 8e-05,
"loss": 0.7910110354423523,
"step": 1190
},
{
"epoch": 0.31752797016515716,
"grad_norm": 0.69921875,
"learning_rate": 8e-05,
"loss": 0.8113824129104614,
"step": 1192
},
{
"epoch": 0.31806073521576983,
"grad_norm": 0.64453125,
"learning_rate": 8e-05,
"loss": 0.8246937394142151,
"step": 1194
},
{
"epoch": 0.3185935002663825,
"grad_norm": 0.6953125,
"learning_rate": 8e-05,
"loss": 0.835896909236908,
"step": 1196
},
{
"epoch": 0.31912626531699523,
"grad_norm": 0.69140625,
"learning_rate": 8e-05,
"loss": 0.8543767929077148,
"step": 1198
},
{
"epoch": 0.3196590303676079,
"grad_norm": 0.67578125,
"learning_rate": 8e-05,
"loss": 0.7878560423851013,
"step": 1200
},
{
"epoch": 0.32019179541822057,
"grad_norm": 0.6875,
"learning_rate": 8e-05,
"loss": 0.8405436277389526,
"step": 1202
},
{
"epoch": 0.32072456046883324,
"grad_norm": 0.70703125,
"learning_rate": 8e-05,
"loss": 0.8454638719558716,
"step": 1204
},
{
"epoch": 0.3212573255194459,
"grad_norm": 0.72265625,
"learning_rate": 8e-05,
"loss": 0.8819068074226379,
"step": 1206
},
{
"epoch": 0.3217900905700586,
"grad_norm": 0.6796875,
"learning_rate": 8e-05,
"loss": 0.8613421320915222,
"step": 1208
},
{
"epoch": 0.3223228556206713,
"grad_norm": 0.703125,
"learning_rate": 8e-05,
"loss": 0.84280925989151,
"step": 1210
},
{
"epoch": 0.322855620671284,
"grad_norm": 0.73046875,
"learning_rate": 8e-05,
"loss": 0.8268940448760986,
"step": 1212
},
{
"epoch": 0.32338838572189665,
"grad_norm": 0.69140625,
"learning_rate": 8e-05,
"loss": 0.8723543882369995,
"step": 1214
},
{
"epoch": 0.3239211507725093,
"grad_norm": 0.6953125,
"learning_rate": 8e-05,
"loss": 0.8331593871116638,
"step": 1216
},
{
"epoch": 0.324453915823122,
"grad_norm": 0.671875,
"learning_rate": 8e-05,
"loss": 0.8328378200531006,
"step": 1218
},
{
"epoch": 0.32498668087373467,
"grad_norm": 0.69140625,
"learning_rate": 8e-05,
"loss": 0.8236236572265625,
"step": 1220
},
{
"epoch": 0.32551944592434734,
"grad_norm": 0.69140625,
"learning_rate": 8e-05,
"loss": 0.9025675058364868,
"step": 1222
},
{
"epoch": 0.32605221097496007,
"grad_norm": 0.67578125,
"learning_rate": 8e-05,
"loss": 0.8124939799308777,
"step": 1224
},
{
"epoch": 0.32658497602557274,
"grad_norm": 0.66796875,
"learning_rate": 8e-05,
"loss": 0.8572587370872498,
"step": 1226
},
{
"epoch": 0.3271177410761854,
"grad_norm": 0.6953125,
"learning_rate": 8e-05,
"loss": 0.7765132188796997,
"step": 1228
},
{
"epoch": 0.3276505061267981,
"grad_norm": 0.68359375,
"learning_rate": 8e-05,
"loss": 0.8381130695343018,
"step": 1230
},
{
"epoch": 0.32818327117741075,
"grad_norm": 0.67578125,
"learning_rate": 8e-05,
"loss": 0.8289612531661987,
"step": 1232
},
{
"epoch": 0.3287160362280234,
"grad_norm": 0.66796875,
"learning_rate": 8e-05,
"loss": 0.8231470584869385,
"step": 1234
},
{
"epoch": 0.3292488012786361,
"grad_norm": 0.65234375,
"learning_rate": 8e-05,
"loss": 0.8185766339302063,
"step": 1236
},
{
"epoch": 0.3297815663292488,
"grad_norm": 0.66015625,
"learning_rate": 8e-05,
"loss": 0.809109091758728,
"step": 1238
},
{
"epoch": 0.3303143313798615,
"grad_norm": 0.671875,
"learning_rate": 8e-05,
"loss": 0.8600226044654846,
"step": 1240
},
{
"epoch": 0.33084709643047416,
"grad_norm": 0.66796875,
"learning_rate": 8e-05,
"loss": 0.8347437977790833,
"step": 1242
},
{
"epoch": 0.33137986148108683,
"grad_norm": 0.71484375,
"learning_rate": 8e-05,
"loss": 0.8622264862060547,
"step": 1244
},
{
"epoch": 0.3319126265316995,
"grad_norm": 0.67578125,
"learning_rate": 8e-05,
"loss": 0.8058921098709106,
"step": 1246
},
{
"epoch": 0.3324453915823122,
"grad_norm": 0.66796875,
"learning_rate": 8e-05,
"loss": 0.8255428075790405,
"step": 1248
},
{
"epoch": 0.3329781566329249,
"grad_norm": 0.65234375,
"learning_rate": 8e-05,
"loss": 0.8035192489624023,
"step": 1250
},
{
"epoch": 0.3335109216835376,
"grad_norm": 0.67578125,
"learning_rate": 8e-05,
"loss": 0.8229770660400391,
"step": 1252
},
{
"epoch": 0.33404368673415025,
"grad_norm": 0.67578125,
"learning_rate": 8e-05,
"loss": 0.8099672198295593,
"step": 1254
},
{
"epoch": 0.3345764517847629,
"grad_norm": 0.69140625,
"learning_rate": 8e-05,
"loss": 0.8221847414970398,
"step": 1256
},
{
"epoch": 0.3351092168353756,
"grad_norm": 0.6953125,
"learning_rate": 8e-05,
"loss": 0.8075480461120605,
"step": 1258
},
{
"epoch": 0.33564198188598826,
"grad_norm": 0.70703125,
"learning_rate": 8e-05,
"loss": 0.8237086534500122,
"step": 1260
},
{
"epoch": 0.33617474693660093,
"grad_norm": 0.69140625,
"learning_rate": 8e-05,
"loss": 0.8089755773544312,
"step": 1262
},
{
"epoch": 0.33670751198721366,
"grad_norm": 0.68359375,
"learning_rate": 8e-05,
"loss": 0.8307473063468933,
"step": 1264
},
{
"epoch": 0.33724027703782633,
"grad_norm": 0.6796875,
"learning_rate": 8e-05,
"loss": 0.8326480388641357,
"step": 1266
},
{
"epoch": 0.337773042088439,
"grad_norm": 0.68359375,
"learning_rate": 8e-05,
"loss": 0.8556010723114014,
"step": 1268
},
{
"epoch": 0.3383058071390517,
"grad_norm": 0.671875,
"learning_rate": 8e-05,
"loss": 0.8126214146614075,
"step": 1270
},
{
"epoch": 0.33883857218966434,
"grad_norm": 0.6796875,
"learning_rate": 8e-05,
"loss": 0.8234031796455383,
"step": 1272
},
{
"epoch": 0.339371337240277,
"grad_norm": 0.6484375,
"learning_rate": 8e-05,
"loss": 0.8191617727279663,
"step": 1274
},
{
"epoch": 0.33990410229088974,
"grad_norm": 0.640625,
"learning_rate": 8e-05,
"loss": 0.7701093554496765,
"step": 1276
},
{
"epoch": 0.3404368673415024,
"grad_norm": 0.6640625,
"learning_rate": 8e-05,
"loss": 0.82283616065979,
"step": 1278
},
{
"epoch": 0.3409696323921151,
"grad_norm": 0.671875,
"learning_rate": 8e-05,
"loss": 0.8266280889511108,
"step": 1280
},
{
"epoch": 0.34150239744272776,
"grad_norm": 0.6484375,
"learning_rate": 8e-05,
"loss": 0.7687116861343384,
"step": 1282
},
{
"epoch": 0.34203516249334043,
"grad_norm": 0.65625,
"learning_rate": 8e-05,
"loss": 0.8157622218132019,
"step": 1284
},
{
"epoch": 0.3425679275439531,
"grad_norm": 0.68359375,
"learning_rate": 8e-05,
"loss": 0.8259207606315613,
"step": 1286
},
{
"epoch": 0.34310069259456577,
"grad_norm": 0.66015625,
"learning_rate": 8e-05,
"loss": 0.7940812110900879,
"step": 1288
},
{
"epoch": 0.3436334576451785,
"grad_norm": 0.6796875,
"learning_rate": 8e-05,
"loss": 0.8644148707389832,
"step": 1290
},
{
"epoch": 0.34416622269579117,
"grad_norm": 0.65234375,
"learning_rate": 8e-05,
"loss": 0.8318431973457336,
"step": 1292
},
{
"epoch": 0.34469898774640384,
"grad_norm": 0.65234375,
"learning_rate": 8e-05,
"loss": 0.7810943126678467,
"step": 1294
},
{
"epoch": 0.3452317527970165,
"grad_norm": 0.65625,
"learning_rate": 8e-05,
"loss": 0.8003625273704529,
"step": 1296
},
{
"epoch": 0.3457645178476292,
"grad_norm": 0.65625,
"learning_rate": 8e-05,
"loss": 0.7984561920166016,
"step": 1298
},
{
"epoch": 0.34629728289824185,
"grad_norm": 0.65625,
"learning_rate": 8e-05,
"loss": 0.8027920722961426,
"step": 1300
},
{
"epoch": 0.3468300479488546,
"grad_norm": 0.71875,
"learning_rate": 8e-05,
"loss": 0.8382170796394348,
"step": 1302
},
{
"epoch": 0.34736281299946725,
"grad_norm": 0.6953125,
"learning_rate": 8e-05,
"loss": 0.8113511800765991,
"step": 1304
},
{
"epoch": 0.3478955780500799,
"grad_norm": 0.63671875,
"learning_rate": 8e-05,
"loss": 0.7773157358169556,
"step": 1306
},
{
"epoch": 0.3484283431006926,
"grad_norm": 0.66796875,
"learning_rate": 8e-05,
"loss": 0.8348230719566345,
"step": 1308
},
{
"epoch": 0.34896110815130527,
"grad_norm": 0.6796875,
"learning_rate": 8e-05,
"loss": 0.8246813416481018,
"step": 1310
},
{
"epoch": 0.34949387320191794,
"grad_norm": 0.6484375,
"learning_rate": 8e-05,
"loss": 0.8007385730743408,
"step": 1312
},
{
"epoch": 0.3500266382525306,
"grad_norm": 0.65234375,
"learning_rate": 8e-05,
"loss": 0.782619297504425,
"step": 1314
},
{
"epoch": 0.35055940330314334,
"grad_norm": 0.67578125,
"learning_rate": 8e-05,
"loss": 0.8223733305931091,
"step": 1316
},
{
"epoch": 0.351092168353756,
"grad_norm": 0.6640625,
"learning_rate": 8e-05,
"loss": 0.7972275018692017,
"step": 1318
},
{
"epoch": 0.3516249334043687,
"grad_norm": 0.6640625,
"learning_rate": 8e-05,
"loss": 0.806035578250885,
"step": 1320
},
{
"epoch": 0.35215769845498135,
"grad_norm": 0.67578125,
"learning_rate": 8e-05,
"loss": 0.8033435940742493,
"step": 1322
},
{
"epoch": 0.352690463505594,
"grad_norm": 0.66796875,
"learning_rate": 8e-05,
"loss": 0.8439804315567017,
"step": 1324
},
{
"epoch": 0.3532232285562067,
"grad_norm": 0.68359375,
"learning_rate": 8e-05,
"loss": 0.8247978091239929,
"step": 1326
},
{
"epoch": 0.3537559936068194,
"grad_norm": 0.65625,
"learning_rate": 8e-05,
"loss": 0.7770576477050781,
"step": 1328
},
{
"epoch": 0.3542887586574321,
"grad_norm": 0.66015625,
"learning_rate": 8e-05,
"loss": 0.8157473206520081,
"step": 1330
},
{
"epoch": 0.35482152370804476,
"grad_norm": 0.6484375,
"learning_rate": 8e-05,
"loss": 0.8069726824760437,
"step": 1332
},
{
"epoch": 0.35535428875865743,
"grad_norm": 0.6640625,
"learning_rate": 8e-05,
"loss": 0.8322383165359497,
"step": 1334
},
{
"epoch": 0.3558870538092701,
"grad_norm": 0.6640625,
"learning_rate": 8e-05,
"loss": 0.797613799571991,
"step": 1336
},
{
"epoch": 0.3564198188598828,
"grad_norm": 0.6875,
"learning_rate": 8e-05,
"loss": 0.8209842443466187,
"step": 1338
},
{
"epoch": 0.35695258391049545,
"grad_norm": 0.65234375,
"learning_rate": 8e-05,
"loss": 0.8391934037208557,
"step": 1340
},
{
"epoch": 0.3574853489611082,
"grad_norm": 0.6640625,
"learning_rate": 8e-05,
"loss": 0.8320241570472717,
"step": 1342
},
{
"epoch": 0.35801811401172084,
"grad_norm": 0.6640625,
"learning_rate": 8e-05,
"loss": 0.8497931957244873,
"step": 1344
},
{
"epoch": 0.3585508790623335,
"grad_norm": 0.64453125,
"learning_rate": 8e-05,
"loss": 0.7735468149185181,
"step": 1346
},
{
"epoch": 0.3590836441129462,
"grad_norm": 0.66796875,
"learning_rate": 8e-05,
"loss": 0.7980416417121887,
"step": 1348
},
{
"epoch": 0.35961640916355886,
"grad_norm": 0.63671875,
"learning_rate": 8e-05,
"loss": 0.7885520458221436,
"step": 1350
},
{
"epoch": 0.36014917421417153,
"grad_norm": 0.64453125,
"learning_rate": 8e-05,
"loss": 0.786660373210907,
"step": 1352
},
{
"epoch": 0.36068193926478426,
"grad_norm": 0.640625,
"learning_rate": 8e-05,
"loss": 0.7961907386779785,
"step": 1354
},
{
"epoch": 0.36121470431539693,
"grad_norm": 0.65625,
"learning_rate": 8e-05,
"loss": 0.8075615763664246,
"step": 1356
},
{
"epoch": 0.3617474693660096,
"grad_norm": 0.65625,
"learning_rate": 8e-05,
"loss": 0.7986862659454346,
"step": 1358
},
{
"epoch": 0.36228023441662227,
"grad_norm": 0.6328125,
"learning_rate": 8e-05,
"loss": 0.7794106006622314,
"step": 1360
},
{
"epoch": 0.36281299946723494,
"grad_norm": 0.6484375,
"learning_rate": 8e-05,
"loss": 0.8085299134254456,
"step": 1362
},
{
"epoch": 0.3633457645178476,
"grad_norm": 0.65625,
"learning_rate": 8e-05,
"loss": 0.8172681927680969,
"step": 1364
},
{
"epoch": 0.3638785295684603,
"grad_norm": 0.6484375,
"learning_rate": 8e-05,
"loss": 0.8157686591148376,
"step": 1366
},
{
"epoch": 0.364411294619073,
"grad_norm": 0.64453125,
"learning_rate": 8e-05,
"loss": 0.8455065488815308,
"step": 1368
},
{
"epoch": 0.3649440596696857,
"grad_norm": 0.66796875,
"learning_rate": 8e-05,
"loss": 0.8134093284606934,
"step": 1370
},
{
"epoch": 0.36547682472029835,
"grad_norm": 0.609375,
"learning_rate": 8e-05,
"loss": 0.7692434787750244,
"step": 1372
},
{
"epoch": 0.366009589770911,
"grad_norm": 0.65625,
"learning_rate": 8e-05,
"loss": 0.7843102812767029,
"step": 1374
},
{
"epoch": 0.3665423548215237,
"grad_norm": 0.66796875,
"learning_rate": 8e-05,
"loss": 0.8318651914596558,
"step": 1376
},
{
"epoch": 0.36707511987213637,
"grad_norm": 0.6640625,
"learning_rate": 8e-05,
"loss": 0.8075577616691589,
"step": 1378
},
{
"epoch": 0.3676078849227491,
"grad_norm": 0.66015625,
"learning_rate": 8e-05,
"loss": 0.7955432534217834,
"step": 1380
},
{
"epoch": 0.36814064997336177,
"grad_norm": 0.6484375,
"learning_rate": 8e-05,
"loss": 0.7789199352264404,
"step": 1382
},
{
"epoch": 0.36867341502397444,
"grad_norm": 0.69140625,
"learning_rate": 8e-05,
"loss": 0.8257974982261658,
"step": 1384
},
{
"epoch": 0.3692061800745871,
"grad_norm": 0.66015625,
"learning_rate": 8e-05,
"loss": 0.7887584567070007,
"step": 1386
},
{
"epoch": 0.3697389451251998,
"grad_norm": 0.70703125,
"learning_rate": 8e-05,
"loss": 0.8510478734970093,
"step": 1388
},
{
"epoch": 0.37027171017581245,
"grad_norm": 0.6875,
"learning_rate": 8e-05,
"loss": 0.8060406446456909,
"step": 1390
},
{
"epoch": 0.3708044752264251,
"grad_norm": 0.65625,
"learning_rate": 8e-05,
"loss": 0.7955806851387024,
"step": 1392
},
{
"epoch": 0.37133724027703785,
"grad_norm": 0.65234375,
"learning_rate": 8e-05,
"loss": 0.7971813082695007,
"step": 1394
},
{
"epoch": 0.3718700053276505,
"grad_norm": 0.65625,
"learning_rate": 8e-05,
"loss": 0.7997479438781738,
"step": 1396
},
{
"epoch": 0.3724027703782632,
"grad_norm": 0.66015625,
"learning_rate": 8e-05,
"loss": 0.8180658221244812,
"step": 1398
},
{
"epoch": 0.37293553542887586,
"grad_norm": 0.65234375,
"learning_rate": 8e-05,
"loss": 0.790256142616272,
"step": 1400
},
{
"epoch": 0.37346830047948854,
"grad_norm": 0.640625,
"learning_rate": 8e-05,
"loss": 0.7598444223403931,
"step": 1402
},
{
"epoch": 0.3740010655301012,
"grad_norm": 0.671875,
"learning_rate": 8e-05,
"loss": 0.801741361618042,
"step": 1404
},
{
"epoch": 0.3745338305807139,
"grad_norm": 0.65625,
"learning_rate": 8e-05,
"loss": 0.7819033861160278,
"step": 1406
},
{
"epoch": 0.3750665956313266,
"grad_norm": 0.671875,
"learning_rate": 8e-05,
"loss": 0.8237056732177734,
"step": 1408
},
{
"epoch": 0.3755993606819393,
"grad_norm": 0.68359375,
"learning_rate": 8e-05,
"loss": 0.8147417306900024,
"step": 1410
},
{
"epoch": 0.37613212573255195,
"grad_norm": 0.66015625,
"learning_rate": 8e-05,
"loss": 0.7891062498092651,
"step": 1412
},
{
"epoch": 0.3766648907831646,
"grad_norm": 0.65625,
"learning_rate": 8e-05,
"loss": 0.8018314838409424,
"step": 1414
},
{
"epoch": 0.3771976558337773,
"grad_norm": 0.64453125,
"learning_rate": 8e-05,
"loss": 0.7935019135475159,
"step": 1416
},
{
"epoch": 0.37773042088438996,
"grad_norm": 0.6953125,
"learning_rate": 8e-05,
"loss": 0.8369472622871399,
"step": 1418
},
{
"epoch": 0.3782631859350027,
"grad_norm": 0.66796875,
"learning_rate": 8e-05,
"loss": 0.7935312986373901,
"step": 1420
},
{
"epoch": 0.37879595098561536,
"grad_norm": 0.6484375,
"learning_rate": 8e-05,
"loss": 0.7909234166145325,
"step": 1422
},
{
"epoch": 0.37932871603622803,
"grad_norm": 0.6640625,
"learning_rate": 8e-05,
"loss": 0.817375898361206,
"step": 1424
},
{
"epoch": 0.3798614810868407,
"grad_norm": 0.640625,
"learning_rate": 8e-05,
"loss": 0.809626042842865,
"step": 1426
},
{
"epoch": 0.3803942461374534,
"grad_norm": 0.640625,
"learning_rate": 8e-05,
"loss": 0.7614900469779968,
"step": 1428
},
{
"epoch": 0.38092701118806604,
"grad_norm": 0.64453125,
"learning_rate": 8e-05,
"loss": 0.7885522842407227,
"step": 1430
},
{
"epoch": 0.3814597762386787,
"grad_norm": 0.65234375,
"learning_rate": 8e-05,
"loss": 0.7955629825592041,
"step": 1432
},
{
"epoch": 0.38199254128929144,
"grad_norm": 0.64453125,
"learning_rate": 8e-05,
"loss": 0.7892552614212036,
"step": 1434
},
{
"epoch": 0.3825253063399041,
"grad_norm": 0.64453125,
"learning_rate": 8e-05,
"loss": 0.8095505833625793,
"step": 1436
},
{
"epoch": 0.3830580713905168,
"grad_norm": 0.6171875,
"learning_rate": 8e-05,
"loss": 0.7699292302131653,
"step": 1438
},
{
"epoch": 0.38359083644112946,
"grad_norm": 0.66015625,
"learning_rate": 8e-05,
"loss": 0.8210839629173279,
"step": 1440
},
{
"epoch": 0.38412360149174213,
"grad_norm": 0.62890625,
"learning_rate": 8e-05,
"loss": 0.7994286417961121,
"step": 1442
},
{
"epoch": 0.3846563665423548,
"grad_norm": 0.6484375,
"learning_rate": 8e-05,
"loss": 0.7771030068397522,
"step": 1444
},
{
"epoch": 0.3851891315929675,
"grad_norm": 0.65234375,
"learning_rate": 8e-05,
"loss": 0.7784361839294434,
"step": 1446
},
{
"epoch": 0.3857218966435802,
"grad_norm": 0.6328125,
"learning_rate": 8e-05,
"loss": 0.8078509569168091,
"step": 1448
},
{
"epoch": 0.38625466169419287,
"grad_norm": 0.67578125,
"learning_rate": 8e-05,
"loss": 0.8055387735366821,
"step": 1450
},
{
"epoch": 0.38678742674480554,
"grad_norm": 0.6484375,
"learning_rate": 8e-05,
"loss": 0.8143312931060791,
"step": 1452
},
{
"epoch": 0.3873201917954182,
"grad_norm": 0.6484375,
"learning_rate": 8e-05,
"loss": 0.8212940096855164,
"step": 1454
},
{
"epoch": 0.3878529568460309,
"grad_norm": 0.66015625,
"learning_rate": 8e-05,
"loss": 0.8249719738960266,
"step": 1456
},
{
"epoch": 0.38838572189664355,
"grad_norm": 0.6484375,
"learning_rate": 8e-05,
"loss": 0.8218085765838623,
"step": 1458
},
{
"epoch": 0.3889184869472563,
"grad_norm": 0.65234375,
"learning_rate": 8e-05,
"loss": 0.8255725502967834,
"step": 1460
},
{
"epoch": 0.38945125199786895,
"grad_norm": 0.65625,
"learning_rate": 8e-05,
"loss": 0.7988225817680359,
"step": 1462
},
{
"epoch": 0.3899840170484816,
"grad_norm": 0.63671875,
"learning_rate": 8e-05,
"loss": 0.7831338047981262,
"step": 1464
},
{
"epoch": 0.3905167820990943,
"grad_norm": 0.671875,
"learning_rate": 8e-05,
"loss": 0.8661478757858276,
"step": 1466
},
{
"epoch": 0.39104954714970697,
"grad_norm": 0.6328125,
"learning_rate": 8e-05,
"loss": 0.8106863498687744,
"step": 1468
},
{
"epoch": 0.39158231220031964,
"grad_norm": 0.63671875,
"learning_rate": 8e-05,
"loss": 0.7969095706939697,
"step": 1470
},
{
"epoch": 0.39211507725093236,
"grad_norm": 0.62109375,
"learning_rate": 8e-05,
"loss": 0.7830895781517029,
"step": 1472
},
{
"epoch": 0.39264784230154504,
"grad_norm": 0.66015625,
"learning_rate": 8e-05,
"loss": 0.8268316984176636,
"step": 1474
},
{
"epoch": 0.3931806073521577,
"grad_norm": 0.63671875,
"learning_rate": 8e-05,
"loss": 0.8054344654083252,
"step": 1476
},
{
"epoch": 0.3937133724027704,
"grad_norm": 0.63671875,
"learning_rate": 8e-05,
"loss": 0.7860501408576965,
"step": 1478
},
{
"epoch": 0.39424613745338305,
"grad_norm": 0.671875,
"learning_rate": 8e-05,
"loss": 0.8310704231262207,
"step": 1480
},
{
"epoch": 0.3947789025039957,
"grad_norm": 0.640625,
"learning_rate": 8e-05,
"loss": 0.8038405179977417,
"step": 1482
},
{
"epoch": 0.3953116675546084,
"grad_norm": 0.65625,
"learning_rate": 8e-05,
"loss": 0.7913386821746826,
"step": 1484
},
{
"epoch": 0.3958444326052211,
"grad_norm": 0.625,
"learning_rate": 8e-05,
"loss": 0.7641775012016296,
"step": 1486
},
{
"epoch": 0.3963771976558338,
"grad_norm": 0.64453125,
"learning_rate": 8e-05,
"loss": 0.8178999423980713,
"step": 1488
},
{
"epoch": 0.39690996270644646,
"grad_norm": 0.67578125,
"learning_rate": 8e-05,
"loss": 0.8257794380187988,
"step": 1490
},
{
"epoch": 0.39744272775705913,
"grad_norm": 0.65625,
"learning_rate": 8e-05,
"loss": 0.7903448343276978,
"step": 1492
},
{
"epoch": 0.3979754928076718,
"grad_norm": 0.671875,
"learning_rate": 8e-05,
"loss": 0.7882867455482483,
"step": 1494
},
{
"epoch": 0.3985082578582845,
"grad_norm": 0.640625,
"learning_rate": 8e-05,
"loss": 0.8316589593887329,
"step": 1496
},
{
"epoch": 0.3990410229088972,
"grad_norm": 0.640625,
"learning_rate": 8e-05,
"loss": 0.8053879141807556,
"step": 1498
},
{
"epoch": 0.3995737879595099,
"grad_norm": 0.6484375,
"learning_rate": 8e-05,
"loss": 0.8426420092582703,
"step": 1500
},
{
"epoch": 0.40010655301012255,
"grad_norm": 0.65234375,
"learning_rate": 8e-05,
"loss": 0.8209067583084106,
"step": 1502
},
{
"epoch": 0.4006393180607352,
"grad_norm": 0.6328125,
"learning_rate": 8e-05,
"loss": 0.8079086542129517,
"step": 1504
},
{
"epoch": 0.4011720831113479,
"grad_norm": 0.6640625,
"learning_rate": 8e-05,
"loss": 0.808128297328949,
"step": 1506
},
{
"epoch": 0.40170484816196056,
"grad_norm": 0.671875,
"learning_rate": 8e-05,
"loss": 0.8213729858398438,
"step": 1508
},
{
"epoch": 0.40223761321257323,
"grad_norm": 0.6484375,
"learning_rate": 8e-05,
"loss": 0.8084886074066162,
"step": 1510
},
{
"epoch": 0.40277037826318596,
"grad_norm": 0.6328125,
"learning_rate": 8e-05,
"loss": 0.8165018558502197,
"step": 1512
},
{
"epoch": 0.40330314331379863,
"grad_norm": 0.6484375,
"learning_rate": 8e-05,
"loss": 0.8012227416038513,
"step": 1514
},
{
"epoch": 0.4038359083644113,
"grad_norm": 0.66796875,
"learning_rate": 8e-05,
"loss": 0.8613946437835693,
"step": 1516
},
{
"epoch": 0.40436867341502397,
"grad_norm": 0.6953125,
"learning_rate": 8e-05,
"loss": 0.8433681726455688,
"step": 1518
},
{
"epoch": 0.40490143846563664,
"grad_norm": 0.6328125,
"learning_rate": 8e-05,
"loss": 0.779987096786499,
"step": 1520
},
{
"epoch": 0.4054342035162493,
"grad_norm": 0.66796875,
"learning_rate": 8e-05,
"loss": 0.8048115968704224,
"step": 1522
},
{
"epoch": 0.40596696856686204,
"grad_norm": 0.64453125,
"learning_rate": 8e-05,
"loss": 0.8666731715202332,
"step": 1524
},
{
"epoch": 0.4064997336174747,
"grad_norm": 0.640625,
"learning_rate": 8e-05,
"loss": 0.747114360332489,
"step": 1526
},
{
"epoch": 0.4070324986680874,
"grad_norm": 0.6484375,
"learning_rate": 8e-05,
"loss": 0.7566266059875488,
"step": 1528
},
{
"epoch": 0.40756526371870005,
"grad_norm": 0.66015625,
"learning_rate": 8e-05,
"loss": 0.7844398617744446,
"step": 1530
},
{
"epoch": 0.4080980287693127,
"grad_norm": 0.65234375,
"learning_rate": 8e-05,
"loss": 0.7681721448898315,
"step": 1532
},
{
"epoch": 0.4086307938199254,
"grad_norm": 0.640625,
"learning_rate": 8e-05,
"loss": 0.7945184111595154,
"step": 1534
},
{
"epoch": 0.40916355887053807,
"grad_norm": 0.62109375,
"learning_rate": 8e-05,
"loss": 0.7696878910064697,
"step": 1536
},
{
"epoch": 0.4096963239211508,
"grad_norm": 0.65625,
"learning_rate": 8e-05,
"loss": 0.7923563718795776,
"step": 1538
},
{
"epoch": 0.41022908897176347,
"grad_norm": 0.66796875,
"learning_rate": 8e-05,
"loss": 0.8100162744522095,
"step": 1540
},
{
"epoch": 0.41076185402237614,
"grad_norm": 0.625,
"learning_rate": 8e-05,
"loss": 0.7867823243141174,
"step": 1542
},
{
"epoch": 0.4112946190729888,
"grad_norm": 0.6484375,
"learning_rate": 8e-05,
"loss": 0.8097234964370728,
"step": 1544
},
{
"epoch": 0.4118273841236015,
"grad_norm": 0.6484375,
"learning_rate": 8e-05,
"loss": 0.7911956310272217,
"step": 1546
},
{
"epoch": 0.41236014917421415,
"grad_norm": 0.65234375,
"learning_rate": 8e-05,
"loss": 0.8348594903945923,
"step": 1548
},
{
"epoch": 0.4128929142248269,
"grad_norm": 0.6328125,
"learning_rate": 8e-05,
"loss": 0.77206951379776,
"step": 1550
},
{
"epoch": 0.41342567927543955,
"grad_norm": 0.6484375,
"learning_rate": 8e-05,
"loss": 0.8244647979736328,
"step": 1552
},
{
"epoch": 0.4139584443260522,
"grad_norm": 0.6171875,
"learning_rate": 8e-05,
"loss": 0.7584129571914673,
"step": 1554
},
{
"epoch": 0.4144912093766649,
"grad_norm": 0.64453125,
"learning_rate": 8e-05,
"loss": 0.7752898931503296,
"step": 1556
},
{
"epoch": 0.41502397442727756,
"grad_norm": 0.65625,
"learning_rate": 8e-05,
"loss": 0.7845876812934875,
"step": 1558
},
{
"epoch": 0.41555673947789024,
"grad_norm": 0.66015625,
"learning_rate": 8e-05,
"loss": 0.7884317636489868,
"step": 1560
},
{
"epoch": 0.4160895045285029,
"grad_norm": 0.6796875,
"learning_rate": 8e-05,
"loss": 0.8328264951705933,
"step": 1562
},
{
"epoch": 0.41662226957911563,
"grad_norm": 0.625,
"learning_rate": 8e-05,
"loss": 0.7535569071769714,
"step": 1564
},
{
"epoch": 0.4171550346297283,
"grad_norm": 0.6484375,
"learning_rate": 8e-05,
"loss": 0.7909643650054932,
"step": 1566
},
{
"epoch": 0.417687799680341,
"grad_norm": 0.703125,
"learning_rate": 8e-05,
"loss": 0.8720725774765015,
"step": 1568
},
{
"epoch": 0.41822056473095365,
"grad_norm": 0.6484375,
"learning_rate": 8e-05,
"loss": 0.7731503844261169,
"step": 1570
},
{
"epoch": 0.4187533297815663,
"grad_norm": 0.65625,
"learning_rate": 8e-05,
"loss": 0.7802464365959167,
"step": 1572
},
{
"epoch": 0.419286094832179,
"grad_norm": 0.640625,
"learning_rate": 8e-05,
"loss": 0.7635596990585327,
"step": 1574
},
{
"epoch": 0.41981885988279166,
"grad_norm": 0.66015625,
"learning_rate": 8e-05,
"loss": 0.7855473160743713,
"step": 1576
},
{
"epoch": 0.4203516249334044,
"grad_norm": 0.69921875,
"learning_rate": 8e-05,
"loss": 0.7947031855583191,
"step": 1578
},
{
"epoch": 0.42088438998401706,
"grad_norm": 0.6328125,
"learning_rate": 8e-05,
"loss": 0.8119519352912903,
"step": 1580
},
{
"epoch": 0.42141715503462973,
"grad_norm": 0.64453125,
"learning_rate": 8e-05,
"loss": 0.7724003791809082,
"step": 1582
},
{
"epoch": 0.4219499200852424,
"grad_norm": 0.6640625,
"learning_rate": 8e-05,
"loss": 0.8322865962982178,
"step": 1584
},
{
"epoch": 0.4224826851358551,
"grad_norm": 0.63671875,
"learning_rate": 8e-05,
"loss": 0.7840743064880371,
"step": 1586
},
{
"epoch": 0.42301545018646775,
"grad_norm": 0.64453125,
"learning_rate": 8e-05,
"loss": 0.7936784625053406,
"step": 1588
},
{
"epoch": 0.42354821523708047,
"grad_norm": 0.640625,
"learning_rate": 8e-05,
"loss": 0.809872567653656,
"step": 1590
},
{
"epoch": 0.42408098028769314,
"grad_norm": 0.6484375,
"learning_rate": 8e-05,
"loss": 0.8307843208312988,
"step": 1592
},
{
"epoch": 0.4246137453383058,
"grad_norm": 0.640625,
"learning_rate": 8e-05,
"loss": 0.8011277914047241,
"step": 1594
},
{
"epoch": 0.4251465103889185,
"grad_norm": 0.671875,
"learning_rate": 8e-05,
"loss": 0.8050786852836609,
"step": 1596
},
{
"epoch": 0.42567927543953116,
"grad_norm": 0.62109375,
"learning_rate": 8e-05,
"loss": 0.7952936291694641,
"step": 1598
},
{
"epoch": 0.42621204049014383,
"grad_norm": 0.625,
"learning_rate": 8e-05,
"loss": 0.7305450439453125,
"step": 1600
},
{
"epoch": 0.4267448055407565,
"grad_norm": 0.6484375,
"learning_rate": 8e-05,
"loss": 0.7913362383842468,
"step": 1602
},
{
"epoch": 0.4272775705913692,
"grad_norm": 0.625,
"learning_rate": 8e-05,
"loss": 0.7669799327850342,
"step": 1604
},
{
"epoch": 0.4278103356419819,
"grad_norm": 0.65625,
"learning_rate": 8e-05,
"loss": 0.812600314617157,
"step": 1606
},
{
"epoch": 0.42834310069259457,
"grad_norm": 0.64453125,
"learning_rate": 8e-05,
"loss": 0.8100326657295227,
"step": 1608
},
{
"epoch": 0.42887586574320724,
"grad_norm": 0.64453125,
"learning_rate": 8e-05,
"loss": 0.7722445726394653,
"step": 1610
},
{
"epoch": 0.4294086307938199,
"grad_norm": 0.68359375,
"learning_rate": 8e-05,
"loss": 0.7866500616073608,
"step": 1612
},
{
"epoch": 0.4299413958444326,
"grad_norm": 0.66015625,
"learning_rate": 8e-05,
"loss": 0.7936035394668579,
"step": 1614
},
{
"epoch": 0.4304741608950453,
"grad_norm": 0.66015625,
"learning_rate": 8e-05,
"loss": 0.8121829032897949,
"step": 1616
},
{
"epoch": 0.431006925945658,
"grad_norm": 0.64453125,
"learning_rate": 8e-05,
"loss": 0.7777450680732727,
"step": 1618
},
{
"epoch": 0.43153969099627065,
"grad_norm": 0.64453125,
"learning_rate": 8e-05,
"loss": 0.8149593472480774,
"step": 1620
},
{
"epoch": 0.4320724560468833,
"grad_norm": 0.81640625,
"learning_rate": 8e-05,
"loss": 0.8239789605140686,
"step": 1622
},
{
"epoch": 0.432605221097496,
"grad_norm": 0.671875,
"learning_rate": 8e-05,
"loss": 0.8340217471122742,
"step": 1624
},
{
"epoch": 0.43313798614810867,
"grad_norm": 0.6796875,
"learning_rate": 8e-05,
"loss": 0.8046810030937195,
"step": 1626
},
{
"epoch": 0.43367075119872134,
"grad_norm": 0.640625,
"learning_rate": 8e-05,
"loss": 0.7964731454849243,
"step": 1628
},
{
"epoch": 0.43420351624933406,
"grad_norm": 0.65625,
"learning_rate": 8e-05,
"loss": 0.7805262207984924,
"step": 1630
},
{
"epoch": 0.43473628129994674,
"grad_norm": 0.671875,
"learning_rate": 8e-05,
"loss": 0.8220688104629517,
"step": 1632
},
{
"epoch": 0.4352690463505594,
"grad_norm": 0.6328125,
"learning_rate": 8e-05,
"loss": 0.7977862358093262,
"step": 1634
},
{
"epoch": 0.4358018114011721,
"grad_norm": 0.640625,
"learning_rate": 8e-05,
"loss": 0.7609307765960693,
"step": 1636
},
{
"epoch": 0.43633457645178475,
"grad_norm": 0.6796875,
"learning_rate": 8e-05,
"loss": 0.8068886399269104,
"step": 1638
},
{
"epoch": 0.4368673415023974,
"grad_norm": 0.66796875,
"learning_rate": 8e-05,
"loss": 0.7697072625160217,
"step": 1640
},
{
"epoch": 0.43740010655301015,
"grad_norm": 0.67578125,
"learning_rate": 8e-05,
"loss": 0.8072892427444458,
"step": 1642
},
{
"epoch": 0.4379328716036228,
"grad_norm": 0.640625,
"learning_rate": 8e-05,
"loss": 0.7413228154182434,
"step": 1644
},
{
"epoch": 0.4384656366542355,
"grad_norm": 0.62890625,
"learning_rate": 8e-05,
"loss": 0.7742043733596802,
"step": 1646
},
{
"epoch": 0.43899840170484816,
"grad_norm": 0.62890625,
"learning_rate": 8e-05,
"loss": 0.7952238917350769,
"step": 1648
},
{
"epoch": 0.43953116675546083,
"grad_norm": 0.6484375,
"learning_rate": 8e-05,
"loss": 0.7638510465621948,
"step": 1650
},
{
"epoch": 0.4400639318060735,
"grad_norm": 0.67578125,
"learning_rate": 8e-05,
"loss": 0.7926915884017944,
"step": 1652
},
{
"epoch": 0.4405966968566862,
"grad_norm": 0.64453125,
"learning_rate": 8e-05,
"loss": 0.7640718817710876,
"step": 1654
},
{
"epoch": 0.4411294619072989,
"grad_norm": 0.65625,
"learning_rate": 8e-05,
"loss": 0.7931405305862427,
"step": 1656
},
{
"epoch": 0.4416622269579116,
"grad_norm": 0.69921875,
"learning_rate": 8e-05,
"loss": 0.7784867882728577,
"step": 1658
},
{
"epoch": 0.44219499200852425,
"grad_norm": 0.6640625,
"learning_rate": 8e-05,
"loss": 0.7957846522331238,
"step": 1660
},
{
"epoch": 0.4427277570591369,
"grad_norm": 0.62890625,
"learning_rate": 8e-05,
"loss": 0.7507031559944153,
"step": 1662
},
{
"epoch": 0.4432605221097496,
"grad_norm": 0.6484375,
"learning_rate": 8e-05,
"loss": 0.8128439784049988,
"step": 1664
},
{
"epoch": 0.44379328716036226,
"grad_norm": 0.66796875,
"learning_rate": 8e-05,
"loss": 0.8259686231613159,
"step": 1666
},
{
"epoch": 0.444326052210975,
"grad_norm": 0.6328125,
"learning_rate": 8e-05,
"loss": 0.8185787796974182,
"step": 1668
},
{
"epoch": 0.44485881726158766,
"grad_norm": 0.6796875,
"learning_rate": 8e-05,
"loss": 0.8285982608795166,
"step": 1670
},
{
"epoch": 0.44539158231220033,
"grad_norm": 0.70703125,
"learning_rate": 8e-05,
"loss": 0.7934709787368774,
"step": 1672
},
{
"epoch": 0.445924347362813,
"grad_norm": 0.6796875,
"learning_rate": 8e-05,
"loss": 0.7914223670959473,
"step": 1674
},
{
"epoch": 0.44645711241342567,
"grad_norm": 0.63671875,
"learning_rate": 8e-05,
"loss": 0.7982125878334045,
"step": 1676
},
{
"epoch": 0.44698987746403834,
"grad_norm": 0.66015625,
"learning_rate": 8e-05,
"loss": 0.798332929611206,
"step": 1678
},
{
"epoch": 0.447522642514651,
"grad_norm": 0.640625,
"learning_rate": 8e-05,
"loss": 0.8056020140647888,
"step": 1680
},
{
"epoch": 0.44805540756526374,
"grad_norm": 0.63671875,
"learning_rate": 8e-05,
"loss": 0.7972841262817383,
"step": 1682
},
{
"epoch": 0.4485881726158764,
"grad_norm": 0.68359375,
"learning_rate": 8e-05,
"loss": 0.841311514377594,
"step": 1684
},
{
"epoch": 0.4491209376664891,
"grad_norm": 0.6328125,
"learning_rate": 8e-05,
"loss": 0.7941474914550781,
"step": 1686
},
{
"epoch": 0.44965370271710176,
"grad_norm": 0.640625,
"learning_rate": 8e-05,
"loss": 0.7953973412513733,
"step": 1688
},
{
"epoch": 0.4501864677677144,
"grad_norm": 0.6328125,
"learning_rate": 8e-05,
"loss": 0.8190003037452698,
"step": 1690
},
{
"epoch": 0.4507192328183271,
"grad_norm": 0.6640625,
"learning_rate": 8e-05,
"loss": 0.8118422627449036,
"step": 1692
},
{
"epoch": 0.4512519978689398,
"grad_norm": 0.6171875,
"learning_rate": 8e-05,
"loss": 0.7708790898323059,
"step": 1694
},
{
"epoch": 0.4517847629195525,
"grad_norm": 0.6484375,
"learning_rate": 8e-05,
"loss": 0.7594013810157776,
"step": 1696
},
{
"epoch": 0.45231752797016517,
"grad_norm": 0.66796875,
"learning_rate": 8e-05,
"loss": 0.8185669183731079,
"step": 1698
},
{
"epoch": 0.45285029302077784,
"grad_norm": 0.63671875,
"learning_rate": 8e-05,
"loss": 0.8450738191604614,
"step": 1700
},
{
"epoch": 0.4533830580713905,
"grad_norm": 0.65234375,
"learning_rate": 8e-05,
"loss": 0.8263086080551147,
"step": 1702
},
{
"epoch": 0.4539158231220032,
"grad_norm": 0.6171875,
"learning_rate": 8e-05,
"loss": 0.793552041053772,
"step": 1704
},
{
"epoch": 0.45444858817261585,
"grad_norm": 0.62890625,
"learning_rate": 8e-05,
"loss": 0.797087550163269,
"step": 1706
},
{
"epoch": 0.4549813532232286,
"grad_norm": 0.6171875,
"learning_rate": 8e-05,
"loss": 0.7767654061317444,
"step": 1708
},
{
"epoch": 0.45551411827384125,
"grad_norm": 0.64453125,
"learning_rate": 8e-05,
"loss": 0.7956861257553101,
"step": 1710
},
{
"epoch": 0.4560468833244539,
"grad_norm": 0.6328125,
"learning_rate": 8e-05,
"loss": 0.7868641018867493,
"step": 1712
},
{
"epoch": 0.4565796483750666,
"grad_norm": 0.62890625,
"learning_rate": 8e-05,
"loss": 0.8069348335266113,
"step": 1714
},
{
"epoch": 0.45711241342567926,
"grad_norm": 0.625,
"learning_rate": 8e-05,
"loss": 0.8170084357261658,
"step": 1716
},
{
"epoch": 0.45764517847629194,
"grad_norm": 0.625,
"learning_rate": 8e-05,
"loss": 0.7934258580207825,
"step": 1718
},
{
"epoch": 0.4581779435269046,
"grad_norm": 0.64453125,
"learning_rate": 8e-05,
"loss": 0.8041369915008545,
"step": 1720
},
{
"epoch": 0.45871070857751733,
"grad_norm": 0.609375,
"learning_rate": 8e-05,
"loss": 0.7930079102516174,
"step": 1722
},
{
"epoch": 0.45924347362813,
"grad_norm": 0.6328125,
"learning_rate": 8e-05,
"loss": 0.7743818163871765,
"step": 1724
},
{
"epoch": 0.4597762386787427,
"grad_norm": 0.6328125,
"learning_rate": 8e-05,
"loss": 0.7860970497131348,
"step": 1726
},
{
"epoch": 0.46030900372935535,
"grad_norm": 0.62890625,
"learning_rate": 8e-05,
"loss": 0.7822363972663879,
"step": 1728
},
{
"epoch": 0.460841768779968,
"grad_norm": 0.6484375,
"learning_rate": 8e-05,
"loss": 0.7959834933280945,
"step": 1730
},
{
"epoch": 0.4613745338305807,
"grad_norm": 0.66796875,
"learning_rate": 8e-05,
"loss": 0.828339159488678,
"step": 1732
},
{
"epoch": 0.4619072988811934,
"grad_norm": 0.6484375,
"learning_rate": 8e-05,
"loss": 0.799767017364502,
"step": 1734
},
{
"epoch": 0.4624400639318061,
"grad_norm": 0.65234375,
"learning_rate": 8e-05,
"loss": 0.7834099531173706,
"step": 1736
},
{
"epoch": 0.46297282898241876,
"grad_norm": 0.62109375,
"learning_rate": 8e-05,
"loss": 0.7832657694816589,
"step": 1738
},
{
"epoch": 0.46350559403303143,
"grad_norm": 0.62109375,
"learning_rate": 8e-05,
"loss": 0.7735370993614197,
"step": 1740
},
{
"epoch": 0.4640383590836441,
"grad_norm": 0.6171875,
"learning_rate": 8e-05,
"loss": 0.7704412341117859,
"step": 1742
},
{
"epoch": 0.4645711241342568,
"grad_norm": 0.625,
"learning_rate": 8e-05,
"loss": 0.7932986617088318,
"step": 1744
},
{
"epoch": 0.46510388918486945,
"grad_norm": 0.64453125,
"learning_rate": 8e-05,
"loss": 0.8292320966720581,
"step": 1746
},
{
"epoch": 0.4656366542354822,
"grad_norm": 0.625,
"learning_rate": 8e-05,
"loss": 0.8028055429458618,
"step": 1748
},
{
"epoch": 0.46616941928609484,
"grad_norm": 0.63671875,
"learning_rate": 8e-05,
"loss": 0.801913321018219,
"step": 1750
},
{
"epoch": 0.4667021843367075,
"grad_norm": 0.625,
"learning_rate": 8e-05,
"loss": 0.7580520510673523,
"step": 1752
},
{
"epoch": 0.4672349493873202,
"grad_norm": 0.625,
"learning_rate": 8e-05,
"loss": 0.7673873901367188,
"step": 1754
},
{
"epoch": 0.46776771443793286,
"grad_norm": 0.609375,
"learning_rate": 8e-05,
"loss": 0.7917901873588562,
"step": 1756
},
{
"epoch": 0.46830047948854553,
"grad_norm": 0.65234375,
"learning_rate": 8e-05,
"loss": 0.8057483434677124,
"step": 1758
},
{
"epoch": 0.46883324453915826,
"grad_norm": 0.63671875,
"learning_rate": 8e-05,
"loss": 0.824272632598877,
"step": 1760
},
{
"epoch": 0.4693660095897709,
"grad_norm": 0.609375,
"learning_rate": 8e-05,
"loss": 0.7674715518951416,
"step": 1762
},
{
"epoch": 0.4698987746403836,
"grad_norm": 0.62890625,
"learning_rate": 8e-05,
"loss": 0.7918990254402161,
"step": 1764
},
{
"epoch": 0.47043153969099627,
"grad_norm": 0.62109375,
"learning_rate": 8e-05,
"loss": 0.7527544498443604,
"step": 1766
},
{
"epoch": 0.47096430474160894,
"grad_norm": 0.62109375,
"learning_rate": 8e-05,
"loss": 0.7316806316375732,
"step": 1768
},
{
"epoch": 0.4714970697922216,
"grad_norm": 0.6328125,
"learning_rate": 8e-05,
"loss": 0.764977753162384,
"step": 1770
},
{
"epoch": 0.4720298348428343,
"grad_norm": 0.62890625,
"learning_rate": 8e-05,
"loss": 0.754130482673645,
"step": 1772
},
{
"epoch": 0.472562599893447,
"grad_norm": 0.65625,
"learning_rate": 8e-05,
"loss": 0.8153510689735413,
"step": 1774
},
{
"epoch": 0.4730953649440597,
"grad_norm": 0.62890625,
"learning_rate": 8e-05,
"loss": 0.7621491551399231,
"step": 1776
},
{
"epoch": 0.47362812999467235,
"grad_norm": 0.6328125,
"learning_rate": 8e-05,
"loss": 0.7979814410209656,
"step": 1778
},
{
"epoch": 0.474160895045285,
"grad_norm": 0.63671875,
"learning_rate": 8e-05,
"loss": 0.7620680928230286,
"step": 1780
},
{
"epoch": 0.4746936600958977,
"grad_norm": 0.66015625,
"learning_rate": 8e-05,
"loss": 0.8257867097854614,
"step": 1782
},
{
"epoch": 0.47522642514651037,
"grad_norm": 0.6484375,
"learning_rate": 8e-05,
"loss": 0.8385603427886963,
"step": 1784
},
{
"epoch": 0.4757591901971231,
"grad_norm": 0.62890625,
"learning_rate": 8e-05,
"loss": 0.7887386679649353,
"step": 1786
},
{
"epoch": 0.47629195524773577,
"grad_norm": 0.66796875,
"learning_rate": 8e-05,
"loss": 0.815808892250061,
"step": 1788
},
{
"epoch": 0.47682472029834844,
"grad_norm": 0.62109375,
"learning_rate": 8e-05,
"loss": 0.7848386764526367,
"step": 1790
},
{
"epoch": 0.4773574853489611,
"grad_norm": 0.6796875,
"learning_rate": 8e-05,
"loss": 0.7899839878082275,
"step": 1792
},
{
"epoch": 0.4778902503995738,
"grad_norm": 0.64453125,
"learning_rate": 8e-05,
"loss": 0.8273345828056335,
"step": 1794
},
{
"epoch": 0.47842301545018645,
"grad_norm": 0.65625,
"learning_rate": 8e-05,
"loss": 0.7726816534996033,
"step": 1796
},
{
"epoch": 0.4789557805007991,
"grad_norm": 0.65234375,
"learning_rate": 8e-05,
"loss": 0.8128560781478882,
"step": 1798
},
{
"epoch": 0.47948854555141185,
"grad_norm": 0.62109375,
"learning_rate": 8e-05,
"loss": 0.7594364881515503,
"step": 1800
},
{
"epoch": 0.4800213106020245,
"grad_norm": 0.6171875,
"learning_rate": 8e-05,
"loss": 0.7679142951965332,
"step": 1802
},
{
"epoch": 0.4805540756526372,
"grad_norm": 0.73828125,
"learning_rate": 8e-05,
"loss": 0.7729013562202454,
"step": 1804
},
{
"epoch": 0.48108684070324986,
"grad_norm": 0.640625,
"learning_rate": 8e-05,
"loss": 0.7576342821121216,
"step": 1806
},
{
"epoch": 0.48161960575386253,
"grad_norm": 0.6328125,
"learning_rate": 8e-05,
"loss": 0.7645061612129211,
"step": 1808
},
{
"epoch": 0.4821523708044752,
"grad_norm": 0.63671875,
"learning_rate": 8e-05,
"loss": 0.7803142666816711,
"step": 1810
},
{
"epoch": 0.48268513585508793,
"grad_norm": 0.62890625,
"learning_rate": 8e-05,
"loss": 0.7762041687965393,
"step": 1812
},
{
"epoch": 0.4832179009057006,
"grad_norm": 0.61328125,
"learning_rate": 8e-05,
"loss": 0.7839690446853638,
"step": 1814
},
{
"epoch": 0.4837506659563133,
"grad_norm": 0.65234375,
"learning_rate": 8e-05,
"loss": 0.787205159664154,
"step": 1816
},
{
"epoch": 0.48428343100692595,
"grad_norm": 0.6640625,
"learning_rate": 8e-05,
"loss": 0.8149699568748474,
"step": 1818
},
{
"epoch": 0.4848161960575386,
"grad_norm": 0.63671875,
"learning_rate": 8e-05,
"loss": 0.7666029334068298,
"step": 1820
},
{
"epoch": 0.4853489611081513,
"grad_norm": 0.6328125,
"learning_rate": 8e-05,
"loss": 0.8224028944969177,
"step": 1822
},
{
"epoch": 0.48588172615876396,
"grad_norm": 0.6328125,
"learning_rate": 8e-05,
"loss": 0.7877684831619263,
"step": 1824
},
{
"epoch": 0.4864144912093767,
"grad_norm": 0.625,
"learning_rate": 8e-05,
"loss": 0.7987911701202393,
"step": 1826
},
{
"epoch": 0.48694725625998936,
"grad_norm": 0.62109375,
"learning_rate": 8e-05,
"loss": 0.759647011756897,
"step": 1828
},
{
"epoch": 0.48748002131060203,
"grad_norm": 0.61328125,
"learning_rate": 8e-05,
"loss": 0.7700395584106445,
"step": 1830
},
{
"epoch": 0.4880127863612147,
"grad_norm": 0.625,
"learning_rate": 8e-05,
"loss": 0.7666257619857788,
"step": 1832
},
{
"epoch": 0.4885455514118274,
"grad_norm": 0.66015625,
"learning_rate": 8e-05,
"loss": 0.7913581132888794,
"step": 1834
},
{
"epoch": 0.48907831646244004,
"grad_norm": 0.65625,
"learning_rate": 8e-05,
"loss": 0.7783340215682983,
"step": 1836
},
{
"epoch": 0.48961108151305277,
"grad_norm": 0.65625,
"learning_rate": 8e-05,
"loss": 0.8159565925598145,
"step": 1838
},
{
"epoch": 0.49014384656366544,
"grad_norm": 0.63671875,
"learning_rate": 8e-05,
"loss": 0.7737563252449036,
"step": 1840
},
{
"epoch": 0.4906766116142781,
"grad_norm": 0.62890625,
"learning_rate": 8e-05,
"loss": 0.8198726177215576,
"step": 1842
},
{
"epoch": 0.4912093766648908,
"grad_norm": 0.65234375,
"learning_rate": 8e-05,
"loss": 0.7712583541870117,
"step": 1844
},
{
"epoch": 0.49174214171550346,
"grad_norm": 0.62109375,
"learning_rate": 8e-05,
"loss": 0.7808051705360413,
"step": 1846
},
{
"epoch": 0.4922749067661161,
"grad_norm": 0.6328125,
"learning_rate": 8e-05,
"loss": 0.7701867818832397,
"step": 1848
},
{
"epoch": 0.4928076718167288,
"grad_norm": 0.59765625,
"learning_rate": 8e-05,
"loss": 0.7609500288963318,
"step": 1850
},
{
"epoch": 0.4933404368673415,
"grad_norm": 0.65234375,
"learning_rate": 8e-05,
"loss": 0.8407063484191895,
"step": 1852
},
{
"epoch": 0.4938732019179542,
"grad_norm": 0.66015625,
"learning_rate": 8e-05,
"loss": 0.7863637208938599,
"step": 1854
},
{
"epoch": 0.49440596696856687,
"grad_norm": 0.64453125,
"learning_rate": 8e-05,
"loss": 0.797248363494873,
"step": 1856
},
{
"epoch": 0.49493873201917954,
"grad_norm": 0.6328125,
"learning_rate": 8e-05,
"loss": 0.7627251148223877,
"step": 1858
},
{
"epoch": 0.4954714970697922,
"grad_norm": 0.65625,
"learning_rate": 8e-05,
"loss": 0.8115253448486328,
"step": 1860
},
{
"epoch": 0.4960042621204049,
"grad_norm": 0.65234375,
"learning_rate": 8e-05,
"loss": 0.7831135988235474,
"step": 1862
},
{
"epoch": 0.4965370271710176,
"grad_norm": 0.6171875,
"learning_rate": 8e-05,
"loss": 0.8069642782211304,
"step": 1864
},
{
"epoch": 0.4970697922216303,
"grad_norm": 0.65234375,
"learning_rate": 8e-05,
"loss": 0.8169358372688293,
"step": 1866
},
{
"epoch": 0.49760255727224295,
"grad_norm": 0.62109375,
"learning_rate": 8e-05,
"loss": 0.7873024344444275,
"step": 1868
},
{
"epoch": 0.4981353223228556,
"grad_norm": 0.62890625,
"learning_rate": 8e-05,
"loss": 0.7919709086418152,
"step": 1870
},
{
"epoch": 0.4986680873734683,
"grad_norm": 0.640625,
"learning_rate": 8e-05,
"loss": 0.7744743824005127,
"step": 1872
},
{
"epoch": 0.49920085242408097,
"grad_norm": 0.62890625,
"learning_rate": 8e-05,
"loss": 0.8178368806838989,
"step": 1874
},
{
"epoch": 0.49973361747469364,
"grad_norm": 0.65234375,
"learning_rate": 8e-05,
"loss": 0.7937871217727661,
"step": 1876
},
{
"epoch": 0.5002663825253063,
"grad_norm": 0.65234375,
"learning_rate": 8e-05,
"loss": 0.7904363870620728,
"step": 1878
},
{
"epoch": 0.500799147575919,
"grad_norm": 0.63671875,
"learning_rate": 8e-05,
"loss": 0.7489137053489685,
"step": 1880
},
{
"epoch": 0.5013319126265317,
"grad_norm": 0.609375,
"learning_rate": 8e-05,
"loss": 0.8006702065467834,
"step": 1882
},
{
"epoch": 0.5018646776771444,
"grad_norm": 0.66015625,
"learning_rate": 8e-05,
"loss": 0.7943602800369263,
"step": 1884
},
{
"epoch": 0.5023974427277571,
"grad_norm": 0.64453125,
"learning_rate": 8e-05,
"loss": 0.8127607703208923,
"step": 1886
},
{
"epoch": 0.5029302077783697,
"grad_norm": 0.62890625,
"learning_rate": 8e-05,
"loss": 0.7711650729179382,
"step": 1888
},
{
"epoch": 0.5034629728289824,
"grad_norm": 0.625,
"learning_rate": 8e-05,
"loss": 0.8170063495635986,
"step": 1890
},
{
"epoch": 0.5039957378795951,
"grad_norm": 0.64453125,
"learning_rate": 8e-05,
"loss": 0.7900512218475342,
"step": 1892
},
{
"epoch": 0.5045285029302078,
"grad_norm": 0.625,
"learning_rate": 8e-05,
"loss": 0.789554238319397,
"step": 1894
},
{
"epoch": 0.5050612679808204,
"grad_norm": 0.59765625,
"learning_rate": 8e-05,
"loss": 0.7663175463676453,
"step": 1896
},
{
"epoch": 0.5055940330314331,
"grad_norm": 0.61328125,
"learning_rate": 8e-05,
"loss": 0.7394262552261353,
"step": 1898
},
{
"epoch": 0.5061267980820459,
"grad_norm": 0.62109375,
"learning_rate": 8e-05,
"loss": 0.776930570602417,
"step": 1900
},
{
"epoch": 0.5066595631326585,
"grad_norm": 0.625,
"learning_rate": 8e-05,
"loss": 0.7874533534049988,
"step": 1902
},
{
"epoch": 0.5071923281832712,
"grad_norm": 0.61328125,
"learning_rate": 8e-05,
"loss": 0.780265212059021,
"step": 1904
},
{
"epoch": 0.5077250932338838,
"grad_norm": 0.609375,
"learning_rate": 8e-05,
"loss": 0.7821959853172302,
"step": 1906
},
{
"epoch": 0.5082578582844965,
"grad_norm": 0.625,
"learning_rate": 8e-05,
"loss": 0.7723431587219238,
"step": 1908
},
{
"epoch": 0.5087906233351093,
"grad_norm": 0.64453125,
"learning_rate": 8e-05,
"loss": 0.8031184673309326,
"step": 1910
},
{
"epoch": 0.5093233883857219,
"grad_norm": 0.62890625,
"learning_rate": 8e-05,
"loss": 0.7822156548500061,
"step": 1912
},
{
"epoch": 0.5098561534363346,
"grad_norm": 0.625,
"learning_rate": 8e-05,
"loss": 0.7892681956291199,
"step": 1914
},
{
"epoch": 0.5103889184869472,
"grad_norm": 0.63671875,
"learning_rate": 8e-05,
"loss": 0.7619911432266235,
"step": 1916
},
{
"epoch": 0.51092168353756,
"grad_norm": 0.6171875,
"learning_rate": 8e-05,
"loss": 0.7405803203582764,
"step": 1918
},
{
"epoch": 0.5114544485881726,
"grad_norm": 0.63671875,
"learning_rate": 8e-05,
"loss": 0.7802076935768127,
"step": 1920
},
{
"epoch": 0.5119872136387853,
"grad_norm": 0.62890625,
"learning_rate": 8e-05,
"loss": 0.7818913459777832,
"step": 1922
},
{
"epoch": 0.512519978689398,
"grad_norm": 0.63671875,
"learning_rate": 8e-05,
"loss": 0.7780552506446838,
"step": 1924
},
{
"epoch": 0.5130527437400106,
"grad_norm": 0.640625,
"learning_rate": 8e-05,
"loss": 0.7681518793106079,
"step": 1926
},
{
"epoch": 0.5135855087906234,
"grad_norm": 0.63671875,
"learning_rate": 8e-05,
"loss": 0.8236287236213684,
"step": 1928
},
{
"epoch": 0.514118273841236,
"grad_norm": 0.59765625,
"learning_rate": 8e-05,
"loss": 0.7650191783905029,
"step": 1930
},
{
"epoch": 0.5146510388918487,
"grad_norm": 0.62109375,
"learning_rate": 8e-05,
"loss": 0.7506141662597656,
"step": 1932
},
{
"epoch": 0.5151838039424613,
"grad_norm": 0.61328125,
"learning_rate": 8e-05,
"loss": 0.761217474937439,
"step": 1934
},
{
"epoch": 0.515716568993074,
"grad_norm": 0.65234375,
"learning_rate": 8e-05,
"loss": 0.8237653970718384,
"step": 1936
},
{
"epoch": 0.5162493340436868,
"grad_norm": 0.60546875,
"learning_rate": 8e-05,
"loss": 0.7403720021247864,
"step": 1938
},
{
"epoch": 0.5167820990942994,
"grad_norm": 0.6171875,
"learning_rate": 8e-05,
"loss": 0.7571833729743958,
"step": 1940
},
{
"epoch": 0.5173148641449121,
"grad_norm": 0.640625,
"learning_rate": 8e-05,
"loss": 0.8090606927871704,
"step": 1942
},
{
"epoch": 0.5178476291955247,
"grad_norm": 0.60546875,
"learning_rate": 8e-05,
"loss": 0.7799073457717896,
"step": 1944
},
{
"epoch": 0.5183803942461375,
"grad_norm": 0.64453125,
"learning_rate": 8e-05,
"loss": 0.8075971007347107,
"step": 1946
},
{
"epoch": 0.5189131592967501,
"grad_norm": 0.63671875,
"learning_rate": 8e-05,
"loss": 0.7910662889480591,
"step": 1948
},
{
"epoch": 0.5194459243473628,
"grad_norm": 0.63671875,
"learning_rate": 8e-05,
"loss": 0.786179780960083,
"step": 1950
},
{
"epoch": 0.5199786893979755,
"grad_norm": 0.625,
"learning_rate": 8e-05,
"loss": 0.7777940034866333,
"step": 1952
},
{
"epoch": 0.5205114544485882,
"grad_norm": 0.609375,
"learning_rate": 8e-05,
"loss": 0.7531383633613586,
"step": 1954
},
{
"epoch": 0.5210442194992009,
"grad_norm": 0.6171875,
"learning_rate": 8e-05,
"loss": 0.7762864828109741,
"step": 1956
},
{
"epoch": 0.5215769845498135,
"grad_norm": 0.60546875,
"learning_rate": 8e-05,
"loss": 0.759125828742981,
"step": 1958
},
{
"epoch": 0.5221097496004262,
"grad_norm": 0.6328125,
"learning_rate": 8e-05,
"loss": 0.7885076403617859,
"step": 1960
},
{
"epoch": 0.5226425146510388,
"grad_norm": 0.62109375,
"learning_rate": 8e-05,
"loss": 0.7936964631080627,
"step": 1962
},
{
"epoch": 0.5231752797016516,
"grad_norm": 0.62890625,
"learning_rate": 8e-05,
"loss": 0.7714090943336487,
"step": 1964
},
{
"epoch": 0.5237080447522643,
"grad_norm": 0.62109375,
"learning_rate": 8e-05,
"loss": 0.7484626770019531,
"step": 1966
},
{
"epoch": 0.5242408098028769,
"grad_norm": 0.61328125,
"learning_rate": 8e-05,
"loss": 0.7742045521736145,
"step": 1968
},
{
"epoch": 0.5247735748534896,
"grad_norm": 0.640625,
"learning_rate": 8e-05,
"loss": 0.7953801155090332,
"step": 1970
},
{
"epoch": 0.5253063399041022,
"grad_norm": 0.640625,
"learning_rate": 8e-05,
"loss": 0.7853913903236389,
"step": 1972
},
{
"epoch": 0.525839104954715,
"grad_norm": 0.62109375,
"learning_rate": 8e-05,
"loss": 0.7650255560874939,
"step": 1974
},
{
"epoch": 0.5263718700053277,
"grad_norm": 0.65234375,
"learning_rate": 8e-05,
"loss": 0.8097711801528931,
"step": 1976
},
{
"epoch": 0.5269046350559403,
"grad_norm": 0.640625,
"learning_rate": 8e-05,
"loss": 0.7798804640769958,
"step": 1978
},
{
"epoch": 0.527437400106553,
"grad_norm": 0.59765625,
"learning_rate": 8e-05,
"loss": 0.77159184217453,
"step": 1980
},
{
"epoch": 0.5279701651571657,
"grad_norm": 0.6171875,
"learning_rate": 8e-05,
"loss": 0.7470179200172424,
"step": 1982
},
{
"epoch": 0.5285029302077784,
"grad_norm": 0.62109375,
"learning_rate": 8e-05,
"loss": 0.7437633275985718,
"step": 1984
},
{
"epoch": 0.529035695258391,
"grad_norm": 0.6484375,
"learning_rate": 8e-05,
"loss": 0.7707769274711609,
"step": 1986
},
{
"epoch": 0.5295684603090037,
"grad_norm": 0.625,
"learning_rate": 8e-05,
"loss": 0.7968963384628296,
"step": 1988
},
{
"epoch": 0.5301012253596165,
"grad_norm": 0.63671875,
"learning_rate": 8e-05,
"loss": 0.7605825662612915,
"step": 1990
},
{
"epoch": 0.5306339904102291,
"grad_norm": 0.625,
"learning_rate": 8e-05,
"loss": 0.7688091993331909,
"step": 1992
},
{
"epoch": 0.5311667554608418,
"grad_norm": 0.625,
"learning_rate": 8e-05,
"loss": 0.8013522028923035,
"step": 1994
},
{
"epoch": 0.5316995205114544,
"grad_norm": 0.6953125,
"learning_rate": 8e-05,
"loss": 0.8307242393493652,
"step": 1996
},
{
"epoch": 0.5322322855620671,
"grad_norm": 0.6328125,
"learning_rate": 8e-05,
"loss": 0.7533189058303833,
"step": 1998
},
{
"epoch": 0.5327650506126798,
"grad_norm": 0.6328125,
"learning_rate": 8e-05,
"loss": 0.7661764621734619,
"step": 2000
},
{
"epoch": 0.5332978156632925,
"grad_norm": 0.6484375,
"learning_rate": 8e-05,
"loss": 0.761687159538269,
"step": 2002
},
{
"epoch": 0.5338305807139052,
"grad_norm": 0.625,
"learning_rate": 8e-05,
"loss": 0.7734599113464355,
"step": 2004
},
{
"epoch": 0.5343633457645178,
"grad_norm": 0.609375,
"learning_rate": 8e-05,
"loss": 0.764024555683136,
"step": 2006
},
{
"epoch": 0.5348961108151306,
"grad_norm": 0.62109375,
"learning_rate": 8e-05,
"loss": 0.7784127593040466,
"step": 2008
},
{
"epoch": 0.5354288758657432,
"grad_norm": 0.609375,
"learning_rate": 8e-05,
"loss": 0.7688071131706238,
"step": 2010
},
{
"epoch": 0.5359616409163559,
"grad_norm": 0.61328125,
"learning_rate": 8e-05,
"loss": 0.7830870151519775,
"step": 2012
},
{
"epoch": 0.5364944059669685,
"grad_norm": 0.6171875,
"learning_rate": 8e-05,
"loss": 0.7614960074424744,
"step": 2014
},
{
"epoch": 0.5370271710175812,
"grad_norm": 0.6328125,
"learning_rate": 8e-05,
"loss": 0.7830729484558105,
"step": 2016
},
{
"epoch": 0.537559936068194,
"grad_norm": 0.60546875,
"learning_rate": 8e-05,
"loss": 0.7983683943748474,
"step": 2018
},
{
"epoch": 0.5380927011188066,
"grad_norm": 0.6171875,
"learning_rate": 8e-05,
"loss": 0.7553051114082336,
"step": 2020
},
{
"epoch": 0.5386254661694193,
"grad_norm": 0.609375,
"learning_rate": 8e-05,
"loss": 0.7692767977714539,
"step": 2022
},
{
"epoch": 0.5391582312200319,
"grad_norm": 0.62890625,
"learning_rate": 8e-05,
"loss": 0.7751806974411011,
"step": 2024
},
{
"epoch": 0.5396909962706447,
"grad_norm": 0.66015625,
"learning_rate": 8e-05,
"loss": 0.7655361294746399,
"step": 2026
},
{
"epoch": 0.5402237613212574,
"grad_norm": 0.6171875,
"learning_rate": 8e-05,
"loss": 0.8146576285362244,
"step": 2028
},
{
"epoch": 0.54075652637187,
"grad_norm": 0.6328125,
"learning_rate": 8e-05,
"loss": 0.7740441560745239,
"step": 2030
},
{
"epoch": 0.5412892914224827,
"grad_norm": 0.609375,
"learning_rate": 8e-05,
"loss": 0.7796229124069214,
"step": 2032
},
{
"epoch": 0.5418220564730953,
"grad_norm": 0.625,
"learning_rate": 8e-05,
"loss": 0.7617045044898987,
"step": 2034
},
{
"epoch": 0.5423548215237081,
"grad_norm": 0.6171875,
"learning_rate": 8e-05,
"loss": 0.7547261714935303,
"step": 2036
},
{
"epoch": 0.5428875865743207,
"grad_norm": 0.6484375,
"learning_rate": 8e-05,
"loss": 0.7878617644309998,
"step": 2038
},
{
"epoch": 0.5434203516249334,
"grad_norm": 0.64453125,
"learning_rate": 8e-05,
"loss": 0.7670659422874451,
"step": 2040
},
{
"epoch": 0.5439531166755461,
"grad_norm": 0.61328125,
"learning_rate": 8e-05,
"loss": 0.8015154600143433,
"step": 2042
},
{
"epoch": 0.5444858817261587,
"grad_norm": 0.68359375,
"learning_rate": 8e-05,
"loss": 0.7678729295730591,
"step": 2044
},
{
"epoch": 0.5450186467767715,
"grad_norm": 0.6328125,
"learning_rate": 8e-05,
"loss": 0.7684184312820435,
"step": 2046
},
{
"epoch": 0.5455514118273841,
"grad_norm": 0.640625,
"learning_rate": 8e-05,
"loss": 0.7359906435012817,
"step": 2048
},
{
"epoch": 0.5460841768779968,
"grad_norm": 0.62109375,
"learning_rate": 8e-05,
"loss": 0.7884916067123413,
"step": 2050
},
{
"epoch": 0.5466169419286094,
"grad_norm": 0.6484375,
"learning_rate": 8e-05,
"loss": 0.8283618092536926,
"step": 2052
},
{
"epoch": 0.5471497069792222,
"grad_norm": 0.62890625,
"learning_rate": 8e-05,
"loss": 0.7943402528762817,
"step": 2054
},
{
"epoch": 0.5476824720298349,
"grad_norm": 0.62109375,
"learning_rate": 8e-05,
"loss": 0.7284975647926331,
"step": 2056
},
{
"epoch": 0.5482152370804475,
"grad_norm": 0.625,
"learning_rate": 8e-05,
"loss": 0.7707847952842712,
"step": 2058
},
{
"epoch": 0.5487480021310602,
"grad_norm": 0.6484375,
"learning_rate": 8e-05,
"loss": 0.8470367193222046,
"step": 2060
},
{
"epoch": 0.5492807671816728,
"grad_norm": 0.64453125,
"learning_rate": 8e-05,
"loss": 0.8090894222259521,
"step": 2062
},
{
"epoch": 0.5498135322322856,
"grad_norm": 0.59375,
"learning_rate": 8e-05,
"loss": 0.7707048058509827,
"step": 2064
},
{
"epoch": 0.5503462972828982,
"grad_norm": 0.60546875,
"learning_rate": 8e-05,
"loss": 0.7647881507873535,
"step": 2066
},
{
"epoch": 0.5508790623335109,
"grad_norm": 0.59765625,
"learning_rate": 8e-05,
"loss": 0.738248348236084,
"step": 2068
},
{
"epoch": 0.5514118273841236,
"grad_norm": 0.6171875,
"learning_rate": 8e-05,
"loss": 0.7581315040588379,
"step": 2070
},
{
"epoch": 0.5519445924347363,
"grad_norm": 0.6171875,
"learning_rate": 8e-05,
"loss": 0.8011127710342407,
"step": 2072
},
{
"epoch": 0.552477357485349,
"grad_norm": 0.609375,
"learning_rate": 8e-05,
"loss": 0.7452203631401062,
"step": 2074
},
{
"epoch": 0.5530101225359616,
"grad_norm": 0.59375,
"learning_rate": 8e-05,
"loss": 0.7685807943344116,
"step": 2076
},
{
"epoch": 0.5535428875865743,
"grad_norm": 0.6796875,
"learning_rate": 8e-05,
"loss": 0.7906605005264282,
"step": 2078
},
{
"epoch": 0.554075652637187,
"grad_norm": 0.6484375,
"learning_rate": 8e-05,
"loss": 0.8196179270744324,
"step": 2080
},
{
"epoch": 0.5546084176877997,
"grad_norm": 0.63671875,
"learning_rate": 8e-05,
"loss": 0.782852292060852,
"step": 2082
},
{
"epoch": 0.5551411827384124,
"grad_norm": 0.609375,
"learning_rate": 8e-05,
"loss": 0.7460805177688599,
"step": 2084
},
{
"epoch": 0.555673947789025,
"grad_norm": 0.5859375,
"learning_rate": 8e-05,
"loss": 0.7984696626663208,
"step": 2086
},
{
"epoch": 0.5562067128396377,
"grad_norm": 0.6171875,
"learning_rate": 8e-05,
"loss": 0.777360200881958,
"step": 2088
},
{
"epoch": 0.5567394778902504,
"grad_norm": 0.625,
"learning_rate": 8e-05,
"loss": 0.7777242660522461,
"step": 2090
},
{
"epoch": 0.5572722429408631,
"grad_norm": 0.59375,
"learning_rate": 8e-05,
"loss": 0.7538855671882629,
"step": 2092
},
{
"epoch": 0.5578050079914758,
"grad_norm": 0.6328125,
"learning_rate": 8e-05,
"loss": 0.7709717750549316,
"step": 2094
},
{
"epoch": 0.5583377730420884,
"grad_norm": 0.5859375,
"learning_rate": 8e-05,
"loss": 0.7503103613853455,
"step": 2096
},
{
"epoch": 0.5588705380927012,
"grad_norm": 0.62890625,
"learning_rate": 8e-05,
"loss": 0.775667130947113,
"step": 2098
},
{
"epoch": 0.5594033031433138,
"grad_norm": 0.62109375,
"learning_rate": 8e-05,
"loss": 0.7580850720405579,
"step": 2100
},
{
"epoch": 0.5599360681939265,
"grad_norm": 0.609375,
"learning_rate": 8e-05,
"loss": 0.7868179678916931,
"step": 2102
},
{
"epoch": 0.5604688332445391,
"grad_norm": 0.60546875,
"learning_rate": 8e-05,
"loss": 0.7456986308097839,
"step": 2104
},
{
"epoch": 0.5610015982951518,
"grad_norm": 0.6015625,
"learning_rate": 8e-05,
"loss": 0.7889756560325623,
"step": 2106
},
{
"epoch": 0.5615343633457646,
"grad_norm": 0.60546875,
"learning_rate": 8e-05,
"loss": 0.7284671664237976,
"step": 2108
},
{
"epoch": 0.5620671283963772,
"grad_norm": 0.6015625,
"learning_rate": 8e-05,
"loss": 0.7758548259735107,
"step": 2110
},
{
"epoch": 0.5625998934469899,
"grad_norm": 0.65625,
"learning_rate": 8e-05,
"loss": 0.7966741919517517,
"step": 2112
},
{
"epoch": 0.5631326584976025,
"grad_norm": 0.62890625,
"learning_rate": 8e-05,
"loss": 0.8286765217781067,
"step": 2114
},
{
"epoch": 0.5636654235482152,
"grad_norm": 0.640625,
"learning_rate": 8e-05,
"loss": 0.8168929815292358,
"step": 2116
},
{
"epoch": 0.5641981885988279,
"grad_norm": 0.61328125,
"learning_rate": 8e-05,
"loss": 0.7689355611801147,
"step": 2118
},
{
"epoch": 0.5647309536494406,
"grad_norm": 0.59375,
"learning_rate": 8e-05,
"loss": 0.7302696108818054,
"step": 2120
},
{
"epoch": 0.5652637187000533,
"grad_norm": 0.59765625,
"learning_rate": 8e-05,
"loss": 0.759016752243042,
"step": 2122
},
{
"epoch": 0.5657964837506659,
"grad_norm": 0.6171875,
"learning_rate": 8e-05,
"loss": 0.8112335801124573,
"step": 2124
},
{
"epoch": 0.5663292488012787,
"grad_norm": 0.58984375,
"learning_rate": 8e-05,
"loss": 0.7880202531814575,
"step": 2126
},
{
"epoch": 0.5668620138518913,
"grad_norm": 0.578125,
"learning_rate": 8e-05,
"loss": 0.7357752323150635,
"step": 2128
},
{
"epoch": 0.567394778902504,
"grad_norm": 0.59375,
"learning_rate": 8e-05,
"loss": 0.8004807233810425,
"step": 2130
},
{
"epoch": 0.5679275439531166,
"grad_norm": 0.58203125,
"learning_rate": 8e-05,
"loss": 0.7320253252983093,
"step": 2132
},
{
"epoch": 0.5684603090037293,
"grad_norm": 0.58203125,
"learning_rate": 8e-05,
"loss": 0.7926676273345947,
"step": 2134
},
{
"epoch": 0.5689930740543421,
"grad_norm": 0.578125,
"learning_rate": 8e-05,
"loss": 0.7315946817398071,
"step": 2136
},
{
"epoch": 0.5695258391049547,
"grad_norm": 0.60546875,
"learning_rate": 8e-05,
"loss": 0.7578614950180054,
"step": 2138
},
{
"epoch": 0.5700586041555674,
"grad_norm": 0.59375,
"learning_rate": 8e-05,
"loss": 0.759326696395874,
"step": 2140
},
{
"epoch": 0.57059136920618,
"grad_norm": 0.6171875,
"learning_rate": 8e-05,
"loss": 0.7619845867156982,
"step": 2142
},
{
"epoch": 0.5711241342567928,
"grad_norm": 0.69921875,
"learning_rate": 8e-05,
"loss": 0.7644577622413635,
"step": 2144
},
{
"epoch": 0.5716568993074055,
"grad_norm": 0.640625,
"learning_rate": 8e-05,
"loss": 0.7847421765327454,
"step": 2146
},
{
"epoch": 0.5721896643580181,
"grad_norm": 0.5859375,
"learning_rate": 8e-05,
"loss": 0.7709816098213196,
"step": 2148
},
{
"epoch": 0.5727224294086308,
"grad_norm": 0.57421875,
"learning_rate": 8e-05,
"loss": 0.7508271932601929,
"step": 2150
},
{
"epoch": 0.5732551944592434,
"grad_norm": 0.61328125,
"learning_rate": 8e-05,
"loss": 0.7665797472000122,
"step": 2152
},
{
"epoch": 0.5737879595098562,
"grad_norm": 0.62109375,
"learning_rate": 8e-05,
"loss": 0.7920853495597839,
"step": 2154
},
{
"epoch": 0.5743207245604688,
"grad_norm": 0.59765625,
"learning_rate": 8e-05,
"loss": 0.7717834711074829,
"step": 2156
},
{
"epoch": 0.5748534896110815,
"grad_norm": 0.61328125,
"learning_rate": 8e-05,
"loss": 0.7623285055160522,
"step": 2158
},
{
"epoch": 0.5753862546616942,
"grad_norm": 0.609375,
"learning_rate": 8e-05,
"loss": 0.7920824885368347,
"step": 2160
},
{
"epoch": 0.5759190197123069,
"grad_norm": 0.60546875,
"learning_rate": 8e-05,
"loss": 0.7590711116790771,
"step": 2162
},
{
"epoch": 0.5764517847629196,
"grad_norm": 0.60546875,
"learning_rate": 8e-05,
"loss": 0.7452772855758667,
"step": 2164
},
{
"epoch": 0.5769845498135322,
"grad_norm": 0.60546875,
"learning_rate": 8e-05,
"loss": 0.7672356963157654,
"step": 2166
},
{
"epoch": 0.5775173148641449,
"grad_norm": 0.59765625,
"learning_rate": 8e-05,
"loss": 0.797944188117981,
"step": 2168
},
{
"epoch": 0.5780500799147575,
"grad_norm": 0.625,
"learning_rate": 8e-05,
"loss": 0.7617903351783752,
"step": 2170
},
{
"epoch": 0.5785828449653703,
"grad_norm": 0.5859375,
"learning_rate": 8e-05,
"loss": 0.763174295425415,
"step": 2172
},
{
"epoch": 0.579115610015983,
"grad_norm": 0.63671875,
"learning_rate": 8e-05,
"loss": 0.7957082390785217,
"step": 2174
},
{
"epoch": 0.5796483750665956,
"grad_norm": 0.640625,
"learning_rate": 8e-05,
"loss": 0.7664093375205994,
"step": 2176
},
{
"epoch": 0.5801811401172083,
"grad_norm": 0.61328125,
"learning_rate": 8e-05,
"loss": 0.7532491087913513,
"step": 2178
},
{
"epoch": 0.580713905167821,
"grad_norm": 0.61328125,
"learning_rate": 8e-05,
"loss": 0.7795153856277466,
"step": 2180
},
{
"epoch": 0.5812466702184337,
"grad_norm": 0.60546875,
"learning_rate": 8e-05,
"loss": 0.7416831254959106,
"step": 2182
},
{
"epoch": 0.5817794352690463,
"grad_norm": 0.59765625,
"learning_rate": 8e-05,
"loss": 0.73810213804245,
"step": 2184
},
{
"epoch": 0.582312200319659,
"grad_norm": 0.59375,
"learning_rate": 8e-05,
"loss": 0.7496551871299744,
"step": 2186
},
{
"epoch": 0.5828449653702718,
"grad_norm": 0.65625,
"learning_rate": 8e-05,
"loss": 0.8033764958381653,
"step": 2188
},
{
"epoch": 0.5833777304208844,
"grad_norm": 0.59765625,
"learning_rate": 8e-05,
"loss": 0.7733720541000366,
"step": 2190
},
{
"epoch": 0.5839104954714971,
"grad_norm": 0.609375,
"learning_rate": 8e-05,
"loss": 0.8195459842681885,
"step": 2192
},
{
"epoch": 0.5844432605221097,
"grad_norm": 0.6015625,
"learning_rate": 8e-05,
"loss": 0.7616481781005859,
"step": 2194
},
{
"epoch": 0.5849760255727224,
"grad_norm": 0.59765625,
"learning_rate": 8e-05,
"loss": 0.7596360445022583,
"step": 2196
},
{
"epoch": 0.5855087906233352,
"grad_norm": 0.5859375,
"learning_rate": 8e-05,
"loss": 0.7688573598861694,
"step": 2198
},
{
"epoch": 0.5860415556739478,
"grad_norm": 0.61328125,
"learning_rate": 8e-05,
"loss": 0.8028122782707214,
"step": 2200
},
{
"epoch": 0.5865743207245605,
"grad_norm": 0.6015625,
"learning_rate": 8e-05,
"loss": 0.7449119687080383,
"step": 2202
},
{
"epoch": 0.5871070857751731,
"grad_norm": 0.63671875,
"learning_rate": 8e-05,
"loss": 0.7859646677970886,
"step": 2204
},
{
"epoch": 0.5876398508257858,
"grad_norm": 0.6015625,
"learning_rate": 8e-05,
"loss": 0.7646852731704712,
"step": 2206
},
{
"epoch": 0.5881726158763985,
"grad_norm": 0.609375,
"learning_rate": 8e-05,
"loss": 0.7665087580680847,
"step": 2208
},
{
"epoch": 0.5887053809270112,
"grad_norm": 0.6484375,
"learning_rate": 8e-05,
"loss": 0.8172298669815063,
"step": 2210
},
{
"epoch": 0.5892381459776239,
"grad_norm": 0.5859375,
"learning_rate": 8e-05,
"loss": 0.7623305916786194,
"step": 2212
},
{
"epoch": 0.5897709110282365,
"grad_norm": 0.6015625,
"learning_rate": 8e-05,
"loss": 0.7520396709442139,
"step": 2214
},
{
"epoch": 0.5903036760788493,
"grad_norm": 0.64453125,
"learning_rate": 8e-05,
"loss": 0.8028483390808105,
"step": 2216
},
{
"epoch": 0.5908364411294619,
"grad_norm": 0.61328125,
"learning_rate": 8e-05,
"loss": 0.7868152260780334,
"step": 2218
},
{
"epoch": 0.5913692061800746,
"grad_norm": 0.6171875,
"learning_rate": 8e-05,
"loss": 0.7606732249259949,
"step": 2220
},
{
"epoch": 0.5919019712306872,
"grad_norm": 0.625,
"learning_rate": 8e-05,
"loss": 0.7934197783470154,
"step": 2222
},
{
"epoch": 0.5924347362813,
"grad_norm": 0.6484375,
"learning_rate": 8e-05,
"loss": 0.7841813564300537,
"step": 2224
},
{
"epoch": 0.5929675013319127,
"grad_norm": 0.58984375,
"learning_rate": 8e-05,
"loss": 0.7516508102416992,
"step": 2226
},
{
"epoch": 0.5935002663825253,
"grad_norm": 0.66015625,
"learning_rate": 8e-05,
"loss": 0.810605525970459,
"step": 2228
},
{
"epoch": 0.594033031433138,
"grad_norm": 0.6171875,
"learning_rate": 8e-05,
"loss": 0.7940796613693237,
"step": 2230
},
{
"epoch": 0.5945657964837506,
"grad_norm": 0.61328125,
"learning_rate": 8e-05,
"loss": 0.7363699674606323,
"step": 2232
},
{
"epoch": 0.5950985615343634,
"grad_norm": 0.62109375,
"learning_rate": 8e-05,
"loss": 0.7718603610992432,
"step": 2234
},
{
"epoch": 0.595631326584976,
"grad_norm": 0.62890625,
"learning_rate": 8e-05,
"loss": 0.7635315656661987,
"step": 2236
},
{
"epoch": 0.5961640916355887,
"grad_norm": 0.6640625,
"learning_rate": 8e-05,
"loss": 0.7487787008285522,
"step": 2238
},
{
"epoch": 0.5966968566862014,
"grad_norm": 0.5859375,
"learning_rate": 8e-05,
"loss": 0.7251549959182739,
"step": 2240
},
{
"epoch": 0.597229621736814,
"grad_norm": 0.640625,
"learning_rate": 8e-05,
"loss": 0.7601324319839478,
"step": 2242
},
{
"epoch": 0.5977623867874268,
"grad_norm": 0.59765625,
"learning_rate": 8e-05,
"loss": 0.7800053954124451,
"step": 2244
},
{
"epoch": 0.5982951518380394,
"grad_norm": 0.62109375,
"learning_rate": 8e-05,
"loss": 0.8023176789283752,
"step": 2246
},
{
"epoch": 0.5988279168886521,
"grad_norm": 0.61328125,
"learning_rate": 8e-05,
"loss": 0.7815042734146118,
"step": 2248
},
{
"epoch": 0.5993606819392648,
"grad_norm": 0.6015625,
"learning_rate": 8e-05,
"loss": 0.7613332271575928,
"step": 2250
},
{
"epoch": 0.5998934469898775,
"grad_norm": 0.62109375,
"learning_rate": 8e-05,
"loss": 0.8300601243972778,
"step": 2252
},
{
"epoch": 0.6004262120404902,
"grad_norm": 0.58984375,
"learning_rate": 8e-05,
"loss": 0.7365957498550415,
"step": 2254
},
{
"epoch": 0.6009589770911028,
"grad_norm": 0.6328125,
"learning_rate": 8e-05,
"loss": 0.7763136625289917,
"step": 2256
},
{
"epoch": 0.6014917421417155,
"grad_norm": 0.6328125,
"learning_rate": 8e-05,
"loss": 0.7749415636062622,
"step": 2258
},
{
"epoch": 0.6020245071923281,
"grad_norm": 0.62109375,
"learning_rate": 8e-05,
"loss": 0.7965446710586548,
"step": 2260
},
{
"epoch": 0.6025572722429409,
"grad_norm": 0.59765625,
"learning_rate": 8e-05,
"loss": 0.7543548345565796,
"step": 2262
},
{
"epoch": 0.6030900372935536,
"grad_norm": 0.6015625,
"learning_rate": 8e-05,
"loss": 0.7496042251586914,
"step": 2264
},
{
"epoch": 0.6036228023441662,
"grad_norm": 0.6015625,
"learning_rate": 8e-05,
"loss": 0.7888254523277283,
"step": 2266
},
{
"epoch": 0.6041555673947789,
"grad_norm": 0.60546875,
"learning_rate": 8e-05,
"loss": 0.7962220311164856,
"step": 2268
},
{
"epoch": 0.6046883324453916,
"grad_norm": 0.58203125,
"learning_rate": 8e-05,
"loss": 0.7448618412017822,
"step": 2270
},
{
"epoch": 0.6052210974960043,
"grad_norm": 0.61328125,
"learning_rate": 8e-05,
"loss": 0.7599774599075317,
"step": 2272
},
{
"epoch": 0.6057538625466169,
"grad_norm": 0.6015625,
"learning_rate": 8e-05,
"loss": 0.7902323007583618,
"step": 2274
},
{
"epoch": 0.6062866275972296,
"grad_norm": 0.58203125,
"learning_rate": 8e-05,
"loss": 0.7423989176750183,
"step": 2276
},
{
"epoch": 0.6068193926478423,
"grad_norm": 0.62109375,
"learning_rate": 8e-05,
"loss": 0.7656657099723816,
"step": 2278
},
{
"epoch": 0.607352157698455,
"grad_norm": 0.6015625,
"learning_rate": 8e-05,
"loss": 0.7378427982330322,
"step": 2280
},
{
"epoch": 0.6078849227490677,
"grad_norm": 0.63671875,
"learning_rate": 8e-05,
"loss": 0.7627711296081543,
"step": 2282
},
{
"epoch": 0.6084176877996803,
"grad_norm": 0.6015625,
"learning_rate": 8e-05,
"loss": 0.7654937505722046,
"step": 2284
},
{
"epoch": 0.608950452850293,
"grad_norm": 0.58984375,
"learning_rate": 8e-05,
"loss": 0.7615671157836914,
"step": 2286
},
{
"epoch": 0.6094832179009056,
"grad_norm": 0.6171875,
"learning_rate": 8e-05,
"loss": 0.7753266096115112,
"step": 2288
},
{
"epoch": 0.6100159829515184,
"grad_norm": 0.59765625,
"learning_rate": 8e-05,
"loss": 0.7688212990760803,
"step": 2290
},
{
"epoch": 0.6105487480021311,
"grad_norm": 0.62890625,
"learning_rate": 8e-05,
"loss": 0.7590000033378601,
"step": 2292
},
{
"epoch": 0.6110815130527437,
"grad_norm": 0.59375,
"learning_rate": 8e-05,
"loss": 0.7802637815475464,
"step": 2294
},
{
"epoch": 0.6116142781033564,
"grad_norm": 0.59765625,
"learning_rate": 8e-05,
"loss": 0.7646292448043823,
"step": 2296
},
{
"epoch": 0.6121470431539691,
"grad_norm": 0.59765625,
"learning_rate": 8e-05,
"loss": 0.7465856671333313,
"step": 2298
},
{
"epoch": 0.6126798082045818,
"grad_norm": 0.58984375,
"learning_rate": 8e-05,
"loss": 0.755443811416626,
"step": 2300
},
{
"epoch": 0.6132125732551944,
"grad_norm": 0.6171875,
"learning_rate": 8e-05,
"loss": 0.7676706910133362,
"step": 2302
},
{
"epoch": 0.6137453383058071,
"grad_norm": 0.62109375,
"learning_rate": 8e-05,
"loss": 0.7980360388755798,
"step": 2304
},
{
"epoch": 0.6142781033564199,
"grad_norm": 0.6171875,
"learning_rate": 8e-05,
"loss": 0.7766455411911011,
"step": 2306
},
{
"epoch": 0.6148108684070325,
"grad_norm": 0.60546875,
"learning_rate": 8e-05,
"loss": 0.7732510566711426,
"step": 2308
},
{
"epoch": 0.6153436334576452,
"grad_norm": 0.58203125,
"learning_rate": 8e-05,
"loss": 0.7538511157035828,
"step": 2310
},
{
"epoch": 0.6158763985082578,
"grad_norm": 0.58984375,
"learning_rate": 8e-05,
"loss": 0.7400153875350952,
"step": 2312
},
{
"epoch": 0.6164091635588705,
"grad_norm": 0.625,
"learning_rate": 8e-05,
"loss": 0.8004501461982727,
"step": 2314
},
{
"epoch": 0.6169419286094833,
"grad_norm": 0.5703125,
"learning_rate": 8e-05,
"loss": 0.7622724771499634,
"step": 2316
},
{
"epoch": 0.6174746936600959,
"grad_norm": 0.60546875,
"learning_rate": 8e-05,
"loss": 0.759125292301178,
"step": 2318
},
{
"epoch": 0.6180074587107086,
"grad_norm": 0.58984375,
"learning_rate": 8e-05,
"loss": 0.7583044767379761,
"step": 2320
},
{
"epoch": 0.6185402237613212,
"grad_norm": 0.6015625,
"learning_rate": 8e-05,
"loss": 0.8017464876174927,
"step": 2322
},
{
"epoch": 0.619072988811934,
"grad_norm": 0.60546875,
"learning_rate": 8e-05,
"loss": 0.773422122001648,
"step": 2324
},
{
"epoch": 0.6196057538625466,
"grad_norm": 0.60546875,
"learning_rate": 8e-05,
"loss": 0.782919704914093,
"step": 2326
},
{
"epoch": 0.6201385189131593,
"grad_norm": 0.60546875,
"learning_rate": 8e-05,
"loss": 0.7374987602233887,
"step": 2328
},
{
"epoch": 0.620671283963772,
"grad_norm": 0.6015625,
"learning_rate": 8e-05,
"loss": 0.7503113746643066,
"step": 2330
},
{
"epoch": 0.6212040490143846,
"grad_norm": 0.59765625,
"learning_rate": 8e-05,
"loss": 0.7530639171600342,
"step": 2332
},
{
"epoch": 0.6217368140649974,
"grad_norm": 0.60546875,
"learning_rate": 8e-05,
"loss": 0.8065166473388672,
"step": 2334
},
{
"epoch": 0.62226957911561,
"grad_norm": 0.62890625,
"learning_rate": 8e-05,
"loss": 0.7694088816642761,
"step": 2336
},
{
"epoch": 0.6228023441662227,
"grad_norm": 0.61328125,
"learning_rate": 8e-05,
"loss": 0.8152406215667725,
"step": 2338
},
{
"epoch": 0.6233351092168353,
"grad_norm": 0.63671875,
"learning_rate": 8e-05,
"loss": 0.8167840242385864,
"step": 2340
},
{
"epoch": 0.623867874267448,
"grad_norm": 0.62109375,
"learning_rate": 8e-05,
"loss": 0.748583972454071,
"step": 2342
},
{
"epoch": 0.6244006393180608,
"grad_norm": 0.59375,
"learning_rate": 8e-05,
"loss": 0.7353124618530273,
"step": 2344
},
{
"epoch": 0.6249334043686734,
"grad_norm": 0.59765625,
"learning_rate": 8e-05,
"loss": 0.7833389043807983,
"step": 2346
},
{
"epoch": 0.6254661694192861,
"grad_norm": 0.5859375,
"learning_rate": 8e-05,
"loss": 0.7380432486534119,
"step": 2348
},
{
"epoch": 0.6259989344698987,
"grad_norm": 0.640625,
"learning_rate": 8e-05,
"loss": 0.8244600892066956,
"step": 2350
},
{
"epoch": 0.6265316995205115,
"grad_norm": 0.59375,
"learning_rate": 8e-05,
"loss": 0.7240447402000427,
"step": 2352
},
{
"epoch": 0.6270644645711241,
"grad_norm": 0.59375,
"learning_rate": 8e-05,
"loss": 0.7850971221923828,
"step": 2354
},
{
"epoch": 0.6275972296217368,
"grad_norm": 0.6015625,
"learning_rate": 8e-05,
"loss": 0.7789439558982849,
"step": 2356
},
{
"epoch": 0.6281299946723495,
"grad_norm": 0.62109375,
"learning_rate": 8e-05,
"loss": 0.8035836815834045,
"step": 2358
},
{
"epoch": 0.6286627597229622,
"grad_norm": 0.59375,
"learning_rate": 8e-05,
"loss": 0.7577446103096008,
"step": 2360
},
{
"epoch": 0.6291955247735749,
"grad_norm": 0.61328125,
"learning_rate": 8e-05,
"loss": 0.8039902448654175,
"step": 2362
},
{
"epoch": 0.6297282898241875,
"grad_norm": 0.6171875,
"learning_rate": 8e-05,
"loss": 0.7867581844329834,
"step": 2364
},
{
"epoch": 0.6302610548748002,
"grad_norm": 0.61328125,
"learning_rate": 8e-05,
"loss": 0.7361651062965393,
"step": 2366
},
{
"epoch": 0.630793819925413,
"grad_norm": 0.60546875,
"learning_rate": 8e-05,
"loss": 0.7749671339988708,
"step": 2368
},
{
"epoch": 0.6313265849760256,
"grad_norm": 0.62109375,
"learning_rate": 8e-05,
"loss": 0.7684974670410156,
"step": 2370
},
{
"epoch": 0.6318593500266383,
"grad_norm": 0.609375,
"learning_rate": 8e-05,
"loss": 0.7540404796600342,
"step": 2372
},
{
"epoch": 0.6323921150772509,
"grad_norm": 0.609375,
"learning_rate": 8e-05,
"loss": 0.7764151096343994,
"step": 2374
},
{
"epoch": 0.6329248801278636,
"grad_norm": 0.61328125,
"learning_rate": 8e-05,
"loss": 0.7495799660682678,
"step": 2376
},
{
"epoch": 0.6334576451784762,
"grad_norm": 0.72265625,
"learning_rate": 8e-05,
"loss": 0.7766825556755066,
"step": 2378
},
{
"epoch": 0.633990410229089,
"grad_norm": 0.640625,
"learning_rate": 8e-05,
"loss": 0.7477214336395264,
"step": 2380
},
{
"epoch": 0.6345231752797017,
"grad_norm": 0.625,
"learning_rate": 8e-05,
"loss": 0.7461574077606201,
"step": 2382
},
{
"epoch": 0.6350559403303143,
"grad_norm": 0.60546875,
"learning_rate": 8e-05,
"loss": 0.7609319090843201,
"step": 2384
},
{
"epoch": 0.635588705380927,
"grad_norm": 0.6015625,
"learning_rate": 8e-05,
"loss": 0.7841947674751282,
"step": 2386
},
{
"epoch": 0.6361214704315397,
"grad_norm": 0.609375,
"learning_rate": 8e-05,
"loss": 0.7920709848403931,
"step": 2388
},
{
"epoch": 0.6366542354821524,
"grad_norm": 0.61328125,
"learning_rate": 8e-05,
"loss": 0.7804465889930725,
"step": 2390
},
{
"epoch": 0.637187000532765,
"grad_norm": 0.57421875,
"learning_rate": 8e-05,
"loss": 0.7685946226119995,
"step": 2392
},
{
"epoch": 0.6377197655833777,
"grad_norm": 0.57421875,
"learning_rate": 8e-05,
"loss": 0.7582971453666687,
"step": 2394
},
{
"epoch": 0.6382525306339905,
"grad_norm": 0.62109375,
"learning_rate": 8e-05,
"loss": 0.7376336455345154,
"step": 2396
},
{
"epoch": 0.6387852956846031,
"grad_norm": 0.6171875,
"learning_rate": 8e-05,
"loss": 0.7812549471855164,
"step": 2398
},
{
"epoch": 0.6393180607352158,
"grad_norm": 0.59765625,
"learning_rate": 8e-05,
"loss": 0.7486171722412109,
"step": 2400
},
{
"epoch": 0.6398508257858284,
"grad_norm": 0.56640625,
"learning_rate": 8e-05,
"loss": 0.7864870429039001,
"step": 2402
},
{
"epoch": 0.6403835908364411,
"grad_norm": 0.60546875,
"learning_rate": 8e-05,
"loss": 0.7745766043663025,
"step": 2404
},
{
"epoch": 0.6409163558870538,
"grad_norm": 0.56640625,
"learning_rate": 8e-05,
"loss": 0.7579283118247986,
"step": 2406
},
{
"epoch": 0.6414491209376665,
"grad_norm": 0.6015625,
"learning_rate": 8e-05,
"loss": 0.7351487278938293,
"step": 2408
},
{
"epoch": 0.6419818859882792,
"grad_norm": 0.58984375,
"learning_rate": 8e-05,
"loss": 0.8096469044685364,
"step": 2410
},
{
"epoch": 0.6425146510388918,
"grad_norm": 0.58984375,
"learning_rate": 8e-05,
"loss": 0.7487743496894836,
"step": 2412
},
{
"epoch": 0.6430474160895046,
"grad_norm": 0.6015625,
"learning_rate": 8e-05,
"loss": 0.7955554127693176,
"step": 2414
},
{
"epoch": 0.6435801811401172,
"grad_norm": 0.61328125,
"learning_rate": 8e-05,
"loss": 0.7598539590835571,
"step": 2416
},
{
"epoch": 0.6441129461907299,
"grad_norm": 0.59765625,
"learning_rate": 8e-05,
"loss": 0.794310986995697,
"step": 2418
},
{
"epoch": 0.6446457112413426,
"grad_norm": 0.57421875,
"learning_rate": 8e-05,
"loss": 0.7372432947158813,
"step": 2420
},
{
"epoch": 0.6451784762919552,
"grad_norm": 0.62890625,
"learning_rate": 8e-05,
"loss": 0.7485856413841248,
"step": 2422
},
{
"epoch": 0.645711241342568,
"grad_norm": 0.59765625,
"learning_rate": 8e-05,
"loss": 0.7526128888130188,
"step": 2424
},
{
"epoch": 0.6462440063931806,
"grad_norm": 0.609375,
"learning_rate": 8e-05,
"loss": 0.7520730495452881,
"step": 2426
},
{
"epoch": 0.6467767714437933,
"grad_norm": 0.58203125,
"learning_rate": 8e-05,
"loss": 0.7413308024406433,
"step": 2428
},
{
"epoch": 0.6473095364944059,
"grad_norm": 0.6171875,
"learning_rate": 8e-05,
"loss": 0.7861500382423401,
"step": 2430
},
{
"epoch": 0.6478423015450187,
"grad_norm": 0.625,
"learning_rate": 8e-05,
"loss": 0.8049200773239136,
"step": 2432
},
{
"epoch": 0.6483750665956314,
"grad_norm": 0.59375,
"learning_rate": 8e-05,
"loss": 0.7288910150527954,
"step": 2434
},
{
"epoch": 0.648907831646244,
"grad_norm": 0.6015625,
"learning_rate": 8e-05,
"loss": 0.7674762606620789,
"step": 2436
},
{
"epoch": 0.6494405966968567,
"grad_norm": 0.61328125,
"learning_rate": 8e-05,
"loss": 0.79649418592453,
"step": 2438
},
{
"epoch": 0.6499733617474693,
"grad_norm": 0.5859375,
"learning_rate": 8e-05,
"loss": 0.7906514406204224,
"step": 2440
},
{
"epoch": 0.6505061267980821,
"grad_norm": 0.58203125,
"learning_rate": 8e-05,
"loss": 0.7719740867614746,
"step": 2442
},
{
"epoch": 0.6510388918486947,
"grad_norm": 0.61328125,
"learning_rate": 8e-05,
"loss": 0.8404486179351807,
"step": 2444
},
{
"epoch": 0.6515716568993074,
"grad_norm": 0.6015625,
"learning_rate": 8e-05,
"loss": 0.7465118169784546,
"step": 2446
},
{
"epoch": 0.6521044219499201,
"grad_norm": 0.609375,
"learning_rate": 8e-05,
"loss": 0.7870571613311768,
"step": 2448
},
{
"epoch": 0.6526371870005327,
"grad_norm": 0.62890625,
"learning_rate": 8e-05,
"loss": 0.7906562089920044,
"step": 2450
},
{
"epoch": 0.6531699520511455,
"grad_norm": 0.58984375,
"learning_rate": 8e-05,
"loss": 0.7247576713562012,
"step": 2452
},
{
"epoch": 0.6537027171017581,
"grad_norm": 0.609375,
"learning_rate": 8e-05,
"loss": 0.7732218503952026,
"step": 2454
},
{
"epoch": 0.6542354821523708,
"grad_norm": 0.5859375,
"learning_rate": 8e-05,
"loss": 0.7541797161102295,
"step": 2456
},
{
"epoch": 0.6547682472029834,
"grad_norm": 0.625,
"learning_rate": 8e-05,
"loss": 0.7637116312980652,
"step": 2458
},
{
"epoch": 0.6553010122535962,
"grad_norm": 0.609375,
"learning_rate": 8e-05,
"loss": 0.7784105539321899,
"step": 2460
},
{
"epoch": 0.6558337773042089,
"grad_norm": 0.6328125,
"learning_rate": 8e-05,
"loss": 0.7532057166099548,
"step": 2462
},
{
"epoch": 0.6563665423548215,
"grad_norm": 0.609375,
"learning_rate": 8e-05,
"loss": 0.7266465425491333,
"step": 2464
},
{
"epoch": 0.6568993074054342,
"grad_norm": 0.58203125,
"learning_rate": 8e-05,
"loss": 0.7385075688362122,
"step": 2466
},
{
"epoch": 0.6574320724560468,
"grad_norm": 0.609375,
"learning_rate": 8e-05,
"loss": 0.7806388139724731,
"step": 2468
},
{
"epoch": 0.6579648375066596,
"grad_norm": 0.609375,
"learning_rate": 8e-05,
"loss": 0.7638847827911377,
"step": 2470
},
{
"epoch": 0.6584976025572722,
"grad_norm": 0.59375,
"learning_rate": 8e-05,
"loss": 0.7466439604759216,
"step": 2472
},
{
"epoch": 0.6590303676078849,
"grad_norm": 0.58984375,
"learning_rate": 8e-05,
"loss": 0.7079021334648132,
"step": 2474
},
{
"epoch": 0.6595631326584976,
"grad_norm": 0.59765625,
"learning_rate": 8e-05,
"loss": 0.7956612706184387,
"step": 2476
},
{
"epoch": 0.6600958977091103,
"grad_norm": 0.60546875,
"learning_rate": 8e-05,
"loss": 0.8124134540557861,
"step": 2478
},
{
"epoch": 0.660628662759723,
"grad_norm": 0.5703125,
"learning_rate": 7.999992767148906e-05,
"loss": 0.7136578559875488,
"step": 2480
},
{
"epoch": 0.6611614278103356,
"grad_norm": 0.609375,
"learning_rate": 7.999934904497092e-05,
"loss": 0.7554067373275757,
"step": 2482
},
{
"epoch": 0.6616941928609483,
"grad_norm": 0.58203125,
"learning_rate": 7.999819180030488e-05,
"loss": 0.7867137789726257,
"step": 2484
},
{
"epoch": 0.662226957911561,
"grad_norm": 0.62109375,
"learning_rate": 7.999645595423128e-05,
"loss": 0.8369832634925842,
"step": 2486
},
{
"epoch": 0.6627597229621737,
"grad_norm": 0.58203125,
"learning_rate": 7.999414153186031e-05,
"loss": 0.745606005191803,
"step": 2488
},
{
"epoch": 0.6632924880127864,
"grad_norm": 0.58203125,
"learning_rate": 7.999124856667172e-05,
"loss": 0.7783620357513428,
"step": 2490
},
{
"epoch": 0.663825253063399,
"grad_norm": 0.62109375,
"learning_rate": 7.998777710051422e-05,
"loss": 0.7496659159660339,
"step": 2492
},
{
"epoch": 0.6643580181140117,
"grad_norm": 0.58984375,
"learning_rate": 7.998372718360495e-05,
"loss": 0.735636293888092,
"step": 2494
},
{
"epoch": 0.6648907831646244,
"grad_norm": 0.59765625,
"learning_rate": 7.997909887452878e-05,
"loss": 0.770561158657074,
"step": 2496
},
{
"epoch": 0.6654235482152371,
"grad_norm": 0.61328125,
"learning_rate": 7.997389224023737e-05,
"loss": 0.7702518105506897,
"step": 2498
},
{
"epoch": 0.6659563132658498,
"grad_norm": 0.62890625,
"learning_rate": 7.996810735604828e-05,
"loss": 0.8403434753417969,
"step": 2500
},
{
"epoch": 0.6664890783164624,
"grad_norm": 0.5859375,
"learning_rate": 7.996174430564384e-05,
"loss": 0.775733232498169,
"step": 2502
},
{
"epoch": 0.6670218433670752,
"grad_norm": 0.578125,
"learning_rate": 7.995480318106997e-05,
"loss": 0.7447243928909302,
"step": 2504
},
{
"epoch": 0.6675546084176878,
"grad_norm": 0.60546875,
"learning_rate": 7.99472840827348e-05,
"loss": 0.7677955627441406,
"step": 2506
},
{
"epoch": 0.6680873734683005,
"grad_norm": 0.58984375,
"learning_rate": 7.99391871194073e-05,
"loss": 0.7605108022689819,
"step": 2508
},
{
"epoch": 0.6686201385189131,
"grad_norm": 0.60546875,
"learning_rate": 7.993051240821559e-05,
"loss": 0.7609809041023254,
"step": 2510
},
{
"epoch": 0.6691529035695258,
"grad_norm": 0.58203125,
"learning_rate": 7.992126007464537e-05,
"loss": 0.7586944699287415,
"step": 2512
},
{
"epoch": 0.6696856686201386,
"grad_norm": 0.6015625,
"learning_rate": 7.991143025253801e-05,
"loss": 0.7763959765434265,
"step": 2514
},
{
"epoch": 0.6702184336707512,
"grad_norm": 0.6015625,
"learning_rate": 7.990102308408867e-05,
"loss": 0.7569335103034973,
"step": 2516
},
{
"epoch": 0.6707511987213639,
"grad_norm": 0.59765625,
"learning_rate": 7.98900387198442e-05,
"loss": 0.7567064166069031,
"step": 2518
},
{
"epoch": 0.6712839637719765,
"grad_norm": 0.62109375,
"learning_rate": 7.9878477318701e-05,
"loss": 0.8084724545478821,
"step": 2520
},
{
"epoch": 0.6718167288225892,
"grad_norm": 0.59765625,
"learning_rate": 7.98663390479027e-05,
"loss": 0.748503565788269,
"step": 2522
},
{
"epoch": 0.6723494938732019,
"grad_norm": 0.59765625,
"learning_rate": 7.985362408303778e-05,
"loss": 0.7644037008285522,
"step": 2524
},
{
"epoch": 0.6728822589238146,
"grad_norm": 0.625,
"learning_rate": 7.984033260803695e-05,
"loss": 0.7853304743766785,
"step": 2526
},
{
"epoch": 0.6734150239744273,
"grad_norm": 0.61328125,
"learning_rate": 7.982646481517054e-05,
"loss": 0.7597571015357971,
"step": 2528
},
{
"epoch": 0.6739477890250399,
"grad_norm": 0.578125,
"learning_rate": 7.981202090504576e-05,
"loss": 0.7463767528533936,
"step": 2530
},
{
"epoch": 0.6744805540756527,
"grad_norm": 0.60546875,
"learning_rate": 7.97970010866037e-05,
"loss": 0.7673113346099854,
"step": 2532
},
{
"epoch": 0.6750133191262653,
"grad_norm": 0.5859375,
"learning_rate": 7.97814055771164e-05,
"loss": 0.7825121879577637,
"step": 2534
},
{
"epoch": 0.675546084176878,
"grad_norm": 0.58984375,
"learning_rate": 7.976523460218363e-05,
"loss": 0.7622671723365784,
"step": 2536
},
{
"epoch": 0.6760788492274907,
"grad_norm": 0.57421875,
"learning_rate": 7.974848839572971e-05,
"loss": 0.722598671913147,
"step": 2538
},
{
"epoch": 0.6766116142781033,
"grad_norm": 0.60546875,
"learning_rate": 7.973116720000005e-05,
"loss": 0.7474793791770935,
"step": 2540
},
{
"epoch": 0.6771443793287161,
"grad_norm": 0.59375,
"learning_rate": 7.971327126555767e-05,
"loss": 0.7257604598999023,
"step": 2542
},
{
"epoch": 0.6776771443793287,
"grad_norm": 0.5859375,
"learning_rate": 7.96948008512796e-05,
"loss": 0.7461668252944946,
"step": 2544
},
{
"epoch": 0.6782099094299414,
"grad_norm": 0.58984375,
"learning_rate": 7.967575622435313e-05,
"loss": 0.75556880235672,
"step": 2546
},
{
"epoch": 0.678742674480554,
"grad_norm": 0.6015625,
"learning_rate": 7.965613766027188e-05,
"loss": 0.796931803226471,
"step": 2548
},
{
"epoch": 0.6792754395311668,
"grad_norm": 0.59375,
"learning_rate": 7.963594544283193e-05,
"loss": 0.7720798850059509,
"step": 2550
},
{
"epoch": 0.6798082045817795,
"grad_norm": 0.56640625,
"learning_rate": 7.96151798641276e-05,
"loss": 0.7493809461593628,
"step": 2552
},
{
"epoch": 0.6803409696323921,
"grad_norm": 0.5859375,
"learning_rate": 7.959384122454729e-05,
"loss": 0.7640602588653564,
"step": 2554
},
{
"epoch": 0.6808737346830048,
"grad_norm": 0.55859375,
"learning_rate": 7.957192983276915e-05,
"loss": 0.716975212097168,
"step": 2556
},
{
"epoch": 0.6814064997336174,
"grad_norm": 0.59765625,
"learning_rate": 7.954944600575654e-05,
"loss": 0.7710456848144531,
"step": 2558
},
{
"epoch": 0.6819392647842302,
"grad_norm": 0.62109375,
"learning_rate": 7.952639006875353e-05,
"loss": 0.7932459712028503,
"step": 2560
},
{
"epoch": 0.6824720298348428,
"grad_norm": 0.578125,
"learning_rate": 7.950276235528011e-05,
"loss": 0.7565551400184631,
"step": 2562
},
{
"epoch": 0.6830047948854555,
"grad_norm": 0.60546875,
"learning_rate": 7.947856320712746e-05,
"loss": 0.7231839895248413,
"step": 2564
},
{
"epoch": 0.6835375599360682,
"grad_norm": 0.578125,
"learning_rate": 7.945379297435294e-05,
"loss": 0.7507690787315369,
"step": 2566
},
{
"epoch": 0.6840703249866809,
"grad_norm": 0.60546875,
"learning_rate": 7.942845201527501e-05,
"loss": 0.7406589388847351,
"step": 2568
},
{
"epoch": 0.6846030900372936,
"grad_norm": 0.5703125,
"learning_rate": 7.940254069646813e-05,
"loss": 0.7486617565155029,
"step": 2570
},
{
"epoch": 0.6851358550879062,
"grad_norm": 0.58203125,
"learning_rate": 7.937605939275736e-05,
"loss": 0.7797908186912537,
"step": 2572
},
{
"epoch": 0.6856686201385189,
"grad_norm": 0.62109375,
"learning_rate": 7.934900848721304e-05,
"loss": 0.8399383425712585,
"step": 2574
},
{
"epoch": 0.6862013851891315,
"grad_norm": 0.58984375,
"learning_rate": 7.932138837114512e-05,
"loss": 0.7333605289459229,
"step": 2576
},
{
"epoch": 0.6867341502397443,
"grad_norm": 0.6015625,
"learning_rate": 7.929319944409765e-05,
"loss": 0.776554524898529,
"step": 2578
},
{
"epoch": 0.687266915290357,
"grad_norm": 0.5859375,
"learning_rate": 7.926444211384286e-05,
"loss": 0.7380905151367188,
"step": 2580
},
{
"epoch": 0.6877996803409696,
"grad_norm": 0.56640625,
"learning_rate": 7.923511679637534e-05,
"loss": 0.7293626070022583,
"step": 2582
},
{
"epoch": 0.6883324453915823,
"grad_norm": 0.58203125,
"learning_rate": 7.920522391590604e-05,
"loss": 0.7142640352249146,
"step": 2584
},
{
"epoch": 0.688865210442195,
"grad_norm": 0.61328125,
"learning_rate": 7.917476390485606e-05,
"loss": 0.8078116178512573,
"step": 2586
},
{
"epoch": 0.6893979754928077,
"grad_norm": 0.59765625,
"learning_rate": 7.914373720385048e-05,
"loss": 0.7499976754188538,
"step": 2588
},
{
"epoch": 0.6899307405434204,
"grad_norm": 0.59375,
"learning_rate": 7.911214426171187e-05,
"loss": 0.7751161456108093,
"step": 2590
},
{
"epoch": 0.690463505594033,
"grad_norm": 0.61328125,
"learning_rate": 7.907998553545392e-05,
"loss": 0.7775903344154358,
"step": 2592
},
{
"epoch": 0.6909962706446457,
"grad_norm": 0.5859375,
"learning_rate": 7.904726149027479e-05,
"loss": 0.7565440535545349,
"step": 2594
},
{
"epoch": 0.6915290356952584,
"grad_norm": 0.58984375,
"learning_rate": 7.901397259955032e-05,
"loss": 0.7655493021011353,
"step": 2596
},
{
"epoch": 0.6920618007458711,
"grad_norm": 0.609375,
"learning_rate": 7.898011934482725e-05,
"loss": 0.7513885498046875,
"step": 2598
},
{
"epoch": 0.6925945657964837,
"grad_norm": 0.6015625,
"learning_rate": 7.894570221581627e-05,
"loss": 0.7957122325897217,
"step": 2600
},
{
"epoch": 0.6931273308470964,
"grad_norm": 0.61328125,
"learning_rate": 7.891072171038483e-05,
"loss": 0.752173125743866,
"step": 2602
},
{
"epoch": 0.6936600958977092,
"grad_norm": 0.60546875,
"learning_rate": 7.887517833455007e-05,
"loss": 0.7955760359764099,
"step": 2604
},
{
"epoch": 0.6941928609483218,
"grad_norm": 0.58984375,
"learning_rate": 7.883907260247141e-05,
"loss": 0.7562192678451538,
"step": 2606
},
{
"epoch": 0.6947256259989345,
"grad_norm": 0.62109375,
"learning_rate": 7.880240503644314e-05,
"loss": 0.7903412580490112,
"step": 2608
},
{
"epoch": 0.6952583910495471,
"grad_norm": 0.58984375,
"learning_rate": 7.876517616688689e-05,
"loss": 0.722710371017456,
"step": 2610
},
{
"epoch": 0.6957911561001598,
"grad_norm": 0.59765625,
"learning_rate": 7.872738653234387e-05,
"loss": 0.7610102891921997,
"step": 2612
},
{
"epoch": 0.6963239211507725,
"grad_norm": 0.58984375,
"learning_rate": 7.868903667946723e-05,
"loss": 0.7523242831230164,
"step": 2614
},
{
"epoch": 0.6968566862013852,
"grad_norm": 0.60546875,
"learning_rate": 7.865012716301399e-05,
"loss": 0.7445037364959717,
"step": 2616
},
{
"epoch": 0.6973894512519979,
"grad_norm": 0.578125,
"learning_rate": 7.861065854583715e-05,
"loss": 0.7576636075973511,
"step": 2618
},
{
"epoch": 0.6979222163026105,
"grad_norm": 0.578125,
"learning_rate": 7.857063139887742e-05,
"loss": 0.7409178614616394,
"step": 2620
},
{
"epoch": 0.6984549813532233,
"grad_norm": 0.59375,
"learning_rate": 7.85300463011551e-05,
"loss": 0.7388145923614502,
"step": 2622
},
{
"epoch": 0.6989877464038359,
"grad_norm": 0.60546875,
"learning_rate": 7.848890383976155e-05,
"loss": 0.7784285545349121,
"step": 2624
},
{
"epoch": 0.6995205114544486,
"grad_norm": 0.55859375,
"learning_rate": 7.844720460985086e-05,
"loss": 0.7128530144691467,
"step": 2626
},
{
"epoch": 0.7000532765050612,
"grad_norm": 0.59375,
"learning_rate": 7.840494921463112e-05,
"loss": 0.7656941413879395,
"step": 2628
},
{
"epoch": 0.7005860415556739,
"grad_norm": 0.609375,
"learning_rate": 7.836213826535574e-05,
"loss": 0.8078965544700623,
"step": 2630
},
{
"epoch": 0.7011188066062867,
"grad_norm": 0.5703125,
"learning_rate": 7.831877238131459e-05,
"loss": 0.7390220165252686,
"step": 2632
},
{
"epoch": 0.7016515716568993,
"grad_norm": 0.58203125,
"learning_rate": 7.82748521898251e-05,
"loss": 0.7566484212875366,
"step": 2634
},
{
"epoch": 0.702184336707512,
"grad_norm": 0.58203125,
"learning_rate": 7.823037832622307e-05,
"loss": 0.7349902391433716,
"step": 2636
},
{
"epoch": 0.7027171017581246,
"grad_norm": 0.57421875,
"learning_rate": 7.818535143385359e-05,
"loss": 0.7549850940704346,
"step": 2638
},
{
"epoch": 0.7032498668087374,
"grad_norm": 0.625,
"learning_rate": 7.81397721640617e-05,
"loss": 0.7904667258262634,
"step": 2640
},
{
"epoch": 0.70378263185935,
"grad_norm": 0.60546875,
"learning_rate": 7.809364117618295e-05,
"loss": 0.7925822734832764,
"step": 2642
},
{
"epoch": 0.7043153969099627,
"grad_norm": 0.57421875,
"learning_rate": 7.804695913753385e-05,
"loss": 0.7261655926704407,
"step": 2644
},
{
"epoch": 0.7048481619605754,
"grad_norm": 0.625,
"learning_rate": 7.799972672340226e-05,
"loss": 0.8227925896644592,
"step": 2646
},
{
"epoch": 0.705380927011188,
"grad_norm": 0.56640625,
"learning_rate": 7.795194461703763e-05,
"loss": 0.6952813863754272,
"step": 2648
},
{
"epoch": 0.7059136920618008,
"grad_norm": 0.5859375,
"learning_rate": 7.790361350964101e-05,
"loss": 0.7714648246765137,
"step": 2650
},
{
"epoch": 0.7064464571124134,
"grad_norm": 0.5859375,
"learning_rate": 7.78547341003552e-05,
"loss": 0.756645143032074,
"step": 2652
},
{
"epoch": 0.7069792221630261,
"grad_norm": 0.5859375,
"learning_rate": 7.780530709625455e-05,
"loss": 0.74300217628479,
"step": 2654
},
{
"epoch": 0.7075119872136388,
"grad_norm": 0.58203125,
"learning_rate": 7.775533321233471e-05,
"loss": 0.7490818500518799,
"step": 2656
},
{
"epoch": 0.7080447522642515,
"grad_norm": 0.578125,
"learning_rate": 7.770481317150236e-05,
"loss": 0.7314748167991638,
"step": 2658
},
{
"epoch": 0.7085775173148642,
"grad_norm": 0.578125,
"learning_rate": 7.765374770456471e-05,
"loss": 0.7653273344039917,
"step": 2660
},
{
"epoch": 0.7091102823654768,
"grad_norm": 0.58203125,
"learning_rate": 7.760213755021892e-05,
"loss": 0.784086287021637,
"step": 2662
},
{
"epoch": 0.7096430474160895,
"grad_norm": 0.57421875,
"learning_rate": 7.754998345504141e-05,
"loss": 0.7511420249938965,
"step": 2664
},
{
"epoch": 0.7101758124667021,
"grad_norm": 0.6171875,
"learning_rate": 7.749728617347717e-05,
"loss": 0.77214115858078,
"step": 2666
},
{
"epoch": 0.7107085775173149,
"grad_norm": 0.64453125,
"learning_rate": 7.744404646782866e-05,
"loss": 0.7471661567687988,
"step": 2668
},
{
"epoch": 0.7112413425679276,
"grad_norm": 0.5859375,
"learning_rate": 7.739026510824489e-05,
"loss": 0.8218797445297241,
"step": 2670
},
{
"epoch": 0.7117741076185402,
"grad_norm": 0.58984375,
"learning_rate": 7.73359428727103e-05,
"loss": 0.7144700288772583,
"step": 2672
},
{
"epoch": 0.7123068726691529,
"grad_norm": 0.58203125,
"learning_rate": 7.72810805470335e-05,
"loss": 0.7447347640991211,
"step": 2674
},
{
"epoch": 0.7128396377197656,
"grad_norm": 0.58203125,
"learning_rate": 7.722567892483579e-05,
"loss": 0.7260941863059998,
"step": 2676
},
{
"epoch": 0.7133724027703783,
"grad_norm": 0.59765625,
"learning_rate": 7.716973880753982e-05,
"loss": 0.7791821360588074,
"step": 2678
},
{
"epoch": 0.7139051678209909,
"grad_norm": 0.57421875,
"learning_rate": 7.711326100435796e-05,
"loss": 0.7719851732254028,
"step": 2680
},
{
"epoch": 0.7144379328716036,
"grad_norm": 0.5703125,
"learning_rate": 7.70562463322805e-05,
"loss": 0.7492231130599976,
"step": 2682
},
{
"epoch": 0.7149706979222163,
"grad_norm": 0.5703125,
"learning_rate": 7.699869561606403e-05,
"loss": 0.7836228609085083,
"step": 2684
},
{
"epoch": 0.715503462972829,
"grad_norm": 0.578125,
"learning_rate": 7.694060968821927e-05,
"loss": 0.7258511781692505,
"step": 2686
},
{
"epoch": 0.7160362280234417,
"grad_norm": 0.59375,
"learning_rate": 7.688198938899922e-05,
"loss": 0.7510860562324524,
"step": 2688
},
{
"epoch": 0.7165689930740543,
"grad_norm": 0.59765625,
"learning_rate": 7.682283556638689e-05,
"loss": 0.7514731884002686,
"step": 2690
},
{
"epoch": 0.717101758124667,
"grad_norm": 0.59765625,
"learning_rate": 7.676314907608309e-05,
"loss": 0.7466673254966736,
"step": 2692
},
{
"epoch": 0.7176345231752796,
"grad_norm": 0.58203125,
"learning_rate": 7.670293078149403e-05,
"loss": 0.7264184951782227,
"step": 2694
},
{
"epoch": 0.7181672882258924,
"grad_norm": 0.59765625,
"learning_rate": 7.664218155371884e-05,
"loss": 0.7410547733306885,
"step": 2696
},
{
"epoch": 0.7187000532765051,
"grad_norm": 0.6015625,
"learning_rate": 7.658090227153697e-05,
"loss": 0.7337695360183716,
"step": 2698
},
{
"epoch": 0.7192328183271177,
"grad_norm": 0.55859375,
"learning_rate": 7.651909382139545e-05,
"loss": 0.7203134894371033,
"step": 2700
},
{
"epoch": 0.7197655833777304,
"grad_norm": 0.58984375,
"learning_rate": 7.645675709739614e-05,
"loss": 0.752673327922821,
"step": 2702
},
{
"epoch": 0.7202983484283431,
"grad_norm": 0.578125,
"learning_rate": 7.639389300128266e-05,
"loss": 0.7161597013473511,
"step": 2704
},
{
"epoch": 0.7208311134789558,
"grad_norm": 0.6171875,
"learning_rate": 7.633050244242752e-05,
"loss": 0.7535909414291382,
"step": 2706
},
{
"epoch": 0.7213638785295685,
"grad_norm": 0.59375,
"learning_rate": 7.62665863378188e-05,
"loss": 0.8101969957351685,
"step": 2708
},
{
"epoch": 0.7218966435801811,
"grad_norm": 0.5859375,
"learning_rate": 7.620214561204704e-05,
"loss": 0.7484019994735718,
"step": 2710
},
{
"epoch": 0.7224294086307939,
"grad_norm": 0.57421875,
"learning_rate": 7.613718119729172e-05,
"loss": 0.7134671211242676,
"step": 2712
},
{
"epoch": 0.7229621736814065,
"grad_norm": 0.59375,
"learning_rate": 7.607169403330786e-05,
"loss": 0.7816528677940369,
"step": 2714
},
{
"epoch": 0.7234949387320192,
"grad_norm": 0.61328125,
"learning_rate": 7.600568506741243e-05,
"loss": 0.8324891924858093,
"step": 2716
},
{
"epoch": 0.7240277037826318,
"grad_norm": 0.56640625,
"learning_rate": 7.593915525447062e-05,
"loss": 0.7704975605010986,
"step": 2718
},
{
"epoch": 0.7245604688332445,
"grad_norm": 0.6171875,
"learning_rate": 7.5872105556882e-05,
"loss": 0.8043801784515381,
"step": 2720
},
{
"epoch": 0.7250932338838573,
"grad_norm": 0.58984375,
"learning_rate": 7.580453694456664e-05,
"loss": 0.7552354335784912,
"step": 2722
},
{
"epoch": 0.7256259989344699,
"grad_norm": 0.6015625,
"learning_rate": 7.57364503949511e-05,
"loss": 0.7909277677536011,
"step": 2724
},
{
"epoch": 0.7261587639850826,
"grad_norm": 0.5703125,
"learning_rate": 7.566784689295425e-05,
"loss": 0.7480258345603943,
"step": 2726
},
{
"epoch": 0.7266915290356952,
"grad_norm": 0.58203125,
"learning_rate": 7.5598727430973e-05,
"loss": 0.7182474732398987,
"step": 2728
},
{
"epoch": 0.727224294086308,
"grad_norm": 0.57421875,
"learning_rate": 7.552909300886802e-05,
"loss": 0.7433866858482361,
"step": 2730
},
{
"epoch": 0.7277570591369206,
"grad_norm": 0.609375,
"learning_rate": 7.545894463394918e-05,
"loss": 0.825681209564209,
"step": 2732
},
{
"epoch": 0.7282898241875333,
"grad_norm": 0.6015625,
"learning_rate": 7.538828332096108e-05,
"loss": 0.7488453388214111,
"step": 2734
},
{
"epoch": 0.728822589238146,
"grad_norm": 0.5859375,
"learning_rate": 7.531711009206831e-05,
"loss": 0.7557936310768127,
"step": 2736
},
{
"epoch": 0.7293553542887586,
"grad_norm": 0.5625,
"learning_rate": 7.524542597684066e-05,
"loss": 0.7335447072982788,
"step": 2738
},
{
"epoch": 0.7298881193393714,
"grad_norm": 0.59375,
"learning_rate": 7.517323201223829e-05,
"loss": 0.7855934500694275,
"step": 2740
},
{
"epoch": 0.730420884389984,
"grad_norm": 0.58203125,
"learning_rate": 7.51005292425966e-05,
"loss": 0.7879592180252075,
"step": 2742
},
{
"epoch": 0.7309536494405967,
"grad_norm": 0.5625,
"learning_rate": 7.502731871961126e-05,
"loss": 0.7301309704780579,
"step": 2744
},
{
"epoch": 0.7314864144912093,
"grad_norm": 0.56640625,
"learning_rate": 7.495360150232298e-05,
"loss": 0.7647414803504944,
"step": 2746
},
{
"epoch": 0.732019179541822,
"grad_norm": 0.58203125,
"learning_rate": 7.487937865710206e-05,
"loss": 0.7254839539527893,
"step": 2748
},
{
"epoch": 0.7325519445924348,
"grad_norm": 0.55078125,
"learning_rate": 7.480465125763312e-05,
"loss": 0.7045494318008423,
"step": 2750
},
{
"epoch": 0.7330847096430474,
"grad_norm": 0.58984375,
"learning_rate": 7.47294203848995e-05,
"loss": 0.7956986427307129,
"step": 2752
},
{
"epoch": 0.7336174746936601,
"grad_norm": 0.57421875,
"learning_rate": 7.465368712716759e-05,
"loss": 0.7624717950820923,
"step": 2754
},
{
"epoch": 0.7341502397442727,
"grad_norm": 0.59765625,
"learning_rate": 7.457745257997118e-05,
"loss": 0.7222627401351929,
"step": 2756
},
{
"epoch": 0.7346830047948855,
"grad_norm": 0.5859375,
"learning_rate": 7.450071784609551e-05,
"loss": 0.7368757724761963,
"step": 2758
},
{
"epoch": 0.7352157698454982,
"grad_norm": 0.58203125,
"learning_rate": 7.442348403556139e-05,
"loss": 0.7786681652069092,
"step": 2760
},
{
"epoch": 0.7357485348961108,
"grad_norm": 0.60546875,
"learning_rate": 7.43457522656091e-05,
"loss": 0.7508862018585205,
"step": 2762
},
{
"epoch": 0.7362812999467235,
"grad_norm": 0.578125,
"learning_rate": 7.42675236606823e-05,
"loss": 0.7344875931739807,
"step": 2764
},
{
"epoch": 0.7368140649973361,
"grad_norm": 0.5859375,
"learning_rate": 7.418879935241162e-05,
"loss": 0.7535064816474915,
"step": 2766
},
{
"epoch": 0.7373468300479489,
"grad_norm": 0.59375,
"learning_rate": 7.410958047959845e-05,
"loss": 0.7149871587753296,
"step": 2768
},
{
"epoch": 0.7378795950985615,
"grad_norm": 0.57421875,
"learning_rate": 7.402986818819838e-05,
"loss": 0.7133111953735352,
"step": 2770
},
{
"epoch": 0.7384123601491742,
"grad_norm": 0.59765625,
"learning_rate": 7.394966363130462e-05,
"loss": 0.7914494276046753,
"step": 2772
},
{
"epoch": 0.738945125199787,
"grad_norm": 0.58984375,
"learning_rate": 7.386896796913137e-05,
"loss": 0.7482845783233643,
"step": 2774
},
{
"epoch": 0.7394778902503996,
"grad_norm": 0.60546875,
"learning_rate": 7.3787782368997e-05,
"loss": 0.732533872127533,
"step": 2776
},
{
"epoch": 0.7400106553010123,
"grad_norm": 0.59375,
"learning_rate": 7.370610800530713e-05,
"loss": 0.7276294827461243,
"step": 2778
},
{
"epoch": 0.7405434203516249,
"grad_norm": 0.60546875,
"learning_rate": 7.362394605953773e-05,
"loss": 0.7570236325263977,
"step": 2780
},
{
"epoch": 0.7410761854022376,
"grad_norm": 0.59375,
"learning_rate": 7.354129772021796e-05,
"loss": 0.7760714292526245,
"step": 2782
},
{
"epoch": 0.7416089504528502,
"grad_norm": 0.58203125,
"learning_rate": 7.345816418291303e-05,
"loss": 0.7801597118377686,
"step": 2784
},
{
"epoch": 0.742141715503463,
"grad_norm": 0.5703125,
"learning_rate": 7.337454665020682e-05,
"loss": 0.7562087178230286,
"step": 2786
},
{
"epoch": 0.7426744805540757,
"grad_norm": 0.58203125,
"learning_rate": 7.329044633168455e-05,
"loss": 0.7230586409568787,
"step": 2788
},
{
"epoch": 0.7432072456046883,
"grad_norm": 0.6015625,
"learning_rate": 7.320586444391531e-05,
"loss": 0.7483820915222168,
"step": 2790
},
{
"epoch": 0.743740010655301,
"grad_norm": 0.5703125,
"learning_rate": 7.312080221043438e-05,
"loss": 0.7314382791519165,
"step": 2792
},
{
"epoch": 0.7442727757059137,
"grad_norm": 0.5546875,
"learning_rate": 7.303526086172558e-05,
"loss": 0.7445840239524841,
"step": 2794
},
{
"epoch": 0.7448055407565264,
"grad_norm": 0.58203125,
"learning_rate": 7.294924163520349e-05,
"loss": 0.7696006298065186,
"step": 2796
},
{
"epoch": 0.745338305807139,
"grad_norm": 0.578125,
"learning_rate": 7.286274577519546e-05,
"loss": 0.738504946231842,
"step": 2798
},
{
"epoch": 0.7458710708577517,
"grad_norm": 0.58203125,
"learning_rate": 7.277577453292373e-05,
"loss": 0.7645745277404785,
"step": 2800
},
{
"epoch": 0.7464038359083645,
"grad_norm": 0.59375,
"learning_rate": 7.268832916648726e-05,
"loss": 0.7574669718742371,
"step": 2802
},
{
"epoch": 0.7469366009589771,
"grad_norm": 0.57421875,
"learning_rate": 7.26004109408435e-05,
"loss": 0.7300820350646973,
"step": 2804
},
{
"epoch": 0.7474693660095898,
"grad_norm": 0.6015625,
"learning_rate": 7.251202112779023e-05,
"loss": 0.7510509490966797,
"step": 2806
},
{
"epoch": 0.7480021310602024,
"grad_norm": 0.5859375,
"learning_rate": 7.242316100594696e-05,
"loss": 0.7628376483917236,
"step": 2808
},
{
"epoch": 0.7485348961108151,
"grad_norm": 0.5703125,
"learning_rate": 7.23338318607366e-05,
"loss": 0.7346773147583008,
"step": 2810
},
{
"epoch": 0.7490676611614278,
"grad_norm": 0.5703125,
"learning_rate": 7.22440349843668e-05,
"loss": 0.768202543258667,
"step": 2812
},
{
"epoch": 0.7496004262120405,
"grad_norm": 0.61328125,
"learning_rate": 7.215377167581123e-05,
"loss": 0.7435672879219055,
"step": 2814
},
{
"epoch": 0.7501331912626532,
"grad_norm": 0.5625,
"learning_rate": 7.206304324079089e-05,
"loss": 0.7180995941162109,
"step": 2816
},
{
"epoch": 0.7506659563132658,
"grad_norm": 0.5859375,
"learning_rate": 7.197185099175508e-05,
"loss": 0.7368422746658325,
"step": 2818
},
{
"epoch": 0.7511987213638786,
"grad_norm": 0.5859375,
"learning_rate": 7.188019624786255e-05,
"loss": 0.7638548612594604,
"step": 2820
},
{
"epoch": 0.7517314864144912,
"grad_norm": 0.58984375,
"learning_rate": 7.17880803349623e-05,
"loss": 0.803455114364624,
"step": 2822
},
{
"epoch": 0.7522642514651039,
"grad_norm": 0.5859375,
"learning_rate": 7.169550458557453e-05,
"loss": 0.7775712609291077,
"step": 2824
},
{
"epoch": 0.7527970165157166,
"grad_norm": 0.5703125,
"learning_rate": 7.160247033887121e-05,
"loss": 0.7178272604942322,
"step": 2826
},
{
"epoch": 0.7533297815663292,
"grad_norm": 0.609375,
"learning_rate": 7.150897894065684e-05,
"loss": 0.7626135349273682,
"step": 2828
},
{
"epoch": 0.753862546616942,
"grad_norm": 0.58203125,
"learning_rate": 7.141503174334894e-05,
"loss": 0.7484996318817139,
"step": 2830
},
{
"epoch": 0.7543953116675546,
"grad_norm": 0.58984375,
"learning_rate": 7.132063010595843e-05,
"loss": 0.7348583340644836,
"step": 2832
},
{
"epoch": 0.7549280767181673,
"grad_norm": 0.58984375,
"learning_rate": 7.122577539407009e-05,
"loss": 0.7415651082992554,
"step": 2834
},
{
"epoch": 0.7554608417687799,
"grad_norm": 0.59765625,
"learning_rate": 7.113046897982265e-05,
"loss": 0.7116685509681702,
"step": 2836
},
{
"epoch": 0.7559936068193926,
"grad_norm": 0.58203125,
"learning_rate": 7.103471224188908e-05,
"loss": 0.7689422369003296,
"step": 2838
},
{
"epoch": 0.7565263718700054,
"grad_norm": 0.58984375,
"learning_rate": 7.093850656545659e-05,
"loss": 0.7544586062431335,
"step": 2840
},
{
"epoch": 0.757059136920618,
"grad_norm": 0.5625,
"learning_rate": 7.084185334220658e-05,
"loss": 0.7671671509742737,
"step": 2842
},
{
"epoch": 0.7575919019712307,
"grad_norm": 0.578125,
"learning_rate": 7.074475397029454e-05,
"loss": 0.7252099514007568,
"step": 2844
},
{
"epoch": 0.7581246670218433,
"grad_norm": 0.59765625,
"learning_rate": 7.064720985432979e-05,
"loss": 0.7508083581924438,
"step": 2846
},
{
"epoch": 0.7586574320724561,
"grad_norm": 0.5703125,
"learning_rate": 7.054922240535516e-05,
"loss": 0.7430750727653503,
"step": 2848
},
{
"epoch": 0.7591901971230687,
"grad_norm": 0.57421875,
"learning_rate": 7.045079304082667e-05,
"loss": 0.736114501953125,
"step": 2850
},
{
"epoch": 0.7597229621736814,
"grad_norm": 0.55859375,
"learning_rate": 7.035192318459288e-05,
"loss": 0.7149706482887268,
"step": 2852
},
{
"epoch": 0.7602557272242941,
"grad_norm": 0.5859375,
"learning_rate": 7.02526142668744e-05,
"loss": 0.7909661531448364,
"step": 2854
},
{
"epoch": 0.7607884922749067,
"grad_norm": 0.60546875,
"learning_rate": 7.015286772424316e-05,
"loss": 0.7296648025512695,
"step": 2856
},
{
"epoch": 0.7613212573255195,
"grad_norm": 0.5625,
"learning_rate": 7.005268499960162e-05,
"loss": 0.7293763160705566,
"step": 2858
},
{
"epoch": 0.7618540223761321,
"grad_norm": 0.56640625,
"learning_rate": 6.995206754216194e-05,
"loss": 0.7414215207099915,
"step": 2860
},
{
"epoch": 0.7623867874267448,
"grad_norm": 0.578125,
"learning_rate": 6.985101680742499e-05,
"loss": 0.7198176383972168,
"step": 2862
},
{
"epoch": 0.7629195524773574,
"grad_norm": 0.5859375,
"learning_rate": 6.974953425715926e-05,
"loss": 0.7562193870544434,
"step": 2864
},
{
"epoch": 0.7634523175279702,
"grad_norm": 0.6015625,
"learning_rate": 6.96476213593798e-05,
"loss": 0.7399710416793823,
"step": 2866
},
{
"epoch": 0.7639850825785829,
"grad_norm": 0.55859375,
"learning_rate": 6.95452795883269e-05,
"loss": 0.739233672618866,
"step": 2868
},
{
"epoch": 0.7645178476291955,
"grad_norm": 0.57421875,
"learning_rate": 6.944251042444477e-05,
"loss": 0.7336940765380859,
"step": 2870
},
{
"epoch": 0.7650506126798082,
"grad_norm": 0.55859375,
"learning_rate": 6.933931535436021e-05,
"loss": 0.7608907222747803,
"step": 2872
},
{
"epoch": 0.7655833777304208,
"grad_norm": 0.5625,
"learning_rate": 6.923569587086103e-05,
"loss": 0.747681736946106,
"step": 2874
},
{
"epoch": 0.7661161427810336,
"grad_norm": 0.54296875,
"learning_rate": 6.913165347287444e-05,
"loss": 0.6818616390228271,
"step": 2876
},
{
"epoch": 0.7666489078316463,
"grad_norm": 0.6015625,
"learning_rate": 6.902718966544545e-05,
"loss": 0.7375771403312683,
"step": 2878
},
{
"epoch": 0.7671816728822589,
"grad_norm": 0.83203125,
"learning_rate": 6.8922305959715e-05,
"loss": 0.8159199357032776,
"step": 2880
},
{
"epoch": 0.7677144379328716,
"grad_norm": 0.5703125,
"learning_rate": 6.881700387289819e-05,
"loss": 0.7084010243415833,
"step": 2882
},
{
"epoch": 0.7682472029834843,
"grad_norm": 0.59765625,
"learning_rate": 6.871128492826226e-05,
"loss": 0.7337329387664795,
"step": 2884
},
{
"epoch": 0.768779968034097,
"grad_norm": 0.56640625,
"learning_rate": 6.860515065510459e-05,
"loss": 0.7246623635292053,
"step": 2886
},
{
"epoch": 0.7693127330847096,
"grad_norm": 0.625,
"learning_rate": 6.849860258873059e-05,
"loss": 0.7965453863143921,
"step": 2888
},
{
"epoch": 0.7698454981353223,
"grad_norm": 0.5703125,
"learning_rate": 6.839164227043146e-05,
"loss": 0.7623465657234192,
"step": 2890
},
{
"epoch": 0.770378263185935,
"grad_norm": 0.578125,
"learning_rate": 6.828427124746191e-05,
"loss": 0.7439296245574951,
"step": 2892
},
{
"epoch": 0.7709110282365477,
"grad_norm": 0.546875,
"learning_rate": 6.817649107301777e-05,
"loss": 0.6967207193374634,
"step": 2894
},
{
"epoch": 0.7714437932871604,
"grad_norm": 0.58203125,
"learning_rate": 6.806830330621355e-05,
"loss": 0.7421116828918457,
"step": 2896
},
{
"epoch": 0.771976558337773,
"grad_norm": 0.6015625,
"learning_rate": 6.795970951205984e-05,
"loss": 0.7583564519882202,
"step": 2898
},
{
"epoch": 0.7725093233883857,
"grad_norm": 0.578125,
"learning_rate": 6.785071126144072e-05,
"loss": 0.7536525130271912,
"step": 2900
},
{
"epoch": 0.7730420884389984,
"grad_norm": 0.5859375,
"learning_rate": 6.774131013109097e-05,
"loss": 0.7408878803253174,
"step": 2902
},
{
"epoch": 0.7735748534896111,
"grad_norm": 0.55859375,
"learning_rate": 6.763150770357337e-05,
"loss": 0.7107818126678467,
"step": 2904
},
{
"epoch": 0.7741076185402238,
"grad_norm": 0.578125,
"learning_rate": 6.752130556725567e-05,
"loss": 0.7988401651382446,
"step": 2906
},
{
"epoch": 0.7746403835908364,
"grad_norm": 0.56640625,
"learning_rate": 6.741070531628771e-05,
"loss": 0.7510604858398438,
"step": 2908
},
{
"epoch": 0.7751731486414492,
"grad_norm": 0.58203125,
"learning_rate": 6.729970855057835e-05,
"loss": 0.6959093809127808,
"step": 2910
},
{
"epoch": 0.7757059136920618,
"grad_norm": 0.57421875,
"learning_rate": 6.718831687577228e-05,
"loss": 0.7422518134117126,
"step": 2912
},
{
"epoch": 0.7762386787426745,
"grad_norm": 0.58984375,
"learning_rate": 6.707653190322687e-05,
"loss": 0.7659637928009033,
"step": 2914
},
{
"epoch": 0.7767714437932871,
"grad_norm": 0.57421875,
"learning_rate": 6.696435524998875e-05,
"loss": 0.7153394222259521,
"step": 2916
},
{
"epoch": 0.7773042088438998,
"grad_norm": 0.5703125,
"learning_rate": 6.685178853877052e-05,
"loss": 0.7259315848350525,
"step": 2918
},
{
"epoch": 0.7778369738945126,
"grad_norm": 0.5546875,
"learning_rate": 6.673883339792723e-05,
"loss": 0.7096306085586548,
"step": 2920
},
{
"epoch": 0.7783697389451252,
"grad_norm": 0.57421875,
"learning_rate": 6.662549146143281e-05,
"loss": 0.7444663047790527,
"step": 2922
},
{
"epoch": 0.7789025039957379,
"grad_norm": 0.58203125,
"learning_rate": 6.651176436885651e-05,
"loss": 0.7020083665847778,
"step": 2924
},
{
"epoch": 0.7794352690463505,
"grad_norm": 0.58203125,
"learning_rate": 6.639765376533909e-05,
"loss": 0.7518250346183777,
"step": 2926
},
{
"epoch": 0.7799680340969632,
"grad_norm": 0.6015625,
"learning_rate": 6.628316130156904e-05,
"loss": 0.7833393216133118,
"step": 2928
},
{
"epoch": 0.780500799147576,
"grad_norm": 0.609375,
"learning_rate": 6.616828863375877e-05,
"loss": 0.7342422604560852,
"step": 2930
},
{
"epoch": 0.7810335641981886,
"grad_norm": 0.546875,
"learning_rate": 6.605303742362057e-05,
"loss": 0.7364885807037354,
"step": 2932
},
{
"epoch": 0.7815663292488013,
"grad_norm": 0.59375,
"learning_rate": 6.593740933834262e-05,
"loss": 0.7553122043609619,
"step": 2934
},
{
"epoch": 0.7820990942994139,
"grad_norm": 0.57421875,
"learning_rate": 6.582140605056484e-05,
"loss": 0.7586118578910828,
"step": 2936
},
{
"epoch": 0.7826318593500267,
"grad_norm": 0.5625,
"learning_rate": 6.570502923835475e-05,
"loss": 0.7087134718894958,
"step": 2938
},
{
"epoch": 0.7831646244006393,
"grad_norm": 0.57421875,
"learning_rate": 6.558828058518311e-05,
"loss": 0.7354501485824585,
"step": 2940
},
{
"epoch": 0.783697389451252,
"grad_norm": 0.58203125,
"learning_rate": 6.547116177989967e-05,
"loss": 0.7278033494949341,
"step": 2942
},
{
"epoch": 0.7842301545018647,
"grad_norm": 0.578125,
"learning_rate": 6.535367451670862e-05,
"loss": 0.7207239270210266,
"step": 2944
},
{
"epoch": 0.7847629195524773,
"grad_norm": 0.5859375,
"learning_rate": 6.523582049514422e-05,
"loss": 0.7300347685813904,
"step": 2946
},
{
"epoch": 0.7852956846030901,
"grad_norm": 0.5546875,
"learning_rate": 6.511760142004608e-05,
"loss": 0.701433539390564,
"step": 2948
},
{
"epoch": 0.7858284496537027,
"grad_norm": 0.55859375,
"learning_rate": 6.49990190015346e-05,
"loss": 0.7079622149467468,
"step": 2950
},
{
"epoch": 0.7863612147043154,
"grad_norm": 0.56640625,
"learning_rate": 6.488007495498619e-05,
"loss": 0.7154868841171265,
"step": 2952
},
{
"epoch": 0.786893979754928,
"grad_norm": 0.5546875,
"learning_rate": 6.476077100100841e-05,
"loss": 0.7213861346244812,
"step": 2954
},
{
"epoch": 0.7874267448055408,
"grad_norm": 0.56640625,
"learning_rate": 6.464110886541521e-05,
"loss": 0.7333036661148071,
"step": 2956
},
{
"epoch": 0.7879595098561535,
"grad_norm": 0.59375,
"learning_rate": 6.452109027920183e-05,
"loss": 0.727940559387207,
"step": 2958
},
{
"epoch": 0.7884922749067661,
"grad_norm": 0.5703125,
"learning_rate": 6.44007169785198e-05,
"loss": 0.7133424282073975,
"step": 2960
},
{
"epoch": 0.7890250399573788,
"grad_norm": 0.5625,
"learning_rate": 6.427999070465191e-05,
"loss": 0.7307575345039368,
"step": 2962
},
{
"epoch": 0.7895578050079914,
"grad_norm": 0.57421875,
"learning_rate": 6.415891320398688e-05,
"loss": 0.7410464286804199,
"step": 2964
},
{
"epoch": 0.7900905700586042,
"grad_norm": 0.5859375,
"learning_rate": 6.403748622799418e-05,
"loss": 0.7647122144699097,
"step": 2966
},
{
"epoch": 0.7906233351092168,
"grad_norm": 0.54296875,
"learning_rate": 6.391571153319872e-05,
"loss": 0.6683281064033508,
"step": 2968
},
{
"epoch": 0.7911561001598295,
"grad_norm": 0.56640625,
"learning_rate": 6.379359088115537e-05,
"loss": 0.7463474273681641,
"step": 2970
},
{
"epoch": 0.7916888652104422,
"grad_norm": 0.54296875,
"learning_rate": 6.36711260384235e-05,
"loss": 0.6725707650184631,
"step": 2972
},
{
"epoch": 0.7922216302610549,
"grad_norm": 0.57421875,
"learning_rate": 6.354831877654147e-05,
"loss": 0.7473057508468628,
"step": 2974
},
{
"epoch": 0.7927543953116676,
"grad_norm": 0.58984375,
"learning_rate": 6.342517087200094e-05,
"loss": 0.7107257843017578,
"step": 2976
},
{
"epoch": 0.7932871603622802,
"grad_norm": 0.5546875,
"learning_rate": 6.330168410622123e-05,
"loss": 0.7127653360366821,
"step": 2978
},
{
"epoch": 0.7938199254128929,
"grad_norm": 0.55078125,
"learning_rate": 6.317786026552347e-05,
"loss": 0.6991822123527527,
"step": 2980
},
{
"epoch": 0.7943526904635055,
"grad_norm": 0.55078125,
"learning_rate": 6.305370114110487e-05,
"loss": 0.7100861072540283,
"step": 2982
},
{
"epoch": 0.7948854555141183,
"grad_norm": 0.56640625,
"learning_rate": 6.292920852901272e-05,
"loss": 0.7302361130714417,
"step": 2984
},
{
"epoch": 0.795418220564731,
"grad_norm": 0.55078125,
"learning_rate": 6.280438423011843e-05,
"loss": 0.7081553936004639,
"step": 2986
},
{
"epoch": 0.7959509856153436,
"grad_norm": 0.55859375,
"learning_rate": 6.267923005009153e-05,
"loss": 0.722525954246521,
"step": 2988
},
{
"epoch": 0.7964837506659563,
"grad_norm": 0.578125,
"learning_rate": 6.255374779937344e-05,
"loss": 0.7379462718963623,
"step": 2990
},
{
"epoch": 0.797016515716569,
"grad_norm": 0.546875,
"learning_rate": 6.242793929315143e-05,
"loss": 0.7418494820594788,
"step": 2992
},
{
"epoch": 0.7975492807671817,
"grad_norm": 0.56640625,
"learning_rate": 6.230180635133221e-05,
"loss": 0.7134079933166504,
"step": 2994
},
{
"epoch": 0.7980820458177944,
"grad_norm": 0.5546875,
"learning_rate": 6.217535079851569e-05,
"loss": 0.6956773996353149,
"step": 2996
},
{
"epoch": 0.798614810868407,
"grad_norm": 0.54296875,
"learning_rate": 6.204857446396862e-05,
"loss": 0.6780136227607727,
"step": 2998
},
{
"epoch": 0.7991475759190197,
"grad_norm": 0.54296875,
"learning_rate": 6.192147918159803e-05,
"loss": 0.6957566142082214,
"step": 3000
},
{
"epoch": 0.7996803409696324,
"grad_norm": 0.55078125,
"learning_rate": 6.179406678992476e-05,
"loss": 0.705109179019928,
"step": 3002
},
{
"epoch": 0.8002131060202451,
"grad_norm": 0.58203125,
"learning_rate": 6.166633913205684e-05,
"loss": 0.7465344071388245,
"step": 3004
},
{
"epoch": 0.8007458710708577,
"grad_norm": 0.55859375,
"learning_rate": 6.15382980556629e-05,
"loss": 0.6974951028823853,
"step": 3006
},
{
"epoch": 0.8012786361214704,
"grad_norm": 0.5625,
"learning_rate": 6.140994541294529e-05,
"loss": 0.7093472480773926,
"step": 3008
},
{
"epoch": 0.8018114011720832,
"grad_norm": 0.55859375,
"learning_rate": 6.128128306061347e-05,
"loss": 0.8073316812515259,
"step": 3010
},
{
"epoch": 0.8023441662226958,
"grad_norm": 0.5625,
"learning_rate": 6.115231285985703e-05,
"loss": 0.7252596616744995,
"step": 3012
},
{
"epoch": 0.8028769312733085,
"grad_norm": 0.5625,
"learning_rate": 6.102303667631878e-05,
"loss": 0.7308025360107422,
"step": 3014
},
{
"epoch": 0.8034096963239211,
"grad_norm": 0.578125,
"learning_rate": 6.089345638006782e-05,
"loss": 0.7695532441139221,
"step": 3016
},
{
"epoch": 0.8039424613745338,
"grad_norm": 0.5703125,
"learning_rate": 6.0763573845572434e-05,
"loss": 0.7262691259384155,
"step": 3018
},
{
"epoch": 0.8044752264251465,
"grad_norm": 0.5546875,
"learning_rate": 6.0633390951672965e-05,
"loss": 0.6978904008865356,
"step": 3020
},
{
"epoch": 0.8050079914757592,
"grad_norm": 0.5546875,
"learning_rate": 6.0502909581554706e-05,
"loss": 0.6867971420288086,
"step": 3022
},
{
"epoch": 0.8055407565263719,
"grad_norm": 0.57421875,
"learning_rate": 6.037213162272056e-05,
"loss": 0.7410815358161926,
"step": 3024
},
{
"epoch": 0.8060735215769845,
"grad_norm": 0.58203125,
"learning_rate": 6.0241058966963854e-05,
"loss": 0.7607967257499695,
"step": 3026
},
{
"epoch": 0.8066062866275973,
"grad_norm": 0.5859375,
"learning_rate": 6.0109693510340867e-05,
"loss": 0.7447317242622375,
"step": 3028
},
{
"epoch": 0.8071390516782099,
"grad_norm": 0.55859375,
"learning_rate": 5.997803715314345e-05,
"loss": 0.7340399026870728,
"step": 3030
},
{
"epoch": 0.8076718167288226,
"grad_norm": 0.5390625,
"learning_rate": 5.984609179987155e-05,
"loss": 0.6988064050674438,
"step": 3032
},
{
"epoch": 0.8082045817794352,
"grad_norm": 0.5625,
"learning_rate": 5.971385935920559e-05,
"loss": 0.7076520323753357,
"step": 3034
},
{
"epoch": 0.8087373468300479,
"grad_norm": 0.56640625,
"learning_rate": 5.9581341743978986e-05,
"loss": 0.7423697113990784,
"step": 3036
},
{
"epoch": 0.8092701118806607,
"grad_norm": 0.5703125,
"learning_rate": 5.944854087115035e-05,
"loss": 0.7314884066581726,
"step": 3038
},
{
"epoch": 0.8098028769312733,
"grad_norm": 0.57421875,
"learning_rate": 5.931545866177581e-05,
"loss": 0.7414923906326294,
"step": 3040
},
{
"epoch": 0.810335641981886,
"grad_norm": 0.58203125,
"learning_rate": 5.918209704098126e-05,
"loss": 0.7651832103729248,
"step": 3042
},
{
"epoch": 0.8108684070324986,
"grad_norm": 0.5625,
"learning_rate": 5.904845793793442e-05,
"loss": 0.7165493369102478,
"step": 3044
},
{
"epoch": 0.8114011720831114,
"grad_norm": 0.57421875,
"learning_rate": 5.891454328581702e-05,
"loss": 0.7285482883453369,
"step": 3046
},
{
"epoch": 0.8119339371337241,
"grad_norm": 0.546875,
"learning_rate": 5.8780355021796774e-05,
"loss": 0.739894449710846,
"step": 3048
},
{
"epoch": 0.8124667021843367,
"grad_norm": 0.6015625,
"learning_rate": 5.86458950869994e-05,
"loss": 0.7955057621002197,
"step": 3050
},
{
"epoch": 0.8129994672349494,
"grad_norm": 0.546875,
"learning_rate": 5.8511165426480514e-05,
"loss": 0.6887469291687012,
"step": 3052
},
{
"epoch": 0.813532232285562,
"grad_norm": 0.56640625,
"learning_rate": 5.8376167989197495e-05,
"loss": 0.6830754280090332,
"step": 3054
},
{
"epoch": 0.8140649973361748,
"grad_norm": 0.57421875,
"learning_rate": 5.82409047279813e-05,
"loss": 0.7223320007324219,
"step": 3056
},
{
"epoch": 0.8145977623867874,
"grad_norm": 0.57421875,
"learning_rate": 5.810537759950822e-05,
"loss": 0.6974748969078064,
"step": 3058
},
{
"epoch": 0.8151305274374001,
"grad_norm": 0.5703125,
"learning_rate": 5.796958856427155e-05,
"loss": 0.7329493761062622,
"step": 3060
},
{
"epoch": 0.8156632924880128,
"grad_norm": 0.5390625,
"learning_rate": 5.783353958655328e-05,
"loss": 0.6891670823097229,
"step": 3062
},
{
"epoch": 0.8161960575386255,
"grad_norm": 0.54296875,
"learning_rate": 5.7697232634395614e-05,
"loss": 0.6876566410064697,
"step": 3064
},
{
"epoch": 0.8167288225892382,
"grad_norm": 0.56640625,
"learning_rate": 5.756066967957253e-05,
"loss": 0.7235361933708191,
"step": 3066
},
{
"epoch": 0.8172615876398508,
"grad_norm": 0.54296875,
"learning_rate": 5.74238526975613e-05,
"loss": 0.7141427993774414,
"step": 3068
},
{
"epoch": 0.8177943526904635,
"grad_norm": 0.54296875,
"learning_rate": 5.728678366751382e-05,
"loss": 0.6909295916557312,
"step": 3070
},
{
"epoch": 0.8183271177410761,
"grad_norm": 0.5625,
"learning_rate": 5.7149464572228104e-05,
"loss": 0.7080371379852295,
"step": 3072
},
{
"epoch": 0.8188598827916889,
"grad_norm": 0.54296875,
"learning_rate": 5.7011897398119486e-05,
"loss": 0.6966223120689392,
"step": 3074
},
{
"epoch": 0.8193926478423016,
"grad_norm": 0.56640625,
"learning_rate": 5.687408413519191e-05,
"loss": 0.7194684743881226,
"step": 3076
},
{
"epoch": 0.8199254128929142,
"grad_norm": 0.56640625,
"learning_rate": 5.6736026777009206e-05,
"loss": 0.7557689547538757,
"step": 3078
},
{
"epoch": 0.8204581779435269,
"grad_norm": 0.5859375,
"learning_rate": 5.6597727320666205e-05,
"loss": 0.76923006772995,
"step": 3080
},
{
"epoch": 0.8209909429941395,
"grad_norm": 0.5390625,
"learning_rate": 5.645918776675985e-05,
"loss": 0.7014768719673157,
"step": 3082
},
{
"epoch": 0.8215237080447523,
"grad_norm": 0.55859375,
"learning_rate": 5.632041011936025e-05,
"loss": 0.7342231869697571,
"step": 3084
},
{
"epoch": 0.8220564730953649,
"grad_norm": 0.57421875,
"learning_rate": 5.6181396385981706e-05,
"loss": 0.7379154562950134,
"step": 3086
},
{
"epoch": 0.8225892381459776,
"grad_norm": 0.55859375,
"learning_rate": 5.6042148577553665e-05,
"loss": 0.7159122824668884,
"step": 3088
},
{
"epoch": 0.8231220031965903,
"grad_norm": 0.54296875,
"learning_rate": 5.590266870839165e-05,
"loss": 0.7064898610115051,
"step": 3090
},
{
"epoch": 0.823654768247203,
"grad_norm": 0.5546875,
"learning_rate": 5.576295879616806e-05,
"loss": 0.7205244898796082,
"step": 3092
},
{
"epoch": 0.8241875332978157,
"grad_norm": 0.5546875,
"learning_rate": 5.5623020861883075e-05,
"loss": 0.7115920782089233,
"step": 3094
},
{
"epoch": 0.8247202983484283,
"grad_norm": 0.5546875,
"learning_rate": 5.5482856929835334e-05,
"loss": 0.6948148012161255,
"step": 3096
},
{
"epoch": 0.825253063399041,
"grad_norm": 0.5546875,
"learning_rate": 5.534246902759269e-05,
"loss": 0.7241220474243164,
"step": 3098
},
{
"epoch": 0.8257858284496538,
"grad_norm": 0.5625,
"learning_rate": 5.520185918596292e-05,
"loss": 0.7225724458694458,
"step": 3100
},
{
"epoch": 0.8263185935002664,
"grad_norm": 0.53125,
"learning_rate": 5.506102943896426e-05,
"loss": 0.712283730506897,
"step": 3102
},
{
"epoch": 0.8268513585508791,
"grad_norm": 0.578125,
"learning_rate": 5.4919981823796046e-05,
"loss": 0.7341251373291016,
"step": 3104
},
{
"epoch": 0.8273841236014917,
"grad_norm": 0.58984375,
"learning_rate": 5.477871838080925e-05,
"loss": 0.7807260155677795,
"step": 3106
},
{
"epoch": 0.8279168886521044,
"grad_norm": 0.546875,
"learning_rate": 5.46372411534769e-05,
"loss": 0.7081162929534912,
"step": 3108
},
{
"epoch": 0.8284496537027171,
"grad_norm": 0.55078125,
"learning_rate": 5.449555218836459e-05,
"loss": 0.7185850739479065,
"step": 3110
},
{
"epoch": 0.8289824187533298,
"grad_norm": 0.55078125,
"learning_rate": 5.435365353510083e-05,
"loss": 0.7876030206680298,
"step": 3112
},
{
"epoch": 0.8295151838039425,
"grad_norm": 0.54296875,
"learning_rate": 5.421154724634743e-05,
"loss": 0.673431396484375,
"step": 3114
},
{
"epoch": 0.8300479488545551,
"grad_norm": 0.546875,
"learning_rate": 5.40692353777698e-05,
"loss": 0.6700567007064819,
"step": 3116
},
{
"epoch": 0.8305807139051679,
"grad_norm": 0.55859375,
"learning_rate": 5.3926719988007173e-05,
"loss": 0.6965268850326538,
"step": 3118
},
{
"epoch": 0.8311134789557805,
"grad_norm": 0.5546875,
"learning_rate": 5.3784003138642855e-05,
"loss": 0.7045586109161377,
"step": 3120
},
{
"epoch": 0.8316462440063932,
"grad_norm": 0.57421875,
"learning_rate": 5.364108689417444e-05,
"loss": 0.6918834447860718,
"step": 3122
},
{
"epoch": 0.8321790090570058,
"grad_norm": 0.55859375,
"learning_rate": 5.3497973321983896e-05,
"loss": 0.688016414642334,
"step": 3124
},
{
"epoch": 0.8327117741076185,
"grad_norm": 0.546875,
"learning_rate": 5.335466449230765e-05,
"loss": 0.7234639525413513,
"step": 3126
},
{
"epoch": 0.8332445391582313,
"grad_norm": 0.5859375,
"learning_rate": 5.321116247820669e-05,
"loss": 0.7776771187782288,
"step": 3128
},
{
"epoch": 0.8337773042088439,
"grad_norm": 0.55859375,
"learning_rate": 5.3067469355536525e-05,
"loss": 0.6885461211204529,
"step": 3130
},
{
"epoch": 0.8343100692594566,
"grad_norm": 0.5390625,
"learning_rate": 5.29235872029172e-05,
"loss": 0.6760199666023254,
"step": 3132
},
{
"epoch": 0.8348428343100692,
"grad_norm": 0.55078125,
"learning_rate": 5.277951810170322e-05,
"loss": 0.6977653503417969,
"step": 3134
},
{
"epoch": 0.835375599360682,
"grad_norm": 0.5390625,
"learning_rate": 5.2635264135953385e-05,
"loss": 0.6952658295631409,
"step": 3136
},
{
"epoch": 0.8359083644112946,
"grad_norm": 0.5390625,
"learning_rate": 5.2490827392400735e-05,
"loss": 0.7002542614936829,
"step": 3138
},
{
"epoch": 0.8364411294619073,
"grad_norm": 0.63671875,
"learning_rate": 5.2346209960422295e-05,
"loss": 0.7366476655006409,
"step": 3140
},
{
"epoch": 0.83697389451252,
"grad_norm": 0.5546875,
"learning_rate": 5.2201413932008865e-05,
"loss": 0.7242977619171143,
"step": 3142
},
{
"epoch": 0.8375066595631326,
"grad_norm": 0.55078125,
"learning_rate": 5.20564414017348e-05,
"loss": 0.6978318095207214,
"step": 3144
},
{
"epoch": 0.8380394246137454,
"grad_norm": 0.546875,
"learning_rate": 5.191129446672763e-05,
"loss": 0.7196321487426758,
"step": 3146
},
{
"epoch": 0.838572189664358,
"grad_norm": 0.55078125,
"learning_rate": 5.1765975226637804e-05,
"loss": 0.7309831380844116,
"step": 3148
},
{
"epoch": 0.8391049547149707,
"grad_norm": 0.57421875,
"learning_rate": 5.162048578360827e-05,
"loss": 0.7818800210952759,
"step": 3150
},
{
"epoch": 0.8396377197655833,
"grad_norm": 0.52734375,
"learning_rate": 5.1474828242244085e-05,
"loss": 0.6933904886245728,
"step": 3152
},
{
"epoch": 0.840170484816196,
"grad_norm": 0.5546875,
"learning_rate": 5.132900470958194e-05,
"loss": 0.6861001253128052,
"step": 3154
},
{
"epoch": 0.8407032498668088,
"grad_norm": 0.55859375,
"learning_rate": 5.1183017295059734e-05,
"loss": 0.7419685125350952,
"step": 3156
},
{
"epoch": 0.8412360149174214,
"grad_norm": 0.546875,
"learning_rate": 5.103686811048603e-05,
"loss": 0.697303056716919,
"step": 3158
},
{
"epoch": 0.8417687799680341,
"grad_norm": 0.5625,
"learning_rate": 5.089055927000948e-05,
"loss": 0.7192218899726868,
"step": 3160
},
{
"epoch": 0.8423015450186467,
"grad_norm": 0.5625,
"learning_rate": 5.07440928900883e-05,
"loss": 0.7201210856437683,
"step": 3162
},
{
"epoch": 0.8428343100692595,
"grad_norm": 0.5703125,
"learning_rate": 5.059747108945958e-05,
"loss": 0.7341133952140808,
"step": 3164
},
{
"epoch": 0.8433670751198722,
"grad_norm": 0.55078125,
"learning_rate": 5.045069598910873e-05,
"loss": 0.7105435132980347,
"step": 3166
},
{
"epoch": 0.8438998401704848,
"grad_norm": 0.5703125,
"learning_rate": 5.030376971223872e-05,
"loss": 0.7143466472625732,
"step": 3168
},
{
"epoch": 0.8444326052210975,
"grad_norm": 0.54296875,
"learning_rate": 5.015669438423939e-05,
"loss": 0.6983811259269714,
"step": 3170
},
{
"epoch": 0.8449653702717101,
"grad_norm": 0.5546875,
"learning_rate": 5.00094721326567e-05,
"loss": 0.6914249062538147,
"step": 3172
},
{
"epoch": 0.8454981353223229,
"grad_norm": 0.53515625,
"learning_rate": 4.9862105087161986e-05,
"loss": 0.6947450041770935,
"step": 3174
},
{
"epoch": 0.8460309003729355,
"grad_norm": 0.56640625,
"learning_rate": 4.9714595379521094e-05,
"loss": 0.7301425933837891,
"step": 3176
},
{
"epoch": 0.8465636654235482,
"grad_norm": 0.60546875,
"learning_rate": 4.956694514356363e-05,
"loss": 0.7228060364723206,
"step": 3178
},
{
"epoch": 0.8470964304741609,
"grad_norm": 0.55859375,
"learning_rate": 4.9419156515151956e-05,
"loss": 0.7128725051879883,
"step": 3180
},
{
"epoch": 0.8476291955247736,
"grad_norm": 0.55859375,
"learning_rate": 4.927123163215047e-05,
"loss": 0.7226285934448242,
"step": 3182
},
{
"epoch": 0.8481619605753863,
"grad_norm": 0.5625,
"learning_rate": 4.9123172634394515e-05,
"loss": 0.7291282415390015,
"step": 3184
},
{
"epoch": 0.8486947256259989,
"grad_norm": 0.55078125,
"learning_rate": 4.897498166365953e-05,
"loss": 0.6915116310119629,
"step": 3186
},
{
"epoch": 0.8492274906766116,
"grad_norm": 0.56640625,
"learning_rate": 4.882666086363002e-05,
"loss": 0.7105843424797058,
"step": 3188
},
{
"epoch": 0.8497602557272242,
"grad_norm": 0.5546875,
"learning_rate": 4.8678212379868585e-05,
"loss": 0.7123350501060486,
"step": 3190
},
{
"epoch": 0.850293020777837,
"grad_norm": 0.5703125,
"learning_rate": 4.852963835978482e-05,
"loss": 0.7145159244537354,
"step": 3192
},
{
"epoch": 0.8508257858284497,
"grad_norm": 0.5625,
"learning_rate": 4.838094095260432e-05,
"loss": 0.7674680948257446,
"step": 3194
},
{
"epoch": 0.8513585508790623,
"grad_norm": 0.5625,
"learning_rate": 4.823212230933755e-05,
"loss": 0.6895533204078674,
"step": 3196
},
{
"epoch": 0.851891315929675,
"grad_norm": 0.546875,
"learning_rate": 4.808318458274874e-05,
"loss": 0.7199817299842834,
"step": 3198
},
{
"epoch": 0.8524240809802877,
"grad_norm": 0.55078125,
"learning_rate": 4.7934129927324717e-05,
"loss": 0.6851661205291748,
"step": 3200
},
{
"epoch": 0.8529568460309004,
"grad_norm": 0.53515625,
"learning_rate": 4.778496049924381e-05,
"loss": 0.6990004777908325,
"step": 3202
},
{
"epoch": 0.853489611081513,
"grad_norm": 0.51953125,
"learning_rate": 4.763567845634459e-05,
"loss": 0.6694433689117432,
"step": 3204
},
{
"epoch": 0.8540223761321257,
"grad_norm": 0.546875,
"learning_rate": 4.748628595809466e-05,
"loss": 0.7453095316886902,
"step": 3206
},
{
"epoch": 0.8545551411827385,
"grad_norm": 0.53125,
"learning_rate": 4.733678516555948e-05,
"loss": 0.711460530757904,
"step": 3208
},
{
"epoch": 0.8550879062333511,
"grad_norm": 0.5390625,
"learning_rate": 4.718717824137102e-05,
"loss": 0.6715887784957886,
"step": 3210
},
{
"epoch": 0.8556206712839638,
"grad_norm": 0.55078125,
"learning_rate": 4.703746734969653e-05,
"loss": 0.704089343547821,
"step": 3212
},
{
"epoch": 0.8561534363345764,
"grad_norm": 0.55859375,
"learning_rate": 4.6887654656207255e-05,
"loss": 0.6886122822761536,
"step": 3214
},
{
"epoch": 0.8566862013851891,
"grad_norm": 0.578125,
"learning_rate": 4.673774232804701e-05,
"loss": 0.7097099423408508,
"step": 3216
},
{
"epoch": 0.8572189664358019,
"grad_norm": 0.59765625,
"learning_rate": 4.6587732533800945e-05,
"loss": 0.7401700615882874,
"step": 3218
},
{
"epoch": 0.8577517314864145,
"grad_norm": 0.55078125,
"learning_rate": 4.64376274434641e-05,
"loss": 0.7368103265762329,
"step": 3220
},
{
"epoch": 0.8582844965370272,
"grad_norm": 0.546875,
"learning_rate": 4.628742922841006e-05,
"loss": 0.6967146396636963,
"step": 3222
},
{
"epoch": 0.8588172615876398,
"grad_norm": 0.5546875,
"learning_rate": 4.613714006135948e-05,
"loss": 0.6933982968330383,
"step": 3224
},
{
"epoch": 0.8593500266382526,
"grad_norm": 0.55078125,
"learning_rate": 4.598676211634876e-05,
"loss": 0.7536207437515259,
"step": 3226
},
{
"epoch": 0.8598827916888652,
"grad_norm": 0.55078125,
"learning_rate": 4.5836297568698475e-05,
"loss": 0.7168260216712952,
"step": 3228
},
{
"epoch": 0.8604155567394779,
"grad_norm": 0.5390625,
"learning_rate": 4.568574859498201e-05,
"loss": 0.723820686340332,
"step": 3230
},
{
"epoch": 0.8609483217900906,
"grad_norm": 0.546875,
"learning_rate": 4.553511737299401e-05,
"loss": 0.6810100674629211,
"step": 3232
},
{
"epoch": 0.8614810868407032,
"grad_norm": 0.55078125,
"learning_rate": 4.5384406081718895e-05,
"loss": 0.7171721458435059,
"step": 3234
},
{
"epoch": 0.862013851891316,
"grad_norm": 0.546875,
"learning_rate": 4.5233616901299364e-05,
"loss": 0.7057915925979614,
"step": 3236
},
{
"epoch": 0.8625466169419286,
"grad_norm": 0.55078125,
"learning_rate": 4.508275201300482e-05,
"loss": 0.6889278292655945,
"step": 3238
},
{
"epoch": 0.8630793819925413,
"grad_norm": 0.54296875,
"learning_rate": 4.493181359919983e-05,
"loss": 0.72967928647995,
"step": 3240
},
{
"epoch": 0.8636121470431539,
"grad_norm": 0.54296875,
"learning_rate": 4.478080384331255e-05,
"loss": 0.6688830852508545,
"step": 3242
},
{
"epoch": 0.8641449120937666,
"grad_norm": 0.5546875,
"learning_rate": 4.462972492980319e-05,
"loss": 0.7294082045555115,
"step": 3244
},
{
"epoch": 0.8646776771443794,
"grad_norm": 0.5625,
"learning_rate": 4.4478579044132314e-05,
"loss": 0.7415005564689636,
"step": 3246
},
{
"epoch": 0.865210442194992,
"grad_norm": 0.5546875,
"learning_rate": 4.432736837272935e-05,
"loss": 0.730790376663208,
"step": 3248
},
{
"epoch": 0.8657432072456047,
"grad_norm": 0.5625,
"learning_rate": 4.417609510296082e-05,
"loss": 0.713773787021637,
"step": 3250
},
{
"epoch": 0.8662759722962173,
"grad_norm": 0.51953125,
"learning_rate": 4.4024761423098845e-05,
"loss": 0.6486693620681763,
"step": 3252
},
{
"epoch": 0.8668087373468301,
"grad_norm": 0.53515625,
"learning_rate": 4.387336952228937e-05,
"loss": 0.692020058631897,
"step": 3254
},
{
"epoch": 0.8673415023974427,
"grad_norm": 0.546875,
"learning_rate": 4.372192159052058e-05,
"loss": 0.7071133255958557,
"step": 3256
},
{
"epoch": 0.8678742674480554,
"grad_norm": 0.546875,
"learning_rate": 4.357041981859118e-05,
"loss": 0.7335031032562256,
"step": 3258
},
{
"epoch": 0.8684070324986681,
"grad_norm": 0.5625,
"learning_rate": 4.3418866398078684e-05,
"loss": 0.7153759002685547,
"step": 3260
},
{
"epoch": 0.8689397975492807,
"grad_norm": 0.55859375,
"learning_rate": 4.326726352130775e-05,
"loss": 0.6825717091560364,
"step": 3262
},
{
"epoch": 0.8694725625998935,
"grad_norm": 0.55078125,
"learning_rate": 4.3115613381318485e-05,
"loss": 0.700499415397644,
"step": 3264
},
{
"epoch": 0.8700053276505061,
"grad_norm": 0.54296875,
"learning_rate": 4.296391817183467e-05,
"loss": 0.6975908279418945,
"step": 3266
},
{
"epoch": 0.8705380927011188,
"grad_norm": 0.55859375,
"learning_rate": 4.281218008723201e-05,
"loss": 0.6818310618400574,
"step": 3268
},
{
"epoch": 0.8710708577517314,
"grad_norm": 0.53515625,
"learning_rate": 4.26604013225065e-05,
"loss": 0.6989009976387024,
"step": 3270
},
{
"epoch": 0.8716036228023442,
"grad_norm": 0.53515625,
"learning_rate": 4.250858407324254e-05,
"loss": 0.669248104095459,
"step": 3272
},
{
"epoch": 0.8721363878529569,
"grad_norm": 0.5390625,
"learning_rate": 4.235673053558127e-05,
"loss": 0.7152054905891418,
"step": 3274
},
{
"epoch": 0.8726691529035695,
"grad_norm": 0.52734375,
"learning_rate": 4.220484290618876e-05,
"loss": 0.6730162501335144,
"step": 3276
},
{
"epoch": 0.8732019179541822,
"grad_norm": 0.5390625,
"learning_rate": 4.205292338222423e-05,
"loss": 0.7103643417358398,
"step": 3278
},
{
"epoch": 0.8737346830047948,
"grad_norm": 0.55078125,
"learning_rate": 4.190097416130828e-05,
"loss": 0.7044172883033752,
"step": 3280
},
{
"epoch": 0.8742674480554076,
"grad_norm": 0.546875,
"learning_rate": 4.174899744149112e-05,
"loss": 0.7182801365852356,
"step": 3282
},
{
"epoch": 0.8748002131060203,
"grad_norm": 0.51953125,
"learning_rate": 4.159699542122071e-05,
"loss": 0.6786679029464722,
"step": 3284
},
{
"epoch": 0.8753329781566329,
"grad_norm": 0.53515625,
"learning_rate": 4.1444970299311016e-05,
"loss": 0.6989539861679077,
"step": 3286
},
{
"epoch": 0.8758657432072456,
"grad_norm": 0.55078125,
"learning_rate": 4.129292427491021e-05,
"loss": 0.6917098760604858,
"step": 3288
},
{
"epoch": 0.8763985082578583,
"grad_norm": 0.54296875,
"learning_rate": 4.1140859547468794e-05,
"loss": 0.6877356171607971,
"step": 3290
},
{
"epoch": 0.876931273308471,
"grad_norm": 0.58203125,
"learning_rate": 4.098877831670785e-05,
"loss": 0.749396026134491,
"step": 3292
},
{
"epoch": 0.8774640383590836,
"grad_norm": 0.578125,
"learning_rate": 4.083668278258717e-05,
"loss": 0.7201125621795654,
"step": 3294
},
{
"epoch": 0.8779968034096963,
"grad_norm": 0.5625,
"learning_rate": 4.0684575145273474e-05,
"loss": 0.7145978808403015,
"step": 3296
},
{
"epoch": 0.878529568460309,
"grad_norm": 0.5703125,
"learning_rate": 4.053245760510856e-05,
"loss": 0.7227798104286194,
"step": 3298
},
{
"epoch": 0.8790623335109217,
"grad_norm": 0.54296875,
"learning_rate": 4.038033236257746e-05,
"loss": 0.7105327248573303,
"step": 3300
},
{
"epoch": 0.8795950985615344,
"grad_norm": 0.53125,
"learning_rate": 4.0228201618276655e-05,
"loss": 0.7208723425865173,
"step": 3302
},
{
"epoch": 0.880127863612147,
"grad_norm": 0.5625,
"learning_rate": 4.007606757288217e-05,
"loss": 0.7228609323501587,
"step": 3304
},
{
"epoch": 0.8806606286627597,
"grad_norm": 0.5546875,
"learning_rate": 3.992393242711785e-05,
"loss": 0.6760815382003784,
"step": 3306
},
{
"epoch": 0.8811933937133724,
"grad_norm": 0.5390625,
"learning_rate": 3.9771798381723365e-05,
"loss": 0.6926905512809753,
"step": 3308
},
{
"epoch": 0.8817261587639851,
"grad_norm": 0.5390625,
"learning_rate": 3.9619667637422555e-05,
"loss": 0.7134456038475037,
"step": 3310
},
{
"epoch": 0.8822589238145978,
"grad_norm": 0.5546875,
"learning_rate": 3.946754239489146e-05,
"loss": 0.697494387626648,
"step": 3312
},
{
"epoch": 0.8827916888652104,
"grad_norm": 0.53125,
"learning_rate": 3.931542485472654e-05,
"loss": 0.6928008198738098,
"step": 3314
},
{
"epoch": 0.8833244539158231,
"grad_norm": 0.53515625,
"learning_rate": 3.9163317217412844e-05,
"loss": 0.6829655766487122,
"step": 3316
},
{
"epoch": 0.8838572189664358,
"grad_norm": 0.55078125,
"learning_rate": 3.901122168329217e-05,
"loss": 0.6962534785270691,
"step": 3318
},
{
"epoch": 0.8843899840170485,
"grad_norm": 0.5390625,
"learning_rate": 3.8859140452531206e-05,
"loss": 0.7147413492202759,
"step": 3320
},
{
"epoch": 0.8849227490676611,
"grad_norm": 0.54296875,
"learning_rate": 3.8707075725089794e-05,
"loss": 0.7086056470870972,
"step": 3322
},
{
"epoch": 0.8854555141182738,
"grad_norm": 0.5625,
"learning_rate": 3.8555029700689e-05,
"loss": 0.7095344066619873,
"step": 3324
},
{
"epoch": 0.8859882791688866,
"grad_norm": 0.5234375,
"learning_rate": 3.840300457877931e-05,
"loss": 0.7212046980857849,
"step": 3326
},
{
"epoch": 0.8865210442194992,
"grad_norm": 0.55078125,
"learning_rate": 3.8251002558508896e-05,
"loss": 0.7254408001899719,
"step": 3328
},
{
"epoch": 0.8870538092701119,
"grad_norm": 0.546875,
"learning_rate": 3.809902583869172e-05,
"loss": 0.6854255795478821,
"step": 3330
},
{
"epoch": 0.8875865743207245,
"grad_norm": 0.5703125,
"learning_rate": 3.7947076617775785e-05,
"loss": 0.7515878081321716,
"step": 3332
},
{
"epoch": 0.8881193393713372,
"grad_norm": 0.5546875,
"learning_rate": 3.779515709381125e-05,
"loss": 0.7171257734298706,
"step": 3334
},
{
"epoch": 0.88865210442195,
"grad_norm": 0.53125,
"learning_rate": 3.7643269464418734e-05,
"loss": 0.7486863136291504,
"step": 3336
},
{
"epoch": 0.8891848694725626,
"grad_norm": 0.53515625,
"learning_rate": 3.749141592675747e-05,
"loss": 0.6408795118331909,
"step": 3338
},
{
"epoch": 0.8897176345231753,
"grad_norm": 0.5234375,
"learning_rate": 3.7339598677493515e-05,
"loss": 0.6756150722503662,
"step": 3340
},
{
"epoch": 0.8902503995737879,
"grad_norm": 0.5546875,
"learning_rate": 3.7187819912768005e-05,
"loss": 0.7228208780288696,
"step": 3342
},
{
"epoch": 0.8907831646244007,
"grad_norm": 0.55078125,
"learning_rate": 3.7036081828165353e-05,
"loss": 0.6675580143928528,
"step": 3344
},
{
"epoch": 0.8913159296750133,
"grad_norm": 0.546875,
"learning_rate": 3.688438661868153e-05,
"loss": 0.7148852944374084,
"step": 3346
},
{
"epoch": 0.891848694725626,
"grad_norm": 0.56640625,
"learning_rate": 3.673273647869226e-05,
"loss": 0.7156222462654114,
"step": 3348
},
{
"epoch": 0.8923814597762387,
"grad_norm": 0.58203125,
"learning_rate": 3.6581133601921336e-05,
"loss": 0.7250128388404846,
"step": 3350
},
{
"epoch": 0.8929142248268513,
"grad_norm": 0.5390625,
"learning_rate": 3.6429580181408836e-05,
"loss": 0.6913713812828064,
"step": 3352
},
{
"epoch": 0.8934469898774641,
"grad_norm": 0.546875,
"learning_rate": 3.6278078409479424e-05,
"loss": 0.7076290249824524,
"step": 3354
},
{
"epoch": 0.8939797549280767,
"grad_norm": 0.5625,
"learning_rate": 3.6126630477710634e-05,
"loss": 0.7225173115730286,
"step": 3356
},
{
"epoch": 0.8945125199786894,
"grad_norm": 0.5546875,
"learning_rate": 3.5975238576901175e-05,
"loss": 0.6932091116905212,
"step": 3358
},
{
"epoch": 0.895045285029302,
"grad_norm": 0.53125,
"learning_rate": 3.582390489703919e-05,
"loss": 0.6957063674926758,
"step": 3360
},
{
"epoch": 0.8955780500799148,
"grad_norm": 0.5234375,
"learning_rate": 3.567263162727067e-05,
"loss": 0.6680214405059814,
"step": 3362
},
{
"epoch": 0.8961108151305275,
"grad_norm": 0.54296875,
"learning_rate": 3.552142095586769e-05,
"loss": 0.7048709988594055,
"step": 3364
},
{
"epoch": 0.8966435801811401,
"grad_norm": 0.55078125,
"learning_rate": 3.537027507019682e-05,
"loss": 0.6847620606422424,
"step": 3366
},
{
"epoch": 0.8971763452317528,
"grad_norm": 0.5703125,
"learning_rate": 3.5219196156687454e-05,
"loss": 0.7076305747032166,
"step": 3368
},
{
"epoch": 0.8977091102823654,
"grad_norm": 0.52734375,
"learning_rate": 3.506818640080018e-05,
"loss": 0.7055946588516235,
"step": 3370
},
{
"epoch": 0.8982418753329782,
"grad_norm": 0.546875,
"learning_rate": 3.491724798699519e-05,
"loss": 0.7180204391479492,
"step": 3372
},
{
"epoch": 0.8987746403835908,
"grad_norm": 0.5234375,
"learning_rate": 3.476638309870064e-05,
"loss": 0.6561832427978516,
"step": 3374
},
{
"epoch": 0.8993074054342035,
"grad_norm": 0.55859375,
"learning_rate": 3.4615593918281126e-05,
"loss": 0.702898383140564,
"step": 3376
},
{
"epoch": 0.8998401704848162,
"grad_norm": 0.55859375,
"learning_rate": 3.446488262700601e-05,
"loss": 0.7005707621574402,
"step": 3378
},
{
"epoch": 0.9003729355354289,
"grad_norm": 0.53125,
"learning_rate": 3.431425140501801e-05,
"loss": 0.6888461709022522,
"step": 3380
},
{
"epoch": 0.9009057005860416,
"grad_norm": 0.55859375,
"learning_rate": 3.416370243130154e-05,
"loss": 0.7409310340881348,
"step": 3382
},
{
"epoch": 0.9014384656366542,
"grad_norm": 0.54296875,
"learning_rate": 3.4013237883651255e-05,
"loss": 0.6963149309158325,
"step": 3384
},
{
"epoch": 0.9019712306872669,
"grad_norm": 0.55078125,
"learning_rate": 3.386285993864053e-05,
"loss": 0.6961240172386169,
"step": 3386
},
{
"epoch": 0.9025039957378796,
"grad_norm": 0.53515625,
"learning_rate": 3.3712570771589956e-05,
"loss": 0.6980938911437988,
"step": 3388
},
{
"epoch": 0.9030367607884923,
"grad_norm": 0.55078125,
"learning_rate": 3.35623725565359e-05,
"loss": 0.7153114676475525,
"step": 3390
},
{
"epoch": 0.903569525839105,
"grad_norm": 0.546875,
"learning_rate": 3.341226746619906e-05,
"loss": 0.7112528085708618,
"step": 3392
},
{
"epoch": 0.9041022908897176,
"grad_norm": 0.54296875,
"learning_rate": 3.326225767195301e-05,
"loss": 0.6847183704376221,
"step": 3394
},
{
"epoch": 0.9046350559403303,
"grad_norm": 0.52734375,
"learning_rate": 3.3112345343792765e-05,
"loss": 0.7189138531684875,
"step": 3396
},
{
"epoch": 0.905167820990943,
"grad_norm": 0.546875,
"learning_rate": 3.2962532650303476e-05,
"loss": 0.6928481459617615,
"step": 3398
},
{
"epoch": 0.9057005860415557,
"grad_norm": 0.53125,
"learning_rate": 3.2812821758628995e-05,
"loss": 0.6926461458206177,
"step": 3400
},
{
"epoch": 0.9062333510921684,
"grad_norm": 0.5625,
"learning_rate": 3.2663214834440536e-05,
"loss": 0.7446824312210083,
"step": 3402
},
{
"epoch": 0.906766116142781,
"grad_norm": 0.515625,
"learning_rate": 3.2513714041905354e-05,
"loss": 0.6290156245231628,
"step": 3404
},
{
"epoch": 0.9072988811933937,
"grad_norm": 0.546875,
"learning_rate": 3.2364321543655414e-05,
"loss": 0.7160915732383728,
"step": 3406
},
{
"epoch": 0.9078316462440064,
"grad_norm": 0.5234375,
"learning_rate": 3.221503950075619e-05,
"loss": 0.6737069487571716,
"step": 3408
},
{
"epoch": 0.9083644112946191,
"grad_norm": 0.57421875,
"learning_rate": 3.206587007267528e-05,
"loss": 0.7074106931686401,
"step": 3410
},
{
"epoch": 0.9088971763452317,
"grad_norm": 0.55859375,
"learning_rate": 3.191681541725128e-05,
"loss": 0.7356147766113281,
"step": 3412
},
{
"epoch": 0.9094299413958444,
"grad_norm": 0.546875,
"learning_rate": 3.176787769066247e-05,
"loss": 0.7018274068832397,
"step": 3414
},
{
"epoch": 0.9099627064464572,
"grad_norm": 0.5703125,
"learning_rate": 3.161905904739569e-05,
"loss": 0.7239270806312561,
"step": 3416
},
{
"epoch": 0.9104954714970698,
"grad_norm": 0.53125,
"learning_rate": 3.147036164021519e-05,
"loss": 0.661172091960907,
"step": 3418
},
{
"epoch": 0.9110282365476825,
"grad_norm": 0.52734375,
"learning_rate": 3.1321787620131435e-05,
"loss": 0.667536735534668,
"step": 3420
},
{
"epoch": 0.9115610015982951,
"grad_norm": 0.515625,
"learning_rate": 3.117333913636999e-05,
"loss": 0.6705557703971863,
"step": 3422
},
{
"epoch": 0.9120937666489078,
"grad_norm": 0.5390625,
"learning_rate": 3.1025018336340484e-05,
"loss": 0.6964682936668396,
"step": 3424
},
{
"epoch": 0.9126265316995205,
"grad_norm": 0.5390625,
"learning_rate": 3.08768273656055e-05,
"loss": 0.7074109315872192,
"step": 3426
},
{
"epoch": 0.9131592967501332,
"grad_norm": 0.53125,
"learning_rate": 3.0728768367849545e-05,
"loss": 0.6577396988868713,
"step": 3428
},
{
"epoch": 0.9136920618007459,
"grad_norm": 0.54296875,
"learning_rate": 3.058084348484806e-05,
"loss": 0.6772574782371521,
"step": 3430
},
{
"epoch": 0.9142248268513585,
"grad_norm": 0.52734375,
"learning_rate": 3.0433054856436395e-05,
"loss": 0.670283854007721,
"step": 3432
},
{
"epoch": 0.9147575919019713,
"grad_norm": 0.546875,
"learning_rate": 3.0285404620478912e-05,
"loss": 0.6941147446632385,
"step": 3434
},
{
"epoch": 0.9152903569525839,
"grad_norm": 0.53125,
"learning_rate": 3.0137894912838027e-05,
"loss": 0.652153730392456,
"step": 3436
},
{
"epoch": 0.9158231220031966,
"grad_norm": 0.51953125,
"learning_rate": 2.999052786734331e-05,
"loss": 0.6745891571044922,
"step": 3438
},
{
"epoch": 0.9163558870538092,
"grad_norm": 0.53125,
"learning_rate": 2.9843305615760623e-05,
"loss": 0.7068504095077515,
"step": 3440
},
{
"epoch": 0.9168886521044219,
"grad_norm": 0.5546875,
"learning_rate": 2.9696230287761288e-05,
"loss": 0.699682354927063,
"step": 3442
},
{
"epoch": 0.9174214171550347,
"grad_norm": 0.53515625,
"learning_rate": 2.954930401089127e-05,
"loss": 0.6988564729690552,
"step": 3444
},
{
"epoch": 0.9179541822056473,
"grad_norm": 0.53125,
"learning_rate": 2.9402528910540433e-05,
"loss": 0.6744426488876343,
"step": 3446
},
{
"epoch": 0.91848694725626,
"grad_norm": 0.5546875,
"learning_rate": 2.9255907109911725e-05,
"loss": 0.6920981407165527,
"step": 3448
},
{
"epoch": 0.9190197123068726,
"grad_norm": 0.52734375,
"learning_rate": 2.9109440729990533e-05,
"loss": 0.7215517163276672,
"step": 3450
},
{
"epoch": 0.9195524773574854,
"grad_norm": 0.53515625,
"learning_rate": 2.8963131889513986e-05,
"loss": 0.6962135434150696,
"step": 3452
},
{
"epoch": 0.9200852424080981,
"grad_norm": 0.515625,
"learning_rate": 2.8816982704940276e-05,
"loss": 0.6650117635726929,
"step": 3454
},
{
"epoch": 0.9206180074587107,
"grad_norm": 0.5546875,
"learning_rate": 2.8670995290418077e-05,
"loss": 0.7167315483093262,
"step": 3456
},
{
"epoch": 0.9211507725093234,
"grad_norm": 0.5234375,
"learning_rate": 2.8525171757755932e-05,
"loss": 0.6390708684921265,
"step": 3458
},
{
"epoch": 0.921683537559936,
"grad_norm": 0.5546875,
"learning_rate": 2.837951421639174e-05,
"loss": 0.7065415978431702,
"step": 3460
},
{
"epoch": 0.9222163026105488,
"grad_norm": 0.5234375,
"learning_rate": 2.8234024773362202e-05,
"loss": 0.6877092719078064,
"step": 3462
},
{
"epoch": 0.9227490676611614,
"grad_norm": 0.53125,
"learning_rate": 2.8088705533272382e-05,
"loss": 0.6896106600761414,
"step": 3464
},
{
"epoch": 0.9232818327117741,
"grad_norm": 0.5546875,
"learning_rate": 2.7943558598265218e-05,
"loss": 0.7128964066505432,
"step": 3466
},
{
"epoch": 0.9238145977623868,
"grad_norm": 0.515625,
"learning_rate": 2.7798586067991142e-05,
"loss": 0.683221697807312,
"step": 3468
},
{
"epoch": 0.9243473628129995,
"grad_norm": 0.5390625,
"learning_rate": 2.7653790039577725e-05,
"loss": 0.6766422390937805,
"step": 3470
},
{
"epoch": 0.9248801278636122,
"grad_norm": 0.55078125,
"learning_rate": 2.750917260759928e-05,
"loss": 0.6711747646331787,
"step": 3472
},
{
"epoch": 0.9254128929142248,
"grad_norm": 0.5390625,
"learning_rate": 2.7364735864046625e-05,
"loss": 0.7293622493743896,
"step": 3474
},
{
"epoch": 0.9259456579648375,
"grad_norm": 0.5234375,
"learning_rate": 2.7220481898296793e-05,
"loss": 0.65350741147995,
"step": 3476
},
{
"epoch": 0.9264784230154501,
"grad_norm": 0.56640625,
"learning_rate": 2.70764127970828e-05,
"loss": 0.6858737468719482,
"step": 3478
},
{
"epoch": 0.9270111880660629,
"grad_norm": 0.5234375,
"learning_rate": 2.693253064446348e-05,
"loss": 0.7053213715553284,
"step": 3480
},
{
"epoch": 0.9275439531166756,
"grad_norm": 0.546875,
"learning_rate": 2.678883752179333e-05,
"loss": 0.6678943037986755,
"step": 3482
},
{
"epoch": 0.9280767181672882,
"grad_norm": 0.54296875,
"learning_rate": 2.664533550769236e-05,
"loss": 0.6876240968704224,
"step": 3484
},
{
"epoch": 0.9286094832179009,
"grad_norm": 0.54296875,
"learning_rate": 2.6502026678016117e-05,
"loss": 0.7180918455123901,
"step": 3486
},
{
"epoch": 0.9291422482685135,
"grad_norm": 0.51953125,
"learning_rate": 2.6358913105825564e-05,
"loss": 0.6760578155517578,
"step": 3488
},
{
"epoch": 0.9296750133191263,
"grad_norm": 0.5625,
"learning_rate": 2.6215996861357152e-05,
"loss": 0.7332901954650879,
"step": 3490
},
{
"epoch": 0.9302077783697389,
"grad_norm": 0.53125,
"learning_rate": 2.6073280011992833e-05,
"loss": 0.6666563749313354,
"step": 3492
},
{
"epoch": 0.9307405434203516,
"grad_norm": 0.5234375,
"learning_rate": 2.59307646222302e-05,
"loss": 0.664758026599884,
"step": 3494
},
{
"epoch": 0.9312733084709643,
"grad_norm": 0.515625,
"learning_rate": 2.5788452753652563e-05,
"loss": 0.6465103030204773,
"step": 3496
},
{
"epoch": 0.931806073521577,
"grad_norm": 0.515625,
"learning_rate": 2.564634646489917e-05,
"loss": 0.675029456615448,
"step": 3498
},
{
"epoch": 0.9323388385721897,
"grad_norm": 0.5234375,
"learning_rate": 2.5504447811635435e-05,
"loss": 0.6932930946350098,
"step": 3500
},
{
"epoch": 0.9328716036228023,
"grad_norm": 0.546875,
"learning_rate": 2.536275884652312e-05,
"loss": 0.7141386270523071,
"step": 3502
},
{
"epoch": 0.933404368673415,
"grad_norm": 0.5390625,
"learning_rate": 2.522128161919077e-05,
"loss": 0.7156549692153931,
"step": 3504
},
{
"epoch": 0.9339371337240278,
"grad_norm": 0.5390625,
"learning_rate": 2.508001817620396e-05,
"loss": 0.7093266844749451,
"step": 3506
},
{
"epoch": 0.9344698987746404,
"grad_norm": 0.5390625,
"learning_rate": 2.4938970561035753e-05,
"loss": 0.6900848150253296,
"step": 3508
},
{
"epoch": 0.9350026638252531,
"grad_norm": 0.5546875,
"learning_rate": 2.479814081403709e-05,
"loss": 0.684967041015625,
"step": 3510
},
{
"epoch": 0.9355354288758657,
"grad_norm": 0.5390625,
"learning_rate": 2.4657530972407316e-05,
"loss": 0.6972053647041321,
"step": 3512
},
{
"epoch": 0.9360681939264784,
"grad_norm": 0.55078125,
"learning_rate": 2.4517143070164683e-05,
"loss": 0.7469601631164551,
"step": 3514
},
{
"epoch": 0.9366009589770911,
"grad_norm": 0.55078125,
"learning_rate": 2.437697913811694e-05,
"loss": 0.7062028050422668,
"step": 3516
},
{
"epoch": 0.9371337240277038,
"grad_norm": 0.52734375,
"learning_rate": 2.423704120383195e-05,
"loss": 0.6981968879699707,
"step": 3518
},
{
"epoch": 0.9376664890783165,
"grad_norm": 0.56640625,
"learning_rate": 2.409733129160836e-05,
"loss": 0.7789372205734253,
"step": 3520
},
{
"epoch": 0.9381992541289291,
"grad_norm": 0.54296875,
"learning_rate": 2.395785142244634e-05,
"loss": 0.7303587198257446,
"step": 3522
},
{
"epoch": 0.9387320191795419,
"grad_norm": 0.53515625,
"learning_rate": 2.38186036140183e-05,
"loss": 0.6874336004257202,
"step": 3524
},
{
"epoch": 0.9392647842301545,
"grad_norm": 0.55859375,
"learning_rate": 2.3679589880639758e-05,
"loss": 0.736723005771637,
"step": 3526
},
{
"epoch": 0.9397975492807672,
"grad_norm": 0.53515625,
"learning_rate": 2.3540812233240154e-05,
"loss": 0.6807746291160583,
"step": 3528
},
{
"epoch": 0.9403303143313798,
"grad_norm": 0.5390625,
"learning_rate": 2.3402272679333798e-05,
"loss": 0.6877115964889526,
"step": 3530
},
{
"epoch": 0.9408630793819925,
"grad_norm": 0.54296875,
"learning_rate": 2.326397322299079e-05,
"loss": 0.7108508348464966,
"step": 3532
},
{
"epoch": 0.9413958444326053,
"grad_norm": 0.53515625,
"learning_rate": 2.312591586480811e-05,
"loss": 0.6459025740623474,
"step": 3534
},
{
"epoch": 0.9419286094832179,
"grad_norm": 0.54296875,
"learning_rate": 2.298810260188054e-05,
"loss": 0.6610309481620789,
"step": 3536
},
{
"epoch": 0.9424613745338306,
"grad_norm": 0.52734375,
"learning_rate": 2.285053542777191e-05,
"loss": 0.6491233706474304,
"step": 3538
},
{
"epoch": 0.9429941395844432,
"grad_norm": 0.55078125,
"learning_rate": 2.2713216332486187e-05,
"loss": 0.7502667307853699,
"step": 3540
},
{
"epoch": 0.943526904635056,
"grad_norm": 0.515625,
"learning_rate": 2.257614730243872e-05,
"loss": 0.6700000762939453,
"step": 3542
},
{
"epoch": 0.9440596696856686,
"grad_norm": 0.53515625,
"learning_rate": 2.2439330320427484e-05,
"loss": 0.6700481176376343,
"step": 3544
},
{
"epoch": 0.9445924347362813,
"grad_norm": 0.53515625,
"learning_rate": 2.2302767365604403e-05,
"loss": 0.682191789150238,
"step": 3546
},
{
"epoch": 0.945125199786894,
"grad_norm": 0.5234375,
"learning_rate": 2.2166460413446725e-05,
"loss": 0.687681257724762,
"step": 3548
},
{
"epoch": 0.9456579648375066,
"grad_norm": 0.53125,
"learning_rate": 2.203041143572845e-05,
"loss": 0.6589215993881226,
"step": 3550
},
{
"epoch": 0.9461907298881194,
"grad_norm": 0.52734375,
"learning_rate": 2.18946224004918e-05,
"loss": 0.6683045029640198,
"step": 3552
},
{
"epoch": 0.946723494938732,
"grad_norm": 0.5390625,
"learning_rate": 2.175909527201872e-05,
"loss": 0.695706307888031,
"step": 3554
},
{
"epoch": 0.9472562599893447,
"grad_norm": 0.51953125,
"learning_rate": 2.1623832010802525e-05,
"loss": 0.6750278472900391,
"step": 3556
},
{
"epoch": 0.9477890250399574,
"grad_norm": 0.546875,
"learning_rate": 2.1488834573519506e-05,
"loss": 0.7252264022827148,
"step": 3558
},
{
"epoch": 0.94832179009057,
"grad_norm": 0.546875,
"learning_rate": 2.1354104913000616e-05,
"loss": 0.7190086841583252,
"step": 3560
},
{
"epoch": 0.9488545551411828,
"grad_norm": 0.53515625,
"learning_rate": 2.1219644978203246e-05,
"loss": 0.6598690152168274,
"step": 3562
},
{
"epoch": 0.9493873201917954,
"grad_norm": 0.54296875,
"learning_rate": 2.1085456714183002e-05,
"loss": 0.6947650909423828,
"step": 3564
},
{
"epoch": 0.9499200852424081,
"grad_norm": 0.5234375,
"learning_rate": 2.0951542062065596e-05,
"loss": 0.6635608673095703,
"step": 3566
},
{
"epoch": 0.9504528502930207,
"grad_norm": 0.5234375,
"learning_rate": 2.0817902959018755e-05,
"loss": 0.6340133547782898,
"step": 3568
},
{
"epoch": 0.9509856153436335,
"grad_norm": 0.515625,
"learning_rate": 2.068454133822419e-05,
"loss": 0.696979284286499,
"step": 3570
},
{
"epoch": 0.9515183803942462,
"grad_norm": 0.51171875,
"learning_rate": 2.0551459128849662e-05,
"loss": 0.7071898579597473,
"step": 3572
},
{
"epoch": 0.9520511454448588,
"grad_norm": 0.5390625,
"learning_rate": 2.041865825602102e-05,
"loss": 0.6977174282073975,
"step": 3574
},
{
"epoch": 0.9525839104954715,
"grad_norm": 0.52734375,
"learning_rate": 2.0286140640794416e-05,
"loss": 0.6723811030387878,
"step": 3576
},
{
"epoch": 0.9531166755460841,
"grad_norm": 0.51953125,
"learning_rate": 2.015390820012847e-05,
"loss": 0.6628782153129578,
"step": 3578
},
{
"epoch": 0.9536494405966969,
"grad_norm": 0.53125,
"learning_rate": 2.0021962846856556e-05,
"loss": 0.6885560154914856,
"step": 3580
},
{
"epoch": 0.9541822056473095,
"grad_norm": 0.53125,
"learning_rate": 1.989030648965914e-05,
"loss": 0.6671193242073059,
"step": 3582
},
{
"epoch": 0.9547149706979222,
"grad_norm": 0.52734375,
"learning_rate": 1.975894103303615e-05,
"loss": 0.6817159652709961,
"step": 3584
},
{
"epoch": 0.9552477357485349,
"grad_norm": 0.5234375,
"learning_rate": 1.962786837727944e-05,
"loss": 0.6346890926361084,
"step": 3586
},
{
"epoch": 0.9557805007991476,
"grad_norm": 0.53515625,
"learning_rate": 1.949709041844532e-05,
"loss": 0.6845383644104004,
"step": 3588
},
{
"epoch": 0.9563132658497603,
"grad_norm": 0.5390625,
"learning_rate": 1.936660904832705e-05,
"loss": 0.7126829624176025,
"step": 3590
},
{
"epoch": 0.9568460309003729,
"grad_norm": 0.515625,
"learning_rate": 1.9236426154427583e-05,
"loss": 0.647582471370697,
"step": 3592
},
{
"epoch": 0.9573787959509856,
"grad_norm": 0.5234375,
"learning_rate": 1.9106543619932188e-05,
"loss": 0.6961623430252075,
"step": 3594
},
{
"epoch": 0.9579115610015982,
"grad_norm": 0.5625,
"learning_rate": 1.8976963323681227e-05,
"loss": 0.7102996110916138,
"step": 3596
},
{
"epoch": 0.958444326052211,
"grad_norm": 0.546875,
"learning_rate": 1.8847687140142987e-05,
"loss": 0.7002482414245605,
"step": 3598
},
{
"epoch": 0.9589770911028237,
"grad_norm": 0.54296875,
"learning_rate": 1.8718716939386543e-05,
"loss": 0.6747076511383057,
"step": 3600
},
{
"epoch": 0.9595098561534363,
"grad_norm": 0.5546875,
"learning_rate": 1.8590054587054728e-05,
"loss": 0.6580986380577087,
"step": 3602
},
{
"epoch": 0.960042621204049,
"grad_norm": 0.51953125,
"learning_rate": 1.8461701944337137e-05,
"loss": 0.6556539535522461,
"step": 3604
},
{
"epoch": 0.9605753862546617,
"grad_norm": 0.53125,
"learning_rate": 1.8333660867943163e-05,
"loss": 0.6604914665222168,
"step": 3606
},
{
"epoch": 0.9611081513052744,
"grad_norm": 0.53125,
"learning_rate": 1.820593321007525e-05,
"loss": 0.6752309203147888,
"step": 3608
},
{
"epoch": 0.961640916355887,
"grad_norm": 0.51171875,
"learning_rate": 1.807852081840197e-05,
"loss": 0.6571139097213745,
"step": 3610
},
{
"epoch": 0.9621736814064997,
"grad_norm": 0.53515625,
"learning_rate": 1.7951425536031374e-05,
"loss": 0.6813245415687561,
"step": 3612
},
{
"epoch": 0.9627064464571125,
"grad_norm": 0.51953125,
"learning_rate": 1.7824649201484306e-05,
"loss": 0.6753969788551331,
"step": 3614
},
{
"epoch": 0.9632392115077251,
"grad_norm": 0.54296875,
"learning_rate": 1.76981936486678e-05,
"loss": 0.6771748065948486,
"step": 3616
},
{
"epoch": 0.9637719765583378,
"grad_norm": 0.54296875,
"learning_rate": 1.7572060706848576e-05,
"loss": 0.7080951929092407,
"step": 3618
},
{
"epoch": 0.9643047416089504,
"grad_norm": 0.53125,
"learning_rate": 1.7446252200626555e-05,
"loss": 0.6531881093978882,
"step": 3620
},
{
"epoch": 0.9648375066595631,
"grad_norm": 0.51171875,
"learning_rate": 1.732076994990849e-05,
"loss": 0.6644502878189087,
"step": 3622
},
{
"epoch": 0.9653702717101759,
"grad_norm": 0.51953125,
"learning_rate": 1.719561576988158e-05,
"loss": 0.6807780861854553,
"step": 3624
},
{
"epoch": 0.9659030367607885,
"grad_norm": 0.53515625,
"learning_rate": 1.7070791470987295e-05,
"loss": 0.6746936440467834,
"step": 3626
},
{
"epoch": 0.9664358018114012,
"grad_norm": 0.52734375,
"learning_rate": 1.6946298858895144e-05,
"loss": 0.7012004256248474,
"step": 3628
},
{
"epoch": 0.9669685668620138,
"grad_norm": 0.52734375,
"learning_rate": 1.6822139734476546e-05,
"loss": 0.677057147026062,
"step": 3630
},
{
"epoch": 0.9675013319126265,
"grad_norm": 0.51171875,
"learning_rate": 1.6698315893778788e-05,
"loss": 0.6767706871032715,
"step": 3632
},
{
"epoch": 0.9680340969632392,
"grad_norm": 0.53125,
"learning_rate": 1.6574829127999067e-05,
"loss": 0.6860625743865967,
"step": 3634
},
{
"epoch": 0.9685668620138519,
"grad_norm": 0.546875,
"learning_rate": 1.645168122345854e-05,
"loss": 0.6813229918479919,
"step": 3636
},
{
"epoch": 0.9690996270644646,
"grad_norm": 0.54296875,
"learning_rate": 1.6328873961576506e-05,
"loss": 0.6505600214004517,
"step": 3638
},
{
"epoch": 0.9696323921150772,
"grad_norm": 0.5390625,
"learning_rate": 1.6206409118844654e-05,
"loss": 0.6736657023429871,
"step": 3640
},
{
"epoch": 0.97016515716569,
"grad_norm": 0.5390625,
"learning_rate": 1.6084288466801295e-05,
"loss": 0.7095986008644104,
"step": 3642
},
{
"epoch": 0.9706979222163026,
"grad_norm": 0.51171875,
"learning_rate": 1.5962513772005836e-05,
"loss": 0.6710772514343262,
"step": 3644
},
{
"epoch": 0.9712306872669153,
"grad_norm": 0.51953125,
"learning_rate": 1.5841086796013142e-05,
"loss": 0.6877874135971069,
"step": 3646
},
{
"epoch": 0.9717634523175279,
"grad_norm": 0.55078125,
"learning_rate": 1.5720009295348103e-05,
"loss": 0.6922812461853027,
"step": 3648
},
{
"epoch": 0.9722962173681406,
"grad_norm": 0.52734375,
"learning_rate": 1.55992830214802e-05,
"loss": 0.7035001516342163,
"step": 3650
},
{
"epoch": 0.9728289824187534,
"grad_norm": 0.51953125,
"learning_rate": 1.5478909720798187e-05,
"loss": 0.6592362523078918,
"step": 3652
},
{
"epoch": 0.973361747469366,
"grad_norm": 0.53125,
"learning_rate": 1.5358891134584802e-05,
"loss": 0.7107102274894714,
"step": 3654
},
{
"epoch": 0.9738945125199787,
"grad_norm": 0.51953125,
"learning_rate": 1.52392289989916e-05,
"loss": 0.6652364730834961,
"step": 3656
},
{
"epoch": 0.9744272775705913,
"grad_norm": 0.5390625,
"learning_rate": 1.5119925045013832e-05,
"loss": 0.6816696524620056,
"step": 3658
},
{
"epoch": 0.9749600426212041,
"grad_norm": 0.53125,
"learning_rate": 1.5000980998465409e-05,
"loss": 0.6663632392883301,
"step": 3660
},
{
"epoch": 0.9754928076718167,
"grad_norm": 0.53125,
"learning_rate": 1.4882398579953928e-05,
"loss": 0.6931334733963013,
"step": 3662
},
{
"epoch": 0.9760255727224294,
"grad_norm": 0.53515625,
"learning_rate": 1.4764179504855793e-05,
"loss": 0.7170167565345764,
"step": 3664
},
{
"epoch": 0.9765583377730421,
"grad_norm": 0.5078125,
"learning_rate": 1.4646325483291386e-05,
"loss": 0.6615394949913025,
"step": 3666
},
{
"epoch": 0.9770911028236547,
"grad_norm": 0.5078125,
"learning_rate": 1.4528838220100344e-05,
"loss": 0.6396226286888123,
"step": 3668
},
{
"epoch": 0.9776238678742675,
"grad_norm": 0.546875,
"learning_rate": 1.4411719414816893e-05,
"loss": 0.6949025988578796,
"step": 3670
},
{
"epoch": 0.9781566329248801,
"grad_norm": 0.5234375,
"learning_rate": 1.429497076164526e-05,
"loss": 0.6923924684524536,
"step": 3672
},
{
"epoch": 0.9786893979754928,
"grad_norm": 0.5390625,
"learning_rate": 1.4178593949435162e-05,
"loss": 0.7032251954078674,
"step": 3674
},
{
"epoch": 0.9792221630261055,
"grad_norm": 0.5234375,
"learning_rate": 1.40625906616574e-05,
"loss": 0.6616812944412231,
"step": 3676
},
{
"epoch": 0.9797549280767182,
"grad_norm": 0.52734375,
"learning_rate": 1.3946962576379446e-05,
"loss": 0.707070529460907,
"step": 3678
},
{
"epoch": 0.9802876931273309,
"grad_norm": 0.53125,
"learning_rate": 1.3831711366241244e-05,
"loss": 0.6896175146102905,
"step": 3680
},
{
"epoch": 0.9808204581779435,
"grad_norm": 0.51953125,
"learning_rate": 1.3716838698430972e-05,
"loss": 0.6856859922409058,
"step": 3682
},
{
"epoch": 0.9813532232285562,
"grad_norm": 0.53125,
"learning_rate": 1.3602346234660928e-05,
"loss": 0.7103138566017151,
"step": 3684
},
{
"epoch": 0.9818859882791688,
"grad_norm": 0.52734375,
"learning_rate": 1.3488235631143498e-05,
"loss": 0.6692371964454651,
"step": 3686
},
{
"epoch": 0.9824187533297816,
"grad_norm": 0.53515625,
"learning_rate": 1.3374508538567198e-05,
"loss": 0.7444373965263367,
"step": 3688
},
{
"epoch": 0.9829515183803943,
"grad_norm": 0.53125,
"learning_rate": 1.326116660207279e-05,
"loss": 0.705718994140625,
"step": 3690
},
{
"epoch": 0.9834842834310069,
"grad_norm": 0.5078125,
"learning_rate": 1.3148211461229497e-05,
"loss": 0.6810398101806641,
"step": 3692
},
{
"epoch": 0.9840170484816196,
"grad_norm": 0.54296875,
"learning_rate": 1.3035644750011262e-05,
"loss": 0.7048704624176025,
"step": 3694
},
{
"epoch": 0.9845498135322323,
"grad_norm": 0.55078125,
"learning_rate": 1.292346809677314e-05,
"loss": 0.7215117812156677,
"step": 3696
},
{
"epoch": 0.985082578582845,
"grad_norm": 0.5390625,
"learning_rate": 1.2811683124227719e-05,
"loss": 0.71157306432724,
"step": 3698
},
{
"epoch": 0.9856153436334576,
"grad_norm": 0.54296875,
"learning_rate": 1.270029144942166e-05,
"loss": 0.6895602345466614,
"step": 3700
},
{
"epoch": 0.9861481086840703,
"grad_norm": 0.53125,
"learning_rate": 1.2589294683712302e-05,
"loss": 0.6617645621299744,
"step": 3702
},
{
"epoch": 0.986680873734683,
"grad_norm": 0.5234375,
"learning_rate": 1.2478694432744342e-05,
"loss": 0.6597418785095215,
"step": 3704
},
{
"epoch": 0.9872136387852957,
"grad_norm": 0.5234375,
"learning_rate": 1.2368492296426636e-05,
"loss": 0.6748197078704834,
"step": 3706
},
{
"epoch": 0.9877464038359084,
"grad_norm": 0.5390625,
"learning_rate": 1.2258689868909021e-05,
"loss": 0.6904255151748657,
"step": 3708
},
{
"epoch": 0.988279168886521,
"grad_norm": 0.54296875,
"learning_rate": 1.2149288738559295e-05,
"loss": 0.6973739266395569,
"step": 3710
},
{
"epoch": 0.9888119339371337,
"grad_norm": 0.53515625,
"learning_rate": 1.2040290487940166e-05,
"loss": 0.7195823192596436,
"step": 3712
},
{
"epoch": 0.9893446989877464,
"grad_norm": 0.52734375,
"learning_rate": 1.1931696693786461e-05,
"loss": 0.6562846899032593,
"step": 3714
},
{
"epoch": 0.9898774640383591,
"grad_norm": 0.51171875,
"learning_rate": 1.1823508926982239e-05,
"loss": 0.6495825052261353,
"step": 3716
},
{
"epoch": 0.9904102290889718,
"grad_norm": 0.5234375,
"learning_rate": 1.1715728752538103e-05,
"loss": 0.7018332481384277,
"step": 3718
},
{
"epoch": 0.9909429941395844,
"grad_norm": 0.51953125,
"learning_rate": 1.1608357729568547e-05,
"loss": 0.6511444449424744,
"step": 3720
},
{
"epoch": 0.9914757591901971,
"grad_norm": 0.5234375,
"learning_rate": 1.1501397411269415e-05,
"loss": 0.6892027258872986,
"step": 3722
},
{
"epoch": 0.9920085242408098,
"grad_norm": 0.53515625,
"learning_rate": 1.1394849344895413e-05,
"loss": 0.6872977018356323,
"step": 3724
},
{
"epoch": 0.9925412892914225,
"grad_norm": 0.5546875,
"learning_rate": 1.1288715071737743e-05,
"loss": 0.7205137014389038,
"step": 3726
},
{
"epoch": 0.9930740543420352,
"grad_norm": 0.5234375,
"learning_rate": 1.1182996127101822e-05,
"loss": 0.6928572058677673,
"step": 3728
},
{
"epoch": 0.9936068193926478,
"grad_norm": 0.5078125,
"learning_rate": 1.1077694040285008e-05,
"loss": 0.6700318455696106,
"step": 3730
},
{
"epoch": 0.9941395844432606,
"grad_norm": 0.55078125,
"learning_rate": 1.0972810334554565e-05,
"loss": 0.6770192384719849,
"step": 3732
},
{
"epoch": 0.9946723494938732,
"grad_norm": 0.53515625,
"learning_rate": 1.086834652712557e-05,
"loss": 0.7142175436019897,
"step": 3734
},
{
"epoch": 0.9952051145444859,
"grad_norm": 0.5234375,
"learning_rate": 1.076430412913899e-05,
"loss": 0.6775243282318115,
"step": 3736
},
{
"epoch": 0.9957378795950985,
"grad_norm": 0.5234375,
"learning_rate": 1.0660684645639808e-05,
"loss": 0.6838144063949585,
"step": 3738
},
{
"epoch": 0.9962706446457112,
"grad_norm": 0.51953125,
"learning_rate": 1.055748957555525e-05,
"loss": 0.6580877900123596,
"step": 3740
},
{
"epoch": 0.996803409696324,
"grad_norm": 0.54296875,
"learning_rate": 1.045472041167313e-05,
"loss": 0.6969035267829895,
"step": 3742
},
{
"epoch": 0.9973361747469366,
"grad_norm": 0.52734375,
"learning_rate": 1.0352378640620211e-05,
"loss": 0.683407187461853,
"step": 3744
},
{
"epoch": 0.9978689397975493,
"grad_norm": 0.5078125,
"learning_rate": 1.0250465742840743e-05,
"loss": 0.6365366578102112,
"step": 3746
},
{
"epoch": 0.9984017048481619,
"grad_norm": 0.54296875,
"learning_rate": 1.0148983192575023e-05,
"loss": 0.6633042097091675,
"step": 3748
},
{
"epoch": 0.9989344698987747,
"grad_norm": 0.53515625,
"learning_rate": 1.0047932457838066e-05,
"loss": 0.6902580261230469,
"step": 3750
},
{
"epoch": 0.9994672349493873,
"grad_norm": 0.51953125,
"learning_rate": 9.947315000398392e-06,
"loss": 0.6602581739425659,
"step": 3752
},
{
"epoch": 1.0,
"grad_norm": 0.59375,
"learning_rate": 9.847132275756857e-06,
"loss": 0.7133287191390991,
"step": 3754
},
{
"epoch": 1.0005327650506126,
"grad_norm": 0.6171875,
"learning_rate": 9.74738573312561e-06,
"loss": 0.554438054561615,
"step": 3756
},
{
"epoch": 1.0010655301012255,
"grad_norm": 0.5859375,
"learning_rate": 9.648076815407123e-06,
"loss": 0.5593016147613525,
"step": 3758
},
{
"epoch": 1.001598295151838,
"grad_norm": 0.60546875,
"learning_rate": 9.549206959173331e-06,
"loss": 0.5708969831466675,
"step": 3760
},
{
"epoch": 1.0021310602024507,
"grad_norm": 0.56640625,
"learning_rate": 9.45077759464485e-06,
"loss": 0.5818780660629272,
"step": 3762
},
{
"epoch": 1.0026638252530633,
"grad_norm": 0.58203125,
"learning_rate": 9.352790145670237e-06,
"loss": 0.5812588930130005,
"step": 3764
},
{
"epoch": 1.0031965903036761,
"grad_norm": 0.53515625,
"learning_rate": 9.255246029705476e-06,
"loss": 0.6045463681221008,
"step": 3766
},
{
"epoch": 1.0037293553542888,
"grad_norm": 0.51171875,
"learning_rate": 9.158146657793429e-06,
"loss": 0.5340660810470581,
"step": 3768
},
{
"epoch": 1.0042621204049014,
"grad_norm": 0.51171875,
"learning_rate": 9.061493434543425e-06,
"loss": 0.569497287273407,
"step": 3770
},
{
"epoch": 1.0047948854555142,
"grad_norm": 0.52734375,
"learning_rate": 8.965287758110932e-06,
"loss": 0.5447984337806702,
"step": 3772
},
{
"epoch": 1.0053276505061268,
"grad_norm": 0.51953125,
"learning_rate": 8.869531020177367e-06,
"loss": 0.5706038475036621,
"step": 3774
},
{
"epoch": 1.0058604155567394,
"grad_norm": 0.53515625,
"learning_rate": 8.774224605929924e-06,
"loss": 0.5389580130577087,
"step": 3776
},
{
"epoch": 1.006393180607352,
"grad_norm": 0.515625,
"learning_rate": 8.679369894041567e-06,
"loss": 0.5501297116279602,
"step": 3778
},
{
"epoch": 1.006925945657965,
"grad_norm": 0.51171875,
"learning_rate": 8.584968256651067e-06,
"loss": 0.5443601608276367,
"step": 3780
},
{
"epoch": 1.0074587107085775,
"grad_norm": 0.53125,
"learning_rate": 8.491021059343163e-06,
"loss": 0.5650312304496765,
"step": 3782
},
{
"epoch": 1.0079914757591901,
"grad_norm": 0.515625,
"learning_rate": 8.397529661128799e-06,
"loss": 0.5328672528266907,
"step": 3784
},
{
"epoch": 1.008524240809803,
"grad_norm": 0.515625,
"learning_rate": 8.30449541442548e-06,
"loss": 0.5154542922973633,
"step": 3786
},
{
"epoch": 1.0090570058604156,
"grad_norm": 0.51953125,
"learning_rate": 8.211919665037697e-06,
"loss": 0.5323178768157959,
"step": 3788
},
{
"epoch": 1.0095897709110282,
"grad_norm": 0.578125,
"learning_rate": 8.119803752137455e-06,
"loss": 0.5955473184585571,
"step": 3790
},
{
"epoch": 1.0101225359616408,
"grad_norm": 0.52734375,
"learning_rate": 8.028149008244921e-06,
"loss": 0.5868597030639648,
"step": 3792
},
{
"epoch": 1.0106553010122536,
"grad_norm": 0.51953125,
"learning_rate": 7.936956759209114e-06,
"loss": 0.5995841026306152,
"step": 3794
},
{
"epoch": 1.0111880660628663,
"grad_norm": 0.53125,
"learning_rate": 7.846228324188767e-06,
"loss": 0.5501196384429932,
"step": 3796
},
{
"epoch": 1.0117208311134789,
"grad_norm": 0.53515625,
"learning_rate": 7.755965015633217e-06,
"loss": 0.5720456838607788,
"step": 3798
},
{
"epoch": 1.0122535961640917,
"grad_norm": 0.53515625,
"learning_rate": 7.66616813926341e-06,
"loss": 0.5405896902084351,
"step": 3800
},
{
"epoch": 1.0127863612147043,
"grad_norm": 0.5546875,
"learning_rate": 7.57683899405305e-06,
"loss": 0.6008697748184204,
"step": 3802
},
{
"epoch": 1.013319126265317,
"grad_norm": 0.53515625,
"learning_rate": 7.487978872209783e-06,
"loss": 0.5538079738616943,
"step": 3804
},
{
"epoch": 1.0138518913159296,
"grad_norm": 0.5234375,
"learning_rate": 7.3995890591564975e-06,
"loss": 0.551633894443512,
"step": 3806
},
{
"epoch": 1.0143846563665424,
"grad_norm": 0.53125,
"learning_rate": 7.311670833512763e-06,
"loss": 0.5497746467590332,
"step": 3808
},
{
"epoch": 1.014917421417155,
"grad_norm": 0.53125,
"learning_rate": 7.224225467076284e-06,
"loss": 0.5558388829231262,
"step": 3810
},
{
"epoch": 1.0154501864677676,
"grad_norm": 0.53515625,
"learning_rate": 7.137254224804557e-06,
"loss": 0.5494096875190735,
"step": 3812
},
{
"epoch": 1.0159829515183805,
"grad_norm": 0.52734375,
"learning_rate": 7.050758364796531e-06,
"loss": 0.5741044282913208,
"step": 3814
},
{
"epoch": 1.016515716568993,
"grad_norm": 0.5390625,
"learning_rate": 6.964739138274433e-06,
"loss": 0.6052870154380798,
"step": 3816
},
{
"epoch": 1.0170484816196057,
"grad_norm": 0.53125,
"learning_rate": 6.879197789565632e-06,
"loss": 0.5531472563743591,
"step": 3818
},
{
"epoch": 1.0175812466702185,
"grad_norm": 0.5234375,
"learning_rate": 6.794135556084698e-06,
"loss": 0.5643627047538757,
"step": 3820
},
{
"epoch": 1.0181140117208312,
"grad_norm": 0.53125,
"learning_rate": 6.709553668315454e-06,
"loss": 0.5206726789474487,
"step": 3822
},
{
"epoch": 1.0186467767714438,
"grad_norm": 0.5390625,
"learning_rate": 6.625453349793196e-06,
"loss": 0.5452365279197693,
"step": 3824
},
{
"epoch": 1.0191795418220564,
"grad_norm": 0.53515625,
"learning_rate": 6.541835817086979e-06,
"loss": 0.5786857604980469,
"step": 3826
},
{
"epoch": 1.0197123068726692,
"grad_norm": 0.52734375,
"learning_rate": 6.458702279782038e-06,
"loss": 0.534633457660675,
"step": 3828
},
{
"epoch": 1.0202450719232818,
"grad_norm": 0.55078125,
"learning_rate": 6.376053940462279e-06,
"loss": 0.5802006721496582,
"step": 3830
},
{
"epoch": 1.0207778369738945,
"grad_norm": 0.53515625,
"learning_rate": 6.293891994692876e-06,
"loss": 0.5870469808578491,
"step": 3832
},
{
"epoch": 1.0213106020245073,
"grad_norm": 0.515625,
"learning_rate": 6.212217631003019e-06,
"loss": 0.5592401027679443,
"step": 3834
},
{
"epoch": 1.02184336707512,
"grad_norm": 0.5234375,
"learning_rate": 6.1310320308686354e-06,
"loss": 0.5553821921348572,
"step": 3836
},
{
"epoch": 1.0223761321257325,
"grad_norm": 0.51953125,
"learning_rate": 6.050336368695386e-06,
"loss": 0.583335816860199,
"step": 3838
},
{
"epoch": 1.0229088971763451,
"grad_norm": 0.515625,
"learning_rate": 5.9701318118016296e-06,
"loss": 0.563973069190979,
"step": 3840
},
{
"epoch": 1.023441662226958,
"grad_norm": 0.53515625,
"learning_rate": 5.8904195204015555e-06,
"loss": 0.5525979995727539,
"step": 3842
},
{
"epoch": 1.0239744272775706,
"grad_norm": 0.53515625,
"learning_rate": 5.811200647588386e-06,
"loss": 0.5597478747367859,
"step": 3844
},
{
"epoch": 1.0245071923281832,
"grad_norm": 0.51953125,
"learning_rate": 5.73247633931771e-06,
"loss": 0.5630712509155273,
"step": 3846
},
{
"epoch": 1.025039957378796,
"grad_norm": 0.5390625,
"learning_rate": 5.6542477343908944e-06,
"loss": 0.5952086448669434,
"step": 3848
},
{
"epoch": 1.0255727224294087,
"grad_norm": 0.51171875,
"learning_rate": 5.5765159644386265e-06,
"loss": 0.5853996872901917,
"step": 3850
},
{
"epoch": 1.0261054874800213,
"grad_norm": 0.53515625,
"learning_rate": 5.499282153904508e-06,
"loss": 0.5681400299072266,
"step": 3852
},
{
"epoch": 1.026638252530634,
"grad_norm": 0.54296875,
"learning_rate": 5.422547420028839e-06,
"loss": 0.5616742968559265,
"step": 3854
},
{
"epoch": 1.0271710175812467,
"grad_norm": 0.54296875,
"learning_rate": 5.346312872832422e-06,
"loss": 0.5645125508308411,
"step": 3856
},
{
"epoch": 1.0277037826318594,
"grad_norm": 0.51171875,
"learning_rate": 5.270579615100518e-06,
"loss": 0.5648372769355774,
"step": 3858
},
{
"epoch": 1.028236547682472,
"grad_norm": 0.53125,
"learning_rate": 5.19534874236689e-06,
"loss": 0.5514068603515625,
"step": 3860
},
{
"epoch": 1.0287693127330848,
"grad_norm": 0.53125,
"learning_rate": 5.120621342897951e-06,
"loss": 0.5846351385116577,
"step": 3862
},
{
"epoch": 1.0293020777836974,
"grad_norm": 0.52734375,
"learning_rate": 5.046398497677034e-06,
"loss": 0.5671142339706421,
"step": 3864
},
{
"epoch": 1.02983484283431,
"grad_norm": 0.53515625,
"learning_rate": 4.972681280388738e-06,
"loss": 0.5745285153388977,
"step": 3866
},
{
"epoch": 1.0303676078849227,
"grad_norm": 0.54296875,
"learning_rate": 4.899470757403415e-06,
"loss": 0.5890294313430786,
"step": 3868
},
{
"epoch": 1.0309003729355355,
"grad_norm": 0.53515625,
"learning_rate": 4.826767987761725e-06,
"loss": 0.5851269364356995,
"step": 3870
},
{
"epoch": 1.031433137986148,
"grad_norm": 0.5234375,
"learning_rate": 4.754574023159335e-06,
"loss": 0.5567840933799744,
"step": 3872
},
{
"epoch": 1.0319659030367607,
"grad_norm": 0.5390625,
"learning_rate": 4.682889907931696e-06,
"loss": 0.5986261963844299,
"step": 3874
},
{
"epoch": 1.0324986680873736,
"grad_norm": 0.53125,
"learning_rate": 4.611716679038925e-06,
"loss": 0.5585236549377441,
"step": 3876
},
{
"epoch": 1.0330314331379862,
"grad_norm": 0.53125,
"learning_rate": 4.5410553660508284e-06,
"loss": 0.574200451374054,
"step": 3878
},
{
"epoch": 1.0335641981885988,
"grad_norm": 0.52734375,
"learning_rate": 4.470906991131991e-06,
"loss": 0.5537621974945068,
"step": 3880
},
{
"epoch": 1.0340969632392114,
"grad_norm": 0.54296875,
"learning_rate": 4.401272569026995e-06,
"loss": 0.5498485565185547,
"step": 3882
},
{
"epoch": 1.0346297282898242,
"grad_norm": 0.5234375,
"learning_rate": 4.332153107045747e-06,
"loss": 0.5446688532829285,
"step": 3884
},
{
"epoch": 1.0351624933404369,
"grad_norm": 0.53515625,
"learning_rate": 4.263549605048898e-06,
"loss": 0.5680376291275024,
"step": 3886
},
{
"epoch": 1.0356952583910495,
"grad_norm": 0.51953125,
"learning_rate": 4.195463055433364e-06,
"loss": 0.5543019771575928,
"step": 3888
},
{
"epoch": 1.0362280234416623,
"grad_norm": 0.5390625,
"learning_rate": 4.1278944431180164e-06,
"loss": 0.5992298126220703,
"step": 3890
},
{
"epoch": 1.036760788492275,
"grad_norm": 0.53125,
"learning_rate": 4.060844745529396e-06,
"loss": 0.5606511235237122,
"step": 3892
},
{
"epoch": 1.0372935535428875,
"grad_norm": 0.53125,
"learning_rate": 3.994314932587573e-06,
"loss": 0.5762649178504944,
"step": 3894
},
{
"epoch": 1.0378263185935002,
"grad_norm": 0.5078125,
"learning_rate": 3.928305966692145e-06,
"loss": 0.5266860723495483,
"step": 3896
},
{
"epoch": 1.038359083644113,
"grad_norm": 0.5234375,
"learning_rate": 3.862818802708295e-06,
"loss": 0.5364474058151245,
"step": 3898
},
{
"epoch": 1.0388918486947256,
"grad_norm": 0.51953125,
"learning_rate": 3.7978543879529704e-06,
"loss": 0.5435348153114319,
"step": 3900
},
{
"epoch": 1.0394246137453382,
"grad_norm": 0.53515625,
"learning_rate": 3.7334136621812023e-06,
"loss": 0.608924150466919,
"step": 3902
},
{
"epoch": 1.039957378795951,
"grad_norm": 0.52734375,
"learning_rate": 3.6694975575725012e-06,
"loss": 0.5841650366783142,
"step": 3904
},
{
"epoch": 1.0404901438465637,
"grad_norm": 0.52734375,
"learning_rate": 3.606106998717351e-06,
"loss": 0.5527678728103638,
"step": 3906
},
{
"epoch": 1.0410229088971763,
"grad_norm": 0.546875,
"learning_rate": 3.5432429026038784e-06,
"loss": 0.5616254806518555,
"step": 3908
},
{
"epoch": 1.041555673947789,
"grad_norm": 0.54296875,
"learning_rate": 3.480906178604553e-06,
"loss": 0.584281861782074,
"step": 3910
},
{
"epoch": 1.0420884389984018,
"grad_norm": 0.55078125,
"learning_rate": 3.419097728463041e-06,
"loss": 0.5592966675758362,
"step": 3912
},
{
"epoch": 1.0426212040490144,
"grad_norm": 0.5234375,
"learning_rate": 3.3578184462811714e-06,
"loss": 0.5340662598609924,
"step": 3914
},
{
"epoch": 1.043153969099627,
"grad_norm": 0.53125,
"learning_rate": 3.2970692185059837e-06,
"loss": 0.5586187839508057,
"step": 3916
},
{
"epoch": 1.0436867341502398,
"grad_norm": 0.53515625,
"learning_rate": 3.236850923916919e-06,
"loss": 0.5644587278366089,
"step": 3918
},
{
"epoch": 1.0442194992008524,
"grad_norm": 0.52734375,
"learning_rate": 3.177164433613116e-06,
"loss": 0.5369923114776611,
"step": 3920
},
{
"epoch": 1.044752264251465,
"grad_norm": 0.53125,
"learning_rate": 3.1180106110007925e-06,
"loss": 0.5914398431777954,
"step": 3922
},
{
"epoch": 1.045285029302078,
"grad_norm": 0.5390625,
"learning_rate": 3.0593903117807344e-06,
"loss": 0.5819767117500305,
"step": 3924
},
{
"epoch": 1.0458177943526905,
"grad_norm": 0.53125,
"learning_rate": 3.001304383935981e-06,
"loss": 0.5808528065681458,
"step": 3926
},
{
"epoch": 1.0463505594033031,
"grad_norm": 0.52734375,
"learning_rate": 2.9437536677194976e-06,
"loss": 0.5389923453330994,
"step": 3928
},
{
"epoch": 1.0468833244539157,
"grad_norm": 0.52734375,
"learning_rate": 2.8867389956420645e-06,
"loss": 0.572472333908081,
"step": 3930
},
{
"epoch": 1.0474160895045286,
"grad_norm": 0.53515625,
"learning_rate": 2.8302611924601884e-06,
"loss": 0.5559939742088318,
"step": 3932
},
{
"epoch": 1.0479488545551412,
"grad_norm": 0.5234375,
"learning_rate": 2.7743210751642212e-06,
"loss": 0.5386156439781189,
"step": 3934
},
{
"epoch": 1.0484816196057538,
"grad_norm": 0.52734375,
"learning_rate": 2.718919452966509e-06,
"loss": 0.5663049221038818,
"step": 3936
},
{
"epoch": 1.0490143846563666,
"grad_norm": 0.51953125,
"learning_rate": 2.664057127289699e-06,
"loss": 0.5892527103424072,
"step": 3938
},
{
"epoch": 1.0495471497069793,
"grad_norm": 0.546875,
"learning_rate": 2.6097348917551204e-06,
"loss": 0.5835411548614502,
"step": 3940
},
{
"epoch": 1.0500799147575919,
"grad_norm": 0.515625,
"learning_rate": 2.55595353217136e-06,
"loss": 0.5439558029174805,
"step": 3942
},
{
"epoch": 1.0506126798082045,
"grad_norm": 0.53515625,
"learning_rate": 2.502713826522838e-06,
"loss": 0.5934330224990845,
"step": 3944
},
{
"epoch": 1.0511454448588173,
"grad_norm": 0.55078125,
"learning_rate": 2.450016544958591e-06,
"loss": 0.5806512832641602,
"step": 3946
},
{
"epoch": 1.05167820990943,
"grad_norm": 0.53515625,
"learning_rate": 2.3978624497811033e-06,
"loss": 0.5375804305076599,
"step": 3948
},
{
"epoch": 1.0522109749600426,
"grad_norm": 0.5234375,
"learning_rate": 2.3462522954353073e-06,
"loss": 0.5656116604804993,
"step": 3950
},
{
"epoch": 1.0527437400106554,
"grad_norm": 0.51953125,
"learning_rate": 2.2951868284976485e-06,
"loss": 0.5351519584655762,
"step": 3952
},
{
"epoch": 1.053276505061268,
"grad_norm": 0.51171875,
"learning_rate": 2.244666787665297e-06,
"loss": 0.5698959827423096,
"step": 3954
},
{
"epoch": 1.0538092701118806,
"grad_norm": 0.51171875,
"learning_rate": 2.194692903745459e-06,
"loss": 0.5653844475746155,
"step": 3956
},
{
"epoch": 1.0543420351624933,
"grad_norm": 0.5078125,
"learning_rate": 2.145265899644802e-06,
"loss": 0.5527917742729187,
"step": 3958
},
{
"epoch": 1.054874800213106,
"grad_norm": 0.53125,
"learning_rate": 2.096386490358997e-06,
"loss": 0.5695369243621826,
"step": 3960
},
{
"epoch": 1.0554075652637187,
"grad_norm": 0.51953125,
"learning_rate": 2.048055382962386e-06,
"loss": 0.5575898289680481,
"step": 3962
},
{
"epoch": 1.0559403303143313,
"grad_norm": 0.546875,
"learning_rate": 2.0002732765977395e-06,
"loss": 0.5826997756958008,
"step": 3964
},
{
"epoch": 1.0564730953649442,
"grad_norm": 0.5390625,
"learning_rate": 1.9530408624661624e-06,
"loss": 0.628014087677002,
"step": 3966
},
{
"epoch": 1.0570058604155568,
"grad_norm": 0.5390625,
"learning_rate": 1.9063588238170627e-06,
"loss": 0.5535210371017456,
"step": 3968
},
{
"epoch": 1.0575386254661694,
"grad_norm": 0.546875,
"learning_rate": 1.8602278359383063e-06,
"loss": 0.5898425579071045,
"step": 3970
},
{
"epoch": 1.058071390516782,
"grad_norm": 0.5234375,
"learning_rate": 1.8146485661464153e-06,
"loss": 0.556390643119812,
"step": 3972
},
{
"epoch": 1.0586041555673948,
"grad_norm": 0.546875,
"learning_rate": 1.769621673776949e-06,
"loss": 0.5970685482025146,
"step": 3974
},
{
"epoch": 1.0591369206180075,
"grad_norm": 0.51953125,
"learning_rate": 1.7251478101749163e-06,
"loss": 0.5681304335594177,
"step": 3976
},
{
"epoch": 1.05966968566862,
"grad_norm": 0.52734375,
"learning_rate": 1.6812276186854105e-06,
"loss": 0.5456256866455078,
"step": 3978
},
{
"epoch": 1.060202450719233,
"grad_norm": 0.53515625,
"learning_rate": 1.6378617346442682e-06,
"loss": 0.5776793956756592,
"step": 3980
},
{
"epoch": 1.0607352157698455,
"grad_norm": 0.5234375,
"learning_rate": 1.595050785368888e-06,
"loss": 0.5277162194252014,
"step": 3982
},
{
"epoch": 1.0612679808204581,
"grad_norm": 0.54296875,
"learning_rate": 1.5527953901491466e-06,
"loss": 0.5984737873077393,
"step": 3984
},
{
"epoch": 1.0618007458710708,
"grad_norm": 0.546875,
"learning_rate": 1.511096160238461e-06,
"loss": 0.5542199611663818,
"step": 3986
},
{
"epoch": 1.0623335109216836,
"grad_norm": 0.54296875,
"learning_rate": 1.4699536988449193e-06,
"loss": 0.5390311479568481,
"step": 3988
},
{
"epoch": 1.0628662759722962,
"grad_norm": 0.5078125,
"learning_rate": 1.4293686011225849e-06,
"loss": 0.5391555428504944,
"step": 3990
},
{
"epoch": 1.0633990410229088,
"grad_norm": 0.515625,
"learning_rate": 1.3893414541628647e-06,
"loss": 0.533847987651825,
"step": 3992
},
{
"epoch": 1.0639318060735217,
"grad_norm": 0.5234375,
"learning_rate": 1.3498728369860125e-06,
"loss": 0.5452683568000793,
"step": 3994
},
{
"epoch": 1.0644645711241343,
"grad_norm": 0.546875,
"learning_rate": 1.310963320532781e-06,
"loss": 0.5820298194885254,
"step": 3996
},
{
"epoch": 1.064997336174747,
"grad_norm": 0.51953125,
"learning_rate": 1.2726134676561385e-06,
"loss": 0.5669692754745483,
"step": 3998
},
{
"epoch": 1.0655301012253595,
"grad_norm": 0.546875,
"learning_rate": 1.2348238331131346e-06,
"loss": 0.5747166275978088,
"step": 4000
},
{
"epoch": 1.0660628662759724,
"grad_norm": 0.54296875,
"learning_rate": 1.1975949635568696e-06,
"loss": 0.6048906445503235,
"step": 4002
},
{
"epoch": 1.066595631326585,
"grad_norm": 0.53125,
"learning_rate": 1.1609273975285995e-06,
"loss": 0.5434789657592773,
"step": 4004
},
{
"epoch": 1.0671283963771976,
"grad_norm": 0.5390625,
"learning_rate": 1.1248216654499377e-06,
"loss": 0.5791047215461731,
"step": 4006
},
{
"epoch": 1.0676611614278104,
"grad_norm": 0.53515625,
"learning_rate": 1.089278289615181e-06,
"loss": 0.5469709038734436,
"step": 4008
},
{
"epoch": 1.068193926478423,
"grad_norm": 0.53125,
"learning_rate": 1.0542977841837465e-06,
"loss": 0.547351598739624,
"step": 4010
},
{
"epoch": 1.0687266915290357,
"grad_norm": 0.55859375,
"learning_rate": 1.0198806551727557e-06,
"loss": 0.6135736107826233,
"step": 4012
},
{
"epoch": 1.0692594565796485,
"grad_norm": 0.51953125,
"learning_rate": 9.860274004496939e-07,
"loss": 0.5442401766777039,
"step": 4014
},
{
"epoch": 1.069792221630261,
"grad_norm": 0.52734375,
"learning_rate": 9.527385097252195e-07,
"loss": 0.5563182234764099,
"step": 4016
},
{
"epoch": 1.0703249866808737,
"grad_norm": 0.52734375,
"learning_rate": 9.200144645460818e-07,
"loss": 0.5793603658676147,
"step": 4018
},
{
"epoch": 1.0708577517314863,
"grad_norm": 0.54296875,
"learning_rate": 8.878557382881436e-07,
"loss": 0.5915518999099731,
"step": 4020
},
{
"epoch": 1.0713905167820992,
"grad_norm": 0.515625,
"learning_rate": 8.56262796149534e-07,
"loss": 0.5501149296760559,
"step": 4022
},
{
"epoch": 1.0719232818327118,
"grad_norm": 0.5390625,
"learning_rate": 8.252360951439375e-07,
"loss": 0.5390005111694336,
"step": 4024
},
{
"epoch": 1.0724560468833244,
"grad_norm": 0.5859375,
"learning_rate": 7.947760840939688e-07,
"loss": 0.5717595219612122,
"step": 4026
},
{
"epoch": 1.072988811933937,
"grad_norm": 0.54296875,
"learning_rate": 7.648832036246712e-07,
"loss": 0.5640586614608765,
"step": 4028
},
{
"epoch": 1.0735215769845499,
"grad_norm": 0.5234375,
"learning_rate": 7.35557886157161e-07,
"loss": 0.5609626173973083,
"step": 4030
},
{
"epoch": 1.0740543420351625,
"grad_norm": 0.51953125,
"learning_rate": 7.068005559023672e-07,
"loss": 0.5454556941986084,
"step": 4032
},
{
"epoch": 1.074587107085775,
"grad_norm": 0.546875,
"learning_rate": 6.786116288548839e-07,
"loss": 0.5526958107948303,
"step": 4034
},
{
"epoch": 1.075119872136388,
"grad_norm": 0.5390625,
"learning_rate": 6.509915127869714e-07,
"loss": 0.5440658926963806,
"step": 4036
},
{
"epoch": 1.0756526371870005,
"grad_norm": 0.53515625,
"learning_rate": 6.239406072426413e-07,
"loss": 0.5510826110839844,
"step": 4038
},
{
"epoch": 1.0761854022376132,
"grad_norm": 0.52734375,
"learning_rate": 5.974593035318777e-07,
"loss": 0.5809528827667236,
"step": 4040
},
{
"epoch": 1.076718167288226,
"grad_norm": 0.5390625,
"learning_rate": 5.715479847249939e-07,
"loss": 0.5985695123672485,
"step": 4042
},
{
"epoch": 1.0772509323388386,
"grad_norm": 0.54296875,
"learning_rate": 5.46207025647072e-07,
"loss": 0.5612497329711914,
"step": 4044
},
{
"epoch": 1.0777836973894512,
"grad_norm": 0.52734375,
"learning_rate": 5.214367928725405e-07,
"loss": 0.5707313418388367,
"step": 4046
},
{
"epoch": 1.0783164624400638,
"grad_norm": 0.5234375,
"learning_rate": 4.972376447198945e-07,
"loss": 0.5503619313240051,
"step": 4048
},
{
"epoch": 1.0788492274906767,
"grad_norm": 0.5234375,
"learning_rate": 4.736099312464815e-07,
"loss": 0.5270297527313232,
"step": 4050
},
{
"epoch": 1.0793819925412893,
"grad_norm": 0.55078125,
"learning_rate": 4.505539942434656e-07,
"loss": 0.5827047824859619,
"step": 4052
},
{
"epoch": 1.079914757591902,
"grad_norm": 0.52734375,
"learning_rate": 4.280701672308585e-07,
"loss": 0.5816717147827148,
"step": 4054
},
{
"epoch": 1.0804475226425145,
"grad_norm": 0.5234375,
"learning_rate": 4.061587754527141e-07,
"loss": 0.574262261390686,
"step": 4056
},
{
"epoch": 1.0809802876931274,
"grad_norm": 0.53125,
"learning_rate": 3.84820135872408e-07,
"loss": 0.5379737615585327,
"step": 4058
},
{
"epoch": 1.08151305274374,
"grad_norm": 0.52734375,
"learning_rate": 3.640545571680765e-07,
"loss": 0.5664905905723572,
"step": 4060
},
{
"epoch": 1.0820458177943526,
"grad_norm": 0.53125,
"learning_rate": 3.438623397281227e-07,
"loss": 0.582391619682312,
"step": 4062
},
{
"epoch": 1.0825785828449654,
"grad_norm": 0.54296875,
"learning_rate": 3.2424377564687745e-07,
"loss": 0.6073220372200012,
"step": 4064
},
{
"epoch": 1.083111347895578,
"grad_norm": 0.54296875,
"learning_rate": 3.051991487203987e-07,
"loss": 0.6066765189170837,
"step": 4066
},
{
"epoch": 1.0836441129461907,
"grad_norm": 0.5234375,
"learning_rate": 2.867287344423364e-07,
"loss": 0.5733582973480225,
"step": 4068
},
{
"epoch": 1.0841768779968035,
"grad_norm": 0.5546875,
"learning_rate": 2.6883279999996294e-07,
"loss": 0.5795176029205322,
"step": 4070
},
{
"epoch": 1.0847096430474161,
"grad_norm": 0.5390625,
"learning_rate": 2.5151160427029584e-07,
"loss": 0.5567296743392944,
"step": 4072
},
{
"epoch": 1.0852424080980287,
"grad_norm": 0.53125,
"learning_rate": 2.3476539781637664e-07,
"loss": 0.5603177547454834,
"step": 4074
},
{
"epoch": 1.0857751731486414,
"grad_norm": 0.49609375,
"learning_rate": 2.1859442288361567e-07,
"loss": 0.5176109671592712,
"step": 4076
},
{
"epoch": 1.0863079381992542,
"grad_norm": 0.54296875,
"learning_rate": 2.0299891339630618e-07,
"loss": 0.5881712436676025,
"step": 4078
},
{
"epoch": 1.0868407032498668,
"grad_norm": 0.54296875,
"learning_rate": 1.879790949542537e-07,
"loss": 0.5769769549369812,
"step": 4080
},
{
"epoch": 1.0873734683004794,
"grad_norm": 0.52734375,
"learning_rate": 1.7353518482946308e-07,
"loss": 0.5656998157501221,
"step": 4082
},
{
"epoch": 1.0879062333510923,
"grad_norm": 0.51171875,
"learning_rate": 1.596673919630609e-07,
"loss": 0.5314844846725464,
"step": 4084
},
{
"epoch": 1.0884389984017049,
"grad_norm": 0.546875,
"learning_rate": 1.4637591696222697e-07,
"loss": 0.5581560134887695,
"step": 4086
},
{
"epoch": 1.0889717634523175,
"grad_norm": 0.515625,
"learning_rate": 1.3366095209729868e-07,
"loss": 0.5622378587722778,
"step": 4088
},
{
"epoch": 1.0895045285029301,
"grad_norm": 0.51953125,
"learning_rate": 1.215226812990089e-07,
"loss": 0.5572565793991089,
"step": 4090
},
{
"epoch": 1.090037293553543,
"grad_norm": 0.546875,
"learning_rate": 1.0996128015581253e-07,
"loss": 0.5689173340797424,
"step": 4092
},
{
"epoch": 1.0905700586041556,
"grad_norm": 0.54296875,
"learning_rate": 9.897691591134184e-08,
"loss": 0.542863130569458,
"step": 4094
},
{
"epoch": 1.0911028236547682,
"grad_norm": 0.51953125,
"learning_rate": 8.856974746199954e-08,
"loss": 0.5954520106315613,
"step": 4096
},
{
"epoch": 1.091635588705381,
"grad_norm": 0.5078125,
"learning_rate": 7.873992535463615e-08,
"loss": 0.5493767857551575,
"step": 4098
},
{
"epoch": 1.0921683537559936,
"grad_norm": 0.546875,
"learning_rate": 6.9487591784414e-08,
"loss": 0.550544023513794,
"step": 4100
},
{
"epoch": 1.0927011188066063,
"grad_norm": 0.5078125,
"learning_rate": 6.081288059271107e-08,
"loss": 0.5349717736244202,
"step": 4102
},
{
"epoch": 1.0932338838572189,
"grad_norm": 0.55078125,
"learning_rate": 5.271591726520253e-08,
"loss": 0.57042396068573,
"step": 4104
},
{
"epoch": 1.0937666489078317,
"grad_norm": 0.5390625,
"learning_rate": 4.519681893004002e-08,
"loss": 0.5848683714866638,
"step": 4106
},
{
"epoch": 1.0942994139584443,
"grad_norm": 0.52734375,
"learning_rate": 3.825569435616405e-08,
"loss": 0.6174777150154114,
"step": 4108
},
{
"epoch": 1.094832179009057,
"grad_norm": 0.5390625,
"learning_rate": 3.189264395172753e-08,
"loss": 0.5845167636871338,
"step": 4110
},
{
"epoch": 1.0953649440596698,
"grad_norm": 0.51953125,
"learning_rate": 2.6107759762634687e-08,
"loss": 0.5631780028343201,
"step": 4112
},
{
"epoch": 1.0958977091102824,
"grad_norm": 0.52734375,
"learning_rate": 2.090112547122658e-08,
"loss": 0.551435649394989,
"step": 4114
},
{
"epoch": 1.096430474160895,
"grad_norm": 0.515625,
"learning_rate": 1.6272816395050962e-08,
"loss": 0.5512660145759583,
"step": 4116
},
{
"epoch": 1.0969632392115076,
"grad_norm": 0.5234375,
"learning_rate": 1.2222899485792027e-08,
"loss": 0.5918897986412048,
"step": 4118
},
{
"epoch": 1.0974960042621205,
"grad_norm": 0.515625,
"learning_rate": 8.751433328288982e-09,
"loss": 0.5378780364990234,
"step": 4120
},
{
"epoch": 1.098028769312733,
"grad_norm": 0.55859375,
"learning_rate": 5.858468139687823e-09,
"loss": 0.5922811627388,
"step": 4122
},
{
"epoch": 1.0985615343633457,
"grad_norm": 0.51953125,
"learning_rate": 3.544045768730797e-09,
"loss": 0.5495621562004089,
"step": 4124
},
{
"epoch": 1.0990942994139585,
"grad_norm": 0.5078125,
"learning_rate": 1.8081996951258007e-09,
"loss": 0.559400200843811,
"step": 4126
},
{
"epoch": 1.0996270644645711,
"grad_norm": 0.5390625,
"learning_rate": 6.509550290800803e-10,
"loss": 0.6017919182777405,
"step": 4128
},
{
"epoch": 1.1001598295151838,
"grad_norm": 0.53515625,
"learning_rate": 7.23285109449634e-11,
"loss": 0.6059576869010925,
"step": 4130
},
{
"epoch": 1.1001598295151838,
"step": 4130,
"total_flos": 3.5614889450215834e+18,
"train_loss": 0.8152079602130677,
"train_runtime": 10535.874,
"train_samples_per_second": 9.406,
"train_steps_per_second": 0.392
}
],
"logging_steps": 2,
"max_steps": 4130,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 938,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 3.5614889450215834e+18,
"train_batch_size": 6,
"trial_name": null,
"trial_params": null
}