Dual-Explain-2round-Q1_first / trainer_state.json
tkhangg's picture
Upload folder using huggingface_hub
e252c3c verified
{
"best_global_step": null,
"best_metric": 2.056363105773926,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 2022,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.001483679525222552,
"grad_norm": 8.939897537231445,
"learning_rate": 0.0,
"loss": 0.8632,
"step": 1
},
{
"epoch": 0.004451038575667656,
"grad_norm": 9.648813247680664,
"learning_rate": 2.8314346251117696e-06,
"loss": 0.801,
"step": 3
},
{
"epoch": 0.008902077151335312,
"grad_norm": 6.610266208648682,
"learning_rate": 4.61787097538723e-06,
"loss": 0.8504,
"step": 6
},
{
"epoch": 0.013353115727002967,
"grad_norm": 6.382794380187988,
"learning_rate": 5.662869250223539e-06,
"loss": 0.8868,
"step": 9
},
{
"epoch": 0.017804154302670624,
"grad_norm": 11.73421859741211,
"learning_rate": 6.40430732566269e-06,
"loss": 0.7982,
"step": 12
},
{
"epoch": 0.02225519287833828,
"grad_norm": 9.07333755493164,
"learning_rate": 6.979411376544402e-06,
"loss": 0.7539,
"step": 15
},
{
"epoch": 0.026706231454005934,
"grad_norm": 5.257121562957764,
"learning_rate": 7.449305600498998e-06,
"loss": 0.7841,
"step": 18
},
{
"epoch": 0.03115727002967359,
"grad_norm": 10.231422424316406,
"learning_rate": 7.846595506000205e-06,
"loss": 0.773,
"step": 21
},
{
"epoch": 0.03560830860534125,
"grad_norm": 6.98926305770874,
"learning_rate": 8.190743675938152e-06,
"loss": 0.794,
"step": 24
},
{
"epoch": 0.040059347181008904,
"grad_norm": 11.664831161499023,
"learning_rate": 8.494303875335309e-06,
"loss": 0.7717,
"step": 27
},
{
"epoch": 0.04451038575667656,
"grad_norm": 7.91517448425293,
"learning_rate": 8.765847726819862e-06,
"loss": 0.7704,
"step": 30
},
{
"epoch": 0.04896142433234421,
"grad_norm": 4.517396926879883,
"learning_rate": 9.01148901993771e-06,
"loss": 0.7508,
"step": 33
},
{
"epoch": 0.05341246290801187,
"grad_norm": 6.36966609954834,
"learning_rate": 9.23574195077446e-06,
"loss": 0.8119,
"step": 36
},
{
"epoch": 0.057863501483679525,
"grad_norm": 11.161980628967285,
"learning_rate": 9.442034649602095e-06,
"loss": 0.8007,
"step": 39
},
{
"epoch": 0.06231454005934718,
"grad_norm": 8.65380859375,
"learning_rate": 9.633031856275666e-06,
"loss": 0.7474,
"step": 42
},
{
"epoch": 0.06676557863501484,
"grad_norm": 8.394684791564941,
"learning_rate": 9.81084600165617e-06,
"loss": 0.7685,
"step": 45
},
{
"epoch": 0.0712166172106825,
"grad_norm": 9.330730438232422,
"learning_rate": 9.977180026213612e-06,
"loss": 0.7716,
"step": 48
},
{
"epoch": 0.07566765578635015,
"grad_norm": 6.538848400115967,
"learning_rate": 1.013342682512159e-05,
"loss": 0.8483,
"step": 51
},
{
"epoch": 0.08011869436201781,
"grad_norm": 5.0795793533325195,
"learning_rate": 1.0280740225610769e-05,
"loss": 0.8217,
"step": 54
},
{
"epoch": 0.08456973293768547,
"grad_norm": 4.957094192504883,
"learning_rate": 1.042008674846264e-05,
"loss": 0.8032,
"step": 57
},
{
"epoch": 0.08902077151335312,
"grad_norm": 11.510953903198242,
"learning_rate": 1.0552284077095323e-05,
"loss": 0.8015,
"step": 60
},
{
"epoch": 0.09347181008902077,
"grad_norm": 5.8856096267700195,
"learning_rate": 1.0678030131111975e-05,
"loss": 0.7644,
"step": 63
},
{
"epoch": 0.09792284866468842,
"grad_norm": 6.385069847106934,
"learning_rate": 1.0797925370213172e-05,
"loss": 0.782,
"step": 66
},
{
"epoch": 0.10237388724035608,
"grad_norm": 7.3922295570373535,
"learning_rate": 1.0912490136135183e-05,
"loss": 0.8079,
"step": 69
},
{
"epoch": 0.10682492581602374,
"grad_norm": 6.742562770843506,
"learning_rate": 1.102217830104992e-05,
"loss": 0.7727,
"step": 72
},
{
"epoch": 0.11127596439169139,
"grad_norm": 10.546209335327148,
"learning_rate": 1.1127388127977034e-05,
"loss": 0.8124,
"step": 75
},
{
"epoch": 0.11572700296735905,
"grad_norm": 10.506328582763672,
"learning_rate": 1.1228470999877556e-05,
"loss": 0.7427,
"step": 78
},
{
"epoch": 0.1201780415430267,
"grad_norm": 8.066429138183594,
"learning_rate": 1.1325738500447079e-05,
"loss": 0.7873,
"step": 81
},
{
"epoch": 0.12462908011869436,
"grad_norm": 9.615484237670898,
"learning_rate": 1.1419468206551126e-05,
"loss": 0.8364,
"step": 84
},
{
"epoch": 0.129080118694362,
"grad_norm": 14.365260124206543,
"learning_rate": 1.150990846375502e-05,
"loss": 0.7708,
"step": 87
},
{
"epoch": 0.13353115727002968,
"grad_norm": 8.187057495117188,
"learning_rate": 1.1597282351931633e-05,
"loss": 0.7984,
"step": 90
},
{
"epoch": 0.13798219584569732,
"grad_norm": 14.292377471923828,
"learning_rate": 1.1681791000387452e-05,
"loss": 0.7983,
"step": 93
},
{
"epoch": 0.142433234421365,
"grad_norm": 12.673680305480957,
"learning_rate": 1.1763616376489071e-05,
"loss": 0.8131,
"step": 96
},
{
"epoch": 0.14688427299703263,
"grad_norm": 5.767789363861084,
"learning_rate": 1.1842923645049482e-05,
"loss": 0.8275,
"step": 99
},
{
"epoch": 0.1513353115727003,
"grad_norm": 10.027091026306152,
"learning_rate": 1.1919863175397048e-05,
"loss": 0.7671,
"step": 102
},
{
"epoch": 0.15578635014836795,
"grad_norm": 5.067603588104248,
"learning_rate": 1.1994572257432837e-05,
"loss": 0.7883,
"step": 105
},
{
"epoch": 0.16023738872403562,
"grad_norm": 12.904479026794434,
"learning_rate": 1.206717657588623e-05,
"loss": 0.7876,
"step": 108
},
{
"epoch": 0.16468842729970326,
"grad_norm": 8.401494979858398,
"learning_rate": 1.2137791482536164e-05,
"loss": 0.8008,
"step": 111
},
{
"epoch": 0.16913946587537093,
"grad_norm": 11.146966934204102,
"learning_rate": 1.2206523098738102e-05,
"loss": 0.7833,
"step": 114
},
{
"epoch": 0.17359050445103857,
"grad_norm": 9.299595832824707,
"learning_rate": 1.2273469274713866e-05,
"loss": 0.7752,
"step": 117
},
{
"epoch": 0.17804154302670624,
"grad_norm": 9.26494312286377,
"learning_rate": 1.2338720427370783e-05,
"loss": 0.8126,
"step": 120
},
{
"epoch": 0.1824925816023739,
"grad_norm": 9.886981964111328,
"learning_rate": 1.2402360274652675e-05,
"loss": 0.7478,
"step": 123
},
{
"epoch": 0.18694362017804153,
"grad_norm": 11.463534355163574,
"learning_rate": 1.2464466481387436e-05,
"loss": 0.7334,
"step": 126
},
{
"epoch": 0.1913946587537092,
"grad_norm": 7.69536018371582,
"learning_rate": 1.252511122913015e-05,
"loss": 0.7756,
"step": 129
},
{
"epoch": 0.19584569732937684,
"grad_norm": 9.41036605834961,
"learning_rate": 1.2584361720488632e-05,
"loss": 0.7992,
"step": 132
},
{
"epoch": 0.20029673590504452,
"grad_norm": 6.2021803855896,
"learning_rate": 1.264228062676794e-05,
"loss": 0.7504,
"step": 135
},
{
"epoch": 0.20474777448071216,
"grad_norm": 6.035690784454346,
"learning_rate": 1.2698926486410644e-05,
"loss": 0.8319,
"step": 138
},
{
"epoch": 0.20919881305637983,
"grad_norm": 6.217399597167969,
"learning_rate": 1.275435406058353e-05,
"loss": 0.8151,
"step": 141
},
{
"epoch": 0.21364985163204747,
"grad_norm": 10.224953651428223,
"learning_rate": 1.280861465132538e-05,
"loss": 0.8005,
"step": 144
},
{
"epoch": 0.21810089020771514,
"grad_norm": 5.470182418823242,
"learning_rate": 1.286175638688864e-05,
"loss": 0.8147,
"step": 147
},
{
"epoch": 0.22255192878338279,
"grad_norm": 6.979781627655029,
"learning_rate": 1.2913824478252495e-05,
"loss": 0.7623,
"step": 150
},
{
"epoch": 0.22700296735905046,
"grad_norm": 7.8118414878845215,
"learning_rate": 1.2964861450233358e-05,
"loss": 0.7851,
"step": 153
},
{
"epoch": 0.2314540059347181,
"grad_norm": 8.010553359985352,
"learning_rate": 1.3014907350153016e-05,
"loss": 0.8243,
"step": 156
},
{
"epoch": 0.23590504451038577,
"grad_norm": 6.6702470779418945,
"learning_rate": 1.3063999936629808e-05,
"loss": 0.8348,
"step": 159
},
{
"epoch": 0.2403560830860534,
"grad_norm": 6.347877025604248,
"learning_rate": 1.3112174850722537e-05,
"loss": 0.8093,
"step": 162
},
{
"epoch": 0.24480712166172106,
"grad_norm": 8.773872375488281,
"learning_rate": 1.3159465771370344e-05,
"loss": 0.8424,
"step": 165
},
{
"epoch": 0.24925816023738873,
"grad_norm": 9.952154159545898,
"learning_rate": 1.3205904556826587e-05,
"loss": 0.8068,
"step": 168
},
{
"epoch": 0.25370919881305637,
"grad_norm": 4.658379554748535,
"learning_rate": 1.325152137357441e-05,
"loss": 0.8352,
"step": 171
},
{
"epoch": 0.258160237388724,
"grad_norm": 8.045439720153809,
"learning_rate": 1.329634481403048e-05,
"loss": 0.7586,
"step": 174
},
{
"epoch": 0.2626112759643917,
"grad_norm": 13.365344047546387,
"learning_rate": 1.3340402004187042e-05,
"loss": 0.7838,
"step": 177
},
{
"epoch": 0.26706231454005935,
"grad_norm": 10.028172492980957,
"learning_rate": 1.3383718702207093e-05,
"loss": 0.8142,
"step": 180
},
{
"epoch": 0.271513353115727,
"grad_norm": 7.544151782989502,
"learning_rate": 1.3426319388870015e-05,
"loss": 0.8386,
"step": 183
},
{
"epoch": 0.27596439169139464,
"grad_norm": 11.747024536132812,
"learning_rate": 1.3468227350662914e-05,
"loss": 0.7884,
"step": 186
},
{
"epoch": 0.28041543026706234,
"grad_norm": 13.382097244262695,
"learning_rate": 1.3509464756223744e-05,
"loss": 0.7878,
"step": 189
},
{
"epoch": 0.28486646884273,
"grad_norm": 6.687026023864746,
"learning_rate": 1.3550052726764533e-05,
"loss": 0.7692,
"step": 192
},
{
"epoch": 0.2893175074183976,
"grad_norm": 5.648706912994385,
"learning_rate": 1.3590011401034729e-05,
"loss": 0.8231,
"step": 195
},
{
"epoch": 0.29376854599406527,
"grad_norm": 12.061844825744629,
"learning_rate": 1.3629359995324941e-05,
"loss": 0.7478,
"step": 198
},
{
"epoch": 0.29821958456973297,
"grad_norm": 7.498600006103516,
"learning_rate": 1.3668116858958576e-05,
"loss": 0.7725,
"step": 201
},
{
"epoch": 0.3026706231454006,
"grad_norm": 9.396763801574707,
"learning_rate": 1.370629952567251e-05,
"loss": 0.7833,
"step": 204
},
{
"epoch": 0.30712166172106825,
"grad_norm": 8.891100883483887,
"learning_rate": 1.3743924761246951e-05,
"loss": 0.8031,
"step": 207
},
{
"epoch": 0.3115727002967359,
"grad_norm": 4.5692830085754395,
"learning_rate": 1.3781008607708299e-05,
"loss": 0.7857,
"step": 210
},
{
"epoch": 0.31602373887240354,
"grad_norm": 6.108200550079346,
"learning_rate": 1.381756642439674e-05,
"loss": 0.7977,
"step": 213
},
{
"epoch": 0.32047477744807124,
"grad_norm": 11.626425743103027,
"learning_rate": 1.385361292616169e-05,
"loss": 0.7656,
"step": 216
},
{
"epoch": 0.3249258160237389,
"grad_norm": 7.599254131317139,
"learning_rate": 1.38891622189228e-05,
"loss": 0.8164,
"step": 219
},
{
"epoch": 0.3293768545994065,
"grad_norm": 13.067089080810547,
"learning_rate": 1.3924227832811623e-05,
"loss": 0.8302,
"step": 222
},
{
"epoch": 0.33382789317507416,
"grad_norm": 8.126757621765137,
"learning_rate": 1.3958822753088804e-05,
"loss": 0.772,
"step": 225
},
{
"epoch": 0.33827893175074186,
"grad_norm": 4.585634231567383,
"learning_rate": 1.3992959449013562e-05,
"loss": 0.8119,
"step": 228
},
{
"epoch": 0.3427299703264095,
"grad_norm": 7.228199481964111,
"learning_rate": 1.4026649900826146e-05,
"loss": 0.7462,
"step": 231
},
{
"epoch": 0.34718100890207715,
"grad_norm": 8.683415412902832,
"learning_rate": 1.4059905624989326e-05,
"loss": 0.7746,
"step": 234
},
{
"epoch": 0.3516320474777448,
"grad_norm": 7.1446990966796875,
"learning_rate": 1.4092737697821986e-05,
"loss": 0.9112,
"step": 237
},
{
"epoch": 0.3560830860534125,
"grad_norm": 7.082102298736572,
"learning_rate": 1.4125156777646244e-05,
"loss": 0.7838,
"step": 240
},
{
"epoch": 0.36053412462908013,
"grad_norm": 4.828745365142822,
"learning_rate": 1.4157173125558845e-05,
"loss": 0.806,
"step": 243
},
{
"epoch": 0.3649851632047478,
"grad_norm": 9.047410011291504,
"learning_rate": 1.4188796624928136e-05,
"loss": 0.8594,
"step": 246
},
{
"epoch": 0.3694362017804154,
"grad_norm": 4.606753349304199,
"learning_rate": 1.4220036799709316e-05,
"loss": 0.7615,
"step": 249
},
{
"epoch": 0.37388724035608306,
"grad_norm": 6.191976547241211,
"learning_rate": 1.4250902831662896e-05,
"loss": 0.8045,
"step": 252
},
{
"epoch": 0.37833827893175076,
"grad_norm": 5.661036491394043,
"learning_rate": 1.4281403576554221e-05,
"loss": 0.8259,
"step": 255
},
{
"epoch": 0.3827893175074184,
"grad_norm": 11.667348861694336,
"learning_rate": 1.4311547579405614e-05,
"loss": 0.8555,
"step": 258
},
{
"epoch": 0.38724035608308605,
"grad_norm": 7.464338302612305,
"learning_rate": 1.4341343088866789e-05,
"loss": 0.8525,
"step": 261
},
{
"epoch": 0.3916913946587537,
"grad_norm": 4.3380584716796875,
"learning_rate": 1.4370798070764093e-05,
"loss": 0.7874,
"step": 264
},
{
"epoch": 0.3961424332344214,
"grad_norm": 9.564691543579102,
"learning_rate": 1.4399920220884169e-05,
"loss": 0.8047,
"step": 267
},
{
"epoch": 0.40059347181008903,
"grad_norm": 8.386067390441895,
"learning_rate": 1.4428716977043402e-05,
"loss": 0.8484,
"step": 270
},
{
"epoch": 0.4050445103857567,
"grad_norm": 6.571981430053711,
"learning_rate": 1.4457195530490532e-05,
"loss": 0.8236,
"step": 273
},
{
"epoch": 0.4094955489614243,
"grad_norm": 4.435903549194336,
"learning_rate": 1.4485362836686102e-05,
"loss": 0.8457,
"step": 276
},
{
"epoch": 0.413946587537092,
"grad_norm": 6.691802024841309,
"learning_rate": 1.451322562549922e-05,
"loss": 0.8016,
"step": 279
},
{
"epoch": 0.41839762611275966,
"grad_norm": 15.839409828186035,
"learning_rate": 1.454079041085899e-05,
"loss": 0.845,
"step": 282
},
{
"epoch": 0.4228486646884273,
"grad_norm": 7.598920822143555,
"learning_rate": 1.4568063499895273e-05,
"loss": 0.7626,
"step": 285
},
{
"epoch": 0.42729970326409494,
"grad_norm": 10.305091857910156,
"learning_rate": 1.4595051001600841e-05,
"loss": 0.7926,
"step": 288
},
{
"epoch": 0.4317507418397626,
"grad_norm": 5.350500106811523,
"learning_rate": 1.4621758835044685e-05,
"loss": 0.8092,
"step": 291
},
{
"epoch": 0.4362017804154303,
"grad_norm": 18.546098709106445,
"learning_rate": 1.4648192737164102e-05,
"loss": 0.815,
"step": 294
},
{
"epoch": 0.4406528189910979,
"grad_norm": 8.379362106323242,
"learning_rate": 1.4674358270161251e-05,
"loss": 0.823,
"step": 297
},
{
"epoch": 0.44510385756676557,
"grad_norm": 8.076055526733398,
"learning_rate": 1.4700260828527957e-05,
"loss": 0.8321,
"step": 300
},
{
"epoch": 0.4495548961424332,
"grad_norm": 16.41190528869629,
"learning_rate": 1.4725905645721047e-05,
"loss": 0.8191,
"step": 303
},
{
"epoch": 0.4540059347181009,
"grad_norm": 6.45059871673584,
"learning_rate": 1.475129780050882e-05,
"loss": 0.7658,
"step": 306
},
{
"epoch": 0.45845697329376855,
"grad_norm": 10.349370002746582,
"learning_rate": 1.4776442223007901e-05,
"loss": 0.7575,
"step": 309
},
{
"epoch": 0.4629080118694362,
"grad_norm": 3.693134069442749,
"learning_rate": 1.4801343700428479e-05,
"loss": 0.8456,
"step": 312
},
{
"epoch": 0.46735905044510384,
"grad_norm": 9.323659896850586,
"learning_rate": 1.4826006882544607e-05,
"loss": 0.8387,
"step": 315
},
{
"epoch": 0.47181008902077154,
"grad_norm": 7.65170431137085,
"learning_rate": 1.4850436286905268e-05,
"loss": 0.8939,
"step": 318
},
{
"epoch": 0.4762611275964392,
"grad_norm": 15.972323417663574,
"learning_rate": 1.4874636303800742e-05,
"loss": 0.8639,
"step": 321
},
{
"epoch": 0.4807121661721068,
"grad_norm": 22.23072052001953,
"learning_rate": 1.4898611200997996e-05,
"loss": 0.8339,
"step": 324
},
{
"epoch": 0.48516320474777447,
"grad_norm": 6.0545430183410645,
"learning_rate": 1.4922365128257845e-05,
"loss": 0.8315,
"step": 327
},
{
"epoch": 0.4896142433234421,
"grad_norm": 15.703563690185547,
"learning_rate": 1.4945902121645804e-05,
"loss": 0.8343,
"step": 330
},
{
"epoch": 0.4940652818991098,
"grad_norm": 9.006149291992188,
"learning_rate": 1.4969226107647933e-05,
"loss": 0.9232,
"step": 333
},
{
"epoch": 0.49851632047477745,
"grad_norm": 9.227169036865234,
"learning_rate": 1.4992340907102047e-05,
"loss": 0.8832,
"step": 336
},
{
"epoch": 0.5029673590504451,
"grad_norm": 13.163078308105469,
"learning_rate": 1.5e-05,
"loss": 0.8781,
"step": 339
},
{
"epoch": 0.5074183976261127,
"grad_norm": 7.369640350341797,
"learning_rate": 1.5e-05,
"loss": 0.8778,
"step": 342
},
{
"epoch": 0.5118694362017804,
"grad_norm": 11.006775856018066,
"learning_rate": 1.5e-05,
"loss": 0.8643,
"step": 345
},
{
"epoch": 0.516320474777448,
"grad_norm": 3.54710054397583,
"learning_rate": 1.5e-05,
"loss": 0.843,
"step": 348
},
{
"epoch": 0.5207715133531158,
"grad_norm": 9.564830780029297,
"learning_rate": 1.5e-05,
"loss": 0.8245,
"step": 351
},
{
"epoch": 0.5252225519287834,
"grad_norm": 8.431432723999023,
"learning_rate": 1.5e-05,
"loss": 0.886,
"step": 354
},
{
"epoch": 0.5296735905044511,
"grad_norm": 7.211839199066162,
"learning_rate": 1.5e-05,
"loss": 0.8578,
"step": 357
},
{
"epoch": 0.5341246290801187,
"grad_norm": 7.788987159729004,
"learning_rate": 1.5e-05,
"loss": 0.872,
"step": 360
},
{
"epoch": 0.5385756676557863,
"grad_norm": 4.075163841247559,
"learning_rate": 1.5e-05,
"loss": 0.8802,
"step": 363
},
{
"epoch": 0.543026706231454,
"grad_norm": 13.805707931518555,
"learning_rate": 1.5e-05,
"loss": 0.8325,
"step": 366
},
{
"epoch": 0.5474777448071216,
"grad_norm": 7.167026519775391,
"learning_rate": 1.5e-05,
"loss": 0.9119,
"step": 369
},
{
"epoch": 0.5519287833827893,
"grad_norm": 8.409590721130371,
"learning_rate": 1.5e-05,
"loss": 0.8572,
"step": 372
},
{
"epoch": 0.5563798219584569,
"grad_norm": 11.929038047790527,
"learning_rate": 1.5e-05,
"loss": 0.8327,
"step": 375
},
{
"epoch": 0.5608308605341247,
"grad_norm": 4.433465480804443,
"learning_rate": 1.5e-05,
"loss": 0.7959,
"step": 378
},
{
"epoch": 0.5652818991097923,
"grad_norm": 7.223580360412598,
"learning_rate": 1.5e-05,
"loss": 0.8157,
"step": 381
},
{
"epoch": 0.56973293768546,
"grad_norm": 6.028378009796143,
"learning_rate": 1.5e-05,
"loss": 0.8469,
"step": 384
},
{
"epoch": 0.5741839762611276,
"grad_norm": 10.804591178894043,
"learning_rate": 1.5e-05,
"loss": 0.8891,
"step": 387
},
{
"epoch": 0.5786350148367952,
"grad_norm": 10.0234956741333,
"learning_rate": 1.5e-05,
"loss": 0.906,
"step": 390
},
{
"epoch": 0.5830860534124629,
"grad_norm": 4.883424758911133,
"learning_rate": 1.5e-05,
"loss": 0.8422,
"step": 393
},
{
"epoch": 0.5875370919881305,
"grad_norm": 8.042715072631836,
"learning_rate": 1.5e-05,
"loss": 0.8673,
"step": 396
},
{
"epoch": 0.5919881305637982,
"grad_norm": 6.206501007080078,
"learning_rate": 1.5e-05,
"loss": 0.8115,
"step": 399
},
{
"epoch": 0.5964391691394659,
"grad_norm": 14.539153099060059,
"learning_rate": 1.5e-05,
"loss": 0.8752,
"step": 402
},
{
"epoch": 0.6008902077151336,
"grad_norm": 9.136768341064453,
"learning_rate": 1.5e-05,
"loss": 0.8752,
"step": 405
},
{
"epoch": 0.6053412462908012,
"grad_norm": 4.936409950256348,
"learning_rate": 1.5e-05,
"loss": 0.8807,
"step": 408
},
{
"epoch": 0.6097922848664689,
"grad_norm": 12.717706680297852,
"learning_rate": 1.5e-05,
"loss": 0.8133,
"step": 411
},
{
"epoch": 0.6142433234421365,
"grad_norm": 13.034161567687988,
"learning_rate": 1.5e-05,
"loss": 0.8437,
"step": 414
},
{
"epoch": 0.6186943620178041,
"grad_norm": 4.76663064956665,
"learning_rate": 1.5e-05,
"loss": 0.8141,
"step": 417
},
{
"epoch": 0.6231454005934718,
"grad_norm": 6.521324634552002,
"learning_rate": 1.5e-05,
"loss": 0.8687,
"step": 420
},
{
"epoch": 0.6275964391691394,
"grad_norm": 19.489913940429688,
"learning_rate": 1.5e-05,
"loss": 0.8154,
"step": 423
},
{
"epoch": 0.6320474777448071,
"grad_norm": 13.211241722106934,
"learning_rate": 1.5e-05,
"loss": 0.8417,
"step": 426
},
{
"epoch": 0.6364985163204748,
"grad_norm": 8.362677574157715,
"learning_rate": 1.5e-05,
"loss": 0.8387,
"step": 429
},
{
"epoch": 0.6409495548961425,
"grad_norm": 13.372685432434082,
"learning_rate": 1.5e-05,
"loss": 0.8563,
"step": 432
},
{
"epoch": 0.6454005934718101,
"grad_norm": 4.561835765838623,
"learning_rate": 1.5e-05,
"loss": 0.8571,
"step": 435
},
{
"epoch": 0.6498516320474778,
"grad_norm": 12.67446231842041,
"learning_rate": 1.5e-05,
"loss": 0.8822,
"step": 438
},
{
"epoch": 0.6543026706231454,
"grad_norm": 11.653807640075684,
"learning_rate": 1.5e-05,
"loss": 0.8925,
"step": 441
},
{
"epoch": 0.658753709198813,
"grad_norm": 9.325271606445312,
"learning_rate": 1.5e-05,
"loss": 0.7984,
"step": 444
},
{
"epoch": 0.6632047477744807,
"grad_norm": 5.574127674102783,
"learning_rate": 1.5e-05,
"loss": 0.8253,
"step": 447
},
{
"epoch": 0.6676557863501483,
"grad_norm": 5.725862979888916,
"learning_rate": 1.5e-05,
"loss": 0.9019,
"step": 450
},
{
"epoch": 0.672106824925816,
"grad_norm": 8.04867172241211,
"learning_rate": 1.5e-05,
"loss": 0.8331,
"step": 453
},
{
"epoch": 0.6765578635014837,
"grad_norm": 8.826385498046875,
"learning_rate": 1.5e-05,
"loss": 0.8056,
"step": 456
},
{
"epoch": 0.6810089020771514,
"grad_norm": 7.501665115356445,
"learning_rate": 1.5e-05,
"loss": 0.8137,
"step": 459
},
{
"epoch": 0.685459940652819,
"grad_norm": 3.8520801067352295,
"learning_rate": 1.5e-05,
"loss": 0.8345,
"step": 462
},
{
"epoch": 0.6899109792284867,
"grad_norm": 15.48876953125,
"learning_rate": 1.5e-05,
"loss": 0.859,
"step": 465
},
{
"epoch": 0.6943620178041543,
"grad_norm": 12.54112720489502,
"learning_rate": 1.5e-05,
"loss": 0.8799,
"step": 468
},
{
"epoch": 0.6988130563798219,
"grad_norm": 7.668098449707031,
"learning_rate": 1.5e-05,
"loss": 0.8135,
"step": 471
},
{
"epoch": 0.7032640949554896,
"grad_norm": 13.388195991516113,
"learning_rate": 1.5e-05,
"loss": 0.8104,
"step": 474
},
{
"epoch": 0.7077151335311572,
"grad_norm": 8.277421951293945,
"learning_rate": 1.5e-05,
"loss": 0.792,
"step": 477
},
{
"epoch": 0.712166172106825,
"grad_norm": 10.426804542541504,
"learning_rate": 1.5e-05,
"loss": 0.8533,
"step": 480
},
{
"epoch": 0.7166172106824926,
"grad_norm": 15.068408012390137,
"learning_rate": 1.5e-05,
"loss": 0.8912,
"step": 483
},
{
"epoch": 0.7210682492581603,
"grad_norm": 7.566452980041504,
"learning_rate": 1.5e-05,
"loss": 0.8625,
"step": 486
},
{
"epoch": 0.7255192878338279,
"grad_norm": 14.359679222106934,
"learning_rate": 1.5e-05,
"loss": 0.7973,
"step": 489
},
{
"epoch": 0.7299703264094956,
"grad_norm": 8.746999740600586,
"learning_rate": 1.5e-05,
"loss": 0.9004,
"step": 492
},
{
"epoch": 0.7344213649851632,
"grad_norm": 15.901468276977539,
"learning_rate": 1.5e-05,
"loss": 0.8499,
"step": 495
},
{
"epoch": 0.7388724035608308,
"grad_norm": 5.345223903656006,
"learning_rate": 1.5e-05,
"loss": 0.9112,
"step": 498
},
{
"epoch": 0.7433234421364985,
"grad_norm": 8.00938892364502,
"learning_rate": 1.5e-05,
"loss": 0.8521,
"step": 501
},
{
"epoch": 0.7477744807121661,
"grad_norm": 7.601090431213379,
"learning_rate": 1.5e-05,
"loss": 0.8708,
"step": 504
},
{
"epoch": 0.7522255192878339,
"grad_norm": 14.48643970489502,
"learning_rate": 1.5e-05,
"loss": 0.8476,
"step": 507
},
{
"epoch": 0.7566765578635015,
"grad_norm": 16.052143096923828,
"learning_rate": 1.5e-05,
"loss": 0.8846,
"step": 510
},
{
"epoch": 0.7611275964391692,
"grad_norm": 11.447772979736328,
"learning_rate": 1.5e-05,
"loss": 0.8062,
"step": 513
},
{
"epoch": 0.7655786350148368,
"grad_norm": 7.909060955047607,
"learning_rate": 1.5e-05,
"loss": 0.8715,
"step": 516
},
{
"epoch": 0.7700296735905044,
"grad_norm": 10.86801528930664,
"learning_rate": 1.5e-05,
"loss": 0.8102,
"step": 519
},
{
"epoch": 0.7744807121661721,
"grad_norm": 6.530400276184082,
"learning_rate": 1.5e-05,
"loss": 0.8667,
"step": 522
},
{
"epoch": 0.7789317507418397,
"grad_norm": 13.691858291625977,
"learning_rate": 1.5e-05,
"loss": 0.8159,
"step": 525
},
{
"epoch": 0.7833827893175074,
"grad_norm": 8.416064262390137,
"learning_rate": 1.5e-05,
"loss": 0.8298,
"step": 528
},
{
"epoch": 0.787833827893175,
"grad_norm": 8.614116668701172,
"learning_rate": 1.5e-05,
"loss": 0.8542,
"step": 531
},
{
"epoch": 0.7922848664688428,
"grad_norm": 10.818787574768066,
"learning_rate": 1.5e-05,
"loss": 0.8476,
"step": 534
},
{
"epoch": 0.7967359050445104,
"grad_norm": 12.394455909729004,
"learning_rate": 1.5e-05,
"loss": 0.884,
"step": 537
},
{
"epoch": 0.8011869436201781,
"grad_norm": 7.285090446472168,
"learning_rate": 1.5e-05,
"loss": 0.8706,
"step": 540
},
{
"epoch": 0.8056379821958457,
"grad_norm": 6.0529632568359375,
"learning_rate": 1.5e-05,
"loss": 0.8389,
"step": 543
},
{
"epoch": 0.8100890207715133,
"grad_norm": 5.722935199737549,
"learning_rate": 1.5e-05,
"loss": 0.9546,
"step": 546
},
{
"epoch": 0.814540059347181,
"grad_norm": 8.289714813232422,
"learning_rate": 1.5e-05,
"loss": 0.8797,
"step": 549
},
{
"epoch": 0.8189910979228486,
"grad_norm": 11.81054973602295,
"learning_rate": 1.5e-05,
"loss": 0.9221,
"step": 552
},
{
"epoch": 0.8234421364985163,
"grad_norm": 8.692460060119629,
"learning_rate": 1.5e-05,
"loss": 0.8738,
"step": 555
},
{
"epoch": 0.827893175074184,
"grad_norm": 12.485095024108887,
"learning_rate": 1.5e-05,
"loss": 0.824,
"step": 558
},
{
"epoch": 0.8323442136498517,
"grad_norm": 15.875768661499023,
"learning_rate": 1.5e-05,
"loss": 0.8862,
"step": 561
},
{
"epoch": 0.8367952522255193,
"grad_norm": 11.125205039978027,
"learning_rate": 1.5e-05,
"loss": 0.8969,
"step": 564
},
{
"epoch": 0.841246290801187,
"grad_norm": 9.451800346374512,
"learning_rate": 1.5e-05,
"loss": 0.8265,
"step": 567
},
{
"epoch": 0.8456973293768546,
"grad_norm": 8.435980796813965,
"learning_rate": 1.5e-05,
"loss": 0.9067,
"step": 570
},
{
"epoch": 0.8501483679525222,
"grad_norm": 7.994401931762695,
"learning_rate": 1.5e-05,
"loss": 0.8713,
"step": 573
},
{
"epoch": 0.8545994065281899,
"grad_norm": 8.249290466308594,
"learning_rate": 1.5e-05,
"loss": 0.8681,
"step": 576
},
{
"epoch": 0.8590504451038575,
"grad_norm": 10.910624504089355,
"learning_rate": 1.5e-05,
"loss": 0.8686,
"step": 579
},
{
"epoch": 0.8635014836795252,
"grad_norm": 9.459715843200684,
"learning_rate": 1.5e-05,
"loss": 0.8453,
"step": 582
},
{
"epoch": 0.8679525222551929,
"grad_norm": 11.252153396606445,
"learning_rate": 1.5e-05,
"loss": 0.8494,
"step": 585
},
{
"epoch": 0.8724035608308606,
"grad_norm": 13.70508098602295,
"learning_rate": 1.5e-05,
"loss": 0.8768,
"step": 588
},
{
"epoch": 0.8768545994065282,
"grad_norm": 5.890571117401123,
"learning_rate": 1.5e-05,
"loss": 0.855,
"step": 591
},
{
"epoch": 0.8813056379821959,
"grad_norm": 9.50145149230957,
"learning_rate": 1.5e-05,
"loss": 0.8869,
"step": 594
},
{
"epoch": 0.8857566765578635,
"grad_norm": 14.335087776184082,
"learning_rate": 1.5e-05,
"loss": 0.8234,
"step": 597
},
{
"epoch": 0.8902077151335311,
"grad_norm": 7.306372165679932,
"learning_rate": 1.5e-05,
"loss": 0.8548,
"step": 600
},
{
"epoch": 0.8946587537091988,
"grad_norm": 8.26121711730957,
"learning_rate": 1.5e-05,
"loss": 0.8765,
"step": 603
},
{
"epoch": 0.8991097922848664,
"grad_norm": 9.257493019104004,
"learning_rate": 1.5e-05,
"loss": 0.8978,
"step": 606
},
{
"epoch": 0.9035608308605341,
"grad_norm": 5.7045817375183105,
"learning_rate": 1.5e-05,
"loss": 0.8315,
"step": 609
},
{
"epoch": 0.9080118694362018,
"grad_norm": 13.060375213623047,
"learning_rate": 1.5e-05,
"loss": 0.8272,
"step": 612
},
{
"epoch": 0.9124629080118695,
"grad_norm": 7.294022560119629,
"learning_rate": 1.5e-05,
"loss": 0.8223,
"step": 615
},
{
"epoch": 0.9169139465875371,
"grad_norm": 5.998393535614014,
"learning_rate": 1.5e-05,
"loss": 0.8029,
"step": 618
},
{
"epoch": 0.9213649851632048,
"grad_norm": 4.604220390319824,
"learning_rate": 1.5e-05,
"loss": 0.8613,
"step": 621
},
{
"epoch": 0.9258160237388724,
"grad_norm": 10.009174346923828,
"learning_rate": 1.5e-05,
"loss": 0.856,
"step": 624
},
{
"epoch": 0.93026706231454,
"grad_norm": 6.437814235687256,
"learning_rate": 1.5e-05,
"loss": 0.9074,
"step": 627
},
{
"epoch": 0.9347181008902077,
"grad_norm": 16.38670539855957,
"learning_rate": 1.5e-05,
"loss": 0.8116,
"step": 630
},
{
"epoch": 0.9391691394658753,
"grad_norm": 7.195847511291504,
"learning_rate": 1.5e-05,
"loss": 0.8411,
"step": 633
},
{
"epoch": 0.9436201780415431,
"grad_norm": 4.928689479827881,
"learning_rate": 1.5e-05,
"loss": 0.8057,
"step": 636
},
{
"epoch": 0.9480712166172107,
"grad_norm": 4.637528896331787,
"learning_rate": 1.5e-05,
"loss": 0.8651,
"step": 639
},
{
"epoch": 0.9525222551928784,
"grad_norm": 10.825565338134766,
"learning_rate": 1.5e-05,
"loss": 0.8757,
"step": 642
},
{
"epoch": 0.956973293768546,
"grad_norm": 9.891039848327637,
"learning_rate": 1.5e-05,
"loss": 0.7774,
"step": 645
},
{
"epoch": 0.9614243323442137,
"grad_norm": 6.30767297744751,
"learning_rate": 1.5e-05,
"loss": 0.8346,
"step": 648
},
{
"epoch": 0.9658753709198813,
"grad_norm": 9.901351928710938,
"learning_rate": 1.5e-05,
"loss": 0.8439,
"step": 651
},
{
"epoch": 0.9703264094955489,
"grad_norm": 8.528812408447266,
"learning_rate": 1.5e-05,
"loss": 0.922,
"step": 654
},
{
"epoch": 0.9747774480712166,
"grad_norm": 8.85007381439209,
"learning_rate": 1.5e-05,
"loss": 0.8565,
"step": 657
},
{
"epoch": 0.9792284866468842,
"grad_norm": 7.137876510620117,
"learning_rate": 1.5e-05,
"loss": 0.886,
"step": 660
},
{
"epoch": 0.983679525222552,
"grad_norm": 12.115394592285156,
"learning_rate": 1.5e-05,
"loss": 0.9001,
"step": 663
},
{
"epoch": 0.9881305637982196,
"grad_norm": 3.7681024074554443,
"learning_rate": 1.5e-05,
"loss": 0.8983,
"step": 666
},
{
"epoch": 0.9925816023738873,
"grad_norm": 7.687930583953857,
"learning_rate": 1.5e-05,
"loss": 0.8629,
"step": 669
},
{
"epoch": 0.9970326409495549,
"grad_norm": 5.996459007263184,
"learning_rate": 1.5e-05,
"loss": 0.8284,
"step": 672
},
{
"epoch": 1.0,
"eval_loss": 2.075103521347046,
"eval_runtime": 554.3952,
"eval_samples_per_second": 2.781,
"eval_steps_per_second": 0.348,
"step": 674
},
{
"epoch": 1.0014836795252227,
"grad_norm": 10.066774368286133,
"learning_rate": 1.5e-05,
"loss": 0.8555,
"step": 675
},
{
"epoch": 1.0059347181008902,
"grad_norm": 6.481128692626953,
"learning_rate": 1.5e-05,
"loss": 0.838,
"step": 678
},
{
"epoch": 1.010385756676558,
"grad_norm": 12.874567985534668,
"learning_rate": 1.5e-05,
"loss": 0.8869,
"step": 681
},
{
"epoch": 1.0148367952522255,
"grad_norm": 5.834901332855225,
"learning_rate": 1.5e-05,
"loss": 0.8517,
"step": 684
},
{
"epoch": 1.0192878338278932,
"grad_norm": 15.476794242858887,
"learning_rate": 1.5e-05,
"loss": 0.8777,
"step": 687
},
{
"epoch": 1.0237388724035608,
"grad_norm": 6.580478191375732,
"learning_rate": 1.5e-05,
"loss": 0.827,
"step": 690
},
{
"epoch": 1.0281899109792285,
"grad_norm": 9.56643009185791,
"learning_rate": 1.5e-05,
"loss": 0.8886,
"step": 693
},
{
"epoch": 1.032640949554896,
"grad_norm": 4.428914546966553,
"learning_rate": 1.5e-05,
"loss": 0.9552,
"step": 696
},
{
"epoch": 1.0370919881305638,
"grad_norm": 4.398239612579346,
"learning_rate": 1.5e-05,
"loss": 0.845,
"step": 699
},
{
"epoch": 1.0415430267062316,
"grad_norm": 5.540760040283203,
"learning_rate": 1.5e-05,
"loss": 0.8776,
"step": 702
},
{
"epoch": 1.045994065281899,
"grad_norm": 15.209844589233398,
"learning_rate": 1.5e-05,
"loss": 0.9569,
"step": 705
},
{
"epoch": 1.0504451038575668,
"grad_norm": 11.813831329345703,
"learning_rate": 1.5e-05,
"loss": 0.8452,
"step": 708
},
{
"epoch": 1.0548961424332344,
"grad_norm": 8.536015510559082,
"learning_rate": 1.5e-05,
"loss": 0.8486,
"step": 711
},
{
"epoch": 1.0593471810089021,
"grad_norm": 6.545129299163818,
"learning_rate": 1.5e-05,
"loss": 0.8856,
"step": 714
},
{
"epoch": 1.0637982195845697,
"grad_norm": 8.14754581451416,
"learning_rate": 1.5e-05,
"loss": 0.8801,
"step": 717
},
{
"epoch": 1.0682492581602374,
"grad_norm": 7.521109580993652,
"learning_rate": 1.5e-05,
"loss": 0.8708,
"step": 720
},
{
"epoch": 1.072700296735905,
"grad_norm": 9.358808517456055,
"learning_rate": 1.5e-05,
"loss": 0.8476,
"step": 723
},
{
"epoch": 1.0771513353115727,
"grad_norm": 6.190918922424316,
"learning_rate": 1.5e-05,
"loss": 0.8605,
"step": 726
},
{
"epoch": 1.0816023738872405,
"grad_norm": 10.316658020019531,
"learning_rate": 1.5e-05,
"loss": 0.8518,
"step": 729
},
{
"epoch": 1.086053412462908,
"grad_norm": 5.746811389923096,
"learning_rate": 1.5e-05,
"loss": 0.8707,
"step": 732
},
{
"epoch": 1.0905044510385757,
"grad_norm": 7.9586663246154785,
"learning_rate": 1.5e-05,
"loss": 0.8193,
"step": 735
},
{
"epoch": 1.0949554896142433,
"grad_norm": 6.76649808883667,
"learning_rate": 1.5e-05,
"loss": 0.8491,
"step": 738
},
{
"epoch": 1.099406528189911,
"grad_norm": 7.164156436920166,
"learning_rate": 1.5e-05,
"loss": 0.8137,
"step": 741
},
{
"epoch": 1.1038575667655786,
"grad_norm": 5.188474178314209,
"learning_rate": 1.5e-05,
"loss": 0.8105,
"step": 744
},
{
"epoch": 1.1083086053412463,
"grad_norm": 11.81541633605957,
"learning_rate": 1.5e-05,
"loss": 0.8628,
"step": 747
},
{
"epoch": 1.1127596439169138,
"grad_norm": 6.901886940002441,
"learning_rate": 1.5e-05,
"loss": 0.8279,
"step": 750
},
{
"epoch": 1.1172106824925816,
"grad_norm": 5.522708892822266,
"learning_rate": 1.5e-05,
"loss": 0.834,
"step": 753
},
{
"epoch": 1.1216617210682494,
"grad_norm": 10.341312408447266,
"learning_rate": 1.5e-05,
"loss": 0.7452,
"step": 756
},
{
"epoch": 1.1261127596439169,
"grad_norm": 11.882563591003418,
"learning_rate": 1.5e-05,
"loss": 0.8481,
"step": 759
},
{
"epoch": 1.1305637982195846,
"grad_norm": 4.872053146362305,
"learning_rate": 1.5e-05,
"loss": 0.8284,
"step": 762
},
{
"epoch": 1.1350148367952522,
"grad_norm": 10.799345016479492,
"learning_rate": 1.5e-05,
"loss": 0.8375,
"step": 765
},
{
"epoch": 1.13946587537092,
"grad_norm": 5.207538604736328,
"learning_rate": 1.5e-05,
"loss": 0.8427,
"step": 768
},
{
"epoch": 1.1439169139465875,
"grad_norm": 12.862470626831055,
"learning_rate": 1.5e-05,
"loss": 0.8248,
"step": 771
},
{
"epoch": 1.1483679525222552,
"grad_norm": 6.997878074645996,
"learning_rate": 1.5e-05,
"loss": 0.8827,
"step": 774
},
{
"epoch": 1.1528189910979227,
"grad_norm": 5.541961669921875,
"learning_rate": 1.5e-05,
"loss": 0.8391,
"step": 777
},
{
"epoch": 1.1572700296735905,
"grad_norm": 8.02748966217041,
"learning_rate": 1.5e-05,
"loss": 0.8518,
"step": 780
},
{
"epoch": 1.1617210682492582,
"grad_norm": 10.839200973510742,
"learning_rate": 1.5e-05,
"loss": 0.8715,
"step": 783
},
{
"epoch": 1.1661721068249258,
"grad_norm": 6.69924259185791,
"learning_rate": 1.5e-05,
"loss": 0.8505,
"step": 786
},
{
"epoch": 1.1706231454005935,
"grad_norm": 15.232388496398926,
"learning_rate": 1.5e-05,
"loss": 0.8483,
"step": 789
},
{
"epoch": 1.175074183976261,
"grad_norm": 7.842281341552734,
"learning_rate": 1.5e-05,
"loss": 0.8434,
"step": 792
},
{
"epoch": 1.1795252225519288,
"grad_norm": 9.89548110961914,
"learning_rate": 1.5e-05,
"loss": 0.8434,
"step": 795
},
{
"epoch": 1.1839762611275964,
"grad_norm": 12.349285125732422,
"learning_rate": 1.5e-05,
"loss": 0.8522,
"step": 798
},
{
"epoch": 1.188427299703264,
"grad_norm": 6.074175834655762,
"learning_rate": 1.5e-05,
"loss": 0.8639,
"step": 801
},
{
"epoch": 1.1928783382789319,
"grad_norm": 12.941549301147461,
"learning_rate": 1.5e-05,
"loss": 0.8215,
"step": 804
},
{
"epoch": 1.1973293768545994,
"grad_norm": 6.716182708740234,
"learning_rate": 1.5e-05,
"loss": 0.8417,
"step": 807
},
{
"epoch": 1.2017804154302671,
"grad_norm": 7.472183704376221,
"learning_rate": 1.5e-05,
"loss": 0.8061,
"step": 810
},
{
"epoch": 1.2062314540059347,
"grad_norm": 9.26564884185791,
"learning_rate": 1.5e-05,
"loss": 0.8834,
"step": 813
},
{
"epoch": 1.2106824925816024,
"grad_norm": 12.621788024902344,
"learning_rate": 1.5e-05,
"loss": 0.8299,
"step": 816
},
{
"epoch": 1.21513353115727,
"grad_norm": 8.867362976074219,
"learning_rate": 1.5e-05,
"loss": 0.8374,
"step": 819
},
{
"epoch": 1.2195845697329377,
"grad_norm": 12.072689056396484,
"learning_rate": 1.5e-05,
"loss": 0.8223,
"step": 822
},
{
"epoch": 1.2240356083086052,
"grad_norm": 10.037847518920898,
"learning_rate": 1.5e-05,
"loss": 0.7896,
"step": 825
},
{
"epoch": 1.228486646884273,
"grad_norm": 7.16823148727417,
"learning_rate": 1.5e-05,
"loss": 0.7912,
"step": 828
},
{
"epoch": 1.2329376854599405,
"grad_norm": 13.862353324890137,
"learning_rate": 1.5e-05,
"loss": 0.8545,
"step": 831
},
{
"epoch": 1.2373887240356083,
"grad_norm": 6.668301582336426,
"learning_rate": 1.5e-05,
"loss": 0.7566,
"step": 834
},
{
"epoch": 1.241839762611276,
"grad_norm": 10.224084854125977,
"learning_rate": 1.5e-05,
"loss": 0.8056,
"step": 837
},
{
"epoch": 1.2462908011869436,
"grad_norm": 6.452188014984131,
"learning_rate": 1.5e-05,
"loss": 0.82,
"step": 840
},
{
"epoch": 1.2507418397626113,
"grad_norm": 7.246963024139404,
"learning_rate": 1.5e-05,
"loss": 0.8351,
"step": 843
},
{
"epoch": 1.2551928783382789,
"grad_norm": 6.421880722045898,
"learning_rate": 1.5e-05,
"loss": 0.8563,
"step": 846
},
{
"epoch": 1.2596439169139466,
"grad_norm": 6.952515602111816,
"learning_rate": 1.5e-05,
"loss": 0.9159,
"step": 849
},
{
"epoch": 1.2640949554896141,
"grad_norm": 4.98225212097168,
"learning_rate": 1.5e-05,
"loss": 0.8609,
"step": 852
},
{
"epoch": 1.268545994065282,
"grad_norm": 7.60207462310791,
"learning_rate": 1.5e-05,
"loss": 0.8765,
"step": 855
},
{
"epoch": 1.2729970326409497,
"grad_norm": 7.847710609436035,
"learning_rate": 1.5e-05,
"loss": 0.8208,
"step": 858
},
{
"epoch": 1.2774480712166172,
"grad_norm": 8.309576988220215,
"learning_rate": 1.5e-05,
"loss": 0.8579,
"step": 861
},
{
"epoch": 1.281899109792285,
"grad_norm": 8.306654930114746,
"learning_rate": 1.5e-05,
"loss": 0.8137,
"step": 864
},
{
"epoch": 1.2863501483679525,
"grad_norm": 7.458807945251465,
"learning_rate": 1.5e-05,
"loss": 0.8503,
"step": 867
},
{
"epoch": 1.2908011869436202,
"grad_norm": 5.15773344039917,
"learning_rate": 1.5e-05,
"loss": 0.8074,
"step": 870
},
{
"epoch": 1.2952522255192878,
"grad_norm": 5.930022716522217,
"learning_rate": 1.5e-05,
"loss": 0.7877,
"step": 873
},
{
"epoch": 1.2997032640949555,
"grad_norm": 10.39821720123291,
"learning_rate": 1.5e-05,
"loss": 0.7825,
"step": 876
},
{
"epoch": 1.3041543026706233,
"grad_norm": 10.691167831420898,
"learning_rate": 1.5e-05,
"loss": 0.84,
"step": 879
},
{
"epoch": 1.3086053412462908,
"grad_norm": 11.49881362915039,
"learning_rate": 1.5e-05,
"loss": 0.8858,
"step": 882
},
{
"epoch": 1.3130563798219583,
"grad_norm": 8.16782283782959,
"learning_rate": 1.5e-05,
"loss": 0.8237,
"step": 885
},
{
"epoch": 1.317507418397626,
"grad_norm": 5.213159561157227,
"learning_rate": 1.5e-05,
"loss": 0.776,
"step": 888
},
{
"epoch": 1.3219584569732938,
"grad_norm": 7.029541969299316,
"learning_rate": 1.5e-05,
"loss": 0.8448,
"step": 891
},
{
"epoch": 1.3264094955489614,
"grad_norm": 5.071165084838867,
"learning_rate": 1.5e-05,
"loss": 0.915,
"step": 894
},
{
"epoch": 1.3308605341246291,
"grad_norm": 8.019569396972656,
"learning_rate": 1.5e-05,
"loss": 0.873,
"step": 897
},
{
"epoch": 1.3353115727002967,
"grad_norm": 8.721610069274902,
"learning_rate": 1.5e-05,
"loss": 0.8049,
"step": 900
},
{
"epoch": 1.3397626112759644,
"grad_norm": 15.425809860229492,
"learning_rate": 1.5e-05,
"loss": 0.8703,
"step": 903
},
{
"epoch": 1.344213649851632,
"grad_norm": 10.104765892028809,
"learning_rate": 1.5e-05,
"loss": 0.8404,
"step": 906
},
{
"epoch": 1.3486646884272997,
"grad_norm": 6.787662506103516,
"learning_rate": 1.5e-05,
"loss": 0.8585,
"step": 909
},
{
"epoch": 1.3531157270029674,
"grad_norm": 10.807848930358887,
"learning_rate": 1.5e-05,
"loss": 0.8074,
"step": 912
},
{
"epoch": 1.357566765578635,
"grad_norm": 4.6103129386901855,
"learning_rate": 1.5e-05,
"loss": 0.8287,
"step": 915
},
{
"epoch": 1.3620178041543027,
"grad_norm": 7.826140880584717,
"learning_rate": 1.5e-05,
"loss": 0.8319,
"step": 918
},
{
"epoch": 1.3664688427299703,
"grad_norm": 4.535531044006348,
"learning_rate": 1.5e-05,
"loss": 0.8546,
"step": 921
},
{
"epoch": 1.370919881305638,
"grad_norm": 6.764124870300293,
"learning_rate": 1.5e-05,
"loss": 0.8228,
"step": 924
},
{
"epoch": 1.3753709198813056,
"grad_norm": 9.835914611816406,
"learning_rate": 1.5e-05,
"loss": 0.8356,
"step": 927
},
{
"epoch": 1.3798219584569733,
"grad_norm": 10.747434616088867,
"learning_rate": 1.5e-05,
"loss": 0.8648,
"step": 930
},
{
"epoch": 1.384272997032641,
"grad_norm": 7.541149139404297,
"learning_rate": 1.5e-05,
"loss": 0.8486,
"step": 933
},
{
"epoch": 1.3887240356083086,
"grad_norm": 6.978203296661377,
"learning_rate": 1.5e-05,
"loss": 0.8054,
"step": 936
},
{
"epoch": 1.3931750741839761,
"grad_norm": 15.082099914550781,
"learning_rate": 1.5e-05,
"loss": 0.8219,
"step": 939
},
{
"epoch": 1.3976261127596439,
"grad_norm": 8.855502128601074,
"learning_rate": 1.5e-05,
"loss": 0.8023,
"step": 942
},
{
"epoch": 1.4020771513353116,
"grad_norm": 4.794929027557373,
"learning_rate": 1.5e-05,
"loss": 0.8232,
"step": 945
},
{
"epoch": 1.4065281899109792,
"grad_norm": 5.567296028137207,
"learning_rate": 1.5e-05,
"loss": 0.7989,
"step": 948
},
{
"epoch": 1.410979228486647,
"grad_norm": 9.492593765258789,
"learning_rate": 1.5e-05,
"loss": 0.8452,
"step": 951
},
{
"epoch": 1.4154302670623147,
"grad_norm": 7.953827857971191,
"learning_rate": 1.5e-05,
"loss": 0.8377,
"step": 954
},
{
"epoch": 1.4198813056379822,
"grad_norm": 9.085283279418945,
"learning_rate": 1.5e-05,
"loss": 0.8983,
"step": 957
},
{
"epoch": 1.4243323442136497,
"grad_norm": 8.406304359436035,
"learning_rate": 1.5e-05,
"loss": 0.8422,
"step": 960
},
{
"epoch": 1.4287833827893175,
"grad_norm": 5.686973571777344,
"learning_rate": 1.5e-05,
"loss": 0.8542,
"step": 963
},
{
"epoch": 1.4332344213649852,
"grad_norm": 9.381924629211426,
"learning_rate": 1.5e-05,
"loss": 0.8904,
"step": 966
},
{
"epoch": 1.4376854599406528,
"grad_norm": 4.451043128967285,
"learning_rate": 1.5e-05,
"loss": 0.8172,
"step": 969
},
{
"epoch": 1.4421364985163205,
"grad_norm": 7.336870193481445,
"learning_rate": 1.5e-05,
"loss": 0.8578,
"step": 972
},
{
"epoch": 1.446587537091988,
"grad_norm": 8.10446548461914,
"learning_rate": 1.5e-05,
"loss": 0.8826,
"step": 975
},
{
"epoch": 1.4510385756676558,
"grad_norm": 8.376605033874512,
"learning_rate": 1.5e-05,
"loss": 0.8302,
"step": 978
},
{
"epoch": 1.4554896142433233,
"grad_norm": 11.178180694580078,
"learning_rate": 1.5e-05,
"loss": 0.8797,
"step": 981
},
{
"epoch": 1.459940652818991,
"grad_norm": 10.056670188903809,
"learning_rate": 1.5e-05,
"loss": 0.8314,
"step": 984
},
{
"epoch": 1.4643916913946589,
"grad_norm": 5.353207588195801,
"learning_rate": 1.5e-05,
"loss": 0.823,
"step": 987
},
{
"epoch": 1.4688427299703264,
"grad_norm": 10.648890495300293,
"learning_rate": 1.5e-05,
"loss": 0.8789,
"step": 990
},
{
"epoch": 1.4732937685459941,
"grad_norm": 5.265453338623047,
"learning_rate": 1.5e-05,
"loss": 0.8392,
"step": 993
},
{
"epoch": 1.4777448071216617,
"grad_norm": 4.404312610626221,
"learning_rate": 1.5e-05,
"loss": 0.8659,
"step": 996
},
{
"epoch": 1.4821958456973294,
"grad_norm": 7.063133716583252,
"learning_rate": 1.5e-05,
"loss": 0.8903,
"step": 999
},
{
"epoch": 1.486646884272997,
"grad_norm": 12.400032043457031,
"learning_rate": 1.5e-05,
"loss": 0.8696,
"step": 1002
},
{
"epoch": 1.4910979228486647,
"grad_norm": 8.297316551208496,
"learning_rate": 1.5e-05,
"loss": 0.9001,
"step": 1005
},
{
"epoch": 1.4955489614243325,
"grad_norm": 11.91292667388916,
"learning_rate": 1.5e-05,
"loss": 0.8941,
"step": 1008
},
{
"epoch": 1.5,
"grad_norm": 6.494741916656494,
"learning_rate": 1.5e-05,
"loss": 0.8534,
"step": 1011
},
{
"epoch": 1.5044510385756675,
"grad_norm": 11.065376281738281,
"learning_rate": 1.5e-05,
"loss": 0.8624,
"step": 1014
},
{
"epoch": 1.5089020771513353,
"grad_norm": 7.178919315338135,
"learning_rate": 1.5e-05,
"loss": 0.8742,
"step": 1017
},
{
"epoch": 1.513353115727003,
"grad_norm": 5.641129493713379,
"learning_rate": 1.5e-05,
"loss": 0.8493,
"step": 1020
},
{
"epoch": 1.5178041543026706,
"grad_norm": 14.277860641479492,
"learning_rate": 1.5e-05,
"loss": 0.8686,
"step": 1023
},
{
"epoch": 1.5222551928783383,
"grad_norm": 9.708137512207031,
"learning_rate": 1.5e-05,
"loss": 0.83,
"step": 1026
},
{
"epoch": 1.526706231454006,
"grad_norm": 7.91434383392334,
"learning_rate": 1.5e-05,
"loss": 0.841,
"step": 1029
},
{
"epoch": 1.5311572700296736,
"grad_norm": 14.816337585449219,
"learning_rate": 1.5e-05,
"loss": 0.9307,
"step": 1032
},
{
"epoch": 1.5356083086053411,
"grad_norm": 12.463879585266113,
"learning_rate": 1.5e-05,
"loss": 0.8844,
"step": 1035
},
{
"epoch": 1.540059347181009,
"grad_norm": 7.6568217277526855,
"learning_rate": 1.5e-05,
"loss": 0.9198,
"step": 1038
},
{
"epoch": 1.5445103857566767,
"grad_norm": 11.649917602539062,
"learning_rate": 1.5e-05,
"loss": 0.9174,
"step": 1041
},
{
"epoch": 1.5489614243323442,
"grad_norm": 9.973616600036621,
"learning_rate": 1.5e-05,
"loss": 0.9474,
"step": 1044
},
{
"epoch": 1.5534124629080117,
"grad_norm": 11.569575309753418,
"learning_rate": 1.5e-05,
"loss": 0.9166,
"step": 1047
},
{
"epoch": 1.5578635014836797,
"grad_norm": 11.283512115478516,
"learning_rate": 1.5e-05,
"loss": 0.8188,
"step": 1050
},
{
"epoch": 1.5623145400593472,
"grad_norm": 6.829236030578613,
"learning_rate": 1.5e-05,
"loss": 0.8318,
"step": 1053
},
{
"epoch": 1.5667655786350148,
"grad_norm": 10.211257934570312,
"learning_rate": 1.5e-05,
"loss": 0.8139,
"step": 1056
},
{
"epoch": 1.5712166172106825,
"grad_norm": 6.259841442108154,
"learning_rate": 1.5e-05,
"loss": 0.8444,
"step": 1059
},
{
"epoch": 1.5756676557863503,
"grad_norm": 14.19024658203125,
"learning_rate": 1.5e-05,
"loss": 0.8641,
"step": 1062
},
{
"epoch": 1.5801186943620178,
"grad_norm": 6.3594231605529785,
"learning_rate": 1.5e-05,
"loss": 0.9041,
"step": 1065
},
{
"epoch": 1.5845697329376853,
"grad_norm": 9.81156063079834,
"learning_rate": 1.5e-05,
"loss": 0.8703,
"step": 1068
},
{
"epoch": 1.589020771513353,
"grad_norm": 6.122777938842773,
"learning_rate": 1.5e-05,
"loss": 0.8309,
"step": 1071
},
{
"epoch": 1.5934718100890208,
"grad_norm": 11.714445114135742,
"learning_rate": 1.5e-05,
"loss": 0.8151,
"step": 1074
},
{
"epoch": 1.5979228486646884,
"grad_norm": 12.073863983154297,
"learning_rate": 1.5e-05,
"loss": 0.9148,
"step": 1077
},
{
"epoch": 1.6023738872403561,
"grad_norm": 8.177748680114746,
"learning_rate": 1.5e-05,
"loss": 0.865,
"step": 1080
},
{
"epoch": 1.6068249258160239,
"grad_norm": 13.913122177124023,
"learning_rate": 1.5e-05,
"loss": 0.8409,
"step": 1083
},
{
"epoch": 1.6112759643916914,
"grad_norm": 8.375801086425781,
"learning_rate": 1.5e-05,
"loss": 0.7768,
"step": 1086
},
{
"epoch": 1.615727002967359,
"grad_norm": 6.173603057861328,
"learning_rate": 1.5e-05,
"loss": 0.8343,
"step": 1089
},
{
"epoch": 1.6201780415430267,
"grad_norm": 10.390620231628418,
"learning_rate": 1.5e-05,
"loss": 0.8338,
"step": 1092
},
{
"epoch": 1.6246290801186944,
"grad_norm": 8.413612365722656,
"learning_rate": 1.5e-05,
"loss": 0.8517,
"step": 1095
},
{
"epoch": 1.629080118694362,
"grad_norm": 9.790428161621094,
"learning_rate": 1.5e-05,
"loss": 0.8443,
"step": 1098
},
{
"epoch": 1.6335311572700295,
"grad_norm": 13.228864669799805,
"learning_rate": 1.5e-05,
"loss": 0.8481,
"step": 1101
},
{
"epoch": 1.6379821958456975,
"grad_norm": 11.918046951293945,
"learning_rate": 1.5e-05,
"loss": 0.8901,
"step": 1104
},
{
"epoch": 1.642433234421365,
"grad_norm": 6.354975700378418,
"learning_rate": 1.5e-05,
"loss": 0.8817,
"step": 1107
},
{
"epoch": 1.6468842729970326,
"grad_norm": 10.373885154724121,
"learning_rate": 1.5e-05,
"loss": 0.8443,
"step": 1110
},
{
"epoch": 1.6513353115727003,
"grad_norm": 7.181490421295166,
"learning_rate": 1.5e-05,
"loss": 0.857,
"step": 1113
},
{
"epoch": 1.655786350148368,
"grad_norm": 8.490324020385742,
"learning_rate": 1.5e-05,
"loss": 0.8714,
"step": 1116
},
{
"epoch": 1.6602373887240356,
"grad_norm": 5.962569236755371,
"learning_rate": 1.5e-05,
"loss": 0.7782,
"step": 1119
},
{
"epoch": 1.6646884272997031,
"grad_norm": 7.268184185028076,
"learning_rate": 1.5e-05,
"loss": 0.8829,
"step": 1122
},
{
"epoch": 1.6691394658753709,
"grad_norm": 9.73929500579834,
"learning_rate": 1.5e-05,
"loss": 0.8069,
"step": 1125
},
{
"epoch": 1.6735905044510386,
"grad_norm": 8.92696762084961,
"learning_rate": 1.5e-05,
"loss": 0.8727,
"step": 1128
},
{
"epoch": 1.6780415430267062,
"grad_norm": 7.317033767700195,
"learning_rate": 1.5e-05,
"loss": 0.8104,
"step": 1131
},
{
"epoch": 1.682492581602374,
"grad_norm": 6.796001434326172,
"learning_rate": 1.5e-05,
"loss": 0.9314,
"step": 1134
},
{
"epoch": 1.6869436201780417,
"grad_norm": 8.300507545471191,
"learning_rate": 1.5e-05,
"loss": 0.8108,
"step": 1137
},
{
"epoch": 1.6913946587537092,
"grad_norm": 14.353339195251465,
"learning_rate": 1.5e-05,
"loss": 0.8735,
"step": 1140
},
{
"epoch": 1.6958456973293767,
"grad_norm": 8.713440895080566,
"learning_rate": 1.5e-05,
"loss": 0.8634,
"step": 1143
},
{
"epoch": 1.7002967359050445,
"grad_norm": 12.015419960021973,
"learning_rate": 1.5e-05,
"loss": 0.8491,
"step": 1146
},
{
"epoch": 1.7047477744807122,
"grad_norm": 5.322451114654541,
"learning_rate": 1.5e-05,
"loss": 0.8491,
"step": 1149
},
{
"epoch": 1.7091988130563798,
"grad_norm": 7.663971900939941,
"learning_rate": 1.5e-05,
"loss": 0.841,
"step": 1152
},
{
"epoch": 1.7136498516320475,
"grad_norm": 9.272565841674805,
"learning_rate": 1.5e-05,
"loss": 0.9399,
"step": 1155
},
{
"epoch": 1.7181008902077153,
"grad_norm": 6.013884544372559,
"learning_rate": 1.5e-05,
"loss": 0.8239,
"step": 1158
},
{
"epoch": 1.7225519287833828,
"grad_norm": 9.190864562988281,
"learning_rate": 1.5e-05,
"loss": 0.8211,
"step": 1161
},
{
"epoch": 1.7270029673590503,
"grad_norm": 9.801536560058594,
"learning_rate": 1.5e-05,
"loss": 0.8571,
"step": 1164
},
{
"epoch": 1.731454005934718,
"grad_norm": 13.254154205322266,
"learning_rate": 1.5e-05,
"loss": 0.8762,
"step": 1167
},
{
"epoch": 1.7359050445103859,
"grad_norm": 10.48544979095459,
"learning_rate": 1.5e-05,
"loss": 0.7882,
"step": 1170
},
{
"epoch": 1.7403560830860534,
"grad_norm": 9.48491382598877,
"learning_rate": 1.5e-05,
"loss": 0.8939,
"step": 1173
},
{
"epoch": 1.744807121661721,
"grad_norm": 8.662673950195312,
"learning_rate": 1.5e-05,
"loss": 0.8562,
"step": 1176
},
{
"epoch": 1.7492581602373887,
"grad_norm": 11.683974266052246,
"learning_rate": 1.5e-05,
"loss": 0.8178,
"step": 1179
},
{
"epoch": 1.7537091988130564,
"grad_norm": 17.12523078918457,
"learning_rate": 1.5e-05,
"loss": 0.8192,
"step": 1182
},
{
"epoch": 1.758160237388724,
"grad_norm": 4.6900835037231445,
"learning_rate": 1.5e-05,
"loss": 0.8683,
"step": 1185
},
{
"epoch": 1.7626112759643917,
"grad_norm": 7.892794132232666,
"learning_rate": 1.5e-05,
"loss": 0.8594,
"step": 1188
},
{
"epoch": 1.7670623145400595,
"grad_norm": 9.247455596923828,
"learning_rate": 1.5e-05,
"loss": 0.858,
"step": 1191
},
{
"epoch": 1.771513353115727,
"grad_norm": 7.50583028793335,
"learning_rate": 1.5e-05,
"loss": 0.7865,
"step": 1194
},
{
"epoch": 1.7759643916913945,
"grad_norm": 4.668313503265381,
"learning_rate": 1.5e-05,
"loss": 0.8215,
"step": 1197
},
{
"epoch": 1.7804154302670623,
"grad_norm": 10.414295196533203,
"learning_rate": 1.5e-05,
"loss": 0.8199,
"step": 1200
},
{
"epoch": 1.78486646884273,
"grad_norm": 4.297197341918945,
"learning_rate": 1.5e-05,
"loss": 0.8129,
"step": 1203
},
{
"epoch": 1.7893175074183976,
"grad_norm": 9.394143104553223,
"learning_rate": 1.5e-05,
"loss": 0.8624,
"step": 1206
},
{
"epoch": 1.7937685459940653,
"grad_norm": 8.61468505859375,
"learning_rate": 1.5e-05,
"loss": 0.797,
"step": 1209
},
{
"epoch": 1.798219584569733,
"grad_norm": 8.216081619262695,
"learning_rate": 1.5e-05,
"loss": 0.8217,
"step": 1212
},
{
"epoch": 1.8026706231454006,
"grad_norm": 7.414550304412842,
"learning_rate": 1.5e-05,
"loss": 0.8259,
"step": 1215
},
{
"epoch": 1.8071216617210681,
"grad_norm": 7.1664042472839355,
"learning_rate": 1.5e-05,
"loss": 0.8568,
"step": 1218
},
{
"epoch": 1.811572700296736,
"grad_norm": 6.590891361236572,
"learning_rate": 1.5e-05,
"loss": 0.8269,
"step": 1221
},
{
"epoch": 1.8160237388724036,
"grad_norm": 8.408268928527832,
"learning_rate": 1.5e-05,
"loss": 0.9052,
"step": 1224
},
{
"epoch": 1.8204747774480712,
"grad_norm": 19.62491226196289,
"learning_rate": 1.5e-05,
"loss": 0.8959,
"step": 1227
},
{
"epoch": 1.8249258160237387,
"grad_norm": 11.636604309082031,
"learning_rate": 1.5e-05,
"loss": 0.8714,
"step": 1230
},
{
"epoch": 1.8293768545994067,
"grad_norm": 8.018316268920898,
"learning_rate": 1.5e-05,
"loss": 0.8673,
"step": 1233
},
{
"epoch": 1.8338278931750742,
"grad_norm": 4.958278179168701,
"learning_rate": 1.5e-05,
"loss": 0.8551,
"step": 1236
},
{
"epoch": 1.8382789317507418,
"grad_norm": 13.244430541992188,
"learning_rate": 1.5e-05,
"loss": 0.8011,
"step": 1239
},
{
"epoch": 1.8427299703264095,
"grad_norm": 7.185425281524658,
"learning_rate": 1.5e-05,
"loss": 0.7873,
"step": 1242
},
{
"epoch": 1.8471810089020773,
"grad_norm": 5.537222862243652,
"learning_rate": 1.5e-05,
"loss": 0.8237,
"step": 1245
},
{
"epoch": 1.8516320474777448,
"grad_norm": 5.888150215148926,
"learning_rate": 1.5e-05,
"loss": 0.8278,
"step": 1248
},
{
"epoch": 1.8560830860534123,
"grad_norm": 7.887198448181152,
"learning_rate": 1.5e-05,
"loss": 0.8399,
"step": 1251
},
{
"epoch": 1.86053412462908,
"grad_norm": 8.108527183532715,
"learning_rate": 1.5e-05,
"loss": 0.8711,
"step": 1254
},
{
"epoch": 1.8649851632047478,
"grad_norm": 4.459034442901611,
"learning_rate": 1.5e-05,
"loss": 0.9171,
"step": 1257
},
{
"epoch": 1.8694362017804154,
"grad_norm": 4.293658256530762,
"learning_rate": 1.5e-05,
"loss": 0.882,
"step": 1260
},
{
"epoch": 1.8738872403560831,
"grad_norm": 6.042054176330566,
"learning_rate": 1.5e-05,
"loss": 0.8623,
"step": 1263
},
{
"epoch": 1.8783382789317509,
"grad_norm": 11.530425071716309,
"learning_rate": 1.5e-05,
"loss": 0.9091,
"step": 1266
},
{
"epoch": 1.8827893175074184,
"grad_norm": 7.389677047729492,
"learning_rate": 1.5e-05,
"loss": 0.8579,
"step": 1269
},
{
"epoch": 1.887240356083086,
"grad_norm": 10.24569034576416,
"learning_rate": 1.5e-05,
"loss": 0.8281,
"step": 1272
},
{
"epoch": 1.8916913946587537,
"grad_norm": 9.817954063415527,
"learning_rate": 1.5e-05,
"loss": 0.8575,
"step": 1275
},
{
"epoch": 1.8961424332344214,
"grad_norm": 11.875582695007324,
"learning_rate": 1.5e-05,
"loss": 0.8375,
"step": 1278
},
{
"epoch": 1.900593471810089,
"grad_norm": 7.8601837158203125,
"learning_rate": 1.5e-05,
"loss": 0.875,
"step": 1281
},
{
"epoch": 1.9050445103857567,
"grad_norm": 6.628482341766357,
"learning_rate": 1.5e-05,
"loss": 0.8568,
"step": 1284
},
{
"epoch": 1.9094955489614245,
"grad_norm": 14.135390281677246,
"learning_rate": 1.5e-05,
"loss": 0.7945,
"step": 1287
},
{
"epoch": 1.913946587537092,
"grad_norm": 7.571518421173096,
"learning_rate": 1.5e-05,
"loss": 0.8387,
"step": 1290
},
{
"epoch": 1.9183976261127595,
"grad_norm": 4.84207010269165,
"learning_rate": 1.5e-05,
"loss": 0.8573,
"step": 1293
},
{
"epoch": 1.9228486646884273,
"grad_norm": 7.50210428237915,
"learning_rate": 1.5e-05,
"loss": 0.8396,
"step": 1296
},
{
"epoch": 1.927299703264095,
"grad_norm": 10.158517837524414,
"learning_rate": 1.5e-05,
"loss": 0.8407,
"step": 1299
},
{
"epoch": 1.9317507418397626,
"grad_norm": 4.945800304412842,
"learning_rate": 1.5e-05,
"loss": 0.849,
"step": 1302
},
{
"epoch": 1.9362017804154301,
"grad_norm": 5.40016508102417,
"learning_rate": 1.5e-05,
"loss": 0.7703,
"step": 1305
},
{
"epoch": 1.9406528189910979,
"grad_norm": 7.8204665184021,
"learning_rate": 1.5e-05,
"loss": 0.8625,
"step": 1308
},
{
"epoch": 1.9451038575667656,
"grad_norm": 6.786766052246094,
"learning_rate": 1.5e-05,
"loss": 0.8816,
"step": 1311
},
{
"epoch": 1.9495548961424332,
"grad_norm": 6.751473903656006,
"learning_rate": 1.5e-05,
"loss": 0.8586,
"step": 1314
},
{
"epoch": 1.954005934718101,
"grad_norm": 9.781673431396484,
"learning_rate": 1.5e-05,
"loss": 0.8348,
"step": 1317
},
{
"epoch": 1.9584569732937687,
"grad_norm": 14.07801628112793,
"learning_rate": 1.5e-05,
"loss": 0.9011,
"step": 1320
},
{
"epoch": 1.9629080118694362,
"grad_norm": 10.769022941589355,
"learning_rate": 1.5e-05,
"loss": 0.8547,
"step": 1323
},
{
"epoch": 1.9673590504451037,
"grad_norm": 5.165210723876953,
"learning_rate": 1.5e-05,
"loss": 0.8819,
"step": 1326
},
{
"epoch": 1.9718100890207715,
"grad_norm": 6.151379108428955,
"learning_rate": 1.5e-05,
"loss": 0.8767,
"step": 1329
},
{
"epoch": 1.9762611275964392,
"grad_norm": 8.154912948608398,
"learning_rate": 1.5e-05,
"loss": 0.8856,
"step": 1332
},
{
"epoch": 1.9807121661721068,
"grad_norm": 7.511419773101807,
"learning_rate": 1.5e-05,
"loss": 0.835,
"step": 1335
},
{
"epoch": 1.9851632047477745,
"grad_norm": 8.648750305175781,
"learning_rate": 1.5e-05,
"loss": 0.8391,
"step": 1338
},
{
"epoch": 1.9896142433234423,
"grad_norm": 5.8288984298706055,
"learning_rate": 1.5e-05,
"loss": 0.8746,
"step": 1341
},
{
"epoch": 1.9940652818991098,
"grad_norm": 7.342560768127441,
"learning_rate": 1.5e-05,
"loss": 0.8591,
"step": 1344
},
{
"epoch": 1.9985163204747773,
"grad_norm": 9.906895637512207,
"learning_rate": 1.5e-05,
"loss": 0.8925,
"step": 1347
},
{
"epoch": 2.0,
"eval_loss": 2.071904420852661,
"eval_runtime": 553.6606,
"eval_samples_per_second": 2.785,
"eval_steps_per_second": 0.349,
"step": 1348
},
{
"epoch": 2.0029673590504453,
"grad_norm": 11.174464225769043,
"learning_rate": 1.5e-05,
"loss": 0.8723,
"step": 1350
},
{
"epoch": 2.007418397626113,
"grad_norm": 6.163029193878174,
"learning_rate": 1.5e-05,
"loss": 0.8487,
"step": 1353
},
{
"epoch": 2.0118694362017804,
"grad_norm": 11.33940601348877,
"learning_rate": 1.5e-05,
"loss": 0.8795,
"step": 1356
},
{
"epoch": 2.016320474777448,
"grad_norm": 15.676403999328613,
"learning_rate": 1.5e-05,
"loss": 0.91,
"step": 1359
},
{
"epoch": 2.020771513353116,
"grad_norm": 13.067048072814941,
"learning_rate": 1.5e-05,
"loss": 0.8249,
"step": 1362
},
{
"epoch": 2.0252225519287834,
"grad_norm": 9.354158401489258,
"learning_rate": 1.5e-05,
"loss": 0.8865,
"step": 1365
},
{
"epoch": 2.029673590504451,
"grad_norm": 5.574648380279541,
"learning_rate": 1.5e-05,
"loss": 0.8637,
"step": 1368
},
{
"epoch": 2.0341246290801185,
"grad_norm": 15.917570114135742,
"learning_rate": 1.5e-05,
"loss": 0.8854,
"step": 1371
},
{
"epoch": 2.0385756676557865,
"grad_norm": 5.499011516571045,
"learning_rate": 1.5e-05,
"loss": 0.8468,
"step": 1374
},
{
"epoch": 2.043026706231454,
"grad_norm": 15.698616027832031,
"learning_rate": 1.5e-05,
"loss": 0.821,
"step": 1377
},
{
"epoch": 2.0474777448071215,
"grad_norm": 6.169116497039795,
"learning_rate": 1.5e-05,
"loss": 0.8785,
"step": 1380
},
{
"epoch": 2.0519287833827895,
"grad_norm": 8.438339233398438,
"learning_rate": 1.5e-05,
"loss": 0.8298,
"step": 1383
},
{
"epoch": 2.056379821958457,
"grad_norm": 13.13904857635498,
"learning_rate": 1.5e-05,
"loss": 0.8698,
"step": 1386
},
{
"epoch": 2.0608308605341246,
"grad_norm": 5.194973468780518,
"learning_rate": 1.5e-05,
"loss": 0.8581,
"step": 1389
},
{
"epoch": 2.065281899109792,
"grad_norm": 6.09019660949707,
"learning_rate": 1.5e-05,
"loss": 0.8442,
"step": 1392
},
{
"epoch": 2.06973293768546,
"grad_norm": 12.113000869750977,
"learning_rate": 1.5e-05,
"loss": 0.8349,
"step": 1395
},
{
"epoch": 2.0741839762611276,
"grad_norm": 8.027348518371582,
"learning_rate": 1.5e-05,
"loss": 0.8507,
"step": 1398
},
{
"epoch": 2.078635014836795,
"grad_norm": 11.222186088562012,
"learning_rate": 1.5e-05,
"loss": 0.8345,
"step": 1401
},
{
"epoch": 2.083086053412463,
"grad_norm": 7.976278781890869,
"learning_rate": 1.5e-05,
"loss": 0.8036,
"step": 1404
},
{
"epoch": 2.0875370919881306,
"grad_norm": 9.854942321777344,
"learning_rate": 1.5e-05,
"loss": 0.8363,
"step": 1407
},
{
"epoch": 2.091988130563798,
"grad_norm": 11.801050186157227,
"learning_rate": 1.5e-05,
"loss": 0.7906,
"step": 1410
},
{
"epoch": 2.0964391691394657,
"grad_norm": 9.733396530151367,
"learning_rate": 1.5e-05,
"loss": 0.8425,
"step": 1413
},
{
"epoch": 2.1008902077151337,
"grad_norm": 11.16501235961914,
"learning_rate": 1.5e-05,
"loss": 0.7729,
"step": 1416
},
{
"epoch": 2.105341246290801,
"grad_norm": 10.145631790161133,
"learning_rate": 1.5e-05,
"loss": 0.8017,
"step": 1419
},
{
"epoch": 2.1097922848664687,
"grad_norm": 4.5289764404296875,
"learning_rate": 1.5e-05,
"loss": 0.9057,
"step": 1422
},
{
"epoch": 2.1142433234421363,
"grad_norm": 6.727800369262695,
"learning_rate": 1.5e-05,
"loss": 0.8651,
"step": 1425
},
{
"epoch": 2.1186943620178043,
"grad_norm": 11.357308387756348,
"learning_rate": 1.5e-05,
"loss": 0.8358,
"step": 1428
},
{
"epoch": 2.123145400593472,
"grad_norm": 6.047675609588623,
"learning_rate": 1.5e-05,
"loss": 0.8265,
"step": 1431
},
{
"epoch": 2.1275964391691393,
"grad_norm": 8.08861255645752,
"learning_rate": 1.5e-05,
"loss": 0.8374,
"step": 1434
},
{
"epoch": 2.1320474777448073,
"grad_norm": 7.7563958168029785,
"learning_rate": 1.5e-05,
"loss": 0.8818,
"step": 1437
},
{
"epoch": 2.136498516320475,
"grad_norm": 7.988875865936279,
"learning_rate": 1.5e-05,
"loss": 0.8241,
"step": 1440
},
{
"epoch": 2.1409495548961424,
"grad_norm": 12.524341583251953,
"learning_rate": 1.5e-05,
"loss": 0.7918,
"step": 1443
},
{
"epoch": 2.14540059347181,
"grad_norm": 6.229768753051758,
"learning_rate": 1.5e-05,
"loss": 0.862,
"step": 1446
},
{
"epoch": 2.149851632047478,
"grad_norm": 8.271695137023926,
"learning_rate": 1.5e-05,
"loss": 0.8522,
"step": 1449
},
{
"epoch": 2.1543026706231454,
"grad_norm": 5.045875072479248,
"learning_rate": 1.5e-05,
"loss": 0.8574,
"step": 1452
},
{
"epoch": 2.158753709198813,
"grad_norm": 11.379587173461914,
"learning_rate": 1.5e-05,
"loss": 0.8524,
"step": 1455
},
{
"epoch": 2.163204747774481,
"grad_norm": 8.184687614440918,
"learning_rate": 1.5e-05,
"loss": 0.84,
"step": 1458
},
{
"epoch": 2.1676557863501484,
"grad_norm": 9.615589141845703,
"learning_rate": 1.5e-05,
"loss": 0.8334,
"step": 1461
},
{
"epoch": 2.172106824925816,
"grad_norm": 18.80459213256836,
"learning_rate": 1.5e-05,
"loss": 0.805,
"step": 1464
},
{
"epoch": 2.1765578635014835,
"grad_norm": 14.540130615234375,
"learning_rate": 1.5e-05,
"loss": 0.8749,
"step": 1467
},
{
"epoch": 2.1810089020771515,
"grad_norm": 6.465779781341553,
"learning_rate": 1.5e-05,
"loss": 0.8481,
"step": 1470
},
{
"epoch": 2.185459940652819,
"grad_norm": 9.467011451721191,
"learning_rate": 1.5e-05,
"loss": 0.8818,
"step": 1473
},
{
"epoch": 2.1899109792284865,
"grad_norm": 11.624500274658203,
"learning_rate": 1.5e-05,
"loss": 0.8488,
"step": 1476
},
{
"epoch": 2.1943620178041545,
"grad_norm": 4.053292751312256,
"learning_rate": 1.5e-05,
"loss": 0.8816,
"step": 1479
},
{
"epoch": 2.198813056379822,
"grad_norm": 11.990628242492676,
"learning_rate": 1.5e-05,
"loss": 0.8632,
"step": 1482
},
{
"epoch": 2.2032640949554896,
"grad_norm": 5.125602722167969,
"learning_rate": 1.5e-05,
"loss": 0.8546,
"step": 1485
},
{
"epoch": 2.207715133531157,
"grad_norm": 12.101594924926758,
"learning_rate": 1.5e-05,
"loss": 0.7745,
"step": 1488
},
{
"epoch": 2.212166172106825,
"grad_norm": 7.778988838195801,
"learning_rate": 1.5e-05,
"loss": 0.8345,
"step": 1491
},
{
"epoch": 2.2166172106824926,
"grad_norm": 9.549551010131836,
"learning_rate": 1.5e-05,
"loss": 0.788,
"step": 1494
},
{
"epoch": 2.22106824925816,
"grad_norm": 9.322439193725586,
"learning_rate": 1.5e-05,
"loss": 0.8578,
"step": 1497
},
{
"epoch": 2.2255192878338277,
"grad_norm": 4.3148298263549805,
"learning_rate": 1.5e-05,
"loss": 0.8553,
"step": 1500
},
{
"epoch": 2.2299703264094957,
"grad_norm": 8.451520919799805,
"learning_rate": 1.5e-05,
"loss": 0.8682,
"step": 1503
},
{
"epoch": 2.234421364985163,
"grad_norm": 6.928389072418213,
"learning_rate": 1.5e-05,
"loss": 0.8307,
"step": 1506
},
{
"epoch": 2.2388724035608307,
"grad_norm": 6.243911266326904,
"learning_rate": 1.5e-05,
"loss": 0.8033,
"step": 1509
},
{
"epoch": 2.2433234421364987,
"grad_norm": 5.559226036071777,
"learning_rate": 1.5e-05,
"loss": 0.8383,
"step": 1512
},
{
"epoch": 2.2477744807121662,
"grad_norm": 4.369063854217529,
"learning_rate": 1.5e-05,
"loss": 0.8009,
"step": 1515
},
{
"epoch": 2.2522255192878338,
"grad_norm": 7.634733200073242,
"learning_rate": 1.5e-05,
"loss": 0.7984,
"step": 1518
},
{
"epoch": 2.2566765578635013,
"grad_norm": 6.254056453704834,
"learning_rate": 1.5e-05,
"loss": 0.8802,
"step": 1521
},
{
"epoch": 2.2611275964391693,
"grad_norm": 5.46887731552124,
"learning_rate": 1.5e-05,
"loss": 0.8178,
"step": 1524
},
{
"epoch": 2.265578635014837,
"grad_norm": 16.369176864624023,
"learning_rate": 1.5e-05,
"loss": 0.8448,
"step": 1527
},
{
"epoch": 2.2700296735905043,
"grad_norm": 7.460346221923828,
"learning_rate": 1.5e-05,
"loss": 0.8313,
"step": 1530
},
{
"epoch": 2.274480712166172,
"grad_norm": 11.850396156311035,
"learning_rate": 1.5e-05,
"loss": 0.8652,
"step": 1533
},
{
"epoch": 2.27893175074184,
"grad_norm": 7.525960445404053,
"learning_rate": 1.5e-05,
"loss": 0.9073,
"step": 1536
},
{
"epoch": 2.2833827893175074,
"grad_norm": 6.6893391609191895,
"learning_rate": 1.5e-05,
"loss": 0.8186,
"step": 1539
},
{
"epoch": 2.287833827893175,
"grad_norm": 8.127947807312012,
"learning_rate": 1.5e-05,
"loss": 0.8062,
"step": 1542
},
{
"epoch": 2.292284866468843,
"grad_norm": 4.763282299041748,
"learning_rate": 1.5e-05,
"loss": 0.7815,
"step": 1545
},
{
"epoch": 2.2967359050445104,
"grad_norm": 8.980463981628418,
"learning_rate": 1.5e-05,
"loss": 0.8257,
"step": 1548
},
{
"epoch": 2.301186943620178,
"grad_norm": 5.902709484100342,
"learning_rate": 1.5e-05,
"loss": 0.8655,
"step": 1551
},
{
"epoch": 2.3056379821958455,
"grad_norm": 9.1312255859375,
"learning_rate": 1.5e-05,
"loss": 0.8125,
"step": 1554
},
{
"epoch": 2.3100890207715135,
"grad_norm": 11.039669036865234,
"learning_rate": 1.5e-05,
"loss": 0.8216,
"step": 1557
},
{
"epoch": 2.314540059347181,
"grad_norm": 8.05490779876709,
"learning_rate": 1.5e-05,
"loss": 0.8396,
"step": 1560
},
{
"epoch": 2.3189910979228485,
"grad_norm": 5.826514720916748,
"learning_rate": 1.5e-05,
"loss": 0.8757,
"step": 1563
},
{
"epoch": 2.3234421364985165,
"grad_norm": 7.574896812438965,
"learning_rate": 1.5e-05,
"loss": 0.7736,
"step": 1566
},
{
"epoch": 2.327893175074184,
"grad_norm": 6.01354455947876,
"learning_rate": 1.5e-05,
"loss": 0.8401,
"step": 1569
},
{
"epoch": 2.3323442136498516,
"grad_norm": 6.542453289031982,
"learning_rate": 1.5e-05,
"loss": 0.8068,
"step": 1572
},
{
"epoch": 2.336795252225519,
"grad_norm": 9.089799880981445,
"learning_rate": 1.5e-05,
"loss": 0.8628,
"step": 1575
},
{
"epoch": 2.341246290801187,
"grad_norm": 6.65020227432251,
"learning_rate": 1.5e-05,
"loss": 0.8298,
"step": 1578
},
{
"epoch": 2.3456973293768546,
"grad_norm": 6.966747760772705,
"learning_rate": 1.5e-05,
"loss": 0.8445,
"step": 1581
},
{
"epoch": 2.350148367952522,
"grad_norm": 8.938283920288086,
"learning_rate": 1.5e-05,
"loss": 0.8404,
"step": 1584
},
{
"epoch": 2.35459940652819,
"grad_norm": 7.403584957122803,
"learning_rate": 1.5e-05,
"loss": 0.8531,
"step": 1587
},
{
"epoch": 2.3590504451038576,
"grad_norm": 5.015456199645996,
"learning_rate": 1.5e-05,
"loss": 0.8488,
"step": 1590
},
{
"epoch": 2.363501483679525,
"grad_norm": 10.829426765441895,
"learning_rate": 1.5e-05,
"loss": 0.8292,
"step": 1593
},
{
"epoch": 2.3679525222551927,
"grad_norm": 10.542449951171875,
"learning_rate": 1.5e-05,
"loss": 0.9097,
"step": 1596
},
{
"epoch": 2.3724035608308607,
"grad_norm": 5.919280529022217,
"learning_rate": 1.5e-05,
"loss": 0.857,
"step": 1599
},
{
"epoch": 2.376854599406528,
"grad_norm": 12.15097713470459,
"learning_rate": 1.5e-05,
"loss": 0.8439,
"step": 1602
},
{
"epoch": 2.3813056379821957,
"grad_norm": 12.634583473205566,
"learning_rate": 1.5e-05,
"loss": 0.8838,
"step": 1605
},
{
"epoch": 2.3857566765578637,
"grad_norm": 9.54806900024414,
"learning_rate": 1.5e-05,
"loss": 0.9001,
"step": 1608
},
{
"epoch": 2.3902077151335313,
"grad_norm": 5.300346851348877,
"learning_rate": 1.5e-05,
"loss": 0.84,
"step": 1611
},
{
"epoch": 2.394658753709199,
"grad_norm": 6.94837760925293,
"learning_rate": 1.5e-05,
"loss": 0.7643,
"step": 1614
},
{
"epoch": 2.3991097922848663,
"grad_norm": 12.666196823120117,
"learning_rate": 1.5e-05,
"loss": 0.8797,
"step": 1617
},
{
"epoch": 2.4035608308605343,
"grad_norm": 11.400768280029297,
"learning_rate": 1.5e-05,
"loss": 0.8755,
"step": 1620
},
{
"epoch": 2.408011869436202,
"grad_norm": 9.61484146118164,
"learning_rate": 1.5e-05,
"loss": 0.8662,
"step": 1623
},
{
"epoch": 2.4124629080118694,
"grad_norm": 7.2894110679626465,
"learning_rate": 1.5e-05,
"loss": 0.7903,
"step": 1626
},
{
"epoch": 2.4169139465875373,
"grad_norm": 4.545930862426758,
"learning_rate": 1.5e-05,
"loss": 0.862,
"step": 1629
},
{
"epoch": 2.421364985163205,
"grad_norm": 16.610261917114258,
"learning_rate": 1.5e-05,
"loss": 0.8769,
"step": 1632
},
{
"epoch": 2.4258160237388724,
"grad_norm": 14.895539283752441,
"learning_rate": 1.5e-05,
"loss": 0.8884,
"step": 1635
},
{
"epoch": 2.43026706231454,
"grad_norm": 6.956692218780518,
"learning_rate": 1.5e-05,
"loss": 0.8112,
"step": 1638
},
{
"epoch": 2.434718100890208,
"grad_norm": 8.233116149902344,
"learning_rate": 1.5e-05,
"loss": 0.8461,
"step": 1641
},
{
"epoch": 2.4391691394658754,
"grad_norm": 9.529879570007324,
"learning_rate": 1.5e-05,
"loss": 0.8649,
"step": 1644
},
{
"epoch": 2.443620178041543,
"grad_norm": 7.341912269592285,
"learning_rate": 1.5e-05,
"loss": 0.8393,
"step": 1647
},
{
"epoch": 2.4480712166172105,
"grad_norm": 7.184902667999268,
"learning_rate": 1.5e-05,
"loss": 0.8309,
"step": 1650
},
{
"epoch": 2.4525222551928785,
"grad_norm": 14.62401008605957,
"learning_rate": 1.5e-05,
"loss": 0.8549,
"step": 1653
},
{
"epoch": 2.456973293768546,
"grad_norm": 5.0358662605285645,
"learning_rate": 1.5e-05,
"loss": 0.8726,
"step": 1656
},
{
"epoch": 2.4614243323442135,
"grad_norm": 3.3150253295898438,
"learning_rate": 1.5e-05,
"loss": 0.8276,
"step": 1659
},
{
"epoch": 2.465875370919881,
"grad_norm": 8.129389762878418,
"learning_rate": 1.5e-05,
"loss": 0.8692,
"step": 1662
},
{
"epoch": 2.470326409495549,
"grad_norm": 8.148307800292969,
"learning_rate": 1.5e-05,
"loss": 0.8713,
"step": 1665
},
{
"epoch": 2.4747774480712166,
"grad_norm": 5.398252487182617,
"learning_rate": 1.5e-05,
"loss": 0.8615,
"step": 1668
},
{
"epoch": 2.479228486646884,
"grad_norm": 4.667980194091797,
"learning_rate": 1.5e-05,
"loss": 0.8324,
"step": 1671
},
{
"epoch": 2.483679525222552,
"grad_norm": 6.927284240722656,
"learning_rate": 1.5e-05,
"loss": 0.8589,
"step": 1674
},
{
"epoch": 2.4881305637982196,
"grad_norm": 11.005992889404297,
"learning_rate": 1.5e-05,
"loss": 0.8564,
"step": 1677
},
{
"epoch": 2.492581602373887,
"grad_norm": 16.280454635620117,
"learning_rate": 1.5e-05,
"loss": 0.8761,
"step": 1680
},
{
"epoch": 2.4970326409495547,
"grad_norm": 8.563511848449707,
"learning_rate": 1.5e-05,
"loss": 0.8643,
"step": 1683
},
{
"epoch": 2.5014836795252227,
"grad_norm": 17.003629684448242,
"learning_rate": 1.5e-05,
"loss": 0.8249,
"step": 1686
},
{
"epoch": 2.50593471810089,
"grad_norm": 6.441048622131348,
"learning_rate": 1.5e-05,
"loss": 0.9072,
"step": 1689
},
{
"epoch": 2.5103857566765577,
"grad_norm": 6.359565734863281,
"learning_rate": 1.5e-05,
"loss": 0.8495,
"step": 1692
},
{
"epoch": 2.5148367952522257,
"grad_norm": 9.161234855651855,
"learning_rate": 1.5e-05,
"loss": 0.8319,
"step": 1695
},
{
"epoch": 2.5192878338278932,
"grad_norm": 10.241405487060547,
"learning_rate": 1.5e-05,
"loss": 0.7881,
"step": 1698
},
{
"epoch": 2.5237388724035608,
"grad_norm": 9.603667259216309,
"learning_rate": 1.5e-05,
"loss": 0.8681,
"step": 1701
},
{
"epoch": 2.5281899109792283,
"grad_norm": 8.364523887634277,
"learning_rate": 1.5e-05,
"loss": 0.7961,
"step": 1704
},
{
"epoch": 2.5326409495548963,
"grad_norm": 8.140654563903809,
"learning_rate": 1.5e-05,
"loss": 0.8807,
"step": 1707
},
{
"epoch": 2.537091988130564,
"grad_norm": 12.45283031463623,
"learning_rate": 1.5e-05,
"loss": 0.8206,
"step": 1710
},
{
"epoch": 2.5415430267062313,
"grad_norm": 7.65419864654541,
"learning_rate": 1.5e-05,
"loss": 0.8989,
"step": 1713
},
{
"epoch": 2.5459940652818993,
"grad_norm": 4.040281295776367,
"learning_rate": 1.5e-05,
"loss": 0.8814,
"step": 1716
},
{
"epoch": 2.550445103857567,
"grad_norm": 11.04344654083252,
"learning_rate": 1.5e-05,
"loss": 0.8469,
"step": 1719
},
{
"epoch": 2.5548961424332344,
"grad_norm": 12.735292434692383,
"learning_rate": 1.5e-05,
"loss": 0.8847,
"step": 1722
},
{
"epoch": 2.559347181008902,
"grad_norm": 7.6085686683654785,
"learning_rate": 1.5e-05,
"loss": 0.851,
"step": 1725
},
{
"epoch": 2.56379821958457,
"grad_norm": 10.644798278808594,
"learning_rate": 1.5e-05,
"loss": 0.861,
"step": 1728
},
{
"epoch": 2.5682492581602374,
"grad_norm": 7.817785263061523,
"learning_rate": 1.5e-05,
"loss": 0.8574,
"step": 1731
},
{
"epoch": 2.572700296735905,
"grad_norm": 14.533990859985352,
"learning_rate": 1.5e-05,
"loss": 0.8883,
"step": 1734
},
{
"epoch": 2.577151335311573,
"grad_norm": 9.98595905303955,
"learning_rate": 1.5e-05,
"loss": 0.7992,
"step": 1737
},
{
"epoch": 2.5816023738872405,
"grad_norm": 13.704192161560059,
"learning_rate": 1.5e-05,
"loss": 0.8781,
"step": 1740
},
{
"epoch": 2.586053412462908,
"grad_norm": 6.400760650634766,
"learning_rate": 1.5e-05,
"loss": 0.8248,
"step": 1743
},
{
"epoch": 2.5905044510385755,
"grad_norm": 3.9698002338409424,
"learning_rate": 1.5e-05,
"loss": 0.7804,
"step": 1746
},
{
"epoch": 2.594955489614243,
"grad_norm": 5.405271053314209,
"learning_rate": 1.5e-05,
"loss": 0.8691,
"step": 1749
},
{
"epoch": 2.599406528189911,
"grad_norm": 7.326030731201172,
"learning_rate": 1.5e-05,
"loss": 0.7876,
"step": 1752
},
{
"epoch": 2.6038575667655786,
"grad_norm": 12.94884967803955,
"learning_rate": 1.5e-05,
"loss": 0.7877,
"step": 1755
},
{
"epoch": 2.6083086053412465,
"grad_norm": 12.542633056640625,
"learning_rate": 1.5e-05,
"loss": 0.8311,
"step": 1758
},
{
"epoch": 2.612759643916914,
"grad_norm": 12.357892036437988,
"learning_rate": 1.5e-05,
"loss": 0.8398,
"step": 1761
},
{
"epoch": 2.6172106824925816,
"grad_norm": 10.735803604125977,
"learning_rate": 1.5e-05,
"loss": 0.8557,
"step": 1764
},
{
"epoch": 2.621661721068249,
"grad_norm": 7.849278450012207,
"learning_rate": 1.5e-05,
"loss": 0.8559,
"step": 1767
},
{
"epoch": 2.6261127596439167,
"grad_norm": 7.459741592407227,
"learning_rate": 1.5e-05,
"loss": 0.8382,
"step": 1770
},
{
"epoch": 2.6305637982195846,
"grad_norm": 9.422908782958984,
"learning_rate": 1.5e-05,
"loss": 0.8586,
"step": 1773
},
{
"epoch": 2.635014836795252,
"grad_norm": 6.327311038970947,
"learning_rate": 1.5e-05,
"loss": 0.8232,
"step": 1776
},
{
"epoch": 2.63946587537092,
"grad_norm": 10.571976661682129,
"learning_rate": 1.5e-05,
"loss": 0.8694,
"step": 1779
},
{
"epoch": 2.6439169139465877,
"grad_norm": 17.467416763305664,
"learning_rate": 1.5e-05,
"loss": 0.8693,
"step": 1782
},
{
"epoch": 2.648367952522255,
"grad_norm": 6.911043167114258,
"learning_rate": 1.5e-05,
"loss": 0.8643,
"step": 1785
},
{
"epoch": 2.6528189910979227,
"grad_norm": 10.180506706237793,
"learning_rate": 1.5e-05,
"loss": 0.832,
"step": 1788
},
{
"epoch": 2.6572700296735903,
"grad_norm": 5.487372398376465,
"learning_rate": 1.5e-05,
"loss": 0.8211,
"step": 1791
},
{
"epoch": 2.6617210682492582,
"grad_norm": 8.488285064697266,
"learning_rate": 1.5e-05,
"loss": 0.8174,
"step": 1794
},
{
"epoch": 2.666172106824926,
"grad_norm": 14.654566764831543,
"learning_rate": 1.5e-05,
"loss": 0.8443,
"step": 1797
},
{
"epoch": 2.6706231454005933,
"grad_norm": 8.551965713500977,
"learning_rate": 1.5e-05,
"loss": 0.8778,
"step": 1800
},
{
"epoch": 2.6750741839762613,
"grad_norm": 6.797290802001953,
"learning_rate": 1.5e-05,
"loss": 0.859,
"step": 1803
},
{
"epoch": 2.679525222551929,
"grad_norm": 4.413401126861572,
"learning_rate": 1.5e-05,
"loss": 0.9004,
"step": 1806
},
{
"epoch": 2.6839762611275964,
"grad_norm": 8.826961517333984,
"learning_rate": 1.5e-05,
"loss": 0.8618,
"step": 1809
},
{
"epoch": 2.688427299703264,
"grad_norm": 6.915543556213379,
"learning_rate": 1.5e-05,
"loss": 0.8242,
"step": 1812
},
{
"epoch": 2.692878338278932,
"grad_norm": 7.802698612213135,
"learning_rate": 1.5e-05,
"loss": 0.8373,
"step": 1815
},
{
"epoch": 2.6973293768545994,
"grad_norm": 4.345271587371826,
"learning_rate": 1.5e-05,
"loss": 0.8212,
"step": 1818
},
{
"epoch": 2.701780415430267,
"grad_norm": 8.312252044677734,
"learning_rate": 1.5e-05,
"loss": 0.807,
"step": 1821
},
{
"epoch": 2.706231454005935,
"grad_norm": 5.19842004776001,
"learning_rate": 1.5e-05,
"loss": 0.8513,
"step": 1824
},
{
"epoch": 2.7106824925816024,
"grad_norm": 14.573792457580566,
"learning_rate": 1.5e-05,
"loss": 0.8215,
"step": 1827
},
{
"epoch": 2.71513353115727,
"grad_norm": 6.5800323486328125,
"learning_rate": 1.5e-05,
"loss": 0.8826,
"step": 1830
},
{
"epoch": 2.7195845697329375,
"grad_norm": 14.643542289733887,
"learning_rate": 1.5e-05,
"loss": 0.7716,
"step": 1833
},
{
"epoch": 2.7240356083086055,
"grad_norm": 12.744583129882812,
"learning_rate": 1.5e-05,
"loss": 0.8621,
"step": 1836
},
{
"epoch": 2.728486646884273,
"grad_norm": 12.435503005981445,
"learning_rate": 1.5e-05,
"loss": 0.8908,
"step": 1839
},
{
"epoch": 2.7329376854599405,
"grad_norm": 6.115302562713623,
"learning_rate": 1.5e-05,
"loss": 0.8921,
"step": 1842
},
{
"epoch": 2.7373887240356085,
"grad_norm": 14.632364273071289,
"learning_rate": 1.5e-05,
"loss": 0.8535,
"step": 1845
},
{
"epoch": 2.741839762611276,
"grad_norm": 5.676476001739502,
"learning_rate": 1.5e-05,
"loss": 0.8872,
"step": 1848
},
{
"epoch": 2.7462908011869436,
"grad_norm": 12.727757453918457,
"learning_rate": 1.5e-05,
"loss": 0.8836,
"step": 1851
},
{
"epoch": 2.750741839762611,
"grad_norm": 5.729983329772949,
"learning_rate": 1.5e-05,
"loss": 0.8453,
"step": 1854
},
{
"epoch": 2.755192878338279,
"grad_norm": 8.607340812683105,
"learning_rate": 1.5e-05,
"loss": 0.8795,
"step": 1857
},
{
"epoch": 2.7596439169139466,
"grad_norm": 7.55084228515625,
"learning_rate": 1.5e-05,
"loss": 0.8788,
"step": 1860
},
{
"epoch": 2.764094955489614,
"grad_norm": 10.093510627746582,
"learning_rate": 1.5e-05,
"loss": 0.8816,
"step": 1863
},
{
"epoch": 2.768545994065282,
"grad_norm": 8.678201675415039,
"learning_rate": 1.5e-05,
"loss": 0.867,
"step": 1866
},
{
"epoch": 2.7729970326409497,
"grad_norm": 6.614081859588623,
"learning_rate": 1.5e-05,
"loss": 0.887,
"step": 1869
},
{
"epoch": 2.777448071216617,
"grad_norm": 6.593700408935547,
"learning_rate": 1.5e-05,
"loss": 0.8929,
"step": 1872
},
{
"epoch": 2.7818991097922847,
"grad_norm": 5.097481727600098,
"learning_rate": 1.5e-05,
"loss": 0.8629,
"step": 1875
},
{
"epoch": 2.7863501483679523,
"grad_norm": 5.016817569732666,
"learning_rate": 1.5e-05,
"loss": 0.8413,
"step": 1878
},
{
"epoch": 2.7908011869436202,
"grad_norm": 7.502362251281738,
"learning_rate": 1.5e-05,
"loss": 0.8945,
"step": 1881
},
{
"epoch": 2.7952522255192878,
"grad_norm": 4.612887859344482,
"learning_rate": 1.5e-05,
"loss": 0.8811,
"step": 1884
},
{
"epoch": 2.7997032640949557,
"grad_norm": 5.493846893310547,
"learning_rate": 1.5e-05,
"loss": 0.8757,
"step": 1887
},
{
"epoch": 2.8041543026706233,
"grad_norm": 8.605670928955078,
"learning_rate": 1.5e-05,
"loss": 0.8047,
"step": 1890
},
{
"epoch": 2.808605341246291,
"grad_norm": 12.178396224975586,
"learning_rate": 1.5e-05,
"loss": 0.8905,
"step": 1893
},
{
"epoch": 2.8130563798219583,
"grad_norm": 8.929186820983887,
"learning_rate": 1.5e-05,
"loss": 0.8869,
"step": 1896
},
{
"epoch": 2.817507418397626,
"grad_norm": 6.589859962463379,
"learning_rate": 1.5e-05,
"loss": 0.8473,
"step": 1899
},
{
"epoch": 2.821958456973294,
"grad_norm": 10.543880462646484,
"learning_rate": 1.5e-05,
"loss": 0.8613,
"step": 1902
},
{
"epoch": 2.8264094955489614,
"grad_norm": 8.176854133605957,
"learning_rate": 1.5e-05,
"loss": 0.8429,
"step": 1905
},
{
"epoch": 2.8308605341246293,
"grad_norm": 5.652864456176758,
"learning_rate": 1.5e-05,
"loss": 0.8754,
"step": 1908
},
{
"epoch": 2.835311572700297,
"grad_norm": 8.587650299072266,
"learning_rate": 1.5e-05,
"loss": 0.9134,
"step": 1911
},
{
"epoch": 2.8397626112759644,
"grad_norm": 5.4106974601745605,
"learning_rate": 1.5e-05,
"loss": 0.8334,
"step": 1914
},
{
"epoch": 2.844213649851632,
"grad_norm": 6.253225803375244,
"learning_rate": 1.5e-05,
"loss": 0.901,
"step": 1917
},
{
"epoch": 2.8486646884272995,
"grad_norm": 8.90531063079834,
"learning_rate": 1.5e-05,
"loss": 0.8316,
"step": 1920
},
{
"epoch": 2.8531157270029674,
"grad_norm": 4.412182807922363,
"learning_rate": 1.5e-05,
"loss": 0.8384,
"step": 1923
},
{
"epoch": 2.857566765578635,
"grad_norm": 6.357685565948486,
"learning_rate": 1.5e-05,
"loss": 0.8903,
"step": 1926
},
{
"epoch": 2.8620178041543025,
"grad_norm": 7.667703628540039,
"learning_rate": 1.5e-05,
"loss": 0.8455,
"step": 1929
},
{
"epoch": 2.8664688427299705,
"grad_norm": 10.909478187561035,
"learning_rate": 1.5e-05,
"loss": 0.8399,
"step": 1932
},
{
"epoch": 2.870919881305638,
"grad_norm": 7.347332954406738,
"learning_rate": 1.5e-05,
"loss": 0.8198,
"step": 1935
},
{
"epoch": 2.8753709198813056,
"grad_norm": 7.22322416305542,
"learning_rate": 1.5e-05,
"loss": 0.9074,
"step": 1938
},
{
"epoch": 2.879821958456973,
"grad_norm": 5.389438152313232,
"learning_rate": 1.5e-05,
"loss": 0.846,
"step": 1941
},
{
"epoch": 2.884272997032641,
"grad_norm": 8.13633918762207,
"learning_rate": 1.5e-05,
"loss": 0.8615,
"step": 1944
},
{
"epoch": 2.8887240356083086,
"grad_norm": 7.694199085235596,
"learning_rate": 1.5e-05,
"loss": 0.7791,
"step": 1947
},
{
"epoch": 2.893175074183976,
"grad_norm": 10.673176765441895,
"learning_rate": 1.5e-05,
"loss": 0.8234,
"step": 1950
},
{
"epoch": 2.897626112759644,
"grad_norm": 7.695837020874023,
"learning_rate": 1.5e-05,
"loss": 0.8471,
"step": 1953
},
{
"epoch": 2.9020771513353116,
"grad_norm": 11.210200309753418,
"learning_rate": 1.5e-05,
"loss": 0.8561,
"step": 1956
},
{
"epoch": 2.906528189910979,
"grad_norm": 13.856889724731445,
"learning_rate": 1.5e-05,
"loss": 0.8177,
"step": 1959
},
{
"epoch": 2.9109792284866467,
"grad_norm": 6.0733489990234375,
"learning_rate": 1.5e-05,
"loss": 0.8885,
"step": 1962
},
{
"epoch": 2.9154302670623147,
"grad_norm": 7.38955545425415,
"learning_rate": 1.5e-05,
"loss": 0.8502,
"step": 1965
},
{
"epoch": 2.919881305637982,
"grad_norm": 13.866927146911621,
"learning_rate": 1.5e-05,
"loss": 0.8622,
"step": 1968
},
{
"epoch": 2.9243323442136497,
"grad_norm": 6.984748363494873,
"learning_rate": 1.5e-05,
"loss": 0.9041,
"step": 1971
},
{
"epoch": 2.9287833827893177,
"grad_norm": 13.53242301940918,
"learning_rate": 1.5e-05,
"loss": 0.8367,
"step": 1974
},
{
"epoch": 2.9332344213649852,
"grad_norm": 6.702526092529297,
"learning_rate": 1.5e-05,
"loss": 0.8343,
"step": 1977
},
{
"epoch": 2.9376854599406528,
"grad_norm": 5.4899678230285645,
"learning_rate": 1.5e-05,
"loss": 0.8873,
"step": 1980
},
{
"epoch": 2.9421364985163203,
"grad_norm": 8.483062744140625,
"learning_rate": 1.5e-05,
"loss": 0.8078,
"step": 1983
},
{
"epoch": 2.9465875370919883,
"grad_norm": 7.6923065185546875,
"learning_rate": 1.5e-05,
"loss": 0.8316,
"step": 1986
},
{
"epoch": 2.951038575667656,
"grad_norm": 4.571675777435303,
"learning_rate": 1.5e-05,
"loss": 0.882,
"step": 1989
},
{
"epoch": 2.9554896142433233,
"grad_norm": 8.073565483093262,
"learning_rate": 1.5e-05,
"loss": 0.8233,
"step": 1992
},
{
"epoch": 2.9599406528189913,
"grad_norm": 13.49317455291748,
"learning_rate": 1.5e-05,
"loss": 0.9236,
"step": 1995
},
{
"epoch": 2.964391691394659,
"grad_norm": 6.603755474090576,
"learning_rate": 1.5e-05,
"loss": 0.8692,
"step": 1998
},
{
"epoch": 2.9688427299703264,
"grad_norm": 8.523149490356445,
"learning_rate": 1.5e-05,
"loss": 0.8333,
"step": 2001
},
{
"epoch": 2.973293768545994,
"grad_norm": 9.513497352600098,
"learning_rate": 1.5e-05,
"loss": 0.8879,
"step": 2004
},
{
"epoch": 2.9777448071216615,
"grad_norm": 8.058304786682129,
"learning_rate": 1.5e-05,
"loss": 0.8539,
"step": 2007
},
{
"epoch": 2.9821958456973294,
"grad_norm": 6.942746162414551,
"learning_rate": 1.5e-05,
"loss": 0.8532,
"step": 2010
},
{
"epoch": 2.986646884272997,
"grad_norm": 7.1598639488220215,
"learning_rate": 1.5e-05,
"loss": 0.8094,
"step": 2013
},
{
"epoch": 2.991097922848665,
"grad_norm": 7.722570419311523,
"learning_rate": 1.5e-05,
"loss": 0.85,
"step": 2016
},
{
"epoch": 2.9955489614243325,
"grad_norm": 7.609329700469971,
"learning_rate": 1.5e-05,
"loss": 0.9311,
"step": 2019
},
{
"epoch": 3.0,
"grad_norm": 6.5114426612854,
"learning_rate": 1.5e-05,
"loss": 0.8043,
"step": 2022
},
{
"epoch": 3.0,
"eval_loss": 2.056363105773926,
"eval_runtime": 552.8503,
"eval_samples_per_second": 2.789,
"eval_steps_per_second": 0.349,
"step": 2022
}
],
"logging_steps": 3,
"max_steps": 6740,
"num_input_tokens_seen": 0,
"num_train_epochs": 10,
"save_steps": 500,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 3,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 0
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}