Dual-Explain-2_round / trainer_state.json
tkhangg0910's picture
Upload folder using huggingface_hub
8560113 verified
{
"best_global_step": null,
"best_metric": 0.6839648485183716,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 1011,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.002967359050445104,
"grad_norm": 15.052371978759766,
"learning_rate": 0.0,
"loss": 0.3511,
"step": 1
},
{
"epoch": 0.008902077151335312,
"grad_norm": 11.145326614379883,
"learning_rate": 3.2123800577354604e-06,
"loss": 0.3486,
"step": 3
},
{
"epoch": 0.017804154302670624,
"grad_norm": 7.514125823974609,
"learning_rate": 5.239166215940359e-06,
"loss": 0.3444,
"step": 6
},
{
"epoch": 0.026706231454005934,
"grad_norm": 8.797248840332031,
"learning_rate": 6.424760115470921e-06,
"loss": 0.3284,
"step": 9
},
{
"epoch": 0.03560830860534125,
"grad_norm": 3.7588727474212646,
"learning_rate": 7.265952374145257e-06,
"loss": 0.3375,
"step": 12
},
{
"epoch": 0.04451038575667656,
"grad_norm": 7.172510623931885,
"learning_rate": 7.918431780800236e-06,
"loss": 0.3271,
"step": 15
},
{
"epoch": 0.05341246290801187,
"grad_norm": 3.615478992462158,
"learning_rate": 8.451546273675818e-06,
"loss": 0.3277,
"step": 18
},
{
"epoch": 0.06231454005934718,
"grad_norm": 6.7531232833862305,
"learning_rate": 8.902288154930203e-06,
"loss": 0.3343,
"step": 21
},
{
"epoch": 0.0712166172106825,
"grad_norm": 6.248157501220703,
"learning_rate": 9.292738532350157e-06,
"loss": 0.3294,
"step": 24
},
{
"epoch": 0.08011869436201781,
"grad_norm": 6.436707019805908,
"learning_rate": 9.637140173206382e-06,
"loss": 0.3358,
"step": 27
},
{
"epoch": 0.08902077151335312,
"grad_norm": 9.06855583190918,
"learning_rate": 9.945217939005136e-06,
"loss": 0.3293,
"step": 30
},
{
"epoch": 0.09792284866468842,
"grad_norm": 7.107985973358154,
"learning_rate": 1.0223908177645902e-05,
"loss": 0.3111,
"step": 33
},
{
"epoch": 0.10682492581602374,
"grad_norm": 6.3856520652771,
"learning_rate": 1.0478332431880717e-05,
"loss": 0.3251,
"step": 36
},
{
"epoch": 0.11572700296735905,
"grad_norm": 4.95862340927124,
"learning_rate": 1.0712380057735461e-05,
"loss": 0.3211,
"step": 39
},
{
"epoch": 0.12462908011869436,
"grad_norm": 4.809103012084961,
"learning_rate": 1.0929074313135101e-05,
"loss": 0.3244,
"step": 42
},
{
"epoch": 0.13353115727002968,
"grad_norm": 6.697277069091797,
"learning_rate": 1.1130811838535696e-05,
"loss": 0.3179,
"step": 45
},
{
"epoch": 0.142433234421365,
"grad_norm": 12.093936920166016,
"learning_rate": 1.1319524690555053e-05,
"loss": 0.3339,
"step": 48
},
{
"epoch": 0.1513353115727003,
"grad_norm": 8.427734375,
"learning_rate": 1.1496793166558515e-05,
"loss": 0.329,
"step": 51
},
{
"epoch": 0.16023738872403562,
"grad_norm": 11.345624923706055,
"learning_rate": 1.1663926331411281e-05,
"loss": 0.3212,
"step": 54
},
{
"epoch": 0.16913946587537093,
"grad_norm": 7.70440149307251,
"learning_rate": 1.1822020743040672e-05,
"loss": 0.3119,
"step": 57
},
{
"epoch": 0.17804154302670624,
"grad_norm": 4.138779163360596,
"learning_rate": 1.1972004097210032e-05,
"loss": 0.3186,
"step": 60
},
{
"epoch": 0.18694362017804153,
"grad_norm": 9.505159378051758,
"learning_rate": 1.2114668212665663e-05,
"loss": 0.3125,
"step": 63
},
{
"epoch": 0.19584569732937684,
"grad_norm": 5.866057395935059,
"learning_rate": 1.2250694335850798e-05,
"loss": 0.3217,
"step": 66
},
{
"epoch": 0.20474777448071216,
"grad_norm": 9.940937995910645,
"learning_rate": 1.238067281605409e-05,
"loss": 0.3295,
"step": 69
},
{
"epoch": 0.21364985163204747,
"grad_norm": 8.819473266601562,
"learning_rate": 1.2505118590085615e-05,
"loss": 0.3351,
"step": 72
},
{
"epoch": 0.22255192878338279,
"grad_norm": 9.78069019317627,
"learning_rate": 1.262448350386501e-05,
"loss": 0.3383,
"step": 75
},
{
"epoch": 0.2314540059347181,
"grad_norm": 5.660183429718018,
"learning_rate": 1.2739166215940359e-05,
"loss": 0.3252,
"step": 78
},
{
"epoch": 0.2403560830860534,
"grad_norm": 7.043887615203857,
"learning_rate": 1.2849520230941842e-05,
"loss": 0.3249,
"step": 81
},
{
"epoch": 0.24925816023738873,
"grad_norm": 7.998411178588867,
"learning_rate": 1.295586047134e-05,
"loss": 0.3359,
"step": 84
},
{
"epoch": 0.258160237388724,
"grad_norm": 11.106488227844238,
"learning_rate": 1.3058468695482481e-05,
"loss": 0.3211,
"step": 87
},
{
"epoch": 0.26706231454005935,
"grad_norm": 5.680809497833252,
"learning_rate": 1.3157597996740594e-05,
"loss": 0.3152,
"step": 90
},
{
"epoch": 0.27596439169139464,
"grad_norm": 6.006475925445557,
"learning_rate": 1.3253476564657357e-05,
"loss": 0.3171,
"step": 93
},
{
"epoch": 0.28486646884273,
"grad_norm": 4.647056579589844,
"learning_rate": 1.3346310848759951e-05,
"loss": 0.31,
"step": 96
},
{
"epoch": 0.29376854599406527,
"grad_norm": 4.605801582336426,
"learning_rate": 1.343628823538136e-05,
"loss": 0.3185,
"step": 99
},
{
"epoch": 0.3026706231454006,
"grad_norm": 17.812042236328125,
"learning_rate": 1.3523579324763411e-05,
"loss": 0.3161,
"step": 102
},
{
"epoch": 0.3115727002967359,
"grad_norm": 12.627123832702637,
"learning_rate": 1.3608339877994978e-05,
"loss": 0.329,
"step": 105
},
{
"epoch": 0.32047477744807124,
"grad_norm": 7.308374404907227,
"learning_rate": 1.3690712489616179e-05,
"loss": 0.3179,
"step": 108
},
{
"epoch": 0.3293768545994065,
"grad_norm": 5.522846698760986,
"learning_rate": 1.3770828031006136e-05,
"loss": 0.3245,
"step": 111
},
{
"epoch": 0.33827893175074186,
"grad_norm": 8.475018501281738,
"learning_rate": 1.384880690124557e-05,
"loss": 0.3105,
"step": 114
},
{
"epoch": 0.34718100890207715,
"grad_norm": 8.174044609069824,
"learning_rate": 1.3924760115470921e-05,
"loss": 0.3057,
"step": 117
},
{
"epoch": 0.3560830860534125,
"grad_norm": 6.196917533874512,
"learning_rate": 1.399879025541493e-05,
"loss": 0.336,
"step": 120
},
{
"epoch": 0.3649851632047478,
"grad_norm": 9.92663860321045,
"learning_rate": 1.4070992302558296e-05,
"loss": 0.3253,
"step": 123
},
{
"epoch": 0.37388724035608306,
"grad_norm": 7.014742374420166,
"learning_rate": 1.4141454370870561e-05,
"loss": 0.3129,
"step": 126
},
{
"epoch": 0.3827893175074184,
"grad_norm": 8.182991027832031,
"learning_rate": 1.421025835332077e-05,
"loss": 0.3329,
"step": 129
},
{
"epoch": 0.3916913946587537,
"grad_norm": 7.044063091278076,
"learning_rate": 1.4277480494055697e-05,
"loss": 0.3265,
"step": 132
},
{
"epoch": 0.40059347181008903,
"grad_norm": 7.129275321960449,
"learning_rate": 1.4343191896271158e-05,
"loss": 0.3259,
"step": 135
},
{
"epoch": 0.4094955489614243,
"grad_norm": 6.028862476348877,
"learning_rate": 1.4407458974258987e-05,
"loss": 0.326,
"step": 138
},
{
"epoch": 0.41839762611275966,
"grad_norm": 9.75017261505127,
"learning_rate": 1.4470343856834936e-05,
"loss": 0.3242,
"step": 141
},
{
"epoch": 0.42729970326409494,
"grad_norm": 8.225783348083496,
"learning_rate": 1.4531904748290513e-05,
"loss": 0.3061,
"step": 144
},
{
"epoch": 0.4362017804154303,
"grad_norm": 7.090129852294922,
"learning_rate": 1.4592196252124945e-05,
"loss": 0.3152,
"step": 147
},
{
"epoch": 0.44510385756676557,
"grad_norm": 11.792762756347656,
"learning_rate": 1.465126966206991e-05,
"loss": 0.3258,
"step": 150
},
{
"epoch": 0.4540059347181009,
"grad_norm": 8.907958984375,
"learning_rate": 1.4709173224293973e-05,
"loss": 0.3203,
"step": 153
},
{
"epoch": 0.4629080118694362,
"grad_norm": 4.160986423492432,
"learning_rate": 1.4765952374145259e-05,
"loss": 0.3156,
"step": 156
},
{
"epoch": 0.47181008902077154,
"grad_norm": 4.767416954040527,
"learning_rate": 1.482164995034286e-05,
"loss": 0.3238,
"step": 159
},
{
"epoch": 0.4807121661721068,
"grad_norm": 6.091423511505127,
"learning_rate": 1.4876306389146738e-05,
"loss": 0.3186,
"step": 162
},
{
"epoch": 0.4896142433234421,
"grad_norm": 9.760332107543945,
"learning_rate": 1.4929959900710676e-05,
"loss": 0.3111,
"step": 165
},
{
"epoch": 0.49851632047477745,
"grad_norm": 7.750044822692871,
"learning_rate": 1.4982646629544899e-05,
"loss": 0.3209,
"step": 168
},
{
"epoch": 0.5074183976261127,
"grad_norm": 10.224579811096191,
"learning_rate": 1.5e-05,
"loss": 0.3262,
"step": 171
},
{
"epoch": 0.516320474777448,
"grad_norm": 8.861902236938477,
"learning_rate": 1.5e-05,
"loss": 0.3121,
"step": 174
},
{
"epoch": 0.5252225519287834,
"grad_norm": 4.321209907531738,
"learning_rate": 1.5e-05,
"loss": 0.3133,
"step": 177
},
{
"epoch": 0.5341246290801187,
"grad_norm": 4.978445529937744,
"learning_rate": 1.5e-05,
"loss": 0.3071,
"step": 180
},
{
"epoch": 0.543026706231454,
"grad_norm": 10.364422798156738,
"learning_rate": 1.5e-05,
"loss": 0.3178,
"step": 183
},
{
"epoch": 0.5519287833827893,
"grad_norm": 8.970362663269043,
"learning_rate": 1.5e-05,
"loss": 0.3212,
"step": 186
},
{
"epoch": 0.5608308605341247,
"grad_norm": 6.0015645027160645,
"learning_rate": 1.5e-05,
"loss": 0.3184,
"step": 189
},
{
"epoch": 0.56973293768546,
"grad_norm": 5.832813739776611,
"learning_rate": 1.5e-05,
"loss": 0.3211,
"step": 192
},
{
"epoch": 0.5786350148367952,
"grad_norm": 7.478390693664551,
"learning_rate": 1.5e-05,
"loss": 0.3432,
"step": 195
},
{
"epoch": 0.5875370919881305,
"grad_norm": 5.908259868621826,
"learning_rate": 1.5e-05,
"loss": 0.3287,
"step": 198
},
{
"epoch": 0.5964391691394659,
"grad_norm": 8.518238067626953,
"learning_rate": 1.5e-05,
"loss": 0.313,
"step": 201
},
{
"epoch": 0.6053412462908012,
"grad_norm": 6.046856880187988,
"learning_rate": 1.5e-05,
"loss": 0.3197,
"step": 204
},
{
"epoch": 0.6142433234421365,
"grad_norm": 9.07325553894043,
"learning_rate": 1.5e-05,
"loss": 0.3105,
"step": 207
},
{
"epoch": 0.6231454005934718,
"grad_norm": 7.418500900268555,
"learning_rate": 1.5e-05,
"loss": 0.3178,
"step": 210
},
{
"epoch": 0.6320474777448071,
"grad_norm": 10.935755729675293,
"learning_rate": 1.5e-05,
"loss": 0.3192,
"step": 213
},
{
"epoch": 0.6409495548961425,
"grad_norm": 8.953109741210938,
"learning_rate": 1.5e-05,
"loss": 0.336,
"step": 216
},
{
"epoch": 0.6498516320474778,
"grad_norm": 10.9089937210083,
"learning_rate": 1.5e-05,
"loss": 0.3362,
"step": 219
},
{
"epoch": 0.658753709198813,
"grad_norm": 7.62611198425293,
"learning_rate": 1.5e-05,
"loss": 0.3219,
"step": 222
},
{
"epoch": 0.6676557863501483,
"grad_norm": 8.877312660217285,
"learning_rate": 1.5e-05,
"loss": 0.3242,
"step": 225
},
{
"epoch": 0.6765578635014837,
"grad_norm": 11.891584396362305,
"learning_rate": 1.5e-05,
"loss": 0.3193,
"step": 228
},
{
"epoch": 0.685459940652819,
"grad_norm": 6.047501564025879,
"learning_rate": 1.5e-05,
"loss": 0.3145,
"step": 231
},
{
"epoch": 0.6943620178041543,
"grad_norm": 11.06523609161377,
"learning_rate": 1.5e-05,
"loss": 0.3205,
"step": 234
},
{
"epoch": 0.7032640949554896,
"grad_norm": 14.651629447937012,
"learning_rate": 1.5e-05,
"loss": 0.3181,
"step": 237
},
{
"epoch": 0.712166172106825,
"grad_norm": 4.986928462982178,
"learning_rate": 1.5e-05,
"loss": 0.3188,
"step": 240
},
{
"epoch": 0.7210682492581603,
"grad_norm": 5.383213520050049,
"learning_rate": 1.5e-05,
"loss": 0.3322,
"step": 243
},
{
"epoch": 0.7299703264094956,
"grad_norm": 7.467197418212891,
"learning_rate": 1.5e-05,
"loss": 0.3231,
"step": 246
},
{
"epoch": 0.7388724035608308,
"grad_norm": 8.040964126586914,
"learning_rate": 1.5e-05,
"loss": 0.3194,
"step": 249
},
{
"epoch": 0.7477744807121661,
"grad_norm": 9.442214012145996,
"learning_rate": 1.5e-05,
"loss": 0.3045,
"step": 252
},
{
"epoch": 0.7566765578635015,
"grad_norm": 5.0572919845581055,
"learning_rate": 1.5e-05,
"loss": 0.3198,
"step": 255
},
{
"epoch": 0.7655786350148368,
"grad_norm": 9.763797760009766,
"learning_rate": 1.5e-05,
"loss": 0.3038,
"step": 258
},
{
"epoch": 0.7744807121661721,
"grad_norm": 4.699306488037109,
"learning_rate": 1.5e-05,
"loss": 0.3071,
"step": 261
},
{
"epoch": 0.7833827893175074,
"grad_norm": 6.758116245269775,
"learning_rate": 1.5e-05,
"loss": 0.3079,
"step": 264
},
{
"epoch": 0.7922848664688428,
"grad_norm": 9.004244804382324,
"learning_rate": 1.5e-05,
"loss": 0.3114,
"step": 267
},
{
"epoch": 0.8011869436201781,
"grad_norm": 10.923787117004395,
"learning_rate": 1.5e-05,
"loss": 0.3214,
"step": 270
},
{
"epoch": 0.8100890207715133,
"grad_norm": 4.750248432159424,
"learning_rate": 1.5e-05,
"loss": 0.3213,
"step": 273
},
{
"epoch": 0.8189910979228486,
"grad_norm": 6.5013346672058105,
"learning_rate": 1.5e-05,
"loss": 0.3213,
"step": 276
},
{
"epoch": 0.827893175074184,
"grad_norm": 14.487788200378418,
"learning_rate": 1.5e-05,
"loss": 0.3117,
"step": 279
},
{
"epoch": 0.8367952522255193,
"grad_norm": 4.58863639831543,
"learning_rate": 1.5e-05,
"loss": 0.325,
"step": 282
},
{
"epoch": 0.8456973293768546,
"grad_norm": 5.803460597991943,
"learning_rate": 1.5e-05,
"loss": 0.3095,
"step": 285
},
{
"epoch": 0.8545994065281899,
"grad_norm": 6.8022871017456055,
"learning_rate": 1.5e-05,
"loss": 0.3279,
"step": 288
},
{
"epoch": 0.8635014836795252,
"grad_norm": 11.592184066772461,
"learning_rate": 1.5e-05,
"loss": 0.3211,
"step": 291
},
{
"epoch": 0.8724035608308606,
"grad_norm": 4.380642890930176,
"learning_rate": 1.5e-05,
"loss": 0.3121,
"step": 294
},
{
"epoch": 0.8813056379821959,
"grad_norm": 10.14802360534668,
"learning_rate": 1.5e-05,
"loss": 0.3147,
"step": 297
},
{
"epoch": 0.8902077151335311,
"grad_norm": 6.102616786956787,
"learning_rate": 1.5e-05,
"loss": 0.327,
"step": 300
},
{
"epoch": 0.8991097922848664,
"grad_norm": 6.485515117645264,
"learning_rate": 1.5e-05,
"loss": 0.3165,
"step": 303
},
{
"epoch": 0.9080118694362018,
"grad_norm": 3.8234357833862305,
"learning_rate": 1.5e-05,
"loss": 0.316,
"step": 306
},
{
"epoch": 0.9169139465875371,
"grad_norm": 9.781375885009766,
"learning_rate": 1.5e-05,
"loss": 0.3147,
"step": 309
},
{
"epoch": 0.9258160237388724,
"grad_norm": 8.865571975708008,
"learning_rate": 1.5e-05,
"loss": 0.3178,
"step": 312
},
{
"epoch": 0.9347181008902077,
"grad_norm": 6.27769660949707,
"learning_rate": 1.5e-05,
"loss": 0.3152,
"step": 315
},
{
"epoch": 0.9436201780415431,
"grad_norm": 5.6717143058776855,
"learning_rate": 1.5e-05,
"loss": 0.3062,
"step": 318
},
{
"epoch": 0.9525222551928784,
"grad_norm": 14.846003532409668,
"learning_rate": 1.5e-05,
"loss": 0.3204,
"step": 321
},
{
"epoch": 0.9614243323442137,
"grad_norm": 7.501077651977539,
"learning_rate": 1.5e-05,
"loss": 0.307,
"step": 324
},
{
"epoch": 0.9703264094955489,
"grad_norm": 7.450833320617676,
"learning_rate": 1.5e-05,
"loss": 0.313,
"step": 327
},
{
"epoch": 0.9792284866468842,
"grad_norm": 4.422346591949463,
"learning_rate": 1.5e-05,
"loss": 0.3067,
"step": 330
},
{
"epoch": 0.9881305637982196,
"grad_norm": 5.954162120819092,
"learning_rate": 1.5e-05,
"loss": 0.3215,
"step": 333
},
{
"epoch": 0.9970326409495549,
"grad_norm": 4.766922950744629,
"learning_rate": 1.5e-05,
"loss": 0.3139,
"step": 336
},
{
"epoch": 1.0,
"eval_loss": 0.6956011056900024,
"eval_runtime": 297.7711,
"eval_samples_per_second": 5.178,
"eval_steps_per_second": 0.648,
"step": 337
},
{
"epoch": 1.0059347181008902,
"grad_norm": 7.0965895652771,
"learning_rate": 1.5e-05,
"loss": 0.3104,
"step": 339
},
{
"epoch": 1.0148367952522255,
"grad_norm": 7.9634599685668945,
"learning_rate": 1.5e-05,
"loss": 0.3082,
"step": 342
},
{
"epoch": 1.0237388724035608,
"grad_norm": 8.5128755569458,
"learning_rate": 1.5e-05,
"loss": 0.3211,
"step": 345
},
{
"epoch": 1.032640949554896,
"grad_norm": 5.733129501342773,
"learning_rate": 1.5e-05,
"loss": 0.3236,
"step": 348
},
{
"epoch": 1.0415430267062316,
"grad_norm": 8.197546005249023,
"learning_rate": 1.5e-05,
"loss": 0.3136,
"step": 351
},
{
"epoch": 1.0504451038575668,
"grad_norm": 11.312963485717773,
"learning_rate": 1.5e-05,
"loss": 0.3274,
"step": 354
},
{
"epoch": 1.0593471810089021,
"grad_norm": 4.885214328765869,
"learning_rate": 1.5e-05,
"loss": 0.3268,
"step": 357
},
{
"epoch": 1.0682492581602374,
"grad_norm": 7.366455078125,
"learning_rate": 1.5e-05,
"loss": 0.3214,
"step": 360
},
{
"epoch": 1.0771513353115727,
"grad_norm": 7.0693559646606445,
"learning_rate": 1.5e-05,
"loss": 0.3262,
"step": 363
},
{
"epoch": 1.086053412462908,
"grad_norm": 12.16964054107666,
"learning_rate": 1.5e-05,
"loss": 0.3232,
"step": 366
},
{
"epoch": 1.0949554896142433,
"grad_norm": 6.702571868896484,
"learning_rate": 1.5e-05,
"loss": 0.3049,
"step": 369
},
{
"epoch": 1.1038575667655786,
"grad_norm": 8.25865650177002,
"learning_rate": 1.5e-05,
"loss": 0.3086,
"step": 372
},
{
"epoch": 1.1127596439169138,
"grad_norm": 10.963550567626953,
"learning_rate": 1.5e-05,
"loss": 0.3187,
"step": 375
},
{
"epoch": 1.1216617210682494,
"grad_norm": 10.957636833190918,
"learning_rate": 1.5e-05,
"loss": 0.3119,
"step": 378
},
{
"epoch": 1.1305637982195846,
"grad_norm": 4.481369495391846,
"learning_rate": 1.5e-05,
"loss": 0.3153,
"step": 381
},
{
"epoch": 1.13946587537092,
"grad_norm": 7.9678120613098145,
"learning_rate": 1.5e-05,
"loss": 0.3143,
"step": 384
},
{
"epoch": 1.1483679525222552,
"grad_norm": 10.013398170471191,
"learning_rate": 1.5e-05,
"loss": 0.3146,
"step": 387
},
{
"epoch": 1.1572700296735905,
"grad_norm": 9.361319541931152,
"learning_rate": 1.5e-05,
"loss": 0.3069,
"step": 390
},
{
"epoch": 1.1661721068249258,
"grad_norm": 7.185680866241455,
"learning_rate": 1.5e-05,
"loss": 0.3198,
"step": 393
},
{
"epoch": 1.175074183976261,
"grad_norm": 9.780238151550293,
"learning_rate": 1.5e-05,
"loss": 0.3173,
"step": 396
},
{
"epoch": 1.1839762611275964,
"grad_norm": 6.236032009124756,
"learning_rate": 1.5e-05,
"loss": 0.3096,
"step": 399
},
{
"epoch": 1.1928783382789319,
"grad_norm": 6.732054710388184,
"learning_rate": 1.5e-05,
"loss": 0.3086,
"step": 402
},
{
"epoch": 1.2017804154302671,
"grad_norm": 8.902305603027344,
"learning_rate": 1.5e-05,
"loss": 0.3074,
"step": 405
},
{
"epoch": 1.2106824925816024,
"grad_norm": 8.529496192932129,
"learning_rate": 1.5e-05,
"loss": 0.3208,
"step": 408
},
{
"epoch": 1.2195845697329377,
"grad_norm": 10.779397964477539,
"learning_rate": 1.5e-05,
"loss": 0.2997,
"step": 411
},
{
"epoch": 1.228486646884273,
"grad_norm": 5.797762393951416,
"learning_rate": 1.5e-05,
"loss": 0.2977,
"step": 414
},
{
"epoch": 1.2373887240356083,
"grad_norm": 10.339754104614258,
"learning_rate": 1.5e-05,
"loss": 0.3099,
"step": 417
},
{
"epoch": 1.2462908011869436,
"grad_norm": 6.894352436065674,
"learning_rate": 1.5e-05,
"loss": 0.3027,
"step": 420
},
{
"epoch": 1.2551928783382789,
"grad_norm": 10.406209945678711,
"learning_rate": 1.5e-05,
"loss": 0.318,
"step": 423
},
{
"epoch": 1.2640949554896141,
"grad_norm": 4.105279922485352,
"learning_rate": 1.5e-05,
"loss": 0.3271,
"step": 426
},
{
"epoch": 1.2729970326409497,
"grad_norm": 9.26810073852539,
"learning_rate": 1.5e-05,
"loss": 0.3052,
"step": 429
},
{
"epoch": 1.281899109792285,
"grad_norm": 11.131587028503418,
"learning_rate": 1.5e-05,
"loss": 0.315,
"step": 432
},
{
"epoch": 1.2908011869436202,
"grad_norm": 7.997912883758545,
"learning_rate": 1.5e-05,
"loss": 0.3037,
"step": 435
},
{
"epoch": 1.2997032640949555,
"grad_norm": 4.264193058013916,
"learning_rate": 1.5e-05,
"loss": 0.2967,
"step": 438
},
{
"epoch": 1.3086053412462908,
"grad_norm": 6.291212558746338,
"learning_rate": 1.5e-05,
"loss": 0.3078,
"step": 441
},
{
"epoch": 1.317507418397626,
"grad_norm": 10.159900665283203,
"learning_rate": 1.5e-05,
"loss": 0.2994,
"step": 444
},
{
"epoch": 1.3264094955489614,
"grad_norm": 10.216263771057129,
"learning_rate": 1.5e-05,
"loss": 0.3244,
"step": 447
},
{
"epoch": 1.3353115727002967,
"grad_norm": 7.566501617431641,
"learning_rate": 1.5e-05,
"loss": 0.31,
"step": 450
},
{
"epoch": 1.344213649851632,
"grad_norm": 5.979765892028809,
"learning_rate": 1.5e-05,
"loss": 0.3069,
"step": 453
},
{
"epoch": 1.3531157270029674,
"grad_norm": 6.646083354949951,
"learning_rate": 1.5e-05,
"loss": 0.3026,
"step": 456
},
{
"epoch": 1.3620178041543027,
"grad_norm": 6.50187349319458,
"learning_rate": 1.5e-05,
"loss": 0.3065,
"step": 459
},
{
"epoch": 1.370919881305638,
"grad_norm": 4.8104705810546875,
"learning_rate": 1.5e-05,
"loss": 0.307,
"step": 462
},
{
"epoch": 1.3798219584569733,
"grad_norm": 4.24050235748291,
"learning_rate": 1.5e-05,
"loss": 0.3011,
"step": 465
},
{
"epoch": 1.3887240356083086,
"grad_norm": 7.853260040283203,
"learning_rate": 1.5e-05,
"loss": 0.3051,
"step": 468
},
{
"epoch": 1.3976261127596439,
"grad_norm": 6.0949602127075195,
"learning_rate": 1.5e-05,
"loss": 0.2931,
"step": 471
},
{
"epoch": 1.4065281899109792,
"grad_norm": 11.793612480163574,
"learning_rate": 1.5e-05,
"loss": 0.2982,
"step": 474
},
{
"epoch": 1.4154302670623147,
"grad_norm": 8.261275291442871,
"learning_rate": 1.5e-05,
"loss": 0.2966,
"step": 477
},
{
"epoch": 1.4243323442136497,
"grad_norm": 6.895599365234375,
"learning_rate": 1.5e-05,
"loss": 0.3195,
"step": 480
},
{
"epoch": 1.4332344213649852,
"grad_norm": 5.414015293121338,
"learning_rate": 1.5e-05,
"loss": 0.3063,
"step": 483
},
{
"epoch": 1.4421364985163205,
"grad_norm": 4.563000202178955,
"learning_rate": 1.5e-05,
"loss": 0.3079,
"step": 486
},
{
"epoch": 1.4510385756676558,
"grad_norm": 7.205160617828369,
"learning_rate": 1.5e-05,
"loss": 0.3204,
"step": 489
},
{
"epoch": 1.459940652818991,
"grad_norm": 7.146437168121338,
"learning_rate": 1.5e-05,
"loss": 0.3049,
"step": 492
},
{
"epoch": 1.4688427299703264,
"grad_norm": 8.912725448608398,
"learning_rate": 1.5e-05,
"loss": 0.3004,
"step": 495
},
{
"epoch": 1.4777448071216617,
"grad_norm": 5.934146881103516,
"learning_rate": 1.5e-05,
"loss": 0.3053,
"step": 498
},
{
"epoch": 1.486646884272997,
"grad_norm": 7.54482889175415,
"learning_rate": 1.5e-05,
"loss": 0.2962,
"step": 501
},
{
"epoch": 1.4955489614243325,
"grad_norm": 11.391508102416992,
"learning_rate": 1.5e-05,
"loss": 0.3291,
"step": 504
},
{
"epoch": 1.5044510385756675,
"grad_norm": 9.863611221313477,
"learning_rate": 1.5e-05,
"loss": 0.3068,
"step": 507
},
{
"epoch": 1.513353115727003,
"grad_norm": 7.5741376876831055,
"learning_rate": 1.5e-05,
"loss": 0.3036,
"step": 510
},
{
"epoch": 1.5222551928783383,
"grad_norm": 11.626495361328125,
"learning_rate": 1.5e-05,
"loss": 0.3131,
"step": 513
},
{
"epoch": 1.5311572700296736,
"grad_norm": 4.790311813354492,
"learning_rate": 1.5e-05,
"loss": 0.3126,
"step": 516
},
{
"epoch": 1.540059347181009,
"grad_norm": 5.693728446960449,
"learning_rate": 1.5e-05,
"loss": 0.3221,
"step": 519
},
{
"epoch": 1.5489614243323442,
"grad_norm": 9.541658401489258,
"learning_rate": 1.5e-05,
"loss": 0.3186,
"step": 522
},
{
"epoch": 1.5578635014836797,
"grad_norm": 10.08277416229248,
"learning_rate": 1.5e-05,
"loss": 0.3094,
"step": 525
},
{
"epoch": 1.5667655786350148,
"grad_norm": 10.004911422729492,
"learning_rate": 1.5e-05,
"loss": 0.3081,
"step": 528
},
{
"epoch": 1.5756676557863503,
"grad_norm": 4.247671127319336,
"learning_rate": 1.5e-05,
"loss": 0.3173,
"step": 531
},
{
"epoch": 1.5845697329376853,
"grad_norm": 6.010837078094482,
"learning_rate": 1.5e-05,
"loss": 0.314,
"step": 534
},
{
"epoch": 1.5934718100890208,
"grad_norm": 10.42171859741211,
"learning_rate": 1.5e-05,
"loss": 0.3111,
"step": 537
},
{
"epoch": 1.6023738872403561,
"grad_norm": 11.672240257263184,
"learning_rate": 1.5e-05,
"loss": 0.3263,
"step": 540
},
{
"epoch": 1.6112759643916914,
"grad_norm": 9.143010139465332,
"learning_rate": 1.5e-05,
"loss": 0.2983,
"step": 543
},
{
"epoch": 1.6201780415430267,
"grad_norm": 7.786658763885498,
"learning_rate": 1.5e-05,
"loss": 0.3089,
"step": 546
},
{
"epoch": 1.629080118694362,
"grad_norm": 5.973340034484863,
"learning_rate": 1.5e-05,
"loss": 0.303,
"step": 549
},
{
"epoch": 1.6379821958456975,
"grad_norm": 4.11182975769043,
"learning_rate": 1.5e-05,
"loss": 0.3059,
"step": 552
},
{
"epoch": 1.6468842729970326,
"grad_norm": 6.210434913635254,
"learning_rate": 1.5e-05,
"loss": 0.3133,
"step": 555
},
{
"epoch": 1.655786350148368,
"grad_norm": 11.501874923706055,
"learning_rate": 1.5e-05,
"loss": 0.3078,
"step": 558
},
{
"epoch": 1.6646884272997031,
"grad_norm": 8.35253620147705,
"learning_rate": 1.5e-05,
"loss": 0.3147,
"step": 561
},
{
"epoch": 1.6735905044510386,
"grad_norm": 6.669034957885742,
"learning_rate": 1.5e-05,
"loss": 0.3104,
"step": 564
},
{
"epoch": 1.682492581602374,
"grad_norm": 13.310565948486328,
"learning_rate": 1.5e-05,
"loss": 0.3172,
"step": 567
},
{
"epoch": 1.6913946587537092,
"grad_norm": 6.960197448730469,
"learning_rate": 1.5e-05,
"loss": 0.3192,
"step": 570
},
{
"epoch": 1.7002967359050445,
"grad_norm": 10.452018737792969,
"learning_rate": 1.5e-05,
"loss": 0.3072,
"step": 573
},
{
"epoch": 1.7091988130563798,
"grad_norm": 6.1864190101623535,
"learning_rate": 1.5e-05,
"loss": 0.3178,
"step": 576
},
{
"epoch": 1.7181008902077153,
"grad_norm": 6.356491565704346,
"learning_rate": 1.5e-05,
"loss": 0.3176,
"step": 579
},
{
"epoch": 1.7270029673590503,
"grad_norm": 5.232566833496094,
"learning_rate": 1.5e-05,
"loss": 0.2963,
"step": 582
},
{
"epoch": 1.7359050445103859,
"grad_norm": 3.332583427429199,
"learning_rate": 1.5e-05,
"loss": 0.2999,
"step": 585
},
{
"epoch": 1.744807121661721,
"grad_norm": 5.193176746368408,
"learning_rate": 1.5e-05,
"loss": 0.3192,
"step": 588
},
{
"epoch": 1.7537091988130564,
"grad_norm": 6.814889907836914,
"learning_rate": 1.5e-05,
"loss": 0.2958,
"step": 591
},
{
"epoch": 1.7626112759643917,
"grad_norm": 9.611870765686035,
"learning_rate": 1.5e-05,
"loss": 0.3091,
"step": 594
},
{
"epoch": 1.771513353115727,
"grad_norm": 7.733308792114258,
"learning_rate": 1.5e-05,
"loss": 0.3026,
"step": 597
},
{
"epoch": 1.7804154302670623,
"grad_norm": 5.742140769958496,
"learning_rate": 1.5e-05,
"loss": 0.2962,
"step": 600
},
{
"epoch": 1.7893175074183976,
"grad_norm": 11.053295135498047,
"learning_rate": 1.5e-05,
"loss": 0.2967,
"step": 603
},
{
"epoch": 1.798219584569733,
"grad_norm": 7.031610012054443,
"learning_rate": 1.5e-05,
"loss": 0.2927,
"step": 606
},
{
"epoch": 1.8071216617210681,
"grad_norm": 6.521071910858154,
"learning_rate": 1.5e-05,
"loss": 0.3139,
"step": 609
},
{
"epoch": 1.8160237388724036,
"grad_norm": 6.417489528656006,
"learning_rate": 1.5e-05,
"loss": 0.3081,
"step": 612
},
{
"epoch": 1.8249258160237387,
"grad_norm": 9.378142356872559,
"learning_rate": 1.5e-05,
"loss": 0.298,
"step": 615
},
{
"epoch": 1.8338278931750742,
"grad_norm": 8.447271347045898,
"learning_rate": 1.5e-05,
"loss": 0.3141,
"step": 618
},
{
"epoch": 1.8427299703264095,
"grad_norm": 10.930451393127441,
"learning_rate": 1.5e-05,
"loss": 0.3021,
"step": 621
},
{
"epoch": 1.8516320474777448,
"grad_norm": 8.880478858947754,
"learning_rate": 1.5e-05,
"loss": 0.3136,
"step": 624
},
{
"epoch": 1.86053412462908,
"grad_norm": 5.905041217803955,
"learning_rate": 1.5e-05,
"loss": 0.3191,
"step": 627
},
{
"epoch": 1.8694362017804154,
"grad_norm": 6.188875675201416,
"learning_rate": 1.5e-05,
"loss": 0.3283,
"step": 630
},
{
"epoch": 1.8783382789317509,
"grad_norm": 11.83849811553955,
"learning_rate": 1.5e-05,
"loss": 0.3235,
"step": 633
},
{
"epoch": 1.887240356083086,
"grad_norm": 7.689598560333252,
"learning_rate": 1.5e-05,
"loss": 0.3162,
"step": 636
},
{
"epoch": 1.8961424332344214,
"grad_norm": 3.9637110233306885,
"learning_rate": 1.5e-05,
"loss": 0.3127,
"step": 639
},
{
"epoch": 1.9050445103857567,
"grad_norm": 13.587063789367676,
"learning_rate": 1.5e-05,
"loss": 0.3268,
"step": 642
},
{
"epoch": 1.913946587537092,
"grad_norm": 7.881510257720947,
"learning_rate": 1.5e-05,
"loss": 0.3038,
"step": 645
},
{
"epoch": 1.9228486646884273,
"grad_norm": 6.357386112213135,
"learning_rate": 1.5e-05,
"loss": 0.3097,
"step": 648
},
{
"epoch": 1.9317507418397626,
"grad_norm": 6.852357387542725,
"learning_rate": 1.5e-05,
"loss": 0.3056,
"step": 651
},
{
"epoch": 1.9406528189910979,
"grad_norm": 6.557038307189941,
"learning_rate": 1.5e-05,
"loss": 0.3209,
"step": 654
},
{
"epoch": 1.9495548961424332,
"grad_norm": 7.013545036315918,
"learning_rate": 1.5e-05,
"loss": 0.3237,
"step": 657
},
{
"epoch": 1.9584569732937687,
"grad_norm": 9.902325630187988,
"learning_rate": 1.5e-05,
"loss": 0.3166,
"step": 660
},
{
"epoch": 1.9673590504451037,
"grad_norm": 6.723764896392822,
"learning_rate": 1.5e-05,
"loss": 0.3198,
"step": 663
},
{
"epoch": 1.9762611275964392,
"grad_norm": 9.627095222473145,
"learning_rate": 1.5e-05,
"loss": 0.321,
"step": 666
},
{
"epoch": 1.9851632047477745,
"grad_norm": 8.035420417785645,
"learning_rate": 1.5e-05,
"loss": 0.31,
"step": 669
},
{
"epoch": 1.9940652818991098,
"grad_norm": 10.477612495422363,
"learning_rate": 1.5e-05,
"loss": 0.2995,
"step": 672
},
{
"epoch": 2.0,
"eval_loss": 0.6920689940452576,
"eval_runtime": 296.7344,
"eval_samples_per_second": 5.197,
"eval_steps_per_second": 0.65,
"step": 674
},
{
"epoch": 2.0029673590504453,
"grad_norm": 4.917605400085449,
"learning_rate": 1.5e-05,
"loss": 0.3129,
"step": 675
},
{
"epoch": 2.0118694362017804,
"grad_norm": 14.471161842346191,
"learning_rate": 1.5e-05,
"loss": 0.3121,
"step": 678
},
{
"epoch": 2.020771513353116,
"grad_norm": 10.123734474182129,
"learning_rate": 1.5e-05,
"loss": 0.3136,
"step": 681
},
{
"epoch": 2.029673590504451,
"grad_norm": 7.0058794021606445,
"learning_rate": 1.5e-05,
"loss": 0.314,
"step": 684
},
{
"epoch": 2.0385756676557865,
"grad_norm": 5.461868762969971,
"learning_rate": 1.5e-05,
"loss": 0.307,
"step": 687
},
{
"epoch": 2.0474777448071215,
"grad_norm": 5.689599514007568,
"learning_rate": 1.5e-05,
"loss": 0.3053,
"step": 690
},
{
"epoch": 2.056379821958457,
"grad_norm": 8.585354804992676,
"learning_rate": 1.5e-05,
"loss": 0.3041,
"step": 693
},
{
"epoch": 2.065281899109792,
"grad_norm": 4.620091915130615,
"learning_rate": 1.5e-05,
"loss": 0.2921,
"step": 696
},
{
"epoch": 2.0741839762611276,
"grad_norm": 6.909940719604492,
"learning_rate": 1.5e-05,
"loss": 0.3087,
"step": 699
},
{
"epoch": 2.083086053412463,
"grad_norm": 5.3829426765441895,
"learning_rate": 1.5e-05,
"loss": 0.294,
"step": 702
},
{
"epoch": 2.091988130563798,
"grad_norm": 10.095771789550781,
"learning_rate": 1.5e-05,
"loss": 0.3027,
"step": 705
},
{
"epoch": 2.1008902077151337,
"grad_norm": 7.622206687927246,
"learning_rate": 1.5e-05,
"loss": 0.2936,
"step": 708
},
{
"epoch": 2.1097922848664687,
"grad_norm": 9.839076042175293,
"learning_rate": 1.5e-05,
"loss": 0.3093,
"step": 711
},
{
"epoch": 2.1186943620178043,
"grad_norm": 13.05020809173584,
"learning_rate": 1.5e-05,
"loss": 0.3076,
"step": 714
},
{
"epoch": 2.1275964391691393,
"grad_norm": 4.418980598449707,
"learning_rate": 1.5e-05,
"loss": 0.3043,
"step": 717
},
{
"epoch": 2.136498516320475,
"grad_norm": 3.569221019744873,
"learning_rate": 1.5e-05,
"loss": 0.3083,
"step": 720
},
{
"epoch": 2.14540059347181,
"grad_norm": 6.468089580535889,
"learning_rate": 1.5e-05,
"loss": 0.3016,
"step": 723
},
{
"epoch": 2.1543026706231454,
"grad_norm": 8.789352416992188,
"learning_rate": 1.5e-05,
"loss": 0.3022,
"step": 726
},
{
"epoch": 2.163204747774481,
"grad_norm": 8.202059745788574,
"learning_rate": 1.5e-05,
"loss": 0.311,
"step": 729
},
{
"epoch": 2.172106824925816,
"grad_norm": 6.959595203399658,
"learning_rate": 1.5e-05,
"loss": 0.2947,
"step": 732
},
{
"epoch": 2.1810089020771515,
"grad_norm": 11.653180122375488,
"learning_rate": 1.5e-05,
"loss": 0.3094,
"step": 735
},
{
"epoch": 2.1899109792284865,
"grad_norm": 8.507452964782715,
"learning_rate": 1.5e-05,
"loss": 0.3094,
"step": 738
},
{
"epoch": 2.198813056379822,
"grad_norm": 3.680802583694458,
"learning_rate": 1.5e-05,
"loss": 0.3054,
"step": 741
},
{
"epoch": 2.207715133531157,
"grad_norm": 9.95173454284668,
"learning_rate": 1.5e-05,
"loss": 0.2928,
"step": 744
},
{
"epoch": 2.2166172106824926,
"grad_norm": 10.835822105407715,
"learning_rate": 1.5e-05,
"loss": 0.2882,
"step": 747
},
{
"epoch": 2.2255192878338277,
"grad_norm": 12.096845626831055,
"learning_rate": 1.5e-05,
"loss": 0.308,
"step": 750
},
{
"epoch": 2.234421364985163,
"grad_norm": 4.49980354309082,
"learning_rate": 1.5e-05,
"loss": 0.3173,
"step": 753
},
{
"epoch": 2.2433234421364987,
"grad_norm": 9.042285919189453,
"learning_rate": 1.5e-05,
"loss": 0.3073,
"step": 756
},
{
"epoch": 2.2522255192878338,
"grad_norm": 5.250131130218506,
"learning_rate": 1.5e-05,
"loss": 0.2966,
"step": 759
},
{
"epoch": 2.2611275964391693,
"grad_norm": 9.235132217407227,
"learning_rate": 1.5e-05,
"loss": 0.3168,
"step": 762
},
{
"epoch": 2.2700296735905043,
"grad_norm": 7.330996513366699,
"learning_rate": 1.5e-05,
"loss": 0.3027,
"step": 765
},
{
"epoch": 2.27893175074184,
"grad_norm": 5.805144309997559,
"learning_rate": 1.5e-05,
"loss": 0.3232,
"step": 768
},
{
"epoch": 2.287833827893175,
"grad_norm": 10.95457649230957,
"learning_rate": 1.5e-05,
"loss": 0.307,
"step": 771
},
{
"epoch": 2.2967359050445104,
"grad_norm": 5.920906066894531,
"learning_rate": 1.5e-05,
"loss": 0.3052,
"step": 774
},
{
"epoch": 2.3056379821958455,
"grad_norm": 7.4418511390686035,
"learning_rate": 1.5e-05,
"loss": 0.3195,
"step": 777
},
{
"epoch": 2.314540059347181,
"grad_norm": 9.739228248596191,
"learning_rate": 1.5e-05,
"loss": 0.3146,
"step": 780
},
{
"epoch": 2.3234421364985165,
"grad_norm": 11.025596618652344,
"learning_rate": 1.5e-05,
"loss": 0.3061,
"step": 783
},
{
"epoch": 2.3323442136498516,
"grad_norm": 5.031250953674316,
"learning_rate": 1.5e-05,
"loss": 0.3128,
"step": 786
},
{
"epoch": 2.341246290801187,
"grad_norm": 9.482969284057617,
"learning_rate": 1.5e-05,
"loss": 0.3067,
"step": 789
},
{
"epoch": 2.350148367952522,
"grad_norm": 4.4395270347595215,
"learning_rate": 1.5e-05,
"loss": 0.2972,
"step": 792
},
{
"epoch": 2.3590504451038576,
"grad_norm": 4.755709171295166,
"learning_rate": 1.5e-05,
"loss": 0.3078,
"step": 795
},
{
"epoch": 2.3679525222551927,
"grad_norm": 6.278073310852051,
"learning_rate": 1.5e-05,
"loss": 0.3107,
"step": 798
},
{
"epoch": 2.376854599406528,
"grad_norm": 7.922651767730713,
"learning_rate": 1.5e-05,
"loss": 0.3043,
"step": 801
},
{
"epoch": 2.3857566765578637,
"grad_norm": 9.521344184875488,
"learning_rate": 1.5e-05,
"loss": 0.3158,
"step": 804
},
{
"epoch": 2.394658753709199,
"grad_norm": 12.499236106872559,
"learning_rate": 1.5e-05,
"loss": 0.3086,
"step": 807
},
{
"epoch": 2.4035608308605343,
"grad_norm": 6.426900863647461,
"learning_rate": 1.5e-05,
"loss": 0.3126,
"step": 810
},
{
"epoch": 2.4124629080118694,
"grad_norm": 8.431981086730957,
"learning_rate": 1.5e-05,
"loss": 0.3,
"step": 813
},
{
"epoch": 2.421364985163205,
"grad_norm": 12.86776351928711,
"learning_rate": 1.5e-05,
"loss": 0.2995,
"step": 816
},
{
"epoch": 2.43026706231454,
"grad_norm": 6.822738170623779,
"learning_rate": 1.5e-05,
"loss": 0.3115,
"step": 819
},
{
"epoch": 2.4391691394658754,
"grad_norm": 6.153812408447266,
"learning_rate": 1.5e-05,
"loss": 0.297,
"step": 822
},
{
"epoch": 2.4480712166172105,
"grad_norm": 11.699315071105957,
"learning_rate": 1.5e-05,
"loss": 0.2951,
"step": 825
},
{
"epoch": 2.456973293768546,
"grad_norm": 5.795748710632324,
"learning_rate": 1.5e-05,
"loss": 0.3062,
"step": 828
},
{
"epoch": 2.465875370919881,
"grad_norm": 6.4195756912231445,
"learning_rate": 1.5e-05,
"loss": 0.2938,
"step": 831
},
{
"epoch": 2.4747774480712166,
"grad_norm": 6.024349212646484,
"learning_rate": 1.5e-05,
"loss": 0.2887,
"step": 834
},
{
"epoch": 2.483679525222552,
"grad_norm": 5.880214691162109,
"learning_rate": 1.5e-05,
"loss": 0.2943,
"step": 837
},
{
"epoch": 2.492581602373887,
"grad_norm": 18.98047637939453,
"learning_rate": 1.5e-05,
"loss": 0.2903,
"step": 840
},
{
"epoch": 2.5014836795252227,
"grad_norm": 14.550153732299805,
"learning_rate": 1.5e-05,
"loss": 0.2999,
"step": 843
},
{
"epoch": 2.5103857566765577,
"grad_norm": 11.062093734741211,
"learning_rate": 1.5e-05,
"loss": 0.3281,
"step": 846
},
{
"epoch": 2.5192878338278932,
"grad_norm": 6.1865644454956055,
"learning_rate": 1.5e-05,
"loss": 0.3073,
"step": 849
},
{
"epoch": 2.5281899109792283,
"grad_norm": 10.409070014953613,
"learning_rate": 1.5e-05,
"loss": 0.3155,
"step": 852
},
{
"epoch": 2.537091988130564,
"grad_norm": 12.40860366821289,
"learning_rate": 1.5e-05,
"loss": 0.3013,
"step": 855
},
{
"epoch": 2.5459940652818993,
"grad_norm": 6.20428466796875,
"learning_rate": 1.5e-05,
"loss": 0.3141,
"step": 858
},
{
"epoch": 2.5548961424332344,
"grad_norm": 4.158163547515869,
"learning_rate": 1.5e-05,
"loss": 0.307,
"step": 861
},
{
"epoch": 2.56379821958457,
"grad_norm": 7.828709602355957,
"learning_rate": 1.5e-05,
"loss": 0.3191,
"step": 864
},
{
"epoch": 2.572700296735905,
"grad_norm": 8.588981628417969,
"learning_rate": 1.5e-05,
"loss": 0.3237,
"step": 867
},
{
"epoch": 2.5816023738872405,
"grad_norm": 6.725210189819336,
"learning_rate": 1.5e-05,
"loss": 0.293,
"step": 870
},
{
"epoch": 2.5905044510385755,
"grad_norm": 8.876666069030762,
"learning_rate": 1.5e-05,
"loss": 0.3042,
"step": 873
},
{
"epoch": 2.599406528189911,
"grad_norm": 8.503588676452637,
"learning_rate": 1.5e-05,
"loss": 0.3058,
"step": 876
},
{
"epoch": 2.6083086053412465,
"grad_norm": 7.051385402679443,
"learning_rate": 1.5e-05,
"loss": 0.2856,
"step": 879
},
{
"epoch": 2.6172106824925816,
"grad_norm": 11.214133262634277,
"learning_rate": 1.5e-05,
"loss": 0.2899,
"step": 882
},
{
"epoch": 2.6261127596439167,
"grad_norm": 5.270874977111816,
"learning_rate": 1.5e-05,
"loss": 0.2947,
"step": 885
},
{
"epoch": 2.635014836795252,
"grad_norm": 13.623291015625,
"learning_rate": 1.5e-05,
"loss": 0.3001,
"step": 888
},
{
"epoch": 2.6439169139465877,
"grad_norm": 3.9485678672790527,
"learning_rate": 1.5e-05,
"loss": 0.3027,
"step": 891
},
{
"epoch": 2.6528189910979227,
"grad_norm": 7.7399725914001465,
"learning_rate": 1.5e-05,
"loss": 0.2988,
"step": 894
},
{
"epoch": 2.6617210682492582,
"grad_norm": 7.428469181060791,
"learning_rate": 1.5e-05,
"loss": 0.2896,
"step": 897
},
{
"epoch": 2.6706231454005933,
"grad_norm": 4.9085001945495605,
"learning_rate": 1.5e-05,
"loss": 0.2955,
"step": 900
},
{
"epoch": 2.679525222551929,
"grad_norm": 7.616215705871582,
"learning_rate": 1.5e-05,
"loss": 0.3143,
"step": 903
},
{
"epoch": 2.688427299703264,
"grad_norm": 6.225953102111816,
"learning_rate": 1.5e-05,
"loss": 0.3004,
"step": 906
},
{
"epoch": 2.6973293768545994,
"grad_norm": 5.675787448883057,
"learning_rate": 1.5e-05,
"loss": 0.2946,
"step": 909
},
{
"epoch": 2.706231454005935,
"grad_norm": 7.747137069702148,
"learning_rate": 1.5e-05,
"loss": 0.2966,
"step": 912
},
{
"epoch": 2.71513353115727,
"grad_norm": 12.72786808013916,
"learning_rate": 1.5e-05,
"loss": 0.3106,
"step": 915
},
{
"epoch": 2.7240356083086055,
"grad_norm": 7.423135280609131,
"learning_rate": 1.5e-05,
"loss": 0.2838,
"step": 918
},
{
"epoch": 2.7329376854599405,
"grad_norm": 6.8378520011901855,
"learning_rate": 1.5e-05,
"loss": 0.3165,
"step": 921
},
{
"epoch": 2.741839762611276,
"grad_norm": 5.68455696105957,
"learning_rate": 1.5e-05,
"loss": 0.3078,
"step": 924
},
{
"epoch": 2.750741839762611,
"grad_norm": 13.37850570678711,
"learning_rate": 1.5e-05,
"loss": 0.3005,
"step": 927
},
{
"epoch": 2.7596439169139466,
"grad_norm": 5.610422611236572,
"learning_rate": 1.5e-05,
"loss": 0.2948,
"step": 930
},
{
"epoch": 2.768545994065282,
"grad_norm": 9.621097564697266,
"learning_rate": 1.5e-05,
"loss": 0.307,
"step": 933
},
{
"epoch": 2.777448071216617,
"grad_norm": 4.709936141967773,
"learning_rate": 1.5e-05,
"loss": 0.3011,
"step": 936
},
{
"epoch": 2.7863501483679523,
"grad_norm": 7.198949813842773,
"learning_rate": 1.5e-05,
"loss": 0.2938,
"step": 939
},
{
"epoch": 2.7952522255192878,
"grad_norm": 6.532808303833008,
"learning_rate": 1.5e-05,
"loss": 0.3158,
"step": 942
},
{
"epoch": 2.8041543026706233,
"grad_norm": 10.170119285583496,
"learning_rate": 1.5e-05,
"loss": 0.2862,
"step": 945
},
{
"epoch": 2.8130563798219583,
"grad_norm": 7.333060264587402,
"learning_rate": 1.5e-05,
"loss": 0.2989,
"step": 948
},
{
"epoch": 2.821958456973294,
"grad_norm": 3.9618520736694336,
"learning_rate": 1.5e-05,
"loss": 0.2759,
"step": 951
},
{
"epoch": 2.8308605341246293,
"grad_norm": 5.956901550292969,
"learning_rate": 1.5e-05,
"loss": 0.294,
"step": 954
},
{
"epoch": 2.8397626112759644,
"grad_norm": 5.030998706817627,
"learning_rate": 1.5e-05,
"loss": 0.3016,
"step": 957
},
{
"epoch": 2.8486646884272995,
"grad_norm": 8.330857276916504,
"learning_rate": 1.5e-05,
"loss": 0.3029,
"step": 960
},
{
"epoch": 2.857566765578635,
"grad_norm": 10.079005241394043,
"learning_rate": 1.5e-05,
"loss": 0.2955,
"step": 963
},
{
"epoch": 2.8664688427299705,
"grad_norm": 9.091019630432129,
"learning_rate": 1.5e-05,
"loss": 0.2999,
"step": 966
},
{
"epoch": 2.8753709198813056,
"grad_norm": 7.372535705566406,
"learning_rate": 1.5e-05,
"loss": 0.2949,
"step": 969
},
{
"epoch": 2.884272997032641,
"grad_norm": 8.11223030090332,
"learning_rate": 1.5e-05,
"loss": 0.2852,
"step": 972
},
{
"epoch": 2.893175074183976,
"grad_norm": 3.835611343383789,
"learning_rate": 1.5e-05,
"loss": 0.2745,
"step": 975
},
{
"epoch": 2.9020771513353116,
"grad_norm": 11.748644828796387,
"learning_rate": 1.5e-05,
"loss": 0.2875,
"step": 978
},
{
"epoch": 2.9109792284866467,
"grad_norm": 14.599609375,
"learning_rate": 1.5e-05,
"loss": 0.2854,
"step": 981
},
{
"epoch": 2.919881305637982,
"grad_norm": 8.011322021484375,
"learning_rate": 1.5e-05,
"loss": 0.2924,
"step": 984
},
{
"epoch": 2.9287833827893177,
"grad_norm": 5.392467498779297,
"learning_rate": 1.5e-05,
"loss": 0.293,
"step": 987
},
{
"epoch": 2.9376854599406528,
"grad_norm": 10.867618560791016,
"learning_rate": 1.5e-05,
"loss": 0.3049,
"step": 990
},
{
"epoch": 2.9465875370919883,
"grad_norm": 11.08749771118164,
"learning_rate": 1.5e-05,
"loss": 0.2943,
"step": 993
},
{
"epoch": 2.9554896142433233,
"grad_norm": 7.80095100402832,
"learning_rate": 1.5e-05,
"loss": 0.2984,
"step": 996
},
{
"epoch": 2.964391691394659,
"grad_norm": 6.650088310241699,
"learning_rate": 1.5e-05,
"loss": 0.3048,
"step": 999
},
{
"epoch": 2.973293768545994,
"grad_norm": 9.152456283569336,
"learning_rate": 1.5e-05,
"loss": 0.2985,
"step": 1002
},
{
"epoch": 2.9821958456973294,
"grad_norm": 10.47088623046875,
"learning_rate": 1.5e-05,
"loss": 0.2934,
"step": 1005
},
{
"epoch": 2.991097922848665,
"grad_norm": 3.175657272338867,
"learning_rate": 1.5e-05,
"loss": 0.2741,
"step": 1008
},
{
"epoch": 3.0,
"grad_norm": 10.17156982421875,
"learning_rate": 1.5e-05,
"loss": 0.2926,
"step": 1011
},
{
"epoch": 3.0,
"eval_loss": 0.6839648485183716,
"eval_runtime": 298.4807,
"eval_samples_per_second": 5.166,
"eval_steps_per_second": 0.647,
"step": 1011
}
],
"logging_steps": 3,
"max_steps": 3370,
"num_input_tokens_seen": 0,
"num_train_epochs": 10,
"save_steps": 500,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 3,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 0
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}