{ "best_global_step": null, "best_metric": 0.6839648485183716, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 1011, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.002967359050445104, "grad_norm": 15.052371978759766, "learning_rate": 0.0, "loss": 0.3511, "step": 1 }, { "epoch": 0.008902077151335312, "grad_norm": 11.145326614379883, "learning_rate": 3.2123800577354604e-06, "loss": 0.3486, "step": 3 }, { "epoch": 0.017804154302670624, "grad_norm": 7.514125823974609, "learning_rate": 5.239166215940359e-06, "loss": 0.3444, "step": 6 }, { "epoch": 0.026706231454005934, "grad_norm": 8.797248840332031, "learning_rate": 6.424760115470921e-06, "loss": 0.3284, "step": 9 }, { "epoch": 0.03560830860534125, "grad_norm": 3.7588727474212646, "learning_rate": 7.265952374145257e-06, "loss": 0.3375, "step": 12 }, { "epoch": 0.04451038575667656, "grad_norm": 7.172510623931885, "learning_rate": 7.918431780800236e-06, "loss": 0.3271, "step": 15 }, { "epoch": 0.05341246290801187, "grad_norm": 3.615478992462158, "learning_rate": 8.451546273675818e-06, "loss": 0.3277, "step": 18 }, { "epoch": 0.06231454005934718, "grad_norm": 6.7531232833862305, "learning_rate": 8.902288154930203e-06, "loss": 0.3343, "step": 21 }, { "epoch": 0.0712166172106825, "grad_norm": 6.248157501220703, "learning_rate": 9.292738532350157e-06, "loss": 0.3294, "step": 24 }, { "epoch": 0.08011869436201781, "grad_norm": 6.436707019805908, "learning_rate": 9.637140173206382e-06, "loss": 0.3358, "step": 27 }, { "epoch": 0.08902077151335312, "grad_norm": 9.06855583190918, "learning_rate": 9.945217939005136e-06, "loss": 0.3293, "step": 30 }, { "epoch": 0.09792284866468842, "grad_norm": 7.107985973358154, "learning_rate": 1.0223908177645902e-05, "loss": 0.3111, "step": 33 }, { "epoch": 0.10682492581602374, "grad_norm": 6.3856520652771, "learning_rate": 1.0478332431880717e-05, "loss": 0.3251, "step": 36 }, { "epoch": 0.11572700296735905, "grad_norm": 4.95862340927124, "learning_rate": 1.0712380057735461e-05, "loss": 0.3211, "step": 39 }, { "epoch": 0.12462908011869436, "grad_norm": 4.809103012084961, "learning_rate": 1.0929074313135101e-05, "loss": 0.3244, "step": 42 }, { "epoch": 0.13353115727002968, "grad_norm": 6.697277069091797, "learning_rate": 1.1130811838535696e-05, "loss": 0.3179, "step": 45 }, { "epoch": 0.142433234421365, "grad_norm": 12.093936920166016, "learning_rate": 1.1319524690555053e-05, "loss": 0.3339, "step": 48 }, { "epoch": 0.1513353115727003, "grad_norm": 8.427734375, "learning_rate": 1.1496793166558515e-05, "loss": 0.329, "step": 51 }, { "epoch": 0.16023738872403562, "grad_norm": 11.345624923706055, "learning_rate": 1.1663926331411281e-05, "loss": 0.3212, "step": 54 }, { "epoch": 0.16913946587537093, "grad_norm": 7.70440149307251, "learning_rate": 1.1822020743040672e-05, "loss": 0.3119, "step": 57 }, { "epoch": 0.17804154302670624, "grad_norm": 4.138779163360596, "learning_rate": 1.1972004097210032e-05, "loss": 0.3186, "step": 60 }, { "epoch": 0.18694362017804153, "grad_norm": 9.505159378051758, "learning_rate": 1.2114668212665663e-05, "loss": 0.3125, "step": 63 }, { "epoch": 0.19584569732937684, "grad_norm": 5.866057395935059, "learning_rate": 1.2250694335850798e-05, "loss": 0.3217, "step": 66 }, { "epoch": 0.20474777448071216, "grad_norm": 9.940937995910645, "learning_rate": 1.238067281605409e-05, "loss": 0.3295, "step": 69 }, { "epoch": 0.21364985163204747, "grad_norm": 8.819473266601562, "learning_rate": 1.2505118590085615e-05, "loss": 0.3351, "step": 72 }, { "epoch": 0.22255192878338279, "grad_norm": 9.78069019317627, "learning_rate": 1.262448350386501e-05, "loss": 0.3383, "step": 75 }, { "epoch": 0.2314540059347181, "grad_norm": 5.660183429718018, "learning_rate": 1.2739166215940359e-05, "loss": 0.3252, "step": 78 }, { "epoch": 0.2403560830860534, "grad_norm": 7.043887615203857, "learning_rate": 1.2849520230941842e-05, "loss": 0.3249, "step": 81 }, { "epoch": 0.24925816023738873, "grad_norm": 7.998411178588867, "learning_rate": 1.295586047134e-05, "loss": 0.3359, "step": 84 }, { "epoch": 0.258160237388724, "grad_norm": 11.106488227844238, "learning_rate": 1.3058468695482481e-05, "loss": 0.3211, "step": 87 }, { "epoch": 0.26706231454005935, "grad_norm": 5.680809497833252, "learning_rate": 1.3157597996740594e-05, "loss": 0.3152, "step": 90 }, { "epoch": 0.27596439169139464, "grad_norm": 6.006475925445557, "learning_rate": 1.3253476564657357e-05, "loss": 0.3171, "step": 93 }, { "epoch": 0.28486646884273, "grad_norm": 4.647056579589844, "learning_rate": 1.3346310848759951e-05, "loss": 0.31, "step": 96 }, { "epoch": 0.29376854599406527, "grad_norm": 4.605801582336426, "learning_rate": 1.343628823538136e-05, "loss": 0.3185, "step": 99 }, { "epoch": 0.3026706231454006, "grad_norm": 17.812042236328125, "learning_rate": 1.3523579324763411e-05, "loss": 0.3161, "step": 102 }, { "epoch": 0.3115727002967359, "grad_norm": 12.627123832702637, "learning_rate": 1.3608339877994978e-05, "loss": 0.329, "step": 105 }, { "epoch": 0.32047477744807124, "grad_norm": 7.308374404907227, "learning_rate": 1.3690712489616179e-05, "loss": 0.3179, "step": 108 }, { "epoch": 0.3293768545994065, "grad_norm": 5.522846698760986, "learning_rate": 1.3770828031006136e-05, "loss": 0.3245, "step": 111 }, { "epoch": 0.33827893175074186, "grad_norm": 8.475018501281738, "learning_rate": 1.384880690124557e-05, "loss": 0.3105, "step": 114 }, { "epoch": 0.34718100890207715, "grad_norm": 8.174044609069824, "learning_rate": 1.3924760115470921e-05, "loss": 0.3057, "step": 117 }, { "epoch": 0.3560830860534125, "grad_norm": 6.196917533874512, "learning_rate": 1.399879025541493e-05, "loss": 0.336, "step": 120 }, { "epoch": 0.3649851632047478, "grad_norm": 9.92663860321045, "learning_rate": 1.4070992302558296e-05, "loss": 0.3253, "step": 123 }, { "epoch": 0.37388724035608306, "grad_norm": 7.014742374420166, "learning_rate": 1.4141454370870561e-05, "loss": 0.3129, "step": 126 }, { "epoch": 0.3827893175074184, "grad_norm": 8.182991027832031, "learning_rate": 1.421025835332077e-05, "loss": 0.3329, "step": 129 }, { "epoch": 0.3916913946587537, "grad_norm": 7.044063091278076, "learning_rate": 1.4277480494055697e-05, "loss": 0.3265, "step": 132 }, { "epoch": 0.40059347181008903, "grad_norm": 7.129275321960449, "learning_rate": 1.4343191896271158e-05, "loss": 0.3259, "step": 135 }, { "epoch": 0.4094955489614243, "grad_norm": 6.028862476348877, "learning_rate": 1.4407458974258987e-05, "loss": 0.326, "step": 138 }, { "epoch": 0.41839762611275966, "grad_norm": 9.75017261505127, "learning_rate": 1.4470343856834936e-05, "loss": 0.3242, "step": 141 }, { "epoch": 0.42729970326409494, "grad_norm": 8.225783348083496, "learning_rate": 1.4531904748290513e-05, "loss": 0.3061, "step": 144 }, { "epoch": 0.4362017804154303, "grad_norm": 7.090129852294922, "learning_rate": 1.4592196252124945e-05, "loss": 0.3152, "step": 147 }, { "epoch": 0.44510385756676557, "grad_norm": 11.792762756347656, "learning_rate": 1.465126966206991e-05, "loss": 0.3258, "step": 150 }, { "epoch": 0.4540059347181009, "grad_norm": 8.907958984375, "learning_rate": 1.4709173224293973e-05, "loss": 0.3203, "step": 153 }, { "epoch": 0.4629080118694362, "grad_norm": 4.160986423492432, "learning_rate": 1.4765952374145259e-05, "loss": 0.3156, "step": 156 }, { "epoch": 0.47181008902077154, "grad_norm": 4.767416954040527, "learning_rate": 1.482164995034286e-05, "loss": 0.3238, "step": 159 }, { "epoch": 0.4807121661721068, "grad_norm": 6.091423511505127, "learning_rate": 1.4876306389146738e-05, "loss": 0.3186, "step": 162 }, { "epoch": 0.4896142433234421, "grad_norm": 9.760332107543945, "learning_rate": 1.4929959900710676e-05, "loss": 0.3111, "step": 165 }, { "epoch": 0.49851632047477745, "grad_norm": 7.750044822692871, "learning_rate": 1.4982646629544899e-05, "loss": 0.3209, "step": 168 }, { "epoch": 0.5074183976261127, "grad_norm": 10.224579811096191, "learning_rate": 1.5e-05, "loss": 0.3262, "step": 171 }, { "epoch": 0.516320474777448, "grad_norm": 8.861902236938477, "learning_rate": 1.5e-05, "loss": 0.3121, "step": 174 }, { "epoch": 0.5252225519287834, "grad_norm": 4.321209907531738, "learning_rate": 1.5e-05, "loss": 0.3133, "step": 177 }, { "epoch": 0.5341246290801187, "grad_norm": 4.978445529937744, "learning_rate": 1.5e-05, "loss": 0.3071, "step": 180 }, { "epoch": 0.543026706231454, "grad_norm": 10.364422798156738, "learning_rate": 1.5e-05, "loss": 0.3178, "step": 183 }, { "epoch": 0.5519287833827893, "grad_norm": 8.970362663269043, "learning_rate": 1.5e-05, "loss": 0.3212, "step": 186 }, { "epoch": 0.5608308605341247, "grad_norm": 6.0015645027160645, "learning_rate": 1.5e-05, "loss": 0.3184, "step": 189 }, { "epoch": 0.56973293768546, "grad_norm": 5.832813739776611, "learning_rate": 1.5e-05, "loss": 0.3211, "step": 192 }, { "epoch": 0.5786350148367952, "grad_norm": 7.478390693664551, "learning_rate": 1.5e-05, "loss": 0.3432, "step": 195 }, { "epoch": 0.5875370919881305, "grad_norm": 5.908259868621826, "learning_rate": 1.5e-05, "loss": 0.3287, "step": 198 }, { "epoch": 0.5964391691394659, "grad_norm": 8.518238067626953, "learning_rate": 1.5e-05, "loss": 0.313, "step": 201 }, { "epoch": 0.6053412462908012, "grad_norm": 6.046856880187988, "learning_rate": 1.5e-05, "loss": 0.3197, "step": 204 }, { "epoch": 0.6142433234421365, "grad_norm": 9.07325553894043, "learning_rate": 1.5e-05, "loss": 0.3105, "step": 207 }, { "epoch": 0.6231454005934718, "grad_norm": 7.418500900268555, "learning_rate": 1.5e-05, "loss": 0.3178, "step": 210 }, { "epoch": 0.6320474777448071, "grad_norm": 10.935755729675293, "learning_rate": 1.5e-05, "loss": 0.3192, "step": 213 }, { "epoch": 0.6409495548961425, "grad_norm": 8.953109741210938, "learning_rate": 1.5e-05, "loss": 0.336, "step": 216 }, { "epoch": 0.6498516320474778, "grad_norm": 10.9089937210083, "learning_rate": 1.5e-05, "loss": 0.3362, "step": 219 }, { "epoch": 0.658753709198813, "grad_norm": 7.62611198425293, "learning_rate": 1.5e-05, "loss": 0.3219, "step": 222 }, { "epoch": 0.6676557863501483, "grad_norm": 8.877312660217285, "learning_rate": 1.5e-05, "loss": 0.3242, "step": 225 }, { "epoch": 0.6765578635014837, "grad_norm": 11.891584396362305, "learning_rate": 1.5e-05, "loss": 0.3193, "step": 228 }, { "epoch": 0.685459940652819, "grad_norm": 6.047501564025879, "learning_rate": 1.5e-05, "loss": 0.3145, "step": 231 }, { "epoch": 0.6943620178041543, "grad_norm": 11.06523609161377, "learning_rate": 1.5e-05, "loss": 0.3205, "step": 234 }, { "epoch": 0.7032640949554896, "grad_norm": 14.651629447937012, "learning_rate": 1.5e-05, "loss": 0.3181, "step": 237 }, { "epoch": 0.712166172106825, "grad_norm": 4.986928462982178, "learning_rate": 1.5e-05, "loss": 0.3188, "step": 240 }, { "epoch": 0.7210682492581603, "grad_norm": 5.383213520050049, "learning_rate": 1.5e-05, "loss": 0.3322, "step": 243 }, { "epoch": 0.7299703264094956, "grad_norm": 7.467197418212891, "learning_rate": 1.5e-05, "loss": 0.3231, "step": 246 }, { "epoch": 0.7388724035608308, "grad_norm": 8.040964126586914, "learning_rate": 1.5e-05, "loss": 0.3194, "step": 249 }, { "epoch": 0.7477744807121661, "grad_norm": 9.442214012145996, "learning_rate": 1.5e-05, "loss": 0.3045, "step": 252 }, { "epoch": 0.7566765578635015, "grad_norm": 5.0572919845581055, "learning_rate": 1.5e-05, "loss": 0.3198, "step": 255 }, { "epoch": 0.7655786350148368, "grad_norm": 9.763797760009766, "learning_rate": 1.5e-05, "loss": 0.3038, "step": 258 }, { "epoch": 0.7744807121661721, "grad_norm": 4.699306488037109, "learning_rate": 1.5e-05, "loss": 0.3071, "step": 261 }, { "epoch": 0.7833827893175074, "grad_norm": 6.758116245269775, "learning_rate": 1.5e-05, "loss": 0.3079, "step": 264 }, { "epoch": 0.7922848664688428, "grad_norm": 9.004244804382324, "learning_rate": 1.5e-05, "loss": 0.3114, "step": 267 }, { "epoch": 0.8011869436201781, "grad_norm": 10.923787117004395, "learning_rate": 1.5e-05, "loss": 0.3214, "step": 270 }, { "epoch": 0.8100890207715133, "grad_norm": 4.750248432159424, "learning_rate": 1.5e-05, "loss": 0.3213, "step": 273 }, { "epoch": 0.8189910979228486, "grad_norm": 6.5013346672058105, "learning_rate": 1.5e-05, "loss": 0.3213, "step": 276 }, { "epoch": 0.827893175074184, "grad_norm": 14.487788200378418, "learning_rate": 1.5e-05, "loss": 0.3117, "step": 279 }, { "epoch": 0.8367952522255193, "grad_norm": 4.58863639831543, "learning_rate": 1.5e-05, "loss": 0.325, "step": 282 }, { "epoch": 0.8456973293768546, "grad_norm": 5.803460597991943, "learning_rate": 1.5e-05, "loss": 0.3095, "step": 285 }, { "epoch": 0.8545994065281899, "grad_norm": 6.8022871017456055, "learning_rate": 1.5e-05, "loss": 0.3279, "step": 288 }, { "epoch": 0.8635014836795252, "grad_norm": 11.592184066772461, "learning_rate": 1.5e-05, "loss": 0.3211, "step": 291 }, { "epoch": 0.8724035608308606, "grad_norm": 4.380642890930176, "learning_rate": 1.5e-05, "loss": 0.3121, "step": 294 }, { "epoch": 0.8813056379821959, "grad_norm": 10.14802360534668, "learning_rate": 1.5e-05, "loss": 0.3147, "step": 297 }, { "epoch": 0.8902077151335311, "grad_norm": 6.102616786956787, "learning_rate": 1.5e-05, "loss": 0.327, "step": 300 }, { "epoch": 0.8991097922848664, "grad_norm": 6.485515117645264, "learning_rate": 1.5e-05, "loss": 0.3165, "step": 303 }, { "epoch": 0.9080118694362018, "grad_norm": 3.8234357833862305, "learning_rate": 1.5e-05, "loss": 0.316, "step": 306 }, { "epoch": 0.9169139465875371, "grad_norm": 9.781375885009766, "learning_rate": 1.5e-05, "loss": 0.3147, "step": 309 }, { "epoch": 0.9258160237388724, "grad_norm": 8.865571975708008, "learning_rate": 1.5e-05, "loss": 0.3178, "step": 312 }, { "epoch": 0.9347181008902077, "grad_norm": 6.27769660949707, "learning_rate": 1.5e-05, "loss": 0.3152, "step": 315 }, { "epoch": 0.9436201780415431, "grad_norm": 5.6717143058776855, "learning_rate": 1.5e-05, "loss": 0.3062, "step": 318 }, { "epoch": 0.9525222551928784, "grad_norm": 14.846003532409668, "learning_rate": 1.5e-05, "loss": 0.3204, "step": 321 }, { "epoch": 0.9614243323442137, "grad_norm": 7.501077651977539, "learning_rate": 1.5e-05, "loss": 0.307, "step": 324 }, { "epoch": 0.9703264094955489, "grad_norm": 7.450833320617676, "learning_rate": 1.5e-05, "loss": 0.313, "step": 327 }, { "epoch": 0.9792284866468842, "grad_norm": 4.422346591949463, "learning_rate": 1.5e-05, "loss": 0.3067, "step": 330 }, { "epoch": 0.9881305637982196, "grad_norm": 5.954162120819092, "learning_rate": 1.5e-05, "loss": 0.3215, "step": 333 }, { "epoch": 0.9970326409495549, "grad_norm": 4.766922950744629, "learning_rate": 1.5e-05, "loss": 0.3139, "step": 336 }, { "epoch": 1.0, "eval_loss": 0.6956011056900024, "eval_runtime": 297.7711, "eval_samples_per_second": 5.178, "eval_steps_per_second": 0.648, "step": 337 }, { "epoch": 1.0059347181008902, "grad_norm": 7.0965895652771, "learning_rate": 1.5e-05, "loss": 0.3104, "step": 339 }, { "epoch": 1.0148367952522255, "grad_norm": 7.9634599685668945, "learning_rate": 1.5e-05, "loss": 0.3082, "step": 342 }, { "epoch": 1.0237388724035608, "grad_norm": 8.5128755569458, "learning_rate": 1.5e-05, "loss": 0.3211, "step": 345 }, { "epoch": 1.032640949554896, "grad_norm": 5.733129501342773, "learning_rate": 1.5e-05, "loss": 0.3236, "step": 348 }, { "epoch": 1.0415430267062316, "grad_norm": 8.197546005249023, "learning_rate": 1.5e-05, "loss": 0.3136, "step": 351 }, { "epoch": 1.0504451038575668, "grad_norm": 11.312963485717773, "learning_rate": 1.5e-05, "loss": 0.3274, "step": 354 }, { "epoch": 1.0593471810089021, "grad_norm": 4.885214328765869, "learning_rate": 1.5e-05, "loss": 0.3268, "step": 357 }, { "epoch": 1.0682492581602374, "grad_norm": 7.366455078125, "learning_rate": 1.5e-05, "loss": 0.3214, "step": 360 }, { "epoch": 1.0771513353115727, "grad_norm": 7.0693559646606445, "learning_rate": 1.5e-05, "loss": 0.3262, "step": 363 }, { "epoch": 1.086053412462908, "grad_norm": 12.16964054107666, "learning_rate": 1.5e-05, "loss": 0.3232, "step": 366 }, { "epoch": 1.0949554896142433, "grad_norm": 6.702571868896484, "learning_rate": 1.5e-05, "loss": 0.3049, "step": 369 }, { "epoch": 1.1038575667655786, "grad_norm": 8.25865650177002, "learning_rate": 1.5e-05, "loss": 0.3086, "step": 372 }, { "epoch": 1.1127596439169138, "grad_norm": 10.963550567626953, "learning_rate": 1.5e-05, "loss": 0.3187, "step": 375 }, { "epoch": 1.1216617210682494, "grad_norm": 10.957636833190918, "learning_rate": 1.5e-05, "loss": 0.3119, "step": 378 }, { "epoch": 1.1305637982195846, "grad_norm": 4.481369495391846, "learning_rate": 1.5e-05, "loss": 0.3153, "step": 381 }, { "epoch": 1.13946587537092, "grad_norm": 7.9678120613098145, "learning_rate": 1.5e-05, "loss": 0.3143, "step": 384 }, { "epoch": 1.1483679525222552, "grad_norm": 10.013398170471191, "learning_rate": 1.5e-05, "loss": 0.3146, "step": 387 }, { "epoch": 1.1572700296735905, "grad_norm": 9.361319541931152, "learning_rate": 1.5e-05, "loss": 0.3069, "step": 390 }, { "epoch": 1.1661721068249258, "grad_norm": 7.185680866241455, "learning_rate": 1.5e-05, "loss": 0.3198, "step": 393 }, { "epoch": 1.175074183976261, "grad_norm": 9.780238151550293, "learning_rate": 1.5e-05, "loss": 0.3173, "step": 396 }, { "epoch": 1.1839762611275964, "grad_norm": 6.236032009124756, "learning_rate": 1.5e-05, "loss": 0.3096, "step": 399 }, { "epoch": 1.1928783382789319, "grad_norm": 6.732054710388184, "learning_rate": 1.5e-05, "loss": 0.3086, "step": 402 }, { "epoch": 1.2017804154302671, "grad_norm": 8.902305603027344, "learning_rate": 1.5e-05, "loss": 0.3074, "step": 405 }, { "epoch": 1.2106824925816024, "grad_norm": 8.529496192932129, "learning_rate": 1.5e-05, "loss": 0.3208, "step": 408 }, { "epoch": 1.2195845697329377, "grad_norm": 10.779397964477539, "learning_rate": 1.5e-05, "loss": 0.2997, "step": 411 }, { "epoch": 1.228486646884273, "grad_norm": 5.797762393951416, "learning_rate": 1.5e-05, "loss": 0.2977, "step": 414 }, { "epoch": 1.2373887240356083, "grad_norm": 10.339754104614258, "learning_rate": 1.5e-05, "loss": 0.3099, "step": 417 }, { "epoch": 1.2462908011869436, "grad_norm": 6.894352436065674, "learning_rate": 1.5e-05, "loss": 0.3027, "step": 420 }, { "epoch": 1.2551928783382789, "grad_norm": 10.406209945678711, "learning_rate": 1.5e-05, "loss": 0.318, "step": 423 }, { "epoch": 1.2640949554896141, "grad_norm": 4.105279922485352, "learning_rate": 1.5e-05, "loss": 0.3271, "step": 426 }, { "epoch": 1.2729970326409497, "grad_norm": 9.26810073852539, "learning_rate": 1.5e-05, "loss": 0.3052, "step": 429 }, { "epoch": 1.281899109792285, "grad_norm": 11.131587028503418, "learning_rate": 1.5e-05, "loss": 0.315, "step": 432 }, { "epoch": 1.2908011869436202, "grad_norm": 7.997912883758545, "learning_rate": 1.5e-05, "loss": 0.3037, "step": 435 }, { "epoch": 1.2997032640949555, "grad_norm": 4.264193058013916, "learning_rate": 1.5e-05, "loss": 0.2967, "step": 438 }, { "epoch": 1.3086053412462908, "grad_norm": 6.291212558746338, "learning_rate": 1.5e-05, "loss": 0.3078, "step": 441 }, { "epoch": 1.317507418397626, "grad_norm": 10.159900665283203, "learning_rate": 1.5e-05, "loss": 0.2994, "step": 444 }, { "epoch": 1.3264094955489614, "grad_norm": 10.216263771057129, "learning_rate": 1.5e-05, "loss": 0.3244, "step": 447 }, { "epoch": 1.3353115727002967, "grad_norm": 7.566501617431641, "learning_rate": 1.5e-05, "loss": 0.31, "step": 450 }, { "epoch": 1.344213649851632, "grad_norm": 5.979765892028809, "learning_rate": 1.5e-05, "loss": 0.3069, "step": 453 }, { "epoch": 1.3531157270029674, "grad_norm": 6.646083354949951, "learning_rate": 1.5e-05, "loss": 0.3026, "step": 456 }, { "epoch": 1.3620178041543027, "grad_norm": 6.50187349319458, "learning_rate": 1.5e-05, "loss": 0.3065, "step": 459 }, { "epoch": 1.370919881305638, "grad_norm": 4.8104705810546875, "learning_rate": 1.5e-05, "loss": 0.307, "step": 462 }, { "epoch": 1.3798219584569733, "grad_norm": 4.24050235748291, "learning_rate": 1.5e-05, "loss": 0.3011, "step": 465 }, { "epoch": 1.3887240356083086, "grad_norm": 7.853260040283203, "learning_rate": 1.5e-05, "loss": 0.3051, "step": 468 }, { "epoch": 1.3976261127596439, "grad_norm": 6.0949602127075195, "learning_rate": 1.5e-05, "loss": 0.2931, "step": 471 }, { "epoch": 1.4065281899109792, "grad_norm": 11.793612480163574, "learning_rate": 1.5e-05, "loss": 0.2982, "step": 474 }, { "epoch": 1.4154302670623147, "grad_norm": 8.261275291442871, "learning_rate": 1.5e-05, "loss": 0.2966, "step": 477 }, { "epoch": 1.4243323442136497, "grad_norm": 6.895599365234375, "learning_rate": 1.5e-05, "loss": 0.3195, "step": 480 }, { "epoch": 1.4332344213649852, "grad_norm": 5.414015293121338, "learning_rate": 1.5e-05, "loss": 0.3063, "step": 483 }, { "epoch": 1.4421364985163205, "grad_norm": 4.563000202178955, "learning_rate": 1.5e-05, "loss": 0.3079, "step": 486 }, { "epoch": 1.4510385756676558, "grad_norm": 7.205160617828369, "learning_rate": 1.5e-05, "loss": 0.3204, "step": 489 }, { "epoch": 1.459940652818991, "grad_norm": 7.146437168121338, "learning_rate": 1.5e-05, "loss": 0.3049, "step": 492 }, { "epoch": 1.4688427299703264, "grad_norm": 8.912725448608398, "learning_rate": 1.5e-05, "loss": 0.3004, "step": 495 }, { "epoch": 1.4777448071216617, "grad_norm": 5.934146881103516, "learning_rate": 1.5e-05, "loss": 0.3053, "step": 498 }, { "epoch": 1.486646884272997, "grad_norm": 7.54482889175415, "learning_rate": 1.5e-05, "loss": 0.2962, "step": 501 }, { "epoch": 1.4955489614243325, "grad_norm": 11.391508102416992, "learning_rate": 1.5e-05, "loss": 0.3291, "step": 504 }, { "epoch": 1.5044510385756675, "grad_norm": 9.863611221313477, "learning_rate": 1.5e-05, "loss": 0.3068, "step": 507 }, { "epoch": 1.513353115727003, "grad_norm": 7.5741376876831055, "learning_rate": 1.5e-05, "loss": 0.3036, "step": 510 }, { "epoch": 1.5222551928783383, "grad_norm": 11.626495361328125, "learning_rate": 1.5e-05, "loss": 0.3131, "step": 513 }, { "epoch": 1.5311572700296736, "grad_norm": 4.790311813354492, "learning_rate": 1.5e-05, "loss": 0.3126, "step": 516 }, { "epoch": 1.540059347181009, "grad_norm": 5.693728446960449, "learning_rate": 1.5e-05, "loss": 0.3221, "step": 519 }, { "epoch": 1.5489614243323442, "grad_norm": 9.541658401489258, "learning_rate": 1.5e-05, "loss": 0.3186, "step": 522 }, { "epoch": 1.5578635014836797, "grad_norm": 10.08277416229248, "learning_rate": 1.5e-05, "loss": 0.3094, "step": 525 }, { "epoch": 1.5667655786350148, "grad_norm": 10.004911422729492, "learning_rate": 1.5e-05, "loss": 0.3081, "step": 528 }, { "epoch": 1.5756676557863503, "grad_norm": 4.247671127319336, "learning_rate": 1.5e-05, "loss": 0.3173, "step": 531 }, { "epoch": 1.5845697329376853, "grad_norm": 6.010837078094482, "learning_rate": 1.5e-05, "loss": 0.314, "step": 534 }, { "epoch": 1.5934718100890208, "grad_norm": 10.42171859741211, "learning_rate": 1.5e-05, "loss": 0.3111, "step": 537 }, { "epoch": 1.6023738872403561, "grad_norm": 11.672240257263184, "learning_rate": 1.5e-05, "loss": 0.3263, "step": 540 }, { "epoch": 1.6112759643916914, "grad_norm": 9.143010139465332, "learning_rate": 1.5e-05, "loss": 0.2983, "step": 543 }, { "epoch": 1.6201780415430267, "grad_norm": 7.786658763885498, "learning_rate": 1.5e-05, "loss": 0.3089, "step": 546 }, { "epoch": 1.629080118694362, "grad_norm": 5.973340034484863, "learning_rate": 1.5e-05, "loss": 0.303, "step": 549 }, { "epoch": 1.6379821958456975, "grad_norm": 4.11182975769043, "learning_rate": 1.5e-05, "loss": 0.3059, "step": 552 }, { "epoch": 1.6468842729970326, "grad_norm": 6.210434913635254, "learning_rate": 1.5e-05, "loss": 0.3133, "step": 555 }, { "epoch": 1.655786350148368, "grad_norm": 11.501874923706055, "learning_rate": 1.5e-05, "loss": 0.3078, "step": 558 }, { "epoch": 1.6646884272997031, "grad_norm": 8.35253620147705, "learning_rate": 1.5e-05, "loss": 0.3147, "step": 561 }, { "epoch": 1.6735905044510386, "grad_norm": 6.669034957885742, "learning_rate": 1.5e-05, "loss": 0.3104, "step": 564 }, { "epoch": 1.682492581602374, "grad_norm": 13.310565948486328, "learning_rate": 1.5e-05, "loss": 0.3172, "step": 567 }, { "epoch": 1.6913946587537092, "grad_norm": 6.960197448730469, "learning_rate": 1.5e-05, "loss": 0.3192, "step": 570 }, { "epoch": 1.7002967359050445, "grad_norm": 10.452018737792969, "learning_rate": 1.5e-05, "loss": 0.3072, "step": 573 }, { "epoch": 1.7091988130563798, "grad_norm": 6.1864190101623535, "learning_rate": 1.5e-05, "loss": 0.3178, "step": 576 }, { "epoch": 1.7181008902077153, "grad_norm": 6.356491565704346, "learning_rate": 1.5e-05, "loss": 0.3176, "step": 579 }, { "epoch": 1.7270029673590503, "grad_norm": 5.232566833496094, "learning_rate": 1.5e-05, "loss": 0.2963, "step": 582 }, { "epoch": 1.7359050445103859, "grad_norm": 3.332583427429199, "learning_rate": 1.5e-05, "loss": 0.2999, "step": 585 }, { "epoch": 1.744807121661721, "grad_norm": 5.193176746368408, "learning_rate": 1.5e-05, "loss": 0.3192, "step": 588 }, { "epoch": 1.7537091988130564, "grad_norm": 6.814889907836914, "learning_rate": 1.5e-05, "loss": 0.2958, "step": 591 }, { "epoch": 1.7626112759643917, "grad_norm": 9.611870765686035, "learning_rate": 1.5e-05, "loss": 0.3091, "step": 594 }, { "epoch": 1.771513353115727, "grad_norm": 7.733308792114258, "learning_rate": 1.5e-05, "loss": 0.3026, "step": 597 }, { "epoch": 1.7804154302670623, "grad_norm": 5.742140769958496, "learning_rate": 1.5e-05, "loss": 0.2962, "step": 600 }, { "epoch": 1.7893175074183976, "grad_norm": 11.053295135498047, "learning_rate": 1.5e-05, "loss": 0.2967, "step": 603 }, { "epoch": 1.798219584569733, "grad_norm": 7.031610012054443, "learning_rate": 1.5e-05, "loss": 0.2927, "step": 606 }, { "epoch": 1.8071216617210681, "grad_norm": 6.521071910858154, "learning_rate": 1.5e-05, "loss": 0.3139, "step": 609 }, { "epoch": 1.8160237388724036, "grad_norm": 6.417489528656006, "learning_rate": 1.5e-05, "loss": 0.3081, "step": 612 }, { "epoch": 1.8249258160237387, "grad_norm": 9.378142356872559, "learning_rate": 1.5e-05, "loss": 0.298, "step": 615 }, { "epoch": 1.8338278931750742, "grad_norm": 8.447271347045898, "learning_rate": 1.5e-05, "loss": 0.3141, "step": 618 }, { "epoch": 1.8427299703264095, "grad_norm": 10.930451393127441, "learning_rate": 1.5e-05, "loss": 0.3021, "step": 621 }, { "epoch": 1.8516320474777448, "grad_norm": 8.880478858947754, "learning_rate": 1.5e-05, "loss": 0.3136, "step": 624 }, { "epoch": 1.86053412462908, "grad_norm": 5.905041217803955, "learning_rate": 1.5e-05, "loss": 0.3191, "step": 627 }, { "epoch": 1.8694362017804154, "grad_norm": 6.188875675201416, "learning_rate": 1.5e-05, "loss": 0.3283, "step": 630 }, { "epoch": 1.8783382789317509, "grad_norm": 11.83849811553955, "learning_rate": 1.5e-05, "loss": 0.3235, "step": 633 }, { "epoch": 1.887240356083086, "grad_norm": 7.689598560333252, "learning_rate": 1.5e-05, "loss": 0.3162, "step": 636 }, { "epoch": 1.8961424332344214, "grad_norm": 3.9637110233306885, "learning_rate": 1.5e-05, "loss": 0.3127, "step": 639 }, { "epoch": 1.9050445103857567, "grad_norm": 13.587063789367676, "learning_rate": 1.5e-05, "loss": 0.3268, "step": 642 }, { "epoch": 1.913946587537092, "grad_norm": 7.881510257720947, "learning_rate": 1.5e-05, "loss": 0.3038, "step": 645 }, { "epoch": 1.9228486646884273, "grad_norm": 6.357386112213135, "learning_rate": 1.5e-05, "loss": 0.3097, "step": 648 }, { "epoch": 1.9317507418397626, "grad_norm": 6.852357387542725, "learning_rate": 1.5e-05, "loss": 0.3056, "step": 651 }, { "epoch": 1.9406528189910979, "grad_norm": 6.557038307189941, "learning_rate": 1.5e-05, "loss": 0.3209, "step": 654 }, { "epoch": 1.9495548961424332, "grad_norm": 7.013545036315918, "learning_rate": 1.5e-05, "loss": 0.3237, "step": 657 }, { "epoch": 1.9584569732937687, "grad_norm": 9.902325630187988, "learning_rate": 1.5e-05, "loss": 0.3166, "step": 660 }, { "epoch": 1.9673590504451037, "grad_norm": 6.723764896392822, "learning_rate": 1.5e-05, "loss": 0.3198, "step": 663 }, { "epoch": 1.9762611275964392, "grad_norm": 9.627095222473145, "learning_rate": 1.5e-05, "loss": 0.321, "step": 666 }, { "epoch": 1.9851632047477745, "grad_norm": 8.035420417785645, "learning_rate": 1.5e-05, "loss": 0.31, "step": 669 }, { "epoch": 1.9940652818991098, "grad_norm": 10.477612495422363, "learning_rate": 1.5e-05, "loss": 0.2995, "step": 672 }, { "epoch": 2.0, "eval_loss": 0.6920689940452576, "eval_runtime": 296.7344, "eval_samples_per_second": 5.197, "eval_steps_per_second": 0.65, "step": 674 }, { "epoch": 2.0029673590504453, "grad_norm": 4.917605400085449, "learning_rate": 1.5e-05, "loss": 0.3129, "step": 675 }, { "epoch": 2.0118694362017804, "grad_norm": 14.471161842346191, "learning_rate": 1.5e-05, "loss": 0.3121, "step": 678 }, { "epoch": 2.020771513353116, "grad_norm": 10.123734474182129, "learning_rate": 1.5e-05, "loss": 0.3136, "step": 681 }, { "epoch": 2.029673590504451, "grad_norm": 7.0058794021606445, "learning_rate": 1.5e-05, "loss": 0.314, "step": 684 }, { "epoch": 2.0385756676557865, "grad_norm": 5.461868762969971, "learning_rate": 1.5e-05, "loss": 0.307, "step": 687 }, { "epoch": 2.0474777448071215, "grad_norm": 5.689599514007568, "learning_rate": 1.5e-05, "loss": 0.3053, "step": 690 }, { "epoch": 2.056379821958457, "grad_norm": 8.585354804992676, "learning_rate": 1.5e-05, "loss": 0.3041, "step": 693 }, { "epoch": 2.065281899109792, "grad_norm": 4.620091915130615, "learning_rate": 1.5e-05, "loss": 0.2921, "step": 696 }, { "epoch": 2.0741839762611276, "grad_norm": 6.909940719604492, "learning_rate": 1.5e-05, "loss": 0.3087, "step": 699 }, { "epoch": 2.083086053412463, "grad_norm": 5.3829426765441895, "learning_rate": 1.5e-05, "loss": 0.294, "step": 702 }, { "epoch": 2.091988130563798, "grad_norm": 10.095771789550781, "learning_rate": 1.5e-05, "loss": 0.3027, "step": 705 }, { "epoch": 2.1008902077151337, "grad_norm": 7.622206687927246, "learning_rate": 1.5e-05, "loss": 0.2936, "step": 708 }, { "epoch": 2.1097922848664687, "grad_norm": 9.839076042175293, "learning_rate": 1.5e-05, "loss": 0.3093, "step": 711 }, { "epoch": 2.1186943620178043, "grad_norm": 13.05020809173584, "learning_rate": 1.5e-05, "loss": 0.3076, "step": 714 }, { "epoch": 2.1275964391691393, "grad_norm": 4.418980598449707, "learning_rate": 1.5e-05, "loss": 0.3043, "step": 717 }, { "epoch": 2.136498516320475, "grad_norm": 3.569221019744873, "learning_rate": 1.5e-05, "loss": 0.3083, "step": 720 }, { "epoch": 2.14540059347181, "grad_norm": 6.468089580535889, "learning_rate": 1.5e-05, "loss": 0.3016, "step": 723 }, { "epoch": 2.1543026706231454, "grad_norm": 8.789352416992188, "learning_rate": 1.5e-05, "loss": 0.3022, "step": 726 }, { "epoch": 2.163204747774481, "grad_norm": 8.202059745788574, "learning_rate": 1.5e-05, "loss": 0.311, "step": 729 }, { "epoch": 2.172106824925816, "grad_norm": 6.959595203399658, "learning_rate": 1.5e-05, "loss": 0.2947, "step": 732 }, { "epoch": 2.1810089020771515, "grad_norm": 11.653180122375488, "learning_rate": 1.5e-05, "loss": 0.3094, "step": 735 }, { "epoch": 2.1899109792284865, "grad_norm": 8.507452964782715, "learning_rate": 1.5e-05, "loss": 0.3094, "step": 738 }, { "epoch": 2.198813056379822, "grad_norm": 3.680802583694458, "learning_rate": 1.5e-05, "loss": 0.3054, "step": 741 }, { "epoch": 2.207715133531157, "grad_norm": 9.95173454284668, "learning_rate": 1.5e-05, "loss": 0.2928, "step": 744 }, { "epoch": 2.2166172106824926, "grad_norm": 10.835822105407715, "learning_rate": 1.5e-05, "loss": 0.2882, "step": 747 }, { "epoch": 2.2255192878338277, "grad_norm": 12.096845626831055, "learning_rate": 1.5e-05, "loss": 0.308, "step": 750 }, { "epoch": 2.234421364985163, "grad_norm": 4.49980354309082, "learning_rate": 1.5e-05, "loss": 0.3173, "step": 753 }, { "epoch": 2.2433234421364987, "grad_norm": 9.042285919189453, "learning_rate": 1.5e-05, "loss": 0.3073, "step": 756 }, { "epoch": 2.2522255192878338, "grad_norm": 5.250131130218506, "learning_rate": 1.5e-05, "loss": 0.2966, "step": 759 }, { "epoch": 2.2611275964391693, "grad_norm": 9.235132217407227, "learning_rate": 1.5e-05, "loss": 0.3168, "step": 762 }, { "epoch": 2.2700296735905043, "grad_norm": 7.330996513366699, "learning_rate": 1.5e-05, "loss": 0.3027, "step": 765 }, { "epoch": 2.27893175074184, "grad_norm": 5.805144309997559, "learning_rate": 1.5e-05, "loss": 0.3232, "step": 768 }, { "epoch": 2.287833827893175, "grad_norm": 10.95457649230957, "learning_rate": 1.5e-05, "loss": 0.307, "step": 771 }, { "epoch": 2.2967359050445104, "grad_norm": 5.920906066894531, "learning_rate": 1.5e-05, "loss": 0.3052, "step": 774 }, { "epoch": 2.3056379821958455, "grad_norm": 7.4418511390686035, "learning_rate": 1.5e-05, "loss": 0.3195, "step": 777 }, { "epoch": 2.314540059347181, "grad_norm": 9.739228248596191, "learning_rate": 1.5e-05, "loss": 0.3146, "step": 780 }, { "epoch": 2.3234421364985165, "grad_norm": 11.025596618652344, "learning_rate": 1.5e-05, "loss": 0.3061, "step": 783 }, { "epoch": 2.3323442136498516, "grad_norm": 5.031250953674316, "learning_rate": 1.5e-05, "loss": 0.3128, "step": 786 }, { "epoch": 2.341246290801187, "grad_norm": 9.482969284057617, "learning_rate": 1.5e-05, "loss": 0.3067, "step": 789 }, { "epoch": 2.350148367952522, "grad_norm": 4.4395270347595215, "learning_rate": 1.5e-05, "loss": 0.2972, "step": 792 }, { "epoch": 2.3590504451038576, "grad_norm": 4.755709171295166, "learning_rate": 1.5e-05, "loss": 0.3078, "step": 795 }, { "epoch": 2.3679525222551927, "grad_norm": 6.278073310852051, "learning_rate": 1.5e-05, "loss": 0.3107, "step": 798 }, { "epoch": 2.376854599406528, "grad_norm": 7.922651767730713, "learning_rate": 1.5e-05, "loss": 0.3043, "step": 801 }, { "epoch": 2.3857566765578637, "grad_norm": 9.521344184875488, "learning_rate": 1.5e-05, "loss": 0.3158, "step": 804 }, { "epoch": 2.394658753709199, "grad_norm": 12.499236106872559, "learning_rate": 1.5e-05, "loss": 0.3086, "step": 807 }, { "epoch": 2.4035608308605343, "grad_norm": 6.426900863647461, "learning_rate": 1.5e-05, "loss": 0.3126, "step": 810 }, { "epoch": 2.4124629080118694, "grad_norm": 8.431981086730957, "learning_rate": 1.5e-05, "loss": 0.3, "step": 813 }, { "epoch": 2.421364985163205, "grad_norm": 12.86776351928711, "learning_rate": 1.5e-05, "loss": 0.2995, "step": 816 }, { "epoch": 2.43026706231454, "grad_norm": 6.822738170623779, "learning_rate": 1.5e-05, "loss": 0.3115, "step": 819 }, { "epoch": 2.4391691394658754, "grad_norm": 6.153812408447266, "learning_rate": 1.5e-05, "loss": 0.297, "step": 822 }, { "epoch": 2.4480712166172105, "grad_norm": 11.699315071105957, "learning_rate": 1.5e-05, "loss": 0.2951, "step": 825 }, { "epoch": 2.456973293768546, "grad_norm": 5.795748710632324, "learning_rate": 1.5e-05, "loss": 0.3062, "step": 828 }, { "epoch": 2.465875370919881, "grad_norm": 6.4195756912231445, "learning_rate": 1.5e-05, "loss": 0.2938, "step": 831 }, { "epoch": 2.4747774480712166, "grad_norm": 6.024349212646484, "learning_rate": 1.5e-05, "loss": 0.2887, "step": 834 }, { "epoch": 2.483679525222552, "grad_norm": 5.880214691162109, "learning_rate": 1.5e-05, "loss": 0.2943, "step": 837 }, { "epoch": 2.492581602373887, "grad_norm": 18.98047637939453, "learning_rate": 1.5e-05, "loss": 0.2903, "step": 840 }, { "epoch": 2.5014836795252227, "grad_norm": 14.550153732299805, "learning_rate": 1.5e-05, "loss": 0.2999, "step": 843 }, { "epoch": 2.5103857566765577, "grad_norm": 11.062093734741211, "learning_rate": 1.5e-05, "loss": 0.3281, "step": 846 }, { "epoch": 2.5192878338278932, "grad_norm": 6.1865644454956055, "learning_rate": 1.5e-05, "loss": 0.3073, "step": 849 }, { "epoch": 2.5281899109792283, "grad_norm": 10.409070014953613, "learning_rate": 1.5e-05, "loss": 0.3155, "step": 852 }, { "epoch": 2.537091988130564, "grad_norm": 12.40860366821289, "learning_rate": 1.5e-05, "loss": 0.3013, "step": 855 }, { "epoch": 2.5459940652818993, "grad_norm": 6.20428466796875, "learning_rate": 1.5e-05, "loss": 0.3141, "step": 858 }, { "epoch": 2.5548961424332344, "grad_norm": 4.158163547515869, "learning_rate": 1.5e-05, "loss": 0.307, "step": 861 }, { "epoch": 2.56379821958457, "grad_norm": 7.828709602355957, "learning_rate": 1.5e-05, "loss": 0.3191, "step": 864 }, { "epoch": 2.572700296735905, "grad_norm": 8.588981628417969, "learning_rate": 1.5e-05, "loss": 0.3237, "step": 867 }, { "epoch": 2.5816023738872405, "grad_norm": 6.725210189819336, "learning_rate": 1.5e-05, "loss": 0.293, "step": 870 }, { "epoch": 2.5905044510385755, "grad_norm": 8.876666069030762, "learning_rate": 1.5e-05, "loss": 0.3042, "step": 873 }, { "epoch": 2.599406528189911, "grad_norm": 8.503588676452637, "learning_rate": 1.5e-05, "loss": 0.3058, "step": 876 }, { "epoch": 2.6083086053412465, "grad_norm": 7.051385402679443, "learning_rate": 1.5e-05, "loss": 0.2856, "step": 879 }, { "epoch": 2.6172106824925816, "grad_norm": 11.214133262634277, "learning_rate": 1.5e-05, "loss": 0.2899, "step": 882 }, { "epoch": 2.6261127596439167, "grad_norm": 5.270874977111816, "learning_rate": 1.5e-05, "loss": 0.2947, "step": 885 }, { "epoch": 2.635014836795252, "grad_norm": 13.623291015625, "learning_rate": 1.5e-05, "loss": 0.3001, "step": 888 }, { "epoch": 2.6439169139465877, "grad_norm": 3.9485678672790527, "learning_rate": 1.5e-05, "loss": 0.3027, "step": 891 }, { "epoch": 2.6528189910979227, "grad_norm": 7.7399725914001465, "learning_rate": 1.5e-05, "loss": 0.2988, "step": 894 }, { "epoch": 2.6617210682492582, "grad_norm": 7.428469181060791, "learning_rate": 1.5e-05, "loss": 0.2896, "step": 897 }, { "epoch": 2.6706231454005933, "grad_norm": 4.9085001945495605, "learning_rate": 1.5e-05, "loss": 0.2955, "step": 900 }, { "epoch": 2.679525222551929, "grad_norm": 7.616215705871582, "learning_rate": 1.5e-05, "loss": 0.3143, "step": 903 }, { "epoch": 2.688427299703264, "grad_norm": 6.225953102111816, "learning_rate": 1.5e-05, "loss": 0.3004, "step": 906 }, { "epoch": 2.6973293768545994, "grad_norm": 5.675787448883057, "learning_rate": 1.5e-05, "loss": 0.2946, "step": 909 }, { "epoch": 2.706231454005935, "grad_norm": 7.747137069702148, "learning_rate": 1.5e-05, "loss": 0.2966, "step": 912 }, { "epoch": 2.71513353115727, "grad_norm": 12.72786808013916, "learning_rate": 1.5e-05, "loss": 0.3106, "step": 915 }, { "epoch": 2.7240356083086055, "grad_norm": 7.423135280609131, "learning_rate": 1.5e-05, "loss": 0.2838, "step": 918 }, { "epoch": 2.7329376854599405, "grad_norm": 6.8378520011901855, "learning_rate": 1.5e-05, "loss": 0.3165, "step": 921 }, { "epoch": 2.741839762611276, "grad_norm": 5.68455696105957, "learning_rate": 1.5e-05, "loss": 0.3078, "step": 924 }, { "epoch": 2.750741839762611, "grad_norm": 13.37850570678711, "learning_rate": 1.5e-05, "loss": 0.3005, "step": 927 }, { "epoch": 2.7596439169139466, "grad_norm": 5.610422611236572, "learning_rate": 1.5e-05, "loss": 0.2948, "step": 930 }, { "epoch": 2.768545994065282, "grad_norm": 9.621097564697266, "learning_rate": 1.5e-05, "loss": 0.307, "step": 933 }, { "epoch": 2.777448071216617, "grad_norm": 4.709936141967773, "learning_rate": 1.5e-05, "loss": 0.3011, "step": 936 }, { "epoch": 2.7863501483679523, "grad_norm": 7.198949813842773, "learning_rate": 1.5e-05, "loss": 0.2938, "step": 939 }, { "epoch": 2.7952522255192878, "grad_norm": 6.532808303833008, "learning_rate": 1.5e-05, "loss": 0.3158, "step": 942 }, { "epoch": 2.8041543026706233, "grad_norm": 10.170119285583496, "learning_rate": 1.5e-05, "loss": 0.2862, "step": 945 }, { "epoch": 2.8130563798219583, "grad_norm": 7.333060264587402, "learning_rate": 1.5e-05, "loss": 0.2989, "step": 948 }, { "epoch": 2.821958456973294, "grad_norm": 3.9618520736694336, "learning_rate": 1.5e-05, "loss": 0.2759, "step": 951 }, { "epoch": 2.8308605341246293, "grad_norm": 5.956901550292969, "learning_rate": 1.5e-05, "loss": 0.294, "step": 954 }, { "epoch": 2.8397626112759644, "grad_norm": 5.030998706817627, "learning_rate": 1.5e-05, "loss": 0.3016, "step": 957 }, { "epoch": 2.8486646884272995, "grad_norm": 8.330857276916504, "learning_rate": 1.5e-05, "loss": 0.3029, "step": 960 }, { "epoch": 2.857566765578635, "grad_norm": 10.079005241394043, "learning_rate": 1.5e-05, "loss": 0.2955, "step": 963 }, { "epoch": 2.8664688427299705, "grad_norm": 9.091019630432129, "learning_rate": 1.5e-05, "loss": 0.2999, "step": 966 }, { "epoch": 2.8753709198813056, "grad_norm": 7.372535705566406, "learning_rate": 1.5e-05, "loss": 0.2949, "step": 969 }, { "epoch": 2.884272997032641, "grad_norm": 8.11223030090332, "learning_rate": 1.5e-05, "loss": 0.2852, "step": 972 }, { "epoch": 2.893175074183976, "grad_norm": 3.835611343383789, "learning_rate": 1.5e-05, "loss": 0.2745, "step": 975 }, { "epoch": 2.9020771513353116, "grad_norm": 11.748644828796387, "learning_rate": 1.5e-05, "loss": 0.2875, "step": 978 }, { "epoch": 2.9109792284866467, "grad_norm": 14.599609375, "learning_rate": 1.5e-05, "loss": 0.2854, "step": 981 }, { "epoch": 2.919881305637982, "grad_norm": 8.011322021484375, "learning_rate": 1.5e-05, "loss": 0.2924, "step": 984 }, { "epoch": 2.9287833827893177, "grad_norm": 5.392467498779297, "learning_rate": 1.5e-05, "loss": 0.293, "step": 987 }, { "epoch": 2.9376854599406528, "grad_norm": 10.867618560791016, "learning_rate": 1.5e-05, "loss": 0.3049, "step": 990 }, { "epoch": 2.9465875370919883, "grad_norm": 11.08749771118164, "learning_rate": 1.5e-05, "loss": 0.2943, "step": 993 }, { "epoch": 2.9554896142433233, "grad_norm": 7.80095100402832, "learning_rate": 1.5e-05, "loss": 0.2984, "step": 996 }, { "epoch": 2.964391691394659, "grad_norm": 6.650088310241699, "learning_rate": 1.5e-05, "loss": 0.3048, "step": 999 }, { "epoch": 2.973293768545994, "grad_norm": 9.152456283569336, "learning_rate": 1.5e-05, "loss": 0.2985, "step": 1002 }, { "epoch": 2.9821958456973294, "grad_norm": 10.47088623046875, "learning_rate": 1.5e-05, "loss": 0.2934, "step": 1005 }, { "epoch": 2.991097922848665, "grad_norm": 3.175657272338867, "learning_rate": 1.5e-05, "loss": 0.2741, "step": 1008 }, { "epoch": 3.0, "grad_norm": 10.17156982421875, "learning_rate": 1.5e-05, "loss": 0.2926, "step": 1011 }, { "epoch": 3.0, "eval_loss": 0.6839648485183716, "eval_runtime": 298.4807, "eval_samples_per_second": 5.166, "eval_steps_per_second": 0.647, "step": 1011 } ], "logging_steps": 3, "max_steps": 3370, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 500, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 3, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }