{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9996224990562474, "eval_steps": 5000, "global_step": 6951, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.009060022650056626, "grad_norm": 2.6664810047066947, "learning_rate": 9.999774793715127e-05, "loss": 1.431, "num_input_tokens_seen": 5118512, "step": 21 }, { "epoch": 0.01812004530011325, "grad_norm": 0.8152880359374859, "learning_rate": 9.99909919514765e-05, "loss": 0.5348, "num_input_tokens_seen": 10150592, "step": 42 }, { "epoch": 0.027180067950169876, "grad_norm": 0.6315786850545617, "learning_rate": 9.997973265157192e-05, "loss": 0.4599, "num_input_tokens_seen": 15170896, "step": 63 }, { "epoch": 0.0362400906002265, "grad_norm": 0.4538892840989214, "learning_rate": 9.996397105170353e-05, "loss": 0.4108, "num_input_tokens_seen": 20155632, "step": 84 }, { "epoch": 0.045300113250283124, "grad_norm": 0.4720911162561318, "learning_rate": 9.994370857171588e-05, "loss": 0.3947, "num_input_tokens_seen": 25135440, "step": 105 }, { "epoch": 0.05436013590033975, "grad_norm": 0.4622888476742547, "learning_rate": 9.991894703690414e-05, "loss": 0.3764, "num_input_tokens_seen": 30346784, "step": 126 }, { "epoch": 0.06342015855039637, "grad_norm": 0.4173513485950873, "learning_rate": 9.988968867784958e-05, "loss": 0.3751, "num_input_tokens_seen": 35269664, "step": 147 }, { "epoch": 0.072480181200453, "grad_norm": 0.39249341716348773, "learning_rate": 9.985593613021872e-05, "loss": 0.3704, "num_input_tokens_seen": 40151792, "step": 168 }, { "epoch": 0.08154020385050963, "grad_norm": 0.3080363284417135, "learning_rate": 9.981769243452595e-05, "loss": 0.3552, "num_input_tokens_seen": 45283312, "step": 189 }, { "epoch": 0.09060022650056625, "grad_norm": 0.3747998358734097, "learning_rate": 9.977496103585949e-05, "loss": 0.3576, "num_input_tokens_seen": 50298912, "step": 210 }, { "epoch": 0.09966024915062288, "grad_norm": 0.2975791185912304, "learning_rate": 9.972774578357117e-05, "loss": 0.3451, "num_input_tokens_seen": 55445792, "step": 231 }, { "epoch": 0.1087202718006795, "grad_norm": 0.3172928529259604, "learning_rate": 9.96760509309296e-05, "loss": 0.3506, "num_input_tokens_seen": 60506496, "step": 252 }, { "epoch": 0.11778029445073612, "grad_norm": 0.3304680645103982, "learning_rate": 9.961988113473708e-05, "loss": 0.3443, "num_input_tokens_seen": 65678096, "step": 273 }, { "epoch": 0.12684031710079274, "grad_norm": 0.2929759270178528, "learning_rate": 9.955924145491005e-05, "loss": 0.3446, "num_input_tokens_seen": 70478688, "step": 294 }, { "epoch": 0.13590033975084936, "grad_norm": 0.2809492487037724, "learning_rate": 9.94941373540233e-05, "loss": 0.3362, "num_input_tokens_seen": 75373536, "step": 315 }, { "epoch": 0.144960362400906, "grad_norm": 0.38609616986912937, "learning_rate": 9.942457469681794e-05, "loss": 0.3384, "num_input_tokens_seen": 80476704, "step": 336 }, { "epoch": 0.15402038505096263, "grad_norm": 0.24129947470960447, "learning_rate": 9.935055974967299e-05, "loss": 0.3315, "num_input_tokens_seen": 85670800, "step": 357 }, { "epoch": 0.16308040770101925, "grad_norm": 0.22875446286948012, "learning_rate": 9.927209918004095e-05, "loss": 0.33, "num_input_tokens_seen": 90707040, "step": 378 }, { "epoch": 0.17214043035107587, "grad_norm": 0.26219955903132913, "learning_rate": 9.918920005584719e-05, "loss": 0.3296, "num_input_tokens_seen": 95824496, "step": 399 }, { "epoch": 0.1812004530011325, "grad_norm": 0.6217611528853424, "learning_rate": 9.910186984485321e-05, "loss": 0.3315, "num_input_tokens_seen": 100862224, "step": 420 }, { "epoch": 0.19026047565118911, "grad_norm": 0.5343386676193482, "learning_rate": 9.901011641398398e-05, "loss": 0.353, "num_input_tokens_seen": 105876656, "step": 441 }, { "epoch": 0.19932049830124576, "grad_norm": 0.29271392356860787, "learning_rate": 9.89139480286192e-05, "loss": 0.3414, "num_input_tokens_seen": 110980864, "step": 462 }, { "epoch": 0.20838052095130238, "grad_norm": 0.2662936598921738, "learning_rate": 9.881337335184878e-05, "loss": 0.3224, "num_input_tokens_seen": 116114800, "step": 483 }, { "epoch": 0.217440543601359, "grad_norm": 0.3006863273182064, "learning_rate": 9.870840144369246e-05, "loss": 0.3212, "num_input_tokens_seen": 121255744, "step": 504 }, { "epoch": 0.22650056625141562, "grad_norm": 0.28557101475631624, "learning_rate": 9.859904176028362e-05, "loss": 0.3213, "num_input_tokens_seen": 126288608, "step": 525 }, { "epoch": 0.23556058890147225, "grad_norm": 0.30204169829965893, "learning_rate": 9.848530415301747e-05, "loss": 0.3198, "num_input_tokens_seen": 131233488, "step": 546 }, { "epoch": 0.24462061155152887, "grad_norm": 0.22788149212961117, "learning_rate": 9.836719886766356e-05, "loss": 0.3149, "num_input_tokens_seen": 136257888, "step": 567 }, { "epoch": 0.2536806342015855, "grad_norm": 0.25388452332747136, "learning_rate": 9.824473654344297e-05, "loss": 0.3169, "num_input_tokens_seen": 141405120, "step": 588 }, { "epoch": 0.2627406568516421, "grad_norm": 0.22932388019266456, "learning_rate": 9.811792821206969e-05, "loss": 0.3142, "num_input_tokens_seen": 146380496, "step": 609 }, { "epoch": 0.2718006795016987, "grad_norm": 0.21316227582275912, "learning_rate": 9.7986785296757e-05, "loss": 0.3097, "num_input_tokens_seen": 151344048, "step": 630 }, { "epoch": 0.2808607021517554, "grad_norm": 0.2055839569031328, "learning_rate": 9.785131961118844e-05, "loss": 0.3116, "num_input_tokens_seen": 156435136, "step": 651 }, { "epoch": 0.289920724801812, "grad_norm": 0.2441799648084251, "learning_rate": 9.771154335845345e-05, "loss": 0.3086, "num_input_tokens_seen": 161536224, "step": 672 }, { "epoch": 0.29898074745186864, "grad_norm": 0.20481810635702893, "learning_rate": 9.756746912994832e-05, "loss": 0.3057, "num_input_tokens_seen": 166573984, "step": 693 }, { "epoch": 0.30804077010192527, "grad_norm": 0.2592126104806373, "learning_rate": 9.741910990424174e-05, "loss": 0.3017, "num_input_tokens_seen": 171638000, "step": 714 }, { "epoch": 0.3171007927519819, "grad_norm": 0.25174137308280303, "learning_rate": 9.726647904590571e-05, "loss": 0.3066, "num_input_tokens_seen": 176765648, "step": 735 }, { "epoch": 0.3261608154020385, "grad_norm": 0.20855236580529835, "learning_rate": 9.710959030431167e-05, "loss": 0.2996, "num_input_tokens_seen": 181949360, "step": 756 }, { "epoch": 0.3352208380520951, "grad_norm": 0.23697815659002952, "learning_rate": 9.694845781239187e-05, "loss": 0.2972, "num_input_tokens_seen": 186990096, "step": 777 }, { "epoch": 0.34428086070215175, "grad_norm": 0.2492979898134609, "learning_rate": 9.678309608536626e-05, "loss": 0.2984, "num_input_tokens_seen": 192083856, "step": 798 }, { "epoch": 0.35334088335220837, "grad_norm": 0.23816573617268064, "learning_rate": 9.661352001943493e-05, "loss": 0.2957, "num_input_tokens_seen": 197134448, "step": 819 }, { "epoch": 0.362400906002265, "grad_norm": 0.20467034048790322, "learning_rate": 9.64397448904362e-05, "loss": 0.2926, "num_input_tokens_seen": 202310368, "step": 840 }, { "epoch": 0.3714609286523216, "grad_norm": 0.17904927666050605, "learning_rate": 9.626178635247054e-05, "loss": 0.2909, "num_input_tokens_seen": 207359840, "step": 861 }, { "epoch": 0.38052095130237823, "grad_norm": 0.248985929697072, "learning_rate": 9.607966043649046e-05, "loss": 0.2954, "num_input_tokens_seen": 212395664, "step": 882 }, { "epoch": 0.3895809739524349, "grad_norm": 0.17735947921966527, "learning_rate": 9.589338354885629e-05, "loss": 0.2912, "num_input_tokens_seen": 217570640, "step": 903 }, { "epoch": 0.3986409966024915, "grad_norm": 0.2653024856558906, "learning_rate": 9.570297246985837e-05, "loss": 0.2928, "num_input_tokens_seen": 222629712, "step": 924 }, { "epoch": 0.40770101925254815, "grad_norm": 0.21684716629964057, "learning_rate": 9.550844435220539e-05, "loss": 0.292, "num_input_tokens_seen": 227565744, "step": 945 }, { "epoch": 0.41676104190260477, "grad_norm": 0.19831973070913392, "learning_rate": 9.530981671947923e-05, "loss": 0.292, "num_input_tokens_seen": 232655712, "step": 966 }, { "epoch": 0.4258210645526614, "grad_norm": 0.2039591338730108, "learning_rate": 9.510710746455636e-05, "loss": 0.2959, "num_input_tokens_seen": 237611056, "step": 987 }, { "epoch": 0.434881087202718, "grad_norm": 0.20635917697106, "learning_rate": 9.490033484799608e-05, "loss": 0.2884, "num_input_tokens_seen": 242693136, "step": 1008 }, { "epoch": 0.44394110985277463, "grad_norm": 0.21178513822087988, "learning_rate": 9.468951749639551e-05, "loss": 0.2878, "num_input_tokens_seen": 247677488, "step": 1029 }, { "epoch": 0.45300113250283125, "grad_norm": 0.24393748334345636, "learning_rate": 9.447467440071164e-05, "loss": 0.2908, "num_input_tokens_seen": 252770384, "step": 1050 }, { "epoch": 0.46206115515288787, "grad_norm": 0.19345581725168481, "learning_rate": 9.425582491455067e-05, "loss": 0.2796, "num_input_tokens_seen": 258009696, "step": 1071 }, { "epoch": 0.4711211778029445, "grad_norm": 0.17550447436524724, "learning_rate": 9.403298875242448e-05, "loss": 0.2858, "num_input_tokens_seen": 263147728, "step": 1092 }, { "epoch": 0.4801812004530011, "grad_norm": 0.1873327110291133, "learning_rate": 9.380618598797473e-05, "loss": 0.2876, "num_input_tokens_seen": 268184080, "step": 1113 }, { "epoch": 0.48924122310305773, "grad_norm": 0.2326284547666313, "learning_rate": 9.357543705216465e-05, "loss": 0.2814, "num_input_tokens_seen": 273442768, "step": 1134 }, { "epoch": 0.4983012457531144, "grad_norm": 0.1606625392823979, "learning_rate": 9.334076273143843e-05, "loss": 0.2804, "num_input_tokens_seen": 278640624, "step": 1155 }, { "epoch": 0.507361268403171, "grad_norm": 0.174300515580659, "learning_rate": 9.310218416584886e-05, "loss": 0.2863, "num_input_tokens_seen": 283769424, "step": 1176 }, { "epoch": 0.5164212910532276, "grad_norm": 0.18523825539113345, "learning_rate": 9.28597228471529e-05, "loss": 0.2851, "num_input_tokens_seen": 288866448, "step": 1197 }, { "epoch": 0.5254813137032842, "grad_norm": 0.29288899462077944, "learning_rate": 9.26134006168757e-05, "loss": 0.2798, "num_input_tokens_seen": 293944576, "step": 1218 }, { "epoch": 0.5345413363533409, "grad_norm": 0.21677248351508627, "learning_rate": 9.236323966434295e-05, "loss": 0.2728, "num_input_tokens_seen": 299090032, "step": 1239 }, { "epoch": 0.5436013590033975, "grad_norm": 0.17425284816339717, "learning_rate": 9.210926252468219e-05, "loss": 0.2756, "num_input_tokens_seen": 304016304, "step": 1260 }, { "epoch": 0.5526613816534541, "grad_norm": 0.19722321188451059, "learning_rate": 9.185149207679263e-05, "loss": 0.2747, "num_input_tokens_seen": 309084016, "step": 1281 }, { "epoch": 0.5617214043035108, "grad_norm": 0.15927450699962342, "learning_rate": 9.158995154128425e-05, "loss": 0.2772, "num_input_tokens_seen": 314201696, "step": 1302 }, { "epoch": 0.5707814269535674, "grad_norm": 0.144121234319095, "learning_rate": 9.132466447838597e-05, "loss": 0.2785, "num_input_tokens_seen": 319266256, "step": 1323 }, { "epoch": 0.579841449603624, "grad_norm": 0.20014275331573014, "learning_rate": 9.105565478582334e-05, "loss": 0.2755, "num_input_tokens_seen": 324468352, "step": 1344 }, { "epoch": 0.5889014722536806, "grad_norm": 0.1763537829210326, "learning_rate": 9.078294669666576e-05, "loss": 0.2708, "num_input_tokens_seen": 329566736, "step": 1365 }, { "epoch": 0.5979614949037373, "grad_norm": 0.20344878470372835, "learning_rate": 9.050656477714346e-05, "loss": 0.2729, "num_input_tokens_seen": 334661888, "step": 1386 }, { "epoch": 0.6070215175537939, "grad_norm": 0.16201525012659232, "learning_rate": 9.022653392443454e-05, "loss": 0.2754, "num_input_tokens_seen": 339784976, "step": 1407 }, { "epoch": 0.6160815402038505, "grad_norm": 0.17086654725622255, "learning_rate": 8.994287936442225e-05, "loss": 0.2742, "num_input_tokens_seen": 344776544, "step": 1428 }, { "epoch": 0.6251415628539071, "grad_norm": 0.17370971449213884, "learning_rate": 8.96556266494224e-05, "loss": 0.2703, "num_input_tokens_seen": 349731168, "step": 1449 }, { "epoch": 0.6342015855039638, "grad_norm": 0.1759254965690454, "learning_rate": 8.936480165588173e-05, "loss": 0.2756, "num_input_tokens_seen": 354694544, "step": 1470 }, { "epoch": 0.6432616081540203, "grad_norm": 0.1884165927777844, "learning_rate": 8.907043058204674e-05, "loss": 0.2698, "num_input_tokens_seen": 359871984, "step": 1491 }, { "epoch": 0.652321630804077, "grad_norm": 0.18427306909901323, "learning_rate": 8.877253994560382e-05, "loss": 0.2716, "num_input_tokens_seen": 364937440, "step": 1512 }, { "epoch": 0.6613816534541337, "grad_norm": 0.20911376204072243, "learning_rate": 8.847115658129039e-05, "loss": 0.2682, "num_input_tokens_seen": 369994848, "step": 1533 }, { "epoch": 0.6704416761041903, "grad_norm": 0.20741042143522315, "learning_rate": 8.816630763847755e-05, "loss": 0.2695, "num_input_tokens_seen": 374992544, "step": 1554 }, { "epoch": 0.6795016987542469, "grad_norm": 0.1643283959081191, "learning_rate": 8.785802057872446e-05, "loss": 0.2706, "num_input_tokens_seen": 380038624, "step": 1575 }, { "epoch": 0.6885617214043035, "grad_norm": 0.20102992236744546, "learning_rate": 8.754632317330447e-05, "loss": 0.2704, "num_input_tokens_seen": 385195792, "step": 1596 }, { "epoch": 0.6976217440543602, "grad_norm": 0.18834807757879243, "learning_rate": 8.723124350070347e-05, "loss": 0.2707, "num_input_tokens_seen": 390195296, "step": 1617 }, { "epoch": 0.7066817667044167, "grad_norm": 0.15261075530832655, "learning_rate": 8.691280994409043e-05, "loss": 0.2653, "num_input_tokens_seen": 395353440, "step": 1638 }, { "epoch": 0.7157417893544734, "grad_norm": 0.20682506960801342, "learning_rate": 8.659105118876068e-05, "loss": 0.2649, "num_input_tokens_seen": 400444080, "step": 1659 }, { "epoch": 0.72480181200453, "grad_norm": 0.21733357068498219, "learning_rate": 8.626599621955179e-05, "loss": 0.2652, "num_input_tokens_seen": 405492112, "step": 1680 }, { "epoch": 0.7338618346545867, "grad_norm": 0.18286757761891814, "learning_rate": 8.593767431823255e-05, "loss": 0.2638, "num_input_tokens_seen": 410467584, "step": 1701 }, { "epoch": 0.7429218573046432, "grad_norm": 0.16764027437130122, "learning_rate": 8.56061150608652e-05, "loss": 0.2685, "num_input_tokens_seen": 415550320, "step": 1722 }, { "epoch": 0.7519818799546999, "grad_norm": 0.17921646972994934, "learning_rate": 8.527134831514117e-05, "loss": 0.2584, "num_input_tokens_seen": 420503712, "step": 1743 }, { "epoch": 0.7610419026047565, "grad_norm": 0.19062465197166928, "learning_rate": 8.493340423769053e-05, "loss": 0.2607, "num_input_tokens_seen": 425602800, "step": 1764 }, { "epoch": 0.7701019252548131, "grad_norm": 0.19476076523413036, "learning_rate": 8.459231327136532e-05, "loss": 0.2652, "num_input_tokens_seen": 430546320, "step": 1785 }, { "epoch": 0.7791619479048698, "grad_norm": 0.18756178702354967, "learning_rate": 8.42481061424973e-05, "loss": 0.2604, "num_input_tokens_seen": 435625600, "step": 1806 }, { "epoch": 0.7882219705549264, "grad_norm": 0.16871375853816825, "learning_rate": 8.390081385812993e-05, "loss": 0.2603, "num_input_tokens_seen": 440695024, "step": 1827 }, { "epoch": 0.797281993204983, "grad_norm": 0.1669594920851862, "learning_rate": 8.355046770322528e-05, "loss": 0.2576, "num_input_tokens_seen": 445877360, "step": 1848 }, { "epoch": 0.8063420158550396, "grad_norm": 0.20147791313721689, "learning_rate": 8.319709923784573e-05, "loss": 0.2622, "num_input_tokens_seen": 451021040, "step": 1869 }, { "epoch": 0.8154020385050963, "grad_norm": 0.17108850294819108, "learning_rate": 8.284074029431099e-05, "loss": 0.2587, "num_input_tokens_seen": 456101872, "step": 1890 }, { "epoch": 0.8244620611551529, "grad_norm": 0.18516681989871883, "learning_rate": 8.248142297433057e-05, "loss": 0.2575, "num_input_tokens_seen": 461365920, "step": 1911 }, { "epoch": 0.8335220838052095, "grad_norm": 0.20285356102658042, "learning_rate": 8.211917964611196e-05, "loss": 0.2573, "num_input_tokens_seen": 466466096, "step": 1932 }, { "epoch": 0.8425821064552661, "grad_norm": 0.207923488217522, "learning_rate": 8.175404294144482e-05, "loss": 0.26, "num_input_tokens_seen": 471541104, "step": 1953 }, { "epoch": 0.8516421291053228, "grad_norm": 0.19850908270608497, "learning_rate": 8.138604575276143e-05, "loss": 0.2571, "num_input_tokens_seen": 476646096, "step": 1974 }, { "epoch": 0.8607021517553793, "grad_norm": 0.1821198820367163, "learning_rate": 8.10152212301737e-05, "loss": 0.251, "num_input_tokens_seen": 481695200, "step": 1995 }, { "epoch": 0.869762174405436, "grad_norm": 0.1623421619904062, "learning_rate": 8.064160277848682e-05, "loss": 0.2614, "num_input_tokens_seen": 486706656, "step": 2016 }, { "epoch": 0.8788221970554927, "grad_norm": 0.1774308248272562, "learning_rate": 8.026522405419023e-05, "loss": 0.2528, "num_input_tokens_seen": 491943424, "step": 2037 }, { "epoch": 0.8878822197055493, "grad_norm": 0.21003241654174584, "learning_rate": 7.988611896242559e-05, "loss": 0.2571, "num_input_tokens_seen": 496925888, "step": 2058 }, { "epoch": 0.8969422423556059, "grad_norm": 0.20014809740395048, "learning_rate": 7.950432165393259e-05, "loss": 0.2547, "num_input_tokens_seen": 502065216, "step": 2079 }, { "epoch": 0.9060022650056625, "grad_norm": 0.17154274701388803, "learning_rate": 7.911986652197262e-05, "loss": 0.2538, "num_input_tokens_seen": 507089616, "step": 2100 }, { "epoch": 0.9150622876557192, "grad_norm": 0.17929198920009218, "learning_rate": 7.873278819923048e-05, "loss": 0.2551, "num_input_tokens_seen": 512060336, "step": 2121 }, { "epoch": 0.9241223103057757, "grad_norm": 0.16398954046091754, "learning_rate": 7.834312155469456e-05, "loss": 0.2515, "num_input_tokens_seen": 517133680, "step": 2142 }, { "epoch": 0.9331823329558324, "grad_norm": 0.17469600332499013, "learning_rate": 7.79509016905158e-05, "loss": 0.2526, "num_input_tokens_seen": 522229616, "step": 2163 }, { "epoch": 0.942242355605889, "grad_norm": 0.15900929966723312, "learning_rate": 7.755616393884561e-05, "loss": 0.2482, "num_input_tokens_seen": 527368864, "step": 2184 }, { "epoch": 0.9513023782559457, "grad_norm": 0.18254122861853214, "learning_rate": 7.715894385865299e-05, "loss": 0.2516, "num_input_tokens_seen": 532499712, "step": 2205 }, { "epoch": 0.9603624009060022, "grad_norm": 0.19001641667974054, "learning_rate": 7.675927723252134e-05, "loss": 0.2493, "num_input_tokens_seen": 537438224, "step": 2226 }, { "epoch": 0.9694224235560589, "grad_norm": 0.16454856189702688, "learning_rate": 7.635720006342512e-05, "loss": 0.2465, "num_input_tokens_seen": 542603472, "step": 2247 }, { "epoch": 0.9784824462061155, "grad_norm": 0.18435585737458549, "learning_rate": 7.595274857148652e-05, "loss": 0.2486, "num_input_tokens_seen": 547622688, "step": 2268 }, { "epoch": 0.9875424688561721, "grad_norm": 0.16595693704140477, "learning_rate": 7.554595919071268e-05, "loss": 0.2472, "num_input_tokens_seen": 552751232, "step": 2289 }, { "epoch": 0.9966024915062288, "grad_norm": 0.16765543588823353, "learning_rate": 7.513686856571368e-05, "loss": 0.2471, "num_input_tokens_seen": 557786736, "step": 2310 }, { "epoch": 1.0060400151000377, "grad_norm": 0.17133389174596797, "learning_rate": 7.472551354840145e-05, "loss": 0.2361, "num_input_tokens_seen": 562993712, "step": 2331 }, { "epoch": 1.0151000377500943, "grad_norm": 0.18279159983012744, "learning_rate": 7.431193119467008e-05, "loss": 0.217, "num_input_tokens_seen": 568021744, "step": 2352 }, { "epoch": 1.024160060400151, "grad_norm": 0.266530218109695, "learning_rate": 7.389615876105774e-05, "loss": 0.2145, "num_input_tokens_seen": 572956608, "step": 2373 }, { "epoch": 1.0332200830502076, "grad_norm": 0.178808088687151, "learning_rate": 7.347823370139042e-05, "loss": 0.2179, "num_input_tokens_seen": 577973792, "step": 2394 }, { "epoch": 1.0422801057002642, "grad_norm": 0.15535901011827938, "learning_rate": 7.30581936634082e-05, "loss": 0.2098, "num_input_tokens_seen": 582948368, "step": 2415 }, { "epoch": 1.051340128350321, "grad_norm": 0.1879255010231289, "learning_rate": 7.263607648537364e-05, "loss": 0.2174, "num_input_tokens_seen": 587973936, "step": 2436 }, { "epoch": 1.0604001510003775, "grad_norm": 0.18116579180636055, "learning_rate": 7.221192019266332e-05, "loss": 0.2187, "num_input_tokens_seen": 593048624, "step": 2457 }, { "epoch": 1.069460173650434, "grad_norm": 0.16067683672474237, "learning_rate": 7.178576299434238e-05, "loss": 0.2162, "num_input_tokens_seen": 598171840, "step": 2478 }, { "epoch": 1.0785201963004907, "grad_norm": 0.16890347919356866, "learning_rate": 7.135764327972261e-05, "loss": 0.2202, "num_input_tokens_seen": 603168000, "step": 2499 }, { "epoch": 1.0875802189505475, "grad_norm": 0.171528446871021, "learning_rate": 7.092759961490415e-05, "loss": 0.2237, "num_input_tokens_seen": 608280544, "step": 2520 }, { "epoch": 1.096640241600604, "grad_norm": 0.16674379731500746, "learning_rate": 7.049567073930143e-05, "loss": 0.2199, "num_input_tokens_seen": 613215280, "step": 2541 }, { "epoch": 1.1057002642506606, "grad_norm": 0.18816045716369076, "learning_rate": 7.006189556215345e-05, "loss": 0.2189, "num_input_tokens_seen": 618261984, "step": 2562 }, { "epoch": 1.1147602869007172, "grad_norm": 0.16658411013718444, "learning_rate": 6.962631315901861e-05, "loss": 0.2163, "num_input_tokens_seen": 623492320, "step": 2583 }, { "epoch": 1.123820309550774, "grad_norm": 0.19772642288828662, "learning_rate": 6.918896276825485e-05, "loss": 0.2157, "num_input_tokens_seen": 628563152, "step": 2604 }, { "epoch": 1.1328803322008305, "grad_norm": 0.17075347503361357, "learning_rate": 6.874988378748483e-05, "loss": 0.2141, "num_input_tokens_seen": 633639472, "step": 2625 }, { "epoch": 1.141940354850887, "grad_norm": 0.14444472527009938, "learning_rate": 6.830911577004698e-05, "loss": 0.2185, "num_input_tokens_seen": 638639648, "step": 2646 }, { "epoch": 1.1510003775009436, "grad_norm": 0.17948926082904845, "learning_rate": 6.786669842143236e-05, "loss": 0.2125, "num_input_tokens_seen": 643743632, "step": 2667 }, { "epoch": 1.1600604001510004, "grad_norm": 0.1845862453656852, "learning_rate": 6.742267159570795e-05, "loss": 0.2138, "num_input_tokens_seen": 648823584, "step": 2688 }, { "epoch": 1.169120422801057, "grad_norm": 0.1611490189496428, "learning_rate": 6.697707529192648e-05, "loss": 0.2152, "num_input_tokens_seen": 653949232, "step": 2709 }, { "epoch": 1.1781804454511136, "grad_norm": 0.19293414419678218, "learning_rate": 6.652994965052319e-05, "loss": 0.2125, "num_input_tokens_seen": 658996016, "step": 2730 }, { "epoch": 1.1872404681011703, "grad_norm": 0.1887515649690068, "learning_rate": 6.608133494969994e-05, "loss": 0.2123, "num_input_tokens_seen": 664102304, "step": 2751 }, { "epoch": 1.196300490751227, "grad_norm": 0.15668371616625942, "learning_rate": 6.563127160179671e-05, "loss": 0.2101, "num_input_tokens_seen": 669123584, "step": 2772 }, { "epoch": 1.2053605134012835, "grad_norm": 0.17871386948979864, "learning_rate": 6.517980014965139e-05, "loss": 0.209, "num_input_tokens_seen": 674256592, "step": 2793 }, { "epoch": 1.21442053605134, "grad_norm": 0.16852785549912133, "learning_rate": 6.472696126294732e-05, "loss": 0.2122, "num_input_tokens_seen": 679248208, "step": 2814 }, { "epoch": 1.2234805587013968, "grad_norm": 0.1758847676430736, "learning_rate": 6.427279573454985e-05, "loss": 0.2093, "num_input_tokens_seen": 684325632, "step": 2835 }, { "epoch": 1.2325405813514534, "grad_norm": 0.19298009720432585, "learning_rate": 6.381734447683152e-05, "loss": 0.2114, "num_input_tokens_seen": 689336736, "step": 2856 }, { "epoch": 1.24160060400151, "grad_norm": 0.16439303725722001, "learning_rate": 6.33606485179866e-05, "loss": 0.2111, "num_input_tokens_seen": 694382688, "step": 2877 }, { "epoch": 1.2506606266515665, "grad_norm": 0.18140167193790194, "learning_rate": 6.290274899833517e-05, "loss": 0.2086, "num_input_tokens_seen": 699371792, "step": 2898 }, { "epoch": 1.2597206493016233, "grad_norm": 0.17151657072780352, "learning_rate": 6.244368716661713e-05, "loss": 0.2095, "num_input_tokens_seen": 704404624, "step": 2919 }, { "epoch": 1.2687806719516799, "grad_norm": 0.2052334824788225, "learning_rate": 6.198350437627632e-05, "loss": 0.2083, "num_input_tokens_seen": 709451392, "step": 2940 }, { "epoch": 1.2778406946017364, "grad_norm": 0.18426322385396474, "learning_rate": 6.152224208173533e-05, "loss": 0.2088, "num_input_tokens_seen": 714486848, "step": 2961 }, { "epoch": 1.2869007172517932, "grad_norm": 0.1949416856665576, "learning_rate": 6.10599418346613e-05, "loss": 0.2118, "num_input_tokens_seen": 719556448, "step": 2982 }, { "epoch": 1.2959607399018498, "grad_norm": 0.16224663869829734, "learning_rate": 6.059664528022266e-05, "loss": 0.2058, "num_input_tokens_seen": 724625472, "step": 3003 }, { "epoch": 1.3050207625519064, "grad_norm": 0.1742996675080445, "learning_rate": 6.0132394153337755e-05, "loss": 0.2065, "num_input_tokens_seen": 729794320, "step": 3024 }, { "epoch": 1.3140807852019631, "grad_norm": 0.17806291944392397, "learning_rate": 5.9667230274915174e-05, "loss": 0.207, "num_input_tokens_seen": 734753392, "step": 3045 }, { "epoch": 1.3231408078520197, "grad_norm": 0.18436964692073765, "learning_rate": 5.920119554808651e-05, "loss": 0.2049, "num_input_tokens_seen": 739827088, "step": 3066 }, { "epoch": 1.3322008305020763, "grad_norm": 0.16939954583438047, "learning_rate": 5.873433195443152e-05, "loss": 0.208, "num_input_tokens_seen": 744847184, "step": 3087 }, { "epoch": 1.3412608531521328, "grad_norm": 0.1635889048824763, "learning_rate": 5.82666815501964e-05, "loss": 0.2047, "num_input_tokens_seen": 749874880, "step": 3108 }, { "epoch": 1.3503208758021894, "grad_norm": 0.16527453811089068, "learning_rate": 5.779828646250521e-05, "loss": 0.2022, "num_input_tokens_seen": 754848400, "step": 3129 }, { "epoch": 1.3593808984522462, "grad_norm": 0.18777829379599956, "learning_rate": 5.7329188885565e-05, "loss": 0.2073, "num_input_tokens_seen": 759913728, "step": 3150 }, { "epoch": 1.3684409211023028, "grad_norm": 0.15155859962967053, "learning_rate": 5.6859431076864755e-05, "loss": 0.2056, "num_input_tokens_seen": 765009632, "step": 3171 }, { "epoch": 1.3775009437523593, "grad_norm": 0.154719416013354, "learning_rate": 5.6389055353368826e-05, "loss": 0.2056, "num_input_tokens_seen": 770016704, "step": 3192 }, { "epoch": 1.386560966402416, "grad_norm": 0.16028731910302912, "learning_rate": 5.591810408770493e-05, "loss": 0.2037, "num_input_tokens_seen": 775197264, "step": 3213 }, { "epoch": 1.3956209890524727, "grad_norm": 0.15645057759509218, "learning_rate": 5.544661970434696e-05, "loss": 0.2042, "num_input_tokens_seen": 780209328, "step": 3234 }, { "epoch": 1.4046810117025292, "grad_norm": 0.17948486028711083, "learning_rate": 5.497464467579351e-05, "loss": 0.2011, "num_input_tokens_seen": 785402112, "step": 3255 }, { "epoch": 1.4137410343525858, "grad_norm": 0.16643670433003308, "learning_rate": 5.450222151874166e-05, "loss": 0.2015, "num_input_tokens_seen": 790429216, "step": 3276 }, { "epoch": 1.4228010570026426, "grad_norm": 0.17345996800344896, "learning_rate": 5.402939279025705e-05, "loss": 0.2005, "num_input_tokens_seen": 795543264, "step": 3297 }, { "epoch": 1.4318610796526992, "grad_norm": 0.1663960297870033, "learning_rate": 5.355620108394018e-05, "loss": 0.2052, "num_input_tokens_seen": 800533200, "step": 3318 }, { "epoch": 1.4409211023027557, "grad_norm": 0.15958882963062815, "learning_rate": 5.308268902608958e-05, "loss": 0.2042, "num_input_tokens_seen": 805542720, "step": 3339 }, { "epoch": 1.4499811249528123, "grad_norm": 0.17053093118312482, "learning_rate": 5.2608899271861765e-05, "loss": 0.1984, "num_input_tokens_seen": 810549376, "step": 3360 }, { "epoch": 1.459041147602869, "grad_norm": 0.1731330043830458, "learning_rate": 5.213487450142892e-05, "loss": 0.2038, "num_input_tokens_seen": 815599232, "step": 3381 }, { "epoch": 1.4681011702529256, "grad_norm": 0.17941197802062514, "learning_rate": 5.166065741613402e-05, "loss": 0.2012, "num_input_tokens_seen": 820700608, "step": 3402 }, { "epoch": 1.4771611929029822, "grad_norm": 0.1844938407002505, "learning_rate": 5.118629073464424e-05, "loss": 0.1987, "num_input_tokens_seen": 825686176, "step": 3423 }, { "epoch": 1.486221215553039, "grad_norm": 0.1748567417166297, "learning_rate": 5.071181718910283e-05, "loss": 0.1986, "num_input_tokens_seen": 830730000, "step": 3444 }, { "epoch": 1.4952812382030956, "grad_norm": 0.15694569688029672, "learning_rate": 5.023727952127954e-05, "loss": 0.1987, "num_input_tokens_seen": 835738032, "step": 3465 }, { "epoch": 1.5043412608531521, "grad_norm": 0.18575993893540607, "learning_rate": 4.976272047872046e-05, "loss": 0.1952, "num_input_tokens_seen": 840806528, "step": 3486 }, { "epoch": 1.513401283503209, "grad_norm": 0.16316391964141339, "learning_rate": 4.9288182810897184e-05, "loss": 0.1957, "num_input_tokens_seen": 845877808, "step": 3507 }, { "epoch": 1.5224613061532652, "grad_norm": 0.1809977532876625, "learning_rate": 4.8813709265355766e-05, "loss": 0.1957, "num_input_tokens_seen": 851002432, "step": 3528 }, { "epoch": 1.531521328803322, "grad_norm": 0.15896204329046001, "learning_rate": 4.8339342583866005e-05, "loss": 0.197, "num_input_tokens_seen": 856037440, "step": 3549 }, { "epoch": 1.5405813514533786, "grad_norm": 0.1848696286617871, "learning_rate": 4.7865125498571086e-05, "loss": 0.1957, "num_input_tokens_seen": 860972624, "step": 3570 }, { "epoch": 1.5496413741034352, "grad_norm": 0.16411859849940666, "learning_rate": 4.739110072813823e-05, "loss": 0.1926, "num_input_tokens_seen": 866078128, "step": 3591 }, { "epoch": 1.558701396753492, "grad_norm": 0.15293153546751434, "learning_rate": 4.6917310973910425e-05, "loss": 0.1934, "num_input_tokens_seen": 871290720, "step": 3612 }, { "epoch": 1.5677614194035485, "grad_norm": 0.18580264173261662, "learning_rate": 4.6443798916059836e-05, "loss": 0.1961, "num_input_tokens_seen": 876353920, "step": 3633 }, { "epoch": 1.576821442053605, "grad_norm": 0.16117670144515006, "learning_rate": 4.597060720974298e-05, "loss": 0.1902, "num_input_tokens_seen": 881469536, "step": 3654 }, { "epoch": 1.5858814647036619, "grad_norm": 0.1821844142116438, "learning_rate": 4.549777848125833e-05, "loss": 0.1971, "num_input_tokens_seen": 886532048, "step": 3675 }, { "epoch": 1.5949414873537184, "grad_norm": 0.188981157327872, "learning_rate": 4.50253553242065e-05, "loss": 0.1952, "num_input_tokens_seen": 891565152, "step": 3696 }, { "epoch": 1.604001510003775, "grad_norm": 0.1663775536476532, "learning_rate": 4.4553380295653053e-05, "loss": 0.1908, "num_input_tokens_seen": 896603568, "step": 3717 }, { "epoch": 1.6130615326538318, "grad_norm": 0.16695660636413406, "learning_rate": 4.40818959122951e-05, "loss": 0.1945, "num_input_tokens_seen": 901703264, "step": 3738 }, { "epoch": 1.6221215553038881, "grad_norm": 0.18003132042487852, "learning_rate": 4.361094464663118e-05, "loss": 0.1911, "num_input_tokens_seen": 906846256, "step": 3759 }, { "epoch": 1.631181577953945, "grad_norm": 0.16377146934729214, "learning_rate": 4.3140568923135264e-05, "loss": 0.193, "num_input_tokens_seen": 911964272, "step": 3780 }, { "epoch": 1.6402416006040015, "grad_norm": 0.1711801561805431, "learning_rate": 4.267081111443501e-05, "loss": 0.1898, "num_input_tokens_seen": 917101840, "step": 3801 }, { "epoch": 1.649301623254058, "grad_norm": 0.1743609898038798, "learning_rate": 4.22017135374948e-05, "loss": 0.1852, "num_input_tokens_seen": 922205664, "step": 3822 }, { "epoch": 1.6583616459041148, "grad_norm": 0.17938627926996303, "learning_rate": 4.1733318449803624e-05, "loss": 0.1863, "num_input_tokens_seen": 927302560, "step": 3843 }, { "epoch": 1.6674216685541714, "grad_norm": 0.16947333759434738, "learning_rate": 4.1265668045568495e-05, "loss": 0.1882, "num_input_tokens_seen": 932325424, "step": 3864 }, { "epoch": 1.676481691204228, "grad_norm": 0.16639553173104588, "learning_rate": 4.079880445191351e-05, "loss": 0.1893, "num_input_tokens_seen": 937438464, "step": 3885 }, { "epoch": 1.6855417138542848, "grad_norm": 0.14651023615133163, "learning_rate": 4.033276972508484e-05, "loss": 0.1885, "num_input_tokens_seen": 942617840, "step": 3906 }, { "epoch": 1.6946017365043413, "grad_norm": 0.17812367097504705, "learning_rate": 3.9867605846662256e-05, "loss": 0.1883, "num_input_tokens_seen": 947823200, "step": 3927 }, { "epoch": 1.7036617591543979, "grad_norm": 0.1872194452488721, "learning_rate": 3.940335471977734e-05, "loss": 0.1871, "num_input_tokens_seen": 952872784, "step": 3948 }, { "epoch": 1.7127217818044547, "grad_norm": 0.1643081487678093, "learning_rate": 3.89400581653387e-05, "loss": 0.1853, "num_input_tokens_seen": 957908608, "step": 3969 }, { "epoch": 1.721781804454511, "grad_norm": 0.1753318656372752, "learning_rate": 3.847775791826468e-05, "loss": 0.1862, "num_input_tokens_seen": 962972208, "step": 3990 }, { "epoch": 1.7308418271045678, "grad_norm": 0.15851578473823177, "learning_rate": 3.801649562372371e-05, "loss": 0.1913, "num_input_tokens_seen": 968020256, "step": 4011 }, { "epoch": 1.7399018497546244, "grad_norm": 0.1812244935434252, "learning_rate": 3.755631283338287e-05, "loss": 0.1908, "num_input_tokens_seen": 973116912, "step": 4032 }, { "epoch": 1.748961872404681, "grad_norm": 0.16265272189557067, "learning_rate": 3.709725100166482e-05, "loss": 0.1839, "num_input_tokens_seen": 978276224, "step": 4053 }, { "epoch": 1.7580218950547377, "grad_norm": 0.16695166226650535, "learning_rate": 3.663935148201341e-05, "loss": 0.1869, "num_input_tokens_seen": 983499184, "step": 4074 }, { "epoch": 1.7670819177047943, "grad_norm": 0.17046617487207735, "learning_rate": 3.618265552316849e-05, "loss": 0.1884, "num_input_tokens_seen": 988511216, "step": 4095 }, { "epoch": 1.7761419403548508, "grad_norm": 0.16853521697476523, "learning_rate": 3.572720426545017e-05, "loss": 0.1863, "num_input_tokens_seen": 993542272, "step": 4116 }, { "epoch": 1.7852019630049076, "grad_norm": 0.16196055724715774, "learning_rate": 3.5273038737052675e-05, "loss": 0.1884, "num_input_tokens_seen": 998561584, "step": 4137 }, { "epoch": 1.794261985654964, "grad_norm": 0.17704958458091835, "learning_rate": 3.482019985034861e-05, "loss": 0.1815, "num_input_tokens_seen": 1003535696, "step": 4158 }, { "epoch": 1.8033220083050208, "grad_norm": 0.17212954264417213, "learning_rate": 3.43687283982033e-05, "loss": 0.1798, "num_input_tokens_seen": 1008610432, "step": 4179 }, { "epoch": 1.8123820309550775, "grad_norm": 0.1642508897074481, "learning_rate": 3.391866505030009e-05, "loss": 0.1797, "num_input_tokens_seen": 1013577840, "step": 4200 }, { "epoch": 1.821442053605134, "grad_norm": 0.1895193964135349, "learning_rate": 3.347005034947681e-05, "loss": 0.1773, "num_input_tokens_seen": 1018549888, "step": 4221 }, { "epoch": 1.8305020762551907, "grad_norm": 0.18935672208270557, "learning_rate": 3.3022924708073524e-05, "loss": 0.1828, "num_input_tokens_seen": 1023498368, "step": 4242 }, { "epoch": 1.8395620989052472, "grad_norm": 0.15473627402095172, "learning_rate": 3.257732840429206e-05, "loss": 0.18, "num_input_tokens_seen": 1028542992, "step": 4263 }, { "epoch": 1.8486221215553038, "grad_norm": 0.17782850850732204, "learning_rate": 3.2133301578567646e-05, "loss": 0.1825, "num_input_tokens_seen": 1033574288, "step": 4284 }, { "epoch": 1.8576821442053606, "grad_norm": 0.17879475744218412, "learning_rate": 3.169088422995304e-05, "loss": 0.1776, "num_input_tokens_seen": 1038606208, "step": 4305 }, { "epoch": 1.8667421668554172, "grad_norm": 0.16166293718253705, "learning_rate": 3.125011621251516e-05, "loss": 0.1768, "num_input_tokens_seen": 1043770704, "step": 4326 }, { "epoch": 1.8758021895054737, "grad_norm": 0.1607230134601091, "learning_rate": 3.081103723174515e-05, "loss": 0.1778, "num_input_tokens_seen": 1048829664, "step": 4347 }, { "epoch": 1.8848622121555305, "grad_norm": 0.159447656379203, "learning_rate": 3.0373686840981397e-05, "loss": 0.1788, "num_input_tokens_seen": 1053950224, "step": 4368 }, { "epoch": 1.8939222348055869, "grad_norm": 0.1674766446494019, "learning_rate": 2.9938104437846572e-05, "loss": 0.176, "num_input_tokens_seen": 1059119888, "step": 4389 }, { "epoch": 1.9029822574556436, "grad_norm": 0.17753675611302996, "learning_rate": 2.950432926069857e-05, "loss": 0.1783, "num_input_tokens_seen": 1064177088, "step": 4410 }, { "epoch": 1.9120422801057002, "grad_norm": 0.17087252328331373, "learning_rate": 2.9072400385095865e-05, "loss": 0.178, "num_input_tokens_seen": 1069200928, "step": 4431 }, { "epoch": 1.9211023027557568, "grad_norm": 0.16133227423173738, "learning_rate": 2.864235672027741e-05, "loss": 0.1759, "num_input_tokens_seen": 1074313840, "step": 4452 }, { "epoch": 1.9301623254058136, "grad_norm": 0.1865580464555286, "learning_rate": 2.8214237005657627e-05, "loss": 0.1769, "num_input_tokens_seen": 1079348080, "step": 4473 }, { "epoch": 1.9392223480558701, "grad_norm": 0.17483638643553473, "learning_rate": 2.7788079807336692e-05, "loss": 0.1761, "num_input_tokens_seen": 1084415072, "step": 4494 }, { "epoch": 1.9482823707059267, "grad_norm": 0.16127203478332483, "learning_rate": 2.7363923514626367e-05, "loss": 0.1762, "num_input_tokens_seen": 1089576528, "step": 4515 }, { "epoch": 1.9573423933559835, "grad_norm": 0.1818665955450248, "learning_rate": 2.6941806336591808e-05, "loss": 0.1715, "num_input_tokens_seen": 1094741664, "step": 4536 }, { "epoch": 1.96640241600604, "grad_norm": 0.16510174569454042, "learning_rate": 2.6521766298609584e-05, "loss": 0.1728, "num_input_tokens_seen": 1099708896, "step": 4557 }, { "epoch": 1.9754624386560966, "grad_norm": 0.17393602608748607, "learning_rate": 2.610384123894229e-05, "loss": 0.175, "num_input_tokens_seen": 1104824512, "step": 4578 }, { "epoch": 1.9845224613061534, "grad_norm": 0.18901915034549496, "learning_rate": 2.568806880532991e-05, "loss": 0.1736, "num_input_tokens_seen": 1109954160, "step": 4599 }, { "epoch": 1.9935824839562097, "grad_norm": 0.19336693087348367, "learning_rate": 2.5274486451598565e-05, "loss": 0.1704, "num_input_tokens_seen": 1115130992, "step": 4620 }, { "epoch": 2.003020007550019, "grad_norm": 0.192558427240515, "learning_rate": 2.4863131434286342e-05, "loss": 0.1548, "num_input_tokens_seen": 1120294784, "step": 4641 }, { "epoch": 2.0120800302000754, "grad_norm": 0.19360993518356076, "learning_rate": 2.4454040809287342e-05, "loss": 0.1188, "num_input_tokens_seen": 1125375728, "step": 4662 }, { "epoch": 2.021140052850132, "grad_norm": 0.19346692314512148, "learning_rate": 2.4047251428513485e-05, "loss": 0.1176, "num_input_tokens_seen": 1130663488, "step": 4683 }, { "epoch": 2.0302000755001886, "grad_norm": 0.1915500646603155, "learning_rate": 2.364279993657487e-05, "loss": 0.1166, "num_input_tokens_seen": 1135729856, "step": 4704 }, { "epoch": 2.0392600981502453, "grad_norm": 0.21320689744431512, "learning_rate": 2.3240722767478657e-05, "loss": 0.1129, "num_input_tokens_seen": 1140728768, "step": 4725 }, { "epoch": 2.048320120800302, "grad_norm": 0.20002232427856995, "learning_rate": 2.2841056141347038e-05, "loss": 0.1122, "num_input_tokens_seen": 1145810672, "step": 4746 }, { "epoch": 2.0573801434503585, "grad_norm": 0.21228559927967805, "learning_rate": 2.2443836061154415e-05, "loss": 0.1145, "num_input_tokens_seen": 1150862064, "step": 4767 }, { "epoch": 2.0664401661004153, "grad_norm": 0.19792768108065947, "learning_rate": 2.2049098309484195e-05, "loss": 0.1153, "num_input_tokens_seen": 1155954544, "step": 4788 }, { "epoch": 2.075500188750472, "grad_norm": 0.21247296887779493, "learning_rate": 2.1656878445305447e-05, "loss": 0.1152, "num_input_tokens_seen": 1161054256, "step": 4809 }, { "epoch": 2.0845602114005284, "grad_norm": 0.19109010163603735, "learning_rate": 2.1267211800769528e-05, "loss": 0.1148, "num_input_tokens_seen": 1166056688, "step": 4830 }, { "epoch": 2.093620234050585, "grad_norm": 0.19679782828606215, "learning_rate": 2.088013347802738e-05, "loss": 0.1119, "num_input_tokens_seen": 1171231104, "step": 4851 }, { "epoch": 2.102680256700642, "grad_norm": 0.2128224999872872, "learning_rate": 2.0495678346067414e-05, "loss": 0.1101, "num_input_tokens_seen": 1176284976, "step": 4872 }, { "epoch": 2.1117402793506983, "grad_norm": 0.2123206811047115, "learning_rate": 2.011388103757442e-05, "loss": 0.1139, "num_input_tokens_seen": 1181400944, "step": 4893 }, { "epoch": 2.120800302000755, "grad_norm": 0.2071017368245751, "learning_rate": 1.973477594580977e-05, "loss": 0.1116, "num_input_tokens_seen": 1186527776, "step": 4914 }, { "epoch": 2.1298603246508114, "grad_norm": 0.17323287993849096, "learning_rate": 1.9358397221513176e-05, "loss": 0.112, "num_input_tokens_seen": 1191661680, "step": 4935 }, { "epoch": 2.138920347300868, "grad_norm": 0.20213151950682676, "learning_rate": 1.8984778769826316e-05, "loss": 0.1106, "num_input_tokens_seen": 1196759648, "step": 4956 }, { "epoch": 2.147980369950925, "grad_norm": 0.19700292148625387, "learning_rate": 1.8613954247238586e-05, "loss": 0.1124, "num_input_tokens_seen": 1201857104, "step": 4977 }, { "epoch": 2.1570403926009813, "grad_norm": 0.21527000496492768, "learning_rate": 1.82459570585552e-05, "loss": 0.1136, "num_input_tokens_seen": 1206927520, "step": 4998 }, { "epoch": 2.157903251900987, "eval_loss": 0.19485081732273102, "eval_runtime": 529.4687, "eval_samples_per_second": 17.331, "eval_steps_per_second": 1.084, "num_input_tokens_seen": 1207385424, "step": 5000 }, { "epoch": 2.166100415251038, "grad_norm": 0.2275158963594303, "learning_rate": 1.7880820353888056e-05, "loss": 0.1102, "num_input_tokens_seen": 1211875824, "step": 5019 }, { "epoch": 2.175160437901095, "grad_norm": 0.20450931488489404, "learning_rate": 1.751857702566944e-05, "loss": 0.113, "num_input_tokens_seen": 1216954688, "step": 5040 }, { "epoch": 2.1842204605511513, "grad_norm": 0.21173943893990088, "learning_rate": 1.7159259705689e-05, "loss": 0.1104, "num_input_tokens_seen": 1221976560, "step": 5061 }, { "epoch": 2.193280483201208, "grad_norm": 0.20717537063008165, "learning_rate": 1.6802900762154267e-05, "loss": 0.1152, "num_input_tokens_seen": 1226975776, "step": 5082 }, { "epoch": 2.2023405058512644, "grad_norm": 0.19831781893791461, "learning_rate": 1.644953229677474e-05, "loss": 0.1097, "num_input_tokens_seen": 1231998784, "step": 5103 }, { "epoch": 2.211400528501321, "grad_norm": 0.18492554370023317, "learning_rate": 1.609918614187009e-05, "loss": 0.1111, "num_input_tokens_seen": 1236990864, "step": 5124 }, { "epoch": 2.220460551151378, "grad_norm": 0.20016085842409992, "learning_rate": 1.575189385750271e-05, "loss": 0.1104, "num_input_tokens_seen": 1242051280, "step": 5145 }, { "epoch": 2.2295205738014343, "grad_norm": 0.2070887723839001, "learning_rate": 1.540768672863468e-05, "loss": 0.1075, "num_input_tokens_seen": 1247127040, "step": 5166 }, { "epoch": 2.238580596451491, "grad_norm": 0.1925213910719394, "learning_rate": 1.5066595762309477e-05, "loss": 0.1093, "num_input_tokens_seen": 1252158672, "step": 5187 }, { "epoch": 2.247640619101548, "grad_norm": 0.205831347337121, "learning_rate": 1.4728651684858834e-05, "loss": 0.1126, "num_input_tokens_seen": 1257321184, "step": 5208 }, { "epoch": 2.2567006417516042, "grad_norm": 0.19926488557298117, "learning_rate": 1.4393884939134833e-05, "loss": 0.1064, "num_input_tokens_seen": 1262315984, "step": 5229 }, { "epoch": 2.265760664401661, "grad_norm": 0.18546254868875062, "learning_rate": 1.4062325681767469e-05, "loss": 0.1096, "num_input_tokens_seen": 1267351616, "step": 5250 }, { "epoch": 2.274820687051718, "grad_norm": 0.21880629906349583, "learning_rate": 1.3734003780448218e-05, "loss": 0.1089, "num_input_tokens_seen": 1272350592, "step": 5271 }, { "epoch": 2.283880709701774, "grad_norm": 0.1996371660776893, "learning_rate": 1.340894881123932e-05, "loss": 0.1093, "num_input_tokens_seen": 1277314160, "step": 5292 }, { "epoch": 2.292940732351831, "grad_norm": 0.18322023913039737, "learning_rate": 1.308719005590957e-05, "loss": 0.1064, "num_input_tokens_seen": 1282348896, "step": 5313 }, { "epoch": 2.3020007550018873, "grad_norm": 0.19825429674508396, "learning_rate": 1.276875649929654e-05, "loss": 0.1103, "num_input_tokens_seen": 1287503120, "step": 5334 }, { "epoch": 2.311060777651944, "grad_norm": 0.20100225641083314, "learning_rate": 1.2453676826695532e-05, "loss": 0.1077, "num_input_tokens_seen": 1292488224, "step": 5355 }, { "epoch": 2.320120800302001, "grad_norm": 0.19869949736224346, "learning_rate": 1.2141979421275545e-05, "loss": 0.1051, "num_input_tokens_seen": 1297613792, "step": 5376 }, { "epoch": 2.329180822952057, "grad_norm": 0.20145867765354752, "learning_rate": 1.1833692361522459e-05, "loss": 0.1063, "num_input_tokens_seen": 1302765200, "step": 5397 }, { "epoch": 2.338240845602114, "grad_norm": 0.20680505787617295, "learning_rate": 1.1528843418709622e-05, "loss": 0.1073, "num_input_tokens_seen": 1307780896, "step": 5418 }, { "epoch": 2.3473008682521708, "grad_norm": 0.23847500451035963, "learning_rate": 1.1227460054396177e-05, "loss": 0.1076, "num_input_tokens_seen": 1312916864, "step": 5439 }, { "epoch": 2.356360890902227, "grad_norm": 0.21518454470567003, "learning_rate": 1.0929569417953278e-05, "loss": 0.1049, "num_input_tokens_seen": 1317924528, "step": 5460 }, { "epoch": 2.365420913552284, "grad_norm": 0.19953783012904103, "learning_rate": 1.0635198344118296e-05, "loss": 0.1038, "num_input_tokens_seen": 1322892896, "step": 5481 }, { "epoch": 2.3744809362023407, "grad_norm": 0.20097656219123833, "learning_rate": 1.034437335057762e-05, "loss": 0.1049, "num_input_tokens_seen": 1328000960, "step": 5502 }, { "epoch": 2.383540958852397, "grad_norm": 0.20223248741837738, "learning_rate": 1.005712063557776e-05, "loss": 0.1026, "num_input_tokens_seen": 1333104928, "step": 5523 }, { "epoch": 2.392600981502454, "grad_norm": 0.184389360298103, "learning_rate": 9.773466075565457e-06, "loss": 0.1061, "num_input_tokens_seen": 1338094928, "step": 5544 }, { "epoch": 2.40166100415251, "grad_norm": 0.18202586925329933, "learning_rate": 9.493435222856556e-06, "loss": 0.1078, "num_input_tokens_seen": 1343094352, "step": 5565 }, { "epoch": 2.410721026802567, "grad_norm": 0.21238526697964133, "learning_rate": 9.21705330333426e-06, "loss": 0.1021, "num_input_tokens_seen": 1348209008, "step": 5586 }, { "epoch": 2.4197810494526237, "grad_norm": 0.2033611614783377, "learning_rate": 8.944345214176675e-06, "loss": 0.105, "num_input_tokens_seen": 1353281712, "step": 5607 }, { "epoch": 2.42884107210268, "grad_norm": 0.19144661395169293, "learning_rate": 8.675335521614036e-06, "loss": 0.1039, "num_input_tokens_seen": 1358325728, "step": 5628 }, { "epoch": 2.437901094752737, "grad_norm": 0.20545555012965147, "learning_rate": 8.410048458715763e-06, "loss": 0.1026, "num_input_tokens_seen": 1363274864, "step": 5649 }, { "epoch": 2.4469611174027937, "grad_norm": 0.20596285141748574, "learning_rate": 8.148507923207377e-06, "loss": 0.1046, "num_input_tokens_seen": 1368398176, "step": 5670 }, { "epoch": 2.45602114005285, "grad_norm": 0.21097019629979452, "learning_rate": 7.890737475317817e-06, "loss": 0.1062, "num_input_tokens_seen": 1373421664, "step": 5691 }, { "epoch": 2.465081162702907, "grad_norm": 0.1903944548607354, "learning_rate": 7.636760335657056e-06, "loss": 0.1005, "num_input_tokens_seen": 1378386688, "step": 5712 }, { "epoch": 2.4741411853529636, "grad_norm": 0.19609864215469505, "learning_rate": 7.38659938312432e-06, "loss": 0.1008, "num_input_tokens_seen": 1383515360, "step": 5733 }, { "epoch": 2.48320120800302, "grad_norm": 0.18901755025774616, "learning_rate": 7.140277152847103e-06, "loss": 0.1012, "num_input_tokens_seen": 1388651712, "step": 5754 }, { "epoch": 2.4922612306530767, "grad_norm": 0.2089521843263624, "learning_rate": 6.89781583415115e-06, "loss": 0.1004, "num_input_tokens_seen": 1393819168, "step": 5775 }, { "epoch": 2.501321253303133, "grad_norm": 0.20297486453222555, "learning_rate": 6.659237268561569e-06, "loss": 0.1058, "num_input_tokens_seen": 1399005008, "step": 5796 }, { "epoch": 2.51038127595319, "grad_norm": 0.1950872269091398, "learning_rate": 6.424562947835367e-06, "loss": 0.0996, "num_input_tokens_seen": 1404075040, "step": 5817 }, { "epoch": 2.5194412986032466, "grad_norm": 0.19137900590478205, "learning_rate": 6.193814012025278e-06, "loss": 0.098, "num_input_tokens_seen": 1409145760, "step": 5838 }, { "epoch": 2.5285013212533034, "grad_norm": 0.21343987395986905, "learning_rate": 5.967011247575532e-06, "loss": 0.1053, "num_input_tokens_seen": 1414225568, "step": 5859 }, { "epoch": 2.5375613439033597, "grad_norm": 0.21335949851815006, "learning_rate": 5.744175085449338e-06, "loss": 0.1021, "num_input_tokens_seen": 1419339216, "step": 5880 }, { "epoch": 2.5466213665534165, "grad_norm": 0.19658196939034006, "learning_rate": 5.525325599288356e-06, "loss": 0.1003, "num_input_tokens_seen": 1424423024, "step": 5901 }, { "epoch": 2.555681389203473, "grad_norm": 0.1831370056225536, "learning_rate": 5.310482503604497e-06, "loss": 0.1039, "num_input_tokens_seen": 1429360512, "step": 5922 }, { "epoch": 2.5647414118535297, "grad_norm": 0.21013156618721565, "learning_rate": 5.09966515200393e-06, "loss": 0.1034, "num_input_tokens_seen": 1434443216, "step": 5943 }, { "epoch": 2.5738014345035864, "grad_norm": 0.2204689190211589, "learning_rate": 4.892892535443655e-06, "loss": 0.1025, "num_input_tokens_seen": 1439693152, "step": 5964 }, { "epoch": 2.582861457153643, "grad_norm": 0.20888767875138448, "learning_rate": 4.690183280520777e-06, "loss": 0.1041, "num_input_tokens_seen": 1444742640, "step": 5985 }, { "epoch": 2.5919214798036996, "grad_norm": 0.20314033156230726, "learning_rate": 4.491555647794609e-06, "loss": 0.1035, "num_input_tokens_seen": 1449817024, "step": 6006 }, { "epoch": 2.600981502453756, "grad_norm": 0.1878500846044568, "learning_rate": 4.297027530141634e-06, "loss": 0.102, "num_input_tokens_seen": 1454966656, "step": 6027 }, { "epoch": 2.6100415251038127, "grad_norm": 0.1689243463349296, "learning_rate": 4.106616451143719e-06, "loss": 0.0968, "num_input_tokens_seen": 1460107904, "step": 6048 }, { "epoch": 2.6191015477538695, "grad_norm": 0.20642026958771845, "learning_rate": 3.9203395635095615e-06, "loss": 0.1025, "num_input_tokens_seen": 1465329712, "step": 6069 }, { "epoch": 2.6281615704039263, "grad_norm": 0.18586990386683522, "learning_rate": 3.7382136475294592e-06, "loss": 0.0992, "num_input_tokens_seen": 1470486400, "step": 6090 }, { "epoch": 2.6372215930539826, "grad_norm": 0.2145218277764739, "learning_rate": 3.5602551095638094e-06, "loss": 0.1014, "num_input_tokens_seen": 1475481216, "step": 6111 }, { "epoch": 2.6462816157040394, "grad_norm": 0.18413690443200365, "learning_rate": 3.386479980565077e-06, "loss": 0.097, "num_input_tokens_seen": 1480509520, "step": 6132 }, { "epoch": 2.6553416383540958, "grad_norm": 0.20568925987079073, "learning_rate": 3.2169039146337455e-06, "loss": 0.1011, "num_input_tokens_seen": 1485415168, "step": 6153 }, { "epoch": 2.6644016610041525, "grad_norm": 0.19689596480908558, "learning_rate": 3.0515421876081364e-06, "loss": 0.1003, "num_input_tokens_seen": 1490580288, "step": 6174 }, { "epoch": 2.6734616836542093, "grad_norm": 0.19998505506705416, "learning_rate": 2.8904096956883396e-06, "loss": 0.1011, "num_input_tokens_seen": 1495724928, "step": 6195 }, { "epoch": 2.6825217063042657, "grad_norm": 0.19403273255448156, "learning_rate": 2.733520954094304e-06, "loss": 0.0992, "num_input_tokens_seen": 1500671568, "step": 6216 }, { "epoch": 2.6915817289543225, "grad_norm": 0.2062968511222456, "learning_rate": 2.580890095758276e-06, "loss": 0.0985, "num_input_tokens_seen": 1505736848, "step": 6237 }, { "epoch": 2.700641751604379, "grad_norm": 0.18160540614114035, "learning_rate": 2.4325308700516804e-06, "loss": 0.0999, "num_input_tokens_seen": 1510772384, "step": 6258 }, { "epoch": 2.7097017742544356, "grad_norm": 0.1923201194653099, "learning_rate": 2.288456641546549e-06, "loss": 0.1015, "num_input_tokens_seen": 1515840336, "step": 6279 }, { "epoch": 2.7187617969044924, "grad_norm": 0.19463587674679012, "learning_rate": 2.1486803888115802e-06, "loss": 0.0952, "num_input_tokens_seen": 1520795728, "step": 6300 }, { "epoch": 2.7278218195545487, "grad_norm": 0.20675363777805839, "learning_rate": 2.013214703242994e-06, "loss": 0.1014, "num_input_tokens_seen": 1525885232, "step": 6321 }, { "epoch": 2.7368818422046055, "grad_norm": 0.20373174987364437, "learning_rate": 1.8820717879303175e-06, "loss": 0.0962, "num_input_tokens_seen": 1531020736, "step": 6342 }, { "epoch": 2.7459418648546623, "grad_norm": 0.1956195345947546, "learning_rate": 1.7552634565570325e-06, "loss": 0.0984, "num_input_tokens_seen": 1536139280, "step": 6363 }, { "epoch": 2.7550018875047186, "grad_norm": 0.1948987852214273, "learning_rate": 1.6328011323364313e-06, "loss": 0.0996, "num_input_tokens_seen": 1541119392, "step": 6384 }, { "epoch": 2.7640619101547754, "grad_norm": 0.20220064200986687, "learning_rate": 1.5146958469825445e-06, "loss": 0.098, "num_input_tokens_seen": 1546172016, "step": 6405 }, { "epoch": 2.773121932804832, "grad_norm": 0.18253057622469138, "learning_rate": 1.4009582397163879e-06, "loss": 0.0979, "num_input_tokens_seen": 1551424800, "step": 6426 }, { "epoch": 2.7821819554548886, "grad_norm": 0.19954203330015002, "learning_rate": 1.2915985563075383e-06, "loss": 0.096, "num_input_tokens_seen": 1556510032, "step": 6447 }, { "epoch": 2.7912419781049453, "grad_norm": 0.18349734934457243, "learning_rate": 1.1866266481512234e-06, "loss": 0.0995, "num_input_tokens_seen": 1561425840, "step": 6468 }, { "epoch": 2.8003020007550017, "grad_norm": 0.19583118554915796, "learning_rate": 1.0860519713808082e-06, "loss": 0.0979, "num_input_tokens_seen": 1566437584, "step": 6489 }, { "epoch": 2.8093620234050585, "grad_norm": 0.21286644366673166, "learning_rate": 9.898835860160271e-07, "loss": 0.0944, "num_input_tokens_seen": 1571433728, "step": 6510 }, { "epoch": 2.8184220460551153, "grad_norm": 0.18549288153723123, "learning_rate": 8.981301551467924e-07, "loss": 0.0949, "num_input_tokens_seen": 1576510304, "step": 6531 }, { "epoch": 2.8274820687051716, "grad_norm": 0.1969304866148859, "learning_rate": 8.10799944152818e-07, "loss": 0.0959, "num_input_tokens_seen": 1581652480, "step": 6552 }, { "epoch": 2.8365420913552284, "grad_norm": 0.19621694587220775, "learning_rate": 7.279008199590543e-07, "loss": 0.0995, "num_input_tokens_seen": 1586710928, "step": 6573 }, { "epoch": 2.845602114005285, "grad_norm": 0.19626099410771614, "learning_rate": 6.494402503270158e-07, "loss": 0.0973, "num_input_tokens_seen": 1591751872, "step": 6594 }, { "epoch": 2.8546621366553415, "grad_norm": 0.17530668226038035, "learning_rate": 5.754253031820588e-07, "loss": 0.0992, "num_input_tokens_seen": 1596817344, "step": 6615 }, { "epoch": 2.8637221593053983, "grad_norm": 0.1962276487617284, "learning_rate": 5.058626459766902e-07, "loss": 0.0978, "num_input_tokens_seen": 1601955280, "step": 6636 }, { "epoch": 2.872782181955455, "grad_norm": 0.19661166968992774, "learning_rate": 4.407585450899587e-07, "loss": 0.0963, "num_input_tokens_seen": 1606977888, "step": 6657 }, { "epoch": 2.8818422046055114, "grad_norm": 0.1900385103617743, "learning_rate": 3.8011886526292395e-07, "loss": 0.0932, "num_input_tokens_seen": 1612137088, "step": 6678 }, { "epoch": 2.8909022272555682, "grad_norm": 0.19054833990071282, "learning_rate": 3.2394906907040056e-07, "loss": 0.098, "num_input_tokens_seen": 1617105760, "step": 6699 }, { "epoch": 2.8999622499056246, "grad_norm": 0.22982156321658473, "learning_rate": 2.7225421642883554e-07, "loss": 0.099, "num_input_tokens_seen": 1622079712, "step": 6720 }, { "epoch": 2.9090222725556814, "grad_norm": 0.20733117617880123, "learning_rate": 2.250389641405115e-07, "loss": 0.0974, "num_input_tokens_seen": 1627262208, "step": 6741 }, { "epoch": 2.918082295205738, "grad_norm": 0.2091535861816229, "learning_rate": 1.823075654740547e-07, "loss": 0.0995, "num_input_tokens_seen": 1632293744, "step": 6762 }, { "epoch": 2.9271423178557945, "grad_norm": 0.2010699924002249, "learning_rate": 1.4406386978128018e-07, "loss": 0.0955, "num_input_tokens_seen": 1637413856, "step": 6783 }, { "epoch": 2.9362023405058513, "grad_norm": 0.2039974530223662, "learning_rate": 1.1031132215043594e-07, "loss": 0.095, "num_input_tokens_seen": 1642376144, "step": 6804 }, { "epoch": 2.9452623631559076, "grad_norm": 0.1867785407127438, "learning_rate": 8.105296309586785e-08, "loss": 0.0995, "num_input_tokens_seen": 1647471008, "step": 6825 }, { "epoch": 2.9543223858059644, "grad_norm": 0.1873978173047584, "learning_rate": 5.629142828411094e-08, "loss": 0.0976, "num_input_tokens_seen": 1652489696, "step": 6846 }, { "epoch": 2.963382408456021, "grad_norm": 0.1977061377471765, "learning_rate": 3.602894829647374e-08, "loss": 0.0955, "num_input_tokens_seen": 1657488848, "step": 6867 }, { "epoch": 2.972442431106078, "grad_norm": 0.19200607060766933, "learning_rate": 2.0267348428087974e-08, "loss": 0.0979, "num_input_tokens_seen": 1662468816, "step": 6888 }, { "epoch": 2.9815024537561343, "grad_norm": 0.21184767679059754, "learning_rate": 9.008048523501122e-09, "loss": 0.0999, "num_input_tokens_seen": 1667627776, "step": 6909 }, { "epoch": 2.990562476406191, "grad_norm": 0.2003643238535032, "learning_rate": 2.252062848745462e-09, "loss": 0.099, "num_input_tokens_seen": 1672724048, "step": 6930 }, { "epoch": 2.9996224990562474, "grad_norm": 0.19768122036104033, "learning_rate": 0.0, "loss": 0.0971, "num_input_tokens_seen": 1677860944, "step": 6951 }, { "epoch": 2.9996224990562474, "num_input_tokens_seen": 1677860944, "step": 6951, "total_flos": 8545808136798208.0, "train_loss": 0.2019795169591595, "train_runtime": 178782.2648, "train_samples_per_second": 4.978, "train_steps_per_second": 0.039 } ], "logging_steps": 21, "max_steps": 6951, "num_input_tokens_seen": 1677860944, "num_train_epochs": 3, "save_steps": 5000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 8545808136798208.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }