| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 2.9996224990562474, |
| "eval_steps": 5000, |
| "global_step": 6951, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.009060022650056626, |
| "grad_norm": 2.6664810047066947, |
| "learning_rate": 9.999774793715127e-05, |
| "loss": 1.431, |
| "num_input_tokens_seen": 5118512, |
| "step": 21 |
| }, |
| { |
| "epoch": 0.01812004530011325, |
| "grad_norm": 0.8152880359374859, |
| "learning_rate": 9.99909919514765e-05, |
| "loss": 0.5348, |
| "num_input_tokens_seen": 10150592, |
| "step": 42 |
| }, |
| { |
| "epoch": 0.027180067950169876, |
| "grad_norm": 0.6315786850545617, |
| "learning_rate": 9.997973265157192e-05, |
| "loss": 0.4599, |
| "num_input_tokens_seen": 15170896, |
| "step": 63 |
| }, |
| { |
| "epoch": 0.0362400906002265, |
| "grad_norm": 0.4538892840989214, |
| "learning_rate": 9.996397105170353e-05, |
| "loss": 0.4108, |
| "num_input_tokens_seen": 20155632, |
| "step": 84 |
| }, |
| { |
| "epoch": 0.045300113250283124, |
| "grad_norm": 0.4720911162561318, |
| "learning_rate": 9.994370857171588e-05, |
| "loss": 0.3947, |
| "num_input_tokens_seen": 25135440, |
| "step": 105 |
| }, |
| { |
| "epoch": 0.05436013590033975, |
| "grad_norm": 0.4622888476742547, |
| "learning_rate": 9.991894703690414e-05, |
| "loss": 0.3764, |
| "num_input_tokens_seen": 30346784, |
| "step": 126 |
| }, |
| { |
| "epoch": 0.06342015855039637, |
| "grad_norm": 0.4173513485950873, |
| "learning_rate": 9.988968867784958e-05, |
| "loss": 0.3751, |
| "num_input_tokens_seen": 35269664, |
| "step": 147 |
| }, |
| { |
| "epoch": 0.072480181200453, |
| "grad_norm": 0.39249341716348773, |
| "learning_rate": 9.985593613021872e-05, |
| "loss": 0.3704, |
| "num_input_tokens_seen": 40151792, |
| "step": 168 |
| }, |
| { |
| "epoch": 0.08154020385050963, |
| "grad_norm": 0.3080363284417135, |
| "learning_rate": 9.981769243452595e-05, |
| "loss": 0.3552, |
| "num_input_tokens_seen": 45283312, |
| "step": 189 |
| }, |
| { |
| "epoch": 0.09060022650056625, |
| "grad_norm": 0.3747998358734097, |
| "learning_rate": 9.977496103585949e-05, |
| "loss": 0.3576, |
| "num_input_tokens_seen": 50298912, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.09966024915062288, |
| "grad_norm": 0.2975791185912304, |
| "learning_rate": 9.972774578357117e-05, |
| "loss": 0.3451, |
| "num_input_tokens_seen": 55445792, |
| "step": 231 |
| }, |
| { |
| "epoch": 0.1087202718006795, |
| "grad_norm": 0.3172928529259604, |
| "learning_rate": 9.96760509309296e-05, |
| "loss": 0.3506, |
| "num_input_tokens_seen": 60506496, |
| "step": 252 |
| }, |
| { |
| "epoch": 0.11778029445073612, |
| "grad_norm": 0.3304680645103982, |
| "learning_rate": 9.961988113473708e-05, |
| "loss": 0.3443, |
| "num_input_tokens_seen": 65678096, |
| "step": 273 |
| }, |
| { |
| "epoch": 0.12684031710079274, |
| "grad_norm": 0.2929759270178528, |
| "learning_rate": 9.955924145491005e-05, |
| "loss": 0.3446, |
| "num_input_tokens_seen": 70478688, |
| "step": 294 |
| }, |
| { |
| "epoch": 0.13590033975084936, |
| "grad_norm": 0.2809492487037724, |
| "learning_rate": 9.94941373540233e-05, |
| "loss": 0.3362, |
| "num_input_tokens_seen": 75373536, |
| "step": 315 |
| }, |
| { |
| "epoch": 0.144960362400906, |
| "grad_norm": 0.38609616986912937, |
| "learning_rate": 9.942457469681794e-05, |
| "loss": 0.3384, |
| "num_input_tokens_seen": 80476704, |
| "step": 336 |
| }, |
| { |
| "epoch": 0.15402038505096263, |
| "grad_norm": 0.24129947470960447, |
| "learning_rate": 9.935055974967299e-05, |
| "loss": 0.3315, |
| "num_input_tokens_seen": 85670800, |
| "step": 357 |
| }, |
| { |
| "epoch": 0.16308040770101925, |
| "grad_norm": 0.22875446286948012, |
| "learning_rate": 9.927209918004095e-05, |
| "loss": 0.33, |
| "num_input_tokens_seen": 90707040, |
| "step": 378 |
| }, |
| { |
| "epoch": 0.17214043035107587, |
| "grad_norm": 0.26219955903132913, |
| "learning_rate": 9.918920005584719e-05, |
| "loss": 0.3296, |
| "num_input_tokens_seen": 95824496, |
| "step": 399 |
| }, |
| { |
| "epoch": 0.1812004530011325, |
| "grad_norm": 0.6217611528853424, |
| "learning_rate": 9.910186984485321e-05, |
| "loss": 0.3315, |
| "num_input_tokens_seen": 100862224, |
| "step": 420 |
| }, |
| { |
| "epoch": 0.19026047565118911, |
| "grad_norm": 0.5343386676193482, |
| "learning_rate": 9.901011641398398e-05, |
| "loss": 0.353, |
| "num_input_tokens_seen": 105876656, |
| "step": 441 |
| }, |
| { |
| "epoch": 0.19932049830124576, |
| "grad_norm": 0.29271392356860787, |
| "learning_rate": 9.89139480286192e-05, |
| "loss": 0.3414, |
| "num_input_tokens_seen": 110980864, |
| "step": 462 |
| }, |
| { |
| "epoch": 0.20838052095130238, |
| "grad_norm": 0.2662936598921738, |
| "learning_rate": 9.881337335184878e-05, |
| "loss": 0.3224, |
| "num_input_tokens_seen": 116114800, |
| "step": 483 |
| }, |
| { |
| "epoch": 0.217440543601359, |
| "grad_norm": 0.3006863273182064, |
| "learning_rate": 9.870840144369246e-05, |
| "loss": 0.3212, |
| "num_input_tokens_seen": 121255744, |
| "step": 504 |
| }, |
| { |
| "epoch": 0.22650056625141562, |
| "grad_norm": 0.28557101475631624, |
| "learning_rate": 9.859904176028362e-05, |
| "loss": 0.3213, |
| "num_input_tokens_seen": 126288608, |
| "step": 525 |
| }, |
| { |
| "epoch": 0.23556058890147225, |
| "grad_norm": 0.30204169829965893, |
| "learning_rate": 9.848530415301747e-05, |
| "loss": 0.3198, |
| "num_input_tokens_seen": 131233488, |
| "step": 546 |
| }, |
| { |
| "epoch": 0.24462061155152887, |
| "grad_norm": 0.22788149212961117, |
| "learning_rate": 9.836719886766356e-05, |
| "loss": 0.3149, |
| "num_input_tokens_seen": 136257888, |
| "step": 567 |
| }, |
| { |
| "epoch": 0.2536806342015855, |
| "grad_norm": 0.25388452332747136, |
| "learning_rate": 9.824473654344297e-05, |
| "loss": 0.3169, |
| "num_input_tokens_seen": 141405120, |
| "step": 588 |
| }, |
| { |
| "epoch": 0.2627406568516421, |
| "grad_norm": 0.22932388019266456, |
| "learning_rate": 9.811792821206969e-05, |
| "loss": 0.3142, |
| "num_input_tokens_seen": 146380496, |
| "step": 609 |
| }, |
| { |
| "epoch": 0.2718006795016987, |
| "grad_norm": 0.21316227582275912, |
| "learning_rate": 9.7986785296757e-05, |
| "loss": 0.3097, |
| "num_input_tokens_seen": 151344048, |
| "step": 630 |
| }, |
| { |
| "epoch": 0.2808607021517554, |
| "grad_norm": 0.2055839569031328, |
| "learning_rate": 9.785131961118844e-05, |
| "loss": 0.3116, |
| "num_input_tokens_seen": 156435136, |
| "step": 651 |
| }, |
| { |
| "epoch": 0.289920724801812, |
| "grad_norm": 0.2441799648084251, |
| "learning_rate": 9.771154335845345e-05, |
| "loss": 0.3086, |
| "num_input_tokens_seen": 161536224, |
| "step": 672 |
| }, |
| { |
| "epoch": 0.29898074745186864, |
| "grad_norm": 0.20481810635702893, |
| "learning_rate": 9.756746912994832e-05, |
| "loss": 0.3057, |
| "num_input_tokens_seen": 166573984, |
| "step": 693 |
| }, |
| { |
| "epoch": 0.30804077010192527, |
| "grad_norm": 0.2592126104806373, |
| "learning_rate": 9.741910990424174e-05, |
| "loss": 0.3017, |
| "num_input_tokens_seen": 171638000, |
| "step": 714 |
| }, |
| { |
| "epoch": 0.3171007927519819, |
| "grad_norm": 0.25174137308280303, |
| "learning_rate": 9.726647904590571e-05, |
| "loss": 0.3066, |
| "num_input_tokens_seen": 176765648, |
| "step": 735 |
| }, |
| { |
| "epoch": 0.3261608154020385, |
| "grad_norm": 0.20855236580529835, |
| "learning_rate": 9.710959030431167e-05, |
| "loss": 0.2996, |
| "num_input_tokens_seen": 181949360, |
| "step": 756 |
| }, |
| { |
| "epoch": 0.3352208380520951, |
| "grad_norm": 0.23697815659002952, |
| "learning_rate": 9.694845781239187e-05, |
| "loss": 0.2972, |
| "num_input_tokens_seen": 186990096, |
| "step": 777 |
| }, |
| { |
| "epoch": 0.34428086070215175, |
| "grad_norm": 0.2492979898134609, |
| "learning_rate": 9.678309608536626e-05, |
| "loss": 0.2984, |
| "num_input_tokens_seen": 192083856, |
| "step": 798 |
| }, |
| { |
| "epoch": 0.35334088335220837, |
| "grad_norm": 0.23816573617268064, |
| "learning_rate": 9.661352001943493e-05, |
| "loss": 0.2957, |
| "num_input_tokens_seen": 197134448, |
| "step": 819 |
| }, |
| { |
| "epoch": 0.362400906002265, |
| "grad_norm": 0.20467034048790322, |
| "learning_rate": 9.64397448904362e-05, |
| "loss": 0.2926, |
| "num_input_tokens_seen": 202310368, |
| "step": 840 |
| }, |
| { |
| "epoch": 0.3714609286523216, |
| "grad_norm": 0.17904927666050605, |
| "learning_rate": 9.626178635247054e-05, |
| "loss": 0.2909, |
| "num_input_tokens_seen": 207359840, |
| "step": 861 |
| }, |
| { |
| "epoch": 0.38052095130237823, |
| "grad_norm": 0.248985929697072, |
| "learning_rate": 9.607966043649046e-05, |
| "loss": 0.2954, |
| "num_input_tokens_seen": 212395664, |
| "step": 882 |
| }, |
| { |
| "epoch": 0.3895809739524349, |
| "grad_norm": 0.17735947921966527, |
| "learning_rate": 9.589338354885629e-05, |
| "loss": 0.2912, |
| "num_input_tokens_seen": 217570640, |
| "step": 903 |
| }, |
| { |
| "epoch": 0.3986409966024915, |
| "grad_norm": 0.2653024856558906, |
| "learning_rate": 9.570297246985837e-05, |
| "loss": 0.2928, |
| "num_input_tokens_seen": 222629712, |
| "step": 924 |
| }, |
| { |
| "epoch": 0.40770101925254815, |
| "grad_norm": 0.21684716629964057, |
| "learning_rate": 9.550844435220539e-05, |
| "loss": 0.292, |
| "num_input_tokens_seen": 227565744, |
| "step": 945 |
| }, |
| { |
| "epoch": 0.41676104190260477, |
| "grad_norm": 0.19831973070913392, |
| "learning_rate": 9.530981671947923e-05, |
| "loss": 0.292, |
| "num_input_tokens_seen": 232655712, |
| "step": 966 |
| }, |
| { |
| "epoch": 0.4258210645526614, |
| "grad_norm": 0.2039591338730108, |
| "learning_rate": 9.510710746455636e-05, |
| "loss": 0.2959, |
| "num_input_tokens_seen": 237611056, |
| "step": 987 |
| }, |
| { |
| "epoch": 0.434881087202718, |
| "grad_norm": 0.20635917697106, |
| "learning_rate": 9.490033484799608e-05, |
| "loss": 0.2884, |
| "num_input_tokens_seen": 242693136, |
| "step": 1008 |
| }, |
| { |
| "epoch": 0.44394110985277463, |
| "grad_norm": 0.21178513822087988, |
| "learning_rate": 9.468951749639551e-05, |
| "loss": 0.2878, |
| "num_input_tokens_seen": 247677488, |
| "step": 1029 |
| }, |
| { |
| "epoch": 0.45300113250283125, |
| "grad_norm": 0.24393748334345636, |
| "learning_rate": 9.447467440071164e-05, |
| "loss": 0.2908, |
| "num_input_tokens_seen": 252770384, |
| "step": 1050 |
| }, |
| { |
| "epoch": 0.46206115515288787, |
| "grad_norm": 0.19345581725168481, |
| "learning_rate": 9.425582491455067e-05, |
| "loss": 0.2796, |
| "num_input_tokens_seen": 258009696, |
| "step": 1071 |
| }, |
| { |
| "epoch": 0.4711211778029445, |
| "grad_norm": 0.17550447436524724, |
| "learning_rate": 9.403298875242448e-05, |
| "loss": 0.2858, |
| "num_input_tokens_seen": 263147728, |
| "step": 1092 |
| }, |
| { |
| "epoch": 0.4801812004530011, |
| "grad_norm": 0.1873327110291133, |
| "learning_rate": 9.380618598797473e-05, |
| "loss": 0.2876, |
| "num_input_tokens_seen": 268184080, |
| "step": 1113 |
| }, |
| { |
| "epoch": 0.48924122310305773, |
| "grad_norm": 0.2326284547666313, |
| "learning_rate": 9.357543705216465e-05, |
| "loss": 0.2814, |
| "num_input_tokens_seen": 273442768, |
| "step": 1134 |
| }, |
| { |
| "epoch": 0.4983012457531144, |
| "grad_norm": 0.1606625392823979, |
| "learning_rate": 9.334076273143843e-05, |
| "loss": 0.2804, |
| "num_input_tokens_seen": 278640624, |
| "step": 1155 |
| }, |
| { |
| "epoch": 0.507361268403171, |
| "grad_norm": 0.174300515580659, |
| "learning_rate": 9.310218416584886e-05, |
| "loss": 0.2863, |
| "num_input_tokens_seen": 283769424, |
| "step": 1176 |
| }, |
| { |
| "epoch": 0.5164212910532276, |
| "grad_norm": 0.18523825539113345, |
| "learning_rate": 9.28597228471529e-05, |
| "loss": 0.2851, |
| "num_input_tokens_seen": 288866448, |
| "step": 1197 |
| }, |
| { |
| "epoch": 0.5254813137032842, |
| "grad_norm": 0.29288899462077944, |
| "learning_rate": 9.26134006168757e-05, |
| "loss": 0.2798, |
| "num_input_tokens_seen": 293944576, |
| "step": 1218 |
| }, |
| { |
| "epoch": 0.5345413363533409, |
| "grad_norm": 0.21677248351508627, |
| "learning_rate": 9.236323966434295e-05, |
| "loss": 0.2728, |
| "num_input_tokens_seen": 299090032, |
| "step": 1239 |
| }, |
| { |
| "epoch": 0.5436013590033975, |
| "grad_norm": 0.17425284816339717, |
| "learning_rate": 9.210926252468219e-05, |
| "loss": 0.2756, |
| "num_input_tokens_seen": 304016304, |
| "step": 1260 |
| }, |
| { |
| "epoch": 0.5526613816534541, |
| "grad_norm": 0.19722321188451059, |
| "learning_rate": 9.185149207679263e-05, |
| "loss": 0.2747, |
| "num_input_tokens_seen": 309084016, |
| "step": 1281 |
| }, |
| { |
| "epoch": 0.5617214043035108, |
| "grad_norm": 0.15927450699962342, |
| "learning_rate": 9.158995154128425e-05, |
| "loss": 0.2772, |
| "num_input_tokens_seen": 314201696, |
| "step": 1302 |
| }, |
| { |
| "epoch": 0.5707814269535674, |
| "grad_norm": 0.144121234319095, |
| "learning_rate": 9.132466447838597e-05, |
| "loss": 0.2785, |
| "num_input_tokens_seen": 319266256, |
| "step": 1323 |
| }, |
| { |
| "epoch": 0.579841449603624, |
| "grad_norm": 0.20014275331573014, |
| "learning_rate": 9.105565478582334e-05, |
| "loss": 0.2755, |
| "num_input_tokens_seen": 324468352, |
| "step": 1344 |
| }, |
| { |
| "epoch": 0.5889014722536806, |
| "grad_norm": 0.1763537829210326, |
| "learning_rate": 9.078294669666576e-05, |
| "loss": 0.2708, |
| "num_input_tokens_seen": 329566736, |
| "step": 1365 |
| }, |
| { |
| "epoch": 0.5979614949037373, |
| "grad_norm": 0.20344878470372835, |
| "learning_rate": 9.050656477714346e-05, |
| "loss": 0.2729, |
| "num_input_tokens_seen": 334661888, |
| "step": 1386 |
| }, |
| { |
| "epoch": 0.6070215175537939, |
| "grad_norm": 0.16201525012659232, |
| "learning_rate": 9.022653392443454e-05, |
| "loss": 0.2754, |
| "num_input_tokens_seen": 339784976, |
| "step": 1407 |
| }, |
| { |
| "epoch": 0.6160815402038505, |
| "grad_norm": 0.17086654725622255, |
| "learning_rate": 8.994287936442225e-05, |
| "loss": 0.2742, |
| "num_input_tokens_seen": 344776544, |
| "step": 1428 |
| }, |
| { |
| "epoch": 0.6251415628539071, |
| "grad_norm": 0.17370971449213884, |
| "learning_rate": 8.96556266494224e-05, |
| "loss": 0.2703, |
| "num_input_tokens_seen": 349731168, |
| "step": 1449 |
| }, |
| { |
| "epoch": 0.6342015855039638, |
| "grad_norm": 0.1759254965690454, |
| "learning_rate": 8.936480165588173e-05, |
| "loss": 0.2756, |
| "num_input_tokens_seen": 354694544, |
| "step": 1470 |
| }, |
| { |
| "epoch": 0.6432616081540203, |
| "grad_norm": 0.1884165927777844, |
| "learning_rate": 8.907043058204674e-05, |
| "loss": 0.2698, |
| "num_input_tokens_seen": 359871984, |
| "step": 1491 |
| }, |
| { |
| "epoch": 0.652321630804077, |
| "grad_norm": 0.18427306909901323, |
| "learning_rate": 8.877253994560382e-05, |
| "loss": 0.2716, |
| "num_input_tokens_seen": 364937440, |
| "step": 1512 |
| }, |
| { |
| "epoch": 0.6613816534541337, |
| "grad_norm": 0.20911376204072243, |
| "learning_rate": 8.847115658129039e-05, |
| "loss": 0.2682, |
| "num_input_tokens_seen": 369994848, |
| "step": 1533 |
| }, |
| { |
| "epoch": 0.6704416761041903, |
| "grad_norm": 0.20741042143522315, |
| "learning_rate": 8.816630763847755e-05, |
| "loss": 0.2695, |
| "num_input_tokens_seen": 374992544, |
| "step": 1554 |
| }, |
| { |
| "epoch": 0.6795016987542469, |
| "grad_norm": 0.1643283959081191, |
| "learning_rate": 8.785802057872446e-05, |
| "loss": 0.2706, |
| "num_input_tokens_seen": 380038624, |
| "step": 1575 |
| }, |
| { |
| "epoch": 0.6885617214043035, |
| "grad_norm": 0.20102992236744546, |
| "learning_rate": 8.754632317330447e-05, |
| "loss": 0.2704, |
| "num_input_tokens_seen": 385195792, |
| "step": 1596 |
| }, |
| { |
| "epoch": 0.6976217440543602, |
| "grad_norm": 0.18834807757879243, |
| "learning_rate": 8.723124350070347e-05, |
| "loss": 0.2707, |
| "num_input_tokens_seen": 390195296, |
| "step": 1617 |
| }, |
| { |
| "epoch": 0.7066817667044167, |
| "grad_norm": 0.15261075530832655, |
| "learning_rate": 8.691280994409043e-05, |
| "loss": 0.2653, |
| "num_input_tokens_seen": 395353440, |
| "step": 1638 |
| }, |
| { |
| "epoch": 0.7157417893544734, |
| "grad_norm": 0.20682506960801342, |
| "learning_rate": 8.659105118876068e-05, |
| "loss": 0.2649, |
| "num_input_tokens_seen": 400444080, |
| "step": 1659 |
| }, |
| { |
| "epoch": 0.72480181200453, |
| "grad_norm": 0.21733357068498219, |
| "learning_rate": 8.626599621955179e-05, |
| "loss": 0.2652, |
| "num_input_tokens_seen": 405492112, |
| "step": 1680 |
| }, |
| { |
| "epoch": 0.7338618346545867, |
| "grad_norm": 0.18286757761891814, |
| "learning_rate": 8.593767431823255e-05, |
| "loss": 0.2638, |
| "num_input_tokens_seen": 410467584, |
| "step": 1701 |
| }, |
| { |
| "epoch": 0.7429218573046432, |
| "grad_norm": 0.16764027437130122, |
| "learning_rate": 8.56061150608652e-05, |
| "loss": 0.2685, |
| "num_input_tokens_seen": 415550320, |
| "step": 1722 |
| }, |
| { |
| "epoch": 0.7519818799546999, |
| "grad_norm": 0.17921646972994934, |
| "learning_rate": 8.527134831514117e-05, |
| "loss": 0.2584, |
| "num_input_tokens_seen": 420503712, |
| "step": 1743 |
| }, |
| { |
| "epoch": 0.7610419026047565, |
| "grad_norm": 0.19062465197166928, |
| "learning_rate": 8.493340423769053e-05, |
| "loss": 0.2607, |
| "num_input_tokens_seen": 425602800, |
| "step": 1764 |
| }, |
| { |
| "epoch": 0.7701019252548131, |
| "grad_norm": 0.19476076523413036, |
| "learning_rate": 8.459231327136532e-05, |
| "loss": 0.2652, |
| "num_input_tokens_seen": 430546320, |
| "step": 1785 |
| }, |
| { |
| "epoch": 0.7791619479048698, |
| "grad_norm": 0.18756178702354967, |
| "learning_rate": 8.42481061424973e-05, |
| "loss": 0.2604, |
| "num_input_tokens_seen": 435625600, |
| "step": 1806 |
| }, |
| { |
| "epoch": 0.7882219705549264, |
| "grad_norm": 0.16871375853816825, |
| "learning_rate": 8.390081385812993e-05, |
| "loss": 0.2603, |
| "num_input_tokens_seen": 440695024, |
| "step": 1827 |
| }, |
| { |
| "epoch": 0.797281993204983, |
| "grad_norm": 0.1669594920851862, |
| "learning_rate": 8.355046770322528e-05, |
| "loss": 0.2576, |
| "num_input_tokens_seen": 445877360, |
| "step": 1848 |
| }, |
| { |
| "epoch": 0.8063420158550396, |
| "grad_norm": 0.20147791313721689, |
| "learning_rate": 8.319709923784573e-05, |
| "loss": 0.2622, |
| "num_input_tokens_seen": 451021040, |
| "step": 1869 |
| }, |
| { |
| "epoch": 0.8154020385050963, |
| "grad_norm": 0.17108850294819108, |
| "learning_rate": 8.284074029431099e-05, |
| "loss": 0.2587, |
| "num_input_tokens_seen": 456101872, |
| "step": 1890 |
| }, |
| { |
| "epoch": 0.8244620611551529, |
| "grad_norm": 0.18516681989871883, |
| "learning_rate": 8.248142297433057e-05, |
| "loss": 0.2575, |
| "num_input_tokens_seen": 461365920, |
| "step": 1911 |
| }, |
| { |
| "epoch": 0.8335220838052095, |
| "grad_norm": 0.20285356102658042, |
| "learning_rate": 8.211917964611196e-05, |
| "loss": 0.2573, |
| "num_input_tokens_seen": 466466096, |
| "step": 1932 |
| }, |
| { |
| "epoch": 0.8425821064552661, |
| "grad_norm": 0.207923488217522, |
| "learning_rate": 8.175404294144482e-05, |
| "loss": 0.26, |
| "num_input_tokens_seen": 471541104, |
| "step": 1953 |
| }, |
| { |
| "epoch": 0.8516421291053228, |
| "grad_norm": 0.19850908270608497, |
| "learning_rate": 8.138604575276143e-05, |
| "loss": 0.2571, |
| "num_input_tokens_seen": 476646096, |
| "step": 1974 |
| }, |
| { |
| "epoch": 0.8607021517553793, |
| "grad_norm": 0.1821198820367163, |
| "learning_rate": 8.10152212301737e-05, |
| "loss": 0.251, |
| "num_input_tokens_seen": 481695200, |
| "step": 1995 |
| }, |
| { |
| "epoch": 0.869762174405436, |
| "grad_norm": 0.1623421619904062, |
| "learning_rate": 8.064160277848682e-05, |
| "loss": 0.2614, |
| "num_input_tokens_seen": 486706656, |
| "step": 2016 |
| }, |
| { |
| "epoch": 0.8788221970554927, |
| "grad_norm": 0.1774308248272562, |
| "learning_rate": 8.026522405419023e-05, |
| "loss": 0.2528, |
| "num_input_tokens_seen": 491943424, |
| "step": 2037 |
| }, |
| { |
| "epoch": 0.8878822197055493, |
| "grad_norm": 0.21003241654174584, |
| "learning_rate": 7.988611896242559e-05, |
| "loss": 0.2571, |
| "num_input_tokens_seen": 496925888, |
| "step": 2058 |
| }, |
| { |
| "epoch": 0.8969422423556059, |
| "grad_norm": 0.20014809740395048, |
| "learning_rate": 7.950432165393259e-05, |
| "loss": 0.2547, |
| "num_input_tokens_seen": 502065216, |
| "step": 2079 |
| }, |
| { |
| "epoch": 0.9060022650056625, |
| "grad_norm": 0.17154274701388803, |
| "learning_rate": 7.911986652197262e-05, |
| "loss": 0.2538, |
| "num_input_tokens_seen": 507089616, |
| "step": 2100 |
| }, |
| { |
| "epoch": 0.9150622876557192, |
| "grad_norm": 0.17929198920009218, |
| "learning_rate": 7.873278819923048e-05, |
| "loss": 0.2551, |
| "num_input_tokens_seen": 512060336, |
| "step": 2121 |
| }, |
| { |
| "epoch": 0.9241223103057757, |
| "grad_norm": 0.16398954046091754, |
| "learning_rate": 7.834312155469456e-05, |
| "loss": 0.2515, |
| "num_input_tokens_seen": 517133680, |
| "step": 2142 |
| }, |
| { |
| "epoch": 0.9331823329558324, |
| "grad_norm": 0.17469600332499013, |
| "learning_rate": 7.79509016905158e-05, |
| "loss": 0.2526, |
| "num_input_tokens_seen": 522229616, |
| "step": 2163 |
| }, |
| { |
| "epoch": 0.942242355605889, |
| "grad_norm": 0.15900929966723312, |
| "learning_rate": 7.755616393884561e-05, |
| "loss": 0.2482, |
| "num_input_tokens_seen": 527368864, |
| "step": 2184 |
| }, |
| { |
| "epoch": 0.9513023782559457, |
| "grad_norm": 0.18254122861853214, |
| "learning_rate": 7.715894385865299e-05, |
| "loss": 0.2516, |
| "num_input_tokens_seen": 532499712, |
| "step": 2205 |
| }, |
| { |
| "epoch": 0.9603624009060022, |
| "grad_norm": 0.19001641667974054, |
| "learning_rate": 7.675927723252134e-05, |
| "loss": 0.2493, |
| "num_input_tokens_seen": 537438224, |
| "step": 2226 |
| }, |
| { |
| "epoch": 0.9694224235560589, |
| "grad_norm": 0.16454856189702688, |
| "learning_rate": 7.635720006342512e-05, |
| "loss": 0.2465, |
| "num_input_tokens_seen": 542603472, |
| "step": 2247 |
| }, |
| { |
| "epoch": 0.9784824462061155, |
| "grad_norm": 0.18435585737458549, |
| "learning_rate": 7.595274857148652e-05, |
| "loss": 0.2486, |
| "num_input_tokens_seen": 547622688, |
| "step": 2268 |
| }, |
| { |
| "epoch": 0.9875424688561721, |
| "grad_norm": 0.16595693704140477, |
| "learning_rate": 7.554595919071268e-05, |
| "loss": 0.2472, |
| "num_input_tokens_seen": 552751232, |
| "step": 2289 |
| }, |
| { |
| "epoch": 0.9966024915062288, |
| "grad_norm": 0.16765543588823353, |
| "learning_rate": 7.513686856571368e-05, |
| "loss": 0.2471, |
| "num_input_tokens_seen": 557786736, |
| "step": 2310 |
| }, |
| { |
| "epoch": 1.0060400151000377, |
| "grad_norm": 0.17133389174596797, |
| "learning_rate": 7.472551354840145e-05, |
| "loss": 0.2361, |
| "num_input_tokens_seen": 562993712, |
| "step": 2331 |
| }, |
| { |
| "epoch": 1.0151000377500943, |
| "grad_norm": 0.18279159983012744, |
| "learning_rate": 7.431193119467008e-05, |
| "loss": 0.217, |
| "num_input_tokens_seen": 568021744, |
| "step": 2352 |
| }, |
| { |
| "epoch": 1.024160060400151, |
| "grad_norm": 0.266530218109695, |
| "learning_rate": 7.389615876105774e-05, |
| "loss": 0.2145, |
| "num_input_tokens_seen": 572956608, |
| "step": 2373 |
| }, |
| { |
| "epoch": 1.0332200830502076, |
| "grad_norm": 0.178808088687151, |
| "learning_rate": 7.347823370139042e-05, |
| "loss": 0.2179, |
| "num_input_tokens_seen": 577973792, |
| "step": 2394 |
| }, |
| { |
| "epoch": 1.0422801057002642, |
| "grad_norm": 0.15535901011827938, |
| "learning_rate": 7.30581936634082e-05, |
| "loss": 0.2098, |
| "num_input_tokens_seen": 582948368, |
| "step": 2415 |
| }, |
| { |
| "epoch": 1.051340128350321, |
| "grad_norm": 0.1879255010231289, |
| "learning_rate": 7.263607648537364e-05, |
| "loss": 0.2174, |
| "num_input_tokens_seen": 587973936, |
| "step": 2436 |
| }, |
| { |
| "epoch": 1.0604001510003775, |
| "grad_norm": 0.18116579180636055, |
| "learning_rate": 7.221192019266332e-05, |
| "loss": 0.2187, |
| "num_input_tokens_seen": 593048624, |
| "step": 2457 |
| }, |
| { |
| "epoch": 1.069460173650434, |
| "grad_norm": 0.16067683672474237, |
| "learning_rate": 7.178576299434238e-05, |
| "loss": 0.2162, |
| "num_input_tokens_seen": 598171840, |
| "step": 2478 |
| }, |
| { |
| "epoch": 1.0785201963004907, |
| "grad_norm": 0.16890347919356866, |
| "learning_rate": 7.135764327972261e-05, |
| "loss": 0.2202, |
| "num_input_tokens_seen": 603168000, |
| "step": 2499 |
| }, |
| { |
| "epoch": 1.0875802189505475, |
| "grad_norm": 0.171528446871021, |
| "learning_rate": 7.092759961490415e-05, |
| "loss": 0.2237, |
| "num_input_tokens_seen": 608280544, |
| "step": 2520 |
| }, |
| { |
| "epoch": 1.096640241600604, |
| "grad_norm": 0.16674379731500746, |
| "learning_rate": 7.049567073930143e-05, |
| "loss": 0.2199, |
| "num_input_tokens_seen": 613215280, |
| "step": 2541 |
| }, |
| { |
| "epoch": 1.1057002642506606, |
| "grad_norm": 0.18816045716369076, |
| "learning_rate": 7.006189556215345e-05, |
| "loss": 0.2189, |
| "num_input_tokens_seen": 618261984, |
| "step": 2562 |
| }, |
| { |
| "epoch": 1.1147602869007172, |
| "grad_norm": 0.16658411013718444, |
| "learning_rate": 6.962631315901861e-05, |
| "loss": 0.2163, |
| "num_input_tokens_seen": 623492320, |
| "step": 2583 |
| }, |
| { |
| "epoch": 1.123820309550774, |
| "grad_norm": 0.19772642288828662, |
| "learning_rate": 6.918896276825485e-05, |
| "loss": 0.2157, |
| "num_input_tokens_seen": 628563152, |
| "step": 2604 |
| }, |
| { |
| "epoch": 1.1328803322008305, |
| "grad_norm": 0.17075347503361357, |
| "learning_rate": 6.874988378748483e-05, |
| "loss": 0.2141, |
| "num_input_tokens_seen": 633639472, |
| "step": 2625 |
| }, |
| { |
| "epoch": 1.141940354850887, |
| "grad_norm": 0.14444472527009938, |
| "learning_rate": 6.830911577004698e-05, |
| "loss": 0.2185, |
| "num_input_tokens_seen": 638639648, |
| "step": 2646 |
| }, |
| { |
| "epoch": 1.1510003775009436, |
| "grad_norm": 0.17948926082904845, |
| "learning_rate": 6.786669842143236e-05, |
| "loss": 0.2125, |
| "num_input_tokens_seen": 643743632, |
| "step": 2667 |
| }, |
| { |
| "epoch": 1.1600604001510004, |
| "grad_norm": 0.1845862453656852, |
| "learning_rate": 6.742267159570795e-05, |
| "loss": 0.2138, |
| "num_input_tokens_seen": 648823584, |
| "step": 2688 |
| }, |
| { |
| "epoch": 1.169120422801057, |
| "grad_norm": 0.1611490189496428, |
| "learning_rate": 6.697707529192648e-05, |
| "loss": 0.2152, |
| "num_input_tokens_seen": 653949232, |
| "step": 2709 |
| }, |
| { |
| "epoch": 1.1781804454511136, |
| "grad_norm": 0.19293414419678218, |
| "learning_rate": 6.652994965052319e-05, |
| "loss": 0.2125, |
| "num_input_tokens_seen": 658996016, |
| "step": 2730 |
| }, |
| { |
| "epoch": 1.1872404681011703, |
| "grad_norm": 0.1887515649690068, |
| "learning_rate": 6.608133494969994e-05, |
| "loss": 0.2123, |
| "num_input_tokens_seen": 664102304, |
| "step": 2751 |
| }, |
| { |
| "epoch": 1.196300490751227, |
| "grad_norm": 0.15668371616625942, |
| "learning_rate": 6.563127160179671e-05, |
| "loss": 0.2101, |
| "num_input_tokens_seen": 669123584, |
| "step": 2772 |
| }, |
| { |
| "epoch": 1.2053605134012835, |
| "grad_norm": 0.17871386948979864, |
| "learning_rate": 6.517980014965139e-05, |
| "loss": 0.209, |
| "num_input_tokens_seen": 674256592, |
| "step": 2793 |
| }, |
| { |
| "epoch": 1.21442053605134, |
| "grad_norm": 0.16852785549912133, |
| "learning_rate": 6.472696126294732e-05, |
| "loss": 0.2122, |
| "num_input_tokens_seen": 679248208, |
| "step": 2814 |
| }, |
| { |
| "epoch": 1.2234805587013968, |
| "grad_norm": 0.1758847676430736, |
| "learning_rate": 6.427279573454985e-05, |
| "loss": 0.2093, |
| "num_input_tokens_seen": 684325632, |
| "step": 2835 |
| }, |
| { |
| "epoch": 1.2325405813514534, |
| "grad_norm": 0.19298009720432585, |
| "learning_rate": 6.381734447683152e-05, |
| "loss": 0.2114, |
| "num_input_tokens_seen": 689336736, |
| "step": 2856 |
| }, |
| { |
| "epoch": 1.24160060400151, |
| "grad_norm": 0.16439303725722001, |
| "learning_rate": 6.33606485179866e-05, |
| "loss": 0.2111, |
| "num_input_tokens_seen": 694382688, |
| "step": 2877 |
| }, |
| { |
| "epoch": 1.2506606266515665, |
| "grad_norm": 0.18140167193790194, |
| "learning_rate": 6.290274899833517e-05, |
| "loss": 0.2086, |
| "num_input_tokens_seen": 699371792, |
| "step": 2898 |
| }, |
| { |
| "epoch": 1.2597206493016233, |
| "grad_norm": 0.17151657072780352, |
| "learning_rate": 6.244368716661713e-05, |
| "loss": 0.2095, |
| "num_input_tokens_seen": 704404624, |
| "step": 2919 |
| }, |
| { |
| "epoch": 1.2687806719516799, |
| "grad_norm": 0.2052334824788225, |
| "learning_rate": 6.198350437627632e-05, |
| "loss": 0.2083, |
| "num_input_tokens_seen": 709451392, |
| "step": 2940 |
| }, |
| { |
| "epoch": 1.2778406946017364, |
| "grad_norm": 0.18426322385396474, |
| "learning_rate": 6.152224208173533e-05, |
| "loss": 0.2088, |
| "num_input_tokens_seen": 714486848, |
| "step": 2961 |
| }, |
| { |
| "epoch": 1.2869007172517932, |
| "grad_norm": 0.1949416856665576, |
| "learning_rate": 6.10599418346613e-05, |
| "loss": 0.2118, |
| "num_input_tokens_seen": 719556448, |
| "step": 2982 |
| }, |
| { |
| "epoch": 1.2959607399018498, |
| "grad_norm": 0.16224663869829734, |
| "learning_rate": 6.059664528022266e-05, |
| "loss": 0.2058, |
| "num_input_tokens_seen": 724625472, |
| "step": 3003 |
| }, |
| { |
| "epoch": 1.3050207625519064, |
| "grad_norm": 0.1742996675080445, |
| "learning_rate": 6.0132394153337755e-05, |
| "loss": 0.2065, |
| "num_input_tokens_seen": 729794320, |
| "step": 3024 |
| }, |
| { |
| "epoch": 1.3140807852019631, |
| "grad_norm": 0.17806291944392397, |
| "learning_rate": 5.9667230274915174e-05, |
| "loss": 0.207, |
| "num_input_tokens_seen": 734753392, |
| "step": 3045 |
| }, |
| { |
| "epoch": 1.3231408078520197, |
| "grad_norm": 0.18436964692073765, |
| "learning_rate": 5.920119554808651e-05, |
| "loss": 0.2049, |
| "num_input_tokens_seen": 739827088, |
| "step": 3066 |
| }, |
| { |
| "epoch": 1.3322008305020763, |
| "grad_norm": 0.16939954583438047, |
| "learning_rate": 5.873433195443152e-05, |
| "loss": 0.208, |
| "num_input_tokens_seen": 744847184, |
| "step": 3087 |
| }, |
| { |
| "epoch": 1.3412608531521328, |
| "grad_norm": 0.1635889048824763, |
| "learning_rate": 5.82666815501964e-05, |
| "loss": 0.2047, |
| "num_input_tokens_seen": 749874880, |
| "step": 3108 |
| }, |
| { |
| "epoch": 1.3503208758021894, |
| "grad_norm": 0.16527453811089068, |
| "learning_rate": 5.779828646250521e-05, |
| "loss": 0.2022, |
| "num_input_tokens_seen": 754848400, |
| "step": 3129 |
| }, |
| { |
| "epoch": 1.3593808984522462, |
| "grad_norm": 0.18777829379599956, |
| "learning_rate": 5.7329188885565e-05, |
| "loss": 0.2073, |
| "num_input_tokens_seen": 759913728, |
| "step": 3150 |
| }, |
| { |
| "epoch": 1.3684409211023028, |
| "grad_norm": 0.15155859962967053, |
| "learning_rate": 5.6859431076864755e-05, |
| "loss": 0.2056, |
| "num_input_tokens_seen": 765009632, |
| "step": 3171 |
| }, |
| { |
| "epoch": 1.3775009437523593, |
| "grad_norm": 0.154719416013354, |
| "learning_rate": 5.6389055353368826e-05, |
| "loss": 0.2056, |
| "num_input_tokens_seen": 770016704, |
| "step": 3192 |
| }, |
| { |
| "epoch": 1.386560966402416, |
| "grad_norm": 0.16028731910302912, |
| "learning_rate": 5.591810408770493e-05, |
| "loss": 0.2037, |
| "num_input_tokens_seen": 775197264, |
| "step": 3213 |
| }, |
| { |
| "epoch": 1.3956209890524727, |
| "grad_norm": 0.15645057759509218, |
| "learning_rate": 5.544661970434696e-05, |
| "loss": 0.2042, |
| "num_input_tokens_seen": 780209328, |
| "step": 3234 |
| }, |
| { |
| "epoch": 1.4046810117025292, |
| "grad_norm": 0.17948486028711083, |
| "learning_rate": 5.497464467579351e-05, |
| "loss": 0.2011, |
| "num_input_tokens_seen": 785402112, |
| "step": 3255 |
| }, |
| { |
| "epoch": 1.4137410343525858, |
| "grad_norm": 0.16643670433003308, |
| "learning_rate": 5.450222151874166e-05, |
| "loss": 0.2015, |
| "num_input_tokens_seen": 790429216, |
| "step": 3276 |
| }, |
| { |
| "epoch": 1.4228010570026426, |
| "grad_norm": 0.17345996800344896, |
| "learning_rate": 5.402939279025705e-05, |
| "loss": 0.2005, |
| "num_input_tokens_seen": 795543264, |
| "step": 3297 |
| }, |
| { |
| "epoch": 1.4318610796526992, |
| "grad_norm": 0.1663960297870033, |
| "learning_rate": 5.355620108394018e-05, |
| "loss": 0.2052, |
| "num_input_tokens_seen": 800533200, |
| "step": 3318 |
| }, |
| { |
| "epoch": 1.4409211023027557, |
| "grad_norm": 0.15958882963062815, |
| "learning_rate": 5.308268902608958e-05, |
| "loss": 0.2042, |
| "num_input_tokens_seen": 805542720, |
| "step": 3339 |
| }, |
| { |
| "epoch": 1.4499811249528123, |
| "grad_norm": 0.17053093118312482, |
| "learning_rate": 5.2608899271861765e-05, |
| "loss": 0.1984, |
| "num_input_tokens_seen": 810549376, |
| "step": 3360 |
| }, |
| { |
| "epoch": 1.459041147602869, |
| "grad_norm": 0.1731330043830458, |
| "learning_rate": 5.213487450142892e-05, |
| "loss": 0.2038, |
| "num_input_tokens_seen": 815599232, |
| "step": 3381 |
| }, |
| { |
| "epoch": 1.4681011702529256, |
| "grad_norm": 0.17941197802062514, |
| "learning_rate": 5.166065741613402e-05, |
| "loss": 0.2012, |
| "num_input_tokens_seen": 820700608, |
| "step": 3402 |
| }, |
| { |
| "epoch": 1.4771611929029822, |
| "grad_norm": 0.1844938407002505, |
| "learning_rate": 5.118629073464424e-05, |
| "loss": 0.1987, |
| "num_input_tokens_seen": 825686176, |
| "step": 3423 |
| }, |
| { |
| "epoch": 1.486221215553039, |
| "grad_norm": 0.1748567417166297, |
| "learning_rate": 5.071181718910283e-05, |
| "loss": 0.1986, |
| "num_input_tokens_seen": 830730000, |
| "step": 3444 |
| }, |
| { |
| "epoch": 1.4952812382030956, |
| "grad_norm": 0.15694569688029672, |
| "learning_rate": 5.023727952127954e-05, |
| "loss": 0.1987, |
| "num_input_tokens_seen": 835738032, |
| "step": 3465 |
| }, |
| { |
| "epoch": 1.5043412608531521, |
| "grad_norm": 0.18575993893540607, |
| "learning_rate": 4.976272047872046e-05, |
| "loss": 0.1952, |
| "num_input_tokens_seen": 840806528, |
| "step": 3486 |
| }, |
| { |
| "epoch": 1.513401283503209, |
| "grad_norm": 0.16316391964141339, |
| "learning_rate": 4.9288182810897184e-05, |
| "loss": 0.1957, |
| "num_input_tokens_seen": 845877808, |
| "step": 3507 |
| }, |
| { |
| "epoch": 1.5224613061532652, |
| "grad_norm": 0.1809977532876625, |
| "learning_rate": 4.8813709265355766e-05, |
| "loss": 0.1957, |
| "num_input_tokens_seen": 851002432, |
| "step": 3528 |
| }, |
| { |
| "epoch": 1.531521328803322, |
| "grad_norm": 0.15896204329046001, |
| "learning_rate": 4.8339342583866005e-05, |
| "loss": 0.197, |
| "num_input_tokens_seen": 856037440, |
| "step": 3549 |
| }, |
| { |
| "epoch": 1.5405813514533786, |
| "grad_norm": 0.1848696286617871, |
| "learning_rate": 4.7865125498571086e-05, |
| "loss": 0.1957, |
| "num_input_tokens_seen": 860972624, |
| "step": 3570 |
| }, |
| { |
| "epoch": 1.5496413741034352, |
| "grad_norm": 0.16411859849940666, |
| "learning_rate": 4.739110072813823e-05, |
| "loss": 0.1926, |
| "num_input_tokens_seen": 866078128, |
| "step": 3591 |
| }, |
| { |
| "epoch": 1.558701396753492, |
| "grad_norm": 0.15293153546751434, |
| "learning_rate": 4.6917310973910425e-05, |
| "loss": 0.1934, |
| "num_input_tokens_seen": 871290720, |
| "step": 3612 |
| }, |
| { |
| "epoch": 1.5677614194035485, |
| "grad_norm": 0.18580264173261662, |
| "learning_rate": 4.6443798916059836e-05, |
| "loss": 0.1961, |
| "num_input_tokens_seen": 876353920, |
| "step": 3633 |
| }, |
| { |
| "epoch": 1.576821442053605, |
| "grad_norm": 0.16117670144515006, |
| "learning_rate": 4.597060720974298e-05, |
| "loss": 0.1902, |
| "num_input_tokens_seen": 881469536, |
| "step": 3654 |
| }, |
| { |
| "epoch": 1.5858814647036619, |
| "grad_norm": 0.1821844142116438, |
| "learning_rate": 4.549777848125833e-05, |
| "loss": 0.1971, |
| "num_input_tokens_seen": 886532048, |
| "step": 3675 |
| }, |
| { |
| "epoch": 1.5949414873537184, |
| "grad_norm": 0.188981157327872, |
| "learning_rate": 4.50253553242065e-05, |
| "loss": 0.1952, |
| "num_input_tokens_seen": 891565152, |
| "step": 3696 |
| }, |
| { |
| "epoch": 1.604001510003775, |
| "grad_norm": 0.1663775536476532, |
| "learning_rate": 4.4553380295653053e-05, |
| "loss": 0.1908, |
| "num_input_tokens_seen": 896603568, |
| "step": 3717 |
| }, |
| { |
| "epoch": 1.6130615326538318, |
| "grad_norm": 0.16695660636413406, |
| "learning_rate": 4.40818959122951e-05, |
| "loss": 0.1945, |
| "num_input_tokens_seen": 901703264, |
| "step": 3738 |
| }, |
| { |
| "epoch": 1.6221215553038881, |
| "grad_norm": 0.18003132042487852, |
| "learning_rate": 4.361094464663118e-05, |
| "loss": 0.1911, |
| "num_input_tokens_seen": 906846256, |
| "step": 3759 |
| }, |
| { |
| "epoch": 1.631181577953945, |
| "grad_norm": 0.16377146934729214, |
| "learning_rate": 4.3140568923135264e-05, |
| "loss": 0.193, |
| "num_input_tokens_seen": 911964272, |
| "step": 3780 |
| }, |
| { |
| "epoch": 1.6402416006040015, |
| "grad_norm": 0.1711801561805431, |
| "learning_rate": 4.267081111443501e-05, |
| "loss": 0.1898, |
| "num_input_tokens_seen": 917101840, |
| "step": 3801 |
| }, |
| { |
| "epoch": 1.649301623254058, |
| "grad_norm": 0.1743609898038798, |
| "learning_rate": 4.22017135374948e-05, |
| "loss": 0.1852, |
| "num_input_tokens_seen": 922205664, |
| "step": 3822 |
| }, |
| { |
| "epoch": 1.6583616459041148, |
| "grad_norm": 0.17938627926996303, |
| "learning_rate": 4.1733318449803624e-05, |
| "loss": 0.1863, |
| "num_input_tokens_seen": 927302560, |
| "step": 3843 |
| }, |
| { |
| "epoch": 1.6674216685541714, |
| "grad_norm": 0.16947333759434738, |
| "learning_rate": 4.1265668045568495e-05, |
| "loss": 0.1882, |
| "num_input_tokens_seen": 932325424, |
| "step": 3864 |
| }, |
| { |
| "epoch": 1.676481691204228, |
| "grad_norm": 0.16639553173104588, |
| "learning_rate": 4.079880445191351e-05, |
| "loss": 0.1893, |
| "num_input_tokens_seen": 937438464, |
| "step": 3885 |
| }, |
| { |
| "epoch": 1.6855417138542848, |
| "grad_norm": 0.14651023615133163, |
| "learning_rate": 4.033276972508484e-05, |
| "loss": 0.1885, |
| "num_input_tokens_seen": 942617840, |
| "step": 3906 |
| }, |
| { |
| "epoch": 1.6946017365043413, |
| "grad_norm": 0.17812367097504705, |
| "learning_rate": 3.9867605846662256e-05, |
| "loss": 0.1883, |
| "num_input_tokens_seen": 947823200, |
| "step": 3927 |
| }, |
| { |
| "epoch": 1.7036617591543979, |
| "grad_norm": 0.1872194452488721, |
| "learning_rate": 3.940335471977734e-05, |
| "loss": 0.1871, |
| "num_input_tokens_seen": 952872784, |
| "step": 3948 |
| }, |
| { |
| "epoch": 1.7127217818044547, |
| "grad_norm": 0.1643081487678093, |
| "learning_rate": 3.89400581653387e-05, |
| "loss": 0.1853, |
| "num_input_tokens_seen": 957908608, |
| "step": 3969 |
| }, |
| { |
| "epoch": 1.721781804454511, |
| "grad_norm": 0.1753318656372752, |
| "learning_rate": 3.847775791826468e-05, |
| "loss": 0.1862, |
| "num_input_tokens_seen": 962972208, |
| "step": 3990 |
| }, |
| { |
| "epoch": 1.7308418271045678, |
| "grad_norm": 0.15851578473823177, |
| "learning_rate": 3.801649562372371e-05, |
| "loss": 0.1913, |
| "num_input_tokens_seen": 968020256, |
| "step": 4011 |
| }, |
| { |
| "epoch": 1.7399018497546244, |
| "grad_norm": 0.1812244935434252, |
| "learning_rate": 3.755631283338287e-05, |
| "loss": 0.1908, |
| "num_input_tokens_seen": 973116912, |
| "step": 4032 |
| }, |
| { |
| "epoch": 1.748961872404681, |
| "grad_norm": 0.16265272189557067, |
| "learning_rate": 3.709725100166482e-05, |
| "loss": 0.1839, |
| "num_input_tokens_seen": 978276224, |
| "step": 4053 |
| }, |
| { |
| "epoch": 1.7580218950547377, |
| "grad_norm": 0.16695166226650535, |
| "learning_rate": 3.663935148201341e-05, |
| "loss": 0.1869, |
| "num_input_tokens_seen": 983499184, |
| "step": 4074 |
| }, |
| { |
| "epoch": 1.7670819177047943, |
| "grad_norm": 0.17046617487207735, |
| "learning_rate": 3.618265552316849e-05, |
| "loss": 0.1884, |
| "num_input_tokens_seen": 988511216, |
| "step": 4095 |
| }, |
| { |
| "epoch": 1.7761419403548508, |
| "grad_norm": 0.16853521697476523, |
| "learning_rate": 3.572720426545017e-05, |
| "loss": 0.1863, |
| "num_input_tokens_seen": 993542272, |
| "step": 4116 |
| }, |
| { |
| "epoch": 1.7852019630049076, |
| "grad_norm": 0.16196055724715774, |
| "learning_rate": 3.5273038737052675e-05, |
| "loss": 0.1884, |
| "num_input_tokens_seen": 998561584, |
| "step": 4137 |
| }, |
| { |
| "epoch": 1.794261985654964, |
| "grad_norm": 0.17704958458091835, |
| "learning_rate": 3.482019985034861e-05, |
| "loss": 0.1815, |
| "num_input_tokens_seen": 1003535696, |
| "step": 4158 |
| }, |
| { |
| "epoch": 1.8033220083050208, |
| "grad_norm": 0.17212954264417213, |
| "learning_rate": 3.43687283982033e-05, |
| "loss": 0.1798, |
| "num_input_tokens_seen": 1008610432, |
| "step": 4179 |
| }, |
| { |
| "epoch": 1.8123820309550775, |
| "grad_norm": 0.1642508897074481, |
| "learning_rate": 3.391866505030009e-05, |
| "loss": 0.1797, |
| "num_input_tokens_seen": 1013577840, |
| "step": 4200 |
| }, |
| { |
| "epoch": 1.821442053605134, |
| "grad_norm": 0.1895193964135349, |
| "learning_rate": 3.347005034947681e-05, |
| "loss": 0.1773, |
| "num_input_tokens_seen": 1018549888, |
| "step": 4221 |
| }, |
| { |
| "epoch": 1.8305020762551907, |
| "grad_norm": 0.18935672208270557, |
| "learning_rate": 3.3022924708073524e-05, |
| "loss": 0.1828, |
| "num_input_tokens_seen": 1023498368, |
| "step": 4242 |
| }, |
| { |
| "epoch": 1.8395620989052472, |
| "grad_norm": 0.15473627402095172, |
| "learning_rate": 3.257732840429206e-05, |
| "loss": 0.18, |
| "num_input_tokens_seen": 1028542992, |
| "step": 4263 |
| }, |
| { |
| "epoch": 1.8486221215553038, |
| "grad_norm": 0.17782850850732204, |
| "learning_rate": 3.2133301578567646e-05, |
| "loss": 0.1825, |
| "num_input_tokens_seen": 1033574288, |
| "step": 4284 |
| }, |
| { |
| "epoch": 1.8576821442053606, |
| "grad_norm": 0.17879475744218412, |
| "learning_rate": 3.169088422995304e-05, |
| "loss": 0.1776, |
| "num_input_tokens_seen": 1038606208, |
| "step": 4305 |
| }, |
| { |
| "epoch": 1.8667421668554172, |
| "grad_norm": 0.16166293718253705, |
| "learning_rate": 3.125011621251516e-05, |
| "loss": 0.1768, |
| "num_input_tokens_seen": 1043770704, |
| "step": 4326 |
| }, |
| { |
| "epoch": 1.8758021895054737, |
| "grad_norm": 0.1607230134601091, |
| "learning_rate": 3.081103723174515e-05, |
| "loss": 0.1778, |
| "num_input_tokens_seen": 1048829664, |
| "step": 4347 |
| }, |
| { |
| "epoch": 1.8848622121555305, |
| "grad_norm": 0.159447656379203, |
| "learning_rate": 3.0373686840981397e-05, |
| "loss": 0.1788, |
| "num_input_tokens_seen": 1053950224, |
| "step": 4368 |
| }, |
| { |
| "epoch": 1.8939222348055869, |
| "grad_norm": 0.1674766446494019, |
| "learning_rate": 2.9938104437846572e-05, |
| "loss": 0.176, |
| "num_input_tokens_seen": 1059119888, |
| "step": 4389 |
| }, |
| { |
| "epoch": 1.9029822574556436, |
| "grad_norm": 0.17753675611302996, |
| "learning_rate": 2.950432926069857e-05, |
| "loss": 0.1783, |
| "num_input_tokens_seen": 1064177088, |
| "step": 4410 |
| }, |
| { |
| "epoch": 1.9120422801057002, |
| "grad_norm": 0.17087252328331373, |
| "learning_rate": 2.9072400385095865e-05, |
| "loss": 0.178, |
| "num_input_tokens_seen": 1069200928, |
| "step": 4431 |
| }, |
| { |
| "epoch": 1.9211023027557568, |
| "grad_norm": 0.16133227423173738, |
| "learning_rate": 2.864235672027741e-05, |
| "loss": 0.1759, |
| "num_input_tokens_seen": 1074313840, |
| "step": 4452 |
| }, |
| { |
| "epoch": 1.9301623254058136, |
| "grad_norm": 0.1865580464555286, |
| "learning_rate": 2.8214237005657627e-05, |
| "loss": 0.1769, |
| "num_input_tokens_seen": 1079348080, |
| "step": 4473 |
| }, |
| { |
| "epoch": 1.9392223480558701, |
| "grad_norm": 0.17483638643553473, |
| "learning_rate": 2.7788079807336692e-05, |
| "loss": 0.1761, |
| "num_input_tokens_seen": 1084415072, |
| "step": 4494 |
| }, |
| { |
| "epoch": 1.9482823707059267, |
| "grad_norm": 0.16127203478332483, |
| "learning_rate": 2.7363923514626367e-05, |
| "loss": 0.1762, |
| "num_input_tokens_seen": 1089576528, |
| "step": 4515 |
| }, |
| { |
| "epoch": 1.9573423933559835, |
| "grad_norm": 0.1818665955450248, |
| "learning_rate": 2.6941806336591808e-05, |
| "loss": 0.1715, |
| "num_input_tokens_seen": 1094741664, |
| "step": 4536 |
| }, |
| { |
| "epoch": 1.96640241600604, |
| "grad_norm": 0.16510174569454042, |
| "learning_rate": 2.6521766298609584e-05, |
| "loss": 0.1728, |
| "num_input_tokens_seen": 1099708896, |
| "step": 4557 |
| }, |
| { |
| "epoch": 1.9754624386560966, |
| "grad_norm": 0.17393602608748607, |
| "learning_rate": 2.610384123894229e-05, |
| "loss": 0.175, |
| "num_input_tokens_seen": 1104824512, |
| "step": 4578 |
| }, |
| { |
| "epoch": 1.9845224613061534, |
| "grad_norm": 0.18901915034549496, |
| "learning_rate": 2.568806880532991e-05, |
| "loss": 0.1736, |
| "num_input_tokens_seen": 1109954160, |
| "step": 4599 |
| }, |
| { |
| "epoch": 1.9935824839562097, |
| "grad_norm": 0.19336693087348367, |
| "learning_rate": 2.5274486451598565e-05, |
| "loss": 0.1704, |
| "num_input_tokens_seen": 1115130992, |
| "step": 4620 |
| }, |
| { |
| "epoch": 2.003020007550019, |
| "grad_norm": 0.192558427240515, |
| "learning_rate": 2.4863131434286342e-05, |
| "loss": 0.1548, |
| "num_input_tokens_seen": 1120294784, |
| "step": 4641 |
| }, |
| { |
| "epoch": 2.0120800302000754, |
| "grad_norm": 0.19360993518356076, |
| "learning_rate": 2.4454040809287342e-05, |
| "loss": 0.1188, |
| "num_input_tokens_seen": 1125375728, |
| "step": 4662 |
| }, |
| { |
| "epoch": 2.021140052850132, |
| "grad_norm": 0.19346692314512148, |
| "learning_rate": 2.4047251428513485e-05, |
| "loss": 0.1176, |
| "num_input_tokens_seen": 1130663488, |
| "step": 4683 |
| }, |
| { |
| "epoch": 2.0302000755001886, |
| "grad_norm": 0.1915500646603155, |
| "learning_rate": 2.364279993657487e-05, |
| "loss": 0.1166, |
| "num_input_tokens_seen": 1135729856, |
| "step": 4704 |
| }, |
| { |
| "epoch": 2.0392600981502453, |
| "grad_norm": 0.21320689744431512, |
| "learning_rate": 2.3240722767478657e-05, |
| "loss": 0.1129, |
| "num_input_tokens_seen": 1140728768, |
| "step": 4725 |
| }, |
| { |
| "epoch": 2.048320120800302, |
| "grad_norm": 0.20002232427856995, |
| "learning_rate": 2.2841056141347038e-05, |
| "loss": 0.1122, |
| "num_input_tokens_seen": 1145810672, |
| "step": 4746 |
| }, |
| { |
| "epoch": 2.0573801434503585, |
| "grad_norm": 0.21228559927967805, |
| "learning_rate": 2.2443836061154415e-05, |
| "loss": 0.1145, |
| "num_input_tokens_seen": 1150862064, |
| "step": 4767 |
| }, |
| { |
| "epoch": 2.0664401661004153, |
| "grad_norm": 0.19792768108065947, |
| "learning_rate": 2.2049098309484195e-05, |
| "loss": 0.1153, |
| "num_input_tokens_seen": 1155954544, |
| "step": 4788 |
| }, |
| { |
| "epoch": 2.075500188750472, |
| "grad_norm": 0.21247296887779493, |
| "learning_rate": 2.1656878445305447e-05, |
| "loss": 0.1152, |
| "num_input_tokens_seen": 1161054256, |
| "step": 4809 |
| }, |
| { |
| "epoch": 2.0845602114005284, |
| "grad_norm": 0.19109010163603735, |
| "learning_rate": 2.1267211800769528e-05, |
| "loss": 0.1148, |
| "num_input_tokens_seen": 1166056688, |
| "step": 4830 |
| }, |
| { |
| "epoch": 2.093620234050585, |
| "grad_norm": 0.19679782828606215, |
| "learning_rate": 2.088013347802738e-05, |
| "loss": 0.1119, |
| "num_input_tokens_seen": 1171231104, |
| "step": 4851 |
| }, |
| { |
| "epoch": 2.102680256700642, |
| "grad_norm": 0.2128224999872872, |
| "learning_rate": 2.0495678346067414e-05, |
| "loss": 0.1101, |
| "num_input_tokens_seen": 1176284976, |
| "step": 4872 |
| }, |
| { |
| "epoch": 2.1117402793506983, |
| "grad_norm": 0.2123206811047115, |
| "learning_rate": 2.011388103757442e-05, |
| "loss": 0.1139, |
| "num_input_tokens_seen": 1181400944, |
| "step": 4893 |
| }, |
| { |
| "epoch": 2.120800302000755, |
| "grad_norm": 0.2071017368245751, |
| "learning_rate": 1.973477594580977e-05, |
| "loss": 0.1116, |
| "num_input_tokens_seen": 1186527776, |
| "step": 4914 |
| }, |
| { |
| "epoch": 2.1298603246508114, |
| "grad_norm": 0.17323287993849096, |
| "learning_rate": 1.9358397221513176e-05, |
| "loss": 0.112, |
| "num_input_tokens_seen": 1191661680, |
| "step": 4935 |
| }, |
| { |
| "epoch": 2.138920347300868, |
| "grad_norm": 0.20213151950682676, |
| "learning_rate": 1.8984778769826316e-05, |
| "loss": 0.1106, |
| "num_input_tokens_seen": 1196759648, |
| "step": 4956 |
| }, |
| { |
| "epoch": 2.147980369950925, |
| "grad_norm": 0.19700292148625387, |
| "learning_rate": 1.8613954247238586e-05, |
| "loss": 0.1124, |
| "num_input_tokens_seen": 1201857104, |
| "step": 4977 |
| }, |
| { |
| "epoch": 2.1570403926009813, |
| "grad_norm": 0.21527000496492768, |
| "learning_rate": 1.82459570585552e-05, |
| "loss": 0.1136, |
| "num_input_tokens_seen": 1206927520, |
| "step": 4998 |
| }, |
| { |
| "epoch": 2.157903251900987, |
| "eval_loss": 0.19485081732273102, |
| "eval_runtime": 529.4687, |
| "eval_samples_per_second": 17.331, |
| "eval_steps_per_second": 1.084, |
| "num_input_tokens_seen": 1207385424, |
| "step": 5000 |
| }, |
| { |
| "epoch": 2.166100415251038, |
| "grad_norm": 0.2275158963594303, |
| "learning_rate": 1.7880820353888056e-05, |
| "loss": 0.1102, |
| "num_input_tokens_seen": 1211875824, |
| "step": 5019 |
| }, |
| { |
| "epoch": 2.175160437901095, |
| "grad_norm": 0.20450931488489404, |
| "learning_rate": 1.751857702566944e-05, |
| "loss": 0.113, |
| "num_input_tokens_seen": 1216954688, |
| "step": 5040 |
| }, |
| { |
| "epoch": 2.1842204605511513, |
| "grad_norm": 0.21173943893990088, |
| "learning_rate": 1.7159259705689e-05, |
| "loss": 0.1104, |
| "num_input_tokens_seen": 1221976560, |
| "step": 5061 |
| }, |
| { |
| "epoch": 2.193280483201208, |
| "grad_norm": 0.20717537063008165, |
| "learning_rate": 1.6802900762154267e-05, |
| "loss": 0.1152, |
| "num_input_tokens_seen": 1226975776, |
| "step": 5082 |
| }, |
| { |
| "epoch": 2.2023405058512644, |
| "grad_norm": 0.19831781893791461, |
| "learning_rate": 1.644953229677474e-05, |
| "loss": 0.1097, |
| "num_input_tokens_seen": 1231998784, |
| "step": 5103 |
| }, |
| { |
| "epoch": 2.211400528501321, |
| "grad_norm": 0.18492554370023317, |
| "learning_rate": 1.609918614187009e-05, |
| "loss": 0.1111, |
| "num_input_tokens_seen": 1236990864, |
| "step": 5124 |
| }, |
| { |
| "epoch": 2.220460551151378, |
| "grad_norm": 0.20016085842409992, |
| "learning_rate": 1.575189385750271e-05, |
| "loss": 0.1104, |
| "num_input_tokens_seen": 1242051280, |
| "step": 5145 |
| }, |
| { |
| "epoch": 2.2295205738014343, |
| "grad_norm": 0.2070887723839001, |
| "learning_rate": 1.540768672863468e-05, |
| "loss": 0.1075, |
| "num_input_tokens_seen": 1247127040, |
| "step": 5166 |
| }, |
| { |
| "epoch": 2.238580596451491, |
| "grad_norm": 0.1925213910719394, |
| "learning_rate": 1.5066595762309477e-05, |
| "loss": 0.1093, |
| "num_input_tokens_seen": 1252158672, |
| "step": 5187 |
| }, |
| { |
| "epoch": 2.247640619101548, |
| "grad_norm": 0.205831347337121, |
| "learning_rate": 1.4728651684858834e-05, |
| "loss": 0.1126, |
| "num_input_tokens_seen": 1257321184, |
| "step": 5208 |
| }, |
| { |
| "epoch": 2.2567006417516042, |
| "grad_norm": 0.19926488557298117, |
| "learning_rate": 1.4393884939134833e-05, |
| "loss": 0.1064, |
| "num_input_tokens_seen": 1262315984, |
| "step": 5229 |
| }, |
| { |
| "epoch": 2.265760664401661, |
| "grad_norm": 0.18546254868875062, |
| "learning_rate": 1.4062325681767469e-05, |
| "loss": 0.1096, |
| "num_input_tokens_seen": 1267351616, |
| "step": 5250 |
| }, |
| { |
| "epoch": 2.274820687051718, |
| "grad_norm": 0.21880629906349583, |
| "learning_rate": 1.3734003780448218e-05, |
| "loss": 0.1089, |
| "num_input_tokens_seen": 1272350592, |
| "step": 5271 |
| }, |
| { |
| "epoch": 2.283880709701774, |
| "grad_norm": 0.1996371660776893, |
| "learning_rate": 1.340894881123932e-05, |
| "loss": 0.1093, |
| "num_input_tokens_seen": 1277314160, |
| "step": 5292 |
| }, |
| { |
| "epoch": 2.292940732351831, |
| "grad_norm": 0.18322023913039737, |
| "learning_rate": 1.308719005590957e-05, |
| "loss": 0.1064, |
| "num_input_tokens_seen": 1282348896, |
| "step": 5313 |
| }, |
| { |
| "epoch": 2.3020007550018873, |
| "grad_norm": 0.19825429674508396, |
| "learning_rate": 1.276875649929654e-05, |
| "loss": 0.1103, |
| "num_input_tokens_seen": 1287503120, |
| "step": 5334 |
| }, |
| { |
| "epoch": 2.311060777651944, |
| "grad_norm": 0.20100225641083314, |
| "learning_rate": 1.2453676826695532e-05, |
| "loss": 0.1077, |
| "num_input_tokens_seen": 1292488224, |
| "step": 5355 |
| }, |
| { |
| "epoch": 2.320120800302001, |
| "grad_norm": 0.19869949736224346, |
| "learning_rate": 1.2141979421275545e-05, |
| "loss": 0.1051, |
| "num_input_tokens_seen": 1297613792, |
| "step": 5376 |
| }, |
| { |
| "epoch": 2.329180822952057, |
| "grad_norm": 0.20145867765354752, |
| "learning_rate": 1.1833692361522459e-05, |
| "loss": 0.1063, |
| "num_input_tokens_seen": 1302765200, |
| "step": 5397 |
| }, |
| { |
| "epoch": 2.338240845602114, |
| "grad_norm": 0.20680505787617295, |
| "learning_rate": 1.1528843418709622e-05, |
| "loss": 0.1073, |
| "num_input_tokens_seen": 1307780896, |
| "step": 5418 |
| }, |
| { |
| "epoch": 2.3473008682521708, |
| "grad_norm": 0.23847500451035963, |
| "learning_rate": 1.1227460054396177e-05, |
| "loss": 0.1076, |
| "num_input_tokens_seen": 1312916864, |
| "step": 5439 |
| }, |
| { |
| "epoch": 2.356360890902227, |
| "grad_norm": 0.21518454470567003, |
| "learning_rate": 1.0929569417953278e-05, |
| "loss": 0.1049, |
| "num_input_tokens_seen": 1317924528, |
| "step": 5460 |
| }, |
| { |
| "epoch": 2.365420913552284, |
| "grad_norm": 0.19953783012904103, |
| "learning_rate": 1.0635198344118296e-05, |
| "loss": 0.1038, |
| "num_input_tokens_seen": 1322892896, |
| "step": 5481 |
| }, |
| { |
| "epoch": 2.3744809362023407, |
| "grad_norm": 0.20097656219123833, |
| "learning_rate": 1.034437335057762e-05, |
| "loss": 0.1049, |
| "num_input_tokens_seen": 1328000960, |
| "step": 5502 |
| }, |
| { |
| "epoch": 2.383540958852397, |
| "grad_norm": 0.20223248741837738, |
| "learning_rate": 1.005712063557776e-05, |
| "loss": 0.1026, |
| "num_input_tokens_seen": 1333104928, |
| "step": 5523 |
| }, |
| { |
| "epoch": 2.392600981502454, |
| "grad_norm": 0.184389360298103, |
| "learning_rate": 9.773466075565457e-06, |
| "loss": 0.1061, |
| "num_input_tokens_seen": 1338094928, |
| "step": 5544 |
| }, |
| { |
| "epoch": 2.40166100415251, |
| "grad_norm": 0.18202586925329933, |
| "learning_rate": 9.493435222856556e-06, |
| "loss": 0.1078, |
| "num_input_tokens_seen": 1343094352, |
| "step": 5565 |
| }, |
| { |
| "epoch": 2.410721026802567, |
| "grad_norm": 0.21238526697964133, |
| "learning_rate": 9.21705330333426e-06, |
| "loss": 0.1021, |
| "num_input_tokens_seen": 1348209008, |
| "step": 5586 |
| }, |
| { |
| "epoch": 2.4197810494526237, |
| "grad_norm": 0.2033611614783377, |
| "learning_rate": 8.944345214176675e-06, |
| "loss": 0.105, |
| "num_input_tokens_seen": 1353281712, |
| "step": 5607 |
| }, |
| { |
| "epoch": 2.42884107210268, |
| "grad_norm": 0.19144661395169293, |
| "learning_rate": 8.675335521614036e-06, |
| "loss": 0.1039, |
| "num_input_tokens_seen": 1358325728, |
| "step": 5628 |
| }, |
| { |
| "epoch": 2.437901094752737, |
| "grad_norm": 0.20545555012965147, |
| "learning_rate": 8.410048458715763e-06, |
| "loss": 0.1026, |
| "num_input_tokens_seen": 1363274864, |
| "step": 5649 |
| }, |
| { |
| "epoch": 2.4469611174027937, |
| "grad_norm": 0.20596285141748574, |
| "learning_rate": 8.148507923207377e-06, |
| "loss": 0.1046, |
| "num_input_tokens_seen": 1368398176, |
| "step": 5670 |
| }, |
| { |
| "epoch": 2.45602114005285, |
| "grad_norm": 0.21097019629979452, |
| "learning_rate": 7.890737475317817e-06, |
| "loss": 0.1062, |
| "num_input_tokens_seen": 1373421664, |
| "step": 5691 |
| }, |
| { |
| "epoch": 2.465081162702907, |
| "grad_norm": 0.1903944548607354, |
| "learning_rate": 7.636760335657056e-06, |
| "loss": 0.1005, |
| "num_input_tokens_seen": 1378386688, |
| "step": 5712 |
| }, |
| { |
| "epoch": 2.4741411853529636, |
| "grad_norm": 0.19609864215469505, |
| "learning_rate": 7.38659938312432e-06, |
| "loss": 0.1008, |
| "num_input_tokens_seen": 1383515360, |
| "step": 5733 |
| }, |
| { |
| "epoch": 2.48320120800302, |
| "grad_norm": 0.18901755025774616, |
| "learning_rate": 7.140277152847103e-06, |
| "loss": 0.1012, |
| "num_input_tokens_seen": 1388651712, |
| "step": 5754 |
| }, |
| { |
| "epoch": 2.4922612306530767, |
| "grad_norm": 0.2089521843263624, |
| "learning_rate": 6.89781583415115e-06, |
| "loss": 0.1004, |
| "num_input_tokens_seen": 1393819168, |
| "step": 5775 |
| }, |
| { |
| "epoch": 2.501321253303133, |
| "grad_norm": 0.20297486453222555, |
| "learning_rate": 6.659237268561569e-06, |
| "loss": 0.1058, |
| "num_input_tokens_seen": 1399005008, |
| "step": 5796 |
| }, |
| { |
| "epoch": 2.51038127595319, |
| "grad_norm": 0.1950872269091398, |
| "learning_rate": 6.424562947835367e-06, |
| "loss": 0.0996, |
| "num_input_tokens_seen": 1404075040, |
| "step": 5817 |
| }, |
| { |
| "epoch": 2.5194412986032466, |
| "grad_norm": 0.19137900590478205, |
| "learning_rate": 6.193814012025278e-06, |
| "loss": 0.098, |
| "num_input_tokens_seen": 1409145760, |
| "step": 5838 |
| }, |
| { |
| "epoch": 2.5285013212533034, |
| "grad_norm": 0.21343987395986905, |
| "learning_rate": 5.967011247575532e-06, |
| "loss": 0.1053, |
| "num_input_tokens_seen": 1414225568, |
| "step": 5859 |
| }, |
| { |
| "epoch": 2.5375613439033597, |
| "grad_norm": 0.21335949851815006, |
| "learning_rate": 5.744175085449338e-06, |
| "loss": 0.1021, |
| "num_input_tokens_seen": 1419339216, |
| "step": 5880 |
| }, |
| { |
| "epoch": 2.5466213665534165, |
| "grad_norm": 0.19658196939034006, |
| "learning_rate": 5.525325599288356e-06, |
| "loss": 0.1003, |
| "num_input_tokens_seen": 1424423024, |
| "step": 5901 |
| }, |
| { |
| "epoch": 2.555681389203473, |
| "grad_norm": 0.1831370056225536, |
| "learning_rate": 5.310482503604497e-06, |
| "loss": 0.1039, |
| "num_input_tokens_seen": 1429360512, |
| "step": 5922 |
| }, |
| { |
| "epoch": 2.5647414118535297, |
| "grad_norm": 0.21013156618721565, |
| "learning_rate": 5.09966515200393e-06, |
| "loss": 0.1034, |
| "num_input_tokens_seen": 1434443216, |
| "step": 5943 |
| }, |
| { |
| "epoch": 2.5738014345035864, |
| "grad_norm": 0.2204689190211589, |
| "learning_rate": 4.892892535443655e-06, |
| "loss": 0.1025, |
| "num_input_tokens_seen": 1439693152, |
| "step": 5964 |
| }, |
| { |
| "epoch": 2.582861457153643, |
| "grad_norm": 0.20888767875138448, |
| "learning_rate": 4.690183280520777e-06, |
| "loss": 0.1041, |
| "num_input_tokens_seen": 1444742640, |
| "step": 5985 |
| }, |
| { |
| "epoch": 2.5919214798036996, |
| "grad_norm": 0.20314033156230726, |
| "learning_rate": 4.491555647794609e-06, |
| "loss": 0.1035, |
| "num_input_tokens_seen": 1449817024, |
| "step": 6006 |
| }, |
| { |
| "epoch": 2.600981502453756, |
| "grad_norm": 0.1878500846044568, |
| "learning_rate": 4.297027530141634e-06, |
| "loss": 0.102, |
| "num_input_tokens_seen": 1454966656, |
| "step": 6027 |
| }, |
| { |
| "epoch": 2.6100415251038127, |
| "grad_norm": 0.1689243463349296, |
| "learning_rate": 4.106616451143719e-06, |
| "loss": 0.0968, |
| "num_input_tokens_seen": 1460107904, |
| "step": 6048 |
| }, |
| { |
| "epoch": 2.6191015477538695, |
| "grad_norm": 0.20642026958771845, |
| "learning_rate": 3.9203395635095615e-06, |
| "loss": 0.1025, |
| "num_input_tokens_seen": 1465329712, |
| "step": 6069 |
| }, |
| { |
| "epoch": 2.6281615704039263, |
| "grad_norm": 0.18586990386683522, |
| "learning_rate": 3.7382136475294592e-06, |
| "loss": 0.0992, |
| "num_input_tokens_seen": 1470486400, |
| "step": 6090 |
| }, |
| { |
| "epoch": 2.6372215930539826, |
| "grad_norm": 0.2145218277764739, |
| "learning_rate": 3.5602551095638094e-06, |
| "loss": 0.1014, |
| "num_input_tokens_seen": 1475481216, |
| "step": 6111 |
| }, |
| { |
| "epoch": 2.6462816157040394, |
| "grad_norm": 0.18413690443200365, |
| "learning_rate": 3.386479980565077e-06, |
| "loss": 0.097, |
| "num_input_tokens_seen": 1480509520, |
| "step": 6132 |
| }, |
| { |
| "epoch": 2.6553416383540958, |
| "grad_norm": 0.20568925987079073, |
| "learning_rate": 3.2169039146337455e-06, |
| "loss": 0.1011, |
| "num_input_tokens_seen": 1485415168, |
| "step": 6153 |
| }, |
| { |
| "epoch": 2.6644016610041525, |
| "grad_norm": 0.19689596480908558, |
| "learning_rate": 3.0515421876081364e-06, |
| "loss": 0.1003, |
| "num_input_tokens_seen": 1490580288, |
| "step": 6174 |
| }, |
| { |
| "epoch": 2.6734616836542093, |
| "grad_norm": 0.19998505506705416, |
| "learning_rate": 2.8904096956883396e-06, |
| "loss": 0.1011, |
| "num_input_tokens_seen": 1495724928, |
| "step": 6195 |
| }, |
| { |
| "epoch": 2.6825217063042657, |
| "grad_norm": 0.19403273255448156, |
| "learning_rate": 2.733520954094304e-06, |
| "loss": 0.0992, |
| "num_input_tokens_seen": 1500671568, |
| "step": 6216 |
| }, |
| { |
| "epoch": 2.6915817289543225, |
| "grad_norm": 0.2062968511222456, |
| "learning_rate": 2.580890095758276e-06, |
| "loss": 0.0985, |
| "num_input_tokens_seen": 1505736848, |
| "step": 6237 |
| }, |
| { |
| "epoch": 2.700641751604379, |
| "grad_norm": 0.18160540614114035, |
| "learning_rate": 2.4325308700516804e-06, |
| "loss": 0.0999, |
| "num_input_tokens_seen": 1510772384, |
| "step": 6258 |
| }, |
| { |
| "epoch": 2.7097017742544356, |
| "grad_norm": 0.1923201194653099, |
| "learning_rate": 2.288456641546549e-06, |
| "loss": 0.1015, |
| "num_input_tokens_seen": 1515840336, |
| "step": 6279 |
| }, |
| { |
| "epoch": 2.7187617969044924, |
| "grad_norm": 0.19463587674679012, |
| "learning_rate": 2.1486803888115802e-06, |
| "loss": 0.0952, |
| "num_input_tokens_seen": 1520795728, |
| "step": 6300 |
| }, |
| { |
| "epoch": 2.7278218195545487, |
| "grad_norm": 0.20675363777805839, |
| "learning_rate": 2.013214703242994e-06, |
| "loss": 0.1014, |
| "num_input_tokens_seen": 1525885232, |
| "step": 6321 |
| }, |
| { |
| "epoch": 2.7368818422046055, |
| "grad_norm": 0.20373174987364437, |
| "learning_rate": 1.8820717879303175e-06, |
| "loss": 0.0962, |
| "num_input_tokens_seen": 1531020736, |
| "step": 6342 |
| }, |
| { |
| "epoch": 2.7459418648546623, |
| "grad_norm": 0.1956195345947546, |
| "learning_rate": 1.7552634565570325e-06, |
| "loss": 0.0984, |
| "num_input_tokens_seen": 1536139280, |
| "step": 6363 |
| }, |
| { |
| "epoch": 2.7550018875047186, |
| "grad_norm": 0.1948987852214273, |
| "learning_rate": 1.6328011323364313e-06, |
| "loss": 0.0996, |
| "num_input_tokens_seen": 1541119392, |
| "step": 6384 |
| }, |
| { |
| "epoch": 2.7640619101547754, |
| "grad_norm": 0.20220064200986687, |
| "learning_rate": 1.5146958469825445e-06, |
| "loss": 0.098, |
| "num_input_tokens_seen": 1546172016, |
| "step": 6405 |
| }, |
| { |
| "epoch": 2.773121932804832, |
| "grad_norm": 0.18253057622469138, |
| "learning_rate": 1.4009582397163879e-06, |
| "loss": 0.0979, |
| "num_input_tokens_seen": 1551424800, |
| "step": 6426 |
| }, |
| { |
| "epoch": 2.7821819554548886, |
| "grad_norm": 0.19954203330015002, |
| "learning_rate": 1.2915985563075383e-06, |
| "loss": 0.096, |
| "num_input_tokens_seen": 1556510032, |
| "step": 6447 |
| }, |
| { |
| "epoch": 2.7912419781049453, |
| "grad_norm": 0.18349734934457243, |
| "learning_rate": 1.1866266481512234e-06, |
| "loss": 0.0995, |
| "num_input_tokens_seen": 1561425840, |
| "step": 6468 |
| }, |
| { |
| "epoch": 2.8003020007550017, |
| "grad_norm": 0.19583118554915796, |
| "learning_rate": 1.0860519713808082e-06, |
| "loss": 0.0979, |
| "num_input_tokens_seen": 1566437584, |
| "step": 6489 |
| }, |
| { |
| "epoch": 2.8093620234050585, |
| "grad_norm": 0.21286644366673166, |
| "learning_rate": 9.898835860160271e-07, |
| "loss": 0.0944, |
| "num_input_tokens_seen": 1571433728, |
| "step": 6510 |
| }, |
| { |
| "epoch": 2.8184220460551153, |
| "grad_norm": 0.18549288153723123, |
| "learning_rate": 8.981301551467924e-07, |
| "loss": 0.0949, |
| "num_input_tokens_seen": 1576510304, |
| "step": 6531 |
| }, |
| { |
| "epoch": 2.8274820687051716, |
| "grad_norm": 0.1969304866148859, |
| "learning_rate": 8.10799944152818e-07, |
| "loss": 0.0959, |
| "num_input_tokens_seen": 1581652480, |
| "step": 6552 |
| }, |
| { |
| "epoch": 2.8365420913552284, |
| "grad_norm": 0.19621694587220775, |
| "learning_rate": 7.279008199590543e-07, |
| "loss": 0.0995, |
| "num_input_tokens_seen": 1586710928, |
| "step": 6573 |
| }, |
| { |
| "epoch": 2.845602114005285, |
| "grad_norm": 0.19626099410771614, |
| "learning_rate": 6.494402503270158e-07, |
| "loss": 0.0973, |
| "num_input_tokens_seen": 1591751872, |
| "step": 6594 |
| }, |
| { |
| "epoch": 2.8546621366553415, |
| "grad_norm": 0.17530668226038035, |
| "learning_rate": 5.754253031820588e-07, |
| "loss": 0.0992, |
| "num_input_tokens_seen": 1596817344, |
| "step": 6615 |
| }, |
| { |
| "epoch": 2.8637221593053983, |
| "grad_norm": 0.1962276487617284, |
| "learning_rate": 5.058626459766902e-07, |
| "loss": 0.0978, |
| "num_input_tokens_seen": 1601955280, |
| "step": 6636 |
| }, |
| { |
| "epoch": 2.872782181955455, |
| "grad_norm": 0.19661166968992774, |
| "learning_rate": 4.407585450899587e-07, |
| "loss": 0.0963, |
| "num_input_tokens_seen": 1606977888, |
| "step": 6657 |
| }, |
| { |
| "epoch": 2.8818422046055114, |
| "grad_norm": 0.1900385103617743, |
| "learning_rate": 3.8011886526292395e-07, |
| "loss": 0.0932, |
| "num_input_tokens_seen": 1612137088, |
| "step": 6678 |
| }, |
| { |
| "epoch": 2.8909022272555682, |
| "grad_norm": 0.19054833990071282, |
| "learning_rate": 3.2394906907040056e-07, |
| "loss": 0.098, |
| "num_input_tokens_seen": 1617105760, |
| "step": 6699 |
| }, |
| { |
| "epoch": 2.8999622499056246, |
| "grad_norm": 0.22982156321658473, |
| "learning_rate": 2.7225421642883554e-07, |
| "loss": 0.099, |
| "num_input_tokens_seen": 1622079712, |
| "step": 6720 |
| }, |
| { |
| "epoch": 2.9090222725556814, |
| "grad_norm": 0.20733117617880123, |
| "learning_rate": 2.250389641405115e-07, |
| "loss": 0.0974, |
| "num_input_tokens_seen": 1627262208, |
| "step": 6741 |
| }, |
| { |
| "epoch": 2.918082295205738, |
| "grad_norm": 0.2091535861816229, |
| "learning_rate": 1.823075654740547e-07, |
| "loss": 0.0995, |
| "num_input_tokens_seen": 1632293744, |
| "step": 6762 |
| }, |
| { |
| "epoch": 2.9271423178557945, |
| "grad_norm": 0.2010699924002249, |
| "learning_rate": 1.4406386978128018e-07, |
| "loss": 0.0955, |
| "num_input_tokens_seen": 1637413856, |
| "step": 6783 |
| }, |
| { |
| "epoch": 2.9362023405058513, |
| "grad_norm": 0.2039974530223662, |
| "learning_rate": 1.1031132215043594e-07, |
| "loss": 0.095, |
| "num_input_tokens_seen": 1642376144, |
| "step": 6804 |
| }, |
| { |
| "epoch": 2.9452623631559076, |
| "grad_norm": 0.1867785407127438, |
| "learning_rate": 8.105296309586785e-08, |
| "loss": 0.0995, |
| "num_input_tokens_seen": 1647471008, |
| "step": 6825 |
| }, |
| { |
| "epoch": 2.9543223858059644, |
| "grad_norm": 0.1873978173047584, |
| "learning_rate": 5.629142828411094e-08, |
| "loss": 0.0976, |
| "num_input_tokens_seen": 1652489696, |
| "step": 6846 |
| }, |
| { |
| "epoch": 2.963382408456021, |
| "grad_norm": 0.1977061377471765, |
| "learning_rate": 3.602894829647374e-08, |
| "loss": 0.0955, |
| "num_input_tokens_seen": 1657488848, |
| "step": 6867 |
| }, |
| { |
| "epoch": 2.972442431106078, |
| "grad_norm": 0.19200607060766933, |
| "learning_rate": 2.0267348428087974e-08, |
| "loss": 0.0979, |
| "num_input_tokens_seen": 1662468816, |
| "step": 6888 |
| }, |
| { |
| "epoch": 2.9815024537561343, |
| "grad_norm": 0.21184767679059754, |
| "learning_rate": 9.008048523501122e-09, |
| "loss": 0.0999, |
| "num_input_tokens_seen": 1667627776, |
| "step": 6909 |
| }, |
| { |
| "epoch": 2.990562476406191, |
| "grad_norm": 0.2003643238535032, |
| "learning_rate": 2.252062848745462e-09, |
| "loss": 0.099, |
| "num_input_tokens_seen": 1672724048, |
| "step": 6930 |
| }, |
| { |
| "epoch": 2.9996224990562474, |
| "grad_norm": 0.19768122036104033, |
| "learning_rate": 0.0, |
| "loss": 0.0971, |
| "num_input_tokens_seen": 1677860944, |
| "step": 6951 |
| }, |
| { |
| "epoch": 2.9996224990562474, |
| "num_input_tokens_seen": 1677860944, |
| "step": 6951, |
| "total_flos": 8545808136798208.0, |
| "train_loss": 0.2019795169591595, |
| "train_runtime": 178782.2648, |
| "train_samples_per_second": 4.978, |
| "train_steps_per_second": 0.039 |
| } |
| ], |
| "logging_steps": 21, |
| "max_steps": 6951, |
| "num_input_tokens_seen": 1677860944, |
| "num_train_epochs": 3, |
| "save_steps": 5000, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 8545808136798208.0, |
| "train_batch_size": 2, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|