IntroSVG-Qwen2.5-VL-7B / trainer_state.json
gitcat404's picture
Upload 22 files
c2c1a57 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.9996224990562474,
"eval_steps": 5000,
"global_step": 6951,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.009060022650056626,
"grad_norm": 2.6664810047066947,
"learning_rate": 9.999774793715127e-05,
"loss": 1.431,
"num_input_tokens_seen": 5118512,
"step": 21
},
{
"epoch": 0.01812004530011325,
"grad_norm": 0.8152880359374859,
"learning_rate": 9.99909919514765e-05,
"loss": 0.5348,
"num_input_tokens_seen": 10150592,
"step": 42
},
{
"epoch": 0.027180067950169876,
"grad_norm": 0.6315786850545617,
"learning_rate": 9.997973265157192e-05,
"loss": 0.4599,
"num_input_tokens_seen": 15170896,
"step": 63
},
{
"epoch": 0.0362400906002265,
"grad_norm": 0.4538892840989214,
"learning_rate": 9.996397105170353e-05,
"loss": 0.4108,
"num_input_tokens_seen": 20155632,
"step": 84
},
{
"epoch": 0.045300113250283124,
"grad_norm": 0.4720911162561318,
"learning_rate": 9.994370857171588e-05,
"loss": 0.3947,
"num_input_tokens_seen": 25135440,
"step": 105
},
{
"epoch": 0.05436013590033975,
"grad_norm": 0.4622888476742547,
"learning_rate": 9.991894703690414e-05,
"loss": 0.3764,
"num_input_tokens_seen": 30346784,
"step": 126
},
{
"epoch": 0.06342015855039637,
"grad_norm": 0.4173513485950873,
"learning_rate": 9.988968867784958e-05,
"loss": 0.3751,
"num_input_tokens_seen": 35269664,
"step": 147
},
{
"epoch": 0.072480181200453,
"grad_norm": 0.39249341716348773,
"learning_rate": 9.985593613021872e-05,
"loss": 0.3704,
"num_input_tokens_seen": 40151792,
"step": 168
},
{
"epoch": 0.08154020385050963,
"grad_norm": 0.3080363284417135,
"learning_rate": 9.981769243452595e-05,
"loss": 0.3552,
"num_input_tokens_seen": 45283312,
"step": 189
},
{
"epoch": 0.09060022650056625,
"grad_norm": 0.3747998358734097,
"learning_rate": 9.977496103585949e-05,
"loss": 0.3576,
"num_input_tokens_seen": 50298912,
"step": 210
},
{
"epoch": 0.09966024915062288,
"grad_norm": 0.2975791185912304,
"learning_rate": 9.972774578357117e-05,
"loss": 0.3451,
"num_input_tokens_seen": 55445792,
"step": 231
},
{
"epoch": 0.1087202718006795,
"grad_norm": 0.3172928529259604,
"learning_rate": 9.96760509309296e-05,
"loss": 0.3506,
"num_input_tokens_seen": 60506496,
"step": 252
},
{
"epoch": 0.11778029445073612,
"grad_norm": 0.3304680645103982,
"learning_rate": 9.961988113473708e-05,
"loss": 0.3443,
"num_input_tokens_seen": 65678096,
"step": 273
},
{
"epoch": 0.12684031710079274,
"grad_norm": 0.2929759270178528,
"learning_rate": 9.955924145491005e-05,
"loss": 0.3446,
"num_input_tokens_seen": 70478688,
"step": 294
},
{
"epoch": 0.13590033975084936,
"grad_norm": 0.2809492487037724,
"learning_rate": 9.94941373540233e-05,
"loss": 0.3362,
"num_input_tokens_seen": 75373536,
"step": 315
},
{
"epoch": 0.144960362400906,
"grad_norm": 0.38609616986912937,
"learning_rate": 9.942457469681794e-05,
"loss": 0.3384,
"num_input_tokens_seen": 80476704,
"step": 336
},
{
"epoch": 0.15402038505096263,
"grad_norm": 0.24129947470960447,
"learning_rate": 9.935055974967299e-05,
"loss": 0.3315,
"num_input_tokens_seen": 85670800,
"step": 357
},
{
"epoch": 0.16308040770101925,
"grad_norm": 0.22875446286948012,
"learning_rate": 9.927209918004095e-05,
"loss": 0.33,
"num_input_tokens_seen": 90707040,
"step": 378
},
{
"epoch": 0.17214043035107587,
"grad_norm": 0.26219955903132913,
"learning_rate": 9.918920005584719e-05,
"loss": 0.3296,
"num_input_tokens_seen": 95824496,
"step": 399
},
{
"epoch": 0.1812004530011325,
"grad_norm": 0.6217611528853424,
"learning_rate": 9.910186984485321e-05,
"loss": 0.3315,
"num_input_tokens_seen": 100862224,
"step": 420
},
{
"epoch": 0.19026047565118911,
"grad_norm": 0.5343386676193482,
"learning_rate": 9.901011641398398e-05,
"loss": 0.353,
"num_input_tokens_seen": 105876656,
"step": 441
},
{
"epoch": 0.19932049830124576,
"grad_norm": 0.29271392356860787,
"learning_rate": 9.89139480286192e-05,
"loss": 0.3414,
"num_input_tokens_seen": 110980864,
"step": 462
},
{
"epoch": 0.20838052095130238,
"grad_norm": 0.2662936598921738,
"learning_rate": 9.881337335184878e-05,
"loss": 0.3224,
"num_input_tokens_seen": 116114800,
"step": 483
},
{
"epoch": 0.217440543601359,
"grad_norm": 0.3006863273182064,
"learning_rate": 9.870840144369246e-05,
"loss": 0.3212,
"num_input_tokens_seen": 121255744,
"step": 504
},
{
"epoch": 0.22650056625141562,
"grad_norm": 0.28557101475631624,
"learning_rate": 9.859904176028362e-05,
"loss": 0.3213,
"num_input_tokens_seen": 126288608,
"step": 525
},
{
"epoch": 0.23556058890147225,
"grad_norm": 0.30204169829965893,
"learning_rate": 9.848530415301747e-05,
"loss": 0.3198,
"num_input_tokens_seen": 131233488,
"step": 546
},
{
"epoch": 0.24462061155152887,
"grad_norm": 0.22788149212961117,
"learning_rate": 9.836719886766356e-05,
"loss": 0.3149,
"num_input_tokens_seen": 136257888,
"step": 567
},
{
"epoch": 0.2536806342015855,
"grad_norm": 0.25388452332747136,
"learning_rate": 9.824473654344297e-05,
"loss": 0.3169,
"num_input_tokens_seen": 141405120,
"step": 588
},
{
"epoch": 0.2627406568516421,
"grad_norm": 0.22932388019266456,
"learning_rate": 9.811792821206969e-05,
"loss": 0.3142,
"num_input_tokens_seen": 146380496,
"step": 609
},
{
"epoch": 0.2718006795016987,
"grad_norm": 0.21316227582275912,
"learning_rate": 9.7986785296757e-05,
"loss": 0.3097,
"num_input_tokens_seen": 151344048,
"step": 630
},
{
"epoch": 0.2808607021517554,
"grad_norm": 0.2055839569031328,
"learning_rate": 9.785131961118844e-05,
"loss": 0.3116,
"num_input_tokens_seen": 156435136,
"step": 651
},
{
"epoch": 0.289920724801812,
"grad_norm": 0.2441799648084251,
"learning_rate": 9.771154335845345e-05,
"loss": 0.3086,
"num_input_tokens_seen": 161536224,
"step": 672
},
{
"epoch": 0.29898074745186864,
"grad_norm": 0.20481810635702893,
"learning_rate": 9.756746912994832e-05,
"loss": 0.3057,
"num_input_tokens_seen": 166573984,
"step": 693
},
{
"epoch": 0.30804077010192527,
"grad_norm": 0.2592126104806373,
"learning_rate": 9.741910990424174e-05,
"loss": 0.3017,
"num_input_tokens_seen": 171638000,
"step": 714
},
{
"epoch": 0.3171007927519819,
"grad_norm": 0.25174137308280303,
"learning_rate": 9.726647904590571e-05,
"loss": 0.3066,
"num_input_tokens_seen": 176765648,
"step": 735
},
{
"epoch": 0.3261608154020385,
"grad_norm": 0.20855236580529835,
"learning_rate": 9.710959030431167e-05,
"loss": 0.2996,
"num_input_tokens_seen": 181949360,
"step": 756
},
{
"epoch": 0.3352208380520951,
"grad_norm": 0.23697815659002952,
"learning_rate": 9.694845781239187e-05,
"loss": 0.2972,
"num_input_tokens_seen": 186990096,
"step": 777
},
{
"epoch": 0.34428086070215175,
"grad_norm": 0.2492979898134609,
"learning_rate": 9.678309608536626e-05,
"loss": 0.2984,
"num_input_tokens_seen": 192083856,
"step": 798
},
{
"epoch": 0.35334088335220837,
"grad_norm": 0.23816573617268064,
"learning_rate": 9.661352001943493e-05,
"loss": 0.2957,
"num_input_tokens_seen": 197134448,
"step": 819
},
{
"epoch": 0.362400906002265,
"grad_norm": 0.20467034048790322,
"learning_rate": 9.64397448904362e-05,
"loss": 0.2926,
"num_input_tokens_seen": 202310368,
"step": 840
},
{
"epoch": 0.3714609286523216,
"grad_norm": 0.17904927666050605,
"learning_rate": 9.626178635247054e-05,
"loss": 0.2909,
"num_input_tokens_seen": 207359840,
"step": 861
},
{
"epoch": 0.38052095130237823,
"grad_norm": 0.248985929697072,
"learning_rate": 9.607966043649046e-05,
"loss": 0.2954,
"num_input_tokens_seen": 212395664,
"step": 882
},
{
"epoch": 0.3895809739524349,
"grad_norm": 0.17735947921966527,
"learning_rate": 9.589338354885629e-05,
"loss": 0.2912,
"num_input_tokens_seen": 217570640,
"step": 903
},
{
"epoch": 0.3986409966024915,
"grad_norm": 0.2653024856558906,
"learning_rate": 9.570297246985837e-05,
"loss": 0.2928,
"num_input_tokens_seen": 222629712,
"step": 924
},
{
"epoch": 0.40770101925254815,
"grad_norm": 0.21684716629964057,
"learning_rate": 9.550844435220539e-05,
"loss": 0.292,
"num_input_tokens_seen": 227565744,
"step": 945
},
{
"epoch": 0.41676104190260477,
"grad_norm": 0.19831973070913392,
"learning_rate": 9.530981671947923e-05,
"loss": 0.292,
"num_input_tokens_seen": 232655712,
"step": 966
},
{
"epoch": 0.4258210645526614,
"grad_norm": 0.2039591338730108,
"learning_rate": 9.510710746455636e-05,
"loss": 0.2959,
"num_input_tokens_seen": 237611056,
"step": 987
},
{
"epoch": 0.434881087202718,
"grad_norm": 0.20635917697106,
"learning_rate": 9.490033484799608e-05,
"loss": 0.2884,
"num_input_tokens_seen": 242693136,
"step": 1008
},
{
"epoch": 0.44394110985277463,
"grad_norm": 0.21178513822087988,
"learning_rate": 9.468951749639551e-05,
"loss": 0.2878,
"num_input_tokens_seen": 247677488,
"step": 1029
},
{
"epoch": 0.45300113250283125,
"grad_norm": 0.24393748334345636,
"learning_rate": 9.447467440071164e-05,
"loss": 0.2908,
"num_input_tokens_seen": 252770384,
"step": 1050
},
{
"epoch": 0.46206115515288787,
"grad_norm": 0.19345581725168481,
"learning_rate": 9.425582491455067e-05,
"loss": 0.2796,
"num_input_tokens_seen": 258009696,
"step": 1071
},
{
"epoch": 0.4711211778029445,
"grad_norm": 0.17550447436524724,
"learning_rate": 9.403298875242448e-05,
"loss": 0.2858,
"num_input_tokens_seen": 263147728,
"step": 1092
},
{
"epoch": 0.4801812004530011,
"grad_norm": 0.1873327110291133,
"learning_rate": 9.380618598797473e-05,
"loss": 0.2876,
"num_input_tokens_seen": 268184080,
"step": 1113
},
{
"epoch": 0.48924122310305773,
"grad_norm": 0.2326284547666313,
"learning_rate": 9.357543705216465e-05,
"loss": 0.2814,
"num_input_tokens_seen": 273442768,
"step": 1134
},
{
"epoch": 0.4983012457531144,
"grad_norm": 0.1606625392823979,
"learning_rate": 9.334076273143843e-05,
"loss": 0.2804,
"num_input_tokens_seen": 278640624,
"step": 1155
},
{
"epoch": 0.507361268403171,
"grad_norm": 0.174300515580659,
"learning_rate": 9.310218416584886e-05,
"loss": 0.2863,
"num_input_tokens_seen": 283769424,
"step": 1176
},
{
"epoch": 0.5164212910532276,
"grad_norm": 0.18523825539113345,
"learning_rate": 9.28597228471529e-05,
"loss": 0.2851,
"num_input_tokens_seen": 288866448,
"step": 1197
},
{
"epoch": 0.5254813137032842,
"grad_norm": 0.29288899462077944,
"learning_rate": 9.26134006168757e-05,
"loss": 0.2798,
"num_input_tokens_seen": 293944576,
"step": 1218
},
{
"epoch": 0.5345413363533409,
"grad_norm": 0.21677248351508627,
"learning_rate": 9.236323966434295e-05,
"loss": 0.2728,
"num_input_tokens_seen": 299090032,
"step": 1239
},
{
"epoch": 0.5436013590033975,
"grad_norm": 0.17425284816339717,
"learning_rate": 9.210926252468219e-05,
"loss": 0.2756,
"num_input_tokens_seen": 304016304,
"step": 1260
},
{
"epoch": 0.5526613816534541,
"grad_norm": 0.19722321188451059,
"learning_rate": 9.185149207679263e-05,
"loss": 0.2747,
"num_input_tokens_seen": 309084016,
"step": 1281
},
{
"epoch": 0.5617214043035108,
"grad_norm": 0.15927450699962342,
"learning_rate": 9.158995154128425e-05,
"loss": 0.2772,
"num_input_tokens_seen": 314201696,
"step": 1302
},
{
"epoch": 0.5707814269535674,
"grad_norm": 0.144121234319095,
"learning_rate": 9.132466447838597e-05,
"loss": 0.2785,
"num_input_tokens_seen": 319266256,
"step": 1323
},
{
"epoch": 0.579841449603624,
"grad_norm": 0.20014275331573014,
"learning_rate": 9.105565478582334e-05,
"loss": 0.2755,
"num_input_tokens_seen": 324468352,
"step": 1344
},
{
"epoch": 0.5889014722536806,
"grad_norm": 0.1763537829210326,
"learning_rate": 9.078294669666576e-05,
"loss": 0.2708,
"num_input_tokens_seen": 329566736,
"step": 1365
},
{
"epoch": 0.5979614949037373,
"grad_norm": 0.20344878470372835,
"learning_rate": 9.050656477714346e-05,
"loss": 0.2729,
"num_input_tokens_seen": 334661888,
"step": 1386
},
{
"epoch": 0.6070215175537939,
"grad_norm": 0.16201525012659232,
"learning_rate": 9.022653392443454e-05,
"loss": 0.2754,
"num_input_tokens_seen": 339784976,
"step": 1407
},
{
"epoch": 0.6160815402038505,
"grad_norm": 0.17086654725622255,
"learning_rate": 8.994287936442225e-05,
"loss": 0.2742,
"num_input_tokens_seen": 344776544,
"step": 1428
},
{
"epoch": 0.6251415628539071,
"grad_norm": 0.17370971449213884,
"learning_rate": 8.96556266494224e-05,
"loss": 0.2703,
"num_input_tokens_seen": 349731168,
"step": 1449
},
{
"epoch": 0.6342015855039638,
"grad_norm": 0.1759254965690454,
"learning_rate": 8.936480165588173e-05,
"loss": 0.2756,
"num_input_tokens_seen": 354694544,
"step": 1470
},
{
"epoch": 0.6432616081540203,
"grad_norm": 0.1884165927777844,
"learning_rate": 8.907043058204674e-05,
"loss": 0.2698,
"num_input_tokens_seen": 359871984,
"step": 1491
},
{
"epoch": 0.652321630804077,
"grad_norm": 0.18427306909901323,
"learning_rate": 8.877253994560382e-05,
"loss": 0.2716,
"num_input_tokens_seen": 364937440,
"step": 1512
},
{
"epoch": 0.6613816534541337,
"grad_norm": 0.20911376204072243,
"learning_rate": 8.847115658129039e-05,
"loss": 0.2682,
"num_input_tokens_seen": 369994848,
"step": 1533
},
{
"epoch": 0.6704416761041903,
"grad_norm": 0.20741042143522315,
"learning_rate": 8.816630763847755e-05,
"loss": 0.2695,
"num_input_tokens_seen": 374992544,
"step": 1554
},
{
"epoch": 0.6795016987542469,
"grad_norm": 0.1643283959081191,
"learning_rate": 8.785802057872446e-05,
"loss": 0.2706,
"num_input_tokens_seen": 380038624,
"step": 1575
},
{
"epoch": 0.6885617214043035,
"grad_norm": 0.20102992236744546,
"learning_rate": 8.754632317330447e-05,
"loss": 0.2704,
"num_input_tokens_seen": 385195792,
"step": 1596
},
{
"epoch": 0.6976217440543602,
"grad_norm": 0.18834807757879243,
"learning_rate": 8.723124350070347e-05,
"loss": 0.2707,
"num_input_tokens_seen": 390195296,
"step": 1617
},
{
"epoch": 0.7066817667044167,
"grad_norm": 0.15261075530832655,
"learning_rate": 8.691280994409043e-05,
"loss": 0.2653,
"num_input_tokens_seen": 395353440,
"step": 1638
},
{
"epoch": 0.7157417893544734,
"grad_norm": 0.20682506960801342,
"learning_rate": 8.659105118876068e-05,
"loss": 0.2649,
"num_input_tokens_seen": 400444080,
"step": 1659
},
{
"epoch": 0.72480181200453,
"grad_norm": 0.21733357068498219,
"learning_rate": 8.626599621955179e-05,
"loss": 0.2652,
"num_input_tokens_seen": 405492112,
"step": 1680
},
{
"epoch": 0.7338618346545867,
"grad_norm": 0.18286757761891814,
"learning_rate": 8.593767431823255e-05,
"loss": 0.2638,
"num_input_tokens_seen": 410467584,
"step": 1701
},
{
"epoch": 0.7429218573046432,
"grad_norm": 0.16764027437130122,
"learning_rate": 8.56061150608652e-05,
"loss": 0.2685,
"num_input_tokens_seen": 415550320,
"step": 1722
},
{
"epoch": 0.7519818799546999,
"grad_norm": 0.17921646972994934,
"learning_rate": 8.527134831514117e-05,
"loss": 0.2584,
"num_input_tokens_seen": 420503712,
"step": 1743
},
{
"epoch": 0.7610419026047565,
"grad_norm": 0.19062465197166928,
"learning_rate": 8.493340423769053e-05,
"loss": 0.2607,
"num_input_tokens_seen": 425602800,
"step": 1764
},
{
"epoch": 0.7701019252548131,
"grad_norm": 0.19476076523413036,
"learning_rate": 8.459231327136532e-05,
"loss": 0.2652,
"num_input_tokens_seen": 430546320,
"step": 1785
},
{
"epoch": 0.7791619479048698,
"grad_norm": 0.18756178702354967,
"learning_rate": 8.42481061424973e-05,
"loss": 0.2604,
"num_input_tokens_seen": 435625600,
"step": 1806
},
{
"epoch": 0.7882219705549264,
"grad_norm": 0.16871375853816825,
"learning_rate": 8.390081385812993e-05,
"loss": 0.2603,
"num_input_tokens_seen": 440695024,
"step": 1827
},
{
"epoch": 0.797281993204983,
"grad_norm": 0.1669594920851862,
"learning_rate": 8.355046770322528e-05,
"loss": 0.2576,
"num_input_tokens_seen": 445877360,
"step": 1848
},
{
"epoch": 0.8063420158550396,
"grad_norm": 0.20147791313721689,
"learning_rate": 8.319709923784573e-05,
"loss": 0.2622,
"num_input_tokens_seen": 451021040,
"step": 1869
},
{
"epoch": 0.8154020385050963,
"grad_norm": 0.17108850294819108,
"learning_rate": 8.284074029431099e-05,
"loss": 0.2587,
"num_input_tokens_seen": 456101872,
"step": 1890
},
{
"epoch": 0.8244620611551529,
"grad_norm": 0.18516681989871883,
"learning_rate": 8.248142297433057e-05,
"loss": 0.2575,
"num_input_tokens_seen": 461365920,
"step": 1911
},
{
"epoch": 0.8335220838052095,
"grad_norm": 0.20285356102658042,
"learning_rate": 8.211917964611196e-05,
"loss": 0.2573,
"num_input_tokens_seen": 466466096,
"step": 1932
},
{
"epoch": 0.8425821064552661,
"grad_norm": 0.207923488217522,
"learning_rate": 8.175404294144482e-05,
"loss": 0.26,
"num_input_tokens_seen": 471541104,
"step": 1953
},
{
"epoch": 0.8516421291053228,
"grad_norm": 0.19850908270608497,
"learning_rate": 8.138604575276143e-05,
"loss": 0.2571,
"num_input_tokens_seen": 476646096,
"step": 1974
},
{
"epoch": 0.8607021517553793,
"grad_norm": 0.1821198820367163,
"learning_rate": 8.10152212301737e-05,
"loss": 0.251,
"num_input_tokens_seen": 481695200,
"step": 1995
},
{
"epoch": 0.869762174405436,
"grad_norm": 0.1623421619904062,
"learning_rate": 8.064160277848682e-05,
"loss": 0.2614,
"num_input_tokens_seen": 486706656,
"step": 2016
},
{
"epoch": 0.8788221970554927,
"grad_norm": 0.1774308248272562,
"learning_rate": 8.026522405419023e-05,
"loss": 0.2528,
"num_input_tokens_seen": 491943424,
"step": 2037
},
{
"epoch": 0.8878822197055493,
"grad_norm": 0.21003241654174584,
"learning_rate": 7.988611896242559e-05,
"loss": 0.2571,
"num_input_tokens_seen": 496925888,
"step": 2058
},
{
"epoch": 0.8969422423556059,
"grad_norm": 0.20014809740395048,
"learning_rate": 7.950432165393259e-05,
"loss": 0.2547,
"num_input_tokens_seen": 502065216,
"step": 2079
},
{
"epoch": 0.9060022650056625,
"grad_norm": 0.17154274701388803,
"learning_rate": 7.911986652197262e-05,
"loss": 0.2538,
"num_input_tokens_seen": 507089616,
"step": 2100
},
{
"epoch": 0.9150622876557192,
"grad_norm": 0.17929198920009218,
"learning_rate": 7.873278819923048e-05,
"loss": 0.2551,
"num_input_tokens_seen": 512060336,
"step": 2121
},
{
"epoch": 0.9241223103057757,
"grad_norm": 0.16398954046091754,
"learning_rate": 7.834312155469456e-05,
"loss": 0.2515,
"num_input_tokens_seen": 517133680,
"step": 2142
},
{
"epoch": 0.9331823329558324,
"grad_norm": 0.17469600332499013,
"learning_rate": 7.79509016905158e-05,
"loss": 0.2526,
"num_input_tokens_seen": 522229616,
"step": 2163
},
{
"epoch": 0.942242355605889,
"grad_norm": 0.15900929966723312,
"learning_rate": 7.755616393884561e-05,
"loss": 0.2482,
"num_input_tokens_seen": 527368864,
"step": 2184
},
{
"epoch": 0.9513023782559457,
"grad_norm": 0.18254122861853214,
"learning_rate": 7.715894385865299e-05,
"loss": 0.2516,
"num_input_tokens_seen": 532499712,
"step": 2205
},
{
"epoch": 0.9603624009060022,
"grad_norm": 0.19001641667974054,
"learning_rate": 7.675927723252134e-05,
"loss": 0.2493,
"num_input_tokens_seen": 537438224,
"step": 2226
},
{
"epoch": 0.9694224235560589,
"grad_norm": 0.16454856189702688,
"learning_rate": 7.635720006342512e-05,
"loss": 0.2465,
"num_input_tokens_seen": 542603472,
"step": 2247
},
{
"epoch": 0.9784824462061155,
"grad_norm": 0.18435585737458549,
"learning_rate": 7.595274857148652e-05,
"loss": 0.2486,
"num_input_tokens_seen": 547622688,
"step": 2268
},
{
"epoch": 0.9875424688561721,
"grad_norm": 0.16595693704140477,
"learning_rate": 7.554595919071268e-05,
"loss": 0.2472,
"num_input_tokens_seen": 552751232,
"step": 2289
},
{
"epoch": 0.9966024915062288,
"grad_norm": 0.16765543588823353,
"learning_rate": 7.513686856571368e-05,
"loss": 0.2471,
"num_input_tokens_seen": 557786736,
"step": 2310
},
{
"epoch": 1.0060400151000377,
"grad_norm": 0.17133389174596797,
"learning_rate": 7.472551354840145e-05,
"loss": 0.2361,
"num_input_tokens_seen": 562993712,
"step": 2331
},
{
"epoch": 1.0151000377500943,
"grad_norm": 0.18279159983012744,
"learning_rate": 7.431193119467008e-05,
"loss": 0.217,
"num_input_tokens_seen": 568021744,
"step": 2352
},
{
"epoch": 1.024160060400151,
"grad_norm": 0.266530218109695,
"learning_rate": 7.389615876105774e-05,
"loss": 0.2145,
"num_input_tokens_seen": 572956608,
"step": 2373
},
{
"epoch": 1.0332200830502076,
"grad_norm": 0.178808088687151,
"learning_rate": 7.347823370139042e-05,
"loss": 0.2179,
"num_input_tokens_seen": 577973792,
"step": 2394
},
{
"epoch": 1.0422801057002642,
"grad_norm": 0.15535901011827938,
"learning_rate": 7.30581936634082e-05,
"loss": 0.2098,
"num_input_tokens_seen": 582948368,
"step": 2415
},
{
"epoch": 1.051340128350321,
"grad_norm": 0.1879255010231289,
"learning_rate": 7.263607648537364e-05,
"loss": 0.2174,
"num_input_tokens_seen": 587973936,
"step": 2436
},
{
"epoch": 1.0604001510003775,
"grad_norm": 0.18116579180636055,
"learning_rate": 7.221192019266332e-05,
"loss": 0.2187,
"num_input_tokens_seen": 593048624,
"step": 2457
},
{
"epoch": 1.069460173650434,
"grad_norm": 0.16067683672474237,
"learning_rate": 7.178576299434238e-05,
"loss": 0.2162,
"num_input_tokens_seen": 598171840,
"step": 2478
},
{
"epoch": 1.0785201963004907,
"grad_norm": 0.16890347919356866,
"learning_rate": 7.135764327972261e-05,
"loss": 0.2202,
"num_input_tokens_seen": 603168000,
"step": 2499
},
{
"epoch": 1.0875802189505475,
"grad_norm": 0.171528446871021,
"learning_rate": 7.092759961490415e-05,
"loss": 0.2237,
"num_input_tokens_seen": 608280544,
"step": 2520
},
{
"epoch": 1.096640241600604,
"grad_norm": 0.16674379731500746,
"learning_rate": 7.049567073930143e-05,
"loss": 0.2199,
"num_input_tokens_seen": 613215280,
"step": 2541
},
{
"epoch": 1.1057002642506606,
"grad_norm": 0.18816045716369076,
"learning_rate": 7.006189556215345e-05,
"loss": 0.2189,
"num_input_tokens_seen": 618261984,
"step": 2562
},
{
"epoch": 1.1147602869007172,
"grad_norm": 0.16658411013718444,
"learning_rate": 6.962631315901861e-05,
"loss": 0.2163,
"num_input_tokens_seen": 623492320,
"step": 2583
},
{
"epoch": 1.123820309550774,
"grad_norm": 0.19772642288828662,
"learning_rate": 6.918896276825485e-05,
"loss": 0.2157,
"num_input_tokens_seen": 628563152,
"step": 2604
},
{
"epoch": 1.1328803322008305,
"grad_norm": 0.17075347503361357,
"learning_rate": 6.874988378748483e-05,
"loss": 0.2141,
"num_input_tokens_seen": 633639472,
"step": 2625
},
{
"epoch": 1.141940354850887,
"grad_norm": 0.14444472527009938,
"learning_rate": 6.830911577004698e-05,
"loss": 0.2185,
"num_input_tokens_seen": 638639648,
"step": 2646
},
{
"epoch": 1.1510003775009436,
"grad_norm": 0.17948926082904845,
"learning_rate": 6.786669842143236e-05,
"loss": 0.2125,
"num_input_tokens_seen": 643743632,
"step": 2667
},
{
"epoch": 1.1600604001510004,
"grad_norm": 0.1845862453656852,
"learning_rate": 6.742267159570795e-05,
"loss": 0.2138,
"num_input_tokens_seen": 648823584,
"step": 2688
},
{
"epoch": 1.169120422801057,
"grad_norm": 0.1611490189496428,
"learning_rate": 6.697707529192648e-05,
"loss": 0.2152,
"num_input_tokens_seen": 653949232,
"step": 2709
},
{
"epoch": 1.1781804454511136,
"grad_norm": 0.19293414419678218,
"learning_rate": 6.652994965052319e-05,
"loss": 0.2125,
"num_input_tokens_seen": 658996016,
"step": 2730
},
{
"epoch": 1.1872404681011703,
"grad_norm": 0.1887515649690068,
"learning_rate": 6.608133494969994e-05,
"loss": 0.2123,
"num_input_tokens_seen": 664102304,
"step": 2751
},
{
"epoch": 1.196300490751227,
"grad_norm": 0.15668371616625942,
"learning_rate": 6.563127160179671e-05,
"loss": 0.2101,
"num_input_tokens_seen": 669123584,
"step": 2772
},
{
"epoch": 1.2053605134012835,
"grad_norm": 0.17871386948979864,
"learning_rate": 6.517980014965139e-05,
"loss": 0.209,
"num_input_tokens_seen": 674256592,
"step": 2793
},
{
"epoch": 1.21442053605134,
"grad_norm": 0.16852785549912133,
"learning_rate": 6.472696126294732e-05,
"loss": 0.2122,
"num_input_tokens_seen": 679248208,
"step": 2814
},
{
"epoch": 1.2234805587013968,
"grad_norm": 0.1758847676430736,
"learning_rate": 6.427279573454985e-05,
"loss": 0.2093,
"num_input_tokens_seen": 684325632,
"step": 2835
},
{
"epoch": 1.2325405813514534,
"grad_norm": 0.19298009720432585,
"learning_rate": 6.381734447683152e-05,
"loss": 0.2114,
"num_input_tokens_seen": 689336736,
"step": 2856
},
{
"epoch": 1.24160060400151,
"grad_norm": 0.16439303725722001,
"learning_rate": 6.33606485179866e-05,
"loss": 0.2111,
"num_input_tokens_seen": 694382688,
"step": 2877
},
{
"epoch": 1.2506606266515665,
"grad_norm": 0.18140167193790194,
"learning_rate": 6.290274899833517e-05,
"loss": 0.2086,
"num_input_tokens_seen": 699371792,
"step": 2898
},
{
"epoch": 1.2597206493016233,
"grad_norm": 0.17151657072780352,
"learning_rate": 6.244368716661713e-05,
"loss": 0.2095,
"num_input_tokens_seen": 704404624,
"step": 2919
},
{
"epoch": 1.2687806719516799,
"grad_norm": 0.2052334824788225,
"learning_rate": 6.198350437627632e-05,
"loss": 0.2083,
"num_input_tokens_seen": 709451392,
"step": 2940
},
{
"epoch": 1.2778406946017364,
"grad_norm": 0.18426322385396474,
"learning_rate": 6.152224208173533e-05,
"loss": 0.2088,
"num_input_tokens_seen": 714486848,
"step": 2961
},
{
"epoch": 1.2869007172517932,
"grad_norm": 0.1949416856665576,
"learning_rate": 6.10599418346613e-05,
"loss": 0.2118,
"num_input_tokens_seen": 719556448,
"step": 2982
},
{
"epoch": 1.2959607399018498,
"grad_norm": 0.16224663869829734,
"learning_rate": 6.059664528022266e-05,
"loss": 0.2058,
"num_input_tokens_seen": 724625472,
"step": 3003
},
{
"epoch": 1.3050207625519064,
"grad_norm": 0.1742996675080445,
"learning_rate": 6.0132394153337755e-05,
"loss": 0.2065,
"num_input_tokens_seen": 729794320,
"step": 3024
},
{
"epoch": 1.3140807852019631,
"grad_norm": 0.17806291944392397,
"learning_rate": 5.9667230274915174e-05,
"loss": 0.207,
"num_input_tokens_seen": 734753392,
"step": 3045
},
{
"epoch": 1.3231408078520197,
"grad_norm": 0.18436964692073765,
"learning_rate": 5.920119554808651e-05,
"loss": 0.2049,
"num_input_tokens_seen": 739827088,
"step": 3066
},
{
"epoch": 1.3322008305020763,
"grad_norm": 0.16939954583438047,
"learning_rate": 5.873433195443152e-05,
"loss": 0.208,
"num_input_tokens_seen": 744847184,
"step": 3087
},
{
"epoch": 1.3412608531521328,
"grad_norm": 0.1635889048824763,
"learning_rate": 5.82666815501964e-05,
"loss": 0.2047,
"num_input_tokens_seen": 749874880,
"step": 3108
},
{
"epoch": 1.3503208758021894,
"grad_norm": 0.16527453811089068,
"learning_rate": 5.779828646250521e-05,
"loss": 0.2022,
"num_input_tokens_seen": 754848400,
"step": 3129
},
{
"epoch": 1.3593808984522462,
"grad_norm": 0.18777829379599956,
"learning_rate": 5.7329188885565e-05,
"loss": 0.2073,
"num_input_tokens_seen": 759913728,
"step": 3150
},
{
"epoch": 1.3684409211023028,
"grad_norm": 0.15155859962967053,
"learning_rate": 5.6859431076864755e-05,
"loss": 0.2056,
"num_input_tokens_seen": 765009632,
"step": 3171
},
{
"epoch": 1.3775009437523593,
"grad_norm": 0.154719416013354,
"learning_rate": 5.6389055353368826e-05,
"loss": 0.2056,
"num_input_tokens_seen": 770016704,
"step": 3192
},
{
"epoch": 1.386560966402416,
"grad_norm": 0.16028731910302912,
"learning_rate": 5.591810408770493e-05,
"loss": 0.2037,
"num_input_tokens_seen": 775197264,
"step": 3213
},
{
"epoch": 1.3956209890524727,
"grad_norm": 0.15645057759509218,
"learning_rate": 5.544661970434696e-05,
"loss": 0.2042,
"num_input_tokens_seen": 780209328,
"step": 3234
},
{
"epoch": 1.4046810117025292,
"grad_norm": 0.17948486028711083,
"learning_rate": 5.497464467579351e-05,
"loss": 0.2011,
"num_input_tokens_seen": 785402112,
"step": 3255
},
{
"epoch": 1.4137410343525858,
"grad_norm": 0.16643670433003308,
"learning_rate": 5.450222151874166e-05,
"loss": 0.2015,
"num_input_tokens_seen": 790429216,
"step": 3276
},
{
"epoch": 1.4228010570026426,
"grad_norm": 0.17345996800344896,
"learning_rate": 5.402939279025705e-05,
"loss": 0.2005,
"num_input_tokens_seen": 795543264,
"step": 3297
},
{
"epoch": 1.4318610796526992,
"grad_norm": 0.1663960297870033,
"learning_rate": 5.355620108394018e-05,
"loss": 0.2052,
"num_input_tokens_seen": 800533200,
"step": 3318
},
{
"epoch": 1.4409211023027557,
"grad_norm": 0.15958882963062815,
"learning_rate": 5.308268902608958e-05,
"loss": 0.2042,
"num_input_tokens_seen": 805542720,
"step": 3339
},
{
"epoch": 1.4499811249528123,
"grad_norm": 0.17053093118312482,
"learning_rate": 5.2608899271861765e-05,
"loss": 0.1984,
"num_input_tokens_seen": 810549376,
"step": 3360
},
{
"epoch": 1.459041147602869,
"grad_norm": 0.1731330043830458,
"learning_rate": 5.213487450142892e-05,
"loss": 0.2038,
"num_input_tokens_seen": 815599232,
"step": 3381
},
{
"epoch": 1.4681011702529256,
"grad_norm": 0.17941197802062514,
"learning_rate": 5.166065741613402e-05,
"loss": 0.2012,
"num_input_tokens_seen": 820700608,
"step": 3402
},
{
"epoch": 1.4771611929029822,
"grad_norm": 0.1844938407002505,
"learning_rate": 5.118629073464424e-05,
"loss": 0.1987,
"num_input_tokens_seen": 825686176,
"step": 3423
},
{
"epoch": 1.486221215553039,
"grad_norm": 0.1748567417166297,
"learning_rate": 5.071181718910283e-05,
"loss": 0.1986,
"num_input_tokens_seen": 830730000,
"step": 3444
},
{
"epoch": 1.4952812382030956,
"grad_norm": 0.15694569688029672,
"learning_rate": 5.023727952127954e-05,
"loss": 0.1987,
"num_input_tokens_seen": 835738032,
"step": 3465
},
{
"epoch": 1.5043412608531521,
"grad_norm": 0.18575993893540607,
"learning_rate": 4.976272047872046e-05,
"loss": 0.1952,
"num_input_tokens_seen": 840806528,
"step": 3486
},
{
"epoch": 1.513401283503209,
"grad_norm": 0.16316391964141339,
"learning_rate": 4.9288182810897184e-05,
"loss": 0.1957,
"num_input_tokens_seen": 845877808,
"step": 3507
},
{
"epoch": 1.5224613061532652,
"grad_norm": 0.1809977532876625,
"learning_rate": 4.8813709265355766e-05,
"loss": 0.1957,
"num_input_tokens_seen": 851002432,
"step": 3528
},
{
"epoch": 1.531521328803322,
"grad_norm": 0.15896204329046001,
"learning_rate": 4.8339342583866005e-05,
"loss": 0.197,
"num_input_tokens_seen": 856037440,
"step": 3549
},
{
"epoch": 1.5405813514533786,
"grad_norm": 0.1848696286617871,
"learning_rate": 4.7865125498571086e-05,
"loss": 0.1957,
"num_input_tokens_seen": 860972624,
"step": 3570
},
{
"epoch": 1.5496413741034352,
"grad_norm": 0.16411859849940666,
"learning_rate": 4.739110072813823e-05,
"loss": 0.1926,
"num_input_tokens_seen": 866078128,
"step": 3591
},
{
"epoch": 1.558701396753492,
"grad_norm": 0.15293153546751434,
"learning_rate": 4.6917310973910425e-05,
"loss": 0.1934,
"num_input_tokens_seen": 871290720,
"step": 3612
},
{
"epoch": 1.5677614194035485,
"grad_norm": 0.18580264173261662,
"learning_rate": 4.6443798916059836e-05,
"loss": 0.1961,
"num_input_tokens_seen": 876353920,
"step": 3633
},
{
"epoch": 1.576821442053605,
"grad_norm": 0.16117670144515006,
"learning_rate": 4.597060720974298e-05,
"loss": 0.1902,
"num_input_tokens_seen": 881469536,
"step": 3654
},
{
"epoch": 1.5858814647036619,
"grad_norm": 0.1821844142116438,
"learning_rate": 4.549777848125833e-05,
"loss": 0.1971,
"num_input_tokens_seen": 886532048,
"step": 3675
},
{
"epoch": 1.5949414873537184,
"grad_norm": 0.188981157327872,
"learning_rate": 4.50253553242065e-05,
"loss": 0.1952,
"num_input_tokens_seen": 891565152,
"step": 3696
},
{
"epoch": 1.604001510003775,
"grad_norm": 0.1663775536476532,
"learning_rate": 4.4553380295653053e-05,
"loss": 0.1908,
"num_input_tokens_seen": 896603568,
"step": 3717
},
{
"epoch": 1.6130615326538318,
"grad_norm": 0.16695660636413406,
"learning_rate": 4.40818959122951e-05,
"loss": 0.1945,
"num_input_tokens_seen": 901703264,
"step": 3738
},
{
"epoch": 1.6221215553038881,
"grad_norm": 0.18003132042487852,
"learning_rate": 4.361094464663118e-05,
"loss": 0.1911,
"num_input_tokens_seen": 906846256,
"step": 3759
},
{
"epoch": 1.631181577953945,
"grad_norm": 0.16377146934729214,
"learning_rate": 4.3140568923135264e-05,
"loss": 0.193,
"num_input_tokens_seen": 911964272,
"step": 3780
},
{
"epoch": 1.6402416006040015,
"grad_norm": 0.1711801561805431,
"learning_rate": 4.267081111443501e-05,
"loss": 0.1898,
"num_input_tokens_seen": 917101840,
"step": 3801
},
{
"epoch": 1.649301623254058,
"grad_norm": 0.1743609898038798,
"learning_rate": 4.22017135374948e-05,
"loss": 0.1852,
"num_input_tokens_seen": 922205664,
"step": 3822
},
{
"epoch": 1.6583616459041148,
"grad_norm": 0.17938627926996303,
"learning_rate": 4.1733318449803624e-05,
"loss": 0.1863,
"num_input_tokens_seen": 927302560,
"step": 3843
},
{
"epoch": 1.6674216685541714,
"grad_norm": 0.16947333759434738,
"learning_rate": 4.1265668045568495e-05,
"loss": 0.1882,
"num_input_tokens_seen": 932325424,
"step": 3864
},
{
"epoch": 1.676481691204228,
"grad_norm": 0.16639553173104588,
"learning_rate": 4.079880445191351e-05,
"loss": 0.1893,
"num_input_tokens_seen": 937438464,
"step": 3885
},
{
"epoch": 1.6855417138542848,
"grad_norm": 0.14651023615133163,
"learning_rate": 4.033276972508484e-05,
"loss": 0.1885,
"num_input_tokens_seen": 942617840,
"step": 3906
},
{
"epoch": 1.6946017365043413,
"grad_norm": 0.17812367097504705,
"learning_rate": 3.9867605846662256e-05,
"loss": 0.1883,
"num_input_tokens_seen": 947823200,
"step": 3927
},
{
"epoch": 1.7036617591543979,
"grad_norm": 0.1872194452488721,
"learning_rate": 3.940335471977734e-05,
"loss": 0.1871,
"num_input_tokens_seen": 952872784,
"step": 3948
},
{
"epoch": 1.7127217818044547,
"grad_norm": 0.1643081487678093,
"learning_rate": 3.89400581653387e-05,
"loss": 0.1853,
"num_input_tokens_seen": 957908608,
"step": 3969
},
{
"epoch": 1.721781804454511,
"grad_norm": 0.1753318656372752,
"learning_rate": 3.847775791826468e-05,
"loss": 0.1862,
"num_input_tokens_seen": 962972208,
"step": 3990
},
{
"epoch": 1.7308418271045678,
"grad_norm": 0.15851578473823177,
"learning_rate": 3.801649562372371e-05,
"loss": 0.1913,
"num_input_tokens_seen": 968020256,
"step": 4011
},
{
"epoch": 1.7399018497546244,
"grad_norm": 0.1812244935434252,
"learning_rate": 3.755631283338287e-05,
"loss": 0.1908,
"num_input_tokens_seen": 973116912,
"step": 4032
},
{
"epoch": 1.748961872404681,
"grad_norm": 0.16265272189557067,
"learning_rate": 3.709725100166482e-05,
"loss": 0.1839,
"num_input_tokens_seen": 978276224,
"step": 4053
},
{
"epoch": 1.7580218950547377,
"grad_norm": 0.16695166226650535,
"learning_rate": 3.663935148201341e-05,
"loss": 0.1869,
"num_input_tokens_seen": 983499184,
"step": 4074
},
{
"epoch": 1.7670819177047943,
"grad_norm": 0.17046617487207735,
"learning_rate": 3.618265552316849e-05,
"loss": 0.1884,
"num_input_tokens_seen": 988511216,
"step": 4095
},
{
"epoch": 1.7761419403548508,
"grad_norm": 0.16853521697476523,
"learning_rate": 3.572720426545017e-05,
"loss": 0.1863,
"num_input_tokens_seen": 993542272,
"step": 4116
},
{
"epoch": 1.7852019630049076,
"grad_norm": 0.16196055724715774,
"learning_rate": 3.5273038737052675e-05,
"loss": 0.1884,
"num_input_tokens_seen": 998561584,
"step": 4137
},
{
"epoch": 1.794261985654964,
"grad_norm": 0.17704958458091835,
"learning_rate": 3.482019985034861e-05,
"loss": 0.1815,
"num_input_tokens_seen": 1003535696,
"step": 4158
},
{
"epoch": 1.8033220083050208,
"grad_norm": 0.17212954264417213,
"learning_rate": 3.43687283982033e-05,
"loss": 0.1798,
"num_input_tokens_seen": 1008610432,
"step": 4179
},
{
"epoch": 1.8123820309550775,
"grad_norm": 0.1642508897074481,
"learning_rate": 3.391866505030009e-05,
"loss": 0.1797,
"num_input_tokens_seen": 1013577840,
"step": 4200
},
{
"epoch": 1.821442053605134,
"grad_norm": 0.1895193964135349,
"learning_rate": 3.347005034947681e-05,
"loss": 0.1773,
"num_input_tokens_seen": 1018549888,
"step": 4221
},
{
"epoch": 1.8305020762551907,
"grad_norm": 0.18935672208270557,
"learning_rate": 3.3022924708073524e-05,
"loss": 0.1828,
"num_input_tokens_seen": 1023498368,
"step": 4242
},
{
"epoch": 1.8395620989052472,
"grad_norm": 0.15473627402095172,
"learning_rate": 3.257732840429206e-05,
"loss": 0.18,
"num_input_tokens_seen": 1028542992,
"step": 4263
},
{
"epoch": 1.8486221215553038,
"grad_norm": 0.17782850850732204,
"learning_rate": 3.2133301578567646e-05,
"loss": 0.1825,
"num_input_tokens_seen": 1033574288,
"step": 4284
},
{
"epoch": 1.8576821442053606,
"grad_norm": 0.17879475744218412,
"learning_rate": 3.169088422995304e-05,
"loss": 0.1776,
"num_input_tokens_seen": 1038606208,
"step": 4305
},
{
"epoch": 1.8667421668554172,
"grad_norm": 0.16166293718253705,
"learning_rate": 3.125011621251516e-05,
"loss": 0.1768,
"num_input_tokens_seen": 1043770704,
"step": 4326
},
{
"epoch": 1.8758021895054737,
"grad_norm": 0.1607230134601091,
"learning_rate": 3.081103723174515e-05,
"loss": 0.1778,
"num_input_tokens_seen": 1048829664,
"step": 4347
},
{
"epoch": 1.8848622121555305,
"grad_norm": 0.159447656379203,
"learning_rate": 3.0373686840981397e-05,
"loss": 0.1788,
"num_input_tokens_seen": 1053950224,
"step": 4368
},
{
"epoch": 1.8939222348055869,
"grad_norm": 0.1674766446494019,
"learning_rate": 2.9938104437846572e-05,
"loss": 0.176,
"num_input_tokens_seen": 1059119888,
"step": 4389
},
{
"epoch": 1.9029822574556436,
"grad_norm": 0.17753675611302996,
"learning_rate": 2.950432926069857e-05,
"loss": 0.1783,
"num_input_tokens_seen": 1064177088,
"step": 4410
},
{
"epoch": 1.9120422801057002,
"grad_norm": 0.17087252328331373,
"learning_rate": 2.9072400385095865e-05,
"loss": 0.178,
"num_input_tokens_seen": 1069200928,
"step": 4431
},
{
"epoch": 1.9211023027557568,
"grad_norm": 0.16133227423173738,
"learning_rate": 2.864235672027741e-05,
"loss": 0.1759,
"num_input_tokens_seen": 1074313840,
"step": 4452
},
{
"epoch": 1.9301623254058136,
"grad_norm": 0.1865580464555286,
"learning_rate": 2.8214237005657627e-05,
"loss": 0.1769,
"num_input_tokens_seen": 1079348080,
"step": 4473
},
{
"epoch": 1.9392223480558701,
"grad_norm": 0.17483638643553473,
"learning_rate": 2.7788079807336692e-05,
"loss": 0.1761,
"num_input_tokens_seen": 1084415072,
"step": 4494
},
{
"epoch": 1.9482823707059267,
"grad_norm": 0.16127203478332483,
"learning_rate": 2.7363923514626367e-05,
"loss": 0.1762,
"num_input_tokens_seen": 1089576528,
"step": 4515
},
{
"epoch": 1.9573423933559835,
"grad_norm": 0.1818665955450248,
"learning_rate": 2.6941806336591808e-05,
"loss": 0.1715,
"num_input_tokens_seen": 1094741664,
"step": 4536
},
{
"epoch": 1.96640241600604,
"grad_norm": 0.16510174569454042,
"learning_rate": 2.6521766298609584e-05,
"loss": 0.1728,
"num_input_tokens_seen": 1099708896,
"step": 4557
},
{
"epoch": 1.9754624386560966,
"grad_norm": 0.17393602608748607,
"learning_rate": 2.610384123894229e-05,
"loss": 0.175,
"num_input_tokens_seen": 1104824512,
"step": 4578
},
{
"epoch": 1.9845224613061534,
"grad_norm": 0.18901915034549496,
"learning_rate": 2.568806880532991e-05,
"loss": 0.1736,
"num_input_tokens_seen": 1109954160,
"step": 4599
},
{
"epoch": 1.9935824839562097,
"grad_norm": 0.19336693087348367,
"learning_rate": 2.5274486451598565e-05,
"loss": 0.1704,
"num_input_tokens_seen": 1115130992,
"step": 4620
},
{
"epoch": 2.003020007550019,
"grad_norm": 0.192558427240515,
"learning_rate": 2.4863131434286342e-05,
"loss": 0.1548,
"num_input_tokens_seen": 1120294784,
"step": 4641
},
{
"epoch": 2.0120800302000754,
"grad_norm": 0.19360993518356076,
"learning_rate": 2.4454040809287342e-05,
"loss": 0.1188,
"num_input_tokens_seen": 1125375728,
"step": 4662
},
{
"epoch": 2.021140052850132,
"grad_norm": 0.19346692314512148,
"learning_rate": 2.4047251428513485e-05,
"loss": 0.1176,
"num_input_tokens_seen": 1130663488,
"step": 4683
},
{
"epoch": 2.0302000755001886,
"grad_norm": 0.1915500646603155,
"learning_rate": 2.364279993657487e-05,
"loss": 0.1166,
"num_input_tokens_seen": 1135729856,
"step": 4704
},
{
"epoch": 2.0392600981502453,
"grad_norm": 0.21320689744431512,
"learning_rate": 2.3240722767478657e-05,
"loss": 0.1129,
"num_input_tokens_seen": 1140728768,
"step": 4725
},
{
"epoch": 2.048320120800302,
"grad_norm": 0.20002232427856995,
"learning_rate": 2.2841056141347038e-05,
"loss": 0.1122,
"num_input_tokens_seen": 1145810672,
"step": 4746
},
{
"epoch": 2.0573801434503585,
"grad_norm": 0.21228559927967805,
"learning_rate": 2.2443836061154415e-05,
"loss": 0.1145,
"num_input_tokens_seen": 1150862064,
"step": 4767
},
{
"epoch": 2.0664401661004153,
"grad_norm": 0.19792768108065947,
"learning_rate": 2.2049098309484195e-05,
"loss": 0.1153,
"num_input_tokens_seen": 1155954544,
"step": 4788
},
{
"epoch": 2.075500188750472,
"grad_norm": 0.21247296887779493,
"learning_rate": 2.1656878445305447e-05,
"loss": 0.1152,
"num_input_tokens_seen": 1161054256,
"step": 4809
},
{
"epoch": 2.0845602114005284,
"grad_norm": 0.19109010163603735,
"learning_rate": 2.1267211800769528e-05,
"loss": 0.1148,
"num_input_tokens_seen": 1166056688,
"step": 4830
},
{
"epoch": 2.093620234050585,
"grad_norm": 0.19679782828606215,
"learning_rate": 2.088013347802738e-05,
"loss": 0.1119,
"num_input_tokens_seen": 1171231104,
"step": 4851
},
{
"epoch": 2.102680256700642,
"grad_norm": 0.2128224999872872,
"learning_rate": 2.0495678346067414e-05,
"loss": 0.1101,
"num_input_tokens_seen": 1176284976,
"step": 4872
},
{
"epoch": 2.1117402793506983,
"grad_norm": 0.2123206811047115,
"learning_rate": 2.011388103757442e-05,
"loss": 0.1139,
"num_input_tokens_seen": 1181400944,
"step": 4893
},
{
"epoch": 2.120800302000755,
"grad_norm": 0.2071017368245751,
"learning_rate": 1.973477594580977e-05,
"loss": 0.1116,
"num_input_tokens_seen": 1186527776,
"step": 4914
},
{
"epoch": 2.1298603246508114,
"grad_norm": 0.17323287993849096,
"learning_rate": 1.9358397221513176e-05,
"loss": 0.112,
"num_input_tokens_seen": 1191661680,
"step": 4935
},
{
"epoch": 2.138920347300868,
"grad_norm": 0.20213151950682676,
"learning_rate": 1.8984778769826316e-05,
"loss": 0.1106,
"num_input_tokens_seen": 1196759648,
"step": 4956
},
{
"epoch": 2.147980369950925,
"grad_norm": 0.19700292148625387,
"learning_rate": 1.8613954247238586e-05,
"loss": 0.1124,
"num_input_tokens_seen": 1201857104,
"step": 4977
},
{
"epoch": 2.1570403926009813,
"grad_norm": 0.21527000496492768,
"learning_rate": 1.82459570585552e-05,
"loss": 0.1136,
"num_input_tokens_seen": 1206927520,
"step": 4998
},
{
"epoch": 2.157903251900987,
"eval_loss": 0.19485081732273102,
"eval_runtime": 529.4687,
"eval_samples_per_second": 17.331,
"eval_steps_per_second": 1.084,
"num_input_tokens_seen": 1207385424,
"step": 5000
},
{
"epoch": 2.166100415251038,
"grad_norm": 0.2275158963594303,
"learning_rate": 1.7880820353888056e-05,
"loss": 0.1102,
"num_input_tokens_seen": 1211875824,
"step": 5019
},
{
"epoch": 2.175160437901095,
"grad_norm": 0.20450931488489404,
"learning_rate": 1.751857702566944e-05,
"loss": 0.113,
"num_input_tokens_seen": 1216954688,
"step": 5040
},
{
"epoch": 2.1842204605511513,
"grad_norm": 0.21173943893990088,
"learning_rate": 1.7159259705689e-05,
"loss": 0.1104,
"num_input_tokens_seen": 1221976560,
"step": 5061
},
{
"epoch": 2.193280483201208,
"grad_norm": 0.20717537063008165,
"learning_rate": 1.6802900762154267e-05,
"loss": 0.1152,
"num_input_tokens_seen": 1226975776,
"step": 5082
},
{
"epoch": 2.2023405058512644,
"grad_norm": 0.19831781893791461,
"learning_rate": 1.644953229677474e-05,
"loss": 0.1097,
"num_input_tokens_seen": 1231998784,
"step": 5103
},
{
"epoch": 2.211400528501321,
"grad_norm": 0.18492554370023317,
"learning_rate": 1.609918614187009e-05,
"loss": 0.1111,
"num_input_tokens_seen": 1236990864,
"step": 5124
},
{
"epoch": 2.220460551151378,
"grad_norm": 0.20016085842409992,
"learning_rate": 1.575189385750271e-05,
"loss": 0.1104,
"num_input_tokens_seen": 1242051280,
"step": 5145
},
{
"epoch": 2.2295205738014343,
"grad_norm": 0.2070887723839001,
"learning_rate": 1.540768672863468e-05,
"loss": 0.1075,
"num_input_tokens_seen": 1247127040,
"step": 5166
},
{
"epoch": 2.238580596451491,
"grad_norm": 0.1925213910719394,
"learning_rate": 1.5066595762309477e-05,
"loss": 0.1093,
"num_input_tokens_seen": 1252158672,
"step": 5187
},
{
"epoch": 2.247640619101548,
"grad_norm": 0.205831347337121,
"learning_rate": 1.4728651684858834e-05,
"loss": 0.1126,
"num_input_tokens_seen": 1257321184,
"step": 5208
},
{
"epoch": 2.2567006417516042,
"grad_norm": 0.19926488557298117,
"learning_rate": 1.4393884939134833e-05,
"loss": 0.1064,
"num_input_tokens_seen": 1262315984,
"step": 5229
},
{
"epoch": 2.265760664401661,
"grad_norm": 0.18546254868875062,
"learning_rate": 1.4062325681767469e-05,
"loss": 0.1096,
"num_input_tokens_seen": 1267351616,
"step": 5250
},
{
"epoch": 2.274820687051718,
"grad_norm": 0.21880629906349583,
"learning_rate": 1.3734003780448218e-05,
"loss": 0.1089,
"num_input_tokens_seen": 1272350592,
"step": 5271
},
{
"epoch": 2.283880709701774,
"grad_norm": 0.1996371660776893,
"learning_rate": 1.340894881123932e-05,
"loss": 0.1093,
"num_input_tokens_seen": 1277314160,
"step": 5292
},
{
"epoch": 2.292940732351831,
"grad_norm": 0.18322023913039737,
"learning_rate": 1.308719005590957e-05,
"loss": 0.1064,
"num_input_tokens_seen": 1282348896,
"step": 5313
},
{
"epoch": 2.3020007550018873,
"grad_norm": 0.19825429674508396,
"learning_rate": 1.276875649929654e-05,
"loss": 0.1103,
"num_input_tokens_seen": 1287503120,
"step": 5334
},
{
"epoch": 2.311060777651944,
"grad_norm": 0.20100225641083314,
"learning_rate": 1.2453676826695532e-05,
"loss": 0.1077,
"num_input_tokens_seen": 1292488224,
"step": 5355
},
{
"epoch": 2.320120800302001,
"grad_norm": 0.19869949736224346,
"learning_rate": 1.2141979421275545e-05,
"loss": 0.1051,
"num_input_tokens_seen": 1297613792,
"step": 5376
},
{
"epoch": 2.329180822952057,
"grad_norm": 0.20145867765354752,
"learning_rate": 1.1833692361522459e-05,
"loss": 0.1063,
"num_input_tokens_seen": 1302765200,
"step": 5397
},
{
"epoch": 2.338240845602114,
"grad_norm": 0.20680505787617295,
"learning_rate": 1.1528843418709622e-05,
"loss": 0.1073,
"num_input_tokens_seen": 1307780896,
"step": 5418
},
{
"epoch": 2.3473008682521708,
"grad_norm": 0.23847500451035963,
"learning_rate": 1.1227460054396177e-05,
"loss": 0.1076,
"num_input_tokens_seen": 1312916864,
"step": 5439
},
{
"epoch": 2.356360890902227,
"grad_norm": 0.21518454470567003,
"learning_rate": 1.0929569417953278e-05,
"loss": 0.1049,
"num_input_tokens_seen": 1317924528,
"step": 5460
},
{
"epoch": 2.365420913552284,
"grad_norm": 0.19953783012904103,
"learning_rate": 1.0635198344118296e-05,
"loss": 0.1038,
"num_input_tokens_seen": 1322892896,
"step": 5481
},
{
"epoch": 2.3744809362023407,
"grad_norm": 0.20097656219123833,
"learning_rate": 1.034437335057762e-05,
"loss": 0.1049,
"num_input_tokens_seen": 1328000960,
"step": 5502
},
{
"epoch": 2.383540958852397,
"grad_norm": 0.20223248741837738,
"learning_rate": 1.005712063557776e-05,
"loss": 0.1026,
"num_input_tokens_seen": 1333104928,
"step": 5523
},
{
"epoch": 2.392600981502454,
"grad_norm": 0.184389360298103,
"learning_rate": 9.773466075565457e-06,
"loss": 0.1061,
"num_input_tokens_seen": 1338094928,
"step": 5544
},
{
"epoch": 2.40166100415251,
"grad_norm": 0.18202586925329933,
"learning_rate": 9.493435222856556e-06,
"loss": 0.1078,
"num_input_tokens_seen": 1343094352,
"step": 5565
},
{
"epoch": 2.410721026802567,
"grad_norm": 0.21238526697964133,
"learning_rate": 9.21705330333426e-06,
"loss": 0.1021,
"num_input_tokens_seen": 1348209008,
"step": 5586
},
{
"epoch": 2.4197810494526237,
"grad_norm": 0.2033611614783377,
"learning_rate": 8.944345214176675e-06,
"loss": 0.105,
"num_input_tokens_seen": 1353281712,
"step": 5607
},
{
"epoch": 2.42884107210268,
"grad_norm": 0.19144661395169293,
"learning_rate": 8.675335521614036e-06,
"loss": 0.1039,
"num_input_tokens_seen": 1358325728,
"step": 5628
},
{
"epoch": 2.437901094752737,
"grad_norm": 0.20545555012965147,
"learning_rate": 8.410048458715763e-06,
"loss": 0.1026,
"num_input_tokens_seen": 1363274864,
"step": 5649
},
{
"epoch": 2.4469611174027937,
"grad_norm": 0.20596285141748574,
"learning_rate": 8.148507923207377e-06,
"loss": 0.1046,
"num_input_tokens_seen": 1368398176,
"step": 5670
},
{
"epoch": 2.45602114005285,
"grad_norm": 0.21097019629979452,
"learning_rate": 7.890737475317817e-06,
"loss": 0.1062,
"num_input_tokens_seen": 1373421664,
"step": 5691
},
{
"epoch": 2.465081162702907,
"grad_norm": 0.1903944548607354,
"learning_rate": 7.636760335657056e-06,
"loss": 0.1005,
"num_input_tokens_seen": 1378386688,
"step": 5712
},
{
"epoch": 2.4741411853529636,
"grad_norm": 0.19609864215469505,
"learning_rate": 7.38659938312432e-06,
"loss": 0.1008,
"num_input_tokens_seen": 1383515360,
"step": 5733
},
{
"epoch": 2.48320120800302,
"grad_norm": 0.18901755025774616,
"learning_rate": 7.140277152847103e-06,
"loss": 0.1012,
"num_input_tokens_seen": 1388651712,
"step": 5754
},
{
"epoch": 2.4922612306530767,
"grad_norm": 0.2089521843263624,
"learning_rate": 6.89781583415115e-06,
"loss": 0.1004,
"num_input_tokens_seen": 1393819168,
"step": 5775
},
{
"epoch": 2.501321253303133,
"grad_norm": 0.20297486453222555,
"learning_rate": 6.659237268561569e-06,
"loss": 0.1058,
"num_input_tokens_seen": 1399005008,
"step": 5796
},
{
"epoch": 2.51038127595319,
"grad_norm": 0.1950872269091398,
"learning_rate": 6.424562947835367e-06,
"loss": 0.0996,
"num_input_tokens_seen": 1404075040,
"step": 5817
},
{
"epoch": 2.5194412986032466,
"grad_norm": 0.19137900590478205,
"learning_rate": 6.193814012025278e-06,
"loss": 0.098,
"num_input_tokens_seen": 1409145760,
"step": 5838
},
{
"epoch": 2.5285013212533034,
"grad_norm": 0.21343987395986905,
"learning_rate": 5.967011247575532e-06,
"loss": 0.1053,
"num_input_tokens_seen": 1414225568,
"step": 5859
},
{
"epoch": 2.5375613439033597,
"grad_norm": 0.21335949851815006,
"learning_rate": 5.744175085449338e-06,
"loss": 0.1021,
"num_input_tokens_seen": 1419339216,
"step": 5880
},
{
"epoch": 2.5466213665534165,
"grad_norm": 0.19658196939034006,
"learning_rate": 5.525325599288356e-06,
"loss": 0.1003,
"num_input_tokens_seen": 1424423024,
"step": 5901
},
{
"epoch": 2.555681389203473,
"grad_norm": 0.1831370056225536,
"learning_rate": 5.310482503604497e-06,
"loss": 0.1039,
"num_input_tokens_seen": 1429360512,
"step": 5922
},
{
"epoch": 2.5647414118535297,
"grad_norm": 0.21013156618721565,
"learning_rate": 5.09966515200393e-06,
"loss": 0.1034,
"num_input_tokens_seen": 1434443216,
"step": 5943
},
{
"epoch": 2.5738014345035864,
"grad_norm": 0.2204689190211589,
"learning_rate": 4.892892535443655e-06,
"loss": 0.1025,
"num_input_tokens_seen": 1439693152,
"step": 5964
},
{
"epoch": 2.582861457153643,
"grad_norm": 0.20888767875138448,
"learning_rate": 4.690183280520777e-06,
"loss": 0.1041,
"num_input_tokens_seen": 1444742640,
"step": 5985
},
{
"epoch": 2.5919214798036996,
"grad_norm": 0.20314033156230726,
"learning_rate": 4.491555647794609e-06,
"loss": 0.1035,
"num_input_tokens_seen": 1449817024,
"step": 6006
},
{
"epoch": 2.600981502453756,
"grad_norm": 0.1878500846044568,
"learning_rate": 4.297027530141634e-06,
"loss": 0.102,
"num_input_tokens_seen": 1454966656,
"step": 6027
},
{
"epoch": 2.6100415251038127,
"grad_norm": 0.1689243463349296,
"learning_rate": 4.106616451143719e-06,
"loss": 0.0968,
"num_input_tokens_seen": 1460107904,
"step": 6048
},
{
"epoch": 2.6191015477538695,
"grad_norm": 0.20642026958771845,
"learning_rate": 3.9203395635095615e-06,
"loss": 0.1025,
"num_input_tokens_seen": 1465329712,
"step": 6069
},
{
"epoch": 2.6281615704039263,
"grad_norm": 0.18586990386683522,
"learning_rate": 3.7382136475294592e-06,
"loss": 0.0992,
"num_input_tokens_seen": 1470486400,
"step": 6090
},
{
"epoch": 2.6372215930539826,
"grad_norm": 0.2145218277764739,
"learning_rate": 3.5602551095638094e-06,
"loss": 0.1014,
"num_input_tokens_seen": 1475481216,
"step": 6111
},
{
"epoch": 2.6462816157040394,
"grad_norm": 0.18413690443200365,
"learning_rate": 3.386479980565077e-06,
"loss": 0.097,
"num_input_tokens_seen": 1480509520,
"step": 6132
},
{
"epoch": 2.6553416383540958,
"grad_norm": 0.20568925987079073,
"learning_rate": 3.2169039146337455e-06,
"loss": 0.1011,
"num_input_tokens_seen": 1485415168,
"step": 6153
},
{
"epoch": 2.6644016610041525,
"grad_norm": 0.19689596480908558,
"learning_rate": 3.0515421876081364e-06,
"loss": 0.1003,
"num_input_tokens_seen": 1490580288,
"step": 6174
},
{
"epoch": 2.6734616836542093,
"grad_norm": 0.19998505506705416,
"learning_rate": 2.8904096956883396e-06,
"loss": 0.1011,
"num_input_tokens_seen": 1495724928,
"step": 6195
},
{
"epoch": 2.6825217063042657,
"grad_norm": 0.19403273255448156,
"learning_rate": 2.733520954094304e-06,
"loss": 0.0992,
"num_input_tokens_seen": 1500671568,
"step": 6216
},
{
"epoch": 2.6915817289543225,
"grad_norm": 0.2062968511222456,
"learning_rate": 2.580890095758276e-06,
"loss": 0.0985,
"num_input_tokens_seen": 1505736848,
"step": 6237
},
{
"epoch": 2.700641751604379,
"grad_norm": 0.18160540614114035,
"learning_rate": 2.4325308700516804e-06,
"loss": 0.0999,
"num_input_tokens_seen": 1510772384,
"step": 6258
},
{
"epoch": 2.7097017742544356,
"grad_norm": 0.1923201194653099,
"learning_rate": 2.288456641546549e-06,
"loss": 0.1015,
"num_input_tokens_seen": 1515840336,
"step": 6279
},
{
"epoch": 2.7187617969044924,
"grad_norm": 0.19463587674679012,
"learning_rate": 2.1486803888115802e-06,
"loss": 0.0952,
"num_input_tokens_seen": 1520795728,
"step": 6300
},
{
"epoch": 2.7278218195545487,
"grad_norm": 0.20675363777805839,
"learning_rate": 2.013214703242994e-06,
"loss": 0.1014,
"num_input_tokens_seen": 1525885232,
"step": 6321
},
{
"epoch": 2.7368818422046055,
"grad_norm": 0.20373174987364437,
"learning_rate": 1.8820717879303175e-06,
"loss": 0.0962,
"num_input_tokens_seen": 1531020736,
"step": 6342
},
{
"epoch": 2.7459418648546623,
"grad_norm": 0.1956195345947546,
"learning_rate": 1.7552634565570325e-06,
"loss": 0.0984,
"num_input_tokens_seen": 1536139280,
"step": 6363
},
{
"epoch": 2.7550018875047186,
"grad_norm": 0.1948987852214273,
"learning_rate": 1.6328011323364313e-06,
"loss": 0.0996,
"num_input_tokens_seen": 1541119392,
"step": 6384
},
{
"epoch": 2.7640619101547754,
"grad_norm": 0.20220064200986687,
"learning_rate": 1.5146958469825445e-06,
"loss": 0.098,
"num_input_tokens_seen": 1546172016,
"step": 6405
},
{
"epoch": 2.773121932804832,
"grad_norm": 0.18253057622469138,
"learning_rate": 1.4009582397163879e-06,
"loss": 0.0979,
"num_input_tokens_seen": 1551424800,
"step": 6426
},
{
"epoch": 2.7821819554548886,
"grad_norm": 0.19954203330015002,
"learning_rate": 1.2915985563075383e-06,
"loss": 0.096,
"num_input_tokens_seen": 1556510032,
"step": 6447
},
{
"epoch": 2.7912419781049453,
"grad_norm": 0.18349734934457243,
"learning_rate": 1.1866266481512234e-06,
"loss": 0.0995,
"num_input_tokens_seen": 1561425840,
"step": 6468
},
{
"epoch": 2.8003020007550017,
"grad_norm": 0.19583118554915796,
"learning_rate": 1.0860519713808082e-06,
"loss": 0.0979,
"num_input_tokens_seen": 1566437584,
"step": 6489
},
{
"epoch": 2.8093620234050585,
"grad_norm": 0.21286644366673166,
"learning_rate": 9.898835860160271e-07,
"loss": 0.0944,
"num_input_tokens_seen": 1571433728,
"step": 6510
},
{
"epoch": 2.8184220460551153,
"grad_norm": 0.18549288153723123,
"learning_rate": 8.981301551467924e-07,
"loss": 0.0949,
"num_input_tokens_seen": 1576510304,
"step": 6531
},
{
"epoch": 2.8274820687051716,
"grad_norm": 0.1969304866148859,
"learning_rate": 8.10799944152818e-07,
"loss": 0.0959,
"num_input_tokens_seen": 1581652480,
"step": 6552
},
{
"epoch": 2.8365420913552284,
"grad_norm": 0.19621694587220775,
"learning_rate": 7.279008199590543e-07,
"loss": 0.0995,
"num_input_tokens_seen": 1586710928,
"step": 6573
},
{
"epoch": 2.845602114005285,
"grad_norm": 0.19626099410771614,
"learning_rate": 6.494402503270158e-07,
"loss": 0.0973,
"num_input_tokens_seen": 1591751872,
"step": 6594
},
{
"epoch": 2.8546621366553415,
"grad_norm": 0.17530668226038035,
"learning_rate": 5.754253031820588e-07,
"loss": 0.0992,
"num_input_tokens_seen": 1596817344,
"step": 6615
},
{
"epoch": 2.8637221593053983,
"grad_norm": 0.1962276487617284,
"learning_rate": 5.058626459766902e-07,
"loss": 0.0978,
"num_input_tokens_seen": 1601955280,
"step": 6636
},
{
"epoch": 2.872782181955455,
"grad_norm": 0.19661166968992774,
"learning_rate": 4.407585450899587e-07,
"loss": 0.0963,
"num_input_tokens_seen": 1606977888,
"step": 6657
},
{
"epoch": 2.8818422046055114,
"grad_norm": 0.1900385103617743,
"learning_rate": 3.8011886526292395e-07,
"loss": 0.0932,
"num_input_tokens_seen": 1612137088,
"step": 6678
},
{
"epoch": 2.8909022272555682,
"grad_norm": 0.19054833990071282,
"learning_rate": 3.2394906907040056e-07,
"loss": 0.098,
"num_input_tokens_seen": 1617105760,
"step": 6699
},
{
"epoch": 2.8999622499056246,
"grad_norm": 0.22982156321658473,
"learning_rate": 2.7225421642883554e-07,
"loss": 0.099,
"num_input_tokens_seen": 1622079712,
"step": 6720
},
{
"epoch": 2.9090222725556814,
"grad_norm": 0.20733117617880123,
"learning_rate": 2.250389641405115e-07,
"loss": 0.0974,
"num_input_tokens_seen": 1627262208,
"step": 6741
},
{
"epoch": 2.918082295205738,
"grad_norm": 0.2091535861816229,
"learning_rate": 1.823075654740547e-07,
"loss": 0.0995,
"num_input_tokens_seen": 1632293744,
"step": 6762
},
{
"epoch": 2.9271423178557945,
"grad_norm": 0.2010699924002249,
"learning_rate": 1.4406386978128018e-07,
"loss": 0.0955,
"num_input_tokens_seen": 1637413856,
"step": 6783
},
{
"epoch": 2.9362023405058513,
"grad_norm": 0.2039974530223662,
"learning_rate": 1.1031132215043594e-07,
"loss": 0.095,
"num_input_tokens_seen": 1642376144,
"step": 6804
},
{
"epoch": 2.9452623631559076,
"grad_norm": 0.1867785407127438,
"learning_rate": 8.105296309586785e-08,
"loss": 0.0995,
"num_input_tokens_seen": 1647471008,
"step": 6825
},
{
"epoch": 2.9543223858059644,
"grad_norm": 0.1873978173047584,
"learning_rate": 5.629142828411094e-08,
"loss": 0.0976,
"num_input_tokens_seen": 1652489696,
"step": 6846
},
{
"epoch": 2.963382408456021,
"grad_norm": 0.1977061377471765,
"learning_rate": 3.602894829647374e-08,
"loss": 0.0955,
"num_input_tokens_seen": 1657488848,
"step": 6867
},
{
"epoch": 2.972442431106078,
"grad_norm": 0.19200607060766933,
"learning_rate": 2.0267348428087974e-08,
"loss": 0.0979,
"num_input_tokens_seen": 1662468816,
"step": 6888
},
{
"epoch": 2.9815024537561343,
"grad_norm": 0.21184767679059754,
"learning_rate": 9.008048523501122e-09,
"loss": 0.0999,
"num_input_tokens_seen": 1667627776,
"step": 6909
},
{
"epoch": 2.990562476406191,
"grad_norm": 0.2003643238535032,
"learning_rate": 2.252062848745462e-09,
"loss": 0.099,
"num_input_tokens_seen": 1672724048,
"step": 6930
},
{
"epoch": 2.9996224990562474,
"grad_norm": 0.19768122036104033,
"learning_rate": 0.0,
"loss": 0.0971,
"num_input_tokens_seen": 1677860944,
"step": 6951
},
{
"epoch": 2.9996224990562474,
"num_input_tokens_seen": 1677860944,
"step": 6951,
"total_flos": 8545808136798208.0,
"train_loss": 0.2019795169591595,
"train_runtime": 178782.2648,
"train_samples_per_second": 4.978,
"train_steps_per_second": 0.039
}
],
"logging_steps": 21,
"max_steps": 6951,
"num_input_tokens_seen": 1677860944,
"num_train_epochs": 3,
"save_steps": 5000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 8545808136798208.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}