ner-on-types / trainer_state.json
arynkiewicz's picture
Model save
54198be verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 4689,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0064,
"grad_norm": 2.760316424583359,
"learning_rate": 3.1914893617021275e-07,
"loss": 1.6571,
"num_tokens": 1208276.0,
"step": 10
},
{
"epoch": 0.0128,
"grad_norm": 1.9639395470560352,
"learning_rate": 6.73758865248227e-07,
"loss": 1.6372,
"num_tokens": 2410446.0,
"step": 20
},
{
"epoch": 0.0192,
"grad_norm": 1.6857950160814903,
"learning_rate": 1.0283687943262412e-06,
"loss": 1.6138,
"num_tokens": 3622536.0,
"step": 30
},
{
"epoch": 0.0256,
"grad_norm": 1.9355649405079267,
"learning_rate": 1.3829787234042555e-06,
"loss": 1.554,
"num_tokens": 4837847.0,
"step": 40
},
{
"epoch": 0.032,
"grad_norm": 1.0134479979543427,
"learning_rate": 1.7375886524822697e-06,
"loss": 1.5138,
"num_tokens": 6044886.0,
"step": 50
},
{
"epoch": 0.0384,
"grad_norm": 0.7097712225560386,
"learning_rate": 2.092198581560284e-06,
"loss": 1.4577,
"num_tokens": 7255346.0,
"step": 60
},
{
"epoch": 0.0448,
"grad_norm": 0.7563602572113316,
"learning_rate": 2.446808510638298e-06,
"loss": 1.4239,
"num_tokens": 8465627.0,
"step": 70
},
{
"epoch": 0.0512,
"grad_norm": 0.6411265148411116,
"learning_rate": 2.8014184397163125e-06,
"loss": 1.3857,
"num_tokens": 9667266.0,
"step": 80
},
{
"epoch": 0.0576,
"grad_norm": 0.7071256376230877,
"learning_rate": 3.1560283687943267e-06,
"loss": 1.3736,
"num_tokens": 10869831.0,
"step": 90
},
{
"epoch": 0.064,
"grad_norm": 0.7623180191305359,
"learning_rate": 3.510638297872341e-06,
"loss": 1.3722,
"num_tokens": 12083093.0,
"step": 100
},
{
"epoch": 0.0704,
"grad_norm": 0.651385333897087,
"learning_rate": 3.865248226950355e-06,
"loss": 1.3468,
"num_tokens": 13290331.0,
"step": 110
},
{
"epoch": 0.0768,
"grad_norm": 0.8706225351642094,
"learning_rate": 4.219858156028369e-06,
"loss": 1.3387,
"num_tokens": 14488386.0,
"step": 120
},
{
"epoch": 0.0832,
"grad_norm": 0.84726755662717,
"learning_rate": 4.574468085106383e-06,
"loss": 1.3364,
"num_tokens": 15690608.0,
"step": 130
},
{
"epoch": 0.0896,
"grad_norm": 0.8553144960607314,
"learning_rate": 4.929078014184397e-06,
"loss": 1.3207,
"num_tokens": 16894120.0,
"step": 140
},
{
"epoch": 0.096,
"grad_norm": 0.6845288880044453,
"learning_rate": 4.999961827753897e-06,
"loss": 1.3072,
"num_tokens": 18098866.0,
"step": 150
},
{
"epoch": 0.1024,
"grad_norm": 0.7060413425833653,
"learning_rate": 4.999806755001946e-06,
"loss": 1.293,
"num_tokens": 19317515.0,
"step": 160
},
{
"epoch": 0.1088,
"grad_norm": 1.112301905134234,
"learning_rate": 4.999532403372408e-06,
"loss": 1.2933,
"num_tokens": 20523986.0,
"step": 170
},
{
"epoch": 0.1152,
"grad_norm": 0.7057273926728088,
"learning_rate": 4.9991387859560365e-06,
"loss": 1.3105,
"num_tokens": 21730204.0,
"step": 180
},
{
"epoch": 0.1216,
"grad_norm": 0.7046621457199816,
"learning_rate": 4.9986259215343814e-06,
"loss": 1.3036,
"num_tokens": 22941629.0,
"step": 190
},
{
"epoch": 0.128,
"grad_norm": 0.6753839003505228,
"learning_rate": 4.997993834578891e-06,
"loss": 1.2837,
"num_tokens": 24149743.0,
"step": 200
},
{
"epoch": 0.1344,
"grad_norm": 0.6833117540920727,
"learning_rate": 4.997242555249746e-06,
"loss": 1.2798,
"num_tokens": 25350421.0,
"step": 210
},
{
"epoch": 0.1408,
"grad_norm": 0.7496565711502305,
"learning_rate": 4.996372119394418e-06,
"loss": 1.2872,
"num_tokens": 26553851.0,
"step": 220
},
{
"epoch": 0.1472,
"grad_norm": 0.8257784450438341,
"learning_rate": 4.9953825685459635e-06,
"loss": 1.2715,
"num_tokens": 27756494.0,
"step": 230
},
{
"epoch": 0.1536,
"grad_norm": 0.8586750458312551,
"learning_rate": 4.994273949921038e-06,
"loss": 1.273,
"num_tokens": 28966311.0,
"step": 240
},
{
"epoch": 0.16,
"grad_norm": 0.8942167127143708,
"learning_rate": 4.993046316417643e-06,
"loss": 1.2615,
"num_tokens": 30165165.0,
"step": 250
},
{
"epoch": 0.1664,
"grad_norm": 0.7320667303892974,
"learning_rate": 4.991699726612607e-06,
"loss": 1.2598,
"num_tokens": 31372687.0,
"step": 260
},
{
"epoch": 0.1728,
"grad_norm": 0.7759159652826615,
"learning_rate": 4.990234244758785e-06,
"loss": 1.2378,
"num_tokens": 32578240.0,
"step": 270
},
{
"epoch": 0.1792,
"grad_norm": 0.7081937298786585,
"learning_rate": 4.988649940781992e-06,
"loss": 1.2496,
"num_tokens": 33788704.0,
"step": 280
},
{
"epoch": 0.1856,
"grad_norm": 0.8354872354621143,
"learning_rate": 4.986946890277673e-06,
"loss": 1.239,
"num_tokens": 34992041.0,
"step": 290
},
{
"epoch": 0.192,
"grad_norm": 0.7419306542972816,
"learning_rate": 4.9851251745072905e-06,
"loss": 1.2334,
"num_tokens": 36202424.0,
"step": 300
},
{
"epoch": 0.1984,
"grad_norm": 0.8124424043952861,
"learning_rate": 4.983184880394447e-06,
"loss": 1.2423,
"num_tokens": 37406998.0,
"step": 310
},
{
"epoch": 0.2048,
"grad_norm": 0.9137121442594122,
"learning_rate": 4.981126100520743e-06,
"loss": 1.2398,
"num_tokens": 38614024.0,
"step": 320
},
{
"epoch": 0.2112,
"grad_norm": 0.8692171799253517,
"learning_rate": 4.978948933121351e-06,
"loss": 1.2274,
"num_tokens": 39818938.0,
"step": 330
},
{
"epoch": 0.2176,
"grad_norm": 0.7959433307352174,
"learning_rate": 4.976653482080335e-06,
"loss": 1.2432,
"num_tokens": 41029985.0,
"step": 340
},
{
"epoch": 0.224,
"grad_norm": 0.9183385731990914,
"learning_rate": 4.97423985692569e-06,
"loss": 1.2183,
"num_tokens": 42241595.0,
"step": 350
},
{
"epoch": 0.2304,
"grad_norm": 0.8800279308744207,
"learning_rate": 4.97170817282412e-06,
"loss": 1.2174,
"num_tokens": 43436994.0,
"step": 360
},
{
"epoch": 0.2368,
"grad_norm": 0.8482042891364965,
"learning_rate": 4.969058550575535e-06,
"loss": 1.214,
"num_tokens": 44649051.0,
"step": 370
},
{
"epoch": 0.2432,
"grad_norm": 0.8597854654288322,
"learning_rate": 4.966291116607297e-06,
"loss": 1.2105,
"num_tokens": 45857075.0,
"step": 380
},
{
"epoch": 0.2496,
"grad_norm": 0.8904371734549302,
"learning_rate": 4.96340600296818e-06,
"loss": 1.1976,
"num_tokens": 47059498.0,
"step": 390
},
{
"epoch": 0.256,
"grad_norm": 0.864096324906862,
"learning_rate": 4.960403347322069e-06,
"loss": 1.2067,
"num_tokens": 48273286.0,
"step": 400
},
{
"epoch": 0.2624,
"grad_norm": 0.8417001685001565,
"learning_rate": 4.957283292941401e-06,
"loss": 1.2012,
"num_tokens": 49479835.0,
"step": 410
},
{
"epoch": 0.2688,
"grad_norm": 0.8738206939182319,
"learning_rate": 4.954045988700315e-06,
"loss": 1.2081,
"num_tokens": 50692484.0,
"step": 420
},
{
"epoch": 0.2752,
"grad_norm": 0.9214341760640065,
"learning_rate": 4.9506915890675566e-06,
"loss": 1.1982,
"num_tokens": 51904151.0,
"step": 430
},
{
"epoch": 0.2816,
"grad_norm": 0.8270044046785595,
"learning_rate": 4.94722025409911e-06,
"loss": 1.2003,
"num_tokens": 53107439.0,
"step": 440
},
{
"epoch": 0.288,
"grad_norm": 0.9325298797380837,
"learning_rate": 4.943632149430552e-06,
"loss": 1.1934,
"num_tokens": 54311802.0,
"step": 450
},
{
"epoch": 0.2944,
"grad_norm": 0.8173318542721012,
"learning_rate": 4.9399274462691555e-06,
"loss": 1.183,
"num_tokens": 55516169.0,
"step": 460
},
{
"epoch": 0.3008,
"grad_norm": 0.8403372189641363,
"learning_rate": 4.93610632138572e-06,
"loss": 1.2011,
"num_tokens": 56720582.0,
"step": 470
},
{
"epoch": 0.3072,
"grad_norm": 0.9133683374494203,
"learning_rate": 4.9321689571061314e-06,
"loss": 1.1863,
"num_tokens": 57923305.0,
"step": 480
},
{
"epoch": 0.3136,
"grad_norm": 0.8342006897685076,
"learning_rate": 4.928115541302672e-06,
"loss": 1.1789,
"num_tokens": 59119131.0,
"step": 490
},
{
"epoch": 0.32,
"grad_norm": 0.9237208555707096,
"learning_rate": 4.923946267385043e-06,
"loss": 1.1823,
"num_tokens": 60323216.0,
"step": 500
},
{
"epoch": 0.3264,
"grad_norm": 1.138961215949811,
"learning_rate": 4.91966133429115e-06,
"loss": 1.1849,
"num_tokens": 61536243.0,
"step": 510
},
{
"epoch": 0.3328,
"grad_norm": 0.8179215725319021,
"learning_rate": 4.915260946477601e-06,
"loss": 1.1689,
"num_tokens": 62725558.0,
"step": 520
},
{
"epoch": 0.3392,
"grad_norm": 0.8196458509991646,
"learning_rate": 4.910745313909953e-06,
"loss": 1.1754,
"num_tokens": 63929035.0,
"step": 530
},
{
"epoch": 0.3456,
"grad_norm": 0.8606903543941481,
"learning_rate": 4.906114652052694e-06,
"loss": 1.1608,
"num_tokens": 65137799.0,
"step": 540
},
{
"epoch": 0.352,
"grad_norm": 0.842427893289404,
"learning_rate": 4.9013691818589635e-06,
"loss": 1.176,
"num_tokens": 66343119.0,
"step": 550
},
{
"epoch": 0.3584,
"grad_norm": 0.9536458222010928,
"learning_rate": 4.896509129760008e-06,
"loss": 1.1766,
"num_tokens": 67554625.0,
"step": 560
},
{
"epoch": 0.3648,
"grad_norm": 0.8456584910416223,
"learning_rate": 4.891534727654374e-06,
"loss": 1.1704,
"num_tokens": 68767553.0,
"step": 570
},
{
"epoch": 0.3712,
"grad_norm": 0.825023352714185,
"learning_rate": 4.886446212896853e-06,
"loss": 1.1662,
"num_tokens": 69977707.0,
"step": 580
},
{
"epoch": 0.3776,
"grad_norm": 0.8327520829988985,
"learning_rate": 4.881243828287141e-06,
"loss": 1.1715,
"num_tokens": 71189476.0,
"step": 590
},
{
"epoch": 0.384,
"grad_norm": 0.840077866672345,
"learning_rate": 4.875927822058265e-06,
"loss": 1.1711,
"num_tokens": 72395847.0,
"step": 600
},
{
"epoch": 0.3904,
"grad_norm": 0.8253947193633453,
"learning_rate": 4.870498447864735e-06,
"loss": 1.1439,
"num_tokens": 73594932.0,
"step": 610
},
{
"epoch": 0.3968,
"grad_norm": 0.9212419524845424,
"learning_rate": 4.864955964770442e-06,
"loss": 1.1643,
"num_tokens": 74802657.0,
"step": 620
},
{
"epoch": 0.4032,
"grad_norm": 0.9296250658068028,
"learning_rate": 4.859300637236289e-06,
"loss": 1.1534,
"num_tokens": 76011529.0,
"step": 630
},
{
"epoch": 0.4096,
"grad_norm": 1.057634627530951,
"learning_rate": 4.853532735107587e-06,
"loss": 1.1507,
"num_tokens": 77210334.0,
"step": 640
},
{
"epoch": 0.416,
"grad_norm": 0.8097939416205123,
"learning_rate": 4.847652533601164e-06,
"loss": 1.1395,
"num_tokens": 78425328.0,
"step": 650
},
{
"epoch": 0.4224,
"grad_norm": 0.8447649876579609,
"learning_rate": 4.8416603132922425e-06,
"loss": 1.1378,
"num_tokens": 79638521.0,
"step": 660
},
{
"epoch": 0.4288,
"grad_norm": 0.9421170322416722,
"learning_rate": 4.83555636010105e-06,
"loss": 1.1349,
"num_tokens": 80836868.0,
"step": 670
},
{
"epoch": 0.4352,
"grad_norm": 0.9009555407016511,
"learning_rate": 4.829340965279173e-06,
"loss": 1.1482,
"num_tokens": 82050746.0,
"step": 680
},
{
"epoch": 0.4416,
"grad_norm": 0.9304718962620818,
"learning_rate": 4.823014425395662e-06,
"loss": 1.1535,
"num_tokens": 83256247.0,
"step": 690
},
{
"epoch": 0.448,
"grad_norm": 0.8268029795401431,
"learning_rate": 4.816577042322883e-06,
"loss": 1.1625,
"num_tokens": 84466963.0,
"step": 700
},
{
"epoch": 0.4544,
"grad_norm": 0.8118838757785675,
"learning_rate": 4.810029123222109e-06,
"loss": 1.1582,
"num_tokens": 85668747.0,
"step": 710
},
{
"epoch": 0.4608,
"grad_norm": 0.8191391458452703,
"learning_rate": 4.803370980528868e-06,
"loss": 1.1508,
"num_tokens": 86869314.0,
"step": 720
},
{
"epoch": 0.4672,
"grad_norm": 0.8573356891805307,
"learning_rate": 4.796602931938031e-06,
"loss": 1.1367,
"num_tokens": 88072166.0,
"step": 730
},
{
"epoch": 0.4736,
"grad_norm": 0.9130087766709583,
"learning_rate": 4.789725300388658e-06,
"loss": 1.1496,
"num_tokens": 89276560.0,
"step": 740
},
{
"epoch": 0.48,
"grad_norm": 0.8756224792489176,
"learning_rate": 4.782738414048581e-06,
"loss": 1.1387,
"num_tokens": 90489167.0,
"step": 750
},
{
"epoch": 0.4864,
"grad_norm": 0.8660533049576743,
"learning_rate": 4.775642606298758e-06,
"loss": 1.1293,
"num_tokens": 91699027.0,
"step": 760
},
{
"epoch": 0.4928,
"grad_norm": 0.9344747635312723,
"learning_rate": 4.7684382157173515e-06,
"loss": 1.1544,
"num_tokens": 92907904.0,
"step": 770
},
{
"epoch": 0.4992,
"grad_norm": 0.8232769483557345,
"learning_rate": 4.761125586063583e-06,
"loss": 1.1509,
"num_tokens": 94108258.0,
"step": 780
},
{
"epoch": 0.5056,
"grad_norm": 0.8019044034927749,
"learning_rate": 4.753705066261326e-06,
"loss": 1.142,
"num_tokens": 95319591.0,
"step": 790
},
{
"epoch": 0.512,
"grad_norm": 0.8744491818182848,
"learning_rate": 4.74617701038246e-06,
"loss": 1.1407,
"num_tokens": 96527466.0,
"step": 800
},
{
"epoch": 0.5184,
"grad_norm": 0.8457377069978257,
"learning_rate": 4.738541777629971e-06,
"loss": 1.1454,
"num_tokens": 97741955.0,
"step": 810
},
{
"epoch": 0.5248,
"grad_norm": 0.8367461594303044,
"learning_rate": 4.730799732320819e-06,
"loss": 1.1499,
"num_tokens": 98947846.0,
"step": 820
},
{
"epoch": 0.5312,
"grad_norm": 0.8153933334854007,
"learning_rate": 4.722951243868547e-06,
"loss": 1.1338,
"num_tokens": 100149443.0,
"step": 830
},
{
"epoch": 0.5376,
"grad_norm": 0.9553883385280855,
"learning_rate": 4.7149966867656625e-06,
"loss": 1.1239,
"num_tokens": 101354489.0,
"step": 840
},
{
"epoch": 0.544,
"grad_norm": 0.8020256868069202,
"learning_rate": 4.706936440565759e-06,
"loss": 1.1233,
"num_tokens": 102561908.0,
"step": 850
},
{
"epoch": 0.5504,
"grad_norm": 0.8506848444686664,
"learning_rate": 4.698770889865414e-06,
"loss": 1.1314,
"num_tokens": 103765389.0,
"step": 860
},
{
"epoch": 0.5568,
"grad_norm": 0.8931807739845334,
"learning_rate": 4.690500424285833e-06,
"loss": 1.1367,
"num_tokens": 104973326.0,
"step": 870
},
{
"epoch": 0.5632,
"grad_norm": 0.8498884776316712,
"learning_rate": 4.682125438454261e-06,
"loss": 1.1329,
"num_tokens": 106184942.0,
"step": 880
},
{
"epoch": 0.5696,
"grad_norm": 0.8866656591752357,
"learning_rate": 4.673646331985151e-06,
"loss": 1.1469,
"num_tokens": 107391403.0,
"step": 890
},
{
"epoch": 0.576,
"grad_norm": 0.8247486140289442,
"learning_rate": 4.665063509461098e-06,
"loss": 1.1304,
"num_tokens": 108599244.0,
"step": 900
},
{
"epoch": 0.5824,
"grad_norm": 0.8509584195104843,
"learning_rate": 4.6563773804135305e-06,
"loss": 1.1205,
"num_tokens": 109802767.0,
"step": 910
},
{
"epoch": 0.5888,
"grad_norm": 0.9532478448654986,
"learning_rate": 4.647588359303178e-06,
"loss": 1.135,
"num_tokens": 111002144.0,
"step": 920
},
{
"epoch": 0.5952,
"grad_norm": 0.795143766492276,
"learning_rate": 4.638696865500284e-06,
"loss": 1.133,
"num_tokens": 112202360.0,
"step": 930
},
{
"epoch": 0.6016,
"grad_norm": 0.8884950967785606,
"learning_rate": 4.629703323264605e-06,
"loss": 1.1174,
"num_tokens": 113410661.0,
"step": 940
},
{
"epoch": 0.608,
"grad_norm": 0.8094095645216874,
"learning_rate": 4.62060816172516e-06,
"loss": 1.1359,
"num_tokens": 114615154.0,
"step": 950
},
{
"epoch": 0.6144,
"grad_norm": 0.8517004319099382,
"learning_rate": 4.611411814859758e-06,
"loss": 1.1141,
"num_tokens": 115826696.0,
"step": 960
},
{
"epoch": 0.6208,
"grad_norm": 0.8739388391386897,
"learning_rate": 4.602114721474293e-06,
"loss": 1.1204,
"num_tokens": 117030663.0,
"step": 970
},
{
"epoch": 0.6272,
"grad_norm": 1.0126603878935398,
"learning_rate": 4.592717325181798e-06,
"loss": 1.1259,
"num_tokens": 118243461.0,
"step": 980
},
{
"epoch": 0.6336,
"grad_norm": 0.7961249459761912,
"learning_rate": 4.583220074381288e-06,
"loss": 1.1105,
"num_tokens": 119444400.0,
"step": 990
},
{
"epoch": 0.64,
"grad_norm": 0.8547801323336933,
"learning_rate": 4.573623422236359e-06,
"loss": 1.1247,
"num_tokens": 120646721.0,
"step": 1000
},
{
"epoch": 0.6464,
"grad_norm": 0.8827343366608609,
"learning_rate": 4.563927826653562e-06,
"loss": 1.1381,
"num_tokens": 121856814.0,
"step": 1010
},
{
"epoch": 0.6528,
"grad_norm": 0.8379604515543791,
"learning_rate": 4.554133750260561e-06,
"loss": 1.1038,
"num_tokens": 123063137.0,
"step": 1020
},
{
"epoch": 0.6592,
"grad_norm": 0.9009991930297082,
"learning_rate": 4.544241660384057e-06,
"loss": 1.1351,
"num_tokens": 124281752.0,
"step": 1030
},
{
"epoch": 0.6656,
"grad_norm": 0.9398290903202526,
"learning_rate": 4.534252029027485e-06,
"loss": 1.132,
"num_tokens": 125483927.0,
"step": 1040
},
{
"epoch": 0.672,
"grad_norm": 0.8135458599046622,
"learning_rate": 4.5241653328484965e-06,
"loss": 1.1137,
"num_tokens": 126688041.0,
"step": 1050
},
{
"epoch": 0.6784,
"grad_norm": 0.826631698433715,
"learning_rate": 4.5139820531362125e-06,
"loss": 1.1149,
"num_tokens": 127895497.0,
"step": 1060
},
{
"epoch": 0.6848,
"grad_norm": 0.8326760862617015,
"learning_rate": 4.503702675788263e-06,
"loss": 1.1082,
"num_tokens": 129093768.0,
"step": 1070
},
{
"epoch": 0.6912,
"grad_norm": 0.8187909661973681,
"learning_rate": 4.493327691287596e-06,
"loss": 1.1213,
"num_tokens": 130296941.0,
"step": 1080
},
{
"epoch": 0.6976,
"grad_norm": 0.8758642744013126,
"learning_rate": 4.482857594679082e-06,
"loss": 1.1169,
"num_tokens": 131499785.0,
"step": 1090
},
{
"epoch": 0.704,
"grad_norm": 0.9756017880226009,
"learning_rate": 4.472292885545887e-06,
"loss": 1.1182,
"num_tokens": 132704447.0,
"step": 1100
},
{
"epoch": 0.7104,
"grad_norm": 0.9918470716003941,
"learning_rate": 4.4616340679856344e-06,
"loss": 1.112,
"num_tokens": 133914148.0,
"step": 1110
},
{
"epoch": 0.7168,
"grad_norm": 0.7736509572616426,
"learning_rate": 4.450881650586354e-06,
"loss": 1.0948,
"num_tokens": 135116690.0,
"step": 1120
},
{
"epoch": 0.7232,
"grad_norm": 0.8393996918370894,
"learning_rate": 4.440036146402218e-06,
"loss": 1.1196,
"num_tokens": 136325534.0,
"step": 1130
},
{
"epoch": 0.7296,
"grad_norm": 0.8283036410858456,
"learning_rate": 4.429098072929052e-06,
"loss": 1.1249,
"num_tokens": 137532058.0,
"step": 1140
},
{
"epoch": 0.736,
"grad_norm": 1.0272561438627168,
"learning_rate": 4.418067952079651e-06,
"loss": 1.0894,
"num_tokens": 138742925.0,
"step": 1150
},
{
"epoch": 0.7424,
"grad_norm": 0.9457224166686296,
"learning_rate": 4.40694631015887e-06,
"loss": 1.1072,
"num_tokens": 139944361.0,
"step": 1160
},
{
"epoch": 0.7488,
"grad_norm": 0.8472242869303449,
"learning_rate": 4.395733677838515e-06,
"loss": 1.104,
"num_tokens": 141145139.0,
"step": 1170
},
{
"epoch": 0.7552,
"grad_norm": 0.8369893067934512,
"learning_rate": 4.384430590132023e-06,
"loss": 1.1167,
"num_tokens": 142348857.0,
"step": 1180
},
{
"epoch": 0.7616,
"grad_norm": 0.9417838753194914,
"learning_rate": 4.373037586368925e-06,
"loss": 1.0952,
"num_tokens": 143560823.0,
"step": 1190
},
{
"epoch": 0.768,
"grad_norm": 0.83199280244184,
"learning_rate": 4.361555210169126e-06,
"loss": 1.0969,
"num_tokens": 144770576.0,
"step": 1200
},
{
"epoch": 0.7744,
"grad_norm": 0.8757783495810086,
"learning_rate": 4.349984009416952e-06,
"loss": 1.0948,
"num_tokens": 145978862.0,
"step": 1210
},
{
"epoch": 0.7808,
"grad_norm": 0.8374080168936522,
"learning_rate": 4.3383245362350174e-06,
"loss": 1.1087,
"num_tokens": 147191743.0,
"step": 1220
},
{
"epoch": 0.7872,
"grad_norm": 0.8702169752217432,
"learning_rate": 4.326577346957876e-06,
"loss": 1.1099,
"num_tokens": 148399289.0,
"step": 1230
},
{
"epoch": 0.7936,
"grad_norm": 0.8016984816166285,
"learning_rate": 4.314743002105473e-06,
"loss": 1.1052,
"num_tokens": 149602404.0,
"step": 1240
},
{
"epoch": 0.8,
"grad_norm": 1.0811796381892176,
"learning_rate": 4.302822066356408e-06,
"loss": 1.0996,
"num_tokens": 150811734.0,
"step": 1250
},
{
"epoch": 0.8064,
"grad_norm": 0.8374755480022819,
"learning_rate": 4.290815108520982e-06,
"loss": 1.1185,
"num_tokens": 152011294.0,
"step": 1260
},
{
"epoch": 0.8128,
"grad_norm": 0.7904368039438139,
"learning_rate": 4.278722701514061e-06,
"loss": 1.0992,
"num_tokens": 153217258.0,
"step": 1270
},
{
"epoch": 0.8192,
"grad_norm": 0.785661611999425,
"learning_rate": 4.266545422327741e-06,
"loss": 1.1208,
"num_tokens": 154419838.0,
"step": 1280
},
{
"epoch": 0.8256,
"grad_norm": 0.8439322755320521,
"learning_rate": 4.254283852003813e-06,
"loss": 1.1091,
"num_tokens": 155626578.0,
"step": 1290
},
{
"epoch": 0.832,
"grad_norm": 0.8732275622995317,
"learning_rate": 4.241938575606038e-06,
"loss": 1.0826,
"num_tokens": 156825805.0,
"step": 1300
},
{
"epoch": 0.8384,
"grad_norm": 0.8014980196902037,
"learning_rate": 4.229510182192235e-06,
"loss": 1.1093,
"num_tokens": 158037877.0,
"step": 1310
},
{
"epoch": 0.8448,
"grad_norm": 0.8106302375207448,
"learning_rate": 4.216999264786169e-06,
"loss": 1.1073,
"num_tokens": 159245106.0,
"step": 1320
},
{
"epoch": 0.8512,
"grad_norm": 0.9385310776537238,
"learning_rate": 4.204406420349259e-06,
"loss": 1.1056,
"num_tokens": 160456114.0,
"step": 1330
},
{
"epoch": 0.8576,
"grad_norm": 0.9579249297784465,
"learning_rate": 4.191732249752092e-06,
"loss": 1.1021,
"num_tokens": 161659510.0,
"step": 1340
},
{
"epoch": 0.864,
"grad_norm": 0.8134490186326385,
"learning_rate": 4.178977357745749e-06,
"loss": 1.0821,
"num_tokens": 162865495.0,
"step": 1350
},
{
"epoch": 0.8704,
"grad_norm": 0.7943299269230713,
"learning_rate": 4.166142352932957e-06,
"loss": 1.1065,
"num_tokens": 164069925.0,
"step": 1360
},
{
"epoch": 0.8768,
"grad_norm": 0.8171116530483417,
"learning_rate": 4.153227847739041e-06,
"loss": 1.0873,
"num_tokens": 165272777.0,
"step": 1370
},
{
"epoch": 0.8832,
"grad_norm": 0.8472827858602203,
"learning_rate": 4.140234458382708e-06,
"loss": 1.1207,
"num_tokens": 166473564.0,
"step": 1380
},
{
"epoch": 0.8896,
"grad_norm": 0.8254355045966608,
"learning_rate": 4.12716280484664e-06,
"loss": 1.093,
"num_tokens": 167678209.0,
"step": 1390
},
{
"epoch": 0.896,
"grad_norm": 0.8238773032302608,
"learning_rate": 4.114013510847914e-06,
"loss": 1.1004,
"num_tokens": 168879199.0,
"step": 1400
},
{
"epoch": 0.9024,
"grad_norm": 0.8035266067408213,
"learning_rate": 4.100787203808241e-06,
"loss": 1.09,
"num_tokens": 170089062.0,
"step": 1410
},
{
"epoch": 0.9088,
"grad_norm": 0.796684651593008,
"learning_rate": 4.0874845148240265e-06,
"loss": 1.0923,
"num_tokens": 171298354.0,
"step": 1420
},
{
"epoch": 0.9152,
"grad_norm": 0.7944378162845194,
"learning_rate": 4.074106078636259e-06,
"loss": 1.0877,
"num_tokens": 172502932.0,
"step": 1430
},
{
"epoch": 0.9216,
"grad_norm": 0.8222630499336689,
"learning_rate": 4.0606525336002215e-06,
"loss": 1.1069,
"num_tokens": 173714359.0,
"step": 1440
},
{
"epoch": 0.928,
"grad_norm": 0.8284462145945989,
"learning_rate": 4.047124521655037e-06,
"loss": 1.1063,
"num_tokens": 174915024.0,
"step": 1450
},
{
"epoch": 0.9344,
"grad_norm": 1.1184143246349953,
"learning_rate": 4.033522688293033e-06,
"loss": 1.0958,
"num_tokens": 176121314.0,
"step": 1460
},
{
"epoch": 0.9408,
"grad_norm": 0.9302956644371011,
"learning_rate": 4.019847682528943e-06,
"loss": 1.1057,
"num_tokens": 177329003.0,
"step": 1470
},
{
"epoch": 0.9472,
"grad_norm": 0.8315189293207337,
"learning_rate": 4.00610015686894e-06,
"loss": 1.1021,
"num_tokens": 178533383.0,
"step": 1480
},
{
"epoch": 0.9536,
"grad_norm": 0.780029339050911,
"learning_rate": 3.9922807672795015e-06,
"loss": 1.1022,
"num_tokens": 179737544.0,
"step": 1490
},
{
"epoch": 0.96,
"grad_norm": 0.8861787669753409,
"learning_rate": 3.97839017315611e-06,
"loss": 1.1033,
"num_tokens": 180941884.0,
"step": 1500
},
{
"epoch": 0.9664,
"grad_norm": 0.8613329501244571,
"learning_rate": 3.964429037291785e-06,
"loss": 1.0932,
"num_tokens": 182147995.0,
"step": 1510
},
{
"epoch": 0.9728,
"grad_norm": 0.7767446273299125,
"learning_rate": 3.950398025845469e-06,
"loss": 1.0764,
"num_tokens": 183351238.0,
"step": 1520
},
{
"epoch": 0.9792,
"grad_norm": 0.7800388177467502,
"learning_rate": 3.936297808310229e-06,
"loss": 1.0955,
"num_tokens": 184559744.0,
"step": 1530
},
{
"epoch": 0.9856,
"grad_norm": 0.822587499260109,
"learning_rate": 3.9221290574813205e-06,
"loss": 1.101,
"num_tokens": 185771261.0,
"step": 1540
},
{
"epoch": 0.992,
"grad_norm": 0.7842833667912362,
"learning_rate": 3.907892449424081e-06,
"loss": 1.0858,
"num_tokens": 186988878.0,
"step": 1550
},
{
"epoch": 0.9984,
"grad_norm": 0.875565650877801,
"learning_rate": 3.893588663441669e-06,
"loss": 1.1096,
"num_tokens": 188198614.0,
"step": 1560
},
{
"epoch": 1.00448,
"grad_norm": 0.9833099796256903,
"learning_rate": 3.8792183820426575e-06,
"loss": 1.0518,
"num_tokens": 189338860.0,
"step": 1570
},
{
"epoch": 1.01088,
"grad_norm": 0.9539211061323496,
"learning_rate": 3.864782290908462e-06,
"loss": 1.0558,
"num_tokens": 190541615.0,
"step": 1580
},
{
"epoch": 1.01728,
"grad_norm": 0.8277557093113368,
"learning_rate": 3.850281078860627e-06,
"loss": 1.0672,
"num_tokens": 191744590.0,
"step": 1590
},
{
"epoch": 1.02368,
"grad_norm": 0.8095245034674352,
"learning_rate": 3.835715437827954e-06,
"loss": 1.0555,
"num_tokens": 192946831.0,
"step": 1600
},
{
"epoch": 1.03008,
"grad_norm": 0.8670205092911757,
"learning_rate": 3.821086062813492e-06,
"loss": 1.0558,
"num_tokens": 194153241.0,
"step": 1610
},
{
"epoch": 1.03648,
"grad_norm": 0.8041612181651476,
"learning_rate": 3.806393651861372e-06,
"loss": 1.0713,
"num_tokens": 195361386.0,
"step": 1620
},
{
"epoch": 1.04288,
"grad_norm": 0.8201672913405339,
"learning_rate": 3.7916389060234964e-06,
"loss": 1.0612,
"num_tokens": 196570539.0,
"step": 1630
},
{
"epoch": 1.04928,
"grad_norm": 0.822814114472732,
"learning_rate": 3.776822529326097e-06,
"loss": 1.0643,
"num_tokens": 197758018.0,
"step": 1640
},
{
"epoch": 1.05568,
"grad_norm": 0.8405563342503541,
"learning_rate": 3.7619452287361306e-06,
"loss": 1.0576,
"num_tokens": 198962473.0,
"step": 1650
},
{
"epoch": 1.06208,
"grad_norm": 0.8733811946067399,
"learning_rate": 3.7470077141275578e-06,
"loss": 1.0602,
"num_tokens": 200168404.0,
"step": 1660
},
{
"epoch": 1.06848,
"grad_norm": 0.7810891863766373,
"learning_rate": 3.732010698247463e-06,
"loss": 1.0429,
"num_tokens": 201383921.0,
"step": 1670
},
{
"epoch": 1.07488,
"grad_norm": 0.8253121322208729,
"learning_rate": 3.7169548966820466e-06,
"loss": 1.069,
"num_tokens": 202590191.0,
"step": 1680
},
{
"epoch": 1.08128,
"grad_norm": 0.7968885719952052,
"learning_rate": 3.7018410278224852e-06,
"loss": 1.0661,
"num_tokens": 203790064.0,
"step": 1690
},
{
"epoch": 1.08768,
"grad_norm": 0.7513522866065546,
"learning_rate": 3.686669812830648e-06,
"loss": 1.0648,
"num_tokens": 205004834.0,
"step": 1700
},
{
"epoch": 1.09408,
"grad_norm": 0.8133897709614188,
"learning_rate": 3.671441975604689e-06,
"loss": 1.0574,
"num_tokens": 206218130.0,
"step": 1710
},
{
"epoch": 1.10048,
"grad_norm": 0.855169356505383,
"learning_rate": 3.6561582427445053e-06,
"loss": 1.0652,
"num_tokens": 207421774.0,
"step": 1720
},
{
"epoch": 1.10688,
"grad_norm": 0.7861479775879827,
"learning_rate": 3.6408193435170695e-06,
"loss": 1.0601,
"num_tokens": 208639076.0,
"step": 1730
},
{
"epoch": 1.11328,
"grad_norm": 0.7759167355223116,
"learning_rate": 3.625426009821628e-06,
"loss": 1.0515,
"num_tokens": 209843506.0,
"step": 1740
},
{
"epoch": 1.11968,
"grad_norm": 0.7737945956455258,
"learning_rate": 3.609978976154784e-06,
"loss": 1.0449,
"num_tokens": 211053262.0,
"step": 1750
},
{
"epoch": 1.12608,
"grad_norm": 0.8033895393207562,
"learning_rate": 3.594478979575443e-06,
"loss": 1.0653,
"num_tokens": 212256390.0,
"step": 1760
},
{
"epoch": 1.13248,
"grad_norm": 0.8687778972426285,
"learning_rate": 3.578926759669653e-06,
"loss": 1.046,
"num_tokens": 213458553.0,
"step": 1770
},
{
"epoch": 1.13888,
"grad_norm": 0.8146069292073773,
"learning_rate": 3.5633230585153093e-06,
"loss": 1.0587,
"num_tokens": 214667929.0,
"step": 1780
},
{
"epoch": 1.14528,
"grad_norm": 0.8442869654702855,
"learning_rate": 3.5476686206467465e-06,
"loss": 1.0476,
"num_tokens": 215872854.0,
"step": 1790
},
{
"epoch": 1.15168,
"grad_norm": 0.8166732673631207,
"learning_rate": 3.531964193019214e-06,
"loss": 1.0486,
"num_tokens": 217084577.0,
"step": 1800
},
{
"epoch": 1.15808,
"grad_norm": 0.8407184177973456,
"learning_rate": 3.5162105249732336e-06,
"loss": 1.0446,
"num_tokens": 218284006.0,
"step": 1810
},
{
"epoch": 1.16448,
"grad_norm": 0.7814422822824459,
"learning_rate": 3.5004083681988476e-06,
"loss": 1.0466,
"num_tokens": 219487469.0,
"step": 1820
},
{
"epoch": 1.17088,
"grad_norm": 0.7953904441180448,
"learning_rate": 3.484558476699748e-06,
"loss": 1.0539,
"num_tokens": 220690881.0,
"step": 1830
},
{
"epoch": 1.17728,
"grad_norm": 0.8120616693504964,
"learning_rate": 3.468661606757301e-06,
"loss": 1.0564,
"num_tokens": 221898060.0,
"step": 1840
},
{
"epoch": 1.18368,
"grad_norm": 0.7894301070451438,
"learning_rate": 3.45271851689446e-06,
"loss": 1.0576,
"num_tokens": 223099219.0,
"step": 1850
},
{
"epoch": 1.19008,
"grad_norm": 0.8628648936847306,
"learning_rate": 3.436729967839575e-06,
"loss": 1.0697,
"num_tokens": 224314472.0,
"step": 1860
},
{
"epoch": 1.19648,
"grad_norm": 0.8485241964897267,
"learning_rate": 3.4206967224900885e-06,
"loss": 1.0583,
"num_tokens": 225513940.0,
"step": 1870
},
{
"epoch": 1.20288,
"grad_norm": 0.8019635872502272,
"learning_rate": 3.40461954587614e-06,
"loss": 1.0484,
"num_tokens": 226733560.0,
"step": 1880
},
{
"epoch": 1.20928,
"grad_norm": 0.8148504625626072,
"learning_rate": 3.3884992051240613e-06,
"loss": 1.049,
"num_tokens": 227946861.0,
"step": 1890
},
{
"epoch": 1.21568,
"grad_norm": 0.799348761407277,
"learning_rate": 3.372336469419767e-06,
"loss": 1.0636,
"num_tokens": 229149854.0,
"step": 1900
},
{
"epoch": 1.22208,
"grad_norm": 0.8121058069211242,
"learning_rate": 3.35613210997206e-06,
"loss": 1.0679,
"num_tokens": 230358777.0,
"step": 1910
},
{
"epoch": 1.22848,
"grad_norm": 0.8225529513521229,
"learning_rate": 3.339886899975831e-06,
"loss": 1.0455,
"num_tokens": 231573319.0,
"step": 1920
},
{
"epoch": 1.23488,
"grad_norm": 0.7930056234558618,
"learning_rate": 3.3236016145751616e-06,
"loss": 1.0453,
"num_tokens": 232778798.0,
"step": 1930
},
{
"epoch": 1.24128,
"grad_norm": 0.7824523425714454,
"learning_rate": 3.307277030826342e-06,
"loss": 1.046,
"num_tokens": 233985281.0,
"step": 1940
},
{
"epoch": 1.24768,
"grad_norm": 1.126385656615945,
"learning_rate": 3.290913927660793e-06,
"loss": 1.0418,
"num_tokens": 235194572.0,
"step": 1950
},
{
"epoch": 1.25408,
"grad_norm": 0.8230976427574604,
"learning_rate": 3.274513085847899e-06,
"loss": 1.0596,
"num_tokens": 236400915.0,
"step": 1960
},
{
"epoch": 1.26048,
"grad_norm": 0.7715465448814725,
"learning_rate": 3.2580752879577508e-06,
"loss": 1.0421,
"num_tokens": 237602768.0,
"step": 1970
},
{
"epoch": 1.26688,
"grad_norm": 0.7604905419126253,
"learning_rate": 3.2416013183238105e-06,
"loss": 1.0596,
"num_tokens": 238810127.0,
"step": 1980
},
{
"epoch": 1.27328,
"grad_norm": 0.8091857959210363,
"learning_rate": 3.22509196300548e-06,
"loss": 1.0544,
"num_tokens": 240016518.0,
"step": 1990
},
{
"epoch": 1.27968,
"grad_norm": 0.8428609624878182,
"learning_rate": 3.2085480097506015e-06,
"loss": 1.0517,
"num_tokens": 241224903.0,
"step": 2000
},
{
"epoch": 1.2860800000000001,
"grad_norm": 0.8167440202916451,
"learning_rate": 3.191970247957862e-06,
"loss": 1.0607,
"num_tokens": 242432829.0,
"step": 2010
},
{
"epoch": 1.29248,
"grad_norm": 0.843189559655867,
"learning_rate": 3.1753594686391343e-06,
"loss": 1.0519,
"num_tokens": 243643680.0,
"step": 2020
},
{
"epoch": 1.29888,
"grad_norm": 0.8113193681644453,
"learning_rate": 3.158716464381728e-06,
"loss": 1.0534,
"num_tokens": 244850967.0,
"step": 2030
},
{
"epoch": 1.30528,
"grad_norm": 0.8238038397216464,
"learning_rate": 3.1420420293105753e-06,
"loss": 1.0537,
"num_tokens": 246055107.0,
"step": 2040
},
{
"epoch": 1.31168,
"grad_norm": 0.7585161106894139,
"learning_rate": 3.1253369590503357e-06,
"loss": 1.053,
"num_tokens": 247255291.0,
"step": 2050
},
{
"epoch": 1.31808,
"grad_norm": 0.8358837254742888,
"learning_rate": 3.1086020506874352e-06,
"loss": 1.0552,
"num_tokens": 248472347.0,
"step": 2060
},
{
"epoch": 1.3244799999999999,
"grad_norm": 0.8248705338889306,
"learning_rate": 3.091838102732031e-06,
"loss": 1.0547,
"num_tokens": 249675791.0,
"step": 2070
},
{
"epoch": 1.33088,
"grad_norm": 0.8413169777388428,
"learning_rate": 3.0750459150799116e-06,
"loss": 1.0512,
"num_tokens": 250883742.0,
"step": 2080
},
{
"epoch": 1.33728,
"grad_norm": 0.7773274742980588,
"learning_rate": 3.0582262889743304e-06,
"loss": 1.0435,
"num_tokens": 252092991.0,
"step": 2090
},
{
"epoch": 1.34368,
"grad_norm": 0.8160134758509259,
"learning_rate": 3.0413800269677707e-06,
"loss": 1.0617,
"num_tokens": 253296187.0,
"step": 2100
},
{
"epoch": 1.35008,
"grad_norm": 0.8253629381678,
"learning_rate": 3.024507932883659e-06,
"loss": 1.0467,
"num_tokens": 254497531.0,
"step": 2110
},
{
"epoch": 1.35648,
"grad_norm": 0.8449321081656331,
"learning_rate": 3.0076108117779995e-06,
"loss": 1.0501,
"num_tokens": 255698828.0,
"step": 2120
},
{
"epoch": 1.36288,
"grad_norm": 0.864074317535777,
"learning_rate": 2.9906894699009714e-06,
"loss": 1.051,
"num_tokens": 256901786.0,
"step": 2130
},
{
"epoch": 1.36928,
"grad_norm": 0.8545075997582061,
"learning_rate": 2.973744714658452e-06,
"loss": 1.045,
"num_tokens": 258102803.0,
"step": 2140
},
{
"epoch": 1.37568,
"grad_norm": 0.7950948333995521,
"learning_rate": 2.9567773545734917e-06,
"loss": 1.0609,
"num_tokens": 259309237.0,
"step": 2150
},
{
"epoch": 1.38208,
"grad_norm": 0.7772992222068908,
"learning_rate": 2.9397881992477388e-06,
"loss": 1.0529,
"num_tokens": 260512534.0,
"step": 2160
},
{
"epoch": 1.38848,
"grad_norm": 0.8230701809627932,
"learning_rate": 2.9227780593228063e-06,
"loss": 1.0492,
"num_tokens": 261721309.0,
"step": 2170
},
{
"epoch": 1.3948800000000001,
"grad_norm": 0.803410117521878,
"learning_rate": 2.90574774644159e-06,
"loss": 1.0341,
"num_tokens": 262926754.0,
"step": 2180
},
{
"epoch": 1.40128,
"grad_norm": 0.9047895349858696,
"learning_rate": 2.8886980732095467e-06,
"loss": 1.0304,
"num_tokens": 264129158.0,
"step": 2190
},
{
"epoch": 1.40768,
"grad_norm": 0.8048555076981502,
"learning_rate": 2.8716298531559133e-06,
"loss": 1.0494,
"num_tokens": 265332827.0,
"step": 2200
},
{
"epoch": 1.41408,
"grad_norm": 0.8364957546359483,
"learning_rate": 2.8545439006948948e-06,
"loss": 1.0423,
"num_tokens": 266542306.0,
"step": 2210
},
{
"epoch": 1.42048,
"grad_norm": 0.7904212151138658,
"learning_rate": 2.8374410310868044e-06,
"loss": 1.0423,
"num_tokens": 267751752.0,
"step": 2220
},
{
"epoch": 1.42688,
"grad_norm": 0.8434192039931359,
"learning_rate": 2.820322060399156e-06,
"loss": 1.0471,
"num_tokens": 268955655.0,
"step": 2230
},
{
"epoch": 1.4332799999999999,
"grad_norm": 0.7746642379992007,
"learning_rate": 2.803187805467733e-06,
"loss": 1.0574,
"num_tokens": 270165303.0,
"step": 2240
},
{
"epoch": 1.43968,
"grad_norm": 0.8462146853078769,
"learning_rate": 2.7860390838576125e-06,
"loss": 1.0579,
"num_tokens": 271371057.0,
"step": 2250
},
{
"epoch": 1.44608,
"grad_norm": 0.7814911330812998,
"learning_rate": 2.7688767138241474e-06,
"loss": 1.0374,
"num_tokens": 272570562.0,
"step": 2260
},
{
"epoch": 1.45248,
"grad_norm": 0.7648342437809393,
"learning_rate": 2.7517015142739335e-06,
"loss": 1.0551,
"num_tokens": 273773102.0,
"step": 2270
},
{
"epoch": 1.45888,
"grad_norm": 0.8135139786141086,
"learning_rate": 2.734514304725727e-06,
"loss": 1.0431,
"num_tokens": 274979458.0,
"step": 2280
},
{
"epoch": 1.46528,
"grad_norm": 0.8275244446318913,
"learning_rate": 2.717315905271344e-06,
"loss": 1.0436,
"num_tokens": 276180959.0,
"step": 2290
},
{
"epoch": 1.47168,
"grad_norm": 0.8456585906125247,
"learning_rate": 2.700107136536533e-06,
"loss": 1.0571,
"num_tokens": 277381104.0,
"step": 2300
},
{
"epoch": 1.47808,
"grad_norm": 0.7676272425904394,
"learning_rate": 2.682888819641809e-06,
"loss": 1.0454,
"num_tokens": 278589355.0,
"step": 2310
},
{
"epoch": 1.48448,
"grad_norm": 0.7530507207913718,
"learning_rate": 2.6656617761632863e-06,
"loss": 1.0452,
"num_tokens": 279802576.0,
"step": 2320
},
{
"epoch": 1.49088,
"grad_norm": 0.8099596670334043,
"learning_rate": 2.6484268280934674e-06,
"loss": 1.0441,
"num_tokens": 281010541.0,
"step": 2330
},
{
"epoch": 1.49728,
"grad_norm": 0.8098629796138991,
"learning_rate": 2.631184797802022e-06,
"loss": 1.0379,
"num_tokens": 282219974.0,
"step": 2340
},
{
"epoch": 1.5036800000000001,
"grad_norm": 0.8633758780871927,
"learning_rate": 2.613936507996554e-06,
"loss": 1.0553,
"num_tokens": 283423505.0,
"step": 2350
},
{
"epoch": 1.5100799999999999,
"grad_norm": 0.8494557884878244,
"learning_rate": 2.5966827816833393e-06,
"loss": 1.034,
"num_tokens": 284628594.0,
"step": 2360
},
{
"epoch": 1.51648,
"grad_norm": 0.8961874351947472,
"learning_rate": 2.579424442128057e-06,
"loss": 1.0403,
"num_tokens": 285839496.0,
"step": 2370
},
{
"epoch": 1.52288,
"grad_norm": 0.8982519210357097,
"learning_rate": 2.562162312816511e-06,
"loss": 1.0516,
"num_tokens": 287048432.0,
"step": 2380
},
{
"epoch": 1.52928,
"grad_norm": 0.834174589328149,
"learning_rate": 2.544897217415332e-06,
"loss": 1.0371,
"num_tokens": 288256611.0,
"step": 2390
},
{
"epoch": 1.5356800000000002,
"grad_norm": 0.7790317392375281,
"learning_rate": 2.5276299797326777e-06,
"loss": 1.0347,
"num_tokens": 289465699.0,
"step": 2400
},
{
"epoch": 1.54208,
"grad_norm": 0.8113176021935586,
"learning_rate": 2.510361423678929e-06,
"loss": 1.035,
"num_tokens": 290666618.0,
"step": 2410
},
{
"epoch": 1.54848,
"grad_norm": 0.8175298566784388,
"learning_rate": 2.4930923732273683e-06,
"loss": 1.0364,
"num_tokens": 291864705.0,
"step": 2420
},
{
"epoch": 1.55488,
"grad_norm": 0.8601137215701125,
"learning_rate": 2.4758236523748734e-06,
"loss": 1.041,
"num_tokens": 293077992.0,
"step": 2430
},
{
"epoch": 1.56128,
"grad_norm": 0.766342647676912,
"learning_rate": 2.4585560851025917e-06,
"loss": 1.0448,
"num_tokens": 294292270.0,
"step": 2440
},
{
"epoch": 1.56768,
"grad_norm": 0.8144040865702195,
"learning_rate": 2.4412904953366263e-06,
"loss": 1.0626,
"num_tokens": 295501196.0,
"step": 2450
},
{
"epoch": 1.57408,
"grad_norm": 0.8426321262317878,
"learning_rate": 2.424027706908728e-06,
"loss": 1.0361,
"num_tokens": 296713375.0,
"step": 2460
},
{
"epoch": 1.58048,
"grad_norm": 0.870533748148585,
"learning_rate": 2.406768543516977e-06,
"loss": 1.041,
"num_tokens": 297925333.0,
"step": 2470
},
{
"epoch": 1.5868799999999998,
"grad_norm": 0.813316442312155,
"learning_rate": 2.389513828686485e-06,
"loss": 1.0337,
"num_tokens": 299126955.0,
"step": 2480
},
{
"epoch": 1.59328,
"grad_norm": 0.8050560504469045,
"learning_rate": 2.372264385730099e-06,
"loss": 1.0432,
"num_tokens": 300336458.0,
"step": 2490
},
{
"epoch": 1.59968,
"grad_norm": 0.8007073397832749,
"learning_rate": 2.355021037709118e-06,
"loss": 1.0571,
"num_tokens": 301539282.0,
"step": 2500
},
{
"epoch": 1.60608,
"grad_norm": 0.8259619776886131,
"learning_rate": 2.3377846073940207e-06,
"loss": 1.0478,
"num_tokens": 302743922.0,
"step": 2510
},
{
"epoch": 1.6124800000000001,
"grad_norm": 0.7857263898091816,
"learning_rate": 2.3205559172252052e-06,
"loss": 1.0265,
"num_tokens": 303945412.0,
"step": 2520
},
{
"epoch": 1.6188799999999999,
"grad_norm": 0.7830231024473471,
"learning_rate": 2.303335789273744e-06,
"loss": 1.0424,
"num_tokens": 305146555.0,
"step": 2530
},
{
"epoch": 1.62528,
"grad_norm": 0.773313259484951,
"learning_rate": 2.286125045202164e-06,
"loss": 1.0435,
"num_tokens": 306362219.0,
"step": 2540
},
{
"epoch": 1.63168,
"grad_norm": 0.8201327055565161,
"learning_rate": 2.2689245062252398e-06,
"loss": 1.0509,
"num_tokens": 307565244.0,
"step": 2550
},
{
"epoch": 1.63808,
"grad_norm": 0.827602816998628,
"learning_rate": 2.2517349930708032e-06,
"loss": 1.049,
"num_tokens": 308770918.0,
"step": 2560
},
{
"epoch": 1.6444800000000002,
"grad_norm": 0.7919141547822656,
"learning_rate": 2.234557325940589e-06,
"loss": 1.0431,
"num_tokens": 309984868.0,
"step": 2570
},
{
"epoch": 1.65088,
"grad_norm": 0.7394357208064606,
"learning_rate": 2.2173923244710954e-06,
"loss": 1.0312,
"num_tokens": 311187334.0,
"step": 2580
},
{
"epoch": 1.65728,
"grad_norm": 0.785327584034165,
"learning_rate": 2.200240807694474e-06,
"loss": 1.0353,
"num_tokens": 312396234.0,
"step": 2590
},
{
"epoch": 1.66368,
"grad_norm": 0.8232141872243898,
"learning_rate": 2.1831035939994554e-06,
"loss": 1.0562,
"num_tokens": 313601855.0,
"step": 2600
},
{
"epoch": 1.67008,
"grad_norm": 0.7833896049344754,
"learning_rate": 2.165981501092291e-06,
"loss": 1.0407,
"num_tokens": 314804262.0,
"step": 2610
},
{
"epoch": 1.67648,
"grad_norm": 0.7885429615611813,
"learning_rate": 2.148875345957741e-06,
"loss": 1.0295,
"num_tokens": 316005948.0,
"step": 2620
},
{
"epoch": 1.68288,
"grad_norm": 0.7829739281596803,
"learning_rate": 2.131785944820092e-06,
"loss": 1.0252,
"num_tokens": 317208803.0,
"step": 2630
},
{
"epoch": 1.6892800000000001,
"grad_norm": 0.7928770034373539,
"learning_rate": 2.114714113104211e-06,
"loss": 1.0498,
"num_tokens": 318416652.0,
"step": 2640
},
{
"epoch": 1.6956799999999999,
"grad_norm": 0.790850427449215,
"learning_rate": 2.097660665396632e-06,
"loss": 1.0421,
"num_tokens": 319628095.0,
"step": 2650
},
{
"epoch": 1.70208,
"grad_norm": 0.8023551277637352,
"learning_rate": 2.0806264154066946e-06,
"loss": 1.0393,
"num_tokens": 320828695.0,
"step": 2660
},
{
"epoch": 1.70848,
"grad_norm": 0.7922577515769408,
"learning_rate": 2.0636121759277135e-06,
"loss": 1.0485,
"num_tokens": 322041475.0,
"step": 2670
},
{
"epoch": 1.71488,
"grad_norm": 0.7971244397123712,
"learning_rate": 2.046618758798197e-06,
"loss": 1.0275,
"num_tokens": 323243099.0,
"step": 2680
},
{
"epoch": 1.7212800000000001,
"grad_norm": 0.8040701855401029,
"learning_rate": 2.0296469748631113e-06,
"loss": 1.0238,
"num_tokens": 324448570.0,
"step": 2690
},
{
"epoch": 1.7276799999999999,
"grad_norm": 0.7586132016898348,
"learning_rate": 2.0126976339351883e-06,
"loss": 1.0345,
"num_tokens": 325656124.0,
"step": 2700
},
{
"epoch": 1.73408,
"grad_norm": 0.7741130670086324,
"learning_rate": 1.995771544756287e-06,
"loss": 1.0304,
"num_tokens": 326867457.0,
"step": 2710
},
{
"epoch": 1.74048,
"grad_norm": 0.7603630468965715,
"learning_rate": 1.9788695149588027e-06,
"loss": 1.0348,
"num_tokens": 328069419.0,
"step": 2720
},
{
"epoch": 1.74688,
"grad_norm": 0.7656701861871694,
"learning_rate": 1.9619923510271333e-06,
"loss": 1.0337,
"num_tokens": 329274913.0,
"step": 2730
},
{
"epoch": 1.75328,
"grad_norm": 0.7795354061202655,
"learning_rate": 1.945140858259195e-06,
"loss": 1.0467,
"num_tokens": 330497463.0,
"step": 2740
},
{
"epoch": 1.75968,
"grad_norm": 0.8511581572833524,
"learning_rate": 1.928315840727998e-06,
"loss": 1.0292,
"num_tokens": 331705026.0,
"step": 2750
},
{
"epoch": 1.76608,
"grad_norm": 0.8185264208105538,
"learning_rate": 1.9115181012432795e-06,
"loss": 1.0462,
"num_tokens": 332910224.0,
"step": 2760
},
{
"epoch": 1.77248,
"grad_norm": 0.8581339452377109,
"learning_rate": 1.8947484413131996e-06,
"loss": 1.0344,
"num_tokens": 334124736.0,
"step": 2770
},
{
"epoch": 1.77888,
"grad_norm": 0.8469198844835426,
"learning_rate": 1.8780076611060962e-06,
"loss": 1.031,
"num_tokens": 335328630.0,
"step": 2780
},
{
"epoch": 1.78528,
"grad_norm": 0.8097233001009885,
"learning_rate": 1.861296559412303e-06,
"loss": 1.0268,
"num_tokens": 336532418.0,
"step": 2790
},
{
"epoch": 1.79168,
"grad_norm": 0.8477425454150115,
"learning_rate": 1.844615933606037e-06,
"loss": 1.0311,
"num_tokens": 337730246.0,
"step": 2800
},
{
"epoch": 1.7980800000000001,
"grad_norm": 0.7749925952377877,
"learning_rate": 1.8279665796073498e-06,
"loss": 1.0415,
"num_tokens": 338937460.0,
"step": 2810
},
{
"epoch": 1.8044799999999999,
"grad_norm": 0.7976261215266267,
"learning_rate": 1.8113492918441523e-06,
"loss": 1.047,
"num_tokens": 340147641.0,
"step": 2820
},
{
"epoch": 1.81088,
"grad_norm": 0.7733887224457893,
"learning_rate": 1.7947648632143075e-06,
"loss": 1.0309,
"num_tokens": 341352040.0,
"step": 2830
},
{
"epoch": 1.81728,
"grad_norm": 0.7739175808490624,
"learning_rate": 1.7782140850477967e-06,
"loss": 1.0518,
"num_tokens": 342559891.0,
"step": 2840
},
{
"epoch": 1.82368,
"grad_norm": 0.797265127895327,
"learning_rate": 1.7616977470689605e-06,
"loss": 1.0325,
"num_tokens": 343774370.0,
"step": 2850
},
{
"epoch": 1.8300800000000002,
"grad_norm": 0.8443750617770532,
"learning_rate": 1.7452166373588185e-06,
"loss": 1.021,
"num_tokens": 344970302.0,
"step": 2860
},
{
"epoch": 1.83648,
"grad_norm": 0.8003604596330827,
"learning_rate": 1.7287715423174662e-06,
"loss": 1.0304,
"num_tokens": 346180457.0,
"step": 2870
},
{
"epoch": 1.84288,
"grad_norm": 0.8376385879621375,
"learning_rate": 1.7123632466265483e-06,
"loss": 1.0395,
"num_tokens": 347385193.0,
"step": 2880
},
{
"epoch": 1.84928,
"grad_norm": 0.7906644473344662,
"learning_rate": 1.69599253321182e-06,
"loss": 1.0413,
"num_tokens": 348601710.0,
"step": 2890
},
{
"epoch": 1.85568,
"grad_norm": 0.7924809016265382,
"learning_rate": 1.6796601832057905e-06,
"loss": 1.0378,
"num_tokens": 349806167.0,
"step": 2900
},
{
"epoch": 1.86208,
"grad_norm": 0.7766495775123572,
"learning_rate": 1.6633669759104488e-06,
"loss": 1.0264,
"num_tokens": 351012043.0,
"step": 2910
},
{
"epoch": 1.86848,
"grad_norm": 1.3435506252779292,
"learning_rate": 1.6471136887600805e-06,
"loss": 1.0237,
"num_tokens": 352217587.0,
"step": 2920
},
{
"epoch": 1.87488,
"grad_norm": 0.765607343549468,
"learning_rate": 1.6309010972841728e-06,
"loss": 1.0382,
"num_tokens": 353418821.0,
"step": 2930
},
{
"epoch": 1.8812799999999998,
"grad_norm": 0.8171820174646456,
"learning_rate": 1.614729975070407e-06,
"loss": 1.0366,
"num_tokens": 354624890.0,
"step": 2940
},
{
"epoch": 1.88768,
"grad_norm": 0.8064241532835642,
"learning_rate": 1.598601093727749e-06,
"loss": 1.0361,
"num_tokens": 355824991.0,
"step": 2950
},
{
"epoch": 1.89408,
"grad_norm": 0.7884619306846271,
"learning_rate": 1.5825152228496342e-06,
"loss": 1.0425,
"num_tokens": 357030616.0,
"step": 2960
},
{
"epoch": 1.90048,
"grad_norm": 0.8265648248850005,
"learning_rate": 1.5664731299772401e-06,
"loss": 1.0332,
"num_tokens": 358234522.0,
"step": 2970
},
{
"epoch": 1.9068800000000001,
"grad_norm": 0.8092024559268799,
"learning_rate": 1.5504755805628677e-06,
"loss": 1.0399,
"num_tokens": 359443389.0,
"step": 2980
},
{
"epoch": 1.9132799999999999,
"grad_norm": 0.791864238644019,
"learning_rate": 1.5345233379334156e-06,
"loss": 1.0289,
"num_tokens": 360644258.0,
"step": 2990
},
{
"epoch": 1.91968,
"grad_norm": 0.8006538523086424,
"learning_rate": 1.5186171632539587e-06,
"loss": 1.0392,
"num_tokens": 361848281.0,
"step": 3000
},
{
"epoch": 1.92608,
"grad_norm": 0.7852026214667117,
"learning_rate": 1.502757815491429e-06,
"loss": 1.0301,
"num_tokens": 363051672.0,
"step": 3010
},
{
"epoch": 1.93248,
"grad_norm": 0.7473075275246417,
"learning_rate": 1.4869460513784011e-06,
"loss": 1.0349,
"num_tokens": 364249917.0,
"step": 3020
},
{
"epoch": 1.9388800000000002,
"grad_norm": 0.7822299185363633,
"learning_rate": 1.4711826253769828e-06,
"loss": 1.04,
"num_tokens": 365456248.0,
"step": 3030
},
{
"epoch": 1.94528,
"grad_norm": 0.8034434681463449,
"learning_rate": 1.4554682896428179e-06,
"loss": 1.0379,
"num_tokens": 366654881.0,
"step": 3040
},
{
"epoch": 1.95168,
"grad_norm": 0.7768199970864885,
"learning_rate": 1.439803793989198e-06,
"loss": 1.0241,
"num_tokens": 367861348.0,
"step": 3050
},
{
"epoch": 1.95808,
"grad_norm": 0.8118112910224361,
"learning_rate": 1.4241898858512824e-06,
"loss": 1.0426,
"num_tokens": 369064003.0,
"step": 3060
},
{
"epoch": 1.96448,
"grad_norm": 0.7744113528953481,
"learning_rate": 1.408627310250434e-06,
"loss": 1.0414,
"num_tokens": 370279324.0,
"step": 3070
},
{
"epoch": 1.97088,
"grad_norm": 0.7887556630257991,
"learning_rate": 1.3931168097586717e-06,
"loss": 1.0336,
"num_tokens": 371480368.0,
"step": 3080
},
{
"epoch": 1.97728,
"grad_norm": 0.7640435636356337,
"learning_rate": 1.377659124463239e-06,
"loss": 1.042,
"num_tokens": 372690129.0,
"step": 3090
},
{
"epoch": 1.98368,
"grad_norm": 0.7603826553278634,
"learning_rate": 1.3622549919312902e-06,
"loss": 1.0361,
"num_tokens": 373902924.0,
"step": 3100
},
{
"epoch": 1.9900799999999998,
"grad_norm": 0.7599088525071184,
"learning_rate": 1.346905147174694e-06,
"loss": 1.0193,
"num_tokens": 375112585.0,
"step": 3110
},
{
"epoch": 1.99648,
"grad_norm": 0.7816099568186937,
"learning_rate": 1.3316103226149682e-06,
"loss": 1.0349,
"num_tokens": 376325844.0,
"step": 3120
},
{
"epoch": 2.00256,
"grad_norm": 0.7532423548597259,
"learning_rate": 1.3163712480483255e-06,
"loss": 1.0248,
"num_tokens": 377473897.0,
"step": 3130
},
{
"epoch": 2.00896,
"grad_norm": 0.7586660186977321,
"learning_rate": 1.3011886506108578e-06,
"loss": 1.0107,
"num_tokens": 378675832.0,
"step": 3140
},
{
"epoch": 2.01536,
"grad_norm": 0.7958518507428463,
"learning_rate": 1.2860632547438334e-06,
"loss": 1.0029,
"num_tokens": 379872472.0,
"step": 3150
},
{
"epoch": 2.02176,
"grad_norm": 0.8017956552207596,
"learning_rate": 1.2709957821591384e-06,
"loss": 1.0188,
"num_tokens": 381071848.0,
"step": 3160
},
{
"epoch": 2.02816,
"grad_norm": 0.8260326835110341,
"learning_rate": 1.2559869518048307e-06,
"loss": 1.0134,
"num_tokens": 382272368.0,
"step": 3170
},
{
"epoch": 2.03456,
"grad_norm": 0.845928507883109,
"learning_rate": 1.2410374798308442e-06,
"loss": 1.0107,
"num_tokens": 383480338.0,
"step": 3180
},
{
"epoch": 2.04096,
"grad_norm": 0.8513825857009242,
"learning_rate": 1.2261480795548123e-06,
"loss": 1.0099,
"num_tokens": 384683907.0,
"step": 3190
},
{
"epoch": 2.04736,
"grad_norm": 0.7711891823020852,
"learning_rate": 1.211319461428032e-06,
"loss": 1.0139,
"num_tokens": 385889491.0,
"step": 3200
},
{
"epoch": 2.05376,
"grad_norm": 0.7769167344105451,
"learning_rate": 1.1965523330015652e-06,
"loss": 1.0092,
"num_tokens": 387095853.0,
"step": 3210
},
{
"epoch": 2.06016,
"grad_norm": 0.7922783527359497,
"learning_rate": 1.1818473988924797e-06,
"loss": 1.0199,
"num_tokens": 388306034.0,
"step": 3220
},
{
"epoch": 2.06656,
"grad_norm": 0.8009332691587518,
"learning_rate": 1.167205360750227e-06,
"loss": 1.0185,
"num_tokens": 389516647.0,
"step": 3230
},
{
"epoch": 2.07296,
"grad_norm": 0.7591186989087252,
"learning_rate": 1.1526269172231594e-06,
"loss": 0.995,
"num_tokens": 390724121.0,
"step": 3240
},
{
"epoch": 2.07936,
"grad_norm": 0.8055729406106343,
"learning_rate": 1.1381127639252005e-06,
"loss": 1.0109,
"num_tokens": 391924857.0,
"step": 3250
},
{
"epoch": 2.08576,
"grad_norm": 0.7920326568899239,
"learning_rate": 1.1236635934026474e-06,
"loss": 0.9928,
"num_tokens": 393133226.0,
"step": 3260
},
{
"epoch": 2.09216,
"grad_norm": 0.8095321364071963,
"learning_rate": 1.1092800951011283e-06,
"loss": 1.0066,
"num_tokens": 394338791.0,
"step": 3270
},
{
"epoch": 2.09856,
"grad_norm": 0.7790939177959936,
"learning_rate": 1.0949629553327106e-06,
"loss": 1.0144,
"num_tokens": 395544646.0,
"step": 3280
},
{
"epoch": 2.10496,
"grad_norm": 0.9934552993460479,
"learning_rate": 1.080712857243143e-06,
"loss": 1.0004,
"num_tokens": 396744920.0,
"step": 3290
},
{
"epoch": 2.11136,
"grad_norm": 0.7949729572040324,
"learning_rate": 1.0665304807792653e-06,
"loss": 1.009,
"num_tokens": 397964288.0,
"step": 3300
},
{
"epoch": 2.11776,
"grad_norm": 0.7652291996158113,
"learning_rate": 1.0524165026565655e-06,
"loss": 1.007,
"num_tokens": 399168969.0,
"step": 3310
},
{
"epoch": 2.12416,
"grad_norm": 0.7761816653258836,
"learning_rate": 1.0383715963268884e-06,
"loss": 0.994,
"num_tokens": 400373422.0,
"step": 3320
},
{
"epoch": 2.13056,
"grad_norm": 0.8018173213180155,
"learning_rate": 1.0243964319462997e-06,
"loss": 1.0134,
"num_tokens": 401577043.0,
"step": 3330
},
{
"epoch": 2.13696,
"grad_norm": 0.7607821844421783,
"learning_rate": 1.0104916763431133e-06,
"loss": 1.0187,
"num_tokens": 402777527.0,
"step": 3340
},
{
"epoch": 2.14336,
"grad_norm": 0.8327193810047873,
"learning_rate": 9.966579929860704e-07,
"loss": 1.0249,
"num_tokens": 403989663.0,
"step": 3350
},
{
"epoch": 2.14976,
"grad_norm": 0.7706122440471653,
"learning_rate": 9.828960419526818e-07,
"loss": 1.0085,
"num_tokens": 405198202.0,
"step": 3360
},
{
"epoch": 2.15616,
"grad_norm": 0.7867703490032154,
"learning_rate": 9.69206479897736e-07,
"loss": 1.0197,
"num_tokens": 406403598.0,
"step": 3370
},
{
"epoch": 2.16256,
"grad_norm": 0.8249023880860281,
"learning_rate": 9.555899600219634e-07,
"loss": 1.0274,
"num_tokens": 407600213.0,
"step": 3380
},
{
"epoch": 2.16896,
"grad_norm": 0.7855210183667297,
"learning_rate": 9.420471320408669e-07,
"loss": 1.0127,
"num_tokens": 408811259.0,
"step": 3390
},
{
"epoch": 2.17536,
"grad_norm": 0.8119007655119803,
"learning_rate": 9.28578642153726e-07,
"loss": 1.0021,
"num_tokens": 410014132.0,
"step": 3400
},
{
"epoch": 2.18176,
"grad_norm": 0.9205406972397864,
"learning_rate": 9.151851330127593e-07,
"loss": 1.0126,
"num_tokens": 411220727.0,
"step": 3410
},
{
"epoch": 2.18816,
"grad_norm": 0.8056010049273263,
"learning_rate": 9.018672436924605e-07,
"loss": 0.9892,
"num_tokens": 412425755.0,
"step": 3420
},
{
"epoch": 2.19456,
"grad_norm": 0.7632161903493846,
"learning_rate": 8.886256096591048e-07,
"loss": 1.019,
"num_tokens": 413631347.0,
"step": 3430
},
{
"epoch": 2.20096,
"grad_norm": 0.7669268826101938,
"learning_rate": 8.754608627404307e-07,
"loss": 1.0048,
"num_tokens": 414833259.0,
"step": 3440
},
{
"epoch": 2.20736,
"grad_norm": 0.8361832533605145,
"learning_rate": 8.623736310954869e-07,
"loss": 1.0221,
"num_tokens": 416040472.0,
"step": 3450
},
{
"epoch": 2.21376,
"grad_norm": 0.7814096537064951,
"learning_rate": 8.493645391846642e-07,
"loss": 1.0037,
"num_tokens": 417245756.0,
"step": 3460
},
{
"epoch": 2.22016,
"grad_norm": 0.7933425120808404,
"learning_rate": 8.364342077398971e-07,
"loss": 0.9987,
"num_tokens": 418455436.0,
"step": 3470
},
{
"epoch": 2.22656,
"grad_norm": 0.7659925077465827,
"learning_rate": 8.235832537350441e-07,
"loss": 0.993,
"num_tokens": 419667134.0,
"step": 3480
},
{
"epoch": 2.23296,
"grad_norm": 0.8187051274632632,
"learning_rate": 8.108122903564502e-07,
"loss": 1.0028,
"num_tokens": 420870725.0,
"step": 3490
},
{
"epoch": 2.23936,
"grad_norm": 0.7595169446678035,
"learning_rate": 7.98121926973692e-07,
"loss": 1.0124,
"num_tokens": 422076634.0,
"step": 3500
},
{
"epoch": 2.24576,
"grad_norm": 0.8064753048978947,
"learning_rate": 7.855127691104944e-07,
"loss": 1.024,
"num_tokens": 423284867.0,
"step": 3510
},
{
"epoch": 2.25216,
"grad_norm": 0.809858814713402,
"learning_rate": 7.729854184158411e-07,
"loss": 1.0174,
"num_tokens": 424493379.0,
"step": 3520
},
{
"epoch": 2.25856,
"grad_norm": 0.7957945935555317,
"learning_rate": 7.605404726352708e-07,
"loss": 1.0149,
"num_tokens": 425697729.0,
"step": 3530
},
{
"epoch": 2.26496,
"grad_norm": 0.8194656210162423,
"learning_rate": 7.481785255823482e-07,
"loss": 0.9972,
"num_tokens": 426893908.0,
"step": 3540
},
{
"epoch": 2.27136,
"grad_norm": 0.7967423955163617,
"learning_rate": 7.359001671103361e-07,
"loss": 1.0106,
"num_tokens": 428092842.0,
"step": 3550
},
{
"epoch": 2.27776,
"grad_norm": 0.7881164663338793,
"learning_rate": 7.237059830840482e-07,
"loss": 1.0066,
"num_tokens": 429286773.0,
"step": 3560
},
{
"epoch": 2.28416,
"grad_norm": 0.7903923247778172,
"learning_rate": 7.11596555351893e-07,
"loss": 1.0111,
"num_tokens": 430493341.0,
"step": 3570
},
{
"epoch": 2.29056,
"grad_norm": 0.770776011448775,
"learning_rate": 6.995724617181124e-07,
"loss": 0.9923,
"num_tokens": 431693370.0,
"step": 3580
},
{
"epoch": 2.29696,
"grad_norm": 0.7817336774071154,
"learning_rate": 6.876342759152121e-07,
"loss": 1.0162,
"num_tokens": 432901215.0,
"step": 3590
},
{
"epoch": 2.30336,
"grad_norm": 0.7799297164560258,
"learning_rate": 6.757825675765862e-07,
"loss": 1.0089,
"num_tokens": 434107776.0,
"step": 3600
},
{
"epoch": 2.30976,
"grad_norm": 0.8399066019292479,
"learning_rate": 6.640179022093324e-07,
"loss": 1.0104,
"num_tokens": 435311152.0,
"step": 3610
},
{
"epoch": 2.31616,
"grad_norm": 0.8297592147597433,
"learning_rate": 6.52340841167276e-07,
"loss": 1.0114,
"num_tokens": 436513739.0,
"step": 3620
},
{
"epoch": 2.32256,
"grad_norm": 0.7719279126860086,
"learning_rate": 6.407519416241779e-07,
"loss": 1.0065,
"num_tokens": 437726898.0,
"step": 3630
},
{
"epoch": 2.32896,
"grad_norm": 0.8045844362641281,
"learning_rate": 6.292517565471548e-07,
"loss": 1.0097,
"num_tokens": 438931660.0,
"step": 3640
},
{
"epoch": 2.33536,
"grad_norm": 0.7982553698914577,
"learning_rate": 6.178408346702882e-07,
"loss": 1.0082,
"num_tokens": 440137185.0,
"step": 3650
},
{
"epoch": 2.34176,
"grad_norm": 0.7908405728187465,
"learning_rate": 6.065197204684484e-07,
"loss": 1.0148,
"num_tokens": 441339870.0,
"step": 3660
},
{
"epoch": 2.34816,
"grad_norm": 0.7738211794516375,
"learning_rate": 5.95288954131307e-07,
"loss": 1.015,
"num_tokens": 442548750.0,
"step": 3670
},
{
"epoch": 2.35456,
"grad_norm": 0.7925014240523639,
"learning_rate": 5.841490715375689e-07,
"loss": 1.0146,
"num_tokens": 443760356.0,
"step": 3680
},
{
"epoch": 2.36096,
"grad_norm": 0.7744344940621614,
"learning_rate": 5.731006042293983e-07,
"loss": 1.0195,
"num_tokens": 444963192.0,
"step": 3690
},
{
"epoch": 2.36736,
"grad_norm": 0.809967543772837,
"learning_rate": 5.621440793870564e-07,
"loss": 1.0138,
"num_tokens": 446161734.0,
"step": 3700
},
{
"epoch": 2.37376,
"grad_norm": 0.7634003235889771,
"learning_rate": 5.512800198037477e-07,
"loss": 1.0092,
"num_tokens": 447367385.0,
"step": 3710
},
{
"epoch": 2.38016,
"grad_norm": 0.7694302990943018,
"learning_rate": 5.405089438606759e-07,
"loss": 1.0183,
"num_tokens": 448574222.0,
"step": 3720
},
{
"epoch": 2.3865600000000002,
"grad_norm": 0.7964969360810369,
"learning_rate": 5.298313655023083e-07,
"loss": 1.0146,
"num_tokens": 449787465.0,
"step": 3730
},
{
"epoch": 2.39296,
"grad_norm": 0.7826022145337301,
"learning_rate": 5.192477942118501e-07,
"loss": 1.0059,
"num_tokens": 450993609.0,
"step": 3740
},
{
"epoch": 2.39936,
"grad_norm": 0.7939322826576104,
"learning_rate": 5.087587349869396e-07,
"loss": 1.016,
"num_tokens": 452203974.0,
"step": 3750
},
{
"epoch": 2.40576,
"grad_norm": 0.7880956603422961,
"learning_rate": 4.983646883155479e-07,
"loss": 0.9871,
"num_tokens": 453406872.0,
"step": 3760
},
{
"epoch": 2.41216,
"grad_norm": 0.7870741062813569,
"learning_rate": 4.880661501520977e-07,
"loss": 1.0146,
"num_tokens": 454612112.0,
"step": 3770
},
{
"epoch": 2.41856,
"grad_norm": 0.7757670556350029,
"learning_rate": 4.778636118938052e-07,
"loss": 1.0043,
"num_tokens": 455821550.0,
"step": 3780
},
{
"epoch": 2.42496,
"grad_norm": 0.764980277323769,
"learning_rate": 4.677575603572235e-07,
"loss": 1.0037,
"num_tokens": 457034119.0,
"step": 3790
},
{
"epoch": 2.43136,
"grad_norm": 0.7689487131773513,
"learning_rate": 4.5774847775501977e-07,
"loss": 1.0215,
"num_tokens": 458243443.0,
"step": 3800
},
{
"epoch": 2.43776,
"grad_norm": 0.7835819207262276,
"learning_rate": 4.4783684167296645e-07,
"loss": 1.0107,
"num_tokens": 459449656.0,
"step": 3810
},
{
"epoch": 2.44416,
"grad_norm": 0.7439227301838608,
"learning_rate": 4.38023125047152e-07,
"loss": 1.0163,
"num_tokens": 460660657.0,
"step": 3820
},
{
"epoch": 2.45056,
"grad_norm": 0.8141456247124772,
"learning_rate": 4.283077961414125e-07,
"loss": 1.0073,
"num_tokens": 461868305.0,
"step": 3830
},
{
"epoch": 2.45696,
"grad_norm": 0.7873824030524625,
"learning_rate": 4.186913185249936e-07,
"loss": 1.0161,
"num_tokens": 463067022.0,
"step": 3840
},
{
"epoch": 2.4633599999999998,
"grad_norm": 0.7651257037667265,
"learning_rate": 4.091741510504249e-07,
"loss": 1.0054,
"num_tokens": 464277276.0,
"step": 3850
},
{
"epoch": 2.46976,
"grad_norm": 0.7817592356120844,
"learning_rate": 3.9975674783163e-07,
"loss": 1.0131,
"num_tokens": 465486770.0,
"step": 3860
},
{
"epoch": 2.47616,
"grad_norm": 0.7600628098450863,
"learning_rate": 3.904395582222578e-07,
"loss": 1.0,
"num_tokens": 466688564.0,
"step": 3870
},
{
"epoch": 2.48256,
"grad_norm": 0.7452814104047683,
"learning_rate": 3.81223026794241e-07,
"loss": 0.9948,
"num_tokens": 467893407.0,
"step": 3880
},
{
"epoch": 2.48896,
"grad_norm": 0.7886078128816824,
"learning_rate": 3.721075933165816e-07,
"loss": 1.0255,
"num_tokens": 469103315.0,
"step": 3890
},
{
"epoch": 2.49536,
"grad_norm": 0.7883279810476201,
"learning_rate": 3.630936927343695e-07,
"loss": 0.9955,
"num_tokens": 470304536.0,
"step": 3900
},
{
"epoch": 2.50176,
"grad_norm": 0.7870530493997763,
"learning_rate": 3.541817551480292e-07,
"loss": 1.0106,
"num_tokens": 471516225.0,
"step": 3910
},
{
"epoch": 2.50816,
"grad_norm": 0.7913988775198784,
"learning_rate": 3.4537220579279497e-07,
"loss": 1.0123,
"num_tokens": 472723848.0,
"step": 3920
},
{
"epoch": 2.51456,
"grad_norm": 0.788228042670068,
"learning_rate": 3.366654650184217e-07,
"loss": 1.0076,
"num_tokens": 473927605.0,
"step": 3930
},
{
"epoch": 2.52096,
"grad_norm": 0.7671091431259203,
"learning_rate": 3.2806194826913107e-07,
"loss": 1.0054,
"num_tokens": 475130341.0,
"step": 3940
},
{
"epoch": 2.52736,
"grad_norm": 0.7769242999032523,
"learning_rate": 3.1956206606378186e-07,
"loss": 1.0137,
"num_tokens": 476337471.0,
"step": 3950
},
{
"epoch": 2.53376,
"grad_norm": 0.7761725619806417,
"learning_rate": 3.1116622397628886e-07,
"loss": 1.0139,
"num_tokens": 477546278.0,
"step": 3960
},
{
"epoch": 2.54016,
"grad_norm": 0.8119517968358277,
"learning_rate": 3.0287482261626727e-07,
"loss": 1.0112,
"num_tokens": 478748834.0,
"step": 3970
},
{
"epoch": 2.54656,
"grad_norm": 0.7768387486408453,
"learning_rate": 2.946882576099164e-07,
"loss": 1.0176,
"num_tokens": 479951666.0,
"step": 3980
},
{
"epoch": 2.55296,
"grad_norm": 0.8059661577502851,
"learning_rate": 2.8660691958114384e-07,
"loss": 1.0192,
"num_tokens": 481155740.0,
"step": 3990
},
{
"epoch": 2.55936,
"grad_norm": 0.7923218074076707,
"learning_rate": 2.786311941329298e-07,
"loss": 1.0228,
"num_tokens": 482362569.0,
"step": 4000
},
{
"epoch": 2.56576,
"grad_norm": 0.7737100130087119,
"learning_rate": 2.70761461828922e-07,
"loss": 1.0117,
"num_tokens": 483577083.0,
"step": 4010
},
{
"epoch": 2.5721600000000002,
"grad_norm": 0.8198263737858525,
"learning_rate": 2.629980981752803e-07,
"loss": 1.0027,
"num_tokens": 484785169.0,
"step": 4020
},
{
"epoch": 2.57856,
"grad_norm": 0.7800117950292567,
"learning_rate": 2.5534147360276014e-07,
"loss": 1.0061,
"num_tokens": 485992637.0,
"step": 4030
},
{
"epoch": 2.58496,
"grad_norm": 0.7806994703813391,
"learning_rate": 2.4779195344903447e-07,
"loss": 1.0067,
"num_tokens": 487200371.0,
"step": 4040
},
{
"epoch": 2.59136,
"grad_norm": 0.7967832568550222,
"learning_rate": 2.4034989794126494e-07,
"loss": 1.005,
"num_tokens": 488411438.0,
"step": 4050
},
{
"epoch": 2.59776,
"grad_norm": 0.7613054637393943,
"learning_rate": 2.3301566217891148e-07,
"loss": 1.0057,
"num_tokens": 489619089.0,
"step": 4060
},
{
"epoch": 2.6041600000000003,
"grad_norm": 0.8097260832659626,
"learning_rate": 2.257895961167886e-07,
"loss": 1.0115,
"num_tokens": 490822004.0,
"step": 4070
},
{
"epoch": 2.61056,
"grad_norm": 0.7724807002861569,
"learning_rate": 2.18672044548367e-07,
"loss": 1.013,
"num_tokens": 492031022.0,
"step": 4080
},
{
"epoch": 2.6169599999999997,
"grad_norm": 0.769701738678788,
"learning_rate": 2.1166334708932367e-07,
"loss": 1.0097,
"num_tokens": 493240890.0,
"step": 4090
},
{
"epoch": 2.62336,
"grad_norm": 0.76114972582814,
"learning_rate": 2.0476383816133594e-07,
"loss": 1.0042,
"num_tokens": 494453799.0,
"step": 4100
},
{
"epoch": 2.62976,
"grad_norm": 0.8083057947332605,
"learning_rate": 1.9797384697612277e-07,
"loss": 1.0044,
"num_tokens": 495667359.0,
"step": 4110
},
{
"epoch": 2.63616,
"grad_norm": 0.7707158865091736,
"learning_rate": 1.912936975197388e-07,
"loss": 1.0073,
"num_tokens": 496881814.0,
"step": 4120
},
{
"epoch": 2.64256,
"grad_norm": 0.7676478517895791,
"learning_rate": 1.8472370853711397e-07,
"loss": 1.0187,
"num_tokens": 498083665.0,
"step": 4130
},
{
"epoch": 2.6489599999999998,
"grad_norm": 0.7728314364028435,
"learning_rate": 1.7826419351684553e-07,
"loss": 0.996,
"num_tokens": 499285193.0,
"step": 4140
},
{
"epoch": 2.65536,
"grad_norm": 0.7787493559807903,
"learning_rate": 1.7191546067623772e-07,
"loss": 0.9928,
"num_tokens": 500495522.0,
"step": 4150
},
{
"epoch": 2.66176,
"grad_norm": 0.7740957124528121,
"learning_rate": 1.656778129465983e-07,
"loss": 0.9942,
"num_tokens": 501704772.0,
"step": 4160
},
{
"epoch": 2.66816,
"grad_norm": 0.7834164164129861,
"learning_rate": 1.5955154795878086e-07,
"loss": 1.0018,
"num_tokens": 502908159.0,
"step": 4170
},
{
"epoch": 2.67456,
"grad_norm": 0.7690261436250733,
"learning_rate": 1.5353695802898556e-07,
"loss": 0.9966,
"num_tokens": 504119578.0,
"step": 4180
},
{
"epoch": 2.68096,
"grad_norm": 0.7500003508328252,
"learning_rate": 1.4763433014481105e-07,
"loss": 1.0175,
"num_tokens": 505329761.0,
"step": 4190
},
{
"epoch": 2.68736,
"grad_norm": 0.7619674427912766,
"learning_rate": 1.4184394595155887e-07,
"loss": 1.0084,
"num_tokens": 506541089.0,
"step": 4200
},
{
"epoch": 2.69376,
"grad_norm": 0.7905928509034632,
"learning_rate": 1.3616608173879636e-07,
"loss": 1.0077,
"num_tokens": 507747398.0,
"step": 4210
},
{
"epoch": 2.70016,
"grad_norm": 0.7768455409603942,
"learning_rate": 1.3060100842717388e-07,
"loss": 1.0211,
"num_tokens": 508948926.0,
"step": 4220
},
{
"epoch": 2.70656,
"grad_norm": 0.7650832573151034,
"learning_rate": 1.2514899155549625e-07,
"loss": 1.0033,
"num_tokens": 510157051.0,
"step": 4230
},
{
"epoch": 2.71296,
"grad_norm": 0.7847880941915708,
"learning_rate": 1.1981029126805293e-07,
"loss": 1.0025,
"num_tokens": 511359623.0,
"step": 4240
},
{
"epoch": 2.71936,
"grad_norm": 0.8047407028430222,
"learning_rate": 1.1458516230220651e-07,
"loss": 1.0056,
"num_tokens": 512562364.0,
"step": 4250
},
{
"epoch": 2.72576,
"grad_norm": 0.7894872635799464,
"learning_rate": 1.0947385397623522e-07,
"loss": 1.0062,
"num_tokens": 513767195.0,
"step": 4260
},
{
"epoch": 2.73216,
"grad_norm": 0.7754271372790722,
"learning_rate": 1.0447661017743971e-07,
"loss": 0.997,
"num_tokens": 514974517.0,
"step": 4270
},
{
"epoch": 2.73856,
"grad_norm": 0.7746425365371328,
"learning_rate": 9.959366935050397e-08,
"loss": 0.9987,
"num_tokens": 516179935.0,
"step": 4280
},
{
"epoch": 2.74496,
"grad_norm": 0.7523512554064233,
"learning_rate": 9.482526448611807e-08,
"loss": 1.0042,
"num_tokens": 517387907.0,
"step": 4290
},
{
"epoch": 2.75136,
"grad_norm": 0.7805940920378595,
"learning_rate": 9.017162310986067e-08,
"loss": 1.002,
"num_tokens": 518595813.0,
"step": 4300
},
{
"epoch": 2.75776,
"grad_norm": 0.8110259911998368,
"learning_rate": 8.563296727134435e-08,
"loss": 1.0066,
"num_tokens": 519800375.0,
"step": 4310
},
{
"epoch": 2.76416,
"grad_norm": 0.772256949618178,
"learning_rate": 8.120951353361884e-08,
"loss": 1.0045,
"num_tokens": 521008297.0,
"step": 4320
},
{
"epoch": 2.77056,
"grad_norm": 0.7629770251408482,
"learning_rate": 7.690147296283757e-08,
"loss": 1.0007,
"num_tokens": 522217337.0,
"step": 4330
},
{
"epoch": 2.77696,
"grad_norm": 0.750000751925906,
"learning_rate": 7.270905111818744e-08,
"loss": 1.0044,
"num_tokens": 523427534.0,
"step": 4340
},
{
"epoch": 2.78336,
"grad_norm": 0.7695523347419888,
"learning_rate": 6.863244804208053e-08,
"loss": 1.0185,
"num_tokens": 524629610.0,
"step": 4350
},
{
"epoch": 2.7897600000000002,
"grad_norm": 0.7594496702512009,
"learning_rate": 6.467185825060728e-08,
"loss": 1.0132,
"num_tokens": 525838628.0,
"step": 4360
},
{
"epoch": 2.79616,
"grad_norm": 0.774231464389687,
"learning_rate": 6.082747072425844e-08,
"loss": 0.9923,
"num_tokens": 527047256.0,
"step": 4370
},
{
"epoch": 2.80256,
"grad_norm": 0.7878028776389799,
"learning_rate": 5.709946889890461e-08,
"loss": 0.9989,
"num_tokens": 528251412.0,
"step": 4380
},
{
"epoch": 2.80896,
"grad_norm": 0.7680845271371904,
"learning_rate": 5.348803065704483e-08,
"loss": 0.9971,
"num_tokens": 529460583.0,
"step": 4390
},
{
"epoch": 2.81536,
"grad_norm": 0.7710477876974481,
"learning_rate": 4.999332831931936e-08,
"loss": 1.0097,
"num_tokens": 530666949.0,
"step": 4400
},
{
"epoch": 2.8217600000000003,
"grad_norm": 0.7641864260094089,
"learning_rate": 4.6615528636286545e-08,
"loss": 1.0083,
"num_tokens": 531877350.0,
"step": 4410
},
{
"epoch": 2.82816,
"grad_norm": 0.7798848074760067,
"learning_rate": 4.3354792780467004e-08,
"loss": 1.0145,
"num_tokens": 533089968.0,
"step": 4420
},
{
"epoch": 2.8345599999999997,
"grad_norm": 0.7987639919755114,
"learning_rate": 4.021127633865196e-08,
"loss": 1.0061,
"num_tokens": 534295222.0,
"step": 4430
},
{
"epoch": 2.84096,
"grad_norm": 0.7471470388574258,
"learning_rate": 3.718512930448115e-08,
"loss": 0.9897,
"num_tokens": 535501172.0,
"step": 4440
},
{
"epoch": 2.84736,
"grad_norm": 0.7784643844597081,
"learning_rate": 3.4276496071284084e-08,
"loss": 1.0126,
"num_tokens": 536697925.0,
"step": 4450
},
{
"epoch": 2.85376,
"grad_norm": 0.7972370799678196,
"learning_rate": 3.148551542519196e-08,
"loss": 1.0051,
"num_tokens": 537893496.0,
"step": 4460
},
{
"epoch": 2.86016,
"grad_norm": 0.7691284457736113,
"learning_rate": 2.8812320538514348e-08,
"loss": 1.0098,
"num_tokens": 539102796.0,
"step": 4470
},
{
"epoch": 2.8665599999999998,
"grad_norm": 0.7982125519739797,
"learning_rate": 2.6257038963385106e-08,
"loss": 1.0136,
"num_tokens": 540316296.0,
"step": 4480
},
{
"epoch": 2.87296,
"grad_norm": 0.7728520058232545,
"learning_rate": 2.3819792625675297e-08,
"loss": 1.0149,
"num_tokens": 541533670.0,
"step": 4490
},
{
"epoch": 2.87936,
"grad_norm": 0.7681197599600511,
"learning_rate": 2.1500697819178406e-08,
"loss": 1.0027,
"num_tokens": 542738043.0,
"step": 4500
},
{
"epoch": 2.88576,
"grad_norm": 0.7510549175746628,
"learning_rate": 1.9299865200057556e-08,
"loss": 1.0059,
"num_tokens": 543947829.0,
"step": 4510
},
{
"epoch": 2.89216,
"grad_norm": 0.766881311747473,
"learning_rate": 1.721739978156778e-08,
"loss": 1.0051,
"num_tokens": 545163765.0,
"step": 4520
},
{
"epoch": 2.89856,
"grad_norm": 0.8067610998392601,
"learning_rate": 1.5253400929045036e-08,
"loss": 0.9998,
"num_tokens": 546371420.0,
"step": 4530
},
{
"epoch": 2.90496,
"grad_norm": 0.7674069412891232,
"learning_rate": 1.3407962355164728e-08,
"loss": 1.0164,
"num_tokens": 547577921.0,
"step": 4540
},
{
"epoch": 2.91136,
"grad_norm": 0.775004069541473,
"learning_rate": 1.1681172115469986e-08,
"loss": 1.0034,
"num_tokens": 548783680.0,
"step": 4550
},
{
"epoch": 2.91776,
"grad_norm": 0.7833232261400477,
"learning_rate": 1.007311260417032e-08,
"loss": 0.9956,
"num_tokens": 549988634.0,
"step": 4560
},
{
"epoch": 2.92416,
"grad_norm": 0.7826676344415344,
"learning_rate": 8.583860550210043e-09,
"loss": 1.0098,
"num_tokens": 551189799.0,
"step": 4570
},
{
"epoch": 2.93056,
"grad_norm": 0.744986247926951,
"learning_rate": 7.213487013607856e-09,
"loss": 1.0035,
"num_tokens": 552397598.0,
"step": 4580
},
{
"epoch": 2.93696,
"grad_norm": 0.7521106648563647,
"learning_rate": 5.96205738206429e-09,
"loss": 1.0043,
"num_tokens": 553610771.0,
"step": 4590
},
{
"epoch": 2.94336,
"grad_norm": 0.8150061917429959,
"learning_rate": 4.829631367844201e-09,
"loss": 1.0,
"num_tokens": 554824637.0,
"step": 4600
},
{
"epoch": 2.94976,
"grad_norm": 0.772723595238506,
"learning_rate": 3.816263004925991e-09,
"loss": 1.0082,
"num_tokens": 556030923.0,
"step": 4610
},
{
"epoch": 2.95616,
"grad_norm": 0.768451723737756,
"learning_rate": 2.922000646423118e-09,
"loss": 0.9922,
"num_tokens": 557231653.0,
"step": 4620
},
{
"epoch": 2.96256,
"grad_norm": 0.7993486350591127,
"learning_rate": 2.1468869622781608e-09,
"loss": 1.0019,
"num_tokens": 558442813.0,
"step": 4630
},
{
"epoch": 2.96896,
"grad_norm": 1.1239957345324176,
"learning_rate": 1.4909589372266719e-09,
"loss": 1.001,
"num_tokens": 559650373.0,
"step": 4640
},
{
"epoch": 2.9753600000000002,
"grad_norm": 0.7957668006721109,
"learning_rate": 9.542478690305335e-10,
"loss": 1.0067,
"num_tokens": 560855666.0,
"step": 4650
},
{
"epoch": 2.98176,
"grad_norm": 0.8079239433679425,
"learning_rate": 5.367793669874832e-10,
"loss": 0.9969,
"num_tokens": 562060878.0,
"step": 4660
},
{
"epoch": 2.98816,
"grad_norm": 0.7720639449232606,
"learning_rate": 2.385733507062615e-10,
"loss": 1.0052,
"num_tokens": 563260411.0,
"step": 4670
},
{
"epoch": 2.99456,
"grad_norm": 0.7561377897632978,
"learning_rate": 5.964404915903555e-11,
"loss": 0.9991,
"num_tokens": 564468049.0,
"step": 4680
},
{
"epoch": 3.0,
"num_tokens": 565489014.0,
"step": 4689,
"total_flos": 722584728633344.0,
"train_loss": 1.0774097926684294,
"train_runtime": 15585.9875,
"train_samples_per_second": 19.248,
"train_steps_per_second": 0.301
}
],
"logging_steps": 10,
"max_steps": 4689,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": false,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 722584728633344.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}