{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 4689, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0064, "grad_norm": 2.760316424583359, "learning_rate": 3.1914893617021275e-07, "loss": 1.6571, "num_tokens": 1208276.0, "step": 10 }, { "epoch": 0.0128, "grad_norm": 1.9639395470560352, "learning_rate": 6.73758865248227e-07, "loss": 1.6372, "num_tokens": 2410446.0, "step": 20 }, { "epoch": 0.0192, "grad_norm": 1.6857950160814903, "learning_rate": 1.0283687943262412e-06, "loss": 1.6138, "num_tokens": 3622536.0, "step": 30 }, { "epoch": 0.0256, "grad_norm": 1.9355649405079267, "learning_rate": 1.3829787234042555e-06, "loss": 1.554, "num_tokens": 4837847.0, "step": 40 }, { "epoch": 0.032, "grad_norm": 1.0134479979543427, "learning_rate": 1.7375886524822697e-06, "loss": 1.5138, "num_tokens": 6044886.0, "step": 50 }, { "epoch": 0.0384, "grad_norm": 0.7097712225560386, "learning_rate": 2.092198581560284e-06, "loss": 1.4577, "num_tokens": 7255346.0, "step": 60 }, { "epoch": 0.0448, "grad_norm": 0.7563602572113316, "learning_rate": 2.446808510638298e-06, "loss": 1.4239, "num_tokens": 8465627.0, "step": 70 }, { "epoch": 0.0512, "grad_norm": 0.6411265148411116, "learning_rate": 2.8014184397163125e-06, "loss": 1.3857, "num_tokens": 9667266.0, "step": 80 }, { "epoch": 0.0576, "grad_norm": 0.7071256376230877, "learning_rate": 3.1560283687943267e-06, "loss": 1.3736, "num_tokens": 10869831.0, "step": 90 }, { "epoch": 0.064, "grad_norm": 0.7623180191305359, "learning_rate": 3.510638297872341e-06, "loss": 1.3722, "num_tokens": 12083093.0, "step": 100 }, { "epoch": 0.0704, "grad_norm": 0.651385333897087, "learning_rate": 3.865248226950355e-06, "loss": 1.3468, "num_tokens": 13290331.0, "step": 110 }, { "epoch": 0.0768, "grad_norm": 0.8706225351642094, "learning_rate": 4.219858156028369e-06, "loss": 1.3387, "num_tokens": 14488386.0, "step": 120 }, { "epoch": 0.0832, "grad_norm": 0.84726755662717, "learning_rate": 4.574468085106383e-06, "loss": 1.3364, "num_tokens": 15690608.0, "step": 130 }, { "epoch": 0.0896, "grad_norm": 0.8553144960607314, "learning_rate": 4.929078014184397e-06, "loss": 1.3207, "num_tokens": 16894120.0, "step": 140 }, { "epoch": 0.096, "grad_norm": 0.6845288880044453, "learning_rate": 4.999961827753897e-06, "loss": 1.3072, "num_tokens": 18098866.0, "step": 150 }, { "epoch": 0.1024, "grad_norm": 0.7060413425833653, "learning_rate": 4.999806755001946e-06, "loss": 1.293, "num_tokens": 19317515.0, "step": 160 }, { "epoch": 0.1088, "grad_norm": 1.112301905134234, "learning_rate": 4.999532403372408e-06, "loss": 1.2933, "num_tokens": 20523986.0, "step": 170 }, { "epoch": 0.1152, "grad_norm": 0.7057273926728088, "learning_rate": 4.9991387859560365e-06, "loss": 1.3105, "num_tokens": 21730204.0, "step": 180 }, { "epoch": 0.1216, "grad_norm": 0.7046621457199816, "learning_rate": 4.9986259215343814e-06, "loss": 1.3036, "num_tokens": 22941629.0, "step": 190 }, { "epoch": 0.128, "grad_norm": 0.6753839003505228, "learning_rate": 4.997993834578891e-06, "loss": 1.2837, "num_tokens": 24149743.0, "step": 200 }, { "epoch": 0.1344, "grad_norm": 0.6833117540920727, "learning_rate": 4.997242555249746e-06, "loss": 1.2798, "num_tokens": 25350421.0, "step": 210 }, { "epoch": 0.1408, "grad_norm": 0.7496565711502305, "learning_rate": 4.996372119394418e-06, "loss": 1.2872, "num_tokens": 26553851.0, "step": 220 }, { "epoch": 0.1472, "grad_norm": 0.8257784450438341, "learning_rate": 4.9953825685459635e-06, "loss": 1.2715, "num_tokens": 27756494.0, "step": 230 }, { "epoch": 0.1536, "grad_norm": 0.8586750458312551, "learning_rate": 4.994273949921038e-06, "loss": 1.273, "num_tokens": 28966311.0, "step": 240 }, { "epoch": 0.16, "grad_norm": 0.8942167127143708, "learning_rate": 4.993046316417643e-06, "loss": 1.2615, "num_tokens": 30165165.0, "step": 250 }, { "epoch": 0.1664, "grad_norm": 0.7320667303892974, "learning_rate": 4.991699726612607e-06, "loss": 1.2598, "num_tokens": 31372687.0, "step": 260 }, { "epoch": 0.1728, "grad_norm": 0.7759159652826615, "learning_rate": 4.990234244758785e-06, "loss": 1.2378, "num_tokens": 32578240.0, "step": 270 }, { "epoch": 0.1792, "grad_norm": 0.7081937298786585, "learning_rate": 4.988649940781992e-06, "loss": 1.2496, "num_tokens": 33788704.0, "step": 280 }, { "epoch": 0.1856, "grad_norm": 0.8354872354621143, "learning_rate": 4.986946890277673e-06, "loss": 1.239, "num_tokens": 34992041.0, "step": 290 }, { "epoch": 0.192, "grad_norm": 0.7419306542972816, "learning_rate": 4.9851251745072905e-06, "loss": 1.2334, "num_tokens": 36202424.0, "step": 300 }, { "epoch": 0.1984, "grad_norm": 0.8124424043952861, "learning_rate": 4.983184880394447e-06, "loss": 1.2423, "num_tokens": 37406998.0, "step": 310 }, { "epoch": 0.2048, "grad_norm": 0.9137121442594122, "learning_rate": 4.981126100520743e-06, "loss": 1.2398, "num_tokens": 38614024.0, "step": 320 }, { "epoch": 0.2112, "grad_norm": 0.8692171799253517, "learning_rate": 4.978948933121351e-06, "loss": 1.2274, "num_tokens": 39818938.0, "step": 330 }, { "epoch": 0.2176, "grad_norm": 0.7959433307352174, "learning_rate": 4.976653482080335e-06, "loss": 1.2432, "num_tokens": 41029985.0, "step": 340 }, { "epoch": 0.224, "grad_norm": 0.9183385731990914, "learning_rate": 4.97423985692569e-06, "loss": 1.2183, "num_tokens": 42241595.0, "step": 350 }, { "epoch": 0.2304, "grad_norm": 0.8800279308744207, "learning_rate": 4.97170817282412e-06, "loss": 1.2174, "num_tokens": 43436994.0, "step": 360 }, { "epoch": 0.2368, "grad_norm": 0.8482042891364965, "learning_rate": 4.969058550575535e-06, "loss": 1.214, "num_tokens": 44649051.0, "step": 370 }, { "epoch": 0.2432, "grad_norm": 0.8597854654288322, "learning_rate": 4.966291116607297e-06, "loss": 1.2105, "num_tokens": 45857075.0, "step": 380 }, { "epoch": 0.2496, "grad_norm": 0.8904371734549302, "learning_rate": 4.96340600296818e-06, "loss": 1.1976, "num_tokens": 47059498.0, "step": 390 }, { "epoch": 0.256, "grad_norm": 0.864096324906862, "learning_rate": 4.960403347322069e-06, "loss": 1.2067, "num_tokens": 48273286.0, "step": 400 }, { "epoch": 0.2624, "grad_norm": 0.8417001685001565, "learning_rate": 4.957283292941401e-06, "loss": 1.2012, "num_tokens": 49479835.0, "step": 410 }, { "epoch": 0.2688, "grad_norm": 0.8738206939182319, "learning_rate": 4.954045988700315e-06, "loss": 1.2081, "num_tokens": 50692484.0, "step": 420 }, { "epoch": 0.2752, "grad_norm": 0.9214341760640065, "learning_rate": 4.9506915890675566e-06, "loss": 1.1982, "num_tokens": 51904151.0, "step": 430 }, { "epoch": 0.2816, "grad_norm": 0.8270044046785595, "learning_rate": 4.94722025409911e-06, "loss": 1.2003, "num_tokens": 53107439.0, "step": 440 }, { "epoch": 0.288, "grad_norm": 0.9325298797380837, "learning_rate": 4.943632149430552e-06, "loss": 1.1934, "num_tokens": 54311802.0, "step": 450 }, { "epoch": 0.2944, "grad_norm": 0.8173318542721012, "learning_rate": 4.9399274462691555e-06, "loss": 1.183, "num_tokens": 55516169.0, "step": 460 }, { "epoch": 0.3008, "grad_norm": 0.8403372189641363, "learning_rate": 4.93610632138572e-06, "loss": 1.2011, "num_tokens": 56720582.0, "step": 470 }, { "epoch": 0.3072, "grad_norm": 0.9133683374494203, "learning_rate": 4.9321689571061314e-06, "loss": 1.1863, "num_tokens": 57923305.0, "step": 480 }, { "epoch": 0.3136, "grad_norm": 0.8342006897685076, "learning_rate": 4.928115541302672e-06, "loss": 1.1789, "num_tokens": 59119131.0, "step": 490 }, { "epoch": 0.32, "grad_norm": 0.9237208555707096, "learning_rate": 4.923946267385043e-06, "loss": 1.1823, "num_tokens": 60323216.0, "step": 500 }, { "epoch": 0.3264, "grad_norm": 1.138961215949811, "learning_rate": 4.91966133429115e-06, "loss": 1.1849, "num_tokens": 61536243.0, "step": 510 }, { "epoch": 0.3328, "grad_norm": 0.8179215725319021, "learning_rate": 4.915260946477601e-06, "loss": 1.1689, "num_tokens": 62725558.0, "step": 520 }, { "epoch": 0.3392, "grad_norm": 0.8196458509991646, "learning_rate": 4.910745313909953e-06, "loss": 1.1754, "num_tokens": 63929035.0, "step": 530 }, { "epoch": 0.3456, "grad_norm": 0.8606903543941481, "learning_rate": 4.906114652052694e-06, "loss": 1.1608, "num_tokens": 65137799.0, "step": 540 }, { "epoch": 0.352, "grad_norm": 0.842427893289404, "learning_rate": 4.9013691818589635e-06, "loss": 1.176, "num_tokens": 66343119.0, "step": 550 }, { "epoch": 0.3584, "grad_norm": 0.9536458222010928, "learning_rate": 4.896509129760008e-06, "loss": 1.1766, "num_tokens": 67554625.0, "step": 560 }, { "epoch": 0.3648, "grad_norm": 0.8456584910416223, "learning_rate": 4.891534727654374e-06, "loss": 1.1704, "num_tokens": 68767553.0, "step": 570 }, { "epoch": 0.3712, "grad_norm": 0.825023352714185, "learning_rate": 4.886446212896853e-06, "loss": 1.1662, "num_tokens": 69977707.0, "step": 580 }, { "epoch": 0.3776, "grad_norm": 0.8327520829988985, "learning_rate": 4.881243828287141e-06, "loss": 1.1715, "num_tokens": 71189476.0, "step": 590 }, { "epoch": 0.384, "grad_norm": 0.840077866672345, "learning_rate": 4.875927822058265e-06, "loss": 1.1711, "num_tokens": 72395847.0, "step": 600 }, { "epoch": 0.3904, "grad_norm": 0.8253947193633453, "learning_rate": 4.870498447864735e-06, "loss": 1.1439, "num_tokens": 73594932.0, "step": 610 }, { "epoch": 0.3968, "grad_norm": 0.9212419524845424, "learning_rate": 4.864955964770442e-06, "loss": 1.1643, "num_tokens": 74802657.0, "step": 620 }, { "epoch": 0.4032, "grad_norm": 0.9296250658068028, "learning_rate": 4.859300637236289e-06, "loss": 1.1534, "num_tokens": 76011529.0, "step": 630 }, { "epoch": 0.4096, "grad_norm": 1.057634627530951, "learning_rate": 4.853532735107587e-06, "loss": 1.1507, "num_tokens": 77210334.0, "step": 640 }, { "epoch": 0.416, "grad_norm": 0.8097939416205123, "learning_rate": 4.847652533601164e-06, "loss": 1.1395, "num_tokens": 78425328.0, "step": 650 }, { "epoch": 0.4224, "grad_norm": 0.8447649876579609, "learning_rate": 4.8416603132922425e-06, "loss": 1.1378, "num_tokens": 79638521.0, "step": 660 }, { "epoch": 0.4288, "grad_norm": 0.9421170322416722, "learning_rate": 4.83555636010105e-06, "loss": 1.1349, "num_tokens": 80836868.0, "step": 670 }, { "epoch": 0.4352, "grad_norm": 0.9009555407016511, "learning_rate": 4.829340965279173e-06, "loss": 1.1482, "num_tokens": 82050746.0, "step": 680 }, { "epoch": 0.4416, "grad_norm": 0.9304718962620818, "learning_rate": 4.823014425395662e-06, "loss": 1.1535, "num_tokens": 83256247.0, "step": 690 }, { "epoch": 0.448, "grad_norm": 0.8268029795401431, "learning_rate": 4.816577042322883e-06, "loss": 1.1625, "num_tokens": 84466963.0, "step": 700 }, { "epoch": 0.4544, "grad_norm": 0.8118838757785675, "learning_rate": 4.810029123222109e-06, "loss": 1.1582, "num_tokens": 85668747.0, "step": 710 }, { "epoch": 0.4608, "grad_norm": 0.8191391458452703, "learning_rate": 4.803370980528868e-06, "loss": 1.1508, "num_tokens": 86869314.0, "step": 720 }, { "epoch": 0.4672, "grad_norm": 0.8573356891805307, "learning_rate": 4.796602931938031e-06, "loss": 1.1367, "num_tokens": 88072166.0, "step": 730 }, { "epoch": 0.4736, "grad_norm": 0.9130087766709583, "learning_rate": 4.789725300388658e-06, "loss": 1.1496, "num_tokens": 89276560.0, "step": 740 }, { "epoch": 0.48, "grad_norm": 0.8756224792489176, "learning_rate": 4.782738414048581e-06, "loss": 1.1387, "num_tokens": 90489167.0, "step": 750 }, { "epoch": 0.4864, "grad_norm": 0.8660533049576743, "learning_rate": 4.775642606298758e-06, "loss": 1.1293, "num_tokens": 91699027.0, "step": 760 }, { "epoch": 0.4928, "grad_norm": 0.9344747635312723, "learning_rate": 4.7684382157173515e-06, "loss": 1.1544, "num_tokens": 92907904.0, "step": 770 }, { "epoch": 0.4992, "grad_norm": 0.8232769483557345, "learning_rate": 4.761125586063583e-06, "loss": 1.1509, "num_tokens": 94108258.0, "step": 780 }, { "epoch": 0.5056, "grad_norm": 0.8019044034927749, "learning_rate": 4.753705066261326e-06, "loss": 1.142, "num_tokens": 95319591.0, "step": 790 }, { "epoch": 0.512, "grad_norm": 0.8744491818182848, "learning_rate": 4.74617701038246e-06, "loss": 1.1407, "num_tokens": 96527466.0, "step": 800 }, { "epoch": 0.5184, "grad_norm": 0.8457377069978257, "learning_rate": 4.738541777629971e-06, "loss": 1.1454, "num_tokens": 97741955.0, "step": 810 }, { "epoch": 0.5248, "grad_norm": 0.8367461594303044, "learning_rate": 4.730799732320819e-06, "loss": 1.1499, "num_tokens": 98947846.0, "step": 820 }, { "epoch": 0.5312, "grad_norm": 0.8153933334854007, "learning_rate": 4.722951243868547e-06, "loss": 1.1338, "num_tokens": 100149443.0, "step": 830 }, { "epoch": 0.5376, "grad_norm": 0.9553883385280855, "learning_rate": 4.7149966867656625e-06, "loss": 1.1239, "num_tokens": 101354489.0, "step": 840 }, { "epoch": 0.544, "grad_norm": 0.8020256868069202, "learning_rate": 4.706936440565759e-06, "loss": 1.1233, "num_tokens": 102561908.0, "step": 850 }, { "epoch": 0.5504, "grad_norm": 0.8506848444686664, "learning_rate": 4.698770889865414e-06, "loss": 1.1314, "num_tokens": 103765389.0, "step": 860 }, { "epoch": 0.5568, "grad_norm": 0.8931807739845334, "learning_rate": 4.690500424285833e-06, "loss": 1.1367, "num_tokens": 104973326.0, "step": 870 }, { "epoch": 0.5632, "grad_norm": 0.8498884776316712, "learning_rate": 4.682125438454261e-06, "loss": 1.1329, "num_tokens": 106184942.0, "step": 880 }, { "epoch": 0.5696, "grad_norm": 0.8866656591752357, "learning_rate": 4.673646331985151e-06, "loss": 1.1469, "num_tokens": 107391403.0, "step": 890 }, { "epoch": 0.576, "grad_norm": 0.8247486140289442, "learning_rate": 4.665063509461098e-06, "loss": 1.1304, "num_tokens": 108599244.0, "step": 900 }, { "epoch": 0.5824, "grad_norm": 0.8509584195104843, "learning_rate": 4.6563773804135305e-06, "loss": 1.1205, "num_tokens": 109802767.0, "step": 910 }, { "epoch": 0.5888, "grad_norm": 0.9532478448654986, "learning_rate": 4.647588359303178e-06, "loss": 1.135, "num_tokens": 111002144.0, "step": 920 }, { "epoch": 0.5952, "grad_norm": 0.795143766492276, "learning_rate": 4.638696865500284e-06, "loss": 1.133, "num_tokens": 112202360.0, "step": 930 }, { "epoch": 0.6016, "grad_norm": 0.8884950967785606, "learning_rate": 4.629703323264605e-06, "loss": 1.1174, "num_tokens": 113410661.0, "step": 940 }, { "epoch": 0.608, "grad_norm": 0.8094095645216874, "learning_rate": 4.62060816172516e-06, "loss": 1.1359, "num_tokens": 114615154.0, "step": 950 }, { "epoch": 0.6144, "grad_norm": 0.8517004319099382, "learning_rate": 4.611411814859758e-06, "loss": 1.1141, "num_tokens": 115826696.0, "step": 960 }, { "epoch": 0.6208, "grad_norm": 0.8739388391386897, "learning_rate": 4.602114721474293e-06, "loss": 1.1204, "num_tokens": 117030663.0, "step": 970 }, { "epoch": 0.6272, "grad_norm": 1.0126603878935398, "learning_rate": 4.592717325181798e-06, "loss": 1.1259, "num_tokens": 118243461.0, "step": 980 }, { "epoch": 0.6336, "grad_norm": 0.7961249459761912, "learning_rate": 4.583220074381288e-06, "loss": 1.1105, "num_tokens": 119444400.0, "step": 990 }, { "epoch": 0.64, "grad_norm": 0.8547801323336933, "learning_rate": 4.573623422236359e-06, "loss": 1.1247, "num_tokens": 120646721.0, "step": 1000 }, { "epoch": 0.6464, "grad_norm": 0.8827343366608609, "learning_rate": 4.563927826653562e-06, "loss": 1.1381, "num_tokens": 121856814.0, "step": 1010 }, { "epoch": 0.6528, "grad_norm": 0.8379604515543791, "learning_rate": 4.554133750260561e-06, "loss": 1.1038, "num_tokens": 123063137.0, "step": 1020 }, { "epoch": 0.6592, "grad_norm": 0.9009991930297082, "learning_rate": 4.544241660384057e-06, "loss": 1.1351, "num_tokens": 124281752.0, "step": 1030 }, { "epoch": 0.6656, "grad_norm": 0.9398290903202526, "learning_rate": 4.534252029027485e-06, "loss": 1.132, "num_tokens": 125483927.0, "step": 1040 }, { "epoch": 0.672, "grad_norm": 0.8135458599046622, "learning_rate": 4.5241653328484965e-06, "loss": 1.1137, "num_tokens": 126688041.0, "step": 1050 }, { "epoch": 0.6784, "grad_norm": 0.826631698433715, "learning_rate": 4.5139820531362125e-06, "loss": 1.1149, "num_tokens": 127895497.0, "step": 1060 }, { "epoch": 0.6848, "grad_norm": 0.8326760862617015, "learning_rate": 4.503702675788263e-06, "loss": 1.1082, "num_tokens": 129093768.0, "step": 1070 }, { "epoch": 0.6912, "grad_norm": 0.8187909661973681, "learning_rate": 4.493327691287596e-06, "loss": 1.1213, "num_tokens": 130296941.0, "step": 1080 }, { "epoch": 0.6976, "grad_norm": 0.8758642744013126, "learning_rate": 4.482857594679082e-06, "loss": 1.1169, "num_tokens": 131499785.0, "step": 1090 }, { "epoch": 0.704, "grad_norm": 0.9756017880226009, "learning_rate": 4.472292885545887e-06, "loss": 1.1182, "num_tokens": 132704447.0, "step": 1100 }, { "epoch": 0.7104, "grad_norm": 0.9918470716003941, "learning_rate": 4.4616340679856344e-06, "loss": 1.112, "num_tokens": 133914148.0, "step": 1110 }, { "epoch": 0.7168, "grad_norm": 0.7736509572616426, "learning_rate": 4.450881650586354e-06, "loss": 1.0948, "num_tokens": 135116690.0, "step": 1120 }, { "epoch": 0.7232, "grad_norm": 0.8393996918370894, "learning_rate": 4.440036146402218e-06, "loss": 1.1196, "num_tokens": 136325534.0, "step": 1130 }, { "epoch": 0.7296, "grad_norm": 0.8283036410858456, "learning_rate": 4.429098072929052e-06, "loss": 1.1249, "num_tokens": 137532058.0, "step": 1140 }, { "epoch": 0.736, "grad_norm": 1.0272561438627168, "learning_rate": 4.418067952079651e-06, "loss": 1.0894, "num_tokens": 138742925.0, "step": 1150 }, { "epoch": 0.7424, "grad_norm": 0.9457224166686296, "learning_rate": 4.40694631015887e-06, "loss": 1.1072, "num_tokens": 139944361.0, "step": 1160 }, { "epoch": 0.7488, "grad_norm": 0.8472242869303449, "learning_rate": 4.395733677838515e-06, "loss": 1.104, "num_tokens": 141145139.0, "step": 1170 }, { "epoch": 0.7552, "grad_norm": 0.8369893067934512, "learning_rate": 4.384430590132023e-06, "loss": 1.1167, "num_tokens": 142348857.0, "step": 1180 }, { "epoch": 0.7616, "grad_norm": 0.9417838753194914, "learning_rate": 4.373037586368925e-06, "loss": 1.0952, "num_tokens": 143560823.0, "step": 1190 }, { "epoch": 0.768, "grad_norm": 0.83199280244184, "learning_rate": 4.361555210169126e-06, "loss": 1.0969, "num_tokens": 144770576.0, "step": 1200 }, { "epoch": 0.7744, "grad_norm": 0.8757783495810086, "learning_rate": 4.349984009416952e-06, "loss": 1.0948, "num_tokens": 145978862.0, "step": 1210 }, { "epoch": 0.7808, "grad_norm": 0.8374080168936522, "learning_rate": 4.3383245362350174e-06, "loss": 1.1087, "num_tokens": 147191743.0, "step": 1220 }, { "epoch": 0.7872, "grad_norm": 0.8702169752217432, "learning_rate": 4.326577346957876e-06, "loss": 1.1099, "num_tokens": 148399289.0, "step": 1230 }, { "epoch": 0.7936, "grad_norm": 0.8016984816166285, "learning_rate": 4.314743002105473e-06, "loss": 1.1052, "num_tokens": 149602404.0, "step": 1240 }, { "epoch": 0.8, "grad_norm": 1.0811796381892176, "learning_rate": 4.302822066356408e-06, "loss": 1.0996, "num_tokens": 150811734.0, "step": 1250 }, { "epoch": 0.8064, "grad_norm": 0.8374755480022819, "learning_rate": 4.290815108520982e-06, "loss": 1.1185, "num_tokens": 152011294.0, "step": 1260 }, { "epoch": 0.8128, "grad_norm": 0.7904368039438139, "learning_rate": 4.278722701514061e-06, "loss": 1.0992, "num_tokens": 153217258.0, "step": 1270 }, { "epoch": 0.8192, "grad_norm": 0.785661611999425, "learning_rate": 4.266545422327741e-06, "loss": 1.1208, "num_tokens": 154419838.0, "step": 1280 }, { "epoch": 0.8256, "grad_norm": 0.8439322755320521, "learning_rate": 4.254283852003813e-06, "loss": 1.1091, "num_tokens": 155626578.0, "step": 1290 }, { "epoch": 0.832, "grad_norm": 0.8732275622995317, "learning_rate": 4.241938575606038e-06, "loss": 1.0826, "num_tokens": 156825805.0, "step": 1300 }, { "epoch": 0.8384, "grad_norm": 0.8014980196902037, "learning_rate": 4.229510182192235e-06, "loss": 1.1093, "num_tokens": 158037877.0, "step": 1310 }, { "epoch": 0.8448, "grad_norm": 0.8106302375207448, "learning_rate": 4.216999264786169e-06, "loss": 1.1073, "num_tokens": 159245106.0, "step": 1320 }, { "epoch": 0.8512, "grad_norm": 0.9385310776537238, "learning_rate": 4.204406420349259e-06, "loss": 1.1056, "num_tokens": 160456114.0, "step": 1330 }, { "epoch": 0.8576, "grad_norm": 0.9579249297784465, "learning_rate": 4.191732249752092e-06, "loss": 1.1021, "num_tokens": 161659510.0, "step": 1340 }, { "epoch": 0.864, "grad_norm": 0.8134490186326385, "learning_rate": 4.178977357745749e-06, "loss": 1.0821, "num_tokens": 162865495.0, "step": 1350 }, { "epoch": 0.8704, "grad_norm": 0.7943299269230713, "learning_rate": 4.166142352932957e-06, "loss": 1.1065, "num_tokens": 164069925.0, "step": 1360 }, { "epoch": 0.8768, "grad_norm": 0.8171116530483417, "learning_rate": 4.153227847739041e-06, "loss": 1.0873, "num_tokens": 165272777.0, "step": 1370 }, { "epoch": 0.8832, "grad_norm": 0.8472827858602203, "learning_rate": 4.140234458382708e-06, "loss": 1.1207, "num_tokens": 166473564.0, "step": 1380 }, { "epoch": 0.8896, "grad_norm": 0.8254355045966608, "learning_rate": 4.12716280484664e-06, "loss": 1.093, "num_tokens": 167678209.0, "step": 1390 }, { "epoch": 0.896, "grad_norm": 0.8238773032302608, "learning_rate": 4.114013510847914e-06, "loss": 1.1004, "num_tokens": 168879199.0, "step": 1400 }, { "epoch": 0.9024, "grad_norm": 0.8035266067408213, "learning_rate": 4.100787203808241e-06, "loss": 1.09, "num_tokens": 170089062.0, "step": 1410 }, { "epoch": 0.9088, "grad_norm": 0.796684651593008, "learning_rate": 4.0874845148240265e-06, "loss": 1.0923, "num_tokens": 171298354.0, "step": 1420 }, { "epoch": 0.9152, "grad_norm": 0.7944378162845194, "learning_rate": 4.074106078636259e-06, "loss": 1.0877, "num_tokens": 172502932.0, "step": 1430 }, { "epoch": 0.9216, "grad_norm": 0.8222630499336689, "learning_rate": 4.0606525336002215e-06, "loss": 1.1069, "num_tokens": 173714359.0, "step": 1440 }, { "epoch": 0.928, "grad_norm": 0.8284462145945989, "learning_rate": 4.047124521655037e-06, "loss": 1.1063, "num_tokens": 174915024.0, "step": 1450 }, { "epoch": 0.9344, "grad_norm": 1.1184143246349953, "learning_rate": 4.033522688293033e-06, "loss": 1.0958, "num_tokens": 176121314.0, "step": 1460 }, { "epoch": 0.9408, "grad_norm": 0.9302956644371011, "learning_rate": 4.019847682528943e-06, "loss": 1.1057, "num_tokens": 177329003.0, "step": 1470 }, { "epoch": 0.9472, "grad_norm": 0.8315189293207337, "learning_rate": 4.00610015686894e-06, "loss": 1.1021, "num_tokens": 178533383.0, "step": 1480 }, { "epoch": 0.9536, "grad_norm": 0.780029339050911, "learning_rate": 3.9922807672795015e-06, "loss": 1.1022, "num_tokens": 179737544.0, "step": 1490 }, { "epoch": 0.96, "grad_norm": 0.8861787669753409, "learning_rate": 3.97839017315611e-06, "loss": 1.1033, "num_tokens": 180941884.0, "step": 1500 }, { "epoch": 0.9664, "grad_norm": 0.8613329501244571, "learning_rate": 3.964429037291785e-06, "loss": 1.0932, "num_tokens": 182147995.0, "step": 1510 }, { "epoch": 0.9728, "grad_norm": 0.7767446273299125, "learning_rate": 3.950398025845469e-06, "loss": 1.0764, "num_tokens": 183351238.0, "step": 1520 }, { "epoch": 0.9792, "grad_norm": 0.7800388177467502, "learning_rate": 3.936297808310229e-06, "loss": 1.0955, "num_tokens": 184559744.0, "step": 1530 }, { "epoch": 0.9856, "grad_norm": 0.822587499260109, "learning_rate": 3.9221290574813205e-06, "loss": 1.101, "num_tokens": 185771261.0, "step": 1540 }, { "epoch": 0.992, "grad_norm": 0.7842833667912362, "learning_rate": 3.907892449424081e-06, "loss": 1.0858, "num_tokens": 186988878.0, "step": 1550 }, { "epoch": 0.9984, "grad_norm": 0.875565650877801, "learning_rate": 3.893588663441669e-06, "loss": 1.1096, "num_tokens": 188198614.0, "step": 1560 }, { "epoch": 1.00448, "grad_norm": 0.9833099796256903, "learning_rate": 3.8792183820426575e-06, "loss": 1.0518, "num_tokens": 189338860.0, "step": 1570 }, { "epoch": 1.01088, "grad_norm": 0.9539211061323496, "learning_rate": 3.864782290908462e-06, "loss": 1.0558, "num_tokens": 190541615.0, "step": 1580 }, { "epoch": 1.01728, "grad_norm": 0.8277557093113368, "learning_rate": 3.850281078860627e-06, "loss": 1.0672, "num_tokens": 191744590.0, "step": 1590 }, { "epoch": 1.02368, "grad_norm": 0.8095245034674352, "learning_rate": 3.835715437827954e-06, "loss": 1.0555, "num_tokens": 192946831.0, "step": 1600 }, { "epoch": 1.03008, "grad_norm": 0.8670205092911757, "learning_rate": 3.821086062813492e-06, "loss": 1.0558, "num_tokens": 194153241.0, "step": 1610 }, { "epoch": 1.03648, "grad_norm": 0.8041612181651476, "learning_rate": 3.806393651861372e-06, "loss": 1.0713, "num_tokens": 195361386.0, "step": 1620 }, { "epoch": 1.04288, "grad_norm": 0.8201672913405339, "learning_rate": 3.7916389060234964e-06, "loss": 1.0612, "num_tokens": 196570539.0, "step": 1630 }, { "epoch": 1.04928, "grad_norm": 0.822814114472732, "learning_rate": 3.776822529326097e-06, "loss": 1.0643, "num_tokens": 197758018.0, "step": 1640 }, { "epoch": 1.05568, "grad_norm": 0.8405563342503541, "learning_rate": 3.7619452287361306e-06, "loss": 1.0576, "num_tokens": 198962473.0, "step": 1650 }, { "epoch": 1.06208, "grad_norm": 0.8733811946067399, "learning_rate": 3.7470077141275578e-06, "loss": 1.0602, "num_tokens": 200168404.0, "step": 1660 }, { "epoch": 1.06848, "grad_norm": 0.7810891863766373, "learning_rate": 3.732010698247463e-06, "loss": 1.0429, "num_tokens": 201383921.0, "step": 1670 }, { "epoch": 1.07488, "grad_norm": 0.8253121322208729, "learning_rate": 3.7169548966820466e-06, "loss": 1.069, "num_tokens": 202590191.0, "step": 1680 }, { "epoch": 1.08128, "grad_norm": 0.7968885719952052, "learning_rate": 3.7018410278224852e-06, "loss": 1.0661, "num_tokens": 203790064.0, "step": 1690 }, { "epoch": 1.08768, "grad_norm": 0.7513522866065546, "learning_rate": 3.686669812830648e-06, "loss": 1.0648, "num_tokens": 205004834.0, "step": 1700 }, { "epoch": 1.09408, "grad_norm": 0.8133897709614188, "learning_rate": 3.671441975604689e-06, "loss": 1.0574, "num_tokens": 206218130.0, "step": 1710 }, { "epoch": 1.10048, "grad_norm": 0.855169356505383, "learning_rate": 3.6561582427445053e-06, "loss": 1.0652, "num_tokens": 207421774.0, "step": 1720 }, { "epoch": 1.10688, "grad_norm": 0.7861479775879827, "learning_rate": 3.6408193435170695e-06, "loss": 1.0601, "num_tokens": 208639076.0, "step": 1730 }, { "epoch": 1.11328, "grad_norm": 0.7759167355223116, "learning_rate": 3.625426009821628e-06, "loss": 1.0515, "num_tokens": 209843506.0, "step": 1740 }, { "epoch": 1.11968, "grad_norm": 0.7737945956455258, "learning_rate": 3.609978976154784e-06, "loss": 1.0449, "num_tokens": 211053262.0, "step": 1750 }, { "epoch": 1.12608, "grad_norm": 0.8033895393207562, "learning_rate": 3.594478979575443e-06, "loss": 1.0653, "num_tokens": 212256390.0, "step": 1760 }, { "epoch": 1.13248, "grad_norm": 0.8687778972426285, "learning_rate": 3.578926759669653e-06, "loss": 1.046, "num_tokens": 213458553.0, "step": 1770 }, { "epoch": 1.13888, "grad_norm": 0.8146069292073773, "learning_rate": 3.5633230585153093e-06, "loss": 1.0587, "num_tokens": 214667929.0, "step": 1780 }, { "epoch": 1.14528, "grad_norm": 0.8442869654702855, "learning_rate": 3.5476686206467465e-06, "loss": 1.0476, "num_tokens": 215872854.0, "step": 1790 }, { "epoch": 1.15168, "grad_norm": 0.8166732673631207, "learning_rate": 3.531964193019214e-06, "loss": 1.0486, "num_tokens": 217084577.0, "step": 1800 }, { "epoch": 1.15808, "grad_norm": 0.8407184177973456, "learning_rate": 3.5162105249732336e-06, "loss": 1.0446, "num_tokens": 218284006.0, "step": 1810 }, { "epoch": 1.16448, "grad_norm": 0.7814422822824459, "learning_rate": 3.5004083681988476e-06, "loss": 1.0466, "num_tokens": 219487469.0, "step": 1820 }, { "epoch": 1.17088, "grad_norm": 0.7953904441180448, "learning_rate": 3.484558476699748e-06, "loss": 1.0539, "num_tokens": 220690881.0, "step": 1830 }, { "epoch": 1.17728, "grad_norm": 0.8120616693504964, "learning_rate": 3.468661606757301e-06, "loss": 1.0564, "num_tokens": 221898060.0, "step": 1840 }, { "epoch": 1.18368, "grad_norm": 0.7894301070451438, "learning_rate": 3.45271851689446e-06, "loss": 1.0576, "num_tokens": 223099219.0, "step": 1850 }, { "epoch": 1.19008, "grad_norm": 0.8628648936847306, "learning_rate": 3.436729967839575e-06, "loss": 1.0697, "num_tokens": 224314472.0, "step": 1860 }, { "epoch": 1.19648, "grad_norm": 0.8485241964897267, "learning_rate": 3.4206967224900885e-06, "loss": 1.0583, "num_tokens": 225513940.0, "step": 1870 }, { "epoch": 1.20288, "grad_norm": 0.8019635872502272, "learning_rate": 3.40461954587614e-06, "loss": 1.0484, "num_tokens": 226733560.0, "step": 1880 }, { "epoch": 1.20928, "grad_norm": 0.8148504625626072, "learning_rate": 3.3884992051240613e-06, "loss": 1.049, "num_tokens": 227946861.0, "step": 1890 }, { "epoch": 1.21568, "grad_norm": 0.799348761407277, "learning_rate": 3.372336469419767e-06, "loss": 1.0636, "num_tokens": 229149854.0, "step": 1900 }, { "epoch": 1.22208, "grad_norm": 0.8121058069211242, "learning_rate": 3.35613210997206e-06, "loss": 1.0679, "num_tokens": 230358777.0, "step": 1910 }, { "epoch": 1.22848, "grad_norm": 0.8225529513521229, "learning_rate": 3.339886899975831e-06, "loss": 1.0455, "num_tokens": 231573319.0, "step": 1920 }, { "epoch": 1.23488, "grad_norm": 0.7930056234558618, "learning_rate": 3.3236016145751616e-06, "loss": 1.0453, "num_tokens": 232778798.0, "step": 1930 }, { "epoch": 1.24128, "grad_norm": 0.7824523425714454, "learning_rate": 3.307277030826342e-06, "loss": 1.046, "num_tokens": 233985281.0, "step": 1940 }, { "epoch": 1.24768, "grad_norm": 1.126385656615945, "learning_rate": 3.290913927660793e-06, "loss": 1.0418, "num_tokens": 235194572.0, "step": 1950 }, { "epoch": 1.25408, "grad_norm": 0.8230976427574604, "learning_rate": 3.274513085847899e-06, "loss": 1.0596, "num_tokens": 236400915.0, "step": 1960 }, { "epoch": 1.26048, "grad_norm": 0.7715465448814725, "learning_rate": 3.2580752879577508e-06, "loss": 1.0421, "num_tokens": 237602768.0, "step": 1970 }, { "epoch": 1.26688, "grad_norm": 0.7604905419126253, "learning_rate": 3.2416013183238105e-06, "loss": 1.0596, "num_tokens": 238810127.0, "step": 1980 }, { "epoch": 1.27328, "grad_norm": 0.8091857959210363, "learning_rate": 3.22509196300548e-06, "loss": 1.0544, "num_tokens": 240016518.0, "step": 1990 }, { "epoch": 1.27968, "grad_norm": 0.8428609624878182, "learning_rate": 3.2085480097506015e-06, "loss": 1.0517, "num_tokens": 241224903.0, "step": 2000 }, { "epoch": 1.2860800000000001, "grad_norm": 0.8167440202916451, "learning_rate": 3.191970247957862e-06, "loss": 1.0607, "num_tokens": 242432829.0, "step": 2010 }, { "epoch": 1.29248, "grad_norm": 0.843189559655867, "learning_rate": 3.1753594686391343e-06, "loss": 1.0519, "num_tokens": 243643680.0, "step": 2020 }, { "epoch": 1.29888, "grad_norm": 0.8113193681644453, "learning_rate": 3.158716464381728e-06, "loss": 1.0534, "num_tokens": 244850967.0, "step": 2030 }, { "epoch": 1.30528, "grad_norm": 0.8238038397216464, "learning_rate": 3.1420420293105753e-06, "loss": 1.0537, "num_tokens": 246055107.0, "step": 2040 }, { "epoch": 1.31168, "grad_norm": 0.7585161106894139, "learning_rate": 3.1253369590503357e-06, "loss": 1.053, "num_tokens": 247255291.0, "step": 2050 }, { "epoch": 1.31808, "grad_norm": 0.8358837254742888, "learning_rate": 3.1086020506874352e-06, "loss": 1.0552, "num_tokens": 248472347.0, "step": 2060 }, { "epoch": 1.3244799999999999, "grad_norm": 0.8248705338889306, "learning_rate": 3.091838102732031e-06, "loss": 1.0547, "num_tokens": 249675791.0, "step": 2070 }, { "epoch": 1.33088, "grad_norm": 0.8413169777388428, "learning_rate": 3.0750459150799116e-06, "loss": 1.0512, "num_tokens": 250883742.0, "step": 2080 }, { "epoch": 1.33728, "grad_norm": 0.7773274742980588, "learning_rate": 3.0582262889743304e-06, "loss": 1.0435, "num_tokens": 252092991.0, "step": 2090 }, { "epoch": 1.34368, "grad_norm": 0.8160134758509259, "learning_rate": 3.0413800269677707e-06, "loss": 1.0617, "num_tokens": 253296187.0, "step": 2100 }, { "epoch": 1.35008, "grad_norm": 0.8253629381678, "learning_rate": 3.024507932883659e-06, "loss": 1.0467, "num_tokens": 254497531.0, "step": 2110 }, { "epoch": 1.35648, "grad_norm": 0.8449321081656331, "learning_rate": 3.0076108117779995e-06, "loss": 1.0501, "num_tokens": 255698828.0, "step": 2120 }, { "epoch": 1.36288, "grad_norm": 0.864074317535777, "learning_rate": 2.9906894699009714e-06, "loss": 1.051, "num_tokens": 256901786.0, "step": 2130 }, { "epoch": 1.36928, "grad_norm": 0.8545075997582061, "learning_rate": 2.973744714658452e-06, "loss": 1.045, "num_tokens": 258102803.0, "step": 2140 }, { "epoch": 1.37568, "grad_norm": 0.7950948333995521, "learning_rate": 2.9567773545734917e-06, "loss": 1.0609, "num_tokens": 259309237.0, "step": 2150 }, { "epoch": 1.38208, "grad_norm": 0.7772992222068908, "learning_rate": 2.9397881992477388e-06, "loss": 1.0529, "num_tokens": 260512534.0, "step": 2160 }, { "epoch": 1.38848, "grad_norm": 0.8230701809627932, "learning_rate": 2.9227780593228063e-06, "loss": 1.0492, "num_tokens": 261721309.0, "step": 2170 }, { "epoch": 1.3948800000000001, "grad_norm": 0.803410117521878, "learning_rate": 2.90574774644159e-06, "loss": 1.0341, "num_tokens": 262926754.0, "step": 2180 }, { "epoch": 1.40128, "grad_norm": 0.9047895349858696, "learning_rate": 2.8886980732095467e-06, "loss": 1.0304, "num_tokens": 264129158.0, "step": 2190 }, { "epoch": 1.40768, "grad_norm": 0.8048555076981502, "learning_rate": 2.8716298531559133e-06, "loss": 1.0494, "num_tokens": 265332827.0, "step": 2200 }, { "epoch": 1.41408, "grad_norm": 0.8364957546359483, "learning_rate": 2.8545439006948948e-06, "loss": 1.0423, "num_tokens": 266542306.0, "step": 2210 }, { "epoch": 1.42048, "grad_norm": 0.7904212151138658, "learning_rate": 2.8374410310868044e-06, "loss": 1.0423, "num_tokens": 267751752.0, "step": 2220 }, { "epoch": 1.42688, "grad_norm": 0.8434192039931359, "learning_rate": 2.820322060399156e-06, "loss": 1.0471, "num_tokens": 268955655.0, "step": 2230 }, { "epoch": 1.4332799999999999, "grad_norm": 0.7746642379992007, "learning_rate": 2.803187805467733e-06, "loss": 1.0574, "num_tokens": 270165303.0, "step": 2240 }, { "epoch": 1.43968, "grad_norm": 0.8462146853078769, "learning_rate": 2.7860390838576125e-06, "loss": 1.0579, "num_tokens": 271371057.0, "step": 2250 }, { "epoch": 1.44608, "grad_norm": 0.7814911330812998, "learning_rate": 2.7688767138241474e-06, "loss": 1.0374, "num_tokens": 272570562.0, "step": 2260 }, { "epoch": 1.45248, "grad_norm": 0.7648342437809393, "learning_rate": 2.7517015142739335e-06, "loss": 1.0551, "num_tokens": 273773102.0, "step": 2270 }, { "epoch": 1.45888, "grad_norm": 0.8135139786141086, "learning_rate": 2.734514304725727e-06, "loss": 1.0431, "num_tokens": 274979458.0, "step": 2280 }, { "epoch": 1.46528, "grad_norm": 0.8275244446318913, "learning_rate": 2.717315905271344e-06, "loss": 1.0436, "num_tokens": 276180959.0, "step": 2290 }, { "epoch": 1.47168, "grad_norm": 0.8456585906125247, "learning_rate": 2.700107136536533e-06, "loss": 1.0571, "num_tokens": 277381104.0, "step": 2300 }, { "epoch": 1.47808, "grad_norm": 0.7676272425904394, "learning_rate": 2.682888819641809e-06, "loss": 1.0454, "num_tokens": 278589355.0, "step": 2310 }, { "epoch": 1.48448, "grad_norm": 0.7530507207913718, "learning_rate": 2.6656617761632863e-06, "loss": 1.0452, "num_tokens": 279802576.0, "step": 2320 }, { "epoch": 1.49088, "grad_norm": 0.8099596670334043, "learning_rate": 2.6484268280934674e-06, "loss": 1.0441, "num_tokens": 281010541.0, "step": 2330 }, { "epoch": 1.49728, "grad_norm": 0.8098629796138991, "learning_rate": 2.631184797802022e-06, "loss": 1.0379, "num_tokens": 282219974.0, "step": 2340 }, { "epoch": 1.5036800000000001, "grad_norm": 0.8633758780871927, "learning_rate": 2.613936507996554e-06, "loss": 1.0553, "num_tokens": 283423505.0, "step": 2350 }, { "epoch": 1.5100799999999999, "grad_norm": 0.8494557884878244, "learning_rate": 2.5966827816833393e-06, "loss": 1.034, "num_tokens": 284628594.0, "step": 2360 }, { "epoch": 1.51648, "grad_norm": 0.8961874351947472, "learning_rate": 2.579424442128057e-06, "loss": 1.0403, "num_tokens": 285839496.0, "step": 2370 }, { "epoch": 1.52288, "grad_norm": 0.8982519210357097, "learning_rate": 2.562162312816511e-06, "loss": 1.0516, "num_tokens": 287048432.0, "step": 2380 }, { "epoch": 1.52928, "grad_norm": 0.834174589328149, "learning_rate": 2.544897217415332e-06, "loss": 1.0371, "num_tokens": 288256611.0, "step": 2390 }, { "epoch": 1.5356800000000002, "grad_norm": 0.7790317392375281, "learning_rate": 2.5276299797326777e-06, "loss": 1.0347, "num_tokens": 289465699.0, "step": 2400 }, { "epoch": 1.54208, "grad_norm": 0.8113176021935586, "learning_rate": 2.510361423678929e-06, "loss": 1.035, "num_tokens": 290666618.0, "step": 2410 }, { "epoch": 1.54848, "grad_norm": 0.8175298566784388, "learning_rate": 2.4930923732273683e-06, "loss": 1.0364, "num_tokens": 291864705.0, "step": 2420 }, { "epoch": 1.55488, "grad_norm": 0.8601137215701125, "learning_rate": 2.4758236523748734e-06, "loss": 1.041, "num_tokens": 293077992.0, "step": 2430 }, { "epoch": 1.56128, "grad_norm": 0.766342647676912, "learning_rate": 2.4585560851025917e-06, "loss": 1.0448, "num_tokens": 294292270.0, "step": 2440 }, { "epoch": 1.56768, "grad_norm": 0.8144040865702195, "learning_rate": 2.4412904953366263e-06, "loss": 1.0626, "num_tokens": 295501196.0, "step": 2450 }, { "epoch": 1.57408, "grad_norm": 0.8426321262317878, "learning_rate": 2.424027706908728e-06, "loss": 1.0361, "num_tokens": 296713375.0, "step": 2460 }, { "epoch": 1.58048, "grad_norm": 0.870533748148585, "learning_rate": 2.406768543516977e-06, "loss": 1.041, "num_tokens": 297925333.0, "step": 2470 }, { "epoch": 1.5868799999999998, "grad_norm": 0.813316442312155, "learning_rate": 2.389513828686485e-06, "loss": 1.0337, "num_tokens": 299126955.0, "step": 2480 }, { "epoch": 1.59328, "grad_norm": 0.8050560504469045, "learning_rate": 2.372264385730099e-06, "loss": 1.0432, "num_tokens": 300336458.0, "step": 2490 }, { "epoch": 1.59968, "grad_norm": 0.8007073397832749, "learning_rate": 2.355021037709118e-06, "loss": 1.0571, "num_tokens": 301539282.0, "step": 2500 }, { "epoch": 1.60608, "grad_norm": 0.8259619776886131, "learning_rate": 2.3377846073940207e-06, "loss": 1.0478, "num_tokens": 302743922.0, "step": 2510 }, { "epoch": 1.6124800000000001, "grad_norm": 0.7857263898091816, "learning_rate": 2.3205559172252052e-06, "loss": 1.0265, "num_tokens": 303945412.0, "step": 2520 }, { "epoch": 1.6188799999999999, "grad_norm": 0.7830231024473471, "learning_rate": 2.303335789273744e-06, "loss": 1.0424, "num_tokens": 305146555.0, "step": 2530 }, { "epoch": 1.62528, "grad_norm": 0.773313259484951, "learning_rate": 2.286125045202164e-06, "loss": 1.0435, "num_tokens": 306362219.0, "step": 2540 }, { "epoch": 1.63168, "grad_norm": 0.8201327055565161, "learning_rate": 2.2689245062252398e-06, "loss": 1.0509, "num_tokens": 307565244.0, "step": 2550 }, { "epoch": 1.63808, "grad_norm": 0.827602816998628, "learning_rate": 2.2517349930708032e-06, "loss": 1.049, "num_tokens": 308770918.0, "step": 2560 }, { "epoch": 1.6444800000000002, "grad_norm": 0.7919141547822656, "learning_rate": 2.234557325940589e-06, "loss": 1.0431, "num_tokens": 309984868.0, "step": 2570 }, { "epoch": 1.65088, "grad_norm": 0.7394357208064606, "learning_rate": 2.2173923244710954e-06, "loss": 1.0312, "num_tokens": 311187334.0, "step": 2580 }, { "epoch": 1.65728, "grad_norm": 0.785327584034165, "learning_rate": 2.200240807694474e-06, "loss": 1.0353, "num_tokens": 312396234.0, "step": 2590 }, { "epoch": 1.66368, "grad_norm": 0.8232141872243898, "learning_rate": 2.1831035939994554e-06, "loss": 1.0562, "num_tokens": 313601855.0, "step": 2600 }, { "epoch": 1.67008, "grad_norm": 0.7833896049344754, "learning_rate": 2.165981501092291e-06, "loss": 1.0407, "num_tokens": 314804262.0, "step": 2610 }, { "epoch": 1.67648, "grad_norm": 0.7885429615611813, "learning_rate": 2.148875345957741e-06, "loss": 1.0295, "num_tokens": 316005948.0, "step": 2620 }, { "epoch": 1.68288, "grad_norm": 0.7829739281596803, "learning_rate": 2.131785944820092e-06, "loss": 1.0252, "num_tokens": 317208803.0, "step": 2630 }, { "epoch": 1.6892800000000001, "grad_norm": 0.7928770034373539, "learning_rate": 2.114714113104211e-06, "loss": 1.0498, "num_tokens": 318416652.0, "step": 2640 }, { "epoch": 1.6956799999999999, "grad_norm": 0.790850427449215, "learning_rate": 2.097660665396632e-06, "loss": 1.0421, "num_tokens": 319628095.0, "step": 2650 }, { "epoch": 1.70208, "grad_norm": 0.8023551277637352, "learning_rate": 2.0806264154066946e-06, "loss": 1.0393, "num_tokens": 320828695.0, "step": 2660 }, { "epoch": 1.70848, "grad_norm": 0.7922577515769408, "learning_rate": 2.0636121759277135e-06, "loss": 1.0485, "num_tokens": 322041475.0, "step": 2670 }, { "epoch": 1.71488, "grad_norm": 0.7971244397123712, "learning_rate": 2.046618758798197e-06, "loss": 1.0275, "num_tokens": 323243099.0, "step": 2680 }, { "epoch": 1.7212800000000001, "grad_norm": 0.8040701855401029, "learning_rate": 2.0296469748631113e-06, "loss": 1.0238, "num_tokens": 324448570.0, "step": 2690 }, { "epoch": 1.7276799999999999, "grad_norm": 0.7586132016898348, "learning_rate": 2.0126976339351883e-06, "loss": 1.0345, "num_tokens": 325656124.0, "step": 2700 }, { "epoch": 1.73408, "grad_norm": 0.7741130670086324, "learning_rate": 1.995771544756287e-06, "loss": 1.0304, "num_tokens": 326867457.0, "step": 2710 }, { "epoch": 1.74048, "grad_norm": 0.7603630468965715, "learning_rate": 1.9788695149588027e-06, "loss": 1.0348, "num_tokens": 328069419.0, "step": 2720 }, { "epoch": 1.74688, "grad_norm": 0.7656701861871694, "learning_rate": 1.9619923510271333e-06, "loss": 1.0337, "num_tokens": 329274913.0, "step": 2730 }, { "epoch": 1.75328, "grad_norm": 0.7795354061202655, "learning_rate": 1.945140858259195e-06, "loss": 1.0467, "num_tokens": 330497463.0, "step": 2740 }, { "epoch": 1.75968, "grad_norm": 0.8511581572833524, "learning_rate": 1.928315840727998e-06, "loss": 1.0292, "num_tokens": 331705026.0, "step": 2750 }, { "epoch": 1.76608, "grad_norm": 0.8185264208105538, "learning_rate": 1.9115181012432795e-06, "loss": 1.0462, "num_tokens": 332910224.0, "step": 2760 }, { "epoch": 1.77248, "grad_norm": 0.8581339452377109, "learning_rate": 1.8947484413131996e-06, "loss": 1.0344, "num_tokens": 334124736.0, "step": 2770 }, { "epoch": 1.77888, "grad_norm": 0.8469198844835426, "learning_rate": 1.8780076611060962e-06, "loss": 1.031, "num_tokens": 335328630.0, "step": 2780 }, { "epoch": 1.78528, "grad_norm": 0.8097233001009885, "learning_rate": 1.861296559412303e-06, "loss": 1.0268, "num_tokens": 336532418.0, "step": 2790 }, { "epoch": 1.79168, "grad_norm": 0.8477425454150115, "learning_rate": 1.844615933606037e-06, "loss": 1.0311, "num_tokens": 337730246.0, "step": 2800 }, { "epoch": 1.7980800000000001, "grad_norm": 0.7749925952377877, "learning_rate": 1.8279665796073498e-06, "loss": 1.0415, "num_tokens": 338937460.0, "step": 2810 }, { "epoch": 1.8044799999999999, "grad_norm": 0.7976261215266267, "learning_rate": 1.8113492918441523e-06, "loss": 1.047, "num_tokens": 340147641.0, "step": 2820 }, { "epoch": 1.81088, "grad_norm": 0.7733887224457893, "learning_rate": 1.7947648632143075e-06, "loss": 1.0309, "num_tokens": 341352040.0, "step": 2830 }, { "epoch": 1.81728, "grad_norm": 0.7739175808490624, "learning_rate": 1.7782140850477967e-06, "loss": 1.0518, "num_tokens": 342559891.0, "step": 2840 }, { "epoch": 1.82368, "grad_norm": 0.797265127895327, "learning_rate": 1.7616977470689605e-06, "loss": 1.0325, "num_tokens": 343774370.0, "step": 2850 }, { "epoch": 1.8300800000000002, "grad_norm": 0.8443750617770532, "learning_rate": 1.7452166373588185e-06, "loss": 1.021, "num_tokens": 344970302.0, "step": 2860 }, { "epoch": 1.83648, "grad_norm": 0.8003604596330827, "learning_rate": 1.7287715423174662e-06, "loss": 1.0304, "num_tokens": 346180457.0, "step": 2870 }, { "epoch": 1.84288, "grad_norm": 0.8376385879621375, "learning_rate": 1.7123632466265483e-06, "loss": 1.0395, "num_tokens": 347385193.0, "step": 2880 }, { "epoch": 1.84928, "grad_norm": 0.7906644473344662, "learning_rate": 1.69599253321182e-06, "loss": 1.0413, "num_tokens": 348601710.0, "step": 2890 }, { "epoch": 1.85568, "grad_norm": 0.7924809016265382, "learning_rate": 1.6796601832057905e-06, "loss": 1.0378, "num_tokens": 349806167.0, "step": 2900 }, { "epoch": 1.86208, "grad_norm": 0.7766495775123572, "learning_rate": 1.6633669759104488e-06, "loss": 1.0264, "num_tokens": 351012043.0, "step": 2910 }, { "epoch": 1.86848, "grad_norm": 1.3435506252779292, "learning_rate": 1.6471136887600805e-06, "loss": 1.0237, "num_tokens": 352217587.0, "step": 2920 }, { "epoch": 1.87488, "grad_norm": 0.765607343549468, "learning_rate": 1.6309010972841728e-06, "loss": 1.0382, "num_tokens": 353418821.0, "step": 2930 }, { "epoch": 1.8812799999999998, "grad_norm": 0.8171820174646456, "learning_rate": 1.614729975070407e-06, "loss": 1.0366, "num_tokens": 354624890.0, "step": 2940 }, { "epoch": 1.88768, "grad_norm": 0.8064241532835642, "learning_rate": 1.598601093727749e-06, "loss": 1.0361, "num_tokens": 355824991.0, "step": 2950 }, { "epoch": 1.89408, "grad_norm": 0.7884619306846271, "learning_rate": 1.5825152228496342e-06, "loss": 1.0425, "num_tokens": 357030616.0, "step": 2960 }, { "epoch": 1.90048, "grad_norm": 0.8265648248850005, "learning_rate": 1.5664731299772401e-06, "loss": 1.0332, "num_tokens": 358234522.0, "step": 2970 }, { "epoch": 1.9068800000000001, "grad_norm": 0.8092024559268799, "learning_rate": 1.5504755805628677e-06, "loss": 1.0399, "num_tokens": 359443389.0, "step": 2980 }, { "epoch": 1.9132799999999999, "grad_norm": 0.791864238644019, "learning_rate": 1.5345233379334156e-06, "loss": 1.0289, "num_tokens": 360644258.0, "step": 2990 }, { "epoch": 1.91968, "grad_norm": 0.8006538523086424, "learning_rate": 1.5186171632539587e-06, "loss": 1.0392, "num_tokens": 361848281.0, "step": 3000 }, { "epoch": 1.92608, "grad_norm": 0.7852026214667117, "learning_rate": 1.502757815491429e-06, "loss": 1.0301, "num_tokens": 363051672.0, "step": 3010 }, { "epoch": 1.93248, "grad_norm": 0.7473075275246417, "learning_rate": 1.4869460513784011e-06, "loss": 1.0349, "num_tokens": 364249917.0, "step": 3020 }, { "epoch": 1.9388800000000002, "grad_norm": 0.7822299185363633, "learning_rate": 1.4711826253769828e-06, "loss": 1.04, "num_tokens": 365456248.0, "step": 3030 }, { "epoch": 1.94528, "grad_norm": 0.8034434681463449, "learning_rate": 1.4554682896428179e-06, "loss": 1.0379, "num_tokens": 366654881.0, "step": 3040 }, { "epoch": 1.95168, "grad_norm": 0.7768199970864885, "learning_rate": 1.439803793989198e-06, "loss": 1.0241, "num_tokens": 367861348.0, "step": 3050 }, { "epoch": 1.95808, "grad_norm": 0.8118112910224361, "learning_rate": 1.4241898858512824e-06, "loss": 1.0426, "num_tokens": 369064003.0, "step": 3060 }, { "epoch": 1.96448, "grad_norm": 0.7744113528953481, "learning_rate": 1.408627310250434e-06, "loss": 1.0414, "num_tokens": 370279324.0, "step": 3070 }, { "epoch": 1.97088, "grad_norm": 0.7887556630257991, "learning_rate": 1.3931168097586717e-06, "loss": 1.0336, "num_tokens": 371480368.0, "step": 3080 }, { "epoch": 1.97728, "grad_norm": 0.7640435636356337, "learning_rate": 1.377659124463239e-06, "loss": 1.042, "num_tokens": 372690129.0, "step": 3090 }, { "epoch": 1.98368, "grad_norm": 0.7603826553278634, "learning_rate": 1.3622549919312902e-06, "loss": 1.0361, "num_tokens": 373902924.0, "step": 3100 }, { "epoch": 1.9900799999999998, "grad_norm": 0.7599088525071184, "learning_rate": 1.346905147174694e-06, "loss": 1.0193, "num_tokens": 375112585.0, "step": 3110 }, { "epoch": 1.99648, "grad_norm": 0.7816099568186937, "learning_rate": 1.3316103226149682e-06, "loss": 1.0349, "num_tokens": 376325844.0, "step": 3120 }, { "epoch": 2.00256, "grad_norm": 0.7532423548597259, "learning_rate": 1.3163712480483255e-06, "loss": 1.0248, "num_tokens": 377473897.0, "step": 3130 }, { "epoch": 2.00896, "grad_norm": 0.7586660186977321, "learning_rate": 1.3011886506108578e-06, "loss": 1.0107, "num_tokens": 378675832.0, "step": 3140 }, { "epoch": 2.01536, "grad_norm": 0.7958518507428463, "learning_rate": 1.2860632547438334e-06, "loss": 1.0029, "num_tokens": 379872472.0, "step": 3150 }, { "epoch": 2.02176, "grad_norm": 0.8017956552207596, "learning_rate": 1.2709957821591384e-06, "loss": 1.0188, "num_tokens": 381071848.0, "step": 3160 }, { "epoch": 2.02816, "grad_norm": 0.8260326835110341, "learning_rate": 1.2559869518048307e-06, "loss": 1.0134, "num_tokens": 382272368.0, "step": 3170 }, { "epoch": 2.03456, "grad_norm": 0.845928507883109, "learning_rate": 1.2410374798308442e-06, "loss": 1.0107, "num_tokens": 383480338.0, "step": 3180 }, { "epoch": 2.04096, "grad_norm": 0.8513825857009242, "learning_rate": 1.2261480795548123e-06, "loss": 1.0099, "num_tokens": 384683907.0, "step": 3190 }, { "epoch": 2.04736, "grad_norm": 0.7711891823020852, "learning_rate": 1.211319461428032e-06, "loss": 1.0139, "num_tokens": 385889491.0, "step": 3200 }, { "epoch": 2.05376, "grad_norm": 0.7769167344105451, "learning_rate": 1.1965523330015652e-06, "loss": 1.0092, "num_tokens": 387095853.0, "step": 3210 }, { "epoch": 2.06016, "grad_norm": 0.7922783527359497, "learning_rate": 1.1818473988924797e-06, "loss": 1.0199, "num_tokens": 388306034.0, "step": 3220 }, { "epoch": 2.06656, "grad_norm": 0.8009332691587518, "learning_rate": 1.167205360750227e-06, "loss": 1.0185, "num_tokens": 389516647.0, "step": 3230 }, { "epoch": 2.07296, "grad_norm": 0.7591186989087252, "learning_rate": 1.1526269172231594e-06, "loss": 0.995, "num_tokens": 390724121.0, "step": 3240 }, { "epoch": 2.07936, "grad_norm": 0.8055729406106343, "learning_rate": 1.1381127639252005e-06, "loss": 1.0109, "num_tokens": 391924857.0, "step": 3250 }, { "epoch": 2.08576, "grad_norm": 0.7920326568899239, "learning_rate": 1.1236635934026474e-06, "loss": 0.9928, "num_tokens": 393133226.0, "step": 3260 }, { "epoch": 2.09216, "grad_norm": 0.8095321364071963, "learning_rate": 1.1092800951011283e-06, "loss": 1.0066, "num_tokens": 394338791.0, "step": 3270 }, { "epoch": 2.09856, "grad_norm": 0.7790939177959936, "learning_rate": 1.0949629553327106e-06, "loss": 1.0144, "num_tokens": 395544646.0, "step": 3280 }, { "epoch": 2.10496, "grad_norm": 0.9934552993460479, "learning_rate": 1.080712857243143e-06, "loss": 1.0004, "num_tokens": 396744920.0, "step": 3290 }, { "epoch": 2.11136, "grad_norm": 0.7949729572040324, "learning_rate": 1.0665304807792653e-06, "loss": 1.009, "num_tokens": 397964288.0, "step": 3300 }, { "epoch": 2.11776, "grad_norm": 0.7652291996158113, "learning_rate": 1.0524165026565655e-06, "loss": 1.007, "num_tokens": 399168969.0, "step": 3310 }, { "epoch": 2.12416, "grad_norm": 0.7761816653258836, "learning_rate": 1.0383715963268884e-06, "loss": 0.994, "num_tokens": 400373422.0, "step": 3320 }, { "epoch": 2.13056, "grad_norm": 0.8018173213180155, "learning_rate": 1.0243964319462997e-06, "loss": 1.0134, "num_tokens": 401577043.0, "step": 3330 }, { "epoch": 2.13696, "grad_norm": 0.7607821844421783, "learning_rate": 1.0104916763431133e-06, "loss": 1.0187, "num_tokens": 402777527.0, "step": 3340 }, { "epoch": 2.14336, "grad_norm": 0.8327193810047873, "learning_rate": 9.966579929860704e-07, "loss": 1.0249, "num_tokens": 403989663.0, "step": 3350 }, { "epoch": 2.14976, "grad_norm": 0.7706122440471653, "learning_rate": 9.828960419526818e-07, "loss": 1.0085, "num_tokens": 405198202.0, "step": 3360 }, { "epoch": 2.15616, "grad_norm": 0.7867703490032154, "learning_rate": 9.69206479897736e-07, "loss": 1.0197, "num_tokens": 406403598.0, "step": 3370 }, { "epoch": 2.16256, "grad_norm": 0.8249023880860281, "learning_rate": 9.555899600219634e-07, "loss": 1.0274, "num_tokens": 407600213.0, "step": 3380 }, { "epoch": 2.16896, "grad_norm": 0.7855210183667297, "learning_rate": 9.420471320408669e-07, "loss": 1.0127, "num_tokens": 408811259.0, "step": 3390 }, { "epoch": 2.17536, "grad_norm": 0.8119007655119803, "learning_rate": 9.28578642153726e-07, "loss": 1.0021, "num_tokens": 410014132.0, "step": 3400 }, { "epoch": 2.18176, "grad_norm": 0.9205406972397864, "learning_rate": 9.151851330127593e-07, "loss": 1.0126, "num_tokens": 411220727.0, "step": 3410 }, { "epoch": 2.18816, "grad_norm": 0.8056010049273263, "learning_rate": 9.018672436924605e-07, "loss": 0.9892, "num_tokens": 412425755.0, "step": 3420 }, { "epoch": 2.19456, "grad_norm": 0.7632161903493846, "learning_rate": 8.886256096591048e-07, "loss": 1.019, "num_tokens": 413631347.0, "step": 3430 }, { "epoch": 2.20096, "grad_norm": 0.7669268826101938, "learning_rate": 8.754608627404307e-07, "loss": 1.0048, "num_tokens": 414833259.0, "step": 3440 }, { "epoch": 2.20736, "grad_norm": 0.8361832533605145, "learning_rate": 8.623736310954869e-07, "loss": 1.0221, "num_tokens": 416040472.0, "step": 3450 }, { "epoch": 2.21376, "grad_norm": 0.7814096537064951, "learning_rate": 8.493645391846642e-07, "loss": 1.0037, "num_tokens": 417245756.0, "step": 3460 }, { "epoch": 2.22016, "grad_norm": 0.7933425120808404, "learning_rate": 8.364342077398971e-07, "loss": 0.9987, "num_tokens": 418455436.0, "step": 3470 }, { "epoch": 2.22656, "grad_norm": 0.7659925077465827, "learning_rate": 8.235832537350441e-07, "loss": 0.993, "num_tokens": 419667134.0, "step": 3480 }, { "epoch": 2.23296, "grad_norm": 0.8187051274632632, "learning_rate": 8.108122903564502e-07, "loss": 1.0028, "num_tokens": 420870725.0, "step": 3490 }, { "epoch": 2.23936, "grad_norm": 0.7595169446678035, "learning_rate": 7.98121926973692e-07, "loss": 1.0124, "num_tokens": 422076634.0, "step": 3500 }, { "epoch": 2.24576, "grad_norm": 0.8064753048978947, "learning_rate": 7.855127691104944e-07, "loss": 1.024, "num_tokens": 423284867.0, "step": 3510 }, { "epoch": 2.25216, "grad_norm": 0.809858814713402, "learning_rate": 7.729854184158411e-07, "loss": 1.0174, "num_tokens": 424493379.0, "step": 3520 }, { "epoch": 2.25856, "grad_norm": 0.7957945935555317, "learning_rate": 7.605404726352708e-07, "loss": 1.0149, "num_tokens": 425697729.0, "step": 3530 }, { "epoch": 2.26496, "grad_norm": 0.8194656210162423, "learning_rate": 7.481785255823482e-07, "loss": 0.9972, "num_tokens": 426893908.0, "step": 3540 }, { "epoch": 2.27136, "grad_norm": 0.7967423955163617, "learning_rate": 7.359001671103361e-07, "loss": 1.0106, "num_tokens": 428092842.0, "step": 3550 }, { "epoch": 2.27776, "grad_norm": 0.7881164663338793, "learning_rate": 7.237059830840482e-07, "loss": 1.0066, "num_tokens": 429286773.0, "step": 3560 }, { "epoch": 2.28416, "grad_norm": 0.7903923247778172, "learning_rate": 7.11596555351893e-07, "loss": 1.0111, "num_tokens": 430493341.0, "step": 3570 }, { "epoch": 2.29056, "grad_norm": 0.770776011448775, "learning_rate": 6.995724617181124e-07, "loss": 0.9923, "num_tokens": 431693370.0, "step": 3580 }, { "epoch": 2.29696, "grad_norm": 0.7817336774071154, "learning_rate": 6.876342759152121e-07, "loss": 1.0162, "num_tokens": 432901215.0, "step": 3590 }, { "epoch": 2.30336, "grad_norm": 0.7799297164560258, "learning_rate": 6.757825675765862e-07, "loss": 1.0089, "num_tokens": 434107776.0, "step": 3600 }, { "epoch": 2.30976, "grad_norm": 0.8399066019292479, "learning_rate": 6.640179022093324e-07, "loss": 1.0104, "num_tokens": 435311152.0, "step": 3610 }, { "epoch": 2.31616, "grad_norm": 0.8297592147597433, "learning_rate": 6.52340841167276e-07, "loss": 1.0114, "num_tokens": 436513739.0, "step": 3620 }, { "epoch": 2.32256, "grad_norm": 0.7719279126860086, "learning_rate": 6.407519416241779e-07, "loss": 1.0065, "num_tokens": 437726898.0, "step": 3630 }, { "epoch": 2.32896, "grad_norm": 0.8045844362641281, "learning_rate": 6.292517565471548e-07, "loss": 1.0097, "num_tokens": 438931660.0, "step": 3640 }, { "epoch": 2.33536, "grad_norm": 0.7982553698914577, "learning_rate": 6.178408346702882e-07, "loss": 1.0082, "num_tokens": 440137185.0, "step": 3650 }, { "epoch": 2.34176, "grad_norm": 0.7908405728187465, "learning_rate": 6.065197204684484e-07, "loss": 1.0148, "num_tokens": 441339870.0, "step": 3660 }, { "epoch": 2.34816, "grad_norm": 0.7738211794516375, "learning_rate": 5.95288954131307e-07, "loss": 1.015, "num_tokens": 442548750.0, "step": 3670 }, { "epoch": 2.35456, "grad_norm": 0.7925014240523639, "learning_rate": 5.841490715375689e-07, "loss": 1.0146, "num_tokens": 443760356.0, "step": 3680 }, { "epoch": 2.36096, "grad_norm": 0.7744344940621614, "learning_rate": 5.731006042293983e-07, "loss": 1.0195, "num_tokens": 444963192.0, "step": 3690 }, { "epoch": 2.36736, "grad_norm": 0.809967543772837, "learning_rate": 5.621440793870564e-07, "loss": 1.0138, "num_tokens": 446161734.0, "step": 3700 }, { "epoch": 2.37376, "grad_norm": 0.7634003235889771, "learning_rate": 5.512800198037477e-07, "loss": 1.0092, "num_tokens": 447367385.0, "step": 3710 }, { "epoch": 2.38016, "grad_norm": 0.7694302990943018, "learning_rate": 5.405089438606759e-07, "loss": 1.0183, "num_tokens": 448574222.0, "step": 3720 }, { "epoch": 2.3865600000000002, "grad_norm": 0.7964969360810369, "learning_rate": 5.298313655023083e-07, "loss": 1.0146, "num_tokens": 449787465.0, "step": 3730 }, { "epoch": 2.39296, "grad_norm": 0.7826022145337301, "learning_rate": 5.192477942118501e-07, "loss": 1.0059, "num_tokens": 450993609.0, "step": 3740 }, { "epoch": 2.39936, "grad_norm": 0.7939322826576104, "learning_rate": 5.087587349869396e-07, "loss": 1.016, "num_tokens": 452203974.0, "step": 3750 }, { "epoch": 2.40576, "grad_norm": 0.7880956603422961, "learning_rate": 4.983646883155479e-07, "loss": 0.9871, "num_tokens": 453406872.0, "step": 3760 }, { "epoch": 2.41216, "grad_norm": 0.7870741062813569, "learning_rate": 4.880661501520977e-07, "loss": 1.0146, "num_tokens": 454612112.0, "step": 3770 }, { "epoch": 2.41856, "grad_norm": 0.7757670556350029, "learning_rate": 4.778636118938052e-07, "loss": 1.0043, "num_tokens": 455821550.0, "step": 3780 }, { "epoch": 2.42496, "grad_norm": 0.764980277323769, "learning_rate": 4.677575603572235e-07, "loss": 1.0037, "num_tokens": 457034119.0, "step": 3790 }, { "epoch": 2.43136, "grad_norm": 0.7689487131773513, "learning_rate": 4.5774847775501977e-07, "loss": 1.0215, "num_tokens": 458243443.0, "step": 3800 }, { "epoch": 2.43776, "grad_norm": 0.7835819207262276, "learning_rate": 4.4783684167296645e-07, "loss": 1.0107, "num_tokens": 459449656.0, "step": 3810 }, { "epoch": 2.44416, "grad_norm": 0.7439227301838608, "learning_rate": 4.38023125047152e-07, "loss": 1.0163, "num_tokens": 460660657.0, "step": 3820 }, { "epoch": 2.45056, "grad_norm": 0.8141456247124772, "learning_rate": 4.283077961414125e-07, "loss": 1.0073, "num_tokens": 461868305.0, "step": 3830 }, { "epoch": 2.45696, "grad_norm": 0.7873824030524625, "learning_rate": 4.186913185249936e-07, "loss": 1.0161, "num_tokens": 463067022.0, "step": 3840 }, { "epoch": 2.4633599999999998, "grad_norm": 0.7651257037667265, "learning_rate": 4.091741510504249e-07, "loss": 1.0054, "num_tokens": 464277276.0, "step": 3850 }, { "epoch": 2.46976, "grad_norm": 0.7817592356120844, "learning_rate": 3.9975674783163e-07, "loss": 1.0131, "num_tokens": 465486770.0, "step": 3860 }, { "epoch": 2.47616, "grad_norm": 0.7600628098450863, "learning_rate": 3.904395582222578e-07, "loss": 1.0, "num_tokens": 466688564.0, "step": 3870 }, { "epoch": 2.48256, "grad_norm": 0.7452814104047683, "learning_rate": 3.81223026794241e-07, "loss": 0.9948, "num_tokens": 467893407.0, "step": 3880 }, { "epoch": 2.48896, "grad_norm": 0.7886078128816824, "learning_rate": 3.721075933165816e-07, "loss": 1.0255, "num_tokens": 469103315.0, "step": 3890 }, { "epoch": 2.49536, "grad_norm": 0.7883279810476201, "learning_rate": 3.630936927343695e-07, "loss": 0.9955, "num_tokens": 470304536.0, "step": 3900 }, { "epoch": 2.50176, "grad_norm": 0.7870530493997763, "learning_rate": 3.541817551480292e-07, "loss": 1.0106, "num_tokens": 471516225.0, "step": 3910 }, { "epoch": 2.50816, "grad_norm": 0.7913988775198784, "learning_rate": 3.4537220579279497e-07, "loss": 1.0123, "num_tokens": 472723848.0, "step": 3920 }, { "epoch": 2.51456, "grad_norm": 0.788228042670068, "learning_rate": 3.366654650184217e-07, "loss": 1.0076, "num_tokens": 473927605.0, "step": 3930 }, { "epoch": 2.52096, "grad_norm": 0.7671091431259203, "learning_rate": 3.2806194826913107e-07, "loss": 1.0054, "num_tokens": 475130341.0, "step": 3940 }, { "epoch": 2.52736, "grad_norm": 0.7769242999032523, "learning_rate": 3.1956206606378186e-07, "loss": 1.0137, "num_tokens": 476337471.0, "step": 3950 }, { "epoch": 2.53376, "grad_norm": 0.7761725619806417, "learning_rate": 3.1116622397628886e-07, "loss": 1.0139, "num_tokens": 477546278.0, "step": 3960 }, { "epoch": 2.54016, "grad_norm": 0.8119517968358277, "learning_rate": 3.0287482261626727e-07, "loss": 1.0112, "num_tokens": 478748834.0, "step": 3970 }, { "epoch": 2.54656, "grad_norm": 0.7768387486408453, "learning_rate": 2.946882576099164e-07, "loss": 1.0176, "num_tokens": 479951666.0, "step": 3980 }, { "epoch": 2.55296, "grad_norm": 0.8059661577502851, "learning_rate": 2.8660691958114384e-07, "loss": 1.0192, "num_tokens": 481155740.0, "step": 3990 }, { "epoch": 2.55936, "grad_norm": 0.7923218074076707, "learning_rate": 2.786311941329298e-07, "loss": 1.0228, "num_tokens": 482362569.0, "step": 4000 }, { "epoch": 2.56576, "grad_norm": 0.7737100130087119, "learning_rate": 2.70761461828922e-07, "loss": 1.0117, "num_tokens": 483577083.0, "step": 4010 }, { "epoch": 2.5721600000000002, "grad_norm": 0.8198263737858525, "learning_rate": 2.629980981752803e-07, "loss": 1.0027, "num_tokens": 484785169.0, "step": 4020 }, { "epoch": 2.57856, "grad_norm": 0.7800117950292567, "learning_rate": 2.5534147360276014e-07, "loss": 1.0061, "num_tokens": 485992637.0, "step": 4030 }, { "epoch": 2.58496, "grad_norm": 0.7806994703813391, "learning_rate": 2.4779195344903447e-07, "loss": 1.0067, "num_tokens": 487200371.0, "step": 4040 }, { "epoch": 2.59136, "grad_norm": 0.7967832568550222, "learning_rate": 2.4034989794126494e-07, "loss": 1.005, "num_tokens": 488411438.0, "step": 4050 }, { "epoch": 2.59776, "grad_norm": 0.7613054637393943, "learning_rate": 2.3301566217891148e-07, "loss": 1.0057, "num_tokens": 489619089.0, "step": 4060 }, { "epoch": 2.6041600000000003, "grad_norm": 0.8097260832659626, "learning_rate": 2.257895961167886e-07, "loss": 1.0115, "num_tokens": 490822004.0, "step": 4070 }, { "epoch": 2.61056, "grad_norm": 0.7724807002861569, "learning_rate": 2.18672044548367e-07, "loss": 1.013, "num_tokens": 492031022.0, "step": 4080 }, { "epoch": 2.6169599999999997, "grad_norm": 0.769701738678788, "learning_rate": 2.1166334708932367e-07, "loss": 1.0097, "num_tokens": 493240890.0, "step": 4090 }, { "epoch": 2.62336, "grad_norm": 0.76114972582814, "learning_rate": 2.0476383816133594e-07, "loss": 1.0042, "num_tokens": 494453799.0, "step": 4100 }, { "epoch": 2.62976, "grad_norm": 0.8083057947332605, "learning_rate": 1.9797384697612277e-07, "loss": 1.0044, "num_tokens": 495667359.0, "step": 4110 }, { "epoch": 2.63616, "grad_norm": 0.7707158865091736, "learning_rate": 1.912936975197388e-07, "loss": 1.0073, "num_tokens": 496881814.0, "step": 4120 }, { "epoch": 2.64256, "grad_norm": 0.7676478517895791, "learning_rate": 1.8472370853711397e-07, "loss": 1.0187, "num_tokens": 498083665.0, "step": 4130 }, { "epoch": 2.6489599999999998, "grad_norm": 0.7728314364028435, "learning_rate": 1.7826419351684553e-07, "loss": 0.996, "num_tokens": 499285193.0, "step": 4140 }, { "epoch": 2.65536, "grad_norm": 0.7787493559807903, "learning_rate": 1.7191546067623772e-07, "loss": 0.9928, "num_tokens": 500495522.0, "step": 4150 }, { "epoch": 2.66176, "grad_norm": 0.7740957124528121, "learning_rate": 1.656778129465983e-07, "loss": 0.9942, "num_tokens": 501704772.0, "step": 4160 }, { "epoch": 2.66816, "grad_norm": 0.7834164164129861, "learning_rate": 1.5955154795878086e-07, "loss": 1.0018, "num_tokens": 502908159.0, "step": 4170 }, { "epoch": 2.67456, "grad_norm": 0.7690261436250733, "learning_rate": 1.5353695802898556e-07, "loss": 0.9966, "num_tokens": 504119578.0, "step": 4180 }, { "epoch": 2.68096, "grad_norm": 0.7500003508328252, "learning_rate": 1.4763433014481105e-07, "loss": 1.0175, "num_tokens": 505329761.0, "step": 4190 }, { "epoch": 2.68736, "grad_norm": 0.7619674427912766, "learning_rate": 1.4184394595155887e-07, "loss": 1.0084, "num_tokens": 506541089.0, "step": 4200 }, { "epoch": 2.69376, "grad_norm": 0.7905928509034632, "learning_rate": 1.3616608173879636e-07, "loss": 1.0077, "num_tokens": 507747398.0, "step": 4210 }, { "epoch": 2.70016, "grad_norm": 0.7768455409603942, "learning_rate": 1.3060100842717388e-07, "loss": 1.0211, "num_tokens": 508948926.0, "step": 4220 }, { "epoch": 2.70656, "grad_norm": 0.7650832573151034, "learning_rate": 1.2514899155549625e-07, "loss": 1.0033, "num_tokens": 510157051.0, "step": 4230 }, { "epoch": 2.71296, "grad_norm": 0.7847880941915708, "learning_rate": 1.1981029126805293e-07, "loss": 1.0025, "num_tokens": 511359623.0, "step": 4240 }, { "epoch": 2.71936, "grad_norm": 0.8047407028430222, "learning_rate": 1.1458516230220651e-07, "loss": 1.0056, "num_tokens": 512562364.0, "step": 4250 }, { "epoch": 2.72576, "grad_norm": 0.7894872635799464, "learning_rate": 1.0947385397623522e-07, "loss": 1.0062, "num_tokens": 513767195.0, "step": 4260 }, { "epoch": 2.73216, "grad_norm": 0.7754271372790722, "learning_rate": 1.0447661017743971e-07, "loss": 0.997, "num_tokens": 514974517.0, "step": 4270 }, { "epoch": 2.73856, "grad_norm": 0.7746425365371328, "learning_rate": 9.959366935050397e-08, "loss": 0.9987, "num_tokens": 516179935.0, "step": 4280 }, { "epoch": 2.74496, "grad_norm": 0.7523512554064233, "learning_rate": 9.482526448611807e-08, "loss": 1.0042, "num_tokens": 517387907.0, "step": 4290 }, { "epoch": 2.75136, "grad_norm": 0.7805940920378595, "learning_rate": 9.017162310986067e-08, "loss": 1.002, "num_tokens": 518595813.0, "step": 4300 }, { "epoch": 2.75776, "grad_norm": 0.8110259911998368, "learning_rate": 8.563296727134435e-08, "loss": 1.0066, "num_tokens": 519800375.0, "step": 4310 }, { "epoch": 2.76416, "grad_norm": 0.772256949618178, "learning_rate": 8.120951353361884e-08, "loss": 1.0045, "num_tokens": 521008297.0, "step": 4320 }, { "epoch": 2.77056, "grad_norm": 0.7629770251408482, "learning_rate": 7.690147296283757e-08, "loss": 1.0007, "num_tokens": 522217337.0, "step": 4330 }, { "epoch": 2.77696, "grad_norm": 0.750000751925906, "learning_rate": 7.270905111818744e-08, "loss": 1.0044, "num_tokens": 523427534.0, "step": 4340 }, { "epoch": 2.78336, "grad_norm": 0.7695523347419888, "learning_rate": 6.863244804208053e-08, "loss": 1.0185, "num_tokens": 524629610.0, "step": 4350 }, { "epoch": 2.7897600000000002, "grad_norm": 0.7594496702512009, "learning_rate": 6.467185825060728e-08, "loss": 1.0132, "num_tokens": 525838628.0, "step": 4360 }, { "epoch": 2.79616, "grad_norm": 0.774231464389687, "learning_rate": 6.082747072425844e-08, "loss": 0.9923, "num_tokens": 527047256.0, "step": 4370 }, { "epoch": 2.80256, "grad_norm": 0.7878028776389799, "learning_rate": 5.709946889890461e-08, "loss": 0.9989, "num_tokens": 528251412.0, "step": 4380 }, { "epoch": 2.80896, "grad_norm": 0.7680845271371904, "learning_rate": 5.348803065704483e-08, "loss": 0.9971, "num_tokens": 529460583.0, "step": 4390 }, { "epoch": 2.81536, "grad_norm": 0.7710477876974481, "learning_rate": 4.999332831931936e-08, "loss": 1.0097, "num_tokens": 530666949.0, "step": 4400 }, { "epoch": 2.8217600000000003, "grad_norm": 0.7641864260094089, "learning_rate": 4.6615528636286545e-08, "loss": 1.0083, "num_tokens": 531877350.0, "step": 4410 }, { "epoch": 2.82816, "grad_norm": 0.7798848074760067, "learning_rate": 4.3354792780467004e-08, "loss": 1.0145, "num_tokens": 533089968.0, "step": 4420 }, { "epoch": 2.8345599999999997, "grad_norm": 0.7987639919755114, "learning_rate": 4.021127633865196e-08, "loss": 1.0061, "num_tokens": 534295222.0, "step": 4430 }, { "epoch": 2.84096, "grad_norm": 0.7471470388574258, "learning_rate": 3.718512930448115e-08, "loss": 0.9897, "num_tokens": 535501172.0, "step": 4440 }, { "epoch": 2.84736, "grad_norm": 0.7784643844597081, "learning_rate": 3.4276496071284084e-08, "loss": 1.0126, "num_tokens": 536697925.0, "step": 4450 }, { "epoch": 2.85376, "grad_norm": 0.7972370799678196, "learning_rate": 3.148551542519196e-08, "loss": 1.0051, "num_tokens": 537893496.0, "step": 4460 }, { "epoch": 2.86016, "grad_norm": 0.7691284457736113, "learning_rate": 2.8812320538514348e-08, "loss": 1.0098, "num_tokens": 539102796.0, "step": 4470 }, { "epoch": 2.8665599999999998, "grad_norm": 0.7982125519739797, "learning_rate": 2.6257038963385106e-08, "loss": 1.0136, "num_tokens": 540316296.0, "step": 4480 }, { "epoch": 2.87296, "grad_norm": 0.7728520058232545, "learning_rate": 2.3819792625675297e-08, "loss": 1.0149, "num_tokens": 541533670.0, "step": 4490 }, { "epoch": 2.87936, "grad_norm": 0.7681197599600511, "learning_rate": 2.1500697819178406e-08, "loss": 1.0027, "num_tokens": 542738043.0, "step": 4500 }, { "epoch": 2.88576, "grad_norm": 0.7510549175746628, "learning_rate": 1.9299865200057556e-08, "loss": 1.0059, "num_tokens": 543947829.0, "step": 4510 }, { "epoch": 2.89216, "grad_norm": 0.766881311747473, "learning_rate": 1.721739978156778e-08, "loss": 1.0051, "num_tokens": 545163765.0, "step": 4520 }, { "epoch": 2.89856, "grad_norm": 0.8067610998392601, "learning_rate": 1.5253400929045036e-08, "loss": 0.9998, "num_tokens": 546371420.0, "step": 4530 }, { "epoch": 2.90496, "grad_norm": 0.7674069412891232, "learning_rate": 1.3407962355164728e-08, "loss": 1.0164, "num_tokens": 547577921.0, "step": 4540 }, { "epoch": 2.91136, "grad_norm": 0.775004069541473, "learning_rate": 1.1681172115469986e-08, "loss": 1.0034, "num_tokens": 548783680.0, "step": 4550 }, { "epoch": 2.91776, "grad_norm": 0.7833232261400477, "learning_rate": 1.007311260417032e-08, "loss": 0.9956, "num_tokens": 549988634.0, "step": 4560 }, { "epoch": 2.92416, "grad_norm": 0.7826676344415344, "learning_rate": 8.583860550210043e-09, "loss": 1.0098, "num_tokens": 551189799.0, "step": 4570 }, { "epoch": 2.93056, "grad_norm": 0.744986247926951, "learning_rate": 7.213487013607856e-09, "loss": 1.0035, "num_tokens": 552397598.0, "step": 4580 }, { "epoch": 2.93696, "grad_norm": 0.7521106648563647, "learning_rate": 5.96205738206429e-09, "loss": 1.0043, "num_tokens": 553610771.0, "step": 4590 }, { "epoch": 2.94336, "grad_norm": 0.8150061917429959, "learning_rate": 4.829631367844201e-09, "loss": 1.0, "num_tokens": 554824637.0, "step": 4600 }, { "epoch": 2.94976, "grad_norm": 0.772723595238506, "learning_rate": 3.816263004925991e-09, "loss": 1.0082, "num_tokens": 556030923.0, "step": 4610 }, { "epoch": 2.95616, "grad_norm": 0.768451723737756, "learning_rate": 2.922000646423118e-09, "loss": 0.9922, "num_tokens": 557231653.0, "step": 4620 }, { "epoch": 2.96256, "grad_norm": 0.7993486350591127, "learning_rate": 2.1468869622781608e-09, "loss": 1.0019, "num_tokens": 558442813.0, "step": 4630 }, { "epoch": 2.96896, "grad_norm": 1.1239957345324176, "learning_rate": 1.4909589372266719e-09, "loss": 1.001, "num_tokens": 559650373.0, "step": 4640 }, { "epoch": 2.9753600000000002, "grad_norm": 0.7957668006721109, "learning_rate": 9.542478690305335e-10, "loss": 1.0067, "num_tokens": 560855666.0, "step": 4650 }, { "epoch": 2.98176, "grad_norm": 0.8079239433679425, "learning_rate": 5.367793669874832e-10, "loss": 0.9969, "num_tokens": 562060878.0, "step": 4660 }, { "epoch": 2.98816, "grad_norm": 0.7720639449232606, "learning_rate": 2.385733507062615e-10, "loss": 1.0052, "num_tokens": 563260411.0, "step": 4670 }, { "epoch": 2.99456, "grad_norm": 0.7561377897632978, "learning_rate": 5.964404915903555e-11, "loss": 0.9991, "num_tokens": 564468049.0, "step": 4680 }, { "epoch": 3.0, "num_tokens": 565489014.0, "step": 4689, "total_flos": 722584728633344.0, "train_loss": 1.0774097926684294, "train_runtime": 15585.9875, "train_samples_per_second": 19.248, "train_steps_per_second": 0.301 } ], "logging_steps": 10, "max_steps": 4689, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 722584728633344.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }