| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 3.0, | |
| "eval_steps": 500, | |
| "global_step": 4689, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.0064, | |
| "grad_norm": 2.760316424583359, | |
| "learning_rate": 3.1914893617021275e-07, | |
| "loss": 1.6571, | |
| "num_tokens": 1208276.0, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.0128, | |
| "grad_norm": 1.9639395470560352, | |
| "learning_rate": 6.73758865248227e-07, | |
| "loss": 1.6372, | |
| "num_tokens": 2410446.0, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.0192, | |
| "grad_norm": 1.6857950160814903, | |
| "learning_rate": 1.0283687943262412e-06, | |
| "loss": 1.6138, | |
| "num_tokens": 3622536.0, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.0256, | |
| "grad_norm": 1.9355649405079267, | |
| "learning_rate": 1.3829787234042555e-06, | |
| "loss": 1.554, | |
| "num_tokens": 4837847.0, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.032, | |
| "grad_norm": 1.0134479979543427, | |
| "learning_rate": 1.7375886524822697e-06, | |
| "loss": 1.5138, | |
| "num_tokens": 6044886.0, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.0384, | |
| "grad_norm": 0.7097712225560386, | |
| "learning_rate": 2.092198581560284e-06, | |
| "loss": 1.4577, | |
| "num_tokens": 7255346.0, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.0448, | |
| "grad_norm": 0.7563602572113316, | |
| "learning_rate": 2.446808510638298e-06, | |
| "loss": 1.4239, | |
| "num_tokens": 8465627.0, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.0512, | |
| "grad_norm": 0.6411265148411116, | |
| "learning_rate": 2.8014184397163125e-06, | |
| "loss": 1.3857, | |
| "num_tokens": 9667266.0, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.0576, | |
| "grad_norm": 0.7071256376230877, | |
| "learning_rate": 3.1560283687943267e-06, | |
| "loss": 1.3736, | |
| "num_tokens": 10869831.0, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.064, | |
| "grad_norm": 0.7623180191305359, | |
| "learning_rate": 3.510638297872341e-06, | |
| "loss": 1.3722, | |
| "num_tokens": 12083093.0, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.0704, | |
| "grad_norm": 0.651385333897087, | |
| "learning_rate": 3.865248226950355e-06, | |
| "loss": 1.3468, | |
| "num_tokens": 13290331.0, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.0768, | |
| "grad_norm": 0.8706225351642094, | |
| "learning_rate": 4.219858156028369e-06, | |
| "loss": 1.3387, | |
| "num_tokens": 14488386.0, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.0832, | |
| "grad_norm": 0.84726755662717, | |
| "learning_rate": 4.574468085106383e-06, | |
| "loss": 1.3364, | |
| "num_tokens": 15690608.0, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.0896, | |
| "grad_norm": 0.8553144960607314, | |
| "learning_rate": 4.929078014184397e-06, | |
| "loss": 1.3207, | |
| "num_tokens": 16894120.0, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.096, | |
| "grad_norm": 0.6845288880044453, | |
| "learning_rate": 4.999961827753897e-06, | |
| "loss": 1.3072, | |
| "num_tokens": 18098866.0, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.1024, | |
| "grad_norm": 0.7060413425833653, | |
| "learning_rate": 4.999806755001946e-06, | |
| "loss": 1.293, | |
| "num_tokens": 19317515.0, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.1088, | |
| "grad_norm": 1.112301905134234, | |
| "learning_rate": 4.999532403372408e-06, | |
| "loss": 1.2933, | |
| "num_tokens": 20523986.0, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.1152, | |
| "grad_norm": 0.7057273926728088, | |
| "learning_rate": 4.9991387859560365e-06, | |
| "loss": 1.3105, | |
| "num_tokens": 21730204.0, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.1216, | |
| "grad_norm": 0.7046621457199816, | |
| "learning_rate": 4.9986259215343814e-06, | |
| "loss": 1.3036, | |
| "num_tokens": 22941629.0, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.128, | |
| "grad_norm": 0.6753839003505228, | |
| "learning_rate": 4.997993834578891e-06, | |
| "loss": 1.2837, | |
| "num_tokens": 24149743.0, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.1344, | |
| "grad_norm": 0.6833117540920727, | |
| "learning_rate": 4.997242555249746e-06, | |
| "loss": 1.2798, | |
| "num_tokens": 25350421.0, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.1408, | |
| "grad_norm": 0.7496565711502305, | |
| "learning_rate": 4.996372119394418e-06, | |
| "loss": 1.2872, | |
| "num_tokens": 26553851.0, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.1472, | |
| "grad_norm": 0.8257784450438341, | |
| "learning_rate": 4.9953825685459635e-06, | |
| "loss": 1.2715, | |
| "num_tokens": 27756494.0, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.1536, | |
| "grad_norm": 0.8586750458312551, | |
| "learning_rate": 4.994273949921038e-06, | |
| "loss": 1.273, | |
| "num_tokens": 28966311.0, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "grad_norm": 0.8942167127143708, | |
| "learning_rate": 4.993046316417643e-06, | |
| "loss": 1.2615, | |
| "num_tokens": 30165165.0, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.1664, | |
| "grad_norm": 0.7320667303892974, | |
| "learning_rate": 4.991699726612607e-06, | |
| "loss": 1.2598, | |
| "num_tokens": 31372687.0, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.1728, | |
| "grad_norm": 0.7759159652826615, | |
| "learning_rate": 4.990234244758785e-06, | |
| "loss": 1.2378, | |
| "num_tokens": 32578240.0, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.1792, | |
| "grad_norm": 0.7081937298786585, | |
| "learning_rate": 4.988649940781992e-06, | |
| "loss": 1.2496, | |
| "num_tokens": 33788704.0, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.1856, | |
| "grad_norm": 0.8354872354621143, | |
| "learning_rate": 4.986946890277673e-06, | |
| "loss": 1.239, | |
| "num_tokens": 34992041.0, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.192, | |
| "grad_norm": 0.7419306542972816, | |
| "learning_rate": 4.9851251745072905e-06, | |
| "loss": 1.2334, | |
| "num_tokens": 36202424.0, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.1984, | |
| "grad_norm": 0.8124424043952861, | |
| "learning_rate": 4.983184880394447e-06, | |
| "loss": 1.2423, | |
| "num_tokens": 37406998.0, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.2048, | |
| "grad_norm": 0.9137121442594122, | |
| "learning_rate": 4.981126100520743e-06, | |
| "loss": 1.2398, | |
| "num_tokens": 38614024.0, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.2112, | |
| "grad_norm": 0.8692171799253517, | |
| "learning_rate": 4.978948933121351e-06, | |
| "loss": 1.2274, | |
| "num_tokens": 39818938.0, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.2176, | |
| "grad_norm": 0.7959433307352174, | |
| "learning_rate": 4.976653482080335e-06, | |
| "loss": 1.2432, | |
| "num_tokens": 41029985.0, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.224, | |
| "grad_norm": 0.9183385731990914, | |
| "learning_rate": 4.97423985692569e-06, | |
| "loss": 1.2183, | |
| "num_tokens": 42241595.0, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.2304, | |
| "grad_norm": 0.8800279308744207, | |
| "learning_rate": 4.97170817282412e-06, | |
| "loss": 1.2174, | |
| "num_tokens": 43436994.0, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.2368, | |
| "grad_norm": 0.8482042891364965, | |
| "learning_rate": 4.969058550575535e-06, | |
| "loss": 1.214, | |
| "num_tokens": 44649051.0, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.2432, | |
| "grad_norm": 0.8597854654288322, | |
| "learning_rate": 4.966291116607297e-06, | |
| "loss": 1.2105, | |
| "num_tokens": 45857075.0, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.2496, | |
| "grad_norm": 0.8904371734549302, | |
| "learning_rate": 4.96340600296818e-06, | |
| "loss": 1.1976, | |
| "num_tokens": 47059498.0, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.256, | |
| "grad_norm": 0.864096324906862, | |
| "learning_rate": 4.960403347322069e-06, | |
| "loss": 1.2067, | |
| "num_tokens": 48273286.0, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.2624, | |
| "grad_norm": 0.8417001685001565, | |
| "learning_rate": 4.957283292941401e-06, | |
| "loss": 1.2012, | |
| "num_tokens": 49479835.0, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 0.2688, | |
| "grad_norm": 0.8738206939182319, | |
| "learning_rate": 4.954045988700315e-06, | |
| "loss": 1.2081, | |
| "num_tokens": 50692484.0, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.2752, | |
| "grad_norm": 0.9214341760640065, | |
| "learning_rate": 4.9506915890675566e-06, | |
| "loss": 1.1982, | |
| "num_tokens": 51904151.0, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 0.2816, | |
| "grad_norm": 0.8270044046785595, | |
| "learning_rate": 4.94722025409911e-06, | |
| "loss": 1.2003, | |
| "num_tokens": 53107439.0, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 0.288, | |
| "grad_norm": 0.9325298797380837, | |
| "learning_rate": 4.943632149430552e-06, | |
| "loss": 1.1934, | |
| "num_tokens": 54311802.0, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.2944, | |
| "grad_norm": 0.8173318542721012, | |
| "learning_rate": 4.9399274462691555e-06, | |
| "loss": 1.183, | |
| "num_tokens": 55516169.0, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 0.3008, | |
| "grad_norm": 0.8403372189641363, | |
| "learning_rate": 4.93610632138572e-06, | |
| "loss": 1.2011, | |
| "num_tokens": 56720582.0, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 0.3072, | |
| "grad_norm": 0.9133683374494203, | |
| "learning_rate": 4.9321689571061314e-06, | |
| "loss": 1.1863, | |
| "num_tokens": 57923305.0, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 0.3136, | |
| "grad_norm": 0.8342006897685076, | |
| "learning_rate": 4.928115541302672e-06, | |
| "loss": 1.1789, | |
| "num_tokens": 59119131.0, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "grad_norm": 0.9237208555707096, | |
| "learning_rate": 4.923946267385043e-06, | |
| "loss": 1.1823, | |
| "num_tokens": 60323216.0, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.3264, | |
| "grad_norm": 1.138961215949811, | |
| "learning_rate": 4.91966133429115e-06, | |
| "loss": 1.1849, | |
| "num_tokens": 61536243.0, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 0.3328, | |
| "grad_norm": 0.8179215725319021, | |
| "learning_rate": 4.915260946477601e-06, | |
| "loss": 1.1689, | |
| "num_tokens": 62725558.0, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 0.3392, | |
| "grad_norm": 0.8196458509991646, | |
| "learning_rate": 4.910745313909953e-06, | |
| "loss": 1.1754, | |
| "num_tokens": 63929035.0, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 0.3456, | |
| "grad_norm": 0.8606903543941481, | |
| "learning_rate": 4.906114652052694e-06, | |
| "loss": 1.1608, | |
| "num_tokens": 65137799.0, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 0.352, | |
| "grad_norm": 0.842427893289404, | |
| "learning_rate": 4.9013691818589635e-06, | |
| "loss": 1.176, | |
| "num_tokens": 66343119.0, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.3584, | |
| "grad_norm": 0.9536458222010928, | |
| "learning_rate": 4.896509129760008e-06, | |
| "loss": 1.1766, | |
| "num_tokens": 67554625.0, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 0.3648, | |
| "grad_norm": 0.8456584910416223, | |
| "learning_rate": 4.891534727654374e-06, | |
| "loss": 1.1704, | |
| "num_tokens": 68767553.0, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 0.3712, | |
| "grad_norm": 0.825023352714185, | |
| "learning_rate": 4.886446212896853e-06, | |
| "loss": 1.1662, | |
| "num_tokens": 69977707.0, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 0.3776, | |
| "grad_norm": 0.8327520829988985, | |
| "learning_rate": 4.881243828287141e-06, | |
| "loss": 1.1715, | |
| "num_tokens": 71189476.0, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 0.384, | |
| "grad_norm": 0.840077866672345, | |
| "learning_rate": 4.875927822058265e-06, | |
| "loss": 1.1711, | |
| "num_tokens": 72395847.0, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.3904, | |
| "grad_norm": 0.8253947193633453, | |
| "learning_rate": 4.870498447864735e-06, | |
| "loss": 1.1439, | |
| "num_tokens": 73594932.0, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 0.3968, | |
| "grad_norm": 0.9212419524845424, | |
| "learning_rate": 4.864955964770442e-06, | |
| "loss": 1.1643, | |
| "num_tokens": 74802657.0, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 0.4032, | |
| "grad_norm": 0.9296250658068028, | |
| "learning_rate": 4.859300637236289e-06, | |
| "loss": 1.1534, | |
| "num_tokens": 76011529.0, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 0.4096, | |
| "grad_norm": 1.057634627530951, | |
| "learning_rate": 4.853532735107587e-06, | |
| "loss": 1.1507, | |
| "num_tokens": 77210334.0, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 0.416, | |
| "grad_norm": 0.8097939416205123, | |
| "learning_rate": 4.847652533601164e-06, | |
| "loss": 1.1395, | |
| "num_tokens": 78425328.0, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 0.4224, | |
| "grad_norm": 0.8447649876579609, | |
| "learning_rate": 4.8416603132922425e-06, | |
| "loss": 1.1378, | |
| "num_tokens": 79638521.0, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 0.4288, | |
| "grad_norm": 0.9421170322416722, | |
| "learning_rate": 4.83555636010105e-06, | |
| "loss": 1.1349, | |
| "num_tokens": 80836868.0, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 0.4352, | |
| "grad_norm": 0.9009555407016511, | |
| "learning_rate": 4.829340965279173e-06, | |
| "loss": 1.1482, | |
| "num_tokens": 82050746.0, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 0.4416, | |
| "grad_norm": 0.9304718962620818, | |
| "learning_rate": 4.823014425395662e-06, | |
| "loss": 1.1535, | |
| "num_tokens": 83256247.0, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 0.448, | |
| "grad_norm": 0.8268029795401431, | |
| "learning_rate": 4.816577042322883e-06, | |
| "loss": 1.1625, | |
| "num_tokens": 84466963.0, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.4544, | |
| "grad_norm": 0.8118838757785675, | |
| "learning_rate": 4.810029123222109e-06, | |
| "loss": 1.1582, | |
| "num_tokens": 85668747.0, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 0.4608, | |
| "grad_norm": 0.8191391458452703, | |
| "learning_rate": 4.803370980528868e-06, | |
| "loss": 1.1508, | |
| "num_tokens": 86869314.0, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 0.4672, | |
| "grad_norm": 0.8573356891805307, | |
| "learning_rate": 4.796602931938031e-06, | |
| "loss": 1.1367, | |
| "num_tokens": 88072166.0, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 0.4736, | |
| "grad_norm": 0.9130087766709583, | |
| "learning_rate": 4.789725300388658e-06, | |
| "loss": 1.1496, | |
| "num_tokens": 89276560.0, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 0.48, | |
| "grad_norm": 0.8756224792489176, | |
| "learning_rate": 4.782738414048581e-06, | |
| "loss": 1.1387, | |
| "num_tokens": 90489167.0, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 0.4864, | |
| "grad_norm": 0.8660533049576743, | |
| "learning_rate": 4.775642606298758e-06, | |
| "loss": 1.1293, | |
| "num_tokens": 91699027.0, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 0.4928, | |
| "grad_norm": 0.9344747635312723, | |
| "learning_rate": 4.7684382157173515e-06, | |
| "loss": 1.1544, | |
| "num_tokens": 92907904.0, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 0.4992, | |
| "grad_norm": 0.8232769483557345, | |
| "learning_rate": 4.761125586063583e-06, | |
| "loss": 1.1509, | |
| "num_tokens": 94108258.0, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 0.5056, | |
| "grad_norm": 0.8019044034927749, | |
| "learning_rate": 4.753705066261326e-06, | |
| "loss": 1.142, | |
| "num_tokens": 95319591.0, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 0.512, | |
| "grad_norm": 0.8744491818182848, | |
| "learning_rate": 4.74617701038246e-06, | |
| "loss": 1.1407, | |
| "num_tokens": 96527466.0, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.5184, | |
| "grad_norm": 0.8457377069978257, | |
| "learning_rate": 4.738541777629971e-06, | |
| "loss": 1.1454, | |
| "num_tokens": 97741955.0, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 0.5248, | |
| "grad_norm": 0.8367461594303044, | |
| "learning_rate": 4.730799732320819e-06, | |
| "loss": 1.1499, | |
| "num_tokens": 98947846.0, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 0.5312, | |
| "grad_norm": 0.8153933334854007, | |
| "learning_rate": 4.722951243868547e-06, | |
| "loss": 1.1338, | |
| "num_tokens": 100149443.0, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 0.5376, | |
| "grad_norm": 0.9553883385280855, | |
| "learning_rate": 4.7149966867656625e-06, | |
| "loss": 1.1239, | |
| "num_tokens": 101354489.0, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 0.544, | |
| "grad_norm": 0.8020256868069202, | |
| "learning_rate": 4.706936440565759e-06, | |
| "loss": 1.1233, | |
| "num_tokens": 102561908.0, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 0.5504, | |
| "grad_norm": 0.8506848444686664, | |
| "learning_rate": 4.698770889865414e-06, | |
| "loss": 1.1314, | |
| "num_tokens": 103765389.0, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 0.5568, | |
| "grad_norm": 0.8931807739845334, | |
| "learning_rate": 4.690500424285833e-06, | |
| "loss": 1.1367, | |
| "num_tokens": 104973326.0, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 0.5632, | |
| "grad_norm": 0.8498884776316712, | |
| "learning_rate": 4.682125438454261e-06, | |
| "loss": 1.1329, | |
| "num_tokens": 106184942.0, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 0.5696, | |
| "grad_norm": 0.8866656591752357, | |
| "learning_rate": 4.673646331985151e-06, | |
| "loss": 1.1469, | |
| "num_tokens": 107391403.0, | |
| "step": 890 | |
| }, | |
| { | |
| "epoch": 0.576, | |
| "grad_norm": 0.8247486140289442, | |
| "learning_rate": 4.665063509461098e-06, | |
| "loss": 1.1304, | |
| "num_tokens": 108599244.0, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.5824, | |
| "grad_norm": 0.8509584195104843, | |
| "learning_rate": 4.6563773804135305e-06, | |
| "loss": 1.1205, | |
| "num_tokens": 109802767.0, | |
| "step": 910 | |
| }, | |
| { | |
| "epoch": 0.5888, | |
| "grad_norm": 0.9532478448654986, | |
| "learning_rate": 4.647588359303178e-06, | |
| "loss": 1.135, | |
| "num_tokens": 111002144.0, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 0.5952, | |
| "grad_norm": 0.795143766492276, | |
| "learning_rate": 4.638696865500284e-06, | |
| "loss": 1.133, | |
| "num_tokens": 112202360.0, | |
| "step": 930 | |
| }, | |
| { | |
| "epoch": 0.6016, | |
| "grad_norm": 0.8884950967785606, | |
| "learning_rate": 4.629703323264605e-06, | |
| "loss": 1.1174, | |
| "num_tokens": 113410661.0, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 0.608, | |
| "grad_norm": 0.8094095645216874, | |
| "learning_rate": 4.62060816172516e-06, | |
| "loss": 1.1359, | |
| "num_tokens": 114615154.0, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 0.6144, | |
| "grad_norm": 0.8517004319099382, | |
| "learning_rate": 4.611411814859758e-06, | |
| "loss": 1.1141, | |
| "num_tokens": 115826696.0, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 0.6208, | |
| "grad_norm": 0.8739388391386897, | |
| "learning_rate": 4.602114721474293e-06, | |
| "loss": 1.1204, | |
| "num_tokens": 117030663.0, | |
| "step": 970 | |
| }, | |
| { | |
| "epoch": 0.6272, | |
| "grad_norm": 1.0126603878935398, | |
| "learning_rate": 4.592717325181798e-06, | |
| "loss": 1.1259, | |
| "num_tokens": 118243461.0, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 0.6336, | |
| "grad_norm": 0.7961249459761912, | |
| "learning_rate": 4.583220074381288e-06, | |
| "loss": 1.1105, | |
| "num_tokens": 119444400.0, | |
| "step": 990 | |
| }, | |
| { | |
| "epoch": 0.64, | |
| "grad_norm": 0.8547801323336933, | |
| "learning_rate": 4.573623422236359e-06, | |
| "loss": 1.1247, | |
| "num_tokens": 120646721.0, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.6464, | |
| "grad_norm": 0.8827343366608609, | |
| "learning_rate": 4.563927826653562e-06, | |
| "loss": 1.1381, | |
| "num_tokens": 121856814.0, | |
| "step": 1010 | |
| }, | |
| { | |
| "epoch": 0.6528, | |
| "grad_norm": 0.8379604515543791, | |
| "learning_rate": 4.554133750260561e-06, | |
| "loss": 1.1038, | |
| "num_tokens": 123063137.0, | |
| "step": 1020 | |
| }, | |
| { | |
| "epoch": 0.6592, | |
| "grad_norm": 0.9009991930297082, | |
| "learning_rate": 4.544241660384057e-06, | |
| "loss": 1.1351, | |
| "num_tokens": 124281752.0, | |
| "step": 1030 | |
| }, | |
| { | |
| "epoch": 0.6656, | |
| "grad_norm": 0.9398290903202526, | |
| "learning_rate": 4.534252029027485e-06, | |
| "loss": 1.132, | |
| "num_tokens": 125483927.0, | |
| "step": 1040 | |
| }, | |
| { | |
| "epoch": 0.672, | |
| "grad_norm": 0.8135458599046622, | |
| "learning_rate": 4.5241653328484965e-06, | |
| "loss": 1.1137, | |
| "num_tokens": 126688041.0, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 0.6784, | |
| "grad_norm": 0.826631698433715, | |
| "learning_rate": 4.5139820531362125e-06, | |
| "loss": 1.1149, | |
| "num_tokens": 127895497.0, | |
| "step": 1060 | |
| }, | |
| { | |
| "epoch": 0.6848, | |
| "grad_norm": 0.8326760862617015, | |
| "learning_rate": 4.503702675788263e-06, | |
| "loss": 1.1082, | |
| "num_tokens": 129093768.0, | |
| "step": 1070 | |
| }, | |
| { | |
| "epoch": 0.6912, | |
| "grad_norm": 0.8187909661973681, | |
| "learning_rate": 4.493327691287596e-06, | |
| "loss": 1.1213, | |
| "num_tokens": 130296941.0, | |
| "step": 1080 | |
| }, | |
| { | |
| "epoch": 0.6976, | |
| "grad_norm": 0.8758642744013126, | |
| "learning_rate": 4.482857594679082e-06, | |
| "loss": 1.1169, | |
| "num_tokens": 131499785.0, | |
| "step": 1090 | |
| }, | |
| { | |
| "epoch": 0.704, | |
| "grad_norm": 0.9756017880226009, | |
| "learning_rate": 4.472292885545887e-06, | |
| "loss": 1.1182, | |
| "num_tokens": 132704447.0, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.7104, | |
| "grad_norm": 0.9918470716003941, | |
| "learning_rate": 4.4616340679856344e-06, | |
| "loss": 1.112, | |
| "num_tokens": 133914148.0, | |
| "step": 1110 | |
| }, | |
| { | |
| "epoch": 0.7168, | |
| "grad_norm": 0.7736509572616426, | |
| "learning_rate": 4.450881650586354e-06, | |
| "loss": 1.0948, | |
| "num_tokens": 135116690.0, | |
| "step": 1120 | |
| }, | |
| { | |
| "epoch": 0.7232, | |
| "grad_norm": 0.8393996918370894, | |
| "learning_rate": 4.440036146402218e-06, | |
| "loss": 1.1196, | |
| "num_tokens": 136325534.0, | |
| "step": 1130 | |
| }, | |
| { | |
| "epoch": 0.7296, | |
| "grad_norm": 0.8283036410858456, | |
| "learning_rate": 4.429098072929052e-06, | |
| "loss": 1.1249, | |
| "num_tokens": 137532058.0, | |
| "step": 1140 | |
| }, | |
| { | |
| "epoch": 0.736, | |
| "grad_norm": 1.0272561438627168, | |
| "learning_rate": 4.418067952079651e-06, | |
| "loss": 1.0894, | |
| "num_tokens": 138742925.0, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 0.7424, | |
| "grad_norm": 0.9457224166686296, | |
| "learning_rate": 4.40694631015887e-06, | |
| "loss": 1.1072, | |
| "num_tokens": 139944361.0, | |
| "step": 1160 | |
| }, | |
| { | |
| "epoch": 0.7488, | |
| "grad_norm": 0.8472242869303449, | |
| "learning_rate": 4.395733677838515e-06, | |
| "loss": 1.104, | |
| "num_tokens": 141145139.0, | |
| "step": 1170 | |
| }, | |
| { | |
| "epoch": 0.7552, | |
| "grad_norm": 0.8369893067934512, | |
| "learning_rate": 4.384430590132023e-06, | |
| "loss": 1.1167, | |
| "num_tokens": 142348857.0, | |
| "step": 1180 | |
| }, | |
| { | |
| "epoch": 0.7616, | |
| "grad_norm": 0.9417838753194914, | |
| "learning_rate": 4.373037586368925e-06, | |
| "loss": 1.0952, | |
| "num_tokens": 143560823.0, | |
| "step": 1190 | |
| }, | |
| { | |
| "epoch": 0.768, | |
| "grad_norm": 0.83199280244184, | |
| "learning_rate": 4.361555210169126e-06, | |
| "loss": 1.0969, | |
| "num_tokens": 144770576.0, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.7744, | |
| "grad_norm": 0.8757783495810086, | |
| "learning_rate": 4.349984009416952e-06, | |
| "loss": 1.0948, | |
| "num_tokens": 145978862.0, | |
| "step": 1210 | |
| }, | |
| { | |
| "epoch": 0.7808, | |
| "grad_norm": 0.8374080168936522, | |
| "learning_rate": 4.3383245362350174e-06, | |
| "loss": 1.1087, | |
| "num_tokens": 147191743.0, | |
| "step": 1220 | |
| }, | |
| { | |
| "epoch": 0.7872, | |
| "grad_norm": 0.8702169752217432, | |
| "learning_rate": 4.326577346957876e-06, | |
| "loss": 1.1099, | |
| "num_tokens": 148399289.0, | |
| "step": 1230 | |
| }, | |
| { | |
| "epoch": 0.7936, | |
| "grad_norm": 0.8016984816166285, | |
| "learning_rate": 4.314743002105473e-06, | |
| "loss": 1.1052, | |
| "num_tokens": 149602404.0, | |
| "step": 1240 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "grad_norm": 1.0811796381892176, | |
| "learning_rate": 4.302822066356408e-06, | |
| "loss": 1.0996, | |
| "num_tokens": 150811734.0, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 0.8064, | |
| "grad_norm": 0.8374755480022819, | |
| "learning_rate": 4.290815108520982e-06, | |
| "loss": 1.1185, | |
| "num_tokens": 152011294.0, | |
| "step": 1260 | |
| }, | |
| { | |
| "epoch": 0.8128, | |
| "grad_norm": 0.7904368039438139, | |
| "learning_rate": 4.278722701514061e-06, | |
| "loss": 1.0992, | |
| "num_tokens": 153217258.0, | |
| "step": 1270 | |
| }, | |
| { | |
| "epoch": 0.8192, | |
| "grad_norm": 0.785661611999425, | |
| "learning_rate": 4.266545422327741e-06, | |
| "loss": 1.1208, | |
| "num_tokens": 154419838.0, | |
| "step": 1280 | |
| }, | |
| { | |
| "epoch": 0.8256, | |
| "grad_norm": 0.8439322755320521, | |
| "learning_rate": 4.254283852003813e-06, | |
| "loss": 1.1091, | |
| "num_tokens": 155626578.0, | |
| "step": 1290 | |
| }, | |
| { | |
| "epoch": 0.832, | |
| "grad_norm": 0.8732275622995317, | |
| "learning_rate": 4.241938575606038e-06, | |
| "loss": 1.0826, | |
| "num_tokens": 156825805.0, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 0.8384, | |
| "grad_norm": 0.8014980196902037, | |
| "learning_rate": 4.229510182192235e-06, | |
| "loss": 1.1093, | |
| "num_tokens": 158037877.0, | |
| "step": 1310 | |
| }, | |
| { | |
| "epoch": 0.8448, | |
| "grad_norm": 0.8106302375207448, | |
| "learning_rate": 4.216999264786169e-06, | |
| "loss": 1.1073, | |
| "num_tokens": 159245106.0, | |
| "step": 1320 | |
| }, | |
| { | |
| "epoch": 0.8512, | |
| "grad_norm": 0.9385310776537238, | |
| "learning_rate": 4.204406420349259e-06, | |
| "loss": 1.1056, | |
| "num_tokens": 160456114.0, | |
| "step": 1330 | |
| }, | |
| { | |
| "epoch": 0.8576, | |
| "grad_norm": 0.9579249297784465, | |
| "learning_rate": 4.191732249752092e-06, | |
| "loss": 1.1021, | |
| "num_tokens": 161659510.0, | |
| "step": 1340 | |
| }, | |
| { | |
| "epoch": 0.864, | |
| "grad_norm": 0.8134490186326385, | |
| "learning_rate": 4.178977357745749e-06, | |
| "loss": 1.0821, | |
| "num_tokens": 162865495.0, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 0.8704, | |
| "grad_norm": 0.7943299269230713, | |
| "learning_rate": 4.166142352932957e-06, | |
| "loss": 1.1065, | |
| "num_tokens": 164069925.0, | |
| "step": 1360 | |
| }, | |
| { | |
| "epoch": 0.8768, | |
| "grad_norm": 0.8171116530483417, | |
| "learning_rate": 4.153227847739041e-06, | |
| "loss": 1.0873, | |
| "num_tokens": 165272777.0, | |
| "step": 1370 | |
| }, | |
| { | |
| "epoch": 0.8832, | |
| "grad_norm": 0.8472827858602203, | |
| "learning_rate": 4.140234458382708e-06, | |
| "loss": 1.1207, | |
| "num_tokens": 166473564.0, | |
| "step": 1380 | |
| }, | |
| { | |
| "epoch": 0.8896, | |
| "grad_norm": 0.8254355045966608, | |
| "learning_rate": 4.12716280484664e-06, | |
| "loss": 1.093, | |
| "num_tokens": 167678209.0, | |
| "step": 1390 | |
| }, | |
| { | |
| "epoch": 0.896, | |
| "grad_norm": 0.8238773032302608, | |
| "learning_rate": 4.114013510847914e-06, | |
| "loss": 1.1004, | |
| "num_tokens": 168879199.0, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 0.9024, | |
| "grad_norm": 0.8035266067408213, | |
| "learning_rate": 4.100787203808241e-06, | |
| "loss": 1.09, | |
| "num_tokens": 170089062.0, | |
| "step": 1410 | |
| }, | |
| { | |
| "epoch": 0.9088, | |
| "grad_norm": 0.796684651593008, | |
| "learning_rate": 4.0874845148240265e-06, | |
| "loss": 1.0923, | |
| "num_tokens": 171298354.0, | |
| "step": 1420 | |
| }, | |
| { | |
| "epoch": 0.9152, | |
| "grad_norm": 0.7944378162845194, | |
| "learning_rate": 4.074106078636259e-06, | |
| "loss": 1.0877, | |
| "num_tokens": 172502932.0, | |
| "step": 1430 | |
| }, | |
| { | |
| "epoch": 0.9216, | |
| "grad_norm": 0.8222630499336689, | |
| "learning_rate": 4.0606525336002215e-06, | |
| "loss": 1.1069, | |
| "num_tokens": 173714359.0, | |
| "step": 1440 | |
| }, | |
| { | |
| "epoch": 0.928, | |
| "grad_norm": 0.8284462145945989, | |
| "learning_rate": 4.047124521655037e-06, | |
| "loss": 1.1063, | |
| "num_tokens": 174915024.0, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 0.9344, | |
| "grad_norm": 1.1184143246349953, | |
| "learning_rate": 4.033522688293033e-06, | |
| "loss": 1.0958, | |
| "num_tokens": 176121314.0, | |
| "step": 1460 | |
| }, | |
| { | |
| "epoch": 0.9408, | |
| "grad_norm": 0.9302956644371011, | |
| "learning_rate": 4.019847682528943e-06, | |
| "loss": 1.1057, | |
| "num_tokens": 177329003.0, | |
| "step": 1470 | |
| }, | |
| { | |
| "epoch": 0.9472, | |
| "grad_norm": 0.8315189293207337, | |
| "learning_rate": 4.00610015686894e-06, | |
| "loss": 1.1021, | |
| "num_tokens": 178533383.0, | |
| "step": 1480 | |
| }, | |
| { | |
| "epoch": 0.9536, | |
| "grad_norm": 0.780029339050911, | |
| "learning_rate": 3.9922807672795015e-06, | |
| "loss": 1.1022, | |
| "num_tokens": 179737544.0, | |
| "step": 1490 | |
| }, | |
| { | |
| "epoch": 0.96, | |
| "grad_norm": 0.8861787669753409, | |
| "learning_rate": 3.97839017315611e-06, | |
| "loss": 1.1033, | |
| "num_tokens": 180941884.0, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.9664, | |
| "grad_norm": 0.8613329501244571, | |
| "learning_rate": 3.964429037291785e-06, | |
| "loss": 1.0932, | |
| "num_tokens": 182147995.0, | |
| "step": 1510 | |
| }, | |
| { | |
| "epoch": 0.9728, | |
| "grad_norm": 0.7767446273299125, | |
| "learning_rate": 3.950398025845469e-06, | |
| "loss": 1.0764, | |
| "num_tokens": 183351238.0, | |
| "step": 1520 | |
| }, | |
| { | |
| "epoch": 0.9792, | |
| "grad_norm": 0.7800388177467502, | |
| "learning_rate": 3.936297808310229e-06, | |
| "loss": 1.0955, | |
| "num_tokens": 184559744.0, | |
| "step": 1530 | |
| }, | |
| { | |
| "epoch": 0.9856, | |
| "grad_norm": 0.822587499260109, | |
| "learning_rate": 3.9221290574813205e-06, | |
| "loss": 1.101, | |
| "num_tokens": 185771261.0, | |
| "step": 1540 | |
| }, | |
| { | |
| "epoch": 0.992, | |
| "grad_norm": 0.7842833667912362, | |
| "learning_rate": 3.907892449424081e-06, | |
| "loss": 1.0858, | |
| "num_tokens": 186988878.0, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 0.9984, | |
| "grad_norm": 0.875565650877801, | |
| "learning_rate": 3.893588663441669e-06, | |
| "loss": 1.1096, | |
| "num_tokens": 188198614.0, | |
| "step": 1560 | |
| }, | |
| { | |
| "epoch": 1.00448, | |
| "grad_norm": 0.9833099796256903, | |
| "learning_rate": 3.8792183820426575e-06, | |
| "loss": 1.0518, | |
| "num_tokens": 189338860.0, | |
| "step": 1570 | |
| }, | |
| { | |
| "epoch": 1.01088, | |
| "grad_norm": 0.9539211061323496, | |
| "learning_rate": 3.864782290908462e-06, | |
| "loss": 1.0558, | |
| "num_tokens": 190541615.0, | |
| "step": 1580 | |
| }, | |
| { | |
| "epoch": 1.01728, | |
| "grad_norm": 0.8277557093113368, | |
| "learning_rate": 3.850281078860627e-06, | |
| "loss": 1.0672, | |
| "num_tokens": 191744590.0, | |
| "step": 1590 | |
| }, | |
| { | |
| "epoch": 1.02368, | |
| "grad_norm": 0.8095245034674352, | |
| "learning_rate": 3.835715437827954e-06, | |
| "loss": 1.0555, | |
| "num_tokens": 192946831.0, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 1.03008, | |
| "grad_norm": 0.8670205092911757, | |
| "learning_rate": 3.821086062813492e-06, | |
| "loss": 1.0558, | |
| "num_tokens": 194153241.0, | |
| "step": 1610 | |
| }, | |
| { | |
| "epoch": 1.03648, | |
| "grad_norm": 0.8041612181651476, | |
| "learning_rate": 3.806393651861372e-06, | |
| "loss": 1.0713, | |
| "num_tokens": 195361386.0, | |
| "step": 1620 | |
| }, | |
| { | |
| "epoch": 1.04288, | |
| "grad_norm": 0.8201672913405339, | |
| "learning_rate": 3.7916389060234964e-06, | |
| "loss": 1.0612, | |
| "num_tokens": 196570539.0, | |
| "step": 1630 | |
| }, | |
| { | |
| "epoch": 1.04928, | |
| "grad_norm": 0.822814114472732, | |
| "learning_rate": 3.776822529326097e-06, | |
| "loss": 1.0643, | |
| "num_tokens": 197758018.0, | |
| "step": 1640 | |
| }, | |
| { | |
| "epoch": 1.05568, | |
| "grad_norm": 0.8405563342503541, | |
| "learning_rate": 3.7619452287361306e-06, | |
| "loss": 1.0576, | |
| "num_tokens": 198962473.0, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 1.06208, | |
| "grad_norm": 0.8733811946067399, | |
| "learning_rate": 3.7470077141275578e-06, | |
| "loss": 1.0602, | |
| "num_tokens": 200168404.0, | |
| "step": 1660 | |
| }, | |
| { | |
| "epoch": 1.06848, | |
| "grad_norm": 0.7810891863766373, | |
| "learning_rate": 3.732010698247463e-06, | |
| "loss": 1.0429, | |
| "num_tokens": 201383921.0, | |
| "step": 1670 | |
| }, | |
| { | |
| "epoch": 1.07488, | |
| "grad_norm": 0.8253121322208729, | |
| "learning_rate": 3.7169548966820466e-06, | |
| "loss": 1.069, | |
| "num_tokens": 202590191.0, | |
| "step": 1680 | |
| }, | |
| { | |
| "epoch": 1.08128, | |
| "grad_norm": 0.7968885719952052, | |
| "learning_rate": 3.7018410278224852e-06, | |
| "loss": 1.0661, | |
| "num_tokens": 203790064.0, | |
| "step": 1690 | |
| }, | |
| { | |
| "epoch": 1.08768, | |
| "grad_norm": 0.7513522866065546, | |
| "learning_rate": 3.686669812830648e-06, | |
| "loss": 1.0648, | |
| "num_tokens": 205004834.0, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 1.09408, | |
| "grad_norm": 0.8133897709614188, | |
| "learning_rate": 3.671441975604689e-06, | |
| "loss": 1.0574, | |
| "num_tokens": 206218130.0, | |
| "step": 1710 | |
| }, | |
| { | |
| "epoch": 1.10048, | |
| "grad_norm": 0.855169356505383, | |
| "learning_rate": 3.6561582427445053e-06, | |
| "loss": 1.0652, | |
| "num_tokens": 207421774.0, | |
| "step": 1720 | |
| }, | |
| { | |
| "epoch": 1.10688, | |
| "grad_norm": 0.7861479775879827, | |
| "learning_rate": 3.6408193435170695e-06, | |
| "loss": 1.0601, | |
| "num_tokens": 208639076.0, | |
| "step": 1730 | |
| }, | |
| { | |
| "epoch": 1.11328, | |
| "grad_norm": 0.7759167355223116, | |
| "learning_rate": 3.625426009821628e-06, | |
| "loss": 1.0515, | |
| "num_tokens": 209843506.0, | |
| "step": 1740 | |
| }, | |
| { | |
| "epoch": 1.11968, | |
| "grad_norm": 0.7737945956455258, | |
| "learning_rate": 3.609978976154784e-06, | |
| "loss": 1.0449, | |
| "num_tokens": 211053262.0, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 1.12608, | |
| "grad_norm": 0.8033895393207562, | |
| "learning_rate": 3.594478979575443e-06, | |
| "loss": 1.0653, | |
| "num_tokens": 212256390.0, | |
| "step": 1760 | |
| }, | |
| { | |
| "epoch": 1.13248, | |
| "grad_norm": 0.8687778972426285, | |
| "learning_rate": 3.578926759669653e-06, | |
| "loss": 1.046, | |
| "num_tokens": 213458553.0, | |
| "step": 1770 | |
| }, | |
| { | |
| "epoch": 1.13888, | |
| "grad_norm": 0.8146069292073773, | |
| "learning_rate": 3.5633230585153093e-06, | |
| "loss": 1.0587, | |
| "num_tokens": 214667929.0, | |
| "step": 1780 | |
| }, | |
| { | |
| "epoch": 1.14528, | |
| "grad_norm": 0.8442869654702855, | |
| "learning_rate": 3.5476686206467465e-06, | |
| "loss": 1.0476, | |
| "num_tokens": 215872854.0, | |
| "step": 1790 | |
| }, | |
| { | |
| "epoch": 1.15168, | |
| "grad_norm": 0.8166732673631207, | |
| "learning_rate": 3.531964193019214e-06, | |
| "loss": 1.0486, | |
| "num_tokens": 217084577.0, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 1.15808, | |
| "grad_norm": 0.8407184177973456, | |
| "learning_rate": 3.5162105249732336e-06, | |
| "loss": 1.0446, | |
| "num_tokens": 218284006.0, | |
| "step": 1810 | |
| }, | |
| { | |
| "epoch": 1.16448, | |
| "grad_norm": 0.7814422822824459, | |
| "learning_rate": 3.5004083681988476e-06, | |
| "loss": 1.0466, | |
| "num_tokens": 219487469.0, | |
| "step": 1820 | |
| }, | |
| { | |
| "epoch": 1.17088, | |
| "grad_norm": 0.7953904441180448, | |
| "learning_rate": 3.484558476699748e-06, | |
| "loss": 1.0539, | |
| "num_tokens": 220690881.0, | |
| "step": 1830 | |
| }, | |
| { | |
| "epoch": 1.17728, | |
| "grad_norm": 0.8120616693504964, | |
| "learning_rate": 3.468661606757301e-06, | |
| "loss": 1.0564, | |
| "num_tokens": 221898060.0, | |
| "step": 1840 | |
| }, | |
| { | |
| "epoch": 1.18368, | |
| "grad_norm": 0.7894301070451438, | |
| "learning_rate": 3.45271851689446e-06, | |
| "loss": 1.0576, | |
| "num_tokens": 223099219.0, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 1.19008, | |
| "grad_norm": 0.8628648936847306, | |
| "learning_rate": 3.436729967839575e-06, | |
| "loss": 1.0697, | |
| "num_tokens": 224314472.0, | |
| "step": 1860 | |
| }, | |
| { | |
| "epoch": 1.19648, | |
| "grad_norm": 0.8485241964897267, | |
| "learning_rate": 3.4206967224900885e-06, | |
| "loss": 1.0583, | |
| "num_tokens": 225513940.0, | |
| "step": 1870 | |
| }, | |
| { | |
| "epoch": 1.20288, | |
| "grad_norm": 0.8019635872502272, | |
| "learning_rate": 3.40461954587614e-06, | |
| "loss": 1.0484, | |
| "num_tokens": 226733560.0, | |
| "step": 1880 | |
| }, | |
| { | |
| "epoch": 1.20928, | |
| "grad_norm": 0.8148504625626072, | |
| "learning_rate": 3.3884992051240613e-06, | |
| "loss": 1.049, | |
| "num_tokens": 227946861.0, | |
| "step": 1890 | |
| }, | |
| { | |
| "epoch": 1.21568, | |
| "grad_norm": 0.799348761407277, | |
| "learning_rate": 3.372336469419767e-06, | |
| "loss": 1.0636, | |
| "num_tokens": 229149854.0, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 1.22208, | |
| "grad_norm": 0.8121058069211242, | |
| "learning_rate": 3.35613210997206e-06, | |
| "loss": 1.0679, | |
| "num_tokens": 230358777.0, | |
| "step": 1910 | |
| }, | |
| { | |
| "epoch": 1.22848, | |
| "grad_norm": 0.8225529513521229, | |
| "learning_rate": 3.339886899975831e-06, | |
| "loss": 1.0455, | |
| "num_tokens": 231573319.0, | |
| "step": 1920 | |
| }, | |
| { | |
| "epoch": 1.23488, | |
| "grad_norm": 0.7930056234558618, | |
| "learning_rate": 3.3236016145751616e-06, | |
| "loss": 1.0453, | |
| "num_tokens": 232778798.0, | |
| "step": 1930 | |
| }, | |
| { | |
| "epoch": 1.24128, | |
| "grad_norm": 0.7824523425714454, | |
| "learning_rate": 3.307277030826342e-06, | |
| "loss": 1.046, | |
| "num_tokens": 233985281.0, | |
| "step": 1940 | |
| }, | |
| { | |
| "epoch": 1.24768, | |
| "grad_norm": 1.126385656615945, | |
| "learning_rate": 3.290913927660793e-06, | |
| "loss": 1.0418, | |
| "num_tokens": 235194572.0, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 1.25408, | |
| "grad_norm": 0.8230976427574604, | |
| "learning_rate": 3.274513085847899e-06, | |
| "loss": 1.0596, | |
| "num_tokens": 236400915.0, | |
| "step": 1960 | |
| }, | |
| { | |
| "epoch": 1.26048, | |
| "grad_norm": 0.7715465448814725, | |
| "learning_rate": 3.2580752879577508e-06, | |
| "loss": 1.0421, | |
| "num_tokens": 237602768.0, | |
| "step": 1970 | |
| }, | |
| { | |
| "epoch": 1.26688, | |
| "grad_norm": 0.7604905419126253, | |
| "learning_rate": 3.2416013183238105e-06, | |
| "loss": 1.0596, | |
| "num_tokens": 238810127.0, | |
| "step": 1980 | |
| }, | |
| { | |
| "epoch": 1.27328, | |
| "grad_norm": 0.8091857959210363, | |
| "learning_rate": 3.22509196300548e-06, | |
| "loss": 1.0544, | |
| "num_tokens": 240016518.0, | |
| "step": 1990 | |
| }, | |
| { | |
| "epoch": 1.27968, | |
| "grad_norm": 0.8428609624878182, | |
| "learning_rate": 3.2085480097506015e-06, | |
| "loss": 1.0517, | |
| "num_tokens": 241224903.0, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 1.2860800000000001, | |
| "grad_norm": 0.8167440202916451, | |
| "learning_rate": 3.191970247957862e-06, | |
| "loss": 1.0607, | |
| "num_tokens": 242432829.0, | |
| "step": 2010 | |
| }, | |
| { | |
| "epoch": 1.29248, | |
| "grad_norm": 0.843189559655867, | |
| "learning_rate": 3.1753594686391343e-06, | |
| "loss": 1.0519, | |
| "num_tokens": 243643680.0, | |
| "step": 2020 | |
| }, | |
| { | |
| "epoch": 1.29888, | |
| "grad_norm": 0.8113193681644453, | |
| "learning_rate": 3.158716464381728e-06, | |
| "loss": 1.0534, | |
| "num_tokens": 244850967.0, | |
| "step": 2030 | |
| }, | |
| { | |
| "epoch": 1.30528, | |
| "grad_norm": 0.8238038397216464, | |
| "learning_rate": 3.1420420293105753e-06, | |
| "loss": 1.0537, | |
| "num_tokens": 246055107.0, | |
| "step": 2040 | |
| }, | |
| { | |
| "epoch": 1.31168, | |
| "grad_norm": 0.7585161106894139, | |
| "learning_rate": 3.1253369590503357e-06, | |
| "loss": 1.053, | |
| "num_tokens": 247255291.0, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 1.31808, | |
| "grad_norm": 0.8358837254742888, | |
| "learning_rate": 3.1086020506874352e-06, | |
| "loss": 1.0552, | |
| "num_tokens": 248472347.0, | |
| "step": 2060 | |
| }, | |
| { | |
| "epoch": 1.3244799999999999, | |
| "grad_norm": 0.8248705338889306, | |
| "learning_rate": 3.091838102732031e-06, | |
| "loss": 1.0547, | |
| "num_tokens": 249675791.0, | |
| "step": 2070 | |
| }, | |
| { | |
| "epoch": 1.33088, | |
| "grad_norm": 0.8413169777388428, | |
| "learning_rate": 3.0750459150799116e-06, | |
| "loss": 1.0512, | |
| "num_tokens": 250883742.0, | |
| "step": 2080 | |
| }, | |
| { | |
| "epoch": 1.33728, | |
| "grad_norm": 0.7773274742980588, | |
| "learning_rate": 3.0582262889743304e-06, | |
| "loss": 1.0435, | |
| "num_tokens": 252092991.0, | |
| "step": 2090 | |
| }, | |
| { | |
| "epoch": 1.34368, | |
| "grad_norm": 0.8160134758509259, | |
| "learning_rate": 3.0413800269677707e-06, | |
| "loss": 1.0617, | |
| "num_tokens": 253296187.0, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 1.35008, | |
| "grad_norm": 0.8253629381678, | |
| "learning_rate": 3.024507932883659e-06, | |
| "loss": 1.0467, | |
| "num_tokens": 254497531.0, | |
| "step": 2110 | |
| }, | |
| { | |
| "epoch": 1.35648, | |
| "grad_norm": 0.8449321081656331, | |
| "learning_rate": 3.0076108117779995e-06, | |
| "loss": 1.0501, | |
| "num_tokens": 255698828.0, | |
| "step": 2120 | |
| }, | |
| { | |
| "epoch": 1.36288, | |
| "grad_norm": 0.864074317535777, | |
| "learning_rate": 2.9906894699009714e-06, | |
| "loss": 1.051, | |
| "num_tokens": 256901786.0, | |
| "step": 2130 | |
| }, | |
| { | |
| "epoch": 1.36928, | |
| "grad_norm": 0.8545075997582061, | |
| "learning_rate": 2.973744714658452e-06, | |
| "loss": 1.045, | |
| "num_tokens": 258102803.0, | |
| "step": 2140 | |
| }, | |
| { | |
| "epoch": 1.37568, | |
| "grad_norm": 0.7950948333995521, | |
| "learning_rate": 2.9567773545734917e-06, | |
| "loss": 1.0609, | |
| "num_tokens": 259309237.0, | |
| "step": 2150 | |
| }, | |
| { | |
| "epoch": 1.38208, | |
| "grad_norm": 0.7772992222068908, | |
| "learning_rate": 2.9397881992477388e-06, | |
| "loss": 1.0529, | |
| "num_tokens": 260512534.0, | |
| "step": 2160 | |
| }, | |
| { | |
| "epoch": 1.38848, | |
| "grad_norm": 0.8230701809627932, | |
| "learning_rate": 2.9227780593228063e-06, | |
| "loss": 1.0492, | |
| "num_tokens": 261721309.0, | |
| "step": 2170 | |
| }, | |
| { | |
| "epoch": 1.3948800000000001, | |
| "grad_norm": 0.803410117521878, | |
| "learning_rate": 2.90574774644159e-06, | |
| "loss": 1.0341, | |
| "num_tokens": 262926754.0, | |
| "step": 2180 | |
| }, | |
| { | |
| "epoch": 1.40128, | |
| "grad_norm": 0.9047895349858696, | |
| "learning_rate": 2.8886980732095467e-06, | |
| "loss": 1.0304, | |
| "num_tokens": 264129158.0, | |
| "step": 2190 | |
| }, | |
| { | |
| "epoch": 1.40768, | |
| "grad_norm": 0.8048555076981502, | |
| "learning_rate": 2.8716298531559133e-06, | |
| "loss": 1.0494, | |
| "num_tokens": 265332827.0, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 1.41408, | |
| "grad_norm": 0.8364957546359483, | |
| "learning_rate": 2.8545439006948948e-06, | |
| "loss": 1.0423, | |
| "num_tokens": 266542306.0, | |
| "step": 2210 | |
| }, | |
| { | |
| "epoch": 1.42048, | |
| "grad_norm": 0.7904212151138658, | |
| "learning_rate": 2.8374410310868044e-06, | |
| "loss": 1.0423, | |
| "num_tokens": 267751752.0, | |
| "step": 2220 | |
| }, | |
| { | |
| "epoch": 1.42688, | |
| "grad_norm": 0.8434192039931359, | |
| "learning_rate": 2.820322060399156e-06, | |
| "loss": 1.0471, | |
| "num_tokens": 268955655.0, | |
| "step": 2230 | |
| }, | |
| { | |
| "epoch": 1.4332799999999999, | |
| "grad_norm": 0.7746642379992007, | |
| "learning_rate": 2.803187805467733e-06, | |
| "loss": 1.0574, | |
| "num_tokens": 270165303.0, | |
| "step": 2240 | |
| }, | |
| { | |
| "epoch": 1.43968, | |
| "grad_norm": 0.8462146853078769, | |
| "learning_rate": 2.7860390838576125e-06, | |
| "loss": 1.0579, | |
| "num_tokens": 271371057.0, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 1.44608, | |
| "grad_norm": 0.7814911330812998, | |
| "learning_rate": 2.7688767138241474e-06, | |
| "loss": 1.0374, | |
| "num_tokens": 272570562.0, | |
| "step": 2260 | |
| }, | |
| { | |
| "epoch": 1.45248, | |
| "grad_norm": 0.7648342437809393, | |
| "learning_rate": 2.7517015142739335e-06, | |
| "loss": 1.0551, | |
| "num_tokens": 273773102.0, | |
| "step": 2270 | |
| }, | |
| { | |
| "epoch": 1.45888, | |
| "grad_norm": 0.8135139786141086, | |
| "learning_rate": 2.734514304725727e-06, | |
| "loss": 1.0431, | |
| "num_tokens": 274979458.0, | |
| "step": 2280 | |
| }, | |
| { | |
| "epoch": 1.46528, | |
| "grad_norm": 0.8275244446318913, | |
| "learning_rate": 2.717315905271344e-06, | |
| "loss": 1.0436, | |
| "num_tokens": 276180959.0, | |
| "step": 2290 | |
| }, | |
| { | |
| "epoch": 1.47168, | |
| "grad_norm": 0.8456585906125247, | |
| "learning_rate": 2.700107136536533e-06, | |
| "loss": 1.0571, | |
| "num_tokens": 277381104.0, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 1.47808, | |
| "grad_norm": 0.7676272425904394, | |
| "learning_rate": 2.682888819641809e-06, | |
| "loss": 1.0454, | |
| "num_tokens": 278589355.0, | |
| "step": 2310 | |
| }, | |
| { | |
| "epoch": 1.48448, | |
| "grad_norm": 0.7530507207913718, | |
| "learning_rate": 2.6656617761632863e-06, | |
| "loss": 1.0452, | |
| "num_tokens": 279802576.0, | |
| "step": 2320 | |
| }, | |
| { | |
| "epoch": 1.49088, | |
| "grad_norm": 0.8099596670334043, | |
| "learning_rate": 2.6484268280934674e-06, | |
| "loss": 1.0441, | |
| "num_tokens": 281010541.0, | |
| "step": 2330 | |
| }, | |
| { | |
| "epoch": 1.49728, | |
| "grad_norm": 0.8098629796138991, | |
| "learning_rate": 2.631184797802022e-06, | |
| "loss": 1.0379, | |
| "num_tokens": 282219974.0, | |
| "step": 2340 | |
| }, | |
| { | |
| "epoch": 1.5036800000000001, | |
| "grad_norm": 0.8633758780871927, | |
| "learning_rate": 2.613936507996554e-06, | |
| "loss": 1.0553, | |
| "num_tokens": 283423505.0, | |
| "step": 2350 | |
| }, | |
| { | |
| "epoch": 1.5100799999999999, | |
| "grad_norm": 0.8494557884878244, | |
| "learning_rate": 2.5966827816833393e-06, | |
| "loss": 1.034, | |
| "num_tokens": 284628594.0, | |
| "step": 2360 | |
| }, | |
| { | |
| "epoch": 1.51648, | |
| "grad_norm": 0.8961874351947472, | |
| "learning_rate": 2.579424442128057e-06, | |
| "loss": 1.0403, | |
| "num_tokens": 285839496.0, | |
| "step": 2370 | |
| }, | |
| { | |
| "epoch": 1.52288, | |
| "grad_norm": 0.8982519210357097, | |
| "learning_rate": 2.562162312816511e-06, | |
| "loss": 1.0516, | |
| "num_tokens": 287048432.0, | |
| "step": 2380 | |
| }, | |
| { | |
| "epoch": 1.52928, | |
| "grad_norm": 0.834174589328149, | |
| "learning_rate": 2.544897217415332e-06, | |
| "loss": 1.0371, | |
| "num_tokens": 288256611.0, | |
| "step": 2390 | |
| }, | |
| { | |
| "epoch": 1.5356800000000002, | |
| "grad_norm": 0.7790317392375281, | |
| "learning_rate": 2.5276299797326777e-06, | |
| "loss": 1.0347, | |
| "num_tokens": 289465699.0, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 1.54208, | |
| "grad_norm": 0.8113176021935586, | |
| "learning_rate": 2.510361423678929e-06, | |
| "loss": 1.035, | |
| "num_tokens": 290666618.0, | |
| "step": 2410 | |
| }, | |
| { | |
| "epoch": 1.54848, | |
| "grad_norm": 0.8175298566784388, | |
| "learning_rate": 2.4930923732273683e-06, | |
| "loss": 1.0364, | |
| "num_tokens": 291864705.0, | |
| "step": 2420 | |
| }, | |
| { | |
| "epoch": 1.55488, | |
| "grad_norm": 0.8601137215701125, | |
| "learning_rate": 2.4758236523748734e-06, | |
| "loss": 1.041, | |
| "num_tokens": 293077992.0, | |
| "step": 2430 | |
| }, | |
| { | |
| "epoch": 1.56128, | |
| "grad_norm": 0.766342647676912, | |
| "learning_rate": 2.4585560851025917e-06, | |
| "loss": 1.0448, | |
| "num_tokens": 294292270.0, | |
| "step": 2440 | |
| }, | |
| { | |
| "epoch": 1.56768, | |
| "grad_norm": 0.8144040865702195, | |
| "learning_rate": 2.4412904953366263e-06, | |
| "loss": 1.0626, | |
| "num_tokens": 295501196.0, | |
| "step": 2450 | |
| }, | |
| { | |
| "epoch": 1.57408, | |
| "grad_norm": 0.8426321262317878, | |
| "learning_rate": 2.424027706908728e-06, | |
| "loss": 1.0361, | |
| "num_tokens": 296713375.0, | |
| "step": 2460 | |
| }, | |
| { | |
| "epoch": 1.58048, | |
| "grad_norm": 0.870533748148585, | |
| "learning_rate": 2.406768543516977e-06, | |
| "loss": 1.041, | |
| "num_tokens": 297925333.0, | |
| "step": 2470 | |
| }, | |
| { | |
| "epoch": 1.5868799999999998, | |
| "grad_norm": 0.813316442312155, | |
| "learning_rate": 2.389513828686485e-06, | |
| "loss": 1.0337, | |
| "num_tokens": 299126955.0, | |
| "step": 2480 | |
| }, | |
| { | |
| "epoch": 1.59328, | |
| "grad_norm": 0.8050560504469045, | |
| "learning_rate": 2.372264385730099e-06, | |
| "loss": 1.0432, | |
| "num_tokens": 300336458.0, | |
| "step": 2490 | |
| }, | |
| { | |
| "epoch": 1.59968, | |
| "grad_norm": 0.8007073397832749, | |
| "learning_rate": 2.355021037709118e-06, | |
| "loss": 1.0571, | |
| "num_tokens": 301539282.0, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 1.60608, | |
| "grad_norm": 0.8259619776886131, | |
| "learning_rate": 2.3377846073940207e-06, | |
| "loss": 1.0478, | |
| "num_tokens": 302743922.0, | |
| "step": 2510 | |
| }, | |
| { | |
| "epoch": 1.6124800000000001, | |
| "grad_norm": 0.7857263898091816, | |
| "learning_rate": 2.3205559172252052e-06, | |
| "loss": 1.0265, | |
| "num_tokens": 303945412.0, | |
| "step": 2520 | |
| }, | |
| { | |
| "epoch": 1.6188799999999999, | |
| "grad_norm": 0.7830231024473471, | |
| "learning_rate": 2.303335789273744e-06, | |
| "loss": 1.0424, | |
| "num_tokens": 305146555.0, | |
| "step": 2530 | |
| }, | |
| { | |
| "epoch": 1.62528, | |
| "grad_norm": 0.773313259484951, | |
| "learning_rate": 2.286125045202164e-06, | |
| "loss": 1.0435, | |
| "num_tokens": 306362219.0, | |
| "step": 2540 | |
| }, | |
| { | |
| "epoch": 1.63168, | |
| "grad_norm": 0.8201327055565161, | |
| "learning_rate": 2.2689245062252398e-06, | |
| "loss": 1.0509, | |
| "num_tokens": 307565244.0, | |
| "step": 2550 | |
| }, | |
| { | |
| "epoch": 1.63808, | |
| "grad_norm": 0.827602816998628, | |
| "learning_rate": 2.2517349930708032e-06, | |
| "loss": 1.049, | |
| "num_tokens": 308770918.0, | |
| "step": 2560 | |
| }, | |
| { | |
| "epoch": 1.6444800000000002, | |
| "grad_norm": 0.7919141547822656, | |
| "learning_rate": 2.234557325940589e-06, | |
| "loss": 1.0431, | |
| "num_tokens": 309984868.0, | |
| "step": 2570 | |
| }, | |
| { | |
| "epoch": 1.65088, | |
| "grad_norm": 0.7394357208064606, | |
| "learning_rate": 2.2173923244710954e-06, | |
| "loss": 1.0312, | |
| "num_tokens": 311187334.0, | |
| "step": 2580 | |
| }, | |
| { | |
| "epoch": 1.65728, | |
| "grad_norm": 0.785327584034165, | |
| "learning_rate": 2.200240807694474e-06, | |
| "loss": 1.0353, | |
| "num_tokens": 312396234.0, | |
| "step": 2590 | |
| }, | |
| { | |
| "epoch": 1.66368, | |
| "grad_norm": 0.8232141872243898, | |
| "learning_rate": 2.1831035939994554e-06, | |
| "loss": 1.0562, | |
| "num_tokens": 313601855.0, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 1.67008, | |
| "grad_norm": 0.7833896049344754, | |
| "learning_rate": 2.165981501092291e-06, | |
| "loss": 1.0407, | |
| "num_tokens": 314804262.0, | |
| "step": 2610 | |
| }, | |
| { | |
| "epoch": 1.67648, | |
| "grad_norm": 0.7885429615611813, | |
| "learning_rate": 2.148875345957741e-06, | |
| "loss": 1.0295, | |
| "num_tokens": 316005948.0, | |
| "step": 2620 | |
| }, | |
| { | |
| "epoch": 1.68288, | |
| "grad_norm": 0.7829739281596803, | |
| "learning_rate": 2.131785944820092e-06, | |
| "loss": 1.0252, | |
| "num_tokens": 317208803.0, | |
| "step": 2630 | |
| }, | |
| { | |
| "epoch": 1.6892800000000001, | |
| "grad_norm": 0.7928770034373539, | |
| "learning_rate": 2.114714113104211e-06, | |
| "loss": 1.0498, | |
| "num_tokens": 318416652.0, | |
| "step": 2640 | |
| }, | |
| { | |
| "epoch": 1.6956799999999999, | |
| "grad_norm": 0.790850427449215, | |
| "learning_rate": 2.097660665396632e-06, | |
| "loss": 1.0421, | |
| "num_tokens": 319628095.0, | |
| "step": 2650 | |
| }, | |
| { | |
| "epoch": 1.70208, | |
| "grad_norm": 0.8023551277637352, | |
| "learning_rate": 2.0806264154066946e-06, | |
| "loss": 1.0393, | |
| "num_tokens": 320828695.0, | |
| "step": 2660 | |
| }, | |
| { | |
| "epoch": 1.70848, | |
| "grad_norm": 0.7922577515769408, | |
| "learning_rate": 2.0636121759277135e-06, | |
| "loss": 1.0485, | |
| "num_tokens": 322041475.0, | |
| "step": 2670 | |
| }, | |
| { | |
| "epoch": 1.71488, | |
| "grad_norm": 0.7971244397123712, | |
| "learning_rate": 2.046618758798197e-06, | |
| "loss": 1.0275, | |
| "num_tokens": 323243099.0, | |
| "step": 2680 | |
| }, | |
| { | |
| "epoch": 1.7212800000000001, | |
| "grad_norm": 0.8040701855401029, | |
| "learning_rate": 2.0296469748631113e-06, | |
| "loss": 1.0238, | |
| "num_tokens": 324448570.0, | |
| "step": 2690 | |
| }, | |
| { | |
| "epoch": 1.7276799999999999, | |
| "grad_norm": 0.7586132016898348, | |
| "learning_rate": 2.0126976339351883e-06, | |
| "loss": 1.0345, | |
| "num_tokens": 325656124.0, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 1.73408, | |
| "grad_norm": 0.7741130670086324, | |
| "learning_rate": 1.995771544756287e-06, | |
| "loss": 1.0304, | |
| "num_tokens": 326867457.0, | |
| "step": 2710 | |
| }, | |
| { | |
| "epoch": 1.74048, | |
| "grad_norm": 0.7603630468965715, | |
| "learning_rate": 1.9788695149588027e-06, | |
| "loss": 1.0348, | |
| "num_tokens": 328069419.0, | |
| "step": 2720 | |
| }, | |
| { | |
| "epoch": 1.74688, | |
| "grad_norm": 0.7656701861871694, | |
| "learning_rate": 1.9619923510271333e-06, | |
| "loss": 1.0337, | |
| "num_tokens": 329274913.0, | |
| "step": 2730 | |
| }, | |
| { | |
| "epoch": 1.75328, | |
| "grad_norm": 0.7795354061202655, | |
| "learning_rate": 1.945140858259195e-06, | |
| "loss": 1.0467, | |
| "num_tokens": 330497463.0, | |
| "step": 2740 | |
| }, | |
| { | |
| "epoch": 1.75968, | |
| "grad_norm": 0.8511581572833524, | |
| "learning_rate": 1.928315840727998e-06, | |
| "loss": 1.0292, | |
| "num_tokens": 331705026.0, | |
| "step": 2750 | |
| }, | |
| { | |
| "epoch": 1.76608, | |
| "grad_norm": 0.8185264208105538, | |
| "learning_rate": 1.9115181012432795e-06, | |
| "loss": 1.0462, | |
| "num_tokens": 332910224.0, | |
| "step": 2760 | |
| }, | |
| { | |
| "epoch": 1.77248, | |
| "grad_norm": 0.8581339452377109, | |
| "learning_rate": 1.8947484413131996e-06, | |
| "loss": 1.0344, | |
| "num_tokens": 334124736.0, | |
| "step": 2770 | |
| }, | |
| { | |
| "epoch": 1.77888, | |
| "grad_norm": 0.8469198844835426, | |
| "learning_rate": 1.8780076611060962e-06, | |
| "loss": 1.031, | |
| "num_tokens": 335328630.0, | |
| "step": 2780 | |
| }, | |
| { | |
| "epoch": 1.78528, | |
| "grad_norm": 0.8097233001009885, | |
| "learning_rate": 1.861296559412303e-06, | |
| "loss": 1.0268, | |
| "num_tokens": 336532418.0, | |
| "step": 2790 | |
| }, | |
| { | |
| "epoch": 1.79168, | |
| "grad_norm": 0.8477425454150115, | |
| "learning_rate": 1.844615933606037e-06, | |
| "loss": 1.0311, | |
| "num_tokens": 337730246.0, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 1.7980800000000001, | |
| "grad_norm": 0.7749925952377877, | |
| "learning_rate": 1.8279665796073498e-06, | |
| "loss": 1.0415, | |
| "num_tokens": 338937460.0, | |
| "step": 2810 | |
| }, | |
| { | |
| "epoch": 1.8044799999999999, | |
| "grad_norm": 0.7976261215266267, | |
| "learning_rate": 1.8113492918441523e-06, | |
| "loss": 1.047, | |
| "num_tokens": 340147641.0, | |
| "step": 2820 | |
| }, | |
| { | |
| "epoch": 1.81088, | |
| "grad_norm": 0.7733887224457893, | |
| "learning_rate": 1.7947648632143075e-06, | |
| "loss": 1.0309, | |
| "num_tokens": 341352040.0, | |
| "step": 2830 | |
| }, | |
| { | |
| "epoch": 1.81728, | |
| "grad_norm": 0.7739175808490624, | |
| "learning_rate": 1.7782140850477967e-06, | |
| "loss": 1.0518, | |
| "num_tokens": 342559891.0, | |
| "step": 2840 | |
| }, | |
| { | |
| "epoch": 1.82368, | |
| "grad_norm": 0.797265127895327, | |
| "learning_rate": 1.7616977470689605e-06, | |
| "loss": 1.0325, | |
| "num_tokens": 343774370.0, | |
| "step": 2850 | |
| }, | |
| { | |
| "epoch": 1.8300800000000002, | |
| "grad_norm": 0.8443750617770532, | |
| "learning_rate": 1.7452166373588185e-06, | |
| "loss": 1.021, | |
| "num_tokens": 344970302.0, | |
| "step": 2860 | |
| }, | |
| { | |
| "epoch": 1.83648, | |
| "grad_norm": 0.8003604596330827, | |
| "learning_rate": 1.7287715423174662e-06, | |
| "loss": 1.0304, | |
| "num_tokens": 346180457.0, | |
| "step": 2870 | |
| }, | |
| { | |
| "epoch": 1.84288, | |
| "grad_norm": 0.8376385879621375, | |
| "learning_rate": 1.7123632466265483e-06, | |
| "loss": 1.0395, | |
| "num_tokens": 347385193.0, | |
| "step": 2880 | |
| }, | |
| { | |
| "epoch": 1.84928, | |
| "grad_norm": 0.7906644473344662, | |
| "learning_rate": 1.69599253321182e-06, | |
| "loss": 1.0413, | |
| "num_tokens": 348601710.0, | |
| "step": 2890 | |
| }, | |
| { | |
| "epoch": 1.85568, | |
| "grad_norm": 0.7924809016265382, | |
| "learning_rate": 1.6796601832057905e-06, | |
| "loss": 1.0378, | |
| "num_tokens": 349806167.0, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 1.86208, | |
| "grad_norm": 0.7766495775123572, | |
| "learning_rate": 1.6633669759104488e-06, | |
| "loss": 1.0264, | |
| "num_tokens": 351012043.0, | |
| "step": 2910 | |
| }, | |
| { | |
| "epoch": 1.86848, | |
| "grad_norm": 1.3435506252779292, | |
| "learning_rate": 1.6471136887600805e-06, | |
| "loss": 1.0237, | |
| "num_tokens": 352217587.0, | |
| "step": 2920 | |
| }, | |
| { | |
| "epoch": 1.87488, | |
| "grad_norm": 0.765607343549468, | |
| "learning_rate": 1.6309010972841728e-06, | |
| "loss": 1.0382, | |
| "num_tokens": 353418821.0, | |
| "step": 2930 | |
| }, | |
| { | |
| "epoch": 1.8812799999999998, | |
| "grad_norm": 0.8171820174646456, | |
| "learning_rate": 1.614729975070407e-06, | |
| "loss": 1.0366, | |
| "num_tokens": 354624890.0, | |
| "step": 2940 | |
| }, | |
| { | |
| "epoch": 1.88768, | |
| "grad_norm": 0.8064241532835642, | |
| "learning_rate": 1.598601093727749e-06, | |
| "loss": 1.0361, | |
| "num_tokens": 355824991.0, | |
| "step": 2950 | |
| }, | |
| { | |
| "epoch": 1.89408, | |
| "grad_norm": 0.7884619306846271, | |
| "learning_rate": 1.5825152228496342e-06, | |
| "loss": 1.0425, | |
| "num_tokens": 357030616.0, | |
| "step": 2960 | |
| }, | |
| { | |
| "epoch": 1.90048, | |
| "grad_norm": 0.8265648248850005, | |
| "learning_rate": 1.5664731299772401e-06, | |
| "loss": 1.0332, | |
| "num_tokens": 358234522.0, | |
| "step": 2970 | |
| }, | |
| { | |
| "epoch": 1.9068800000000001, | |
| "grad_norm": 0.8092024559268799, | |
| "learning_rate": 1.5504755805628677e-06, | |
| "loss": 1.0399, | |
| "num_tokens": 359443389.0, | |
| "step": 2980 | |
| }, | |
| { | |
| "epoch": 1.9132799999999999, | |
| "grad_norm": 0.791864238644019, | |
| "learning_rate": 1.5345233379334156e-06, | |
| "loss": 1.0289, | |
| "num_tokens": 360644258.0, | |
| "step": 2990 | |
| }, | |
| { | |
| "epoch": 1.91968, | |
| "grad_norm": 0.8006538523086424, | |
| "learning_rate": 1.5186171632539587e-06, | |
| "loss": 1.0392, | |
| "num_tokens": 361848281.0, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 1.92608, | |
| "grad_norm": 0.7852026214667117, | |
| "learning_rate": 1.502757815491429e-06, | |
| "loss": 1.0301, | |
| "num_tokens": 363051672.0, | |
| "step": 3010 | |
| }, | |
| { | |
| "epoch": 1.93248, | |
| "grad_norm": 0.7473075275246417, | |
| "learning_rate": 1.4869460513784011e-06, | |
| "loss": 1.0349, | |
| "num_tokens": 364249917.0, | |
| "step": 3020 | |
| }, | |
| { | |
| "epoch": 1.9388800000000002, | |
| "grad_norm": 0.7822299185363633, | |
| "learning_rate": 1.4711826253769828e-06, | |
| "loss": 1.04, | |
| "num_tokens": 365456248.0, | |
| "step": 3030 | |
| }, | |
| { | |
| "epoch": 1.94528, | |
| "grad_norm": 0.8034434681463449, | |
| "learning_rate": 1.4554682896428179e-06, | |
| "loss": 1.0379, | |
| "num_tokens": 366654881.0, | |
| "step": 3040 | |
| }, | |
| { | |
| "epoch": 1.95168, | |
| "grad_norm": 0.7768199970864885, | |
| "learning_rate": 1.439803793989198e-06, | |
| "loss": 1.0241, | |
| "num_tokens": 367861348.0, | |
| "step": 3050 | |
| }, | |
| { | |
| "epoch": 1.95808, | |
| "grad_norm": 0.8118112910224361, | |
| "learning_rate": 1.4241898858512824e-06, | |
| "loss": 1.0426, | |
| "num_tokens": 369064003.0, | |
| "step": 3060 | |
| }, | |
| { | |
| "epoch": 1.96448, | |
| "grad_norm": 0.7744113528953481, | |
| "learning_rate": 1.408627310250434e-06, | |
| "loss": 1.0414, | |
| "num_tokens": 370279324.0, | |
| "step": 3070 | |
| }, | |
| { | |
| "epoch": 1.97088, | |
| "grad_norm": 0.7887556630257991, | |
| "learning_rate": 1.3931168097586717e-06, | |
| "loss": 1.0336, | |
| "num_tokens": 371480368.0, | |
| "step": 3080 | |
| }, | |
| { | |
| "epoch": 1.97728, | |
| "grad_norm": 0.7640435636356337, | |
| "learning_rate": 1.377659124463239e-06, | |
| "loss": 1.042, | |
| "num_tokens": 372690129.0, | |
| "step": 3090 | |
| }, | |
| { | |
| "epoch": 1.98368, | |
| "grad_norm": 0.7603826553278634, | |
| "learning_rate": 1.3622549919312902e-06, | |
| "loss": 1.0361, | |
| "num_tokens": 373902924.0, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 1.9900799999999998, | |
| "grad_norm": 0.7599088525071184, | |
| "learning_rate": 1.346905147174694e-06, | |
| "loss": 1.0193, | |
| "num_tokens": 375112585.0, | |
| "step": 3110 | |
| }, | |
| { | |
| "epoch": 1.99648, | |
| "grad_norm": 0.7816099568186937, | |
| "learning_rate": 1.3316103226149682e-06, | |
| "loss": 1.0349, | |
| "num_tokens": 376325844.0, | |
| "step": 3120 | |
| }, | |
| { | |
| "epoch": 2.00256, | |
| "grad_norm": 0.7532423548597259, | |
| "learning_rate": 1.3163712480483255e-06, | |
| "loss": 1.0248, | |
| "num_tokens": 377473897.0, | |
| "step": 3130 | |
| }, | |
| { | |
| "epoch": 2.00896, | |
| "grad_norm": 0.7586660186977321, | |
| "learning_rate": 1.3011886506108578e-06, | |
| "loss": 1.0107, | |
| "num_tokens": 378675832.0, | |
| "step": 3140 | |
| }, | |
| { | |
| "epoch": 2.01536, | |
| "grad_norm": 0.7958518507428463, | |
| "learning_rate": 1.2860632547438334e-06, | |
| "loss": 1.0029, | |
| "num_tokens": 379872472.0, | |
| "step": 3150 | |
| }, | |
| { | |
| "epoch": 2.02176, | |
| "grad_norm": 0.8017956552207596, | |
| "learning_rate": 1.2709957821591384e-06, | |
| "loss": 1.0188, | |
| "num_tokens": 381071848.0, | |
| "step": 3160 | |
| }, | |
| { | |
| "epoch": 2.02816, | |
| "grad_norm": 0.8260326835110341, | |
| "learning_rate": 1.2559869518048307e-06, | |
| "loss": 1.0134, | |
| "num_tokens": 382272368.0, | |
| "step": 3170 | |
| }, | |
| { | |
| "epoch": 2.03456, | |
| "grad_norm": 0.845928507883109, | |
| "learning_rate": 1.2410374798308442e-06, | |
| "loss": 1.0107, | |
| "num_tokens": 383480338.0, | |
| "step": 3180 | |
| }, | |
| { | |
| "epoch": 2.04096, | |
| "grad_norm": 0.8513825857009242, | |
| "learning_rate": 1.2261480795548123e-06, | |
| "loss": 1.0099, | |
| "num_tokens": 384683907.0, | |
| "step": 3190 | |
| }, | |
| { | |
| "epoch": 2.04736, | |
| "grad_norm": 0.7711891823020852, | |
| "learning_rate": 1.211319461428032e-06, | |
| "loss": 1.0139, | |
| "num_tokens": 385889491.0, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 2.05376, | |
| "grad_norm": 0.7769167344105451, | |
| "learning_rate": 1.1965523330015652e-06, | |
| "loss": 1.0092, | |
| "num_tokens": 387095853.0, | |
| "step": 3210 | |
| }, | |
| { | |
| "epoch": 2.06016, | |
| "grad_norm": 0.7922783527359497, | |
| "learning_rate": 1.1818473988924797e-06, | |
| "loss": 1.0199, | |
| "num_tokens": 388306034.0, | |
| "step": 3220 | |
| }, | |
| { | |
| "epoch": 2.06656, | |
| "grad_norm": 0.8009332691587518, | |
| "learning_rate": 1.167205360750227e-06, | |
| "loss": 1.0185, | |
| "num_tokens": 389516647.0, | |
| "step": 3230 | |
| }, | |
| { | |
| "epoch": 2.07296, | |
| "grad_norm": 0.7591186989087252, | |
| "learning_rate": 1.1526269172231594e-06, | |
| "loss": 0.995, | |
| "num_tokens": 390724121.0, | |
| "step": 3240 | |
| }, | |
| { | |
| "epoch": 2.07936, | |
| "grad_norm": 0.8055729406106343, | |
| "learning_rate": 1.1381127639252005e-06, | |
| "loss": 1.0109, | |
| "num_tokens": 391924857.0, | |
| "step": 3250 | |
| }, | |
| { | |
| "epoch": 2.08576, | |
| "grad_norm": 0.7920326568899239, | |
| "learning_rate": 1.1236635934026474e-06, | |
| "loss": 0.9928, | |
| "num_tokens": 393133226.0, | |
| "step": 3260 | |
| }, | |
| { | |
| "epoch": 2.09216, | |
| "grad_norm": 0.8095321364071963, | |
| "learning_rate": 1.1092800951011283e-06, | |
| "loss": 1.0066, | |
| "num_tokens": 394338791.0, | |
| "step": 3270 | |
| }, | |
| { | |
| "epoch": 2.09856, | |
| "grad_norm": 0.7790939177959936, | |
| "learning_rate": 1.0949629553327106e-06, | |
| "loss": 1.0144, | |
| "num_tokens": 395544646.0, | |
| "step": 3280 | |
| }, | |
| { | |
| "epoch": 2.10496, | |
| "grad_norm": 0.9934552993460479, | |
| "learning_rate": 1.080712857243143e-06, | |
| "loss": 1.0004, | |
| "num_tokens": 396744920.0, | |
| "step": 3290 | |
| }, | |
| { | |
| "epoch": 2.11136, | |
| "grad_norm": 0.7949729572040324, | |
| "learning_rate": 1.0665304807792653e-06, | |
| "loss": 1.009, | |
| "num_tokens": 397964288.0, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 2.11776, | |
| "grad_norm": 0.7652291996158113, | |
| "learning_rate": 1.0524165026565655e-06, | |
| "loss": 1.007, | |
| "num_tokens": 399168969.0, | |
| "step": 3310 | |
| }, | |
| { | |
| "epoch": 2.12416, | |
| "grad_norm": 0.7761816653258836, | |
| "learning_rate": 1.0383715963268884e-06, | |
| "loss": 0.994, | |
| "num_tokens": 400373422.0, | |
| "step": 3320 | |
| }, | |
| { | |
| "epoch": 2.13056, | |
| "grad_norm": 0.8018173213180155, | |
| "learning_rate": 1.0243964319462997e-06, | |
| "loss": 1.0134, | |
| "num_tokens": 401577043.0, | |
| "step": 3330 | |
| }, | |
| { | |
| "epoch": 2.13696, | |
| "grad_norm": 0.7607821844421783, | |
| "learning_rate": 1.0104916763431133e-06, | |
| "loss": 1.0187, | |
| "num_tokens": 402777527.0, | |
| "step": 3340 | |
| }, | |
| { | |
| "epoch": 2.14336, | |
| "grad_norm": 0.8327193810047873, | |
| "learning_rate": 9.966579929860704e-07, | |
| "loss": 1.0249, | |
| "num_tokens": 403989663.0, | |
| "step": 3350 | |
| }, | |
| { | |
| "epoch": 2.14976, | |
| "grad_norm": 0.7706122440471653, | |
| "learning_rate": 9.828960419526818e-07, | |
| "loss": 1.0085, | |
| "num_tokens": 405198202.0, | |
| "step": 3360 | |
| }, | |
| { | |
| "epoch": 2.15616, | |
| "grad_norm": 0.7867703490032154, | |
| "learning_rate": 9.69206479897736e-07, | |
| "loss": 1.0197, | |
| "num_tokens": 406403598.0, | |
| "step": 3370 | |
| }, | |
| { | |
| "epoch": 2.16256, | |
| "grad_norm": 0.8249023880860281, | |
| "learning_rate": 9.555899600219634e-07, | |
| "loss": 1.0274, | |
| "num_tokens": 407600213.0, | |
| "step": 3380 | |
| }, | |
| { | |
| "epoch": 2.16896, | |
| "grad_norm": 0.7855210183667297, | |
| "learning_rate": 9.420471320408669e-07, | |
| "loss": 1.0127, | |
| "num_tokens": 408811259.0, | |
| "step": 3390 | |
| }, | |
| { | |
| "epoch": 2.17536, | |
| "grad_norm": 0.8119007655119803, | |
| "learning_rate": 9.28578642153726e-07, | |
| "loss": 1.0021, | |
| "num_tokens": 410014132.0, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 2.18176, | |
| "grad_norm": 0.9205406972397864, | |
| "learning_rate": 9.151851330127593e-07, | |
| "loss": 1.0126, | |
| "num_tokens": 411220727.0, | |
| "step": 3410 | |
| }, | |
| { | |
| "epoch": 2.18816, | |
| "grad_norm": 0.8056010049273263, | |
| "learning_rate": 9.018672436924605e-07, | |
| "loss": 0.9892, | |
| "num_tokens": 412425755.0, | |
| "step": 3420 | |
| }, | |
| { | |
| "epoch": 2.19456, | |
| "grad_norm": 0.7632161903493846, | |
| "learning_rate": 8.886256096591048e-07, | |
| "loss": 1.019, | |
| "num_tokens": 413631347.0, | |
| "step": 3430 | |
| }, | |
| { | |
| "epoch": 2.20096, | |
| "grad_norm": 0.7669268826101938, | |
| "learning_rate": 8.754608627404307e-07, | |
| "loss": 1.0048, | |
| "num_tokens": 414833259.0, | |
| "step": 3440 | |
| }, | |
| { | |
| "epoch": 2.20736, | |
| "grad_norm": 0.8361832533605145, | |
| "learning_rate": 8.623736310954869e-07, | |
| "loss": 1.0221, | |
| "num_tokens": 416040472.0, | |
| "step": 3450 | |
| }, | |
| { | |
| "epoch": 2.21376, | |
| "grad_norm": 0.7814096537064951, | |
| "learning_rate": 8.493645391846642e-07, | |
| "loss": 1.0037, | |
| "num_tokens": 417245756.0, | |
| "step": 3460 | |
| }, | |
| { | |
| "epoch": 2.22016, | |
| "grad_norm": 0.7933425120808404, | |
| "learning_rate": 8.364342077398971e-07, | |
| "loss": 0.9987, | |
| "num_tokens": 418455436.0, | |
| "step": 3470 | |
| }, | |
| { | |
| "epoch": 2.22656, | |
| "grad_norm": 0.7659925077465827, | |
| "learning_rate": 8.235832537350441e-07, | |
| "loss": 0.993, | |
| "num_tokens": 419667134.0, | |
| "step": 3480 | |
| }, | |
| { | |
| "epoch": 2.23296, | |
| "grad_norm": 0.8187051274632632, | |
| "learning_rate": 8.108122903564502e-07, | |
| "loss": 1.0028, | |
| "num_tokens": 420870725.0, | |
| "step": 3490 | |
| }, | |
| { | |
| "epoch": 2.23936, | |
| "grad_norm": 0.7595169446678035, | |
| "learning_rate": 7.98121926973692e-07, | |
| "loss": 1.0124, | |
| "num_tokens": 422076634.0, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 2.24576, | |
| "grad_norm": 0.8064753048978947, | |
| "learning_rate": 7.855127691104944e-07, | |
| "loss": 1.024, | |
| "num_tokens": 423284867.0, | |
| "step": 3510 | |
| }, | |
| { | |
| "epoch": 2.25216, | |
| "grad_norm": 0.809858814713402, | |
| "learning_rate": 7.729854184158411e-07, | |
| "loss": 1.0174, | |
| "num_tokens": 424493379.0, | |
| "step": 3520 | |
| }, | |
| { | |
| "epoch": 2.25856, | |
| "grad_norm": 0.7957945935555317, | |
| "learning_rate": 7.605404726352708e-07, | |
| "loss": 1.0149, | |
| "num_tokens": 425697729.0, | |
| "step": 3530 | |
| }, | |
| { | |
| "epoch": 2.26496, | |
| "grad_norm": 0.8194656210162423, | |
| "learning_rate": 7.481785255823482e-07, | |
| "loss": 0.9972, | |
| "num_tokens": 426893908.0, | |
| "step": 3540 | |
| }, | |
| { | |
| "epoch": 2.27136, | |
| "grad_norm": 0.7967423955163617, | |
| "learning_rate": 7.359001671103361e-07, | |
| "loss": 1.0106, | |
| "num_tokens": 428092842.0, | |
| "step": 3550 | |
| }, | |
| { | |
| "epoch": 2.27776, | |
| "grad_norm": 0.7881164663338793, | |
| "learning_rate": 7.237059830840482e-07, | |
| "loss": 1.0066, | |
| "num_tokens": 429286773.0, | |
| "step": 3560 | |
| }, | |
| { | |
| "epoch": 2.28416, | |
| "grad_norm": 0.7903923247778172, | |
| "learning_rate": 7.11596555351893e-07, | |
| "loss": 1.0111, | |
| "num_tokens": 430493341.0, | |
| "step": 3570 | |
| }, | |
| { | |
| "epoch": 2.29056, | |
| "grad_norm": 0.770776011448775, | |
| "learning_rate": 6.995724617181124e-07, | |
| "loss": 0.9923, | |
| "num_tokens": 431693370.0, | |
| "step": 3580 | |
| }, | |
| { | |
| "epoch": 2.29696, | |
| "grad_norm": 0.7817336774071154, | |
| "learning_rate": 6.876342759152121e-07, | |
| "loss": 1.0162, | |
| "num_tokens": 432901215.0, | |
| "step": 3590 | |
| }, | |
| { | |
| "epoch": 2.30336, | |
| "grad_norm": 0.7799297164560258, | |
| "learning_rate": 6.757825675765862e-07, | |
| "loss": 1.0089, | |
| "num_tokens": 434107776.0, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 2.30976, | |
| "grad_norm": 0.8399066019292479, | |
| "learning_rate": 6.640179022093324e-07, | |
| "loss": 1.0104, | |
| "num_tokens": 435311152.0, | |
| "step": 3610 | |
| }, | |
| { | |
| "epoch": 2.31616, | |
| "grad_norm": 0.8297592147597433, | |
| "learning_rate": 6.52340841167276e-07, | |
| "loss": 1.0114, | |
| "num_tokens": 436513739.0, | |
| "step": 3620 | |
| }, | |
| { | |
| "epoch": 2.32256, | |
| "grad_norm": 0.7719279126860086, | |
| "learning_rate": 6.407519416241779e-07, | |
| "loss": 1.0065, | |
| "num_tokens": 437726898.0, | |
| "step": 3630 | |
| }, | |
| { | |
| "epoch": 2.32896, | |
| "grad_norm": 0.8045844362641281, | |
| "learning_rate": 6.292517565471548e-07, | |
| "loss": 1.0097, | |
| "num_tokens": 438931660.0, | |
| "step": 3640 | |
| }, | |
| { | |
| "epoch": 2.33536, | |
| "grad_norm": 0.7982553698914577, | |
| "learning_rate": 6.178408346702882e-07, | |
| "loss": 1.0082, | |
| "num_tokens": 440137185.0, | |
| "step": 3650 | |
| }, | |
| { | |
| "epoch": 2.34176, | |
| "grad_norm": 0.7908405728187465, | |
| "learning_rate": 6.065197204684484e-07, | |
| "loss": 1.0148, | |
| "num_tokens": 441339870.0, | |
| "step": 3660 | |
| }, | |
| { | |
| "epoch": 2.34816, | |
| "grad_norm": 0.7738211794516375, | |
| "learning_rate": 5.95288954131307e-07, | |
| "loss": 1.015, | |
| "num_tokens": 442548750.0, | |
| "step": 3670 | |
| }, | |
| { | |
| "epoch": 2.35456, | |
| "grad_norm": 0.7925014240523639, | |
| "learning_rate": 5.841490715375689e-07, | |
| "loss": 1.0146, | |
| "num_tokens": 443760356.0, | |
| "step": 3680 | |
| }, | |
| { | |
| "epoch": 2.36096, | |
| "grad_norm": 0.7744344940621614, | |
| "learning_rate": 5.731006042293983e-07, | |
| "loss": 1.0195, | |
| "num_tokens": 444963192.0, | |
| "step": 3690 | |
| }, | |
| { | |
| "epoch": 2.36736, | |
| "grad_norm": 0.809967543772837, | |
| "learning_rate": 5.621440793870564e-07, | |
| "loss": 1.0138, | |
| "num_tokens": 446161734.0, | |
| "step": 3700 | |
| }, | |
| { | |
| "epoch": 2.37376, | |
| "grad_norm": 0.7634003235889771, | |
| "learning_rate": 5.512800198037477e-07, | |
| "loss": 1.0092, | |
| "num_tokens": 447367385.0, | |
| "step": 3710 | |
| }, | |
| { | |
| "epoch": 2.38016, | |
| "grad_norm": 0.7694302990943018, | |
| "learning_rate": 5.405089438606759e-07, | |
| "loss": 1.0183, | |
| "num_tokens": 448574222.0, | |
| "step": 3720 | |
| }, | |
| { | |
| "epoch": 2.3865600000000002, | |
| "grad_norm": 0.7964969360810369, | |
| "learning_rate": 5.298313655023083e-07, | |
| "loss": 1.0146, | |
| "num_tokens": 449787465.0, | |
| "step": 3730 | |
| }, | |
| { | |
| "epoch": 2.39296, | |
| "grad_norm": 0.7826022145337301, | |
| "learning_rate": 5.192477942118501e-07, | |
| "loss": 1.0059, | |
| "num_tokens": 450993609.0, | |
| "step": 3740 | |
| }, | |
| { | |
| "epoch": 2.39936, | |
| "grad_norm": 0.7939322826576104, | |
| "learning_rate": 5.087587349869396e-07, | |
| "loss": 1.016, | |
| "num_tokens": 452203974.0, | |
| "step": 3750 | |
| }, | |
| { | |
| "epoch": 2.40576, | |
| "grad_norm": 0.7880956603422961, | |
| "learning_rate": 4.983646883155479e-07, | |
| "loss": 0.9871, | |
| "num_tokens": 453406872.0, | |
| "step": 3760 | |
| }, | |
| { | |
| "epoch": 2.41216, | |
| "grad_norm": 0.7870741062813569, | |
| "learning_rate": 4.880661501520977e-07, | |
| "loss": 1.0146, | |
| "num_tokens": 454612112.0, | |
| "step": 3770 | |
| }, | |
| { | |
| "epoch": 2.41856, | |
| "grad_norm": 0.7757670556350029, | |
| "learning_rate": 4.778636118938052e-07, | |
| "loss": 1.0043, | |
| "num_tokens": 455821550.0, | |
| "step": 3780 | |
| }, | |
| { | |
| "epoch": 2.42496, | |
| "grad_norm": 0.764980277323769, | |
| "learning_rate": 4.677575603572235e-07, | |
| "loss": 1.0037, | |
| "num_tokens": 457034119.0, | |
| "step": 3790 | |
| }, | |
| { | |
| "epoch": 2.43136, | |
| "grad_norm": 0.7689487131773513, | |
| "learning_rate": 4.5774847775501977e-07, | |
| "loss": 1.0215, | |
| "num_tokens": 458243443.0, | |
| "step": 3800 | |
| }, | |
| { | |
| "epoch": 2.43776, | |
| "grad_norm": 0.7835819207262276, | |
| "learning_rate": 4.4783684167296645e-07, | |
| "loss": 1.0107, | |
| "num_tokens": 459449656.0, | |
| "step": 3810 | |
| }, | |
| { | |
| "epoch": 2.44416, | |
| "grad_norm": 0.7439227301838608, | |
| "learning_rate": 4.38023125047152e-07, | |
| "loss": 1.0163, | |
| "num_tokens": 460660657.0, | |
| "step": 3820 | |
| }, | |
| { | |
| "epoch": 2.45056, | |
| "grad_norm": 0.8141456247124772, | |
| "learning_rate": 4.283077961414125e-07, | |
| "loss": 1.0073, | |
| "num_tokens": 461868305.0, | |
| "step": 3830 | |
| }, | |
| { | |
| "epoch": 2.45696, | |
| "grad_norm": 0.7873824030524625, | |
| "learning_rate": 4.186913185249936e-07, | |
| "loss": 1.0161, | |
| "num_tokens": 463067022.0, | |
| "step": 3840 | |
| }, | |
| { | |
| "epoch": 2.4633599999999998, | |
| "grad_norm": 0.7651257037667265, | |
| "learning_rate": 4.091741510504249e-07, | |
| "loss": 1.0054, | |
| "num_tokens": 464277276.0, | |
| "step": 3850 | |
| }, | |
| { | |
| "epoch": 2.46976, | |
| "grad_norm": 0.7817592356120844, | |
| "learning_rate": 3.9975674783163e-07, | |
| "loss": 1.0131, | |
| "num_tokens": 465486770.0, | |
| "step": 3860 | |
| }, | |
| { | |
| "epoch": 2.47616, | |
| "grad_norm": 0.7600628098450863, | |
| "learning_rate": 3.904395582222578e-07, | |
| "loss": 1.0, | |
| "num_tokens": 466688564.0, | |
| "step": 3870 | |
| }, | |
| { | |
| "epoch": 2.48256, | |
| "grad_norm": 0.7452814104047683, | |
| "learning_rate": 3.81223026794241e-07, | |
| "loss": 0.9948, | |
| "num_tokens": 467893407.0, | |
| "step": 3880 | |
| }, | |
| { | |
| "epoch": 2.48896, | |
| "grad_norm": 0.7886078128816824, | |
| "learning_rate": 3.721075933165816e-07, | |
| "loss": 1.0255, | |
| "num_tokens": 469103315.0, | |
| "step": 3890 | |
| }, | |
| { | |
| "epoch": 2.49536, | |
| "grad_norm": 0.7883279810476201, | |
| "learning_rate": 3.630936927343695e-07, | |
| "loss": 0.9955, | |
| "num_tokens": 470304536.0, | |
| "step": 3900 | |
| }, | |
| { | |
| "epoch": 2.50176, | |
| "grad_norm": 0.7870530493997763, | |
| "learning_rate": 3.541817551480292e-07, | |
| "loss": 1.0106, | |
| "num_tokens": 471516225.0, | |
| "step": 3910 | |
| }, | |
| { | |
| "epoch": 2.50816, | |
| "grad_norm": 0.7913988775198784, | |
| "learning_rate": 3.4537220579279497e-07, | |
| "loss": 1.0123, | |
| "num_tokens": 472723848.0, | |
| "step": 3920 | |
| }, | |
| { | |
| "epoch": 2.51456, | |
| "grad_norm": 0.788228042670068, | |
| "learning_rate": 3.366654650184217e-07, | |
| "loss": 1.0076, | |
| "num_tokens": 473927605.0, | |
| "step": 3930 | |
| }, | |
| { | |
| "epoch": 2.52096, | |
| "grad_norm": 0.7671091431259203, | |
| "learning_rate": 3.2806194826913107e-07, | |
| "loss": 1.0054, | |
| "num_tokens": 475130341.0, | |
| "step": 3940 | |
| }, | |
| { | |
| "epoch": 2.52736, | |
| "grad_norm": 0.7769242999032523, | |
| "learning_rate": 3.1956206606378186e-07, | |
| "loss": 1.0137, | |
| "num_tokens": 476337471.0, | |
| "step": 3950 | |
| }, | |
| { | |
| "epoch": 2.53376, | |
| "grad_norm": 0.7761725619806417, | |
| "learning_rate": 3.1116622397628886e-07, | |
| "loss": 1.0139, | |
| "num_tokens": 477546278.0, | |
| "step": 3960 | |
| }, | |
| { | |
| "epoch": 2.54016, | |
| "grad_norm": 0.8119517968358277, | |
| "learning_rate": 3.0287482261626727e-07, | |
| "loss": 1.0112, | |
| "num_tokens": 478748834.0, | |
| "step": 3970 | |
| }, | |
| { | |
| "epoch": 2.54656, | |
| "grad_norm": 0.7768387486408453, | |
| "learning_rate": 2.946882576099164e-07, | |
| "loss": 1.0176, | |
| "num_tokens": 479951666.0, | |
| "step": 3980 | |
| }, | |
| { | |
| "epoch": 2.55296, | |
| "grad_norm": 0.8059661577502851, | |
| "learning_rate": 2.8660691958114384e-07, | |
| "loss": 1.0192, | |
| "num_tokens": 481155740.0, | |
| "step": 3990 | |
| }, | |
| { | |
| "epoch": 2.55936, | |
| "grad_norm": 0.7923218074076707, | |
| "learning_rate": 2.786311941329298e-07, | |
| "loss": 1.0228, | |
| "num_tokens": 482362569.0, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 2.56576, | |
| "grad_norm": 0.7737100130087119, | |
| "learning_rate": 2.70761461828922e-07, | |
| "loss": 1.0117, | |
| "num_tokens": 483577083.0, | |
| "step": 4010 | |
| }, | |
| { | |
| "epoch": 2.5721600000000002, | |
| "grad_norm": 0.8198263737858525, | |
| "learning_rate": 2.629980981752803e-07, | |
| "loss": 1.0027, | |
| "num_tokens": 484785169.0, | |
| "step": 4020 | |
| }, | |
| { | |
| "epoch": 2.57856, | |
| "grad_norm": 0.7800117950292567, | |
| "learning_rate": 2.5534147360276014e-07, | |
| "loss": 1.0061, | |
| "num_tokens": 485992637.0, | |
| "step": 4030 | |
| }, | |
| { | |
| "epoch": 2.58496, | |
| "grad_norm": 0.7806994703813391, | |
| "learning_rate": 2.4779195344903447e-07, | |
| "loss": 1.0067, | |
| "num_tokens": 487200371.0, | |
| "step": 4040 | |
| }, | |
| { | |
| "epoch": 2.59136, | |
| "grad_norm": 0.7967832568550222, | |
| "learning_rate": 2.4034989794126494e-07, | |
| "loss": 1.005, | |
| "num_tokens": 488411438.0, | |
| "step": 4050 | |
| }, | |
| { | |
| "epoch": 2.59776, | |
| "grad_norm": 0.7613054637393943, | |
| "learning_rate": 2.3301566217891148e-07, | |
| "loss": 1.0057, | |
| "num_tokens": 489619089.0, | |
| "step": 4060 | |
| }, | |
| { | |
| "epoch": 2.6041600000000003, | |
| "grad_norm": 0.8097260832659626, | |
| "learning_rate": 2.257895961167886e-07, | |
| "loss": 1.0115, | |
| "num_tokens": 490822004.0, | |
| "step": 4070 | |
| }, | |
| { | |
| "epoch": 2.61056, | |
| "grad_norm": 0.7724807002861569, | |
| "learning_rate": 2.18672044548367e-07, | |
| "loss": 1.013, | |
| "num_tokens": 492031022.0, | |
| "step": 4080 | |
| }, | |
| { | |
| "epoch": 2.6169599999999997, | |
| "grad_norm": 0.769701738678788, | |
| "learning_rate": 2.1166334708932367e-07, | |
| "loss": 1.0097, | |
| "num_tokens": 493240890.0, | |
| "step": 4090 | |
| }, | |
| { | |
| "epoch": 2.62336, | |
| "grad_norm": 0.76114972582814, | |
| "learning_rate": 2.0476383816133594e-07, | |
| "loss": 1.0042, | |
| "num_tokens": 494453799.0, | |
| "step": 4100 | |
| }, | |
| { | |
| "epoch": 2.62976, | |
| "grad_norm": 0.8083057947332605, | |
| "learning_rate": 1.9797384697612277e-07, | |
| "loss": 1.0044, | |
| "num_tokens": 495667359.0, | |
| "step": 4110 | |
| }, | |
| { | |
| "epoch": 2.63616, | |
| "grad_norm": 0.7707158865091736, | |
| "learning_rate": 1.912936975197388e-07, | |
| "loss": 1.0073, | |
| "num_tokens": 496881814.0, | |
| "step": 4120 | |
| }, | |
| { | |
| "epoch": 2.64256, | |
| "grad_norm": 0.7676478517895791, | |
| "learning_rate": 1.8472370853711397e-07, | |
| "loss": 1.0187, | |
| "num_tokens": 498083665.0, | |
| "step": 4130 | |
| }, | |
| { | |
| "epoch": 2.6489599999999998, | |
| "grad_norm": 0.7728314364028435, | |
| "learning_rate": 1.7826419351684553e-07, | |
| "loss": 0.996, | |
| "num_tokens": 499285193.0, | |
| "step": 4140 | |
| }, | |
| { | |
| "epoch": 2.65536, | |
| "grad_norm": 0.7787493559807903, | |
| "learning_rate": 1.7191546067623772e-07, | |
| "loss": 0.9928, | |
| "num_tokens": 500495522.0, | |
| "step": 4150 | |
| }, | |
| { | |
| "epoch": 2.66176, | |
| "grad_norm": 0.7740957124528121, | |
| "learning_rate": 1.656778129465983e-07, | |
| "loss": 0.9942, | |
| "num_tokens": 501704772.0, | |
| "step": 4160 | |
| }, | |
| { | |
| "epoch": 2.66816, | |
| "grad_norm": 0.7834164164129861, | |
| "learning_rate": 1.5955154795878086e-07, | |
| "loss": 1.0018, | |
| "num_tokens": 502908159.0, | |
| "step": 4170 | |
| }, | |
| { | |
| "epoch": 2.67456, | |
| "grad_norm": 0.7690261436250733, | |
| "learning_rate": 1.5353695802898556e-07, | |
| "loss": 0.9966, | |
| "num_tokens": 504119578.0, | |
| "step": 4180 | |
| }, | |
| { | |
| "epoch": 2.68096, | |
| "grad_norm": 0.7500003508328252, | |
| "learning_rate": 1.4763433014481105e-07, | |
| "loss": 1.0175, | |
| "num_tokens": 505329761.0, | |
| "step": 4190 | |
| }, | |
| { | |
| "epoch": 2.68736, | |
| "grad_norm": 0.7619674427912766, | |
| "learning_rate": 1.4184394595155887e-07, | |
| "loss": 1.0084, | |
| "num_tokens": 506541089.0, | |
| "step": 4200 | |
| }, | |
| { | |
| "epoch": 2.69376, | |
| "grad_norm": 0.7905928509034632, | |
| "learning_rate": 1.3616608173879636e-07, | |
| "loss": 1.0077, | |
| "num_tokens": 507747398.0, | |
| "step": 4210 | |
| }, | |
| { | |
| "epoch": 2.70016, | |
| "grad_norm": 0.7768455409603942, | |
| "learning_rate": 1.3060100842717388e-07, | |
| "loss": 1.0211, | |
| "num_tokens": 508948926.0, | |
| "step": 4220 | |
| }, | |
| { | |
| "epoch": 2.70656, | |
| "grad_norm": 0.7650832573151034, | |
| "learning_rate": 1.2514899155549625e-07, | |
| "loss": 1.0033, | |
| "num_tokens": 510157051.0, | |
| "step": 4230 | |
| }, | |
| { | |
| "epoch": 2.71296, | |
| "grad_norm": 0.7847880941915708, | |
| "learning_rate": 1.1981029126805293e-07, | |
| "loss": 1.0025, | |
| "num_tokens": 511359623.0, | |
| "step": 4240 | |
| }, | |
| { | |
| "epoch": 2.71936, | |
| "grad_norm": 0.8047407028430222, | |
| "learning_rate": 1.1458516230220651e-07, | |
| "loss": 1.0056, | |
| "num_tokens": 512562364.0, | |
| "step": 4250 | |
| }, | |
| { | |
| "epoch": 2.72576, | |
| "grad_norm": 0.7894872635799464, | |
| "learning_rate": 1.0947385397623522e-07, | |
| "loss": 1.0062, | |
| "num_tokens": 513767195.0, | |
| "step": 4260 | |
| }, | |
| { | |
| "epoch": 2.73216, | |
| "grad_norm": 0.7754271372790722, | |
| "learning_rate": 1.0447661017743971e-07, | |
| "loss": 0.997, | |
| "num_tokens": 514974517.0, | |
| "step": 4270 | |
| }, | |
| { | |
| "epoch": 2.73856, | |
| "grad_norm": 0.7746425365371328, | |
| "learning_rate": 9.959366935050397e-08, | |
| "loss": 0.9987, | |
| "num_tokens": 516179935.0, | |
| "step": 4280 | |
| }, | |
| { | |
| "epoch": 2.74496, | |
| "grad_norm": 0.7523512554064233, | |
| "learning_rate": 9.482526448611807e-08, | |
| "loss": 1.0042, | |
| "num_tokens": 517387907.0, | |
| "step": 4290 | |
| }, | |
| { | |
| "epoch": 2.75136, | |
| "grad_norm": 0.7805940920378595, | |
| "learning_rate": 9.017162310986067e-08, | |
| "loss": 1.002, | |
| "num_tokens": 518595813.0, | |
| "step": 4300 | |
| }, | |
| { | |
| "epoch": 2.75776, | |
| "grad_norm": 0.8110259911998368, | |
| "learning_rate": 8.563296727134435e-08, | |
| "loss": 1.0066, | |
| "num_tokens": 519800375.0, | |
| "step": 4310 | |
| }, | |
| { | |
| "epoch": 2.76416, | |
| "grad_norm": 0.772256949618178, | |
| "learning_rate": 8.120951353361884e-08, | |
| "loss": 1.0045, | |
| "num_tokens": 521008297.0, | |
| "step": 4320 | |
| }, | |
| { | |
| "epoch": 2.77056, | |
| "grad_norm": 0.7629770251408482, | |
| "learning_rate": 7.690147296283757e-08, | |
| "loss": 1.0007, | |
| "num_tokens": 522217337.0, | |
| "step": 4330 | |
| }, | |
| { | |
| "epoch": 2.77696, | |
| "grad_norm": 0.750000751925906, | |
| "learning_rate": 7.270905111818744e-08, | |
| "loss": 1.0044, | |
| "num_tokens": 523427534.0, | |
| "step": 4340 | |
| }, | |
| { | |
| "epoch": 2.78336, | |
| "grad_norm": 0.7695523347419888, | |
| "learning_rate": 6.863244804208053e-08, | |
| "loss": 1.0185, | |
| "num_tokens": 524629610.0, | |
| "step": 4350 | |
| }, | |
| { | |
| "epoch": 2.7897600000000002, | |
| "grad_norm": 0.7594496702512009, | |
| "learning_rate": 6.467185825060728e-08, | |
| "loss": 1.0132, | |
| "num_tokens": 525838628.0, | |
| "step": 4360 | |
| }, | |
| { | |
| "epoch": 2.79616, | |
| "grad_norm": 0.774231464389687, | |
| "learning_rate": 6.082747072425844e-08, | |
| "loss": 0.9923, | |
| "num_tokens": 527047256.0, | |
| "step": 4370 | |
| }, | |
| { | |
| "epoch": 2.80256, | |
| "grad_norm": 0.7878028776389799, | |
| "learning_rate": 5.709946889890461e-08, | |
| "loss": 0.9989, | |
| "num_tokens": 528251412.0, | |
| "step": 4380 | |
| }, | |
| { | |
| "epoch": 2.80896, | |
| "grad_norm": 0.7680845271371904, | |
| "learning_rate": 5.348803065704483e-08, | |
| "loss": 0.9971, | |
| "num_tokens": 529460583.0, | |
| "step": 4390 | |
| }, | |
| { | |
| "epoch": 2.81536, | |
| "grad_norm": 0.7710477876974481, | |
| "learning_rate": 4.999332831931936e-08, | |
| "loss": 1.0097, | |
| "num_tokens": 530666949.0, | |
| "step": 4400 | |
| }, | |
| { | |
| "epoch": 2.8217600000000003, | |
| "grad_norm": 0.7641864260094089, | |
| "learning_rate": 4.6615528636286545e-08, | |
| "loss": 1.0083, | |
| "num_tokens": 531877350.0, | |
| "step": 4410 | |
| }, | |
| { | |
| "epoch": 2.82816, | |
| "grad_norm": 0.7798848074760067, | |
| "learning_rate": 4.3354792780467004e-08, | |
| "loss": 1.0145, | |
| "num_tokens": 533089968.0, | |
| "step": 4420 | |
| }, | |
| { | |
| "epoch": 2.8345599999999997, | |
| "grad_norm": 0.7987639919755114, | |
| "learning_rate": 4.021127633865196e-08, | |
| "loss": 1.0061, | |
| "num_tokens": 534295222.0, | |
| "step": 4430 | |
| }, | |
| { | |
| "epoch": 2.84096, | |
| "grad_norm": 0.7471470388574258, | |
| "learning_rate": 3.718512930448115e-08, | |
| "loss": 0.9897, | |
| "num_tokens": 535501172.0, | |
| "step": 4440 | |
| }, | |
| { | |
| "epoch": 2.84736, | |
| "grad_norm": 0.7784643844597081, | |
| "learning_rate": 3.4276496071284084e-08, | |
| "loss": 1.0126, | |
| "num_tokens": 536697925.0, | |
| "step": 4450 | |
| }, | |
| { | |
| "epoch": 2.85376, | |
| "grad_norm": 0.7972370799678196, | |
| "learning_rate": 3.148551542519196e-08, | |
| "loss": 1.0051, | |
| "num_tokens": 537893496.0, | |
| "step": 4460 | |
| }, | |
| { | |
| "epoch": 2.86016, | |
| "grad_norm": 0.7691284457736113, | |
| "learning_rate": 2.8812320538514348e-08, | |
| "loss": 1.0098, | |
| "num_tokens": 539102796.0, | |
| "step": 4470 | |
| }, | |
| { | |
| "epoch": 2.8665599999999998, | |
| "grad_norm": 0.7982125519739797, | |
| "learning_rate": 2.6257038963385106e-08, | |
| "loss": 1.0136, | |
| "num_tokens": 540316296.0, | |
| "step": 4480 | |
| }, | |
| { | |
| "epoch": 2.87296, | |
| "grad_norm": 0.7728520058232545, | |
| "learning_rate": 2.3819792625675297e-08, | |
| "loss": 1.0149, | |
| "num_tokens": 541533670.0, | |
| "step": 4490 | |
| }, | |
| { | |
| "epoch": 2.87936, | |
| "grad_norm": 0.7681197599600511, | |
| "learning_rate": 2.1500697819178406e-08, | |
| "loss": 1.0027, | |
| "num_tokens": 542738043.0, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 2.88576, | |
| "grad_norm": 0.7510549175746628, | |
| "learning_rate": 1.9299865200057556e-08, | |
| "loss": 1.0059, | |
| "num_tokens": 543947829.0, | |
| "step": 4510 | |
| }, | |
| { | |
| "epoch": 2.89216, | |
| "grad_norm": 0.766881311747473, | |
| "learning_rate": 1.721739978156778e-08, | |
| "loss": 1.0051, | |
| "num_tokens": 545163765.0, | |
| "step": 4520 | |
| }, | |
| { | |
| "epoch": 2.89856, | |
| "grad_norm": 0.8067610998392601, | |
| "learning_rate": 1.5253400929045036e-08, | |
| "loss": 0.9998, | |
| "num_tokens": 546371420.0, | |
| "step": 4530 | |
| }, | |
| { | |
| "epoch": 2.90496, | |
| "grad_norm": 0.7674069412891232, | |
| "learning_rate": 1.3407962355164728e-08, | |
| "loss": 1.0164, | |
| "num_tokens": 547577921.0, | |
| "step": 4540 | |
| }, | |
| { | |
| "epoch": 2.91136, | |
| "grad_norm": 0.775004069541473, | |
| "learning_rate": 1.1681172115469986e-08, | |
| "loss": 1.0034, | |
| "num_tokens": 548783680.0, | |
| "step": 4550 | |
| }, | |
| { | |
| "epoch": 2.91776, | |
| "grad_norm": 0.7833232261400477, | |
| "learning_rate": 1.007311260417032e-08, | |
| "loss": 0.9956, | |
| "num_tokens": 549988634.0, | |
| "step": 4560 | |
| }, | |
| { | |
| "epoch": 2.92416, | |
| "grad_norm": 0.7826676344415344, | |
| "learning_rate": 8.583860550210043e-09, | |
| "loss": 1.0098, | |
| "num_tokens": 551189799.0, | |
| "step": 4570 | |
| }, | |
| { | |
| "epoch": 2.93056, | |
| "grad_norm": 0.744986247926951, | |
| "learning_rate": 7.213487013607856e-09, | |
| "loss": 1.0035, | |
| "num_tokens": 552397598.0, | |
| "step": 4580 | |
| }, | |
| { | |
| "epoch": 2.93696, | |
| "grad_norm": 0.7521106648563647, | |
| "learning_rate": 5.96205738206429e-09, | |
| "loss": 1.0043, | |
| "num_tokens": 553610771.0, | |
| "step": 4590 | |
| }, | |
| { | |
| "epoch": 2.94336, | |
| "grad_norm": 0.8150061917429959, | |
| "learning_rate": 4.829631367844201e-09, | |
| "loss": 1.0, | |
| "num_tokens": 554824637.0, | |
| "step": 4600 | |
| }, | |
| { | |
| "epoch": 2.94976, | |
| "grad_norm": 0.772723595238506, | |
| "learning_rate": 3.816263004925991e-09, | |
| "loss": 1.0082, | |
| "num_tokens": 556030923.0, | |
| "step": 4610 | |
| }, | |
| { | |
| "epoch": 2.95616, | |
| "grad_norm": 0.768451723737756, | |
| "learning_rate": 2.922000646423118e-09, | |
| "loss": 0.9922, | |
| "num_tokens": 557231653.0, | |
| "step": 4620 | |
| }, | |
| { | |
| "epoch": 2.96256, | |
| "grad_norm": 0.7993486350591127, | |
| "learning_rate": 2.1468869622781608e-09, | |
| "loss": 1.0019, | |
| "num_tokens": 558442813.0, | |
| "step": 4630 | |
| }, | |
| { | |
| "epoch": 2.96896, | |
| "grad_norm": 1.1239957345324176, | |
| "learning_rate": 1.4909589372266719e-09, | |
| "loss": 1.001, | |
| "num_tokens": 559650373.0, | |
| "step": 4640 | |
| }, | |
| { | |
| "epoch": 2.9753600000000002, | |
| "grad_norm": 0.7957668006721109, | |
| "learning_rate": 9.542478690305335e-10, | |
| "loss": 1.0067, | |
| "num_tokens": 560855666.0, | |
| "step": 4650 | |
| }, | |
| { | |
| "epoch": 2.98176, | |
| "grad_norm": 0.8079239433679425, | |
| "learning_rate": 5.367793669874832e-10, | |
| "loss": 0.9969, | |
| "num_tokens": 562060878.0, | |
| "step": 4660 | |
| }, | |
| { | |
| "epoch": 2.98816, | |
| "grad_norm": 0.7720639449232606, | |
| "learning_rate": 2.385733507062615e-10, | |
| "loss": 1.0052, | |
| "num_tokens": 563260411.0, | |
| "step": 4670 | |
| }, | |
| { | |
| "epoch": 2.99456, | |
| "grad_norm": 0.7561377897632978, | |
| "learning_rate": 5.964404915903555e-11, | |
| "loss": 0.9991, | |
| "num_tokens": 564468049.0, | |
| "step": 4680 | |
| }, | |
| { | |
| "epoch": 3.0, | |
| "num_tokens": 565489014.0, | |
| "step": 4689, | |
| "total_flos": 722584728633344.0, | |
| "train_loss": 1.0774097926684294, | |
| "train_runtime": 15585.9875, | |
| "train_samples_per_second": 19.248, | |
| "train_steps_per_second": 0.301 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 4689, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 3, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": false, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 722584728633344.0, | |
| "train_batch_size": 8, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |