diff --git "a/BASE_seed1_10k/trainer_state.json" "b/BASE_seed1_10k/trainer_state.json" new file mode 100644--- /dev/null +++ "b/BASE_seed1_10k/trainer_state.json" @@ -0,0 +1,7034 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.802776275464215, + "eval_steps": 500, + "global_step": 10000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0018027762754642149, + "grad_norm": 5.41559362411499, + "learning_rate": 3.0000000000000004e-07, + "loss": 0.5387, + "step": 10 + }, + { + "epoch": 0.0036055525509284298, + "grad_norm": 7.191960334777832, + "learning_rate": 6.333333333333333e-07, + "loss": 0.5293, + "step": 20 + }, + { + "epoch": 0.005408328826392645, + "grad_norm": 4.474244117736816, + "learning_rate": 9.666666666666668e-07, + "loss": 0.4781, + "step": 30 + }, + { + "epoch": 0.0072111051018568595, + "grad_norm": 3.841322183609009, + "learning_rate": 1.3e-06, + "loss": 0.4167, + "step": 40 + }, + { + "epoch": 0.009013881377321075, + "grad_norm": 1.6013221740722656, + "learning_rate": 1.6333333333333333e-06, + "loss": 0.2892, + "step": 50 + }, + { + "epoch": 0.01081665765278529, + "grad_norm": 0.917481005191803, + "learning_rate": 1.9666666666666668e-06, + "loss": 0.2024, + "step": 60 + }, + { + "epoch": 0.012619433928249504, + "grad_norm": 1.1382007598876953, + "learning_rate": 2.3e-06, + "loss": 0.1743, + "step": 70 + }, + { + "epoch": 0.014422210203713719, + "grad_norm": 0.7447443008422852, + "learning_rate": 2.6333333333333337e-06, + "loss": 0.1568, + "step": 80 + }, + { + "epoch": 0.016224986479177934, + "grad_norm": 0.6980589628219604, + "learning_rate": 2.966666666666667e-06, + "loss": 0.151, + "step": 90 + }, + { + "epoch": 0.01802776275464215, + "grad_norm": 0.39905884861946106, + "learning_rate": 3.3e-06, + "loss": 0.1492, + "step": 100 + }, + { + "epoch": 0.019830539030106363, + "grad_norm": 0.4256199598312378, + "learning_rate": 3.633333333333334e-06, + "loss": 0.1458, + "step": 110 + }, + { + "epoch": 0.02163331530557058, + "grad_norm": 0.4188803732395172, + "learning_rate": 3.966666666666667e-06, + "loss": 0.1271, + "step": 120 + }, + { + "epoch": 0.023436091581034792, + "grad_norm": 0.36602726578712463, + "learning_rate": 4.2999999999999995e-06, + "loss": 0.1223, + "step": 130 + }, + { + "epoch": 0.02523886785649901, + "grad_norm": 0.4918166399002075, + "learning_rate": 4.633333333333334e-06, + "loss": 0.126, + "step": 140 + }, + { + "epoch": 0.02704164413196322, + "grad_norm": 0.6279438138008118, + "learning_rate": 4.966666666666667e-06, + "loss": 0.1206, + "step": 150 + }, + { + "epoch": 0.028844420407427438, + "grad_norm": 0.569150984287262, + "learning_rate": 5.3e-06, + "loss": 0.1107, + "step": 160 + }, + { + "epoch": 0.030647196682891654, + "grad_norm": 0.625974714756012, + "learning_rate": 5.633333333333333e-06, + "loss": 0.1071, + "step": 170 + }, + { + "epoch": 0.03244997295835587, + "grad_norm": 0.6144049167633057, + "learning_rate": 5.9666666666666666e-06, + "loss": 0.1111, + "step": 180 + }, + { + "epoch": 0.03425274923382008, + "grad_norm": 0.6697492599487305, + "learning_rate": 6.300000000000001e-06, + "loss": 0.1167, + "step": 190 + }, + { + "epoch": 0.0360555255092843, + "grad_norm": 0.2732592821121216, + "learning_rate": 6.633333333333333e-06, + "loss": 0.1049, + "step": 200 + }, + { + "epoch": 0.03785830178474851, + "grad_norm": 0.6220976114273071, + "learning_rate": 6.966666666666667e-06, + "loss": 0.1029, + "step": 210 + }, + { + "epoch": 0.039661078060212726, + "grad_norm": 0.3747783303260803, + "learning_rate": 7.2999999999999996e-06, + "loss": 0.0994, + "step": 220 + }, + { + "epoch": 0.041463854335676946, + "grad_norm": 0.7402628064155579, + "learning_rate": 7.633333333333334e-06, + "loss": 0.0955, + "step": 230 + }, + { + "epoch": 0.04326663061114116, + "grad_norm": 0.3625622093677521, + "learning_rate": 7.966666666666666e-06, + "loss": 0.0948, + "step": 240 + }, + { + "epoch": 0.04506940688660537, + "grad_norm": 0.33918800950050354, + "learning_rate": 8.3e-06, + "loss": 0.0887, + "step": 250 + }, + { + "epoch": 0.046872183162069585, + "grad_norm": 0.42553257942199707, + "learning_rate": 8.633333333333334e-06, + "loss": 0.0915, + "step": 260 + }, + { + "epoch": 0.048674959437533805, + "grad_norm": 0.7103670835494995, + "learning_rate": 8.966666666666668e-06, + "loss": 0.0975, + "step": 270 + }, + { + "epoch": 0.05047773571299802, + "grad_norm": 0.47956153750419617, + "learning_rate": 9.3e-06, + "loss": 0.0921, + "step": 280 + }, + { + "epoch": 0.05228051198846223, + "grad_norm": 0.52305668592453, + "learning_rate": 9.633333333333335e-06, + "loss": 0.0956, + "step": 290 + }, + { + "epoch": 0.05408328826392644, + "grad_norm": 0.294786274433136, + "learning_rate": 9.966666666666667e-06, + "loss": 0.0843, + "step": 300 + }, + { + "epoch": 0.05588606453939066, + "grad_norm": 0.7143281102180481, + "learning_rate": 1.03e-05, + "loss": 0.0909, + "step": 310 + }, + { + "epoch": 0.057688840814854876, + "grad_norm": 0.37210068106651306, + "learning_rate": 1.0633333333333334e-05, + "loss": 0.0908, + "step": 320 + }, + { + "epoch": 0.05949161709031909, + "grad_norm": 0.3835732936859131, + "learning_rate": 1.0966666666666666e-05, + "loss": 0.0876, + "step": 330 + }, + { + "epoch": 0.06129439336578331, + "grad_norm": 0.4630444347858429, + "learning_rate": 1.13e-05, + "loss": 0.0941, + "step": 340 + }, + { + "epoch": 0.06309716964124752, + "grad_norm": 0.2766304612159729, + "learning_rate": 1.1633333333333334e-05, + "loss": 0.093, + "step": 350 + }, + { + "epoch": 0.06489994591671173, + "grad_norm": 0.5275501012802124, + "learning_rate": 1.1966666666666668e-05, + "loss": 0.089, + "step": 360 + }, + { + "epoch": 0.06670272219217595, + "grad_norm": 0.37696143984794617, + "learning_rate": 1.23e-05, + "loss": 0.0896, + "step": 370 + }, + { + "epoch": 0.06850549846764016, + "grad_norm": 0.32620155811309814, + "learning_rate": 1.2633333333333333e-05, + "loss": 0.0813, + "step": 380 + }, + { + "epoch": 0.07030827474310439, + "grad_norm": 0.4478512704372406, + "learning_rate": 1.2966666666666669e-05, + "loss": 0.0839, + "step": 390 + }, + { + "epoch": 0.0721110510185686, + "grad_norm": 0.37261882424354553, + "learning_rate": 1.3300000000000001e-05, + "loss": 0.0837, + "step": 400 + }, + { + "epoch": 0.07391382729403281, + "grad_norm": 0.7236140370368958, + "learning_rate": 1.3633333333333334e-05, + "loss": 0.0822, + "step": 410 + }, + { + "epoch": 0.07571660356949703, + "grad_norm": 0.45200467109680176, + "learning_rate": 1.3966666666666666e-05, + "loss": 0.0783, + "step": 420 + }, + { + "epoch": 0.07751937984496124, + "grad_norm": 0.4210900664329529, + "learning_rate": 1.43e-05, + "loss": 0.0777, + "step": 430 + }, + { + "epoch": 0.07932215612042545, + "grad_norm": 0.2809159755706787, + "learning_rate": 1.4633333333333334e-05, + "loss": 0.0773, + "step": 440 + }, + { + "epoch": 0.08112493239588967, + "grad_norm": 0.37136387825012207, + "learning_rate": 1.4966666666666668e-05, + "loss": 0.0851, + "step": 450 + }, + { + "epoch": 0.08292770867135389, + "grad_norm": 0.5091298818588257, + "learning_rate": 1.53e-05, + "loss": 0.0794, + "step": 460 + }, + { + "epoch": 0.0847304849468181, + "grad_norm": 0.3478129506111145, + "learning_rate": 1.563333333333333e-05, + "loss": 0.0835, + "step": 470 + }, + { + "epoch": 0.08653326122228232, + "grad_norm": 0.38393208384513855, + "learning_rate": 1.5966666666666667e-05, + "loss": 0.0792, + "step": 480 + }, + { + "epoch": 0.08833603749774653, + "grad_norm": 0.32405346632003784, + "learning_rate": 1.63e-05, + "loss": 0.0827, + "step": 490 + }, + { + "epoch": 0.09013881377321074, + "grad_norm": 0.5071119666099548, + "learning_rate": 1.6633333333333336e-05, + "loss": 0.077, + "step": 500 + }, + { + "epoch": 0.09194159004867496, + "grad_norm": 0.6446972489356995, + "learning_rate": 1.6966666666666668e-05, + "loss": 0.0808, + "step": 510 + }, + { + "epoch": 0.09374436632413917, + "grad_norm": 0.5842817425727844, + "learning_rate": 1.73e-05, + "loss": 0.0766, + "step": 520 + }, + { + "epoch": 0.09554714259960338, + "grad_norm": 0.4594644606113434, + "learning_rate": 1.7633333333333336e-05, + "loss": 0.0827, + "step": 530 + }, + { + "epoch": 0.09734991887506761, + "grad_norm": 0.3079770803451538, + "learning_rate": 1.796666666666667e-05, + "loss": 0.0786, + "step": 540 + }, + { + "epoch": 0.09915269515053182, + "grad_norm": 0.47132161259651184, + "learning_rate": 1.83e-05, + "loss": 0.08, + "step": 550 + }, + { + "epoch": 0.10095547142599604, + "grad_norm": 0.3440473973751068, + "learning_rate": 1.8633333333333333e-05, + "loss": 0.0805, + "step": 560 + }, + { + "epoch": 0.10275824770146025, + "grad_norm": 0.2881501019001007, + "learning_rate": 1.896666666666667e-05, + "loss": 0.0756, + "step": 570 + }, + { + "epoch": 0.10456102397692446, + "grad_norm": 0.38147321343421936, + "learning_rate": 1.93e-05, + "loss": 0.0753, + "step": 580 + }, + { + "epoch": 0.10636380025238867, + "grad_norm": 0.3498924970626831, + "learning_rate": 1.9633333333333334e-05, + "loss": 0.0779, + "step": 590 + }, + { + "epoch": 0.10816657652785289, + "grad_norm": 0.3204267919063568, + "learning_rate": 1.9966666666666666e-05, + "loss": 0.073, + "step": 600 + }, + { + "epoch": 0.10996935280331711, + "grad_norm": 0.3183678090572357, + "learning_rate": 2.0300000000000002e-05, + "loss": 0.0724, + "step": 610 + }, + { + "epoch": 0.11177212907878133, + "grad_norm": 0.3010425269603729, + "learning_rate": 2.0633333333333335e-05, + "loss": 0.0725, + "step": 620 + }, + { + "epoch": 0.11357490535424554, + "grad_norm": 0.3266223669052124, + "learning_rate": 2.0966666666666667e-05, + "loss": 0.0796, + "step": 630 + }, + { + "epoch": 0.11537768162970975, + "grad_norm": 0.3957923948764801, + "learning_rate": 2.13e-05, + "loss": 0.0764, + "step": 640 + }, + { + "epoch": 0.11718045790517397, + "grad_norm": 0.36882466077804565, + "learning_rate": 2.1633333333333332e-05, + "loss": 0.0743, + "step": 650 + }, + { + "epoch": 0.11898323418063818, + "grad_norm": 0.40890058875083923, + "learning_rate": 2.1966666666666668e-05, + "loss": 0.078, + "step": 660 + }, + { + "epoch": 0.12078601045610239, + "grad_norm": 0.3890920579433441, + "learning_rate": 2.23e-05, + "loss": 0.0681, + "step": 670 + }, + { + "epoch": 0.12258878673156662, + "grad_norm": 0.3976999521255493, + "learning_rate": 2.2633333333333336e-05, + "loss": 0.0747, + "step": 680 + }, + { + "epoch": 0.12439156300703083, + "grad_norm": 0.3453735113143921, + "learning_rate": 2.2966666666666668e-05, + "loss": 0.079, + "step": 690 + }, + { + "epoch": 0.12619433928249504, + "grad_norm": 0.3744434714317322, + "learning_rate": 2.3300000000000004e-05, + "loss": 0.0769, + "step": 700 + }, + { + "epoch": 0.12799711555795926, + "grad_norm": 0.4043666422367096, + "learning_rate": 2.3633333333333336e-05, + "loss": 0.0721, + "step": 710 + }, + { + "epoch": 0.12979989183342347, + "grad_norm": 0.452412873506546, + "learning_rate": 2.396666666666667e-05, + "loss": 0.076, + "step": 720 + }, + { + "epoch": 0.13160266810888768, + "grad_norm": 0.3486067056655884, + "learning_rate": 2.43e-05, + "loss": 0.0742, + "step": 730 + }, + { + "epoch": 0.1334054443843519, + "grad_norm": 0.3778965473175049, + "learning_rate": 2.4633333333333334e-05, + "loss": 0.0727, + "step": 740 + }, + { + "epoch": 0.1352082206598161, + "grad_norm": 0.2709180414676666, + "learning_rate": 2.496666666666667e-05, + "loss": 0.0684, + "step": 750 + }, + { + "epoch": 0.13701099693528032, + "grad_norm": 0.32746025919914246, + "learning_rate": 2.5300000000000002e-05, + "loss": 0.0735, + "step": 760 + }, + { + "epoch": 0.13881377321074453, + "grad_norm": 0.3406435549259186, + "learning_rate": 2.5633333333333338e-05, + "loss": 0.0747, + "step": 770 + }, + { + "epoch": 0.14061654948620878, + "grad_norm": 0.39069485664367676, + "learning_rate": 2.5966666666666667e-05, + "loss": 0.0638, + "step": 780 + }, + { + "epoch": 0.142419325761673, + "grad_norm": 0.3011362552642822, + "learning_rate": 2.6300000000000002e-05, + "loss": 0.0709, + "step": 790 + }, + { + "epoch": 0.1442221020371372, + "grad_norm": 0.3271448612213135, + "learning_rate": 2.663333333333333e-05, + "loss": 0.0684, + "step": 800 + }, + { + "epoch": 0.1460248783126014, + "grad_norm": 0.3503864109516144, + "learning_rate": 2.6966666666666667e-05, + "loss": 0.078, + "step": 810 + }, + { + "epoch": 0.14782765458806563, + "grad_norm": 0.2866891622543335, + "learning_rate": 2.7300000000000003e-05, + "loss": 0.0671, + "step": 820 + }, + { + "epoch": 0.14963043086352984, + "grad_norm": 0.36019957065582275, + "learning_rate": 2.7633333333333332e-05, + "loss": 0.0673, + "step": 830 + }, + { + "epoch": 0.15143320713899405, + "grad_norm": 0.34380584955215454, + "learning_rate": 2.7966666666666668e-05, + "loss": 0.069, + "step": 840 + }, + { + "epoch": 0.15323598341445827, + "grad_norm": 0.2428215742111206, + "learning_rate": 2.83e-05, + "loss": 0.0733, + "step": 850 + }, + { + "epoch": 0.15503875968992248, + "grad_norm": 0.29352885484695435, + "learning_rate": 2.8633333333333336e-05, + "loss": 0.0774, + "step": 860 + }, + { + "epoch": 0.1568415359653867, + "grad_norm": 0.4597095251083374, + "learning_rate": 2.8966666666666668e-05, + "loss": 0.0765, + "step": 870 + }, + { + "epoch": 0.1586443122408509, + "grad_norm": 0.31262949109077454, + "learning_rate": 2.93e-05, + "loss": 0.0664, + "step": 880 + }, + { + "epoch": 0.16044708851631512, + "grad_norm": 0.35274234414100647, + "learning_rate": 2.9633333333333336e-05, + "loss": 0.0662, + "step": 890 + }, + { + "epoch": 0.16224986479177933, + "grad_norm": 0.21876050531864166, + "learning_rate": 2.9966666666666672e-05, + "loss": 0.0641, + "step": 900 + }, + { + "epoch": 0.16405264106724354, + "grad_norm": 0.2677417993545532, + "learning_rate": 3.03e-05, + "loss": 0.0631, + "step": 910 + }, + { + "epoch": 0.16585541734270778, + "grad_norm": 0.25009411573410034, + "learning_rate": 3.063333333333334e-05, + "loss": 0.0728, + "step": 920 + }, + { + "epoch": 0.167658193618172, + "grad_norm": 0.2829286456108093, + "learning_rate": 3.096666666666666e-05, + "loss": 0.0638, + "step": 930 + }, + { + "epoch": 0.1694609698936362, + "grad_norm": 0.24688038229942322, + "learning_rate": 3.13e-05, + "loss": 0.0691, + "step": 940 + }, + { + "epoch": 0.17126374616910042, + "grad_norm": 0.34949207305908203, + "learning_rate": 3.1633333333333334e-05, + "loss": 0.0676, + "step": 950 + }, + { + "epoch": 0.17306652244456464, + "grad_norm": 0.2864921987056732, + "learning_rate": 3.196666666666667e-05, + "loss": 0.0743, + "step": 960 + }, + { + "epoch": 0.17486929872002885, + "grad_norm": 0.2108231633901596, + "learning_rate": 3.2300000000000006e-05, + "loss": 0.0691, + "step": 970 + }, + { + "epoch": 0.17667207499549306, + "grad_norm": 0.30339616537094116, + "learning_rate": 3.263333333333333e-05, + "loss": 0.0731, + "step": 980 + }, + { + "epoch": 0.17847485127095727, + "grad_norm": 0.3775007128715515, + "learning_rate": 3.296666666666667e-05, + "loss": 0.0694, + "step": 990 + }, + { + "epoch": 0.1802776275464215, + "grad_norm": 0.3024364411830902, + "learning_rate": 3.33e-05, + "loss": 0.0665, + "step": 1000 + }, + { + "epoch": 0.1820804038218857, + "grad_norm": 0.2513657212257385, + "learning_rate": 3.3633333333333335e-05, + "loss": 0.0676, + "step": 1010 + }, + { + "epoch": 0.1838831800973499, + "grad_norm": 0.27736595273017883, + "learning_rate": 3.396666666666667e-05, + "loss": 0.0684, + "step": 1020 + }, + { + "epoch": 0.18568595637281413, + "grad_norm": 0.4237976372241974, + "learning_rate": 3.430000000000001e-05, + "loss": 0.064, + "step": 1030 + }, + { + "epoch": 0.18748873264827834, + "grad_norm": 0.38221675157546997, + "learning_rate": 3.463333333333333e-05, + "loss": 0.06, + "step": 1040 + }, + { + "epoch": 0.18929150892374255, + "grad_norm": 0.31958162784576416, + "learning_rate": 3.496666666666667e-05, + "loss": 0.0706, + "step": 1050 + }, + { + "epoch": 0.19109428519920676, + "grad_norm": 0.33839792013168335, + "learning_rate": 3.53e-05, + "loss": 0.0676, + "step": 1060 + }, + { + "epoch": 0.192897061474671, + "grad_norm": 0.40528759360313416, + "learning_rate": 3.563333333333334e-05, + "loss": 0.0694, + "step": 1070 + }, + { + "epoch": 0.19469983775013522, + "grad_norm": 0.19379711151123047, + "learning_rate": 3.596666666666667e-05, + "loss": 0.0613, + "step": 1080 + }, + { + "epoch": 0.19650261402559943, + "grad_norm": 0.2642883062362671, + "learning_rate": 3.63e-05, + "loss": 0.0665, + "step": 1090 + }, + { + "epoch": 0.19830539030106364, + "grad_norm": 0.23124559223651886, + "learning_rate": 3.6633333333333334e-05, + "loss": 0.0645, + "step": 1100 + }, + { + "epoch": 0.20010816657652786, + "grad_norm": 0.3091222643852234, + "learning_rate": 3.6966666666666666e-05, + "loss": 0.0648, + "step": 1110 + }, + { + "epoch": 0.20191094285199207, + "grad_norm": 0.3477613031864166, + "learning_rate": 3.73e-05, + "loss": 0.0628, + "step": 1120 + }, + { + "epoch": 0.20371371912745628, + "grad_norm": 0.22648930549621582, + "learning_rate": 3.763333333333334e-05, + "loss": 0.0612, + "step": 1130 + }, + { + "epoch": 0.2055164954029205, + "grad_norm": 0.21195665001869202, + "learning_rate": 3.796666666666667e-05, + "loss": 0.0668, + "step": 1140 + }, + { + "epoch": 0.2073192716783847, + "grad_norm": 0.2780223786830902, + "learning_rate": 3.83e-05, + "loss": 0.0679, + "step": 1150 + }, + { + "epoch": 0.20912204795384892, + "grad_norm": 0.2918558418750763, + "learning_rate": 3.8633333333333335e-05, + "loss": 0.0673, + "step": 1160 + }, + { + "epoch": 0.21092482422931313, + "grad_norm": 0.2419963777065277, + "learning_rate": 3.896666666666667e-05, + "loss": 0.0708, + "step": 1170 + }, + { + "epoch": 0.21272760050477735, + "grad_norm": 0.26872798800468445, + "learning_rate": 3.9300000000000007e-05, + "loss": 0.065, + "step": 1180 + }, + { + "epoch": 0.21453037678024156, + "grad_norm": 0.25414329767227173, + "learning_rate": 3.963333333333333e-05, + "loss": 0.0622, + "step": 1190 + }, + { + "epoch": 0.21633315305570577, + "grad_norm": 0.25174248218536377, + "learning_rate": 3.996666666666667e-05, + "loss": 0.0695, + "step": 1200 + }, + { + "epoch": 0.21813592933117001, + "grad_norm": 0.22807759046554565, + "learning_rate": 4.0300000000000004e-05, + "loss": 0.0701, + "step": 1210 + }, + { + "epoch": 0.21993870560663423, + "grad_norm": 0.29292958974838257, + "learning_rate": 4.0633333333333336e-05, + "loss": 0.0632, + "step": 1220 + }, + { + "epoch": 0.22174148188209844, + "grad_norm": 0.2155628800392151, + "learning_rate": 4.096666666666667e-05, + "loss": 0.0682, + "step": 1230 + }, + { + "epoch": 0.22354425815756265, + "grad_norm": 0.20743617415428162, + "learning_rate": 4.13e-05, + "loss": 0.0641, + "step": 1240 + }, + { + "epoch": 0.22534703443302687, + "grad_norm": 0.20022955536842346, + "learning_rate": 4.1633333333333333e-05, + "loss": 0.0579, + "step": 1250 + }, + { + "epoch": 0.22714981070849108, + "grad_norm": 0.3666442334651947, + "learning_rate": 4.196666666666667e-05, + "loss": 0.0611, + "step": 1260 + }, + { + "epoch": 0.2289525869839553, + "grad_norm": 0.29185906052589417, + "learning_rate": 4.23e-05, + "loss": 0.0641, + "step": 1270 + }, + { + "epoch": 0.2307553632594195, + "grad_norm": 0.4205268323421478, + "learning_rate": 4.263333333333334e-05, + "loss": 0.0666, + "step": 1280 + }, + { + "epoch": 0.23255813953488372, + "grad_norm": 0.2673855125904083, + "learning_rate": 4.296666666666666e-05, + "loss": 0.064, + "step": 1290 + }, + { + "epoch": 0.23436091581034793, + "grad_norm": 0.31742236018180847, + "learning_rate": 4.33e-05, + "loss": 0.0691, + "step": 1300 + }, + { + "epoch": 0.23616369208581214, + "grad_norm": 0.2533666789531708, + "learning_rate": 4.3633333333333335e-05, + "loss": 0.0641, + "step": 1310 + }, + { + "epoch": 0.23796646836127636, + "grad_norm": 0.2713998258113861, + "learning_rate": 4.396666666666667e-05, + "loss": 0.0633, + "step": 1320 + }, + { + "epoch": 0.23976924463674057, + "grad_norm": 0.3495473861694336, + "learning_rate": 4.43e-05, + "loss": 0.061, + "step": 1330 + }, + { + "epoch": 0.24157202091220478, + "grad_norm": 0.22186997532844543, + "learning_rate": 4.463333333333334e-05, + "loss": 0.063, + "step": 1340 + }, + { + "epoch": 0.24337479718766902, + "grad_norm": 0.2712762951850891, + "learning_rate": 4.496666666666667e-05, + "loss": 0.0605, + "step": 1350 + }, + { + "epoch": 0.24517757346313324, + "grad_norm": 0.2828688323497772, + "learning_rate": 4.53e-05, + "loss": 0.0651, + "step": 1360 + }, + { + "epoch": 0.24698034973859745, + "grad_norm": 0.32463574409484863, + "learning_rate": 4.5633333333333336e-05, + "loss": 0.0637, + "step": 1370 + }, + { + "epoch": 0.24878312601406166, + "grad_norm": 0.27668318152427673, + "learning_rate": 4.596666666666667e-05, + "loss": 0.0568, + "step": 1380 + }, + { + "epoch": 0.2505859022895259, + "grad_norm": 0.30997949838638306, + "learning_rate": 4.630000000000001e-05, + "loss": 0.0686, + "step": 1390 + }, + { + "epoch": 0.2523886785649901, + "grad_norm": 0.2110419124364853, + "learning_rate": 4.663333333333333e-05, + "loss": 0.0575, + "step": 1400 + }, + { + "epoch": 0.2541914548404543, + "grad_norm": 0.2404826134443283, + "learning_rate": 4.696666666666667e-05, + "loss": 0.0655, + "step": 1410 + }, + { + "epoch": 0.2559942311159185, + "grad_norm": 0.23162831366062164, + "learning_rate": 4.73e-05, + "loss": 0.0624, + "step": 1420 + }, + { + "epoch": 0.2577970073913827, + "grad_norm": 0.23692749440670013, + "learning_rate": 4.763333333333334e-05, + "loss": 0.0613, + "step": 1430 + }, + { + "epoch": 0.25959978366684694, + "grad_norm": 0.2759644687175751, + "learning_rate": 4.796666666666667e-05, + "loss": 0.0654, + "step": 1440 + }, + { + "epoch": 0.26140255994231115, + "grad_norm": 0.23253923654556274, + "learning_rate": 4.83e-05, + "loss": 0.068, + "step": 1450 + }, + { + "epoch": 0.26320533621777537, + "grad_norm": 0.26997312903404236, + "learning_rate": 4.8633333333333334e-05, + "loss": 0.0595, + "step": 1460 + }, + { + "epoch": 0.2650081124932396, + "grad_norm": 0.40077781677246094, + "learning_rate": 4.8966666666666667e-05, + "loss": 0.0622, + "step": 1470 + }, + { + "epoch": 0.2668108887687038, + "grad_norm": 0.2376328855752945, + "learning_rate": 4.93e-05, + "loss": 0.0636, + "step": 1480 + }, + { + "epoch": 0.268613665044168, + "grad_norm": 0.2827739715576172, + "learning_rate": 4.963333333333334e-05, + "loss": 0.0604, + "step": 1490 + }, + { + "epoch": 0.2704164413196322, + "grad_norm": 0.4532570540904999, + "learning_rate": 4.996666666666667e-05, + "loss": 0.0587, + "step": 1500 + }, + { + "epoch": 0.27221921759509643, + "grad_norm": 0.43316343426704407, + "learning_rate": 5.03e-05, + "loss": 0.0641, + "step": 1510 + }, + { + "epoch": 0.27402199387056064, + "grad_norm": 0.3506268262863159, + "learning_rate": 5.0633333333333335e-05, + "loss": 0.0662, + "step": 1520 + }, + { + "epoch": 0.27582477014602486, + "grad_norm": 0.24790820479393005, + "learning_rate": 5.0966666666666674e-05, + "loss": 0.0646, + "step": 1530 + }, + { + "epoch": 0.27762754642148907, + "grad_norm": 0.2507699131965637, + "learning_rate": 5.130000000000001e-05, + "loss": 0.0596, + "step": 1540 + }, + { + "epoch": 0.2794303226969533, + "grad_norm": 0.26693329215049744, + "learning_rate": 5.163333333333333e-05, + "loss": 0.0687, + "step": 1550 + }, + { + "epoch": 0.28123309897241755, + "grad_norm": 0.25990283489227295, + "learning_rate": 5.196666666666667e-05, + "loss": 0.0618, + "step": 1560 + }, + { + "epoch": 0.28303587524788176, + "grad_norm": 0.20986329019069672, + "learning_rate": 5.2300000000000004e-05, + "loss": 0.0631, + "step": 1570 + }, + { + "epoch": 0.284838651523346, + "grad_norm": 0.3476528227329254, + "learning_rate": 5.2633333333333336e-05, + "loss": 0.0577, + "step": 1580 + }, + { + "epoch": 0.2866414277988102, + "grad_norm": 0.2593328654766083, + "learning_rate": 5.296666666666666e-05, + "loss": 0.0605, + "step": 1590 + }, + { + "epoch": 0.2884442040742744, + "grad_norm": 0.3481169044971466, + "learning_rate": 5.330000000000001e-05, + "loss": 0.0606, + "step": 1600 + }, + { + "epoch": 0.2902469803497386, + "grad_norm": 0.21679189801216125, + "learning_rate": 5.3633333333333334e-05, + "loss": 0.0671, + "step": 1610 + }, + { + "epoch": 0.2920497566252028, + "grad_norm": 0.2372356802225113, + "learning_rate": 5.3966666666666666e-05, + "loss": 0.0613, + "step": 1620 + }, + { + "epoch": 0.29385253290066704, + "grad_norm": 0.25489747524261475, + "learning_rate": 5.4300000000000005e-05, + "loss": 0.0601, + "step": 1630 + }, + { + "epoch": 0.29565530917613125, + "grad_norm": 0.25877898931503296, + "learning_rate": 5.463333333333334e-05, + "loss": 0.0633, + "step": 1640 + }, + { + "epoch": 0.29745808545159547, + "grad_norm": 0.22300730645656586, + "learning_rate": 5.496666666666666e-05, + "loss": 0.0642, + "step": 1650 + }, + { + "epoch": 0.2992608617270597, + "grad_norm": 0.18823741376399994, + "learning_rate": 5.530000000000001e-05, + "loss": 0.0584, + "step": 1660 + }, + { + "epoch": 0.3010636380025239, + "grad_norm": 0.26364418864250183, + "learning_rate": 5.5633333333333335e-05, + "loss": 0.0586, + "step": 1670 + }, + { + "epoch": 0.3028664142779881, + "grad_norm": 0.2923857271671295, + "learning_rate": 5.596666666666667e-05, + "loss": 0.0647, + "step": 1680 + }, + { + "epoch": 0.3046691905534523, + "grad_norm": 0.2227906584739685, + "learning_rate": 5.63e-05, + "loss": 0.0625, + "step": 1690 + }, + { + "epoch": 0.30647196682891653, + "grad_norm": 0.21054530143737793, + "learning_rate": 5.663333333333334e-05, + "loss": 0.065, + "step": 1700 + }, + { + "epoch": 0.30827474310438074, + "grad_norm": 0.348690390586853, + "learning_rate": 5.696666666666667e-05, + "loss": 0.0632, + "step": 1710 + }, + { + "epoch": 0.31007751937984496, + "grad_norm": 0.20512254536151886, + "learning_rate": 5.73e-05, + "loss": 0.0643, + "step": 1720 + }, + { + "epoch": 0.31188029565530917, + "grad_norm": 0.19749906659126282, + "learning_rate": 5.7633333333333336e-05, + "loss": 0.0582, + "step": 1730 + }, + { + "epoch": 0.3136830719307734, + "grad_norm": 0.2500685155391693, + "learning_rate": 5.796666666666667e-05, + "loss": 0.0572, + "step": 1740 + }, + { + "epoch": 0.3154858482062376, + "grad_norm": 0.20494727790355682, + "learning_rate": 5.83e-05, + "loss": 0.0618, + "step": 1750 + }, + { + "epoch": 0.3172886244817018, + "grad_norm": 0.28706008195877075, + "learning_rate": 5.863333333333334e-05, + "loss": 0.0595, + "step": 1760 + }, + { + "epoch": 0.319091400757166, + "grad_norm": 0.23980054259300232, + "learning_rate": 5.896666666666667e-05, + "loss": 0.0601, + "step": 1770 + }, + { + "epoch": 0.32089417703263023, + "grad_norm": 0.26302099227905273, + "learning_rate": 5.93e-05, + "loss": 0.0614, + "step": 1780 + }, + { + "epoch": 0.32269695330809445, + "grad_norm": 0.34201040863990784, + "learning_rate": 5.9633333333333344e-05, + "loss": 0.0676, + "step": 1790 + }, + { + "epoch": 0.32449972958355866, + "grad_norm": 0.22230128943920135, + "learning_rate": 5.996666666666667e-05, + "loss": 0.0624, + "step": 1800 + }, + { + "epoch": 0.3263025058590229, + "grad_norm": 0.25414663553237915, + "learning_rate": 6.03e-05, + "loss": 0.0573, + "step": 1810 + }, + { + "epoch": 0.3281052821344871, + "grad_norm": 0.20873858034610748, + "learning_rate": 6.063333333333333e-05, + "loss": 0.058, + "step": 1820 + }, + { + "epoch": 0.3299080584099513, + "grad_norm": 0.25061991810798645, + "learning_rate": 6.0966666666666674e-05, + "loss": 0.0612, + "step": 1830 + }, + { + "epoch": 0.33171083468541557, + "grad_norm": 0.24576738476753235, + "learning_rate": 6.13e-05, + "loss": 0.0635, + "step": 1840 + }, + { + "epoch": 0.3335136109608798, + "grad_norm": 0.41412773728370667, + "learning_rate": 6.163333333333333e-05, + "loss": 0.0635, + "step": 1850 + }, + { + "epoch": 0.335316387236344, + "grad_norm": 0.2621055543422699, + "learning_rate": 6.196666666666668e-05, + "loss": 0.0573, + "step": 1860 + }, + { + "epoch": 0.3371191635118082, + "grad_norm": 0.36470827460289, + "learning_rate": 6.23e-05, + "loss": 0.0597, + "step": 1870 + }, + { + "epoch": 0.3389219397872724, + "grad_norm": 0.21115829050540924, + "learning_rate": 6.263333333333333e-05, + "loss": 0.0562, + "step": 1880 + }, + { + "epoch": 0.34072471606273663, + "grad_norm": 0.20064254105091095, + "learning_rate": 6.296666666666667e-05, + "loss": 0.0596, + "step": 1890 + }, + { + "epoch": 0.34252749233820085, + "grad_norm": 0.30089330673217773, + "learning_rate": 6.330000000000001e-05, + "loss": 0.0574, + "step": 1900 + }, + { + "epoch": 0.34433026861366506, + "grad_norm": 0.26324665546417236, + "learning_rate": 6.363333333333334e-05, + "loss": 0.0628, + "step": 1910 + }, + { + "epoch": 0.34613304488912927, + "grad_norm": 0.3654855489730835, + "learning_rate": 6.396666666666667e-05, + "loss": 0.0603, + "step": 1920 + }, + { + "epoch": 0.3479358211645935, + "grad_norm": 0.17230789363384247, + "learning_rate": 6.43e-05, + "loss": 0.0581, + "step": 1930 + }, + { + "epoch": 0.3497385974400577, + "grad_norm": 0.39206016063690186, + "learning_rate": 6.463333333333334e-05, + "loss": 0.0584, + "step": 1940 + }, + { + "epoch": 0.3515413737155219, + "grad_norm": 0.30950891971588135, + "learning_rate": 6.496666666666667e-05, + "loss": 0.0622, + "step": 1950 + }, + { + "epoch": 0.3533441499909861, + "grad_norm": 0.30163729190826416, + "learning_rate": 6.53e-05, + "loss": 0.0628, + "step": 1960 + }, + { + "epoch": 0.35514692626645034, + "grad_norm": 0.3109796345233917, + "learning_rate": 6.563333333333333e-05, + "loss": 0.0612, + "step": 1970 + }, + { + "epoch": 0.35694970254191455, + "grad_norm": 0.3278733491897583, + "learning_rate": 6.596666666666667e-05, + "loss": 0.0563, + "step": 1980 + }, + { + "epoch": 0.35875247881737876, + "grad_norm": 0.33346840739250183, + "learning_rate": 6.630000000000001e-05, + "loss": 0.06, + "step": 1990 + }, + { + "epoch": 0.360555255092843, + "grad_norm": 0.23046278953552246, + "learning_rate": 6.663333333333333e-05, + "loss": 0.0579, + "step": 2000 + }, + { + "epoch": 0.3623580313683072, + "grad_norm": 0.20575708150863647, + "learning_rate": 6.696666666666666e-05, + "loss": 0.06, + "step": 2010 + }, + { + "epoch": 0.3641608076437714, + "grad_norm": 0.2498203068971634, + "learning_rate": 6.730000000000001e-05, + "loss": 0.0576, + "step": 2020 + }, + { + "epoch": 0.3659635839192356, + "grad_norm": 0.3085964322090149, + "learning_rate": 6.763333333333334e-05, + "loss": 0.0608, + "step": 2030 + }, + { + "epoch": 0.3677663601946998, + "grad_norm": 0.30336859822273254, + "learning_rate": 6.796666666666666e-05, + "loss": 0.0641, + "step": 2040 + }, + { + "epoch": 0.36956913647016404, + "grad_norm": 0.21763910353183746, + "learning_rate": 6.83e-05, + "loss": 0.0565, + "step": 2050 + }, + { + "epoch": 0.37137191274562825, + "grad_norm": 0.23763242363929749, + "learning_rate": 6.863333333333334e-05, + "loss": 0.0569, + "step": 2060 + }, + { + "epoch": 0.37317468902109246, + "grad_norm": 0.24654164910316467, + "learning_rate": 6.896666666666667e-05, + "loss": 0.0588, + "step": 2070 + }, + { + "epoch": 0.3749774652965567, + "grad_norm": 0.2766386568546295, + "learning_rate": 6.93e-05, + "loss": 0.0598, + "step": 2080 + }, + { + "epoch": 0.3767802415720209, + "grad_norm": 0.2941744327545166, + "learning_rate": 6.963333333333334e-05, + "loss": 0.0654, + "step": 2090 + }, + { + "epoch": 0.3785830178474851, + "grad_norm": 0.20436793565750122, + "learning_rate": 6.996666666666667e-05, + "loss": 0.0611, + "step": 2100 + }, + { + "epoch": 0.3803857941229493, + "grad_norm": 0.2097810059785843, + "learning_rate": 7.03e-05, + "loss": 0.0582, + "step": 2110 + }, + { + "epoch": 0.38218857039841353, + "grad_norm": 0.19502535462379456, + "learning_rate": 7.063333333333333e-05, + "loss": 0.057, + "step": 2120 + }, + { + "epoch": 0.3839913466738778, + "grad_norm": 0.20433445274829865, + "learning_rate": 7.096666666666667e-05, + "loss": 0.0599, + "step": 2130 + }, + { + "epoch": 0.385794122949342, + "grad_norm": 0.19240501523017883, + "learning_rate": 7.13e-05, + "loss": 0.0595, + "step": 2140 + }, + { + "epoch": 0.3875968992248062, + "grad_norm": 0.25694090127944946, + "learning_rate": 7.163333333333334e-05, + "loss": 0.0604, + "step": 2150 + }, + { + "epoch": 0.38939967550027044, + "grad_norm": 0.29847267270088196, + "learning_rate": 7.196666666666668e-05, + "loss": 0.057, + "step": 2160 + }, + { + "epoch": 0.39120245177573465, + "grad_norm": 0.2709079086780548, + "learning_rate": 7.23e-05, + "loss": 0.0576, + "step": 2170 + }, + { + "epoch": 0.39300522805119886, + "grad_norm": 0.22980639338493347, + "learning_rate": 7.263333333333334e-05, + "loss": 0.0582, + "step": 2180 + }, + { + "epoch": 0.3948080043266631, + "grad_norm": 0.2066880613565445, + "learning_rate": 7.296666666666667e-05, + "loss": 0.0594, + "step": 2190 + }, + { + "epoch": 0.3966107806021273, + "grad_norm": 0.15543051064014435, + "learning_rate": 7.33e-05, + "loss": 0.0621, + "step": 2200 + }, + { + "epoch": 0.3984135568775915, + "grad_norm": 0.30837076902389526, + "learning_rate": 7.363333333333334e-05, + "loss": 0.0587, + "step": 2210 + }, + { + "epoch": 0.4002163331530557, + "grad_norm": 0.2566860616207123, + "learning_rate": 7.396666666666667e-05, + "loss": 0.0551, + "step": 2220 + }, + { + "epoch": 0.4020191094285199, + "grad_norm": 0.17758356034755707, + "learning_rate": 7.43e-05, + "loss": 0.0574, + "step": 2230 + }, + { + "epoch": 0.40382188570398414, + "grad_norm": 0.2511194348335266, + "learning_rate": 7.463333333333334e-05, + "loss": 0.0614, + "step": 2240 + }, + { + "epoch": 0.40562466197944835, + "grad_norm": 0.2289959192276001, + "learning_rate": 7.496666666666667e-05, + "loss": 0.0562, + "step": 2250 + }, + { + "epoch": 0.40742743825491257, + "grad_norm": 0.2782787084579468, + "learning_rate": 7.53e-05, + "loss": 0.0579, + "step": 2260 + }, + { + "epoch": 0.4092302145303768, + "grad_norm": 0.14765875041484833, + "learning_rate": 7.563333333333333e-05, + "loss": 0.0545, + "step": 2270 + }, + { + "epoch": 0.411032990805841, + "grad_norm": 0.29629790782928467, + "learning_rate": 7.596666666666668e-05, + "loss": 0.0592, + "step": 2280 + }, + { + "epoch": 0.4128357670813052, + "grad_norm": 0.16385123133659363, + "learning_rate": 7.630000000000001e-05, + "loss": 0.0591, + "step": 2290 + }, + { + "epoch": 0.4146385433567694, + "grad_norm": 0.2961997985839844, + "learning_rate": 7.663333333333333e-05, + "loss": 0.0567, + "step": 2300 + }, + { + "epoch": 0.41644131963223363, + "grad_norm": 0.1871434897184372, + "learning_rate": 7.696666666666668e-05, + "loss": 0.0569, + "step": 2310 + }, + { + "epoch": 0.41824409590769784, + "grad_norm": 0.2781011462211609, + "learning_rate": 7.730000000000001e-05, + "loss": 0.0547, + "step": 2320 + }, + { + "epoch": 0.42004687218316206, + "grad_norm": 0.25767332315444946, + "learning_rate": 7.763333333333334e-05, + "loss": 0.0637, + "step": 2330 + }, + { + "epoch": 0.42184964845862627, + "grad_norm": 0.26878052949905396, + "learning_rate": 7.796666666666666e-05, + "loss": 0.0558, + "step": 2340 + }, + { + "epoch": 0.4236524247340905, + "grad_norm": 0.18930955231189728, + "learning_rate": 7.83e-05, + "loss": 0.0577, + "step": 2350 + }, + { + "epoch": 0.4254552010095547, + "grad_norm": 0.23502376675605774, + "learning_rate": 7.863333333333334e-05, + "loss": 0.0629, + "step": 2360 + }, + { + "epoch": 0.4272579772850189, + "grad_norm": 0.2777710258960724, + "learning_rate": 7.896666666666667e-05, + "loss": 0.0574, + "step": 2370 + }, + { + "epoch": 0.4290607535604831, + "grad_norm": 0.18867869675159454, + "learning_rate": 7.93e-05, + "loss": 0.0593, + "step": 2380 + }, + { + "epoch": 0.43086352983594733, + "grad_norm": 0.24806471168994904, + "learning_rate": 7.963333333333334e-05, + "loss": 0.0558, + "step": 2390 + }, + { + "epoch": 0.43266630611141155, + "grad_norm": 0.21862687170505524, + "learning_rate": 7.996666666666667e-05, + "loss": 0.0552, + "step": 2400 + }, + { + "epoch": 0.4344690823868758, + "grad_norm": 0.2106906920671463, + "learning_rate": 8.030000000000001e-05, + "loss": 0.0621, + "step": 2410 + }, + { + "epoch": 0.43627185866234003, + "grad_norm": 0.20159752666950226, + "learning_rate": 8.063333333333333e-05, + "loss": 0.0626, + "step": 2420 + }, + { + "epoch": 0.43807463493780424, + "grad_norm": 0.2598176896572113, + "learning_rate": 8.096666666666667e-05, + "loss": 0.0546, + "step": 2430 + }, + { + "epoch": 0.43987741121326845, + "grad_norm": 0.19820767641067505, + "learning_rate": 8.13e-05, + "loss": 0.0575, + "step": 2440 + }, + { + "epoch": 0.44168018748873267, + "grad_norm": 0.18782299757003784, + "learning_rate": 8.163333333333334e-05, + "loss": 0.0551, + "step": 2450 + }, + { + "epoch": 0.4434829637641969, + "grad_norm": 0.25559478998184204, + "learning_rate": 8.196666666666668e-05, + "loss": 0.0573, + "step": 2460 + }, + { + "epoch": 0.4452857400396611, + "grad_norm": 0.2523142099380493, + "learning_rate": 8.23e-05, + "loss": 0.0574, + "step": 2470 + }, + { + "epoch": 0.4470885163151253, + "grad_norm": 0.20205341279506683, + "learning_rate": 8.263333333333334e-05, + "loss": 0.056, + "step": 2480 + }, + { + "epoch": 0.4488912925905895, + "grad_norm": 0.2733706533908844, + "learning_rate": 8.296666666666667e-05, + "loss": 0.0574, + "step": 2490 + }, + { + "epoch": 0.45069406886605373, + "grad_norm": 0.2302892506122589, + "learning_rate": 8.33e-05, + "loss": 0.0593, + "step": 2500 + }, + { + "epoch": 0.45249684514151794, + "grad_norm": 0.2586212754249573, + "learning_rate": 8.363333333333334e-05, + "loss": 0.0551, + "step": 2510 + }, + { + "epoch": 0.45429962141698216, + "grad_norm": 0.4032387137413025, + "learning_rate": 8.396666666666667e-05, + "loss": 0.0593, + "step": 2520 + }, + { + "epoch": 0.45610239769244637, + "grad_norm": 0.18610864877700806, + "learning_rate": 8.43e-05, + "loss": 0.0578, + "step": 2530 + }, + { + "epoch": 0.4579051739679106, + "grad_norm": 0.29500970244407654, + "learning_rate": 8.463333333333335e-05, + "loss": 0.0594, + "step": 2540 + }, + { + "epoch": 0.4597079502433748, + "grad_norm": 0.19957338273525238, + "learning_rate": 8.496666666666667e-05, + "loss": 0.0556, + "step": 2550 + }, + { + "epoch": 0.461510726518839, + "grad_norm": 0.27168312668800354, + "learning_rate": 8.53e-05, + "loss": 0.0612, + "step": 2560 + }, + { + "epoch": 0.4633135027943032, + "grad_norm": 0.18914657831192017, + "learning_rate": 8.563333333333333e-05, + "loss": 0.0564, + "step": 2570 + }, + { + "epoch": 0.46511627906976744, + "grad_norm": 0.16713805496692657, + "learning_rate": 8.596666666666668e-05, + "loss": 0.0574, + "step": 2580 + }, + { + "epoch": 0.46691905534523165, + "grad_norm": 0.1571536511182785, + "learning_rate": 8.63e-05, + "loss": 0.0534, + "step": 2590 + }, + { + "epoch": 0.46872183162069586, + "grad_norm": 0.20700669288635254, + "learning_rate": 8.663333333333333e-05, + "loss": 0.0577, + "step": 2600 + }, + { + "epoch": 0.4705246078961601, + "grad_norm": 0.20322678983211517, + "learning_rate": 8.696666666666668e-05, + "loss": 0.0638, + "step": 2610 + }, + { + "epoch": 0.4723273841716243, + "grad_norm": 0.17560529708862305, + "learning_rate": 8.730000000000001e-05, + "loss": 0.0613, + "step": 2620 + }, + { + "epoch": 0.4741301604470885, + "grad_norm": 0.2778451144695282, + "learning_rate": 8.763333333333334e-05, + "loss": 0.0578, + "step": 2630 + }, + { + "epoch": 0.4759329367225527, + "grad_norm": 0.24755309522151947, + "learning_rate": 8.796666666666667e-05, + "loss": 0.0541, + "step": 2640 + }, + { + "epoch": 0.4777357129980169, + "grad_norm": 0.2499132603406906, + "learning_rate": 8.83e-05, + "loss": 0.0537, + "step": 2650 + }, + { + "epoch": 0.47953848927348114, + "grad_norm": 0.21429553627967834, + "learning_rate": 8.863333333333334e-05, + "loss": 0.0644, + "step": 2660 + }, + { + "epoch": 0.48134126554894535, + "grad_norm": 0.25340747833251953, + "learning_rate": 8.896666666666667e-05, + "loss": 0.0574, + "step": 2670 + }, + { + "epoch": 0.48314404182440956, + "grad_norm": 0.197022944688797, + "learning_rate": 8.93e-05, + "loss": 0.0555, + "step": 2680 + }, + { + "epoch": 0.48494681809987383, + "grad_norm": 0.28157272934913635, + "learning_rate": 8.963333333333333e-05, + "loss": 0.0597, + "step": 2690 + }, + { + "epoch": 0.48674959437533805, + "grad_norm": 0.20312272012233734, + "learning_rate": 8.996666666666667e-05, + "loss": 0.0588, + "step": 2700 + }, + { + "epoch": 0.48855237065080226, + "grad_norm": 0.22160930931568146, + "learning_rate": 9.030000000000001e-05, + "loss": 0.0632, + "step": 2710 + }, + { + "epoch": 0.49035514692626647, + "grad_norm": 0.1846475899219513, + "learning_rate": 9.063333333333333e-05, + "loss": 0.0512, + "step": 2720 + }, + { + "epoch": 0.4921579232017307, + "grad_norm": 0.22525529563426971, + "learning_rate": 9.096666666666666e-05, + "loss": 0.0574, + "step": 2730 + }, + { + "epoch": 0.4939606994771949, + "grad_norm": 0.14015792310237885, + "learning_rate": 9.130000000000001e-05, + "loss": 0.0535, + "step": 2740 + }, + { + "epoch": 0.4957634757526591, + "grad_norm": 0.25817930698394775, + "learning_rate": 9.163333333333334e-05, + "loss": 0.0606, + "step": 2750 + }, + { + "epoch": 0.4975662520281233, + "grad_norm": 0.21513380110263824, + "learning_rate": 9.196666666666666e-05, + "loss": 0.06, + "step": 2760 + }, + { + "epoch": 0.49936902830358754, + "grad_norm": 0.22595332562923431, + "learning_rate": 9.230000000000001e-05, + "loss": 0.0519, + "step": 2770 + }, + { + "epoch": 0.5011718045790517, + "grad_norm": 0.27969732880592346, + "learning_rate": 9.263333333333334e-05, + "loss": 0.0623, + "step": 2780 + }, + { + "epoch": 0.502974580854516, + "grad_norm": 0.2222040742635727, + "learning_rate": 9.296666666666667e-05, + "loss": 0.0565, + "step": 2790 + }, + { + "epoch": 0.5047773571299802, + "grad_norm": 0.28888288140296936, + "learning_rate": 9.33e-05, + "loss": 0.0577, + "step": 2800 + }, + { + "epoch": 0.5065801334054444, + "grad_norm": 0.1962571144104004, + "learning_rate": 9.363333333333334e-05, + "loss": 0.0536, + "step": 2810 + }, + { + "epoch": 0.5083829096809086, + "grad_norm": 0.26705968379974365, + "learning_rate": 9.396666666666667e-05, + "loss": 0.0603, + "step": 2820 + }, + { + "epoch": 0.5101856859563728, + "grad_norm": 0.17329153418540955, + "learning_rate": 9.43e-05, + "loss": 0.0533, + "step": 2830 + }, + { + "epoch": 0.511988462231837, + "grad_norm": 0.2211298942565918, + "learning_rate": 9.463333333333333e-05, + "loss": 0.0608, + "step": 2840 + }, + { + "epoch": 0.5137912385073012, + "grad_norm": 0.2022695690393448, + "learning_rate": 9.496666666666667e-05, + "loss": 0.0579, + "step": 2850 + }, + { + "epoch": 0.5155940147827655, + "grad_norm": 0.16921758651733398, + "learning_rate": 9.53e-05, + "loss": 0.0568, + "step": 2860 + }, + { + "epoch": 0.5173967910582297, + "grad_norm": 0.24564726650714874, + "learning_rate": 9.563333333333334e-05, + "loss": 0.0568, + "step": 2870 + }, + { + "epoch": 0.5191995673336939, + "grad_norm": 0.2871924638748169, + "learning_rate": 9.596666666666668e-05, + "loss": 0.0596, + "step": 2880 + }, + { + "epoch": 0.5210023436091581, + "grad_norm": 0.27038854360580444, + "learning_rate": 9.63e-05, + "loss": 0.0596, + "step": 2890 + }, + { + "epoch": 0.5228051198846223, + "grad_norm": 0.2762112319469452, + "learning_rate": 9.663333333333334e-05, + "loss": 0.0588, + "step": 2900 + }, + { + "epoch": 0.5246078961600865, + "grad_norm": 0.1921473741531372, + "learning_rate": 9.696666666666667e-05, + "loss": 0.0626, + "step": 2910 + }, + { + "epoch": 0.5264106724355507, + "grad_norm": 0.27863645553588867, + "learning_rate": 9.730000000000001e-05, + "loss": 0.0567, + "step": 2920 + }, + { + "epoch": 0.5282134487110149, + "grad_norm": 0.23096972703933716, + "learning_rate": 9.763333333333334e-05, + "loss": 0.0579, + "step": 2930 + }, + { + "epoch": 0.5300162249864792, + "grad_norm": 0.23984530568122864, + "learning_rate": 9.796666666666667e-05, + "loss": 0.0539, + "step": 2940 + }, + { + "epoch": 0.5318190012619434, + "grad_norm": 0.26811182498931885, + "learning_rate": 9.83e-05, + "loss": 0.0532, + "step": 2950 + }, + { + "epoch": 0.5336217775374076, + "grad_norm": 0.30521097779273987, + "learning_rate": 9.863333333333334e-05, + "loss": 0.0604, + "step": 2960 + }, + { + "epoch": 0.5354245538128718, + "grad_norm": 0.1678686887025833, + "learning_rate": 9.896666666666667e-05, + "loss": 0.0566, + "step": 2970 + }, + { + "epoch": 0.537227330088336, + "grad_norm": 0.19016125798225403, + "learning_rate": 9.93e-05, + "loss": 0.0538, + "step": 2980 + }, + { + "epoch": 0.5390301063638002, + "grad_norm": 0.30830803513526917, + "learning_rate": 9.963333333333333e-05, + "loss": 0.0638, + "step": 2990 + }, + { + "epoch": 0.5408328826392644, + "grad_norm": 0.17846520245075226, + "learning_rate": 9.996666666666668e-05, + "loss": 0.0537, + "step": 3000 + }, + { + "epoch": 0.5426356589147286, + "grad_norm": 0.15769369900226593, + "learning_rate": 9.999999384858465e-05, + "loss": 0.0522, + "step": 3010 + }, + { + "epoch": 0.5444384351901929, + "grad_norm": 0.2433222085237503, + "learning_rate": 9.999997258443473e-05, + "loss": 0.0579, + "step": 3020 + }, + { + "epoch": 0.5462412114656571, + "grad_norm": 0.24462740123271942, + "learning_rate": 9.999993613161331e-05, + "loss": 0.0608, + "step": 3030 + }, + { + "epoch": 0.5480439877411213, + "grad_norm": 0.2256120890378952, + "learning_rate": 9.999988449013146e-05, + "loss": 0.0543, + "step": 3040 + }, + { + "epoch": 0.5498467640165855, + "grad_norm": 0.23624876141548157, + "learning_rate": 9.99998176600049e-05, + "loss": 0.055, + "step": 3050 + }, + { + "epoch": 0.5516495402920497, + "grad_norm": 0.1689511388540268, + "learning_rate": 9.999973564125389e-05, + "loss": 0.0563, + "step": 3060 + }, + { + "epoch": 0.5534523165675139, + "grad_norm": 0.31788817048072815, + "learning_rate": 9.999963843390335e-05, + "loss": 0.0554, + "step": 3070 + }, + { + "epoch": 0.5552550928429781, + "grad_norm": 0.2632403075695038, + "learning_rate": 9.999952603798282e-05, + "loss": 0.0517, + "step": 3080 + }, + { + "epoch": 0.5570578691184424, + "grad_norm": 0.2732613980770111, + "learning_rate": 9.999939845352646e-05, + "loss": 0.0599, + "step": 3090 + }, + { + "epoch": 0.5588606453939066, + "grad_norm": 0.17078642547130585, + "learning_rate": 9.999925568057298e-05, + "loss": 0.0597, + "step": 3100 + }, + { + "epoch": 0.5606634216693708, + "grad_norm": 0.23614409565925598, + "learning_rate": 9.999909771916578e-05, + "loss": 0.0559, + "step": 3110 + }, + { + "epoch": 0.5624661979448351, + "grad_norm": 0.21851426362991333, + "learning_rate": 9.999892456935285e-05, + "loss": 0.0564, + "step": 3120 + }, + { + "epoch": 0.5642689742202993, + "grad_norm": 0.19016896188259125, + "learning_rate": 9.999873623118679e-05, + "loss": 0.0577, + "step": 3130 + }, + { + "epoch": 0.5660717504957635, + "grad_norm": 0.1992519646883011, + "learning_rate": 9.999853270472479e-05, + "loss": 0.0599, + "step": 3140 + }, + { + "epoch": 0.5678745267712277, + "grad_norm": 0.37866124510765076, + "learning_rate": 9.999831399002871e-05, + "loss": 0.0569, + "step": 3150 + }, + { + "epoch": 0.569677303046692, + "grad_norm": 0.17698729038238525, + "learning_rate": 9.999808008716494e-05, + "loss": 0.0585, + "step": 3160 + }, + { + "epoch": 0.5714800793221562, + "grad_norm": 0.2222241461277008, + "learning_rate": 9.999783099620459e-05, + "loss": 0.0546, + "step": 3170 + }, + { + "epoch": 0.5732828555976204, + "grad_norm": 0.1738797426223755, + "learning_rate": 9.999756671722328e-05, + "loss": 0.0525, + "step": 3180 + }, + { + "epoch": 0.5750856318730846, + "grad_norm": 0.16521303355693817, + "learning_rate": 9.99972872503013e-05, + "loss": 0.0524, + "step": 3190 + }, + { + "epoch": 0.5768884081485488, + "grad_norm": 0.21063007414340973, + "learning_rate": 9.999699259552359e-05, + "loss": 0.0515, + "step": 3200 + }, + { + "epoch": 0.578691184424013, + "grad_norm": 0.20505587756633759, + "learning_rate": 9.99966827529796e-05, + "loss": 0.0509, + "step": 3210 + }, + { + "epoch": 0.5804939606994772, + "grad_norm": 0.24220533668994904, + "learning_rate": 9.999635772276348e-05, + "loss": 0.0577, + "step": 3220 + }, + { + "epoch": 0.5822967369749414, + "grad_norm": 0.20930424332618713, + "learning_rate": 9.999601750497396e-05, + "loss": 0.0526, + "step": 3230 + }, + { + "epoch": 0.5840995132504057, + "grad_norm": 0.17883487045764923, + "learning_rate": 9.99956620997144e-05, + "loss": 0.0529, + "step": 3240 + }, + { + "epoch": 0.5859022895258699, + "grad_norm": 0.27819836139678955, + "learning_rate": 9.999529150709275e-05, + "loss": 0.0573, + "step": 3250 + }, + { + "epoch": 0.5877050658013341, + "grad_norm": 0.31870806217193604, + "learning_rate": 9.999490572722158e-05, + "loss": 0.0576, + "step": 3260 + }, + { + "epoch": 0.5895078420767983, + "grad_norm": 0.2694283723831177, + "learning_rate": 9.99945047602181e-05, + "loss": 0.0544, + "step": 3270 + }, + { + "epoch": 0.5913106183522625, + "grad_norm": 0.2328466773033142, + "learning_rate": 9.99940886062041e-05, + "loss": 0.0572, + "step": 3280 + }, + { + "epoch": 0.5931133946277267, + "grad_norm": 0.2757991850376129, + "learning_rate": 9.999365726530599e-05, + "loss": 0.0567, + "step": 3290 + }, + { + "epoch": 0.5949161709031909, + "grad_norm": 0.1877506971359253, + "learning_rate": 9.999321073765481e-05, + "loss": 0.0521, + "step": 3300 + }, + { + "epoch": 0.5967189471786551, + "grad_norm": 0.17714393138885498, + "learning_rate": 9.99927490233862e-05, + "loss": 0.052, + "step": 3310 + }, + { + "epoch": 0.5985217234541194, + "grad_norm": 0.2916216254234314, + "learning_rate": 9.999227212264043e-05, + "loss": 0.0577, + "step": 3320 + }, + { + "epoch": 0.6003244997295836, + "grad_norm": 0.16772031784057617, + "learning_rate": 9.999178003556236e-05, + "loss": 0.0554, + "step": 3330 + }, + { + "epoch": 0.6021272760050478, + "grad_norm": 0.2048356533050537, + "learning_rate": 9.999127276230146e-05, + "loss": 0.0521, + "step": 3340 + }, + { + "epoch": 0.603930052280512, + "grad_norm": 0.1914864331483841, + "learning_rate": 9.999075030301184e-05, + "loss": 0.0575, + "step": 3350 + }, + { + "epoch": 0.6057328285559762, + "grad_norm": 0.22202041745185852, + "learning_rate": 9.999021265785221e-05, + "loss": 0.0548, + "step": 3360 + }, + { + "epoch": 0.6075356048314404, + "grad_norm": 0.14372526109218597, + "learning_rate": 9.998965982698589e-05, + "loss": 0.0547, + "step": 3370 + }, + { + "epoch": 0.6093383811069046, + "grad_norm": 0.22338172793388367, + "learning_rate": 9.998909181058082e-05, + "loss": 0.0596, + "step": 3380 + }, + { + "epoch": 0.6111411573823688, + "grad_norm": 0.23771338164806366, + "learning_rate": 9.998850860880953e-05, + "loss": 0.0542, + "step": 3390 + }, + { + "epoch": 0.6129439336578331, + "grad_norm": 0.14361989498138428, + "learning_rate": 9.998791022184922e-05, + "loss": 0.0493, + "step": 3400 + }, + { + "epoch": 0.6147467099332973, + "grad_norm": 0.24462664127349854, + "learning_rate": 9.99872966498816e-05, + "loss": 0.0544, + "step": 3410 + }, + { + "epoch": 0.6165494862087615, + "grad_norm": 0.26684728264808655, + "learning_rate": 9.998666789309313e-05, + "loss": 0.0569, + "step": 3420 + }, + { + "epoch": 0.6183522624842257, + "grad_norm": 0.22234764695167542, + "learning_rate": 9.998602395167475e-05, + "loss": 0.0514, + "step": 3430 + }, + { + "epoch": 0.6201550387596899, + "grad_norm": 0.2795373797416687, + "learning_rate": 9.998536482582213e-05, + "loss": 0.0583, + "step": 3440 + }, + { + "epoch": 0.6219578150351541, + "grad_norm": 0.22455738484859467, + "learning_rate": 9.998469051573544e-05, + "loss": 0.0482, + "step": 3450 + }, + { + "epoch": 0.6237605913106183, + "grad_norm": 0.204681396484375, + "learning_rate": 9.998400102161954e-05, + "loss": 0.0541, + "step": 3460 + }, + { + "epoch": 0.6255633675860826, + "grad_norm": 0.1677926778793335, + "learning_rate": 9.998329634368388e-05, + "loss": 0.0556, + "step": 3470 + }, + { + "epoch": 0.6273661438615468, + "grad_norm": 0.16572725772857666, + "learning_rate": 9.998257648214253e-05, + "loss": 0.0511, + "step": 3480 + }, + { + "epoch": 0.629168920137011, + "grad_norm": 0.18733033537864685, + "learning_rate": 9.998184143721417e-05, + "loss": 0.0543, + "step": 3490 + }, + { + "epoch": 0.6309716964124752, + "grad_norm": 0.23857447504997253, + "learning_rate": 9.998109120912206e-05, + "loss": 0.0558, + "step": 3500 + }, + { + "epoch": 0.6327744726879394, + "grad_norm": 0.20467570424079895, + "learning_rate": 9.998032579809411e-05, + "loss": 0.0525, + "step": 3510 + }, + { + "epoch": 0.6345772489634036, + "grad_norm": 0.1384439617395401, + "learning_rate": 9.997954520436286e-05, + "loss": 0.0541, + "step": 3520 + }, + { + "epoch": 0.6363800252388678, + "grad_norm": 0.19725823402404785, + "learning_rate": 9.997874942816538e-05, + "loss": 0.0569, + "step": 3530 + }, + { + "epoch": 0.638182801514332, + "grad_norm": 0.16356350481510162, + "learning_rate": 9.997793846974345e-05, + "loss": 0.0501, + "step": 3540 + }, + { + "epoch": 0.6399855777897963, + "grad_norm": 0.20610161125659943, + "learning_rate": 9.997711232934341e-05, + "loss": 0.0551, + "step": 3550 + }, + { + "epoch": 0.6417883540652605, + "grad_norm": 0.16926410794258118, + "learning_rate": 9.99762710072162e-05, + "loss": 0.0594, + "step": 3560 + }, + { + "epoch": 0.6435911303407247, + "grad_norm": 0.17915554344654083, + "learning_rate": 9.997541450361743e-05, + "loss": 0.0545, + "step": 3570 + }, + { + "epoch": 0.6453939066161889, + "grad_norm": 0.2203824371099472, + "learning_rate": 9.997454281880723e-05, + "loss": 0.058, + "step": 3580 + }, + { + "epoch": 0.6471966828916531, + "grad_norm": 0.2686256766319275, + "learning_rate": 9.997365595305044e-05, + "loss": 0.06, + "step": 3590 + }, + { + "epoch": 0.6489994591671173, + "grad_norm": 0.2311336100101471, + "learning_rate": 9.997275390661644e-05, + "loss": 0.0536, + "step": 3600 + }, + { + "epoch": 0.6508022354425815, + "grad_norm": 0.2140345573425293, + "learning_rate": 9.997183667977926e-05, + "loss": 0.0581, + "step": 3610 + }, + { + "epoch": 0.6526050117180457, + "grad_norm": 0.2858904302120209, + "learning_rate": 9.997090427281752e-05, + "loss": 0.0492, + "step": 3620 + }, + { + "epoch": 0.65440778799351, + "grad_norm": 0.2953670024871826, + "learning_rate": 9.996995668601448e-05, + "loss": 0.055, + "step": 3630 + }, + { + "epoch": 0.6562105642689742, + "grad_norm": 0.2625373303890228, + "learning_rate": 9.996899391965798e-05, + "loss": 0.0521, + "step": 3640 + }, + { + "epoch": 0.6580133405444384, + "grad_norm": 0.16617342829704285, + "learning_rate": 9.996801597404048e-05, + "loss": 0.0524, + "step": 3650 + }, + { + "epoch": 0.6598161168199026, + "grad_norm": 0.19370779395103455, + "learning_rate": 9.996702284945905e-05, + "loss": 0.0541, + "step": 3660 + }, + { + "epoch": 0.6616188930953668, + "grad_norm": 0.22694896161556244, + "learning_rate": 9.996601454621539e-05, + "loss": 0.0543, + "step": 3670 + }, + { + "epoch": 0.6634216693708311, + "grad_norm": 0.16225895285606384, + "learning_rate": 9.996499106461577e-05, + "loss": 0.0513, + "step": 3680 + }, + { + "epoch": 0.6652244456462953, + "grad_norm": 0.13752566277980804, + "learning_rate": 9.996395240497112e-05, + "loss": 0.0552, + "step": 3690 + }, + { + "epoch": 0.6670272219217596, + "grad_norm": 0.17287544906139374, + "learning_rate": 9.996289856759696e-05, + "loss": 0.055, + "step": 3700 + }, + { + "epoch": 0.6688299981972238, + "grad_norm": 0.1903308480978012, + "learning_rate": 9.996182955281342e-05, + "loss": 0.0541, + "step": 3710 + }, + { + "epoch": 0.670632774472688, + "grad_norm": 0.17485389113426208, + "learning_rate": 9.996074536094519e-05, + "loss": 0.0511, + "step": 3720 + }, + { + "epoch": 0.6724355507481522, + "grad_norm": 0.18762964010238647, + "learning_rate": 9.995964599232168e-05, + "loss": 0.0574, + "step": 3730 + }, + { + "epoch": 0.6742383270236164, + "grad_norm": 0.17530937492847443, + "learning_rate": 9.995853144727683e-05, + "loss": 0.0603, + "step": 3740 + }, + { + "epoch": 0.6760411032990806, + "grad_norm": 0.1973475217819214, + "learning_rate": 9.99574017261492e-05, + "loss": 0.0544, + "step": 3750 + }, + { + "epoch": 0.6778438795745448, + "grad_norm": 0.2093428671360016, + "learning_rate": 9.995625682928198e-05, + "loss": 0.0559, + "step": 3760 + }, + { + "epoch": 0.679646655850009, + "grad_norm": 0.14786028861999512, + "learning_rate": 9.995509675702295e-05, + "loss": 0.0529, + "step": 3770 + }, + { + "epoch": 0.6814494321254733, + "grad_norm": 0.2190335988998413, + "learning_rate": 9.995392150972451e-05, + "loss": 0.0643, + "step": 3780 + }, + { + "epoch": 0.6832522084009375, + "grad_norm": 0.22454246878623962, + "learning_rate": 9.995273108774366e-05, + "loss": 0.0522, + "step": 3790 + }, + { + "epoch": 0.6850549846764017, + "grad_norm": 0.18701009452342987, + "learning_rate": 9.995152549144205e-05, + "loss": 0.0547, + "step": 3800 + }, + { + "epoch": 0.6868577609518659, + "grad_norm": 0.21698643267154694, + "learning_rate": 9.995030472118587e-05, + "loss": 0.0543, + "step": 3810 + }, + { + "epoch": 0.6886605372273301, + "grad_norm": 0.2331894040107727, + "learning_rate": 9.9949068777346e-05, + "loss": 0.0578, + "step": 3820 + }, + { + "epoch": 0.6904633135027943, + "grad_norm": 0.23350775241851807, + "learning_rate": 9.994781766029786e-05, + "loss": 0.0556, + "step": 3830 + }, + { + "epoch": 0.6922660897782585, + "grad_norm": 0.17531098425388336, + "learning_rate": 9.994655137042151e-05, + "loss": 0.0514, + "step": 3840 + }, + { + "epoch": 0.6940688660537228, + "grad_norm": 0.17184023559093475, + "learning_rate": 9.99452699081016e-05, + "loss": 0.0495, + "step": 3850 + }, + { + "epoch": 0.695871642329187, + "grad_norm": 0.16536134481430054, + "learning_rate": 9.994397327372743e-05, + "loss": 0.0539, + "step": 3860 + }, + { + "epoch": 0.6976744186046512, + "grad_norm": 0.1660694181919098, + "learning_rate": 9.994266146769286e-05, + "loss": 0.053, + "step": 3870 + }, + { + "epoch": 0.6994771948801154, + "grad_norm": 0.20537647604942322, + "learning_rate": 9.994133449039642e-05, + "loss": 0.0578, + "step": 3880 + }, + { + "epoch": 0.7012799711555796, + "grad_norm": 0.18250614404678345, + "learning_rate": 9.993999234224118e-05, + "loss": 0.0511, + "step": 3890 + }, + { + "epoch": 0.7030827474310438, + "grad_norm": 0.21514642238616943, + "learning_rate": 9.993863502363485e-05, + "loss": 0.0553, + "step": 3900 + }, + { + "epoch": 0.704885523706508, + "grad_norm": 0.2605758309364319, + "learning_rate": 9.993726253498976e-05, + "loss": 0.0529, + "step": 3910 + }, + { + "epoch": 0.7066882999819722, + "grad_norm": 0.20881788432598114, + "learning_rate": 9.993587487672282e-05, + "loss": 0.0555, + "step": 3920 + }, + { + "epoch": 0.7084910762574365, + "grad_norm": 0.18526031076908112, + "learning_rate": 9.993447204925558e-05, + "loss": 0.0529, + "step": 3930 + }, + { + "epoch": 0.7102938525329007, + "grad_norm": 0.17271843552589417, + "learning_rate": 9.993305405301416e-05, + "loss": 0.0539, + "step": 3940 + }, + { + "epoch": 0.7120966288083649, + "grad_norm": 0.17218437790870667, + "learning_rate": 9.993162088842935e-05, + "loss": 0.053, + "step": 3950 + }, + { + "epoch": 0.7138994050838291, + "grad_norm": 0.20867522060871124, + "learning_rate": 9.993017255593646e-05, + "loss": 0.0544, + "step": 3960 + }, + { + "epoch": 0.7157021813592933, + "grad_norm": 0.18345223367214203, + "learning_rate": 9.992870905597548e-05, + "loss": 0.0522, + "step": 3970 + }, + { + "epoch": 0.7175049576347575, + "grad_norm": 0.14037351310253143, + "learning_rate": 9.9927230388991e-05, + "loss": 0.0507, + "step": 3980 + }, + { + "epoch": 0.7193077339102217, + "grad_norm": 0.1470947265625, + "learning_rate": 9.992573655543215e-05, + "loss": 0.0525, + "step": 3990 + }, + { + "epoch": 0.721110510185686, + "grad_norm": 0.15597905218601227, + "learning_rate": 9.992422755575277e-05, + "loss": 0.0486, + "step": 4000 + }, + { + "epoch": 0.7229132864611502, + "grad_norm": 0.15142855048179626, + "learning_rate": 9.992270339041123e-05, + "loss": 0.0477, + "step": 4010 + }, + { + "epoch": 0.7247160627366144, + "grad_norm": 0.19573090970516205, + "learning_rate": 9.992116405987053e-05, + "loss": 0.0564, + "step": 4020 + }, + { + "epoch": 0.7265188390120786, + "grad_norm": 0.13741803169250488, + "learning_rate": 9.991960956459828e-05, + "loss": 0.0533, + "step": 4030 + }, + { + "epoch": 0.7283216152875428, + "grad_norm": 0.19118353724479675, + "learning_rate": 9.991803990506669e-05, + "loss": 0.0509, + "step": 4040 + }, + { + "epoch": 0.730124391563007, + "grad_norm": 0.18628346920013428, + "learning_rate": 9.991645508175258e-05, + "loss": 0.0568, + "step": 4050 + }, + { + "epoch": 0.7319271678384712, + "grad_norm": 0.2145071029663086, + "learning_rate": 9.99148550951374e-05, + "loss": 0.0523, + "step": 4060 + }, + { + "epoch": 0.7337299441139354, + "grad_norm": 0.17149919271469116, + "learning_rate": 9.991323994570716e-05, + "loss": 0.0483, + "step": 4070 + }, + { + "epoch": 0.7355327203893997, + "grad_norm": 0.22665870189666748, + "learning_rate": 9.99116096339525e-05, + "loss": 0.0567, + "step": 4080 + }, + { + "epoch": 0.7373354966648639, + "grad_norm": 0.17213742434978485, + "learning_rate": 9.990996416036869e-05, + "loss": 0.0519, + "step": 4090 + }, + { + "epoch": 0.7391382729403281, + "grad_norm": 0.21811003983020782, + "learning_rate": 9.990830352545555e-05, + "loss": 0.0507, + "step": 4100 + }, + { + "epoch": 0.7409410492157923, + "grad_norm": 0.19745135307312012, + "learning_rate": 9.990662772971756e-05, + "loss": 0.0522, + "step": 4110 + }, + { + "epoch": 0.7427438254912565, + "grad_norm": 0.15801742672920227, + "learning_rate": 9.990493677366376e-05, + "loss": 0.0531, + "step": 4120 + }, + { + "epoch": 0.7445466017667207, + "grad_norm": 0.19218087196350098, + "learning_rate": 9.990323065780786e-05, + "loss": 0.057, + "step": 4130 + }, + { + "epoch": 0.7463493780421849, + "grad_norm": 0.20360451936721802, + "learning_rate": 9.990150938266808e-05, + "loss": 0.0525, + "step": 4140 + }, + { + "epoch": 0.7481521543176491, + "grad_norm": 0.19944339990615845, + "learning_rate": 9.989977294876733e-05, + "loss": 0.0533, + "step": 4150 + }, + { + "epoch": 0.7499549305931134, + "grad_norm": 0.16528679430484772, + "learning_rate": 9.989802135663308e-05, + "loss": 0.0532, + "step": 4160 + }, + { + "epoch": 0.7517577068685776, + "grad_norm": 0.15513837337493896, + "learning_rate": 9.989625460679743e-05, + "loss": 0.0493, + "step": 4170 + }, + { + "epoch": 0.7535604831440418, + "grad_norm": 0.2322871834039688, + "learning_rate": 9.989447269979706e-05, + "loss": 0.0529, + "step": 4180 + }, + { + "epoch": 0.755363259419506, + "grad_norm": 0.1429019719362259, + "learning_rate": 9.989267563617328e-05, + "loss": 0.0524, + "step": 4190 + }, + { + "epoch": 0.7571660356949702, + "grad_norm": 0.15561752021312714, + "learning_rate": 9.989086341647198e-05, + "loss": 0.048, + "step": 4200 + }, + { + "epoch": 0.7589688119704344, + "grad_norm": 0.15190979838371277, + "learning_rate": 9.988903604124366e-05, + "loss": 0.0502, + "step": 4210 + }, + { + "epoch": 0.7607715882458986, + "grad_norm": 0.19186142086982727, + "learning_rate": 9.988719351104343e-05, + "loss": 0.0484, + "step": 4220 + }, + { + "epoch": 0.7625743645213628, + "grad_norm": 0.2058783620595932, + "learning_rate": 9.9885335826431e-05, + "loss": 0.0463, + "step": 4230 + }, + { + "epoch": 0.7643771407968271, + "grad_norm": 0.19337031245231628, + "learning_rate": 9.988346298797071e-05, + "loss": 0.0493, + "step": 4240 + }, + { + "epoch": 0.7661799170722914, + "grad_norm": 0.24919281899929047, + "learning_rate": 9.988157499623146e-05, + "loss": 0.0566, + "step": 4250 + }, + { + "epoch": 0.7679826933477556, + "grad_norm": 0.23650670051574707, + "learning_rate": 9.987967185178677e-05, + "loss": 0.0529, + "step": 4260 + }, + { + "epoch": 0.7697854696232198, + "grad_norm": 0.15392479300498962, + "learning_rate": 9.987775355521476e-05, + "loss": 0.0468, + "step": 4270 + }, + { + "epoch": 0.771588245898684, + "grad_norm": 0.2416786551475525, + "learning_rate": 9.987582010709817e-05, + "loss": 0.0479, + "step": 4280 + }, + { + "epoch": 0.7733910221741482, + "grad_norm": 0.15816278755664825, + "learning_rate": 9.987387150802431e-05, + "loss": 0.0517, + "step": 4290 + }, + { + "epoch": 0.7751937984496124, + "grad_norm": 0.16642552614212036, + "learning_rate": 9.987190775858517e-05, + "loss": 0.0498, + "step": 4300 + }, + { + "epoch": 0.7769965747250767, + "grad_norm": 0.12232939898967743, + "learning_rate": 9.98699288593772e-05, + "loss": 0.0528, + "step": 4310 + }, + { + "epoch": 0.7787993510005409, + "grad_norm": 0.1731266975402832, + "learning_rate": 9.986793481100161e-05, + "loss": 0.048, + "step": 4320 + }, + { + "epoch": 0.7806021272760051, + "grad_norm": 0.21021099388599396, + "learning_rate": 9.986592561406412e-05, + "loss": 0.0483, + "step": 4330 + }, + { + "epoch": 0.7824049035514693, + "grad_norm": 0.1725785732269287, + "learning_rate": 9.986390126917503e-05, + "loss": 0.0534, + "step": 4340 + }, + { + "epoch": 0.7842076798269335, + "grad_norm": 0.2088138461112976, + "learning_rate": 9.986186177694933e-05, + "loss": 0.0547, + "step": 4350 + }, + { + "epoch": 0.7860104561023977, + "grad_norm": 0.21956445276737213, + "learning_rate": 9.985980713800656e-05, + "loss": 0.0478, + "step": 4360 + }, + { + "epoch": 0.7878132323778619, + "grad_norm": 0.16561521589756012, + "learning_rate": 9.985773735297084e-05, + "loss": 0.0521, + "step": 4370 + }, + { + "epoch": 0.7896160086533262, + "grad_norm": 0.16367164254188538, + "learning_rate": 9.985565242247092e-05, + "loss": 0.0538, + "step": 4380 + }, + { + "epoch": 0.7914187849287904, + "grad_norm": 0.19190697371959686, + "learning_rate": 9.985355234714016e-05, + "loss": 0.0496, + "step": 4390 + }, + { + "epoch": 0.7932215612042546, + "grad_norm": 0.14845453202724457, + "learning_rate": 9.985143712761652e-05, + "loss": 0.0483, + "step": 4400 + }, + { + "epoch": 0.7950243374797188, + "grad_norm": 0.13262321054935455, + "learning_rate": 9.984930676454252e-05, + "loss": 0.0553, + "step": 4410 + }, + { + "epoch": 0.796827113755183, + "grad_norm": 0.20816828310489655, + "learning_rate": 9.984716125856532e-05, + "loss": 0.0507, + "step": 4420 + }, + { + "epoch": 0.7986298900306472, + "grad_norm": 0.15905918180942535, + "learning_rate": 9.984500061033667e-05, + "loss": 0.0472, + "step": 4430 + }, + { + "epoch": 0.8004326663061114, + "grad_norm": 0.16600371897220612, + "learning_rate": 9.984282482051293e-05, + "loss": 0.0499, + "step": 4440 + }, + { + "epoch": 0.8022354425815756, + "grad_norm": 0.2226187288761139, + "learning_rate": 9.9840633889755e-05, + "loss": 0.0513, + "step": 4450 + }, + { + "epoch": 0.8040382188570399, + "grad_norm": 0.13732576370239258, + "learning_rate": 9.983842781872848e-05, + "loss": 0.0514, + "step": 4460 + }, + { + "epoch": 0.8058409951325041, + "grad_norm": 0.16772523522377014, + "learning_rate": 9.98362066081035e-05, + "loss": 0.0553, + "step": 4470 + }, + { + "epoch": 0.8076437714079683, + "grad_norm": 0.15782472491264343, + "learning_rate": 9.983397025855479e-05, + "loss": 0.0514, + "step": 4480 + }, + { + "epoch": 0.8094465476834325, + "grad_norm": 0.20454518496990204, + "learning_rate": 9.983171877076171e-05, + "loss": 0.0503, + "step": 4490 + }, + { + "epoch": 0.8112493239588967, + "grad_norm": 0.23142272233963013, + "learning_rate": 9.98294521454082e-05, + "loss": 0.0515, + "step": 4500 + }, + { + "epoch": 0.8130521002343609, + "grad_norm": 0.17832320928573608, + "learning_rate": 9.98271703831828e-05, + "loss": 0.0495, + "step": 4510 + }, + { + "epoch": 0.8148548765098251, + "grad_norm": 0.16449207067489624, + "learning_rate": 9.982487348477865e-05, + "loss": 0.0577, + "step": 4520 + }, + { + "epoch": 0.8166576527852893, + "grad_norm": 0.2397778481245041, + "learning_rate": 9.982256145089347e-05, + "loss": 0.0437, + "step": 4530 + }, + { + "epoch": 0.8184604290607536, + "grad_norm": 0.13208262622356415, + "learning_rate": 9.982023428222962e-05, + "loss": 0.0477, + "step": 4540 + }, + { + "epoch": 0.8202632053362178, + "grad_norm": 0.168897807598114, + "learning_rate": 9.981789197949403e-05, + "loss": 0.0475, + "step": 4550 + }, + { + "epoch": 0.822065981611682, + "grad_norm": 0.15609736740589142, + "learning_rate": 9.98155345433982e-05, + "loss": 0.0481, + "step": 4560 + }, + { + "epoch": 0.8238687578871462, + "grad_norm": 0.11921030282974243, + "learning_rate": 9.981316197465831e-05, + "loss": 0.0582, + "step": 4570 + }, + { + "epoch": 0.8256715341626104, + "grad_norm": 0.23547197878360748, + "learning_rate": 9.981077427399504e-05, + "loss": 0.0474, + "step": 4580 + }, + { + "epoch": 0.8274743104380746, + "grad_norm": 0.15833474695682526, + "learning_rate": 9.980837144213371e-05, + "loss": 0.0477, + "step": 4590 + }, + { + "epoch": 0.8292770867135388, + "grad_norm": 0.11992108076810837, + "learning_rate": 9.980595347980426e-05, + "loss": 0.0494, + "step": 4600 + }, + { + "epoch": 0.831079862989003, + "grad_norm": 0.1470881551504135, + "learning_rate": 9.980352038774119e-05, + "loss": 0.0523, + "step": 4610 + }, + { + "epoch": 0.8328826392644673, + "grad_norm": 0.17348624765872955, + "learning_rate": 9.98010721666836e-05, + "loss": 0.0509, + "step": 4620 + }, + { + "epoch": 0.8346854155399315, + "grad_norm": 0.19449074566364288, + "learning_rate": 9.979860881737523e-05, + "loss": 0.046, + "step": 4630 + }, + { + "epoch": 0.8364881918153957, + "grad_norm": 0.18979020416736603, + "learning_rate": 9.979613034056434e-05, + "loss": 0.0487, + "step": 4640 + }, + { + "epoch": 0.8382909680908599, + "grad_norm": 0.214563250541687, + "learning_rate": 9.979363673700386e-05, + "loss": 0.0484, + "step": 4650 + }, + { + "epoch": 0.8400937443663241, + "grad_norm": 0.14514483511447906, + "learning_rate": 9.979112800745124e-05, + "loss": 0.0505, + "step": 4660 + }, + { + "epoch": 0.8418965206417883, + "grad_norm": 0.1271432340145111, + "learning_rate": 9.978860415266861e-05, + "loss": 0.0514, + "step": 4670 + }, + { + "epoch": 0.8436992969172525, + "grad_norm": 0.25574085116386414, + "learning_rate": 9.978606517342262e-05, + "loss": 0.052, + "step": 4680 + }, + { + "epoch": 0.8455020731927168, + "grad_norm": 0.16129499673843384, + "learning_rate": 9.978351107048456e-05, + "loss": 0.0516, + "step": 4690 + }, + { + "epoch": 0.847304849468181, + "grad_norm": 0.208964541554451, + "learning_rate": 9.978094184463029e-05, + "loss": 0.0485, + "step": 4700 + }, + { + "epoch": 0.8491076257436452, + "grad_norm": 0.18033969402313232, + "learning_rate": 9.977835749664029e-05, + "loss": 0.0486, + "step": 4710 + }, + { + "epoch": 0.8509104020191094, + "grad_norm": 0.17518360912799835, + "learning_rate": 9.97757580272996e-05, + "loss": 0.0498, + "step": 4720 + }, + { + "epoch": 0.8527131782945736, + "grad_norm": 0.1941109299659729, + "learning_rate": 9.977314343739786e-05, + "loss": 0.0508, + "step": 4730 + }, + { + "epoch": 0.8545159545700378, + "grad_norm": 0.1365840882062912, + "learning_rate": 9.977051372772934e-05, + "loss": 0.0469, + "step": 4740 + }, + { + "epoch": 0.856318730845502, + "grad_norm": 0.20867887139320374, + "learning_rate": 9.976786889909286e-05, + "loss": 0.0479, + "step": 4750 + }, + { + "epoch": 0.8581215071209662, + "grad_norm": 0.16011196374893188, + "learning_rate": 9.976520895229185e-05, + "loss": 0.0477, + "step": 4760 + }, + { + "epoch": 0.8599242833964305, + "grad_norm": 0.14465907216072083, + "learning_rate": 9.976253388813433e-05, + "loss": 0.0513, + "step": 4770 + }, + { + "epoch": 0.8617270596718947, + "grad_norm": 0.18770858645439148, + "learning_rate": 9.975984370743293e-05, + "loss": 0.0491, + "step": 4780 + }, + { + "epoch": 0.8635298359473589, + "grad_norm": 0.13379095494747162, + "learning_rate": 9.975713841100485e-05, + "loss": 0.0508, + "step": 4790 + }, + { + "epoch": 0.8653326122228231, + "grad_norm": 0.17976029217243195, + "learning_rate": 9.975441799967187e-05, + "loss": 0.05, + "step": 4800 + }, + { + "epoch": 0.8671353884982873, + "grad_norm": 0.1620328575372696, + "learning_rate": 9.975168247426039e-05, + "loss": 0.0445, + "step": 4810 + }, + { + "epoch": 0.8689381647737516, + "grad_norm": 0.25882643461227417, + "learning_rate": 9.974893183560139e-05, + "loss": 0.0545, + "step": 4820 + }, + { + "epoch": 0.8707409410492158, + "grad_norm": 0.1808311641216278, + "learning_rate": 9.974616608453045e-05, + "loss": 0.0558, + "step": 4830 + }, + { + "epoch": 0.8725437173246801, + "grad_norm": 0.17577724158763885, + "learning_rate": 9.974338522188772e-05, + "loss": 0.0484, + "step": 4840 + }, + { + "epoch": 0.8743464936001443, + "grad_norm": 0.2507362365722656, + "learning_rate": 9.974058924851797e-05, + "loss": 0.0499, + "step": 4850 + }, + { + "epoch": 0.8761492698756085, + "grad_norm": 0.21123144030570984, + "learning_rate": 9.973777816527051e-05, + "loss": 0.0526, + "step": 4860 + }, + { + "epoch": 0.8779520461510727, + "grad_norm": 0.18370167911052704, + "learning_rate": 9.973495197299931e-05, + "loss": 0.0465, + "step": 4870 + }, + { + "epoch": 0.8797548224265369, + "grad_norm": 0.19995036721229553, + "learning_rate": 9.973211067256287e-05, + "loss": 0.0539, + "step": 4880 + }, + { + "epoch": 0.8815575987020011, + "grad_norm": 0.16384492814540863, + "learning_rate": 9.97292542648243e-05, + "loss": 0.0487, + "step": 4890 + }, + { + "epoch": 0.8833603749774653, + "grad_norm": 0.17125661671161652, + "learning_rate": 9.972638275065131e-05, + "loss": 0.0489, + "step": 4900 + }, + { + "epoch": 0.8851631512529295, + "grad_norm": 0.14433567225933075, + "learning_rate": 9.972349613091621e-05, + "loss": 0.0468, + "step": 4910 + }, + { + "epoch": 0.8869659275283938, + "grad_norm": 0.18537750840187073, + "learning_rate": 9.972059440649584e-05, + "loss": 0.0507, + "step": 4920 + }, + { + "epoch": 0.888768703803858, + "grad_norm": 0.1987074315547943, + "learning_rate": 9.971767757827168e-05, + "loss": 0.0474, + "step": 4930 + }, + { + "epoch": 0.8905714800793222, + "grad_norm": 0.15364907681941986, + "learning_rate": 9.971474564712982e-05, + "loss": 0.0454, + "step": 4940 + }, + { + "epoch": 0.8923742563547864, + "grad_norm": 0.13974200189113617, + "learning_rate": 9.971179861396084e-05, + "loss": 0.0472, + "step": 4950 + }, + { + "epoch": 0.8941770326302506, + "grad_norm": 0.1537025272846222, + "learning_rate": 9.970883647966003e-05, + "loss": 0.0495, + "step": 4960 + }, + { + "epoch": 0.8959798089057148, + "grad_norm": 0.12050797045230865, + "learning_rate": 9.970585924512717e-05, + "loss": 0.0474, + "step": 4970 + }, + { + "epoch": 0.897782585181179, + "grad_norm": 0.17876748740673065, + "learning_rate": 9.970286691126669e-05, + "loss": 0.0482, + "step": 4980 + }, + { + "epoch": 0.8995853614566433, + "grad_norm": 0.1352316439151764, + "learning_rate": 9.969985947898756e-05, + "loss": 0.0495, + "step": 4990 + }, + { + "epoch": 0.9013881377321075, + "grad_norm": 0.17774096131324768, + "learning_rate": 9.969683694920337e-05, + "loss": 0.0494, + "step": 5000 + }, + { + "epoch": 0.9031909140075717, + "grad_norm": 0.11138828843832016, + "learning_rate": 9.969379932283228e-05, + "loss": 0.0468, + "step": 5010 + }, + { + "epoch": 0.9049936902830359, + "grad_norm": 0.14681695401668549, + "learning_rate": 9.969074660079704e-05, + "loss": 0.0425, + "step": 5020 + }, + { + "epoch": 0.9067964665585001, + "grad_norm": 0.15625491738319397, + "learning_rate": 9.968767878402501e-05, + "loss": 0.05, + "step": 5030 + }, + { + "epoch": 0.9085992428339643, + "grad_norm": 0.13043609261512756, + "learning_rate": 9.968459587344808e-05, + "loss": 0.0478, + "step": 5040 + }, + { + "epoch": 0.9104020191094285, + "grad_norm": 0.2065897434949875, + "learning_rate": 9.968149787000278e-05, + "loss": 0.0468, + "step": 5050 + }, + { + "epoch": 0.9122047953848927, + "grad_norm": 0.17964307963848114, + "learning_rate": 9.967838477463018e-05, + "loss": 0.05, + "step": 5060 + }, + { + "epoch": 0.914007571660357, + "grad_norm": 0.1859792321920395, + "learning_rate": 9.967525658827597e-05, + "loss": 0.0506, + "step": 5070 + }, + { + "epoch": 0.9158103479358212, + "grad_norm": 0.15018241107463837, + "learning_rate": 9.967211331189042e-05, + "loss": 0.0502, + "step": 5080 + }, + { + "epoch": 0.9176131242112854, + "grad_norm": 0.17236633598804474, + "learning_rate": 9.966895494642834e-05, + "loss": 0.0472, + "step": 5090 + }, + { + "epoch": 0.9194159004867496, + "grad_norm": 0.1674015372991562, + "learning_rate": 9.96657814928492e-05, + "loss": 0.0498, + "step": 5100 + }, + { + "epoch": 0.9212186767622138, + "grad_norm": 0.1864183396100998, + "learning_rate": 9.966259295211697e-05, + "loss": 0.048, + "step": 5110 + }, + { + "epoch": 0.923021453037678, + "grad_norm": 0.18949656188488007, + "learning_rate": 9.965938932520028e-05, + "loss": 0.0522, + "step": 5120 + }, + { + "epoch": 0.9248242293131422, + "grad_norm": 0.1557510942220688, + "learning_rate": 9.965617061307229e-05, + "loss": 0.0503, + "step": 5130 + }, + { + "epoch": 0.9266270055886064, + "grad_norm": 0.19600127637386322, + "learning_rate": 9.965293681671077e-05, + "loss": 0.0507, + "step": 5140 + }, + { + "epoch": 0.9284297818640707, + "grad_norm": 0.22450041770935059, + "learning_rate": 9.964968793709804e-05, + "loss": 0.0451, + "step": 5150 + }, + { + "epoch": 0.9302325581395349, + "grad_norm": 0.18441647291183472, + "learning_rate": 9.964642397522106e-05, + "loss": 0.0552, + "step": 5160 + }, + { + "epoch": 0.9320353344149991, + "grad_norm": 0.18776282668113708, + "learning_rate": 9.96431449320713e-05, + "loss": 0.045, + "step": 5170 + }, + { + "epoch": 0.9338381106904633, + "grad_norm": 0.14953307807445526, + "learning_rate": 9.963985080864486e-05, + "loss": 0.0467, + "step": 5180 + }, + { + "epoch": 0.9356408869659275, + "grad_norm": 0.19671177864074707, + "learning_rate": 9.96365416059424e-05, + "loss": 0.0498, + "step": 5190 + }, + { + "epoch": 0.9374436632413917, + "grad_norm": 0.1434078812599182, + "learning_rate": 9.963321732496919e-05, + "loss": 0.0456, + "step": 5200 + }, + { + "epoch": 0.9392464395168559, + "grad_norm": 0.26493221521377563, + "learning_rate": 9.962987796673506e-05, + "loss": 0.0474, + "step": 5210 + }, + { + "epoch": 0.9410492157923201, + "grad_norm": 0.1555505245923996, + "learning_rate": 9.962652353225438e-05, + "loss": 0.0492, + "step": 5220 + }, + { + "epoch": 0.9428519920677844, + "grad_norm": 0.17275793850421906, + "learning_rate": 9.962315402254619e-05, + "loss": 0.0475, + "step": 5230 + }, + { + "epoch": 0.9446547683432486, + "grad_norm": 0.16553552448749542, + "learning_rate": 9.9619769438634e-05, + "loss": 0.0491, + "step": 5240 + }, + { + "epoch": 0.9464575446187128, + "grad_norm": 0.2248888462781906, + "learning_rate": 9.9616369781546e-05, + "loss": 0.0513, + "step": 5250 + }, + { + "epoch": 0.948260320894177, + "grad_norm": 0.18081867694854736, + "learning_rate": 9.961295505231491e-05, + "loss": 0.0497, + "step": 5260 + }, + { + "epoch": 0.9500630971696412, + "grad_norm": 0.24891573190689087, + "learning_rate": 9.960952525197804e-05, + "loss": 0.0513, + "step": 5270 + }, + { + "epoch": 0.9518658734451054, + "grad_norm": 0.19671489298343658, + "learning_rate": 9.960608038157724e-05, + "loss": 0.0477, + "step": 5280 + }, + { + "epoch": 0.9536686497205696, + "grad_norm": 0.23478513956069946, + "learning_rate": 9.960262044215901e-05, + "loss": 0.0521, + "step": 5290 + }, + { + "epoch": 0.9554714259960339, + "grad_norm": 0.2092088758945465, + "learning_rate": 9.959914543477435e-05, + "loss": 0.0446, + "step": 5300 + }, + { + "epoch": 0.9572742022714981, + "grad_norm": 0.14573600888252258, + "learning_rate": 9.959565536047892e-05, + "loss": 0.0448, + "step": 5310 + }, + { + "epoch": 0.9590769785469623, + "grad_norm": 0.17377443611621857, + "learning_rate": 9.959215022033288e-05, + "loss": 0.053, + "step": 5320 + }, + { + "epoch": 0.9608797548224265, + "grad_norm": 0.19106821715831757, + "learning_rate": 9.9588630015401e-05, + "loss": 0.0504, + "step": 5330 + }, + { + "epoch": 0.9626825310978907, + "grad_norm": 0.18199145793914795, + "learning_rate": 9.958509474675264e-05, + "loss": 0.0449, + "step": 5340 + }, + { + "epoch": 0.9644853073733549, + "grad_norm": 0.2071419656276703, + "learning_rate": 9.958154441546171e-05, + "loss": 0.0465, + "step": 5350 + }, + { + "epoch": 0.9662880836488191, + "grad_norm": 0.1954154074192047, + "learning_rate": 9.957797902260673e-05, + "loss": 0.0496, + "step": 5360 + }, + { + "epoch": 0.9680908599242833, + "grad_norm": 0.15930332243442535, + "learning_rate": 9.957439856927073e-05, + "loss": 0.0479, + "step": 5370 + }, + { + "epoch": 0.9698936361997477, + "grad_norm": 0.1356603056192398, + "learning_rate": 9.957080305654139e-05, + "loss": 0.0459, + "step": 5380 + }, + { + "epoch": 0.9716964124752119, + "grad_norm": 0.16535735130310059, + "learning_rate": 9.956719248551092e-05, + "loss": 0.0485, + "step": 5390 + }, + { + "epoch": 0.9734991887506761, + "grad_norm": 0.13410362601280212, + "learning_rate": 9.956356685727612e-05, + "loss": 0.0499, + "step": 5400 + }, + { + "epoch": 0.9753019650261403, + "grad_norm": 0.15987256169319153, + "learning_rate": 9.955992617293836e-05, + "loss": 0.0532, + "step": 5410 + }, + { + "epoch": 0.9771047413016045, + "grad_norm": 0.15461158752441406, + "learning_rate": 9.955627043360358e-05, + "loss": 0.0528, + "step": 5420 + }, + { + "epoch": 0.9789075175770687, + "grad_norm": 0.1482284665107727, + "learning_rate": 9.955259964038231e-05, + "loss": 0.0514, + "step": 5430 + }, + { + "epoch": 0.9807102938525329, + "grad_norm": 0.14993350207805634, + "learning_rate": 9.954891379438962e-05, + "loss": 0.0496, + "step": 5440 + }, + { + "epoch": 0.9825130701279972, + "grad_norm": 0.17867375910282135, + "learning_rate": 9.954521289674519e-05, + "loss": 0.0509, + "step": 5450 + }, + { + "epoch": 0.9843158464034614, + "grad_norm": 0.1541956067085266, + "learning_rate": 9.954149694857325e-05, + "loss": 0.0535, + "step": 5460 + }, + { + "epoch": 0.9861186226789256, + "grad_norm": 0.18034759163856506, + "learning_rate": 9.953776595100258e-05, + "loss": 0.0475, + "step": 5470 + }, + { + "epoch": 0.9879213989543898, + "grad_norm": 0.14206287264823914, + "learning_rate": 9.95340199051666e-05, + "loss": 0.0472, + "step": 5480 + }, + { + "epoch": 0.989724175229854, + "grad_norm": 0.1905004382133484, + "learning_rate": 9.953025881220325e-05, + "loss": 0.057, + "step": 5490 + }, + { + "epoch": 0.9915269515053182, + "grad_norm": 0.18126565217971802, + "learning_rate": 9.952648267325504e-05, + "loss": 0.0466, + "step": 5500 + }, + { + "epoch": 0.9933297277807824, + "grad_norm": 0.17522726953029633, + "learning_rate": 9.952269148946905e-05, + "loss": 0.0523, + "step": 5510 + }, + { + "epoch": 0.9951325040562466, + "grad_norm": 0.16590476036071777, + "learning_rate": 9.951888526199697e-05, + "loss": 0.0515, + "step": 5520 + }, + { + "epoch": 0.9969352803317109, + "grad_norm": 0.15113309025764465, + "learning_rate": 9.951506399199501e-05, + "loss": 0.0519, + "step": 5530 + }, + { + "epoch": 0.9987380566071751, + "grad_norm": 0.14152631163597107, + "learning_rate": 9.951122768062399e-05, + "loss": 0.0413, + "step": 5540 + }, + { + "epoch": 1.0005408328826393, + "grad_norm": 0.149987131357193, + "learning_rate": 9.950737632904927e-05, + "loss": 0.0499, + "step": 5550 + }, + { + "epoch": 1.0023436091581035, + "grad_norm": 0.1638782024383545, + "learning_rate": 9.950350993844077e-05, + "loss": 0.0478, + "step": 5560 + }, + { + "epoch": 1.0041463854335677, + "grad_norm": 0.12248101830482483, + "learning_rate": 9.949962850997303e-05, + "loss": 0.0437, + "step": 5570 + }, + { + "epoch": 1.005949161709032, + "grad_norm": 0.14365188777446747, + "learning_rate": 9.949573204482512e-05, + "loss": 0.0471, + "step": 5580 + }, + { + "epoch": 1.0077519379844961, + "grad_norm": 0.1796455979347229, + "learning_rate": 9.949182054418064e-05, + "loss": 0.0484, + "step": 5590 + }, + { + "epoch": 1.0095547142599604, + "grad_norm": 0.20651739835739136, + "learning_rate": 9.948789400922787e-05, + "loss": 0.0474, + "step": 5600 + }, + { + "epoch": 1.0113574905354246, + "grad_norm": 0.14700564742088318, + "learning_rate": 9.948395244115953e-05, + "loss": 0.0453, + "step": 5610 + }, + { + "epoch": 1.0131602668108888, + "grad_norm": 0.17185929417610168, + "learning_rate": 9.9479995841173e-05, + "loss": 0.0453, + "step": 5620 + }, + { + "epoch": 1.014963043086353, + "grad_norm": 0.1785033792257309, + "learning_rate": 9.947602421047017e-05, + "loss": 0.0453, + "step": 5630 + }, + { + "epoch": 1.0167658193618172, + "grad_norm": 0.13014046847820282, + "learning_rate": 9.947203755025753e-05, + "loss": 0.0495, + "step": 5640 + }, + { + "epoch": 1.0185685956372814, + "grad_norm": 0.16697810590267181, + "learning_rate": 9.946803586174611e-05, + "loss": 0.0463, + "step": 5650 + }, + { + "epoch": 1.0203713719127456, + "grad_norm": 0.16410310566425323, + "learning_rate": 9.946401914615151e-05, + "loss": 0.0475, + "step": 5660 + }, + { + "epoch": 1.0221741481882098, + "grad_norm": 0.2110091745853424, + "learning_rate": 9.945998740469394e-05, + "loss": 0.0502, + "step": 5670 + }, + { + "epoch": 1.023976924463674, + "grad_norm": 0.17573440074920654, + "learning_rate": 9.945594063859809e-05, + "loss": 0.044, + "step": 5680 + }, + { + "epoch": 1.0257797007391383, + "grad_norm": 0.17109978199005127, + "learning_rate": 9.94518788490933e-05, + "loss": 0.0477, + "step": 5690 + }, + { + "epoch": 1.0275824770146025, + "grad_norm": 0.14034892618656158, + "learning_rate": 9.944780203741341e-05, + "loss": 0.0481, + "step": 5700 + }, + { + "epoch": 1.0293852532900667, + "grad_norm": 0.11823529750108719, + "learning_rate": 9.944371020479686e-05, + "loss": 0.0418, + "step": 5710 + }, + { + "epoch": 1.031188029565531, + "grad_norm": 0.23299425840377808, + "learning_rate": 9.943960335248662e-05, + "loss": 0.0496, + "step": 5720 + }, + { + "epoch": 1.0329908058409951, + "grad_norm": 0.13195425271987915, + "learning_rate": 9.943548148173027e-05, + "loss": 0.049, + "step": 5730 + }, + { + "epoch": 1.0347935821164593, + "grad_norm": 0.20911426842212677, + "learning_rate": 9.943134459377992e-05, + "loss": 0.0392, + "step": 5740 + }, + { + "epoch": 1.0365963583919235, + "grad_norm": 0.1547815203666687, + "learning_rate": 9.942719268989222e-05, + "loss": 0.0445, + "step": 5750 + }, + { + "epoch": 1.0383991346673878, + "grad_norm": 0.16118600964546204, + "learning_rate": 9.942302577132844e-05, + "loss": 0.0501, + "step": 5760 + }, + { + "epoch": 1.040201910942852, + "grad_norm": 0.20199312269687653, + "learning_rate": 9.941884383935438e-05, + "loss": 0.0505, + "step": 5770 + }, + { + "epoch": 1.0420046872183162, + "grad_norm": 0.20491407811641693, + "learning_rate": 9.941464689524039e-05, + "loss": 0.053, + "step": 5780 + }, + { + "epoch": 1.0438074634937804, + "grad_norm": 0.12579475343227386, + "learning_rate": 9.941043494026139e-05, + "loss": 0.0468, + "step": 5790 + }, + { + "epoch": 1.0456102397692446, + "grad_norm": 0.1562582403421402, + "learning_rate": 9.940620797569685e-05, + "loss": 0.0456, + "step": 5800 + }, + { + "epoch": 1.0474130160447088, + "grad_norm": 0.16877920925617218, + "learning_rate": 9.940196600283082e-05, + "loss": 0.0446, + "step": 5810 + }, + { + "epoch": 1.049215792320173, + "grad_norm": 0.16113650798797607, + "learning_rate": 9.939770902295192e-05, + "loss": 0.0496, + "step": 5820 + }, + { + "epoch": 1.0510185685956372, + "grad_norm": 0.1584942638874054, + "learning_rate": 9.939343703735329e-05, + "loss": 0.0461, + "step": 5830 + }, + { + "epoch": 1.0528213448711015, + "grad_norm": 0.13135896623134613, + "learning_rate": 9.938915004733264e-05, + "loss": 0.047, + "step": 5840 + }, + { + "epoch": 1.0546241211465657, + "grad_norm": 0.15665724873542786, + "learning_rate": 9.938484805419224e-05, + "loss": 0.0436, + "step": 5850 + }, + { + "epoch": 1.0564268974220299, + "grad_norm": 0.12544065713882446, + "learning_rate": 9.938053105923894e-05, + "loss": 0.0459, + "step": 5860 + }, + { + "epoch": 1.058229673697494, + "grad_norm": 0.18433156609535217, + "learning_rate": 9.937619906378413e-05, + "loss": 0.0469, + "step": 5870 + }, + { + "epoch": 1.0600324499729583, + "grad_norm": 0.1148960292339325, + "learning_rate": 9.937185206914374e-05, + "loss": 0.045, + "step": 5880 + }, + { + "epoch": 1.0618352262484225, + "grad_norm": 0.14018428325653076, + "learning_rate": 9.936749007663829e-05, + "loss": 0.0452, + "step": 5890 + }, + { + "epoch": 1.0636380025238867, + "grad_norm": 0.11849283427000046, + "learning_rate": 9.93631130875928e-05, + "loss": 0.0471, + "step": 5900 + }, + { + "epoch": 1.065440778799351, + "grad_norm": 0.22733385860919952, + "learning_rate": 9.935872110333692e-05, + "loss": 0.0418, + "step": 5910 + }, + { + "epoch": 1.0672435550748152, + "grad_norm": 0.1406257450580597, + "learning_rate": 9.935431412520484e-05, + "loss": 0.0428, + "step": 5920 + }, + { + "epoch": 1.0690463313502794, + "grad_norm": 0.1938510537147522, + "learning_rate": 9.934989215453523e-05, + "loss": 0.0502, + "step": 5930 + }, + { + "epoch": 1.0708491076257436, + "grad_norm": 0.15984466671943665, + "learning_rate": 9.934545519267139e-05, + "loss": 0.0409, + "step": 5940 + }, + { + "epoch": 1.0726518839012078, + "grad_norm": 0.1253848671913147, + "learning_rate": 9.934100324096117e-05, + "loss": 0.0517, + "step": 5950 + }, + { + "epoch": 1.074454660176672, + "grad_norm": 0.24231602251529694, + "learning_rate": 9.933653630075692e-05, + "loss": 0.0446, + "step": 5960 + }, + { + "epoch": 1.0762574364521362, + "grad_norm": 0.14123372733592987, + "learning_rate": 9.93320543734156e-05, + "loss": 0.0454, + "step": 5970 + }, + { + "epoch": 1.0780602127276004, + "grad_norm": 0.18040312826633453, + "learning_rate": 9.932755746029871e-05, + "loss": 0.0506, + "step": 5980 + }, + { + "epoch": 1.0798629890030647, + "grad_norm": 0.16003407537937164, + "learning_rate": 9.932304556277228e-05, + "loss": 0.0462, + "step": 5990 + }, + { + "epoch": 1.0816657652785289, + "grad_norm": 0.14502501487731934, + "learning_rate": 9.93185186822069e-05, + "loss": 0.0534, + "step": 6000 + }, + { + "epoch": 1.083468541553993, + "grad_norm": 0.14464087784290314, + "learning_rate": 9.931397681997773e-05, + "loss": 0.0426, + "step": 6010 + }, + { + "epoch": 1.0852713178294573, + "grad_norm": 0.18443164229393005, + "learning_rate": 9.930941997746446e-05, + "loss": 0.0471, + "step": 6020 + }, + { + "epoch": 1.0870740941049215, + "grad_norm": 0.11963239312171936, + "learning_rate": 9.930484815605134e-05, + "loss": 0.0457, + "step": 6030 + }, + { + "epoch": 1.0888768703803857, + "grad_norm": 0.14174722135066986, + "learning_rate": 9.930026135712717e-05, + "loss": 0.0453, + "step": 6040 + }, + { + "epoch": 1.09067964665585, + "grad_norm": 0.17676784098148346, + "learning_rate": 9.92956595820853e-05, + "loss": 0.0456, + "step": 6050 + }, + { + "epoch": 1.0924824229313141, + "grad_norm": 0.15777680277824402, + "learning_rate": 9.929104283232362e-05, + "loss": 0.0476, + "step": 6060 + }, + { + "epoch": 1.0942851992067784, + "grad_norm": 0.17012213170528412, + "learning_rate": 9.92864111092446e-05, + "loss": 0.0441, + "step": 6070 + }, + { + "epoch": 1.0960879754822426, + "grad_norm": 0.21466368436813354, + "learning_rate": 9.92817644142552e-05, + "loss": 0.0451, + "step": 6080 + }, + { + "epoch": 1.0978907517577068, + "grad_norm": 0.16265292465686798, + "learning_rate": 9.927710274876698e-05, + "loss": 0.0434, + "step": 6090 + }, + { + "epoch": 1.099693528033171, + "grad_norm": 0.2022707313299179, + "learning_rate": 9.927242611419603e-05, + "loss": 0.045, + "step": 6100 + }, + { + "epoch": 1.1014963043086352, + "grad_norm": 0.2022763192653656, + "learning_rate": 9.926773451196301e-05, + "loss": 0.0482, + "step": 6110 + }, + { + "epoch": 1.1032990805840994, + "grad_norm": 0.17981594800949097, + "learning_rate": 9.926302794349306e-05, + "loss": 0.047, + "step": 6120 + }, + { + "epoch": 1.1051018568595636, + "grad_norm": 0.18052178621292114, + "learning_rate": 9.925830641021594e-05, + "loss": 0.0444, + "step": 6130 + }, + { + "epoch": 1.1069046331350278, + "grad_norm": 0.19416724145412445, + "learning_rate": 9.925356991356593e-05, + "loss": 0.0459, + "step": 6140 + }, + { + "epoch": 1.108707409410492, + "grad_norm": 0.19651128351688385, + "learning_rate": 9.924881845498184e-05, + "loss": 0.0415, + "step": 6150 + }, + { + "epoch": 1.1105101856859563, + "grad_norm": 0.17051537334918976, + "learning_rate": 9.924405203590705e-05, + "loss": 0.0445, + "step": 6160 + }, + { + "epoch": 1.1123129619614205, + "grad_norm": 0.15216341614723206, + "learning_rate": 9.923927065778946e-05, + "loss": 0.042, + "step": 6170 + }, + { + "epoch": 1.1141157382368847, + "grad_norm": 0.1388617604970932, + "learning_rate": 9.923447432208154e-05, + "loss": 0.0455, + "step": 6180 + }, + { + "epoch": 1.115918514512349, + "grad_norm": 0.14735934138298035, + "learning_rate": 9.922966303024027e-05, + "loss": 0.0443, + "step": 6190 + }, + { + "epoch": 1.1177212907878133, + "grad_norm": 0.15511681139469147, + "learning_rate": 9.922483678372721e-05, + "loss": 0.0414, + "step": 6200 + }, + { + "epoch": 1.1195240670632773, + "grad_norm": 0.1833156943321228, + "learning_rate": 9.921999558400845e-05, + "loss": 0.0489, + "step": 6210 + }, + { + "epoch": 1.1213268433387418, + "grad_norm": 0.15968109667301178, + "learning_rate": 9.92151394325546e-05, + "loss": 0.047, + "step": 6220 + }, + { + "epoch": 1.1231296196142058, + "grad_norm": 0.12436918169260025, + "learning_rate": 9.921026833084084e-05, + "loss": 0.0478, + "step": 6230 + }, + { + "epoch": 1.1249323958896702, + "grad_norm": 0.1411399096250534, + "learning_rate": 9.920538228034689e-05, + "loss": 0.0454, + "step": 6240 + }, + { + "epoch": 1.1267351721651342, + "grad_norm": 0.1757981926202774, + "learning_rate": 9.920048128255699e-05, + "loss": 0.0449, + "step": 6250 + }, + { + "epoch": 1.1285379484405986, + "grad_norm": 0.14229696989059448, + "learning_rate": 9.919556533895995e-05, + "loss": 0.0433, + "step": 6260 + }, + { + "epoch": 1.1303407247160626, + "grad_norm": 0.199496790766716, + "learning_rate": 9.919063445104907e-05, + "loss": 0.0456, + "step": 6270 + }, + { + "epoch": 1.132143500991527, + "grad_norm": 0.15765699744224548, + "learning_rate": 9.918568862032227e-05, + "loss": 0.0509, + "step": 6280 + }, + { + "epoch": 1.133946277266991, + "grad_norm": 0.20775863528251648, + "learning_rate": 9.918072784828194e-05, + "loss": 0.0419, + "step": 6290 + }, + { + "epoch": 1.1357490535424555, + "grad_norm": 0.13819508254528046, + "learning_rate": 9.917575213643501e-05, + "loss": 0.0465, + "step": 6300 + }, + { + "epoch": 1.1375518298179197, + "grad_norm": 0.17002363502979279, + "learning_rate": 9.917076148629302e-05, + "loss": 0.0449, + "step": 6310 + }, + { + "epoch": 1.139354606093384, + "grad_norm": 0.16458258032798767, + "learning_rate": 9.916575589937196e-05, + "loss": 0.0446, + "step": 6320 + }, + { + "epoch": 1.1411573823688481, + "grad_norm": 0.2087150514125824, + "learning_rate": 9.916073537719239e-05, + "loss": 0.0476, + "step": 6330 + }, + { + "epoch": 1.1429601586443123, + "grad_norm": 0.20809558033943176, + "learning_rate": 9.915569992127944e-05, + "loss": 0.049, + "step": 6340 + }, + { + "epoch": 1.1447629349197765, + "grad_norm": 0.17088796198368073, + "learning_rate": 9.915064953316273e-05, + "loss": 0.0401, + "step": 6350 + }, + { + "epoch": 1.1465657111952408, + "grad_norm": 0.18331198394298553, + "learning_rate": 9.914558421437645e-05, + "loss": 0.0422, + "step": 6360 + }, + { + "epoch": 1.148368487470705, + "grad_norm": 0.1356252282857895, + "learning_rate": 9.914050396645929e-05, + "loss": 0.0431, + "step": 6370 + }, + { + "epoch": 1.1501712637461692, + "grad_norm": 0.15957140922546387, + "learning_rate": 9.913540879095452e-05, + "loss": 0.0454, + "step": 6380 + }, + { + "epoch": 1.1519740400216334, + "grad_norm": 0.16391707956790924, + "learning_rate": 9.913029868940987e-05, + "loss": 0.0462, + "step": 6390 + }, + { + "epoch": 1.1537768162970976, + "grad_norm": 0.1389417201280594, + "learning_rate": 9.912517366337772e-05, + "loss": 0.0442, + "step": 6400 + }, + { + "epoch": 1.1555795925725618, + "grad_norm": 0.15863144397735596, + "learning_rate": 9.912003371441487e-05, + "loss": 0.0477, + "step": 6410 + }, + { + "epoch": 1.157382368848026, + "grad_norm": 0.1406799852848053, + "learning_rate": 9.911487884408271e-05, + "loss": 0.0483, + "step": 6420 + }, + { + "epoch": 1.1591851451234902, + "grad_norm": 0.1771913468837738, + "learning_rate": 9.910970905394719e-05, + "loss": 0.0467, + "step": 6430 + }, + { + "epoch": 1.1609879213989545, + "grad_norm": 0.19251134991645813, + "learning_rate": 9.91045243455787e-05, + "loss": 0.0451, + "step": 6440 + }, + { + "epoch": 1.1627906976744187, + "grad_norm": 0.1681923270225525, + "learning_rate": 9.909932472055225e-05, + "loss": 0.0473, + "step": 6450 + }, + { + "epoch": 1.1645934739498829, + "grad_norm": 0.1934024542570114, + "learning_rate": 9.909411018044734e-05, + "loss": 0.0452, + "step": 6460 + }, + { + "epoch": 1.166396250225347, + "grad_norm": 0.19895634055137634, + "learning_rate": 9.908888072684802e-05, + "loss": 0.0463, + "step": 6470 + }, + { + "epoch": 1.1681990265008113, + "grad_norm": 0.20210351049900055, + "learning_rate": 9.908363636134285e-05, + "loss": 0.0479, + "step": 6480 + }, + { + "epoch": 1.1700018027762755, + "grad_norm": 0.15985043346881866, + "learning_rate": 9.907837708552493e-05, + "loss": 0.0452, + "step": 6490 + }, + { + "epoch": 1.1718045790517397, + "grad_norm": 0.10954516381025314, + "learning_rate": 9.90731029009919e-05, + "loss": 0.0429, + "step": 6500 + }, + { + "epoch": 1.173607355327204, + "grad_norm": 0.2081098109483719, + "learning_rate": 9.906781380934589e-05, + "loss": 0.0466, + "step": 6510 + }, + { + "epoch": 1.1754101316026682, + "grad_norm": 0.16629405319690704, + "learning_rate": 9.906250981219362e-05, + "loss": 0.04, + "step": 6520 + }, + { + "epoch": 1.1772129078781324, + "grad_norm": 0.19642026722431183, + "learning_rate": 9.905719091114628e-05, + "loss": 0.0498, + "step": 6530 + }, + { + "epoch": 1.1790156841535966, + "grad_norm": 0.23067142069339752, + "learning_rate": 9.905185710781964e-05, + "loss": 0.0527, + "step": 6540 + }, + { + "epoch": 1.1808184604290608, + "grad_norm": 0.16386303305625916, + "learning_rate": 9.904650840383392e-05, + "loss": 0.0477, + "step": 6550 + }, + { + "epoch": 1.182621236704525, + "grad_norm": 0.1786251962184906, + "learning_rate": 9.904114480081397e-05, + "loss": 0.0454, + "step": 6560 + }, + { + "epoch": 1.1844240129799892, + "grad_norm": 0.1505107283592224, + "learning_rate": 9.903576630038906e-05, + "loss": 0.0435, + "step": 6570 + }, + { + "epoch": 1.1862267892554534, + "grad_norm": 0.11944922804832458, + "learning_rate": 9.903037290419309e-05, + "loss": 0.0453, + "step": 6580 + }, + { + "epoch": 1.1880295655309177, + "grad_norm": 0.13519299030303955, + "learning_rate": 9.902496461386439e-05, + "loss": 0.0522, + "step": 6590 + }, + { + "epoch": 1.1898323418063819, + "grad_norm": 0.17141962051391602, + "learning_rate": 9.901954143104588e-05, + "loss": 0.0429, + "step": 6600 + }, + { + "epoch": 1.191635118081846, + "grad_norm": 0.15148812532424927, + "learning_rate": 9.901410335738496e-05, + "loss": 0.0475, + "step": 6610 + }, + { + "epoch": 1.1934378943573103, + "grad_norm": 0.1400941014289856, + "learning_rate": 9.900865039453358e-05, + "loss": 0.0487, + "step": 6620 + }, + { + "epoch": 1.1952406706327745, + "grad_norm": 0.22592294216156006, + "learning_rate": 9.900318254414821e-05, + "loss": 0.0445, + "step": 6630 + }, + { + "epoch": 1.1970434469082387, + "grad_norm": 0.1541605442762375, + "learning_rate": 9.899769980788985e-05, + "loss": 0.041, + "step": 6640 + }, + { + "epoch": 1.198846223183703, + "grad_norm": 0.15684810280799866, + "learning_rate": 9.899220218742398e-05, + "loss": 0.0481, + "step": 6650 + }, + { + "epoch": 1.2006489994591671, + "grad_norm": 0.13748125731945038, + "learning_rate": 9.898668968442066e-05, + "loss": 0.0518, + "step": 6660 + }, + { + "epoch": 1.2024517757346314, + "grad_norm": 0.17705634236335754, + "learning_rate": 9.898116230055443e-05, + "loss": 0.0485, + "step": 6670 + }, + { + "epoch": 1.2042545520100956, + "grad_norm": 0.13691218197345734, + "learning_rate": 9.897562003750437e-05, + "loss": 0.0488, + "step": 6680 + }, + { + "epoch": 1.2060573282855598, + "grad_norm": 0.14242900907993317, + "learning_rate": 9.897006289695407e-05, + "loss": 0.0433, + "step": 6690 + }, + { + "epoch": 1.207860104561024, + "grad_norm": 0.23364323377609253, + "learning_rate": 9.896449088059164e-05, + "loss": 0.0468, + "step": 6700 + }, + { + "epoch": 1.2096628808364882, + "grad_norm": 0.15275025367736816, + "learning_rate": 9.89589039901097e-05, + "loss": 0.0476, + "step": 6710 + }, + { + "epoch": 1.2114656571119524, + "grad_norm": 0.14401565492153168, + "learning_rate": 9.895330222720542e-05, + "loss": 0.0446, + "step": 6720 + }, + { + "epoch": 1.2132684333874166, + "grad_norm": 0.13415250182151794, + "learning_rate": 9.894768559358047e-05, + "loss": 0.0482, + "step": 6730 + }, + { + "epoch": 1.2150712096628808, + "grad_norm": 0.12522150576114655, + "learning_rate": 9.894205409094101e-05, + "loss": 0.0435, + "step": 6740 + }, + { + "epoch": 1.216873985938345, + "grad_norm": 0.13278326392173767, + "learning_rate": 9.893640772099777e-05, + "loss": 0.0506, + "step": 6750 + }, + { + "epoch": 1.2186767622138093, + "grad_norm": 0.2089739739894867, + "learning_rate": 9.893074648546595e-05, + "loss": 0.0458, + "step": 6760 + }, + { + "epoch": 1.2204795384892735, + "grad_norm": 0.14211158454418182, + "learning_rate": 9.892507038606528e-05, + "loss": 0.0479, + "step": 6770 + }, + { + "epoch": 1.2222823147647377, + "grad_norm": 0.20254890620708466, + "learning_rate": 9.891937942452003e-05, + "loss": 0.0434, + "step": 6780 + }, + { + "epoch": 1.224085091040202, + "grad_norm": 0.14579035341739655, + "learning_rate": 9.891367360255895e-05, + "loss": 0.0479, + "step": 6790 + }, + { + "epoch": 1.2258878673156661, + "grad_norm": 0.19807976484298706, + "learning_rate": 9.890795292191532e-05, + "loss": 0.0502, + "step": 6800 + }, + { + "epoch": 1.2276906435911303, + "grad_norm": 0.1949124038219452, + "learning_rate": 9.890221738432694e-05, + "loss": 0.0431, + "step": 6810 + }, + { + "epoch": 1.2294934198665946, + "grad_norm": 0.14246895909309387, + "learning_rate": 9.88964669915361e-05, + "loss": 0.0416, + "step": 6820 + }, + { + "epoch": 1.2312961961420588, + "grad_norm": 0.20334585011005402, + "learning_rate": 9.889070174528963e-05, + "loss": 0.0479, + "step": 6830 + }, + { + "epoch": 1.233098972417523, + "grad_norm": 0.20166416466236115, + "learning_rate": 9.888492164733883e-05, + "loss": 0.0452, + "step": 6840 + }, + { + "epoch": 1.2349017486929872, + "grad_norm": 0.17774198949337006, + "learning_rate": 9.88791266994396e-05, + "loss": 0.0432, + "step": 6850 + }, + { + "epoch": 1.2367045249684514, + "grad_norm": 0.15698379278182983, + "learning_rate": 9.887331690335223e-05, + "loss": 0.0472, + "step": 6860 + }, + { + "epoch": 1.2385073012439156, + "grad_norm": 0.15600785613059998, + "learning_rate": 9.886749226084163e-05, + "loss": 0.0352, + "step": 6870 + }, + { + "epoch": 1.2403100775193798, + "grad_norm": 0.18297724425792694, + "learning_rate": 9.886165277367714e-05, + "loss": 0.0469, + "step": 6880 + }, + { + "epoch": 1.242112853794844, + "grad_norm": 0.13390932977199554, + "learning_rate": 9.885579844363265e-05, + "loss": 0.0419, + "step": 6890 + }, + { + "epoch": 1.2439156300703083, + "grad_norm": 0.15496668219566345, + "learning_rate": 9.884992927248656e-05, + "loss": 0.0481, + "step": 6900 + }, + { + "epoch": 1.2457184063457725, + "grad_norm": 0.1544051617383957, + "learning_rate": 9.884404526202178e-05, + "loss": 0.0448, + "step": 6910 + }, + { + "epoch": 1.2475211826212367, + "grad_norm": 0.12542957067489624, + "learning_rate": 9.883814641402568e-05, + "loss": 0.0448, + "step": 6920 + }, + { + "epoch": 1.249323958896701, + "grad_norm": 0.17185518145561218, + "learning_rate": 9.88322327302902e-05, + "loss": 0.0509, + "step": 6930 + }, + { + "epoch": 1.251126735172165, + "grad_norm": 0.13775959610939026, + "learning_rate": 9.882630421261176e-05, + "loss": 0.0433, + "step": 6940 + }, + { + "epoch": 1.2529295114476293, + "grad_norm": 0.10725994408130646, + "learning_rate": 9.88203608627913e-05, + "loss": 0.0421, + "step": 6950 + }, + { + "epoch": 1.2547322877230935, + "grad_norm": 0.13983775675296783, + "learning_rate": 9.881440268263422e-05, + "loss": 0.0444, + "step": 6960 + }, + { + "epoch": 1.2565350639985577, + "grad_norm": 0.15295752882957458, + "learning_rate": 9.880842967395048e-05, + "loss": 0.0423, + "step": 6970 + }, + { + "epoch": 1.258337840274022, + "grad_norm": 0.14794494211673737, + "learning_rate": 9.880244183855452e-05, + "loss": 0.041, + "step": 6980 + }, + { + "epoch": 1.2601406165494862, + "grad_norm": 0.10952970385551453, + "learning_rate": 9.879643917826527e-05, + "loss": 0.0478, + "step": 6990 + }, + { + "epoch": 1.2619433928249504, + "grad_norm": 0.1916574239730835, + "learning_rate": 9.87904216949062e-05, + "loss": 0.0486, + "step": 7000 + }, + { + "epoch": 1.2637461691004146, + "grad_norm": 0.15800578892230988, + "learning_rate": 9.878438939030526e-05, + "loss": 0.0422, + "step": 7010 + }, + { + "epoch": 1.2655489453758788, + "grad_norm": 0.16109836101531982, + "learning_rate": 9.877834226629489e-05, + "loss": 0.045, + "step": 7020 + }, + { + "epoch": 1.267351721651343, + "grad_norm": 0.15866237878799438, + "learning_rate": 9.877228032471206e-05, + "loss": 0.0399, + "step": 7030 + }, + { + "epoch": 1.2691544979268072, + "grad_norm": 0.16993622481822968, + "learning_rate": 9.876620356739823e-05, + "loss": 0.0409, + "step": 7040 + }, + { + "epoch": 1.2709572742022714, + "grad_norm": 0.14860498905181885, + "learning_rate": 9.876011199619935e-05, + "loss": 0.0463, + "step": 7050 + }, + { + "epoch": 1.2727600504777357, + "grad_norm": 0.16061100363731384, + "learning_rate": 9.875400561296589e-05, + "loss": 0.0418, + "step": 7060 + }, + { + "epoch": 1.2745628267531999, + "grad_norm": 0.11456069350242615, + "learning_rate": 9.874788441955278e-05, + "loss": 0.0464, + "step": 7070 + }, + { + "epoch": 1.276365603028664, + "grad_norm": 0.1331043392419815, + "learning_rate": 9.874174841781951e-05, + "loss": 0.0437, + "step": 7080 + }, + { + "epoch": 1.2781683793041283, + "grad_norm": 0.1368376463651657, + "learning_rate": 9.873559760963003e-05, + "loss": 0.0425, + "step": 7090 + }, + { + "epoch": 1.2799711555795925, + "grad_norm": 0.13278715312480927, + "learning_rate": 9.872943199685278e-05, + "loss": 0.043, + "step": 7100 + }, + { + "epoch": 1.2817739318550567, + "grad_norm": 0.13766273856163025, + "learning_rate": 9.872325158136071e-05, + "loss": 0.0436, + "step": 7110 + }, + { + "epoch": 1.283576708130521, + "grad_norm": 0.23797397315502167, + "learning_rate": 9.871705636503128e-05, + "loss": 0.0385, + "step": 7120 + }, + { + "epoch": 1.2853794844059852, + "grad_norm": 0.12689067423343658, + "learning_rate": 9.871084634974641e-05, + "loss": 0.043, + "step": 7130 + }, + { + "epoch": 1.2871822606814494, + "grad_norm": 0.21763867139816284, + "learning_rate": 9.870462153739257e-05, + "loss": 0.0434, + "step": 7140 + }, + { + "epoch": 1.2889850369569136, + "grad_norm": 0.10984236001968384, + "learning_rate": 9.869838192986067e-05, + "loss": 0.0415, + "step": 7150 + }, + { + "epoch": 1.2907878132323778, + "grad_norm": 0.1933307945728302, + "learning_rate": 9.869212752904616e-05, + "loss": 0.0444, + "step": 7160 + }, + { + "epoch": 1.292590589507842, + "grad_norm": 0.16871808469295502, + "learning_rate": 9.868585833684894e-05, + "loss": 0.0408, + "step": 7170 + }, + { + "epoch": 1.2943933657833062, + "grad_norm": 0.11162350326776505, + "learning_rate": 9.867957435517342e-05, + "loss": 0.0425, + "step": 7180 + }, + { + "epoch": 1.2961961420587704, + "grad_norm": 0.17216373980045319, + "learning_rate": 9.867327558592854e-05, + "loss": 0.0426, + "step": 7190 + }, + { + "epoch": 1.2979989183342346, + "grad_norm": 0.22052639722824097, + "learning_rate": 9.866696203102766e-05, + "loss": 0.0401, + "step": 7200 + }, + { + "epoch": 1.2998016946096989, + "grad_norm": 0.16249202191829681, + "learning_rate": 9.86606336923887e-05, + "loss": 0.042, + "step": 7210 + }, + { + "epoch": 1.301604470885163, + "grad_norm": 0.1349180042743683, + "learning_rate": 9.865429057193403e-05, + "loss": 0.0427, + "step": 7220 + }, + { + "epoch": 1.3034072471606275, + "grad_norm": 0.15774761140346527, + "learning_rate": 9.864793267159053e-05, + "loss": 0.0443, + "step": 7230 + }, + { + "epoch": 1.3052100234360915, + "grad_norm": 0.18511465191841125, + "learning_rate": 9.864155999328957e-05, + "loss": 0.0454, + "step": 7240 + }, + { + "epoch": 1.307012799711556, + "grad_norm": 0.13638466596603394, + "learning_rate": 9.8635172538967e-05, + "loss": 0.0471, + "step": 7250 + }, + { + "epoch": 1.30881557598702, + "grad_norm": 0.14766816794872284, + "learning_rate": 9.862877031056312e-05, + "loss": 0.0417, + "step": 7260 + }, + { + "epoch": 1.3106183522624844, + "grad_norm": 0.13629595935344696, + "learning_rate": 9.862235331002279e-05, + "loss": 0.042, + "step": 7270 + }, + { + "epoch": 1.3124211285379483, + "grad_norm": 0.1394883245229721, + "learning_rate": 9.861592153929533e-05, + "loss": 0.044, + "step": 7280 + }, + { + "epoch": 1.3142239048134128, + "grad_norm": 0.16391190886497498, + "learning_rate": 9.860947500033455e-05, + "loss": 0.0448, + "step": 7290 + }, + { + "epoch": 1.3160266810888768, + "grad_norm": 0.11683732271194458, + "learning_rate": 9.86030136950987e-05, + "loss": 0.0455, + "step": 7300 + }, + { + "epoch": 1.3178294573643412, + "grad_norm": 0.17655129730701447, + "learning_rate": 9.85965376255506e-05, + "loss": 0.0451, + "step": 7310 + }, + { + "epoch": 1.3196322336398052, + "grad_norm": 0.16985571384429932, + "learning_rate": 9.859004679365747e-05, + "loss": 0.0415, + "step": 7320 + }, + { + "epoch": 1.3214350099152696, + "grad_norm": 0.20975105464458466, + "learning_rate": 9.858354120139108e-05, + "loss": 0.0448, + "step": 7330 + }, + { + "epoch": 1.3232377861907336, + "grad_norm": 0.13291749358177185, + "learning_rate": 9.857702085072764e-05, + "loss": 0.0452, + "step": 7340 + }, + { + "epoch": 1.325040562466198, + "grad_norm": 0.20697391033172607, + "learning_rate": 9.857048574364787e-05, + "loss": 0.0442, + "step": 7350 + }, + { + "epoch": 1.326843338741662, + "grad_norm": 0.15438371896743774, + "learning_rate": 9.856393588213698e-05, + "loss": 0.0454, + "step": 7360 + }, + { + "epoch": 1.3286461150171265, + "grad_norm": 0.17755204439163208, + "learning_rate": 9.855737126818458e-05, + "loss": 0.0413, + "step": 7370 + }, + { + "epoch": 1.3304488912925905, + "grad_norm": 0.14504413306713104, + "learning_rate": 9.855079190378491e-05, + "loss": 0.0383, + "step": 7380 + }, + { + "epoch": 1.332251667568055, + "grad_norm": 0.13375242054462433, + "learning_rate": 9.854419779093655e-05, + "loss": 0.0437, + "step": 7390 + }, + { + "epoch": 1.334054443843519, + "grad_norm": 0.11517435312271118, + "learning_rate": 9.853758893164264e-05, + "loss": 0.045, + "step": 7400 + }, + { + "epoch": 1.3358572201189833, + "grad_norm": 0.15008369088172913, + "learning_rate": 9.853096532791078e-05, + "loss": 0.0479, + "step": 7410 + }, + { + "epoch": 1.3376599963944473, + "grad_norm": 0.22088518738746643, + "learning_rate": 9.852432698175304e-05, + "loss": 0.0442, + "step": 7420 + }, + { + "epoch": 1.3394627726699118, + "grad_norm": 0.15875470638275146, + "learning_rate": 9.851767389518597e-05, + "loss": 0.047, + "step": 7430 + }, + { + "epoch": 1.3412655489453758, + "grad_norm": 0.16996069252490997, + "learning_rate": 9.85110060702306e-05, + "loss": 0.0431, + "step": 7440 + }, + { + "epoch": 1.3430683252208402, + "grad_norm": 0.17854364216327667, + "learning_rate": 9.850432350891245e-05, + "loss": 0.0453, + "step": 7450 + }, + { + "epoch": 1.3448711014963042, + "grad_norm": 0.17406010627746582, + "learning_rate": 9.84976262132615e-05, + "loss": 0.0448, + "step": 7460 + }, + { + "epoch": 1.3466738777717686, + "grad_norm": 0.16046424210071564, + "learning_rate": 9.849091418531222e-05, + "loss": 0.042, + "step": 7470 + }, + { + "epoch": 1.3484766540472326, + "grad_norm": 0.17490343749523163, + "learning_rate": 9.848418742710353e-05, + "loss": 0.0432, + "step": 7480 + }, + { + "epoch": 1.350279430322697, + "grad_norm": 0.19233691692352295, + "learning_rate": 9.847744594067885e-05, + "loss": 0.0411, + "step": 7490 + }, + { + "epoch": 1.3520822065981613, + "grad_norm": 0.1614624410867691, + "learning_rate": 9.847068972808607e-05, + "loss": 0.0428, + "step": 7500 + }, + { + "epoch": 1.3538849828736255, + "grad_norm": 0.1238013207912445, + "learning_rate": 9.846391879137756e-05, + "loss": 0.042, + "step": 7510 + }, + { + "epoch": 1.3556877591490897, + "grad_norm": 0.15535993874073029, + "learning_rate": 9.845713313261012e-05, + "loss": 0.0459, + "step": 7520 + }, + { + "epoch": 1.357490535424554, + "grad_norm": 0.1513490229845047, + "learning_rate": 9.845033275384505e-05, + "loss": 0.0447, + "step": 7530 + }, + { + "epoch": 1.359293311700018, + "grad_norm": 0.1245562881231308, + "learning_rate": 9.844351765714818e-05, + "loss": 0.0487, + "step": 7540 + }, + { + "epoch": 1.3610960879754823, + "grad_norm": 0.2179003357887268, + "learning_rate": 9.843668784458971e-05, + "loss": 0.0454, + "step": 7550 + }, + { + "epoch": 1.3628988642509465, + "grad_norm": 0.17515775561332703, + "learning_rate": 9.842984331824437e-05, + "loss": 0.0427, + "step": 7560 + }, + { + "epoch": 1.3647016405264107, + "grad_norm": 0.1258850246667862, + "learning_rate": 9.842298408019133e-05, + "loss": 0.0424, + "step": 7570 + }, + { + "epoch": 1.366504416801875, + "grad_norm": 0.155188649892807, + "learning_rate": 9.841611013251429e-05, + "loss": 0.046, + "step": 7580 + }, + { + "epoch": 1.3683071930773392, + "grad_norm": 0.15004678070545197, + "learning_rate": 9.840922147730133e-05, + "loss": 0.043, + "step": 7590 + }, + { + "epoch": 1.3701099693528034, + "grad_norm": 0.13265766203403473, + "learning_rate": 9.840231811664506e-05, + "loss": 0.0504, + "step": 7600 + }, + { + "epoch": 1.3719127456282676, + "grad_norm": 0.15950389206409454, + "learning_rate": 9.839540005264252e-05, + "loss": 0.0455, + "step": 7610 + }, + { + "epoch": 1.3737155219037318, + "grad_norm": 0.12739747762680054, + "learning_rate": 9.838846728739527e-05, + "loss": 0.0406, + "step": 7620 + }, + { + "epoch": 1.375518298179196, + "grad_norm": 0.15198072791099548, + "learning_rate": 9.838151982300927e-05, + "loss": 0.0393, + "step": 7630 + }, + { + "epoch": 1.3773210744546602, + "grad_norm": 0.15584009885787964, + "learning_rate": 9.8374557661595e-05, + "loss": 0.0365, + "step": 7640 + }, + { + "epoch": 1.3791238507301244, + "grad_norm": 0.20010541379451752, + "learning_rate": 9.836758080526735e-05, + "loss": 0.0461, + "step": 7650 + }, + { + "epoch": 1.3809266270055887, + "grad_norm": 0.14808468520641327, + "learning_rate": 9.836058925614575e-05, + "loss": 0.0409, + "step": 7660 + }, + { + "epoch": 1.3827294032810529, + "grad_norm": 0.19547396898269653, + "learning_rate": 9.8353583016354e-05, + "loss": 0.0441, + "step": 7670 + }, + { + "epoch": 1.384532179556517, + "grad_norm": 0.1321982890367508, + "learning_rate": 9.834656208802044e-05, + "loss": 0.0471, + "step": 7680 + }, + { + "epoch": 1.3863349558319813, + "grad_norm": 0.15596811473369598, + "learning_rate": 9.833952647327784e-05, + "loss": 0.049, + "step": 7690 + }, + { + "epoch": 1.3881377321074455, + "grad_norm": 0.1391909122467041, + "learning_rate": 9.833247617426342e-05, + "loss": 0.0451, + "step": 7700 + }, + { + "epoch": 1.3899405083829097, + "grad_norm": 0.1682356894016266, + "learning_rate": 9.832541119311889e-05, + "loss": 0.0447, + "step": 7710 + }, + { + "epoch": 1.391743284658374, + "grad_norm": 0.15864479541778564, + "learning_rate": 9.83183315319904e-05, + "loss": 0.0457, + "step": 7720 + }, + { + "epoch": 1.3935460609338381, + "grad_norm": 0.1691732257604599, + "learning_rate": 9.831123719302855e-05, + "loss": 0.0416, + "step": 7730 + }, + { + "epoch": 1.3953488372093024, + "grad_norm": 0.1667153537273407, + "learning_rate": 9.830412817838842e-05, + "loss": 0.043, + "step": 7740 + }, + { + "epoch": 1.3971516134847666, + "grad_norm": 0.14120270311832428, + "learning_rate": 9.829700449022956e-05, + "loss": 0.0436, + "step": 7750 + }, + { + "epoch": 1.3989543897602308, + "grad_norm": 0.23471084237098694, + "learning_rate": 9.828986613071593e-05, + "loss": 0.0416, + "step": 7760 + }, + { + "epoch": 1.400757166035695, + "grad_norm": 0.187713623046875, + "learning_rate": 9.828271310201601e-05, + "loss": 0.046, + "step": 7770 + }, + { + "epoch": 1.4025599423111592, + "grad_norm": 0.14881864190101624, + "learning_rate": 9.827554540630268e-05, + "loss": 0.0395, + "step": 7780 + }, + { + "epoch": 1.4043627185866234, + "grad_norm": 0.1740046590566635, + "learning_rate": 9.826836304575329e-05, + "loss": 0.0431, + "step": 7790 + }, + { + "epoch": 1.4061654948620876, + "grad_norm": 0.11747528612613678, + "learning_rate": 9.826116602254966e-05, + "loss": 0.0442, + "step": 7800 + }, + { + "epoch": 1.4079682711375519, + "grad_norm": 0.1879766881465912, + "learning_rate": 9.825395433887805e-05, + "loss": 0.0442, + "step": 7810 + }, + { + "epoch": 1.409771047413016, + "grad_norm": 0.13589905202388763, + "learning_rate": 9.824672799692917e-05, + "loss": 0.0441, + "step": 7820 + }, + { + "epoch": 1.4115738236884803, + "grad_norm": 0.12330379337072372, + "learning_rate": 9.823948699889823e-05, + "loss": 0.0449, + "step": 7830 + }, + { + "epoch": 1.4133765999639445, + "grad_norm": 0.16740384697914124, + "learning_rate": 9.823223134698483e-05, + "loss": 0.0416, + "step": 7840 + }, + { + "epoch": 1.4151793762394087, + "grad_norm": 0.1375618875026703, + "learning_rate": 9.822496104339303e-05, + "loss": 0.0428, + "step": 7850 + }, + { + "epoch": 1.416982152514873, + "grad_norm": 0.1519220471382141, + "learning_rate": 9.821767609033138e-05, + "loss": 0.0421, + "step": 7860 + }, + { + "epoch": 1.4187849287903371, + "grad_norm": 0.15725424885749817, + "learning_rate": 9.821037649001284e-05, + "loss": 0.0427, + "step": 7870 + }, + { + "epoch": 1.4205877050658013, + "grad_norm": 0.10759121179580688, + "learning_rate": 9.820306224465486e-05, + "loss": 0.0432, + "step": 7880 + }, + { + "epoch": 1.4223904813412656, + "grad_norm": 0.16982968151569366, + "learning_rate": 9.819573335647928e-05, + "loss": 0.044, + "step": 7890 + }, + { + "epoch": 1.4241932576167298, + "grad_norm": 0.09863346070051193, + "learning_rate": 9.818838982771246e-05, + "loss": 0.0457, + "step": 7900 + }, + { + "epoch": 1.425996033892194, + "grad_norm": 0.10342240333557129, + "learning_rate": 9.818103166058514e-05, + "loss": 0.0499, + "step": 7910 + }, + { + "epoch": 1.4277988101676582, + "grad_norm": 0.10769131779670715, + "learning_rate": 9.817365885733254e-05, + "loss": 0.0411, + "step": 7920 + }, + { + "epoch": 1.4296015864431224, + "grad_norm": 0.192804753780365, + "learning_rate": 9.816627142019434e-05, + "loss": 0.0413, + "step": 7930 + }, + { + "epoch": 1.4314043627185866, + "grad_norm": 0.13700097799301147, + "learning_rate": 9.815886935141463e-05, + "loss": 0.046, + "step": 7940 + }, + { + "epoch": 1.4332071389940508, + "grad_norm": 0.11457335948944092, + "learning_rate": 9.8151452653242e-05, + "loss": 0.0453, + "step": 7950 + }, + { + "epoch": 1.435009915269515, + "grad_norm": 0.13416250050067902, + "learning_rate": 9.814402132792939e-05, + "loss": 0.0408, + "step": 7960 + }, + { + "epoch": 1.4368126915449793, + "grad_norm": 0.20427556335926056, + "learning_rate": 9.813657537773428e-05, + "loss": 0.0444, + "step": 7970 + }, + { + "epoch": 1.4386154678204435, + "grad_norm": 0.21200846135616302, + "learning_rate": 9.812911480491854e-05, + "loss": 0.0454, + "step": 7980 + }, + { + "epoch": 1.4404182440959077, + "grad_norm": 0.13624627888202667, + "learning_rate": 9.81216396117485e-05, + "loss": 0.0393, + "step": 7990 + }, + { + "epoch": 1.442221020371372, + "grad_norm": 0.20008771121501923, + "learning_rate": 9.811414980049491e-05, + "loss": 0.0443, + "step": 8000 + }, + { + "epoch": 1.444023796646836, + "grad_norm": 0.13469800353050232, + "learning_rate": 9.810664537343301e-05, + "loss": 0.0445, + "step": 8010 + }, + { + "epoch": 1.4458265729223003, + "grad_norm": 0.12516558170318604, + "learning_rate": 9.809912633284243e-05, + "loss": 0.0441, + "step": 8020 + }, + { + "epoch": 1.4476293491977645, + "grad_norm": 0.12844984233379364, + "learning_rate": 9.809159268100725e-05, + "loss": 0.046, + "step": 8030 + }, + { + "epoch": 1.4494321254732287, + "grad_norm": 0.12819664180278778, + "learning_rate": 9.808404442021599e-05, + "loss": 0.0418, + "step": 8040 + }, + { + "epoch": 1.451234901748693, + "grad_norm": 0.12778475880622864, + "learning_rate": 9.807648155276163e-05, + "loss": 0.0407, + "step": 8050 + }, + { + "epoch": 1.4530376780241572, + "grad_norm": 0.16056707501411438, + "learning_rate": 9.806890408094156e-05, + "loss": 0.0444, + "step": 8060 + }, + { + "epoch": 1.4548404542996214, + "grad_norm": 0.09849455207586288, + "learning_rate": 9.806131200705761e-05, + "loss": 0.0389, + "step": 8070 + }, + { + "epoch": 1.4566432305750856, + "grad_norm": 0.09808798879384995, + "learning_rate": 9.805370533341605e-05, + "loss": 0.0444, + "step": 8080 + }, + { + "epoch": 1.4584460068505498, + "grad_norm": 0.13618935644626617, + "learning_rate": 9.804608406232762e-05, + "loss": 0.0418, + "step": 8090 + }, + { + "epoch": 1.460248783126014, + "grad_norm": 0.14985764026641846, + "learning_rate": 9.803844819610741e-05, + "loss": 0.0401, + "step": 8100 + }, + { + "epoch": 1.4620515594014782, + "grad_norm": 0.23928481340408325, + "learning_rate": 9.803079773707504e-05, + "loss": 0.0411, + "step": 8110 + }, + { + "epoch": 1.4638543356769425, + "grad_norm": 0.10376619547605515, + "learning_rate": 9.802313268755447e-05, + "loss": 0.0393, + "step": 8120 + }, + { + "epoch": 1.4656571119524067, + "grad_norm": 0.1542768031358719, + "learning_rate": 9.801545304987419e-05, + "loss": 0.0423, + "step": 8130 + }, + { + "epoch": 1.4674598882278709, + "grad_norm": 0.14499475061893463, + "learning_rate": 9.800775882636704e-05, + "loss": 0.0437, + "step": 8140 + }, + { + "epoch": 1.469262664503335, + "grad_norm": 0.2057196944952011, + "learning_rate": 9.800005001937034e-05, + "loss": 0.0423, + "step": 8150 + }, + { + "epoch": 1.4710654407787993, + "grad_norm": 0.14442934095859528, + "learning_rate": 9.79923266312258e-05, + "loss": 0.0425, + "step": 8160 + }, + { + "epoch": 1.4728682170542635, + "grad_norm": 0.1957738995552063, + "learning_rate": 9.79845886642796e-05, + "loss": 0.0365, + "step": 8170 + }, + { + "epoch": 1.4746709933297277, + "grad_norm": 0.1717921495437622, + "learning_rate": 9.797683612088233e-05, + "loss": 0.0427, + "step": 8180 + }, + { + "epoch": 1.476473769605192, + "grad_norm": 0.15808957815170288, + "learning_rate": 9.796906900338898e-05, + "loss": 0.0393, + "step": 8190 + }, + { + "epoch": 1.4782765458806562, + "grad_norm": 0.19438597559928894, + "learning_rate": 9.796128731415903e-05, + "loss": 0.0435, + "step": 8200 + }, + { + "epoch": 1.4800793221561204, + "grad_norm": 0.19207032024860382, + "learning_rate": 9.795349105555634e-05, + "loss": 0.0401, + "step": 8210 + }, + { + "epoch": 1.4818820984315846, + "grad_norm": 0.1565093696117401, + "learning_rate": 9.794568022994922e-05, + "loss": 0.0408, + "step": 8220 + }, + { + "epoch": 1.4836848747070488, + "grad_norm": 0.15701228380203247, + "learning_rate": 9.793785483971034e-05, + "loss": 0.0442, + "step": 8230 + }, + { + "epoch": 1.485487650982513, + "grad_norm": 0.1439363807439804, + "learning_rate": 9.793001488721691e-05, + "loss": 0.0508, + "step": 8240 + }, + { + "epoch": 1.4872904272579772, + "grad_norm": 0.11535283923149109, + "learning_rate": 9.792216037485047e-05, + "loss": 0.0414, + "step": 8250 + }, + { + "epoch": 1.4890932035334414, + "grad_norm": 0.12835390865802765, + "learning_rate": 9.791429130499704e-05, + "loss": 0.047, + "step": 8260 + }, + { + "epoch": 1.4908959798089056, + "grad_norm": 0.1851019561290741, + "learning_rate": 9.790640768004698e-05, + "loss": 0.0443, + "step": 8270 + }, + { + "epoch": 1.4926987560843699, + "grad_norm": 0.14740526676177979, + "learning_rate": 9.789850950239518e-05, + "loss": 0.0412, + "step": 8280 + }, + { + "epoch": 1.494501532359834, + "grad_norm": 0.1528676599264145, + "learning_rate": 9.789059677444089e-05, + "loss": 0.0397, + "step": 8290 + }, + { + "epoch": 1.4963043086352983, + "grad_norm": 0.18221311271190643, + "learning_rate": 9.788266949858776e-05, + "loss": 0.0432, + "step": 8300 + }, + { + "epoch": 1.4981070849107625, + "grad_norm": 0.12342915683984756, + "learning_rate": 9.787472767724392e-05, + "loss": 0.0407, + "step": 8310 + }, + { + "epoch": 1.4999098611862267, + "grad_norm": 0.12390574812889099, + "learning_rate": 9.786677131282185e-05, + "loss": 0.0432, + "step": 8320 + }, + { + "epoch": 1.501712637461691, + "grad_norm": 0.13368622958660126, + "learning_rate": 9.785880040773853e-05, + "loss": 0.0414, + "step": 8330 + }, + { + "epoch": 1.5035154137371554, + "grad_norm": 0.17940254509449005, + "learning_rate": 9.785081496441527e-05, + "loss": 0.0423, + "step": 8340 + }, + { + "epoch": 1.5053181900126194, + "grad_norm": 0.1522243469953537, + "learning_rate": 9.784281498527785e-05, + "loss": 0.044, + "step": 8350 + }, + { + "epoch": 1.5071209662880838, + "grad_norm": 0.17423155903816223, + "learning_rate": 9.783480047275646e-05, + "loss": 0.0486, + "step": 8360 + }, + { + "epoch": 1.5089237425635478, + "grad_norm": 0.20396697521209717, + "learning_rate": 9.78267714292857e-05, + "loss": 0.0418, + "step": 8370 + }, + { + "epoch": 1.5107265188390122, + "grad_norm": 0.1652018427848816, + "learning_rate": 9.781872785730454e-05, + "loss": 0.0402, + "step": 8380 + }, + { + "epoch": 1.5125292951144762, + "grad_norm": 0.13339237868785858, + "learning_rate": 9.781066975925646e-05, + "loss": 0.0439, + "step": 8390 + }, + { + "epoch": 1.5143320713899406, + "grad_norm": 0.14576540887355804, + "learning_rate": 9.780259713758928e-05, + "loss": 0.0398, + "step": 8400 + }, + { + "epoch": 1.5161348476654046, + "grad_norm": 0.11785120517015457, + "learning_rate": 9.779450999475524e-05, + "loss": 0.0409, + "step": 8410 + }, + { + "epoch": 1.517937623940869, + "grad_norm": 0.13513927161693573, + "learning_rate": 9.7786408333211e-05, + "loss": 0.0454, + "step": 8420 + }, + { + "epoch": 1.519740400216333, + "grad_norm": 0.12213265895843506, + "learning_rate": 9.777829215541764e-05, + "loss": 0.0424, + "step": 8430 + }, + { + "epoch": 1.5215431764917975, + "grad_norm": 0.15755802392959595, + "learning_rate": 9.777016146384064e-05, + "loss": 0.0433, + "step": 8440 + }, + { + "epoch": 1.5233459527672615, + "grad_norm": 0.21495431661605835, + "learning_rate": 9.776201626094988e-05, + "loss": 0.0406, + "step": 8450 + }, + { + "epoch": 1.525148729042726, + "grad_norm": 0.19009213149547577, + "learning_rate": 9.775385654921965e-05, + "loss": 0.0404, + "step": 8460 + }, + { + "epoch": 1.52695150531819, + "grad_norm": 0.12882180511951447, + "learning_rate": 9.774568233112868e-05, + "loss": 0.0384, + "step": 8470 + }, + { + "epoch": 1.5287542815936543, + "grad_norm": 0.11674314737319946, + "learning_rate": 9.773749360916007e-05, + "loss": 0.04, + "step": 8480 + }, + { + "epoch": 1.5305570578691183, + "grad_norm": 0.13835161924362183, + "learning_rate": 9.772929038580134e-05, + "loss": 0.0409, + "step": 8490 + }, + { + "epoch": 1.5323598341445828, + "grad_norm": 0.15066763758659363, + "learning_rate": 9.772107266354439e-05, + "loss": 0.0451, + "step": 8500 + }, + { + "epoch": 1.5341626104200468, + "grad_norm": 0.1178978905081749, + "learning_rate": 9.77128404448856e-05, + "loss": 0.0434, + "step": 8510 + }, + { + "epoch": 1.5359653866955112, + "grad_norm": 0.10479331016540527, + "learning_rate": 9.770459373232565e-05, + "loss": 0.0425, + "step": 8520 + }, + { + "epoch": 1.5377681629709752, + "grad_norm": 0.1205153539776802, + "learning_rate": 9.769633252836969e-05, + "loss": 0.0437, + "step": 8530 + }, + { + "epoch": 1.5395709392464396, + "grad_norm": 0.20385809242725372, + "learning_rate": 9.768805683552724e-05, + "loss": 0.0457, + "step": 8540 + }, + { + "epoch": 1.5413737155219036, + "grad_norm": 0.15422265231609344, + "learning_rate": 9.767976665631228e-05, + "loss": 0.0435, + "step": 8550 + }, + { + "epoch": 1.543176491797368, + "grad_norm": 0.1509687304496765, + "learning_rate": 9.767146199324311e-05, + "loss": 0.0442, + "step": 8560 + }, + { + "epoch": 1.544979268072832, + "grad_norm": 0.1412591189146042, + "learning_rate": 9.766314284884249e-05, + "loss": 0.0407, + "step": 8570 + }, + { + "epoch": 1.5467820443482965, + "grad_norm": 0.1641855239868164, + "learning_rate": 9.765480922563752e-05, + "loss": 0.0409, + "step": 8580 + }, + { + "epoch": 1.5485848206237605, + "grad_norm": 0.17688889801502228, + "learning_rate": 9.764646112615978e-05, + "loss": 0.0451, + "step": 8590 + }, + { + "epoch": 1.550387596899225, + "grad_norm": 0.10193276405334473, + "learning_rate": 9.763809855294517e-05, + "loss": 0.0424, + "step": 8600 + }, + { + "epoch": 1.5521903731746889, + "grad_norm": 0.1310090571641922, + "learning_rate": 9.762972150853404e-05, + "loss": 0.0442, + "step": 8610 + }, + { + "epoch": 1.5539931494501533, + "grad_norm": 0.15439288318157196, + "learning_rate": 9.762132999547111e-05, + "loss": 0.046, + "step": 8620 + }, + { + "epoch": 1.5557959257256173, + "grad_norm": 0.14456555247306824, + "learning_rate": 9.761292401630549e-05, + "loss": 0.039, + "step": 8630 + }, + { + "epoch": 1.5575987020010817, + "grad_norm": 0.14294250309467316, + "learning_rate": 9.76045035735907e-05, + "loss": 0.0408, + "step": 8640 + }, + { + "epoch": 1.5594014782765457, + "grad_norm": 0.17914031445980072, + "learning_rate": 9.759606866988464e-05, + "loss": 0.0387, + "step": 8650 + }, + { + "epoch": 1.5612042545520102, + "grad_norm": 0.14985167980194092, + "learning_rate": 9.758761930774963e-05, + "loss": 0.0477, + "step": 8660 + }, + { + "epoch": 1.5630070308274742, + "grad_norm": 0.15756645798683167, + "learning_rate": 9.757915548975235e-05, + "loss": 0.044, + "step": 8670 + }, + { + "epoch": 1.5648098071029386, + "grad_norm": 0.14678172767162323, + "learning_rate": 9.757067721846389e-05, + "loss": 0.0432, + "step": 8680 + }, + { + "epoch": 1.5666125833784026, + "grad_norm": 0.14178459346294403, + "learning_rate": 9.756218449645971e-05, + "loss": 0.0516, + "step": 8690 + }, + { + "epoch": 1.568415359653867, + "grad_norm": 0.1318591982126236, + "learning_rate": 9.75536773263197e-05, + "loss": 0.0439, + "step": 8700 + }, + { + "epoch": 1.570218135929331, + "grad_norm": 0.13863953948020935, + "learning_rate": 9.75451557106281e-05, + "loss": 0.045, + "step": 8710 + }, + { + "epoch": 1.5720209122047955, + "grad_norm": 0.12420926988124847, + "learning_rate": 9.753661965197354e-05, + "loss": 0.0447, + "step": 8720 + }, + { + "epoch": 1.5738236884802594, + "grad_norm": 0.12888164818286896, + "learning_rate": 9.752806915294908e-05, + "loss": 0.0453, + "step": 8730 + }, + { + "epoch": 1.5756264647557239, + "grad_norm": 0.15761391818523407, + "learning_rate": 9.75195042161521e-05, + "loss": 0.0452, + "step": 8740 + }, + { + "epoch": 1.5774292410311879, + "grad_norm": 0.15469135344028473, + "learning_rate": 9.751092484418442e-05, + "loss": 0.0446, + "step": 8750 + }, + { + "epoch": 1.5792320173066523, + "grad_norm": 0.1579580157995224, + "learning_rate": 9.750233103965224e-05, + "loss": 0.0391, + "step": 8760 + }, + { + "epoch": 1.5810347935821163, + "grad_norm": 0.1366501897573471, + "learning_rate": 9.749372280516611e-05, + "loss": 0.0447, + "step": 8770 + }, + { + "epoch": 1.5828375698575807, + "grad_norm": 0.1659681797027588, + "learning_rate": 9.748510014334097e-05, + "loss": 0.0429, + "step": 8780 + }, + { + "epoch": 1.584640346133045, + "grad_norm": 0.1764449030160904, + "learning_rate": 9.747646305679621e-05, + "loss": 0.0459, + "step": 8790 + }, + { + "epoch": 1.5864431224085092, + "grad_norm": 0.11895448714494705, + "learning_rate": 9.74678115481555e-05, + "loss": 0.0413, + "step": 8800 + }, + { + "epoch": 1.5882458986839734, + "grad_norm": 0.12485641241073608, + "learning_rate": 9.745914562004696e-05, + "loss": 0.0378, + "step": 8810 + }, + { + "epoch": 1.5900486749594376, + "grad_norm": 0.12429939210414886, + "learning_rate": 9.745046527510307e-05, + "loss": 0.0409, + "step": 8820 + }, + { + "epoch": 1.5918514512349018, + "grad_norm": 0.09228527545928955, + "learning_rate": 9.744177051596068e-05, + "loss": 0.0352, + "step": 8830 + }, + { + "epoch": 1.593654227510366, + "grad_norm": 0.1651102751493454, + "learning_rate": 9.743306134526105e-05, + "loss": 0.0393, + "step": 8840 + }, + { + "epoch": 1.5954570037858302, + "grad_norm": 0.16719184815883636, + "learning_rate": 9.742433776564977e-05, + "loss": 0.0374, + "step": 8850 + }, + { + "epoch": 1.5972597800612944, + "grad_norm": 0.11218594759702682, + "learning_rate": 9.741559977977683e-05, + "loss": 0.0379, + "step": 8860 + }, + { + "epoch": 1.5990625563367586, + "grad_norm": 0.13786685466766357, + "learning_rate": 9.740684739029661e-05, + "loss": 0.0379, + "step": 8870 + }, + { + "epoch": 1.6008653326122229, + "grad_norm": 0.11107755452394485, + "learning_rate": 9.739808059986789e-05, + "loss": 0.0394, + "step": 8880 + }, + { + "epoch": 1.602668108887687, + "grad_norm": 0.14831335842609406, + "learning_rate": 9.738929941115373e-05, + "loss": 0.0425, + "step": 8890 + }, + { + "epoch": 1.6044708851631513, + "grad_norm": 0.15752890706062317, + "learning_rate": 9.738050382682167e-05, + "loss": 0.0442, + "step": 8900 + }, + { + "epoch": 1.6062736614386155, + "grad_norm": 0.15082195401191711, + "learning_rate": 9.737169384954355e-05, + "loss": 0.0453, + "step": 8910 + }, + { + "epoch": 1.6080764377140797, + "grad_norm": 0.13747678697109222, + "learning_rate": 9.736286948199562e-05, + "loss": 0.0423, + "step": 8920 + }, + { + "epoch": 1.609879213989544, + "grad_norm": 0.13459938764572144, + "learning_rate": 9.735403072685848e-05, + "loss": 0.0417, + "step": 8930 + }, + { + "epoch": 1.6116819902650081, + "grad_norm": 0.1419130563735962, + "learning_rate": 9.734517758681712e-05, + "loss": 0.039, + "step": 8940 + }, + { + "epoch": 1.6134847665404723, + "grad_norm": 0.1557479202747345, + "learning_rate": 9.733631006456088e-05, + "loss": 0.0357, + "step": 8950 + }, + { + "epoch": 1.6152875428159366, + "grad_norm": 0.1271543949842453, + "learning_rate": 9.732742816278348e-05, + "loss": 0.0417, + "step": 8960 + }, + { + "epoch": 1.6170903190914008, + "grad_norm": 0.1513073593378067, + "learning_rate": 9.731853188418302e-05, + "loss": 0.0414, + "step": 8970 + }, + { + "epoch": 1.618893095366865, + "grad_norm": 0.10693933814764023, + "learning_rate": 9.730962123146194e-05, + "loss": 0.0389, + "step": 8980 + }, + { + "epoch": 1.6206958716423292, + "grad_norm": 0.133611798286438, + "learning_rate": 9.730069620732709e-05, + "loss": 0.0411, + "step": 8990 + }, + { + "epoch": 1.6224986479177934, + "grad_norm": 0.15051952004432678, + "learning_rate": 9.72917568144896e-05, + "loss": 0.0431, + "step": 9000 + }, + { + "epoch": 1.6243014241932576, + "grad_norm": 0.21103809773921967, + "learning_rate": 9.728280305566509e-05, + "loss": 0.0435, + "step": 9010 + }, + { + "epoch": 1.6261042004687218, + "grad_norm": 0.08502990752458572, + "learning_rate": 9.727383493357343e-05, + "loss": 0.0394, + "step": 9020 + }, + { + "epoch": 1.627906976744186, + "grad_norm": 0.1304304301738739, + "learning_rate": 9.726485245093891e-05, + "loss": 0.042, + "step": 9030 + }, + { + "epoch": 1.6297097530196503, + "grad_norm": 0.15076182782649994, + "learning_rate": 9.725585561049018e-05, + "loss": 0.0434, + "step": 9040 + }, + { + "epoch": 1.6315125292951145, + "grad_norm": 0.12900467216968536, + "learning_rate": 9.724684441496022e-05, + "loss": 0.0382, + "step": 9050 + }, + { + "epoch": 1.6333153055705787, + "grad_norm": 0.14301662147045135, + "learning_rate": 9.72378188670864e-05, + "loss": 0.0404, + "step": 9060 + }, + { + "epoch": 1.635118081846043, + "grad_norm": 0.12155018746852875, + "learning_rate": 9.722877896961047e-05, + "loss": 0.0418, + "step": 9070 + }, + { + "epoch": 1.6369208581215071, + "grad_norm": 0.12534725666046143, + "learning_rate": 9.721972472527848e-05, + "loss": 0.0448, + "step": 9080 + }, + { + "epoch": 1.6387236343969713, + "grad_norm": 0.1575479507446289, + "learning_rate": 9.721065613684089e-05, + "loss": 0.0425, + "step": 9090 + }, + { + "epoch": 1.6405264106724355, + "grad_norm": 0.132240891456604, + "learning_rate": 9.72015732070525e-05, + "loss": 0.0414, + "step": 9100 + }, + { + "epoch": 1.6423291869478998, + "grad_norm": 0.16293112933635712, + "learning_rate": 9.719247593867244e-05, + "loss": 0.0438, + "step": 9110 + }, + { + "epoch": 1.644131963223364, + "grad_norm": 0.18946024775505066, + "learning_rate": 9.718336433446423e-05, + "loss": 0.04, + "step": 9120 + }, + { + "epoch": 1.6459347394988282, + "grad_norm": 0.1284693032503128, + "learning_rate": 9.717423839719574e-05, + "loss": 0.043, + "step": 9130 + }, + { + "epoch": 1.6477375157742924, + "grad_norm": 0.20048880577087402, + "learning_rate": 9.71650981296392e-05, + "loss": 0.0397, + "step": 9140 + }, + { + "epoch": 1.6495402920497566, + "grad_norm": 0.1629897505044937, + "learning_rate": 9.715594353457118e-05, + "loss": 0.0431, + "step": 9150 + }, + { + "epoch": 1.6513430683252208, + "grad_norm": 0.17216765880584717, + "learning_rate": 9.714677461477257e-05, + "loss": 0.0455, + "step": 9160 + }, + { + "epoch": 1.653145844600685, + "grad_norm": 0.12067907303571701, + "learning_rate": 9.713759137302869e-05, + "loss": 0.0435, + "step": 9170 + }, + { + "epoch": 1.6549486208761492, + "grad_norm": 0.17365257441997528, + "learning_rate": 9.712839381212914e-05, + "loss": 0.0446, + "step": 9180 + }, + { + "epoch": 1.6567513971516135, + "grad_norm": 0.12032567709684372, + "learning_rate": 9.71191819348679e-05, + "loss": 0.0421, + "step": 9190 + }, + { + "epoch": 1.6585541734270777, + "grad_norm": 0.1644958257675171, + "learning_rate": 9.710995574404331e-05, + "loss": 0.0452, + "step": 9200 + }, + { + "epoch": 1.6603569497025419, + "grad_norm": 0.1338106095790863, + "learning_rate": 9.710071524245802e-05, + "loss": 0.0417, + "step": 9210 + }, + { + "epoch": 1.662159725978006, + "grad_norm": 0.1877439022064209, + "learning_rate": 9.709146043291906e-05, + "loss": 0.0373, + "step": 9220 + }, + { + "epoch": 1.6639625022534703, + "grad_norm": 0.09826911240816116, + "learning_rate": 9.70821913182378e-05, + "loss": 0.0414, + "step": 9230 + }, + { + "epoch": 1.6657652785289345, + "grad_norm": 0.13114754855632782, + "learning_rate": 9.707290790122995e-05, + "loss": 0.0417, + "step": 9240 + }, + { + "epoch": 1.6675680548043987, + "grad_norm": 0.17269818484783173, + "learning_rate": 9.706361018471557e-05, + "loss": 0.0411, + "step": 9250 + }, + { + "epoch": 1.669370831079863, + "grad_norm": 0.1072874590754509, + "learning_rate": 9.705429817151906e-05, + "loss": 0.0438, + "step": 9260 + }, + { + "epoch": 1.6711736073553272, + "grad_norm": 0.1763143688440323, + "learning_rate": 9.704497186446917e-05, + "loss": 0.0432, + "step": 9270 + }, + { + "epoch": 1.6729763836307914, + "grad_norm": 0.12969639897346497, + "learning_rate": 9.703563126639896e-05, + "loss": 0.0443, + "step": 9280 + }, + { + "epoch": 1.6747791599062556, + "grad_norm": 0.22380030155181885, + "learning_rate": 9.70262763801459e-05, + "loss": 0.0425, + "step": 9290 + }, + { + "epoch": 1.6765819361817198, + "grad_norm": 0.14714008569717407, + "learning_rate": 9.701690720855171e-05, + "loss": 0.0367, + "step": 9300 + }, + { + "epoch": 1.678384712457184, + "grad_norm": 0.16367602348327637, + "learning_rate": 9.700752375446253e-05, + "loss": 0.0466, + "step": 9310 + }, + { + "epoch": 1.6801874887326482, + "grad_norm": 0.11947980523109436, + "learning_rate": 9.69981260207288e-05, + "loss": 0.0406, + "step": 9320 + }, + { + "epoch": 1.6819902650081124, + "grad_norm": 0.171796977519989, + "learning_rate": 9.698871401020529e-05, + "loss": 0.0413, + "step": 9330 + }, + { + "epoch": 1.6837930412835767, + "grad_norm": 0.13770607113838196, + "learning_rate": 9.697928772575112e-05, + "loss": 0.0452, + "step": 9340 + }, + { + "epoch": 1.685595817559041, + "grad_norm": 0.12719982862472534, + "learning_rate": 9.696984717022976e-05, + "loss": 0.0492, + "step": 9350 + }, + { + "epoch": 1.687398593834505, + "grad_norm": 0.1812325119972229, + "learning_rate": 9.6960392346509e-05, + "loss": 0.0407, + "step": 9360 + }, + { + "epoch": 1.6892013701099695, + "grad_norm": 0.15962675213813782, + "learning_rate": 9.695092325746097e-05, + "loss": 0.0425, + "step": 9370 + }, + { + "epoch": 1.6910041463854335, + "grad_norm": 0.16723452508449554, + "learning_rate": 9.694143990596211e-05, + "loss": 0.0371, + "step": 9380 + }, + { + "epoch": 1.692806922660898, + "grad_norm": 0.13127018511295319, + "learning_rate": 9.693194229489325e-05, + "loss": 0.0399, + "step": 9390 + }, + { + "epoch": 1.694609698936362, + "grad_norm": 0.1530247926712036, + "learning_rate": 9.692243042713944e-05, + "loss": 0.0372, + "step": 9400 + }, + { + "epoch": 1.6964124752118264, + "grad_norm": 0.1293390542268753, + "learning_rate": 9.691290430559022e-05, + "loss": 0.0424, + "step": 9410 + }, + { + "epoch": 1.6982152514872904, + "grad_norm": 0.12290593981742859, + "learning_rate": 9.690336393313932e-05, + "loss": 0.0406, + "step": 9420 + }, + { + "epoch": 1.7000180277627548, + "grad_norm": 0.13951396942138672, + "learning_rate": 9.689380931268487e-05, + "loss": 0.041, + "step": 9430 + }, + { + "epoch": 1.7018208040382188, + "grad_norm": 0.14184926450252533, + "learning_rate": 9.688424044712932e-05, + "loss": 0.0432, + "step": 9440 + }, + { + "epoch": 1.7036235803136832, + "grad_norm": 0.12445221841335297, + "learning_rate": 9.687465733937942e-05, + "loss": 0.0419, + "step": 9450 + }, + { + "epoch": 1.7054263565891472, + "grad_norm": 0.12526944279670715, + "learning_rate": 9.686505999234627e-05, + "loss": 0.042, + "step": 9460 + }, + { + "epoch": 1.7072291328646116, + "grad_norm": 0.11837741732597351, + "learning_rate": 9.685544840894529e-05, + "loss": 0.0436, + "step": 9470 + }, + { + "epoch": 1.7090319091400756, + "grad_norm": 0.15270712971687317, + "learning_rate": 9.684582259209624e-05, + "loss": 0.0411, + "step": 9480 + }, + { + "epoch": 1.71083468541554, + "grad_norm": 0.1271277219057083, + "learning_rate": 9.683618254472317e-05, + "loss": 0.0374, + "step": 9490 + }, + { + "epoch": 1.712637461691004, + "grad_norm": 0.10759492963552475, + "learning_rate": 9.682652826975449e-05, + "loss": 0.039, + "step": 9500 + }, + { + "epoch": 1.7144402379664685, + "grad_norm": 0.1235543042421341, + "learning_rate": 9.681685977012291e-05, + "loss": 0.0404, + "step": 9510 + }, + { + "epoch": 1.7162430142419325, + "grad_norm": 0.12931011617183685, + "learning_rate": 9.680717704876546e-05, + "loss": 0.0354, + "step": 9520 + }, + { + "epoch": 1.718045790517397, + "grad_norm": 0.120416559278965, + "learning_rate": 9.679748010862349e-05, + "loss": 0.0385, + "step": 9530 + }, + { + "epoch": 1.719848566792861, + "grad_norm": 0.1548643559217453, + "learning_rate": 9.678776895264267e-05, + "loss": 0.042, + "step": 9540 + }, + { + "epoch": 1.7216513430683253, + "grad_norm": 0.12035730481147766, + "learning_rate": 9.6778043583773e-05, + "loss": 0.0439, + "step": 9550 + }, + { + "epoch": 1.7234541193437893, + "grad_norm": 0.13684290647506714, + "learning_rate": 9.67683040049688e-05, + "loss": 0.0428, + "step": 9560 + }, + { + "epoch": 1.7252568956192538, + "grad_norm": 0.19395649433135986, + "learning_rate": 9.675855021918869e-05, + "loss": 0.043, + "step": 9570 + }, + { + "epoch": 1.7270596718947178, + "grad_norm": 0.115456722676754, + "learning_rate": 9.674878222939561e-05, + "loss": 0.0377, + "step": 9580 + }, + { + "epoch": 1.7288624481701822, + "grad_norm": 0.1280662566423416, + "learning_rate": 9.673900003855681e-05, + "loss": 0.0426, + "step": 9590 + }, + { + "epoch": 1.7306652244456462, + "grad_norm": 0.20764289796352386, + "learning_rate": 9.672920364964389e-05, + "loss": 0.0446, + "step": 9600 + }, + { + "epoch": 1.7324680007211106, + "grad_norm": 0.16308268904685974, + "learning_rate": 9.671939306563269e-05, + "loss": 0.0436, + "step": 9610 + }, + { + "epoch": 1.7342707769965746, + "grad_norm": 0.14351972937583923, + "learning_rate": 9.670956828950345e-05, + "loss": 0.0397, + "step": 9620 + }, + { + "epoch": 1.736073553272039, + "grad_norm": 0.1471412479877472, + "learning_rate": 9.669972932424065e-05, + "loss": 0.0388, + "step": 9630 + }, + { + "epoch": 1.737876329547503, + "grad_norm": 0.14039361476898193, + "learning_rate": 9.668987617283312e-05, + "loss": 0.039, + "step": 9640 + }, + { + "epoch": 1.7396791058229675, + "grad_norm": 0.18541479110717773, + "learning_rate": 9.668000883827397e-05, + "loss": 0.035, + "step": 9650 + }, + { + "epoch": 1.7414818820984315, + "grad_norm": 0.11093011498451233, + "learning_rate": 9.667012732356067e-05, + "loss": 0.0421, + "step": 9660 + }, + { + "epoch": 1.743284658373896, + "grad_norm": 0.14861413836479187, + "learning_rate": 9.666023163169493e-05, + "loss": 0.0413, + "step": 9670 + }, + { + "epoch": 1.74508743464936, + "grad_norm": 0.16462992131710052, + "learning_rate": 9.665032176568281e-05, + "loss": 0.0447, + "step": 9680 + }, + { + "epoch": 1.7468902109248243, + "grad_norm": 0.18467123806476593, + "learning_rate": 9.664039772853469e-05, + "loss": 0.0418, + "step": 9690 + }, + { + "epoch": 1.7486929872002883, + "grad_norm": 0.15918749570846558, + "learning_rate": 9.663045952326518e-05, + "loss": 0.0424, + "step": 9700 + }, + { + "epoch": 1.7504957634757528, + "grad_norm": 0.12315177917480469, + "learning_rate": 9.662050715289328e-05, + "loss": 0.0372, + "step": 9710 + }, + { + "epoch": 1.7522985397512167, + "grad_norm": 0.18573623895645142, + "learning_rate": 9.661054062044226e-05, + "loss": 0.0403, + "step": 9720 + }, + { + "epoch": 1.7541013160266812, + "grad_norm": 0.17335885763168335, + "learning_rate": 9.660055992893968e-05, + "loss": 0.0413, + "step": 9730 + }, + { + "epoch": 1.7559040923021452, + "grad_norm": 0.14658159017562866, + "learning_rate": 9.659056508141739e-05, + "loss": 0.0397, + "step": 9740 + }, + { + "epoch": 1.7577068685776096, + "grad_norm": 0.19271817803382874, + "learning_rate": 9.658055608091161e-05, + "loss": 0.036, + "step": 9750 + }, + { + "epoch": 1.7595096448530736, + "grad_norm": 0.1798640936613083, + "learning_rate": 9.657053293046276e-05, + "loss": 0.0409, + "step": 9760 + }, + { + "epoch": 1.761312421128538, + "grad_norm": 0.12259628623723984, + "learning_rate": 9.656049563311564e-05, + "loss": 0.0372, + "step": 9770 + }, + { + "epoch": 1.763115197404002, + "grad_norm": 0.15863682329654694, + "learning_rate": 9.655044419191929e-05, + "loss": 0.0419, + "step": 9780 + }, + { + "epoch": 1.7649179736794665, + "grad_norm": 0.11790483444929123, + "learning_rate": 9.654037860992711e-05, + "loss": 0.0433, + "step": 9790 + }, + { + "epoch": 1.7667207499549304, + "grad_norm": 0.13711854815483093, + "learning_rate": 9.653029889019672e-05, + "loss": 0.0455, + "step": 9800 + }, + { + "epoch": 1.7685235262303949, + "grad_norm": 0.11476242542266846, + "learning_rate": 9.65202050357901e-05, + "loss": 0.0372, + "step": 9810 + }, + { + "epoch": 1.7703263025058589, + "grad_norm": 0.1282246857881546, + "learning_rate": 9.651009704977347e-05, + "loss": 0.0409, + "step": 9820 + }, + { + "epoch": 1.7721290787813233, + "grad_norm": 0.16368697583675385, + "learning_rate": 9.649997493521738e-05, + "loss": 0.04, + "step": 9830 + }, + { + "epoch": 1.7739318550567873, + "grad_norm": 0.1306801289319992, + "learning_rate": 9.64898386951967e-05, + "loss": 0.0408, + "step": 9840 + }, + { + "epoch": 1.7757346313322517, + "grad_norm": 0.14899666607379913, + "learning_rate": 9.647968833279049e-05, + "loss": 0.0455, + "step": 9850 + }, + { + "epoch": 1.7775374076077157, + "grad_norm": 0.13333363831043243, + "learning_rate": 9.646952385108218e-05, + "loss": 0.039, + "step": 9860 + }, + { + "epoch": 1.7793401838831802, + "grad_norm": 0.12403760850429535, + "learning_rate": 9.645934525315951e-05, + "loss": 0.0395, + "step": 9870 + }, + { + "epoch": 1.7811429601586442, + "grad_norm": 0.17959313094615936, + "learning_rate": 9.644915254211442e-05, + "loss": 0.0428, + "step": 9880 + }, + { + "epoch": 1.7829457364341086, + "grad_norm": 0.17752526700496674, + "learning_rate": 9.643894572104321e-05, + "loss": 0.0455, + "step": 9890 + }, + { + "epoch": 1.7847485127095726, + "grad_norm": 0.15523189306259155, + "learning_rate": 9.642872479304644e-05, + "loss": 0.0414, + "step": 9900 + }, + { + "epoch": 1.786551288985037, + "grad_norm": 0.11487815529108047, + "learning_rate": 9.641848976122895e-05, + "loss": 0.0428, + "step": 9910 + }, + { + "epoch": 1.7883540652605012, + "grad_norm": 0.11738132685422897, + "learning_rate": 9.64082406286999e-05, + "loss": 0.0452, + "step": 9920 + }, + { + "epoch": 1.7901568415359654, + "grad_norm": 0.12917788326740265, + "learning_rate": 9.639797739857269e-05, + "loss": 0.0414, + "step": 9930 + }, + { + "epoch": 1.7919596178114297, + "grad_norm": 0.1342395395040512, + "learning_rate": 9.638770007396498e-05, + "loss": 0.0407, + "step": 9940 + }, + { + "epoch": 1.7937623940868939, + "grad_norm": 0.12433820962905884, + "learning_rate": 9.63774086579988e-05, + "loss": 0.0344, + "step": 9950 + }, + { + "epoch": 1.795565170362358, + "grad_norm": 0.16066451370716095, + "learning_rate": 9.63671031538004e-05, + "loss": 0.0441, + "step": 9960 + }, + { + "epoch": 1.7973679466378223, + "grad_norm": 0.14150521159172058, + "learning_rate": 9.635678356450031e-05, + "loss": 0.0391, + "step": 9970 + }, + { + "epoch": 1.7991707229132865, + "grad_norm": 0.1580437272787094, + "learning_rate": 9.634644989323336e-05, + "loss": 0.0461, + "step": 9980 + }, + { + "epoch": 1.8009734991887507, + "grad_norm": 0.11043276637792587, + "learning_rate": 9.633610214313861e-05, + "loss": 0.0402, + "step": 9990 + }, + { + "epoch": 1.802776275464215, + "grad_norm": 0.10205219686031342, + "learning_rate": 9.632574031735951e-05, + "loss": 0.0378, + "step": 10000 + } + ], + "logging_steps": 10, + "max_steps": 60000, + "num_input_tokens_seen": 0, + "num_train_epochs": 11, + "save_steps": 10000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 64, + "trial_name": null, + "trial_params": null +}