| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 2.8648734680884926, | |
| "eval_steps": 500, | |
| "global_step": 9000, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.0, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0, | |
| "loss": 7.7169, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.01, | |
| "grad_norm": 10.55540657043457, | |
| "learning_rate": 9.997877083112197e-05, | |
| "loss": 9.0438, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.01, | |
| "grad_norm": 6.060225009918213, | |
| "learning_rate": 9.987262498673178e-05, | |
| "loss": 3.211, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.01, | |
| "grad_norm": 2.5255496501922607, | |
| "learning_rate": 9.976647914234159e-05, | |
| "loss": 0.6387, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.02, | |
| "grad_norm": 2.976543664932251, | |
| "learning_rate": 9.966033329795139e-05, | |
| "loss": 0.5633, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.02, | |
| "grad_norm": 2.2680673599243164, | |
| "learning_rate": 9.95541874535612e-05, | |
| "loss": 0.474, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.02, | |
| "grad_norm": 3.136930465698242, | |
| "learning_rate": 9.944804160917101e-05, | |
| "loss": 0.3379, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.03, | |
| "grad_norm": 4.159604072570801, | |
| "learning_rate": 9.935251034921983e-05, | |
| "loss": 0.4444, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.03, | |
| "grad_norm": 1.704042911529541, | |
| "learning_rate": 9.924636450482963e-05, | |
| "loss": 0.4925, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.03, | |
| "grad_norm": 3.9414522647857666, | |
| "learning_rate": 9.914021866043945e-05, | |
| "loss": 0.4583, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.04, | |
| "grad_norm": 2.938662052154541, | |
| "learning_rate": 9.903407281604927e-05, | |
| "loss": 0.3838, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.04, | |
| "grad_norm": 1.8753790855407715, | |
| "learning_rate": 9.892792697165907e-05, | |
| "loss": 0.3247, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.04, | |
| "grad_norm": 1.75948965549469, | |
| "learning_rate": 9.882178112726887e-05, | |
| "loss": 0.3609, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.04, | |
| "grad_norm": 1.9066141843795776, | |
| "learning_rate": 9.871563528287868e-05, | |
| "loss": 0.3453, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.05, | |
| "grad_norm": 1.7767695188522339, | |
| "learning_rate": 9.86094894384885e-05, | |
| "loss": 0.5076, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.05, | |
| "grad_norm": 2.5219664573669434, | |
| "learning_rate": 9.85033435940983e-05, | |
| "loss": 0.4999, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.05, | |
| "grad_norm": 2.0505383014678955, | |
| "learning_rate": 9.83971977497081e-05, | |
| "loss": 0.5429, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.06, | |
| "grad_norm": 6.132015705108643, | |
| "learning_rate": 9.82910519053179e-05, | |
| "loss": 0.5099, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.06, | |
| "grad_norm": 1.057868480682373, | |
| "learning_rate": 9.818490606092772e-05, | |
| "loss": 0.4416, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.06, | |
| "grad_norm": 2.6155290603637695, | |
| "learning_rate": 9.807876021653753e-05, | |
| "loss": 0.3986, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.07, | |
| "grad_norm": 2.1468820571899414, | |
| "learning_rate": 9.797261437214733e-05, | |
| "loss": 0.3216, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.07, | |
| "grad_norm": 0.6600925326347351, | |
| "learning_rate": 9.786646852775713e-05, | |
| "loss": 0.3552, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.07, | |
| "grad_norm": 5.129382133483887, | |
| "learning_rate": 9.776032268336695e-05, | |
| "loss": 0.3221, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.08, | |
| "grad_norm": 0.3891478180885315, | |
| "learning_rate": 9.765417683897677e-05, | |
| "loss": 0.4073, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.08, | |
| "grad_norm": 3.254958391189575, | |
| "learning_rate": 9.754803099458657e-05, | |
| "loss": 0.4212, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.08, | |
| "grad_norm": 3.34332013130188, | |
| "learning_rate": 9.744188515019638e-05, | |
| "loss": 0.2167, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.09, | |
| "grad_norm": 3.801086902618408, | |
| "learning_rate": 9.733573930580618e-05, | |
| "loss": 0.5605, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.09, | |
| "grad_norm": 5.026745796203613, | |
| "learning_rate": 9.7229593461416e-05, | |
| "loss": 0.3527, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.09, | |
| "grad_norm": 3.8389620780944824, | |
| "learning_rate": 9.71234476170258e-05, | |
| "loss": 0.295, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.1, | |
| "grad_norm": 2.0584566593170166, | |
| "learning_rate": 9.70173017726356e-05, | |
| "loss": 0.2759, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.1, | |
| "grad_norm": 3.132164239883423, | |
| "learning_rate": 9.691115592824541e-05, | |
| "loss": 0.3888, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.1, | |
| "grad_norm": 0.5387492179870605, | |
| "learning_rate": 9.680501008385522e-05, | |
| "loss": 0.2285, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.11, | |
| "grad_norm": 3.0382373332977295, | |
| "learning_rate": 9.669886423946503e-05, | |
| "loss": 0.2549, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.11, | |
| "grad_norm": 6.465576648712158, | |
| "learning_rate": 9.659271839507483e-05, | |
| "loss": 0.7377, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.11, | |
| "grad_norm": 4.1156134605407715, | |
| "learning_rate": 9.648657255068465e-05, | |
| "loss": 0.3387, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.11, | |
| "grad_norm": 4.147655963897705, | |
| "learning_rate": 9.638042670629445e-05, | |
| "loss": 0.2605, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.12, | |
| "grad_norm": 1.4572869539260864, | |
| "learning_rate": 9.627428086190427e-05, | |
| "loss": 0.3024, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.12, | |
| "grad_norm": 1.906175971031189, | |
| "learning_rate": 9.616813501751407e-05, | |
| "loss": 0.3728, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.12, | |
| "grad_norm": 1.169878363609314, | |
| "learning_rate": 9.606198917312388e-05, | |
| "loss": 0.3961, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.13, | |
| "grad_norm": 1.2084730863571167, | |
| "learning_rate": 9.595584332873368e-05, | |
| "loss": 0.4887, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.13, | |
| "grad_norm": 0.7927988171577454, | |
| "learning_rate": 9.58496974843435e-05, | |
| "loss": 0.4519, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 0.13, | |
| "grad_norm": 6.37067985534668, | |
| "learning_rate": 9.57435516399533e-05, | |
| "loss": 0.237, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.14, | |
| "grad_norm": 2.9806203842163086, | |
| "learning_rate": 9.56374057955631e-05, | |
| "loss": 0.2917, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 0.14, | |
| "grad_norm": 5.05634880065918, | |
| "learning_rate": 9.553125995117291e-05, | |
| "loss": 0.2794, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 0.14, | |
| "grad_norm": 3.0483241081237793, | |
| "learning_rate": 9.542511410678273e-05, | |
| "loss": 0.3182, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.15, | |
| "grad_norm": 3.2123796939849854, | |
| "learning_rate": 9.531896826239253e-05, | |
| "loss": 0.2872, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 0.15, | |
| "grad_norm": 1.532020092010498, | |
| "learning_rate": 9.521282241800233e-05, | |
| "loss": 0.3258, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 0.15, | |
| "grad_norm": 1.1242539882659912, | |
| "learning_rate": 9.510667657361215e-05, | |
| "loss": 0.3356, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "grad_norm": 4.846567153930664, | |
| "learning_rate": 9.500053072922196e-05, | |
| "loss": 0.3551, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "grad_norm": 3.233238458633423, | |
| "learning_rate": 9.489438488483177e-05, | |
| "loss": 0.3971, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "grad_norm": 1.7334824800491333, | |
| "learning_rate": 9.478823904044158e-05, | |
| "loss": 0.1896, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 0.17, | |
| "grad_norm": 7.36009407043457, | |
| "learning_rate": 9.468209319605138e-05, | |
| "loss": 0.3338, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 0.17, | |
| "grad_norm": 2.7838549613952637, | |
| "learning_rate": 9.457594735166118e-05, | |
| "loss": 0.3331, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 0.17, | |
| "grad_norm": 2.643627405166626, | |
| "learning_rate": 9.4469801507271e-05, | |
| "loss": 0.4575, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 0.18, | |
| "grad_norm": 5.420917510986328, | |
| "learning_rate": 9.43636556628808e-05, | |
| "loss": 0.37, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.18, | |
| "grad_norm": 2.1689090728759766, | |
| "learning_rate": 9.425750981849061e-05, | |
| "loss": 0.3551, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 0.18, | |
| "grad_norm": 0.7210526466369629, | |
| "learning_rate": 9.415136397410041e-05, | |
| "loss": 0.4028, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 0.18, | |
| "grad_norm": 0.3214457929134369, | |
| "learning_rate": 9.404521812971022e-05, | |
| "loss": 0.2391, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 0.19, | |
| "grad_norm": 3.8258142471313477, | |
| "learning_rate": 9.393907228532003e-05, | |
| "loss": 0.3399, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 0.19, | |
| "grad_norm": 1.5249234437942505, | |
| "learning_rate": 9.383292644092985e-05, | |
| "loss": 0.4449, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.19, | |
| "grad_norm": 0.22292350232601166, | |
| "learning_rate": 9.372678059653965e-05, | |
| "loss": 0.2156, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 0.2, | |
| "grad_norm": 1.3040258884429932, | |
| "learning_rate": 9.362063475214946e-05, | |
| "loss": 0.4175, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 0.2, | |
| "grad_norm": 1.3762481212615967, | |
| "learning_rate": 9.351448890775926e-05, | |
| "loss": 0.3191, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 0.2, | |
| "grad_norm": 2.706467866897583, | |
| "learning_rate": 9.340834306336908e-05, | |
| "loss": 0.5163, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 0.21, | |
| "grad_norm": 1.8577134609222412, | |
| "learning_rate": 9.330219721897888e-05, | |
| "loss": 0.1832, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 0.21, | |
| "grad_norm": 5.450695037841797, | |
| "learning_rate": 9.319605137458869e-05, | |
| "loss": 0.269, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 0.21, | |
| "grad_norm": 3.1967124938964844, | |
| "learning_rate": 9.308990553019849e-05, | |
| "loss": 0.3387, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 0.22, | |
| "grad_norm": 2.2148098945617676, | |
| "learning_rate": 9.29837596858083e-05, | |
| "loss": 0.3407, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 0.22, | |
| "grad_norm": 2.2693583965301514, | |
| "learning_rate": 9.287761384141811e-05, | |
| "loss": 0.2758, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 0.22, | |
| "grad_norm": 4.460744857788086, | |
| "learning_rate": 9.277146799702791e-05, | |
| "loss": 0.2493, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.23, | |
| "grad_norm": 8.331945419311523, | |
| "learning_rate": 9.266532215263772e-05, | |
| "loss": 0.2264, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 0.23, | |
| "grad_norm": 2.7469747066497803, | |
| "learning_rate": 9.255917630824753e-05, | |
| "loss": 0.3038, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 0.23, | |
| "grad_norm": 3.013535737991333, | |
| "learning_rate": 9.245303046385735e-05, | |
| "loss": 0.3136, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 0.24, | |
| "grad_norm": 3.508979558944702, | |
| "learning_rate": 9.234688461946716e-05, | |
| "loss": 0.3502, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 0.24, | |
| "grad_norm": 5.0464301109313965, | |
| "learning_rate": 9.224073877507696e-05, | |
| "loss": 0.1776, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 0.24, | |
| "grad_norm": 1.6929841041564941, | |
| "learning_rate": 9.213459293068676e-05, | |
| "loss": 0.2984, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 0.25, | |
| "grad_norm": 1.1452223062515259, | |
| "learning_rate": 9.202844708629658e-05, | |
| "loss": 0.2503, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 0.25, | |
| "grad_norm": 1.3975647687911987, | |
| "learning_rate": 9.192230124190638e-05, | |
| "loss": 0.2423, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 0.25, | |
| "grad_norm": 1.8630661964416504, | |
| "learning_rate": 9.181615539751619e-05, | |
| "loss": 0.327, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 0.25, | |
| "grad_norm": 5.333163261413574, | |
| "learning_rate": 9.171000955312599e-05, | |
| "loss": 0.4495, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.26, | |
| "grad_norm": 1.6478999853134155, | |
| "learning_rate": 9.160386370873581e-05, | |
| "loss": 0.2546, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 0.26, | |
| "grad_norm": 1.2132633924484253, | |
| "learning_rate": 9.149771786434561e-05, | |
| "loss": 0.2439, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 0.26, | |
| "grad_norm": 2.2123448848724365, | |
| "learning_rate": 9.139157201995542e-05, | |
| "loss": 0.3715, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 0.27, | |
| "grad_norm": 2.148674726486206, | |
| "learning_rate": 9.128542617556523e-05, | |
| "loss": 0.252, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 0.27, | |
| "grad_norm": 3.6980788707733154, | |
| "learning_rate": 9.117928033117504e-05, | |
| "loss": 0.4487, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 0.27, | |
| "grad_norm": 6.548594951629639, | |
| "learning_rate": 9.107313448678485e-05, | |
| "loss": 0.2199, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 0.28, | |
| "grad_norm": 3.5746383666992188, | |
| "learning_rate": 9.096698864239466e-05, | |
| "loss": 0.2728, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 0.28, | |
| "grad_norm": 0.9120383858680725, | |
| "learning_rate": 9.086084279800446e-05, | |
| "loss": 0.2737, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 0.28, | |
| "grad_norm": 4.220329761505127, | |
| "learning_rate": 9.075469695361427e-05, | |
| "loss": 0.4124, | |
| "step": 890 | |
| }, | |
| { | |
| "epoch": 0.29, | |
| "grad_norm": 2.5000956058502197, | |
| "learning_rate": 9.064855110922408e-05, | |
| "loss": 0.302, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.29, | |
| "grad_norm": 5.3845906257629395, | |
| "learning_rate": 9.054240526483389e-05, | |
| "loss": 0.4177, | |
| "step": 910 | |
| }, | |
| { | |
| "epoch": 0.29, | |
| "grad_norm": 1.0533277988433838, | |
| "learning_rate": 9.043625942044369e-05, | |
| "loss": 0.3834, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 0.3, | |
| "grad_norm": 2.482363224029541, | |
| "learning_rate": 9.03301135760535e-05, | |
| "loss": 0.3497, | |
| "step": 930 | |
| }, | |
| { | |
| "epoch": 0.3, | |
| "grad_norm": 2.785825729370117, | |
| "learning_rate": 9.022396773166331e-05, | |
| "loss": 0.2696, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 0.3, | |
| "grad_norm": 0.9899762868881226, | |
| "learning_rate": 9.011782188727311e-05, | |
| "loss": 0.3139, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 0.31, | |
| "grad_norm": 3.0521786212921143, | |
| "learning_rate": 9.001167604288293e-05, | |
| "loss": 0.4116, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 0.31, | |
| "grad_norm": 1.1553211212158203, | |
| "learning_rate": 8.990553019849274e-05, | |
| "loss": 0.3239, | |
| "step": 970 | |
| }, | |
| { | |
| "epoch": 0.31, | |
| "grad_norm": 2.973958730697632, | |
| "learning_rate": 8.979938435410254e-05, | |
| "loss": 0.297, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "grad_norm": 1.3011306524276733, | |
| "learning_rate": 8.969323850971236e-05, | |
| "loss": 0.3136, | |
| "step": 990 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "grad_norm": 2.6845755577087402, | |
| "learning_rate": 8.958709266532216e-05, | |
| "loss": 0.3207, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "grad_norm": 0.33025118708610535, | |
| "learning_rate": 8.948094682093196e-05, | |
| "loss": 0.1847, | |
| "step": 1010 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "grad_norm": 1.9631307125091553, | |
| "learning_rate": 8.937480097654177e-05, | |
| "loss": 0.2798, | |
| "step": 1020 | |
| }, | |
| { | |
| "epoch": 0.33, | |
| "grad_norm": 1.952580451965332, | |
| "learning_rate": 8.926865513215158e-05, | |
| "loss": 0.2184, | |
| "step": 1030 | |
| }, | |
| { | |
| "epoch": 0.33, | |
| "grad_norm": 5.541811466217041, | |
| "learning_rate": 8.916250928776139e-05, | |
| "loss": 0.2649, | |
| "step": 1040 | |
| }, | |
| { | |
| "epoch": 0.33, | |
| "grad_norm": 1.0800001621246338, | |
| "learning_rate": 8.905636344337119e-05, | |
| "loss": 0.3064, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 0.34, | |
| "grad_norm": 4.908554553985596, | |
| "learning_rate": 8.8950217598981e-05, | |
| "loss": 0.2, | |
| "step": 1060 | |
| }, | |
| { | |
| "epoch": 0.34, | |
| "grad_norm": 0.08677980303764343, | |
| "learning_rate": 8.884407175459081e-05, | |
| "loss": 0.1262, | |
| "step": 1070 | |
| }, | |
| { | |
| "epoch": 0.34, | |
| "grad_norm": 1.9461978673934937, | |
| "learning_rate": 8.873792591020062e-05, | |
| "loss": 0.3098, | |
| "step": 1080 | |
| }, | |
| { | |
| "epoch": 0.35, | |
| "grad_norm": 0.11714805662631989, | |
| "learning_rate": 8.863178006581043e-05, | |
| "loss": 0.3596, | |
| "step": 1090 | |
| }, | |
| { | |
| "epoch": 0.35, | |
| "grad_norm": 2.0041699409484863, | |
| "learning_rate": 8.852563422142024e-05, | |
| "loss": 0.2518, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.35, | |
| "grad_norm": 5.036510467529297, | |
| "learning_rate": 8.841948837703004e-05, | |
| "loss": 0.3654, | |
| "step": 1110 | |
| }, | |
| { | |
| "epoch": 0.36, | |
| "grad_norm": 2.267143726348877, | |
| "learning_rate": 8.831334253263986e-05, | |
| "loss": 0.2812, | |
| "step": 1120 | |
| }, | |
| { | |
| "epoch": 0.36, | |
| "grad_norm": 3.063321113586426, | |
| "learning_rate": 8.820719668824966e-05, | |
| "loss": 0.3135, | |
| "step": 1130 | |
| }, | |
| { | |
| "epoch": 0.36, | |
| "grad_norm": 4.012215614318848, | |
| "learning_rate": 8.810105084385947e-05, | |
| "loss": 0.2423, | |
| "step": 1140 | |
| }, | |
| { | |
| "epoch": 0.37, | |
| "grad_norm": 1.7306702136993408, | |
| "learning_rate": 8.799490499946927e-05, | |
| "loss": 0.187, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 0.37, | |
| "grad_norm": 1.7319563627243042, | |
| "learning_rate": 8.788875915507909e-05, | |
| "loss": 0.3792, | |
| "step": 1160 | |
| }, | |
| { | |
| "epoch": 0.37, | |
| "grad_norm": 4.382763862609863, | |
| "learning_rate": 8.778261331068889e-05, | |
| "loss": 0.483, | |
| "step": 1170 | |
| }, | |
| { | |
| "epoch": 0.38, | |
| "grad_norm": 1.3643946647644043, | |
| "learning_rate": 8.76764674662987e-05, | |
| "loss": 0.1497, | |
| "step": 1180 | |
| }, | |
| { | |
| "epoch": 0.38, | |
| "grad_norm": 5.549211025238037, | |
| "learning_rate": 8.75703216219085e-05, | |
| "loss": 0.2628, | |
| "step": 1190 | |
| }, | |
| { | |
| "epoch": 0.38, | |
| "grad_norm": 2.2046520709991455, | |
| "learning_rate": 8.747479036195734e-05, | |
| "loss": 0.3474, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.39, | |
| "grad_norm": 3.313180446624756, | |
| "learning_rate": 8.736864451756715e-05, | |
| "loss": 0.3096, | |
| "step": 1210 | |
| }, | |
| { | |
| "epoch": 0.39, | |
| "grad_norm": 2.811859130859375, | |
| "learning_rate": 8.726249867317695e-05, | |
| "loss": 0.1371, | |
| "step": 1220 | |
| }, | |
| { | |
| "epoch": 0.39, | |
| "grad_norm": 0.43377700448036194, | |
| "learning_rate": 8.715635282878675e-05, | |
| "loss": 0.2461, | |
| "step": 1230 | |
| }, | |
| { | |
| "epoch": 0.39, | |
| "grad_norm": 2.7710583209991455, | |
| "learning_rate": 8.705020698439657e-05, | |
| "loss": 0.3332, | |
| "step": 1240 | |
| }, | |
| { | |
| "epoch": 0.4, | |
| "grad_norm": 0.4188406467437744, | |
| "learning_rate": 8.694406114000637e-05, | |
| "loss": 0.3196, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 0.4, | |
| "grad_norm": 0.7705641388893127, | |
| "learning_rate": 8.683791529561618e-05, | |
| "loss": 0.1709, | |
| "step": 1260 | |
| }, | |
| { | |
| "epoch": 0.4, | |
| "grad_norm": 2.6247994899749756, | |
| "learning_rate": 8.673176945122598e-05, | |
| "loss": 0.3033, | |
| "step": 1270 | |
| }, | |
| { | |
| "epoch": 0.41, | |
| "grad_norm": 1.033170461654663, | |
| "learning_rate": 8.66256236068358e-05, | |
| "loss": 0.2506, | |
| "step": 1280 | |
| }, | |
| { | |
| "epoch": 0.41, | |
| "grad_norm": 4.289760112762451, | |
| "learning_rate": 8.65194777624456e-05, | |
| "loss": 0.2839, | |
| "step": 1290 | |
| }, | |
| { | |
| "epoch": 0.41, | |
| "grad_norm": 1.3554538488388062, | |
| "learning_rate": 8.64133319180554e-05, | |
| "loss": 0.2703, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 0.42, | |
| "grad_norm": 1.9523005485534668, | |
| "learning_rate": 8.630718607366522e-05, | |
| "loss": 0.1133, | |
| "step": 1310 | |
| }, | |
| { | |
| "epoch": 0.42, | |
| "grad_norm": 5.332389831542969, | |
| "learning_rate": 8.620104022927503e-05, | |
| "loss": 0.3579, | |
| "step": 1320 | |
| }, | |
| { | |
| "epoch": 0.42, | |
| "grad_norm": 5.874100208282471, | |
| "learning_rate": 8.609489438488484e-05, | |
| "loss": 0.4038, | |
| "step": 1330 | |
| }, | |
| { | |
| "epoch": 0.43, | |
| "grad_norm": 1.4143377542495728, | |
| "learning_rate": 8.598874854049465e-05, | |
| "loss": 0.2451, | |
| "step": 1340 | |
| }, | |
| { | |
| "epoch": 0.43, | |
| "grad_norm": 0.5176362991333008, | |
| "learning_rate": 8.588260269610445e-05, | |
| "loss": 0.2561, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 0.43, | |
| "grad_norm": 1.5968561172485352, | |
| "learning_rate": 8.577645685171426e-05, | |
| "loss": 0.3456, | |
| "step": 1360 | |
| }, | |
| { | |
| "epoch": 0.44, | |
| "grad_norm": 1.039812445640564, | |
| "learning_rate": 8.567031100732407e-05, | |
| "loss": 0.2792, | |
| "step": 1370 | |
| }, | |
| { | |
| "epoch": 0.44, | |
| "grad_norm": 5.390068531036377, | |
| "learning_rate": 8.556416516293388e-05, | |
| "loss": 0.398, | |
| "step": 1380 | |
| }, | |
| { | |
| "epoch": 0.44, | |
| "grad_norm": 1.3645654916763306, | |
| "learning_rate": 8.545801931854368e-05, | |
| "loss": 0.4537, | |
| "step": 1390 | |
| }, | |
| { | |
| "epoch": 0.45, | |
| "grad_norm": 2.444027900695801, | |
| "learning_rate": 8.535187347415348e-05, | |
| "loss": 0.218, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 0.45, | |
| "grad_norm": 4.201082229614258, | |
| "learning_rate": 8.52457276297633e-05, | |
| "loss": 0.3146, | |
| "step": 1410 | |
| }, | |
| { | |
| "epoch": 0.45, | |
| "grad_norm": 4.080310344696045, | |
| "learning_rate": 8.51395817853731e-05, | |
| "loss": 0.2769, | |
| "step": 1420 | |
| }, | |
| { | |
| "epoch": 0.46, | |
| "grad_norm": 2.712216377258301, | |
| "learning_rate": 8.503343594098292e-05, | |
| "loss": 0.2795, | |
| "step": 1430 | |
| }, | |
| { | |
| "epoch": 0.46, | |
| "grad_norm": 3.2429492473602295, | |
| "learning_rate": 8.492729009659273e-05, | |
| "loss": 0.2956, | |
| "step": 1440 | |
| }, | |
| { | |
| "epoch": 0.46, | |
| "grad_norm": 6.107478618621826, | |
| "learning_rate": 8.482114425220253e-05, | |
| "loss": 0.3381, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 0.46, | |
| "grad_norm": 0.9037106037139893, | |
| "learning_rate": 8.471499840781235e-05, | |
| "loss": 0.4196, | |
| "step": 1460 | |
| }, | |
| { | |
| "epoch": 0.47, | |
| "grad_norm": 1.2487717866897583, | |
| "learning_rate": 8.460885256342215e-05, | |
| "loss": 0.2471, | |
| "step": 1470 | |
| }, | |
| { | |
| "epoch": 0.47, | |
| "grad_norm": 2.8922715187072754, | |
| "learning_rate": 8.450270671903195e-05, | |
| "loss": 0.2664, | |
| "step": 1480 | |
| }, | |
| { | |
| "epoch": 0.47, | |
| "grad_norm": 0.6493813991546631, | |
| "learning_rate": 8.439656087464176e-05, | |
| "loss": 0.206, | |
| "step": 1490 | |
| }, | |
| { | |
| "epoch": 0.48, | |
| "grad_norm": 0.11327870935201645, | |
| "learning_rate": 8.429041503025157e-05, | |
| "loss": 0.2593, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.48, | |
| "grad_norm": 4.4462690353393555, | |
| "learning_rate": 8.418426918586138e-05, | |
| "loss": 0.4474, | |
| "step": 1510 | |
| }, | |
| { | |
| "epoch": 0.48, | |
| "grad_norm": 2.0405867099761963, | |
| "learning_rate": 8.407812334147118e-05, | |
| "loss": 0.1657, | |
| "step": 1520 | |
| }, | |
| { | |
| "epoch": 0.49, | |
| "grad_norm": 0.3047516942024231, | |
| "learning_rate": 8.397197749708099e-05, | |
| "loss": 0.1691, | |
| "step": 1530 | |
| }, | |
| { | |
| "epoch": 0.49, | |
| "grad_norm": 6.330657958984375, | |
| "learning_rate": 8.386583165269079e-05, | |
| "loss": 0.2041, | |
| "step": 1540 | |
| }, | |
| { | |
| "epoch": 0.49, | |
| "grad_norm": 2.403702974319458, | |
| "learning_rate": 8.375968580830062e-05, | |
| "loss": 0.3408, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 0.5, | |
| "grad_norm": 3.2958528995513916, | |
| "learning_rate": 8.365353996391042e-05, | |
| "loss": 0.3271, | |
| "step": 1560 | |
| }, | |
| { | |
| "epoch": 0.5, | |
| "grad_norm": 3.2511487007141113, | |
| "learning_rate": 8.354739411952023e-05, | |
| "loss": 0.1719, | |
| "step": 1570 | |
| }, | |
| { | |
| "epoch": 0.5, | |
| "grad_norm": 2.447939872741699, | |
| "learning_rate": 8.344124827513003e-05, | |
| "loss": 0.2823, | |
| "step": 1580 | |
| }, | |
| { | |
| "epoch": 0.51, | |
| "grad_norm": 1.9992095232009888, | |
| "learning_rate": 8.333510243073985e-05, | |
| "loss": 0.2479, | |
| "step": 1590 | |
| }, | |
| { | |
| "epoch": 0.51, | |
| "grad_norm": 3.8574376106262207, | |
| "learning_rate": 8.322895658634965e-05, | |
| "loss": 0.2539, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 0.51, | |
| "grad_norm": 3.184896230697632, | |
| "learning_rate": 8.312281074195946e-05, | |
| "loss": 0.2826, | |
| "step": 1610 | |
| }, | |
| { | |
| "epoch": 0.52, | |
| "grad_norm": 0.6027563810348511, | |
| "learning_rate": 8.301666489756926e-05, | |
| "loss": 0.1404, | |
| "step": 1620 | |
| }, | |
| { | |
| "epoch": 0.52, | |
| "grad_norm": 1.0776386260986328, | |
| "learning_rate": 8.291051905317906e-05, | |
| "loss": 0.3887, | |
| "step": 1630 | |
| }, | |
| { | |
| "epoch": 0.52, | |
| "grad_norm": 2.386305093765259, | |
| "learning_rate": 8.280437320878888e-05, | |
| "loss": 0.4232, | |
| "step": 1640 | |
| }, | |
| { | |
| "epoch": 0.53, | |
| "grad_norm": 1.299332618713379, | |
| "learning_rate": 8.269822736439868e-05, | |
| "loss": 0.2855, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 0.53, | |
| "grad_norm": 1.3506910800933838, | |
| "learning_rate": 8.259208152000849e-05, | |
| "loss": 0.2412, | |
| "step": 1660 | |
| }, | |
| { | |
| "epoch": 0.53, | |
| "grad_norm": 2.2037456035614014, | |
| "learning_rate": 8.24859356756183e-05, | |
| "loss": 0.2399, | |
| "step": 1670 | |
| }, | |
| { | |
| "epoch": 0.53, | |
| "grad_norm": 2.2852354049682617, | |
| "learning_rate": 8.237978983122812e-05, | |
| "loss": 0.202, | |
| "step": 1680 | |
| }, | |
| { | |
| "epoch": 0.54, | |
| "grad_norm": 0.2693609297275543, | |
| "learning_rate": 8.227364398683793e-05, | |
| "loss": 0.3235, | |
| "step": 1690 | |
| }, | |
| { | |
| "epoch": 0.54, | |
| "grad_norm": 3.526648998260498, | |
| "learning_rate": 8.216749814244773e-05, | |
| "loss": 0.3102, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 0.54, | |
| "grad_norm": 1.9742597341537476, | |
| "learning_rate": 8.206135229805753e-05, | |
| "loss": 0.3293, | |
| "step": 1710 | |
| }, | |
| { | |
| "epoch": 0.55, | |
| "grad_norm": 2.933436155319214, | |
| "learning_rate": 8.195520645366734e-05, | |
| "loss": 0.207, | |
| "step": 1720 | |
| }, | |
| { | |
| "epoch": 0.55, | |
| "grad_norm": 0.5870353579521179, | |
| "learning_rate": 8.184906060927715e-05, | |
| "loss": 0.3731, | |
| "step": 1730 | |
| }, | |
| { | |
| "epoch": 0.55, | |
| "grad_norm": 1.7825034856796265, | |
| "learning_rate": 8.174291476488696e-05, | |
| "loss": 0.1747, | |
| "step": 1740 | |
| }, | |
| { | |
| "epoch": 0.56, | |
| "grad_norm": 4.706550598144531, | |
| "learning_rate": 8.163676892049676e-05, | |
| "loss": 0.2143, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 0.56, | |
| "grad_norm": 3.326359748840332, | |
| "learning_rate": 8.153062307610657e-05, | |
| "loss": 0.363, | |
| "step": 1760 | |
| }, | |
| { | |
| "epoch": 0.56, | |
| "grad_norm": 1.3437646627426147, | |
| "learning_rate": 8.142447723171638e-05, | |
| "loss": 0.2806, | |
| "step": 1770 | |
| }, | |
| { | |
| "epoch": 0.57, | |
| "grad_norm": 4.6950249671936035, | |
| "learning_rate": 8.131833138732619e-05, | |
| "loss": 0.2547, | |
| "step": 1780 | |
| }, | |
| { | |
| "epoch": 0.57, | |
| "grad_norm": 1.557305097579956, | |
| "learning_rate": 8.1212185542936e-05, | |
| "loss": 0.277, | |
| "step": 1790 | |
| }, | |
| { | |
| "epoch": 0.57, | |
| "grad_norm": 1.5373164415359497, | |
| "learning_rate": 8.110603969854581e-05, | |
| "loss": 0.2878, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 0.58, | |
| "grad_norm": 1.3761144876480103, | |
| "learning_rate": 8.099989385415561e-05, | |
| "loss": 0.4071, | |
| "step": 1810 | |
| }, | |
| { | |
| "epoch": 0.58, | |
| "grad_norm": 0.7141520977020264, | |
| "learning_rate": 8.089374800976543e-05, | |
| "loss": 0.2002, | |
| "step": 1820 | |
| }, | |
| { | |
| "epoch": 0.58, | |
| "grad_norm": 0.6471810340881348, | |
| "learning_rate": 8.078760216537523e-05, | |
| "loss": 0.1962, | |
| "step": 1830 | |
| }, | |
| { | |
| "epoch": 0.59, | |
| "grad_norm": 1.8333234786987305, | |
| "learning_rate": 8.068145632098504e-05, | |
| "loss": 0.23, | |
| "step": 1840 | |
| }, | |
| { | |
| "epoch": 0.59, | |
| "grad_norm": 0.7382714152336121, | |
| "learning_rate": 8.057531047659484e-05, | |
| "loss": 0.1602, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 0.59, | |
| "grad_norm": 2.2624874114990234, | |
| "learning_rate": 8.046916463220466e-05, | |
| "loss": 0.3355, | |
| "step": 1860 | |
| }, | |
| { | |
| "epoch": 0.6, | |
| "grad_norm": 1.3432509899139404, | |
| "learning_rate": 8.036301878781446e-05, | |
| "loss": 0.1226, | |
| "step": 1870 | |
| }, | |
| { | |
| "epoch": 0.6, | |
| "grad_norm": 1.3153080940246582, | |
| "learning_rate": 8.025687294342426e-05, | |
| "loss": 0.2797, | |
| "step": 1880 | |
| }, | |
| { | |
| "epoch": 0.6, | |
| "grad_norm": 0.13998636603355408, | |
| "learning_rate": 8.015072709903407e-05, | |
| "loss": 0.3126, | |
| "step": 1890 | |
| }, | |
| { | |
| "epoch": 0.6, | |
| "grad_norm": 7.6837382316589355, | |
| "learning_rate": 8.004458125464388e-05, | |
| "loss": 0.348, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 0.61, | |
| "grad_norm": 2.536726236343384, | |
| "learning_rate": 7.993843541025369e-05, | |
| "loss": 0.2518, | |
| "step": 1910 | |
| }, | |
| { | |
| "epoch": 0.61, | |
| "grad_norm": 2.798586130142212, | |
| "learning_rate": 7.98322895658635e-05, | |
| "loss": 0.187, | |
| "step": 1920 | |
| }, | |
| { | |
| "epoch": 0.61, | |
| "grad_norm": 2.047030210494995, | |
| "learning_rate": 7.972614372147331e-05, | |
| "loss": 0.1801, | |
| "step": 1930 | |
| }, | |
| { | |
| "epoch": 0.62, | |
| "grad_norm": 2.5127789974212646, | |
| "learning_rate": 7.961999787708311e-05, | |
| "loss": 0.2613, | |
| "step": 1940 | |
| }, | |
| { | |
| "epoch": 0.62, | |
| "grad_norm": 5.015801429748535, | |
| "learning_rate": 7.951385203269293e-05, | |
| "loss": 0.4155, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 0.62, | |
| "grad_norm": 4.095780849456787, | |
| "learning_rate": 7.940770618830273e-05, | |
| "loss": 0.2413, | |
| "step": 1960 | |
| }, | |
| { | |
| "epoch": 0.63, | |
| "grad_norm": 0.575307309627533, | |
| "learning_rate": 7.930156034391254e-05, | |
| "loss": 0.2799, | |
| "step": 1970 | |
| }, | |
| { | |
| "epoch": 0.63, | |
| "grad_norm": 0.26382434368133545, | |
| "learning_rate": 7.919541449952234e-05, | |
| "loss": 0.1894, | |
| "step": 1980 | |
| }, | |
| { | |
| "epoch": 0.63, | |
| "grad_norm": 1.7955100536346436, | |
| "learning_rate": 7.908926865513216e-05, | |
| "loss": 0.199, | |
| "step": 1990 | |
| }, | |
| { | |
| "epoch": 0.64, | |
| "grad_norm": 0.4029354453086853, | |
| "learning_rate": 7.898312281074196e-05, | |
| "loss": 0.2465, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.64, | |
| "grad_norm": 1.4386157989501953, | |
| "learning_rate": 7.887697696635177e-05, | |
| "loss": 0.2603, | |
| "step": 2010 | |
| }, | |
| { | |
| "epoch": 0.64, | |
| "grad_norm": 4.048315525054932, | |
| "learning_rate": 7.877083112196157e-05, | |
| "loss": 0.3663, | |
| "step": 2020 | |
| }, | |
| { | |
| "epoch": 0.65, | |
| "grad_norm": 4.0357255935668945, | |
| "learning_rate": 7.866468527757139e-05, | |
| "loss": 0.2365, | |
| "step": 2030 | |
| }, | |
| { | |
| "epoch": 0.65, | |
| "grad_norm": 0.6603661775588989, | |
| "learning_rate": 7.85585394331812e-05, | |
| "loss": 0.2848, | |
| "step": 2040 | |
| }, | |
| { | |
| "epoch": 0.65, | |
| "grad_norm": 2.005911111831665, | |
| "learning_rate": 7.845239358879101e-05, | |
| "loss": 0.316, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 0.66, | |
| "grad_norm": 1.5447591543197632, | |
| "learning_rate": 7.834624774440081e-05, | |
| "loss": 0.2741, | |
| "step": 2060 | |
| }, | |
| { | |
| "epoch": 0.66, | |
| "grad_norm": 3.2413675785064697, | |
| "learning_rate": 7.824010190001062e-05, | |
| "loss": 0.4234, | |
| "step": 2070 | |
| }, | |
| { | |
| "epoch": 0.66, | |
| "grad_norm": 2.6230356693267822, | |
| "learning_rate": 7.813395605562043e-05, | |
| "loss": 0.1797, | |
| "step": 2080 | |
| }, | |
| { | |
| "epoch": 0.67, | |
| "grad_norm": 1.5376132726669312, | |
| "learning_rate": 7.802781021123024e-05, | |
| "loss": 0.3815, | |
| "step": 2090 | |
| }, | |
| { | |
| "epoch": 0.67, | |
| "grad_norm": 1.4491734504699707, | |
| "learning_rate": 7.792166436684004e-05, | |
| "loss": 0.3153, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 0.67, | |
| "grad_norm": 1.949112057685852, | |
| "learning_rate": 7.781551852244984e-05, | |
| "loss": 0.2751, | |
| "step": 2110 | |
| }, | |
| { | |
| "epoch": 0.67, | |
| "grad_norm": 0.3488381803035736, | |
| "learning_rate": 7.770937267805966e-05, | |
| "loss": 0.3558, | |
| "step": 2120 | |
| }, | |
| { | |
| "epoch": 0.68, | |
| "grad_norm": 1.4437161684036255, | |
| "learning_rate": 7.760322683366946e-05, | |
| "loss": 0.2827, | |
| "step": 2130 | |
| }, | |
| { | |
| "epoch": 0.68, | |
| "grad_norm": 1.1105573177337646, | |
| "learning_rate": 7.749708098927927e-05, | |
| "loss": 0.1867, | |
| "step": 2140 | |
| }, | |
| { | |
| "epoch": 0.68, | |
| "grad_norm": 2.1235313415527344, | |
| "learning_rate": 7.739093514488907e-05, | |
| "loss": 0.1689, | |
| "step": 2150 | |
| }, | |
| { | |
| "epoch": 0.69, | |
| "grad_norm": 1.60935378074646, | |
| "learning_rate": 7.728478930049889e-05, | |
| "loss": 0.3198, | |
| "step": 2160 | |
| }, | |
| { | |
| "epoch": 0.69, | |
| "grad_norm": 1.3222334384918213, | |
| "learning_rate": 7.71786434561087e-05, | |
| "loss": 0.1978, | |
| "step": 2170 | |
| }, | |
| { | |
| "epoch": 0.69, | |
| "grad_norm": 1.4521784782409668, | |
| "learning_rate": 7.707249761171851e-05, | |
| "loss": 0.3276, | |
| "step": 2180 | |
| }, | |
| { | |
| "epoch": 0.7, | |
| "grad_norm": 0.4480780363082886, | |
| "learning_rate": 7.696635176732831e-05, | |
| "loss": 0.2151, | |
| "step": 2190 | |
| }, | |
| { | |
| "epoch": 0.7, | |
| "grad_norm": 1.5750231742858887, | |
| "learning_rate": 7.686020592293812e-05, | |
| "loss": 0.1659, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 0.7, | |
| "grad_norm": 2.5736334323883057, | |
| "learning_rate": 7.675406007854793e-05, | |
| "loss": 0.3704, | |
| "step": 2210 | |
| }, | |
| { | |
| "epoch": 0.71, | |
| "grad_norm": 3.719284772872925, | |
| "learning_rate": 7.664791423415774e-05, | |
| "loss": 0.1645, | |
| "step": 2220 | |
| }, | |
| { | |
| "epoch": 0.71, | |
| "grad_norm": 3.429244041442871, | |
| "learning_rate": 7.654176838976754e-05, | |
| "loss": 0.3323, | |
| "step": 2230 | |
| }, | |
| { | |
| "epoch": 0.71, | |
| "grad_norm": 2.801398277282715, | |
| "learning_rate": 7.643562254537735e-05, | |
| "loss": 0.2805, | |
| "step": 2240 | |
| }, | |
| { | |
| "epoch": 0.72, | |
| "grad_norm": 2.050607204437256, | |
| "learning_rate": 7.632947670098716e-05, | |
| "loss": 0.2308, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 0.72, | |
| "grad_norm": 3.164123773574829, | |
| "learning_rate": 7.622333085659697e-05, | |
| "loss": 0.2401, | |
| "step": 2260 | |
| }, | |
| { | |
| "epoch": 0.72, | |
| "grad_norm": 3.276832342147827, | |
| "learning_rate": 7.611718501220677e-05, | |
| "loss": 0.2399, | |
| "step": 2270 | |
| }, | |
| { | |
| "epoch": 0.73, | |
| "grad_norm": 2.8366944789886475, | |
| "learning_rate": 7.601103916781659e-05, | |
| "loss": 0.4004, | |
| "step": 2280 | |
| }, | |
| { | |
| "epoch": 0.73, | |
| "grad_norm": 2.4258265495300293, | |
| "learning_rate": 7.590489332342639e-05, | |
| "loss": 0.3202, | |
| "step": 2290 | |
| }, | |
| { | |
| "epoch": 0.73, | |
| "grad_norm": 1.4008164405822754, | |
| "learning_rate": 7.579874747903621e-05, | |
| "loss": 0.1952, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 0.74, | |
| "grad_norm": 1.1098754405975342, | |
| "learning_rate": 7.569260163464601e-05, | |
| "loss": 0.1867, | |
| "step": 2310 | |
| }, | |
| { | |
| "epoch": 0.74, | |
| "grad_norm": 0.15033583343029022, | |
| "learning_rate": 7.558645579025582e-05, | |
| "loss": 0.1995, | |
| "step": 2320 | |
| }, | |
| { | |
| "epoch": 0.74, | |
| "grad_norm": 0.9557719230651855, | |
| "learning_rate": 7.548030994586562e-05, | |
| "loss": 0.2475, | |
| "step": 2330 | |
| }, | |
| { | |
| "epoch": 0.74, | |
| "grad_norm": 8.91406536102295, | |
| "learning_rate": 7.537416410147544e-05, | |
| "loss": 0.2756, | |
| "step": 2340 | |
| }, | |
| { | |
| "epoch": 0.75, | |
| "grad_norm": 1.9521056413650513, | |
| "learning_rate": 7.526801825708524e-05, | |
| "loss": 0.2595, | |
| "step": 2350 | |
| }, | |
| { | |
| "epoch": 0.75, | |
| "grad_norm": 3.3855483531951904, | |
| "learning_rate": 7.516187241269504e-05, | |
| "loss": 0.2948, | |
| "step": 2360 | |
| }, | |
| { | |
| "epoch": 0.75, | |
| "grad_norm": 1.6990065574645996, | |
| "learning_rate": 7.506634115274387e-05, | |
| "loss": 0.2755, | |
| "step": 2370 | |
| }, | |
| { | |
| "epoch": 0.76, | |
| "grad_norm": 2.098942518234253, | |
| "learning_rate": 7.496019530835369e-05, | |
| "loss": 0.175, | |
| "step": 2380 | |
| }, | |
| { | |
| "epoch": 0.76, | |
| "grad_norm": 0.9781967997550964, | |
| "learning_rate": 7.48540494639635e-05, | |
| "loss": 0.4592, | |
| "step": 2390 | |
| }, | |
| { | |
| "epoch": 0.76, | |
| "grad_norm": 0.4728473722934723, | |
| "learning_rate": 7.47479036195733e-05, | |
| "loss": 0.3847, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 0.77, | |
| "grad_norm": 3.3047373294830322, | |
| "learning_rate": 7.46417577751831e-05, | |
| "loss": 0.1848, | |
| "step": 2410 | |
| }, | |
| { | |
| "epoch": 0.77, | |
| "grad_norm": 2.424025535583496, | |
| "learning_rate": 7.453561193079292e-05, | |
| "loss": 0.2197, | |
| "step": 2420 | |
| }, | |
| { | |
| "epoch": 0.77, | |
| "grad_norm": 2.697960376739502, | |
| "learning_rate": 7.442946608640272e-05, | |
| "loss": 0.2314, | |
| "step": 2430 | |
| }, | |
| { | |
| "epoch": 0.78, | |
| "grad_norm": 0.496898353099823, | |
| "learning_rate": 7.432332024201253e-05, | |
| "loss": 0.3299, | |
| "step": 2440 | |
| }, | |
| { | |
| "epoch": 0.78, | |
| "grad_norm": 1.4845099449157715, | |
| "learning_rate": 7.421717439762233e-05, | |
| "loss": 0.2832, | |
| "step": 2450 | |
| }, | |
| { | |
| "epoch": 0.78, | |
| "grad_norm": 3.8896942138671875, | |
| "learning_rate": 7.411102855323215e-05, | |
| "loss": 0.2837, | |
| "step": 2460 | |
| }, | |
| { | |
| "epoch": 0.79, | |
| "grad_norm": 4.288979530334473, | |
| "learning_rate": 7.400488270884195e-05, | |
| "loss": 0.1653, | |
| "step": 2470 | |
| }, | |
| { | |
| "epoch": 0.79, | |
| "grad_norm": 3.0013909339904785, | |
| "learning_rate": 7.389873686445176e-05, | |
| "loss": 0.3207, | |
| "step": 2480 | |
| }, | |
| { | |
| "epoch": 0.79, | |
| "grad_norm": 0.38008421659469604, | |
| "learning_rate": 7.379259102006156e-05, | |
| "loss": 0.2916, | |
| "step": 2490 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "grad_norm": 3.843106985092163, | |
| "learning_rate": 7.368644517567138e-05, | |
| "loss": 0.4216, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "grad_norm": 0.46844518184661865, | |
| "learning_rate": 7.35802993312812e-05, | |
| "loss": 0.3038, | |
| "step": 2510 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "grad_norm": 0.5063233375549316, | |
| "learning_rate": 7.3474153486891e-05, | |
| "loss": 0.2392, | |
| "step": 2520 | |
| }, | |
| { | |
| "epoch": 0.81, | |
| "grad_norm": 6.260082721710205, | |
| "learning_rate": 7.33680076425008e-05, | |
| "loss": 0.317, | |
| "step": 2530 | |
| }, | |
| { | |
| "epoch": 0.81, | |
| "grad_norm": 1.771292805671692, | |
| "learning_rate": 7.32618617981106e-05, | |
| "loss": 0.2229, | |
| "step": 2540 | |
| }, | |
| { | |
| "epoch": 0.81, | |
| "grad_norm": 5.619741439819336, | |
| "learning_rate": 7.315571595372042e-05, | |
| "loss": 0.1364, | |
| "step": 2550 | |
| }, | |
| { | |
| "epoch": 0.81, | |
| "grad_norm": 2.196967363357544, | |
| "learning_rate": 7.304957010933023e-05, | |
| "loss": 0.2732, | |
| "step": 2560 | |
| }, | |
| { | |
| "epoch": 0.82, | |
| "grad_norm": 0.6409101486206055, | |
| "learning_rate": 7.294342426494003e-05, | |
| "loss": 0.2754, | |
| "step": 2570 | |
| }, | |
| { | |
| "epoch": 0.82, | |
| "grad_norm": 1.4790414571762085, | |
| "learning_rate": 7.283727842054983e-05, | |
| "loss": 0.2017, | |
| "step": 2580 | |
| }, | |
| { | |
| "epoch": 0.82, | |
| "grad_norm": 2.013932943344116, | |
| "learning_rate": 7.273113257615965e-05, | |
| "loss": 0.24, | |
| "step": 2590 | |
| }, | |
| { | |
| "epoch": 0.83, | |
| "grad_norm": 3.7832634449005127, | |
| "learning_rate": 7.262498673176945e-05, | |
| "loss": 0.3675, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 0.83, | |
| "grad_norm": 0.3102867007255554, | |
| "learning_rate": 7.251884088737926e-05, | |
| "loss": 0.379, | |
| "step": 2610 | |
| }, | |
| { | |
| "epoch": 0.83, | |
| "grad_norm": 2.4098093509674072, | |
| "learning_rate": 7.241269504298906e-05, | |
| "loss": 0.381, | |
| "step": 2620 | |
| }, | |
| { | |
| "epoch": 0.84, | |
| "grad_norm": 2.3519186973571777, | |
| "learning_rate": 7.230654919859888e-05, | |
| "loss": 0.2574, | |
| "step": 2630 | |
| }, | |
| { | |
| "epoch": 0.84, | |
| "grad_norm": 1.1589571237564087, | |
| "learning_rate": 7.22004033542087e-05, | |
| "loss": 0.1603, | |
| "step": 2640 | |
| }, | |
| { | |
| "epoch": 0.84, | |
| "grad_norm": 3.823918342590332, | |
| "learning_rate": 7.20942575098185e-05, | |
| "loss": 0.2485, | |
| "step": 2650 | |
| }, | |
| { | |
| "epoch": 0.85, | |
| "grad_norm": 1.778441071510315, | |
| "learning_rate": 7.19881116654283e-05, | |
| "loss": 0.234, | |
| "step": 2660 | |
| }, | |
| { | |
| "epoch": 0.85, | |
| "grad_norm": 2.2710683345794678, | |
| "learning_rate": 7.188196582103811e-05, | |
| "loss": 0.1746, | |
| "step": 2670 | |
| }, | |
| { | |
| "epoch": 0.85, | |
| "grad_norm": 6.078259468078613, | |
| "learning_rate": 7.177581997664792e-05, | |
| "loss": 0.3255, | |
| "step": 2680 | |
| }, | |
| { | |
| "epoch": 0.86, | |
| "grad_norm": 0.585472583770752, | |
| "learning_rate": 7.166967413225773e-05, | |
| "loss": 0.3718, | |
| "step": 2690 | |
| }, | |
| { | |
| "epoch": 0.86, | |
| "grad_norm": 1.9394687414169312, | |
| "learning_rate": 7.156352828786753e-05, | |
| "loss": 0.3181, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 0.86, | |
| "grad_norm": 1.6753870248794556, | |
| "learning_rate": 7.145738244347734e-05, | |
| "loss": 0.2424, | |
| "step": 2710 | |
| }, | |
| { | |
| "epoch": 0.87, | |
| "grad_norm": 0.37682977318763733, | |
| "learning_rate": 7.135123659908714e-05, | |
| "loss": 0.2963, | |
| "step": 2720 | |
| }, | |
| { | |
| "epoch": 0.87, | |
| "grad_norm": 3.564805507659912, | |
| "learning_rate": 7.124509075469696e-05, | |
| "loss": 0.2822, | |
| "step": 2730 | |
| }, | |
| { | |
| "epoch": 0.87, | |
| "grad_norm": 0.22953364253044128, | |
| "learning_rate": 7.113894491030676e-05, | |
| "loss": 0.3489, | |
| "step": 2740 | |
| }, | |
| { | |
| "epoch": 0.88, | |
| "grad_norm": 4.16074275970459, | |
| "learning_rate": 7.103279906591658e-05, | |
| "loss": 0.405, | |
| "step": 2750 | |
| }, | |
| { | |
| "epoch": 0.88, | |
| "grad_norm": 1.4540446996688843, | |
| "learning_rate": 7.092665322152638e-05, | |
| "loss": 0.2634, | |
| "step": 2760 | |
| }, | |
| { | |
| "epoch": 0.88, | |
| "grad_norm": 1.9992202520370483, | |
| "learning_rate": 7.082050737713618e-05, | |
| "loss": 0.2762, | |
| "step": 2770 | |
| }, | |
| { | |
| "epoch": 0.88, | |
| "grad_norm": 1.3939869403839111, | |
| "learning_rate": 7.0714361532746e-05, | |
| "loss": 0.3462, | |
| "step": 2780 | |
| }, | |
| { | |
| "epoch": 0.89, | |
| "grad_norm": 0.6099751591682434, | |
| "learning_rate": 7.06082156883558e-05, | |
| "loss": 0.367, | |
| "step": 2790 | |
| }, | |
| { | |
| "epoch": 0.89, | |
| "grad_norm": 6.303842067718506, | |
| "learning_rate": 7.050206984396561e-05, | |
| "loss": 0.2596, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 0.89, | |
| "grad_norm": 1.5723298788070679, | |
| "learning_rate": 7.039592399957541e-05, | |
| "loss": 0.3136, | |
| "step": 2810 | |
| }, | |
| { | |
| "epoch": 0.9, | |
| "grad_norm": 1.3614245653152466, | |
| "learning_rate": 7.028977815518523e-05, | |
| "loss": 0.2983, | |
| "step": 2820 | |
| }, | |
| { | |
| "epoch": 0.9, | |
| "grad_norm": 2.220656633377075, | |
| "learning_rate": 7.018363231079503e-05, | |
| "loss": 0.3549, | |
| "step": 2830 | |
| }, | |
| { | |
| "epoch": 0.9, | |
| "grad_norm": 2.8158984184265137, | |
| "learning_rate": 7.007748646640484e-05, | |
| "loss": 0.2431, | |
| "step": 2840 | |
| }, | |
| { | |
| "epoch": 0.91, | |
| "grad_norm": 0.46454083919525146, | |
| "learning_rate": 6.997134062201464e-05, | |
| "loss": 0.204, | |
| "step": 2850 | |
| }, | |
| { | |
| "epoch": 0.91, | |
| "grad_norm": 2.5426604747772217, | |
| "learning_rate": 6.986519477762446e-05, | |
| "loss": 0.1241, | |
| "step": 2860 | |
| }, | |
| { | |
| "epoch": 0.91, | |
| "grad_norm": 2.6442790031433105, | |
| "learning_rate": 6.975904893323428e-05, | |
| "loss": 0.2026, | |
| "step": 2870 | |
| }, | |
| { | |
| "epoch": 0.92, | |
| "grad_norm": 0.07216634601354599, | |
| "learning_rate": 6.965290308884408e-05, | |
| "loss": 0.1619, | |
| "step": 2880 | |
| }, | |
| { | |
| "epoch": 0.92, | |
| "grad_norm": 1.6410995721817017, | |
| "learning_rate": 6.954675724445388e-05, | |
| "loss": 0.309, | |
| "step": 2890 | |
| }, | |
| { | |
| "epoch": 0.92, | |
| "grad_norm": 1.0634126663208008, | |
| "learning_rate": 6.944061140006369e-05, | |
| "loss": 0.2269, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 0.93, | |
| "grad_norm": 1.272518277168274, | |
| "learning_rate": 6.93344655556735e-05, | |
| "loss": 0.2748, | |
| "step": 2910 | |
| }, | |
| { | |
| "epoch": 0.93, | |
| "grad_norm": 8.030739784240723, | |
| "learning_rate": 6.922831971128331e-05, | |
| "loss": 0.2386, | |
| "step": 2920 | |
| }, | |
| { | |
| "epoch": 0.93, | |
| "grad_norm": 1.0459538698196411, | |
| "learning_rate": 6.912217386689311e-05, | |
| "loss": 0.2162, | |
| "step": 2930 | |
| }, | |
| { | |
| "epoch": 0.94, | |
| "grad_norm": 2.7766873836517334, | |
| "learning_rate": 6.901602802250292e-05, | |
| "loss": 0.18, | |
| "step": 2940 | |
| }, | |
| { | |
| "epoch": 0.94, | |
| "grad_norm": 1.345751166343689, | |
| "learning_rate": 6.890988217811273e-05, | |
| "loss": 0.1927, | |
| "step": 2950 | |
| }, | |
| { | |
| "epoch": 0.94, | |
| "grad_norm": 3.475550889968872, | |
| "learning_rate": 6.880373633372254e-05, | |
| "loss": 0.1593, | |
| "step": 2960 | |
| }, | |
| { | |
| "epoch": 0.95, | |
| "grad_norm": 4.3208088874816895, | |
| "learning_rate": 6.869759048933234e-05, | |
| "loss": 0.3782, | |
| "step": 2970 | |
| }, | |
| { | |
| "epoch": 0.95, | |
| "grad_norm": 0.5283639430999756, | |
| "learning_rate": 6.859144464494214e-05, | |
| "loss": 0.2065, | |
| "step": 2980 | |
| }, | |
| { | |
| "epoch": 0.95, | |
| "grad_norm": 0.3912002444267273, | |
| "learning_rate": 6.848529880055196e-05, | |
| "loss": 0.2094, | |
| "step": 2990 | |
| }, | |
| { | |
| "epoch": 0.95, | |
| "grad_norm": 5.560369968414307, | |
| "learning_rate": 6.837915295616178e-05, | |
| "loss": 0.2598, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.96, | |
| "grad_norm": 2.0859804153442383, | |
| "learning_rate": 6.827300711177158e-05, | |
| "loss": 0.2396, | |
| "step": 3010 | |
| }, | |
| { | |
| "epoch": 0.96, | |
| "grad_norm": 1.9198240041732788, | |
| "learning_rate": 6.816686126738139e-05, | |
| "loss": 0.326, | |
| "step": 3020 | |
| }, | |
| { | |
| "epoch": 0.96, | |
| "grad_norm": 2.559525728225708, | |
| "learning_rate": 6.806071542299119e-05, | |
| "loss": 0.2846, | |
| "step": 3030 | |
| }, | |
| { | |
| "epoch": 0.97, | |
| "grad_norm": 8.122730255126953, | |
| "learning_rate": 6.7954569578601e-05, | |
| "loss": 0.3404, | |
| "step": 3040 | |
| }, | |
| { | |
| "epoch": 0.97, | |
| "grad_norm": 1.4377597570419312, | |
| "learning_rate": 6.784842373421081e-05, | |
| "loss": 0.3534, | |
| "step": 3050 | |
| }, | |
| { | |
| "epoch": 0.97, | |
| "grad_norm": 1.3202710151672363, | |
| "learning_rate": 6.774227788982061e-05, | |
| "loss": 0.3151, | |
| "step": 3060 | |
| }, | |
| { | |
| "epoch": 0.98, | |
| "grad_norm": 1.2933627367019653, | |
| "learning_rate": 6.763613204543042e-05, | |
| "loss": 0.1983, | |
| "step": 3070 | |
| }, | |
| { | |
| "epoch": 0.98, | |
| "grad_norm": 0.8253432512283325, | |
| "learning_rate": 6.752998620104023e-05, | |
| "loss": 0.1989, | |
| "step": 3080 | |
| }, | |
| { | |
| "epoch": 0.98, | |
| "grad_norm": 1.008435606956482, | |
| "learning_rate": 6.742384035665004e-05, | |
| "loss": 0.2045, | |
| "step": 3090 | |
| }, | |
| { | |
| "epoch": 0.99, | |
| "grad_norm": 4.022599220275879, | |
| "learning_rate": 6.731769451225984e-05, | |
| "loss": 0.2166, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 0.99, | |
| "grad_norm": 0.5018757581710815, | |
| "learning_rate": 6.721154866786966e-05, | |
| "loss": 0.1841, | |
| "step": 3110 | |
| }, | |
| { | |
| "epoch": 0.99, | |
| "grad_norm": 1.1110012531280518, | |
| "learning_rate": 6.710540282347946e-05, | |
| "loss": 0.208, | |
| "step": 3120 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "grad_norm": 4.160871505737305, | |
| "learning_rate": 6.699925697908928e-05, | |
| "loss": 0.2853, | |
| "step": 3130 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "grad_norm": 3.1839327812194824, | |
| "learning_rate": 6.689311113469908e-05, | |
| "loss": 0.239, | |
| "step": 3140 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "grad_norm": 1.2867355346679688, | |
| "learning_rate": 6.678696529030889e-05, | |
| "loss": 0.1678, | |
| "step": 3150 | |
| }, | |
| { | |
| "epoch": 1.01, | |
| "grad_norm": 0.3853776454925537, | |
| "learning_rate": 6.668081944591869e-05, | |
| "loss": 0.1119, | |
| "step": 3160 | |
| }, | |
| { | |
| "epoch": 1.01, | |
| "grad_norm": 0.9403756856918335, | |
| "learning_rate": 6.657467360152851e-05, | |
| "loss": 0.1772, | |
| "step": 3170 | |
| }, | |
| { | |
| "epoch": 1.01, | |
| "grad_norm": 2.8056976795196533, | |
| "learning_rate": 6.646852775713831e-05, | |
| "loss": 0.1438, | |
| "step": 3180 | |
| }, | |
| { | |
| "epoch": 1.02, | |
| "grad_norm": 0.9233602285385132, | |
| "learning_rate": 6.636238191274812e-05, | |
| "loss": 0.2491, | |
| "step": 3190 | |
| }, | |
| { | |
| "epoch": 1.02, | |
| "grad_norm": 2.179743766784668, | |
| "learning_rate": 6.625623606835792e-05, | |
| "loss": 0.1493, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 1.02, | |
| "grad_norm": 1.8002713918685913, | |
| "learning_rate": 6.615009022396774e-05, | |
| "loss": 0.1557, | |
| "step": 3210 | |
| }, | |
| { | |
| "epoch": 1.02, | |
| "grad_norm": 1.0567578077316284, | |
| "learning_rate": 6.604394437957754e-05, | |
| "loss": 0.1573, | |
| "step": 3220 | |
| }, | |
| { | |
| "epoch": 1.03, | |
| "grad_norm": 1.7498853206634521, | |
| "learning_rate": 6.593779853518734e-05, | |
| "loss": 0.2639, | |
| "step": 3230 | |
| }, | |
| { | |
| "epoch": 1.03, | |
| "grad_norm": 0.14960238337516785, | |
| "learning_rate": 6.583165269079716e-05, | |
| "loss": 0.2314, | |
| "step": 3240 | |
| }, | |
| { | |
| "epoch": 1.03, | |
| "grad_norm": 0.858378529548645, | |
| "learning_rate": 6.572550684640697e-05, | |
| "loss": 0.1898, | |
| "step": 3250 | |
| }, | |
| { | |
| "epoch": 1.04, | |
| "grad_norm": 4.104907989501953, | |
| "learning_rate": 6.561936100201678e-05, | |
| "loss": 0.2381, | |
| "step": 3260 | |
| }, | |
| { | |
| "epoch": 1.04, | |
| "grad_norm": 0.1154847964644432, | |
| "learning_rate": 6.551321515762659e-05, | |
| "loss": 0.0987, | |
| "step": 3270 | |
| }, | |
| { | |
| "epoch": 1.04, | |
| "grad_norm": 1.8907705545425415, | |
| "learning_rate": 6.540706931323639e-05, | |
| "loss": 0.125, | |
| "step": 3280 | |
| }, | |
| { | |
| "epoch": 1.05, | |
| "grad_norm": 1.2750372886657715, | |
| "learning_rate": 6.53009234688462e-05, | |
| "loss": 0.234, | |
| "step": 3290 | |
| }, | |
| { | |
| "epoch": 1.05, | |
| "grad_norm": 1.584429144859314, | |
| "learning_rate": 6.519477762445601e-05, | |
| "loss": 0.1328, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 1.05, | |
| "grad_norm": 2.3900089263916016, | |
| "learning_rate": 6.508863178006581e-05, | |
| "loss": 0.2681, | |
| "step": 3310 | |
| }, | |
| { | |
| "epoch": 1.06, | |
| "grad_norm": 1.9859068393707275, | |
| "learning_rate": 6.498248593567562e-05, | |
| "loss": 0.4136, | |
| "step": 3320 | |
| }, | |
| { | |
| "epoch": 1.06, | |
| "grad_norm": 3.4652695655822754, | |
| "learning_rate": 6.487634009128542e-05, | |
| "loss": 0.2059, | |
| "step": 3330 | |
| }, | |
| { | |
| "epoch": 1.06, | |
| "grad_norm": 4.06072473526001, | |
| "learning_rate": 6.477019424689524e-05, | |
| "loss": 0.2378, | |
| "step": 3340 | |
| }, | |
| { | |
| "epoch": 1.07, | |
| "grad_norm": 1.2823538780212402, | |
| "learning_rate": 6.466404840250504e-05, | |
| "loss": 0.1772, | |
| "step": 3350 | |
| }, | |
| { | |
| "epoch": 1.07, | |
| "grad_norm": 0.545313835144043, | |
| "learning_rate": 6.455790255811486e-05, | |
| "loss": 0.1587, | |
| "step": 3360 | |
| }, | |
| { | |
| "epoch": 1.07, | |
| "grad_norm": 5.666371822357178, | |
| "learning_rate": 6.445175671372466e-05, | |
| "loss": 0.1486, | |
| "step": 3370 | |
| }, | |
| { | |
| "epoch": 1.08, | |
| "grad_norm": 0.3175773620605469, | |
| "learning_rate": 6.434561086933447e-05, | |
| "loss": 0.2295, | |
| "step": 3380 | |
| }, | |
| { | |
| "epoch": 1.08, | |
| "grad_norm": 3.88968563079834, | |
| "learning_rate": 6.423946502494428e-05, | |
| "loss": 0.16, | |
| "step": 3390 | |
| }, | |
| { | |
| "epoch": 1.08, | |
| "grad_norm": 2.4445409774780273, | |
| "learning_rate": 6.413331918055409e-05, | |
| "loss": 0.1766, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 1.09, | |
| "grad_norm": 0.5478050708770752, | |
| "learning_rate": 6.402717333616389e-05, | |
| "loss": 0.1299, | |
| "step": 3410 | |
| }, | |
| { | |
| "epoch": 1.09, | |
| "grad_norm": 4.029285907745361, | |
| "learning_rate": 6.393164207621272e-05, | |
| "loss": 0.3463, | |
| "step": 3420 | |
| }, | |
| { | |
| "epoch": 1.09, | |
| "grad_norm": 0.3899819552898407, | |
| "learning_rate": 6.382549623182253e-05, | |
| "loss": 0.1214, | |
| "step": 3430 | |
| }, | |
| { | |
| "epoch": 1.1, | |
| "grad_norm": 0.7180734276771545, | |
| "learning_rate": 6.371935038743233e-05, | |
| "loss": 0.2756, | |
| "step": 3440 | |
| }, | |
| { | |
| "epoch": 1.1, | |
| "grad_norm": 3.6423099040985107, | |
| "learning_rate": 6.361320454304213e-05, | |
| "loss": 0.2059, | |
| "step": 3450 | |
| }, | |
| { | |
| "epoch": 1.1, | |
| "grad_norm": 3.006516933441162, | |
| "learning_rate": 6.350705869865195e-05, | |
| "loss": 0.2151, | |
| "step": 3460 | |
| }, | |
| { | |
| "epoch": 1.1, | |
| "grad_norm": 2.1426503658294678, | |
| "learning_rate": 6.340091285426177e-05, | |
| "loss": 0.2644, | |
| "step": 3470 | |
| }, | |
| { | |
| "epoch": 1.11, | |
| "grad_norm": 1.4418883323669434, | |
| "learning_rate": 6.329476700987157e-05, | |
| "loss": 0.1675, | |
| "step": 3480 | |
| }, | |
| { | |
| "epoch": 1.11, | |
| "grad_norm": 1.2576738595962524, | |
| "learning_rate": 6.318862116548138e-05, | |
| "loss": 0.1612, | |
| "step": 3490 | |
| }, | |
| { | |
| "epoch": 1.11, | |
| "grad_norm": 3.26369309425354, | |
| "learning_rate": 6.308247532109118e-05, | |
| "loss": 0.2346, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 1.12, | |
| "grad_norm": 0.9214788675308228, | |
| "learning_rate": 6.2976329476701e-05, | |
| "loss": 0.1714, | |
| "step": 3510 | |
| }, | |
| { | |
| "epoch": 1.12, | |
| "grad_norm": 1.696925163269043, | |
| "learning_rate": 6.28701836323108e-05, | |
| "loss": 0.1306, | |
| "step": 3520 | |
| }, | |
| { | |
| "epoch": 1.12, | |
| "grad_norm": 1.1808693408966064, | |
| "learning_rate": 6.27640377879206e-05, | |
| "loss": 0.1135, | |
| "step": 3530 | |
| }, | |
| { | |
| "epoch": 1.13, | |
| "grad_norm": 4.710297107696533, | |
| "learning_rate": 6.265789194353041e-05, | |
| "loss": 0.158, | |
| "step": 3540 | |
| }, | |
| { | |
| "epoch": 1.13, | |
| "grad_norm": 0.5521005988121033, | |
| "learning_rate": 6.255174609914022e-05, | |
| "loss": 0.3224, | |
| "step": 3550 | |
| }, | |
| { | |
| "epoch": 1.13, | |
| "grad_norm": 2.172825336456299, | |
| "learning_rate": 6.244560025475003e-05, | |
| "loss": 0.0946, | |
| "step": 3560 | |
| }, | |
| { | |
| "epoch": 1.14, | |
| "grad_norm": 1.8690552711486816, | |
| "learning_rate": 6.233945441035983e-05, | |
| "loss": 0.1972, | |
| "step": 3570 | |
| }, | |
| { | |
| "epoch": 1.14, | |
| "grad_norm": 0.059970393776893616, | |
| "learning_rate": 6.223330856596965e-05, | |
| "loss": 0.0601, | |
| "step": 3580 | |
| }, | |
| { | |
| "epoch": 1.14, | |
| "grad_norm": 0.0773802176117897, | |
| "learning_rate": 6.212716272157945e-05, | |
| "loss": 0.2881, | |
| "step": 3590 | |
| }, | |
| { | |
| "epoch": 1.15, | |
| "grad_norm": 1.320061206817627, | |
| "learning_rate": 6.202101687718927e-05, | |
| "loss": 0.1966, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 1.15, | |
| "grad_norm": 2.4339261054992676, | |
| "learning_rate": 6.191487103279907e-05, | |
| "loss": 0.1808, | |
| "step": 3610 | |
| }, | |
| { | |
| "epoch": 1.15, | |
| "grad_norm": 5.3104729652404785, | |
| "learning_rate": 6.180872518840888e-05, | |
| "loss": 0.1737, | |
| "step": 3620 | |
| }, | |
| { | |
| "epoch": 1.16, | |
| "grad_norm": 3.9139719009399414, | |
| "learning_rate": 6.170257934401868e-05, | |
| "loss": 0.239, | |
| "step": 3630 | |
| }, | |
| { | |
| "epoch": 1.16, | |
| "grad_norm": 0.9480198621749878, | |
| "learning_rate": 6.15964334996285e-05, | |
| "loss": 0.1556, | |
| "step": 3640 | |
| }, | |
| { | |
| "epoch": 1.16, | |
| "grad_norm": 0.807107150554657, | |
| "learning_rate": 6.14902876552383e-05, | |
| "loss": 0.131, | |
| "step": 3650 | |
| }, | |
| { | |
| "epoch": 1.17, | |
| "grad_norm": 0.059983473271131516, | |
| "learning_rate": 6.13841418108481e-05, | |
| "loss": 0.1479, | |
| "step": 3660 | |
| }, | |
| { | |
| "epoch": 1.17, | |
| "grad_norm": 0.7000637650489807, | |
| "learning_rate": 6.127799596645791e-05, | |
| "loss": 0.0861, | |
| "step": 3670 | |
| }, | |
| { | |
| "epoch": 1.17, | |
| "grad_norm": 0.43273600935935974, | |
| "learning_rate": 6.117185012206771e-05, | |
| "loss": 0.1848, | |
| "step": 3680 | |
| }, | |
| { | |
| "epoch": 1.17, | |
| "grad_norm": 0.056298673152923584, | |
| "learning_rate": 6.106570427767753e-05, | |
| "loss": 0.1313, | |
| "step": 3690 | |
| }, | |
| { | |
| "epoch": 1.18, | |
| "grad_norm": 0.6714267134666443, | |
| "learning_rate": 6.095955843328735e-05, | |
| "loss": 0.2817, | |
| "step": 3700 | |
| }, | |
| { | |
| "epoch": 1.18, | |
| "grad_norm": 2.8052423000335693, | |
| "learning_rate": 6.085341258889715e-05, | |
| "loss": 0.2095, | |
| "step": 3710 | |
| }, | |
| { | |
| "epoch": 1.18, | |
| "grad_norm": 3.0490353107452393, | |
| "learning_rate": 6.074726674450696e-05, | |
| "loss": 0.2707, | |
| "step": 3720 | |
| }, | |
| { | |
| "epoch": 1.19, | |
| "grad_norm": 2.3823633193969727, | |
| "learning_rate": 6.0641120900116766e-05, | |
| "loss": 0.1918, | |
| "step": 3730 | |
| }, | |
| { | |
| "epoch": 1.19, | |
| "grad_norm": 5.9893293380737305, | |
| "learning_rate": 6.0534975055726576e-05, | |
| "loss": 0.1855, | |
| "step": 3740 | |
| }, | |
| { | |
| "epoch": 1.19, | |
| "grad_norm": 5.253934383392334, | |
| "learning_rate": 6.042882921133638e-05, | |
| "loss": 0.1286, | |
| "step": 3750 | |
| }, | |
| { | |
| "epoch": 1.2, | |
| "grad_norm": 3.3353893756866455, | |
| "learning_rate": 6.0322683366946183e-05, | |
| "loss": 0.1656, | |
| "step": 3760 | |
| }, | |
| { | |
| "epoch": 1.2, | |
| "grad_norm": 1.5391966104507446, | |
| "learning_rate": 6.0216537522555994e-05, | |
| "loss": 0.1783, | |
| "step": 3770 | |
| }, | |
| { | |
| "epoch": 1.2, | |
| "grad_norm": 3.3716678619384766, | |
| "learning_rate": 6.01103916781658e-05, | |
| "loss": 0.1025, | |
| "step": 3780 | |
| }, | |
| { | |
| "epoch": 1.21, | |
| "grad_norm": 0.8058392405509949, | |
| "learning_rate": 6.000424583377561e-05, | |
| "loss": 0.1224, | |
| "step": 3790 | |
| }, | |
| { | |
| "epoch": 1.21, | |
| "grad_norm": 1.5231162309646606, | |
| "learning_rate": 5.989809998938541e-05, | |
| "loss": 0.0579, | |
| "step": 3800 | |
| }, | |
| { | |
| "epoch": 1.21, | |
| "grad_norm": 3.7527573108673096, | |
| "learning_rate": 5.979195414499522e-05, | |
| "loss": 0.3109, | |
| "step": 3810 | |
| }, | |
| { | |
| "epoch": 1.22, | |
| "grad_norm": 1.884722113609314, | |
| "learning_rate": 5.968580830060504e-05, | |
| "loss": 0.2569, | |
| "step": 3820 | |
| }, | |
| { | |
| "epoch": 1.22, | |
| "grad_norm": 1.2949138879776, | |
| "learning_rate": 5.957966245621484e-05, | |
| "loss": 0.2067, | |
| "step": 3830 | |
| }, | |
| { | |
| "epoch": 1.22, | |
| "grad_norm": 1.9406439065933228, | |
| "learning_rate": 5.9473516611824654e-05, | |
| "loss": 0.1397, | |
| "step": 3840 | |
| }, | |
| { | |
| "epoch": 1.23, | |
| "grad_norm": 3.048089027404785, | |
| "learning_rate": 5.936737076743446e-05, | |
| "loss": 0.1903, | |
| "step": 3850 | |
| }, | |
| { | |
| "epoch": 1.23, | |
| "grad_norm": 2.7827141284942627, | |
| "learning_rate": 5.926122492304427e-05, | |
| "loss": 0.2375, | |
| "step": 3860 | |
| }, | |
| { | |
| "epoch": 1.23, | |
| "grad_norm": 0.30664700269699097, | |
| "learning_rate": 5.915507907865407e-05, | |
| "loss": 0.2605, | |
| "step": 3870 | |
| }, | |
| { | |
| "epoch": 1.24, | |
| "grad_norm": 5.038077354431152, | |
| "learning_rate": 5.904893323426388e-05, | |
| "loss": 0.2249, | |
| "step": 3880 | |
| }, | |
| { | |
| "epoch": 1.24, | |
| "grad_norm": 0.5563170313835144, | |
| "learning_rate": 5.8942787389873686e-05, | |
| "loss": 0.1407, | |
| "step": 3890 | |
| }, | |
| { | |
| "epoch": 1.24, | |
| "grad_norm": 3.5176491737365723, | |
| "learning_rate": 5.8836641545483496e-05, | |
| "loss": 0.1955, | |
| "step": 3900 | |
| }, | |
| { | |
| "epoch": 1.24, | |
| "grad_norm": 0.16444259881973267, | |
| "learning_rate": 5.87304957010933e-05, | |
| "loss": 0.2973, | |
| "step": 3910 | |
| }, | |
| { | |
| "epoch": 1.25, | |
| "grad_norm": 2.3163607120513916, | |
| "learning_rate": 5.862434985670311e-05, | |
| "loss": 0.1388, | |
| "step": 3920 | |
| }, | |
| { | |
| "epoch": 1.25, | |
| "grad_norm": 2.4921140670776367, | |
| "learning_rate": 5.8518204012312914e-05, | |
| "loss": 0.2844, | |
| "step": 3930 | |
| }, | |
| { | |
| "epoch": 1.25, | |
| "grad_norm": 6.664550304412842, | |
| "learning_rate": 5.841205816792273e-05, | |
| "loss": 0.5434, | |
| "step": 3940 | |
| }, | |
| { | |
| "epoch": 1.26, | |
| "grad_norm": 0.27615758776664734, | |
| "learning_rate": 5.830591232353254e-05, | |
| "loss": 0.2716, | |
| "step": 3950 | |
| }, | |
| { | |
| "epoch": 1.26, | |
| "grad_norm": 7.205143451690674, | |
| "learning_rate": 5.8199766479142345e-05, | |
| "loss": 0.1927, | |
| "step": 3960 | |
| }, | |
| { | |
| "epoch": 1.26, | |
| "grad_norm": 2.423842191696167, | |
| "learning_rate": 5.8093620634752156e-05, | |
| "loss": 0.2013, | |
| "step": 3970 | |
| }, | |
| { | |
| "epoch": 1.27, | |
| "grad_norm": 0.6563037037849426, | |
| "learning_rate": 5.798747479036196e-05, | |
| "loss": 0.2597, | |
| "step": 3980 | |
| }, | |
| { | |
| "epoch": 1.27, | |
| "grad_norm": 2.216214418411255, | |
| "learning_rate": 5.788132894597177e-05, | |
| "loss": 0.1484, | |
| "step": 3990 | |
| }, | |
| { | |
| "epoch": 1.27, | |
| "grad_norm": 0.21049724519252777, | |
| "learning_rate": 5.7775183101581574e-05, | |
| "loss": 0.1205, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 1.28, | |
| "grad_norm": 1.838711142539978, | |
| "learning_rate": 5.7669037257191384e-05, | |
| "loss": 0.1806, | |
| "step": 4010 | |
| }, | |
| { | |
| "epoch": 1.28, | |
| "grad_norm": 4.584275245666504, | |
| "learning_rate": 5.756289141280119e-05, | |
| "loss": 0.1459, | |
| "step": 4020 | |
| }, | |
| { | |
| "epoch": 1.28, | |
| "grad_norm": 3.7076704502105713, | |
| "learning_rate": 5.7456745568411e-05, | |
| "loss": 0.2119, | |
| "step": 4030 | |
| }, | |
| { | |
| "epoch": 1.29, | |
| "grad_norm": 4.600487232208252, | |
| "learning_rate": 5.73505997240208e-05, | |
| "loss": 0.1846, | |
| "step": 4040 | |
| }, | |
| { | |
| "epoch": 1.29, | |
| "grad_norm": 2.9479613304138184, | |
| "learning_rate": 5.724445387963061e-05, | |
| "loss": 0.1373, | |
| "step": 4050 | |
| }, | |
| { | |
| "epoch": 1.29, | |
| "grad_norm": 2.7824301719665527, | |
| "learning_rate": 5.7138308035240416e-05, | |
| "loss": 0.1573, | |
| "step": 4060 | |
| }, | |
| { | |
| "epoch": 1.3, | |
| "grad_norm": 1.3697668313980103, | |
| "learning_rate": 5.703216219085023e-05, | |
| "loss": 0.1067, | |
| "step": 4070 | |
| }, | |
| { | |
| "epoch": 1.3, | |
| "grad_norm": 4.134962558746338, | |
| "learning_rate": 5.6926016346460044e-05, | |
| "loss": 0.3154, | |
| "step": 4080 | |
| }, | |
| { | |
| "epoch": 1.3, | |
| "grad_norm": 1.986623764038086, | |
| "learning_rate": 5.681987050206985e-05, | |
| "loss": 0.162, | |
| "step": 4090 | |
| }, | |
| { | |
| "epoch": 1.31, | |
| "grad_norm": 1.7553232908248901, | |
| "learning_rate": 5.671372465767966e-05, | |
| "loss": 0.2197, | |
| "step": 4100 | |
| }, | |
| { | |
| "epoch": 1.31, | |
| "grad_norm": 1.666942834854126, | |
| "learning_rate": 5.660757881328946e-05, | |
| "loss": 0.2144, | |
| "step": 4110 | |
| }, | |
| { | |
| "epoch": 1.31, | |
| "grad_norm": 1.3620635271072388, | |
| "learning_rate": 5.650143296889927e-05, | |
| "loss": 0.2823, | |
| "step": 4120 | |
| }, | |
| { | |
| "epoch": 1.31, | |
| "grad_norm": 3.4056193828582764, | |
| "learning_rate": 5.6395287124509076e-05, | |
| "loss": 0.3223, | |
| "step": 4130 | |
| }, | |
| { | |
| "epoch": 1.32, | |
| "grad_norm": 0.8397992253303528, | |
| "learning_rate": 5.6289141280118886e-05, | |
| "loss": 0.1297, | |
| "step": 4140 | |
| }, | |
| { | |
| "epoch": 1.32, | |
| "grad_norm": 0.09627294540405273, | |
| "learning_rate": 5.618299543572869e-05, | |
| "loss": 0.1154, | |
| "step": 4150 | |
| }, | |
| { | |
| "epoch": 1.32, | |
| "grad_norm": 2.1529462337493896, | |
| "learning_rate": 5.60768495913385e-05, | |
| "loss": 0.1903, | |
| "step": 4160 | |
| }, | |
| { | |
| "epoch": 1.33, | |
| "grad_norm": 0.42282378673553467, | |
| "learning_rate": 5.5970703746948304e-05, | |
| "loss": 0.0992, | |
| "step": 4170 | |
| }, | |
| { | |
| "epoch": 1.33, | |
| "grad_norm": 0.34097906947135925, | |
| "learning_rate": 5.5864557902558115e-05, | |
| "loss": 0.2193, | |
| "step": 4180 | |
| }, | |
| { | |
| "epoch": 1.33, | |
| "grad_norm": 0.11647669225931168, | |
| "learning_rate": 5.575841205816793e-05, | |
| "loss": 0.1511, | |
| "step": 4190 | |
| }, | |
| { | |
| "epoch": 1.34, | |
| "grad_norm": 7.489476680755615, | |
| "learning_rate": 5.5652266213777736e-05, | |
| "loss": 0.182, | |
| "step": 4200 | |
| }, | |
| { | |
| "epoch": 1.34, | |
| "grad_norm": 0.0627538189291954, | |
| "learning_rate": 5.5546120369387546e-05, | |
| "loss": 0.2056, | |
| "step": 4210 | |
| }, | |
| { | |
| "epoch": 1.34, | |
| "grad_norm": 1.6038990020751953, | |
| "learning_rate": 5.543997452499735e-05, | |
| "loss": 0.317, | |
| "step": 4220 | |
| }, | |
| { | |
| "epoch": 1.35, | |
| "grad_norm": 2.0296130180358887, | |
| "learning_rate": 5.533382868060716e-05, | |
| "loss": 0.221, | |
| "step": 4230 | |
| }, | |
| { | |
| "epoch": 1.35, | |
| "grad_norm": 3.08427357673645, | |
| "learning_rate": 5.5227682836216964e-05, | |
| "loss": 0.309, | |
| "step": 4240 | |
| }, | |
| { | |
| "epoch": 1.35, | |
| "grad_norm": 6.700926303863525, | |
| "learning_rate": 5.5121536991826774e-05, | |
| "loss": 0.3862, | |
| "step": 4250 | |
| }, | |
| { | |
| "epoch": 1.36, | |
| "grad_norm": 3.3283987045288086, | |
| "learning_rate": 5.501539114743658e-05, | |
| "loss": 0.1449, | |
| "step": 4260 | |
| }, | |
| { | |
| "epoch": 1.36, | |
| "grad_norm": 2.7718186378479004, | |
| "learning_rate": 5.490924530304639e-05, | |
| "loss": 0.1237, | |
| "step": 4270 | |
| }, | |
| { | |
| "epoch": 1.36, | |
| "grad_norm": 1.7264149188995361, | |
| "learning_rate": 5.480309945865619e-05, | |
| "loss": 0.0537, | |
| "step": 4280 | |
| }, | |
| { | |
| "epoch": 1.37, | |
| "grad_norm": 2.8292267322540283, | |
| "learning_rate": 5.4696953614266e-05, | |
| "loss": 0.1139, | |
| "step": 4290 | |
| }, | |
| { | |
| "epoch": 1.37, | |
| "grad_norm": 2.6377663612365723, | |
| "learning_rate": 5.4590807769875806e-05, | |
| "loss": 0.1632, | |
| "step": 4300 | |
| }, | |
| { | |
| "epoch": 1.37, | |
| "grad_norm": 0.1827862560749054, | |
| "learning_rate": 5.4484661925485624e-05, | |
| "loss": 0.1809, | |
| "step": 4310 | |
| }, | |
| { | |
| "epoch": 1.38, | |
| "grad_norm": 5.187005996704102, | |
| "learning_rate": 5.4378516081095434e-05, | |
| "loss": 0.1735, | |
| "step": 4320 | |
| }, | |
| { | |
| "epoch": 1.38, | |
| "grad_norm": 2.064953327178955, | |
| "learning_rate": 5.427237023670524e-05, | |
| "loss": 0.3226, | |
| "step": 4330 | |
| }, | |
| { | |
| "epoch": 1.38, | |
| "grad_norm": 0.03769757226109505, | |
| "learning_rate": 5.416622439231505e-05, | |
| "loss": 0.1563, | |
| "step": 4340 | |
| }, | |
| { | |
| "epoch": 1.38, | |
| "grad_norm": 5.220246315002441, | |
| "learning_rate": 5.406007854792485e-05, | |
| "loss": 0.2403, | |
| "step": 4350 | |
| }, | |
| { | |
| "epoch": 1.39, | |
| "grad_norm": 0.1891440451145172, | |
| "learning_rate": 5.395393270353466e-05, | |
| "loss": 0.1741, | |
| "step": 4360 | |
| }, | |
| { | |
| "epoch": 1.39, | |
| "grad_norm": 5.661322116851807, | |
| "learning_rate": 5.3847786859144466e-05, | |
| "loss": 0.1514, | |
| "step": 4370 | |
| }, | |
| { | |
| "epoch": 1.39, | |
| "grad_norm": 8.325531005859375, | |
| "learning_rate": 5.3741641014754277e-05, | |
| "loss": 0.1954, | |
| "step": 4380 | |
| }, | |
| { | |
| "epoch": 1.4, | |
| "grad_norm": 3.1849327087402344, | |
| "learning_rate": 5.363549517036408e-05, | |
| "loss": 0.2667, | |
| "step": 4390 | |
| }, | |
| { | |
| "epoch": 1.4, | |
| "grad_norm": 4.426061153411865, | |
| "learning_rate": 5.352934932597389e-05, | |
| "loss": 0.1621, | |
| "step": 4400 | |
| }, | |
| { | |
| "epoch": 1.4, | |
| "grad_norm": 0.08511369675397873, | |
| "learning_rate": 5.3423203481583694e-05, | |
| "loss": 0.2384, | |
| "step": 4410 | |
| }, | |
| { | |
| "epoch": 1.41, | |
| "grad_norm": 2.6035985946655273, | |
| "learning_rate": 5.3317057637193505e-05, | |
| "loss": 0.2029, | |
| "step": 4420 | |
| }, | |
| { | |
| "epoch": 1.41, | |
| "grad_norm": 3.637746810913086, | |
| "learning_rate": 5.321091179280332e-05, | |
| "loss": 0.2054, | |
| "step": 4430 | |
| }, | |
| { | |
| "epoch": 1.41, | |
| "grad_norm": 2.6887290477752686, | |
| "learning_rate": 5.3104765948413126e-05, | |
| "loss": 0.194, | |
| "step": 4440 | |
| }, | |
| { | |
| "epoch": 1.42, | |
| "grad_norm": 0.5362237691879272, | |
| "learning_rate": 5.2998620104022936e-05, | |
| "loss": 0.1243, | |
| "step": 4450 | |
| }, | |
| { | |
| "epoch": 1.42, | |
| "grad_norm": 6.602662086486816, | |
| "learning_rate": 5.289247425963274e-05, | |
| "loss": 0.1005, | |
| "step": 4460 | |
| }, | |
| { | |
| "epoch": 1.42, | |
| "grad_norm": 0.16585449874401093, | |
| "learning_rate": 5.278632841524255e-05, | |
| "loss": 0.116, | |
| "step": 4470 | |
| }, | |
| { | |
| "epoch": 1.43, | |
| "grad_norm": 3.062458038330078, | |
| "learning_rate": 5.2690797155291374e-05, | |
| "loss": 0.2236, | |
| "step": 4480 | |
| }, | |
| { | |
| "epoch": 1.43, | |
| "grad_norm": 3.1578338146209717, | |
| "learning_rate": 5.258465131090118e-05, | |
| "loss": 0.1248, | |
| "step": 4490 | |
| }, | |
| { | |
| "epoch": 1.43, | |
| "grad_norm": 6.487752914428711, | |
| "learning_rate": 5.247850546651099e-05, | |
| "loss": 0.2268, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 1.44, | |
| "grad_norm": 4.561209678649902, | |
| "learning_rate": 5.237235962212079e-05, | |
| "loss": 0.3183, | |
| "step": 4510 | |
| }, | |
| { | |
| "epoch": 1.44, | |
| "grad_norm": 1.6614716053009033, | |
| "learning_rate": 5.22662137777306e-05, | |
| "loss": 0.2555, | |
| "step": 4520 | |
| }, | |
| { | |
| "epoch": 1.44, | |
| "grad_norm": 2.4814791679382324, | |
| "learning_rate": 5.216006793334042e-05, | |
| "loss": 0.1524, | |
| "step": 4530 | |
| }, | |
| { | |
| "epoch": 1.45, | |
| "grad_norm": 0.17691956460475922, | |
| "learning_rate": 5.205392208895022e-05, | |
| "loss": 0.1934, | |
| "step": 4540 | |
| }, | |
| { | |
| "epoch": 1.45, | |
| "grad_norm": 5.082562446594238, | |
| "learning_rate": 5.1947776244560033e-05, | |
| "loss": 0.4279, | |
| "step": 4550 | |
| }, | |
| { | |
| "epoch": 1.45, | |
| "grad_norm": 3.106387138366699, | |
| "learning_rate": 5.184163040016984e-05, | |
| "loss": 0.1194, | |
| "step": 4560 | |
| }, | |
| { | |
| "epoch": 1.45, | |
| "grad_norm": 7.02073335647583, | |
| "learning_rate": 5.173548455577965e-05, | |
| "loss": 0.1109, | |
| "step": 4570 | |
| }, | |
| { | |
| "epoch": 1.46, | |
| "grad_norm": 0.2526942193508148, | |
| "learning_rate": 5.162933871138945e-05, | |
| "loss": 0.1913, | |
| "step": 4580 | |
| }, | |
| { | |
| "epoch": 1.46, | |
| "grad_norm": 4.575504302978516, | |
| "learning_rate": 5.152319286699926e-05, | |
| "loss": 0.2151, | |
| "step": 4590 | |
| }, | |
| { | |
| "epoch": 1.46, | |
| "grad_norm": 2.3890509605407715, | |
| "learning_rate": 5.1417047022609066e-05, | |
| "loss": 0.2336, | |
| "step": 4600 | |
| }, | |
| { | |
| "epoch": 1.47, | |
| "grad_norm": 0.8267619013786316, | |
| "learning_rate": 5.1310901178218876e-05, | |
| "loss": 0.0856, | |
| "step": 4610 | |
| }, | |
| { | |
| "epoch": 1.47, | |
| "grad_norm": 4.056538105010986, | |
| "learning_rate": 5.120475533382868e-05, | |
| "loss": 0.1947, | |
| "step": 4620 | |
| }, | |
| { | |
| "epoch": 1.47, | |
| "grad_norm": 6.964923858642578, | |
| "learning_rate": 5.109860948943849e-05, | |
| "loss": 0.1195, | |
| "step": 4630 | |
| }, | |
| { | |
| "epoch": 1.48, | |
| "grad_norm": 2.813004970550537, | |
| "learning_rate": 5.100307822948732e-05, | |
| "loss": 0.1225, | |
| "step": 4640 | |
| }, | |
| { | |
| "epoch": 1.48, | |
| "grad_norm": 2.654339075088501, | |
| "learning_rate": 5.089693238509713e-05, | |
| "loss": 0.1006, | |
| "step": 4650 | |
| }, | |
| { | |
| "epoch": 1.48, | |
| "grad_norm": 6.5991644859313965, | |
| "learning_rate": 5.0790786540706934e-05, | |
| "loss": 0.2646, | |
| "step": 4660 | |
| }, | |
| { | |
| "epoch": 1.49, | |
| "grad_norm": 5.099368572235107, | |
| "learning_rate": 5.0684640696316745e-05, | |
| "loss": 0.2748, | |
| "step": 4670 | |
| }, | |
| { | |
| "epoch": 1.49, | |
| "grad_norm": 5.0444655418396, | |
| "learning_rate": 5.057849485192655e-05, | |
| "loss": 0.2295, | |
| "step": 4680 | |
| }, | |
| { | |
| "epoch": 1.49, | |
| "grad_norm": 0.07431354373693466, | |
| "learning_rate": 5.047234900753636e-05, | |
| "loss": 0.1348, | |
| "step": 4690 | |
| }, | |
| { | |
| "epoch": 1.5, | |
| "grad_norm": 0.1366661787033081, | |
| "learning_rate": 5.036620316314616e-05, | |
| "loss": 0.1164, | |
| "step": 4700 | |
| }, | |
| { | |
| "epoch": 1.5, | |
| "grad_norm": 4.550073146820068, | |
| "learning_rate": 5.026005731875597e-05, | |
| "loss": 0.2377, | |
| "step": 4710 | |
| }, | |
| { | |
| "epoch": 1.5, | |
| "grad_norm": 0.12663549184799194, | |
| "learning_rate": 5.015391147436578e-05, | |
| "loss": 0.0871, | |
| "step": 4720 | |
| }, | |
| { | |
| "epoch": 1.51, | |
| "grad_norm": 5.191462993621826, | |
| "learning_rate": 5.004776562997559e-05, | |
| "loss": 0.2778, | |
| "step": 4730 | |
| }, | |
| { | |
| "epoch": 1.51, | |
| "grad_norm": 2.7582337856292725, | |
| "learning_rate": 4.99416197855854e-05, | |
| "loss": 0.203, | |
| "step": 4740 | |
| }, | |
| { | |
| "epoch": 1.51, | |
| "grad_norm": 7.114481449127197, | |
| "learning_rate": 4.98354739411952e-05, | |
| "loss": 0.1426, | |
| "step": 4750 | |
| }, | |
| { | |
| "epoch": 1.52, | |
| "grad_norm": 0.41717416048049927, | |
| "learning_rate": 4.972932809680501e-05, | |
| "loss": 0.2009, | |
| "step": 4760 | |
| }, | |
| { | |
| "epoch": 1.52, | |
| "grad_norm": 1.8175145387649536, | |
| "learning_rate": 4.9623182252414816e-05, | |
| "loss": 0.1152, | |
| "step": 4770 | |
| }, | |
| { | |
| "epoch": 1.52, | |
| "grad_norm": 3.585702419281006, | |
| "learning_rate": 4.951703640802463e-05, | |
| "loss": 0.1615, | |
| "step": 4780 | |
| }, | |
| { | |
| "epoch": 1.52, | |
| "grad_norm": 0.385105699300766, | |
| "learning_rate": 4.9410890563634437e-05, | |
| "loss": 0.1569, | |
| "step": 4790 | |
| }, | |
| { | |
| "epoch": 1.53, | |
| "grad_norm": 2.8163392543792725, | |
| "learning_rate": 4.930474471924425e-05, | |
| "loss": 0.0942, | |
| "step": 4800 | |
| }, | |
| { | |
| "epoch": 1.53, | |
| "grad_norm": 5.181662082672119, | |
| "learning_rate": 4.919859887485405e-05, | |
| "loss": 0.2076, | |
| "step": 4810 | |
| }, | |
| { | |
| "epoch": 1.53, | |
| "grad_norm": 0.15229104459285736, | |
| "learning_rate": 4.909245303046386e-05, | |
| "loss": 0.2249, | |
| "step": 4820 | |
| }, | |
| { | |
| "epoch": 1.54, | |
| "grad_norm": 3.2373440265655518, | |
| "learning_rate": 4.8986307186073665e-05, | |
| "loss": 0.5439, | |
| "step": 4830 | |
| }, | |
| { | |
| "epoch": 1.54, | |
| "grad_norm": 1.7857202291488647, | |
| "learning_rate": 4.8880161341683475e-05, | |
| "loss": 0.1806, | |
| "step": 4840 | |
| }, | |
| { | |
| "epoch": 1.54, | |
| "grad_norm": 1.1035951375961304, | |
| "learning_rate": 4.8774015497293286e-05, | |
| "loss": 0.1309, | |
| "step": 4850 | |
| }, | |
| { | |
| "epoch": 1.55, | |
| "grad_norm": 7.660123825073242, | |
| "learning_rate": 4.866786965290309e-05, | |
| "loss": 0.1587, | |
| "step": 4860 | |
| }, | |
| { | |
| "epoch": 1.55, | |
| "grad_norm": 0.20227286219596863, | |
| "learning_rate": 4.85617238085129e-05, | |
| "loss": 0.3051, | |
| "step": 4870 | |
| }, | |
| { | |
| "epoch": 1.55, | |
| "grad_norm": 6.558931827545166, | |
| "learning_rate": 4.8455577964122704e-05, | |
| "loss": 0.2137, | |
| "step": 4880 | |
| }, | |
| { | |
| "epoch": 1.56, | |
| "grad_norm": 2.683018922805786, | |
| "learning_rate": 4.8349432119732514e-05, | |
| "loss": 0.1528, | |
| "step": 4890 | |
| }, | |
| { | |
| "epoch": 1.56, | |
| "grad_norm": 1.2843786478042603, | |
| "learning_rate": 4.8243286275342325e-05, | |
| "loss": 0.1525, | |
| "step": 4900 | |
| }, | |
| { | |
| "epoch": 1.56, | |
| "grad_norm": 0.9824750423431396, | |
| "learning_rate": 4.8137140430952135e-05, | |
| "loss": 0.1682, | |
| "step": 4910 | |
| }, | |
| { | |
| "epoch": 1.57, | |
| "grad_norm": 1.0165822505950928, | |
| "learning_rate": 4.803099458656194e-05, | |
| "loss": 0.2397, | |
| "step": 4920 | |
| }, | |
| { | |
| "epoch": 1.57, | |
| "grad_norm": 2.0921578407287598, | |
| "learning_rate": 4.792484874217175e-05, | |
| "loss": 0.2342, | |
| "step": 4930 | |
| }, | |
| { | |
| "epoch": 1.57, | |
| "grad_norm": 2.5232343673706055, | |
| "learning_rate": 4.781870289778155e-05, | |
| "loss": 0.2216, | |
| "step": 4940 | |
| }, | |
| { | |
| "epoch": 1.58, | |
| "grad_norm": 5.7156782150268555, | |
| "learning_rate": 4.7712557053391363e-05, | |
| "loss": 0.2342, | |
| "step": 4950 | |
| }, | |
| { | |
| "epoch": 1.58, | |
| "grad_norm": 3.128016233444214, | |
| "learning_rate": 4.760641120900117e-05, | |
| "loss": 0.1759, | |
| "step": 4960 | |
| }, | |
| { | |
| "epoch": 1.58, | |
| "grad_norm": 2.2040598392486572, | |
| "learning_rate": 4.750026536461098e-05, | |
| "loss": 0.1414, | |
| "step": 4970 | |
| }, | |
| { | |
| "epoch": 1.59, | |
| "grad_norm": 2.1795644760131836, | |
| "learning_rate": 4.739411952022079e-05, | |
| "loss": 0.1648, | |
| "step": 4980 | |
| }, | |
| { | |
| "epoch": 1.59, | |
| "grad_norm": 5.399777412414551, | |
| "learning_rate": 4.728797367583059e-05, | |
| "loss": 0.1344, | |
| "step": 4990 | |
| }, | |
| { | |
| "epoch": 1.59, | |
| "grad_norm": 0.06098851189017296, | |
| "learning_rate": 4.71818278314404e-05, | |
| "loss": 0.1188, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 1.59, | |
| "grad_norm": 3.174159049987793, | |
| "learning_rate": 4.7075681987050206e-05, | |
| "loss": 0.3419, | |
| "step": 5010 | |
| }, | |
| { | |
| "epoch": 1.6, | |
| "grad_norm": 4.566168308258057, | |
| "learning_rate": 4.6969536142660016e-05, | |
| "loss": 0.2582, | |
| "step": 5020 | |
| }, | |
| { | |
| "epoch": 1.6, | |
| "grad_norm": 0.5227226614952087, | |
| "learning_rate": 4.686339029826983e-05, | |
| "loss": 0.1691, | |
| "step": 5030 | |
| }, | |
| { | |
| "epoch": 1.6, | |
| "grad_norm": 5.8460869789123535, | |
| "learning_rate": 4.675724445387963e-05, | |
| "loss": 0.1399, | |
| "step": 5040 | |
| }, | |
| { | |
| "epoch": 1.61, | |
| "grad_norm": 2.2399487495422363, | |
| "learning_rate": 4.665109860948944e-05, | |
| "loss": 0.1549, | |
| "step": 5050 | |
| }, | |
| { | |
| "epoch": 1.61, | |
| "grad_norm": 2.9508166313171387, | |
| "learning_rate": 4.6544952765099245e-05, | |
| "loss": 0.1665, | |
| "step": 5060 | |
| }, | |
| { | |
| "epoch": 1.61, | |
| "grad_norm": 2.5230746269226074, | |
| "learning_rate": 4.6438806920709055e-05, | |
| "loss": 0.2108, | |
| "step": 5070 | |
| }, | |
| { | |
| "epoch": 1.62, | |
| "grad_norm": 0.5516650080680847, | |
| "learning_rate": 4.633266107631886e-05, | |
| "loss": 0.2275, | |
| "step": 5080 | |
| }, | |
| { | |
| "epoch": 1.62, | |
| "grad_norm": 8.398303985595703, | |
| "learning_rate": 4.6226515231928676e-05, | |
| "loss": 0.1801, | |
| "step": 5090 | |
| }, | |
| { | |
| "epoch": 1.62, | |
| "grad_norm": 0.2512928247451782, | |
| "learning_rate": 4.612036938753848e-05, | |
| "loss": 0.2654, | |
| "step": 5100 | |
| }, | |
| { | |
| "epoch": 1.63, | |
| "grad_norm": 5.312344551086426, | |
| "learning_rate": 4.601422354314829e-05, | |
| "loss": 0.2992, | |
| "step": 5110 | |
| }, | |
| { | |
| "epoch": 1.63, | |
| "grad_norm": 1.728023648262024, | |
| "learning_rate": 4.5908077698758094e-05, | |
| "loss": 0.1638, | |
| "step": 5120 | |
| }, | |
| { | |
| "epoch": 1.63, | |
| "grad_norm": 1.6222649812698364, | |
| "learning_rate": 4.5801931854367904e-05, | |
| "loss": 0.2216, | |
| "step": 5130 | |
| }, | |
| { | |
| "epoch": 1.64, | |
| "grad_norm": 0.5581383109092712, | |
| "learning_rate": 4.569578600997771e-05, | |
| "loss": 0.2467, | |
| "step": 5140 | |
| }, | |
| { | |
| "epoch": 1.64, | |
| "grad_norm": 3.051811456680298, | |
| "learning_rate": 4.558964016558752e-05, | |
| "loss": 0.1486, | |
| "step": 5150 | |
| }, | |
| { | |
| "epoch": 1.64, | |
| "grad_norm": 0.6013765931129456, | |
| "learning_rate": 4.548349432119733e-05, | |
| "loss": 0.123, | |
| "step": 5160 | |
| }, | |
| { | |
| "epoch": 1.65, | |
| "grad_norm": 3.8984789848327637, | |
| "learning_rate": 4.537734847680713e-05, | |
| "loss": 0.3698, | |
| "step": 5170 | |
| }, | |
| { | |
| "epoch": 1.65, | |
| "grad_norm": 1.3346749544143677, | |
| "learning_rate": 4.527120263241694e-05, | |
| "loss": 0.1814, | |
| "step": 5180 | |
| }, | |
| { | |
| "epoch": 1.65, | |
| "grad_norm": 11.491423606872559, | |
| "learning_rate": 4.516505678802675e-05, | |
| "loss": 0.1745, | |
| "step": 5190 | |
| }, | |
| { | |
| "epoch": 1.66, | |
| "grad_norm": 2.358656883239746, | |
| "learning_rate": 4.505891094363656e-05, | |
| "loss": 0.2734, | |
| "step": 5200 | |
| }, | |
| { | |
| "epoch": 1.66, | |
| "grad_norm": 3.3352041244506836, | |
| "learning_rate": 4.495276509924637e-05, | |
| "loss": 0.2054, | |
| "step": 5210 | |
| }, | |
| { | |
| "epoch": 1.66, | |
| "grad_norm": 0.052441373467445374, | |
| "learning_rate": 4.484661925485618e-05, | |
| "loss": 0.1389, | |
| "step": 5220 | |
| }, | |
| { | |
| "epoch": 1.66, | |
| "grad_norm": 0.20047003030776978, | |
| "learning_rate": 4.474047341046598e-05, | |
| "loss": 0.1197, | |
| "step": 5230 | |
| }, | |
| { | |
| "epoch": 1.67, | |
| "grad_norm": 1.4837030172348022, | |
| "learning_rate": 4.463432756607579e-05, | |
| "loss": 0.2446, | |
| "step": 5240 | |
| }, | |
| { | |
| "epoch": 1.67, | |
| "grad_norm": 0.3104861378669739, | |
| "learning_rate": 4.4528181721685596e-05, | |
| "loss": 0.1842, | |
| "step": 5250 | |
| }, | |
| { | |
| "epoch": 1.67, | |
| "grad_norm": 7.954286098480225, | |
| "learning_rate": 4.442203587729541e-05, | |
| "loss": 0.1221, | |
| "step": 5260 | |
| }, | |
| { | |
| "epoch": 1.68, | |
| "grad_norm": 0.03400198742747307, | |
| "learning_rate": 4.431589003290522e-05, | |
| "loss": 0.1513, | |
| "step": 5270 | |
| }, | |
| { | |
| "epoch": 1.68, | |
| "grad_norm": 0.08371475338935852, | |
| "learning_rate": 4.420974418851502e-05, | |
| "loss": 0.2098, | |
| "step": 5280 | |
| }, | |
| { | |
| "epoch": 1.68, | |
| "grad_norm": 1.2470760345458984, | |
| "learning_rate": 4.410359834412483e-05, | |
| "loss": 0.117, | |
| "step": 5290 | |
| }, | |
| { | |
| "epoch": 1.69, | |
| "grad_norm": 1.5426656007766724, | |
| "learning_rate": 4.3997452499734635e-05, | |
| "loss": 0.1826, | |
| "step": 5300 | |
| }, | |
| { | |
| "epoch": 1.69, | |
| "grad_norm": 3.978109121322632, | |
| "learning_rate": 4.3891306655344445e-05, | |
| "loss": 0.1103, | |
| "step": 5310 | |
| }, | |
| { | |
| "epoch": 1.69, | |
| "grad_norm": 1.6321693658828735, | |
| "learning_rate": 4.378516081095425e-05, | |
| "loss": 0.151, | |
| "step": 5320 | |
| }, | |
| { | |
| "epoch": 1.7, | |
| "grad_norm": 2.555723190307617, | |
| "learning_rate": 4.3679014966564066e-05, | |
| "loss": 0.1786, | |
| "step": 5330 | |
| }, | |
| { | |
| "epoch": 1.7, | |
| "grad_norm": 0.2461155354976654, | |
| "learning_rate": 4.357286912217387e-05, | |
| "loss": 0.1914, | |
| "step": 5340 | |
| }, | |
| { | |
| "epoch": 1.7, | |
| "grad_norm": 0.41670894622802734, | |
| "learning_rate": 4.346672327778368e-05, | |
| "loss": 0.2582, | |
| "step": 5350 | |
| }, | |
| { | |
| "epoch": 1.71, | |
| "grad_norm": 4.785902976989746, | |
| "learning_rate": 4.3360577433393484e-05, | |
| "loss": 0.0911, | |
| "step": 5360 | |
| }, | |
| { | |
| "epoch": 1.71, | |
| "grad_norm": 4.179080963134766, | |
| "learning_rate": 4.3254431589003295e-05, | |
| "loss": 0.2264, | |
| "step": 5370 | |
| }, | |
| { | |
| "epoch": 1.71, | |
| "grad_norm": 0.9344226717948914, | |
| "learning_rate": 4.31482857446131e-05, | |
| "loss": 0.2003, | |
| "step": 5380 | |
| }, | |
| { | |
| "epoch": 1.72, | |
| "grad_norm": 0.3643859624862671, | |
| "learning_rate": 4.304213990022291e-05, | |
| "loss": 0.1, | |
| "step": 5390 | |
| }, | |
| { | |
| "epoch": 1.72, | |
| "grad_norm": 2.3688154220581055, | |
| "learning_rate": 4.293599405583272e-05, | |
| "loss": 0.2461, | |
| "step": 5400 | |
| }, | |
| { | |
| "epoch": 1.72, | |
| "grad_norm": 4.223112106323242, | |
| "learning_rate": 4.282984821144252e-05, | |
| "loss": 0.1316, | |
| "step": 5410 | |
| }, | |
| { | |
| "epoch": 1.73, | |
| "grad_norm": 1.52751886844635, | |
| "learning_rate": 4.2723702367052333e-05, | |
| "loss": 0.162, | |
| "step": 5420 | |
| }, | |
| { | |
| "epoch": 1.73, | |
| "grad_norm": 0.06534834951162338, | |
| "learning_rate": 4.261755652266214e-05, | |
| "loss": 0.1787, | |
| "step": 5430 | |
| }, | |
| { | |
| "epoch": 1.73, | |
| "grad_norm": 0.0435919463634491, | |
| "learning_rate": 4.251141067827195e-05, | |
| "loss": 0.2196, | |
| "step": 5440 | |
| }, | |
| { | |
| "epoch": 1.73, | |
| "grad_norm": 1.0877362489700317, | |
| "learning_rate": 4.240526483388176e-05, | |
| "loss": 0.2829, | |
| "step": 5450 | |
| }, | |
| { | |
| "epoch": 1.74, | |
| "grad_norm": 1.7220368385314941, | |
| "learning_rate": 4.229911898949156e-05, | |
| "loss": 0.211, | |
| "step": 5460 | |
| }, | |
| { | |
| "epoch": 1.74, | |
| "grad_norm": 1.6200969219207764, | |
| "learning_rate": 4.219297314510137e-05, | |
| "loss": 0.2046, | |
| "step": 5470 | |
| }, | |
| { | |
| "epoch": 1.74, | |
| "grad_norm": 2.376384735107422, | |
| "learning_rate": 4.2086827300711176e-05, | |
| "loss": 0.2518, | |
| "step": 5480 | |
| }, | |
| { | |
| "epoch": 1.75, | |
| "grad_norm": 1.6646453142166138, | |
| "learning_rate": 4.1980681456320986e-05, | |
| "loss": 0.1542, | |
| "step": 5490 | |
| }, | |
| { | |
| "epoch": 1.75, | |
| "grad_norm": 0.580792248249054, | |
| "learning_rate": 4.187453561193079e-05, | |
| "loss": 0.1503, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 1.75, | |
| "grad_norm": 2.325477123260498, | |
| "learning_rate": 4.176838976754061e-05, | |
| "loss": 0.1867, | |
| "step": 5510 | |
| }, | |
| { | |
| "epoch": 1.76, | |
| "grad_norm": 3.004499673843384, | |
| "learning_rate": 4.166224392315041e-05, | |
| "loss": 0.1816, | |
| "step": 5520 | |
| }, | |
| { | |
| "epoch": 1.76, | |
| "grad_norm": 1.7592769861221313, | |
| "learning_rate": 4.155609807876022e-05, | |
| "loss": 0.2155, | |
| "step": 5530 | |
| }, | |
| { | |
| "epoch": 1.76, | |
| "grad_norm": 0.4255143105983734, | |
| "learning_rate": 4.1449952234370025e-05, | |
| "loss": 0.2298, | |
| "step": 5540 | |
| }, | |
| { | |
| "epoch": 1.77, | |
| "grad_norm": 4.217332363128662, | |
| "learning_rate": 4.1343806389979836e-05, | |
| "loss": 0.1263, | |
| "step": 5550 | |
| }, | |
| { | |
| "epoch": 1.77, | |
| "grad_norm": 1.6670517921447754, | |
| "learning_rate": 4.123766054558964e-05, | |
| "loss": 0.1993, | |
| "step": 5560 | |
| }, | |
| { | |
| "epoch": 1.77, | |
| "grad_norm": 0.2432798445224762, | |
| "learning_rate": 4.113151470119945e-05, | |
| "loss": 0.1992, | |
| "step": 5570 | |
| }, | |
| { | |
| "epoch": 1.78, | |
| "grad_norm": 5.0905070304870605, | |
| "learning_rate": 4.102536885680926e-05, | |
| "loss": 0.1381, | |
| "step": 5580 | |
| }, | |
| { | |
| "epoch": 1.78, | |
| "grad_norm": 12.299093246459961, | |
| "learning_rate": 4.0919223012419064e-05, | |
| "loss": 0.2233, | |
| "step": 5590 | |
| }, | |
| { | |
| "epoch": 1.78, | |
| "grad_norm": 0.27092546224594116, | |
| "learning_rate": 4.0813077168028874e-05, | |
| "loss": 0.1675, | |
| "step": 5600 | |
| }, | |
| { | |
| "epoch": 1.79, | |
| "grad_norm": 3.4481306076049805, | |
| "learning_rate": 4.070693132363868e-05, | |
| "loss": 0.3113, | |
| "step": 5610 | |
| }, | |
| { | |
| "epoch": 1.79, | |
| "grad_norm": 12.642804145812988, | |
| "learning_rate": 4.060078547924849e-05, | |
| "loss": 0.1557, | |
| "step": 5620 | |
| }, | |
| { | |
| "epoch": 1.79, | |
| "grad_norm": 4.341307163238525, | |
| "learning_rate": 4.049463963485829e-05, | |
| "loss": 0.0825, | |
| "step": 5630 | |
| }, | |
| { | |
| "epoch": 1.8, | |
| "grad_norm": 0.728386402130127, | |
| "learning_rate": 4.038849379046811e-05, | |
| "loss": 0.1589, | |
| "step": 5640 | |
| }, | |
| { | |
| "epoch": 1.8, | |
| "grad_norm": 4.2692084312438965, | |
| "learning_rate": 4.028234794607791e-05, | |
| "loss": 0.0908, | |
| "step": 5650 | |
| }, | |
| { | |
| "epoch": 1.8, | |
| "grad_norm": 3.5218265056610107, | |
| "learning_rate": 4.0176202101687724e-05, | |
| "loss": 0.2008, | |
| "step": 5660 | |
| }, | |
| { | |
| "epoch": 1.8, | |
| "grad_norm": 0.6934779286384583, | |
| "learning_rate": 4.007005625729753e-05, | |
| "loss": 0.1652, | |
| "step": 5670 | |
| }, | |
| { | |
| "epoch": 1.81, | |
| "grad_norm": 7.079185485839844, | |
| "learning_rate": 3.996391041290734e-05, | |
| "loss": 0.1854, | |
| "step": 5680 | |
| }, | |
| { | |
| "epoch": 1.81, | |
| "grad_norm": 2.6828112602233887, | |
| "learning_rate": 3.985776456851714e-05, | |
| "loss": 0.0911, | |
| "step": 5690 | |
| }, | |
| { | |
| "epoch": 1.81, | |
| "grad_norm": 5.049779891967773, | |
| "learning_rate": 3.975161872412695e-05, | |
| "loss": 0.1191, | |
| "step": 5700 | |
| }, | |
| { | |
| "epoch": 1.82, | |
| "grad_norm": 2.4732673168182373, | |
| "learning_rate": 3.9656087464175775e-05, | |
| "loss": 0.2192, | |
| "step": 5710 | |
| }, | |
| { | |
| "epoch": 1.82, | |
| "grad_norm": 0.11808130145072937, | |
| "learning_rate": 3.9549941619785586e-05, | |
| "loss": 0.1782, | |
| "step": 5720 | |
| }, | |
| { | |
| "epoch": 1.82, | |
| "grad_norm": 3.8879833221435547, | |
| "learning_rate": 3.944379577539539e-05, | |
| "loss": 0.1692, | |
| "step": 5730 | |
| }, | |
| { | |
| "epoch": 1.83, | |
| "grad_norm": 3.667048931121826, | |
| "learning_rate": 3.933764993100521e-05, | |
| "loss": 0.1236, | |
| "step": 5740 | |
| }, | |
| { | |
| "epoch": 1.83, | |
| "grad_norm": 4.494665622711182, | |
| "learning_rate": 3.923150408661501e-05, | |
| "loss": 0.2373, | |
| "step": 5750 | |
| }, | |
| { | |
| "epoch": 1.83, | |
| "grad_norm": 0.3976966440677643, | |
| "learning_rate": 3.912535824222482e-05, | |
| "loss": 0.2805, | |
| "step": 5760 | |
| }, | |
| { | |
| "epoch": 1.84, | |
| "grad_norm": 2.046142578125, | |
| "learning_rate": 3.9019212397834625e-05, | |
| "loss": 0.1198, | |
| "step": 5770 | |
| }, | |
| { | |
| "epoch": 1.84, | |
| "grad_norm": 0.27937573194503784, | |
| "learning_rate": 3.8913066553444435e-05, | |
| "loss": 0.1443, | |
| "step": 5780 | |
| }, | |
| { | |
| "epoch": 1.84, | |
| "grad_norm": 6.109045028686523, | |
| "learning_rate": 3.880692070905424e-05, | |
| "loss": 0.3341, | |
| "step": 5790 | |
| }, | |
| { | |
| "epoch": 1.85, | |
| "grad_norm": 0.7306396961212158, | |
| "learning_rate": 3.870077486466405e-05, | |
| "loss": 0.1208, | |
| "step": 5800 | |
| }, | |
| { | |
| "epoch": 1.85, | |
| "grad_norm": 1.7087950706481934, | |
| "learning_rate": 3.859462902027386e-05, | |
| "loss": 0.1464, | |
| "step": 5810 | |
| }, | |
| { | |
| "epoch": 1.85, | |
| "grad_norm": 0.5200537443161011, | |
| "learning_rate": 3.8488483175883663e-05, | |
| "loss": 0.1639, | |
| "step": 5820 | |
| }, | |
| { | |
| "epoch": 1.86, | |
| "grad_norm": 6.455096244812012, | |
| "learning_rate": 3.8382337331493474e-05, | |
| "loss": 0.1885, | |
| "step": 5830 | |
| }, | |
| { | |
| "epoch": 1.86, | |
| "grad_norm": 7.437272548675537, | |
| "learning_rate": 3.827619148710328e-05, | |
| "loss": 0.1916, | |
| "step": 5840 | |
| }, | |
| { | |
| "epoch": 1.86, | |
| "grad_norm": 6.395534515380859, | |
| "learning_rate": 3.817004564271309e-05, | |
| "loss": 0.2988, | |
| "step": 5850 | |
| }, | |
| { | |
| "epoch": 1.87, | |
| "grad_norm": 20.61446762084961, | |
| "learning_rate": 3.80638997983229e-05, | |
| "loss": 0.0853, | |
| "step": 5860 | |
| }, | |
| { | |
| "epoch": 1.87, | |
| "grad_norm": 1.0395785570144653, | |
| "learning_rate": 3.795775395393271e-05, | |
| "loss": 0.2113, | |
| "step": 5870 | |
| }, | |
| { | |
| "epoch": 1.87, | |
| "grad_norm": 8.83860969543457, | |
| "learning_rate": 3.785160810954251e-05, | |
| "loss": 0.1904, | |
| "step": 5880 | |
| }, | |
| { | |
| "epoch": 1.87, | |
| "grad_norm": 5.42601203918457, | |
| "learning_rate": 3.774546226515232e-05, | |
| "loss": 0.3887, | |
| "step": 5890 | |
| }, | |
| { | |
| "epoch": 1.88, | |
| "grad_norm": 3.3505442142486572, | |
| "learning_rate": 3.763931642076213e-05, | |
| "loss": 0.1397, | |
| "step": 5900 | |
| }, | |
| { | |
| "epoch": 1.88, | |
| "grad_norm": 4.929141521453857, | |
| "learning_rate": 3.753317057637194e-05, | |
| "loss": 0.2773, | |
| "step": 5910 | |
| }, | |
| { | |
| "epoch": 1.88, | |
| "grad_norm": 2.1540703773498535, | |
| "learning_rate": 3.742702473198175e-05, | |
| "loss": 0.1679, | |
| "step": 5920 | |
| }, | |
| { | |
| "epoch": 1.89, | |
| "grad_norm": 10.82689094543457, | |
| "learning_rate": 3.732087888759155e-05, | |
| "loss": 0.1776, | |
| "step": 5930 | |
| }, | |
| { | |
| "epoch": 1.89, | |
| "grad_norm": 3.0525174140930176, | |
| "learning_rate": 3.721473304320136e-05, | |
| "loss": 0.1619, | |
| "step": 5940 | |
| }, | |
| { | |
| "epoch": 1.89, | |
| "grad_norm": 5.296212196350098, | |
| "learning_rate": 3.7108587198811166e-05, | |
| "loss": 0.3294, | |
| "step": 5950 | |
| }, | |
| { | |
| "epoch": 1.9, | |
| "grad_norm": 3.4226958751678467, | |
| "learning_rate": 3.7002441354420976e-05, | |
| "loss": 0.3229, | |
| "step": 5960 | |
| }, | |
| { | |
| "epoch": 1.9, | |
| "grad_norm": 0.4734908938407898, | |
| "learning_rate": 3.689629551003078e-05, | |
| "loss": 0.1179, | |
| "step": 5970 | |
| }, | |
| { | |
| "epoch": 1.9, | |
| "grad_norm": 5.436024188995361, | |
| "learning_rate": 3.67901496656406e-05, | |
| "loss": 0.1892, | |
| "step": 5980 | |
| }, | |
| { | |
| "epoch": 1.91, | |
| "grad_norm": 5.233070373535156, | |
| "learning_rate": 3.66840038212504e-05, | |
| "loss": 0.2054, | |
| "step": 5990 | |
| }, | |
| { | |
| "epoch": 1.91, | |
| "grad_norm": 0.5661432147026062, | |
| "learning_rate": 3.657785797686021e-05, | |
| "loss": 0.2202, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 1.91, | |
| "grad_norm": 0.23524077236652374, | |
| "learning_rate": 3.6471712132470015e-05, | |
| "loss": 0.2318, | |
| "step": 6010 | |
| }, | |
| { | |
| "epoch": 1.92, | |
| "grad_norm": 0.05953243002295494, | |
| "learning_rate": 3.6365566288079825e-05, | |
| "loss": 0.2486, | |
| "step": 6020 | |
| }, | |
| { | |
| "epoch": 1.92, | |
| "grad_norm": 1.3823449611663818, | |
| "learning_rate": 3.625942044368963e-05, | |
| "loss": 0.1171, | |
| "step": 6030 | |
| }, | |
| { | |
| "epoch": 1.92, | |
| "grad_norm": 7.733388423919678, | |
| "learning_rate": 3.615327459929944e-05, | |
| "loss": 0.2469, | |
| "step": 6040 | |
| }, | |
| { | |
| "epoch": 1.93, | |
| "grad_norm": 1.4917621612548828, | |
| "learning_rate": 3.604712875490925e-05, | |
| "loss": 0.2045, | |
| "step": 6050 | |
| }, | |
| { | |
| "epoch": 1.93, | |
| "grad_norm": 7.689728736877441, | |
| "learning_rate": 3.5940982910519054e-05, | |
| "loss": 0.1648, | |
| "step": 6060 | |
| }, | |
| { | |
| "epoch": 1.93, | |
| "grad_norm": 2.2216577529907227, | |
| "learning_rate": 3.5834837066128864e-05, | |
| "loss": 0.2779, | |
| "step": 6070 | |
| }, | |
| { | |
| "epoch": 1.94, | |
| "grad_norm": 1.7362425327301025, | |
| "learning_rate": 3.572869122173867e-05, | |
| "loss": 0.1664, | |
| "step": 6080 | |
| }, | |
| { | |
| "epoch": 1.94, | |
| "grad_norm": 4.933811187744141, | |
| "learning_rate": 3.562254537734848e-05, | |
| "loss": 0.293, | |
| "step": 6090 | |
| }, | |
| { | |
| "epoch": 1.94, | |
| "grad_norm": 4.054910182952881, | |
| "learning_rate": 3.551639953295829e-05, | |
| "loss": 0.1539, | |
| "step": 6100 | |
| }, | |
| { | |
| "epoch": 1.94, | |
| "grad_norm": 0.9219651222229004, | |
| "learning_rate": 3.541025368856809e-05, | |
| "loss": 0.1111, | |
| "step": 6110 | |
| }, | |
| { | |
| "epoch": 1.95, | |
| "grad_norm": 4.558506488800049, | |
| "learning_rate": 3.53041078441779e-05, | |
| "loss": 0.1783, | |
| "step": 6120 | |
| }, | |
| { | |
| "epoch": 1.95, | |
| "grad_norm": 2.6951773166656494, | |
| "learning_rate": 3.5197961999787707e-05, | |
| "loss": 0.2916, | |
| "step": 6130 | |
| }, | |
| { | |
| "epoch": 1.95, | |
| "grad_norm": 0.9989050626754761, | |
| "learning_rate": 3.509181615539752e-05, | |
| "loss": 0.2099, | |
| "step": 6140 | |
| }, | |
| { | |
| "epoch": 1.96, | |
| "grad_norm": 0.08494656533002853, | |
| "learning_rate": 3.498567031100732e-05, | |
| "loss": 0.1255, | |
| "step": 6150 | |
| }, | |
| { | |
| "epoch": 1.96, | |
| "grad_norm": 0.20273062586784363, | |
| "learning_rate": 3.487952446661714e-05, | |
| "loss": 0.1523, | |
| "step": 6160 | |
| }, | |
| { | |
| "epoch": 1.96, | |
| "grad_norm": 0.2878829538822174, | |
| "learning_rate": 3.477337862222694e-05, | |
| "loss": 0.1732, | |
| "step": 6170 | |
| }, | |
| { | |
| "epoch": 1.97, | |
| "grad_norm": 2.026616096496582, | |
| "learning_rate": 3.466723277783675e-05, | |
| "loss": 0.1037, | |
| "step": 6180 | |
| }, | |
| { | |
| "epoch": 1.97, | |
| "grad_norm": 0.8350101709365845, | |
| "learning_rate": 3.4561086933446556e-05, | |
| "loss": 0.1169, | |
| "step": 6190 | |
| }, | |
| { | |
| "epoch": 1.97, | |
| "grad_norm": 0.6492775082588196, | |
| "learning_rate": 3.4454941089056366e-05, | |
| "loss": 0.1758, | |
| "step": 6200 | |
| }, | |
| { | |
| "epoch": 1.98, | |
| "grad_norm": 4.830353736877441, | |
| "learning_rate": 3.434879524466617e-05, | |
| "loss": 0.3367, | |
| "step": 6210 | |
| }, | |
| { | |
| "epoch": 1.98, | |
| "grad_norm": 5.267330169677734, | |
| "learning_rate": 3.424264940027598e-05, | |
| "loss": 0.1753, | |
| "step": 6220 | |
| }, | |
| { | |
| "epoch": 1.98, | |
| "grad_norm": 0.11368358880281448, | |
| "learning_rate": 3.413650355588579e-05, | |
| "loss": 0.2409, | |
| "step": 6230 | |
| }, | |
| { | |
| "epoch": 1.99, | |
| "grad_norm": 0.10408168286085129, | |
| "learning_rate": 3.4030357711495595e-05, | |
| "loss": 0.1407, | |
| "step": 6240 | |
| }, | |
| { | |
| "epoch": 1.99, | |
| "grad_norm": 4.495917320251465, | |
| "learning_rate": 3.3924211867105405e-05, | |
| "loss": 0.1504, | |
| "step": 6250 | |
| }, | |
| { | |
| "epoch": 1.99, | |
| "grad_norm": 0.16925585269927979, | |
| "learning_rate": 3.381806602271521e-05, | |
| "loss": 0.1323, | |
| "step": 6260 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "grad_norm": 2.5475289821624756, | |
| "learning_rate": 3.371192017832502e-05, | |
| "loss": 0.1902, | |
| "step": 6270 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "grad_norm": 2.21207332611084, | |
| "learning_rate": 3.360577433393483e-05, | |
| "loss": 0.1019, | |
| "step": 6280 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "grad_norm": 2.7308425903320312, | |
| "learning_rate": 3.349962848954464e-05, | |
| "loss": 0.1368, | |
| "step": 6290 | |
| }, | |
| { | |
| "epoch": 2.01, | |
| "grad_norm": 0.8695929646492004, | |
| "learning_rate": 3.3393482645154444e-05, | |
| "loss": 0.1979, | |
| "step": 6300 | |
| }, | |
| { | |
| "epoch": 2.01, | |
| "grad_norm": 5.150228977203369, | |
| "learning_rate": 3.3287336800764254e-05, | |
| "loss": 0.1237, | |
| "step": 6310 | |
| }, | |
| { | |
| "epoch": 2.01, | |
| "grad_norm": 0.1432078331708908, | |
| "learning_rate": 3.318119095637406e-05, | |
| "loss": 0.1547, | |
| "step": 6320 | |
| }, | |
| { | |
| "epoch": 2.01, | |
| "grad_norm": 3.952962875366211, | |
| "learning_rate": 3.307504511198387e-05, | |
| "loss": 0.1682, | |
| "step": 6330 | |
| }, | |
| { | |
| "epoch": 2.02, | |
| "grad_norm": 0.044416822493076324, | |
| "learning_rate": 3.296889926759367e-05, | |
| "loss": 0.0388, | |
| "step": 6340 | |
| }, | |
| { | |
| "epoch": 2.02, | |
| "grad_norm": 6.307524681091309, | |
| "learning_rate": 3.286275342320348e-05, | |
| "loss": 0.1418, | |
| "step": 6350 | |
| }, | |
| { | |
| "epoch": 2.02, | |
| "grad_norm": 0.1354295015335083, | |
| "learning_rate": 3.275660757881329e-05, | |
| "loss": 0.2588, | |
| "step": 6360 | |
| }, | |
| { | |
| "epoch": 2.03, | |
| "grad_norm": 3.275066614151001, | |
| "learning_rate": 3.26504617344231e-05, | |
| "loss": 0.1091, | |
| "step": 6370 | |
| }, | |
| { | |
| "epoch": 2.03, | |
| "grad_norm": 0.0923081785440445, | |
| "learning_rate": 3.254431589003291e-05, | |
| "loss": 0.1384, | |
| "step": 6380 | |
| }, | |
| { | |
| "epoch": 2.03, | |
| "grad_norm": 3.508528232574463, | |
| "learning_rate": 3.243817004564271e-05, | |
| "loss": 0.217, | |
| "step": 6390 | |
| }, | |
| { | |
| "epoch": 2.04, | |
| "grad_norm": 2.36240291595459, | |
| "learning_rate": 3.233202420125252e-05, | |
| "loss": 0.0337, | |
| "step": 6400 | |
| }, | |
| { | |
| "epoch": 2.04, | |
| "grad_norm": 0.20124652981758118, | |
| "learning_rate": 3.222587835686233e-05, | |
| "loss": 0.0982, | |
| "step": 6410 | |
| }, | |
| { | |
| "epoch": 2.04, | |
| "grad_norm": 0.8248081803321838, | |
| "learning_rate": 3.211973251247214e-05, | |
| "loss": 0.2217, | |
| "step": 6420 | |
| }, | |
| { | |
| "epoch": 2.05, | |
| "grad_norm": 1.1201878786087036, | |
| "learning_rate": 3.2013586668081946e-05, | |
| "loss": 0.0651, | |
| "step": 6430 | |
| }, | |
| { | |
| "epoch": 2.05, | |
| "grad_norm": 1.6418076753616333, | |
| "learning_rate": 3.1907440823691757e-05, | |
| "loss": 0.0738, | |
| "step": 6440 | |
| }, | |
| { | |
| "epoch": 2.05, | |
| "grad_norm": 2.1913180351257324, | |
| "learning_rate": 3.180129497930156e-05, | |
| "loss": 0.0863, | |
| "step": 6450 | |
| }, | |
| { | |
| "epoch": 2.06, | |
| "grad_norm": 1.3282325267791748, | |
| "learning_rate": 3.1695149134911364e-05, | |
| "loss": 0.0582, | |
| "step": 6460 | |
| }, | |
| { | |
| "epoch": 2.06, | |
| "grad_norm": 2.451772451400757, | |
| "learning_rate": 3.158900329052118e-05, | |
| "loss": 0.1187, | |
| "step": 6470 | |
| }, | |
| { | |
| "epoch": 2.06, | |
| "grad_norm": 0.1372409611940384, | |
| "learning_rate": 3.1482857446130985e-05, | |
| "loss": 0.0618, | |
| "step": 6480 | |
| }, | |
| { | |
| "epoch": 2.07, | |
| "grad_norm": 0.08469751477241516, | |
| "learning_rate": 3.1376711601740795e-05, | |
| "loss": 0.0316, | |
| "step": 6490 | |
| }, | |
| { | |
| "epoch": 2.07, | |
| "grad_norm": 0.1473696529865265, | |
| "learning_rate": 3.12705657573506e-05, | |
| "loss": 0.0954, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 2.07, | |
| "grad_norm": 0.06819278746843338, | |
| "learning_rate": 3.116441991296041e-05, | |
| "loss": 0.1365, | |
| "step": 6510 | |
| }, | |
| { | |
| "epoch": 2.08, | |
| "grad_norm": 8.832886695861816, | |
| "learning_rate": 3.105827406857021e-05, | |
| "loss": 0.1828, | |
| "step": 6520 | |
| }, | |
| { | |
| "epoch": 2.08, | |
| "grad_norm": 0.043228354305028915, | |
| "learning_rate": 3.0952128224180024e-05, | |
| "loss": 0.1541, | |
| "step": 6530 | |
| }, | |
| { | |
| "epoch": 2.08, | |
| "grad_norm": 0.1457592248916626, | |
| "learning_rate": 3.0845982379789834e-05, | |
| "loss": 0.0291, | |
| "step": 6540 | |
| }, | |
| { | |
| "epoch": 2.08, | |
| "grad_norm": 1.5548399686813354, | |
| "learning_rate": 3.073983653539964e-05, | |
| "loss": 0.123, | |
| "step": 6550 | |
| }, | |
| { | |
| "epoch": 2.09, | |
| "grad_norm": 5.61803674697876, | |
| "learning_rate": 3.063369069100945e-05, | |
| "loss": 0.1871, | |
| "step": 6560 | |
| }, | |
| { | |
| "epoch": 2.09, | |
| "grad_norm": 0.020372767001390457, | |
| "learning_rate": 3.052754484661925e-05, | |
| "loss": 0.0865, | |
| "step": 6570 | |
| }, | |
| { | |
| "epoch": 2.09, | |
| "grad_norm": 5.178860664367676, | |
| "learning_rate": 3.0421399002229062e-05, | |
| "loss": 0.1568, | |
| "step": 6580 | |
| }, | |
| { | |
| "epoch": 2.1, | |
| "grad_norm": 4.118620872497559, | |
| "learning_rate": 3.0315253157838873e-05, | |
| "loss": 0.0729, | |
| "step": 6590 | |
| }, | |
| { | |
| "epoch": 2.1, | |
| "grad_norm": 3.9899566173553467, | |
| "learning_rate": 3.020910731344868e-05, | |
| "loss": 0.2327, | |
| "step": 6600 | |
| }, | |
| { | |
| "epoch": 2.1, | |
| "grad_norm": 1.3902517557144165, | |
| "learning_rate": 3.0102961469058487e-05, | |
| "loss": 0.1305, | |
| "step": 6610 | |
| }, | |
| { | |
| "epoch": 2.11, | |
| "grad_norm": 5.5835957527160645, | |
| "learning_rate": 2.9996815624668294e-05, | |
| "loss": 0.1032, | |
| "step": 6620 | |
| }, | |
| { | |
| "epoch": 2.11, | |
| "grad_norm": 1.521474003791809, | |
| "learning_rate": 2.98906697802781e-05, | |
| "loss": 0.1188, | |
| "step": 6630 | |
| }, | |
| { | |
| "epoch": 2.11, | |
| "grad_norm": 0.19501766562461853, | |
| "learning_rate": 2.978452393588791e-05, | |
| "loss": 0.0989, | |
| "step": 6640 | |
| }, | |
| { | |
| "epoch": 2.12, | |
| "grad_norm": 0.03989823907613754, | |
| "learning_rate": 2.9678378091497722e-05, | |
| "loss": 0.0736, | |
| "step": 6650 | |
| }, | |
| { | |
| "epoch": 2.12, | |
| "grad_norm": 3.9346630573272705, | |
| "learning_rate": 2.957223224710753e-05, | |
| "loss": 0.0347, | |
| "step": 6660 | |
| }, | |
| { | |
| "epoch": 2.12, | |
| "grad_norm": 0.05866791680455208, | |
| "learning_rate": 2.9466086402717336e-05, | |
| "loss": 0.1317, | |
| "step": 6670 | |
| }, | |
| { | |
| "epoch": 2.13, | |
| "grad_norm": 0.660900890827179, | |
| "learning_rate": 2.9359940558327143e-05, | |
| "loss": 0.1365, | |
| "step": 6680 | |
| }, | |
| { | |
| "epoch": 2.13, | |
| "grad_norm": 0.20864763855934143, | |
| "learning_rate": 2.925379471393695e-05, | |
| "loss": 0.2221, | |
| "step": 6690 | |
| }, | |
| { | |
| "epoch": 2.13, | |
| "grad_norm": 2.8652963638305664, | |
| "learning_rate": 2.9147648869546758e-05, | |
| "loss": 0.0355, | |
| "step": 6700 | |
| }, | |
| { | |
| "epoch": 2.14, | |
| "grad_norm": 3.0343375205993652, | |
| "learning_rate": 2.9041503025156568e-05, | |
| "loss": 0.2081, | |
| "step": 6710 | |
| }, | |
| { | |
| "epoch": 2.14, | |
| "grad_norm": 2.393002510070801, | |
| "learning_rate": 2.8935357180766375e-05, | |
| "loss": 0.1076, | |
| "step": 6720 | |
| }, | |
| { | |
| "epoch": 2.14, | |
| "grad_norm": 0.08225111663341522, | |
| "learning_rate": 2.8829211336376182e-05, | |
| "loss": 0.1367, | |
| "step": 6730 | |
| }, | |
| { | |
| "epoch": 2.15, | |
| "grad_norm": 4.09624719619751, | |
| "learning_rate": 2.872306549198599e-05, | |
| "loss": 0.2712, | |
| "step": 6740 | |
| }, | |
| { | |
| "epoch": 2.15, | |
| "grad_norm": 0.667273998260498, | |
| "learning_rate": 2.8616919647595796e-05, | |
| "loss": 0.152, | |
| "step": 6750 | |
| }, | |
| { | |
| "epoch": 2.15, | |
| "grad_norm": 1.4781357049942017, | |
| "learning_rate": 2.8510773803205603e-05, | |
| "loss": 0.0949, | |
| "step": 6760 | |
| }, | |
| { | |
| "epoch": 2.16, | |
| "grad_norm": 4.563651084899902, | |
| "learning_rate": 2.8404627958815417e-05, | |
| "loss": 0.0873, | |
| "step": 6770 | |
| }, | |
| { | |
| "epoch": 2.16, | |
| "grad_norm": 3.7740418910980225, | |
| "learning_rate": 2.830909669886424e-05, | |
| "loss": 0.1207, | |
| "step": 6780 | |
| }, | |
| { | |
| "epoch": 2.16, | |
| "grad_norm": 10.370115280151367, | |
| "learning_rate": 2.8202950854474048e-05, | |
| "loss": 0.1369, | |
| "step": 6790 | |
| }, | |
| { | |
| "epoch": 2.16, | |
| "grad_norm": 0.13098500669002533, | |
| "learning_rate": 2.8096805010083855e-05, | |
| "loss": 0.1016, | |
| "step": 6800 | |
| }, | |
| { | |
| "epoch": 2.17, | |
| "grad_norm": 9.170578956604004, | |
| "learning_rate": 2.7990659165693665e-05, | |
| "loss": 0.0463, | |
| "step": 6810 | |
| }, | |
| { | |
| "epoch": 2.17, | |
| "grad_norm": 10.379976272583008, | |
| "learning_rate": 2.7884513321303472e-05, | |
| "loss": 0.0798, | |
| "step": 6820 | |
| }, | |
| { | |
| "epoch": 2.17, | |
| "grad_norm": 0.10993140935897827, | |
| "learning_rate": 2.777836747691328e-05, | |
| "loss": 0.084, | |
| "step": 6830 | |
| }, | |
| { | |
| "epoch": 2.18, | |
| "grad_norm": 0.4707590937614441, | |
| "learning_rate": 2.7672221632523087e-05, | |
| "loss": 0.1232, | |
| "step": 6840 | |
| }, | |
| { | |
| "epoch": 2.18, | |
| "grad_norm": 4.587014198303223, | |
| "learning_rate": 2.7566075788132894e-05, | |
| "loss": 0.1474, | |
| "step": 6850 | |
| }, | |
| { | |
| "epoch": 2.18, | |
| "grad_norm": 10.61086654663086, | |
| "learning_rate": 2.74599299437427e-05, | |
| "loss": 0.1282, | |
| "step": 6860 | |
| }, | |
| { | |
| "epoch": 2.19, | |
| "grad_norm": 0.2299477756023407, | |
| "learning_rate": 2.7353784099352515e-05, | |
| "loss": 0.0667, | |
| "step": 6870 | |
| }, | |
| { | |
| "epoch": 2.19, | |
| "grad_norm": 5.911661624908447, | |
| "learning_rate": 2.724763825496232e-05, | |
| "loss": 0.1222, | |
| "step": 6880 | |
| }, | |
| { | |
| "epoch": 2.19, | |
| "grad_norm": 0.1657014936208725, | |
| "learning_rate": 2.714149241057213e-05, | |
| "loss": 0.072, | |
| "step": 6890 | |
| }, | |
| { | |
| "epoch": 2.2, | |
| "grad_norm": 0.04870441555976868, | |
| "learning_rate": 2.7035346566181936e-05, | |
| "loss": 0.1132, | |
| "step": 6900 | |
| }, | |
| { | |
| "epoch": 2.2, | |
| "grad_norm": 0.7382871508598328, | |
| "learning_rate": 2.6929200721791743e-05, | |
| "loss": 0.0272, | |
| "step": 6910 | |
| }, | |
| { | |
| "epoch": 2.2, | |
| "grad_norm": 1.1875141859054565, | |
| "learning_rate": 2.682305487740155e-05, | |
| "loss": 0.0833, | |
| "step": 6920 | |
| }, | |
| { | |
| "epoch": 2.21, | |
| "grad_norm": 0.070220448076725, | |
| "learning_rate": 2.671690903301136e-05, | |
| "loss": 0.1321, | |
| "step": 6930 | |
| }, | |
| { | |
| "epoch": 2.21, | |
| "grad_norm": 3.7514150142669678, | |
| "learning_rate": 2.6610763188621167e-05, | |
| "loss": 0.0971, | |
| "step": 6940 | |
| }, | |
| { | |
| "epoch": 2.21, | |
| "grad_norm": 0.04383459314703941, | |
| "learning_rate": 2.6504617344230975e-05, | |
| "loss": 0.0878, | |
| "step": 6950 | |
| }, | |
| { | |
| "epoch": 2.22, | |
| "grad_norm": 0.11518880724906921, | |
| "learning_rate": 2.639847149984078e-05, | |
| "loss": 0.0679, | |
| "step": 6960 | |
| }, | |
| { | |
| "epoch": 2.22, | |
| "grad_norm": 5.474330902099609, | |
| "learning_rate": 2.629232565545059e-05, | |
| "loss": 0.0497, | |
| "step": 6970 | |
| }, | |
| { | |
| "epoch": 2.22, | |
| "grad_norm": 0.03785128891468048, | |
| "learning_rate": 2.6186179811060396e-05, | |
| "loss": 0.1183, | |
| "step": 6980 | |
| }, | |
| { | |
| "epoch": 2.23, | |
| "grad_norm": 0.050687942653894424, | |
| "learning_rate": 2.608003396667021e-05, | |
| "loss": 0.1141, | |
| "step": 6990 | |
| }, | |
| { | |
| "epoch": 2.23, | |
| "grad_norm": 5.501091003417969, | |
| "learning_rate": 2.5973888122280017e-05, | |
| "loss": 0.1175, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 2.23, | |
| "grad_norm": 1.3896145820617676, | |
| "learning_rate": 2.5867742277889824e-05, | |
| "loss": 0.1665, | |
| "step": 7010 | |
| }, | |
| { | |
| "epoch": 2.23, | |
| "grad_norm": 5.888062000274658, | |
| "learning_rate": 2.576159643349963e-05, | |
| "loss": 0.1868, | |
| "step": 7020 | |
| }, | |
| { | |
| "epoch": 2.24, | |
| "grad_norm": 0.3350411653518677, | |
| "learning_rate": 2.5655450589109438e-05, | |
| "loss": 0.0262, | |
| "step": 7030 | |
| }, | |
| { | |
| "epoch": 2.24, | |
| "grad_norm": 0.12134930491447449, | |
| "learning_rate": 2.5549304744719245e-05, | |
| "loss": 0.1391, | |
| "step": 7040 | |
| }, | |
| { | |
| "epoch": 2.24, | |
| "grad_norm": 2.653724193572998, | |
| "learning_rate": 2.5443158900329056e-05, | |
| "loss": 0.044, | |
| "step": 7050 | |
| }, | |
| { | |
| "epoch": 2.25, | |
| "grad_norm": 1.480675458908081, | |
| "learning_rate": 2.5337013055938863e-05, | |
| "loss": 0.125, | |
| "step": 7060 | |
| }, | |
| { | |
| "epoch": 2.25, | |
| "grad_norm": 2.112579584121704, | |
| "learning_rate": 2.523086721154867e-05, | |
| "loss": 0.0774, | |
| "step": 7070 | |
| }, | |
| { | |
| "epoch": 2.25, | |
| "grad_norm": 0.03731192275881767, | |
| "learning_rate": 2.5124721367158477e-05, | |
| "loss": 0.0703, | |
| "step": 7080 | |
| }, | |
| { | |
| "epoch": 2.26, | |
| "grad_norm": 0.06327365338802338, | |
| "learning_rate": 2.5018575522768284e-05, | |
| "loss": 0.1557, | |
| "step": 7090 | |
| }, | |
| { | |
| "epoch": 2.26, | |
| "grad_norm": 0.10991324484348297, | |
| "learning_rate": 2.4912429678378094e-05, | |
| "loss": 0.0682, | |
| "step": 7100 | |
| }, | |
| { | |
| "epoch": 2.26, | |
| "grad_norm": 0.03156714513897896, | |
| "learning_rate": 2.48062838339879e-05, | |
| "loss": 0.1716, | |
| "step": 7110 | |
| }, | |
| { | |
| "epoch": 2.27, | |
| "grad_norm": 9.979147911071777, | |
| "learning_rate": 2.470013798959771e-05, | |
| "loss": 0.1797, | |
| "step": 7120 | |
| }, | |
| { | |
| "epoch": 2.27, | |
| "grad_norm": 3.263706684112549, | |
| "learning_rate": 2.459399214520752e-05, | |
| "loss": 0.0659, | |
| "step": 7130 | |
| }, | |
| { | |
| "epoch": 2.27, | |
| "grad_norm": 6.261413097381592, | |
| "learning_rate": 2.4487846300817323e-05, | |
| "loss": 0.0968, | |
| "step": 7140 | |
| }, | |
| { | |
| "epoch": 2.28, | |
| "grad_norm": 1.550948143005371, | |
| "learning_rate": 2.438170045642713e-05, | |
| "loss": 0.1336, | |
| "step": 7150 | |
| }, | |
| { | |
| "epoch": 2.28, | |
| "grad_norm": 1.1487703323364258, | |
| "learning_rate": 2.427555461203694e-05, | |
| "loss": 0.0647, | |
| "step": 7160 | |
| }, | |
| { | |
| "epoch": 2.28, | |
| "grad_norm": 6.673706531524658, | |
| "learning_rate": 2.4169408767646747e-05, | |
| "loss": 0.1567, | |
| "step": 7170 | |
| }, | |
| { | |
| "epoch": 2.29, | |
| "grad_norm": 0.17169363796710968, | |
| "learning_rate": 2.4063262923256554e-05, | |
| "loss": 0.1096, | |
| "step": 7180 | |
| }, | |
| { | |
| "epoch": 2.29, | |
| "grad_norm": 8.660694122314453, | |
| "learning_rate": 2.3957117078866365e-05, | |
| "loss": 0.1589, | |
| "step": 7190 | |
| }, | |
| { | |
| "epoch": 2.29, | |
| "grad_norm": 1.5906010866165161, | |
| "learning_rate": 2.3850971234476172e-05, | |
| "loss": 0.1224, | |
| "step": 7200 | |
| }, | |
| { | |
| "epoch": 2.3, | |
| "grad_norm": 0.8341835141181946, | |
| "learning_rate": 2.374482539008598e-05, | |
| "loss": 0.02, | |
| "step": 7210 | |
| }, | |
| { | |
| "epoch": 2.3, | |
| "grad_norm": 10.785898208618164, | |
| "learning_rate": 2.363867954569579e-05, | |
| "loss": 0.1153, | |
| "step": 7220 | |
| }, | |
| { | |
| "epoch": 2.3, | |
| "grad_norm": 5.174521446228027, | |
| "learning_rate": 2.3532533701305597e-05, | |
| "loss": 0.0843, | |
| "step": 7230 | |
| }, | |
| { | |
| "epoch": 2.3, | |
| "grad_norm": 0.7447335720062256, | |
| "learning_rate": 2.3426387856915404e-05, | |
| "loss": 0.1465, | |
| "step": 7240 | |
| }, | |
| { | |
| "epoch": 2.31, | |
| "grad_norm": 3.2618470191955566, | |
| "learning_rate": 2.332024201252521e-05, | |
| "loss": 0.0874, | |
| "step": 7250 | |
| }, | |
| { | |
| "epoch": 2.31, | |
| "grad_norm": 1.483594298362732, | |
| "learning_rate": 2.3214096168135018e-05, | |
| "loss": 0.1061, | |
| "step": 7260 | |
| }, | |
| { | |
| "epoch": 2.31, | |
| "grad_norm": 0.303654283285141, | |
| "learning_rate": 2.3107950323744825e-05, | |
| "loss": 0.117, | |
| "step": 7270 | |
| }, | |
| { | |
| "epoch": 2.32, | |
| "grad_norm": 10.942138671875, | |
| "learning_rate": 2.3001804479354635e-05, | |
| "loss": 0.2048, | |
| "step": 7280 | |
| }, | |
| { | |
| "epoch": 2.32, | |
| "grad_norm": 7.95550012588501, | |
| "learning_rate": 2.2895658634964442e-05, | |
| "loss": 0.1158, | |
| "step": 7290 | |
| }, | |
| { | |
| "epoch": 2.32, | |
| "grad_norm": 0.05263487249612808, | |
| "learning_rate": 2.278951279057425e-05, | |
| "loss": 0.0142, | |
| "step": 7300 | |
| }, | |
| { | |
| "epoch": 2.33, | |
| "grad_norm": 0.04684547707438469, | |
| "learning_rate": 2.268336694618406e-05, | |
| "loss": 0.15, | |
| "step": 7310 | |
| }, | |
| { | |
| "epoch": 2.33, | |
| "grad_norm": 6.8654890060424805, | |
| "learning_rate": 2.2577221101793867e-05, | |
| "loss": 0.1816, | |
| "step": 7320 | |
| }, | |
| { | |
| "epoch": 2.33, | |
| "grad_norm": 11.469459533691406, | |
| "learning_rate": 2.2471075257403674e-05, | |
| "loss": 0.1426, | |
| "step": 7330 | |
| }, | |
| { | |
| "epoch": 2.34, | |
| "grad_norm": 5.302177906036377, | |
| "learning_rate": 2.236492941301348e-05, | |
| "loss": 0.2248, | |
| "step": 7340 | |
| }, | |
| { | |
| "epoch": 2.34, | |
| "grad_norm": 2.6794090270996094, | |
| "learning_rate": 2.2258783568623288e-05, | |
| "loss": 0.157, | |
| "step": 7350 | |
| }, | |
| { | |
| "epoch": 2.34, | |
| "grad_norm": 1.3895156383514404, | |
| "learning_rate": 2.2152637724233095e-05, | |
| "loss": 0.159, | |
| "step": 7360 | |
| }, | |
| { | |
| "epoch": 2.35, | |
| "grad_norm": 0.17077626287937164, | |
| "learning_rate": 2.2046491879842906e-05, | |
| "loss": 0.1298, | |
| "step": 7370 | |
| }, | |
| { | |
| "epoch": 2.35, | |
| "grad_norm": 0.14379891753196716, | |
| "learning_rate": 2.1940346035452713e-05, | |
| "loss": 0.072, | |
| "step": 7380 | |
| }, | |
| { | |
| "epoch": 2.35, | |
| "grad_norm": 0.946506142616272, | |
| "learning_rate": 2.183420019106252e-05, | |
| "loss": 0.0414, | |
| "step": 7390 | |
| }, | |
| { | |
| "epoch": 2.36, | |
| "grad_norm": 0.10742925852537155, | |
| "learning_rate": 2.172805434667233e-05, | |
| "loss": 0.1991, | |
| "step": 7400 | |
| }, | |
| { | |
| "epoch": 2.36, | |
| "grad_norm": 4.503111362457275, | |
| "learning_rate": 2.1621908502282138e-05, | |
| "loss": 0.1018, | |
| "step": 7410 | |
| }, | |
| { | |
| "epoch": 2.36, | |
| "grad_norm": 0.025181856006383896, | |
| "learning_rate": 2.1515762657891945e-05, | |
| "loss": 0.2192, | |
| "step": 7420 | |
| }, | |
| { | |
| "epoch": 2.37, | |
| "grad_norm": 0.2496863454580307, | |
| "learning_rate": 2.140961681350175e-05, | |
| "loss": 0.1513, | |
| "step": 7430 | |
| }, | |
| { | |
| "epoch": 2.37, | |
| "grad_norm": 0.18356376886367798, | |
| "learning_rate": 2.1303470969111562e-05, | |
| "loss": 0.0928, | |
| "step": 7440 | |
| }, | |
| { | |
| "epoch": 2.37, | |
| "grad_norm": 4.700144290924072, | |
| "learning_rate": 2.119732512472137e-05, | |
| "loss": 0.1076, | |
| "step": 7450 | |
| }, | |
| { | |
| "epoch": 2.37, | |
| "grad_norm": 1.5925829410552979, | |
| "learning_rate": 2.1091179280331176e-05, | |
| "loss": 0.0673, | |
| "step": 7460 | |
| }, | |
| { | |
| "epoch": 2.38, | |
| "grad_norm": 0.5920007228851318, | |
| "learning_rate": 2.0985033435940983e-05, | |
| "loss": 0.1291, | |
| "step": 7470 | |
| }, | |
| { | |
| "epoch": 2.38, | |
| "grad_norm": 5.156589508056641, | |
| "learning_rate": 2.087888759155079e-05, | |
| "loss": 0.3071, | |
| "step": 7480 | |
| }, | |
| { | |
| "epoch": 2.38, | |
| "grad_norm": 0.03765925392508507, | |
| "learning_rate": 2.0772741747160598e-05, | |
| "loss": 0.1093, | |
| "step": 7490 | |
| }, | |
| { | |
| "epoch": 2.39, | |
| "grad_norm": 0.4249335825443268, | |
| "learning_rate": 2.0666595902770408e-05, | |
| "loss": 0.1279, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 2.39, | |
| "grad_norm": 0.016695374622941017, | |
| "learning_rate": 2.0560450058380215e-05, | |
| "loss": 0.2595, | |
| "step": 7510 | |
| }, | |
| { | |
| "epoch": 2.39, | |
| "grad_norm": 0.8157448768615723, | |
| "learning_rate": 2.0454304213990022e-05, | |
| "loss": 0.2879, | |
| "step": 7520 | |
| }, | |
| { | |
| "epoch": 2.4, | |
| "grad_norm": 0.43193209171295166, | |
| "learning_rate": 2.0348158369599833e-05, | |
| "loss": 0.1147, | |
| "step": 7530 | |
| }, | |
| { | |
| "epoch": 2.4, | |
| "grad_norm": 1.1754236221313477, | |
| "learning_rate": 2.024201252520964e-05, | |
| "loss": 0.1228, | |
| "step": 7540 | |
| }, | |
| { | |
| "epoch": 2.4, | |
| "grad_norm": 0.073044553399086, | |
| "learning_rate": 2.0135866680819447e-05, | |
| "loss": 0.0953, | |
| "step": 7550 | |
| }, | |
| { | |
| "epoch": 2.41, | |
| "grad_norm": 6.481806755065918, | |
| "learning_rate": 2.0029720836429254e-05, | |
| "loss": 0.0925, | |
| "step": 7560 | |
| }, | |
| { | |
| "epoch": 2.41, | |
| "grad_norm": 3.421597719192505, | |
| "learning_rate": 1.992357499203906e-05, | |
| "loss": 0.1844, | |
| "step": 7570 | |
| }, | |
| { | |
| "epoch": 2.41, | |
| "grad_norm": 0.15194571018218994, | |
| "learning_rate": 1.9817429147648868e-05, | |
| "loss": 0.3675, | |
| "step": 7580 | |
| }, | |
| { | |
| "epoch": 2.42, | |
| "grad_norm": 0.44171637296676636, | |
| "learning_rate": 1.971128330325868e-05, | |
| "loss": 0.043, | |
| "step": 7590 | |
| }, | |
| { | |
| "epoch": 2.42, | |
| "grad_norm": 0.06510256975889206, | |
| "learning_rate": 1.9605137458868486e-05, | |
| "loss": 0.1354, | |
| "step": 7600 | |
| }, | |
| { | |
| "epoch": 2.42, | |
| "grad_norm": 2.7437000274658203, | |
| "learning_rate": 1.9498991614478293e-05, | |
| "loss": 0.0389, | |
| "step": 7610 | |
| }, | |
| { | |
| "epoch": 2.43, | |
| "grad_norm": 1.2895437479019165, | |
| "learning_rate": 1.9392845770088103e-05, | |
| "loss": 0.1704, | |
| "step": 7620 | |
| }, | |
| { | |
| "epoch": 2.43, | |
| "grad_norm": 0.03322044759988785, | |
| "learning_rate": 1.928669992569791e-05, | |
| "loss": 0.1065, | |
| "step": 7630 | |
| }, | |
| { | |
| "epoch": 2.43, | |
| "grad_norm": 2.8655242919921875, | |
| "learning_rate": 1.9180554081307717e-05, | |
| "loss": 0.1504, | |
| "step": 7640 | |
| }, | |
| { | |
| "epoch": 2.44, | |
| "grad_norm": 0.2032870352268219, | |
| "learning_rate": 1.9074408236917528e-05, | |
| "loss": 0.0229, | |
| "step": 7650 | |
| }, | |
| { | |
| "epoch": 2.44, | |
| "grad_norm": 1.7102253437042236, | |
| "learning_rate": 1.8968262392527335e-05, | |
| "loss": 0.1019, | |
| "step": 7660 | |
| }, | |
| { | |
| "epoch": 2.44, | |
| "grad_norm": 2.740474224090576, | |
| "learning_rate": 1.8862116548137142e-05, | |
| "loss": 0.1235, | |
| "step": 7670 | |
| }, | |
| { | |
| "epoch": 2.44, | |
| "grad_norm": 2.9120683670043945, | |
| "learning_rate": 1.875597070374695e-05, | |
| "loss": 0.0516, | |
| "step": 7680 | |
| }, | |
| { | |
| "epoch": 2.45, | |
| "grad_norm": 0.11502601206302643, | |
| "learning_rate": 1.8649824859356756e-05, | |
| "loss": 0.2791, | |
| "step": 7690 | |
| }, | |
| { | |
| "epoch": 2.45, | |
| "grad_norm": 0.7027528882026672, | |
| "learning_rate": 1.8543679014966563e-05, | |
| "loss": 0.0385, | |
| "step": 7700 | |
| }, | |
| { | |
| "epoch": 2.45, | |
| "grad_norm": 2.4370245933532715, | |
| "learning_rate": 1.8437533170576374e-05, | |
| "loss": 0.0936, | |
| "step": 7710 | |
| }, | |
| { | |
| "epoch": 2.46, | |
| "grad_norm": 6.21151876449585, | |
| "learning_rate": 1.833138732618618e-05, | |
| "loss": 0.0806, | |
| "step": 7720 | |
| }, | |
| { | |
| "epoch": 2.46, | |
| "grad_norm": 0.052706655114889145, | |
| "learning_rate": 1.8225241481795988e-05, | |
| "loss": 0.1684, | |
| "step": 7730 | |
| }, | |
| { | |
| "epoch": 2.46, | |
| "grad_norm": 0.24665802717208862, | |
| "learning_rate": 1.8119095637405798e-05, | |
| "loss": 0.1383, | |
| "step": 7740 | |
| }, | |
| { | |
| "epoch": 2.47, | |
| "grad_norm": 3.6017708778381348, | |
| "learning_rate": 1.8012949793015605e-05, | |
| "loss": 0.1204, | |
| "step": 7750 | |
| }, | |
| { | |
| "epoch": 2.47, | |
| "grad_norm": 3.1942765712738037, | |
| "learning_rate": 1.7906803948625412e-05, | |
| "loss": 0.0627, | |
| "step": 7760 | |
| }, | |
| { | |
| "epoch": 2.47, | |
| "grad_norm": 3.020968437194824, | |
| "learning_rate": 1.780065810423522e-05, | |
| "loss": 0.1656, | |
| "step": 7770 | |
| }, | |
| { | |
| "epoch": 2.48, | |
| "grad_norm": 0.13594868779182434, | |
| "learning_rate": 1.7694512259845027e-05, | |
| "loss": 0.0529, | |
| "step": 7780 | |
| }, | |
| { | |
| "epoch": 2.48, | |
| "grad_norm": 0.0280010886490345, | |
| "learning_rate": 1.7588366415454834e-05, | |
| "loss": 0.1539, | |
| "step": 7790 | |
| }, | |
| { | |
| "epoch": 2.48, | |
| "grad_norm": 8.52804946899414, | |
| "learning_rate": 1.7482220571064644e-05, | |
| "loss": 0.0498, | |
| "step": 7800 | |
| }, | |
| { | |
| "epoch": 2.49, | |
| "grad_norm": 0.20770138502120972, | |
| "learning_rate": 1.737607472667445e-05, | |
| "loss": 0.0903, | |
| "step": 7810 | |
| }, | |
| { | |
| "epoch": 2.49, | |
| "grad_norm": 0.06971104443073273, | |
| "learning_rate": 1.7269928882284258e-05, | |
| "loss": 0.2458, | |
| "step": 7820 | |
| }, | |
| { | |
| "epoch": 2.49, | |
| "grad_norm": 0.022506361827254295, | |
| "learning_rate": 1.716378303789407e-05, | |
| "loss": 0.0741, | |
| "step": 7830 | |
| }, | |
| { | |
| "epoch": 2.5, | |
| "grad_norm": 4.818386077880859, | |
| "learning_rate": 1.7057637193503876e-05, | |
| "loss": 0.0586, | |
| "step": 7840 | |
| }, | |
| { | |
| "epoch": 2.5, | |
| "grad_norm": 0.05160210281610489, | |
| "learning_rate": 1.6951491349113683e-05, | |
| "loss": 0.0817, | |
| "step": 7850 | |
| }, | |
| { | |
| "epoch": 2.5, | |
| "grad_norm": 0.15953780710697174, | |
| "learning_rate": 1.6845345504723493e-05, | |
| "loss": 0.0905, | |
| "step": 7860 | |
| }, | |
| { | |
| "epoch": 2.51, | |
| "grad_norm": 0.015429453924298286, | |
| "learning_rate": 1.67391996603333e-05, | |
| "loss": 0.0719, | |
| "step": 7870 | |
| }, | |
| { | |
| "epoch": 2.51, | |
| "grad_norm": 3.159700632095337, | |
| "learning_rate": 1.6633053815943108e-05, | |
| "loss": 0.048, | |
| "step": 7880 | |
| }, | |
| { | |
| "epoch": 2.51, | |
| "grad_norm": 1.702974796295166, | |
| "learning_rate": 1.6526907971552915e-05, | |
| "loss": 0.1025, | |
| "step": 7890 | |
| }, | |
| { | |
| "epoch": 2.51, | |
| "grad_norm": 0.7218146324157715, | |
| "learning_rate": 1.6420762127162722e-05, | |
| "loss": 0.0534, | |
| "step": 7900 | |
| }, | |
| { | |
| "epoch": 2.52, | |
| "grad_norm": 4.001716136932373, | |
| "learning_rate": 1.632523086721155e-05, | |
| "loss": 0.1611, | |
| "step": 7910 | |
| }, | |
| { | |
| "epoch": 2.52, | |
| "grad_norm": 0.6529110074043274, | |
| "learning_rate": 1.6219085022821356e-05, | |
| "loss": 0.1739, | |
| "step": 7920 | |
| }, | |
| { | |
| "epoch": 2.52, | |
| "grad_norm": 3.3086657524108887, | |
| "learning_rate": 1.6112939178431166e-05, | |
| "loss": 0.1373, | |
| "step": 7930 | |
| }, | |
| { | |
| "epoch": 2.53, | |
| "grad_norm": 2.368133068084717, | |
| "learning_rate": 1.6006793334040973e-05, | |
| "loss": 0.0841, | |
| "step": 7940 | |
| }, | |
| { | |
| "epoch": 2.53, | |
| "grad_norm": 5.263741970062256, | |
| "learning_rate": 1.590064748965078e-05, | |
| "loss": 0.1226, | |
| "step": 7950 | |
| }, | |
| { | |
| "epoch": 2.53, | |
| "grad_norm": 10.581872940063477, | |
| "learning_rate": 1.579450164526059e-05, | |
| "loss": 0.1063, | |
| "step": 7960 | |
| }, | |
| { | |
| "epoch": 2.54, | |
| "grad_norm": 0.07476484030485153, | |
| "learning_rate": 1.5688355800870398e-05, | |
| "loss": 0.3208, | |
| "step": 7970 | |
| }, | |
| { | |
| "epoch": 2.54, | |
| "grad_norm": 0.976747453212738, | |
| "learning_rate": 1.5582209956480205e-05, | |
| "loss": 0.2336, | |
| "step": 7980 | |
| }, | |
| { | |
| "epoch": 2.54, | |
| "grad_norm": 0.2981054186820984, | |
| "learning_rate": 1.5476064112090012e-05, | |
| "loss": 0.0603, | |
| "step": 7990 | |
| }, | |
| { | |
| "epoch": 2.55, | |
| "grad_norm": 0.032338302582502365, | |
| "learning_rate": 1.536991826769982e-05, | |
| "loss": 0.0123, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 2.55, | |
| "grad_norm": 7.625821590423584, | |
| "learning_rate": 1.5263772423309626e-05, | |
| "loss": 0.1735, | |
| "step": 8010 | |
| }, | |
| { | |
| "epoch": 2.55, | |
| "grad_norm": 4.120946407318115, | |
| "learning_rate": 1.5157626578919436e-05, | |
| "loss": 0.1199, | |
| "step": 8020 | |
| }, | |
| { | |
| "epoch": 2.56, | |
| "grad_norm": 0.04417848959565163, | |
| "learning_rate": 1.5051480734529244e-05, | |
| "loss": 0.0629, | |
| "step": 8030 | |
| }, | |
| { | |
| "epoch": 2.56, | |
| "grad_norm": 3.9831886291503906, | |
| "learning_rate": 1.494533489013905e-05, | |
| "loss": 0.1507, | |
| "step": 8040 | |
| }, | |
| { | |
| "epoch": 2.56, | |
| "grad_norm": 0.2706742286682129, | |
| "learning_rate": 1.4839189045748861e-05, | |
| "loss": 0.1195, | |
| "step": 8050 | |
| }, | |
| { | |
| "epoch": 2.57, | |
| "grad_norm": 0.045659586787223816, | |
| "learning_rate": 1.4733043201358668e-05, | |
| "loss": 0.0875, | |
| "step": 8060 | |
| }, | |
| { | |
| "epoch": 2.57, | |
| "grad_norm": 2.9574756622314453, | |
| "learning_rate": 1.4626897356968475e-05, | |
| "loss": 0.0828, | |
| "step": 8070 | |
| }, | |
| { | |
| "epoch": 2.57, | |
| "grad_norm": 11.923121452331543, | |
| "learning_rate": 1.4520751512578284e-05, | |
| "loss": 0.1937, | |
| "step": 8080 | |
| }, | |
| { | |
| "epoch": 2.58, | |
| "grad_norm": 0.8571139574050903, | |
| "learning_rate": 1.4414605668188091e-05, | |
| "loss": 0.1385, | |
| "step": 8090 | |
| }, | |
| { | |
| "epoch": 2.58, | |
| "grad_norm": 11.532151222229004, | |
| "learning_rate": 1.4308459823797898e-05, | |
| "loss": 0.1644, | |
| "step": 8100 | |
| }, | |
| { | |
| "epoch": 2.58, | |
| "grad_norm": 0.02608746476471424, | |
| "learning_rate": 1.4202313979407709e-05, | |
| "loss": 0.0845, | |
| "step": 8110 | |
| }, | |
| { | |
| "epoch": 2.58, | |
| "grad_norm": 0.3875482976436615, | |
| "learning_rate": 1.4096168135017516e-05, | |
| "loss": 0.1526, | |
| "step": 8120 | |
| }, | |
| { | |
| "epoch": 2.59, | |
| "grad_norm": 0.46190938353538513, | |
| "learning_rate": 1.3990022290627323e-05, | |
| "loss": 0.0656, | |
| "step": 8130 | |
| }, | |
| { | |
| "epoch": 2.59, | |
| "grad_norm": 0.06178577244281769, | |
| "learning_rate": 1.3883876446237132e-05, | |
| "loss": 0.0245, | |
| "step": 8140 | |
| }, | |
| { | |
| "epoch": 2.59, | |
| "grad_norm": 0.41626548767089844, | |
| "learning_rate": 1.3777730601846939e-05, | |
| "loss": 0.1448, | |
| "step": 8150 | |
| }, | |
| { | |
| "epoch": 2.6, | |
| "grad_norm": 0.8394218683242798, | |
| "learning_rate": 1.3671584757456746e-05, | |
| "loss": 0.1566, | |
| "step": 8160 | |
| }, | |
| { | |
| "epoch": 2.6, | |
| "grad_norm": 0.030064724385738373, | |
| "learning_rate": 1.3565438913066556e-05, | |
| "loss": 0.1614, | |
| "step": 8170 | |
| }, | |
| { | |
| "epoch": 2.6, | |
| "grad_norm": 0.7408326864242554, | |
| "learning_rate": 1.3459293068676362e-05, | |
| "loss": 0.0493, | |
| "step": 8180 | |
| }, | |
| { | |
| "epoch": 2.61, | |
| "grad_norm": 6.210927486419678, | |
| "learning_rate": 1.3353147224286169e-05, | |
| "loss": 0.173, | |
| "step": 8190 | |
| }, | |
| { | |
| "epoch": 2.61, | |
| "grad_norm": 0.3989274501800537, | |
| "learning_rate": 1.3247001379895976e-05, | |
| "loss": 0.2242, | |
| "step": 8200 | |
| }, | |
| { | |
| "epoch": 2.61, | |
| "grad_norm": 0.21221469342708588, | |
| "learning_rate": 1.3140855535505786e-05, | |
| "loss": 0.1021, | |
| "step": 8210 | |
| }, | |
| { | |
| "epoch": 2.62, | |
| "grad_norm": 0.018684396520256996, | |
| "learning_rate": 1.3034709691115593e-05, | |
| "loss": 0.1168, | |
| "step": 8220 | |
| }, | |
| { | |
| "epoch": 2.62, | |
| "grad_norm": 4.258501052856445, | |
| "learning_rate": 1.29285638467254e-05, | |
| "loss": 0.101, | |
| "step": 8230 | |
| }, | |
| { | |
| "epoch": 2.62, | |
| "grad_norm": 0.18293698132038116, | |
| "learning_rate": 1.282241800233521e-05, | |
| "loss": 0.0888, | |
| "step": 8240 | |
| }, | |
| { | |
| "epoch": 2.63, | |
| "grad_norm": 5.2559685707092285, | |
| "learning_rate": 1.2716272157945016e-05, | |
| "loss": 0.1593, | |
| "step": 8250 | |
| }, | |
| { | |
| "epoch": 2.63, | |
| "grad_norm": 0.714055597782135, | |
| "learning_rate": 1.2610126313554823e-05, | |
| "loss": 0.0548, | |
| "step": 8260 | |
| }, | |
| { | |
| "epoch": 2.63, | |
| "grad_norm": 5.772704124450684, | |
| "learning_rate": 1.2503980469164634e-05, | |
| "loss": 0.1758, | |
| "step": 8270 | |
| }, | |
| { | |
| "epoch": 2.64, | |
| "grad_norm": 0.15256932377815247, | |
| "learning_rate": 1.2397834624774441e-05, | |
| "loss": 0.1546, | |
| "step": 8280 | |
| }, | |
| { | |
| "epoch": 2.64, | |
| "grad_norm": 0.17343765497207642, | |
| "learning_rate": 1.2291688780384248e-05, | |
| "loss": 0.0422, | |
| "step": 8290 | |
| }, | |
| { | |
| "epoch": 2.64, | |
| "grad_norm": 5.067286491394043, | |
| "learning_rate": 1.2185542935994057e-05, | |
| "loss": 0.0384, | |
| "step": 8300 | |
| }, | |
| { | |
| "epoch": 2.65, | |
| "grad_norm": 2.087721109390259, | |
| "learning_rate": 1.2079397091603864e-05, | |
| "loss": 0.1132, | |
| "step": 8310 | |
| }, | |
| { | |
| "epoch": 2.65, | |
| "grad_norm": 6.7488017082214355, | |
| "learning_rate": 1.1973251247213673e-05, | |
| "loss": 0.0729, | |
| "step": 8320 | |
| }, | |
| { | |
| "epoch": 2.65, | |
| "grad_norm": 10.669734954833984, | |
| "learning_rate": 1.1867105402823481e-05, | |
| "loss": 0.0677, | |
| "step": 8330 | |
| }, | |
| { | |
| "epoch": 2.65, | |
| "grad_norm": 4.734282970428467, | |
| "learning_rate": 1.1760959558433288e-05, | |
| "loss": 0.1741, | |
| "step": 8340 | |
| }, | |
| { | |
| "epoch": 2.66, | |
| "grad_norm": 1.1807498931884766, | |
| "learning_rate": 1.1665428298482115e-05, | |
| "loss": 0.1219, | |
| "step": 8350 | |
| }, | |
| { | |
| "epoch": 2.66, | |
| "grad_norm": 0.1118198037147522, | |
| "learning_rate": 1.1559282454091922e-05, | |
| "loss": 0.0724, | |
| "step": 8360 | |
| }, | |
| { | |
| "epoch": 2.66, | |
| "grad_norm": 8.563444137573242, | |
| "learning_rate": 1.1453136609701731e-05, | |
| "loss": 0.1453, | |
| "step": 8370 | |
| }, | |
| { | |
| "epoch": 2.67, | |
| "grad_norm": 1.4987778663635254, | |
| "learning_rate": 1.1346990765311538e-05, | |
| "loss": 0.1087, | |
| "step": 8380 | |
| }, | |
| { | |
| "epoch": 2.67, | |
| "grad_norm": 6.070169448852539, | |
| "learning_rate": 1.1240844920921347e-05, | |
| "loss": 0.1372, | |
| "step": 8390 | |
| }, | |
| { | |
| "epoch": 2.67, | |
| "grad_norm": 3.5378408432006836, | |
| "learning_rate": 1.1134699076531156e-05, | |
| "loss": 0.1421, | |
| "step": 8400 | |
| }, | |
| { | |
| "epoch": 2.68, | |
| "grad_norm": 0.18879607319831848, | |
| "learning_rate": 1.1028553232140961e-05, | |
| "loss": 0.0617, | |
| "step": 8410 | |
| }, | |
| { | |
| "epoch": 2.68, | |
| "grad_norm": 3.873791217803955, | |
| "learning_rate": 1.092240738775077e-05, | |
| "loss": 0.1256, | |
| "step": 8420 | |
| }, | |
| { | |
| "epoch": 2.68, | |
| "grad_norm": 3.0632710456848145, | |
| "learning_rate": 1.0816261543360579e-05, | |
| "loss": 0.1084, | |
| "step": 8430 | |
| }, | |
| { | |
| "epoch": 2.69, | |
| "grad_norm": 0.044198133051395416, | |
| "learning_rate": 1.0710115698970386e-05, | |
| "loss": 0.0972, | |
| "step": 8440 | |
| }, | |
| { | |
| "epoch": 2.69, | |
| "grad_norm": 0.06533059477806091, | |
| "learning_rate": 1.0603969854580194e-05, | |
| "loss": 0.0659, | |
| "step": 8450 | |
| }, | |
| { | |
| "epoch": 2.69, | |
| "grad_norm": 0.024154966697096825, | |
| "learning_rate": 1.0497824010190002e-05, | |
| "loss": 0.2245, | |
| "step": 8460 | |
| }, | |
| { | |
| "epoch": 2.7, | |
| "grad_norm": 0.06551453471183777, | |
| "learning_rate": 1.0391678165799809e-05, | |
| "loss": 0.0679, | |
| "step": 8470 | |
| }, | |
| { | |
| "epoch": 2.7, | |
| "grad_norm": 2.244358777999878, | |
| "learning_rate": 1.0285532321409617e-05, | |
| "loss": 0.1138, | |
| "step": 8480 | |
| }, | |
| { | |
| "epoch": 2.7, | |
| "grad_norm": 1.3429971933364868, | |
| "learning_rate": 1.0179386477019426e-05, | |
| "loss": 0.1286, | |
| "step": 8490 | |
| }, | |
| { | |
| "epoch": 2.71, | |
| "grad_norm": 13.364596366882324, | |
| "learning_rate": 1.0073240632629233e-05, | |
| "loss": 0.1304, | |
| "step": 8500 | |
| }, | |
| { | |
| "epoch": 2.71, | |
| "grad_norm": 1.5777560472488403, | |
| "learning_rate": 9.96709478823904e-06, | |
| "loss": 0.0324, | |
| "step": 8510 | |
| }, | |
| { | |
| "epoch": 2.71, | |
| "grad_norm": 3.5468719005584717, | |
| "learning_rate": 9.860948943848847e-06, | |
| "loss": 0.1142, | |
| "step": 8520 | |
| }, | |
| { | |
| "epoch": 2.72, | |
| "grad_norm": 9.198564529418945, | |
| "learning_rate": 9.754803099458656e-06, | |
| "loss": 0.1208, | |
| "step": 8530 | |
| }, | |
| { | |
| "epoch": 2.72, | |
| "grad_norm": 0.10464298725128174, | |
| "learning_rate": 9.648657255068465e-06, | |
| "loss": 0.062, | |
| "step": 8540 | |
| }, | |
| { | |
| "epoch": 2.72, | |
| "grad_norm": 7.4889702796936035, | |
| "learning_rate": 9.542511410678272e-06, | |
| "loss": 0.1081, | |
| "step": 8550 | |
| }, | |
| { | |
| "epoch": 2.72, | |
| "grad_norm": 4.211546897888184, | |
| "learning_rate": 9.43636556628808e-06, | |
| "loss": 0.122, | |
| "step": 8560 | |
| }, | |
| { | |
| "epoch": 2.73, | |
| "grad_norm": 5.125463008880615, | |
| "learning_rate": 9.330219721897888e-06, | |
| "loss": 0.2547, | |
| "step": 8570 | |
| }, | |
| { | |
| "epoch": 2.73, | |
| "grad_norm": 0.17111606895923615, | |
| "learning_rate": 9.224073877507695e-06, | |
| "loss": 0.0792, | |
| "step": 8580 | |
| }, | |
| { | |
| "epoch": 2.73, | |
| "grad_norm": 0.17677658796310425, | |
| "learning_rate": 9.117928033117504e-06, | |
| "loss": 0.2517, | |
| "step": 8590 | |
| }, | |
| { | |
| "epoch": 2.74, | |
| "grad_norm": 0.88303542137146, | |
| "learning_rate": 9.011782188727312e-06, | |
| "loss": 0.1207, | |
| "step": 8600 | |
| }, | |
| { | |
| "epoch": 2.74, | |
| "grad_norm": 0.934140682220459, | |
| "learning_rate": 8.90563634433712e-06, | |
| "loss": 0.0874, | |
| "step": 8610 | |
| }, | |
| { | |
| "epoch": 2.74, | |
| "grad_norm": 0.1124495416879654, | |
| "learning_rate": 8.799490499946927e-06, | |
| "loss": 0.2207, | |
| "step": 8620 | |
| }, | |
| { | |
| "epoch": 2.75, | |
| "grad_norm": 1.9301073551177979, | |
| "learning_rate": 8.693344655556735e-06, | |
| "loss": 0.1351, | |
| "step": 8630 | |
| }, | |
| { | |
| "epoch": 2.75, | |
| "grad_norm": 0.42326900362968445, | |
| "learning_rate": 8.587198811166543e-06, | |
| "loss": 0.1563, | |
| "step": 8640 | |
| }, | |
| { | |
| "epoch": 2.75, | |
| "grad_norm": 0.01322962436825037, | |
| "learning_rate": 8.481052966776351e-06, | |
| "loss": 0.0387, | |
| "step": 8650 | |
| }, | |
| { | |
| "epoch": 2.76, | |
| "grad_norm": 3.7665517330169678, | |
| "learning_rate": 8.37490712238616e-06, | |
| "loss": 0.2157, | |
| "step": 8660 | |
| }, | |
| { | |
| "epoch": 2.76, | |
| "grad_norm": 0.2205476611852646, | |
| "learning_rate": 8.268761277995967e-06, | |
| "loss": 0.0491, | |
| "step": 8670 | |
| }, | |
| { | |
| "epoch": 2.76, | |
| "grad_norm": 0.10910103470087051, | |
| "learning_rate": 8.162615433605774e-06, | |
| "loss": 0.0924, | |
| "step": 8680 | |
| }, | |
| { | |
| "epoch": 2.77, | |
| "grad_norm": 0.030913598835468292, | |
| "learning_rate": 8.056469589215583e-06, | |
| "loss": 0.0553, | |
| "step": 8690 | |
| }, | |
| { | |
| "epoch": 2.77, | |
| "grad_norm": 0.08986567705869675, | |
| "learning_rate": 7.95032374482539e-06, | |
| "loss": 0.0613, | |
| "step": 8700 | |
| }, | |
| { | |
| "epoch": 2.77, | |
| "grad_norm": 0.21952463686466217, | |
| "learning_rate": 7.844177900435199e-06, | |
| "loss": 0.0898, | |
| "step": 8710 | |
| }, | |
| { | |
| "epoch": 2.78, | |
| "grad_norm": 0.6068935990333557, | |
| "learning_rate": 7.738032056045006e-06, | |
| "loss": 0.043, | |
| "step": 8720 | |
| }, | |
| { | |
| "epoch": 2.78, | |
| "grad_norm": 0.03201749920845032, | |
| "learning_rate": 7.631886211654813e-06, | |
| "loss": 0.1957, | |
| "step": 8730 | |
| }, | |
| { | |
| "epoch": 2.78, | |
| "grad_norm": 3.205738067626953, | |
| "learning_rate": 7.525740367264622e-06, | |
| "loss": 0.1425, | |
| "step": 8740 | |
| }, | |
| { | |
| "epoch": 2.79, | |
| "grad_norm": 3.265514612197876, | |
| "learning_rate": 7.4195945228744306e-06, | |
| "loss": 0.1652, | |
| "step": 8750 | |
| }, | |
| { | |
| "epoch": 2.79, | |
| "grad_norm": 0.11868763715028763, | |
| "learning_rate": 7.313448678484238e-06, | |
| "loss": 0.0193, | |
| "step": 8760 | |
| }, | |
| { | |
| "epoch": 2.79, | |
| "grad_norm": 0.03614291548728943, | |
| "learning_rate": 7.2073028340940456e-06, | |
| "loss": 0.1368, | |
| "step": 8770 | |
| }, | |
| { | |
| "epoch": 2.79, | |
| "grad_norm": 2.512045383453369, | |
| "learning_rate": 7.101156989703854e-06, | |
| "loss": 0.0949, | |
| "step": 8780 | |
| }, | |
| { | |
| "epoch": 2.8, | |
| "grad_norm": 5.77540922164917, | |
| "learning_rate": 6.995011145313661e-06, | |
| "loss": 0.1639, | |
| "step": 8790 | |
| }, | |
| { | |
| "epoch": 2.8, | |
| "grad_norm": 7.473822116851807, | |
| "learning_rate": 6.888865300923469e-06, | |
| "loss": 0.1023, | |
| "step": 8800 | |
| }, | |
| { | |
| "epoch": 2.8, | |
| "grad_norm": 0.0789722427725792, | |
| "learning_rate": 6.782719456533278e-06, | |
| "loss": 0.0627, | |
| "step": 8810 | |
| }, | |
| { | |
| "epoch": 2.81, | |
| "grad_norm": 2.9245636463165283, | |
| "learning_rate": 6.676573612143084e-06, | |
| "loss": 0.1771, | |
| "step": 8820 | |
| }, | |
| { | |
| "epoch": 2.81, | |
| "grad_norm": 2.1707448959350586, | |
| "learning_rate": 6.570427767752893e-06, | |
| "loss": 0.0423, | |
| "step": 8830 | |
| }, | |
| { | |
| "epoch": 2.81, | |
| "grad_norm": 4.990893363952637, | |
| "learning_rate": 6.4642819233627e-06, | |
| "loss": 0.073, | |
| "step": 8840 | |
| }, | |
| { | |
| "epoch": 2.82, | |
| "grad_norm": 6.3620452880859375, | |
| "learning_rate": 6.358136078972508e-06, | |
| "loss": 0.0627, | |
| "step": 8850 | |
| }, | |
| { | |
| "epoch": 2.82, | |
| "grad_norm": 0.09669307619333267, | |
| "learning_rate": 6.251990234582317e-06, | |
| "loss": 0.0879, | |
| "step": 8860 | |
| }, | |
| { | |
| "epoch": 2.82, | |
| "grad_norm": 5.8794779777526855, | |
| "learning_rate": 6.145844390192124e-06, | |
| "loss": 0.1667, | |
| "step": 8870 | |
| }, | |
| { | |
| "epoch": 2.83, | |
| "grad_norm": 0.0750487744808197, | |
| "learning_rate": 6.039698545801932e-06, | |
| "loss": 0.1538, | |
| "step": 8880 | |
| }, | |
| { | |
| "epoch": 2.83, | |
| "grad_norm": 4.174580097198486, | |
| "learning_rate": 5.933552701411741e-06, | |
| "loss": 0.1782, | |
| "step": 8890 | |
| }, | |
| { | |
| "epoch": 2.83, | |
| "grad_norm": 2.7931034564971924, | |
| "learning_rate": 5.827406857021548e-06, | |
| "loss": 0.1047, | |
| "step": 8900 | |
| }, | |
| { | |
| "epoch": 2.84, | |
| "grad_norm": 0.11179756373167038, | |
| "learning_rate": 5.721261012631356e-06, | |
| "loss": 0.0648, | |
| "step": 8910 | |
| }, | |
| { | |
| "epoch": 2.84, | |
| "grad_norm": 0.25602421164512634, | |
| "learning_rate": 5.615115168241164e-06, | |
| "loss": 0.1657, | |
| "step": 8920 | |
| }, | |
| { | |
| "epoch": 2.84, | |
| "grad_norm": 0.030272111296653748, | |
| "learning_rate": 5.5089693238509715e-06, | |
| "loss": 0.1344, | |
| "step": 8930 | |
| }, | |
| { | |
| "epoch": 2.85, | |
| "grad_norm": 1.8802919387817383, | |
| "learning_rate": 5.4028234794607795e-06, | |
| "loss": 0.1284, | |
| "step": 8940 | |
| }, | |
| { | |
| "epoch": 2.85, | |
| "grad_norm": 0.9859854578971863, | |
| "learning_rate": 5.296677635070587e-06, | |
| "loss": 0.0504, | |
| "step": 8950 | |
| }, | |
| { | |
| "epoch": 2.85, | |
| "grad_norm": 0.5083135962486267, | |
| "learning_rate": 5.190531790680395e-06, | |
| "loss": 0.0193, | |
| "step": 8960 | |
| }, | |
| { | |
| "epoch": 2.86, | |
| "grad_norm": 3.466031789779663, | |
| "learning_rate": 5.084385946290203e-06, | |
| "loss": 0.0459, | |
| "step": 8970 | |
| }, | |
| { | |
| "epoch": 2.86, | |
| "grad_norm": 8.049098014831543, | |
| "learning_rate": 4.97824010190001e-06, | |
| "loss": 0.1025, | |
| "step": 8980 | |
| }, | |
| { | |
| "epoch": 2.86, | |
| "grad_norm": 5.528136730194092, | |
| "learning_rate": 4.872094257509819e-06, | |
| "loss": 0.109, | |
| "step": 8990 | |
| }, | |
| { | |
| "epoch": 2.86, | |
| "grad_norm": 0.02654377557337284, | |
| "learning_rate": 4.765948413119627e-06, | |
| "loss": 0.1655, | |
| "step": 9000 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 9423, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 3, | |
| "save_steps": 500, | |
| "total_flos": 1.0453875280157082e+18, | |
| "train_batch_size": 4, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |