| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 3.8972542072630647, | |
| "eval_steps": 2000, | |
| "global_step": 22000, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.00354295837023915, | |
| "grad_norm": 10.283039659237906, | |
| "learning_rate": 3.5423308537017364e-08, | |
| "loss": 1.5344, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.0070859167404783, | |
| "grad_norm": 10.29248009558218, | |
| "learning_rate": 7.084661707403473e-08, | |
| "loss": 1.4742, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.010628875110717449, | |
| "grad_norm": 9.936150778118938, | |
| "learning_rate": 1.0626992561105209e-07, | |
| "loss": 1.4303, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.0141718334809566, | |
| "grad_norm": 10.764517644587348, | |
| "learning_rate": 1.4169323414806946e-07, | |
| "loss": 1.41, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.01771479185119575, | |
| "grad_norm": 7.66807491548178, | |
| "learning_rate": 1.7711654268508678e-07, | |
| "loss": 1.3917, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.021257750221434897, | |
| "grad_norm": 4.50717632446115, | |
| "learning_rate": 2.1253985122210417e-07, | |
| "loss": 1.4334, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.024800708591674048, | |
| "grad_norm": 5.252802352230719, | |
| "learning_rate": 2.479631597591215e-07, | |
| "loss": 1.3535, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.0283436669619132, | |
| "grad_norm": 6.854461866498012, | |
| "learning_rate": 2.833864682961389e-07, | |
| "loss": 1.327, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.03188662533215235, | |
| "grad_norm": 5.151839746898103, | |
| "learning_rate": 3.188097768331563e-07, | |
| "loss": 1.3742, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.0354295837023915, | |
| "grad_norm": 5.045147162247824, | |
| "learning_rate": 3.5423308537017355e-07, | |
| "loss": 1.3332, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.03897254207263064, | |
| "grad_norm": 4.683603653905373, | |
| "learning_rate": 3.89656393907191e-07, | |
| "loss": 1.2738, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.042515500442869794, | |
| "grad_norm": 3.3123783143845853, | |
| "learning_rate": 4.2507970244420835e-07, | |
| "loss": 1.3093, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.046058458813108945, | |
| "grad_norm": 7.022433229508388, | |
| "learning_rate": 4.605030109812257e-07, | |
| "loss": 1.3152, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.049601417183348095, | |
| "grad_norm": 4.657117511243846, | |
| "learning_rate": 4.95926319518243e-07, | |
| "loss": 1.2072, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.053144375553587246, | |
| "grad_norm": 3.8212763356692094, | |
| "learning_rate": 5.313496280552604e-07, | |
| "loss": 1.2571, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.0566873339238264, | |
| "grad_norm": 3.510893973188609, | |
| "learning_rate": 5.667729365922778e-07, | |
| "loss": 1.2607, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.06023029229406555, | |
| "grad_norm": 5.0086319180776195, | |
| "learning_rate": 6.021962451292952e-07, | |
| "loss": 1.197, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.0637732506643047, | |
| "grad_norm": 3.431634184610715, | |
| "learning_rate": 6.376195536663126e-07, | |
| "loss": 1.2377, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.06731620903454384, | |
| "grad_norm": 6.75935217841033, | |
| "learning_rate": 6.730428622033298e-07, | |
| "loss": 1.1773, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.070859167404783, | |
| "grad_norm": 4.209102695925099, | |
| "learning_rate": 7.084661707403471e-07, | |
| "loss": 1.219, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.07440212577502214, | |
| "grad_norm": 3.49834326636571, | |
| "learning_rate": 7.438894792773646e-07, | |
| "loss": 1.1681, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.07794508414526129, | |
| "grad_norm": 6.9169037458817515, | |
| "learning_rate": 7.79312787814382e-07, | |
| "loss": 1.159, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 0.08148804251550044, | |
| "grad_norm": 4.117418143738441, | |
| "learning_rate": 8.147360963513992e-07, | |
| "loss": 1.1361, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 0.08503100088573959, | |
| "grad_norm": 3.878686980846809, | |
| "learning_rate": 8.501594048884167e-07, | |
| "loss": 1.1408, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 0.08857395925597875, | |
| "grad_norm": 2.4629212636233513, | |
| "learning_rate": 8.85582713425434e-07, | |
| "loss": 1.1273, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.09211691762621789, | |
| "grad_norm": 3.5335784704034263, | |
| "learning_rate": 9.210060219624514e-07, | |
| "loss": 1.1072, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 0.09565987599645705, | |
| "grad_norm": 2.993397755869922, | |
| "learning_rate": 9.564293304994688e-07, | |
| "loss": 1.1374, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 0.09920283436669619, | |
| "grad_norm": 3.3645219648004074, | |
| "learning_rate": 9.91852639036486e-07, | |
| "loss": 1.0846, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 0.10274579273693533, | |
| "grad_norm": 3.5598287380775657, | |
| "learning_rate": 1.0272759475735035e-06, | |
| "loss": 1.1543, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 0.10628875110717449, | |
| "grad_norm": 5.756341944728728, | |
| "learning_rate": 1.0626992561105207e-06, | |
| "loss": 1.1218, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.10983170947741364, | |
| "grad_norm": 4.622099244697303, | |
| "learning_rate": 1.098122564647538e-06, | |
| "loss": 1.1217, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 0.1133746678476528, | |
| "grad_norm": 4.036105893234525, | |
| "learning_rate": 1.1335458731845557e-06, | |
| "loss": 1.1389, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 0.11691762621789194, | |
| "grad_norm": 4.152917559749059, | |
| "learning_rate": 1.1689691817215728e-06, | |
| "loss": 1.1605, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 0.1204605845881311, | |
| "grad_norm": 2.353928725309983, | |
| "learning_rate": 1.2043924902585904e-06, | |
| "loss": 1.1468, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 0.12400354295837024, | |
| "grad_norm": 3.628879073452012, | |
| "learning_rate": 1.2398157987956076e-06, | |
| "loss": 1.1481, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.1275465013286094, | |
| "grad_norm": 4.198170873976199, | |
| "learning_rate": 1.2752391073326251e-06, | |
| "loss": 1.1093, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 0.13108945969884853, | |
| "grad_norm": 3.269113557624318, | |
| "learning_rate": 1.3106624158696423e-06, | |
| "loss": 1.102, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 0.13463241806908768, | |
| "grad_norm": 4.528204224506456, | |
| "learning_rate": 1.3460857244066597e-06, | |
| "loss": 1.0894, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 0.13817537643932684, | |
| "grad_norm": 3.27137530368028, | |
| "learning_rate": 1.381509032943677e-06, | |
| "loss": 1.0747, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 0.141718334809566, | |
| "grad_norm": 7.233836600777667, | |
| "learning_rate": 1.4169323414806942e-06, | |
| "loss": 1.1337, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.14526129317980513, | |
| "grad_norm": 2.8025481056888815, | |
| "learning_rate": 1.4523556500177118e-06, | |
| "loss": 1.0662, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 0.1488042515500443, | |
| "grad_norm": 3.388696427420553, | |
| "learning_rate": 1.4877789585547292e-06, | |
| "loss": 1.0438, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 0.15234720992028344, | |
| "grad_norm": 4.710208067024261, | |
| "learning_rate": 1.5232022670917465e-06, | |
| "loss": 1.1523, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 0.15589016829052257, | |
| "grad_norm": 3.564554055568693, | |
| "learning_rate": 1.558625575628764e-06, | |
| "loss": 1.1362, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 0.15943312666076173, | |
| "grad_norm": 4.195782034527705, | |
| "learning_rate": 1.594048884165781e-06, | |
| "loss": 1.1197, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.1629760850310009, | |
| "grad_norm": 5.136529290856518, | |
| "learning_rate": 1.6294721927027984e-06, | |
| "loss": 1.0747, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 0.16651904340124005, | |
| "grad_norm": 3.0425557174200875, | |
| "learning_rate": 1.664895501239816e-06, | |
| "loss": 1.0078, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 0.17006200177147918, | |
| "grad_norm": 4.70753429709887, | |
| "learning_rate": 1.7003188097768334e-06, | |
| "loss": 1.0692, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 0.17360496014171833, | |
| "grad_norm": 2.352046892407463, | |
| "learning_rate": 1.7357421183138505e-06, | |
| "loss": 1.1146, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 0.1771479185119575, | |
| "grad_norm": 4.251351248901323, | |
| "learning_rate": 1.771165426850868e-06, | |
| "loss": 1.1281, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.18069087688219662, | |
| "grad_norm": 3.1244602580086203, | |
| "learning_rate": 1.8065887353878853e-06, | |
| "loss": 1.0573, | |
| "step": 1020 | |
| }, | |
| { | |
| "epoch": 0.18423383525243578, | |
| "grad_norm": 3.7998767293490014, | |
| "learning_rate": 1.8420120439249029e-06, | |
| "loss": 1.0572, | |
| "step": 1040 | |
| }, | |
| { | |
| "epoch": 0.18777679362267494, | |
| "grad_norm": 4.226400750966905, | |
| "learning_rate": 1.8774353524619202e-06, | |
| "loss": 1.0907, | |
| "step": 1060 | |
| }, | |
| { | |
| "epoch": 0.1913197519929141, | |
| "grad_norm": 8.680035093004253, | |
| "learning_rate": 1.9128586609989376e-06, | |
| "loss": 1.1316, | |
| "step": 1080 | |
| }, | |
| { | |
| "epoch": 0.19486271036315322, | |
| "grad_norm": 5.6932973340030175, | |
| "learning_rate": 1.9482819695359548e-06, | |
| "loss": 1.0741, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.19840566873339238, | |
| "grad_norm": 4.008840499431025, | |
| "learning_rate": 1.983705278072972e-06, | |
| "loss": 1.0599, | |
| "step": 1120 | |
| }, | |
| { | |
| "epoch": 0.20194862710363154, | |
| "grad_norm": 4.7089915409873555, | |
| "learning_rate": 2.0191285866099895e-06, | |
| "loss": 1.1045, | |
| "step": 1140 | |
| }, | |
| { | |
| "epoch": 0.20549158547387067, | |
| "grad_norm": 3.8841603187249665, | |
| "learning_rate": 2.054551895147007e-06, | |
| "loss": 1.1013, | |
| "step": 1160 | |
| }, | |
| { | |
| "epoch": 0.20903454384410983, | |
| "grad_norm": 2.5908607447164256, | |
| "learning_rate": 2.0899752036840243e-06, | |
| "loss": 1.0688, | |
| "step": 1180 | |
| }, | |
| { | |
| "epoch": 0.21257750221434898, | |
| "grad_norm": 4.285794532561674, | |
| "learning_rate": 2.1253985122210414e-06, | |
| "loss": 1.044, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.21612046058458814, | |
| "grad_norm": 5.061151481127176, | |
| "learning_rate": 2.160821820758059e-06, | |
| "loss": 1.0987, | |
| "step": 1220 | |
| }, | |
| { | |
| "epoch": 0.21966341895482727, | |
| "grad_norm": 2.4272321583338945, | |
| "learning_rate": 2.196245129295076e-06, | |
| "loss": 1.0697, | |
| "step": 1240 | |
| }, | |
| { | |
| "epoch": 0.22320637732506643, | |
| "grad_norm": 3.516050346917228, | |
| "learning_rate": 2.2316684378320937e-06, | |
| "loss": 1.0373, | |
| "step": 1260 | |
| }, | |
| { | |
| "epoch": 0.2267493356953056, | |
| "grad_norm": 5.174609559420662, | |
| "learning_rate": 2.2670917463691113e-06, | |
| "loss": 1.0212, | |
| "step": 1280 | |
| }, | |
| { | |
| "epoch": 0.23029229406554472, | |
| "grad_norm": 5.096030335997553, | |
| "learning_rate": 2.3025150549061285e-06, | |
| "loss": 1.1042, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 0.23383525243578387, | |
| "grad_norm": 3.21646646241324, | |
| "learning_rate": 2.3379383634431456e-06, | |
| "loss": 1.074, | |
| "step": 1320 | |
| }, | |
| { | |
| "epoch": 0.23737821080602303, | |
| "grad_norm": 4.102524624460841, | |
| "learning_rate": 2.3733616719801632e-06, | |
| "loss": 1.1245, | |
| "step": 1340 | |
| }, | |
| { | |
| "epoch": 0.2409211691762622, | |
| "grad_norm": 3.537479639297508, | |
| "learning_rate": 2.408784980517181e-06, | |
| "loss": 1.1179, | |
| "step": 1360 | |
| }, | |
| { | |
| "epoch": 0.24446412754650132, | |
| "grad_norm": 3.886486819810854, | |
| "learning_rate": 2.444208289054198e-06, | |
| "loss": 1.0651, | |
| "step": 1380 | |
| }, | |
| { | |
| "epoch": 0.24800708591674048, | |
| "grad_norm": 4.034282862676682, | |
| "learning_rate": 2.479631597591215e-06, | |
| "loss": 1.02, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 0.25155004428697963, | |
| "grad_norm": 3.0945919872830663, | |
| "learning_rate": 2.5150549061282327e-06, | |
| "loss": 1.0985, | |
| "step": 1420 | |
| }, | |
| { | |
| "epoch": 0.2550930026572188, | |
| "grad_norm": 3.634569400423284, | |
| "learning_rate": 2.5504782146652503e-06, | |
| "loss": 1.0805, | |
| "step": 1440 | |
| }, | |
| { | |
| "epoch": 0.25863596102745795, | |
| "grad_norm": 3.920774204411743, | |
| "learning_rate": 2.585901523202267e-06, | |
| "loss": 1.0352, | |
| "step": 1460 | |
| }, | |
| { | |
| "epoch": 0.26217891939769705, | |
| "grad_norm": 3.650545041007239, | |
| "learning_rate": 2.6213248317392846e-06, | |
| "loss": 1.0575, | |
| "step": 1480 | |
| }, | |
| { | |
| "epoch": 0.2657218777679362, | |
| "grad_norm": 3.8026274989044793, | |
| "learning_rate": 2.6567481402763018e-06, | |
| "loss": 1.0806, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.26926483613817537, | |
| "grad_norm": 4.1381541786166895, | |
| "learning_rate": 2.6921714488133194e-06, | |
| "loss": 1.0501, | |
| "step": 1520 | |
| }, | |
| { | |
| "epoch": 0.2728077945084145, | |
| "grad_norm": 5.519250816332529, | |
| "learning_rate": 2.727594757350337e-06, | |
| "loss": 1.0583, | |
| "step": 1540 | |
| }, | |
| { | |
| "epoch": 0.2763507528786537, | |
| "grad_norm": 4.595209023098072, | |
| "learning_rate": 2.763018065887354e-06, | |
| "loss": 1.0662, | |
| "step": 1560 | |
| }, | |
| { | |
| "epoch": 0.27989371124889284, | |
| "grad_norm": 3.540003351520752, | |
| "learning_rate": 2.7984413744243717e-06, | |
| "loss": 1.0286, | |
| "step": 1580 | |
| }, | |
| { | |
| "epoch": 0.283436669619132, | |
| "grad_norm": 3.4373968392712184, | |
| "learning_rate": 2.8338646829613884e-06, | |
| "loss": 1.0434, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 0.2869796279893711, | |
| "grad_norm": 17.453104932319967, | |
| "learning_rate": 2.869287991498406e-06, | |
| "loss": 1.0223, | |
| "step": 1620 | |
| }, | |
| { | |
| "epoch": 0.29052258635961026, | |
| "grad_norm": 2.6061492522441863, | |
| "learning_rate": 2.9047113000354236e-06, | |
| "loss": 1.0935, | |
| "step": 1640 | |
| }, | |
| { | |
| "epoch": 0.2940655447298494, | |
| "grad_norm": 4.139323586910726, | |
| "learning_rate": 2.9401346085724407e-06, | |
| "loss": 1.0754, | |
| "step": 1660 | |
| }, | |
| { | |
| "epoch": 0.2976085031000886, | |
| "grad_norm": 5.300424892826558, | |
| "learning_rate": 2.9755579171094583e-06, | |
| "loss": 1.0681, | |
| "step": 1680 | |
| }, | |
| { | |
| "epoch": 0.30115146147032773, | |
| "grad_norm": 3.649885398648624, | |
| "learning_rate": 3.0109812256464755e-06, | |
| "loss": 1.0922, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 0.3046944198405669, | |
| "grad_norm": 4.140426538668616, | |
| "learning_rate": 3.046404534183493e-06, | |
| "loss": 1.0644, | |
| "step": 1720 | |
| }, | |
| { | |
| "epoch": 0.30823737821080605, | |
| "grad_norm": 2.175231115055194, | |
| "learning_rate": 3.0818278427205106e-06, | |
| "loss": 1.0494, | |
| "step": 1740 | |
| }, | |
| { | |
| "epoch": 0.31178033658104515, | |
| "grad_norm": 3.028695259816387, | |
| "learning_rate": 3.117251151257528e-06, | |
| "loss": 1.0932, | |
| "step": 1760 | |
| }, | |
| { | |
| "epoch": 0.3153232949512843, | |
| "grad_norm": 2.7850683084236394, | |
| "learning_rate": 3.1526744597945454e-06, | |
| "loss": 1.0287, | |
| "step": 1780 | |
| }, | |
| { | |
| "epoch": 0.31886625332152346, | |
| "grad_norm": 2.933892885639913, | |
| "learning_rate": 3.188097768331562e-06, | |
| "loss": 1.0669, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 0.3224092116917626, | |
| "grad_norm": 4.948366022661806, | |
| "learning_rate": 3.2235210768685797e-06, | |
| "loss": 1.0984, | |
| "step": 1820 | |
| }, | |
| { | |
| "epoch": 0.3259521700620018, | |
| "grad_norm": 3.086993856127569, | |
| "learning_rate": 3.258944385405597e-06, | |
| "loss": 1.0421, | |
| "step": 1840 | |
| }, | |
| { | |
| "epoch": 0.32949512843224094, | |
| "grad_norm": 4.135810740344135, | |
| "learning_rate": 3.2943676939426144e-06, | |
| "loss": 0.9316, | |
| "step": 1860 | |
| }, | |
| { | |
| "epoch": 0.3330380868024801, | |
| "grad_norm": 2.7787248572400673, | |
| "learning_rate": 3.329791002479632e-06, | |
| "loss": 1.0354, | |
| "step": 1880 | |
| }, | |
| { | |
| "epoch": 0.3365810451727192, | |
| "grad_norm": 6.6200330325040815, | |
| "learning_rate": 3.365214311016649e-06, | |
| "loss": 1.0825, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 0.34012400354295835, | |
| "grad_norm": 3.9618002923502607, | |
| "learning_rate": 3.4006376195536668e-06, | |
| "loss": 1.0643, | |
| "step": 1920 | |
| }, | |
| { | |
| "epoch": 0.3436669619131975, | |
| "grad_norm": 2.9478604282057987, | |
| "learning_rate": 3.4360609280906835e-06, | |
| "loss": 1.0942, | |
| "step": 1940 | |
| }, | |
| { | |
| "epoch": 0.34720992028343667, | |
| "grad_norm": 3.1696939381732596, | |
| "learning_rate": 3.471484236627701e-06, | |
| "loss": 1.0848, | |
| "step": 1960 | |
| }, | |
| { | |
| "epoch": 0.3507528786536758, | |
| "grad_norm": 2.610545027614052, | |
| "learning_rate": 3.5069075451647187e-06, | |
| "loss": 1.0424, | |
| "step": 1980 | |
| }, | |
| { | |
| "epoch": 0.354295837023915, | |
| "grad_norm": 2.8653023342432844, | |
| "learning_rate": 3.542330853701736e-06, | |
| "loss": 1.0376, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.354295837023915, | |
| "eval_loss": 0.9136635661125183, | |
| "eval_runtime": 366.8623, | |
| "eval_samples_per_second": 25.914, | |
| "eval_steps_per_second": 3.241, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.35783879539415414, | |
| "grad_norm": 5.501631748880912, | |
| "learning_rate": 3.5777541622387534e-06, | |
| "loss": 1.0794, | |
| "step": 2020 | |
| }, | |
| { | |
| "epoch": 0.36138175376439324, | |
| "grad_norm": 3.9781584018724216, | |
| "learning_rate": 3.6131774707757706e-06, | |
| "loss": 1.0918, | |
| "step": 2040 | |
| }, | |
| { | |
| "epoch": 0.3649247121346324, | |
| "grad_norm": 5.9653615606161035, | |
| "learning_rate": 3.648600779312788e-06, | |
| "loss": 1.0281, | |
| "step": 2060 | |
| }, | |
| { | |
| "epoch": 0.36846767050487156, | |
| "grad_norm": 4.911079902501515, | |
| "learning_rate": 3.6840240878498057e-06, | |
| "loss": 1.0565, | |
| "step": 2080 | |
| }, | |
| { | |
| "epoch": 0.3720106288751107, | |
| "grad_norm": 6.677202780526525, | |
| "learning_rate": 3.719447396386823e-06, | |
| "loss": 1.0622, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 0.3755535872453499, | |
| "grad_norm": 2.9957559478511513, | |
| "learning_rate": 3.7548707049238405e-06, | |
| "loss": 1.014, | |
| "step": 2120 | |
| }, | |
| { | |
| "epoch": 0.37909654561558903, | |
| "grad_norm": 6.136487010459827, | |
| "learning_rate": 3.7902940134608572e-06, | |
| "loss": 1.0463, | |
| "step": 2140 | |
| }, | |
| { | |
| "epoch": 0.3826395039858282, | |
| "grad_norm": 2.6989289543608987, | |
| "learning_rate": 3.825717321997875e-06, | |
| "loss": 1.0334, | |
| "step": 2160 | |
| }, | |
| { | |
| "epoch": 0.3861824623560673, | |
| "grad_norm": 2.8559280148544715, | |
| "learning_rate": 3.861140630534892e-06, | |
| "loss": 0.9961, | |
| "step": 2180 | |
| }, | |
| { | |
| "epoch": 0.38972542072630645, | |
| "grad_norm": 3.9195236355689618, | |
| "learning_rate": 3.8965639390719095e-06, | |
| "loss": 1.0501, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 0.3932683790965456, | |
| "grad_norm": 4.745715075717865, | |
| "learning_rate": 3.9319872476089276e-06, | |
| "loss": 1.0532, | |
| "step": 2220 | |
| }, | |
| { | |
| "epoch": 0.39681133746678476, | |
| "grad_norm": 3.1678711880303365, | |
| "learning_rate": 3.967410556145944e-06, | |
| "loss": 1.0464, | |
| "step": 2240 | |
| }, | |
| { | |
| "epoch": 0.4003542958370239, | |
| "grad_norm": 4.318491084289353, | |
| "learning_rate": 4.002833864682962e-06, | |
| "loss": 1.01, | |
| "step": 2260 | |
| }, | |
| { | |
| "epoch": 0.4038972542072631, | |
| "grad_norm": 3.877214772420464, | |
| "learning_rate": 4.038257173219979e-06, | |
| "loss": 1.04, | |
| "step": 2280 | |
| }, | |
| { | |
| "epoch": 0.40744021257750224, | |
| "grad_norm": 4.408726611386237, | |
| "learning_rate": 4.073680481756996e-06, | |
| "loss": 0.9864, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 0.41098317094774134, | |
| "grad_norm": 3.1476639776400264, | |
| "learning_rate": 4.109103790294014e-06, | |
| "loss": 1.0471, | |
| "step": 2320 | |
| }, | |
| { | |
| "epoch": 0.4145261293179805, | |
| "grad_norm": 4.057188755394368, | |
| "learning_rate": 4.144527098831031e-06, | |
| "loss": 1.0187, | |
| "step": 2340 | |
| }, | |
| { | |
| "epoch": 0.41806908768821965, | |
| "grad_norm": 3.7443003760493547, | |
| "learning_rate": 4.1799504073680485e-06, | |
| "loss": 1.0207, | |
| "step": 2360 | |
| }, | |
| { | |
| "epoch": 0.4216120460584588, | |
| "grad_norm": 3.4133153204439375, | |
| "learning_rate": 4.215373715905066e-06, | |
| "loss": 1.0412, | |
| "step": 2380 | |
| }, | |
| { | |
| "epoch": 0.42515500442869797, | |
| "grad_norm": 5.271529700638458, | |
| "learning_rate": 4.250797024442083e-06, | |
| "loss": 1.0446, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 0.4286979627989371, | |
| "grad_norm": 3.690737939017104, | |
| "learning_rate": 4.286220332979101e-06, | |
| "loss": 1.0893, | |
| "step": 2420 | |
| }, | |
| { | |
| "epoch": 0.4322409211691763, | |
| "grad_norm": 4.6971388539053445, | |
| "learning_rate": 4.321643641516118e-06, | |
| "loss": 1.0552, | |
| "step": 2440 | |
| }, | |
| { | |
| "epoch": 0.4357838795394154, | |
| "grad_norm": 3.833304687965468, | |
| "learning_rate": 4.357066950053135e-06, | |
| "loss": 1.018, | |
| "step": 2460 | |
| }, | |
| { | |
| "epoch": 0.43932683790965454, | |
| "grad_norm": 3.876707930916304, | |
| "learning_rate": 4.392490258590152e-06, | |
| "loss": 1.0593, | |
| "step": 2480 | |
| }, | |
| { | |
| "epoch": 0.4428697962798937, | |
| "grad_norm": 4.485093155652708, | |
| "learning_rate": 4.42791356712717e-06, | |
| "loss": 1.0315, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.44641275465013286, | |
| "grad_norm": 3.240309715459973, | |
| "learning_rate": 4.4633368756641875e-06, | |
| "loss": 1.0228, | |
| "step": 2520 | |
| }, | |
| { | |
| "epoch": 0.449955713020372, | |
| "grad_norm": 2.8577216948048085, | |
| "learning_rate": 4.498760184201205e-06, | |
| "loss": 0.9622, | |
| "step": 2540 | |
| }, | |
| { | |
| "epoch": 0.4534986713906112, | |
| "grad_norm": 2.3204510234528004, | |
| "learning_rate": 4.534183492738223e-06, | |
| "loss": 1.0417, | |
| "step": 2560 | |
| }, | |
| { | |
| "epoch": 0.45704162976085033, | |
| "grad_norm": 4.8495156054088655, | |
| "learning_rate": 4.569606801275239e-06, | |
| "loss": 1.0108, | |
| "step": 2580 | |
| }, | |
| { | |
| "epoch": 0.46058458813108943, | |
| "grad_norm": 5.060714565551563, | |
| "learning_rate": 4.605030109812257e-06, | |
| "loss": 1.0303, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 0.4641275465013286, | |
| "grad_norm": 6.095607076544207, | |
| "learning_rate": 4.640453418349274e-06, | |
| "loss": 1.0116, | |
| "step": 2620 | |
| }, | |
| { | |
| "epoch": 0.46767050487156775, | |
| "grad_norm": 2.9232128503389183, | |
| "learning_rate": 4.675876726886291e-06, | |
| "loss": 0.983, | |
| "step": 2640 | |
| }, | |
| { | |
| "epoch": 0.4712134632418069, | |
| "grad_norm": 2.36685887518906, | |
| "learning_rate": 4.711300035423309e-06, | |
| "loss": 1.0277, | |
| "step": 2660 | |
| }, | |
| { | |
| "epoch": 0.47475642161204606, | |
| "grad_norm": 4.128356071985117, | |
| "learning_rate": 4.7467233439603264e-06, | |
| "loss": 1.0411, | |
| "step": 2680 | |
| }, | |
| { | |
| "epoch": 0.4782993799822852, | |
| "grad_norm": 2.7297669503368804, | |
| "learning_rate": 4.782146652497344e-06, | |
| "loss": 1.0553, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 0.4818423383525244, | |
| "grad_norm": 2.7607226135533103, | |
| "learning_rate": 4.817569961034362e-06, | |
| "loss": 1.0547, | |
| "step": 2720 | |
| }, | |
| { | |
| "epoch": 0.4853852967227635, | |
| "grad_norm": 4.187021213318743, | |
| "learning_rate": 4.852993269571378e-06, | |
| "loss": 1.0437, | |
| "step": 2740 | |
| }, | |
| { | |
| "epoch": 0.48892825509300264, | |
| "grad_norm": 3.7584391728566695, | |
| "learning_rate": 4.888416578108396e-06, | |
| "loss": 1.035, | |
| "step": 2760 | |
| }, | |
| { | |
| "epoch": 0.4924712134632418, | |
| "grad_norm": 4.830912639451228, | |
| "learning_rate": 4.923839886645413e-06, | |
| "loss": 1.0675, | |
| "step": 2780 | |
| }, | |
| { | |
| "epoch": 0.49601417183348095, | |
| "grad_norm": 4.685752123568836, | |
| "learning_rate": 4.95926319518243e-06, | |
| "loss": 1.0493, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 0.4995571302037201, | |
| "grad_norm": 3.375680932358239, | |
| "learning_rate": 4.994686503719448e-06, | |
| "loss": 1.0686, | |
| "step": 2820 | |
| }, | |
| { | |
| "epoch": 0.5031000885739593, | |
| "grad_norm": 2.4729024999298534, | |
| "learning_rate": 4.999994474499561e-06, | |
| "loss": 1.0283, | |
| "step": 2840 | |
| }, | |
| { | |
| "epoch": 0.5066430469441984, | |
| "grad_norm": 4.50380809454446, | |
| "learning_rate": 4.999973825606614e-06, | |
| "loss": 1.0188, | |
| "step": 2860 | |
| }, | |
| { | |
| "epoch": 0.5101860053144376, | |
| "grad_norm": 3.418425163716614, | |
| "learning_rate": 4.999937881373025e-06, | |
| "loss": 1.0617, | |
| "step": 2880 | |
| }, | |
| { | |
| "epoch": 0.5137289636846767, | |
| "grad_norm": 3.6088000942292164, | |
| "learning_rate": 4.999886642018707e-06, | |
| "loss": 1.0723, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 0.5172719220549159, | |
| "grad_norm": 6.244089618662361, | |
| "learning_rate": 4.999820107857154e-06, | |
| "loss": 1.0662, | |
| "step": 2920 | |
| }, | |
| { | |
| "epoch": 0.520814880425155, | |
| "grad_norm": 2.609964349424898, | |
| "learning_rate": 4.999738279295433e-06, | |
| "loss": 1.0324, | |
| "step": 2940 | |
| }, | |
| { | |
| "epoch": 0.5243578387953941, | |
| "grad_norm": 2.679897624387073, | |
| "learning_rate": 4.9996411568341896e-06, | |
| "loss": 1.0207, | |
| "step": 2960 | |
| }, | |
| { | |
| "epoch": 0.5279007971656333, | |
| "grad_norm": 3.27573058708962, | |
| "learning_rate": 4.999528741067638e-06, | |
| "loss": 1.0939, | |
| "step": 2980 | |
| }, | |
| { | |
| "epoch": 0.5314437555358724, | |
| "grad_norm": 3.3784052333905756, | |
| "learning_rate": 4.99940103268356e-06, | |
| "loss": 1.0193, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.5349867139061116, | |
| "grad_norm": 4.924298481099745, | |
| "learning_rate": 4.999258032463301e-06, | |
| "loss": 1.1053, | |
| "step": 3020 | |
| }, | |
| { | |
| "epoch": 0.5385296722763507, | |
| "grad_norm": 4.1853626057858895, | |
| "learning_rate": 4.999099741281766e-06, | |
| "loss": 1.0337, | |
| "step": 3040 | |
| }, | |
| { | |
| "epoch": 0.54207263064659, | |
| "grad_norm": 3.5502444781095104, | |
| "learning_rate": 4.998926160107411e-06, | |
| "loss": 1.0786, | |
| "step": 3060 | |
| }, | |
| { | |
| "epoch": 0.545615589016829, | |
| "grad_norm": 3.335300103709776, | |
| "learning_rate": 4.998737290002241e-06, | |
| "loss": 1.0507, | |
| "step": 3080 | |
| }, | |
| { | |
| "epoch": 0.5491585473870682, | |
| "grad_norm": 3.114815124169259, | |
| "learning_rate": 4.9985331321218e-06, | |
| "loss": 1.0352, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 0.5527015057573074, | |
| "grad_norm": 4.900399906908037, | |
| "learning_rate": 4.998313687715169e-06, | |
| "loss": 1.0244, | |
| "step": 3120 | |
| }, | |
| { | |
| "epoch": 0.5562444641275465, | |
| "grad_norm": 2.853630801128127, | |
| "learning_rate": 4.9980789581249515e-06, | |
| "loss": 1.0552, | |
| "step": 3140 | |
| }, | |
| { | |
| "epoch": 0.5597874224977857, | |
| "grad_norm": 3.824968946809653, | |
| "learning_rate": 4.9978289447872695e-06, | |
| "loss": 1.0109, | |
| "step": 3160 | |
| }, | |
| { | |
| "epoch": 0.5633303808680248, | |
| "grad_norm": 3.6496584120718314, | |
| "learning_rate": 4.997563649231755e-06, | |
| "loss": 1.0097, | |
| "step": 3180 | |
| }, | |
| { | |
| "epoch": 0.566873339238264, | |
| "grad_norm": 4.089881200621581, | |
| "learning_rate": 4.997283073081541e-06, | |
| "loss": 1.0687, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 0.5704162976085031, | |
| "grad_norm": 4.25767273401204, | |
| "learning_rate": 4.996987218053247e-06, | |
| "loss": 1.0032, | |
| "step": 3220 | |
| }, | |
| { | |
| "epoch": 0.5739592559787422, | |
| "grad_norm": 4.054242038282677, | |
| "learning_rate": 4.996676085956973e-06, | |
| "loss": 1.0109, | |
| "step": 3240 | |
| }, | |
| { | |
| "epoch": 0.5775022143489814, | |
| "grad_norm": 2.2212311640306934, | |
| "learning_rate": 4.996349678696288e-06, | |
| "loss": 0.9873, | |
| "step": 3260 | |
| }, | |
| { | |
| "epoch": 0.5810451727192205, | |
| "grad_norm": 2.910691796089737, | |
| "learning_rate": 4.996007998268219e-06, | |
| "loss": 1.0389, | |
| "step": 3280 | |
| }, | |
| { | |
| "epoch": 0.5845881310894597, | |
| "grad_norm": 1.804064223013201, | |
| "learning_rate": 4.995651046763232e-06, | |
| "loss": 1.0065, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 0.5881310894596988, | |
| "grad_norm": 1.787168345455913, | |
| "learning_rate": 4.99527882636523e-06, | |
| "loss": 0.9964, | |
| "step": 3320 | |
| }, | |
| { | |
| "epoch": 0.591674047829938, | |
| "grad_norm": 2.6825449022104584, | |
| "learning_rate": 4.99489133935153e-06, | |
| "loss": 1.0113, | |
| "step": 3340 | |
| }, | |
| { | |
| "epoch": 0.5952170062001771, | |
| "grad_norm": 3.3430457632929986, | |
| "learning_rate": 4.9944885880928576e-06, | |
| "loss": 1.0159, | |
| "step": 3360 | |
| }, | |
| { | |
| "epoch": 0.5987599645704162, | |
| "grad_norm": 3.4188233454866777, | |
| "learning_rate": 4.994070575053324e-06, | |
| "loss": 1.0332, | |
| "step": 3380 | |
| }, | |
| { | |
| "epoch": 0.6023029229406555, | |
| "grad_norm": 5.035300065424226, | |
| "learning_rate": 4.993637302790417e-06, | |
| "loss": 1.0072, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 0.6058458813108946, | |
| "grad_norm": 2.4629577630265067, | |
| "learning_rate": 4.9931887739549845e-06, | |
| "loss": 1.0246, | |
| "step": 3420 | |
| }, | |
| { | |
| "epoch": 0.6093888396811338, | |
| "grad_norm": 3.2222908571387605, | |
| "learning_rate": 4.9927249912912135e-06, | |
| "loss": 1.1202, | |
| "step": 3440 | |
| }, | |
| { | |
| "epoch": 0.6129317980513729, | |
| "grad_norm": 3.329571486141443, | |
| "learning_rate": 4.99224595763662e-06, | |
| "loss": 1.0052, | |
| "step": 3460 | |
| }, | |
| { | |
| "epoch": 0.6164747564216121, | |
| "grad_norm": 3.2262886392708023, | |
| "learning_rate": 4.991751675922029e-06, | |
| "loss": 1.0563, | |
| "step": 3480 | |
| }, | |
| { | |
| "epoch": 0.6200177147918512, | |
| "grad_norm": 2.430384347609413, | |
| "learning_rate": 4.991242149171554e-06, | |
| "loss": 1.0084, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 0.6235606731620903, | |
| "grad_norm": 2.2313346912151437, | |
| "learning_rate": 4.990717380502581e-06, | |
| "loss": 1.1098, | |
| "step": 3520 | |
| }, | |
| { | |
| "epoch": 0.6271036315323295, | |
| "grad_norm": 3.269110632691307, | |
| "learning_rate": 4.990177373125752e-06, | |
| "loss": 0.9681, | |
| "step": 3540 | |
| }, | |
| { | |
| "epoch": 0.6306465899025686, | |
| "grad_norm": 2.8574079693862213, | |
| "learning_rate": 4.989622130344939e-06, | |
| "loss": 1.0192, | |
| "step": 3560 | |
| }, | |
| { | |
| "epoch": 0.6341895482728078, | |
| "grad_norm": 3.663783109216899, | |
| "learning_rate": 4.989051655557228e-06, | |
| "loss": 0.9997, | |
| "step": 3580 | |
| }, | |
| { | |
| "epoch": 0.6377325066430469, | |
| "grad_norm": 3.3537695839211916, | |
| "learning_rate": 4.9884659522528985e-06, | |
| "loss": 0.9669, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 0.6412754650132861, | |
| "grad_norm": 2.809218376275673, | |
| "learning_rate": 4.987865024015401e-06, | |
| "loss": 1.0155, | |
| "step": 3620 | |
| }, | |
| { | |
| "epoch": 0.6448184233835252, | |
| "grad_norm": 5.035643249923235, | |
| "learning_rate": 4.9872488745213356e-06, | |
| "loss": 1.0125, | |
| "step": 3640 | |
| }, | |
| { | |
| "epoch": 0.6483613817537643, | |
| "grad_norm": 3.6981075116685798, | |
| "learning_rate": 4.986617507540426e-06, | |
| "loss": 0.9861, | |
| "step": 3660 | |
| }, | |
| { | |
| "epoch": 0.6519043401240036, | |
| "grad_norm": 2.5959218064380725, | |
| "learning_rate": 4.985970926935504e-06, | |
| "loss": 1.0936, | |
| "step": 3680 | |
| }, | |
| { | |
| "epoch": 0.6554472984942427, | |
| "grad_norm": 2.72099024631383, | |
| "learning_rate": 4.985309136662478e-06, | |
| "loss": 1.0458, | |
| "step": 3700 | |
| }, | |
| { | |
| "epoch": 0.6589902568644819, | |
| "grad_norm": 2.3741120569873937, | |
| "learning_rate": 4.984632140770314e-06, | |
| "loss": 0.9733, | |
| "step": 3720 | |
| }, | |
| { | |
| "epoch": 0.662533215234721, | |
| "grad_norm": 2.9843690685487316, | |
| "learning_rate": 4.983939943401009e-06, | |
| "loss": 0.9865, | |
| "step": 3740 | |
| }, | |
| { | |
| "epoch": 0.6660761736049602, | |
| "grad_norm": 3.753859508812206, | |
| "learning_rate": 4.9832325487895625e-06, | |
| "loss": 1.0373, | |
| "step": 3760 | |
| }, | |
| { | |
| "epoch": 0.6696191319751993, | |
| "grad_norm": 2.36323312181139, | |
| "learning_rate": 4.98250996126396e-06, | |
| "loss": 1.0007, | |
| "step": 3780 | |
| }, | |
| { | |
| "epoch": 0.6731620903454384, | |
| "grad_norm": 2.4128489083499916, | |
| "learning_rate": 4.981772185245135e-06, | |
| "loss": 1.0155, | |
| "step": 3800 | |
| }, | |
| { | |
| "epoch": 0.6767050487156776, | |
| "grad_norm": 4.471122710526832, | |
| "learning_rate": 4.98101922524695e-06, | |
| "loss": 1.0625, | |
| "step": 3820 | |
| }, | |
| { | |
| "epoch": 0.6802480070859167, | |
| "grad_norm": 3.9450022308941035, | |
| "learning_rate": 4.980251085876163e-06, | |
| "loss": 1.0608, | |
| "step": 3840 | |
| }, | |
| { | |
| "epoch": 0.6837909654561559, | |
| "grad_norm": 3.750505881130023, | |
| "learning_rate": 4.979467771832407e-06, | |
| "loss": 1.0401, | |
| "step": 3860 | |
| }, | |
| { | |
| "epoch": 0.687333923826395, | |
| "grad_norm": 3.0818679668346918, | |
| "learning_rate": 4.978669287908152e-06, | |
| "loss": 0.9782, | |
| "step": 3880 | |
| }, | |
| { | |
| "epoch": 0.6908768821966342, | |
| "grad_norm": 3.585328199676442, | |
| "learning_rate": 4.9778556389886836e-06, | |
| "loss": 1.0293, | |
| "step": 3900 | |
| }, | |
| { | |
| "epoch": 0.6944198405668733, | |
| "grad_norm": 3.352267909342498, | |
| "learning_rate": 4.97702683005207e-06, | |
| "loss": 1.0443, | |
| "step": 3920 | |
| }, | |
| { | |
| "epoch": 0.6979627989371124, | |
| "grad_norm": 2.740609338046614, | |
| "learning_rate": 4.976182866169128e-06, | |
| "loss": 0.983, | |
| "step": 3940 | |
| }, | |
| { | |
| "epoch": 0.7015057573073517, | |
| "grad_norm": 3.815853619798988, | |
| "learning_rate": 4.9753237525033995e-06, | |
| "loss": 1.0241, | |
| "step": 3960 | |
| }, | |
| { | |
| "epoch": 0.7050487156775908, | |
| "grad_norm": 2.056928416192008, | |
| "learning_rate": 4.974449494311113e-06, | |
| "loss": 0.935, | |
| "step": 3980 | |
| }, | |
| { | |
| "epoch": 0.70859167404783, | |
| "grad_norm": 4.501534397772864, | |
| "learning_rate": 4.973560096941157e-06, | |
| "loss": 1.0417, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 0.70859167404783, | |
| "eval_loss": 0.8831750154495239, | |
| "eval_runtime": 377.54, | |
| "eval_samples_per_second": 25.181, | |
| "eval_steps_per_second": 3.149, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 0.7121346324180691, | |
| "grad_norm": 4.163953326960491, | |
| "learning_rate": 4.97265556583504e-06, | |
| "loss": 0.9787, | |
| "step": 4020 | |
| }, | |
| { | |
| "epoch": 0.7156775907883083, | |
| "grad_norm": 2.1612827561442143, | |
| "learning_rate": 4.971735906526867e-06, | |
| "loss": 1.0187, | |
| "step": 4040 | |
| }, | |
| { | |
| "epoch": 0.7192205491585474, | |
| "grad_norm": 4.132524372144837, | |
| "learning_rate": 4.9708011246432954e-06, | |
| "loss": 1.0447, | |
| "step": 4060 | |
| }, | |
| { | |
| "epoch": 0.7227635075287865, | |
| "grad_norm": 4.902391594790166, | |
| "learning_rate": 4.969851225903511e-06, | |
| "loss": 1.0849, | |
| "step": 4080 | |
| }, | |
| { | |
| "epoch": 0.7263064658990257, | |
| "grad_norm": 4.040812745701054, | |
| "learning_rate": 4.968886216119181e-06, | |
| "loss": 0.9977, | |
| "step": 4100 | |
| }, | |
| { | |
| "epoch": 0.7298494242692648, | |
| "grad_norm": 4.403191939391695, | |
| "learning_rate": 4.967906101194432e-06, | |
| "loss": 1.0151, | |
| "step": 4120 | |
| }, | |
| { | |
| "epoch": 0.733392382639504, | |
| "grad_norm": 4.888547155083341, | |
| "learning_rate": 4.9669108871258005e-06, | |
| "loss": 1.0488, | |
| "step": 4140 | |
| }, | |
| { | |
| "epoch": 0.7369353410097431, | |
| "grad_norm": 2.5511865400666247, | |
| "learning_rate": 4.965900580002208e-06, | |
| "loss": 0.9839, | |
| "step": 4160 | |
| }, | |
| { | |
| "epoch": 0.7404782993799823, | |
| "grad_norm": 3.7363422093828267, | |
| "learning_rate": 4.9648751860049146e-06, | |
| "loss": 0.9671, | |
| "step": 4180 | |
| }, | |
| { | |
| "epoch": 0.7440212577502214, | |
| "grad_norm": 4.5882225405684105, | |
| "learning_rate": 4.963834711407487e-06, | |
| "loss": 1.0153, | |
| "step": 4200 | |
| }, | |
| { | |
| "epoch": 0.7475642161204605, | |
| "grad_norm": 3.3728967510985175, | |
| "learning_rate": 4.962779162575757e-06, | |
| "loss": 0.9866, | |
| "step": 4220 | |
| }, | |
| { | |
| "epoch": 0.7511071744906997, | |
| "grad_norm": 3.3582439513903695, | |
| "learning_rate": 4.961708545967782e-06, | |
| "loss": 1.0012, | |
| "step": 4240 | |
| }, | |
| { | |
| "epoch": 0.7546501328609388, | |
| "grad_norm": 4.229044845753943, | |
| "learning_rate": 4.960622868133811e-06, | |
| "loss": 1.0264, | |
| "step": 4260 | |
| }, | |
| { | |
| "epoch": 0.7581930912311781, | |
| "grad_norm": 3.604122128833424, | |
| "learning_rate": 4.959522135716238e-06, | |
| "loss": 1.0334, | |
| "step": 4280 | |
| }, | |
| { | |
| "epoch": 0.7617360496014172, | |
| "grad_norm": 4.330367291558989, | |
| "learning_rate": 4.958406355449564e-06, | |
| "loss": 1.0528, | |
| "step": 4300 | |
| }, | |
| { | |
| "epoch": 0.7652790079716564, | |
| "grad_norm": 3.96229330754017, | |
| "learning_rate": 4.957275534160356e-06, | |
| "loss": 1.0142, | |
| "step": 4320 | |
| }, | |
| { | |
| "epoch": 0.7688219663418955, | |
| "grad_norm": 1.880123067964198, | |
| "learning_rate": 4.956129678767206e-06, | |
| "loss": 0.9585, | |
| "step": 4340 | |
| }, | |
| { | |
| "epoch": 0.7723649247121346, | |
| "grad_norm": 2.9357741019622026, | |
| "learning_rate": 4.954968796280685e-06, | |
| "loss": 1.0118, | |
| "step": 4360 | |
| }, | |
| { | |
| "epoch": 0.7759078830823738, | |
| "grad_norm": 5.337878559961154, | |
| "learning_rate": 4.953792893803308e-06, | |
| "loss": 0.96, | |
| "step": 4380 | |
| }, | |
| { | |
| "epoch": 0.7794508414526129, | |
| "grad_norm": 2.7737448076628146, | |
| "learning_rate": 4.952601978529479e-06, | |
| "loss": 1.0095, | |
| "step": 4400 | |
| }, | |
| { | |
| "epoch": 0.7829937998228521, | |
| "grad_norm": 2.81198860896816, | |
| "learning_rate": 4.951396057745457e-06, | |
| "loss": 1.0025, | |
| "step": 4420 | |
| }, | |
| { | |
| "epoch": 0.7865367581930912, | |
| "grad_norm": 3.2768364907663843, | |
| "learning_rate": 4.950175138829306e-06, | |
| "loss": 1.0062, | |
| "step": 4440 | |
| }, | |
| { | |
| "epoch": 0.7900797165633304, | |
| "grad_norm": 4.503697644314717, | |
| "learning_rate": 4.948939229250855e-06, | |
| "loss": 0.9866, | |
| "step": 4460 | |
| }, | |
| { | |
| "epoch": 0.7936226749335695, | |
| "grad_norm": 3.709697666688961, | |
| "learning_rate": 4.947688336571644e-06, | |
| "loss": 1.0234, | |
| "step": 4480 | |
| }, | |
| { | |
| "epoch": 0.7971656333038086, | |
| "grad_norm": 2.9177293521142804, | |
| "learning_rate": 4.946422468444886e-06, | |
| "loss": 0.9501, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 0.8007085916740478, | |
| "grad_norm": 3.3045345091796197, | |
| "learning_rate": 4.945141632615416e-06, | |
| "loss": 1.0335, | |
| "step": 4520 | |
| }, | |
| { | |
| "epoch": 0.804251550044287, | |
| "grad_norm": 4.0701401436058795, | |
| "learning_rate": 4.943845836919642e-06, | |
| "loss": 1.0438, | |
| "step": 4540 | |
| }, | |
| { | |
| "epoch": 0.8077945084145262, | |
| "grad_norm": 3.9759562983887213, | |
| "learning_rate": 4.942535089285505e-06, | |
| "loss": 1.0283, | |
| "step": 4560 | |
| }, | |
| { | |
| "epoch": 0.8113374667847653, | |
| "grad_norm": 5.075198666819325, | |
| "learning_rate": 4.9412093977324196e-06, | |
| "loss": 1.0087, | |
| "step": 4580 | |
| }, | |
| { | |
| "epoch": 0.8148804251550045, | |
| "grad_norm": 4.64113304477617, | |
| "learning_rate": 4.9398687703712324e-06, | |
| "loss": 1.0335, | |
| "step": 4600 | |
| }, | |
| { | |
| "epoch": 0.8184233835252436, | |
| "grad_norm": 3.6121906044220835, | |
| "learning_rate": 4.938513215404171e-06, | |
| "loss": 1.036, | |
| "step": 4620 | |
| }, | |
| { | |
| "epoch": 0.8219663418954827, | |
| "grad_norm": 2.8641112530538297, | |
| "learning_rate": 4.9371427411247905e-06, | |
| "loss": 0.9476, | |
| "step": 4640 | |
| }, | |
| { | |
| "epoch": 0.8255093002657219, | |
| "grad_norm": 2.2471640500283194, | |
| "learning_rate": 4.935757355917929e-06, | |
| "loss": 1.0, | |
| "step": 4660 | |
| }, | |
| { | |
| "epoch": 0.829052258635961, | |
| "grad_norm": 2.6668789188777553, | |
| "learning_rate": 4.93435706825965e-06, | |
| "loss": 1.042, | |
| "step": 4680 | |
| }, | |
| { | |
| "epoch": 0.8325952170062002, | |
| "grad_norm": 3.382540393461139, | |
| "learning_rate": 4.932941886717193e-06, | |
| "loss": 0.9925, | |
| "step": 4700 | |
| }, | |
| { | |
| "epoch": 0.8361381753764393, | |
| "grad_norm": 2.9612515882054224, | |
| "learning_rate": 4.931511819948924e-06, | |
| "loss": 1.0038, | |
| "step": 4720 | |
| }, | |
| { | |
| "epoch": 0.8396811337466785, | |
| "grad_norm": 5.805032752593361, | |
| "learning_rate": 4.930066876704276e-06, | |
| "loss": 1.0752, | |
| "step": 4740 | |
| }, | |
| { | |
| "epoch": 0.8432240921169176, | |
| "grad_norm": 2.906268834013272, | |
| "learning_rate": 4.9286070658237025e-06, | |
| "loss": 0.9574, | |
| "step": 4760 | |
| }, | |
| { | |
| "epoch": 0.8467670504871567, | |
| "grad_norm": 3.908291806399892, | |
| "learning_rate": 4.9271323962386185e-06, | |
| "loss": 0.9355, | |
| "step": 4780 | |
| }, | |
| { | |
| "epoch": 0.8503100088573959, | |
| "grad_norm": 2.862223610977262, | |
| "learning_rate": 4.925642876971347e-06, | |
| "loss": 0.9913, | |
| "step": 4800 | |
| }, | |
| { | |
| "epoch": 0.853852967227635, | |
| "grad_norm": 3.885251950370837, | |
| "learning_rate": 4.924138517135068e-06, | |
| "loss": 0.9437, | |
| "step": 4820 | |
| }, | |
| { | |
| "epoch": 0.8573959255978743, | |
| "grad_norm": 2.687403470850269, | |
| "learning_rate": 4.922619325933753e-06, | |
| "loss": 1.0183, | |
| "step": 4840 | |
| }, | |
| { | |
| "epoch": 0.8609388839681134, | |
| "grad_norm": 5.1744645832504945, | |
| "learning_rate": 4.921085312662119e-06, | |
| "loss": 0.9639, | |
| "step": 4860 | |
| }, | |
| { | |
| "epoch": 0.8644818423383526, | |
| "grad_norm": 4.73053239403457, | |
| "learning_rate": 4.919536486705569e-06, | |
| "loss": 1.0124, | |
| "step": 4880 | |
| }, | |
| { | |
| "epoch": 0.8680248007085917, | |
| "grad_norm": 4.4563037107783865, | |
| "learning_rate": 4.917972857540126e-06, | |
| "loss": 0.99, | |
| "step": 4900 | |
| }, | |
| { | |
| "epoch": 0.8715677590788308, | |
| "grad_norm": 4.614255774835929, | |
| "learning_rate": 4.916394434732391e-06, | |
| "loss": 1.0037, | |
| "step": 4920 | |
| }, | |
| { | |
| "epoch": 0.87511071744907, | |
| "grad_norm": 3.533058939111727, | |
| "learning_rate": 4.914801227939467e-06, | |
| "loss": 1.0177, | |
| "step": 4940 | |
| }, | |
| { | |
| "epoch": 0.8786536758193091, | |
| "grad_norm": 2.6727964096166583, | |
| "learning_rate": 4.913193246908916e-06, | |
| "loss": 0.9957, | |
| "step": 4960 | |
| }, | |
| { | |
| "epoch": 0.8821966341895483, | |
| "grad_norm": 6.368757949715121, | |
| "learning_rate": 4.911570501478686e-06, | |
| "loss": 1.0324, | |
| "step": 4980 | |
| }, | |
| { | |
| "epoch": 0.8857395925597874, | |
| "grad_norm": 3.238302734032586, | |
| "learning_rate": 4.909933001577057e-06, | |
| "loss": 0.9778, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 0.8892825509300266, | |
| "grad_norm": 2.9414571649379173, | |
| "learning_rate": 4.908280757222585e-06, | |
| "loss": 1.0183, | |
| "step": 5020 | |
| }, | |
| { | |
| "epoch": 0.8928255093002657, | |
| "grad_norm": 3.8675143162693417, | |
| "learning_rate": 4.906613778524029e-06, | |
| "loss": 1.0417, | |
| "step": 5040 | |
| }, | |
| { | |
| "epoch": 0.8963684676705048, | |
| "grad_norm": 4.062896818204324, | |
| "learning_rate": 4.9049320756803e-06, | |
| "loss": 0.9951, | |
| "step": 5060 | |
| }, | |
| { | |
| "epoch": 0.899911426040744, | |
| "grad_norm": 3.965608738547987, | |
| "learning_rate": 4.9032356589803935e-06, | |
| "loss": 1.0096, | |
| "step": 5080 | |
| }, | |
| { | |
| "epoch": 0.9034543844109831, | |
| "grad_norm": 2.4470182814478845, | |
| "learning_rate": 4.901524538803325e-06, | |
| "loss": 0.9706, | |
| "step": 5100 | |
| }, | |
| { | |
| "epoch": 0.9069973427812223, | |
| "grad_norm": 3.3652865356433788, | |
| "learning_rate": 4.899798725618071e-06, | |
| "loss": 1.0189, | |
| "step": 5120 | |
| }, | |
| { | |
| "epoch": 0.9105403011514615, | |
| "grad_norm": 2.1015419863160316, | |
| "learning_rate": 4.898058229983502e-06, | |
| "loss": 0.9427, | |
| "step": 5140 | |
| }, | |
| { | |
| "epoch": 0.9140832595217007, | |
| "grad_norm": 2.494263988797181, | |
| "learning_rate": 4.896303062548321e-06, | |
| "loss": 0.9542, | |
| "step": 5160 | |
| }, | |
| { | |
| "epoch": 0.9176262178919398, | |
| "grad_norm": 5.0132756287008355, | |
| "learning_rate": 4.894533234050992e-06, | |
| "loss": 1.0177, | |
| "step": 5180 | |
| }, | |
| { | |
| "epoch": 0.9211691762621789, | |
| "grad_norm": 3.848341234829432, | |
| "learning_rate": 4.892748755319679e-06, | |
| "loss": 0.9785, | |
| "step": 5200 | |
| }, | |
| { | |
| "epoch": 0.9247121346324181, | |
| "grad_norm": 3.9783991127336824, | |
| "learning_rate": 4.890949637272184e-06, | |
| "loss": 0.9964, | |
| "step": 5220 | |
| }, | |
| { | |
| "epoch": 0.9282550930026572, | |
| "grad_norm": 3.7994392920413333, | |
| "learning_rate": 4.8891358909158695e-06, | |
| "loss": 1.0164, | |
| "step": 5240 | |
| }, | |
| { | |
| "epoch": 0.9317980513728964, | |
| "grad_norm": 3.646774315012477, | |
| "learning_rate": 4.887307527347598e-06, | |
| "loss": 1.008, | |
| "step": 5260 | |
| }, | |
| { | |
| "epoch": 0.9353410097431355, | |
| "grad_norm": 6.203065849865684, | |
| "learning_rate": 4.885464557753666e-06, | |
| "loss": 1.0426, | |
| "step": 5280 | |
| }, | |
| { | |
| "epoch": 0.9388839681133747, | |
| "grad_norm": 2.9353445219470444, | |
| "learning_rate": 4.88360699340973e-06, | |
| "loss": 1.0052, | |
| "step": 5300 | |
| }, | |
| { | |
| "epoch": 0.9424269264836138, | |
| "grad_norm": 3.9291014774722033, | |
| "learning_rate": 4.88173484568074e-06, | |
| "loss": 0.9606, | |
| "step": 5320 | |
| }, | |
| { | |
| "epoch": 0.9459698848538529, | |
| "grad_norm": 3.9955198747460634, | |
| "learning_rate": 4.8798481260208715e-06, | |
| "loss": 0.9862, | |
| "step": 5340 | |
| }, | |
| { | |
| "epoch": 0.9495128432240921, | |
| "grad_norm": 4.103474267744412, | |
| "learning_rate": 4.877946845973453e-06, | |
| "loss": 1.008, | |
| "step": 5360 | |
| }, | |
| { | |
| "epoch": 0.9530558015943312, | |
| "grad_norm": 3.817035998173409, | |
| "learning_rate": 4.876031017170898e-06, | |
| "loss": 0.9696, | |
| "step": 5380 | |
| }, | |
| { | |
| "epoch": 0.9565987599645704, | |
| "grad_norm": 5.198464280181255, | |
| "learning_rate": 4.874100651334629e-06, | |
| "loss": 1.0248, | |
| "step": 5400 | |
| }, | |
| { | |
| "epoch": 0.9601417183348095, | |
| "grad_norm": 3.289343037136709, | |
| "learning_rate": 4.872155760275012e-06, | |
| "loss": 0.9793, | |
| "step": 5420 | |
| }, | |
| { | |
| "epoch": 0.9636846767050488, | |
| "grad_norm": 2.2435930827644746, | |
| "learning_rate": 4.87019635589128e-06, | |
| "loss": 1.0101, | |
| "step": 5440 | |
| }, | |
| { | |
| "epoch": 0.9672276350752879, | |
| "grad_norm": 4.310791558682954, | |
| "learning_rate": 4.86822245017146e-06, | |
| "loss": 0.995, | |
| "step": 5460 | |
| }, | |
| { | |
| "epoch": 0.970770593445527, | |
| "grad_norm": 3.352727892906536, | |
| "learning_rate": 4.866234055192306e-06, | |
| "loss": 0.9751, | |
| "step": 5480 | |
| }, | |
| { | |
| "epoch": 0.9743135518157662, | |
| "grad_norm": 5.068267192871768, | |
| "learning_rate": 4.864231183119212e-06, | |
| "loss": 0.9629, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 0.9778565101860053, | |
| "grad_norm": 3.5363020465890704, | |
| "learning_rate": 4.862213846206155e-06, | |
| "loss": 0.9977, | |
| "step": 5520 | |
| }, | |
| { | |
| "epoch": 0.9813994685562445, | |
| "grad_norm": 3.3894885205413834, | |
| "learning_rate": 4.860182056795604e-06, | |
| "loss": 0.9575, | |
| "step": 5540 | |
| }, | |
| { | |
| "epoch": 0.9849424269264836, | |
| "grad_norm": 2.6287369097602578, | |
| "learning_rate": 4.8581358273184545e-06, | |
| "loss": 0.989, | |
| "step": 5560 | |
| }, | |
| { | |
| "epoch": 0.9884853852967228, | |
| "grad_norm": 2.9015866832665185, | |
| "learning_rate": 4.856075170293948e-06, | |
| "loss": 1.0018, | |
| "step": 5580 | |
| }, | |
| { | |
| "epoch": 0.9920283436669619, | |
| "grad_norm": 3.0658535183131423, | |
| "learning_rate": 4.854000098329596e-06, | |
| "loss": 1.0078, | |
| "step": 5600 | |
| }, | |
| { | |
| "epoch": 0.995571302037201, | |
| "grad_norm": 2.5404639262387025, | |
| "learning_rate": 4.851910624121106e-06, | |
| "loss": 0.9407, | |
| "step": 5620 | |
| }, | |
| { | |
| "epoch": 0.9991142604074402, | |
| "grad_norm": 3.158604742158805, | |
| "learning_rate": 4.849806760452299e-06, | |
| "loss": 0.98, | |
| "step": 5640 | |
| }, | |
| { | |
| "epoch": 1.0026572187776794, | |
| "grad_norm": 3.3616076027225965, | |
| "learning_rate": 4.8476885201950345e-06, | |
| "loss": 0.9476, | |
| "step": 5660 | |
| }, | |
| { | |
| "epoch": 1.0062001771479185, | |
| "grad_norm": 3.690305513366549, | |
| "learning_rate": 4.84555591630913e-06, | |
| "loss": 0.9102, | |
| "step": 5680 | |
| }, | |
| { | |
| "epoch": 1.0097431355181576, | |
| "grad_norm": 5.359946307013767, | |
| "learning_rate": 4.843408961842285e-06, | |
| "loss": 0.9232, | |
| "step": 5700 | |
| }, | |
| { | |
| "epoch": 1.0132860938883967, | |
| "grad_norm": 3.569793560947344, | |
| "learning_rate": 4.841247669929995e-06, | |
| "loss": 0.935, | |
| "step": 5720 | |
| }, | |
| { | |
| "epoch": 1.016829052258636, | |
| "grad_norm": 3.874750269772003, | |
| "learning_rate": 4.839072053795479e-06, | |
| "loss": 0.9331, | |
| "step": 5740 | |
| }, | |
| { | |
| "epoch": 1.0203720106288752, | |
| "grad_norm": 2.7385873822832023, | |
| "learning_rate": 4.83688212674959e-06, | |
| "loss": 0.9371, | |
| "step": 5760 | |
| }, | |
| { | |
| "epoch": 1.0239149689991143, | |
| "grad_norm": 3.665069410011044, | |
| "learning_rate": 4.834677902190742e-06, | |
| "loss": 0.9085, | |
| "step": 5780 | |
| }, | |
| { | |
| "epoch": 1.0274579273693534, | |
| "grad_norm": 2.3598498586364416, | |
| "learning_rate": 4.832459393604822e-06, | |
| "loss": 0.8526, | |
| "step": 5800 | |
| }, | |
| { | |
| "epoch": 1.0310008857395925, | |
| "grad_norm": 2.7411380905564613, | |
| "learning_rate": 4.830226614565109e-06, | |
| "loss": 0.9451, | |
| "step": 5820 | |
| }, | |
| { | |
| "epoch": 1.0345438441098318, | |
| "grad_norm": 4.338981698988937, | |
| "learning_rate": 4.8279795787321935e-06, | |
| "loss": 0.9065, | |
| "step": 5840 | |
| }, | |
| { | |
| "epoch": 1.038086802480071, | |
| "grad_norm": 5.731424678260782, | |
| "learning_rate": 4.8257182998538895e-06, | |
| "loss": 0.8988, | |
| "step": 5860 | |
| }, | |
| { | |
| "epoch": 1.04162976085031, | |
| "grad_norm": 4.552868648680658, | |
| "learning_rate": 4.823442791765157e-06, | |
| "loss": 0.9059, | |
| "step": 5880 | |
| }, | |
| { | |
| "epoch": 1.045172719220549, | |
| "grad_norm": 2.50768692217334, | |
| "learning_rate": 4.821153068388007e-06, | |
| "loss": 0.9601, | |
| "step": 5900 | |
| }, | |
| { | |
| "epoch": 1.0487156775907882, | |
| "grad_norm": 5.095351842225188, | |
| "learning_rate": 4.818849143731428e-06, | |
| "loss": 0.9152, | |
| "step": 5920 | |
| }, | |
| { | |
| "epoch": 1.0522586359610275, | |
| "grad_norm": 2.8481200255546493, | |
| "learning_rate": 4.816531031891292e-06, | |
| "loss": 0.8828, | |
| "step": 5940 | |
| }, | |
| { | |
| "epoch": 1.0558015943312666, | |
| "grad_norm": 4.464320906945187, | |
| "learning_rate": 4.814198747050271e-06, | |
| "loss": 0.9552, | |
| "step": 5960 | |
| }, | |
| { | |
| "epoch": 1.0593445527015057, | |
| "grad_norm": 5.546584102940785, | |
| "learning_rate": 4.811852303477751e-06, | |
| "loss": 0.8654, | |
| "step": 5980 | |
| }, | |
| { | |
| "epoch": 1.0628875110717448, | |
| "grad_norm": 4.466057137043603, | |
| "learning_rate": 4.809491715529744e-06, | |
| "loss": 0.8941, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 1.0628875110717448, | |
| "eval_loss": 0.8596345782279968, | |
| "eval_runtime": 368.3497, | |
| "eval_samples_per_second": 25.81, | |
| "eval_steps_per_second": 3.228, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 1.066430469441984, | |
| "grad_norm": 4.240898370302153, | |
| "learning_rate": 4.8071169976488e-06, | |
| "loss": 0.9238, | |
| "step": 6020 | |
| }, | |
| { | |
| "epoch": 1.0699734278122233, | |
| "grad_norm": 3.2580988514752343, | |
| "learning_rate": 4.804728164363918e-06, | |
| "loss": 0.9158, | |
| "step": 6040 | |
| }, | |
| { | |
| "epoch": 1.0735163861824624, | |
| "grad_norm": 3.870662909162037, | |
| "learning_rate": 4.80232523029046e-06, | |
| "loss": 0.9688, | |
| "step": 6060 | |
| }, | |
| { | |
| "epoch": 1.0770593445527015, | |
| "grad_norm": 1.7822539984397539, | |
| "learning_rate": 4.799908210130058e-06, | |
| "loss": 0.9053, | |
| "step": 6080 | |
| }, | |
| { | |
| "epoch": 1.0806023029229406, | |
| "grad_norm": 3.7732124960041276, | |
| "learning_rate": 4.797477118670524e-06, | |
| "loss": 0.9815, | |
| "step": 6100 | |
| }, | |
| { | |
| "epoch": 1.08414526129318, | |
| "grad_norm": 6.115099007176549, | |
| "learning_rate": 4.7950319707857615e-06, | |
| "loss": 0.9064, | |
| "step": 6120 | |
| }, | |
| { | |
| "epoch": 1.087688219663419, | |
| "grad_norm": 2.577014162447891, | |
| "learning_rate": 4.792572781435678e-06, | |
| "loss": 0.8382, | |
| "step": 6140 | |
| }, | |
| { | |
| "epoch": 1.091231178033658, | |
| "grad_norm": 3.220759744400382, | |
| "learning_rate": 4.790099565666086e-06, | |
| "loss": 0.8572, | |
| "step": 6160 | |
| }, | |
| { | |
| "epoch": 1.0947741364038972, | |
| "grad_norm": 2.7603504810469097, | |
| "learning_rate": 4.787612338608614e-06, | |
| "loss": 0.9017, | |
| "step": 6180 | |
| }, | |
| { | |
| "epoch": 1.0983170947741363, | |
| "grad_norm": 3.4076917515040686, | |
| "learning_rate": 4.785111115480615e-06, | |
| "loss": 0.9043, | |
| "step": 6200 | |
| }, | |
| { | |
| "epoch": 1.1018600531443756, | |
| "grad_norm": 2.714069163764211, | |
| "learning_rate": 4.782595911585074e-06, | |
| "loss": 0.9445, | |
| "step": 6220 | |
| }, | |
| { | |
| "epoch": 1.1054030115146147, | |
| "grad_norm": 2.579769298355691, | |
| "learning_rate": 4.780066742310512e-06, | |
| "loss": 0.8789, | |
| "step": 6240 | |
| }, | |
| { | |
| "epoch": 1.1089459698848538, | |
| "grad_norm": 4.326106933810614, | |
| "learning_rate": 4.777523623130894e-06, | |
| "loss": 0.9087, | |
| "step": 6260 | |
| }, | |
| { | |
| "epoch": 1.112488928255093, | |
| "grad_norm": 4.203656763039969, | |
| "learning_rate": 4.774966569605531e-06, | |
| "loss": 0.9168, | |
| "step": 6280 | |
| }, | |
| { | |
| "epoch": 1.1160318866253323, | |
| "grad_norm": 4.154066900716089, | |
| "learning_rate": 4.772395597378991e-06, | |
| "loss": 0.8687, | |
| "step": 6300 | |
| }, | |
| { | |
| "epoch": 1.1195748449955714, | |
| "grad_norm": 2.08540747712982, | |
| "learning_rate": 4.769810722180994e-06, | |
| "loss": 0.871, | |
| "step": 6320 | |
| }, | |
| { | |
| "epoch": 1.1231178033658105, | |
| "grad_norm": 3.1465129817119677, | |
| "learning_rate": 4.767211959826326e-06, | |
| "loss": 0.9231, | |
| "step": 6340 | |
| }, | |
| { | |
| "epoch": 1.1266607617360496, | |
| "grad_norm": 2.3015425077734233, | |
| "learning_rate": 4.764599326214736e-06, | |
| "loss": 0.91, | |
| "step": 6360 | |
| }, | |
| { | |
| "epoch": 1.1302037201062887, | |
| "grad_norm": 3.2081119900478083, | |
| "learning_rate": 4.761972837330839e-06, | |
| "loss": 0.9247, | |
| "step": 6380 | |
| }, | |
| { | |
| "epoch": 1.133746678476528, | |
| "grad_norm": 3.093019376379044, | |
| "learning_rate": 4.7593325092440204e-06, | |
| "loss": 0.8783, | |
| "step": 6400 | |
| }, | |
| { | |
| "epoch": 1.137289636846767, | |
| "grad_norm": 4.476714117074904, | |
| "learning_rate": 4.756678358108337e-06, | |
| "loss": 0.9356, | |
| "step": 6420 | |
| }, | |
| { | |
| "epoch": 1.1408325952170062, | |
| "grad_norm": 4.415252399299131, | |
| "learning_rate": 4.754010400162416e-06, | |
| "loss": 0.8873, | |
| "step": 6440 | |
| }, | |
| { | |
| "epoch": 1.1443755535872453, | |
| "grad_norm": 4.618410771551369, | |
| "learning_rate": 4.7513286517293585e-06, | |
| "loss": 0.9271, | |
| "step": 6460 | |
| }, | |
| { | |
| "epoch": 1.1479185119574846, | |
| "grad_norm": 3.1205309158472465, | |
| "learning_rate": 4.74863312921664e-06, | |
| "loss": 0.8835, | |
| "step": 6480 | |
| }, | |
| { | |
| "epoch": 1.1514614703277237, | |
| "grad_norm": 2.4282160366985255, | |
| "learning_rate": 4.7459238491160056e-06, | |
| "loss": 0.9308, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 1.1550044286979628, | |
| "grad_norm": 2.865738470619868, | |
| "learning_rate": 4.743200828003374e-06, | |
| "loss": 0.9414, | |
| "step": 6520 | |
| }, | |
| { | |
| "epoch": 1.158547387068202, | |
| "grad_norm": 3.6374100587906835, | |
| "learning_rate": 4.740464082538735e-06, | |
| "loss": 0.9106, | |
| "step": 6540 | |
| }, | |
| { | |
| "epoch": 1.162090345438441, | |
| "grad_norm": 3.0695217920809053, | |
| "learning_rate": 4.737713629466045e-06, | |
| "loss": 0.8616, | |
| "step": 6560 | |
| }, | |
| { | |
| "epoch": 1.1656333038086801, | |
| "grad_norm": 3.9353520892249363, | |
| "learning_rate": 4.734949485613126e-06, | |
| "loss": 0.8914, | |
| "step": 6580 | |
| }, | |
| { | |
| "epoch": 1.1691762621789195, | |
| "grad_norm": 3.484414702314974, | |
| "learning_rate": 4.732171667891564e-06, | |
| "loss": 0.92, | |
| "step": 6600 | |
| }, | |
| { | |
| "epoch": 1.1727192205491586, | |
| "grad_norm": 3.504870240653996, | |
| "learning_rate": 4.729380193296605e-06, | |
| "loss": 0.9396, | |
| "step": 6620 | |
| }, | |
| { | |
| "epoch": 1.1762621789193977, | |
| "grad_norm": 5.929401699342908, | |
| "learning_rate": 4.726575078907049e-06, | |
| "loss": 0.9188, | |
| "step": 6640 | |
| }, | |
| { | |
| "epoch": 1.1798051372896368, | |
| "grad_norm": 6.554517511673939, | |
| "learning_rate": 4.723756341885148e-06, | |
| "loss": 0.9534, | |
| "step": 6660 | |
| }, | |
| { | |
| "epoch": 1.183348095659876, | |
| "grad_norm": 2.434654285685298, | |
| "learning_rate": 4.7209239994765e-06, | |
| "loss": 0.8497, | |
| "step": 6680 | |
| }, | |
| { | |
| "epoch": 1.1868910540301152, | |
| "grad_norm": 2.9006017781540483, | |
| "learning_rate": 4.718078069009944e-06, | |
| "loss": 0.9326, | |
| "step": 6700 | |
| }, | |
| { | |
| "epoch": 1.1904340124003543, | |
| "grad_norm": 6.939455220904752, | |
| "learning_rate": 4.71521856789745e-06, | |
| "loss": 0.9234, | |
| "step": 6720 | |
| }, | |
| { | |
| "epoch": 1.1939769707705934, | |
| "grad_norm": 3.3273297930889814, | |
| "learning_rate": 4.712345513634021e-06, | |
| "loss": 0.9146, | |
| "step": 6740 | |
| }, | |
| { | |
| "epoch": 1.1975199291408325, | |
| "grad_norm": 3.0714433546937774, | |
| "learning_rate": 4.709458923797579e-06, | |
| "loss": 0.9112, | |
| "step": 6760 | |
| }, | |
| { | |
| "epoch": 1.2010628875110718, | |
| "grad_norm": 3.686574518500066, | |
| "learning_rate": 4.7065588160488565e-06, | |
| "loss": 0.9353, | |
| "step": 6780 | |
| }, | |
| { | |
| "epoch": 1.204605845881311, | |
| "grad_norm": 3.1307294434556256, | |
| "learning_rate": 4.703645208131294e-06, | |
| "loss": 0.8906, | |
| "step": 6800 | |
| }, | |
| { | |
| "epoch": 1.20814880425155, | |
| "grad_norm": 3.9540023879464616, | |
| "learning_rate": 4.70071811787093e-06, | |
| "loss": 0.9389, | |
| "step": 6820 | |
| }, | |
| { | |
| "epoch": 1.2116917626217891, | |
| "grad_norm": 4.99757735756388, | |
| "learning_rate": 4.697777563176288e-06, | |
| "loss": 0.8728, | |
| "step": 6840 | |
| }, | |
| { | |
| "epoch": 1.2152347209920284, | |
| "grad_norm": 1.8831614270222023, | |
| "learning_rate": 4.694823562038271e-06, | |
| "loss": 0.8971, | |
| "step": 6860 | |
| }, | |
| { | |
| "epoch": 1.2187776793622676, | |
| "grad_norm": 3.3841095933963747, | |
| "learning_rate": 4.69185613253005e-06, | |
| "loss": 0.9404, | |
| "step": 6880 | |
| }, | |
| { | |
| "epoch": 1.2223206377325067, | |
| "grad_norm": 2.5639299660820543, | |
| "learning_rate": 4.688875292806952e-06, | |
| "loss": 0.8651, | |
| "step": 6900 | |
| }, | |
| { | |
| "epoch": 1.2258635961027458, | |
| "grad_norm": 2.5209787475843024, | |
| "learning_rate": 4.685881061106352e-06, | |
| "loss": 0.8783, | |
| "step": 6920 | |
| }, | |
| { | |
| "epoch": 1.2294065544729849, | |
| "grad_norm": 4.174003458453298, | |
| "learning_rate": 4.68287345574756e-06, | |
| "loss": 0.939, | |
| "step": 6940 | |
| }, | |
| { | |
| "epoch": 1.2329495128432242, | |
| "grad_norm": 3.7213954731195944, | |
| "learning_rate": 4.679852495131708e-06, | |
| "loss": 0.9698, | |
| "step": 6960 | |
| }, | |
| { | |
| "epoch": 1.2364924712134633, | |
| "grad_norm": 3.1050229374536826, | |
| "learning_rate": 4.676818197741637e-06, | |
| "loss": 0.901, | |
| "step": 6980 | |
| }, | |
| { | |
| "epoch": 1.2400354295837024, | |
| "grad_norm": 4.537242489543826, | |
| "learning_rate": 4.673770582141788e-06, | |
| "loss": 0.8826, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 1.2435783879539415, | |
| "grad_norm": 3.157545512432448, | |
| "learning_rate": 4.670709666978081e-06, | |
| "loss": 0.9426, | |
| "step": 7020 | |
| }, | |
| { | |
| "epoch": 1.2471213463241808, | |
| "grad_norm": 3.6885040878766975, | |
| "learning_rate": 4.667635470977811e-06, | |
| "loss": 0.9253, | |
| "step": 7040 | |
| }, | |
| { | |
| "epoch": 1.25066430469442, | |
| "grad_norm": 3.8470192299130495, | |
| "learning_rate": 4.664548012949523e-06, | |
| "loss": 0.9516, | |
| "step": 7060 | |
| }, | |
| { | |
| "epoch": 1.254207263064659, | |
| "grad_norm": 2.7610537115436897, | |
| "learning_rate": 4.661447311782905e-06, | |
| "loss": 0.9632, | |
| "step": 7080 | |
| }, | |
| { | |
| "epoch": 1.2577502214348981, | |
| "grad_norm": 2.9076857370419726, | |
| "learning_rate": 4.658333386448668e-06, | |
| "loss": 0.8516, | |
| "step": 7100 | |
| }, | |
| { | |
| "epoch": 1.2612931798051372, | |
| "grad_norm": 2.8712653113719178, | |
| "learning_rate": 4.655206255998429e-06, | |
| "loss": 0.8681, | |
| "step": 7120 | |
| }, | |
| { | |
| "epoch": 1.2648361381753763, | |
| "grad_norm": 3.5281659347458443, | |
| "learning_rate": 4.652065939564601e-06, | |
| "loss": 0.8612, | |
| "step": 7140 | |
| }, | |
| { | |
| "epoch": 1.2683790965456156, | |
| "grad_norm": 3.0200930161561836, | |
| "learning_rate": 4.648912456360266e-06, | |
| "loss": 0.9232, | |
| "step": 7160 | |
| }, | |
| { | |
| "epoch": 1.2719220549158547, | |
| "grad_norm": 2.9141762965804068, | |
| "learning_rate": 4.645745825679069e-06, | |
| "loss": 0.8704, | |
| "step": 7180 | |
| }, | |
| { | |
| "epoch": 1.2754650132860939, | |
| "grad_norm": 3.466517729779936, | |
| "learning_rate": 4.642566066895089e-06, | |
| "loss": 0.9167, | |
| "step": 7200 | |
| }, | |
| { | |
| "epoch": 1.2790079716563332, | |
| "grad_norm": 4.016886275498443, | |
| "learning_rate": 4.639373199462728e-06, | |
| "loss": 0.8753, | |
| "step": 7220 | |
| }, | |
| { | |
| "epoch": 1.2825509300265723, | |
| "grad_norm": 2.6960913937478064, | |
| "learning_rate": 4.636167242916588e-06, | |
| "loss": 0.9387, | |
| "step": 7240 | |
| }, | |
| { | |
| "epoch": 1.2860938883968114, | |
| "grad_norm": 4.156865933729297, | |
| "learning_rate": 4.6329482168713535e-06, | |
| "loss": 0.8807, | |
| "step": 7260 | |
| }, | |
| { | |
| "epoch": 1.2896368467670505, | |
| "grad_norm": 2.3052353180349194, | |
| "learning_rate": 4.62971614102167e-06, | |
| "loss": 0.9344, | |
| "step": 7280 | |
| }, | |
| { | |
| "epoch": 1.2931798051372896, | |
| "grad_norm": 4.351236836672874, | |
| "learning_rate": 4.626471035142027e-06, | |
| "loss": 0.9368, | |
| "step": 7300 | |
| }, | |
| { | |
| "epoch": 1.2967227635075287, | |
| "grad_norm": 2.96988427417053, | |
| "learning_rate": 4.62321291908663e-06, | |
| "loss": 0.9225, | |
| "step": 7320 | |
| }, | |
| { | |
| "epoch": 1.300265721877768, | |
| "grad_norm": 2.803880488767538, | |
| "learning_rate": 4.619941812789287e-06, | |
| "loss": 0.9065, | |
| "step": 7340 | |
| }, | |
| { | |
| "epoch": 1.3038086802480071, | |
| "grad_norm": 2.2501268712600115, | |
| "learning_rate": 4.616657736263282e-06, | |
| "loss": 0.9095, | |
| "step": 7360 | |
| }, | |
| { | |
| "epoch": 1.3073516386182462, | |
| "grad_norm": 3.530457146463456, | |
| "learning_rate": 4.613360709601251e-06, | |
| "loss": 0.8956, | |
| "step": 7380 | |
| }, | |
| { | |
| "epoch": 1.3108945969884853, | |
| "grad_norm": 2.9920752839459848, | |
| "learning_rate": 4.6100507529750656e-06, | |
| "loss": 0.8907, | |
| "step": 7400 | |
| }, | |
| { | |
| "epoch": 1.3144375553587246, | |
| "grad_norm": 2.3035469610025614, | |
| "learning_rate": 4.6067278866357025e-06, | |
| "loss": 0.9135, | |
| "step": 7420 | |
| }, | |
| { | |
| "epoch": 1.3179805137289637, | |
| "grad_norm": 3.742157836005142, | |
| "learning_rate": 4.603392130913123e-06, | |
| "loss": 0.9146, | |
| "step": 7440 | |
| }, | |
| { | |
| "epoch": 1.3215234720992028, | |
| "grad_norm": 2.228186526375547, | |
| "learning_rate": 4.600043506216151e-06, | |
| "loss": 0.9103, | |
| "step": 7460 | |
| }, | |
| { | |
| "epoch": 1.325066430469442, | |
| "grad_norm": 2.4528087238581504, | |
| "learning_rate": 4.5966820330323405e-06, | |
| "loss": 0.9298, | |
| "step": 7480 | |
| }, | |
| { | |
| "epoch": 1.328609388839681, | |
| "grad_norm": 2.8414827920653125, | |
| "learning_rate": 4.59330773192786e-06, | |
| "loss": 0.8779, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 1.3321523472099202, | |
| "grad_norm": 3.492728703129015, | |
| "learning_rate": 4.5899206235473585e-06, | |
| "loss": 0.9399, | |
| "step": 7520 | |
| }, | |
| { | |
| "epoch": 1.3356953055801595, | |
| "grad_norm": 4.648463092813606, | |
| "learning_rate": 4.586520728613842e-06, | |
| "loss": 0.9026, | |
| "step": 7540 | |
| }, | |
| { | |
| "epoch": 1.3392382639503986, | |
| "grad_norm": 3.9103385940119297, | |
| "learning_rate": 4.583108067928552e-06, | |
| "loss": 0.8996, | |
| "step": 7560 | |
| }, | |
| { | |
| "epoch": 1.3427812223206377, | |
| "grad_norm": 4.1525594833780035, | |
| "learning_rate": 4.579682662370829e-06, | |
| "loss": 0.911, | |
| "step": 7580 | |
| }, | |
| { | |
| "epoch": 1.346324180690877, | |
| "grad_norm": 4.692836743880663, | |
| "learning_rate": 4.576244532897988e-06, | |
| "loss": 0.8638, | |
| "step": 7600 | |
| }, | |
| { | |
| "epoch": 1.349867139061116, | |
| "grad_norm": 2.805532914309198, | |
| "learning_rate": 4.572793700545197e-06, | |
| "loss": 0.9105, | |
| "step": 7620 | |
| }, | |
| { | |
| "epoch": 1.3534100974313552, | |
| "grad_norm": 3.179829503013336, | |
| "learning_rate": 4.569330186425339e-06, | |
| "loss": 0.9251, | |
| "step": 7640 | |
| }, | |
| { | |
| "epoch": 1.3569530558015943, | |
| "grad_norm": 2.862631874344196, | |
| "learning_rate": 4.565854011728885e-06, | |
| "loss": 0.9681, | |
| "step": 7660 | |
| }, | |
| { | |
| "epoch": 1.3604960141718334, | |
| "grad_norm": 3.7867164985506876, | |
| "learning_rate": 4.562365197723771e-06, | |
| "loss": 0.9298, | |
| "step": 7680 | |
| }, | |
| { | |
| "epoch": 1.3640389725420725, | |
| "grad_norm": 4.25563448626548, | |
| "learning_rate": 4.558863765755257e-06, | |
| "loss": 0.8872, | |
| "step": 7700 | |
| }, | |
| { | |
| "epoch": 1.3675819309123118, | |
| "grad_norm": 2.742439626228035, | |
| "learning_rate": 4.555349737245808e-06, | |
| "loss": 0.8776, | |
| "step": 7720 | |
| }, | |
| { | |
| "epoch": 1.371124889282551, | |
| "grad_norm": 3.4373516712170504, | |
| "learning_rate": 4.5518231336949526e-06, | |
| "loss": 0.8886, | |
| "step": 7740 | |
| }, | |
| { | |
| "epoch": 1.37466784765279, | |
| "grad_norm": 3.098577500060214, | |
| "learning_rate": 4.548283976679158e-06, | |
| "loss": 0.8762, | |
| "step": 7760 | |
| }, | |
| { | |
| "epoch": 1.3782108060230294, | |
| "grad_norm": 4.088083429018507, | |
| "learning_rate": 4.5447322878516965e-06, | |
| "loss": 0.8655, | |
| "step": 7780 | |
| }, | |
| { | |
| "epoch": 1.3817537643932685, | |
| "grad_norm": 4.5726701868460475, | |
| "learning_rate": 4.541168088942511e-06, | |
| "loss": 0.9061, | |
| "step": 7800 | |
| }, | |
| { | |
| "epoch": 1.3852967227635076, | |
| "grad_norm": 3.093081976098804, | |
| "learning_rate": 4.537591401758084e-06, | |
| "loss": 0.934, | |
| "step": 7820 | |
| }, | |
| { | |
| "epoch": 1.3888396811337467, | |
| "grad_norm": 5.171297134853851, | |
| "learning_rate": 4.5340022481813055e-06, | |
| "loss": 0.9712, | |
| "step": 7840 | |
| }, | |
| { | |
| "epoch": 1.3923826395039858, | |
| "grad_norm": 2.889572353723012, | |
| "learning_rate": 4.530400650171335e-06, | |
| "loss": 0.8755, | |
| "step": 7860 | |
| }, | |
| { | |
| "epoch": 1.3959255978742249, | |
| "grad_norm": 5.753273400624572, | |
| "learning_rate": 4.526786629763471e-06, | |
| "loss": 0.8735, | |
| "step": 7880 | |
| }, | |
| { | |
| "epoch": 1.3994685562444642, | |
| "grad_norm": 5.098809443054654, | |
| "learning_rate": 4.523160209069014e-06, | |
| "loss": 0.8922, | |
| "step": 7900 | |
| }, | |
| { | |
| "epoch": 1.4030115146147033, | |
| "grad_norm": 2.357594211768667, | |
| "learning_rate": 4.5195214102751324e-06, | |
| "loss": 0.9088, | |
| "step": 7920 | |
| }, | |
| { | |
| "epoch": 1.4065544729849424, | |
| "grad_norm": 3.9367579628501184, | |
| "learning_rate": 4.515870255644727e-06, | |
| "loss": 0.9186, | |
| "step": 7940 | |
| }, | |
| { | |
| "epoch": 1.4100974313551815, | |
| "grad_norm": 5.456286963862879, | |
| "learning_rate": 4.512206767516291e-06, | |
| "loss": 0.9111, | |
| "step": 7960 | |
| }, | |
| { | |
| "epoch": 1.4136403897254208, | |
| "grad_norm": 3.5921469100201944, | |
| "learning_rate": 4.508530968303781e-06, | |
| "loss": 0.9028, | |
| "step": 7980 | |
| }, | |
| { | |
| "epoch": 1.41718334809566, | |
| "grad_norm": 2.9603120534047056, | |
| "learning_rate": 4.504842880496472e-06, | |
| "loss": 0.8972, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 1.41718334809566, | |
| "eval_loss": 0.837660551071167, | |
| "eval_runtime": 368.7294, | |
| "eval_samples_per_second": 25.783, | |
| "eval_steps_per_second": 3.225, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 1.420726306465899, | |
| "grad_norm": 4.883873708709509, | |
| "learning_rate": 4.5011425266588225e-06, | |
| "loss": 0.9461, | |
| "step": 8020 | |
| }, | |
| { | |
| "epoch": 1.4242692648361381, | |
| "grad_norm": 4.135582187777777, | |
| "learning_rate": 4.497429929430341e-06, | |
| "loss": 0.9508, | |
| "step": 8040 | |
| }, | |
| { | |
| "epoch": 1.4278122232063772, | |
| "grad_norm": 4.734069578966871, | |
| "learning_rate": 4.493705111525439e-06, | |
| "loss": 0.9336, | |
| "step": 8060 | |
| }, | |
| { | |
| "epoch": 1.4313551815766163, | |
| "grad_norm": 3.218508259496232, | |
| "learning_rate": 4.4899680957333e-06, | |
| "loss": 0.8421, | |
| "step": 8080 | |
| }, | |
| { | |
| "epoch": 1.4348981399468557, | |
| "grad_norm": 2.1197851091950355, | |
| "learning_rate": 4.486218904917735e-06, | |
| "loss": 0.8656, | |
| "step": 8100 | |
| }, | |
| { | |
| "epoch": 1.4384410983170948, | |
| "grad_norm": 3.0696326095251694, | |
| "learning_rate": 4.482457562017043e-06, | |
| "loss": 0.8596, | |
| "step": 8120 | |
| }, | |
| { | |
| "epoch": 1.4419840566873339, | |
| "grad_norm": 3.0008100293231617, | |
| "learning_rate": 4.478684090043875e-06, | |
| "loss": 0.92, | |
| "step": 8140 | |
| }, | |
| { | |
| "epoch": 1.4455270150575732, | |
| "grad_norm": 2.9367769313434207, | |
| "learning_rate": 4.474898512085088e-06, | |
| "loss": 0.8598, | |
| "step": 8160 | |
| }, | |
| { | |
| "epoch": 1.4490699734278123, | |
| "grad_norm": 2.3972324015727473, | |
| "learning_rate": 4.471100851301605e-06, | |
| "loss": 0.8952, | |
| "step": 8180 | |
| }, | |
| { | |
| "epoch": 1.4526129317980514, | |
| "grad_norm": 2.2321286554170476, | |
| "learning_rate": 4.467291130928277e-06, | |
| "loss": 0.9081, | |
| "step": 8200 | |
| }, | |
| { | |
| "epoch": 1.4561558901682905, | |
| "grad_norm": 2.5187305501003725, | |
| "learning_rate": 4.463469374273737e-06, | |
| "loss": 0.9273, | |
| "step": 8220 | |
| }, | |
| { | |
| "epoch": 1.4596988485385296, | |
| "grad_norm": 5.185336322080168, | |
| "learning_rate": 4.459635604720255e-06, | |
| "loss": 0.8962, | |
| "step": 8240 | |
| }, | |
| { | |
| "epoch": 1.4632418069087687, | |
| "grad_norm": 2.6043475650440895, | |
| "learning_rate": 4.4557898457236025e-06, | |
| "loss": 0.9125, | |
| "step": 8260 | |
| }, | |
| { | |
| "epoch": 1.466784765279008, | |
| "grad_norm": 3.0322932631838233, | |
| "learning_rate": 4.4519321208129044e-06, | |
| "loss": 0.8977, | |
| "step": 8280 | |
| }, | |
| { | |
| "epoch": 1.4703277236492471, | |
| "grad_norm": 4.742695243075337, | |
| "learning_rate": 4.448062453590493e-06, | |
| "loss": 0.9128, | |
| "step": 8300 | |
| }, | |
| { | |
| "epoch": 1.4738706820194862, | |
| "grad_norm": 2.709718326834174, | |
| "learning_rate": 4.444180867731769e-06, | |
| "loss": 0.8838, | |
| "step": 8320 | |
| }, | |
| { | |
| "epoch": 1.4774136403897256, | |
| "grad_norm": 4.119470722998265, | |
| "learning_rate": 4.44028738698505e-06, | |
| "loss": 0.8819, | |
| "step": 8340 | |
| }, | |
| { | |
| "epoch": 1.4809565987599647, | |
| "grad_norm": 3.1816783974486382, | |
| "learning_rate": 4.436382035171432e-06, | |
| "loss": 0.8797, | |
| "step": 8360 | |
| }, | |
| { | |
| "epoch": 1.4844995571302038, | |
| "grad_norm": 3.1914690524711844, | |
| "learning_rate": 4.4324648361846424e-06, | |
| "loss": 0.8278, | |
| "step": 8380 | |
| }, | |
| { | |
| "epoch": 1.4880425155004429, | |
| "grad_norm": 4.009571063093233, | |
| "learning_rate": 4.428535813990885e-06, | |
| "loss": 0.9445, | |
| "step": 8400 | |
| }, | |
| { | |
| "epoch": 1.491585473870682, | |
| "grad_norm": 2.9833892404793736, | |
| "learning_rate": 4.424594992628708e-06, | |
| "loss": 0.8951, | |
| "step": 8420 | |
| }, | |
| { | |
| "epoch": 1.495128432240921, | |
| "grad_norm": 2.478036010569002, | |
| "learning_rate": 4.420642396208844e-06, | |
| "loss": 0.8963, | |
| "step": 8440 | |
| }, | |
| { | |
| "epoch": 1.4986713906111604, | |
| "grad_norm": 3.24442393322022, | |
| "learning_rate": 4.416678048914069e-06, | |
| "loss": 0.8875, | |
| "step": 8460 | |
| }, | |
| { | |
| "epoch": 1.5022143489813995, | |
| "grad_norm": 5.0690401321371334, | |
| "learning_rate": 4.412701974999057e-06, | |
| "loss": 0.9041, | |
| "step": 8480 | |
| }, | |
| { | |
| "epoch": 1.5057573073516386, | |
| "grad_norm": 3.5032838009682172, | |
| "learning_rate": 4.4087141987902215e-06, | |
| "loss": 0.9024, | |
| "step": 8500 | |
| }, | |
| { | |
| "epoch": 1.509300265721878, | |
| "grad_norm": 3.4593082915022384, | |
| "learning_rate": 4.404714744685578e-06, | |
| "loss": 0.9299, | |
| "step": 8520 | |
| }, | |
| { | |
| "epoch": 1.512843224092117, | |
| "grad_norm": 2.906522716091139, | |
| "learning_rate": 4.4007036371545865e-06, | |
| "loss": 0.8399, | |
| "step": 8540 | |
| }, | |
| { | |
| "epoch": 1.5163861824623561, | |
| "grad_norm": 2.556244604059082, | |
| "learning_rate": 4.396680900738007e-06, | |
| "loss": 0.8959, | |
| "step": 8560 | |
| }, | |
| { | |
| "epoch": 1.5199291408325952, | |
| "grad_norm": 2.872414224448131, | |
| "learning_rate": 4.392646560047746e-06, | |
| "loss": 0.8837, | |
| "step": 8580 | |
| }, | |
| { | |
| "epoch": 1.5234720992028343, | |
| "grad_norm": 4.371819876343486, | |
| "learning_rate": 4.388600639766711e-06, | |
| "loss": 0.9246, | |
| "step": 8600 | |
| }, | |
| { | |
| "epoch": 1.5270150575730734, | |
| "grad_norm": 3.5523806587587514, | |
| "learning_rate": 4.384543164648649e-06, | |
| "loss": 0.931, | |
| "step": 8620 | |
| }, | |
| { | |
| "epoch": 1.5305580159433125, | |
| "grad_norm": 2.929477900211409, | |
| "learning_rate": 4.380474159518007e-06, | |
| "loss": 0.8985, | |
| "step": 8640 | |
| }, | |
| { | |
| "epoch": 1.5341009743135519, | |
| "grad_norm": 4.211828377945081, | |
| "learning_rate": 4.3763936492697735e-06, | |
| "loss": 0.8785, | |
| "step": 8660 | |
| }, | |
| { | |
| "epoch": 1.537643932683791, | |
| "grad_norm": 2.4701458590021956, | |
| "learning_rate": 4.372301658869327e-06, | |
| "loss": 0.9385, | |
| "step": 8680 | |
| }, | |
| { | |
| "epoch": 1.54118689105403, | |
| "grad_norm": 3.142058476836683, | |
| "learning_rate": 4.368198213352286e-06, | |
| "loss": 0.902, | |
| "step": 8700 | |
| }, | |
| { | |
| "epoch": 1.5447298494242694, | |
| "grad_norm": 2.771138222528823, | |
| "learning_rate": 4.3640833378243505e-06, | |
| "loss": 0.8804, | |
| "step": 8720 | |
| }, | |
| { | |
| "epoch": 1.5482728077945085, | |
| "grad_norm": 3.9280654601321006, | |
| "learning_rate": 4.3599570574611545e-06, | |
| "loss": 0.8938, | |
| "step": 8740 | |
| }, | |
| { | |
| "epoch": 1.5518157661647476, | |
| "grad_norm": 2.2418472839675463, | |
| "learning_rate": 4.355819397508106e-06, | |
| "loss": 0.8968, | |
| "step": 8760 | |
| }, | |
| { | |
| "epoch": 1.5553587245349867, | |
| "grad_norm": 3.399601556100489, | |
| "learning_rate": 4.35167038328024e-06, | |
| "loss": 0.8546, | |
| "step": 8780 | |
| }, | |
| { | |
| "epoch": 1.5589016829052258, | |
| "grad_norm": 3.455386669336896, | |
| "learning_rate": 4.3475100401620555e-06, | |
| "loss": 0.8987, | |
| "step": 8800 | |
| }, | |
| { | |
| "epoch": 1.562444641275465, | |
| "grad_norm": 4.528613569077828, | |
| "learning_rate": 4.3433383936073635e-06, | |
| "loss": 0.9096, | |
| "step": 8820 | |
| }, | |
| { | |
| "epoch": 1.565987599645704, | |
| "grad_norm": 2.885188625705028, | |
| "learning_rate": 4.3391554691391345e-06, | |
| "loss": 0.8747, | |
| "step": 8840 | |
| }, | |
| { | |
| "epoch": 1.5695305580159433, | |
| "grad_norm": 4.53397939933421, | |
| "learning_rate": 4.334961292349339e-06, | |
| "loss": 0.9238, | |
| "step": 8860 | |
| }, | |
| { | |
| "epoch": 1.5730735163861824, | |
| "grad_norm": 4.595528706412981, | |
| "learning_rate": 4.33075588889879e-06, | |
| "loss": 0.9103, | |
| "step": 8880 | |
| }, | |
| { | |
| "epoch": 1.5766164747564217, | |
| "grad_norm": 4.301564262537775, | |
| "learning_rate": 4.326539284516989e-06, | |
| "loss": 0.8638, | |
| "step": 8900 | |
| }, | |
| { | |
| "epoch": 1.5801594331266609, | |
| "grad_norm": 2.0941698417183883, | |
| "learning_rate": 4.322311505001964e-06, | |
| "loss": 0.9186, | |
| "step": 8920 | |
| }, | |
| { | |
| "epoch": 1.5837023914969, | |
| "grad_norm": 2.2302494709941416, | |
| "learning_rate": 4.318072576220119e-06, | |
| "loss": 0.9041, | |
| "step": 8940 | |
| }, | |
| { | |
| "epoch": 1.587245349867139, | |
| "grad_norm": 3.6126726071073914, | |
| "learning_rate": 4.31382252410607e-06, | |
| "loss": 0.9306, | |
| "step": 8960 | |
| }, | |
| { | |
| "epoch": 1.5907883082373782, | |
| "grad_norm": 3.4874125803596265, | |
| "learning_rate": 4.309561374662486e-06, | |
| "loss": 0.9067, | |
| "step": 8980 | |
| }, | |
| { | |
| "epoch": 1.5943312666076173, | |
| "grad_norm": 3.2939583547971076, | |
| "learning_rate": 4.3052891539599315e-06, | |
| "loss": 0.9511, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 1.5978742249778564, | |
| "grad_norm": 4.614562529533774, | |
| "learning_rate": 4.301005888136711e-06, | |
| "loss": 0.8829, | |
| "step": 9020 | |
| }, | |
| { | |
| "epoch": 1.6014171833480957, | |
| "grad_norm": 3.801219562406557, | |
| "learning_rate": 4.2967116033987015e-06, | |
| "loss": 0.912, | |
| "step": 9040 | |
| }, | |
| { | |
| "epoch": 1.6049601417183348, | |
| "grad_norm": 3.753568798707887, | |
| "learning_rate": 4.292406326019198e-06, | |
| "loss": 0.8699, | |
| "step": 9060 | |
| }, | |
| { | |
| "epoch": 1.6085031000885741, | |
| "grad_norm": 4.73422889733671, | |
| "learning_rate": 4.288090082338749e-06, | |
| "loss": 0.8836, | |
| "step": 9080 | |
| }, | |
| { | |
| "epoch": 1.6120460584588132, | |
| "grad_norm": 2.9548744217680873, | |
| "learning_rate": 4.283762898764998e-06, | |
| "loss": 0.8952, | |
| "step": 9100 | |
| }, | |
| { | |
| "epoch": 1.6155890168290523, | |
| "grad_norm": 3.206506941283786, | |
| "learning_rate": 4.2794248017725226e-06, | |
| "loss": 0.8603, | |
| "step": 9120 | |
| }, | |
| { | |
| "epoch": 1.6191319751992914, | |
| "grad_norm": 3.530944230336076, | |
| "learning_rate": 4.275075817902667e-06, | |
| "loss": 0.9217, | |
| "step": 9140 | |
| }, | |
| { | |
| "epoch": 1.6226749335695305, | |
| "grad_norm": 3.203969300098245, | |
| "learning_rate": 4.270715973763387e-06, | |
| "loss": 0.8971, | |
| "step": 9160 | |
| }, | |
| { | |
| "epoch": 1.6262178919397696, | |
| "grad_norm": 3.0239142090145052, | |
| "learning_rate": 4.2663452960290805e-06, | |
| "loss": 0.9334, | |
| "step": 9180 | |
| }, | |
| { | |
| "epoch": 1.6297608503100087, | |
| "grad_norm": 2.5777090574009, | |
| "learning_rate": 4.261963811440432e-06, | |
| "loss": 0.8392, | |
| "step": 9200 | |
| }, | |
| { | |
| "epoch": 1.633303808680248, | |
| "grad_norm": 4.058353343726418, | |
| "learning_rate": 4.25757154680424e-06, | |
| "loss": 0.8933, | |
| "step": 9220 | |
| }, | |
| { | |
| "epoch": 1.6368467670504872, | |
| "grad_norm": 3.507645687067127, | |
| "learning_rate": 4.253168528993261e-06, | |
| "loss": 0.8899, | |
| "step": 9240 | |
| }, | |
| { | |
| "epoch": 1.6403897254207263, | |
| "grad_norm": 3.513884615072957, | |
| "learning_rate": 4.248754784946038e-06, | |
| "loss": 0.9113, | |
| "step": 9260 | |
| }, | |
| { | |
| "epoch": 1.6439326837909656, | |
| "grad_norm": 2.773723856571234, | |
| "learning_rate": 4.244330341666743e-06, | |
| "loss": 0.9056, | |
| "step": 9280 | |
| }, | |
| { | |
| "epoch": 1.6474756421612047, | |
| "grad_norm": 3.4180543280648243, | |
| "learning_rate": 4.239895226225005e-06, | |
| "loss": 0.8966, | |
| "step": 9300 | |
| }, | |
| { | |
| "epoch": 1.6510186005314438, | |
| "grad_norm": 3.918770035240026, | |
| "learning_rate": 4.2354494657557485e-06, | |
| "loss": 0.8769, | |
| "step": 9320 | |
| }, | |
| { | |
| "epoch": 1.6545615589016829, | |
| "grad_norm": 3.372636086492444, | |
| "learning_rate": 4.230993087459028e-06, | |
| "loss": 0.8915, | |
| "step": 9340 | |
| }, | |
| { | |
| "epoch": 1.658104517271922, | |
| "grad_norm": 3.510054489740414, | |
| "learning_rate": 4.226526118599858e-06, | |
| "loss": 0.9184, | |
| "step": 9360 | |
| }, | |
| { | |
| "epoch": 1.661647475642161, | |
| "grad_norm": 2.0157440996114526, | |
| "learning_rate": 4.222048586508048e-06, | |
| "loss": 0.9172, | |
| "step": 9380 | |
| }, | |
| { | |
| "epoch": 1.6651904340124002, | |
| "grad_norm": 3.6068994268498074, | |
| "learning_rate": 4.2175605185780375e-06, | |
| "loss": 0.8873, | |
| "step": 9400 | |
| }, | |
| { | |
| "epoch": 1.6687333923826395, | |
| "grad_norm": 4.420449955455039, | |
| "learning_rate": 4.213061942268724e-06, | |
| "loss": 0.8436, | |
| "step": 9420 | |
| }, | |
| { | |
| "epoch": 1.6722763507528786, | |
| "grad_norm": 3.1825503459026168, | |
| "learning_rate": 4.208552885103299e-06, | |
| "loss": 0.8543, | |
| "step": 9440 | |
| }, | |
| { | |
| "epoch": 1.675819309123118, | |
| "grad_norm": 2.8992183977213535, | |
| "learning_rate": 4.204033374669077e-06, | |
| "loss": 0.8824, | |
| "step": 9460 | |
| }, | |
| { | |
| "epoch": 1.679362267493357, | |
| "grad_norm": 2.799570684359126, | |
| "learning_rate": 4.19950343861733e-06, | |
| "loss": 0.8671, | |
| "step": 9480 | |
| }, | |
| { | |
| "epoch": 1.6829052258635961, | |
| "grad_norm": 3.641582622193844, | |
| "learning_rate": 4.194963104663112e-06, | |
| "loss": 0.8628, | |
| "step": 9500 | |
| }, | |
| { | |
| "epoch": 1.6864481842338352, | |
| "grad_norm": 4.398919078323408, | |
| "learning_rate": 4.1904124005850954e-06, | |
| "loss": 0.9005, | |
| "step": 9520 | |
| }, | |
| { | |
| "epoch": 1.6899911426040743, | |
| "grad_norm": 2.9512298802973143, | |
| "learning_rate": 4.185851354225401e-06, | |
| "loss": 0.9078, | |
| "step": 9540 | |
| }, | |
| { | |
| "epoch": 1.6935341009743134, | |
| "grad_norm": 3.8044700286481246, | |
| "learning_rate": 4.181279993489423e-06, | |
| "loss": 0.9168, | |
| "step": 9560 | |
| }, | |
| { | |
| "epoch": 1.6970770593445526, | |
| "grad_norm": 3.308352478584529, | |
| "learning_rate": 4.176698346345663e-06, | |
| "loss": 0.8434, | |
| "step": 9580 | |
| }, | |
| { | |
| "epoch": 1.7006200177147919, | |
| "grad_norm": 3.904869182178536, | |
| "learning_rate": 4.1721064408255555e-06, | |
| "loss": 0.9005, | |
| "step": 9600 | |
| }, | |
| { | |
| "epoch": 1.704162976085031, | |
| "grad_norm": 2.6424522860538615, | |
| "learning_rate": 4.167504305023298e-06, | |
| "loss": 0.9278, | |
| "step": 9620 | |
| }, | |
| { | |
| "epoch": 1.7077059344552703, | |
| "grad_norm": 5.57177952217328, | |
| "learning_rate": 4.162891967095679e-06, | |
| "loss": 0.8677, | |
| "step": 9640 | |
| }, | |
| { | |
| "epoch": 1.7112488928255094, | |
| "grad_norm": 2.7234322141678686, | |
| "learning_rate": 4.158269455261906e-06, | |
| "loss": 0.8629, | |
| "step": 9660 | |
| }, | |
| { | |
| "epoch": 1.7147918511957485, | |
| "grad_norm": 4.837352939716468, | |
| "learning_rate": 4.1536367978034335e-06, | |
| "loss": 0.9231, | |
| "step": 9680 | |
| }, | |
| { | |
| "epoch": 1.7183348095659876, | |
| "grad_norm": 2.5650760898970653, | |
| "learning_rate": 4.148994023063787e-06, | |
| "loss": 0.91, | |
| "step": 9700 | |
| }, | |
| { | |
| "epoch": 1.7218777679362267, | |
| "grad_norm": 2.1623432794551216, | |
| "learning_rate": 4.1443411594483915e-06, | |
| "loss": 0.8876, | |
| "step": 9720 | |
| }, | |
| { | |
| "epoch": 1.7254207263064658, | |
| "grad_norm": 5.619389806185287, | |
| "learning_rate": 4.139678235424399e-06, | |
| "loss": 0.8599, | |
| "step": 9740 | |
| }, | |
| { | |
| "epoch": 1.728963684676705, | |
| "grad_norm": 4.021341243552535, | |
| "learning_rate": 4.135005279520514e-06, | |
| "loss": 0.9074, | |
| "step": 9760 | |
| }, | |
| { | |
| "epoch": 1.7325066430469442, | |
| "grad_norm": 3.5635632300979707, | |
| "learning_rate": 4.130322320326816e-06, | |
| "loss": 0.8933, | |
| "step": 9780 | |
| }, | |
| { | |
| "epoch": 1.7360496014171833, | |
| "grad_norm": 2.540277205876661, | |
| "learning_rate": 4.125629386494587e-06, | |
| "loss": 0.9291, | |
| "step": 9800 | |
| }, | |
| { | |
| "epoch": 1.7395925597874224, | |
| "grad_norm": 3.7695170204780957, | |
| "learning_rate": 4.120926506736137e-06, | |
| "loss": 0.903, | |
| "step": 9820 | |
| }, | |
| { | |
| "epoch": 1.7431355181576618, | |
| "grad_norm": 1.914697967954277, | |
| "learning_rate": 4.116213709824625e-06, | |
| "loss": 0.8321, | |
| "step": 9840 | |
| }, | |
| { | |
| "epoch": 1.7466784765279009, | |
| "grad_norm": 3.4586226366917194, | |
| "learning_rate": 4.111491024593889e-06, | |
| "loss": 0.8858, | |
| "step": 9860 | |
| }, | |
| { | |
| "epoch": 1.75022143489814, | |
| "grad_norm": 2.2832262533351195, | |
| "learning_rate": 4.10675847993826e-06, | |
| "loss": 0.9102, | |
| "step": 9880 | |
| }, | |
| { | |
| "epoch": 1.753764393268379, | |
| "grad_norm": 4.02421843778651, | |
| "learning_rate": 4.102016104812396e-06, | |
| "loss": 0.8392, | |
| "step": 9900 | |
| }, | |
| { | |
| "epoch": 1.7573073516386182, | |
| "grad_norm": 3.890461917932318, | |
| "learning_rate": 4.0972639282311e-06, | |
| "loss": 0.8785, | |
| "step": 9920 | |
| }, | |
| { | |
| "epoch": 1.7608503100088573, | |
| "grad_norm": 2.297623079323031, | |
| "learning_rate": 4.092501979269137e-06, | |
| "loss": 0.8855, | |
| "step": 9940 | |
| }, | |
| { | |
| "epoch": 1.7643932683790964, | |
| "grad_norm": 3.6677681482613216, | |
| "learning_rate": 4.087730287061065e-06, | |
| "loss": 0.8625, | |
| "step": 9960 | |
| }, | |
| { | |
| "epoch": 1.7679362267493357, | |
| "grad_norm": 3.9358016909534395, | |
| "learning_rate": 4.082948880801054e-06, | |
| "loss": 0.833, | |
| "step": 9980 | |
| }, | |
| { | |
| "epoch": 1.7714791851195748, | |
| "grad_norm": 4.267043354085718, | |
| "learning_rate": 4.078157789742706e-06, | |
| "loss": 0.9039, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 1.7714791851195748, | |
| "eval_loss": 0.8149307370185852, | |
| "eval_runtime": 369.1142, | |
| "eval_samples_per_second": 25.756, | |
| "eval_steps_per_second": 3.221, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 1.7750221434898141, | |
| "grad_norm": 3.7339229818305384, | |
| "learning_rate": 4.073357043198874e-06, | |
| "loss": 0.8925, | |
| "step": 10020 | |
| }, | |
| { | |
| "epoch": 1.7785651018600532, | |
| "grad_norm": 3.428649696183049, | |
| "learning_rate": 4.068546670541487e-06, | |
| "loss": 0.8256, | |
| "step": 10040 | |
| }, | |
| { | |
| "epoch": 1.7821080602302923, | |
| "grad_norm": 2.561924691977688, | |
| "learning_rate": 4.06372670120137e-06, | |
| "loss": 0.8685, | |
| "step": 10060 | |
| }, | |
| { | |
| "epoch": 1.7856510186005314, | |
| "grad_norm": 2.2436966577943855, | |
| "learning_rate": 4.05889716466806e-06, | |
| "loss": 0.8773, | |
| "step": 10080 | |
| }, | |
| { | |
| "epoch": 1.7891939769707705, | |
| "grad_norm": 2.5739823412815395, | |
| "learning_rate": 4.054058090489628e-06, | |
| "loss": 0.9268, | |
| "step": 10100 | |
| }, | |
| { | |
| "epoch": 1.7927369353410096, | |
| "grad_norm": 3.639815355685089, | |
| "learning_rate": 4.049209508272501e-06, | |
| "loss": 0.901, | |
| "step": 10120 | |
| }, | |
| { | |
| "epoch": 1.7962798937112487, | |
| "grad_norm": 4.803567565137283, | |
| "learning_rate": 4.044351447681276e-06, | |
| "loss": 0.8509, | |
| "step": 10140 | |
| }, | |
| { | |
| "epoch": 1.799822852081488, | |
| "grad_norm": 4.961777896889485, | |
| "learning_rate": 4.0394839384385395e-06, | |
| "loss": 0.9093, | |
| "step": 10160 | |
| }, | |
| { | |
| "epoch": 1.8033658104517272, | |
| "grad_norm": 3.0658650580570823, | |
| "learning_rate": 4.034607010324689e-06, | |
| "loss": 0.8937, | |
| "step": 10180 | |
| }, | |
| { | |
| "epoch": 1.8069087688219665, | |
| "grad_norm": 3.236704486524868, | |
| "learning_rate": 4.029720693177747e-06, | |
| "loss": 0.8738, | |
| "step": 10200 | |
| }, | |
| { | |
| "epoch": 1.8104517271922056, | |
| "grad_norm": 4.564091129067981, | |
| "learning_rate": 4.024825016893182e-06, | |
| "loss": 0.8737, | |
| "step": 10220 | |
| }, | |
| { | |
| "epoch": 1.8139946855624447, | |
| "grad_norm": 2.347358666581137, | |
| "learning_rate": 4.01992001142372e-06, | |
| "loss": 0.8632, | |
| "step": 10240 | |
| }, | |
| { | |
| "epoch": 1.8175376439326838, | |
| "grad_norm": 3.5395322977603927, | |
| "learning_rate": 4.015005706779169e-06, | |
| "loss": 0.8579, | |
| "step": 10260 | |
| }, | |
| { | |
| "epoch": 1.821080602302923, | |
| "grad_norm": 2.3411115987360827, | |
| "learning_rate": 4.010082133026229e-06, | |
| "loss": 0.9116, | |
| "step": 10280 | |
| }, | |
| { | |
| "epoch": 1.824623560673162, | |
| "grad_norm": 5.420278546986004, | |
| "learning_rate": 4.005149320288308e-06, | |
| "loss": 0.9216, | |
| "step": 10300 | |
| }, | |
| { | |
| "epoch": 1.828166519043401, | |
| "grad_norm": 3.177246662363069, | |
| "learning_rate": 4.000207298745347e-06, | |
| "loss": 0.8348, | |
| "step": 10320 | |
| }, | |
| { | |
| "epoch": 1.8317094774136404, | |
| "grad_norm": 3.260983474589408, | |
| "learning_rate": 3.995256098633618e-06, | |
| "loss": 0.8853, | |
| "step": 10340 | |
| }, | |
| { | |
| "epoch": 1.8352524357838795, | |
| "grad_norm": 4.262465852736409, | |
| "learning_rate": 3.9902957502455605e-06, | |
| "loss": 0.8776, | |
| "step": 10360 | |
| }, | |
| { | |
| "epoch": 1.8387953941541186, | |
| "grad_norm": 4.227525664714203, | |
| "learning_rate": 3.985326283929577e-06, | |
| "loss": 0.8614, | |
| "step": 10380 | |
| }, | |
| { | |
| "epoch": 1.842338352524358, | |
| "grad_norm": 3.1605952131114625, | |
| "learning_rate": 3.9803477300898574e-06, | |
| "loss": 0.863, | |
| "step": 10400 | |
| }, | |
| { | |
| "epoch": 1.845881310894597, | |
| "grad_norm": 2.751402878519728, | |
| "learning_rate": 3.975360119186192e-06, | |
| "loss": 0.8683, | |
| "step": 10420 | |
| }, | |
| { | |
| "epoch": 1.8494242692648362, | |
| "grad_norm": 3.1105411540099714, | |
| "learning_rate": 3.970363481733784e-06, | |
| "loss": 0.9019, | |
| "step": 10440 | |
| }, | |
| { | |
| "epoch": 1.8529672276350753, | |
| "grad_norm": 2.090165548223681, | |
| "learning_rate": 3.965357848303061e-06, | |
| "loss": 0.9317, | |
| "step": 10460 | |
| }, | |
| { | |
| "epoch": 1.8565101860053144, | |
| "grad_norm": 2.551059143441233, | |
| "learning_rate": 3.960343249519493e-06, | |
| "loss": 0.8711, | |
| "step": 10480 | |
| }, | |
| { | |
| "epoch": 1.8600531443755535, | |
| "grad_norm": 4.2607971452767766, | |
| "learning_rate": 3.955319716063397e-06, | |
| "loss": 0.8526, | |
| "step": 10500 | |
| }, | |
| { | |
| "epoch": 1.8635961027457926, | |
| "grad_norm": 1.8942460590897414, | |
| "learning_rate": 3.950287278669759e-06, | |
| "loss": 0.8988, | |
| "step": 10520 | |
| }, | |
| { | |
| "epoch": 1.867139061116032, | |
| "grad_norm": 3.6086912268779483, | |
| "learning_rate": 3.945245968128039e-06, | |
| "loss": 0.828, | |
| "step": 10540 | |
| }, | |
| { | |
| "epoch": 1.870682019486271, | |
| "grad_norm": 2.2171636599062294, | |
| "learning_rate": 3.940195815281984e-06, | |
| "loss": 0.8195, | |
| "step": 10560 | |
| }, | |
| { | |
| "epoch": 1.8742249778565103, | |
| "grad_norm": 3.3159052435056333, | |
| "learning_rate": 3.935136851029441e-06, | |
| "loss": 0.9019, | |
| "step": 10580 | |
| }, | |
| { | |
| "epoch": 1.8777679362267494, | |
| "grad_norm": 2.4712113724302998, | |
| "learning_rate": 3.930069106322167e-06, | |
| "loss": 0.867, | |
| "step": 10600 | |
| }, | |
| { | |
| "epoch": 1.8813108945969885, | |
| "grad_norm": 3.5773386774255473, | |
| "learning_rate": 3.924992612165638e-06, | |
| "loss": 0.9161, | |
| "step": 10620 | |
| }, | |
| { | |
| "epoch": 1.8848538529672276, | |
| "grad_norm": 3.63341154286338, | |
| "learning_rate": 3.919907399618864e-06, | |
| "loss": 0.9039, | |
| "step": 10640 | |
| }, | |
| { | |
| "epoch": 1.8883968113374667, | |
| "grad_norm": 2.5738022632421806, | |
| "learning_rate": 3.914813499794193e-06, | |
| "loss": 0.9, | |
| "step": 10660 | |
| }, | |
| { | |
| "epoch": 1.8919397697077058, | |
| "grad_norm": 3.282128941108443, | |
| "learning_rate": 3.909710943857125e-06, | |
| "loss": 0.8783, | |
| "step": 10680 | |
| }, | |
| { | |
| "epoch": 1.895482728077945, | |
| "grad_norm": 3.079231075434731, | |
| "learning_rate": 3.904599763026117e-06, | |
| "loss": 0.8829, | |
| "step": 10700 | |
| }, | |
| { | |
| "epoch": 1.8990256864481843, | |
| "grad_norm": 4.0065620278734055, | |
| "learning_rate": 3.899479988572401e-06, | |
| "loss": 0.9157, | |
| "step": 10720 | |
| }, | |
| { | |
| "epoch": 1.9025686448184234, | |
| "grad_norm": 4.857718153080153, | |
| "learning_rate": 3.89435165181978e-06, | |
| "loss": 0.8971, | |
| "step": 10740 | |
| }, | |
| { | |
| "epoch": 1.9061116031886627, | |
| "grad_norm": 5.451024881061867, | |
| "learning_rate": 3.8892147841444465e-06, | |
| "loss": 0.9133, | |
| "step": 10760 | |
| }, | |
| { | |
| "epoch": 1.9096545615589018, | |
| "grad_norm": 2.9783216918153688, | |
| "learning_rate": 3.884069416974785e-06, | |
| "loss": 0.8671, | |
| "step": 10780 | |
| }, | |
| { | |
| "epoch": 1.9131975199291409, | |
| "grad_norm": 4.130442740763265, | |
| "learning_rate": 3.878915581791184e-06, | |
| "loss": 0.8812, | |
| "step": 10800 | |
| }, | |
| { | |
| "epoch": 1.91674047829938, | |
| "grad_norm": 3.087103014469501, | |
| "learning_rate": 3.873753310125838e-06, | |
| "loss": 0.8596, | |
| "step": 10820 | |
| }, | |
| { | |
| "epoch": 1.920283436669619, | |
| "grad_norm": 3.4980061190534832, | |
| "learning_rate": 3.868582633562561e-06, | |
| "loss": 0.9347, | |
| "step": 10840 | |
| }, | |
| { | |
| "epoch": 1.9238263950398582, | |
| "grad_norm": 2.7736175156145353, | |
| "learning_rate": 3.863403583736586e-06, | |
| "loss": 0.8216, | |
| "step": 10860 | |
| }, | |
| { | |
| "epoch": 1.9273693534100973, | |
| "grad_norm": 3.2203528253737184, | |
| "learning_rate": 3.858216192334377e-06, | |
| "loss": 0.9563, | |
| "step": 10880 | |
| }, | |
| { | |
| "epoch": 1.9309123117803366, | |
| "grad_norm": 3.423660636662842, | |
| "learning_rate": 3.853020491093436e-06, | |
| "loss": 0.9045, | |
| "step": 10900 | |
| }, | |
| { | |
| "epoch": 1.9344552701505757, | |
| "grad_norm": 4.576810058530241, | |
| "learning_rate": 3.847816511802104e-06, | |
| "loss": 0.8914, | |
| "step": 10920 | |
| }, | |
| { | |
| "epoch": 1.9379982285208148, | |
| "grad_norm": 4.986380910931554, | |
| "learning_rate": 3.842604286299366e-06, | |
| "loss": 0.9222, | |
| "step": 10940 | |
| }, | |
| { | |
| "epoch": 1.9415411868910541, | |
| "grad_norm": 5.049582396983484, | |
| "learning_rate": 3.837383846474663e-06, | |
| "loss": 0.8764, | |
| "step": 10960 | |
| }, | |
| { | |
| "epoch": 1.9450841452612933, | |
| "grad_norm": 3.943843485476312, | |
| "learning_rate": 3.832155224267693e-06, | |
| "loss": 0.8614, | |
| "step": 10980 | |
| }, | |
| { | |
| "epoch": 1.9486271036315324, | |
| "grad_norm": 4.818067048868772, | |
| "learning_rate": 3.8269184516682114e-06, | |
| "loss": 0.8844, | |
| "step": 11000 | |
| }, | |
| { | |
| "epoch": 1.9521700620017715, | |
| "grad_norm": 2.6435129006124356, | |
| "learning_rate": 3.821673560715844e-06, | |
| "loss": 0.8859, | |
| "step": 11020 | |
| }, | |
| { | |
| "epoch": 1.9557130203720106, | |
| "grad_norm": 4.218938965276267, | |
| "learning_rate": 3.816420583499883e-06, | |
| "loss": 0.8694, | |
| "step": 11040 | |
| }, | |
| { | |
| "epoch": 1.9592559787422497, | |
| "grad_norm": 3.1248706808657993, | |
| "learning_rate": 3.811159552159097e-06, | |
| "loss": 0.8484, | |
| "step": 11060 | |
| }, | |
| { | |
| "epoch": 1.9627989371124888, | |
| "grad_norm": 3.806523511219131, | |
| "learning_rate": 3.8058904988815274e-06, | |
| "loss": 0.8471, | |
| "step": 11080 | |
| }, | |
| { | |
| "epoch": 1.966341895482728, | |
| "grad_norm": 3.51824367536095, | |
| "learning_rate": 3.800613455904299e-06, | |
| "loss": 0.9077, | |
| "step": 11100 | |
| }, | |
| { | |
| "epoch": 1.9698848538529672, | |
| "grad_norm": 3.3138759907728677, | |
| "learning_rate": 3.795328455513418e-06, | |
| "loss": 0.8493, | |
| "step": 11120 | |
| }, | |
| { | |
| "epoch": 1.9734278122232065, | |
| "grad_norm": 2.581058466584183, | |
| "learning_rate": 3.7900355300435744e-06, | |
| "loss": 0.8834, | |
| "step": 11140 | |
| }, | |
| { | |
| "epoch": 1.9769707705934456, | |
| "grad_norm": 2.712205874433446, | |
| "learning_rate": 3.7847347118779464e-06, | |
| "loss": 0.8902, | |
| "step": 11160 | |
| }, | |
| { | |
| "epoch": 1.9805137289636847, | |
| "grad_norm": 1.9869529582535677, | |
| "learning_rate": 3.7794260334480026e-06, | |
| "loss": 0.849, | |
| "step": 11180 | |
| }, | |
| { | |
| "epoch": 1.9840566873339238, | |
| "grad_norm": 5.413750266589672, | |
| "learning_rate": 3.7741095272333008e-06, | |
| "loss": 0.8644, | |
| "step": 11200 | |
| }, | |
| { | |
| "epoch": 1.987599645704163, | |
| "grad_norm": 2.941600930177304, | |
| "learning_rate": 3.76878522576129e-06, | |
| "loss": 0.8478, | |
| "step": 11220 | |
| }, | |
| { | |
| "epoch": 1.991142604074402, | |
| "grad_norm": 4.120948714739335, | |
| "learning_rate": 3.7634531616071137e-06, | |
| "loss": 0.9119, | |
| "step": 11240 | |
| }, | |
| { | |
| "epoch": 1.9946855624446411, | |
| "grad_norm": 3.1166160914538636, | |
| "learning_rate": 3.758113367393409e-06, | |
| "loss": 0.8953, | |
| "step": 11260 | |
| }, | |
| { | |
| "epoch": 1.9982285208148804, | |
| "grad_norm": 3.148186400991171, | |
| "learning_rate": 3.7527658757901046e-06, | |
| "loss": 0.8957, | |
| "step": 11280 | |
| }, | |
| { | |
| "epoch": 2.0017714791851198, | |
| "grad_norm": 4.237349805163588, | |
| "learning_rate": 3.7474107195142273e-06, | |
| "loss": 0.8242, | |
| "step": 11300 | |
| }, | |
| { | |
| "epoch": 2.005314437555359, | |
| "grad_norm": 3.964803901086038, | |
| "learning_rate": 3.7420479313296964e-06, | |
| "loss": 0.733, | |
| "step": 11320 | |
| }, | |
| { | |
| "epoch": 2.008857395925598, | |
| "grad_norm": 2.851459294376265, | |
| "learning_rate": 3.7366775440471213e-06, | |
| "loss": 0.8224, | |
| "step": 11340 | |
| }, | |
| { | |
| "epoch": 2.012400354295837, | |
| "grad_norm": 3.6339236231049847, | |
| "learning_rate": 3.7312995905236105e-06, | |
| "loss": 0.8078, | |
| "step": 11360 | |
| }, | |
| { | |
| "epoch": 2.015943312666076, | |
| "grad_norm": 2.394977571241479, | |
| "learning_rate": 3.725914103662559e-06, | |
| "loss": 0.7777, | |
| "step": 11380 | |
| }, | |
| { | |
| "epoch": 2.0194862710363153, | |
| "grad_norm": 3.5149972337594475, | |
| "learning_rate": 3.7205211164134547e-06, | |
| "loss": 0.7742, | |
| "step": 11400 | |
| }, | |
| { | |
| "epoch": 2.0230292294065544, | |
| "grad_norm": 4.113453512038928, | |
| "learning_rate": 3.7151206617716734e-06, | |
| "loss": 0.7539, | |
| "step": 11420 | |
| }, | |
| { | |
| "epoch": 2.0265721877767935, | |
| "grad_norm": 3.2750961089816353, | |
| "learning_rate": 3.709712772778279e-06, | |
| "loss": 0.7788, | |
| "step": 11440 | |
| }, | |
| { | |
| "epoch": 2.0301151461470326, | |
| "grad_norm": 4.035570508399602, | |
| "learning_rate": 3.70429748251982e-06, | |
| "loss": 0.7829, | |
| "step": 11460 | |
| }, | |
| { | |
| "epoch": 2.033658104517272, | |
| "grad_norm": 4.436303471963281, | |
| "learning_rate": 3.698874824128126e-06, | |
| "loss": 0.7311, | |
| "step": 11480 | |
| }, | |
| { | |
| "epoch": 2.0372010628875112, | |
| "grad_norm": 5.818707534945472, | |
| "learning_rate": 3.693444830780107e-06, | |
| "loss": 0.773, | |
| "step": 11500 | |
| }, | |
| { | |
| "epoch": 2.0407440212577503, | |
| "grad_norm": 3.540550130014629, | |
| "learning_rate": 3.6880075356975515e-06, | |
| "loss": 0.7814, | |
| "step": 11520 | |
| }, | |
| { | |
| "epoch": 2.0442869796279894, | |
| "grad_norm": 3.6346205822234943, | |
| "learning_rate": 3.6825629721469188e-06, | |
| "loss": 0.8135, | |
| "step": 11540 | |
| }, | |
| { | |
| "epoch": 2.0478299379982285, | |
| "grad_norm": 2.4682825796443475, | |
| "learning_rate": 3.6771111734391397e-06, | |
| "loss": 0.745, | |
| "step": 11560 | |
| }, | |
| { | |
| "epoch": 2.0513728963684676, | |
| "grad_norm": 2.7295024854988195, | |
| "learning_rate": 3.6716521729294104e-06, | |
| "loss": 0.7792, | |
| "step": 11580 | |
| }, | |
| { | |
| "epoch": 2.0549158547387067, | |
| "grad_norm": 3.3999116860452525, | |
| "learning_rate": 3.66618600401699e-06, | |
| "loss": 0.7841, | |
| "step": 11600 | |
| }, | |
| { | |
| "epoch": 2.058458813108946, | |
| "grad_norm": 1.979950381057992, | |
| "learning_rate": 3.660712700144995e-06, | |
| "loss": 0.7577, | |
| "step": 11620 | |
| }, | |
| { | |
| "epoch": 2.062001771479185, | |
| "grad_norm": 2.961231731307014, | |
| "learning_rate": 3.655232294800194e-06, | |
| "loss": 0.8112, | |
| "step": 11640 | |
| }, | |
| { | |
| "epoch": 2.065544729849424, | |
| "grad_norm": 2.9168214431871546, | |
| "learning_rate": 3.6497448215128054e-06, | |
| "loss": 0.7407, | |
| "step": 11660 | |
| }, | |
| { | |
| "epoch": 2.0690876882196636, | |
| "grad_norm": 3.3537918642406814, | |
| "learning_rate": 3.6442503138562902e-06, | |
| "loss": 0.7432, | |
| "step": 11680 | |
| }, | |
| { | |
| "epoch": 2.0726306465899027, | |
| "grad_norm": 2.885605458154861, | |
| "learning_rate": 3.638748805447146e-06, | |
| "loss": 0.7657, | |
| "step": 11700 | |
| }, | |
| { | |
| "epoch": 2.076173604960142, | |
| "grad_norm": 4.064439601513717, | |
| "learning_rate": 3.6332403299447046e-06, | |
| "loss": 0.7374, | |
| "step": 11720 | |
| }, | |
| { | |
| "epoch": 2.079716563330381, | |
| "grad_norm": 4.214372884804176, | |
| "learning_rate": 3.6277249210509208e-06, | |
| "loss": 0.758, | |
| "step": 11740 | |
| }, | |
| { | |
| "epoch": 2.08325952170062, | |
| "grad_norm": 3.8146529692577937, | |
| "learning_rate": 3.6222026125101717e-06, | |
| "loss": 0.7635, | |
| "step": 11760 | |
| }, | |
| { | |
| "epoch": 2.086802480070859, | |
| "grad_norm": 4.249159301555663, | |
| "learning_rate": 3.6166734381090483e-06, | |
| "loss": 0.7928, | |
| "step": 11780 | |
| }, | |
| { | |
| "epoch": 2.090345438441098, | |
| "grad_norm": 5.7403075464380375, | |
| "learning_rate": 3.611137431676146e-06, | |
| "loss": 0.7451, | |
| "step": 11800 | |
| }, | |
| { | |
| "epoch": 2.0938883968113373, | |
| "grad_norm": 3.6236953170629667, | |
| "learning_rate": 3.605594627081861e-06, | |
| "loss": 0.7332, | |
| "step": 11820 | |
| }, | |
| { | |
| "epoch": 2.0974313551815764, | |
| "grad_norm": 3.240921081992807, | |
| "learning_rate": 3.6000450582381823e-06, | |
| "loss": 0.75, | |
| "step": 11840 | |
| }, | |
| { | |
| "epoch": 2.100974313551816, | |
| "grad_norm": 4.877179694529326, | |
| "learning_rate": 3.5944887590984846e-06, | |
| "loss": 0.7824, | |
| "step": 11860 | |
| }, | |
| { | |
| "epoch": 2.104517271922055, | |
| "grad_norm": 2.2176442012470576, | |
| "learning_rate": 3.5889257636573183e-06, | |
| "loss": 0.7936, | |
| "step": 11880 | |
| }, | |
| { | |
| "epoch": 2.108060230292294, | |
| "grad_norm": 4.221294517545139, | |
| "learning_rate": 3.583356105950203e-06, | |
| "loss": 0.7548, | |
| "step": 11900 | |
| }, | |
| { | |
| "epoch": 2.1116031886625333, | |
| "grad_norm": 3.8503143338096786, | |
| "learning_rate": 3.5777798200534214e-06, | |
| "loss": 0.7967, | |
| "step": 11920 | |
| }, | |
| { | |
| "epoch": 2.1151461470327724, | |
| "grad_norm": 3.1910071043427326, | |
| "learning_rate": 3.5721969400838073e-06, | |
| "loss": 0.7786, | |
| "step": 11940 | |
| }, | |
| { | |
| "epoch": 2.1186891054030115, | |
| "grad_norm": 2.860111918382593, | |
| "learning_rate": 3.5666075001985386e-06, | |
| "loss": 0.7517, | |
| "step": 11960 | |
| }, | |
| { | |
| "epoch": 2.1222320637732506, | |
| "grad_norm": 2.0905030982179134, | |
| "learning_rate": 3.561011534594928e-06, | |
| "loss": 0.7558, | |
| "step": 11980 | |
| }, | |
| { | |
| "epoch": 2.1257750221434897, | |
| "grad_norm": 5.060645708453129, | |
| "learning_rate": 3.555409077510215e-06, | |
| "loss": 0.7414, | |
| "step": 12000 | |
| }, | |
| { | |
| "epoch": 2.1257750221434897, | |
| "eval_loss": 0.8194052577018738, | |
| "eval_runtime": 368.7991, | |
| "eval_samples_per_second": 25.778, | |
| "eval_steps_per_second": 3.224, | |
| "step": 12000 | |
| }, | |
| { | |
| "epoch": 2.129317980513729, | |
| "grad_norm": 3.6709097863394655, | |
| "learning_rate": 3.549800163221353e-06, | |
| "loss": 0.7369, | |
| "step": 12020 | |
| }, | |
| { | |
| "epoch": 2.132860938883968, | |
| "grad_norm": 4.6404940480464125, | |
| "learning_rate": 3.5441848260448035e-06, | |
| "loss": 0.7919, | |
| "step": 12040 | |
| }, | |
| { | |
| "epoch": 2.1364038972542074, | |
| "grad_norm": 3.4473691964125353, | |
| "learning_rate": 3.5385631003363245e-06, | |
| "loss": 0.7841, | |
| "step": 12060 | |
| }, | |
| { | |
| "epoch": 2.1399468556244465, | |
| "grad_norm": 4.36997053485404, | |
| "learning_rate": 3.532935020490761e-06, | |
| "loss": 0.7681, | |
| "step": 12080 | |
| }, | |
| { | |
| "epoch": 2.1434898139946856, | |
| "grad_norm": 3.642775603252494, | |
| "learning_rate": 3.5273006209418297e-06, | |
| "loss": 0.7377, | |
| "step": 12100 | |
| }, | |
| { | |
| "epoch": 2.1470327723649247, | |
| "grad_norm": 2.9088288240530806, | |
| "learning_rate": 3.5216599361619193e-06, | |
| "loss": 0.7356, | |
| "step": 12120 | |
| }, | |
| { | |
| "epoch": 2.150575730735164, | |
| "grad_norm": 4.94624315541774, | |
| "learning_rate": 3.5160130006618665e-06, | |
| "loss": 0.7688, | |
| "step": 12140 | |
| }, | |
| { | |
| "epoch": 2.154118689105403, | |
| "grad_norm": 3.431747470285621, | |
| "learning_rate": 3.5103598489907553e-06, | |
| "loss": 0.7322, | |
| "step": 12160 | |
| }, | |
| { | |
| "epoch": 2.157661647475642, | |
| "grad_norm": 3.5652670345700876, | |
| "learning_rate": 3.5047005157357e-06, | |
| "loss": 0.7752, | |
| "step": 12180 | |
| }, | |
| { | |
| "epoch": 2.161204605845881, | |
| "grad_norm": 4.315860117945172, | |
| "learning_rate": 3.4990350355216347e-06, | |
| "loss": 0.7443, | |
| "step": 12200 | |
| }, | |
| { | |
| "epoch": 2.1647475642161202, | |
| "grad_norm": 3.963300128138939, | |
| "learning_rate": 3.493363443011102e-06, | |
| "loss": 0.7595, | |
| "step": 12220 | |
| }, | |
| { | |
| "epoch": 2.16829052258636, | |
| "grad_norm": 3.3278580876056623, | |
| "learning_rate": 3.487685772904041e-06, | |
| "loss": 0.7573, | |
| "step": 12240 | |
| }, | |
| { | |
| "epoch": 2.171833480956599, | |
| "grad_norm": 3.6502088387557516, | |
| "learning_rate": 3.4820020599375755e-06, | |
| "loss": 0.7675, | |
| "step": 12260 | |
| }, | |
| { | |
| "epoch": 2.175376439326838, | |
| "grad_norm": 2.110435980731087, | |
| "learning_rate": 3.476312338885799e-06, | |
| "loss": 0.7659, | |
| "step": 12280 | |
| }, | |
| { | |
| "epoch": 2.178919397697077, | |
| "grad_norm": 3.132585579346833, | |
| "learning_rate": 3.4706166445595657e-06, | |
| "loss": 0.7691, | |
| "step": 12300 | |
| }, | |
| { | |
| "epoch": 2.182462356067316, | |
| "grad_norm": 5.677900838287172, | |
| "learning_rate": 3.4649150118062737e-06, | |
| "loss": 0.7543, | |
| "step": 12320 | |
| }, | |
| { | |
| "epoch": 2.1860053144375553, | |
| "grad_norm": 3.5849859798668047, | |
| "learning_rate": 3.4592074755096533e-06, | |
| "loss": 0.7485, | |
| "step": 12340 | |
| }, | |
| { | |
| "epoch": 2.1895482728077944, | |
| "grad_norm": 2.897975653877466, | |
| "learning_rate": 3.453494070589556e-06, | |
| "loss": 0.741, | |
| "step": 12360 | |
| }, | |
| { | |
| "epoch": 2.1930912311780335, | |
| "grad_norm": 2.300880578954949, | |
| "learning_rate": 3.4477748320017386e-06, | |
| "loss": 0.7245, | |
| "step": 12380 | |
| }, | |
| { | |
| "epoch": 2.1966341895482726, | |
| "grad_norm": 3.4934403676076213, | |
| "learning_rate": 3.442049794737647e-06, | |
| "loss": 0.7645, | |
| "step": 12400 | |
| }, | |
| { | |
| "epoch": 2.200177147918512, | |
| "grad_norm": 3.0309935521375695, | |
| "learning_rate": 3.436318993824206e-06, | |
| "loss": 0.7822, | |
| "step": 12420 | |
| }, | |
| { | |
| "epoch": 2.2037201062887513, | |
| "grad_norm": 5.162246999416868, | |
| "learning_rate": 3.430582464323603e-06, | |
| "loss": 0.7638, | |
| "step": 12440 | |
| }, | |
| { | |
| "epoch": 2.2072630646589904, | |
| "grad_norm": 4.3206905241817175, | |
| "learning_rate": 3.4248402413330766e-06, | |
| "loss": 0.7872, | |
| "step": 12460 | |
| }, | |
| { | |
| "epoch": 2.2108060230292295, | |
| "grad_norm": 4.236342233174995, | |
| "learning_rate": 3.419092359984695e-06, | |
| "loss": 0.7546, | |
| "step": 12480 | |
| }, | |
| { | |
| "epoch": 2.2143489813994686, | |
| "grad_norm": 2.8151366817841756, | |
| "learning_rate": 3.41333885544515e-06, | |
| "loss": 0.7635, | |
| "step": 12500 | |
| }, | |
| { | |
| "epoch": 2.2178919397697077, | |
| "grad_norm": 3.2839030456741978, | |
| "learning_rate": 3.4075797629155336e-06, | |
| "loss": 0.7588, | |
| "step": 12520 | |
| }, | |
| { | |
| "epoch": 2.2214348981399468, | |
| "grad_norm": 3.0947148397280997, | |
| "learning_rate": 3.4018151176311267e-06, | |
| "loss": 0.7277, | |
| "step": 12540 | |
| }, | |
| { | |
| "epoch": 2.224977856510186, | |
| "grad_norm": 3.5428621372363063, | |
| "learning_rate": 3.396044954861185e-06, | |
| "loss": 0.7679, | |
| "step": 12560 | |
| }, | |
| { | |
| "epoch": 2.228520814880425, | |
| "grad_norm": 2.02419126865859, | |
| "learning_rate": 3.39026930990872e-06, | |
| "loss": 0.7446, | |
| "step": 12580 | |
| }, | |
| { | |
| "epoch": 2.2320637732506645, | |
| "grad_norm": 4.0306736526937765, | |
| "learning_rate": 3.384488218110285e-06, | |
| "loss": 0.7599, | |
| "step": 12600 | |
| }, | |
| { | |
| "epoch": 2.2356067316209036, | |
| "grad_norm": 3.3079541839461606, | |
| "learning_rate": 3.378701714835756e-06, | |
| "loss": 0.7325, | |
| "step": 12620 | |
| }, | |
| { | |
| "epoch": 2.2391496899911427, | |
| "grad_norm": 3.6500019106828754, | |
| "learning_rate": 3.3729098354881207e-06, | |
| "loss": 0.7834, | |
| "step": 12640 | |
| }, | |
| { | |
| "epoch": 2.242692648361382, | |
| "grad_norm": 2.9776776199055073, | |
| "learning_rate": 3.367112615503256e-06, | |
| "loss": 0.7479, | |
| "step": 12660 | |
| }, | |
| { | |
| "epoch": 2.246235606731621, | |
| "grad_norm": 2.8136251511132, | |
| "learning_rate": 3.3613100903497165e-06, | |
| "loss": 0.7972, | |
| "step": 12680 | |
| }, | |
| { | |
| "epoch": 2.24977856510186, | |
| "grad_norm": 2.6979166100675633, | |
| "learning_rate": 3.355502295528512e-06, | |
| "loss": 0.785, | |
| "step": 12700 | |
| }, | |
| { | |
| "epoch": 2.253321523472099, | |
| "grad_norm": 2.555788234192105, | |
| "learning_rate": 3.349689266572896e-06, | |
| "loss": 0.7337, | |
| "step": 12720 | |
| }, | |
| { | |
| "epoch": 2.2568644818423382, | |
| "grad_norm": 4.085833264183358, | |
| "learning_rate": 3.3438710390481423e-06, | |
| "loss": 0.7795, | |
| "step": 12740 | |
| }, | |
| { | |
| "epoch": 2.2604074402125773, | |
| "grad_norm": 4.190825689674867, | |
| "learning_rate": 3.338047648551333e-06, | |
| "loss": 0.7946, | |
| "step": 12760 | |
| }, | |
| { | |
| "epoch": 2.263950398582817, | |
| "grad_norm": 3.1997981806226496, | |
| "learning_rate": 3.3322191307111386e-06, | |
| "loss": 0.7573, | |
| "step": 12780 | |
| }, | |
| { | |
| "epoch": 2.267493356953056, | |
| "grad_norm": 3.0496961823219237, | |
| "learning_rate": 3.326385521187598e-06, | |
| "loss": 0.7191, | |
| "step": 12800 | |
| }, | |
| { | |
| "epoch": 2.271036315323295, | |
| "grad_norm": 2.3669933953301996, | |
| "learning_rate": 3.320546855671903e-06, | |
| "loss": 0.7787, | |
| "step": 12820 | |
| }, | |
| { | |
| "epoch": 2.274579273693534, | |
| "grad_norm": 4.128804064988176, | |
| "learning_rate": 3.3147031698861783e-06, | |
| "loss": 0.8122, | |
| "step": 12840 | |
| }, | |
| { | |
| "epoch": 2.2781222320637733, | |
| "grad_norm": 3.6541280879376288, | |
| "learning_rate": 3.308854499583265e-06, | |
| "loss": 0.8089, | |
| "step": 12860 | |
| }, | |
| { | |
| "epoch": 2.2816651904340124, | |
| "grad_norm": 3.9405511693030513, | |
| "learning_rate": 3.3030008805464987e-06, | |
| "loss": 0.7806, | |
| "step": 12880 | |
| }, | |
| { | |
| "epoch": 2.2852081488042515, | |
| "grad_norm": 2.7534555896912183, | |
| "learning_rate": 3.297142348589493e-06, | |
| "loss": 0.7826, | |
| "step": 12900 | |
| }, | |
| { | |
| "epoch": 2.2887511071744906, | |
| "grad_norm": 3.3797355039239956, | |
| "learning_rate": 3.2912789395559226e-06, | |
| "loss": 0.8049, | |
| "step": 12920 | |
| }, | |
| { | |
| "epoch": 2.2922940655447297, | |
| "grad_norm": 3.9407118834982677, | |
| "learning_rate": 3.285410689319295e-06, | |
| "loss": 0.7897, | |
| "step": 12940 | |
| }, | |
| { | |
| "epoch": 2.2958370239149692, | |
| "grad_norm": 2.4540627878279713, | |
| "learning_rate": 3.2795376337827416e-06, | |
| "loss": 0.7869, | |
| "step": 12960 | |
| }, | |
| { | |
| "epoch": 2.299379982285208, | |
| "grad_norm": 5.328758496807459, | |
| "learning_rate": 3.273659808878794e-06, | |
| "loss": 0.7567, | |
| "step": 12980 | |
| }, | |
| { | |
| "epoch": 2.3029229406554474, | |
| "grad_norm": 2.939528537204539, | |
| "learning_rate": 3.2677772505691614e-06, | |
| "loss": 0.7337, | |
| "step": 13000 | |
| }, | |
| { | |
| "epoch": 2.3064658990256866, | |
| "grad_norm": 2.1870838009976206, | |
| "learning_rate": 3.2618899948445143e-06, | |
| "loss": 0.8036, | |
| "step": 13020 | |
| }, | |
| { | |
| "epoch": 2.3100088573959257, | |
| "grad_norm": 2.3749989783363037, | |
| "learning_rate": 3.255998077724261e-06, | |
| "loss": 0.7477, | |
| "step": 13040 | |
| }, | |
| { | |
| "epoch": 2.3135518157661648, | |
| "grad_norm": 4.691180192385719, | |
| "learning_rate": 3.250101535256333e-06, | |
| "loss": 0.7982, | |
| "step": 13060 | |
| }, | |
| { | |
| "epoch": 2.317094774136404, | |
| "grad_norm": 3.160712448911439, | |
| "learning_rate": 3.2442004035169566e-06, | |
| "loss": 0.7429, | |
| "step": 13080 | |
| }, | |
| { | |
| "epoch": 2.320637732506643, | |
| "grad_norm": 3.397209885685089, | |
| "learning_rate": 3.2382947186104385e-06, | |
| "loss": 0.7749, | |
| "step": 13100 | |
| }, | |
| { | |
| "epoch": 2.324180690876882, | |
| "grad_norm": 3.15455456395674, | |
| "learning_rate": 3.232384516668943e-06, | |
| "loss": 0.7693, | |
| "step": 13120 | |
| }, | |
| { | |
| "epoch": 2.327723649247121, | |
| "grad_norm": 5.565654365935215, | |
| "learning_rate": 3.2264698338522664e-06, | |
| "loss": 0.772, | |
| "step": 13140 | |
| }, | |
| { | |
| "epoch": 2.3312666076173603, | |
| "grad_norm": 4.698072365225265, | |
| "learning_rate": 3.2205507063476255e-06, | |
| "loss": 0.7808, | |
| "step": 13160 | |
| }, | |
| { | |
| "epoch": 2.3348095659876, | |
| "grad_norm": 3.59501906125545, | |
| "learning_rate": 3.2146271703694277e-06, | |
| "loss": 0.7505, | |
| "step": 13180 | |
| }, | |
| { | |
| "epoch": 2.338352524357839, | |
| "grad_norm": 3.679313411486785, | |
| "learning_rate": 3.208699262159052e-06, | |
| "loss": 0.7336, | |
| "step": 13200 | |
| }, | |
| { | |
| "epoch": 2.341895482728078, | |
| "grad_norm": 2.8791170352318276, | |
| "learning_rate": 3.2027670179846294e-06, | |
| "loss": 0.7307, | |
| "step": 13220 | |
| }, | |
| { | |
| "epoch": 2.345438441098317, | |
| "grad_norm": 4.059760879698832, | |
| "learning_rate": 3.196830474140816e-06, | |
| "loss": 0.753, | |
| "step": 13240 | |
| }, | |
| { | |
| "epoch": 2.348981399468556, | |
| "grad_norm": 2.996057141115257, | |
| "learning_rate": 3.190889666948579e-06, | |
| "loss": 0.7399, | |
| "step": 13260 | |
| }, | |
| { | |
| "epoch": 2.3525243578387953, | |
| "grad_norm": 4.234482920426088, | |
| "learning_rate": 3.184944632754964e-06, | |
| "loss": 0.7904, | |
| "step": 13280 | |
| }, | |
| { | |
| "epoch": 2.3560673162090344, | |
| "grad_norm": 2.3964341925650463, | |
| "learning_rate": 3.1789954079328835e-06, | |
| "loss": 0.7534, | |
| "step": 13300 | |
| }, | |
| { | |
| "epoch": 2.3596102745792735, | |
| "grad_norm": 2.9159272754142918, | |
| "learning_rate": 3.1730420288808862e-06, | |
| "loss": 0.7834, | |
| "step": 13320 | |
| }, | |
| { | |
| "epoch": 2.3631532329495126, | |
| "grad_norm": 3.781684460395673, | |
| "learning_rate": 3.1670845320229355e-06, | |
| "loss": 0.771, | |
| "step": 13340 | |
| }, | |
| { | |
| "epoch": 2.366696191319752, | |
| "grad_norm": 2.826416108672929, | |
| "learning_rate": 3.161122953808192e-06, | |
| "loss": 0.7354, | |
| "step": 13360 | |
| }, | |
| { | |
| "epoch": 2.3702391496899913, | |
| "grad_norm": 5.104935252190013, | |
| "learning_rate": 3.1551573307107867e-06, | |
| "loss": 0.7503, | |
| "step": 13380 | |
| }, | |
| { | |
| "epoch": 2.3737821080602304, | |
| "grad_norm": 4.124050748848566, | |
| "learning_rate": 3.149187699229595e-06, | |
| "loss": 0.7191, | |
| "step": 13400 | |
| }, | |
| { | |
| "epoch": 2.3773250664304695, | |
| "grad_norm": 4.433917574735756, | |
| "learning_rate": 3.1432140958880186e-06, | |
| "loss": 0.7036, | |
| "step": 13420 | |
| }, | |
| { | |
| "epoch": 2.3808680248007086, | |
| "grad_norm": 6.0333723611681975, | |
| "learning_rate": 3.1372365572337592e-06, | |
| "loss": 0.6947, | |
| "step": 13440 | |
| }, | |
| { | |
| "epoch": 2.3844109831709477, | |
| "grad_norm": 3.496479107809089, | |
| "learning_rate": 3.1312551198385964e-06, | |
| "loss": 0.8186, | |
| "step": 13460 | |
| }, | |
| { | |
| "epoch": 2.387953941541187, | |
| "grad_norm": 2.9985043132815608, | |
| "learning_rate": 3.1252698202981613e-06, | |
| "loss": 0.762, | |
| "step": 13480 | |
| }, | |
| { | |
| "epoch": 2.391496899911426, | |
| "grad_norm": 3.5778965231230733, | |
| "learning_rate": 3.1192806952317155e-06, | |
| "loss": 0.7475, | |
| "step": 13500 | |
| }, | |
| { | |
| "epoch": 2.395039858281665, | |
| "grad_norm": 2.4853017301906046, | |
| "learning_rate": 3.113287781281927e-06, | |
| "loss": 0.7673, | |
| "step": 13520 | |
| }, | |
| { | |
| "epoch": 2.3985828166519045, | |
| "grad_norm": 4.385979644321999, | |
| "learning_rate": 3.107291115114643e-06, | |
| "loss": 0.7664, | |
| "step": 13540 | |
| }, | |
| { | |
| "epoch": 2.4021257750221436, | |
| "grad_norm": 3.519870816810653, | |
| "learning_rate": 3.1012907334186676e-06, | |
| "loss": 0.7708, | |
| "step": 13560 | |
| }, | |
| { | |
| "epoch": 2.4056687333923827, | |
| "grad_norm": 3.6501676511740913, | |
| "learning_rate": 3.09528667290554e-06, | |
| "loss": 0.7354, | |
| "step": 13580 | |
| }, | |
| { | |
| "epoch": 2.409211691762622, | |
| "grad_norm": 5.282285428849241, | |
| "learning_rate": 3.0892789703093025e-06, | |
| "loss": 0.7679, | |
| "step": 13600 | |
| }, | |
| { | |
| "epoch": 2.412754650132861, | |
| "grad_norm": 4.970772234231322, | |
| "learning_rate": 3.0832676623862847e-06, | |
| "loss": 0.7753, | |
| "step": 13620 | |
| }, | |
| { | |
| "epoch": 2.4162976085031, | |
| "grad_norm": 5.300930589688842, | |
| "learning_rate": 3.0772527859148726e-06, | |
| "loss": 0.7309, | |
| "step": 13640 | |
| }, | |
| { | |
| "epoch": 2.419840566873339, | |
| "grad_norm": 3.195715973824964, | |
| "learning_rate": 3.0712343776952845e-06, | |
| "loss": 0.8118, | |
| "step": 13660 | |
| }, | |
| { | |
| "epoch": 2.4233835252435783, | |
| "grad_norm": 3.8485686008854025, | |
| "learning_rate": 3.0652124745493483e-06, | |
| "loss": 0.7677, | |
| "step": 13680 | |
| }, | |
| { | |
| "epoch": 2.4269264836138174, | |
| "grad_norm": 3.6307249326313844, | |
| "learning_rate": 3.0591871133202733e-06, | |
| "loss": 0.7562, | |
| "step": 13700 | |
| }, | |
| { | |
| "epoch": 2.430469441984057, | |
| "grad_norm": 3.316064861185543, | |
| "learning_rate": 3.0531583308724267e-06, | |
| "loss": 0.7626, | |
| "step": 13720 | |
| }, | |
| { | |
| "epoch": 2.434012400354296, | |
| "grad_norm": 6.634549317490503, | |
| "learning_rate": 3.0471261640911065e-06, | |
| "loss": 0.758, | |
| "step": 13740 | |
| }, | |
| { | |
| "epoch": 2.437555358724535, | |
| "grad_norm": 3.175037432084709, | |
| "learning_rate": 3.0410906498823176e-06, | |
| "loss": 0.747, | |
| "step": 13760 | |
| }, | |
| { | |
| "epoch": 2.441098317094774, | |
| "grad_norm": 4.104765159107976, | |
| "learning_rate": 3.0350518251725466e-06, | |
| "loss": 0.7529, | |
| "step": 13780 | |
| }, | |
| { | |
| "epoch": 2.4446412754650133, | |
| "grad_norm": 4.108216250616023, | |
| "learning_rate": 3.02900972690853e-06, | |
| "loss": 0.7329, | |
| "step": 13800 | |
| }, | |
| { | |
| "epoch": 2.4481842338352524, | |
| "grad_norm": 5.179386294221434, | |
| "learning_rate": 3.0229643920570368e-06, | |
| "loss": 0.7756, | |
| "step": 13820 | |
| }, | |
| { | |
| "epoch": 2.4517271922054915, | |
| "grad_norm": 3.532213415891956, | |
| "learning_rate": 3.0169158576046364e-06, | |
| "loss": 0.7857, | |
| "step": 13840 | |
| }, | |
| { | |
| "epoch": 2.4552701505757306, | |
| "grad_norm": 3.1668720104134898, | |
| "learning_rate": 3.0108641605574746e-06, | |
| "loss": 0.7689, | |
| "step": 13860 | |
| }, | |
| { | |
| "epoch": 2.4588131089459697, | |
| "grad_norm": 5.323518128860221, | |
| "learning_rate": 3.0048093379410455e-06, | |
| "loss": 0.7193, | |
| "step": 13880 | |
| }, | |
| { | |
| "epoch": 2.4623560673162093, | |
| "grad_norm": 2.8074163772217346, | |
| "learning_rate": 2.998751426799967e-06, | |
| "loss": 0.7663, | |
| "step": 13900 | |
| }, | |
| { | |
| "epoch": 2.4658990256864484, | |
| "grad_norm": 3.3433870569143402, | |
| "learning_rate": 2.9926904641977524e-06, | |
| "loss": 0.7351, | |
| "step": 13920 | |
| }, | |
| { | |
| "epoch": 2.4694419840566875, | |
| "grad_norm": 4.317693104328955, | |
| "learning_rate": 2.986626487216586e-06, | |
| "loss": 0.7303, | |
| "step": 13940 | |
| }, | |
| { | |
| "epoch": 2.4729849424269266, | |
| "grad_norm": 2.9507964917970093, | |
| "learning_rate": 2.9805595329570926e-06, | |
| "loss": 0.7355, | |
| "step": 13960 | |
| }, | |
| { | |
| "epoch": 2.4765279007971657, | |
| "grad_norm": 5.5399797509230035, | |
| "learning_rate": 2.974489638538115e-06, | |
| "loss": 0.7673, | |
| "step": 13980 | |
| }, | |
| { | |
| "epoch": 2.4800708591674048, | |
| "grad_norm": 6.341760350289788, | |
| "learning_rate": 2.9684168410964815e-06, | |
| "loss": 0.7332, | |
| "step": 14000 | |
| }, | |
| { | |
| "epoch": 2.4800708591674048, | |
| "eval_loss": 0.8049691915512085, | |
| "eval_runtime": 366.4293, | |
| "eval_samples_per_second": 25.945, | |
| "eval_steps_per_second": 3.245, | |
| "step": 14000 | |
| }, | |
| { | |
| "epoch": 2.483613817537644, | |
| "grad_norm": 3.362490165226843, | |
| "learning_rate": 2.9623411777867845e-06, | |
| "loss": 0.8132, | |
| "step": 14020 | |
| }, | |
| { | |
| "epoch": 2.487156775907883, | |
| "grad_norm": 3.067701488951309, | |
| "learning_rate": 2.9562626857811486e-06, | |
| "loss": 0.73, | |
| "step": 14040 | |
| }, | |
| { | |
| "epoch": 2.490699734278122, | |
| "grad_norm": 5.481445388434678, | |
| "learning_rate": 2.950181402269007e-06, | |
| "loss": 0.7727, | |
| "step": 14060 | |
| }, | |
| { | |
| "epoch": 2.4942426926483616, | |
| "grad_norm": 3.704158248514089, | |
| "learning_rate": 2.944097364456867e-06, | |
| "loss": 0.7594, | |
| "step": 14080 | |
| }, | |
| { | |
| "epoch": 2.4977856510186003, | |
| "grad_norm": 3.703332272719033, | |
| "learning_rate": 2.9380106095680943e-06, | |
| "loss": 0.7816, | |
| "step": 14100 | |
| }, | |
| { | |
| "epoch": 2.50132860938884, | |
| "grad_norm": 5.075701987314385, | |
| "learning_rate": 2.931921174842672e-06, | |
| "loss": 0.8127, | |
| "step": 14120 | |
| }, | |
| { | |
| "epoch": 2.504871567759079, | |
| "grad_norm": 3.3757570715718126, | |
| "learning_rate": 2.925829097536983e-06, | |
| "loss": 0.7594, | |
| "step": 14140 | |
| }, | |
| { | |
| "epoch": 2.508414526129318, | |
| "grad_norm": 5.003414946307021, | |
| "learning_rate": 2.9197344149235762e-06, | |
| "loss": 0.802, | |
| "step": 14160 | |
| }, | |
| { | |
| "epoch": 2.511957484499557, | |
| "grad_norm": 3.883907931049023, | |
| "learning_rate": 2.9136371642909406e-06, | |
| "loss": 0.7292, | |
| "step": 14180 | |
| }, | |
| { | |
| "epoch": 2.5155004428697962, | |
| "grad_norm": 4.7644188631188555, | |
| "learning_rate": 2.9075373829432766e-06, | |
| "loss": 0.7899, | |
| "step": 14200 | |
| }, | |
| { | |
| "epoch": 2.5190434012400353, | |
| "grad_norm": 3.1005288937544795, | |
| "learning_rate": 2.901435108200269e-06, | |
| "loss": 0.7501, | |
| "step": 14220 | |
| }, | |
| { | |
| "epoch": 2.5225863596102744, | |
| "grad_norm": 4.202314740845049, | |
| "learning_rate": 2.8953303773968566e-06, | |
| "loss": 0.733, | |
| "step": 14240 | |
| }, | |
| { | |
| "epoch": 2.526129317980514, | |
| "grad_norm": 3.0865315047496256, | |
| "learning_rate": 2.889223227883006e-06, | |
| "loss": 0.7218, | |
| "step": 14260 | |
| }, | |
| { | |
| "epoch": 2.5296722763507526, | |
| "grad_norm": 3.687380874866973, | |
| "learning_rate": 2.8831136970234798e-06, | |
| "loss": 0.7539, | |
| "step": 14280 | |
| }, | |
| { | |
| "epoch": 2.533215234720992, | |
| "grad_norm": 4.787195673053088, | |
| "learning_rate": 2.8770018221976126e-06, | |
| "loss": 0.7733, | |
| "step": 14300 | |
| }, | |
| { | |
| "epoch": 2.5367581930912313, | |
| "grad_norm": 3.131210531061707, | |
| "learning_rate": 2.8708876407990794e-06, | |
| "loss": 0.8023, | |
| "step": 14320 | |
| }, | |
| { | |
| "epoch": 2.5403011514614704, | |
| "grad_norm": 6.575229105312035, | |
| "learning_rate": 2.8647711902356653e-06, | |
| "loss": 0.7857, | |
| "step": 14340 | |
| }, | |
| { | |
| "epoch": 2.5438441098317095, | |
| "grad_norm": 3.132660461857889, | |
| "learning_rate": 2.858652507929042e-06, | |
| "loss": 0.6994, | |
| "step": 14360 | |
| }, | |
| { | |
| "epoch": 2.5473870682019486, | |
| "grad_norm": 4.422023062404442, | |
| "learning_rate": 2.852531631314531e-06, | |
| "loss": 0.7629, | |
| "step": 14380 | |
| }, | |
| { | |
| "epoch": 2.5509300265721877, | |
| "grad_norm": 6.144792250633855, | |
| "learning_rate": 2.846408597840884e-06, | |
| "loss": 0.8015, | |
| "step": 14400 | |
| }, | |
| { | |
| "epoch": 2.554472984942427, | |
| "grad_norm": 4.254153197416055, | |
| "learning_rate": 2.8402834449700444e-06, | |
| "loss": 0.8166, | |
| "step": 14420 | |
| }, | |
| { | |
| "epoch": 2.5580159433126664, | |
| "grad_norm": 4.276352369102927, | |
| "learning_rate": 2.8341562101769258e-06, | |
| "loss": 0.7488, | |
| "step": 14440 | |
| }, | |
| { | |
| "epoch": 2.561558901682905, | |
| "grad_norm": 2.7388450962012008, | |
| "learning_rate": 2.8280269309491783e-06, | |
| "loss": 0.731, | |
| "step": 14460 | |
| }, | |
| { | |
| "epoch": 2.5651018600531446, | |
| "grad_norm": 3.155033747973735, | |
| "learning_rate": 2.821895644786958e-06, | |
| "loss": 0.7601, | |
| "step": 14480 | |
| }, | |
| { | |
| "epoch": 2.5686448184233837, | |
| "grad_norm": 3.8509609959663242, | |
| "learning_rate": 2.815762389202703e-06, | |
| "loss": 0.7773, | |
| "step": 14500 | |
| }, | |
| { | |
| "epoch": 2.5721877767936228, | |
| "grad_norm": 2.7079281092995617, | |
| "learning_rate": 2.8096272017208996e-06, | |
| "loss": 0.6832, | |
| "step": 14520 | |
| }, | |
| { | |
| "epoch": 2.575730735163862, | |
| "grad_norm": 4.486116320592051, | |
| "learning_rate": 2.8034901198778537e-06, | |
| "loss": 0.7792, | |
| "step": 14540 | |
| }, | |
| { | |
| "epoch": 2.579273693534101, | |
| "grad_norm": 3.703073425582054, | |
| "learning_rate": 2.7973511812214614e-06, | |
| "loss": 0.7297, | |
| "step": 14560 | |
| }, | |
| { | |
| "epoch": 2.58281665190434, | |
| "grad_norm": 4.327931475336945, | |
| "learning_rate": 2.79121042331098e-06, | |
| "loss": 0.7332, | |
| "step": 14580 | |
| }, | |
| { | |
| "epoch": 2.586359610274579, | |
| "grad_norm": 4.730270311129735, | |
| "learning_rate": 2.7850678837167943e-06, | |
| "loss": 0.7537, | |
| "step": 14600 | |
| }, | |
| { | |
| "epoch": 2.5899025686448183, | |
| "grad_norm": 4.061197200463917, | |
| "learning_rate": 2.778923600020193e-06, | |
| "loss": 0.7364, | |
| "step": 14620 | |
| }, | |
| { | |
| "epoch": 2.5934455270150574, | |
| "grad_norm": 5.875022772190702, | |
| "learning_rate": 2.7727776098131355e-06, | |
| "loss": 0.763, | |
| "step": 14640 | |
| }, | |
| { | |
| "epoch": 2.596988485385297, | |
| "grad_norm": 4.2525211564332155, | |
| "learning_rate": 2.76662995069802e-06, | |
| "loss": 0.7106, | |
| "step": 14660 | |
| }, | |
| { | |
| "epoch": 2.600531443755536, | |
| "grad_norm": 2.812095781009687, | |
| "learning_rate": 2.760480660287457e-06, | |
| "loss": 0.7964, | |
| "step": 14680 | |
| }, | |
| { | |
| "epoch": 2.604074402125775, | |
| "grad_norm": 2.982535796988278, | |
| "learning_rate": 2.7543297762040367e-06, | |
| "loss": 0.7471, | |
| "step": 14700 | |
| }, | |
| { | |
| "epoch": 2.6076173604960142, | |
| "grad_norm": 3.3609650163002285, | |
| "learning_rate": 2.748177336080099e-06, | |
| "loss": 0.7404, | |
| "step": 14720 | |
| }, | |
| { | |
| "epoch": 2.6111603188662533, | |
| "grad_norm": 3.3659754899747183, | |
| "learning_rate": 2.7420233775575062e-06, | |
| "loss": 0.7088, | |
| "step": 14740 | |
| }, | |
| { | |
| "epoch": 2.6147032772364924, | |
| "grad_norm": 4.8930238508897865, | |
| "learning_rate": 2.73586793828741e-06, | |
| "loss": 0.8097, | |
| "step": 14760 | |
| }, | |
| { | |
| "epoch": 2.6182462356067315, | |
| "grad_norm": 3.7139806486443523, | |
| "learning_rate": 2.7297110559300196e-06, | |
| "loss": 0.726, | |
| "step": 14780 | |
| }, | |
| { | |
| "epoch": 2.6217891939769706, | |
| "grad_norm": 5.786625818280847, | |
| "learning_rate": 2.7235527681543745e-06, | |
| "loss": 0.7663, | |
| "step": 14800 | |
| }, | |
| { | |
| "epoch": 2.6253321523472097, | |
| "grad_norm": 2.4715288407192917, | |
| "learning_rate": 2.717393112638113e-06, | |
| "loss": 0.8067, | |
| "step": 14820 | |
| }, | |
| { | |
| "epoch": 2.6288751107174493, | |
| "grad_norm": 2.48294124433425, | |
| "learning_rate": 2.7112321270672427e-06, | |
| "loss": 0.7436, | |
| "step": 14840 | |
| }, | |
| { | |
| "epoch": 2.6324180690876884, | |
| "grad_norm": 4.533346896730641, | |
| "learning_rate": 2.705069849135905e-06, | |
| "loss": 0.7542, | |
| "step": 14860 | |
| }, | |
| { | |
| "epoch": 2.6359610274579275, | |
| "grad_norm": 2.595943549639139, | |
| "learning_rate": 2.698906316546154e-06, | |
| "loss": 0.7206, | |
| "step": 14880 | |
| }, | |
| { | |
| "epoch": 2.6395039858281666, | |
| "grad_norm": 4.011814531987077, | |
| "learning_rate": 2.6927415670077133e-06, | |
| "loss": 0.7981, | |
| "step": 14900 | |
| }, | |
| { | |
| "epoch": 2.6430469441984057, | |
| "grad_norm": 3.060330539530146, | |
| "learning_rate": 2.6865756382377577e-06, | |
| "loss": 0.7805, | |
| "step": 14920 | |
| }, | |
| { | |
| "epoch": 2.646589902568645, | |
| "grad_norm": 2.9995394478553803, | |
| "learning_rate": 2.6804085679606735e-06, | |
| "loss": 0.7601, | |
| "step": 14940 | |
| }, | |
| { | |
| "epoch": 2.650132860938884, | |
| "grad_norm": 5.280778921498777, | |
| "learning_rate": 2.674240393907832e-06, | |
| "loss": 0.7646, | |
| "step": 14960 | |
| }, | |
| { | |
| "epoch": 2.653675819309123, | |
| "grad_norm": 3.798446112260494, | |
| "learning_rate": 2.6680711538173595e-06, | |
| "loss": 0.7871, | |
| "step": 14980 | |
| }, | |
| { | |
| "epoch": 2.657218777679362, | |
| "grad_norm": 5.01912287250632, | |
| "learning_rate": 2.661900885433899e-06, | |
| "loss": 0.745, | |
| "step": 15000 | |
| }, | |
| { | |
| "epoch": 2.6607617360496016, | |
| "grad_norm": 2.960773034701044, | |
| "learning_rate": 2.6557296265083917e-06, | |
| "loss": 0.7822, | |
| "step": 15020 | |
| }, | |
| { | |
| "epoch": 2.6643046944198403, | |
| "grad_norm": 3.6183436876712287, | |
| "learning_rate": 2.649557414797834e-06, | |
| "loss": 0.7811, | |
| "step": 15040 | |
| }, | |
| { | |
| "epoch": 2.66784765279008, | |
| "grad_norm": 2.9204062861952242, | |
| "learning_rate": 2.6433842880650552e-06, | |
| "loss": 0.7684, | |
| "step": 15060 | |
| }, | |
| { | |
| "epoch": 2.671390611160319, | |
| "grad_norm": 3.8802496727785587, | |
| "learning_rate": 2.63721028407848e-06, | |
| "loss": 0.6913, | |
| "step": 15080 | |
| }, | |
| { | |
| "epoch": 2.674933569530558, | |
| "grad_norm": 2.5788899500499554, | |
| "learning_rate": 2.6310354406119022e-06, | |
| "loss": 0.7309, | |
| "step": 15100 | |
| }, | |
| { | |
| "epoch": 2.678476527900797, | |
| "grad_norm": 4.277286678336026, | |
| "learning_rate": 2.6248597954442493e-06, | |
| "loss": 0.7644, | |
| "step": 15120 | |
| }, | |
| { | |
| "epoch": 2.6820194862710363, | |
| "grad_norm": 3.188282097160713, | |
| "learning_rate": 2.6186833863593576e-06, | |
| "loss": 0.7619, | |
| "step": 15140 | |
| }, | |
| { | |
| "epoch": 2.6855624446412754, | |
| "grad_norm": 2.200272309526565, | |
| "learning_rate": 2.6125062511457344e-06, | |
| "loss": 0.7518, | |
| "step": 15160 | |
| }, | |
| { | |
| "epoch": 2.6891054030115145, | |
| "grad_norm": 4.286800017508891, | |
| "learning_rate": 2.6063284275963296e-06, | |
| "loss": 0.7551, | |
| "step": 15180 | |
| }, | |
| { | |
| "epoch": 2.692648361381754, | |
| "grad_norm": 3.9692599529255337, | |
| "learning_rate": 2.6001499535083067e-06, | |
| "loss": 0.7885, | |
| "step": 15200 | |
| }, | |
| { | |
| "epoch": 2.6961913197519927, | |
| "grad_norm": 3.3386892906925203, | |
| "learning_rate": 2.593970866682806e-06, | |
| "loss": 0.7603, | |
| "step": 15220 | |
| }, | |
| { | |
| "epoch": 2.699734278122232, | |
| "grad_norm": 3.5522070573455298, | |
| "learning_rate": 2.5877912049247206e-06, | |
| "loss": 0.7833, | |
| "step": 15240 | |
| }, | |
| { | |
| "epoch": 2.7032772364924713, | |
| "grad_norm": 3.6763130005746327, | |
| "learning_rate": 2.5816110060424566e-06, | |
| "loss": 0.7451, | |
| "step": 15260 | |
| }, | |
| { | |
| "epoch": 2.7068201948627104, | |
| "grad_norm": 4.065296942712161, | |
| "learning_rate": 2.57543030784771e-06, | |
| "loss": 0.7856, | |
| "step": 15280 | |
| }, | |
| { | |
| "epoch": 2.7103631532329495, | |
| "grad_norm": 3.925534297841757, | |
| "learning_rate": 2.5692491481552314e-06, | |
| "loss": 0.7869, | |
| "step": 15300 | |
| }, | |
| { | |
| "epoch": 2.7139061116031886, | |
| "grad_norm": 5.0603926851389245, | |
| "learning_rate": 2.5630675647825913e-06, | |
| "loss": 0.7616, | |
| "step": 15320 | |
| }, | |
| { | |
| "epoch": 2.7174490699734277, | |
| "grad_norm": 4.055554853144999, | |
| "learning_rate": 2.5568855955499573e-06, | |
| "loss": 0.7882, | |
| "step": 15340 | |
| }, | |
| { | |
| "epoch": 2.720992028343667, | |
| "grad_norm": 3.478661550213012, | |
| "learning_rate": 2.5507032782798553e-06, | |
| "loss": 0.7852, | |
| "step": 15360 | |
| }, | |
| { | |
| "epoch": 2.7245349867139064, | |
| "grad_norm": 2.171548931614822, | |
| "learning_rate": 2.5445206507969395e-06, | |
| "loss": 0.77, | |
| "step": 15380 | |
| }, | |
| { | |
| "epoch": 2.728077945084145, | |
| "grad_norm": 3.0583258036393604, | |
| "learning_rate": 2.5383377509277648e-06, | |
| "loss": 0.7404, | |
| "step": 15400 | |
| }, | |
| { | |
| "epoch": 2.7316209034543846, | |
| "grad_norm": 3.1484952352447273, | |
| "learning_rate": 2.5321546165005497e-06, | |
| "loss": 0.7266, | |
| "step": 15420 | |
| }, | |
| { | |
| "epoch": 2.7351638618246237, | |
| "grad_norm": 2.942970058363834, | |
| "learning_rate": 2.5259712853449503e-06, | |
| "loss": 0.7527, | |
| "step": 15440 | |
| }, | |
| { | |
| "epoch": 2.738706820194863, | |
| "grad_norm": 2.645281388123703, | |
| "learning_rate": 2.5197877952918243e-06, | |
| "loss": 0.7662, | |
| "step": 15460 | |
| }, | |
| { | |
| "epoch": 2.742249778565102, | |
| "grad_norm": 3.4746684651389548, | |
| "learning_rate": 2.5136041841730026e-06, | |
| "loss": 0.7628, | |
| "step": 15480 | |
| }, | |
| { | |
| "epoch": 2.745792736935341, | |
| "grad_norm": 3.7116192950002795, | |
| "learning_rate": 2.5074204898210587e-06, | |
| "loss": 0.7428, | |
| "step": 15500 | |
| }, | |
| { | |
| "epoch": 2.74933569530558, | |
| "grad_norm": 5.909705442043078, | |
| "learning_rate": 2.50123675006907e-06, | |
| "loss": 0.7538, | |
| "step": 15520 | |
| }, | |
| { | |
| "epoch": 2.752878653675819, | |
| "grad_norm": 3.8189355746524045, | |
| "learning_rate": 2.4950530027503963e-06, | |
| "loss": 0.7647, | |
| "step": 15540 | |
| }, | |
| { | |
| "epoch": 2.7564216120460587, | |
| "grad_norm": 3.5256750418834515, | |
| "learning_rate": 2.4888692856984446e-06, | |
| "loss": 0.7332, | |
| "step": 15560 | |
| }, | |
| { | |
| "epoch": 2.7599645704162974, | |
| "grad_norm": 5.2988278310924715, | |
| "learning_rate": 2.482685636746432e-06, | |
| "loss": 0.7446, | |
| "step": 15580 | |
| }, | |
| { | |
| "epoch": 2.763507528786537, | |
| "grad_norm": 2.9280069670539146, | |
| "learning_rate": 2.4765020937271615e-06, | |
| "loss": 0.7999, | |
| "step": 15600 | |
| }, | |
| { | |
| "epoch": 2.767050487156776, | |
| "grad_norm": 3.3339855601700243, | |
| "learning_rate": 2.4703186944727885e-06, | |
| "loss": 0.7421, | |
| "step": 15620 | |
| }, | |
| { | |
| "epoch": 2.770593445527015, | |
| "grad_norm": 3.1483076854706993, | |
| "learning_rate": 2.464135476814589e-06, | |
| "loss": 0.7523, | |
| "step": 15640 | |
| }, | |
| { | |
| "epoch": 2.7741364038972542, | |
| "grad_norm": 3.1964072857533687, | |
| "learning_rate": 2.4579524785827254e-06, | |
| "loss": 0.7793, | |
| "step": 15660 | |
| }, | |
| { | |
| "epoch": 2.7776793622674933, | |
| "grad_norm": 5.327494932320985, | |
| "learning_rate": 2.451769737606021e-06, | |
| "loss": 0.7604, | |
| "step": 15680 | |
| }, | |
| { | |
| "epoch": 2.7812223206377324, | |
| "grad_norm": 2.0488240092746413, | |
| "learning_rate": 2.4455872917117233e-06, | |
| "loss": 0.7198, | |
| "step": 15700 | |
| }, | |
| { | |
| "epoch": 2.7847652790079716, | |
| "grad_norm": 3.6254982549079506, | |
| "learning_rate": 2.439405178725274e-06, | |
| "loss": 0.7811, | |
| "step": 15720 | |
| }, | |
| { | |
| "epoch": 2.7883082373782107, | |
| "grad_norm": 3.813833819224438, | |
| "learning_rate": 2.4332234364700793e-06, | |
| "loss": 0.7857, | |
| "step": 15740 | |
| }, | |
| { | |
| "epoch": 2.7918511957484498, | |
| "grad_norm": 2.332438247153177, | |
| "learning_rate": 2.427042102767278e-06, | |
| "loss": 0.7741, | |
| "step": 15760 | |
| }, | |
| { | |
| "epoch": 2.7953941541186893, | |
| "grad_norm": 5.637052434142593, | |
| "learning_rate": 2.4208612154355054e-06, | |
| "loss": 0.7873, | |
| "step": 15780 | |
| }, | |
| { | |
| "epoch": 2.7989371124889284, | |
| "grad_norm": 3.61371927386971, | |
| "learning_rate": 2.4146808122906685e-06, | |
| "loss": 0.7667, | |
| "step": 15800 | |
| }, | |
| { | |
| "epoch": 2.8024800708591675, | |
| "grad_norm": 3.081712372431115, | |
| "learning_rate": 2.408500931145713e-06, | |
| "loss": 0.7637, | |
| "step": 15820 | |
| }, | |
| { | |
| "epoch": 2.8060230292294066, | |
| "grad_norm": 2.8637593806841064, | |
| "learning_rate": 2.4023216098103892e-06, | |
| "loss": 0.7406, | |
| "step": 15840 | |
| }, | |
| { | |
| "epoch": 2.8095659875996457, | |
| "grad_norm": 4.855719945200848, | |
| "learning_rate": 2.396142886091023e-06, | |
| "loss": 0.762, | |
| "step": 15860 | |
| }, | |
| { | |
| "epoch": 2.813108945969885, | |
| "grad_norm": 2.3565232263934592, | |
| "learning_rate": 2.389964797790283e-06, | |
| "loss": 0.7539, | |
| "step": 15880 | |
| }, | |
| { | |
| "epoch": 2.816651904340124, | |
| "grad_norm": 3.4605355857001325, | |
| "learning_rate": 2.383787382706953e-06, | |
| "loss": 0.7435, | |
| "step": 15900 | |
| }, | |
| { | |
| "epoch": 2.820194862710363, | |
| "grad_norm": 2.2057409877337992, | |
| "learning_rate": 2.377610678635693e-06, | |
| "loss": 0.7737, | |
| "step": 15920 | |
| }, | |
| { | |
| "epoch": 2.823737821080602, | |
| "grad_norm": 2.136429659045698, | |
| "learning_rate": 2.371434723366818e-06, | |
| "loss": 0.7759, | |
| "step": 15940 | |
| }, | |
| { | |
| "epoch": 2.8272807794508417, | |
| "grad_norm": 2.9028483092829815, | |
| "learning_rate": 2.3652595546860595e-06, | |
| "loss": 0.7826, | |
| "step": 15960 | |
| }, | |
| { | |
| "epoch": 2.8308237378210808, | |
| "grad_norm": 3.7595789211806383, | |
| "learning_rate": 2.359085210374335e-06, | |
| "loss": 0.7565, | |
| "step": 15980 | |
| }, | |
| { | |
| "epoch": 2.83436669619132, | |
| "grad_norm": 3.402275688537832, | |
| "learning_rate": 2.3529117282075207e-06, | |
| "loss": 0.7222, | |
| "step": 16000 | |
| }, | |
| { | |
| "epoch": 2.83436669619132, | |
| "eval_loss": 0.7888814210891724, | |
| "eval_runtime": 368.71, | |
| "eval_samples_per_second": 25.784, | |
| "eval_steps_per_second": 3.225, | |
| "step": 16000 | |
| }, | |
| { | |
| "epoch": 2.837909654561559, | |
| "grad_norm": 4.2698147796391455, | |
| "learning_rate": 2.3467391459562163e-06, | |
| "loss": 0.772, | |
| "step": 16020 | |
| }, | |
| { | |
| "epoch": 2.841452612931798, | |
| "grad_norm": 5.560388355211716, | |
| "learning_rate": 2.340567501385518e-06, | |
| "loss": 0.7719, | |
| "step": 16040 | |
| }, | |
| { | |
| "epoch": 2.844995571302037, | |
| "grad_norm": 2.294411499429463, | |
| "learning_rate": 2.3343968322547816e-06, | |
| "loss": 0.7737, | |
| "step": 16060 | |
| }, | |
| { | |
| "epoch": 2.8485385296722763, | |
| "grad_norm": 1.995090359134686, | |
| "learning_rate": 2.3282271763173984e-06, | |
| "loss": 0.7808, | |
| "step": 16080 | |
| }, | |
| { | |
| "epoch": 2.8520814880425154, | |
| "grad_norm": 2.8221763040945422, | |
| "learning_rate": 2.322058571320559e-06, | |
| "loss": 0.7943, | |
| "step": 16100 | |
| }, | |
| { | |
| "epoch": 2.8556244464127545, | |
| "grad_norm": 2.8411701267751264, | |
| "learning_rate": 2.315891055005024e-06, | |
| "loss": 0.7458, | |
| "step": 16120 | |
| }, | |
| { | |
| "epoch": 2.859167404782994, | |
| "grad_norm": 3.841302334883737, | |
| "learning_rate": 2.3097246651048937e-06, | |
| "loss": 0.77, | |
| "step": 16140 | |
| }, | |
| { | |
| "epoch": 2.8627103631532327, | |
| "grad_norm": 3.890480722348098, | |
| "learning_rate": 2.3035594393473777e-06, | |
| "loss": 0.7384, | |
| "step": 16160 | |
| }, | |
| { | |
| "epoch": 2.8662533215234722, | |
| "grad_norm": 5.679565106603388, | |
| "learning_rate": 2.297395415452562e-06, | |
| "loss": 0.803, | |
| "step": 16180 | |
| }, | |
| { | |
| "epoch": 2.8697962798937113, | |
| "grad_norm": 3.337771681609959, | |
| "learning_rate": 2.2912326311331774e-06, | |
| "loss": 0.7028, | |
| "step": 16200 | |
| }, | |
| { | |
| "epoch": 2.8733392382639504, | |
| "grad_norm": 3.9468562372490443, | |
| "learning_rate": 2.285071124094375e-06, | |
| "loss": 0.778, | |
| "step": 16220 | |
| }, | |
| { | |
| "epoch": 2.8768821966341895, | |
| "grad_norm": 3.7924647740545203, | |
| "learning_rate": 2.2789109320334885e-06, | |
| "loss": 0.7559, | |
| "step": 16240 | |
| }, | |
| { | |
| "epoch": 2.8804251550044286, | |
| "grad_norm": 3.3047453094089505, | |
| "learning_rate": 2.2727520926398067e-06, | |
| "loss": 0.7563, | |
| "step": 16260 | |
| }, | |
| { | |
| "epoch": 2.8839681133746677, | |
| "grad_norm": 4.691213784478345, | |
| "learning_rate": 2.2665946435943425e-06, | |
| "loss": 0.7708, | |
| "step": 16280 | |
| }, | |
| { | |
| "epoch": 2.887511071744907, | |
| "grad_norm": 5.050030297267503, | |
| "learning_rate": 2.2604386225696035e-06, | |
| "loss": 0.7855, | |
| "step": 16300 | |
| }, | |
| { | |
| "epoch": 2.8910540301151464, | |
| "grad_norm": 3.297233505976855, | |
| "learning_rate": 2.254284067229359e-06, | |
| "loss": 0.7273, | |
| "step": 16320 | |
| }, | |
| { | |
| "epoch": 2.894596988485385, | |
| "grad_norm": 3.0043017323996106, | |
| "learning_rate": 2.24813101522841e-06, | |
| "loss": 0.7538, | |
| "step": 16340 | |
| }, | |
| { | |
| "epoch": 2.8981399468556246, | |
| "grad_norm": 3.7341413882543857, | |
| "learning_rate": 2.2419795042123644e-06, | |
| "loss": 0.7414, | |
| "step": 16360 | |
| }, | |
| { | |
| "epoch": 2.9016829052258637, | |
| "grad_norm": 4.878090641375587, | |
| "learning_rate": 2.2358295718173966e-06, | |
| "loss": 0.7679, | |
| "step": 16380 | |
| }, | |
| { | |
| "epoch": 2.905225863596103, | |
| "grad_norm": 4.429314718401584, | |
| "learning_rate": 2.2296812556700245e-06, | |
| "loss": 0.7517, | |
| "step": 16400 | |
| }, | |
| { | |
| "epoch": 2.908768821966342, | |
| "grad_norm": 2.9862439233870943, | |
| "learning_rate": 2.2235345933868785e-06, | |
| "loss": 0.7818, | |
| "step": 16420 | |
| }, | |
| { | |
| "epoch": 2.912311780336581, | |
| "grad_norm": 6.698492865099133, | |
| "learning_rate": 2.2173896225744704e-06, | |
| "loss": 0.7695, | |
| "step": 16440 | |
| }, | |
| { | |
| "epoch": 2.91585473870682, | |
| "grad_norm": 4.153101032253308, | |
| "learning_rate": 2.2112463808289613e-06, | |
| "loss": 0.7296, | |
| "step": 16460 | |
| }, | |
| { | |
| "epoch": 2.919397697077059, | |
| "grad_norm": 2.779471876140217, | |
| "learning_rate": 2.2051049057359354e-06, | |
| "loss": 0.7283, | |
| "step": 16480 | |
| }, | |
| { | |
| "epoch": 2.9229406554472988, | |
| "grad_norm": 3.2836392616484984, | |
| "learning_rate": 2.1989652348701683e-06, | |
| "loss": 0.7383, | |
| "step": 16500 | |
| }, | |
| { | |
| "epoch": 2.9264836138175374, | |
| "grad_norm": 1.895818806571468, | |
| "learning_rate": 2.192827405795395e-06, | |
| "loss": 0.7345, | |
| "step": 16520 | |
| }, | |
| { | |
| "epoch": 2.930026572187777, | |
| "grad_norm": 5.6943248214395545, | |
| "learning_rate": 2.1866914560640832e-06, | |
| "loss": 0.7717, | |
| "step": 16540 | |
| }, | |
| { | |
| "epoch": 2.933569530558016, | |
| "grad_norm": 4.513933666928205, | |
| "learning_rate": 2.1805574232172044e-06, | |
| "loss": 0.7773, | |
| "step": 16560 | |
| }, | |
| { | |
| "epoch": 2.937112488928255, | |
| "grad_norm": 4.620092058556062, | |
| "learning_rate": 2.1744253447839988e-06, | |
| "loss": 0.7592, | |
| "step": 16580 | |
| }, | |
| { | |
| "epoch": 2.9406554472984943, | |
| "grad_norm": 3.858292964944665, | |
| "learning_rate": 2.16829525828175e-06, | |
| "loss": 0.7854, | |
| "step": 16600 | |
| }, | |
| { | |
| "epoch": 2.9441984056687334, | |
| "grad_norm": 3.8882587756941174, | |
| "learning_rate": 2.1621672012155552e-06, | |
| "loss": 0.7434, | |
| "step": 16620 | |
| }, | |
| { | |
| "epoch": 2.9477413640389725, | |
| "grad_norm": 4.499656195479335, | |
| "learning_rate": 2.1560412110780967e-06, | |
| "loss": 0.7695, | |
| "step": 16640 | |
| }, | |
| { | |
| "epoch": 2.9512843224092116, | |
| "grad_norm": 3.7965733733040494, | |
| "learning_rate": 2.149917325349408e-06, | |
| "loss": 0.7197, | |
| "step": 16660 | |
| }, | |
| { | |
| "epoch": 2.954827280779451, | |
| "grad_norm": 3.860376338376774, | |
| "learning_rate": 2.143795581496648e-06, | |
| "loss": 0.7403, | |
| "step": 16680 | |
| }, | |
| { | |
| "epoch": 2.9583702391496898, | |
| "grad_norm": 3.323974522469437, | |
| "learning_rate": 2.1376760169738746e-06, | |
| "loss": 0.7497, | |
| "step": 16700 | |
| }, | |
| { | |
| "epoch": 2.9619131975199293, | |
| "grad_norm": 2.3281130863968382, | |
| "learning_rate": 2.131558669221806e-06, | |
| "loss": 0.7319, | |
| "step": 16720 | |
| }, | |
| { | |
| "epoch": 2.9654561558901684, | |
| "grad_norm": 3.4381149445643517, | |
| "learning_rate": 2.125443575667603e-06, | |
| "loss": 0.7817, | |
| "step": 16740 | |
| }, | |
| { | |
| "epoch": 2.9689991142604075, | |
| "grad_norm": 3.9685288684047815, | |
| "learning_rate": 2.1193307737246336e-06, | |
| "loss": 0.7764, | |
| "step": 16760 | |
| }, | |
| { | |
| "epoch": 2.9725420726306466, | |
| "grad_norm": 3.396199970398508, | |
| "learning_rate": 2.113220300792243e-06, | |
| "loss": 0.7661, | |
| "step": 16780 | |
| }, | |
| { | |
| "epoch": 2.9760850310008857, | |
| "grad_norm": 3.6420647109134943, | |
| "learning_rate": 2.10711219425553e-06, | |
| "loss": 0.7035, | |
| "step": 16800 | |
| }, | |
| { | |
| "epoch": 2.979627989371125, | |
| "grad_norm": 4.626273852138959, | |
| "learning_rate": 2.101006491485112e-06, | |
| "loss": 0.753, | |
| "step": 16820 | |
| }, | |
| { | |
| "epoch": 2.983170947741364, | |
| "grad_norm": 2.4460574774339654, | |
| "learning_rate": 2.0949032298369035e-06, | |
| "loss": 0.7692, | |
| "step": 16840 | |
| }, | |
| { | |
| "epoch": 2.986713906111603, | |
| "grad_norm": 3.41069245958657, | |
| "learning_rate": 2.0888024466518804e-06, | |
| "loss": 0.6976, | |
| "step": 16860 | |
| }, | |
| { | |
| "epoch": 2.990256864481842, | |
| "grad_norm": 3.403936749564734, | |
| "learning_rate": 2.082704179255857e-06, | |
| "loss": 0.7946, | |
| "step": 16880 | |
| }, | |
| { | |
| "epoch": 2.9937998228520817, | |
| "grad_norm": 4.194199735821774, | |
| "learning_rate": 2.076608464959255e-06, | |
| "loss": 0.7235, | |
| "step": 16900 | |
| }, | |
| { | |
| "epoch": 2.997342781222321, | |
| "grad_norm": 4.995370798020937, | |
| "learning_rate": 2.0705153410568753e-06, | |
| "loss": 0.7518, | |
| "step": 16920 | |
| }, | |
| { | |
| "epoch": 3.00088573959256, | |
| "grad_norm": 4.498354371200463, | |
| "learning_rate": 2.0644248448276698e-06, | |
| "loss": 0.6865, | |
| "step": 16940 | |
| }, | |
| { | |
| "epoch": 3.004428697962799, | |
| "grad_norm": 5.9848761265551484, | |
| "learning_rate": 2.0583370135345157e-06, | |
| "loss": 0.6598, | |
| "step": 16960 | |
| }, | |
| { | |
| "epoch": 3.007971656333038, | |
| "grad_norm": 3.1703728963133844, | |
| "learning_rate": 2.0522518844239834e-06, | |
| "loss": 0.634, | |
| "step": 16980 | |
| }, | |
| { | |
| "epoch": 3.011514614703277, | |
| "grad_norm": 4.221579231662142, | |
| "learning_rate": 2.0461694947261127e-06, | |
| "loss": 0.6631, | |
| "step": 17000 | |
| }, | |
| { | |
| "epoch": 3.0150575730735163, | |
| "grad_norm": 4.154018707851057, | |
| "learning_rate": 2.0400898816541807e-06, | |
| "loss": 0.6633, | |
| "step": 17020 | |
| }, | |
| { | |
| "epoch": 3.0186005314437554, | |
| "grad_norm": 4.631953700832906, | |
| "learning_rate": 2.034013082404479e-06, | |
| "loss": 0.6674, | |
| "step": 17040 | |
| }, | |
| { | |
| "epoch": 3.0221434898139945, | |
| "grad_norm": 3.085154700215037, | |
| "learning_rate": 2.0279391341560823e-06, | |
| "loss": 0.6241, | |
| "step": 17060 | |
| }, | |
| { | |
| "epoch": 3.025686448184234, | |
| "grad_norm": 4.415029344873564, | |
| "learning_rate": 2.0218680740706227e-06, | |
| "loss": 0.6436, | |
| "step": 17080 | |
| }, | |
| { | |
| "epoch": 3.029229406554473, | |
| "grad_norm": 4.050012850016261, | |
| "learning_rate": 2.0157999392920626e-06, | |
| "loss": 0.6809, | |
| "step": 17100 | |
| }, | |
| { | |
| "epoch": 3.0327723649247122, | |
| "grad_norm": 3.474053983443441, | |
| "learning_rate": 2.009734766946465e-06, | |
| "loss": 0.6748, | |
| "step": 17120 | |
| }, | |
| { | |
| "epoch": 3.0363153232949514, | |
| "grad_norm": 5.502084090817595, | |
| "learning_rate": 2.0036725941417695e-06, | |
| "loss": 0.7077, | |
| "step": 17140 | |
| }, | |
| { | |
| "epoch": 3.0398582816651905, | |
| "grad_norm": 4.7328265250493375, | |
| "learning_rate": 1.997613457967565e-06, | |
| "loss": 0.6685, | |
| "step": 17160 | |
| }, | |
| { | |
| "epoch": 3.0434012400354296, | |
| "grad_norm": 4.990841825704372, | |
| "learning_rate": 1.991557395494858e-06, | |
| "loss": 0.6576, | |
| "step": 17180 | |
| }, | |
| { | |
| "epoch": 3.0469441984056687, | |
| "grad_norm": 4.446445857081803, | |
| "learning_rate": 1.9855044437758542e-06, | |
| "loss": 0.6291, | |
| "step": 17200 | |
| }, | |
| { | |
| "epoch": 3.0504871567759078, | |
| "grad_norm": 4.010559033356023, | |
| "learning_rate": 1.9794546398437233e-06, | |
| "loss": 0.6821, | |
| "step": 17220 | |
| }, | |
| { | |
| "epoch": 3.054030115146147, | |
| "grad_norm": 5.668917044427614, | |
| "learning_rate": 1.973408020712378e-06, | |
| "loss": 0.6501, | |
| "step": 17240 | |
| }, | |
| { | |
| "epoch": 3.057573073516386, | |
| "grad_norm": 4.507165538436801, | |
| "learning_rate": 1.967364623376245e-06, | |
| "loss": 0.6634, | |
| "step": 17260 | |
| }, | |
| { | |
| "epoch": 3.0611160318866255, | |
| "grad_norm": 6.594268797496839, | |
| "learning_rate": 1.9613244848100393e-06, | |
| "loss": 0.6777, | |
| "step": 17280 | |
| }, | |
| { | |
| "epoch": 3.0646589902568646, | |
| "grad_norm": 5.86179333081565, | |
| "learning_rate": 1.9552876419685404e-06, | |
| "loss": 0.6966, | |
| "step": 17300 | |
| }, | |
| { | |
| "epoch": 3.0682019486271037, | |
| "grad_norm": 6.897830395606888, | |
| "learning_rate": 1.94925413178636e-06, | |
| "loss": 0.6358, | |
| "step": 17320 | |
| }, | |
| { | |
| "epoch": 3.071744906997343, | |
| "grad_norm": 3.105114998536212, | |
| "learning_rate": 1.9432239911777234e-06, | |
| "loss": 0.6144, | |
| "step": 17340 | |
| }, | |
| { | |
| "epoch": 3.075287865367582, | |
| "grad_norm": 4.226605746771805, | |
| "learning_rate": 1.9371972570362386e-06, | |
| "loss": 0.6445, | |
| "step": 17360 | |
| }, | |
| { | |
| "epoch": 3.078830823737821, | |
| "grad_norm": 3.2064699379152946, | |
| "learning_rate": 1.9311739662346714e-06, | |
| "loss": 0.6295, | |
| "step": 17380 | |
| }, | |
| { | |
| "epoch": 3.08237378210806, | |
| "grad_norm": 4.509110054894344, | |
| "learning_rate": 1.925154155624723e-06, | |
| "loss": 0.6584, | |
| "step": 17400 | |
| }, | |
| { | |
| "epoch": 3.0859167404782992, | |
| "grad_norm": 2.7613419445656877, | |
| "learning_rate": 1.9191378620367992e-06, | |
| "loss": 0.6872, | |
| "step": 17420 | |
| }, | |
| { | |
| "epoch": 3.0894596988485383, | |
| "grad_norm": 5.249702772830893, | |
| "learning_rate": 1.91312512227979e-06, | |
| "loss": 0.659, | |
| "step": 17440 | |
| }, | |
| { | |
| "epoch": 3.093002657218778, | |
| "grad_norm": 4.594127042773178, | |
| "learning_rate": 1.907115973140841e-06, | |
| "loss": 0.6445, | |
| "step": 17460 | |
| }, | |
| { | |
| "epoch": 3.096545615589017, | |
| "grad_norm": 3.5475423306782, | |
| "learning_rate": 1.9011104513851306e-06, | |
| "loss": 0.6446, | |
| "step": 17480 | |
| }, | |
| { | |
| "epoch": 3.100088573959256, | |
| "grad_norm": 6.443218004353607, | |
| "learning_rate": 1.8951085937556447e-06, | |
| "loss": 0.6642, | |
| "step": 17500 | |
| }, | |
| { | |
| "epoch": 3.103631532329495, | |
| "grad_norm": 4.817902509140939, | |
| "learning_rate": 1.889110436972949e-06, | |
| "loss": 0.6675, | |
| "step": 17520 | |
| }, | |
| { | |
| "epoch": 3.1071744906997343, | |
| "grad_norm": 3.6588947567775576, | |
| "learning_rate": 1.8831160177349694e-06, | |
| "loss": 0.6011, | |
| "step": 17540 | |
| }, | |
| { | |
| "epoch": 3.1107174490699734, | |
| "grad_norm": 3.28321156533759, | |
| "learning_rate": 1.8771253727167639e-06, | |
| "loss": 0.6553, | |
| "step": 17560 | |
| }, | |
| { | |
| "epoch": 3.1142604074402125, | |
| "grad_norm": 5.587131615275665, | |
| "learning_rate": 1.8711385385702973e-06, | |
| "loss": 0.6896, | |
| "step": 17580 | |
| }, | |
| { | |
| "epoch": 3.1178033658104516, | |
| "grad_norm": 4.969644996204083, | |
| "learning_rate": 1.8651555519242215e-06, | |
| "loss": 0.648, | |
| "step": 17600 | |
| }, | |
| { | |
| "epoch": 3.1213463241806907, | |
| "grad_norm": 4.564430542899775, | |
| "learning_rate": 1.8591764493836468e-06, | |
| "loss": 0.6673, | |
| "step": 17620 | |
| }, | |
| { | |
| "epoch": 3.1248892825509302, | |
| "grad_norm": 4.821775727343219, | |
| "learning_rate": 1.8532012675299198e-06, | |
| "loss": 0.6368, | |
| "step": 17640 | |
| }, | |
| { | |
| "epoch": 3.1284322409211693, | |
| "grad_norm": 3.550472754727026, | |
| "learning_rate": 1.8472300429203998e-06, | |
| "loss": 0.6763, | |
| "step": 17660 | |
| }, | |
| { | |
| "epoch": 3.1319751992914084, | |
| "grad_norm": 5.387384363073119, | |
| "learning_rate": 1.8412628120882359e-06, | |
| "loss": 0.6228, | |
| "step": 17680 | |
| }, | |
| { | |
| "epoch": 3.1355181576616475, | |
| "grad_norm": 2.0363557538364527, | |
| "learning_rate": 1.8352996115421417e-06, | |
| "loss": 0.6165, | |
| "step": 17700 | |
| }, | |
| { | |
| "epoch": 3.1390611160318866, | |
| "grad_norm": 3.302139015386827, | |
| "learning_rate": 1.829340477766172e-06, | |
| "loss": 0.6668, | |
| "step": 17720 | |
| }, | |
| { | |
| "epoch": 3.1426040744021257, | |
| "grad_norm": 4.799357203523113, | |
| "learning_rate": 1.8233854472195014e-06, | |
| "loss": 0.6657, | |
| "step": 17740 | |
| }, | |
| { | |
| "epoch": 3.146147032772365, | |
| "grad_norm": 4.447535055541927, | |
| "learning_rate": 1.8174345563361992e-06, | |
| "loss": 0.6814, | |
| "step": 17760 | |
| }, | |
| { | |
| "epoch": 3.149689991142604, | |
| "grad_norm": 2.19457736885214, | |
| "learning_rate": 1.8114878415250082e-06, | |
| "loss": 0.6682, | |
| "step": 17780 | |
| }, | |
| { | |
| "epoch": 3.153232949512843, | |
| "grad_norm": 3.0879621968638755, | |
| "learning_rate": 1.8055453391691209e-06, | |
| "loss": 0.6, | |
| "step": 17800 | |
| }, | |
| { | |
| "epoch": 3.156775907883082, | |
| "grad_norm": 3.9283834956111705, | |
| "learning_rate": 1.7996070856259568e-06, | |
| "loss": 0.6664, | |
| "step": 17820 | |
| }, | |
| { | |
| "epoch": 3.1603188662533217, | |
| "grad_norm": 5.062334002651309, | |
| "learning_rate": 1.7936731172269414e-06, | |
| "loss": 0.6691, | |
| "step": 17840 | |
| }, | |
| { | |
| "epoch": 3.163861824623561, | |
| "grad_norm": 3.592859470647672, | |
| "learning_rate": 1.7877434702772807e-06, | |
| "loss": 0.6632, | |
| "step": 17860 | |
| }, | |
| { | |
| "epoch": 3.1674047829938, | |
| "grad_norm": 3.5217893259822306, | |
| "learning_rate": 1.7818181810557428e-06, | |
| "loss": 0.6588, | |
| "step": 17880 | |
| }, | |
| { | |
| "epoch": 3.170947741364039, | |
| "grad_norm": 5.792763255978902, | |
| "learning_rate": 1.7758972858144351e-06, | |
| "loss": 0.6843, | |
| "step": 17900 | |
| }, | |
| { | |
| "epoch": 3.174490699734278, | |
| "grad_norm": 5.619010064865972, | |
| "learning_rate": 1.7699808207785796e-06, | |
| "loss": 0.6304, | |
| "step": 17920 | |
| }, | |
| { | |
| "epoch": 3.178033658104517, | |
| "grad_norm": 3.233388216079928, | |
| "learning_rate": 1.7640688221462955e-06, | |
| "loss": 0.6481, | |
| "step": 17940 | |
| }, | |
| { | |
| "epoch": 3.1815766164747563, | |
| "grad_norm": 7.189211258809608, | |
| "learning_rate": 1.7581613260883733e-06, | |
| "loss": 0.6516, | |
| "step": 17960 | |
| }, | |
| { | |
| "epoch": 3.1851195748449954, | |
| "grad_norm": 4.794500314138224, | |
| "learning_rate": 1.7522583687480587e-06, | |
| "loss": 0.6276, | |
| "step": 17980 | |
| }, | |
| { | |
| "epoch": 3.1886625332152345, | |
| "grad_norm": 5.414264886735773, | |
| "learning_rate": 1.7463599862408265e-06, | |
| "loss": 0.6461, | |
| "step": 18000 | |
| }, | |
| { | |
| "epoch": 3.1886625332152345, | |
| "eval_loss": 0.814194917678833, | |
| "eval_runtime": 367.8278, | |
| "eval_samples_per_second": 25.846, | |
| "eval_steps_per_second": 3.232, | |
| "step": 18000 | |
| }, | |
| { | |
| "epoch": 3.192205491585474, | |
| "grad_norm": 2.678076603574593, | |
| "learning_rate": 1.7404662146541622e-06, | |
| "loss": 0.6586, | |
| "step": 18020 | |
| }, | |
| { | |
| "epoch": 3.195748449955713, | |
| "grad_norm": 5.435434057895623, | |
| "learning_rate": 1.7345770900473424e-06, | |
| "loss": 0.6378, | |
| "step": 18040 | |
| }, | |
| { | |
| "epoch": 3.1992914083259523, | |
| "grad_norm": 4.182207392231193, | |
| "learning_rate": 1.7286926484512088e-06, | |
| "loss": 0.6429, | |
| "step": 18060 | |
| }, | |
| { | |
| "epoch": 3.2028343666961914, | |
| "grad_norm": 6.163291864345331, | |
| "learning_rate": 1.722812925867955e-06, | |
| "loss": 0.6215, | |
| "step": 18080 | |
| }, | |
| { | |
| "epoch": 3.2063773250664305, | |
| "grad_norm": 4.221207066086723, | |
| "learning_rate": 1.7169379582709018e-06, | |
| "loss": 0.6734, | |
| "step": 18100 | |
| }, | |
| { | |
| "epoch": 3.2099202834366696, | |
| "grad_norm": 3.6319759767031345, | |
| "learning_rate": 1.711067781604277e-06, | |
| "loss": 0.6688, | |
| "step": 18120 | |
| }, | |
| { | |
| "epoch": 3.2134632418069087, | |
| "grad_norm": 2.4361609804138986, | |
| "learning_rate": 1.7052024317829986e-06, | |
| "loss": 0.6779, | |
| "step": 18140 | |
| }, | |
| { | |
| "epoch": 3.217006200177148, | |
| "grad_norm": 3.2229485582444646, | |
| "learning_rate": 1.69934194469245e-06, | |
| "loss": 0.5963, | |
| "step": 18160 | |
| }, | |
| { | |
| "epoch": 3.220549158547387, | |
| "grad_norm": 4.04542335006295, | |
| "learning_rate": 1.6934863561882664e-06, | |
| "loss": 0.6149, | |
| "step": 18180 | |
| }, | |
| { | |
| "epoch": 3.2240921169176264, | |
| "grad_norm": 4.224812708749921, | |
| "learning_rate": 1.687635702096111e-06, | |
| "loss": 0.6544, | |
| "step": 18200 | |
| }, | |
| { | |
| "epoch": 3.2276350752878655, | |
| "grad_norm": 2.5768170370991133, | |
| "learning_rate": 1.681790018211457e-06, | |
| "loss": 0.6455, | |
| "step": 18220 | |
| }, | |
| { | |
| "epoch": 3.2311780336581046, | |
| "grad_norm": 8.446037248628688, | |
| "learning_rate": 1.6759493402993713e-06, | |
| "loss": 0.6399, | |
| "step": 18240 | |
| }, | |
| { | |
| "epoch": 3.2347209920283437, | |
| "grad_norm": 4.421664554190382, | |
| "learning_rate": 1.6701137040942884e-06, | |
| "loss": 0.6605, | |
| "step": 18260 | |
| }, | |
| { | |
| "epoch": 3.238263950398583, | |
| "grad_norm": 3.867362303030101, | |
| "learning_rate": 1.664283145299801e-06, | |
| "loss": 0.6197, | |
| "step": 18280 | |
| }, | |
| { | |
| "epoch": 3.241806908768822, | |
| "grad_norm": 2.9062273272206975, | |
| "learning_rate": 1.658457699588436e-06, | |
| "loss": 0.6415, | |
| "step": 18300 | |
| }, | |
| { | |
| "epoch": 3.245349867139061, | |
| "grad_norm": 5.355980117276988, | |
| "learning_rate": 1.6526374026014366e-06, | |
| "loss": 0.6154, | |
| "step": 18320 | |
| }, | |
| { | |
| "epoch": 3.2488928255093, | |
| "grad_norm": 3.447853280839097, | |
| "learning_rate": 1.6468222899485464e-06, | |
| "loss": 0.6004, | |
| "step": 18340 | |
| }, | |
| { | |
| "epoch": 3.2524357838795392, | |
| "grad_norm": 3.7093160550377813, | |
| "learning_rate": 1.6410123972077884e-06, | |
| "loss": 0.6604, | |
| "step": 18360 | |
| }, | |
| { | |
| "epoch": 3.255978742249779, | |
| "grad_norm": 5.03373607670554, | |
| "learning_rate": 1.6352077599252508e-06, | |
| "loss": 0.6942, | |
| "step": 18380 | |
| }, | |
| { | |
| "epoch": 3.259521700620018, | |
| "grad_norm": 2.7267954903558462, | |
| "learning_rate": 1.6294084136148677e-06, | |
| "loss": 0.6245, | |
| "step": 18400 | |
| }, | |
| { | |
| "epoch": 3.263064658990257, | |
| "grad_norm": 3.422200189756036, | |
| "learning_rate": 1.6236143937582006e-06, | |
| "loss": 0.6454, | |
| "step": 18420 | |
| }, | |
| { | |
| "epoch": 3.266607617360496, | |
| "grad_norm": 3.004333600805996, | |
| "learning_rate": 1.6178257358042238e-06, | |
| "loss": 0.6308, | |
| "step": 18440 | |
| }, | |
| { | |
| "epoch": 3.270150575730735, | |
| "grad_norm": 3.205706761360872, | |
| "learning_rate": 1.6120424751691078e-06, | |
| "loss": 0.7113, | |
| "step": 18460 | |
| }, | |
| { | |
| "epoch": 3.2736935341009743, | |
| "grad_norm": 4.169462011619208, | |
| "learning_rate": 1.6062646472359967e-06, | |
| "loss": 0.6739, | |
| "step": 18480 | |
| }, | |
| { | |
| "epoch": 3.2772364924712134, | |
| "grad_norm": 5.474090442872105, | |
| "learning_rate": 1.6004922873548014e-06, | |
| "loss": 0.6459, | |
| "step": 18500 | |
| }, | |
| { | |
| "epoch": 3.2807794508414525, | |
| "grad_norm": 4.123370814980775, | |
| "learning_rate": 1.594725430841975e-06, | |
| "loss": 0.6329, | |
| "step": 18520 | |
| }, | |
| { | |
| "epoch": 3.2843224092116916, | |
| "grad_norm": 3.386294588511836, | |
| "learning_rate": 1.5889641129803013e-06, | |
| "loss": 0.6978, | |
| "step": 18540 | |
| }, | |
| { | |
| "epoch": 3.287865367581931, | |
| "grad_norm": 4.479997130104378, | |
| "learning_rate": 1.5832083690186763e-06, | |
| "loss": 0.6942, | |
| "step": 18560 | |
| }, | |
| { | |
| "epoch": 3.2914083259521703, | |
| "grad_norm": 3.403229229215997, | |
| "learning_rate": 1.5774582341718952e-06, | |
| "loss": 0.6561, | |
| "step": 18580 | |
| }, | |
| { | |
| "epoch": 3.2949512843224094, | |
| "grad_norm": 3.3840120900240045, | |
| "learning_rate": 1.571713743620435e-06, | |
| "loss": 0.6464, | |
| "step": 18600 | |
| }, | |
| { | |
| "epoch": 3.2984942426926485, | |
| "grad_norm": 2.166301971846268, | |
| "learning_rate": 1.5659749325102391e-06, | |
| "loss": 0.6633, | |
| "step": 18620 | |
| }, | |
| { | |
| "epoch": 3.3020372010628876, | |
| "grad_norm": 4.571502120958036, | |
| "learning_rate": 1.5602418359525029e-06, | |
| "loss": 0.6449, | |
| "step": 18640 | |
| }, | |
| { | |
| "epoch": 3.3055801594331267, | |
| "grad_norm": 6.850347513041369, | |
| "learning_rate": 1.5545144890234618e-06, | |
| "loss": 0.6375, | |
| "step": 18660 | |
| }, | |
| { | |
| "epoch": 3.3091231178033658, | |
| "grad_norm": 5.268732271957646, | |
| "learning_rate": 1.5487929267641688e-06, | |
| "loss": 0.6387, | |
| "step": 18680 | |
| }, | |
| { | |
| "epoch": 3.312666076173605, | |
| "grad_norm": 6.854691620973651, | |
| "learning_rate": 1.5430771841802894e-06, | |
| "loss": 0.6792, | |
| "step": 18700 | |
| }, | |
| { | |
| "epoch": 3.316209034543844, | |
| "grad_norm": 5.772199960663563, | |
| "learning_rate": 1.537367296241881e-06, | |
| "loss": 0.5957, | |
| "step": 18720 | |
| }, | |
| { | |
| "epoch": 3.319751992914083, | |
| "grad_norm": 4.543759004099246, | |
| "learning_rate": 1.531663297883183e-06, | |
| "loss": 0.6704, | |
| "step": 18740 | |
| }, | |
| { | |
| "epoch": 3.323294951284322, | |
| "grad_norm": 4.253141544006728, | |
| "learning_rate": 1.525965224002398e-06, | |
| "loss": 0.6591, | |
| "step": 18760 | |
| }, | |
| { | |
| "epoch": 3.3268379096545617, | |
| "grad_norm": 4.030607569941474, | |
| "learning_rate": 1.5202731094614848e-06, | |
| "loss": 0.6153, | |
| "step": 18780 | |
| }, | |
| { | |
| "epoch": 3.330380868024801, | |
| "grad_norm": 4.541306419220621, | |
| "learning_rate": 1.5145869890859404e-06, | |
| "loss": 0.6801, | |
| "step": 18800 | |
| }, | |
| { | |
| "epoch": 3.33392382639504, | |
| "grad_norm": 4.860276758268095, | |
| "learning_rate": 1.5089068976645876e-06, | |
| "loss": 0.6129, | |
| "step": 18820 | |
| }, | |
| { | |
| "epoch": 3.337466784765279, | |
| "grad_norm": 3.7898282834137875, | |
| "learning_rate": 1.503232869949364e-06, | |
| "loss": 0.647, | |
| "step": 18840 | |
| }, | |
| { | |
| "epoch": 3.341009743135518, | |
| "grad_norm": 4.475193399989839, | |
| "learning_rate": 1.4975649406551081e-06, | |
| "loss": 0.6015, | |
| "step": 18860 | |
| }, | |
| { | |
| "epoch": 3.3445527015057572, | |
| "grad_norm": 3.824693434860729, | |
| "learning_rate": 1.4919031444593458e-06, | |
| "loss": 0.6672, | |
| "step": 18880 | |
| }, | |
| { | |
| "epoch": 3.3480956598759963, | |
| "grad_norm": 3.5036219590743447, | |
| "learning_rate": 1.4862475160020806e-06, | |
| "loss": 0.6771, | |
| "step": 18900 | |
| }, | |
| { | |
| "epoch": 3.3516386182462354, | |
| "grad_norm": 4.1272200518774325, | |
| "learning_rate": 1.48059808988558e-06, | |
| "loss": 0.6757, | |
| "step": 18920 | |
| }, | |
| { | |
| "epoch": 3.3551815766164745, | |
| "grad_norm": 3.5389364787677957, | |
| "learning_rate": 1.4749549006741655e-06, | |
| "loss": 0.7042, | |
| "step": 18940 | |
| }, | |
| { | |
| "epoch": 3.358724534986714, | |
| "grad_norm": 2.5975704276819, | |
| "learning_rate": 1.4693179828939985e-06, | |
| "loss": 0.6987, | |
| "step": 18960 | |
| }, | |
| { | |
| "epoch": 3.362267493356953, | |
| "grad_norm": 3.8727633599446794, | |
| "learning_rate": 1.463687371032871e-06, | |
| "loss": 0.6685, | |
| "step": 18980 | |
| }, | |
| { | |
| "epoch": 3.3658104517271923, | |
| "grad_norm": 3.803175388434909, | |
| "learning_rate": 1.4580630995399949e-06, | |
| "loss": 0.6214, | |
| "step": 19000 | |
| }, | |
| { | |
| "epoch": 3.3693534100974314, | |
| "grad_norm": 3.1226214687691445, | |
| "learning_rate": 1.4524452028257884e-06, | |
| "loss": 0.6516, | |
| "step": 19020 | |
| }, | |
| { | |
| "epoch": 3.3728963684676705, | |
| "grad_norm": 3.9984001662113986, | |
| "learning_rate": 1.4468337152616712e-06, | |
| "loss": 0.6686, | |
| "step": 19040 | |
| }, | |
| { | |
| "epoch": 3.3764393268379096, | |
| "grad_norm": 5.2905284840587985, | |
| "learning_rate": 1.4412286711798473e-06, | |
| "loss": 0.643, | |
| "step": 19060 | |
| }, | |
| { | |
| "epoch": 3.3799822852081487, | |
| "grad_norm": 3.996754961897811, | |
| "learning_rate": 1.4356301048730987e-06, | |
| "loss": 0.6707, | |
| "step": 19080 | |
| }, | |
| { | |
| "epoch": 3.383525243578388, | |
| "grad_norm": 2.7287259969894757, | |
| "learning_rate": 1.4300380505945754e-06, | |
| "loss": 0.647, | |
| "step": 19100 | |
| }, | |
| { | |
| "epoch": 3.387068201948627, | |
| "grad_norm": 4.627474263327022, | |
| "learning_rate": 1.4244525425575862e-06, | |
| "loss": 0.6579, | |
| "step": 19120 | |
| }, | |
| { | |
| "epoch": 3.3906111603188664, | |
| "grad_norm": 6.3578848860264054, | |
| "learning_rate": 1.418873614935387e-06, | |
| "loss": 0.6214, | |
| "step": 19140 | |
| }, | |
| { | |
| "epoch": 3.3941541186891055, | |
| "grad_norm": 2.5409208272433292, | |
| "learning_rate": 1.4133013018609762e-06, | |
| "loss": 0.6916, | |
| "step": 19160 | |
| }, | |
| { | |
| "epoch": 3.3976970770593447, | |
| "grad_norm": 5.73185853916136, | |
| "learning_rate": 1.4077356374268808e-06, | |
| "loss": 0.639, | |
| "step": 19180 | |
| }, | |
| { | |
| "epoch": 3.4012400354295838, | |
| "grad_norm": 5.547561124073059, | |
| "learning_rate": 1.4021766556849492e-06, | |
| "loss": 0.6472, | |
| "step": 19200 | |
| }, | |
| { | |
| "epoch": 3.404782993799823, | |
| "grad_norm": 6.154343627624317, | |
| "learning_rate": 1.3966243906461477e-06, | |
| "loss": 0.632, | |
| "step": 19220 | |
| }, | |
| { | |
| "epoch": 3.408325952170062, | |
| "grad_norm": 3.352883710099438, | |
| "learning_rate": 1.3910788762803448e-06, | |
| "loss": 0.6399, | |
| "step": 19240 | |
| }, | |
| { | |
| "epoch": 3.411868910540301, | |
| "grad_norm": 5.151354471677131, | |
| "learning_rate": 1.3855401465161072e-06, | |
| "loss": 0.6439, | |
| "step": 19260 | |
| }, | |
| { | |
| "epoch": 3.41541186891054, | |
| "grad_norm": 3.533259346429167, | |
| "learning_rate": 1.3800082352404964e-06, | |
| "loss": 0.7011, | |
| "step": 19280 | |
| }, | |
| { | |
| "epoch": 3.4189548272807793, | |
| "grad_norm": 2.422586203772127, | |
| "learning_rate": 1.3744831762988492e-06, | |
| "loss": 0.6802, | |
| "step": 19300 | |
| }, | |
| { | |
| "epoch": 3.422497785651019, | |
| "grad_norm": 2.0987507209228773, | |
| "learning_rate": 1.368965003494586e-06, | |
| "loss": 0.653, | |
| "step": 19320 | |
| }, | |
| { | |
| "epoch": 3.426040744021258, | |
| "grad_norm": 5.860157027237845, | |
| "learning_rate": 1.3634537505889927e-06, | |
| "loss": 0.6517, | |
| "step": 19340 | |
| }, | |
| { | |
| "epoch": 3.429583702391497, | |
| "grad_norm": 4.150793117319128, | |
| "learning_rate": 1.3579494513010178e-06, | |
| "loss": 0.6702, | |
| "step": 19360 | |
| }, | |
| { | |
| "epoch": 3.433126660761736, | |
| "grad_norm": 2.376892166127197, | |
| "learning_rate": 1.352452139307068e-06, | |
| "loss": 0.6578, | |
| "step": 19380 | |
| }, | |
| { | |
| "epoch": 3.436669619131975, | |
| "grad_norm": 3.5677324284202974, | |
| "learning_rate": 1.3469618482407993e-06, | |
| "loss": 0.6466, | |
| "step": 19400 | |
| }, | |
| { | |
| "epoch": 3.4402125775022143, | |
| "grad_norm": 4.23134121034947, | |
| "learning_rate": 1.3414786116929102e-06, | |
| "loss": 0.6529, | |
| "step": 19420 | |
| }, | |
| { | |
| "epoch": 3.4437555358724534, | |
| "grad_norm": 5.745078327509347, | |
| "learning_rate": 1.3360024632109431e-06, | |
| "loss": 0.6484, | |
| "step": 19440 | |
| }, | |
| { | |
| "epoch": 3.4472984942426925, | |
| "grad_norm": 3.6442103656861864, | |
| "learning_rate": 1.3305334362990697e-06, | |
| "loss": 0.6669, | |
| "step": 19460 | |
| }, | |
| { | |
| "epoch": 3.4508414526129316, | |
| "grad_norm": 36.732566773014995, | |
| "learning_rate": 1.3250715644178926e-06, | |
| "loss": 0.6526, | |
| "step": 19480 | |
| }, | |
| { | |
| "epoch": 3.454384410983171, | |
| "grad_norm": 6.375922451901222, | |
| "learning_rate": 1.3196168809842384e-06, | |
| "loss": 0.6773, | |
| "step": 19500 | |
| }, | |
| { | |
| "epoch": 3.4579273693534103, | |
| "grad_norm": 4.596315770556884, | |
| "learning_rate": 1.314169419370952e-06, | |
| "loss": 0.6634, | |
| "step": 19520 | |
| }, | |
| { | |
| "epoch": 3.4614703277236494, | |
| "grad_norm": 5.605691895671033, | |
| "learning_rate": 1.3087292129066947e-06, | |
| "loss": 0.6925, | |
| "step": 19540 | |
| }, | |
| { | |
| "epoch": 3.4650132860938885, | |
| "grad_norm": 4.310812747641876, | |
| "learning_rate": 1.3032962948757406e-06, | |
| "loss": 0.6323, | |
| "step": 19560 | |
| }, | |
| { | |
| "epoch": 3.4685562444641276, | |
| "grad_norm": 2.539953889588533, | |
| "learning_rate": 1.2978706985177702e-06, | |
| "loss": 0.6603, | |
| "step": 19580 | |
| }, | |
| { | |
| "epoch": 3.4720992028343667, | |
| "grad_norm": 4.700922466636149, | |
| "learning_rate": 1.2924524570276676e-06, | |
| "loss": 0.6387, | |
| "step": 19600 | |
| }, | |
| { | |
| "epoch": 3.475642161204606, | |
| "grad_norm": 3.386892338712282, | |
| "learning_rate": 1.2870416035553213e-06, | |
| "loss": 0.665, | |
| "step": 19620 | |
| }, | |
| { | |
| "epoch": 3.479185119574845, | |
| "grad_norm": 4.047488784454614, | |
| "learning_rate": 1.2816381712054157e-06, | |
| "loss": 0.6442, | |
| "step": 19640 | |
| }, | |
| { | |
| "epoch": 3.482728077945084, | |
| "grad_norm": 4.456956526186442, | |
| "learning_rate": 1.2762421930372318e-06, | |
| "loss": 0.637, | |
| "step": 19660 | |
| }, | |
| { | |
| "epoch": 3.4862710363153235, | |
| "grad_norm": 4.141098829330102, | |
| "learning_rate": 1.2708537020644465e-06, | |
| "loss": 0.6384, | |
| "step": 19680 | |
| }, | |
| { | |
| "epoch": 3.4898139946855626, | |
| "grad_norm": 3.179545115166026, | |
| "learning_rate": 1.265472731254926e-06, | |
| "loss": 0.6259, | |
| "step": 19700 | |
| }, | |
| { | |
| "epoch": 3.4933569530558017, | |
| "grad_norm": 3.085506510833184, | |
| "learning_rate": 1.2600993135305278e-06, | |
| "loss": 0.6297, | |
| "step": 19720 | |
| }, | |
| { | |
| "epoch": 3.496899911426041, | |
| "grad_norm": 4.667399574209881, | |
| "learning_rate": 1.254733481766898e-06, | |
| "loss": 0.6576, | |
| "step": 19740 | |
| }, | |
| { | |
| "epoch": 3.50044286979628, | |
| "grad_norm": 4.264096999147888, | |
| "learning_rate": 1.2493752687932687e-06, | |
| "loss": 0.6778, | |
| "step": 19760 | |
| }, | |
| { | |
| "epoch": 3.503985828166519, | |
| "grad_norm": 2.0289989732438936, | |
| "learning_rate": 1.2440247073922627e-06, | |
| "loss": 0.6264, | |
| "step": 19780 | |
| }, | |
| { | |
| "epoch": 3.507528786536758, | |
| "grad_norm": 2.316066403465445, | |
| "learning_rate": 1.2386818302996847e-06, | |
| "loss": 0.6594, | |
| "step": 19800 | |
| }, | |
| { | |
| "epoch": 3.5110717449069972, | |
| "grad_norm": 4.626840288617283, | |
| "learning_rate": 1.233346670204327e-06, | |
| "loss": 0.691, | |
| "step": 19820 | |
| }, | |
| { | |
| "epoch": 3.5146147032772364, | |
| "grad_norm": 3.3340384188287193, | |
| "learning_rate": 1.228019259747769e-06, | |
| "loss": 0.6249, | |
| "step": 19840 | |
| }, | |
| { | |
| "epoch": 3.518157661647476, | |
| "grad_norm": 3.501104395294738, | |
| "learning_rate": 1.2226996315241743e-06, | |
| "loss": 0.6646, | |
| "step": 19860 | |
| }, | |
| { | |
| "epoch": 3.5217006200177146, | |
| "grad_norm": 1.7915799562011805, | |
| "learning_rate": 1.217387818080093e-06, | |
| "loss": 0.6616, | |
| "step": 19880 | |
| }, | |
| { | |
| "epoch": 3.525243578387954, | |
| "grad_norm": 4.085420259424753, | |
| "learning_rate": 1.2120838519142664e-06, | |
| "loss": 0.6475, | |
| "step": 19900 | |
| }, | |
| { | |
| "epoch": 3.528786536758193, | |
| "grad_norm": 2.889218336443782, | |
| "learning_rate": 1.2067877654774195e-06, | |
| "loss": 0.6577, | |
| "step": 19920 | |
| }, | |
| { | |
| "epoch": 3.5323294951284323, | |
| "grad_norm": 3.4338184164603773, | |
| "learning_rate": 1.20149959117207e-06, | |
| "loss": 0.6706, | |
| "step": 19940 | |
| }, | |
| { | |
| "epoch": 3.5358724534986714, | |
| "grad_norm": 5.515577025317211, | |
| "learning_rate": 1.196219361352329e-06, | |
| "loss": 0.646, | |
| "step": 19960 | |
| }, | |
| { | |
| "epoch": 3.5394154118689105, | |
| "grad_norm": 5.178906055641794, | |
| "learning_rate": 1.1909471083236999e-06, | |
| "loss": 0.6457, | |
| "step": 19980 | |
| }, | |
| { | |
| "epoch": 3.5429583702391496, | |
| "grad_norm": 2.503351819448841, | |
| "learning_rate": 1.1856828643428813e-06, | |
| "loss": 0.644, | |
| "step": 20000 | |
| }, | |
| { | |
| "epoch": 3.5429583702391496, | |
| "eval_loss": 0.8062734603881836, | |
| "eval_runtime": 374.2695, | |
| "eval_samples_per_second": 25.401, | |
| "eval_steps_per_second": 3.177, | |
| "step": 20000 | |
| }, | |
| { | |
| "epoch": 3.5465013286093887, | |
| "grad_norm": 4.448169389650755, | |
| "learning_rate": 1.1804266616175747e-06, | |
| "loss": 0.6384, | |
| "step": 20020 | |
| }, | |
| { | |
| "epoch": 3.5500442869796283, | |
| "grad_norm": 7.501920429960917, | |
| "learning_rate": 1.17517853230628e-06, | |
| "loss": 0.6745, | |
| "step": 20040 | |
| }, | |
| { | |
| "epoch": 3.553587245349867, | |
| "grad_norm": 4.264187361101836, | |
| "learning_rate": 1.169938508518103e-06, | |
| "loss": 0.6495, | |
| "step": 20060 | |
| }, | |
| { | |
| "epoch": 3.5571302037201065, | |
| "grad_norm": 4.121288134877933, | |
| "learning_rate": 1.1647066223125606e-06, | |
| "loss": 0.6297, | |
| "step": 20080 | |
| }, | |
| { | |
| "epoch": 3.5606731620903456, | |
| "grad_norm": 4.279937646373024, | |
| "learning_rate": 1.1594829056993794e-06, | |
| "loss": 0.6421, | |
| "step": 20100 | |
| }, | |
| { | |
| "epoch": 3.5642161204605847, | |
| "grad_norm": 2.387088275176141, | |
| "learning_rate": 1.1542673906383045e-06, | |
| "loss": 0.6768, | |
| "step": 20120 | |
| }, | |
| { | |
| "epoch": 3.5677590788308238, | |
| "grad_norm": 4.372760494789765, | |
| "learning_rate": 1.1490601090389014e-06, | |
| "loss": 0.6512, | |
| "step": 20140 | |
| }, | |
| { | |
| "epoch": 3.571302037201063, | |
| "grad_norm": 3.3326241965778425, | |
| "learning_rate": 1.1438610927603614e-06, | |
| "loss": 0.6615, | |
| "step": 20160 | |
| }, | |
| { | |
| "epoch": 3.574844995571302, | |
| "grad_norm": 3.04200662462561, | |
| "learning_rate": 1.1386703736113092e-06, | |
| "loss": 0.6343, | |
| "step": 20180 | |
| }, | |
| { | |
| "epoch": 3.578387953941541, | |
| "grad_norm": 3.4142428638907254, | |
| "learning_rate": 1.1334879833496033e-06, | |
| "loss": 0.6929, | |
| "step": 20200 | |
| }, | |
| { | |
| "epoch": 3.58193091231178, | |
| "grad_norm": 2.3882204523624484, | |
| "learning_rate": 1.1283139536821446e-06, | |
| "loss": 0.6017, | |
| "step": 20220 | |
| }, | |
| { | |
| "epoch": 3.5854738706820193, | |
| "grad_norm": 2.733962119410601, | |
| "learning_rate": 1.1231483162646851e-06, | |
| "loss": 0.6503, | |
| "step": 20240 | |
| }, | |
| { | |
| "epoch": 3.589016829052259, | |
| "grad_norm": 3.7061102390832383, | |
| "learning_rate": 1.1179911027016277e-06, | |
| "loss": 0.6049, | |
| "step": 20260 | |
| }, | |
| { | |
| "epoch": 3.592559787422498, | |
| "grad_norm": 4.725193440295439, | |
| "learning_rate": 1.1128423445458378e-06, | |
| "loss": 0.6488, | |
| "step": 20280 | |
| }, | |
| { | |
| "epoch": 3.596102745792737, | |
| "grad_norm": 6.502036568180023, | |
| "learning_rate": 1.1077020732984508e-06, | |
| "loss": 0.635, | |
| "step": 20300 | |
| }, | |
| { | |
| "epoch": 3.599645704162976, | |
| "grad_norm": 3.9545813460025676, | |
| "learning_rate": 1.1025703204086758e-06, | |
| "loss": 0.679, | |
| "step": 20320 | |
| }, | |
| { | |
| "epoch": 3.6031886625332152, | |
| "grad_norm": 3.6704467908903013, | |
| "learning_rate": 1.097447117273602e-06, | |
| "loss": 0.6222, | |
| "step": 20340 | |
| }, | |
| { | |
| "epoch": 3.6067316209034543, | |
| "grad_norm": 3.6721020929579655, | |
| "learning_rate": 1.0923324952380158e-06, | |
| "loss": 0.6313, | |
| "step": 20360 | |
| }, | |
| { | |
| "epoch": 3.6102745792736934, | |
| "grad_norm": 2.6332813999535247, | |
| "learning_rate": 1.0872264855941974e-06, | |
| "loss": 0.6067, | |
| "step": 20380 | |
| }, | |
| { | |
| "epoch": 3.6138175376439325, | |
| "grad_norm": 6.199873637972493, | |
| "learning_rate": 1.0821291195817368e-06, | |
| "loss": 0.6525, | |
| "step": 20400 | |
| }, | |
| { | |
| "epoch": 3.6173604960141716, | |
| "grad_norm": 5.810381931293109, | |
| "learning_rate": 1.077040428387341e-06, | |
| "loss": 0.6836, | |
| "step": 20420 | |
| }, | |
| { | |
| "epoch": 3.620903454384411, | |
| "grad_norm": 3.782224795545236, | |
| "learning_rate": 1.0719604431446424e-06, | |
| "loss": 0.6494, | |
| "step": 20440 | |
| }, | |
| { | |
| "epoch": 3.6244464127546503, | |
| "grad_norm": 3.813897952858666, | |
| "learning_rate": 1.0668891949340066e-06, | |
| "loss": 0.6666, | |
| "step": 20460 | |
| }, | |
| { | |
| "epoch": 3.6279893711248894, | |
| "grad_norm": 3.689851683344868, | |
| "learning_rate": 1.061826714782348e-06, | |
| "loss": 0.665, | |
| "step": 20480 | |
| }, | |
| { | |
| "epoch": 3.6315323294951285, | |
| "grad_norm": 2.9395205953490975, | |
| "learning_rate": 1.0567730336629332e-06, | |
| "loss": 0.6364, | |
| "step": 20500 | |
| }, | |
| { | |
| "epoch": 3.6350752878653676, | |
| "grad_norm": 3.6265579695009444, | |
| "learning_rate": 1.0517281824951958e-06, | |
| "loss": 0.6308, | |
| "step": 20520 | |
| }, | |
| { | |
| "epoch": 3.6386182462356067, | |
| "grad_norm": 3.827147896558186, | |
| "learning_rate": 1.0466921921445455e-06, | |
| "loss": 0.6372, | |
| "step": 20540 | |
| }, | |
| { | |
| "epoch": 3.642161204605846, | |
| "grad_norm": 4.1574203155116445, | |
| "learning_rate": 1.0416650934221797e-06, | |
| "loss": 0.6439, | |
| "step": 20560 | |
| }, | |
| { | |
| "epoch": 3.645704162976085, | |
| "grad_norm": 4.4390839250431515, | |
| "learning_rate": 1.0366469170848966e-06, | |
| "loss": 0.6009, | |
| "step": 20580 | |
| }, | |
| { | |
| "epoch": 3.649247121346324, | |
| "grad_norm": 5.0129876566121165, | |
| "learning_rate": 1.0316376938349037e-06, | |
| "loss": 0.692, | |
| "step": 20600 | |
| }, | |
| { | |
| "epoch": 3.6527900797165636, | |
| "grad_norm": 3.704661473447835, | |
| "learning_rate": 1.0266374543196312e-06, | |
| "loss": 0.6231, | |
| "step": 20620 | |
| }, | |
| { | |
| "epoch": 3.656333038086802, | |
| "grad_norm": 3.8011769756171954, | |
| "learning_rate": 1.021646229131548e-06, | |
| "loss": 0.6669, | |
| "step": 20640 | |
| }, | |
| { | |
| "epoch": 3.6598759964570418, | |
| "grad_norm": 4.1871605406787875, | |
| "learning_rate": 1.0166640488079682e-06, | |
| "loss": 0.6749, | |
| "step": 20660 | |
| }, | |
| { | |
| "epoch": 3.663418954827281, | |
| "grad_norm": 4.393069433206878, | |
| "learning_rate": 1.0116909438308689e-06, | |
| "loss": 0.6444, | |
| "step": 20680 | |
| }, | |
| { | |
| "epoch": 3.66696191319752, | |
| "grad_norm": 3.8353595768451725, | |
| "learning_rate": 1.006726944626704e-06, | |
| "loss": 0.6717, | |
| "step": 20700 | |
| }, | |
| { | |
| "epoch": 3.670504871567759, | |
| "grad_norm": 5.904031353317382, | |
| "learning_rate": 1.0017720815662137e-06, | |
| "loss": 0.634, | |
| "step": 20720 | |
| }, | |
| { | |
| "epoch": 3.674047829937998, | |
| "grad_norm": 4.827433370442803, | |
| "learning_rate": 9.968263849642434e-07, | |
| "loss": 0.6189, | |
| "step": 20740 | |
| }, | |
| { | |
| "epoch": 3.6775907883082373, | |
| "grad_norm": 5.58653647794514, | |
| "learning_rate": 9.91889885079555e-07, | |
| "loss": 0.6557, | |
| "step": 20760 | |
| }, | |
| { | |
| "epoch": 3.6811337466784764, | |
| "grad_norm": 6.026946025657904, | |
| "learning_rate": 9.869626121146442e-07, | |
| "loss": 0.6468, | |
| "step": 20780 | |
| }, | |
| { | |
| "epoch": 3.684676705048716, | |
| "grad_norm": 2.8168530647204384, | |
| "learning_rate": 9.820445962155526e-07, | |
| "loss": 0.6782, | |
| "step": 20800 | |
| }, | |
| { | |
| "epoch": 3.6882196634189546, | |
| "grad_norm": 5.029755822261837, | |
| "learning_rate": 9.771358674716886e-07, | |
| "loss": 0.6575, | |
| "step": 20820 | |
| }, | |
| { | |
| "epoch": 3.691762621789194, | |
| "grad_norm": 3.0952751296440826, | |
| "learning_rate": 9.722364559156373e-07, | |
| "loss": 0.6524, | |
| "step": 20840 | |
| }, | |
| { | |
| "epoch": 3.6953055801594332, | |
| "grad_norm": 3.5997851014244033, | |
| "learning_rate": 9.673463915229786e-07, | |
| "loss": 0.672, | |
| "step": 20860 | |
| }, | |
| { | |
| "epoch": 3.6988485385296723, | |
| "grad_norm": 2.1873288789656913, | |
| "learning_rate": 9.62465704212108e-07, | |
| "loss": 0.7021, | |
| "step": 20880 | |
| }, | |
| { | |
| "epoch": 3.7023914968999114, | |
| "grad_norm": 3.8549619707969636, | |
| "learning_rate": 9.575944238440473e-07, | |
| "loss": 0.6788, | |
| "step": 20900 | |
| }, | |
| { | |
| "epoch": 3.7059344552701505, | |
| "grad_norm": 2.517253170655585, | |
| "learning_rate": 9.527325802222651e-07, | |
| "loss": 0.6652, | |
| "step": 20920 | |
| }, | |
| { | |
| "epoch": 3.7094774136403896, | |
| "grad_norm": 4.344342864101945, | |
| "learning_rate": 9.478802030924964e-07, | |
| "loss": 0.66, | |
| "step": 20940 | |
| }, | |
| { | |
| "epoch": 3.7130203720106287, | |
| "grad_norm": 5.501462412298871, | |
| "learning_rate": 9.430373221425534e-07, | |
| "loss": 0.6083, | |
| "step": 20960 | |
| }, | |
| { | |
| "epoch": 3.7165633303808683, | |
| "grad_norm": 5.05343578594501, | |
| "learning_rate": 9.382039670021548e-07, | |
| "loss": 0.6454, | |
| "step": 20980 | |
| }, | |
| { | |
| "epoch": 3.720106288751107, | |
| "grad_norm": 3.501826208404435, | |
| "learning_rate": 9.333801672427339e-07, | |
| "loss": 0.6739, | |
| "step": 21000 | |
| }, | |
| { | |
| "epoch": 3.7236492471213465, | |
| "grad_norm": 2.7034068335281836, | |
| "learning_rate": 9.285659523772636e-07, | |
| "loss": 0.6527, | |
| "step": 21020 | |
| }, | |
| { | |
| "epoch": 3.7271922054915856, | |
| "grad_norm": 5.720702183074802, | |
| "learning_rate": 9.237613518600763e-07, | |
| "loss": 0.6369, | |
| "step": 21040 | |
| }, | |
| { | |
| "epoch": 3.7307351638618247, | |
| "grad_norm": 3.1813749076707207, | |
| "learning_rate": 9.189663950866795e-07, | |
| "loss": 0.6318, | |
| "step": 21060 | |
| }, | |
| { | |
| "epoch": 3.734278122232064, | |
| "grad_norm": 6.860403932855227, | |
| "learning_rate": 9.141811113935786e-07, | |
| "loss": 0.6501, | |
| "step": 21080 | |
| }, | |
| { | |
| "epoch": 3.737821080602303, | |
| "grad_norm": 5.995799501979168, | |
| "learning_rate": 9.094055300580992e-07, | |
| "loss": 0.686, | |
| "step": 21100 | |
| }, | |
| { | |
| "epoch": 3.741364038972542, | |
| "grad_norm": 3.980336679764352, | |
| "learning_rate": 9.046396802982041e-07, | |
| "loss": 0.6047, | |
| "step": 21120 | |
| }, | |
| { | |
| "epoch": 3.744906997342781, | |
| "grad_norm": 6.358653317261383, | |
| "learning_rate": 8.998835912723162e-07, | |
| "loss": 0.6597, | |
| "step": 21140 | |
| }, | |
| { | |
| "epoch": 3.7484499557130206, | |
| "grad_norm": 5.811283042903301, | |
| "learning_rate": 8.951372920791412e-07, | |
| "loss": 0.6643, | |
| "step": 21160 | |
| }, | |
| { | |
| "epoch": 3.7519929140832593, | |
| "grad_norm": 3.162264075637326, | |
| "learning_rate": 8.904008117574886e-07, | |
| "loss": 0.6319, | |
| "step": 21180 | |
| }, | |
| { | |
| "epoch": 3.755535872453499, | |
| "grad_norm": 4.2075636016893805, | |
| "learning_rate": 8.856741792860923e-07, | |
| "loss": 0.6902, | |
| "step": 21200 | |
| }, | |
| { | |
| "epoch": 3.759078830823738, | |
| "grad_norm": 4.321253377690551, | |
| "learning_rate": 8.80957423583439e-07, | |
| "loss": 0.7127, | |
| "step": 21220 | |
| }, | |
| { | |
| "epoch": 3.762621789193977, | |
| "grad_norm": 4.271237705673551, | |
| "learning_rate": 8.762505735075833e-07, | |
| "loss": 0.6617, | |
| "step": 21240 | |
| }, | |
| { | |
| "epoch": 3.766164747564216, | |
| "grad_norm": 3.0868138114437866, | |
| "learning_rate": 8.715536578559763e-07, | |
| "loss": 0.6178, | |
| "step": 21260 | |
| }, | |
| { | |
| "epoch": 3.7697077059344553, | |
| "grad_norm": 5.628060531285577, | |
| "learning_rate": 8.668667053652907e-07, | |
| "loss": 0.6439, | |
| "step": 21280 | |
| }, | |
| { | |
| "epoch": 3.7732506643046944, | |
| "grad_norm": 2.4998507467015245, | |
| "learning_rate": 8.621897447112395e-07, | |
| "loss": 0.6257, | |
| "step": 21300 | |
| }, | |
| { | |
| "epoch": 3.7767936226749335, | |
| "grad_norm": 3.5267634187350874, | |
| "learning_rate": 8.575228045084044e-07, | |
| "loss": 0.6537, | |
| "step": 21320 | |
| }, | |
| { | |
| "epoch": 3.7803365810451726, | |
| "grad_norm": 5.53390702998575, | |
| "learning_rate": 8.528659133100616e-07, | |
| "loss": 0.6343, | |
| "step": 21340 | |
| }, | |
| { | |
| "epoch": 3.7838795394154117, | |
| "grad_norm": 5.193506072777215, | |
| "learning_rate": 8.482190996080042e-07, | |
| "loss": 0.6457, | |
| "step": 21360 | |
| }, | |
| { | |
| "epoch": 3.787422497785651, | |
| "grad_norm": 3.306608753288234, | |
| "learning_rate": 8.435823918323682e-07, | |
| "loss": 0.674, | |
| "step": 21380 | |
| }, | |
| { | |
| "epoch": 3.7909654561558903, | |
| "grad_norm": 5.818341238525181, | |
| "learning_rate": 8.389558183514615e-07, | |
| "loss": 0.6551, | |
| "step": 21400 | |
| }, | |
| { | |
| "epoch": 3.7945084145261294, | |
| "grad_norm": 1.869734323352245, | |
| "learning_rate": 8.34339407471586e-07, | |
| "loss": 0.6328, | |
| "step": 21420 | |
| }, | |
| { | |
| "epoch": 3.7980513728963685, | |
| "grad_norm": 3.061312982527864, | |
| "learning_rate": 8.297331874368702e-07, | |
| "loss": 0.6127, | |
| "step": 21440 | |
| }, | |
| { | |
| "epoch": 3.8015943312666076, | |
| "grad_norm": 2.810183936591826, | |
| "learning_rate": 8.2513718642909e-07, | |
| "loss": 0.6226, | |
| "step": 21460 | |
| }, | |
| { | |
| "epoch": 3.8051372896368467, | |
| "grad_norm": 3.6433306189517127, | |
| "learning_rate": 8.205514325674993e-07, | |
| "loss": 0.6773, | |
| "step": 21480 | |
| }, | |
| { | |
| "epoch": 3.808680248007086, | |
| "grad_norm": 6.17382443818236, | |
| "learning_rate": 8.159759539086603e-07, | |
| "loss": 0.6604, | |
| "step": 21500 | |
| }, | |
| { | |
| "epoch": 3.812223206377325, | |
| "grad_norm": 3.3939319572629563, | |
| "learning_rate": 8.114107784462677e-07, | |
| "loss": 0.6187, | |
| "step": 21520 | |
| }, | |
| { | |
| "epoch": 3.815766164747564, | |
| "grad_norm": 5.234765372841271, | |
| "learning_rate": 8.068559341109791e-07, | |
| "loss": 0.6466, | |
| "step": 21540 | |
| }, | |
| { | |
| "epoch": 3.8193091231178036, | |
| "grad_norm": 3.1266699022361357, | |
| "learning_rate": 8.023114487702446e-07, | |
| "loss": 0.6708, | |
| "step": 21560 | |
| }, | |
| { | |
| "epoch": 3.8228520814880427, | |
| "grad_norm": 4.632817513631642, | |
| "learning_rate": 7.977773502281355e-07, | |
| "loss": 0.6564, | |
| "step": 21580 | |
| }, | |
| { | |
| "epoch": 3.8263950398582818, | |
| "grad_norm": 5.332789338782664, | |
| "learning_rate": 7.932536662251747e-07, | |
| "loss": 0.6521, | |
| "step": 21600 | |
| }, | |
| { | |
| "epoch": 3.829937998228521, | |
| "grad_norm": 4.360391036257879, | |
| "learning_rate": 7.887404244381683e-07, | |
| "loss": 0.6484, | |
| "step": 21620 | |
| }, | |
| { | |
| "epoch": 3.83348095659876, | |
| "grad_norm": 3.444603203101798, | |
| "learning_rate": 7.84237652480033e-07, | |
| "loss": 0.6651, | |
| "step": 21640 | |
| }, | |
| { | |
| "epoch": 3.837023914968999, | |
| "grad_norm": 4.102682609294379, | |
| "learning_rate": 7.797453778996284e-07, | |
| "loss": 0.6597, | |
| "step": 21660 | |
| }, | |
| { | |
| "epoch": 3.840566873339238, | |
| "grad_norm": 3.7772046806599917, | |
| "learning_rate": 7.752636281815923e-07, | |
| "loss": 0.669, | |
| "step": 21680 | |
| }, | |
| { | |
| "epoch": 3.8441098317094773, | |
| "grad_norm": 4.059355924000635, | |
| "learning_rate": 7.707924307461664e-07, | |
| "loss": 0.6333, | |
| "step": 21700 | |
| }, | |
| { | |
| "epoch": 3.8476527900797164, | |
| "grad_norm": 5.671547408240144, | |
| "learning_rate": 7.663318129490313e-07, | |
| "loss": 0.6299, | |
| "step": 21720 | |
| }, | |
| { | |
| "epoch": 3.851195748449956, | |
| "grad_norm": 3.8675019069565595, | |
| "learning_rate": 7.61881802081142e-07, | |
| "loss": 0.6915, | |
| "step": 21740 | |
| }, | |
| { | |
| "epoch": 3.8547387068201946, | |
| "grad_norm": 4.07995418510124, | |
| "learning_rate": 7.57442425368555e-07, | |
| "loss": 0.6051, | |
| "step": 21760 | |
| }, | |
| { | |
| "epoch": 3.858281665190434, | |
| "grad_norm": 4.316690207012028, | |
| "learning_rate": 7.53013709972267e-07, | |
| "loss": 0.6172, | |
| "step": 21780 | |
| }, | |
| { | |
| "epoch": 3.8618246235606732, | |
| "grad_norm": 2.973078463900836, | |
| "learning_rate": 7.485956829880455e-07, | |
| "loss": 0.6679, | |
| "step": 21800 | |
| }, | |
| { | |
| "epoch": 3.8653675819309123, | |
| "grad_norm": 5.5663765632065525, | |
| "learning_rate": 7.441883714462641e-07, | |
| "loss": 0.6259, | |
| "step": 21820 | |
| }, | |
| { | |
| "epoch": 3.8689105403011514, | |
| "grad_norm": 4.08577312555125, | |
| "learning_rate": 7.397918023117389e-07, | |
| "loss": 0.6318, | |
| "step": 21840 | |
| }, | |
| { | |
| "epoch": 3.8724534986713905, | |
| "grad_norm": 4.203797767658477, | |
| "learning_rate": 7.354060024835599e-07, | |
| "loss": 0.6391, | |
| "step": 21860 | |
| }, | |
| { | |
| "epoch": 3.8759964570416297, | |
| "grad_norm": 2.811964097058608, | |
| "learning_rate": 7.310309987949294e-07, | |
| "loss": 0.6946, | |
| "step": 21880 | |
| }, | |
| { | |
| "epoch": 3.8795394154118688, | |
| "grad_norm": 3.853792141294366, | |
| "learning_rate": 7.266668180129946e-07, | |
| "loss": 0.6468, | |
| "step": 21900 | |
| }, | |
| { | |
| "epoch": 3.8830823737821083, | |
| "grad_norm": 5.804907803894153, | |
| "learning_rate": 7.223134868386903e-07, | |
| "loss": 0.6124, | |
| "step": 21920 | |
| }, | |
| { | |
| "epoch": 3.886625332152347, | |
| "grad_norm": 5.778097135701537, | |
| "learning_rate": 7.179710319065672e-07, | |
| "loss": 0.7053, | |
| "step": 21940 | |
| }, | |
| { | |
| "epoch": 3.8901682905225865, | |
| "grad_norm": 3.6866863372568712, | |
| "learning_rate": 7.136394797846338e-07, | |
| "loss": 0.6541, | |
| "step": 21960 | |
| }, | |
| { | |
| "epoch": 3.8937112488928256, | |
| "grad_norm": 3.29536698985515, | |
| "learning_rate": 7.093188569741962e-07, | |
| "loss": 0.6287, | |
| "step": 21980 | |
| }, | |
| { | |
| "epoch": 3.8972542072630647, | |
| "grad_norm": 4.9327123681203275, | |
| "learning_rate": 7.050091899096869e-07, | |
| "loss": 0.6666, | |
| "step": 22000 | |
| }, | |
| { | |
| "epoch": 3.8972542072630647, | |
| "eval_loss": 0.8004346489906311, | |
| "eval_runtime": 378.4036, | |
| "eval_samples_per_second": 25.124, | |
| "eval_steps_per_second": 3.142, | |
| "step": 22000 | |
| } | |
| ], | |
| "logging_steps": 20, | |
| "max_steps": 28225, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 5, | |
| "save_steps": 2000, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 2905457032298496.0, | |
| "train_batch_size": 4, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |