| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 3.9996597771540356, | |
| "eval_steps": 500, | |
| "global_step": 11756, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.003402228459641065, | |
| "grad_norm": 7.6875, | |
| "learning_rate": 3.809091090277921e-07, | |
| "loss": 4.24, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.00680445691928213, | |
| "grad_norm": 6.46875, | |
| "learning_rate": 7.618182180555842e-07, | |
| "loss": 4.4323, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.010206685378923195, | |
| "grad_norm": 8.4375, | |
| "learning_rate": 1.1427273270833762e-06, | |
| "loss": 4.2758, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.01360891383856426, | |
| "grad_norm": 7.53125, | |
| "learning_rate": 1.5236364361111684e-06, | |
| "loss": 4.1231, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.017011142298205325, | |
| "grad_norm": 5.90625, | |
| "learning_rate": 1.9045455451389605e-06, | |
| "loss": 4.097, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.02041337075784639, | |
| "grad_norm": 5.15625, | |
| "learning_rate": 2.2854546541667524e-06, | |
| "loss": 4.0712, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.023815599217487455, | |
| "grad_norm": 4.5625, | |
| "learning_rate": 2.6663637631945448e-06, | |
| "loss": 3.8851, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.02721782767712852, | |
| "grad_norm": 6.78125, | |
| "learning_rate": 3.0472728722223367e-06, | |
| "loss": 3.6937, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.030620056136769585, | |
| "grad_norm": 8.25, | |
| "learning_rate": 3.4281819812501286e-06, | |
| "loss": 3.6468, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.03402228459641065, | |
| "grad_norm": 9.625, | |
| "learning_rate": 3.809091090277921e-06, | |
| "loss": 3.4787, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.03742451305605171, | |
| "grad_norm": 7.53125, | |
| "learning_rate": 4.190000199305713e-06, | |
| "loss": 3.3235, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.04082674151569278, | |
| "grad_norm": 9.4375, | |
| "learning_rate": 4.570909308333505e-06, | |
| "loss": 3.2806, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.04422896997533384, | |
| "grad_norm": 10.3125, | |
| "learning_rate": 4.951818417361297e-06, | |
| "loss": 3.0432, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.04763119843497491, | |
| "grad_norm": 5.84375, | |
| "learning_rate": 5.3327275263890896e-06, | |
| "loss": 2.8991, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.05103342689461597, | |
| "grad_norm": 4.1875, | |
| "learning_rate": 5.7136366354168815e-06, | |
| "loss": 2.8202, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.05443565535425704, | |
| "grad_norm": 1.828125, | |
| "learning_rate": 6.094545744444673e-06, | |
| "loss": 2.6361, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.0578378838138981, | |
| "grad_norm": 1.8359375, | |
| "learning_rate": 6.475454853472465e-06, | |
| "loss": 2.5525, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.06124011227353917, | |
| "grad_norm": 1.765625, | |
| "learning_rate": 6.856363962500257e-06, | |
| "loss": 2.5685, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.06464234073318023, | |
| "grad_norm": 2.125, | |
| "learning_rate": 7.237273071528049e-06, | |
| "loss": 2.5133, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.0680445691928213, | |
| "grad_norm": 1.71875, | |
| "learning_rate": 7.618182180555842e-06, | |
| "loss": 2.4096, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.07144679765246237, | |
| "grad_norm": 1.9140625, | |
| "learning_rate": 7.999091289583632e-06, | |
| "loss": 2.4864, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.07484902611210342, | |
| "grad_norm": 1.9765625, | |
| "learning_rate": 8.380000398611426e-06, | |
| "loss": 2.4321, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.07825125457174449, | |
| "grad_norm": 2.3125, | |
| "learning_rate": 8.760909507639218e-06, | |
| "loss": 2.3582, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.08165348303138556, | |
| "grad_norm": 2.3125, | |
| "learning_rate": 9.14181861666701e-06, | |
| "loss": 2.3401, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.08505571149102663, | |
| "grad_norm": 2.625, | |
| "learning_rate": 9.522727725694802e-06, | |
| "loss": 2.3312, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.08845793995066768, | |
| "grad_norm": 1.9609375, | |
| "learning_rate": 9.903636834722594e-06, | |
| "loss": 2.3672, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.09186016841030875, | |
| "grad_norm": 1.453125, | |
| "learning_rate": 1.0284545943750385e-05, | |
| "loss": 2.3025, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.09526239686994982, | |
| "grad_norm": 1.46875, | |
| "learning_rate": 1.0665455052778179e-05, | |
| "loss": 2.3273, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.09866462532959089, | |
| "grad_norm": 2.25, | |
| "learning_rate": 1.104636416180597e-05, | |
| "loss": 2.2746, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.10206685378923194, | |
| "grad_norm": 1.5859375, | |
| "learning_rate": 1.1427273270833763e-05, | |
| "loss": 2.3196, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.10546908224887301, | |
| "grad_norm": 1.5078125, | |
| "learning_rate": 1.1808182379861553e-05, | |
| "loss": 2.2645, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.10887131070851408, | |
| "grad_norm": 1.6640625, | |
| "learning_rate": 1.2189091488889347e-05, | |
| "loss": 2.2902, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.11227353916815515, | |
| "grad_norm": 1.5859375, | |
| "learning_rate": 1.2570000597917139e-05, | |
| "loss": 2.2503, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.1156757676277962, | |
| "grad_norm": 1.5, | |
| "learning_rate": 1.295090970694493e-05, | |
| "loss": 2.1882, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.11907799608743727, | |
| "grad_norm": 1.359375, | |
| "learning_rate": 1.3331818815972723e-05, | |
| "loss": 2.2266, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.12248022454707834, | |
| "grad_norm": 1.8125, | |
| "learning_rate": 1.344607904627746e-05, | |
| "loss": 2.2011, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.1258824530067194, | |
| "grad_norm": 1.4765625, | |
| "learning_rate": 1.3446017810126854e-05, | |
| "loss": 2.1828, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.12928468146636046, | |
| "grad_norm": 1.5234375, | |
| "learning_rate": 1.3445905544333626e-05, | |
| "loss": 2.2727, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.13268690992600152, | |
| "grad_norm": 1.6328125, | |
| "learning_rate": 1.344574224974991e-05, | |
| "loss": 2.2222, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.1360891383856426, | |
| "grad_norm": 1.59375, | |
| "learning_rate": 1.3445527927615165e-05, | |
| "loss": 2.2107, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.13949136684528365, | |
| "grad_norm": 1.515625, | |
| "learning_rate": 1.3445262579556173e-05, | |
| "loss": 2.1671, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 0.14289359530492474, | |
| "grad_norm": 1.3671875, | |
| "learning_rate": 1.3444946207587011e-05, | |
| "loss": 2.1878, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.1462958237645658, | |
| "grad_norm": 1.4453125, | |
| "learning_rate": 1.3444578814109056e-05, | |
| "loss": 2.1358, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 0.14969805222420685, | |
| "grad_norm": 1.734375, | |
| "learning_rate": 1.3444160401910943e-05, | |
| "loss": 2.1564, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 0.15310028068384793, | |
| "grad_norm": 1.4765625, | |
| "learning_rate": 1.3443690974168565e-05, | |
| "loss": 2.1756, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.15650250914348898, | |
| "grad_norm": 1.546875, | |
| "learning_rate": 1.344317053444504e-05, | |
| "loss": 2.1606, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 0.15990473760313004, | |
| "grad_norm": 1.78125, | |
| "learning_rate": 1.344259908669068e-05, | |
| "loss": 2.2352, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 0.16330696606277112, | |
| "grad_norm": 1.5078125, | |
| "learning_rate": 1.3441976635242969e-05, | |
| "loss": 2.1258, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 0.16670919452241217, | |
| "grad_norm": 1.6484375, | |
| "learning_rate": 1.3441303184826526e-05, | |
| "loss": 2.1533, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 0.17011142298205326, | |
| "grad_norm": 1.78125, | |
| "learning_rate": 1.3440578740553065e-05, | |
| "loss": 2.1179, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.1735136514416943, | |
| "grad_norm": 1.484375, | |
| "learning_rate": 1.3439803307921367e-05, | |
| "loss": 2.1868, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 0.17691587990133537, | |
| "grad_norm": 1.671875, | |
| "learning_rate": 1.343897689281723e-05, | |
| "loss": 2.1144, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 0.18031810836097645, | |
| "grad_norm": 1.5078125, | |
| "learning_rate": 1.343809950151342e-05, | |
| "loss": 2.1722, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 0.1837203368206175, | |
| "grad_norm": 1.6171875, | |
| "learning_rate": 1.3437171140669643e-05, | |
| "loss": 2.1725, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 0.18712256528025856, | |
| "grad_norm": 1.5234375, | |
| "learning_rate": 1.3436191817332471e-05, | |
| "loss": 2.1871, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.19052479373989964, | |
| "grad_norm": 1.7890625, | |
| "learning_rate": 1.3435161538935297e-05, | |
| "loss": 2.2134, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 0.1939270221995407, | |
| "grad_norm": 1.78125, | |
| "learning_rate": 1.3434080313298288e-05, | |
| "loss": 2.1545, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 0.19732925065918178, | |
| "grad_norm": 1.6328125, | |
| "learning_rate": 1.3432948148628312e-05, | |
| "loss": 2.1173, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 0.20073147911882283, | |
| "grad_norm": 1.640625, | |
| "learning_rate": 1.3431765053518884e-05, | |
| "loss": 2.1703, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 0.20413370757846389, | |
| "grad_norm": 1.6796875, | |
| "learning_rate": 1.3430531036950099e-05, | |
| "loss": 2.1662, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.20753593603810497, | |
| "grad_norm": 1.6171875, | |
| "learning_rate": 1.3429246108288562e-05, | |
| "loss": 2.153, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 0.21093816449774602, | |
| "grad_norm": 1.6328125, | |
| "learning_rate": 1.3427910277287318e-05, | |
| "loss": 2.1421, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 0.21434039295738708, | |
| "grad_norm": 1.4453125, | |
| "learning_rate": 1.3426523554085776e-05, | |
| "loss": 2.1315, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 0.21774262141702816, | |
| "grad_norm": 1.5703125, | |
| "learning_rate": 1.342508594920964e-05, | |
| "loss": 2.1187, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 0.22114484987666921, | |
| "grad_norm": 1.7578125, | |
| "learning_rate": 1.342359747357082e-05, | |
| "loss": 2.1447, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 0.2245470783363103, | |
| "grad_norm": 1.671875, | |
| "learning_rate": 1.3422058138467349e-05, | |
| "loss": 2.1614, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 0.22794930679595135, | |
| "grad_norm": 1.5390625, | |
| "learning_rate": 1.3420467955583304e-05, | |
| "loss": 2.1521, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 0.2313515352555924, | |
| "grad_norm": 1.6953125, | |
| "learning_rate": 1.3418826936988714e-05, | |
| "loss": 2.1474, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 0.2347537637152335, | |
| "grad_norm": 1.6484375, | |
| "learning_rate": 1.3417135095139467e-05, | |
| "loss": 2.1887, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 0.23815599217487454, | |
| "grad_norm": 1.71875, | |
| "learning_rate": 1.341539244287722e-05, | |
| "loss": 2.1432, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.2415582206345156, | |
| "grad_norm": 1.8046875, | |
| "learning_rate": 1.3413598993429295e-05, | |
| "loss": 2.1202, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 0.24496044909415668, | |
| "grad_norm": 1.7578125, | |
| "learning_rate": 1.3411754760408584e-05, | |
| "loss": 2.201, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 0.24836267755379773, | |
| "grad_norm": 1.5390625, | |
| "learning_rate": 1.3409859757813437e-05, | |
| "loss": 2.104, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 0.2517649060134388, | |
| "grad_norm": 1.703125, | |
| "learning_rate": 1.3407914000027573e-05, | |
| "loss": 2.1118, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 0.25516713447307987, | |
| "grad_norm": 1.5546875, | |
| "learning_rate": 1.3405917501819956e-05, | |
| "loss": 2.1533, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 0.2585693629327209, | |
| "grad_norm": 1.3828125, | |
| "learning_rate": 1.340387027834468e-05, | |
| "loss": 2.0738, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 0.261971591392362, | |
| "grad_norm": 1.625, | |
| "learning_rate": 1.3401772345140874e-05, | |
| "loss": 2.1696, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 0.26537381985200303, | |
| "grad_norm": 1.921875, | |
| "learning_rate": 1.3399623718132557e-05, | |
| "loss": 2.0847, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 0.26877604831164414, | |
| "grad_norm": 1.5390625, | |
| "learning_rate": 1.3397424413628542e-05, | |
| "loss": 2.1644, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 0.2721782767712852, | |
| "grad_norm": 1.640625, | |
| "learning_rate": 1.3395174448322298e-05, | |
| "loss": 2.0891, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.27558050523092625, | |
| "grad_norm": 1.9453125, | |
| "learning_rate": 1.3392873839291825e-05, | |
| "loss": 2.1638, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 0.2789827336905673, | |
| "grad_norm": 1.625, | |
| "learning_rate": 1.339052260399953e-05, | |
| "loss": 2.078, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 0.28238496215020836, | |
| "grad_norm": 1.7890625, | |
| "learning_rate": 1.3388120760292085e-05, | |
| "loss": 2.1191, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 0.2857871906098495, | |
| "grad_norm": 1.765625, | |
| "learning_rate": 1.33856683264003e-05, | |
| "loss": 2.0554, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 0.2891894190694905, | |
| "grad_norm": 1.8203125, | |
| "learning_rate": 1.3383165320938983e-05, | |
| "loss": 2.0385, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 0.2925916475291316, | |
| "grad_norm": 1.7109375, | |
| "learning_rate": 1.3380611762906796e-05, | |
| "loss": 2.1071, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 0.29599387598877264, | |
| "grad_norm": 1.6640625, | |
| "learning_rate": 1.3378007671686113e-05, | |
| "loss": 2.1171, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 0.2993961044484137, | |
| "grad_norm": 1.4609375, | |
| "learning_rate": 1.337535306704287e-05, | |
| "loss": 2.1264, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 0.3027983329080548, | |
| "grad_norm": 1.75, | |
| "learning_rate": 1.337264796912642e-05, | |
| "loss": 2.0562, | |
| "step": 890 | |
| }, | |
| { | |
| "epoch": 0.30620056136769586, | |
| "grad_norm": 1.78125, | |
| "learning_rate": 1.3369892398469373e-05, | |
| "loss": 2.1343, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.3096027898273369, | |
| "grad_norm": 1.53125, | |
| "learning_rate": 1.3367086375987447e-05, | |
| "loss": 2.0563, | |
| "step": 910 | |
| }, | |
| { | |
| "epoch": 0.31300501828697797, | |
| "grad_norm": 1.7578125, | |
| "learning_rate": 1.3364229922979311e-05, | |
| "loss": 2.1302, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 0.316407246746619, | |
| "grad_norm": 1.609375, | |
| "learning_rate": 1.3361323061126409e-05, | |
| "loss": 2.0733, | |
| "step": 930 | |
| }, | |
| { | |
| "epoch": 0.3198094752062601, | |
| "grad_norm": 1.921875, | |
| "learning_rate": 1.3358365812492812e-05, | |
| "loss": 2.1027, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 0.3232117036659012, | |
| "grad_norm": 1.7265625, | |
| "learning_rate": 1.3355358199525042e-05, | |
| "loss": 2.0455, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 0.32661393212554224, | |
| "grad_norm": 1.6953125, | |
| "learning_rate": 1.3352300245051904e-05, | |
| "loss": 2.0785, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 0.3300161605851833, | |
| "grad_norm": 1.671875, | |
| "learning_rate": 1.3349191972284314e-05, | |
| "loss": 2.1594, | |
| "step": 970 | |
| }, | |
| { | |
| "epoch": 0.33341838904482435, | |
| "grad_norm": 1.78125, | |
| "learning_rate": 1.3346033404815114e-05, | |
| "loss": 2.066, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 0.3368206175044654, | |
| "grad_norm": 1.59375, | |
| "learning_rate": 1.3342824566618907e-05, | |
| "loss": 2.1451, | |
| "step": 990 | |
| }, | |
| { | |
| "epoch": 0.3402228459641065, | |
| "grad_norm": 1.6953125, | |
| "learning_rate": 1.3339565482051866e-05, | |
| "loss": 2.152, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.34362507442374757, | |
| "grad_norm": 1.7109375, | |
| "learning_rate": 1.3336256175851549e-05, | |
| "loss": 2.1232, | |
| "step": 1010 | |
| }, | |
| { | |
| "epoch": 0.3470273028833886, | |
| "grad_norm": 1.8828125, | |
| "learning_rate": 1.3332896673136717e-05, | |
| "loss": 2.1158, | |
| "step": 1020 | |
| }, | |
| { | |
| "epoch": 0.3504295313430297, | |
| "grad_norm": 1.7421875, | |
| "learning_rate": 1.3329486999407136e-05, | |
| "loss": 2.102, | |
| "step": 1030 | |
| }, | |
| { | |
| "epoch": 0.35383175980267073, | |
| "grad_norm": 1.8125, | |
| "learning_rate": 1.3326027180543387e-05, | |
| "loss": 2.1266, | |
| "step": 1040 | |
| }, | |
| { | |
| "epoch": 0.35723398826231184, | |
| "grad_norm": 1.421875, | |
| "learning_rate": 1.3322517242806673e-05, | |
| "loss": 2.0884, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 0.3606362167219529, | |
| "grad_norm": 1.5546875, | |
| "learning_rate": 1.3318957212838615e-05, | |
| "loss": 2.0793, | |
| "step": 1060 | |
| }, | |
| { | |
| "epoch": 0.36403844518159395, | |
| "grad_norm": 1.78125, | |
| "learning_rate": 1.3315347117661048e-05, | |
| "loss": 2.0574, | |
| "step": 1070 | |
| }, | |
| { | |
| "epoch": 0.367440673641235, | |
| "grad_norm": 1.6171875, | |
| "learning_rate": 1.3311686984675822e-05, | |
| "loss": 2.0716, | |
| "step": 1080 | |
| }, | |
| { | |
| "epoch": 0.37084290210087606, | |
| "grad_norm": 1.8671875, | |
| "learning_rate": 1.3307976841664591e-05, | |
| "loss": 2.0523, | |
| "step": 1090 | |
| }, | |
| { | |
| "epoch": 0.3742451305605171, | |
| "grad_norm": 1.703125, | |
| "learning_rate": 1.33042167167886e-05, | |
| "loss": 2.0203, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.3776473590201582, | |
| "grad_norm": 1.546875, | |
| "learning_rate": 1.330040663858848e-05, | |
| "loss": 2.0823, | |
| "step": 1110 | |
| }, | |
| { | |
| "epoch": 0.3810495874797993, | |
| "grad_norm": 1.796875, | |
| "learning_rate": 1.3296546635984012e-05, | |
| "loss": 2.0758, | |
| "step": 1120 | |
| }, | |
| { | |
| "epoch": 0.38445181593944033, | |
| "grad_norm": 1.7421875, | |
| "learning_rate": 1.3292636738273931e-05, | |
| "loss": 2.1138, | |
| "step": 1130 | |
| }, | |
| { | |
| "epoch": 0.3878540443990814, | |
| "grad_norm": 1.5, | |
| "learning_rate": 1.3288676975135689e-05, | |
| "loss": 2.0277, | |
| "step": 1140 | |
| }, | |
| { | |
| "epoch": 0.39125627285872244, | |
| "grad_norm": 1.5703125, | |
| "learning_rate": 1.3284667376625236e-05, | |
| "loss": 2.042, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 0.39465850131836355, | |
| "grad_norm": 1.8515625, | |
| "learning_rate": 1.3280607973176785e-05, | |
| "loss": 2.114, | |
| "step": 1160 | |
| }, | |
| { | |
| "epoch": 0.3980607297780046, | |
| "grad_norm": 1.796875, | |
| "learning_rate": 1.327649879560259e-05, | |
| "loss": 2.0477, | |
| "step": 1170 | |
| }, | |
| { | |
| "epoch": 0.40146295823764566, | |
| "grad_norm": 1.8046875, | |
| "learning_rate": 1.3272339875092701e-05, | |
| "loss": 2.0101, | |
| "step": 1180 | |
| }, | |
| { | |
| "epoch": 0.4048651866972867, | |
| "grad_norm": 1.984375, | |
| "learning_rate": 1.3268131243214744e-05, | |
| "loss": 2.1261, | |
| "step": 1190 | |
| }, | |
| { | |
| "epoch": 0.40826741515692777, | |
| "grad_norm": 1.9375, | |
| "learning_rate": 1.326387293191366e-05, | |
| "loss": 2.0788, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.4116696436165688, | |
| "grad_norm": 1.78125, | |
| "learning_rate": 1.325956497351148e-05, | |
| "loss": 2.0694, | |
| "step": 1210 | |
| }, | |
| { | |
| "epoch": 0.41507187207620994, | |
| "grad_norm": 1.9296875, | |
| "learning_rate": 1.3255207400707076e-05, | |
| "loss": 2.11, | |
| "step": 1220 | |
| }, | |
| { | |
| "epoch": 0.418474100535851, | |
| "grad_norm": 1.796875, | |
| "learning_rate": 1.3250800246575906e-05, | |
| "loss": 2.0621, | |
| "step": 1230 | |
| }, | |
| { | |
| "epoch": 0.42187632899549204, | |
| "grad_norm": 1.6875, | |
| "learning_rate": 1.3246343544569764e-05, | |
| "loss": 2.0923, | |
| "step": 1240 | |
| }, | |
| { | |
| "epoch": 0.4252785574551331, | |
| "grad_norm": 1.6640625, | |
| "learning_rate": 1.3241837328516535e-05, | |
| "loss": 2.1005, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 0.42868078591477415, | |
| "grad_norm": 1.953125, | |
| "learning_rate": 1.323728163261993e-05, | |
| "loss": 2.0634, | |
| "step": 1260 | |
| }, | |
| { | |
| "epoch": 0.43208301437441526, | |
| "grad_norm": 1.859375, | |
| "learning_rate": 1.323267649145923e-05, | |
| "loss": 2.0635, | |
| "step": 1270 | |
| }, | |
| { | |
| "epoch": 0.4354852428340563, | |
| "grad_norm": 1.640625, | |
| "learning_rate": 1.3228021939989018e-05, | |
| "loss": 2.131, | |
| "step": 1280 | |
| }, | |
| { | |
| "epoch": 0.4388874712936974, | |
| "grad_norm": 1.7421875, | |
| "learning_rate": 1.3223318013538927e-05, | |
| "loss": 2.1021, | |
| "step": 1290 | |
| }, | |
| { | |
| "epoch": 0.44228969975333843, | |
| "grad_norm": 1.734375, | |
| "learning_rate": 1.3218564747813355e-05, | |
| "loss": 2.0758, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 0.4456919282129795, | |
| "grad_norm": 1.6953125, | |
| "learning_rate": 1.3213762178891202e-05, | |
| "loss": 2.0198, | |
| "step": 1310 | |
| }, | |
| { | |
| "epoch": 0.4490941566726206, | |
| "grad_norm": 1.8515625, | |
| "learning_rate": 1.3208910343225603e-05, | |
| "loss": 2.1226, | |
| "step": 1320 | |
| }, | |
| { | |
| "epoch": 0.45249638513226165, | |
| "grad_norm": 1.703125, | |
| "learning_rate": 1.3204009277643636e-05, | |
| "loss": 2.077, | |
| "step": 1330 | |
| }, | |
| { | |
| "epoch": 0.4558986135919027, | |
| "grad_norm": 1.6953125, | |
| "learning_rate": 1.3199059019346055e-05, | |
| "loss": 2.1154, | |
| "step": 1340 | |
| }, | |
| { | |
| "epoch": 0.45930084205154376, | |
| "grad_norm": 1.8984375, | |
| "learning_rate": 1.3194059605907003e-05, | |
| "loss": 2.1109, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 0.4627030705111848, | |
| "grad_norm": 1.8203125, | |
| "learning_rate": 1.318901107527373e-05, | |
| "loss": 2.1108, | |
| "step": 1360 | |
| }, | |
| { | |
| "epoch": 0.46610529897082587, | |
| "grad_norm": 2.09375, | |
| "learning_rate": 1.3183913465766294e-05, | |
| "loss": 2.1203, | |
| "step": 1370 | |
| }, | |
| { | |
| "epoch": 0.469507527430467, | |
| "grad_norm": 1.8671875, | |
| "learning_rate": 1.3178766816077288e-05, | |
| "loss": 2.0667, | |
| "step": 1380 | |
| }, | |
| { | |
| "epoch": 0.47290975589010803, | |
| "grad_norm": 1.8671875, | |
| "learning_rate": 1.317357116527153e-05, | |
| "loss": 2.0428, | |
| "step": 1390 | |
| }, | |
| { | |
| "epoch": 0.4763119843497491, | |
| "grad_norm": 1.703125, | |
| "learning_rate": 1.3168326552785775e-05, | |
| "loss": 2.0836, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 0.47971421280939014, | |
| "grad_norm": 1.6015625, | |
| "learning_rate": 1.3163033018428418e-05, | |
| "loss": 2.0031, | |
| "step": 1410 | |
| }, | |
| { | |
| "epoch": 0.4831164412690312, | |
| "grad_norm": 2.0625, | |
| "learning_rate": 1.315769060237918e-05, | |
| "loss": 2.096, | |
| "step": 1420 | |
| }, | |
| { | |
| "epoch": 0.4865186697286723, | |
| "grad_norm": 1.828125, | |
| "learning_rate": 1.3152299345188815e-05, | |
| "loss": 2.0325, | |
| "step": 1430 | |
| }, | |
| { | |
| "epoch": 0.48992089818831336, | |
| "grad_norm": 1.65625, | |
| "learning_rate": 1.3146859287778799e-05, | |
| "loss": 2.0444, | |
| "step": 1440 | |
| }, | |
| { | |
| "epoch": 0.4933231266479544, | |
| "grad_norm": 2.140625, | |
| "learning_rate": 1.3141370471441016e-05, | |
| "loss": 2.0971, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 0.49672535510759547, | |
| "grad_norm": 2.0, | |
| "learning_rate": 1.3135832937837444e-05, | |
| "loss": 2.0014, | |
| "step": 1460 | |
| }, | |
| { | |
| "epoch": 0.5001275835672365, | |
| "grad_norm": 1.6796875, | |
| "learning_rate": 1.3130246728999852e-05, | |
| "loss": 2.0086, | |
| "step": 1470 | |
| }, | |
| { | |
| "epoch": 0.5035298120268776, | |
| "grad_norm": 1.78125, | |
| "learning_rate": 1.3124611887329459e-05, | |
| "loss": 2.0079, | |
| "step": 1480 | |
| }, | |
| { | |
| "epoch": 0.5069320404865186, | |
| "grad_norm": 1.9296875, | |
| "learning_rate": 1.3118928455596627e-05, | |
| "loss": 2.0654, | |
| "step": 1490 | |
| }, | |
| { | |
| "epoch": 0.5103342689461597, | |
| "grad_norm": 1.875, | |
| "learning_rate": 1.3113196476940538e-05, | |
| "loss": 2.0195, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.5137364974058009, | |
| "grad_norm": 1.8203125, | |
| "learning_rate": 1.3107415994868855e-05, | |
| "loss": 2.0196, | |
| "step": 1510 | |
| }, | |
| { | |
| "epoch": 0.5171387258654419, | |
| "grad_norm": 2.125, | |
| "learning_rate": 1.3101587053257404e-05, | |
| "loss": 2.0552, | |
| "step": 1520 | |
| }, | |
| { | |
| "epoch": 0.520540954325083, | |
| "grad_norm": 1.734375, | |
| "learning_rate": 1.3095709696349833e-05, | |
| "loss": 2.0833, | |
| "step": 1530 | |
| }, | |
| { | |
| "epoch": 0.523943182784724, | |
| "grad_norm": 1.765625, | |
| "learning_rate": 1.3089783968757277e-05, | |
| "loss": 2.1067, | |
| "step": 1540 | |
| }, | |
| { | |
| "epoch": 0.5273454112443651, | |
| "grad_norm": 1.9921875, | |
| "learning_rate": 1.308380991545802e-05, | |
| "loss": 2.0313, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 0.5307476397040061, | |
| "grad_norm": 1.9296875, | |
| "learning_rate": 1.3077787581797163e-05, | |
| "loss": 2.0918, | |
| "step": 1560 | |
| }, | |
| { | |
| "epoch": 0.5341498681636472, | |
| "grad_norm": 1.609375, | |
| "learning_rate": 1.3071717013486259e-05, | |
| "loss": 2.0505, | |
| "step": 1570 | |
| }, | |
| { | |
| "epoch": 0.5375520966232883, | |
| "grad_norm": 1.421875, | |
| "learning_rate": 1.3065598256602989e-05, | |
| "loss": 2.1166, | |
| "step": 1580 | |
| }, | |
| { | |
| "epoch": 0.5409543250829293, | |
| "grad_norm": 1.6015625, | |
| "learning_rate": 1.3059431357590797e-05, | |
| "loss": 2.1196, | |
| "step": 1590 | |
| }, | |
| { | |
| "epoch": 0.5443565535425704, | |
| "grad_norm": 1.765625, | |
| "learning_rate": 1.3053216363258537e-05, | |
| "loss": 2.0623, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 0.5477587820022114, | |
| "grad_norm": 1.671875, | |
| "learning_rate": 1.3046953320780136e-05, | |
| "loss": 2.051, | |
| "step": 1610 | |
| }, | |
| { | |
| "epoch": 0.5511610104618525, | |
| "grad_norm": 1.734375, | |
| "learning_rate": 1.304064227769421e-05, | |
| "loss": 2.0341, | |
| "step": 1620 | |
| }, | |
| { | |
| "epoch": 0.5545632389214936, | |
| "grad_norm": 1.8671875, | |
| "learning_rate": 1.3034283281903722e-05, | |
| "loss": 2.001, | |
| "step": 1630 | |
| }, | |
| { | |
| "epoch": 0.5579654673811346, | |
| "grad_norm": 2.125, | |
| "learning_rate": 1.3027876381675611e-05, | |
| "loss": 1.9871, | |
| "step": 1640 | |
| }, | |
| { | |
| "epoch": 0.5613676958407757, | |
| "grad_norm": 1.8359375, | |
| "learning_rate": 1.3021421625640427e-05, | |
| "loss": 2.0712, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 0.5647699243004167, | |
| "grad_norm": 1.8671875, | |
| "learning_rate": 1.3014919062791965e-05, | |
| "loss": 2.0444, | |
| "step": 1660 | |
| }, | |
| { | |
| "epoch": 0.5681721527600578, | |
| "grad_norm": 1.9609375, | |
| "learning_rate": 1.3008368742486882e-05, | |
| "loss": 2.0598, | |
| "step": 1670 | |
| }, | |
| { | |
| "epoch": 0.571574381219699, | |
| "grad_norm": 1.8828125, | |
| "learning_rate": 1.300177071444434e-05, | |
| "loss": 2.0744, | |
| "step": 1680 | |
| }, | |
| { | |
| "epoch": 0.57497660967934, | |
| "grad_norm": 2.109375, | |
| "learning_rate": 1.299512502874561e-05, | |
| "loss": 1.9854, | |
| "step": 1690 | |
| }, | |
| { | |
| "epoch": 0.578378838138981, | |
| "grad_norm": 2.0, | |
| "learning_rate": 1.2988431735833709e-05, | |
| "loss": 2.0348, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 0.581781066598622, | |
| "grad_norm": 1.84375, | |
| "learning_rate": 1.2981690886513001e-05, | |
| "loss": 2.0189, | |
| "step": 1710 | |
| }, | |
| { | |
| "epoch": 0.5851832950582632, | |
| "grad_norm": 1.875, | |
| "learning_rate": 1.2974902531948826e-05, | |
| "loss": 1.9997, | |
| "step": 1720 | |
| }, | |
| { | |
| "epoch": 0.5885855235179043, | |
| "grad_norm": 1.6640625, | |
| "learning_rate": 1.2968066723667104e-05, | |
| "loss": 1.9861, | |
| "step": 1730 | |
| }, | |
| { | |
| "epoch": 0.5919877519775453, | |
| "grad_norm": 1.796875, | |
| "learning_rate": 1.2961183513553937e-05, | |
| "loss": 2.0284, | |
| "step": 1740 | |
| }, | |
| { | |
| "epoch": 0.5953899804371864, | |
| "grad_norm": 1.734375, | |
| "learning_rate": 1.2954252953855236e-05, | |
| "loss": 2.0376, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 0.5987922088968274, | |
| "grad_norm": 1.7734375, | |
| "learning_rate": 1.2947275097176301e-05, | |
| "loss": 2.0059, | |
| "step": 1760 | |
| }, | |
| { | |
| "epoch": 0.6021944373564685, | |
| "grad_norm": 2.09375, | |
| "learning_rate": 1.2940249996481436e-05, | |
| "loss": 2.0906, | |
| "step": 1770 | |
| }, | |
| { | |
| "epoch": 0.6055966658161096, | |
| "grad_norm": 1.8359375, | |
| "learning_rate": 1.2933177705093541e-05, | |
| "loss": 2.0076, | |
| "step": 1780 | |
| }, | |
| { | |
| "epoch": 0.6089988942757506, | |
| "grad_norm": 1.7265625, | |
| "learning_rate": 1.2926058276693715e-05, | |
| "loss": 2.0247, | |
| "step": 1790 | |
| }, | |
| { | |
| "epoch": 0.6124011227353917, | |
| "grad_norm": 1.8359375, | |
| "learning_rate": 1.2918891765320837e-05, | |
| "loss": 2.113, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 0.6158033511950327, | |
| "grad_norm": 1.671875, | |
| "learning_rate": 1.2911678225371164e-05, | |
| "loss": 2.0201, | |
| "step": 1810 | |
| }, | |
| { | |
| "epoch": 0.6192055796546738, | |
| "grad_norm": 1.8828125, | |
| "learning_rate": 1.2904417711597916e-05, | |
| "loss": 2.0172, | |
| "step": 1820 | |
| }, | |
| { | |
| "epoch": 0.6226078081143149, | |
| "grad_norm": 1.9609375, | |
| "learning_rate": 1.289711027911086e-05, | |
| "loss": 2.1396, | |
| "step": 1830 | |
| }, | |
| { | |
| "epoch": 0.6260100365739559, | |
| "grad_norm": 1.75, | |
| "learning_rate": 1.2889755983375892e-05, | |
| "loss": 2.045, | |
| "step": 1840 | |
| }, | |
| { | |
| "epoch": 0.629412265033597, | |
| "grad_norm": 1.9375, | |
| "learning_rate": 1.2882354880214616e-05, | |
| "loss": 2.012, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 0.632814493493238, | |
| "grad_norm": 1.8671875, | |
| "learning_rate": 1.2874907025803922e-05, | |
| "loss": 2.058, | |
| "step": 1860 | |
| }, | |
| { | |
| "epoch": 0.6362167219528791, | |
| "grad_norm": 1.8359375, | |
| "learning_rate": 1.2867412476675554e-05, | |
| "loss": 2.0796, | |
| "step": 1870 | |
| }, | |
| { | |
| "epoch": 0.6396189504125201, | |
| "grad_norm": 1.8671875, | |
| "learning_rate": 1.2859871289715688e-05, | |
| "loss": 2.0956, | |
| "step": 1880 | |
| }, | |
| { | |
| "epoch": 0.6430211788721613, | |
| "grad_norm": 1.7421875, | |
| "learning_rate": 1.2852283522164496e-05, | |
| "loss": 1.983, | |
| "step": 1890 | |
| }, | |
| { | |
| "epoch": 0.6464234073318024, | |
| "grad_norm": 1.921875, | |
| "learning_rate": 1.2844649231615713e-05, | |
| "loss": 1.9861, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 0.6498256357914434, | |
| "grad_norm": 1.890625, | |
| "learning_rate": 1.2836968476016196e-05, | |
| "loss": 2.0683, | |
| "step": 1910 | |
| }, | |
| { | |
| "epoch": 0.6532278642510845, | |
| "grad_norm": 1.6875, | |
| "learning_rate": 1.2829241313665494e-05, | |
| "loss": 2.0916, | |
| "step": 1920 | |
| }, | |
| { | |
| "epoch": 0.6566300927107255, | |
| "grad_norm": 1.609375, | |
| "learning_rate": 1.2821467803215395e-05, | |
| "loss": 2.0254, | |
| "step": 1930 | |
| }, | |
| { | |
| "epoch": 0.6600323211703666, | |
| "grad_norm": 1.9765625, | |
| "learning_rate": 1.2813648003669482e-05, | |
| "loss": 2.0332, | |
| "step": 1940 | |
| }, | |
| { | |
| "epoch": 0.6634345496300077, | |
| "grad_norm": 1.9140625, | |
| "learning_rate": 1.2805781974382694e-05, | |
| "loss": 2.0225, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 0.6668367780896487, | |
| "grad_norm": 1.859375, | |
| "learning_rate": 1.2797869775060866e-05, | |
| "loss": 2.0563, | |
| "step": 1960 | |
| }, | |
| { | |
| "epoch": 0.6702390065492898, | |
| "grad_norm": 1.6953125, | |
| "learning_rate": 1.2789911465760281e-05, | |
| "loss": 2.0027, | |
| "step": 1970 | |
| }, | |
| { | |
| "epoch": 0.6736412350089308, | |
| "grad_norm": 1.890625, | |
| "learning_rate": 1.2781907106887209e-05, | |
| "loss": 1.9895, | |
| "step": 1980 | |
| }, | |
| { | |
| "epoch": 0.6770434634685719, | |
| "grad_norm": 2.015625, | |
| "learning_rate": 1.2773856759197455e-05, | |
| "loss": 2.0175, | |
| "step": 1990 | |
| }, | |
| { | |
| "epoch": 0.680445691928213, | |
| "grad_norm": 1.7890625, | |
| "learning_rate": 1.2765760483795895e-05, | |
| "loss": 2.0702, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.683847920387854, | |
| "grad_norm": 1.796875, | |
| "learning_rate": 1.275761834213601e-05, | |
| "loss": 2.023, | |
| "step": 2010 | |
| }, | |
| { | |
| "epoch": 0.6872501488474951, | |
| "grad_norm": 1.9140625, | |
| "learning_rate": 1.2749430396019423e-05, | |
| "loss": 2.0051, | |
| "step": 2020 | |
| }, | |
| { | |
| "epoch": 0.6906523773071361, | |
| "grad_norm": 1.9765625, | |
| "learning_rate": 1.2741196707595429e-05, | |
| "loss": 2.017, | |
| "step": 2030 | |
| }, | |
| { | |
| "epoch": 0.6940546057667772, | |
| "grad_norm": 1.9296875, | |
| "learning_rate": 1.273291733936052e-05, | |
| "loss": 2.0481, | |
| "step": 2040 | |
| }, | |
| { | |
| "epoch": 0.6974568342264184, | |
| "grad_norm": 1.7265625, | |
| "learning_rate": 1.2724592354157912e-05, | |
| "loss": 2.0281, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 0.7008590626860594, | |
| "grad_norm": 1.8984375, | |
| "learning_rate": 1.2716221815177076e-05, | |
| "loss": 2.0459, | |
| "step": 2060 | |
| }, | |
| { | |
| "epoch": 0.7042612911457005, | |
| "grad_norm": 2.21875, | |
| "learning_rate": 1.2707805785953245e-05, | |
| "loss": 2.0705, | |
| "step": 2070 | |
| }, | |
| { | |
| "epoch": 0.7076635196053415, | |
| "grad_norm": 2.109375, | |
| "learning_rate": 1.2699344330366942e-05, | |
| "loss": 2.0759, | |
| "step": 2080 | |
| }, | |
| { | |
| "epoch": 0.7110657480649826, | |
| "grad_norm": 1.765625, | |
| "learning_rate": 1.2690837512643495e-05, | |
| "loss": 2.0324, | |
| "step": 2090 | |
| }, | |
| { | |
| "epoch": 0.7144679765246237, | |
| "grad_norm": 1.75, | |
| "learning_rate": 1.2682285397352535e-05, | |
| "loss": 1.9784, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 0.7178702049842647, | |
| "grad_norm": 1.9140625, | |
| "learning_rate": 1.2673688049407526e-05, | |
| "loss": 1.9902, | |
| "step": 2110 | |
| }, | |
| { | |
| "epoch": 0.7212724334439058, | |
| "grad_norm": 1.890625, | |
| "learning_rate": 1.266504553406526e-05, | |
| "loss": 2.0631, | |
| "step": 2120 | |
| }, | |
| { | |
| "epoch": 0.7246746619035468, | |
| "grad_norm": 2.015625, | |
| "learning_rate": 1.2656357916925368e-05, | |
| "loss": 2.0039, | |
| "step": 2130 | |
| }, | |
| { | |
| "epoch": 0.7280768903631879, | |
| "grad_norm": 2.15625, | |
| "learning_rate": 1.2647625263929817e-05, | |
| "loss": 1.9975, | |
| "step": 2140 | |
| }, | |
| { | |
| "epoch": 0.7314791188228289, | |
| "grad_norm": 1.71875, | |
| "learning_rate": 1.2638847641362408e-05, | |
| "loss": 2.0368, | |
| "step": 2150 | |
| }, | |
| { | |
| "epoch": 0.73488134728247, | |
| "grad_norm": 1.9296875, | |
| "learning_rate": 1.2630025115848282e-05, | |
| "loss": 2.0954, | |
| "step": 2160 | |
| }, | |
| { | |
| "epoch": 0.7382835757421111, | |
| "grad_norm": 1.6484375, | |
| "learning_rate": 1.2621157754353404e-05, | |
| "loss": 2.0297, | |
| "step": 2170 | |
| }, | |
| { | |
| "epoch": 0.7416858042017521, | |
| "grad_norm": 1.65625, | |
| "learning_rate": 1.2612245624184062e-05, | |
| "loss": 2.0445, | |
| "step": 2180 | |
| }, | |
| { | |
| "epoch": 0.7450880326613932, | |
| "grad_norm": 1.7578125, | |
| "learning_rate": 1.2603288792986354e-05, | |
| "loss": 2.0587, | |
| "step": 2190 | |
| }, | |
| { | |
| "epoch": 0.7484902611210342, | |
| "grad_norm": 1.8203125, | |
| "learning_rate": 1.2594287328745672e-05, | |
| "loss": 2.0126, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 0.7518924895806753, | |
| "grad_norm": 1.7890625, | |
| "learning_rate": 1.258524129978619e-05, | |
| "loss": 2.0213, | |
| "step": 2210 | |
| }, | |
| { | |
| "epoch": 0.7552947180403164, | |
| "grad_norm": 1.953125, | |
| "learning_rate": 1.257615077477034e-05, | |
| "loss": 1.9826, | |
| "step": 2220 | |
| }, | |
| { | |
| "epoch": 0.7586969464999574, | |
| "grad_norm": 1.8515625, | |
| "learning_rate": 1.25670158226983e-05, | |
| "loss": 2.0467, | |
| "step": 2230 | |
| }, | |
| { | |
| "epoch": 0.7620991749595986, | |
| "grad_norm": 1.9765625, | |
| "learning_rate": 1.2557836512907456e-05, | |
| "loss": 1.9924, | |
| "step": 2240 | |
| }, | |
| { | |
| "epoch": 0.7655014034192396, | |
| "grad_norm": 2.140625, | |
| "learning_rate": 1.2548612915071894e-05, | |
| "loss": 1.9864, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 0.7689036318788807, | |
| "grad_norm": 1.921875, | |
| "learning_rate": 1.2539345099201851e-05, | |
| "loss": 1.9966, | |
| "step": 2260 | |
| }, | |
| { | |
| "epoch": 0.7723058603385218, | |
| "grad_norm": 1.875, | |
| "learning_rate": 1.2530033135643203e-05, | |
| "loss": 2.0092, | |
| "step": 2270 | |
| }, | |
| { | |
| "epoch": 0.7757080887981628, | |
| "grad_norm": 2.1875, | |
| "learning_rate": 1.2520677095076918e-05, | |
| "loss": 1.97, | |
| "step": 2280 | |
| }, | |
| { | |
| "epoch": 0.7791103172578039, | |
| "grad_norm": 1.96875, | |
| "learning_rate": 1.2511277048518522e-05, | |
| "loss": 1.9781, | |
| "step": 2290 | |
| }, | |
| { | |
| "epoch": 0.7825125457174449, | |
| "grad_norm": 1.953125, | |
| "learning_rate": 1.2501833067317562e-05, | |
| "loss": 2.0167, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 0.785914774177086, | |
| "grad_norm": 2.0, | |
| "learning_rate": 1.2492345223157068e-05, | |
| "loss": 2.0108, | |
| "step": 2310 | |
| }, | |
| { | |
| "epoch": 0.7893170026367271, | |
| "grad_norm": 1.6328125, | |
| "learning_rate": 1.2482813588053004e-05, | |
| "loss": 2.0094, | |
| "step": 2320 | |
| }, | |
| { | |
| "epoch": 0.7927192310963681, | |
| "grad_norm": 1.3671875, | |
| "learning_rate": 1.2473238234353713e-05, | |
| "loss": 1.9266, | |
| "step": 2330 | |
| }, | |
| { | |
| "epoch": 0.7961214595560092, | |
| "grad_norm": 1.765625, | |
| "learning_rate": 1.2463619234739388e-05, | |
| "loss": 1.9982, | |
| "step": 2340 | |
| }, | |
| { | |
| "epoch": 0.7995236880156502, | |
| "grad_norm": 1.875, | |
| "learning_rate": 1.2453956662221504e-05, | |
| "loss": 2.0688, | |
| "step": 2350 | |
| }, | |
| { | |
| "epoch": 0.8029259164752913, | |
| "grad_norm": 1.890625, | |
| "learning_rate": 1.2444250590142271e-05, | |
| "loss": 1.9658, | |
| "step": 2360 | |
| }, | |
| { | |
| "epoch": 0.8063281449349324, | |
| "grad_norm": 1.953125, | |
| "learning_rate": 1.2434501092174075e-05, | |
| "loss": 1.9954, | |
| "step": 2370 | |
| }, | |
| { | |
| "epoch": 0.8097303733945734, | |
| "grad_norm": 1.7421875, | |
| "learning_rate": 1.242470824231892e-05, | |
| "loss": 2.0507, | |
| "step": 2380 | |
| }, | |
| { | |
| "epoch": 0.8131326018542145, | |
| "grad_norm": 1.7109375, | |
| "learning_rate": 1.241487211490786e-05, | |
| "loss": 2.0469, | |
| "step": 2390 | |
| }, | |
| { | |
| "epoch": 0.8165348303138555, | |
| "grad_norm": 1.8203125, | |
| "learning_rate": 1.2404992784600451e-05, | |
| "loss": 2.0436, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 0.8199370587734967, | |
| "grad_norm": 1.78125, | |
| "learning_rate": 1.2395070326384164e-05, | |
| "loss": 2.0195, | |
| "step": 2410 | |
| }, | |
| { | |
| "epoch": 0.8233392872331377, | |
| "grad_norm": 2.21875, | |
| "learning_rate": 1.238510481557383e-05, | |
| "loss": 1.9674, | |
| "step": 2420 | |
| }, | |
| { | |
| "epoch": 0.8267415156927788, | |
| "grad_norm": 1.9609375, | |
| "learning_rate": 1.2375096327811061e-05, | |
| "loss": 1.9918, | |
| "step": 2430 | |
| }, | |
| { | |
| "epoch": 0.8301437441524199, | |
| "grad_norm": 2.078125, | |
| "learning_rate": 1.2365044939063687e-05, | |
| "loss": 2.0161, | |
| "step": 2440 | |
| }, | |
| { | |
| "epoch": 0.8335459726120609, | |
| "grad_norm": 1.9140625, | |
| "learning_rate": 1.2354950725625158e-05, | |
| "loss": 2.0303, | |
| "step": 2450 | |
| }, | |
| { | |
| "epoch": 0.836948201071702, | |
| "grad_norm": 2.109375, | |
| "learning_rate": 1.2344813764113985e-05, | |
| "loss": 1.973, | |
| "step": 2460 | |
| }, | |
| { | |
| "epoch": 0.840350429531343, | |
| "grad_norm": 1.9296875, | |
| "learning_rate": 1.2334634131473154e-05, | |
| "loss": 2.0389, | |
| "step": 2470 | |
| }, | |
| { | |
| "epoch": 0.8437526579909841, | |
| "grad_norm": 1.78125, | |
| "learning_rate": 1.2324411904969535e-05, | |
| "loss": 2.0597, | |
| "step": 2480 | |
| }, | |
| { | |
| "epoch": 0.8471548864506252, | |
| "grad_norm": 1.7734375, | |
| "learning_rate": 1.2314147162193302e-05, | |
| "loss": 2.029, | |
| "step": 2490 | |
| }, | |
| { | |
| "epoch": 0.8505571149102662, | |
| "grad_norm": 1.921875, | |
| "learning_rate": 1.2303839981057342e-05, | |
| "loss": 2.0216, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.8539593433699073, | |
| "grad_norm": 1.96875, | |
| "learning_rate": 1.2293490439796658e-05, | |
| "loss": 1.9839, | |
| "step": 2510 | |
| }, | |
| { | |
| "epoch": 0.8573615718295483, | |
| "grad_norm": 1.78125, | |
| "learning_rate": 1.2283098616967793e-05, | |
| "loss": 2.0373, | |
| "step": 2520 | |
| }, | |
| { | |
| "epoch": 0.8607638002891894, | |
| "grad_norm": 1.75, | |
| "learning_rate": 1.2272664591448208e-05, | |
| "loss": 2.075, | |
| "step": 2530 | |
| }, | |
| { | |
| "epoch": 0.8641660287488305, | |
| "grad_norm": 1.890625, | |
| "learning_rate": 1.2262188442435706e-05, | |
| "loss": 2.071, | |
| "step": 2540 | |
| }, | |
| { | |
| "epoch": 0.8675682572084715, | |
| "grad_norm": 1.7734375, | |
| "learning_rate": 1.2251670249447816e-05, | |
| "loss": 2.0474, | |
| "step": 2550 | |
| }, | |
| { | |
| "epoch": 0.8709704856681126, | |
| "grad_norm": 1.7578125, | |
| "learning_rate": 1.22411100923212e-05, | |
| "loss": 1.9866, | |
| "step": 2560 | |
| }, | |
| { | |
| "epoch": 0.8743727141277536, | |
| "grad_norm": 1.859375, | |
| "learning_rate": 1.2230508051211039e-05, | |
| "loss": 2.0365, | |
| "step": 2570 | |
| }, | |
| { | |
| "epoch": 0.8777749425873947, | |
| "grad_norm": 2.03125, | |
| "learning_rate": 1.2219864206590427e-05, | |
| "loss": 2.0041, | |
| "step": 2580 | |
| }, | |
| { | |
| "epoch": 0.8811771710470359, | |
| "grad_norm": 1.9921875, | |
| "learning_rate": 1.2209178639249763e-05, | |
| "loss": 2.0164, | |
| "step": 2590 | |
| }, | |
| { | |
| "epoch": 0.8845793995066769, | |
| "grad_norm": 1.7578125, | |
| "learning_rate": 1.2198451430296135e-05, | |
| "loss": 2.0469, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 0.887981627966318, | |
| "grad_norm": 1.921875, | |
| "learning_rate": 1.2187682661152705e-05, | |
| "loss": 1.9873, | |
| "step": 2610 | |
| }, | |
| { | |
| "epoch": 0.891383856425959, | |
| "grad_norm": 1.5078125, | |
| "learning_rate": 1.2176872413558087e-05, | |
| "loss": 2.0442, | |
| "step": 2620 | |
| }, | |
| { | |
| "epoch": 0.8947860848856001, | |
| "grad_norm": 1.6640625, | |
| "learning_rate": 1.2166020769565741e-05, | |
| "loss": 2.0356, | |
| "step": 2630 | |
| }, | |
| { | |
| "epoch": 0.8981883133452412, | |
| "grad_norm": 1.9453125, | |
| "learning_rate": 1.2155127811543326e-05, | |
| "loss": 2.0253, | |
| "step": 2640 | |
| }, | |
| { | |
| "epoch": 0.9015905418048822, | |
| "grad_norm": 1.8671875, | |
| "learning_rate": 1.2144193622172099e-05, | |
| "loss": 1.974, | |
| "step": 2650 | |
| }, | |
| { | |
| "epoch": 0.9049927702645233, | |
| "grad_norm": 1.8203125, | |
| "learning_rate": 1.2133218284446276e-05, | |
| "loss": 2.0084, | |
| "step": 2660 | |
| }, | |
| { | |
| "epoch": 0.9083949987241643, | |
| "grad_norm": 1.9609375, | |
| "learning_rate": 1.2122201881672392e-05, | |
| "loss": 2.1215, | |
| "step": 2670 | |
| }, | |
| { | |
| "epoch": 0.9117972271838054, | |
| "grad_norm": 1.9140625, | |
| "learning_rate": 1.2111144497468698e-05, | |
| "loss": 1.9749, | |
| "step": 2680 | |
| }, | |
| { | |
| "epoch": 0.9151994556434464, | |
| "grad_norm": 1.75, | |
| "learning_rate": 1.2100046215764493e-05, | |
| "loss": 1.9601, | |
| "step": 2690 | |
| }, | |
| { | |
| "epoch": 0.9186016841030875, | |
| "grad_norm": 2.03125, | |
| "learning_rate": 1.2088907120799507e-05, | |
| "loss": 1.9761, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 0.9220039125627286, | |
| "grad_norm": 1.90625, | |
| "learning_rate": 1.2077727297123258e-05, | |
| "loss": 2.0309, | |
| "step": 2710 | |
| }, | |
| { | |
| "epoch": 0.9254061410223696, | |
| "grad_norm": 1.6953125, | |
| "learning_rate": 1.2066506829594404e-05, | |
| "loss": 2.0306, | |
| "step": 2720 | |
| }, | |
| { | |
| "epoch": 0.9288083694820107, | |
| "grad_norm": 1.765625, | |
| "learning_rate": 1.2055245803380112e-05, | |
| "loss": 2.0073, | |
| "step": 2730 | |
| }, | |
| { | |
| "epoch": 0.9322105979416517, | |
| "grad_norm": 2.046875, | |
| "learning_rate": 1.2043944303955393e-05, | |
| "loss": 1.9904, | |
| "step": 2740 | |
| }, | |
| { | |
| "epoch": 0.9356128264012928, | |
| "grad_norm": 1.8984375, | |
| "learning_rate": 1.2032602417102472e-05, | |
| "loss": 2.0916, | |
| "step": 2750 | |
| }, | |
| { | |
| "epoch": 0.939015054860934, | |
| "grad_norm": 1.8828125, | |
| "learning_rate": 1.2021220228910125e-05, | |
| "loss": 1.9665, | |
| "step": 2760 | |
| }, | |
| { | |
| "epoch": 0.942417283320575, | |
| "grad_norm": 1.984375, | |
| "learning_rate": 1.2009797825773027e-05, | |
| "loss": 1.9822, | |
| "step": 2770 | |
| }, | |
| { | |
| "epoch": 0.9458195117802161, | |
| "grad_norm": 2.109375, | |
| "learning_rate": 1.1998335294391099e-05, | |
| "loss": 1.9947, | |
| "step": 2780 | |
| }, | |
| { | |
| "epoch": 0.9492217402398571, | |
| "grad_norm": 1.7578125, | |
| "learning_rate": 1.1986832721768856e-05, | |
| "loss": 1.9626, | |
| "step": 2790 | |
| }, | |
| { | |
| "epoch": 0.9526239686994982, | |
| "grad_norm": 1.8515625, | |
| "learning_rate": 1.1975290195214724e-05, | |
| "loss": 1.9772, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 0.9560261971591393, | |
| "grad_norm": 1.921875, | |
| "learning_rate": 1.1963707802340409e-05, | |
| "loss": 2.0471, | |
| "step": 2810 | |
| }, | |
| { | |
| "epoch": 0.9594284256187803, | |
| "grad_norm": 1.8984375, | |
| "learning_rate": 1.1952085631060207e-05, | |
| "loss": 1.9514, | |
| "step": 2820 | |
| }, | |
| { | |
| "epoch": 0.9628306540784214, | |
| "grad_norm": 1.9453125, | |
| "learning_rate": 1.1940423769590349e-05, | |
| "loss": 1.9974, | |
| "step": 2830 | |
| }, | |
| { | |
| "epoch": 0.9662328825380624, | |
| "grad_norm": 1.7578125, | |
| "learning_rate": 1.1928722306448326e-05, | |
| "loss": 2.0036, | |
| "step": 2840 | |
| }, | |
| { | |
| "epoch": 0.9696351109977035, | |
| "grad_norm": 1.453125, | |
| "learning_rate": 1.1916981330452221e-05, | |
| "loss": 1.9803, | |
| "step": 2850 | |
| }, | |
| { | |
| "epoch": 0.9730373394573446, | |
| "grad_norm": 1.8515625, | |
| "learning_rate": 1.1905200930720032e-05, | |
| "loss": 2.0608, | |
| "step": 2860 | |
| }, | |
| { | |
| "epoch": 0.9764395679169856, | |
| "grad_norm": 1.8984375, | |
| "learning_rate": 1.1893381196668997e-05, | |
| "loss": 1.9857, | |
| "step": 2870 | |
| }, | |
| { | |
| "epoch": 0.9798417963766267, | |
| "grad_norm": 1.6171875, | |
| "learning_rate": 1.1881522218014912e-05, | |
| "loss": 2.0197, | |
| "step": 2880 | |
| }, | |
| { | |
| "epoch": 0.9832440248362677, | |
| "grad_norm": 1.8984375, | |
| "learning_rate": 1.1869624084771457e-05, | |
| "loss": 1.9883, | |
| "step": 2890 | |
| }, | |
| { | |
| "epoch": 0.9866462532959088, | |
| "grad_norm": 1.8203125, | |
| "learning_rate": 1.185768688724951e-05, | |
| "loss": 2.0941, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 0.9900484817555499, | |
| "grad_norm": 1.7109375, | |
| "learning_rate": 1.184571071605645e-05, | |
| "loss": 1.9953, | |
| "step": 2910 | |
| }, | |
| { | |
| "epoch": 0.9934507102151909, | |
| "grad_norm": 1.7265625, | |
| "learning_rate": 1.1833695662095493e-05, | |
| "loss": 1.9833, | |
| "step": 2920 | |
| }, | |
| { | |
| "epoch": 0.996852938674832, | |
| "grad_norm": 1.9765625, | |
| "learning_rate": 1.1821641816564982e-05, | |
| "loss": 2.0431, | |
| "step": 2930 | |
| }, | |
| { | |
| "epoch": 1.000255167134473, | |
| "grad_norm": 1.71875, | |
| "learning_rate": 1.1809549270957697e-05, | |
| "loss": 1.886, | |
| "step": 2940 | |
| }, | |
| { | |
| "epoch": 1.0036573955941142, | |
| "grad_norm": 2.078125, | |
| "learning_rate": 1.1797418117060173e-05, | |
| "loss": 1.9804, | |
| "step": 2950 | |
| }, | |
| { | |
| "epoch": 1.0070596240537553, | |
| "grad_norm": 1.875, | |
| "learning_rate": 1.1785248446951988e-05, | |
| "loss": 2.0657, | |
| "step": 2960 | |
| }, | |
| { | |
| "epoch": 1.0104618525133964, | |
| "grad_norm": 1.9296875, | |
| "learning_rate": 1.1773040353005074e-05, | |
| "loss": 2.0112, | |
| "step": 2970 | |
| }, | |
| { | |
| "epoch": 1.0138640809730373, | |
| "grad_norm": 2.015625, | |
| "learning_rate": 1.1760793927883016e-05, | |
| "loss": 2.0262, | |
| "step": 2980 | |
| }, | |
| { | |
| "epoch": 1.0172663094326784, | |
| "grad_norm": 2.109375, | |
| "learning_rate": 1.174850926454034e-05, | |
| "loss": 2.0007, | |
| "step": 2990 | |
| }, | |
| { | |
| "epoch": 1.0206685378923195, | |
| "grad_norm": 2.03125, | |
| "learning_rate": 1.1736186456221816e-05, | |
| "loss": 1.9723, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 1.0240707663519606, | |
| "grad_norm": 2.0625, | |
| "learning_rate": 1.1723825596461751e-05, | |
| "loss": 1.9384, | |
| "step": 3010 | |
| }, | |
| { | |
| "epoch": 1.0274729948116017, | |
| "grad_norm": 1.96875, | |
| "learning_rate": 1.1711426779083267e-05, | |
| "loss": 1.9556, | |
| "step": 3020 | |
| }, | |
| { | |
| "epoch": 1.0308752232712426, | |
| "grad_norm": 1.828125, | |
| "learning_rate": 1.1698990098197604e-05, | |
| "loss": 1.9963, | |
| "step": 3030 | |
| }, | |
| { | |
| "epoch": 1.0342774517308837, | |
| "grad_norm": 2.09375, | |
| "learning_rate": 1.1686515648203396e-05, | |
| "loss": 1.9429, | |
| "step": 3040 | |
| }, | |
| { | |
| "epoch": 1.0376796801905248, | |
| "grad_norm": 2.203125, | |
| "learning_rate": 1.1674003523785957e-05, | |
| "loss": 1.8885, | |
| "step": 3050 | |
| }, | |
| { | |
| "epoch": 1.041081908650166, | |
| "grad_norm": 1.9765625, | |
| "learning_rate": 1.1661453819916565e-05, | |
| "loss": 1.9456, | |
| "step": 3060 | |
| }, | |
| { | |
| "epoch": 1.0444841371098068, | |
| "grad_norm": 2.015625, | |
| "learning_rate": 1.1648866631851738e-05, | |
| "loss": 1.9386, | |
| "step": 3070 | |
| }, | |
| { | |
| "epoch": 1.047886365569448, | |
| "grad_norm": 2.09375, | |
| "learning_rate": 1.1636242055132511e-05, | |
| "loss": 1.9569, | |
| "step": 3080 | |
| }, | |
| { | |
| "epoch": 1.051288594029089, | |
| "grad_norm": 1.8671875, | |
| "learning_rate": 1.1623580185583711e-05, | |
| "loss": 1.9159, | |
| "step": 3090 | |
| }, | |
| { | |
| "epoch": 1.0546908224887301, | |
| "grad_norm": 1.9296875, | |
| "learning_rate": 1.1610881119313231e-05, | |
| "loss": 1.9094, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 1.0580930509483712, | |
| "grad_norm": 2.078125, | |
| "learning_rate": 1.1598144952711302e-05, | |
| "loss": 2.0189, | |
| "step": 3110 | |
| }, | |
| { | |
| "epoch": 1.0614952794080121, | |
| "grad_norm": 1.8515625, | |
| "learning_rate": 1.1585371782449755e-05, | |
| "loss": 2.0053, | |
| "step": 3120 | |
| }, | |
| { | |
| "epoch": 1.0648975078676532, | |
| "grad_norm": 2.15625, | |
| "learning_rate": 1.1572561705481294e-05, | |
| "loss": 1.9826, | |
| "step": 3130 | |
| }, | |
| { | |
| "epoch": 1.0682997363272944, | |
| "grad_norm": 2.015625, | |
| "learning_rate": 1.1559714819038756e-05, | |
| "loss": 1.9597, | |
| "step": 3140 | |
| }, | |
| { | |
| "epoch": 1.0717019647869355, | |
| "grad_norm": 1.734375, | |
| "learning_rate": 1.1546831220634377e-05, | |
| "loss": 1.9255, | |
| "step": 3150 | |
| }, | |
| { | |
| "epoch": 1.0751041932465766, | |
| "grad_norm": 2.109375, | |
| "learning_rate": 1.1533911008059046e-05, | |
| "loss": 1.9859, | |
| "step": 3160 | |
| }, | |
| { | |
| "epoch": 1.0785064217062175, | |
| "grad_norm": 1.7578125, | |
| "learning_rate": 1.1520954279381567e-05, | |
| "loss": 1.9651, | |
| "step": 3170 | |
| }, | |
| { | |
| "epoch": 1.0819086501658586, | |
| "grad_norm": 1.9296875, | |
| "learning_rate": 1.1507961132947917e-05, | |
| "loss": 1.9321, | |
| "step": 3180 | |
| }, | |
| { | |
| "epoch": 1.0853108786254997, | |
| "grad_norm": 1.8046875, | |
| "learning_rate": 1.1494931667380492e-05, | |
| "loss": 1.9215, | |
| "step": 3190 | |
| }, | |
| { | |
| "epoch": 1.0887131070851408, | |
| "grad_norm": 1.9453125, | |
| "learning_rate": 1.1481865981577362e-05, | |
| "loss": 1.982, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 1.092115335544782, | |
| "grad_norm": 2.125, | |
| "learning_rate": 1.1468764174711526e-05, | |
| "loss": 1.9728, | |
| "step": 3210 | |
| }, | |
| { | |
| "epoch": 1.0955175640044228, | |
| "grad_norm": 2.046875, | |
| "learning_rate": 1.1455626346230147e-05, | |
| "loss": 2.0267, | |
| "step": 3220 | |
| }, | |
| { | |
| "epoch": 1.098919792464064, | |
| "grad_norm": 2.359375, | |
| "learning_rate": 1.1442452595853809e-05, | |
| "loss": 1.9484, | |
| "step": 3230 | |
| }, | |
| { | |
| "epoch": 1.102322020923705, | |
| "grad_norm": 2.0, | |
| "learning_rate": 1.1429243023575758e-05, | |
| "loss": 1.9867, | |
| "step": 3240 | |
| }, | |
| { | |
| "epoch": 1.1057242493833461, | |
| "grad_norm": 1.8046875, | |
| "learning_rate": 1.1415997729661134e-05, | |
| "loss": 1.9269, | |
| "step": 3250 | |
| }, | |
| { | |
| "epoch": 1.1091264778429872, | |
| "grad_norm": 1.953125, | |
| "learning_rate": 1.140271681464622e-05, | |
| "loss": 1.9095, | |
| "step": 3260 | |
| }, | |
| { | |
| "epoch": 1.1125287063026281, | |
| "grad_norm": 1.8515625, | |
| "learning_rate": 1.1389400379337676e-05, | |
| "loss": 2.0021, | |
| "step": 3270 | |
| }, | |
| { | |
| "epoch": 1.1159309347622692, | |
| "grad_norm": 2.046875, | |
| "learning_rate": 1.137604852481177e-05, | |
| "loss": 2.0117, | |
| "step": 3280 | |
| }, | |
| { | |
| "epoch": 1.1193331632219103, | |
| "grad_norm": 1.5546875, | |
| "learning_rate": 1.1362661352413616e-05, | |
| "loss": 1.9835, | |
| "step": 3290 | |
| }, | |
| { | |
| "epoch": 1.1227353916815515, | |
| "grad_norm": 2.1875, | |
| "learning_rate": 1.1349238963756402e-05, | |
| "loss": 1.9492, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 1.1261376201411926, | |
| "grad_norm": 2.0, | |
| "learning_rate": 1.1335781460720621e-05, | |
| "loss": 1.9394, | |
| "step": 3310 | |
| }, | |
| { | |
| "epoch": 1.1295398486008335, | |
| "grad_norm": 1.703125, | |
| "learning_rate": 1.1322288945453292e-05, | |
| "loss": 1.9442, | |
| "step": 3320 | |
| }, | |
| { | |
| "epoch": 1.1329420770604746, | |
| "grad_norm": 1.84375, | |
| "learning_rate": 1.1308761520367196e-05, | |
| "loss": 1.9256, | |
| "step": 3330 | |
| }, | |
| { | |
| "epoch": 1.1363443055201157, | |
| "grad_norm": 1.96875, | |
| "learning_rate": 1.1295199288140082e-05, | |
| "loss": 1.9861, | |
| "step": 3340 | |
| }, | |
| { | |
| "epoch": 1.1397465339797568, | |
| "grad_norm": 2.265625, | |
| "learning_rate": 1.1281602351713905e-05, | |
| "loss": 1.9598, | |
| "step": 3350 | |
| }, | |
| { | |
| "epoch": 1.143148762439398, | |
| "grad_norm": 2.09375, | |
| "learning_rate": 1.1267970814294032e-05, | |
| "loss": 1.9839, | |
| "step": 3360 | |
| }, | |
| { | |
| "epoch": 1.1465509908990388, | |
| "grad_norm": 2.125, | |
| "learning_rate": 1.1254304779348466e-05, | |
| "loss": 1.9654, | |
| "step": 3370 | |
| }, | |
| { | |
| "epoch": 1.14995321935868, | |
| "grad_norm": 1.9296875, | |
| "learning_rate": 1.1240604350607055e-05, | |
| "loss": 1.9536, | |
| "step": 3380 | |
| }, | |
| { | |
| "epoch": 1.153355447818321, | |
| "grad_norm": 1.9296875, | |
| "learning_rate": 1.122686963206071e-05, | |
| "loss": 1.9331, | |
| "step": 3390 | |
| }, | |
| { | |
| "epoch": 1.156757676277962, | |
| "grad_norm": 1.921875, | |
| "learning_rate": 1.1213100727960614e-05, | |
| "loss": 1.9218, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 1.1601599047376032, | |
| "grad_norm": 1.9765625, | |
| "learning_rate": 1.1199297742817428e-05, | |
| "loss": 1.9979, | |
| "step": 3410 | |
| }, | |
| { | |
| "epoch": 1.163562133197244, | |
| "grad_norm": 2.25, | |
| "learning_rate": 1.11854607814005e-05, | |
| "loss": 2.02, | |
| "step": 3420 | |
| }, | |
| { | |
| "epoch": 1.1669643616568852, | |
| "grad_norm": 2.09375, | |
| "learning_rate": 1.117158994873707e-05, | |
| "loss": 2.0195, | |
| "step": 3430 | |
| }, | |
| { | |
| "epoch": 1.1703665901165263, | |
| "grad_norm": 1.984375, | |
| "learning_rate": 1.1157685350111472e-05, | |
| "loss": 2.0053, | |
| "step": 3440 | |
| }, | |
| { | |
| "epoch": 1.1737688185761674, | |
| "grad_norm": 1.84375, | |
| "learning_rate": 1.1143747091064334e-05, | |
| "loss": 2.014, | |
| "step": 3450 | |
| }, | |
| { | |
| "epoch": 1.1771710470358085, | |
| "grad_norm": 2.0625, | |
| "learning_rate": 1.1129775277391782e-05, | |
| "loss": 1.9057, | |
| "step": 3460 | |
| }, | |
| { | |
| "epoch": 1.1805732754954494, | |
| "grad_norm": 2.140625, | |
| "learning_rate": 1.1115770015144628e-05, | |
| "loss": 1.9496, | |
| "step": 3470 | |
| }, | |
| { | |
| "epoch": 1.1839755039550905, | |
| "grad_norm": 1.828125, | |
| "learning_rate": 1.1101731410627574e-05, | |
| "loss": 1.9163, | |
| "step": 3480 | |
| }, | |
| { | |
| "epoch": 1.1873777324147317, | |
| "grad_norm": 1.890625, | |
| "learning_rate": 1.1087659570398397e-05, | |
| "loss": 1.9717, | |
| "step": 3490 | |
| }, | |
| { | |
| "epoch": 1.1907799608743728, | |
| "grad_norm": 2.078125, | |
| "learning_rate": 1.1073554601267147e-05, | |
| "loss": 2.0302, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 1.1941821893340139, | |
| "grad_norm": 1.796875, | |
| "learning_rate": 1.1059416610295336e-05, | |
| "loss": 1.9523, | |
| "step": 3510 | |
| }, | |
| { | |
| "epoch": 1.1975844177936548, | |
| "grad_norm": 2.015625, | |
| "learning_rate": 1.104524570479512e-05, | |
| "loss": 1.9842, | |
| "step": 3520 | |
| }, | |
| { | |
| "epoch": 1.2009866462532959, | |
| "grad_norm": 1.875, | |
| "learning_rate": 1.1031041992328483e-05, | |
| "loss": 2.0036, | |
| "step": 3530 | |
| }, | |
| { | |
| "epoch": 1.204388874712937, | |
| "grad_norm": 2.03125, | |
| "learning_rate": 1.1016805580706439e-05, | |
| "loss": 2.048, | |
| "step": 3540 | |
| }, | |
| { | |
| "epoch": 1.207791103172578, | |
| "grad_norm": 2.0625, | |
| "learning_rate": 1.1002536577988182e-05, | |
| "loss": 1.9545, | |
| "step": 3550 | |
| }, | |
| { | |
| "epoch": 1.2111933316322192, | |
| "grad_norm": 1.9921875, | |
| "learning_rate": 1.0988235092480297e-05, | |
| "loss": 1.9575, | |
| "step": 3560 | |
| }, | |
| { | |
| "epoch": 1.21459556009186, | |
| "grad_norm": 2.015625, | |
| "learning_rate": 1.0973901232735917e-05, | |
| "loss": 1.9759, | |
| "step": 3570 | |
| }, | |
| { | |
| "epoch": 1.2179977885515012, | |
| "grad_norm": 2.078125, | |
| "learning_rate": 1.0959535107553909e-05, | |
| "loss": 1.9737, | |
| "step": 3580 | |
| }, | |
| { | |
| "epoch": 1.2214000170111423, | |
| "grad_norm": 1.890625, | |
| "learning_rate": 1.0945136825978049e-05, | |
| "loss": 2.0414, | |
| "step": 3590 | |
| }, | |
| { | |
| "epoch": 1.2248022454707834, | |
| "grad_norm": 2.0625, | |
| "learning_rate": 1.0930706497296186e-05, | |
| "loss": 1.9566, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 1.2282044739304245, | |
| "grad_norm": 1.8125, | |
| "learning_rate": 1.0916244231039415e-05, | |
| "loss": 1.9614, | |
| "step": 3610 | |
| }, | |
| { | |
| "epoch": 1.2316067023900654, | |
| "grad_norm": 2.09375, | |
| "learning_rate": 1.0901750136981258e-05, | |
| "loss": 2.0045, | |
| "step": 3620 | |
| }, | |
| { | |
| "epoch": 1.2350089308497065, | |
| "grad_norm": 1.578125, | |
| "learning_rate": 1.0887224325136807e-05, | |
| "loss": 1.9703, | |
| "step": 3630 | |
| }, | |
| { | |
| "epoch": 1.2384111593093476, | |
| "grad_norm": 2.265625, | |
| "learning_rate": 1.0872666905761921e-05, | |
| "loss": 1.9609, | |
| "step": 3640 | |
| }, | |
| { | |
| "epoch": 1.2418133877689888, | |
| "grad_norm": 1.9296875, | |
| "learning_rate": 1.0858077989352354e-05, | |
| "loss": 1.9865, | |
| "step": 3650 | |
| }, | |
| { | |
| "epoch": 1.2452156162286299, | |
| "grad_norm": 1.84375, | |
| "learning_rate": 1.084345768664294e-05, | |
| "loss": 1.9276, | |
| "step": 3660 | |
| }, | |
| { | |
| "epoch": 1.2486178446882708, | |
| "grad_norm": 2.25, | |
| "learning_rate": 1.0828806108606748e-05, | |
| "loss": 1.9673, | |
| "step": 3670 | |
| }, | |
| { | |
| "epoch": 1.2520200731479119, | |
| "grad_norm": 2.15625, | |
| "learning_rate": 1.081412336645423e-05, | |
| "loss": 1.9522, | |
| "step": 3680 | |
| }, | |
| { | |
| "epoch": 1.255422301607553, | |
| "grad_norm": 1.953125, | |
| "learning_rate": 1.0799409571632395e-05, | |
| "loss": 1.8882, | |
| "step": 3690 | |
| }, | |
| { | |
| "epoch": 1.258824530067194, | |
| "grad_norm": 1.9765625, | |
| "learning_rate": 1.0784664835823945e-05, | |
| "loss": 1.9378, | |
| "step": 3700 | |
| }, | |
| { | |
| "epoch": 1.2622267585268352, | |
| "grad_norm": 1.7421875, | |
| "learning_rate": 1.076988927094643e-05, | |
| "loss": 2.0231, | |
| "step": 3710 | |
| }, | |
| { | |
| "epoch": 1.265628986986476, | |
| "grad_norm": 2.03125, | |
| "learning_rate": 1.0755082989151417e-05, | |
| "loss": 1.925, | |
| "step": 3720 | |
| }, | |
| { | |
| "epoch": 1.2690312154461172, | |
| "grad_norm": 2.15625, | |
| "learning_rate": 1.0740246102823613e-05, | |
| "loss": 1.8958, | |
| "step": 3730 | |
| }, | |
| { | |
| "epoch": 1.2724334439057583, | |
| "grad_norm": 2.015625, | |
| "learning_rate": 1.0725378724580027e-05, | |
| "loss": 1.9536, | |
| "step": 3740 | |
| }, | |
| { | |
| "epoch": 1.2758356723653994, | |
| "grad_norm": 1.953125, | |
| "learning_rate": 1.0710480967269115e-05, | |
| "loss": 1.9541, | |
| "step": 3750 | |
| }, | |
| { | |
| "epoch": 1.2792379008250405, | |
| "grad_norm": 1.734375, | |
| "learning_rate": 1.0695552943969919e-05, | |
| "loss": 1.9327, | |
| "step": 3760 | |
| }, | |
| { | |
| "epoch": 1.2826401292846814, | |
| "grad_norm": 1.9375, | |
| "learning_rate": 1.0680594767991203e-05, | |
| "loss": 1.9935, | |
| "step": 3770 | |
| }, | |
| { | |
| "epoch": 1.2860423577443225, | |
| "grad_norm": 2.078125, | |
| "learning_rate": 1.0665606552870612e-05, | |
| "loss": 1.9933, | |
| "step": 3780 | |
| }, | |
| { | |
| "epoch": 1.2894445862039636, | |
| "grad_norm": 2.125, | |
| "learning_rate": 1.0650588412373792e-05, | |
| "loss": 1.9314, | |
| "step": 3790 | |
| }, | |
| { | |
| "epoch": 1.2928468146636047, | |
| "grad_norm": 1.609375, | |
| "learning_rate": 1.0635540460493534e-05, | |
| "loss": 1.9136, | |
| "step": 3800 | |
| }, | |
| { | |
| "epoch": 1.2962490431232458, | |
| "grad_norm": 1.796875, | |
| "learning_rate": 1.0620462811448904e-05, | |
| "loss": 1.9175, | |
| "step": 3810 | |
| }, | |
| { | |
| "epoch": 1.2996512715828867, | |
| "grad_norm": 2.125, | |
| "learning_rate": 1.0605355579684382e-05, | |
| "loss": 1.9929, | |
| "step": 3820 | |
| }, | |
| { | |
| "epoch": 1.3030535000425278, | |
| "grad_norm": 2.109375, | |
| "learning_rate": 1.0590218879868998e-05, | |
| "loss": 1.9072, | |
| "step": 3830 | |
| }, | |
| { | |
| "epoch": 1.306455728502169, | |
| "grad_norm": 2.296875, | |
| "learning_rate": 1.0575052826895442e-05, | |
| "loss": 1.9315, | |
| "step": 3840 | |
| }, | |
| { | |
| "epoch": 1.30985795696181, | |
| "grad_norm": 1.78125, | |
| "learning_rate": 1.0559857535879212e-05, | |
| "loss": 2.045, | |
| "step": 3850 | |
| }, | |
| { | |
| "epoch": 1.3132601854214512, | |
| "grad_norm": 2.15625, | |
| "learning_rate": 1.0544633122157734e-05, | |
| "loss": 1.9443, | |
| "step": 3860 | |
| }, | |
| { | |
| "epoch": 1.316662413881092, | |
| "grad_norm": 1.890625, | |
| "learning_rate": 1.0529379701289476e-05, | |
| "loss": 1.9742, | |
| "step": 3870 | |
| }, | |
| { | |
| "epoch": 1.3200646423407332, | |
| "grad_norm": 1.7890625, | |
| "learning_rate": 1.051409738905309e-05, | |
| "loss": 1.9852, | |
| "step": 3880 | |
| }, | |
| { | |
| "epoch": 1.3234668708003743, | |
| "grad_norm": 2.1875, | |
| "learning_rate": 1.0498786301446519e-05, | |
| "loss": 1.997, | |
| "step": 3890 | |
| }, | |
| { | |
| "epoch": 1.3268690992600152, | |
| "grad_norm": 2.0, | |
| "learning_rate": 1.0483446554686125e-05, | |
| "loss": 1.9083, | |
| "step": 3900 | |
| }, | |
| { | |
| "epoch": 1.3302713277196565, | |
| "grad_norm": 1.8046875, | |
| "learning_rate": 1.0468078265205796e-05, | |
| "loss": 1.974, | |
| "step": 3910 | |
| }, | |
| { | |
| "epoch": 1.3336735561792974, | |
| "grad_norm": 1.875, | |
| "learning_rate": 1.0452681549656073e-05, | |
| "loss": 1.9885, | |
| "step": 3920 | |
| }, | |
| { | |
| "epoch": 1.3370757846389385, | |
| "grad_norm": 1.9609375, | |
| "learning_rate": 1.0437256524903258e-05, | |
| "loss": 1.9872, | |
| "step": 3930 | |
| }, | |
| { | |
| "epoch": 1.3404780130985796, | |
| "grad_norm": 2.046875, | |
| "learning_rate": 1.0421803308028533e-05, | |
| "loss": 1.9477, | |
| "step": 3940 | |
| }, | |
| { | |
| "epoch": 1.3438802415582205, | |
| "grad_norm": 1.9296875, | |
| "learning_rate": 1.0406322016327067e-05, | |
| "loss": 2.0032, | |
| "step": 3950 | |
| }, | |
| { | |
| "epoch": 1.3472824700178618, | |
| "grad_norm": 2.015625, | |
| "learning_rate": 1.0390812767307123e-05, | |
| "loss": 1.9942, | |
| "step": 3960 | |
| }, | |
| { | |
| "epoch": 1.3506846984775027, | |
| "grad_norm": 1.8984375, | |
| "learning_rate": 1.0375275678689174e-05, | |
| "loss": 2.0242, | |
| "step": 3970 | |
| }, | |
| { | |
| "epoch": 1.3540869269371438, | |
| "grad_norm": 1.90625, | |
| "learning_rate": 1.0359710868405e-05, | |
| "loss": 2.0306, | |
| "step": 3980 | |
| }, | |
| { | |
| "epoch": 1.357489155396785, | |
| "grad_norm": 2.140625, | |
| "learning_rate": 1.0344118454596807e-05, | |
| "loss": 1.9709, | |
| "step": 3990 | |
| }, | |
| { | |
| "epoch": 1.3608913838564258, | |
| "grad_norm": 1.9765625, | |
| "learning_rate": 1.032849855561631e-05, | |
| "loss": 1.9812, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 1.3642936123160672, | |
| "grad_norm": 2.09375, | |
| "learning_rate": 1.0312851290023851e-05, | |
| "loss": 2.0006, | |
| "step": 4010 | |
| }, | |
| { | |
| "epoch": 1.367695840775708, | |
| "grad_norm": 2.078125, | |
| "learning_rate": 1.0297176776587497e-05, | |
| "loss": 1.9679, | |
| "step": 4020 | |
| }, | |
| { | |
| "epoch": 1.3710980692353492, | |
| "grad_norm": 2.375, | |
| "learning_rate": 1.028147513428213e-05, | |
| "loss": 1.934, | |
| "step": 4030 | |
| }, | |
| { | |
| "epoch": 1.3745002976949903, | |
| "grad_norm": 2.046875, | |
| "learning_rate": 1.026574648228855e-05, | |
| "loss": 1.9867, | |
| "step": 4040 | |
| }, | |
| { | |
| "epoch": 1.3779025261546312, | |
| "grad_norm": 2.359375, | |
| "learning_rate": 1.0249990939992573e-05, | |
| "loss": 1.899, | |
| "step": 4050 | |
| }, | |
| { | |
| "epoch": 1.3813047546142723, | |
| "grad_norm": 2.15625, | |
| "learning_rate": 1.023420862698412e-05, | |
| "loss": 1.9799, | |
| "step": 4060 | |
| }, | |
| { | |
| "epoch": 1.3847069830739134, | |
| "grad_norm": 1.9609375, | |
| "learning_rate": 1.021839966305631e-05, | |
| "loss": 2.0251, | |
| "step": 4070 | |
| }, | |
| { | |
| "epoch": 1.3881092115335545, | |
| "grad_norm": 2.0625, | |
| "learning_rate": 1.0202564168204549e-05, | |
| "loss": 1.9332, | |
| "step": 4080 | |
| }, | |
| { | |
| "epoch": 1.3915114399931956, | |
| "grad_norm": 2.1875, | |
| "learning_rate": 1.0186702262625632e-05, | |
| "loss": 1.971, | |
| "step": 4090 | |
| }, | |
| { | |
| "epoch": 1.3949136684528365, | |
| "grad_norm": 2.0625, | |
| "learning_rate": 1.0170814066716807e-05, | |
| "loss": 1.9266, | |
| "step": 4100 | |
| }, | |
| { | |
| "epoch": 1.3983158969124776, | |
| "grad_norm": 1.984375, | |
| "learning_rate": 1.0154899701074883e-05, | |
| "loss": 1.9282, | |
| "step": 4110 | |
| }, | |
| { | |
| "epoch": 1.4017181253721187, | |
| "grad_norm": 2.046875, | |
| "learning_rate": 1.0138959286495303e-05, | |
| "loss": 2.0014, | |
| "step": 4120 | |
| }, | |
| { | |
| "epoch": 1.4051203538317598, | |
| "grad_norm": 2.125, | |
| "learning_rate": 1.0122992943971232e-05, | |
| "loss": 1.9463, | |
| "step": 4130 | |
| }, | |
| { | |
| "epoch": 1.408522582291401, | |
| "grad_norm": 1.875, | |
| "learning_rate": 1.0107000794692637e-05, | |
| "loss": 2.003, | |
| "step": 4140 | |
| }, | |
| { | |
| "epoch": 1.4119248107510418, | |
| "grad_norm": 2.234375, | |
| "learning_rate": 1.0090982960045363e-05, | |
| "loss": 2.0, | |
| "step": 4150 | |
| }, | |
| { | |
| "epoch": 1.415327039210683, | |
| "grad_norm": 2.203125, | |
| "learning_rate": 1.0074939561610221e-05, | |
| "loss": 1.9832, | |
| "step": 4160 | |
| }, | |
| { | |
| "epoch": 1.418729267670324, | |
| "grad_norm": 2.078125, | |
| "learning_rate": 1.005887072116206e-05, | |
| "loss": 1.8977, | |
| "step": 4170 | |
| }, | |
| { | |
| "epoch": 1.4221314961299651, | |
| "grad_norm": 1.65625, | |
| "learning_rate": 1.0042776560668832e-05, | |
| "loss": 1.9778, | |
| "step": 4180 | |
| }, | |
| { | |
| "epoch": 1.4255337245896063, | |
| "grad_norm": 1.9921875, | |
| "learning_rate": 1.0026657202290696e-05, | |
| "loss": 1.9389, | |
| "step": 4190 | |
| }, | |
| { | |
| "epoch": 1.4289359530492471, | |
| "grad_norm": 2.21875, | |
| "learning_rate": 1.0010512768379053e-05, | |
| "loss": 1.909, | |
| "step": 4200 | |
| }, | |
| { | |
| "epoch": 1.4323381815088883, | |
| "grad_norm": 2.109375, | |
| "learning_rate": 9.994343381475644e-06, | |
| "loss": 1.9563, | |
| "step": 4210 | |
| }, | |
| { | |
| "epoch": 1.4357404099685294, | |
| "grad_norm": 2.09375, | |
| "learning_rate": 9.978149164311613e-06, | |
| "loss": 1.9725, | |
| "step": 4220 | |
| }, | |
| { | |
| "epoch": 1.4391426384281705, | |
| "grad_norm": 1.71875, | |
| "learning_rate": 9.961930239806571e-06, | |
| "loss": 2.0237, | |
| "step": 4230 | |
| }, | |
| { | |
| "epoch": 1.4425448668878116, | |
| "grad_norm": 1.953125, | |
| "learning_rate": 9.945686731067668e-06, | |
| "loss": 1.9415, | |
| "step": 4240 | |
| }, | |
| { | |
| "epoch": 1.4459470953474525, | |
| "grad_norm": 2.0625, | |
| "learning_rate": 9.929418761388654e-06, | |
| "loss": 1.9221, | |
| "step": 4250 | |
| }, | |
| { | |
| "epoch": 1.4493493238070936, | |
| "grad_norm": 2.046875, | |
| "learning_rate": 9.91312645424895e-06, | |
| "loss": 1.9062, | |
| "step": 4260 | |
| }, | |
| { | |
| "epoch": 1.4527515522667347, | |
| "grad_norm": 2.40625, | |
| "learning_rate": 9.896809933312702e-06, | |
| "loss": 1.9621, | |
| "step": 4270 | |
| }, | |
| { | |
| "epoch": 1.4561537807263758, | |
| "grad_norm": 2.265625, | |
| "learning_rate": 9.88046932242785e-06, | |
| "loss": 1.9721, | |
| "step": 4280 | |
| }, | |
| { | |
| "epoch": 1.459556009186017, | |
| "grad_norm": 1.9765625, | |
| "learning_rate": 9.864104745625186e-06, | |
| "loss": 2.0143, | |
| "step": 4290 | |
| }, | |
| { | |
| "epoch": 1.4629582376456578, | |
| "grad_norm": 2.359375, | |
| "learning_rate": 9.847716327117408e-06, | |
| "loss": 1.9356, | |
| "step": 4300 | |
| }, | |
| { | |
| "epoch": 1.466360466105299, | |
| "grad_norm": 2.140625, | |
| "learning_rate": 9.831304191298181e-06, | |
| "loss": 1.9466, | |
| "step": 4310 | |
| }, | |
| { | |
| "epoch": 1.46976269456494, | |
| "grad_norm": 1.890625, | |
| "learning_rate": 9.814868462741196e-06, | |
| "loss": 1.9112, | |
| "step": 4320 | |
| }, | |
| { | |
| "epoch": 1.4731649230245811, | |
| "grad_norm": 1.953125, | |
| "learning_rate": 9.798409266199217e-06, | |
| "loss": 1.9464, | |
| "step": 4330 | |
| }, | |
| { | |
| "epoch": 1.4765671514842222, | |
| "grad_norm": 2.046875, | |
| "learning_rate": 9.781926726603141e-06, | |
| "loss": 1.9421, | |
| "step": 4340 | |
| }, | |
| { | |
| "epoch": 1.4799693799438631, | |
| "grad_norm": 2.09375, | |
| "learning_rate": 9.765420969061045e-06, | |
| "loss": 2.0682, | |
| "step": 4350 | |
| }, | |
| { | |
| "epoch": 1.4833716084035042, | |
| "grad_norm": 1.7734375, | |
| "learning_rate": 9.748892118857236e-06, | |
| "loss": 1.9912, | |
| "step": 4360 | |
| }, | |
| { | |
| "epoch": 1.4867738368631453, | |
| "grad_norm": 1.921875, | |
| "learning_rate": 9.73234030145131e-06, | |
| "loss": 1.9594, | |
| "step": 4370 | |
| }, | |
| { | |
| "epoch": 1.4901760653227865, | |
| "grad_norm": 2.34375, | |
| "learning_rate": 9.71576564247718e-06, | |
| "loss": 1.9444, | |
| "step": 4380 | |
| }, | |
| { | |
| "epoch": 1.4935782937824276, | |
| "grad_norm": 2.09375, | |
| "learning_rate": 9.699168267742144e-06, | |
| "loss": 1.9882, | |
| "step": 4390 | |
| }, | |
| { | |
| "epoch": 1.4969805222420685, | |
| "grad_norm": 1.84375, | |
| "learning_rate": 9.682548303225915e-06, | |
| "loss": 1.9076, | |
| "step": 4400 | |
| }, | |
| { | |
| "epoch": 1.5003827507017096, | |
| "grad_norm": 2.015625, | |
| "learning_rate": 9.665905875079679e-06, | |
| "loss": 1.9594, | |
| "step": 4410 | |
| }, | |
| { | |
| "epoch": 1.5037849791613507, | |
| "grad_norm": 2.03125, | |
| "learning_rate": 9.649241109625111e-06, | |
| "loss": 2.0808, | |
| "step": 4420 | |
| }, | |
| { | |
| "epoch": 1.5071872076209918, | |
| "grad_norm": 1.9375, | |
| "learning_rate": 9.632554133353453e-06, | |
| "loss": 1.9688, | |
| "step": 4430 | |
| }, | |
| { | |
| "epoch": 1.510589436080633, | |
| "grad_norm": 1.953125, | |
| "learning_rate": 9.615845072924522e-06, | |
| "loss": 1.971, | |
| "step": 4440 | |
| }, | |
| { | |
| "epoch": 1.5139916645402738, | |
| "grad_norm": 1.9609375, | |
| "learning_rate": 9.59911405516577e-06, | |
| "loss": 1.9759, | |
| "step": 4450 | |
| }, | |
| { | |
| "epoch": 1.517393892999915, | |
| "grad_norm": 2.125, | |
| "learning_rate": 9.582361207071299e-06, | |
| "loss": 1.975, | |
| "step": 4460 | |
| }, | |
| { | |
| "epoch": 1.520796121459556, | |
| "grad_norm": 1.90625, | |
| "learning_rate": 9.565586655800928e-06, | |
| "loss": 1.9975, | |
| "step": 4470 | |
| }, | |
| { | |
| "epoch": 1.5241983499191971, | |
| "grad_norm": 1.9453125, | |
| "learning_rate": 9.5487905286792e-06, | |
| "loss": 1.966, | |
| "step": 4480 | |
| }, | |
| { | |
| "epoch": 1.5276005783788382, | |
| "grad_norm": 2.078125, | |
| "learning_rate": 9.531972953194425e-06, | |
| "loss": 1.9374, | |
| "step": 4490 | |
| }, | |
| { | |
| "epoch": 1.5310028068384791, | |
| "grad_norm": 2.0625, | |
| "learning_rate": 9.51513405699772e-06, | |
| "loss": 1.9567, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 1.5344050352981202, | |
| "grad_norm": 2.359375, | |
| "learning_rate": 9.498273967902033e-06, | |
| "loss": 1.9704, | |
| "step": 4510 | |
| }, | |
| { | |
| "epoch": 1.5378072637577613, | |
| "grad_norm": 2.078125, | |
| "learning_rate": 9.481392813881164e-06, | |
| "loss": 1.9064, | |
| "step": 4520 | |
| }, | |
| { | |
| "epoch": 1.5412094922174024, | |
| "grad_norm": 2.21875, | |
| "learning_rate": 9.464490723068811e-06, | |
| "loss": 1.9553, | |
| "step": 4530 | |
| }, | |
| { | |
| "epoch": 1.5446117206770436, | |
| "grad_norm": 2.171875, | |
| "learning_rate": 9.447567823757589e-06, | |
| "loss": 1.9416, | |
| "step": 4540 | |
| }, | |
| { | |
| "epoch": 1.5480139491366844, | |
| "grad_norm": 1.859375, | |
| "learning_rate": 9.430624244398053e-06, | |
| "loss": 2.0401, | |
| "step": 4550 | |
| }, | |
| { | |
| "epoch": 1.5514161775963256, | |
| "grad_norm": 2.125, | |
| "learning_rate": 9.413660113597731e-06, | |
| "loss": 1.9495, | |
| "step": 4560 | |
| }, | |
| { | |
| "epoch": 1.5548184060559667, | |
| "grad_norm": 2.296875, | |
| "learning_rate": 9.396675560120143e-06, | |
| "loss": 2.0093, | |
| "step": 4570 | |
| }, | |
| { | |
| "epoch": 1.5582206345156078, | |
| "grad_norm": 2.203125, | |
| "learning_rate": 9.379670712883817e-06, | |
| "loss": 1.974, | |
| "step": 4580 | |
| }, | |
| { | |
| "epoch": 1.5616228629752489, | |
| "grad_norm": 1.96875, | |
| "learning_rate": 9.362645700961327e-06, | |
| "loss": 1.935, | |
| "step": 4590 | |
| }, | |
| { | |
| "epoch": 1.5650250914348898, | |
| "grad_norm": 2.171875, | |
| "learning_rate": 9.345600653578297e-06, | |
| "loss": 1.9727, | |
| "step": 4600 | |
| }, | |
| { | |
| "epoch": 1.5684273198945309, | |
| "grad_norm": 2.34375, | |
| "learning_rate": 9.328535700112433e-06, | |
| "loss": 1.9115, | |
| "step": 4610 | |
| }, | |
| { | |
| "epoch": 1.571829548354172, | |
| "grad_norm": 2.109375, | |
| "learning_rate": 9.311450970092529e-06, | |
| "loss": 1.9329, | |
| "step": 4620 | |
| }, | |
| { | |
| "epoch": 1.575231776813813, | |
| "grad_norm": 1.9609375, | |
| "learning_rate": 9.294346593197489e-06, | |
| "loss": 1.9534, | |
| "step": 4630 | |
| }, | |
| { | |
| "epoch": 1.5786340052734542, | |
| "grad_norm": 1.9609375, | |
| "learning_rate": 9.277222699255353e-06, | |
| "loss": 1.9047, | |
| "step": 4640 | |
| }, | |
| { | |
| "epoch": 1.582036233733095, | |
| "grad_norm": 1.9765625, | |
| "learning_rate": 9.260079418242293e-06, | |
| "loss": 1.9975, | |
| "step": 4650 | |
| }, | |
| { | |
| "epoch": 1.5854384621927362, | |
| "grad_norm": 2.359375, | |
| "learning_rate": 9.242916880281638e-06, | |
| "loss": 1.9347, | |
| "step": 4660 | |
| }, | |
| { | |
| "epoch": 1.5888406906523773, | |
| "grad_norm": 2.1875, | |
| "learning_rate": 9.225735215642885e-06, | |
| "loss": 1.9552, | |
| "step": 4670 | |
| }, | |
| { | |
| "epoch": 1.5922429191120182, | |
| "grad_norm": 2.109375, | |
| "learning_rate": 9.208534554740706e-06, | |
| "loss": 1.9052, | |
| "step": 4680 | |
| }, | |
| { | |
| "epoch": 1.5956451475716595, | |
| "grad_norm": 2.1875, | |
| "learning_rate": 9.191315028133966e-06, | |
| "loss": 1.9881, | |
| "step": 4690 | |
| }, | |
| { | |
| "epoch": 1.5990473760313004, | |
| "grad_norm": 2.0625, | |
| "learning_rate": 9.17407676652472e-06, | |
| "loss": 1.9671, | |
| "step": 4700 | |
| }, | |
| { | |
| "epoch": 1.6024496044909415, | |
| "grad_norm": 2.203125, | |
| "learning_rate": 9.156819900757237e-06, | |
| "loss": 1.9753, | |
| "step": 4710 | |
| }, | |
| { | |
| "epoch": 1.6058518329505826, | |
| "grad_norm": 1.9140625, | |
| "learning_rate": 9.139544561816991e-06, | |
| "loss": 1.9516, | |
| "step": 4720 | |
| }, | |
| { | |
| "epoch": 1.6092540614102235, | |
| "grad_norm": 2.234375, | |
| "learning_rate": 9.122250880829674e-06, | |
| "loss": 1.9615, | |
| "step": 4730 | |
| }, | |
| { | |
| "epoch": 1.6126562898698649, | |
| "grad_norm": 2.15625, | |
| "learning_rate": 9.104938989060205e-06, | |
| "loss": 1.9325, | |
| "step": 4740 | |
| }, | |
| { | |
| "epoch": 1.6160585183295058, | |
| "grad_norm": 1.984375, | |
| "learning_rate": 9.087609017911725e-06, | |
| "loss": 1.9227, | |
| "step": 4750 | |
| }, | |
| { | |
| "epoch": 1.6194607467891469, | |
| "grad_norm": 2.109375, | |
| "learning_rate": 9.070261098924604e-06, | |
| "loss": 1.9796, | |
| "step": 4760 | |
| }, | |
| { | |
| "epoch": 1.622862975248788, | |
| "grad_norm": 2.1875, | |
| "learning_rate": 9.052895363775442e-06, | |
| "loss": 1.977, | |
| "step": 4770 | |
| }, | |
| { | |
| "epoch": 1.6262652037084289, | |
| "grad_norm": 2.046875, | |
| "learning_rate": 9.035511944276075e-06, | |
| "loss": 1.8778, | |
| "step": 4780 | |
| }, | |
| { | |
| "epoch": 1.6296674321680702, | |
| "grad_norm": 2.546875, | |
| "learning_rate": 9.018110972372563e-06, | |
| "loss": 1.924, | |
| "step": 4790 | |
| }, | |
| { | |
| "epoch": 1.633069660627711, | |
| "grad_norm": 1.9140625, | |
| "learning_rate": 9.000692580144194e-06, | |
| "loss": 1.9173, | |
| "step": 4800 | |
| }, | |
| { | |
| "epoch": 1.6364718890873522, | |
| "grad_norm": 2.40625, | |
| "learning_rate": 8.983256899802485e-06, | |
| "loss": 1.9433, | |
| "step": 4810 | |
| }, | |
| { | |
| "epoch": 1.6398741175469933, | |
| "grad_norm": 2.09375, | |
| "learning_rate": 8.96580406369018e-06, | |
| "loss": 1.9947, | |
| "step": 4820 | |
| }, | |
| { | |
| "epoch": 1.6432763460066342, | |
| "grad_norm": 1.9921875, | |
| "learning_rate": 8.948334204280234e-06, | |
| "loss": 1.9073, | |
| "step": 4830 | |
| }, | |
| { | |
| "epoch": 1.6466785744662755, | |
| "grad_norm": 2.3125, | |
| "learning_rate": 8.930847454174817e-06, | |
| "loss": 1.9565, | |
| "step": 4840 | |
| }, | |
| { | |
| "epoch": 1.6500808029259164, | |
| "grad_norm": 2.15625, | |
| "learning_rate": 8.913343946104305e-06, | |
| "loss": 1.8945, | |
| "step": 4850 | |
| }, | |
| { | |
| "epoch": 1.6534830313855575, | |
| "grad_norm": 2.296875, | |
| "learning_rate": 8.895823812926273e-06, | |
| "loss": 1.9491, | |
| "step": 4860 | |
| }, | |
| { | |
| "epoch": 1.6568852598451986, | |
| "grad_norm": 2.203125, | |
| "learning_rate": 8.878287187624486e-06, | |
| "loss": 1.8916, | |
| "step": 4870 | |
| }, | |
| { | |
| "epoch": 1.6602874883048395, | |
| "grad_norm": 1.9296875, | |
| "learning_rate": 8.860734203307893e-06, | |
| "loss": 1.9758, | |
| "step": 4880 | |
| }, | |
| { | |
| "epoch": 1.6636897167644809, | |
| "grad_norm": 1.9453125, | |
| "learning_rate": 8.84316499320961e-06, | |
| "loss": 1.9791, | |
| "step": 4890 | |
| }, | |
| { | |
| "epoch": 1.6670919452241217, | |
| "grad_norm": 2.0, | |
| "learning_rate": 8.825579690685907e-06, | |
| "loss": 2.0407, | |
| "step": 4900 | |
| }, | |
| { | |
| "epoch": 1.6704941736837629, | |
| "grad_norm": 1.953125, | |
| "learning_rate": 8.807978429215212e-06, | |
| "loss": 2.0039, | |
| "step": 4910 | |
| }, | |
| { | |
| "epoch": 1.673896402143404, | |
| "grad_norm": 2.203125, | |
| "learning_rate": 8.79036134239708e-06, | |
| "loss": 2.0093, | |
| "step": 4920 | |
| }, | |
| { | |
| "epoch": 1.6772986306030448, | |
| "grad_norm": 2.265625, | |
| "learning_rate": 8.772728563951189e-06, | |
| "loss": 1.8997, | |
| "step": 4930 | |
| }, | |
| { | |
| "epoch": 1.6807008590626862, | |
| "grad_norm": 2.140625, | |
| "learning_rate": 8.755080227716316e-06, | |
| "loss": 1.908, | |
| "step": 4940 | |
| }, | |
| { | |
| "epoch": 1.684103087522327, | |
| "grad_norm": 1.8515625, | |
| "learning_rate": 8.737416467649337e-06, | |
| "loss": 1.9478, | |
| "step": 4950 | |
| }, | |
| { | |
| "epoch": 1.6875053159819682, | |
| "grad_norm": 2.203125, | |
| "learning_rate": 8.71973741782419e-06, | |
| "loss": 1.9497, | |
| "step": 4960 | |
| }, | |
| { | |
| "epoch": 1.6909075444416093, | |
| "grad_norm": 1.8125, | |
| "learning_rate": 8.70204321243087e-06, | |
| "loss": 1.9035, | |
| "step": 4970 | |
| }, | |
| { | |
| "epoch": 1.6943097729012502, | |
| "grad_norm": 2.171875, | |
| "learning_rate": 8.684333985774413e-06, | |
| "loss": 1.9666, | |
| "step": 4980 | |
| }, | |
| { | |
| "epoch": 1.6977120013608915, | |
| "grad_norm": 2.484375, | |
| "learning_rate": 8.666609872273867e-06, | |
| "loss": 1.9943, | |
| "step": 4990 | |
| }, | |
| { | |
| "epoch": 1.7011142298205324, | |
| "grad_norm": 2.234375, | |
| "learning_rate": 8.648871006461278e-06, | |
| "loss": 1.9293, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 1.7045164582801735, | |
| "grad_norm": 2.140625, | |
| "learning_rate": 8.631117522980663e-06, | |
| "loss": 1.9369, | |
| "step": 5010 | |
| }, | |
| { | |
| "epoch": 1.7079186867398146, | |
| "grad_norm": 2.046875, | |
| "learning_rate": 8.613349556587001e-06, | |
| "loss": 1.9117, | |
| "step": 5020 | |
| }, | |
| { | |
| "epoch": 1.7113209151994555, | |
| "grad_norm": 2.078125, | |
| "learning_rate": 8.59556724214519e-06, | |
| "loss": 1.9757, | |
| "step": 5030 | |
| }, | |
| { | |
| "epoch": 1.7147231436590968, | |
| "grad_norm": 2.328125, | |
| "learning_rate": 8.577770714629042e-06, | |
| "loss": 1.9838, | |
| "step": 5040 | |
| }, | |
| { | |
| "epoch": 1.7181253721187377, | |
| "grad_norm": 2.328125, | |
| "learning_rate": 8.559960109120251e-06, | |
| "loss": 1.9571, | |
| "step": 5050 | |
| }, | |
| { | |
| "epoch": 1.7215276005783788, | |
| "grad_norm": 2.140625, | |
| "learning_rate": 8.542135560807365e-06, | |
| "loss": 1.9588, | |
| "step": 5060 | |
| }, | |
| { | |
| "epoch": 1.72492982903802, | |
| "grad_norm": 2.15625, | |
| "learning_rate": 8.524297204984759e-06, | |
| "loss": 1.9542, | |
| "step": 5070 | |
| }, | |
| { | |
| "epoch": 1.7283320574976608, | |
| "grad_norm": 1.9765625, | |
| "learning_rate": 8.506445177051624e-06, | |
| "loss": 1.9691, | |
| "step": 5080 | |
| }, | |
| { | |
| "epoch": 1.7317342859573022, | |
| "grad_norm": 1.953125, | |
| "learning_rate": 8.488579612510915e-06, | |
| "loss": 1.9141, | |
| "step": 5090 | |
| }, | |
| { | |
| "epoch": 1.735136514416943, | |
| "grad_norm": 2.0, | |
| "learning_rate": 8.470700646968339e-06, | |
| "loss": 2.0129, | |
| "step": 5100 | |
| }, | |
| { | |
| "epoch": 1.7385387428765842, | |
| "grad_norm": 2.171875, | |
| "learning_rate": 8.452808416131319e-06, | |
| "loss": 1.9424, | |
| "step": 5110 | |
| }, | |
| { | |
| "epoch": 1.7419409713362253, | |
| "grad_norm": 1.8984375, | |
| "learning_rate": 8.434903055807971e-06, | |
| "loss": 1.9041, | |
| "step": 5120 | |
| }, | |
| { | |
| "epoch": 1.7453431997958662, | |
| "grad_norm": 1.859375, | |
| "learning_rate": 8.416984701906065e-06, | |
| "loss": 1.9514, | |
| "step": 5130 | |
| }, | |
| { | |
| "epoch": 1.7487454282555075, | |
| "grad_norm": 1.7421875, | |
| "learning_rate": 8.399053490431994e-06, | |
| "loss": 1.9846, | |
| "step": 5140 | |
| }, | |
| { | |
| "epoch": 1.7521476567151484, | |
| "grad_norm": 2.03125, | |
| "learning_rate": 8.38110955748975e-06, | |
| "loss": 1.9242, | |
| "step": 5150 | |
| }, | |
| { | |
| "epoch": 1.7555498851747895, | |
| "grad_norm": 2.015625, | |
| "learning_rate": 8.363153039279882e-06, | |
| "loss": 1.9853, | |
| "step": 5160 | |
| }, | |
| { | |
| "epoch": 1.7589521136344306, | |
| "grad_norm": 2.15625, | |
| "learning_rate": 8.345184072098464e-06, | |
| "loss": 2.0005, | |
| "step": 5170 | |
| }, | |
| { | |
| "epoch": 1.7623543420940715, | |
| "grad_norm": 2.171875, | |
| "learning_rate": 8.327202792336068e-06, | |
| "loss": 2.0181, | |
| "step": 5180 | |
| }, | |
| { | |
| "epoch": 1.7657565705537128, | |
| "grad_norm": 2.234375, | |
| "learning_rate": 8.309209336476713e-06, | |
| "loss": 1.9119, | |
| "step": 5190 | |
| }, | |
| { | |
| "epoch": 1.7691587990133537, | |
| "grad_norm": 2.328125, | |
| "learning_rate": 8.29120384109685e-06, | |
| "loss": 1.9061, | |
| "step": 5200 | |
| }, | |
| { | |
| "epoch": 1.7725610274729948, | |
| "grad_norm": 2.046875, | |
| "learning_rate": 8.273186442864303e-06, | |
| "loss": 1.9584, | |
| "step": 5210 | |
| }, | |
| { | |
| "epoch": 1.775963255932636, | |
| "grad_norm": 2.1875, | |
| "learning_rate": 8.25515727853725e-06, | |
| "loss": 1.9456, | |
| "step": 5220 | |
| }, | |
| { | |
| "epoch": 1.7793654843922768, | |
| "grad_norm": 2.109375, | |
| "learning_rate": 8.23711648496318e-06, | |
| "loss": 1.9162, | |
| "step": 5230 | |
| }, | |
| { | |
| "epoch": 1.7827677128519182, | |
| "grad_norm": 2.3125, | |
| "learning_rate": 8.219064199077837e-06, | |
| "loss": 1.9735, | |
| "step": 5240 | |
| }, | |
| { | |
| "epoch": 1.786169941311559, | |
| "grad_norm": 2.296875, | |
| "learning_rate": 8.201000557904217e-06, | |
| "loss": 1.9512, | |
| "step": 5250 | |
| }, | |
| { | |
| "epoch": 1.7895721697712001, | |
| "grad_norm": 2.046875, | |
| "learning_rate": 8.182925698551491e-06, | |
| "loss": 1.9886, | |
| "step": 5260 | |
| }, | |
| { | |
| "epoch": 1.7929743982308413, | |
| "grad_norm": 2.390625, | |
| "learning_rate": 8.164839758213986e-06, | |
| "loss": 1.9956, | |
| "step": 5270 | |
| }, | |
| { | |
| "epoch": 1.7963766266904821, | |
| "grad_norm": 2.28125, | |
| "learning_rate": 8.14674287417013e-06, | |
| "loss": 1.9076, | |
| "step": 5280 | |
| }, | |
| { | |
| "epoch": 1.7997788551501235, | |
| "grad_norm": 1.84375, | |
| "learning_rate": 8.128635183781433e-06, | |
| "loss": 1.912, | |
| "step": 5290 | |
| }, | |
| { | |
| "epoch": 1.8031810836097644, | |
| "grad_norm": 2.21875, | |
| "learning_rate": 8.11051682449141e-06, | |
| "loss": 1.9582, | |
| "step": 5300 | |
| }, | |
| { | |
| "epoch": 1.8065833120694055, | |
| "grad_norm": 2.296875, | |
| "learning_rate": 8.092387933824571e-06, | |
| "loss": 1.979, | |
| "step": 5310 | |
| }, | |
| { | |
| "epoch": 1.8099855405290466, | |
| "grad_norm": 2.46875, | |
| "learning_rate": 8.074248649385357e-06, | |
| "loss": 1.9679, | |
| "step": 5320 | |
| }, | |
| { | |
| "epoch": 1.8133877689886875, | |
| "grad_norm": 2.21875, | |
| "learning_rate": 8.056099108857101e-06, | |
| "loss": 1.9288, | |
| "step": 5330 | |
| }, | |
| { | |
| "epoch": 1.8167899974483288, | |
| "grad_norm": 2.296875, | |
| "learning_rate": 8.037939450000985e-06, | |
| "loss": 1.922, | |
| "step": 5340 | |
| }, | |
| { | |
| "epoch": 1.8201922259079697, | |
| "grad_norm": 2.1875, | |
| "learning_rate": 8.019769810654989e-06, | |
| "loss": 1.9022, | |
| "step": 5350 | |
| }, | |
| { | |
| "epoch": 1.8235944543676108, | |
| "grad_norm": 2.0, | |
| "learning_rate": 8.00159032873285e-06, | |
| "loss": 1.9698, | |
| "step": 5360 | |
| }, | |
| { | |
| "epoch": 1.826996682827252, | |
| "grad_norm": 2.171875, | |
| "learning_rate": 7.98340114222302e-06, | |
| "loss": 1.9087, | |
| "step": 5370 | |
| }, | |
| { | |
| "epoch": 1.8303989112868928, | |
| "grad_norm": 2.140625, | |
| "learning_rate": 7.9652023891876e-06, | |
| "loss": 1.9785, | |
| "step": 5380 | |
| }, | |
| { | |
| "epoch": 1.8338011397465341, | |
| "grad_norm": 2.015625, | |
| "learning_rate": 7.946994207761316e-06, | |
| "loss": 1.9983, | |
| "step": 5390 | |
| }, | |
| { | |
| "epoch": 1.837203368206175, | |
| "grad_norm": 2.328125, | |
| "learning_rate": 7.928776736150451e-06, | |
| "loss": 2.0148, | |
| "step": 5400 | |
| }, | |
| { | |
| "epoch": 1.8406055966658161, | |
| "grad_norm": 2.109375, | |
| "learning_rate": 7.910550112631802e-06, | |
| "loss": 1.9808, | |
| "step": 5410 | |
| }, | |
| { | |
| "epoch": 1.8440078251254572, | |
| "grad_norm": 2.15625, | |
| "learning_rate": 7.892314475551641e-06, | |
| "loss": 1.9153, | |
| "step": 5420 | |
| }, | |
| { | |
| "epoch": 1.8474100535850981, | |
| "grad_norm": 2.109375, | |
| "learning_rate": 7.87406996332465e-06, | |
| "loss": 1.9285, | |
| "step": 5430 | |
| }, | |
| { | |
| "epoch": 1.8508122820447395, | |
| "grad_norm": 2.34375, | |
| "learning_rate": 7.855816714432878e-06, | |
| "loss": 1.952, | |
| "step": 5440 | |
| }, | |
| { | |
| "epoch": 1.8542145105043804, | |
| "grad_norm": 2.203125, | |
| "learning_rate": 7.837554867424685e-06, | |
| "loss": 1.9335, | |
| "step": 5450 | |
| }, | |
| { | |
| "epoch": 1.8576167389640215, | |
| "grad_norm": 2.34375, | |
| "learning_rate": 7.8192845609137e-06, | |
| "loss": 1.943, | |
| "step": 5460 | |
| }, | |
| { | |
| "epoch": 1.8610189674236626, | |
| "grad_norm": 2.203125, | |
| "learning_rate": 7.801005933577753e-06, | |
| "loss": 2.0204, | |
| "step": 5470 | |
| }, | |
| { | |
| "epoch": 1.8644211958833035, | |
| "grad_norm": 2.1875, | |
| "learning_rate": 7.782719124157842e-06, | |
| "loss": 1.915, | |
| "step": 5480 | |
| }, | |
| { | |
| "epoch": 1.8678234243429448, | |
| "grad_norm": 2.21875, | |
| "learning_rate": 7.764424271457067e-06, | |
| "loss": 1.9207, | |
| "step": 5490 | |
| }, | |
| { | |
| "epoch": 1.8712256528025857, | |
| "grad_norm": 2.015625, | |
| "learning_rate": 7.746121514339576e-06, | |
| "loss": 1.9593, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 1.8746278812622268, | |
| "grad_norm": 1.828125, | |
| "learning_rate": 7.727810991729512e-06, | |
| "loss": 1.904, | |
| "step": 5510 | |
| }, | |
| { | |
| "epoch": 1.878030109721868, | |
| "grad_norm": 1.9140625, | |
| "learning_rate": 7.709492842609971e-06, | |
| "loss": 1.9757, | |
| "step": 5520 | |
| }, | |
| { | |
| "epoch": 1.8814323381815088, | |
| "grad_norm": 1.9140625, | |
| "learning_rate": 7.691167206021928e-06, | |
| "loss": 1.938, | |
| "step": 5530 | |
| }, | |
| { | |
| "epoch": 1.88483456664115, | |
| "grad_norm": 2.484375, | |
| "learning_rate": 7.67283422106319e-06, | |
| "loss": 1.956, | |
| "step": 5540 | |
| }, | |
| { | |
| "epoch": 1.888236795100791, | |
| "grad_norm": 1.7578125, | |
| "learning_rate": 7.654494026887346e-06, | |
| "loss": 1.9298, | |
| "step": 5550 | |
| }, | |
| { | |
| "epoch": 1.8916390235604321, | |
| "grad_norm": 1.890625, | |
| "learning_rate": 7.636146762702703e-06, | |
| "loss": 1.8893, | |
| "step": 5560 | |
| }, | |
| { | |
| "epoch": 1.8950412520200732, | |
| "grad_norm": 2.15625, | |
| "learning_rate": 7.617792567771233e-06, | |
| "loss": 1.9309, | |
| "step": 5570 | |
| }, | |
| { | |
| "epoch": 1.8984434804797141, | |
| "grad_norm": 2.578125, | |
| "learning_rate": 7.59943158140751e-06, | |
| "loss": 1.9064, | |
| "step": 5580 | |
| }, | |
| { | |
| "epoch": 1.9018457089393552, | |
| "grad_norm": 2.203125, | |
| "learning_rate": 7.581063942977662e-06, | |
| "loss": 1.9647, | |
| "step": 5590 | |
| }, | |
| { | |
| "epoch": 1.9052479373989963, | |
| "grad_norm": 2.1875, | |
| "learning_rate": 7.56268979189831e-06, | |
| "loss": 1.9417, | |
| "step": 5600 | |
| }, | |
| { | |
| "epoch": 1.9086501658586374, | |
| "grad_norm": 2.421875, | |
| "learning_rate": 7.544309267635502e-06, | |
| "loss": 1.96, | |
| "step": 5610 | |
| }, | |
| { | |
| "epoch": 1.9120523943182786, | |
| "grad_norm": 2.25, | |
| "learning_rate": 7.525922509703665e-06, | |
| "loss": 1.9672, | |
| "step": 5620 | |
| }, | |
| { | |
| "epoch": 1.9154546227779194, | |
| "grad_norm": 2.1875, | |
| "learning_rate": 7.507529657664538e-06, | |
| "loss": 1.9975, | |
| "step": 5630 | |
| }, | |
| { | |
| "epoch": 1.9188568512375606, | |
| "grad_norm": 2.078125, | |
| "learning_rate": 7.489130851126123e-06, | |
| "loss": 1.9763, | |
| "step": 5640 | |
| }, | |
| { | |
| "epoch": 1.9222590796972017, | |
| "grad_norm": 2.171875, | |
| "learning_rate": 7.470726229741613e-06, | |
| "loss": 1.9206, | |
| "step": 5650 | |
| }, | |
| { | |
| "epoch": 1.9256613081568428, | |
| "grad_norm": 2.484375, | |
| "learning_rate": 7.45231593320834e-06, | |
| "loss": 2.0314, | |
| "step": 5660 | |
| }, | |
| { | |
| "epoch": 1.9290635366164839, | |
| "grad_norm": 2.109375, | |
| "learning_rate": 7.433900101266712e-06, | |
| "loss": 1.9449, | |
| "step": 5670 | |
| }, | |
| { | |
| "epoch": 1.9324657650761248, | |
| "grad_norm": 2.0, | |
| "learning_rate": 7.415478873699151e-06, | |
| "loss": 1.9294, | |
| "step": 5680 | |
| }, | |
| { | |
| "epoch": 1.9358679935357659, | |
| "grad_norm": 1.8828125, | |
| "learning_rate": 7.3970523903290335e-06, | |
| "loss": 1.8888, | |
| "step": 5690 | |
| }, | |
| { | |
| "epoch": 1.939270221995407, | |
| "grad_norm": 2.25, | |
| "learning_rate": 7.378620791019634e-06, | |
| "loss": 1.9365, | |
| "step": 5700 | |
| }, | |
| { | |
| "epoch": 1.942672450455048, | |
| "grad_norm": 1.8828125, | |
| "learning_rate": 7.360184215673055e-06, | |
| "loss": 1.9441, | |
| "step": 5710 | |
| }, | |
| { | |
| "epoch": 1.9460746789146892, | |
| "grad_norm": 2.28125, | |
| "learning_rate": 7.341742804229166e-06, | |
| "loss": 1.9156, | |
| "step": 5720 | |
| }, | |
| { | |
| "epoch": 1.94947690737433, | |
| "grad_norm": 2.375, | |
| "learning_rate": 7.32329669666455e-06, | |
| "loss": 1.9051, | |
| "step": 5730 | |
| }, | |
| { | |
| "epoch": 1.9528791358339712, | |
| "grad_norm": 2.109375, | |
| "learning_rate": 7.304846032991432e-06, | |
| "loss": 2.0019, | |
| "step": 5740 | |
| }, | |
| { | |
| "epoch": 1.9562813642936123, | |
| "grad_norm": 1.875, | |
| "learning_rate": 7.2863909532566196e-06, | |
| "loss": 1.8679, | |
| "step": 5750 | |
| }, | |
| { | |
| "epoch": 1.9596835927532534, | |
| "grad_norm": 2.234375, | |
| "learning_rate": 7.2679315975404405e-06, | |
| "loss": 1.9605, | |
| "step": 5760 | |
| }, | |
| { | |
| "epoch": 1.9630858212128945, | |
| "grad_norm": 1.9375, | |
| "learning_rate": 7.249468105955679e-06, | |
| "loss": 1.9355, | |
| "step": 5770 | |
| }, | |
| { | |
| "epoch": 1.9664880496725354, | |
| "grad_norm": 2.0, | |
| "learning_rate": 7.231000618646511e-06, | |
| "loss": 1.8908, | |
| "step": 5780 | |
| }, | |
| { | |
| "epoch": 1.9698902781321765, | |
| "grad_norm": 2.203125, | |
| "learning_rate": 7.212529275787436e-06, | |
| "loss": 1.9578, | |
| "step": 5790 | |
| }, | |
| { | |
| "epoch": 1.9732925065918177, | |
| "grad_norm": 2.265625, | |
| "learning_rate": 7.194054217582234e-06, | |
| "loss": 1.9287, | |
| "step": 5800 | |
| }, | |
| { | |
| "epoch": 1.9766947350514585, | |
| "grad_norm": 2.375, | |
| "learning_rate": 7.17557558426287e-06, | |
| "loss": 1.9672, | |
| "step": 5810 | |
| }, | |
| { | |
| "epoch": 1.9800969635110999, | |
| "grad_norm": 2.0, | |
| "learning_rate": 7.157093516088451e-06, | |
| "loss": 1.9581, | |
| "step": 5820 | |
| }, | |
| { | |
| "epoch": 1.9834991919707408, | |
| "grad_norm": 2.015625, | |
| "learning_rate": 7.138608153344156e-06, | |
| "loss": 1.9872, | |
| "step": 5830 | |
| }, | |
| { | |
| "epoch": 1.9869014204303819, | |
| "grad_norm": 1.921875, | |
| "learning_rate": 7.120119636340172e-06, | |
| "loss": 1.9525, | |
| "step": 5840 | |
| }, | |
| { | |
| "epoch": 1.990303648890023, | |
| "grad_norm": 1.890625, | |
| "learning_rate": 7.101628105410625e-06, | |
| "loss": 1.9093, | |
| "step": 5850 | |
| }, | |
| { | |
| "epoch": 1.9937058773496639, | |
| "grad_norm": 2.234375, | |
| "learning_rate": 7.0831337009125195e-06, | |
| "loss": 1.9706, | |
| "step": 5860 | |
| }, | |
| { | |
| "epoch": 1.9971081058093052, | |
| "grad_norm": 2.3125, | |
| "learning_rate": 7.064636563224674e-06, | |
| "loss": 1.9331, | |
| "step": 5870 | |
| }, | |
| { | |
| "epoch": 2.000510334268946, | |
| "grad_norm": 2.203125, | |
| "learning_rate": 7.046136832746647e-06, | |
| "loss": 1.9434, | |
| "step": 5880 | |
| }, | |
| { | |
| "epoch": 2.0039125627285874, | |
| "grad_norm": 2.265625, | |
| "learning_rate": 7.027634649897679e-06, | |
| "loss": 1.8678, | |
| "step": 5890 | |
| }, | |
| { | |
| "epoch": 2.0073147911882283, | |
| "grad_norm": 2.421875, | |
| "learning_rate": 7.009130155115627e-06, | |
| "loss": 1.9193, | |
| "step": 5900 | |
| }, | |
| { | |
| "epoch": 2.010717019647869, | |
| "grad_norm": 2.125, | |
| "learning_rate": 6.990623488855899e-06, | |
| "loss": 1.9459, | |
| "step": 5910 | |
| }, | |
| { | |
| "epoch": 2.0141192481075105, | |
| "grad_norm": 2.46875, | |
| "learning_rate": 6.972114791590378e-06, | |
| "loss": 1.9229, | |
| "step": 5920 | |
| }, | |
| { | |
| "epoch": 2.0175214765671514, | |
| "grad_norm": 2.03125, | |
| "learning_rate": 6.953604203806366e-06, | |
| "loss": 1.9008, | |
| "step": 5930 | |
| }, | |
| { | |
| "epoch": 2.0209237050267927, | |
| "grad_norm": 2.5625, | |
| "learning_rate": 6.935091866005518e-06, | |
| "loss": 1.9513, | |
| "step": 5940 | |
| }, | |
| { | |
| "epoch": 2.0243259334864336, | |
| "grad_norm": 2.125, | |
| "learning_rate": 6.9165779187027685e-06, | |
| "loss": 1.9013, | |
| "step": 5950 | |
| }, | |
| { | |
| "epoch": 2.0277281619460745, | |
| "grad_norm": 2.25, | |
| "learning_rate": 6.898062502425267e-06, | |
| "loss": 1.914, | |
| "step": 5960 | |
| }, | |
| { | |
| "epoch": 2.031130390405716, | |
| "grad_norm": 2.015625, | |
| "learning_rate": 6.87954575771132e-06, | |
| "loss": 1.8773, | |
| "step": 5970 | |
| }, | |
| { | |
| "epoch": 2.0345326188653567, | |
| "grad_norm": 2.234375, | |
| "learning_rate": 6.861027825109312e-06, | |
| "loss": 1.9337, | |
| "step": 5980 | |
| }, | |
| { | |
| "epoch": 2.037934847324998, | |
| "grad_norm": 2.234375, | |
| "learning_rate": 6.842508845176642e-06, | |
| "loss": 1.9866, | |
| "step": 5990 | |
| }, | |
| { | |
| "epoch": 2.041337075784639, | |
| "grad_norm": 1.9921875, | |
| "learning_rate": 6.8239889584786644e-06, | |
| "loss": 1.9557, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 2.04473930424428, | |
| "grad_norm": 2.0, | |
| "learning_rate": 6.805468305587612e-06, | |
| "loss": 1.9082, | |
| "step": 6010 | |
| }, | |
| { | |
| "epoch": 2.048141532703921, | |
| "grad_norm": 2.234375, | |
| "learning_rate": 6.786947027081537e-06, | |
| "loss": 1.8822, | |
| "step": 6020 | |
| }, | |
| { | |
| "epoch": 2.051543761163562, | |
| "grad_norm": 2.296875, | |
| "learning_rate": 6.768425263543234e-06, | |
| "loss": 1.9611, | |
| "step": 6030 | |
| }, | |
| { | |
| "epoch": 2.0549459896232034, | |
| "grad_norm": 2.171875, | |
| "learning_rate": 6.7499031555591875e-06, | |
| "loss": 1.9623, | |
| "step": 6040 | |
| }, | |
| { | |
| "epoch": 2.0583482180828443, | |
| "grad_norm": 2.328125, | |
| "learning_rate": 6.7313808437184895e-06, | |
| "loss": 1.9902, | |
| "step": 6050 | |
| }, | |
| { | |
| "epoch": 2.061750446542485, | |
| "grad_norm": 2.21875, | |
| "learning_rate": 6.71285846861178e-06, | |
| "loss": 1.9358, | |
| "step": 6060 | |
| }, | |
| { | |
| "epoch": 2.0651526750021265, | |
| "grad_norm": 2.40625, | |
| "learning_rate": 6.694336170830184e-06, | |
| "loss": 1.8377, | |
| "step": 6070 | |
| }, | |
| { | |
| "epoch": 2.0685549034617674, | |
| "grad_norm": 2.359375, | |
| "learning_rate": 6.675814090964238e-06, | |
| "loss": 1.9771, | |
| "step": 6080 | |
| }, | |
| { | |
| "epoch": 2.0719571319214087, | |
| "grad_norm": 2.0625, | |
| "learning_rate": 6.6572923696028185e-06, | |
| "loss": 1.8634, | |
| "step": 6090 | |
| }, | |
| { | |
| "epoch": 2.0753593603810496, | |
| "grad_norm": 2.609375, | |
| "learning_rate": 6.638771147332086e-06, | |
| "loss": 1.9388, | |
| "step": 6100 | |
| }, | |
| { | |
| "epoch": 2.0787615888406905, | |
| "grad_norm": 2.203125, | |
| "learning_rate": 6.62025056473442e-06, | |
| "loss": 1.918, | |
| "step": 6110 | |
| }, | |
| { | |
| "epoch": 2.082163817300332, | |
| "grad_norm": 2.234375, | |
| "learning_rate": 6.601730762387327e-06, | |
| "loss": 1.9617, | |
| "step": 6120 | |
| }, | |
| { | |
| "epoch": 2.0855660457599727, | |
| "grad_norm": 2.234375, | |
| "learning_rate": 6.583211880862406e-06, | |
| "loss": 1.9056, | |
| "step": 6130 | |
| }, | |
| { | |
| "epoch": 2.0889682742196136, | |
| "grad_norm": 2.15625, | |
| "learning_rate": 6.56469406072426e-06, | |
| "loss": 1.9458, | |
| "step": 6140 | |
| }, | |
| { | |
| "epoch": 2.092370502679255, | |
| "grad_norm": 2.109375, | |
| "learning_rate": 6.546177442529437e-06, | |
| "loss": 1.9393, | |
| "step": 6150 | |
| }, | |
| { | |
| "epoch": 2.095772731138896, | |
| "grad_norm": 2.140625, | |
| "learning_rate": 6.5276621668253645e-06, | |
| "loss": 1.9038, | |
| "step": 6160 | |
| }, | |
| { | |
| "epoch": 2.099174959598537, | |
| "grad_norm": 2.265625, | |
| "learning_rate": 6.509148374149276e-06, | |
| "loss": 1.9621, | |
| "step": 6170 | |
| }, | |
| { | |
| "epoch": 2.102577188058178, | |
| "grad_norm": 2.015625, | |
| "learning_rate": 6.490636205027152e-06, | |
| "loss": 1.9206, | |
| "step": 6180 | |
| }, | |
| { | |
| "epoch": 2.105979416517819, | |
| "grad_norm": 2.515625, | |
| "learning_rate": 6.472125799972643e-06, | |
| "loss": 1.9409, | |
| "step": 6190 | |
| }, | |
| { | |
| "epoch": 2.1093816449774603, | |
| "grad_norm": 2.53125, | |
| "learning_rate": 6.453617299486017e-06, | |
| "loss": 1.9348, | |
| "step": 6200 | |
| }, | |
| { | |
| "epoch": 2.112783873437101, | |
| "grad_norm": 2.109375, | |
| "learning_rate": 6.435110844053086e-06, | |
| "loss": 1.9364, | |
| "step": 6210 | |
| }, | |
| { | |
| "epoch": 2.1161861018967425, | |
| "grad_norm": 2.46875, | |
| "learning_rate": 6.416606574144131e-06, | |
| "loss": 1.9042, | |
| "step": 6220 | |
| }, | |
| { | |
| "epoch": 2.1195883303563834, | |
| "grad_norm": 2.34375, | |
| "learning_rate": 6.398104630212853e-06, | |
| "loss": 1.9547, | |
| "step": 6230 | |
| }, | |
| { | |
| "epoch": 2.1229905588160243, | |
| "grad_norm": 2.4375, | |
| "learning_rate": 6.379605152695294e-06, | |
| "loss": 1.9768, | |
| "step": 6240 | |
| }, | |
| { | |
| "epoch": 2.1263927872756656, | |
| "grad_norm": 2.125, | |
| "learning_rate": 6.361108282008776e-06, | |
| "loss": 1.9522, | |
| "step": 6250 | |
| }, | |
| { | |
| "epoch": 2.1297950157353065, | |
| "grad_norm": 1.8359375, | |
| "learning_rate": 6.342614158550832e-06, | |
| "loss": 1.9168, | |
| "step": 6260 | |
| }, | |
| { | |
| "epoch": 2.133197244194948, | |
| "grad_norm": 2.3125, | |
| "learning_rate": 6.324122922698143e-06, | |
| "loss": 1.9871, | |
| "step": 6270 | |
| }, | |
| { | |
| "epoch": 2.1365994726545887, | |
| "grad_norm": 2.28125, | |
| "learning_rate": 6.305634714805481e-06, | |
| "loss": 1.9398, | |
| "step": 6280 | |
| }, | |
| { | |
| "epoch": 2.1400017011142296, | |
| "grad_norm": 1.921875, | |
| "learning_rate": 6.287149675204619e-06, | |
| "loss": 1.9629, | |
| "step": 6290 | |
| }, | |
| { | |
| "epoch": 2.143403929573871, | |
| "grad_norm": 2.421875, | |
| "learning_rate": 6.268667944203294e-06, | |
| "loss": 1.9102, | |
| "step": 6300 | |
| }, | |
| { | |
| "epoch": 2.146806158033512, | |
| "grad_norm": 2.28125, | |
| "learning_rate": 6.2501896620841255e-06, | |
| "loss": 1.8596, | |
| "step": 6310 | |
| }, | |
| { | |
| "epoch": 2.150208386493153, | |
| "grad_norm": 2.265625, | |
| "learning_rate": 6.231714969103553e-06, | |
| "loss": 1.7886, | |
| "step": 6320 | |
| }, | |
| { | |
| "epoch": 2.153610614952794, | |
| "grad_norm": 2.3125, | |
| "learning_rate": 6.213244005490776e-06, | |
| "loss": 1.9695, | |
| "step": 6330 | |
| }, | |
| { | |
| "epoch": 2.157012843412435, | |
| "grad_norm": 2.09375, | |
| "learning_rate": 6.194776911446687e-06, | |
| "loss": 1.971, | |
| "step": 6340 | |
| }, | |
| { | |
| "epoch": 2.1604150718720763, | |
| "grad_norm": 2.375, | |
| "learning_rate": 6.176313827142807e-06, | |
| "loss": 1.9136, | |
| "step": 6350 | |
| }, | |
| { | |
| "epoch": 2.163817300331717, | |
| "grad_norm": 2.25, | |
| "learning_rate": 6.157854892720216e-06, | |
| "loss": 1.9184, | |
| "step": 6360 | |
| }, | |
| { | |
| "epoch": 2.1672195287913585, | |
| "grad_norm": 2.09375, | |
| "learning_rate": 6.139400248288503e-06, | |
| "loss": 1.9933, | |
| "step": 6370 | |
| }, | |
| { | |
| "epoch": 2.1706217572509994, | |
| "grad_norm": 1.8984375, | |
| "learning_rate": 6.120950033924691e-06, | |
| "loss": 1.9114, | |
| "step": 6380 | |
| }, | |
| { | |
| "epoch": 2.1740239857106403, | |
| "grad_norm": 2.078125, | |
| "learning_rate": 6.102504389672177e-06, | |
| "loss": 1.9974, | |
| "step": 6390 | |
| }, | |
| { | |
| "epoch": 2.1774262141702816, | |
| "grad_norm": 1.9140625, | |
| "learning_rate": 6.084063455539671e-06, | |
| "loss": 1.8925, | |
| "step": 6400 | |
| }, | |
| { | |
| "epoch": 2.1808284426299225, | |
| "grad_norm": 2.40625, | |
| "learning_rate": 6.065627371500128e-06, | |
| "loss": 1.9208, | |
| "step": 6410 | |
| }, | |
| { | |
| "epoch": 2.184230671089564, | |
| "grad_norm": 2.609375, | |
| "learning_rate": 6.0471962774896946e-06, | |
| "loss": 1.8757, | |
| "step": 6420 | |
| }, | |
| { | |
| "epoch": 2.1876328995492047, | |
| "grad_norm": 1.8203125, | |
| "learning_rate": 6.0287703134066385e-06, | |
| "loss": 1.905, | |
| "step": 6430 | |
| }, | |
| { | |
| "epoch": 2.1910351280088456, | |
| "grad_norm": 2.46875, | |
| "learning_rate": 6.010349619110283e-06, | |
| "loss": 1.8878, | |
| "step": 6440 | |
| }, | |
| { | |
| "epoch": 2.194437356468487, | |
| "grad_norm": 2.15625, | |
| "learning_rate": 5.991934334419968e-06, | |
| "loss": 1.9549, | |
| "step": 6450 | |
| }, | |
| { | |
| "epoch": 2.197839584928128, | |
| "grad_norm": 2.125, | |
| "learning_rate": 5.973524599113954e-06, | |
| "loss": 1.9137, | |
| "step": 6460 | |
| }, | |
| { | |
| "epoch": 2.201241813387769, | |
| "grad_norm": 2.453125, | |
| "learning_rate": 5.9551205529283955e-06, | |
| "loss": 1.9856, | |
| "step": 6470 | |
| }, | |
| { | |
| "epoch": 2.20464404184741, | |
| "grad_norm": 2.09375, | |
| "learning_rate": 5.936722335556252e-06, | |
| "loss": 1.9262, | |
| "step": 6480 | |
| }, | |
| { | |
| "epoch": 2.208046270307051, | |
| "grad_norm": 1.9609375, | |
| "learning_rate": 5.91833008664625e-06, | |
| "loss": 1.9596, | |
| "step": 6490 | |
| }, | |
| { | |
| "epoch": 2.2114484987666922, | |
| "grad_norm": 2.28125, | |
| "learning_rate": 5.89994394580181e-06, | |
| "loss": 1.907, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 2.214850727226333, | |
| "grad_norm": 2.125, | |
| "learning_rate": 5.881564052579987e-06, | |
| "loss": 1.938, | |
| "step": 6510 | |
| }, | |
| { | |
| "epoch": 2.2182529556859745, | |
| "grad_norm": 2.1875, | |
| "learning_rate": 5.863190546490422e-06, | |
| "loss": 1.9615, | |
| "step": 6520 | |
| }, | |
| { | |
| "epoch": 2.2216551841456154, | |
| "grad_norm": 2.078125, | |
| "learning_rate": 5.844823566994264e-06, | |
| "loss": 1.9353, | |
| "step": 6530 | |
| }, | |
| { | |
| "epoch": 2.2250574126052562, | |
| "grad_norm": 2.75, | |
| "learning_rate": 5.826463253503132e-06, | |
| "loss": 1.98, | |
| "step": 6540 | |
| }, | |
| { | |
| "epoch": 2.2284596410648976, | |
| "grad_norm": 2.25, | |
| "learning_rate": 5.808109745378048e-06, | |
| "loss": 1.8649, | |
| "step": 6550 | |
| }, | |
| { | |
| "epoch": 2.2318618695245385, | |
| "grad_norm": 2.265625, | |
| "learning_rate": 5.789763181928373e-06, | |
| "loss": 1.9079, | |
| "step": 6560 | |
| }, | |
| { | |
| "epoch": 2.23526409798418, | |
| "grad_norm": 2.421875, | |
| "learning_rate": 5.771423702410762e-06, | |
| "loss": 1.9156, | |
| "step": 6570 | |
| }, | |
| { | |
| "epoch": 2.2386663264438207, | |
| "grad_norm": 2.0, | |
| "learning_rate": 5.753091446028094e-06, | |
| "loss": 1.9416, | |
| "step": 6580 | |
| }, | |
| { | |
| "epoch": 2.2420685549034616, | |
| "grad_norm": 2.265625, | |
| "learning_rate": 5.734766551928427e-06, | |
| "loss": 1.8595, | |
| "step": 6590 | |
| }, | |
| { | |
| "epoch": 2.245470783363103, | |
| "grad_norm": 2.3125, | |
| "learning_rate": 5.716449159203939e-06, | |
| "loss": 1.9292, | |
| "step": 6600 | |
| }, | |
| { | |
| "epoch": 2.248873011822744, | |
| "grad_norm": 2.15625, | |
| "learning_rate": 5.698139406889855e-06, | |
| "loss": 1.9578, | |
| "step": 6610 | |
| }, | |
| { | |
| "epoch": 2.252275240282385, | |
| "grad_norm": 2.203125, | |
| "learning_rate": 5.679837433963432e-06, | |
| "loss": 1.9706, | |
| "step": 6620 | |
| }, | |
| { | |
| "epoch": 2.255677468742026, | |
| "grad_norm": 2.359375, | |
| "learning_rate": 5.661543379342855e-06, | |
| "loss": 1.9641, | |
| "step": 6630 | |
| }, | |
| { | |
| "epoch": 2.259079697201667, | |
| "grad_norm": 2.328125, | |
| "learning_rate": 5.643257381886218e-06, | |
| "loss": 1.9505, | |
| "step": 6640 | |
| }, | |
| { | |
| "epoch": 2.2624819256613082, | |
| "grad_norm": 2.046875, | |
| "learning_rate": 5.624979580390459e-06, | |
| "loss": 1.9631, | |
| "step": 6650 | |
| }, | |
| { | |
| "epoch": 2.265884154120949, | |
| "grad_norm": 2.375, | |
| "learning_rate": 5.6067101135902996e-06, | |
| "loss": 1.9767, | |
| "step": 6660 | |
| }, | |
| { | |
| "epoch": 2.2692863825805905, | |
| "grad_norm": 1.8515625, | |
| "learning_rate": 5.588449120157205e-06, | |
| "loss": 1.9077, | |
| "step": 6670 | |
| }, | |
| { | |
| "epoch": 2.2726886110402313, | |
| "grad_norm": 2.3125, | |
| "learning_rate": 5.57019673869832e-06, | |
| "loss": 1.9133, | |
| "step": 6680 | |
| }, | |
| { | |
| "epoch": 2.2760908394998722, | |
| "grad_norm": 2.265625, | |
| "learning_rate": 5.5519531077554244e-06, | |
| "loss": 1.8405, | |
| "step": 6690 | |
| }, | |
| { | |
| "epoch": 2.2794930679595136, | |
| "grad_norm": 2.375, | |
| "learning_rate": 5.533718365803875e-06, | |
| "loss": 1.8948, | |
| "step": 6700 | |
| }, | |
| { | |
| "epoch": 2.2828952964191545, | |
| "grad_norm": 2.265625, | |
| "learning_rate": 5.51549265125156e-06, | |
| "loss": 1.9344, | |
| "step": 6710 | |
| }, | |
| { | |
| "epoch": 2.286297524878796, | |
| "grad_norm": 2.015625, | |
| "learning_rate": 5.4972761024378514e-06, | |
| "loss": 1.842, | |
| "step": 6720 | |
| }, | |
| { | |
| "epoch": 2.2896997533384367, | |
| "grad_norm": 2.28125, | |
| "learning_rate": 5.479068857632542e-06, | |
| "loss": 1.9172, | |
| "step": 6730 | |
| }, | |
| { | |
| "epoch": 2.2931019817980776, | |
| "grad_norm": 2.171875, | |
| "learning_rate": 5.46087105503481e-06, | |
| "loss": 1.9252, | |
| "step": 6740 | |
| }, | |
| { | |
| "epoch": 2.296504210257719, | |
| "grad_norm": 2.21875, | |
| "learning_rate": 5.4426828327721594e-06, | |
| "loss": 1.9356, | |
| "step": 6750 | |
| }, | |
| { | |
| "epoch": 2.29990643871736, | |
| "grad_norm": 2.3125, | |
| "learning_rate": 5.4245043288993795e-06, | |
| "loss": 1.9462, | |
| "step": 6760 | |
| }, | |
| { | |
| "epoch": 2.303308667177001, | |
| "grad_norm": 2.375, | |
| "learning_rate": 5.406335681397498e-06, | |
| "loss": 1.9788, | |
| "step": 6770 | |
| }, | |
| { | |
| "epoch": 2.306710895636642, | |
| "grad_norm": 2.578125, | |
| "learning_rate": 5.388177028172714e-06, | |
| "loss": 1.9221, | |
| "step": 6780 | |
| }, | |
| { | |
| "epoch": 2.310113124096283, | |
| "grad_norm": 1.9609375, | |
| "learning_rate": 5.370028507055387e-06, | |
| "loss": 1.9344, | |
| "step": 6790 | |
| }, | |
| { | |
| "epoch": 2.313515352555924, | |
| "grad_norm": 2.140625, | |
| "learning_rate": 5.351890255798953e-06, | |
| "loss": 1.871, | |
| "step": 6800 | |
| }, | |
| { | |
| "epoch": 2.316917581015565, | |
| "grad_norm": 1.984375, | |
| "learning_rate": 5.333762412078907e-06, | |
| "loss": 1.975, | |
| "step": 6810 | |
| }, | |
| { | |
| "epoch": 2.3203198094752064, | |
| "grad_norm": 2.21875, | |
| "learning_rate": 5.315645113491743e-06, | |
| "loss": 1.9103, | |
| "step": 6820 | |
| }, | |
| { | |
| "epoch": 2.3237220379348473, | |
| "grad_norm": 2.203125, | |
| "learning_rate": 5.2975384975539145e-06, | |
| "loss": 1.9036, | |
| "step": 6830 | |
| }, | |
| { | |
| "epoch": 2.327124266394488, | |
| "grad_norm": 2.140625, | |
| "learning_rate": 5.279442701700792e-06, | |
| "loss": 1.9292, | |
| "step": 6840 | |
| }, | |
| { | |
| "epoch": 2.3305264948541295, | |
| "grad_norm": 2.34375, | |
| "learning_rate": 5.261357863285613e-06, | |
| "loss": 1.9181, | |
| "step": 6850 | |
| }, | |
| { | |
| "epoch": 2.3339287233137704, | |
| "grad_norm": 2.359375, | |
| "learning_rate": 5.243284119578448e-06, | |
| "loss": 1.8917, | |
| "step": 6860 | |
| }, | |
| { | |
| "epoch": 2.3373309517734118, | |
| "grad_norm": 2.484375, | |
| "learning_rate": 5.225221607765159e-06, | |
| "loss": 1.9389, | |
| "step": 6870 | |
| }, | |
| { | |
| "epoch": 2.3407331802330527, | |
| "grad_norm": 2.6875, | |
| "learning_rate": 5.207170464946342e-06, | |
| "loss": 1.9298, | |
| "step": 6880 | |
| }, | |
| { | |
| "epoch": 2.3441354086926935, | |
| "grad_norm": 2.078125, | |
| "learning_rate": 5.189130828136312e-06, | |
| "loss": 1.9011, | |
| "step": 6890 | |
| }, | |
| { | |
| "epoch": 2.347537637152335, | |
| "grad_norm": 2.40625, | |
| "learning_rate": 5.1711028342620375e-06, | |
| "loss": 1.908, | |
| "step": 6900 | |
| }, | |
| { | |
| "epoch": 2.3509398656119758, | |
| "grad_norm": 2.65625, | |
| "learning_rate": 5.153086620162123e-06, | |
| "loss": 1.8829, | |
| "step": 6910 | |
| }, | |
| { | |
| "epoch": 2.354342094071617, | |
| "grad_norm": 2.25, | |
| "learning_rate": 5.135082322585758e-06, | |
| "loss": 1.9441, | |
| "step": 6920 | |
| }, | |
| { | |
| "epoch": 2.357744322531258, | |
| "grad_norm": 2.4375, | |
| "learning_rate": 5.117090078191676e-06, | |
| "loss": 1.9403, | |
| "step": 6930 | |
| }, | |
| { | |
| "epoch": 2.361146550990899, | |
| "grad_norm": 2.46875, | |
| "learning_rate": 5.09911002354713e-06, | |
| "loss": 1.9478, | |
| "step": 6940 | |
| }, | |
| { | |
| "epoch": 2.36454877945054, | |
| "grad_norm": 2.0625, | |
| "learning_rate": 5.081142295126842e-06, | |
| "loss": 1.8916, | |
| "step": 6950 | |
| }, | |
| { | |
| "epoch": 2.367951007910181, | |
| "grad_norm": 2.4375, | |
| "learning_rate": 5.063187029311983e-06, | |
| "loss": 1.9323, | |
| "step": 6960 | |
| }, | |
| { | |
| "epoch": 2.3713532363698224, | |
| "grad_norm": 1.9375, | |
| "learning_rate": 5.045244362389115e-06, | |
| "loss": 1.9571, | |
| "step": 6970 | |
| }, | |
| { | |
| "epoch": 2.3747554648294633, | |
| "grad_norm": 1.8359375, | |
| "learning_rate": 5.027314430549185e-06, | |
| "loss": 1.9486, | |
| "step": 6980 | |
| }, | |
| { | |
| "epoch": 2.378157693289104, | |
| "grad_norm": 2.4375, | |
| "learning_rate": 5.009397369886466e-06, | |
| "loss": 1.944, | |
| "step": 6990 | |
| }, | |
| { | |
| "epoch": 2.3815599217487455, | |
| "grad_norm": 2.390625, | |
| "learning_rate": 4.991493316397536e-06, | |
| "loss": 1.9539, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 2.3849621502083864, | |
| "grad_norm": 2.21875, | |
| "learning_rate": 4.973602405980251e-06, | |
| "loss": 1.8877, | |
| "step": 7010 | |
| }, | |
| { | |
| "epoch": 2.3883643786680278, | |
| "grad_norm": 2.1875, | |
| "learning_rate": 4.955724774432697e-06, | |
| "loss": 1.9579, | |
| "step": 7020 | |
| }, | |
| { | |
| "epoch": 2.3917666071276686, | |
| "grad_norm": 2.4375, | |
| "learning_rate": 4.937860557452174e-06, | |
| "loss": 1.9066, | |
| "step": 7030 | |
| }, | |
| { | |
| "epoch": 2.3951688355873095, | |
| "grad_norm": 2.328125, | |
| "learning_rate": 4.920009890634164e-06, | |
| "loss": 1.9488, | |
| "step": 7040 | |
| }, | |
| { | |
| "epoch": 2.398571064046951, | |
| "grad_norm": 2.203125, | |
| "learning_rate": 4.902172909471289e-06, | |
| "loss": 1.9939, | |
| "step": 7050 | |
| }, | |
| { | |
| "epoch": 2.4019732925065918, | |
| "grad_norm": 2.390625, | |
| "learning_rate": 4.884349749352304e-06, | |
| "loss": 1.9718, | |
| "step": 7060 | |
| }, | |
| { | |
| "epoch": 2.405375520966233, | |
| "grad_norm": 2.53125, | |
| "learning_rate": 4.866540545561045e-06, | |
| "loss": 1.9198, | |
| "step": 7070 | |
| }, | |
| { | |
| "epoch": 2.408777749425874, | |
| "grad_norm": 2.421875, | |
| "learning_rate": 4.848745433275427e-06, | |
| "loss": 1.8993, | |
| "step": 7080 | |
| }, | |
| { | |
| "epoch": 2.412179977885515, | |
| "grad_norm": 2.65625, | |
| "learning_rate": 4.830964547566399e-06, | |
| "loss": 1.9977, | |
| "step": 7090 | |
| }, | |
| { | |
| "epoch": 2.415582206345156, | |
| "grad_norm": 2.265625, | |
| "learning_rate": 4.813198023396925e-06, | |
| "loss": 1.911, | |
| "step": 7100 | |
| }, | |
| { | |
| "epoch": 2.418984434804797, | |
| "grad_norm": 2.25, | |
| "learning_rate": 4.795445995620965e-06, | |
| "loss": 1.977, | |
| "step": 7110 | |
| }, | |
| { | |
| "epoch": 2.4223866632644384, | |
| "grad_norm": 2.203125, | |
| "learning_rate": 4.777708598982436e-06, | |
| "loss": 1.9065, | |
| "step": 7120 | |
| }, | |
| { | |
| "epoch": 2.4257888917240793, | |
| "grad_norm": 2.28125, | |
| "learning_rate": 4.759985968114213e-06, | |
| "loss": 1.9569, | |
| "step": 7130 | |
| }, | |
| { | |
| "epoch": 2.42919112018372, | |
| "grad_norm": 2.59375, | |
| "learning_rate": 4.742278237537088e-06, | |
| "loss": 1.9151, | |
| "step": 7140 | |
| }, | |
| { | |
| "epoch": 2.4325933486433615, | |
| "grad_norm": 1.90625, | |
| "learning_rate": 4.72458554165875e-06, | |
| "loss": 1.984, | |
| "step": 7150 | |
| }, | |
| { | |
| "epoch": 2.4359955771030024, | |
| "grad_norm": 1.9453125, | |
| "learning_rate": 4.706908014772776e-06, | |
| "loss": 1.9921, | |
| "step": 7160 | |
| }, | |
| { | |
| "epoch": 2.4393978055626437, | |
| "grad_norm": 2.515625, | |
| "learning_rate": 4.689245791057602e-06, | |
| "loss": 1.9753, | |
| "step": 7170 | |
| }, | |
| { | |
| "epoch": 2.4428000340222846, | |
| "grad_norm": 1.9765625, | |
| "learning_rate": 4.671599004575511e-06, | |
| "loss": 1.9305, | |
| "step": 7180 | |
| }, | |
| { | |
| "epoch": 2.4462022624819255, | |
| "grad_norm": 2.34375, | |
| "learning_rate": 4.653967789271607e-06, | |
| "loss": 1.8709, | |
| "step": 7190 | |
| }, | |
| { | |
| "epoch": 2.449604490941567, | |
| "grad_norm": 2.359375, | |
| "learning_rate": 4.636352278972806e-06, | |
| "loss": 1.9123, | |
| "step": 7200 | |
| }, | |
| { | |
| "epoch": 2.4530067194012077, | |
| "grad_norm": 2.046875, | |
| "learning_rate": 4.618752607386824e-06, | |
| "loss": 1.8976, | |
| "step": 7210 | |
| }, | |
| { | |
| "epoch": 2.456408947860849, | |
| "grad_norm": 2.375, | |
| "learning_rate": 4.601168908101142e-06, | |
| "loss": 2.0117, | |
| "step": 7220 | |
| }, | |
| { | |
| "epoch": 2.45981117632049, | |
| "grad_norm": 2.25, | |
| "learning_rate": 4.5836013145820175e-06, | |
| "loss": 1.8844, | |
| "step": 7230 | |
| }, | |
| { | |
| "epoch": 2.463213404780131, | |
| "grad_norm": 2.40625, | |
| "learning_rate": 4.5660499601734545e-06, | |
| "loss": 1.9541, | |
| "step": 7240 | |
| }, | |
| { | |
| "epoch": 2.466615633239772, | |
| "grad_norm": 2.375, | |
| "learning_rate": 4.548514978096198e-06, | |
| "loss": 1.9029, | |
| "step": 7250 | |
| }, | |
| { | |
| "epoch": 2.470017861699413, | |
| "grad_norm": 2.34375, | |
| "learning_rate": 4.5309965014467246e-06, | |
| "loss": 1.9122, | |
| "step": 7260 | |
| }, | |
| { | |
| "epoch": 2.4734200901590544, | |
| "grad_norm": 2.125, | |
| "learning_rate": 4.513494663196221e-06, | |
| "loss": 1.8935, | |
| "step": 7270 | |
| }, | |
| { | |
| "epoch": 2.4768223186186953, | |
| "grad_norm": 2.546875, | |
| "learning_rate": 4.496009596189593e-06, | |
| "loss": 1.9198, | |
| "step": 7280 | |
| }, | |
| { | |
| "epoch": 2.480224547078336, | |
| "grad_norm": 2.71875, | |
| "learning_rate": 4.478541433144435e-06, | |
| "loss": 1.8702, | |
| "step": 7290 | |
| }, | |
| { | |
| "epoch": 2.4836267755379775, | |
| "grad_norm": 2.171875, | |
| "learning_rate": 4.461090306650046e-06, | |
| "loss": 1.9336, | |
| "step": 7300 | |
| }, | |
| { | |
| "epoch": 2.4870290039976184, | |
| "grad_norm": 2.40625, | |
| "learning_rate": 4.443656349166409e-06, | |
| "loss": 1.9156, | |
| "step": 7310 | |
| }, | |
| { | |
| "epoch": 2.4904312324572597, | |
| "grad_norm": 2.078125, | |
| "learning_rate": 4.426239693023181e-06, | |
| "loss": 1.949, | |
| "step": 7320 | |
| }, | |
| { | |
| "epoch": 2.4938334609169006, | |
| "grad_norm": 2.34375, | |
| "learning_rate": 4.408840470418706e-06, | |
| "loss": 1.9331, | |
| "step": 7330 | |
| }, | |
| { | |
| "epoch": 2.4972356893765415, | |
| "grad_norm": 2.046875, | |
| "learning_rate": 4.391458813418992e-06, | |
| "loss": 1.9376, | |
| "step": 7340 | |
| }, | |
| { | |
| "epoch": 2.500637917836183, | |
| "grad_norm": 2.171875, | |
| "learning_rate": 4.374094853956726e-06, | |
| "loss": 1.8894, | |
| "step": 7350 | |
| }, | |
| { | |
| "epoch": 2.5040401462958237, | |
| "grad_norm": 2.40625, | |
| "learning_rate": 4.3567487238302625e-06, | |
| "loss": 2.0008, | |
| "step": 7360 | |
| }, | |
| { | |
| "epoch": 2.507442374755465, | |
| "grad_norm": 2.5, | |
| "learning_rate": 4.3394205547026224e-06, | |
| "loss": 1.8901, | |
| "step": 7370 | |
| }, | |
| { | |
| "epoch": 2.510844603215106, | |
| "grad_norm": 2.25, | |
| "learning_rate": 4.322110478100502e-06, | |
| "loss": 1.9533, | |
| "step": 7380 | |
| }, | |
| { | |
| "epoch": 2.514246831674747, | |
| "grad_norm": 2.171875, | |
| "learning_rate": 4.3048186254132606e-06, | |
| "loss": 1.9216, | |
| "step": 7390 | |
| }, | |
| { | |
| "epoch": 2.517649060134388, | |
| "grad_norm": 2.453125, | |
| "learning_rate": 4.287545127891939e-06, | |
| "loss": 1.9397, | |
| "step": 7400 | |
| }, | |
| { | |
| "epoch": 2.521051288594029, | |
| "grad_norm": 2.1875, | |
| "learning_rate": 4.270290116648254e-06, | |
| "loss": 1.9161, | |
| "step": 7410 | |
| }, | |
| { | |
| "epoch": 2.5244535170536704, | |
| "grad_norm": 2.484375, | |
| "learning_rate": 4.2530537226536075e-06, | |
| "loss": 1.8427, | |
| "step": 7420 | |
| }, | |
| { | |
| "epoch": 2.5278557455133113, | |
| "grad_norm": 2.84375, | |
| "learning_rate": 4.235836076738085e-06, | |
| "loss": 1.917, | |
| "step": 7430 | |
| }, | |
| { | |
| "epoch": 2.531257973972952, | |
| "grad_norm": 2.453125, | |
| "learning_rate": 4.218637309589471e-06, | |
| "loss": 1.8681, | |
| "step": 7440 | |
| }, | |
| { | |
| "epoch": 2.5346602024325935, | |
| "grad_norm": 2.171875, | |
| "learning_rate": 4.201457551752256e-06, | |
| "loss": 1.9049, | |
| "step": 7450 | |
| }, | |
| { | |
| "epoch": 2.5380624308922344, | |
| "grad_norm": 2.1875, | |
| "learning_rate": 4.184296933626636e-06, | |
| "loss": 1.9001, | |
| "step": 7460 | |
| }, | |
| { | |
| "epoch": 2.5414646593518757, | |
| "grad_norm": 2.46875, | |
| "learning_rate": 4.167155585467538e-06, | |
| "loss": 1.895, | |
| "step": 7470 | |
| }, | |
| { | |
| "epoch": 2.5448668878115166, | |
| "grad_norm": 1.890625, | |
| "learning_rate": 4.150033637383623e-06, | |
| "loss": 1.9132, | |
| "step": 7480 | |
| }, | |
| { | |
| "epoch": 2.5482691162711575, | |
| "grad_norm": 2.296875, | |
| "learning_rate": 4.132931219336289e-06, | |
| "loss": 1.9031, | |
| "step": 7490 | |
| }, | |
| { | |
| "epoch": 2.551671344730799, | |
| "grad_norm": 2.15625, | |
| "learning_rate": 4.115848461138707e-06, | |
| "loss": 1.8727, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 2.5550735731904397, | |
| "grad_norm": 2.5, | |
| "learning_rate": 4.0987854924548134e-06, | |
| "loss": 1.8808, | |
| "step": 7510 | |
| }, | |
| { | |
| "epoch": 2.558475801650081, | |
| "grad_norm": 2.5, | |
| "learning_rate": 4.081742442798342e-06, | |
| "loss": 1.9265, | |
| "step": 7520 | |
| }, | |
| { | |
| "epoch": 2.561878030109722, | |
| "grad_norm": 2.390625, | |
| "learning_rate": 4.064719441531834e-06, | |
| "loss": 1.9463, | |
| "step": 7530 | |
| }, | |
| { | |
| "epoch": 2.565280258569363, | |
| "grad_norm": 2.6875, | |
| "learning_rate": 4.04771661786565e-06, | |
| "loss": 1.9341, | |
| "step": 7540 | |
| }, | |
| { | |
| "epoch": 2.568682487029004, | |
| "grad_norm": 1.9296875, | |
| "learning_rate": 4.030734100857004e-06, | |
| "loss": 1.9036, | |
| "step": 7550 | |
| }, | |
| { | |
| "epoch": 2.572084715488645, | |
| "grad_norm": 2.21875, | |
| "learning_rate": 4.013772019408969e-06, | |
| "loss": 1.9604, | |
| "step": 7560 | |
| }, | |
| { | |
| "epoch": 2.5754869439482864, | |
| "grad_norm": 2.171875, | |
| "learning_rate": 3.9968305022695076e-06, | |
| "loss": 1.8938, | |
| "step": 7570 | |
| }, | |
| { | |
| "epoch": 2.5788891724079273, | |
| "grad_norm": 2.0625, | |
| "learning_rate": 3.979909678030498e-06, | |
| "loss": 1.976, | |
| "step": 7580 | |
| }, | |
| { | |
| "epoch": 2.582291400867568, | |
| "grad_norm": 2.609375, | |
| "learning_rate": 3.9630096751267395e-06, | |
| "loss": 1.9534, | |
| "step": 7590 | |
| }, | |
| { | |
| "epoch": 2.5856936293272095, | |
| "grad_norm": 2.1875, | |
| "learning_rate": 3.946130621835003e-06, | |
| "loss": 1.9374, | |
| "step": 7600 | |
| }, | |
| { | |
| "epoch": 2.5890958577868504, | |
| "grad_norm": 2.359375, | |
| "learning_rate": 3.929272646273037e-06, | |
| "loss": 1.9044, | |
| "step": 7610 | |
| }, | |
| { | |
| "epoch": 2.5924980862464917, | |
| "grad_norm": 2.265625, | |
| "learning_rate": 3.9124358763986045e-06, | |
| "loss": 1.9723, | |
| "step": 7620 | |
| }, | |
| { | |
| "epoch": 2.5959003147061326, | |
| "grad_norm": 2.578125, | |
| "learning_rate": 3.895620440008517e-06, | |
| "loss": 1.8593, | |
| "step": 7630 | |
| }, | |
| { | |
| "epoch": 2.5993025431657735, | |
| "grad_norm": 2.5, | |
| "learning_rate": 3.878826464737643e-06, | |
| "loss": 1.9203, | |
| "step": 7640 | |
| }, | |
| { | |
| "epoch": 2.602704771625415, | |
| "grad_norm": 2.5625, | |
| "learning_rate": 3.862054078057968e-06, | |
| "loss": 1.9127, | |
| "step": 7650 | |
| }, | |
| { | |
| "epoch": 2.6061070000850557, | |
| "grad_norm": 2.421875, | |
| "learning_rate": 3.845303407277605e-06, | |
| "loss": 1.8969, | |
| "step": 7660 | |
| }, | |
| { | |
| "epoch": 2.609509228544697, | |
| "grad_norm": 2.078125, | |
| "learning_rate": 3.828574579539842e-06, | |
| "loss": 1.957, | |
| "step": 7670 | |
| }, | |
| { | |
| "epoch": 2.612911457004338, | |
| "grad_norm": 2.046875, | |
| "learning_rate": 3.811867721822161e-06, | |
| "loss": 1.9497, | |
| "step": 7680 | |
| }, | |
| { | |
| "epoch": 2.616313685463979, | |
| "grad_norm": 2.484375, | |
| "learning_rate": 3.7951829609352926e-06, | |
| "loss": 1.9144, | |
| "step": 7690 | |
| }, | |
| { | |
| "epoch": 2.61971591392362, | |
| "grad_norm": 2.640625, | |
| "learning_rate": 3.778520423522247e-06, | |
| "loss": 1.9252, | |
| "step": 7700 | |
| }, | |
| { | |
| "epoch": 2.623118142383261, | |
| "grad_norm": 2.390625, | |
| "learning_rate": 3.7618802360573384e-06, | |
| "loss": 1.9192, | |
| "step": 7710 | |
| }, | |
| { | |
| "epoch": 2.6265203708429024, | |
| "grad_norm": 2.0, | |
| "learning_rate": 3.7452625248452478e-06, | |
| "loss": 1.887, | |
| "step": 7720 | |
| }, | |
| { | |
| "epoch": 2.6299225993025432, | |
| "grad_norm": 2.390625, | |
| "learning_rate": 3.728667416020052e-06, | |
| "loss": 1.9326, | |
| "step": 7730 | |
| }, | |
| { | |
| "epoch": 2.633324827762184, | |
| "grad_norm": 2.484375, | |
| "learning_rate": 3.7120950355442677e-06, | |
| "loss": 1.9739, | |
| "step": 7740 | |
| }, | |
| { | |
| "epoch": 2.6367270562218255, | |
| "grad_norm": 2.1875, | |
| "learning_rate": 3.6955455092078956e-06, | |
| "loss": 1.9417, | |
| "step": 7750 | |
| }, | |
| { | |
| "epoch": 2.6401292846814663, | |
| "grad_norm": 2.078125, | |
| "learning_rate": 3.679018962627461e-06, | |
| "loss": 1.9288, | |
| "step": 7760 | |
| }, | |
| { | |
| "epoch": 2.6435315131411077, | |
| "grad_norm": 2.0625, | |
| "learning_rate": 3.6625155212450754e-06, | |
| "loss": 1.9062, | |
| "step": 7770 | |
| }, | |
| { | |
| "epoch": 2.6469337416007486, | |
| "grad_norm": 2.625, | |
| "learning_rate": 3.6460353103274615e-06, | |
| "loss": 1.9304, | |
| "step": 7780 | |
| }, | |
| { | |
| "epoch": 2.6503359700603895, | |
| "grad_norm": 2.109375, | |
| "learning_rate": 3.6295784549650233e-06, | |
| "loss": 1.9378, | |
| "step": 7790 | |
| }, | |
| { | |
| "epoch": 2.6537381985200303, | |
| "grad_norm": 2.234375, | |
| "learning_rate": 3.613145080070886e-06, | |
| "loss": 1.9244, | |
| "step": 7800 | |
| }, | |
| { | |
| "epoch": 2.6571404269796717, | |
| "grad_norm": 2.328125, | |
| "learning_rate": 3.59673531037995e-06, | |
| "loss": 1.8997, | |
| "step": 7810 | |
| }, | |
| { | |
| "epoch": 2.660542655439313, | |
| "grad_norm": 2.203125, | |
| "learning_rate": 3.5803492704479488e-06, | |
| "loss": 1.9715, | |
| "step": 7820 | |
| }, | |
| { | |
| "epoch": 2.663944883898954, | |
| "grad_norm": 2.0625, | |
| "learning_rate": 3.5639870846504873e-06, | |
| "loss": 1.917, | |
| "step": 7830 | |
| }, | |
| { | |
| "epoch": 2.667347112358595, | |
| "grad_norm": 2.4375, | |
| "learning_rate": 3.54764887718212e-06, | |
| "loss": 1.9122, | |
| "step": 7840 | |
| }, | |
| { | |
| "epoch": 2.6707493408182357, | |
| "grad_norm": 2.265625, | |
| "learning_rate": 3.5313347720553963e-06, | |
| "loss": 1.9234, | |
| "step": 7850 | |
| }, | |
| { | |
| "epoch": 2.674151569277877, | |
| "grad_norm": 2.359375, | |
| "learning_rate": 3.5150448930999113e-06, | |
| "loss": 1.9519, | |
| "step": 7860 | |
| }, | |
| { | |
| "epoch": 2.6775537977375183, | |
| "grad_norm": 2.25, | |
| "learning_rate": 3.4987793639613926e-06, | |
| "loss": 1.9065, | |
| "step": 7870 | |
| }, | |
| { | |
| "epoch": 2.6809560261971592, | |
| "grad_norm": 2.171875, | |
| "learning_rate": 3.482538308100727e-06, | |
| "loss": 1.8604, | |
| "step": 7880 | |
| }, | |
| { | |
| "epoch": 2.6843582546568, | |
| "grad_norm": 2.328125, | |
| "learning_rate": 3.4663218487930547e-06, | |
| "loss": 1.8554, | |
| "step": 7890 | |
| }, | |
| { | |
| "epoch": 2.687760483116441, | |
| "grad_norm": 2.4375, | |
| "learning_rate": 3.4501301091268043e-06, | |
| "loss": 1.936, | |
| "step": 7900 | |
| }, | |
| { | |
| "epoch": 2.6911627115760823, | |
| "grad_norm": 2.328125, | |
| "learning_rate": 3.433963212002789e-06, | |
| "loss": 1.8966, | |
| "step": 7910 | |
| }, | |
| { | |
| "epoch": 2.6945649400357237, | |
| "grad_norm": 2.15625, | |
| "learning_rate": 3.41782128013325e-06, | |
| "loss": 1.9634, | |
| "step": 7920 | |
| }, | |
| { | |
| "epoch": 2.6979671684953646, | |
| "grad_norm": 2.546875, | |
| "learning_rate": 3.4017044360409375e-06, | |
| "loss": 1.922, | |
| "step": 7930 | |
| }, | |
| { | |
| "epoch": 2.7013693969550054, | |
| "grad_norm": 2.4375, | |
| "learning_rate": 3.3856128020581783e-06, | |
| "loss": 1.9411, | |
| "step": 7940 | |
| }, | |
| { | |
| "epoch": 2.7047716254146463, | |
| "grad_norm": 2.265625, | |
| "learning_rate": 3.3695465003259376e-06, | |
| "loss": 1.8679, | |
| "step": 7950 | |
| }, | |
| { | |
| "epoch": 2.7081738538742877, | |
| "grad_norm": 1.953125, | |
| "learning_rate": 3.353505652792909e-06, | |
| "loss": 1.906, | |
| "step": 7960 | |
| }, | |
| { | |
| "epoch": 2.711576082333929, | |
| "grad_norm": 2.421875, | |
| "learning_rate": 3.3374903812145784e-06, | |
| "loss": 1.8951, | |
| "step": 7970 | |
| }, | |
| { | |
| "epoch": 2.71497831079357, | |
| "grad_norm": 2.546875, | |
| "learning_rate": 3.3215008071522965e-06, | |
| "loss": 1.9556, | |
| "step": 7980 | |
| }, | |
| { | |
| "epoch": 2.7183805392532108, | |
| "grad_norm": 2.21875, | |
| "learning_rate": 3.3055370519723652e-06, | |
| "loss": 1.9427, | |
| "step": 7990 | |
| }, | |
| { | |
| "epoch": 2.7217827677128517, | |
| "grad_norm": 2.71875, | |
| "learning_rate": 3.289599236845113e-06, | |
| "loss": 1.9533, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 2.725184996172493, | |
| "grad_norm": 2.609375, | |
| "learning_rate": 3.273687482743974e-06, | |
| "loss": 1.9608, | |
| "step": 8010 | |
| }, | |
| { | |
| "epoch": 2.7285872246321343, | |
| "grad_norm": 1.9609375, | |
| "learning_rate": 3.2578019104445702e-06, | |
| "loss": 1.9894, | |
| "step": 8020 | |
| }, | |
| { | |
| "epoch": 2.731989453091775, | |
| "grad_norm": 2.46875, | |
| "learning_rate": 3.241942640523791e-06, | |
| "loss": 1.864, | |
| "step": 8030 | |
| }, | |
| { | |
| "epoch": 2.735391681551416, | |
| "grad_norm": 2.40625, | |
| "learning_rate": 3.2261097933588893e-06, | |
| "loss": 1.9567, | |
| "step": 8040 | |
| }, | |
| { | |
| "epoch": 2.738793910011057, | |
| "grad_norm": 2.65625, | |
| "learning_rate": 3.210303489126551e-06, | |
| "loss": 1.9093, | |
| "step": 8050 | |
| }, | |
| { | |
| "epoch": 2.7421961384706983, | |
| "grad_norm": 2.4375, | |
| "learning_rate": 3.1945238478020003e-06, | |
| "loss": 1.9673, | |
| "step": 8060 | |
| }, | |
| { | |
| "epoch": 2.745598366930339, | |
| "grad_norm": 2.265625, | |
| "learning_rate": 3.1787709891580763e-06, | |
| "loss": 1.9712, | |
| "step": 8070 | |
| }, | |
| { | |
| "epoch": 2.7490005953899805, | |
| "grad_norm": 2.265625, | |
| "learning_rate": 3.1630450327643315e-06, | |
| "loss": 1.9127, | |
| "step": 8080 | |
| }, | |
| { | |
| "epoch": 2.7524028238496214, | |
| "grad_norm": 2.234375, | |
| "learning_rate": 3.147346097986121e-06, | |
| "loss": 1.9763, | |
| "step": 8090 | |
| }, | |
| { | |
| "epoch": 2.7558050523092623, | |
| "grad_norm": 1.9453125, | |
| "learning_rate": 3.1316743039836908e-06, | |
| "loss": 1.8313, | |
| "step": 8100 | |
| }, | |
| { | |
| "epoch": 2.7592072807689036, | |
| "grad_norm": 2.0625, | |
| "learning_rate": 3.1160297697112855e-06, | |
| "loss": 1.9062, | |
| "step": 8110 | |
| }, | |
| { | |
| "epoch": 2.7626095092285445, | |
| "grad_norm": 2.25, | |
| "learning_rate": 3.10041261391624e-06, | |
| "loss": 1.9072, | |
| "step": 8120 | |
| }, | |
| { | |
| "epoch": 2.766011737688186, | |
| "grad_norm": 2.546875, | |
| "learning_rate": 3.0848229551380702e-06, | |
| "loss": 1.932, | |
| "step": 8130 | |
| }, | |
| { | |
| "epoch": 2.7694139661478268, | |
| "grad_norm": 2.375, | |
| "learning_rate": 3.069260911707586e-06, | |
| "loss": 1.9311, | |
| "step": 8140 | |
| }, | |
| { | |
| "epoch": 2.7728161946074676, | |
| "grad_norm": 2.6875, | |
| "learning_rate": 3.0537266017459856e-06, | |
| "loss": 1.9067, | |
| "step": 8150 | |
| }, | |
| { | |
| "epoch": 2.776218423067109, | |
| "grad_norm": 2.203125, | |
| "learning_rate": 3.0382201431639656e-06, | |
| "loss": 1.978, | |
| "step": 8160 | |
| }, | |
| { | |
| "epoch": 2.77962065152675, | |
| "grad_norm": 2.375, | |
| "learning_rate": 3.0227416536608095e-06, | |
| "loss": 1.9084, | |
| "step": 8170 | |
| }, | |
| { | |
| "epoch": 2.783022879986391, | |
| "grad_norm": 2.203125, | |
| "learning_rate": 3.0072912507235167e-06, | |
| "loss": 1.8865, | |
| "step": 8180 | |
| }, | |
| { | |
| "epoch": 2.786425108446032, | |
| "grad_norm": 2.015625, | |
| "learning_rate": 2.991869051625898e-06, | |
| "loss": 1.9293, | |
| "step": 8190 | |
| }, | |
| { | |
| "epoch": 2.789827336905673, | |
| "grad_norm": 2.59375, | |
| "learning_rate": 2.9764751734276803e-06, | |
| "loss": 1.9127, | |
| "step": 8200 | |
| }, | |
| { | |
| "epoch": 2.7932295653653143, | |
| "grad_norm": 2.453125, | |
| "learning_rate": 2.9611097329736394e-06, | |
| "loss": 1.9198, | |
| "step": 8210 | |
| }, | |
| { | |
| "epoch": 2.796631793824955, | |
| "grad_norm": 2.3125, | |
| "learning_rate": 2.9457728468926836e-06, | |
| "loss": 1.9261, | |
| "step": 8220 | |
| }, | |
| { | |
| "epoch": 2.8000340222845965, | |
| "grad_norm": 2.59375, | |
| "learning_rate": 2.930464631596993e-06, | |
| "loss": 1.9068, | |
| "step": 8230 | |
| }, | |
| { | |
| "epoch": 2.8034362507442374, | |
| "grad_norm": 2.40625, | |
| "learning_rate": 2.915185203281126e-06, | |
| "loss": 1.947, | |
| "step": 8240 | |
| }, | |
| { | |
| "epoch": 2.8068384792038783, | |
| "grad_norm": 2.34375, | |
| "learning_rate": 2.899934677921133e-06, | |
| "loss": 1.9014, | |
| "step": 8250 | |
| }, | |
| { | |
| "epoch": 2.8102407076635196, | |
| "grad_norm": 2.25, | |
| "learning_rate": 2.884713171273686e-06, | |
| "loss": 1.9012, | |
| "step": 8260 | |
| }, | |
| { | |
| "epoch": 2.8136429361231605, | |
| "grad_norm": 2.3125, | |
| "learning_rate": 2.869520798875194e-06, | |
| "loss": 1.9299, | |
| "step": 8270 | |
| }, | |
| { | |
| "epoch": 2.817045164582802, | |
| "grad_norm": 2.046875, | |
| "learning_rate": 2.8543576760409264e-06, | |
| "loss": 1.9472, | |
| "step": 8280 | |
| }, | |
| { | |
| "epoch": 2.8204473930424427, | |
| "grad_norm": 2.140625, | |
| "learning_rate": 2.839223917864142e-06, | |
| "loss": 1.9323, | |
| "step": 8290 | |
| }, | |
| { | |
| "epoch": 2.8238496215020836, | |
| "grad_norm": 2.203125, | |
| "learning_rate": 2.824119639215203e-06, | |
| "loss": 1.9394, | |
| "step": 8300 | |
| }, | |
| { | |
| "epoch": 2.827251849961725, | |
| "grad_norm": 2.515625, | |
| "learning_rate": 2.809044954740723e-06, | |
| "loss": 1.9369, | |
| "step": 8310 | |
| }, | |
| { | |
| "epoch": 2.830654078421366, | |
| "grad_norm": 2.46875, | |
| "learning_rate": 2.7939999788626755e-06, | |
| "loss": 1.9025, | |
| "step": 8320 | |
| }, | |
| { | |
| "epoch": 2.834056306881007, | |
| "grad_norm": 2.390625, | |
| "learning_rate": 2.778984825777543e-06, | |
| "loss": 1.908, | |
| "step": 8330 | |
| }, | |
| { | |
| "epoch": 2.837458535340648, | |
| "grad_norm": 2.5, | |
| "learning_rate": 2.763999609455441e-06, | |
| "loss": 1.9814, | |
| "step": 8340 | |
| }, | |
| { | |
| "epoch": 2.840860763800289, | |
| "grad_norm": 2.421875, | |
| "learning_rate": 2.7490444436392535e-06, | |
| "loss": 1.9804, | |
| "step": 8350 | |
| }, | |
| { | |
| "epoch": 2.8442629922599303, | |
| "grad_norm": 2.359375, | |
| "learning_rate": 2.7341194418437747e-06, | |
| "loss": 1.9187, | |
| "step": 8360 | |
| }, | |
| { | |
| "epoch": 2.847665220719571, | |
| "grad_norm": 2.25, | |
| "learning_rate": 2.7192247173548356e-06, | |
| "loss": 1.8885, | |
| "step": 8370 | |
| }, | |
| { | |
| "epoch": 2.8510674491792125, | |
| "grad_norm": 2.515625, | |
| "learning_rate": 2.7043603832284616e-06, | |
| "loss": 1.9056, | |
| "step": 8380 | |
| }, | |
| { | |
| "epoch": 2.8544696776388534, | |
| "grad_norm": 2.5625, | |
| "learning_rate": 2.689526552289997e-06, | |
| "loss": 1.9068, | |
| "step": 8390 | |
| }, | |
| { | |
| "epoch": 2.8578719060984943, | |
| "grad_norm": 1.9375, | |
| "learning_rate": 2.6747233371332606e-06, | |
| "loss": 2.0559, | |
| "step": 8400 | |
| }, | |
| { | |
| "epoch": 2.8612741345581356, | |
| "grad_norm": 2.140625, | |
| "learning_rate": 2.6599508501196876e-06, | |
| "loss": 1.9102, | |
| "step": 8410 | |
| }, | |
| { | |
| "epoch": 2.8646763630177765, | |
| "grad_norm": 2.3125, | |
| "learning_rate": 2.6452092033774744e-06, | |
| "loss": 1.878, | |
| "step": 8420 | |
| }, | |
| { | |
| "epoch": 2.868078591477418, | |
| "grad_norm": 2.21875, | |
| "learning_rate": 2.630498508800734e-06, | |
| "loss": 1.9412, | |
| "step": 8430 | |
| }, | |
| { | |
| "epoch": 2.8714808199370587, | |
| "grad_norm": 2.59375, | |
| "learning_rate": 2.6158188780486312e-06, | |
| "loss": 1.8957, | |
| "step": 8440 | |
| }, | |
| { | |
| "epoch": 2.8748830483966996, | |
| "grad_norm": 2.65625, | |
| "learning_rate": 2.6011704225445548e-06, | |
| "loss": 1.8656, | |
| "step": 8450 | |
| }, | |
| { | |
| "epoch": 2.878285276856341, | |
| "grad_norm": 2.5, | |
| "learning_rate": 2.586553253475264e-06, | |
| "loss": 1.9598, | |
| "step": 8460 | |
| }, | |
| { | |
| "epoch": 2.881687505315982, | |
| "grad_norm": 2.25, | |
| "learning_rate": 2.5719674817900346e-06, | |
| "loss": 1.957, | |
| "step": 8470 | |
| }, | |
| { | |
| "epoch": 2.885089733775623, | |
| "grad_norm": 2.296875, | |
| "learning_rate": 2.5574132181998334e-06, | |
| "loss": 1.9725, | |
| "step": 8480 | |
| }, | |
| { | |
| "epoch": 2.888491962235264, | |
| "grad_norm": 1.9765625, | |
| "learning_rate": 2.5428905731764664e-06, | |
| "loss": 1.9228, | |
| "step": 8490 | |
| }, | |
| { | |
| "epoch": 2.891894190694905, | |
| "grad_norm": 2.40625, | |
| "learning_rate": 2.5283996569517464e-06, | |
| "loss": 1.938, | |
| "step": 8500 | |
| }, | |
| { | |
| "epoch": 2.8952964191545463, | |
| "grad_norm": 2.21875, | |
| "learning_rate": 2.5139405795166538e-06, | |
| "loss": 1.9243, | |
| "step": 8510 | |
| }, | |
| { | |
| "epoch": 2.898698647614187, | |
| "grad_norm": 2.3125, | |
| "learning_rate": 2.4995134506204964e-06, | |
| "loss": 1.9328, | |
| "step": 8520 | |
| }, | |
| { | |
| "epoch": 2.9021008760738285, | |
| "grad_norm": 2.15625, | |
| "learning_rate": 2.48511837977009e-06, | |
| "loss": 1.9199, | |
| "step": 8530 | |
| }, | |
| { | |
| "epoch": 2.9055031045334694, | |
| "grad_norm": 2.625, | |
| "learning_rate": 2.4707554762289077e-06, | |
| "loss": 1.9613, | |
| "step": 8540 | |
| }, | |
| { | |
| "epoch": 2.9089053329931103, | |
| "grad_norm": 2.046875, | |
| "learning_rate": 2.4564248490162763e-06, | |
| "loss": 1.9547, | |
| "step": 8550 | |
| }, | |
| { | |
| "epoch": 2.9123075614527516, | |
| "grad_norm": 2.328125, | |
| "learning_rate": 2.442126606906526e-06, | |
| "loss": 2.0251, | |
| "step": 8560 | |
| }, | |
| { | |
| "epoch": 2.9157097899123925, | |
| "grad_norm": 2.40625, | |
| "learning_rate": 2.4278608584281694e-06, | |
| "loss": 1.9231, | |
| "step": 8570 | |
| }, | |
| { | |
| "epoch": 2.919112018372034, | |
| "grad_norm": 2.625, | |
| "learning_rate": 2.413627711863091e-06, | |
| "loss": 1.9295, | |
| "step": 8580 | |
| }, | |
| { | |
| "epoch": 2.9225142468316747, | |
| "grad_norm": 2.5, | |
| "learning_rate": 2.399427275245705e-06, | |
| "loss": 1.9444, | |
| "step": 8590 | |
| }, | |
| { | |
| "epoch": 2.9259164752913156, | |
| "grad_norm": 2.328125, | |
| "learning_rate": 2.3852596563621536e-06, | |
| "loss": 1.9794, | |
| "step": 8600 | |
| }, | |
| { | |
| "epoch": 2.929318703750957, | |
| "grad_norm": 2.1875, | |
| "learning_rate": 2.3711249627494803e-06, | |
| "loss": 1.9096, | |
| "step": 8610 | |
| }, | |
| { | |
| "epoch": 2.932720932210598, | |
| "grad_norm": 2.578125, | |
| "learning_rate": 2.3570233016948133e-06, | |
| "loss": 1.9062, | |
| "step": 8620 | |
| }, | |
| { | |
| "epoch": 2.936123160670239, | |
| "grad_norm": 2.34375, | |
| "learning_rate": 2.3429547802345537e-06, | |
| "loss": 1.8779, | |
| "step": 8630 | |
| }, | |
| { | |
| "epoch": 2.93952538912988, | |
| "grad_norm": 2.265625, | |
| "learning_rate": 2.3289195051535584e-06, | |
| "loss": 1.8901, | |
| "step": 8640 | |
| }, | |
| { | |
| "epoch": 2.942927617589521, | |
| "grad_norm": 2.203125, | |
| "learning_rate": 2.3149175829843367e-06, | |
| "loss": 1.9073, | |
| "step": 8650 | |
| }, | |
| { | |
| "epoch": 2.9463298460491623, | |
| "grad_norm": 2.46875, | |
| "learning_rate": 2.3009491200062343e-06, | |
| "loss": 1.9434, | |
| "step": 8660 | |
| }, | |
| { | |
| "epoch": 2.949732074508803, | |
| "grad_norm": 2.1875, | |
| "learning_rate": 2.287014222244634e-06, | |
| "loss": 1.88, | |
| "step": 8670 | |
| }, | |
| { | |
| "epoch": 2.9531343029684445, | |
| "grad_norm": 2.109375, | |
| "learning_rate": 2.273112995470147e-06, | |
| "loss": 1.968, | |
| "step": 8680 | |
| }, | |
| { | |
| "epoch": 2.9565365314280854, | |
| "grad_norm": 2.03125, | |
| "learning_rate": 2.259245545197807e-06, | |
| "loss": 1.9048, | |
| "step": 8690 | |
| }, | |
| { | |
| "epoch": 2.9599387598877263, | |
| "grad_norm": 2.46875, | |
| "learning_rate": 2.245411976686278e-06, | |
| "loss": 1.9502, | |
| "step": 8700 | |
| }, | |
| { | |
| "epoch": 2.9633409883473676, | |
| "grad_norm": 2.546875, | |
| "learning_rate": 2.231612394937042e-06, | |
| "loss": 1.87, | |
| "step": 8710 | |
| }, | |
| { | |
| "epoch": 2.9667432168070085, | |
| "grad_norm": 2.234375, | |
| "learning_rate": 2.217846904693616e-06, | |
| "loss": 1.9337, | |
| "step": 8720 | |
| }, | |
| { | |
| "epoch": 2.97014544526665, | |
| "grad_norm": 2.609375, | |
| "learning_rate": 2.2041156104407518e-06, | |
| "loss": 1.9095, | |
| "step": 8730 | |
| }, | |
| { | |
| "epoch": 2.9735476737262907, | |
| "grad_norm": 2.4375, | |
| "learning_rate": 2.1904186164036358e-06, | |
| "loss": 1.9346, | |
| "step": 8740 | |
| }, | |
| { | |
| "epoch": 2.9769499021859316, | |
| "grad_norm": 2.09375, | |
| "learning_rate": 2.1767560265471087e-06, | |
| "loss": 1.9296, | |
| "step": 8750 | |
| }, | |
| { | |
| "epoch": 2.980352130645573, | |
| "grad_norm": 2.484375, | |
| "learning_rate": 2.163127944574872e-06, | |
| "loss": 1.9386, | |
| "step": 8760 | |
| }, | |
| { | |
| "epoch": 2.983754359105214, | |
| "grad_norm": 2.40625, | |
| "learning_rate": 2.149534473928699e-06, | |
| "loss": 1.9189, | |
| "step": 8770 | |
| }, | |
| { | |
| "epoch": 2.987156587564855, | |
| "grad_norm": 2.46875, | |
| "learning_rate": 2.135975717787654e-06, | |
| "loss": 1.8996, | |
| "step": 8780 | |
| }, | |
| { | |
| "epoch": 2.990558816024496, | |
| "grad_norm": 2.1875, | |
| "learning_rate": 2.1224517790673003e-06, | |
| "loss": 1.937, | |
| "step": 8790 | |
| }, | |
| { | |
| "epoch": 2.993961044484137, | |
| "grad_norm": 2.234375, | |
| "learning_rate": 2.108962760418933e-06, | |
| "loss": 1.9724, | |
| "step": 8800 | |
| }, | |
| { | |
| "epoch": 2.9973632729437782, | |
| "grad_norm": 2.5, | |
| "learning_rate": 2.0955087642287833e-06, | |
| "loss": 1.9497, | |
| "step": 8810 | |
| }, | |
| { | |
| "epoch": 3.000765501403419, | |
| "grad_norm": 2.5, | |
| "learning_rate": 2.0820898926172546e-06, | |
| "loss": 1.9683, | |
| "step": 8820 | |
| }, | |
| { | |
| "epoch": 3.0041677298630605, | |
| "grad_norm": 2.375, | |
| "learning_rate": 2.0687062474381516e-06, | |
| "loss": 1.9146, | |
| "step": 8830 | |
| }, | |
| { | |
| "epoch": 3.0075699583227014, | |
| "grad_norm": 2.515625, | |
| "learning_rate": 2.05535793027788e-06, | |
| "loss": 1.9749, | |
| "step": 8840 | |
| }, | |
| { | |
| "epoch": 3.0109721867823422, | |
| "grad_norm": 2.46875, | |
| "learning_rate": 2.042045042454711e-06, | |
| "loss": 1.9554, | |
| "step": 8850 | |
| }, | |
| { | |
| "epoch": 3.0143744152419836, | |
| "grad_norm": 2.53125, | |
| "learning_rate": 2.028767685017981e-06, | |
| "loss": 1.8963, | |
| "step": 8860 | |
| }, | |
| { | |
| "epoch": 3.0177766437016245, | |
| "grad_norm": 2.671875, | |
| "learning_rate": 2.015525958747352e-06, | |
| "loss": 1.938, | |
| "step": 8870 | |
| }, | |
| { | |
| "epoch": 3.021178872161266, | |
| "grad_norm": 2.625, | |
| "learning_rate": 2.0023199641520177e-06, | |
| "loss": 1.9223, | |
| "step": 8880 | |
| }, | |
| { | |
| "epoch": 3.0245811006209067, | |
| "grad_norm": 2.625, | |
| "learning_rate": 1.989149801469974e-06, | |
| "loss": 1.8825, | |
| "step": 8890 | |
| }, | |
| { | |
| "epoch": 3.0279833290805476, | |
| "grad_norm": 2.703125, | |
| "learning_rate": 1.97601557066723e-06, | |
| "loss": 1.9489, | |
| "step": 8900 | |
| }, | |
| { | |
| "epoch": 3.031385557540189, | |
| "grad_norm": 2.109375, | |
| "learning_rate": 1.9629173714370583e-06, | |
| "loss": 1.9236, | |
| "step": 8910 | |
| }, | |
| { | |
| "epoch": 3.03478778599983, | |
| "grad_norm": 2.078125, | |
| "learning_rate": 1.949855303199246e-06, | |
| "loss": 1.9561, | |
| "step": 8920 | |
| }, | |
| { | |
| "epoch": 3.038190014459471, | |
| "grad_norm": 2.484375, | |
| "learning_rate": 1.9368294650993263e-06, | |
| "loss": 1.8969, | |
| "step": 8930 | |
| }, | |
| { | |
| "epoch": 3.041592242919112, | |
| "grad_norm": 2.125, | |
| "learning_rate": 1.92383995600784e-06, | |
| "loss": 1.9331, | |
| "step": 8940 | |
| }, | |
| { | |
| "epoch": 3.044994471378753, | |
| "grad_norm": 2.40625, | |
| "learning_rate": 1.910886874519575e-06, | |
| "loss": 1.9734, | |
| "step": 8950 | |
| }, | |
| { | |
| "epoch": 3.0483966998383942, | |
| "grad_norm": 2.0625, | |
| "learning_rate": 1.8979703189528225e-06, | |
| "loss": 1.918, | |
| "step": 8960 | |
| }, | |
| { | |
| "epoch": 3.051798928298035, | |
| "grad_norm": 2.40625, | |
| "learning_rate": 1.885090387348631e-06, | |
| "loss": 1.9162, | |
| "step": 8970 | |
| }, | |
| { | |
| "epoch": 3.0552011567576765, | |
| "grad_norm": 2.421875, | |
| "learning_rate": 1.8722471774700541e-06, | |
| "loss": 1.9047, | |
| "step": 8980 | |
| }, | |
| { | |
| "epoch": 3.0586033852173173, | |
| "grad_norm": 2.40625, | |
| "learning_rate": 1.8594407868014222e-06, | |
| "loss": 1.9391, | |
| "step": 8990 | |
| }, | |
| { | |
| "epoch": 3.0620056136769582, | |
| "grad_norm": 2.53125, | |
| "learning_rate": 1.8466713125475953e-06, | |
| "loss": 1.9597, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 3.0654078421365996, | |
| "grad_norm": 2.125, | |
| "learning_rate": 1.8339388516332183e-06, | |
| "loss": 1.9123, | |
| "step": 9010 | |
| }, | |
| { | |
| "epoch": 3.0688100705962404, | |
| "grad_norm": 2.265625, | |
| "learning_rate": 1.8212435007019987e-06, | |
| "loss": 1.9063, | |
| "step": 9020 | |
| }, | |
| { | |
| "epoch": 3.072212299055882, | |
| "grad_norm": 2.0625, | |
| "learning_rate": 1.8085853561159651e-06, | |
| "loss": 1.8604, | |
| "step": 9030 | |
| }, | |
| { | |
| "epoch": 3.0756145275155227, | |
| "grad_norm": 2.203125, | |
| "learning_rate": 1.7959645139547367e-06, | |
| "loss": 1.9165, | |
| "step": 9040 | |
| }, | |
| { | |
| "epoch": 3.0790167559751636, | |
| "grad_norm": 2.8125, | |
| "learning_rate": 1.7833810700147973e-06, | |
| "loss": 1.9096, | |
| "step": 9050 | |
| }, | |
| { | |
| "epoch": 3.082418984434805, | |
| "grad_norm": 2.203125, | |
| "learning_rate": 1.770835119808758e-06, | |
| "loss": 1.9433, | |
| "step": 9060 | |
| }, | |
| { | |
| "epoch": 3.0858212128944458, | |
| "grad_norm": 2.46875, | |
| "learning_rate": 1.7583267585646496e-06, | |
| "loss": 1.972, | |
| "step": 9070 | |
| }, | |
| { | |
| "epoch": 3.089223441354087, | |
| "grad_norm": 2.40625, | |
| "learning_rate": 1.7458560812251807e-06, | |
| "loss": 1.9191, | |
| "step": 9080 | |
| }, | |
| { | |
| "epoch": 3.092625669813728, | |
| "grad_norm": 2.046875, | |
| "learning_rate": 1.7334231824470327e-06, | |
| "loss": 1.882, | |
| "step": 9090 | |
| }, | |
| { | |
| "epoch": 3.096027898273369, | |
| "grad_norm": 2.40625, | |
| "learning_rate": 1.7210281566001321e-06, | |
| "loss": 1.9086, | |
| "step": 9100 | |
| }, | |
| { | |
| "epoch": 3.09943012673301, | |
| "grad_norm": 2.09375, | |
| "learning_rate": 1.7086710977669391e-06, | |
| "loss": 1.9225, | |
| "step": 9110 | |
| }, | |
| { | |
| "epoch": 3.102832355192651, | |
| "grad_norm": 2.515625, | |
| "learning_rate": 1.6963520997417304e-06, | |
| "loss": 1.9364, | |
| "step": 9120 | |
| }, | |
| { | |
| "epoch": 3.1062345836522924, | |
| "grad_norm": 2.40625, | |
| "learning_rate": 1.684071256029885e-06, | |
| "loss": 1.962, | |
| "step": 9130 | |
| }, | |
| { | |
| "epoch": 3.1096368121119333, | |
| "grad_norm": 2.25, | |
| "learning_rate": 1.6718286598471834e-06, | |
| "loss": 1.9557, | |
| "step": 9140 | |
| }, | |
| { | |
| "epoch": 3.113039040571574, | |
| "grad_norm": 2.234375, | |
| "learning_rate": 1.6596244041190884e-06, | |
| "loss": 1.963, | |
| "step": 9150 | |
| }, | |
| { | |
| "epoch": 3.1164412690312155, | |
| "grad_norm": 2.453125, | |
| "learning_rate": 1.6474585814800486e-06, | |
| "loss": 1.8665, | |
| "step": 9160 | |
| }, | |
| { | |
| "epoch": 3.1198434974908564, | |
| "grad_norm": 2.234375, | |
| "learning_rate": 1.6353312842727971e-06, | |
| "loss": 1.9364, | |
| "step": 9170 | |
| }, | |
| { | |
| "epoch": 3.1232457259504978, | |
| "grad_norm": 1.9921875, | |
| "learning_rate": 1.6232426045476368e-06, | |
| "loss": 1.9379, | |
| "step": 9180 | |
| }, | |
| { | |
| "epoch": 3.1266479544101387, | |
| "grad_norm": 2.484375, | |
| "learning_rate": 1.6111926340617594e-06, | |
| "loss": 1.8696, | |
| "step": 9190 | |
| }, | |
| { | |
| "epoch": 3.1300501828697795, | |
| "grad_norm": 2.546875, | |
| "learning_rate": 1.599181464278531e-06, | |
| "loss": 1.9511, | |
| "step": 9200 | |
| }, | |
| { | |
| "epoch": 3.133452411329421, | |
| "grad_norm": 2.125, | |
| "learning_rate": 1.587209186366815e-06, | |
| "loss": 1.9289, | |
| "step": 9210 | |
| }, | |
| { | |
| "epoch": 3.1368546397890618, | |
| "grad_norm": 2.296875, | |
| "learning_rate": 1.5752758912002694e-06, | |
| "loss": 1.8937, | |
| "step": 9220 | |
| }, | |
| { | |
| "epoch": 3.140256868248703, | |
| "grad_norm": 2.265625, | |
| "learning_rate": 1.5633816693566608e-06, | |
| "loss": 1.8763, | |
| "step": 9230 | |
| }, | |
| { | |
| "epoch": 3.143659096708344, | |
| "grad_norm": 2.3125, | |
| "learning_rate": 1.5515266111171768e-06, | |
| "loss": 1.9913, | |
| "step": 9240 | |
| }, | |
| { | |
| "epoch": 3.147061325167985, | |
| "grad_norm": 2.5, | |
| "learning_rate": 1.5397108064657348e-06, | |
| "loss": 1.8861, | |
| "step": 9250 | |
| }, | |
| { | |
| "epoch": 3.150463553627626, | |
| "grad_norm": 2.109375, | |
| "learning_rate": 1.5279343450883104e-06, | |
| "loss": 1.9029, | |
| "step": 9260 | |
| }, | |
| { | |
| "epoch": 3.153865782087267, | |
| "grad_norm": 2.4375, | |
| "learning_rate": 1.5161973163722477e-06, | |
| "loss": 1.9382, | |
| "step": 9270 | |
| }, | |
| { | |
| "epoch": 3.1572680105469084, | |
| "grad_norm": 2.421875, | |
| "learning_rate": 1.5044998094055818e-06, | |
| "loss": 1.8859, | |
| "step": 9280 | |
| }, | |
| { | |
| "epoch": 3.1606702390065493, | |
| "grad_norm": 2.375, | |
| "learning_rate": 1.4928419129763672e-06, | |
| "loss": 1.8785, | |
| "step": 9290 | |
| }, | |
| { | |
| "epoch": 3.16407246746619, | |
| "grad_norm": 2.6875, | |
| "learning_rate": 1.4812237155720006e-06, | |
| "loss": 1.8864, | |
| "step": 9300 | |
| }, | |
| { | |
| "epoch": 3.1674746959258315, | |
| "grad_norm": 2.53125, | |
| "learning_rate": 1.4696453053785496e-06, | |
| "loss": 1.8698, | |
| "step": 9310 | |
| }, | |
| { | |
| "epoch": 3.1708769243854724, | |
| "grad_norm": 2.296875, | |
| "learning_rate": 1.4581067702800793e-06, | |
| "loss": 1.9852, | |
| "step": 9320 | |
| }, | |
| { | |
| "epoch": 3.1742791528451137, | |
| "grad_norm": 2.3125, | |
| "learning_rate": 1.4466081978579942e-06, | |
| "loss": 1.98, | |
| "step": 9330 | |
| }, | |
| { | |
| "epoch": 3.1776813813047546, | |
| "grad_norm": 2.34375, | |
| "learning_rate": 1.4351496753903699e-06, | |
| "loss": 1.925, | |
| "step": 9340 | |
| }, | |
| { | |
| "epoch": 3.1810836097643955, | |
| "grad_norm": 2.5, | |
| "learning_rate": 1.4237312898512816e-06, | |
| "loss": 1.9355, | |
| "step": 9350 | |
| }, | |
| { | |
| "epoch": 3.184485838224037, | |
| "grad_norm": 2.703125, | |
| "learning_rate": 1.4123531279101576e-06, | |
| "loss": 1.9966, | |
| "step": 9360 | |
| }, | |
| { | |
| "epoch": 3.1878880666836777, | |
| "grad_norm": 2.578125, | |
| "learning_rate": 1.4010152759311148e-06, | |
| "loss": 1.8377, | |
| "step": 9370 | |
| }, | |
| { | |
| "epoch": 3.191290295143319, | |
| "grad_norm": 2.296875, | |
| "learning_rate": 1.3897178199723027e-06, | |
| "loss": 1.9501, | |
| "step": 9380 | |
| }, | |
| { | |
| "epoch": 3.19469252360296, | |
| "grad_norm": 2.390625, | |
| "learning_rate": 1.3784608457852537e-06, | |
| "loss": 1.9103, | |
| "step": 9390 | |
| }, | |
| { | |
| "epoch": 3.198094752062601, | |
| "grad_norm": 2.578125, | |
| "learning_rate": 1.3672444388142238e-06, | |
| "loss": 1.9575, | |
| "step": 9400 | |
| }, | |
| { | |
| "epoch": 3.201496980522242, | |
| "grad_norm": 2.328125, | |
| "learning_rate": 1.3560686841955576e-06, | |
| "loss": 1.929, | |
| "step": 9410 | |
| }, | |
| { | |
| "epoch": 3.204899208981883, | |
| "grad_norm": 2.375, | |
| "learning_rate": 1.3449336667570272e-06, | |
| "loss": 1.9606, | |
| "step": 9420 | |
| }, | |
| { | |
| "epoch": 3.2083014374415244, | |
| "grad_norm": 2.3125, | |
| "learning_rate": 1.3338394710172017e-06, | |
| "loss": 1.9379, | |
| "step": 9430 | |
| }, | |
| { | |
| "epoch": 3.2117036659011653, | |
| "grad_norm": 2.640625, | |
| "learning_rate": 1.3227861811847961e-06, | |
| "loss": 1.8995, | |
| "step": 9440 | |
| }, | |
| { | |
| "epoch": 3.215105894360806, | |
| "grad_norm": 2.203125, | |
| "learning_rate": 1.3117738811580378e-06, | |
| "loss": 1.9038, | |
| "step": 9450 | |
| }, | |
| { | |
| "epoch": 3.2185081228204475, | |
| "grad_norm": 2.234375, | |
| "learning_rate": 1.3008026545240273e-06, | |
| "loss": 1.9499, | |
| "step": 9460 | |
| }, | |
| { | |
| "epoch": 3.2219103512800884, | |
| "grad_norm": 2.234375, | |
| "learning_rate": 1.2898725845581015e-06, | |
| "loss": 1.9625, | |
| "step": 9470 | |
| }, | |
| { | |
| "epoch": 3.2253125797397297, | |
| "grad_norm": 2.234375, | |
| "learning_rate": 1.2789837542232062e-06, | |
| "loss": 2.0014, | |
| "step": 9480 | |
| }, | |
| { | |
| "epoch": 3.2287148081993706, | |
| "grad_norm": 2.375, | |
| "learning_rate": 1.2681362461692674e-06, | |
| "loss": 1.9227, | |
| "step": 9490 | |
| }, | |
| { | |
| "epoch": 3.2321170366590115, | |
| "grad_norm": 1.90625, | |
| "learning_rate": 1.2573301427325523e-06, | |
| "loss": 1.9411, | |
| "step": 9500 | |
| }, | |
| { | |
| "epoch": 3.235519265118653, | |
| "grad_norm": 1.9375, | |
| "learning_rate": 1.246565525935065e-06, | |
| "loss": 1.8898, | |
| "step": 9510 | |
| }, | |
| { | |
| "epoch": 3.2389214935782937, | |
| "grad_norm": 2.25, | |
| "learning_rate": 1.2358424774839005e-06, | |
| "loss": 1.8962, | |
| "step": 9520 | |
| }, | |
| { | |
| "epoch": 3.242323722037935, | |
| "grad_norm": 2.5, | |
| "learning_rate": 1.2251610787706435e-06, | |
| "loss": 1.9404, | |
| "step": 9530 | |
| }, | |
| { | |
| "epoch": 3.245725950497576, | |
| "grad_norm": 2.265625, | |
| "learning_rate": 1.2145214108707407e-06, | |
| "loss": 1.8978, | |
| "step": 9540 | |
| }, | |
| { | |
| "epoch": 3.249128178957217, | |
| "grad_norm": 2.140625, | |
| "learning_rate": 1.2039235545428843e-06, | |
| "loss": 1.9312, | |
| "step": 9550 | |
| }, | |
| { | |
| "epoch": 3.252530407416858, | |
| "grad_norm": 2.140625, | |
| "learning_rate": 1.1933675902284088e-06, | |
| "loss": 1.8721, | |
| "step": 9560 | |
| }, | |
| { | |
| "epoch": 3.255932635876499, | |
| "grad_norm": 2.171875, | |
| "learning_rate": 1.182853598050669e-06, | |
| "loss": 1.9304, | |
| "step": 9570 | |
| }, | |
| { | |
| "epoch": 3.2593348643361404, | |
| "grad_norm": 2.34375, | |
| "learning_rate": 1.1723816578144417e-06, | |
| "loss": 1.8912, | |
| "step": 9580 | |
| }, | |
| { | |
| "epoch": 3.2627370927957813, | |
| "grad_norm": 2.375, | |
| "learning_rate": 1.1619518490053083e-06, | |
| "loss": 1.8852, | |
| "step": 9590 | |
| }, | |
| { | |
| "epoch": 3.266139321255422, | |
| "grad_norm": 2.359375, | |
| "learning_rate": 1.1515642507890646e-06, | |
| "loss": 1.9256, | |
| "step": 9600 | |
| }, | |
| { | |
| "epoch": 3.2695415497150635, | |
| "grad_norm": 2.375, | |
| "learning_rate": 1.141218942011112e-06, | |
| "loss": 1.8988, | |
| "step": 9610 | |
| }, | |
| { | |
| "epoch": 3.2729437781747044, | |
| "grad_norm": 2.4375, | |
| "learning_rate": 1.1309160011958583e-06, | |
| "loss": 1.9262, | |
| "step": 9620 | |
| }, | |
| { | |
| "epoch": 3.2763460066343457, | |
| "grad_norm": 2.078125, | |
| "learning_rate": 1.1206555065461265e-06, | |
| "loss": 1.9177, | |
| "step": 9630 | |
| }, | |
| { | |
| "epoch": 3.2797482350939866, | |
| "grad_norm": 2.28125, | |
| "learning_rate": 1.1104375359425585e-06, | |
| "loss": 1.9117, | |
| "step": 9640 | |
| }, | |
| { | |
| "epoch": 3.2831504635536275, | |
| "grad_norm": 2.703125, | |
| "learning_rate": 1.100262166943023e-06, | |
| "loss": 1.9711, | |
| "step": 9650 | |
| }, | |
| { | |
| "epoch": 3.286552692013269, | |
| "grad_norm": 2.296875, | |
| "learning_rate": 1.0901294767820318e-06, | |
| "loss": 1.9243, | |
| "step": 9660 | |
| }, | |
| { | |
| "epoch": 3.2899549204729097, | |
| "grad_norm": 2.4375, | |
| "learning_rate": 1.0800395423701436e-06, | |
| "loss": 1.9023, | |
| "step": 9670 | |
| }, | |
| { | |
| "epoch": 3.293357148932551, | |
| "grad_norm": 2.140625, | |
| "learning_rate": 1.0699924402933917e-06, | |
| "loss": 1.938, | |
| "step": 9680 | |
| }, | |
| { | |
| "epoch": 3.296759377392192, | |
| "grad_norm": 2.359375, | |
| "learning_rate": 1.0599882468126933e-06, | |
| "loss": 1.9328, | |
| "step": 9690 | |
| }, | |
| { | |
| "epoch": 3.300161605851833, | |
| "grad_norm": 2.109375, | |
| "learning_rate": 1.0500270378632782e-06, | |
| "loss": 1.9429, | |
| "step": 9700 | |
| }, | |
| { | |
| "epoch": 3.303563834311474, | |
| "grad_norm": 2.171875, | |
| "learning_rate": 1.0401088890541082e-06, | |
| "loss": 1.9068, | |
| "step": 9710 | |
| }, | |
| { | |
| "epoch": 3.306966062771115, | |
| "grad_norm": 2.28125, | |
| "learning_rate": 1.0302338756673032e-06, | |
| "loss": 1.9121, | |
| "step": 9720 | |
| }, | |
| { | |
| "epoch": 3.3103682912307564, | |
| "grad_norm": 2.28125, | |
| "learning_rate": 1.0204020726575725e-06, | |
| "loss": 1.9197, | |
| "step": 9730 | |
| }, | |
| { | |
| "epoch": 3.3137705196903973, | |
| "grad_norm": 2.09375, | |
| "learning_rate": 1.0106135546516385e-06, | |
| "loss": 1.9347, | |
| "step": 9740 | |
| }, | |
| { | |
| "epoch": 3.317172748150038, | |
| "grad_norm": 1.9375, | |
| "learning_rate": 1.0008683959476827e-06, | |
| "loss": 1.929, | |
| "step": 9750 | |
| }, | |
| { | |
| "epoch": 3.3205749766096795, | |
| "grad_norm": 2.203125, | |
| "learning_rate": 9.911666705147721e-07, | |
| "loss": 1.8878, | |
| "step": 9760 | |
| }, | |
| { | |
| "epoch": 3.3239772050693204, | |
| "grad_norm": 2.359375, | |
| "learning_rate": 9.815084519922975e-07, | |
| "loss": 1.8525, | |
| "step": 9770 | |
| }, | |
| { | |
| "epoch": 3.3273794335289617, | |
| "grad_norm": 2.03125, | |
| "learning_rate": 9.718938136894211e-07, | |
| "loss": 1.8368, | |
| "step": 9780 | |
| }, | |
| { | |
| "epoch": 3.3307816619886026, | |
| "grad_norm": 2.0, | |
| "learning_rate": 9.623228285845155e-07, | |
| "loss": 1.8964, | |
| "step": 9790 | |
| }, | |
| { | |
| "epoch": 3.3341838904482435, | |
| "grad_norm": 2.796875, | |
| "learning_rate": 9.527955693246117e-07, | |
| "loss": 1.9062, | |
| "step": 9800 | |
| }, | |
| { | |
| "epoch": 3.337586118907885, | |
| "grad_norm": 2.125, | |
| "learning_rate": 9.433121082248422e-07, | |
| "loss": 1.87, | |
| "step": 9810 | |
| }, | |
| { | |
| "epoch": 3.3409883473675257, | |
| "grad_norm": 2.5, | |
| "learning_rate": 9.33872517267902e-07, | |
| "loss": 1.9351, | |
| "step": 9820 | |
| }, | |
| { | |
| "epoch": 3.344390575827167, | |
| "grad_norm": 2.21875, | |
| "learning_rate": 9.244768681034954e-07, | |
| "loss": 1.9826, | |
| "step": 9830 | |
| }, | |
| { | |
| "epoch": 3.347792804286808, | |
| "grad_norm": 2.5625, | |
| "learning_rate": 9.151252320477888e-07, | |
| "loss": 1.9788, | |
| "step": 9840 | |
| }, | |
| { | |
| "epoch": 3.351195032746449, | |
| "grad_norm": 1.9765625, | |
| "learning_rate": 9.058176800828842e-07, | |
| "loss": 1.9306, | |
| "step": 9850 | |
| }, | |
| { | |
| "epoch": 3.35459726120609, | |
| "grad_norm": 2.375, | |
| "learning_rate": 8.965542828562589e-07, | |
| "loss": 1.9304, | |
| "step": 9860 | |
| }, | |
| { | |
| "epoch": 3.357999489665731, | |
| "grad_norm": 2.546875, | |
| "learning_rate": 8.873351106802486e-07, | |
| "loss": 1.9565, | |
| "step": 9870 | |
| }, | |
| { | |
| "epoch": 3.3614017181253724, | |
| "grad_norm": 2.28125, | |
| "learning_rate": 8.781602335315041e-07, | |
| "loss": 1.9325, | |
| "step": 9880 | |
| }, | |
| { | |
| "epoch": 3.3648039465850133, | |
| "grad_norm": 2.25, | |
| "learning_rate": 8.690297210504589e-07, | |
| "loss": 1.9074, | |
| "step": 9890 | |
| }, | |
| { | |
| "epoch": 3.368206175044654, | |
| "grad_norm": 2.65625, | |
| "learning_rate": 8.599436425408064e-07, | |
| "loss": 1.9338, | |
| "step": 9900 | |
| }, | |
| { | |
| "epoch": 3.3716084035042955, | |
| "grad_norm": 2.625, | |
| "learning_rate": 8.509020669689717e-07, | |
| "loss": 1.9236, | |
| "step": 9910 | |
| }, | |
| { | |
| "epoch": 3.3750106319639364, | |
| "grad_norm": 2.5625, | |
| "learning_rate": 8.419050629635849e-07, | |
| "loss": 1.9387, | |
| "step": 9920 | |
| }, | |
| { | |
| "epoch": 3.3784128604235777, | |
| "grad_norm": 2.4375, | |
| "learning_rate": 8.329526988149661e-07, | |
| "loss": 1.9503, | |
| "step": 9930 | |
| }, | |
| { | |
| "epoch": 3.3818150888832186, | |
| "grad_norm": 2.1875, | |
| "learning_rate": 8.240450424745993e-07, | |
| "loss": 1.9232, | |
| "step": 9940 | |
| }, | |
| { | |
| "epoch": 3.3852173173428595, | |
| "grad_norm": 2.546875, | |
| "learning_rate": 8.151821615546263e-07, | |
| "loss": 1.9435, | |
| "step": 9950 | |
| }, | |
| { | |
| "epoch": 3.388619545802501, | |
| "grad_norm": 2.203125, | |
| "learning_rate": 8.063641233273221e-07, | |
| "loss": 1.9005, | |
| "step": 9960 | |
| }, | |
| { | |
| "epoch": 3.3920217742621417, | |
| "grad_norm": 2.609375, | |
| "learning_rate": 7.975909947245956e-07, | |
| "loss": 1.864, | |
| "step": 9970 | |
| }, | |
| { | |
| "epoch": 3.3954240027217826, | |
| "grad_norm": 2.15625, | |
| "learning_rate": 7.888628423374738e-07, | |
| "loss": 1.9707, | |
| "step": 9980 | |
| }, | |
| { | |
| "epoch": 3.398826231181424, | |
| "grad_norm": 2.53125, | |
| "learning_rate": 7.801797324156009e-07, | |
| "loss": 1.9314, | |
| "step": 9990 | |
| }, | |
| { | |
| "epoch": 3.402228459641065, | |
| "grad_norm": 2.546875, | |
| "learning_rate": 7.715417308667326e-07, | |
| "loss": 1.9229, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 3.405630688100706, | |
| "grad_norm": 2.5625, | |
| "learning_rate": 7.629489032562336e-07, | |
| "loss": 1.86, | |
| "step": 10010 | |
| }, | |
| { | |
| "epoch": 3.409032916560347, | |
| "grad_norm": 2.4375, | |
| "learning_rate": 7.544013148065898e-07, | |
| "loss": 1.9123, | |
| "step": 10020 | |
| }, | |
| { | |
| "epoch": 3.412435145019988, | |
| "grad_norm": 1.8515625, | |
| "learning_rate": 7.45899030396898e-07, | |
| "loss": 1.8735, | |
| "step": 10030 | |
| }, | |
| { | |
| "epoch": 3.4158373734796292, | |
| "grad_norm": 2.375, | |
| "learning_rate": 7.374421145623891e-07, | |
| "loss": 1.9386, | |
| "step": 10040 | |
| }, | |
| { | |
| "epoch": 3.41923960193927, | |
| "grad_norm": 2.5625, | |
| "learning_rate": 7.290306314939283e-07, | |
| "loss": 1.8794, | |
| "step": 10050 | |
| }, | |
| { | |
| "epoch": 3.4226418303989115, | |
| "grad_norm": 2.296875, | |
| "learning_rate": 7.206646450375306e-07, | |
| "loss": 1.9236, | |
| "step": 10060 | |
| }, | |
| { | |
| "epoch": 3.4260440588585523, | |
| "grad_norm": 2.25, | |
| "learning_rate": 7.123442186938769e-07, | |
| "loss": 1.9224, | |
| "step": 10070 | |
| }, | |
| { | |
| "epoch": 3.4294462873181932, | |
| "grad_norm": 2.28125, | |
| "learning_rate": 7.040694156178301e-07, | |
| "loss": 1.9089, | |
| "step": 10080 | |
| }, | |
| { | |
| "epoch": 3.4328485157778346, | |
| "grad_norm": 2.125, | |
| "learning_rate": 6.958402986179579e-07, | |
| "loss": 1.9395, | |
| "step": 10090 | |
| }, | |
| { | |
| "epoch": 3.4362507442374755, | |
| "grad_norm": 2.703125, | |
| "learning_rate": 6.87656930156057e-07, | |
| "loss": 1.9217, | |
| "step": 10100 | |
| }, | |
| { | |
| "epoch": 3.439652972697117, | |
| "grad_norm": 2.203125, | |
| "learning_rate": 6.795193723466726e-07, | |
| "loss": 1.9458, | |
| "step": 10110 | |
| }, | |
| { | |
| "epoch": 3.4430552011567577, | |
| "grad_norm": 1.828125, | |
| "learning_rate": 6.714276869566347e-07, | |
| "loss": 1.9698, | |
| "step": 10120 | |
| }, | |
| { | |
| "epoch": 3.4464574296163986, | |
| "grad_norm": 2.3125, | |
| "learning_rate": 6.633819354045855e-07, | |
| "loss": 1.9773, | |
| "step": 10130 | |
| }, | |
| { | |
| "epoch": 3.44985965807604, | |
| "grad_norm": 2.34375, | |
| "learning_rate": 6.553821787605149e-07, | |
| "loss": 1.8458, | |
| "step": 10140 | |
| }, | |
| { | |
| "epoch": 3.453261886535681, | |
| "grad_norm": 2.265625, | |
| "learning_rate": 6.474284777452948e-07, | |
| "loss": 1.9633, | |
| "step": 10150 | |
| }, | |
| { | |
| "epoch": 3.456664114995322, | |
| "grad_norm": 2.234375, | |
| "learning_rate": 6.395208927302167e-07, | |
| "loss": 1.9253, | |
| "step": 10160 | |
| }, | |
| { | |
| "epoch": 3.460066343454963, | |
| "grad_norm": 1.984375, | |
| "learning_rate": 6.31659483736541e-07, | |
| "loss": 1.8867, | |
| "step": 10170 | |
| }, | |
| { | |
| "epoch": 3.463468571914604, | |
| "grad_norm": 2.46875, | |
| "learning_rate": 6.238443104350302e-07, | |
| "loss": 1.9415, | |
| "step": 10180 | |
| }, | |
| { | |
| "epoch": 3.466870800374245, | |
| "grad_norm": 2.4375, | |
| "learning_rate": 6.160754321455092e-07, | |
| "loss": 1.8688, | |
| "step": 10190 | |
| }, | |
| { | |
| "epoch": 3.470273028833886, | |
| "grad_norm": 2.359375, | |
| "learning_rate": 6.083529078364046e-07, | |
| "loss": 1.8777, | |
| "step": 10200 | |
| }, | |
| { | |
| "epoch": 3.4736752572935274, | |
| "grad_norm": 2.046875, | |
| "learning_rate": 6.006767961242978e-07, | |
| "loss": 1.8808, | |
| "step": 10210 | |
| }, | |
| { | |
| "epoch": 3.4770774857531683, | |
| "grad_norm": 2.140625, | |
| "learning_rate": 5.930471552734888e-07, | |
| "loss": 1.9203, | |
| "step": 10220 | |
| }, | |
| { | |
| "epoch": 3.480479714212809, | |
| "grad_norm": 2.21875, | |
| "learning_rate": 5.854640431955407e-07, | |
| "loss": 1.9427, | |
| "step": 10230 | |
| }, | |
| { | |
| "epoch": 3.4838819426724505, | |
| "grad_norm": 2.609375, | |
| "learning_rate": 5.779275174488542e-07, | |
| "loss": 1.9229, | |
| "step": 10240 | |
| }, | |
| { | |
| "epoch": 3.4872841711320914, | |
| "grad_norm": 2.328125, | |
| "learning_rate": 5.704376352382198e-07, | |
| "loss": 1.8909, | |
| "step": 10250 | |
| }, | |
| { | |
| "epoch": 3.4906863995917328, | |
| "grad_norm": 2.25, | |
| "learning_rate": 5.629944534143905e-07, | |
| "loss": 1.9481, | |
| "step": 10260 | |
| }, | |
| { | |
| "epoch": 3.4940886280513737, | |
| "grad_norm": 2.390625, | |
| "learning_rate": 5.555980284736454e-07, | |
| "loss": 1.9152, | |
| "step": 10270 | |
| }, | |
| { | |
| "epoch": 3.4974908565110145, | |
| "grad_norm": 2.03125, | |
| "learning_rate": 5.482484165573627e-07, | |
| "loss": 1.9002, | |
| "step": 10280 | |
| }, | |
| { | |
| "epoch": 3.500893084970656, | |
| "grad_norm": 2.34375, | |
| "learning_rate": 5.409456734515961e-07, | |
| "loss": 1.9427, | |
| "step": 10290 | |
| }, | |
| { | |
| "epoch": 3.5042953134302968, | |
| "grad_norm": 2.390625, | |
| "learning_rate": 5.336898545866455e-07, | |
| "loss": 1.9312, | |
| "step": 10300 | |
| }, | |
| { | |
| "epoch": 3.5076975418899377, | |
| "grad_norm": 2.3125, | |
| "learning_rate": 5.264810150366431e-07, | |
| "loss": 1.9146, | |
| "step": 10310 | |
| }, | |
| { | |
| "epoch": 3.511099770349579, | |
| "grad_norm": 2.625, | |
| "learning_rate": 5.193192095191315e-07, | |
| "loss": 1.932, | |
| "step": 10320 | |
| }, | |
| { | |
| "epoch": 3.51450199880922, | |
| "grad_norm": 2.21875, | |
| "learning_rate": 5.122044923946488e-07, | |
| "loss": 1.9544, | |
| "step": 10330 | |
| }, | |
| { | |
| "epoch": 3.517904227268861, | |
| "grad_norm": 2.21875, | |
| "learning_rate": 5.051369176663161e-07, | |
| "loss": 1.9132, | |
| "step": 10340 | |
| }, | |
| { | |
| "epoch": 3.521306455728502, | |
| "grad_norm": 2.09375, | |
| "learning_rate": 4.981165389794265e-07, | |
| "loss": 1.9379, | |
| "step": 10350 | |
| }, | |
| { | |
| "epoch": 3.524708684188143, | |
| "grad_norm": 2.359375, | |
| "learning_rate": 4.911434096210408e-07, | |
| "loss": 1.8495, | |
| "step": 10360 | |
| }, | |
| { | |
| "epoch": 3.5281109126477843, | |
| "grad_norm": 2.53125, | |
| "learning_rate": 4.842175825195817e-07, | |
| "loss": 1.964, | |
| "step": 10370 | |
| }, | |
| { | |
| "epoch": 3.531513141107425, | |
| "grad_norm": 2.09375, | |
| "learning_rate": 4.773391102444278e-07, | |
| "loss": 1.8755, | |
| "step": 10380 | |
| }, | |
| { | |
| "epoch": 3.5349153695670665, | |
| "grad_norm": 2.8125, | |
| "learning_rate": 4.705080450055242e-07, | |
| "loss": 1.902, | |
| "step": 10390 | |
| }, | |
| { | |
| "epoch": 3.5383175980267074, | |
| "grad_norm": 3.03125, | |
| "learning_rate": 4.63724438652977e-07, | |
| "loss": 1.9428, | |
| "step": 10400 | |
| }, | |
| { | |
| "epoch": 3.5417198264863483, | |
| "grad_norm": 2.125, | |
| "learning_rate": 4.5698834267666295e-07, | |
| "loss": 1.8812, | |
| "step": 10410 | |
| }, | |
| { | |
| "epoch": 3.5451220549459896, | |
| "grad_norm": 2.265625, | |
| "learning_rate": 4.502998082058419e-07, | |
| "loss": 1.9378, | |
| "step": 10420 | |
| }, | |
| { | |
| "epoch": 3.5485242834056305, | |
| "grad_norm": 2.546875, | |
| "learning_rate": 4.4365888600876105e-07, | |
| "loss": 1.8586, | |
| "step": 10430 | |
| }, | |
| { | |
| "epoch": 3.551926511865272, | |
| "grad_norm": 2.5, | |
| "learning_rate": 4.3706562649227966e-07, | |
| "loss": 1.9303, | |
| "step": 10440 | |
| }, | |
| { | |
| "epoch": 3.5553287403249128, | |
| "grad_norm": 2.28125, | |
| "learning_rate": 4.305200797014755e-07, | |
| "loss": 1.8785, | |
| "step": 10450 | |
| }, | |
| { | |
| "epoch": 3.5587309687845536, | |
| "grad_norm": 2.296875, | |
| "learning_rate": 4.2402229531927284e-07, | |
| "loss": 1.8698, | |
| "step": 10460 | |
| }, | |
| { | |
| "epoch": 3.562133197244195, | |
| "grad_norm": 2.203125, | |
| "learning_rate": 4.1757232266606775e-07, | |
| "loss": 1.9134, | |
| "step": 10470 | |
| }, | |
| { | |
| "epoch": 3.565535425703836, | |
| "grad_norm": 2.0, | |
| "learning_rate": 4.1117021069934086e-07, | |
| "loss": 1.9092, | |
| "step": 10480 | |
| }, | |
| { | |
| "epoch": 3.568937654163477, | |
| "grad_norm": 2.578125, | |
| "learning_rate": 4.048160080133004e-07, | |
| "loss": 1.8521, | |
| "step": 10490 | |
| }, | |
| { | |
| "epoch": 3.572339882623118, | |
| "grad_norm": 2.046875, | |
| "learning_rate": 3.985097628385017e-07, | |
| "loss": 1.9322, | |
| "step": 10500 | |
| }, | |
| { | |
| "epoch": 3.575742111082759, | |
| "grad_norm": 2.265625, | |
| "learning_rate": 3.9225152304149186e-07, | |
| "loss": 1.95, | |
| "step": 10510 | |
| }, | |
| { | |
| "epoch": 3.5791443395424003, | |
| "grad_norm": 2.40625, | |
| "learning_rate": 3.8604133612443344e-07, | |
| "loss": 1.8966, | |
| "step": 10520 | |
| }, | |
| { | |
| "epoch": 3.582546568002041, | |
| "grad_norm": 2.28125, | |
| "learning_rate": 3.798792492247598e-07, | |
| "loss": 1.8615, | |
| "step": 10530 | |
| }, | |
| { | |
| "epoch": 3.5859487964616825, | |
| "grad_norm": 2.203125, | |
| "learning_rate": 3.737653091148046e-07, | |
| "loss": 1.9687, | |
| "step": 10540 | |
| }, | |
| { | |
| "epoch": 3.5893510249213234, | |
| "grad_norm": 2.109375, | |
| "learning_rate": 3.6769956220144835e-07, | |
| "loss": 1.9133, | |
| "step": 10550 | |
| }, | |
| { | |
| "epoch": 3.5927532533809643, | |
| "grad_norm": 2.203125, | |
| "learning_rate": 3.61682054525775e-07, | |
| "loss": 1.9313, | |
| "step": 10560 | |
| }, | |
| { | |
| "epoch": 3.5961554818406056, | |
| "grad_norm": 2.359375, | |
| "learning_rate": 3.5571283176270955e-07, | |
| "loss": 2.0094, | |
| "step": 10570 | |
| }, | |
| { | |
| "epoch": 3.5995577103002465, | |
| "grad_norm": 2.328125, | |
| "learning_rate": 3.4979193922068417e-07, | |
| "loss": 1.9955, | |
| "step": 10580 | |
| }, | |
| { | |
| "epoch": 3.602959938759888, | |
| "grad_norm": 2.359375, | |
| "learning_rate": 3.439194218412834e-07, | |
| "loss": 1.9294, | |
| "step": 10590 | |
| }, | |
| { | |
| "epoch": 3.6063621672195287, | |
| "grad_norm": 2.390625, | |
| "learning_rate": 3.380953241989119e-07, | |
| "loss": 1.8658, | |
| "step": 10600 | |
| }, | |
| { | |
| "epoch": 3.6097643956791696, | |
| "grad_norm": 2.859375, | |
| "learning_rate": 3.3231969050044987e-07, | |
| "loss": 1.9264, | |
| "step": 10610 | |
| }, | |
| { | |
| "epoch": 3.613166624138811, | |
| "grad_norm": 2.15625, | |
| "learning_rate": 3.2659256458491855e-07, | |
| "loss": 1.9539, | |
| "step": 10620 | |
| }, | |
| { | |
| "epoch": 3.616568852598452, | |
| "grad_norm": 2.609375, | |
| "learning_rate": 3.209139899231508e-07, | |
| "loss": 1.9833, | |
| "step": 10630 | |
| }, | |
| { | |
| "epoch": 3.619971081058093, | |
| "grad_norm": 2.328125, | |
| "learning_rate": 3.1528400961745953e-07, | |
| "loss": 1.9088, | |
| "step": 10640 | |
| }, | |
| { | |
| "epoch": 3.623373309517734, | |
| "grad_norm": 2.359375, | |
| "learning_rate": 3.0970266640130633e-07, | |
| "loss": 1.9261, | |
| "step": 10650 | |
| }, | |
| { | |
| "epoch": 3.626775537977375, | |
| "grad_norm": 2.1875, | |
| "learning_rate": 3.0417000263898494e-07, | |
| "loss": 1.8439, | |
| "step": 10660 | |
| }, | |
| { | |
| "epoch": 3.6301777664370163, | |
| "grad_norm": 2.421875, | |
| "learning_rate": 2.9868606032529224e-07, | |
| "loss": 1.9474, | |
| "step": 10670 | |
| }, | |
| { | |
| "epoch": 3.633579994896657, | |
| "grad_norm": 2.296875, | |
| "learning_rate": 2.932508810852159e-07, | |
| "loss": 1.9432, | |
| "step": 10680 | |
| }, | |
| { | |
| "epoch": 3.6369822233562985, | |
| "grad_norm": 2.84375, | |
| "learning_rate": 2.8786450617361245e-07, | |
| "loss": 1.8769, | |
| "step": 10690 | |
| }, | |
| { | |
| "epoch": 3.6403844518159394, | |
| "grad_norm": 2.40625, | |
| "learning_rate": 2.825269764748977e-07, | |
| "loss": 1.9754, | |
| "step": 10700 | |
| }, | |
| { | |
| "epoch": 3.6437866802755803, | |
| "grad_norm": 2.109375, | |
| "learning_rate": 2.772383325027377e-07, | |
| "loss": 1.9327, | |
| "step": 10710 | |
| }, | |
| { | |
| "epoch": 3.6471889087352216, | |
| "grad_norm": 2.421875, | |
| "learning_rate": 2.719986143997357e-07, | |
| "loss": 1.916, | |
| "step": 10720 | |
| }, | |
| { | |
| "epoch": 3.6505911371948625, | |
| "grad_norm": 2.328125, | |
| "learning_rate": 2.668078619371333e-07, | |
| "loss": 1.8941, | |
| "step": 10730 | |
| }, | |
| { | |
| "epoch": 3.653993365654504, | |
| "grad_norm": 2.4375, | |
| "learning_rate": 2.616661145145063e-07, | |
| "loss": 1.9525, | |
| "step": 10740 | |
| }, | |
| { | |
| "epoch": 3.6573955941141447, | |
| "grad_norm": 2.546875, | |
| "learning_rate": 2.5657341115946487e-07, | |
| "loss": 1.8995, | |
| "step": 10750 | |
| }, | |
| { | |
| "epoch": 3.6607978225737856, | |
| "grad_norm": 2.65625, | |
| "learning_rate": 2.5152979052736e-07, | |
| "loss": 1.9815, | |
| "step": 10760 | |
| }, | |
| { | |
| "epoch": 3.664200051033427, | |
| "grad_norm": 2.765625, | |
| "learning_rate": 2.46535290900983e-07, | |
| "loss": 1.8823, | |
| "step": 10770 | |
| }, | |
| { | |
| "epoch": 3.667602279493068, | |
| "grad_norm": 2.171875, | |
| "learning_rate": 2.4158995019028676e-07, | |
| "loss": 1.9158, | |
| "step": 10780 | |
| }, | |
| { | |
| "epoch": 3.671004507952709, | |
| "grad_norm": 2.671875, | |
| "learning_rate": 2.3669380593208516e-07, | |
| "loss": 1.8857, | |
| "step": 10790 | |
| }, | |
| { | |
| "epoch": 3.67440673641235, | |
| "grad_norm": 2.40625, | |
| "learning_rate": 2.3184689528977832e-07, | |
| "loss": 1.8922, | |
| "step": 10800 | |
| }, | |
| { | |
| "epoch": 3.677808964871991, | |
| "grad_norm": 2.3125, | |
| "learning_rate": 2.270492550530667e-07, | |
| "loss": 1.9044, | |
| "step": 10810 | |
| }, | |
| { | |
| "epoch": 3.6812111933316323, | |
| "grad_norm": 2.1875, | |
| "learning_rate": 2.2230092163766907e-07, | |
| "loss": 1.9365, | |
| "step": 10820 | |
| }, | |
| { | |
| "epoch": 3.684613421791273, | |
| "grad_norm": 2.15625, | |
| "learning_rate": 2.1760193108504913e-07, | |
| "loss": 1.894, | |
| "step": 10830 | |
| }, | |
| { | |
| "epoch": 3.6880156502509145, | |
| "grad_norm": 2.265625, | |
| "learning_rate": 2.1295231906214332e-07, | |
| "loss": 1.9366, | |
| "step": 10840 | |
| }, | |
| { | |
| "epoch": 3.6914178787105554, | |
| "grad_norm": 1.921875, | |
| "learning_rate": 2.0835212086108594e-07, | |
| "loss": 1.9098, | |
| "step": 10850 | |
| }, | |
| { | |
| "epoch": 3.6948201071701963, | |
| "grad_norm": 2.390625, | |
| "learning_rate": 2.038013713989457e-07, | |
| "loss": 1.9487, | |
| "step": 10860 | |
| }, | |
| { | |
| "epoch": 3.6982223356298376, | |
| "grad_norm": 2.328125, | |
| "learning_rate": 1.9930010521745713e-07, | |
| "loss": 1.8716, | |
| "step": 10870 | |
| }, | |
| { | |
| "epoch": 3.7016245640894785, | |
| "grad_norm": 2.21875, | |
| "learning_rate": 1.9484835648276147e-07, | |
| "loss": 1.8958, | |
| "step": 10880 | |
| }, | |
| { | |
| "epoch": 3.70502679254912, | |
| "grad_norm": 2.390625, | |
| "learning_rate": 1.904461589851424e-07, | |
| "loss": 1.8943, | |
| "step": 10890 | |
| }, | |
| { | |
| "epoch": 3.7084290210087607, | |
| "grad_norm": 1.9296875, | |
| "learning_rate": 1.8609354613877697e-07, | |
| "loss": 1.8747, | |
| "step": 10900 | |
| }, | |
| { | |
| "epoch": 3.7118312494684016, | |
| "grad_norm": 2.296875, | |
| "learning_rate": 1.817905509814755e-07, | |
| "loss": 1.9229, | |
| "step": 10910 | |
| }, | |
| { | |
| "epoch": 3.715233477928043, | |
| "grad_norm": 2.25, | |
| "learning_rate": 1.7753720617443335e-07, | |
| "loss": 1.9303, | |
| "step": 10920 | |
| }, | |
| { | |
| "epoch": 3.718635706387684, | |
| "grad_norm": 2.328125, | |
| "learning_rate": 1.7333354400198364e-07, | |
| "loss": 1.9388, | |
| "step": 10930 | |
| }, | |
| { | |
| "epoch": 3.722037934847325, | |
| "grad_norm": 2.015625, | |
| "learning_rate": 1.691795963713496e-07, | |
| "loss": 1.892, | |
| "step": 10940 | |
| }, | |
| { | |
| "epoch": 3.725440163306966, | |
| "grad_norm": 2.3125, | |
| "learning_rate": 1.6507539481240707e-07, | |
| "loss": 1.9215, | |
| "step": 10950 | |
| }, | |
| { | |
| "epoch": 3.728842391766607, | |
| "grad_norm": 2.28125, | |
| "learning_rate": 1.6102097047744054e-07, | |
| "loss": 1.9803, | |
| "step": 10960 | |
| }, | |
| { | |
| "epoch": 3.7322446202262483, | |
| "grad_norm": 2.046875, | |
| "learning_rate": 1.5701635414090798e-07, | |
| "loss": 1.9324, | |
| "step": 10970 | |
| }, | |
| { | |
| "epoch": 3.735646848685889, | |
| "grad_norm": 2.515625, | |
| "learning_rate": 1.530615761992094e-07, | |
| "loss": 1.8066, | |
| "step": 10980 | |
| }, | |
| { | |
| "epoch": 3.7390490771455305, | |
| "grad_norm": 2.171875, | |
| "learning_rate": 1.4915666667045188e-07, | |
| "loss": 1.8818, | |
| "step": 10990 | |
| }, | |
| { | |
| "epoch": 3.7424513056051714, | |
| "grad_norm": 2.390625, | |
| "learning_rate": 1.4530165519422625e-07, | |
| "loss": 1.9121, | |
| "step": 11000 | |
| }, | |
| { | |
| "epoch": 3.7458535340648123, | |
| "grad_norm": 2.359375, | |
| "learning_rate": 1.4149657103138097e-07, | |
| "loss": 1.9224, | |
| "step": 11010 | |
| }, | |
| { | |
| "epoch": 3.7492557625244536, | |
| "grad_norm": 2.5, | |
| "learning_rate": 1.377414430637975e-07, | |
| "loss": 1.9537, | |
| "step": 11020 | |
| }, | |
| { | |
| "epoch": 3.7526579909840945, | |
| "grad_norm": 2.5, | |
| "learning_rate": 1.3403629979417308e-07, | |
| "loss": 1.9439, | |
| "step": 11030 | |
| }, | |
| { | |
| "epoch": 3.756060219443736, | |
| "grad_norm": 2.375, | |
| "learning_rate": 1.303811693458042e-07, | |
| "loss": 1.9555, | |
| "step": 11040 | |
| }, | |
| { | |
| "epoch": 3.7594624479033767, | |
| "grad_norm": 2.171875, | |
| "learning_rate": 1.2677607946237328e-07, | |
| "loss": 1.9296, | |
| "step": 11050 | |
| }, | |
| { | |
| "epoch": 3.7628646763630176, | |
| "grad_norm": 2.46875, | |
| "learning_rate": 1.2322105750773803e-07, | |
| "loss": 1.9048, | |
| "step": 11060 | |
| }, | |
| { | |
| "epoch": 3.766266904822659, | |
| "grad_norm": 2.609375, | |
| "learning_rate": 1.1971613046572323e-07, | |
| "loss": 1.9255, | |
| "step": 11070 | |
| }, | |
| { | |
| "epoch": 3.7696691332823, | |
| "grad_norm": 2.34375, | |
| "learning_rate": 1.1626132493991633e-07, | |
| "loss": 1.9011, | |
| "step": 11080 | |
| }, | |
| { | |
| "epoch": 3.773071361741941, | |
| "grad_norm": 2.28125, | |
| "learning_rate": 1.1285666715346502e-07, | |
| "loss": 1.8918, | |
| "step": 11090 | |
| }, | |
| { | |
| "epoch": 3.776473590201582, | |
| "grad_norm": 2.484375, | |
| "learning_rate": 1.0950218294888028e-07, | |
| "loss": 1.84, | |
| "step": 11100 | |
| }, | |
| { | |
| "epoch": 3.779875818661223, | |
| "grad_norm": 2.65625, | |
| "learning_rate": 1.0619789778783557e-07, | |
| "loss": 1.979, | |
| "step": 11110 | |
| }, | |
| { | |
| "epoch": 3.7832780471208642, | |
| "grad_norm": 2.4375, | |
| "learning_rate": 1.0294383675097872e-07, | |
| "loss": 1.9141, | |
| "step": 11120 | |
| }, | |
| { | |
| "epoch": 3.786680275580505, | |
| "grad_norm": 2.09375, | |
| "learning_rate": 9.974002453774011e-08, | |
| "loss": 1.98, | |
| "step": 11130 | |
| }, | |
| { | |
| "epoch": 3.7900825040401465, | |
| "grad_norm": 2.484375, | |
| "learning_rate": 9.658648546614084e-08, | |
| "loss": 1.9723, | |
| "step": 11140 | |
| }, | |
| { | |
| "epoch": 3.7934847324997873, | |
| "grad_norm": 2.421875, | |
| "learning_rate": 9.348324347261734e-08, | |
| "loss": 1.8887, | |
| "step": 11150 | |
| }, | |
| { | |
| "epoch": 3.7968869609594282, | |
| "grad_norm": 2.546875, | |
| "learning_rate": 9.04303221118288e-08, | |
| "loss": 1.8763, | |
| "step": 11160 | |
| }, | |
| { | |
| "epoch": 3.8002891894190696, | |
| "grad_norm": 2.46875, | |
| "learning_rate": 8.742774455648695e-08, | |
| "loss": 1.9326, | |
| "step": 11170 | |
| }, | |
| { | |
| "epoch": 3.8036914178787105, | |
| "grad_norm": 1.9765625, | |
| "learning_rate": 8.447553359717545e-08, | |
| "loss": 1.8815, | |
| "step": 11180 | |
| }, | |
| { | |
| "epoch": 3.807093646338352, | |
| "grad_norm": 2.296875, | |
| "learning_rate": 8.157371164217902e-08, | |
| "loss": 1.971, | |
| "step": 11190 | |
| }, | |
| { | |
| "epoch": 3.8104958747979927, | |
| "grad_norm": 2.375, | |
| "learning_rate": 7.872230071731239e-08, | |
| "loss": 1.9483, | |
| "step": 11200 | |
| }, | |
| { | |
| "epoch": 3.8138981032576336, | |
| "grad_norm": 2.609375, | |
| "learning_rate": 7.592132246575323e-08, | |
| "loss": 1.9457, | |
| "step": 11210 | |
| }, | |
| { | |
| "epoch": 3.817300331717275, | |
| "grad_norm": 2.28125, | |
| "learning_rate": 7.317079814787934e-08, | |
| "loss": 1.9193, | |
| "step": 11220 | |
| }, | |
| { | |
| "epoch": 3.820702560176916, | |
| "grad_norm": 2.203125, | |
| "learning_rate": 7.047074864110375e-08, | |
| "loss": 1.9131, | |
| "step": 11230 | |
| }, | |
| { | |
| "epoch": 3.824104788636557, | |
| "grad_norm": 2.21875, | |
| "learning_rate": 6.782119443972094e-08, | |
| "loss": 1.9334, | |
| "step": 11240 | |
| }, | |
| { | |
| "epoch": 3.827507017096198, | |
| "grad_norm": 2.625, | |
| "learning_rate": 6.522215565474712e-08, | |
| "loss": 1.958, | |
| "step": 11250 | |
| }, | |
| { | |
| "epoch": 3.830909245555839, | |
| "grad_norm": 2.421875, | |
| "learning_rate": 6.267365201377092e-08, | |
| "loss": 1.9266, | |
| "step": 11260 | |
| }, | |
| { | |
| "epoch": 3.8343114740154802, | |
| "grad_norm": 2.53125, | |
| "learning_rate": 6.017570286079965e-08, | |
| "loss": 1.9022, | |
| "step": 11270 | |
| }, | |
| { | |
| "epoch": 3.837713702475121, | |
| "grad_norm": 2.34375, | |
| "learning_rate": 5.77283271561175e-08, | |
| "loss": 1.8612, | |
| "step": 11280 | |
| }, | |
| { | |
| "epoch": 3.8411159309347624, | |
| "grad_norm": 2.453125, | |
| "learning_rate": 5.5331543476137706e-08, | |
| "loss": 1.9326, | |
| "step": 11290 | |
| }, | |
| { | |
| "epoch": 3.8445181593944033, | |
| "grad_norm": 2.296875, | |
| "learning_rate": 5.298537001326303e-08, | |
| "loss": 1.8951, | |
| "step": 11300 | |
| }, | |
| { | |
| "epoch": 3.847920387854044, | |
| "grad_norm": 2.40625, | |
| "learning_rate": 5.068982457574685e-08, | |
| "loss": 1.9788, | |
| "step": 11310 | |
| }, | |
| { | |
| "epoch": 3.8513226163136856, | |
| "grad_norm": 2.609375, | |
| "learning_rate": 4.8444924587559654e-08, | |
| "loss": 1.9643, | |
| "step": 11320 | |
| }, | |
| { | |
| "epoch": 3.8547248447733264, | |
| "grad_norm": 2.5625, | |
| "learning_rate": 4.625068708825534e-08, | |
| "loss": 1.9245, | |
| "step": 11330 | |
| }, | |
| { | |
| "epoch": 3.8581270732329678, | |
| "grad_norm": 2.34375, | |
| "learning_rate": 4.4107128732841385e-08, | |
| "loss": 1.8401, | |
| "step": 11340 | |
| }, | |
| { | |
| "epoch": 3.8615293016926087, | |
| "grad_norm": 2.09375, | |
| "learning_rate": 4.20142657916557e-08, | |
| "loss": 1.9087, | |
| "step": 11350 | |
| }, | |
| { | |
| "epoch": 3.8649315301522496, | |
| "grad_norm": 2.140625, | |
| "learning_rate": 3.99721141502382e-08, | |
| "loss": 1.9401, | |
| "step": 11360 | |
| }, | |
| { | |
| "epoch": 3.868333758611891, | |
| "grad_norm": 2.328125, | |
| "learning_rate": 3.798068930921441e-08, | |
| "loss": 1.9699, | |
| "step": 11370 | |
| }, | |
| { | |
| "epoch": 3.8717359870715318, | |
| "grad_norm": 2.0625, | |
| "learning_rate": 3.6040006384174545e-08, | |
| "loss": 1.954, | |
| "step": 11380 | |
| }, | |
| { | |
| "epoch": 3.875138215531173, | |
| "grad_norm": 2.40625, | |
| "learning_rate": 3.4150080105563755e-08, | |
| "loss": 1.8693, | |
| "step": 11390 | |
| }, | |
| { | |
| "epoch": 3.878540443990814, | |
| "grad_norm": 2.078125, | |
| "learning_rate": 3.231092481856271e-08, | |
| "loss": 1.9307, | |
| "step": 11400 | |
| }, | |
| { | |
| "epoch": 3.881942672450455, | |
| "grad_norm": 2.328125, | |
| "learning_rate": 3.052255448298612e-08, | |
| "loss": 1.956, | |
| "step": 11410 | |
| }, | |
| { | |
| "epoch": 3.885344900910096, | |
| "grad_norm": 2.234375, | |
| "learning_rate": 2.878498267317298e-08, | |
| "loss": 1.9185, | |
| "step": 11420 | |
| }, | |
| { | |
| "epoch": 3.888747129369737, | |
| "grad_norm": 2.5, | |
| "learning_rate": 2.7098222577882825e-08, | |
| "loss": 1.8685, | |
| "step": 11430 | |
| }, | |
| { | |
| "epoch": 3.8921493578293784, | |
| "grad_norm": 2.328125, | |
| "learning_rate": 2.5462287000197963e-08, | |
| "loss": 1.9734, | |
| "step": 11440 | |
| }, | |
| { | |
| "epoch": 3.8955515862890193, | |
| "grad_norm": 2.09375, | |
| "learning_rate": 2.3877188357427174e-08, | |
| "loss": 1.8995, | |
| "step": 11450 | |
| }, | |
| { | |
| "epoch": 3.89895381474866, | |
| "grad_norm": 2.25, | |
| "learning_rate": 2.2342938681005695e-08, | |
| "loss": 1.8764, | |
| "step": 11460 | |
| }, | |
| { | |
| "epoch": 3.9023560432083015, | |
| "grad_norm": 2.265625, | |
| "learning_rate": 2.085954961641164e-08, | |
| "loss": 1.8865, | |
| "step": 11470 | |
| }, | |
| { | |
| "epoch": 3.9057582716679424, | |
| "grad_norm": 2.359375, | |
| "learning_rate": 1.9427032423071165e-08, | |
| "loss": 1.8932, | |
| "step": 11480 | |
| }, | |
| { | |
| "epoch": 3.9091605001275838, | |
| "grad_norm": 2.25, | |
| "learning_rate": 1.8045397974277166e-08, | |
| "loss": 1.9042, | |
| "step": 11490 | |
| }, | |
| { | |
| "epoch": 3.9125627285872246, | |
| "grad_norm": 2.1875, | |
| "learning_rate": 1.6714656757104883e-08, | |
| "loss": 1.94, | |
| "step": 11500 | |
| }, | |
| { | |
| "epoch": 3.9159649570468655, | |
| "grad_norm": 2.28125, | |
| "learning_rate": 1.5434818872331314e-08, | |
| "loss": 1.8879, | |
| "step": 11510 | |
| }, | |
| { | |
| "epoch": 3.919367185506507, | |
| "grad_norm": 2.046875, | |
| "learning_rate": 1.4205894034362065e-08, | |
| "loss": 1.9147, | |
| "step": 11520 | |
| }, | |
| { | |
| "epoch": 3.9227694139661478, | |
| "grad_norm": 2.484375, | |
| "learning_rate": 1.3027891571153722e-08, | |
| "loss": 1.8714, | |
| "step": 11530 | |
| }, | |
| { | |
| "epoch": 3.926171642425789, | |
| "grad_norm": 2.03125, | |
| "learning_rate": 1.1900820424145176e-08, | |
| "loss": 1.9371, | |
| "step": 11540 | |
| }, | |
| { | |
| "epoch": 3.92957387088543, | |
| "grad_norm": 2.0, | |
| "learning_rate": 1.0824689148190455e-08, | |
| "loss": 1.9505, | |
| "step": 11550 | |
| }, | |
| { | |
| "epoch": 3.932976099345071, | |
| "grad_norm": 2.453125, | |
| "learning_rate": 9.799505911490794e-09, | |
| "loss": 1.8738, | |
| "step": 11560 | |
| }, | |
| { | |
| "epoch": 3.936378327804712, | |
| "grad_norm": 2.328125, | |
| "learning_rate": 8.825278495535672e-09, | |
| "loss": 1.8447, | |
| "step": 11570 | |
| }, | |
| { | |
| "epoch": 3.939780556264353, | |
| "grad_norm": 2.28125, | |
| "learning_rate": 7.902014295042352e-09, | |
| "loss": 1.8987, | |
| "step": 11580 | |
| }, | |
| { | |
| "epoch": 3.9431827847239944, | |
| "grad_norm": 2.46875, | |
| "learning_rate": 7.029720317899902e-09, | |
| "loss": 1.9864, | |
| "step": 11590 | |
| }, | |
| { | |
| "epoch": 3.9465850131836353, | |
| "grad_norm": 2.796875, | |
| "learning_rate": 6.20840318511545e-09, | |
| "loss": 1.9454, | |
| "step": 11600 | |
| }, | |
| { | |
| "epoch": 3.949987241643276, | |
| "grad_norm": 2.59375, | |
| "learning_rate": 5.438069130766418e-09, | |
| "loss": 1.9871, | |
| "step": 11610 | |
| }, | |
| { | |
| "epoch": 3.9533894701029175, | |
| "grad_norm": 2.40625, | |
| "learning_rate": 4.718724001949017e-09, | |
| "loss": 1.8746, | |
| "step": 11620 | |
| }, | |
| { | |
| "epoch": 3.9567916985625584, | |
| "grad_norm": 2.46875, | |
| "learning_rate": 4.050373258737196e-09, | |
| "loss": 1.9578, | |
| "step": 11630 | |
| }, | |
| { | |
| "epoch": 3.9601939270221997, | |
| "grad_norm": 2.171875, | |
| "learning_rate": 3.4330219741408427e-09, | |
| "loss": 1.9242, | |
| "step": 11640 | |
| }, | |
| { | |
| "epoch": 3.9635961554818406, | |
| "grad_norm": 2.703125, | |
| "learning_rate": 2.8666748340662245e-09, | |
| "loss": 1.9133, | |
| "step": 11650 | |
| }, | |
| { | |
| "epoch": 3.9669983839414815, | |
| "grad_norm": 2.0625, | |
| "learning_rate": 2.351336137279413e-09, | |
| "loss": 1.9196, | |
| "step": 11660 | |
| }, | |
| { | |
| "epoch": 3.970400612401123, | |
| "grad_norm": 1.78125, | |
| "learning_rate": 1.887009795377922e-09, | |
| "loss": 1.9906, | |
| "step": 11670 | |
| }, | |
| { | |
| "epoch": 3.9738028408607637, | |
| "grad_norm": 2.296875, | |
| "learning_rate": 1.473699332754879e-09, | |
| "loss": 1.8989, | |
| "step": 11680 | |
| }, | |
| { | |
| "epoch": 3.977205069320405, | |
| "grad_norm": 2.609375, | |
| "learning_rate": 1.1114078865781264e-09, | |
| "loss": 1.8962, | |
| "step": 11690 | |
| }, | |
| { | |
| "epoch": 3.980607297780046, | |
| "grad_norm": 2.34375, | |
| "learning_rate": 8.001382067626036e-10, | |
| "loss": 1.944, | |
| "step": 11700 | |
| }, | |
| { | |
| "epoch": 3.984009526239687, | |
| "grad_norm": 2.265625, | |
| "learning_rate": 5.398926559516878e-10, | |
| "loss": 1.8959, | |
| "step": 11710 | |
| }, | |
| { | |
| "epoch": 3.987411754699328, | |
| "grad_norm": 2.328125, | |
| "learning_rate": 3.306732094962939e-10, | |
| "loss": 1.9388, | |
| "step": 11720 | |
| }, | |
| { | |
| "epoch": 3.990813983158969, | |
| "grad_norm": 2.359375, | |
| "learning_rate": 1.7248145544367861e-10, | |
| "loss": 1.9133, | |
| "step": 11730 | |
| }, | |
| { | |
| "epoch": 3.9942162116186104, | |
| "grad_norm": 1.96875, | |
| "learning_rate": 6.531859452325864e-11, | |
| "loss": 1.957, | |
| "step": 11740 | |
| }, | |
| { | |
| "epoch": 3.9976184400782513, | |
| "grad_norm": 2.3125, | |
| "learning_rate": 9.185440136907336e-12, | |
| "loss": 1.9494, | |
| "step": 11750 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 11756, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 4, | |
| "save_steps": 0, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 1.0768921731962634e+18, | |
| "train_batch_size": 4, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |