diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,8258 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.9996597771540356, + "eval_steps": 500, + "global_step": 11756, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.003402228459641065, + "grad_norm": 7.6875, + "learning_rate": 3.809091090277921e-07, + "loss": 4.24, + "step": 10 + }, + { + "epoch": 0.00680445691928213, + "grad_norm": 6.46875, + "learning_rate": 7.618182180555842e-07, + "loss": 4.4323, + "step": 20 + }, + { + "epoch": 0.010206685378923195, + "grad_norm": 8.4375, + "learning_rate": 1.1427273270833762e-06, + "loss": 4.2758, + "step": 30 + }, + { + "epoch": 0.01360891383856426, + "grad_norm": 7.53125, + "learning_rate": 1.5236364361111684e-06, + "loss": 4.1231, + "step": 40 + }, + { + "epoch": 0.017011142298205325, + "grad_norm": 5.90625, + "learning_rate": 1.9045455451389605e-06, + "loss": 4.097, + "step": 50 + }, + { + "epoch": 0.02041337075784639, + "grad_norm": 5.15625, + "learning_rate": 2.2854546541667524e-06, + "loss": 4.0712, + "step": 60 + }, + { + "epoch": 0.023815599217487455, + "grad_norm": 4.5625, + "learning_rate": 2.6663637631945448e-06, + "loss": 3.8851, + "step": 70 + }, + { + "epoch": 0.02721782767712852, + "grad_norm": 6.78125, + "learning_rate": 3.0472728722223367e-06, + "loss": 3.6937, + "step": 80 + }, + { + "epoch": 0.030620056136769585, + "grad_norm": 8.25, + "learning_rate": 3.4281819812501286e-06, + "loss": 3.6468, + "step": 90 + }, + { + "epoch": 0.03402228459641065, + "grad_norm": 9.625, + "learning_rate": 3.809091090277921e-06, + "loss": 3.4787, + "step": 100 + }, + { + "epoch": 0.03742451305605171, + "grad_norm": 7.53125, + "learning_rate": 4.190000199305713e-06, + "loss": 3.3235, + "step": 110 + }, + { + "epoch": 0.04082674151569278, + "grad_norm": 9.4375, + "learning_rate": 4.570909308333505e-06, + "loss": 3.2806, + "step": 120 + }, + { + "epoch": 0.04422896997533384, + "grad_norm": 10.3125, + "learning_rate": 4.951818417361297e-06, + "loss": 3.0432, + "step": 130 + }, + { + "epoch": 0.04763119843497491, + "grad_norm": 5.84375, + "learning_rate": 5.3327275263890896e-06, + "loss": 2.8991, + "step": 140 + }, + { + "epoch": 0.05103342689461597, + "grad_norm": 4.1875, + "learning_rate": 5.7136366354168815e-06, + "loss": 2.8202, + "step": 150 + }, + { + "epoch": 0.05443565535425704, + "grad_norm": 1.828125, + "learning_rate": 6.094545744444673e-06, + "loss": 2.6361, + "step": 160 + }, + { + "epoch": 0.0578378838138981, + "grad_norm": 1.8359375, + "learning_rate": 6.475454853472465e-06, + "loss": 2.5525, + "step": 170 + }, + { + "epoch": 0.06124011227353917, + "grad_norm": 1.765625, + "learning_rate": 6.856363962500257e-06, + "loss": 2.5685, + "step": 180 + }, + { + "epoch": 0.06464234073318023, + "grad_norm": 2.125, + "learning_rate": 7.237273071528049e-06, + "loss": 2.5133, + "step": 190 + }, + { + "epoch": 0.0680445691928213, + "grad_norm": 1.71875, + "learning_rate": 7.618182180555842e-06, + "loss": 2.4096, + "step": 200 + }, + { + "epoch": 0.07144679765246237, + "grad_norm": 1.9140625, + "learning_rate": 7.999091289583632e-06, + "loss": 2.4864, + "step": 210 + }, + { + "epoch": 0.07484902611210342, + "grad_norm": 1.9765625, + "learning_rate": 8.380000398611426e-06, + "loss": 2.4321, + "step": 220 + }, + { + "epoch": 0.07825125457174449, + "grad_norm": 2.3125, + "learning_rate": 8.760909507639218e-06, + "loss": 2.3582, + "step": 230 + }, + { + "epoch": 0.08165348303138556, + "grad_norm": 2.3125, + "learning_rate": 9.14181861666701e-06, + "loss": 2.3401, + "step": 240 + }, + { + "epoch": 0.08505571149102663, + "grad_norm": 2.625, + "learning_rate": 9.522727725694802e-06, + "loss": 2.3312, + "step": 250 + }, + { + "epoch": 0.08845793995066768, + "grad_norm": 1.9609375, + "learning_rate": 9.903636834722594e-06, + "loss": 2.3672, + "step": 260 + }, + { + "epoch": 0.09186016841030875, + "grad_norm": 1.453125, + "learning_rate": 1.0284545943750385e-05, + "loss": 2.3025, + "step": 270 + }, + { + "epoch": 0.09526239686994982, + "grad_norm": 1.46875, + "learning_rate": 1.0665455052778179e-05, + "loss": 2.3273, + "step": 280 + }, + { + "epoch": 0.09866462532959089, + "grad_norm": 2.25, + "learning_rate": 1.104636416180597e-05, + "loss": 2.2746, + "step": 290 + }, + { + "epoch": 0.10206685378923194, + "grad_norm": 1.5859375, + "learning_rate": 1.1427273270833763e-05, + "loss": 2.3196, + "step": 300 + }, + { + "epoch": 0.10546908224887301, + "grad_norm": 1.5078125, + "learning_rate": 1.1808182379861553e-05, + "loss": 2.2645, + "step": 310 + }, + { + "epoch": 0.10887131070851408, + "grad_norm": 1.6640625, + "learning_rate": 1.2189091488889347e-05, + "loss": 2.2902, + "step": 320 + }, + { + "epoch": 0.11227353916815515, + "grad_norm": 1.5859375, + "learning_rate": 1.2570000597917139e-05, + "loss": 2.2503, + "step": 330 + }, + { + "epoch": 0.1156757676277962, + "grad_norm": 1.5, + "learning_rate": 1.295090970694493e-05, + "loss": 2.1882, + "step": 340 + }, + { + "epoch": 0.11907799608743727, + "grad_norm": 1.359375, + "learning_rate": 1.3331818815972723e-05, + "loss": 2.2266, + "step": 350 + }, + { + "epoch": 0.12248022454707834, + "grad_norm": 1.8125, + "learning_rate": 1.344607904627746e-05, + "loss": 2.2011, + "step": 360 + }, + { + "epoch": 0.1258824530067194, + "grad_norm": 1.4765625, + "learning_rate": 1.3446017810126854e-05, + "loss": 2.1828, + "step": 370 + }, + { + "epoch": 0.12928468146636046, + "grad_norm": 1.5234375, + "learning_rate": 1.3445905544333626e-05, + "loss": 2.2727, + "step": 380 + }, + { + "epoch": 0.13268690992600152, + "grad_norm": 1.6328125, + "learning_rate": 1.344574224974991e-05, + "loss": 2.2222, + "step": 390 + }, + { + "epoch": 0.1360891383856426, + "grad_norm": 1.59375, + "learning_rate": 1.3445527927615165e-05, + "loss": 2.2107, + "step": 400 + }, + { + "epoch": 0.13949136684528365, + "grad_norm": 1.515625, + "learning_rate": 1.3445262579556173e-05, + "loss": 2.1671, + "step": 410 + }, + { + "epoch": 0.14289359530492474, + "grad_norm": 1.3671875, + "learning_rate": 1.3444946207587011e-05, + "loss": 2.1878, + "step": 420 + }, + { + "epoch": 0.1462958237645658, + "grad_norm": 1.4453125, + "learning_rate": 1.3444578814109056e-05, + "loss": 2.1358, + "step": 430 + }, + { + "epoch": 0.14969805222420685, + "grad_norm": 1.734375, + "learning_rate": 1.3444160401910943e-05, + "loss": 2.1564, + "step": 440 + }, + { + "epoch": 0.15310028068384793, + "grad_norm": 1.4765625, + "learning_rate": 1.3443690974168565e-05, + "loss": 2.1756, + "step": 450 + }, + { + "epoch": 0.15650250914348898, + "grad_norm": 1.546875, + "learning_rate": 1.344317053444504e-05, + "loss": 2.1606, + "step": 460 + }, + { + "epoch": 0.15990473760313004, + "grad_norm": 1.78125, + "learning_rate": 1.344259908669068e-05, + "loss": 2.2352, + "step": 470 + }, + { + "epoch": 0.16330696606277112, + "grad_norm": 1.5078125, + "learning_rate": 1.3441976635242969e-05, + "loss": 2.1258, + "step": 480 + }, + { + "epoch": 0.16670919452241217, + "grad_norm": 1.6484375, + "learning_rate": 1.3441303184826526e-05, + "loss": 2.1533, + "step": 490 + }, + { + "epoch": 0.17011142298205326, + "grad_norm": 1.78125, + "learning_rate": 1.3440578740553065e-05, + "loss": 2.1179, + "step": 500 + }, + { + "epoch": 0.1735136514416943, + "grad_norm": 1.484375, + "learning_rate": 1.3439803307921367e-05, + "loss": 2.1868, + "step": 510 + }, + { + "epoch": 0.17691587990133537, + "grad_norm": 1.671875, + "learning_rate": 1.343897689281723e-05, + "loss": 2.1144, + "step": 520 + }, + { + "epoch": 0.18031810836097645, + "grad_norm": 1.5078125, + "learning_rate": 1.343809950151342e-05, + "loss": 2.1722, + "step": 530 + }, + { + "epoch": 0.1837203368206175, + "grad_norm": 1.6171875, + "learning_rate": 1.3437171140669643e-05, + "loss": 2.1725, + "step": 540 + }, + { + "epoch": 0.18712256528025856, + "grad_norm": 1.5234375, + "learning_rate": 1.3436191817332471e-05, + "loss": 2.1871, + "step": 550 + }, + { + "epoch": 0.19052479373989964, + "grad_norm": 1.7890625, + "learning_rate": 1.3435161538935297e-05, + "loss": 2.2134, + "step": 560 + }, + { + "epoch": 0.1939270221995407, + "grad_norm": 1.78125, + "learning_rate": 1.3434080313298288e-05, + "loss": 2.1545, + "step": 570 + }, + { + "epoch": 0.19732925065918178, + "grad_norm": 1.6328125, + "learning_rate": 1.3432948148628312e-05, + "loss": 2.1173, + "step": 580 + }, + { + "epoch": 0.20073147911882283, + "grad_norm": 1.640625, + "learning_rate": 1.3431765053518884e-05, + "loss": 2.1703, + "step": 590 + }, + { + "epoch": 0.20413370757846389, + "grad_norm": 1.6796875, + "learning_rate": 1.3430531036950099e-05, + "loss": 2.1662, + "step": 600 + }, + { + "epoch": 0.20753593603810497, + "grad_norm": 1.6171875, + "learning_rate": 1.3429246108288562e-05, + "loss": 2.153, + "step": 610 + }, + { + "epoch": 0.21093816449774602, + "grad_norm": 1.6328125, + "learning_rate": 1.3427910277287318e-05, + "loss": 2.1421, + "step": 620 + }, + { + "epoch": 0.21434039295738708, + "grad_norm": 1.4453125, + "learning_rate": 1.3426523554085776e-05, + "loss": 2.1315, + "step": 630 + }, + { + "epoch": 0.21774262141702816, + "grad_norm": 1.5703125, + "learning_rate": 1.342508594920964e-05, + "loss": 2.1187, + "step": 640 + }, + { + "epoch": 0.22114484987666921, + "grad_norm": 1.7578125, + "learning_rate": 1.342359747357082e-05, + "loss": 2.1447, + "step": 650 + }, + { + "epoch": 0.2245470783363103, + "grad_norm": 1.671875, + "learning_rate": 1.3422058138467349e-05, + "loss": 2.1614, + "step": 660 + }, + { + "epoch": 0.22794930679595135, + "grad_norm": 1.5390625, + "learning_rate": 1.3420467955583304e-05, + "loss": 2.1521, + "step": 670 + }, + { + "epoch": 0.2313515352555924, + "grad_norm": 1.6953125, + "learning_rate": 1.3418826936988714e-05, + "loss": 2.1474, + "step": 680 + }, + { + "epoch": 0.2347537637152335, + "grad_norm": 1.6484375, + "learning_rate": 1.3417135095139467e-05, + "loss": 2.1887, + "step": 690 + }, + { + "epoch": 0.23815599217487454, + "grad_norm": 1.71875, + "learning_rate": 1.341539244287722e-05, + "loss": 2.1432, + "step": 700 + }, + { + "epoch": 0.2415582206345156, + "grad_norm": 1.8046875, + "learning_rate": 1.3413598993429295e-05, + "loss": 2.1202, + "step": 710 + }, + { + "epoch": 0.24496044909415668, + "grad_norm": 1.7578125, + "learning_rate": 1.3411754760408584e-05, + "loss": 2.201, + "step": 720 + }, + { + "epoch": 0.24836267755379773, + "grad_norm": 1.5390625, + "learning_rate": 1.3409859757813437e-05, + "loss": 2.104, + "step": 730 + }, + { + "epoch": 0.2517649060134388, + "grad_norm": 1.703125, + "learning_rate": 1.3407914000027573e-05, + "loss": 2.1118, + "step": 740 + }, + { + "epoch": 0.25516713447307987, + "grad_norm": 1.5546875, + "learning_rate": 1.3405917501819956e-05, + "loss": 2.1533, + "step": 750 + }, + { + "epoch": 0.2585693629327209, + "grad_norm": 1.3828125, + "learning_rate": 1.340387027834468e-05, + "loss": 2.0738, + "step": 760 + }, + { + "epoch": 0.261971591392362, + "grad_norm": 1.625, + "learning_rate": 1.3401772345140874e-05, + "loss": 2.1696, + "step": 770 + }, + { + "epoch": 0.26537381985200303, + "grad_norm": 1.921875, + "learning_rate": 1.3399623718132557e-05, + "loss": 2.0847, + "step": 780 + }, + { + "epoch": 0.26877604831164414, + "grad_norm": 1.5390625, + "learning_rate": 1.3397424413628542e-05, + "loss": 2.1644, + "step": 790 + }, + { + "epoch": 0.2721782767712852, + "grad_norm": 1.640625, + "learning_rate": 1.3395174448322298e-05, + "loss": 2.0891, + "step": 800 + }, + { + "epoch": 0.27558050523092625, + "grad_norm": 1.9453125, + "learning_rate": 1.3392873839291825e-05, + "loss": 2.1638, + "step": 810 + }, + { + "epoch": 0.2789827336905673, + "grad_norm": 1.625, + "learning_rate": 1.339052260399953e-05, + "loss": 2.078, + "step": 820 + }, + { + "epoch": 0.28238496215020836, + "grad_norm": 1.7890625, + "learning_rate": 1.3388120760292085e-05, + "loss": 2.1191, + "step": 830 + }, + { + "epoch": 0.2857871906098495, + "grad_norm": 1.765625, + "learning_rate": 1.33856683264003e-05, + "loss": 2.0554, + "step": 840 + }, + { + "epoch": 0.2891894190694905, + "grad_norm": 1.8203125, + "learning_rate": 1.3383165320938983e-05, + "loss": 2.0385, + "step": 850 + }, + { + "epoch": 0.2925916475291316, + "grad_norm": 1.7109375, + "learning_rate": 1.3380611762906796e-05, + "loss": 2.1071, + "step": 860 + }, + { + "epoch": 0.29599387598877264, + "grad_norm": 1.6640625, + "learning_rate": 1.3378007671686113e-05, + "loss": 2.1171, + "step": 870 + }, + { + "epoch": 0.2993961044484137, + "grad_norm": 1.4609375, + "learning_rate": 1.337535306704287e-05, + "loss": 2.1264, + "step": 880 + }, + { + "epoch": 0.3027983329080548, + "grad_norm": 1.75, + "learning_rate": 1.337264796912642e-05, + "loss": 2.0562, + "step": 890 + }, + { + "epoch": 0.30620056136769586, + "grad_norm": 1.78125, + "learning_rate": 1.3369892398469373e-05, + "loss": 2.1343, + "step": 900 + }, + { + "epoch": 0.3096027898273369, + "grad_norm": 1.53125, + "learning_rate": 1.3367086375987447e-05, + "loss": 2.0563, + "step": 910 + }, + { + "epoch": 0.31300501828697797, + "grad_norm": 1.7578125, + "learning_rate": 1.3364229922979311e-05, + "loss": 2.1302, + "step": 920 + }, + { + "epoch": 0.316407246746619, + "grad_norm": 1.609375, + "learning_rate": 1.3361323061126409e-05, + "loss": 2.0733, + "step": 930 + }, + { + "epoch": 0.3198094752062601, + "grad_norm": 1.921875, + "learning_rate": 1.3358365812492812e-05, + "loss": 2.1027, + "step": 940 + }, + { + "epoch": 0.3232117036659012, + "grad_norm": 1.7265625, + "learning_rate": 1.3355358199525042e-05, + "loss": 2.0455, + "step": 950 + }, + { + "epoch": 0.32661393212554224, + "grad_norm": 1.6953125, + "learning_rate": 1.3352300245051904e-05, + "loss": 2.0785, + "step": 960 + }, + { + "epoch": 0.3300161605851833, + "grad_norm": 1.671875, + "learning_rate": 1.3349191972284314e-05, + "loss": 2.1594, + "step": 970 + }, + { + "epoch": 0.33341838904482435, + "grad_norm": 1.78125, + "learning_rate": 1.3346033404815114e-05, + "loss": 2.066, + "step": 980 + }, + { + "epoch": 0.3368206175044654, + "grad_norm": 1.59375, + "learning_rate": 1.3342824566618907e-05, + "loss": 2.1451, + "step": 990 + }, + { + "epoch": 0.3402228459641065, + "grad_norm": 1.6953125, + "learning_rate": 1.3339565482051866e-05, + "loss": 2.152, + "step": 1000 + }, + { + "epoch": 0.34362507442374757, + "grad_norm": 1.7109375, + "learning_rate": 1.3336256175851549e-05, + "loss": 2.1232, + "step": 1010 + }, + { + "epoch": 0.3470273028833886, + "grad_norm": 1.8828125, + "learning_rate": 1.3332896673136717e-05, + "loss": 2.1158, + "step": 1020 + }, + { + "epoch": 0.3504295313430297, + "grad_norm": 1.7421875, + "learning_rate": 1.3329486999407136e-05, + "loss": 2.102, + "step": 1030 + }, + { + "epoch": 0.35383175980267073, + "grad_norm": 1.8125, + "learning_rate": 1.3326027180543387e-05, + "loss": 2.1266, + "step": 1040 + }, + { + "epoch": 0.35723398826231184, + "grad_norm": 1.421875, + "learning_rate": 1.3322517242806673e-05, + "loss": 2.0884, + "step": 1050 + }, + { + "epoch": 0.3606362167219529, + "grad_norm": 1.5546875, + "learning_rate": 1.3318957212838615e-05, + "loss": 2.0793, + "step": 1060 + }, + { + "epoch": 0.36403844518159395, + "grad_norm": 1.78125, + "learning_rate": 1.3315347117661048e-05, + "loss": 2.0574, + "step": 1070 + }, + { + "epoch": 0.367440673641235, + "grad_norm": 1.6171875, + "learning_rate": 1.3311686984675822e-05, + "loss": 2.0716, + "step": 1080 + }, + { + "epoch": 0.37084290210087606, + "grad_norm": 1.8671875, + "learning_rate": 1.3307976841664591e-05, + "loss": 2.0523, + "step": 1090 + }, + { + "epoch": 0.3742451305605171, + "grad_norm": 1.703125, + "learning_rate": 1.33042167167886e-05, + "loss": 2.0203, + "step": 1100 + }, + { + "epoch": 0.3776473590201582, + "grad_norm": 1.546875, + "learning_rate": 1.330040663858848e-05, + "loss": 2.0823, + "step": 1110 + }, + { + "epoch": 0.3810495874797993, + "grad_norm": 1.796875, + "learning_rate": 1.3296546635984012e-05, + "loss": 2.0758, + "step": 1120 + }, + { + "epoch": 0.38445181593944033, + "grad_norm": 1.7421875, + "learning_rate": 1.3292636738273931e-05, + "loss": 2.1138, + "step": 1130 + }, + { + "epoch": 0.3878540443990814, + "grad_norm": 1.5, + "learning_rate": 1.3288676975135689e-05, + "loss": 2.0277, + "step": 1140 + }, + { + "epoch": 0.39125627285872244, + "grad_norm": 1.5703125, + "learning_rate": 1.3284667376625236e-05, + "loss": 2.042, + "step": 1150 + }, + { + "epoch": 0.39465850131836355, + "grad_norm": 1.8515625, + "learning_rate": 1.3280607973176785e-05, + "loss": 2.114, + "step": 1160 + }, + { + "epoch": 0.3980607297780046, + "grad_norm": 1.796875, + "learning_rate": 1.327649879560259e-05, + "loss": 2.0477, + "step": 1170 + }, + { + "epoch": 0.40146295823764566, + "grad_norm": 1.8046875, + "learning_rate": 1.3272339875092701e-05, + "loss": 2.0101, + "step": 1180 + }, + { + "epoch": 0.4048651866972867, + "grad_norm": 1.984375, + "learning_rate": 1.3268131243214744e-05, + "loss": 2.1261, + "step": 1190 + }, + { + "epoch": 0.40826741515692777, + "grad_norm": 1.9375, + "learning_rate": 1.326387293191366e-05, + "loss": 2.0788, + "step": 1200 + }, + { + "epoch": 0.4116696436165688, + "grad_norm": 1.78125, + "learning_rate": 1.325956497351148e-05, + "loss": 2.0694, + "step": 1210 + }, + { + "epoch": 0.41507187207620994, + "grad_norm": 1.9296875, + "learning_rate": 1.3255207400707076e-05, + "loss": 2.11, + "step": 1220 + }, + { + "epoch": 0.418474100535851, + "grad_norm": 1.796875, + "learning_rate": 1.3250800246575906e-05, + "loss": 2.0621, + "step": 1230 + }, + { + "epoch": 0.42187632899549204, + "grad_norm": 1.6875, + "learning_rate": 1.3246343544569764e-05, + "loss": 2.0923, + "step": 1240 + }, + { + "epoch": 0.4252785574551331, + "grad_norm": 1.6640625, + "learning_rate": 1.3241837328516535e-05, + "loss": 2.1005, + "step": 1250 + }, + { + "epoch": 0.42868078591477415, + "grad_norm": 1.953125, + "learning_rate": 1.323728163261993e-05, + "loss": 2.0634, + "step": 1260 + }, + { + "epoch": 0.43208301437441526, + "grad_norm": 1.859375, + "learning_rate": 1.323267649145923e-05, + "loss": 2.0635, + "step": 1270 + }, + { + "epoch": 0.4354852428340563, + "grad_norm": 1.640625, + "learning_rate": 1.3228021939989018e-05, + "loss": 2.131, + "step": 1280 + }, + { + "epoch": 0.4388874712936974, + "grad_norm": 1.7421875, + "learning_rate": 1.3223318013538927e-05, + "loss": 2.1021, + "step": 1290 + }, + { + "epoch": 0.44228969975333843, + "grad_norm": 1.734375, + "learning_rate": 1.3218564747813355e-05, + "loss": 2.0758, + "step": 1300 + }, + { + "epoch": 0.4456919282129795, + "grad_norm": 1.6953125, + "learning_rate": 1.3213762178891202e-05, + "loss": 2.0198, + "step": 1310 + }, + { + "epoch": 0.4490941566726206, + "grad_norm": 1.8515625, + "learning_rate": 1.3208910343225603e-05, + "loss": 2.1226, + "step": 1320 + }, + { + "epoch": 0.45249638513226165, + "grad_norm": 1.703125, + "learning_rate": 1.3204009277643636e-05, + "loss": 2.077, + "step": 1330 + }, + { + "epoch": 0.4558986135919027, + "grad_norm": 1.6953125, + "learning_rate": 1.3199059019346055e-05, + "loss": 2.1154, + "step": 1340 + }, + { + "epoch": 0.45930084205154376, + "grad_norm": 1.8984375, + "learning_rate": 1.3194059605907003e-05, + "loss": 2.1109, + "step": 1350 + }, + { + "epoch": 0.4627030705111848, + "grad_norm": 1.8203125, + "learning_rate": 1.318901107527373e-05, + "loss": 2.1108, + "step": 1360 + }, + { + "epoch": 0.46610529897082587, + "grad_norm": 2.09375, + "learning_rate": 1.3183913465766294e-05, + "loss": 2.1203, + "step": 1370 + }, + { + "epoch": 0.469507527430467, + "grad_norm": 1.8671875, + "learning_rate": 1.3178766816077288e-05, + "loss": 2.0667, + "step": 1380 + }, + { + "epoch": 0.47290975589010803, + "grad_norm": 1.8671875, + "learning_rate": 1.317357116527153e-05, + "loss": 2.0428, + "step": 1390 + }, + { + "epoch": 0.4763119843497491, + "grad_norm": 1.703125, + "learning_rate": 1.3168326552785775e-05, + "loss": 2.0836, + "step": 1400 + }, + { + "epoch": 0.47971421280939014, + "grad_norm": 1.6015625, + "learning_rate": 1.3163033018428418e-05, + "loss": 2.0031, + "step": 1410 + }, + { + "epoch": 0.4831164412690312, + "grad_norm": 2.0625, + "learning_rate": 1.315769060237918e-05, + "loss": 2.096, + "step": 1420 + }, + { + "epoch": 0.4865186697286723, + "grad_norm": 1.828125, + "learning_rate": 1.3152299345188815e-05, + "loss": 2.0325, + "step": 1430 + }, + { + "epoch": 0.48992089818831336, + "grad_norm": 1.65625, + "learning_rate": 1.3146859287778799e-05, + "loss": 2.0444, + "step": 1440 + }, + { + "epoch": 0.4933231266479544, + "grad_norm": 2.140625, + "learning_rate": 1.3141370471441016e-05, + "loss": 2.0971, + "step": 1450 + }, + { + "epoch": 0.49672535510759547, + "grad_norm": 2.0, + "learning_rate": 1.3135832937837444e-05, + "loss": 2.0014, + "step": 1460 + }, + { + "epoch": 0.5001275835672365, + "grad_norm": 1.6796875, + "learning_rate": 1.3130246728999852e-05, + "loss": 2.0086, + "step": 1470 + }, + { + "epoch": 0.5035298120268776, + "grad_norm": 1.78125, + "learning_rate": 1.3124611887329459e-05, + "loss": 2.0079, + "step": 1480 + }, + { + "epoch": 0.5069320404865186, + "grad_norm": 1.9296875, + "learning_rate": 1.3118928455596627e-05, + "loss": 2.0654, + "step": 1490 + }, + { + "epoch": 0.5103342689461597, + "grad_norm": 1.875, + "learning_rate": 1.3113196476940538e-05, + "loss": 2.0195, + "step": 1500 + }, + { + "epoch": 0.5137364974058009, + "grad_norm": 1.8203125, + "learning_rate": 1.3107415994868855e-05, + "loss": 2.0196, + "step": 1510 + }, + { + "epoch": 0.5171387258654419, + "grad_norm": 2.125, + "learning_rate": 1.3101587053257404e-05, + "loss": 2.0552, + "step": 1520 + }, + { + "epoch": 0.520540954325083, + "grad_norm": 1.734375, + "learning_rate": 1.3095709696349833e-05, + "loss": 2.0833, + "step": 1530 + }, + { + "epoch": 0.523943182784724, + "grad_norm": 1.765625, + "learning_rate": 1.3089783968757277e-05, + "loss": 2.1067, + "step": 1540 + }, + { + "epoch": 0.5273454112443651, + "grad_norm": 1.9921875, + "learning_rate": 1.308380991545802e-05, + "loss": 2.0313, + "step": 1550 + }, + { + "epoch": 0.5307476397040061, + "grad_norm": 1.9296875, + "learning_rate": 1.3077787581797163e-05, + "loss": 2.0918, + "step": 1560 + }, + { + "epoch": 0.5341498681636472, + "grad_norm": 1.609375, + "learning_rate": 1.3071717013486259e-05, + "loss": 2.0505, + "step": 1570 + }, + { + "epoch": 0.5375520966232883, + "grad_norm": 1.421875, + "learning_rate": 1.3065598256602989e-05, + "loss": 2.1166, + "step": 1580 + }, + { + "epoch": 0.5409543250829293, + "grad_norm": 1.6015625, + "learning_rate": 1.3059431357590797e-05, + "loss": 2.1196, + "step": 1590 + }, + { + "epoch": 0.5443565535425704, + "grad_norm": 1.765625, + "learning_rate": 1.3053216363258537e-05, + "loss": 2.0623, + "step": 1600 + }, + { + "epoch": 0.5477587820022114, + "grad_norm": 1.671875, + "learning_rate": 1.3046953320780136e-05, + "loss": 2.051, + "step": 1610 + }, + { + "epoch": 0.5511610104618525, + "grad_norm": 1.734375, + "learning_rate": 1.304064227769421e-05, + "loss": 2.0341, + "step": 1620 + }, + { + "epoch": 0.5545632389214936, + "grad_norm": 1.8671875, + "learning_rate": 1.3034283281903722e-05, + "loss": 2.001, + "step": 1630 + }, + { + "epoch": 0.5579654673811346, + "grad_norm": 2.125, + "learning_rate": 1.3027876381675611e-05, + "loss": 1.9871, + "step": 1640 + }, + { + "epoch": 0.5613676958407757, + "grad_norm": 1.8359375, + "learning_rate": 1.3021421625640427e-05, + "loss": 2.0712, + "step": 1650 + }, + { + "epoch": 0.5647699243004167, + "grad_norm": 1.8671875, + "learning_rate": 1.3014919062791965e-05, + "loss": 2.0444, + "step": 1660 + }, + { + "epoch": 0.5681721527600578, + "grad_norm": 1.9609375, + "learning_rate": 1.3008368742486882e-05, + "loss": 2.0598, + "step": 1670 + }, + { + "epoch": 0.571574381219699, + "grad_norm": 1.8828125, + "learning_rate": 1.300177071444434e-05, + "loss": 2.0744, + "step": 1680 + }, + { + "epoch": 0.57497660967934, + "grad_norm": 2.109375, + "learning_rate": 1.299512502874561e-05, + "loss": 1.9854, + "step": 1690 + }, + { + "epoch": 0.578378838138981, + "grad_norm": 2.0, + "learning_rate": 1.2988431735833709e-05, + "loss": 2.0348, + "step": 1700 + }, + { + "epoch": 0.581781066598622, + "grad_norm": 1.84375, + "learning_rate": 1.2981690886513001e-05, + "loss": 2.0189, + "step": 1710 + }, + { + "epoch": 0.5851832950582632, + "grad_norm": 1.875, + "learning_rate": 1.2974902531948826e-05, + "loss": 1.9997, + "step": 1720 + }, + { + "epoch": 0.5885855235179043, + "grad_norm": 1.6640625, + "learning_rate": 1.2968066723667104e-05, + "loss": 1.9861, + "step": 1730 + }, + { + "epoch": 0.5919877519775453, + "grad_norm": 1.796875, + "learning_rate": 1.2961183513553937e-05, + "loss": 2.0284, + "step": 1740 + }, + { + "epoch": 0.5953899804371864, + "grad_norm": 1.734375, + "learning_rate": 1.2954252953855236e-05, + "loss": 2.0376, + "step": 1750 + }, + { + "epoch": 0.5987922088968274, + "grad_norm": 1.7734375, + "learning_rate": 1.2947275097176301e-05, + "loss": 2.0059, + "step": 1760 + }, + { + "epoch": 0.6021944373564685, + "grad_norm": 2.09375, + "learning_rate": 1.2940249996481436e-05, + "loss": 2.0906, + "step": 1770 + }, + { + "epoch": 0.6055966658161096, + "grad_norm": 1.8359375, + "learning_rate": 1.2933177705093541e-05, + "loss": 2.0076, + "step": 1780 + }, + { + "epoch": 0.6089988942757506, + "grad_norm": 1.7265625, + "learning_rate": 1.2926058276693715e-05, + "loss": 2.0247, + "step": 1790 + }, + { + "epoch": 0.6124011227353917, + "grad_norm": 1.8359375, + "learning_rate": 1.2918891765320837e-05, + "loss": 2.113, + "step": 1800 + }, + { + "epoch": 0.6158033511950327, + "grad_norm": 1.671875, + "learning_rate": 1.2911678225371164e-05, + "loss": 2.0201, + "step": 1810 + }, + { + "epoch": 0.6192055796546738, + "grad_norm": 1.8828125, + "learning_rate": 1.2904417711597916e-05, + "loss": 2.0172, + "step": 1820 + }, + { + "epoch": 0.6226078081143149, + "grad_norm": 1.9609375, + "learning_rate": 1.289711027911086e-05, + "loss": 2.1396, + "step": 1830 + }, + { + "epoch": 0.6260100365739559, + "grad_norm": 1.75, + "learning_rate": 1.2889755983375892e-05, + "loss": 2.045, + "step": 1840 + }, + { + "epoch": 0.629412265033597, + "grad_norm": 1.9375, + "learning_rate": 1.2882354880214616e-05, + "loss": 2.012, + "step": 1850 + }, + { + "epoch": 0.632814493493238, + "grad_norm": 1.8671875, + "learning_rate": 1.2874907025803922e-05, + "loss": 2.058, + "step": 1860 + }, + { + "epoch": 0.6362167219528791, + "grad_norm": 1.8359375, + "learning_rate": 1.2867412476675554e-05, + "loss": 2.0796, + "step": 1870 + }, + { + "epoch": 0.6396189504125201, + "grad_norm": 1.8671875, + "learning_rate": 1.2859871289715688e-05, + "loss": 2.0956, + "step": 1880 + }, + { + "epoch": 0.6430211788721613, + "grad_norm": 1.7421875, + "learning_rate": 1.2852283522164496e-05, + "loss": 1.983, + "step": 1890 + }, + { + "epoch": 0.6464234073318024, + "grad_norm": 1.921875, + "learning_rate": 1.2844649231615713e-05, + "loss": 1.9861, + "step": 1900 + }, + { + "epoch": 0.6498256357914434, + "grad_norm": 1.890625, + "learning_rate": 1.2836968476016196e-05, + "loss": 2.0683, + "step": 1910 + }, + { + "epoch": 0.6532278642510845, + "grad_norm": 1.6875, + "learning_rate": 1.2829241313665494e-05, + "loss": 2.0916, + "step": 1920 + }, + { + "epoch": 0.6566300927107255, + "grad_norm": 1.609375, + "learning_rate": 1.2821467803215395e-05, + "loss": 2.0254, + "step": 1930 + }, + { + "epoch": 0.6600323211703666, + "grad_norm": 1.9765625, + "learning_rate": 1.2813648003669482e-05, + "loss": 2.0332, + "step": 1940 + }, + { + "epoch": 0.6634345496300077, + "grad_norm": 1.9140625, + "learning_rate": 1.2805781974382694e-05, + "loss": 2.0225, + "step": 1950 + }, + { + "epoch": 0.6668367780896487, + "grad_norm": 1.859375, + "learning_rate": 1.2797869775060866e-05, + "loss": 2.0563, + "step": 1960 + }, + { + "epoch": 0.6702390065492898, + "grad_norm": 1.6953125, + "learning_rate": 1.2789911465760281e-05, + "loss": 2.0027, + "step": 1970 + }, + { + "epoch": 0.6736412350089308, + "grad_norm": 1.890625, + "learning_rate": 1.2781907106887209e-05, + "loss": 1.9895, + "step": 1980 + }, + { + "epoch": 0.6770434634685719, + "grad_norm": 2.015625, + "learning_rate": 1.2773856759197455e-05, + "loss": 2.0175, + "step": 1990 + }, + { + "epoch": 0.680445691928213, + "grad_norm": 1.7890625, + "learning_rate": 1.2765760483795895e-05, + "loss": 2.0702, + "step": 2000 + }, + { + "epoch": 0.683847920387854, + "grad_norm": 1.796875, + "learning_rate": 1.275761834213601e-05, + "loss": 2.023, + "step": 2010 + }, + { + "epoch": 0.6872501488474951, + "grad_norm": 1.9140625, + "learning_rate": 1.2749430396019423e-05, + "loss": 2.0051, + "step": 2020 + }, + { + "epoch": 0.6906523773071361, + "grad_norm": 1.9765625, + "learning_rate": 1.2741196707595429e-05, + "loss": 2.017, + "step": 2030 + }, + { + "epoch": 0.6940546057667772, + "grad_norm": 1.9296875, + "learning_rate": 1.273291733936052e-05, + "loss": 2.0481, + "step": 2040 + }, + { + "epoch": 0.6974568342264184, + "grad_norm": 1.7265625, + "learning_rate": 1.2724592354157912e-05, + "loss": 2.0281, + "step": 2050 + }, + { + "epoch": 0.7008590626860594, + "grad_norm": 1.8984375, + "learning_rate": 1.2716221815177076e-05, + "loss": 2.0459, + "step": 2060 + }, + { + "epoch": 0.7042612911457005, + "grad_norm": 2.21875, + "learning_rate": 1.2707805785953245e-05, + "loss": 2.0705, + "step": 2070 + }, + { + "epoch": 0.7076635196053415, + "grad_norm": 2.109375, + "learning_rate": 1.2699344330366942e-05, + "loss": 2.0759, + "step": 2080 + }, + { + "epoch": 0.7110657480649826, + "grad_norm": 1.765625, + "learning_rate": 1.2690837512643495e-05, + "loss": 2.0324, + "step": 2090 + }, + { + "epoch": 0.7144679765246237, + "grad_norm": 1.75, + "learning_rate": 1.2682285397352535e-05, + "loss": 1.9784, + "step": 2100 + }, + { + "epoch": 0.7178702049842647, + "grad_norm": 1.9140625, + "learning_rate": 1.2673688049407526e-05, + "loss": 1.9902, + "step": 2110 + }, + { + "epoch": 0.7212724334439058, + "grad_norm": 1.890625, + "learning_rate": 1.266504553406526e-05, + "loss": 2.0631, + "step": 2120 + }, + { + "epoch": 0.7246746619035468, + "grad_norm": 2.015625, + "learning_rate": 1.2656357916925368e-05, + "loss": 2.0039, + "step": 2130 + }, + { + "epoch": 0.7280768903631879, + "grad_norm": 2.15625, + "learning_rate": 1.2647625263929817e-05, + "loss": 1.9975, + "step": 2140 + }, + { + "epoch": 0.7314791188228289, + "grad_norm": 1.71875, + "learning_rate": 1.2638847641362408e-05, + "loss": 2.0368, + "step": 2150 + }, + { + "epoch": 0.73488134728247, + "grad_norm": 1.9296875, + "learning_rate": 1.2630025115848282e-05, + "loss": 2.0954, + "step": 2160 + }, + { + "epoch": 0.7382835757421111, + "grad_norm": 1.6484375, + "learning_rate": 1.2621157754353404e-05, + "loss": 2.0297, + "step": 2170 + }, + { + "epoch": 0.7416858042017521, + "grad_norm": 1.65625, + "learning_rate": 1.2612245624184062e-05, + "loss": 2.0445, + "step": 2180 + }, + { + "epoch": 0.7450880326613932, + "grad_norm": 1.7578125, + "learning_rate": 1.2603288792986354e-05, + "loss": 2.0587, + "step": 2190 + }, + { + "epoch": 0.7484902611210342, + "grad_norm": 1.8203125, + "learning_rate": 1.2594287328745672e-05, + "loss": 2.0126, + "step": 2200 + }, + { + "epoch": 0.7518924895806753, + "grad_norm": 1.7890625, + "learning_rate": 1.258524129978619e-05, + "loss": 2.0213, + "step": 2210 + }, + { + "epoch": 0.7552947180403164, + "grad_norm": 1.953125, + "learning_rate": 1.257615077477034e-05, + "loss": 1.9826, + "step": 2220 + }, + { + "epoch": 0.7586969464999574, + "grad_norm": 1.8515625, + "learning_rate": 1.25670158226983e-05, + "loss": 2.0467, + "step": 2230 + }, + { + "epoch": 0.7620991749595986, + "grad_norm": 1.9765625, + "learning_rate": 1.2557836512907456e-05, + "loss": 1.9924, + "step": 2240 + }, + { + "epoch": 0.7655014034192396, + "grad_norm": 2.140625, + "learning_rate": 1.2548612915071894e-05, + "loss": 1.9864, + "step": 2250 + }, + { + "epoch": 0.7689036318788807, + "grad_norm": 1.921875, + "learning_rate": 1.2539345099201851e-05, + "loss": 1.9966, + "step": 2260 + }, + { + "epoch": 0.7723058603385218, + "grad_norm": 1.875, + "learning_rate": 1.2530033135643203e-05, + "loss": 2.0092, + "step": 2270 + }, + { + "epoch": 0.7757080887981628, + "grad_norm": 2.1875, + "learning_rate": 1.2520677095076918e-05, + "loss": 1.97, + "step": 2280 + }, + { + "epoch": 0.7791103172578039, + "grad_norm": 1.96875, + "learning_rate": 1.2511277048518522e-05, + "loss": 1.9781, + "step": 2290 + }, + { + "epoch": 0.7825125457174449, + "grad_norm": 1.953125, + "learning_rate": 1.2501833067317562e-05, + "loss": 2.0167, + "step": 2300 + }, + { + "epoch": 0.785914774177086, + "grad_norm": 2.0, + "learning_rate": 1.2492345223157068e-05, + "loss": 2.0108, + "step": 2310 + }, + { + "epoch": 0.7893170026367271, + "grad_norm": 1.6328125, + "learning_rate": 1.2482813588053004e-05, + "loss": 2.0094, + "step": 2320 + }, + { + "epoch": 0.7927192310963681, + "grad_norm": 1.3671875, + "learning_rate": 1.2473238234353713e-05, + "loss": 1.9266, + "step": 2330 + }, + { + "epoch": 0.7961214595560092, + "grad_norm": 1.765625, + "learning_rate": 1.2463619234739388e-05, + "loss": 1.9982, + "step": 2340 + }, + { + "epoch": 0.7995236880156502, + "grad_norm": 1.875, + "learning_rate": 1.2453956662221504e-05, + "loss": 2.0688, + "step": 2350 + }, + { + "epoch": 0.8029259164752913, + "grad_norm": 1.890625, + "learning_rate": 1.2444250590142271e-05, + "loss": 1.9658, + "step": 2360 + }, + { + "epoch": 0.8063281449349324, + "grad_norm": 1.953125, + "learning_rate": 1.2434501092174075e-05, + "loss": 1.9954, + "step": 2370 + }, + { + "epoch": 0.8097303733945734, + "grad_norm": 1.7421875, + "learning_rate": 1.242470824231892e-05, + "loss": 2.0507, + "step": 2380 + }, + { + "epoch": 0.8131326018542145, + "grad_norm": 1.7109375, + "learning_rate": 1.241487211490786e-05, + "loss": 2.0469, + "step": 2390 + }, + { + "epoch": 0.8165348303138555, + "grad_norm": 1.8203125, + "learning_rate": 1.2404992784600451e-05, + "loss": 2.0436, + "step": 2400 + }, + { + "epoch": 0.8199370587734967, + "grad_norm": 1.78125, + "learning_rate": 1.2395070326384164e-05, + "loss": 2.0195, + "step": 2410 + }, + { + "epoch": 0.8233392872331377, + "grad_norm": 2.21875, + "learning_rate": 1.238510481557383e-05, + "loss": 1.9674, + "step": 2420 + }, + { + "epoch": 0.8267415156927788, + "grad_norm": 1.9609375, + "learning_rate": 1.2375096327811061e-05, + "loss": 1.9918, + "step": 2430 + }, + { + "epoch": 0.8301437441524199, + "grad_norm": 2.078125, + "learning_rate": 1.2365044939063687e-05, + "loss": 2.0161, + "step": 2440 + }, + { + "epoch": 0.8335459726120609, + "grad_norm": 1.9140625, + "learning_rate": 1.2354950725625158e-05, + "loss": 2.0303, + "step": 2450 + }, + { + "epoch": 0.836948201071702, + "grad_norm": 2.109375, + "learning_rate": 1.2344813764113985e-05, + "loss": 1.973, + "step": 2460 + }, + { + "epoch": 0.840350429531343, + "grad_norm": 1.9296875, + "learning_rate": 1.2334634131473154e-05, + "loss": 2.0389, + "step": 2470 + }, + { + "epoch": 0.8437526579909841, + "grad_norm": 1.78125, + "learning_rate": 1.2324411904969535e-05, + "loss": 2.0597, + "step": 2480 + }, + { + "epoch": 0.8471548864506252, + "grad_norm": 1.7734375, + "learning_rate": 1.2314147162193302e-05, + "loss": 2.029, + "step": 2490 + }, + { + "epoch": 0.8505571149102662, + "grad_norm": 1.921875, + "learning_rate": 1.2303839981057342e-05, + "loss": 2.0216, + "step": 2500 + }, + { + "epoch": 0.8539593433699073, + "grad_norm": 1.96875, + "learning_rate": 1.2293490439796658e-05, + "loss": 1.9839, + "step": 2510 + }, + { + "epoch": 0.8573615718295483, + "grad_norm": 1.78125, + "learning_rate": 1.2283098616967793e-05, + "loss": 2.0373, + "step": 2520 + }, + { + "epoch": 0.8607638002891894, + "grad_norm": 1.75, + "learning_rate": 1.2272664591448208e-05, + "loss": 2.075, + "step": 2530 + }, + { + "epoch": 0.8641660287488305, + "grad_norm": 1.890625, + "learning_rate": 1.2262188442435706e-05, + "loss": 2.071, + "step": 2540 + }, + { + "epoch": 0.8675682572084715, + "grad_norm": 1.7734375, + "learning_rate": 1.2251670249447816e-05, + "loss": 2.0474, + "step": 2550 + }, + { + "epoch": 0.8709704856681126, + "grad_norm": 1.7578125, + "learning_rate": 1.22411100923212e-05, + "loss": 1.9866, + "step": 2560 + }, + { + "epoch": 0.8743727141277536, + "grad_norm": 1.859375, + "learning_rate": 1.2230508051211039e-05, + "loss": 2.0365, + "step": 2570 + }, + { + "epoch": 0.8777749425873947, + "grad_norm": 2.03125, + "learning_rate": 1.2219864206590427e-05, + "loss": 2.0041, + "step": 2580 + }, + { + "epoch": 0.8811771710470359, + "grad_norm": 1.9921875, + "learning_rate": 1.2209178639249763e-05, + "loss": 2.0164, + "step": 2590 + }, + { + "epoch": 0.8845793995066769, + "grad_norm": 1.7578125, + "learning_rate": 1.2198451430296135e-05, + "loss": 2.0469, + "step": 2600 + }, + { + "epoch": 0.887981627966318, + "grad_norm": 1.921875, + "learning_rate": 1.2187682661152705e-05, + "loss": 1.9873, + "step": 2610 + }, + { + "epoch": 0.891383856425959, + "grad_norm": 1.5078125, + "learning_rate": 1.2176872413558087e-05, + "loss": 2.0442, + "step": 2620 + }, + { + "epoch": 0.8947860848856001, + "grad_norm": 1.6640625, + "learning_rate": 1.2166020769565741e-05, + "loss": 2.0356, + "step": 2630 + }, + { + "epoch": 0.8981883133452412, + "grad_norm": 1.9453125, + "learning_rate": 1.2155127811543326e-05, + "loss": 2.0253, + "step": 2640 + }, + { + "epoch": 0.9015905418048822, + "grad_norm": 1.8671875, + "learning_rate": 1.2144193622172099e-05, + "loss": 1.974, + "step": 2650 + }, + { + "epoch": 0.9049927702645233, + "grad_norm": 1.8203125, + "learning_rate": 1.2133218284446276e-05, + "loss": 2.0084, + "step": 2660 + }, + { + "epoch": 0.9083949987241643, + "grad_norm": 1.9609375, + "learning_rate": 1.2122201881672392e-05, + "loss": 2.1215, + "step": 2670 + }, + { + "epoch": 0.9117972271838054, + "grad_norm": 1.9140625, + "learning_rate": 1.2111144497468698e-05, + "loss": 1.9749, + "step": 2680 + }, + { + "epoch": 0.9151994556434464, + "grad_norm": 1.75, + "learning_rate": 1.2100046215764493e-05, + "loss": 1.9601, + "step": 2690 + }, + { + "epoch": 0.9186016841030875, + "grad_norm": 2.03125, + "learning_rate": 1.2088907120799507e-05, + "loss": 1.9761, + "step": 2700 + }, + { + "epoch": 0.9220039125627286, + "grad_norm": 1.90625, + "learning_rate": 1.2077727297123258e-05, + "loss": 2.0309, + "step": 2710 + }, + { + "epoch": 0.9254061410223696, + "grad_norm": 1.6953125, + "learning_rate": 1.2066506829594404e-05, + "loss": 2.0306, + "step": 2720 + }, + { + "epoch": 0.9288083694820107, + "grad_norm": 1.765625, + "learning_rate": 1.2055245803380112e-05, + "loss": 2.0073, + "step": 2730 + }, + { + "epoch": 0.9322105979416517, + "grad_norm": 2.046875, + "learning_rate": 1.2043944303955393e-05, + "loss": 1.9904, + "step": 2740 + }, + { + "epoch": 0.9356128264012928, + "grad_norm": 1.8984375, + "learning_rate": 1.2032602417102472e-05, + "loss": 2.0916, + "step": 2750 + }, + { + "epoch": 0.939015054860934, + "grad_norm": 1.8828125, + "learning_rate": 1.2021220228910125e-05, + "loss": 1.9665, + "step": 2760 + }, + { + "epoch": 0.942417283320575, + "grad_norm": 1.984375, + "learning_rate": 1.2009797825773027e-05, + "loss": 1.9822, + "step": 2770 + }, + { + "epoch": 0.9458195117802161, + "grad_norm": 2.109375, + "learning_rate": 1.1998335294391099e-05, + "loss": 1.9947, + "step": 2780 + }, + { + "epoch": 0.9492217402398571, + "grad_norm": 1.7578125, + "learning_rate": 1.1986832721768856e-05, + "loss": 1.9626, + "step": 2790 + }, + { + "epoch": 0.9526239686994982, + "grad_norm": 1.8515625, + "learning_rate": 1.1975290195214724e-05, + "loss": 1.9772, + "step": 2800 + }, + { + "epoch": 0.9560261971591393, + "grad_norm": 1.921875, + "learning_rate": 1.1963707802340409e-05, + "loss": 2.0471, + "step": 2810 + }, + { + "epoch": 0.9594284256187803, + "grad_norm": 1.8984375, + "learning_rate": 1.1952085631060207e-05, + "loss": 1.9514, + "step": 2820 + }, + { + "epoch": 0.9628306540784214, + "grad_norm": 1.9453125, + "learning_rate": 1.1940423769590349e-05, + "loss": 1.9974, + "step": 2830 + }, + { + "epoch": 0.9662328825380624, + "grad_norm": 1.7578125, + "learning_rate": 1.1928722306448326e-05, + "loss": 2.0036, + "step": 2840 + }, + { + "epoch": 0.9696351109977035, + "grad_norm": 1.453125, + "learning_rate": 1.1916981330452221e-05, + "loss": 1.9803, + "step": 2850 + }, + { + "epoch": 0.9730373394573446, + "grad_norm": 1.8515625, + "learning_rate": 1.1905200930720032e-05, + "loss": 2.0608, + "step": 2860 + }, + { + "epoch": 0.9764395679169856, + "grad_norm": 1.8984375, + "learning_rate": 1.1893381196668997e-05, + "loss": 1.9857, + "step": 2870 + }, + { + "epoch": 0.9798417963766267, + "grad_norm": 1.6171875, + "learning_rate": 1.1881522218014912e-05, + "loss": 2.0197, + "step": 2880 + }, + { + "epoch": 0.9832440248362677, + "grad_norm": 1.8984375, + "learning_rate": 1.1869624084771457e-05, + "loss": 1.9883, + "step": 2890 + }, + { + "epoch": 0.9866462532959088, + "grad_norm": 1.8203125, + "learning_rate": 1.185768688724951e-05, + "loss": 2.0941, + "step": 2900 + }, + { + "epoch": 0.9900484817555499, + "grad_norm": 1.7109375, + "learning_rate": 1.184571071605645e-05, + "loss": 1.9953, + "step": 2910 + }, + { + "epoch": 0.9934507102151909, + "grad_norm": 1.7265625, + "learning_rate": 1.1833695662095493e-05, + "loss": 1.9833, + "step": 2920 + }, + { + "epoch": 0.996852938674832, + "grad_norm": 1.9765625, + "learning_rate": 1.1821641816564982e-05, + "loss": 2.0431, + "step": 2930 + }, + { + "epoch": 1.000255167134473, + "grad_norm": 1.71875, + "learning_rate": 1.1809549270957697e-05, + "loss": 1.886, + "step": 2940 + }, + { + "epoch": 1.0036573955941142, + "grad_norm": 2.078125, + "learning_rate": 1.1797418117060173e-05, + "loss": 1.9804, + "step": 2950 + }, + { + "epoch": 1.0070596240537553, + "grad_norm": 1.875, + "learning_rate": 1.1785248446951988e-05, + "loss": 2.0657, + "step": 2960 + }, + { + "epoch": 1.0104618525133964, + "grad_norm": 1.9296875, + "learning_rate": 1.1773040353005074e-05, + "loss": 2.0112, + "step": 2970 + }, + { + "epoch": 1.0138640809730373, + "grad_norm": 2.015625, + "learning_rate": 1.1760793927883016e-05, + "loss": 2.0262, + "step": 2980 + }, + { + "epoch": 1.0172663094326784, + "grad_norm": 2.109375, + "learning_rate": 1.174850926454034e-05, + "loss": 2.0007, + "step": 2990 + }, + { + "epoch": 1.0206685378923195, + "grad_norm": 2.03125, + "learning_rate": 1.1736186456221816e-05, + "loss": 1.9723, + "step": 3000 + }, + { + "epoch": 1.0240707663519606, + "grad_norm": 2.0625, + "learning_rate": 1.1723825596461751e-05, + "loss": 1.9384, + "step": 3010 + }, + { + "epoch": 1.0274729948116017, + "grad_norm": 1.96875, + "learning_rate": 1.1711426779083267e-05, + "loss": 1.9556, + "step": 3020 + }, + { + "epoch": 1.0308752232712426, + "grad_norm": 1.828125, + "learning_rate": 1.1698990098197604e-05, + "loss": 1.9963, + "step": 3030 + }, + { + "epoch": 1.0342774517308837, + "grad_norm": 2.09375, + "learning_rate": 1.1686515648203396e-05, + "loss": 1.9429, + "step": 3040 + }, + { + "epoch": 1.0376796801905248, + "grad_norm": 2.203125, + "learning_rate": 1.1674003523785957e-05, + "loss": 1.8885, + "step": 3050 + }, + { + "epoch": 1.041081908650166, + "grad_norm": 1.9765625, + "learning_rate": 1.1661453819916565e-05, + "loss": 1.9456, + "step": 3060 + }, + { + "epoch": 1.0444841371098068, + "grad_norm": 2.015625, + "learning_rate": 1.1648866631851738e-05, + "loss": 1.9386, + "step": 3070 + }, + { + "epoch": 1.047886365569448, + "grad_norm": 2.09375, + "learning_rate": 1.1636242055132511e-05, + "loss": 1.9569, + "step": 3080 + }, + { + "epoch": 1.051288594029089, + "grad_norm": 1.8671875, + "learning_rate": 1.1623580185583711e-05, + "loss": 1.9159, + "step": 3090 + }, + { + "epoch": 1.0546908224887301, + "grad_norm": 1.9296875, + "learning_rate": 1.1610881119313231e-05, + "loss": 1.9094, + "step": 3100 + }, + { + "epoch": 1.0580930509483712, + "grad_norm": 2.078125, + "learning_rate": 1.1598144952711302e-05, + "loss": 2.0189, + "step": 3110 + }, + { + "epoch": 1.0614952794080121, + "grad_norm": 1.8515625, + "learning_rate": 1.1585371782449755e-05, + "loss": 2.0053, + "step": 3120 + }, + { + "epoch": 1.0648975078676532, + "grad_norm": 2.15625, + "learning_rate": 1.1572561705481294e-05, + "loss": 1.9826, + "step": 3130 + }, + { + "epoch": 1.0682997363272944, + "grad_norm": 2.015625, + "learning_rate": 1.1559714819038756e-05, + "loss": 1.9597, + "step": 3140 + }, + { + "epoch": 1.0717019647869355, + "grad_norm": 1.734375, + "learning_rate": 1.1546831220634377e-05, + "loss": 1.9255, + "step": 3150 + }, + { + "epoch": 1.0751041932465766, + "grad_norm": 2.109375, + "learning_rate": 1.1533911008059046e-05, + "loss": 1.9859, + "step": 3160 + }, + { + "epoch": 1.0785064217062175, + "grad_norm": 1.7578125, + "learning_rate": 1.1520954279381567e-05, + "loss": 1.9651, + "step": 3170 + }, + { + "epoch": 1.0819086501658586, + "grad_norm": 1.9296875, + "learning_rate": 1.1507961132947917e-05, + "loss": 1.9321, + "step": 3180 + }, + { + "epoch": 1.0853108786254997, + "grad_norm": 1.8046875, + "learning_rate": 1.1494931667380492e-05, + "loss": 1.9215, + "step": 3190 + }, + { + "epoch": 1.0887131070851408, + "grad_norm": 1.9453125, + "learning_rate": 1.1481865981577362e-05, + "loss": 1.982, + "step": 3200 + }, + { + "epoch": 1.092115335544782, + "grad_norm": 2.125, + "learning_rate": 1.1468764174711526e-05, + "loss": 1.9728, + "step": 3210 + }, + { + "epoch": 1.0955175640044228, + "grad_norm": 2.046875, + "learning_rate": 1.1455626346230147e-05, + "loss": 2.0267, + "step": 3220 + }, + { + "epoch": 1.098919792464064, + "grad_norm": 2.359375, + "learning_rate": 1.1442452595853809e-05, + "loss": 1.9484, + "step": 3230 + }, + { + "epoch": 1.102322020923705, + "grad_norm": 2.0, + "learning_rate": 1.1429243023575758e-05, + "loss": 1.9867, + "step": 3240 + }, + { + "epoch": 1.1057242493833461, + "grad_norm": 1.8046875, + "learning_rate": 1.1415997729661134e-05, + "loss": 1.9269, + "step": 3250 + }, + { + "epoch": 1.1091264778429872, + "grad_norm": 1.953125, + "learning_rate": 1.140271681464622e-05, + "loss": 1.9095, + "step": 3260 + }, + { + "epoch": 1.1125287063026281, + "grad_norm": 1.8515625, + "learning_rate": 1.1389400379337676e-05, + "loss": 2.0021, + "step": 3270 + }, + { + "epoch": 1.1159309347622692, + "grad_norm": 2.046875, + "learning_rate": 1.137604852481177e-05, + "loss": 2.0117, + "step": 3280 + }, + { + "epoch": 1.1193331632219103, + "grad_norm": 1.5546875, + "learning_rate": 1.1362661352413616e-05, + "loss": 1.9835, + "step": 3290 + }, + { + "epoch": 1.1227353916815515, + "grad_norm": 2.1875, + "learning_rate": 1.1349238963756402e-05, + "loss": 1.9492, + "step": 3300 + }, + { + "epoch": 1.1261376201411926, + "grad_norm": 2.0, + "learning_rate": 1.1335781460720621e-05, + "loss": 1.9394, + "step": 3310 + }, + { + "epoch": 1.1295398486008335, + "grad_norm": 1.703125, + "learning_rate": 1.1322288945453292e-05, + "loss": 1.9442, + "step": 3320 + }, + { + "epoch": 1.1329420770604746, + "grad_norm": 1.84375, + "learning_rate": 1.1308761520367196e-05, + "loss": 1.9256, + "step": 3330 + }, + { + "epoch": 1.1363443055201157, + "grad_norm": 1.96875, + "learning_rate": 1.1295199288140082e-05, + "loss": 1.9861, + "step": 3340 + }, + { + "epoch": 1.1397465339797568, + "grad_norm": 2.265625, + "learning_rate": 1.1281602351713905e-05, + "loss": 1.9598, + "step": 3350 + }, + { + "epoch": 1.143148762439398, + "grad_norm": 2.09375, + "learning_rate": 1.1267970814294032e-05, + "loss": 1.9839, + "step": 3360 + }, + { + "epoch": 1.1465509908990388, + "grad_norm": 2.125, + "learning_rate": 1.1254304779348466e-05, + "loss": 1.9654, + "step": 3370 + }, + { + "epoch": 1.14995321935868, + "grad_norm": 1.9296875, + "learning_rate": 1.1240604350607055e-05, + "loss": 1.9536, + "step": 3380 + }, + { + "epoch": 1.153355447818321, + "grad_norm": 1.9296875, + "learning_rate": 1.122686963206071e-05, + "loss": 1.9331, + "step": 3390 + }, + { + "epoch": 1.156757676277962, + "grad_norm": 1.921875, + "learning_rate": 1.1213100727960614e-05, + "loss": 1.9218, + "step": 3400 + }, + { + "epoch": 1.1601599047376032, + "grad_norm": 1.9765625, + "learning_rate": 1.1199297742817428e-05, + "loss": 1.9979, + "step": 3410 + }, + { + "epoch": 1.163562133197244, + "grad_norm": 2.25, + "learning_rate": 1.11854607814005e-05, + "loss": 2.02, + "step": 3420 + }, + { + "epoch": 1.1669643616568852, + "grad_norm": 2.09375, + "learning_rate": 1.117158994873707e-05, + "loss": 2.0195, + "step": 3430 + }, + { + "epoch": 1.1703665901165263, + "grad_norm": 1.984375, + "learning_rate": 1.1157685350111472e-05, + "loss": 2.0053, + "step": 3440 + }, + { + "epoch": 1.1737688185761674, + "grad_norm": 1.84375, + "learning_rate": 1.1143747091064334e-05, + "loss": 2.014, + "step": 3450 + }, + { + "epoch": 1.1771710470358085, + "grad_norm": 2.0625, + "learning_rate": 1.1129775277391782e-05, + "loss": 1.9057, + "step": 3460 + }, + { + "epoch": 1.1805732754954494, + "grad_norm": 2.140625, + "learning_rate": 1.1115770015144628e-05, + "loss": 1.9496, + "step": 3470 + }, + { + "epoch": 1.1839755039550905, + "grad_norm": 1.828125, + "learning_rate": 1.1101731410627574e-05, + "loss": 1.9163, + "step": 3480 + }, + { + "epoch": 1.1873777324147317, + "grad_norm": 1.890625, + "learning_rate": 1.1087659570398397e-05, + "loss": 1.9717, + "step": 3490 + }, + { + "epoch": 1.1907799608743728, + "grad_norm": 2.078125, + "learning_rate": 1.1073554601267147e-05, + "loss": 2.0302, + "step": 3500 + }, + { + "epoch": 1.1941821893340139, + "grad_norm": 1.796875, + "learning_rate": 1.1059416610295336e-05, + "loss": 1.9523, + "step": 3510 + }, + { + "epoch": 1.1975844177936548, + "grad_norm": 2.015625, + "learning_rate": 1.104524570479512e-05, + "loss": 1.9842, + "step": 3520 + }, + { + "epoch": 1.2009866462532959, + "grad_norm": 1.875, + "learning_rate": 1.1031041992328483e-05, + "loss": 2.0036, + "step": 3530 + }, + { + "epoch": 1.204388874712937, + "grad_norm": 2.03125, + "learning_rate": 1.1016805580706439e-05, + "loss": 2.048, + "step": 3540 + }, + { + "epoch": 1.207791103172578, + "grad_norm": 2.0625, + "learning_rate": 1.1002536577988182e-05, + "loss": 1.9545, + "step": 3550 + }, + { + "epoch": 1.2111933316322192, + "grad_norm": 1.9921875, + "learning_rate": 1.0988235092480297e-05, + "loss": 1.9575, + "step": 3560 + }, + { + "epoch": 1.21459556009186, + "grad_norm": 2.015625, + "learning_rate": 1.0973901232735917e-05, + "loss": 1.9759, + "step": 3570 + }, + { + "epoch": 1.2179977885515012, + "grad_norm": 2.078125, + "learning_rate": 1.0959535107553909e-05, + "loss": 1.9737, + "step": 3580 + }, + { + "epoch": 1.2214000170111423, + "grad_norm": 1.890625, + "learning_rate": 1.0945136825978049e-05, + "loss": 2.0414, + "step": 3590 + }, + { + "epoch": 1.2248022454707834, + "grad_norm": 2.0625, + "learning_rate": 1.0930706497296186e-05, + "loss": 1.9566, + "step": 3600 + }, + { + "epoch": 1.2282044739304245, + "grad_norm": 1.8125, + "learning_rate": 1.0916244231039415e-05, + "loss": 1.9614, + "step": 3610 + }, + { + "epoch": 1.2316067023900654, + "grad_norm": 2.09375, + "learning_rate": 1.0901750136981258e-05, + "loss": 2.0045, + "step": 3620 + }, + { + "epoch": 1.2350089308497065, + "grad_norm": 1.578125, + "learning_rate": 1.0887224325136807e-05, + "loss": 1.9703, + "step": 3630 + }, + { + "epoch": 1.2384111593093476, + "grad_norm": 2.265625, + "learning_rate": 1.0872666905761921e-05, + "loss": 1.9609, + "step": 3640 + }, + { + "epoch": 1.2418133877689888, + "grad_norm": 1.9296875, + "learning_rate": 1.0858077989352354e-05, + "loss": 1.9865, + "step": 3650 + }, + { + "epoch": 1.2452156162286299, + "grad_norm": 1.84375, + "learning_rate": 1.084345768664294e-05, + "loss": 1.9276, + "step": 3660 + }, + { + "epoch": 1.2486178446882708, + "grad_norm": 2.25, + "learning_rate": 1.0828806108606748e-05, + "loss": 1.9673, + "step": 3670 + }, + { + "epoch": 1.2520200731479119, + "grad_norm": 2.15625, + "learning_rate": 1.081412336645423e-05, + "loss": 1.9522, + "step": 3680 + }, + { + "epoch": 1.255422301607553, + "grad_norm": 1.953125, + "learning_rate": 1.0799409571632395e-05, + "loss": 1.8882, + "step": 3690 + }, + { + "epoch": 1.258824530067194, + "grad_norm": 1.9765625, + "learning_rate": 1.0784664835823945e-05, + "loss": 1.9378, + "step": 3700 + }, + { + "epoch": 1.2622267585268352, + "grad_norm": 1.7421875, + "learning_rate": 1.076988927094643e-05, + "loss": 2.0231, + "step": 3710 + }, + { + "epoch": 1.265628986986476, + "grad_norm": 2.03125, + "learning_rate": 1.0755082989151417e-05, + "loss": 1.925, + "step": 3720 + }, + { + "epoch": 1.2690312154461172, + "grad_norm": 2.15625, + "learning_rate": 1.0740246102823613e-05, + "loss": 1.8958, + "step": 3730 + }, + { + "epoch": 1.2724334439057583, + "grad_norm": 2.015625, + "learning_rate": 1.0725378724580027e-05, + "loss": 1.9536, + "step": 3740 + }, + { + "epoch": 1.2758356723653994, + "grad_norm": 1.953125, + "learning_rate": 1.0710480967269115e-05, + "loss": 1.9541, + "step": 3750 + }, + { + "epoch": 1.2792379008250405, + "grad_norm": 1.734375, + "learning_rate": 1.0695552943969919e-05, + "loss": 1.9327, + "step": 3760 + }, + { + "epoch": 1.2826401292846814, + "grad_norm": 1.9375, + "learning_rate": 1.0680594767991203e-05, + "loss": 1.9935, + "step": 3770 + }, + { + "epoch": 1.2860423577443225, + "grad_norm": 2.078125, + "learning_rate": 1.0665606552870612e-05, + "loss": 1.9933, + "step": 3780 + }, + { + "epoch": 1.2894445862039636, + "grad_norm": 2.125, + "learning_rate": 1.0650588412373792e-05, + "loss": 1.9314, + "step": 3790 + }, + { + "epoch": 1.2928468146636047, + "grad_norm": 1.609375, + "learning_rate": 1.0635540460493534e-05, + "loss": 1.9136, + "step": 3800 + }, + { + "epoch": 1.2962490431232458, + "grad_norm": 1.796875, + "learning_rate": 1.0620462811448904e-05, + "loss": 1.9175, + "step": 3810 + }, + { + "epoch": 1.2996512715828867, + "grad_norm": 2.125, + "learning_rate": 1.0605355579684382e-05, + "loss": 1.9929, + "step": 3820 + }, + { + "epoch": 1.3030535000425278, + "grad_norm": 2.109375, + "learning_rate": 1.0590218879868998e-05, + "loss": 1.9072, + "step": 3830 + }, + { + "epoch": 1.306455728502169, + "grad_norm": 2.296875, + "learning_rate": 1.0575052826895442e-05, + "loss": 1.9315, + "step": 3840 + }, + { + "epoch": 1.30985795696181, + "grad_norm": 1.78125, + "learning_rate": 1.0559857535879212e-05, + "loss": 2.045, + "step": 3850 + }, + { + "epoch": 1.3132601854214512, + "grad_norm": 2.15625, + "learning_rate": 1.0544633122157734e-05, + "loss": 1.9443, + "step": 3860 + }, + { + "epoch": 1.316662413881092, + "grad_norm": 1.890625, + "learning_rate": 1.0529379701289476e-05, + "loss": 1.9742, + "step": 3870 + }, + { + "epoch": 1.3200646423407332, + "grad_norm": 1.7890625, + "learning_rate": 1.051409738905309e-05, + "loss": 1.9852, + "step": 3880 + }, + { + "epoch": 1.3234668708003743, + "grad_norm": 2.1875, + "learning_rate": 1.0498786301446519e-05, + "loss": 1.997, + "step": 3890 + }, + { + "epoch": 1.3268690992600152, + "grad_norm": 2.0, + "learning_rate": 1.0483446554686125e-05, + "loss": 1.9083, + "step": 3900 + }, + { + "epoch": 1.3302713277196565, + "grad_norm": 1.8046875, + "learning_rate": 1.0468078265205796e-05, + "loss": 1.974, + "step": 3910 + }, + { + "epoch": 1.3336735561792974, + "grad_norm": 1.875, + "learning_rate": 1.0452681549656073e-05, + "loss": 1.9885, + "step": 3920 + }, + { + "epoch": 1.3370757846389385, + "grad_norm": 1.9609375, + "learning_rate": 1.0437256524903258e-05, + "loss": 1.9872, + "step": 3930 + }, + { + "epoch": 1.3404780130985796, + "grad_norm": 2.046875, + "learning_rate": 1.0421803308028533e-05, + "loss": 1.9477, + "step": 3940 + }, + { + "epoch": 1.3438802415582205, + "grad_norm": 1.9296875, + "learning_rate": 1.0406322016327067e-05, + "loss": 2.0032, + "step": 3950 + }, + { + "epoch": 1.3472824700178618, + "grad_norm": 2.015625, + "learning_rate": 1.0390812767307123e-05, + "loss": 1.9942, + "step": 3960 + }, + { + "epoch": 1.3506846984775027, + "grad_norm": 1.8984375, + "learning_rate": 1.0375275678689174e-05, + "loss": 2.0242, + "step": 3970 + }, + { + "epoch": 1.3540869269371438, + "grad_norm": 1.90625, + "learning_rate": 1.0359710868405e-05, + "loss": 2.0306, + "step": 3980 + }, + { + "epoch": 1.357489155396785, + "grad_norm": 2.140625, + "learning_rate": 1.0344118454596807e-05, + "loss": 1.9709, + "step": 3990 + }, + { + "epoch": 1.3608913838564258, + "grad_norm": 1.9765625, + "learning_rate": 1.032849855561631e-05, + "loss": 1.9812, + "step": 4000 + }, + { + "epoch": 1.3642936123160672, + "grad_norm": 2.09375, + "learning_rate": 1.0312851290023851e-05, + "loss": 2.0006, + "step": 4010 + }, + { + "epoch": 1.367695840775708, + "grad_norm": 2.078125, + "learning_rate": 1.0297176776587497e-05, + "loss": 1.9679, + "step": 4020 + }, + { + "epoch": 1.3710980692353492, + "grad_norm": 2.375, + "learning_rate": 1.028147513428213e-05, + "loss": 1.934, + "step": 4030 + }, + { + "epoch": 1.3745002976949903, + "grad_norm": 2.046875, + "learning_rate": 1.026574648228855e-05, + "loss": 1.9867, + "step": 4040 + }, + { + "epoch": 1.3779025261546312, + "grad_norm": 2.359375, + "learning_rate": 1.0249990939992573e-05, + "loss": 1.899, + "step": 4050 + }, + { + "epoch": 1.3813047546142723, + "grad_norm": 2.15625, + "learning_rate": 1.023420862698412e-05, + "loss": 1.9799, + "step": 4060 + }, + { + "epoch": 1.3847069830739134, + "grad_norm": 1.9609375, + "learning_rate": 1.021839966305631e-05, + "loss": 2.0251, + "step": 4070 + }, + { + "epoch": 1.3881092115335545, + "grad_norm": 2.0625, + "learning_rate": 1.0202564168204549e-05, + "loss": 1.9332, + "step": 4080 + }, + { + "epoch": 1.3915114399931956, + "grad_norm": 2.1875, + "learning_rate": 1.0186702262625632e-05, + "loss": 1.971, + "step": 4090 + }, + { + "epoch": 1.3949136684528365, + "grad_norm": 2.0625, + "learning_rate": 1.0170814066716807e-05, + "loss": 1.9266, + "step": 4100 + }, + { + "epoch": 1.3983158969124776, + "grad_norm": 1.984375, + "learning_rate": 1.0154899701074883e-05, + "loss": 1.9282, + "step": 4110 + }, + { + "epoch": 1.4017181253721187, + "grad_norm": 2.046875, + "learning_rate": 1.0138959286495303e-05, + "loss": 2.0014, + "step": 4120 + }, + { + "epoch": 1.4051203538317598, + "grad_norm": 2.125, + "learning_rate": 1.0122992943971232e-05, + "loss": 1.9463, + "step": 4130 + }, + { + "epoch": 1.408522582291401, + "grad_norm": 1.875, + "learning_rate": 1.0107000794692637e-05, + "loss": 2.003, + "step": 4140 + }, + { + "epoch": 1.4119248107510418, + "grad_norm": 2.234375, + "learning_rate": 1.0090982960045363e-05, + "loss": 2.0, + "step": 4150 + }, + { + "epoch": 1.415327039210683, + "grad_norm": 2.203125, + "learning_rate": 1.0074939561610221e-05, + "loss": 1.9832, + "step": 4160 + }, + { + "epoch": 1.418729267670324, + "grad_norm": 2.078125, + "learning_rate": 1.005887072116206e-05, + "loss": 1.8977, + "step": 4170 + }, + { + "epoch": 1.4221314961299651, + "grad_norm": 1.65625, + "learning_rate": 1.0042776560668832e-05, + "loss": 1.9778, + "step": 4180 + }, + { + "epoch": 1.4255337245896063, + "grad_norm": 1.9921875, + "learning_rate": 1.0026657202290696e-05, + "loss": 1.9389, + "step": 4190 + }, + { + "epoch": 1.4289359530492471, + "grad_norm": 2.21875, + "learning_rate": 1.0010512768379053e-05, + "loss": 1.909, + "step": 4200 + }, + { + "epoch": 1.4323381815088883, + "grad_norm": 2.109375, + "learning_rate": 9.994343381475644e-06, + "loss": 1.9563, + "step": 4210 + }, + { + "epoch": 1.4357404099685294, + "grad_norm": 2.09375, + "learning_rate": 9.978149164311613e-06, + "loss": 1.9725, + "step": 4220 + }, + { + "epoch": 1.4391426384281705, + "grad_norm": 1.71875, + "learning_rate": 9.961930239806571e-06, + "loss": 2.0237, + "step": 4230 + }, + { + "epoch": 1.4425448668878116, + "grad_norm": 1.953125, + "learning_rate": 9.945686731067668e-06, + "loss": 1.9415, + "step": 4240 + }, + { + "epoch": 1.4459470953474525, + "grad_norm": 2.0625, + "learning_rate": 9.929418761388654e-06, + "loss": 1.9221, + "step": 4250 + }, + { + "epoch": 1.4493493238070936, + "grad_norm": 2.046875, + "learning_rate": 9.91312645424895e-06, + "loss": 1.9062, + "step": 4260 + }, + { + "epoch": 1.4527515522667347, + "grad_norm": 2.40625, + "learning_rate": 9.896809933312702e-06, + "loss": 1.9621, + "step": 4270 + }, + { + "epoch": 1.4561537807263758, + "grad_norm": 2.265625, + "learning_rate": 9.88046932242785e-06, + "loss": 1.9721, + "step": 4280 + }, + { + "epoch": 1.459556009186017, + "grad_norm": 1.9765625, + "learning_rate": 9.864104745625186e-06, + "loss": 2.0143, + "step": 4290 + }, + { + "epoch": 1.4629582376456578, + "grad_norm": 2.359375, + "learning_rate": 9.847716327117408e-06, + "loss": 1.9356, + "step": 4300 + }, + { + "epoch": 1.466360466105299, + "grad_norm": 2.140625, + "learning_rate": 9.831304191298181e-06, + "loss": 1.9466, + "step": 4310 + }, + { + "epoch": 1.46976269456494, + "grad_norm": 1.890625, + "learning_rate": 9.814868462741196e-06, + "loss": 1.9112, + "step": 4320 + }, + { + "epoch": 1.4731649230245811, + "grad_norm": 1.953125, + "learning_rate": 9.798409266199217e-06, + "loss": 1.9464, + "step": 4330 + }, + { + "epoch": 1.4765671514842222, + "grad_norm": 2.046875, + "learning_rate": 9.781926726603141e-06, + "loss": 1.9421, + "step": 4340 + }, + { + "epoch": 1.4799693799438631, + "grad_norm": 2.09375, + "learning_rate": 9.765420969061045e-06, + "loss": 2.0682, + "step": 4350 + }, + { + "epoch": 1.4833716084035042, + "grad_norm": 1.7734375, + "learning_rate": 9.748892118857236e-06, + "loss": 1.9912, + "step": 4360 + }, + { + "epoch": 1.4867738368631453, + "grad_norm": 1.921875, + "learning_rate": 9.73234030145131e-06, + "loss": 1.9594, + "step": 4370 + }, + { + "epoch": 1.4901760653227865, + "grad_norm": 2.34375, + "learning_rate": 9.71576564247718e-06, + "loss": 1.9444, + "step": 4380 + }, + { + "epoch": 1.4935782937824276, + "grad_norm": 2.09375, + "learning_rate": 9.699168267742144e-06, + "loss": 1.9882, + "step": 4390 + }, + { + "epoch": 1.4969805222420685, + "grad_norm": 1.84375, + "learning_rate": 9.682548303225915e-06, + "loss": 1.9076, + "step": 4400 + }, + { + "epoch": 1.5003827507017096, + "grad_norm": 2.015625, + "learning_rate": 9.665905875079679e-06, + "loss": 1.9594, + "step": 4410 + }, + { + "epoch": 1.5037849791613507, + "grad_norm": 2.03125, + "learning_rate": 9.649241109625111e-06, + "loss": 2.0808, + "step": 4420 + }, + { + "epoch": 1.5071872076209918, + "grad_norm": 1.9375, + "learning_rate": 9.632554133353453e-06, + "loss": 1.9688, + "step": 4430 + }, + { + "epoch": 1.510589436080633, + "grad_norm": 1.953125, + "learning_rate": 9.615845072924522e-06, + "loss": 1.971, + "step": 4440 + }, + { + "epoch": 1.5139916645402738, + "grad_norm": 1.9609375, + "learning_rate": 9.59911405516577e-06, + "loss": 1.9759, + "step": 4450 + }, + { + "epoch": 1.517393892999915, + "grad_norm": 2.125, + "learning_rate": 9.582361207071299e-06, + "loss": 1.975, + "step": 4460 + }, + { + "epoch": 1.520796121459556, + "grad_norm": 1.90625, + "learning_rate": 9.565586655800928e-06, + "loss": 1.9975, + "step": 4470 + }, + { + "epoch": 1.5241983499191971, + "grad_norm": 1.9453125, + "learning_rate": 9.5487905286792e-06, + "loss": 1.966, + "step": 4480 + }, + { + "epoch": 1.5276005783788382, + "grad_norm": 2.078125, + "learning_rate": 9.531972953194425e-06, + "loss": 1.9374, + "step": 4490 + }, + { + "epoch": 1.5310028068384791, + "grad_norm": 2.0625, + "learning_rate": 9.51513405699772e-06, + "loss": 1.9567, + "step": 4500 + }, + { + "epoch": 1.5344050352981202, + "grad_norm": 2.359375, + "learning_rate": 9.498273967902033e-06, + "loss": 1.9704, + "step": 4510 + }, + { + "epoch": 1.5378072637577613, + "grad_norm": 2.078125, + "learning_rate": 9.481392813881164e-06, + "loss": 1.9064, + "step": 4520 + }, + { + "epoch": 1.5412094922174024, + "grad_norm": 2.21875, + "learning_rate": 9.464490723068811e-06, + "loss": 1.9553, + "step": 4530 + }, + { + "epoch": 1.5446117206770436, + "grad_norm": 2.171875, + "learning_rate": 9.447567823757589e-06, + "loss": 1.9416, + "step": 4540 + }, + { + "epoch": 1.5480139491366844, + "grad_norm": 1.859375, + "learning_rate": 9.430624244398053e-06, + "loss": 2.0401, + "step": 4550 + }, + { + "epoch": 1.5514161775963256, + "grad_norm": 2.125, + "learning_rate": 9.413660113597731e-06, + "loss": 1.9495, + "step": 4560 + }, + { + "epoch": 1.5548184060559667, + "grad_norm": 2.296875, + "learning_rate": 9.396675560120143e-06, + "loss": 2.0093, + "step": 4570 + }, + { + "epoch": 1.5582206345156078, + "grad_norm": 2.203125, + "learning_rate": 9.379670712883817e-06, + "loss": 1.974, + "step": 4580 + }, + { + "epoch": 1.5616228629752489, + "grad_norm": 1.96875, + "learning_rate": 9.362645700961327e-06, + "loss": 1.935, + "step": 4590 + }, + { + "epoch": 1.5650250914348898, + "grad_norm": 2.171875, + "learning_rate": 9.345600653578297e-06, + "loss": 1.9727, + "step": 4600 + }, + { + "epoch": 1.5684273198945309, + "grad_norm": 2.34375, + "learning_rate": 9.328535700112433e-06, + "loss": 1.9115, + "step": 4610 + }, + { + "epoch": 1.571829548354172, + "grad_norm": 2.109375, + "learning_rate": 9.311450970092529e-06, + "loss": 1.9329, + "step": 4620 + }, + { + "epoch": 1.575231776813813, + "grad_norm": 1.9609375, + "learning_rate": 9.294346593197489e-06, + "loss": 1.9534, + "step": 4630 + }, + { + "epoch": 1.5786340052734542, + "grad_norm": 1.9609375, + "learning_rate": 9.277222699255353e-06, + "loss": 1.9047, + "step": 4640 + }, + { + "epoch": 1.582036233733095, + "grad_norm": 1.9765625, + "learning_rate": 9.260079418242293e-06, + "loss": 1.9975, + "step": 4650 + }, + { + "epoch": 1.5854384621927362, + "grad_norm": 2.359375, + "learning_rate": 9.242916880281638e-06, + "loss": 1.9347, + "step": 4660 + }, + { + "epoch": 1.5888406906523773, + "grad_norm": 2.1875, + "learning_rate": 9.225735215642885e-06, + "loss": 1.9552, + "step": 4670 + }, + { + "epoch": 1.5922429191120182, + "grad_norm": 2.109375, + "learning_rate": 9.208534554740706e-06, + "loss": 1.9052, + "step": 4680 + }, + { + "epoch": 1.5956451475716595, + "grad_norm": 2.1875, + "learning_rate": 9.191315028133966e-06, + "loss": 1.9881, + "step": 4690 + }, + { + "epoch": 1.5990473760313004, + "grad_norm": 2.0625, + "learning_rate": 9.17407676652472e-06, + "loss": 1.9671, + "step": 4700 + }, + { + "epoch": 1.6024496044909415, + "grad_norm": 2.203125, + "learning_rate": 9.156819900757237e-06, + "loss": 1.9753, + "step": 4710 + }, + { + "epoch": 1.6058518329505826, + "grad_norm": 1.9140625, + "learning_rate": 9.139544561816991e-06, + "loss": 1.9516, + "step": 4720 + }, + { + "epoch": 1.6092540614102235, + "grad_norm": 2.234375, + "learning_rate": 9.122250880829674e-06, + "loss": 1.9615, + "step": 4730 + }, + { + "epoch": 1.6126562898698649, + "grad_norm": 2.15625, + "learning_rate": 9.104938989060205e-06, + "loss": 1.9325, + "step": 4740 + }, + { + "epoch": 1.6160585183295058, + "grad_norm": 1.984375, + "learning_rate": 9.087609017911725e-06, + "loss": 1.9227, + "step": 4750 + }, + { + "epoch": 1.6194607467891469, + "grad_norm": 2.109375, + "learning_rate": 9.070261098924604e-06, + "loss": 1.9796, + "step": 4760 + }, + { + "epoch": 1.622862975248788, + "grad_norm": 2.1875, + "learning_rate": 9.052895363775442e-06, + "loss": 1.977, + "step": 4770 + }, + { + "epoch": 1.6262652037084289, + "grad_norm": 2.046875, + "learning_rate": 9.035511944276075e-06, + "loss": 1.8778, + "step": 4780 + }, + { + "epoch": 1.6296674321680702, + "grad_norm": 2.546875, + "learning_rate": 9.018110972372563e-06, + "loss": 1.924, + "step": 4790 + }, + { + "epoch": 1.633069660627711, + "grad_norm": 1.9140625, + "learning_rate": 9.000692580144194e-06, + "loss": 1.9173, + "step": 4800 + }, + { + "epoch": 1.6364718890873522, + "grad_norm": 2.40625, + "learning_rate": 8.983256899802485e-06, + "loss": 1.9433, + "step": 4810 + }, + { + "epoch": 1.6398741175469933, + "grad_norm": 2.09375, + "learning_rate": 8.96580406369018e-06, + "loss": 1.9947, + "step": 4820 + }, + { + "epoch": 1.6432763460066342, + "grad_norm": 1.9921875, + "learning_rate": 8.948334204280234e-06, + "loss": 1.9073, + "step": 4830 + }, + { + "epoch": 1.6466785744662755, + "grad_norm": 2.3125, + "learning_rate": 8.930847454174817e-06, + "loss": 1.9565, + "step": 4840 + }, + { + "epoch": 1.6500808029259164, + "grad_norm": 2.15625, + "learning_rate": 8.913343946104305e-06, + "loss": 1.8945, + "step": 4850 + }, + { + "epoch": 1.6534830313855575, + "grad_norm": 2.296875, + "learning_rate": 8.895823812926273e-06, + "loss": 1.9491, + "step": 4860 + }, + { + "epoch": 1.6568852598451986, + "grad_norm": 2.203125, + "learning_rate": 8.878287187624486e-06, + "loss": 1.8916, + "step": 4870 + }, + { + "epoch": 1.6602874883048395, + "grad_norm": 1.9296875, + "learning_rate": 8.860734203307893e-06, + "loss": 1.9758, + "step": 4880 + }, + { + "epoch": 1.6636897167644809, + "grad_norm": 1.9453125, + "learning_rate": 8.84316499320961e-06, + "loss": 1.9791, + "step": 4890 + }, + { + "epoch": 1.6670919452241217, + "grad_norm": 2.0, + "learning_rate": 8.825579690685907e-06, + "loss": 2.0407, + "step": 4900 + }, + { + "epoch": 1.6704941736837629, + "grad_norm": 1.953125, + "learning_rate": 8.807978429215212e-06, + "loss": 2.0039, + "step": 4910 + }, + { + "epoch": 1.673896402143404, + "grad_norm": 2.203125, + "learning_rate": 8.79036134239708e-06, + "loss": 2.0093, + "step": 4920 + }, + { + "epoch": 1.6772986306030448, + "grad_norm": 2.265625, + "learning_rate": 8.772728563951189e-06, + "loss": 1.8997, + "step": 4930 + }, + { + "epoch": 1.6807008590626862, + "grad_norm": 2.140625, + "learning_rate": 8.755080227716316e-06, + "loss": 1.908, + "step": 4940 + }, + { + "epoch": 1.684103087522327, + "grad_norm": 1.8515625, + "learning_rate": 8.737416467649337e-06, + "loss": 1.9478, + "step": 4950 + }, + { + "epoch": 1.6875053159819682, + "grad_norm": 2.203125, + "learning_rate": 8.71973741782419e-06, + "loss": 1.9497, + "step": 4960 + }, + { + "epoch": 1.6909075444416093, + "grad_norm": 1.8125, + "learning_rate": 8.70204321243087e-06, + "loss": 1.9035, + "step": 4970 + }, + { + "epoch": 1.6943097729012502, + "grad_norm": 2.171875, + "learning_rate": 8.684333985774413e-06, + "loss": 1.9666, + "step": 4980 + }, + { + "epoch": 1.6977120013608915, + "grad_norm": 2.484375, + "learning_rate": 8.666609872273867e-06, + "loss": 1.9943, + "step": 4990 + }, + { + "epoch": 1.7011142298205324, + "grad_norm": 2.234375, + "learning_rate": 8.648871006461278e-06, + "loss": 1.9293, + "step": 5000 + }, + { + "epoch": 1.7045164582801735, + "grad_norm": 2.140625, + "learning_rate": 8.631117522980663e-06, + "loss": 1.9369, + "step": 5010 + }, + { + "epoch": 1.7079186867398146, + "grad_norm": 2.046875, + "learning_rate": 8.613349556587001e-06, + "loss": 1.9117, + "step": 5020 + }, + { + "epoch": 1.7113209151994555, + "grad_norm": 2.078125, + "learning_rate": 8.59556724214519e-06, + "loss": 1.9757, + "step": 5030 + }, + { + "epoch": 1.7147231436590968, + "grad_norm": 2.328125, + "learning_rate": 8.577770714629042e-06, + "loss": 1.9838, + "step": 5040 + }, + { + "epoch": 1.7181253721187377, + "grad_norm": 2.328125, + "learning_rate": 8.559960109120251e-06, + "loss": 1.9571, + "step": 5050 + }, + { + "epoch": 1.7215276005783788, + "grad_norm": 2.140625, + "learning_rate": 8.542135560807365e-06, + "loss": 1.9588, + "step": 5060 + }, + { + "epoch": 1.72492982903802, + "grad_norm": 2.15625, + "learning_rate": 8.524297204984759e-06, + "loss": 1.9542, + "step": 5070 + }, + { + "epoch": 1.7283320574976608, + "grad_norm": 1.9765625, + "learning_rate": 8.506445177051624e-06, + "loss": 1.9691, + "step": 5080 + }, + { + "epoch": 1.7317342859573022, + "grad_norm": 1.953125, + "learning_rate": 8.488579612510915e-06, + "loss": 1.9141, + "step": 5090 + }, + { + "epoch": 1.735136514416943, + "grad_norm": 2.0, + "learning_rate": 8.470700646968339e-06, + "loss": 2.0129, + "step": 5100 + }, + { + "epoch": 1.7385387428765842, + "grad_norm": 2.171875, + "learning_rate": 8.452808416131319e-06, + "loss": 1.9424, + "step": 5110 + }, + { + "epoch": 1.7419409713362253, + "grad_norm": 1.8984375, + "learning_rate": 8.434903055807971e-06, + "loss": 1.9041, + "step": 5120 + }, + { + "epoch": 1.7453431997958662, + "grad_norm": 1.859375, + "learning_rate": 8.416984701906065e-06, + "loss": 1.9514, + "step": 5130 + }, + { + "epoch": 1.7487454282555075, + "grad_norm": 1.7421875, + "learning_rate": 8.399053490431994e-06, + "loss": 1.9846, + "step": 5140 + }, + { + "epoch": 1.7521476567151484, + "grad_norm": 2.03125, + "learning_rate": 8.38110955748975e-06, + "loss": 1.9242, + "step": 5150 + }, + { + "epoch": 1.7555498851747895, + "grad_norm": 2.015625, + "learning_rate": 8.363153039279882e-06, + "loss": 1.9853, + "step": 5160 + }, + { + "epoch": 1.7589521136344306, + "grad_norm": 2.15625, + "learning_rate": 8.345184072098464e-06, + "loss": 2.0005, + "step": 5170 + }, + { + "epoch": 1.7623543420940715, + "grad_norm": 2.171875, + "learning_rate": 8.327202792336068e-06, + "loss": 2.0181, + "step": 5180 + }, + { + "epoch": 1.7657565705537128, + "grad_norm": 2.234375, + "learning_rate": 8.309209336476713e-06, + "loss": 1.9119, + "step": 5190 + }, + { + "epoch": 1.7691587990133537, + "grad_norm": 2.328125, + "learning_rate": 8.29120384109685e-06, + "loss": 1.9061, + "step": 5200 + }, + { + "epoch": 1.7725610274729948, + "grad_norm": 2.046875, + "learning_rate": 8.273186442864303e-06, + "loss": 1.9584, + "step": 5210 + }, + { + "epoch": 1.775963255932636, + "grad_norm": 2.1875, + "learning_rate": 8.25515727853725e-06, + "loss": 1.9456, + "step": 5220 + }, + { + "epoch": 1.7793654843922768, + "grad_norm": 2.109375, + "learning_rate": 8.23711648496318e-06, + "loss": 1.9162, + "step": 5230 + }, + { + "epoch": 1.7827677128519182, + "grad_norm": 2.3125, + "learning_rate": 8.219064199077837e-06, + "loss": 1.9735, + "step": 5240 + }, + { + "epoch": 1.786169941311559, + "grad_norm": 2.296875, + "learning_rate": 8.201000557904217e-06, + "loss": 1.9512, + "step": 5250 + }, + { + "epoch": 1.7895721697712001, + "grad_norm": 2.046875, + "learning_rate": 8.182925698551491e-06, + "loss": 1.9886, + "step": 5260 + }, + { + "epoch": 1.7929743982308413, + "grad_norm": 2.390625, + "learning_rate": 8.164839758213986e-06, + "loss": 1.9956, + "step": 5270 + }, + { + "epoch": 1.7963766266904821, + "grad_norm": 2.28125, + "learning_rate": 8.14674287417013e-06, + "loss": 1.9076, + "step": 5280 + }, + { + "epoch": 1.7997788551501235, + "grad_norm": 1.84375, + "learning_rate": 8.128635183781433e-06, + "loss": 1.912, + "step": 5290 + }, + { + "epoch": 1.8031810836097644, + "grad_norm": 2.21875, + "learning_rate": 8.11051682449141e-06, + "loss": 1.9582, + "step": 5300 + }, + { + "epoch": 1.8065833120694055, + "grad_norm": 2.296875, + "learning_rate": 8.092387933824571e-06, + "loss": 1.979, + "step": 5310 + }, + { + "epoch": 1.8099855405290466, + "grad_norm": 2.46875, + "learning_rate": 8.074248649385357e-06, + "loss": 1.9679, + "step": 5320 + }, + { + "epoch": 1.8133877689886875, + "grad_norm": 2.21875, + "learning_rate": 8.056099108857101e-06, + "loss": 1.9288, + "step": 5330 + }, + { + "epoch": 1.8167899974483288, + "grad_norm": 2.296875, + "learning_rate": 8.037939450000985e-06, + "loss": 1.922, + "step": 5340 + }, + { + "epoch": 1.8201922259079697, + "grad_norm": 2.1875, + "learning_rate": 8.019769810654989e-06, + "loss": 1.9022, + "step": 5350 + }, + { + "epoch": 1.8235944543676108, + "grad_norm": 2.0, + "learning_rate": 8.00159032873285e-06, + "loss": 1.9698, + "step": 5360 + }, + { + "epoch": 1.826996682827252, + "grad_norm": 2.171875, + "learning_rate": 7.98340114222302e-06, + "loss": 1.9087, + "step": 5370 + }, + { + "epoch": 1.8303989112868928, + "grad_norm": 2.140625, + "learning_rate": 7.9652023891876e-06, + "loss": 1.9785, + "step": 5380 + }, + { + "epoch": 1.8338011397465341, + "grad_norm": 2.015625, + "learning_rate": 7.946994207761316e-06, + "loss": 1.9983, + "step": 5390 + }, + { + "epoch": 1.837203368206175, + "grad_norm": 2.328125, + "learning_rate": 7.928776736150451e-06, + "loss": 2.0148, + "step": 5400 + }, + { + "epoch": 1.8406055966658161, + "grad_norm": 2.109375, + "learning_rate": 7.910550112631802e-06, + "loss": 1.9808, + "step": 5410 + }, + { + "epoch": 1.8440078251254572, + "grad_norm": 2.15625, + "learning_rate": 7.892314475551641e-06, + "loss": 1.9153, + "step": 5420 + }, + { + "epoch": 1.8474100535850981, + "grad_norm": 2.109375, + "learning_rate": 7.87406996332465e-06, + "loss": 1.9285, + "step": 5430 + }, + { + "epoch": 1.8508122820447395, + "grad_norm": 2.34375, + "learning_rate": 7.855816714432878e-06, + "loss": 1.952, + "step": 5440 + }, + { + "epoch": 1.8542145105043804, + "grad_norm": 2.203125, + "learning_rate": 7.837554867424685e-06, + "loss": 1.9335, + "step": 5450 + }, + { + "epoch": 1.8576167389640215, + "grad_norm": 2.34375, + "learning_rate": 7.8192845609137e-06, + "loss": 1.943, + "step": 5460 + }, + { + "epoch": 1.8610189674236626, + "grad_norm": 2.203125, + "learning_rate": 7.801005933577753e-06, + "loss": 2.0204, + "step": 5470 + }, + { + "epoch": 1.8644211958833035, + "grad_norm": 2.1875, + "learning_rate": 7.782719124157842e-06, + "loss": 1.915, + "step": 5480 + }, + { + "epoch": 1.8678234243429448, + "grad_norm": 2.21875, + "learning_rate": 7.764424271457067e-06, + "loss": 1.9207, + "step": 5490 + }, + { + "epoch": 1.8712256528025857, + "grad_norm": 2.015625, + "learning_rate": 7.746121514339576e-06, + "loss": 1.9593, + "step": 5500 + }, + { + "epoch": 1.8746278812622268, + "grad_norm": 1.828125, + "learning_rate": 7.727810991729512e-06, + "loss": 1.904, + "step": 5510 + }, + { + "epoch": 1.878030109721868, + "grad_norm": 1.9140625, + "learning_rate": 7.709492842609971e-06, + "loss": 1.9757, + "step": 5520 + }, + { + "epoch": 1.8814323381815088, + "grad_norm": 1.9140625, + "learning_rate": 7.691167206021928e-06, + "loss": 1.938, + "step": 5530 + }, + { + "epoch": 1.88483456664115, + "grad_norm": 2.484375, + "learning_rate": 7.67283422106319e-06, + "loss": 1.956, + "step": 5540 + }, + { + "epoch": 1.888236795100791, + "grad_norm": 1.7578125, + "learning_rate": 7.654494026887346e-06, + "loss": 1.9298, + "step": 5550 + }, + { + "epoch": 1.8916390235604321, + "grad_norm": 1.890625, + "learning_rate": 7.636146762702703e-06, + "loss": 1.8893, + "step": 5560 + }, + { + "epoch": 1.8950412520200732, + "grad_norm": 2.15625, + "learning_rate": 7.617792567771233e-06, + "loss": 1.9309, + "step": 5570 + }, + { + "epoch": 1.8984434804797141, + "grad_norm": 2.578125, + "learning_rate": 7.59943158140751e-06, + "loss": 1.9064, + "step": 5580 + }, + { + "epoch": 1.9018457089393552, + "grad_norm": 2.203125, + "learning_rate": 7.581063942977662e-06, + "loss": 1.9647, + "step": 5590 + }, + { + "epoch": 1.9052479373989963, + "grad_norm": 2.1875, + "learning_rate": 7.56268979189831e-06, + "loss": 1.9417, + "step": 5600 + }, + { + "epoch": 1.9086501658586374, + "grad_norm": 2.421875, + "learning_rate": 7.544309267635502e-06, + "loss": 1.96, + "step": 5610 + }, + { + "epoch": 1.9120523943182786, + "grad_norm": 2.25, + "learning_rate": 7.525922509703665e-06, + "loss": 1.9672, + "step": 5620 + }, + { + "epoch": 1.9154546227779194, + "grad_norm": 2.1875, + "learning_rate": 7.507529657664538e-06, + "loss": 1.9975, + "step": 5630 + }, + { + "epoch": 1.9188568512375606, + "grad_norm": 2.078125, + "learning_rate": 7.489130851126123e-06, + "loss": 1.9763, + "step": 5640 + }, + { + "epoch": 1.9222590796972017, + "grad_norm": 2.171875, + "learning_rate": 7.470726229741613e-06, + "loss": 1.9206, + "step": 5650 + }, + { + "epoch": 1.9256613081568428, + "grad_norm": 2.484375, + "learning_rate": 7.45231593320834e-06, + "loss": 2.0314, + "step": 5660 + }, + { + "epoch": 1.9290635366164839, + "grad_norm": 2.109375, + "learning_rate": 7.433900101266712e-06, + "loss": 1.9449, + "step": 5670 + }, + { + "epoch": 1.9324657650761248, + "grad_norm": 2.0, + "learning_rate": 7.415478873699151e-06, + "loss": 1.9294, + "step": 5680 + }, + { + "epoch": 1.9358679935357659, + "grad_norm": 1.8828125, + "learning_rate": 7.3970523903290335e-06, + "loss": 1.8888, + "step": 5690 + }, + { + "epoch": 1.939270221995407, + "grad_norm": 2.25, + "learning_rate": 7.378620791019634e-06, + "loss": 1.9365, + "step": 5700 + }, + { + "epoch": 1.942672450455048, + "grad_norm": 1.8828125, + "learning_rate": 7.360184215673055e-06, + "loss": 1.9441, + "step": 5710 + }, + { + "epoch": 1.9460746789146892, + "grad_norm": 2.28125, + "learning_rate": 7.341742804229166e-06, + "loss": 1.9156, + "step": 5720 + }, + { + "epoch": 1.94947690737433, + "grad_norm": 2.375, + "learning_rate": 7.32329669666455e-06, + "loss": 1.9051, + "step": 5730 + }, + { + "epoch": 1.9528791358339712, + "grad_norm": 2.109375, + "learning_rate": 7.304846032991432e-06, + "loss": 2.0019, + "step": 5740 + }, + { + "epoch": 1.9562813642936123, + "grad_norm": 1.875, + "learning_rate": 7.2863909532566196e-06, + "loss": 1.8679, + "step": 5750 + }, + { + "epoch": 1.9596835927532534, + "grad_norm": 2.234375, + "learning_rate": 7.2679315975404405e-06, + "loss": 1.9605, + "step": 5760 + }, + { + "epoch": 1.9630858212128945, + "grad_norm": 1.9375, + "learning_rate": 7.249468105955679e-06, + "loss": 1.9355, + "step": 5770 + }, + { + "epoch": 1.9664880496725354, + "grad_norm": 2.0, + "learning_rate": 7.231000618646511e-06, + "loss": 1.8908, + "step": 5780 + }, + { + "epoch": 1.9698902781321765, + "grad_norm": 2.203125, + "learning_rate": 7.212529275787436e-06, + "loss": 1.9578, + "step": 5790 + }, + { + "epoch": 1.9732925065918177, + "grad_norm": 2.265625, + "learning_rate": 7.194054217582234e-06, + "loss": 1.9287, + "step": 5800 + }, + { + "epoch": 1.9766947350514585, + "grad_norm": 2.375, + "learning_rate": 7.17557558426287e-06, + "loss": 1.9672, + "step": 5810 + }, + { + "epoch": 1.9800969635110999, + "grad_norm": 2.0, + "learning_rate": 7.157093516088451e-06, + "loss": 1.9581, + "step": 5820 + }, + { + "epoch": 1.9834991919707408, + "grad_norm": 2.015625, + "learning_rate": 7.138608153344156e-06, + "loss": 1.9872, + "step": 5830 + }, + { + "epoch": 1.9869014204303819, + "grad_norm": 1.921875, + "learning_rate": 7.120119636340172e-06, + "loss": 1.9525, + "step": 5840 + }, + { + "epoch": 1.990303648890023, + "grad_norm": 1.890625, + "learning_rate": 7.101628105410625e-06, + "loss": 1.9093, + "step": 5850 + }, + { + "epoch": 1.9937058773496639, + "grad_norm": 2.234375, + "learning_rate": 7.0831337009125195e-06, + "loss": 1.9706, + "step": 5860 + }, + { + "epoch": 1.9971081058093052, + "grad_norm": 2.3125, + "learning_rate": 7.064636563224674e-06, + "loss": 1.9331, + "step": 5870 + }, + { + "epoch": 2.000510334268946, + "grad_norm": 2.203125, + "learning_rate": 7.046136832746647e-06, + "loss": 1.9434, + "step": 5880 + }, + { + "epoch": 2.0039125627285874, + "grad_norm": 2.265625, + "learning_rate": 7.027634649897679e-06, + "loss": 1.8678, + "step": 5890 + }, + { + "epoch": 2.0073147911882283, + "grad_norm": 2.421875, + "learning_rate": 7.009130155115627e-06, + "loss": 1.9193, + "step": 5900 + }, + { + "epoch": 2.010717019647869, + "grad_norm": 2.125, + "learning_rate": 6.990623488855899e-06, + "loss": 1.9459, + "step": 5910 + }, + { + "epoch": 2.0141192481075105, + "grad_norm": 2.46875, + "learning_rate": 6.972114791590378e-06, + "loss": 1.9229, + "step": 5920 + }, + { + "epoch": 2.0175214765671514, + "grad_norm": 2.03125, + "learning_rate": 6.953604203806366e-06, + "loss": 1.9008, + "step": 5930 + }, + { + "epoch": 2.0209237050267927, + "grad_norm": 2.5625, + "learning_rate": 6.935091866005518e-06, + "loss": 1.9513, + "step": 5940 + }, + { + "epoch": 2.0243259334864336, + "grad_norm": 2.125, + "learning_rate": 6.9165779187027685e-06, + "loss": 1.9013, + "step": 5950 + }, + { + "epoch": 2.0277281619460745, + "grad_norm": 2.25, + "learning_rate": 6.898062502425267e-06, + "loss": 1.914, + "step": 5960 + }, + { + "epoch": 2.031130390405716, + "grad_norm": 2.015625, + "learning_rate": 6.87954575771132e-06, + "loss": 1.8773, + "step": 5970 + }, + { + "epoch": 2.0345326188653567, + "grad_norm": 2.234375, + "learning_rate": 6.861027825109312e-06, + "loss": 1.9337, + "step": 5980 + }, + { + "epoch": 2.037934847324998, + "grad_norm": 2.234375, + "learning_rate": 6.842508845176642e-06, + "loss": 1.9866, + "step": 5990 + }, + { + "epoch": 2.041337075784639, + "grad_norm": 1.9921875, + "learning_rate": 6.8239889584786644e-06, + "loss": 1.9557, + "step": 6000 + }, + { + "epoch": 2.04473930424428, + "grad_norm": 2.0, + "learning_rate": 6.805468305587612e-06, + "loss": 1.9082, + "step": 6010 + }, + { + "epoch": 2.048141532703921, + "grad_norm": 2.234375, + "learning_rate": 6.786947027081537e-06, + "loss": 1.8822, + "step": 6020 + }, + { + "epoch": 2.051543761163562, + "grad_norm": 2.296875, + "learning_rate": 6.768425263543234e-06, + "loss": 1.9611, + "step": 6030 + }, + { + "epoch": 2.0549459896232034, + "grad_norm": 2.171875, + "learning_rate": 6.7499031555591875e-06, + "loss": 1.9623, + "step": 6040 + }, + { + "epoch": 2.0583482180828443, + "grad_norm": 2.328125, + "learning_rate": 6.7313808437184895e-06, + "loss": 1.9902, + "step": 6050 + }, + { + "epoch": 2.061750446542485, + "grad_norm": 2.21875, + "learning_rate": 6.71285846861178e-06, + "loss": 1.9358, + "step": 6060 + }, + { + "epoch": 2.0651526750021265, + "grad_norm": 2.40625, + "learning_rate": 6.694336170830184e-06, + "loss": 1.8377, + "step": 6070 + }, + { + "epoch": 2.0685549034617674, + "grad_norm": 2.359375, + "learning_rate": 6.675814090964238e-06, + "loss": 1.9771, + "step": 6080 + }, + { + "epoch": 2.0719571319214087, + "grad_norm": 2.0625, + "learning_rate": 6.6572923696028185e-06, + "loss": 1.8634, + "step": 6090 + }, + { + "epoch": 2.0753593603810496, + "grad_norm": 2.609375, + "learning_rate": 6.638771147332086e-06, + "loss": 1.9388, + "step": 6100 + }, + { + "epoch": 2.0787615888406905, + "grad_norm": 2.203125, + "learning_rate": 6.62025056473442e-06, + "loss": 1.918, + "step": 6110 + }, + { + "epoch": 2.082163817300332, + "grad_norm": 2.234375, + "learning_rate": 6.601730762387327e-06, + "loss": 1.9617, + "step": 6120 + }, + { + "epoch": 2.0855660457599727, + "grad_norm": 2.234375, + "learning_rate": 6.583211880862406e-06, + "loss": 1.9056, + "step": 6130 + }, + { + "epoch": 2.0889682742196136, + "grad_norm": 2.15625, + "learning_rate": 6.56469406072426e-06, + "loss": 1.9458, + "step": 6140 + }, + { + "epoch": 2.092370502679255, + "grad_norm": 2.109375, + "learning_rate": 6.546177442529437e-06, + "loss": 1.9393, + "step": 6150 + }, + { + "epoch": 2.095772731138896, + "grad_norm": 2.140625, + "learning_rate": 6.5276621668253645e-06, + "loss": 1.9038, + "step": 6160 + }, + { + "epoch": 2.099174959598537, + "grad_norm": 2.265625, + "learning_rate": 6.509148374149276e-06, + "loss": 1.9621, + "step": 6170 + }, + { + "epoch": 2.102577188058178, + "grad_norm": 2.015625, + "learning_rate": 6.490636205027152e-06, + "loss": 1.9206, + "step": 6180 + }, + { + "epoch": 2.105979416517819, + "grad_norm": 2.515625, + "learning_rate": 6.472125799972643e-06, + "loss": 1.9409, + "step": 6190 + }, + { + "epoch": 2.1093816449774603, + "grad_norm": 2.53125, + "learning_rate": 6.453617299486017e-06, + "loss": 1.9348, + "step": 6200 + }, + { + "epoch": 2.112783873437101, + "grad_norm": 2.109375, + "learning_rate": 6.435110844053086e-06, + "loss": 1.9364, + "step": 6210 + }, + { + "epoch": 2.1161861018967425, + "grad_norm": 2.46875, + "learning_rate": 6.416606574144131e-06, + "loss": 1.9042, + "step": 6220 + }, + { + "epoch": 2.1195883303563834, + "grad_norm": 2.34375, + "learning_rate": 6.398104630212853e-06, + "loss": 1.9547, + "step": 6230 + }, + { + "epoch": 2.1229905588160243, + "grad_norm": 2.4375, + "learning_rate": 6.379605152695294e-06, + "loss": 1.9768, + "step": 6240 + }, + { + "epoch": 2.1263927872756656, + "grad_norm": 2.125, + "learning_rate": 6.361108282008776e-06, + "loss": 1.9522, + "step": 6250 + }, + { + "epoch": 2.1297950157353065, + "grad_norm": 1.8359375, + "learning_rate": 6.342614158550832e-06, + "loss": 1.9168, + "step": 6260 + }, + { + "epoch": 2.133197244194948, + "grad_norm": 2.3125, + "learning_rate": 6.324122922698143e-06, + "loss": 1.9871, + "step": 6270 + }, + { + "epoch": 2.1365994726545887, + "grad_norm": 2.28125, + "learning_rate": 6.305634714805481e-06, + "loss": 1.9398, + "step": 6280 + }, + { + "epoch": 2.1400017011142296, + "grad_norm": 1.921875, + "learning_rate": 6.287149675204619e-06, + "loss": 1.9629, + "step": 6290 + }, + { + "epoch": 2.143403929573871, + "grad_norm": 2.421875, + "learning_rate": 6.268667944203294e-06, + "loss": 1.9102, + "step": 6300 + }, + { + "epoch": 2.146806158033512, + "grad_norm": 2.28125, + "learning_rate": 6.2501896620841255e-06, + "loss": 1.8596, + "step": 6310 + }, + { + "epoch": 2.150208386493153, + "grad_norm": 2.265625, + "learning_rate": 6.231714969103553e-06, + "loss": 1.7886, + "step": 6320 + }, + { + "epoch": 2.153610614952794, + "grad_norm": 2.3125, + "learning_rate": 6.213244005490776e-06, + "loss": 1.9695, + "step": 6330 + }, + { + "epoch": 2.157012843412435, + "grad_norm": 2.09375, + "learning_rate": 6.194776911446687e-06, + "loss": 1.971, + "step": 6340 + }, + { + "epoch": 2.1604150718720763, + "grad_norm": 2.375, + "learning_rate": 6.176313827142807e-06, + "loss": 1.9136, + "step": 6350 + }, + { + "epoch": 2.163817300331717, + "grad_norm": 2.25, + "learning_rate": 6.157854892720216e-06, + "loss": 1.9184, + "step": 6360 + }, + { + "epoch": 2.1672195287913585, + "grad_norm": 2.09375, + "learning_rate": 6.139400248288503e-06, + "loss": 1.9933, + "step": 6370 + }, + { + "epoch": 2.1706217572509994, + "grad_norm": 1.8984375, + "learning_rate": 6.120950033924691e-06, + "loss": 1.9114, + "step": 6380 + }, + { + "epoch": 2.1740239857106403, + "grad_norm": 2.078125, + "learning_rate": 6.102504389672177e-06, + "loss": 1.9974, + "step": 6390 + }, + { + "epoch": 2.1774262141702816, + "grad_norm": 1.9140625, + "learning_rate": 6.084063455539671e-06, + "loss": 1.8925, + "step": 6400 + }, + { + "epoch": 2.1808284426299225, + "grad_norm": 2.40625, + "learning_rate": 6.065627371500128e-06, + "loss": 1.9208, + "step": 6410 + }, + { + "epoch": 2.184230671089564, + "grad_norm": 2.609375, + "learning_rate": 6.0471962774896946e-06, + "loss": 1.8757, + "step": 6420 + }, + { + "epoch": 2.1876328995492047, + "grad_norm": 1.8203125, + "learning_rate": 6.0287703134066385e-06, + "loss": 1.905, + "step": 6430 + }, + { + "epoch": 2.1910351280088456, + "grad_norm": 2.46875, + "learning_rate": 6.010349619110283e-06, + "loss": 1.8878, + "step": 6440 + }, + { + "epoch": 2.194437356468487, + "grad_norm": 2.15625, + "learning_rate": 5.991934334419968e-06, + "loss": 1.9549, + "step": 6450 + }, + { + "epoch": 2.197839584928128, + "grad_norm": 2.125, + "learning_rate": 5.973524599113954e-06, + "loss": 1.9137, + "step": 6460 + }, + { + "epoch": 2.201241813387769, + "grad_norm": 2.453125, + "learning_rate": 5.9551205529283955e-06, + "loss": 1.9856, + "step": 6470 + }, + { + "epoch": 2.20464404184741, + "grad_norm": 2.09375, + "learning_rate": 5.936722335556252e-06, + "loss": 1.9262, + "step": 6480 + }, + { + "epoch": 2.208046270307051, + "grad_norm": 1.9609375, + "learning_rate": 5.91833008664625e-06, + "loss": 1.9596, + "step": 6490 + }, + { + "epoch": 2.2114484987666922, + "grad_norm": 2.28125, + "learning_rate": 5.89994394580181e-06, + "loss": 1.907, + "step": 6500 + }, + { + "epoch": 2.214850727226333, + "grad_norm": 2.125, + "learning_rate": 5.881564052579987e-06, + "loss": 1.938, + "step": 6510 + }, + { + "epoch": 2.2182529556859745, + "grad_norm": 2.1875, + "learning_rate": 5.863190546490422e-06, + "loss": 1.9615, + "step": 6520 + }, + { + "epoch": 2.2216551841456154, + "grad_norm": 2.078125, + "learning_rate": 5.844823566994264e-06, + "loss": 1.9353, + "step": 6530 + }, + { + "epoch": 2.2250574126052562, + "grad_norm": 2.75, + "learning_rate": 5.826463253503132e-06, + "loss": 1.98, + "step": 6540 + }, + { + "epoch": 2.2284596410648976, + "grad_norm": 2.25, + "learning_rate": 5.808109745378048e-06, + "loss": 1.8649, + "step": 6550 + }, + { + "epoch": 2.2318618695245385, + "grad_norm": 2.265625, + "learning_rate": 5.789763181928373e-06, + "loss": 1.9079, + "step": 6560 + }, + { + "epoch": 2.23526409798418, + "grad_norm": 2.421875, + "learning_rate": 5.771423702410762e-06, + "loss": 1.9156, + "step": 6570 + }, + { + "epoch": 2.2386663264438207, + "grad_norm": 2.0, + "learning_rate": 5.753091446028094e-06, + "loss": 1.9416, + "step": 6580 + }, + { + "epoch": 2.2420685549034616, + "grad_norm": 2.265625, + "learning_rate": 5.734766551928427e-06, + "loss": 1.8595, + "step": 6590 + }, + { + "epoch": 2.245470783363103, + "grad_norm": 2.3125, + "learning_rate": 5.716449159203939e-06, + "loss": 1.9292, + "step": 6600 + }, + { + "epoch": 2.248873011822744, + "grad_norm": 2.15625, + "learning_rate": 5.698139406889855e-06, + "loss": 1.9578, + "step": 6610 + }, + { + "epoch": 2.252275240282385, + "grad_norm": 2.203125, + "learning_rate": 5.679837433963432e-06, + "loss": 1.9706, + "step": 6620 + }, + { + "epoch": 2.255677468742026, + "grad_norm": 2.359375, + "learning_rate": 5.661543379342855e-06, + "loss": 1.9641, + "step": 6630 + }, + { + "epoch": 2.259079697201667, + "grad_norm": 2.328125, + "learning_rate": 5.643257381886218e-06, + "loss": 1.9505, + "step": 6640 + }, + { + "epoch": 2.2624819256613082, + "grad_norm": 2.046875, + "learning_rate": 5.624979580390459e-06, + "loss": 1.9631, + "step": 6650 + }, + { + "epoch": 2.265884154120949, + "grad_norm": 2.375, + "learning_rate": 5.6067101135902996e-06, + "loss": 1.9767, + "step": 6660 + }, + { + "epoch": 2.2692863825805905, + "grad_norm": 1.8515625, + "learning_rate": 5.588449120157205e-06, + "loss": 1.9077, + "step": 6670 + }, + { + "epoch": 2.2726886110402313, + "grad_norm": 2.3125, + "learning_rate": 5.57019673869832e-06, + "loss": 1.9133, + "step": 6680 + }, + { + "epoch": 2.2760908394998722, + "grad_norm": 2.265625, + "learning_rate": 5.5519531077554244e-06, + "loss": 1.8405, + "step": 6690 + }, + { + "epoch": 2.2794930679595136, + "grad_norm": 2.375, + "learning_rate": 5.533718365803875e-06, + "loss": 1.8948, + "step": 6700 + }, + { + "epoch": 2.2828952964191545, + "grad_norm": 2.265625, + "learning_rate": 5.51549265125156e-06, + "loss": 1.9344, + "step": 6710 + }, + { + "epoch": 2.286297524878796, + "grad_norm": 2.015625, + "learning_rate": 5.4972761024378514e-06, + "loss": 1.842, + "step": 6720 + }, + { + "epoch": 2.2896997533384367, + "grad_norm": 2.28125, + "learning_rate": 5.479068857632542e-06, + "loss": 1.9172, + "step": 6730 + }, + { + "epoch": 2.2931019817980776, + "grad_norm": 2.171875, + "learning_rate": 5.46087105503481e-06, + "loss": 1.9252, + "step": 6740 + }, + { + "epoch": 2.296504210257719, + "grad_norm": 2.21875, + "learning_rate": 5.4426828327721594e-06, + "loss": 1.9356, + "step": 6750 + }, + { + "epoch": 2.29990643871736, + "grad_norm": 2.3125, + "learning_rate": 5.4245043288993795e-06, + "loss": 1.9462, + "step": 6760 + }, + { + "epoch": 2.303308667177001, + "grad_norm": 2.375, + "learning_rate": 5.406335681397498e-06, + "loss": 1.9788, + "step": 6770 + }, + { + "epoch": 2.306710895636642, + "grad_norm": 2.578125, + "learning_rate": 5.388177028172714e-06, + "loss": 1.9221, + "step": 6780 + }, + { + "epoch": 2.310113124096283, + "grad_norm": 1.9609375, + "learning_rate": 5.370028507055387e-06, + "loss": 1.9344, + "step": 6790 + }, + { + "epoch": 2.313515352555924, + "grad_norm": 2.140625, + "learning_rate": 5.351890255798953e-06, + "loss": 1.871, + "step": 6800 + }, + { + "epoch": 2.316917581015565, + "grad_norm": 1.984375, + "learning_rate": 5.333762412078907e-06, + "loss": 1.975, + "step": 6810 + }, + { + "epoch": 2.3203198094752064, + "grad_norm": 2.21875, + "learning_rate": 5.315645113491743e-06, + "loss": 1.9103, + "step": 6820 + }, + { + "epoch": 2.3237220379348473, + "grad_norm": 2.203125, + "learning_rate": 5.2975384975539145e-06, + "loss": 1.9036, + "step": 6830 + }, + { + "epoch": 2.327124266394488, + "grad_norm": 2.140625, + "learning_rate": 5.279442701700792e-06, + "loss": 1.9292, + "step": 6840 + }, + { + "epoch": 2.3305264948541295, + "grad_norm": 2.34375, + "learning_rate": 5.261357863285613e-06, + "loss": 1.9181, + "step": 6850 + }, + { + "epoch": 2.3339287233137704, + "grad_norm": 2.359375, + "learning_rate": 5.243284119578448e-06, + "loss": 1.8917, + "step": 6860 + }, + { + "epoch": 2.3373309517734118, + "grad_norm": 2.484375, + "learning_rate": 5.225221607765159e-06, + "loss": 1.9389, + "step": 6870 + }, + { + "epoch": 2.3407331802330527, + "grad_norm": 2.6875, + "learning_rate": 5.207170464946342e-06, + "loss": 1.9298, + "step": 6880 + }, + { + "epoch": 2.3441354086926935, + "grad_norm": 2.078125, + "learning_rate": 5.189130828136312e-06, + "loss": 1.9011, + "step": 6890 + }, + { + "epoch": 2.347537637152335, + "grad_norm": 2.40625, + "learning_rate": 5.1711028342620375e-06, + "loss": 1.908, + "step": 6900 + }, + { + "epoch": 2.3509398656119758, + "grad_norm": 2.65625, + "learning_rate": 5.153086620162123e-06, + "loss": 1.8829, + "step": 6910 + }, + { + "epoch": 2.354342094071617, + "grad_norm": 2.25, + "learning_rate": 5.135082322585758e-06, + "loss": 1.9441, + "step": 6920 + }, + { + "epoch": 2.357744322531258, + "grad_norm": 2.4375, + "learning_rate": 5.117090078191676e-06, + "loss": 1.9403, + "step": 6930 + }, + { + "epoch": 2.361146550990899, + "grad_norm": 2.46875, + "learning_rate": 5.09911002354713e-06, + "loss": 1.9478, + "step": 6940 + }, + { + "epoch": 2.36454877945054, + "grad_norm": 2.0625, + "learning_rate": 5.081142295126842e-06, + "loss": 1.8916, + "step": 6950 + }, + { + "epoch": 2.367951007910181, + "grad_norm": 2.4375, + "learning_rate": 5.063187029311983e-06, + "loss": 1.9323, + "step": 6960 + }, + { + "epoch": 2.3713532363698224, + "grad_norm": 1.9375, + "learning_rate": 5.045244362389115e-06, + "loss": 1.9571, + "step": 6970 + }, + { + "epoch": 2.3747554648294633, + "grad_norm": 1.8359375, + "learning_rate": 5.027314430549185e-06, + "loss": 1.9486, + "step": 6980 + }, + { + "epoch": 2.378157693289104, + "grad_norm": 2.4375, + "learning_rate": 5.009397369886466e-06, + "loss": 1.944, + "step": 6990 + }, + { + "epoch": 2.3815599217487455, + "grad_norm": 2.390625, + "learning_rate": 4.991493316397536e-06, + "loss": 1.9539, + "step": 7000 + }, + { + "epoch": 2.3849621502083864, + "grad_norm": 2.21875, + "learning_rate": 4.973602405980251e-06, + "loss": 1.8877, + "step": 7010 + }, + { + "epoch": 2.3883643786680278, + "grad_norm": 2.1875, + "learning_rate": 4.955724774432697e-06, + "loss": 1.9579, + "step": 7020 + }, + { + "epoch": 2.3917666071276686, + "grad_norm": 2.4375, + "learning_rate": 4.937860557452174e-06, + "loss": 1.9066, + "step": 7030 + }, + { + "epoch": 2.3951688355873095, + "grad_norm": 2.328125, + "learning_rate": 4.920009890634164e-06, + "loss": 1.9488, + "step": 7040 + }, + { + "epoch": 2.398571064046951, + "grad_norm": 2.203125, + "learning_rate": 4.902172909471289e-06, + "loss": 1.9939, + "step": 7050 + }, + { + "epoch": 2.4019732925065918, + "grad_norm": 2.390625, + "learning_rate": 4.884349749352304e-06, + "loss": 1.9718, + "step": 7060 + }, + { + "epoch": 2.405375520966233, + "grad_norm": 2.53125, + "learning_rate": 4.866540545561045e-06, + "loss": 1.9198, + "step": 7070 + }, + { + "epoch": 2.408777749425874, + "grad_norm": 2.421875, + "learning_rate": 4.848745433275427e-06, + "loss": 1.8993, + "step": 7080 + }, + { + "epoch": 2.412179977885515, + "grad_norm": 2.65625, + "learning_rate": 4.830964547566399e-06, + "loss": 1.9977, + "step": 7090 + }, + { + "epoch": 2.415582206345156, + "grad_norm": 2.265625, + "learning_rate": 4.813198023396925e-06, + "loss": 1.911, + "step": 7100 + }, + { + "epoch": 2.418984434804797, + "grad_norm": 2.25, + "learning_rate": 4.795445995620965e-06, + "loss": 1.977, + "step": 7110 + }, + { + "epoch": 2.4223866632644384, + "grad_norm": 2.203125, + "learning_rate": 4.777708598982436e-06, + "loss": 1.9065, + "step": 7120 + }, + { + "epoch": 2.4257888917240793, + "grad_norm": 2.28125, + "learning_rate": 4.759985968114213e-06, + "loss": 1.9569, + "step": 7130 + }, + { + "epoch": 2.42919112018372, + "grad_norm": 2.59375, + "learning_rate": 4.742278237537088e-06, + "loss": 1.9151, + "step": 7140 + }, + { + "epoch": 2.4325933486433615, + "grad_norm": 1.90625, + "learning_rate": 4.72458554165875e-06, + "loss": 1.984, + "step": 7150 + }, + { + "epoch": 2.4359955771030024, + "grad_norm": 1.9453125, + "learning_rate": 4.706908014772776e-06, + "loss": 1.9921, + "step": 7160 + }, + { + "epoch": 2.4393978055626437, + "grad_norm": 2.515625, + "learning_rate": 4.689245791057602e-06, + "loss": 1.9753, + "step": 7170 + }, + { + "epoch": 2.4428000340222846, + "grad_norm": 1.9765625, + "learning_rate": 4.671599004575511e-06, + "loss": 1.9305, + "step": 7180 + }, + { + "epoch": 2.4462022624819255, + "grad_norm": 2.34375, + "learning_rate": 4.653967789271607e-06, + "loss": 1.8709, + "step": 7190 + }, + { + "epoch": 2.449604490941567, + "grad_norm": 2.359375, + "learning_rate": 4.636352278972806e-06, + "loss": 1.9123, + "step": 7200 + }, + { + "epoch": 2.4530067194012077, + "grad_norm": 2.046875, + "learning_rate": 4.618752607386824e-06, + "loss": 1.8976, + "step": 7210 + }, + { + "epoch": 2.456408947860849, + "grad_norm": 2.375, + "learning_rate": 4.601168908101142e-06, + "loss": 2.0117, + "step": 7220 + }, + { + "epoch": 2.45981117632049, + "grad_norm": 2.25, + "learning_rate": 4.5836013145820175e-06, + "loss": 1.8844, + "step": 7230 + }, + { + "epoch": 2.463213404780131, + "grad_norm": 2.40625, + "learning_rate": 4.5660499601734545e-06, + "loss": 1.9541, + "step": 7240 + }, + { + "epoch": 2.466615633239772, + "grad_norm": 2.375, + "learning_rate": 4.548514978096198e-06, + "loss": 1.9029, + "step": 7250 + }, + { + "epoch": 2.470017861699413, + "grad_norm": 2.34375, + "learning_rate": 4.5309965014467246e-06, + "loss": 1.9122, + "step": 7260 + }, + { + "epoch": 2.4734200901590544, + "grad_norm": 2.125, + "learning_rate": 4.513494663196221e-06, + "loss": 1.8935, + "step": 7270 + }, + { + "epoch": 2.4768223186186953, + "grad_norm": 2.546875, + "learning_rate": 4.496009596189593e-06, + "loss": 1.9198, + "step": 7280 + }, + { + "epoch": 2.480224547078336, + "grad_norm": 2.71875, + "learning_rate": 4.478541433144435e-06, + "loss": 1.8702, + "step": 7290 + }, + { + "epoch": 2.4836267755379775, + "grad_norm": 2.171875, + "learning_rate": 4.461090306650046e-06, + "loss": 1.9336, + "step": 7300 + }, + { + "epoch": 2.4870290039976184, + "grad_norm": 2.40625, + "learning_rate": 4.443656349166409e-06, + "loss": 1.9156, + "step": 7310 + }, + { + "epoch": 2.4904312324572597, + "grad_norm": 2.078125, + "learning_rate": 4.426239693023181e-06, + "loss": 1.949, + "step": 7320 + }, + { + "epoch": 2.4938334609169006, + "grad_norm": 2.34375, + "learning_rate": 4.408840470418706e-06, + "loss": 1.9331, + "step": 7330 + }, + { + "epoch": 2.4972356893765415, + "grad_norm": 2.046875, + "learning_rate": 4.391458813418992e-06, + "loss": 1.9376, + "step": 7340 + }, + { + "epoch": 2.500637917836183, + "grad_norm": 2.171875, + "learning_rate": 4.374094853956726e-06, + "loss": 1.8894, + "step": 7350 + }, + { + "epoch": 2.5040401462958237, + "grad_norm": 2.40625, + "learning_rate": 4.3567487238302625e-06, + "loss": 2.0008, + "step": 7360 + }, + { + "epoch": 2.507442374755465, + "grad_norm": 2.5, + "learning_rate": 4.3394205547026224e-06, + "loss": 1.8901, + "step": 7370 + }, + { + "epoch": 2.510844603215106, + "grad_norm": 2.25, + "learning_rate": 4.322110478100502e-06, + "loss": 1.9533, + "step": 7380 + }, + { + "epoch": 2.514246831674747, + "grad_norm": 2.171875, + "learning_rate": 4.3048186254132606e-06, + "loss": 1.9216, + "step": 7390 + }, + { + "epoch": 2.517649060134388, + "grad_norm": 2.453125, + "learning_rate": 4.287545127891939e-06, + "loss": 1.9397, + "step": 7400 + }, + { + "epoch": 2.521051288594029, + "grad_norm": 2.1875, + "learning_rate": 4.270290116648254e-06, + "loss": 1.9161, + "step": 7410 + }, + { + "epoch": 2.5244535170536704, + "grad_norm": 2.484375, + "learning_rate": 4.2530537226536075e-06, + "loss": 1.8427, + "step": 7420 + }, + { + "epoch": 2.5278557455133113, + "grad_norm": 2.84375, + "learning_rate": 4.235836076738085e-06, + "loss": 1.917, + "step": 7430 + }, + { + "epoch": 2.531257973972952, + "grad_norm": 2.453125, + "learning_rate": 4.218637309589471e-06, + "loss": 1.8681, + "step": 7440 + }, + { + "epoch": 2.5346602024325935, + "grad_norm": 2.171875, + "learning_rate": 4.201457551752256e-06, + "loss": 1.9049, + "step": 7450 + }, + { + "epoch": 2.5380624308922344, + "grad_norm": 2.1875, + "learning_rate": 4.184296933626636e-06, + "loss": 1.9001, + "step": 7460 + }, + { + "epoch": 2.5414646593518757, + "grad_norm": 2.46875, + "learning_rate": 4.167155585467538e-06, + "loss": 1.895, + "step": 7470 + }, + { + "epoch": 2.5448668878115166, + "grad_norm": 1.890625, + "learning_rate": 4.150033637383623e-06, + "loss": 1.9132, + "step": 7480 + }, + { + "epoch": 2.5482691162711575, + "grad_norm": 2.296875, + "learning_rate": 4.132931219336289e-06, + "loss": 1.9031, + "step": 7490 + }, + { + "epoch": 2.551671344730799, + "grad_norm": 2.15625, + "learning_rate": 4.115848461138707e-06, + "loss": 1.8727, + "step": 7500 + }, + { + "epoch": 2.5550735731904397, + "grad_norm": 2.5, + "learning_rate": 4.0987854924548134e-06, + "loss": 1.8808, + "step": 7510 + }, + { + "epoch": 2.558475801650081, + "grad_norm": 2.5, + "learning_rate": 4.081742442798342e-06, + "loss": 1.9265, + "step": 7520 + }, + { + "epoch": 2.561878030109722, + "grad_norm": 2.390625, + "learning_rate": 4.064719441531834e-06, + "loss": 1.9463, + "step": 7530 + }, + { + "epoch": 2.565280258569363, + "grad_norm": 2.6875, + "learning_rate": 4.04771661786565e-06, + "loss": 1.9341, + "step": 7540 + }, + { + "epoch": 2.568682487029004, + "grad_norm": 1.9296875, + "learning_rate": 4.030734100857004e-06, + "loss": 1.9036, + "step": 7550 + }, + { + "epoch": 2.572084715488645, + "grad_norm": 2.21875, + "learning_rate": 4.013772019408969e-06, + "loss": 1.9604, + "step": 7560 + }, + { + "epoch": 2.5754869439482864, + "grad_norm": 2.171875, + "learning_rate": 3.9968305022695076e-06, + "loss": 1.8938, + "step": 7570 + }, + { + "epoch": 2.5788891724079273, + "grad_norm": 2.0625, + "learning_rate": 3.979909678030498e-06, + "loss": 1.976, + "step": 7580 + }, + { + "epoch": 2.582291400867568, + "grad_norm": 2.609375, + "learning_rate": 3.9630096751267395e-06, + "loss": 1.9534, + "step": 7590 + }, + { + "epoch": 2.5856936293272095, + "grad_norm": 2.1875, + "learning_rate": 3.946130621835003e-06, + "loss": 1.9374, + "step": 7600 + }, + { + "epoch": 2.5890958577868504, + "grad_norm": 2.359375, + "learning_rate": 3.929272646273037e-06, + "loss": 1.9044, + "step": 7610 + }, + { + "epoch": 2.5924980862464917, + "grad_norm": 2.265625, + "learning_rate": 3.9124358763986045e-06, + "loss": 1.9723, + "step": 7620 + }, + { + "epoch": 2.5959003147061326, + "grad_norm": 2.578125, + "learning_rate": 3.895620440008517e-06, + "loss": 1.8593, + "step": 7630 + }, + { + "epoch": 2.5993025431657735, + "grad_norm": 2.5, + "learning_rate": 3.878826464737643e-06, + "loss": 1.9203, + "step": 7640 + }, + { + "epoch": 2.602704771625415, + "grad_norm": 2.5625, + "learning_rate": 3.862054078057968e-06, + "loss": 1.9127, + "step": 7650 + }, + { + "epoch": 2.6061070000850557, + "grad_norm": 2.421875, + "learning_rate": 3.845303407277605e-06, + "loss": 1.8969, + "step": 7660 + }, + { + "epoch": 2.609509228544697, + "grad_norm": 2.078125, + "learning_rate": 3.828574579539842e-06, + "loss": 1.957, + "step": 7670 + }, + { + "epoch": 2.612911457004338, + "grad_norm": 2.046875, + "learning_rate": 3.811867721822161e-06, + "loss": 1.9497, + "step": 7680 + }, + { + "epoch": 2.616313685463979, + "grad_norm": 2.484375, + "learning_rate": 3.7951829609352926e-06, + "loss": 1.9144, + "step": 7690 + }, + { + "epoch": 2.61971591392362, + "grad_norm": 2.640625, + "learning_rate": 3.778520423522247e-06, + "loss": 1.9252, + "step": 7700 + }, + { + "epoch": 2.623118142383261, + "grad_norm": 2.390625, + "learning_rate": 3.7618802360573384e-06, + "loss": 1.9192, + "step": 7710 + }, + { + "epoch": 2.6265203708429024, + "grad_norm": 2.0, + "learning_rate": 3.7452625248452478e-06, + "loss": 1.887, + "step": 7720 + }, + { + "epoch": 2.6299225993025432, + "grad_norm": 2.390625, + "learning_rate": 3.728667416020052e-06, + "loss": 1.9326, + "step": 7730 + }, + { + "epoch": 2.633324827762184, + "grad_norm": 2.484375, + "learning_rate": 3.7120950355442677e-06, + "loss": 1.9739, + "step": 7740 + }, + { + "epoch": 2.6367270562218255, + "grad_norm": 2.1875, + "learning_rate": 3.6955455092078956e-06, + "loss": 1.9417, + "step": 7750 + }, + { + "epoch": 2.6401292846814663, + "grad_norm": 2.078125, + "learning_rate": 3.679018962627461e-06, + "loss": 1.9288, + "step": 7760 + }, + { + "epoch": 2.6435315131411077, + "grad_norm": 2.0625, + "learning_rate": 3.6625155212450754e-06, + "loss": 1.9062, + "step": 7770 + }, + { + "epoch": 2.6469337416007486, + "grad_norm": 2.625, + "learning_rate": 3.6460353103274615e-06, + "loss": 1.9304, + "step": 7780 + }, + { + "epoch": 2.6503359700603895, + "grad_norm": 2.109375, + "learning_rate": 3.6295784549650233e-06, + "loss": 1.9378, + "step": 7790 + }, + { + "epoch": 2.6537381985200303, + "grad_norm": 2.234375, + "learning_rate": 3.613145080070886e-06, + "loss": 1.9244, + "step": 7800 + }, + { + "epoch": 2.6571404269796717, + "grad_norm": 2.328125, + "learning_rate": 3.59673531037995e-06, + "loss": 1.8997, + "step": 7810 + }, + { + "epoch": 2.660542655439313, + "grad_norm": 2.203125, + "learning_rate": 3.5803492704479488e-06, + "loss": 1.9715, + "step": 7820 + }, + { + "epoch": 2.663944883898954, + "grad_norm": 2.0625, + "learning_rate": 3.5639870846504873e-06, + "loss": 1.917, + "step": 7830 + }, + { + "epoch": 2.667347112358595, + "grad_norm": 2.4375, + "learning_rate": 3.54764887718212e-06, + "loss": 1.9122, + "step": 7840 + }, + { + "epoch": 2.6707493408182357, + "grad_norm": 2.265625, + "learning_rate": 3.5313347720553963e-06, + "loss": 1.9234, + "step": 7850 + }, + { + "epoch": 2.674151569277877, + "grad_norm": 2.359375, + "learning_rate": 3.5150448930999113e-06, + "loss": 1.9519, + "step": 7860 + }, + { + "epoch": 2.6775537977375183, + "grad_norm": 2.25, + "learning_rate": 3.4987793639613926e-06, + "loss": 1.9065, + "step": 7870 + }, + { + "epoch": 2.6809560261971592, + "grad_norm": 2.171875, + "learning_rate": 3.482538308100727e-06, + "loss": 1.8604, + "step": 7880 + }, + { + "epoch": 2.6843582546568, + "grad_norm": 2.328125, + "learning_rate": 3.4663218487930547e-06, + "loss": 1.8554, + "step": 7890 + }, + { + "epoch": 2.687760483116441, + "grad_norm": 2.4375, + "learning_rate": 3.4501301091268043e-06, + "loss": 1.936, + "step": 7900 + }, + { + "epoch": 2.6911627115760823, + "grad_norm": 2.328125, + "learning_rate": 3.433963212002789e-06, + "loss": 1.8966, + "step": 7910 + }, + { + "epoch": 2.6945649400357237, + "grad_norm": 2.15625, + "learning_rate": 3.41782128013325e-06, + "loss": 1.9634, + "step": 7920 + }, + { + "epoch": 2.6979671684953646, + "grad_norm": 2.546875, + "learning_rate": 3.4017044360409375e-06, + "loss": 1.922, + "step": 7930 + }, + { + "epoch": 2.7013693969550054, + "grad_norm": 2.4375, + "learning_rate": 3.3856128020581783e-06, + "loss": 1.9411, + "step": 7940 + }, + { + "epoch": 2.7047716254146463, + "grad_norm": 2.265625, + "learning_rate": 3.3695465003259376e-06, + "loss": 1.8679, + "step": 7950 + }, + { + "epoch": 2.7081738538742877, + "grad_norm": 1.953125, + "learning_rate": 3.353505652792909e-06, + "loss": 1.906, + "step": 7960 + }, + { + "epoch": 2.711576082333929, + "grad_norm": 2.421875, + "learning_rate": 3.3374903812145784e-06, + "loss": 1.8951, + "step": 7970 + }, + { + "epoch": 2.71497831079357, + "grad_norm": 2.546875, + "learning_rate": 3.3215008071522965e-06, + "loss": 1.9556, + "step": 7980 + }, + { + "epoch": 2.7183805392532108, + "grad_norm": 2.21875, + "learning_rate": 3.3055370519723652e-06, + "loss": 1.9427, + "step": 7990 + }, + { + "epoch": 2.7217827677128517, + "grad_norm": 2.71875, + "learning_rate": 3.289599236845113e-06, + "loss": 1.9533, + "step": 8000 + }, + { + "epoch": 2.725184996172493, + "grad_norm": 2.609375, + "learning_rate": 3.273687482743974e-06, + "loss": 1.9608, + "step": 8010 + }, + { + "epoch": 2.7285872246321343, + "grad_norm": 1.9609375, + "learning_rate": 3.2578019104445702e-06, + "loss": 1.9894, + "step": 8020 + }, + { + "epoch": 2.731989453091775, + "grad_norm": 2.46875, + "learning_rate": 3.241942640523791e-06, + "loss": 1.864, + "step": 8030 + }, + { + "epoch": 2.735391681551416, + "grad_norm": 2.40625, + "learning_rate": 3.2261097933588893e-06, + "loss": 1.9567, + "step": 8040 + }, + { + "epoch": 2.738793910011057, + "grad_norm": 2.65625, + "learning_rate": 3.210303489126551e-06, + "loss": 1.9093, + "step": 8050 + }, + { + "epoch": 2.7421961384706983, + "grad_norm": 2.4375, + "learning_rate": 3.1945238478020003e-06, + "loss": 1.9673, + "step": 8060 + }, + { + "epoch": 2.745598366930339, + "grad_norm": 2.265625, + "learning_rate": 3.1787709891580763e-06, + "loss": 1.9712, + "step": 8070 + }, + { + "epoch": 2.7490005953899805, + "grad_norm": 2.265625, + "learning_rate": 3.1630450327643315e-06, + "loss": 1.9127, + "step": 8080 + }, + { + "epoch": 2.7524028238496214, + "grad_norm": 2.234375, + "learning_rate": 3.147346097986121e-06, + "loss": 1.9763, + "step": 8090 + }, + { + "epoch": 2.7558050523092623, + "grad_norm": 1.9453125, + "learning_rate": 3.1316743039836908e-06, + "loss": 1.8313, + "step": 8100 + }, + { + "epoch": 2.7592072807689036, + "grad_norm": 2.0625, + "learning_rate": 3.1160297697112855e-06, + "loss": 1.9062, + "step": 8110 + }, + { + "epoch": 2.7626095092285445, + "grad_norm": 2.25, + "learning_rate": 3.10041261391624e-06, + "loss": 1.9072, + "step": 8120 + }, + { + "epoch": 2.766011737688186, + "grad_norm": 2.546875, + "learning_rate": 3.0848229551380702e-06, + "loss": 1.932, + "step": 8130 + }, + { + "epoch": 2.7694139661478268, + "grad_norm": 2.375, + "learning_rate": 3.069260911707586e-06, + "loss": 1.9311, + "step": 8140 + }, + { + "epoch": 2.7728161946074676, + "grad_norm": 2.6875, + "learning_rate": 3.0537266017459856e-06, + "loss": 1.9067, + "step": 8150 + }, + { + "epoch": 2.776218423067109, + "grad_norm": 2.203125, + "learning_rate": 3.0382201431639656e-06, + "loss": 1.978, + "step": 8160 + }, + { + "epoch": 2.77962065152675, + "grad_norm": 2.375, + "learning_rate": 3.0227416536608095e-06, + "loss": 1.9084, + "step": 8170 + }, + { + "epoch": 2.783022879986391, + "grad_norm": 2.203125, + "learning_rate": 3.0072912507235167e-06, + "loss": 1.8865, + "step": 8180 + }, + { + "epoch": 2.786425108446032, + "grad_norm": 2.015625, + "learning_rate": 2.991869051625898e-06, + "loss": 1.9293, + "step": 8190 + }, + { + "epoch": 2.789827336905673, + "grad_norm": 2.59375, + "learning_rate": 2.9764751734276803e-06, + "loss": 1.9127, + "step": 8200 + }, + { + "epoch": 2.7932295653653143, + "grad_norm": 2.453125, + "learning_rate": 2.9611097329736394e-06, + "loss": 1.9198, + "step": 8210 + }, + { + "epoch": 2.796631793824955, + "grad_norm": 2.3125, + "learning_rate": 2.9457728468926836e-06, + "loss": 1.9261, + "step": 8220 + }, + { + "epoch": 2.8000340222845965, + "grad_norm": 2.59375, + "learning_rate": 2.930464631596993e-06, + "loss": 1.9068, + "step": 8230 + }, + { + "epoch": 2.8034362507442374, + "grad_norm": 2.40625, + "learning_rate": 2.915185203281126e-06, + "loss": 1.947, + "step": 8240 + }, + { + "epoch": 2.8068384792038783, + "grad_norm": 2.34375, + "learning_rate": 2.899934677921133e-06, + "loss": 1.9014, + "step": 8250 + }, + { + "epoch": 2.8102407076635196, + "grad_norm": 2.25, + "learning_rate": 2.884713171273686e-06, + "loss": 1.9012, + "step": 8260 + }, + { + "epoch": 2.8136429361231605, + "grad_norm": 2.3125, + "learning_rate": 2.869520798875194e-06, + "loss": 1.9299, + "step": 8270 + }, + { + "epoch": 2.817045164582802, + "grad_norm": 2.046875, + "learning_rate": 2.8543576760409264e-06, + "loss": 1.9472, + "step": 8280 + }, + { + "epoch": 2.8204473930424427, + "grad_norm": 2.140625, + "learning_rate": 2.839223917864142e-06, + "loss": 1.9323, + "step": 8290 + }, + { + "epoch": 2.8238496215020836, + "grad_norm": 2.203125, + "learning_rate": 2.824119639215203e-06, + "loss": 1.9394, + "step": 8300 + }, + { + "epoch": 2.827251849961725, + "grad_norm": 2.515625, + "learning_rate": 2.809044954740723e-06, + "loss": 1.9369, + "step": 8310 + }, + { + "epoch": 2.830654078421366, + "grad_norm": 2.46875, + "learning_rate": 2.7939999788626755e-06, + "loss": 1.9025, + "step": 8320 + }, + { + "epoch": 2.834056306881007, + "grad_norm": 2.390625, + "learning_rate": 2.778984825777543e-06, + "loss": 1.908, + "step": 8330 + }, + { + "epoch": 2.837458535340648, + "grad_norm": 2.5, + "learning_rate": 2.763999609455441e-06, + "loss": 1.9814, + "step": 8340 + }, + { + "epoch": 2.840860763800289, + "grad_norm": 2.421875, + "learning_rate": 2.7490444436392535e-06, + "loss": 1.9804, + "step": 8350 + }, + { + "epoch": 2.8442629922599303, + "grad_norm": 2.359375, + "learning_rate": 2.7341194418437747e-06, + "loss": 1.9187, + "step": 8360 + }, + { + "epoch": 2.847665220719571, + "grad_norm": 2.25, + "learning_rate": 2.7192247173548356e-06, + "loss": 1.8885, + "step": 8370 + }, + { + "epoch": 2.8510674491792125, + "grad_norm": 2.515625, + "learning_rate": 2.7043603832284616e-06, + "loss": 1.9056, + "step": 8380 + }, + { + "epoch": 2.8544696776388534, + "grad_norm": 2.5625, + "learning_rate": 2.689526552289997e-06, + "loss": 1.9068, + "step": 8390 + }, + { + "epoch": 2.8578719060984943, + "grad_norm": 1.9375, + "learning_rate": 2.6747233371332606e-06, + "loss": 2.0559, + "step": 8400 + }, + { + "epoch": 2.8612741345581356, + "grad_norm": 2.140625, + "learning_rate": 2.6599508501196876e-06, + "loss": 1.9102, + "step": 8410 + }, + { + "epoch": 2.8646763630177765, + "grad_norm": 2.3125, + "learning_rate": 2.6452092033774744e-06, + "loss": 1.878, + "step": 8420 + }, + { + "epoch": 2.868078591477418, + "grad_norm": 2.21875, + "learning_rate": 2.630498508800734e-06, + "loss": 1.9412, + "step": 8430 + }, + { + "epoch": 2.8714808199370587, + "grad_norm": 2.59375, + "learning_rate": 2.6158188780486312e-06, + "loss": 1.8957, + "step": 8440 + }, + { + "epoch": 2.8748830483966996, + "grad_norm": 2.65625, + "learning_rate": 2.6011704225445548e-06, + "loss": 1.8656, + "step": 8450 + }, + { + "epoch": 2.878285276856341, + "grad_norm": 2.5, + "learning_rate": 2.586553253475264e-06, + "loss": 1.9598, + "step": 8460 + }, + { + "epoch": 2.881687505315982, + "grad_norm": 2.25, + "learning_rate": 2.5719674817900346e-06, + "loss": 1.957, + "step": 8470 + }, + { + "epoch": 2.885089733775623, + "grad_norm": 2.296875, + "learning_rate": 2.5574132181998334e-06, + "loss": 1.9725, + "step": 8480 + }, + { + "epoch": 2.888491962235264, + "grad_norm": 1.9765625, + "learning_rate": 2.5428905731764664e-06, + "loss": 1.9228, + "step": 8490 + }, + { + "epoch": 2.891894190694905, + "grad_norm": 2.40625, + "learning_rate": 2.5283996569517464e-06, + "loss": 1.938, + "step": 8500 + }, + { + "epoch": 2.8952964191545463, + "grad_norm": 2.21875, + "learning_rate": 2.5139405795166538e-06, + "loss": 1.9243, + "step": 8510 + }, + { + "epoch": 2.898698647614187, + "grad_norm": 2.3125, + "learning_rate": 2.4995134506204964e-06, + "loss": 1.9328, + "step": 8520 + }, + { + "epoch": 2.9021008760738285, + "grad_norm": 2.15625, + "learning_rate": 2.48511837977009e-06, + "loss": 1.9199, + "step": 8530 + }, + { + "epoch": 2.9055031045334694, + "grad_norm": 2.625, + "learning_rate": 2.4707554762289077e-06, + "loss": 1.9613, + "step": 8540 + }, + { + "epoch": 2.9089053329931103, + "grad_norm": 2.046875, + "learning_rate": 2.4564248490162763e-06, + "loss": 1.9547, + "step": 8550 + }, + { + "epoch": 2.9123075614527516, + "grad_norm": 2.328125, + "learning_rate": 2.442126606906526e-06, + "loss": 2.0251, + "step": 8560 + }, + { + "epoch": 2.9157097899123925, + "grad_norm": 2.40625, + "learning_rate": 2.4278608584281694e-06, + "loss": 1.9231, + "step": 8570 + }, + { + "epoch": 2.919112018372034, + "grad_norm": 2.625, + "learning_rate": 2.413627711863091e-06, + "loss": 1.9295, + "step": 8580 + }, + { + "epoch": 2.9225142468316747, + "grad_norm": 2.5, + "learning_rate": 2.399427275245705e-06, + "loss": 1.9444, + "step": 8590 + }, + { + "epoch": 2.9259164752913156, + "grad_norm": 2.328125, + "learning_rate": 2.3852596563621536e-06, + "loss": 1.9794, + "step": 8600 + }, + { + "epoch": 2.929318703750957, + "grad_norm": 2.1875, + "learning_rate": 2.3711249627494803e-06, + "loss": 1.9096, + "step": 8610 + }, + { + "epoch": 2.932720932210598, + "grad_norm": 2.578125, + "learning_rate": 2.3570233016948133e-06, + "loss": 1.9062, + "step": 8620 + }, + { + "epoch": 2.936123160670239, + "grad_norm": 2.34375, + "learning_rate": 2.3429547802345537e-06, + "loss": 1.8779, + "step": 8630 + }, + { + "epoch": 2.93952538912988, + "grad_norm": 2.265625, + "learning_rate": 2.3289195051535584e-06, + "loss": 1.8901, + "step": 8640 + }, + { + "epoch": 2.942927617589521, + "grad_norm": 2.203125, + "learning_rate": 2.3149175829843367e-06, + "loss": 1.9073, + "step": 8650 + }, + { + "epoch": 2.9463298460491623, + "grad_norm": 2.46875, + "learning_rate": 2.3009491200062343e-06, + "loss": 1.9434, + "step": 8660 + }, + { + "epoch": 2.949732074508803, + "grad_norm": 2.1875, + "learning_rate": 2.287014222244634e-06, + "loss": 1.88, + "step": 8670 + }, + { + "epoch": 2.9531343029684445, + "grad_norm": 2.109375, + "learning_rate": 2.273112995470147e-06, + "loss": 1.968, + "step": 8680 + }, + { + "epoch": 2.9565365314280854, + "grad_norm": 2.03125, + "learning_rate": 2.259245545197807e-06, + "loss": 1.9048, + "step": 8690 + }, + { + "epoch": 2.9599387598877263, + "grad_norm": 2.46875, + "learning_rate": 2.245411976686278e-06, + "loss": 1.9502, + "step": 8700 + }, + { + "epoch": 2.9633409883473676, + "grad_norm": 2.546875, + "learning_rate": 2.231612394937042e-06, + "loss": 1.87, + "step": 8710 + }, + { + "epoch": 2.9667432168070085, + "grad_norm": 2.234375, + "learning_rate": 2.217846904693616e-06, + "loss": 1.9337, + "step": 8720 + }, + { + "epoch": 2.97014544526665, + "grad_norm": 2.609375, + "learning_rate": 2.2041156104407518e-06, + "loss": 1.9095, + "step": 8730 + }, + { + "epoch": 2.9735476737262907, + "grad_norm": 2.4375, + "learning_rate": 2.1904186164036358e-06, + "loss": 1.9346, + "step": 8740 + }, + { + "epoch": 2.9769499021859316, + "grad_norm": 2.09375, + "learning_rate": 2.1767560265471087e-06, + "loss": 1.9296, + "step": 8750 + }, + { + "epoch": 2.980352130645573, + "grad_norm": 2.484375, + "learning_rate": 2.163127944574872e-06, + "loss": 1.9386, + "step": 8760 + }, + { + "epoch": 2.983754359105214, + "grad_norm": 2.40625, + "learning_rate": 2.149534473928699e-06, + "loss": 1.9189, + "step": 8770 + }, + { + "epoch": 2.987156587564855, + "grad_norm": 2.46875, + "learning_rate": 2.135975717787654e-06, + "loss": 1.8996, + "step": 8780 + }, + { + "epoch": 2.990558816024496, + "grad_norm": 2.1875, + "learning_rate": 2.1224517790673003e-06, + "loss": 1.937, + "step": 8790 + }, + { + "epoch": 2.993961044484137, + "grad_norm": 2.234375, + "learning_rate": 2.108962760418933e-06, + "loss": 1.9724, + "step": 8800 + }, + { + "epoch": 2.9973632729437782, + "grad_norm": 2.5, + "learning_rate": 2.0955087642287833e-06, + "loss": 1.9497, + "step": 8810 + }, + { + "epoch": 3.000765501403419, + "grad_norm": 2.5, + "learning_rate": 2.0820898926172546e-06, + "loss": 1.9683, + "step": 8820 + }, + { + "epoch": 3.0041677298630605, + "grad_norm": 2.375, + "learning_rate": 2.0687062474381516e-06, + "loss": 1.9146, + "step": 8830 + }, + { + "epoch": 3.0075699583227014, + "grad_norm": 2.515625, + "learning_rate": 2.05535793027788e-06, + "loss": 1.9749, + "step": 8840 + }, + { + "epoch": 3.0109721867823422, + "grad_norm": 2.46875, + "learning_rate": 2.042045042454711e-06, + "loss": 1.9554, + "step": 8850 + }, + { + "epoch": 3.0143744152419836, + "grad_norm": 2.53125, + "learning_rate": 2.028767685017981e-06, + "loss": 1.8963, + "step": 8860 + }, + { + "epoch": 3.0177766437016245, + "grad_norm": 2.671875, + "learning_rate": 2.015525958747352e-06, + "loss": 1.938, + "step": 8870 + }, + { + "epoch": 3.021178872161266, + "grad_norm": 2.625, + "learning_rate": 2.0023199641520177e-06, + "loss": 1.9223, + "step": 8880 + }, + { + "epoch": 3.0245811006209067, + "grad_norm": 2.625, + "learning_rate": 1.989149801469974e-06, + "loss": 1.8825, + "step": 8890 + }, + { + "epoch": 3.0279833290805476, + "grad_norm": 2.703125, + "learning_rate": 1.97601557066723e-06, + "loss": 1.9489, + "step": 8900 + }, + { + "epoch": 3.031385557540189, + "grad_norm": 2.109375, + "learning_rate": 1.9629173714370583e-06, + "loss": 1.9236, + "step": 8910 + }, + { + "epoch": 3.03478778599983, + "grad_norm": 2.078125, + "learning_rate": 1.949855303199246e-06, + "loss": 1.9561, + "step": 8920 + }, + { + "epoch": 3.038190014459471, + "grad_norm": 2.484375, + "learning_rate": 1.9368294650993263e-06, + "loss": 1.8969, + "step": 8930 + }, + { + "epoch": 3.041592242919112, + "grad_norm": 2.125, + "learning_rate": 1.92383995600784e-06, + "loss": 1.9331, + "step": 8940 + }, + { + "epoch": 3.044994471378753, + "grad_norm": 2.40625, + "learning_rate": 1.910886874519575e-06, + "loss": 1.9734, + "step": 8950 + }, + { + "epoch": 3.0483966998383942, + "grad_norm": 2.0625, + "learning_rate": 1.8979703189528225e-06, + "loss": 1.918, + "step": 8960 + }, + { + "epoch": 3.051798928298035, + "grad_norm": 2.40625, + "learning_rate": 1.885090387348631e-06, + "loss": 1.9162, + "step": 8970 + }, + { + "epoch": 3.0552011567576765, + "grad_norm": 2.421875, + "learning_rate": 1.8722471774700541e-06, + "loss": 1.9047, + "step": 8980 + }, + { + "epoch": 3.0586033852173173, + "grad_norm": 2.40625, + "learning_rate": 1.8594407868014222e-06, + "loss": 1.9391, + "step": 8990 + }, + { + "epoch": 3.0620056136769582, + "grad_norm": 2.53125, + "learning_rate": 1.8466713125475953e-06, + "loss": 1.9597, + "step": 9000 + }, + { + "epoch": 3.0654078421365996, + "grad_norm": 2.125, + "learning_rate": 1.8339388516332183e-06, + "loss": 1.9123, + "step": 9010 + }, + { + "epoch": 3.0688100705962404, + "grad_norm": 2.265625, + "learning_rate": 1.8212435007019987e-06, + "loss": 1.9063, + "step": 9020 + }, + { + "epoch": 3.072212299055882, + "grad_norm": 2.0625, + "learning_rate": 1.8085853561159651e-06, + "loss": 1.8604, + "step": 9030 + }, + { + "epoch": 3.0756145275155227, + "grad_norm": 2.203125, + "learning_rate": 1.7959645139547367e-06, + "loss": 1.9165, + "step": 9040 + }, + { + "epoch": 3.0790167559751636, + "grad_norm": 2.8125, + "learning_rate": 1.7833810700147973e-06, + "loss": 1.9096, + "step": 9050 + }, + { + "epoch": 3.082418984434805, + "grad_norm": 2.203125, + "learning_rate": 1.770835119808758e-06, + "loss": 1.9433, + "step": 9060 + }, + { + "epoch": 3.0858212128944458, + "grad_norm": 2.46875, + "learning_rate": 1.7583267585646496e-06, + "loss": 1.972, + "step": 9070 + }, + { + "epoch": 3.089223441354087, + "grad_norm": 2.40625, + "learning_rate": 1.7458560812251807e-06, + "loss": 1.9191, + "step": 9080 + }, + { + "epoch": 3.092625669813728, + "grad_norm": 2.046875, + "learning_rate": 1.7334231824470327e-06, + "loss": 1.882, + "step": 9090 + }, + { + "epoch": 3.096027898273369, + "grad_norm": 2.40625, + "learning_rate": 1.7210281566001321e-06, + "loss": 1.9086, + "step": 9100 + }, + { + "epoch": 3.09943012673301, + "grad_norm": 2.09375, + "learning_rate": 1.7086710977669391e-06, + "loss": 1.9225, + "step": 9110 + }, + { + "epoch": 3.102832355192651, + "grad_norm": 2.515625, + "learning_rate": 1.6963520997417304e-06, + "loss": 1.9364, + "step": 9120 + }, + { + "epoch": 3.1062345836522924, + "grad_norm": 2.40625, + "learning_rate": 1.684071256029885e-06, + "loss": 1.962, + "step": 9130 + }, + { + "epoch": 3.1096368121119333, + "grad_norm": 2.25, + "learning_rate": 1.6718286598471834e-06, + "loss": 1.9557, + "step": 9140 + }, + { + "epoch": 3.113039040571574, + "grad_norm": 2.234375, + "learning_rate": 1.6596244041190884e-06, + "loss": 1.963, + "step": 9150 + }, + { + "epoch": 3.1164412690312155, + "grad_norm": 2.453125, + "learning_rate": 1.6474585814800486e-06, + "loss": 1.8665, + "step": 9160 + }, + { + "epoch": 3.1198434974908564, + "grad_norm": 2.234375, + "learning_rate": 1.6353312842727971e-06, + "loss": 1.9364, + "step": 9170 + }, + { + "epoch": 3.1232457259504978, + "grad_norm": 1.9921875, + "learning_rate": 1.6232426045476368e-06, + "loss": 1.9379, + "step": 9180 + }, + { + "epoch": 3.1266479544101387, + "grad_norm": 2.484375, + "learning_rate": 1.6111926340617594e-06, + "loss": 1.8696, + "step": 9190 + }, + { + "epoch": 3.1300501828697795, + "grad_norm": 2.546875, + "learning_rate": 1.599181464278531e-06, + "loss": 1.9511, + "step": 9200 + }, + { + "epoch": 3.133452411329421, + "grad_norm": 2.125, + "learning_rate": 1.587209186366815e-06, + "loss": 1.9289, + "step": 9210 + }, + { + "epoch": 3.1368546397890618, + "grad_norm": 2.296875, + "learning_rate": 1.5752758912002694e-06, + "loss": 1.8937, + "step": 9220 + }, + { + "epoch": 3.140256868248703, + "grad_norm": 2.265625, + "learning_rate": 1.5633816693566608e-06, + "loss": 1.8763, + "step": 9230 + }, + { + "epoch": 3.143659096708344, + "grad_norm": 2.3125, + "learning_rate": 1.5515266111171768e-06, + "loss": 1.9913, + "step": 9240 + }, + { + "epoch": 3.147061325167985, + "grad_norm": 2.5, + "learning_rate": 1.5397108064657348e-06, + "loss": 1.8861, + "step": 9250 + }, + { + "epoch": 3.150463553627626, + "grad_norm": 2.109375, + "learning_rate": 1.5279343450883104e-06, + "loss": 1.9029, + "step": 9260 + }, + { + "epoch": 3.153865782087267, + "grad_norm": 2.4375, + "learning_rate": 1.5161973163722477e-06, + "loss": 1.9382, + "step": 9270 + }, + { + "epoch": 3.1572680105469084, + "grad_norm": 2.421875, + "learning_rate": 1.5044998094055818e-06, + "loss": 1.8859, + "step": 9280 + }, + { + "epoch": 3.1606702390065493, + "grad_norm": 2.375, + "learning_rate": 1.4928419129763672e-06, + "loss": 1.8785, + "step": 9290 + }, + { + "epoch": 3.16407246746619, + "grad_norm": 2.6875, + "learning_rate": 1.4812237155720006e-06, + "loss": 1.8864, + "step": 9300 + }, + { + "epoch": 3.1674746959258315, + "grad_norm": 2.53125, + "learning_rate": 1.4696453053785496e-06, + "loss": 1.8698, + "step": 9310 + }, + { + "epoch": 3.1708769243854724, + "grad_norm": 2.296875, + "learning_rate": 1.4581067702800793e-06, + "loss": 1.9852, + "step": 9320 + }, + { + "epoch": 3.1742791528451137, + "grad_norm": 2.3125, + "learning_rate": 1.4466081978579942e-06, + "loss": 1.98, + "step": 9330 + }, + { + "epoch": 3.1776813813047546, + "grad_norm": 2.34375, + "learning_rate": 1.4351496753903699e-06, + "loss": 1.925, + "step": 9340 + }, + { + "epoch": 3.1810836097643955, + "grad_norm": 2.5, + "learning_rate": 1.4237312898512816e-06, + "loss": 1.9355, + "step": 9350 + }, + { + "epoch": 3.184485838224037, + "grad_norm": 2.703125, + "learning_rate": 1.4123531279101576e-06, + "loss": 1.9966, + "step": 9360 + }, + { + "epoch": 3.1878880666836777, + "grad_norm": 2.578125, + "learning_rate": 1.4010152759311148e-06, + "loss": 1.8377, + "step": 9370 + }, + { + "epoch": 3.191290295143319, + "grad_norm": 2.296875, + "learning_rate": 1.3897178199723027e-06, + "loss": 1.9501, + "step": 9380 + }, + { + "epoch": 3.19469252360296, + "grad_norm": 2.390625, + "learning_rate": 1.3784608457852537e-06, + "loss": 1.9103, + "step": 9390 + }, + { + "epoch": 3.198094752062601, + "grad_norm": 2.578125, + "learning_rate": 1.3672444388142238e-06, + "loss": 1.9575, + "step": 9400 + }, + { + "epoch": 3.201496980522242, + "grad_norm": 2.328125, + "learning_rate": 1.3560686841955576e-06, + "loss": 1.929, + "step": 9410 + }, + { + "epoch": 3.204899208981883, + "grad_norm": 2.375, + "learning_rate": 1.3449336667570272e-06, + "loss": 1.9606, + "step": 9420 + }, + { + "epoch": 3.2083014374415244, + "grad_norm": 2.3125, + "learning_rate": 1.3338394710172017e-06, + "loss": 1.9379, + "step": 9430 + }, + { + "epoch": 3.2117036659011653, + "grad_norm": 2.640625, + "learning_rate": 1.3227861811847961e-06, + "loss": 1.8995, + "step": 9440 + }, + { + "epoch": 3.215105894360806, + "grad_norm": 2.203125, + "learning_rate": 1.3117738811580378e-06, + "loss": 1.9038, + "step": 9450 + }, + { + "epoch": 3.2185081228204475, + "grad_norm": 2.234375, + "learning_rate": 1.3008026545240273e-06, + "loss": 1.9499, + "step": 9460 + }, + { + "epoch": 3.2219103512800884, + "grad_norm": 2.234375, + "learning_rate": 1.2898725845581015e-06, + "loss": 1.9625, + "step": 9470 + }, + { + "epoch": 3.2253125797397297, + "grad_norm": 2.234375, + "learning_rate": 1.2789837542232062e-06, + "loss": 2.0014, + "step": 9480 + }, + { + "epoch": 3.2287148081993706, + "grad_norm": 2.375, + "learning_rate": 1.2681362461692674e-06, + "loss": 1.9227, + "step": 9490 + }, + { + "epoch": 3.2321170366590115, + "grad_norm": 1.90625, + "learning_rate": 1.2573301427325523e-06, + "loss": 1.9411, + "step": 9500 + }, + { + "epoch": 3.235519265118653, + "grad_norm": 1.9375, + "learning_rate": 1.246565525935065e-06, + "loss": 1.8898, + "step": 9510 + }, + { + "epoch": 3.2389214935782937, + "grad_norm": 2.25, + "learning_rate": 1.2358424774839005e-06, + "loss": 1.8962, + "step": 9520 + }, + { + "epoch": 3.242323722037935, + "grad_norm": 2.5, + "learning_rate": 1.2251610787706435e-06, + "loss": 1.9404, + "step": 9530 + }, + { + "epoch": 3.245725950497576, + "grad_norm": 2.265625, + "learning_rate": 1.2145214108707407e-06, + "loss": 1.8978, + "step": 9540 + }, + { + "epoch": 3.249128178957217, + "grad_norm": 2.140625, + "learning_rate": 1.2039235545428843e-06, + "loss": 1.9312, + "step": 9550 + }, + { + "epoch": 3.252530407416858, + "grad_norm": 2.140625, + "learning_rate": 1.1933675902284088e-06, + "loss": 1.8721, + "step": 9560 + }, + { + "epoch": 3.255932635876499, + "grad_norm": 2.171875, + "learning_rate": 1.182853598050669e-06, + "loss": 1.9304, + "step": 9570 + }, + { + "epoch": 3.2593348643361404, + "grad_norm": 2.34375, + "learning_rate": 1.1723816578144417e-06, + "loss": 1.8912, + "step": 9580 + }, + { + "epoch": 3.2627370927957813, + "grad_norm": 2.375, + "learning_rate": 1.1619518490053083e-06, + "loss": 1.8852, + "step": 9590 + }, + { + "epoch": 3.266139321255422, + "grad_norm": 2.359375, + "learning_rate": 1.1515642507890646e-06, + "loss": 1.9256, + "step": 9600 + }, + { + "epoch": 3.2695415497150635, + "grad_norm": 2.375, + "learning_rate": 1.141218942011112e-06, + "loss": 1.8988, + "step": 9610 + }, + { + "epoch": 3.2729437781747044, + "grad_norm": 2.4375, + "learning_rate": 1.1309160011958583e-06, + "loss": 1.9262, + "step": 9620 + }, + { + "epoch": 3.2763460066343457, + "grad_norm": 2.078125, + "learning_rate": 1.1206555065461265e-06, + "loss": 1.9177, + "step": 9630 + }, + { + "epoch": 3.2797482350939866, + "grad_norm": 2.28125, + "learning_rate": 1.1104375359425585e-06, + "loss": 1.9117, + "step": 9640 + }, + { + "epoch": 3.2831504635536275, + "grad_norm": 2.703125, + "learning_rate": 1.100262166943023e-06, + "loss": 1.9711, + "step": 9650 + }, + { + "epoch": 3.286552692013269, + "grad_norm": 2.296875, + "learning_rate": 1.0901294767820318e-06, + "loss": 1.9243, + "step": 9660 + }, + { + "epoch": 3.2899549204729097, + "grad_norm": 2.4375, + "learning_rate": 1.0800395423701436e-06, + "loss": 1.9023, + "step": 9670 + }, + { + "epoch": 3.293357148932551, + "grad_norm": 2.140625, + "learning_rate": 1.0699924402933917e-06, + "loss": 1.938, + "step": 9680 + }, + { + "epoch": 3.296759377392192, + "grad_norm": 2.359375, + "learning_rate": 1.0599882468126933e-06, + "loss": 1.9328, + "step": 9690 + }, + { + "epoch": 3.300161605851833, + "grad_norm": 2.109375, + "learning_rate": 1.0500270378632782e-06, + "loss": 1.9429, + "step": 9700 + }, + { + "epoch": 3.303563834311474, + "grad_norm": 2.171875, + "learning_rate": 1.0401088890541082e-06, + "loss": 1.9068, + "step": 9710 + }, + { + "epoch": 3.306966062771115, + "grad_norm": 2.28125, + "learning_rate": 1.0302338756673032e-06, + "loss": 1.9121, + "step": 9720 + }, + { + "epoch": 3.3103682912307564, + "grad_norm": 2.28125, + "learning_rate": 1.0204020726575725e-06, + "loss": 1.9197, + "step": 9730 + }, + { + "epoch": 3.3137705196903973, + "grad_norm": 2.09375, + "learning_rate": 1.0106135546516385e-06, + "loss": 1.9347, + "step": 9740 + }, + { + "epoch": 3.317172748150038, + "grad_norm": 1.9375, + "learning_rate": 1.0008683959476827e-06, + "loss": 1.929, + "step": 9750 + }, + { + "epoch": 3.3205749766096795, + "grad_norm": 2.203125, + "learning_rate": 9.911666705147721e-07, + "loss": 1.8878, + "step": 9760 + }, + { + "epoch": 3.3239772050693204, + "grad_norm": 2.359375, + "learning_rate": 9.815084519922975e-07, + "loss": 1.8525, + "step": 9770 + }, + { + "epoch": 3.3273794335289617, + "grad_norm": 2.03125, + "learning_rate": 9.718938136894211e-07, + "loss": 1.8368, + "step": 9780 + }, + { + "epoch": 3.3307816619886026, + "grad_norm": 2.0, + "learning_rate": 9.623228285845155e-07, + "loss": 1.8964, + "step": 9790 + }, + { + "epoch": 3.3341838904482435, + "grad_norm": 2.796875, + "learning_rate": 9.527955693246117e-07, + "loss": 1.9062, + "step": 9800 + }, + { + "epoch": 3.337586118907885, + "grad_norm": 2.125, + "learning_rate": 9.433121082248422e-07, + "loss": 1.87, + "step": 9810 + }, + { + "epoch": 3.3409883473675257, + "grad_norm": 2.5, + "learning_rate": 9.33872517267902e-07, + "loss": 1.9351, + "step": 9820 + }, + { + "epoch": 3.344390575827167, + "grad_norm": 2.21875, + "learning_rate": 9.244768681034954e-07, + "loss": 1.9826, + "step": 9830 + }, + { + "epoch": 3.347792804286808, + "grad_norm": 2.5625, + "learning_rate": 9.151252320477888e-07, + "loss": 1.9788, + "step": 9840 + }, + { + "epoch": 3.351195032746449, + "grad_norm": 1.9765625, + "learning_rate": 9.058176800828842e-07, + "loss": 1.9306, + "step": 9850 + }, + { + "epoch": 3.35459726120609, + "grad_norm": 2.375, + "learning_rate": 8.965542828562589e-07, + "loss": 1.9304, + "step": 9860 + }, + { + "epoch": 3.357999489665731, + "grad_norm": 2.546875, + "learning_rate": 8.873351106802486e-07, + "loss": 1.9565, + "step": 9870 + }, + { + "epoch": 3.3614017181253724, + "grad_norm": 2.28125, + "learning_rate": 8.781602335315041e-07, + "loss": 1.9325, + "step": 9880 + }, + { + "epoch": 3.3648039465850133, + "grad_norm": 2.25, + "learning_rate": 8.690297210504589e-07, + "loss": 1.9074, + "step": 9890 + }, + { + "epoch": 3.368206175044654, + "grad_norm": 2.65625, + "learning_rate": 8.599436425408064e-07, + "loss": 1.9338, + "step": 9900 + }, + { + "epoch": 3.3716084035042955, + "grad_norm": 2.625, + "learning_rate": 8.509020669689717e-07, + "loss": 1.9236, + "step": 9910 + }, + { + "epoch": 3.3750106319639364, + "grad_norm": 2.5625, + "learning_rate": 8.419050629635849e-07, + "loss": 1.9387, + "step": 9920 + }, + { + "epoch": 3.3784128604235777, + "grad_norm": 2.4375, + "learning_rate": 8.329526988149661e-07, + "loss": 1.9503, + "step": 9930 + }, + { + "epoch": 3.3818150888832186, + "grad_norm": 2.1875, + "learning_rate": 8.240450424745993e-07, + "loss": 1.9232, + "step": 9940 + }, + { + "epoch": 3.3852173173428595, + "grad_norm": 2.546875, + "learning_rate": 8.151821615546263e-07, + "loss": 1.9435, + "step": 9950 + }, + { + "epoch": 3.388619545802501, + "grad_norm": 2.203125, + "learning_rate": 8.063641233273221e-07, + "loss": 1.9005, + "step": 9960 + }, + { + "epoch": 3.3920217742621417, + "grad_norm": 2.609375, + "learning_rate": 7.975909947245956e-07, + "loss": 1.864, + "step": 9970 + }, + { + "epoch": 3.3954240027217826, + "grad_norm": 2.15625, + "learning_rate": 7.888628423374738e-07, + "loss": 1.9707, + "step": 9980 + }, + { + "epoch": 3.398826231181424, + "grad_norm": 2.53125, + "learning_rate": 7.801797324156009e-07, + "loss": 1.9314, + "step": 9990 + }, + { + "epoch": 3.402228459641065, + "grad_norm": 2.546875, + "learning_rate": 7.715417308667326e-07, + "loss": 1.9229, + "step": 10000 + }, + { + "epoch": 3.405630688100706, + "grad_norm": 2.5625, + "learning_rate": 7.629489032562336e-07, + "loss": 1.86, + "step": 10010 + }, + { + "epoch": 3.409032916560347, + "grad_norm": 2.4375, + "learning_rate": 7.544013148065898e-07, + "loss": 1.9123, + "step": 10020 + }, + { + "epoch": 3.412435145019988, + "grad_norm": 1.8515625, + "learning_rate": 7.45899030396898e-07, + "loss": 1.8735, + "step": 10030 + }, + { + "epoch": 3.4158373734796292, + "grad_norm": 2.375, + "learning_rate": 7.374421145623891e-07, + "loss": 1.9386, + "step": 10040 + }, + { + "epoch": 3.41923960193927, + "grad_norm": 2.5625, + "learning_rate": 7.290306314939283e-07, + "loss": 1.8794, + "step": 10050 + }, + { + "epoch": 3.4226418303989115, + "grad_norm": 2.296875, + "learning_rate": 7.206646450375306e-07, + "loss": 1.9236, + "step": 10060 + }, + { + "epoch": 3.4260440588585523, + "grad_norm": 2.25, + "learning_rate": 7.123442186938769e-07, + "loss": 1.9224, + "step": 10070 + }, + { + "epoch": 3.4294462873181932, + "grad_norm": 2.28125, + "learning_rate": 7.040694156178301e-07, + "loss": 1.9089, + "step": 10080 + }, + { + "epoch": 3.4328485157778346, + "grad_norm": 2.125, + "learning_rate": 6.958402986179579e-07, + "loss": 1.9395, + "step": 10090 + }, + { + "epoch": 3.4362507442374755, + "grad_norm": 2.703125, + "learning_rate": 6.87656930156057e-07, + "loss": 1.9217, + "step": 10100 + }, + { + "epoch": 3.439652972697117, + "grad_norm": 2.203125, + "learning_rate": 6.795193723466726e-07, + "loss": 1.9458, + "step": 10110 + }, + { + "epoch": 3.4430552011567577, + "grad_norm": 1.828125, + "learning_rate": 6.714276869566347e-07, + "loss": 1.9698, + "step": 10120 + }, + { + "epoch": 3.4464574296163986, + "grad_norm": 2.3125, + "learning_rate": 6.633819354045855e-07, + "loss": 1.9773, + "step": 10130 + }, + { + "epoch": 3.44985965807604, + "grad_norm": 2.34375, + "learning_rate": 6.553821787605149e-07, + "loss": 1.8458, + "step": 10140 + }, + { + "epoch": 3.453261886535681, + "grad_norm": 2.265625, + "learning_rate": 6.474284777452948e-07, + "loss": 1.9633, + "step": 10150 + }, + { + "epoch": 3.456664114995322, + "grad_norm": 2.234375, + "learning_rate": 6.395208927302167e-07, + "loss": 1.9253, + "step": 10160 + }, + { + "epoch": 3.460066343454963, + "grad_norm": 1.984375, + "learning_rate": 6.31659483736541e-07, + "loss": 1.8867, + "step": 10170 + }, + { + "epoch": 3.463468571914604, + "grad_norm": 2.46875, + "learning_rate": 6.238443104350302e-07, + "loss": 1.9415, + "step": 10180 + }, + { + "epoch": 3.466870800374245, + "grad_norm": 2.4375, + "learning_rate": 6.160754321455092e-07, + "loss": 1.8688, + "step": 10190 + }, + { + "epoch": 3.470273028833886, + "grad_norm": 2.359375, + "learning_rate": 6.083529078364046e-07, + "loss": 1.8777, + "step": 10200 + }, + { + "epoch": 3.4736752572935274, + "grad_norm": 2.046875, + "learning_rate": 6.006767961242978e-07, + "loss": 1.8808, + "step": 10210 + }, + { + "epoch": 3.4770774857531683, + "grad_norm": 2.140625, + "learning_rate": 5.930471552734888e-07, + "loss": 1.9203, + "step": 10220 + }, + { + "epoch": 3.480479714212809, + "grad_norm": 2.21875, + "learning_rate": 5.854640431955407e-07, + "loss": 1.9427, + "step": 10230 + }, + { + "epoch": 3.4838819426724505, + "grad_norm": 2.609375, + "learning_rate": 5.779275174488542e-07, + "loss": 1.9229, + "step": 10240 + }, + { + "epoch": 3.4872841711320914, + "grad_norm": 2.328125, + "learning_rate": 5.704376352382198e-07, + "loss": 1.8909, + "step": 10250 + }, + { + "epoch": 3.4906863995917328, + "grad_norm": 2.25, + "learning_rate": 5.629944534143905e-07, + "loss": 1.9481, + "step": 10260 + }, + { + "epoch": 3.4940886280513737, + "grad_norm": 2.390625, + "learning_rate": 5.555980284736454e-07, + "loss": 1.9152, + "step": 10270 + }, + { + "epoch": 3.4974908565110145, + "grad_norm": 2.03125, + "learning_rate": 5.482484165573627e-07, + "loss": 1.9002, + "step": 10280 + }, + { + "epoch": 3.500893084970656, + "grad_norm": 2.34375, + "learning_rate": 5.409456734515961e-07, + "loss": 1.9427, + "step": 10290 + }, + { + "epoch": 3.5042953134302968, + "grad_norm": 2.390625, + "learning_rate": 5.336898545866455e-07, + "loss": 1.9312, + "step": 10300 + }, + { + "epoch": 3.5076975418899377, + "grad_norm": 2.3125, + "learning_rate": 5.264810150366431e-07, + "loss": 1.9146, + "step": 10310 + }, + { + "epoch": 3.511099770349579, + "grad_norm": 2.625, + "learning_rate": 5.193192095191315e-07, + "loss": 1.932, + "step": 10320 + }, + { + "epoch": 3.51450199880922, + "grad_norm": 2.21875, + "learning_rate": 5.122044923946488e-07, + "loss": 1.9544, + "step": 10330 + }, + { + "epoch": 3.517904227268861, + "grad_norm": 2.21875, + "learning_rate": 5.051369176663161e-07, + "loss": 1.9132, + "step": 10340 + }, + { + "epoch": 3.521306455728502, + "grad_norm": 2.09375, + "learning_rate": 4.981165389794265e-07, + "loss": 1.9379, + "step": 10350 + }, + { + "epoch": 3.524708684188143, + "grad_norm": 2.359375, + "learning_rate": 4.911434096210408e-07, + "loss": 1.8495, + "step": 10360 + }, + { + "epoch": 3.5281109126477843, + "grad_norm": 2.53125, + "learning_rate": 4.842175825195817e-07, + "loss": 1.964, + "step": 10370 + }, + { + "epoch": 3.531513141107425, + "grad_norm": 2.09375, + "learning_rate": 4.773391102444278e-07, + "loss": 1.8755, + "step": 10380 + }, + { + "epoch": 3.5349153695670665, + "grad_norm": 2.8125, + "learning_rate": 4.705080450055242e-07, + "loss": 1.902, + "step": 10390 + }, + { + "epoch": 3.5383175980267074, + "grad_norm": 3.03125, + "learning_rate": 4.63724438652977e-07, + "loss": 1.9428, + "step": 10400 + }, + { + "epoch": 3.5417198264863483, + "grad_norm": 2.125, + "learning_rate": 4.5698834267666295e-07, + "loss": 1.8812, + "step": 10410 + }, + { + "epoch": 3.5451220549459896, + "grad_norm": 2.265625, + "learning_rate": 4.502998082058419e-07, + "loss": 1.9378, + "step": 10420 + }, + { + "epoch": 3.5485242834056305, + "grad_norm": 2.546875, + "learning_rate": 4.4365888600876105e-07, + "loss": 1.8586, + "step": 10430 + }, + { + "epoch": 3.551926511865272, + "grad_norm": 2.5, + "learning_rate": 4.3706562649227966e-07, + "loss": 1.9303, + "step": 10440 + }, + { + "epoch": 3.5553287403249128, + "grad_norm": 2.28125, + "learning_rate": 4.305200797014755e-07, + "loss": 1.8785, + "step": 10450 + }, + { + "epoch": 3.5587309687845536, + "grad_norm": 2.296875, + "learning_rate": 4.2402229531927284e-07, + "loss": 1.8698, + "step": 10460 + }, + { + "epoch": 3.562133197244195, + "grad_norm": 2.203125, + "learning_rate": 4.1757232266606775e-07, + "loss": 1.9134, + "step": 10470 + }, + { + "epoch": 3.565535425703836, + "grad_norm": 2.0, + "learning_rate": 4.1117021069934086e-07, + "loss": 1.9092, + "step": 10480 + }, + { + "epoch": 3.568937654163477, + "grad_norm": 2.578125, + "learning_rate": 4.048160080133004e-07, + "loss": 1.8521, + "step": 10490 + }, + { + "epoch": 3.572339882623118, + "grad_norm": 2.046875, + "learning_rate": 3.985097628385017e-07, + "loss": 1.9322, + "step": 10500 + }, + { + "epoch": 3.575742111082759, + "grad_norm": 2.265625, + "learning_rate": 3.9225152304149186e-07, + "loss": 1.95, + "step": 10510 + }, + { + "epoch": 3.5791443395424003, + "grad_norm": 2.40625, + "learning_rate": 3.8604133612443344e-07, + "loss": 1.8966, + "step": 10520 + }, + { + "epoch": 3.582546568002041, + "grad_norm": 2.28125, + "learning_rate": 3.798792492247598e-07, + "loss": 1.8615, + "step": 10530 + }, + { + "epoch": 3.5859487964616825, + "grad_norm": 2.203125, + "learning_rate": 3.737653091148046e-07, + "loss": 1.9687, + "step": 10540 + }, + { + "epoch": 3.5893510249213234, + "grad_norm": 2.109375, + "learning_rate": 3.6769956220144835e-07, + "loss": 1.9133, + "step": 10550 + }, + { + "epoch": 3.5927532533809643, + "grad_norm": 2.203125, + "learning_rate": 3.61682054525775e-07, + "loss": 1.9313, + "step": 10560 + }, + { + "epoch": 3.5961554818406056, + "grad_norm": 2.359375, + "learning_rate": 3.5571283176270955e-07, + "loss": 2.0094, + "step": 10570 + }, + { + "epoch": 3.5995577103002465, + "grad_norm": 2.328125, + "learning_rate": 3.4979193922068417e-07, + "loss": 1.9955, + "step": 10580 + }, + { + "epoch": 3.602959938759888, + "grad_norm": 2.359375, + "learning_rate": 3.439194218412834e-07, + "loss": 1.9294, + "step": 10590 + }, + { + "epoch": 3.6063621672195287, + "grad_norm": 2.390625, + "learning_rate": 3.380953241989119e-07, + "loss": 1.8658, + "step": 10600 + }, + { + "epoch": 3.6097643956791696, + "grad_norm": 2.859375, + "learning_rate": 3.3231969050044987e-07, + "loss": 1.9264, + "step": 10610 + }, + { + "epoch": 3.613166624138811, + "grad_norm": 2.15625, + "learning_rate": 3.2659256458491855e-07, + "loss": 1.9539, + "step": 10620 + }, + { + "epoch": 3.616568852598452, + "grad_norm": 2.609375, + "learning_rate": 3.209139899231508e-07, + "loss": 1.9833, + "step": 10630 + }, + { + "epoch": 3.619971081058093, + "grad_norm": 2.328125, + "learning_rate": 3.1528400961745953e-07, + "loss": 1.9088, + "step": 10640 + }, + { + "epoch": 3.623373309517734, + "grad_norm": 2.359375, + "learning_rate": 3.0970266640130633e-07, + "loss": 1.9261, + "step": 10650 + }, + { + "epoch": 3.626775537977375, + "grad_norm": 2.1875, + "learning_rate": 3.0417000263898494e-07, + "loss": 1.8439, + "step": 10660 + }, + { + "epoch": 3.6301777664370163, + "grad_norm": 2.421875, + "learning_rate": 2.9868606032529224e-07, + "loss": 1.9474, + "step": 10670 + }, + { + "epoch": 3.633579994896657, + "grad_norm": 2.296875, + "learning_rate": 2.932508810852159e-07, + "loss": 1.9432, + "step": 10680 + }, + { + "epoch": 3.6369822233562985, + "grad_norm": 2.84375, + "learning_rate": 2.8786450617361245e-07, + "loss": 1.8769, + "step": 10690 + }, + { + "epoch": 3.6403844518159394, + "grad_norm": 2.40625, + "learning_rate": 2.825269764748977e-07, + "loss": 1.9754, + "step": 10700 + }, + { + "epoch": 3.6437866802755803, + "grad_norm": 2.109375, + "learning_rate": 2.772383325027377e-07, + "loss": 1.9327, + "step": 10710 + }, + { + "epoch": 3.6471889087352216, + "grad_norm": 2.421875, + "learning_rate": 2.719986143997357e-07, + "loss": 1.916, + "step": 10720 + }, + { + "epoch": 3.6505911371948625, + "grad_norm": 2.328125, + "learning_rate": 2.668078619371333e-07, + "loss": 1.8941, + "step": 10730 + }, + { + "epoch": 3.653993365654504, + "grad_norm": 2.4375, + "learning_rate": 2.616661145145063e-07, + "loss": 1.9525, + "step": 10740 + }, + { + "epoch": 3.6573955941141447, + "grad_norm": 2.546875, + "learning_rate": 2.5657341115946487e-07, + "loss": 1.8995, + "step": 10750 + }, + { + "epoch": 3.6607978225737856, + "grad_norm": 2.65625, + "learning_rate": 2.5152979052736e-07, + "loss": 1.9815, + "step": 10760 + }, + { + "epoch": 3.664200051033427, + "grad_norm": 2.765625, + "learning_rate": 2.46535290900983e-07, + "loss": 1.8823, + "step": 10770 + }, + { + "epoch": 3.667602279493068, + "grad_norm": 2.171875, + "learning_rate": 2.4158995019028676e-07, + "loss": 1.9158, + "step": 10780 + }, + { + "epoch": 3.671004507952709, + "grad_norm": 2.671875, + "learning_rate": 2.3669380593208516e-07, + "loss": 1.8857, + "step": 10790 + }, + { + "epoch": 3.67440673641235, + "grad_norm": 2.40625, + "learning_rate": 2.3184689528977832e-07, + "loss": 1.8922, + "step": 10800 + }, + { + "epoch": 3.677808964871991, + "grad_norm": 2.3125, + "learning_rate": 2.270492550530667e-07, + "loss": 1.9044, + "step": 10810 + }, + { + "epoch": 3.6812111933316323, + "grad_norm": 2.1875, + "learning_rate": 2.2230092163766907e-07, + "loss": 1.9365, + "step": 10820 + }, + { + "epoch": 3.684613421791273, + "grad_norm": 2.15625, + "learning_rate": 2.1760193108504913e-07, + "loss": 1.894, + "step": 10830 + }, + { + "epoch": 3.6880156502509145, + "grad_norm": 2.265625, + "learning_rate": 2.1295231906214332e-07, + "loss": 1.9366, + "step": 10840 + }, + { + "epoch": 3.6914178787105554, + "grad_norm": 1.921875, + "learning_rate": 2.0835212086108594e-07, + "loss": 1.9098, + "step": 10850 + }, + { + "epoch": 3.6948201071701963, + "grad_norm": 2.390625, + "learning_rate": 2.038013713989457e-07, + "loss": 1.9487, + "step": 10860 + }, + { + "epoch": 3.6982223356298376, + "grad_norm": 2.328125, + "learning_rate": 1.9930010521745713e-07, + "loss": 1.8716, + "step": 10870 + }, + { + "epoch": 3.7016245640894785, + "grad_norm": 2.21875, + "learning_rate": 1.9484835648276147e-07, + "loss": 1.8958, + "step": 10880 + }, + { + "epoch": 3.70502679254912, + "grad_norm": 2.390625, + "learning_rate": 1.904461589851424e-07, + "loss": 1.8943, + "step": 10890 + }, + { + "epoch": 3.7084290210087607, + "grad_norm": 1.9296875, + "learning_rate": 1.8609354613877697e-07, + "loss": 1.8747, + "step": 10900 + }, + { + "epoch": 3.7118312494684016, + "grad_norm": 2.296875, + "learning_rate": 1.817905509814755e-07, + "loss": 1.9229, + "step": 10910 + }, + { + "epoch": 3.715233477928043, + "grad_norm": 2.25, + "learning_rate": 1.7753720617443335e-07, + "loss": 1.9303, + "step": 10920 + }, + { + "epoch": 3.718635706387684, + "grad_norm": 2.328125, + "learning_rate": 1.7333354400198364e-07, + "loss": 1.9388, + "step": 10930 + }, + { + "epoch": 3.722037934847325, + "grad_norm": 2.015625, + "learning_rate": 1.691795963713496e-07, + "loss": 1.892, + "step": 10940 + }, + { + "epoch": 3.725440163306966, + "grad_norm": 2.3125, + "learning_rate": 1.6507539481240707e-07, + "loss": 1.9215, + "step": 10950 + }, + { + "epoch": 3.728842391766607, + "grad_norm": 2.28125, + "learning_rate": 1.6102097047744054e-07, + "loss": 1.9803, + "step": 10960 + }, + { + "epoch": 3.7322446202262483, + "grad_norm": 2.046875, + "learning_rate": 1.5701635414090798e-07, + "loss": 1.9324, + "step": 10970 + }, + { + "epoch": 3.735646848685889, + "grad_norm": 2.515625, + "learning_rate": 1.530615761992094e-07, + "loss": 1.8066, + "step": 10980 + }, + { + "epoch": 3.7390490771455305, + "grad_norm": 2.171875, + "learning_rate": 1.4915666667045188e-07, + "loss": 1.8818, + "step": 10990 + }, + { + "epoch": 3.7424513056051714, + "grad_norm": 2.390625, + "learning_rate": 1.4530165519422625e-07, + "loss": 1.9121, + "step": 11000 + }, + { + "epoch": 3.7458535340648123, + "grad_norm": 2.359375, + "learning_rate": 1.4149657103138097e-07, + "loss": 1.9224, + "step": 11010 + }, + { + "epoch": 3.7492557625244536, + "grad_norm": 2.5, + "learning_rate": 1.377414430637975e-07, + "loss": 1.9537, + "step": 11020 + }, + { + "epoch": 3.7526579909840945, + "grad_norm": 2.5, + "learning_rate": 1.3403629979417308e-07, + "loss": 1.9439, + "step": 11030 + }, + { + "epoch": 3.756060219443736, + "grad_norm": 2.375, + "learning_rate": 1.303811693458042e-07, + "loss": 1.9555, + "step": 11040 + }, + { + "epoch": 3.7594624479033767, + "grad_norm": 2.171875, + "learning_rate": 1.2677607946237328e-07, + "loss": 1.9296, + "step": 11050 + }, + { + "epoch": 3.7628646763630176, + "grad_norm": 2.46875, + "learning_rate": 1.2322105750773803e-07, + "loss": 1.9048, + "step": 11060 + }, + { + "epoch": 3.766266904822659, + "grad_norm": 2.609375, + "learning_rate": 1.1971613046572323e-07, + "loss": 1.9255, + "step": 11070 + }, + { + "epoch": 3.7696691332823, + "grad_norm": 2.34375, + "learning_rate": 1.1626132493991633e-07, + "loss": 1.9011, + "step": 11080 + }, + { + "epoch": 3.773071361741941, + "grad_norm": 2.28125, + "learning_rate": 1.1285666715346502e-07, + "loss": 1.8918, + "step": 11090 + }, + { + "epoch": 3.776473590201582, + "grad_norm": 2.484375, + "learning_rate": 1.0950218294888028e-07, + "loss": 1.84, + "step": 11100 + }, + { + "epoch": 3.779875818661223, + "grad_norm": 2.65625, + "learning_rate": 1.0619789778783557e-07, + "loss": 1.979, + "step": 11110 + }, + { + "epoch": 3.7832780471208642, + "grad_norm": 2.4375, + "learning_rate": 1.0294383675097872e-07, + "loss": 1.9141, + "step": 11120 + }, + { + "epoch": 3.786680275580505, + "grad_norm": 2.09375, + "learning_rate": 9.974002453774011e-08, + "loss": 1.98, + "step": 11130 + }, + { + "epoch": 3.7900825040401465, + "grad_norm": 2.484375, + "learning_rate": 9.658648546614084e-08, + "loss": 1.9723, + "step": 11140 + }, + { + "epoch": 3.7934847324997873, + "grad_norm": 2.421875, + "learning_rate": 9.348324347261734e-08, + "loss": 1.8887, + "step": 11150 + }, + { + "epoch": 3.7968869609594282, + "grad_norm": 2.546875, + "learning_rate": 9.04303221118288e-08, + "loss": 1.8763, + "step": 11160 + }, + { + "epoch": 3.8002891894190696, + "grad_norm": 2.46875, + "learning_rate": 8.742774455648695e-08, + "loss": 1.9326, + "step": 11170 + }, + { + "epoch": 3.8036914178787105, + "grad_norm": 1.9765625, + "learning_rate": 8.447553359717545e-08, + "loss": 1.8815, + "step": 11180 + }, + { + "epoch": 3.807093646338352, + "grad_norm": 2.296875, + "learning_rate": 8.157371164217902e-08, + "loss": 1.971, + "step": 11190 + }, + { + "epoch": 3.8104958747979927, + "grad_norm": 2.375, + "learning_rate": 7.872230071731239e-08, + "loss": 1.9483, + "step": 11200 + }, + { + "epoch": 3.8138981032576336, + "grad_norm": 2.609375, + "learning_rate": 7.592132246575323e-08, + "loss": 1.9457, + "step": 11210 + }, + { + "epoch": 3.817300331717275, + "grad_norm": 2.28125, + "learning_rate": 7.317079814787934e-08, + "loss": 1.9193, + "step": 11220 + }, + { + "epoch": 3.820702560176916, + "grad_norm": 2.203125, + "learning_rate": 7.047074864110375e-08, + "loss": 1.9131, + "step": 11230 + }, + { + "epoch": 3.824104788636557, + "grad_norm": 2.21875, + "learning_rate": 6.782119443972094e-08, + "loss": 1.9334, + "step": 11240 + }, + { + "epoch": 3.827507017096198, + "grad_norm": 2.625, + "learning_rate": 6.522215565474712e-08, + "loss": 1.958, + "step": 11250 + }, + { + "epoch": 3.830909245555839, + "grad_norm": 2.421875, + "learning_rate": 6.267365201377092e-08, + "loss": 1.9266, + "step": 11260 + }, + { + "epoch": 3.8343114740154802, + "grad_norm": 2.53125, + "learning_rate": 6.017570286079965e-08, + "loss": 1.9022, + "step": 11270 + }, + { + "epoch": 3.837713702475121, + "grad_norm": 2.34375, + "learning_rate": 5.77283271561175e-08, + "loss": 1.8612, + "step": 11280 + }, + { + "epoch": 3.8411159309347624, + "grad_norm": 2.453125, + "learning_rate": 5.5331543476137706e-08, + "loss": 1.9326, + "step": 11290 + }, + { + "epoch": 3.8445181593944033, + "grad_norm": 2.296875, + "learning_rate": 5.298537001326303e-08, + "loss": 1.8951, + "step": 11300 + }, + { + "epoch": 3.847920387854044, + "grad_norm": 2.40625, + "learning_rate": 5.068982457574685e-08, + "loss": 1.9788, + "step": 11310 + }, + { + "epoch": 3.8513226163136856, + "grad_norm": 2.609375, + "learning_rate": 4.8444924587559654e-08, + "loss": 1.9643, + "step": 11320 + }, + { + "epoch": 3.8547248447733264, + "grad_norm": 2.5625, + "learning_rate": 4.625068708825534e-08, + "loss": 1.9245, + "step": 11330 + }, + { + "epoch": 3.8581270732329678, + "grad_norm": 2.34375, + "learning_rate": 4.4107128732841385e-08, + "loss": 1.8401, + "step": 11340 + }, + { + "epoch": 3.8615293016926087, + "grad_norm": 2.09375, + "learning_rate": 4.20142657916557e-08, + "loss": 1.9087, + "step": 11350 + }, + { + "epoch": 3.8649315301522496, + "grad_norm": 2.140625, + "learning_rate": 3.99721141502382e-08, + "loss": 1.9401, + "step": 11360 + }, + { + "epoch": 3.868333758611891, + "grad_norm": 2.328125, + "learning_rate": 3.798068930921441e-08, + "loss": 1.9699, + "step": 11370 + }, + { + "epoch": 3.8717359870715318, + "grad_norm": 2.0625, + "learning_rate": 3.6040006384174545e-08, + "loss": 1.954, + "step": 11380 + }, + { + "epoch": 3.875138215531173, + "grad_norm": 2.40625, + "learning_rate": 3.4150080105563755e-08, + "loss": 1.8693, + "step": 11390 + }, + { + "epoch": 3.878540443990814, + "grad_norm": 2.078125, + "learning_rate": 3.231092481856271e-08, + "loss": 1.9307, + "step": 11400 + }, + { + "epoch": 3.881942672450455, + "grad_norm": 2.328125, + "learning_rate": 3.052255448298612e-08, + "loss": 1.956, + "step": 11410 + }, + { + "epoch": 3.885344900910096, + "grad_norm": 2.234375, + "learning_rate": 2.878498267317298e-08, + "loss": 1.9185, + "step": 11420 + }, + { + "epoch": 3.888747129369737, + "grad_norm": 2.5, + "learning_rate": 2.7098222577882825e-08, + "loss": 1.8685, + "step": 11430 + }, + { + "epoch": 3.8921493578293784, + "grad_norm": 2.328125, + "learning_rate": 2.5462287000197963e-08, + "loss": 1.9734, + "step": 11440 + }, + { + "epoch": 3.8955515862890193, + "grad_norm": 2.09375, + "learning_rate": 2.3877188357427174e-08, + "loss": 1.8995, + "step": 11450 + }, + { + "epoch": 3.89895381474866, + "grad_norm": 2.25, + "learning_rate": 2.2342938681005695e-08, + "loss": 1.8764, + "step": 11460 + }, + { + "epoch": 3.9023560432083015, + "grad_norm": 2.265625, + "learning_rate": 2.085954961641164e-08, + "loss": 1.8865, + "step": 11470 + }, + { + "epoch": 3.9057582716679424, + "grad_norm": 2.359375, + "learning_rate": 1.9427032423071165e-08, + "loss": 1.8932, + "step": 11480 + }, + { + "epoch": 3.9091605001275838, + "grad_norm": 2.25, + "learning_rate": 1.8045397974277166e-08, + "loss": 1.9042, + "step": 11490 + }, + { + "epoch": 3.9125627285872246, + "grad_norm": 2.1875, + "learning_rate": 1.6714656757104883e-08, + "loss": 1.94, + "step": 11500 + }, + { + "epoch": 3.9159649570468655, + "grad_norm": 2.28125, + "learning_rate": 1.5434818872331314e-08, + "loss": 1.8879, + "step": 11510 + }, + { + "epoch": 3.919367185506507, + "grad_norm": 2.046875, + "learning_rate": 1.4205894034362065e-08, + "loss": 1.9147, + "step": 11520 + }, + { + "epoch": 3.9227694139661478, + "grad_norm": 2.484375, + "learning_rate": 1.3027891571153722e-08, + "loss": 1.8714, + "step": 11530 + }, + { + "epoch": 3.926171642425789, + "grad_norm": 2.03125, + "learning_rate": 1.1900820424145176e-08, + "loss": 1.9371, + "step": 11540 + }, + { + "epoch": 3.92957387088543, + "grad_norm": 2.0, + "learning_rate": 1.0824689148190455e-08, + "loss": 1.9505, + "step": 11550 + }, + { + "epoch": 3.932976099345071, + "grad_norm": 2.453125, + "learning_rate": 9.799505911490794e-09, + "loss": 1.8738, + "step": 11560 + }, + { + "epoch": 3.936378327804712, + "grad_norm": 2.328125, + "learning_rate": 8.825278495535672e-09, + "loss": 1.8447, + "step": 11570 + }, + { + "epoch": 3.939780556264353, + "grad_norm": 2.28125, + "learning_rate": 7.902014295042352e-09, + "loss": 1.8987, + "step": 11580 + }, + { + "epoch": 3.9431827847239944, + "grad_norm": 2.46875, + "learning_rate": 7.029720317899902e-09, + "loss": 1.9864, + "step": 11590 + }, + { + "epoch": 3.9465850131836353, + "grad_norm": 2.796875, + "learning_rate": 6.20840318511545e-09, + "loss": 1.9454, + "step": 11600 + }, + { + "epoch": 3.949987241643276, + "grad_norm": 2.59375, + "learning_rate": 5.438069130766418e-09, + "loss": 1.9871, + "step": 11610 + }, + { + "epoch": 3.9533894701029175, + "grad_norm": 2.40625, + "learning_rate": 4.718724001949017e-09, + "loss": 1.8746, + "step": 11620 + }, + { + "epoch": 3.9567916985625584, + "grad_norm": 2.46875, + "learning_rate": 4.050373258737196e-09, + "loss": 1.9578, + "step": 11630 + }, + { + "epoch": 3.9601939270221997, + "grad_norm": 2.171875, + "learning_rate": 3.4330219741408427e-09, + "loss": 1.9242, + "step": 11640 + }, + { + "epoch": 3.9635961554818406, + "grad_norm": 2.703125, + "learning_rate": 2.8666748340662245e-09, + "loss": 1.9133, + "step": 11650 + }, + { + "epoch": 3.9669983839414815, + "grad_norm": 2.0625, + "learning_rate": 2.351336137279413e-09, + "loss": 1.9196, + "step": 11660 + }, + { + "epoch": 3.970400612401123, + "grad_norm": 1.78125, + "learning_rate": 1.887009795377922e-09, + "loss": 1.9906, + "step": 11670 + }, + { + "epoch": 3.9738028408607637, + "grad_norm": 2.296875, + "learning_rate": 1.473699332754879e-09, + "loss": 1.8989, + "step": 11680 + }, + { + "epoch": 3.977205069320405, + "grad_norm": 2.609375, + "learning_rate": 1.1114078865781264e-09, + "loss": 1.8962, + "step": 11690 + }, + { + "epoch": 3.980607297780046, + "grad_norm": 2.34375, + "learning_rate": 8.001382067626036e-10, + "loss": 1.944, + "step": 11700 + }, + { + "epoch": 3.984009526239687, + "grad_norm": 2.265625, + "learning_rate": 5.398926559516878e-10, + "loss": 1.8959, + "step": 11710 + }, + { + "epoch": 3.987411754699328, + "grad_norm": 2.328125, + "learning_rate": 3.306732094962939e-10, + "loss": 1.9388, + "step": 11720 + }, + { + "epoch": 3.990813983158969, + "grad_norm": 2.359375, + "learning_rate": 1.7248145544367861e-10, + "loss": 1.9133, + "step": 11730 + }, + { + "epoch": 3.9942162116186104, + "grad_norm": 1.96875, + "learning_rate": 6.531859452325864e-11, + "loss": 1.957, + "step": 11740 + }, + { + "epoch": 3.9976184400782513, + "grad_norm": 2.3125, + "learning_rate": 9.185440136907336e-12, + "loss": 1.9494, + "step": 11750 + } + ], + "logging_steps": 10, + "max_steps": 11756, + "num_input_tokens_seen": 0, + "num_train_epochs": 4, + "save_steps": 0, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.0768921731962634e+18, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}